1 /* Loop Vectorization
2 Copyright (C) 2003-2016 Free Software Foundation, Inc.
3 Contributed by Dorit Naishlos <dorit@il.ibm.com> and
4 Ira Rosen <irar@il.ibm.com>
5
6 This file is part of GCC.
7
8 GCC is free software; you can redistribute it and/or modify it under
9 the terms of the GNU General Public License as published by the Free
10 Software Foundation; either version 3, or (at your option) any later
11 version.
12
13 GCC is distributed in the hope that it will be useful, but WITHOUT ANY
14 WARRANTY; without even the implied warranty of MERCHANTABILITY or
15 FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License
16 for more details.
17
18 You should have received a copy of the GNU General Public License
19 along with GCC; see the file COPYING3. If not see
20 <http://www.gnu.org/licenses/>. */
21
22 #include "config.h"
23 #include "system.h"
24 #include "coretypes.h"
25 #include "backend.h"
26 #include "target.h"
27 #include "rtl.h"
28 #include "tree.h"
29 #include "gimple.h"
30 #include "cfghooks.h"
31 #include "tree-pass.h"
32 #include "ssa.h"
33 #include "optabs-tree.h"
34 #include "diagnostic-core.h"
35 #include "fold-const.h"
36 #include "stor-layout.h"
37 #include "cfganal.h"
38 #include "gimplify.h"
39 #include "gimple-iterator.h"
40 #include "gimplify-me.h"
41 #include "tree-ssa-loop-ivopts.h"
42 #include "tree-ssa-loop-manip.h"
43 #include "tree-ssa-loop-niter.h"
44 #include "cfgloop.h"
45 #include "params.h"
46 #include "tree-scalar-evolution.h"
47 #include "tree-vectorizer.h"
48 #include "gimple-fold.h"
49 #include "cgraph.h"
50
51 /* Loop Vectorization Pass.
52
53 This pass tries to vectorize loops.
54
55 For example, the vectorizer transforms the following simple loop:
56
57 short a[N]; short b[N]; short c[N]; int i;
58
59 for (i=0; i<N; i++){
60 a[i] = b[i] + c[i];
61 }
62
63 as if it was manually vectorized by rewriting the source code into:
64
65 typedef int __attribute__((mode(V8HI))) v8hi;
66 short a[N]; short b[N]; short c[N]; int i;
67 v8hi *pa = (v8hi*)a, *pb = (v8hi*)b, *pc = (v8hi*)c;
68 v8hi va, vb, vc;
69
70 for (i=0; i<N/8; i++){
71 vb = pb[i];
72 vc = pc[i];
73 va = vb + vc;
74 pa[i] = va;
75 }
76
77 The main entry to this pass is vectorize_loops(), in which
78 the vectorizer applies a set of analyses on a given set of loops,
79 followed by the actual vectorization transformation for the loops that
80 had successfully passed the analysis phase.
81 Throughout this pass we make a distinction between two types of
82 data: scalars (which are represented by SSA_NAMES), and memory references
83 ("data-refs"). These two types of data require different handling both
   during analysis and transformation.  The types of data-refs that the
   vectorizer currently supports are ARRAY_REFS whose base is an array DECL
   (not a pointer), and INDIRECT_REFS through pointers; both array and pointer
   accesses are required to have a simple (consecutive) access pattern.
88
89 Analysis phase:
90 ===============
91 The driver for the analysis phase is vect_analyze_loop().
92 It applies a set of analyses, some of which rely on the scalar evolution
93 analyzer (scev) developed by Sebastian Pop.
94
95 During the analysis phase the vectorizer records some information
96 per stmt in a "stmt_vec_info" struct which is attached to each stmt in the
97 loop, as well as general information about the loop as a whole, which is
98 recorded in a "loop_vec_info" struct attached to each loop.
99
100 Transformation phase:
101 =====================
102 The loop transformation phase scans all the stmts in the loop, and
103 creates a vector stmt (or a sequence of stmts) for each scalar stmt S in
104 the loop that needs to be vectorized. It inserts the vector code sequence
105 just before the scalar stmt S, and records a pointer to the vector code
106 in STMT_VINFO_VEC_STMT (stmt_info) (stmt_info is the stmt_vec_info struct
107 attached to S). This pointer will be used for the vectorization of following
108 stmts which use the def of stmt S. Stmt S is removed if it writes to memory;
109 otherwise, we rely on dead code elimination for removing it.
110
111 For example, say stmt S1 was vectorized into stmt VS1:
112
113 VS1: vb = px[i];
114 S1: b = x[i]; STMT_VINFO_VEC_STMT (stmt_info (S1)) = VS1
115 S2: a = b;
116
117 To vectorize stmt S2, the vectorizer first finds the stmt that defines
118 the operand 'b' (S1), and gets the relevant vector def 'vb' from the
119 vector stmt VS1 pointed to by STMT_VINFO_VEC_STMT (stmt_info (S1)). The
120 resulting sequence would be:
121
122 VS1: vb = px[i];
123 S1: b = x[i]; STMT_VINFO_VEC_STMT (stmt_info (S1)) = VS1
124 VS2: va = vb;
125 S2: a = b; STMT_VINFO_VEC_STMT (stmt_info (S2)) = VS2
126
127 Operands that are not SSA_NAMEs, are data-refs that appear in
128 load/store operations (like 'x[i]' in S1), and are handled differently.
129
130 Target modeling:
131 =================
132 Currently the only target specific information that is used is the
133 size of the vector (in bytes) - "TARGET_VECTORIZE_UNITS_PER_SIMD_WORD".
134 Targets that can support different sizes of vectors, for now will need
135 to specify one value for "TARGET_VECTORIZE_UNITS_PER_SIMD_WORD". More
136 flexibility will be added in the future.
137
        Since we only vectorize operations whose vector form can be
   expressed using existing tree codes, to verify that an operation is
140 supported, the vectorizer checks the relevant optab at the relevant
141 machine_mode (e.g, optab_handler (add_optab, V8HImode)). If
142 the value found is CODE_FOR_nothing, then there's no target support, and
143 we can't vectorize the stmt.
144
145 For additional information on this project see:
146 http://gcc.gnu.org/projects/tree-ssa/vectorization.html
147 */
148
149 static void vect_estimate_min_profitable_iters (loop_vec_info, int *, int *);
150
151 /* Function vect_determine_vectorization_factor
152
153 Determine the vectorization factor (VF). VF is the number of data elements
154 that are operated upon in parallel in a single iteration of the vectorized
155 loop. For example, when vectorizing a loop that operates on 4byte elements,
156 on a target with vector size (VS) 16byte, the VF is set to 4, since 4
157 elements can fit in a single vector register.
158
159 We currently support vectorization of loops in which all types operated upon
160 are of the same size. Therefore this function currently sets VF according to
161 the size of the types operated upon, and fails if there are multiple sizes
162 in the loop.
163
164 VF is also the factor by which the loop iterations are strip-mined, e.g.:
165 original loop:
166 for (i=0; i<N; i++){
167 a[i] = b[i] + c[i];
168 }
169
170 vectorized loop:
171 for (i=0; i<N; i+=VF){
172 a[i:VF] = b[i:VF] + c[i:VF];
173 }
174 */
175
static bool
vect_determine_vectorization_factor (loop_vec_info loop_vinfo)
{
  struct loop *loop = LOOP_VINFO_LOOP (loop_vinfo);
  basic_block *bbs = LOOP_VINFO_BBS (loop_vinfo);
  unsigned nbbs = loop->num_nodes;
  unsigned int vectorization_factor = 0;
  tree scalar_type;
  gphi *phi;
  tree vectype;
  unsigned int nunits;
  stmt_vec_info stmt_info;
  unsigned i;
  HOST_WIDE_INT dummy;
  gimple *stmt, *pattern_stmt = NULL;
  gimple_seq pattern_def_seq = NULL;
  gimple_stmt_iterator pattern_def_si = gsi_none ();
  bool analyze_pattern_stmt = false;
  bool bool_result;
  /* Boolean-producing stmts whose vector (mask) type cannot be decided
     from their own scalar type; they are collected here and resolved in
     a second pass once all other vectypes are known.  */
  auto_vec<stmt_vec_info> mask_producers;

  if (dump_enabled_p ())
    dump_printf_loc (MSG_NOTE, vect_location,
                     "=== vect_determine_vectorization_factor ===\n");

  for (i = 0; i < nbbs; i++)
    {
      basic_block bb = bbs[i];

      /* First walk the PHIs: each relevant/live PHI contributes the
	 vector type of its result to the VF computation.  */
      for (gphi_iterator si = gsi_start_phis (bb); !gsi_end_p (si);
	   gsi_next (&si))
	{
	  phi = si.phi ();
	  stmt_info = vinfo_for_stmt (phi);
	  if (dump_enabled_p ())
	    {
	      dump_printf_loc (MSG_NOTE, vect_location, "==> examining phi: ");
	      dump_gimple_stmt (MSG_NOTE, TDF_SLIM, phi, 0);
	      dump_printf (MSG_NOTE, "\n");
	    }

	  gcc_assert (stmt_info);

	  if (STMT_VINFO_RELEVANT_P (stmt_info)
	      || STMT_VINFO_LIVE_P (stmt_info))
	    {
	      gcc_assert (!STMT_VINFO_VECTYPE (stmt_info));
	      scalar_type = TREE_TYPE (PHI_RESULT (phi));

	      if (dump_enabled_p ())
		{
		  dump_printf_loc (MSG_NOTE, vect_location,
				   "get vectype for scalar type: ");
		  dump_generic_expr (MSG_NOTE, TDF_SLIM, scalar_type);
		  dump_printf (MSG_NOTE, "\n");
		}

	      vectype = get_vectype_for_scalar_type (scalar_type);
	      if (!vectype)
		{
		  if (dump_enabled_p ())
		    {
		      dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
				       "not vectorized: unsupported "
				       "data-type ");
		      dump_generic_expr (MSG_MISSED_OPTIMIZATION, TDF_SLIM,
					 scalar_type);
		      dump_printf (MSG_MISSED_OPTIMIZATION, "\n");
		    }
		  return false;
		}
	      STMT_VINFO_VECTYPE (stmt_info) = vectype;

	      if (dump_enabled_p ())
		{
		  dump_printf_loc (MSG_NOTE, vect_location, "vectype: ");
		  dump_generic_expr (MSG_NOTE, TDF_SLIM, vectype);
		  dump_printf (MSG_NOTE, "\n");
		}

	      nunits = TYPE_VECTOR_SUBPARTS (vectype);
	      if (dump_enabled_p ())
		dump_printf_loc (MSG_NOTE, vect_location, "nunits = %d\n",
				 nunits);

	      /* The VF is the maximum number of units over all types
		 seen so far.  */
	      if (!vectorization_factor
		  || (nunits > vectorization_factor))
		vectorization_factor = nunits;
	    }
	}

      /* Then walk the statements.  The iteration interleaves three
	 sources of stmts: the plain bb stmts (advanced via SI), a
	 pattern stmt replacing the current stmt (ANALYZE_PATTERN_STMT),
	 and the pattern's def sequence (PATTERN_DEF_SI); SI is only
	 advanced once all pattern stmts for it were processed.  */
      for (gimple_stmt_iterator si = gsi_start_bb (bb);
	   !gsi_end_p (si) || analyze_pattern_stmt;)
	{
	  tree vf_vectype;

	  if (analyze_pattern_stmt)
	    stmt = pattern_stmt;
	  else
	    stmt = gsi_stmt (si);

	  stmt_info = vinfo_for_stmt (stmt);

	  if (dump_enabled_p ())
	    {
	      dump_printf_loc (MSG_NOTE, vect_location,
			       "==> examining statement: ");
	      dump_gimple_stmt (MSG_NOTE, TDF_SLIM, stmt, 0);
	      dump_printf (MSG_NOTE, "\n");
	    }

	  gcc_assert (stmt_info);

	  /* Skip stmts which do not need to be vectorized.  */
	  if ((!STMT_VINFO_RELEVANT_P (stmt_info)
	       && !STMT_VINFO_LIVE_P (stmt_info))
	      || gimple_clobber_p (stmt))
	    {
	      /* An irrelevant stmt may still have been replaced by a
		 relevant pattern stmt; if so analyze that instead.  */
	      if (STMT_VINFO_IN_PATTERN_P (stmt_info)
		  && (pattern_stmt = STMT_VINFO_RELATED_STMT (stmt_info))
		  && (STMT_VINFO_RELEVANT_P (vinfo_for_stmt (pattern_stmt))
		      || STMT_VINFO_LIVE_P (vinfo_for_stmt (pattern_stmt))))
		{
		  stmt = pattern_stmt;
		  stmt_info = vinfo_for_stmt (pattern_stmt);
		  if (dump_enabled_p ())
		    {
		      dump_printf_loc (MSG_NOTE, vect_location,
				       "==> examining pattern statement: ");
		      dump_gimple_stmt (MSG_NOTE, TDF_SLIM, stmt, 0);
		      dump_printf (MSG_NOTE, "\n");
		    }
		}
	      else
		{
		  if (dump_enabled_p ())
		    dump_printf_loc (MSG_NOTE, vect_location, "skip.\n");
		  gsi_next (&si);
		  continue;
		}
	    }
	  else if (STMT_VINFO_IN_PATTERN_P (stmt_info)
		   && (pattern_stmt = STMT_VINFO_RELATED_STMT (stmt_info))
		   && (STMT_VINFO_RELEVANT_P (vinfo_for_stmt (pattern_stmt))
		       || STMT_VINFO_LIVE_P (vinfo_for_stmt (pattern_stmt))))
	    /* Process the pattern stmt on the next loop iteration, after
	       the original stmt (and any pattern defs) were handled.  */
	    analyze_pattern_stmt = true;

	  /* If a pattern statement has def stmts, analyze them too.  */
	  if (is_pattern_stmt_p (stmt_info))
	    {
	      if (pattern_def_seq == NULL)
		{
		  pattern_def_seq = STMT_VINFO_PATTERN_DEF_SEQ (stmt_info);
		  pattern_def_si = gsi_start (pattern_def_seq);
		}
	      else if (!gsi_end_p (pattern_def_si))
		gsi_next (&pattern_def_si);
	      if (pattern_def_seq != NULL)
		{
		  gimple *pattern_def_stmt = NULL;
		  stmt_vec_info pattern_def_stmt_info = NULL;

		  /* Find the next relevant/live def stmt, skipping the
		     rest.  */
		  while (!gsi_end_p (pattern_def_si))
		    {
		      pattern_def_stmt = gsi_stmt (pattern_def_si);
		      pattern_def_stmt_info
			= vinfo_for_stmt (pattern_def_stmt);
		      if (STMT_VINFO_RELEVANT_P (pattern_def_stmt_info)
			  || STMT_VINFO_LIVE_P (pattern_def_stmt_info))
			break;
		      gsi_next (&pattern_def_si);
		    }

		  if (!gsi_end_p (pattern_def_si))
		    {
		      if (dump_enabled_p ())
			{
			  dump_printf_loc (MSG_NOTE, vect_location,
					   "==> examining pattern def stmt: ");
			  dump_gimple_stmt (MSG_NOTE, TDF_SLIM,
					    pattern_def_stmt, 0);
			  dump_printf (MSG_NOTE, "\n");
			}

		      stmt = pattern_def_stmt;
		      stmt_info = pattern_def_stmt_info;
		    }
		  else
		    {
		      /* Def sequence exhausted; fall back to the bb
			 iterator on the next round.  */
		      pattern_def_si = gsi_none ();
		      analyze_pattern_stmt = false;
		    }
		}
	      else
		analyze_pattern_stmt = false;
	    }

	  if (gimple_get_lhs (stmt) == NULL_TREE
	      /* MASK_STORE has no lhs, but is ok.  */
	      && (!is_gimple_call (stmt)
		  || !gimple_call_internal_p (stmt)
		  || gimple_call_internal_fn (stmt) != IFN_MASK_STORE))
	    {
	      if (is_gimple_call (stmt))
		{
		  /* Ignore calls with no lhs.  These must be calls to
		     #pragma omp simd functions, and what vectorization factor
		     it really needs can't be determined until
		     vectorizable_simd_clone_call.  */
		  if (!analyze_pattern_stmt && gsi_end_p (pattern_def_si))
		    {
		      pattern_def_seq = NULL;
		      gsi_next (&si);
		    }
		  continue;
		}
	      if (dump_enabled_p ())
		{
		  dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
				   "not vectorized: irregular stmt.");
		  dump_gimple_stmt (MSG_MISSED_OPTIMIZATION, TDF_SLIM, stmt,
				    0);
		  dump_printf (MSG_MISSED_OPTIMIZATION, "\n");
		}
	      return false;
	    }

	  /* Pre-existing vector code in the loop is not supported.  */
	  if (VECTOR_MODE_P (TYPE_MODE (gimple_expr_type (stmt))))
	    {
	      if (dump_enabled_p ())
		{
		  dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
				   "not vectorized: vector stmt in loop:");
		  dump_gimple_stmt (MSG_MISSED_OPTIMIZATION, TDF_SLIM, stmt, 0);
		  dump_printf (MSG_MISSED_OPTIMIZATION, "\n");
		}
	      return false;
	    }

	  bool_result = false;

	  if (STMT_VINFO_VECTYPE (stmt_info))
	    {
	      /* The only case when a vectype had been already set is for stmts
		 that contain a dataref, or for "pattern-stmts" (stmts
		 generated by the vectorizer to represent/replace a certain
		 idiom).  */
	      gcc_assert (STMT_VINFO_DATA_REF (stmt_info)
			  || is_pattern_stmt_p (stmt_info)
			  || !gsi_end_p (pattern_def_si));
	      vectype = STMT_VINFO_VECTYPE (stmt_info);
	    }
	  else
	    {
	      gcc_assert (!STMT_VINFO_DATA_REF (stmt_info));
	      if (is_gimple_call (stmt)
		  && gimple_call_internal_p (stmt)
		  && gimple_call_internal_fn (stmt) == IFN_MASK_STORE)
		/* For MASK_STORE the vectype comes from the stored value
		   (argument 3), as there is no lhs.  */
		scalar_type = TREE_TYPE (gimple_call_arg (stmt, 3));
	      else
		scalar_type = TREE_TYPE (gimple_get_lhs (stmt));

	      /* Bool ops don't participate in vectorization factor
		 computation.  For comparison use compared types to
		 compute a factor.  */
	      if (TREE_CODE (scalar_type) == BOOLEAN_TYPE
		  && is_gimple_assign (stmt)
		  && gimple_assign_rhs_code (stmt) != COND_EXPR)
		{
		  if (STMT_VINFO_RELEVANT_P (stmt_info)
		      || STMT_VINFO_LIVE_P (stmt_info))
		    mask_producers.safe_push (stmt_info);
		  bool_result = true;

		  if (gimple_code (stmt) == GIMPLE_ASSIGN
		      && TREE_CODE_CLASS (gimple_assign_rhs_code (stmt))
			 == tcc_comparison
		      && TREE_CODE (TREE_TYPE (gimple_assign_rhs1 (stmt)))
			 != BOOLEAN_TYPE)
		    scalar_type = TREE_TYPE (gimple_assign_rhs1 (stmt));
		  else
		    {
		      /* Nothing to contribute to the VF from this stmt;
			 advance the iteration state and move on.  */
		      if (!analyze_pattern_stmt && gsi_end_p (pattern_def_si))
			{
			  pattern_def_seq = NULL;
			  gsi_next (&si);
			}
		      continue;
		    }
		}

	      if (dump_enabled_p ())
		{
		  dump_printf_loc (MSG_NOTE, vect_location,
				   "get vectype for scalar type: ");
		  dump_generic_expr (MSG_NOTE, TDF_SLIM, scalar_type);
		  dump_printf (MSG_NOTE, "\n");
		}
	      vectype = get_vectype_for_scalar_type (scalar_type);
	      if (!vectype)
		{
		  if (dump_enabled_p ())
		    {
		      dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
				       "not vectorized: unsupported "
				       "data-type ");
		      dump_generic_expr (MSG_MISSED_OPTIMIZATION, TDF_SLIM,
					 scalar_type);
		      dump_printf (MSG_MISSED_OPTIMIZATION, "\n");
		    }
		  return false;
		}

	      /* Mask producers get their vectype only in the second
		 pass below.  */
	      if (!bool_result)
		STMT_VINFO_VECTYPE (stmt_info) = vectype;

	      if (dump_enabled_p ())
		{
		  dump_printf_loc (MSG_NOTE, vect_location, "vectype: ");
		  dump_generic_expr (MSG_NOTE, TDF_SLIM, vectype);
		  dump_printf (MSG_NOTE, "\n");
		}
	    }

	  /* Don't try to compute the VF out of scalar types if the stmt
	     produces a boolean vector.  Use the result vectype instead.  */
	  if (VECTOR_BOOLEAN_TYPE_P (vectype))
	    vf_vectype = vectype;
	  else
	    {
	      /* The vectorization factor is according to the smallest
		 scalar type (or the largest vector size, but we only
		 support one vector size per loop).  */
	      if (!bool_result)
		scalar_type = vect_get_smallest_scalar_type (stmt, &dummy,
							     &dummy);
	      if (dump_enabled_p ())
		{
		  dump_printf_loc (MSG_NOTE, vect_location,
				   "get vectype for scalar type: ");
		  dump_generic_expr (MSG_NOTE, TDF_SLIM, scalar_type);
		  dump_printf (MSG_NOTE, "\n");
		}
	      vf_vectype = get_vectype_for_scalar_type (scalar_type);
	    }
	  if (!vf_vectype)
	    {
	      if (dump_enabled_p ())
		{
		  dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
				   "not vectorized: unsupported data-type ");
		  dump_generic_expr (MSG_MISSED_OPTIMIZATION, TDF_SLIM,
				     scalar_type);
		  dump_printf (MSG_MISSED_OPTIMIZATION, "\n");
		}
	      return false;
	    }

	  /* Only one vector size per loop is supported; mixed sizes in
	     a single stmt mean we have to give up.  */
	  if ((GET_MODE_SIZE (TYPE_MODE (vectype))
	       != GET_MODE_SIZE (TYPE_MODE (vf_vectype))))
	    {
	      if (dump_enabled_p ())
		{
		  dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
				   "not vectorized: different sized vector "
				   "types in statement, ");
		  dump_generic_expr (MSG_MISSED_OPTIMIZATION, TDF_SLIM,
				     vectype);
		  dump_printf (MSG_MISSED_OPTIMIZATION, " and ");
		  dump_generic_expr (MSG_MISSED_OPTIMIZATION, TDF_SLIM,
				     vf_vectype);
		  dump_printf (MSG_MISSED_OPTIMIZATION, "\n");
		}
	      return false;
	    }

	  if (dump_enabled_p ())
	    {
	      dump_printf_loc (MSG_NOTE, vect_location, "vectype: ");
	      dump_generic_expr (MSG_NOTE, TDF_SLIM, vf_vectype);
	      dump_printf (MSG_NOTE, "\n");
	    }

	  nunits = TYPE_VECTOR_SUBPARTS (vf_vectype);
	  if (dump_enabled_p ())
	    dump_printf_loc (MSG_NOTE, vect_location, "nunits = %d\n", nunits);
	  if (!vectorization_factor
	      || (nunits > vectorization_factor))
	    vectorization_factor = nunits;

	  if (!analyze_pattern_stmt && gsi_end_p (pattern_def_si))
	    {
	      pattern_def_seq = NULL;
	      gsi_next (&si);
	    }
	}
    }

  /* TODO: Analyze cost.  Decide if worth while to vectorize.  */
  if (dump_enabled_p ())
    dump_printf_loc (MSG_NOTE, vect_location, "vectorization factor = %d\n",
		     vectorization_factor);
  if (vectorization_factor <= 1)
    {
      if (dump_enabled_p ())
	dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
			 "not vectorized: unsupported data-type\n");
      return false;
    }
  LOOP_VINFO_VECT_FACTOR (loop_vinfo) = vectorization_factor;

  /* Second pass: compute the mask (boolean vector) type for each
     deferred boolean-producing stmt.  */
  for (i = 0; i < mask_producers.length (); i++)
    {
      tree mask_type = NULL;

      stmt = STMT_VINFO_STMT (mask_producers[i]);

      if (gimple_code (stmt) == GIMPLE_ASSIGN
	  && TREE_CODE_CLASS (gimple_assign_rhs_code (stmt)) == tcc_comparison
	  && TREE_CODE (TREE_TYPE (gimple_assign_rhs1 (stmt))) != BOOLEAN_TYPE)
	{
	  /* A comparison of non-booleans: derive the mask type from the
	     compared scalar type.  */
	  scalar_type = TREE_TYPE (gimple_assign_rhs1 (stmt));
	  mask_type = get_mask_type_for_scalar_type (scalar_type);

	  if (!mask_type)
	    {
	      if (dump_enabled_p ())
		dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
				 "not vectorized: unsupported mask\n");
	      return false;
	    }
	}
      else
	{
	  /* Otherwise take the mask type from the operands, requiring
	     that all operands agree.  */
	  tree rhs;
	  ssa_op_iter iter;
	  gimple *def_stmt;
	  enum vect_def_type dt;

	  FOR_EACH_SSA_TREE_OPERAND (rhs, stmt, iter, SSA_OP_USE)
	    {
	      if (!vect_is_simple_use (rhs, mask_producers[i]->vinfo,
				       &def_stmt, &dt, &vectype))
		{
		  if (dump_enabled_p ())
		    {
		      dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
				       "not vectorized: can't compute mask type "
				       "for statement, ");
		      dump_gimple_stmt (MSG_MISSED_OPTIMIZATION, TDF_SLIM, stmt,
					0);
		      dump_printf (MSG_MISSED_OPTIMIZATION, "\n");
		    }
		  return false;
		}

	      /* No vectype probably means external definition.
		 Allow it in case there is another operand which
		 allows to determine mask type.  */
	      if (!vectype)
		continue;

	      if (!mask_type)
		mask_type = vectype;
	      else if (TYPE_VECTOR_SUBPARTS (mask_type)
		       != TYPE_VECTOR_SUBPARTS (vectype))
		{
		  if (dump_enabled_p ())
		    {
		      dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
				       "not vectorized: different sized masks "
				       "types in statement, ");
		      dump_generic_expr (MSG_MISSED_OPTIMIZATION, TDF_SLIM,
					 mask_type);
		      dump_printf (MSG_MISSED_OPTIMIZATION, " and ");
		      dump_generic_expr (MSG_MISSED_OPTIMIZATION, TDF_SLIM,
					 vectype);
		      dump_printf (MSG_MISSED_OPTIMIZATION, "\n");
		    }
		  return false;
		}
	      else if (VECTOR_BOOLEAN_TYPE_P (mask_type)
		       != VECTOR_BOOLEAN_TYPE_P (vectype))
		{
		  if (dump_enabled_p ())
		    {
		      dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
				       "not vectorized: mixed mask and "
				       "nonmask vector types in statement, ");
		      dump_generic_expr (MSG_MISSED_OPTIMIZATION, TDF_SLIM,
					 mask_type);
		      dump_printf (MSG_MISSED_OPTIMIZATION, " and ");
		      dump_generic_expr (MSG_MISSED_OPTIMIZATION, TDF_SLIM,
					 vectype);
		      dump_printf (MSG_MISSED_OPTIMIZATION, "\n");
		    }
		  return false;
		}
	    }

	  /* We may compare boolean value loaded as vector of integers.
	     Fix mask_type in such case.  */
	  if (mask_type
	      && !VECTOR_BOOLEAN_TYPE_P (mask_type)
	      && gimple_code (stmt) == GIMPLE_ASSIGN
	      && TREE_CODE_CLASS (gimple_assign_rhs_code (stmt)) == tcc_comparison)
	    mask_type = build_same_sized_truth_vector_type (mask_type);
	}

      /* No mask_type should mean loop invariant predicate.
	 This is probably a subject for optimization in
	 if-conversion.  */
      if (!mask_type)
	{
	  if (dump_enabled_p ())
	    {
	      dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
			       "not vectorized: can't compute mask type "
			       "for statement, ");
	      dump_gimple_stmt (MSG_MISSED_OPTIMIZATION, TDF_SLIM, stmt,
				0);
	      dump_printf (MSG_MISSED_OPTIMIZATION, "\n");
	    }
	  return false;
	}

      STMT_VINFO_VECTYPE (mask_producers[i]) = mask_type;
    }

  return true;
}
707
708
709 /* Function vect_is_simple_iv_evolution.
710
711 FORNOW: A simple evolution of an induction variables in the loop is
712 considered a polynomial evolution. */
713
714 static bool
vect_is_simple_iv_evolution(unsigned loop_nb,tree access_fn,tree * init,tree * step)715 vect_is_simple_iv_evolution (unsigned loop_nb, tree access_fn, tree * init,
716 tree * step)
717 {
718 tree init_expr;
719 tree step_expr;
720 tree evolution_part = evolution_part_in_loop_num (access_fn, loop_nb);
721 basic_block bb;
722
723 /* When there is no evolution in this loop, the evolution function
724 is not "simple". */
725 if (evolution_part == NULL_TREE)
726 return false;
727
728 /* When the evolution is a polynomial of degree >= 2
729 the evolution function is not "simple". */
730 if (tree_is_chrec (evolution_part))
731 return false;
732
733 step_expr = evolution_part;
734 init_expr = unshare_expr (initial_condition_in_loop_num (access_fn, loop_nb));
735
736 if (dump_enabled_p ())
737 {
738 dump_printf_loc (MSG_NOTE, vect_location, "step: ");
739 dump_generic_expr (MSG_NOTE, TDF_SLIM, step_expr);
740 dump_printf (MSG_NOTE, ", init: ");
741 dump_generic_expr (MSG_NOTE, TDF_SLIM, init_expr);
742 dump_printf (MSG_NOTE, "\n");
743 }
744
745 *init = init_expr;
746 *step = step_expr;
747
748 if (TREE_CODE (step_expr) != INTEGER_CST
749 && (TREE_CODE (step_expr) != SSA_NAME
750 || ((bb = gimple_bb (SSA_NAME_DEF_STMT (step_expr)))
751 && flow_bb_inside_loop_p (get_loop (cfun, loop_nb), bb))
752 || (!INTEGRAL_TYPE_P (TREE_TYPE (step_expr))
753 && (!SCALAR_FLOAT_TYPE_P (TREE_TYPE (step_expr))
754 || !flag_associative_math)))
755 && (TREE_CODE (step_expr) != REAL_CST
756 || !flag_associative_math))
757 {
758 if (dump_enabled_p ())
759 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
760 "step unknown.\n");
761 return false;
762 }
763
764 return true;
765 }
766
767 /* Function vect_analyze_scalar_cycles_1.
768
769 Examine the cross iteration def-use cycles of scalar variables
770 in LOOP. LOOP_VINFO represents the loop that is now being
771 considered for vectorization (can be LOOP, or an outer-loop
772 enclosing LOOP). */
773
static void
vect_analyze_scalar_cycles_1 (loop_vec_info loop_vinfo, struct loop *loop)
{
  basic_block bb = loop->header;
  tree init, step;
  /* PHIs that are not simple inductions; they are reconsidered as
     reduction / nested-cycle candidates in the second phase.  */
  auto_vec<gimple *, 64> worklist;
  gphi_iterator gsi;
  bool double_reduc;

  if (dump_enabled_p ())
    dump_printf_loc (MSG_NOTE, vect_location,
                     "=== vect_analyze_scalar_cycles ===\n");

  /* First - identify all inductions.  Reduction detection assumes that all the
     inductions have been identified, therefore, this order must not be
     changed.  */
  for (gsi = gsi_start_phis (bb); !gsi_end_p (gsi); gsi_next (&gsi))
    {
      gphi *phi = gsi.phi ();
      tree access_fn = NULL;
      tree def = PHI_RESULT (phi);
      stmt_vec_info stmt_vinfo = vinfo_for_stmt (phi);

      if (dump_enabled_p ())
	{
	  dump_printf_loc (MSG_NOTE, vect_location, "Analyze phi: ");
	  dump_gimple_stmt (MSG_NOTE, TDF_SLIM, phi, 0);
	  dump_printf (MSG_NOTE, "\n");
	}

      /* Skip virtual phi's.  The data dependences that are associated with
         virtual defs/uses (i.e., memory accesses) are analyzed elsewhere.  */
      if (virtual_operand_p (def))
	continue;

      STMT_VINFO_DEF_TYPE (stmt_vinfo) = vect_unknown_def_type;

      /* Analyze the evolution function.  */
      access_fn = analyze_scalar_evolution (loop, def);
      if (access_fn)
	{
	  STRIP_NOPS (access_fn);
	  if (dump_enabled_p ())
	    {
	      dump_printf_loc (MSG_NOTE, vect_location,
			       "Access function of PHI: ");
	      dump_generic_expr (MSG_NOTE, TDF_SLIM, access_fn);
	      dump_printf (MSG_NOTE, "\n");
	    }
	  /* Record base and evolution regardless of whether the PHI is
	     classified as an induction below; other analyses use them.  */
	  STMT_VINFO_LOOP_PHI_EVOLUTION_BASE_UNCHANGED (stmt_vinfo)
	    = initial_condition_in_loop_num (access_fn, loop->num);
	  STMT_VINFO_LOOP_PHI_EVOLUTION_PART (stmt_vinfo)
	    = evolution_part_in_loop_num (access_fn, loop->num);
	}

      /* Not a simple induction (or, for an outer-loop analysis, not a
	 constant-step one) -> queue for reduction detection.  */
      if (!access_fn
	  || !vect_is_simple_iv_evolution (loop->num, access_fn, &init, &step)
	  || (LOOP_VINFO_LOOP (loop_vinfo) != loop
	      && TREE_CODE (step) != INTEGER_CST))
	{
	  worklist.safe_push (phi);
	  continue;
	}

      gcc_assert (STMT_VINFO_LOOP_PHI_EVOLUTION_BASE_UNCHANGED (stmt_vinfo)
		  != NULL_TREE);
      gcc_assert (STMT_VINFO_LOOP_PHI_EVOLUTION_PART (stmt_vinfo) != NULL_TREE);

      if (dump_enabled_p ())
	dump_printf_loc (MSG_NOTE, vect_location, "Detected induction.\n");
      STMT_VINFO_DEF_TYPE (stmt_vinfo) = vect_induction_def;
    }


  /* Second - identify all reductions and nested cycles.  */
  while (worklist.length () > 0)
    {
      gimple *phi = worklist.pop ();
      tree def = PHI_RESULT (phi);
      stmt_vec_info stmt_vinfo = vinfo_for_stmt (phi);
      gimple *reduc_stmt;
      bool nested_cycle;

      if (dump_enabled_p ())
	{
	  dump_printf_loc (MSG_NOTE, vect_location, "Analyze phi: ");
	  dump_gimple_stmt (MSG_NOTE, TDF_SLIM, phi, 0);
	  dump_printf (MSG_NOTE, "\n");
	}

      gcc_assert (!virtual_operand_p (def)
		  && STMT_VINFO_DEF_TYPE (stmt_vinfo) == vect_unknown_def_type);

      /* A cycle in the inner loop while vectorizing the outer one.  */
      nested_cycle = (loop != LOOP_VINFO_LOOP (loop_vinfo));
      reduc_stmt = vect_force_simple_reduction (loop_vinfo, phi, !nested_cycle,
						&double_reduc, false);
      if (reduc_stmt)
	{
	  if (double_reduc)
	    {
	      if (dump_enabled_p ())
		dump_printf_loc (MSG_NOTE, vect_location,
				 "Detected double reduction.\n");

	      STMT_VINFO_DEF_TYPE (stmt_vinfo) = vect_double_reduction_def;
	      STMT_VINFO_DEF_TYPE (vinfo_for_stmt (reduc_stmt)) =
		vect_double_reduction_def;
	    }
	  else
	    {
	      if (nested_cycle)
		{
		  if (dump_enabled_p ())
		    dump_printf_loc (MSG_NOTE, vect_location,
				     "Detected vectorizable nested cycle.\n");

		  STMT_VINFO_DEF_TYPE (stmt_vinfo) = vect_nested_cycle;
		  STMT_VINFO_DEF_TYPE (vinfo_for_stmt (reduc_stmt)) =
		    vect_nested_cycle;
		}
	      else
		{
		  if (dump_enabled_p ())
		    dump_printf_loc (MSG_NOTE, vect_location,
				     "Detected reduction.\n");

		  STMT_VINFO_DEF_TYPE (stmt_vinfo) = vect_reduction_def;
		  STMT_VINFO_DEF_TYPE (vinfo_for_stmt (reduc_stmt)) =
		    vect_reduction_def;
		  /* Store the reduction cycles for possible vectorization in
		     loop-aware SLP.  */
		  LOOP_VINFO_REDUCTIONS (loop_vinfo).safe_push (reduc_stmt);
		}
	    }
	}
      else
	if (dump_enabled_p ())
	  dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
			   "Unknown def-use cycle pattern.\n");
    }
}
915
916
917 /* Function vect_analyze_scalar_cycles.
918
919 Examine the cross iteration def-use cycles of scalar variables, by
920 analyzing the loop-header PHIs of scalar variables. Classify each
921 cycle as one of the following: invariant, induction, reduction, unknown.
922 We do that for the loop represented by LOOP_VINFO, and also to its
923 inner-loop, if exists.
924 Examples for scalar cycles:
925
926 Example1: reduction:
927
928 loop1:
929 for (i=0; i<N; i++)
930 sum += a[i];
931
932 Example2: induction:
933
934 loop2:
935 for (i=0; i<N; i++)
936 a[i] = i; */
937
938 static void
vect_analyze_scalar_cycles(loop_vec_info loop_vinfo)939 vect_analyze_scalar_cycles (loop_vec_info loop_vinfo)
940 {
941 struct loop *loop = LOOP_VINFO_LOOP (loop_vinfo);
942
943 vect_analyze_scalar_cycles_1 (loop_vinfo, loop);
944
945 /* When vectorizing an outer-loop, the inner-loop is executed sequentially.
946 Reductions in such inner-loop therefore have different properties than
947 the reductions in the nest that gets vectorized:
948 1. When vectorized, they are executed in the same order as in the original
949 scalar loop, so we can't change the order of computation when
950 vectorizing them.
951 2. FIXME: Inner-loop reductions can be used in the inner-loop, so the
952 current checks are too strict. */
953
954 if (loop->inner)
955 vect_analyze_scalar_cycles_1 (loop_vinfo, loop->inner);
956 }
957
958 /* Transfer group and reduction information from STMT to its pattern stmt. */
959
960 static void
vect_fixup_reduc_chain(gimple * stmt)961 vect_fixup_reduc_chain (gimple *stmt)
962 {
963 gimple *firstp = STMT_VINFO_RELATED_STMT (vinfo_for_stmt (stmt));
964 gimple *stmtp;
965 gcc_assert (!GROUP_FIRST_ELEMENT (vinfo_for_stmt (firstp))
966 && GROUP_FIRST_ELEMENT (vinfo_for_stmt (stmt)));
967 GROUP_SIZE (vinfo_for_stmt (firstp)) = GROUP_SIZE (vinfo_for_stmt (stmt));
968 do
969 {
970 stmtp = STMT_VINFO_RELATED_STMT (vinfo_for_stmt (stmt));
971 GROUP_FIRST_ELEMENT (vinfo_for_stmt (stmtp)) = firstp;
972 stmt = GROUP_NEXT_ELEMENT (vinfo_for_stmt (stmt));
973 if (stmt)
974 GROUP_NEXT_ELEMENT (vinfo_for_stmt (stmtp))
975 = STMT_VINFO_RELATED_STMT (vinfo_for_stmt (stmt));
976 }
977 while (stmt);
978 STMT_VINFO_DEF_TYPE (vinfo_for_stmt (stmtp)) = vect_reduction_def;
979 }
980
981 /* Fixup scalar cycles that now have their stmts detected as patterns. */
982
983 static void
vect_fixup_scalar_cycles_with_patterns(loop_vec_info loop_vinfo)984 vect_fixup_scalar_cycles_with_patterns (loop_vec_info loop_vinfo)
985 {
986 gimple *first;
987 unsigned i;
988
989 FOR_EACH_VEC_ELT (LOOP_VINFO_REDUCTION_CHAINS (loop_vinfo), i, first)
990 if (STMT_VINFO_IN_PATTERN_P (vinfo_for_stmt (first)))
991 {
992 gimple *next = GROUP_NEXT_ELEMENT (vinfo_for_stmt (first));
993 while (next)
994 {
995 if (! STMT_VINFO_IN_PATTERN_P (vinfo_for_stmt (next)))
996 break;
997 next = GROUP_NEXT_ELEMENT (vinfo_for_stmt (next));
998 }
999 /* If not all stmt in the chain are patterns try to handle
1000 the chain without patterns. */
1001 if (! next)
1002 {
1003 vect_fixup_reduc_chain (first);
1004 LOOP_VINFO_REDUCTION_CHAINS (loop_vinfo)[i]
1005 = STMT_VINFO_RELATED_STMT (vinfo_for_stmt (first));
1006 }
1007 }
1008 }
1009
1010 /* Function vect_get_loop_niters.
1011
1012 Determine how many iterations the loop is executed and place it
1013 in NUMBER_OF_ITERATIONS. Place the number of latch iterations
1014 in NUMBER_OF_ITERATIONSM1.
1015
1016 Return the loop exit condition. */
1017
1018
1019 static gcond *
vect_get_loop_niters(struct loop * loop,tree * number_of_iterations,tree * number_of_iterationsm1)1020 vect_get_loop_niters (struct loop *loop, tree *number_of_iterations,
1021 tree *number_of_iterationsm1)
1022 {
1023 tree niters;
1024
1025 if (dump_enabled_p ())
1026 dump_printf_loc (MSG_NOTE, vect_location,
1027 "=== get_loop_niters ===\n");
1028
1029 niters = number_of_latch_executions (loop);
1030 *number_of_iterationsm1 = niters;
1031
1032 /* We want the number of loop header executions which is the number
1033 of latch executions plus one.
1034 ??? For UINT_MAX latch executions this number overflows to zero
1035 for loops like do { n++; } while (n != 0); */
1036 if (niters && !chrec_contains_undetermined (niters))
1037 niters = fold_build2 (PLUS_EXPR, TREE_TYPE (niters), unshare_expr (niters),
1038 build_int_cst (TREE_TYPE (niters), 1));
1039 *number_of_iterations = niters;
1040
1041 return get_loop_exit_condition (loop);
1042 }
1043
1044
1045 /* Function bb_in_loop_p
1046
1047 Used as predicate for dfs order traversal of the loop bbs. */
1048
1049 static bool
bb_in_loop_p(const_basic_block bb,const void * data)1050 bb_in_loop_p (const_basic_block bb, const void *data)
1051 {
1052 const struct loop *const loop = (const struct loop *)data;
1053 if (flow_bb_inside_loop_p (loop, bb))
1054 return true;
1055 return false;
1056 }
1057
1058
/* Function new_loop_vec_info.

   Create and initialize a new loop_vec_info struct for LOOP, as well as
   stmt_vec_info structs for all the stmts in LOOP.  */

static loop_vec_info
new_loop_vec_info (struct loop *loop)
{
  loop_vec_info res;
  basic_block *bbs;
  gimple_stmt_iterator si;
  unsigned int i, nbbs;

  res = (loop_vec_info) xcalloc (1, sizeof (struct _loop_vec_info));
  res->kind = vec_info::loop;
  LOOP_VINFO_LOOP (res) = loop;

  bbs = get_loop_body (loop);

  /* Create/Update stmt_info for all stmts in the loop.  Each stmt gets
     its uid reset and a fresh stmt_vec_info owned by RES.  */
  for (i = 0; i < loop->num_nodes; i++)
    {
      basic_block bb = bbs[i];

      for (si = gsi_start_phis (bb); !gsi_end_p (si); gsi_next (&si))
	{
	  gimple *phi = gsi_stmt (si);
	  gimple_set_uid (phi, 0);
	  set_vinfo_for_stmt (phi, new_stmt_vec_info (phi, res));
	}

      for (si = gsi_start_bb (bb); !gsi_end_p (si); gsi_next (&si))
	{
	  gimple *stmt = gsi_stmt (si);
	  gimple_set_uid (stmt, 0);
	  set_vinfo_for_stmt (stmt, new_stmt_vec_info (stmt, res));
	}
    }

  /* CHECKME: We want to visit all BBs before their successors (except for
     latch blocks, for which this assertion wouldn't hold).  In the simple
     case of the loop forms we allow, a dfs order of the BBs would be the
     same as reversed postorder traversal, so we are safe.  */

  /* Re-fetch the loop body in DFS order; get_loop_body's order was only
     used for the order-independent stmt_info creation above.  */
  free (bbs);
  bbs = XCNEWVEC (basic_block, loop->num_nodes);
  nbbs = dfs_enumerate_from (loop->header, 0, bb_in_loop_p,
			     bbs, loop->num_nodes, loop);
  gcc_assert (nbbs == loop->num_nodes);

  /* Initialize all remaining fields to their neutral values; the actual
     analysis fills them in later.  */
  LOOP_VINFO_BBS (res) = bbs;
  LOOP_VINFO_NITERSM1 (res) = NULL;
  LOOP_VINFO_NITERS (res) = NULL;
  LOOP_VINFO_NITERS_UNCHANGED (res) = NULL;
  LOOP_VINFO_COST_MODEL_THRESHOLD (res) = 0;
  LOOP_VINFO_VECTORIZABLE_P (res) = 0;
  LOOP_VINFO_PEELING_FOR_ALIGNMENT (res) = 0;
  LOOP_VINFO_VECT_FACTOR (res) = 0;
  LOOP_VINFO_LOOP_NEST (res) = vNULL;
  LOOP_VINFO_DATAREFS (res) = vNULL;
  LOOP_VINFO_DDRS (res) = vNULL;
  LOOP_VINFO_UNALIGNED_DR (res) = NULL;
  LOOP_VINFO_MAY_MISALIGN_STMTS (res) = vNULL;
  LOOP_VINFO_MAY_ALIAS_DDRS (res) = vNULL;
  LOOP_VINFO_GROUPED_STORES (res) = vNULL;
  LOOP_VINFO_REDUCTIONS (res) = vNULL;
  LOOP_VINFO_REDUCTION_CHAINS (res) = vNULL;
  LOOP_VINFO_SLP_INSTANCES (res) = vNULL;
  LOOP_VINFO_SLP_UNROLLING_FACTOR (res) = 1;
  LOOP_VINFO_TARGET_COST_DATA (res) = init_cost (loop);
  LOOP_VINFO_PEELING_FOR_GAPS (res) = false;
  LOOP_VINFO_PEELING_FOR_NITER (res) = false;
  LOOP_VINFO_OPERANDS_SWAPPED (res) = false;

  return res;
}
1135
1136
/* Function destroy_loop_vec_info.

   Free LOOP_VINFO struct, as well as all the stmt_vec_info structs of all the
   stmts in the loop.  If CLEAN_STMTS the per-stmt info is released and any
   operand swapping done during analysis is undone; otherwise only the
   loop-level data is freed.  */

void
destroy_loop_vec_info (loop_vec_info loop_vinfo, bool clean_stmts)
{
  struct loop *loop;
  basic_block *bbs;
  int nbbs;
  gimple_stmt_iterator si;
  int j;
  vec<slp_instance> slp_instances;
  slp_instance instance;
  bool swapped;

  if (!loop_vinfo)
    return;

  loop = LOOP_VINFO_LOOP (loop_vinfo);

  bbs = LOOP_VINFO_BBS (loop_vinfo);
  /* With nbbs == 0 the per-stmt loop below is skipped entirely.  */
  nbbs = clean_stmts ? loop->num_nodes : 0;
  swapped = LOOP_VINFO_OPERANDS_SWAPPED (loop_vinfo);

  for (j = 0; j < nbbs; j++)
    {
      basic_block bb = bbs[j];
      for (si = gsi_start_phis (bb); !gsi_end_p (si); gsi_next (&si))
	free_stmt_vec_info (gsi_stmt (si));

      for (si = gsi_start_bb (bb); !gsi_end_p (si); )
	{
	  gimple *stmt = gsi_stmt (si);

	  /* We may have broken canonical form by moving a constant
	     into RHS1 of a commutative op.  Fix such occurrences.  */
	  if (swapped && is_gimple_assign (stmt))
	    {
	      enum tree_code code = gimple_assign_rhs_code (stmt);

	      if ((code == PLUS_EXPR
		   || code == POINTER_PLUS_EXPR
		   || code == MULT_EXPR)
		  && CONSTANT_CLASS_P (gimple_assign_rhs1 (stmt)))
		swap_ssa_operands (stmt,
				   gimple_assign_rhs1_ptr (stmt),
				   gimple_assign_rhs2_ptr (stmt));
	    }

	  /* Free stmt_vec_info.  */
	  free_stmt_vec_info (stmt);
	  gsi_next (&si);
	}
    }

  /* Release all loop-level containers; the SLP instances own nodes that
     must be freed individually before the vector itself.  */
  free (LOOP_VINFO_BBS (loop_vinfo));
  vect_destroy_datarefs (loop_vinfo);
  free_dependence_relations (LOOP_VINFO_DDRS (loop_vinfo));
  LOOP_VINFO_LOOP_NEST (loop_vinfo).release ();
  LOOP_VINFO_MAY_MISALIGN_STMTS (loop_vinfo).release ();
  LOOP_VINFO_COMP_ALIAS_DDRS (loop_vinfo).release ();
  LOOP_VINFO_MAY_ALIAS_DDRS (loop_vinfo).release ();
  slp_instances = LOOP_VINFO_SLP_INSTANCES (loop_vinfo);
  FOR_EACH_VEC_ELT (slp_instances, j, instance)
    vect_free_slp_instance (instance);

  LOOP_VINFO_SLP_INSTANCES (loop_vinfo).release ();
  LOOP_VINFO_GROUPED_STORES (loop_vinfo).release ();
  LOOP_VINFO_REDUCTIONS (loop_vinfo).release ();
  LOOP_VINFO_REDUCTION_CHAINS (loop_vinfo).release ();

  destroy_cost_data (LOOP_VINFO_TARGET_COST_DATA (loop_vinfo));
  loop_vinfo->scalar_cost_vec.release ();

  /* Finally drop the struct itself and detach it from the loop.  */
  free (loop_vinfo);
  loop->aux = NULL;
}
1216
1217
1218 /* Calculate the cost of one scalar iteration of the loop. */
1219 static void
vect_compute_single_scalar_iteration_cost(loop_vec_info loop_vinfo)1220 vect_compute_single_scalar_iteration_cost (loop_vec_info loop_vinfo)
1221 {
1222 struct loop *loop = LOOP_VINFO_LOOP (loop_vinfo);
1223 basic_block *bbs = LOOP_VINFO_BBS (loop_vinfo);
1224 int nbbs = loop->num_nodes, factor, scalar_single_iter_cost = 0;
1225 int innerloop_iters, i;
1226
1227 /* Count statements in scalar loop. Using this as scalar cost for a single
1228 iteration for now.
1229
1230 TODO: Add outer loop support.
1231
1232 TODO: Consider assigning different costs to different scalar
1233 statements. */
1234
1235 /* FORNOW. */
1236 innerloop_iters = 1;
1237 if (loop->inner)
1238 innerloop_iters = 50; /* FIXME */
1239
1240 for (i = 0; i < nbbs; i++)
1241 {
1242 gimple_stmt_iterator si;
1243 basic_block bb = bbs[i];
1244
1245 if (bb->loop_father == loop->inner)
1246 factor = innerloop_iters;
1247 else
1248 factor = 1;
1249
1250 for (si = gsi_start_bb (bb); !gsi_end_p (si); gsi_next (&si))
1251 {
1252 gimple *stmt = gsi_stmt (si);
1253 stmt_vec_info stmt_info = vinfo_for_stmt (stmt);
1254
1255 if (!is_gimple_assign (stmt) && !is_gimple_call (stmt))
1256 continue;
1257
1258 /* Skip stmts that are not vectorized inside the loop. */
1259 if (stmt_info
1260 && !STMT_VINFO_RELEVANT_P (stmt_info)
1261 && (!STMT_VINFO_LIVE_P (stmt_info)
1262 || !VECTORIZABLE_CYCLE_DEF (STMT_VINFO_DEF_TYPE (stmt_info)))
1263 && !STMT_VINFO_IN_PATTERN_P (stmt_info))
1264 continue;
1265
1266 vect_cost_for_stmt kind;
1267 if (STMT_VINFO_DATA_REF (vinfo_for_stmt (stmt)))
1268 {
1269 if (DR_IS_READ (STMT_VINFO_DATA_REF (vinfo_for_stmt (stmt))))
1270 kind = scalar_load;
1271 else
1272 kind = scalar_store;
1273 }
1274 else
1275 kind = scalar_stmt;
1276
1277 scalar_single_iter_cost
1278 += record_stmt_cost (&LOOP_VINFO_SCALAR_ITERATION_COST (loop_vinfo),
1279 factor, kind, NULL, 0, vect_prologue);
1280 }
1281 }
1282 LOOP_VINFO_SINGLE_SCALAR_ITERATION_COST (loop_vinfo)
1283 = scalar_single_iter_cost;
1284 }
1285
1286
1287 /* Function vect_analyze_loop_form_1.
1288
1289 Verify that certain CFG restrictions hold, including:
1290 - the loop has a pre-header
1291 - the loop has a single entry and exit
1292 - the loop exit condition is simple enough, and the number of iterations
1293 can be analyzed (a countable loop). */
1294
1295 bool
vect_analyze_loop_form_1(struct loop * loop,gcond ** loop_cond,tree * number_of_iterationsm1,tree * number_of_iterations,gcond ** inner_loop_cond)1296 vect_analyze_loop_form_1 (struct loop *loop, gcond **loop_cond,
1297 tree *number_of_iterationsm1,
1298 tree *number_of_iterations, gcond **inner_loop_cond)
1299 {
1300 if (dump_enabled_p ())
1301 dump_printf_loc (MSG_NOTE, vect_location,
1302 "=== vect_analyze_loop_form ===\n");
1303
1304 /* Different restrictions apply when we are considering an inner-most loop,
1305 vs. an outer (nested) loop.
1306 (FORNOW. May want to relax some of these restrictions in the future). */
1307
1308 if (!loop->inner)
1309 {
1310 /* Inner-most loop. We currently require that the number of BBs is
1311 exactly 2 (the header and latch). Vectorizable inner-most loops
1312 look like this:
1313
1314 (pre-header)
1315 |
1316 header <--------+
1317 | | |
1318 | +--> latch --+
1319 |
1320 (exit-bb) */
1321
1322 if (loop->num_nodes != 2)
1323 {
1324 if (dump_enabled_p ())
1325 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
1326 "not vectorized: control flow in loop.\n");
1327 return false;
1328 }
1329
1330 if (empty_block_p (loop->header))
1331 {
1332 if (dump_enabled_p ())
1333 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
1334 "not vectorized: empty loop.\n");
1335 return false;
1336 }
1337 }
1338 else
1339 {
1340 struct loop *innerloop = loop->inner;
1341 edge entryedge;
1342
1343 /* Nested loop. We currently require that the loop is doubly-nested,
1344 contains a single inner loop, and the number of BBs is exactly 5.
1345 Vectorizable outer-loops look like this:
1346
1347 (pre-header)
1348 |
1349 header <---+
1350 | |
1351 inner-loop |
1352 | |
1353 tail ------+
1354 |
1355 (exit-bb)
1356
1357 The inner-loop has the properties expected of inner-most loops
1358 as described above. */
1359
1360 if ((loop->inner)->inner || (loop->inner)->next)
1361 {
1362 if (dump_enabled_p ())
1363 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
1364 "not vectorized: multiple nested loops.\n");
1365 return false;
1366 }
1367
1368 if (loop->num_nodes != 5)
1369 {
1370 if (dump_enabled_p ())
1371 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
1372 "not vectorized: control flow in loop.\n");
1373 return false;
1374 }
1375
1376 entryedge = loop_preheader_edge (innerloop);
1377 if (entryedge->src != loop->header
1378 || !single_exit (innerloop)
1379 || single_exit (innerloop)->dest != EDGE_PRED (loop->latch, 0)->src)
1380 {
1381 if (dump_enabled_p ())
1382 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
1383 "not vectorized: unsupported outerloop form.\n");
1384 return false;
1385 }
1386
1387 /* Analyze the inner-loop. */
1388 tree inner_niterm1, inner_niter;
1389 if (! vect_analyze_loop_form_1 (loop->inner, inner_loop_cond,
1390 &inner_niterm1, &inner_niter, NULL))
1391 {
1392 if (dump_enabled_p ())
1393 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
1394 "not vectorized: Bad inner loop.\n");
1395 return false;
1396 }
1397
1398 if (!expr_invariant_in_loop_p (loop, inner_niter))
1399 {
1400 if (dump_enabled_p ())
1401 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
1402 "not vectorized: inner-loop count not"
1403 " invariant.\n");
1404 return false;
1405 }
1406
1407 if (dump_enabled_p ())
1408 dump_printf_loc (MSG_NOTE, vect_location,
1409 "Considering outer-loop vectorization.\n");
1410 }
1411
1412 if (!single_exit (loop)
1413 || EDGE_COUNT (loop->header->preds) != 2)
1414 {
1415 if (dump_enabled_p ())
1416 {
1417 if (!single_exit (loop))
1418 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
1419 "not vectorized: multiple exits.\n");
1420 else if (EDGE_COUNT (loop->header->preds) != 2)
1421 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
1422 "not vectorized: too many incoming edges.\n");
1423 }
1424 return false;
1425 }
1426
1427 /* We assume that the loop exit condition is at the end of the loop. i.e,
1428 that the loop is represented as a do-while (with a proper if-guard
1429 before the loop if needed), where the loop header contains all the
1430 executable statements, and the latch is empty. */
1431 if (!empty_block_p (loop->latch)
1432 || !gimple_seq_empty_p (phi_nodes (loop->latch)))
1433 {
1434 if (dump_enabled_p ())
1435 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
1436 "not vectorized: latch block not empty.\n");
1437 return false;
1438 }
1439
1440 /* Make sure there exists a single-predecessor exit bb: */
1441 if (!single_pred_p (single_exit (loop)->dest))
1442 {
1443 edge e = single_exit (loop);
1444 if (!(e->flags & EDGE_ABNORMAL))
1445 {
1446 split_loop_exit_edge (e);
1447 if (dump_enabled_p ())
1448 dump_printf (MSG_NOTE, "split exit edge.\n");
1449 }
1450 else
1451 {
1452 if (dump_enabled_p ())
1453 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
1454 "not vectorized: abnormal loop exit edge.\n");
1455 return false;
1456 }
1457 }
1458
1459 *loop_cond = vect_get_loop_niters (loop, number_of_iterations,
1460 number_of_iterationsm1);
1461 if (!*loop_cond)
1462 {
1463 if (dump_enabled_p ())
1464 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
1465 "not vectorized: complicated exit condition.\n");
1466 return false;
1467 }
1468
1469 if (!*number_of_iterations
1470 || chrec_contains_undetermined (*number_of_iterations))
1471 {
1472 if (dump_enabled_p ())
1473 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
1474 "not vectorized: number of iterations cannot be "
1475 "computed.\n");
1476 return false;
1477 }
1478
1479 if (integer_zerop (*number_of_iterations))
1480 {
1481 if (dump_enabled_p ())
1482 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
1483 "not vectorized: number of iterations = 0.\n");
1484 return false;
1485 }
1486
1487 return true;
1488 }
1489
1490 /* Analyze LOOP form and return a loop_vec_info if it is of suitable form. */
1491
1492 loop_vec_info
vect_analyze_loop_form(struct loop * loop)1493 vect_analyze_loop_form (struct loop *loop)
1494 {
1495 tree number_of_iterations, number_of_iterationsm1;
1496 gcond *loop_cond, *inner_loop_cond = NULL;
1497
1498 if (! vect_analyze_loop_form_1 (loop, &loop_cond, &number_of_iterationsm1,
1499 &number_of_iterations, &inner_loop_cond))
1500 return NULL;
1501
1502 loop_vec_info loop_vinfo = new_loop_vec_info (loop);
1503 LOOP_VINFO_NITERSM1 (loop_vinfo) = number_of_iterationsm1;
1504 LOOP_VINFO_NITERS (loop_vinfo) = number_of_iterations;
1505 LOOP_VINFO_NITERS_UNCHANGED (loop_vinfo) = number_of_iterations;
1506
1507 if (!LOOP_VINFO_NITERS_KNOWN_P (loop_vinfo))
1508 {
1509 if (dump_enabled_p ())
1510 {
1511 dump_printf_loc (MSG_NOTE, vect_location,
1512 "Symbolic number of iterations is ");
1513 dump_generic_expr (MSG_NOTE, TDF_DETAILS, number_of_iterations);
1514 dump_printf (MSG_NOTE, "\n");
1515 }
1516 }
1517
1518 STMT_VINFO_TYPE (vinfo_for_stmt (loop_cond)) = loop_exit_ctrl_vec_info_type;
1519 if (inner_loop_cond)
1520 STMT_VINFO_TYPE (vinfo_for_stmt (inner_loop_cond))
1521 = loop_exit_ctrl_vec_info_type;
1522
1523 gcc_assert (!loop->aux);
1524 loop->aux = loop_vinfo;
1525 return loop_vinfo;
1526 }
1527
1528
1529
1530 /* Scan the loop stmts and dependent on whether there are any (non-)SLP
1531 statements update the vectorization factor. */
1532
1533 static void
vect_update_vf_for_slp(loop_vec_info loop_vinfo)1534 vect_update_vf_for_slp (loop_vec_info loop_vinfo)
1535 {
1536 struct loop *loop = LOOP_VINFO_LOOP (loop_vinfo);
1537 basic_block *bbs = LOOP_VINFO_BBS (loop_vinfo);
1538 int nbbs = loop->num_nodes;
1539 unsigned int vectorization_factor;
1540 int i;
1541
1542 if (dump_enabled_p ())
1543 dump_printf_loc (MSG_NOTE, vect_location,
1544 "=== vect_update_vf_for_slp ===\n");
1545
1546 vectorization_factor = LOOP_VINFO_VECT_FACTOR (loop_vinfo);
1547 gcc_assert (vectorization_factor != 0);
1548
1549 /* If all the stmts in the loop can be SLPed, we perform only SLP, and
1550 vectorization factor of the loop is the unrolling factor required by
1551 the SLP instances. If that unrolling factor is 1, we say, that we
1552 perform pure SLP on loop - cross iteration parallelism is not
1553 exploited. */
1554 bool only_slp_in_loop = true;
1555 for (i = 0; i < nbbs; i++)
1556 {
1557 basic_block bb = bbs[i];
1558 for (gimple_stmt_iterator si = gsi_start_bb (bb); !gsi_end_p (si);
1559 gsi_next (&si))
1560 {
1561 gimple *stmt = gsi_stmt (si);
1562 stmt_vec_info stmt_info = vinfo_for_stmt (stmt);
1563 if (STMT_VINFO_IN_PATTERN_P (stmt_info)
1564 && STMT_VINFO_RELATED_STMT (stmt_info))
1565 {
1566 stmt = STMT_VINFO_RELATED_STMT (stmt_info);
1567 stmt_info = vinfo_for_stmt (stmt);
1568 }
1569 if ((STMT_VINFO_RELEVANT_P (stmt_info)
1570 || VECTORIZABLE_CYCLE_DEF (STMT_VINFO_DEF_TYPE (stmt_info)))
1571 && !PURE_SLP_STMT (stmt_info))
1572 /* STMT needs both SLP and loop-based vectorization. */
1573 only_slp_in_loop = false;
1574 }
1575 }
1576
1577 if (only_slp_in_loop)
1578 vectorization_factor = LOOP_VINFO_SLP_UNROLLING_FACTOR (loop_vinfo);
1579 else
1580 vectorization_factor
1581 = least_common_multiple (vectorization_factor,
1582 LOOP_VINFO_SLP_UNROLLING_FACTOR (loop_vinfo));
1583
1584 LOOP_VINFO_VECT_FACTOR (loop_vinfo) = vectorization_factor;
1585 if (dump_enabled_p ())
1586 dump_printf_loc (MSG_NOTE, vect_location,
1587 "Updating vectorization factor to %d\n",
1588 vectorization_factor);
1589 }
1590
/* Function vect_analyze_loop_operations.

   Scan the loop stmts and make sure they are all vectorizable.
   Returns false (with a dump message) as soon as a non-vectorizable
   stmt is found, or when nothing in the loop needs vectorizing.  */

static bool
vect_analyze_loop_operations (loop_vec_info loop_vinfo)
{
  struct loop *loop = LOOP_VINFO_LOOP (loop_vinfo);
  basic_block *bbs = LOOP_VINFO_BBS (loop_vinfo);
  int nbbs = loop->num_nodes;
  int i;
  stmt_vec_info stmt_info;
  bool need_to_vectorize = false;
  bool ok;

  if (dump_enabled_p ())
    dump_printf_loc (MSG_NOTE, vect_location,
		     "=== vect_analyze_loop_operations ===\n");

  for (i = 0; i < nbbs; i++)
    {
      basic_block bb = bbs[i];

      /* First check all the PHIs of this block.  */
      for (gphi_iterator si = gsi_start_phis (bb); !gsi_end_p (si);
	   gsi_next (&si))
        {
          gphi *phi = si.phi ();
          ok = true;

          stmt_info = vinfo_for_stmt (phi);
          if (dump_enabled_p ())
            {
              dump_printf_loc (MSG_NOTE, vect_location, "examining phi: ");
              dump_gimple_stmt (MSG_NOTE, TDF_SLIM, phi, 0);
              dump_printf (MSG_NOTE, "\n");
            }
	  /* Virtual PHIs carry no computation to vectorize.  */
          if (virtual_operand_p (gimple_phi_result (phi)))
	    continue;

          /* Inner-loop loop-closed exit phi in outer-loop vectorization
             (i.e., a phi in the tail of the outer-loop).  */
          if (! is_loop_header_bb_p (bb))
            {
              /* FORNOW: we currently don't support the case that these phis
                 are not used in the outerloop (unless it is double reduction,
                 i.e., this phi is vect_reduction_def), cause this case
                 requires to actually do something here.  */
              if ((!STMT_VINFO_RELEVANT_P (stmt_info)
                   || STMT_VINFO_LIVE_P (stmt_info))
                  && STMT_VINFO_DEF_TYPE (stmt_info)
                     != vect_double_reduction_def)
                {
                  if (dump_enabled_p ())
		    dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
				     "Unsupported loop-closed phi in "
				     "outer-loop.\n");
                  return false;
                }

              /* If PHI is used in the outer loop, we check that its operand
                 is defined in the inner loop.  */
              if (STMT_VINFO_RELEVANT_P (stmt_info))
                {
                  tree phi_op;
		  gimple *op_def_stmt;

		  /* A loop-closed phi has exactly one argument.  */
                  if (gimple_phi_num_args (phi) != 1)
                    return false;

                  phi_op = PHI_ARG_DEF (phi, 0);
                  if (TREE_CODE (phi_op) != SSA_NAME)
                    return false;

                  op_def_stmt = SSA_NAME_DEF_STMT (phi_op);
		  if (gimple_nop_p (op_def_stmt)
		      || !flow_bb_inside_loop_p (loop, gimple_bb (op_def_stmt))
		      || !vinfo_for_stmt (op_def_stmt))
                    return false;

		  /* The defining stmt must itself be marked as used in
		     the outer loop (possibly by reduction).  */
                  if (STMT_VINFO_RELEVANT (vinfo_for_stmt (op_def_stmt))
                        != vect_used_in_outer
                      && STMT_VINFO_RELEVANT (vinfo_for_stmt (op_def_stmt))
                           != vect_used_in_outer_by_reduction)
                    return false;
                }

              continue;
            }

          gcc_assert (stmt_info);

          if (STMT_VINFO_LIVE_P (stmt_info))
            {
              /* FORNOW: not yet supported.  */
              if (dump_enabled_p ())
		dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
				 "not vectorized: value used after loop.\n");
              return false;
            }

          if (STMT_VINFO_RELEVANT (stmt_info) == vect_used_in_scope
              && STMT_VINFO_DEF_TYPE (stmt_info) != vect_induction_def)
            {
              /* A scalar-dependence cycle that we don't support.  */
              if (dump_enabled_p ())
		dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
				 "not vectorized: scalar dependence cycle.\n");
              return false;
            }

          if (STMT_VINFO_RELEVANT_P (stmt_info))
            {
              need_to_vectorize = true;
	      /* Only induction PHIs need (and get) their own
		 vectorization analysis here.  */
              if (STMT_VINFO_DEF_TYPE (stmt_info) == vect_induction_def)
                ok = vectorizable_induction (phi, NULL, NULL);
            }

          if (!ok)
            {
              if (dump_enabled_p ())
                {
		  dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
				   "not vectorized: relevant phi not "
				   "supported: ");
                  dump_gimple_stmt (MSG_MISSED_OPTIMIZATION, TDF_SLIM, phi, 0);
                  dump_printf (MSG_MISSED_OPTIMIZATION, "\n");
                }
	      return false;
            }
        }

      /* Then check all remaining stmts of the block.  */
      for (gimple_stmt_iterator si = gsi_start_bb (bb); !gsi_end_p (si);
	   gsi_next (&si))
        {
	  gimple *stmt = gsi_stmt (si);
	  if (!gimple_clobber_p (stmt)
	      && !vect_analyze_stmt (stmt, &need_to_vectorize, NULL))
	    return false;
        }
    } /* bbs */

  /* All operations in the loop are either irrelevant (deal with loop
     control, or dead), or only used outside the loop and can be moved
     out of the loop (e.g. invariants, inductions).  The loop can be
     optimized away by scalar optimizations.  We're better off not
     touching this loop.  */
  if (!need_to_vectorize)
    {
      if (dump_enabled_p ())
        dump_printf_loc (MSG_NOTE, vect_location,
			 "All the computation can be taken out of the loop.\n");
      if (dump_enabled_p ())
	dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
			 "not vectorized: redundant loop. no profit to "
			 "vectorize.\n");
      return false;
    }

  return true;
}
1751
1752
1753 /* Function vect_analyze_loop_2.
1754
1755 Apply a set of analyses on LOOP, and create a loop_vec_info struct
1756 for it. The different analyses will record information in the
1757 loop_vec_info struct. */
1758 static bool
vect_analyze_loop_2(loop_vec_info loop_vinfo,bool & fatal)1759 vect_analyze_loop_2 (loop_vec_info loop_vinfo, bool &fatal)
1760 {
1761 bool ok;
1762 int max_vf = MAX_VECTORIZATION_FACTOR;
1763 int min_vf = 2;
1764 unsigned int n_stmts = 0;
1765
1766 /* The first group of checks is independent of the vector size. */
1767 fatal = true;
1768
1769 /* Find all data references in the loop (which correspond to vdefs/vuses)
1770 and analyze their evolution in the loop. */
1771
1772 basic_block *bbs = LOOP_VINFO_BBS (loop_vinfo);
1773
1774 loop_p loop = LOOP_VINFO_LOOP (loop_vinfo);
1775 if (!find_loop_nest (loop, &LOOP_VINFO_LOOP_NEST (loop_vinfo)))
1776 {
1777 if (dump_enabled_p ())
1778 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
1779 "not vectorized: loop nest containing two "
1780 "or more consecutive inner loops cannot be "
1781 "vectorized\n");
1782 return false;
1783 }
1784
1785 for (unsigned i = 0; i < loop->num_nodes; i++)
1786 for (gimple_stmt_iterator gsi = gsi_start_bb (bbs[i]);
1787 !gsi_end_p (gsi); gsi_next (&gsi))
1788 {
1789 gimple *stmt = gsi_stmt (gsi);
1790 if (is_gimple_debug (stmt))
1791 continue;
1792 ++n_stmts;
1793 if (!find_data_references_in_stmt (loop, stmt,
1794 &LOOP_VINFO_DATAREFS (loop_vinfo)))
1795 {
1796 if (is_gimple_call (stmt) && loop->safelen)
1797 {
1798 tree fndecl = gimple_call_fndecl (stmt), op;
1799 if (fndecl != NULL_TREE)
1800 {
1801 cgraph_node *node = cgraph_node::get (fndecl);
1802 if (node != NULL && node->simd_clones != NULL)
1803 {
1804 unsigned int j, n = gimple_call_num_args (stmt);
1805 for (j = 0; j < n; j++)
1806 {
1807 op = gimple_call_arg (stmt, j);
1808 if (DECL_P (op)
1809 || (REFERENCE_CLASS_P (op)
1810 && get_base_address (op)))
1811 break;
1812 }
1813 op = gimple_call_lhs (stmt);
1814 /* Ignore #pragma omp declare simd functions
1815 if they don't have data references in the
1816 call stmt itself. */
1817 if (j == n
1818 && !(op
1819 && (DECL_P (op)
1820 || (REFERENCE_CLASS_P (op)
1821 && get_base_address (op)))))
1822 continue;
1823 }
1824 }
1825 }
1826 if (dump_enabled_p ())
1827 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
1828 "not vectorized: loop contains function "
1829 "calls or data references that cannot "
1830 "be analyzed\n");
1831 return false;
1832 }
1833 }
1834
1835 /* Analyze the data references and also adjust the minimal
1836 vectorization factor according to the loads and stores. */
1837
1838 ok = vect_analyze_data_refs (loop_vinfo, &min_vf);
1839 if (!ok)
1840 {
1841 if (dump_enabled_p ())
1842 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
1843 "bad data references.\n");
1844 return false;
1845 }
1846
1847 /* Classify all cross-iteration scalar data-flow cycles.
1848 Cross-iteration cycles caused by virtual phis are analyzed separately. */
1849 vect_analyze_scalar_cycles (loop_vinfo);
1850
1851 vect_pattern_recog (loop_vinfo);
1852
1853 vect_fixup_scalar_cycles_with_patterns (loop_vinfo);
1854
1855 /* Analyze the access patterns of the data-refs in the loop (consecutive,
1856 complex, etc.). FORNOW: Only handle consecutive access pattern. */
1857
1858 ok = vect_analyze_data_ref_accesses (loop_vinfo);
1859 if (!ok)
1860 {
1861 if (dump_enabled_p ())
1862 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
1863 "bad data access.\n");
1864 return false;
1865 }
1866
1867 /* Data-flow analysis to detect stmts that do not need to be vectorized. */
1868
1869 ok = vect_mark_stmts_to_be_vectorized (loop_vinfo);
1870 if (!ok)
1871 {
1872 if (dump_enabled_p ())
1873 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
1874 "unexpected pattern.\n");
1875 return false;
1876 }
1877
1878 /* While the rest of the analysis below depends on it in some way. */
1879 fatal = false;
1880
1881 /* Analyze data dependences between the data-refs in the loop
1882 and adjust the maximum vectorization factor according to
1883 the dependences.
1884 FORNOW: fail at the first data dependence that we encounter. */
1885
1886 ok = vect_analyze_data_ref_dependences (loop_vinfo, &max_vf);
1887 if (!ok
1888 || max_vf < min_vf)
1889 {
1890 if (dump_enabled_p ())
1891 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
1892 "bad data dependence.\n");
1893 return false;
1894 }
1895
1896 ok = vect_determine_vectorization_factor (loop_vinfo);
1897 if (!ok)
1898 {
1899 if (dump_enabled_p ())
1900 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
1901 "can't determine vectorization factor.\n");
1902 return false;
1903 }
1904 if (max_vf < LOOP_VINFO_VECT_FACTOR (loop_vinfo))
1905 {
1906 if (dump_enabled_p ())
1907 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
1908 "bad data dependence.\n");
1909 return false;
1910 }
1911
1912 /* Compute the scalar iteration cost. */
1913 vect_compute_single_scalar_iteration_cost (loop_vinfo);
1914
1915 int saved_vectorization_factor = LOOP_VINFO_VECT_FACTOR (loop_vinfo);
1916 HOST_WIDE_INT estimated_niter;
1917 unsigned th;
1918 int min_scalar_loop_bound;
1919
1920 /* Check the SLP opportunities in the loop, analyze and build SLP trees. */
1921 ok = vect_analyze_slp (loop_vinfo, n_stmts);
1922 if (!ok)
1923 return false;
1924
1925 /* If there are any SLP instances mark them as pure_slp. */
1926 bool slp = vect_make_slp_decision (loop_vinfo);
1927 if (slp)
1928 {
1929 /* Find stmts that need to be both vectorized and SLPed. */
1930 vect_detect_hybrid_slp (loop_vinfo);
1931
1932 /* Update the vectorization factor based on the SLP decision. */
1933 vect_update_vf_for_slp (loop_vinfo);
1934 }
1935
1936 /* This is the point where we can re-start analysis with SLP forced off. */
1937 start_over:
1938
1939 /* Now the vectorization factor is final. */
1940 unsigned vectorization_factor = LOOP_VINFO_VECT_FACTOR (loop_vinfo);
1941 gcc_assert (vectorization_factor != 0);
1942
1943 if (LOOP_VINFO_NITERS_KNOWN_P (loop_vinfo) && dump_enabled_p ())
1944 dump_printf_loc (MSG_NOTE, vect_location,
1945 "vectorization_factor = %d, niters = "
1946 HOST_WIDE_INT_PRINT_DEC "\n", vectorization_factor,
1947 LOOP_VINFO_INT_NITERS (loop_vinfo));
1948
1949 HOST_WIDE_INT max_niter
1950 = max_stmt_executions_int (LOOP_VINFO_LOOP (loop_vinfo));
1951 if ((LOOP_VINFO_NITERS_KNOWN_P (loop_vinfo)
1952 && (LOOP_VINFO_INT_NITERS (loop_vinfo) < vectorization_factor))
1953 || (max_niter != -1
1954 && (unsigned HOST_WIDE_INT) max_niter < vectorization_factor))
1955 {
1956 if (dump_enabled_p ())
1957 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
1958 "not vectorized: iteration count smaller than "
1959 "vectorization factor.\n");
1960 return false;
1961 }
1962
1963 /* Analyze the alignment of the data-refs in the loop.
1964 Fail if a data reference is found that cannot be vectorized. */
1965
1966 ok = vect_analyze_data_refs_alignment (loop_vinfo);
1967 if (!ok)
1968 {
1969 if (dump_enabled_p ())
1970 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
1971 "bad data alignment.\n");
1972 return false;
1973 }
1974
1975 /* Prune the list of ddrs to be tested at run-time by versioning for alias.
1976 It is important to call pruning after vect_analyze_data_ref_accesses,
1977 since we use grouping information gathered by interleaving analysis. */
1978 ok = vect_prune_runtime_alias_test_list (loop_vinfo);
1979 if (!ok)
1980 {
1981 if (dump_enabled_p ())
1982 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
1983 "number of versioning for alias "
1984 "run-time tests exceeds %d "
1985 "(--param vect-max-version-for-alias-checks)\n",
1986 PARAM_VALUE (PARAM_VECT_MAX_VERSION_FOR_ALIAS_CHECKS));
1987 return false;
1988 }
1989
1990 /* This pass will decide on using loop versioning and/or loop peeling in
1991 order to enhance the alignment of data references in the loop. */
1992 ok = vect_enhance_data_refs_alignment (loop_vinfo);
1993 if (!ok)
1994 {
1995 if (dump_enabled_p ())
1996 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
1997 "bad data alignment.\n");
1998 return false;
1999 }
2000
2001 if (slp)
2002 {
2003 /* Analyze operations in the SLP instances. Note this may
2004 remove unsupported SLP instances which makes the above
2005 SLP kind detection invalid. */
2006 unsigned old_size = LOOP_VINFO_SLP_INSTANCES (loop_vinfo).length ();
2007 vect_slp_analyze_operations (LOOP_VINFO_SLP_INSTANCES (loop_vinfo),
2008 LOOP_VINFO_TARGET_COST_DATA (loop_vinfo));
2009 if (LOOP_VINFO_SLP_INSTANCES (loop_vinfo).length () != old_size)
2010 goto again;
2011 }
2012
2013 /* Scan all the remaining operations in the loop that are not subject
2014 to SLP and make sure they are vectorizable. */
2015 ok = vect_analyze_loop_operations (loop_vinfo);
2016 if (!ok)
2017 {
2018 if (dump_enabled_p ())
2019 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
2020 "bad operation or unsupported loop bound.\n");
2021 return false;
2022 }
2023
2024 /* Analyze cost. Decide if worth while to vectorize. */
2025 int min_profitable_estimate, min_profitable_iters;
2026 vect_estimate_min_profitable_iters (loop_vinfo, &min_profitable_iters,
2027 &min_profitable_estimate);
2028
2029 if (min_profitable_iters < 0)
2030 {
2031 if (dump_enabled_p ())
2032 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
2033 "not vectorized: vectorization not profitable.\n");
2034 if (dump_enabled_p ())
2035 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
2036 "not vectorized: vector version will never be "
2037 "profitable.\n");
2038 goto again;
2039 }
2040
2041 min_scalar_loop_bound = ((PARAM_VALUE (PARAM_MIN_VECT_LOOP_BOUND)
2042 * vectorization_factor) - 1);
2043
2044 /* Use the cost model only if it is more conservative than user specified
2045 threshold. */
2046 th = (unsigned) min_scalar_loop_bound;
2047 if (min_profitable_iters
2048 && (!min_scalar_loop_bound
2049 || min_profitable_iters > min_scalar_loop_bound))
2050 th = (unsigned) min_profitable_iters;
2051
2052 LOOP_VINFO_COST_MODEL_THRESHOLD (loop_vinfo) = th;
2053
2054 if (LOOP_VINFO_NITERS_KNOWN_P (loop_vinfo)
2055 && LOOP_VINFO_INT_NITERS (loop_vinfo) <= th)
2056 {
2057 if (dump_enabled_p ())
2058 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
2059 "not vectorized: vectorization not profitable.\n");
2060 if (dump_enabled_p ())
2061 dump_printf_loc (MSG_NOTE, vect_location,
2062 "not vectorized: iteration count smaller than user "
2063 "specified loop bound parameter or minimum profitable "
2064 "iterations (whichever is more conservative).\n");
2065 goto again;
2066 }
2067
2068 estimated_niter
2069 = estimated_stmt_executions_int (LOOP_VINFO_LOOP (loop_vinfo));
2070 if (estimated_niter != -1
2071 && ((unsigned HOST_WIDE_INT) estimated_niter
2072 <= MAX (th, (unsigned)min_profitable_estimate)))
2073 {
2074 if (dump_enabled_p ())
2075 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
2076 "not vectorized: estimated iteration count too "
2077 "small.\n");
2078 if (dump_enabled_p ())
2079 dump_printf_loc (MSG_NOTE, vect_location,
2080 "not vectorized: estimated iteration count smaller "
2081 "than specified loop bound parameter or minimum "
2082 "profitable iterations (whichever is more "
2083 "conservative).\n");
2084 goto again;
2085 }
2086
2087 /* Decide whether we need to create an epilogue loop to handle
2088 remaining scalar iterations. */
2089 th = ((LOOP_VINFO_COST_MODEL_THRESHOLD (loop_vinfo) + 1)
2090 / LOOP_VINFO_VECT_FACTOR (loop_vinfo))
2091 * LOOP_VINFO_VECT_FACTOR (loop_vinfo);
2092
2093 if (LOOP_VINFO_NITERS_KNOWN_P (loop_vinfo)
2094 && LOOP_VINFO_PEELING_FOR_ALIGNMENT (loop_vinfo) > 0)
2095 {
2096 if (ctz_hwi (LOOP_VINFO_INT_NITERS (loop_vinfo)
2097 - LOOP_VINFO_PEELING_FOR_ALIGNMENT (loop_vinfo))
2098 < exact_log2 (LOOP_VINFO_VECT_FACTOR (loop_vinfo)))
2099 LOOP_VINFO_PEELING_FOR_NITER (loop_vinfo) = true;
2100 }
2101 else if (LOOP_VINFO_PEELING_FOR_ALIGNMENT (loop_vinfo)
2102 || (tree_ctz (LOOP_VINFO_NITERS (loop_vinfo))
2103 < (unsigned)exact_log2 (LOOP_VINFO_VECT_FACTOR (loop_vinfo))
2104 /* In case of versioning, check if the maximum number of
2105 iterations is greater than th. If they are identical,
2106 the epilogue is unnecessary. */
2107 && ((!LOOP_REQUIRES_VERSIONING_FOR_ALIAS (loop_vinfo)
2108 && !LOOP_REQUIRES_VERSIONING_FOR_ALIGNMENT (loop_vinfo))
2109 || (unsigned HOST_WIDE_INT) max_niter > th)))
2110 LOOP_VINFO_PEELING_FOR_NITER (loop_vinfo) = true;
2111
2112 /* If an epilogue loop is required make sure we can create one. */
2113 if (LOOP_VINFO_PEELING_FOR_GAPS (loop_vinfo)
2114 || LOOP_VINFO_PEELING_FOR_NITER (loop_vinfo))
2115 {
2116 if (dump_enabled_p ())
2117 dump_printf_loc (MSG_NOTE, vect_location, "epilog loop required\n");
2118 if (!vect_can_advance_ivs_p (loop_vinfo)
2119 || !slpeel_can_duplicate_loop_p (LOOP_VINFO_LOOP (loop_vinfo),
2120 single_exit (LOOP_VINFO_LOOP
2121 (loop_vinfo))))
2122 {
2123 if (dump_enabled_p ())
2124 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
2125 "not vectorized: can't create required "
2126 "epilog loop\n");
2127 goto again;
2128 }
2129 }
2130
2131 gcc_assert (vectorization_factor
2132 == (unsigned)LOOP_VINFO_VECT_FACTOR (loop_vinfo));
2133
2134 /* Ok to vectorize! */
2135 return true;
2136
2137 again:
2138 /* Try again with SLP forced off but if we didn't do any SLP there is
2139 no point in re-trying. */
2140 if (!slp)
2141 return false;
2142
2143 /* If there are reduction chains re-trying will fail anyway. */
2144 if (! LOOP_VINFO_REDUCTION_CHAINS (loop_vinfo).is_empty ())
2145 return false;
2146
2147 /* Likewise if the grouped loads or stores in the SLP cannot be handled
2148 via interleaving or lane instructions. */
2149 slp_instance instance;
2150 slp_tree node;
2151 unsigned i, j;
2152 FOR_EACH_VEC_ELT (LOOP_VINFO_SLP_INSTANCES (loop_vinfo), i, instance)
2153 {
2154 stmt_vec_info vinfo;
2155 vinfo = vinfo_for_stmt
2156 (SLP_TREE_SCALAR_STMTS (SLP_INSTANCE_TREE (instance))[0]);
2157 if (! STMT_VINFO_GROUPED_ACCESS (vinfo))
2158 continue;
2159 vinfo = vinfo_for_stmt (STMT_VINFO_GROUP_FIRST_ELEMENT (vinfo));
2160 unsigned int size = STMT_VINFO_GROUP_SIZE (vinfo);
2161 tree vectype = STMT_VINFO_VECTYPE (vinfo);
2162 if (! vect_store_lanes_supported (vectype, size)
2163 && ! vect_grouped_store_supported (vectype, size))
2164 return false;
2165 FOR_EACH_VEC_ELT (SLP_INSTANCE_LOADS (instance), j, node)
2166 {
2167 vinfo = vinfo_for_stmt (SLP_TREE_SCALAR_STMTS (node)[0]);
2168 vinfo = vinfo_for_stmt (STMT_VINFO_GROUP_FIRST_ELEMENT (vinfo));
2169 size = STMT_VINFO_GROUP_SIZE (vinfo);
2170 vectype = STMT_VINFO_VECTYPE (vinfo);
2171 if (! vect_load_lanes_supported (vectype, size)
2172 && ! vect_grouped_load_supported (vectype, size))
2173 return false;
2174 }
2175 }
2176
2177 if (dump_enabled_p ())
2178 dump_printf_loc (MSG_NOTE, vect_location,
2179 "re-trying with SLP disabled\n");
2180
2181 /* Roll back state appropriately. No SLP this time. */
2182 slp = false;
2183 /* Restore vectorization factor as it were without SLP. */
2184 LOOP_VINFO_VECT_FACTOR (loop_vinfo) = saved_vectorization_factor;
2185 /* Free the SLP instances. */
2186 FOR_EACH_VEC_ELT (LOOP_VINFO_SLP_INSTANCES (loop_vinfo), j, instance)
2187 vect_free_slp_instance (instance);
2188 LOOP_VINFO_SLP_INSTANCES (loop_vinfo).release ();
2189 /* Reset SLP type to loop_vect on all stmts. */
2190 for (i = 0; i < LOOP_VINFO_LOOP (loop_vinfo)->num_nodes; ++i)
2191 {
2192 basic_block bb = LOOP_VINFO_BBS (loop_vinfo)[i];
2193 for (gimple_stmt_iterator si = gsi_start_bb (bb);
2194 !gsi_end_p (si); gsi_next (&si))
2195 {
2196 stmt_vec_info stmt_info = vinfo_for_stmt (gsi_stmt (si));
2197 STMT_SLP_TYPE (stmt_info) = loop_vect;
2198 if (STMT_VINFO_IN_PATTERN_P (stmt_info))
2199 {
2200 stmt_info = vinfo_for_stmt (STMT_VINFO_RELATED_STMT (stmt_info));
2201 STMT_SLP_TYPE (stmt_info) = loop_vect;
2202 for (gimple_stmt_iterator pi
2203 = gsi_start (STMT_VINFO_PATTERN_DEF_SEQ (stmt_info));
2204 !gsi_end_p (pi); gsi_next (&pi))
2205 {
2206 gimple *pstmt = gsi_stmt (pi);
2207 STMT_SLP_TYPE (vinfo_for_stmt (pstmt)) = loop_vect;
2208 }
2209 }
2210 }
2211 }
2212 /* Free optimized alias test DDRS. */
2213 LOOP_VINFO_COMP_ALIAS_DDRS (loop_vinfo).release ();
2214 /* Reset target cost data. */
2215 destroy_cost_data (LOOP_VINFO_TARGET_COST_DATA (loop_vinfo));
2216 LOOP_VINFO_TARGET_COST_DATA (loop_vinfo)
2217 = init_cost (LOOP_VINFO_LOOP (loop_vinfo));
2218 /* Reset assorted flags. */
2219 LOOP_VINFO_PEELING_FOR_NITER (loop_vinfo) = false;
2220 LOOP_VINFO_PEELING_FOR_GAPS (loop_vinfo) = false;
2221 LOOP_VINFO_COST_MODEL_THRESHOLD (loop_vinfo) = 0;
2222
2223 goto start_over;
2224 }
2225
2226 /* Function vect_analyze_loop.
2227
2228 Apply a set of analyses on LOOP, and create a loop_vec_info struct
2229 for it. The different analyses will record information in the
2230 loop_vec_info struct. */
2231 loop_vec_info
vect_analyze_loop(struct loop * loop)2232 vect_analyze_loop (struct loop *loop)
2233 {
2234 loop_vec_info loop_vinfo;
2235 unsigned int vector_sizes;
2236
2237 /* Autodetect first vector size we try. */
2238 current_vector_size = 0;
2239 vector_sizes = targetm.vectorize.autovectorize_vector_sizes ();
2240
2241 if (dump_enabled_p ())
2242 dump_printf_loc (MSG_NOTE, vect_location,
2243 "===== analyze_loop_nest =====\n");
2244
2245 if (loop_outer (loop)
2246 && loop_vec_info_for_loop (loop_outer (loop))
2247 && LOOP_VINFO_VECTORIZABLE_P (loop_vec_info_for_loop (loop_outer (loop))))
2248 {
2249 if (dump_enabled_p ())
2250 dump_printf_loc (MSG_NOTE, vect_location,
2251 "outer-loop already vectorized.\n");
2252 return NULL;
2253 }
2254
2255 while (1)
2256 {
2257 /* Check the CFG characteristics of the loop (nesting, entry/exit). */
2258 loop_vinfo = vect_analyze_loop_form (loop);
2259 if (!loop_vinfo)
2260 {
2261 if (dump_enabled_p ())
2262 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
2263 "bad loop form.\n");
2264 return NULL;
2265 }
2266
2267 bool fatal = false;
2268 if (vect_analyze_loop_2 (loop_vinfo, fatal))
2269 {
2270 LOOP_VINFO_VECTORIZABLE_P (loop_vinfo) = 1;
2271
2272 return loop_vinfo;
2273 }
2274
2275 destroy_loop_vec_info (loop_vinfo, true);
2276
2277 vector_sizes &= ~current_vector_size;
2278 if (fatal
2279 || vector_sizes == 0
2280 || current_vector_size == 0)
2281 return NULL;
2282
2283 /* Try the next biggest vector size. */
2284 current_vector_size = 1 << floor_log2 (vector_sizes);
2285 if (dump_enabled_p ())
2286 dump_printf_loc (MSG_NOTE, vect_location,
2287 "***** Re-trying analysis with "
2288 "vector size %d\n", current_vector_size);
2289 }
2290 }
2291
2292
2293 /* Function reduction_code_for_scalar_code
2294
2295 Input:
2296 CODE - tree_code of a reduction operations.
2297
2298 Output:
2299 REDUC_CODE - the corresponding tree-code to be used to reduce the
2300 vector of partial results into a single scalar result, or ERROR_MARK
2301 if the operation is a supported reduction operation, but does not have
2302 such a tree-code.
2303
2304 Return FALSE if CODE currently cannot be vectorized as reduction. */
2305
2306 static bool
reduction_code_for_scalar_code(enum tree_code code,enum tree_code * reduc_code)2307 reduction_code_for_scalar_code (enum tree_code code,
2308 enum tree_code *reduc_code)
2309 {
2310 switch (code)
2311 {
2312 case MAX_EXPR:
2313 *reduc_code = REDUC_MAX_EXPR;
2314 return true;
2315
2316 case MIN_EXPR:
2317 *reduc_code = REDUC_MIN_EXPR;
2318 return true;
2319
2320 case PLUS_EXPR:
2321 *reduc_code = REDUC_PLUS_EXPR;
2322 return true;
2323
2324 case MULT_EXPR:
2325 case MINUS_EXPR:
2326 case BIT_IOR_EXPR:
2327 case BIT_XOR_EXPR:
2328 case BIT_AND_EXPR:
2329 *reduc_code = ERROR_MARK;
2330 return true;
2331
2332 default:
2333 return false;
2334 }
2335 }
2336
2337
2338 /* Error reporting helper for vect_is_simple_reduction below. GIMPLE statement
2339 STMT is printed with a message MSG. */
2340
2341 static void
report_vect_op(int msg_type,gimple * stmt,const char * msg)2342 report_vect_op (int msg_type, gimple *stmt, const char *msg)
2343 {
2344 dump_printf_loc (msg_type, vect_location, "%s", msg);
2345 dump_gimple_stmt (msg_type, TDF_SLIM, stmt, 0);
2346 dump_printf (msg_type, "\n");
2347 }
2348
2349
/* Detect SLP reduction of the form:

   #a1 = phi <a5, a0>
   a2 = operation (a1)
   a3 = operation (a2)
   a4 = operation (a3)
   a5 = operation (a4)

   #a = phi <a5>

   PHI is the reduction phi node (#a1 = phi <a5, a0> above)
   FIRST_STMT is the first reduction stmt in the chain
   (a2 = operation (a1)).

   Return TRUE if a reduction chain was detected.  On success the chain
   is recorded via the GROUP_* links of the participating stmts and
   pushed onto LOOP_VINFO_REDUCTION_CHAINS of LOOP_INFO.  */

static bool
vect_is_slp_reduction (loop_vec_info loop_info, gimple *phi,
		       gimple *first_stmt)
{
  struct loop *loop = (gimple_bb (phi))->loop_father;
  struct loop *vect_loop = LOOP_VINFO_LOOP (loop_info);
  enum tree_code code;
  gimple *current_stmt = NULL, *loop_use_stmt = NULL, *first, *next_stmt;
  stmt_vec_info use_stmt_info, current_stmt_info;
  tree lhs;
  imm_use_iterator imm_iter;
  use_operand_p use_p;
  int nloop_uses, size = 0, n_out_of_loop_uses;
  bool found = false;

  /* Only handle the innermost-loop (non-outer-loop) vectorization case.  */
  if (loop != vect_loop)
    return false;

  /* Walk the single-use chain starting at the PHI result, linking each
     visited stmt into a GROUP_* chain, until we arrive back at PHI.  */
  lhs = PHI_RESULT (phi);
  code = gimple_assign_rhs_code (first_stmt);
  while (1)
    {
      nloop_uses = 0;
      n_out_of_loop_uses = 0;
      FOR_EACH_IMM_USE_FAST (use_p, imm_iter, lhs)
        {
	  gimple *use_stmt = USE_STMT (use_p);
          if (is_gimple_debug (use_stmt))
            continue;

          /* Check if we got back to the reduction phi.  */
	  if (use_stmt == phi)
            {
	      loop_use_stmt = use_stmt;
              found = true;
              break;
            }

          if (flow_bb_inside_loop_p (loop, gimple_bb (use_stmt)))
            {
	      loop_use_stmt = use_stmt;
	      nloop_uses++;
            }
          else
            n_out_of_loop_uses++;

          /* There can be either a single use in the loop or two uses in
             phi nodes.  */
          if (nloop_uses > 1 || (n_out_of_loop_uses && nloop_uses))
            return false;
        }

      if (found)
	break;

      /* We reached a statement with no loop uses.  */
      if (nloop_uses == 0)
	return false;

      /* This is a loop exit phi, and we haven't reached the reduction phi.  */
      if (gimple_code (loop_use_stmt) == GIMPLE_PHI)
        return false;

      /* Every stmt in the chain must perform the same operation CODE.  */
      if (!is_gimple_assign (loop_use_stmt)
	  || code != gimple_assign_rhs_code (loop_use_stmt)
	  || !flow_bb_inside_loop_p (loop, gimple_bb (loop_use_stmt)))
        return false;

      /* Insert USE_STMT into reduction chain.  */
      use_stmt_info = vinfo_for_stmt (loop_use_stmt);
      if (current_stmt)
        {
          current_stmt_info = vinfo_for_stmt (current_stmt);
	  GROUP_NEXT_ELEMENT (current_stmt_info) = loop_use_stmt;
          GROUP_FIRST_ELEMENT (use_stmt_info)
            = GROUP_FIRST_ELEMENT (current_stmt_info);
        }
      else
	GROUP_FIRST_ELEMENT (use_stmt_info) = loop_use_stmt;

      lhs = gimple_assign_lhs (loop_use_stmt);
      current_stmt = loop_use_stmt;
      size++;
    }

  /* Require that we closed the cycle at PHI and saw at least two stmts;
     a single stmt is an ordinary (non-chain) reduction.  */
  if (!found || loop_use_stmt != phi || size < 2)
    return false;

  /* Swap the operands, if needed, to make the reduction operand be the second
     operand.  */
  lhs = PHI_RESULT (phi);
  next_stmt = GROUP_FIRST_ELEMENT (vinfo_for_stmt (current_stmt));
  while (next_stmt)
    {
      if (gimple_assign_rhs2 (next_stmt) == lhs)
	{
	  /* Reduction operand already in the second position; just
	     validate the other (first) operand.  */
	  tree op = gimple_assign_rhs1 (next_stmt);
	  gimple *def_stmt = NULL;

          if (TREE_CODE (op) == SSA_NAME)
            def_stmt = SSA_NAME_DEF_STMT (op);

	  /* Check that the other def is either defined in the loop
	     ("vect_internal_def"), or it's an induction (defined by a
	     loop-header phi-node).  */
          if (def_stmt
              && gimple_bb (def_stmt)
	      && flow_bb_inside_loop_p (loop, gimple_bb (def_stmt))
              && (is_gimple_assign (def_stmt)
                  || is_gimple_call (def_stmt)
                  || STMT_VINFO_DEF_TYPE (vinfo_for_stmt (def_stmt))
                           == vect_induction_def
                  || (gimple_code (def_stmt) == GIMPLE_PHI
                      && STMT_VINFO_DEF_TYPE (vinfo_for_stmt (def_stmt))
                                  == vect_internal_def
                      && !is_loop_header_bb_p (gimple_bb (def_stmt)))))
	    {
	      lhs = gimple_assign_lhs (next_stmt);
	      next_stmt = GROUP_NEXT_ELEMENT (vinfo_for_stmt (next_stmt));
	      continue;
	    }

	  return false;
	}
      else
	{
	  /* Reduction operand is in the first position; validate the
	     second operand and then swap the two so the reduction
	     operand ends up second.  */
          tree op = gimple_assign_rhs2 (next_stmt);
	  gimple *def_stmt = NULL;

          if (TREE_CODE (op) == SSA_NAME)
            def_stmt = SSA_NAME_DEF_STMT (op);

          /* Check that the other def is either defined in the loop
             ("vect_internal_def"), or it's an induction (defined by a
             loop-header phi-node).  */
          if (def_stmt
              && gimple_bb (def_stmt)
	      && flow_bb_inside_loop_p (loop, gimple_bb (def_stmt))
	      && (is_gimple_assign (def_stmt)
		  || is_gimple_call (def_stmt)
		  || STMT_VINFO_DEF_TYPE (vinfo_for_stmt (def_stmt))
			      == vect_induction_def
		  || (gimple_code (def_stmt) == GIMPLE_PHI
		      && STMT_VINFO_DEF_TYPE (vinfo_for_stmt (def_stmt))
				  == vect_internal_def
		      && !is_loop_header_bb_p (gimple_bb (def_stmt)))))
	    {
	      if (dump_enabled_p ())
		{
		  dump_printf_loc (MSG_NOTE, vect_location, "swapping oprnds: ");
		  dump_gimple_stmt (MSG_NOTE, TDF_SLIM, next_stmt, 0);
                  dump_printf (MSG_NOTE, "\n");
		}

	      swap_ssa_operands (next_stmt,
				 gimple_assign_rhs1_ptr (next_stmt),
                                 gimple_assign_rhs2_ptr (next_stmt));
	      update_stmt (next_stmt);

	      /* Remember the swap so it can be undone if vectorization
		 ultimately fails.  */
	      if (CONSTANT_CLASS_P (gimple_assign_rhs1 (next_stmt)))
		LOOP_VINFO_OPERANDS_SWAPPED (loop_info) = true;
	    }
	  else
	    return false;
        }

      lhs = gimple_assign_lhs (next_stmt);
      next_stmt = GROUP_NEXT_ELEMENT (vinfo_for_stmt (next_stmt));
    }

  /* Save the chain for further analysis in SLP detection.  */
  first = GROUP_FIRST_ELEMENT (vinfo_for_stmt (current_stmt));
  LOOP_VINFO_REDUCTION_CHAINS (loop_info).safe_push (first);
  GROUP_SIZE (vinfo_for_stmt (first)) = size;

  return true;
}
2543
2544
/* Function vect_is_simple_reduction

   (1) Detect a cross-iteration def-use cycle that represents a simple
   reduction computation.  We look for the following pattern:

   loop_header:
     a1 = phi < a0, a2 >
     a3 = ...
     a2 = operation (a3, a1)

   or

   a3 = ...
   loop_header:
     a1 = phi < a0, a2 >
     a2 = operation (a3, a1)

   such that:
   1. operation is commutative and associative and it is safe to
      change the order of the computation (if CHECK_REDUCTION is true)
   2. no uses for a2 in the loop (a2 is used out of the loop)
   3. no uses of a1 in the loop besides the reduction operation
   4. no uses of a1 outside the loop.

   Conditions 1,4 are tested here.
   Conditions 2,3 are tested in vect_mark_stmts_to_be_vectorized.

   (2) Detect a cross-iteration def-use cycle in nested loops, i.e.,
   nested cycles, if CHECK_REDUCTION is false.

   (3) Detect cycles of phi nodes in outer-loop vectorization, i.e., double
   reductions:

     a1 = phi < a0, a2 >
     inner loop (def of a3)
     a2 = phi < a3 >

   (4) Detect condition expressions, ie:
     for (int i = 0; i < N; i++)
       if (a[i] < val)
	ret_val = a[i];

   Returns the stmt defining the latch value of PHI if a reduction was
   detected (setting *DOUBLE_REDUC and *V_REDUC_TYPE accordingly),
   otherwise NULL.  */

static gimple *
vect_is_simple_reduction (loop_vec_info loop_info, gimple *phi,
			  bool check_reduction, bool *double_reduc,
			  bool need_wrapping_integral_overflow,
			  enum vect_reduction_type *v_reduc_type)
{
  struct loop *loop = (gimple_bb (phi))->loop_father;
  struct loop *vect_loop = LOOP_VINFO_LOOP (loop_info);
  edge latch_e = loop_latch_edge (loop);
  tree loop_arg = PHI_ARG_DEF_FROM_EDGE (phi, latch_e);
  gimple *def_stmt, *def1 = NULL, *def2 = NULL, *phi_use_stmt = NULL;
  enum tree_code orig_code, code;
  tree op1, op2, op3 = NULL_TREE, op4 = NULL_TREE;
  tree type;
  int nloop_uses;
  tree name;
  imm_use_iterator imm_iter;
  use_operand_p use_p;
  bool phi_def;

  *double_reduc = false;
  *v_reduc_type = TREE_CODE_REDUCTION;

  /* If CHECK_REDUCTION is true, we assume inner-most loop vectorization,
     otherwise, we assume outer loop vectorization.  */
  gcc_assert ((check_reduction && loop == vect_loop)
              || (!check_reduction && flow_loop_nested_p (vect_loop, loop)));

  name = PHI_RESULT (phi);
  /* ???  If there are no uses of the PHI result the inner loop reduction
     won't be detected as possibly double-reduction by vectorizable_reduction
     because that tries to walk the PHI arg from the preheader edge which
     can be constant.  See PR60382.  */
  if (has_zero_uses (name))
    return NULL;
  /* Condition 3 above: the PHI result must have exactly one use inside
     the loop (the reduction stmt) and none outside it.  */
  nloop_uses = 0;
  FOR_EACH_IMM_USE_FAST (use_p, imm_iter, name)
    {
      gimple *use_stmt = USE_STMT (use_p);
      if (is_gimple_debug (use_stmt))
	continue;

      if (!flow_bb_inside_loop_p (loop, gimple_bb (use_stmt)))
        {
          if (dump_enabled_p ())
	    dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
			     "intermediate value used outside loop.\n");

          return NULL;
        }

      nloop_uses++;
      if (nloop_uses > 1)
        {
          if (dump_enabled_p ())
	    dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
			     "reduction used in loop.\n");
          return NULL;
        }

      phi_use_stmt = use_stmt;
    }

  if (TREE_CODE (loop_arg) != SSA_NAME)
    {
      if (dump_enabled_p ())
	{
	  dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
			   "reduction: not ssa_name: ");
	  dump_generic_expr (MSG_MISSED_OPTIMIZATION, TDF_SLIM, loop_arg);
          dump_printf (MSG_MISSED_OPTIMIZATION, "\n");
	}
      return NULL;
    }

  def_stmt = SSA_NAME_DEF_STMT (loop_arg);
  if (!def_stmt)
    {
      if (dump_enabled_p ())
	dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
			 "reduction: no def_stmt.\n");
      return NULL;
    }

  /* The latch value must be produced either by an assignment (the usual
     reduction stmt) or by another PHI (the double-reduction case).  */
  if (!is_gimple_assign (def_stmt) && gimple_code (def_stmt) != GIMPLE_PHI)
    {
      if (dump_enabled_p ())
        {
          dump_gimple_stmt (MSG_NOTE, TDF_SLIM, def_stmt, 0);
          dump_printf (MSG_NOTE, "\n");
        }
      return NULL;
    }

  if (is_gimple_assign (def_stmt))
    {
      name = gimple_assign_lhs (def_stmt);
      phi_def = false;
    }
  else
    {
      name = PHI_RESULT (def_stmt);
      phi_def = true;
    }

  /* Condition 2 above: the reduction result may have at most one use
     inside the loop (further uses would break the cycle).  */
  nloop_uses = 0;
  FOR_EACH_IMM_USE_FAST (use_p, imm_iter, name)
    {
      gimple *use_stmt = USE_STMT (use_p);
      if (is_gimple_debug (use_stmt))
	continue;
      if (flow_bb_inside_loop_p (loop, gimple_bb (use_stmt)))
	nloop_uses++;
      if (nloop_uses > 1)
	{
	  if (dump_enabled_p ())
	    dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
			     "reduction used in loop.\n");
	  return NULL;
	}
    }

  /* If DEF_STMT is a phi node itself, we expect it to have a single argument
     defined in the inner loop.  */
  if (phi_def)
    {
      op1 = PHI_ARG_DEF (def_stmt, 0);

      if (gimple_phi_num_args (def_stmt) != 1
          || TREE_CODE (op1) != SSA_NAME)
        {
          if (dump_enabled_p ())
	    dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
			     "unsupported phi node definition.\n");

          return NULL;
        }

      /* Double reduction: the inner-loop PHI's argument is defined by an
	 assignment inside the inner loop, and the only use of the outer
	 PHI is also inside the inner loop.
	 NOTE(review): the flow_bb_inside_loop_p test below checks
	 DEF_STMT's block rather than DEF1's; this matches upstream but
	 looks asymmetric - confirm intent before changing.  */
      def1 = SSA_NAME_DEF_STMT (op1);
      if (gimple_bb (def1)
	  && flow_bb_inside_loop_p (loop, gimple_bb (def_stmt))
          && loop->inner
          && flow_bb_inside_loop_p (loop->inner, gimple_bb (def1))
          && is_gimple_assign (def1)
	  && flow_bb_inside_loop_p (loop->inner, gimple_bb (phi_use_stmt)))
        {
          if (dump_enabled_p ())
            report_vect_op (MSG_NOTE, def_stmt,
			    "detected double reduction: ");

          *double_reduc = true;
          return def_stmt;
        }

      return NULL;
    }

  code = orig_code = gimple_assign_rhs_code (def_stmt);

  /* We can handle "res -= x[i]", which is non-associative by
     simply rewriting this into "res += -x[i]".  Avoid changing
     gimple instruction for the first simple tests and only do this
     if we're allowed to change code at all.  */
  if (code == MINUS_EXPR
      && (op1 = gimple_assign_rhs1 (def_stmt))
      && TREE_CODE (op1) == SSA_NAME
      && SSA_NAME_DEF_STMT (op1) == phi)
    code = PLUS_EXPR;

  if (code == COND_EXPR)
    {
      if (check_reduction)
	*v_reduc_type = COND_REDUCTION;
    }
  else if (!commutative_tree_code (code) || !associative_tree_code (code))
    {
      if (dump_enabled_p ())
	report_vect_op (MSG_MISSED_OPTIMIZATION, def_stmt,
			"reduction: not commutative/associative: ");
      return NULL;
    }

  /* Extract the operands: OP1/OP2 are the reduction inputs; for a
     COND_EXPR, OP3/OP4 additionally hold the comparison operands.  */
  if (get_gimple_rhs_class (code) != GIMPLE_BINARY_RHS)
    {
      if (code != COND_EXPR)
	{
	  if (dump_enabled_p ())
	    report_vect_op (MSG_MISSED_OPTIMIZATION, def_stmt,
			    "reduction: not binary operation: ");

	  return NULL;
	}

      op3 = gimple_assign_rhs1 (def_stmt);
      if (COMPARISON_CLASS_P (op3))
        {
          op4 = TREE_OPERAND (op3, 1);
          op3 = TREE_OPERAND (op3, 0);
        }

      op1 = gimple_assign_rhs2 (def_stmt);
      op2 = gimple_assign_rhs3 (def_stmt);

      if (TREE_CODE (op1) != SSA_NAME && TREE_CODE (op2) != SSA_NAME)
	{
	  if (dump_enabled_p ())
	    report_vect_op (MSG_MISSED_OPTIMIZATION, def_stmt,
			    "reduction: uses not ssa_names: ");

	  return NULL;
	}
    }
  else
    {
      op1 = gimple_assign_rhs1 (def_stmt);
      op2 = gimple_assign_rhs2 (def_stmt);

      if (TREE_CODE (op1) != SSA_NAME && TREE_CODE (op2) != SSA_NAME)
	{
	  if (dump_enabled_p ())
	    report_vect_op (MSG_MISSED_OPTIMIZATION, def_stmt,
			    "reduction: uses not ssa_names: ");

	  return NULL;
	}
   }

  /* All SSA operands must have a type compatible with the operation's
     result type; mixed-type reductions are not handled here.  */
  type = TREE_TYPE (gimple_assign_lhs (def_stmt));
  if ((TREE_CODE (op1) == SSA_NAME
       && !types_compatible_p (type,TREE_TYPE (op1)))
      || (TREE_CODE (op2) == SSA_NAME
          && !types_compatible_p (type, TREE_TYPE (op2)))
      || (op3 && TREE_CODE (op3) == SSA_NAME
          && !types_compatible_p (type, TREE_TYPE (op3)))
      || (op4 && TREE_CODE (op4) == SSA_NAME
          && !types_compatible_p (type, TREE_TYPE (op4))))
    {
      if (dump_enabled_p ())
        {
          dump_printf_loc (MSG_NOTE, vect_location,
			   "reduction: multiple types: operation type: ");
          dump_generic_expr (MSG_NOTE, TDF_SLIM, type);
          dump_printf (MSG_NOTE, ", operands types: ");
          dump_generic_expr (MSG_NOTE, TDF_SLIM,
			     TREE_TYPE (op1));
          dump_printf (MSG_NOTE, ",");
          dump_generic_expr (MSG_NOTE, TDF_SLIM,
			     TREE_TYPE (op2));
          if (op3)
            {
              dump_printf (MSG_NOTE, ",");
              dump_generic_expr (MSG_NOTE, TDF_SLIM,
				 TREE_TYPE (op3));
            }

          if (op4)
            {
              dump_printf (MSG_NOTE, ",");
              dump_generic_expr (MSG_NOTE, TDF_SLIM,
				 TREE_TYPE (op4));
            }
          dump_printf (MSG_NOTE, "\n");
        }

      return NULL;
    }

  /* Check that it's ok to change the order of the computation.
     Generally, when vectorizing a reduction we change the order of the
     computation.  This may change the behavior of the program in some
     cases, so we need to check that this is ok.  One exception is when
     vectorizing an outer-loop: the inner-loop is executed sequentially,
     and therefore vectorizing reductions in the inner-loop during
     outer-loop vectorization is safe.  */

  if (*v_reduc_type != COND_REDUCTION
      && check_reduction)
    {
      /* CHECKME: check for !flag_finite_math_only too?  */
      if (SCALAR_FLOAT_TYPE_P (type) && !flag_associative_math)
	{
	  /* Changing the order of operations changes the semantics.  */
	  if (dump_enabled_p ())
	    report_vect_op (MSG_MISSED_OPTIMIZATION, def_stmt,
			    "reduction: unsafe fp math optimization: ");
	  return NULL;
	}
      else if (INTEGRAL_TYPE_P (type))
	{
	  if (!operation_no_trapping_overflow (type, code))
	    {
	      /* Changing the order of operations changes the semantics.  */
	      if (dump_enabled_p ())
		report_vect_op (MSG_MISSED_OPTIMIZATION, def_stmt,
				"reduction: unsafe int math optimization"
				" (overflow traps): ");
	      return NULL;
	    }
	  if (need_wrapping_integral_overflow
	      && !TYPE_OVERFLOW_WRAPS (type)
	      && operation_can_overflow (code))
	    {
	      /* Changing the order of operations changes the semantics.  */
	      if (dump_enabled_p ())
		report_vect_op (MSG_MISSED_OPTIMIZATION, def_stmt,
				"reduction: unsafe int math optimization"
				" (overflow doesn't wrap): ");
	      return NULL;
	    }
	}
      else if (SAT_FIXED_POINT_TYPE_P (type))
	{
	  /* Changing the order of operations changes the semantics.  */
	  if (dump_enabled_p ())
	    report_vect_op (MSG_MISSED_OPTIMIZATION, def_stmt,
			    "reduction: unsafe fixed-point math optimization: ");
	  return NULL;
	}
    }

  /* Reduction is safe.  We're dealing with one of the following:
     1) integer arithmetic and no trapv
     2) floating point arithmetic, and special flags permit this optimization
     3) nested cycle (i.e., outer loop vectorization).  */
  if (TREE_CODE (op1) == SSA_NAME)
    def1 = SSA_NAME_DEF_STMT (op1);

  if (TREE_CODE (op2) == SSA_NAME)
    def2 = SSA_NAME_DEF_STMT (op2);

  if (code != COND_EXPR
      && ((!def1 || gimple_nop_p (def1)) && (!def2 || gimple_nop_p (def2))))
    {
      if (dump_enabled_p ())
	report_vect_op (MSG_NOTE, def_stmt, "reduction: no defs for operands: ");
      return NULL;
    }

  /* Check that one def is the reduction def, defined by PHI,
     the other def is either defined in the loop ("vect_internal_def"),
     or it's an induction (defined by a loop-header phi-node).  */

  if (def2 && def2 == phi
      && (code == COND_EXPR
	  || !def1 || gimple_nop_p (def1)
	  || !flow_bb_inside_loop_p (loop, gimple_bb (def1))
          || (def1 && flow_bb_inside_loop_p (loop, gimple_bb (def1))
              && (is_gimple_assign (def1)
		  || is_gimple_call (def1)
  	          || STMT_VINFO_DEF_TYPE (vinfo_for_stmt (def1))
                      == vect_induction_def
   	          || (gimple_code (def1) == GIMPLE_PHI
	              && STMT_VINFO_DEF_TYPE (vinfo_for_stmt (def1))
                          == vect_internal_def
 	              && !is_loop_header_bb_p (gimple_bb (def1)))))))
    {
      /* Reduction variable is already the second operand - done.  */
      if (dump_enabled_p ())
	report_vect_op (MSG_NOTE, def_stmt, "detected reduction: ");
      return def_stmt;
    }

  if (def1 && def1 == phi
      && (code == COND_EXPR
	  || !def2 || gimple_nop_p (def2)
	  || !flow_bb_inside_loop_p (loop, gimple_bb (def2))
	  || (def2 && flow_bb_inside_loop_p (loop, gimple_bb (def2))
	      && (is_gimple_assign (def2)
		  || is_gimple_call (def2)
		  || STMT_VINFO_DEF_TYPE (vinfo_for_stmt (def2))
		       == vect_induction_def
		  || (gimple_code (def2) == GIMPLE_PHI
		      && STMT_VINFO_DEF_TYPE (vinfo_for_stmt (def2))
			   == vect_internal_def
		      && !is_loop_header_bb_p (gimple_bb (def2)))))))
    {
      if (check_reduction
	  && orig_code != MINUS_EXPR)
        {
	  /* Reduction variable is the first operand; swap so later code
	     can rely on it being the second.  Not done for MINUS_EXPR
	     (non-commutative) or in outer-loop vectorization.  */
	  if (code == COND_EXPR)
	    {
	      /* No current known use where this case would be useful.  */
	      if (dump_enabled_p ())
		report_vect_op (MSG_NOTE, def_stmt,
				"detected reduction: cannot currently swap "
				"operands for cond_expr");
	      return NULL;
	    }

	  /* Swap operands (just for simplicity - so that the rest of the code
	     can assume that the reduction variable is always the last (second)
	     argument).  */
          if (dump_enabled_p ())
	    report_vect_op (MSG_NOTE, def_stmt,
			    "detected reduction: need to swap operands: ");

          swap_ssa_operands (def_stmt, gimple_assign_rhs1_ptr (def_stmt),
 			     gimple_assign_rhs2_ptr (def_stmt));

	  /* Record the swap so it can be undone if vectorization fails.  */
	  if (CONSTANT_CLASS_P (gimple_assign_rhs1 (def_stmt)))
	    LOOP_VINFO_OPERANDS_SWAPPED (loop_info) = true;
        }
      else
        {
          if (dump_enabled_p ())
            report_vect_op (MSG_NOTE, def_stmt, "detected reduction: ");
        }

      return def_stmt;
    }

  /* Try to find SLP reduction chain.  */
  if (check_reduction && code != COND_EXPR
      && vect_is_slp_reduction (loop_info, phi, def_stmt))
    {
      if (dump_enabled_p ())
        report_vect_op (MSG_NOTE, def_stmt,
			"reduction: detected reduction chain: ");

      return def_stmt;
    }

  if (dump_enabled_p ())
    report_vect_op (MSG_MISSED_OPTIMIZATION, def_stmt,
		    "reduction: unknown pattern: ");

  return NULL;
}
3016
3017 /* Wrapper around vect_is_simple_reduction_1, which will modify code
3018 in-place if it enables detection of more reductions. Arguments
3019 as there. */
3020
3021 gimple *
vect_force_simple_reduction(loop_vec_info loop_info,gimple * phi,bool check_reduction,bool * double_reduc,bool need_wrapping_integral_overflow)3022 vect_force_simple_reduction (loop_vec_info loop_info, gimple *phi,
3023 bool check_reduction, bool *double_reduc,
3024 bool need_wrapping_integral_overflow)
3025 {
3026 enum vect_reduction_type v_reduc_type;
3027 return vect_is_simple_reduction (loop_info, phi, check_reduction,
3028 double_reduc,
3029 need_wrapping_integral_overflow,
3030 &v_reduc_type);
3031 }
3032
3033 /* Calculate cost of peeling the loop PEEL_ITERS_PROLOGUE times. */
3034 int
vect_get_known_peeling_cost(loop_vec_info loop_vinfo,int peel_iters_prologue,int * peel_iters_epilogue,stmt_vector_for_cost * scalar_cost_vec,stmt_vector_for_cost * prologue_cost_vec,stmt_vector_for_cost * epilogue_cost_vec)3035 vect_get_known_peeling_cost (loop_vec_info loop_vinfo, int peel_iters_prologue,
3036 int *peel_iters_epilogue,
3037 stmt_vector_for_cost *scalar_cost_vec,
3038 stmt_vector_for_cost *prologue_cost_vec,
3039 stmt_vector_for_cost *epilogue_cost_vec)
3040 {
3041 int retval = 0;
3042 int vf = LOOP_VINFO_VECT_FACTOR (loop_vinfo);
3043
3044 if (!LOOP_VINFO_NITERS_KNOWN_P (loop_vinfo))
3045 {
3046 *peel_iters_epilogue = vf/2;
3047 if (dump_enabled_p ())
3048 dump_printf_loc (MSG_NOTE, vect_location,
3049 "cost model: epilogue peel iters set to vf/2 "
3050 "because loop iterations are unknown .\n");
3051
3052 /* If peeled iterations are known but number of scalar loop
3053 iterations are unknown, count a taken branch per peeled loop. */
3054 retval = record_stmt_cost (prologue_cost_vec, 1, cond_branch_taken,
3055 NULL, 0, vect_prologue);
3056 retval = record_stmt_cost (prologue_cost_vec, 1, cond_branch_taken,
3057 NULL, 0, vect_epilogue);
3058 }
3059 else
3060 {
3061 int niters = LOOP_VINFO_INT_NITERS (loop_vinfo);
3062 peel_iters_prologue = niters < peel_iters_prologue ?
3063 niters : peel_iters_prologue;
3064 *peel_iters_epilogue = (niters - peel_iters_prologue) % vf;
3065 /* If we need to peel for gaps, but no peeling is required, we have to
3066 peel VF iterations. */
3067 if (LOOP_VINFO_PEELING_FOR_GAPS (loop_vinfo) && !*peel_iters_epilogue)
3068 *peel_iters_epilogue = vf;
3069 }
3070
3071 stmt_info_for_cost *si;
3072 int j;
3073 if (peel_iters_prologue)
3074 FOR_EACH_VEC_ELT (*scalar_cost_vec, j, si)
3075 retval += record_stmt_cost (prologue_cost_vec,
3076 si->count * peel_iters_prologue,
3077 si->kind, NULL, si->misalign,
3078 vect_prologue);
3079 if (*peel_iters_epilogue)
3080 FOR_EACH_VEC_ELT (*scalar_cost_vec, j, si)
3081 retval += record_stmt_cost (epilogue_cost_vec,
3082 si->count * *peel_iters_epilogue,
3083 si->kind, NULL, si->misalign,
3084 vect_epilogue);
3085
3086 return retval;
3087 }
3088
/* Function vect_estimate_min_profitable_iters

   Return the number of iterations required for the vector version of the
   loop to be profitable relative to the cost of the scalar version of the
   loop.

   Outputs: *RET_MIN_PROFITABLE_NITERS is the runtime profitability
   threshold (the vectorized loop is skipped when niters <= this value);
   *RET_MIN_PROFITABLE_ESTIMATE is the analogous threshold checked against
   the compile-time estimated trip count.  Both are -1 when the vector
   version can never be profitable, and 0 when the cost model is
   disabled.  */

static void
vect_estimate_min_profitable_iters (loop_vec_info loop_vinfo,
				    int *ret_min_profitable_niters,
				    int *ret_min_profitable_estimate)
{
  int min_profitable_iters;
  int min_profitable_estimate;
  int peel_iters_prologue;
  int peel_iters_epilogue;
  unsigned vec_inside_cost = 0;
  int vec_outside_cost = 0;
  unsigned vec_prologue_cost = 0;
  unsigned vec_epilogue_cost = 0;
  int scalar_single_iter_cost = 0;
  int scalar_outside_cost = 0;
  int vf = LOOP_VINFO_VECT_FACTOR (loop_vinfo);
  /* npeel < 0 means the prologue peel count for alignment is not known
     at compile time.  */
  int npeel = LOOP_VINFO_PEELING_FOR_ALIGNMENT (loop_vinfo);
  void *target_cost_data = LOOP_VINFO_TARGET_COST_DATA (loop_vinfo);

  /* Cost model disabled.  */
  if (unlimited_cost_model (LOOP_VINFO_LOOP (loop_vinfo)))
    {
      dump_printf_loc (MSG_NOTE, vect_location, "cost model disabled.\n");
      *ret_min_profitable_niters = 0;
      *ret_min_profitable_estimate = 0;
      return;
    }

  /* Requires loop versioning tests to handle misalignment.  */
  if (LOOP_REQUIRES_VERSIONING_FOR_ALIGNMENT (loop_vinfo))
    {
      /*  FIXME: Make cost depend on complexity of individual check.  */
      unsigned len = LOOP_VINFO_MAY_MISALIGN_STMTS (loop_vinfo).length ();
      (void) add_stmt_cost (target_cost_data, len, vector_stmt, NULL, 0,
			    vect_prologue);
      dump_printf (MSG_NOTE,
		   "cost model: Adding cost of checks for loop "
		   "versioning to treat misalignment.\n");
    }

  /* Requires loop versioning with alias checks.  */
  if (LOOP_REQUIRES_VERSIONING_FOR_ALIAS (loop_vinfo))
    {
      /*  FIXME: Make cost depend on complexity of individual check.  */
      unsigned len = LOOP_VINFO_COMP_ALIAS_DDRS (loop_vinfo).length ();
      (void) add_stmt_cost (target_cost_data, len, vector_stmt, NULL, 0,
			    vect_prologue);
      dump_printf (MSG_NOTE,
		   "cost model: Adding cost of checks for loop "
		   "versioning aliasing.\n");
    }

  /* Either form of versioning adds one taken branch to the vector path.  */
  if (LOOP_REQUIRES_VERSIONING_FOR_ALIGNMENT (loop_vinfo)
      || LOOP_REQUIRES_VERSIONING_FOR_ALIAS (loop_vinfo))
    (void) add_stmt_cost (target_cost_data, 1, cond_branch_taken, NULL, 0,
			  vect_prologue);

  /* Count statements in scalar loop.  Using this as scalar cost for a single
     iteration for now.

     TODO: Add outer loop support.

     TODO: Consider assigning different costs to different scalar
     statements.  */

  scalar_single_iter_cost
    = LOOP_VINFO_SINGLE_SCALAR_ITERATION_COST (loop_vinfo);

  /* Add additional cost for the peeled instructions in prologue and epilogue
     loop.

     FORNOW: If we don't know the value of peel_iters for prologue or epilogue
     at compile-time - we assume it's vf/2 (the worst would be vf-1).

     TODO: Build an expression that represents peel_iters for prologue and
     epilogue to be used in a run-time test.  */

  if (npeel < 0)
    {
      peel_iters_prologue = vf/2;
      dump_printf (MSG_NOTE, "cost model: "
		   "prologue peel iters set to vf/2.\n");

      /* If peeling for alignment is unknown, loop bound of main loop becomes
	 unknown.  */
      peel_iters_epilogue = vf/2;
      dump_printf (MSG_NOTE, "cost model: "
		   "epilogue peel iters set to vf/2 because "
		   "peeling for alignment is unknown.\n");

      /* If peeled iterations are unknown, count a taken branch and a not taken
	 branch per peeled loop.  Even if scalar loop iterations are known,
	 vector iterations are not known since peeled prologue iterations are
	 not known.  Hence guards remain the same.  */
      (void) add_stmt_cost (target_cost_data, 1, cond_branch_taken,
			    NULL, 0, vect_prologue);
      (void) add_stmt_cost (target_cost_data, 1, cond_branch_not_taken,
			    NULL, 0, vect_prologue);
      (void) add_stmt_cost (target_cost_data, 1, cond_branch_taken,
			    NULL, 0, vect_epilogue);
      (void) add_stmt_cost (target_cost_data, 1, cond_branch_not_taken,
			    NULL, 0, vect_epilogue);
      /* Charge each scalar statement vf/2 times for both the prologue
	 and the epilogue peel loops.  */
      stmt_info_for_cost *si;
      int j;
      FOR_EACH_VEC_ELT (LOOP_VINFO_SCALAR_ITERATION_COST (loop_vinfo), j, si)
	{
	  struct _stmt_vec_info *stmt_info
	    = si->stmt ? vinfo_for_stmt (si->stmt) : NULL;
	  (void) add_stmt_cost (target_cost_data,
				si->count * peel_iters_prologue,
				si->kind, stmt_info, si->misalign,
				vect_prologue);
	  (void) add_stmt_cost (target_cost_data,
				si->count * peel_iters_epilogue,
				si->kind, stmt_info, si->misalign,
				vect_epilogue);
	}
    }
  else
    {
      /* Peel counts are known: let vect_get_known_peeling_cost compute the
	 per-statement costs into temporary vectors, then transfer them into
	 the target cost data with the proper phase attribution.  */
      stmt_vector_for_cost prologue_cost_vec, epilogue_cost_vec;
      stmt_info_for_cost *si;
      int j;
      void *data = LOOP_VINFO_TARGET_COST_DATA (loop_vinfo);

      prologue_cost_vec.create (2);
      epilogue_cost_vec.create (2);
      peel_iters_prologue = npeel;

      (void) vect_get_known_peeling_cost (loop_vinfo, peel_iters_prologue,
					  &peel_iters_epilogue,
					  &LOOP_VINFO_SCALAR_ITERATION_COST
					    (loop_vinfo),
					  &prologue_cost_vec,
					  &epilogue_cost_vec);

      FOR_EACH_VEC_ELT (prologue_cost_vec, j, si)
	{
	  struct _stmt_vec_info *stmt_info
	    = si->stmt ? vinfo_for_stmt (si->stmt) : NULL;
	  (void) add_stmt_cost (data, si->count, si->kind, stmt_info,
				si->misalign, vect_prologue);
	}

      FOR_EACH_VEC_ELT (epilogue_cost_vec, j, si)
	{
	  struct _stmt_vec_info *stmt_info
	    = si->stmt ? vinfo_for_stmt (si->stmt) : NULL;
	  (void) add_stmt_cost (data, si->count, si->kind, stmt_info,
				si->misalign, vect_epilogue);
	}

      prologue_cost_vec.release ();
      epilogue_cost_vec.release ();
    }

  /* FORNOW: The scalar outside cost is incremented in one of the
     following ways:

     1. The vectorizer checks for alignment and aliasing and generates
     a condition that allows dynamic vectorization.  A cost model
     check is ANDED with the versioning condition.  Hence scalar code
     path now has the added cost of the versioning check.

       if (cost > th & versioning_check)
         jmp to vector code

     Hence run-time scalar is incremented by not-taken branch cost.

     2. The vectorizer then checks if a prologue is required.  If the
     cost model check was not done before during versioning, it has to
     be done before the prologue check.

       if (cost <= th)
         prologue = scalar_iters
       if (prologue == 0)
         jmp to vector code
       else
         execute prologue
       if (prologue == num_iters)
         go to exit

     Hence the run-time scalar cost is incremented by a taken branch,
     plus a not-taken branch, plus a taken branch cost.

     3. The vectorizer then checks if an epilogue is required.  If the
     cost model check was not done before during prologue check, it
     has to be done with the epilogue check.

       if (prologue == 0)
         jmp to vector code
       else
         execute prologue
       if (prologue == num_iters)
         go to exit
       vector code:
         if ((cost <= th) | (scalar_iters-prologue-epilogue == 0))
           jmp to epilogue

     Hence the run-time scalar cost should be incremented by 2 taken
     branches.

     TODO: The back end may reorder the BBS's differently and reverse
     conditions/branch directions.  Change the estimates below to
     something more reasonable.  */

  /* If the number of iterations is known and we do not do versioning, we can
     decide whether to vectorize at compile time.  Hence the scalar version
     do not carry cost model guard costs.  */
  if (!LOOP_VINFO_NITERS_KNOWN_P (loop_vinfo)
      || LOOP_REQUIRES_VERSIONING_FOR_ALIGNMENT (loop_vinfo)
      || LOOP_REQUIRES_VERSIONING_FOR_ALIAS (loop_vinfo))
    {
      /* Cost model check occurs at versioning.  */
      if (LOOP_REQUIRES_VERSIONING_FOR_ALIGNMENT (loop_vinfo)
	  || LOOP_REQUIRES_VERSIONING_FOR_ALIAS (loop_vinfo))
	scalar_outside_cost += vect_get_stmt_cost (cond_branch_not_taken);
      else
	{
	  /* Cost model check occurs at prologue generation.  */
	  if (LOOP_VINFO_PEELING_FOR_ALIGNMENT (loop_vinfo) < 0)
	    scalar_outside_cost += 2 * vect_get_stmt_cost (cond_branch_taken)
	      + vect_get_stmt_cost (cond_branch_not_taken);
	  /* Cost model check occurs at epilogue generation.  */
	  else
	    scalar_outside_cost += 2 * vect_get_stmt_cost (cond_branch_taken);
	}
    }

  /* Complete the target-specific cost calculations.  */
  finish_cost (LOOP_VINFO_TARGET_COST_DATA (loop_vinfo), &vec_prologue_cost,
	       &vec_inside_cost, &vec_epilogue_cost);

  vec_outside_cost = (int)(vec_prologue_cost + vec_epilogue_cost);

  if (dump_enabled_p ())
    {
      dump_printf_loc (MSG_NOTE, vect_location, "Cost model analysis: \n");
      dump_printf (MSG_NOTE, " Vector inside of loop cost: %d\n",
		   vec_inside_cost);
      dump_printf (MSG_NOTE, " Vector prologue cost: %d\n",
		   vec_prologue_cost);
      dump_printf (MSG_NOTE, " Vector epilogue cost: %d\n",
		   vec_epilogue_cost);
      dump_printf (MSG_NOTE, " Scalar iteration cost: %d\n",
		   scalar_single_iter_cost);
      dump_printf (MSG_NOTE, " Scalar outside cost: %d\n",
		   scalar_outside_cost);
      dump_printf (MSG_NOTE, " Vector outside cost: %d\n",
		   vec_outside_cost);
      dump_printf (MSG_NOTE, " prologue iterations: %d\n",
		   peel_iters_prologue);
      dump_printf (MSG_NOTE, " epilogue iterations: %d\n",
		   peel_iters_epilogue);
    }

  /* Calculate number of iterations required to make the vector version
     profitable, relative to the loop bodies only.  The following condition
     must hold true:
     SIC * niters + SOC > VIC * ((niters-PL_ITERS-EP_ITERS)/VF) + VOC
     where
     SIC = scalar iteration cost, VIC = vector iteration cost,
     VOC = vector outside cost, VF = vectorization factor,
     PL_ITERS = prologue iterations, EP_ITERS= epilogue iterations
     SOC = scalar outside cost for run time cost model check.  */

  if ((scalar_single_iter_cost * vf) > (int) vec_inside_cost)
    {
      if (vec_outside_cost <= 0)
        min_profitable_iters = 1;
      else
        {
          min_profitable_iters = ((vec_outside_cost - scalar_outside_cost) * vf
				  - vec_inside_cost * peel_iters_prologue
                                  - vec_inside_cost * peel_iters_epilogue)
                                 / ((scalar_single_iter_cost * vf)
                                    - vec_inside_cost);

	  /* Integer division rounds toward zero; bump the threshold if the
	     computed iteration count does not yet satisfy the condition.  */
          if ((scalar_single_iter_cost * vf * min_profitable_iters)
              <= (((int) vec_inside_cost * min_profitable_iters)
                  + (((int) vec_outside_cost - scalar_outside_cost) * vf)))
            min_profitable_iters++;
        }
    }
  /* vector version will never be profitable.  */
  else
    {
      if (LOOP_VINFO_LOOP (loop_vinfo)->force_vectorize)
	warning_at (vect_location, OPT_Wopenmp_simd, "vectorization "
		    "did not happen for a simd loop");

      if (dump_enabled_p ())
        dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
			 "cost model: the vector iteration cost = %d "
			 "divided by the scalar iteration cost = %d "
			 "is greater or equal to the vectorization factor = %d"
                         ".\n",
			 vec_inside_cost, scalar_single_iter_cost, vf);
      *ret_min_profitable_niters = -1;
      *ret_min_profitable_estimate = -1;
      return;
    }

  dump_printf (MSG_NOTE,
	       " Calculated minimum iters for profitability: %d\n",
	       min_profitable_iters);

  /* The vector loop must execute at least one full vector iteration.  */
  min_profitable_iters =
	min_profitable_iters < vf ? vf : min_profitable_iters;

  /* Because the condition we create is:
     if (niters <= min_profitable_iters)
       then skip the vectorized loop.  */
  min_profitable_iters--;

  if (dump_enabled_p ())
    dump_printf_loc (MSG_NOTE, vect_location,
                     " Runtime profitability threshold = %d\n",
                     min_profitable_iters);

  *ret_min_profitable_niters = min_profitable_iters;

  /* Calculate number of iterations required to make the vector version
     profitable, relative to the loop bodies only.

     Non-vectorized variant is SIC * niters and it must win over vector
     variant on the expected loop trip count.  The following condition must hold true:
     SIC * niters > VIC * ((niters-PL_ITERS-EP_ITERS)/VF) + VOC + SOC  */

  if (vec_outside_cost <= 0)
    min_profitable_estimate = 1;
  else
    {
      min_profitable_estimate = ((vec_outside_cost + scalar_outside_cost) * vf
				 - vec_inside_cost * peel_iters_prologue
				 - vec_inside_cost * peel_iters_epilogue)
				 / ((scalar_single_iter_cost * vf)
				   - vec_inside_cost);
    }
  min_profitable_estimate --;
  /* The static estimate can never be below the runtime threshold.  */
  min_profitable_estimate = MAX (min_profitable_estimate, min_profitable_iters);
  if (dump_enabled_p ())
    dump_printf_loc (MSG_NOTE, vect_location,
		     " Static estimate profitability threshold = %d\n",
		     min_profitable_estimate);

  *ret_min_profitable_estimate = min_profitable_estimate;
}
3443
3444 /* Writes into SEL a mask for a vec_perm, equivalent to a vec_shr by OFFSET
3445 vector elements (not bits) for a vector of mode MODE. */
3446 static void
calc_vec_perm_mask_for_shift(enum machine_mode mode,unsigned int offset,unsigned char * sel)3447 calc_vec_perm_mask_for_shift (enum machine_mode mode, unsigned int offset,
3448 unsigned char *sel)
3449 {
3450 unsigned int i, nelt = GET_MODE_NUNITS (mode);
3451
3452 for (i = 0; i < nelt; i++)
3453 sel[i] = (i + offset) & (2*nelt - 1);
3454 }
3455
3456 /* Checks whether the target supports whole-vector shifts for vectors of mode
3457 MODE. This is the case if _either_ the platform handles vec_shr_optab, _or_
3458 it supports vec_perm_const with masks for all necessary shift amounts. */
3459 static bool
have_whole_vector_shift(enum machine_mode mode)3460 have_whole_vector_shift (enum machine_mode mode)
3461 {
3462 if (optab_handler (vec_shr_optab, mode) != CODE_FOR_nothing)
3463 return true;
3464
3465 if (direct_optab_handler (vec_perm_const_optab, mode) == CODE_FOR_nothing)
3466 return false;
3467
3468 unsigned int i, nelt = GET_MODE_NUNITS (mode);
3469 unsigned char *sel = XALLOCAVEC (unsigned char, nelt);
3470
3471 for (i = nelt/2; i >= 1; i/=2)
3472 {
3473 calc_vec_perm_mask_for_shift (mode, i, sel);
3474 if (!can_vec_perm_p (mode, false, sel))
3475 return false;
3476 }
3477 return true;
3478 }
3479
3480 /* Return the reduction operand (with index REDUC_INDEX) of STMT. */
3481
3482 static tree
get_reduction_op(gimple * stmt,int reduc_index)3483 get_reduction_op (gimple *stmt, int reduc_index)
3484 {
3485 switch (get_gimple_rhs_class (gimple_assign_rhs_code (stmt)))
3486 {
3487 case GIMPLE_SINGLE_RHS:
3488 gcc_assert (TREE_OPERAND_LENGTH (gimple_assign_rhs1 (stmt))
3489 == ternary_op);
3490 return TREE_OPERAND (gimple_assign_rhs1 (stmt), reduc_index);
3491 case GIMPLE_UNARY_RHS:
3492 return gimple_assign_rhs1 (stmt);
3493 case GIMPLE_BINARY_RHS:
3494 return (reduc_index
3495 ? gimple_assign_rhs2 (stmt) : gimple_assign_rhs1 (stmt));
3496 case GIMPLE_TERNARY_RHS:
3497 return gimple_op (stmt, reduc_index + 1);
3498 default:
3499 gcc_unreachable ();
3500 }
3501 }
3502
/* TODO: Close dependency between vect_model_*_cost and vectorizable_*
   functions.  Design better to avoid maintenance issues.  */

/* Function vect_model_reduction_cost.

   Models cost for a reduction operation, including the vector ops
   generated within the strip-mine loop, the initial definition before
   the loop, and the epilogue code that must be generated.

   STMT_INFO is the reduction statement; REDUC_CODE is the epilogue
   reduction tree code (ERROR_MARK when no single-statement reduction
   is available); NCOPIES is the number of vector copies; REDUC_INDEX
   selects which operand of the statement is the reduction operand.
   Returns false if no vector type exists for the reduction operand's
   scalar type, true otherwise.  */

static bool
vect_model_reduction_cost (stmt_vec_info stmt_info, enum tree_code reduc_code,
			   int ncopies, int reduc_index)
{
  int prologue_cost = 0, epilogue_cost = 0;
  enum tree_code code;
  optab optab;
  tree vectype;
  gimple *stmt, *orig_stmt;
  tree reduction_op;
  machine_mode mode;
  loop_vec_info loop_vinfo = STMT_VINFO_LOOP_VINFO (stmt_info);
  struct loop *loop = NULL;
  void *target_cost_data;

  /* Cost data lives on the loop info for loop vectorization, on the
     BB info for basic-block (SLP) vectorization.  */
  if (loop_vinfo)
    {
      loop = LOOP_VINFO_LOOP (loop_vinfo);
      target_cost_data = LOOP_VINFO_TARGET_COST_DATA (loop_vinfo);
    }
  else
    target_cost_data = BB_VINFO_TARGET_COST_DATA (STMT_VINFO_BB_VINFO (stmt_info));

  /* Condition reductions generate two reductions in the loop.  */
  if (STMT_VINFO_VEC_REDUCTION_TYPE (stmt_info) == COND_REDUCTION)
    ncopies *= 2;

  /* Cost of reduction op inside loop.  */
  unsigned inside_cost = add_stmt_cost (target_cost_data, ncopies, vector_stmt,
					stmt_info, 0, vect_body);
  stmt = STMT_VINFO_STMT (stmt_info);

  reduction_op = get_reduction_op (stmt, reduc_index);

  vectype = get_vectype_for_scalar_type (TREE_TYPE (reduction_op));
  if (!vectype)
    {
      if (dump_enabled_p ())
        {
	  dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
			   "unsupported data-type ");
          dump_generic_expr (MSG_MISSED_OPTIMIZATION, TDF_SLIM,
			     TREE_TYPE (reduction_op));
          dump_printf (MSG_MISSED_OPTIMIZATION, "\n");
        }
      return false;
   }

  mode = TYPE_MODE (vectype);
  /* For pattern statements, cost the original statement's operation.  */
  orig_stmt = STMT_VINFO_RELATED_STMT (stmt_info);

  if (!orig_stmt)
    orig_stmt = STMT_VINFO_STMT (stmt_info);

  code = gimple_assign_rhs_code (orig_stmt);

  /* Add in cost for initial definition.
     For cond reduction we have four vectors: initial index, step, initial
     result of the data reduction, initial value of the index reduction.  */
  int prologue_stmts = STMT_VINFO_VEC_REDUCTION_TYPE (stmt_info)
		       == COND_REDUCTION ? 4 : 1;
  prologue_cost += add_stmt_cost (target_cost_data, prologue_stmts,
				  scalar_to_vec, stmt_info, 0,
				  vect_prologue);

  /* Determine cost of epilogue code.

     We have a reduction operator that will reduce the vector in one statement.
     Also requires scalar extract.  */

  /* No epilogue is needed when the reduction is in the inner loop of an
     outer-loop vectorization.  */
  if (!loop || !nested_in_vect_loop_p (loop, orig_stmt))
    {
      if (reduc_code != ERROR_MARK)
	{
	  if (STMT_VINFO_VEC_REDUCTION_TYPE (stmt_info) == COND_REDUCTION)
	    {
	      /* An EQ stmt and an COND_EXPR stmt.  */
	      epilogue_cost += add_stmt_cost (target_cost_data, 2,
					      vector_stmt, stmt_info, 0,
					      vect_epilogue);
	      /* Reduction of the max index and a reduction of the found
		 values.  */
	      epilogue_cost += add_stmt_cost (target_cost_data, 2,
					      vec_to_scalar, stmt_info, 0,
					      vect_epilogue);
	      /* A broadcast of the max value.  */
	      epilogue_cost += add_stmt_cost (target_cost_data, 1,
					      scalar_to_vec, stmt_info, 0,
					      vect_epilogue);
	    }
	  else
	    {
	      /* One reduction statement plus the scalar extract.  */
	      epilogue_cost += add_stmt_cost (target_cost_data, 1, vector_stmt,
					      stmt_info, 0, vect_epilogue);
	      epilogue_cost += add_stmt_cost (target_cost_data, 1,
					      vec_to_scalar, stmt_info, 0,
					      vect_epilogue);
	    }
	}
      else
	{
	  /* No single-statement reduction: cost the shift-based or
	     extract-based fallback sequence.  */
	  int vec_size_in_bits = tree_to_uhwi (TYPE_SIZE (vectype));
	  tree bitsize =
	    TYPE_SIZE (TREE_TYPE (gimple_assign_lhs (orig_stmt)));
	  int element_bitsize = tree_to_uhwi (bitsize);
	  int nelements = vec_size_in_bits / element_bitsize;

	  optab = optab_for_tree_code (code, vectype, optab_default);

	  /* We have a whole vector shift available.  */
	  if (VECTOR_MODE_P (mode)
	      && optab_handler (optab, mode) != CODE_FOR_nothing
	      && have_whole_vector_shift (mode))
	    {
	      /* Final reduction via vector shifts and the reduction operator.
		 Also requires scalar extract.  */
	      epilogue_cost += add_stmt_cost (target_cost_data,
					      exact_log2 (nelements) * 2,
					      vector_stmt, stmt_info, 0,
					      vect_epilogue);
	      epilogue_cost += add_stmt_cost (target_cost_data, 1,
					      vec_to_scalar, stmt_info, 0,
					      vect_epilogue);
	    }
	  else
	    /* Use extracts and reduction op for final reduction.  For N
	       elements, we have N extracts and N-1 reduction ops.  */
	    epilogue_cost += add_stmt_cost (target_cost_data,
					    nelements + nelements - 1,
					    vector_stmt, stmt_info, 0,
					    vect_epilogue);
	}
    }

  if (dump_enabled_p ())
    dump_printf (MSG_NOTE,
		 "vect_model_reduction_cost: inside_cost = %d, "
		 "prologue_cost = %d, epilogue_cost = %d .\n", inside_cost,
		 prologue_cost, epilogue_cost);

  return true;
}
3654
3655
3656 /* Function vect_model_induction_cost.
3657
3658 Models cost for induction operations. */
3659
3660 static void
vect_model_induction_cost(stmt_vec_info stmt_info,int ncopies)3661 vect_model_induction_cost (stmt_vec_info stmt_info, int ncopies)
3662 {
3663 loop_vec_info loop_vinfo = STMT_VINFO_LOOP_VINFO (stmt_info);
3664 void *target_cost_data = LOOP_VINFO_TARGET_COST_DATA (loop_vinfo);
3665 unsigned inside_cost, prologue_cost;
3666
3667 /* loop cost for vec_loop. */
3668 inside_cost = add_stmt_cost (target_cost_data, ncopies, vector_stmt,
3669 stmt_info, 0, vect_body);
3670
3671 /* prologue cost for vec_init and vec_step. */
3672 prologue_cost = add_stmt_cost (target_cost_data, 2, scalar_to_vec,
3673 stmt_info, 0, vect_prologue);
3674
3675 if (dump_enabled_p ())
3676 dump_printf_loc (MSG_NOTE, vect_location,
3677 "vect_model_induction_cost: inside_cost = %d, "
3678 "prologue_cost = %d .\n", inside_cost, prologue_cost);
3679 }
3680
3681
3682 /* Function get_initial_def_for_induction
3683
3684 Input:
3685 STMT - a stmt that performs an induction operation in the loop.
3686 IV_PHI - the initial value of the induction variable
3687
3688 Output:
3689 Return a vector variable, initialized with the first VF values of
3690 the induction variable. E.g., for an iv with IV_PHI='X' and
3691 evolution S, for a vector of 4 units, we want to return:
3692 [X, X + S, X + 2*S, X + 3*S]. */
3693
3694 static tree
get_initial_def_for_induction(gimple * iv_phi)3695 get_initial_def_for_induction (gimple *iv_phi)
3696 {
3697 stmt_vec_info stmt_vinfo = vinfo_for_stmt (iv_phi);
3698 loop_vec_info loop_vinfo = STMT_VINFO_LOOP_VINFO (stmt_vinfo);
3699 struct loop *loop = LOOP_VINFO_LOOP (loop_vinfo);
3700 tree vectype;
3701 int nunits;
3702 edge pe = loop_preheader_edge (loop);
3703 struct loop *iv_loop;
3704 basic_block new_bb;
3705 tree new_vec, vec_init, vec_step, t;
3706 tree new_name;
3707 gimple *new_stmt;
3708 gphi *induction_phi;
3709 tree induc_def, vec_def, vec_dest;
3710 tree init_expr, step_expr;
3711 int vf = LOOP_VINFO_VECT_FACTOR (loop_vinfo);
3712 int i;
3713 int ncopies;
3714 tree expr;
3715 stmt_vec_info phi_info = vinfo_for_stmt (iv_phi);
3716 bool nested_in_vect_loop = false;
3717 gimple_seq stmts;
3718 imm_use_iterator imm_iter;
3719 use_operand_p use_p;
3720 gimple *exit_phi;
3721 edge latch_e;
3722 tree loop_arg;
3723 gimple_stmt_iterator si;
3724 basic_block bb = gimple_bb (iv_phi);
3725 tree stepvectype;
3726 tree resvectype;
3727
3728 /* Is phi in an inner-loop, while vectorizing an enclosing outer-loop? */
3729 if (nested_in_vect_loop_p (loop, iv_phi))
3730 {
3731 nested_in_vect_loop = true;
3732 iv_loop = loop->inner;
3733 }
3734 else
3735 iv_loop = loop;
3736 gcc_assert (iv_loop == (gimple_bb (iv_phi))->loop_father);
3737
3738 latch_e = loop_latch_edge (iv_loop);
3739 loop_arg = PHI_ARG_DEF_FROM_EDGE (iv_phi, latch_e);
3740
3741 step_expr = STMT_VINFO_LOOP_PHI_EVOLUTION_PART (phi_info);
3742 gcc_assert (step_expr != NULL_TREE);
3743
3744 pe = loop_preheader_edge (iv_loop);
3745 init_expr = PHI_ARG_DEF_FROM_EDGE (iv_phi,
3746 loop_preheader_edge (iv_loop));
3747
3748 vectype = get_vectype_for_scalar_type (TREE_TYPE (init_expr));
3749 resvectype = get_vectype_for_scalar_type (TREE_TYPE (PHI_RESULT (iv_phi)));
3750 gcc_assert (vectype);
3751 nunits = TYPE_VECTOR_SUBPARTS (vectype);
3752 ncopies = vf / nunits;
3753
3754 gcc_assert (phi_info);
3755 gcc_assert (ncopies >= 1);
3756
3757 /* Convert the step to the desired type. */
3758 stmts = NULL;
3759 step_expr = gimple_convert (&stmts, TREE_TYPE (vectype), step_expr);
3760 if (stmts)
3761 {
3762 new_bb = gsi_insert_seq_on_edge_immediate (pe, stmts);
3763 gcc_assert (!new_bb);
3764 }
3765
3766 /* Find the first insertion point in the BB. */
3767 si = gsi_after_labels (bb);
3768
3769 /* Create the vector that holds the initial_value of the induction. */
3770 if (nested_in_vect_loop)
3771 {
3772 /* iv_loop is nested in the loop to be vectorized. init_expr had already
3773 been created during vectorization of previous stmts. We obtain it
3774 from the STMT_VINFO_VEC_STMT of the defining stmt. */
3775 vec_init = vect_get_vec_def_for_operand (init_expr, iv_phi);
3776 /* If the initial value is not of proper type, convert it. */
3777 if (!useless_type_conversion_p (vectype, TREE_TYPE (vec_init)))
3778 {
3779 new_stmt
3780 = gimple_build_assign (vect_get_new_ssa_name (vectype,
3781 vect_simple_var,
3782 "vec_iv_"),
3783 VIEW_CONVERT_EXPR,
3784 build1 (VIEW_CONVERT_EXPR, vectype,
3785 vec_init));
3786 vec_init = gimple_assign_lhs (new_stmt);
3787 new_bb = gsi_insert_on_edge_immediate (loop_preheader_edge (iv_loop),
3788 new_stmt);
3789 gcc_assert (!new_bb);
3790 set_vinfo_for_stmt (new_stmt,
3791 new_stmt_vec_info (new_stmt, loop_vinfo));
3792 }
3793 }
3794 else
3795 {
3796 vec<constructor_elt, va_gc> *v;
3797
3798 /* iv_loop is the loop to be vectorized. Create:
3799 vec_init = [X, X+S, X+2*S, X+3*S] (S = step_expr, X = init_expr) */
3800 stmts = NULL;
3801 new_name = gimple_convert (&stmts, TREE_TYPE (vectype), init_expr);
3802
3803 vec_alloc (v, nunits);
3804 bool constant_p = is_gimple_min_invariant (new_name);
3805 CONSTRUCTOR_APPEND_ELT (v, NULL_TREE, new_name);
3806 for (i = 1; i < nunits; i++)
3807 {
3808 /* Create: new_name_i = new_name + step_expr */
3809 new_name = gimple_build (&stmts, PLUS_EXPR, TREE_TYPE (new_name),
3810 new_name, step_expr);
3811 if (!is_gimple_min_invariant (new_name))
3812 constant_p = false;
3813 CONSTRUCTOR_APPEND_ELT (v, NULL_TREE, new_name);
3814 }
3815 if (stmts)
3816 {
3817 new_bb = gsi_insert_seq_on_edge_immediate (pe, stmts);
3818 gcc_assert (!new_bb);
3819 }
3820
3821 /* Create a vector from [new_name_0, new_name_1, ..., new_name_nunits-1] */
3822 if (constant_p)
3823 new_vec = build_vector_from_ctor (vectype, v);
3824 else
3825 new_vec = build_constructor (vectype, v);
3826 vec_init = vect_init_vector (iv_phi, new_vec, vectype, NULL);
3827 }
3828
3829
3830 /* Create the vector that holds the step of the induction. */
3831 if (nested_in_vect_loop)
3832 /* iv_loop is nested in the loop to be vectorized. Generate:
3833 vec_step = [S, S, S, S] */
3834 new_name = step_expr;
3835 else
3836 {
3837 /* iv_loop is the loop to be vectorized. Generate:
3838 vec_step = [VF*S, VF*S, VF*S, VF*S] */
3839 if (SCALAR_FLOAT_TYPE_P (TREE_TYPE (step_expr)))
3840 {
3841 expr = build_int_cst (integer_type_node, vf);
3842 expr = fold_convert (TREE_TYPE (step_expr), expr);
3843 }
3844 else
3845 expr = build_int_cst (TREE_TYPE (step_expr), vf);
3846 new_name = fold_build2 (MULT_EXPR, TREE_TYPE (step_expr),
3847 expr, step_expr);
3848 if (TREE_CODE (step_expr) == SSA_NAME)
3849 new_name = vect_init_vector (iv_phi, new_name,
3850 TREE_TYPE (step_expr), NULL);
3851 }
3852
3853 t = unshare_expr (new_name);
3854 gcc_assert (CONSTANT_CLASS_P (new_name)
3855 || TREE_CODE (new_name) == SSA_NAME);
3856 stepvectype = get_vectype_for_scalar_type (TREE_TYPE (new_name));
3857 gcc_assert (stepvectype);
3858 new_vec = build_vector_from_val (stepvectype, t);
3859 vec_step = vect_init_vector (iv_phi, new_vec, stepvectype, NULL);
3860
3861
3862 /* Create the following def-use cycle:
3863 loop prolog:
3864 vec_init = ...
3865 vec_step = ...
3866 loop:
3867 vec_iv = PHI <vec_init, vec_loop>
3868 ...
3869 STMT
3870 ...
3871 vec_loop = vec_iv + vec_step; */
3872
3873 /* Create the induction-phi that defines the induction-operand. */
3874 vec_dest = vect_get_new_vect_var (vectype, vect_simple_var, "vec_iv_");
3875 induction_phi = create_phi_node (vec_dest, iv_loop->header);
3876 set_vinfo_for_stmt (induction_phi,
3877 new_stmt_vec_info (induction_phi, loop_vinfo));
3878 induc_def = PHI_RESULT (induction_phi);
3879
3880 /* Create the iv update inside the loop */
3881 new_stmt = gimple_build_assign (vec_dest, PLUS_EXPR, induc_def, vec_step);
3882 vec_def = make_ssa_name (vec_dest, new_stmt);
3883 gimple_assign_set_lhs (new_stmt, vec_def);
3884 gsi_insert_before (&si, new_stmt, GSI_SAME_STMT);
3885 set_vinfo_for_stmt (new_stmt, new_stmt_vec_info (new_stmt, loop_vinfo));
3886
3887 /* Set the arguments of the phi node: */
3888 add_phi_arg (induction_phi, vec_init, pe, UNKNOWN_LOCATION);
3889 add_phi_arg (induction_phi, vec_def, loop_latch_edge (iv_loop),
3890 UNKNOWN_LOCATION);
3891
3892
3893 /* In case that vectorization factor (VF) is bigger than the number
3894 of elements that we can fit in a vectype (nunits), we have to generate
3895 more than one vector stmt - i.e - we need to "unroll" the
3896 vector stmt by a factor VF/nunits. For more details see documentation
3897 in vectorizable_operation. */
3898
3899 if (ncopies > 1)
3900 {
3901 stmt_vec_info prev_stmt_vinfo;
3902 /* FORNOW. This restriction should be relaxed. */
3903 gcc_assert (!nested_in_vect_loop);
3904
3905 /* Create the vector that holds the step of the induction. */
3906 if (SCALAR_FLOAT_TYPE_P (TREE_TYPE (step_expr)))
3907 {
3908 expr = build_int_cst (integer_type_node, nunits);
3909 expr = fold_convert (TREE_TYPE (step_expr), expr);
3910 }
3911 else
3912 expr = build_int_cst (TREE_TYPE (step_expr), nunits);
3913 new_name = fold_build2 (MULT_EXPR, TREE_TYPE (step_expr),
3914 expr, step_expr);
3915 if (TREE_CODE (step_expr) == SSA_NAME)
3916 new_name = vect_init_vector (iv_phi, new_name,
3917 TREE_TYPE (step_expr), NULL);
3918 t = unshare_expr (new_name);
3919 gcc_assert (CONSTANT_CLASS_P (new_name)
3920 || TREE_CODE (new_name) == SSA_NAME);
3921 new_vec = build_vector_from_val (stepvectype, t);
3922 vec_step = vect_init_vector (iv_phi, new_vec, stepvectype, NULL);
3923
3924 vec_def = induc_def;
3925 prev_stmt_vinfo = vinfo_for_stmt (induction_phi);
3926 for (i = 1; i < ncopies; i++)
3927 {
3928 /* vec_i = vec_prev + vec_step */
3929 new_stmt = gimple_build_assign (vec_dest, PLUS_EXPR,
3930 vec_def, vec_step);
3931 vec_def = make_ssa_name (vec_dest, new_stmt);
3932 gimple_assign_set_lhs (new_stmt, vec_def);
3933
3934 gsi_insert_before (&si, new_stmt, GSI_SAME_STMT);
3935 if (!useless_type_conversion_p (resvectype, vectype))
3936 {
3937 new_stmt
3938 = gimple_build_assign
3939 (vect_get_new_vect_var (resvectype, vect_simple_var,
3940 "vec_iv_"),
3941 VIEW_CONVERT_EXPR,
3942 build1 (VIEW_CONVERT_EXPR, resvectype,
3943 gimple_assign_lhs (new_stmt)));
3944 gimple_assign_set_lhs (new_stmt,
3945 make_ssa_name
3946 (gimple_assign_lhs (new_stmt), new_stmt));
3947 gsi_insert_before (&si, new_stmt, GSI_SAME_STMT);
3948 }
3949 set_vinfo_for_stmt (new_stmt,
3950 new_stmt_vec_info (new_stmt, loop_vinfo));
3951 STMT_VINFO_RELATED_STMT (prev_stmt_vinfo) = new_stmt;
3952 prev_stmt_vinfo = vinfo_for_stmt (new_stmt);
3953 }
3954 }
3955
3956 if (nested_in_vect_loop)
3957 {
3958 /* Find the loop-closed exit-phi of the induction, and record
3959 the final vector of induction results: */
3960 exit_phi = NULL;
3961 FOR_EACH_IMM_USE_FAST (use_p, imm_iter, loop_arg)
3962 {
3963 gimple *use_stmt = USE_STMT (use_p);
3964 if (is_gimple_debug (use_stmt))
3965 continue;
3966
3967 if (!flow_bb_inside_loop_p (iv_loop, gimple_bb (use_stmt)))
3968 {
3969 exit_phi = use_stmt;
3970 break;
3971 }
3972 }
3973 if (exit_phi)
3974 {
3975 stmt_vec_info stmt_vinfo = vinfo_for_stmt (exit_phi);
3976 /* FORNOW. Currently not supporting the case that an inner-loop induction
3977 is not used in the outer-loop (i.e. only outside the outer-loop). */
3978 gcc_assert (STMT_VINFO_RELEVANT_P (stmt_vinfo)
3979 && !STMT_VINFO_LIVE_P (stmt_vinfo));
3980
3981 STMT_VINFO_VEC_STMT (stmt_vinfo) = new_stmt;
3982 if (dump_enabled_p ())
3983 {
3984 dump_printf_loc (MSG_NOTE, vect_location,
3985 "vector of inductions after inner-loop:");
3986 dump_gimple_stmt (MSG_NOTE, TDF_SLIM, new_stmt, 0);
3987 dump_printf (MSG_NOTE, "\n");
3988 }
3989 }
3990 }
3991
3992
3993 if (dump_enabled_p ())
3994 {
3995 dump_printf_loc (MSG_NOTE, vect_location,
3996 "transform induction: created def-use cycle: ");
3997 dump_gimple_stmt (MSG_NOTE, TDF_SLIM, induction_phi, 0);
3998 dump_printf (MSG_NOTE, "\n");
3999 dump_gimple_stmt (MSG_NOTE, TDF_SLIM,
4000 SSA_NAME_DEF_STMT (vec_def), 0);
4001 dump_printf (MSG_NOTE, "\n");
4002 }
4003
4004 STMT_VINFO_VEC_STMT (phi_info) = induction_phi;
4005 if (!useless_type_conversion_p (resvectype, vectype))
4006 {
4007 new_stmt = gimple_build_assign (vect_get_new_vect_var (resvectype,
4008 vect_simple_var,
4009 "vec_iv_"),
4010 VIEW_CONVERT_EXPR,
4011 build1 (VIEW_CONVERT_EXPR, resvectype,
4012 induc_def));
4013 induc_def = make_ssa_name (gimple_assign_lhs (new_stmt), new_stmt);
4014 gimple_assign_set_lhs (new_stmt, induc_def);
4015 si = gsi_after_labels (bb);
4016 gsi_insert_before (&si, new_stmt, GSI_SAME_STMT);
4017 set_vinfo_for_stmt (new_stmt,
4018 new_stmt_vec_info (new_stmt, loop_vinfo));
4019 STMT_VINFO_RELATED_STMT (vinfo_for_stmt (new_stmt))
4020 = STMT_VINFO_RELATED_STMT (vinfo_for_stmt (induction_phi));
4021 }
4022
4023 return induc_def;
4024 }
4025
4026
4027 /* Function get_initial_def_for_reduction
4028
4029 Input:
4030 STMT - a stmt that performs a reduction operation in the loop.
4031 INIT_VAL - the initial value of the reduction variable
4032
4033 Output:
4034 ADJUSTMENT_DEF - a tree that holds a value to be added to the final result
4035 of the reduction (used for adjusting the epilog - see below).
4036 Return a vector variable, initialized according to the operation that STMT
4037 performs. This vector will be used as the initial value of the
4038 vector of partial results.
4039
4040 Option1 (adjust in epilog): Initialize the vector as follows:
4041 add/bit or/xor: [0,0,...,0,0]
4042 mult/bit and: [1,1,...,1,1]
4043 min/max/cond_expr: [init_val,init_val,..,init_val,init_val]
4044 and when necessary (e.g. add/mult case) let the caller know
4045 that it needs to adjust the result by init_val.
4046
4047 Option2: Initialize the vector as follows:
4048 add/bit or/xor: [init_val,0,0,...,0]
4049 mult/bit and: [init_val,1,1,...,1]
4050 min/max/cond_expr: [init_val,init_val,...,init_val]
4051 and no adjustments are needed.
4052
4053 For example, for the following code:
4054
4055 s = init_val;
4056 for (i=0;i<n;i++)
4057 s = s + a[i];
4058
4059 STMT is 's = s + a[i]', and the reduction variable is 's'.
4060 For a vector of 4 units, we want to return either [0,0,0,init_val],
4061 or [0,0,0,0] and let the caller know that it needs to adjust
4062 the result at the end by 'init_val'.
4063
4064 FORNOW, we are using the 'adjust in epilog' scheme, because this way the
4065 initialization vector is simpler (same element in all entries), if
4066 ADJUSTMENT_DEF is not NULL, and Option2 otherwise.
4067
4068 A cost model should help decide between these two schemes. */
4069
4070 tree
get_initial_def_for_reduction(gimple * stmt,tree init_val,tree * adjustment_def)4071 get_initial_def_for_reduction (gimple *stmt, tree init_val,
4072 tree *adjustment_def)
4073 {
4074 stmt_vec_info stmt_vinfo = vinfo_for_stmt (stmt);
4075 loop_vec_info loop_vinfo = STMT_VINFO_LOOP_VINFO (stmt_vinfo);
4076 struct loop *loop = LOOP_VINFO_LOOP (loop_vinfo);
4077 tree scalar_type = TREE_TYPE (init_val);
4078 tree vectype = get_vectype_for_scalar_type (scalar_type);
4079 int nunits;
4080 enum tree_code code = gimple_assign_rhs_code (stmt);
4081 tree def_for_init;
4082 tree init_def;
4083 tree *elts;
4084 int i;
4085 bool nested_in_vect_loop = false;
4086 REAL_VALUE_TYPE real_init_val = dconst0;
4087 int int_init_val = 0;
4088 gimple *def_stmt = NULL;
4089 gimple_seq stmts = NULL;
4090
4091 gcc_assert (vectype);
4092 nunits = TYPE_VECTOR_SUBPARTS (vectype);
4093
4094 gcc_assert (POINTER_TYPE_P (scalar_type) || INTEGRAL_TYPE_P (scalar_type)
4095 || SCALAR_FLOAT_TYPE_P (scalar_type));
4096
4097 if (nested_in_vect_loop_p (loop, stmt))
4098 nested_in_vect_loop = true;
4099 else
4100 gcc_assert (loop == (gimple_bb (stmt))->loop_father);
4101
4102 /* In case of double reduction we only create a vector variable to be put
4103 in the reduction phi node. The actual statement creation is done in
4104 vect_create_epilog_for_reduction. */
4105 if (adjustment_def && nested_in_vect_loop
4106 && TREE_CODE (init_val) == SSA_NAME
4107 && (def_stmt = SSA_NAME_DEF_STMT (init_val))
4108 && gimple_code (def_stmt) == GIMPLE_PHI
4109 && flow_bb_inside_loop_p (loop, gimple_bb (def_stmt))
4110 && vinfo_for_stmt (def_stmt)
4111 && STMT_VINFO_DEF_TYPE (vinfo_for_stmt (def_stmt))
4112 == vect_double_reduction_def)
4113 {
4114 *adjustment_def = NULL;
4115 return vect_create_destination_var (init_val, vectype);
4116 }
4117
4118 /* In case of a nested reduction do not use an adjustment def as
4119 that case is not supported by the epilogue generation correctly
4120 if ncopies is not one. */
4121 if (adjustment_def && nested_in_vect_loop)
4122 {
4123 *adjustment_def = NULL;
4124 return vect_get_vec_def_for_operand (init_val, stmt);
4125 }
4126
4127 switch (code)
4128 {
4129 case WIDEN_SUM_EXPR:
4130 case DOT_PROD_EXPR:
4131 case SAD_EXPR:
4132 case PLUS_EXPR:
4133 case MINUS_EXPR:
4134 case BIT_IOR_EXPR:
4135 case BIT_XOR_EXPR:
4136 case MULT_EXPR:
4137 case BIT_AND_EXPR:
4138 /* ADJUSMENT_DEF is NULL when called from
4139 vect_create_epilog_for_reduction to vectorize double reduction. */
4140 if (adjustment_def)
4141 *adjustment_def = init_val;
4142
4143 if (code == MULT_EXPR)
4144 {
4145 real_init_val = dconst1;
4146 int_init_val = 1;
4147 }
4148
4149 if (code == BIT_AND_EXPR)
4150 int_init_val = -1;
4151
4152 if (SCALAR_FLOAT_TYPE_P (scalar_type))
4153 def_for_init = build_real (scalar_type, real_init_val);
4154 else
4155 def_for_init = build_int_cst (scalar_type, int_init_val);
4156
4157 /* Create a vector of '0' or '1' except the first element. */
4158 elts = XALLOCAVEC (tree, nunits);
4159 for (i = nunits - 2; i >= 0; --i)
4160 elts[i + 1] = def_for_init;
4161
4162 /* Option1: the first element is '0' or '1' as well. */
4163 if (adjustment_def)
4164 {
4165 elts[0] = def_for_init;
4166 init_def = build_vector (vectype, elts);
4167 break;
4168 }
4169
4170 /* Option2: the first element is INIT_VAL. */
4171 elts[0] = init_val;
4172 if (TREE_CONSTANT (init_val))
4173 init_def = build_vector (vectype, elts);
4174 else
4175 {
4176 vec<constructor_elt, va_gc> *v;
4177 vec_alloc (v, nunits);
4178 CONSTRUCTOR_APPEND_ELT (v, NULL_TREE, init_val);
4179 for (i = 1; i < nunits; ++i)
4180 CONSTRUCTOR_APPEND_ELT (v, NULL_TREE, elts[i]);
4181 init_def = build_constructor (vectype, v);
4182 }
4183
4184 break;
4185
4186 case MIN_EXPR:
4187 case MAX_EXPR:
4188 case COND_EXPR:
4189 if (adjustment_def)
4190 {
4191 *adjustment_def = NULL_TREE;
4192 if (STMT_VINFO_VEC_REDUCTION_TYPE (stmt_vinfo) != COND_REDUCTION)
4193 {
4194 init_def = vect_get_vec_def_for_operand (init_val, stmt);
4195 break;
4196 }
4197 }
4198 init_val = gimple_convert (&stmts, TREE_TYPE (vectype), init_val);
4199 if (! gimple_seq_empty_p (stmts))
4200 gsi_insert_seq_on_edge_immediate (loop_preheader_edge (loop), stmts);
4201 init_def = build_vector_from_val (vectype, init_val);
4202 break;
4203
4204 default:
4205 gcc_unreachable ();
4206 }
4207
4208 return init_def;
4209 }
4210
4211 /* Function vect_create_epilog_for_reduction
4212
4213 Create code at the loop-epilog to finalize the result of a reduction
4214 computation.
4215
4216 VECT_DEFS is list of vector of partial results, i.e., the lhs's of vector
4217 reduction statements.
4218 STMT is the scalar reduction stmt that is being vectorized.
4219 NCOPIES is > 1 in case the vectorization factor (VF) is bigger than the
4220 number of elements that we can fit in a vectype (nunits). In this case
4221 we have to generate more than one vector stmt - i.e - we need to "unroll"
4222 the vector stmt by a factor VF/nunits. For more details see documentation
4223 in vectorizable_operation.
4224 REDUC_CODE is the tree-code for the epilog reduction.
4225 REDUCTION_PHIS is a list of the phi-nodes that carry the reduction
4226 computation.
4227 REDUC_INDEX is the index of the operand in the right hand side of the
4228 statement that is defined by REDUCTION_PHI.
4229 DOUBLE_REDUC is TRUE if double reduction phi nodes should be handled.
4230 SLP_NODE is an SLP node containing a group of reduction statements. The
4231 first one in this group is STMT.
4232 INDUCTION_INDEX is the index of the loop for condition reductions.
4233 Otherwise it is undefined.
4234
4235 This function:
4236 1. Creates the reduction def-use cycles: sets the arguments for
4237 REDUCTION_PHIS:
4238 The loop-entry argument is the vectorized initial-value of the reduction.
4239 The loop-latch argument is taken from VECT_DEFS - the vector of partial
4240 sums.
4241 2. "Reduces" each vector of partial results VECT_DEFS into a single result,
4242 by applying the operation specified by REDUC_CODE if available, or by
4243 other means (whole-vector shifts or a scalar loop).
4244 The function also creates a new phi node at the loop exit to preserve
4245 loop-closed form, as illustrated below.
4246
4247 The flow at the entry to this function:
4248
4249 loop:
4250 vec_def = phi <null, null> # REDUCTION_PHI
4251 VECT_DEF = vector_stmt # vectorized form of STMT
4252 s_loop = scalar_stmt # (scalar) STMT
4253 loop_exit:
4254 s_out0 = phi <s_loop> # (scalar) EXIT_PHI
4255 use <s_out0>
4256 use <s_out0>
4257
4258 The above is transformed by this function into:
4259
4260 loop:
4261 vec_def = phi <vec_init, VECT_DEF> # REDUCTION_PHI
4262 VECT_DEF = vector_stmt # vectorized form of STMT
4263 s_loop = scalar_stmt # (scalar) STMT
4264 loop_exit:
4265 s_out0 = phi <s_loop> # (scalar) EXIT_PHI
4266 v_out1 = phi <VECT_DEF> # NEW_EXIT_PHI
4267 v_out2 = reduce <v_out1>
4268 s_out3 = extract_field <v_out2, 0>
4269 s_out4 = adjust_result <s_out3>
4270 use <s_out4>
4271 use <s_out4>
4272 */
4273
4274 static void
vect_create_epilog_for_reduction(vec<tree> vect_defs,gimple * stmt,int ncopies,enum tree_code reduc_code,vec<gimple * > reduction_phis,int reduc_index,bool double_reduc,slp_tree slp_node,tree induction_index)4275 vect_create_epilog_for_reduction (vec<tree> vect_defs, gimple *stmt,
4276 int ncopies, enum tree_code reduc_code,
4277 vec<gimple *> reduction_phis,
4278 int reduc_index, bool double_reduc,
4279 slp_tree slp_node, tree induction_index)
4280 {
4281 stmt_vec_info stmt_info = vinfo_for_stmt (stmt);
4282 stmt_vec_info prev_phi_info;
4283 tree vectype;
4284 machine_mode mode;
4285 loop_vec_info loop_vinfo = STMT_VINFO_LOOP_VINFO (stmt_info);
4286 struct loop *loop = LOOP_VINFO_LOOP (loop_vinfo), *outer_loop = NULL;
4287 basic_block exit_bb;
4288 tree scalar_dest;
4289 tree scalar_type;
4290 gimple *new_phi = NULL, *phi;
4291 gimple_stmt_iterator exit_gsi;
4292 tree vec_dest;
4293 tree new_temp = NULL_TREE, new_dest, new_name, new_scalar_dest;
4294 gimple *epilog_stmt = NULL;
4295 enum tree_code code = gimple_assign_rhs_code (stmt);
4296 gimple *exit_phi;
4297 tree bitsize;
4298 tree adjustment_def = NULL;
4299 tree vec_initial_def = NULL;
4300 tree reduction_op, expr, def, initial_def = NULL;
4301 tree orig_name, scalar_result;
4302 imm_use_iterator imm_iter, phi_imm_iter;
4303 use_operand_p use_p, phi_use_p;
4304 gimple *use_stmt, *orig_stmt, *reduction_phi = NULL;
4305 bool nested_in_vect_loop = false;
4306 auto_vec<gimple *> new_phis;
4307 auto_vec<gimple *> inner_phis;
4308 enum vect_def_type dt = vect_unknown_def_type;
4309 int j, i;
4310 auto_vec<tree> scalar_results;
4311 unsigned int group_size = 1, k, ratio;
4312 auto_vec<tree> vec_initial_defs;
4313 auto_vec<gimple *> phis;
4314 bool slp_reduc = false;
4315 tree new_phi_result;
4316 gimple *inner_phi = NULL;
4317
4318 if (slp_node)
4319 group_size = SLP_TREE_SCALAR_STMTS (slp_node).length ();
4320
4321 if (nested_in_vect_loop_p (loop, stmt))
4322 {
4323 outer_loop = loop;
4324 loop = loop->inner;
4325 nested_in_vect_loop = true;
4326 gcc_assert (!slp_node);
4327 }
4328
4329 reduction_op = get_reduction_op (stmt, reduc_index);
4330
4331 vectype = get_vectype_for_scalar_type (TREE_TYPE (reduction_op));
4332 gcc_assert (vectype);
4333 mode = TYPE_MODE (vectype);
4334
4335 /* 1. Create the reduction def-use cycle:
4336 Set the arguments of REDUCTION_PHIS, i.e., transform
4337
4338 loop:
4339 vec_def = phi <null, null> # REDUCTION_PHI
4340 VECT_DEF = vector_stmt # vectorized form of STMT
4341 ...
4342
4343 into:
4344
4345 loop:
4346 vec_def = phi <vec_init, VECT_DEF> # REDUCTION_PHI
4347 VECT_DEF = vector_stmt # vectorized form of STMT
4348 ...
4349
4350 (in case of SLP, do it for all the phis). */
4351
4352 /* Get the loop-entry arguments. */
4353 enum vect_def_type initial_def_dt = vect_unknown_def_type;
4354 if (slp_node)
4355 vect_get_vec_defs (reduction_op, NULL_TREE, stmt, &vec_initial_defs,
4356 NULL, slp_node, reduc_index);
4357 else
4358 {
4359 /* Get at the scalar def before the loop, that defines the initial value
4360 of the reduction variable. */
4361 gimple *def_stmt = SSA_NAME_DEF_STMT (reduction_op);
4362 initial_def = PHI_ARG_DEF_FROM_EDGE (def_stmt,
4363 loop_preheader_edge (loop));
4364 vect_is_simple_use (initial_def, loop_vinfo, &def_stmt, &initial_def_dt);
4365 vec_initial_def = get_initial_def_for_reduction (stmt, initial_def,
4366 &adjustment_def);
4367 vec_initial_defs.create (1);
4368 vec_initial_defs.quick_push (vec_initial_def);
4369 }
4370
4371 /* Set phi nodes arguments. */
4372 FOR_EACH_VEC_ELT (reduction_phis, i, phi)
4373 {
4374 tree vec_init_def, def;
4375 gimple_seq stmts;
4376 vec_init_def = force_gimple_operand (vec_initial_defs[i], &stmts,
4377 true, NULL_TREE);
4378 gsi_insert_seq_on_edge_immediate (loop_preheader_edge (loop), stmts);
4379 def = vect_defs[i];
4380 for (j = 0; j < ncopies; j++)
4381 {
4382 if (j != 0)
4383 {
4384 phi = STMT_VINFO_RELATED_STMT (vinfo_for_stmt (phi));
4385 if (nested_in_vect_loop)
4386 vec_init_def
4387 = vect_get_vec_def_for_stmt_copy (initial_def_dt,
4388 vec_init_def);
4389 }
4390
4391 /* Set the loop-entry arg of the reduction-phi. */
4392
4393 if (STMT_VINFO_VEC_REDUCTION_TYPE (stmt_info)
4394 == INTEGER_INDUC_COND_REDUCTION)
4395 {
	      /* Initialise the reduction phi to zero.  This prevents initial
		 values of non-zero interfering with the reduction op. */
4398 gcc_assert (ncopies == 1);
4399 gcc_assert (i == 0);
4400
4401 tree vec_init_def_type = TREE_TYPE (vec_init_def);
4402 tree zero_vec = build_zero_cst (vec_init_def_type);
4403
4404 add_phi_arg (as_a <gphi *> (phi), zero_vec,
4405 loop_preheader_edge (loop), UNKNOWN_LOCATION);
4406 }
4407 else
4408 add_phi_arg (as_a <gphi *> (phi), vec_init_def,
4409 loop_preheader_edge (loop), UNKNOWN_LOCATION);
4410
4411 /* Set the loop-latch arg for the reduction-phi. */
4412 if (j > 0)
4413 def = vect_get_vec_def_for_stmt_copy (vect_unknown_def_type, def);
4414
4415 add_phi_arg (as_a <gphi *> (phi), def, loop_latch_edge (loop),
4416 UNKNOWN_LOCATION);
4417
4418 if (dump_enabled_p ())
4419 {
4420 dump_printf_loc (MSG_NOTE, vect_location,
4421 "transform reduction: created def-use cycle: ");
4422 dump_gimple_stmt (MSG_NOTE, TDF_SLIM, phi, 0);
4423 dump_printf (MSG_NOTE, "\n");
4424 dump_gimple_stmt (MSG_NOTE, TDF_SLIM, SSA_NAME_DEF_STMT (def), 0);
4425 dump_printf (MSG_NOTE, "\n");
4426 }
4427 }
4428 }
4429
4430 /* 2. Create epilog code.
4431 The reduction epilog code operates across the elements of the vector
4432 of partial results computed by the vectorized loop.
4433 The reduction epilog code consists of:
4434
4435 step 1: compute the scalar result in a vector (v_out2)
4436 step 2: extract the scalar result (s_out3) from the vector (v_out2)
4437 step 3: adjust the scalar result (s_out3) if needed.
4438
        Step 1 can be accomplished using one of the following three schemes:
4440 (scheme 1) using reduc_code, if available.
4441 (scheme 2) using whole-vector shifts, if available.
4442 (scheme 3) using a scalar loop. In this case steps 1+2 above are
4443 combined.
4444
4445 The overall epilog code looks like this:
4446
4447 s_out0 = phi <s_loop> # original EXIT_PHI
4448 v_out1 = phi <VECT_DEF> # NEW_EXIT_PHI
4449 v_out2 = reduce <v_out1> # step 1
4450 s_out3 = extract_field <v_out2, 0> # step 2
4451 s_out4 = adjust_result <s_out3> # step 3
4452
4453 (step 3 is optional, and steps 1 and 2 may be combined).
4454 Lastly, the uses of s_out0 are replaced by s_out4. */
4455
4456
4457 /* 2.1 Create new loop-exit-phis to preserve loop-closed form:
4458 v_out1 = phi <VECT_DEF>
4459 Store them in NEW_PHIS. */
4460
4461 exit_bb = single_exit (loop)->dest;
4462 prev_phi_info = NULL;
4463 new_phis.create (vect_defs.length ());
4464 FOR_EACH_VEC_ELT (vect_defs, i, def)
4465 {
4466 for (j = 0; j < ncopies; j++)
4467 {
4468 tree new_def = copy_ssa_name (def);
4469 phi = create_phi_node (new_def, exit_bb);
4470 set_vinfo_for_stmt (phi, new_stmt_vec_info (phi, loop_vinfo));
4471 if (j == 0)
4472 new_phis.quick_push (phi);
4473 else
4474 {
4475 def = vect_get_vec_def_for_stmt_copy (dt, def);
4476 STMT_VINFO_RELATED_STMT (prev_phi_info) = phi;
4477 }
4478
4479 SET_PHI_ARG_DEF (phi, single_exit (loop)->dest_idx, def);
4480 prev_phi_info = vinfo_for_stmt (phi);
4481 }
4482 }
4483
4484 /* The epilogue is created for the outer-loop, i.e., for the loop being
4485 vectorized. Create exit phis for the outer loop. */
4486 if (double_reduc)
4487 {
4488 loop = outer_loop;
4489 exit_bb = single_exit (loop)->dest;
4490 inner_phis.create (vect_defs.length ());
4491 FOR_EACH_VEC_ELT (new_phis, i, phi)
4492 {
4493 tree new_result = copy_ssa_name (PHI_RESULT (phi));
4494 gphi *outer_phi = create_phi_node (new_result, exit_bb);
4495 SET_PHI_ARG_DEF (outer_phi, single_exit (loop)->dest_idx,
4496 PHI_RESULT (phi));
4497 set_vinfo_for_stmt (outer_phi, new_stmt_vec_info (outer_phi,
4498 loop_vinfo));
4499 inner_phis.quick_push (phi);
4500 new_phis[i] = outer_phi;
4501 prev_phi_info = vinfo_for_stmt (outer_phi);
4502 while (STMT_VINFO_RELATED_STMT (vinfo_for_stmt (phi)))
4503 {
4504 phi = STMT_VINFO_RELATED_STMT (vinfo_for_stmt (phi));
4505 new_result = copy_ssa_name (PHI_RESULT (phi));
4506 outer_phi = create_phi_node (new_result, exit_bb);
4507 SET_PHI_ARG_DEF (outer_phi, single_exit (loop)->dest_idx,
4508 PHI_RESULT (phi));
4509 set_vinfo_for_stmt (outer_phi, new_stmt_vec_info (outer_phi,
4510 loop_vinfo));
4511 STMT_VINFO_RELATED_STMT (prev_phi_info) = outer_phi;
4512 prev_phi_info = vinfo_for_stmt (outer_phi);
4513 }
4514 }
4515 }
4516
4517 exit_gsi = gsi_after_labels (exit_bb);
4518
4519 /* 2.2 Get the relevant tree-code to use in the epilog for schemes 2,3
4520 (i.e. when reduc_code is not available) and in the final adjustment
4521 code (if needed). Also get the original scalar reduction variable as
4522 defined in the loop. In case STMT is a "pattern-stmt" (i.e. - it
4523 represents a reduction pattern), the tree-code and scalar-def are
4524 taken from the original stmt that the pattern-stmt (STMT) replaces.
4525 Otherwise (it is a regular reduction) - the tree-code and scalar-def
4526 are taken from STMT. */
4527
4528 orig_stmt = STMT_VINFO_RELATED_STMT (stmt_info);
4529 if (!orig_stmt)
4530 {
4531 /* Regular reduction */
4532 orig_stmt = stmt;
4533 }
4534 else
4535 {
4536 /* Reduction pattern */
4537 stmt_vec_info stmt_vinfo = vinfo_for_stmt (orig_stmt);
4538 gcc_assert (STMT_VINFO_IN_PATTERN_P (stmt_vinfo));
4539 gcc_assert (STMT_VINFO_RELATED_STMT (stmt_vinfo) == stmt);
4540 }
4541
4542 code = gimple_assign_rhs_code (orig_stmt);
4543 /* For MINUS_EXPR the initial vector is [init_val,0,...,0], therefore,
4544 partial results are added and not subtracted. */
4545 if (code == MINUS_EXPR)
4546 code = PLUS_EXPR;
4547
4548 scalar_dest = gimple_assign_lhs (orig_stmt);
4549 scalar_type = TREE_TYPE (scalar_dest);
4550 scalar_results.create (group_size);
4551 new_scalar_dest = vect_create_destination_var (scalar_dest, NULL);
4552 bitsize = TYPE_SIZE (scalar_type);
4553
4554 /* In case this is a reduction in an inner-loop while vectorizing an outer
4555 loop - we don't need to extract a single scalar result at the end of the
4556 inner-loop (unless it is double reduction, i.e., the use of reduction is
4557 outside the outer-loop). The final vector of partial results will be used
4558 in the vectorized outer-loop, or reduced to a scalar result at the end of
4559 the outer-loop. */
4560 if (nested_in_vect_loop && !double_reduc)
4561 goto vect_finalize_reduction;
4562
4563 /* SLP reduction without reduction chain, e.g.,
4564 # a1 = phi <a2, a0>
4565 # b1 = phi <b2, b0>
4566 a2 = operation (a1)
4567 b2 = operation (b1) */
4568 slp_reduc = (slp_node && !GROUP_FIRST_ELEMENT (vinfo_for_stmt (stmt)));
4569
4570 /* In case of reduction chain, e.g.,
4571 # a1 = phi <a3, a0>
4572 a2 = operation (a1)
4573 a3 = operation (a2),
4574
4575 we may end up with more than one vector result. Here we reduce them to
4576 one vector. */
4577 if (GROUP_FIRST_ELEMENT (vinfo_for_stmt (stmt)))
4578 {
4579 tree first_vect = PHI_RESULT (new_phis[0]);
4580 tree tmp;
4581 gassign *new_vec_stmt = NULL;
4582
4583 vec_dest = vect_create_destination_var (scalar_dest, vectype);
4584 for (k = 1; k < new_phis.length (); k++)
4585 {
4586 gimple *next_phi = new_phis[k];
4587 tree second_vect = PHI_RESULT (next_phi);
4588
4589 tmp = build2 (code, vectype, first_vect, second_vect);
4590 new_vec_stmt = gimple_build_assign (vec_dest, tmp);
4591 first_vect = make_ssa_name (vec_dest, new_vec_stmt);
4592 gimple_assign_set_lhs (new_vec_stmt, first_vect);
4593 gsi_insert_before (&exit_gsi, new_vec_stmt, GSI_SAME_STMT);
4594 }
4595
4596 new_phi_result = first_vect;
4597 if (new_vec_stmt)
4598 {
4599 new_phis.truncate (0);
4600 new_phis.safe_push (new_vec_stmt);
4601 }
4602 }
4603 else
4604 new_phi_result = PHI_RESULT (new_phis[0]);
4605
4606 if (STMT_VINFO_VEC_REDUCTION_TYPE (stmt_info) == COND_REDUCTION)
4607 {
4608 /* For condition reductions, we have a vector (NEW_PHI_RESULT) containing
4609 various data values where the condition matched and another vector
4610 (INDUCTION_INDEX) containing all the indexes of those matches. We
4611 need to extract the last matching index (which will be the index with
4612 highest value) and use this to index into the data vector.
4613 For the case where there were no matches, the data vector will contain
4614 all default values and the index vector will be all zeros. */
4615
4616 /* Get various versions of the type of the vector of indexes. */
4617 tree index_vec_type = TREE_TYPE (induction_index);
4618 gcc_checking_assert (TYPE_UNSIGNED (index_vec_type));
4619 tree index_scalar_type = TREE_TYPE (index_vec_type);
4620 tree index_vec_cmp_type = build_same_sized_truth_vector_type
4621 (index_vec_type);
4622
4623 /* Get an unsigned integer version of the type of the data vector. */
4624 int scalar_precision = GET_MODE_PRECISION (TYPE_MODE (scalar_type));
4625 tree scalar_type_unsigned = make_unsigned_type (scalar_precision);
4626 tree vectype_unsigned = build_vector_type
4627 (scalar_type_unsigned, TYPE_VECTOR_SUBPARTS (vectype));
4628
4629 /* First we need to create a vector (ZERO_VEC) of zeros and another
4630 vector (MAX_INDEX_VEC) filled with the last matching index, which we
4631 can create using a MAX reduction and then expanding.
4632 In the case where the loop never made any matches, the max index will
4633 be zero. */
4634
4635 /* Vector of {0, 0, 0,...}. */
4636 tree zero_vec = make_ssa_name (vectype);
4637 tree zero_vec_rhs = build_zero_cst (vectype);
4638 gimple *zero_vec_stmt = gimple_build_assign (zero_vec, zero_vec_rhs);
4639 gsi_insert_before (&exit_gsi, zero_vec_stmt, GSI_SAME_STMT);
4640
4641 /* Find maximum value from the vector of found indexes. */
4642 tree max_index = make_ssa_name (index_scalar_type);
4643 gimple *max_index_stmt = gimple_build_assign (max_index, REDUC_MAX_EXPR,
4644 induction_index);
4645 gsi_insert_before (&exit_gsi, max_index_stmt, GSI_SAME_STMT);
4646
4647 /* Vector of {max_index, max_index, max_index,...}. */
4648 tree max_index_vec = make_ssa_name (index_vec_type);
4649 tree max_index_vec_rhs = build_vector_from_val (index_vec_type,
4650 max_index);
4651 gimple *max_index_vec_stmt = gimple_build_assign (max_index_vec,
4652 max_index_vec_rhs);
4653 gsi_insert_before (&exit_gsi, max_index_vec_stmt, GSI_SAME_STMT);
4654
4655 /* Next we compare the new vector (MAX_INDEX_VEC) full of max indexes
4656 with the vector (INDUCTION_INDEX) of found indexes, choosing values
4657 from the data vector (NEW_PHI_RESULT) for matches, 0 (ZERO_VEC)
4658 otherwise. Only one value should match, resulting in a vector
4659 (VEC_COND) with one data value and the rest zeros.
4660 In the case where the loop never made any matches, every index will
4661 match, resulting in a vector with all data values (which will all be
4662 the default value). */
4663
4664 /* Compare the max index vector to the vector of found indexes to find
4665 the position of the max value. */
4666 tree vec_compare = make_ssa_name (index_vec_cmp_type);
4667 gimple *vec_compare_stmt = gimple_build_assign (vec_compare, EQ_EXPR,
4668 induction_index,
4669 max_index_vec);
4670 gsi_insert_before (&exit_gsi, vec_compare_stmt, GSI_SAME_STMT);
4671
4672 /* Use the compare to choose either values from the data vector or
4673 zero. */
4674 tree vec_cond = make_ssa_name (vectype);
4675 gimple *vec_cond_stmt = gimple_build_assign (vec_cond, VEC_COND_EXPR,
4676 vec_compare, new_phi_result,
4677 zero_vec);
4678 gsi_insert_before (&exit_gsi, vec_cond_stmt, GSI_SAME_STMT);
4679
4680 /* Finally we need to extract the data value from the vector (VEC_COND)
4681 into a scalar (MATCHED_DATA_REDUC). Logically we want to do a OR
4682 reduction, but because this doesn't exist, we can use a MAX reduction
4683 instead. The data value might be signed or a float so we need to cast
4684 it first.
4685 In the case where the loop never made any matches, the data values are
4686 all identical, and so will reduce down correctly. */
4687
4688 /* Make the matched data values unsigned. */
4689 tree vec_cond_cast = make_ssa_name (vectype_unsigned);
4690 tree vec_cond_cast_rhs = build1 (VIEW_CONVERT_EXPR, vectype_unsigned,
4691 vec_cond);
4692 gimple *vec_cond_cast_stmt = gimple_build_assign (vec_cond_cast,
4693 VIEW_CONVERT_EXPR,
4694 vec_cond_cast_rhs);
4695 gsi_insert_before (&exit_gsi, vec_cond_cast_stmt, GSI_SAME_STMT);
4696
4697 /* Reduce down to a scalar value. */
4698 tree data_reduc = make_ssa_name (scalar_type_unsigned);
4699 optab ot = optab_for_tree_code (REDUC_MAX_EXPR, vectype_unsigned,
4700 optab_default);
4701 gcc_assert (optab_handler (ot, TYPE_MODE (vectype_unsigned))
4702 != CODE_FOR_nothing);
4703 gimple *data_reduc_stmt = gimple_build_assign (data_reduc,
4704 REDUC_MAX_EXPR,
4705 vec_cond_cast);
4706 gsi_insert_before (&exit_gsi, data_reduc_stmt, GSI_SAME_STMT);
4707
4708 /* Convert the reduced value back to the result type and set as the
4709 result. */
4710 tree data_reduc_cast = build1 (VIEW_CONVERT_EXPR, scalar_type,
4711 data_reduc);
4712 epilog_stmt = gimple_build_assign (new_scalar_dest, data_reduc_cast);
4713 new_temp = make_ssa_name (new_scalar_dest, epilog_stmt);
4714 gimple_assign_set_lhs (epilog_stmt, new_temp);
4715 gsi_insert_before (&exit_gsi, epilog_stmt, GSI_SAME_STMT);
4716 scalar_results.safe_push (new_temp);
4717 }
4718
4719 /* 2.3 Create the reduction code, using one of the three schemes described
4720 above. In SLP we simply need to extract all the elements from the
4721 vector (without reducing them), so we use scalar shifts. */
4722 else if (reduc_code != ERROR_MARK && !slp_reduc)
4723 {
4724 tree tmp;
4725 tree vec_elem_type;
4726
4727 /*** Case 1: Create:
4728 v_out2 = reduc_expr <v_out1> */
4729
4730 if (dump_enabled_p ())
4731 dump_printf_loc (MSG_NOTE, vect_location,
4732 "Reduce using direct vector reduction.\n");
4733
4734 vec_elem_type = TREE_TYPE (TREE_TYPE (new_phi_result));
4735 if (!useless_type_conversion_p (scalar_type, vec_elem_type))
4736 {
4737 tree tmp_dest =
4738 vect_create_destination_var (scalar_dest, vec_elem_type);
4739 tmp = build1 (reduc_code, vec_elem_type, new_phi_result);
4740 epilog_stmt = gimple_build_assign (tmp_dest, tmp);
4741 new_temp = make_ssa_name (tmp_dest, epilog_stmt);
4742 gimple_assign_set_lhs (epilog_stmt, new_temp);
4743 gsi_insert_before (&exit_gsi, epilog_stmt, GSI_SAME_STMT);
4744
4745 tmp = build1 (NOP_EXPR, scalar_type, new_temp);
4746 }
4747 else
4748 tmp = build1 (reduc_code, scalar_type, new_phi_result);
4749
4750 epilog_stmt = gimple_build_assign (new_scalar_dest, tmp);
4751 new_temp = make_ssa_name (new_scalar_dest, epilog_stmt);
4752 gimple_assign_set_lhs (epilog_stmt, new_temp);
4753 gsi_insert_before (&exit_gsi, epilog_stmt, GSI_SAME_STMT);
4754
4755 if (STMT_VINFO_VEC_REDUCTION_TYPE (stmt_info)
4756 == INTEGER_INDUC_COND_REDUCTION)
4757 {
4758 /* Earlier we set the initial value to be zero. Check the result
4759 and if it is zero then replace with the original initial
4760 value. */
4761 tree zero = build_zero_cst (scalar_type);
4762 tree zcompare = build2 (EQ_EXPR, boolean_type_node, new_temp, zero);
4763
4764 tmp = make_ssa_name (new_scalar_dest);
4765 epilog_stmt = gimple_build_assign (tmp, COND_EXPR, zcompare,
4766 initial_def, new_temp);
4767 gsi_insert_before (&exit_gsi, epilog_stmt, GSI_SAME_STMT);
4768 new_temp = tmp;
4769 }
4770
4771 scalar_results.safe_push (new_temp);
4772 }
4773 else
4774 {
4775 bool reduce_with_shift = have_whole_vector_shift (mode);
4776 int element_bitsize = tree_to_uhwi (bitsize);
4777 int vec_size_in_bits = tree_to_uhwi (TYPE_SIZE (vectype));
4778 tree vec_temp;
4779
4780 /* Regardless of whether we have a whole vector shift, if we're
4781 emulating the operation via tree-vect-generic, we don't want
4782 to use it. Only the first round of the reduction is likely
4783 to still be profitable via emulation. */
4784 /* ??? It might be better to emit a reduction tree code here, so that
4785 tree-vect-generic can expand the first round via bit tricks. */
4786 if (!VECTOR_MODE_P (mode))
4787 reduce_with_shift = false;
4788 else
4789 {
4790 optab optab = optab_for_tree_code (code, vectype, optab_default);
4791 if (optab_handler (optab, mode) == CODE_FOR_nothing)
4792 reduce_with_shift = false;
4793 }
4794
4795 if (reduce_with_shift && !slp_reduc)
4796 {
4797 int nelements = vec_size_in_bits / element_bitsize;
4798 unsigned char *sel = XALLOCAVEC (unsigned char, nelements);
4799
4800 int elt_offset;
4801
4802 tree zero_vec = build_zero_cst (vectype);
4803 /*** Case 2: Create:
4804 for (offset = nelements/2; offset >= 1; offset/=2)
4805 {
4806 Create: va' = vec_shift <va, offset>
4807 Create: va = vop <va, va'>
4808 } */
4809
4810 tree rhs;
4811
4812 if (dump_enabled_p ())
4813 dump_printf_loc (MSG_NOTE, vect_location,
4814 "Reduce using vector shifts\n");
4815
4816 vec_dest = vect_create_destination_var (scalar_dest, vectype);
4817 new_temp = new_phi_result;
4818 for (elt_offset = nelements / 2;
4819 elt_offset >= 1;
4820 elt_offset /= 2)
4821 {
4822 calc_vec_perm_mask_for_shift (mode, elt_offset, sel);
4823 tree mask = vect_gen_perm_mask_any (vectype, sel);
4824 epilog_stmt = gimple_build_assign (vec_dest, VEC_PERM_EXPR,
4825 new_temp, zero_vec, mask);
4826 new_name = make_ssa_name (vec_dest, epilog_stmt);
4827 gimple_assign_set_lhs (epilog_stmt, new_name);
4828 gsi_insert_before (&exit_gsi, epilog_stmt, GSI_SAME_STMT);
4829
4830 epilog_stmt = gimple_build_assign (vec_dest, code, new_name,
4831 new_temp);
4832 new_temp = make_ssa_name (vec_dest, epilog_stmt);
4833 gimple_assign_set_lhs (epilog_stmt, new_temp);
4834 gsi_insert_before (&exit_gsi, epilog_stmt, GSI_SAME_STMT);
4835 }
4836
4837 /* 2.4 Extract the final scalar result. Create:
4838 s_out3 = extract_field <v_out2, bitpos> */
4839
4840 if (dump_enabled_p ())
4841 dump_printf_loc (MSG_NOTE, vect_location,
4842 "extract scalar result\n");
4843
4844 rhs = build3 (BIT_FIELD_REF, scalar_type, new_temp,
4845 bitsize, bitsize_zero_node);
4846 epilog_stmt = gimple_build_assign (new_scalar_dest, rhs);
4847 new_temp = make_ssa_name (new_scalar_dest, epilog_stmt);
4848 gimple_assign_set_lhs (epilog_stmt, new_temp);
4849 gsi_insert_before (&exit_gsi, epilog_stmt, GSI_SAME_STMT);
4850 scalar_results.safe_push (new_temp);
4851 }
4852 else
4853 {
4854 /*** Case 3: Create:
4855 s = extract_field <v_out2, 0>
4856 for (offset = element_size;
4857 offset < vector_size;
4858 offset += element_size;)
4859 {
4860 Create: s' = extract_field <v_out2, offset>
4861 Create: s = op <s, s'> // For non SLP cases
4862 } */
4863
4864 if (dump_enabled_p ())
4865 dump_printf_loc (MSG_NOTE, vect_location,
4866 "Reduce using scalar code.\n");
4867
4868 vec_size_in_bits = tree_to_uhwi (TYPE_SIZE (vectype));
4869 FOR_EACH_VEC_ELT (new_phis, i, new_phi)
4870 {
4871 int bit_offset;
4872 if (gimple_code (new_phi) == GIMPLE_PHI)
4873 vec_temp = PHI_RESULT (new_phi);
4874 else
4875 vec_temp = gimple_assign_lhs (new_phi);
4876 tree rhs = build3 (BIT_FIELD_REF, scalar_type, vec_temp, bitsize,
4877 bitsize_zero_node);
4878 epilog_stmt = gimple_build_assign (new_scalar_dest, rhs);
4879 new_temp = make_ssa_name (new_scalar_dest, epilog_stmt);
4880 gimple_assign_set_lhs (epilog_stmt, new_temp);
4881 gsi_insert_before (&exit_gsi, epilog_stmt, GSI_SAME_STMT);
4882
4883 /* In SLP we don't need to apply reduction operation, so we just
4884 collect s' values in SCALAR_RESULTS. */
4885 if (slp_reduc)
4886 scalar_results.safe_push (new_temp);
4887
4888 for (bit_offset = element_bitsize;
4889 bit_offset < vec_size_in_bits;
4890 bit_offset += element_bitsize)
4891 {
4892 tree bitpos = bitsize_int (bit_offset);
4893 tree rhs = build3 (BIT_FIELD_REF, scalar_type, vec_temp,
4894 bitsize, bitpos);
4895
4896 epilog_stmt = gimple_build_assign (new_scalar_dest, rhs);
4897 new_name = make_ssa_name (new_scalar_dest, epilog_stmt);
4898 gimple_assign_set_lhs (epilog_stmt, new_name);
4899 gsi_insert_before (&exit_gsi, epilog_stmt, GSI_SAME_STMT);
4900
4901 if (slp_reduc)
4902 {
4903 /* In SLP we don't need to apply reduction operation, so
4904 we just collect s' values in SCALAR_RESULTS. */
4905 new_temp = new_name;
4906 scalar_results.safe_push (new_name);
4907 }
4908 else
4909 {
4910 epilog_stmt = gimple_build_assign (new_scalar_dest, code,
4911 new_name, new_temp);
4912 new_temp = make_ssa_name (new_scalar_dest, epilog_stmt);
4913 gimple_assign_set_lhs (epilog_stmt, new_temp);
4914 gsi_insert_before (&exit_gsi, epilog_stmt, GSI_SAME_STMT);
4915 }
4916 }
4917 }
4918
4919 /* The only case where we need to reduce scalar results in SLP, is
4920 unrolling. If the size of SCALAR_RESULTS is greater than
4921 GROUP_SIZE, we reduce them combining elements modulo
4922 GROUP_SIZE. */
4923 if (slp_reduc)
4924 {
4925 tree res, first_res, new_res;
4926 gimple *new_stmt;
4927
4928 /* Reduce multiple scalar results in case of SLP unrolling. */
4929 for (j = group_size; scalar_results.iterate (j, &res);
4930 j++)
4931 {
4932 first_res = scalar_results[j % group_size];
4933 new_stmt = gimple_build_assign (new_scalar_dest, code,
4934 first_res, res);
4935 new_res = make_ssa_name (new_scalar_dest, new_stmt);
4936 gimple_assign_set_lhs (new_stmt, new_res);
4937 gsi_insert_before (&exit_gsi, new_stmt, GSI_SAME_STMT);
4938 scalar_results[j % group_size] = new_res;
4939 }
4940 }
4941 else
4942 /* Not SLP - we have one scalar to keep in SCALAR_RESULTS. */
4943 scalar_results.safe_push (new_temp);
4944 }
4945 }
4946
4947 vect_finalize_reduction:
4948
4949 if (double_reduc)
4950 loop = loop->inner;
4951
4952 /* 2.5 Adjust the final result by the initial value of the reduction
4953 variable. (When such adjustment is not needed, then
4954 'adjustment_def' is zero). For example, if code is PLUS we create:
4955 new_temp = loop_exit_def + adjustment_def */
4956
4957 if (adjustment_def)
4958 {
4959 gcc_assert (!slp_reduc);
4960 if (nested_in_vect_loop)
4961 {
4962 new_phi = new_phis[0];
4963 gcc_assert (TREE_CODE (TREE_TYPE (adjustment_def)) == VECTOR_TYPE);
4964 expr = build2 (code, vectype, PHI_RESULT (new_phi), adjustment_def);
4965 new_dest = vect_create_destination_var (scalar_dest, vectype);
4966 }
4967 else
4968 {
4969 new_temp = scalar_results[0];
4970 gcc_assert (TREE_CODE (TREE_TYPE (adjustment_def)) != VECTOR_TYPE);
4971 expr = build2 (code, scalar_type, new_temp, adjustment_def);
4972 new_dest = vect_create_destination_var (scalar_dest, scalar_type);
4973 }
4974
4975 epilog_stmt = gimple_build_assign (new_dest, expr);
4976 new_temp = make_ssa_name (new_dest, epilog_stmt);
4977 gimple_assign_set_lhs (epilog_stmt, new_temp);
4978 gsi_insert_before (&exit_gsi, epilog_stmt, GSI_SAME_STMT);
4979 if (nested_in_vect_loop)
4980 {
4981 set_vinfo_for_stmt (epilog_stmt,
4982 new_stmt_vec_info (epilog_stmt, loop_vinfo));
4983 STMT_VINFO_RELATED_STMT (vinfo_for_stmt (epilog_stmt)) =
4984 STMT_VINFO_RELATED_STMT (vinfo_for_stmt (new_phi));
4985
4986 if (!double_reduc)
4987 scalar_results.quick_push (new_temp);
4988 else
4989 scalar_results[0] = new_temp;
4990 }
4991 else
4992 scalar_results[0] = new_temp;
4993
4994 new_phis[0] = epilog_stmt;
4995 }
4996
4997 /* 2.6 Handle the loop-exit phis. Replace the uses of scalar loop-exit
4998 phis with new adjusted scalar results, i.e., replace use <s_out0>
4999 with use <s_out4>.
5000
5001 Transform:
5002 loop_exit:
5003 s_out0 = phi <s_loop> # (scalar) EXIT_PHI
5004 v_out1 = phi <VECT_DEF> # NEW_EXIT_PHI
5005 v_out2 = reduce <v_out1>
5006 s_out3 = extract_field <v_out2, 0>
5007 s_out4 = adjust_result <s_out3>
5008 use <s_out0>
5009 use <s_out0>
5010
5011 into:
5012
5013 loop_exit:
5014 s_out0 = phi <s_loop> # (scalar) EXIT_PHI
5015 v_out1 = phi <VECT_DEF> # NEW_EXIT_PHI
5016 v_out2 = reduce <v_out1>
5017 s_out3 = extract_field <v_out2, 0>
5018 s_out4 = adjust_result <s_out3>
5019 use <s_out4>
5020 use <s_out4> */
5021
5022
5023 /* In SLP reduction chain we reduce vector results into one vector if
5024 necessary, hence we set here GROUP_SIZE to 1. SCALAR_DEST is the LHS of
5025 the last stmt in the reduction chain, since we are looking for the loop
5026 exit phi node. */
5027 if (GROUP_FIRST_ELEMENT (vinfo_for_stmt (stmt)))
5028 {
5029 gimple *dest_stmt = SLP_TREE_SCALAR_STMTS (slp_node)[group_size - 1];
5030 /* Handle reduction patterns. */
5031 if (STMT_VINFO_RELATED_STMT (vinfo_for_stmt (dest_stmt)))
5032 dest_stmt = STMT_VINFO_RELATED_STMT (vinfo_for_stmt (dest_stmt));
5033
5034 scalar_dest = gimple_assign_lhs (dest_stmt);
5035 group_size = 1;
5036 }
5037
5038 /* In SLP we may have several statements in NEW_PHIS and REDUCTION_PHIS (in
5039 case that GROUP_SIZE is greater than vectorization factor). Therefore, we
5040 need to match SCALAR_RESULTS with corresponding statements. The first
5041 (GROUP_SIZE / number of new vector stmts) scalar results correspond to
5042 the first vector stmt, etc.
5043 (RATIO is equal to (GROUP_SIZE / number of new vector stmts)). */
5044 if (group_size > new_phis.length ())
5045 {
5046 ratio = group_size / new_phis.length ();
5047 gcc_assert (!(group_size % new_phis.length ()));
5048 }
5049 else
5050 ratio = 1;
5051
5052 for (k = 0; k < group_size; k++)
5053 {
5054 if (k % ratio == 0)
5055 {
5056 epilog_stmt = new_phis[k / ratio];
5057 reduction_phi = reduction_phis[k / ratio];
5058 if (double_reduc)
5059 inner_phi = inner_phis[k / ratio];
5060 }
5061
5062 if (slp_reduc)
5063 {
5064 gimple *current_stmt = SLP_TREE_SCALAR_STMTS (slp_node)[k];
5065
5066 orig_stmt = STMT_VINFO_RELATED_STMT (vinfo_for_stmt (current_stmt));
5067 /* SLP statements can't participate in patterns. */
5068 gcc_assert (!orig_stmt);
5069 scalar_dest = gimple_assign_lhs (current_stmt);
5070 }
5071
5072 phis.create (3);
5073 /* Find the loop-closed-use at the loop exit of the original scalar
5074 result. (The reduction result is expected to have two immediate uses -
5075 one at the latch block, and one at the loop exit). */
5076 FOR_EACH_IMM_USE_FAST (use_p, imm_iter, scalar_dest)
5077 if (!flow_bb_inside_loop_p (loop, gimple_bb (USE_STMT (use_p)))
5078 && !is_gimple_debug (USE_STMT (use_p)))
5079 phis.safe_push (USE_STMT (use_p));
5080
5081 /* While we expect to have found an exit_phi because of loop-closed-ssa
5082 form we can end up without one if the scalar cycle is dead. */
5083
5084 FOR_EACH_VEC_ELT (phis, i, exit_phi)
5085 {
5086 if (outer_loop)
5087 {
5088 stmt_vec_info exit_phi_vinfo = vinfo_for_stmt (exit_phi);
5089 gphi *vect_phi;
5090
5091 /* FORNOW. Currently not supporting the case that an inner-loop
5092 reduction is not used in the outer-loop (but only outside the
5093 outer-loop), unless it is double reduction. */
5094 gcc_assert ((STMT_VINFO_RELEVANT_P (exit_phi_vinfo)
5095 && !STMT_VINFO_LIVE_P (exit_phi_vinfo))
5096 || double_reduc);
5097
5098 if (double_reduc)
5099 STMT_VINFO_VEC_STMT (exit_phi_vinfo) = inner_phi;
5100 else
5101 STMT_VINFO_VEC_STMT (exit_phi_vinfo) = epilog_stmt;
5102 if (!double_reduc
5103 || STMT_VINFO_DEF_TYPE (exit_phi_vinfo)
5104 != vect_double_reduction_def)
5105 continue;
5106
5107 /* Handle double reduction:
5108
5109 stmt1: s1 = phi <s0, s2> - double reduction phi (outer loop)
5110 stmt2: s3 = phi <s1, s4> - (regular) reduc phi (inner loop)
5111 stmt3: s4 = use (s3) - (regular) reduc stmt (inner loop)
5112 stmt4: s2 = phi <s4> - double reduction stmt (outer loop)
5113
5114 At that point the regular reduction (stmt2 and stmt3) is
5115 already vectorized, as well as the exit phi node, stmt4.
5116 Here we vectorize the phi node of double reduction, stmt1, and
5117 update all relevant statements. */
5118
5119 /* Go through all the uses of s2 to find double reduction phi
5120 node, i.e., stmt1 above. */
5121 orig_name = PHI_RESULT (exit_phi);
5122 FOR_EACH_IMM_USE_STMT (use_stmt, imm_iter, orig_name)
5123 {
5124 stmt_vec_info use_stmt_vinfo;
5125 stmt_vec_info new_phi_vinfo;
5126 tree vect_phi_init, preheader_arg, vect_phi_res, init_def;
5127 basic_block bb = gimple_bb (use_stmt);
5128 gimple *use;
5129
5130 /* Check that USE_STMT is really double reduction phi
5131 node. */
5132 if (gimple_code (use_stmt) != GIMPLE_PHI
5133 || gimple_phi_num_args (use_stmt) != 2
5134 || bb->loop_father != outer_loop)
5135 continue;
5136 use_stmt_vinfo = vinfo_for_stmt (use_stmt);
5137 if (!use_stmt_vinfo
5138 || STMT_VINFO_DEF_TYPE (use_stmt_vinfo)
5139 != vect_double_reduction_def)
5140 continue;
5141
5142 /* Create vector phi node for double reduction:
5143 vs1 = phi <vs0, vs2>
5144 vs1 was created previously in this function by a call to
5145 vect_get_vec_def_for_operand and is stored in
5146 vec_initial_def;
5147 vs2 is defined by INNER_PHI, the vectorized EXIT_PHI;
5148 vs0 is created here. */
5149
5150 /* Create vector phi node. */
5151 vect_phi = create_phi_node (vec_initial_def, bb);
5152 new_phi_vinfo = new_stmt_vec_info (vect_phi,
5153 loop_vec_info_for_loop (outer_loop));
5154 set_vinfo_for_stmt (vect_phi, new_phi_vinfo);
5155
5156 /* Create vs0 - initial def of the double reduction phi. */
5157 preheader_arg = PHI_ARG_DEF_FROM_EDGE (use_stmt,
5158 loop_preheader_edge (outer_loop));
5159 init_def = get_initial_def_for_reduction (stmt,
5160 preheader_arg, NULL);
5161 vect_phi_init = vect_init_vector (use_stmt, init_def,
5162 vectype, NULL);
5163
5164 /* Update phi node arguments with vs0 and vs2. */
5165 add_phi_arg (vect_phi, vect_phi_init,
5166 loop_preheader_edge (outer_loop),
5167 UNKNOWN_LOCATION);
5168 add_phi_arg (vect_phi, PHI_RESULT (inner_phi),
5169 loop_latch_edge (outer_loop), UNKNOWN_LOCATION);
5170 if (dump_enabled_p ())
5171 {
5172 dump_printf_loc (MSG_NOTE, vect_location,
5173 "created double reduction phi node: ");
5174 dump_gimple_stmt (MSG_NOTE, TDF_SLIM, vect_phi, 0);
5175 dump_printf (MSG_NOTE, "\n");
5176 }
5177
5178 vect_phi_res = PHI_RESULT (vect_phi);
5179
5180 /* Replace the use, i.e., set the correct vs1 in the regular
5181 reduction phi node. FORNOW, NCOPIES is always 1, so the
5182 loop is redundant. */
5183 use = reduction_phi;
5184 for (j = 0; j < ncopies; j++)
5185 {
5186 edge pr_edge = loop_preheader_edge (loop);
5187 SET_PHI_ARG_DEF (use, pr_edge->dest_idx, vect_phi_res);
5188 use = STMT_VINFO_RELATED_STMT (vinfo_for_stmt (use));
5189 }
5190 }
5191 }
5192 }
5193
5194 phis.release ();
5195 if (nested_in_vect_loop)
5196 {
5197 if (double_reduc)
5198 loop = outer_loop;
5199 else
5200 continue;
5201 }
5202
5203 phis.create (3);
5204 /* Find the loop-closed-use at the loop exit of the original scalar
5205 result. (The reduction result is expected to have two immediate uses,
5206 one at the latch block, and one at the loop exit). For double
5207 reductions we are looking for exit phis of the outer loop. */
5208 FOR_EACH_IMM_USE_FAST (use_p, imm_iter, scalar_dest)
5209 {
5210 if (!flow_bb_inside_loop_p (loop, gimple_bb (USE_STMT (use_p))))
5211 {
5212 if (!is_gimple_debug (USE_STMT (use_p)))
5213 phis.safe_push (USE_STMT (use_p));
5214 }
5215 else
5216 {
5217 if (double_reduc && gimple_code (USE_STMT (use_p)) == GIMPLE_PHI)
5218 {
5219 tree phi_res = PHI_RESULT (USE_STMT (use_p));
5220
5221 FOR_EACH_IMM_USE_FAST (phi_use_p, phi_imm_iter, phi_res)
5222 {
5223 if (!flow_bb_inside_loop_p (loop,
5224 gimple_bb (USE_STMT (phi_use_p)))
5225 && !is_gimple_debug (USE_STMT (phi_use_p)))
5226 phis.safe_push (USE_STMT (phi_use_p));
5227 }
5228 }
5229 }
5230 }
5231
5232 FOR_EACH_VEC_ELT (phis, i, exit_phi)
5233 {
5234 /* Replace the uses: */
5235 orig_name = PHI_RESULT (exit_phi);
5236 scalar_result = scalar_results[k];
5237 FOR_EACH_IMM_USE_STMT (use_stmt, imm_iter, orig_name)
5238 FOR_EACH_IMM_USE_ON_STMT (use_p, imm_iter)
5239 SET_USE (use_p, scalar_result);
5240 }
5241
5242 phis.release ();
5243 }
5244 }
5245
5246
5247 /* Function is_nonwrapping_integer_induction.
5248
5249 Check if STMT (which is part of loop LOOP) both increments and
5250 does not cause overflow. */
5251
5252 static bool
is_nonwrapping_integer_induction(gimple * stmt,struct loop * loop)5253 is_nonwrapping_integer_induction (gimple *stmt, struct loop *loop)
5254 {
5255 stmt_vec_info stmt_vinfo = vinfo_for_stmt (stmt);
5256 tree base = STMT_VINFO_LOOP_PHI_EVOLUTION_BASE_UNCHANGED (stmt_vinfo);
5257 tree step = STMT_VINFO_LOOP_PHI_EVOLUTION_PART (stmt_vinfo);
5258 tree lhs_type = TREE_TYPE (gimple_phi_result (stmt));
5259 widest_int ni, max_loop_value, lhs_max;
5260 bool overflow = false;
5261
5262 /* Make sure the loop is integer based. */
5263 if (TREE_CODE (base) != INTEGER_CST
5264 || TREE_CODE (step) != INTEGER_CST)
5265 return false;
5266
5267 /* Check that the induction increments. */
5268 if (tree_int_cst_sgn (step) == -1)
5269 return false;
5270
5271 /* Check that the max size of the loop will not wrap. */
5272
5273 if (TYPE_OVERFLOW_UNDEFINED (lhs_type))
5274 return true;
5275
5276 if (! max_stmt_executions (loop, &ni))
5277 return false;
5278
5279 max_loop_value = wi::mul (wi::to_widest (step), ni, TYPE_SIGN (lhs_type),
5280 &overflow);
5281 if (overflow)
5282 return false;
5283
5284 max_loop_value = wi::add (wi::to_widest (base), max_loop_value,
5285 TYPE_SIGN (lhs_type), &overflow);
5286 if (overflow)
5287 return false;
5288
5289 return (wi::min_precision (max_loop_value, TYPE_SIGN (lhs_type))
5290 <= TYPE_PRECISION (lhs_type));
5291 }
5292
5293 /* Function vectorizable_reduction.
5294
5295 Check if STMT performs a reduction operation that can be vectorized.
5296 If VEC_STMT is also passed, vectorize the STMT: create a vectorized
5297 stmt to replace it, put it in VEC_STMT, and insert it at GSI.
5298 Return FALSE if not a vectorizable STMT, TRUE otherwise.
5299
5300 This function also handles reduction idioms (patterns) that have been
5301 recognized in advance during vect_pattern_recog. In this case, STMT may be
5302 of this form:
5303 X = pattern_expr (arg0, arg1, ..., X)
5304    and its STMT_VINFO_RELATED_STMT points to the last stmt in the original
5305 sequence that had been detected and replaced by the pattern-stmt (STMT).
5306
5307 This function also handles reduction of condition expressions, for example:
5308 for (int i = 0; i < N; i++)
5309 if (a[i] < value)
5310 last = a[i];
5311 This is handled by vectorising the loop and creating an additional vector
5312 containing the loop indexes for which "a[i] < value" was true. In the
5313 function epilogue this is reduced to a single max value and then used to
5314 index into the vector of results.
5315
5316 In some cases of reduction patterns, the type of the reduction variable X is
5317 different than the type of the other arguments of STMT.
5318 In such cases, the vectype that is used when transforming STMT into a vector
5319 stmt is different than the vectype that is used to determine the
5320 vectorization factor, because it consists of a different number of elements
5321 than the actual number of elements that are being operated upon in parallel.
5322
5323 For example, consider an accumulation of shorts into an int accumulator.
5324 On some targets it's possible to vectorize this pattern operating on 8
5325 shorts at a time (hence, the vectype for purposes of determining the
5326 vectorization factor should be V8HI); on the other hand, the vectype that
5327 is used to create the vector form is actually V4SI (the type of the result).
5328
5329 Upon entry to this function, STMT_VINFO_VECTYPE records the vectype that
5330 indicates what is the actual level of parallelism (V8HI in the example), so
5331 that the right vectorization factor would be derived. This vectype
5332 corresponds to the type of arguments to the reduction stmt, and should *NOT*
5333 be used to create the vectorized stmt. The right vectype for the vectorized
5334 stmt is obtained from the type of the result X:
5335 get_vectype_for_scalar_type (TREE_TYPE (X))
5336
5337 This means that, contrary to "regular" reductions (or "regular" stmts in
5338 general), the following equation:
5339 STMT_VINFO_VECTYPE == get_vectype_for_scalar_type (TREE_TYPE (X))
5340 does *NOT* necessarily hold for reduction patterns. */
5341
5342 bool
vectorizable_reduction(gimple * stmt,gimple_stmt_iterator * gsi,gimple ** vec_stmt,slp_tree slp_node)5343 vectorizable_reduction (gimple *stmt, gimple_stmt_iterator *gsi,
5344 gimple **vec_stmt, slp_tree slp_node)
5345 {
5346 tree vec_dest;
5347 tree scalar_dest;
5348 tree loop_vec_def0 = NULL_TREE, loop_vec_def1 = NULL_TREE;
5349 stmt_vec_info stmt_info = vinfo_for_stmt (stmt);
5350 tree vectype_out = STMT_VINFO_VECTYPE (stmt_info);
5351 tree vectype_in = NULL_TREE;
5352 loop_vec_info loop_vinfo = STMT_VINFO_LOOP_VINFO (stmt_info);
5353 struct loop *loop = LOOP_VINFO_LOOP (loop_vinfo);
5354 enum tree_code code, orig_code, epilog_reduc_code;
5355 machine_mode vec_mode;
5356 int op_type;
5357 optab optab, reduc_optab;
5358 tree new_temp = NULL_TREE;
5359 gimple *def_stmt;
5360 enum vect_def_type dt;
5361 gphi *new_phi = NULL;
5362 tree scalar_type;
5363 bool is_simple_use;
5364 gimple *orig_stmt;
5365 stmt_vec_info orig_stmt_info;
5366 tree expr = NULL_TREE;
5367 int i;
5368 int ncopies;
5369 int epilog_copies;
5370 stmt_vec_info prev_stmt_info, prev_phi_info;
5371 bool single_defuse_cycle = false;
5372 tree reduc_def = NULL_TREE;
5373 gimple *new_stmt = NULL;
5374 int j;
5375 tree ops[3];
5376 bool nested_cycle = false, found_nested_cycle_def = false;
5377 gimple *reduc_def_stmt = NULL;
5378 bool double_reduc = false, dummy;
5379 basic_block def_bb;
5380 struct loop * def_stmt_loop, *outer_loop = NULL;
5381 tree def_arg;
5382 gimple *def_arg_stmt;
5383 auto_vec<tree> vec_oprnds0;
5384 auto_vec<tree> vec_oprnds1;
5385 auto_vec<tree> vect_defs;
5386 auto_vec<gimple *> phis;
5387 int vec_num;
5388 tree def0, def1, tem, op0, op1 = NULL_TREE;
5389 bool first_p = true;
5390 tree cr_index_scalar_type = NULL_TREE, cr_index_vector_type = NULL_TREE;
5391 gimple *cond_expr_induction_def_stmt = NULL;
5392
5393 /* In case of reduction chain we switch to the first stmt in the chain, but
5394 we don't update STMT_INFO, since only the last stmt is marked as reduction
5395 and has reduction properties. */
5396 if (GROUP_FIRST_ELEMENT (stmt_info)
5397 && GROUP_FIRST_ELEMENT (stmt_info) != stmt)
5398 {
5399 stmt = GROUP_FIRST_ELEMENT (stmt_info);
5400 first_p = false;
5401 }
5402
5403 if (nested_in_vect_loop_p (loop, stmt))
5404 {
5405 outer_loop = loop;
5406 loop = loop->inner;
5407 nested_cycle = true;
5408 }
5409
5410 /* 1. Is vectorizable reduction? */
5411 /* Not supportable if the reduction variable is used in the loop, unless
5412 it's a reduction chain. */
5413 if (STMT_VINFO_RELEVANT (stmt_info) > vect_used_in_outer
5414 && !GROUP_FIRST_ELEMENT (stmt_info))
5415 return false;
5416
5417 /* Reductions that are not used even in an enclosing outer-loop,
5418 are expected to be "live" (used out of the loop). */
5419 if (STMT_VINFO_RELEVANT (stmt_info) == vect_unused_in_scope
5420 && !STMT_VINFO_LIVE_P (stmt_info))
5421 return false;
5422
5423 /* Make sure it was already recognized as a reduction computation. */
5424 if (STMT_VINFO_DEF_TYPE (vinfo_for_stmt (stmt)) != vect_reduction_def
5425 && STMT_VINFO_DEF_TYPE (vinfo_for_stmt (stmt)) != vect_nested_cycle)
5426 return false;
5427
5428 /* 2. Has this been recognized as a reduction pattern?
5429
5430 Check if STMT represents a pattern that has been recognized
5431 in earlier analysis stages. For stmts that represent a pattern,
5432 the STMT_VINFO_RELATED_STMT field records the last stmt in
5433 the original sequence that constitutes the pattern. */
5434
5435 orig_stmt = STMT_VINFO_RELATED_STMT (vinfo_for_stmt (stmt));
5436 if (orig_stmt)
5437 {
5438 orig_stmt_info = vinfo_for_stmt (orig_stmt);
5439 gcc_assert (STMT_VINFO_IN_PATTERN_P (orig_stmt_info));
5440 gcc_assert (!STMT_VINFO_IN_PATTERN_P (stmt_info));
5441 }
5442
5443 /* 3. Check the operands of the operation. The first operands are defined
5444 inside the loop body. The last operand is the reduction variable,
5445 which is defined by the loop-header-phi. */
5446
5447 gcc_assert (is_gimple_assign (stmt));
5448
5449 /* Flatten RHS. */
5450 switch (get_gimple_rhs_class (gimple_assign_rhs_code (stmt)))
5451 {
5452 case GIMPLE_SINGLE_RHS:
5453 op_type = TREE_OPERAND_LENGTH (gimple_assign_rhs1 (stmt));
5454 if (op_type == ternary_op)
5455 {
5456 tree rhs = gimple_assign_rhs1 (stmt);
5457 ops[0] = TREE_OPERAND (rhs, 0);
5458 ops[1] = TREE_OPERAND (rhs, 1);
5459 ops[2] = TREE_OPERAND (rhs, 2);
5460 code = TREE_CODE (rhs);
5461 }
5462 else
5463 return false;
5464 break;
5465
5466 case GIMPLE_BINARY_RHS:
5467 code = gimple_assign_rhs_code (stmt);
5468 op_type = TREE_CODE_LENGTH (code);
5469 gcc_assert (op_type == binary_op);
5470 ops[0] = gimple_assign_rhs1 (stmt);
5471 ops[1] = gimple_assign_rhs2 (stmt);
5472 break;
5473
5474 case GIMPLE_TERNARY_RHS:
5475 code = gimple_assign_rhs_code (stmt);
5476 op_type = TREE_CODE_LENGTH (code);
5477 gcc_assert (op_type == ternary_op);
5478 ops[0] = gimple_assign_rhs1 (stmt);
5479 ops[1] = gimple_assign_rhs2 (stmt);
5480 ops[2] = gimple_assign_rhs3 (stmt);
5481 break;
5482
5483 case GIMPLE_UNARY_RHS:
5484 return false;
5485
5486 default:
5487 gcc_unreachable ();
5488 }
5489 /* The default is that the reduction variable is the last in statement. */
5490 int reduc_index = op_type - 1;
5491 if (code == MINUS_EXPR)
5492 reduc_index = 0;
5493
5494 if (code == COND_EXPR && slp_node)
5495 return false;
5496
5497 scalar_dest = gimple_assign_lhs (stmt);
5498 scalar_type = TREE_TYPE (scalar_dest);
5499 if (!POINTER_TYPE_P (scalar_type) && !INTEGRAL_TYPE_P (scalar_type)
5500 && !SCALAR_FLOAT_TYPE_P (scalar_type))
5501 return false;
5502
5503 /* Do not try to vectorize bit-precision reductions. */
5504 if ((TYPE_PRECISION (scalar_type)
5505 != GET_MODE_PRECISION (TYPE_MODE (scalar_type))))
5506 return false;
5507
5508 /* All uses but the last are expected to be defined in the loop.
5509 The last use is the reduction variable. In case of nested cycle this
5510 assumption is not true: we use reduc_index to record the index of the
5511 reduction variable. */
5512 for (i = 0; i < op_type; i++)
5513 {
5514 if (i == reduc_index)
5515 continue;
5516
5517 /* The condition of COND_EXPR is checked in vectorizable_condition(). */
5518 if (i == 0 && code == COND_EXPR)
5519 continue;
5520
5521 is_simple_use = vect_is_simple_use (ops[i], loop_vinfo,
5522 &def_stmt, &dt, &tem);
5523 if (!vectype_in)
5524 vectype_in = tem;
5525 gcc_assert (is_simple_use);
5526
5527 if (dt != vect_internal_def
5528 && dt != vect_external_def
5529 && dt != vect_constant_def
5530 && dt != vect_induction_def
5531 && !(dt == vect_nested_cycle && nested_cycle))
5532 return false;
5533
5534 if (dt == vect_nested_cycle)
5535 {
5536 found_nested_cycle_def = true;
5537 reduc_def_stmt = def_stmt;
5538 reduc_index = i;
5539 }
5540
5541 if (i == 1 && code == COND_EXPR && dt == vect_induction_def)
5542 cond_expr_induction_def_stmt = def_stmt;
5543 }
5544
5545 is_simple_use = vect_is_simple_use (ops[reduc_index], loop_vinfo,
5546 &def_stmt, &dt, &tem);
5547 if (!vectype_in)
5548 vectype_in = tem;
5549 gcc_assert (is_simple_use);
5550 if (!found_nested_cycle_def)
5551 reduc_def_stmt = def_stmt;
5552
5553 if (reduc_def_stmt && gimple_code (reduc_def_stmt) != GIMPLE_PHI)
5554 return false;
5555
5556 if (!(dt == vect_reduction_def
5557 || dt == vect_nested_cycle
5558 || ((dt == vect_internal_def || dt == vect_external_def
5559 || dt == vect_constant_def || dt == vect_induction_def)
5560 && nested_cycle && found_nested_cycle_def)))
5561 {
5562 /* For pattern recognized stmts, orig_stmt might be a reduction,
5563 but some helper statements for the pattern might not, or
5564 might be COND_EXPRs with reduction uses in the condition. */
5565 gcc_assert (orig_stmt);
5566 return false;
5567 }
5568
5569 enum vect_reduction_type v_reduc_type;
5570 gimple *tmp = vect_is_simple_reduction (loop_vinfo, reduc_def_stmt,
5571 !nested_cycle, &dummy, false,
5572 &v_reduc_type);
5573
5574 /* If we have a condition reduction, see if we can simplify it further. */
5575 if (v_reduc_type == COND_REDUCTION
5576 && cond_expr_induction_def_stmt != NULL
5577 && is_nonwrapping_integer_induction (cond_expr_induction_def_stmt, loop))
5578 {
5579 if (dump_enabled_p ())
5580 dump_printf_loc (MSG_NOTE, vect_location,
5581 "condition expression based on integer induction.\n");
5582 STMT_VINFO_VEC_REDUCTION_TYPE (stmt_info) = INTEGER_INDUC_COND_REDUCTION;
5583 }
5584 else
5585 STMT_VINFO_VEC_REDUCTION_TYPE (stmt_info) = v_reduc_type;
5586
5587 if (orig_stmt)
5588 gcc_assert (tmp == orig_stmt
5589 || GROUP_FIRST_ELEMENT (vinfo_for_stmt (tmp)) == orig_stmt);
5590 else
5591 /* We changed STMT to be the first stmt in reduction chain, hence we
5592 check that in this case the first element in the chain is STMT. */
5593 gcc_assert (stmt == tmp
5594 || GROUP_FIRST_ELEMENT (vinfo_for_stmt (tmp)) == stmt);
5595
5596 if (STMT_VINFO_LIVE_P (vinfo_for_stmt (reduc_def_stmt)))
5597 return false;
5598
5599 if (slp_node || PURE_SLP_STMT (stmt_info))
5600 ncopies = 1;
5601 else
5602 ncopies = (LOOP_VINFO_VECT_FACTOR (loop_vinfo)
5603 / TYPE_VECTOR_SUBPARTS (vectype_in));
5604
5605 gcc_assert (ncopies >= 1);
5606
5607 vec_mode = TYPE_MODE (vectype_in);
5608
5609 if (code == COND_EXPR)
5610 {
5611 /* Only call during the analysis stage, otherwise we'll lose
5612 STMT_VINFO_TYPE. */
5613 if (!vec_stmt && !vectorizable_condition (stmt, gsi, NULL,
5614 ops[reduc_index], 0, NULL))
5615 {
5616 if (dump_enabled_p ())
5617 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
5618 "unsupported condition in reduction\n");
5619 return false;
5620 }
5621 }
5622 else
5623 {
5624 /* 4. Supportable by target? */
5625
5626 if (code == LSHIFT_EXPR || code == RSHIFT_EXPR
5627 || code == LROTATE_EXPR || code == RROTATE_EXPR)
5628 {
5629 /* Shifts and rotates are only supported by vectorizable_shifts,
5630 not vectorizable_reduction. */
5631 if (dump_enabled_p ())
5632 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
5633 "unsupported shift or rotation.\n");
5634 return false;
5635 }
5636
5637 /* 4.1. check support for the operation in the loop */
5638 optab = optab_for_tree_code (code, vectype_in, optab_default);
5639 if (!optab)
5640 {
5641 if (dump_enabled_p ())
5642 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
5643 "no optab.\n");
5644
5645 return false;
5646 }
5647
5648 if (optab_handler (optab, vec_mode) == CODE_FOR_nothing)
5649 {
5650 if (dump_enabled_p ())
5651 dump_printf (MSG_NOTE, "op not supported by target.\n");
5652
5653 if (GET_MODE_SIZE (vec_mode) != UNITS_PER_WORD
5654 || LOOP_VINFO_VECT_FACTOR (loop_vinfo)
5655 < vect_min_worthwhile_factor (code))
5656 return false;
5657
5658 if (dump_enabled_p ())
5659 dump_printf (MSG_NOTE, "proceeding using word mode.\n");
5660 }
5661
5662 /* Worthwhile without SIMD support? */
5663 if (!VECTOR_MODE_P (TYPE_MODE (vectype_in))
5664 && LOOP_VINFO_VECT_FACTOR (loop_vinfo)
5665 < vect_min_worthwhile_factor (code))
5666 {
5667 if (dump_enabled_p ())
5668 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
5669 "not worthwhile without SIMD support.\n");
5670
5671 return false;
5672 }
5673 }
5674
5675 /* 4.2. Check support for the epilog operation.
5676
5677 If STMT represents a reduction pattern, then the type of the
5678 reduction variable may be different than the type of the rest
5679 of the arguments. For example, consider the case of accumulation
5680 of shorts into an int accumulator; The original code:
5681 S1: int_a = (int) short_a;
5682 orig_stmt-> S2: int_acc = plus <int_a ,int_acc>;
5683
5684 was replaced with:
5685 STMT: int_acc = widen_sum <short_a, int_acc>
5686
5687 This means that:
5688 1. The tree-code that is used to create the vector operation in the
5689 epilog code (that reduces the partial results) is not the
5690 tree-code of STMT, but is rather the tree-code of the original
5691 stmt from the pattern that STMT is replacing. I.e, in the example
5692 above we want to use 'widen_sum' in the loop, but 'plus' in the
5693 epilog.
5694 2. The type (mode) we use to check available target support
5695 for the vector operation to be created in the *epilog*, is
5696 determined by the type of the reduction variable (in the example
5697 above we'd check this: optab_handler (plus_optab, vect_int_mode])).
5698 However the type (mode) we use to check available target support
5699 for the vector operation to be created *inside the loop*, is
5700 determined by the type of the other arguments to STMT (in the
5701 example we'd check this: optab_handler (widen_sum_optab,
5702 vect_short_mode)).
5703
5704 This is contrary to "regular" reductions, in which the types of all
5705 the arguments are the same as the type of the reduction variable.
5706 For "regular" reductions we can therefore use the same vector type
5707 (and also the same tree-code) when generating the epilog code and
5708 when generating the code inside the loop. */
5709
5710 if (orig_stmt)
5711 {
5712 /* This is a reduction pattern: get the vectype from the type of the
5713 reduction variable, and get the tree-code from orig_stmt. */
5714 gcc_assert (STMT_VINFO_VEC_REDUCTION_TYPE (stmt_info)
5715 == TREE_CODE_REDUCTION);
5716 orig_code = gimple_assign_rhs_code (orig_stmt);
5717 gcc_assert (vectype_out);
5718 vec_mode = TYPE_MODE (vectype_out);
5719 }
5720 else
5721 {
5722 /* Regular reduction: use the same vectype and tree-code as used for
5723 the vector code inside the loop can be used for the epilog code. */
5724 orig_code = code;
5725
5726 if (code == MINUS_EXPR)
5727 orig_code = PLUS_EXPR;
5728
5729 /* For simple condition reductions, replace with the actual expression
5730 we want to base our reduction around. */
5731 if (STMT_VINFO_VEC_REDUCTION_TYPE (stmt_info)
5732 == INTEGER_INDUC_COND_REDUCTION)
5733 orig_code = MAX_EXPR;
5734 }
5735
5736 if (nested_cycle)
5737 {
5738 def_bb = gimple_bb (reduc_def_stmt);
5739 def_stmt_loop = def_bb->loop_father;
5740 def_arg = PHI_ARG_DEF_FROM_EDGE (reduc_def_stmt,
5741 loop_preheader_edge (def_stmt_loop));
5742 if (TREE_CODE (def_arg) == SSA_NAME
5743 && (def_arg_stmt = SSA_NAME_DEF_STMT (def_arg))
5744 && gimple_code (def_arg_stmt) == GIMPLE_PHI
5745 && flow_bb_inside_loop_p (outer_loop, gimple_bb (def_arg_stmt))
5746 && vinfo_for_stmt (def_arg_stmt)
5747 && STMT_VINFO_DEF_TYPE (vinfo_for_stmt (def_arg_stmt))
5748 == vect_double_reduction_def)
5749 double_reduc = true;
5750 }
5751
5752 epilog_reduc_code = ERROR_MARK;
5753
5754 if (STMT_VINFO_VEC_REDUCTION_TYPE (stmt_info) == TREE_CODE_REDUCTION
5755 || STMT_VINFO_VEC_REDUCTION_TYPE (stmt_info)
5756 == INTEGER_INDUC_COND_REDUCTION)
5757 {
5758 if (reduction_code_for_scalar_code (orig_code, &epilog_reduc_code))
5759 {
5760 reduc_optab = optab_for_tree_code (epilog_reduc_code, vectype_out,
5761 optab_default);
5762 if (!reduc_optab)
5763 {
5764 if (dump_enabled_p ())
5765 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
5766 "no optab for reduction.\n");
5767
5768 epilog_reduc_code = ERROR_MARK;
5769 }
5770 else if (optab_handler (reduc_optab, vec_mode) == CODE_FOR_nothing)
5771 {
5772 if (dump_enabled_p ())
5773 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
5774 "reduc op not supported by target.\n");
5775
5776 epilog_reduc_code = ERROR_MARK;
5777 }
5778
5779 /* When epilog_reduc_code is ERROR_MARK then a reduction will be
5780 generated in the epilog using multiple expressions. This does not
5781 work for condition reductions. */
5782 if (epilog_reduc_code == ERROR_MARK
5783 && STMT_VINFO_VEC_REDUCTION_TYPE (stmt_info)
5784 == INTEGER_INDUC_COND_REDUCTION)
5785 {
5786 if (dump_enabled_p ())
5787 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
5788 "no reduc code for scalar code.\n");
5789 return false;
5790 }
5791 }
5792 else
5793 {
5794 if (!nested_cycle || double_reduc)
5795 {
5796 if (dump_enabled_p ())
5797 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
5798 "no reduc code for scalar code.\n");
5799
5800 return false;
5801 }
5802 }
5803 }
5804 else
5805 {
5806 int scalar_precision = GET_MODE_PRECISION (TYPE_MODE (scalar_type));
5807 cr_index_scalar_type = make_unsigned_type (scalar_precision);
5808 cr_index_vector_type = build_vector_type
5809 (cr_index_scalar_type, TYPE_VECTOR_SUBPARTS (vectype_out));
5810
5811 epilog_reduc_code = REDUC_MAX_EXPR;
5812 optab = optab_for_tree_code (REDUC_MAX_EXPR, cr_index_vector_type,
5813 optab_default);
5814 if (optab_handler (optab, TYPE_MODE (cr_index_vector_type))
5815 == CODE_FOR_nothing)
5816 {
5817 if (dump_enabled_p ())
5818 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
5819 "reduc max op not supported by target.\n");
5820 return false;
5821 }
5822 }
5823
5824 if ((double_reduc
5825 || STMT_VINFO_VEC_REDUCTION_TYPE (stmt_info) == COND_REDUCTION
5826 || STMT_VINFO_VEC_REDUCTION_TYPE (stmt_info)
5827 == INTEGER_INDUC_COND_REDUCTION)
5828 && ncopies > 1)
5829 {
5830 if (dump_enabled_p ())
5831 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
5832 "multiple types in double reduction or condition "
5833 "reduction.\n");
5834 return false;
5835 }
5836
5837 /* In case of widenning multiplication by a constant, we update the type
5838 of the constant to be the type of the other operand. We check that the
5839 constant fits the type in the pattern recognition pass. */
5840 if (code == DOT_PROD_EXPR
5841 && !types_compatible_p (TREE_TYPE (ops[0]), TREE_TYPE (ops[1])))
5842 {
5843 if (TREE_CODE (ops[0]) == INTEGER_CST)
5844 ops[0] = fold_convert (TREE_TYPE (ops[1]), ops[0]);
5845 else if (TREE_CODE (ops[1]) == INTEGER_CST)
5846 ops[1] = fold_convert (TREE_TYPE (ops[0]), ops[1]);
5847 else
5848 {
5849 if (dump_enabled_p ())
5850 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
5851 "invalid types in dot-prod\n");
5852
5853 return false;
5854 }
5855 }
5856
5857 if (STMT_VINFO_VEC_REDUCTION_TYPE (stmt_info) == COND_REDUCTION)
5858 {
5859 widest_int ni;
5860
5861 if (! max_loop_iterations (loop, &ni))
5862 {
5863 if (dump_enabled_p ())
5864 dump_printf_loc (MSG_NOTE, vect_location,
5865 "loop count not known, cannot create cond "
5866 "reduction.\n");
5867 return false;
5868 }
5869 /* Convert backedges to iterations. */
5870 ni += 1;
5871
5872 /* The additional index will be the same type as the condition. Check
5873 that the loop can fit into this less one (because we'll use up the
5874 zero slot for when there are no matches). */
5875 tree max_index = TYPE_MAX_VALUE (cr_index_scalar_type);
5876 if (wi::geu_p (ni, wi::to_widest (max_index)))
5877 {
5878 if (dump_enabled_p ())
5879 dump_printf_loc (MSG_NOTE, vect_location,
5880 "loop size is greater than data size.\n");
5881 return false;
5882 }
5883 }
5884
5885 if (!vec_stmt) /* transformation not required. */
5886 {
5887 if (first_p
5888 && !vect_model_reduction_cost (stmt_info, epilog_reduc_code, ncopies,
5889 reduc_index))
5890 return false;
5891 STMT_VINFO_TYPE (stmt_info) = reduc_vec_info_type;
5892 return true;
5893 }
5894
5895 /** Transform. **/
5896
5897 if (dump_enabled_p ())
5898 dump_printf_loc (MSG_NOTE, vect_location, "transform reduction.\n");
5899
5900 /* FORNOW: Multiple types are not supported for condition. */
5901 if (code == COND_EXPR)
5902 gcc_assert (ncopies == 1);
5903
5904 /* Create the destination vector */
5905 vec_dest = vect_create_destination_var (scalar_dest, vectype_out);
5906
5907 /* In case the vectorization factor (VF) is bigger than the number
5908 of elements that we can fit in a vectype (nunits), we have to generate
5909 more than one vector stmt - i.e - we need to "unroll" the
5910 vector stmt by a factor VF/nunits. For more details see documentation
5911 in vectorizable_operation. */
5912
5913 /* If the reduction is used in an outer loop we need to generate
5914 VF intermediate results, like so (e.g. for ncopies=2):
5915 r0 = phi (init, r0)
5916 r1 = phi (init, r1)
5917 r0 = x0 + r0;
5918 r1 = x1 + r1;
5919 (i.e. we generate VF results in 2 registers).
5920 In this case we have a separate def-use cycle for each copy, and therefore
5921 for each copy we get the vector def for the reduction variable from the
5922 respective phi node created for this copy.
5923
5924 Otherwise (the reduction is unused in the loop nest), we can combine
5925 together intermediate results, like so (e.g. for ncopies=2):
5926 r = phi (init, r)
5927 r = x0 + r;
5928 r = x1 + r;
5929 (i.e. we generate VF/2 results in a single register).
5930 In this case for each copy we get the vector def for the reduction variable
5931 from the vectorized reduction operation generated in the previous iteration.
5932 */
5933
5934 if (STMT_VINFO_RELEVANT (stmt_info) == vect_unused_in_scope)
5935 {
5936 single_defuse_cycle = true;
5937 epilog_copies = 1;
5938 }
5939 else
5940 epilog_copies = ncopies;
5941
5942 prev_stmt_info = NULL;
5943 prev_phi_info = NULL;
5944 if (slp_node)
5945 vec_num = SLP_TREE_NUMBER_OF_VEC_STMTS (slp_node);
5946 else
5947 {
5948 vec_num = 1;
5949 vec_oprnds0.create (1);
5950 if (op_type == ternary_op)
5951 vec_oprnds1.create (1);
5952 }
5953
5954 phis.create (vec_num);
5955 vect_defs.create (vec_num);
5956 if (!slp_node)
5957 vect_defs.quick_push (NULL_TREE);
5958
5959 for (j = 0; j < ncopies; j++)
5960 {
5961 if (j == 0 || !single_defuse_cycle)
5962 {
5963 for (i = 0; i < vec_num; i++)
5964 {
5965 /* Create the reduction-phi that defines the reduction
5966 operand. */
5967 new_phi = create_phi_node (vec_dest, loop->header);
5968 set_vinfo_for_stmt (new_phi,
5969 new_stmt_vec_info (new_phi, loop_vinfo));
5970 if (j == 0 || slp_node)
5971 phis.quick_push (new_phi);
5972 }
5973 }
5974
5975 if (code == COND_EXPR)
5976 {
5977 gcc_assert (!slp_node);
5978 vectorizable_condition (stmt, gsi, vec_stmt,
5979 PHI_RESULT (phis[0]),
5980 reduc_index, NULL);
5981 /* Multiple types are not supported for condition. */
5982 break;
5983 }
5984
5985 /* Handle uses. */
5986 if (j == 0)
5987 {
5988 op0 = ops[!reduc_index];
5989 if (op_type == ternary_op)
5990 {
5991 if (reduc_index == 0)
5992 op1 = ops[2];
5993 else
5994 op1 = ops[1];
5995 }
5996
5997 if (slp_node)
5998 vect_get_vec_defs (op0, op1, stmt, &vec_oprnds0, &vec_oprnds1,
5999 slp_node, -1);
6000 else
6001 {
6002 loop_vec_def0 = vect_get_vec_def_for_operand (ops[!reduc_index],
6003 stmt);
6004 vec_oprnds0.quick_push (loop_vec_def0);
6005 if (op_type == ternary_op)
6006 {
6007 loop_vec_def1 = vect_get_vec_def_for_operand (op1, stmt);
6008 vec_oprnds1.quick_push (loop_vec_def1);
6009 }
6010 }
6011 }
6012 else
6013 {
6014 if (!slp_node)
6015 {
6016 enum vect_def_type dt;
6017 gimple *dummy_stmt;
6018
6019 vect_is_simple_use (ops[!reduc_index], loop_vinfo,
6020 &dummy_stmt, &dt);
6021 loop_vec_def0 = vect_get_vec_def_for_stmt_copy (dt,
6022 loop_vec_def0);
6023 vec_oprnds0[0] = loop_vec_def0;
6024 if (op_type == ternary_op)
6025 {
6026 vect_is_simple_use (op1, loop_vinfo, &dummy_stmt, &dt);
6027 loop_vec_def1 = vect_get_vec_def_for_stmt_copy (dt,
6028 loop_vec_def1);
6029 vec_oprnds1[0] = loop_vec_def1;
6030 }
6031 }
6032
6033 if (single_defuse_cycle)
6034 reduc_def = gimple_assign_lhs (new_stmt);
6035
6036 STMT_VINFO_RELATED_STMT (prev_phi_info) = new_phi;
6037 }
6038
6039 FOR_EACH_VEC_ELT (vec_oprnds0, i, def0)
6040 {
6041 if (slp_node)
6042 reduc_def = PHI_RESULT (phis[i]);
6043 else
6044 {
6045 if (!single_defuse_cycle || j == 0)
6046 reduc_def = PHI_RESULT (new_phi);
6047 }
6048
6049 def1 = ((op_type == ternary_op)
6050 ? vec_oprnds1[i] : NULL);
6051 if (op_type == binary_op)
6052 {
6053 if (reduc_index == 0)
6054 expr = build2 (code, vectype_out, reduc_def, def0);
6055 else
6056 expr = build2 (code, vectype_out, def0, reduc_def);
6057 }
6058 else
6059 {
6060 if (reduc_index == 0)
6061 expr = build3 (code, vectype_out, reduc_def, def0, def1);
6062 else
6063 {
6064 if (reduc_index == 1)
6065 expr = build3 (code, vectype_out, def0, reduc_def, def1);
6066 else
6067 expr = build3 (code, vectype_out, def0, def1, reduc_def);
6068 }
6069 }
6070
6071 new_stmt = gimple_build_assign (vec_dest, expr);
6072 new_temp = make_ssa_name (vec_dest, new_stmt);
6073 gimple_assign_set_lhs (new_stmt, new_temp);
6074 vect_finish_stmt_generation (stmt, new_stmt, gsi);
6075
6076 if (slp_node)
6077 {
6078 SLP_TREE_VEC_STMTS (slp_node).quick_push (new_stmt);
6079 vect_defs.quick_push (new_temp);
6080 }
6081 else
6082 vect_defs[0] = new_temp;
6083 }
6084
6085 if (slp_node)
6086 continue;
6087
6088 if (j == 0)
6089 STMT_VINFO_VEC_STMT (stmt_info) = *vec_stmt = new_stmt;
6090 else
6091 STMT_VINFO_RELATED_STMT (prev_stmt_info) = new_stmt;
6092
6093 prev_stmt_info = vinfo_for_stmt (new_stmt);
6094 prev_phi_info = vinfo_for_stmt (new_phi);
6095 }
6096
6097 tree indx_before_incr, indx_after_incr, cond_name = NULL;
6098
6099 /* Finalize the reduction-phi (set its arguments) and create the
6100 epilog reduction code. */
6101 if ((!single_defuse_cycle || code == COND_EXPR) && !slp_node)
6102 {
6103 new_temp = gimple_assign_lhs (*vec_stmt);
6104 vect_defs[0] = new_temp;
6105
6106 /* For cond reductions we want to create a new vector (INDEX_COND_EXPR)
6107 which is updated with the current index of the loop for every match of
6108 the original loop's cond_expr (VEC_STMT). This results in a vector
6109 containing the last time the condition passed for that vector lane.
6110 The first match will be a 1 to allow 0 to be used for non-matching
6111 indexes. If there are no matches at all then the vector will be all
6112 zeroes. */
6113 if (STMT_VINFO_VEC_REDUCTION_TYPE (stmt_info) == COND_REDUCTION)
6114 {
6115 int nunits_out = TYPE_VECTOR_SUBPARTS (vectype_out);
6116 int k;
6117
6118 gcc_assert (gimple_assign_rhs_code (*vec_stmt) == VEC_COND_EXPR);
6119
6120 /* First we create a simple vector induction variable which starts
6121 with the values {1,2,3,...} (SERIES_VECT) and increments by the
6122 vector size (STEP). */
6123
6124 /* Create a {1,2,3,...} vector. */
6125 tree *vtemp = XALLOCAVEC (tree, nunits_out);
6126 for (k = 0; k < nunits_out; ++k)
6127 vtemp[k] = build_int_cst (cr_index_scalar_type, k + 1);
6128 tree series_vect = build_vector (cr_index_vector_type, vtemp);
6129
6130 /* Create a vector of the step value. */
6131 tree step = build_int_cst (cr_index_scalar_type, nunits_out);
6132 tree vec_step = build_vector_from_val (cr_index_vector_type, step);
6133
6134 /* Create an induction variable. */
6135 gimple_stmt_iterator incr_gsi;
6136 bool insert_after;
6137 standard_iv_increment_position (loop, &incr_gsi, &insert_after);
6138 create_iv (series_vect, vec_step, NULL_TREE, loop, &incr_gsi,
6139 insert_after, &indx_before_incr, &indx_after_incr);
6140
6141 /* Next create a new phi node vector (NEW_PHI_TREE) which starts
6142 filled with zeros (VEC_ZERO). */
6143
6144 /* Create a vector of 0s. */
6145 tree zero = build_zero_cst (cr_index_scalar_type);
6146 tree vec_zero = build_vector_from_val (cr_index_vector_type, zero);
6147
6148 /* Create a vector phi node. */
6149 tree new_phi_tree = make_ssa_name (cr_index_vector_type);
6150 new_phi = create_phi_node (new_phi_tree, loop->header);
6151 set_vinfo_for_stmt (new_phi,
6152 new_stmt_vec_info (new_phi, loop_vinfo));
6153 add_phi_arg (new_phi, vec_zero, loop_preheader_edge (loop),
6154 UNKNOWN_LOCATION);
6155
6156 /* Now take the condition from the loops original cond_expr
6157 (VEC_STMT) and produce a new cond_expr (INDEX_COND_EXPR) which for
6158 every match uses values from the induction variable
6159 (INDEX_BEFORE_INCR) otherwise uses values from the phi node
6160 (NEW_PHI_TREE).
6161 Finally, we update the phi (NEW_PHI_TREE) to take the value of
6162 the new cond_expr (INDEX_COND_EXPR). */
6163
6164 /* Turn the condition from vec_stmt into an ssa name. */
6165 gimple_stmt_iterator vec_stmt_gsi = gsi_for_stmt (*vec_stmt);
6166 tree ccompare = gimple_assign_rhs1 (*vec_stmt);
6167 tree ccompare_name = make_ssa_name (TREE_TYPE (ccompare));
6168 gimple *ccompare_stmt = gimple_build_assign (ccompare_name,
6169 ccompare);
6170 gsi_insert_before (&vec_stmt_gsi, ccompare_stmt, GSI_SAME_STMT);
6171 gimple_assign_set_rhs1 (*vec_stmt, ccompare_name);
6172 update_stmt (*vec_stmt);
6173
6174 /* Create a conditional, where the condition is taken from vec_stmt
6175 (CCOMPARE_NAME), then is the induction index (INDEX_BEFORE_INCR)
6176 and else is the phi (NEW_PHI_TREE). */
6177 tree index_cond_expr = build3 (VEC_COND_EXPR, cr_index_vector_type,
6178 ccompare_name, indx_before_incr,
6179 new_phi_tree);
6180 cond_name = make_ssa_name (cr_index_vector_type);
6181 gimple *index_condition = gimple_build_assign (cond_name,
6182 index_cond_expr);
6183 gsi_insert_before (&incr_gsi, index_condition, GSI_SAME_STMT);
6184 stmt_vec_info index_vec_info = new_stmt_vec_info (index_condition,
6185 loop_vinfo);
6186 STMT_VINFO_VECTYPE (index_vec_info) = cr_index_vector_type;
6187 set_vinfo_for_stmt (index_condition, index_vec_info);
6188
6189 /* Update the phi with the vec cond. */
6190 add_phi_arg (new_phi, cond_name, loop_latch_edge (loop),
6191 UNKNOWN_LOCATION);
6192 }
6193 }
6194
6195 vect_create_epilog_for_reduction (vect_defs, stmt, epilog_copies,
6196 epilog_reduc_code, phis, reduc_index,
6197 double_reduc, slp_node, cond_name);
6198
6199 return true;
6200 }
6201
6202 /* Function vect_min_worthwhile_factor.
6203
6204 For a loop where we could vectorize the operation indicated by CODE,
6205 return the minimum vectorization factor that makes it worthwhile
6206 to use generic vectors. */
6207 int
vect_min_worthwhile_factor(enum tree_code code)6208 vect_min_worthwhile_factor (enum tree_code code)
6209 {
6210 switch (code)
6211 {
6212 case PLUS_EXPR:
6213 case MINUS_EXPR:
6214 case NEGATE_EXPR:
6215 return 4;
6216
6217 case BIT_AND_EXPR:
6218 case BIT_IOR_EXPR:
6219 case BIT_XOR_EXPR:
6220 case BIT_NOT_EXPR:
6221 return 2;
6222
6223 default:
6224 return INT_MAX;
6225 }
6226 }
6227
6228
/* Function vectorizable_induction

   Check if PHI performs an induction computation that can be vectorized.
   If VEC_STMT is also passed, vectorize the induction PHI: create a vectorized
   phi to replace it, put it in VEC_STMT, and add it to the same basic block.
   Return FALSE if not a vectorizable STMT, TRUE otherwise.  */

bool
vectorizable_induction (gimple *phi,
			gimple_stmt_iterator *gsi ATTRIBUTE_UNUSED,
			gimple **vec_stmt)
{
  stmt_vec_info stmt_info = vinfo_for_stmt (phi);
  tree vectype = STMT_VINFO_VECTYPE (stmt_info);
  loop_vec_info loop_vinfo = STMT_VINFO_LOOP_VINFO (stmt_info);
  struct loop *loop = LOOP_VINFO_LOOP (loop_vinfo);
  int nunits = TYPE_VECTOR_SUBPARTS (vectype);
  /* Number of vector stmts needed to cover the vectorization factor.  */
  int ncopies = LOOP_VINFO_VECT_FACTOR (loop_vinfo) / nunits;
  tree vec_def;

  gcc_assert (ncopies >= 1);
  /* FORNOW. These restrictions should be relaxed.  */
  if (nested_in_vect_loop_p (loop, phi))
    {
      imm_use_iterator imm_iter;
      use_operand_p use_p;
      gimple *exit_phi;
      edge latch_e;
      tree loop_arg;

      /* Multiple vector copies of a nested induction are not handled.  */
      if (ncopies > 1)
	{
	  if (dump_enabled_p ())
	    dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
			     "multiple types in nested loop.\n");
	  return false;
	}

      /* Look for a use of the inner-loop latch value outside the inner
	 loop; such a use is a phi on the inner-loop exit.  */
      exit_phi = NULL;
      latch_e = loop_latch_edge (loop->inner);
      loop_arg = PHI_ARG_DEF_FROM_EDGE (phi, latch_e);
      FOR_EACH_IMM_USE_FAST (use_p, imm_iter, loop_arg)
	{
	  gimple *use_stmt = USE_STMT (use_p);
	  if (is_gimple_debug (use_stmt))
	    continue;

	  if (!flow_bb_inside_loop_p (loop->inner, gimple_bb (use_stmt)))
	    {
	      exit_phi = use_stmt;
	      break;
	    }
	}
      if (exit_phi)
	{
	  stmt_vec_info exit_phi_vinfo = vinfo_for_stmt (exit_phi);
	  /* The exit phi must be relevant inside the outer loop and not
	     merely live; an induction only used outside the vectorized
	     outer loop cannot be handled here.  */
	  if (!(STMT_VINFO_RELEVANT_P (exit_phi_vinfo)
	        && !STMT_VINFO_LIVE_P (exit_phi_vinfo)))
	    {
	      if (dump_enabled_p ())
		dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
				 "inner-loop induction only used outside "
				 "of the outer vectorized loop.\n");
	      return false;
	    }
	}
    }

  if (!STMT_VINFO_RELEVANT_P (stmt_info))
    return false;

  /* FORNOW: SLP not supported.  */
  if (STMT_SLP_TYPE (stmt_info))
    return false;

  gcc_assert (STMT_VINFO_DEF_TYPE (stmt_info) == vect_induction_def);

  if (gimple_code (phi) != GIMPLE_PHI)
    return false;

  if (!vec_stmt) /* transformation not required.  */
    {
      /* Analysis phase: record the stmt kind and its cost only.  */
      STMT_VINFO_TYPE (stmt_info) = induc_vec_info_type;
      if (dump_enabled_p ())
        dump_printf_loc (MSG_NOTE, vect_location,
                         "=== vectorizable_induction ===\n");
      vect_model_induction_cost (stmt_info, ncopies);
      return true;
    }

  /** Transform.  **/

  if (dump_enabled_p ())
    dump_printf_loc (MSG_NOTE, vect_location, "transform induction phi.\n");

  /* The vectorized phi is created by get_initial_def_for_induction;
     report its defining stmt back through VEC_STMT.  */
  vec_def = get_initial_def_for_induction (phi);
  *vec_stmt = SSA_NAME_DEF_STMT (vec_def);
  return true;
}
6328
/* Function vectorizable_live_operation.

   STMT computes a value that is used outside the loop.  Check if
   it can be supported.  */

bool
vectorizable_live_operation (gimple *stmt,
			     gimple_stmt_iterator *gsi ATTRIBUTE_UNUSED,
			     gimple **vec_stmt)
{
  stmt_vec_info stmt_info = vinfo_for_stmt (stmt);
  loop_vec_info loop_vinfo = STMT_VINFO_LOOP_VINFO (stmt_info);
  struct loop *loop = LOOP_VINFO_LOOP (loop_vinfo);
  tree op;
  gimple *def_stmt;
  ssa_op_iter iter;

  gcc_assert (STMT_VINFO_LIVE_P (stmt_info));

  /* Reductions are handled by the reduction epilogue code instead.  */
  if (STMT_VINFO_DEF_TYPE (stmt_info) == vect_reduction_def)
    return false;

  if (!is_gimple_assign (stmt))
    {
      /* Special-case the GOMP_SIMD_LANE internal call: its value used on
	 the loop exit is known to be VF - 1, so we can just rewrite the
	 exit phi argument.  The call must belong to this loop's simduid.  */
      if (gimple_call_internal_p (stmt)
	  && gimple_call_internal_fn (stmt) == IFN_GOMP_SIMD_LANE
	  && gimple_call_lhs (stmt)
	  && loop->simduid
	  && TREE_CODE (gimple_call_arg (stmt, 0)) == SSA_NAME
	  && loop->simduid
	     == SSA_NAME_VAR (gimple_call_arg (stmt, 0)))
	{
	  edge e = single_exit (loop);
	  basic_block merge_bb = e->dest;
	  imm_use_iterator imm_iter;
	  use_operand_p use_p;
	  tree lhs = gimple_call_lhs (stmt);

	  /* Find the exit phi in the merge block that uses the lane.  */
	  FOR_EACH_IMM_USE_FAST (use_p, imm_iter, lhs)
	    {
	      gimple *use_stmt = USE_STMT (use_p);
	      if (gimple_code (use_stmt) == GIMPLE_PHI
		  && gimple_bb (use_stmt) == merge_bb)
		{
		  if (vec_stmt)
		    {
		      /* During transform replace the phi argument with
			 the constant VF - 1 (the last lane).  */
		      tree vfm1
			= build_int_cst (unsigned_type_node,
					 loop_vinfo->vectorization_factor - 1);
		      SET_PHI_ARG_DEF (use_stmt, e->dest_idx, vfm1);
		    }
		  return true;
		}
	    }
	}

      return false;
    }

  if (TREE_CODE (gimple_assign_lhs (stmt)) != SSA_NAME)
    return false;

  /* FORNOW. CHECKME.  */
  if (nested_in_vect_loop_p (loop, stmt))
    return false;

  /* FORNOW: support only if all uses are invariant.  This means
     that the scalar operations can remain in place, unvectorized.
     The original last scalar value that they compute will be used.  */
  FOR_EACH_SSA_TREE_OPERAND (op, stmt, iter, SSA_OP_USE)
    {
      enum vect_def_type dt = vect_uninitialized_def;

      if (!vect_is_simple_use (op, loop_vinfo, &def_stmt, &dt))
	{
	  if (dump_enabled_p ())
	    dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
			     "use not simple.\n");
	  return false;
	}

      /* Only loop-invariant (external or constant) operands are
	 supported; anything defined inside the loop disqualifies.  */
      if (dt != vect_external_def && dt != vect_constant_def)
        return false;
    }

  /* No transformation is required for the cases we currently support.  */
  return true;
}
6417
/* Kill any debug uses outside LOOP of SSA names defined in STMT.
   After vectorization the scalar defs may disappear, so debug binds
   referring to them from outside the loop must be reset to avoid
   dangling references.  */

static void
vect_loop_kill_debug_uses (struct loop *loop, gimple *stmt)
{
  ssa_op_iter op_iter;
  imm_use_iterator imm_iter;
  def_operand_p def_p;
  gimple *ustmt;

  /* Walk every SSA name defined by STMT (phi or ordinary stmt).  */
  FOR_EACH_PHI_OR_STMT_DEF (def_p, stmt, op_iter, SSA_OP_DEF)
    {
      FOR_EACH_IMM_USE_STMT (ustmt, imm_iter, DEF_FROM_PTR (def_p))
	{
	  basic_block bb;

	  /* Only debug stmts are of interest here.  */
	  if (!is_gimple_debug (ustmt))
	    continue;

	  bb = gimple_bb (ustmt);

	  if (!flow_bb_inside_loop_p (loop, bb))
	    {
	      if (gimple_debug_bind_p (ustmt))
		{
		  if (dump_enabled_p ())
		    dump_printf_loc (MSG_NOTE, vect_location,
                                     "killing debug use\n");

		  /* Reset the bind value; the variable becomes
		     "optimized away" at this point in the debug info.  */
		  gimple_debug_bind_reset_value (ustmt);
		  update_stmt (ustmt);
		}
	      else
		/* No other kind of debug stmt is expected to use a
		   loop-defined SSA name from outside the loop.  */
		gcc_unreachable ();
	    }
	}
    }
}
6456
6457
6458 /* This function builds ni_name = number of iterations. Statements
6459 are emitted on the loop preheader edge. */
6460
6461 static tree
vect_build_loop_niters(loop_vec_info loop_vinfo)6462 vect_build_loop_niters (loop_vec_info loop_vinfo)
6463 {
6464 tree ni = unshare_expr (LOOP_VINFO_NITERS (loop_vinfo));
6465 if (TREE_CODE (ni) == INTEGER_CST)
6466 return ni;
6467 else
6468 {
6469 tree ni_name, var;
6470 gimple_seq stmts = NULL;
6471 edge pe = loop_preheader_edge (LOOP_VINFO_LOOP (loop_vinfo));
6472
6473 var = create_tmp_var (TREE_TYPE (ni), "niters");
6474 ni_name = force_gimple_operand (ni, &stmts, false, var);
6475 if (stmts)
6476 gsi_insert_seq_on_edge_immediate (pe, stmts);
6477
6478 return ni_name;
6479 }
6480 }
6481
6482
6483 /* This function generates the following statements:
6484
6485 ni_name = number of iterations loop executes
6486 ratio = ni_name / vf
6487 ratio_mult_vf_name = ratio * vf
6488
6489 and places them on the loop preheader edge. */
6490
6491 static void
vect_generate_tmps_on_preheader(loop_vec_info loop_vinfo,tree ni_name,tree * ratio_mult_vf_name_ptr,tree * ratio_name_ptr)6492 vect_generate_tmps_on_preheader (loop_vec_info loop_vinfo,
6493 tree ni_name,
6494 tree *ratio_mult_vf_name_ptr,
6495 tree *ratio_name_ptr)
6496 {
6497 tree ni_minus_gap_name;
6498 tree var;
6499 tree ratio_name;
6500 tree ratio_mult_vf_name;
6501 int vf = LOOP_VINFO_VECT_FACTOR (loop_vinfo);
6502 edge pe = loop_preheader_edge (LOOP_VINFO_LOOP (loop_vinfo));
6503 tree log_vf;
6504
6505 log_vf = build_int_cst (TREE_TYPE (ni_name), exact_log2 (vf));
6506
6507 /* If epilogue loop is required because of data accesses with gaps, we
6508 subtract one iteration from the total number of iterations here for
6509 correct calculation of RATIO. */
6510 if (LOOP_VINFO_PEELING_FOR_GAPS (loop_vinfo))
6511 {
6512 ni_minus_gap_name = fold_build2 (MINUS_EXPR, TREE_TYPE (ni_name),
6513 ni_name,
6514 build_one_cst (TREE_TYPE (ni_name)));
6515 if (!is_gimple_val (ni_minus_gap_name))
6516 {
6517 var = create_tmp_var (TREE_TYPE (ni_name), "ni_gap");
6518 gimple *stmts = NULL;
6519 ni_minus_gap_name = force_gimple_operand (ni_minus_gap_name, &stmts,
6520 true, var);
6521 gsi_insert_seq_on_edge_immediate (pe, stmts);
6522 }
6523 }
6524 else
6525 ni_minus_gap_name = ni_name;
6526
6527 /* Create: ratio = ni >> log2(vf) */
6528 /* ??? As we have ni == number of latch executions + 1, ni could
6529 have overflown to zero. So avoid computing ratio based on ni
6530 but compute it using the fact that we know ratio will be at least
6531 one, thus via (ni - vf) >> log2(vf) + 1. */
6532 ratio_name
6533 = fold_build2 (PLUS_EXPR, TREE_TYPE (ni_name),
6534 fold_build2 (RSHIFT_EXPR, TREE_TYPE (ni_name),
6535 fold_build2 (MINUS_EXPR, TREE_TYPE (ni_name),
6536 ni_minus_gap_name,
6537 build_int_cst
6538 (TREE_TYPE (ni_name), vf)),
6539 log_vf),
6540 build_int_cst (TREE_TYPE (ni_name), 1));
6541 if (!is_gimple_val (ratio_name))
6542 {
6543 var = create_tmp_var (TREE_TYPE (ni_name), "bnd");
6544 gimple *stmts = NULL;
6545 ratio_name = force_gimple_operand (ratio_name, &stmts, true, var);
6546 gsi_insert_seq_on_edge_immediate (pe, stmts);
6547 }
6548 *ratio_name_ptr = ratio_name;
6549
6550 /* Create: ratio_mult_vf = ratio << log2 (vf). */
6551
6552 if (ratio_mult_vf_name_ptr)
6553 {
6554 ratio_mult_vf_name = fold_build2 (LSHIFT_EXPR, TREE_TYPE (ratio_name),
6555 ratio_name, log_vf);
6556 if (!is_gimple_val (ratio_mult_vf_name))
6557 {
6558 var = create_tmp_var (TREE_TYPE (ni_name), "ratio_mult_vf");
6559 gimple *stmts = NULL;
6560 ratio_mult_vf_name = force_gimple_operand (ratio_mult_vf_name, &stmts,
6561 true, var);
6562 gsi_insert_seq_on_edge_immediate (pe, stmts);
6563 }
6564 *ratio_mult_vf_name_ptr = ratio_mult_vf_name;
6565 }
6566
6567 return;
6568 }
6569
6570
6571 /* Function vect_transform_loop.
6572
6573 The analysis phase has determined that the loop is vectorizable.
6574 Vectorize the loop - created vectorized stmts to replace the scalar
6575 stmts in the loop, and update the loop exit condition. */
6576
6577 void
vect_transform_loop(loop_vec_info loop_vinfo)6578 vect_transform_loop (loop_vec_info loop_vinfo)
6579 {
6580 struct loop *loop = LOOP_VINFO_LOOP (loop_vinfo);
6581 basic_block *bbs = LOOP_VINFO_BBS (loop_vinfo);
6582 int nbbs = loop->num_nodes;
6583 int i;
6584 tree ratio = NULL;
6585 int vectorization_factor = LOOP_VINFO_VECT_FACTOR (loop_vinfo);
6586 bool grouped_store;
6587 bool slp_scheduled = false;
6588 gimple *stmt, *pattern_stmt;
6589 gimple_seq pattern_def_seq = NULL;
6590 gimple_stmt_iterator pattern_def_si = gsi_none ();
6591 bool transform_pattern_stmt = false;
6592 bool check_profitability = false;
6593 int th;
6594 /* Record number of iterations before we started tampering with the profile. */
6595 gcov_type expected_iterations = expected_loop_iterations_unbounded (loop);
6596
6597 if (dump_enabled_p ())
6598 dump_printf_loc (MSG_NOTE, vect_location, "=== vec_transform_loop ===\n");
6599
6600 /* If profile is inprecise, we have chance to fix it up. */
6601 if (LOOP_VINFO_NITERS_KNOWN_P (loop_vinfo))
6602 expected_iterations = LOOP_VINFO_INT_NITERS (loop_vinfo);
6603
6604 /* Use the more conservative vectorization threshold. If the number
6605 of iterations is constant assume the cost check has been performed
6606 by our caller. If the threshold makes all loops profitable that
6607 run at least the vectorization factor number of times checking
6608 is pointless, too. */
6609 th = LOOP_VINFO_COST_MODEL_THRESHOLD (loop_vinfo);
6610 if (th >= LOOP_VINFO_VECT_FACTOR (loop_vinfo) - 1
6611 && !LOOP_VINFO_NITERS_KNOWN_P (loop_vinfo))
6612 {
6613 if (dump_enabled_p ())
6614 dump_printf_loc (MSG_NOTE, vect_location,
6615 "Profitability threshold is %d loop iterations.\n",
6616 th);
6617 check_profitability = true;
6618 }
6619
6620 /* Version the loop first, if required, so the profitability check
6621 comes first. */
6622
6623 if (LOOP_REQUIRES_VERSIONING_FOR_ALIGNMENT (loop_vinfo)
6624 || LOOP_REQUIRES_VERSIONING_FOR_ALIAS (loop_vinfo))
6625 {
6626 vect_loop_versioning (loop_vinfo, th, check_profitability);
6627 check_profitability = false;
6628 }
6629
6630 tree ni_name = vect_build_loop_niters (loop_vinfo);
6631 LOOP_VINFO_NITERS_UNCHANGED (loop_vinfo) = ni_name;
6632
6633 /* Peel the loop if there are data refs with unknown alignment.
6634 Only one data ref with unknown store is allowed. */
6635
6636 if (LOOP_VINFO_PEELING_FOR_ALIGNMENT (loop_vinfo))
6637 {
6638 vect_do_peeling_for_alignment (loop_vinfo, ni_name,
6639 th, check_profitability);
6640 check_profitability = false;
6641 /* The above adjusts LOOP_VINFO_NITERS, so cause ni_name to
6642 be re-computed. */
6643 ni_name = NULL_TREE;
6644 }
6645
6646 /* If the loop has a symbolic number of iterations 'n' (i.e. it's not a
6647 compile time constant), or it is a constant that doesn't divide by the
6648 vectorization factor, then an epilog loop needs to be created.
6649 We therefore duplicate the loop: the original loop will be vectorized,
6650 and will compute the first (n/VF) iterations. The second copy of the loop
6651 will remain scalar and will compute the remaining (n%VF) iterations.
6652 (VF is the vectorization factor). */
6653
6654 if (LOOP_VINFO_PEELING_FOR_NITER (loop_vinfo)
6655 || LOOP_VINFO_PEELING_FOR_GAPS (loop_vinfo))
6656 {
6657 tree ratio_mult_vf;
6658 if (!ni_name)
6659 ni_name = vect_build_loop_niters (loop_vinfo);
6660 vect_generate_tmps_on_preheader (loop_vinfo, ni_name, &ratio_mult_vf,
6661 &ratio);
6662 vect_do_peeling_for_loop_bound (loop_vinfo, ni_name, ratio_mult_vf,
6663 th, check_profitability);
6664 }
6665 else if (LOOP_VINFO_NITERS_KNOWN_P (loop_vinfo))
6666 ratio = build_int_cst (TREE_TYPE (LOOP_VINFO_NITERS (loop_vinfo)),
6667 LOOP_VINFO_INT_NITERS (loop_vinfo) / vectorization_factor);
6668 else
6669 {
6670 if (!ni_name)
6671 ni_name = vect_build_loop_niters (loop_vinfo);
6672 vect_generate_tmps_on_preheader (loop_vinfo, ni_name, NULL, &ratio);
6673 }
6674
6675 /* 1) Make sure the loop header has exactly two entries
6676 2) Make sure we have a preheader basic block. */
6677
6678 gcc_assert (EDGE_COUNT (loop->header->preds) == 2);
6679
6680 split_edge (loop_preheader_edge (loop));
6681
  /* FORNOW: the vectorizer supports only loops whose body consists
     of one basic block (header + empty latch).  When the vectorizer will
     support more involved loop forms, the order by which the BBs are
     traversed needs to be reconsidered.  */
6686
6687 for (i = 0; i < nbbs; i++)
6688 {
6689 basic_block bb = bbs[i];
6690 stmt_vec_info stmt_info;
6691
6692 for (gphi_iterator si = gsi_start_phis (bb); !gsi_end_p (si);
6693 gsi_next (&si))
6694 {
6695 gphi *phi = si.phi ();
6696 if (dump_enabled_p ())
6697 {
6698 dump_printf_loc (MSG_NOTE, vect_location,
6699 "------>vectorizing phi: ");
6700 dump_gimple_stmt (MSG_NOTE, TDF_SLIM, phi, 0);
6701 dump_printf (MSG_NOTE, "\n");
6702 }
6703 stmt_info = vinfo_for_stmt (phi);
6704 if (!stmt_info)
6705 continue;
6706
6707 if (MAY_HAVE_DEBUG_STMTS && !STMT_VINFO_LIVE_P (stmt_info))
6708 vect_loop_kill_debug_uses (loop, phi);
6709
6710 if (!STMT_VINFO_RELEVANT_P (stmt_info)
6711 && !STMT_VINFO_LIVE_P (stmt_info))
6712 continue;
6713
6714 if (STMT_VINFO_VECTYPE (stmt_info)
6715 && (TYPE_VECTOR_SUBPARTS (STMT_VINFO_VECTYPE (stmt_info))
6716 != (unsigned HOST_WIDE_INT) vectorization_factor)
6717 && dump_enabled_p ())
6718 dump_printf_loc (MSG_NOTE, vect_location, "multiple-types.\n");
6719
6720 if (STMT_VINFO_DEF_TYPE (stmt_info) == vect_induction_def)
6721 {
6722 if (dump_enabled_p ())
6723 dump_printf_loc (MSG_NOTE, vect_location, "transform phi.\n");
6724 vect_transform_stmt (phi, NULL, NULL, NULL, NULL);
6725 }
6726 }
6727
6728 pattern_stmt = NULL;
6729 for (gimple_stmt_iterator si = gsi_start_bb (bb);
6730 !gsi_end_p (si) || transform_pattern_stmt;)
6731 {
6732 bool is_store;
6733
6734 if (transform_pattern_stmt)
6735 stmt = pattern_stmt;
6736 else
6737 {
6738 stmt = gsi_stmt (si);
6739 /* During vectorization remove existing clobber stmts. */
6740 if (gimple_clobber_p (stmt))
6741 {
6742 unlink_stmt_vdef (stmt);
6743 gsi_remove (&si, true);
6744 release_defs (stmt);
6745 continue;
6746 }
6747 }
6748
6749 if (dump_enabled_p ())
6750 {
6751 dump_printf_loc (MSG_NOTE, vect_location,
6752 "------>vectorizing statement: ");
6753 dump_gimple_stmt (MSG_NOTE, TDF_SLIM, stmt, 0);
6754 dump_printf (MSG_NOTE, "\n");
6755 }
6756
6757 stmt_info = vinfo_for_stmt (stmt);
6758
6759 /* vector stmts created in the outer-loop during vectorization of
6760 stmts in an inner-loop may not have a stmt_info, and do not
6761 need to be vectorized. */
6762 if (!stmt_info)
6763 {
6764 gsi_next (&si);
6765 continue;
6766 }
6767
6768 if (MAY_HAVE_DEBUG_STMTS && !STMT_VINFO_LIVE_P (stmt_info))
6769 vect_loop_kill_debug_uses (loop, stmt);
6770
6771 if (!STMT_VINFO_RELEVANT_P (stmt_info)
6772 && !STMT_VINFO_LIVE_P (stmt_info))
6773 {
6774 if (STMT_VINFO_IN_PATTERN_P (stmt_info)
6775 && (pattern_stmt = STMT_VINFO_RELATED_STMT (stmt_info))
6776 && (STMT_VINFO_RELEVANT_P (vinfo_for_stmt (pattern_stmt))
6777 || STMT_VINFO_LIVE_P (vinfo_for_stmt (pattern_stmt))))
6778 {
6779 stmt = pattern_stmt;
6780 stmt_info = vinfo_for_stmt (stmt);
6781 }
6782 else
6783 {
6784 gsi_next (&si);
6785 continue;
6786 }
6787 }
6788 else if (STMT_VINFO_IN_PATTERN_P (stmt_info)
6789 && (pattern_stmt = STMT_VINFO_RELATED_STMT (stmt_info))
6790 && (STMT_VINFO_RELEVANT_P (vinfo_for_stmt (pattern_stmt))
6791 || STMT_VINFO_LIVE_P (vinfo_for_stmt (pattern_stmt))))
6792 transform_pattern_stmt = true;
6793
6794 /* If pattern statement has def stmts, vectorize them too. */
6795 if (is_pattern_stmt_p (stmt_info))
6796 {
6797 if (pattern_def_seq == NULL)
6798 {
6799 pattern_def_seq = STMT_VINFO_PATTERN_DEF_SEQ (stmt_info);
6800 pattern_def_si = gsi_start (pattern_def_seq);
6801 }
6802 else if (!gsi_end_p (pattern_def_si))
6803 gsi_next (&pattern_def_si);
6804 if (pattern_def_seq != NULL)
6805 {
6806 gimple *pattern_def_stmt = NULL;
6807 stmt_vec_info pattern_def_stmt_info = NULL;
6808
6809 while (!gsi_end_p (pattern_def_si))
6810 {
6811 pattern_def_stmt = gsi_stmt (pattern_def_si);
6812 pattern_def_stmt_info
6813 = vinfo_for_stmt (pattern_def_stmt);
6814 if (STMT_VINFO_RELEVANT_P (pattern_def_stmt_info)
6815 || STMT_VINFO_LIVE_P (pattern_def_stmt_info))
6816 break;
6817 gsi_next (&pattern_def_si);
6818 }
6819
6820 if (!gsi_end_p (pattern_def_si))
6821 {
6822 if (dump_enabled_p ())
6823 {
6824 dump_printf_loc (MSG_NOTE, vect_location,
6825 "==> vectorizing pattern def "
6826 "stmt: ");
6827 dump_gimple_stmt (MSG_NOTE, TDF_SLIM,
6828 pattern_def_stmt, 0);
6829 dump_printf (MSG_NOTE, "\n");
6830 }
6831
6832 stmt = pattern_def_stmt;
6833 stmt_info = pattern_def_stmt_info;
6834 }
6835 else
6836 {
6837 pattern_def_si = gsi_none ();
6838 transform_pattern_stmt = false;
6839 }
6840 }
6841 else
6842 transform_pattern_stmt = false;
6843 }
6844
6845 if (STMT_VINFO_VECTYPE (stmt_info))
6846 {
6847 unsigned int nunits
6848 = (unsigned int)
6849 TYPE_VECTOR_SUBPARTS (STMT_VINFO_VECTYPE (stmt_info));
6850 if (!STMT_SLP_TYPE (stmt_info)
6851 && nunits != (unsigned int) vectorization_factor
6852 && dump_enabled_p ())
6853 /* For SLP VF is set according to unrolling factor, and not
6854 to vector size, hence for SLP this print is not valid. */
6855 dump_printf_loc (MSG_NOTE, vect_location, "multiple-types.\n");
6856 }
6857
6858 /* SLP. Schedule all the SLP instances when the first SLP stmt is
6859 reached. */
6860 if (STMT_SLP_TYPE (stmt_info))
6861 {
6862 if (!slp_scheduled)
6863 {
6864 slp_scheduled = true;
6865
6866 if (dump_enabled_p ())
6867 dump_printf_loc (MSG_NOTE, vect_location,
6868 "=== scheduling SLP instances ===\n");
6869
6870 vect_schedule_slp (loop_vinfo);
6871 }
6872
6873 /* Hybrid SLP stmts must be vectorized in addition to SLP. */
6874 if (!vinfo_for_stmt (stmt) || PURE_SLP_STMT (stmt_info))
6875 {
6876 if (!transform_pattern_stmt && gsi_end_p (pattern_def_si))
6877 {
6878 pattern_def_seq = NULL;
6879 gsi_next (&si);
6880 }
6881 continue;
6882 }
6883 }
6884
6885 /* -------- vectorize statement ------------ */
6886 if (dump_enabled_p ())
6887 dump_printf_loc (MSG_NOTE, vect_location, "transform statement.\n");
6888
6889 grouped_store = false;
6890 is_store = vect_transform_stmt (stmt, &si, &grouped_store, NULL, NULL);
6891 if (is_store)
6892 {
6893 if (STMT_VINFO_GROUPED_ACCESS (stmt_info))
6894 {
6895 /* Interleaving. If IS_STORE is TRUE, the vectorization of the
6896 interleaving chain was completed - free all the stores in
6897 the chain. */
6898 gsi_next (&si);
6899 vect_remove_stores (GROUP_FIRST_ELEMENT (stmt_info));
6900 }
6901 else
6902 {
6903 /* Free the attached stmt_vec_info and remove the stmt. */
6904 gimple *store = gsi_stmt (si);
6905 free_stmt_vec_info (store);
6906 unlink_stmt_vdef (store);
6907 gsi_remove (&si, true);
6908 release_defs (store);
6909 }
6910
6911 /* Stores can only appear at the end of pattern statements. */
6912 gcc_assert (!transform_pattern_stmt);
6913 pattern_def_seq = NULL;
6914 }
6915 else if (!transform_pattern_stmt && gsi_end_p (pattern_def_si))
6916 {
6917 pattern_def_seq = NULL;
6918 gsi_next (&si);
6919 }
6920 } /* stmts in BB */
6921 } /* BBs in loop */
6922
6923 slpeel_make_loop_iterate_ntimes (loop, ratio);
6924
6925 /* Reduce loop iterations by the vectorization factor. */
6926 scale_loop_profile (loop, GCOV_COMPUTE_SCALE (1, vectorization_factor),
6927 expected_iterations / vectorization_factor);
6928 loop->nb_iterations_upper_bound
6929 = wi::udiv_floor (loop->nb_iterations_upper_bound, vectorization_factor);
6930 if (LOOP_VINFO_PEELING_FOR_GAPS (loop_vinfo)
6931 && loop->nb_iterations_upper_bound != 0)
6932 loop->nb_iterations_upper_bound = loop->nb_iterations_upper_bound - 1;
6933 if (loop->any_estimate)
6934 {
6935 loop->nb_iterations_estimate
6936 = wi::udiv_floor (loop->nb_iterations_estimate, vectorization_factor);
6937 if (LOOP_VINFO_PEELING_FOR_GAPS (loop_vinfo)
6938 && loop->nb_iterations_estimate != 0)
6939 loop->nb_iterations_estimate = loop->nb_iterations_estimate - 1;
6940 }
6941
6942 if (dump_enabled_p ())
6943 {
6944 dump_printf_loc (MSG_NOTE, vect_location,
6945 "LOOP VECTORIZED\n");
6946 if (loop->inner)
6947 dump_printf_loc (MSG_NOTE, vect_location,
6948 "OUTER LOOP VECTORIZED\n");
6949 dump_printf (MSG_NOTE, "\n");
6950 }
6951
6952 /* Free SLP instances here because otherwise stmt reference counting
6953 won't work. */
6954 slp_instance instance;
6955 FOR_EACH_VEC_ELT (LOOP_VINFO_SLP_INSTANCES (loop_vinfo), i, instance)
6956 vect_free_slp_instance (instance);
6957 LOOP_VINFO_SLP_INSTANCES (loop_vinfo).release ();
6958 }
6959
/* The code below performs a simple optimization: it reverts
   if-conversion for masked stores, i.e. if the mask of a store is zero,
   the store is not performed, and neither are the producers of the
   stored values, where possible.
6963 For example,
6964 for (i=0; i<n; i++)
6965 if (c[i])
6966 {
6967 p1[i] += 1;
6968 p2[i] = p3[i] +2;
6969 }
6970 this transformation will produce the following semi-hammock:
6971
6972 if (!mask__ifc__42.18_165 == { 0, 0, 0, 0, 0, 0, 0, 0 })
6973 {
6974 vect__11.19_170 = MASK_LOAD (vectp_p1.20_168, 0B, mask__ifc__42.18_165);
6975 vect__12.22_172 = vect__11.19_170 + vect_cst__171;
6976 MASK_STORE (vectp_p1.23_175, 0B, mask__ifc__42.18_165, vect__12.22_172);
6977 vect__18.25_182 = MASK_LOAD (vectp_p3.26_180, 0B, mask__ifc__42.18_165);
6978 vect__19.28_184 = vect__18.25_182 + vect_cst__183;
6979 MASK_STORE (vectp_p2.29_187, 0B, mask__ifc__42.18_165, vect__19.28_184);
6980 }
6981 */
6982
void
optimize_mask_stores (struct loop *loop)
{
  basic_block *bbs = get_loop_body (loop);
  unsigned nbbs = loop->num_nodes;
  unsigned i;
  basic_block bb;
  gimple_stmt_iterator gsi;
  gimple *stmt;
  auto_vec<gimple *> worklist;

  vect_location = find_loop_location (loop);
  /* Pick up all masked stores in loop if any.  Masked stores appear as
     calls to the internal function IFN_MASK_STORE.  */
  for (i = 0; i < nbbs; i++)
    {
      bb = bbs[i];
      for (gsi = gsi_start_bb (bb); !gsi_end_p (gsi);
	   gsi_next (&gsi))
	{
	  stmt = gsi_stmt (gsi);
	  if (is_gimple_call (stmt)
	      && gimple_call_internal_p (stmt)
	      && gimple_call_internal_fn (stmt) == IFN_MASK_STORE)
	    worklist.safe_push (stmt);
	}
    }

  free (bbs);
  if (worklist.is_empty ())
    return;

  /* Loop has masked stores.  Process the worklist back to front; each
     iteration sinks one masked store (and, when possible, neighbouring
     stores with the same mask plus their value producers) into a new
     block that is bypassed when the mask is all zeros.  */
  while (!worklist.is_empty ())
    {
      gimple *last, *last_store;
      edge e, efalse;
      tree mask;
      basic_block store_bb, join_bb;
      gimple_stmt_iterator gsi_to;
      tree vdef, new_vdef;
      gphi *phi;
      tree vectype;
      tree zero;

      last = worklist.pop ();
      /* Argument 2 of the IFN_MASK_STORE call is the mask vector.  */
      mask = gimple_call_arg (last, 2);
      bb = gimple_bb (last);
      /* Create new bb.  BB is split right after LAST; the fall-through
	 part becomes JOIN_BB, and a fresh STORE_BB is inserted between
	 them to receive the sunk statements.  BB ends with
	 "if (mask == {0,...})": the true edge E skips straight to
	 JOIN_BB, the false edge EFALSE enters STORE_BB.  */
      e = split_block (bb, last);
      join_bb = e->dest;
      store_bb = create_empty_bb (bb);
      add_bb_to_loop (store_bb, loop);
      e->flags = EDGE_TRUE_VALUE;
      efalse = make_edge (bb, store_bb, EDGE_FALSE_VALUE);
      /* Put STORE_BB to likely part.  The mask is expected to be mostly
	 non-zero, so taking the store path is the common case.  */
      efalse->probability = PROB_UNLIKELY;
      store_bb->frequency = PROB_ALWAYS - EDGE_FREQUENCY (efalse);
      make_edge (store_bb, join_bb, EDGE_FALLTHRU);
      if (dom_info_available_p (CDI_DOMINATORS))
	set_immediate_dominator (CDI_DOMINATORS, store_bb, bb);
      if (dump_enabled_p ())
	dump_printf_loc (MSG_NOTE, vect_location,
			 "Create new block %d to sink mask stores.",
			 store_bb->index);
      /* Create vector comparison with boolean result.  */
      vectype = TREE_TYPE (mask);
      zero = build_zero_cst (vectype);
      stmt = gimple_build_cond (EQ_EXPR, mask, zero, NULL_TREE, NULL_TREE);
      gsi = gsi_last_bb (bb);
      gsi_insert_after (&gsi, stmt, GSI_SAME_STMT);
      /* Create new PHI node for vdef of the last masked store:
	 .MEM_2 = VDEF <.MEM_1>
	 will be converted to
	 .MEM.3 = VDEF <.MEM_1>
	 and new PHI node will be created in join bb
	 .MEM_2 = PHI <.MEM_1, .MEM_3>
	 so that code after JOIN_BB keeps using its original virtual
	 operand regardless of which path was taken.  */
      vdef = gimple_vdef (last);
      new_vdef = make_ssa_name (gimple_vop (cfun), last);
      gimple_set_vdef (last, new_vdef);
      phi = create_phi_node (vdef, join_bb);
      add_phi_arg (phi, new_vdef, EDGE_SUCC (store_bb, 0), UNKNOWN_LOCATION);

      /* Put all masked stores with the same mask to STORE_BB if possible.
	 Each pass of this loop moves one masked store and then tries to
	 drag its value producers along.  */
      while (true)
	{
	  gimple_stmt_iterator gsi_from;
	  gimple *stmt1 = NULL;

	  /* Move masked store to STORE_BB.  */
	  last_store = last;
	  gsi = gsi_for_stmt (last);
	  gsi_from = gsi;
	  /* Shift GSI to the previous stmt for further traversal.  */
	  gsi_prev (&gsi);
	  gsi_to = gsi_start_bb (store_bb);
	  gsi_move_before (&gsi_from, &gsi_to);
	  /* Setup GSI_TO to the non-empty block start.  */
	  gsi_to = gsi_start_bb (store_bb);
	  if (dump_enabled_p ())
	    {
	      dump_printf_loc (MSG_NOTE, vect_location,
			       "Move stmt to created bb\n");
	      dump_gimple_stmt (MSG_NOTE, TDF_SLIM, last, 0);
	    }
	  /* Move all stored value producers if possible.  Walk backwards
	     from the store; a producer may only be sunk when sinking it
	     cannot change observable behaviour (checks below).  */
	  while (!gsi_end_p (gsi))
	    {
	      tree lhs;
	      imm_use_iterator imm_iter;
	      use_operand_p use_p;
	      bool res;

	      /* Skip debug statements.  */
	      if (is_gimple_debug (gsi_stmt (gsi)))
		{
		  gsi_prev (&gsi);
		  continue;
		}
	      stmt1 = gsi_stmt (gsi);
	      /* Do not consider statements writing to memory or having
		 volatile operand.  */
	      if (gimple_vdef (stmt1)
		  || gimple_has_volatile_ops (stmt1))
		break;
	      gsi_from = gsi;
	      gsi_prev (&gsi);
	      lhs = gimple_get_lhs (stmt1);
	      if (!lhs)
		break;

	      /* LHS of vectorized stmt must be SSA_NAME.  */
	      if (TREE_CODE (lhs) != SSA_NAME)
		break;

	      if (!VECTOR_TYPE_P (TREE_TYPE (lhs)))
		{
		  /* Remove dead scalar statement.  */
		  if (has_zero_uses (lhs))
		    {
		      gsi_remove (&gsi_from, true);
		      continue;
		    }
		}

	      /* Check that LHS does not have uses outside of STORE_BB.
		 Sinking would otherwise leave a use that the definition
		 no longer dominates.  */
	      res = true;
	      FOR_EACH_IMM_USE_FAST (use_p, imm_iter, lhs)
		{
		  gimple *use_stmt;
		  use_stmt = USE_STMT (use_p);
		  if (is_gimple_debug (use_stmt))
		    continue;
		  if (gimple_bb (use_stmt) != store_bb)
		    {
		      res = false;
		      break;
		    }
		}
	      if (!res)
		break;

	      /* A load must read the same memory state as the store we
		 just sank; a differing vuse means an intervening memory
		 write, so stop.  */
	      if (gimple_vuse (stmt1)
		  && gimple_vuse (stmt1) != gimple_vuse (last_store))
		break;

	      /* Can move STMT1 to STORE_BB.  */
	      if (dump_enabled_p ())
		{
		  dump_printf_loc (MSG_NOTE, vect_location,
				   "Move stmt to created bb\n");
		  dump_gimple_stmt (MSG_NOTE, TDF_SLIM, stmt1, 0);
		}
	      gsi_move_before (&gsi_from, &gsi_to);
	      /* Shift GSI_TO for further insertion.  */
	      gsi_prev (&gsi_to);
	    }
	  /* Put other masked stores with the same mask to STORE_BB.
	     The next worklist entry must use the same mask and be the
	     statement the backward scan stopped at (i.e. immediately
	     precede the group already moved); otherwise start a new
	     STORE_BB for it in the outer loop.  */
	  if (worklist.is_empty ()
	      || gimple_call_arg (worklist.last (), 2) != mask
	      || worklist.last () != stmt1)
	    break;
	  last = worklist.pop ();
	}
      /* On the bypass edge the memory state is that of the last store
	 sunk into STORE_BB (its vuse, i.e. the state before it).  */
      add_phi_arg (phi, gimple_vuse (last_store), e, UNKNOWN_LOCATION);
    }
}
7170