1 /* SLP - Basic Block Vectorization
2 Copyright (C) 2007-2018 Free Software Foundation, Inc.
3 Contributed by Dorit Naishlos <dorit@il.ibm.com>
4 and Ira Rosen <irar@il.ibm.com>
5
6 This file is part of GCC.
7
8 GCC is free software; you can redistribute it and/or modify it under
9 the terms of the GNU General Public License as published by the Free
10 Software Foundation; either version 3, or (at your option) any later
11 version.
12
13 GCC is distributed in the hope that it will be useful, but WITHOUT ANY
14 WARRANTY; without even the implied warranty of MERCHANTABILITY or
15 FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License
16 for more details.
17
18 You should have received a copy of the GNU General Public License
19 along with GCC; see the file COPYING3. If not see
20 <http://www.gnu.org/licenses/>. */
21
22 #include "config.h"
23 #include "system.h"
24 #include "coretypes.h"
25 #include "backend.h"
26 #include "target.h"
27 #include "rtl.h"
28 #include "tree.h"
29 #include "gimple.h"
30 #include "tree-pass.h"
31 #include "ssa.h"
32 #include "optabs-tree.h"
33 #include "insn-config.h"
34 #include "recog.h" /* FIXME: for insn_data */
35 #include "params.h"
36 #include "fold-const.h"
37 #include "stor-layout.h"
38 #include "gimple-iterator.h"
39 #include "cfgloop.h"
40 #include "tree-vectorizer.h"
41 #include "langhooks.h"
42 #include "gimple-walk.h"
43 #include "dbgcnt.h"
44 #include "tree-vector-builder.h"
45 #include "vec-perm-indices.h"
46 #include "gimple-fold.h"
47 #include "internal-fn.h"
48
49
50 /* Recursively free the memory allocated for the SLP tree rooted at NODE. */
51
52 static void
vect_free_slp_tree(slp_tree node)53 vect_free_slp_tree (slp_tree node)
54 {
55 int i;
56 slp_tree child;
57
58 FOR_EACH_VEC_ELT (SLP_TREE_CHILDREN (node), i, child)
59 vect_free_slp_tree (child);
60
61 gimple *stmt;
62 FOR_EACH_VEC_ELT (SLP_TREE_SCALAR_STMTS (node), i, stmt)
63 /* After transform some stmts are removed and thus their vinfo is gone. */
64 if (vinfo_for_stmt (stmt))
65 {
66 gcc_assert (STMT_VINFO_NUM_SLP_USES (vinfo_for_stmt (stmt)) > 0);
67 STMT_VINFO_NUM_SLP_USES (vinfo_for_stmt (stmt))--;
68 }
69
70 SLP_TREE_CHILDREN (node).release ();
71 SLP_TREE_SCALAR_STMTS (node).release ();
72 SLP_TREE_VEC_STMTS (node).release ();
73 SLP_TREE_LOAD_PERMUTATION (node).release ();
74
75 free (node);
76 }
77
78
79 /* Free the memory allocated for the SLP instance. */
80
81 void
vect_free_slp_instance(slp_instance instance)82 vect_free_slp_instance (slp_instance instance)
83 {
84 vect_free_slp_tree (SLP_INSTANCE_TREE (instance));
85 SLP_INSTANCE_LOADS (instance).release ();
86 free (instance);
87 }
88
89
90 /* Create an SLP node for SCALAR_STMTS. */
91
92 static slp_tree
vect_create_new_slp_node(vec<gimple * > scalar_stmts)93 vect_create_new_slp_node (vec<gimple *> scalar_stmts)
94 {
95 slp_tree node;
96 gimple *stmt = scalar_stmts[0];
97 unsigned int nops;
98
99 if (is_gimple_call (stmt))
100 nops = gimple_call_num_args (stmt);
101 else if (is_gimple_assign (stmt))
102 {
103 nops = gimple_num_ops (stmt) - 1;
104 if (gimple_assign_rhs_code (stmt) == COND_EXPR)
105 nops++;
106 }
107 else if (gimple_code (stmt) == GIMPLE_PHI)
108 nops = 0;
109 else
110 return NULL;
111
112 node = XNEW (struct _slp_tree);
113 SLP_TREE_SCALAR_STMTS (node) = scalar_stmts;
114 SLP_TREE_VEC_STMTS (node).create (0);
115 SLP_TREE_CHILDREN (node).create (nops);
116 SLP_TREE_LOAD_PERMUTATION (node) = vNULL;
117 SLP_TREE_TWO_OPERATORS (node) = false;
118 SLP_TREE_DEF_TYPE (node) = vect_internal_def;
119
120 unsigned i;
121 FOR_EACH_VEC_ELT (scalar_stmts, i, stmt)
122 STMT_VINFO_NUM_SLP_USES (vinfo_for_stmt (stmt))++;
123
124 return node;
125 }
126
127
128 /* This structure is used in creation of an SLP tree. Each instance
129 corresponds to the same operand in a group of scalar stmts in an SLP
130 node. */
131 typedef struct _slp_oprnd_info
132 {
133 /* Def-stmts for the operands. */
134 vec<gimple *> def_stmts;
135 /* Information about the first statement, its vector def-type, type, the
136 operand itself in case it's constant, and an indication if it's a pattern
137 stmt. */
138 tree first_op_type;
139 enum vect_def_type first_dt;
140 bool first_pattern;
141 bool second_pattern;
142 } *slp_oprnd_info;
143
144
145 /* Allocate operands info for NOPS operands, and GROUP_SIZE def-stmts for each
146 operand. */
147 static vec<slp_oprnd_info>
vect_create_oprnd_info(int nops,int group_size)148 vect_create_oprnd_info (int nops, int group_size)
149 {
150 int i;
151 slp_oprnd_info oprnd_info;
152 vec<slp_oprnd_info> oprnds_info;
153
154 oprnds_info.create (nops);
155 for (i = 0; i < nops; i++)
156 {
157 oprnd_info = XNEW (struct _slp_oprnd_info);
158 oprnd_info->def_stmts.create (group_size);
159 oprnd_info->first_dt = vect_uninitialized_def;
160 oprnd_info->first_op_type = NULL_TREE;
161 oprnd_info->first_pattern = false;
162 oprnd_info->second_pattern = false;
163 oprnds_info.quick_push (oprnd_info);
164 }
165
166 return oprnds_info;
167 }
168
169
170 /* Free operands info. */
171
172 static void
vect_free_oprnd_info(vec<slp_oprnd_info> & oprnds_info)173 vect_free_oprnd_info (vec<slp_oprnd_info> &oprnds_info)
174 {
175 int i;
176 slp_oprnd_info oprnd_info;
177
178 FOR_EACH_VEC_ELT (oprnds_info, i, oprnd_info)
179 {
180 oprnd_info->def_stmts.release ();
181 XDELETE (oprnd_info);
182 }
183
184 oprnds_info.release ();
185 }
186
187
188 /* Find the place of the data-ref in STMT in the interleaving chain that starts
189 from FIRST_STMT. Return -1 if the data-ref is not a part of the chain. */
190
191 int
vect_get_place_in_interleaving_chain(gimple * stmt,gimple * first_stmt)192 vect_get_place_in_interleaving_chain (gimple *stmt, gimple *first_stmt)
193 {
194 gimple *next_stmt = first_stmt;
195 int result = 0;
196
197 if (first_stmt != GROUP_FIRST_ELEMENT (vinfo_for_stmt (stmt)))
198 return -1;
199
200 do
201 {
202 if (next_stmt == stmt)
203 return result;
204 next_stmt = GROUP_NEXT_ELEMENT (vinfo_for_stmt (next_stmt));
205 if (next_stmt)
206 result += GROUP_GAP (vinfo_for_stmt (next_stmt));
207 }
208 while (next_stmt);
209
210 return -1;
211 }
212
213 /* Check whether it is possible to load COUNT elements of type ELT_MODE
214 using the method implemented by duplicate_and_interleave. Return true
215 if so, returning the number of intermediate vectors in *NVECTORS_OUT
216 (if nonnull) and the type of each intermediate vector in *VECTOR_TYPE_OUT
217 (if nonnull). */
218
219 bool
can_duplicate_and_interleave_p(unsigned int count,machine_mode elt_mode,unsigned int * nvectors_out,tree * vector_type_out,tree * permutes)220 can_duplicate_and_interleave_p (unsigned int count, machine_mode elt_mode,
221 unsigned int *nvectors_out,
222 tree *vector_type_out,
223 tree *permutes)
224 {
225 poly_int64 elt_bytes = count * GET_MODE_SIZE (elt_mode);
226 poly_int64 nelts;
227 unsigned int nvectors = 1;
228 for (;;)
229 {
230 scalar_int_mode int_mode;
231 poly_int64 elt_bits = elt_bytes * BITS_PER_UNIT;
232 if (multiple_p (current_vector_size, elt_bytes, &nelts)
233 && int_mode_for_size (elt_bits, 0).exists (&int_mode))
234 {
235 tree int_type = build_nonstandard_integer_type
236 (GET_MODE_BITSIZE (int_mode), 1);
237 tree vector_type = build_vector_type (int_type, nelts);
238 if (VECTOR_MODE_P (TYPE_MODE (vector_type)))
239 {
240 vec_perm_builder sel1 (nelts, 2, 3);
241 vec_perm_builder sel2 (nelts, 2, 3);
242 poly_int64 half_nelts = exact_div (nelts, 2);
243 for (unsigned int i = 0; i < 3; ++i)
244 {
245 sel1.quick_push (i);
246 sel1.quick_push (i + nelts);
247 sel2.quick_push (half_nelts + i);
248 sel2.quick_push (half_nelts + i + nelts);
249 }
250 vec_perm_indices indices1 (sel1, 2, nelts);
251 vec_perm_indices indices2 (sel2, 2, nelts);
252 if (can_vec_perm_const_p (TYPE_MODE (vector_type), indices1)
253 && can_vec_perm_const_p (TYPE_MODE (vector_type), indices2))
254 {
255 if (nvectors_out)
256 *nvectors_out = nvectors;
257 if (vector_type_out)
258 *vector_type_out = vector_type;
259 if (permutes)
260 {
261 permutes[0] = vect_gen_perm_mask_checked (vector_type,
262 indices1);
263 permutes[1] = vect_gen_perm_mask_checked (vector_type,
264 indices2);
265 }
266 return true;
267 }
268 }
269 }
270 if (!multiple_p (elt_bytes, 2, &elt_bytes))
271 return false;
272 nvectors *= 2;
273 }
274 }
275
276 /* Get the defs for the rhs of STMT (collect them in OPRNDS_INFO), check that
277 they are of a valid type and that they match the defs of the first stmt of
278 the SLP group (stored in OPRNDS_INFO). This function tries to match stmts
279 by swapping operands of STMTS[STMT_NUM] when possible. Non-zero *SWAP
280 indicates swap is required for cond_expr stmts. Specifically, *SWAP
281 is 1 if STMT is cond and operands of comparison need to be swapped;
282 *SWAP is 2 if STMT is cond and code of comparison needs to be inverted.
283 If there is any operand swap in this function, *SWAP is set to non-zero
284 value.
285 If there was a fatal error return -1; if the error could be corrected by
286 swapping operands of father node of this one, return 1; if everything is
287 ok return 0. */
288 static int
vect_get_and_check_slp_defs(vec_info * vinfo,unsigned char * swap,vec<gimple * > stmts,unsigned stmt_num,vec<slp_oprnd_info> * oprnds_info)289 vect_get_and_check_slp_defs (vec_info *vinfo, unsigned char *swap,
290 vec<gimple *> stmts, unsigned stmt_num,
291 vec<slp_oprnd_info> *oprnds_info)
292 {
293 gimple *stmt = stmts[stmt_num];
294 tree oprnd;
295 unsigned int i, number_of_oprnds;
296 gimple *def_stmt;
297 enum vect_def_type dt = vect_uninitialized_def;
298 bool pattern = false;
299 slp_oprnd_info oprnd_info;
300 int first_op_idx = 1;
301 bool commutative = false;
302 bool first_op_cond = false;
303 bool first = stmt_num == 0;
304 bool second = stmt_num == 1;
305
306 if (is_gimple_call (stmt))
307 {
308 number_of_oprnds = gimple_call_num_args (stmt);
309 first_op_idx = 3;
310 }
311 else if (is_gimple_assign (stmt))
312 {
313 enum tree_code code = gimple_assign_rhs_code (stmt);
314 number_of_oprnds = gimple_num_ops (stmt) - 1;
315 /* Swap can only be done for cond_expr if asked to, otherwise we
316 could result in different comparison code to the first stmt. */
317 if (gimple_assign_rhs_code (stmt) == COND_EXPR
318 && COMPARISON_CLASS_P (gimple_assign_rhs1 (stmt)))
319 {
320 first_op_cond = true;
321 number_of_oprnds++;
322 }
323 else
324 commutative = commutative_tree_code (code);
325 }
326 else
327 return -1;
328
329 bool swapped = (*swap != 0);
330 gcc_assert (!swapped || first_op_cond);
331 for (i = 0; i < number_of_oprnds; i++)
332 {
333 again:
334 if (first_op_cond)
335 {
336 /* Map indicating how operands of cond_expr should be swapped. */
337 int maps[3][4] = {{0, 1, 2, 3}, {1, 0, 2, 3}, {0, 1, 3, 2}};
338 int *map = maps[*swap];
339
340 if (i < 2)
341 oprnd = TREE_OPERAND (gimple_op (stmt, first_op_idx), map[i]);
342 else
343 oprnd = gimple_op (stmt, map[i]);
344 }
345 else
346 oprnd = gimple_op (stmt, first_op_idx + (swapped ? !i : i));
347
348 oprnd_info = (*oprnds_info)[i];
349
350 if (!vect_is_simple_use (oprnd, vinfo, &def_stmt, &dt))
351 {
352 if (dump_enabled_p ())
353 {
354 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
355 "Build SLP failed: can't analyze def for ");
356 dump_generic_expr (MSG_MISSED_OPTIMIZATION, TDF_SLIM, oprnd);
357 dump_printf (MSG_MISSED_OPTIMIZATION, "\n");
358 }
359
360 return -1;
361 }
362
363 /* Check if DEF_STMT is a part of a pattern in LOOP and get the def stmt
364 from the pattern. Check that all the stmts of the node are in the
365 pattern. */
366 if (def_stmt && gimple_bb (def_stmt)
367 && vect_stmt_in_region_p (vinfo, def_stmt)
368 && vinfo_for_stmt (def_stmt)
369 && STMT_VINFO_IN_PATTERN_P (vinfo_for_stmt (def_stmt))
370 && !STMT_VINFO_RELEVANT (vinfo_for_stmt (def_stmt))
371 && !STMT_VINFO_LIVE_P (vinfo_for_stmt (def_stmt)))
372 {
373 pattern = true;
374 if (!first && !oprnd_info->first_pattern
375 /* Allow different pattern state for the defs of the
376 first stmt in reduction chains. */
377 && (oprnd_info->first_dt != vect_reduction_def
378 || (!second && !oprnd_info->second_pattern)))
379 {
380 if (i == 0
381 && !swapped
382 && commutative)
383 {
384 swapped = true;
385 goto again;
386 }
387
388 if (dump_enabled_p ())
389 {
390 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
391 "Build SLP failed: some of the stmts"
392 " are in a pattern, and others are not ");
393 dump_generic_expr (MSG_MISSED_OPTIMIZATION, TDF_SLIM, oprnd);
394 dump_printf (MSG_MISSED_OPTIMIZATION, "\n");
395 }
396
397 return 1;
398 }
399
400 def_stmt = STMT_VINFO_RELATED_STMT (vinfo_for_stmt (def_stmt));
401 dt = STMT_VINFO_DEF_TYPE (vinfo_for_stmt (def_stmt));
402
403 if (dt == vect_unknown_def_type)
404 {
405 if (dump_enabled_p ())
406 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
407 "Unsupported pattern.\n");
408 return -1;
409 }
410
411 switch (gimple_code (def_stmt))
412 {
413 case GIMPLE_PHI:
414 case GIMPLE_ASSIGN:
415 break;
416
417 default:
418 if (dump_enabled_p ())
419 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
420 "unsupported defining stmt:\n");
421 return -1;
422 }
423 }
424
425 if (second)
426 oprnd_info->second_pattern = pattern;
427
428 if (first)
429 {
430 oprnd_info->first_dt = dt;
431 oprnd_info->first_pattern = pattern;
432 oprnd_info->first_op_type = TREE_TYPE (oprnd);
433 }
434 else
435 {
436 /* Not first stmt of the group, check that the def-stmt/s match
437 the def-stmt/s of the first stmt. Allow different definition
438 types for reduction chains: the first stmt must be a
439 vect_reduction_def (a phi node), and the rest
440 vect_internal_def. */
441 tree type = TREE_TYPE (oprnd);
442 if ((oprnd_info->first_dt != dt
443 && !(oprnd_info->first_dt == vect_reduction_def
444 && dt == vect_internal_def)
445 && !((oprnd_info->first_dt == vect_external_def
446 || oprnd_info->first_dt == vect_constant_def)
447 && (dt == vect_external_def
448 || dt == vect_constant_def)))
449 || !types_compatible_p (oprnd_info->first_op_type, type))
450 {
451 /* Try swapping operands if we got a mismatch. */
452 if (i == 0
453 && !swapped
454 && commutative)
455 {
456 swapped = true;
457 goto again;
458 }
459
460 if (dump_enabled_p ())
461 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
462 "Build SLP failed: different types\n");
463
464 return 1;
465 }
466 if ((dt == vect_constant_def
467 || dt == vect_external_def)
468 && !current_vector_size.is_constant ()
469 && (TREE_CODE (type) == BOOLEAN_TYPE
470 || !can_duplicate_and_interleave_p (stmts.length (),
471 TYPE_MODE (type))))
472 {
473 if (dump_enabled_p ())
474 {
475 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
476 "Build SLP failed: invalid type of def "
477 "for variable-length SLP ");
478 dump_generic_expr (MSG_MISSED_OPTIMIZATION, TDF_SLIM, oprnd);
479 dump_printf (MSG_MISSED_OPTIMIZATION, "\n");
480 }
481 return -1;
482 }
483 }
484
485 /* Check the types of the definitions. */
486 switch (dt)
487 {
488 case vect_constant_def:
489 case vect_external_def:
490 break;
491
492 case vect_reduction_def:
493 case vect_induction_def:
494 case vect_internal_def:
495 oprnd_info->def_stmts.quick_push (def_stmt);
496 break;
497
498 default:
499 /* FORNOW: Not supported. */
500 if (dump_enabled_p ())
501 {
502 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
503 "Build SLP failed: illegal type of def ");
504 dump_generic_expr (MSG_MISSED_OPTIMIZATION, TDF_SLIM, oprnd);
505 dump_printf (MSG_MISSED_OPTIMIZATION, "\n");
506 }
507
508 return -1;
509 }
510 }
511
512 /* Swap operands. */
513 if (swapped)
514 {
515 /* If there are already uses of this stmt in a SLP instance then
516 we've committed to the operand order and can't swap it. */
517 if (STMT_VINFO_NUM_SLP_USES (vinfo_for_stmt (stmt)) != 0)
518 {
519 if (dump_enabled_p ())
520 {
521 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
522 "Build SLP failed: cannot swap operands of "
523 "shared stmt ");
524 dump_gimple_stmt (MSG_MISSED_OPTIMIZATION, TDF_SLIM, stmt, 0);
525 }
526 return -1;
527 }
528
529 if (first_op_cond)
530 {
531 tree cond = gimple_assign_rhs1 (stmt);
532 enum tree_code code = TREE_CODE (cond);
533
534 /* Swap. */
535 if (*swap == 1)
536 {
537 swap_ssa_operands (stmt, &TREE_OPERAND (cond, 0),
538 &TREE_OPERAND (cond, 1));
539 TREE_SET_CODE (cond, swap_tree_comparison (code));
540 }
541 /* Invert. */
542 else
543 {
544 swap_ssa_operands (stmt, gimple_assign_rhs2_ptr (stmt),
545 gimple_assign_rhs3_ptr (stmt));
546 bool honor_nans = HONOR_NANS (TREE_OPERAND (cond, 0));
547 code = invert_tree_comparison (TREE_CODE (cond), honor_nans);
548 gcc_assert (code != ERROR_MARK);
549 TREE_SET_CODE (cond, code);
550 }
551 }
552 else
553 swap_ssa_operands (stmt, gimple_assign_rhs1_ptr (stmt),
554 gimple_assign_rhs2_ptr (stmt));
555 if (dump_enabled_p ())
556 {
557 dump_printf_loc (MSG_NOTE, vect_location,
558 "swapped operands to match def types in ");
559 dump_gimple_stmt (MSG_NOTE, TDF_SLIM, stmt, 0);
560 }
561 }
562
563 *swap = swapped;
564 return 0;
565 }
566
567 /* A subroutine of vect_build_slp_tree for checking VECTYPE, which is the
568 caller's attempt to find the vector type in STMT with the narrowest
569 element type. Return true if VECTYPE is nonnull and if it is valid
570 for VINFO. When returning true, update MAX_NUNITS to reflect the
571 number of units in VECTYPE. VINFO, GORUP_SIZE and MAX_NUNITS are
572 as for vect_build_slp_tree. */
573
574 static bool
vect_record_max_nunits(vec_info * vinfo,gimple * stmt,unsigned int group_size,tree vectype,poly_uint64 * max_nunits)575 vect_record_max_nunits (vec_info *vinfo, gimple *stmt, unsigned int group_size,
576 tree vectype, poly_uint64 *max_nunits)
577 {
578 if (!vectype)
579 {
580 if (dump_enabled_p ())
581 {
582 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
583 "Build SLP failed: unsupported data-type in ");
584 dump_gimple_stmt (MSG_MISSED_OPTIMIZATION, TDF_SLIM, stmt, 0);
585 dump_printf (MSG_MISSED_OPTIMIZATION, "\n");
586 }
587 /* Fatal mismatch. */
588 return false;
589 }
590
591 /* If populating the vector type requires unrolling then fail
592 before adjusting *max_nunits for basic-block vectorization. */
593 poly_uint64 nunits = TYPE_VECTOR_SUBPARTS (vectype);
594 unsigned HOST_WIDE_INT const_nunits;
595 if (is_a <bb_vec_info> (vinfo)
596 && (!nunits.is_constant (&const_nunits)
597 || const_nunits > group_size))
598 {
599 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
600 "Build SLP failed: unrolling required "
601 "in basic block SLP\n");
602 /* Fatal mismatch. */
603 return false;
604 }
605
606 /* In case of multiple types we need to detect the smallest type. */
607 vect_update_max_nunits (max_nunits, vectype);
608 return true;
609 }
610
611 /* Verify if the scalar stmts STMTS are isomorphic, require data
612 permutation or are of unsupported types of operation. Return
613 true if they are, otherwise return false and indicate in *MATCHES
614 which stmts are not isomorphic to the first one. If MATCHES[0]
615 is false then this indicates the comparison could not be
616 carried out or the stmts will never be vectorized by SLP.
617
618 Note COND_EXPR is possibly ismorphic to another one after swapping its
619 operands. Set SWAP[i] to 1 if stmt I is COND_EXPR and isomorphic to
620 the first stmt by swapping the two operands of comparison; set SWAP[i]
621 to 2 if stmt I is isormorphic to the first stmt by inverting the code
622 of comparison. Take A1 >= B1 ? X1 : Y1 as an exmple, it can be swapped
623 to (B1 <= A1 ? X1 : Y1); or be inverted to (A1 < B1) ? Y1 : X1. */
624
625 static bool
vect_build_slp_tree_1(vec_info * vinfo,unsigned char * swap,vec<gimple * > stmts,unsigned int group_size,unsigned nops,poly_uint64 * max_nunits,bool * matches,bool * two_operators)626 vect_build_slp_tree_1 (vec_info *vinfo, unsigned char *swap,
627 vec<gimple *> stmts, unsigned int group_size,
628 unsigned nops, poly_uint64 *max_nunits,
629 bool *matches, bool *two_operators)
630 {
631 unsigned int i;
632 gimple *first_stmt = stmts[0], *stmt = stmts[0];
633 enum tree_code first_stmt_code = ERROR_MARK;
634 enum tree_code alt_stmt_code = ERROR_MARK;
635 enum tree_code rhs_code = ERROR_MARK;
636 enum tree_code first_cond_code = ERROR_MARK;
637 tree lhs;
638 bool need_same_oprnds = false;
639 tree vectype = NULL_TREE, scalar_type, first_op1 = NULL_TREE;
640 optab optab;
641 int icode;
642 machine_mode optab_op2_mode;
643 machine_mode vec_mode;
644 HOST_WIDE_INT dummy;
645 gimple *first_load = NULL, *prev_first_load = NULL;
646
647 /* For every stmt in NODE find its def stmt/s. */
648 FOR_EACH_VEC_ELT (stmts, i, stmt)
649 {
650 swap[i] = 0;
651 matches[i] = false;
652
653 if (dump_enabled_p ())
654 {
655 dump_printf_loc (MSG_NOTE, vect_location, "Build SLP for ");
656 dump_gimple_stmt (MSG_NOTE, TDF_SLIM, stmt, 0);
657 }
658
659 /* Fail to vectorize statements marked as unvectorizable. */
660 if (!STMT_VINFO_VECTORIZABLE (vinfo_for_stmt (stmt)))
661 {
662 if (dump_enabled_p ())
663 {
664 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
665 "Build SLP failed: unvectorizable statement ");
666 dump_gimple_stmt (MSG_MISSED_OPTIMIZATION, TDF_SLIM, stmt, 0);
667 }
668 /* Fatal mismatch. */
669 matches[0] = false;
670 return false;
671 }
672
673 lhs = gimple_get_lhs (stmt);
674 if (lhs == NULL_TREE)
675 {
676 if (dump_enabled_p ())
677 {
678 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
679 "Build SLP failed: not GIMPLE_ASSIGN nor "
680 "GIMPLE_CALL ");
681 dump_gimple_stmt (MSG_MISSED_OPTIMIZATION, TDF_SLIM, stmt, 0);
682 }
683 /* Fatal mismatch. */
684 matches[0] = false;
685 return false;
686 }
687
688 scalar_type = vect_get_smallest_scalar_type (stmt, &dummy, &dummy);
689 vectype = get_vectype_for_scalar_type (scalar_type);
690 if (!vect_record_max_nunits (vinfo, stmt, group_size, vectype,
691 max_nunits))
692 {
693 /* Fatal mismatch. */
694 matches[0] = false;
695 return false;
696 }
697
698 if (gcall *call_stmt = dyn_cast <gcall *> (stmt))
699 {
700 rhs_code = CALL_EXPR;
701 if (gimple_call_internal_p (call_stmt)
702 || gimple_call_tail_p (call_stmt)
703 || gimple_call_noreturn_p (call_stmt)
704 || !gimple_call_nothrow_p (call_stmt)
705 || gimple_call_chain (call_stmt))
706 {
707 if (dump_enabled_p ())
708 {
709 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
710 "Build SLP failed: unsupported call type ");
711 dump_gimple_stmt (MSG_MISSED_OPTIMIZATION, TDF_SLIM,
712 call_stmt, 0);
713 }
714 /* Fatal mismatch. */
715 matches[0] = false;
716 return false;
717 }
718 }
719 else
720 rhs_code = gimple_assign_rhs_code (stmt);
721
722 /* Check the operation. */
723 if (i == 0)
724 {
725 first_stmt_code = rhs_code;
726
727 /* Shift arguments should be equal in all the packed stmts for a
728 vector shift with scalar shift operand. */
729 if (rhs_code == LSHIFT_EXPR || rhs_code == RSHIFT_EXPR
730 || rhs_code == LROTATE_EXPR
731 || rhs_code == RROTATE_EXPR)
732 {
733 vec_mode = TYPE_MODE (vectype);
734
735 /* First see if we have a vector/vector shift. */
736 optab = optab_for_tree_code (rhs_code, vectype,
737 optab_vector);
738
739 if (!optab
740 || optab_handler (optab, vec_mode) == CODE_FOR_nothing)
741 {
742 /* No vector/vector shift, try for a vector/scalar shift. */
743 optab = optab_for_tree_code (rhs_code, vectype,
744 optab_scalar);
745
746 if (!optab)
747 {
748 if (dump_enabled_p ())
749 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
750 "Build SLP failed: no optab.\n");
751 /* Fatal mismatch. */
752 matches[0] = false;
753 return false;
754 }
755 icode = (int) optab_handler (optab, vec_mode);
756 if (icode == CODE_FOR_nothing)
757 {
758 if (dump_enabled_p ())
759 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
760 "Build SLP failed: "
761 "op not supported by target.\n");
762 /* Fatal mismatch. */
763 matches[0] = false;
764 return false;
765 }
766 optab_op2_mode = insn_data[icode].operand[2].mode;
767 if (!VECTOR_MODE_P (optab_op2_mode))
768 {
769 need_same_oprnds = true;
770 first_op1 = gimple_assign_rhs2 (stmt);
771 }
772 }
773 }
774 else if (rhs_code == WIDEN_LSHIFT_EXPR)
775 {
776 need_same_oprnds = true;
777 first_op1 = gimple_assign_rhs2 (stmt);
778 }
779 }
780 else
781 {
782 if (first_stmt_code != rhs_code
783 && alt_stmt_code == ERROR_MARK)
784 alt_stmt_code = rhs_code;
785 if (first_stmt_code != rhs_code
786 && (first_stmt_code != IMAGPART_EXPR
787 || rhs_code != REALPART_EXPR)
788 && (first_stmt_code != REALPART_EXPR
789 || rhs_code != IMAGPART_EXPR)
790 /* Handle mismatches in plus/minus by computing both
791 and merging the results. */
792 && !((first_stmt_code == PLUS_EXPR
793 || first_stmt_code == MINUS_EXPR)
794 && (alt_stmt_code == PLUS_EXPR
795 || alt_stmt_code == MINUS_EXPR)
796 && rhs_code == alt_stmt_code)
797 && !(STMT_VINFO_GROUPED_ACCESS (vinfo_for_stmt (stmt))
798 && (first_stmt_code == ARRAY_REF
799 || first_stmt_code == BIT_FIELD_REF
800 || first_stmt_code == INDIRECT_REF
801 || first_stmt_code == COMPONENT_REF
802 || first_stmt_code == MEM_REF)))
803 {
804 if (dump_enabled_p ())
805 {
806 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
807 "Build SLP failed: different operation "
808 "in stmt ");
809 dump_gimple_stmt (MSG_MISSED_OPTIMIZATION, TDF_SLIM, stmt, 0);
810 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
811 "original stmt ");
812 dump_gimple_stmt (MSG_MISSED_OPTIMIZATION, TDF_SLIM,
813 first_stmt, 0);
814 }
815 /* Mismatch. */
816 continue;
817 }
818
819 if (need_same_oprnds
820 && !operand_equal_p (first_op1, gimple_assign_rhs2 (stmt), 0))
821 {
822 if (dump_enabled_p ())
823 {
824 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
825 "Build SLP failed: different shift "
826 "arguments in ");
827 dump_gimple_stmt (MSG_MISSED_OPTIMIZATION, TDF_SLIM, stmt, 0);
828 }
829 /* Mismatch. */
830 continue;
831 }
832
833 if (rhs_code == CALL_EXPR)
834 {
835 gimple *first_stmt = stmts[0];
836 if (gimple_call_num_args (stmt) != nops
837 || !operand_equal_p (gimple_call_fn (first_stmt),
838 gimple_call_fn (stmt), 0)
839 || gimple_call_fntype (first_stmt)
840 != gimple_call_fntype (stmt))
841 {
842 if (dump_enabled_p ())
843 {
844 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
845 "Build SLP failed: different calls in ");
846 dump_gimple_stmt (MSG_MISSED_OPTIMIZATION, TDF_SLIM,
847 stmt, 0);
848 }
849 /* Mismatch. */
850 continue;
851 }
852 }
853 }
854
855 /* Grouped store or load. */
856 if (STMT_VINFO_GROUPED_ACCESS (vinfo_for_stmt (stmt)))
857 {
858 if (REFERENCE_CLASS_P (lhs))
859 {
860 /* Store. */
861 ;
862 }
863 else
864 {
865 /* Load. */
866 first_load = GROUP_FIRST_ELEMENT (vinfo_for_stmt (stmt));
867 if (prev_first_load)
868 {
869 /* Check that there are no loads from different interleaving
870 chains in the same node. */
871 if (prev_first_load != first_load)
872 {
873 if (dump_enabled_p ())
874 {
875 dump_printf_loc (MSG_MISSED_OPTIMIZATION,
876 vect_location,
877 "Build SLP failed: different "
878 "interleaving chains in one node ");
879 dump_gimple_stmt (MSG_MISSED_OPTIMIZATION, TDF_SLIM,
880 stmt, 0);
881 }
882 /* Mismatch. */
883 continue;
884 }
885 }
886 else
887 prev_first_load = first_load;
888 }
889 } /* Grouped access. */
890 else
891 {
892 if (TREE_CODE_CLASS (rhs_code) == tcc_reference)
893 {
894 /* Not grouped load. */
895 if (dump_enabled_p ())
896 {
897 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
898 "Build SLP failed: not grouped load ");
899 dump_gimple_stmt (MSG_MISSED_OPTIMIZATION, TDF_SLIM, stmt, 0);
900 }
901
902 /* FORNOW: Not grouped loads are not supported. */
903 /* Fatal mismatch. */
904 matches[0] = false;
905 return false;
906 }
907
908 /* Not memory operation. */
909 if (TREE_CODE_CLASS (rhs_code) != tcc_binary
910 && TREE_CODE_CLASS (rhs_code) != tcc_unary
911 && TREE_CODE_CLASS (rhs_code) != tcc_expression
912 && TREE_CODE_CLASS (rhs_code) != tcc_comparison
913 && rhs_code != CALL_EXPR)
914 {
915 if (dump_enabled_p ())
916 {
917 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
918 "Build SLP failed: operation");
919 dump_printf (MSG_MISSED_OPTIMIZATION, " unsupported ");
920 dump_gimple_stmt (MSG_MISSED_OPTIMIZATION, TDF_SLIM, stmt, 0);
921 }
922 /* Fatal mismatch. */
923 matches[0] = false;
924 return false;
925 }
926
927 if (rhs_code == COND_EXPR)
928 {
929 tree cond_expr = gimple_assign_rhs1 (stmt);
930 enum tree_code cond_code = TREE_CODE (cond_expr);
931 enum tree_code swap_code = ERROR_MARK;
932 enum tree_code invert_code = ERROR_MARK;
933
934 if (i == 0)
935 first_cond_code = TREE_CODE (cond_expr);
936 else if (TREE_CODE_CLASS (cond_code) == tcc_comparison)
937 {
938 bool honor_nans = HONOR_NANS (TREE_OPERAND (cond_expr, 0));
939 swap_code = swap_tree_comparison (cond_code);
940 invert_code = invert_tree_comparison (cond_code, honor_nans);
941 }
942
943 if (first_cond_code == cond_code)
944 ;
945 /* Isomorphic can be achieved by swapping. */
946 else if (first_cond_code == swap_code)
947 swap[i] = 1;
948 /* Isomorphic can be achieved by inverting. */
949 else if (first_cond_code == invert_code)
950 swap[i] = 2;
951 else
952 {
953 if (dump_enabled_p ())
954 {
955 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
956 "Build SLP failed: different"
957 " operation");
958 dump_gimple_stmt (MSG_MISSED_OPTIMIZATION, TDF_SLIM,
959 stmt, 0);
960 }
961 /* Mismatch. */
962 continue;
963 }
964 }
965 }
966
967 matches[i] = true;
968 }
969
970 for (i = 0; i < group_size; ++i)
971 if (!matches[i])
972 return false;
973
974 /* If we allowed a two-operation SLP node verify the target can cope
975 with the permute we are going to use. */
976 poly_uint64 nunits = TYPE_VECTOR_SUBPARTS (vectype);
977 if (alt_stmt_code != ERROR_MARK
978 && TREE_CODE_CLASS (alt_stmt_code) != tcc_reference)
979 {
980 unsigned HOST_WIDE_INT count;
981 if (!nunits.is_constant (&count))
982 {
983 if (dump_enabled_p ())
984 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
985 "Build SLP failed: different operations "
986 "not allowed with variable-length SLP.\n");
987 return false;
988 }
989 vec_perm_builder sel (count, count, 1);
990 for (i = 0; i < count; ++i)
991 {
992 unsigned int elt = i;
993 if (gimple_assign_rhs_code (stmts[i % group_size]) == alt_stmt_code)
994 elt += count;
995 sel.quick_push (elt);
996 }
997 vec_perm_indices indices (sel, 2, count);
998 if (!can_vec_perm_const_p (TYPE_MODE (vectype), indices))
999 {
1000 for (i = 0; i < group_size; ++i)
1001 if (gimple_assign_rhs_code (stmts[i]) == alt_stmt_code)
1002 {
1003 matches[i] = false;
1004 if (dump_enabled_p ())
1005 {
1006 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
1007 "Build SLP failed: different operation "
1008 "in stmt ");
1009 dump_gimple_stmt (MSG_MISSED_OPTIMIZATION, TDF_SLIM,
1010 stmts[i], 0);
1011 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
1012 "original stmt ");
1013 dump_gimple_stmt (MSG_MISSED_OPTIMIZATION, TDF_SLIM,
1014 first_stmt, 0);
1015 }
1016 }
1017 return false;
1018 }
1019 *two_operators = true;
1020 }
1021
1022 return true;
1023 }
1024
1025 /* Traits for the hash_set to record failed SLP builds for a stmt set.
1026 Note we never remove apart from at destruction time so we do not
1027 need a special value for deleted that differs from empty. */
1028 struct bst_traits
1029 {
1030 typedef vec <gimple *> value_type;
1031 typedef vec <gimple *> compare_type;
1032 static inline hashval_t hash (value_type);
1033 static inline bool equal (value_type existing, value_type candidate);
is_emptybst_traits1034 static inline bool is_empty (value_type x) { return !x.exists (); }
is_deletedbst_traits1035 static inline bool is_deleted (value_type x) { return !x.exists (); }
mark_emptybst_traits1036 static inline void mark_empty (value_type &x) { x.release (); }
mark_deletedbst_traits1037 static inline void mark_deleted (value_type &x) { x.release (); }
removebst_traits1038 static inline void remove (value_type &x) { x.release (); }
1039 };
1040 inline hashval_t
hash(value_type x)1041 bst_traits::hash (value_type x)
1042 {
1043 inchash::hash h;
1044 for (unsigned i = 0; i < x.length (); ++i)
1045 h.add_int (gimple_uid (x[i]));
1046 return h.end ();
1047 }
1048 inline bool
equal(value_type existing,value_type candidate)1049 bst_traits::equal (value_type existing, value_type candidate)
1050 {
1051 if (existing.length () != candidate.length ())
1052 return false;
1053 for (unsigned i = 0; i < existing.length (); ++i)
1054 if (existing[i] != candidate[i])
1055 return false;
1056 return true;
1057 }
1058
1059 typedef hash_set <vec <gimple *>, bst_traits> scalar_stmts_set_t;
1060 static scalar_stmts_set_t *bst_fail;
1061
1062 static slp_tree
1063 vect_build_slp_tree_2 (vec_info *vinfo,
1064 vec<gimple *> stmts, unsigned int group_size,
1065 poly_uint64 *max_nunits,
1066 vec<slp_tree> *loads,
1067 bool *matches, unsigned *npermutes, unsigned *tree_size,
1068 unsigned max_tree_size);
1069
1070 static slp_tree
vect_build_slp_tree(vec_info * vinfo,vec<gimple * > stmts,unsigned int group_size,poly_uint64 * max_nunits,vec<slp_tree> * loads,bool * matches,unsigned * npermutes,unsigned * tree_size,unsigned max_tree_size)1071 vect_build_slp_tree (vec_info *vinfo,
1072 vec<gimple *> stmts, unsigned int group_size,
1073 poly_uint64 *max_nunits, vec<slp_tree> *loads,
1074 bool *matches, unsigned *npermutes, unsigned *tree_size,
1075 unsigned max_tree_size)
1076 {
1077 if (bst_fail->contains (stmts))
1078 return NULL;
1079 slp_tree res = vect_build_slp_tree_2 (vinfo, stmts, group_size, max_nunits,
1080 loads, matches, npermutes, tree_size,
1081 max_tree_size);
1082 /* When SLP build fails for stmts record this, otherwise SLP build
1083 can be exponential in time when we allow to construct parts from
1084 scalars, see PR81723. */
1085 if (! res)
1086 {
1087 vec <gimple *> x;
1088 x.create (stmts.length ());
1089 x.splice (stmts);
1090 bst_fail->add (x);
1091 }
1092 return res;
1093 }
1094
1095 /* Recursively build an SLP tree starting from NODE.
1096 Fail (and return a value not equal to zero) if def-stmts are not
1097 isomorphic, require data permutation or are of unsupported types of
1098 operation. Otherwise, return 0.
1099 The value returned is the depth in the SLP tree where a mismatch
1100 was found. */
1101
1102 static slp_tree
vect_build_slp_tree_2(vec_info * vinfo,vec<gimple * > stmts,unsigned int group_size,poly_uint64 * max_nunits,vec<slp_tree> * loads,bool * matches,unsigned * npermutes,unsigned * tree_size,unsigned max_tree_size)1103 vect_build_slp_tree_2 (vec_info *vinfo,
1104 vec<gimple *> stmts, unsigned int group_size,
1105 poly_uint64 *max_nunits,
1106 vec<slp_tree> *loads,
1107 bool *matches, unsigned *npermutes, unsigned *tree_size,
1108 unsigned max_tree_size)
1109 {
1110 unsigned nops, i, this_tree_size = 0;
1111 poly_uint64 this_max_nunits = *max_nunits;
1112 gimple *stmt;
1113 slp_tree node;
1114
1115 matches[0] = false;
1116
1117 stmt = stmts[0];
1118 if (is_gimple_call (stmt))
1119 nops = gimple_call_num_args (stmt);
1120 else if (is_gimple_assign (stmt))
1121 {
1122 nops = gimple_num_ops (stmt) - 1;
1123 if (gimple_assign_rhs_code (stmt) == COND_EXPR)
1124 nops++;
1125 }
1126 else if (gimple_code (stmt) == GIMPLE_PHI)
1127 nops = 0;
1128 else
1129 return NULL;
1130
1131 /* If the SLP node is a PHI (induction or reduction), terminate
1132 the recursion. */
1133 if (gimple_code (stmt) == GIMPLE_PHI)
1134 {
1135 tree scalar_type = TREE_TYPE (PHI_RESULT (stmt));
1136 tree vectype = get_vectype_for_scalar_type (scalar_type);
1137 if (!vect_record_max_nunits (vinfo, stmt, group_size, vectype,
1138 max_nunits))
1139 return NULL;
1140
1141 vect_def_type def_type = STMT_VINFO_DEF_TYPE (vinfo_for_stmt (stmt));
1142 /* Induction from different IVs is not supported. */
1143 if (def_type == vect_induction_def)
1144 {
1145 FOR_EACH_VEC_ELT (stmts, i, stmt)
1146 if (stmt != stmts[0])
1147 return NULL;
1148 }
1149 else
1150 {
1151 /* Else def types have to match. */
1152 FOR_EACH_VEC_ELT (stmts, i, stmt)
1153 {
1154 /* But for reduction chains only check on the first stmt. */
1155 if (GROUP_FIRST_ELEMENT (vinfo_for_stmt (stmt))
1156 && GROUP_FIRST_ELEMENT (vinfo_for_stmt (stmt)) != stmt)
1157 continue;
1158 if (STMT_VINFO_DEF_TYPE (vinfo_for_stmt (stmt)) != def_type)
1159 return NULL;
1160 }
1161 }
1162 node = vect_create_new_slp_node (stmts);
1163 return node;
1164 }
1165
1166
1167 bool two_operators = false;
1168 unsigned char *swap = XALLOCAVEC (unsigned char, group_size);
1169 if (!vect_build_slp_tree_1 (vinfo, swap,
1170 stmts, group_size, nops,
1171 &this_max_nunits, matches, &two_operators))
1172 return NULL;
1173
1174 /* If the SLP node is a load, terminate the recursion. */
1175 if (STMT_VINFO_GROUPED_ACCESS (vinfo_for_stmt (stmt))
1176 && DR_IS_READ (STMT_VINFO_DATA_REF (vinfo_for_stmt (stmt))))
1177 {
1178 *max_nunits = this_max_nunits;
1179 node = vect_create_new_slp_node (stmts);
1180 loads->safe_push (node);
1181 return node;
1182 }
1183
1184 /* Get at the operands, verifying they are compatible. */
1185 vec<slp_oprnd_info> oprnds_info = vect_create_oprnd_info (nops, group_size);
1186 slp_oprnd_info oprnd_info;
1187 FOR_EACH_VEC_ELT (stmts, i, stmt)
1188 {
1189 int res = vect_get_and_check_slp_defs (vinfo, &swap[i],
1190 stmts, i, &oprnds_info);
1191 if (res != 0)
1192 matches[(res == -1) ? 0 : i] = false;
1193 if (!matches[0])
1194 break;
1195 }
1196 for (i = 0; i < group_size; ++i)
1197 if (!matches[i])
1198 {
1199 vect_free_oprnd_info (oprnds_info);
1200 return NULL;
1201 }
1202
1203 auto_vec<slp_tree, 4> children;
1204 auto_vec<slp_tree> this_loads;
1205
1206 stmt = stmts[0];
1207
1208 if (tree_size)
1209 max_tree_size -= *tree_size;
1210
1211 /* Create SLP_TREE nodes for the definition node/s. */
1212 FOR_EACH_VEC_ELT (oprnds_info, i, oprnd_info)
1213 {
1214 slp_tree child;
1215 unsigned old_nloads = this_loads.length ();
1216 unsigned old_tree_size = this_tree_size;
1217 unsigned int j;
1218
1219 if (oprnd_info->first_dt != vect_internal_def
1220 && oprnd_info->first_dt != vect_reduction_def
1221 && oprnd_info->first_dt != vect_induction_def)
1222 continue;
1223
1224 if (++this_tree_size > max_tree_size)
1225 {
1226 FOR_EACH_VEC_ELT (children, j, child)
1227 vect_free_slp_tree (child);
1228 vect_free_oprnd_info (oprnds_info);
1229 return NULL;
1230 }
1231
1232 if ((child = vect_build_slp_tree (vinfo, oprnd_info->def_stmts,
1233 group_size, &this_max_nunits,
1234 &this_loads, matches, npermutes,
1235 &this_tree_size,
1236 max_tree_size)) != NULL)
1237 {
1238 /* If we have all children of child built up from scalars then just
1239 throw that away and build it up this node from scalars. */
1240 if (!SLP_TREE_CHILDREN (child).is_empty ()
1241 /* ??? Rejecting patterns this way doesn't work. We'd have to
1242 do extra work to cancel the pattern so the uses see the
1243 scalar version. */
1244 && !is_pattern_stmt_p
1245 (vinfo_for_stmt (SLP_TREE_SCALAR_STMTS (child)[0])))
1246 {
1247 slp_tree grandchild;
1248
1249 FOR_EACH_VEC_ELT (SLP_TREE_CHILDREN (child), j, grandchild)
1250 if (SLP_TREE_DEF_TYPE (grandchild) == vect_internal_def)
1251 break;
1252 if (!grandchild)
1253 {
1254 /* Roll back. */
1255 this_loads.truncate (old_nloads);
1256 this_tree_size = old_tree_size;
1257 FOR_EACH_VEC_ELT (SLP_TREE_CHILDREN (child), j, grandchild)
1258 vect_free_slp_tree (grandchild);
1259 SLP_TREE_CHILDREN (child).truncate (0);
1260
1261 dump_printf_loc (MSG_NOTE, vect_location,
1262 "Building parent vector operands from "
1263 "scalars instead\n");
1264 oprnd_info->def_stmts = vNULL;
1265 SLP_TREE_DEF_TYPE (child) = vect_external_def;
1266 children.safe_push (child);
1267 continue;
1268 }
1269 }
1270
1271 oprnd_info->def_stmts = vNULL;
1272 children.safe_push (child);
1273 continue;
1274 }
1275
1276 /* If the SLP build failed fatally and we analyze a basic-block
1277 simply treat nodes we fail to build as externally defined
1278 (and thus build vectors from the scalar defs).
1279 The cost model will reject outright expensive cases.
1280 ??? This doesn't treat cases where permutation ultimatively
1281 fails (or we don't try permutation below). Ideally we'd
1282 even compute a permutation that will end up with the maximum
1283 SLP tree size... */
1284 if (is_a <bb_vec_info> (vinfo)
1285 && !matches[0]
1286 /* ??? Rejecting patterns this way doesn't work. We'd have to
1287 do extra work to cancel the pattern so the uses see the
1288 scalar version. */
1289 && !is_pattern_stmt_p (vinfo_for_stmt (stmt)))
1290 {
1291 dump_printf_loc (MSG_NOTE, vect_location,
1292 "Building vector operands from scalars\n");
1293 child = vect_create_new_slp_node (oprnd_info->def_stmts);
1294 SLP_TREE_DEF_TYPE (child) = vect_external_def;
1295 children.safe_push (child);
1296 oprnd_info->def_stmts = vNULL;
1297 continue;
1298 }
1299
1300 /* If the SLP build for operand zero failed and operand zero
1301 and one can be commutated try that for the scalar stmts
1302 that failed the match. */
1303 if (i == 0
1304 /* A first scalar stmt mismatch signals a fatal mismatch. */
1305 && matches[0]
1306 /* ??? For COND_EXPRs we can swap the comparison operands
1307 as well as the arms under some constraints. */
1308 && nops == 2
1309 && oprnds_info[1]->first_dt == vect_internal_def
1310 && is_gimple_assign (stmt)
1311 /* Do so only if the number of not successful permutes was nor more
1312 than a cut-ff as re-trying the recursive match on
1313 possibly each level of the tree would expose exponential
1314 behavior. */
1315 && *npermutes < 4)
1316 {
1317 /* See whether we can swap the matching or the non-matching
1318 stmt operands. */
1319 bool swap_not_matching = true;
1320 do
1321 {
1322 for (j = 0; j < group_size; ++j)
1323 {
1324 if (matches[j] != !swap_not_matching)
1325 continue;
1326 gimple *stmt = stmts[j];
1327 /* Verify if we can swap operands of this stmt. */
1328 if (!is_gimple_assign (stmt)
1329 || !commutative_tree_code (gimple_assign_rhs_code (stmt)))
1330 {
1331 if (!swap_not_matching)
1332 goto fail;
1333 swap_not_matching = false;
1334 break;
1335 }
1336 /* Verify if we can safely swap or if we committed to a
1337 specific operand order already.
1338 ??? Instead of modifying GIMPLE stmts here we could
1339 record whether we want to swap operands in the SLP
1340 node and temporarily do that when processing it
1341 (or wrap operand accessors in a helper). */
1342 else if (swap[j] != 0
1343 || STMT_VINFO_NUM_SLP_USES (vinfo_for_stmt (stmt)))
1344 {
1345 if (!swap_not_matching)
1346 {
1347 if (dump_enabled_p ())
1348 {
1349 dump_printf_loc (MSG_MISSED_OPTIMIZATION,
1350 vect_location,
1351 "Build SLP failed: cannot swap "
1352 "operands of shared stmt ");
1353 dump_gimple_stmt (MSG_MISSED_OPTIMIZATION,
1354 TDF_SLIM, stmts[j], 0);
1355 }
1356 goto fail;
1357 }
1358 swap_not_matching = false;
1359 break;
1360 }
1361 }
1362 }
1363 while (j != group_size);
1364
1365 /* Swap mismatched definition stmts. */
1366 dump_printf_loc (MSG_NOTE, vect_location,
1367 "Re-trying with swapped operands of stmts ");
1368 for (j = 0; j < group_size; ++j)
1369 if (matches[j] == !swap_not_matching)
1370 {
1371 std::swap (oprnds_info[0]->def_stmts[j],
1372 oprnds_info[1]->def_stmts[j]);
1373 dump_printf (MSG_NOTE, "%d ", j);
1374 }
1375 dump_printf (MSG_NOTE, "\n");
1376 /* And try again with scratch 'matches' ... */
1377 bool *tem = XALLOCAVEC (bool, group_size);
1378 if ((child = vect_build_slp_tree (vinfo, oprnd_info->def_stmts,
1379 group_size, &this_max_nunits,
1380 &this_loads, tem, npermutes,
1381 &this_tree_size,
1382 max_tree_size)) != NULL)
1383 {
1384 /* ... so if successful we can apply the operand swapping
1385 to the GIMPLE IL. This is necessary because for example
1386 vect_get_slp_defs uses operand indexes and thus expects
1387 canonical operand order. This is also necessary even
1388 if we end up building the operand from scalars as
1389 we'll continue to process swapped operand two. */
1390 for (j = 0; j < group_size; ++j)
1391 {
1392 gimple *stmt = stmts[j];
1393 gimple_set_plf (stmt, GF_PLF_1, false);
1394 }
1395 for (j = 0; j < group_size; ++j)
1396 {
1397 gimple *stmt = stmts[j];
1398 if (matches[j] == !swap_not_matching)
1399 {
1400 /* Avoid swapping operands twice. */
1401 if (gimple_plf (stmt, GF_PLF_1))
1402 continue;
1403 swap_ssa_operands (stmt, gimple_assign_rhs1_ptr (stmt),
1404 gimple_assign_rhs2_ptr (stmt));
1405 gimple_set_plf (stmt, GF_PLF_1, true);
1406 }
1407 }
1408 /* Verify we swap all duplicates or none. */
1409 if (flag_checking)
1410 for (j = 0; j < group_size; ++j)
1411 {
1412 gimple *stmt = stmts[j];
1413 gcc_assert (gimple_plf (stmt, GF_PLF_1)
1414 == (matches[j] == !swap_not_matching));
1415 }
1416
1417 /* If we have all children of child built up from scalars then
1418 just throw that away and build it up this node from scalars. */
1419 if (!SLP_TREE_CHILDREN (child).is_empty ()
1420 /* ??? Rejecting patterns this way doesn't work. We'd have
1421 to do extra work to cancel the pattern so the uses see the
1422 scalar version. */
1423 && !is_pattern_stmt_p
1424 (vinfo_for_stmt (SLP_TREE_SCALAR_STMTS (child)[0])))
1425 {
1426 unsigned int j;
1427 slp_tree grandchild;
1428
1429 FOR_EACH_VEC_ELT (SLP_TREE_CHILDREN (child), j, grandchild)
1430 if (SLP_TREE_DEF_TYPE (grandchild) == vect_internal_def)
1431 break;
1432 if (!grandchild)
1433 {
1434 /* Roll back. */
1435 this_loads.truncate (old_nloads);
1436 this_tree_size = old_tree_size;
1437 FOR_EACH_VEC_ELT (SLP_TREE_CHILDREN (child), j, grandchild)
1438 vect_free_slp_tree (grandchild);
1439 SLP_TREE_CHILDREN (child).truncate (0);
1440
1441 dump_printf_loc (MSG_NOTE, vect_location,
1442 "Building parent vector operands from "
1443 "scalars instead\n");
1444 oprnd_info->def_stmts = vNULL;
1445 SLP_TREE_DEF_TYPE (child) = vect_external_def;
1446 children.safe_push (child);
1447 continue;
1448 }
1449 }
1450
1451 oprnd_info->def_stmts = vNULL;
1452 children.safe_push (child);
1453 continue;
1454 }
1455
1456 ++*npermutes;
1457 }
1458
1459 fail:
1460 gcc_assert (child == NULL);
1461 FOR_EACH_VEC_ELT (children, j, child)
1462 vect_free_slp_tree (child);
1463 vect_free_oprnd_info (oprnds_info);
1464 return NULL;
1465 }
1466
1467 vect_free_oprnd_info (oprnds_info);
1468
1469 if (tree_size)
1470 *tree_size += this_tree_size;
1471 *max_nunits = this_max_nunits;
1472 loads->safe_splice (this_loads);
1473
1474 node = vect_create_new_slp_node (stmts);
1475 SLP_TREE_TWO_OPERATORS (node) = two_operators;
1476 SLP_TREE_CHILDREN (node).splice (children);
1477 return node;
1478 }
1479
1480 /* Dump a slp tree NODE using flags specified in DUMP_KIND. */
1481
1482 static void
vect_print_slp_tree(dump_flags_t dump_kind,location_t loc,slp_tree node)1483 vect_print_slp_tree (dump_flags_t dump_kind, location_t loc, slp_tree node)
1484 {
1485 int i;
1486 gimple *stmt;
1487 slp_tree child;
1488
1489 dump_printf_loc (dump_kind, loc, "node%s\n",
1490 SLP_TREE_DEF_TYPE (node) != vect_internal_def
1491 ? " (external)" : "");
1492 FOR_EACH_VEC_ELT (SLP_TREE_SCALAR_STMTS (node), i, stmt)
1493 {
1494 dump_printf_loc (dump_kind, loc, "\tstmt %d ", i);
1495 dump_gimple_stmt (dump_kind, TDF_SLIM, stmt, 0);
1496 }
1497 FOR_EACH_VEC_ELT (SLP_TREE_CHILDREN (node), i, child)
1498 vect_print_slp_tree (dump_kind, loc, child);
1499 }
1500
1501
1502 /* Mark the tree rooted at NODE with MARK (PURE_SLP or HYBRID).
1503 If MARK is HYBRID, it refers to a specific stmt in NODE (the stmt at index
1504 J). Otherwise, MARK is PURE_SLP and J is -1, which indicates that all the
1505 stmts in NODE are to be marked. */
1506
1507 static void
vect_mark_slp_stmts(slp_tree node,enum slp_vect_type mark,int j)1508 vect_mark_slp_stmts (slp_tree node, enum slp_vect_type mark, int j)
1509 {
1510 int i;
1511 gimple *stmt;
1512 slp_tree child;
1513
1514 if (SLP_TREE_DEF_TYPE (node) != vect_internal_def)
1515 return;
1516
1517 FOR_EACH_VEC_ELT (SLP_TREE_SCALAR_STMTS (node), i, stmt)
1518 if (j < 0 || i == j)
1519 STMT_SLP_TYPE (vinfo_for_stmt (stmt)) = mark;
1520
1521 FOR_EACH_VEC_ELT (SLP_TREE_CHILDREN (node), i, child)
1522 vect_mark_slp_stmts (child, mark, j);
1523 }
1524
1525
1526 /* Mark the statements of the tree rooted at NODE as relevant (vect_used). */
1527
1528 static void
vect_mark_slp_stmts_relevant(slp_tree node)1529 vect_mark_slp_stmts_relevant (slp_tree node)
1530 {
1531 int i;
1532 gimple *stmt;
1533 stmt_vec_info stmt_info;
1534 slp_tree child;
1535
1536 if (SLP_TREE_DEF_TYPE (node) != vect_internal_def)
1537 return;
1538
1539 FOR_EACH_VEC_ELT (SLP_TREE_SCALAR_STMTS (node), i, stmt)
1540 {
1541 stmt_info = vinfo_for_stmt (stmt);
1542 gcc_assert (!STMT_VINFO_RELEVANT (stmt_info)
1543 || STMT_VINFO_RELEVANT (stmt_info) == vect_used_in_scope);
1544 STMT_VINFO_RELEVANT (stmt_info) = vect_used_in_scope;
1545 }
1546
1547 FOR_EACH_VEC_ELT (SLP_TREE_CHILDREN (node), i, child)
1548 vect_mark_slp_stmts_relevant (child);
1549 }
1550
1551
1552 /* Rearrange the statements of NODE according to PERMUTATION. */
1553
1554 static void
vect_slp_rearrange_stmts(slp_tree node,unsigned int group_size,vec<unsigned> permutation)1555 vect_slp_rearrange_stmts (slp_tree node, unsigned int group_size,
1556 vec<unsigned> permutation)
1557 {
1558 gimple *stmt;
1559 vec<gimple *> tmp_stmts;
1560 unsigned int i;
1561 slp_tree child;
1562
1563 FOR_EACH_VEC_ELT (SLP_TREE_CHILDREN (node), i, child)
1564 vect_slp_rearrange_stmts (child, group_size, permutation);
1565
1566 gcc_assert (group_size == SLP_TREE_SCALAR_STMTS (node).length ());
1567 tmp_stmts.create (group_size);
1568 tmp_stmts.quick_grow_cleared (group_size);
1569
1570 FOR_EACH_VEC_ELT (SLP_TREE_SCALAR_STMTS (node), i, stmt)
1571 tmp_stmts[permutation[i]] = stmt;
1572
1573 SLP_TREE_SCALAR_STMTS (node).release ();
1574 SLP_TREE_SCALAR_STMTS (node) = tmp_stmts;
1575 }
1576
1577
1578 /* Attempt to reorder stmts in a reduction chain so that we don't
1579 require any load permutation. Return true if that was possible,
1580 otherwise return false. */
1581
1582 static bool
vect_attempt_slp_rearrange_stmts(slp_instance slp_instn)1583 vect_attempt_slp_rearrange_stmts (slp_instance slp_instn)
1584 {
1585 unsigned int group_size = SLP_INSTANCE_GROUP_SIZE (slp_instn);
1586 unsigned int i, j;
1587 unsigned int lidx;
1588 slp_tree node, load;
1589
1590 /* Compare all the permutation sequences to the first one. We know
1591 that at least one load is permuted. */
1592 node = SLP_INSTANCE_LOADS (slp_instn)[0];
1593 if (!node->load_permutation.exists ())
1594 return false;
1595 for (i = 1; SLP_INSTANCE_LOADS (slp_instn).iterate (i, &load); ++i)
1596 {
1597 if (!load->load_permutation.exists ())
1598 return false;
1599 FOR_EACH_VEC_ELT (load->load_permutation, j, lidx)
1600 if (lidx != node->load_permutation[j])
1601 return false;
1602 }
1603
1604 /* Check that the loads in the first sequence are different and there
1605 are no gaps between them. */
1606 auto_sbitmap load_index (group_size);
1607 bitmap_clear (load_index);
1608 FOR_EACH_VEC_ELT (node->load_permutation, i, lidx)
1609 {
1610 if (lidx >= group_size)
1611 return false;
1612 if (bitmap_bit_p (load_index, lidx))
1613 return false;
1614
1615 bitmap_set_bit (load_index, lidx);
1616 }
1617 for (i = 0; i < group_size; i++)
1618 if (!bitmap_bit_p (load_index, i))
1619 return false;
1620
1621 /* This permutation is valid for reduction. Since the order of the
1622 statements in the nodes is not important unless they are memory
1623 accesses, we can rearrange the statements in all the nodes
1624 according to the order of the loads. */
1625 vect_slp_rearrange_stmts (SLP_INSTANCE_TREE (slp_instn), group_size,
1626 node->load_permutation);
1627
1628 /* We are done, no actual permutations need to be generated. */
1629 poly_uint64 unrolling_factor = SLP_INSTANCE_UNROLLING_FACTOR (slp_instn);
1630 FOR_EACH_VEC_ELT (SLP_INSTANCE_LOADS (slp_instn), i, node)
1631 {
1632 gimple *first_stmt = SLP_TREE_SCALAR_STMTS (node)[0];
1633 first_stmt = GROUP_FIRST_ELEMENT (vinfo_for_stmt (first_stmt));
1634 /* But we have to keep those permutations that are required because
1635 of handling of gaps. */
1636 if (known_eq (unrolling_factor, 1U)
1637 || (group_size == GROUP_SIZE (vinfo_for_stmt (first_stmt))
1638 && GROUP_GAP (vinfo_for_stmt (first_stmt)) == 0))
1639 SLP_TREE_LOAD_PERMUTATION (node).release ();
1640 else
1641 for (j = 0; j < SLP_TREE_LOAD_PERMUTATION (node).length (); ++j)
1642 SLP_TREE_LOAD_PERMUTATION (node)[j] = j;
1643 }
1644
1645 return true;
1646 }
1647
1648 /* Check if the required load permutations in the SLP instance
1649 SLP_INSTN are supported. */
1650
1651 static bool
vect_supported_load_permutation_p(slp_instance slp_instn)1652 vect_supported_load_permutation_p (slp_instance slp_instn)
1653 {
1654 unsigned int group_size = SLP_INSTANCE_GROUP_SIZE (slp_instn);
1655 unsigned int i, j, k, next;
1656 slp_tree node;
1657 gimple *stmt, *load, *next_load;
1658
1659 if (dump_enabled_p ())
1660 {
1661 dump_printf_loc (MSG_NOTE, vect_location, "Load permutation ");
1662 FOR_EACH_VEC_ELT (SLP_INSTANCE_LOADS (slp_instn), i, node)
1663 if (node->load_permutation.exists ())
1664 FOR_EACH_VEC_ELT (node->load_permutation, j, next)
1665 dump_printf (MSG_NOTE, "%d ", next);
1666 else
1667 for (k = 0; k < group_size; ++k)
1668 dump_printf (MSG_NOTE, "%d ", k);
1669 dump_printf (MSG_NOTE, "\n");
1670 }
1671
1672 /* In case of reduction every load permutation is allowed, since the order
1673 of the reduction statements is not important (as opposed to the case of
1674 grouped stores). The only condition we need to check is that all the
1675 load nodes are of the same size and have the same permutation (and then
1676 rearrange all the nodes of the SLP instance according to this
1677 permutation). */
1678
1679 /* Check that all the load nodes are of the same size. */
1680 /* ??? Can't we assert this? */
1681 FOR_EACH_VEC_ELT (SLP_INSTANCE_LOADS (slp_instn), i, node)
1682 if (SLP_TREE_SCALAR_STMTS (node).length () != (unsigned) group_size)
1683 return false;
1684
1685 node = SLP_INSTANCE_TREE (slp_instn);
1686 stmt = SLP_TREE_SCALAR_STMTS (node)[0];
1687
1688 /* Reduction (there are no data-refs in the root).
1689 In reduction chain the order of the loads is not important. */
1690 if (!STMT_VINFO_DATA_REF (vinfo_for_stmt (stmt))
1691 && !GROUP_FIRST_ELEMENT (vinfo_for_stmt (stmt)))
1692 vect_attempt_slp_rearrange_stmts (slp_instn);
1693
1694 /* In basic block vectorization we allow any subchain of an interleaving
1695 chain.
1696 FORNOW: not supported in loop SLP because of realignment compications. */
1697 if (STMT_VINFO_BB_VINFO (vinfo_for_stmt (stmt)))
1698 {
1699 /* Check whether the loads in an instance form a subchain and thus
1700 no permutation is necessary. */
1701 FOR_EACH_VEC_ELT (SLP_INSTANCE_LOADS (slp_instn), i, node)
1702 {
1703 if (!SLP_TREE_LOAD_PERMUTATION (node).exists ())
1704 continue;
1705 bool subchain_p = true;
1706 next_load = NULL;
1707 FOR_EACH_VEC_ELT (SLP_TREE_SCALAR_STMTS (node), j, load)
1708 {
1709 if (j != 0
1710 && (next_load != load
1711 || GROUP_GAP (vinfo_for_stmt (load)) != 1))
1712 {
1713 subchain_p = false;
1714 break;
1715 }
1716 next_load = GROUP_NEXT_ELEMENT (vinfo_for_stmt (load));
1717 }
1718 if (subchain_p)
1719 SLP_TREE_LOAD_PERMUTATION (node).release ();
1720 else
1721 {
1722 stmt_vec_info group_info
1723 = vinfo_for_stmt (SLP_TREE_SCALAR_STMTS (node)[0]);
1724 group_info = vinfo_for_stmt (GROUP_FIRST_ELEMENT (group_info));
1725 unsigned HOST_WIDE_INT nunits;
1726 unsigned k, maxk = 0;
1727 FOR_EACH_VEC_ELT (SLP_TREE_LOAD_PERMUTATION (node), j, k)
1728 if (k > maxk)
1729 maxk = k;
1730 /* In BB vectorization we may not actually use a loaded vector
1731 accessing elements in excess of GROUP_SIZE. */
1732 tree vectype = STMT_VINFO_VECTYPE (group_info);
1733 if (!TYPE_VECTOR_SUBPARTS (vectype).is_constant (&nunits)
1734 || maxk >= (GROUP_SIZE (group_info) & ~(nunits - 1)))
1735 {
1736 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
1737 "BB vectorization with gaps at the end of "
1738 "a load is not supported\n");
1739 return false;
1740 }
1741
1742 /* Verify the permutation can be generated. */
1743 vec<tree> tem;
1744 unsigned n_perms;
1745 if (!vect_transform_slp_perm_load (node, tem, NULL,
1746 1, slp_instn, true, &n_perms))
1747 {
1748 dump_printf_loc (MSG_MISSED_OPTIMIZATION,
1749 vect_location,
1750 "unsupported load permutation\n");
1751 return false;
1752 }
1753 }
1754 }
1755 return true;
1756 }
1757
1758 /* For loop vectorization verify we can generate the permutation. Be
1759 conservative about the vectorization factor, there are permutations
1760 that will use three vector inputs only starting from a specific factor
1761 and the vectorization factor is not yet final.
1762 ??? The SLP instance unrolling factor might not be the maximum one. */
1763 unsigned n_perms;
1764 poly_uint64 test_vf
1765 = force_common_multiple (SLP_INSTANCE_UNROLLING_FACTOR (slp_instn),
1766 LOOP_VINFO_VECT_FACTOR
1767 (STMT_VINFO_LOOP_VINFO (vinfo_for_stmt (stmt))));
1768 FOR_EACH_VEC_ELT (SLP_INSTANCE_LOADS (slp_instn), i, node)
1769 if (node->load_permutation.exists ()
1770 && !vect_transform_slp_perm_load (node, vNULL, NULL, test_vf,
1771 slp_instn, true, &n_perms))
1772 return false;
1773
1774 return true;
1775 }
1776
1777
1778 /* Find the last store in SLP INSTANCE. */
1779
1780 gimple *
vect_find_last_scalar_stmt_in_slp(slp_tree node)1781 vect_find_last_scalar_stmt_in_slp (slp_tree node)
1782 {
1783 gimple *last = NULL, *stmt;
1784
1785 for (int i = 0; SLP_TREE_SCALAR_STMTS (node).iterate (i, &stmt); i++)
1786 {
1787 stmt_vec_info stmt_vinfo = vinfo_for_stmt (stmt);
1788 if (is_pattern_stmt_p (stmt_vinfo))
1789 last = get_later_stmt (STMT_VINFO_RELATED_STMT (stmt_vinfo), last);
1790 else
1791 last = get_later_stmt (stmt, last);
1792 }
1793
1794 return last;
1795 }
1796
1797 /* Compute the cost for the SLP node NODE in the SLP instance INSTANCE. */
1798
1799 static void
vect_analyze_slp_cost_1(slp_instance instance,slp_tree node,stmt_vector_for_cost * prologue_cost_vec,stmt_vector_for_cost * body_cost_vec,unsigned ncopies_for_cost,scalar_stmts_set_t * visited)1800 vect_analyze_slp_cost_1 (slp_instance instance, slp_tree node,
1801 stmt_vector_for_cost *prologue_cost_vec,
1802 stmt_vector_for_cost *body_cost_vec,
1803 unsigned ncopies_for_cost,
1804 scalar_stmts_set_t* visited)
1805 {
1806 unsigned i, j;
1807 slp_tree child;
1808 gimple *stmt;
1809 stmt_vec_info stmt_info;
1810 tree lhs;
1811
1812 /* If we already costed the exact same set of scalar stmts we're done.
1813 We share the generated vector stmts for those. */
1814 if (visited->contains (SLP_TREE_SCALAR_STMTS (node)))
1815 return;
1816
1817 visited->add (SLP_TREE_SCALAR_STMTS (node).copy ());
1818
1819 /* Recurse down the SLP tree. */
1820 FOR_EACH_VEC_ELT (SLP_TREE_CHILDREN (node), i, child)
1821 if (SLP_TREE_DEF_TYPE (child) == vect_internal_def)
1822 vect_analyze_slp_cost_1 (instance, child, prologue_cost_vec,
1823 body_cost_vec, ncopies_for_cost, visited);
1824
1825 /* Look at the first scalar stmt to determine the cost. */
1826 stmt = SLP_TREE_SCALAR_STMTS (node)[0];
1827 stmt_info = vinfo_for_stmt (stmt);
1828 if (STMT_VINFO_GROUPED_ACCESS (stmt_info))
1829 {
1830 vect_memory_access_type memory_access_type
1831 = (STMT_VINFO_STRIDED_P (stmt_info)
1832 ? VMAT_STRIDED_SLP
1833 : VMAT_CONTIGUOUS);
1834 if (DR_IS_WRITE (STMT_VINFO_DATA_REF (stmt_info)))
1835 vect_model_store_cost (stmt_info, ncopies_for_cost,
1836 memory_access_type, VLS_STORE,
1837 node, prologue_cost_vec, body_cost_vec);
1838 else
1839 {
1840 gcc_checking_assert (DR_IS_READ (STMT_VINFO_DATA_REF (stmt_info)));
1841 if (SLP_TREE_LOAD_PERMUTATION (node).exists ())
1842 {
1843 /* If the load is permuted then the alignment is determined by
1844 the first group element not by the first scalar stmt DR. */
1845 stmt = GROUP_FIRST_ELEMENT (stmt_info);
1846 stmt_info = vinfo_for_stmt (stmt);
1847 /* Record the cost for the permutation. */
1848 unsigned n_perms;
1849 vect_transform_slp_perm_load (node, vNULL, NULL,
1850 ncopies_for_cost, instance, true,
1851 &n_perms);
1852 record_stmt_cost (body_cost_vec, n_perms, vec_perm,
1853 stmt_info, 0, vect_body);
1854 unsigned assumed_nunits
1855 = vect_nunits_for_cost (STMT_VINFO_VECTYPE (stmt_info));
1856 /* And adjust the number of loads performed. This handles
1857 redundancies as well as loads that are later dead. */
1858 auto_sbitmap perm (GROUP_SIZE (stmt_info));
1859 bitmap_clear (perm);
1860 for (i = 0; i < SLP_TREE_LOAD_PERMUTATION (node).length (); ++i)
1861 bitmap_set_bit (perm, SLP_TREE_LOAD_PERMUTATION (node)[i]);
1862 ncopies_for_cost = 0;
1863 bool load_seen = false;
1864 for (i = 0; i < GROUP_SIZE (stmt_info); ++i)
1865 {
1866 if (i % assumed_nunits == 0)
1867 {
1868 if (load_seen)
1869 ncopies_for_cost++;
1870 load_seen = false;
1871 }
1872 if (bitmap_bit_p (perm, i))
1873 load_seen = true;
1874 }
1875 if (load_seen)
1876 ncopies_for_cost++;
1877 gcc_assert (ncopies_for_cost
1878 <= (GROUP_SIZE (stmt_info) - GROUP_GAP (stmt_info)
1879 + assumed_nunits - 1) / assumed_nunits);
1880 poly_uint64 uf = SLP_INSTANCE_UNROLLING_FACTOR (instance);
1881 ncopies_for_cost *= estimated_poly_value (uf);
1882 }
1883 /* Record the cost for the vector loads. */
1884 vect_model_load_cost (stmt_info, ncopies_for_cost,
1885 memory_access_type, node, prologue_cost_vec,
1886 body_cost_vec);
1887 return;
1888 }
1889 }
1890 else if (STMT_VINFO_TYPE (stmt_info) == induc_vec_info_type)
1891 {
1892 /* ncopies_for_cost is the number of IVs we generate. */
1893 record_stmt_cost (body_cost_vec, ncopies_for_cost, vector_stmt,
1894 stmt_info, 0, vect_body);
1895
1896 /* Prologue cost for the initial values and step vector. */
1897 record_stmt_cost (prologue_cost_vec, ncopies_for_cost,
1898 CONSTANT_CLASS_P
1899 (STMT_VINFO_LOOP_PHI_EVOLUTION_BASE_UNCHANGED
1900 (stmt_info))
1901 ? vector_load : vec_construct,
1902 stmt_info, 0, vect_prologue);
1903 record_stmt_cost (prologue_cost_vec, 1,
1904 CONSTANT_CLASS_P
1905 (STMT_VINFO_LOOP_PHI_EVOLUTION_PART (stmt_info))
1906 ? vector_load : vec_construct,
1907 stmt_info, 0, vect_prologue);
1908
1909 /* ??? No easy way to get at the actual number of vector stmts
1910 to be geneated and thus the derived IVs. */
1911 }
1912 else
1913 {
1914 record_stmt_cost (body_cost_vec, ncopies_for_cost, vector_stmt,
1915 stmt_info, 0, vect_body);
1916 if (SLP_TREE_TWO_OPERATORS (node))
1917 {
1918 record_stmt_cost (body_cost_vec, ncopies_for_cost, vector_stmt,
1919 stmt_info, 0, vect_body);
1920 record_stmt_cost (body_cost_vec, ncopies_for_cost, vec_perm,
1921 stmt_info, 0, vect_body);
1922 }
1923 }
1924
1925 /* Push SLP node def-type to stmts. */
1926 FOR_EACH_VEC_ELT (SLP_TREE_CHILDREN (node), i, child)
1927 if (SLP_TREE_DEF_TYPE (child) != vect_internal_def)
1928 FOR_EACH_VEC_ELT (SLP_TREE_SCALAR_STMTS (child), j, stmt)
1929 STMT_VINFO_DEF_TYPE (vinfo_for_stmt (stmt)) = SLP_TREE_DEF_TYPE (child);
1930
1931 /* Scan operands and account for prologue cost of constants/externals.
1932 ??? This over-estimates cost for multiple uses and should be
1933 re-engineered. */
1934 stmt = SLP_TREE_SCALAR_STMTS (node)[0];
1935 lhs = gimple_get_lhs (stmt);
1936 for (i = 0; i < gimple_num_ops (stmt); ++i)
1937 {
1938 tree op = gimple_op (stmt, i);
1939 gimple *def_stmt;
1940 enum vect_def_type dt;
1941 if (!op || op == lhs)
1942 continue;
1943 if (vect_is_simple_use (op, stmt_info->vinfo, &def_stmt, &dt)
1944 && (dt == vect_constant_def || dt == vect_external_def))
1945 {
1946 /* Without looking at the actual initializer a vector of
1947 constants can be implemented as load from the constant pool.
1948 When all elements are the same we can use a splat. */
1949 tree vectype = get_vectype_for_scalar_type (TREE_TYPE (op));
1950 unsigned group_size = SLP_TREE_SCALAR_STMTS (node).length ();
1951 unsigned num_vects_to_check;
1952 unsigned HOST_WIDE_INT const_nunits;
1953 unsigned nelt_limit;
1954 if (TYPE_VECTOR_SUBPARTS (vectype).is_constant (&const_nunits)
1955 && ! multiple_p (const_nunits, group_size))
1956 {
1957 num_vects_to_check = SLP_TREE_NUMBER_OF_VEC_STMTS (node);
1958 nelt_limit = const_nunits;
1959 }
1960 else
1961 {
1962 /* If either the vector has variable length or the vectors
1963 are composed of repeated whole groups we only need to
1964 cost construction once. All vectors will be the same. */
1965 num_vects_to_check = 1;
1966 nelt_limit = group_size;
1967 }
1968 tree elt = NULL_TREE;
1969 unsigned nelt = 0;
1970 for (unsigned j = 0; j < num_vects_to_check * nelt_limit; ++j)
1971 {
1972 unsigned si = j % group_size;
1973 if (nelt == 0)
1974 elt = gimple_op (SLP_TREE_SCALAR_STMTS (node)[si], i);
1975 /* ??? We're just tracking whether all operands of a single
1976 vector initializer are the same, ideally we'd check if
1977 we emitted the same one already. */
1978 else if (elt != gimple_op (SLP_TREE_SCALAR_STMTS (node)[si], i))
1979 elt = NULL_TREE;
1980 nelt++;
1981 if (nelt == nelt_limit)
1982 {
1983 /* ??? We need to pass down stmt_info for a vector type
1984 even if it points to the wrong stmt. */
1985 record_stmt_cost (prologue_cost_vec, 1,
1986 dt == vect_external_def
1987 ? (elt ? scalar_to_vec : vec_construct)
1988 : vector_load,
1989 stmt_info, 0, vect_prologue);
1990 nelt = 0;
1991 }
1992 }
1993 }
1994 }
1995
1996 /* Restore stmt def-types. */
1997 FOR_EACH_VEC_ELT (SLP_TREE_CHILDREN (node), i, child)
1998 if (SLP_TREE_DEF_TYPE (child) != vect_internal_def)
1999 FOR_EACH_VEC_ELT (SLP_TREE_SCALAR_STMTS (child), j, stmt)
2000 STMT_VINFO_DEF_TYPE (vinfo_for_stmt (stmt)) = vect_internal_def;
2001 }
2002
2003 /* Compute the cost for the SLP instance INSTANCE. */
2004
2005 static void
vect_analyze_slp_cost(slp_instance instance,void * data,scalar_stmts_set_t * visited)2006 vect_analyze_slp_cost (slp_instance instance, void *data, scalar_stmts_set_t *visited)
2007 {
2008 stmt_vector_for_cost body_cost_vec, prologue_cost_vec;
2009 unsigned ncopies_for_cost;
2010 stmt_info_for_cost *si;
2011 unsigned i;
2012
2013 /* Calculate the number of vector stmts to create based on the unrolling
2014 factor (number of vectors is 1 if NUNITS >= GROUP_SIZE, and is
2015 GROUP_SIZE / NUNITS otherwise. */
2016 unsigned group_size = SLP_INSTANCE_GROUP_SIZE (instance);
2017 slp_tree node = SLP_INSTANCE_TREE (instance);
2018 stmt_vec_info stmt_info = vinfo_for_stmt (SLP_TREE_SCALAR_STMTS (node)[0]);
2019 /* Get the estimated vectorization factor, which is always one for
2020 basic-block vectorization. */
2021 unsigned int assumed_vf;
2022 if (STMT_VINFO_LOOP_VINFO (stmt_info))
2023 assumed_vf = vect_vf_for_cost (STMT_VINFO_LOOP_VINFO (stmt_info));
2024 else
2025 assumed_vf = 1;
2026 /* For reductions look at a reduction operand in case the reduction
2027 operation is widening like DOT_PROD or SAD. */
2028 tree vectype_for_cost = STMT_VINFO_VECTYPE (stmt_info);
2029 if (!STMT_VINFO_GROUPED_ACCESS (stmt_info))
2030 {
2031 gimple *stmt = SLP_TREE_SCALAR_STMTS (node)[0];
2032 switch (gimple_assign_rhs_code (stmt))
2033 {
2034 case DOT_PROD_EXPR:
2035 case SAD_EXPR:
2036 vectype_for_cost = get_vectype_for_scalar_type
2037 (TREE_TYPE (gimple_assign_rhs1 (stmt)));
2038 break;
2039 default:;
2040 }
2041 }
2042 unsigned int assumed_nunits = vect_nunits_for_cost (vectype_for_cost);
2043 ncopies_for_cost = (least_common_multiple (assumed_nunits,
2044 group_size * assumed_vf)
2045 / assumed_nunits);
2046
2047 prologue_cost_vec.create (10);
2048 body_cost_vec.create (10);
2049 vect_analyze_slp_cost_1 (instance, SLP_INSTANCE_TREE (instance),
2050 &prologue_cost_vec, &body_cost_vec,
2051 ncopies_for_cost, visited);
2052
2053 /* Record the prologue costs, which were delayed until we were
2054 sure that SLP was successful. */
2055 FOR_EACH_VEC_ELT (prologue_cost_vec, i, si)
2056 {
2057 struct _stmt_vec_info *stmt_info
2058 = si->stmt ? vinfo_for_stmt (si->stmt) : NULL;
2059 (void) add_stmt_cost (data, si->count, si->kind, stmt_info,
2060 si->misalign, vect_prologue);
2061 }
2062
2063 /* Record the instance's instructions in the target cost model. */
2064 FOR_EACH_VEC_ELT (body_cost_vec, i, si)
2065 {
2066 struct _stmt_vec_info *stmt_info
2067 = si->stmt ? vinfo_for_stmt (si->stmt) : NULL;
2068 (void) add_stmt_cost (data, si->count, si->kind, stmt_info,
2069 si->misalign, vect_body);
2070 }
2071
2072 prologue_cost_vec.release ();
2073 body_cost_vec.release ();
2074 }
2075
2076 /* Splits a group of stores, currently beginning at FIRST_STMT, into two groups:
2077 one (still beginning at FIRST_STMT) of size GROUP1_SIZE (also containing
2078 the first GROUP1_SIZE stmts, since stores are consecutive), the second
2079 containing the remainder.
2080 Return the first stmt in the second group. */
2081
2082 static gimple *
vect_split_slp_store_group(gimple * first_stmt,unsigned group1_size)2083 vect_split_slp_store_group (gimple *first_stmt, unsigned group1_size)
2084 {
2085 stmt_vec_info first_vinfo = vinfo_for_stmt (first_stmt);
2086 gcc_assert (GROUP_FIRST_ELEMENT (first_vinfo) == first_stmt);
2087 gcc_assert (group1_size > 0);
2088 int group2_size = GROUP_SIZE (first_vinfo) - group1_size;
2089 gcc_assert (group2_size > 0);
2090 GROUP_SIZE (first_vinfo) = group1_size;
2091
2092 gimple *stmt = first_stmt;
2093 for (unsigned i = group1_size; i > 1; i--)
2094 {
2095 stmt = GROUP_NEXT_ELEMENT (vinfo_for_stmt (stmt));
2096 gcc_assert (GROUP_GAP (vinfo_for_stmt (stmt)) == 1);
2097 }
2098 /* STMT is now the last element of the first group. */
2099 gimple *group2 = GROUP_NEXT_ELEMENT (vinfo_for_stmt (stmt));
2100 GROUP_NEXT_ELEMENT (vinfo_for_stmt (stmt)) = 0;
2101
2102 GROUP_SIZE (vinfo_for_stmt (group2)) = group2_size;
2103 for (stmt = group2; stmt; stmt = GROUP_NEXT_ELEMENT (vinfo_for_stmt (stmt)))
2104 {
2105 GROUP_FIRST_ELEMENT (vinfo_for_stmt (stmt)) = group2;
2106 gcc_assert (GROUP_GAP (vinfo_for_stmt (stmt)) == 1);
2107 }
2108
2109 /* For the second group, the GROUP_GAP is that before the original group,
2110 plus skipping over the first vector. */
2111 GROUP_GAP (vinfo_for_stmt (group2)) =
2112 GROUP_GAP (first_vinfo) + group1_size;
2113
2114 /* GROUP_GAP of the first group now has to skip over the second group too. */
2115 GROUP_GAP (first_vinfo) += group2_size;
2116
2117 if (dump_enabled_p ())
2118 dump_printf_loc (MSG_NOTE, vect_location, "Split group into %d and %d\n",
2119 group1_size, group2_size);
2120
2121 return group2;
2122 }
2123
2124 /* Calculate the unrolling factor for an SLP instance with GROUP_SIZE
2125 statements and a vector of NUNITS elements. */
2126
2127 static poly_uint64
calculate_unrolling_factor(poly_uint64 nunits,unsigned int group_size)2128 calculate_unrolling_factor (poly_uint64 nunits, unsigned int group_size)
2129 {
2130 return exact_div (common_multiple (nunits, group_size), group_size);
2131 }
2132
2133 /* Analyze an SLP instance starting from a group of grouped stores. Call
2134 vect_build_slp_tree to build a tree of packed stmts if possible.
2135 Return FALSE if it's impossible to SLP any stmt in the loop. */
2136
2137 static bool
vect_analyze_slp_instance(vec_info * vinfo,gimple * stmt,unsigned max_tree_size)2138 vect_analyze_slp_instance (vec_info *vinfo,
2139 gimple *stmt, unsigned max_tree_size)
2140 {
2141 slp_instance new_instance;
2142 slp_tree node;
2143 unsigned int group_size = GROUP_SIZE (vinfo_for_stmt (stmt));
2144 tree vectype, scalar_type = NULL_TREE;
2145 gimple *next;
2146 unsigned int i;
2147 vec<slp_tree> loads;
2148 struct data_reference *dr = STMT_VINFO_DATA_REF (vinfo_for_stmt (stmt));
2149 vec<gimple *> scalar_stmts;
2150
2151 if (GROUP_FIRST_ELEMENT (vinfo_for_stmt (stmt)))
2152 {
2153 if (dr)
2154 {
2155 scalar_type = TREE_TYPE (DR_REF (dr));
2156 vectype = get_vectype_for_scalar_type (scalar_type);
2157 }
2158 else
2159 {
2160 gcc_assert (is_a <loop_vec_info> (vinfo));
2161 vectype = STMT_VINFO_VECTYPE (vinfo_for_stmt (stmt));
2162 }
2163
2164 group_size = GROUP_SIZE (vinfo_for_stmt (stmt));
2165 }
2166 else
2167 {
2168 gcc_assert (is_a <loop_vec_info> (vinfo));
2169 vectype = STMT_VINFO_VECTYPE (vinfo_for_stmt (stmt));
2170 group_size = as_a <loop_vec_info> (vinfo)->reductions.length ();
2171 }
2172
2173 if (!vectype)
2174 {
2175 if (dump_enabled_p ())
2176 {
2177 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
2178 "Build SLP failed: unsupported data-type ");
2179 dump_generic_expr (MSG_MISSED_OPTIMIZATION, TDF_SLIM, scalar_type);
2180 dump_printf (MSG_MISSED_OPTIMIZATION, "\n");
2181 }
2182
2183 return false;
2184 }
2185 poly_uint64 nunits = TYPE_VECTOR_SUBPARTS (vectype);
2186
2187 /* Create a node (a root of the SLP tree) for the packed grouped stores. */
2188 scalar_stmts.create (group_size);
2189 next = stmt;
2190 if (GROUP_FIRST_ELEMENT (vinfo_for_stmt (stmt)))
2191 {
2192 /* Collect the stores and store them in SLP_TREE_SCALAR_STMTS. */
2193 while (next)
2194 {
2195 if (STMT_VINFO_IN_PATTERN_P (vinfo_for_stmt (next))
2196 && STMT_VINFO_RELATED_STMT (vinfo_for_stmt (next)))
2197 scalar_stmts.safe_push (
2198 STMT_VINFO_RELATED_STMT (vinfo_for_stmt (next)));
2199 else
2200 scalar_stmts.safe_push (next);
2201 next = GROUP_NEXT_ELEMENT (vinfo_for_stmt (next));
2202 }
2203 /* Mark the first element of the reduction chain as reduction to properly
2204 transform the node. In the reduction analysis phase only the last
2205 element of the chain is marked as reduction. */
2206 if (!STMT_VINFO_GROUPED_ACCESS (vinfo_for_stmt (stmt)))
2207 STMT_VINFO_DEF_TYPE (vinfo_for_stmt (stmt)) = vect_reduction_def;
2208 }
2209 else
2210 {
2211 /* Collect reduction statements. */
2212 vec<gimple *> reductions = as_a <loop_vec_info> (vinfo)->reductions;
2213 for (i = 0; reductions.iterate (i, &next); i++)
2214 scalar_stmts.safe_push (next);
2215 }
2216
2217 loads.create (group_size);
2218
2219 /* Build the tree for the SLP instance. */
2220 bool *matches = XALLOCAVEC (bool, group_size);
2221 unsigned npermutes = 0;
2222 bst_fail = new scalar_stmts_set_t ();
2223 poly_uint64 max_nunits = nunits;
2224 node = vect_build_slp_tree (vinfo, scalar_stmts, group_size,
2225 &max_nunits, &loads, matches, &npermutes,
2226 NULL, max_tree_size);
2227 delete bst_fail;
2228 if (node != NULL)
2229 {
2230 /* Calculate the unrolling factor based on the smallest type. */
2231 poly_uint64 unrolling_factor
2232 = calculate_unrolling_factor (max_nunits, group_size);
2233
2234 if (maybe_ne (unrolling_factor, 1U)
2235 && is_a <bb_vec_info> (vinfo))
2236 {
2237 unsigned HOST_WIDE_INT const_max_nunits;
2238 if (!max_nunits.is_constant (&const_max_nunits)
2239 || const_max_nunits > group_size)
2240 {
2241 if (dump_enabled_p ())
2242 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
2243 "Build SLP failed: store group "
2244 "size not a multiple of the vector size "
2245 "in basic block SLP\n");
2246 vect_free_slp_tree (node);
2247 loads.release ();
2248 return false;
2249 }
2250 /* Fatal mismatch. */
2251 matches[group_size / const_max_nunits * const_max_nunits] = false;
2252 vect_free_slp_tree (node);
2253 loads.release ();
2254 }
2255 else
2256 {
2257 /* Create a new SLP instance. */
2258 new_instance = XNEW (struct _slp_instance);
2259 SLP_INSTANCE_TREE (new_instance) = node;
2260 SLP_INSTANCE_GROUP_SIZE (new_instance) = group_size;
2261 SLP_INSTANCE_UNROLLING_FACTOR (new_instance) = unrolling_factor;
2262 SLP_INSTANCE_LOADS (new_instance) = loads;
2263
2264 /* Compute the load permutation. */
2265 slp_tree load_node;
2266 bool loads_permuted = false;
2267 FOR_EACH_VEC_ELT (loads, i, load_node)
2268 {
2269 vec<unsigned> load_permutation;
2270 int j;
2271 gimple *load, *first_stmt;
2272 bool this_load_permuted = false;
2273 load_permutation.create (group_size);
2274 first_stmt = GROUP_FIRST_ELEMENT
2275 (vinfo_for_stmt (SLP_TREE_SCALAR_STMTS (load_node)[0]));
2276 FOR_EACH_VEC_ELT (SLP_TREE_SCALAR_STMTS (load_node), j, load)
2277 {
2278 int load_place = vect_get_place_in_interleaving_chain
2279 (load, first_stmt);
2280 gcc_assert (load_place != -1);
2281 if (load_place != j)
2282 this_load_permuted = true;
2283 load_permutation.safe_push (load_place);
2284 }
2285 if (!this_load_permuted
2286 /* The load requires permutation when unrolling exposes
2287 a gap either because the group is larger than the SLP
2288 group-size or because there is a gap between the groups. */
2289 && (known_eq (unrolling_factor, 1U)
2290 || (group_size == GROUP_SIZE (vinfo_for_stmt (first_stmt))
2291 && GROUP_GAP (vinfo_for_stmt (first_stmt)) == 0)))
2292 {
2293 load_permutation.release ();
2294 continue;
2295 }
2296 SLP_TREE_LOAD_PERMUTATION (load_node) = load_permutation;
2297 loads_permuted = true;
2298 }
2299
2300 if (loads_permuted)
2301 {
2302 if (!vect_supported_load_permutation_p (new_instance))
2303 {
2304 if (dump_enabled_p ())
2305 {
2306 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
2307 "Build SLP failed: unsupported load "
2308 "permutation ");
2309 dump_gimple_stmt (MSG_MISSED_OPTIMIZATION,
2310 TDF_SLIM, stmt, 0);
2311 }
2312 vect_free_slp_instance (new_instance);
2313 return false;
2314 }
2315 }
2316
2317 /* If the loads and stores can be handled with load/store-lan
2318 instructions do not generate this SLP instance. */
2319 if (is_a <loop_vec_info> (vinfo)
2320 && loads_permuted
2321 && dr && vect_store_lanes_supported (vectype, group_size, false))
2322 {
2323 slp_tree load_node;
2324 FOR_EACH_VEC_ELT (loads, i, load_node)
2325 {
2326 gimple *first_stmt = GROUP_FIRST_ELEMENT
2327 (vinfo_for_stmt (SLP_TREE_SCALAR_STMTS (load_node)[0]));
2328 stmt_vec_info stmt_vinfo = vinfo_for_stmt (first_stmt);
2329 /* Use SLP for strided accesses (or if we
2330 can't load-lanes). */
2331 if (STMT_VINFO_STRIDED_P (stmt_vinfo)
2332 || ! vect_load_lanes_supported
2333 (STMT_VINFO_VECTYPE (stmt_vinfo),
2334 GROUP_SIZE (stmt_vinfo), false))
2335 break;
2336 }
2337 if (i == loads.length ())
2338 {
2339 if (dump_enabled_p ())
2340 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
2341 "Built SLP cancelled: can use "
2342 "load/store-lanes\n");
2343 vect_free_slp_instance (new_instance);
2344 return false;
2345 }
2346 }
2347
2348 vinfo->slp_instances.safe_push (new_instance);
2349
2350 if (dump_enabled_p ())
2351 {
2352 dump_printf_loc (MSG_NOTE, vect_location,
2353 "Final SLP tree for instance:\n");
2354 vect_print_slp_tree (MSG_NOTE, vect_location, node);
2355 }
2356
2357 return true;
2358 }
2359 }
2360 else
2361 {
2362 /* Failed to SLP. */
2363 /* Free the allocated memory. */
2364 scalar_stmts.release ();
2365 loads.release ();
2366 }
2367
2368 /* For basic block SLP, try to break the group up into multiples of the
2369 vector size. */
2370 unsigned HOST_WIDE_INT const_nunits;
2371 if (is_a <bb_vec_info> (vinfo)
2372 && GROUP_FIRST_ELEMENT (vinfo_for_stmt (stmt))
2373 && STMT_VINFO_GROUPED_ACCESS (vinfo_for_stmt (stmt))
2374 && nunits.is_constant (&const_nunits))
2375 {
2376 /* We consider breaking the group only on VF boundaries from the existing
2377 start. */
2378 for (i = 0; i < group_size; i++)
2379 if (!matches[i]) break;
2380
2381 if (i >= const_nunits && i < group_size)
2382 {
2383 /* Split into two groups at the first vector boundary before i. */
2384 gcc_assert ((const_nunits & (const_nunits - 1)) == 0);
2385 unsigned group1_size = i & ~(const_nunits - 1);
2386
2387 gimple *rest = vect_split_slp_store_group (stmt, group1_size);
2388 bool res = vect_analyze_slp_instance (vinfo, stmt, max_tree_size);
2389 /* If the first non-match was in the middle of a vector,
2390 skip the rest of that vector. */
2391 if (group1_size < i)
2392 {
2393 i = group1_size + const_nunits;
2394 if (i < group_size)
2395 rest = vect_split_slp_store_group (rest, const_nunits);
2396 }
2397 if (i < group_size)
2398 res |= vect_analyze_slp_instance (vinfo, rest, max_tree_size);
2399 return res;
2400 }
2401 /* Even though the first vector did not all match, we might be able to SLP
2402 (some) of the remainder. FORNOW ignore this possibility. */
2403 }
2404
2405 return false;
2406 }
2407
2408
2409 /* Check if there are stmts in the loop can be vectorized using SLP. Build SLP
2410 trees of packed scalar stmts if SLP is possible. */
2411
2412 bool
vect_analyze_slp(vec_info * vinfo,unsigned max_tree_size)2413 vect_analyze_slp (vec_info *vinfo, unsigned max_tree_size)
2414 {
2415 unsigned int i;
2416 gimple *first_element;
2417
2418 if (dump_enabled_p ())
2419 dump_printf_loc (MSG_NOTE, vect_location, "=== vect_analyze_slp ===\n");
2420
2421 /* Find SLP sequences starting from groups of grouped stores. */
2422 FOR_EACH_VEC_ELT (vinfo->grouped_stores, i, first_element)
2423 vect_analyze_slp_instance (vinfo, first_element, max_tree_size);
2424
2425 if (loop_vec_info loop_vinfo = dyn_cast <loop_vec_info> (vinfo))
2426 {
2427 if (loop_vinfo->reduction_chains.length () > 0)
2428 {
2429 /* Find SLP sequences starting from reduction chains. */
2430 FOR_EACH_VEC_ELT (loop_vinfo->reduction_chains, i, first_element)
2431 if (! vect_analyze_slp_instance (vinfo, first_element,
2432 max_tree_size))
2433 {
2434 /* Dissolve reduction chain group. */
2435 gimple *next, *stmt = first_element;
2436 while (stmt)
2437 {
2438 stmt_vec_info vinfo = vinfo_for_stmt (stmt);
2439 next = GROUP_NEXT_ELEMENT (vinfo);
2440 GROUP_FIRST_ELEMENT (vinfo) = NULL;
2441 GROUP_NEXT_ELEMENT (vinfo) = NULL;
2442 stmt = next;
2443 }
2444 STMT_VINFO_DEF_TYPE (vinfo_for_stmt (first_element))
2445 = vect_internal_def;
2446 }
2447 }
2448
2449 /* Find SLP sequences starting from groups of reductions. */
2450 if (loop_vinfo->reductions.length () > 1)
2451 vect_analyze_slp_instance (vinfo, loop_vinfo->reductions[0],
2452 max_tree_size);
2453 }
2454
2455 return true;
2456 }
2457
2458
2459 /* For each possible SLP instance decide whether to SLP it and calculate overall
2460 unrolling factor needed to SLP the loop. Return TRUE if decided to SLP at
2461 least one instance. */
2462
2463 bool
vect_make_slp_decision(loop_vec_info loop_vinfo)2464 vect_make_slp_decision (loop_vec_info loop_vinfo)
2465 {
2466 unsigned int i;
2467 poly_uint64 unrolling_factor = 1;
2468 vec<slp_instance> slp_instances = LOOP_VINFO_SLP_INSTANCES (loop_vinfo);
2469 slp_instance instance;
2470 int decided_to_slp = 0;
2471
2472 if (dump_enabled_p ())
2473 dump_printf_loc (MSG_NOTE, vect_location, "=== vect_make_slp_decision ==="
2474 "\n");
2475
2476 FOR_EACH_VEC_ELT (slp_instances, i, instance)
2477 {
2478 /* FORNOW: SLP if you can. */
2479 /* All unroll factors have the form current_vector_size * X for some
2480 rational X, so they must have a common multiple. */
2481 unrolling_factor
2482 = force_common_multiple (unrolling_factor,
2483 SLP_INSTANCE_UNROLLING_FACTOR (instance));
2484
2485 /* Mark all the stmts that belong to INSTANCE as PURE_SLP stmts. Later we
2486 call vect_detect_hybrid_slp () to find stmts that need hybrid SLP and
2487 loop-based vectorization. Such stmts will be marked as HYBRID. */
2488 vect_mark_slp_stmts (SLP_INSTANCE_TREE (instance), pure_slp, -1);
2489 decided_to_slp++;
2490 }
2491
2492 LOOP_VINFO_SLP_UNROLLING_FACTOR (loop_vinfo) = unrolling_factor;
2493
2494 if (decided_to_slp && dump_enabled_p ())
2495 {
2496 dump_printf_loc (MSG_NOTE, vect_location,
2497 "Decided to SLP %d instances. Unrolling factor ",
2498 decided_to_slp);
2499 dump_dec (MSG_NOTE, unrolling_factor);
2500 dump_printf (MSG_NOTE, "\n");
2501 }
2502
2503 return (decided_to_slp > 0);
2504 }
2505
2506
2507 /* Find stmts that must be both vectorized and SLPed (since they feed stmts that
2508 can't be SLPed) in the tree rooted at NODE. Mark such stmts as HYBRID. */
2509
2510 static void
vect_detect_hybrid_slp_stmts(slp_tree node,unsigned i,slp_vect_type stype)2511 vect_detect_hybrid_slp_stmts (slp_tree node, unsigned i, slp_vect_type stype)
2512 {
2513 gimple *stmt = SLP_TREE_SCALAR_STMTS (node)[i];
2514 imm_use_iterator imm_iter;
2515 gimple *use_stmt;
2516 stmt_vec_info use_vinfo, stmt_vinfo = vinfo_for_stmt (stmt);
2517 slp_tree child;
2518 loop_vec_info loop_vinfo = STMT_VINFO_LOOP_VINFO (stmt_vinfo);
2519 struct loop *loop = LOOP_VINFO_LOOP (loop_vinfo);
2520 int j;
2521
2522 /* Propagate hybrid down the SLP tree. */
2523 if (stype == hybrid)
2524 ;
2525 else if (HYBRID_SLP_STMT (stmt_vinfo))
2526 stype = hybrid;
2527 else
2528 {
2529 /* Check if a pure SLP stmt has uses in non-SLP stmts. */
2530 gcc_checking_assert (PURE_SLP_STMT (stmt_vinfo));
2531 /* If we get a pattern stmt here we have to use the LHS of the
2532 original stmt for immediate uses. */
2533 if (! STMT_VINFO_IN_PATTERN_P (stmt_vinfo)
2534 && STMT_VINFO_RELATED_STMT (stmt_vinfo))
2535 stmt = STMT_VINFO_RELATED_STMT (stmt_vinfo);
2536 tree def;
2537 if (gimple_code (stmt) == GIMPLE_PHI)
2538 def = gimple_phi_result (stmt);
2539 else
2540 def = SINGLE_SSA_TREE_OPERAND (stmt, SSA_OP_DEF);
2541 if (def)
2542 FOR_EACH_IMM_USE_STMT (use_stmt, imm_iter, def)
2543 {
2544 if (!flow_bb_inside_loop_p (loop, gimple_bb (use_stmt)))
2545 continue;
2546 use_vinfo = vinfo_for_stmt (use_stmt);
2547 if (STMT_VINFO_IN_PATTERN_P (use_vinfo)
2548 && STMT_VINFO_RELATED_STMT (use_vinfo))
2549 use_vinfo = vinfo_for_stmt (STMT_VINFO_RELATED_STMT (use_vinfo));
2550 if (!STMT_SLP_TYPE (use_vinfo)
2551 && (STMT_VINFO_RELEVANT (use_vinfo)
2552 || VECTORIZABLE_CYCLE_DEF (STMT_VINFO_DEF_TYPE (use_vinfo)))
2553 && !(gimple_code (use_stmt) == GIMPLE_PHI
2554 && STMT_VINFO_DEF_TYPE (use_vinfo) == vect_reduction_def))
2555 {
2556 if (dump_enabled_p ())
2557 {
2558 dump_printf_loc (MSG_NOTE, vect_location, "use of SLP "
2559 "def in non-SLP stmt: ");
2560 dump_gimple_stmt (MSG_NOTE, TDF_SLIM, use_stmt, 0);
2561 }
2562 stype = hybrid;
2563 }
2564 }
2565 }
2566
2567 if (stype == hybrid
2568 && !HYBRID_SLP_STMT (stmt_vinfo))
2569 {
2570 if (dump_enabled_p ())
2571 {
2572 dump_printf_loc (MSG_NOTE, vect_location, "marking hybrid: ");
2573 dump_gimple_stmt (MSG_NOTE, TDF_SLIM, stmt, 0);
2574 }
2575 STMT_SLP_TYPE (stmt_vinfo) = hybrid;
2576 }
2577
2578 FOR_EACH_VEC_ELT (SLP_TREE_CHILDREN (node), j, child)
2579 if (SLP_TREE_DEF_TYPE (child) != vect_external_def)
2580 vect_detect_hybrid_slp_stmts (child, i, stype);
2581 }
2582
2583 /* Helpers for vect_detect_hybrid_slp walking pattern stmt uses. */
2584
2585 static tree
vect_detect_hybrid_slp_1(tree * tp,int *,void * data)2586 vect_detect_hybrid_slp_1 (tree *tp, int *, void *data)
2587 {
2588 walk_stmt_info *wi = (walk_stmt_info *)data;
2589 struct loop *loopp = (struct loop *)wi->info;
2590
2591 if (wi->is_lhs)
2592 return NULL_TREE;
2593
2594 if (TREE_CODE (*tp) == SSA_NAME
2595 && !SSA_NAME_IS_DEFAULT_DEF (*tp))
2596 {
2597 gimple *def_stmt = SSA_NAME_DEF_STMT (*tp);
2598 if (flow_bb_inside_loop_p (loopp, gimple_bb (def_stmt))
2599 && PURE_SLP_STMT (vinfo_for_stmt (def_stmt)))
2600 {
2601 if (dump_enabled_p ())
2602 {
2603 dump_printf_loc (MSG_NOTE, vect_location, "marking hybrid: ");
2604 dump_gimple_stmt (MSG_NOTE, TDF_SLIM, def_stmt, 0);
2605 }
2606 STMT_SLP_TYPE (vinfo_for_stmt (def_stmt)) = hybrid;
2607 }
2608 }
2609
2610 return NULL_TREE;
2611 }
2612
2613 static tree
vect_detect_hybrid_slp_2(gimple_stmt_iterator * gsi,bool * handled,walk_stmt_info *)2614 vect_detect_hybrid_slp_2 (gimple_stmt_iterator *gsi, bool *handled,
2615 walk_stmt_info *)
2616 {
2617 stmt_vec_info use_vinfo = vinfo_for_stmt (gsi_stmt (*gsi));
2618 /* If the stmt is in a SLP instance then this isn't a reason
2619 to mark use definitions in other SLP instances as hybrid. */
2620 if (! STMT_SLP_TYPE (use_vinfo)
2621 && (STMT_VINFO_RELEVANT (use_vinfo)
2622 || VECTORIZABLE_CYCLE_DEF (STMT_VINFO_DEF_TYPE (use_vinfo)))
2623 && ! (gimple_code (gsi_stmt (*gsi)) == GIMPLE_PHI
2624 && STMT_VINFO_DEF_TYPE (use_vinfo) == vect_reduction_def))
2625 ;
2626 else
2627 *handled = true;
2628 return NULL_TREE;
2629 }
2630
2631 /* Find stmts that must be both vectorized and SLPed. */
2632
2633 void
vect_detect_hybrid_slp(loop_vec_info loop_vinfo)2634 vect_detect_hybrid_slp (loop_vec_info loop_vinfo)
2635 {
2636 unsigned int i;
2637 vec<slp_instance> slp_instances = LOOP_VINFO_SLP_INSTANCES (loop_vinfo);
2638 slp_instance instance;
2639
2640 if (dump_enabled_p ())
2641 dump_printf_loc (MSG_NOTE, vect_location, "=== vect_detect_hybrid_slp ==="
2642 "\n");
2643
2644 /* First walk all pattern stmt in the loop and mark defs of uses as
2645 hybrid because immediate uses in them are not recorded. */
2646 for (i = 0; i < LOOP_VINFO_LOOP (loop_vinfo)->num_nodes; ++i)
2647 {
2648 basic_block bb = LOOP_VINFO_BBS (loop_vinfo)[i];
2649 for (gimple_stmt_iterator gsi = gsi_start_bb (bb); !gsi_end_p (gsi);
2650 gsi_next (&gsi))
2651 {
2652 gimple *stmt = gsi_stmt (gsi);
2653 stmt_vec_info stmt_info = vinfo_for_stmt (stmt);
2654 if (STMT_VINFO_IN_PATTERN_P (stmt_info))
2655 {
2656 walk_stmt_info wi;
2657 memset (&wi, 0, sizeof (wi));
2658 wi.info = LOOP_VINFO_LOOP (loop_vinfo);
2659 gimple_stmt_iterator gsi2
2660 = gsi_for_stmt (STMT_VINFO_RELATED_STMT (stmt_info));
2661 walk_gimple_stmt (&gsi2, vect_detect_hybrid_slp_2,
2662 vect_detect_hybrid_slp_1, &wi);
2663 walk_gimple_seq (STMT_VINFO_PATTERN_DEF_SEQ (stmt_info),
2664 vect_detect_hybrid_slp_2,
2665 vect_detect_hybrid_slp_1, &wi);
2666 }
2667 }
2668 }
2669
2670 /* Then walk the SLP instance trees marking stmts with uses in
2671 non-SLP stmts as hybrid, also propagating hybrid down the
2672 SLP tree, collecting the above info on-the-fly. */
2673 FOR_EACH_VEC_ELT (slp_instances, i, instance)
2674 {
2675 for (unsigned i = 0; i < SLP_INSTANCE_GROUP_SIZE (instance); ++i)
2676 vect_detect_hybrid_slp_stmts (SLP_INSTANCE_TREE (instance),
2677 i, pure_slp);
2678 }
2679 }
2680
2681
2682 /* Initialize a bb_vec_info struct for the statements between
2683 REGION_BEGIN_IN (inclusive) and REGION_END_IN (exclusive). */
2684
_bb_vec_info(gimple_stmt_iterator region_begin_in,gimple_stmt_iterator region_end_in)2685 _bb_vec_info::_bb_vec_info (gimple_stmt_iterator region_begin_in,
2686 gimple_stmt_iterator region_end_in)
2687 : vec_info (vec_info::bb, init_cost (NULL)),
2688 bb (gsi_bb (region_begin_in)),
2689 region_begin (region_begin_in),
2690 region_end (region_end_in)
2691 {
2692 gimple_stmt_iterator gsi;
2693
2694 for (gsi = region_begin; gsi_stmt (gsi) != gsi_stmt (region_end);
2695 gsi_next (&gsi))
2696 {
2697 gimple *stmt = gsi_stmt (gsi);
2698 gimple_set_uid (stmt, 0);
2699 set_vinfo_for_stmt (stmt, new_stmt_vec_info (stmt, this));
2700 }
2701
2702 bb->aux = this;
2703 }
2704
2705
2706 /* Free BB_VINFO struct, as well as all the stmt_vec_info structs of all the
2707 stmts in the basic block. */
2708
~_bb_vec_info()2709 _bb_vec_info::~_bb_vec_info ()
2710 {
2711 for (gimple_stmt_iterator si = region_begin;
2712 gsi_stmt (si) != gsi_stmt (region_end); gsi_next (&si))
2713 {
2714 gimple *stmt = gsi_stmt (si);
2715 stmt_vec_info stmt_info = vinfo_for_stmt (stmt);
2716
2717 if (stmt_info)
2718 /* Free stmt_vec_info. */
2719 free_stmt_vec_info (stmt);
2720
2721 /* Reset region marker. */
2722 gimple_set_uid (stmt, -1);
2723 }
2724
2725 bb->aux = NULL;
2726 }
2727
2728
2729 /* Analyze statements contained in SLP tree NODE after recursively analyzing
2730 the subtree. NODE_INSTANCE contains NODE and VINFO contains INSTANCE.
2731
2732 Return true if the operations are supported. */
2733
2734 static bool
vect_slp_analyze_node_operations(vec_info * vinfo,slp_tree node,slp_instance node_instance)2735 vect_slp_analyze_node_operations (vec_info *vinfo, slp_tree node,
2736 slp_instance node_instance)
2737 {
2738 bool dummy;
2739 int i, j;
2740 gimple *stmt;
2741 slp_tree child;
2742
2743 if (SLP_TREE_DEF_TYPE (node) != vect_internal_def)
2744 return true;
2745
2746 FOR_EACH_VEC_ELT (SLP_TREE_CHILDREN (node), i, child)
2747 if (!vect_slp_analyze_node_operations (vinfo, child, node_instance))
2748 return false;
2749
2750 stmt = SLP_TREE_SCALAR_STMTS (node)[0];
2751 stmt_vec_info stmt_info = vinfo_for_stmt (stmt);
2752 gcc_assert (stmt_info);
2753 gcc_assert (STMT_SLP_TYPE (stmt_info) != loop_vect);
2754
2755 /* For BB vectorization vector types are assigned here.
2756 Memory accesses already got their vector type assigned
2757 in vect_analyze_data_refs. */
2758 bb_vec_info bb_vinfo = STMT_VINFO_BB_VINFO (stmt_info);
2759 if (bb_vinfo
2760 && ! STMT_VINFO_DATA_REF (stmt_info))
2761 {
2762 gcc_assert (PURE_SLP_STMT (stmt_info));
2763
2764 tree scalar_type = TREE_TYPE (gimple_get_lhs (stmt));
2765 if (dump_enabled_p ())
2766 {
2767 dump_printf_loc (MSG_NOTE, vect_location,
2768 "get vectype for scalar type: ");
2769 dump_generic_expr (MSG_NOTE, TDF_SLIM, scalar_type);
2770 dump_printf (MSG_NOTE, "\n");
2771 }
2772
2773 tree vectype = get_vectype_for_scalar_type (scalar_type);
2774 if (!vectype)
2775 {
2776 if (dump_enabled_p ())
2777 {
2778 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
2779 "not SLPed: unsupported data-type ");
2780 dump_generic_expr (MSG_MISSED_OPTIMIZATION, TDF_SLIM,
2781 scalar_type);
2782 dump_printf (MSG_MISSED_OPTIMIZATION, "\n");
2783 }
2784 return false;
2785 }
2786
2787 if (dump_enabled_p ())
2788 {
2789 dump_printf_loc (MSG_NOTE, vect_location, "vectype: ");
2790 dump_generic_expr (MSG_NOTE, TDF_SLIM, vectype);
2791 dump_printf (MSG_NOTE, "\n");
2792 }
2793
2794 gimple *sstmt;
2795 FOR_EACH_VEC_ELT (SLP_TREE_SCALAR_STMTS (node), i, sstmt)
2796 STMT_VINFO_VECTYPE (vinfo_for_stmt (sstmt)) = vectype;
2797 }
2798
2799 /* Calculate the number of vector statements to be created for the
2800 scalar stmts in this node. For SLP reductions it is equal to the
2801 number of vector statements in the children (which has already been
2802 calculated by the recursive call). Otherwise it is the number of
2803 scalar elements in one scalar iteration (GROUP_SIZE) multiplied by
2804 VF divided by the number of elements in a vector. */
2805 if (GROUP_FIRST_ELEMENT (stmt_info)
2806 && !STMT_VINFO_GROUPED_ACCESS (stmt_info))
2807 SLP_TREE_NUMBER_OF_VEC_STMTS (node)
2808 = SLP_TREE_NUMBER_OF_VEC_STMTS (SLP_TREE_CHILDREN (node)[0]);
2809 else
2810 {
2811 poly_uint64 vf;
2812 if (loop_vec_info loop_vinfo = dyn_cast <loop_vec_info> (vinfo))
2813 vf = loop_vinfo->vectorization_factor;
2814 else
2815 vf = 1;
2816 unsigned int group_size = SLP_INSTANCE_GROUP_SIZE (node_instance);
2817 tree vectype = STMT_VINFO_VECTYPE (stmt_info);
2818 SLP_TREE_NUMBER_OF_VEC_STMTS (node)
2819 = vect_get_num_vectors (vf * group_size, vectype);
2820 }
2821
2822 /* ??? We have to catch the case late where two first scalar stmts appear
2823 in multiple SLP children with different def type and fail. Remember
2824 original def types first since SLP_TREE_DEF_TYPE doesn't necessarily
2825 match it when that is vect_internal_def. */
2826 auto_vec<vect_def_type, 4> dt;
2827 dt.safe_grow (SLP_TREE_CHILDREN (node).length ());
2828 FOR_EACH_VEC_ELT (SLP_TREE_CHILDREN (node), j, child)
2829 dt[j]
2830 = STMT_VINFO_DEF_TYPE (vinfo_for_stmt (SLP_TREE_SCALAR_STMTS (child)[0]));
2831
2832 /* Push SLP node def-type to stmt operands. */
2833 FOR_EACH_VEC_ELT (SLP_TREE_CHILDREN (node), j, child)
2834 if (SLP_TREE_DEF_TYPE (child) != vect_internal_def)
2835 STMT_VINFO_DEF_TYPE (vinfo_for_stmt (SLP_TREE_SCALAR_STMTS (child)[0]))
2836 = SLP_TREE_DEF_TYPE (child);
2837
2838 /* Check everything worked out. */
2839 bool res = true;
2840 FOR_EACH_VEC_ELT (SLP_TREE_CHILDREN (node), j, child)
2841 if (SLP_TREE_DEF_TYPE (child) != vect_internal_def)
2842 {
2843 if (STMT_VINFO_DEF_TYPE
2844 (vinfo_for_stmt (SLP_TREE_SCALAR_STMTS (child)[0]))
2845 != SLP_TREE_DEF_TYPE (child))
2846 res = false;
2847 }
2848 else if (STMT_VINFO_DEF_TYPE
2849 (vinfo_for_stmt (SLP_TREE_SCALAR_STMTS (child)[0])) != dt[j])
2850 res = false;
2851 if (!res && dump_enabled_p ())
2852 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
2853 "not vectorized: same operand with different "
2854 "def type in stmt.\n");
2855 if (res)
2856 res = vect_analyze_stmt (stmt, &dummy, node, node_instance);
2857
2858 /* Restore def-types. */
2859 FOR_EACH_VEC_ELT (SLP_TREE_CHILDREN (node), j, child)
2860 STMT_VINFO_DEF_TYPE (vinfo_for_stmt (SLP_TREE_SCALAR_STMTS (child)[0]))
2861 = dt[j];
2862
2863 return res;
2864 }
2865
2866
2867 /* Analyze statements in SLP instances of VINFO. Return true if the
2868 operations are supported. */
2869
2870 bool
vect_slp_analyze_operations(vec_info * vinfo)2871 vect_slp_analyze_operations (vec_info *vinfo)
2872 {
2873 slp_instance instance;
2874 int i;
2875
2876 if (dump_enabled_p ())
2877 dump_printf_loc (MSG_NOTE, vect_location,
2878 "=== vect_slp_analyze_operations ===\n");
2879
2880 for (i = 0; vinfo->slp_instances.iterate (i, &instance); )
2881 {
2882 if (!vect_slp_analyze_node_operations (vinfo,
2883 SLP_INSTANCE_TREE (instance),
2884 instance))
2885 {
2886 dump_printf_loc (MSG_NOTE, vect_location,
2887 "removing SLP instance operations starting from: ");
2888 dump_gimple_stmt (MSG_NOTE, TDF_SLIM,
2889 SLP_TREE_SCALAR_STMTS
2890 (SLP_INSTANCE_TREE (instance))[0], 0);
2891 vect_free_slp_instance (instance);
2892 vinfo->slp_instances.ordered_remove (i);
2893 }
2894 else
2895 i++;
2896 }
2897
2898 if (dump_enabled_p ())
2899 dump_printf_loc (MSG_NOTE, vect_location,
2900 "=== vect_analyze_slp_cost ===\n");
2901
2902 /* Compute the costs of the SLP instances. */
2903 scalar_stmts_set_t *visited = new scalar_stmts_set_t ();
2904 for (i = 0; vinfo->slp_instances.iterate (i, &instance); ++i)
2905 vect_analyze_slp_cost (instance, vinfo->target_cost_data, visited);
2906 delete visited;
2907
2908 return !vinfo->slp_instances.is_empty ();
2909 }
2910
2911
2912 /* Compute the scalar cost of the SLP node NODE and its children
2913 and return it. Do not account defs that are marked in LIFE and
2914 update LIFE according to uses of NODE. */
2915
2916 static unsigned
vect_bb_slp_scalar_cost(basic_block bb,slp_tree node,vec<bool,va_heap> * life)2917 vect_bb_slp_scalar_cost (basic_block bb,
2918 slp_tree node, vec<bool, va_heap> *life)
2919 {
2920 unsigned scalar_cost = 0;
2921 unsigned i;
2922 gimple *stmt;
2923 slp_tree child;
2924
2925 FOR_EACH_VEC_ELT (SLP_TREE_SCALAR_STMTS (node), i, stmt)
2926 {
2927 unsigned stmt_cost;
2928 ssa_op_iter op_iter;
2929 def_operand_p def_p;
2930 stmt_vec_info stmt_info;
2931
2932 if ((*life)[i])
2933 continue;
2934
2935 /* If there is a non-vectorized use of the defs then the scalar
2936 stmt is kept live in which case we do not account it or any
2937 required defs in the SLP children in the scalar cost. This
2938 way we make the vectorization more costly when compared to
2939 the scalar cost. */
2940 FOR_EACH_SSA_DEF_OPERAND (def_p, stmt, op_iter, SSA_OP_DEF)
2941 {
2942 imm_use_iterator use_iter;
2943 gimple *use_stmt;
2944 FOR_EACH_IMM_USE_STMT (use_stmt, use_iter, DEF_FROM_PTR (def_p))
2945 if (!is_gimple_debug (use_stmt)
2946 && (! vect_stmt_in_region_p (vinfo_for_stmt (stmt)->vinfo,
2947 use_stmt)
2948 || ! PURE_SLP_STMT (vinfo_for_stmt (use_stmt))))
2949 {
2950 (*life)[i] = true;
2951 BREAK_FROM_IMM_USE_STMT (use_iter);
2952 }
2953 }
2954 if ((*life)[i])
2955 continue;
2956
2957 /* Count scalar stmts only once. */
2958 if (gimple_visited_p (stmt))
2959 continue;
2960 gimple_set_visited (stmt, true);
2961
2962 stmt_info = vinfo_for_stmt (stmt);
2963 if (STMT_VINFO_DATA_REF (stmt_info))
2964 {
2965 if (DR_IS_READ (STMT_VINFO_DATA_REF (stmt_info)))
2966 stmt_cost = vect_get_stmt_cost (scalar_load);
2967 else
2968 stmt_cost = vect_get_stmt_cost (scalar_store);
2969 }
2970 else
2971 stmt_cost = vect_get_stmt_cost (scalar_stmt);
2972
2973 scalar_cost += stmt_cost;
2974 }
2975
2976 auto_vec<bool, 20> subtree_life;
2977 FOR_EACH_VEC_ELT (SLP_TREE_CHILDREN (node), i, child)
2978 {
2979 if (SLP_TREE_DEF_TYPE (child) == vect_internal_def)
2980 {
2981 /* Do not directly pass LIFE to the recursive call, copy it to
2982 confine changes in the callee to the current child/subtree. */
2983 subtree_life.safe_splice (*life);
2984 scalar_cost += vect_bb_slp_scalar_cost (bb, child, &subtree_life);
2985 subtree_life.truncate (0);
2986 }
2987 }
2988
2989 return scalar_cost;
2990 }
2991
2992 /* Check if vectorization of the basic block is profitable. */
2993
2994 static bool
vect_bb_vectorization_profitable_p(bb_vec_info bb_vinfo)2995 vect_bb_vectorization_profitable_p (bb_vec_info bb_vinfo)
2996 {
2997 vec<slp_instance> slp_instances = BB_VINFO_SLP_INSTANCES (bb_vinfo);
2998 slp_instance instance;
2999 int i;
3000 unsigned int vec_inside_cost = 0, vec_outside_cost = 0, scalar_cost = 0;
3001 unsigned int vec_prologue_cost = 0, vec_epilogue_cost = 0;
3002
3003 /* Calculate scalar cost. */
3004 FOR_EACH_VEC_ELT (slp_instances, i, instance)
3005 {
3006 auto_vec<bool, 20> life;
3007 life.safe_grow_cleared (SLP_INSTANCE_GROUP_SIZE (instance));
3008 scalar_cost += vect_bb_slp_scalar_cost (BB_VINFO_BB (bb_vinfo),
3009 SLP_INSTANCE_TREE (instance),
3010 &life);
3011 }
3012
3013 /* Unset visited flag. */
3014 for (gimple_stmt_iterator gsi = bb_vinfo->region_begin;
3015 gsi_stmt (gsi) != gsi_stmt (bb_vinfo->region_end); gsi_next (&gsi))
3016 gimple_set_visited (gsi_stmt (gsi), false);
3017
3018 /* Complete the target-specific cost calculation. */
3019 finish_cost (BB_VINFO_TARGET_COST_DATA (bb_vinfo), &vec_prologue_cost,
3020 &vec_inside_cost, &vec_epilogue_cost);
3021
3022 vec_outside_cost = vec_prologue_cost + vec_epilogue_cost;
3023
3024 if (dump_enabled_p ())
3025 {
3026 dump_printf_loc (MSG_NOTE, vect_location, "Cost model analysis: \n");
3027 dump_printf (MSG_NOTE, " Vector inside of basic block cost: %d\n",
3028 vec_inside_cost);
3029 dump_printf (MSG_NOTE, " Vector prologue cost: %d\n", vec_prologue_cost);
3030 dump_printf (MSG_NOTE, " Vector epilogue cost: %d\n", vec_epilogue_cost);
3031 dump_printf (MSG_NOTE, " Scalar cost of basic block: %d\n", scalar_cost);
3032 }
3033
3034 /* Vectorization is profitable if its cost is more than the cost of scalar
3035 version. Note that we err on the vector side for equal cost because
3036 the cost estimate is otherwise quite pessimistic (constant uses are
3037 free on the scalar side but cost a load on the vector side for
3038 example). */
3039 if (vec_outside_cost + vec_inside_cost > scalar_cost)
3040 return false;
3041
3042 return true;
3043 }
3044
3045 /* Check if the basic block can be vectorized. Returns a bb_vec_info
3046 if so and sets fatal to true if failure is independent of
3047 current_vector_size. */
3048
3049 static bb_vec_info
vect_slp_analyze_bb_1(gimple_stmt_iterator region_begin,gimple_stmt_iterator region_end,vec<data_reference_p> datarefs,int n_stmts,bool & fatal)3050 vect_slp_analyze_bb_1 (gimple_stmt_iterator region_begin,
3051 gimple_stmt_iterator region_end,
3052 vec<data_reference_p> datarefs, int n_stmts,
3053 bool &fatal)
3054 {
3055 bb_vec_info bb_vinfo;
3056 slp_instance instance;
3057 int i;
3058 poly_uint64 min_vf = 2;
3059
3060 /* The first group of checks is independent of the vector size. */
3061 fatal = true;
3062
3063 if (n_stmts > PARAM_VALUE (PARAM_SLP_MAX_INSNS_IN_BB))
3064 {
3065 if (dump_enabled_p ())
3066 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
3067 "not vectorized: too many instructions in "
3068 "basic block.\n");
3069 free_data_refs (datarefs);
3070 return NULL;
3071 }
3072
3073 bb_vinfo = new _bb_vec_info (region_begin, region_end);
3074 if (!bb_vinfo)
3075 return NULL;
3076
3077 BB_VINFO_DATAREFS (bb_vinfo) = datarefs;
3078
3079 /* Analyze the data references. */
3080
3081 if (!vect_analyze_data_refs (bb_vinfo, &min_vf))
3082 {
3083 if (dump_enabled_p ())
3084 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
3085 "not vectorized: unhandled data-ref in basic "
3086 "block.\n");
3087
3088 delete bb_vinfo;
3089 return NULL;
3090 }
3091
3092 if (BB_VINFO_DATAREFS (bb_vinfo).length () < 2)
3093 {
3094 if (dump_enabled_p ())
3095 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
3096 "not vectorized: not enough data-refs in "
3097 "basic block.\n");
3098
3099 delete bb_vinfo;
3100 return NULL;
3101 }
3102
3103 if (!vect_analyze_data_ref_accesses (bb_vinfo))
3104 {
3105 if (dump_enabled_p ())
3106 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
3107 "not vectorized: unhandled data access in "
3108 "basic block.\n");
3109
3110 delete bb_vinfo;
3111 return NULL;
3112 }
3113
3114 /* If there are no grouped stores in the region there is no need
3115 to continue with pattern recog as vect_analyze_slp will fail
3116 anyway. */
3117 if (bb_vinfo->grouped_stores.is_empty ())
3118 {
3119 if (dump_enabled_p ())
3120 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
3121 "not vectorized: no grouped stores in "
3122 "basic block.\n");
3123
3124 delete bb_vinfo;
3125 return NULL;
3126 }
3127
3128 /* While the rest of the analysis below depends on it in some way. */
3129 fatal = false;
3130
3131 vect_pattern_recog (bb_vinfo);
3132
3133 /* Check the SLP opportunities in the basic block, analyze and build SLP
3134 trees. */
3135 if (!vect_analyze_slp (bb_vinfo, n_stmts))
3136 {
3137 if (dump_enabled_p ())
3138 {
3139 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
3140 "Failed to SLP the basic block.\n");
3141 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
3142 "not vectorized: failed to find SLP opportunities "
3143 "in basic block.\n");
3144 }
3145
3146 delete bb_vinfo;
3147 return NULL;
3148 }
3149
3150 vect_record_base_alignments (bb_vinfo);
3151
3152 /* Analyze and verify the alignment of data references and the
3153 dependence in the SLP instances. */
3154 for (i = 0; BB_VINFO_SLP_INSTANCES (bb_vinfo).iterate (i, &instance); )
3155 {
3156 if (! vect_slp_analyze_and_verify_instance_alignment (instance)
3157 || ! vect_slp_analyze_instance_dependence (instance))
3158 {
3159 dump_printf_loc (MSG_NOTE, vect_location,
3160 "removing SLP instance operations starting from: ");
3161 dump_gimple_stmt (MSG_NOTE, TDF_SLIM,
3162 SLP_TREE_SCALAR_STMTS
3163 (SLP_INSTANCE_TREE (instance))[0], 0);
3164 vect_free_slp_instance (instance);
3165 BB_VINFO_SLP_INSTANCES (bb_vinfo).ordered_remove (i);
3166 continue;
3167 }
3168
3169 /* Mark all the statements that we want to vectorize as pure SLP and
3170 relevant. */
3171 vect_mark_slp_stmts (SLP_INSTANCE_TREE (instance), pure_slp, -1);
3172 vect_mark_slp_stmts_relevant (SLP_INSTANCE_TREE (instance));
3173
3174 i++;
3175 }
3176 if (! BB_VINFO_SLP_INSTANCES (bb_vinfo).length ())
3177 {
3178 delete bb_vinfo;
3179 return NULL;
3180 }
3181
3182 if (!vect_slp_analyze_operations (bb_vinfo))
3183 {
3184 if (dump_enabled_p ())
3185 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
3186 "not vectorized: bad operation in basic block.\n");
3187
3188 delete bb_vinfo;
3189 return NULL;
3190 }
3191
3192 /* Cost model: check if the vectorization is worthwhile. */
3193 if (!unlimited_cost_model (NULL)
3194 && !vect_bb_vectorization_profitable_p (bb_vinfo))
3195 {
3196 if (dump_enabled_p ())
3197 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
3198 "not vectorized: vectorization is not "
3199 "profitable.\n");
3200
3201 delete bb_vinfo;
3202 return NULL;
3203 }
3204
3205 if (dump_enabled_p ())
3206 dump_printf_loc (MSG_NOTE, vect_location,
3207 "Basic block will be vectorized using SLP\n");
3208
3209 return bb_vinfo;
3210 }
3211
3212
3213 /* Main entry for the BB vectorizer. Analyze and transform BB, returns
3214 true if anything in the basic-block was vectorized. */
3215
3216 bool
vect_slp_bb(basic_block bb)3217 vect_slp_bb (basic_block bb)
3218 {
3219 bb_vec_info bb_vinfo;
3220 gimple_stmt_iterator gsi;
3221 bool any_vectorized = false;
3222 auto_vector_sizes vector_sizes;
3223
3224 if (dump_enabled_p ())
3225 dump_printf_loc (MSG_NOTE, vect_location, "===vect_slp_analyze_bb===\n");
3226
3227 /* Autodetect first vector size we try. */
3228 current_vector_size = 0;
3229 targetm.vectorize.autovectorize_vector_sizes (&vector_sizes);
3230 unsigned int next_size = 0;
3231
3232 gsi = gsi_start_bb (bb);
3233
3234 poly_uint64 autodetected_vector_size = 0;
3235 while (1)
3236 {
3237 if (gsi_end_p (gsi))
3238 break;
3239
3240 gimple_stmt_iterator region_begin = gsi;
3241 vec<data_reference_p> datarefs = vNULL;
3242 int insns = 0;
3243
3244 for (; !gsi_end_p (gsi); gsi_next (&gsi))
3245 {
3246 gimple *stmt = gsi_stmt (gsi);
3247 if (is_gimple_debug (stmt))
3248 continue;
3249 insns++;
3250
3251 if (gimple_location (stmt) != UNKNOWN_LOCATION)
3252 vect_location = gimple_location (stmt);
3253
3254 if (!find_data_references_in_stmt (NULL, stmt, &datarefs))
3255 break;
3256 }
3257
3258 /* Skip leading unhandled stmts. */
3259 if (gsi_stmt (region_begin) == gsi_stmt (gsi))
3260 {
3261 gsi_next (&gsi);
3262 continue;
3263 }
3264
3265 gimple_stmt_iterator region_end = gsi;
3266
3267 bool vectorized = false;
3268 bool fatal = false;
3269 bb_vinfo = vect_slp_analyze_bb_1 (region_begin, region_end,
3270 datarefs, insns, fatal);
3271 if (bb_vinfo
3272 && dbg_cnt (vect_slp))
3273 {
3274 if (dump_enabled_p ())
3275 dump_printf_loc (MSG_NOTE, vect_location, "SLPing BB part\n");
3276
3277 vect_schedule_slp (bb_vinfo);
3278
3279 if (dump_enabled_p ())
3280 dump_printf_loc (MSG_NOTE, vect_location,
3281 "basic block part vectorized\n");
3282
3283 vectorized = true;
3284 }
3285 delete bb_vinfo;
3286
3287 any_vectorized |= vectorized;
3288
3289 if (next_size == 0)
3290 autodetected_vector_size = current_vector_size;
3291
3292 if (next_size < vector_sizes.length ()
3293 && known_eq (vector_sizes[next_size], autodetected_vector_size))
3294 next_size += 1;
3295
3296 if (vectorized
3297 || next_size == vector_sizes.length ()
3298 || known_eq (current_vector_size, 0U)
3299 /* If vect_slp_analyze_bb_1 signaled that analysis for all
3300 vector sizes will fail do not bother iterating. */
3301 || fatal)
3302 {
3303 if (gsi_end_p (region_end))
3304 break;
3305
3306 /* Skip the unhandled stmt. */
3307 gsi_next (&gsi);
3308
3309 /* And reset vector sizes. */
3310 current_vector_size = 0;
3311 next_size = 0;
3312 }
3313 else
3314 {
3315 /* Try the next biggest vector size. */
3316 current_vector_size = vector_sizes[next_size++];
3317 if (dump_enabled_p ())
3318 {
3319 dump_printf_loc (MSG_NOTE, vect_location,
3320 "***** Re-trying analysis with "
3321 "vector size ");
3322 dump_dec (MSG_NOTE, current_vector_size);
3323 dump_printf (MSG_NOTE, "\n");
3324 }
3325
3326 /* Start over. */
3327 gsi = region_begin;
3328 }
3329 }
3330
3331 return any_vectorized;
3332 }
3333
3334
3335 /* Return 1 if vector type of boolean constant which is OPNUM
3336 operand in statement STMT is a boolean vector. */
3337
3338 static bool
vect_mask_constant_operand_p(gimple * stmt,int opnum)3339 vect_mask_constant_operand_p (gimple *stmt, int opnum)
3340 {
3341 stmt_vec_info stmt_vinfo = vinfo_for_stmt (stmt);
3342 enum tree_code code = gimple_expr_code (stmt);
3343 tree op, vectype;
3344 gimple *def_stmt;
3345 enum vect_def_type dt;
3346
3347 /* For comparison and COND_EXPR type is chosen depending
3348 on the other comparison operand. */
3349 if (TREE_CODE_CLASS (code) == tcc_comparison)
3350 {
3351 if (opnum)
3352 op = gimple_assign_rhs1 (stmt);
3353 else
3354 op = gimple_assign_rhs2 (stmt);
3355
3356 if (!vect_is_simple_use (op, stmt_vinfo->vinfo, &def_stmt,
3357 &dt, &vectype))
3358 gcc_unreachable ();
3359
3360 return !vectype || VECTOR_BOOLEAN_TYPE_P (vectype);
3361 }
3362
3363 if (code == COND_EXPR)
3364 {
3365 tree cond = gimple_assign_rhs1 (stmt);
3366
3367 if (TREE_CODE (cond) == SSA_NAME)
3368 op = cond;
3369 else if (opnum)
3370 op = TREE_OPERAND (cond, 1);
3371 else
3372 op = TREE_OPERAND (cond, 0);
3373
3374 if (!vect_is_simple_use (op, stmt_vinfo->vinfo, &def_stmt,
3375 &dt, &vectype))
3376 gcc_unreachable ();
3377
3378 return !vectype || VECTOR_BOOLEAN_TYPE_P (vectype);
3379 }
3380
3381 return VECTOR_BOOLEAN_TYPE_P (STMT_VINFO_VECTYPE (stmt_vinfo));
3382 }
3383
3384 /* Build a variable-length vector in which the elements in ELTS are repeated
3385 to a fill NRESULTS vectors of type VECTOR_TYPE. Store the vectors in
3386 RESULTS and add any new instructions to SEQ.
3387
3388 The approach we use is:
3389
3390 (1) Find a vector mode VM with integer elements of mode IM.
3391
3392 (2) Replace ELTS[0:NELTS] with ELTS'[0:NELTS'], where each element of
3393 ELTS' has mode IM. This involves creating NELTS' VIEW_CONVERT_EXPRs
3394 from small vectors to IM.
3395
3396 (3) Duplicate each ELTS'[I] into a vector of mode VM.
3397
3398 (4) Use a tree of interleaving VEC_PERM_EXPRs to create VMs with the
3399 correct byte contents.
3400
3401 (5) Use VIEW_CONVERT_EXPR to cast the final VMs to the required type.
3402
3403 We try to find the largest IM for which this sequence works, in order
3404 to cut down on the number of interleaves. */
3405
3406 void
duplicate_and_interleave(gimple_seq * seq,tree vector_type,vec<tree> elts,unsigned int nresults,vec<tree> & results)3407 duplicate_and_interleave (gimple_seq *seq, tree vector_type, vec<tree> elts,
3408 unsigned int nresults, vec<tree> &results)
3409 {
3410 unsigned int nelts = elts.length ();
3411 tree element_type = TREE_TYPE (vector_type);
3412
3413 /* (1) Find a vector mode VM with integer elements of mode IM. */
3414 unsigned int nvectors = 1;
3415 tree new_vector_type;
3416 tree permutes[2];
3417 if (!can_duplicate_and_interleave_p (nelts, TYPE_MODE (element_type),
3418 &nvectors, &new_vector_type,
3419 permutes))
3420 gcc_unreachable ();
3421
3422 /* Get a vector type that holds ELTS[0:NELTS/NELTS']. */
3423 unsigned int partial_nelts = nelts / nvectors;
3424 tree partial_vector_type = build_vector_type (element_type, partial_nelts);
3425
3426 tree_vector_builder partial_elts;
3427 auto_vec<tree, 32> pieces (nvectors * 2);
3428 pieces.quick_grow (nvectors * 2);
3429 for (unsigned int i = 0; i < nvectors; ++i)
3430 {
3431 /* (2) Replace ELTS[0:NELTS] with ELTS'[0:NELTS'], where each element of
3432 ELTS' has mode IM. */
3433 partial_elts.new_vector (partial_vector_type, partial_nelts, 1);
3434 for (unsigned int j = 0; j < partial_nelts; ++j)
3435 partial_elts.quick_push (elts[i * partial_nelts + j]);
3436 tree t = gimple_build_vector (seq, &partial_elts);
3437 t = gimple_build (seq, VIEW_CONVERT_EXPR,
3438 TREE_TYPE (new_vector_type), t);
3439
3440 /* (3) Duplicate each ELTS'[I] into a vector of mode VM. */
3441 pieces[i] = gimple_build_vector_from_val (seq, new_vector_type, t);
3442 }
3443
3444 /* (4) Use a tree of VEC_PERM_EXPRs to create a single VM with the
3445 correct byte contents.
3446
3447 We need to repeat the following operation log2(nvectors) times:
3448
3449 out[i * 2] = VEC_PERM_EXPR (in[i], in[i + hi_start], lo_permute);
3450 out[i * 2 + 1] = VEC_PERM_EXPR (in[i], in[i + hi_start], hi_permute);
3451
3452 However, if each input repeats every N elements and the VF is
3453 a multiple of N * 2, the HI result is the same as the LO. */
3454 unsigned int in_start = 0;
3455 unsigned int out_start = nvectors;
3456 unsigned int hi_start = nvectors / 2;
3457 /* A bound on the number of outputs needed to produce NRESULTS results
3458 in the final iteration. */
3459 unsigned int noutputs_bound = nvectors * nresults;
3460 for (unsigned int in_repeat = 1; in_repeat < nvectors; in_repeat *= 2)
3461 {
3462 noutputs_bound /= 2;
3463 unsigned int limit = MIN (noutputs_bound, nvectors);
3464 for (unsigned int i = 0; i < limit; ++i)
3465 {
3466 if ((i & 1) != 0
3467 && multiple_p (TYPE_VECTOR_SUBPARTS (new_vector_type),
3468 2 * in_repeat))
3469 {
3470 pieces[out_start + i] = pieces[out_start + i - 1];
3471 continue;
3472 }
3473
3474 tree output = make_ssa_name (new_vector_type);
3475 tree input1 = pieces[in_start + (i / 2)];
3476 tree input2 = pieces[in_start + (i / 2) + hi_start];
3477 gassign *stmt = gimple_build_assign (output, VEC_PERM_EXPR,
3478 input1, input2,
3479 permutes[i & 1]);
3480 gimple_seq_add_stmt (seq, stmt);
3481 pieces[out_start + i] = output;
3482 }
3483 std::swap (in_start, out_start);
3484 }
3485
3486 /* (5) Use VIEW_CONVERT_EXPR to cast the final VM to the required type. */
3487 results.reserve (nresults);
3488 for (unsigned int i = 0; i < nresults; ++i)
3489 if (i < nvectors)
3490 results.quick_push (gimple_build (seq, VIEW_CONVERT_EXPR, vector_type,
3491 pieces[in_start + i]));
3492 else
3493 results.quick_push (results[i - nvectors]);
3494 }
3495
3496
3497 /* For constant and loop invariant defs of SLP_NODE this function returns
3498 (vector) defs (VEC_OPRNDS) that will be used in the vectorized stmts.
3499 OP_NUM determines if we gather defs for operand 0 or operand 1 of the RHS of
3500 scalar stmts. NUMBER_OF_VECTORS is the number of vector defs to create.
3501 REDUC_INDEX is the index of the reduction operand in the statements, unless
3502 it is -1. */
3503
3504 static void
vect_get_constant_vectors(tree op,slp_tree slp_node,vec<tree> * vec_oprnds,unsigned int op_num,unsigned int number_of_vectors)3505 vect_get_constant_vectors (tree op, slp_tree slp_node,
3506 vec<tree> *vec_oprnds,
3507 unsigned int op_num, unsigned int number_of_vectors)
3508 {
3509 vec<gimple *> stmts = SLP_TREE_SCALAR_STMTS (slp_node);
3510 gimple *stmt = stmts[0];
3511 stmt_vec_info stmt_vinfo = vinfo_for_stmt (stmt);
3512 unsigned HOST_WIDE_INT nunits;
3513 tree vec_cst;
3514 unsigned j, number_of_places_left_in_vector;
3515 tree vector_type;
3516 tree vop;
3517 int group_size = stmts.length ();
3518 unsigned int vec_num, i;
3519 unsigned number_of_copies = 1;
3520 vec<tree> voprnds;
3521 voprnds.create (number_of_vectors);
3522 bool constant_p, is_store;
3523 tree neutral_op = NULL;
3524 enum tree_code code = gimple_expr_code (stmt);
3525 gimple_seq ctor_seq = NULL;
3526 auto_vec<tree, 16> permute_results;
3527
3528 /* Check if vector type is a boolean vector. */
3529 if (VECT_SCALAR_BOOLEAN_TYPE_P (TREE_TYPE (op))
3530 && vect_mask_constant_operand_p (stmt, op_num))
3531 vector_type
3532 = build_same_sized_truth_vector_type (STMT_VINFO_VECTYPE (stmt_vinfo));
3533 else
3534 vector_type = get_vectype_for_scalar_type (TREE_TYPE (op));
3535
3536 if (STMT_VINFO_DATA_REF (stmt_vinfo))
3537 {
3538 is_store = true;
3539 op = gimple_assign_rhs1 (stmt);
3540 }
3541 else
3542 is_store = false;
3543
3544 gcc_assert (op);
3545
3546 /* NUMBER_OF_COPIES is the number of times we need to use the same values in
3547 created vectors. It is greater than 1 if unrolling is performed.
3548
3549 For example, we have two scalar operands, s1 and s2 (e.g., group of
3550 strided accesses of size two), while NUNITS is four (i.e., four scalars
3551 of this type can be packed in a vector). The output vector will contain
3552 two copies of each scalar operand: {s1, s2, s1, s2}. (NUMBER_OF_COPIES
3553 will be 2).
3554
3555 If GROUP_SIZE > NUNITS, the scalars will be split into several vectors
3556 containing the operands.
3557
3558 For example, NUNITS is four as before, and the group size is 8
3559 (s1, s2, ..., s8). We will create two vectors {s1, s2, s3, s4} and
3560 {s5, s6, s7, s8}. */
3561
3562 /* When using duplicate_and_interleave, we just need one element for
3563 each scalar statement. */
3564 if (!TYPE_VECTOR_SUBPARTS (vector_type).is_constant (&nunits))
3565 nunits = group_size;
3566
3567 number_of_copies = nunits * number_of_vectors / group_size;
3568
3569 number_of_places_left_in_vector = nunits;
3570 constant_p = true;
3571 tree_vector_builder elts (vector_type, nunits, 1);
3572 elts.quick_grow (nunits);
3573 bool place_after_defs = false;
3574 for (j = 0; j < number_of_copies; j++)
3575 {
3576 for (i = group_size - 1; stmts.iterate (i, &stmt); i--)
3577 {
3578 if (is_store)
3579 op = gimple_assign_rhs1 (stmt);
3580 else
3581 {
3582 switch (code)
3583 {
3584 case COND_EXPR:
3585 {
3586 tree cond = gimple_assign_rhs1 (stmt);
3587 if (TREE_CODE (cond) == SSA_NAME)
3588 op = gimple_op (stmt, op_num + 1);
3589 else if (op_num == 0 || op_num == 1)
3590 op = TREE_OPERAND (cond, op_num);
3591 else
3592 {
3593 if (op_num == 2)
3594 op = gimple_assign_rhs2 (stmt);
3595 else
3596 op = gimple_assign_rhs3 (stmt);
3597 }
3598 }
3599 break;
3600
3601 case CALL_EXPR:
3602 op = gimple_call_arg (stmt, op_num);
3603 break;
3604
3605 case LSHIFT_EXPR:
3606 case RSHIFT_EXPR:
3607 case LROTATE_EXPR:
3608 case RROTATE_EXPR:
3609 op = gimple_op (stmt, op_num + 1);
3610 /* Unlike the other binary operators, shifts/rotates have
3611 the shift count being int, instead of the same type as
3612 the lhs, so make sure the scalar is the right type if
3613 we are dealing with vectors of
3614 long long/long/short/char. */
3615 if (op_num == 1 && TREE_CODE (op) == INTEGER_CST)
3616 op = fold_convert (TREE_TYPE (vector_type), op);
3617 break;
3618
3619 default:
3620 op = gimple_op (stmt, op_num + 1);
3621 break;
3622 }
3623 }
3624
3625 /* Create 'vect_ = {op0,op1,...,opn}'. */
3626 number_of_places_left_in_vector--;
3627 tree orig_op = op;
3628 if (!types_compatible_p (TREE_TYPE (vector_type), TREE_TYPE (op)))
3629 {
3630 if (CONSTANT_CLASS_P (op))
3631 {
3632 if (VECTOR_BOOLEAN_TYPE_P (vector_type))
3633 {
3634 /* Can't use VIEW_CONVERT_EXPR for booleans because
3635 of possibly different sizes of scalar value and
3636 vector element. */
3637 if (integer_zerop (op))
3638 op = build_int_cst (TREE_TYPE (vector_type), 0);
3639 else if (integer_onep (op))
3640 op = build_all_ones_cst (TREE_TYPE (vector_type));
3641 else
3642 gcc_unreachable ();
3643 }
3644 else
3645 op = fold_unary (VIEW_CONVERT_EXPR,
3646 TREE_TYPE (vector_type), op);
3647 gcc_assert (op && CONSTANT_CLASS_P (op));
3648 }
3649 else
3650 {
3651 tree new_temp = make_ssa_name (TREE_TYPE (vector_type));
3652 gimple *init_stmt;
3653 if (VECTOR_BOOLEAN_TYPE_P (vector_type))
3654 {
3655 tree true_val
3656 = build_all_ones_cst (TREE_TYPE (vector_type));
3657 tree false_val
3658 = build_zero_cst (TREE_TYPE (vector_type));
3659 gcc_assert (INTEGRAL_TYPE_P (TREE_TYPE (op)));
3660 init_stmt = gimple_build_assign (new_temp, COND_EXPR,
3661 op, true_val,
3662 false_val);
3663 }
3664 else
3665 {
3666 op = build1 (VIEW_CONVERT_EXPR, TREE_TYPE (vector_type),
3667 op);
3668 init_stmt
3669 = gimple_build_assign (new_temp, VIEW_CONVERT_EXPR,
3670 op);
3671 }
3672 gimple_seq_add_stmt (&ctor_seq, init_stmt);
3673 op = new_temp;
3674 }
3675 }
3676 elts[number_of_places_left_in_vector] = op;
3677 if (!CONSTANT_CLASS_P (op))
3678 constant_p = false;
3679 if (TREE_CODE (orig_op) == SSA_NAME
3680 && !SSA_NAME_IS_DEFAULT_DEF (orig_op)
3681 && STMT_VINFO_BB_VINFO (stmt_vinfo)
3682 && (STMT_VINFO_BB_VINFO (stmt_vinfo)->bb
3683 == gimple_bb (SSA_NAME_DEF_STMT (orig_op))))
3684 place_after_defs = true;
3685
3686 if (number_of_places_left_in_vector == 0)
3687 {
3688 if (constant_p
3689 ? multiple_p (TYPE_VECTOR_SUBPARTS (vector_type), nunits)
3690 : known_eq (TYPE_VECTOR_SUBPARTS (vector_type), nunits))
3691 vec_cst = gimple_build_vector (&ctor_seq, &elts);
3692 else
3693 {
3694 if (vec_oprnds->is_empty ())
3695 duplicate_and_interleave (&ctor_seq, vector_type, elts,
3696 number_of_vectors,
3697 permute_results);
3698 vec_cst = permute_results[number_of_vectors - j - 1];
3699 }
3700 tree init;
3701 gimple_stmt_iterator gsi;
3702 if (place_after_defs)
3703 {
3704 gsi = gsi_for_stmt
3705 (vect_find_last_scalar_stmt_in_slp (slp_node));
3706 init = vect_init_vector (stmt, vec_cst, vector_type, &gsi);
3707 }
3708 else
3709 init = vect_init_vector (stmt, vec_cst, vector_type, NULL);
3710 if (ctor_seq != NULL)
3711 {
3712 gsi = gsi_for_stmt (SSA_NAME_DEF_STMT (init));
3713 gsi_insert_seq_before (&gsi, ctor_seq, GSI_SAME_STMT);
3714 ctor_seq = NULL;
3715 }
3716 voprnds.quick_push (init);
3717 place_after_defs = false;
3718 number_of_places_left_in_vector = nunits;
3719 constant_p = true;
3720 elts.new_vector (vector_type, nunits, 1);
3721 elts.quick_grow (nunits);
3722 }
3723 }
3724 }
3725
3726 /* Since the vectors are created in the reverse order, we should invert
3727 them. */
3728 vec_num = voprnds.length ();
3729 for (j = vec_num; j != 0; j--)
3730 {
3731 vop = voprnds[j - 1];
3732 vec_oprnds->quick_push (vop);
3733 }
3734
3735 voprnds.release ();
3736
3737 /* In case that VF is greater than the unrolling factor needed for the SLP
3738 group of stmts, NUMBER_OF_VECTORS to be created is greater than
3739 NUMBER_OF_SCALARS/NUNITS or NUNITS/NUMBER_OF_SCALARS, and hence we have
3740 to replicate the vectors. */
3741 while (number_of_vectors > vec_oprnds->length ())
3742 {
3743 tree neutral_vec = NULL;
3744
3745 if (neutral_op)
3746 {
3747 if (!neutral_vec)
3748 neutral_vec = build_vector_from_val (vector_type, neutral_op);
3749
3750 vec_oprnds->quick_push (neutral_vec);
3751 }
3752 else
3753 {
3754 for (i = 0; vec_oprnds->iterate (i, &vop) && i < vec_num; i++)
3755 vec_oprnds->quick_push (vop);
3756 }
3757 }
3758 }
3759
3760
3761 /* Get vectorized definitions from SLP_NODE that contains corresponding
3762 vectorized def-stmts. */
3763
3764 static void
vect_get_slp_vect_defs(slp_tree slp_node,vec<tree> * vec_oprnds)3765 vect_get_slp_vect_defs (slp_tree slp_node, vec<tree> *vec_oprnds)
3766 {
3767 tree vec_oprnd;
3768 gimple *vec_def_stmt;
3769 unsigned int i;
3770
3771 gcc_assert (SLP_TREE_VEC_STMTS (slp_node).exists ());
3772
3773 FOR_EACH_VEC_ELT (SLP_TREE_VEC_STMTS (slp_node), i, vec_def_stmt)
3774 {
3775 gcc_assert (vec_def_stmt);
3776 if (gimple_code (vec_def_stmt) == GIMPLE_PHI)
3777 vec_oprnd = gimple_phi_result (vec_def_stmt);
3778 else
3779 vec_oprnd = gimple_get_lhs (vec_def_stmt);
3780 vec_oprnds->quick_push (vec_oprnd);
3781 }
3782 }
3783
3784
3785 /* Get vectorized definitions for SLP_NODE.
3786 If the scalar definitions are loop invariants or constants, collect them and
3787 call vect_get_constant_vectors() to create vector stmts.
3788 Otherwise, the def-stmts must be already vectorized and the vectorized stmts
3789 must be stored in the corresponding child of SLP_NODE, and we call
3790 vect_get_slp_vect_defs () to retrieve them. */
3791
3792 void
vect_get_slp_defs(vec<tree> ops,slp_tree slp_node,vec<vec<tree>> * vec_oprnds)3793 vect_get_slp_defs (vec<tree> ops, slp_tree slp_node,
3794 vec<vec<tree> > *vec_oprnds)
3795 {
3796 gimple *first_stmt;
3797 int number_of_vects = 0, i;
3798 unsigned int child_index = 0;
3799 HOST_WIDE_INT lhs_size_unit, rhs_size_unit;
3800 slp_tree child = NULL;
3801 vec<tree> vec_defs;
3802 tree oprnd;
3803 bool vectorized_defs;
3804
3805 first_stmt = SLP_TREE_SCALAR_STMTS (slp_node)[0];
3806 FOR_EACH_VEC_ELT (ops, i, oprnd)
3807 {
3808 /* For each operand we check if it has vectorized definitions in a child
3809 node or we need to create them (for invariants and constants). We
3810 check if the LHS of the first stmt of the next child matches OPRND.
3811 If it does, we found the correct child. Otherwise, we call
3812 vect_get_constant_vectors (), and not advance CHILD_INDEX in order
3813 to check this child node for the next operand. */
3814 vectorized_defs = false;
3815 if (SLP_TREE_CHILDREN (slp_node).length () > child_index)
3816 {
3817 child = SLP_TREE_CHILDREN (slp_node)[child_index];
3818
3819 /* We have to check both pattern and original def, if available. */
3820 if (SLP_TREE_DEF_TYPE (child) == vect_internal_def)
3821 {
3822 gimple *first_def = SLP_TREE_SCALAR_STMTS (child)[0];
3823 gimple *related
3824 = STMT_VINFO_RELATED_STMT (vinfo_for_stmt (first_def));
3825 tree first_def_op;
3826
3827 if (gimple_code (first_def) == GIMPLE_PHI)
3828 first_def_op = gimple_phi_result (first_def);
3829 else
3830 first_def_op = gimple_get_lhs (first_def);
3831 if (operand_equal_p (oprnd, first_def_op, 0)
3832 || (related
3833 && operand_equal_p (oprnd, gimple_get_lhs (related), 0)))
3834 {
3835 /* The number of vector defs is determined by the number of
3836 vector statements in the node from which we get those
3837 statements. */
3838 number_of_vects = SLP_TREE_NUMBER_OF_VEC_STMTS (child);
3839 vectorized_defs = true;
3840 child_index++;
3841 }
3842 }
3843 else
3844 child_index++;
3845 }
3846
3847 if (!vectorized_defs)
3848 {
3849 if (i == 0)
3850 {
3851 number_of_vects = SLP_TREE_NUMBER_OF_VEC_STMTS (slp_node);
3852 /* Number of vector stmts was calculated according to LHS in
3853 vect_schedule_slp_instance (), fix it by replacing LHS with
3854 RHS, if necessary. See vect_get_smallest_scalar_type () for
3855 details. */
3856 vect_get_smallest_scalar_type (first_stmt, &lhs_size_unit,
3857 &rhs_size_unit);
3858 if (rhs_size_unit != lhs_size_unit)
3859 {
3860 number_of_vects *= rhs_size_unit;
3861 number_of_vects /= lhs_size_unit;
3862 }
3863 }
3864 }
3865
3866 /* Allocate memory for vectorized defs. */
3867 vec_defs = vNULL;
3868 vec_defs.create (number_of_vects);
3869
3870 /* For reduction defs we call vect_get_constant_vectors (), since we are
3871 looking for initial loop invariant values. */
3872 if (vectorized_defs)
3873 /* The defs are already vectorized. */
3874 vect_get_slp_vect_defs (child, &vec_defs);
3875 else
3876 /* Build vectors from scalar defs. */
3877 vect_get_constant_vectors (oprnd, slp_node, &vec_defs, i,
3878 number_of_vects);
3879
3880 vec_oprnds->quick_push (vec_defs);
3881 }
3882 }
3883
3884 /* Generate vector permute statements from a list of loads in DR_CHAIN.
3885 If ANALYZE_ONLY is TRUE, only check that it is possible to create valid
3886 permute statements for the SLP node NODE of the SLP instance
3887 SLP_NODE_INSTANCE. */
3888
3889 bool
vect_transform_slp_perm_load(slp_tree node,vec<tree> dr_chain,gimple_stmt_iterator * gsi,poly_uint64 vf,slp_instance slp_node_instance,bool analyze_only,unsigned * n_perms)3890 vect_transform_slp_perm_load (slp_tree node, vec<tree> dr_chain,
3891 gimple_stmt_iterator *gsi, poly_uint64 vf,
3892 slp_instance slp_node_instance, bool analyze_only,
3893 unsigned *n_perms)
3894 {
3895 gimple *stmt = SLP_TREE_SCALAR_STMTS (node)[0];
3896 stmt_vec_info stmt_info = vinfo_for_stmt (stmt);
3897 tree mask_element_type = NULL_TREE, mask_type;
3898 int vec_index = 0;
3899 tree vectype = STMT_VINFO_VECTYPE (stmt_info);
3900 int group_size = SLP_INSTANCE_GROUP_SIZE (slp_node_instance);
3901 unsigned int mask_element;
3902 machine_mode mode;
3903 unsigned HOST_WIDE_INT nunits, const_vf;
3904
3905 if (!STMT_VINFO_GROUPED_ACCESS (stmt_info))
3906 return false;
3907
3908 stmt_info = vinfo_for_stmt (GROUP_FIRST_ELEMENT (stmt_info));
3909
3910 mode = TYPE_MODE (vectype);
3911
3912 /* At the moment, all permutations are represented using per-element
3913 indices, so we can't cope with variable vector lengths or
3914 vectorization factors. */
3915 if (!TYPE_VECTOR_SUBPARTS (vectype).is_constant (&nunits)
3916 || !vf.is_constant (&const_vf))
3917 return false;
3918
3919 /* The generic VEC_PERM_EXPR code always uses an integral type of the
3920 same size as the vector element being permuted. */
3921 mask_element_type = lang_hooks.types.type_for_mode
3922 (int_mode_for_mode (TYPE_MODE (TREE_TYPE (vectype))).require (), 1);
3923 mask_type = get_vectype_for_scalar_type (mask_element_type);
3924 vec_perm_builder mask (nunits, nunits, 1);
3925 mask.quick_grow (nunits);
3926 vec_perm_indices indices;
3927
3928 /* Initialize the vect stmts of NODE to properly insert the generated
3929 stmts later. */
3930 if (! analyze_only)
3931 for (unsigned i = SLP_TREE_VEC_STMTS (node).length ();
3932 i < SLP_TREE_NUMBER_OF_VEC_STMTS (node); i++)
3933 SLP_TREE_VEC_STMTS (node).quick_push (NULL);
3934
3935 /* Generate permutation masks for every NODE. Number of masks for each NODE
3936 is equal to GROUP_SIZE.
3937 E.g., we have a group of three nodes with three loads from the same
3938 location in each node, and the vector size is 4. I.e., we have a
3939 a0b0c0a1b1c1... sequence and we need to create the following vectors:
3940 for a's: a0a0a0a1 a1a1a2a2 a2a3a3a3
3941 for b's: b0b0b0b1 b1b1b2b2 b2b3b3b3
3942 ...
3943
3944 The masks for a's should be: {0,0,0,3} {3,3,6,6} {6,9,9,9}.
3945 The last mask is illegal since we assume two operands for permute
3946 operation, and the mask element values can't be outside that range.
3947 Hence, the last mask must be converted into {2,5,5,5}.
3948 For the first two permutations we need the first and the second input
3949 vectors: {a0,b0,c0,a1} and {b1,c1,a2,b2}, and for the last permutation
3950 we need the second and the third vectors: {b1,c1,a2,b2} and
3951 {c2,a3,b3,c3}. */
3952
3953 int vect_stmts_counter = 0;
3954 unsigned int index = 0;
3955 int first_vec_index = -1;
3956 int second_vec_index = -1;
3957 bool noop_p = true;
3958 *n_perms = 0;
3959
3960 for (unsigned int j = 0; j < const_vf; j++)
3961 {
3962 for (int k = 0; k < group_size; k++)
3963 {
3964 unsigned int i = (SLP_TREE_LOAD_PERMUTATION (node)[k]
3965 + j * STMT_VINFO_GROUP_SIZE (stmt_info));
3966 vec_index = i / nunits;
3967 mask_element = i % nunits;
3968 if (vec_index == first_vec_index
3969 || first_vec_index == -1)
3970 {
3971 first_vec_index = vec_index;
3972 }
3973 else if (vec_index == second_vec_index
3974 || second_vec_index == -1)
3975 {
3976 second_vec_index = vec_index;
3977 mask_element += nunits;
3978 }
3979 else
3980 {
3981 if (dump_enabled_p ())
3982 {
3983 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
3984 "permutation requires at "
3985 "least three vectors ");
3986 dump_gimple_stmt (MSG_MISSED_OPTIMIZATION, TDF_SLIM,
3987 stmt, 0);
3988 }
3989 gcc_assert (analyze_only);
3990 return false;
3991 }
3992
3993 gcc_assert (mask_element < 2 * nunits);
3994 if (mask_element != index)
3995 noop_p = false;
3996 mask[index++] = mask_element;
3997
3998 if (index == nunits && !noop_p)
3999 {
4000 indices.new_vector (mask, 2, nunits);
4001 if (!can_vec_perm_const_p (mode, indices))
4002 {
4003 if (dump_enabled_p ())
4004 {
4005 dump_printf_loc (MSG_MISSED_OPTIMIZATION,
4006 vect_location,
4007 "unsupported vect permute { ");
4008 for (i = 0; i < nunits; ++i)
4009 {
4010 dump_dec (MSG_MISSED_OPTIMIZATION, mask[i]);
4011 dump_printf (MSG_MISSED_OPTIMIZATION, " ");
4012 }
4013 dump_printf (MSG_MISSED_OPTIMIZATION, "}\n");
4014 }
4015 gcc_assert (analyze_only);
4016 return false;
4017 }
4018
4019 ++*n_perms;
4020 }
4021
4022 if (index == nunits)
4023 {
4024 if (!analyze_only)
4025 {
4026 tree mask_vec = NULL_TREE;
4027
4028 if (! noop_p)
4029 mask_vec = vec_perm_indices_to_tree (mask_type, indices);
4030
4031 if (second_vec_index == -1)
4032 second_vec_index = first_vec_index;
4033
4034 /* Generate the permute statement if necessary. */
4035 tree first_vec = dr_chain[first_vec_index];
4036 tree second_vec = dr_chain[second_vec_index];
4037 gimple *perm_stmt;
4038 if (! noop_p)
4039 {
4040 tree perm_dest
4041 = vect_create_destination_var (gimple_assign_lhs (stmt),
4042 vectype);
4043 perm_dest = make_ssa_name (perm_dest);
4044 perm_stmt = gimple_build_assign (perm_dest,
4045 VEC_PERM_EXPR,
4046 first_vec, second_vec,
4047 mask_vec);
4048 vect_finish_stmt_generation (stmt, perm_stmt, gsi);
4049 }
4050 else
4051 /* If mask was NULL_TREE generate the requested
4052 identity transform. */
4053 perm_stmt = SSA_NAME_DEF_STMT (first_vec);
4054
4055 /* Store the vector statement in NODE. */
4056 SLP_TREE_VEC_STMTS (node)[vect_stmts_counter++] = perm_stmt;
4057 }
4058
4059 index = 0;
4060 first_vec_index = -1;
4061 second_vec_index = -1;
4062 noop_p = true;
4063 }
4064 }
4065 }
4066
4067 return true;
4068 }
4069
4070 typedef hash_map <vec <gimple *>, slp_tree,
4071 simple_hashmap_traits <bst_traits, slp_tree> >
4072 scalar_stmts_to_slp_tree_map_t;
4073
4074 /* Vectorize SLP instance tree in postorder. */
4075
4076 static bool
vect_schedule_slp_instance(slp_tree node,slp_instance instance,scalar_stmts_to_slp_tree_map_t * bst_map)4077 vect_schedule_slp_instance (slp_tree node, slp_instance instance,
4078 scalar_stmts_to_slp_tree_map_t *bst_map)
4079 {
4080 gimple *stmt;
4081 bool grouped_store, is_store;
4082 gimple_stmt_iterator si;
4083 stmt_vec_info stmt_info;
4084 unsigned int group_size;
4085 tree vectype;
4086 int i, j;
4087 slp_tree child;
4088
4089 if (SLP_TREE_DEF_TYPE (node) != vect_internal_def)
4090 return false;
4091
4092 /* See if we have already vectorized the same set of stmts and reuse their
4093 vectorized stmts. */
4094 if (slp_tree *leader = bst_map->get (SLP_TREE_SCALAR_STMTS (node)))
4095 {
4096 SLP_TREE_VEC_STMTS (node).safe_splice (SLP_TREE_VEC_STMTS (*leader));
4097 SLP_TREE_NUMBER_OF_VEC_STMTS (node)
4098 = SLP_TREE_NUMBER_OF_VEC_STMTS (*leader);
4099 return false;
4100 }
4101
4102 bst_map->put (SLP_TREE_SCALAR_STMTS (node).copy (), node);
4103 FOR_EACH_VEC_ELT (SLP_TREE_CHILDREN (node), i, child)
4104 vect_schedule_slp_instance (child, instance, bst_map);
4105
4106 /* Push SLP node def-type to stmts. */
4107 FOR_EACH_VEC_ELT (SLP_TREE_CHILDREN (node), i, child)
4108 if (SLP_TREE_DEF_TYPE (child) != vect_internal_def)
4109 FOR_EACH_VEC_ELT (SLP_TREE_SCALAR_STMTS (child), j, stmt)
4110 STMT_VINFO_DEF_TYPE (vinfo_for_stmt (stmt)) = SLP_TREE_DEF_TYPE (child);
4111
4112 stmt = SLP_TREE_SCALAR_STMTS (node)[0];
4113 stmt_info = vinfo_for_stmt (stmt);
4114
4115 /* VECTYPE is the type of the destination. */
4116 vectype = STMT_VINFO_VECTYPE (stmt_info);
4117 poly_uint64 nunits = TYPE_VECTOR_SUBPARTS (vectype);
4118 group_size = SLP_INSTANCE_GROUP_SIZE (instance);
4119
4120 if (!SLP_TREE_VEC_STMTS (node).exists ())
4121 SLP_TREE_VEC_STMTS (node).create (SLP_TREE_NUMBER_OF_VEC_STMTS (node));
4122
4123 if (dump_enabled_p ())
4124 {
4125 dump_printf_loc (MSG_NOTE,vect_location,
4126 "------>vectorizing SLP node starting from: ");
4127 dump_gimple_stmt (MSG_NOTE, TDF_SLIM, stmt, 0);
4128 }
4129
4130 /* Vectorized stmts go before the last scalar stmt which is where
4131 all uses are ready. */
4132 si = gsi_for_stmt (vect_find_last_scalar_stmt_in_slp (node));
4133
4134 /* Mark the first element of the reduction chain as reduction to properly
4135 transform the node. In the analysis phase only the last element of the
4136 chain is marked as reduction. */
4137 if (GROUP_FIRST_ELEMENT (stmt_info) && !STMT_VINFO_GROUPED_ACCESS (stmt_info)
4138 && GROUP_FIRST_ELEMENT (stmt_info) == stmt)
4139 {
4140 STMT_VINFO_DEF_TYPE (stmt_info) = vect_reduction_def;
4141 STMT_VINFO_TYPE (stmt_info) = reduc_vec_info_type;
4142 }
4143
4144 /* Handle two-operation SLP nodes by vectorizing the group with
4145 both operations and then performing a merge. */
4146 if (SLP_TREE_TWO_OPERATORS (node))
4147 {
4148 enum tree_code code0 = gimple_assign_rhs_code (stmt);
4149 enum tree_code ocode = ERROR_MARK;
4150 gimple *ostmt;
4151 vec_perm_builder mask (group_size, group_size, 1);
4152 FOR_EACH_VEC_ELT (SLP_TREE_SCALAR_STMTS (node), i, ostmt)
4153 if (gimple_assign_rhs_code (ostmt) != code0)
4154 {
4155 mask.quick_push (1);
4156 ocode = gimple_assign_rhs_code (ostmt);
4157 }
4158 else
4159 mask.quick_push (0);
4160 if (ocode != ERROR_MARK)
4161 {
4162 vec<gimple *> v0;
4163 vec<gimple *> v1;
4164 unsigned j;
4165 tree tmask = NULL_TREE;
4166 vect_transform_stmt (stmt, &si, &grouped_store, node, instance);
4167 v0 = SLP_TREE_VEC_STMTS (node).copy ();
4168 SLP_TREE_VEC_STMTS (node).truncate (0);
4169 gimple_assign_set_rhs_code (stmt, ocode);
4170 vect_transform_stmt (stmt, &si, &grouped_store, node, instance);
4171 gimple_assign_set_rhs_code (stmt, code0);
4172 v1 = SLP_TREE_VEC_STMTS (node).copy ();
4173 SLP_TREE_VEC_STMTS (node).truncate (0);
4174 tree meltype = build_nonstandard_integer_type
4175 (GET_MODE_BITSIZE (SCALAR_TYPE_MODE (TREE_TYPE (vectype))), 1);
4176 tree mvectype = get_same_sized_vectype (meltype, vectype);
4177 unsigned k = 0, l;
4178 for (j = 0; j < v0.length (); ++j)
4179 {
4180 /* Enforced by vect_build_slp_tree, which rejects variable-length
4181 vectors for SLP_TREE_TWO_OPERATORS. */
4182 unsigned int const_nunits = nunits.to_constant ();
4183 tree_vector_builder melts (mvectype, const_nunits, 1);
4184 for (l = 0; l < const_nunits; ++l)
4185 {
4186 if (k >= group_size)
4187 k = 0;
4188 tree t = build_int_cst (meltype,
4189 mask[k++] * const_nunits + l);
4190 melts.quick_push (t);
4191 }
4192 tmask = melts.build ();
4193
4194 /* ??? Not all targets support a VEC_PERM_EXPR with a
4195 constant mask that would translate to a vec_merge RTX
4196 (with their vec_perm_const_ok). We can either not
4197 vectorize in that case or let veclower do its job.
4198 Unfortunately that isn't too great and at least for
4199 plus/minus we'd eventually like to match targets
4200 vector addsub instructions. */
4201 gimple *vstmt;
4202 vstmt = gimple_build_assign (make_ssa_name (vectype),
4203 VEC_PERM_EXPR,
4204 gimple_assign_lhs (v0[j]),
4205 gimple_assign_lhs (v1[j]), tmask);
4206 vect_finish_stmt_generation (stmt, vstmt, &si);
4207 SLP_TREE_VEC_STMTS (node).quick_push (vstmt);
4208 }
4209 v0.release ();
4210 v1.release ();
4211 return false;
4212 }
4213 }
4214 is_store = vect_transform_stmt (stmt, &si, &grouped_store, node, instance);
4215
4216 /* Restore stmt def-types. */
4217 FOR_EACH_VEC_ELT (SLP_TREE_CHILDREN (node), i, child)
4218 if (SLP_TREE_DEF_TYPE (child) != vect_internal_def)
4219 FOR_EACH_VEC_ELT (SLP_TREE_SCALAR_STMTS (child), j, stmt)
4220 STMT_VINFO_DEF_TYPE (vinfo_for_stmt (stmt)) = vect_internal_def;
4221
4222 return is_store;
4223 }
4224
4225 /* Replace scalar calls from SLP node NODE with setting of their lhs to zero.
4226 For loop vectorization this is done in vectorizable_call, but for SLP
4227 it needs to be deferred until end of vect_schedule_slp, because multiple
4228 SLP instances may refer to the same scalar stmt. */
4229
4230 static void
vect_remove_slp_scalar_calls(slp_tree node)4231 vect_remove_slp_scalar_calls (slp_tree node)
4232 {
4233 gimple *stmt, *new_stmt;
4234 gimple_stmt_iterator gsi;
4235 int i;
4236 slp_tree child;
4237 tree lhs;
4238 stmt_vec_info stmt_info;
4239
4240 if (SLP_TREE_DEF_TYPE (node) != vect_internal_def)
4241 return;
4242
4243 FOR_EACH_VEC_ELT (SLP_TREE_CHILDREN (node), i, child)
4244 vect_remove_slp_scalar_calls (child);
4245
4246 FOR_EACH_VEC_ELT (SLP_TREE_SCALAR_STMTS (node), i, stmt)
4247 {
4248 if (!is_gimple_call (stmt) || gimple_bb (stmt) == NULL)
4249 continue;
4250 stmt_info = vinfo_for_stmt (stmt);
4251 if (stmt_info == NULL
4252 || is_pattern_stmt_p (stmt_info)
4253 || !PURE_SLP_STMT (stmt_info))
4254 continue;
4255 lhs = gimple_call_lhs (stmt);
4256 new_stmt = gimple_build_assign (lhs, build_zero_cst (TREE_TYPE (lhs)));
4257 set_vinfo_for_stmt (new_stmt, stmt_info);
4258 set_vinfo_for_stmt (stmt, NULL);
4259 STMT_VINFO_STMT (stmt_info) = new_stmt;
4260 gsi = gsi_for_stmt (stmt);
4261 gsi_replace (&gsi, new_stmt, false);
4262 SSA_NAME_DEF_STMT (gimple_assign_lhs (new_stmt)) = new_stmt;
4263 }
4264 }
4265
4266 /* Generate vector code for all SLP instances in the loop/basic block. */
4267
4268 bool
vect_schedule_slp(vec_info * vinfo)4269 vect_schedule_slp (vec_info *vinfo)
4270 {
4271 vec<slp_instance> slp_instances;
4272 slp_instance instance;
4273 unsigned int i;
4274 bool is_store = false;
4275
4276
4277 scalar_stmts_to_slp_tree_map_t *bst_map
4278 = new scalar_stmts_to_slp_tree_map_t ();
4279 slp_instances = vinfo->slp_instances;
4280 FOR_EACH_VEC_ELT (slp_instances, i, instance)
4281 {
4282 /* Schedule the tree of INSTANCE. */
4283 is_store = vect_schedule_slp_instance (SLP_INSTANCE_TREE (instance),
4284 instance, bst_map);
4285 if (dump_enabled_p ())
4286 dump_printf_loc (MSG_NOTE, vect_location,
4287 "vectorizing stmts using SLP.\n");
4288 }
4289 delete bst_map;
4290
4291 FOR_EACH_VEC_ELT (slp_instances, i, instance)
4292 {
4293 slp_tree root = SLP_INSTANCE_TREE (instance);
4294 gimple *store;
4295 unsigned int j;
4296 gimple_stmt_iterator gsi;
4297
4298 /* Remove scalar call stmts. Do not do this for basic-block
4299 vectorization as not all uses may be vectorized.
4300 ??? Why should this be necessary? DCE should be able to
4301 remove the stmts itself.
4302 ??? For BB vectorization we can as well remove scalar
4303 stmts starting from the SLP tree root if they have no
4304 uses. */
4305 if (is_a <loop_vec_info> (vinfo))
4306 vect_remove_slp_scalar_calls (root);
4307
4308 for (j = 0; SLP_TREE_SCALAR_STMTS (root).iterate (j, &store)
4309 && j < SLP_INSTANCE_GROUP_SIZE (instance); j++)
4310 {
4311 if (!STMT_VINFO_DATA_REF (vinfo_for_stmt (store)))
4312 break;
4313
4314 if (is_pattern_stmt_p (vinfo_for_stmt (store)))
4315 store = STMT_VINFO_RELATED_STMT (vinfo_for_stmt (store));
4316 /* Free the attached stmt_vec_info and remove the stmt. */
4317 gsi = gsi_for_stmt (store);
4318 unlink_stmt_vdef (store);
4319 gsi_remove (&gsi, true);
4320 release_defs (store);
4321 free_stmt_vec_info (store);
4322 }
4323 }
4324
4325 return is_store;
4326 }
4327