1 /* SLP - Basic Block Vectorization
2 Copyright (C) 2007-2018 Free Software Foundation, Inc.
3 Contributed by Dorit Naishlos <dorit@il.ibm.com>
4 and Ira Rosen <irar@il.ibm.com>
5
6 This file is part of GCC.
7
8 GCC is free software; you can redistribute it and/or modify it under
9 the terms of the GNU General Public License as published by the Free
10 Software Foundation; either version 3, or (at your option) any later
11 version.
12
13 GCC is distributed in the hope that it will be useful, but WITHOUT ANY
14 WARRANTY; without even the implied warranty of MERCHANTABILITY or
15 FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License
16 for more details.
17
18 You should have received a copy of the GNU General Public License
19 along with GCC; see the file COPYING3. If not see
20 <http://www.gnu.org/licenses/>. */
21
22 #include "config.h"
23 #include "system.h"
24 #include "coretypes.h"
25 #include "backend.h"
26 #include "target.h"
27 #include "rtl.h"
28 #include "tree.h"
29 #include "gimple.h"
30 #include "tree-pass.h"
31 #include "ssa.h"
32 #include "optabs-tree.h"
33 #include "insn-config.h"
34 #include "recog.h" /* FIXME: for insn_data */
35 #include "params.h"
36 #include "fold-const.h"
37 #include "stor-layout.h"
38 #include "gimple-iterator.h"
39 #include "cfgloop.h"
40 #include "tree-vectorizer.h"
41 #include "langhooks.h"
42 #include "gimple-walk.h"
43 #include "dbgcnt.h"
44 #include "tree-vector-builder.h"
45 #include "vec-perm-indices.h"
46 #include "gimple-fold.h"
47 #include "internal-fn.h"
48
49
50 /* Recursively free the memory allocated for the SLP tree rooted at NODE. */
51
52 static void
vect_free_slp_tree(slp_tree node)53 vect_free_slp_tree (slp_tree node)
54 {
55 int i;
56 slp_tree child;
57
58 FOR_EACH_VEC_ELT (SLP_TREE_CHILDREN (node), i, child)
59 vect_free_slp_tree (child);
60
61 gimple *stmt;
62 FOR_EACH_VEC_ELT (SLP_TREE_SCALAR_STMTS (node), i, stmt)
63 /* After transform some stmts are removed and thus their vinfo is gone. */
64 if (vinfo_for_stmt (stmt))
65 {
66 gcc_assert (STMT_VINFO_NUM_SLP_USES (vinfo_for_stmt (stmt)) > 0);
67 STMT_VINFO_NUM_SLP_USES (vinfo_for_stmt (stmt))--;
68 }
69
70 SLP_TREE_CHILDREN (node).release ();
71 SLP_TREE_SCALAR_STMTS (node).release ();
72 SLP_TREE_VEC_STMTS (node).release ();
73 SLP_TREE_LOAD_PERMUTATION (node).release ();
74
75 free (node);
76 }
77
78
79 /* Free the memory allocated for the SLP instance. */
80
81 void
vect_free_slp_instance(slp_instance instance)82 vect_free_slp_instance (slp_instance instance)
83 {
84 vect_free_slp_tree (SLP_INSTANCE_TREE (instance));
85 SLP_INSTANCE_LOADS (instance).release ();
86 free (instance);
87 }
88
89
90 /* Create an SLP node for SCALAR_STMTS. */
91
92 static slp_tree
vect_create_new_slp_node(vec<gimple * > scalar_stmts)93 vect_create_new_slp_node (vec<gimple *> scalar_stmts)
94 {
95 slp_tree node;
96 gimple *stmt = scalar_stmts[0];
97 unsigned int nops;
98
99 if (is_gimple_call (stmt))
100 nops = gimple_call_num_args (stmt);
101 else if (is_gimple_assign (stmt))
102 {
103 nops = gimple_num_ops (stmt) - 1;
104 if (gimple_assign_rhs_code (stmt) == COND_EXPR)
105 nops++;
106 }
107 else if (gimple_code (stmt) == GIMPLE_PHI)
108 nops = 0;
109 else
110 return NULL;
111
112 node = XNEW (struct _slp_tree);
113 SLP_TREE_SCALAR_STMTS (node) = scalar_stmts;
114 SLP_TREE_VEC_STMTS (node).create (0);
115 SLP_TREE_CHILDREN (node).create (nops);
116 SLP_TREE_LOAD_PERMUTATION (node) = vNULL;
117 SLP_TREE_TWO_OPERATORS (node) = false;
118 SLP_TREE_DEF_TYPE (node) = vect_internal_def;
119
120 unsigned i;
121 FOR_EACH_VEC_ELT (scalar_stmts, i, stmt)
122 STMT_VINFO_NUM_SLP_USES (vinfo_for_stmt (stmt))++;
123
124 return node;
125 }
126
127
128 /* This structure is used in creation of an SLP tree. Each instance
129 corresponds to the same operand in a group of scalar stmts in an SLP
130 node. */
typedef struct _slp_oprnd_info
{
  /* Def-stmts for the operands, one per scalar stmt in the group.  */
  vec<gimple *> def_stmts;
  /* Information about the first statement, its vector def-type, type, the
     operand itself in case it's constant, and an indication if it's a pattern
     stmt.  */
  /* Scalar type of the first statement's operand.  */
  tree first_op_type;
  /* Vector def-type of the first statement's operand.  */
  enum vect_def_type first_dt;
  /* Whether the first stmt's def came from a recognized pattern.  */
  bool first_pattern;
  /* Same, for the second stmt (allows differing pattern state for
     reduction chains).  */
  bool second_pattern;
} *slp_oprnd_info;
143
144
145 /* Allocate operands info for NOPS operands, and GROUP_SIZE def-stmts for each
146 operand. */
147 static vec<slp_oprnd_info>
vect_create_oprnd_info(int nops,int group_size)148 vect_create_oprnd_info (int nops, int group_size)
149 {
150 int i;
151 slp_oprnd_info oprnd_info;
152 vec<slp_oprnd_info> oprnds_info;
153
154 oprnds_info.create (nops);
155 for (i = 0; i < nops; i++)
156 {
157 oprnd_info = XNEW (struct _slp_oprnd_info);
158 oprnd_info->def_stmts.create (group_size);
159 oprnd_info->first_dt = vect_uninitialized_def;
160 oprnd_info->first_op_type = NULL_TREE;
161 oprnd_info->first_pattern = false;
162 oprnd_info->second_pattern = false;
163 oprnds_info.quick_push (oprnd_info);
164 }
165
166 return oprnds_info;
167 }
168
169
170 /* Free operands info. */
171
172 static void
vect_free_oprnd_info(vec<slp_oprnd_info> & oprnds_info)173 vect_free_oprnd_info (vec<slp_oprnd_info> &oprnds_info)
174 {
175 int i;
176 slp_oprnd_info oprnd_info;
177
178 FOR_EACH_VEC_ELT (oprnds_info, i, oprnd_info)
179 {
180 oprnd_info->def_stmts.release ();
181 XDELETE (oprnd_info);
182 }
183
184 oprnds_info.release ();
185 }
186
187
188 /* Find the place of the data-ref in STMT in the interleaving chain that starts
189 from FIRST_STMT. Return -1 if the data-ref is not a part of the chain. */
190
191 int
vect_get_place_in_interleaving_chain(gimple * stmt,gimple * first_stmt)192 vect_get_place_in_interleaving_chain (gimple *stmt, gimple *first_stmt)
193 {
194 gimple *next_stmt = first_stmt;
195 int result = 0;
196
197 if (first_stmt != GROUP_FIRST_ELEMENT (vinfo_for_stmt (stmt)))
198 return -1;
199
200 do
201 {
202 if (next_stmt == stmt)
203 return result;
204 next_stmt = GROUP_NEXT_ELEMENT (vinfo_for_stmt (next_stmt));
205 if (next_stmt)
206 result += GROUP_GAP (vinfo_for_stmt (next_stmt));
207 }
208 while (next_stmt);
209
210 return -1;
211 }
212
213 /* Check whether it is possible to load COUNT elements of type ELT_MODE
214 using the method implemented by duplicate_and_interleave. Return true
215 if so, returning the number of intermediate vectors in *NVECTORS_OUT
216 (if nonnull) and the type of each intermediate vector in *VECTOR_TYPE_OUT
217 (if nonnull). */
218
bool
can_duplicate_and_interleave_p (unsigned int count, machine_mode elt_mode,
				unsigned int *nvectors_out,
				tree *vector_type_out,
				tree *permutes)
{
  /* Total byte size of one "row" of COUNT elements.  */
  poly_int64 elt_bytes = count * GET_MODE_SIZE (elt_mode);
  poly_int64 nelts;
  unsigned int nvectors = 1;
  /* Try successively wider integer element containers (doubling the
     number of intermediate vectors each time) until one fits a vector
     mode with the required permutes, or until halving is impossible.  */
  for (;;)
    {
      scalar_int_mode int_mode;
      poly_int64 elt_bits = elt_bytes * BITS_PER_UNIT;
      if (multiple_p (current_vector_size, elt_bytes, &nelts)
	  && int_mode_for_size (elt_bits, 0).exists (&int_mode))
	{
	  tree int_type = build_nonstandard_integer_type
	    (GET_MODE_BITSIZE (int_mode), 1);
	  tree vector_type = build_vector_type (int_type, nelts);
	  if (VECTOR_MODE_P (TYPE_MODE (vector_type)))
	    {
	      /* Build the two interleaving (low/high zip) selectors as
		 stepped encodings: 2 elements per pattern, 3 patterns.  */
	      vec_perm_builder sel1 (nelts, 2, 3);
	      vec_perm_builder sel2 (nelts, 2, 3);
	      poly_int64 half_nelts = exact_div (nelts, 2);
	      for (unsigned int i = 0; i < 3; ++i)
		{
		  sel1.quick_push (i);
		  sel1.quick_push (i + nelts);
		  sel2.quick_push (half_nelts + i);
		  sel2.quick_push (half_nelts + i + nelts);
		}
	      vec_perm_indices indices1 (sel1, 2, nelts);
	      vec_perm_indices indices2 (sel2, 2, nelts);
	      if (can_vec_perm_const_p (TYPE_MODE (vector_type), indices1)
		  && can_vec_perm_const_p (TYPE_MODE (vector_type), indices2))
		{
		  if (nvectors_out)
		    *nvectors_out = nvectors;
		  if (vector_type_out)
		    *vector_type_out = vector_type;
		  if (permutes)
		    {
		      permutes[0] = vect_gen_perm_mask_checked (vector_type,
								indices1);
		      permutes[1] = vect_gen_perm_mask_checked (vector_type,
								indices2);
		    }
		  return true;
		}
	    }
	}
      /* Halve the container size; give up once it is no longer evenly
	 divisible.  */
      if (!multiple_p (elt_bytes, 2, &elt_bytes))
	return false;
      nvectors *= 2;
    }
}
275
276 /* Get the defs for the rhs of STMT (collect them in OPRNDS_INFO), check that
277 they are of a valid type and that they match the defs of the first stmt of
278 the SLP group (stored in OPRNDS_INFO). This function tries to match stmts
279 by swapping operands of STMTS[STMT_NUM] when possible. Non-zero *SWAP
280 indicates swap is required for cond_expr stmts. Specifically, *SWAP
281 is 1 if STMT is cond and operands of comparison need to be swapped;
282 *SWAP is 2 if STMT is cond and code of comparison needs to be inverted.
283 If there is any operand swap in this function, *SWAP is set to non-zero
284 value.
285 If there was a fatal error return -1; if the error could be corrected by
286 swapping operands of father node of this one, return 1; if everything is
287 ok return 0. */
static int
vect_get_and_check_slp_defs (vec_info *vinfo, unsigned char *swap,
			     vec<gimple *> stmts, unsigned stmt_num,
			     vec<slp_oprnd_info> *oprnds_info)
{
  gimple *stmt = stmts[stmt_num];
  tree oprnd;
  unsigned int i, number_of_oprnds;
  gimple *def_stmt;
  enum vect_def_type dt = vect_uninitialized_def;
  bool pattern = false;
  slp_oprnd_info oprnd_info;
  /* Index of the first rhs operand within the gimple stmt.  */
  int first_op_idx = 1;
  bool commutative = false;
  bool first_op_cond = false;
  bool first = stmt_num == 0;
  bool second = stmt_num == 1;

  if (is_gimple_call (stmt))
    {
      number_of_oprnds = gimple_call_num_args (stmt);
      /* Call arguments start at gimple operand index 3.  */
      first_op_idx = 3;
    }
  else if (is_gimple_assign (stmt))
    {
      enum tree_code code = gimple_assign_rhs_code (stmt);
      number_of_oprnds = gimple_num_ops (stmt) - 1;
      /* Swap can only be done for cond_expr if asked to, otherwise we
	 could result in different comparison code to the first stmt.  */
      if (gimple_assign_rhs_code (stmt) == COND_EXPR
	  && COMPARISON_CLASS_P (gimple_assign_rhs1 (stmt)))
	{
	  first_op_cond = true;
	  /* The embedded comparison contributes an extra operand.  */
	  number_of_oprnds++;
	}
      else
	commutative = commutative_tree_code (code);
    }
  else
    return -1;

  bool swapped = (*swap != 0);
  gcc_assert (!swapped || first_op_cond);
  for (i = 0; i < number_of_oprnds; i++)
    {
again:
      if (first_op_cond)
	{
	  /* Map indicating how operands of cond_expr should be swapped.  */
	  int maps[3][4] = {{0, 1, 2, 3}, {1, 0, 2, 3}, {0, 1, 3, 2}};
	  int *map = maps[*swap];

	  /* The first two operands are the comparison operands embedded
	     in the condition; the rest are regular gimple operands.  */
	  if (i < 2)
	    oprnd = TREE_OPERAND (gimple_op (stmt, first_op_idx), map[i]);
	  else
	    oprnd = gimple_op (stmt, map[i]);
	}
      else
	/* When a commutative swap is in effect, exchange operands 0
	   and 1 via "swapped ? !i : i".  */
	oprnd = gimple_op (stmt, first_op_idx + (swapped ? !i : i));

      oprnd_info = (*oprnds_info)[i];

      if (!vect_is_simple_use (oprnd, vinfo, &def_stmt, &dt))
	{
	  if (dump_enabled_p ())
	    {
	      dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
			       "Build SLP failed: can't analyze def for ");
	      dump_generic_expr (MSG_MISSED_OPTIMIZATION, TDF_SLIM, oprnd);
	      dump_printf (MSG_MISSED_OPTIMIZATION, "\n");
	    }

	  return -1;
	}

      /* Check if DEF_STMT is a part of a pattern in LOOP and get the def stmt
	 from the pattern.  Check that all the stmts of the node are in the
	 pattern.  */
      if (def_stmt && gimple_bb (def_stmt)
	  && vect_stmt_in_region_p (vinfo, def_stmt)
	  && vinfo_for_stmt (def_stmt)
	  && STMT_VINFO_IN_PATTERN_P (vinfo_for_stmt (def_stmt))
	  && !STMT_VINFO_RELEVANT (vinfo_for_stmt (def_stmt))
	  && !STMT_VINFO_LIVE_P (vinfo_for_stmt (def_stmt)))
	{
	  pattern = true;
	  if (!first && !oprnd_info->first_pattern
	      /* Allow different pattern state for the defs of the
		 first stmt in reduction chains.  */
	      && (oprnd_info->first_dt != vect_reduction_def
		  || (!second && !oprnd_info->second_pattern)))
	    {
	      /* Mismatched pattern state: try a commutative operand swap
		 once before giving up.  */
	      if (i == 0
		  && !swapped
		  && commutative)
		{
		  swapped = true;
		  goto again;
		}

	      if (dump_enabled_p ())
		{
		  dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
				   "Build SLP failed: some of the stmts"
				   " are in a pattern, and others are not ");
		  dump_generic_expr (MSG_MISSED_OPTIMIZATION, TDF_SLIM, oprnd);
		  dump_printf (MSG_MISSED_OPTIMIZATION, "\n");
		}

	      return 1;
	    }

	  /* Follow through to the pattern stmt replacing DEF_STMT.  */
	  def_stmt = STMT_VINFO_RELATED_STMT (vinfo_for_stmt (def_stmt));
	  dt = STMT_VINFO_DEF_TYPE (vinfo_for_stmt (def_stmt));

	  if (dt == vect_unknown_def_type)
	    {
	      if (dump_enabled_p ())
		dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
				 "Unsupported pattern.\n");
	      return -1;
	    }

	  switch (gimple_code (def_stmt))
	    {
	    case GIMPLE_PHI:
	    case GIMPLE_ASSIGN:
	      break;

	    default:
	      if (dump_enabled_p ())
		dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
				 "unsupported defining stmt:\n");
	      return -1;
	    }
	}

      if (second)
	oprnd_info->second_pattern = pattern;

      if (first)
	{
	  /* First stmt of the group establishes the reference def-type,
	     pattern state and operand type for this operand slot.  */
	  oprnd_info->first_dt = dt;
	  oprnd_info->first_pattern = pattern;
	  oprnd_info->first_op_type = TREE_TYPE (oprnd);
	}
      else
	{
	  /* Not first stmt of the group, check that the def-stmt/s match
	     the def-stmt/s of the first stmt.  Allow different definition
	     types for reduction chains: the first stmt must be a
	     vect_reduction_def (a phi node), and the rest
	     vect_internal_def.  */
	  tree type = TREE_TYPE (oprnd);
	  if ((oprnd_info->first_dt != dt
	       && !(oprnd_info->first_dt == vect_reduction_def
		    && dt == vect_internal_def)
	       && !((oprnd_info->first_dt == vect_external_def
		     || oprnd_info->first_dt == vect_constant_def)
		    && (dt == vect_external_def
			|| dt == vect_constant_def)))
	      || !types_compatible_p (oprnd_info->first_op_type, type))
	    {
	      /* Try swapping operands if we got a mismatch.  */
	      if (i == 0
		  && !swapped
		  && commutative)
		{
		  swapped = true;
		  goto again;
		}

	      if (dump_enabled_p ())
		dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
				 "Build SLP failed: different types\n");

	      return 1;
	    }
	  /* With variable-length vectors, invariant operands must be
	     materialisable via duplicate_and_interleave.  */
	  if ((dt == vect_constant_def
	       || dt == vect_external_def)
	      && !current_vector_size.is_constant ()
	      && (TREE_CODE (type) == BOOLEAN_TYPE
		  || !can_duplicate_and_interleave_p (stmts.length (),
						      TYPE_MODE (type))))
	    {
	      if (dump_enabled_p ())
		{
		  dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
				   "Build SLP failed: invalid type of def "
				   "for variable-length SLP ");
		  dump_generic_expr (MSG_MISSED_OPTIMIZATION, TDF_SLIM, oprnd);
		  dump_printf (MSG_MISSED_OPTIMIZATION, "\n");
		}
	      return -1;
	    }
	}

      /* Check the types of the definitions.  */
      switch (dt)
	{
	case vect_constant_def:
	case vect_external_def:
	  /* Invariants need no def-stmt recording.  */
	  break;

	case vect_reduction_def:
	case vect_induction_def:
	case vect_internal_def:
	  oprnd_info->def_stmts.quick_push (def_stmt);
	  break;

	default:
	  /* FORNOW: Not supported.  */
	  if (dump_enabled_p ())
	    {
	      dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
			       "Build SLP failed: illegal type of def ");
	      dump_generic_expr (MSG_MISSED_OPTIMIZATION, TDF_SLIM, oprnd);
	      dump_printf (MSG_MISSED_OPTIMIZATION, "\n");
	    }

	  return -1;
	}
    }

  /* Swap operands.  */
  if (swapped)
    {
      /* If there are already uses of this stmt in a SLP instance then
         we've committed to the operand order and can't swap it.  */
      if (STMT_VINFO_NUM_SLP_USES (vinfo_for_stmt (stmt)) != 0)
	{
	  if (dump_enabled_p ())
	    {
	      dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
			       "Build SLP failed: cannot swap operands of "
			       "shared stmt ");
	      dump_gimple_stmt (MSG_MISSED_OPTIMIZATION, TDF_SLIM, stmt, 0);
	    }
	  return -1;
	}

      if (first_op_cond)
	{
	  tree cond = gimple_assign_rhs1 (stmt);
	  enum tree_code code = TREE_CODE (cond);

	  /* Swap.  */
	  if (*swap == 1)
	    {
	      /* Exchange the comparison operands and mirror the code.  */
	      swap_ssa_operands (stmt, &TREE_OPERAND (cond, 0),
				 &TREE_OPERAND (cond, 1));
	      TREE_SET_CODE (cond, swap_tree_comparison (code));
	    }
	  /* Invert.  */
	  else
	    {
	      /* Exchange the THEN/ELSE arms and invert the comparison.  */
	      swap_ssa_operands (stmt, gimple_assign_rhs2_ptr (stmt),
				 gimple_assign_rhs3_ptr (stmt));
	      bool honor_nans = HONOR_NANS (TREE_OPERAND (cond, 0));
	      code = invert_tree_comparison (TREE_CODE (cond), honor_nans);
	      gcc_assert (code != ERROR_MARK);
	      TREE_SET_CODE (cond, code);
	    }
	}
      else
	swap_ssa_operands (stmt, gimple_assign_rhs1_ptr (stmt),
			   gimple_assign_rhs2_ptr (stmt));
      if (dump_enabled_p ())
	{
	  dump_printf_loc (MSG_NOTE, vect_location,
			   "swapped operands to match def types in ");
	  dump_gimple_stmt (MSG_NOTE, TDF_SLIM, stmt, 0);
	}
    }

  /* Report back whether any swap was performed.  */
  *swap = swapped;
  return 0;
}
566
567 /* A subroutine of vect_build_slp_tree for checking VECTYPE, which is the
568 caller's attempt to find the vector type in STMT with the narrowest
569 element type. Return true if VECTYPE is nonnull and if it is valid
570 for VINFO. When returning true, update MAX_NUNITS to reflect the
571 number of units in VECTYPE. VINFO, GORUP_SIZE and MAX_NUNITS are
572 as for vect_build_slp_tree. */
573
574 static bool
vect_record_max_nunits(vec_info * vinfo,gimple * stmt,unsigned int group_size,tree vectype,poly_uint64 * max_nunits)575 vect_record_max_nunits (vec_info *vinfo, gimple *stmt, unsigned int group_size,
576 tree vectype, poly_uint64 *max_nunits)
577 {
578 if (!vectype)
579 {
580 if (dump_enabled_p ())
581 {
582 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
583 "Build SLP failed: unsupported data-type in ");
584 dump_gimple_stmt (MSG_MISSED_OPTIMIZATION, TDF_SLIM, stmt, 0);
585 dump_printf (MSG_MISSED_OPTIMIZATION, "\n");
586 }
587 /* Fatal mismatch. */
588 return false;
589 }
590
591 /* If populating the vector type requires unrolling then fail
592 before adjusting *max_nunits for basic-block vectorization. */
593 poly_uint64 nunits = TYPE_VECTOR_SUBPARTS (vectype);
594 unsigned HOST_WIDE_INT const_nunits;
595 if (is_a <bb_vec_info> (vinfo)
596 && (!nunits.is_constant (&const_nunits)
597 || const_nunits > group_size))
598 {
599 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
600 "Build SLP failed: unrolling required "
601 "in basic block SLP\n");
602 /* Fatal mismatch. */
603 return false;
604 }
605
606 /* In case of multiple types we need to detect the smallest type. */
607 vect_update_max_nunits (max_nunits, vectype);
608 return true;
609 }
610
611 /* Verify if the scalar stmts STMTS are isomorphic, require data
612 permutation or are of unsupported types of operation. Return
613 true if they are, otherwise return false and indicate in *MATCHES
614 which stmts are not isomorphic to the first one. If MATCHES[0]
615 is false then this indicates the comparison could not be
616 carried out or the stmts will never be vectorized by SLP.
617
618 Note COND_EXPR is possibly ismorphic to another one after swapping its
619 operands. Set SWAP[i] to 1 if stmt I is COND_EXPR and isomorphic to
620 the first stmt by swapping the two operands of comparison; set SWAP[i]
621 to 2 if stmt I is isormorphic to the first stmt by inverting the code
622 of comparison. Take A1 >= B1 ? X1 : Y1 as an exmple, it can be swapped
623 to (B1 <= A1 ? X1 : Y1); or be inverted to (A1 < B1) ? Y1 : X1. */
624
static bool
vect_build_slp_tree_1 (vec_info *vinfo, unsigned char *swap,
		       vec<gimple *> stmts, unsigned int group_size,
		       unsigned nops, poly_uint64 *max_nunits,
		       bool *matches, bool *two_operators)
{
  unsigned int i;
  gimple *first_stmt = stmts[0], *stmt = stmts[0];
  enum tree_code first_stmt_code = ERROR_MARK;
  /* Second operation code seen, for the mixed plus/minus case.  */
  enum tree_code alt_stmt_code = ERROR_MARK;
  enum tree_code rhs_code = ERROR_MARK;
  enum tree_code first_cond_code = ERROR_MARK;
  tree lhs;
  bool need_same_oprnds = false;
  tree vectype = NULL_TREE, scalar_type, first_op1 = NULL_TREE;
  optab optab;
  int icode;
  machine_mode optab_op2_mode;
  machine_mode vec_mode;
  HOST_WIDE_INT dummy;
  gimple *first_load = NULL, *prev_first_load = NULL;

  /* For every stmt in NODE find its def stmt/s.  */
  FOR_EACH_VEC_ELT (stmts, i, stmt)
    {
      swap[i] = 0;
      matches[i] = false;

      if (dump_enabled_p ())
	{
	  dump_printf_loc (MSG_NOTE, vect_location, "Build SLP for ");
	  dump_gimple_stmt (MSG_NOTE, TDF_SLIM, stmt, 0);
	}

      /* Fail to vectorize statements marked as unvectorizable.  */
      if (!STMT_VINFO_VECTORIZABLE (vinfo_for_stmt (stmt)))
        {
          if (dump_enabled_p ())
            {
              dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
			       "Build SLP failed: unvectorizable statement ");
              dump_gimple_stmt (MSG_MISSED_OPTIMIZATION, TDF_SLIM, stmt, 0);
            }
	  /* Fatal mismatch.  */
	  matches[0] = false;
          return false;
        }

      lhs = gimple_get_lhs (stmt);
      if (lhs == NULL_TREE)
	{
	  if (dump_enabled_p ())
	    {
	      dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
			       "Build SLP failed: not GIMPLE_ASSIGN nor "
			       "GIMPLE_CALL ");
	      dump_gimple_stmt (MSG_MISSED_OPTIMIZATION, TDF_SLIM, stmt, 0);
	    }
	  /* Fatal mismatch.  */
	  matches[0] = false;
	  return false;
	}

      scalar_type = vect_get_smallest_scalar_type (stmt, &dummy, &dummy);
      vectype = get_vectype_for_scalar_type (scalar_type);
      if (!vect_record_max_nunits (vinfo, stmt, group_size, vectype,
				   max_nunits))
	{
	  /* Fatal mismatch.  */
	  matches[0] = false;
	  return false;
	}

      if (gcall *call_stmt = dyn_cast <gcall *> (stmt))
	{
	  rhs_code = CALL_EXPR;
	  /* Only plain, nothrow, chainless calls can be vectorized.  */
	  if (gimple_call_internal_p (call_stmt)
	      || gimple_call_tail_p (call_stmt)
	      || gimple_call_noreturn_p (call_stmt)
	      || !gimple_call_nothrow_p (call_stmt)
	      || gimple_call_chain (call_stmt))
	    {
	      if (dump_enabled_p ())
		{
		  dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
				   "Build SLP failed: unsupported call type ");
		  dump_gimple_stmt (MSG_MISSED_OPTIMIZATION, TDF_SLIM,
				    call_stmt, 0);
		}
	      /* Fatal mismatch.  */
	      matches[0] = false;
	      return false;
	    }
	}
      else
	rhs_code = gimple_assign_rhs_code (stmt);

      /* Check the operation.  */
      if (i == 0)
	{
	  first_stmt_code = rhs_code;

	  /* Shift arguments should be equal in all the packed stmts for a
	     vector shift with scalar shift operand.  */
	  if (rhs_code == LSHIFT_EXPR || rhs_code == RSHIFT_EXPR
	      || rhs_code == LROTATE_EXPR
	      || rhs_code == RROTATE_EXPR)
	    {
	      vec_mode = TYPE_MODE (vectype);

	      /* First see if we have a vector/vector shift.  */
	      optab = optab_for_tree_code (rhs_code, vectype,
					   optab_vector);

	      if (!optab
		  || optab_handler (optab, vec_mode) == CODE_FOR_nothing)
		{
		  /* No vector/vector shift, try for a vector/scalar shift.  */
		  optab = optab_for_tree_code (rhs_code, vectype,
					       optab_scalar);

		  if (!optab)
		    {
		      if (dump_enabled_p ())
			dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
					 "Build SLP failed: no optab.\n");
		      /* Fatal mismatch.  */
		      matches[0] = false;
		      return false;
		    }
		  icode = (int) optab_handler (optab, vec_mode);
		  if (icode == CODE_FOR_nothing)
		    {
		      if (dump_enabled_p ())
			dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
					 "Build SLP failed: "
					 "op not supported by target.\n");
		      /* Fatal mismatch.  */
		      matches[0] = false;
		      return false;
		    }
		  optab_op2_mode = insn_data[icode].operand[2].mode;
		  if (!VECTOR_MODE_P (optab_op2_mode))
		    {
		      /* Vector/scalar shift: all stmts must share the same
			 scalar shift amount.  */
		      need_same_oprnds = true;
		      first_op1 = gimple_assign_rhs2 (stmt);
		    }
		}
	    }
	  else if (rhs_code == WIDEN_LSHIFT_EXPR)
            {
              need_same_oprnds = true;
              first_op1 = gimple_assign_rhs2 (stmt);
            }
	}
      else
	{
	  /* Remember the first differing operation code; a later check
	     verifies it is only the allowed plus/minus mix.  */
	  if (first_stmt_code != rhs_code
	      && alt_stmt_code == ERROR_MARK)
	    alt_stmt_code = rhs_code;
	  if (first_stmt_code != rhs_code
	      && (first_stmt_code != IMAGPART_EXPR
		  || rhs_code != REALPART_EXPR)
	      && (first_stmt_code != REALPART_EXPR
		  || rhs_code != IMAGPART_EXPR)
	      /* Handle mismatches in plus/minus by computing both
		 and merging the results.  */
	      && !((first_stmt_code == PLUS_EXPR
		    || first_stmt_code == MINUS_EXPR)
		   && (alt_stmt_code == PLUS_EXPR
		       || alt_stmt_code == MINUS_EXPR)
		   && rhs_code == alt_stmt_code)
	      && !(STMT_VINFO_GROUPED_ACCESS (vinfo_for_stmt (stmt))
		   && (first_stmt_code == ARRAY_REF
		       || first_stmt_code == BIT_FIELD_REF
		       || first_stmt_code == INDIRECT_REF
		       || first_stmt_code == COMPONENT_REF
		       || first_stmt_code == MEM_REF)))
	    {
	      if (dump_enabled_p ())
		{
		  dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
				   "Build SLP failed: different operation "
				   "in stmt ");
		  dump_gimple_stmt (MSG_MISSED_OPTIMIZATION, TDF_SLIM, stmt, 0);
		  dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
				   "original stmt ");
		  dump_gimple_stmt (MSG_MISSED_OPTIMIZATION, TDF_SLIM,
				    first_stmt, 0);
		}
	      /* Mismatch.  */
	      continue;
	    }

	  if (need_same_oprnds
	      && !operand_equal_p (first_op1, gimple_assign_rhs2 (stmt), 0))
	    {
	      if (dump_enabled_p ())
		{
		  dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
				   "Build SLP failed: different shift "
				   "arguments in ");
		  dump_gimple_stmt (MSG_MISSED_OPTIMIZATION, TDF_SLIM, stmt, 0);
		}
	      /* Mismatch.  */
	      continue;
	    }

	  if (rhs_code == CALL_EXPR)
	    {
	      /* Calls must agree in callee, argument count and type.  */
	      gimple *first_stmt = stmts[0];
	      if (gimple_call_num_args (stmt) != nops
		  || !operand_equal_p (gimple_call_fn (first_stmt),
				       gimple_call_fn (stmt), 0)
		  || gimple_call_fntype (first_stmt)
		     != gimple_call_fntype (stmt))
		{
		  if (dump_enabled_p ())
		    {
		      dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
				       "Build SLP failed: different calls in ");
		      dump_gimple_stmt (MSG_MISSED_OPTIMIZATION, TDF_SLIM,
					stmt, 0);
		    }
		  /* Mismatch.  */
		  continue;
		}
	    }
	}

      /* Grouped store or load.  */
      if (STMT_VINFO_GROUPED_ACCESS (vinfo_for_stmt (stmt)))
	{
	  if (REFERENCE_CLASS_P (lhs))
	    {
	      /* Store.  */
	      ;
	    }
	  else
	    {
	      /* Load.  */
	      first_load = GROUP_FIRST_ELEMENT (vinfo_for_stmt (stmt));
              if (prev_first_load)
                {
                  /* Check that there are no loads from different interleaving
                     chains in the same node.  */
                  if (prev_first_load != first_load)
                    {
                      if (dump_enabled_p ())
                        {
                          dump_printf_loc (MSG_MISSED_OPTIMIZATION,
					   vect_location,
					   "Build SLP failed: different "
					   "interleaving chains in one node ");
                          dump_gimple_stmt (MSG_MISSED_OPTIMIZATION, TDF_SLIM,
					    stmt, 0);
                        }
		      /* Mismatch.  */
		      continue;
                    }
                }
              else
                prev_first_load = first_load;
	    }
        } /* Grouped access.  */
      else
	{
	  if (TREE_CODE_CLASS (rhs_code) == tcc_reference)
	    {
	      /* Not grouped load.  */
	      if (dump_enabled_p ())
		{
		  dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
				   "Build SLP failed: not grouped load ");
		  dump_gimple_stmt (MSG_MISSED_OPTIMIZATION, TDF_SLIM, stmt, 0);
		}

	      /* FORNOW: Not grouped loads are not supported.  */
	      /* Fatal mismatch.  */
	      matches[0] = false;
	      return false;
	    }

	  /* Not memory operation.  */
	  if (TREE_CODE_CLASS (rhs_code) != tcc_binary
	      && TREE_CODE_CLASS (rhs_code) != tcc_unary
	      && TREE_CODE_CLASS (rhs_code) != tcc_expression
	      && TREE_CODE_CLASS (rhs_code) != tcc_comparison
	      && rhs_code != CALL_EXPR)
	    {
	      if (dump_enabled_p ())
		{
		  dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
				   "Build SLP failed: operation");
		  dump_printf (MSG_MISSED_OPTIMIZATION, " unsupported ");
		  dump_gimple_stmt (MSG_MISSED_OPTIMIZATION, TDF_SLIM, stmt, 0);
		}
	      /* Fatal mismatch.  */
	      matches[0] = false;
	      return false;
	    }

	  if (rhs_code == COND_EXPR)
	    {
	      /* COND_EXPRs may still match the first stmt after swapping
		 or inverting the embedded comparison; record how in
		 SWAP[i].  */
	      tree cond_expr = gimple_assign_rhs1 (stmt);
	      enum tree_code cond_code = TREE_CODE (cond_expr);
	      enum tree_code swap_code = ERROR_MARK;
	      enum tree_code invert_code = ERROR_MARK;

	      if (i == 0)
		first_cond_code = TREE_CODE (cond_expr);
	      else if (TREE_CODE_CLASS (cond_code) == tcc_comparison)
		{
		  bool honor_nans = HONOR_NANS (TREE_OPERAND (cond_expr, 0));
		  swap_code = swap_tree_comparison (cond_code);
		  invert_code = invert_tree_comparison (cond_code, honor_nans);
		}

	      if (first_cond_code == cond_code)
		;
	      /* Isomorphic can be achieved by swapping.  */
	      else if (first_cond_code == swap_code)
		swap[i] = 1;
	      /* Isomorphic can be achieved by inverting.  */
	      else if (first_cond_code == invert_code)
		swap[i] = 2;
	      else
		{
		  if (dump_enabled_p ())
		    {
		      dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
				       "Build SLP failed: different"
				       " operation");
		      dump_gimple_stmt (MSG_MISSED_OPTIMIZATION, TDF_SLIM,
					stmt, 0);
		    }
		  /* Mismatch.  */
		  continue;
		}
	    }
	}

      matches[i] = true;
    }

  for (i = 0; i < group_size; ++i)
    if (!matches[i])
      return false;

  /* If we allowed a two-operation SLP node verify the target can cope
     with the permute we are going to use.  */
  poly_uint64 nunits = TYPE_VECTOR_SUBPARTS (vectype);
  if (alt_stmt_code != ERROR_MARK
      && TREE_CODE_CLASS (alt_stmt_code) != tcc_reference)
    {
      unsigned HOST_WIDE_INT count;
      if (!nunits.is_constant (&count))
	{
	  if (dump_enabled_p ())
	    dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
			     "Build SLP failed: different operations "
			     "not allowed with variable-length SLP.\n");
	  return false;
	}
      /* Build a blend selector that picks from the second vector for
	 lanes whose stmt uses the alternate operation.  */
      vec_perm_builder sel (count, count, 1);
      for (i = 0; i < count; ++i)
	{
	  unsigned int elt = i;
	  if (gimple_assign_rhs_code (stmts[i % group_size]) == alt_stmt_code)
	    elt += count;
	  sel.quick_push (elt);
	}
      vec_perm_indices indices (sel, 2, count);
      if (!can_vec_perm_const_p (TYPE_MODE (vectype), indices))
	{
	  for (i = 0; i < group_size; ++i)
	    if (gimple_assign_rhs_code (stmts[i]) == alt_stmt_code)
	      {
		matches[i] = false;
		if (dump_enabled_p ())
		  {
		    dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
				     "Build SLP failed: different operation "
				     "in stmt ");
		    dump_gimple_stmt (MSG_MISSED_OPTIMIZATION, TDF_SLIM,
				      stmts[i], 0);
		    dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
				     "original stmt ");
		    dump_gimple_stmt (MSG_MISSED_OPTIMIZATION, TDF_SLIM,
				      first_stmt, 0);
		  }
	      }
	  return false;
	}
      *two_operators = true;
    }

  return true;
}
1024
1025 /* Traits for the hash_set to record failed SLP builds for a stmt set.
1026 Note we never remove apart from at destruction time so we do not
1027 need a special value for deleted that differs from empty. */
/* Hash-set traits keyed on a vector of stmts, hashed by stmt UIDs.  */
struct bst_traits
{
  typedef vec <gimple *> value_type;
  typedef vec <gimple *> compare_type;
  static inline hashval_t hash (value_type);
  static inline bool equal (value_type existing, value_type candidate);
  /* An un-created vec serves as both the empty and the deleted marker,
     since entries are only removed at destruction time.  */
  static inline bool is_empty (value_type x) { return !x.exists (); }
  static inline bool is_deleted (value_type x) { return !x.exists (); }
  static inline void mark_empty (value_type &x) { x.release (); }
  static inline void mark_deleted (value_type &x) { x.release (); }
  static inline void remove (value_type &x) { x.release (); }
};
1040 inline hashval_t
hash(value_type x)1041 bst_traits::hash (value_type x)
1042 {
1043 inchash::hash h;
1044 for (unsigned i = 0; i < x.length (); ++i)
1045 h.add_int (gimple_uid (x[i]));
1046 return h.end ();
1047 }
1048 inline bool
equal(value_type existing,value_type candidate)1049 bst_traits::equal (value_type existing, value_type candidate)
1050 {
1051 if (existing.length () != candidate.length ())
1052 return false;
1053 for (unsigned i = 0; i < existing.length (); ++i)
1054 if (existing[i] != candidate[i])
1055 return false;
1056 return true;
1057 }
1058
1059 typedef hash_set <vec <gimple *>, bst_traits> scalar_stmts_set_t;
1060 static scalar_stmts_set_t *bst_fail;
1061
1062 static slp_tree
1063 vect_build_slp_tree_2 (vec_info *vinfo,
1064 vec<gimple *> stmts, unsigned int group_size,
1065 poly_uint64 *max_nunits,
1066 vec<slp_tree> *loads,
1067 bool *matches, unsigned *npermutes, unsigned *tree_size,
1068 unsigned max_tree_size);
1069
1070 static slp_tree
vect_build_slp_tree(vec_info * vinfo,vec<gimple * > stmts,unsigned int group_size,poly_uint64 * max_nunits,vec<slp_tree> * loads,bool * matches,unsigned * npermutes,unsigned * tree_size,unsigned max_tree_size)1071 vect_build_slp_tree (vec_info *vinfo,
1072 vec<gimple *> stmts, unsigned int group_size,
1073 poly_uint64 *max_nunits, vec<slp_tree> *loads,
1074 bool *matches, unsigned *npermutes, unsigned *tree_size,
1075 unsigned max_tree_size)
1076 {
1077 if (bst_fail->contains (stmts))
1078 return NULL;
1079 slp_tree res = vect_build_slp_tree_2 (vinfo, stmts, group_size, max_nunits,
1080 loads, matches, npermutes, tree_size,
1081 max_tree_size);
1082 /* When SLP build fails for stmts record this, otherwise SLP build
1083 can be exponential in time when we allow to construct parts from
1084 scalars, see PR81723. */
1085 if (! res)
1086 {
1087 vec <gimple *> x;
1088 x.create (stmts.length ());
1089 x.splice (stmts);
1090 bst_fail->add (x);
1091 }
1092 return res;
1093 }
1094
/* Recursively build an SLP tree for the group of scalar stmts STMTS.
   On success return the built SLP node; return NULL if the def-stmts
   are not isomorphic, require a data permutation that cannot be
   handled or use unsupported operations.  MATCHES[I] records whether
   stmt I of the group matched the others.  */
1101
static slp_tree
vect_build_slp_tree_2 (vec_info *vinfo,
		       vec<gimple *> stmts, unsigned int group_size,
		       poly_uint64 *max_nunits,
		       vec<slp_tree> *loads,
		       bool *matches, unsigned *npermutes, unsigned *tree_size,
		       unsigned max_tree_size)
{
  unsigned nops, i, this_tree_size = 0;
  poly_uint64 this_max_nunits = *max_nunits;
  gimple *stmt;
  slp_tree node;

  /* Assume a fatal mismatch until proven otherwise.  */
  matches[0] = false;

  /* Determine the number of operands from the group leader; the other
     stmts were verified isomorphic to it by vect_build_slp_tree_1.  */
  stmt = stmts[0];
  if (is_gimple_call (stmt))
    nops = gimple_call_num_args (stmt);
  else if (is_gimple_assign (stmt))
    {
      nops = gimple_num_ops (stmt) - 1;
      /* A COND_EXPR embeds the comparison as an extra operand.  */
      if (gimple_assign_rhs_code (stmt) == COND_EXPR)
	nops++;
    }
  else if (gimple_code (stmt) == GIMPLE_PHI)
    nops = 0;
  else
    return NULL;

  /* If the SLP node is a PHI (induction or reduction), terminate
     the recursion.  */
  if (gimple_code (stmt) == GIMPLE_PHI)
    {
      tree scalar_type = TREE_TYPE (PHI_RESULT (stmt));
      tree vectype = get_vectype_for_scalar_type (scalar_type);
      if (!vect_record_max_nunits (vinfo, stmt, group_size, vectype,
				   max_nunits))
	return NULL;

      vect_def_type def_type = STMT_VINFO_DEF_TYPE (vinfo_for_stmt (stmt));
      /* Induction from different IVs is not supported.  */
      if (def_type == vect_induction_def)
	{
	  FOR_EACH_VEC_ELT (stmts, i, stmt)
	    if (stmt != stmts[0])
	      return NULL;
	}
      else
	{
	  /* Else def types have to match.  */
	  FOR_EACH_VEC_ELT (stmts, i, stmt)
	    {
	      /* But for reduction chains only check on the first stmt.  */
	      if (GROUP_FIRST_ELEMENT (vinfo_for_stmt (stmt))
		  && GROUP_FIRST_ELEMENT (vinfo_for_stmt (stmt)) != stmt)
		continue;
	      if (STMT_VINFO_DEF_TYPE (vinfo_for_stmt (stmt)) != def_type)
		return NULL;
	    }
	}
      node = vect_create_new_slp_node (stmts);
      return node;
    }


  /* Verify the stmts form an isomorphic group; SWAP records per-stmt
     operand swaps needed for commutative canonicalization.  */
  bool two_operators = false;
  unsigned char *swap = XALLOCAVEC (unsigned char, group_size);
  if (!vect_build_slp_tree_1 (vinfo, swap,
			      stmts, group_size, nops,
			      &this_max_nunits, matches, &two_operators))
    return NULL;

  /* If the SLP node is a load, terminate the recursion.  */
  if (STMT_VINFO_GROUPED_ACCESS (vinfo_for_stmt (stmt))
      && DR_IS_READ (STMT_VINFO_DATA_REF (vinfo_for_stmt (stmt))))
    {
      *max_nunits = this_max_nunits;
      node = vect_create_new_slp_node (stmts);
      loads->safe_push (node);
      return node;
    }

  /* Get at the operands, verifying they are compatible.  */
  vec<slp_oprnd_info> oprnds_info = vect_create_oprnd_info (nops, group_size);
  slp_oprnd_info oprnd_info;
  FOR_EACH_VEC_ELT (stmts, i, stmt)
    {
      /* res == -1 signals a fatal mismatch (recorded on stmt 0).  */
      int res = vect_get_and_check_slp_defs (vinfo, &swap[i],
					     stmts, i, &oprnds_info);
      if (res != 0)
	matches[(res == -1) ? 0 : i] = false;
      if (!matches[0])
	break;
    }
  for (i = 0; i < group_size; ++i)
    if (!matches[i])
      {
	vect_free_oprnd_info (oprnds_info);
	return NULL;
      }

  auto_vec<slp_tree, 4> children;
  auto_vec<slp_tree> this_loads;

  stmt = stmts[0];

  if (tree_size)
    max_tree_size -= *tree_size;

  /* Create SLP_TREE nodes for the definition node/s.  */
  FOR_EACH_VEC_ELT (oprnds_info, i, oprnd_info)
    {
      slp_tree child;
      unsigned old_nloads = this_loads.length ();
      unsigned old_tree_size = this_tree_size;
      unsigned int j;

      /* Constant/external operands need no child node.  */
      if (oprnd_info->first_dt != vect_internal_def
	  && oprnd_info->first_dt != vect_reduction_def
	  && oprnd_info->first_dt != vect_induction_def)
	continue;

      /* Bail out when the tree grows beyond the size limit, freeing
	 everything built so far.  */
      if (++this_tree_size > max_tree_size)
	{
	  FOR_EACH_VEC_ELT (children, j, child)
	    vect_free_slp_tree (child);
	  vect_free_oprnd_info (oprnds_info);
	  return NULL;
	}

      if ((child = vect_build_slp_tree (vinfo, oprnd_info->def_stmts,
					group_size, &this_max_nunits,
					&this_loads, matches, npermutes,
					&this_tree_size,
					max_tree_size)) != NULL)
	{
	  /* If we have all children of child built up from scalars then just
	     throw that away and build it up this node from scalars.  */
	  if (!SLP_TREE_CHILDREN (child).is_empty ()
	      /* ??? Rejecting patterns this way doesn't work.  We'd have to
		 do extra work to cancel the pattern so the uses see the
		 scalar version.  */
	      && !is_pattern_stmt_p
		    (vinfo_for_stmt (SLP_TREE_SCALAR_STMTS (child)[0])))
	    {
	      slp_tree grandchild;

	      /* GRANDCHILD ends up NULL iff no grandchild is internal.  */
	      FOR_EACH_VEC_ELT (SLP_TREE_CHILDREN (child), j, grandchild)
		if (SLP_TREE_DEF_TYPE (grandchild) == vect_internal_def)
		  break;
	      if (!grandchild)
		{
		  /* Roll back.  */
		  this_loads.truncate (old_nloads);
		  this_tree_size = old_tree_size;
		  FOR_EACH_VEC_ELT (SLP_TREE_CHILDREN (child), j, grandchild)
		    vect_free_slp_tree (grandchild);
		  SLP_TREE_CHILDREN (child).truncate (0);

		  dump_printf_loc (MSG_NOTE, vect_location,
				   "Building parent vector operands from "
				   "scalars instead\n");
		  oprnd_info->def_stmts = vNULL;
		  SLP_TREE_DEF_TYPE (child) = vect_external_def;
		  children.safe_push (child);
		  continue;
		}
	    }

	  oprnd_info->def_stmts = vNULL;
	  children.safe_push (child);
	  continue;
	}

      /* If the SLP build failed fatally and we analyze a basic-block
	 simply treat nodes we fail to build as externally defined
	 (and thus build vectors from the scalar defs).
	 The cost model will reject outright expensive cases.
	 ??? This doesn't treat cases where permutation ultimatively
	 fails (or we don't try permutation below).  Ideally we'd
	 even compute a permutation that will end up with the maximum
	 SLP tree size...  */
      if (is_a <bb_vec_info> (vinfo)
	  && !matches[0]
	  /* ??? Rejecting patterns this way doesn't work.  We'd have to
	     do extra work to cancel the pattern so the uses see the
	     scalar version.  */
	  && !is_pattern_stmt_p (vinfo_for_stmt (stmt)))
	{
	  dump_printf_loc (MSG_NOTE, vect_location,
			   "Building vector operands from scalars\n");
	  child = vect_create_new_slp_node (oprnd_info->def_stmts);
	  SLP_TREE_DEF_TYPE (child) = vect_external_def;
	  children.safe_push (child);
	  oprnd_info->def_stmts = vNULL;
	  continue;
	}

      /* If the SLP build for operand zero failed and operand zero
	 and one can be commutated try that for the scalar stmts
	 that failed the match.  */
      if (i == 0
	  /* A first scalar stmt mismatch signals a fatal mismatch.  */
	  && matches[0]
	  /* ??? For COND_EXPRs we can swap the comparison operands
	     as well as the arms under some constraints.  */
	  && nops == 2
	  && oprnds_info[1]->first_dt == vect_internal_def
	  && is_gimple_assign (stmt)
	  /* Swapping operands for reductions breaks assumptions later on.  */
	  && STMT_VINFO_DEF_TYPE (vinfo_for_stmt (stmt)) != vect_reduction_def
	  && STMT_VINFO_DEF_TYPE (vinfo_for_stmt (stmt)) != vect_double_reduction_def
	  /* Do so only if the number of not successful permutes was not more
	     than a cut-off as re-trying the recursive match on
	     possibly each level of the tree would expose exponential
	     behavior.  */
	  && *npermutes < 4)
	{
	  /* See whether we can swap the matching or the non-matching
	     stmt operands.  */
	  bool swap_not_matching = true;
	  do
	    {
	      for (j = 0; j < group_size; ++j)
		{
		  if (matches[j] != !swap_not_matching)
		    continue;
		  gimple *stmt = stmts[j];
		  /* Verify if we can swap operands of this stmt.  */
		  if (!is_gimple_assign (stmt)
		      || !commutative_tree_code (gimple_assign_rhs_code (stmt)))
		    {
		      /* Retry with the other set of stmts once before
			 giving up entirely.  */
		      if (!swap_not_matching)
			goto fail;
		      swap_not_matching = false;
		      break;
		    }
		  /* Verify if we can safely swap or if we committed to a
		     specific operand order already.
		     ??? Instead of modifying GIMPLE stmts here we could
		     record whether we want to swap operands in the SLP
		     node and temporarily do that when processing it
		     (or wrap operand accessors in a helper).  */
		  else if (swap[j] != 0
			   || STMT_VINFO_NUM_SLP_USES (vinfo_for_stmt (stmt)))
		    {
		      if (!swap_not_matching)
			{
			  if (dump_enabled_p ())
			    {
			      dump_printf_loc (MSG_MISSED_OPTIMIZATION,
					       vect_location,
					       "Build SLP failed: cannot swap "
					       "operands of shared stmt ");
			      dump_gimple_stmt (MSG_MISSED_OPTIMIZATION,
						TDF_SLIM, stmts[j], 0);
			    }
			  goto fail;
			}
		      swap_not_matching = false;
		      break;
		    }
		}
	    }
	  while (j != group_size);

	  /* Swap mismatched definition stmts.  */
	  dump_printf_loc (MSG_NOTE, vect_location,
			   "Re-trying with swapped operands of stmts ");
	  for (j = 0; j < group_size; ++j)
	    if (matches[j] == !swap_not_matching)
	      {
		std::swap (oprnds_info[0]->def_stmts[j],
			   oprnds_info[1]->def_stmts[j]);
		dump_printf (MSG_NOTE, "%d ", j);
	      }
	  dump_printf (MSG_NOTE, "\n");
	  /* And try again with scratch 'matches' ...  */
	  bool *tem = XALLOCAVEC (bool, group_size);
	  if ((child = vect_build_slp_tree (vinfo, oprnd_info->def_stmts,
					    group_size, &this_max_nunits,
					    &this_loads, tem, npermutes,
					    &this_tree_size,
					    max_tree_size)) != NULL)
	    {
	      /* ... so if successful we can apply the operand swapping
		 to the GIMPLE IL.  This is necessary because for example
		 vect_get_slp_defs uses operand indexes and thus expects
		 canonical operand order.  This is also necessary even
		 if we end up building the operand from scalars as
		 we'll continue to process swapped operand two.  */
	      for (j = 0; j < group_size; ++j)
		{
		  gimple *stmt = stmts[j];
		  gimple_set_plf (stmt, GF_PLF_1, false);
		}
	      for (j = 0; j < group_size; ++j)
		{
		  gimple *stmt = stmts[j];
		  if (matches[j] == !swap_not_matching)
		    {
		      /* Avoid swapping operands twice.  */
		      if (gimple_plf (stmt, GF_PLF_1))
			continue;
		      swap_ssa_operands (stmt, gimple_assign_rhs1_ptr (stmt),
					 gimple_assign_rhs2_ptr (stmt));
		      gimple_set_plf (stmt, GF_PLF_1, true);
		    }
		}
	      /* Verify we swap all duplicates or none.  */
	      if (flag_checking)
		for (j = 0; j < group_size; ++j)
		  {
		    gimple *stmt = stmts[j];
		    gcc_assert (gimple_plf (stmt, GF_PLF_1)
				== (matches[j] == !swap_not_matching));
		  }

	      /* If we have all children of child built up from scalars then
		 just throw that away and build it up this node from scalars.  */
	      if (!SLP_TREE_CHILDREN (child).is_empty ()
		  /* ??? Rejecting patterns this way doesn't work.  We'd have
		     to do extra work to cancel the pattern so the uses see the
		     scalar version.  */
		  && !is_pattern_stmt_p
			(vinfo_for_stmt (SLP_TREE_SCALAR_STMTS (child)[0])))
		{
		  unsigned int j;
		  slp_tree grandchild;

		  FOR_EACH_VEC_ELT (SLP_TREE_CHILDREN (child), j, grandchild)
		    if (SLP_TREE_DEF_TYPE (grandchild) == vect_internal_def)
		      break;
		  if (!grandchild)
		    {
		      /* Roll back.  */
		      this_loads.truncate (old_nloads);
		      this_tree_size = old_tree_size;
		      FOR_EACH_VEC_ELT (SLP_TREE_CHILDREN (child), j, grandchild)
			vect_free_slp_tree (grandchild);
		      SLP_TREE_CHILDREN (child).truncate (0);

		      dump_printf_loc (MSG_NOTE, vect_location,
				       "Building parent vector operands from "
				       "scalars instead\n");
		      oprnd_info->def_stmts = vNULL;
		      SLP_TREE_DEF_TYPE (child) = vect_external_def;
		      children.safe_push (child);
		      continue;
		    }
		}

	      oprnd_info->def_stmts = vNULL;
	      children.safe_push (child);
	      continue;
	    }

	  ++*npermutes;
	}

fail:
      gcc_assert (child == NULL);
      FOR_EACH_VEC_ELT (children, j, child)
	vect_free_slp_tree (child);
      vect_free_oprnd_info (oprnds_info);
      return NULL;
    }

  vect_free_oprnd_info (oprnds_info);

  /* All operands built successfully -- commit the accumulated state to
     the caller and create the node.  */
  if (tree_size)
    *tree_size += this_tree_size;
  *max_nunits = this_max_nunits;
  loads->safe_splice (this_loads);

  node = vect_create_new_slp_node (stmts);
  SLP_TREE_TWO_OPERATORS (node) = two_operators;
  SLP_TREE_CHILDREN (node).splice (children);
  return node;
}
1482
1483 /* Dump a slp tree NODE using flags specified in DUMP_KIND. */
1484
1485 static void
vect_print_slp_tree(dump_flags_t dump_kind,location_t loc,slp_tree node)1486 vect_print_slp_tree (dump_flags_t dump_kind, location_t loc, slp_tree node)
1487 {
1488 int i;
1489 gimple *stmt;
1490 slp_tree child;
1491
1492 dump_printf_loc (dump_kind, loc, "node%s\n",
1493 SLP_TREE_DEF_TYPE (node) != vect_internal_def
1494 ? " (external)" : "");
1495 FOR_EACH_VEC_ELT (SLP_TREE_SCALAR_STMTS (node), i, stmt)
1496 {
1497 dump_printf_loc (dump_kind, loc, "\tstmt %d ", i);
1498 dump_gimple_stmt (dump_kind, TDF_SLIM, stmt, 0);
1499 }
1500 FOR_EACH_VEC_ELT (SLP_TREE_CHILDREN (node), i, child)
1501 vect_print_slp_tree (dump_kind, loc, child);
1502 }
1503
1504
1505 /* Mark the tree rooted at NODE with MARK (PURE_SLP or HYBRID).
1506 If MARK is HYBRID, it refers to a specific stmt in NODE (the stmt at index
1507 J). Otherwise, MARK is PURE_SLP and J is -1, which indicates that all the
1508 stmts in NODE are to be marked. */
1509
1510 static void
vect_mark_slp_stmts(slp_tree node,enum slp_vect_type mark,int j)1511 vect_mark_slp_stmts (slp_tree node, enum slp_vect_type mark, int j)
1512 {
1513 int i;
1514 gimple *stmt;
1515 slp_tree child;
1516
1517 if (SLP_TREE_DEF_TYPE (node) != vect_internal_def)
1518 return;
1519
1520 FOR_EACH_VEC_ELT (SLP_TREE_SCALAR_STMTS (node), i, stmt)
1521 if (j < 0 || i == j)
1522 STMT_SLP_TYPE (vinfo_for_stmt (stmt)) = mark;
1523
1524 FOR_EACH_VEC_ELT (SLP_TREE_CHILDREN (node), i, child)
1525 vect_mark_slp_stmts (child, mark, j);
1526 }
1527
1528
1529 /* Mark the statements of the tree rooted at NODE as relevant (vect_used). */
1530
1531 static void
vect_mark_slp_stmts_relevant(slp_tree node)1532 vect_mark_slp_stmts_relevant (slp_tree node)
1533 {
1534 int i;
1535 gimple *stmt;
1536 stmt_vec_info stmt_info;
1537 slp_tree child;
1538
1539 if (SLP_TREE_DEF_TYPE (node) != vect_internal_def)
1540 return;
1541
1542 FOR_EACH_VEC_ELT (SLP_TREE_SCALAR_STMTS (node), i, stmt)
1543 {
1544 stmt_info = vinfo_for_stmt (stmt);
1545 gcc_assert (!STMT_VINFO_RELEVANT (stmt_info)
1546 || STMT_VINFO_RELEVANT (stmt_info) == vect_used_in_scope);
1547 STMT_VINFO_RELEVANT (stmt_info) = vect_used_in_scope;
1548 }
1549
1550 FOR_EACH_VEC_ELT (SLP_TREE_CHILDREN (node), i, child)
1551 vect_mark_slp_stmts_relevant (child);
1552 }
1553
1554
1555 /* Rearrange the statements of NODE according to PERMUTATION. */
1556
1557 static void
vect_slp_rearrange_stmts(slp_tree node,unsigned int group_size,vec<unsigned> permutation)1558 vect_slp_rearrange_stmts (slp_tree node, unsigned int group_size,
1559 vec<unsigned> permutation)
1560 {
1561 gimple *stmt;
1562 vec<gimple *> tmp_stmts;
1563 unsigned int i;
1564 slp_tree child;
1565
1566 FOR_EACH_VEC_ELT (SLP_TREE_CHILDREN (node), i, child)
1567 vect_slp_rearrange_stmts (child, group_size, permutation);
1568
1569 gcc_assert (group_size == SLP_TREE_SCALAR_STMTS (node).length ());
1570 tmp_stmts.create (group_size);
1571 tmp_stmts.quick_grow_cleared (group_size);
1572
1573 FOR_EACH_VEC_ELT (SLP_TREE_SCALAR_STMTS (node), i, stmt)
1574 tmp_stmts[permutation[i]] = stmt;
1575
1576 SLP_TREE_SCALAR_STMTS (node).release ();
1577 SLP_TREE_SCALAR_STMTS (node) = tmp_stmts;
1578 }
1579
1580
1581 /* Attempt to reorder stmts in a reduction chain so that we don't
1582 require any load permutation. Return true if that was possible,
1583 otherwise return false. */
1584
static bool
vect_attempt_slp_rearrange_stmts (slp_instance slp_instn)
{
  unsigned int group_size = SLP_INSTANCE_GROUP_SIZE (slp_instn);
  unsigned int i, j;
  unsigned int lidx;
  slp_tree node, load;

  /* Compare all the permutation sequences to the first one.  We know
     that at least one load is permuted.  */
  node = SLP_INSTANCE_LOADS (slp_instn)[0];
  if (!node->load_permutation.exists ())
    return false;
  for (i = 1; SLP_INSTANCE_LOADS (slp_instn).iterate (i, &load); ++i)
    {
      if (!load->load_permutation.exists ())
	return false;
      FOR_EACH_VEC_ELT (load->load_permutation, j, lidx)
	if (lidx != node->load_permutation[j])
	  return false;
    }

  /* Check that the loads in the first sequence are different and there
     are no gaps between them.  I.e. the permutation must be a bijection
     on [0, group_size).  */
  auto_sbitmap load_index (group_size);
  bitmap_clear (load_index);
  FOR_EACH_VEC_ELT (node->load_permutation, i, lidx)
    {
      if (lidx >= group_size)
	return false;
      /* Duplicate index -- not a permutation.  */
      if (bitmap_bit_p (load_index, lidx))
	return false;

      bitmap_set_bit (load_index, lidx);
    }
  /* Any unset bit means a gap in the covered indices.  */
  for (i = 0; i < group_size; i++)
    if (!bitmap_bit_p (load_index, i))
      return false;

  /* This permutation is valid for reduction.  Since the order of the
     statements in the nodes is not important unless they are memory
     accesses, we can rearrange the statements in all the nodes
     according to the order of the loads.  */
  vect_slp_rearrange_stmts (SLP_INSTANCE_TREE (slp_instn), group_size,
			    node->load_permutation);

  /* We are done, no actual permutations need to be generated.  */
  poly_uint64 unrolling_factor = SLP_INSTANCE_UNROLLING_FACTOR (slp_instn);
  FOR_EACH_VEC_ELT (SLP_INSTANCE_LOADS (slp_instn), i, node)
    {
      gimple *first_stmt = SLP_TREE_SCALAR_STMTS (node)[0];
      first_stmt = GROUP_FIRST_ELEMENT (vinfo_for_stmt (first_stmt));
      /* But we have to keep those permutations that are required because
	 of handling of gaps.  */
      if (known_eq (unrolling_factor, 1U)
	  || (group_size == GROUP_SIZE (vinfo_for_stmt (first_stmt))
	      && GROUP_GAP (vinfo_for_stmt (first_stmt)) == 0))
	SLP_TREE_LOAD_PERMUTATION (node).release ();
      else
	/* Reset to the identity permutation; the stmts were already
	   rearranged above.  */
	for (j = 0; j < SLP_TREE_LOAD_PERMUTATION (node).length (); ++j)
	  SLP_TREE_LOAD_PERMUTATION (node)[j] = j;
    }

  return true;
}
1650
1651 /* Check if the required load permutations in the SLP instance
1652 SLP_INSTN are supported. */
1653
static bool
vect_supported_load_permutation_p (slp_instance slp_instn)
{
  unsigned int group_size = SLP_INSTANCE_GROUP_SIZE (slp_instn);
  unsigned int i, j, k, next;
  slp_tree node;
  gimple *stmt, *load, *next_load;

  /* Dump all load permutations (the identity for unpermuted nodes).  */
  if (dump_enabled_p ())
    {
      dump_printf_loc (MSG_NOTE, vect_location, "Load permutation ");
      FOR_EACH_VEC_ELT (SLP_INSTANCE_LOADS (slp_instn), i, node)
	if (node->load_permutation.exists ())
	  FOR_EACH_VEC_ELT (node->load_permutation, j, next)
	    dump_printf (MSG_NOTE, "%d ", next);
	else
	  for (k = 0; k < group_size; ++k)
	    dump_printf (MSG_NOTE, "%d ", k);
      dump_printf (MSG_NOTE, "\n");
    }

  /* In case of reduction every load permutation is allowed, since the order
     of the reduction statements is not important (as opposed to the case of
     grouped stores).  The only condition we need to check is that all the
     load nodes are of the same size and have the same permutation (and then
     rearrange all the nodes of the SLP instance according to this
     permutation).  */

  /* Check that all the load nodes are of the same size.  */
  /* ??? Can't we assert this? */
  FOR_EACH_VEC_ELT (SLP_INSTANCE_LOADS (slp_instn), i, node)
    if (SLP_TREE_SCALAR_STMTS (node).length () != (unsigned) group_size)
      return false;

  node = SLP_INSTANCE_TREE (slp_instn);
  stmt = SLP_TREE_SCALAR_STMTS (node)[0];

  /* Reduction (there are no data-refs in the root).
     In reduction chain the order of the loads is not important.  */
  if (!STMT_VINFO_DATA_REF (vinfo_for_stmt (stmt))
      && !GROUP_FIRST_ELEMENT (vinfo_for_stmt (stmt)))
    vect_attempt_slp_rearrange_stmts (slp_instn);

  /* In basic block vectorization we allow any subchain of an interleaving
     chain.
     FORNOW: not supported in loop SLP because of realignment complications.  */
  if (STMT_VINFO_BB_VINFO (vinfo_for_stmt (stmt)))
    {
      /* Check whether the loads in an instance form a subchain and thus
	 no permutation is necessary.  */
      FOR_EACH_VEC_ELT (SLP_INSTANCE_LOADS (slp_instn), i, node)
	{
	  if (!SLP_TREE_LOAD_PERMUTATION (node).exists ())
	    continue;
	  bool subchain_p = true;
	  next_load = NULL;
	  /* A subchain is a run of consecutive group members with no
	     gaps between adjacent elements.  */
	  FOR_EACH_VEC_ELT (SLP_TREE_SCALAR_STMTS (node), j, load)
	    {
	      if (j != 0
		  && (next_load != load
		      || GROUP_GAP (vinfo_for_stmt (load)) != 1))
		{
		  subchain_p = false;
		  break;
		}
	      next_load = GROUP_NEXT_ELEMENT (vinfo_for_stmt (load));
	    }
	  if (subchain_p)
	    SLP_TREE_LOAD_PERMUTATION (node).release ();
	  else
	    {
	      stmt_vec_info group_info
		= vinfo_for_stmt (SLP_TREE_SCALAR_STMTS (node)[0]);
	      group_info = vinfo_for_stmt (GROUP_FIRST_ELEMENT (group_info));
	      unsigned HOST_WIDE_INT nunits;
	      unsigned k, maxk = 0;
	      FOR_EACH_VEC_ELT (SLP_TREE_LOAD_PERMUTATION (node), j, k)
		if (k > maxk)
		  maxk = k;
	      /* In BB vectorization we may not actually use a loaded vector
		 accessing elements in excess of GROUP_SIZE.  */
	      tree vectype = STMT_VINFO_VECTYPE (group_info);
	      if (!TYPE_VECTOR_SUBPARTS (vectype).is_constant (&nunits)
		  || maxk >= (GROUP_SIZE (group_info) & ~(nunits - 1)))
		{
		  dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
				   "BB vectorization with gaps at the end of "
				   "a load is not supported\n");
		  return false;
		}

	      /* Verify the permutation can be generated.  */
	      vec<tree> tem;
	      unsigned n_perms;
	      if (!vect_transform_slp_perm_load (node, tem, NULL,
						 1, slp_instn, true, &n_perms))
		{
		  dump_printf_loc (MSG_MISSED_OPTIMIZATION,
				   vect_location,
				   "unsupported load permutation\n");
		  return false;
		}
	    }
	}
      return true;
    }

  /* For loop vectorization verify we can generate the permutation.  Be
     conservative about the vectorization factor, there are permutations
     that will use three vector inputs only starting from a specific factor
     and the vectorization factor is not yet final.
     ??? The SLP instance unrolling factor might not be the maximum one.  */
  unsigned n_perms;
  poly_uint64 test_vf
    = force_common_multiple (SLP_INSTANCE_UNROLLING_FACTOR (slp_instn),
			     LOOP_VINFO_VECT_FACTOR
			       (STMT_VINFO_LOOP_VINFO (vinfo_for_stmt (stmt))));
  FOR_EACH_VEC_ELT (SLP_INSTANCE_LOADS (slp_instn), i, node)
    if (node->load_permutation.exists ()
	&& !vect_transform_slp_perm_load (node, vNULL, NULL, test_vf,
					  slp_instn, true, &n_perms))
      return false;

  return true;
}
1779
1780
1781 /* Find the last store in SLP INSTANCE. */
1782
1783 gimple *
vect_find_last_scalar_stmt_in_slp(slp_tree node)1784 vect_find_last_scalar_stmt_in_slp (slp_tree node)
1785 {
1786 gimple *last = NULL, *stmt;
1787
1788 for (int i = 0; SLP_TREE_SCALAR_STMTS (node).iterate (i, &stmt); i++)
1789 {
1790 stmt_vec_info stmt_vinfo = vinfo_for_stmt (stmt);
1791 if (is_pattern_stmt_p (stmt_vinfo))
1792 last = get_later_stmt (STMT_VINFO_RELATED_STMT (stmt_vinfo), last);
1793 else
1794 last = get_later_stmt (stmt, last);
1795 }
1796
1797 return last;
1798 }
1799
1800 /* Compute the cost for the SLP node NODE in the SLP instance INSTANCE. */
1801
static void
vect_analyze_slp_cost_1 (slp_instance instance, slp_tree node,
			 stmt_vector_for_cost *prologue_cost_vec,
			 stmt_vector_for_cost *body_cost_vec,
			 unsigned ncopies_for_cost,
			 scalar_stmts_set_t* visited)
{
  unsigned i, j;
  slp_tree child;
  gimple *stmt;
  stmt_vec_info stmt_info;
  tree lhs;

  /* If we already costed the exact same set of scalar stmts we're done.
     We share the generated vector stmts for those.  */
  if (visited->contains (SLP_TREE_SCALAR_STMTS (node)))
    return;

  /* The set keeps its own copy of the stmt vec (freed via bst_traits).  */
  visited->add (SLP_TREE_SCALAR_STMTS (node).copy ());

  /* Recurse down the SLP tree.  */
  FOR_EACH_VEC_ELT (SLP_TREE_CHILDREN (node), i, child)
    if (SLP_TREE_DEF_TYPE (child) == vect_internal_def)
      vect_analyze_slp_cost_1 (instance, child, prologue_cost_vec,
			       body_cost_vec, ncopies_for_cost, visited);

  /* Look at the first scalar stmt to determine the cost.  */
  stmt = SLP_TREE_SCALAR_STMTS (node)[0];
  stmt_info = vinfo_for_stmt (stmt);
  if (STMT_VINFO_GROUPED_ACCESS (stmt_info))
    {
      vect_memory_access_type memory_access_type
	= (STMT_VINFO_STRIDED_P (stmt_info)
	   ? VMAT_STRIDED_SLP
	   : VMAT_CONTIGUOUS);
      if (DR_IS_WRITE (STMT_VINFO_DATA_REF (stmt_info)))
	vect_model_store_cost (stmt_info, ncopies_for_cost,
			       memory_access_type, VLS_STORE,
			       node, prologue_cost_vec, body_cost_vec);
      else
	{
	  gcc_checking_assert (DR_IS_READ (STMT_VINFO_DATA_REF (stmt_info)));
	  if (SLP_TREE_LOAD_PERMUTATION (node).exists ())
	    {
	      /* If the load is permuted then the alignment is determined by
		 the first group element not by the first scalar stmt DR.  */
	      stmt = GROUP_FIRST_ELEMENT (stmt_info);
	      stmt_info = vinfo_for_stmt (stmt);
	      /* Record the cost for the permutation.  */
	      unsigned n_perms;
	      vect_transform_slp_perm_load (node, vNULL, NULL,
					    ncopies_for_cost, instance, true,
					    &n_perms);
	      record_stmt_cost (body_cost_vec, n_perms, vec_perm,
				stmt_info, 0, vect_body);
	      unsigned assumed_nunits
		= vect_nunits_for_cost (STMT_VINFO_VECTYPE (stmt_info));
	      /* And adjust the number of loads performed.  This handles
		 redundancies as well as loads that are later dead.  */
	      auto_sbitmap perm (GROUP_SIZE (stmt_info));
	      bitmap_clear (perm);
	      for (i = 0; i < SLP_TREE_LOAD_PERMUTATION (node).length (); ++i)
		bitmap_set_bit (perm, SLP_TREE_LOAD_PERMUTATION (node)[i]);
	      /* Count one vector load for each assumed_nunits-sized chunk
		 of the group that contains at least one used element.  */
	      ncopies_for_cost = 0;
	      bool load_seen = false;
	      for (i = 0; i < GROUP_SIZE (stmt_info); ++i)
		{
		  if (i % assumed_nunits == 0)
		    {
		      if (load_seen)
			ncopies_for_cost++;
		      load_seen = false;
		    }
		  if (bitmap_bit_p (perm, i))
		    load_seen = true;
		}
	      if (load_seen)
		ncopies_for_cost++;
	      gcc_assert (ncopies_for_cost
			  <= (GROUP_SIZE (stmt_info) - GROUP_GAP (stmt_info)
			      + assumed_nunits - 1) / assumed_nunits);
	      poly_uint64 uf = SLP_INSTANCE_UNROLLING_FACTOR (instance);
	      ncopies_for_cost *= estimated_poly_value (uf);
	    }
	  /* Record the cost for the vector loads.  */
	  vect_model_load_cost (stmt_info, ncopies_for_cost,
				memory_access_type, node, prologue_cost_vec,
				body_cost_vec);
	  return;
	}
    }
  else if (STMT_VINFO_TYPE (stmt_info) == induc_vec_info_type)
    {
      /* ncopies_for_cost is the number of IVs we generate.  */
      record_stmt_cost (body_cost_vec, ncopies_for_cost, vector_stmt,
			stmt_info, 0, vect_body);

      /* Prologue cost for the initial values and step vector.  */
      record_stmt_cost (prologue_cost_vec, ncopies_for_cost,
			CONSTANT_CLASS_P
			  (STMT_VINFO_LOOP_PHI_EVOLUTION_BASE_UNCHANGED
			     (stmt_info))
			? vector_load : vec_construct,
			stmt_info, 0, vect_prologue);
      record_stmt_cost (prologue_cost_vec, 1,
			CONSTANT_CLASS_P
			  (STMT_VINFO_LOOP_PHI_EVOLUTION_PART (stmt_info))
			? vector_load : vec_construct,
			stmt_info, 0, vect_prologue);

      /* ??? No easy way to get at the actual number of vector stmts
	 to be generated and thus the derived IVs.  */
    }
  else
    {
      record_stmt_cost (body_cost_vec, ncopies_for_cost, vector_stmt,
			stmt_info, 0, vect_body);
      if (SLP_TREE_TWO_OPERATORS (node))
	{
	  /* Two-operator nodes emit both operations plus a blending
	     permutation.  */
	  record_stmt_cost (body_cost_vec, ncopies_for_cost, vector_stmt,
			    stmt_info, 0, vect_body);
	  record_stmt_cost (body_cost_vec, ncopies_for_cost, vec_perm,
			    stmt_info, 0, vect_body);
	}
    }

  /* Push SLP node def-type to stmts.  */
  FOR_EACH_VEC_ELT (SLP_TREE_CHILDREN (node), i, child)
    if (SLP_TREE_DEF_TYPE (child) != vect_internal_def)
      FOR_EACH_VEC_ELT (SLP_TREE_SCALAR_STMTS (child), j, stmt)
	STMT_VINFO_DEF_TYPE (vinfo_for_stmt (stmt)) = SLP_TREE_DEF_TYPE (child);

  /* Scan operands and account for prologue cost of constants/externals.
     ??? This over-estimates cost for multiple uses and should be
     re-engineered.  */
  stmt = SLP_TREE_SCALAR_STMTS (node)[0];
  lhs = gimple_get_lhs (stmt);
  for (i = 0; i < gimple_num_ops (stmt); ++i)
    {
      tree op = gimple_op (stmt, i);
      gimple *def_stmt;
      enum vect_def_type dt;
      if (!op || op == lhs)
	continue;
      if (vect_is_simple_use (op, stmt_info->vinfo, &def_stmt, &dt)
	  && (dt == vect_constant_def || dt == vect_external_def))
	{
	  /* Without looking at the actual initializer a vector of
	     constants can be implemented as load from the constant pool.
	     When all elements are the same we can use a splat.  */
	  tree vectype = get_vectype_for_scalar_type (TREE_TYPE (op));
	  unsigned group_size = SLP_TREE_SCALAR_STMTS (node).length ();
	  unsigned num_vects_to_check;
	  unsigned HOST_WIDE_INT const_nunits;
	  unsigned nelt_limit;
	  if (TYPE_VECTOR_SUBPARTS (vectype).is_constant (&const_nunits)
	      && ! multiple_p (const_nunits, group_size))
	    {
	      num_vects_to_check = SLP_TREE_NUMBER_OF_VEC_STMTS (node);
	      nelt_limit = const_nunits;
	    }
	  else
	    {
	      /* If either the vector has variable length or the vectors
		 are composed of repeated whole groups we only need to
		 cost construction once.  All vectors will be the same.  */
	      num_vects_to_check = 1;
	      nelt_limit = group_size;
	    }
	  tree elt = NULL_TREE;
	  unsigned nelt = 0;
	  for (unsigned j = 0; j < num_vects_to_check * nelt_limit; ++j)
	    {
	      unsigned si = j % group_size;
	      if (nelt == 0)
		elt = gimple_op (SLP_TREE_SCALAR_STMTS (node)[si], i);
	      /* ??? We're just tracking whether all operands of a single
		 vector initializer are the same, ideally we'd check if
		 we emitted the same one already.  */
	      else if (elt != gimple_op (SLP_TREE_SCALAR_STMTS (node)[si], i))
		elt = NULL_TREE;
	      nelt++;
	      if (nelt == nelt_limit)
		{
		  /* ??? We need to pass down stmt_info for a vector type
		     even if it points to the wrong stmt.  */
		  record_stmt_cost (prologue_cost_vec, 1,
				    dt == vect_external_def
				    ? (elt ? scalar_to_vec : vec_construct)
				    : vector_load,
				    stmt_info, 0, vect_prologue);
		  nelt = 0;
		}
	    }
	}
    }

  /* Restore stmt def-types.  */
  FOR_EACH_VEC_ELT (SLP_TREE_CHILDREN (node), i, child)
    if (SLP_TREE_DEF_TYPE (child) != vect_internal_def)
      FOR_EACH_VEC_ELT (SLP_TREE_SCALAR_STMTS (child), j, stmt)
	STMT_VINFO_DEF_TYPE (vinfo_for_stmt (stmt)) = vect_internal_def;
}
2005
/* Compute the cost for the SLP instance INSTANCE and record it in the
   target cost model DATA.  VISITED is passed down to avoid accounting
   sub-trees shared between instances more than once.  */

static void
vect_analyze_slp_cost (slp_instance instance, void *data, scalar_stmts_set_t *visited)
{
  stmt_vector_for_cost body_cost_vec, prologue_cost_vec;
  unsigned ncopies_for_cost;
  stmt_info_for_cost *si;
  unsigned i;

  /* Calculate the number of vector stmts to create based on the unrolling
     factor (number of vectors is 1 if NUNITS >= GROUP_SIZE, and is
     GROUP_SIZE / NUNITS otherwise).  */
  unsigned group_size = SLP_INSTANCE_GROUP_SIZE (instance);
  slp_tree node = SLP_INSTANCE_TREE (instance);
  stmt_vec_info stmt_info = vinfo_for_stmt (SLP_TREE_SCALAR_STMTS (node)[0]);
  /* Get the estimated vectorization factor, which is always one for
     basic-block vectorization.  */
  unsigned int assumed_vf;
  if (STMT_VINFO_LOOP_VINFO (stmt_info))
    assumed_vf = vect_vf_for_cost (STMT_VINFO_LOOP_VINFO (stmt_info));
  else
    assumed_vf = 1;
  /* For reductions look at a reduction operand in case the reduction
     operation is widening like DOT_PROD or SAD.  */
  tree vectype_for_cost = STMT_VINFO_VECTYPE (stmt_info);
  if (!STMT_VINFO_GROUPED_ACCESS (stmt_info))
    {
      gimple *stmt = SLP_TREE_SCALAR_STMTS (node)[0];
      switch (gimple_assign_rhs_code (stmt))
	{
	case DOT_PROD_EXPR:
	case SAD_EXPR:
	  /* The input elements are narrower than the result, so cost
	     based on the input vector type.  */
	  vectype_for_cost = get_vectype_for_scalar_type
	    (TREE_TYPE (gimple_assign_rhs1 (stmt)));
	  break;
	default:;
	}
    }
  /* Number of vector stmts needed to cover GROUP_SIZE * VF scalars.  */
  unsigned int assumed_nunits = vect_nunits_for_cost (vectype_for_cost);
  ncopies_for_cost = (least_common_multiple (assumed_nunits,
					     group_size * assumed_vf)
		      / assumed_nunits);

  prologue_cost_vec.create (10);
  body_cost_vec.create (10);
  vect_analyze_slp_cost_1 (instance, SLP_INSTANCE_TREE (instance),
			   &prologue_cost_vec, &body_cost_vec,
			   ncopies_for_cost, visited);

  /* Record the prologue costs, which were delayed until we were
     sure that SLP was successful.  */
  FOR_EACH_VEC_ELT (prologue_cost_vec, i, si)
    {
      struct _stmt_vec_info *stmt_info
	= si->stmt ? vinfo_for_stmt (si->stmt) : NULL;
      (void) add_stmt_cost (data, si->count, si->kind, stmt_info,
			    si->misalign, vect_prologue);
    }

  /* Record the instance's instructions in the target cost model.  */
  FOR_EACH_VEC_ELT (body_cost_vec, i, si)
    {
      struct _stmt_vec_info *stmt_info
	= si->stmt ? vinfo_for_stmt (si->stmt) : NULL;
      (void) add_stmt_cost (data, si->count, si->kind, stmt_info,
			    si->misalign, vect_body);
    }

  prologue_cost_vec.release ();
  body_cost_vec.release ();
}
2078
2079 /* Splits a group of stores, currently beginning at FIRST_STMT, into two groups:
2080 one (still beginning at FIRST_STMT) of size GROUP1_SIZE (also containing
2081 the first GROUP1_SIZE stmts, since stores are consecutive), the second
2082 containing the remainder.
2083 Return the first stmt in the second group. */
2084
2085 static gimple *
vect_split_slp_store_group(gimple * first_stmt,unsigned group1_size)2086 vect_split_slp_store_group (gimple *first_stmt, unsigned group1_size)
2087 {
2088 stmt_vec_info first_vinfo = vinfo_for_stmt (first_stmt);
2089 gcc_assert (GROUP_FIRST_ELEMENT (first_vinfo) == first_stmt);
2090 gcc_assert (group1_size > 0);
2091 int group2_size = GROUP_SIZE (first_vinfo) - group1_size;
2092 gcc_assert (group2_size > 0);
2093 GROUP_SIZE (first_vinfo) = group1_size;
2094
2095 gimple *stmt = first_stmt;
2096 for (unsigned i = group1_size; i > 1; i--)
2097 {
2098 stmt = GROUP_NEXT_ELEMENT (vinfo_for_stmt (stmt));
2099 gcc_assert (GROUP_GAP (vinfo_for_stmt (stmt)) == 1);
2100 }
2101 /* STMT is now the last element of the first group. */
2102 gimple *group2 = GROUP_NEXT_ELEMENT (vinfo_for_stmt (stmt));
2103 GROUP_NEXT_ELEMENT (vinfo_for_stmt (stmt)) = 0;
2104
2105 GROUP_SIZE (vinfo_for_stmt (group2)) = group2_size;
2106 for (stmt = group2; stmt; stmt = GROUP_NEXT_ELEMENT (vinfo_for_stmt (stmt)))
2107 {
2108 GROUP_FIRST_ELEMENT (vinfo_for_stmt (stmt)) = group2;
2109 gcc_assert (GROUP_GAP (vinfo_for_stmt (stmt)) == 1);
2110 }
2111
2112 /* For the second group, the GROUP_GAP is that before the original group,
2113 plus skipping over the first vector. */
2114 GROUP_GAP (vinfo_for_stmt (group2)) =
2115 GROUP_GAP (first_vinfo) + group1_size;
2116
2117 /* GROUP_GAP of the first group now has to skip over the second group too. */
2118 GROUP_GAP (first_vinfo) += group2_size;
2119
2120 if (dump_enabled_p ())
2121 dump_printf_loc (MSG_NOTE, vect_location, "Split group into %d and %d\n",
2122 group1_size, group2_size);
2123
2124 return group2;
2125 }
2126
2127 /* Calculate the unrolling factor for an SLP instance with GROUP_SIZE
2128 statements and a vector of NUNITS elements. */
2129
2130 static poly_uint64
calculate_unrolling_factor(poly_uint64 nunits,unsigned int group_size)2131 calculate_unrolling_factor (poly_uint64 nunits, unsigned int group_size)
2132 {
2133 return exact_div (common_multiple (nunits, group_size), group_size);
2134 }
2135
/* Analyze an SLP instance starting from a group of grouped stores.  Call
   vect_build_slp_tree to build a tree of packed stmts if possible.
   STMT is the group leader (for stores / reduction chains) or the first
   reduction stmt (for reduction groups).  MAX_TREE_SIZE bounds the size
   of the SLP tree that may be built.
   Return FALSE if it's impossible to SLP any stmt in the loop.  */

static bool
vect_analyze_slp_instance (vec_info *vinfo,
			   gimple *stmt, unsigned max_tree_size)
{
  slp_instance new_instance;
  slp_tree node;
  unsigned int group_size = GROUP_SIZE (vinfo_for_stmt (stmt));
  tree vectype, scalar_type = NULL_TREE;
  gimple *next;
  unsigned int i;
  vec<slp_tree> loads;
  struct data_reference *dr = STMT_VINFO_DATA_REF (vinfo_for_stmt (stmt));
  vec<gimple *> scalar_stmts;

  /* Determine the vector type and the group size.  A stmt with
     GROUP_FIRST_ELEMENT set heads a store group or reduction chain;
     otherwise STMT represents the loop's reduction group.  */
  if (GROUP_FIRST_ELEMENT (vinfo_for_stmt (stmt)))
    {
      if (dr)
	{
	  scalar_type = TREE_TYPE (DR_REF (dr));
	  vectype = get_vectype_for_scalar_type (scalar_type);
	}
      else
	{
	  /* Reduction chain: only valid in loop vectorization.  */
	  gcc_assert (is_a <loop_vec_info> (vinfo));
	  vectype = STMT_VINFO_VECTYPE (vinfo_for_stmt (stmt));
	}

      group_size = GROUP_SIZE (vinfo_for_stmt (stmt));
    }
  else
    {
      /* Reduction group: one stmt per reduction.  */
      gcc_assert (is_a <loop_vec_info> (vinfo));
      vectype = STMT_VINFO_VECTYPE (vinfo_for_stmt (stmt));
      group_size = as_a <loop_vec_info> (vinfo)->reductions.length ();
    }

  if (!vectype)
    {
      if (dump_enabled_p ())
	{
	  dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
			   "Build SLP failed: unsupported data-type ");
	  dump_generic_expr (MSG_MISSED_OPTIMIZATION, TDF_SLIM, scalar_type);
	  dump_printf (MSG_MISSED_OPTIMIZATION, "\n");
	}

      return false;
    }
  poly_uint64 nunits = TYPE_VECTOR_SUBPARTS (vectype);

  /* Create a node (a root of the SLP tree) for the packed grouped stores.  */
  scalar_stmts.create (group_size);
  next = stmt;
  if (GROUP_FIRST_ELEMENT (vinfo_for_stmt (stmt)))
    {
      /* Collect the stores and store them in SLP_TREE_SCALAR_STMTS.
	 For stmts replaced by a pattern, use the pattern stmt instead.  */
      while (next)
	{
	  if (STMT_VINFO_IN_PATTERN_P (vinfo_for_stmt (next))
	      && STMT_VINFO_RELATED_STMT (vinfo_for_stmt (next)))
	    scalar_stmts.safe_push (
		  STMT_VINFO_RELATED_STMT (vinfo_for_stmt (next)));
	  else
            scalar_stmts.safe_push (next);
	  next = GROUP_NEXT_ELEMENT (vinfo_for_stmt (next));
	}
      /* Mark the first element of the reduction chain as reduction to properly
	 transform the node.  In the reduction analysis phase only the last
	 element of the chain is marked as reduction.  */
      if (!STMT_VINFO_GROUPED_ACCESS (vinfo_for_stmt (stmt)))
	STMT_VINFO_DEF_TYPE (vinfo_for_stmt (stmt)) = vect_reduction_def;
    }
  else
    {
      /* Collect reduction statements.  */
      vec<gimple *> reductions = as_a <loop_vec_info> (vinfo)->reductions;
      for (i = 0; reductions.iterate (i, &next); i++)
	scalar_stmts.safe_push (next);
    }

  loads.create (group_size);

  /* Build the tree for the SLP instance.  */
  bool *matches = XALLOCAVEC (bool, group_size);
  unsigned npermutes = 0;
  bst_fail = new scalar_stmts_set_t ();
  poly_uint64 max_nunits = nunits;
  node = vect_build_slp_tree (vinfo, scalar_stmts, group_size,
			      &max_nunits, &loads, matches, &npermutes,
			      NULL, max_tree_size);
  delete bst_fail;
  if (node != NULL)
    {
      /* Calculate the unrolling factor based on the smallest type.  */
      poly_uint64 unrolling_factor
	= calculate_unrolling_factor (max_nunits, group_size);

      if (maybe_ne (unrolling_factor, 1U)
	  && is_a <bb_vec_info> (vinfo))
	{
	  /* Basic blocks cannot be unrolled, so the group size must be
	     a multiple of the (constant) vector element count.  */
	  unsigned HOST_WIDE_INT const_max_nunits;
	  if (!max_nunits.is_constant (&const_max_nunits)
	      || const_max_nunits > group_size)
	    {
	      if (dump_enabled_p ())
		dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
				 "Build SLP failed: store group "
				 "size not a multiple of the vector size "
				 "in basic block SLP\n");
	      vect_free_slp_tree (node);
	      loads.release ();
	      return false;
	    }
	  /* Fatal mismatch.  Record a failure at the last whole-vector
	     boundary so the splitting code below can retry the prefix.  */
	  matches[group_size / const_max_nunits * const_max_nunits] = false;
	  vect_free_slp_tree (node);
	  loads.release ();
	}
      else
	{
	  /* Create a new SLP instance.  */
	  new_instance = XNEW (struct _slp_instance);
	  SLP_INSTANCE_TREE (new_instance) = node;
	  SLP_INSTANCE_GROUP_SIZE (new_instance) = group_size;
	  SLP_INSTANCE_UNROLLING_FACTOR (new_instance) = unrolling_factor;
	  SLP_INSTANCE_LOADS (new_instance) = loads;

	  /* Compute the load permutation.  */
	  slp_tree load_node;
	  bool loads_permuted = false;
	  FOR_EACH_VEC_ELT (loads, i, load_node)
	    {
	      vec<unsigned> load_permutation;
	      int j;
	      gimple *load, *first_stmt;
	      bool this_load_permuted = false;
	      load_permutation.create (group_size);
	      first_stmt = GROUP_FIRST_ELEMENT
		  (vinfo_for_stmt (SLP_TREE_SCALAR_STMTS (load_node)[0]));
	      FOR_EACH_VEC_ELT (SLP_TREE_SCALAR_STMTS (load_node), j, load)
		{
		  int load_place = vect_get_place_in_interleaving_chain
				     (load, first_stmt);
		  gcc_assert (load_place != -1);
		  if (load_place != j)
		    this_load_permuted = true;
		  load_permutation.safe_push (load_place);
		}
	      if (!this_load_permuted
		  /* The load requires permutation when unrolling exposes
		     a gap either because the group is larger than the SLP
		     group-size or because there is a gap between the groups.  */
		  && (known_eq (unrolling_factor, 1U)
		      || (group_size == GROUP_SIZE (vinfo_for_stmt (first_stmt))
			  && GROUP_GAP (vinfo_for_stmt (first_stmt)) == 0)))
		{
		  load_permutation.release ();
		  continue;
		}
	      SLP_TREE_LOAD_PERMUTATION (load_node) = load_permutation;
	      loads_permuted = true;
	    }

	  if (loads_permuted)
	    {
	      if (!vect_supported_load_permutation_p (new_instance))
		{
		  if (dump_enabled_p ())
		    {
		      dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
				       "Build SLP failed: unsupported load "
				       "permutation ");
		      dump_gimple_stmt (MSG_MISSED_OPTIMIZATION,
					TDF_SLIM, stmt, 0);
		    }
		  vect_free_slp_instance (new_instance);
		  return false;
		}
	    }

	  /* If the loads and stores can be handled with load/store-lanes
	     instructions do not generate this SLP instance.  */
	  if (is_a <loop_vec_info> (vinfo)
	      && loads_permuted
	      && dr && vect_store_lanes_supported (vectype, group_size, false))
	    {
	      slp_tree load_node;
	      FOR_EACH_VEC_ELT (loads, i, load_node)
		{
		  gimple *first_stmt = GROUP_FIRST_ELEMENT
		      (vinfo_for_stmt (SLP_TREE_SCALAR_STMTS (load_node)[0]));
		  stmt_vec_info stmt_vinfo = vinfo_for_stmt (first_stmt);
		  /* Use SLP for strided accesses (or if we
		     can't load-lanes).  */
		  if (STMT_VINFO_STRIDED_P (stmt_vinfo)
		      || ! vect_load_lanes_supported
			    (STMT_VINFO_VECTYPE (stmt_vinfo),
			     GROUP_SIZE (stmt_vinfo), false))
		    break;
		}
	      /* Loop ran to completion: every load group supports
		 load-lanes, so prefer that over SLP.  */
	      if (i == loads.length ())
		{
		  if (dump_enabled_p ())
		    dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
				     "Built SLP cancelled: can use "
				     "load/store-lanes\n");
		  vect_free_slp_instance (new_instance);
		  return false;
		}
	    }

	  vinfo->slp_instances.safe_push (new_instance);

	  if (dump_enabled_p ())
	    {
	      dump_printf_loc (MSG_NOTE, vect_location,
			       "Final SLP tree for instance:\n");
	      vect_print_slp_tree (MSG_NOTE, vect_location, node);
	    }

	  return true;
	}
    }
  else
    {
      /* Failed to SLP.  */
      /* Free the allocated memory.  */
      scalar_stmts.release ();
      loads.release ();
    }

  /* For basic block SLP, try to break the group up into multiples of the
     vector size.  */
  unsigned HOST_WIDE_INT const_nunits;
  if (is_a <bb_vec_info> (vinfo)
      && GROUP_FIRST_ELEMENT (vinfo_for_stmt (stmt))
      && STMT_VINFO_GROUPED_ACCESS (vinfo_for_stmt (stmt))
      && nunits.is_constant (&const_nunits))
    {
      /* We consider breaking the group only on VF boundaries from the existing
	 start.  */
      for (i = 0; i < group_size; i++)
	if (!matches[i]) break;

      if (i >= const_nunits && i < group_size)
	{
	  /* Split into two groups at the first vector boundary before i.  */
	  gcc_assert ((const_nunits & (const_nunits - 1)) == 0);
	  unsigned group1_size = i & ~(const_nunits - 1);

	  gimple *rest = vect_split_slp_store_group (stmt, group1_size);
	  bool res = vect_analyze_slp_instance (vinfo, stmt, max_tree_size);
	  /* If the first non-match was in the middle of a vector,
	     skip the rest of that vector.  */
	  if (group1_size < i)
	    {
	      i = group1_size + const_nunits;
	      if (i < group_size)
		rest = vect_split_slp_store_group (rest, const_nunits);
	    }
	  if (i < group_size)
	    res |= vect_analyze_slp_instance (vinfo, rest, max_tree_size);
	  return res;
	}
      /* Even though the first vector did not all match, we might be able to SLP
	 (some) of the remainder.  FORNOW ignore this possibility.  */
    }

  return false;
}
2410
2411
2412 /* Check if there are stmts in the loop can be vectorized using SLP. Build SLP
2413 trees of packed scalar stmts if SLP is possible. */
2414
2415 bool
vect_analyze_slp(vec_info * vinfo,unsigned max_tree_size)2416 vect_analyze_slp (vec_info *vinfo, unsigned max_tree_size)
2417 {
2418 unsigned int i;
2419 gimple *first_element;
2420
2421 if (dump_enabled_p ())
2422 dump_printf_loc (MSG_NOTE, vect_location, "=== vect_analyze_slp ===\n");
2423
2424 /* Find SLP sequences starting from groups of grouped stores. */
2425 FOR_EACH_VEC_ELT (vinfo->grouped_stores, i, first_element)
2426 vect_analyze_slp_instance (vinfo, first_element, max_tree_size);
2427
2428 if (loop_vec_info loop_vinfo = dyn_cast <loop_vec_info> (vinfo))
2429 {
2430 if (loop_vinfo->reduction_chains.length () > 0)
2431 {
2432 /* Find SLP sequences starting from reduction chains. */
2433 FOR_EACH_VEC_ELT (loop_vinfo->reduction_chains, i, first_element)
2434 if (! vect_analyze_slp_instance (vinfo, first_element,
2435 max_tree_size))
2436 {
2437 /* Dissolve reduction chain group. */
2438 gimple *next, *stmt = first_element;
2439 while (stmt)
2440 {
2441 stmt_vec_info vinfo = vinfo_for_stmt (stmt);
2442 next = GROUP_NEXT_ELEMENT (vinfo);
2443 GROUP_FIRST_ELEMENT (vinfo) = NULL;
2444 GROUP_NEXT_ELEMENT (vinfo) = NULL;
2445 stmt = next;
2446 }
2447 STMT_VINFO_DEF_TYPE (vinfo_for_stmt (first_element))
2448 = vect_internal_def;
2449 }
2450 }
2451
2452 /* Find SLP sequences starting from groups of reductions. */
2453 if (loop_vinfo->reductions.length () > 1)
2454 vect_analyze_slp_instance (vinfo, loop_vinfo->reductions[0],
2455 max_tree_size);
2456 }
2457
2458 return true;
2459 }
2460
2461
2462 /* For each possible SLP instance decide whether to SLP it and calculate overall
2463 unrolling factor needed to SLP the loop. Return TRUE if decided to SLP at
2464 least one instance. */
2465
2466 bool
vect_make_slp_decision(loop_vec_info loop_vinfo)2467 vect_make_slp_decision (loop_vec_info loop_vinfo)
2468 {
2469 unsigned int i;
2470 poly_uint64 unrolling_factor = 1;
2471 vec<slp_instance> slp_instances = LOOP_VINFO_SLP_INSTANCES (loop_vinfo);
2472 slp_instance instance;
2473 int decided_to_slp = 0;
2474
2475 if (dump_enabled_p ())
2476 dump_printf_loc (MSG_NOTE, vect_location, "=== vect_make_slp_decision ==="
2477 "\n");
2478
2479 FOR_EACH_VEC_ELT (slp_instances, i, instance)
2480 {
2481 /* FORNOW: SLP if you can. */
2482 /* All unroll factors have the form current_vector_size * X for some
2483 rational X, so they must have a common multiple. */
2484 unrolling_factor
2485 = force_common_multiple (unrolling_factor,
2486 SLP_INSTANCE_UNROLLING_FACTOR (instance));
2487
2488 /* Mark all the stmts that belong to INSTANCE as PURE_SLP stmts. Later we
2489 call vect_detect_hybrid_slp () to find stmts that need hybrid SLP and
2490 loop-based vectorization. Such stmts will be marked as HYBRID. */
2491 vect_mark_slp_stmts (SLP_INSTANCE_TREE (instance), pure_slp, -1);
2492 decided_to_slp++;
2493 }
2494
2495 LOOP_VINFO_SLP_UNROLLING_FACTOR (loop_vinfo) = unrolling_factor;
2496
2497 if (decided_to_slp && dump_enabled_p ())
2498 {
2499 dump_printf_loc (MSG_NOTE, vect_location,
2500 "Decided to SLP %d instances. Unrolling factor ",
2501 decided_to_slp);
2502 dump_dec (MSG_NOTE, unrolling_factor);
2503 dump_printf (MSG_NOTE, "\n");
2504 }
2505
2506 return (decided_to_slp > 0);
2507 }
2508
2509
/* Find stmts that must be both vectorized and SLPed (since they feed stmts that
   can't be SLPed) in the tree rooted at NODE.  Mark such stmts as HYBRID.
   I is the lane (index into SLP_TREE_SCALAR_STMTS) being examined; STYPE
   is the type propagated down from the parent (hybrid, once set, is
   sticky for the whole sub-tree).  */

static void
vect_detect_hybrid_slp_stmts (slp_tree node, unsigned i, slp_vect_type stype)
{
  gimple *stmt = SLP_TREE_SCALAR_STMTS (node)[i];
  imm_use_iterator imm_iter;
  gimple *use_stmt;
  stmt_vec_info use_vinfo, stmt_vinfo = vinfo_for_stmt (stmt);
  slp_tree child;
  loop_vec_info loop_vinfo = STMT_VINFO_LOOP_VINFO (stmt_vinfo);
  struct loop *loop = LOOP_VINFO_LOOP (loop_vinfo);
  int j;

  /* Propagate hybrid down the SLP tree.  */
  if (stype == hybrid)
    ;
  else if (HYBRID_SLP_STMT (stmt_vinfo))
    stype = hybrid;
  else
    {
      /* Check if a pure SLP stmt has uses in non-SLP stmts.  */
      gcc_checking_assert (PURE_SLP_STMT (stmt_vinfo));
      /* If we get a pattern stmt here we have to use the LHS of the
         original stmt for immediate uses.  */
      if (! STMT_VINFO_IN_PATTERN_P (stmt_vinfo)
	  && STMT_VINFO_RELATED_STMT (stmt_vinfo))
	stmt = STMT_VINFO_RELATED_STMT (stmt_vinfo);
      tree def;
      if (gimple_code (stmt) == GIMPLE_PHI)
	def = gimple_phi_result (stmt);
      else
	def = SINGLE_SSA_TREE_OPERAND (stmt, SSA_OP_DEF);
      if (def)
	FOR_EACH_IMM_USE_STMT (use_stmt, imm_iter, def)
	  {
	    /* Uses outside the loop are irrelevant.  */
	    if (!flow_bb_inside_loop_p (loop, gimple_bb (use_stmt)))
	      continue;
	    use_vinfo = vinfo_for_stmt (use_stmt);
	    /* Look through pattern stmts to the pattern replacement.  */
	    if (STMT_VINFO_IN_PATTERN_P (use_vinfo)
		&& STMT_VINFO_RELATED_STMT (use_vinfo))
	      use_vinfo = vinfo_for_stmt (STMT_VINFO_RELATED_STMT (use_vinfo));
	    /* A relevant non-SLP use (other than a reduction PHI) forces
	       this stmt to be hybrid.  */
	    if (!STMT_SLP_TYPE (use_vinfo)
		&& (STMT_VINFO_RELEVANT (use_vinfo)
		    || VECTORIZABLE_CYCLE_DEF (STMT_VINFO_DEF_TYPE (use_vinfo)))
		&& !(gimple_code (use_stmt) == GIMPLE_PHI
		     && STMT_VINFO_DEF_TYPE (use_vinfo) == vect_reduction_def))
	      {
		if (dump_enabled_p ())
		  {
		    dump_printf_loc (MSG_NOTE, vect_location, "use of SLP "
				     "def in non-SLP stmt: ");
		    dump_gimple_stmt (MSG_NOTE, TDF_SLIM, use_stmt, 0);
		  }
		stype = hybrid;
	      }
	  }
    }

  if (stype == hybrid
      && !HYBRID_SLP_STMT (stmt_vinfo))
    {
      if (dump_enabled_p ())
	{
	  dump_printf_loc (MSG_NOTE, vect_location, "marking hybrid: ");
	  dump_gimple_stmt (MSG_NOTE, TDF_SLIM, stmt, 0);
	}
      STMT_SLP_TYPE (stmt_vinfo) = hybrid;
    }

  /* Recurse into internal children with the (possibly updated) type.  */
  FOR_EACH_VEC_ELT (SLP_TREE_CHILDREN (node), j, child)
    if (SLP_TREE_DEF_TYPE (child) != vect_external_def)
      vect_detect_hybrid_slp_stmts (child, i, stype);
}
2585
2586 /* Helpers for vect_detect_hybrid_slp walking pattern stmt uses. */
2587
2588 static tree
vect_detect_hybrid_slp_1(tree * tp,int *,void * data)2589 vect_detect_hybrid_slp_1 (tree *tp, int *, void *data)
2590 {
2591 walk_stmt_info *wi = (walk_stmt_info *)data;
2592 struct loop *loopp = (struct loop *)wi->info;
2593
2594 if (wi->is_lhs)
2595 return NULL_TREE;
2596
2597 if (TREE_CODE (*tp) == SSA_NAME
2598 && !SSA_NAME_IS_DEFAULT_DEF (*tp))
2599 {
2600 gimple *def_stmt = SSA_NAME_DEF_STMT (*tp);
2601 if (flow_bb_inside_loop_p (loopp, gimple_bb (def_stmt))
2602 && PURE_SLP_STMT (vinfo_for_stmt (def_stmt)))
2603 {
2604 if (dump_enabled_p ())
2605 {
2606 dump_printf_loc (MSG_NOTE, vect_location, "marking hybrid: ");
2607 dump_gimple_stmt (MSG_NOTE, TDF_SLIM, def_stmt, 0);
2608 }
2609 STMT_SLP_TYPE (vinfo_for_stmt (def_stmt)) = hybrid;
2610 }
2611 }
2612
2613 return NULL_TREE;
2614 }
2615
2616 static tree
vect_detect_hybrid_slp_2(gimple_stmt_iterator * gsi,bool * handled,walk_stmt_info *)2617 vect_detect_hybrid_slp_2 (gimple_stmt_iterator *gsi, bool *handled,
2618 walk_stmt_info *)
2619 {
2620 stmt_vec_info use_vinfo = vinfo_for_stmt (gsi_stmt (*gsi));
2621 /* If the stmt is in a SLP instance then this isn't a reason
2622 to mark use definitions in other SLP instances as hybrid. */
2623 if (! STMT_SLP_TYPE (use_vinfo)
2624 && (STMT_VINFO_RELEVANT (use_vinfo)
2625 || VECTORIZABLE_CYCLE_DEF (STMT_VINFO_DEF_TYPE (use_vinfo)))
2626 && ! (gimple_code (gsi_stmt (*gsi)) == GIMPLE_PHI
2627 && STMT_VINFO_DEF_TYPE (use_vinfo) == vect_reduction_def))
2628 ;
2629 else
2630 *handled = true;
2631 return NULL_TREE;
2632 }
2633
2634 /* Find stmts that must be both vectorized and SLPed. */
2635
2636 void
vect_detect_hybrid_slp(loop_vec_info loop_vinfo)2637 vect_detect_hybrid_slp (loop_vec_info loop_vinfo)
2638 {
2639 unsigned int i;
2640 vec<slp_instance> slp_instances = LOOP_VINFO_SLP_INSTANCES (loop_vinfo);
2641 slp_instance instance;
2642
2643 if (dump_enabled_p ())
2644 dump_printf_loc (MSG_NOTE, vect_location, "=== vect_detect_hybrid_slp ==="
2645 "\n");
2646
2647 /* First walk all pattern stmt in the loop and mark defs of uses as
2648 hybrid because immediate uses in them are not recorded. */
2649 for (i = 0; i < LOOP_VINFO_LOOP (loop_vinfo)->num_nodes; ++i)
2650 {
2651 basic_block bb = LOOP_VINFO_BBS (loop_vinfo)[i];
2652 for (gimple_stmt_iterator gsi = gsi_start_bb (bb); !gsi_end_p (gsi);
2653 gsi_next (&gsi))
2654 {
2655 gimple *stmt = gsi_stmt (gsi);
2656 stmt_vec_info stmt_info = vinfo_for_stmt (stmt);
2657 if (STMT_VINFO_IN_PATTERN_P (stmt_info))
2658 {
2659 walk_stmt_info wi;
2660 memset (&wi, 0, sizeof (wi));
2661 wi.info = LOOP_VINFO_LOOP (loop_vinfo);
2662 gimple_stmt_iterator gsi2
2663 = gsi_for_stmt (STMT_VINFO_RELATED_STMT (stmt_info));
2664 walk_gimple_stmt (&gsi2, vect_detect_hybrid_slp_2,
2665 vect_detect_hybrid_slp_1, &wi);
2666 walk_gimple_seq (STMT_VINFO_PATTERN_DEF_SEQ (stmt_info),
2667 vect_detect_hybrid_slp_2,
2668 vect_detect_hybrid_slp_1, &wi);
2669 }
2670 }
2671 }
2672
2673 /* Then walk the SLP instance trees marking stmts with uses in
2674 non-SLP stmts as hybrid, also propagating hybrid down the
2675 SLP tree, collecting the above info on-the-fly. */
2676 FOR_EACH_VEC_ELT (slp_instances, i, instance)
2677 {
2678 for (unsigned i = 0; i < SLP_INSTANCE_GROUP_SIZE (instance); ++i)
2679 vect_detect_hybrid_slp_stmts (SLP_INSTANCE_TREE (instance),
2680 i, pure_slp);
2681 }
2682 }
2683
2684
/* Initialize a bb_vec_info struct for the statements between
   REGION_BEGIN_IN (inclusive) and REGION_END_IN (exclusive).  */

_bb_vec_info::_bb_vec_info (gimple_stmt_iterator region_begin_in,
			    gimple_stmt_iterator region_end_in)
  : vec_info (vec_info::bb, init_cost (NULL)),
    bb (gsi_bb (region_begin_in)),
    region_begin (region_begin_in),
    region_end (region_end_in)
{
  gimple_stmt_iterator gsi;

  /* Create a stmt_vec_info for each stmt in the region and mark region
     membership with UID 0 (the destructor resets UIDs to -1).  */
  for (gsi = region_begin; gsi_stmt (gsi) != gsi_stmt (region_end);
       gsi_next (&gsi))
    {
      gimple *stmt = gsi_stmt (gsi);
      gimple_set_uid (stmt, 0);
      set_vinfo_for_stmt (stmt, new_stmt_vec_info (stmt, this));
    }

  /* Let the basic block point back at this region's vec_info.  */
  bb->aux = this;
}
2707
2708
/* Free BB_VINFO struct, as well as all the stmt_vec_info structs of all the
   stmts in the basic block.  */

_bb_vec_info::~_bb_vec_info ()
{
  for (gimple_stmt_iterator si = region_begin;
       gsi_stmt (si) != gsi_stmt (region_end); gsi_next (&si))
    {
      gimple *stmt = gsi_stmt (si);
      stmt_vec_info stmt_info = vinfo_for_stmt (stmt);

      if (stmt_info)
	/* Free stmt_vec_info.  */
	free_stmt_vec_info (stmt);

      /* Reset region marker.  */
      gimple_set_uid (stmt, -1);
    }

  /* Detach this region's vec_info from the basic block.  */
  bb->aux = NULL;
}
2730
2731
/* Analyze statements contained in SLP tree NODE after recursively analyzing
   the subtree.  NODE_INSTANCE contains NODE and VINFO contains INSTANCE.

   For BB vectorization this also assigns vector types to non-memory stmts.
   Child def-types are temporarily pushed onto the child's first scalar
   stmt around the call to vect_analyze_stmt and restored afterwards.

   Return true if the operations are supported.  */

static bool
vect_slp_analyze_node_operations (vec_info *vinfo, slp_tree node,
				  slp_instance node_instance)
{
  bool dummy;
  int i, j;
  gimple *stmt;
  slp_tree child;

  /* External and constant defs need no analysis.  */
  if (SLP_TREE_DEF_TYPE (node) != vect_internal_def)
    return true;

  /* Analyze children first; their SLP_TREE_NUMBER_OF_VEC_STMTS may be
     needed below.  */
  FOR_EACH_VEC_ELT (SLP_TREE_CHILDREN (node), i, child)
    if (!vect_slp_analyze_node_operations (vinfo, child, node_instance))
      return false;

  stmt = SLP_TREE_SCALAR_STMTS (node)[0];
  stmt_vec_info stmt_info = vinfo_for_stmt (stmt);
  gcc_assert (stmt_info);
  gcc_assert (STMT_SLP_TYPE (stmt_info) != loop_vect);

  /* For BB vectorization vector types are assigned here.
     Memory accesses already got their vector type assigned
     in vect_analyze_data_refs.  */
  bb_vec_info bb_vinfo = STMT_VINFO_BB_VINFO (stmt_info);
  if (bb_vinfo
      && ! STMT_VINFO_DATA_REF (stmt_info))
    {
      gcc_assert (PURE_SLP_STMT (stmt_info));

      tree scalar_type = TREE_TYPE (gimple_get_lhs (stmt));
      if (dump_enabled_p ())
	{
	  dump_printf_loc (MSG_NOTE, vect_location,
			   "get vectype for scalar type:  ");
	  dump_generic_expr (MSG_NOTE, TDF_SLIM, scalar_type);
	  dump_printf (MSG_NOTE, "\n");
	}

      tree vectype = get_vectype_for_scalar_type (scalar_type);
      if (!vectype)
	{
	  if (dump_enabled_p ())
	    {
	      dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
			       "not SLPed: unsupported data-type ");
	      dump_generic_expr (MSG_MISSED_OPTIMIZATION, TDF_SLIM,
				 scalar_type);
	      dump_printf (MSG_MISSED_OPTIMIZATION, "\n");
	    }
	  return false;
	}

      if (dump_enabled_p ())
	{
	  dump_printf_loc (MSG_NOTE, vect_location, "vectype:  ");
	  dump_generic_expr (MSG_NOTE, TDF_SLIM, vectype);
	  dump_printf (MSG_NOTE, "\n");
	}

      /* All scalar stmts of the node share the vector type.  */
      gimple *sstmt;
      FOR_EACH_VEC_ELT (SLP_TREE_SCALAR_STMTS (node), i, sstmt)
	STMT_VINFO_VECTYPE (vinfo_for_stmt (sstmt)) = vectype;
    }

  /* Calculate the number of vector statements to be created for the
     scalar stmts in this node.  For SLP reductions it is equal to the
     number of vector statements in the children (which has already been
     calculated by the recursive call).  Otherwise it is the number of
     scalar elements in one scalar iteration (GROUP_SIZE) multiplied by
     VF divided by the number of elements in a vector.  */
  if (GROUP_FIRST_ELEMENT (stmt_info)
      && !STMT_VINFO_GROUPED_ACCESS (stmt_info))
    SLP_TREE_NUMBER_OF_VEC_STMTS (node)
      = SLP_TREE_NUMBER_OF_VEC_STMTS (SLP_TREE_CHILDREN (node)[0]);
  else
    {
      poly_uint64 vf;
      if (loop_vec_info loop_vinfo = dyn_cast <loop_vec_info> (vinfo))
	vf = loop_vinfo->vectorization_factor;
      else
	vf = 1;
      unsigned int group_size = SLP_INSTANCE_GROUP_SIZE (node_instance);
      tree vectype = STMT_VINFO_VECTYPE (stmt_info);
      SLP_TREE_NUMBER_OF_VEC_STMTS (node)
	= vect_get_num_vectors (vf * group_size, vectype);
    }

  /* ??? We have to catch the case late where two first scalar stmts appear
     in multiple SLP children with different def type and fail.  Remember
     original def types first since SLP_TREE_DEF_TYPE doesn't necessarily
     match it when that is vect_internal_def.  */
  auto_vec<vect_def_type, 4> dt;
  dt.safe_grow (SLP_TREE_CHILDREN (node).length ());
  FOR_EACH_VEC_ELT (SLP_TREE_CHILDREN (node), j, child)
    dt[j]
      = STMT_VINFO_DEF_TYPE (vinfo_for_stmt (SLP_TREE_SCALAR_STMTS (child)[0]));

  /* Push SLP node def-type to stmt operands.  */
  FOR_EACH_VEC_ELT (SLP_TREE_CHILDREN (node), j, child)
    if (SLP_TREE_DEF_TYPE (child) != vect_internal_def)
      STMT_VINFO_DEF_TYPE (vinfo_for_stmt (SLP_TREE_SCALAR_STMTS (child)[0]))
	= SLP_TREE_DEF_TYPE (child);

  /* Check everything worked out.  A first scalar stmt shared between
     children with conflicting def types makes the pushes above clash.  */
  bool res = true;
  FOR_EACH_VEC_ELT (SLP_TREE_CHILDREN (node), j, child)
    if (SLP_TREE_DEF_TYPE (child) != vect_internal_def)
      {
	if (STMT_VINFO_DEF_TYPE
	      (vinfo_for_stmt (SLP_TREE_SCALAR_STMTS (child)[0]))
	    != SLP_TREE_DEF_TYPE (child))
	  res = false;
      }
    else if (STMT_VINFO_DEF_TYPE
	       (vinfo_for_stmt (SLP_TREE_SCALAR_STMTS (child)[0])) != dt[j])
      res = false;
  if (!res && dump_enabled_p ())
    dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
		     "not vectorized: same operand with different "
		     "def type in stmt.\n");
  if (res)
    res = vect_analyze_stmt (stmt, &dummy, node, node_instance);

  /* Restore def-types.  */
  FOR_EACH_VEC_ELT (SLP_TREE_CHILDREN (node), j, child)
    STMT_VINFO_DEF_TYPE (vinfo_for_stmt (SLP_TREE_SCALAR_STMTS (child)[0]))
      = dt[j];

  return res;
}
2868
2869
2870 /* Analyze statements in SLP instances of VINFO. Return true if the
2871 operations are supported. */
2872
2873 bool
vect_slp_analyze_operations(vec_info * vinfo)2874 vect_slp_analyze_operations (vec_info *vinfo)
2875 {
2876 slp_instance instance;
2877 int i;
2878
2879 if (dump_enabled_p ())
2880 dump_printf_loc (MSG_NOTE, vect_location,
2881 "=== vect_slp_analyze_operations ===\n");
2882
2883 for (i = 0; vinfo->slp_instances.iterate (i, &instance); )
2884 {
2885 if (!vect_slp_analyze_node_operations (vinfo,
2886 SLP_INSTANCE_TREE (instance),
2887 instance))
2888 {
2889 dump_printf_loc (MSG_NOTE, vect_location,
2890 "removing SLP instance operations starting from: ");
2891 dump_gimple_stmt (MSG_NOTE, TDF_SLIM,
2892 SLP_TREE_SCALAR_STMTS
2893 (SLP_INSTANCE_TREE (instance))[0], 0);
2894 vect_free_slp_instance (instance);
2895 vinfo->slp_instances.ordered_remove (i);
2896 }
2897 else
2898 i++;
2899 }
2900
2901 if (dump_enabled_p ())
2902 dump_printf_loc (MSG_NOTE, vect_location,
2903 "=== vect_analyze_slp_cost ===\n");
2904
2905 /* Compute the costs of the SLP instances. */
2906 scalar_stmts_set_t *visited = new scalar_stmts_set_t ();
2907 for (i = 0; vinfo->slp_instances.iterate (i, &instance); ++i)
2908 vect_analyze_slp_cost (instance, vinfo->target_cost_data, visited);
2909 delete visited;
2910
2911 return !vinfo->slp_instances.is_empty ();
2912 }
2913
2914
/* Compute the scalar cost of the SLP node NODE and its children
   and return it.  Do not account defs that are marked in LIFE and
   update LIFE according to uses of NODE.  LIFE[I] corresponds to
   SLP_TREE_SCALAR_STMTS (node)[I].  BB is only threaded through the
   recursion and is otherwise unused here.  */

static unsigned
vect_bb_slp_scalar_cost (basic_block bb,
			 slp_tree node, vec<bool, va_heap> *life)
{
  unsigned scalar_cost = 0;
  unsigned i;
  gimple *stmt;
  slp_tree child;

  FOR_EACH_VEC_ELT (SLP_TREE_SCALAR_STMTS (node), i, stmt)
    {
      unsigned stmt_cost;
      ssa_op_iter op_iter;
      def_operand_p def_p;
      stmt_vec_info stmt_info;

      /* Already known to stay live in scalar form: not accounted.  */
      if ((*life)[i])
	continue;

      /* If there is a non-vectorized use of the defs then the scalar
	 stmt is kept live in which case we do not account it or any
	 required defs in the SLP children in the scalar cost.  This
	 way we make the vectorization more costly when compared to
	 the scalar cost.  */
      FOR_EACH_SSA_DEF_OPERAND (def_p, stmt, op_iter, SSA_OP_DEF)
	{
	  imm_use_iterator use_iter;
	  gimple *use_stmt;
	  FOR_EACH_IMM_USE_STMT (use_stmt, use_iter, DEF_FROM_PTR (def_p))
	    if (!is_gimple_debug (use_stmt)
		&& (! vect_stmt_in_region_p (vinfo_for_stmt (stmt)->vinfo,
					     use_stmt)
		    || ! PURE_SLP_STMT (vinfo_for_stmt (use_stmt))))
	      {
		(*life)[i] = true;
		BREAK_FROM_IMM_USE_STMT (use_iter);
	      }
	}
      if ((*life)[i])
	continue;

      /* Count scalar stmts only once.  The visited flag is reset by the
	 caller (vect_bb_vectorization_profitable_p) after the walk.  */
      if (gimple_visited_p (stmt))
	continue;
      gimple_set_visited (stmt, true);

      /* Classify the stmt as load, store or plain stmt for costing.  */
      stmt_info = vinfo_for_stmt (stmt);
      if (STMT_VINFO_DATA_REF (stmt_info))
	{
	  if (DR_IS_READ (STMT_VINFO_DATA_REF (stmt_info)))
	    stmt_cost = vect_get_stmt_cost (scalar_load);
	  else
	    stmt_cost = vect_get_stmt_cost (scalar_store);
	}
      else
	stmt_cost = vect_get_stmt_cost (scalar_stmt);

      scalar_cost += stmt_cost;
    }

  /* Recurse into internal-def children, accumulating their cost.  */
  auto_vec<bool, 20> subtree_life;
  FOR_EACH_VEC_ELT (SLP_TREE_CHILDREN (node), i, child)
    {
      if (SLP_TREE_DEF_TYPE (child) == vect_internal_def)
	{
	  /* Do not directly pass LIFE to the recursive call, copy it to
	     confine changes in the callee to the current child/subtree.  */
	  subtree_life.safe_splice (*life);
	  scalar_cost += vect_bb_slp_scalar_cost (bb, child, &subtree_life);
	  subtree_life.truncate (0);
	}
    }

  return scalar_cost;
}
2994
/* Check if vectorization of the basic block is profitable, i.e. the
   combined vector cost (inside + prologue + epilogue) does not exceed
   the scalar cost of the statements being replaced.  */

static bool
vect_bb_vectorization_profitable_p (bb_vec_info bb_vinfo)
{
  vec<slp_instance> slp_instances = BB_VINFO_SLP_INSTANCES (bb_vinfo);
  slp_instance instance;
  int i;
  unsigned int vec_inside_cost = 0, vec_outside_cost = 0, scalar_cost = 0;
  unsigned int vec_prologue_cost = 0, vec_epilogue_cost = 0;

  /* Calculate scalar cost.  LIFE starts all-false: no scalar stmt is
     assumed live before the walk discovers external uses.  */
  FOR_EACH_VEC_ELT (slp_instances, i, instance)
    {
      auto_vec<bool, 20> life;
      life.safe_grow_cleared (SLP_INSTANCE_GROUP_SIZE (instance));
      scalar_cost += vect_bb_slp_scalar_cost (BB_VINFO_BB (bb_vinfo),
					      SLP_INSTANCE_TREE (instance),
					      &life);
    }

  /* Unset visited flag set by vect_bb_slp_scalar_cost on every stmt
     in the region.  */
  for (gimple_stmt_iterator gsi = bb_vinfo->region_begin;
       gsi_stmt (gsi) != gsi_stmt (bb_vinfo->region_end); gsi_next (&gsi))
    gimple_set_visited  (gsi_stmt (gsi), false);

  /* Complete the target-specific cost calculation.  */
  finish_cost (BB_VINFO_TARGET_COST_DATA (bb_vinfo), &vec_prologue_cost,
	       &vec_inside_cost, &vec_epilogue_cost);

  vec_outside_cost = vec_prologue_cost + vec_epilogue_cost;

  if (dump_enabled_p ())
    {
      dump_printf_loc (MSG_NOTE, vect_location, "Cost model analysis: \n");
      dump_printf (MSG_NOTE, "  Vector inside of basic block cost: %d\n",
		   vec_inside_cost);
      dump_printf (MSG_NOTE, "  Vector prologue cost: %d\n", vec_prologue_cost);
      dump_printf (MSG_NOTE, "  Vector epilogue cost: %d\n", vec_epilogue_cost);
      dump_printf (MSG_NOTE, "  Scalar cost of basic block: %d\n", scalar_cost);
    }

  /* Vectorization is profitable if its total cost does not exceed the
     scalar cost.  Note that we err on the vector side for equal cost
     because the cost estimate is otherwise quite pessimistic (constant
     uses are free on the scalar side but cost a load on the vector side
     for example).  */
  if (vec_outside_cost + vec_inside_cost > scalar_cost)
    return false;

  return true;
}
3047
/* Check if the basic block can be vectorized.  Returns a bb_vec_info
   if so and sets fatal to true if failure is independent of
   current_vector_size (so the caller need not retry other sizes).
   REGION_BEGIN/REGION_END delimit the statement region, DATAREFS are
   the data references found in it and N_STMTS its statement count.
   Ownership of DATAREFS passes to the returned bb_vec_info (or is
   released here on the early-exit path).  */

static bb_vec_info
vect_slp_analyze_bb_1 (gimple_stmt_iterator region_begin,
		       gimple_stmt_iterator region_end,
		       vec<data_reference_p> datarefs, int n_stmts,
		       bool &fatal)
{
  bb_vec_info bb_vinfo;
  slp_instance instance;
  int i;
  /* Minimum vectorization factor demanded from data-ref analysis.  */
  poly_uint64 min_vf = 2;

  /* The first group of checks is independent of the vector size.  */
  fatal = true;

  if (n_stmts > PARAM_VALUE (PARAM_SLP_MAX_INSNS_IN_BB))
    {
      if (dump_enabled_p ())
	dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
			 "not vectorized: too many instructions in "
			 "basic block.\n");
      /* DATAREFS has not been handed to a bb_vec_info yet, so free it
	 explicitly here.  */
      free_data_refs (datarefs);
      return NULL;
    }

  bb_vinfo = new _bb_vec_info (region_begin, region_end);
  if (!bb_vinfo)
    return NULL;

  /* From here on deleting BB_VINFO also releases DATAREFS.  */
  BB_VINFO_DATAREFS (bb_vinfo) = datarefs;

  /* Analyze the data references.  */

  if (!vect_analyze_data_refs (bb_vinfo, &min_vf))
    {
      if (dump_enabled_p ())
	dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
			 "not vectorized: unhandled data-ref in basic "
			 "block.\n");

      delete bb_vinfo;
      return NULL;
    }

  /* A single data-ref cannot form an SLP group.  */
  if (BB_VINFO_DATAREFS (bb_vinfo).length () < 2)
    {
      if (dump_enabled_p ())
	dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
			 "not vectorized: not enough data-refs in "
			 "basic block.\n");

      delete bb_vinfo;
      return NULL;
    }

  if (!vect_analyze_data_ref_accesses (bb_vinfo))
    {
      if (dump_enabled_p ())
	dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
			 "not vectorized: unhandled data access in "
			 "basic block.\n");

      delete bb_vinfo;
      return NULL;
    }

  /* If there are no grouped stores in the region there is no need
     to continue with pattern recog as vect_analyze_slp will fail
     anyway.  */
  if (bb_vinfo->grouped_stores.is_empty ())
    {
      if (dump_enabled_p ())
	dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
			 "not vectorized: no grouped stores in "
			 "basic block.\n");

      delete bb_vinfo;
      return NULL;
    }

  /* While the rest of the analysis below depends on it in some way.  */
  fatal = false;

  vect_pattern_recog (bb_vinfo);

  /* Check the SLP opportunities in the basic block, analyze and build SLP
     trees.  */
  if (!vect_analyze_slp (bb_vinfo, n_stmts))
    {
      if (dump_enabled_p ())
	{
	  dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
			   "Failed to SLP the basic block.\n");
	  dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
			   "not vectorized: failed to find SLP opportunities "
			   "in basic block.\n");
	}

      delete bb_vinfo;
      return NULL;
    }

  vect_record_base_alignments (bb_vinfo);

  /* Analyze and verify the alignment of data references and the
     dependence in the SLP instances.  Instances failing either check
     are removed in place, hence the manual iterator increment.  */
  for (i = 0; BB_VINFO_SLP_INSTANCES (bb_vinfo).iterate (i, &instance); )
    {
      if (! vect_slp_analyze_and_verify_instance_alignment (instance)
	  || ! vect_slp_analyze_instance_dependence (instance))
	{
	  dump_printf_loc (MSG_NOTE, vect_location,
			   "removing SLP instance operations starting from: ");
	  dump_gimple_stmt (MSG_NOTE, TDF_SLIM,
			    SLP_TREE_SCALAR_STMTS
			      (SLP_INSTANCE_TREE (instance))[0], 0);
	  vect_free_slp_instance (instance);
	  BB_VINFO_SLP_INSTANCES (bb_vinfo).ordered_remove (i);
	  continue;
	}

      /* Mark all the statements that we want to vectorize as pure SLP and
	 relevant.  */
      vect_mark_slp_stmts (SLP_INSTANCE_TREE (instance), pure_slp, -1);
      vect_mark_slp_stmts_relevant (SLP_INSTANCE_TREE (instance));

      i++;
    }
  if (! BB_VINFO_SLP_INSTANCES (bb_vinfo).length ())
    {
      delete bb_vinfo;
      return NULL;
    }

  if (!vect_slp_analyze_operations (bb_vinfo))
    {
      if (dump_enabled_p ())
	dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
			 "not vectorized: bad operation in basic block.\n");

      delete bb_vinfo;
      return NULL;
    }

  /* Cost model: check if the vectorization is worthwhile.  */
  if (!unlimited_cost_model (NULL)
      && !vect_bb_vectorization_profitable_p (bb_vinfo))
    {
      if (dump_enabled_p ())
	dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
			 "not vectorized: vectorization is not "
			 "profitable.\n");

      delete bb_vinfo;
      return NULL;
    }

  if (dump_enabled_p ())
    dump_printf_loc (MSG_NOTE, vect_location,
		     "Basic block will be vectorized using SLP\n");

  return bb_vinfo;
}
3214
3215
/* Main entry for the BB vectorizer.  Analyze and transform BB, returns
   true if anything in the basic-block was vectorized.

   The block is processed as a sequence of regions separated by
   statements that find_data_references_in_stmt cannot handle.  For
   each region, analysis is retried with each supported vector size
   until one succeeds or the sizes are exhausted.  */

bool
vect_slp_bb (basic_block bb)
{
  bb_vec_info bb_vinfo;
  gimple_stmt_iterator gsi;
  bool any_vectorized = false;
  auto_vector_sizes vector_sizes;

  if (dump_enabled_p ())
    dump_printf_loc (MSG_NOTE, vect_location, "===vect_slp_analyze_bb===\n");

  /* Autodetect first vector size we try.  current_vector_size == 0
     means "let the target pick".  */
  current_vector_size = 0;
  targetm.vectorize.autovectorize_vector_sizes (&vector_sizes);
  unsigned int next_size = 0;

  gsi = gsi_start_bb (bb);

  poly_uint64 autodetected_vector_size = 0;
  while (1)
    {
      if (gsi_end_p (gsi))
	break;

      /* Collect the next region: a maximal run of statements whose
	 data references we can analyze.  */
      gimple_stmt_iterator region_begin = gsi;
      vec<data_reference_p> datarefs = vNULL;
      int insns = 0;

      for (; !gsi_end_p (gsi); gsi_next (&gsi))
	{
	  gimple *stmt = gsi_stmt (gsi);
	  if (is_gimple_debug (stmt))
	    continue;
	  insns++;

	  if (gimple_location (stmt) != UNKNOWN_LOCATION)
	    vect_location = gimple_location (stmt);

	  if (!find_data_references_in_stmt (NULL, stmt, &datarefs))
	    break;
	}

      /* Skip leading unhandled stmts.  */
      if (gsi_stmt (region_begin) == gsi_stmt (gsi))
	{
	  gsi_next (&gsi);
	  continue;
	}

      gimple_stmt_iterator region_end = gsi;

      bool vectorized = false;
      bool fatal = false;
      /* DATAREFS ownership passes to vect_slp_analyze_bb_1.  */
      bb_vinfo = vect_slp_analyze_bb_1 (region_begin, region_end,
					datarefs, insns, fatal);
      if (bb_vinfo
	  && dbg_cnt (vect_slp))
	{
	  if (dump_enabled_p ())
	    dump_printf_loc (MSG_NOTE, vect_location, "SLPing BB part\n");

	  vect_schedule_slp (bb_vinfo);

	  if (dump_enabled_p ())
	    dump_printf_loc (MSG_NOTE, vect_location,
			     "basic block part vectorized\n");

	  vectorized = true;
	}
      delete bb_vinfo;

      any_vectorized |= vectorized;

      /* Remember the size the target autodetected on the first try so
	 we can skip it when walking the explicit size list.  */
      if (next_size == 0)
	autodetected_vector_size = current_vector_size;

      if (next_size < vector_sizes.length ()
	  && known_eq (vector_sizes[next_size], autodetected_vector_size))
	next_size += 1;

      if (vectorized
	  || next_size == vector_sizes.length ()
	  || known_eq (current_vector_size, 0U)
	  /* If vect_slp_analyze_bb_1 signaled that analysis for all
	     vector sizes will fail do not bother iterating.  */
	  || fatal)
	{
	  if (gsi_end_p (region_end))
	    break;

	  /* Skip the unhandled stmt.  */
	  gsi_next (&gsi);

	  /* And reset vector sizes.  */
	  current_vector_size = 0;
	  next_size = 0;
	}
      else
	{
	  /* Try the next biggest vector size.  */
	  current_vector_size = vector_sizes[next_size++];
	  if (dump_enabled_p ())
	    {
	      dump_printf_loc (MSG_NOTE, vect_location,
			       "***** Re-trying analysis with "
			       "vector size ");
	      dump_dec (MSG_NOTE, current_vector_size);
	      dump_printf (MSG_NOTE, "\n");
	    }

	  /* Start over.  */
	  gsi = region_begin;
	}
    }

  return any_vectorized;
}
3336
3337
3338 /* Return 1 if vector type of boolean constant which is OPNUM
3339 operand in statement STMT is a boolean vector. */
3340
3341 static bool
vect_mask_constant_operand_p(gimple * stmt,int opnum)3342 vect_mask_constant_operand_p (gimple *stmt, int opnum)
3343 {
3344 stmt_vec_info stmt_vinfo = vinfo_for_stmt (stmt);
3345 enum tree_code code = gimple_expr_code (stmt);
3346 tree op, vectype;
3347 gimple *def_stmt;
3348 enum vect_def_type dt;
3349
3350 /* For comparison and COND_EXPR type is chosen depending
3351 on the other comparison operand. */
3352 if (TREE_CODE_CLASS (code) == tcc_comparison)
3353 {
3354 if (opnum)
3355 op = gimple_assign_rhs1 (stmt);
3356 else
3357 op = gimple_assign_rhs2 (stmt);
3358
3359 if (!vect_is_simple_use (op, stmt_vinfo->vinfo, &def_stmt,
3360 &dt, &vectype))
3361 gcc_unreachable ();
3362
3363 return !vectype || VECTOR_BOOLEAN_TYPE_P (vectype);
3364 }
3365
3366 if (code == COND_EXPR)
3367 {
3368 tree cond = gimple_assign_rhs1 (stmt);
3369
3370 if (TREE_CODE (cond) == SSA_NAME)
3371 op = cond;
3372 else if (opnum)
3373 op = TREE_OPERAND (cond, 1);
3374 else
3375 op = TREE_OPERAND (cond, 0);
3376
3377 if (!vect_is_simple_use (op, stmt_vinfo->vinfo, &def_stmt,
3378 &dt, &vectype))
3379 gcc_unreachable ();
3380
3381 return !vectype || VECTOR_BOOLEAN_TYPE_P (vectype);
3382 }
3383
3384 return VECTOR_BOOLEAN_TYPE_P (STMT_VINFO_VECTYPE (stmt_vinfo));
3385 }
3386
/* Build a variable-length vector in which the elements in ELTS are repeated
   to a fill NRESULTS vectors of type VECTOR_TYPE.  Store the vectors in
   RESULTS and add any new instructions to SEQ.

   The approach we use is:

   (1) Find a vector mode VM with integer elements of mode IM.

   (2) Replace ELTS[0:NELTS] with ELTS'[0:NELTS'], where each element of
       ELTS' has mode IM.  This involves creating NELTS' VIEW_CONVERT_EXPRs
       from small vectors to IM.

   (3) Duplicate each ELTS'[I] into a vector of mode VM.

   (4) Use a tree of interleaving VEC_PERM_EXPRs to create VMs with the
       correct byte contents.

   (5) Use VIEW_CONVERT_EXPR to cast the final VMs to the required type.

   We try to find the largest IM for which this sequence works, in order
   to cut down on the number of interleaves.  */

void
duplicate_and_interleave (gimple_seq *seq, tree vector_type, vec<tree> elts,
			  unsigned int nresults, vec<tree> &results)
{
  unsigned int nelts = elts.length ();
  tree element_type = TREE_TYPE (vector_type);

  /* (1) Find a vector mode VM with integer elements of mode IM.  */
  unsigned int nvectors = 1;
  tree new_vector_type;
  tree permutes[2];
  /* The caller must have checked this is possible beforehand.  */
  if (!can_duplicate_and_interleave_p (nelts, TYPE_MODE (element_type),
				       &nvectors, &new_vector_type,
				       permutes))
    gcc_unreachable ();

  /* Get a vector type that holds ELTS[0:NELTS/NELTS'].  */
  unsigned int partial_nelts = nelts / nvectors;
  tree partial_vector_type = build_vector_type (element_type, partial_nelts);

  tree_vector_builder partial_elts;
  /* PIECES serves as a double buffer: slots [0,nvectors) and
     [nvectors,2*nvectors) alternate between input and output of each
     interleaving round via in_start/out_start below.  */
  auto_vec<tree, 32> pieces (nvectors * 2);
  pieces.quick_grow (nvectors * 2);
  for (unsigned int i = 0; i < nvectors; ++i)
    {
      /* (2) Replace ELTS[0:NELTS] with ELTS'[0:NELTS'], where each element of
	     ELTS' has mode IM.  */
      partial_elts.new_vector (partial_vector_type, partial_nelts, 1);
      for (unsigned int j = 0; j < partial_nelts; ++j)
	partial_elts.quick_push (elts[i * partial_nelts + j]);
      tree t = gimple_build_vector (seq, &partial_elts);
      t = gimple_build (seq, VIEW_CONVERT_EXPR,
			TREE_TYPE (new_vector_type), t);

      /* (3) Duplicate each ELTS'[I] into a vector of mode VM.  */
      pieces[i] = gimple_build_vector_from_val (seq, new_vector_type, t);
    }

  /* (4) Use a tree of VEC_PERM_EXPRs to create a single VM with the
	 correct byte contents.

     We need to repeat the following operation log2(nvectors) times:

       out[i * 2] = VEC_PERM_EXPR (in[i], in[i + hi_start], lo_permute);
       out[i * 2 + 1] = VEC_PERM_EXPR (in[i], in[i + hi_start], hi_permute);

     However, if each input repeats every N elements and the VF is
     a multiple of N * 2, the HI result is the same as the LO.  */
  unsigned int in_start = 0;
  unsigned int out_start = nvectors;
  unsigned int hi_start = nvectors / 2;
  /* A bound on the number of outputs needed to produce NRESULTS results
     in the final iteration.  */
  unsigned int noutputs_bound = nvectors * nresults;
  for (unsigned int in_repeat = 1; in_repeat < nvectors; in_repeat *= 2)
    {
      noutputs_bound /= 2;
      unsigned int limit = MIN (noutputs_bound, nvectors);
      for (unsigned int i = 0; i < limit; ++i)
	{
	  /* HI result equals the LO result when the repeat period
	     divides the vector length; reuse it.  */
	  if ((i & 1) != 0
	      && multiple_p (TYPE_VECTOR_SUBPARTS (new_vector_type),
			     2 * in_repeat))
	    {
	      pieces[out_start + i] = pieces[out_start + i - 1];
	      continue;
	    }

	  tree output = make_ssa_name (new_vector_type);
	  tree input1 = pieces[in_start + (i / 2)];
	  tree input2 = pieces[in_start + (i / 2) + hi_start];
	  /* Even I uses the LO permute, odd I the HI permute.  */
	  gassign *stmt = gimple_build_assign (output, VEC_PERM_EXPR,
					       input1, input2,
					       permutes[i & 1]);
	  gimple_seq_add_stmt (seq, stmt);
	  pieces[out_start + i] = output;
	}
      std::swap (in_start, out_start);
    }

  /* (5) Use VIEW_CONVERT_EXPR to cast the final VM to the required type.
     After the last swap IN_START indexes the freshest outputs; results
     beyond NVECTORS repeat earlier ones.  */
  results.reserve (nresults);
  for (unsigned int i = 0; i < nresults; ++i)
    if (i < nvectors)
      results.quick_push (gimple_build (seq, VIEW_CONVERT_EXPR, vector_type,
					pieces[in_start + i]));
    else
      results.quick_push (results[i - nvectors]);
}
3498
3499
/* For constant and loop invariant defs of SLP_NODE this function returns
   (vector) defs (VEC_OPRNDS) that will be used in the vectorized stmts.
   OP is the operand as taken from the first scalar stmt, OP_NUM the
   operand's position in the RHS of the scalar stmts, and
   NUMBER_OF_VECTORS the number of vector defs to create.  */

static void
vect_get_constant_vectors (tree op, slp_tree slp_node,
                           vec<tree> *vec_oprnds,
			   unsigned int op_num, unsigned int number_of_vectors)
{
  vec<gimple *> stmts = SLP_TREE_SCALAR_STMTS (slp_node);
  gimple *stmt = stmts[0];
  stmt_vec_info stmt_vinfo = vinfo_for_stmt (stmt);
  unsigned HOST_WIDE_INT nunits;
  tree vec_cst;
  unsigned j, number_of_places_left_in_vector;
  tree vector_type;
  tree vop;
  int group_size = stmts.length ();
  unsigned int vec_num, i;
  unsigned number_of_copies = 1;
  vec<tree> voprnds;
  voprnds.create (number_of_vectors);
  bool constant_p, is_store;
  /* NOTE(review): NEUTRAL_OP is never assigned in this function, so the
     neutral-vector replication branch at the bottom appears dead --
     confirm before relying on it.  */
  tree neutral_op = NULL;
  enum tree_code code = gimple_expr_code (stmt);
  /* Statements needed to build SSA-name elements are emitted here and
     inserted before the constructed vector's def.  */
  gimple_seq ctor_seq = NULL;
  auto_vec<tree, 16> permute_results;

  /* Check if vector type is a boolean vector.  */
  if (VECT_SCALAR_BOOLEAN_TYPE_P (TREE_TYPE (op))
      && vect_mask_constant_operand_p (stmt, op_num))
    vector_type
      = build_same_sized_truth_vector_type (STMT_VINFO_VECTYPE (stmt_vinfo));
  else
    vector_type = get_vectype_for_scalar_type (TREE_TYPE (op));

  /* For stores the constant operand is always the stored value (rhs1);
     re-fetch it per scalar stmt below.  */
  if (STMT_VINFO_DATA_REF (stmt_vinfo))
    {
      is_store = true;
      op = gimple_assign_rhs1 (stmt);
    }
  else
    is_store = false;

  gcc_assert (op);

  /* NUMBER_OF_COPIES is the number of times we need to use the same values in
     created vectors.  It is greater than 1 if unrolling is performed.

     For example, we have two scalar operands, s1 and s2 (e.g., group of
     strided accesses of size two), while NUNITS is four (i.e., four scalars
     of this type can be packed in a vector).  The output vector will contain
     two copies of each scalar operand: {s1, s2, s1, s2}.  (NUMBER_OF_COPIES
     will be 2).

     If GROUP_SIZE > NUNITS, the scalars will be split into several vectors
     containing the operands.

     For example, NUNITS is four as before, and the group size is 8
     (s1, s2, ..., s8).  We will create two vectors {s1, s2, s3, s4} and
     {s5, s6, s7, s8}.  */

  /* When using duplicate_and_interleave, we just need one element for
     each scalar statement.  */
  if (!TYPE_VECTOR_SUBPARTS (vector_type).is_constant (&nunits))
    nunits = group_size;

  number_of_copies = nunits * number_of_vectors / group_size;

  number_of_places_left_in_vector = nunits;
  constant_p = true;
  tree_vector_builder elts (vector_type, nunits, 1);
  elts.quick_grow (nunits);
  bool place_after_defs = false;
  for (j = 0; j < number_of_copies; j++)
    {
      /* Walk the group backwards; vectors are built high-element-first
	 and VOPRNDS is reversed at the end.  */
      for (i = group_size - 1; stmts.iterate (i, &stmt); i--)
        {
          if (is_store)
            op = gimple_assign_rhs1 (stmt);
          else
	    {
	      /* Pick out operand OP_NUM of this scalar stmt, accounting
		 for the irregular operand layouts below.  */
	      switch (code)
		{
		  case COND_EXPR:
		    {
		      tree cond = gimple_assign_rhs1 (stmt);
		      if (TREE_CODE (cond) == SSA_NAME)
			op = gimple_op (stmt, op_num + 1);
		      else if (op_num == 0 || op_num == 1)
			/* Embedded comparison: its two operands count as
			   operands 0 and 1 of the COND_EXPR.  */
			op = TREE_OPERAND (cond, op_num);
		      else
			{
			  if (op_num == 2)
			    op = gimple_assign_rhs2 (stmt);
			  else
			    op = gimple_assign_rhs3 (stmt);
			}
		    }
		    break;

		  case CALL_EXPR:
		    op = gimple_call_arg (stmt, op_num);
		    break;

		  case LSHIFT_EXPR:
		  case RSHIFT_EXPR:
		  case LROTATE_EXPR:
		  case RROTATE_EXPR:
		    op = gimple_op (stmt, op_num + 1);
		    /* Unlike the other binary operators, shifts/rotates have
		       the shift count being int, instead of the same type as
		       the lhs, so make sure the scalar is the right type if
		       we are dealing with vectors of
		       long long/long/short/char.  */
		    if (op_num == 1 && TREE_CODE (op) == INTEGER_CST)
		      op = fold_convert (TREE_TYPE (vector_type), op);
		    break;

		  default:
		    op = gimple_op (stmt, op_num + 1);
		    break;
		}
	    }

          /* Create 'vect_ = {op0,op1,...,opn}'.  */
          number_of_places_left_in_vector--;
	  tree orig_op = op;
	  /* Convert OP to the vector element type if it differs.  */
	  if (!types_compatible_p (TREE_TYPE (vector_type), TREE_TYPE (op)))
	    {
	      if (CONSTANT_CLASS_P (op))
		{
		  if (VECTOR_BOOLEAN_TYPE_P (vector_type))
		    {
		      /* Can't use VIEW_CONVERT_EXPR for booleans because
			 of possibly different sizes of scalar value and
			 vector element.  */
		      if (integer_zerop (op))
			op = build_int_cst (TREE_TYPE (vector_type), 0);
		      else if (integer_onep (op))
			op = build_all_ones_cst (TREE_TYPE (vector_type));
		      else
			gcc_unreachable ();
		    }
		  else
		    op = fold_unary (VIEW_CONVERT_EXPR,
				     TREE_TYPE (vector_type), op);
		  gcc_assert (op && CONSTANT_CLASS_P (op));
		}
	      else
		{
		  /* SSA-name element: materialize the conversion as a
		     statement in CTOR_SEQ.  */
		  tree new_temp = make_ssa_name (TREE_TYPE (vector_type));
		  gimple *init_stmt;
		  if (VECTOR_BOOLEAN_TYPE_P (vector_type))
		    {
		      tree true_val
			= build_all_ones_cst (TREE_TYPE (vector_type));
		      tree false_val
			= build_zero_cst (TREE_TYPE (vector_type));
		      gcc_assert (INTEGRAL_TYPE_P (TREE_TYPE (op)));
		      init_stmt = gimple_build_assign (new_temp, COND_EXPR,
						       op, true_val,
						       false_val);
		    }
		  else
		    {
		      op = build1 (VIEW_CONVERT_EXPR, TREE_TYPE (vector_type),
				   op);
		      init_stmt
			= gimple_build_assign (new_temp, VIEW_CONVERT_EXPR,
					       op);
		    }
		  gimple_seq_add_stmt (&ctor_seq, init_stmt);
		  op = new_temp;
		}
	    }
	  elts[number_of_places_left_in_vector] = op;
	  if (!CONSTANT_CLASS_P (op))
	    constant_p = false;
	  /* An SSA name defined inside the BB region forces the vector
	     constructor to be placed after the last scalar def.  */
	  if (TREE_CODE (orig_op) == SSA_NAME
	      && !SSA_NAME_IS_DEFAULT_DEF (orig_op)
	      && STMT_VINFO_BB_VINFO (stmt_vinfo)
	      && (STMT_VINFO_BB_VINFO (stmt_vinfo)->bb
		  == gimple_bb (SSA_NAME_DEF_STMT (orig_op))))
	    place_after_defs = true;

          if (number_of_places_left_in_vector == 0)
            {
	      /* Vector is full: emit it and start the next one.  */
	      if (constant_p
		  ? multiple_p (TYPE_VECTOR_SUBPARTS (vector_type), nunits)
		  : known_eq (TYPE_VECTOR_SUBPARTS (vector_type), nunits))
		vec_cst = gimple_build_vector (&ctor_seq, &elts);
	      else
		{
		  /* Variable-length vector: build all results at once via
		     duplicate_and_interleave, then pick them off.  */
		  if (vec_oprnds->is_empty ())
		    duplicate_and_interleave (&ctor_seq, vector_type, elts,
					      number_of_vectors,
					      permute_results);
		  vec_cst = permute_results[number_of_vectors - j - 1];
		}
	      tree init;
	      gimple_stmt_iterator gsi;
	      if (place_after_defs)
		{
		  gsi = gsi_for_stmt
		          (vect_find_last_scalar_stmt_in_slp (slp_node));
		  init = vect_init_vector (stmt, vec_cst, vector_type, &gsi);
		}
	      else
		init = vect_init_vector (stmt, vec_cst, vector_type, NULL);
	      if (ctor_seq != NULL)
		{
		  /* Flush the element-conversion stmts right before the
		     vector def that uses them.  */
		  gsi = gsi_for_stmt (SSA_NAME_DEF_STMT (init));
		  gsi_insert_seq_before (&gsi, ctor_seq, GSI_SAME_STMT);
		  ctor_seq = NULL;
		}
	      voprnds.quick_push (init);
	      place_after_defs = false;
              number_of_places_left_in_vector = nunits;
	      constant_p = true;
	      elts.new_vector (vector_type, nunits, 1);
	      elts.quick_grow (nunits);
            }
        }
    }

  /* Since the vectors are created in the reverse order, we should invert
     them.  */
  vec_num = voprnds.length ();
  for (j = vec_num; j != 0; j--)
    {
      vop = voprnds[j - 1];
      vec_oprnds->quick_push (vop);
    }

  voprnds.release ();

  /* In case that VF is greater than the unrolling factor needed for the SLP
     group of stmts, NUMBER_OF_VECTORS to be created is greater than
     NUMBER_OF_SCALARS/NUNITS or NUNITS/NUMBER_OF_SCALARS, and hence we have
     to replicate the vectors.  */
  while (number_of_vectors > vec_oprnds->length ())
    {
      tree neutral_vec = NULL;

      if (neutral_op)
        {
          if (!neutral_vec)
	    neutral_vec = build_vector_from_val (vector_type, neutral_op);

          vec_oprnds->quick_push (neutral_vec);
        }
      else
        {
	  /* Replicate the already-created vectors cyclically.  */
          for (i = 0; vec_oprnds->iterate (i, &vop) && i < vec_num; i++)
            vec_oprnds->quick_push (vop);
        }
    }
}
3762
3763
3764 /* Get vectorized definitions from SLP_NODE that contains corresponding
3765 vectorized def-stmts. */
3766
3767 static void
vect_get_slp_vect_defs(slp_tree slp_node,vec<tree> * vec_oprnds)3768 vect_get_slp_vect_defs (slp_tree slp_node, vec<tree> *vec_oprnds)
3769 {
3770 tree vec_oprnd;
3771 gimple *vec_def_stmt;
3772 unsigned int i;
3773
3774 gcc_assert (SLP_TREE_VEC_STMTS (slp_node).exists ());
3775
3776 FOR_EACH_VEC_ELT (SLP_TREE_VEC_STMTS (slp_node), i, vec_def_stmt)
3777 {
3778 gcc_assert (vec_def_stmt);
3779 if (gimple_code (vec_def_stmt) == GIMPLE_PHI)
3780 vec_oprnd = gimple_phi_result (vec_def_stmt);
3781 else
3782 vec_oprnd = gimple_get_lhs (vec_def_stmt);
3783 vec_oprnds->quick_push (vec_oprnd);
3784 }
3785 }
3786
3787
/* Get vectorized definitions for SLP_NODE.
   If the scalar definitions are loop invariants or constants, collect them and
   call vect_get_constant_vectors() to create vector stmts.
   Otherwise, the def-stmts must be already vectorized and the vectorized stmts
   must be stored in the corresponding child of SLP_NODE, and we call
   vect_get_slp_vect_defs () to retrieve them.

   OPS holds one scalar operand per RHS position; one vec<tree> of vector
   defs is pushed onto VEC_OPRNDS per operand, in the same order.  The
   caller owns the inner vectors.  */

void
vect_get_slp_defs (vec<tree> ops, slp_tree slp_node,
		   vec<vec<tree> > *vec_oprnds)
{
  gimple *first_stmt;
  int number_of_vects = 0, i;
  unsigned int child_index = 0;
  HOST_WIDE_INT lhs_size_unit, rhs_size_unit;
  slp_tree child = NULL;
  vec<tree> vec_defs;
  tree oprnd;
  bool vectorized_defs;

  first_stmt = SLP_TREE_SCALAR_STMTS (slp_node)[0];
  FOR_EACH_VEC_ELT (ops, i, oprnd)
    {
      /* For each operand we check if it has vectorized definitions in a child
	 node or we need to create them (for invariants and constants).  We
	 check if the LHS of the first stmt of the next child matches OPRND.
	 If it does, we found the correct child.  Otherwise, we call
	 vect_get_constant_vectors (), and not advance CHILD_INDEX in order
	 to check this child node for the next operand.  */
      vectorized_defs = false;
      if (SLP_TREE_CHILDREN (slp_node).length () > child_index)
	{
          child = SLP_TREE_CHILDREN (slp_node)[child_index];

	  /* We have to check both pattern and original def, if available.  */
	  if (SLP_TREE_DEF_TYPE (child) == vect_internal_def)
	    {
	      gimple *first_def = SLP_TREE_SCALAR_STMTS (child)[0];
	      gimple *related
		= STMT_VINFO_RELATED_STMT (vinfo_for_stmt (first_def));
	      tree first_def_op;

	      if (gimple_code (first_def) == GIMPLE_PHI)
		first_def_op = gimple_phi_result (first_def);
	      else
		first_def_op = gimple_get_lhs (first_def);
	      if (operand_equal_p (oprnd, first_def_op, 0)
		  || (related
		      && operand_equal_p (oprnd,
					  gimple_get_lhs (related), 0)))
		{
		  /* The number of vector defs is determined by the number of
		     vector statements in the node from which we get those
		     statements.  */
		  number_of_vects = SLP_TREE_NUMBER_OF_VEC_STMTS (child);
		  vectorized_defs = true;
		  child_index++;
		}
	    }
	  else
	    /* Non-internal child cannot supply vectorized defs; skip it.  */
	    child_index++;
	}

      if (!vectorized_defs)
        {
          if (i == 0)
            {
              number_of_vects = SLP_TREE_NUMBER_OF_VEC_STMTS (slp_node);
              /* Number of vector stmts was calculated according to LHS in
                 vect_schedule_slp_instance (), fix it by replacing LHS with
                 RHS, if necessary.  See vect_get_smallest_scalar_type () for
                 details.  */
              vect_get_smallest_scalar_type (first_stmt, &lhs_size_unit,
                                             &rhs_size_unit);
              if (rhs_size_unit != lhs_size_unit)
                {
                  number_of_vects *= rhs_size_unit;
                  number_of_vects /= lhs_size_unit;
                }
            }
        }

      /* Allocate memory for vectorized defs.  */
      vec_defs = vNULL;
      vec_defs.create (number_of_vects);

      /* For reduction defs we call vect_get_constant_vectors (), since we are
         looking for initial loop invariant values.  */
      if (vectorized_defs)
        /* The defs are already vectorized.  */
	vect_get_slp_vect_defs (child, &vec_defs);
      else
	/* Build vectors from scalar defs.  */
	vect_get_constant_vectors (oprnd, slp_node, &vec_defs, i,
				   number_of_vects);

      vec_oprnds->quick_push (vec_defs);
    }
}
3886
/* Generate vector permute statements from a list of loads in DR_CHAIN.
   If ANALYZE_ONLY is TRUE, only check that it is possible to create valid
   permute statements for the SLP node NODE of the SLP instance
   SLP_NODE_INSTANCE.  Store in *N_PERMS the number of non-identity
   permutations required.  Return false if the load is not a grouped
   access, if the vector length or vectorization factor is not constant,
   or if a required permutation is not supported by the target.  */

bool
vect_transform_slp_perm_load (slp_tree node, vec<tree> dr_chain,
			      gimple_stmt_iterator *gsi, poly_uint64 vf,
			      slp_instance slp_node_instance, bool analyze_only,
			      unsigned *n_perms)
{
  gimple *stmt = SLP_TREE_SCALAR_STMTS (node)[0];
  stmt_vec_info stmt_info = vinfo_for_stmt (stmt);
  tree mask_element_type = NULL_TREE, mask_type;
  int vec_index = 0;
  tree vectype = STMT_VINFO_VECTYPE (stmt_info);
  int group_size = SLP_INSTANCE_GROUP_SIZE (slp_node_instance);
  unsigned int mask_element;
  machine_mode mode;
  unsigned HOST_WIDE_INT nunits, const_vf;

  if (!STMT_VINFO_GROUPED_ACCESS (stmt_info))
    return false;

  /* Switch to the first element of the load group; group-size queries
     below are relative to it.  */
  stmt_info = vinfo_for_stmt (GROUP_FIRST_ELEMENT (stmt_info));

  mode = TYPE_MODE (vectype);

  /* At the moment, all permutations are represented using per-element
     indices, so we can't cope with variable vector lengths or
     vectorization factors.  */
  if (!TYPE_VECTOR_SUBPARTS (vectype).is_constant (&nunits)
      || !vf.is_constant (&const_vf))
    return false;

  /* The generic VEC_PERM_EXPR code always uses an integral type of the
     same size as the vector element being permuted.  */
  mask_element_type = lang_hooks.types.type_for_mode
    (int_mode_for_mode (TYPE_MODE (TREE_TYPE (vectype))).require (), 1);
  mask_type = get_vectype_for_scalar_type (mask_element_type);
  vec_perm_builder mask (nunits, nunits, 1);
  mask.quick_grow (nunits);
  vec_perm_indices indices;

  /* Initialize the vect stmts of NODE to properly insert the generated
     stmts later.  */
  if (! analyze_only)
    for (unsigned i = SLP_TREE_VEC_STMTS (node).length ();
	 i < SLP_TREE_NUMBER_OF_VEC_STMTS (node); i++)
      SLP_TREE_VEC_STMTS (node).quick_push (NULL);

  /* Generate permutation masks for every NODE. Number of masks for each NODE
     is equal to GROUP_SIZE.
     E.g., we have a group of three nodes with three loads from the same
     location in each node, and the vector size is 4.  I.e., we have an
     a0b0c0a1b1c1... sequence and we need to create the following vectors:
     for a's: a0a0a0a1 a1a1a2a2 a2a3a3a3
     for b's: b0b0b0b1 b1b1b2b2 b2b3b3b3
     ...

     The masks for a's should be: {0,0,0,3} {3,3,6,6} {6,9,9,9}.
     The last mask is illegal since we assume two operands for permute
     operation, and the mask element values can't be outside that range.
     Hence, the last mask must be converted into {2,5,5,5}.
     For the first two permutations we need the first and the second input
     vectors: {a0,b0,c0,a1} and {b1,c1,a2,b2}, and for the last permutation
     we need the second and the third vectors: {b1,c1,a2,b2} and
     {c2,a3,b3,c3}.  */

  int vect_stmts_counter = 0;
  unsigned int index = 0;
  int first_vec_index = -1;
  int second_vec_index = -1;
  /* Whether the mask built so far is the identity, in which case no
     permute stmt is needed.  */
  bool noop_p = true;
  *n_perms = 0;

  for (unsigned int j = 0; j < const_vf; j++)
    {
      for (int k = 0; k < group_size; k++)
	{
	  /* Flat scalar-element index of this load within the unrolled
	     group; decompose it into a source vector and a lane.  */
	  unsigned int i = (SLP_TREE_LOAD_PERMUTATION (node)[k]
			    + j * STMT_VINFO_GROUP_SIZE (stmt_info));
	  vec_index = i / nunits;
	  mask_element = i % nunits;
	  if (vec_index == first_vec_index
	      || first_vec_index == -1)
	    {
	      first_vec_index = vec_index;
	    }
	  else if (vec_index == second_vec_index
		   || second_vec_index == -1)
	    {
	      second_vec_index = vec_index;
	      /* Lanes of the second operand are numbered after those of
		 the first in a two-operand VEC_PERM_EXPR mask.  */
	      mask_element += nunits;
	    }
	  else
	    {
	      /* A VEC_PERM_EXPR only has two vector operands; a mask that
		 needs a third source vector cannot be represented.  */
	      if (dump_enabled_p ())
		{
		  dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
				   "permutation requires at "
				   "least three vectors ");
		  dump_gimple_stmt (MSG_MISSED_OPTIMIZATION, TDF_SLIM,
				    stmt, 0);
		}
	      gcc_assert (analyze_only);
	      return false;
	    }

	  gcc_assert (mask_element < 2 * nunits);
	  if (mask_element != index)
	    noop_p = false;
	  mask[index++] = mask_element;

	  if (index == nunits && !noop_p)
	    {
	      /* A full non-identity mask is complete; check the target
		 can do this constant permutation.  */
	      indices.new_vector (mask, 2, nunits);
	      if (!can_vec_perm_const_p (mode, indices))
		{
		  if (dump_enabled_p ())
		    {
		      dump_printf_loc (MSG_MISSED_OPTIMIZATION,
				       vect_location,
				       "unsupported vect permute { ");
		      for (i = 0; i < nunits; ++i)
			{
			  dump_dec (MSG_MISSED_OPTIMIZATION, mask[i]);
			  dump_printf (MSG_MISSED_OPTIMIZATION, " ");
			}
		      dump_printf (MSG_MISSED_OPTIMIZATION, "}\n");
		    }
		  gcc_assert (analyze_only);
		  return false;
		}

	      ++*n_perms;
	    }

	  if (index == nunits)
	    {
	      if (!analyze_only)
		{
		  tree mask_vec = NULL_TREE;

		  if (! noop_p)
		    mask_vec = vec_perm_indices_to_tree (mask_type, indices);

		  if (second_vec_index == -1)
		    second_vec_index = first_vec_index;

		  /* Generate the permute statement if necessary.  */
		  tree first_vec = dr_chain[first_vec_index];
		  tree second_vec = dr_chain[second_vec_index];
		  gimple *perm_stmt;
		  if (! noop_p)
		    {
		      tree perm_dest
			= vect_create_destination_var (gimple_assign_lhs (stmt),
						       vectype);
		      perm_dest = make_ssa_name (perm_dest);
		      perm_stmt = gimple_build_assign (perm_dest,
						       VEC_PERM_EXPR,
						       first_vec, second_vec,
						       mask_vec);
		      vect_finish_stmt_generation (stmt, perm_stmt, gsi);
		    }
		  else
		    /* If mask was NULL_TREE generate the requested
		       identity transform.  */
		    perm_stmt = SSA_NAME_DEF_STMT (first_vec);

		  /* Store the vector statement in NODE.  */
		  SLP_TREE_VEC_STMTS (node)[vect_stmts_counter++] = perm_stmt;
		}

	      /* Reset state for the next mask.  */
	      index = 0;
	      first_vec_index = -1;
	      second_vec_index = -1;
	      noop_p = true;
	    }
	}
    }

  return true;
}
4072
/* Map from a sequence of scalar stmts to the SLP tree that first
   vectorized them; used by vect_schedule_slp_instance to reuse the
   vectorized stmts of an equivalent, already-scheduled node.  */
typedef hash_map <vec <gimple *>, slp_tree,
		  simple_hashmap_traits <bst_traits, slp_tree> >
  scalar_stmts_to_slp_tree_map_t;
4076
/* Vectorize SLP instance tree in postorder.  NODE is the current tree
   node of INSTANCE; BST_MAP records already-vectorized scalar stmt
   sequences so equivalent nodes share their vector stmts.  Return true
   if the vectorized stmt is a store.  */

static bool
vect_schedule_slp_instance (slp_tree node, slp_instance instance,
			    scalar_stmts_to_slp_tree_map_t *bst_map)
{
  gimple *stmt;
  bool grouped_store, is_store;
  gimple_stmt_iterator si;
  stmt_vec_info stmt_info;
  unsigned int group_size;
  tree vectype;
  int i, j;
  slp_tree child;

  /* External and constant defs are materialized by their users.  */
  if (SLP_TREE_DEF_TYPE (node) != vect_internal_def)
    return false;

  /* See if we have already vectorized the same set of stmts and reuse their
     vectorized stmts.  */
  if (slp_tree *leader = bst_map->get (SLP_TREE_SCALAR_STMTS (node)))
    {
      SLP_TREE_VEC_STMTS (node).safe_splice (SLP_TREE_VEC_STMTS (*leader));
      SLP_TREE_NUMBER_OF_VEC_STMTS (node)
	= SLP_TREE_NUMBER_OF_VEC_STMTS (*leader);
      return false;
    }

  /* Record this node as the leader for its scalar stmts, then schedule
     all children first (postorder).  */
  bst_map->put (SLP_TREE_SCALAR_STMTS (node).copy (), node);
  FOR_EACH_VEC_ELT (SLP_TREE_CHILDREN (node), i, child)
    vect_schedule_slp_instance (child, instance, bst_map);

  /* Push SLP node def-type to stmts.  */
  FOR_EACH_VEC_ELT (SLP_TREE_CHILDREN (node), i, child)
    if (SLP_TREE_DEF_TYPE (child) != vect_internal_def)
      FOR_EACH_VEC_ELT (SLP_TREE_SCALAR_STMTS (child), j, stmt)
	STMT_VINFO_DEF_TYPE (vinfo_for_stmt (stmt)) = SLP_TREE_DEF_TYPE (child);

  stmt = SLP_TREE_SCALAR_STMTS (node)[0];
  stmt_info = vinfo_for_stmt (stmt);

  /* VECTYPE is the type of the destination.  */
  vectype = STMT_VINFO_VECTYPE (stmt_info);
  poly_uint64 nunits = TYPE_VECTOR_SUBPARTS (vectype);
  group_size = SLP_INSTANCE_GROUP_SIZE (instance);

  if (!SLP_TREE_VEC_STMTS (node).exists ())
    SLP_TREE_VEC_STMTS (node).create (SLP_TREE_NUMBER_OF_VEC_STMTS (node));

  if (dump_enabled_p ())
    {
      dump_printf_loc (MSG_NOTE,vect_location,
		       "------>vectorizing SLP node starting from: ");
      dump_gimple_stmt (MSG_NOTE, TDF_SLIM, stmt, 0);
    }

  /* Vectorized stmts go before the last scalar stmt which is where
     all uses are ready.  */
  si = gsi_for_stmt (vect_find_last_scalar_stmt_in_slp (node));

  /* Mark the first element of the reduction chain as reduction to properly
     transform the node.  In the analysis phase only the last element of the
     chain is marked as reduction.  */
  if (GROUP_FIRST_ELEMENT (stmt_info) && !STMT_VINFO_GROUPED_ACCESS (stmt_info)
      && GROUP_FIRST_ELEMENT (stmt_info) == stmt)
    {
      STMT_VINFO_DEF_TYPE (stmt_info) = vect_reduction_def;
      STMT_VINFO_TYPE (stmt_info) = reduc_vec_info_type;
    }

  /* Handle two-operation SLP nodes by vectorizing the group with
     both operations and then performing a merge.  */
  if (SLP_TREE_TWO_OPERATORS (node))
    {
      enum tree_code code0 = gimple_assign_rhs_code (stmt);
      enum tree_code ocode = ERROR_MARK;
      gimple *ostmt;
      /* MASK records per scalar stmt which of the two operations it
	 performs: 0 for CODE0, 1 for OCODE.  */
      vec_perm_builder mask (group_size, group_size, 1);
      FOR_EACH_VEC_ELT (SLP_TREE_SCALAR_STMTS (node), i, ostmt)
	if (gimple_assign_rhs_code (ostmt) != code0)
	  {
	    mask.quick_push (1);
	    ocode = gimple_assign_rhs_code (ostmt);
	  }
	else
	  mask.quick_push (0);
      if (ocode != ERROR_MARK)
	{
	  vec<gimple *> v0;
	  vec<gimple *> v1;
	  unsigned j;
	  tree tmask = NULL_TREE;
	  /* Vectorize the whole group with CODE0, save the result in V0,
	     then redo it with OCODE into V1, and blend V0/V1 with a
	     VEC_PERM_EXPR according to MASK.  */
	  vect_transform_stmt (stmt, &si, &grouped_store, node, instance);
	  v0 = SLP_TREE_VEC_STMTS (node).copy ();
	  SLP_TREE_VEC_STMTS (node).truncate (0);
	  gimple_assign_set_rhs_code (stmt, ocode);
	  vect_transform_stmt (stmt, &si, &grouped_store, node, instance);
	  gimple_assign_set_rhs_code (stmt, code0);
	  v1 = SLP_TREE_VEC_STMTS (node).copy ();
	  SLP_TREE_VEC_STMTS (node).truncate (0);
	  tree meltype = build_nonstandard_integer_type
	      (GET_MODE_BITSIZE (SCALAR_TYPE_MODE (TREE_TYPE (vectype))), 1);
	  tree mvectype = get_same_sized_vectype (meltype, vectype);
	  unsigned k = 0, l;
	  for (j = 0; j < v0.length (); ++j)
	    {
	      /* Enforced by vect_build_slp_tree, which rejects variable-length
		 vectors for SLP_TREE_TWO_OPERATORS.  */
	      unsigned int const_nunits = nunits.to_constant ();
	      tree_vector_builder melts (mvectype, const_nunits, 1);
	      for (l = 0; l < const_nunits; ++l)
		{
		  if (k >= group_size)
		    k = 0;
		  tree t = build_int_cst (meltype,
					  mask[k++] * const_nunits + l);
		  melts.quick_push (t);
		}
	      tmask = melts.build ();

	      /* ??? Not all targets support a VEC_PERM_EXPR with a
		 constant mask that would translate to a vec_merge RTX
		 (with their vec_perm_const_ok).  We can either not
		 vectorize in that case or let veclower do its job.
		 Unfortunately that isn't too great and at least for
		 plus/minus we'd eventually like to match targets
		 vector addsub instructions.  */
	      gimple *vstmt;
	      vstmt = gimple_build_assign (make_ssa_name (vectype),
					   VEC_PERM_EXPR,
					   gimple_assign_lhs (v0[j]),
					   gimple_assign_lhs (v1[j]), tmask);
	      vect_finish_stmt_generation (stmt, vstmt, &si);
	      SLP_TREE_VEC_STMTS (node).quick_push (vstmt);
	    }
	  v0.release ();
	  v1.release ();
	  return false;
	}
    }
  is_store = vect_transform_stmt (stmt, &si, &grouped_store, node, instance);

  /* Restore stmt def-types.  */
  FOR_EACH_VEC_ELT (SLP_TREE_CHILDREN (node), i, child)
    if (SLP_TREE_DEF_TYPE (child) != vect_internal_def)
      FOR_EACH_VEC_ELT (SLP_TREE_SCALAR_STMTS (child), j, stmt)
	STMT_VINFO_DEF_TYPE (vinfo_for_stmt (stmt)) = vect_internal_def;

  return is_store;
}
4227
4228 /* Replace scalar calls from SLP node NODE with setting of their lhs to zero.
4229 For loop vectorization this is done in vectorizable_call, but for SLP
4230 it needs to be deferred until end of vect_schedule_slp, because multiple
4231 SLP instances may refer to the same scalar stmt. */
4232
4233 static void
vect_remove_slp_scalar_calls(slp_tree node)4234 vect_remove_slp_scalar_calls (slp_tree node)
4235 {
4236 gimple *stmt, *new_stmt;
4237 gimple_stmt_iterator gsi;
4238 int i;
4239 slp_tree child;
4240 tree lhs;
4241 stmt_vec_info stmt_info;
4242
4243 if (SLP_TREE_DEF_TYPE (node) != vect_internal_def)
4244 return;
4245
4246 FOR_EACH_VEC_ELT (SLP_TREE_CHILDREN (node), i, child)
4247 vect_remove_slp_scalar_calls (child);
4248
4249 FOR_EACH_VEC_ELT (SLP_TREE_SCALAR_STMTS (node), i, stmt)
4250 {
4251 if (!is_gimple_call (stmt) || gimple_bb (stmt) == NULL)
4252 continue;
4253 stmt_info = vinfo_for_stmt (stmt);
4254 if (stmt_info == NULL
4255 || is_pattern_stmt_p (stmt_info)
4256 || !PURE_SLP_STMT (stmt_info))
4257 continue;
4258 lhs = gimple_call_lhs (stmt);
4259 new_stmt = gimple_build_assign (lhs, build_zero_cst (TREE_TYPE (lhs)));
4260 set_vinfo_for_stmt (new_stmt, stmt_info);
4261 set_vinfo_for_stmt (stmt, NULL);
4262 STMT_VINFO_STMT (stmt_info) = new_stmt;
4263 gsi = gsi_for_stmt (stmt);
4264 gsi_replace (&gsi, new_stmt, false);
4265 SSA_NAME_DEF_STMT (gimple_assign_lhs (new_stmt)) = new_stmt;
4266 }
4267 }
4268
/* Generate vector code for all SLP instances in the loop/basic block.
   Returns the result of the last scheduled instance's
   vect_schedule_slp_instance call (true if it was a store).  */

bool
vect_schedule_slp (vec_info *vinfo)
{
  vec<slp_instance> slp_instances;
  slp_instance instance;
  unsigned int i;
  bool is_store = false;


  /* Shared across instances so equivalent nodes in different instances
     reuse their vectorized stmts.  */
  scalar_stmts_to_slp_tree_map_t *bst_map
    = new scalar_stmts_to_slp_tree_map_t ();
  slp_instances = vinfo->slp_instances;
  FOR_EACH_VEC_ELT (slp_instances, i, instance)
    {
      /* Schedule the tree of INSTANCE.  */
      is_store = vect_schedule_slp_instance (SLP_INSTANCE_TREE (instance),
					     instance, bst_map);
      if (dump_enabled_p ())
	dump_printf_loc (MSG_NOTE, vect_location,
			 "vectorizing stmts using SLP.\n");
    }
  delete bst_map;

  /* Second pass: remove the now-dead scalar stmts at each instance root.  */
  FOR_EACH_VEC_ELT (slp_instances, i, instance)
    {
      slp_tree root = SLP_INSTANCE_TREE (instance);
      gimple *store;
      unsigned int j;
      gimple_stmt_iterator gsi;

      /* Remove scalar call stmts.  Do not do this for basic-block
	 vectorization as not all uses may be vectorized.
	 ??? Why should this be necessary?  DCE should be able to
	 remove the stmts itself.
	 ??? For BB vectorization we can as well remove scalar
	 stmts starting from the SLP tree root if they have no
	 uses.  */
      if (is_a <loop_vec_info> (vinfo))
	vect_remove_slp_scalar_calls (root);

      /* Remove up to GROUP_SIZE scalar stores of the root node; stop at
	 the first stmt without a data reference (not a store group).  */
      for (j = 0; SLP_TREE_SCALAR_STMTS (root).iterate (j, &store)
		  && j < SLP_INSTANCE_GROUP_SIZE (instance); j++)
	{
	  if (!STMT_VINFO_DATA_REF (vinfo_for_stmt (store)))
	    break;

	  /* For pattern stmts remove the original stmt they replace.  */
	  if (is_pattern_stmt_p (vinfo_for_stmt (store)))
	    store = STMT_VINFO_RELATED_STMT (vinfo_for_stmt (store));
	  /* Free the attached stmt_vec_info and remove the stmt.  */
	  gsi = gsi_for_stmt (store);
	  unlink_stmt_vdef (store);
	  gsi_remove (&gsi, true);
	  release_defs (store);
	  free_stmt_vec_info (store);
	}
    }

  return is_store;
}
4330