/* Vectorizer Specific Loop Manipulations
   Copyright (C) 2003-2020 Free Software Foundation, Inc.
   Contributed by Dorit Naishlos <dorit@il.ibm.com>
   and Ira Rosen <irar@il.ibm.com>

This file is part of GCC.

GCC is free software; you can redistribute it and/or modify it under
the terms of the GNU General Public License as published by the Free
Software Foundation; either version 3, or (at your option) any later
version.

GCC is distributed in the hope that it will be useful, but WITHOUT ANY
WARRANTY; without even the implied warranty of MERCHANTABILITY or
FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License
for more details.

You should have received a copy of the GNU General Public License
along with GCC; see the file COPYING3.  If not see
<http://www.gnu.org/licenses/>.  */
2210d565efSmrg #include "config.h"
2310d565efSmrg #include "system.h"
2410d565efSmrg #include "coretypes.h"
2510d565efSmrg #include "backend.h"
2610d565efSmrg #include "tree.h"
2710d565efSmrg #include "gimple.h"
2810d565efSmrg #include "cfghooks.h"
2910d565efSmrg #include "tree-pass.h"
3010d565efSmrg #include "ssa.h"
3110d565efSmrg #include "fold-const.h"
3210d565efSmrg #include "cfganal.h"
3310d565efSmrg #include "gimplify.h"
3410d565efSmrg #include "gimple-iterator.h"
3510d565efSmrg #include "gimplify-me.h"
3610d565efSmrg #include "tree-cfg.h"
3710d565efSmrg #include "tree-ssa-loop-manip.h"
3810d565efSmrg #include "tree-into-ssa.h"
3910d565efSmrg #include "tree-ssa.h"
4010d565efSmrg #include "cfgloop.h"
4110d565efSmrg #include "tree-scalar-evolution.h"
4210d565efSmrg #include "tree-vectorizer.h"
4310d565efSmrg #include "tree-ssa-loop-ivopts.h"
44c7a68eb7Smrg #include "gimple-fold.h"
45c7a68eb7Smrg #include "tree-ssa-loop-niter.h"
46c7a68eb7Smrg #include "internal-fn.h"
47c7a68eb7Smrg #include "stor-layout.h"
48c7a68eb7Smrg #include "optabs-query.h"
49c7a68eb7Smrg #include "vec-perm-indices.h"
50*ec02198aSmrg #include "insn-config.h"
51*ec02198aSmrg #include "rtl.h"
52*ec02198aSmrg #include "recog.h"
5310d565efSmrg
/*************************************************************************
  Simple Loop Peeling Utilities

  Utilities to support loop peeling for vectorization purposes.
 *************************************************************************/
5910d565efSmrg
6010d565efSmrg
6110d565efSmrg /* Renames the use *OP_P. */
6210d565efSmrg
6310d565efSmrg static void
rename_use_op(use_operand_p op_p)6410d565efSmrg rename_use_op (use_operand_p op_p)
6510d565efSmrg {
6610d565efSmrg tree new_name;
6710d565efSmrg
6810d565efSmrg if (TREE_CODE (USE_FROM_PTR (op_p)) != SSA_NAME)
6910d565efSmrg return;
7010d565efSmrg
7110d565efSmrg new_name = get_current_def (USE_FROM_PTR (op_p));
7210d565efSmrg
7310d565efSmrg /* Something defined outside of the loop. */
7410d565efSmrg if (!new_name)
7510d565efSmrg return;
7610d565efSmrg
7710d565efSmrg /* An ordinary ssa name defined in the loop. */
7810d565efSmrg
7910d565efSmrg SET_USE (op_p, new_name);
8010d565efSmrg }
8110d565efSmrg
8210d565efSmrg
8310d565efSmrg /* Renames the variables in basic block BB. Allow renaming of PHI arguments
8410d565efSmrg on edges incoming from outer-block header if RENAME_FROM_OUTER_LOOP is
8510d565efSmrg true. */
8610d565efSmrg
static void
rename_variables_in_bb (basic_block bb, bool rename_from_outer_loop)
{
  gimple *stmt;
  use_operand_p use_p;
  ssa_op_iter iter;
  edge e;
  edge_iterator ei;
  class loop *loop = bb->loop_father;
  class loop *outer_loop = NULL;

  if (rename_from_outer_loop)
    {
      /* Renaming from the outer loop only makes sense when BB is
	 inside a loop at all.  */
      gcc_assert (loop);
      outer_loop = loop_outer (loop);
    }

  /* First rename every SSA use in the ordinary statements of BB.  */
  for (gimple_stmt_iterator gsi = gsi_start_bb (bb); !gsi_end_p (gsi);
       gsi_next (&gsi))
    {
      stmt = gsi_stmt (gsi);
      FOR_EACH_SSA_USE_OPERAND (use_p, stmt, iter, SSA_OP_ALL_USES)
	rename_use_op (use_p);
    }

  /* Then rename PHI arguments on each incoming edge.  Arguments on
     edges coming from outside LOOP are skipped, unless
     RENAME_FROM_OUTER_LOOP allows edges from the outer loop's header
     (or from the LOOP_VECTORIZED dispatch block below it).  */
  FOR_EACH_EDGE (e, ei, bb->preds)
    {
      if (!flow_bb_inside_loop_p (loop, e->src))
	{
	  if (!rename_from_outer_loop)
	    continue;
	  if (e->src != outer_loop->header)
	    {
	      if (outer_loop->inner->next)
		{
		  /* If outer_loop has 2 inner loops, allow there to
		     be an extra basic block which decides which of the
		     two loops to use using LOOP_VECTORIZED.  */
		  if (!single_pred_p (e->src)
		      || single_pred (e->src) != outer_loop->header)
		    continue;
		}
	    }
	}
      for (gphi_iterator gsi = gsi_start_phis (bb); !gsi_end_p (gsi);
	   gsi_next (&gsi))
	rename_use_op (PHI_ARG_DEF_PTR_FROM_EDGE (gsi.phi (), e));
    }
}
13610d565efSmrg
13710d565efSmrg
/* A pending debug-stmt adjustment: uses of FROM that are dominated by
   BB (but not by FROM's definition) should be rewritten to TO.  */

struct adjust_info
{
  tree from, to;	/* Old value and its loop-closed replacement.  */
  basic_block bb;	/* Block whose dominated debug uses are adjusted.  */
};

/* A stack of values to be adjusted in debug stmts.  We have to
   process them LIFO, so that the closest substitution applies.  If we
   processed them FIFO, without the stack, we might substitute uses
   with a PHI DEF that would soon become non-dominant, and when we got
   to the suitable one, it wouldn't have anything to substitute any
   more.  */
static vec<adjust_info, va_heap> adjust_vec;
15110d565efSmrg
15210d565efSmrg /* Adjust any debug stmts that referenced AI->from values to use the
15310d565efSmrg loop-closed AI->to, if the references are dominated by AI->bb and
15410d565efSmrg not by the definition of AI->from. */
15510d565efSmrg
static void
adjust_debug_stmts_now (adjust_info *ai)
{
  basic_block bbphi = ai->bb;
  tree orig_def = ai->from;
  tree new_def = ai->to;
  imm_use_iterator imm_iter;
  gimple *stmt;
  /* Block containing the definition of the value being replaced.  */
  basic_block bbdef = gimple_bb (SSA_NAME_DEF_STMT (orig_def));

  /* The dominance test below requires up-to-date dominator info.  */
  gcc_assert (dom_info_available_p (CDI_DOMINATORS));

  /* Adjust any debug stmts that held onto non-loop-closed
     references.  */
  FOR_EACH_IMM_USE_STMT (stmt, imm_iter, orig_def)
    {
      use_operand_p use_p;
      basic_block bbuse;

      if (!is_gimple_debug (stmt))
	continue;

      gcc_assert (gimple_debug_bind_p (stmt));

      bbuse = gimple_bb (stmt);

      /* Only rewrite uses that AI->bb dominates but the original
	 definition does not; uses still dominated by the definition
	 remain valid as-is.  */
      if ((bbuse == bbphi
	   || dominated_by_p (CDI_DOMINATORS, bbuse, bbphi))
	  && !(bbuse == bbdef
	       || dominated_by_p (CDI_DOMINATORS, bbuse, bbdef)))
	{
	  if (new_def)
	    /* Point the debug bind at the loop-closed replacement.  */
	    FOR_EACH_IMM_USE_ON_STMT (use_p, imm_iter)
	      SET_USE (use_p, new_def);
	  else
	    {
	      /* No replacement available: drop the bound value.  */
	      gimple_debug_bind_reset_value (stmt);
	      update_stmt (stmt);
	    }
	}
    }
}
19810d565efSmrg
19910d565efSmrg /* Adjust debug stmts as scheduled before. */
20010d565efSmrg
20110d565efSmrg static void
adjust_vec_debug_stmts(void)20210d565efSmrg adjust_vec_debug_stmts (void)
20310d565efSmrg {
204c7a68eb7Smrg if (!MAY_HAVE_DEBUG_BIND_STMTS)
20510d565efSmrg return;
20610d565efSmrg
20710d565efSmrg gcc_assert (adjust_vec.exists ());
20810d565efSmrg
20910d565efSmrg while (!adjust_vec.is_empty ())
21010d565efSmrg {
21110d565efSmrg adjust_debug_stmts_now (&adjust_vec.last ());
21210d565efSmrg adjust_vec.pop ();
21310d565efSmrg }
21410d565efSmrg }
21510d565efSmrg
21610d565efSmrg /* Adjust any debug stmts that referenced FROM values to use the
21710d565efSmrg loop-closed TO, if the references are dominated by BB and not by
21810d565efSmrg the definition of FROM. If adjust_vec is non-NULL, adjustments
21910d565efSmrg will be postponed until adjust_vec_debug_stmts is called. */
22010d565efSmrg
22110d565efSmrg static void
adjust_debug_stmts(tree from,tree to,basic_block bb)22210d565efSmrg adjust_debug_stmts (tree from, tree to, basic_block bb)
22310d565efSmrg {
22410d565efSmrg adjust_info ai;
22510d565efSmrg
226c7a68eb7Smrg if (MAY_HAVE_DEBUG_BIND_STMTS
22710d565efSmrg && TREE_CODE (from) == SSA_NAME
22810d565efSmrg && ! SSA_NAME_IS_DEFAULT_DEF (from)
22910d565efSmrg && ! virtual_operand_p (from))
23010d565efSmrg {
23110d565efSmrg ai.from = from;
23210d565efSmrg ai.to = to;
23310d565efSmrg ai.bb = bb;
23410d565efSmrg
23510d565efSmrg if (adjust_vec.exists ())
23610d565efSmrg adjust_vec.safe_push (ai);
23710d565efSmrg else
23810d565efSmrg adjust_debug_stmts_now (&ai);
23910d565efSmrg }
24010d565efSmrg }
24110d565efSmrg
24210d565efSmrg /* Change E's phi arg in UPDATE_PHI to NEW_DEF, and record information
24310d565efSmrg to adjust any debug stmts that referenced the old phi arg,
24410d565efSmrg presumably non-loop-closed references left over from other
24510d565efSmrg transformations. */
24610d565efSmrg
24710d565efSmrg static void
adjust_phi_and_debug_stmts(gimple * update_phi,edge e,tree new_def)24810d565efSmrg adjust_phi_and_debug_stmts (gimple *update_phi, edge e, tree new_def)
24910d565efSmrg {
25010d565efSmrg tree orig_def = PHI_ARG_DEF_FROM_EDGE (update_phi, e);
25110d565efSmrg
25210d565efSmrg SET_PHI_ARG_DEF (update_phi, e->dest_idx, new_def);
25310d565efSmrg
254c7a68eb7Smrg if (MAY_HAVE_DEBUG_BIND_STMTS)
25510d565efSmrg adjust_debug_stmts (orig_def, PHI_RESULT (update_phi),
25610d565efSmrg gimple_bb (update_phi));
25710d565efSmrg }
25810d565efSmrg
259c7a68eb7Smrg /* Define one loop mask MASK from loop LOOP. INIT_MASK is the value that
260c7a68eb7Smrg the mask should have during the first iteration and NEXT_MASK is the
261c7a68eb7Smrg value that it should have on subsequent iterations. */
26210d565efSmrg
263c7a68eb7Smrg static void
vect_set_loop_mask(class loop * loop,tree mask,tree init_mask,tree next_mask)264*ec02198aSmrg vect_set_loop_mask (class loop *loop, tree mask, tree init_mask,
265c7a68eb7Smrg tree next_mask)
266c7a68eb7Smrg {
267c7a68eb7Smrg gphi *phi = create_phi_node (mask, loop->header);
268c7a68eb7Smrg add_phi_arg (phi, init_mask, loop_preheader_edge (loop), UNKNOWN_LOCATION);
269c7a68eb7Smrg add_phi_arg (phi, next_mask, loop_latch_edge (loop), UNKNOWN_LOCATION);
270c7a68eb7Smrg }
27110d565efSmrg
272c7a68eb7Smrg /* Add SEQ to the end of LOOP's preheader block. */
273c7a68eb7Smrg
274c7a68eb7Smrg static void
add_preheader_seq(class loop * loop,gimple_seq seq)275*ec02198aSmrg add_preheader_seq (class loop *loop, gimple_seq seq)
276c7a68eb7Smrg {
277c7a68eb7Smrg if (seq)
278c7a68eb7Smrg {
279c7a68eb7Smrg edge pe = loop_preheader_edge (loop);
280c7a68eb7Smrg basic_block new_bb = gsi_insert_seq_on_edge_immediate (pe, seq);
281c7a68eb7Smrg gcc_assert (!new_bb);
282c7a68eb7Smrg }
283c7a68eb7Smrg }
284c7a68eb7Smrg
285c7a68eb7Smrg /* Add SEQ to the beginning of LOOP's header block. */
286c7a68eb7Smrg
287c7a68eb7Smrg static void
add_header_seq(class loop * loop,gimple_seq seq)288*ec02198aSmrg add_header_seq (class loop *loop, gimple_seq seq)
289c7a68eb7Smrg {
290c7a68eb7Smrg if (seq)
291c7a68eb7Smrg {
292c7a68eb7Smrg gimple_stmt_iterator gsi = gsi_after_labels (loop->header);
293c7a68eb7Smrg gsi_insert_seq_before (&gsi, seq, GSI_SAME_STMT);
294c7a68eb7Smrg }
295c7a68eb7Smrg }
296c7a68eb7Smrg
297c7a68eb7Smrg /* Return true if the target can interleave elements of two vectors.
298c7a68eb7Smrg OFFSET is 0 if the first half of the vectors should be interleaved
299c7a68eb7Smrg or 1 if the second half should. When returning true, store the
300c7a68eb7Smrg associated permutation in INDICES. */
301c7a68eb7Smrg
302c7a68eb7Smrg static bool
interleave_supported_p(vec_perm_indices * indices,tree vectype,unsigned int offset)303c7a68eb7Smrg interleave_supported_p (vec_perm_indices *indices, tree vectype,
304c7a68eb7Smrg unsigned int offset)
305c7a68eb7Smrg {
306c7a68eb7Smrg poly_uint64 nelts = TYPE_VECTOR_SUBPARTS (vectype);
307c7a68eb7Smrg poly_uint64 base = exact_div (nelts, 2) * offset;
308c7a68eb7Smrg vec_perm_builder sel (nelts, 2, 3);
309c7a68eb7Smrg for (unsigned int i = 0; i < 3; ++i)
310c7a68eb7Smrg {
311c7a68eb7Smrg sel.quick_push (base + i);
312c7a68eb7Smrg sel.quick_push (base + i + nelts);
313c7a68eb7Smrg }
314c7a68eb7Smrg indices->new_vector (sel, 2, nelts);
315c7a68eb7Smrg return can_vec_perm_const_p (TYPE_MODE (vectype), *indices);
316c7a68eb7Smrg }
317c7a68eb7Smrg
318c7a68eb7Smrg /* Try to use permutes to define the masks in DEST_RGM using the masks
319c7a68eb7Smrg in SRC_RGM, given that the former has twice as many masks as the
320c7a68eb7Smrg latter. Return true on success, adding any new statements to SEQ. */
321c7a68eb7Smrg
static bool
vect_maybe_permute_loop_masks (gimple_seq *seq, rgroup_masks *dest_rgm,
			       rgroup_masks *src_rgm)
{
  tree src_masktype = src_rgm->mask_type;
  tree dest_masktype = dest_rgm->mask_type;
  machine_mode src_mode = TYPE_MODE (src_masktype);
  insn_code icode1, icode2;
  /* Strategy 1: unpack each source mask into a hi and a lo half,
     when the target has both unpack optabs for SRC_MODE.  */
  if (dest_rgm->max_nscalars_per_iter <= src_rgm->max_nscalars_per_iter
      && (icode1 = optab_handler (vec_unpacku_hi_optab,
				  src_mode)) != CODE_FOR_nothing
      && (icode2 = optab_handler (vec_unpacku_lo_optab,
				  src_mode)) != CODE_FOR_nothing)
    {
      /* Unpacking the source masks gives at least as many mask bits as
	 we need.  We can then VIEW_CONVERT any excess bits away.  */
      machine_mode dest_mode = insn_data[icode1].operand[0].mode;
      /* Both unpack directions must agree on the result mode.  */
      gcc_assert (dest_mode == insn_data[icode2].operand[0].mode);
      tree unpack_masktype = vect_halve_mask_nunits (src_masktype, dest_mode);
      for (unsigned int i = 0; i < dest_rgm->masks.length (); ++i)
	{
	  /* Source mask I/2 supplies destination masks 2*(I/2) and
	     2*(I/2)+1, via the lo and hi unpacks respectively
	     (order depends on endianness).  */
	  tree src = src_rgm->masks[i / 2];
	  tree dest = dest_rgm->masks[i];
	  tree_code code = ((i & 1) == (BYTES_BIG_ENDIAN ? 0 : 1)
			    ? VEC_UNPACK_HI_EXPR
			    : VEC_UNPACK_LO_EXPR);
	  gassign *stmt;
	  if (dest_masktype == unpack_masktype)
	    stmt = gimple_build_assign (dest, code, src);
	  else
	    {
	      /* The unpacked mask is wider than DEST needs; view-convert
		 the excess bits away.  */
	      tree temp = make_ssa_name (unpack_masktype);
	      stmt = gimple_build_assign (temp, code, src);
	      gimple_seq_add_stmt (seq, stmt);
	      stmt = gimple_build_assign (dest, VIEW_CONVERT_EXPR,
					  build1 (VIEW_CONVERT_EXPR,
						  dest_masktype, temp));
	    }
	  gimple_seq_add_stmt (seq, stmt);
	}
      return true;
    }
  /* Strategy 2: same mask type on both sides — interleave the low and
     high halves of each source mask with itself.  */
  vec_perm_indices indices[2];
  if (dest_masktype == src_masktype
      && interleave_supported_p (&indices[0], src_masktype, 0)
      && interleave_supported_p (&indices[1], src_masktype, 1))
    {
      /* The destination requires twice as many mask bits as the source, so
	 we can use interleaving permutes to double up the number of bits.  */
      tree masks[2];
      for (unsigned int i = 0; i < 2; ++i)
	masks[i] = vect_gen_perm_mask_checked (src_masktype, indices[i]);
      for (unsigned int i = 0; i < dest_rgm->masks.length (); ++i)
	{
	  tree src = src_rgm->masks[i / 2];
	  tree dest = dest_rgm->masks[i];
	  gimple *stmt = gimple_build_assign (dest, VEC_PERM_EXPR,
					      src, src, masks[i & 1]);
	  gimple_seq_add_stmt (seq, stmt);
	}
      return true;
    }
  /* Neither strategy is supported by the target.  */
  return false;
}
386c7a68eb7Smrg
/* Helper for vect_set_loop_condition_masked.  Generate definitions for
   all the masks in RGM and return a mask that is nonzero when the loop
   needs to iterate.  Add any new preheader statements to PREHEADER_SEQ.
   Use LOOP_COND_GSI to insert code before the exit gcond.

   RGM belongs to loop LOOP.  The loop originally iterated NITERS
   times and has been vectorized according to LOOP_VINFO.

   If NITERS_SKIP is nonnull, the first iteration of the vectorized loop
   starts with NITERS_SKIP dummy iterations of the scalar loop before
   the real work starts.  The mask elements for these dummy iterations
   must be 0, to ensure that the extra iterations do not have an effect.

   It is known that:

     NITERS * RGM->max_nscalars_per_iter

   does not overflow.  However, MIGHT_WRAP_P says whether an induction
   variable that starts at 0 and has step:

     VF * RGM->max_nscalars_per_iter

   might overflow before hitting a value above:

     (NITERS + NITERS_SKIP) * RGM->max_nscalars_per_iter

   This means that we cannot guarantee that such an induction variable
   would ever hit a value that produces a set of all-false masks for RGM.  */
static tree
vect_set_loop_masks_directly (class loop *loop, loop_vec_info loop_vinfo,
			      gimple_seq *preheader_seq,
			      gimple_stmt_iterator loop_cond_gsi,
			      rgroup_masks *rgm, tree niters, tree niters_skip,
			      bool might_wrap_p)
{
  tree compare_type = LOOP_VINFO_MASK_COMPARE_TYPE (loop_vinfo);
  tree iv_type = LOOP_VINFO_MASK_IV_TYPE (loop_vinfo);
  tree mask_type = rgm->mask_type;
  unsigned int nscalars_per_iter = rgm->max_nscalars_per_iter;
  poly_uint64 nscalars_per_mask = TYPE_VECTOR_SUBPARTS (mask_type);
  poly_uint64 vf = LOOP_VINFO_VECT_FACTOR (loop_vinfo);

  /* Calculate the maximum number of scalar values that the rgroup
     handles in total, the number that it handles for each iteration
     of the vector loop, and the number that it should skip during the
     first iteration of the vector loop.  */
  tree nscalars_total = niters;
  tree nscalars_step = build_int_cst (iv_type, vf);
  tree nscalars_skip = niters_skip;
  if (nscalars_per_iter != 1)
    {
      /* We checked before choosing to use a fully-masked loop that these
	 multiplications don't overflow.  */
      tree compare_factor = build_int_cst (compare_type, nscalars_per_iter);
      tree iv_factor = build_int_cst (iv_type, nscalars_per_iter);
      nscalars_total = gimple_build (preheader_seq, MULT_EXPR, compare_type,
				     nscalars_total, compare_factor);
      nscalars_step = gimple_build (preheader_seq, MULT_EXPR, iv_type,
				    nscalars_step, iv_factor);
      if (nscalars_skip)
	nscalars_skip = gimple_build (preheader_seq, MULT_EXPR, compare_type,
				      nscalars_skip, compare_factor);
    }

  /* Create an induction variable that counts the number of scalars
     processed.  */
  tree index_before_incr, index_after_incr;
  gimple_stmt_iterator incr_gsi;
  bool insert_after;
  standard_iv_increment_position (loop, &incr_gsi, &insert_after);
  create_iv (build_int_cst (iv_type, 0), nscalars_step, NULL_TREE, loop,
	     &incr_gsi, insert_after, &index_before_incr, &index_after_incr);

  tree zero_index = build_int_cst (compare_type, 0);
  tree test_index, test_limit, first_limit;
  gimple_stmt_iterator *test_gsi;
  if (might_wrap_p)
    {
      /* In principle the loop should stop iterating once the incremented
	 IV reaches a value greater than or equal to:

	   NSCALARS_TOTAL +[infinite-prec] NSCALARS_SKIP

	 However, there's no guarantee that this addition doesn't overflow
	 the comparison type, or that the IV hits a value above it before
	 wrapping around.  We therefore adjust the limit down by one
	 IV step:

	   (NSCALARS_TOTAL +[infinite-prec] NSCALARS_SKIP)
	   -[infinite-prec] NSCALARS_STEP

	 and compare the IV against this limit _before_ incrementing it.
	 Since the comparison type is unsigned, we actually want the
	 subtraction to saturate at zero:

	   (NSCALARS_TOTAL +[infinite-prec] NSCALARS_SKIP)
	   -[sat] NSCALARS_STEP

	 And since NSCALARS_SKIP < NSCALARS_STEP, we can reassociate this as:

	   NSCALARS_TOTAL -[sat] (NSCALARS_STEP - NSCALARS_SKIP)

	 where the rightmost subtraction can be done directly in
	 COMPARE_TYPE.  */
      test_index = index_before_incr;
      tree adjust = gimple_convert (preheader_seq, compare_type,
				    nscalars_step);
      if (nscalars_skip)
	adjust = gimple_build (preheader_seq, MINUS_EXPR, compare_type,
			       adjust, nscalars_skip);
      /* MAX then MINUS implements the saturating subtraction above.  */
      test_limit = gimple_build (preheader_seq, MAX_EXPR, compare_type,
				 nscalars_total, adjust);
      test_limit = gimple_build (preheader_seq, MINUS_EXPR, compare_type,
				 test_limit, adjust);
      /* Test before the increment, i.e. at the IV increment position.  */
      test_gsi = &incr_gsi;

      /* Get a safe limit for the first iteration.  */
      if (nscalars_skip)
	{
	  /* The first vector iteration can handle at most NSCALARS_STEP
	     scalars.  NSCALARS_STEP <= CONST_LIMIT, and adding
	     NSCALARS_SKIP to that cannot overflow.  */
	  tree const_limit = build_int_cst (compare_type,
					    LOOP_VINFO_VECT_FACTOR (loop_vinfo)
					    * nscalars_per_iter);
	  first_limit = gimple_build (preheader_seq, MIN_EXPR, compare_type,
				      nscalars_total, const_limit);
	  first_limit = gimple_build (preheader_seq, PLUS_EXPR, compare_type,
				      first_limit, nscalars_skip);
	}
      else
	/* For the first iteration it doesn't matter whether the IV hits
	   a value above NSCALARS_TOTAL.  That only matters for the latch
	   condition.  */
	first_limit = nscalars_total;
    }
  else
    {
      /* Test the incremented IV, which will always hit a value above
	 the bound before wrapping.  */
      test_index = index_after_incr;
      test_limit = nscalars_total;
      if (nscalars_skip)
	test_limit = gimple_build (preheader_seq, PLUS_EXPR, compare_type,
				   test_limit, nscalars_skip);
      /* Test just before the exit condition.  */
      test_gsi = &loop_cond_gsi;

      first_limit = test_limit;
    }

  /* Convert the IV value to the comparison type (either a no-op or
     a demotion).  */
  gimple_seq test_seq = NULL;
  test_index = gimple_convert (&test_seq, compare_type, test_index);
  gsi_insert_seq_before (test_gsi, test_seq, GSI_SAME_STMT);

  /* Provide a definition of each mask in the group.  Iterate in reverse
     so that the returned NEXT_MASK is the one for the first mask.  */
  tree next_mask = NULL_TREE;
  tree mask;
  unsigned int i;
  FOR_EACH_VEC_ELT_REVERSE (rgm->masks, i, mask)
    {
      /* Previous masks will cover BIAS scalars.  This mask covers the
	 next batch.  */
      poly_uint64 bias = nscalars_per_mask * i;
      tree bias_tree = build_int_cst (compare_type, bias);
      gimple *tmp_stmt;

      /* See whether the first iteration of the vector loop is known
	 to have a full mask.  */
      poly_uint64 const_limit;
      bool first_iteration_full
	= (poly_int_tree_p (first_limit, &const_limit)
	   && known_ge (const_limit, (i + 1) * nscalars_per_mask));

      /* Rather than have a new IV that starts at BIAS and goes up to
	 TEST_LIMIT, prefer to use the same 0-based IV for each mask
	 and adjust the bound down by BIAS.  */
      tree this_test_limit = test_limit;
      if (i != 0)
	{
	  /* MAX followed by MINUS keeps the adjusted limit from
	     wrapping below zero.  */
	  this_test_limit = gimple_build (preheader_seq, MAX_EXPR,
					  compare_type, this_test_limit,
					  bias_tree);
	  this_test_limit = gimple_build (preheader_seq, MINUS_EXPR,
					  compare_type, this_test_limit,
					  bias_tree);
	}

      /* Create the initial mask.  First include all scalars that
	 are within the loop limit.  */
      tree init_mask = NULL_TREE;
      if (!first_iteration_full)
	{
	  tree start, end;
	  if (first_limit == test_limit)
	    {
	      /* Use a natural test between zero (the initial IV value)
		 and the loop limit.  The "else" block would be valid too,
		 but this choice can avoid the need to load BIAS_TREE into
		 a register.  */
	      start = zero_index;
	      end = this_test_limit;
	    }
	  else
	    {
	      /* FIRST_LIMIT is the maximum number of scalars handled by the
		 first iteration of the vector loop.  Test the portion
		 associated with this mask.  */
	      start = bias_tree;
	      end = first_limit;
	    }

	  init_mask = make_temp_ssa_name (mask_type, NULL, "max_mask");
	  tmp_stmt = vect_gen_while (init_mask, start, end);
	  gimple_seq_add_stmt (preheader_seq, tmp_stmt);
	}

      /* Now AND out the bits that are within the number of skipped
	 scalars.  */
      poly_uint64 const_skip;
      if (nscalars_skip
	  && !(poly_int_tree_p (nscalars_skip, &const_skip)
	       && known_le (const_skip, bias)))
	{
	  tree unskipped_mask = vect_gen_while_not (preheader_seq, mask_type,
						    bias_tree, nscalars_skip);
	  if (init_mask)
	    init_mask = gimple_build (preheader_seq, BIT_AND_EXPR, mask_type,
				      init_mask, unskipped_mask);
	  else
	    init_mask = unskipped_mask;
	}

      if (!init_mask)
	/* First iteration is full.  */
	init_mask = build_minus_one_cst (mask_type);

      /* Get the mask value for the next iteration of the loop.  */
      next_mask = make_temp_ssa_name (mask_type, NULL, "next_mask");
      gcall *call = vect_gen_while (next_mask, test_index, this_test_limit);
      gsi_insert_before (test_gsi, call, GSI_SAME_STMT);

      vect_set_loop_mask (loop, mask, init_mask, next_mask);
    }
  return next_mask;
}
635c7a68eb7Smrg
636c7a68eb7Smrg /* Make LOOP iterate NITERS times using masking and WHILE_ULT calls.
637c7a68eb7Smrg LOOP_VINFO describes the vectorization of LOOP. NITERS is the
638c7a68eb7Smrg number of iterations of the original scalar loop that should be
639c7a68eb7Smrg handled by the vector loop. NITERS_MAYBE_ZERO and FINAL_IV are
640c7a68eb7Smrg as for vect_set_loop_condition.
641c7a68eb7Smrg
642c7a68eb7Smrg Insert the branch-back condition before LOOP_COND_GSI and return the
643c7a68eb7Smrg final gcond. */
644c7a68eb7Smrg
645c7a68eb7Smrg static gcond *
vect_set_loop_condition_masked(class loop * loop,loop_vec_info loop_vinfo,tree niters,tree final_iv,bool niters_maybe_zero,gimple_stmt_iterator loop_cond_gsi)646*ec02198aSmrg vect_set_loop_condition_masked (class loop *loop, loop_vec_info loop_vinfo,
647c7a68eb7Smrg 				tree niters, tree final_iv,
648c7a68eb7Smrg 				bool niters_maybe_zero,
649c7a68eb7Smrg 				gimple_stmt_iterator loop_cond_gsi)
650c7a68eb7Smrg {
  /* Statements to be emitted on the loop preheader edge and at the
     start of the loop header respectively.  */
651c7a68eb7Smrg   gimple_seq preheader_seq = NULL;
652c7a68eb7Smrg   gimple_seq header_seq = NULL;
653c7a68eb7Smrg
654c7a68eb7Smrg   tree compare_type = LOOP_VINFO_MASK_COMPARE_TYPE (loop_vinfo);
655c7a68eb7Smrg   unsigned int compare_precision = TYPE_PRECISION (compare_type);
  /* Keep the original NITERS for the FINAL_IV assignment below;
     NITERS itself is converted to COMPARE_TYPE.  */
656c7a68eb7Smrg   tree orig_niters = niters;
657c7a68eb7Smrg
658c7a68eb7Smrg   /* Type of the initial value of NITERS.  */
659c7a68eb7Smrg   tree ni_actual_type = TREE_TYPE (niters);
660c7a68eb7Smrg   unsigned int ni_actual_precision = TYPE_PRECISION (ni_actual_type);
661*ec02198aSmrg   tree niters_skip = LOOP_VINFO_MASK_SKIP_NITERS (loop_vinfo);
662c7a68eb7Smrg
663c7a68eb7Smrg   /* Convert NITERS to the same size as the compare.  */
664c7a68eb7Smrg   if (compare_precision > ni_actual_precision
665c7a68eb7Smrg       && niters_maybe_zero)
666c7a68eb7Smrg     {
667c7a68eb7Smrg       /* We know that there is always at least one iteration, so if the
668c7a68eb7Smrg 	 count is zero then it must have wrapped.  Cope with this by
669c7a68eb7Smrg 	 subtracting 1 before the conversion and adding 1 to the result.  */
670c7a68eb7Smrg       gcc_assert (TYPE_UNSIGNED (ni_actual_type));
671c7a68eb7Smrg       niters = gimple_build (&preheader_seq, PLUS_EXPR, ni_actual_type,
672c7a68eb7Smrg 			     niters, build_minus_one_cst (ni_actual_type));
673c7a68eb7Smrg       niters = gimple_convert (&preheader_seq, compare_type, niters);
674c7a68eb7Smrg       niters = gimple_build (&preheader_seq, PLUS_EXPR, compare_type,
675c7a68eb7Smrg 			     niters, build_one_cst (compare_type));
676c7a68eb7Smrg     }
677c7a68eb7Smrg   else
678c7a68eb7Smrg     niters = gimple_convert (&preheader_seq, compare_type, niters);
679c7a68eb7Smrg
  /* Bound used by the might_wrap_p test below; -1 means no useful
     bound is known.  */
680*ec02198aSmrg   widest_int iv_limit = vect_iv_limit_for_full_masking (loop_vinfo);
681c7a68eb7Smrg
682c7a68eb7Smrg   /* Iterate over all the rgroups and fill in their masks.  We could use
683c7a68eb7Smrg      the first mask from any rgroup for the loop condition; here we
684c7a68eb7Smrg      arbitrarily pick the last.  */
685c7a68eb7Smrg   tree test_mask = NULL_TREE;
686c7a68eb7Smrg   rgroup_masks *rgm;
687c7a68eb7Smrg   unsigned int i;
688c7a68eb7Smrg   vec_loop_masks *masks = &LOOP_VINFO_MASKS (loop_vinfo);
689c7a68eb7Smrg   FOR_EACH_VEC_ELT (*masks, i, rgm)
690c7a68eb7Smrg     if (!rgm->masks.is_empty ())
691c7a68eb7Smrg       {
692c7a68eb7Smrg 	/* First try using permutes.  This adds a single vector
693c7a68eb7Smrg 	   instruction to the loop for each mask, but needs no extra
694c7a68eb7Smrg 	   loop invariants or IVs.  */
695c7a68eb7Smrg 	unsigned int nmasks = i + 1;
696c7a68eb7Smrg 	if ((nmasks & 1) == 0)
697c7a68eb7Smrg 	  {
698c7a68eb7Smrg 	    rgroup_masks *half_rgm = &(*masks)[nmasks / 2 - 1];
699c7a68eb7Smrg 	    if (!half_rgm->masks.is_empty ()
700c7a68eb7Smrg 		&& vect_maybe_permute_loop_masks (&header_seq, rgm, half_rgm))
701c7a68eb7Smrg 	      continue;
702c7a68eb7Smrg 	  }
703c7a68eb7Smrg
704c7a68eb7Smrg 	/* See whether zero-based IV would ever generate all-false masks
705c7a68eb7Smrg 	   before wrapping around.  */
706c7a68eb7Smrg 	bool might_wrap_p
707*ec02198aSmrg 	  = (iv_limit == -1
708c7a68eb7Smrg 	     || (wi::min_precision (iv_limit * rgm->max_nscalars_per_iter,
709c7a68eb7Smrg 				    UNSIGNED)
710c7a68eb7Smrg 		 > compare_precision));
711c7a68eb7Smrg
712c7a68eb7Smrg 	/* Set up all masks for this group.  */
713c7a68eb7Smrg 	test_mask = vect_set_loop_masks_directly (loop, loop_vinfo,
714c7a68eb7Smrg 						  &preheader_seq,
715*ec02198aSmrg 						  loop_cond_gsi, rgm,
716c7a68eb7Smrg 						  niters, niters_skip,
717c7a68eb7Smrg 						  might_wrap_p);
718c7a68eb7Smrg       }
719c7a68eb7Smrg
720c7a68eb7Smrg   /* Emit all accumulated statements.  */
721c7a68eb7Smrg   add_preheader_seq (loop, preheader_seq);
722c7a68eb7Smrg   add_header_seq (loop, header_seq);
723c7a68eb7Smrg
724c7a68eb7Smrg   /* Get a boolean result that tells us whether to iterate.  */
725c7a68eb7Smrg   edge exit_edge = single_exit (loop);
  /* Pick the comparison code so that the "continue looping" outcome
     matches the polarity of the existing exit edge.  */
726c7a68eb7Smrg   tree_code code = (exit_edge->flags & EDGE_TRUE_VALUE) ? EQ_EXPR : NE_EXPR;
727c7a68eb7Smrg   tree zero_mask = build_zero_cst (TREE_TYPE (test_mask));
728c7a68eb7Smrg   gcond *cond_stmt = gimple_build_cond (code, test_mask, zero_mask,
729c7a68eb7Smrg 					NULL_TREE, NULL_TREE);
730c7a68eb7Smrg   gsi_insert_before (&loop_cond_gsi, cond_stmt, GSI_SAME_STMT);
731c7a68eb7Smrg
732c7a68eb7Smrg   /* The loop iterates (NITERS - 1) / VF + 1 times.
733c7a68eb7Smrg      Subtract one from this to get the latch count.  */
734c7a68eb7Smrg   tree step = build_int_cst (compare_type,
735c7a68eb7Smrg 			     LOOP_VINFO_VECT_FACTOR (loop_vinfo));
736c7a68eb7Smrg   tree niters_minus_one = fold_build2 (PLUS_EXPR, compare_type, niters,
737c7a68eb7Smrg 				       build_minus_one_cst (compare_type));
738c7a68eb7Smrg   loop->nb_iterations = fold_build2 (TRUNC_DIV_EXPR, compare_type,
739c7a68eb7Smrg 				     niters_minus_one, step);
740c7a68eb7Smrg
  /* If requested, set FINAL_IV to the scalar iteration count on the
     exit edge.  */
741c7a68eb7Smrg   if (final_iv)
742c7a68eb7Smrg     {
743c7a68eb7Smrg       gassign *assign = gimple_build_assign (final_iv, orig_niters);
744c7a68eb7Smrg       gsi_insert_on_edge_immediate (single_exit (loop), assign);
745c7a68eb7Smrg     }
746c7a68eb7Smrg
747c7a68eb7Smrg   return cond_stmt;
748c7a68eb7Smrg }
749c7a68eb7Smrg
750c7a68eb7Smrg /* Like vect_set_loop_condition, but handle the case in which there
751c7a68eb7Smrg are no loop masks. */
752c7a68eb7Smrg
753c7a68eb7Smrg static gcond *
vect_set_loop_condition_unmasked(class loop * loop,tree niters,tree step,tree final_iv,bool niters_maybe_zero,gimple_stmt_iterator loop_cond_gsi)754*ec02198aSmrg vect_set_loop_condition_unmasked (class loop *loop, tree niters,
755c7a68eb7Smrg 				  tree step, tree final_iv,
756c7a68eb7Smrg 				  bool niters_maybe_zero,
757c7a68eb7Smrg 				  gimple_stmt_iterator loop_cond_gsi)
75810d565efSmrg {
75910d565efSmrg   tree indx_before_incr, indx_after_incr;
76010d565efSmrg   gcond *cond_stmt;
76110d565efSmrg   gcond *orig_cond;
762c7a68eb7Smrg   edge pe = loop_preheader_edge (loop);
76310d565efSmrg   edge exit_edge = single_exit (loop);
76410d565efSmrg   gimple_stmt_iterator incr_gsi;
76510d565efSmrg   bool insert_after;
76610d565efSmrg   enum tree_code code;
767c7a68eb7Smrg   tree niters_type = TREE_TYPE (niters);
76810d565efSmrg
76910d565efSmrg   orig_cond = get_loop_exit_condition (loop);
77010d565efSmrg   gcc_assert (orig_cond);
77110d565efSmrg   loop_cond_gsi = gsi_for_stmt (orig_cond);
77210d565efSmrg
  /* Choose the IV start value INIT, the comparison code CODE and the
     bound LIMIT used in the new exit test.  */
773c7a68eb7Smrg   tree init, limit;
774c7a68eb7Smrg   if (!niters_maybe_zero && integer_onep (step))
775c7a68eb7Smrg     {
776c7a68eb7Smrg       /* In this case we can use a simple 0-based IV:
777c7a68eb7Smrg
778c7a68eb7Smrg 	 A:
779c7a68eb7Smrg 	   x = 0;
780c7a68eb7Smrg 	   do
781c7a68eb7Smrg 	     {
782c7a68eb7Smrg 	       ...
783c7a68eb7Smrg 	       x += 1;
784c7a68eb7Smrg 	     }
785c7a68eb7Smrg 	   while (x < NITERS);  */
786c7a68eb7Smrg       code = (exit_edge->flags & EDGE_TRUE_VALUE) ? GE_EXPR : LT_EXPR;
787c7a68eb7Smrg       init = build_zero_cst (niters_type);
788c7a68eb7Smrg       limit = niters;
789c7a68eb7Smrg     }
790c7a68eb7Smrg   else
791c7a68eb7Smrg     {
792c7a68eb7Smrg       /* The following works for all values of NITERS except 0:
793c7a68eb7Smrg
794c7a68eb7Smrg 	 B:
795c7a68eb7Smrg 	   x = 0;
796c7a68eb7Smrg 	   do
797c7a68eb7Smrg 	     {
798c7a68eb7Smrg 	       ...
799c7a68eb7Smrg 	       x += STEP;
800c7a68eb7Smrg 	     }
801c7a68eb7Smrg 	   while (x <= NITERS - STEP);
802c7a68eb7Smrg
803c7a68eb7Smrg 	 so that the loop continues to iterate if x + STEP - 1 < NITERS
804c7a68eb7Smrg 	 but stops if x + STEP - 1 >= NITERS.
805c7a68eb7Smrg
806c7a68eb7Smrg 	 However, if NITERS is zero, x never hits a value above NITERS - STEP
807c7a68eb7Smrg 	 before wrapping around.  There are two obvious ways of dealing with
808c7a68eb7Smrg 	 this:
809c7a68eb7Smrg
810c7a68eb7Smrg 	 - start at STEP - 1 and compare x before incrementing it
811c7a68eb7Smrg 	 - start at -1 and compare x after incrementing it
812c7a68eb7Smrg
813c7a68eb7Smrg 	 The latter is simpler and is what we use.  The loop in this case
814c7a68eb7Smrg 	 looks like:
815c7a68eb7Smrg
816c7a68eb7Smrg 	 C:
817c7a68eb7Smrg 	   x = -1;
818c7a68eb7Smrg 	   do
819c7a68eb7Smrg 	     {
820c7a68eb7Smrg 	       ...
821c7a68eb7Smrg 	       x += STEP;
822c7a68eb7Smrg 	     }
823c7a68eb7Smrg 	   while (x < NITERS - STEP);
824c7a68eb7Smrg
825c7a68eb7Smrg 	 In both cases the loop limit is NITERS - STEP.  */
826c7a68eb7Smrg       gimple_seq seq = NULL;
827c7a68eb7Smrg       limit = force_gimple_operand (niters, &seq, true, NULL_TREE);
828c7a68eb7Smrg       limit = gimple_build (&seq, MINUS_EXPR, TREE_TYPE (limit), limit, step);
829c7a68eb7Smrg       if (seq)
830c7a68eb7Smrg 	{
831c7a68eb7Smrg 	  basic_block new_bb = gsi_insert_seq_on_edge_immediate (pe, seq);
832c7a68eb7Smrg 	  gcc_assert (!new_bb);
833c7a68eb7Smrg 	}
834c7a68eb7Smrg       if (niters_maybe_zero)
835c7a68eb7Smrg 	{
836c7a68eb7Smrg 	  /* Case C.  */
837c7a68eb7Smrg 	  code = (exit_edge->flags & EDGE_TRUE_VALUE) ? GE_EXPR : LT_EXPR;
838c7a68eb7Smrg 	  init = build_all_ones_cst (niters_type);
839c7a68eb7Smrg 	}
840c7a68eb7Smrg       else
841c7a68eb7Smrg 	{
842c7a68eb7Smrg 	  /* Case B.  */
843c7a68eb7Smrg 	  code = (exit_edge->flags & EDGE_TRUE_VALUE) ? GT_EXPR : LE_EXPR;
844c7a68eb7Smrg 	  init = build_zero_cst (niters_type);
845c7a68eb7Smrg 	}
846c7a68eb7Smrg     }
847c7a68eb7Smrg
  /* Create the IV in the standard position and insert the new exit
     test immediately before the old one.  */
84810d565efSmrg   standard_iv_increment_position (loop, &incr_gsi, &insert_after);
84910d565efSmrg   create_iv (init, step, NULL_TREE, loop,
85010d565efSmrg              &incr_gsi, insert_after, &indx_before_incr, &indx_after_incr);
85110d565efSmrg   indx_after_incr = force_gimple_operand_gsi (&loop_cond_gsi, indx_after_incr,
85210d565efSmrg 					      true, NULL_TREE, true,
85310d565efSmrg 					      GSI_SAME_STMT);
854c7a68eb7Smrg   limit = force_gimple_operand_gsi (&loop_cond_gsi, limit, true, NULL_TREE,
85510d565efSmrg 				     true, GSI_SAME_STMT);
85610d565efSmrg
857c7a68eb7Smrg   cond_stmt = gimple_build_cond (code, indx_after_incr, limit, NULL_TREE,
85810d565efSmrg 				 NULL_TREE);
85910d565efSmrg
86010d565efSmrg   gsi_insert_before (&loop_cond_gsi, cond_stmt, GSI_SAME_STMT);
86110d565efSmrg
862c7a68eb7Smrg   /* Record the number of latch iterations.  */
863c7a68eb7Smrg   if (limit == niters)
864c7a68eb7Smrg     /* Case A: the loop iterates NITERS times.  Subtract one to get the
865c7a68eb7Smrg        latch count.  */
866c7a68eb7Smrg     loop->nb_iterations = fold_build2 (MINUS_EXPR, niters_type, niters,
867c7a68eb7Smrg 				       build_int_cst (niters_type, 1));
868c7a68eb7Smrg   else
869c7a68eb7Smrg     /* Case B or C: the loop iterates (NITERS - STEP) / STEP + 1 times.
870c7a68eb7Smrg        Subtract one from this to get the latch count.  */
871c7a68eb7Smrg     loop->nb_iterations = fold_build2 (TRUNC_DIV_EXPR, niters_type,
872c7a68eb7Smrg 				       limit, step);
873c7a68eb7Smrg
  /* If requested, set FINAL_IV to the total distance the IV advanced,
     computed on the exit edge.  */
874c7a68eb7Smrg   if (final_iv)
875c7a68eb7Smrg     {
876c7a68eb7Smrg       gassign *assign = gimple_build_assign (final_iv, MINUS_EXPR,
877c7a68eb7Smrg 					     indx_after_incr, init);
878c7a68eb7Smrg       gsi_insert_on_edge_immediate (single_exit (loop), assign);
879c7a68eb7Smrg     }
880c7a68eb7Smrg
881c7a68eb7Smrg   return cond_stmt;
882c7a68eb7Smrg }
883c7a68eb7Smrg
884c7a68eb7Smrg /* If we're using fully-masked loops, make LOOP iterate:
885c7a68eb7Smrg
886c7a68eb7Smrg N == (NITERS - 1) / STEP + 1
887c7a68eb7Smrg
888c7a68eb7Smrg times. When NITERS is zero, this is equivalent to making the loop
889c7a68eb7Smrg execute (1 << M) / STEP times, where M is the precision of NITERS.
890c7a68eb7Smrg NITERS_MAYBE_ZERO is true if this last case might occur.
891c7a68eb7Smrg
892c7a68eb7Smrg If we're not using fully-masked loops, make LOOP iterate:
893c7a68eb7Smrg
894c7a68eb7Smrg N == (NITERS - STEP) / STEP + 1
895c7a68eb7Smrg
896c7a68eb7Smrg times, where NITERS is known to be outside the range [1, STEP - 1].
897c7a68eb7Smrg This is equivalent to making the loop execute NITERS / STEP times
898c7a68eb7Smrg when NITERS is nonzero and (1 << M) / STEP times otherwise.
899c7a68eb7Smrg NITERS_MAYBE_ZERO again indicates whether this last case might occur.
900c7a68eb7Smrg
901c7a68eb7Smrg If FINAL_IV is nonnull, it is an SSA name that should be set to
902c7a68eb7Smrg N * STEP on exit from the loop.
903c7a68eb7Smrg
904c7a68eb7Smrg Assumption: the exit-condition of LOOP is the last stmt in the loop. */
905c7a68eb7Smrg
906c7a68eb7Smrg void
vect_set_loop_condition(class loop * loop,loop_vec_info loop_vinfo,tree niters,tree step,tree final_iv,bool niters_maybe_zero)907*ec02198aSmrg vect_set_loop_condition (class loop *loop, loop_vec_info loop_vinfo,
908c7a68eb7Smrg tree niters, tree step, tree final_iv,
909c7a68eb7Smrg bool niters_maybe_zero)
910c7a68eb7Smrg {
911c7a68eb7Smrg gcond *cond_stmt;
912c7a68eb7Smrg gcond *orig_cond = get_loop_exit_condition (loop);
913c7a68eb7Smrg gimple_stmt_iterator loop_cond_gsi = gsi_for_stmt (orig_cond);
914c7a68eb7Smrg
915c7a68eb7Smrg if (loop_vinfo && LOOP_VINFO_FULLY_MASKED_P (loop_vinfo))
916c7a68eb7Smrg cond_stmt = vect_set_loop_condition_masked (loop, loop_vinfo, niters,
917c7a68eb7Smrg final_iv, niters_maybe_zero,
918c7a68eb7Smrg loop_cond_gsi);
919c7a68eb7Smrg else
920c7a68eb7Smrg cond_stmt = vect_set_loop_condition_unmasked (loop, niters, step,
921c7a68eb7Smrg final_iv, niters_maybe_zero,
922c7a68eb7Smrg loop_cond_gsi);
923c7a68eb7Smrg
924c7a68eb7Smrg /* Remove old loop exit test. */
9250fc04c29Smrg stmt_vec_info orig_cond_info;
9260fc04c29Smrg if (loop_vinfo
9270fc04c29Smrg && (orig_cond_info = loop_vinfo->lookup_stmt (orig_cond)))
9280fc04c29Smrg loop_vinfo->remove_stmt (orig_cond_info);
9290fc04c29Smrg else
93010d565efSmrg gsi_remove (&loop_cond_gsi, true);
93110d565efSmrg
93210d565efSmrg if (dump_enabled_p ())
9330fc04c29Smrg dump_printf_loc (MSG_NOTE, vect_location, "New loop exit condition: %G",
9340fc04c29Smrg cond_stmt);
93510d565efSmrg }
93610d565efSmrg
93710d565efSmrg /* Helper routine of slpeel_tree_duplicate_loop_to_edge_cfg.
93810d565efSmrg For all PHI arguments in FROM->dest and TO->dest from those
93910d565efSmrg edges ensure that TO->dest PHI arguments have current_def
94010d565efSmrg to that in from. */
94110d565efSmrg
94210d565efSmrg static void
slpeel_duplicate_current_defs_from_edges(edge from,edge to)94310d565efSmrg slpeel_duplicate_current_defs_from_edges (edge from, edge to)
94410d565efSmrg {
94510d565efSmrg gimple_stmt_iterator gsi_from, gsi_to;
94610d565efSmrg
94710d565efSmrg for (gsi_from = gsi_start_phis (from->dest),
94810d565efSmrg gsi_to = gsi_start_phis (to->dest);
94910d565efSmrg !gsi_end_p (gsi_from) && !gsi_end_p (gsi_to);)
95010d565efSmrg {
95110d565efSmrg gimple *from_phi = gsi_stmt (gsi_from);
95210d565efSmrg gimple *to_phi = gsi_stmt (gsi_to);
95310d565efSmrg tree from_arg = PHI_ARG_DEF_FROM_EDGE (from_phi, from);
95410d565efSmrg tree to_arg = PHI_ARG_DEF_FROM_EDGE (to_phi, to);
95510d565efSmrg if (virtual_operand_p (from_arg))
95610d565efSmrg {
95710d565efSmrg gsi_next (&gsi_from);
95810d565efSmrg continue;
95910d565efSmrg }
96010d565efSmrg if (virtual_operand_p (to_arg))
96110d565efSmrg {
96210d565efSmrg gsi_next (&gsi_to);
96310d565efSmrg continue;
96410d565efSmrg }
96510d565efSmrg if (TREE_CODE (from_arg) != SSA_NAME)
96610d565efSmrg gcc_assert (operand_equal_p (from_arg, to_arg, 0));
9670fc04c29Smrg else if (TREE_CODE (to_arg) == SSA_NAME
9680fc04c29Smrg && from_arg != to_arg)
96910d565efSmrg {
97010d565efSmrg if (get_current_def (to_arg) == NULL_TREE)
9710fc04c29Smrg {
9720fc04c29Smrg gcc_assert (types_compatible_p (TREE_TYPE (to_arg),
9730fc04c29Smrg TREE_TYPE (get_current_def
9740fc04c29Smrg (from_arg))));
97510d565efSmrg set_current_def (to_arg, get_current_def (from_arg));
97610d565efSmrg }
9770fc04c29Smrg }
97810d565efSmrg gsi_next (&gsi_from);
97910d565efSmrg gsi_next (&gsi_to);
98010d565efSmrg }
98110d565efSmrg
98210d565efSmrg gphi *from_phi = get_virtual_phi (from->dest);
98310d565efSmrg gphi *to_phi = get_virtual_phi (to->dest);
98410d565efSmrg if (from_phi)
98510d565efSmrg set_current_def (PHI_ARG_DEF_FROM_EDGE (to_phi, to),
98610d565efSmrg get_current_def (PHI_ARG_DEF_FROM_EDGE (from_phi, from)));
98710d565efSmrg }
98810d565efSmrg
98910d565efSmrg
99010d565efSmrg /* Given LOOP this function generates a new copy of it and puts it
99110d565efSmrg on E which is either the entry or exit of LOOP. If SCALAR_LOOP is
99210d565efSmrg non-NULL, assume LOOP and SCALAR_LOOP are equivalent and copy the
99310d565efSmrg basic blocks from SCALAR_LOOP instead of LOOP, but to either the
99410d565efSmrg entry or exit of LOOP. */
99510d565efSmrg
996*ec02198aSmrg class loop *
slpeel_tree_duplicate_loop_to_edge_cfg(class loop * loop,class loop * scalar_loop,edge e)997*ec02198aSmrg slpeel_tree_duplicate_loop_to_edge_cfg (class loop *loop,
998*ec02198aSmrg 					class loop *scalar_loop, edge e)
99910d565efSmrg {
1000*ec02198aSmrg   class loop *new_loop;
100110d565efSmrg   basic_block *new_bbs, *bbs, *pbbs;
100210d565efSmrg   bool at_exit;
100310d565efSmrg   bool was_imm_dom;
100410d565efSmrg   basic_block exit_dest;
100510d565efSmrg   edge exit, new_exit;
100610d565efSmrg   bool duplicate_outer_loop = false;
100710d565efSmrg
  /* E must be either the single exit edge or the preheader edge of
     LOOP; otherwise duplication is not supported.  */
100810d565efSmrg   exit = single_exit (loop);
100910d565efSmrg   at_exit = (e == exit);
101010d565efSmrg   if (!at_exit && e != loop_preheader_edge (loop))
101110d565efSmrg     return NULL;
101210d565efSmrg
101310d565efSmrg   if (scalar_loop == NULL)
101410d565efSmrg     scalar_loop = loop;
101510d565efSmrg
  /* Copy SCALAR_LOOP's body into BBS[1..]; BBS[0] is reserved for the
     preheader block that is copied alongside the loop below.  */
101610d565efSmrg   bbs = XNEWVEC (basic_block, scalar_loop->num_nodes + 1);
101710d565efSmrg   pbbs = bbs + 1;
101810d565efSmrg   get_loop_body_with_size (scalar_loop, pbbs, scalar_loop->num_nodes);
101910d565efSmrg   /* Allow duplication of outer loops.  */
102010d565efSmrg   if (scalar_loop->inner)
102110d565efSmrg     duplicate_outer_loop = true;
102210d565efSmrg   /* Check whether duplication is possible.  */
102310d565efSmrg   if (!can_copy_bbs_p (pbbs, scalar_loop->num_nodes))
102410d565efSmrg     {
102510d565efSmrg       free (bbs);
102610d565efSmrg       return NULL;
102710d565efSmrg     }
102810d565efSmrg
102910d565efSmrg   /* Generate new loop structure.  */
103010d565efSmrg   new_loop = duplicate_loop (scalar_loop, loop_outer (scalar_loop));
103110d565efSmrg   duplicate_subloops (scalar_loop, new_loop);
103210d565efSmrg
103310d565efSmrg   exit_dest = exit->dest;
103410d565efSmrg   was_imm_dom = (get_immediate_dominator (CDI_DOMINATORS,
103510d565efSmrg 					  exit_dest) == loop->header ?
103610d565efSmrg 		 true : false);
103710d565efSmrg
103810d565efSmrg   /* Also copy the pre-header, this avoids jumping through hoops to
103910d565efSmrg      duplicate the loop entry PHI arguments.  Create an empty
104010d565efSmrg      pre-header unconditionally for this.  */
104110d565efSmrg   basic_block preheader = split_edge (loop_preheader_edge (scalar_loop));
104210d565efSmrg   edge entry_e = single_pred_edge (preheader);
104310d565efSmrg   bbs[0] = preheader;
104410d565efSmrg   new_bbs = XNEWVEC (basic_block, scalar_loop->num_nodes + 1);
104510d565efSmrg
104610d565efSmrg   exit = single_exit (scalar_loop);
104710d565efSmrg   copy_bbs (bbs, scalar_loop->num_nodes + 1, new_bbs,
104810d565efSmrg 	    &exit, 1, &new_exit, NULL,
104910d565efSmrg 	    at_exit ? loop->latch : e->src, true);
105010d565efSmrg   exit = single_exit (loop);
105110d565efSmrg   basic_block new_preheader = new_bbs[0];
105210d565efSmrg
105310d565efSmrg   add_phi_args_after_copy (new_bbs, scalar_loop->num_nodes + 1, NULL);
105410d565efSmrg
1055*ec02198aSmrg   /* Skip new preheader since it's deleted if copy loop is added at entry.  */
1056*ec02198aSmrg   for (unsigned i = (at_exit ? 0 : 1); i < scalar_loop->num_nodes + 1; i++)
1057*ec02198aSmrg     rename_variables_in_bb (new_bbs[i], duplicate_outer_loop);
1058*ec02198aSmrg
105910d565efSmrg   if (scalar_loop != loop)
106010d565efSmrg     {
106110d565efSmrg       /* If we copied from SCALAR_LOOP rather than LOOP, SSA_NAMEs from
106210d565efSmrg 	 SCALAR_LOOP will have current_def set to SSA_NAMEs in the new_loop,
106310d565efSmrg 	 but LOOP will not.  slpeel_update_phi_nodes_for_guard{1,2} expects
106410d565efSmrg 	 the LOOP SSA_NAMEs (on the exit edge and edge from latch to
106510d565efSmrg 	 header) to have current_def set, so copy them over.  */
106610d565efSmrg       slpeel_duplicate_current_defs_from_edges (single_exit (scalar_loop),
106710d565efSmrg 						exit);
106810d565efSmrg       slpeel_duplicate_current_defs_from_edges (EDGE_SUCC (scalar_loop->latch,
106910d565efSmrg 							   0),
107010d565efSmrg 						EDGE_SUCC (loop->latch, 0));
107110d565efSmrg     }
107210d565efSmrg
107310d565efSmrg   if (at_exit) /* Add the loop copy at exit.  */
107410d565efSmrg     {
107510d565efSmrg       if (scalar_loop != loop)
107610d565efSmrg 	{
107710d565efSmrg 	  gphi_iterator gsi;
107810d565efSmrg 	  new_exit = redirect_edge_and_branch (new_exit, exit_dest);
107910d565efSmrg
	  /* Give the PHIs in the old exit destination an argument for
	     the new exit edge, reusing the argument they had for E.  */
108010d565efSmrg 	  for (gsi = gsi_start_phis (exit_dest); !gsi_end_p (gsi);
108110d565efSmrg 	       gsi_next (&gsi))
108210d565efSmrg 	    {
108310d565efSmrg 	      gphi *phi = gsi.phi ();
108410d565efSmrg 	      tree orig_arg = PHI_ARG_DEF_FROM_EDGE (phi, e);
108510d565efSmrg 	      location_t orig_locus
108610d565efSmrg 		= gimple_phi_arg_location_from_edge (phi, e);
108710d565efSmrg
108810d565efSmrg 	      add_phi_arg (phi, orig_arg, new_exit, orig_locus);
108910d565efSmrg 	    }
109010d565efSmrg 	}
109110d565efSmrg       redirect_edge_and_branch_force (e, new_preheader);
109210d565efSmrg       flush_pending_stmts (e);
109310d565efSmrg       set_immediate_dominator (CDI_DOMINATORS, new_preheader, e->src);
109410d565efSmrg       if (was_imm_dom || duplicate_outer_loop)
109510d565efSmrg 	set_immediate_dominator (CDI_DOMINATORS, exit_dest, new_exit->src);
109610d565efSmrg
109710d565efSmrg       /* And remove the non-necessary forwarder again.  Keep the other
109810d565efSmrg          one so we have a proper pre-header for the loop at the exit edge.  */
109910d565efSmrg       redirect_edge_pred (single_succ_edge (preheader),
110010d565efSmrg 			  single_pred (preheader));
110110d565efSmrg       delete_basic_block (preheader);
110210d565efSmrg       set_immediate_dominator (CDI_DOMINATORS, scalar_loop->header,
110310d565efSmrg 			       loop_preheader_edge (scalar_loop)->src);
110410d565efSmrg     }
110510d565efSmrg   else /* Add the copy at entry.  */
110610d565efSmrg     {
110710d565efSmrg       if (scalar_loop != loop)
110810d565efSmrg 	{
110910d565efSmrg 	  /* Remove the non-necessary forwarder of scalar_loop again.  */
111010d565efSmrg 	  redirect_edge_pred (single_succ_edge (preheader),
111110d565efSmrg 			      single_pred (preheader));
111210d565efSmrg 	  delete_basic_block (preheader);
111310d565efSmrg 	  set_immediate_dominator (CDI_DOMINATORS, scalar_loop->header,
111410d565efSmrg 				   loop_preheader_edge (scalar_loop)->src);
111510d565efSmrg 	  preheader = split_edge (loop_preheader_edge (loop));
111610d565efSmrg 	  entry_e = single_pred_edge (preheader);
111710d565efSmrg 	}
111810d565efSmrg
111910d565efSmrg       redirect_edge_and_branch_force (entry_e, new_preheader);
112010d565efSmrg       flush_pending_stmts (entry_e);
112110d565efSmrg       set_immediate_dominator (CDI_DOMINATORS, new_preheader, entry_e->src);
112210d565efSmrg
112310d565efSmrg       redirect_edge_and_branch_force (new_exit, preheader);
112410d565efSmrg       flush_pending_stmts (new_exit);
112510d565efSmrg       set_immediate_dominator (CDI_DOMINATORS, preheader, new_exit->src);
112610d565efSmrg
112710d565efSmrg       /* And remove the non-necessary forwarder again.  Keep the other
112810d565efSmrg          one so we have a proper pre-header for the loop at the exit edge.  */
112910d565efSmrg       redirect_edge_pred (single_succ_edge (new_preheader),
113010d565efSmrg 			  single_pred (new_preheader));
113110d565efSmrg       delete_basic_block (new_preheader);
113210d565efSmrg       set_immediate_dominator (CDI_DOMINATORS, new_loop->header,
113310d565efSmrg 			       loop_preheader_edge (new_loop)->src);
113410d565efSmrg     }
113510d565efSmrg
113610d565efSmrg   if (scalar_loop != loop)
113710d565efSmrg     {
113810d565efSmrg       /* Update new_loop->header PHIs, so that on the preheader
113910d565efSmrg 	 edge they are the ones from loop rather than scalar_loop.  */
114010d565efSmrg       gphi_iterator gsi_orig, gsi_new;
114110d565efSmrg       edge orig_e = loop_preheader_edge (loop);
114210d565efSmrg       edge new_e = loop_preheader_edge (new_loop);
114310d565efSmrg
114410d565efSmrg       for (gsi_orig = gsi_start_phis (loop->header),
114510d565efSmrg 	   gsi_new = gsi_start_phis (new_loop->header);
114610d565efSmrg 	   !gsi_end_p (gsi_orig) && !gsi_end_p (gsi_new);
114710d565efSmrg 	   gsi_next (&gsi_orig), gsi_next (&gsi_new))
114810d565efSmrg 	{
114910d565efSmrg 	  gphi *orig_phi = gsi_orig.phi ();
115010d565efSmrg 	  gphi *new_phi = gsi_new.phi ();
115110d565efSmrg 	  tree orig_arg = PHI_ARG_DEF_FROM_EDGE (orig_phi, orig_e);
115210d565efSmrg 	  location_t orig_locus
115310d565efSmrg 	    = gimple_phi_arg_location_from_edge (orig_phi, orig_e);
115410d565efSmrg
115510d565efSmrg 	  add_phi_arg (new_phi, orig_arg, new_e, orig_locus);
115610d565efSmrg 	}
115710d565efSmrg     }
115810d565efSmrg
115910d565efSmrg   free (new_bbs);
116010d565efSmrg   free (bbs);
116110d565efSmrg
116210d565efSmrg   checking_verify_dominators (CDI_DOMINATORS);
116310d565efSmrg
116410d565efSmrg   return new_loop;
116510d565efSmrg }
116610d565efSmrg
116710d565efSmrg
116810d565efSmrg /* Given the condition expression COND, put it as the last statement of
116910d565efSmrg GUARD_BB; set both edges' probability; set dominator of GUARD_TO to
117010d565efSmrg DOM_BB; return the skip edge. GUARD_TO is the target basic block to
117110d565efSmrg skip the loop. PROBABILITY is the skip edge's probability. Mark the
117210d565efSmrg new edge as irreducible if IRREDUCIBLE_P is true. */
117310d565efSmrg
117410d565efSmrg static edge
slpeel_add_loop_guard(basic_block guard_bb,tree cond,basic_block guard_to,basic_block dom_bb,profile_probability probability,bool irreducible_p)117510d565efSmrg slpeel_add_loop_guard (basic_block guard_bb, tree cond,
117610d565efSmrg basic_block guard_to, basic_block dom_bb,
1177c7a68eb7Smrg profile_probability probability, bool irreducible_p)
117810d565efSmrg {
117910d565efSmrg gimple_stmt_iterator gsi;
118010d565efSmrg edge new_e, enter_e;
118110d565efSmrg gcond *cond_stmt;
118210d565efSmrg gimple_seq gimplify_stmt_list = NULL;
118310d565efSmrg
118410d565efSmrg enter_e = EDGE_SUCC (guard_bb, 0);
118510d565efSmrg enter_e->flags &= ~EDGE_FALLTHRU;
118610d565efSmrg enter_e->flags |= EDGE_FALSE_VALUE;
118710d565efSmrg gsi = gsi_last_bb (guard_bb);
118810d565efSmrg
118910d565efSmrg cond = force_gimple_operand_1 (cond, &gimplify_stmt_list, is_gimple_condexpr,
119010d565efSmrg NULL_TREE);
119110d565efSmrg if (gimplify_stmt_list)
119210d565efSmrg gsi_insert_seq_after (&gsi, gimplify_stmt_list, GSI_NEW_STMT);
119310d565efSmrg
119410d565efSmrg cond_stmt = gimple_build_cond_from_tree (cond, NULL_TREE, NULL_TREE);
119510d565efSmrg gsi = gsi_last_bb (guard_bb);
119610d565efSmrg gsi_insert_after (&gsi, cond_stmt, GSI_NEW_STMT);
119710d565efSmrg
119810d565efSmrg /* Add new edge to connect guard block to the merge/loop-exit block. */
119910d565efSmrg new_e = make_edge (guard_bb, guard_to, EDGE_TRUE_VALUE);
120010d565efSmrg
120110d565efSmrg new_e->probability = probability;
120210d565efSmrg if (irreducible_p)
120310d565efSmrg new_e->flags |= EDGE_IRREDUCIBLE_LOOP;
120410d565efSmrg
1205c7a68eb7Smrg enter_e->probability = probability.invert ();
120610d565efSmrg set_immediate_dominator (CDI_DOMINATORS, guard_to, dom_bb);
120710d565efSmrg
120810d565efSmrg /* Split enter_e to preserve LOOPS_HAVE_PREHEADERS. */
120910d565efSmrg if (enter_e->dest->loop_father->header == enter_e->dest)
121010d565efSmrg split_edge (enter_e);
121110d565efSmrg
121210d565efSmrg return new_e;
121310d565efSmrg }
121410d565efSmrg
121510d565efSmrg
121610d565efSmrg /* This function verifies that the following restrictions apply to LOOP:
121710d565efSmrg (1) it consists of exactly 2 basic blocks - header, and an empty latch
121810d565efSmrg for innermost loop and 5 basic blocks for outer-loop.
121910d565efSmrg (2) it is single entry, single exit
122010d565efSmrg (3) its exit condition is the last stmt in the header
122110d565efSmrg (4) E is the entry/exit edge of LOOP.
122210d565efSmrg */
122310d565efSmrg
122410d565efSmrg bool
slpeel_can_duplicate_loop_p(const class loop * loop,const_edge e)1225*ec02198aSmrg slpeel_can_duplicate_loop_p (const class loop *loop, const_edge e)
122610d565efSmrg {
122710d565efSmrg edge exit_e = single_exit (loop);
122810d565efSmrg edge entry_e = loop_preheader_edge (loop);
122910d565efSmrg gcond *orig_cond = get_loop_exit_condition (loop);
123010d565efSmrg gimple_stmt_iterator loop_exit_gsi = gsi_last_bb (exit_e->src);
123110d565efSmrg unsigned int num_bb = loop->inner? 5 : 2;
123210d565efSmrg
123310d565efSmrg /* All loops have an outer scope; the only case loop->outer is NULL is for
123410d565efSmrg the function itself. */
123510d565efSmrg if (!loop_outer (loop)
123610d565efSmrg || loop->num_nodes != num_bb
123710d565efSmrg || !empty_block_p (loop->latch)
123810d565efSmrg || !single_exit (loop)
123910d565efSmrg /* Verify that new loop exit condition can be trivially modified. */
124010d565efSmrg || (!orig_cond || orig_cond != gsi_stmt (loop_exit_gsi))
124110d565efSmrg || (e != exit_e && e != entry_e))
124210d565efSmrg return false;
124310d565efSmrg
124410d565efSmrg return true;
124510d565efSmrg }
124610d565efSmrg
124710d565efSmrg /* If the loop has a virtual PHI, but exit bb doesn't, create a virtual PHI
124810d565efSmrg in the exit bb and rename all the uses after the loop. This simplifies
124910d565efSmrg the *guard[12] routines, which assume loop closed SSA form for all PHIs
125010d565efSmrg (but normally loop closed SSA form doesn't require virtual PHIs to be
125110d565efSmrg in the same form). Doing this early simplifies the checking what
1252*ec02198aSmrg uses should be renamed.
125310d565efSmrg
1254*ec02198aSmrg If we create a new phi after the loop, return the definition that
1255*ec02198aSmrg applies on entry to the loop, otherwise return null. */
1256*ec02198aSmrg
1257*ec02198aSmrg static tree
create_lcssa_for_virtual_phi(class loop * loop)1258*ec02198aSmrg create_lcssa_for_virtual_phi (class loop *loop)
125910d565efSmrg {
126010d565efSmrg   gphi_iterator gsi;
126110d565efSmrg   edge exit_e = single_exit (loop);
126210d565efSmrg
  /* Find the virtual PHI (if any) in the loop header.  */
126310d565efSmrg   for (gsi = gsi_start_phis (loop->header); !gsi_end_p (gsi); gsi_next (&gsi))
126410d565efSmrg     if (virtual_operand_p (gimple_phi_result (gsi_stmt (gsi))))
126510d565efSmrg       {
126610d565efSmrg 	gphi *phi = gsi.phi ();
	/* Look for an existing virtual PHI in the exit block.  Note
	   that GSI is deliberately reused here; after this loop it
	   iterates over the exit block's PHIs, not the header's.  */
126710d565efSmrg 	for (gsi = gsi_start_phis (exit_e->dest);
126810d565efSmrg 	     !gsi_end_p (gsi); gsi_next (&gsi))
126910d565efSmrg 	  if (virtual_operand_p (gimple_phi_result (gsi_stmt (gsi))))
127010d565efSmrg 	    break;
127110d565efSmrg 	if (gsi_end_p (gsi))
127210d565efSmrg 	  {
127310d565efSmrg 	    tree new_vop = copy_ssa_name (PHI_RESULT (phi));
127410d565efSmrg 	    gphi *new_phi = create_phi_node (new_vop, exit_e->dest);
127510d565efSmrg 	    tree vop = PHI_ARG_DEF_FROM_EDGE (phi, EDGE_SUCC (loop->latch, 0));
127610d565efSmrg 	    imm_use_iterator imm_iter;
127710d565efSmrg 	    gimple *stmt;
127810d565efSmrg 	    use_operand_p use_p;
127910d565efSmrg
128010d565efSmrg 	    SSA_NAME_OCCURS_IN_ABNORMAL_PHI (new_vop)
128110d565efSmrg 	      = SSA_NAME_OCCURS_IN_ABNORMAL_PHI (vop);
128210d565efSmrg 	    add_phi_arg (new_phi, vop, exit_e, UNKNOWN_LOCATION);
128310d565efSmrg 	    gimple_phi_set_result (new_phi, new_vop);
	    /* Redirect all uses of VOP outside the loop to the new
	       loop-closed PHI result.  */
128410d565efSmrg 	    FOR_EACH_IMM_USE_STMT (stmt, imm_iter, vop)
128510d565efSmrg 	      if (stmt != new_phi
128610d565efSmrg 		  && !flow_bb_inside_loop_p (loop, gimple_bb (stmt)))
128710d565efSmrg 		FOR_EACH_IMM_USE_ON_STMT (use_p, imm_iter)
128810d565efSmrg 		  SET_USE (use_p, new_vop);
1289*ec02198aSmrg
1290*ec02198aSmrg 	    return PHI_ARG_DEF_FROM_EDGE (phi, loop_preheader_edge (loop));
129110d565efSmrg 	  }
129210d565efSmrg 	break;
129310d565efSmrg       }
1294*ec02198aSmrg   return NULL_TREE;
129510d565efSmrg }
129610d565efSmrg
129710d565efSmrg /* Function vect_get_loop_location.
129810d565efSmrg
129910d565efSmrg Extract the location of the loop in the source code.
130010d565efSmrg If the loop is not well formed for vectorization, an estimated
130110d565efSmrg location is calculated.
130210d565efSmrg Return the loop location if succeed and NULL if not. */
130310d565efSmrg
13040fc04c29Smrg dump_user_location_t
find_loop_location(class loop * loop)1305*ec02198aSmrg find_loop_location (class loop *loop)
130610d565efSmrg {
130710d565efSmrg gimple *stmt = NULL;
130810d565efSmrg basic_block bb;
130910d565efSmrg gimple_stmt_iterator si;
131010d565efSmrg
131110d565efSmrg if (!loop)
13120fc04c29Smrg return dump_user_location_t ();
131310d565efSmrg
131410d565efSmrg stmt = get_loop_exit_condition (loop);
131510d565efSmrg
131610d565efSmrg if (stmt
131710d565efSmrg && LOCATION_LOCUS (gimple_location (stmt)) > BUILTINS_LOCATION)
13180fc04c29Smrg return stmt;
131910d565efSmrg
132010d565efSmrg /* If we got here the loop is probably not "well formed",
132110d565efSmrg try to estimate the loop location */
132210d565efSmrg
132310d565efSmrg if (!loop->header)
13240fc04c29Smrg return dump_user_location_t ();
132510d565efSmrg
132610d565efSmrg bb = loop->header;
132710d565efSmrg
132810d565efSmrg for (si = gsi_start_bb (bb); !gsi_end_p (si); gsi_next (&si))
132910d565efSmrg {
133010d565efSmrg stmt = gsi_stmt (si);
133110d565efSmrg if (LOCATION_LOCUS (gimple_location (stmt)) > BUILTINS_LOCATION)
13320fc04c29Smrg return stmt;
133310d565efSmrg }
133410d565efSmrg
13350fc04c29Smrg return dump_user_location_t ();
133610d565efSmrg }
133710d565efSmrg
13380fc04c29Smrg /* Return true if the phi described by STMT_INFO defines an IV of the
13390fc04c29Smrg loop to be vectorized. */
134010d565efSmrg
134110d565efSmrg static bool
iv_phi_p(stmt_vec_info stmt_info)13420fc04c29Smrg iv_phi_p (stmt_vec_info stmt_info)
134310d565efSmrg {
13440fc04c29Smrg gphi *phi = as_a <gphi *> (stmt_info->stmt);
134510d565efSmrg if (virtual_operand_p (PHI_RESULT (phi)))
134610d565efSmrg return false;
134710d565efSmrg
134810d565efSmrg if (STMT_VINFO_DEF_TYPE (stmt_info) == vect_reduction_def
134910d565efSmrg || STMT_VINFO_DEF_TYPE (stmt_info) == vect_double_reduction_def)
135010d565efSmrg return false;
135110d565efSmrg
135210d565efSmrg return true;
135310d565efSmrg }
135410d565efSmrg
135510d565efSmrg /* Function vect_can_advance_ivs_p
135610d565efSmrg
135710d565efSmrg In case the number of iterations that LOOP iterates is unknown at compile
135810d565efSmrg time, an epilog loop will be generated, and the loop induction variables
135910d565efSmrg (IVs) will be "advanced" to the value they are supposed to take just before
136010d565efSmrg the epilog loop. Here we check that the access function of the loop IVs
136110d565efSmrg and the expression that represents the loop bound are simple enough.
136210d565efSmrg These restrictions will be relaxed in the future. */
136310d565efSmrg
136410d565efSmrg bool
vect_can_advance_ivs_p(loop_vec_info loop_vinfo)136510d565efSmrg vect_can_advance_ivs_p (loop_vec_info loop_vinfo)
136610d565efSmrg {
1367*ec02198aSmrg   class loop *loop = LOOP_VINFO_LOOP (loop_vinfo);
136810d565efSmrg basic_block bb = loop->header;
136910d565efSmrg gphi_iterator gsi;
137010d565efSmrg
137110d565efSmrg /* Analyze phi functions of the loop header. */
137210d565efSmrg
137310d565efSmrg if (dump_enabled_p ())
137410d565efSmrg dump_printf_loc (MSG_NOTE, vect_location, "vect_can_advance_ivs_p:\n");
/* Walk every phi in the loop header; any single unsupported IV makes
   the whole loop ineligible (return false below). */
137510d565efSmrg for (gsi = gsi_start_phis (bb); !gsi_end_p (gsi); gsi_next (&gsi))
137610d565efSmrg {
137710d565efSmrg tree evolution_part;
137810d565efSmrg
137910d565efSmrg gphi *phi = gsi.phi ();
13800fc04c29Smrg stmt_vec_info phi_info = loop_vinfo->lookup_stmt (phi);
138110d565efSmrg if (dump_enabled_p ())
13820fc04c29Smrg dump_printf_loc (MSG_NOTE, vect_location, "Analyze phi: %G",
13830fc04c29Smrg phi_info->stmt);
138410d565efSmrg
138510d565efSmrg /* Skip virtual phi's. The data dependences that are associated with
138610d565efSmrg virtual defs/uses (i.e., memory accesses) are analyzed elsewhere.
138710d565efSmrg
138810d565efSmrg Skip reduction phis. */
13890fc04c29Smrg if (!iv_phi_p (phi_info))
139010d565efSmrg {
139110d565efSmrg if (dump_enabled_p ())
139210d565efSmrg dump_printf_loc (MSG_NOTE, vect_location,
139310d565efSmrg "reduc or virtual phi. skip.\n");
139410d565efSmrg continue;
139510d565efSmrg }
139610d565efSmrg
139710d565efSmrg /* Analyze the evolution function. */
139810d565efSmrg
/* The per-iteration step recorded during analysis; NULL means the
   access function of this IV was not recognized. */
13990fc04c29Smrg evolution_part = STMT_VINFO_LOOP_PHI_EVOLUTION_PART (phi_info);
140010d565efSmrg if (evolution_part == NULL_TREE)
140110d565efSmrg {
140210d565efSmrg if (dump_enabled_p ())
140310d565efSmrg dump_printf (MSG_MISSED_OPTIMIZATION,
140410d565efSmrg "No access function or evolution.\n");
140510d565efSmrg return false;
140610d565efSmrg }
140710d565efSmrg
140810d565efSmrg /* FORNOW: We do not transform initial conditions of IVs
140910d565efSmrg which evolution functions are not invariants in the loop. */
141010d565efSmrg
141110d565efSmrg if (!expr_invariant_in_loop_p (loop, evolution_part))
141210d565efSmrg {
141310d565efSmrg if (dump_enabled_p ())
141410d565efSmrg dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
141510d565efSmrg "evolution not invariant in loop.\n");
141610d565efSmrg return false;
141710d565efSmrg }
141810d565efSmrg
141910d565efSmrg /* FORNOW: We do not transform initial conditions of IVs
142010d565efSmrg which evolution functions are a polynomial of degree >= 2. */
142110d565efSmrg
/* A step that is itself a chrec means a higher-degree polynomial IV. */
142210d565efSmrg if (tree_is_chrec (evolution_part))
142310d565efSmrg {
142410d565efSmrg if (dump_enabled_p ())
142510d565efSmrg dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
142610d565efSmrg "evolution is chrec.\n");
142710d565efSmrg return false;
142810d565efSmrg }
142910d565efSmrg }
143010d565efSmrg
143110d565efSmrg return true;
143210d565efSmrg }
143310d565efSmrg
143410d565efSmrg
143510d565efSmrg /* Function vect_update_ivs_after_vectorizer.
143610d565efSmrg
143710d565efSmrg "Advance" the induction variables of LOOP to the value they should take
143810d565efSmrg after the execution of LOOP. This is currently necessary because the
143910d565efSmrg vectorizer does not handle induction variables that are used after the
144010d565efSmrg loop. Such a situation occurs when the last iterations of LOOP are
144110d565efSmrg peeled, because:
144210d565efSmrg 1. We introduced new uses after LOOP for IVs that were not originally used
144310d565efSmrg after LOOP: the IVs of LOOP are now used by an epilog loop.
144410d565efSmrg 2. LOOP is going to be vectorized; this means that it will iterate N/VF
144510d565efSmrg times, whereas the loop IVs should be bumped N times.
144610d565efSmrg
144710d565efSmrg Input:
144810d565efSmrg - LOOP - a loop that is going to be vectorized. The last few iterations
144910d565efSmrg of LOOP were peeled.
145010d565efSmrg - NITERS - the number of iterations that LOOP executes (before it is
145110d565efSmrg vectorized). i.e, the number of times the ivs should be bumped.
145210d565efSmrg - UPDATE_E - a successor edge of LOOP->exit that is on the (only) path
145310d565efSmrg coming out from LOOP on which there are uses of the LOOP ivs
145410d565efSmrg (this is the path from LOOP->exit to epilog_loop->preheader).
145510d565efSmrg
145610d565efSmrg The new definitions of the ivs are placed in LOOP->exit.
145710d565efSmrg The phi args associated with the edge UPDATE_E in the bb
145810d565efSmrg UPDATE_E->dest are updated accordingly.
145910d565efSmrg
146010d565efSmrg Assumption 1: Like the rest of the vectorizer, this function assumes
146110d565efSmrg a single loop exit that has a single predecessor.
146210d565efSmrg
146310d565efSmrg Assumption 2: The phi nodes in the LOOP header and in update_bb are
146410d565efSmrg organized in the same order.
146510d565efSmrg
146610d565efSmrg Assumption 3: The access function of the ivs is simple enough (see
146710d565efSmrg vect_can_advance_ivs_p). This assumption will be relaxed in the future.
146810d565efSmrg
146910d565efSmrg Assumption 4: Exactly one of the successors of LOOP exit-bb is on a path
147010d565efSmrg coming out of LOOP on which the ivs of LOOP are used (this is the path
147110d565efSmrg that leads to the epilog loop; other paths skip the epilog loop). This
147210d565efSmrg path starts with the edge UPDATE_E, and its destination (denoted update_bb)
147310d565efSmrg needs to have its phis updated.
147410d565efSmrg */
147510d565efSmrg
147610d565efSmrg static void
vect_update_ivs_after_vectorizer(loop_vec_info loop_vinfo,tree niters,edge update_e)147710d565efSmrg vect_update_ivs_after_vectorizer (loop_vec_info loop_vinfo,
147810d565efSmrg tree niters, edge update_e)
147910d565efSmrg {
148010d565efSmrg gphi_iterator gsi, gsi1;
1481*ec02198aSmrg   class loop *loop = LOOP_VINFO_LOOP (loop_vinfo);
148210d565efSmrg basic_block update_bb = update_e->dest;
148310d565efSmrg basic_block exit_bb = single_exit (loop)->dest;
148410d565efSmrg
148510d565efSmrg /* Make sure there exists a single-predecessor exit bb: */
148610d565efSmrg gcc_assert (single_pred_p (exit_bb));
148710d565efSmrg gcc_assert (single_succ_edge (exit_bb) == update_e);
148810d565efSmrg
/* Walk the loop-header phis and the update_bb phis in lockstep; the
   function's contract (Assumption 2 in the header comment) is that the
   two phi sequences are organized in the same order. */
148910d565efSmrg for (gsi = gsi_start_phis (loop->header), gsi1 = gsi_start_phis (update_bb);
149010d565efSmrg !gsi_end_p (gsi) && !gsi_end_p (gsi1);
149110d565efSmrg gsi_next (&gsi), gsi_next (&gsi1))
149210d565efSmrg {
149310d565efSmrg tree init_expr;
149410d565efSmrg tree step_expr, off;
149510d565efSmrg tree type;
149610d565efSmrg tree var, ni, ni_name;
149710d565efSmrg gimple_stmt_iterator last_gsi;
149810d565efSmrg
149910d565efSmrg gphi *phi = gsi.phi ();
150010d565efSmrg gphi *phi1 = gsi1.phi ();
15010fc04c29Smrg stmt_vec_info phi_info = loop_vinfo->lookup_stmt (phi);
150210d565efSmrg if (dump_enabled_p ())
150310d565efSmrg dump_printf_loc (MSG_NOTE, vect_location,
15040fc04c29Smrg "vect_update_ivs_after_vectorizer: phi: %G", phi);
150510d565efSmrg
150610d565efSmrg /* Skip reduction and virtual phis. */
15070fc04c29Smrg if (!iv_phi_p (phi_info))
150810d565efSmrg {
150910d565efSmrg if (dump_enabled_p ())
151010d565efSmrg dump_printf_loc (MSG_NOTE, vect_location,
151110d565efSmrg "reduc or virtual phi. skip.\n");
151210d565efSmrg continue;
151310d565efSmrg }
151410d565efSmrg
151510d565efSmrg type = TREE_TYPE (gimple_phi_result (phi));
15160fc04c29Smrg step_expr = STMT_VINFO_LOOP_PHI_EVOLUTION_PART (phi_info);
151710d565efSmrg step_expr = unshare_expr (step_expr);
151810d565efSmrg
151910d565efSmrg /* FORNOW: We do not support IVs whose evolution function is a polynomial
152010d565efSmrg of degree >= 2 or exponential. */
152110d565efSmrg gcc_assert (!tree_is_chrec (step_expr));
152210d565efSmrg
152310d565efSmrg init_expr = PHI_ARG_DEF_FROM_EDGE (phi, loop_preheader_edge (loop));
152410d565efSmrg
/* Compute ni = init + niters * step, i.e. the IV value after NITERS
   scalar iterations; pointer IVs use pointer_plus arithmetic. */
152510d565efSmrg off = fold_build2 (MULT_EXPR, TREE_TYPE (step_expr),
152610d565efSmrg fold_convert (TREE_TYPE (step_expr), niters),
152710d565efSmrg step_expr);
152810d565efSmrg if (POINTER_TYPE_P (type))
152910d565efSmrg ni = fold_build_pointer_plus (init_expr, off);
153010d565efSmrg else
153110d565efSmrg ni = fold_build2 (PLUS_EXPR, type,
153210d565efSmrg init_expr, fold_convert (type, off));
153310d565efSmrg
153410d565efSmrg var = create_tmp_var (type, "tmp");
153510d565efSmrg
153610d565efSmrg last_gsi = gsi_last_bb (exit_bb);
153710d565efSmrg gimple_seq new_stmts = NULL;
153810d565efSmrg ni_name = force_gimple_operand (ni, &new_stmts, false, var);
153910d565efSmrg /* Exit_bb shouldn't be empty. */
154010d565efSmrg if (!gsi_end_p (last_gsi))
154110d565efSmrg gsi_insert_seq_after (&last_gsi, new_stmts, GSI_SAME_STMT);
154210d565efSmrg else
154310d565efSmrg gsi_insert_seq_before (&last_gsi, new_stmts, GSI_SAME_STMT);
154410d565efSmrg
154510d565efSmrg /* Fix phi expressions in the successor bb. */
154610d565efSmrg adjust_phi_and_debug_stmts (phi1, update_e, ni_name);
154710d565efSmrg }
154810d565efSmrg }
154910d565efSmrg
1550c7a68eb7Smrg /* Return a gimple value containing the misalignment (measured in vector
1551c7a68eb7Smrg elements) for the loop described by LOOP_VINFO, i.e. how many elements
1552c7a68eb7Smrg it is away from a perfectly aligned address. Add any new statements
1553c7a68eb7Smrg to SEQ. */
1554c7a68eb7Smrg
1555c7a68eb7Smrg static tree
get_misalign_in_elems(gimple ** seq,loop_vec_info loop_vinfo)1556c7a68eb7Smrg get_misalign_in_elems (gimple **seq, loop_vec_info loop_vinfo)
1557c7a68eb7Smrg {
15580fc04c29Smrg dr_vec_info *dr_info = LOOP_VINFO_UNALIGNED_DR (loop_vinfo);
15590fc04c29Smrg stmt_vec_info stmt_info = dr_info->stmt;
1560c7a68eb7Smrg tree vectype = STMT_VINFO_VECTYPE (stmt_info);
1561c7a68eb7Smrg
15620fc04c29Smrg poly_uint64 target_align = DR_TARGET_ALIGNMENT (dr_info);
15630fc04c29Smrg unsigned HOST_WIDE_INT target_align_c;
15640fc04c29Smrg tree target_align_minus_1;
1565c7a68eb7Smrg
/* For a negative-step access the relevant address is that of the last
   vector element, hence the -(subparts - 1) element offset below. */
15660fc04c29Smrg bool negative = tree_int_cst_compare (DR_STEP (dr_info->dr),
15670fc04c29Smrg size_zero_node) < 0;
1568c7a68eb7Smrg tree offset = (negative
1569c7a68eb7Smrg ? size_int (-TYPE_VECTOR_SUBPARTS (vectype) + 1)
1570c7a68eb7Smrg : size_zero_node);
15710fc04c29Smrg tree start_addr = vect_create_addr_base_for_vector_ref (stmt_info, seq,
1572c7a68eb7Smrg offset);
1573c7a68eb7Smrg tree type = unsigned_type_for (TREE_TYPE (start_addr));
/* Build target_align - 1 as a mask.  When the alignment is not a
   compile-time constant (a poly_int), compute it at runtime as
   (vla & -vla) - 1, i.e. from the lowest set bit. */
15740fc04c29Smrg if (target_align.is_constant (&target_align_c))
15750fc04c29Smrg target_align_minus_1 = build_int_cst (type, target_align_c - 1);
15760fc04c29Smrg else
15770fc04c29Smrg {
15780fc04c29Smrg tree vla = build_int_cst (type, target_align);
15790fc04c29Smrg tree vla_align = fold_build2 (BIT_AND_EXPR, type, vla,
15800fc04c29Smrg fold_build2 (MINUS_EXPR, type,
15810fc04c29Smrg build_int_cst (type, 0), vla));
15820fc04c29Smrg target_align_minus_1 = fold_build2 (MINUS_EXPR, type, vla_align,
15830fc04c29Smrg build_int_cst (type, 1));
15840fc04c29Smrg }
15850fc04c29Smrg
1586c7a68eb7Smrg HOST_WIDE_INT elem_size
1587c7a68eb7Smrg = int_cst_value (TYPE_SIZE_UNIT (TREE_TYPE (vectype)));
/* elem_size is assumed to be a power of two here (exact_log2). */
1588c7a68eb7Smrg tree elem_size_log = build_int_cst (type, exact_log2 (elem_size));
1589c7a68eb7Smrg
1590c7a68eb7Smrg /* Create: misalign_in_bytes = addr & (target_align - 1). */
1591c7a68eb7Smrg tree int_start_addr = fold_convert (type, start_addr);
1592c7a68eb7Smrg tree misalign_in_bytes = fold_build2 (BIT_AND_EXPR, type, int_start_addr,
1593c7a68eb7Smrg target_align_minus_1);
1594c7a68eb7Smrg
1595c7a68eb7Smrg /* Create: misalign_in_elems = misalign_in_bytes / element_size. */
1596c7a68eb7Smrg tree misalign_in_elems = fold_build2 (RSHIFT_EXPR, type, misalign_in_bytes,
1597c7a68eb7Smrg elem_size_log);
1598c7a68eb7Smrg
1599c7a68eb7Smrg return misalign_in_elems;
1600c7a68eb7Smrg }
1601c7a68eb7Smrg
160210d565efSmrg /* Function vect_gen_prolog_loop_niters
160310d565efSmrg
160410d565efSmrg Generate the number of iterations which should be peeled as prolog for the
160510d565efSmrg loop represented by LOOP_VINFO. It is calculated as the misalignment of
160610d565efSmrg DR - the data reference recorded in LOOP_VINFO_UNALIGNED_DR (LOOP_VINFO).
160710d565efSmrg As a result, after the execution of this loop, the data reference DR will
160810d565efSmrg refer to an aligned location. The following computation is generated:
160910d565efSmrg
161010d565efSmrg If the misalignment of DR is known at compile time:
161110d565efSmrg addr_mis = int mis = DR_MISALIGNMENT (dr);
161210d565efSmrg Else, compute address misalignment in bytes:
1613c7a68eb7Smrg addr_mis = addr & (target_align - 1)
161410d565efSmrg
161510d565efSmrg prolog_niters = ((VF - addr_mis/elem_size)&(VF-1))/step
161610d565efSmrg
161710d565efSmrg (elem_size = element type size; an element is the scalar element whose type
161810d565efSmrg is the inner type of the vectype)
161910d565efSmrg
162010d565efSmrg The computations will be emitted at the end of BB. We also compute and
162110d565efSmrg store upper bound (included) of the result in BOUND.
162210d565efSmrg
162310d565efSmrg When the step of the data-ref in the loop is not 1 (as in interleaved data
162410d565efSmrg and SLP), the number of iterations of the prolog must be divided by the step
162510d565efSmrg (which is equal to the size of interleaved group).
162610d565efSmrg
162710d565efSmrg The above formulas assume that VF == number of elements in the vector. This
162810d565efSmrg may not hold when there are multiple-types in the loop.
162910d565efSmrg In this case, for some data-references in the loop the VF does not represent
163010d565efSmrg the number of elements that fit in the vector. Therefore, instead of VF we
163110d565efSmrg use TYPE_VECTOR_SUBPARTS. */
163210d565efSmrg
163310d565efSmrg static tree
vect_gen_prolog_loop_niters(loop_vec_info loop_vinfo,basic_block bb,int * bound)163410d565efSmrg vect_gen_prolog_loop_niters (loop_vec_info loop_vinfo,
163510d565efSmrg basic_block bb, int *bound)
163610d565efSmrg {
16370fc04c29Smrg dr_vec_info *dr_info = LOOP_VINFO_UNALIGNED_DR (loop_vinfo);
163810d565efSmrg tree var;
163910d565efSmrg tree niters_type = TREE_TYPE (LOOP_VINFO_NITERS (loop_vinfo));
164010d565efSmrg gimple_seq stmts = NULL, new_stmts = NULL;
164110d565efSmrg tree iters, iters_name;
16420fc04c29Smrg stmt_vec_info stmt_info = dr_info->stmt;
164310d565efSmrg tree vectype = STMT_VINFO_VECTYPE (stmt_info);
16440fc04c29Smrg poly_uint64 target_align = DR_TARGET_ALIGNMENT (dr_info);
164510d565efSmrg
/* Case 1: the peeling amount was determined at compile time. */
164610d565efSmrg if (LOOP_VINFO_PEELING_FOR_ALIGNMENT (loop_vinfo) > 0)
164710d565efSmrg {
164810d565efSmrg int npeel = LOOP_VINFO_PEELING_FOR_ALIGNMENT (loop_vinfo);
164910d565efSmrg
165010d565efSmrg if (dump_enabled_p ())
165110d565efSmrg dump_printf_loc (MSG_NOTE, vect_location,
165210d565efSmrg "known peeling = %d.\n", npeel);
165310d565efSmrg
165410d565efSmrg iters = build_int_cst (niters_type, npeel);
165510d565efSmrg *bound = LOOP_VINFO_PEELING_FOR_ALIGNMENT (loop_vinfo);
165610d565efSmrg }
/* Case 2: compute the misalignment (and hence the peel count) at
   runtime from the data reference's start address. */
165710d565efSmrg else
165810d565efSmrg {
1659c7a68eb7Smrg tree misalign_in_elems = get_misalign_in_elems (&stmts, loop_vinfo);
1660c7a68eb7Smrg tree type = TREE_TYPE (misalign_in_elems);
1661c7a68eb7Smrg HOST_WIDE_INT elem_size
1662c7a68eb7Smrg = int_cst_value (TYPE_SIZE_UNIT (TREE_TYPE (vectype)));
16630fc04c29Smrg /* We only do prolog peeling if the target alignment is known at compile
16640fc04c29Smrg time. */
16650fc04c29Smrg poly_uint64 align_in_elems =
16660fc04c29Smrg exact_div (target_align, elem_size);
16670fc04c29Smrg tree align_in_elems_minus_1 =
16680fc04c29Smrg build_int_cst (type, align_in_elems - 1);
1669c7a68eb7Smrg tree align_in_elems_tree = build_int_cst (type, align_in_elems);
1670c7a68eb7Smrg
1671c7a68eb7Smrg /* Create: (niters_type) ((align_in_elems - misalign_in_elems)
1672c7a68eb7Smrg & (align_in_elems - 1)). */
16730fc04c29Smrg bool negative = tree_int_cst_compare (DR_STEP (dr_info->dr),
16740fc04c29Smrg size_zero_node) < 0;
/* For negative-step accesses the peel count is the misalignment itself
   (mod alignment), so the subtraction is reversed. */
167510d565efSmrg if (negative)
1676c7a68eb7Smrg iters = fold_build2 (MINUS_EXPR, type, misalign_in_elems,
1677c7a68eb7Smrg align_in_elems_tree);
167810d565efSmrg else
1679c7a68eb7Smrg iters = fold_build2 (MINUS_EXPR, type, align_in_elems_tree,
1680c7a68eb7Smrg misalign_in_elems);
1681c7a68eb7Smrg iters = fold_build2 (BIT_AND_EXPR, type, iters, align_in_elems_minus_1);
168210d565efSmrg iters = fold_convert (niters_type, iters);
/* Upper bound (inclusive) of the runtime peel count; -1 when the
   alignment in elements is not a compile-time constant. */
16830fc04c29Smrg unsigned HOST_WIDE_INT align_in_elems_c;
16840fc04c29Smrg if (align_in_elems.is_constant (&align_in_elems_c))
16850fc04c29Smrg *bound = align_in_elems_c - 1;
16860fc04c29Smrg else
16870fc04c29Smrg *bound = -1;
168810d565efSmrg }
168910d565efSmrg
169010d565efSmrg if (dump_enabled_p ())
169110d565efSmrg dump_printf_loc (MSG_NOTE, vect_location,
16920fc04c29Smrg "niters for prolog loop: %T\n", iters);
169310d565efSmrg
169410d565efSmrg var = create_tmp_var (niters_type, "prolog_loop_niters");
169510d565efSmrg iters_name = force_gimple_operand (iters, &new_stmts, false, var);
169610d565efSmrg
/* Emit all generated statements at the end of BB (or at its start if
   BB is empty). */
169710d565efSmrg if (new_stmts)
169810d565efSmrg gimple_seq_add_seq (&stmts, new_stmts);
169910d565efSmrg if (stmts)
170010d565efSmrg {
170110d565efSmrg gcc_assert (single_succ_p (bb));
170210d565efSmrg gimple_stmt_iterator gsi = gsi_last_bb (bb);
170310d565efSmrg if (gsi_end_p (gsi))
170410d565efSmrg gsi_insert_seq_before (&gsi, stmts, GSI_SAME_STMT);
170510d565efSmrg else
170610d565efSmrg gsi_insert_seq_after (&gsi, stmts, GSI_SAME_STMT);
170710d565efSmrg }
170810d565efSmrg return iters_name;
170910d565efSmrg }
171010d565efSmrg
171110d565efSmrg
171210d565efSmrg /* Function vect_update_init_of_dr
171310d565efSmrg
1714c7a68eb7Smrg If CODE is PLUS, the vector loop starts NITERS iterations after the
1715c7a68eb7Smrg scalar one, otherwise CODE is MINUS and the vector loop starts NITERS
1716c7a68eb7Smrg iterations before the scalar one (using masking to skip inactive
1717c7a68eb7Smrg elements). This function updates the information recorded in DR to
1718c7a68eb7Smrg account for the difference. Specifically, it updates the OFFSET
1719*ec02198aSmrg field of DR_INFO. */
172010d565efSmrg
172110d565efSmrg static void
vect_update_init_of_dr(dr_vec_info * dr_info,tree niters,tree_code code)1722*ec02198aSmrg vect_update_init_of_dr (dr_vec_info *dr_info, tree niters, tree_code code)
172310d565efSmrg {
1724*ec02198aSmrg   struct data_reference *dr = dr_info->dr;
/* A missing offset is treated as zero. */
1725*ec02198aSmrg   tree offset = dr_info->offset;
1726*ec02198aSmrg   if (!offset)
1727*ec02198aSmrg     offset = build_zero_cst (sizetype);
172810d565efSmrg
/* Shift the DR by NITERS iterations worth of its step:
   offset = offset CODE (niters * step), all in sizetype. */
172910d565efSmrg niters = fold_build2 (MULT_EXPR, sizetype,
173010d565efSmrg fold_convert (sizetype, niters),
173110d565efSmrg fold_convert (sizetype, DR_STEP (dr)));
1732c7a68eb7Smrg offset = fold_build2 (code, sizetype,
173310d565efSmrg fold_convert (sizetype, offset), niters);
1734*ec02198aSmrg   dr_info->offset = offset;
173510d565efSmrg }
173610d565efSmrg
173710d565efSmrg
173810d565efSmrg /* Function vect_update_inits_of_drs
173910d565efSmrg
1740c7a68eb7Smrg Apply vect_update_inits_of_dr to all accesses in LOOP_VINFO.
1741c7a68eb7Smrg CODE and NITERS are as for vect_update_inits_of_dr. */
174210d565efSmrg
1743*ec02198aSmrg void
vect_update_inits_of_drs(loop_vec_info loop_vinfo,tree niters,tree_code code)1744c7a68eb7Smrg vect_update_inits_of_drs (loop_vec_info loop_vinfo, tree niters,
1745c7a68eb7Smrg tree_code code)
174610d565efSmrg {
174710d565efSmrg unsigned int i;
174810d565efSmrg vec<data_reference_p> datarefs = LOOP_VINFO_DATAREFS (loop_vinfo);
174910d565efSmrg struct data_reference *dr;
175010d565efSmrg
17510fc04c29Smrg DUMP_VECT_SCOPE ("vect_update_inits_of_dr");
175210d565efSmrg
1753*ec02198aSmrg   /* Adjust niters to sizetype.  We used to insert the stmts on loop preheader
1754*ec02198aSmrg      here, but since we might use these niters to update the epilogues niters
1755*ec02198aSmrg      and data references we can't insert them here as this definition might not
1756*ec02198aSmrg      always dominate its uses.  */
175710d565efSmrg if (!types_compatible_p (sizetype, TREE_TYPE (niters)))
175810d565efSmrg niters = fold_convert (sizetype, niters);
175910d565efSmrg
/* Gather/scatter and SIMD-lane accesses do not use the DR offset and
   are therefore left untouched. */
176010d565efSmrg FOR_EACH_VEC_ELT (datarefs, i, dr)
17610fc04c29Smrg {
17620fc04c29Smrg dr_vec_info *dr_info = loop_vinfo->lookup_dr (dr);
1763*ec02198aSmrg       if (!STMT_VINFO_GATHER_SCATTER_P (dr_info->stmt)
1764*ec02198aSmrg 	  && !STMT_VINFO_SIMD_LANE_ACCESS_P (dr_info->stmt))
1765*ec02198aSmrg 	vect_update_init_of_dr (dr_info, niters, code);
176610d565efSmrg }
17670fc04c29Smrg }
176810d565efSmrg
1769c7a68eb7Smrg /* For the information recorded in LOOP_VINFO prepare the loop for peeling
1770c7a68eb7Smrg by masking. This involves calculating the number of iterations to
1771c7a68eb7Smrg be peeled and then aligning all memory references appropriately. */
1772c7a68eb7Smrg
1773c7a68eb7Smrg void
vect_prepare_for_masked_peels(loop_vec_info loop_vinfo)1774c7a68eb7Smrg vect_prepare_for_masked_peels (loop_vec_info loop_vinfo)
1775c7a68eb7Smrg {
1776c7a68eb7Smrg tree misalign_in_elems;
1777c7a68eb7Smrg tree type = LOOP_VINFO_MASK_COMPARE_TYPE (loop_vinfo);
1778c7a68eb7Smrg
1779c7a68eb7Smrg gcc_assert (vect_use_loop_mask_for_alignment_p (loop_vinfo));
1780c7a68eb7Smrg
1781c7a68eb7Smrg /* From the information recorded in LOOP_VINFO get the number of iterations
1782c7a68eb7Smrg that need to be skipped via masking. */
1783c7a68eb7Smrg if (LOOP_VINFO_PEELING_FOR_ALIGNMENT (loop_vinfo) > 0)
1784c7a68eb7Smrg {
/* Known peel amount: skip VF - npeel elements at the start. */
1785c7a68eb7Smrg poly_int64 misalign = (LOOP_VINFO_VECT_FACTOR (loop_vinfo)
1786c7a68eb7Smrg - LOOP_VINFO_PEELING_FOR_ALIGNMENT (loop_vinfo));
1787c7a68eb7Smrg misalign_in_elems = build_int_cst (type, misalign);
1788c7a68eb7Smrg }
1789c7a68eb7Smrg else
1790c7a68eb7Smrg {
/* Unknown peel amount: compute the misalignment at runtime and emit
   the computation on the loop preheader edge. */
1791c7a68eb7Smrg gimple_seq seq1 = NULL, seq2 = NULL;
1792c7a68eb7Smrg misalign_in_elems = get_misalign_in_elems (&seq1, loop_vinfo);
1793c7a68eb7Smrg misalign_in_elems = fold_convert (type, misalign_in_elems);
1794c7a68eb7Smrg misalign_in_elems = force_gimple_operand (misalign_in_elems,
1795c7a68eb7Smrg &seq2, true, NULL_TREE);
1796c7a68eb7Smrg gimple_seq_add_seq (&seq1, seq2);
1797c7a68eb7Smrg if (seq1)
1798c7a68eb7Smrg {
1799c7a68eb7Smrg edge pe = loop_preheader_edge (LOOP_VINFO_LOOP (loop_vinfo));
1800c7a68eb7Smrg basic_block new_bb = gsi_insert_seq_on_edge_immediate (pe, seq1);
1801c7a68eb7Smrg gcc_assert (!new_bb);
1802c7a68eb7Smrg }
1803c7a68eb7Smrg }
1804c7a68eb7Smrg
1805c7a68eb7Smrg if (dump_enabled_p ())
1806c7a68eb7Smrg dump_printf_loc (MSG_NOTE, vect_location,
18070fc04c29Smrg "misalignment for fully-masked loop: %T\n",
18080fc04c29Smrg misalign_in_elems);
1809c7a68eb7Smrg
1810c7a68eb7Smrg LOOP_VINFO_MASK_SKIP_NITERS (loop_vinfo) = misalign_in_elems;
1811c7a68eb7Smrg
/* Shift all data references back by the skipped iterations so that the
   masked first vector iteration accesses the right elements. */
1812c7a68eb7Smrg vect_update_inits_of_drs (loop_vinfo, misalign_in_elems, MINUS_EXPR);
1813c7a68eb7Smrg }
181410d565efSmrg
181510d565efSmrg /* This function builds ni_name = number of iterations. Statements
1816c7a68eb7Smrg are emitted on the loop preheader edge. If NEW_VAR_P is not NULL, set
1817c7a68eb7Smrg it to TRUE if new ssa_var is generated. */
181810d565efSmrg
181910d565efSmrg tree
vect_build_loop_niters(loop_vec_info loop_vinfo,bool * new_var_p)1820c7a68eb7Smrg vect_build_loop_niters (loop_vec_info loop_vinfo, bool *new_var_p)
182110d565efSmrg {
182210d565efSmrg tree ni = unshare_expr (LOOP_VINFO_NITERS (loop_vinfo));
/* A constant iteration count needs no SSA name or statements. */
182310d565efSmrg if (TREE_CODE (ni) == INTEGER_CST)
182410d565efSmrg return ni;
182510d565efSmrg else
182610d565efSmrg {
182710d565efSmrg tree ni_name, var;
182810d565efSmrg gimple_seq stmts = NULL;
182910d565efSmrg edge pe = loop_preheader_edge (LOOP_VINFO_LOOP (loop_vinfo));
183010d565efSmrg
183110d565efSmrg var = create_tmp_var (TREE_TYPE (ni), "niters");
183210d565efSmrg ni_name = force_gimple_operand (ni, &stmts, false, var);
/* Any statements needed to compute NITERS go on the preheader edge;
   report via *NEW_VAR_P that a fresh SSA variable was created. */
183310d565efSmrg if (stmts)
1834c7a68eb7Smrg {
183510d565efSmrg gsi_insert_seq_on_edge_immediate (pe, stmts);
1836c7a68eb7Smrg if (new_var_p != NULL)
1837c7a68eb7Smrg *new_var_p = true;
1838c7a68eb7Smrg }
183910d565efSmrg
184010d565efSmrg return ni_name;
184110d565efSmrg }
184210d565efSmrg }
184310d565efSmrg
184410d565efSmrg /* Calculate the number of iterations above which vectorized loop will be
184510d565efSmrg preferred than scalar loop. NITERS_PROLOG is the number of iterations
184610d565efSmrg of prolog loop. If it's integer const, the integer number is also passed
1847c7a68eb7Smrg in INT_NITERS_PROLOG. BOUND_PROLOG is the upper bound (inclusive) of the
1848c7a68eb7Smrg number of iterations of the prolog loop. BOUND_EPILOG is the corresponding
1849c7a68eb7Smrg value for the epilog loop. If CHECK_PROFITABILITY is true, TH is the
1850c7a68eb7Smrg threshold below which the scalar (rather than vectorized) loop will be
1851c7a68eb7Smrg executed. This function stores the upper bound (inclusive) of the result
1852c7a68eb7Smrg in BOUND_SCALAR. */
185310d565efSmrg
185410d565efSmrg static tree
vect_gen_scalar_loop_niters(tree niters_prolog,int int_niters_prolog,int bound_prolog,poly_int64 bound_epilog,int th,poly_uint64 * bound_scalar,bool check_profitability)185510d565efSmrg vect_gen_scalar_loop_niters (tree niters_prolog, int int_niters_prolog,
1856c7a68eb7Smrg int bound_prolog, poly_int64 bound_epilog, int th,
1857c7a68eb7Smrg poly_uint64 *bound_scalar,
1858c7a68eb7Smrg bool check_profitability)
185910d565efSmrg {
/* Total scalar iterations outside the vector loop:
   prolog iterations + epilog iterations. */
186010d565efSmrg tree type = TREE_TYPE (niters_prolog);
186110d565efSmrg tree niters = fold_build2 (PLUS_EXPR, type, niters_prolog,
1862c7a68eb7Smrg build_int_cst (type, bound_epilog));
186310d565efSmrg
1864c7a68eb7Smrg *bound_scalar = bound_prolog + bound_epilog;
186510d565efSmrg if (check_profitability)
186610d565efSmrg {
186710d565efSmrg /* TH indicates the minimum niters of vectorized loop, while we
186810d565efSmrg compute the maximum niters of scalar loop. */
186910d565efSmrg th--;
187010d565efSmrg /* Peeling for constant times. */
187110d565efSmrg if (int_niters_prolog >= 0)
187210d565efSmrg {
1873c7a68eb7Smrg *bound_scalar = upper_bound (int_niters_prolog + bound_epilog, th);
187410d565efSmrg return build_int_cst (type, *bound_scalar);
187510d565efSmrg }
1876c7a68eb7Smrg /* Peeling an unknown number of times. Note that both BOUND_PROLOG
1877c7a68eb7Smrg and BOUND_EPILOG are inclusive upper bounds. */
1878c7a68eb7Smrg if (known_ge (th, bound_prolog + bound_epilog))
187910d565efSmrg {
/* The threshold dominates any possible peel count, so the scalar
   loop runs exactly TH iterations. */
188010d565efSmrg *bound_scalar = th;
188110d565efSmrg return build_int_cst (type, th);
188210d565efSmrg }
1883c7a68eb7Smrg /* Need to do runtime comparison. */
1884c7a68eb7Smrg else if (maybe_gt (th, bound_epilog))
1885c7a68eb7Smrg {
1886c7a68eb7Smrg *bound_scalar = upper_bound (*bound_scalar, th);
1887c7a68eb7Smrg return fold_build2 (MAX_EXPR, type,
1888c7a68eb7Smrg build_int_cst (type, th), niters);
1889c7a68eb7Smrg }
189010d565efSmrg }
189110d565efSmrg return niters;
189210d565efSmrg }
189310d565efSmrg
1894c7a68eb7Smrg /* NITERS is the number of times that the original scalar loop executes
1895c7a68eb7Smrg after peeling. Work out the maximum number of iterations N that can
1896c7a68eb7Smrg be handled by the vectorized form of the loop and then either:
189710d565efSmrg
1898c7a68eb7Smrg a) set *STEP_VECTOR_PTR to the vectorization factor and generate:
189910d565efSmrg
1900c7a68eb7Smrg niters_vector = N
1901c7a68eb7Smrg
1902c7a68eb7Smrg b) set *STEP_VECTOR_PTR to one and generate:
1903c7a68eb7Smrg
1904c7a68eb7Smrg niters_vector = N / vf
1905c7a68eb7Smrg
1906c7a68eb7Smrg In both cases, store niters_vector in *NITERS_VECTOR_PTR and add
1907c7a68eb7Smrg any new statements on the loop preheader edge. NITERS_NO_OVERFLOW
1908c7a68eb7Smrg is true if NITERS doesn't overflow (i.e. if NITERS is always nonzero). */
190910d565efSmrg
191010d565efSmrg void
vect_gen_vector_loop_niters(loop_vec_info loop_vinfo,tree niters,tree * niters_vector_ptr,tree * step_vector_ptr,bool niters_no_overflow)191110d565efSmrg vect_gen_vector_loop_niters (loop_vec_info loop_vinfo, tree niters,
1912c7a68eb7Smrg tree *niters_vector_ptr, tree *step_vector_ptr,
1913c7a68eb7Smrg bool niters_no_overflow)
191410d565efSmrg {
191510d565efSmrg tree ni_minus_gap, var;
1916c7a68eb7Smrg tree niters_vector, step_vector, type = TREE_TYPE (niters);
1917c7a68eb7Smrg poly_uint64 vf = LOOP_VINFO_VECT_FACTOR (loop_vinfo);
191810d565efSmrg edge pe = loop_preheader_edge (LOOP_VINFO_LOOP (loop_vinfo));
1919c7a68eb7Smrg tree log_vf = NULL_TREE;
192010d565efSmrg
192110d565efSmrg /* If epilogue loop is required because of data accesses with gaps, we
192210d565efSmrg subtract one iteration from the total number of iterations here for
192310d565efSmrg correct calculation of RATIO. */
192410d565efSmrg if (LOOP_VINFO_PEELING_FOR_GAPS (loop_vinfo))
192510d565efSmrg {
1926c7a68eb7Smrg ni_minus_gap = fold_build2 (MINUS_EXPR, type, niters,
1927c7a68eb7Smrg build_one_cst (type));
192810d565efSmrg if (!is_gimple_val (ni_minus_gap))
192910d565efSmrg {
1930c7a68eb7Smrg var = create_tmp_var (type, "ni_gap");
193110d565efSmrg gimple *stmts = NULL;
193210d565efSmrg ni_minus_gap = force_gimple_operand (ni_minus_gap, &stmts,
193310d565efSmrg true, var);
193410d565efSmrg gsi_insert_seq_on_edge_immediate (pe, stmts);
193510d565efSmrg }
193610d565efSmrg }
193710d565efSmrg else
193810d565efSmrg ni_minus_gap = niters;
193910d565efSmrg
/* Case (b) of the header comment: constant VF and a non-fully-masked
   loop.  The vector iteration count is NITERS / VF and the step is 1. */
1940c7a68eb7Smrg unsigned HOST_WIDE_INT const_vf;
1941c7a68eb7Smrg if (vf.is_constant (&const_vf)
1942c7a68eb7Smrg && !LOOP_VINFO_FULLY_MASKED_P (loop_vinfo))
1943c7a68eb7Smrg {
194410d565efSmrg /* Create: niters >> log2(vf) */
194510d565efSmrg /* If it's known that niters == number of latch executions + 1 doesn't
194610d565efSmrg overflow, we can generate niters >> log2(vf); otherwise we generate
194710d565efSmrg (niters - vf) >> log2(vf) + 1 by using the fact that we know ratio
194810d565efSmrg will be at least one. */
1949c7a68eb7Smrg log_vf = build_int_cst (type, exact_log2 (const_vf));
195010d565efSmrg if (niters_no_overflow)
1951c7a68eb7Smrg niters_vector = fold_build2 (RSHIFT_EXPR, type, ni_minus_gap, log_vf);
195210d565efSmrg else
195310d565efSmrg niters_vector
1954c7a68eb7Smrg = fold_build2 (PLUS_EXPR, type,
1955c7a68eb7Smrg fold_build2 (RSHIFT_EXPR, type,
1956c7a68eb7Smrg fold_build2 (MINUS_EXPR, type,
195710d565efSmrg ni_minus_gap,
1958c7a68eb7Smrg build_int_cst (type, vf)),
195910d565efSmrg log_vf),
1960c7a68eb7Smrg build_int_cst (type, 1));
1961c7a68eb7Smrg step_vector = build_one_cst (type);
1962c7a68eb7Smrg }
/* Case (a): variable VF or fully-masked loop — keep the scalar count
   and step by VF each vector iteration. */
1963c7a68eb7Smrg else
1964c7a68eb7Smrg {
1965c7a68eb7Smrg niters_vector = ni_minus_gap;
1966c7a68eb7Smrg step_vector = build_int_cst (type, vf);
1967c7a68eb7Smrg }
196810d565efSmrg
196910d565efSmrg if (!is_gimple_val (niters_vector))
197010d565efSmrg {
1971c7a68eb7Smrg var = create_tmp_var (type, "bnd");
1972c7a68eb7Smrg gimple_seq stmts = NULL;
197310d565efSmrg niters_vector = force_gimple_operand (niters_vector, &stmts, true, var);
197410d565efSmrg gsi_insert_seq_on_edge_immediate (pe, stmts);
1975c7a68eb7Smrg /* Peeling algorithm guarantees that vector loop bound is at least ONE,
1976*ec02198aSmrg 	 we set range information to make niters analyzer's life easier.
1977*ec02198aSmrg 	 Note the number of latch iteration value can be TYPE_MAX_VALUE so
1978*ec02198aSmrg 	 we have to represent the vector niter TYPE_MAX_VALUE + 1 >> log_vf.  */
1979c7a68eb7Smrg if (stmts != NULL && log_vf)
1980*ec02198aSmrg 	{
1981*ec02198aSmrg 	  if (niters_no_overflow)
1982c7a68eb7Smrg set_range_info (niters_vector, VR_RANGE,
1983*ec02198aSmrg 			    wi::one (TYPE_PRECISION (type)),
1984*ec02198aSmrg 			    wi::rshift (wi::max_value (TYPE_PRECISION (type),
1985*ec02198aSmrg 						       TYPE_SIGN (type)),
1986*ec02198aSmrg 					exact_log2 (const_vf),
1987*ec02198aSmrg 					TYPE_SIGN (type)));
1988*ec02198aSmrg 	  /* For VF == 1 the vector IV might also overflow so we cannot
1989*ec02198aSmrg 	     assert a minimum value of 1.  */
1990*ec02198aSmrg 	  else if (const_vf > 1)
1991*ec02198aSmrg 	    set_range_info (niters_vector, VR_RANGE,
1992*ec02198aSmrg 			    wi::one (TYPE_PRECISION (type)),
1993*ec02198aSmrg 			    wi::rshift (wi::max_value (TYPE_PRECISION (type),
1994*ec02198aSmrg 						       TYPE_SIGN (type))
1995*ec02198aSmrg 					- (const_vf - 1),
1996*ec02198aSmrg 					exact_log2 (const_vf), TYPE_SIGN (type))
1997*ec02198aSmrg 			    + 1);
1998*ec02198aSmrg 	}
199910d565efSmrg }
200010d565efSmrg *niters_vector_ptr = niters_vector;
2001c7a68eb7Smrg *step_vector_ptr = step_vector;
200210d565efSmrg
200310d565efSmrg return;
200410d565efSmrg }
200510d565efSmrg
200610d565efSmrg /* Given NITERS_VECTOR which is the number of iterations for vectorized
200710d565efSmrg loop specified by LOOP_VINFO after vectorization, compute the number
200810d565efSmrg of iterations before vectorization (niters_vector * vf) and store it
200910d565efSmrg to NITERS_VECTOR_MULT_VF_PTR. */
201010d565efSmrg
static void
vect_gen_vector_loop_niters_mult_vf (loop_vec_info loop_vinfo,
				     tree niters_vector,
				     tree *niters_vector_mult_vf_ptr)
{
  /* We should be using a step_vector of VF if VF is variable.  */
  int vf = LOOP_VINFO_VECT_FACTOR (loop_vinfo).to_constant ();
  class loop *loop = LOOP_VINFO_LOOP (loop_vinfo);
  tree type = TREE_TYPE (niters_vector);
  /* VF is assumed to be a power of two here, so the multiplication
     below can be done as a left shift by log2 (VF).  */
  tree log_vf = build_int_cst (type, exact_log2 (vf));
  basic_block exit_bb = single_exit (loop)->dest;

  gcc_assert (niters_vector_mult_vf_ptr != NULL);
  /* niters_vector * vf, expressed as niters_vector << log2 (vf).  */
  tree niters_vector_mult_vf = fold_build2 (LSHIFT_EXPR, type,
					    niters_vector, log_vf);
  /* If folding did not yield a simple SSA name / constant, materialize
     the computation as statements.  They are inserted at the start of
     the loop's exit block because the value is only needed after the
     vectorized loop has finished.  */
  if (!is_gimple_val (niters_vector_mult_vf))
    {
      tree var = create_tmp_var (type, "niters_vector_mult_vf");
      gimple_seq stmts = NULL;
      niters_vector_mult_vf = force_gimple_operand (niters_vector_mult_vf,
						    &stmts, true, var);
      gimple_stmt_iterator gsi = gsi_start_bb (exit_bb);
      gsi_insert_seq_before (&gsi, stmts, GSI_SAME_STMT);
    }
  *niters_vector_mult_vf_ptr = niters_vector_mult_vf;
}
203710d565efSmrg
2038*ec02198aSmrg /* LCSSA_PHI is a lcssa phi of EPILOG loop which is copied from LOOP,
2039*ec02198aSmrg this function searches for the corresponding lcssa phi node in exit
2040*ec02198aSmrg bb of LOOP. If it is found, return the phi result; otherwise return
2041*ec02198aSmrg NULL. */
2042*ec02198aSmrg
2043*ec02198aSmrg static tree
find_guard_arg(class loop * loop,class loop * epilog ATTRIBUTE_UNUSED,gphi * lcssa_phi)2044*ec02198aSmrg find_guard_arg (class loop *loop, class loop *epilog ATTRIBUTE_UNUSED,
2045*ec02198aSmrg gphi *lcssa_phi)
2046*ec02198aSmrg {
2047*ec02198aSmrg gphi_iterator gsi;
2048*ec02198aSmrg edge e = single_exit (loop);
2049*ec02198aSmrg
2050*ec02198aSmrg gcc_assert (single_pred_p (e->dest));
2051*ec02198aSmrg for (gsi = gsi_start_phis (e->dest); !gsi_end_p (gsi); gsi_next (&gsi))
2052*ec02198aSmrg {
2053*ec02198aSmrg gphi *phi = gsi.phi ();
2054*ec02198aSmrg if (operand_equal_p (PHI_ARG_DEF (phi, 0),
2055*ec02198aSmrg PHI_ARG_DEF (lcssa_phi, 0), 0))
2056*ec02198aSmrg return PHI_RESULT (phi);
2057*ec02198aSmrg }
2058*ec02198aSmrg return NULL_TREE;
2059*ec02198aSmrg }
2060*ec02198aSmrg
206110d565efSmrg /* Function slpeel_tree_duplicate_loop_to_edge_cfg duplicates FIRST/SECOND
206210d565efSmrg from SECOND/FIRST and puts it at the original loop's preheader/exit
206310d565efSmrg edge, the two loops are arranged as below:
206410d565efSmrg
206510d565efSmrg preheader_a:
206610d565efSmrg first_loop:
206710d565efSmrg header_a:
206810d565efSmrg i_1 = PHI<i_0, i_2>;
206910d565efSmrg ...
207010d565efSmrg i_2 = i_1 + 1;
207110d565efSmrg if (cond_a)
207210d565efSmrg goto latch_a;
207310d565efSmrg else
207410d565efSmrg goto between_bb;
207510d565efSmrg latch_a:
207610d565efSmrg goto header_a;
207710d565efSmrg
207810d565efSmrg between_bb:
207910d565efSmrg ;; i_x = PHI<i_2>; ;; LCSSA phi node to be created for FIRST,
208010d565efSmrg
208110d565efSmrg second_loop:
208210d565efSmrg header_b:
208310d565efSmrg i_3 = PHI<i_0, i_4>; ;; Use of i_0 to be replaced with i_x,
208410d565efSmrg or with i_2 if no LCSSA phi is created
208510d565efSmrg under condition of CREATE_LCSSA_FOR_IV_PHIS.
208610d565efSmrg ...
208710d565efSmrg i_4 = i_3 + 1;
208810d565efSmrg if (cond_b)
208910d565efSmrg goto latch_b;
209010d565efSmrg else
209110d565efSmrg goto exit_bb;
209210d565efSmrg latch_b:
209310d565efSmrg goto header_b;
209410d565efSmrg
209510d565efSmrg exit_bb:
209610d565efSmrg
209710d565efSmrg This function creates loop closed SSA for the first loop; update the
209810d565efSmrg second loop's PHI nodes by replacing argument on incoming edge with the
209910d565efSmrg result of newly created lcssa PHI nodes. IF CREATE_LCSSA_FOR_IV_PHIS
210010d565efSmrg is false, Loop closed ssa phis will only be created for non-iv phis for
210110d565efSmrg the first loop.
210210d565efSmrg
210310d565efSmrg This function assumes exit bb of the first loop is preheader bb of the
210410d565efSmrg second loop, i.e, between_bb in the example code. With PHIs updated,
210510d565efSmrg the second loop will execute rest iterations of the first. */
210610d565efSmrg
static void
slpeel_update_phi_nodes_for_loops (loop_vec_info loop_vinfo,
				   class loop *first, class loop *second,
				   bool create_lcssa_for_iv_phis)
{
  gphi_iterator gsi_update, gsi_orig;
  class loop *loop = LOOP_VINFO_LOOP (loop_vinfo);

  /* The value leaving FIRST through its latch flows through BETWEEN_BB
     (FIRST's single exit destination) into SECOND's preheader.  */
  edge first_latch_e = EDGE_SUCC (first->latch, 0);
  edge second_preheader_e = loop_preheader_edge (second);
  basic_block between_bb = single_exit (first)->dest;

  gcc_assert (between_bb == second_preheader_e->src);
  gcc_assert (single_pred_p (between_bb) && single_succ_p (between_bb));
  /* Either the first loop or the second is the loop to be vectorized.  */
  gcc_assert (loop == first || loop == second);

  /* Walk the header PHIs of both loops in lockstep; since SECOND is a
     copy of FIRST the PHIs are assumed to correspond pairwise in order.  */
  for (gsi_orig = gsi_start_phis (first->header),
       gsi_update = gsi_start_phis (second->header);
       !gsi_end_p (gsi_orig) && !gsi_end_p (gsi_update);
       gsi_next (&gsi_orig), gsi_next (&gsi_update))
    {
      gphi *orig_phi = gsi_orig.phi ();
      gphi *update_phi = gsi_update.phi ();

      tree arg = PHI_ARG_DEF_FROM_EDGE (orig_phi, first_latch_e);
      /* Generate lcssa PHI node for the first loop.  The stmt_vec_info
	 is looked up on whichever PHI belongs to the loop to be
	 vectorized, since only that loop has vec_info records.  */
      gphi *vect_phi = (loop == first) ? orig_phi : update_phi;
      stmt_vec_info vect_phi_info = loop_vinfo->lookup_stmt (vect_phi);
      /* IV PHIs are skipped when the caller requested so; everything
	 else always gets an LCSSA PHI in BETWEEN_BB.  */
      if (create_lcssa_for_iv_phis || !iv_phi_p (vect_phi_info))
	{
	  tree new_res = copy_ssa_name (PHI_RESULT (orig_phi));
	  gphi *lcssa_phi = create_phi_node (new_res, between_bb);
	  add_phi_arg (lcssa_phi, arg, single_exit (first), UNKNOWN_LOCATION);
	  arg = new_res;
	}

      /* Update PHI node in the second loop by replacing arg on the loop's
	 incoming edge.  */
      adjust_phi_and_debug_stmts (update_phi, second_preheader_e, arg);
    }

  /* For epilogue peeling we have to make sure to copy all LC PHIs
     for correct vectorization of live stmts.  */
  if (loop == first)
    {
      basic_block orig_exit = single_exit (second)->dest;
      for (gsi_orig = gsi_start_phis (orig_exit);
	   !gsi_end_p (gsi_orig); gsi_next (&gsi_orig))
	{
	  gphi *orig_phi = gsi_orig.phi ();
	  tree orig_arg = PHI_ARG_DEF (orig_phi, 0);
	  /* Only real (non-virtual) SSA names need LC PHIs here.  */
	  if (TREE_CODE (orig_arg) != SSA_NAME || virtual_operand_p (orig_arg))
	    continue;

	  /* Already created in the above loop.  */
	  if (find_guard_arg (first, second, orig_phi))
	    continue;

	  tree new_res = copy_ssa_name (orig_arg);
	  gphi *lcphi = create_phi_node (new_res, between_bb);
	  add_phi_arg (lcphi, orig_arg, single_exit (first), UNKNOWN_LOCATION);
	}
    }
}
217210d565efSmrg
217310d565efSmrg /* Function slpeel_add_loop_guard adds guard skipping from the beginning
217410d565efSmrg of SKIP_LOOP to the beginning of UPDATE_LOOP. GUARD_EDGE and MERGE_EDGE
217510d565efSmrg are two pred edges of the merge point before UPDATE_LOOP. The two loops
217610d565efSmrg appear like below:
217710d565efSmrg
217810d565efSmrg guard_bb:
217910d565efSmrg if (cond)
218010d565efSmrg goto merge_bb;
218110d565efSmrg else
218210d565efSmrg goto skip_loop;
218310d565efSmrg
218410d565efSmrg skip_loop:
218510d565efSmrg header_a:
218610d565efSmrg i_1 = PHI<i_0, i_2>;
218710d565efSmrg ...
218810d565efSmrg i_2 = i_1 + 1;
218910d565efSmrg if (cond_a)
219010d565efSmrg goto latch_a;
219110d565efSmrg else
219210d565efSmrg goto exit_a;
219310d565efSmrg latch_a:
219410d565efSmrg goto header_a;
219510d565efSmrg
219610d565efSmrg exit_a:
219710d565efSmrg i_5 = PHI<i_2>;
219810d565efSmrg
219910d565efSmrg merge_bb:
220010d565efSmrg ;; PHI (i_x = PHI<i_0, i_5>) to be created at merge point.
220110d565efSmrg
220210d565efSmrg update_loop:
220310d565efSmrg header_b:
220410d565efSmrg i_3 = PHI<i_5, i_4>; ;; Use of i_5 to be replaced with i_x.
220510d565efSmrg ...
220610d565efSmrg i_4 = i_3 + 1;
220710d565efSmrg if (cond_b)
220810d565efSmrg goto latch_b;
220910d565efSmrg else
221010d565efSmrg goto exit_bb;
221110d565efSmrg latch_b:
221210d565efSmrg goto header_b;
221310d565efSmrg
221410d565efSmrg exit_bb:
221510d565efSmrg
221610d565efSmrg This function creates PHI nodes at merge_bb and replaces the use of i_5
221710d565efSmrg in the update_loop's PHI node with the result of new PHI result. */
221810d565efSmrg
221910d565efSmrg static void
slpeel_update_phi_nodes_for_guard1(class loop * skip_loop,class loop * update_loop,edge guard_edge,edge merge_edge)2220*ec02198aSmrg slpeel_update_phi_nodes_for_guard1 (class loop *skip_loop,
2221*ec02198aSmrg class loop *update_loop,
222210d565efSmrg edge guard_edge, edge merge_edge)
222310d565efSmrg {
22240fc04c29Smrg location_t merge_loc, guard_loc;
222510d565efSmrg edge orig_e = loop_preheader_edge (skip_loop);
222610d565efSmrg edge update_e = loop_preheader_edge (update_loop);
222710d565efSmrg gphi_iterator gsi_orig, gsi_update;
222810d565efSmrg
222910d565efSmrg for ((gsi_orig = gsi_start_phis (skip_loop->header),
223010d565efSmrg gsi_update = gsi_start_phis (update_loop->header));
223110d565efSmrg !gsi_end_p (gsi_orig) && !gsi_end_p (gsi_update);
223210d565efSmrg gsi_next (&gsi_orig), gsi_next (&gsi_update))
223310d565efSmrg {
223410d565efSmrg gphi *orig_phi = gsi_orig.phi ();
223510d565efSmrg gphi *update_phi = gsi_update.phi ();
223610d565efSmrg
223710d565efSmrg /* Generate new phi node at merge bb of the guard. */
223810d565efSmrg tree new_res = copy_ssa_name (PHI_RESULT (orig_phi));
223910d565efSmrg gphi *new_phi = create_phi_node (new_res, guard_edge->dest);
224010d565efSmrg
224110d565efSmrg /* Merge bb has two incoming edges: GUARD_EDGE and MERGE_EDGE. Set the
224210d565efSmrg args in NEW_PHI for these edges. */
224310d565efSmrg tree merge_arg = PHI_ARG_DEF_FROM_EDGE (update_phi, update_e);
224410d565efSmrg tree guard_arg = PHI_ARG_DEF_FROM_EDGE (orig_phi, orig_e);
224510d565efSmrg merge_loc = gimple_phi_arg_location_from_edge (update_phi, update_e);
224610d565efSmrg guard_loc = gimple_phi_arg_location_from_edge (orig_phi, orig_e);
224710d565efSmrg add_phi_arg (new_phi, merge_arg, merge_edge, merge_loc);
224810d565efSmrg add_phi_arg (new_phi, guard_arg, guard_edge, guard_loc);
224910d565efSmrg
225010d565efSmrg /* Update phi in UPDATE_PHI. */
225110d565efSmrg adjust_phi_and_debug_stmts (update_phi, update_e, new_res);
225210d565efSmrg }
225310d565efSmrg }
225410d565efSmrg
225510d565efSmrg /* LOOP and EPILOG are two consecutive loops in CFG and EPILOG is copied
225610d565efSmrg from LOOP. Function slpeel_add_loop_guard adds guard skipping from a
225710d565efSmrg point between the two loops to the end of EPILOG. Edges GUARD_EDGE
225810d565efSmrg and MERGE_EDGE are the two pred edges of merge_bb at the end of EPILOG.
225910d565efSmrg The CFG looks like:
226010d565efSmrg
226110d565efSmrg loop:
226210d565efSmrg header_a:
226310d565efSmrg i_1 = PHI<i_0, i_2>;
226410d565efSmrg ...
226510d565efSmrg i_2 = i_1 + 1;
226610d565efSmrg if (cond_a)
226710d565efSmrg goto latch_a;
226810d565efSmrg else
226910d565efSmrg goto exit_a;
227010d565efSmrg latch_a:
227110d565efSmrg goto header_a;
227210d565efSmrg
227310d565efSmrg exit_a:
227410d565efSmrg
227510d565efSmrg guard_bb:
227610d565efSmrg if (cond)
227710d565efSmrg goto merge_bb;
227810d565efSmrg else
227910d565efSmrg goto epilog_loop;
228010d565efSmrg
228110d565efSmrg ;; fall_through_bb
228210d565efSmrg
228310d565efSmrg epilog_loop:
228410d565efSmrg header_b:
228510d565efSmrg i_3 = PHI<i_2, i_4>;
228610d565efSmrg ...
228710d565efSmrg i_4 = i_3 + 1;
228810d565efSmrg if (cond_b)
228910d565efSmrg goto latch_b;
229010d565efSmrg else
229110d565efSmrg goto merge_bb;
229210d565efSmrg latch_b:
229310d565efSmrg goto header_b;
229410d565efSmrg
229510d565efSmrg merge_bb:
229610d565efSmrg ; PHI node (i_y = PHI<i_2, i_4>) to be created at merge point.
229710d565efSmrg
229810d565efSmrg exit_bb:
229910d565efSmrg i_x = PHI<i_4>; ;Use of i_4 to be replaced with i_y in merge_bb.
230010d565efSmrg
230110d565efSmrg For each name used outside EPILOG (i.e - for each name that has a lcssa
230210d565efSmrg phi in exit_bb) we create a new PHI in merge_bb. The new PHI has two
230310d565efSmrg args corresponding to GUARD_EDGE and MERGE_EDGE. Arg for MERGE_EDGE is
230410d565efSmrg the arg of the original PHI in exit_bb, arg for GUARD_EDGE is defined
230510d565efSmrg by LOOP and is found in the exit bb of LOOP. Arg of the original PHI
230610d565efSmrg in exit_bb will also be updated. */
230710d565efSmrg
230810d565efSmrg static void
slpeel_update_phi_nodes_for_guard2(class loop * loop,class loop * epilog,edge guard_edge,edge merge_edge)2309*ec02198aSmrg slpeel_update_phi_nodes_for_guard2 (class loop *loop, class loop *epilog,
231010d565efSmrg edge guard_edge, edge merge_edge)
231110d565efSmrg {
231210d565efSmrg gphi_iterator gsi;
231310d565efSmrg basic_block merge_bb = guard_edge->dest;
231410d565efSmrg
231510d565efSmrg gcc_assert (single_succ_p (merge_bb));
231610d565efSmrg edge e = single_succ_edge (merge_bb);
231710d565efSmrg basic_block exit_bb = e->dest;
231810d565efSmrg gcc_assert (single_pred_p (exit_bb));
231910d565efSmrg gcc_assert (single_pred (exit_bb) == single_exit (epilog)->dest);
232010d565efSmrg
232110d565efSmrg for (gsi = gsi_start_phis (exit_bb); !gsi_end_p (gsi); gsi_next (&gsi))
232210d565efSmrg {
232310d565efSmrg gphi *update_phi = gsi.phi ();
232410d565efSmrg tree old_arg = PHI_ARG_DEF (update_phi, 0);
232510d565efSmrg
2326*ec02198aSmrg tree merge_arg = NULL_TREE;
2327*ec02198aSmrg
2328*ec02198aSmrg /* If the old argument is a SSA_NAME use its current_def. */
2329*ec02198aSmrg if (TREE_CODE (old_arg) == SSA_NAME)
2330*ec02198aSmrg merge_arg = get_current_def (old_arg);
2331*ec02198aSmrg /* If it's a constant or doesn't have a current_def, just use the old
2332*ec02198aSmrg argument. */
233310d565efSmrg if (!merge_arg)
233410d565efSmrg merge_arg = old_arg;
233510d565efSmrg
233610d565efSmrg tree guard_arg = find_guard_arg (loop, epilog, update_phi);
233710d565efSmrg /* If the var is live after loop but not a reduction, we simply
233810d565efSmrg use the old arg. */
233910d565efSmrg if (!guard_arg)
234010d565efSmrg guard_arg = old_arg;
234110d565efSmrg
234210d565efSmrg /* Create new phi node in MERGE_BB: */
234310d565efSmrg tree new_res = copy_ssa_name (PHI_RESULT (update_phi));
234410d565efSmrg gphi *merge_phi = create_phi_node (new_res, merge_bb);
234510d565efSmrg
234610d565efSmrg /* MERGE_BB has two incoming edges: GUARD_EDGE and MERGE_EDGE, Set
234710d565efSmrg the two PHI args in merge_phi for these edges. */
234810d565efSmrg add_phi_arg (merge_phi, merge_arg, merge_edge, UNKNOWN_LOCATION);
234910d565efSmrg add_phi_arg (merge_phi, guard_arg, guard_edge, UNKNOWN_LOCATION);
235010d565efSmrg
235110d565efSmrg /* Update the original phi in exit_bb. */
235210d565efSmrg adjust_phi_and_debug_stmts (update_phi, e, new_res);
235310d565efSmrg }
235410d565efSmrg }
235510d565efSmrg
235610d565efSmrg /* EPILOG loop is duplicated from the original loop for vectorizing,
235710d565efSmrg the arg of its loop closed ssa PHI needs to be updated. */
235810d565efSmrg
235910d565efSmrg static void
slpeel_update_phi_nodes_for_lcssa(class loop * epilog)2360*ec02198aSmrg slpeel_update_phi_nodes_for_lcssa (class loop *epilog)
236110d565efSmrg {
236210d565efSmrg gphi_iterator gsi;
236310d565efSmrg basic_block exit_bb = single_exit (epilog)->dest;
236410d565efSmrg
236510d565efSmrg gcc_assert (single_pred_p (exit_bb));
236610d565efSmrg edge e = EDGE_PRED (exit_bb, 0);
236710d565efSmrg for (gsi = gsi_start_phis (exit_bb); !gsi_end_p (gsi); gsi_next (&gsi))
236810d565efSmrg rename_use_op (PHI_ARG_DEF_PTR_FROM_EDGE (gsi.phi (), e));
236910d565efSmrg }
237010d565efSmrg
237110d565efSmrg /* Function vect_do_peeling.
237210d565efSmrg
237310d565efSmrg Input:
237410d565efSmrg - LOOP_VINFO: Represent a loop to be vectorized, which looks like:
237510d565efSmrg
237610d565efSmrg preheader:
237710d565efSmrg LOOP:
237810d565efSmrg header_bb:
237910d565efSmrg loop_body
238010d565efSmrg if (exit_loop_cond) goto exit_bb
238110d565efSmrg else goto header_bb
238210d565efSmrg exit_bb:
238310d565efSmrg
238410d565efSmrg - NITERS: The number of iterations of the loop.
238510d565efSmrg - NITERSM1: The number of iterations of the loop's latch.
238610d565efSmrg - NITERS_NO_OVERFLOW: No overflow in computing NITERS.
238710d565efSmrg - TH, CHECK_PROFITABILITY: Threshold of niters to vectorize loop if
238810d565efSmrg CHECK_PROFITABILITY is true.
238910d565efSmrg Output:
2390c7a68eb7Smrg - *NITERS_VECTOR and *STEP_VECTOR describe how the main loop should
2391c7a68eb7Smrg iterate after vectorization; see vect_set_loop_condition for details.
2392c7a68eb7Smrg - *NITERS_VECTOR_MULT_VF_VAR is either null or an SSA name that
2393c7a68eb7Smrg should be set to the number of scalar iterations handled by the
2394c7a68eb7Smrg vector loop. The SSA name is only used on exit from the loop.
239510d565efSmrg
239610d565efSmrg This function peels prolog and epilog from the loop, adds guards skipping
239710d565efSmrg PROLOG and EPILOG for various conditions. As a result, the changed CFG
239810d565efSmrg would look like:
239910d565efSmrg
240010d565efSmrg guard_bb_1:
240110d565efSmrg if (prefer_scalar_loop) goto merge_bb_1
240210d565efSmrg else goto guard_bb_2
240310d565efSmrg
240410d565efSmrg guard_bb_2:
240510d565efSmrg if (skip_prolog) goto merge_bb_2
240610d565efSmrg else goto prolog_preheader
240710d565efSmrg
240810d565efSmrg prolog_preheader:
240910d565efSmrg PROLOG:
241010d565efSmrg prolog_header_bb:
241110d565efSmrg prolog_body
241210d565efSmrg if (exit_prolog_cond) goto prolog_exit_bb
241310d565efSmrg else goto prolog_header_bb
241410d565efSmrg prolog_exit_bb:
241510d565efSmrg
241610d565efSmrg merge_bb_2:
241710d565efSmrg
241810d565efSmrg vector_preheader:
241910d565efSmrg VECTOR LOOP:
242010d565efSmrg vector_header_bb:
242110d565efSmrg vector_body
242210d565efSmrg if (exit_vector_cond) goto vector_exit_bb
242310d565efSmrg else goto vector_header_bb
242410d565efSmrg vector_exit_bb:
242510d565efSmrg
242610d565efSmrg guard_bb_3:
242710d565efSmrg if (skip_epilog) goto merge_bb_3
242810d565efSmrg else goto epilog_preheader
242910d565efSmrg
243010d565efSmrg merge_bb_1:
243110d565efSmrg
243210d565efSmrg epilog_preheader:
243310d565efSmrg EPILOG:
243410d565efSmrg epilog_header_bb:
243510d565efSmrg epilog_body
243610d565efSmrg if (exit_epilog_cond) goto merge_bb_3
243710d565efSmrg else goto epilog_header_bb
243810d565efSmrg
243910d565efSmrg merge_bb_3:
244010d565efSmrg
244110d565efSmrg Note this function peels prolog and epilog only if it's necessary,
244210d565efSmrg as well as guards.
2443*ec02198aSmrg This function returns the epilogue loop if a decision was made to vectorize
2444*ec02198aSmrg it, otherwise NULL.
2445*ec02198aSmrg
2446*ec02198aSmrg The analysis resulting in this epilogue loop's loop_vec_info was performed
2447*ec02198aSmrg in the same vect_analyze_loop call as the main loop's. At that time
2448*ec02198aSmrg vect_analyze_loop constructs a list of accepted loop_vec_info's for lower
2449*ec02198aSmrg vectorization factors than the main loop. This list is stored in the main
2450*ec02198aSmrg loop's loop_vec_info in the 'epilogue_vinfos' member. Every time we decide to
2451*ec02198aSmrg vectorize the epilogue loop for a lower vectorization factor, the
2452*ec02198aSmrg loop_vec_info sitting at the top of the epilogue_vinfos list is removed,
2453*ec02198aSmrg updated and linked to the epilogue loop. This is later used to vectorize
2454*ec02198aSmrg the epilogue. The reason the loop_vec_info needs updating is that it was
2455*ec02198aSmrg constructed based on the original main loop, and the epilogue loop is a
2456*ec02198aSmrg copy of this loop, so all links pointing to statements in the original loop
2457*ec02198aSmrg need updating. Furthermore, these loop_vec_infos share the
2458*ec02198aSmrg data_reference's records, which will also need to be updated.
245910d565efSmrg
246010d565efSmrg TODO: Guard for prefer_scalar_loop should be emitted along with
246110d565efSmrg versioning conditions if loop versioning is needed. */
246210d565efSmrg
246310d565efSmrg
2464*ec02198aSmrg class loop *
vect_do_peeling(loop_vec_info loop_vinfo,tree niters,tree nitersm1,tree * niters_vector,tree * step_vector,tree * niters_vector_mult_vf_var,int th,bool check_profitability,bool niters_no_overflow,tree * advance)246510d565efSmrg vect_do_peeling (loop_vec_info loop_vinfo, tree niters, tree nitersm1,
2466c7a68eb7Smrg tree *niters_vector, tree *step_vector,
2467c7a68eb7Smrg tree *niters_vector_mult_vf_var, int th,
2468*ec02198aSmrg bool check_profitability, bool niters_no_overflow,
2469*ec02198aSmrg tree *advance)
247010d565efSmrg {
247110d565efSmrg edge e, guard_e;
247210d565efSmrg tree type = TREE_TYPE (niters), guard_cond;
247310d565efSmrg basic_block guard_bb, guard_to;
2474c7a68eb7Smrg profile_probability prob_prolog, prob_vector, prob_epilog;
2475c7a68eb7Smrg int estimated_vf;
2476c7a68eb7Smrg int prolog_peeling = 0;
2477*ec02198aSmrg bool vect_epilogues = loop_vinfo->epilogue_vinfos.length () > 0;
24780fc04c29Smrg /* We currently do not support prolog peeling if the target alignment is not
24790fc04c29Smrg known at compile time. 'vect_gen_prolog_loop_niters' depends on the
24800fc04c29Smrg target alignment being constant. */
24810fc04c29Smrg dr_vec_info *dr_info = LOOP_VINFO_UNALIGNED_DR (loop_vinfo);
24820fc04c29Smrg if (dr_info && !DR_TARGET_ALIGNMENT (dr_info).is_constant ())
24830fc04c29Smrg return NULL;
24840fc04c29Smrg
2485c7a68eb7Smrg if (!vect_use_loop_mask_for_alignment_p (loop_vinfo))
2486c7a68eb7Smrg prolog_peeling = LOOP_VINFO_PEELING_FOR_ALIGNMENT (loop_vinfo);
2487c7a68eb7Smrg
2488c7a68eb7Smrg poly_uint64 vf = LOOP_VINFO_VECT_FACTOR (loop_vinfo);
2489c7a68eb7Smrg poly_uint64 bound_epilog = 0;
2490c7a68eb7Smrg if (!LOOP_VINFO_FULLY_MASKED_P (loop_vinfo)
2491c7a68eb7Smrg && LOOP_VINFO_PEELING_FOR_NITER (loop_vinfo))
2492c7a68eb7Smrg bound_epilog += vf - 1;
2493c7a68eb7Smrg if (LOOP_VINFO_PEELING_FOR_GAPS (loop_vinfo))
2494c7a68eb7Smrg bound_epilog += 1;
2495c7a68eb7Smrg bool epilog_peeling = maybe_ne (bound_epilog, 0U);
2496c7a68eb7Smrg poly_uint64 bound_scalar = bound_epilog;
249710d565efSmrg
249810d565efSmrg if (!prolog_peeling && !epilog_peeling)
249910d565efSmrg return NULL;
250010d565efSmrg
2501*ec02198aSmrg /* Before doing any peeling make sure to reset debug binds outside of
2502*ec02198aSmrg the loop refering to defs not in LC SSA. */
2503*ec02198aSmrg class loop *loop = LOOP_VINFO_LOOP (loop_vinfo);
2504*ec02198aSmrg for (unsigned i = 0; i < loop->num_nodes; ++i)
2505*ec02198aSmrg {
2506*ec02198aSmrg basic_block bb = LOOP_VINFO_BBS (loop_vinfo)[i];
2507*ec02198aSmrg imm_use_iterator ui;
2508*ec02198aSmrg gimple *use_stmt;
2509*ec02198aSmrg for (gphi_iterator gsi = gsi_start_phis (bb); !gsi_end_p (gsi);
2510*ec02198aSmrg gsi_next (&gsi))
2511*ec02198aSmrg {
2512*ec02198aSmrg FOR_EACH_IMM_USE_STMT (use_stmt, ui, gimple_phi_result (gsi.phi ()))
2513*ec02198aSmrg if (gimple_debug_bind_p (use_stmt)
2514*ec02198aSmrg && loop != gimple_bb (use_stmt)->loop_father
2515*ec02198aSmrg && !flow_loop_nested_p (loop,
2516*ec02198aSmrg gimple_bb (use_stmt)->loop_father))
2517*ec02198aSmrg {
2518*ec02198aSmrg gimple_debug_bind_reset_value (use_stmt);
2519*ec02198aSmrg update_stmt (use_stmt);
2520*ec02198aSmrg }
2521*ec02198aSmrg }
2522*ec02198aSmrg for (gimple_stmt_iterator gsi = gsi_start_bb (bb); !gsi_end_p (gsi);
2523*ec02198aSmrg gsi_next (&gsi))
2524*ec02198aSmrg {
2525*ec02198aSmrg ssa_op_iter op_iter;
2526*ec02198aSmrg def_operand_p def_p;
2527*ec02198aSmrg FOR_EACH_SSA_DEF_OPERAND (def_p, gsi_stmt (gsi), op_iter, SSA_OP_DEF)
2528*ec02198aSmrg FOR_EACH_IMM_USE_STMT (use_stmt, ui, DEF_FROM_PTR (def_p))
2529*ec02198aSmrg if (gimple_debug_bind_p (use_stmt)
2530*ec02198aSmrg && loop != gimple_bb (use_stmt)->loop_father
2531*ec02198aSmrg && !flow_loop_nested_p (loop,
2532*ec02198aSmrg gimple_bb (use_stmt)->loop_father))
2533*ec02198aSmrg {
2534*ec02198aSmrg gimple_debug_bind_reset_value (use_stmt);
2535*ec02198aSmrg update_stmt (use_stmt);
2536*ec02198aSmrg }
2537*ec02198aSmrg }
2538*ec02198aSmrg }
2539*ec02198aSmrg
2540c7a68eb7Smrg prob_vector = profile_probability::guessed_always ().apply_scale (9, 10);
2541c7a68eb7Smrg estimated_vf = vect_vf_for_cost (loop_vinfo);
2542c7a68eb7Smrg if (estimated_vf == 2)
2543c7a68eb7Smrg estimated_vf = 3;
2544c7a68eb7Smrg prob_prolog = prob_epilog = profile_probability::guessed_always ()
2545c7a68eb7Smrg .apply_scale (estimated_vf - 1, estimated_vf);
254610d565efSmrg
2547*ec02198aSmrg class loop *prolog, *epilog = NULL;
2548*ec02198aSmrg class loop *first_loop = loop;
254910d565efSmrg bool irred_flag = loop_preheader_edge (loop)->flags & EDGE_IRREDUCIBLE_LOOP;
2550*ec02198aSmrg
2551*ec02198aSmrg /* We might have a queued need to update virtual SSA form. As we
2552*ec02198aSmrg delete the update SSA machinery below after doing a regular
2553*ec02198aSmrg incremental SSA update during loop copying make sure we don't
2554*ec02198aSmrg lose that fact.
2555*ec02198aSmrg ??? Needing to update virtual SSA form by renaming is unfortunate
2556*ec02198aSmrg but not all of the vectorizer code inserting new loads / stores
2557*ec02198aSmrg properly assigns virtual operands to those statements. */
255810d565efSmrg update_ssa (TODO_update_ssa_only_virtuals);
255910d565efSmrg
2560*ec02198aSmrg create_lcssa_for_virtual_phi (loop);
2561*ec02198aSmrg
2562*ec02198aSmrg /* If we're vectorizing an epilogue loop, the update_ssa above will
2563*ec02198aSmrg have ensured that the virtual operand is in SSA form throughout the
2564*ec02198aSmrg vectorized main loop. Normally it is possible to trace the updated
2565*ec02198aSmrg vector-stmt vdefs back to scalar-stmt vdefs and vector-stmt vuses
2566*ec02198aSmrg back to scalar-stmt vuses, meaning that the effect of the SSA update
2567*ec02198aSmrg remains local to the main loop. However, there are rare cases in
2568*ec02198aSmrg which the vectorized loop has vdefs even when the original scalar
2569*ec02198aSmrg loop didn't. For example, vectorizing a load with IFN_LOAD_LANES
2570*ec02198aSmrg introduces clobbers of the temporary vector array, which in turn
2571*ec02198aSmrg needs new vdefs. If the scalar loop doesn't write to memory, these
2572*ec02198aSmrg new vdefs will be the only ones in the vector loop.
2573*ec02198aSmrg
2574*ec02198aSmrg In that case, update_ssa will have added a new virtual phi to the
2575*ec02198aSmrg main loop, which previously didn't need one. Ensure that we (locally)
2576*ec02198aSmrg maintain LCSSA form for the virtual operand, just as we would have
2577*ec02198aSmrg done if the virtual phi had existed from the outset. This makes it
2578*ec02198aSmrg easier to duplicate the scalar epilogue loop below. */
2579*ec02198aSmrg tree vop_to_rename = NULL_TREE;
2580*ec02198aSmrg if (loop_vec_info orig_loop_vinfo = LOOP_VINFO_ORIG_LOOP_INFO (loop_vinfo))
2581*ec02198aSmrg {
2582*ec02198aSmrg class loop *orig_loop = LOOP_VINFO_LOOP (orig_loop_vinfo);
2583*ec02198aSmrg vop_to_rename = create_lcssa_for_virtual_phi (orig_loop);
2584*ec02198aSmrg }
2585*ec02198aSmrg
2586c7a68eb7Smrg if (MAY_HAVE_DEBUG_BIND_STMTS)
258710d565efSmrg {
258810d565efSmrg gcc_assert (!adjust_vec.exists ());
258910d565efSmrg adjust_vec.create (32);
259010d565efSmrg }
259110d565efSmrg initialize_original_copy_tables ();
259210d565efSmrg
2593c7a68eb7Smrg /* Record the anchor bb at which the guard should be placed if the scalar
2594c7a68eb7Smrg loop might be preferred. */
2595c7a68eb7Smrg basic_block anchor = loop_preheader_edge (loop)->src;
2596c7a68eb7Smrg
2597c7a68eb7Smrg /* Generate the number of iterations for the prolog loop. We do this here
2598c7a68eb7Smrg so that we can also get the upper bound on the number of iterations. */
2599c7a68eb7Smrg tree niters_prolog;
2600c7a68eb7Smrg int bound_prolog = 0;
2601c7a68eb7Smrg if (prolog_peeling)
2602c7a68eb7Smrg niters_prolog = vect_gen_prolog_loop_niters (loop_vinfo, anchor,
2603c7a68eb7Smrg &bound_prolog);
2604c7a68eb7Smrg else
2605c7a68eb7Smrg niters_prolog = build_int_cst (type, 0);
2606c7a68eb7Smrg
2607*ec02198aSmrg loop_vec_info epilogue_vinfo = NULL;
2608*ec02198aSmrg if (vect_epilogues)
2609*ec02198aSmrg {
2610*ec02198aSmrg epilogue_vinfo = loop_vinfo->epilogue_vinfos[0];
2611*ec02198aSmrg loop_vinfo->epilogue_vinfos.ordered_remove (0);
2612*ec02198aSmrg }
2613*ec02198aSmrg
2614*ec02198aSmrg tree niters_vector_mult_vf = NULL_TREE;
2615*ec02198aSmrg /* Saving NITERs before the loop, as this may be changed by prologue. */
2616*ec02198aSmrg tree before_loop_niters = LOOP_VINFO_NITERS (loop_vinfo);
2617*ec02198aSmrg edge update_e = NULL, skip_e = NULL;
2618*ec02198aSmrg unsigned int lowest_vf = constant_lower_bound (vf);
2619*ec02198aSmrg /* If we know the number of scalar iterations for the main loop we should
2620*ec02198aSmrg check whether after the main loop there are enough iterations left over
2621*ec02198aSmrg for the epilogue. */
2622*ec02198aSmrg if (vect_epilogues
2623*ec02198aSmrg && LOOP_VINFO_NITERS_KNOWN_P (loop_vinfo)
2624*ec02198aSmrg && prolog_peeling >= 0
2625*ec02198aSmrg && known_eq (vf, lowest_vf))
2626*ec02198aSmrg {
2627*ec02198aSmrg unsigned HOST_WIDE_INT eiters
2628*ec02198aSmrg = (LOOP_VINFO_INT_NITERS (loop_vinfo)
2629*ec02198aSmrg - LOOP_VINFO_PEELING_FOR_GAPS (loop_vinfo));
2630*ec02198aSmrg
2631*ec02198aSmrg eiters -= prolog_peeling;
2632*ec02198aSmrg eiters
2633*ec02198aSmrg = eiters % lowest_vf + LOOP_VINFO_PEELING_FOR_GAPS (loop_vinfo);
2634*ec02198aSmrg
2635*ec02198aSmrg unsigned int ratio;
2636*ec02198aSmrg unsigned int epilogue_gaps
2637*ec02198aSmrg = LOOP_VINFO_PEELING_FOR_GAPS (epilogue_vinfo);
2638*ec02198aSmrg while (!(constant_multiple_p
2639*ec02198aSmrg (GET_MODE_SIZE (loop_vinfo->vector_mode),
2640*ec02198aSmrg GET_MODE_SIZE (epilogue_vinfo->vector_mode), &ratio)
2641*ec02198aSmrg && eiters >= lowest_vf / ratio + epilogue_gaps))
2642*ec02198aSmrg {
2643*ec02198aSmrg delete epilogue_vinfo;
2644*ec02198aSmrg epilogue_vinfo = NULL;
2645*ec02198aSmrg if (loop_vinfo->epilogue_vinfos.length () == 0)
2646*ec02198aSmrg {
2647*ec02198aSmrg vect_epilogues = false;
2648*ec02198aSmrg break;
2649*ec02198aSmrg }
2650*ec02198aSmrg epilogue_vinfo = loop_vinfo->epilogue_vinfos[0];
2651*ec02198aSmrg loop_vinfo->epilogue_vinfos.ordered_remove (0);
2652*ec02198aSmrg epilogue_gaps = LOOP_VINFO_PEELING_FOR_GAPS (epilogue_vinfo);
2653*ec02198aSmrg }
2654*ec02198aSmrg }
265510d565efSmrg /* Prolog loop may be skipped. */
265610d565efSmrg bool skip_prolog = (prolog_peeling != 0);
2657*ec02198aSmrg /* Skip this loop to epilog when there are not enough iterations to enter this
2658*ec02198aSmrg vectorized loop. If true we should perform runtime checks on the NITERS
2659*ec02198aSmrg to check whether we should skip the current vectorized loop. If we know
2660*ec02198aSmrg the number of scalar iterations we may choose to add a runtime check if
2661*ec02198aSmrg this number "maybe" smaller than the number of iterations required
2662*ec02198aSmrg when we know the number of scalar iterations may potentially
2663*ec02198aSmrg be smaller than the number of iterations required to enter this loop, for
2664*ec02198aSmrg this we use the upper bounds on the prolog and epilog peeling. When we
2665*ec02198aSmrg don't know the number of iterations and don't require versioning it is
2666*ec02198aSmrg because we have asserted that there are enough scalar iterations to enter
2667*ec02198aSmrg the main loop, so this skip is not necessary. When we are versioning then
2668*ec02198aSmrg we only add such a skip if we have chosen to vectorize the epilogue. */
2669c7a68eb7Smrg bool skip_vector = (LOOP_VINFO_NITERS_KNOWN_P (loop_vinfo)
2670c7a68eb7Smrg ? maybe_lt (LOOP_VINFO_INT_NITERS (loop_vinfo),
2671c7a68eb7Smrg bound_prolog + bound_epilog)
2672*ec02198aSmrg : (!LOOP_REQUIRES_VERSIONING (loop_vinfo)
2673*ec02198aSmrg || vect_epilogues));
267410d565efSmrg /* Epilog loop must be executed if the number of iterations for epilog
267510d565efSmrg loop is known at compile time, otherwise we need to add a check at
267610d565efSmrg the end of vector loop and skip to the end of epilog loop. */
267710d565efSmrg bool skip_epilog = (prolog_peeling < 0
2678c7a68eb7Smrg || !LOOP_VINFO_NITERS_KNOWN_P (loop_vinfo)
2679c7a68eb7Smrg || !vf.is_constant ());
268010d565efSmrg /* PEELING_FOR_GAPS is special because epilog loop must be executed. */
268110d565efSmrg if (LOOP_VINFO_PEELING_FOR_GAPS (loop_vinfo))
268210d565efSmrg skip_epilog = false;
268310d565efSmrg
268410d565efSmrg if (skip_vector)
268510d565efSmrg {
268610d565efSmrg split_edge (loop_preheader_edge (loop));
268710d565efSmrg
268810d565efSmrg /* Due to the order in which we peel prolog and epilog, we first
268910d565efSmrg propagate probability to the whole loop. The purpose is to
269010d565efSmrg avoid adjusting probabilities of both prolog and vector loops
269110d565efSmrg separately. Note in this case, the probability of epilog loop
269210d565efSmrg needs to be scaled back later. */
269310d565efSmrg basic_block bb_before_loop = loop_preheader_edge (loop)->src;
2694c7a68eb7Smrg if (prob_vector.initialized_p ())
2695c7a68eb7Smrg {
2696c7a68eb7Smrg scale_bbs_frequencies (&bb_before_loop, 1, prob_vector);
2697c7a68eb7Smrg scale_loop_profile (loop, prob_vector, 0);
2698c7a68eb7Smrg }
269910d565efSmrg }
270010d565efSmrg
27010fc04c29Smrg dump_user_location_t loop_loc = find_loop_location (loop);
2702*ec02198aSmrg class loop *scalar_loop = LOOP_VINFO_SCALAR_LOOP (loop_vinfo);
2703*ec02198aSmrg if (vect_epilogues)
2704*ec02198aSmrg /* Make sure to set the epilogue's epilogue scalar loop, such that we can
2705*ec02198aSmrg use the original scalar loop as remaining epilogue if necessary. */
2706*ec02198aSmrg LOOP_VINFO_SCALAR_LOOP (epilogue_vinfo)
2707*ec02198aSmrg = LOOP_VINFO_SCALAR_LOOP (loop_vinfo);
2708*ec02198aSmrg
270910d565efSmrg if (prolog_peeling)
271010d565efSmrg {
271110d565efSmrg e = loop_preheader_edge (loop);
271210d565efSmrg if (!slpeel_can_duplicate_loop_p (loop, e))
271310d565efSmrg {
271410d565efSmrg dump_printf_loc (MSG_MISSED_OPTIMIZATION, loop_loc,
271510d565efSmrg "loop can't be duplicated to preheader edge.\n");
271610d565efSmrg gcc_unreachable ();
271710d565efSmrg }
271810d565efSmrg /* Peel prolog and put it on preheader edge of loop. */
271910d565efSmrg prolog = slpeel_tree_duplicate_loop_to_edge_cfg (loop, scalar_loop, e);
272010d565efSmrg if (!prolog)
272110d565efSmrg {
272210d565efSmrg dump_printf_loc (MSG_MISSED_OPTIMIZATION, loop_loc,
272310d565efSmrg "slpeel_tree_duplicate_loop_to_edge_cfg failed.\n");
272410d565efSmrg gcc_unreachable ();
272510d565efSmrg }
27260fc04c29Smrg prolog->force_vectorize = false;
272710d565efSmrg slpeel_update_phi_nodes_for_loops (loop_vinfo, prolog, loop, true);
272810d565efSmrg first_loop = prolog;
272910d565efSmrg reset_original_copy_tables ();
273010d565efSmrg
2731c7a68eb7Smrg /* Update the number of iterations for prolog loop. */
2732c7a68eb7Smrg tree step_prolog = build_one_cst (TREE_TYPE (niters_prolog));
2733c7a68eb7Smrg vect_set_loop_condition (prolog, NULL, niters_prolog,
2734c7a68eb7Smrg step_prolog, NULL_TREE, false);
273510d565efSmrg
273610d565efSmrg /* Skip the prolog loop. */
273710d565efSmrg if (skip_prolog)
273810d565efSmrg {
273910d565efSmrg guard_cond = fold_build2 (EQ_EXPR, boolean_type_node,
274010d565efSmrg niters_prolog, build_int_cst (type, 0));
274110d565efSmrg guard_bb = loop_preheader_edge (prolog)->src;
274210d565efSmrg basic_block bb_after_prolog = loop_preheader_edge (loop)->src;
274310d565efSmrg guard_to = split_edge (loop_preheader_edge (loop));
274410d565efSmrg guard_e = slpeel_add_loop_guard (guard_bb, guard_cond,
274510d565efSmrg guard_to, guard_bb,
2746c7a68eb7Smrg prob_prolog.invert (),
274710d565efSmrg irred_flag);
274810d565efSmrg e = EDGE_PRED (guard_to, 0);
274910d565efSmrg e = (e != guard_e ? e : EDGE_PRED (guard_to, 1));
275010d565efSmrg slpeel_update_phi_nodes_for_guard1 (prolog, loop, guard_e, e);
275110d565efSmrg
2752c7a68eb7Smrg scale_bbs_frequencies (&bb_after_prolog, 1, prob_prolog);
275310d565efSmrg scale_loop_profile (prolog, prob_prolog, bound_prolog);
275410d565efSmrg }
2755*ec02198aSmrg
275610d565efSmrg /* Update init address of DRs. */
2757c7a68eb7Smrg vect_update_inits_of_drs (loop_vinfo, niters_prolog, PLUS_EXPR);
275810d565efSmrg /* Update niters for vector loop. */
275910d565efSmrg LOOP_VINFO_NITERS (loop_vinfo)
276010d565efSmrg = fold_build2 (MINUS_EXPR, type, niters, niters_prolog);
276110d565efSmrg LOOP_VINFO_NITERSM1 (loop_vinfo)
276210d565efSmrg = fold_build2 (MINUS_EXPR, type,
276310d565efSmrg LOOP_VINFO_NITERSM1 (loop_vinfo), niters_prolog);
2764c7a68eb7Smrg bool new_var_p = false;
2765c7a68eb7Smrg niters = vect_build_loop_niters (loop_vinfo, &new_var_p);
2766c7a68eb7Smrg /* It's guaranteed that vector loop bound before vectorization is at
2767c7a68eb7Smrg least VF, so set range information for newly generated var. */
2768c7a68eb7Smrg if (new_var_p)
2769c7a68eb7Smrg set_range_info (niters, VR_RANGE,
2770c7a68eb7Smrg wi::to_wide (build_int_cst (type, vf)),
2771c7a68eb7Smrg wi::to_wide (TYPE_MAX_VALUE (type)));
277210d565efSmrg
277310d565efSmrg /* Prolog iterates at most bound_prolog times, latch iterates at
277410d565efSmrg most bound_prolog - 1 times. */
277510d565efSmrg record_niter_bound (prolog, bound_prolog - 1, false, true);
277610d565efSmrg delete_update_ssa ();
277710d565efSmrg adjust_vec_debug_stmts ();
277810d565efSmrg scev_reset ();
277910d565efSmrg }
278010d565efSmrg
278110d565efSmrg if (epilog_peeling)
278210d565efSmrg {
278310d565efSmrg e = single_exit (loop);
278410d565efSmrg if (!slpeel_can_duplicate_loop_p (loop, e))
278510d565efSmrg {
278610d565efSmrg dump_printf_loc (MSG_MISSED_OPTIMIZATION, loop_loc,
278710d565efSmrg "loop can't be duplicated to exit edge.\n");
278810d565efSmrg gcc_unreachable ();
278910d565efSmrg }
2790*ec02198aSmrg /* Peel epilog and put it on exit edge of loop. If we are vectorizing
2791*ec02198aSmrg said epilog then we should use a copy of the main loop as a starting
2792*ec02198aSmrg point. This loop may have already had some preliminary transformations
2793*ec02198aSmrg to allow for more optimal vectorization, for example if-conversion.
2794*ec02198aSmrg If we are not vectorizing the epilog then we should use the scalar loop
2795*ec02198aSmrg as the transformations mentioned above make less or no sense when not
2796*ec02198aSmrg vectorizing. */
2797*ec02198aSmrg epilog = vect_epilogues ? get_loop_copy (loop) : scalar_loop;
2798*ec02198aSmrg if (vop_to_rename)
2799*ec02198aSmrg {
2800*ec02198aSmrg /* Vectorizing the main loop can sometimes introduce a vdef to
2801*ec02198aSmrg a loop that previously didn't have one; see the comment above
2802*ec02198aSmrg the definition of VOP_TO_RENAME for details. The definition
2803*ec02198aSmrg D that holds on E will then be different from the definition
2804*ec02198aSmrg VOP_TO_RENAME that holds during SCALAR_LOOP, so we need to
2805*ec02198aSmrg rename VOP_TO_RENAME to D when copying the loop.
2806*ec02198aSmrg
2807*ec02198aSmrg The virtual operand is in LCSSA form for the main loop,
2808*ec02198aSmrg and no stmt between the main loop and E needs a vdef,
2809*ec02198aSmrg so we know that D is provided by a phi rather than by a
2810*ec02198aSmrg vdef on a normal gimple stmt. */
2811*ec02198aSmrg basic_block vdef_bb = e->src;
2812*ec02198aSmrg gphi *vphi;
2813*ec02198aSmrg while (!(vphi = get_virtual_phi (vdef_bb)))
2814*ec02198aSmrg vdef_bb = get_immediate_dominator (CDI_DOMINATORS, vdef_bb);
2815*ec02198aSmrg gcc_assert (vop_to_rename != gimple_phi_result (vphi));
2816*ec02198aSmrg set_current_def (vop_to_rename, gimple_phi_result (vphi));
2817*ec02198aSmrg }
2818*ec02198aSmrg epilog = slpeel_tree_duplicate_loop_to_edge_cfg (loop, epilog, e);
281910d565efSmrg if (!epilog)
282010d565efSmrg {
282110d565efSmrg dump_printf_loc (MSG_MISSED_OPTIMIZATION, loop_loc,
282210d565efSmrg "slpeel_tree_duplicate_loop_to_edge_cfg failed.\n");
282310d565efSmrg gcc_unreachable ();
282410d565efSmrg }
28250fc04c29Smrg epilog->force_vectorize = false;
282610d565efSmrg slpeel_update_phi_nodes_for_loops (loop_vinfo, loop, epilog, false);
282710d565efSmrg
282810d565efSmrg /* Scalar version loop may be preferred. In this case, add guard
282910d565efSmrg and skip to epilog. Note this only happens when the number of
283010d565efSmrg iterations of loop is unknown at compile time, otherwise this
283110d565efSmrg won't be vectorized. */
283210d565efSmrg if (skip_vector)
283310d565efSmrg {
283410d565efSmrg /* Additional epilogue iteration is peeled if gap exists. */
283510d565efSmrg tree t = vect_gen_scalar_loop_niters (niters_prolog, prolog_peeling,
2836c7a68eb7Smrg bound_prolog, bound_epilog,
283710d565efSmrg th, &bound_scalar,
283810d565efSmrg check_profitability);
283910d565efSmrg /* Build guard against NITERSM1 since NITERS may overflow. */
284010d565efSmrg guard_cond = fold_build2 (LT_EXPR, boolean_type_node, nitersm1, t);
284110d565efSmrg guard_bb = anchor;
284210d565efSmrg guard_to = split_edge (loop_preheader_edge (epilog));
284310d565efSmrg guard_e = slpeel_add_loop_guard (guard_bb, guard_cond,
284410d565efSmrg guard_to, guard_bb,
2845c7a68eb7Smrg prob_vector.invert (),
284610d565efSmrg irred_flag);
2847*ec02198aSmrg skip_e = guard_e;
284810d565efSmrg e = EDGE_PRED (guard_to, 0);
284910d565efSmrg e = (e != guard_e ? e : EDGE_PRED (guard_to, 1));
285010d565efSmrg slpeel_update_phi_nodes_for_guard1 (first_loop, epilog, guard_e, e);
285110d565efSmrg
285210d565efSmrg /* Simply propagate profile info from guard_bb to guard_to which is
285310d565efSmrg a merge point of control flow. */
285410d565efSmrg guard_to->count = guard_bb->count;
2855c7a68eb7Smrg
2856c7a68eb7Smrg /* Scale probability of epilog loop back.
2857c7a68eb7Smrg FIXME: We should avoid scaling down and back up. Profile may
2858c7a68eb7Smrg get lost if we scale down to 0. */
2859c7a68eb7Smrg basic_block *bbs = get_loop_body (epilog);
2860c7a68eb7Smrg for (unsigned int i = 0; i < epilog->num_nodes; i++)
2861c7a68eb7Smrg bbs[i]->count = bbs[i]->count.apply_scale
2862c7a68eb7Smrg (bbs[i]->count,
2863c7a68eb7Smrg bbs[i]->count.apply_probability
2864c7a68eb7Smrg (prob_vector));
2865c7a68eb7Smrg free (bbs);
286610d565efSmrg }
286710d565efSmrg
286810d565efSmrg basic_block bb_before_epilog = loop_preheader_edge (epilog)->src;
286910d565efSmrg /* If loop is peeled for non-zero constant times, now niters refers to
287010d565efSmrg orig_niters - prolog_peeling, it won't overflow even the orig_niters
287110d565efSmrg overflows. */
287210d565efSmrg niters_no_overflow |= (prolog_peeling > 0);
287310d565efSmrg vect_gen_vector_loop_niters (loop_vinfo, niters,
2874c7a68eb7Smrg niters_vector, step_vector,
2875c7a68eb7Smrg niters_no_overflow);
2876c7a68eb7Smrg if (!integer_onep (*step_vector))
2877c7a68eb7Smrg {
2878c7a68eb7Smrg 	  /* On exit from the loop we will have an easy way of calculating
2879c7a68eb7Smrg NITERS_VECTOR / STEP * STEP. Install a dummy definition
2880c7a68eb7Smrg until then. */
2881c7a68eb7Smrg niters_vector_mult_vf = make_ssa_name (TREE_TYPE (*niters_vector));
2882c7a68eb7Smrg SSA_NAME_DEF_STMT (niters_vector_mult_vf) = gimple_build_nop ();
2883c7a68eb7Smrg *niters_vector_mult_vf_var = niters_vector_mult_vf;
2884c7a68eb7Smrg }
2885c7a68eb7Smrg else
288610d565efSmrg vect_gen_vector_loop_niters_mult_vf (loop_vinfo, *niters_vector,
288710d565efSmrg &niters_vector_mult_vf);
288810d565efSmrg /* Update IVs of original loop as if they were advanced by
288910d565efSmrg niters_vector_mult_vf steps. */
289010d565efSmrg gcc_checking_assert (vect_can_advance_ivs_p (loop_vinfo));
2891*ec02198aSmrg update_e = skip_vector ? e : loop_preheader_edge (epilog);
289210d565efSmrg vect_update_ivs_after_vectorizer (loop_vinfo, niters_vector_mult_vf,
289310d565efSmrg update_e);
289410d565efSmrg
289510d565efSmrg if (skip_epilog)
289610d565efSmrg {
289710d565efSmrg guard_cond = fold_build2 (EQ_EXPR, boolean_type_node,
289810d565efSmrg niters, niters_vector_mult_vf);
289910d565efSmrg guard_bb = single_exit (loop)->dest;
290010d565efSmrg guard_to = split_edge (single_exit (epilog));
290110d565efSmrg guard_e = slpeel_add_loop_guard (guard_bb, guard_cond, guard_to,
290210d565efSmrg skip_vector ? anchor : guard_bb,
2903c7a68eb7Smrg prob_epilog.invert (),
290410d565efSmrg irred_flag);
290510d565efSmrg slpeel_update_phi_nodes_for_guard2 (loop, epilog, guard_e,
290610d565efSmrg single_exit (epilog));
290710d565efSmrg /* Only need to handle basic block before epilog loop if it's not
290810d565efSmrg the guard_bb, which is the case when skip_vector is true. */
290910d565efSmrg if (guard_bb != bb_before_epilog)
291010d565efSmrg {
2911c7a68eb7Smrg prob_epilog = prob_vector * prob_epilog + prob_vector.invert ();
291210d565efSmrg
2913c7a68eb7Smrg scale_bbs_frequencies (&bb_before_epilog, 1, prob_epilog);
291410d565efSmrg }
2915c7a68eb7Smrg scale_loop_profile (epilog, prob_epilog, 0);
291610d565efSmrg }
291710d565efSmrg else
291810d565efSmrg slpeel_update_phi_nodes_for_lcssa (epilog);
291910d565efSmrg
2920c7a68eb7Smrg unsigned HOST_WIDE_INT bound;
2921c7a68eb7Smrg if (bound_scalar.is_constant (&bound))
2922c7a68eb7Smrg {
2923c7a68eb7Smrg gcc_assert (bound != 0);
2924c7a68eb7Smrg /* -1 to convert loop iterations to latch iterations. */
2925c7a68eb7Smrg record_niter_bound (epilog, bound - 1, false, true);
2926c7a68eb7Smrg }
292710d565efSmrg
292810d565efSmrg delete_update_ssa ();
292910d565efSmrg adjust_vec_debug_stmts ();
293010d565efSmrg scev_reset ();
293110d565efSmrg }
2932*ec02198aSmrg
2933*ec02198aSmrg if (vect_epilogues)
2934*ec02198aSmrg {
2935*ec02198aSmrg epilog->aux = epilogue_vinfo;
2936*ec02198aSmrg LOOP_VINFO_LOOP (epilogue_vinfo) = epilog;
2937*ec02198aSmrg
2938*ec02198aSmrg loop_constraint_clear (epilog, LOOP_C_INFINITE);
2939*ec02198aSmrg
2940*ec02198aSmrg /* We now must calculate the number of NITERS performed by the previous
2941*ec02198aSmrg loop and EPILOGUE_NITERS to be performed by the epilogue. */
2942*ec02198aSmrg tree niters = fold_build2 (PLUS_EXPR, TREE_TYPE (niters_vector_mult_vf),
2943*ec02198aSmrg niters_prolog, niters_vector_mult_vf);
2944*ec02198aSmrg
2945*ec02198aSmrg /* If skip_vector we may skip the previous loop, we insert a phi-node to
2946*ec02198aSmrg determine whether we are coming from the previous vectorized loop
2947*ec02198aSmrg using the update_e edge or the skip_vector basic block using the
2948*ec02198aSmrg skip_e edge. */
2949*ec02198aSmrg if (skip_vector)
2950*ec02198aSmrg {
2951*ec02198aSmrg gcc_assert (update_e != NULL && skip_e != NULL);
2952*ec02198aSmrg gphi *new_phi = create_phi_node (make_ssa_name (TREE_TYPE (niters)),
2953*ec02198aSmrg update_e->dest);
2954*ec02198aSmrg tree new_ssa = make_ssa_name (TREE_TYPE (niters));
2955*ec02198aSmrg gimple *stmt = gimple_build_assign (new_ssa, niters);
2956*ec02198aSmrg gimple_stmt_iterator gsi;
2957*ec02198aSmrg if (TREE_CODE (niters_vector_mult_vf) == SSA_NAME
2958*ec02198aSmrg && SSA_NAME_DEF_STMT (niters_vector_mult_vf)->bb != NULL)
2959*ec02198aSmrg {
2960*ec02198aSmrg gsi = gsi_for_stmt (SSA_NAME_DEF_STMT (niters_vector_mult_vf));
2961*ec02198aSmrg gsi_insert_after (&gsi, stmt, GSI_NEW_STMT);
2962*ec02198aSmrg }
2963*ec02198aSmrg else
2964*ec02198aSmrg {
2965*ec02198aSmrg gsi = gsi_last_bb (update_e->src);
2966*ec02198aSmrg gsi_insert_before (&gsi, stmt, GSI_NEW_STMT);
2967*ec02198aSmrg }
2968*ec02198aSmrg
2969*ec02198aSmrg niters = new_ssa;
2970*ec02198aSmrg add_phi_arg (new_phi, niters, update_e, UNKNOWN_LOCATION);
2971*ec02198aSmrg add_phi_arg (new_phi, build_zero_cst (TREE_TYPE (niters)), skip_e,
2972*ec02198aSmrg UNKNOWN_LOCATION);
2973*ec02198aSmrg niters = PHI_RESULT (new_phi);
2974*ec02198aSmrg }
2975*ec02198aSmrg
2976*ec02198aSmrg /* Subtract the number of iterations performed by the vectorized loop
2977*ec02198aSmrg from the number of total iterations. */
2978*ec02198aSmrg tree epilogue_niters = fold_build2 (MINUS_EXPR, TREE_TYPE (niters),
2979*ec02198aSmrg before_loop_niters,
2980*ec02198aSmrg niters);
2981*ec02198aSmrg
2982*ec02198aSmrg LOOP_VINFO_NITERS (epilogue_vinfo) = epilogue_niters;
2983*ec02198aSmrg LOOP_VINFO_NITERSM1 (epilogue_vinfo)
2984*ec02198aSmrg = fold_build2 (MINUS_EXPR, TREE_TYPE (epilogue_niters),
2985*ec02198aSmrg epilogue_niters,
2986*ec02198aSmrg build_one_cst (TREE_TYPE (epilogue_niters)));
2987*ec02198aSmrg
2988*ec02198aSmrg /* Set ADVANCE to the number of iterations performed by the previous
2989*ec02198aSmrg loop and its prologue. */
2990*ec02198aSmrg *advance = niters;
2991*ec02198aSmrg
2992*ec02198aSmrg /* Redo the peeling for niter analysis as the NITERs and alignment
2993*ec02198aSmrg may have been updated to take the main loop into account. */
2994*ec02198aSmrg determine_peel_for_niter (epilogue_vinfo);
2995*ec02198aSmrg }
2996*ec02198aSmrg
299710d565efSmrg adjust_vec.release ();
299810d565efSmrg free_original_copy_tables ();
299910d565efSmrg
3000*ec02198aSmrg return vect_epilogues ? epilog : NULL;
300110d565efSmrg }
300210d565efSmrg
300310d565efSmrg /* Function vect_create_cond_for_niters_checks.
300410d565efSmrg
300510d565efSmrg Create a conditional expression that represents the run-time checks for
3006c7a68eb7Smrg loop's niter. The loop is guaranteed to terminate if the run-time
300710d565efSmrg checks hold.
300810d565efSmrg
300910d565efSmrg    Input:
301010d565efSmrg    COND_EXPR - input conditional expression.  The niter assumptions will
301110d565efSmrg 	       be chained with it using a logical AND operation.  If it is
301210d565efSmrg 	       NULL, the assumptions become the whole condition.
301310d565efSmrg    LOOP_VINFO - field LOOP_VINFO_NITERS_ASSUMPTIONS contains the run-time
301410d565efSmrg 	       assumptions to be checked.
301510d565efSmrg
301610d565efSmrg Output:
301710d565efSmrg COND_EXPR - conditional expression.
301810d565efSmrg
301910d565efSmrg The returned COND_EXPR is the conditional expression to be used in the
302010d565efSmrg if statement that controls which version of the loop gets executed at
302110d565efSmrg runtime. */
302210d565efSmrg
302310d565efSmrg static void
vect_create_cond_for_niters_checks(loop_vec_info loop_vinfo,tree * cond_expr)302410d565efSmrg vect_create_cond_for_niters_checks (loop_vec_info loop_vinfo, tree *cond_expr)
302510d565efSmrg {
302610d565efSmrg tree part_cond_expr = LOOP_VINFO_NITERS_ASSUMPTIONS (loop_vinfo);
302710d565efSmrg
302810d565efSmrg if (*cond_expr)
302910d565efSmrg *cond_expr = fold_build2 (TRUTH_AND_EXPR, boolean_type_node,
303010d565efSmrg *cond_expr, part_cond_expr);
303110d565efSmrg else
303210d565efSmrg *cond_expr = part_cond_expr;
303310d565efSmrg }
303410d565efSmrg
3035c7a68eb7Smrg /* Set *COND_EXPR to a tree that is true when both the original *COND_EXPR
3036c7a68eb7Smrg and PART_COND_EXPR are true. Treat a null *COND_EXPR as "true". */
3037c7a68eb7Smrg
3038c7a68eb7Smrg static void
chain_cond_expr(tree * cond_expr,tree part_cond_expr)3039c7a68eb7Smrg chain_cond_expr (tree *cond_expr, tree part_cond_expr)
3040c7a68eb7Smrg {
3041c7a68eb7Smrg if (*cond_expr)
3042c7a68eb7Smrg *cond_expr = fold_build2 (TRUTH_AND_EXPR, boolean_type_node,
3043c7a68eb7Smrg *cond_expr, part_cond_expr);
3044c7a68eb7Smrg else
3045c7a68eb7Smrg *cond_expr = part_cond_expr;
3046c7a68eb7Smrg }
3047c7a68eb7Smrg
304810d565efSmrg /* Function vect_create_cond_for_align_checks.
304910d565efSmrg
305010d565efSmrg Create a conditional expression that represents the alignment checks for
305110d565efSmrg all of data references (array element references) whose alignment must be
305210d565efSmrg checked at runtime.
305310d565efSmrg
305410d565efSmrg Input:
305510d565efSmrg COND_EXPR - input conditional expression. New conditions will be chained
305610d565efSmrg with logical AND operation.
305710d565efSmrg LOOP_VINFO - two fields of the loop information are used.
305810d565efSmrg LOOP_VINFO_PTR_MASK is the mask used to check the alignment.
305910d565efSmrg LOOP_VINFO_MAY_MISALIGN_STMTS contains the refs to be checked.
306010d565efSmrg
306110d565efSmrg Output:
306210d565efSmrg COND_EXPR_STMT_LIST - statements needed to construct the conditional
306310d565efSmrg expression.
306410d565efSmrg The returned value is the conditional expression to be used in the if
306510d565efSmrg statement that controls which version of the loop gets executed at runtime.
306610d565efSmrg
306710d565efSmrg The algorithm makes two assumptions:
306810d565efSmrg 1) The number of bytes "n" in a vector is a power of 2.
306910d565efSmrg 2) An address "a" is aligned if a%n is zero and that this
307010d565efSmrg test can be done as a&(n-1) == 0. For example, for 16
307110d565efSmrg byte vectors the test is a&0xf == 0. */
307210d565efSmrg
307310d565efSmrg static void
vect_create_cond_for_align_checks(loop_vec_info loop_vinfo,tree * cond_expr,gimple_seq * cond_expr_stmt_list)307410d565efSmrg vect_create_cond_for_align_checks (loop_vec_info loop_vinfo,
307510d565efSmrg tree *cond_expr,
307610d565efSmrg gimple_seq *cond_expr_stmt_list)
307710d565efSmrg {
30780fc04c29Smrg vec<stmt_vec_info> may_misalign_stmts
307910d565efSmrg = LOOP_VINFO_MAY_MISALIGN_STMTS (loop_vinfo);
30800fc04c29Smrg stmt_vec_info stmt_info;
308110d565efSmrg int mask = LOOP_VINFO_PTR_MASK (loop_vinfo);
308210d565efSmrg tree mask_cst;
308310d565efSmrg unsigned int i;
308410d565efSmrg tree int_ptrsize_type;
308510d565efSmrg char tmp_name[20];
308610d565efSmrg tree or_tmp_name = NULL_TREE;
308710d565efSmrg tree and_tmp_name;
308810d565efSmrg gimple *and_stmt;
308910d565efSmrg tree ptrsize_zero;
309010d565efSmrg tree part_cond_expr;
309110d565efSmrg
309210d565efSmrg /* Check that mask is one less than a power of 2, i.e., mask is
309310d565efSmrg all zeros followed by all ones. */
309410d565efSmrg gcc_assert ((mask != 0) && ((mask & (mask+1)) == 0));
309510d565efSmrg
309610d565efSmrg int_ptrsize_type = signed_type_for (ptr_type_node);
309710d565efSmrg
309810d565efSmrg /* Create expression (mask & (dr_1 || ... || dr_n)) where dr_i is the address
309910d565efSmrg of the first vector of the i'th data reference. */
310010d565efSmrg
31010fc04c29Smrg FOR_EACH_VEC_ELT (may_misalign_stmts, i, stmt_info)
310210d565efSmrg {
310310d565efSmrg gimple_seq new_stmt_list = NULL;
310410d565efSmrg tree addr_base;
310510d565efSmrg tree addr_tmp_name;
310610d565efSmrg tree new_or_tmp_name;
310710d565efSmrg gimple *addr_stmt, *or_stmt;
31080fc04c29Smrg tree vectype = STMT_VINFO_VECTYPE (stmt_info);
310910d565efSmrg bool negative = tree_int_cst_compare
31100fc04c29Smrg (DR_STEP (STMT_VINFO_DATA_REF (stmt_info)), size_zero_node) < 0;
311110d565efSmrg tree offset = negative
311210d565efSmrg ? size_int (-TYPE_VECTOR_SUBPARTS (vectype) + 1) : size_zero_node;
311310d565efSmrg
311410d565efSmrg /* create: addr_tmp = (int)(address_of_first_vector) */
311510d565efSmrg addr_base =
31160fc04c29Smrg vect_create_addr_base_for_vector_ref (stmt_info, &new_stmt_list,
3117c7a68eb7Smrg offset);
311810d565efSmrg if (new_stmt_list != NULL)
311910d565efSmrg gimple_seq_add_seq (cond_expr_stmt_list, new_stmt_list);
312010d565efSmrg
312110d565efSmrg sprintf (tmp_name, "addr2int%d", i);
312210d565efSmrg addr_tmp_name = make_temp_ssa_name (int_ptrsize_type, NULL, tmp_name);
312310d565efSmrg addr_stmt = gimple_build_assign (addr_tmp_name, NOP_EXPR, addr_base);
312410d565efSmrg gimple_seq_add_stmt (cond_expr_stmt_list, addr_stmt);
312510d565efSmrg
312610d565efSmrg /* The addresses are OR together. */
312710d565efSmrg
312810d565efSmrg if (or_tmp_name != NULL_TREE)
312910d565efSmrg {
313010d565efSmrg /* create: or_tmp = or_tmp | addr_tmp */
313110d565efSmrg sprintf (tmp_name, "orptrs%d", i);
313210d565efSmrg new_or_tmp_name = make_temp_ssa_name (int_ptrsize_type, NULL, tmp_name);
313310d565efSmrg or_stmt = gimple_build_assign (new_or_tmp_name, BIT_IOR_EXPR,
313410d565efSmrg or_tmp_name, addr_tmp_name);
313510d565efSmrg gimple_seq_add_stmt (cond_expr_stmt_list, or_stmt);
313610d565efSmrg or_tmp_name = new_or_tmp_name;
313710d565efSmrg }
313810d565efSmrg else
313910d565efSmrg or_tmp_name = addr_tmp_name;
314010d565efSmrg
314110d565efSmrg } /* end for i */
314210d565efSmrg
314310d565efSmrg mask_cst = build_int_cst (int_ptrsize_type, mask);
314410d565efSmrg
314510d565efSmrg /* create: and_tmp = or_tmp & mask */
314610d565efSmrg and_tmp_name = make_temp_ssa_name (int_ptrsize_type, NULL, "andmask");
314710d565efSmrg
314810d565efSmrg and_stmt = gimple_build_assign (and_tmp_name, BIT_AND_EXPR,
314910d565efSmrg or_tmp_name, mask_cst);
315010d565efSmrg gimple_seq_add_stmt (cond_expr_stmt_list, and_stmt);
315110d565efSmrg
315210d565efSmrg /* Make and_tmp the left operand of the conditional test against zero.
315310d565efSmrg if and_tmp has a nonzero bit then some address is unaligned. */
315410d565efSmrg ptrsize_zero = build_int_cst (int_ptrsize_type, 0);
315510d565efSmrg part_cond_expr = fold_build2 (EQ_EXPR, boolean_type_node,
315610d565efSmrg and_tmp_name, ptrsize_zero);
3157c7a68eb7Smrg chain_cond_expr (cond_expr, part_cond_expr);
315810d565efSmrg }
315910d565efSmrg
3160c7a68eb7Smrg /* If LOOP_VINFO_CHECK_UNEQUAL_ADDRS contains <A1, B1>, ..., <An, Bn>,
3161c7a68eb7Smrg create a tree representation of: (&A1 != &B1) && ... && (&An != &Bn).
3162c7a68eb7Smrg Set *COND_EXPR to a tree that is true when both the original *COND_EXPR
3163c7a68eb7Smrg and this new condition are true. Treat a null *COND_EXPR as "true". */
316410d565efSmrg
316510d565efSmrg static void
vect_create_cond_for_unequal_addrs(loop_vec_info loop_vinfo,tree * cond_expr)3166c7a68eb7Smrg vect_create_cond_for_unequal_addrs (loop_vec_info loop_vinfo, tree *cond_expr)
316710d565efSmrg {
3168c7a68eb7Smrg vec<vec_object_pair> pairs = LOOP_VINFO_CHECK_UNEQUAL_ADDRS (loop_vinfo);
3169c7a68eb7Smrg unsigned int i;
3170c7a68eb7Smrg vec_object_pair *pair;
3171c7a68eb7Smrg FOR_EACH_VEC_ELT (pairs, i, pair)
317210d565efSmrg {
3173c7a68eb7Smrg tree addr1 = build_fold_addr_expr (pair->first);
3174c7a68eb7Smrg tree addr2 = build_fold_addr_expr (pair->second);
3175c7a68eb7Smrg tree part_cond_expr = fold_build2 (NE_EXPR, boolean_type_node,
3176c7a68eb7Smrg addr1, addr2);
3177c7a68eb7Smrg chain_cond_expr (cond_expr, part_cond_expr);
3178c7a68eb7Smrg }
317910d565efSmrg }
318010d565efSmrg
3181c7a68eb7Smrg /* Create an expression that is true when all lower-bound conditions for
3182c7a68eb7Smrg the vectorized loop are met. Chain this condition with *COND_EXPR. */
3183c7a68eb7Smrg
3184c7a68eb7Smrg static void
vect_create_cond_for_lower_bounds(loop_vec_info loop_vinfo,tree * cond_expr)3185c7a68eb7Smrg vect_create_cond_for_lower_bounds (loop_vec_info loop_vinfo, tree *cond_expr)
318610d565efSmrg {
3187c7a68eb7Smrg vec<vec_lower_bound> lower_bounds = LOOP_VINFO_LOWER_BOUNDS (loop_vinfo);
3188c7a68eb7Smrg for (unsigned int i = 0; i < lower_bounds.length (); ++i)
3189c7a68eb7Smrg {
3190c7a68eb7Smrg tree expr = lower_bounds[i].expr;
3191c7a68eb7Smrg tree type = unsigned_type_for (TREE_TYPE (expr));
3192c7a68eb7Smrg expr = fold_convert (type, expr);
3193c7a68eb7Smrg poly_uint64 bound = lower_bounds[i].min_value;
3194c7a68eb7Smrg if (!lower_bounds[i].unsigned_p)
3195c7a68eb7Smrg {
3196c7a68eb7Smrg expr = fold_build2 (PLUS_EXPR, type, expr,
3197c7a68eb7Smrg build_int_cstu (type, bound - 1));
3198c7a68eb7Smrg bound += bound - 1;
319910d565efSmrg }
3200c7a68eb7Smrg tree part_cond_expr = fold_build2 (GE_EXPR, boolean_type_node, expr,
3201c7a68eb7Smrg build_int_cstu (type, bound));
3202c7a68eb7Smrg chain_cond_expr (cond_expr, part_cond_expr);
3203c7a68eb7Smrg }
320410d565efSmrg }
320510d565efSmrg
320610d565efSmrg /* Function vect_create_cond_for_alias_checks.
320710d565efSmrg
320810d565efSmrg Create a conditional expression that represents the run-time checks for
320910d565efSmrg overlapping of address ranges represented by a list of data references
321010d565efSmrg relations passed as input.
321110d565efSmrg
321210d565efSmrg Input:
321310d565efSmrg COND_EXPR - input conditional expression. New conditions will be chained
321410d565efSmrg with logical AND operation. If it is NULL, then the function
321510d565efSmrg is used to return the number of alias checks.
321610d565efSmrg LOOP_VINFO - field LOOP_VINFO_MAY_ALIAS_STMTS contains the list of ddrs
321710d565efSmrg to be checked.
321810d565efSmrg
321910d565efSmrg Output:
322010d565efSmrg COND_EXPR - conditional expression.
322110d565efSmrg
322210d565efSmrg The returned COND_EXPR is the conditional expression to be used in the if
322310d565efSmrg statement that controls which version of the loop gets executed at runtime.
322410d565efSmrg */
322510d565efSmrg
322610d565efSmrg void
vect_create_cond_for_alias_checks(loop_vec_info loop_vinfo,tree * cond_expr)322710d565efSmrg vect_create_cond_for_alias_checks (loop_vec_info loop_vinfo, tree * cond_expr)
322810d565efSmrg {
322910d565efSmrg vec<dr_with_seg_len_pair_t> comp_alias_ddrs =
323010d565efSmrg LOOP_VINFO_COMP_ALIAS_DDRS (loop_vinfo);
323110d565efSmrg
323210d565efSmrg if (comp_alias_ddrs.is_empty ())
323310d565efSmrg return;
323410d565efSmrg
3235c7a68eb7Smrg create_runtime_alias_checks (LOOP_VINFO_LOOP (loop_vinfo),
3236c7a68eb7Smrg &comp_alias_ddrs, cond_expr);
323710d565efSmrg if (dump_enabled_p ())
323810d565efSmrg dump_printf_loc (MSG_NOTE, vect_location,
323910d565efSmrg "created %u versioning for alias checks.\n",
324010d565efSmrg comp_alias_ddrs.length ());
324110d565efSmrg }
324210d565efSmrg
324310d565efSmrg
324410d565efSmrg /* Function vect_loop_versioning.
324510d565efSmrg
324610d565efSmrg If the loop has data references that may or may not be aligned or/and
324710d565efSmrg has data reference relations whose independence was not proven then
324810d565efSmrg two versions of the loop need to be generated, one which is vectorized
324910d565efSmrg and one which isn't. A test is then generated to control which of the
325010d565efSmrg loops is executed. The test checks for the alignment of all of the
325110d565efSmrg data references that may or may not be aligned. An additional
325210d565efSmrg sequence of runtime tests is generated for each pairs of DDRs whose
325310d565efSmrg independence was not proven. The vectorized version of loop is
325410d565efSmrg executed only if both alias and alignment tests are passed.
325510d565efSmrg
325610d565efSmrg The test generated to check which version of loop is executed
325710d565efSmrg is modified to also check for profitability as indicated by the
325810d565efSmrg cost model threshold TH.
325910d565efSmrg
326010d565efSmrg The versioning precondition(s) are placed in *COND_EXPR and
326110d565efSmrg *COND_EXPR_STMT_LIST. */
326210d565efSmrg
3263*ec02198aSmrg class loop *
vect_loop_versioning(loop_vec_info loop_vinfo,gimple * loop_vectorized_call)326410d565efSmrg vect_loop_versioning (loop_vec_info loop_vinfo,
3265*ec02198aSmrg gimple *loop_vectorized_call)
326610d565efSmrg {
3267*ec02198aSmrg class loop *loop = LOOP_VINFO_LOOP (loop_vinfo), *nloop;
3268*ec02198aSmrg class loop *scalar_loop = LOOP_VINFO_SCALAR_LOOP (loop_vinfo);
326910d565efSmrg basic_block condition_bb;
327010d565efSmrg gphi_iterator gsi;
327110d565efSmrg gimple_stmt_iterator cond_exp_gsi;
327210d565efSmrg basic_block merge_bb;
327310d565efSmrg basic_block new_exit_bb;
327410d565efSmrg edge new_exit_e, e;
327510d565efSmrg gphi *orig_phi, *new_phi;
327610d565efSmrg tree cond_expr = NULL_TREE;
327710d565efSmrg gimple_seq cond_expr_stmt_list = NULL;
327810d565efSmrg tree arg;
3279c7a68eb7Smrg profile_probability prob = profile_probability::likely ();
328010d565efSmrg gimple_seq gimplify_stmt_list = NULL;
  /* NITERSM1 is the scalar iteration count minus one; the threshold
     comparisons below are against TH - 1 for this reason.  */
3281c7a68eb7Smrg tree scalar_loop_iters = LOOP_VINFO_NITERSM1 (loop_vinfo);
328210d565efSmrg bool version_align = LOOP_REQUIRES_VERSIONING_FOR_ALIGNMENT (loop_vinfo);
328310d565efSmrg bool version_alias = LOOP_REQUIRES_VERSIONING_FOR_ALIAS (loop_vinfo);
328410d565efSmrg bool version_niter = LOOP_REQUIRES_VERSIONING_FOR_NITERS (loop_vinfo);
3285*ec02198aSmrg poly_uint64 versioning_threshold
3286*ec02198aSmrg = LOOP_VINFO_VERSIONING_THRESHOLD (loop_vinfo);
32870fc04c29Smrg tree version_simd_if_cond
32880fc04c29Smrg = LOOP_REQUIRES_VERSIONING_FOR_SIMD_IF_COND (loop_vinfo);
3289*ec02198aSmrg unsigned th = LOOP_VINFO_COST_MODEL_THRESHOLD (loop_vinfo);
329010d565efSmrg
  /* Build the runtime profitability check (niters - 1 >= TH - 1, i.e.
     niters >= TH) unless it is already subsumed by the versioning
     threshold (ordered_p) handled just below.  */
3291*ec02198aSmrg if (vect_apply_runtime_profitability_check_p (loop_vinfo)
3292*ec02198aSmrg && !ordered_p (th, versioning_threshold))
3293c7a68eb7Smrg cond_expr = fold_build2 (GE_EXPR, boolean_type_node, scalar_loop_iters,
329410d565efSmrg build_int_cst (TREE_TYPE (scalar_loop_iters),
3295c7a68eb7Smrg th - 1));
  /* Fold in the versioning threshold check, AND-ing it with any
     profitability check built above.  */
3296c7a68eb7Smrg if (maybe_ne (versioning_threshold, 0U))
3297c7a68eb7Smrg {
3298c7a68eb7Smrg tree expr = fold_build2 (GE_EXPR, boolean_type_node, scalar_loop_iters,
3299c7a68eb7Smrg build_int_cst (TREE_TYPE (scalar_loop_iters),
3300c7a68eb7Smrg versioning_threshold - 1));
3301c7a68eb7Smrg if (cond_expr)
3302c7a68eb7Smrg cond_expr = fold_build2 (BIT_AND_EXPR, boolean_type_node,
3303c7a68eb7Smrg expr, cond_expr);
3304c7a68eb7Smrg else
3305c7a68eb7Smrg cond_expr = expr;
3306c7a68eb7Smrg }
330710d565efSmrg
  /* Accumulate the remaining versioning preconditions into COND_EXPR
     (and any statements they need into COND_EXPR_STMT_LIST).  */
330810d565efSmrg if (version_niter)
330910d565efSmrg vect_create_cond_for_niters_checks (loop_vinfo, &cond_expr);
331010d565efSmrg
331110d565efSmrg if (cond_expr)
3312*ec02198aSmrg cond_expr = force_gimple_operand_1 (unshare_expr (cond_expr),
3313*ec02198aSmrg &cond_expr_stmt_list,
331410d565efSmrg is_gimple_condexpr, NULL_TREE);
331510d565efSmrg
331610d565efSmrg if (version_align)
331710d565efSmrg vect_create_cond_for_align_checks (loop_vinfo, &cond_expr,
331810d565efSmrg &cond_expr_stmt_list);
331910d565efSmrg
332010d565efSmrg if (version_alias)
3321c7a68eb7Smrg {
3322c7a68eb7Smrg vect_create_cond_for_unequal_addrs (loop_vinfo, &cond_expr);
3323c7a68eb7Smrg vect_create_cond_for_lower_bounds (loop_vinfo, &cond_expr);
332410d565efSmrg vect_create_cond_for_alias_checks (loop_vinfo, &cond_expr);
3325c7a68eb7Smrg }
332610d565efSmrg
  /* OpenMP "if (simd: ...)" versioning: test the recorded SSA name
     against zero.  With checking enabled, assert its definition
     dominates both loop headers so the test is valid at the split.  */
33270fc04c29Smrg if (version_simd_if_cond)
33280fc04c29Smrg {
33290fc04c29Smrg gcc_assert (dom_info_available_p (CDI_DOMINATORS));
33300fc04c29Smrg if (flag_checking)
33310fc04c29Smrg if (basic_block bb
33320fc04c29Smrg = gimple_bb (SSA_NAME_DEF_STMT (version_simd_if_cond)))
33330fc04c29Smrg gcc_assert (bb != loop->header
33340fc04c29Smrg && dominated_by_p (CDI_DOMINATORS, loop->header, bb)
33350fc04c29Smrg && (scalar_loop == NULL
33360fc04c29Smrg || (bb != scalar_loop->header
33370fc04c29Smrg && dominated_by_p (CDI_DOMINATORS,
33380fc04c29Smrg scalar_loop->header, bb))));
33390fc04c29Smrg tree zero = build_zero_cst (TREE_TYPE (version_simd_if_cond));
33400fc04c29Smrg tree c = fold_build2 (NE_EXPR, boolean_type_node,
33410fc04c29Smrg version_simd_if_cond, zero);
33420fc04c29Smrg if (cond_expr)
33430fc04c29Smrg cond_expr = fold_build2 (TRUTH_AND_EXPR, boolean_type_node,
33440fc04c29Smrg c, cond_expr);
33450fc04c29Smrg else
33460fc04c29Smrg cond_expr = c;
33470fc04c29Smrg if (dump_enabled_p ())
33480fc04c29Smrg dump_printf_loc (MSG_NOTE, vect_location,
33490fc04c29Smrg "created versioning for simd if condition check.\n");
33500fc04c29Smrg }
33510fc04c29Smrg
  /* Gimplify the combined condition.  At this point COND_EXPR is
     expected to be non-NULL: versioning was requested for at least one
     of the reasons above — TODO confirm against callers.  */
3352c7a68eb7Smrg cond_expr = force_gimple_operand_1 (unshare_expr (cond_expr),
3353c7a68eb7Smrg &gimplify_stmt_list,
335410d565efSmrg is_gimple_condexpr, NULL_TREE);
335510d565efSmrg gimple_seq_add_seq (&cond_expr_stmt_list, gimplify_stmt_list);
335610d565efSmrg
3357*ec02198aSmrg /* Compute the outermost loop cond_expr and cond_expr_stmt_list are
3358*ec02198aSmrg invariant in. */
3359*ec02198aSmrg class loop *outermost = outermost_invariant_loop_for_expr (loop, cond_expr);
3360*ec02198aSmrg for (gimple_stmt_iterator gsi = gsi_start (cond_expr_stmt_list);
3361*ec02198aSmrg !gsi_end_p (gsi); gsi_next (&gsi))
336210d565efSmrg {
3363*ec02198aSmrg gimple *stmt = gsi_stmt (gsi);
3364*ec02198aSmrg update_stmt (stmt);
3365*ec02198aSmrg ssa_op_iter iter;
3366*ec02198aSmrg use_operand_p use_p;
3367*ec02198aSmrg basic_block def_bb;
      /* Shrink OUTERMOST to the innermost loop still containing a
	 definition used by the condition statements.  */
3368*ec02198aSmrg FOR_EACH_SSA_USE_OPERAND (use_p, stmt, iter, SSA_OP_USE)
3369*ec02198aSmrg if ((def_bb = gimple_bb (SSA_NAME_DEF_STMT (USE_FROM_PTR (use_p))))
3370*ec02198aSmrg && flow_bb_inside_loop_p (outermost, def_bb))
3371*ec02198aSmrg outermost = superloop_at_depth (loop, bb_loop_depth (def_bb) + 1);
3372*ec02198aSmrg }
337310d565efSmrg
3374*ec02198aSmrg /* Search for the outermost loop we can version. Avoid versioning of
3375*ec02198aSmrg non-perfect nests but allow if-conversion versioned loops inside. */
3376*ec02198aSmrg class loop *loop_to_version = loop;
3377*ec02198aSmrg if (flow_loop_nested_p (outermost, loop))
3378*ec02198aSmrg {
3379*ec02198aSmrg if (dump_enabled_p ())
3380*ec02198aSmrg dump_printf_loc (MSG_NOTE, vect_location,
3381*ec02198aSmrg "trying to apply versioning to outer loop %d\n",
3382*ec02198aSmrg outermost->num);
      /* Loop number 0 is the function body "root"; never version it.  */
3383*ec02198aSmrg if (outermost->num == 0)
3384*ec02198aSmrg outermost = superloop_at_depth (loop, 1);
3385*ec02198aSmrg /* And avoid applying versioning on non-perfect nests. */
3386*ec02198aSmrg while (loop_to_version != outermost
3387*ec02198aSmrg && (e = single_exit (loop_outer (loop_to_version)))
3388*ec02198aSmrg && !(e->flags & EDGE_COMPLEX)
3389*ec02198aSmrg && (!loop_outer (loop_to_version)->inner->next
3390*ec02198aSmrg || vect_loop_vectorized_call (loop_to_version))
3391*ec02198aSmrg && (!loop_outer (loop_to_version)->inner->next
3392*ec02198aSmrg || !loop_outer (loop_to_version)->inner->next->next))
3393*ec02198aSmrg loop_to_version = loop_outer (loop_to_version);
3394*ec02198aSmrg }
3395*ec02198aSmrg
3396*ec02198aSmrg /* Apply versioning. If there is already a scalar version created by
3397*ec02198aSmrg if-conversion re-use that. Note we cannot re-use the copy of
3398*ec02198aSmrg an if-converted outer-loop when vectorizing the inner loop only. */
3399*ec02198aSmrg gcond *cond;
3400*ec02198aSmrg if ((!loop_to_version->inner || loop == loop_to_version)
3401*ec02198aSmrg && loop_vectorized_call)
3402*ec02198aSmrg {
      /* Re-use path: rewrite the IFN_LOOP_VECTORIZED guard built by
	 if-conversion into the real versioning condition.  */
3403*ec02198aSmrg gcc_assert (scalar_loop);
3404*ec02198aSmrg condition_bb = gimple_bb (loop_vectorized_call);
3405*ec02198aSmrg cond = as_a <gcond *> (last_stmt (condition_bb));
3406*ec02198aSmrg gimple_cond_set_condition_from_tree (cond, cond_expr);
3407*ec02198aSmrg update_stmt (cond);
3408*ec02198aSmrg
3409*ec02198aSmrg if (cond_expr_stmt_list)
3410*ec02198aSmrg {
3411*ec02198aSmrg cond_exp_gsi = gsi_for_stmt (loop_vectorized_call);
3412*ec02198aSmrg gsi_insert_seq_before (&cond_exp_gsi, cond_expr_stmt_list,
3413*ec02198aSmrg GSI_SAME_STMT);
3414*ec02198aSmrg }
3415*ec02198aSmrg
3416*ec02198aSmrg /* if-conversion uses profile_probability::always () for both paths,
3417*ec02198aSmrg reset the paths probabilities appropriately. */
3418*ec02198aSmrg edge te, fe;
3419*ec02198aSmrg extract_true_false_edges_from_block (condition_bb, &te, &fe);
3420*ec02198aSmrg te->probability = prob;
3421*ec02198aSmrg fe->probability = prob.invert ();
3422*ec02198aSmrg /* We can scale loops counts immediately but have to postpone
3423*ec02198aSmrg scaling the scalar loop because we re-use it during peeling. */
3424*ec02198aSmrg scale_loop_frequencies (loop_to_version, te->probability);
3425*ec02198aSmrg LOOP_VINFO_SCALAR_LOOP_SCALING (loop_vinfo) = fe->probability;
3426*ec02198aSmrg
3427*ec02198aSmrg nloop = scalar_loop;
3428*ec02198aSmrg if (dump_enabled_p ())
3429*ec02198aSmrg dump_printf_loc (MSG_NOTE, vect_location,
3430*ec02198aSmrg "reusing %sloop version created by if conversion\n",
3431*ec02198aSmrg loop_to_version != loop ? "outer " : "");
343210d565efSmrg }
343310d565efSmrg else
3434*ec02198aSmrg {
      /* Fresh-copy path: duplicate LOOP_TO_VERSION guarded by COND_EXPR.  */
3435*ec02198aSmrg if (loop_to_version != loop
3436*ec02198aSmrg && dump_enabled_p ())
3437*ec02198aSmrg dump_printf_loc (MSG_NOTE, vect_location,
3438*ec02198aSmrg "applying loop versioning to outer loop %d\n",
3439*ec02198aSmrg loop_to_version->num);
3440*ec02198aSmrg
3441*ec02198aSmrg initialize_original_copy_tables ();
3442*ec02198aSmrg nloop = loop_version (loop_to_version, cond_expr, &condition_bb,
3443c7a68eb7Smrg prob, prob.invert (), prob, prob.invert (), true);
3444*ec02198aSmrg gcc_assert (nloop);
      /* loop_version duplicated LOOP_TO_VERSION; what callers need is
	 the copy of LOOP itself, so re-fetch it from the copy tables.  */
3445*ec02198aSmrg nloop = get_loop_copy (loop);
3446*ec02198aSmrg
3447*ec02198aSmrg /* Kill off IFN_LOOP_VECTORIZED_CALL in the copy, nobody will
3448*ec02198aSmrg reap those otherwise; they also refer to the original
3449*ec02198aSmrg loops. */
3450*ec02198aSmrg class loop *l = loop;
3451*ec02198aSmrg while (gimple *call = vect_loop_vectorized_call (l))
3452*ec02198aSmrg {
3453*ec02198aSmrg call = SSA_NAME_DEF_STMT (get_current_def (gimple_call_lhs (call)));
3454*ec02198aSmrg fold_loop_internal_call (call, boolean_false_node);
3455*ec02198aSmrg l = loop_outer (l);
3456*ec02198aSmrg }
3457*ec02198aSmrg free_original_copy_tables ();
3458*ec02198aSmrg
3459*ec02198aSmrg if (cond_expr_stmt_list)
3460*ec02198aSmrg {
3461*ec02198aSmrg cond_exp_gsi = gsi_last_bb (condition_bb);
3462*ec02198aSmrg gsi_insert_seq_before (&cond_exp_gsi, cond_expr_stmt_list,
3463*ec02198aSmrg GSI_SAME_STMT);
3464*ec02198aSmrg }
3465*ec02198aSmrg
3466*ec02198aSmrg /* Loop versioning violates an assumption we try to maintain during
3467*ec02198aSmrg vectorization - that the loop exit block has a single predecessor.
3468*ec02198aSmrg After versioning, the exit block of both loop versions is the same
3469*ec02198aSmrg basic block (i.e. it has two predecessors). Just in order to simplify
3470*ec02198aSmrg following transformations in the vectorizer, we fix this situation
3471*ec02198aSmrg here by adding a new (empty) block on the exit-edge of the loop,
3472*ec02198aSmrg with the proper loop-exit phis to maintain loop-closed-form.
3473*ec02198aSmrg If loop versioning wasn't done from loop, but scalar_loop instead,
3474*ec02198aSmrg merge_bb will have already just a single successor. */
3475*ec02198aSmrg
3476*ec02198aSmrg merge_bb = single_exit (loop_to_version)->dest;
3477*ec02198aSmrg if (EDGE_COUNT (merge_bb->preds) >= 2)
3478*ec02198aSmrg {
3479*ec02198aSmrg gcc_assert (EDGE_COUNT (merge_bb->preds) >= 2);
3480*ec02198aSmrg new_exit_bb = split_edge (single_exit (loop_to_version));
3481*ec02198aSmrg new_exit_e = single_exit (loop_to_version);
3482*ec02198aSmrg e = EDGE_SUCC (new_exit_bb, 0);
3483*ec02198aSmrg
	  /* For each PHI in MERGE_BB, create a counterpart in NEW_EXIT_BB
	     so the versioned loop stays in loop-closed SSA form.  */
3484*ec02198aSmrg for (gsi = gsi_start_phis (merge_bb); !gsi_end_p (gsi);
3485*ec02198aSmrg gsi_next (&gsi))
3486*ec02198aSmrg {
3487*ec02198aSmrg tree new_res;
3488*ec02198aSmrg orig_phi = gsi.phi ();
3489*ec02198aSmrg new_res = copy_ssa_name (PHI_RESULT (orig_phi));
3490*ec02198aSmrg new_phi = create_phi_node (new_res, new_exit_bb);
3491*ec02198aSmrg arg = PHI_ARG_DEF_FROM_EDGE (orig_phi, e);
3492*ec02198aSmrg add_phi_arg (new_phi, arg, new_exit_e,
3493*ec02198aSmrg gimple_phi_arg_location_from_edge (orig_phi, e));
3494*ec02198aSmrg adjust_phi_and_debug_stmts (orig_phi, e, PHI_RESULT (new_phi));
3495*ec02198aSmrg }
3496*ec02198aSmrg }
3497*ec02198aSmrg
3498*ec02198aSmrg update_ssa (TODO_update_ssa);
3499*ec02198aSmrg }
350010d565efSmrg
350110d565efSmrg if (version_niter)
350210d565efSmrg {
350310d565efSmrg /* The versioned loop could be infinite, we need to clear existing
350410d565efSmrg niter information which is copied from the original loop. */
350510d565efSmrg gcc_assert (loop_constraint_set_p (loop, LOOP_C_FINITE));
350610d565efSmrg vect_free_loop_info_assumptions (nloop);
350710d565efSmrg /* And set constraint LOOP_C_INFINITE for niter analyzer. */
350810d565efSmrg loop_constraint_set (loop, LOOP_C_INFINITE);
350910d565efSmrg }
351010d565efSmrg
  /* User-facing opt-info notes about why the loop was versioned.  */
35110fc04c29Smrg if (LOCATION_LOCUS (vect_location.get_location_t ()) != UNKNOWN_LOCATION
351210d565efSmrg && dump_enabled_p ())
351310d565efSmrg {
351410d565efSmrg if (version_alias)
35150fc04c29Smrg dump_printf_loc (MSG_OPTIMIZED_LOCATIONS | MSG_PRIORITY_USER_FACING,
35160fc04c29Smrg vect_location,
351710d565efSmrg "loop versioned for vectorization because of "
351810d565efSmrg "possible aliasing\n");
351910d565efSmrg if (version_align)
35200fc04c29Smrg dump_printf_loc (MSG_OPTIMIZED_LOCATIONS | MSG_PRIORITY_USER_FACING,
35210fc04c29Smrg vect_location,
352210d565efSmrg "loop versioned for vectorization to enhance "
352310d565efSmrg "alignment\n");
352410d565efSmrg
352510d565efSmrg }
35260fc04c29Smrg
  /* Return the scalar (non-vectorized) copy of the loop.  */
35270fc04c29Smrg return nloop;
352810d565efSmrg }
3529