/* Loop unrolling.
   Copyright (C) 2002-2020 Free Software Foundation, Inc.

This file is part of GCC.

GCC is free software; you can redistribute it and/or modify it under
the terms of the GNU General Public License as published by the Free
Software Foundation; either version 3, or (at your option) any later
version.

GCC is distributed in the hope that it will be useful, but WITHOUT ANY
WARRANTY; without even the implied warranty of MERCHANTABILITY or
FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License
for more details.

You should have received a copy of the GNU General Public License
along with GCC; see the file COPYING3.  If not see
<http://www.gnu.org/licenses/>.  */
1910d565efSmrg
2010d565efSmrg #include "config.h"
2110d565efSmrg #include "system.h"
2210d565efSmrg #include "coretypes.h"
2310d565efSmrg #include "backend.h"
2410d565efSmrg #include "target.h"
2510d565efSmrg #include "rtl.h"
2610d565efSmrg #include "tree.h"
2710d565efSmrg #include "cfghooks.h"
2810d565efSmrg #include "memmodel.h"
2910d565efSmrg #include "optabs.h"
3010d565efSmrg #include "emit-rtl.h"
3110d565efSmrg #include "recog.h"
3210d565efSmrg #include "profile.h"
3310d565efSmrg #include "cfgrtl.h"
3410d565efSmrg #include "cfgloop.h"
3510d565efSmrg #include "dojump.h"
3610d565efSmrg #include "expr.h"
3710d565efSmrg #include "dumpfile.h"
3810d565efSmrg
/* This pass performs loop unrolling.  We only perform this
   optimization on innermost loops (with single exception) because
   the impact on performance is greatest here, and we want to avoid
   unnecessary code size growth.  The gain is caused by greater sequentiality
   of code, better code to optimize for further passes and in some cases
   by fewer testings of exit conditions.  The main problem is code growth,
   that impacts performance negatively due to effect of caches.

   What we do:

   -- unrolling of loops that roll constant times; this is almost always
      win, as we get rid of exit condition tests.
   -- unrolling of loops that roll number of times that we can compute
      in runtime; we also get rid of exit condition tests here, but there
      is the extra expense for calculating the number of iterations
   -- simple unrolling of remaining loops; this is performed only if we
      are asked to, as the gain is questionable in this case and often
      it may even slow down the code
   For more detailed descriptions of each of those, see comments at
   appropriate function below.

   There is a lot of parameters (defined and described in params.def) that
   control how much we unroll.

   ??? A great problem is that we don't have a good way how to determine
   how many times we should unroll the loop; the experiments I have made
   showed that this choice may affect performance in order of several %.
   */
6710d565efSmrg
6810d565efSmrg /* Information about induction variables to split. */
6910d565efSmrg
7010d565efSmrg struct iv_to_split
7110d565efSmrg {
7210d565efSmrg rtx_insn *insn; /* The insn in that the induction variable occurs. */
7310d565efSmrg rtx orig_var; /* The variable (register) for the IV before split. */
7410d565efSmrg rtx base_var; /* The variable on that the values in the further
7510d565efSmrg iterations are based. */
7610d565efSmrg rtx step; /* Step of the induction variable. */
7710d565efSmrg struct iv_to_split *next; /* Next entry in walking order. */
7810d565efSmrg };
7910d565efSmrg
8010d565efSmrg /* Information about accumulators to expand. */
8110d565efSmrg
8210d565efSmrg struct var_to_expand
8310d565efSmrg {
8410d565efSmrg rtx_insn *insn; /* The insn in that the variable expansion occurs. */
8510d565efSmrg rtx reg; /* The accumulator which is expanded. */
8610d565efSmrg vec<rtx> var_expansions; /* The copies of the accumulator which is expanded. */
8710d565efSmrg struct var_to_expand *next; /* Next entry in walking order. */
8810d565efSmrg enum rtx_code op; /* The type of the accumulation - addition, subtraction
8910d565efSmrg or multiplication. */
9010d565efSmrg int expansion_count; /* Count the number of expansions generated so far. */
9110d565efSmrg int reuse_expansion; /* The expansion we intend to reuse to expand
9210d565efSmrg the accumulator. If REUSE_EXPANSION is 0 reuse
9310d565efSmrg the original accumulator. Else use
9410d565efSmrg var_expansions[REUSE_EXPANSION - 1]. */
9510d565efSmrg };
9610d565efSmrg
9710d565efSmrg /* Hashtable helper for iv_to_split. */
9810d565efSmrg
9910d565efSmrg struct iv_split_hasher : free_ptr_hash <iv_to_split>
10010d565efSmrg {
10110d565efSmrg static inline hashval_t hash (const iv_to_split *);
10210d565efSmrg static inline bool equal (const iv_to_split *, const iv_to_split *);
10310d565efSmrg };
10410d565efSmrg
10510d565efSmrg
10610d565efSmrg /* A hash function for information about insns to split. */
10710d565efSmrg
10810d565efSmrg inline hashval_t
hash(const iv_to_split * ivts)10910d565efSmrg iv_split_hasher::hash (const iv_to_split *ivts)
11010d565efSmrg {
11110d565efSmrg return (hashval_t) INSN_UID (ivts->insn);
11210d565efSmrg }
11310d565efSmrg
11410d565efSmrg /* An equality functions for information about insns to split. */
11510d565efSmrg
11610d565efSmrg inline bool
equal(const iv_to_split * i1,const iv_to_split * i2)11710d565efSmrg iv_split_hasher::equal (const iv_to_split *i1, const iv_to_split *i2)
11810d565efSmrg {
11910d565efSmrg return i1->insn == i2->insn;
12010d565efSmrg }
12110d565efSmrg
12210d565efSmrg /* Hashtable helper for iv_to_split. */
12310d565efSmrg
12410d565efSmrg struct var_expand_hasher : free_ptr_hash <var_to_expand>
12510d565efSmrg {
12610d565efSmrg static inline hashval_t hash (const var_to_expand *);
12710d565efSmrg static inline bool equal (const var_to_expand *, const var_to_expand *);
12810d565efSmrg };
12910d565efSmrg
13010d565efSmrg /* Return a hash for VES. */
13110d565efSmrg
13210d565efSmrg inline hashval_t
hash(const var_to_expand * ves)13310d565efSmrg var_expand_hasher::hash (const var_to_expand *ves)
13410d565efSmrg {
13510d565efSmrg return (hashval_t) INSN_UID (ves->insn);
13610d565efSmrg }
13710d565efSmrg
13810d565efSmrg /* Return true if I1 and I2 refer to the same instruction. */
13910d565efSmrg
14010d565efSmrg inline bool
equal(const var_to_expand * i1,const var_to_expand * i2)14110d565efSmrg var_expand_hasher::equal (const var_to_expand *i1, const var_to_expand *i2)
14210d565efSmrg {
14310d565efSmrg return i1->insn == i2->insn;
14410d565efSmrg }
14510d565efSmrg
14610d565efSmrg /* Information about optimization applied in
14710d565efSmrg the unrolled loop. */
14810d565efSmrg
14910d565efSmrg struct opt_info
15010d565efSmrg {
15110d565efSmrg hash_table<iv_split_hasher> *insns_to_split; /* A hashtable of insns to
15210d565efSmrg split. */
15310d565efSmrg struct iv_to_split *iv_to_split_head; /* The first iv to split. */
15410d565efSmrg struct iv_to_split **iv_to_split_tail; /* Pointer to the tail of the list. */
15510d565efSmrg hash_table<var_expand_hasher> *insns_with_var_to_expand; /* A hashtable of
15610d565efSmrg insns with accumulators to expand. */
15710d565efSmrg struct var_to_expand *var_to_expand_head; /* The first var to expand. */
15810d565efSmrg struct var_to_expand **var_to_expand_tail; /* Pointer to the tail of the list. */
15910d565efSmrg unsigned first_new_block; /* The first basic block that was
16010d565efSmrg duplicated. */
16110d565efSmrg basic_block loop_exit; /* The loop exit basic block. */
16210d565efSmrg basic_block loop_preheader; /* The loop preheader basic block. */
16310d565efSmrg };
16410d565efSmrg
165*ec02198aSmrg static void decide_unroll_stupid (class loop *, int);
166*ec02198aSmrg static void decide_unroll_constant_iterations (class loop *, int);
167*ec02198aSmrg static void decide_unroll_runtime_iterations (class loop *, int);
168*ec02198aSmrg static void unroll_loop_stupid (class loop *);
16910d565efSmrg static void decide_unrolling (int);
170*ec02198aSmrg static void unroll_loop_constant_iterations (class loop *);
171*ec02198aSmrg static void unroll_loop_runtime_iterations (class loop *);
172*ec02198aSmrg static struct opt_info *analyze_insns_in_loop (class loop *);
17310d565efSmrg static void opt_info_start_duplication (struct opt_info *);
17410d565efSmrg static void apply_opt_in_copies (struct opt_info *, unsigned, bool, bool);
17510d565efSmrg static void free_opt_info (struct opt_info *);
176*ec02198aSmrg static struct var_to_expand *analyze_insn_to_expand_var (class loop*, rtx_insn *);
177*ec02198aSmrg static bool referenced_in_one_insn_in_loop_p (class loop *, rtx, int *);
17810d565efSmrg static struct iv_to_split *analyze_iv_to_split_insn (rtx_insn *);
17910d565efSmrg static void expand_var_during_unrolling (struct var_to_expand *, rtx_insn *);
18010d565efSmrg static void insert_var_expansion_initialization (struct var_to_expand *,
18110d565efSmrg basic_block);
18210d565efSmrg static void combine_var_copies_in_loop_exit (struct var_to_expand *,
18310d565efSmrg basic_block);
18410d565efSmrg static rtx get_expansion (struct var_to_expand *);
18510d565efSmrg
18610d565efSmrg /* Emit a message summarizing the unroll that will be
18710d565efSmrg performed for LOOP, along with the loop's location LOCUS, if
18810d565efSmrg appropriate given the dump or -fopt-info settings. */
18910d565efSmrg
19010d565efSmrg static void
report_unroll(class loop * loop,dump_location_t locus)191*ec02198aSmrg report_unroll (class loop *loop, dump_location_t locus)
19210d565efSmrg {
193c7a68eb7Smrg dump_flags_t report_flags = MSG_OPTIMIZED_LOCATIONS | TDF_DETAILS;
19410d565efSmrg
19510d565efSmrg if (loop->lpt_decision.decision == LPT_NONE)
19610d565efSmrg return;
19710d565efSmrg
19810d565efSmrg if (!dump_enabled_p ())
19910d565efSmrg return;
20010d565efSmrg
2010fc04c29Smrg dump_metadata_t metadata (report_flags, locus.get_impl_location ());
2020fc04c29Smrg dump_printf_loc (metadata, locus.get_user_location (),
20310d565efSmrg "loop unrolled %d times",
20410d565efSmrg loop->lpt_decision.times);
205c7a68eb7Smrg if (profile_info && loop->header->count.initialized_p ())
2060fc04c29Smrg dump_printf (metadata,
20710d565efSmrg " (header execution count %d)",
208c7a68eb7Smrg (int)loop->header->count.to_gcov_type ());
20910d565efSmrg
2100fc04c29Smrg dump_printf (metadata, "\n");
21110d565efSmrg }
21210d565efSmrg
21310d565efSmrg /* Decide whether unroll loops and how much. */
21410d565efSmrg static void
decide_unrolling(int flags)21510d565efSmrg decide_unrolling (int flags)
21610d565efSmrg {
217*ec02198aSmrg class loop *loop;
21810d565efSmrg
21910d565efSmrg /* Scan the loops, inner ones first. */
22010d565efSmrg FOR_EACH_LOOP (loop, LI_FROM_INNERMOST)
22110d565efSmrg {
22210d565efSmrg loop->lpt_decision.decision = LPT_NONE;
2230fc04c29Smrg dump_user_location_t locus = get_loop_location (loop);
22410d565efSmrg
22510d565efSmrg if (dump_enabled_p ())
226c7a68eb7Smrg dump_printf_loc (MSG_NOTE, locus,
227c7a68eb7Smrg "considering unrolling loop %d at BB %d\n",
22810d565efSmrg loop->num, loop->header->index);
22910d565efSmrg
230c7a68eb7Smrg if (loop->unroll == 1)
231c7a68eb7Smrg {
232c7a68eb7Smrg if (dump_file)
233c7a68eb7Smrg fprintf (dump_file,
234c7a68eb7Smrg ";; Not unrolling loop, user didn't want it unrolled\n");
235c7a68eb7Smrg continue;
236c7a68eb7Smrg }
237c7a68eb7Smrg
23810d565efSmrg /* Do not peel cold areas. */
23910d565efSmrg if (optimize_loop_for_size_p (loop))
24010d565efSmrg {
24110d565efSmrg if (dump_file)
24210d565efSmrg fprintf (dump_file, ";; Not considering loop, cold area\n");
24310d565efSmrg continue;
24410d565efSmrg }
24510d565efSmrg
24610d565efSmrg /* Can the loop be manipulated? */
24710d565efSmrg if (!can_duplicate_loop_p (loop))
24810d565efSmrg {
24910d565efSmrg if (dump_file)
25010d565efSmrg fprintf (dump_file,
25110d565efSmrg ";; Not considering loop, cannot duplicate\n");
25210d565efSmrg continue;
25310d565efSmrg }
25410d565efSmrg
25510d565efSmrg /* Skip non-innermost loops. */
25610d565efSmrg if (loop->inner)
25710d565efSmrg {
25810d565efSmrg if (dump_file)
25910d565efSmrg fprintf (dump_file, ";; Not considering loop, is not innermost\n");
26010d565efSmrg continue;
26110d565efSmrg }
26210d565efSmrg
26310d565efSmrg loop->ninsns = num_loop_insns (loop);
26410d565efSmrg loop->av_ninsns = average_num_loop_insns (loop);
26510d565efSmrg
266c7a68eb7Smrg /* Try transformations one by one in decreasing order of priority. */
26710d565efSmrg decide_unroll_constant_iterations (loop, flags);
26810d565efSmrg if (loop->lpt_decision.decision == LPT_NONE)
26910d565efSmrg decide_unroll_runtime_iterations (loop, flags);
27010d565efSmrg if (loop->lpt_decision.decision == LPT_NONE)
27110d565efSmrg decide_unroll_stupid (loop, flags);
27210d565efSmrg
27310d565efSmrg report_unroll (loop, locus);
27410d565efSmrg }
27510d565efSmrg }
27610d565efSmrg
27710d565efSmrg /* Unroll LOOPS. */
27810d565efSmrg void
unroll_loops(int flags)27910d565efSmrg unroll_loops (int flags)
28010d565efSmrg {
281*ec02198aSmrg class loop *loop;
28210d565efSmrg bool changed = false;
28310d565efSmrg
28410d565efSmrg /* Now decide rest of unrolling. */
28510d565efSmrg decide_unrolling (flags);
28610d565efSmrg
28710d565efSmrg /* Scan the loops, inner ones first. */
28810d565efSmrg FOR_EACH_LOOP (loop, LI_FROM_INNERMOST)
28910d565efSmrg {
29010d565efSmrg /* And perform the appropriate transformations. */
29110d565efSmrg switch (loop->lpt_decision.decision)
29210d565efSmrg {
29310d565efSmrg case LPT_UNROLL_CONSTANT:
29410d565efSmrg unroll_loop_constant_iterations (loop);
29510d565efSmrg changed = true;
29610d565efSmrg break;
29710d565efSmrg case LPT_UNROLL_RUNTIME:
29810d565efSmrg unroll_loop_runtime_iterations (loop);
29910d565efSmrg changed = true;
30010d565efSmrg break;
30110d565efSmrg case LPT_UNROLL_STUPID:
30210d565efSmrg unroll_loop_stupid (loop);
30310d565efSmrg changed = true;
30410d565efSmrg break;
30510d565efSmrg case LPT_NONE:
30610d565efSmrg break;
30710d565efSmrg default:
30810d565efSmrg gcc_unreachable ();
30910d565efSmrg }
31010d565efSmrg }
31110d565efSmrg
31210d565efSmrg if (changed)
31310d565efSmrg {
31410d565efSmrg calculate_dominance_info (CDI_DOMINATORS);
31510d565efSmrg fix_loop_structure (NULL);
31610d565efSmrg }
31710d565efSmrg
31810d565efSmrg iv_analysis_done ();
31910d565efSmrg }
32010d565efSmrg
32110d565efSmrg /* Check whether exit of the LOOP is at the end of loop body. */
32210d565efSmrg
32310d565efSmrg static bool
loop_exit_at_end_p(class loop * loop)324*ec02198aSmrg loop_exit_at_end_p (class loop *loop)
32510d565efSmrg {
326*ec02198aSmrg class niter_desc *desc = get_simple_loop_desc (loop);
32710d565efSmrg rtx_insn *insn;
32810d565efSmrg
32910d565efSmrg /* We should never have conditional in latch block. */
33010d565efSmrg gcc_assert (desc->in_edge->dest != loop->header);
33110d565efSmrg
33210d565efSmrg if (desc->in_edge->dest != loop->latch)
33310d565efSmrg return false;
33410d565efSmrg
33510d565efSmrg /* Check that the latch is empty. */
33610d565efSmrg FOR_BB_INSNS (loop->latch, insn)
33710d565efSmrg {
33810d565efSmrg if (INSN_P (insn) && active_insn_p (insn))
33910d565efSmrg return false;
34010d565efSmrg }
34110d565efSmrg
34210d565efSmrg return true;
34310d565efSmrg }
34410d565efSmrg
34510d565efSmrg /* Decide whether to unroll LOOP iterating constant number of times
34610d565efSmrg and how much. */
34710d565efSmrg
34810d565efSmrg static void
decide_unroll_constant_iterations(class loop * loop,int flags)349*ec02198aSmrg decide_unroll_constant_iterations (class loop *loop, int flags)
35010d565efSmrg {
35110d565efSmrg unsigned nunroll, nunroll_by_av, best_copies, best_unroll = 0, n_copies, i;
352*ec02198aSmrg class niter_desc *desc;
35310d565efSmrg widest_int iterations;
35410d565efSmrg
355c7a68eb7Smrg /* If we were not asked to unroll this loop, just return back silently. */
356c7a68eb7Smrg if (!(flags & UAP_UNROLL) && !loop->unroll)
35710d565efSmrg return;
35810d565efSmrg
359c7a68eb7Smrg if (dump_enabled_p ())
360c7a68eb7Smrg dump_printf (MSG_NOTE,
361c7a68eb7Smrg "considering unrolling loop with constant "
36210d565efSmrg "number of iterations\n");
36310d565efSmrg
36410d565efSmrg /* nunroll = total number of copies of the original loop body in
365c7a68eb7Smrg unrolled loop (i.e. if it is 2, we have to duplicate loop body once). */
366*ec02198aSmrg nunroll = param_max_unrolled_insns / loop->ninsns;
36710d565efSmrg nunroll_by_av
368*ec02198aSmrg = param_max_average_unrolled_insns / loop->av_ninsns;
36910d565efSmrg if (nunroll > nunroll_by_av)
37010d565efSmrg nunroll = nunroll_by_av;
371*ec02198aSmrg if (nunroll > (unsigned) param_max_unroll_times)
372*ec02198aSmrg nunroll = param_max_unroll_times;
37310d565efSmrg
37410d565efSmrg if (targetm.loop_unroll_adjust)
37510d565efSmrg nunroll = targetm.loop_unroll_adjust (nunroll, loop);
37610d565efSmrg
37710d565efSmrg /* Skip big loops. */
37810d565efSmrg if (nunroll <= 1)
37910d565efSmrg {
38010d565efSmrg if (dump_file)
38110d565efSmrg fprintf (dump_file, ";; Not considering loop, is too big\n");
38210d565efSmrg return;
38310d565efSmrg }
38410d565efSmrg
38510d565efSmrg /* Check for simple loops. */
38610d565efSmrg desc = get_simple_loop_desc (loop);
38710d565efSmrg
38810d565efSmrg /* Check number of iterations. */
38910d565efSmrg if (!desc->simple_p || !desc->const_iter || desc->assumptions)
39010d565efSmrg {
39110d565efSmrg if (dump_file)
39210d565efSmrg fprintf (dump_file,
39310d565efSmrg ";; Unable to prove that the loop iterates constant times\n");
39410d565efSmrg return;
39510d565efSmrg }
39610d565efSmrg
397c7a68eb7Smrg /* Check for an explicit unrolling factor. */
398c7a68eb7Smrg if (loop->unroll > 0 && loop->unroll < USHRT_MAX)
399c7a68eb7Smrg {
400c7a68eb7Smrg /* However we cannot unroll completely at the RTL level a loop with
401c7a68eb7Smrg constant number of iterations; it should have been peeled instead. */
402c7a68eb7Smrg if (desc->niter == 0 || (unsigned) loop->unroll > desc->niter - 1)
403c7a68eb7Smrg {
404c7a68eb7Smrg if (dump_file)
405c7a68eb7Smrg fprintf (dump_file, ";; Loop should have been peeled\n");
406c7a68eb7Smrg }
407c7a68eb7Smrg else
408c7a68eb7Smrg {
409c7a68eb7Smrg loop->lpt_decision.decision = LPT_UNROLL_CONSTANT;
410c7a68eb7Smrg loop->lpt_decision.times = loop->unroll - 1;
411c7a68eb7Smrg }
412c7a68eb7Smrg return;
413c7a68eb7Smrg }
414c7a68eb7Smrg
41510d565efSmrg /* Check whether the loop rolls enough to consider.
41610d565efSmrg Consult also loop bounds and profile; in the case the loop has more
41710d565efSmrg than one exit it may well loop less than determined maximal number
41810d565efSmrg of iterations. */
41910d565efSmrg if (desc->niter < 2 * nunroll
42010d565efSmrg || ((get_estimated_loop_iterations (loop, &iterations)
42110d565efSmrg || get_likely_max_loop_iterations (loop, &iterations))
42210d565efSmrg && wi::ltu_p (iterations, 2 * nunroll)))
42310d565efSmrg {
42410d565efSmrg if (dump_file)
42510d565efSmrg fprintf (dump_file, ";; Not unrolling loop, doesn't roll\n");
42610d565efSmrg return;
42710d565efSmrg }
42810d565efSmrg
42910d565efSmrg /* Success; now compute number of iterations to unroll. We alter
43010d565efSmrg nunroll so that as few as possible copies of loop body are
43110d565efSmrg necessary, while still not decreasing the number of unrollings
43210d565efSmrg too much (at most by 1). */
43310d565efSmrg best_copies = 2 * nunroll + 10;
43410d565efSmrg
43510d565efSmrg i = 2 * nunroll + 2;
436c7a68eb7Smrg if (i > desc->niter - 2)
43710d565efSmrg i = desc->niter - 2;
43810d565efSmrg
43910d565efSmrg for (; i >= nunroll - 1; i--)
44010d565efSmrg {
44110d565efSmrg unsigned exit_mod = desc->niter % (i + 1);
44210d565efSmrg
44310d565efSmrg if (!loop_exit_at_end_p (loop))
44410d565efSmrg n_copies = exit_mod + i + 1;
44510d565efSmrg else if (exit_mod != (unsigned) i
44610d565efSmrg || desc->noloop_assumptions != NULL_RTX)
44710d565efSmrg n_copies = exit_mod + i + 2;
44810d565efSmrg else
44910d565efSmrg n_copies = i + 1;
45010d565efSmrg
45110d565efSmrg if (n_copies < best_copies)
45210d565efSmrg {
45310d565efSmrg best_copies = n_copies;
45410d565efSmrg best_unroll = i;
45510d565efSmrg }
45610d565efSmrg }
45710d565efSmrg
45810d565efSmrg loop->lpt_decision.decision = LPT_UNROLL_CONSTANT;
45910d565efSmrg loop->lpt_decision.times = best_unroll;
46010d565efSmrg }
46110d565efSmrg
46210d565efSmrg /* Unroll LOOP with constant number of iterations LOOP->LPT_DECISION.TIMES times.
46310d565efSmrg The transformation does this:
46410d565efSmrg
46510d565efSmrg for (i = 0; i < 102; i++)
46610d565efSmrg body;
46710d565efSmrg
46810d565efSmrg ==> (LOOP->LPT_DECISION.TIMES == 3)
46910d565efSmrg
47010d565efSmrg i = 0;
47110d565efSmrg body; i++;
47210d565efSmrg body; i++;
47310d565efSmrg while (i < 102)
47410d565efSmrg {
47510d565efSmrg body; i++;
47610d565efSmrg body; i++;
47710d565efSmrg body; i++;
47810d565efSmrg body; i++;
47910d565efSmrg }
48010d565efSmrg */
48110d565efSmrg static void
unroll_loop_constant_iterations(class loop * loop)482*ec02198aSmrg unroll_loop_constant_iterations (class loop *loop)
48310d565efSmrg {
48410d565efSmrg unsigned HOST_WIDE_INT niter;
48510d565efSmrg unsigned exit_mod;
48610d565efSmrg unsigned i;
48710d565efSmrg edge e;
48810d565efSmrg unsigned max_unroll = loop->lpt_decision.times;
489*ec02198aSmrg class niter_desc *desc = get_simple_loop_desc (loop);
49010d565efSmrg bool exit_at_end = loop_exit_at_end_p (loop);
49110d565efSmrg struct opt_info *opt_info = NULL;
49210d565efSmrg bool ok;
49310d565efSmrg
49410d565efSmrg niter = desc->niter;
49510d565efSmrg
49610d565efSmrg /* Should not get here (such loop should be peeled instead). */
49710d565efSmrg gcc_assert (niter > max_unroll + 1);
49810d565efSmrg
49910d565efSmrg exit_mod = niter % (max_unroll + 1);
50010d565efSmrg
50110d565efSmrg auto_sbitmap wont_exit (max_unroll + 2);
50210d565efSmrg bitmap_ones (wont_exit);
50310d565efSmrg
50410d565efSmrg auto_vec<edge> remove_edges;
50510d565efSmrg if (flag_split_ivs_in_unroller
50610d565efSmrg || flag_variable_expansion_in_unroller)
50710d565efSmrg opt_info = analyze_insns_in_loop (loop);
50810d565efSmrg
50910d565efSmrg if (!exit_at_end)
51010d565efSmrg {
51110d565efSmrg /* The exit is not at the end of the loop; leave exit test
51210d565efSmrg in the first copy, so that the loops that start with test
51310d565efSmrg of exit condition have continuous body after unrolling. */
51410d565efSmrg
51510d565efSmrg if (dump_file)
51610d565efSmrg fprintf (dump_file, ";; Condition at beginning of loop.\n");
51710d565efSmrg
51810d565efSmrg /* Peel exit_mod iterations. */
51910d565efSmrg bitmap_clear_bit (wont_exit, 0);
52010d565efSmrg if (desc->noloop_assumptions)
52110d565efSmrg bitmap_clear_bit (wont_exit, 1);
52210d565efSmrg
52310d565efSmrg if (exit_mod)
52410d565efSmrg {
52510d565efSmrg opt_info_start_duplication (opt_info);
52610d565efSmrg ok = duplicate_loop_to_header_edge (loop, loop_preheader_edge (loop),
52710d565efSmrg exit_mod,
52810d565efSmrg wont_exit, desc->out_edge,
52910d565efSmrg &remove_edges,
53010d565efSmrg DLTHE_FLAG_UPDATE_FREQ
53110d565efSmrg | (opt_info && exit_mod > 1
53210d565efSmrg ? DLTHE_RECORD_COPY_NUMBER
53310d565efSmrg : 0));
53410d565efSmrg gcc_assert (ok);
53510d565efSmrg
53610d565efSmrg if (opt_info && exit_mod > 1)
53710d565efSmrg apply_opt_in_copies (opt_info, exit_mod, false, false);
53810d565efSmrg
53910d565efSmrg desc->noloop_assumptions = NULL_RTX;
54010d565efSmrg desc->niter -= exit_mod;
54110d565efSmrg loop->nb_iterations_upper_bound -= exit_mod;
54210d565efSmrg if (loop->any_estimate
54310d565efSmrg && wi::leu_p (exit_mod, loop->nb_iterations_estimate))
54410d565efSmrg loop->nb_iterations_estimate -= exit_mod;
54510d565efSmrg else
54610d565efSmrg loop->any_estimate = false;
54710d565efSmrg if (loop->any_likely_upper_bound
54810d565efSmrg && wi::leu_p (exit_mod, loop->nb_iterations_likely_upper_bound))
54910d565efSmrg loop->nb_iterations_likely_upper_bound -= exit_mod;
55010d565efSmrg else
55110d565efSmrg loop->any_likely_upper_bound = false;
55210d565efSmrg }
55310d565efSmrg
55410d565efSmrg bitmap_set_bit (wont_exit, 1);
55510d565efSmrg }
55610d565efSmrg else
55710d565efSmrg {
55810d565efSmrg /* Leave exit test in last copy, for the same reason as above if
55910d565efSmrg the loop tests the condition at the end of loop body. */
56010d565efSmrg
56110d565efSmrg if (dump_file)
56210d565efSmrg fprintf (dump_file, ";; Condition at end of loop.\n");
56310d565efSmrg
56410d565efSmrg /* We know that niter >= max_unroll + 2; so we do not need to care of
56510d565efSmrg case when we would exit before reaching the loop. So just peel
56610d565efSmrg exit_mod + 1 iterations. */
56710d565efSmrg if (exit_mod != max_unroll
56810d565efSmrg || desc->noloop_assumptions)
56910d565efSmrg {
57010d565efSmrg bitmap_clear_bit (wont_exit, 0);
57110d565efSmrg if (desc->noloop_assumptions)
57210d565efSmrg bitmap_clear_bit (wont_exit, 1);
57310d565efSmrg
57410d565efSmrg opt_info_start_duplication (opt_info);
57510d565efSmrg ok = duplicate_loop_to_header_edge (loop, loop_preheader_edge (loop),
57610d565efSmrg exit_mod + 1,
57710d565efSmrg wont_exit, desc->out_edge,
57810d565efSmrg &remove_edges,
57910d565efSmrg DLTHE_FLAG_UPDATE_FREQ
58010d565efSmrg | (opt_info && exit_mod > 0
58110d565efSmrg ? DLTHE_RECORD_COPY_NUMBER
58210d565efSmrg : 0));
58310d565efSmrg gcc_assert (ok);
58410d565efSmrg
58510d565efSmrg if (opt_info && exit_mod > 0)
58610d565efSmrg apply_opt_in_copies (opt_info, exit_mod + 1, false, false);
58710d565efSmrg
58810d565efSmrg desc->niter -= exit_mod + 1;
58910d565efSmrg loop->nb_iterations_upper_bound -= exit_mod + 1;
59010d565efSmrg if (loop->any_estimate
59110d565efSmrg && wi::leu_p (exit_mod + 1, loop->nb_iterations_estimate))
59210d565efSmrg loop->nb_iterations_estimate -= exit_mod + 1;
59310d565efSmrg else
59410d565efSmrg loop->any_estimate = false;
59510d565efSmrg if (loop->any_likely_upper_bound
59610d565efSmrg && wi::leu_p (exit_mod + 1, loop->nb_iterations_likely_upper_bound))
59710d565efSmrg loop->nb_iterations_likely_upper_bound -= exit_mod + 1;
59810d565efSmrg else
59910d565efSmrg loop->any_likely_upper_bound = false;
60010d565efSmrg desc->noloop_assumptions = NULL_RTX;
60110d565efSmrg
60210d565efSmrg bitmap_set_bit (wont_exit, 0);
60310d565efSmrg bitmap_set_bit (wont_exit, 1);
60410d565efSmrg }
60510d565efSmrg
60610d565efSmrg bitmap_clear_bit (wont_exit, max_unroll);
60710d565efSmrg }
60810d565efSmrg
60910d565efSmrg /* Now unroll the loop. */
61010d565efSmrg
61110d565efSmrg opt_info_start_duplication (opt_info);
61210d565efSmrg ok = duplicate_loop_to_header_edge (loop, loop_latch_edge (loop),
61310d565efSmrg max_unroll,
61410d565efSmrg wont_exit, desc->out_edge,
61510d565efSmrg &remove_edges,
61610d565efSmrg DLTHE_FLAG_UPDATE_FREQ
61710d565efSmrg | (opt_info
61810d565efSmrg ? DLTHE_RECORD_COPY_NUMBER
61910d565efSmrg : 0));
62010d565efSmrg gcc_assert (ok);
62110d565efSmrg
62210d565efSmrg if (opt_info)
62310d565efSmrg {
62410d565efSmrg apply_opt_in_copies (opt_info, max_unroll, true, true);
62510d565efSmrg free_opt_info (opt_info);
62610d565efSmrg }
62710d565efSmrg
62810d565efSmrg if (exit_at_end)
62910d565efSmrg {
63010d565efSmrg basic_block exit_block = get_bb_copy (desc->in_edge->src);
63110d565efSmrg /* Find a new in and out edge; they are in the last copy we have made. */
63210d565efSmrg
63310d565efSmrg if (EDGE_SUCC (exit_block, 0)->dest == desc->out_edge->dest)
63410d565efSmrg {
63510d565efSmrg desc->out_edge = EDGE_SUCC (exit_block, 0);
63610d565efSmrg desc->in_edge = EDGE_SUCC (exit_block, 1);
63710d565efSmrg }
63810d565efSmrg else
63910d565efSmrg {
64010d565efSmrg desc->out_edge = EDGE_SUCC (exit_block, 1);
64110d565efSmrg desc->in_edge = EDGE_SUCC (exit_block, 0);
64210d565efSmrg }
64310d565efSmrg }
64410d565efSmrg
64510d565efSmrg desc->niter /= max_unroll + 1;
64610d565efSmrg loop->nb_iterations_upper_bound
64710d565efSmrg = wi::udiv_trunc (loop->nb_iterations_upper_bound, max_unroll + 1);
64810d565efSmrg if (loop->any_estimate)
64910d565efSmrg loop->nb_iterations_estimate
65010d565efSmrg = wi::udiv_trunc (loop->nb_iterations_estimate, max_unroll + 1);
65110d565efSmrg if (loop->any_likely_upper_bound)
65210d565efSmrg loop->nb_iterations_likely_upper_bound
65310d565efSmrg = wi::udiv_trunc (loop->nb_iterations_likely_upper_bound, max_unroll + 1);
65421ff1670Smrg desc->niter_expr = gen_int_mode (desc->niter, desc->mode);
65510d565efSmrg
65610d565efSmrg /* Remove the edges. */
65710d565efSmrg FOR_EACH_VEC_ELT (remove_edges, i, e)
65810d565efSmrg remove_path (e);
65910d565efSmrg
66010d565efSmrg if (dump_file)
66110d565efSmrg fprintf (dump_file,
66210d565efSmrg ";; Unrolled loop %d times, constant # of iterations %i insns\n",
66310d565efSmrg max_unroll, num_loop_insns (loop));
66410d565efSmrg }
66510d565efSmrg
66610d565efSmrg /* Decide whether to unroll LOOP iterating runtime computable number of times
66710d565efSmrg and how much. */
66810d565efSmrg static void
decide_unroll_runtime_iterations(class loop * loop,int flags)669*ec02198aSmrg decide_unroll_runtime_iterations (class loop *loop, int flags)
67010d565efSmrg {
67110d565efSmrg unsigned nunroll, nunroll_by_av, i;
672*ec02198aSmrg class niter_desc *desc;
67310d565efSmrg widest_int iterations;
67410d565efSmrg
675c7a68eb7Smrg /* If we were not asked to unroll this loop, just return back silently. */
676c7a68eb7Smrg if (!(flags & UAP_UNROLL) && !loop->unroll)
67710d565efSmrg return;
67810d565efSmrg
679c7a68eb7Smrg if (dump_enabled_p ())
680c7a68eb7Smrg dump_printf (MSG_NOTE,
681c7a68eb7Smrg "considering unrolling loop with runtime-"
68210d565efSmrg "computable number of iterations\n");
68310d565efSmrg
68410d565efSmrg /* nunroll = total number of copies of the original loop body in
68510d565efSmrg unrolled loop (i.e. if it is 2, we have to duplicate loop body once. */
686*ec02198aSmrg nunroll = param_max_unrolled_insns / loop->ninsns;
687*ec02198aSmrg nunroll_by_av = param_max_average_unrolled_insns / loop->av_ninsns;
68810d565efSmrg if (nunroll > nunroll_by_av)
68910d565efSmrg nunroll = nunroll_by_av;
690*ec02198aSmrg if (nunroll > (unsigned) param_max_unroll_times)
691*ec02198aSmrg nunroll = param_max_unroll_times;
69210d565efSmrg
69310d565efSmrg if (targetm.loop_unroll_adjust)
69410d565efSmrg nunroll = targetm.loop_unroll_adjust (nunroll, loop);
69510d565efSmrg
696c7a68eb7Smrg if (loop->unroll > 0 && loop->unroll < USHRT_MAX)
697c7a68eb7Smrg nunroll = loop->unroll;
698c7a68eb7Smrg
69910d565efSmrg /* Skip big loops. */
70010d565efSmrg if (nunroll <= 1)
70110d565efSmrg {
70210d565efSmrg if (dump_file)
70310d565efSmrg fprintf (dump_file, ";; Not considering loop, is too big\n");
70410d565efSmrg return;
70510d565efSmrg }
70610d565efSmrg
70710d565efSmrg /* Check for simple loops. */
70810d565efSmrg desc = get_simple_loop_desc (loop);
70910d565efSmrg
71010d565efSmrg /* Check simpleness. */
71110d565efSmrg if (!desc->simple_p || desc->assumptions)
71210d565efSmrg {
71310d565efSmrg if (dump_file)
71410d565efSmrg fprintf (dump_file,
71510d565efSmrg ";; Unable to prove that the number of iterations "
71610d565efSmrg "can be counted in runtime\n");
71710d565efSmrg return;
71810d565efSmrg }
71910d565efSmrg
72010d565efSmrg if (desc->const_iter)
72110d565efSmrg {
72210d565efSmrg if (dump_file)
72310d565efSmrg fprintf (dump_file, ";; Loop iterates constant times\n");
72410d565efSmrg return;
72510d565efSmrg }
72610d565efSmrg
72710d565efSmrg /* Check whether the loop rolls. */
72810d565efSmrg if ((get_estimated_loop_iterations (loop, &iterations)
72910d565efSmrg || get_likely_max_loop_iterations (loop, &iterations))
73010d565efSmrg && wi::ltu_p (iterations, 2 * nunroll))
73110d565efSmrg {
73210d565efSmrg if (dump_file)
73310d565efSmrg fprintf (dump_file, ";; Not unrolling loop, doesn't roll\n");
73410d565efSmrg return;
73510d565efSmrg }
73610d565efSmrg
737c7a68eb7Smrg /* Success; now force nunroll to be power of 2, as code-gen
738c7a68eb7Smrg requires it, we are unable to cope with overflows in
739c7a68eb7Smrg computation of number of iterations. */
74010d565efSmrg for (i = 1; 2 * i <= nunroll; i *= 2)
74110d565efSmrg continue;
74210d565efSmrg
74310d565efSmrg loop->lpt_decision.decision = LPT_UNROLL_RUNTIME;
74410d565efSmrg loop->lpt_decision.times = i - 1;
74510d565efSmrg }
74610d565efSmrg
74710d565efSmrg /* Splits edge E and inserts the sequence of instructions INSNS on it, and
74810d565efSmrg returns the newly created block. If INSNS is NULL_RTX, nothing is changed
74910d565efSmrg and NULL is returned instead. */
75010d565efSmrg
75110d565efSmrg basic_block
split_edge_and_insert(edge e,rtx_insn * insns)75210d565efSmrg split_edge_and_insert (edge e, rtx_insn *insns)
75310d565efSmrg {
75410d565efSmrg basic_block bb;
75510d565efSmrg
75610d565efSmrg if (!insns)
75710d565efSmrg return NULL;
75810d565efSmrg bb = split_edge (e);
75910d565efSmrg emit_insn_after (insns, BB_END (bb));
76010d565efSmrg
76110d565efSmrg /* ??? We used to assume that INSNS can contain control flow insns, and
76210d565efSmrg that we had to try to find sub basic blocks in BB to maintain a valid
76310d565efSmrg CFG. For this purpose we used to set the BB_SUPERBLOCK flag on BB
76410d565efSmrg and call break_superblocks when going out of cfglayout mode. But it
76510d565efSmrg turns out that this never happens; and that if it does ever happen,
76610d565efSmrg the verify_flow_info at the end of the RTL loop passes would fail.
76710d565efSmrg
76810d565efSmrg There are two reasons why we expected we could have control flow insns
76910d565efSmrg in INSNS. The first is when a comparison has to be done in parts, and
77010d565efSmrg the second is when the number of iterations is computed for loops with
77110d565efSmrg the number of iterations known at runtime. In both cases, test cases
77210d565efSmrg to get control flow in INSNS appear to be impossible to construct:
77310d565efSmrg
77410d565efSmrg * If do_compare_rtx_and_jump needs several branches to do comparison
77510d565efSmrg in a mode that needs comparison by parts, we cannot analyze the
77610d565efSmrg number of iterations of the loop, and we never get to unrolling it.
77710d565efSmrg
77810d565efSmrg * The code in expand_divmod that was suspected to cause creation of
77910d565efSmrg branching code seems to be only accessed for signed division. The
78010d565efSmrg divisions used by # of iterations analysis are always unsigned.
78110d565efSmrg Problems might arise on architectures that emits branching code
78210d565efSmrg for some operations that may appear in the unroller (especially
78310d565efSmrg for division), but we have no such architectures.
78410d565efSmrg
78510d565efSmrg Considering all this, it was decided that we should for now assume
78610d565efSmrg that INSNS can in theory contain control flow insns, but in practice
78710d565efSmrg it never does. So we don't handle the theoretical case, and should
78810d565efSmrg a real failure ever show up, we have a pretty good clue for how to
78910d565efSmrg fix it. */
79010d565efSmrg
79110d565efSmrg return bb;
79210d565efSmrg }
79310d565efSmrg
79410d565efSmrg /* Prepare a sequence comparing OP0 with OP1 using COMP and jumping to LABEL if
79510d565efSmrg true, with probability PROB. If CINSN is not NULL, it is the insn to copy
79610d565efSmrg in order to create a jump. */
79710d565efSmrg
79810d565efSmrg static rtx_insn *
compare_and_jump_seq(rtx op0,rtx op1,enum rtx_code comp,rtx_code_label * label,profile_probability prob,rtx_insn * cinsn)79910d565efSmrg compare_and_jump_seq (rtx op0, rtx op1, enum rtx_code comp,
800c7a68eb7Smrg rtx_code_label *label, profile_probability prob,
801c7a68eb7Smrg rtx_insn *cinsn)
80210d565efSmrg {
80310d565efSmrg rtx_insn *seq;
80410d565efSmrg rtx_jump_insn *jump;
80510d565efSmrg rtx cond;
80610d565efSmrg machine_mode mode;
80710d565efSmrg
80810d565efSmrg mode = GET_MODE (op0);
80910d565efSmrg if (mode == VOIDmode)
81010d565efSmrg mode = GET_MODE (op1);
81110d565efSmrg
81210d565efSmrg start_sequence ();
81310d565efSmrg if (GET_MODE_CLASS (mode) == MODE_CC)
81410d565efSmrg {
81510d565efSmrg /* A hack -- there seems to be no easy generic way how to make a
81610d565efSmrg conditional jump from a ccmode comparison. */
81710d565efSmrg gcc_assert (cinsn);
81810d565efSmrg cond = XEXP (SET_SRC (pc_set (cinsn)), 0);
81910d565efSmrg gcc_assert (GET_CODE (cond) == comp);
82010d565efSmrg gcc_assert (rtx_equal_p (op0, XEXP (cond, 0)));
82110d565efSmrg gcc_assert (rtx_equal_p (op1, XEXP (cond, 1)));
82210d565efSmrg emit_jump_insn (copy_insn (PATTERN (cinsn)));
82310d565efSmrg jump = as_a <rtx_jump_insn *> (get_last_insn ());
82410d565efSmrg JUMP_LABEL (jump) = JUMP_LABEL (cinsn);
82510d565efSmrg LABEL_NUSES (JUMP_LABEL (jump))++;
82610d565efSmrg redirect_jump (jump, label, 0);
82710d565efSmrg }
82810d565efSmrg else
82910d565efSmrg {
83010d565efSmrg gcc_assert (!cinsn);
83110d565efSmrg
83210d565efSmrg op0 = force_operand (op0, NULL_RTX);
83310d565efSmrg op1 = force_operand (op1, NULL_RTX);
83410d565efSmrg do_compare_rtx_and_jump (op0, op1, comp, 0,
835c7a68eb7Smrg mode, NULL_RTX, NULL, label,
836c7a68eb7Smrg profile_probability::uninitialized ());
83710d565efSmrg jump = as_a <rtx_jump_insn *> (get_last_insn ());
83810d565efSmrg jump->set_jump_target (label);
83910d565efSmrg LABEL_NUSES (label)++;
84010d565efSmrg }
841c7a68eb7Smrg if (prob.initialized_p ())
842c7a68eb7Smrg add_reg_br_prob_note (jump, prob);
84310d565efSmrg
84410d565efSmrg seq = get_insns ();
84510d565efSmrg end_sequence ();
84610d565efSmrg
84710d565efSmrg return seq;
84810d565efSmrg }
84910d565efSmrg
850c7a68eb7Smrg /* Unroll LOOP for which we are able to count number of iterations in
851c7a68eb7Smrg runtime LOOP->LPT_DECISION.TIMES times. The times value must be a
852c7a68eb7Smrg power of two. The transformation does this (with some extra care
853c7a68eb7Smrg for case n < 0):
85410d565efSmrg
85510d565efSmrg for (i = 0; i < n; i++)
85610d565efSmrg body;
85710d565efSmrg
85810d565efSmrg ==> (LOOP->LPT_DECISION.TIMES == 3)
85910d565efSmrg
86010d565efSmrg i = 0;
86110d565efSmrg mod = n % 4;
86210d565efSmrg
86310d565efSmrg switch (mod)
86410d565efSmrg {
86510d565efSmrg case 3:
86610d565efSmrg body; i++;
86710d565efSmrg case 2:
86810d565efSmrg body; i++;
86910d565efSmrg case 1:
87010d565efSmrg body; i++;
87110d565efSmrg case 0: ;
87210d565efSmrg }
87310d565efSmrg
87410d565efSmrg while (i < n)
87510d565efSmrg {
87610d565efSmrg body; i++;
87710d565efSmrg body; i++;
87810d565efSmrg body; i++;
87910d565efSmrg body; i++;
88010d565efSmrg }
88110d565efSmrg */
88210d565efSmrg static void
unroll_loop_runtime_iterations(class loop * loop)883*ec02198aSmrg unroll_loop_runtime_iterations (class loop *loop)
88410d565efSmrg {
88510d565efSmrg rtx old_niter, niter, tmp;
88610d565efSmrg rtx_insn *init_code, *branch_code;
887c7a68eb7Smrg unsigned i, j;
888c7a68eb7Smrg profile_probability p;
88910d565efSmrg basic_block preheader, *body, swtch, ezc_swtch = NULL;
890c7a68eb7Smrg int may_exit_copy;
891c7a68eb7Smrg profile_count iter_count, new_count;
89210d565efSmrg unsigned n_peel;
89310d565efSmrg edge e;
89410d565efSmrg bool extra_zero_check, last_may_exit;
89510d565efSmrg unsigned max_unroll = loop->lpt_decision.times;
896*ec02198aSmrg class niter_desc *desc = get_simple_loop_desc (loop);
89710d565efSmrg bool exit_at_end = loop_exit_at_end_p (loop);
89810d565efSmrg struct opt_info *opt_info = NULL;
89910d565efSmrg bool ok;
90010d565efSmrg
90110d565efSmrg if (flag_split_ivs_in_unroller
90210d565efSmrg || flag_variable_expansion_in_unroller)
90310d565efSmrg opt_info = analyze_insns_in_loop (loop);
90410d565efSmrg
90510d565efSmrg /* Remember blocks whose dominators will have to be updated. */
90610d565efSmrg auto_vec<basic_block> dom_bbs;
90710d565efSmrg
90810d565efSmrg body = get_loop_body (loop);
90910d565efSmrg for (i = 0; i < loop->num_nodes; i++)
91010d565efSmrg {
91110d565efSmrg vec<basic_block> ldom;
91210d565efSmrg basic_block bb;
91310d565efSmrg
91410d565efSmrg ldom = get_dominated_by (CDI_DOMINATORS, body[i]);
91510d565efSmrg FOR_EACH_VEC_ELT (ldom, j, bb)
91610d565efSmrg if (!flow_bb_inside_loop_p (loop, bb))
91710d565efSmrg dom_bbs.safe_push (bb);
91810d565efSmrg
91910d565efSmrg ldom.release ();
92010d565efSmrg }
92110d565efSmrg free (body);
92210d565efSmrg
92310d565efSmrg if (!exit_at_end)
92410d565efSmrg {
92510d565efSmrg /* Leave exit in first copy (for explanation why see comment in
92610d565efSmrg unroll_loop_constant_iterations). */
92710d565efSmrg may_exit_copy = 0;
92810d565efSmrg n_peel = max_unroll - 1;
92910d565efSmrg extra_zero_check = true;
93010d565efSmrg last_may_exit = false;
93110d565efSmrg }
93210d565efSmrg else
93310d565efSmrg {
93410d565efSmrg /* Leave exit in last copy (for explanation why see comment in
93510d565efSmrg unroll_loop_constant_iterations). */
93610d565efSmrg may_exit_copy = max_unroll;
93710d565efSmrg n_peel = max_unroll;
93810d565efSmrg extra_zero_check = false;
93910d565efSmrg last_may_exit = true;
94010d565efSmrg }
94110d565efSmrg
94210d565efSmrg /* Get expression for number of iterations. */
94310d565efSmrg start_sequence ();
94410d565efSmrg old_niter = niter = gen_reg_rtx (desc->mode);
94510d565efSmrg tmp = force_operand (copy_rtx (desc->niter_expr), niter);
94610d565efSmrg if (tmp != niter)
94710d565efSmrg emit_move_insn (niter, tmp);
94810d565efSmrg
94910d565efSmrg /* For loops that exit at end and whose number of iterations is reliable,
95010d565efSmrg add one to niter to account for first pass through loop body before
95110d565efSmrg reaching exit test. */
95210d565efSmrg if (exit_at_end && !desc->noloop_assumptions)
95310d565efSmrg {
95410d565efSmrg niter = expand_simple_binop (desc->mode, PLUS,
95510d565efSmrg niter, const1_rtx,
95610d565efSmrg NULL_RTX, 0, OPTAB_LIB_WIDEN);
95710d565efSmrg old_niter = niter;
95810d565efSmrg }
95910d565efSmrg
96010d565efSmrg /* Count modulo by ANDing it with max_unroll; we use the fact that
96110d565efSmrg the number of unrollings is a power of two, and thus this is correct
96210d565efSmrg even if there is overflow in the computation. */
96310d565efSmrg niter = expand_simple_binop (desc->mode, AND,
96410d565efSmrg niter, gen_int_mode (max_unroll, desc->mode),
96510d565efSmrg NULL_RTX, 0, OPTAB_LIB_WIDEN);
96610d565efSmrg
96710d565efSmrg init_code = get_insns ();
96810d565efSmrg end_sequence ();
96910d565efSmrg unshare_all_rtl_in_chain (init_code);
97010d565efSmrg
97110d565efSmrg /* Precondition the loop. */
97210d565efSmrg split_edge_and_insert (loop_preheader_edge (loop), init_code);
97310d565efSmrg
97410d565efSmrg auto_vec<edge> remove_edges;
97510d565efSmrg
97610d565efSmrg auto_sbitmap wont_exit (max_unroll + 2);
97710d565efSmrg
97810d565efSmrg if (extra_zero_check || desc->noloop_assumptions)
97910d565efSmrg {
98010d565efSmrg /* Peel the first copy of loop body. Leave the exit test if the number
98110d565efSmrg of iterations is not reliable. Also record the place of the extra zero
98210d565efSmrg check. */
98310d565efSmrg bitmap_clear (wont_exit);
98410d565efSmrg if (!desc->noloop_assumptions)
98510d565efSmrg bitmap_set_bit (wont_exit, 1);
98610d565efSmrg ezc_swtch = loop_preheader_edge (loop)->src;
98710d565efSmrg ok = duplicate_loop_to_header_edge (loop, loop_preheader_edge (loop),
98810d565efSmrg 1, wont_exit, desc->out_edge,
98910d565efSmrg &remove_edges,
99010d565efSmrg DLTHE_FLAG_UPDATE_FREQ);
99110d565efSmrg gcc_assert (ok);
99210d565efSmrg }
99310d565efSmrg
99410d565efSmrg /* Record the place where switch will be built for preconditioning. */
99510d565efSmrg swtch = split_edge (loop_preheader_edge (loop));
99610d565efSmrg
997c7a68eb7Smrg /* Compute count increments for each switch block and initialize
99810d565efSmrg innermost switch block. Switch blocks and peeled loop copies are built
99910d565efSmrg from innermost outward. */
1000c7a68eb7Smrg iter_count = new_count = swtch->count.apply_scale (1, max_unroll + 1);
100110d565efSmrg swtch->count = new_count;
100210d565efSmrg
100310d565efSmrg for (i = 0; i < n_peel; i++)
100410d565efSmrg {
100510d565efSmrg /* Peel the copy. */
100610d565efSmrg bitmap_clear (wont_exit);
100710d565efSmrg if (i != n_peel - 1 || !last_may_exit)
100810d565efSmrg bitmap_set_bit (wont_exit, 1);
100910d565efSmrg ok = duplicate_loop_to_header_edge (loop, loop_preheader_edge (loop),
101010d565efSmrg 1, wont_exit, desc->out_edge,
101110d565efSmrg &remove_edges,
101210d565efSmrg DLTHE_FLAG_UPDATE_FREQ);
101310d565efSmrg gcc_assert (ok);
101410d565efSmrg
101510d565efSmrg /* Create item for switch. */
101610d565efSmrg j = n_peel - i - (extra_zero_check ? 0 : 1);
1017c7a68eb7Smrg p = profile_probability::always ().apply_scale (1, i + 2);
101810d565efSmrg
101910d565efSmrg preheader = split_edge (loop_preheader_edge (loop));
1020c7a68eb7Smrg /* Add in count of edge from switch block. */
102110d565efSmrg preheader->count += iter_count;
102221ff1670Smrg branch_code = compare_and_jump_seq (copy_rtx (niter),
102321ff1670Smrg gen_int_mode (j, desc->mode), EQ,
1024c7a68eb7Smrg block_label (preheader), p, NULL);
102510d565efSmrg
102610d565efSmrg /* We rely on the fact that the compare and jump cannot be optimized out,
102710d565efSmrg and hence the cfg we create is correct. */
102810d565efSmrg gcc_assert (branch_code != NULL_RTX);
102910d565efSmrg
103010d565efSmrg swtch = split_edge_and_insert (single_pred_edge (swtch), branch_code);
103110d565efSmrg set_immediate_dominator (CDI_DOMINATORS, preheader, swtch);
1032c7a68eb7Smrg single_succ_edge (swtch)->probability = p.invert ();
103310d565efSmrg new_count += iter_count;
103410d565efSmrg swtch->count = new_count;
103510d565efSmrg e = make_edge (swtch, preheader,
103610d565efSmrg single_succ_edge (swtch)->flags & EDGE_IRREDUCIBLE_LOOP);
103710d565efSmrg e->probability = p;
103810d565efSmrg }
103910d565efSmrg
104010d565efSmrg if (extra_zero_check)
104110d565efSmrg {
104210d565efSmrg /* Add branch for zero iterations. */
1043c7a68eb7Smrg p = profile_probability::always ().apply_scale (1, max_unroll + 1);
104410d565efSmrg swtch = ezc_swtch;
104510d565efSmrg preheader = split_edge (loop_preheader_edge (loop));
1046c7a68eb7Smrg /* Recompute count adjustments since initial peel copy may
104710d565efSmrg have exited and reduced those values that were computed above. */
1048c7a68eb7Smrg iter_count = swtch->count.apply_scale (1, max_unroll + 1);
1049c7a68eb7Smrg /* Add in count of edge from switch block. */
105010d565efSmrg preheader->count += iter_count;
105110d565efSmrg branch_code = compare_and_jump_seq (copy_rtx (niter), const0_rtx, EQ,
105210d565efSmrg block_label (preheader), p,
105310d565efSmrg NULL);
105410d565efSmrg gcc_assert (branch_code != NULL_RTX);
105510d565efSmrg
105610d565efSmrg swtch = split_edge_and_insert (single_succ_edge (swtch), branch_code);
105710d565efSmrg set_immediate_dominator (CDI_DOMINATORS, preheader, swtch);
1058c7a68eb7Smrg single_succ_edge (swtch)->probability = p.invert ();
105910d565efSmrg e = make_edge (swtch, preheader,
106010d565efSmrg single_succ_edge (swtch)->flags & EDGE_IRREDUCIBLE_LOOP);
106110d565efSmrg e->probability = p;
106210d565efSmrg }
106310d565efSmrg
106410d565efSmrg /* Recount dominators for outer blocks. */
106510d565efSmrg iterate_fix_dominators (CDI_DOMINATORS, dom_bbs, false);
106610d565efSmrg
106710d565efSmrg /* And unroll loop. */
106810d565efSmrg
106910d565efSmrg bitmap_ones (wont_exit);
107010d565efSmrg bitmap_clear_bit (wont_exit, may_exit_copy);
107110d565efSmrg opt_info_start_duplication (opt_info);
107210d565efSmrg
107310d565efSmrg ok = duplicate_loop_to_header_edge (loop, loop_latch_edge (loop),
107410d565efSmrg max_unroll,
107510d565efSmrg wont_exit, desc->out_edge,
107610d565efSmrg &remove_edges,
107710d565efSmrg DLTHE_FLAG_UPDATE_FREQ
107810d565efSmrg | (opt_info
107910d565efSmrg ? DLTHE_RECORD_COPY_NUMBER
108010d565efSmrg : 0));
108110d565efSmrg gcc_assert (ok);
108210d565efSmrg
108310d565efSmrg if (opt_info)
108410d565efSmrg {
108510d565efSmrg apply_opt_in_copies (opt_info, max_unroll, true, true);
108610d565efSmrg free_opt_info (opt_info);
108710d565efSmrg }
108810d565efSmrg
108910d565efSmrg if (exit_at_end)
109010d565efSmrg {
109110d565efSmrg basic_block exit_block = get_bb_copy (desc->in_edge->src);
109210d565efSmrg /* Find a new in and out edge; they are in the last copy we have
109310d565efSmrg made. */
109410d565efSmrg
109510d565efSmrg if (EDGE_SUCC (exit_block, 0)->dest == desc->out_edge->dest)
109610d565efSmrg {
109710d565efSmrg desc->out_edge = EDGE_SUCC (exit_block, 0);
109810d565efSmrg desc->in_edge = EDGE_SUCC (exit_block, 1);
109910d565efSmrg }
110010d565efSmrg else
110110d565efSmrg {
110210d565efSmrg desc->out_edge = EDGE_SUCC (exit_block, 1);
110310d565efSmrg desc->in_edge = EDGE_SUCC (exit_block, 0);
110410d565efSmrg }
110510d565efSmrg }
110610d565efSmrg
110710d565efSmrg /* Remove the edges. */
110810d565efSmrg FOR_EACH_VEC_ELT (remove_edges, i, e)
110910d565efSmrg remove_path (e);
111010d565efSmrg
111110d565efSmrg /* We must be careful when updating the number of iterations due to
111210d565efSmrg preconditioning and the fact that the value must be valid at entry
111310d565efSmrg of the loop. After passing through the above code, we see that
111410d565efSmrg the correct new number of iterations is this: */
111510d565efSmrg gcc_assert (!desc->const_iter);
111610d565efSmrg desc->niter_expr =
111710d565efSmrg simplify_gen_binary (UDIV, desc->mode, old_niter,
111810d565efSmrg gen_int_mode (max_unroll + 1, desc->mode));
111910d565efSmrg loop->nb_iterations_upper_bound
112010d565efSmrg = wi::udiv_trunc (loop->nb_iterations_upper_bound, max_unroll + 1);
112110d565efSmrg if (loop->any_estimate)
112210d565efSmrg loop->nb_iterations_estimate
112310d565efSmrg = wi::udiv_trunc (loop->nb_iterations_estimate, max_unroll + 1);
112410d565efSmrg if (loop->any_likely_upper_bound)
112510d565efSmrg loop->nb_iterations_likely_upper_bound
112610d565efSmrg = wi::udiv_trunc (loop->nb_iterations_likely_upper_bound, max_unroll + 1);
112710d565efSmrg if (exit_at_end)
112810d565efSmrg {
112910d565efSmrg desc->niter_expr =
113010d565efSmrg simplify_gen_binary (MINUS, desc->mode, desc->niter_expr, const1_rtx);
113110d565efSmrg desc->noloop_assumptions = NULL_RTX;
113210d565efSmrg --loop->nb_iterations_upper_bound;
113310d565efSmrg if (loop->any_estimate
113410d565efSmrg && loop->nb_iterations_estimate != 0)
113510d565efSmrg --loop->nb_iterations_estimate;
113610d565efSmrg else
113710d565efSmrg loop->any_estimate = false;
113810d565efSmrg if (loop->any_likely_upper_bound
113910d565efSmrg && loop->nb_iterations_likely_upper_bound != 0)
114010d565efSmrg --loop->nb_iterations_likely_upper_bound;
114110d565efSmrg else
114210d565efSmrg loop->any_likely_upper_bound = false;
114310d565efSmrg }
114410d565efSmrg
114510d565efSmrg if (dump_file)
114610d565efSmrg fprintf (dump_file,
114710d565efSmrg ";; Unrolled loop %d times, counting # of iterations "
114810d565efSmrg "in runtime, %i insns\n",
114910d565efSmrg max_unroll, num_loop_insns (loop));
115010d565efSmrg }
115110d565efSmrg
115210d565efSmrg /* Decide whether to unroll LOOP stupidly and how much. */
115310d565efSmrg static void
decide_unroll_stupid(class loop * loop,int flags)1154*ec02198aSmrg decide_unroll_stupid (class loop *loop, int flags)
115510d565efSmrg {
115610d565efSmrg unsigned nunroll, nunroll_by_av, i;
1157*ec02198aSmrg class niter_desc *desc;
115810d565efSmrg widest_int iterations;
115910d565efSmrg
1160c7a68eb7Smrg /* If we were not asked to unroll this loop, just return back silently. */
1161c7a68eb7Smrg if (!(flags & UAP_UNROLL_ALL) && !loop->unroll)
116210d565efSmrg return;
116310d565efSmrg
1164c7a68eb7Smrg if (dump_enabled_p ())
1165c7a68eb7Smrg dump_printf (MSG_NOTE, "considering unrolling loop stupidly\n");
116610d565efSmrg
116710d565efSmrg /* nunroll = total number of copies of the original loop body in
116810d565efSmrg unrolled loop (i.e. if it is 2, we have to duplicate loop body once. */
1169*ec02198aSmrg nunroll = param_max_unrolled_insns / loop->ninsns;
117010d565efSmrg nunroll_by_av
1171*ec02198aSmrg = param_max_average_unrolled_insns / loop->av_ninsns;
117210d565efSmrg if (nunroll > nunroll_by_av)
117310d565efSmrg nunroll = nunroll_by_av;
1174*ec02198aSmrg if (nunroll > (unsigned) param_max_unroll_times)
1175*ec02198aSmrg nunroll = param_max_unroll_times;
117610d565efSmrg
117710d565efSmrg if (targetm.loop_unroll_adjust)
117810d565efSmrg nunroll = targetm.loop_unroll_adjust (nunroll, loop);
117910d565efSmrg
1180c7a68eb7Smrg if (loop->unroll > 0 && loop->unroll < USHRT_MAX)
1181c7a68eb7Smrg nunroll = loop->unroll;
1182c7a68eb7Smrg
118310d565efSmrg /* Skip big loops. */
118410d565efSmrg if (nunroll <= 1)
118510d565efSmrg {
118610d565efSmrg if (dump_file)
118710d565efSmrg fprintf (dump_file, ";; Not considering loop, is too big\n");
118810d565efSmrg return;
118910d565efSmrg }
119010d565efSmrg
119110d565efSmrg /* Check for simple loops. */
119210d565efSmrg desc = get_simple_loop_desc (loop);
119310d565efSmrg
119410d565efSmrg /* Check simpleness. */
119510d565efSmrg if (desc->simple_p && !desc->assumptions)
119610d565efSmrg {
119710d565efSmrg if (dump_file)
1198c7a68eb7Smrg fprintf (dump_file, ";; Loop is simple\n");
119910d565efSmrg return;
120010d565efSmrg }
120110d565efSmrg
120210d565efSmrg /* Do not unroll loops with branches inside -- it increases number
120310d565efSmrg of mispredicts.
120410d565efSmrg TODO: this heuristic needs tunning; call inside the loop body
120510d565efSmrg is also relatively good reason to not unroll. */
120610d565efSmrg if (num_loop_branches (loop) > 1)
120710d565efSmrg {
120810d565efSmrg if (dump_file)
120910d565efSmrg fprintf (dump_file, ";; Not unrolling, contains branches\n");
121010d565efSmrg return;
121110d565efSmrg }
121210d565efSmrg
121310d565efSmrg /* Check whether the loop rolls. */
121410d565efSmrg if ((get_estimated_loop_iterations (loop, &iterations)
121510d565efSmrg || get_likely_max_loop_iterations (loop, &iterations))
121610d565efSmrg && wi::ltu_p (iterations, 2 * nunroll))
121710d565efSmrg {
121810d565efSmrg if (dump_file)
121910d565efSmrg fprintf (dump_file, ";; Not unrolling loop, doesn't roll\n");
122010d565efSmrg return;
122110d565efSmrg }
122210d565efSmrg
122310d565efSmrg /* Success. Now force nunroll to be power of 2, as it seems that this
122410d565efSmrg improves results (partially because of better alignments, partially
122510d565efSmrg because of some dark magic). */
122610d565efSmrg for (i = 1; 2 * i <= nunroll; i *= 2)
122710d565efSmrg continue;
122810d565efSmrg
122910d565efSmrg loop->lpt_decision.decision = LPT_UNROLL_STUPID;
123010d565efSmrg loop->lpt_decision.times = i - 1;
123110d565efSmrg }
123210d565efSmrg
123310d565efSmrg /* Unroll a LOOP LOOP->LPT_DECISION.TIMES times. The transformation does this:
123410d565efSmrg
123510d565efSmrg while (cond)
123610d565efSmrg body;
123710d565efSmrg
123810d565efSmrg ==> (LOOP->LPT_DECISION.TIMES == 3)
123910d565efSmrg
124010d565efSmrg while (cond)
124110d565efSmrg {
124210d565efSmrg body;
124310d565efSmrg if (!cond) break;
124410d565efSmrg body;
124510d565efSmrg if (!cond) break;
124610d565efSmrg body;
124710d565efSmrg if (!cond) break;
124810d565efSmrg body;
124910d565efSmrg }
125010d565efSmrg */
125110d565efSmrg static void
unroll_loop_stupid(class loop * loop)1252*ec02198aSmrg unroll_loop_stupid (class loop *loop)
125310d565efSmrg {
125410d565efSmrg unsigned nunroll = loop->lpt_decision.times;
1255*ec02198aSmrg class niter_desc *desc = get_simple_loop_desc (loop);
125610d565efSmrg struct opt_info *opt_info = NULL;
125710d565efSmrg bool ok;
125810d565efSmrg
125910d565efSmrg if (flag_split_ivs_in_unroller
126010d565efSmrg || flag_variable_expansion_in_unroller)
126110d565efSmrg opt_info = analyze_insns_in_loop (loop);
126210d565efSmrg
126310d565efSmrg auto_sbitmap wont_exit (nunroll + 1);
126410d565efSmrg bitmap_clear (wont_exit);
126510d565efSmrg opt_info_start_duplication (opt_info);
126610d565efSmrg
126710d565efSmrg ok = duplicate_loop_to_header_edge (loop, loop_latch_edge (loop),
126810d565efSmrg nunroll, wont_exit,
126910d565efSmrg NULL, NULL,
127010d565efSmrg DLTHE_FLAG_UPDATE_FREQ
127110d565efSmrg | (opt_info
127210d565efSmrg ? DLTHE_RECORD_COPY_NUMBER
127310d565efSmrg : 0));
127410d565efSmrg gcc_assert (ok);
127510d565efSmrg
127610d565efSmrg if (opt_info)
127710d565efSmrg {
127810d565efSmrg apply_opt_in_copies (opt_info, nunroll, true, true);
127910d565efSmrg free_opt_info (opt_info);
128010d565efSmrg }
128110d565efSmrg
128210d565efSmrg if (desc->simple_p)
128310d565efSmrg {
128410d565efSmrg /* We indeed may get here provided that there are nontrivial assumptions
128510d565efSmrg for a loop to be really simple. We could update the counts, but the
128610d565efSmrg problem is that we are unable to decide which exit will be taken
128710d565efSmrg (not really true in case the number of iterations is constant,
128810d565efSmrg but no one will do anything with this information, so we do not
128910d565efSmrg worry about it). */
129010d565efSmrg desc->simple_p = false;
129110d565efSmrg }
129210d565efSmrg
129310d565efSmrg if (dump_file)
129410d565efSmrg fprintf (dump_file, ";; Unrolled loop %d times, %i insns\n",
129510d565efSmrg nunroll, num_loop_insns (loop));
129610d565efSmrg }
129710d565efSmrg
129810d565efSmrg /* Returns true if REG is referenced in one nondebug insn in LOOP.
129910d565efSmrg Set *DEBUG_USES to the number of debug insns that reference the
130010d565efSmrg variable. */
130110d565efSmrg
130210d565efSmrg static bool
referenced_in_one_insn_in_loop_p(class loop * loop,rtx reg,int * debug_uses)1303*ec02198aSmrg referenced_in_one_insn_in_loop_p (class loop *loop, rtx reg,
130410d565efSmrg int *debug_uses)
130510d565efSmrg {
130610d565efSmrg basic_block *body, bb;
130710d565efSmrg unsigned i;
130810d565efSmrg int count_ref = 0;
130910d565efSmrg rtx_insn *insn;
131010d565efSmrg
131110d565efSmrg body = get_loop_body (loop);
131210d565efSmrg for (i = 0; i < loop->num_nodes; i++)
131310d565efSmrg {
131410d565efSmrg bb = body[i];
131510d565efSmrg
131610d565efSmrg FOR_BB_INSNS (bb, insn)
131710d565efSmrg if (!rtx_referenced_p (reg, insn))
131810d565efSmrg continue;
131910d565efSmrg else if (DEBUG_INSN_P (insn))
132010d565efSmrg ++*debug_uses;
132110d565efSmrg else if (++count_ref > 1)
132210d565efSmrg break;
132310d565efSmrg }
132410d565efSmrg free (body);
132510d565efSmrg return (count_ref == 1);
132610d565efSmrg }
132710d565efSmrg
132810d565efSmrg /* Reset the DEBUG_USES debug insns in LOOP that reference REG. */
132910d565efSmrg
133010d565efSmrg static void
reset_debug_uses_in_loop(class loop * loop,rtx reg,int debug_uses)1331*ec02198aSmrg reset_debug_uses_in_loop (class loop *loop, rtx reg, int debug_uses)
133210d565efSmrg {
133310d565efSmrg basic_block *body, bb;
133410d565efSmrg unsigned i;
133510d565efSmrg rtx_insn *insn;
133610d565efSmrg
133710d565efSmrg body = get_loop_body (loop);
133810d565efSmrg for (i = 0; debug_uses && i < loop->num_nodes; i++)
133910d565efSmrg {
134010d565efSmrg bb = body[i];
134110d565efSmrg
134210d565efSmrg FOR_BB_INSNS (bb, insn)
134310d565efSmrg if (!DEBUG_INSN_P (insn) || !rtx_referenced_p (reg, insn))
134410d565efSmrg continue;
134510d565efSmrg else
134610d565efSmrg {
134710d565efSmrg validate_change (insn, &INSN_VAR_LOCATION_LOC (insn),
134810d565efSmrg gen_rtx_UNKNOWN_VAR_LOC (), 0);
134910d565efSmrg if (!--debug_uses)
135010d565efSmrg break;
135110d565efSmrg }
135210d565efSmrg }
135310d565efSmrg free (body);
135410d565efSmrg }
135510d565efSmrg
135610d565efSmrg /* Determine whether INSN contains an accumulator
135710d565efSmrg which can be expanded into separate copies,
135810d565efSmrg one for each copy of the LOOP body.
135910d565efSmrg
136010d565efSmrg for (i = 0 ; i < n; i++)
136110d565efSmrg sum += a[i];
136210d565efSmrg
136310d565efSmrg ==>
136410d565efSmrg
136510d565efSmrg sum += a[i]
136610d565efSmrg ....
136710d565efSmrg i = i+1;
136810d565efSmrg sum1 += a[i]
136910d565efSmrg ....
137010d565efSmrg i = i+1
137110d565efSmrg sum2 += a[i];
137210d565efSmrg ....
137310d565efSmrg
137410d565efSmrg Return NULL if INSN contains no opportunity for expansion of accumulator.
137510d565efSmrg Otherwise, allocate a VAR_TO_EXPAND structure, fill it with the relevant
137610d565efSmrg information and return a pointer to it.
137710d565efSmrg */
137810d565efSmrg
137910d565efSmrg static struct var_to_expand *
analyze_insn_to_expand_var(class loop * loop,rtx_insn * insn)1380*ec02198aSmrg analyze_insn_to_expand_var (class loop *loop, rtx_insn *insn)
138110d565efSmrg {
138210d565efSmrg rtx set, dest, src;
138310d565efSmrg struct var_to_expand *ves;
138410d565efSmrg unsigned accum_pos;
138510d565efSmrg enum rtx_code code;
138610d565efSmrg int debug_uses = 0;
138710d565efSmrg
138810d565efSmrg set = single_set (insn);
138910d565efSmrg if (!set)
139010d565efSmrg return NULL;
139110d565efSmrg
139210d565efSmrg dest = SET_DEST (set);
139310d565efSmrg src = SET_SRC (set);
139410d565efSmrg code = GET_CODE (src);
139510d565efSmrg
139610d565efSmrg if (code != PLUS && code != MINUS && code != MULT && code != FMA)
139710d565efSmrg return NULL;
139810d565efSmrg
139910d565efSmrg if (FLOAT_MODE_P (GET_MODE (dest)))
140010d565efSmrg {
140110d565efSmrg if (!flag_associative_math)
140210d565efSmrg return NULL;
140310d565efSmrg /* In the case of FMA, we're also changing the rounding. */
140410d565efSmrg if (code == FMA && !flag_unsafe_math_optimizations)
140510d565efSmrg return NULL;
140610d565efSmrg }
140710d565efSmrg
140810d565efSmrg /* Hmm, this is a bit paradoxical. We know that INSN is a valid insn
140910d565efSmrg in MD. But if there is no optab to generate the insn, we cannot
141010d565efSmrg perform the variable expansion. This can happen if an MD provides
141110d565efSmrg an insn but not a named pattern to generate it, for example to avoid
141210d565efSmrg producing code that needs additional mode switches like for x87/mmx.
141310d565efSmrg
141410d565efSmrg So we check have_insn_for which looks for an optab for the operation
141510d565efSmrg in SRC. If it doesn't exist, we can't perform the expansion even
141610d565efSmrg though INSN is valid. */
141710d565efSmrg if (!have_insn_for (code, GET_MODE (src)))
141810d565efSmrg return NULL;
141910d565efSmrg
142010d565efSmrg if (!REG_P (dest)
142110d565efSmrg && !(GET_CODE (dest) == SUBREG
142210d565efSmrg && REG_P (SUBREG_REG (dest))))
142310d565efSmrg return NULL;
142410d565efSmrg
142510d565efSmrg /* Find the accumulator use within the operation. */
142610d565efSmrg if (code == FMA)
142710d565efSmrg {
142810d565efSmrg /* We only support accumulation via FMA in the ADD position. */
142910d565efSmrg if (!rtx_equal_p (dest, XEXP (src, 2)))
143010d565efSmrg return NULL;
143110d565efSmrg accum_pos = 2;
143210d565efSmrg }
143310d565efSmrg else if (rtx_equal_p (dest, XEXP (src, 0)))
143410d565efSmrg accum_pos = 0;
143510d565efSmrg else if (rtx_equal_p (dest, XEXP (src, 1)))
143610d565efSmrg {
143710d565efSmrg /* The method of expansion that we are using; which includes the
143810d565efSmrg initialization of the expansions with zero and the summation of
143910d565efSmrg the expansions at the end of the computation will yield wrong
144010d565efSmrg results for (x = something - x) thus avoid using it in that case. */
144110d565efSmrg if (code == MINUS)
144210d565efSmrg return NULL;
144310d565efSmrg accum_pos = 1;
144410d565efSmrg }
144510d565efSmrg else
144610d565efSmrg return NULL;
144710d565efSmrg
144810d565efSmrg /* It must not otherwise be used. */
144910d565efSmrg if (code == FMA)
145010d565efSmrg {
145110d565efSmrg if (rtx_referenced_p (dest, XEXP (src, 0))
145210d565efSmrg || rtx_referenced_p (dest, XEXP (src, 1)))
145310d565efSmrg return NULL;
145410d565efSmrg }
145510d565efSmrg else if (rtx_referenced_p (dest, XEXP (src, 1 - accum_pos)))
145610d565efSmrg return NULL;
145710d565efSmrg
145810d565efSmrg /* It must be used in exactly one insn. */
145910d565efSmrg if (!referenced_in_one_insn_in_loop_p (loop, dest, &debug_uses))
146010d565efSmrg return NULL;
146110d565efSmrg
146210d565efSmrg if (dump_file)
146310d565efSmrg {
146410d565efSmrg fprintf (dump_file, "\n;; Expanding Accumulator ");
146510d565efSmrg print_rtl (dump_file, dest);
146610d565efSmrg fprintf (dump_file, "\n");
146710d565efSmrg }
146810d565efSmrg
146910d565efSmrg if (debug_uses)
147010d565efSmrg /* Instead of resetting the debug insns, we could replace each
147110d565efSmrg debug use in the loop with the sum or product of all expanded
147210d565efSmrg accumulators. Since we'll only know of all expansions at the
147310d565efSmrg end, we'd have to keep track of which vars_to_expand a debug
147410d565efSmrg insn in the loop references, take note of each copy of the
147510d565efSmrg debug insn during unrolling, and when it's all done, compute
147610d565efSmrg the sum or product of each variable and adjust the original
147710d565efSmrg debug insn and each copy thereof. What a pain! */
147810d565efSmrg reset_debug_uses_in_loop (loop, dest, debug_uses);
147910d565efSmrg
148010d565efSmrg /* Record the accumulator to expand. */
148110d565efSmrg ves = XNEW (struct var_to_expand);
148210d565efSmrg ves->insn = insn;
148310d565efSmrg ves->reg = copy_rtx (dest);
148410d565efSmrg ves->var_expansions.create (1);
148510d565efSmrg ves->next = NULL;
148610d565efSmrg ves->op = GET_CODE (src);
148710d565efSmrg ves->expansion_count = 0;
148810d565efSmrg ves->reuse_expansion = 0;
148910d565efSmrg return ves;
149010d565efSmrg }
149110d565efSmrg
/* Determine whether there is an induction variable in INSN that
   we would like to split during unrolling.

   I.e. replace

   i = i + 1;
   ...
   i = i + 1;
   ...
   i = i + 1;
   ...

   type chains by

   i0 = i + 1
   ...
   i = i0 + 1
   ...
   i = i0 + 2
   ...

   Return NULL if INSN contains no interesting IVs.  Otherwise, allocate
   an IV_TO_SPLIT structure, fill it with the relevant information and return a
   pointer to it.  */

static struct iv_to_split *
analyze_iv_to_split_insn (rtx_insn *insn)
{
  rtx set, dest;
  class rtx_iv iv;
  struct iv_to_split *ivts;
  scalar_int_mode mode;
  bool ok;

  /* For now we just split the basic induction variables.  Later this may be
     extended for example by selecting also addresses of memory references.  */
  set = single_set (insn);
  if (!set)
    return NULL;

  /* Only a plain integer-mode register destination is a candidate.  */
  dest = SET_DEST (set);
  if (!REG_P (dest) || !is_a <scalar_int_mode> (GET_MODE (dest), &mode))
    return NULL;

  if (!biv_p (insn, mode, dest))
    return NULL;

  ok = iv_analyze_result (insn, dest, &iv);

  /* This used to be an assert under the assumption that if biv_p returns
     true that iv_analyze_result must also return true.  However, that
     assumption is not strictly correct as evidenced by pr25569.

     Returning NULL when iv_analyze_result returns false is safe and
     avoids the problems in pr25569 until the iv_analyze_* routines
     can be fixed, which is apparently hard and time consuming
     according to their author.  */
  if (! ok)
    return NULL;

  /* A constant-zero step gives no chain to split; a mode differing from
     the extension mode would need extra extension handling.  */
  if (iv.step == const0_rtx
      || iv.mode != iv.extend_mode)
    return NULL;

  /* Record the insn to split.  BASE_VAR is allocated later, by
     allocate_basic_variable.  */
  ivts = XNEW (struct iv_to_split);
  ivts->insn = insn;
  ivts->orig_var = dest;
  ivts->base_var = NULL_RTX;
  ivts->step = iv.step;
  ivts->next = NULL;

  return ivts;
}
156610d565efSmrg
/* Determines which of insns in LOOP can be optimized.
   Return a OPT_INFO struct with the relevant hash tables filled
   with all insns to be optimized.  The FIRST_NEW_BLOCK field
   is undefined for the return value.  */

static struct opt_info *
analyze_insns_in_loop (class loop *loop)
{
  basic_block *body, bb;
  unsigned i;
  struct opt_info *opt_info = XCNEW (struct opt_info);
  rtx_insn *insn;
  struct iv_to_split *ivts = NULL;
  struct var_to_expand *ves = NULL;
  iv_to_split **slot1;
  var_to_expand **slot2;
  vec<edge> edges = get_loop_exit_edges (loop);
  edge exit;
  bool can_apply = false;

  iv_analysis_loop_init (loop);

  body = get_loop_body (loop);

  if (flag_split_ivs_in_unroller)
    {
      opt_info->insns_to_split
	= new hash_table<iv_split_hasher> (5 * loop->num_nodes);
      opt_info->iv_to_split_head = NULL;
      opt_info->iv_to_split_tail = &opt_info->iv_to_split_head;
    }

  /* Record the loop exit bb and loop preheader before the unrolling.  */
  opt_info->loop_preheader = loop_preheader_edge (loop)->src;

  /* Variable expansion requires a single, non-complex exit, since the
     expansions are summed up in exactly one place after the loop.  The
     exit edge is split so that block becomes available.  */
  if (edges.length () == 1)
    {
      exit = edges[0];
      if (!(exit->flags & EDGE_COMPLEX))
	{
	  opt_info->loop_exit = split_edge (exit);
	  can_apply = true;
	}
    }

  if (flag_variable_expansion_in_unroller
      && can_apply)
    {
      opt_info->insns_with_var_to_expand
	= new hash_table<var_expand_hasher> (5 * loop->num_nodes);
      opt_info->var_to_expand_head = NULL;
      opt_info->var_to_expand_tail = &opt_info->var_to_expand_head;
    }

  for (i = 0; i < loop->num_nodes; i++)
    {
      bb = body[i];
      /* Only look at blocks that dominate the latch, i.e. insns executed
	 on every iteration.  */
      if (!dominated_by_p (CDI_DOMINATORS, loop->latch, bb))
	continue;

      FOR_BB_INSNS (bb, insn)
	{
	  if (!INSN_P (insn))
	    continue;

	  if (opt_info->insns_to_split)
	    ivts = analyze_iv_to_split_insn (insn);

	  if (ivts)
	    {
	      /* Enter the record both into the hash table (for lookup by
		 insn) and onto the tail of the chain (for ordered walks).  */
	      slot1 = opt_info->insns_to_split->find_slot (ivts, INSERT);
	      gcc_assert (*slot1 == NULL);
	      *slot1 = ivts;
	      *opt_info->iv_to_split_tail = ivts;
	      opt_info->iv_to_split_tail = &ivts->next;
	      continue;
	    }

	  if (opt_info->insns_with_var_to_expand)
	    ves = analyze_insn_to_expand_var (loop, insn);

	  if (ves)
	    {
	      /* Same tandem insertion for accumulator expansions.  */
	      slot2 = opt_info->insns_with_var_to_expand->find_slot (ves, INSERT);
	      gcc_assert (*slot2 == NULL);
	      *slot2 = ves;
	      *opt_info->var_to_expand_tail = ves;
	      opt_info->var_to_expand_tail = &ves->next;
	    }
	}
    }

  edges.release ();
  free (body);
  return opt_info;
}
166310d565efSmrg
166410d565efSmrg /* Called just before loop duplication. Records start of duplicated area
166510d565efSmrg to OPT_INFO. */
166610d565efSmrg
166710d565efSmrg static void
opt_info_start_duplication(struct opt_info * opt_info)166810d565efSmrg opt_info_start_duplication (struct opt_info *opt_info)
166910d565efSmrg {
167010d565efSmrg if (opt_info)
167110d565efSmrg opt_info->first_new_block = last_basic_block_for_fn (cfun);
167210d565efSmrg }
167310d565efSmrg
/* Determine the number of iterations between initialization of the base
   variable and the current copy (N_COPY).  N_COPIES is the total number
   of newly created copies.  UNROLLING is true if we are unrolling
   (not peeling) the loop.  */

static unsigned
determine_split_iv_delta (unsigned n_copy, unsigned n_copies, bool unrolling)
{
  /* When unrolling, the initialization is placed in the original loop
     body, which is copy number 0, so the distance is the copy index
     itself.  */
  if (unrolling)
    return n_copy;

  /* When peeling, the initialization lives in copy number 1 and the
     original loop (number 0) comes last, N_COPIES iterations later.  */
  return n_copy ? n_copy - 1 : n_copies;
}
169810d565efSmrg
169910d565efSmrg /* Allocate basic variable for the induction variable chain. */
170010d565efSmrg
170110d565efSmrg static void
allocate_basic_variable(struct iv_to_split * ivts)170210d565efSmrg allocate_basic_variable (struct iv_to_split *ivts)
170310d565efSmrg {
170410d565efSmrg rtx expr = SET_SRC (single_set (ivts->insn));
170510d565efSmrg
170610d565efSmrg ivts->base_var = gen_reg_rtx (GET_MODE (expr));
170710d565efSmrg }
170810d565efSmrg
170910d565efSmrg /* Insert initialization of basic variable of IVTS before INSN, taking
171010d565efSmrg the initial value from INSN. */
171110d565efSmrg
171210d565efSmrg static void
insert_base_initialization(struct iv_to_split * ivts,rtx_insn * insn)171310d565efSmrg insert_base_initialization (struct iv_to_split *ivts, rtx_insn *insn)
171410d565efSmrg {
171510d565efSmrg rtx expr = copy_rtx (SET_SRC (single_set (insn)));
171610d565efSmrg rtx_insn *seq;
171710d565efSmrg
171810d565efSmrg start_sequence ();
171910d565efSmrg expr = force_operand (expr, ivts->base_var);
172010d565efSmrg if (expr != ivts->base_var)
172110d565efSmrg emit_move_insn (ivts->base_var, expr);
172210d565efSmrg seq = get_insns ();
172310d565efSmrg end_sequence ();
172410d565efSmrg
172510d565efSmrg emit_insn_before (seq, insn);
172610d565efSmrg }
172710d565efSmrg
/* Replace the use of induction variable described in IVTS in INSN
   by base variable + DELTA * step.  */

static void
split_iv (struct iv_to_split *ivts, rtx_insn *insn, unsigned delta)
{
  rtx expr, *loc, incr, var;
  rtx_insn *seq;
  machine_mode mode = GET_MODE (ivts->base_var);
  rtx src, dest, set;

  /* Construct base + DELTA * step.  */
  if (!delta)
    expr = ivts->base_var;
  else
    {
      incr = simplify_gen_binary (MULT, mode,
				  copy_rtx (ivts->step),
				  gen_int_mode (delta, mode));
      expr = simplify_gen_binary (PLUS, GET_MODE (ivts->base_var),
				  ivts->base_var, incr);
    }

  /* Figure out where to do the replacement.  */
  loc = &SET_SRC (single_set (insn));

  /* If we can make the replacement right away, we're done.  */
  if (validate_change (insn, loc, expr, 0))
    return;

  /* Otherwise, force EXPR into a register and try again.  */
  start_sequence ();
  var = gen_reg_rtx (mode);
  expr = force_operand (expr, var);
  if (expr != var)
    emit_move_insn (var, expr);
  seq = get_insns ();
  end_sequence ();
  emit_insn_before (seq, insn);

  if (validate_change (insn, loc, var, 0))
    return;

  /* The last chance.  Try recreating the assignment in insn
     completely from scratch.  */
  set = single_set (insn);
  gcc_assert (set);

  start_sequence ();
  /* Overwrite the source slot in INSN's own pattern so that the copy of
     SET_SRC taken just below already refers to VAR.  */
  *loc = var;
  src = copy_rtx (SET_SRC (set));
  dest = copy_rtx (SET_DEST (set));
  src = force_operand (src, dest);
  if (src != dest)
    emit_move_insn (dest, src);
  seq = get_insns ();
  end_sequence ();

  /* The freshly generated sequence fully replaces INSN.  */
  emit_insn_before (seq, insn);
  delete_insn (insn);
}
178910d565efSmrg
179010d565efSmrg
179110d565efSmrg /* Return one expansion of the accumulator recorded in struct VE. */
179210d565efSmrg
179310d565efSmrg static rtx
get_expansion(struct var_to_expand * ve)179410d565efSmrg get_expansion (struct var_to_expand *ve)
179510d565efSmrg {
179610d565efSmrg rtx reg;
179710d565efSmrg
179810d565efSmrg if (ve->reuse_expansion == 0)
179910d565efSmrg reg = ve->reg;
180010d565efSmrg else
180110d565efSmrg reg = ve->var_expansions[ve->reuse_expansion - 1];
180210d565efSmrg
180310d565efSmrg if (ve->var_expansions.length () == (unsigned) ve->reuse_expansion)
180410d565efSmrg ve->reuse_expansion = 0;
180510d565efSmrg else
180610d565efSmrg ve->reuse_expansion++;
180710d565efSmrg
180810d565efSmrg return reg;
180910d565efSmrg }
181010d565efSmrg
181110d565efSmrg
181210d565efSmrg /* Given INSN replace the uses of the accumulator recorded in VE
181310d565efSmrg with a new register. */
181410d565efSmrg
181510d565efSmrg static void
expand_var_during_unrolling(struct var_to_expand * ve,rtx_insn * insn)181610d565efSmrg expand_var_during_unrolling (struct var_to_expand *ve, rtx_insn *insn)
181710d565efSmrg {
181810d565efSmrg rtx new_reg, set;
181910d565efSmrg bool really_new_expansion = false;
182010d565efSmrg
182110d565efSmrg set = single_set (insn);
182210d565efSmrg gcc_assert (set);
182310d565efSmrg
182410d565efSmrg /* Generate a new register only if the expansion limit has not been
182510d565efSmrg reached. Else reuse an already existing expansion. */
1826*ec02198aSmrg if (param_max_variable_expansions > ve->expansion_count)
182710d565efSmrg {
182810d565efSmrg really_new_expansion = true;
182910d565efSmrg new_reg = gen_reg_rtx (GET_MODE (ve->reg));
183010d565efSmrg }
183110d565efSmrg else
183210d565efSmrg new_reg = get_expansion (ve);
183310d565efSmrg
183410d565efSmrg validate_replace_rtx_group (SET_DEST (set), new_reg, insn);
183510d565efSmrg if (apply_change_group ())
183610d565efSmrg if (really_new_expansion)
183710d565efSmrg {
183810d565efSmrg ve->var_expansions.safe_push (new_reg);
183910d565efSmrg ve->expansion_count++;
184010d565efSmrg }
184110d565efSmrg }
184210d565efSmrg
/* Initialize the variable expansions in loop preheader.  PLACE is the
   loop-preheader basic block where the initialization of the
   expansions should take place.  The expansions are initialized with
   (-0) when the operation is plus or minus to honor sign zero.  This
   way we can prevent cases where the sign of the final result is
   effected by the sign of the expansion.  Here is an example to
   demonstrate this:

   for (i = 0 ; i < n; i++)
     sum += something;

   ==>

   sum += something
   ....
   i = i+1;
   sum1 += something
   ....
   i = i+1
   sum2 += something;
   ....

   When SUM is initialized with -zero and SOMETHING is also -zero; the
   final result of sum should be -zero thus the expansions sum1 and sum2
   should be initialized with -zero as well (otherwise we will get +zero
   as the final result).  */

static void
insert_var_expansion_initialization (struct var_to_expand *ve,
				     basic_block place)
{
  rtx_insn *seq;
  rtx var, zero_init;
  unsigned i;
  machine_mode mode = GET_MODE (ve->reg);
  /* Whether -0.0 must be distinguished from +0.0 in this mode; if so the
     additive identity used below is -0.0, per the comment above.  */
  bool honor_signed_zero_p = HONOR_SIGNED_ZEROS (mode);

  /* Nothing to do if no expansion registers were ever created.  */
  if (ve->var_expansions.length () == 0)
    return;

  start_sequence ();
  switch (ve->op)
    {
    case FMA:
      /* Note that we only accumulate FMA via the ADD operand.  */
      /* FALLTHRU */
    case PLUS:
    case MINUS:
      FOR_EACH_VEC_ELT (ve->var_expansions, i, var)
	{
	  if (honor_signed_zero_p)
	    zero_init = simplify_gen_unary (NEG, mode, CONST0_RTX (mode), mode);
	  else
	    zero_init = CONST0_RTX (mode);
	  emit_move_insn (var, zero_init);
	}
      break;

    case MULT:
      /* The multiplicative identity is 1.  */
      FOR_EACH_VEC_ELT (ve->var_expansions, i, var)
	{
	  zero_init = CONST1_RTX (GET_MODE (var));
	  emit_move_insn (var, zero_init);
	}
      break;

    default:
      gcc_unreachable ();
    }

  seq = get_insns ();
  end_sequence ();

  emit_insn_after (seq, BB_END (place));
}
191710d565efSmrg
/* Combine the variable expansions at the loop exit.  PLACE is the
   loop exit basic block where the summation of the expansions should
   take place.  */

static void
combine_var_copies_in_loop_exit (struct var_to_expand *ve, basic_block place)
{
  rtx sum = ve->reg;
  rtx expr, var;
  rtx_insn *seq, *insn;
  unsigned i;

  /* Nothing to combine if no expansion registers were ever created.  */
  if (ve->var_expansions.length () == 0)
    return;

  /* ve->reg might be SUBREG or some other non-shareable RTL, and we use
     it both here and as the destination of the assignment.  */
  sum = copy_rtx (sum);
  start_sequence ();
  switch (ve->op)
    {
    case FMA:
      /* Note that we only accumulate FMA via the ADD operand.  */
      /* FALLTHRU */
    case PLUS:
    case MINUS:
      /* MINUS accumulators were also initialized additively, so the
	 partial results are combined with PLUS as well.  */
      FOR_EACH_VEC_ELT (ve->var_expansions, i, var)
	sum = simplify_gen_binary (PLUS, GET_MODE (ve->reg), var, sum);
      break;

    case MULT:
      FOR_EACH_VEC_ELT (ve->var_expansions, i, var)
	sum = simplify_gen_binary (MULT, GET_MODE (ve->reg), var, sum);
      break;

    default:
      gcc_unreachable ();
    }

  /* Store the combined value back into the original accumulator.  */
  expr = force_operand (sum, ve->reg);
  if (expr != ve->reg)
    emit_move_insn (ve->reg, expr);
  seq = get_insns ();
  end_sequence ();

  /* Insert the sequence right after the NOTE_INSN_BASIC_BLOCK marker of
     PLACE, i.e. at the very start of the exit block.  */
  insn = BB_HEAD (place);
  while (!NOTE_INSN_BASIC_BLOCK_P (insn))
    insn = NEXT_INSN (insn);

  emit_insn_after (seq, insn);
}
196810d565efSmrg
196910d565efSmrg /* Strip away REG_EQUAL notes for IVs we're splitting.
197010d565efSmrg
197110d565efSmrg Updating REG_EQUAL notes for IVs we split is tricky: We
197210d565efSmrg cannot tell until after unrolling, DF-rescanning, and liveness
197310d565efSmrg updating, whether an EQ_USE is reached by the split IV while
197410d565efSmrg the IV reg is still live. See PR55006.
197510d565efSmrg
197610d565efSmrg ??? We cannot use remove_reg_equal_equiv_notes_for_regno,
197710d565efSmrg because RTL loop-iv requires us to defer rescanning insns and
197810d565efSmrg any notes attached to them. So resort to old techniques... */
197910d565efSmrg
198010d565efSmrg static void
maybe_strip_eq_note_for_split_iv(struct opt_info * opt_info,rtx_insn * insn)198110d565efSmrg maybe_strip_eq_note_for_split_iv (struct opt_info *opt_info, rtx_insn *insn)
198210d565efSmrg {
198310d565efSmrg struct iv_to_split *ivts;
198410d565efSmrg rtx note = find_reg_equal_equiv_note (insn);
198510d565efSmrg if (! note)
198610d565efSmrg return;
198710d565efSmrg for (ivts = opt_info->iv_to_split_head; ivts; ivts = ivts->next)
198810d565efSmrg if (reg_mentioned_p (ivts->orig_var, note))
198910d565efSmrg {
199010d565efSmrg remove_note (insn, note);
199110d565efSmrg return;
199210d565efSmrg }
199310d565efSmrg }
199410d565efSmrg
/* Apply loop optimizations in loop copies using the
   data which gathered during the unrolling.  Structure
   OPT_INFO record that data.

   UNROLLING is true if we unrolled (not peeled) the loop.
   REWRITE_ORIGINAL_BODY is true if we should also rewrite the original body of
   the loop (as it should happen in complete unrolling, but not in ordinary
   peeling of the loop).  */

static void
apply_opt_in_copies (struct opt_info *opt_info,
		     unsigned n_copies, bool unrolling,
		     bool rewrite_original_loop)
{
  unsigned i, delta;
  basic_block bb, orig_bb;
  rtx_insn *insn, *orig_insn, *next;
  struct iv_to_split ivts_templ, *ivts;
  struct var_to_expand ve_templ, *ves;

  /* Sanity check -- we need to put initialization in the original loop
     body.  */
  gcc_assert (!unrolling || rewrite_original_loop);

  /* Allocate the basic variables (i0).  */
  if (opt_info->insns_to_split)
    for (ivts = opt_info->iv_to_split_head; ivts; ivts = ivts->next)
      allocate_basic_variable (ivts);

  /* Walk all blocks created by duplication (index >= first_new_block).  */
  for (i = opt_info->first_new_block;
       i < (unsigned) last_basic_block_for_fn (cfun);
       i++)
    {
      bb = BASIC_BLOCK_FOR_FN (cfun, i);
      orig_bb = get_bb_original (bb);

      /* bb->aux holds position in copy sequence initialized by
	 duplicate_loop_to_header_edge.  */
      delta = determine_split_iv_delta ((size_t)bb->aux, n_copies,
					unrolling);
      bb->aux = 0;
      /* Walk the copied block and the original block in lock step; the
	 hash tables are keyed by the ORIGINAL insn, so each copied insn
	 must be paired with its original.  */
      orig_insn = BB_HEAD (orig_bb);
      FOR_BB_INSNS_SAFE (bb, insn, next)
	{
	  /* Skip non-insns, and debug binds for LABEL_DECLs, which do not
	     correspond 1:1 between original and copy.  */
	  if (!INSN_P (insn)
	      || (DEBUG_BIND_INSN_P (insn)
		  && INSN_VAR_LOCATION_DECL (insn)
		  && TREE_CODE (INSN_VAR_LOCATION_DECL (insn)) == LABEL_DECL))
	    continue;

	  /* Advance ORIG_INSN past the same kind of skippable insns so it
	     stays aligned with INSN.  */
	  while (!INSN_P (orig_insn)
		 || (DEBUG_BIND_INSN_P (orig_insn)
		     && INSN_VAR_LOCATION_DECL (orig_insn)
		     && (TREE_CODE (INSN_VAR_LOCATION_DECL (orig_insn))
			 == LABEL_DECL)))
	    orig_insn = NEXT_INSN (orig_insn);

	  ivts_templ.insn = orig_insn;
	  ve_templ.insn = orig_insn;

	  /* Apply splitting iv optimization.  */
	  if (opt_info->insns_to_split)
	    {
	      maybe_strip_eq_note_for_split_iv (opt_info, insn);

	      ivts = opt_info->insns_to_split->find (&ivts_templ);

	      if (ivts)
		{
		  gcc_assert (GET_CODE (PATTERN (insn))
			      == GET_CODE (PATTERN (orig_insn)));

		  /* Copy 0 (delta == 0) also receives the initialization
		     of the chain's base variable.  */
		  if (!delta)
		    insert_base_initialization (ivts, insn);
		  split_iv (ivts, insn, delta);
		}
	    }
	  /* Apply variable expansion optimization.  */
	  if (unrolling && opt_info->insns_with_var_to_expand)
	    {
	      ves = (struct var_to_expand *)
		opt_info->insns_with_var_to_expand->find (&ve_templ);
	      if (ves)
		{
		  gcc_assert (GET_CODE (PATTERN (insn))
			      == GET_CODE (PATTERN (orig_insn)));
		  expand_var_during_unrolling (ves, insn);
		}
	    }
	  orig_insn = NEXT_INSN (orig_insn);
	}
    }

  if (!rewrite_original_loop)
    return;

  /* Initialize the variable expansions in the loop preheader
     and take care of combining them at the loop exit.  */
  if (opt_info->insns_with_var_to_expand)
    {
      for (ves = opt_info->var_to_expand_head; ves; ves = ves->next)
	insert_var_expansion_initialization (ves, opt_info->loop_preheader);
      for (ves = opt_info->var_to_expand_head; ves; ves = ves->next)
	combine_var_copies_in_loop_exit (ves, opt_info->loop_exit);
    }

  /* Rewrite also the original loop body.  Find them as originals of the blocks
     in the last copied iteration, i.e. those that have
     get_bb_copy (get_bb_original (bb)) == bb.  */
  for (i = opt_info->first_new_block;
       i < (unsigned) last_basic_block_for_fn (cfun);
       i++)
    {
      bb = BASIC_BLOCK_FOR_FN (cfun, i);
      orig_bb = get_bb_original (bb);
      if (get_bb_copy (orig_bb) != bb)
	continue;

      /* Copy number 0 is the original body, so its delta is fixed.  */
      delta = determine_split_iv_delta (0, n_copies, unrolling);
      for (orig_insn = BB_HEAD (orig_bb);
	   orig_insn != NEXT_INSN (BB_END (bb));
	   orig_insn = next)
	{
	  next = NEXT_INSN (orig_insn);

	  if (!INSN_P (orig_insn))
	    continue;

	  ivts_templ.insn = orig_insn;
	  if (opt_info->insns_to_split)
	    {
	      maybe_strip_eq_note_for_split_iv (opt_info, orig_insn);

	      ivts = (struct iv_to_split *)
		opt_info->insns_to_split->find (&ivts_templ);
	      if (ivts)
		{
		  if (!delta)
		    insert_base_initialization (ivts, orig_insn);
		  split_iv (ivts, orig_insn, delta);
		  continue;
		}
	    }

	}
    }
}
214210d565efSmrg
214310d565efSmrg /* Release OPT_INFO. */
214410d565efSmrg
214510d565efSmrg static void
free_opt_info(struct opt_info * opt_info)214610d565efSmrg free_opt_info (struct opt_info *opt_info)
214710d565efSmrg {
214810d565efSmrg delete opt_info->insns_to_split;
214910d565efSmrg opt_info->insns_to_split = NULL;
215010d565efSmrg if (opt_info->insns_with_var_to_expand)
215110d565efSmrg {
215210d565efSmrg struct var_to_expand *ves;
215310d565efSmrg
215410d565efSmrg for (ves = opt_info->var_to_expand_head; ves; ves = ves->next)
215510d565efSmrg ves->var_expansions.release ();
215610d565efSmrg delete opt_info->insns_with_var_to_expand;
215710d565efSmrg opt_info->insns_with_var_to_expand = NULL;
215810d565efSmrg }
215910d565efSmrg free (opt_info);
216010d565efSmrg }
2161