138fd1498Szrj /* Loop unrolling.
238fd1498Szrj Copyright (C) 2002-2018 Free Software Foundation, Inc.
338fd1498Szrj
438fd1498Szrj This file is part of GCC.
538fd1498Szrj
638fd1498Szrj GCC is free software; you can redistribute it and/or modify it under
738fd1498Szrj the terms of the GNU General Public License as published by the Free
838fd1498Szrj Software Foundation; either version 3, or (at your option) any later
938fd1498Szrj version.
1038fd1498Szrj
1138fd1498Szrj GCC is distributed in the hope that it will be useful, but WITHOUT ANY
1238fd1498Szrj WARRANTY; without even the implied warranty of MERCHANTABILITY or
1338fd1498Szrj FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License
1438fd1498Szrj for more details.
1538fd1498Szrj
1638fd1498Szrj You should have received a copy of the GNU General Public License
1738fd1498Szrj along with GCC; see the file COPYING3. If not see
1838fd1498Szrj <http://www.gnu.org/licenses/>. */
1938fd1498Szrj
2038fd1498Szrj #include "config.h"
2138fd1498Szrj #include "system.h"
2238fd1498Szrj #include "coretypes.h"
2338fd1498Szrj #include "backend.h"
2438fd1498Szrj #include "target.h"
2538fd1498Szrj #include "rtl.h"
2638fd1498Szrj #include "tree.h"
2738fd1498Szrj #include "cfghooks.h"
2838fd1498Szrj #include "memmodel.h"
2938fd1498Szrj #include "optabs.h"
3038fd1498Szrj #include "emit-rtl.h"
3138fd1498Szrj #include "recog.h"
3238fd1498Szrj #include "profile.h"
3338fd1498Szrj #include "cfgrtl.h"
3438fd1498Szrj #include "cfgloop.h"
3538fd1498Szrj #include "params.h"
3638fd1498Szrj #include "dojump.h"
3738fd1498Szrj #include "expr.h"
3838fd1498Szrj #include "dumpfile.h"
3938fd1498Szrj
4038fd1498Szrj /* This pass performs loop unrolling. We only perform this
4138fd1498Szrj optimization on innermost loops (with single exception) because
4238fd1498Szrj the impact on performance is greatest here, and we want to avoid
4338fd1498Szrj unnecessary code size growth. The gain is caused by greater sequentiality
4438fd1498Szrj of code, better code to optimize for further passes and in some cases
4538fd1498Szrj by fewer testings of exit conditions. The main problem is code growth,
4638fd1498Szrj that impacts performance negatively due to effect of caches.
4738fd1498Szrj
4838fd1498Szrj What we do:
4938fd1498Szrj
5038fd1498Szrj -- unrolling of loops that roll constant times; this is almost always
5138fd1498Szrj win, as we get rid of exit condition tests.
5238fd1498Szrj -- unrolling of loops that roll number of times that we can compute
5338fd1498Szrj in runtime; we also get rid of exit condition tests here, but there
5438fd1498Szrj is the extra expense for calculating the number of iterations
5538fd1498Szrj -- simple unrolling of remaining loops; this is performed only if we
5638fd1498Szrj are asked to, as the gain is questionable in this case and often
5738fd1498Szrj it may even slow down the code
5838fd1498Szrj For more detailed descriptions of each of those, see comments at
5938fd1498Szrj appropriate function below.
6038fd1498Szrj
   There are a number of parameters (defined and described in params.def)
   that control how much we unroll.
6338fd1498Szrj
   ??? A great problem is that we don't have a good way to determine
   how many times we should unroll the loop; the experiments I have made
   showed that this choice may affect performance on the order of several
   percent.  */
6838fd1498Szrj
/* Information about induction variables to split.  One record is kept per
   insn that uses the IV; the list is threaded through NEXT in the order the
   insns were analyzed.  */

struct iv_to_split
{
  rtx_insn *insn;		/* The insn in which the induction variable
				   occurs.  */
  rtx orig_var;			/* The variable (register) for the IV before
				   the split.  */
  rtx base_var;			/* The variable on which the values in the
				   further iterations are based.  */
  rtx step;			/* Step of the induction variable.  */
  struct iv_to_split *next;	/* Next entry in walking order.  */
};
8038fd1498Szrj
/* Information about accumulators to expand.  */

struct var_to_expand
{
  rtx_insn *insn;		/* The insn in which the variable expansion
				   occurs.  */
  rtx reg;			/* The accumulator which is expanded.  */
  vec<rtx> var_expansions;	/* The copies of the accumulator which is
				   expanded.  */
  struct var_to_expand *next;	/* Next entry in walking order.  */
  enum rtx_code op;		/* The type of the accumulation - addition,
				   subtraction or multiplication.  */
  int expansion_count;		/* Count the number of expansions generated
				   so far.  */
  int reuse_expansion;		/* The expansion we intend to reuse to expand
				   the accumulator.  If REUSE_EXPANSION is 0
				   reuse the original accumulator.  Else use
				   var_expansions[REUSE_EXPANSION - 1].  */
};
9738fd1498Szrj
/* Hashtable helper for iv_to_split.  Entries are keyed by the insn they
   are attached to (see hash/equal below).  */

struct iv_split_hasher : free_ptr_hash <iv_to_split>
{
  /* Hash an entry by the UID of its insn.  */
  static inline hashval_t hash (const iv_to_split *);
  /* Two entries are equal iff they refer to the same insn.  */
  static inline bool equal (const iv_to_split *, const iv_to_split *);
};
10538fd1498Szrj
10638fd1498Szrj
10738fd1498Szrj /* A hash function for information about insns to split. */
10838fd1498Szrj
10938fd1498Szrj inline hashval_t
hash(const iv_to_split * ivts)11038fd1498Szrj iv_split_hasher::hash (const iv_to_split *ivts)
11138fd1498Szrj {
11238fd1498Szrj return (hashval_t) INSN_UID (ivts->insn);
11338fd1498Szrj }
11438fd1498Szrj
11538fd1498Szrj /* An equality functions for information about insns to split. */
11638fd1498Szrj
11738fd1498Szrj inline bool
equal(const iv_to_split * i1,const iv_to_split * i2)11838fd1498Szrj iv_split_hasher::equal (const iv_to_split *i1, const iv_to_split *i2)
11938fd1498Szrj {
12038fd1498Szrj return i1->insn == i2->insn;
12138fd1498Szrj }
12238fd1498Szrj
/* Hashtable helper for var_to_expand.  Entries are keyed by the insn they
   are attached to (see hash/equal below).  */

struct var_expand_hasher : free_ptr_hash <var_to_expand>
{
  /* Hash an entry by the UID of its insn.  */
  static inline hashval_t hash (const var_to_expand *);
  /* Two entries are equal iff they refer to the same insn.  */
  static inline bool equal (const var_to_expand *, const var_to_expand *);
};
13038fd1498Szrj
13138fd1498Szrj /* Return a hash for VES. */
13238fd1498Szrj
13338fd1498Szrj inline hashval_t
hash(const var_to_expand * ves)13438fd1498Szrj var_expand_hasher::hash (const var_to_expand *ves)
13538fd1498Szrj {
13638fd1498Szrj return (hashval_t) INSN_UID (ves->insn);
13738fd1498Szrj }
13838fd1498Szrj
13938fd1498Szrj /* Return true if I1 and I2 refer to the same instruction. */
14038fd1498Szrj
14138fd1498Szrj inline bool
equal(const var_to_expand * i1,const var_to_expand * i2)14238fd1498Szrj var_expand_hasher::equal (const var_to_expand *i1, const var_to_expand *i2)
14338fd1498Szrj {
14438fd1498Szrj return i1->insn == i2->insn;
14538fd1498Szrj }
14638fd1498Szrj
/* Information about optimization applied in
   the unrolled loop.  Built by analyze_insns_in_loop and consumed by
   apply_opt_in_copies; released with free_opt_info.  */

struct opt_info
{
  hash_table<iv_split_hasher> *insns_to_split; /* A hashtable of insns to
						  split.  */
  struct iv_to_split *iv_to_split_head; /* The first iv to split.  */
  struct iv_to_split **iv_to_split_tail; /* Pointer to the tail of the list.  */
  hash_table<var_expand_hasher> *insns_with_var_to_expand; /* A hashtable of
					insns with accumulators to expand.  */
  struct var_to_expand *var_to_expand_head; /* The first var to expand.  */
  struct var_to_expand **var_to_expand_tail; /* Pointer to the tail of the list.  */
  unsigned first_new_block;	/* The first basic block that was
				   duplicated.  */
  basic_block loop_exit;	/* The loop exit basic block.  */
  basic_block loop_preheader;	/* The loop preheader basic block.  */
};
16538fd1498Szrj
/* Forward declarations of the local routines: the decide_* functions pick
   an unrolling strategy per loop, the unroll_* functions carry it out, and
   the remaining helpers implement IV splitting and accumulator expansion
   in the unrolled copies.  */

static void decide_unroll_stupid (struct loop *, int);
static void decide_unroll_constant_iterations (struct loop *, int);
static void decide_unroll_runtime_iterations (struct loop *, int);
static void unroll_loop_stupid (struct loop *);
static void decide_unrolling (int);
static void unroll_loop_constant_iterations (struct loop *);
static void unroll_loop_runtime_iterations (struct loop *);
static struct opt_info *analyze_insns_in_loop (struct loop *);
static void opt_info_start_duplication (struct opt_info *);
static void apply_opt_in_copies (struct opt_info *, unsigned, bool, bool);
static void free_opt_info (struct opt_info *);
static struct var_to_expand *analyze_insn_to_expand_var (struct loop*, rtx_insn *);
static bool referenced_in_one_insn_in_loop_p (struct loop *, rtx, int *);
static struct iv_to_split *analyze_iv_to_split_insn (rtx_insn *);
static void expand_var_during_unrolling (struct var_to_expand *, rtx_insn *);
static void insert_var_expansion_initialization (struct var_to_expand *,
						 basic_block);
static void combine_var_copies_in_loop_exit (struct var_to_expand *,
					     basic_block);
static rtx get_expansion (struct var_to_expand *);
18638fd1498Szrj
18738fd1498Szrj /* Emit a message summarizing the unroll that will be
18838fd1498Szrj performed for LOOP, along with the loop's location LOCUS, if
18938fd1498Szrj appropriate given the dump or -fopt-info settings. */
19038fd1498Szrj
19138fd1498Szrj static void
report_unroll(struct loop * loop,location_t locus)19238fd1498Szrj report_unroll (struct loop *loop, location_t locus)
19338fd1498Szrj {
19438fd1498Szrj dump_flags_t report_flags = MSG_OPTIMIZED_LOCATIONS | TDF_DETAILS;
19538fd1498Szrj
19638fd1498Szrj if (loop->lpt_decision.decision == LPT_NONE)
19738fd1498Szrj return;
19838fd1498Szrj
19938fd1498Szrj if (!dump_enabled_p ())
20038fd1498Szrj return;
20138fd1498Szrj
20238fd1498Szrj dump_printf_loc (report_flags, locus,
20338fd1498Szrj "loop unrolled %d times",
20438fd1498Szrj loop->lpt_decision.times);
20538fd1498Szrj if (profile_info && loop->header->count.initialized_p ())
20638fd1498Szrj dump_printf (report_flags,
20738fd1498Szrj " (header execution count %d)",
20838fd1498Szrj (int)loop->header->count.to_gcov_type ());
20938fd1498Szrj
21038fd1498Szrj dump_printf (report_flags, "\n");
21138fd1498Szrj }
21238fd1498Szrj
/* Decide whether unroll loops and how much.  FLAGS is a bitmask of UAP_*
   values passed down to the individual decide_* heuristics.  Records the
   choice in each loop's lpt_decision.  */
static void
decide_unrolling (int flags)
{
  struct loop *loop;

  /* Scan the loops, inner ones first.  */
  FOR_EACH_LOOP (loop, LI_FROM_INNERMOST)
    {
      loop->lpt_decision.decision = LPT_NONE;
      location_t locus = get_loop_location (loop);

      if (dump_enabled_p ())
	dump_printf_loc (MSG_NOTE, locus,
			 "considering unrolling loop %d at BB %d\n",
			 loop->num, loop->header->index);

      /* loop->unroll == 1 records an explicit user request not to unroll
	 this loop.  */
      if (loop->unroll == 1)
	{
	  if (dump_file)
	    fprintf (dump_file,
		     ";; Not unrolling loop, user didn't want it unrolled\n");
	  continue;
	}

      /* Do not peel cold areas.  */
      if (optimize_loop_for_size_p (loop))
	{
	  if (dump_file)
	    fprintf (dump_file, ";; Not considering loop, cold area\n");
	  continue;
	}

      /* Can the loop be manipulated?  */
      if (!can_duplicate_loop_p (loop))
	{
	  if (dump_file)
	    fprintf (dump_file,
		     ";; Not considering loop, cannot duplicate\n");
	  continue;
	}

      /* Skip non-innermost loops.  */
      if (loop->inner)
	{
	  if (dump_file)
	    fprintf (dump_file, ";; Not considering loop, is not innermost\n");
	  continue;
	}

      /* Cache the insn counts the heuristics below divide by.  */
      loop->ninsns = num_loop_insns (loop);
      loop->av_ninsns = average_num_loop_insns (loop);

      /* Try transformations one by one in decreasing order of priority.  */
      decide_unroll_constant_iterations (loop, flags);
      if (loop->lpt_decision.decision == LPT_NONE)
	decide_unroll_runtime_iterations (loop, flags);
      if (loop->lpt_decision.decision == LPT_NONE)
	decide_unroll_stupid (loop, flags);

      report_unroll (loop, locus);
    }
}
27638fd1498Szrj
27738fd1498Szrj /* Unroll LOOPS. */
27838fd1498Szrj void
unroll_loops(int flags)27938fd1498Szrj unroll_loops (int flags)
28038fd1498Szrj {
28138fd1498Szrj struct loop *loop;
28238fd1498Szrj bool changed = false;
28338fd1498Szrj
28438fd1498Szrj /* Now decide rest of unrolling. */
28538fd1498Szrj decide_unrolling (flags);
28638fd1498Szrj
28738fd1498Szrj /* Scan the loops, inner ones first. */
28838fd1498Szrj FOR_EACH_LOOP (loop, LI_FROM_INNERMOST)
28938fd1498Szrj {
29038fd1498Szrj /* And perform the appropriate transformations. */
29138fd1498Szrj switch (loop->lpt_decision.decision)
29238fd1498Szrj {
29338fd1498Szrj case LPT_UNROLL_CONSTANT:
29438fd1498Szrj unroll_loop_constant_iterations (loop);
29538fd1498Szrj changed = true;
29638fd1498Szrj break;
29738fd1498Szrj case LPT_UNROLL_RUNTIME:
29838fd1498Szrj unroll_loop_runtime_iterations (loop);
29938fd1498Szrj changed = true;
30038fd1498Szrj break;
30138fd1498Szrj case LPT_UNROLL_STUPID:
30238fd1498Szrj unroll_loop_stupid (loop);
30338fd1498Szrj changed = true;
30438fd1498Szrj break;
30538fd1498Szrj case LPT_NONE:
30638fd1498Szrj break;
30738fd1498Szrj default:
30838fd1498Szrj gcc_unreachable ();
30938fd1498Szrj }
31038fd1498Szrj }
31138fd1498Szrj
31238fd1498Szrj if (changed)
31338fd1498Szrj {
31438fd1498Szrj calculate_dominance_info (CDI_DOMINATORS);
31538fd1498Szrj fix_loop_structure (NULL);
31638fd1498Szrj }
31738fd1498Szrj
31838fd1498Szrj iv_analysis_done ();
31938fd1498Szrj }
32038fd1498Szrj
32138fd1498Szrj /* Check whether exit of the LOOP is at the end of loop body. */
32238fd1498Szrj
32338fd1498Szrj static bool
loop_exit_at_end_p(struct loop * loop)32438fd1498Szrj loop_exit_at_end_p (struct loop *loop)
32538fd1498Szrj {
32638fd1498Szrj struct niter_desc *desc = get_simple_loop_desc (loop);
32738fd1498Szrj rtx_insn *insn;
32838fd1498Szrj
32938fd1498Szrj /* We should never have conditional in latch block. */
33038fd1498Szrj gcc_assert (desc->in_edge->dest != loop->header);
33138fd1498Szrj
33238fd1498Szrj if (desc->in_edge->dest != loop->latch)
33338fd1498Szrj return false;
33438fd1498Szrj
33538fd1498Szrj /* Check that the latch is empty. */
33638fd1498Szrj FOR_BB_INSNS (loop->latch, insn)
33738fd1498Szrj {
33838fd1498Szrj if (INSN_P (insn) && active_insn_p (insn))
33938fd1498Szrj return false;
34038fd1498Szrj }
34138fd1498Szrj
34238fd1498Szrj return true;
34338fd1498Szrj }
34438fd1498Szrj
/* Decide whether to unroll LOOP iterating constant number of times
   and how much.  FLAGS is a bitmask of UAP_* values.  On success sets
   loop->lpt_decision to LPT_UNROLL_CONSTANT with the chosen factor.  */

static void
decide_unroll_constant_iterations (struct loop *loop, int flags)
{
  unsigned nunroll, nunroll_by_av, best_copies, best_unroll = 0, n_copies, i;
  struct niter_desc *desc;
  widest_int iterations;

  /* If we were not asked to unroll this loop, just return back silently.  */
  if (!(flags & UAP_UNROLL) && !loop->unroll)
    return;

  if (dump_enabled_p ())
    dump_printf (MSG_NOTE,
		 "considering unrolling loop with constant "
		 "number of iterations\n");

  /* nunroll = total number of copies of the original loop body in
     unrolled loop (i.e. if it is 2, we have to duplicate loop body once).
     Bound it by both the total and the average insn-count budgets.  */
  nunroll = PARAM_VALUE (PARAM_MAX_UNROLLED_INSNS) / loop->ninsns;
  nunroll_by_av
    = PARAM_VALUE (PARAM_MAX_AVERAGE_UNROLLED_INSNS) / loop->av_ninsns;
  if (nunroll > nunroll_by_av)
    nunroll = nunroll_by_av;
  if (nunroll > (unsigned) PARAM_VALUE (PARAM_MAX_UNROLL_TIMES))
    nunroll = PARAM_VALUE (PARAM_MAX_UNROLL_TIMES);

  /* Let the target make a final adjustment if it provides a hook.  */
  if (targetm.loop_unroll_adjust)
    nunroll = targetm.loop_unroll_adjust (nunroll, loop);

  /* Skip big loops.  */
  if (nunroll <= 1)
    {
      if (dump_file)
	fprintf (dump_file, ";; Not considering loop, is too big\n");
      return;
    }

  /* Check for simple loops.  */
  desc = get_simple_loop_desc (loop);

  /* Check number of iterations.  */
  if (!desc->simple_p || !desc->const_iter || desc->assumptions)
    {
      if (dump_file)
	fprintf (dump_file,
		 ";; Unable to prove that the loop iterates constant times\n");
      return;
    }

  /* Check for an explicit unrolling factor.  */
  if (loop->unroll > 0 && loop->unroll < USHRT_MAX)
    {
      /* However we cannot unroll completely at the RTL level a loop with
	 constant number of iterations; it should have been peeled instead.  */
      if (desc->niter == 0 || (unsigned) loop->unroll > desc->niter - 1)
	{
	  if (dump_file)
	    fprintf (dump_file, ";; Loop should have been peeled\n");
	}
      else
	{
	  /* TIMES is the number of extra copies, hence factor - 1.  */
	  loop->lpt_decision.decision = LPT_UNROLL_CONSTANT;
	  loop->lpt_decision.times = loop->unroll - 1;
	}
      return;
    }

  /* Check whether the loop rolls enough to consider.
     Consult also loop bounds and profile; in the case the loop has more
     than one exit it may well loop less than determined maximal number
     of iterations.  */
  if (desc->niter < 2 * nunroll
      || ((get_estimated_loop_iterations (loop, &iterations)
	   || get_likely_max_loop_iterations (loop, &iterations))
	  && wi::ltu_p (iterations, 2 * nunroll)))
    {
      if (dump_file)
	fprintf (dump_file, ";; Not unrolling loop, doesn't roll\n");
      return;
    }

  /* Success; now compute number of iterations to unroll.  We alter
     nunroll so that as few as possible copies of loop body are
     necessary, while still not decreasing the number of unrollings
     too much (at most by 1).  */
  best_copies = 2 * nunroll + 10;

  /* Note desc->niter >= 2 * nunroll >= 4 here, so niter - 2 cannot
     wrap around.  */
  i = 2 * nunroll + 2;
  if (i > desc->niter - 2)
    i = desc->niter - 2;

  /* For each candidate factor I + 1, the cost in body copies is the
     unrolled body (I + 1 copies) plus the iterations peeled off for the
     remainder; pick the cheapest.  */
  for (; i >= nunroll - 1; i--)
    {
      unsigned exit_mod = desc->niter % (i + 1);

      if (!loop_exit_at_end_p (loop))
	n_copies = exit_mod + i + 1;
      else if (exit_mod != (unsigned) i
	       || desc->noloop_assumptions != NULL_RTX)
	n_copies = exit_mod + i + 2;
      else
	n_copies = i + 1;

      if (n_copies < best_copies)
	{
	  best_copies = n_copies;
	  best_unroll = i;
	}
    }

  loop->lpt_decision.decision = LPT_UNROLL_CONSTANT;
  loop->lpt_decision.times = best_unroll;
}
46138fd1498Szrj
/* Unroll LOOP with constant number of iterations LOOP->LPT_DECISION.TIMES times.
   The transformation does this:

   for (i = 0; i < 102; i++)
     body;

   ==>  (LOOP->LPT_DECISION.TIMES == 3)

   i = 0;
   body; i++;
   body; i++;
   while (i < 102)
     {
       body; i++;
       body; i++;
       body; i++;
       body; i++;
     }
  */
static void
unroll_loop_constant_iterations (struct loop *loop)
{
  unsigned HOST_WIDE_INT niter;
  unsigned exit_mod;
  unsigned i;
  edge e;
  unsigned max_unroll = loop->lpt_decision.times;
  struct niter_desc *desc = get_simple_loop_desc (loop);
  bool exit_at_end = loop_exit_at_end_p (loop);
  struct opt_info *opt_info = NULL;
  bool ok;

  niter = desc->niter;

  /* Should not get here (such loop should be peeled instead).  */
  gcc_assert (niter > max_unroll + 1);

  /* Iterations left over after unrolling by a factor of max_unroll + 1;
     these are peeled off before (or while) entering the unrolled loop.  */
  exit_mod = niter % (max_unroll + 1);

  /* wont_exit[i] set means copy i provably cannot exit the loop and its
     exit test may be removed by duplicate_loop_to_header_edge.  */
  auto_sbitmap wont_exit (max_unroll + 2);
  bitmap_ones (wont_exit);

  auto_vec<edge> remove_edges;
  if (flag_split_ivs_in_unroller
      || flag_variable_expansion_in_unroller)
    opt_info = analyze_insns_in_loop (loop);

  if (!exit_at_end)
    {
      /* The exit is not at the end of the loop; leave exit test
	 in the first copy, so that the loops that start with test
	 of exit condition have continuous body after unrolling.  */

      if (dump_file)
	fprintf (dump_file, ";; Condition at beginning of loop.\n");

      /* Peel exit_mod iterations.  */
      bitmap_clear_bit (wont_exit, 0);
      if (desc->noloop_assumptions)
	bitmap_clear_bit (wont_exit, 1);

      if (exit_mod)
	{
	  opt_info_start_duplication (opt_info);
	  ok = duplicate_loop_to_header_edge (loop, loop_preheader_edge (loop),
					      exit_mod,
					      wont_exit, desc->out_edge,
					      &remove_edges,
					      DLTHE_FLAG_UPDATE_FREQ
					      | (opt_info && exit_mod > 1
						 ? DLTHE_RECORD_COPY_NUMBER
						   : 0));
	  gcc_assert (ok);

	  if (opt_info && exit_mod > 1)
	    apply_opt_in_copies (opt_info, exit_mod, false, false);

	  /* The peeled copies consumed exit_mod iterations; adjust the
	     recorded bounds, dropping estimates that would underflow.  */
	  desc->noloop_assumptions = NULL_RTX;
	  desc->niter -= exit_mod;
	  loop->nb_iterations_upper_bound -= exit_mod;
	  if (loop->any_estimate
	      && wi::leu_p (exit_mod, loop->nb_iterations_estimate))
	    loop->nb_iterations_estimate -= exit_mod;
	  else
	    loop->any_estimate = false;
	  if (loop->any_likely_upper_bound
	      && wi::leu_p (exit_mod, loop->nb_iterations_likely_upper_bound))
	    loop->nb_iterations_likely_upper_bound -= exit_mod;
	  else
	    loop->any_likely_upper_bound = false;
	}

      bitmap_set_bit (wont_exit, 1);
    }
  else
    {
      /* Leave exit test in last copy, for the same reason as above if
	 the loop tests the condition at the end of loop body.  */

      if (dump_file)
	fprintf (dump_file, ";; Condition at end of loop.\n");

      /* We know that niter >= max_unroll + 2; so we do not need to care of
	 case when we would exit before reaching the loop.  So just peel
	 exit_mod + 1 iterations.  */
      if (exit_mod != max_unroll
	  || desc->noloop_assumptions)
	{
	  bitmap_clear_bit (wont_exit, 0);
	  if (desc->noloop_assumptions)
	    bitmap_clear_bit (wont_exit, 1);

	  opt_info_start_duplication (opt_info);
	  ok = duplicate_loop_to_header_edge (loop, loop_preheader_edge (loop),
					      exit_mod + 1,
					      wont_exit, desc->out_edge,
					      &remove_edges,
					      DLTHE_FLAG_UPDATE_FREQ
					      | (opt_info && exit_mod > 0
						 ? DLTHE_RECORD_COPY_NUMBER
						   : 0));
	  gcc_assert (ok);

	  if (opt_info && exit_mod > 0)
	    apply_opt_in_copies (opt_info, exit_mod + 1, false, false);

	  /* Account for the exit_mod + 1 peeled iterations, as above.  */
	  desc->niter -= exit_mod + 1;
	  loop->nb_iterations_upper_bound -= exit_mod + 1;
	  if (loop->any_estimate
	      && wi::leu_p (exit_mod + 1, loop->nb_iterations_estimate))
	    loop->nb_iterations_estimate -= exit_mod + 1;
	  else
	    loop->any_estimate = false;
	  if (loop->any_likely_upper_bound
	      && wi::leu_p (exit_mod + 1, loop->nb_iterations_likely_upper_bound))
	    loop->nb_iterations_likely_upper_bound -= exit_mod + 1;
	  else
	    loop->any_likely_upper_bound = false;
	  desc->noloop_assumptions = NULL_RTX;

	  bitmap_set_bit (wont_exit, 0);
	  bitmap_set_bit (wont_exit, 1);
	}

      bitmap_clear_bit (wont_exit, max_unroll);
    }

  /* Now unroll the loop.  */

  opt_info_start_duplication (opt_info);
  ok = duplicate_loop_to_header_edge (loop, loop_latch_edge (loop),
				      max_unroll,
				      wont_exit, desc->out_edge,
				      &remove_edges,
				      DLTHE_FLAG_UPDATE_FREQ
				      | (opt_info
					 ? DLTHE_RECORD_COPY_NUMBER
					   : 0));
  gcc_assert (ok);

  if (opt_info)
    {
      apply_opt_in_copies (opt_info, max_unroll, true, true);
      free_opt_info (opt_info);
    }

  if (exit_at_end)
    {
      basic_block exit_block = get_bb_copy (desc->in_edge->src);
      /* Find a new in and out edge; they are in the last copy we have made.  */

      if (EDGE_SUCC (exit_block, 0)->dest == desc->out_edge->dest)
	{
	  desc->out_edge = EDGE_SUCC (exit_block, 0);
	  desc->in_edge = EDGE_SUCC (exit_block, 1);
	}
      else
	{
	  desc->out_edge = EDGE_SUCC (exit_block, 1);
	  desc->in_edge = EDGE_SUCC (exit_block, 0);
	}
    }

  /* The unrolled loop executes 1/(max_unroll + 1) as many iterations;
     scale the recorded counts to match.  */
  desc->niter /= max_unroll + 1;
  loop->nb_iterations_upper_bound
    = wi::udiv_trunc (loop->nb_iterations_upper_bound, max_unroll + 1);
  if (loop->any_estimate)
    loop->nb_iterations_estimate
      = wi::udiv_trunc (loop->nb_iterations_estimate, max_unroll + 1);
  if (loop->any_likely_upper_bound)
    loop->nb_iterations_likely_upper_bound
      = wi::udiv_trunc (loop->nb_iterations_likely_upper_bound, max_unroll + 1);
  desc->niter_expr = GEN_INT (desc->niter);

  /* Remove the edges.  */
  FOR_EACH_VEC_ELT (remove_edges, i, e)
    remove_path (e);

  if (dump_file)
    fprintf (dump_file,
	     ";; Unrolled loop %d times, constant # of iterations %i insns\n",
	     max_unroll, num_loop_insns (loop));
}
66538fd1498Szrj
/* Decide whether to unroll LOOP iterating runtime computable number of times
   and how much.  FLAGS is a bitmask of UAP_* values.  On success sets
   loop->lpt_decision to LPT_UNROLL_RUNTIME with a power-of-two factor.  */
static void
decide_unroll_runtime_iterations (struct loop *loop, int flags)
{
  unsigned nunroll, nunroll_by_av, i;
  struct niter_desc *desc;
  widest_int iterations;

  /* If we were not asked to unroll this loop, just return back silently.  */
  if (!(flags & UAP_UNROLL) && !loop->unroll)
    return;

  if (dump_enabled_p ())
    dump_printf (MSG_NOTE,
		 "considering unrolling loop with runtime-"
		 "computable number of iterations\n");

  /* nunroll = total number of copies of the original loop body in
     unrolled loop (i.e. if it is 2, we have to duplicate loop body once.  */
  nunroll = PARAM_VALUE (PARAM_MAX_UNROLLED_INSNS) / loop->ninsns;
  nunroll_by_av = PARAM_VALUE (PARAM_MAX_AVERAGE_UNROLLED_INSNS) / loop->av_ninsns;
  if (nunroll > nunroll_by_av)
    nunroll = nunroll_by_av;
  if (nunroll > (unsigned) PARAM_VALUE (PARAM_MAX_UNROLL_TIMES))
    nunroll = PARAM_VALUE (PARAM_MAX_UNROLL_TIMES);

  /* Let the target make a final adjustment if it provides a hook.  */
  if (targetm.loop_unroll_adjust)
    nunroll = targetm.loop_unroll_adjust (nunroll, loop);

  /* An explicit user-requested factor overrides the heuristic budget.  */
  if (loop->unroll > 0 && loop->unroll < USHRT_MAX)
    nunroll = loop->unroll;

  /* Skip big loops.  */
  if (nunroll <= 1)
    {
      if (dump_file)
	fprintf (dump_file, ";; Not considering loop, is too big\n");
      return;
    }

  /* Check for simple loops.  */
  desc = get_simple_loop_desc (loop);

  /* Check simpleness.  */
  if (!desc->simple_p || desc->assumptions)
    {
      if (dump_file)
	fprintf (dump_file,
		 ";; Unable to prove that the number of iterations "
		 "can be counted in runtime\n");
      return;
    }

  /* Constant-iteration loops are handled by
     decide_unroll_constant_iterations instead.  */
  if (desc->const_iter)
    {
      if (dump_file)
	fprintf (dump_file, ";; Loop iterates constant times\n");
      return;
    }

  /* Check whether the loop rolls.  */
  if ((get_estimated_loop_iterations (loop, &iterations)
       || get_likely_max_loop_iterations (loop, &iterations))
      && wi::ltu_p (iterations, 2 * nunroll))
    {
      if (dump_file)
	fprintf (dump_file, ";; Not unrolling loop, doesn't roll\n");
      return;
    }

  /* Success; now force nunroll to be power of 2, as code-gen
     requires it, we are unable to cope with overflows in
     computation of number of iterations.  */
  for (i = 1; 2 * i <= nunroll; i *= 2)
    continue;

  loop->lpt_decision.decision = LPT_UNROLL_RUNTIME;
  loop->lpt_decision.times = i - 1;
}
74638fd1498Szrj
74738fd1498Szrj /* Splits edge E and inserts the sequence of instructions INSNS on it, and
74838fd1498Szrj returns the newly created block. If INSNS is NULL_RTX, nothing is changed
74938fd1498Szrj and NULL is returned instead. */
75038fd1498Szrj
75138fd1498Szrj basic_block
split_edge_and_insert(edge e,rtx_insn * insns)75238fd1498Szrj split_edge_and_insert (edge e, rtx_insn *insns)
75338fd1498Szrj {
75438fd1498Szrj basic_block bb;
75538fd1498Szrj
75638fd1498Szrj if (!insns)
75738fd1498Szrj return NULL;
75838fd1498Szrj bb = split_edge (e);
75938fd1498Szrj emit_insn_after (insns, BB_END (bb));
76038fd1498Szrj
76138fd1498Szrj /* ??? We used to assume that INSNS can contain control flow insns, and
76238fd1498Szrj that we had to try to find sub basic blocks in BB to maintain a valid
76338fd1498Szrj CFG. For this purpose we used to set the BB_SUPERBLOCK flag on BB
76438fd1498Szrj and call break_superblocks when going out of cfglayout mode. But it
76538fd1498Szrj turns out that this never happens; and that if it does ever happen,
76638fd1498Szrj the verify_flow_info at the end of the RTL loop passes would fail.
76738fd1498Szrj
76838fd1498Szrj There are two reasons why we expected we could have control flow insns
76938fd1498Szrj in INSNS. The first is when a comparison has to be done in parts, and
77038fd1498Szrj the second is when the number of iterations is computed for loops with
77138fd1498Szrj the number of iterations known at runtime. In both cases, test cases
77238fd1498Szrj to get control flow in INSNS appear to be impossible to construct:
77338fd1498Szrj
77438fd1498Szrj * If do_compare_rtx_and_jump needs several branches to do comparison
77538fd1498Szrj in a mode that needs comparison by parts, we cannot analyze the
77638fd1498Szrj number of iterations of the loop, and we never get to unrolling it.
77738fd1498Szrj
77838fd1498Szrj * The code in expand_divmod that was suspected to cause creation of
77938fd1498Szrj branching code seems to be only accessed for signed division. The
78038fd1498Szrj divisions used by # of iterations analysis are always unsigned.
78138fd1498Szrj Problems might arise on architectures that emits branching code
78238fd1498Szrj for some operations that may appear in the unroller (especially
78338fd1498Szrj for division), but we have no such architectures.
78438fd1498Szrj
78538fd1498Szrj Considering all this, it was decided that we should for now assume
78638fd1498Szrj that INSNS can in theory contain control flow insns, but in practice
78738fd1498Szrj it never does. So we don't handle the theoretical case, and should
78838fd1498Szrj a real failure ever show up, we have a pretty good clue for how to
78938fd1498Szrj fix it. */
79038fd1498Szrj
79138fd1498Szrj return bb;
79238fd1498Szrj }
79338fd1498Szrj
79438fd1498Szrj /* Prepare a sequence comparing OP0 with OP1 using COMP and jumping to LABEL if
79538fd1498Szrj true, with probability PROB. If CINSN is not NULL, it is the insn to copy
79638fd1498Szrj in order to create a jump. */
79738fd1498Szrj
79838fd1498Szrj static rtx_insn *
compare_and_jump_seq(rtx op0,rtx op1,enum rtx_code comp,rtx_code_label * label,profile_probability prob,rtx_insn * cinsn)79938fd1498Szrj compare_and_jump_seq (rtx op0, rtx op1, enum rtx_code comp,
80038fd1498Szrj rtx_code_label *label, profile_probability prob,
80138fd1498Szrj rtx_insn *cinsn)
80238fd1498Szrj {
80338fd1498Szrj rtx_insn *seq;
80438fd1498Szrj rtx_jump_insn *jump;
80538fd1498Szrj rtx cond;
80638fd1498Szrj machine_mode mode;
80738fd1498Szrj
80838fd1498Szrj mode = GET_MODE (op0);
80938fd1498Szrj if (mode == VOIDmode)
81038fd1498Szrj mode = GET_MODE (op1);
81138fd1498Szrj
81238fd1498Szrj start_sequence ();
81338fd1498Szrj if (GET_MODE_CLASS (mode) == MODE_CC)
81438fd1498Szrj {
81538fd1498Szrj /* A hack -- there seems to be no easy generic way how to make a
81638fd1498Szrj conditional jump from a ccmode comparison. */
81738fd1498Szrj gcc_assert (cinsn);
81838fd1498Szrj cond = XEXP (SET_SRC (pc_set (cinsn)), 0);
81938fd1498Szrj gcc_assert (GET_CODE (cond) == comp);
82038fd1498Szrj gcc_assert (rtx_equal_p (op0, XEXP (cond, 0)));
82138fd1498Szrj gcc_assert (rtx_equal_p (op1, XEXP (cond, 1)));
82238fd1498Szrj emit_jump_insn (copy_insn (PATTERN (cinsn)));
82338fd1498Szrj jump = as_a <rtx_jump_insn *> (get_last_insn ());
82438fd1498Szrj JUMP_LABEL (jump) = JUMP_LABEL (cinsn);
82538fd1498Szrj LABEL_NUSES (JUMP_LABEL (jump))++;
82638fd1498Szrj redirect_jump (jump, label, 0);
82738fd1498Szrj }
82838fd1498Szrj else
82938fd1498Szrj {
83038fd1498Szrj gcc_assert (!cinsn);
83138fd1498Szrj
83238fd1498Szrj op0 = force_operand (op0, NULL_RTX);
83338fd1498Szrj op1 = force_operand (op1, NULL_RTX);
83438fd1498Szrj do_compare_rtx_and_jump (op0, op1, comp, 0,
83538fd1498Szrj mode, NULL_RTX, NULL, label,
83638fd1498Szrj profile_probability::uninitialized ());
83738fd1498Szrj jump = as_a <rtx_jump_insn *> (get_last_insn ());
83838fd1498Szrj jump->set_jump_target (label);
83938fd1498Szrj LABEL_NUSES (label)++;
84038fd1498Szrj }
84138fd1498Szrj if (prob.initialized_p ())
84238fd1498Szrj add_reg_br_prob_note (jump, prob);
84338fd1498Szrj
84438fd1498Szrj seq = get_insns ();
84538fd1498Szrj end_sequence ();
84638fd1498Szrj
84738fd1498Szrj return seq;
84838fd1498Szrj }
84938fd1498Szrj
85038fd1498Szrj /* Unroll LOOP for which we are able to count number of iterations in
85138fd1498Szrj runtime LOOP->LPT_DECISION.TIMES times. The times value must be a
85238fd1498Szrj power of two. The transformation does this (with some extra care
85338fd1498Szrj for case n < 0):
85438fd1498Szrj
85538fd1498Szrj for (i = 0; i < n; i++)
85638fd1498Szrj body;
85738fd1498Szrj
85838fd1498Szrj ==> (LOOP->LPT_DECISION.TIMES == 3)
85938fd1498Szrj
86038fd1498Szrj i = 0;
86138fd1498Szrj mod = n % 4;
86238fd1498Szrj
86338fd1498Szrj switch (mod)
86438fd1498Szrj {
86538fd1498Szrj case 3:
86638fd1498Szrj body; i++;
86738fd1498Szrj case 2:
86838fd1498Szrj body; i++;
86938fd1498Szrj case 1:
87038fd1498Szrj body; i++;
87138fd1498Szrj case 0: ;
87238fd1498Szrj }
87338fd1498Szrj
87438fd1498Szrj while (i < n)
87538fd1498Szrj {
87638fd1498Szrj body; i++;
87738fd1498Szrj body; i++;
87838fd1498Szrj body; i++;
87938fd1498Szrj body; i++;
88038fd1498Szrj }
88138fd1498Szrj */
88238fd1498Szrj static void
unroll_loop_runtime_iterations(struct loop * loop)88338fd1498Szrj unroll_loop_runtime_iterations (struct loop *loop)
88438fd1498Szrj {
88538fd1498Szrj rtx old_niter, niter, tmp;
88638fd1498Szrj rtx_insn *init_code, *branch_code;
88738fd1498Szrj unsigned i, j;
88838fd1498Szrj profile_probability p;
88938fd1498Szrj basic_block preheader, *body, swtch, ezc_swtch = NULL;
89038fd1498Szrj int may_exit_copy;
89138fd1498Szrj profile_count iter_count, new_count;
89238fd1498Szrj unsigned n_peel;
89338fd1498Szrj edge e;
89438fd1498Szrj bool extra_zero_check, last_may_exit;
89538fd1498Szrj unsigned max_unroll = loop->lpt_decision.times;
89638fd1498Szrj struct niter_desc *desc = get_simple_loop_desc (loop);
89738fd1498Szrj bool exit_at_end = loop_exit_at_end_p (loop);
89838fd1498Szrj struct opt_info *opt_info = NULL;
89938fd1498Szrj bool ok;
90038fd1498Szrj
  /* Collect the insns to optimize in the unrolled copies (IV splitting /
     variable expansion), if those transforms are enabled.  */
90138fd1498Szrj if (flag_split_ivs_in_unroller
90238fd1498Szrj || flag_variable_expansion_in_unroller)
90338fd1498Szrj opt_info = analyze_insns_in_loop (loop);
90438fd1498Szrj
90538fd1498Szrj /* Remember blocks whose dominators will have to be updated. */
90638fd1498Szrj auto_vec<basic_block> dom_bbs;
90738fd1498Szrj
90838fd1498Szrj body = get_loop_body (loop);
90938fd1498Szrj for (i = 0; i < loop->num_nodes; i++)
91038fd1498Szrj {
91138fd1498Szrj vec<basic_block> ldom;
91238fd1498Szrj basic_block bb;
91338fd1498Szrj
91438fd1498Szrj ldom = get_dominated_by (CDI_DOMINATORS, body[i]);
91538fd1498Szrj FOR_EACH_VEC_ELT (ldom, j, bb)
91638fd1498Szrj if (!flow_bb_inside_loop_p (loop, bb))
91738fd1498Szrj dom_bbs.safe_push (bb);
91838fd1498Szrj
91938fd1498Szrj ldom.release ();
92038fd1498Szrj }
92138fd1498Szrj free (body);
92238fd1498Szrj
  /* Decide in which copy the exit test may remain and how many copies are
     peeled in front of the loop; see unroll_loop_constant_iterations for
     the reasoning behind both cases.  */
92338fd1498Szrj if (!exit_at_end)
92438fd1498Szrj {
92538fd1498Szrj /* Leave exit in first copy (for explanation why see comment in
92638fd1498Szrj unroll_loop_constant_iterations). */
92738fd1498Szrj may_exit_copy = 0;
92838fd1498Szrj n_peel = max_unroll - 1;
92938fd1498Szrj extra_zero_check = true;
93038fd1498Szrj last_may_exit = false;
93138fd1498Szrj }
93238fd1498Szrj else
93338fd1498Szrj {
93438fd1498Szrj /* Leave exit in last copy (for explanation why see comment in
93538fd1498Szrj unroll_loop_constant_iterations). */
93638fd1498Szrj may_exit_copy = max_unroll;
93738fd1498Szrj n_peel = max_unroll;
93838fd1498Szrj extra_zero_check = false;
93938fd1498Szrj last_may_exit = true;
94038fd1498Szrj }
94138fd1498Szrj
94238fd1498Szrj /* Get expression for number of iterations. */
94338fd1498Szrj start_sequence ();
94438fd1498Szrj old_niter = niter = gen_reg_rtx (desc->mode);
94538fd1498Szrj tmp = force_operand (copy_rtx (desc->niter_expr), niter);
94638fd1498Szrj if (tmp != niter)
94738fd1498Szrj emit_move_insn (niter, tmp);
94838fd1498Szrj
94938fd1498Szrj /* For loops that exit at end and whose number of iterations is reliable,
95038fd1498Szrj add one to niter to account for first pass through loop body before
95138fd1498Szrj reaching exit test. */
95238fd1498Szrj if (exit_at_end && !desc->noloop_assumptions)
95338fd1498Szrj {
95438fd1498Szrj niter = expand_simple_binop (desc->mode, PLUS,
95538fd1498Szrj niter, const1_rtx,
95638fd1498Szrj NULL_RTX, 0, OPTAB_LIB_WIDEN);
95738fd1498Szrj old_niter = niter;
95838fd1498Szrj }
95938fd1498Szrj
96038fd1498Szrj /* Count modulo by ANDing it with max_unroll; we use the fact that
96138fd1498Szrj the number of unrollings is a power of two, and thus this is correct
96238fd1498Szrj even if there is overflow in the computation. */
96338fd1498Szrj niter = expand_simple_binop (desc->mode, AND,
96438fd1498Szrj niter, gen_int_mode (max_unroll, desc->mode),
96538fd1498Szrj NULL_RTX, 0, OPTAB_LIB_WIDEN);
96638fd1498Szrj
96738fd1498Szrj init_code = get_insns ();
96838fd1498Szrj end_sequence ();
96938fd1498Szrj unshare_all_rtl_in_chain (init_code);
97038fd1498Szrj
97138fd1498Szrj /* Precondition the loop. */
97238fd1498Szrj split_edge_and_insert (loop_preheader_edge (loop), init_code);
97338fd1498Szrj
97438fd1498Szrj auto_vec<edge> remove_edges;
97538fd1498Szrj
97638fd1498Szrj auto_sbitmap wont_exit (max_unroll + 2);
97738fd1498Szrj
97838fd1498Szrj if (extra_zero_check || desc->noloop_assumptions)
97938fd1498Szrj {
98038fd1498Szrj /* Peel the first copy of loop body. Leave the exit test if the number
98138fd1498Szrj of iterations is not reliable. Also record the place of the extra zero
98238fd1498Szrj check. */
98338fd1498Szrj bitmap_clear (wont_exit);
98438fd1498Szrj if (!desc->noloop_assumptions)
98538fd1498Szrj bitmap_set_bit (wont_exit, 1);
98638fd1498Szrj ezc_swtch = loop_preheader_edge (loop)->src;
98738fd1498Szrj ok = duplicate_loop_to_header_edge (loop, loop_preheader_edge (loop),
98838fd1498Szrj 1, wont_exit, desc->out_edge,
98938fd1498Szrj &remove_edges,
99038fd1498Szrj DLTHE_FLAG_UPDATE_FREQ);
99138fd1498Szrj gcc_assert (ok);
99238fd1498Szrj }
99338fd1498Szrj
99438fd1498Szrj /* Record the place where switch will be built for preconditioning. */
99538fd1498Szrj swtch = split_edge (loop_preheader_edge (loop));
99638fd1498Szrj
99738fd1498Szrj /* Compute count increments for each switch block and initialize
99838fd1498Szrj innermost switch block. Switch blocks and peeled loop copies are built
99938fd1498Szrj from innermost outward. */
100038fd1498Szrj iter_count = new_count = swtch->count.apply_scale (1, max_unroll + 1);
100138fd1498Szrj swtch->count = new_count;
100238fd1498Szrj
  /* Peel N_PEEL copies in front of the loop and build the chain of switch
     blocks: each one compares NITER (already reduced modulo
     MAX_UNROLL + 1 above) against J and jumps into the matching peeled
     copy.  */
100338fd1498Szrj for (i = 0; i < n_peel; i++)
100438fd1498Szrj {
100538fd1498Szrj /* Peel the copy. */
100638fd1498Szrj bitmap_clear (wont_exit);
100738fd1498Szrj if (i != n_peel - 1 || !last_may_exit)
100838fd1498Szrj bitmap_set_bit (wont_exit, 1);
100938fd1498Szrj ok = duplicate_loop_to_header_edge (loop, loop_preheader_edge (loop),
101038fd1498Szrj 1, wont_exit, desc->out_edge,
101138fd1498Szrj &remove_edges,
101238fd1498Szrj DLTHE_FLAG_UPDATE_FREQ);
101338fd1498Szrj gcc_assert (ok);
101438fd1498Szrj
101538fd1498Szrj /* Create item for switch. */
101638fd1498Szrj j = n_peel - i - (extra_zero_check ? 0 : 1);
101738fd1498Szrj p = profile_probability::always ().apply_scale (1, i + 2);
101838fd1498Szrj
101938fd1498Szrj preheader = split_edge (loop_preheader_edge (loop));
102038fd1498Szrj /* Add in count of edge from switch block. */
102138fd1498Szrj preheader->count += iter_count;
102238fd1498Szrj branch_code = compare_and_jump_seq (copy_rtx (niter), GEN_INT (j), EQ,
102338fd1498Szrj block_label (preheader), p,
102438fd1498Szrj NULL);
102538fd1498Szrj
102638fd1498Szrj /* We rely on the fact that the compare and jump cannot be optimized out,
102738fd1498Szrj and hence the cfg we create is correct. */
102838fd1498Szrj gcc_assert (branch_code != NULL_RTX);
102938fd1498Szrj
103038fd1498Szrj swtch = split_edge_and_insert (single_pred_edge (swtch), branch_code);
103138fd1498Szrj set_immediate_dominator (CDI_DOMINATORS, preheader, swtch);
103238fd1498Szrj single_succ_edge (swtch)->probability = p.invert ();
103338fd1498Szrj new_count += iter_count;
103438fd1498Szrj swtch->count = new_count;
103538fd1498Szrj e = make_edge (swtch, preheader,
103638fd1498Szrj single_succ_edge (swtch)->flags & EDGE_IRREDUCIBLE_LOOP);
103738fd1498Szrj e->probability = p;
103838fd1498Szrj }
103938fd1498Szrj
104038fd1498Szrj if (extra_zero_check)
104138fd1498Szrj {
104238fd1498Szrj /* Add branch for zero iterations. */
104338fd1498Szrj p = profile_probability::always ().apply_scale (1, max_unroll + 1);
104438fd1498Szrj swtch = ezc_swtch;
104538fd1498Szrj preheader = split_edge (loop_preheader_edge (loop));
104638fd1498Szrj /* Recompute count adjustments since initial peel copy may
104738fd1498Szrj have exited and reduced those values that were computed above. */
104838fd1498Szrj iter_count = swtch->count.apply_scale (1, max_unroll + 1);
104938fd1498Szrj /* Add in count of edge from switch block. */
105038fd1498Szrj preheader->count += iter_count;
105138fd1498Szrj branch_code = compare_and_jump_seq (copy_rtx (niter), const0_rtx, EQ,
105238fd1498Szrj block_label (preheader), p,
105338fd1498Szrj NULL);
105438fd1498Szrj gcc_assert (branch_code != NULL_RTX);
105538fd1498Szrj
105638fd1498Szrj swtch = split_edge_and_insert (single_succ_edge (swtch), branch_code);
105738fd1498Szrj set_immediate_dominator (CDI_DOMINATORS, preheader, swtch);
105838fd1498Szrj single_succ_edge (swtch)->probability = p.invert ();
105938fd1498Szrj e = make_edge (swtch, preheader,
106038fd1498Szrj single_succ_edge (swtch)->flags & EDGE_IRREDUCIBLE_LOOP);
106138fd1498Szrj e->probability = p;
106238fd1498Szrj }
106338fd1498Szrj
106438fd1498Szrj /* Recount dominators for outer blocks. */
106538fd1498Szrj iterate_fix_dominators (CDI_DOMINATORS, dom_bbs, false);
106638fd1498Szrj
106738fd1498Szrj /* And unroll loop. */
106838fd1498Szrj
  /* Only the copy indexed MAY_EXIT_COPY keeps its exit test.  */
106938fd1498Szrj bitmap_ones (wont_exit);
107038fd1498Szrj bitmap_clear_bit (wont_exit, may_exit_copy);
107138fd1498Szrj opt_info_start_duplication (opt_info);
107238fd1498Szrj
107338fd1498Szrj ok = duplicate_loop_to_header_edge (loop, loop_latch_edge (loop),
107438fd1498Szrj max_unroll,
107538fd1498Szrj wont_exit, desc->out_edge,
107638fd1498Szrj &remove_edges,
107738fd1498Szrj DLTHE_FLAG_UPDATE_FREQ
107838fd1498Szrj | (opt_info
107938fd1498Szrj ? DLTHE_RECORD_COPY_NUMBER
108038fd1498Szrj : 0));
108138fd1498Szrj gcc_assert (ok);
108238fd1498Szrj
108338fd1498Szrj if (opt_info)
108438fd1498Szrj {
108538fd1498Szrj apply_opt_in_copies (opt_info, max_unroll, true, true);
108638fd1498Szrj free_opt_info (opt_info);
108738fd1498Szrj }
108838fd1498Szrj
108938fd1498Szrj if (exit_at_end)
109038fd1498Szrj {
109138fd1498Szrj basic_block exit_block = get_bb_copy (desc->in_edge->src);
109238fd1498Szrj /* Find a new in and out edge; they are in the last copy we have
109338fd1498Szrj made. */
109438fd1498Szrj
109538fd1498Szrj if (EDGE_SUCC (exit_block, 0)->dest == desc->out_edge->dest)
109638fd1498Szrj {
109738fd1498Szrj desc->out_edge = EDGE_SUCC (exit_block, 0);
109838fd1498Szrj desc->in_edge = EDGE_SUCC (exit_block, 1);
109938fd1498Szrj }
110038fd1498Szrj else
110138fd1498Szrj {
110238fd1498Szrj desc->out_edge = EDGE_SUCC (exit_block, 1);
110338fd1498Szrj desc->in_edge = EDGE_SUCC (exit_block, 0);
110438fd1498Szrj }
110538fd1498Szrj }
110638fd1498Szrj
110738fd1498Szrj /* Remove the edges. */
110838fd1498Szrj FOR_EACH_VEC_ELT (remove_edges, i, e)
110938fd1498Szrj remove_path (e);
111038fd1498Szrj
111138fd1498Szrj /* We must be careful when updating the number of iterations due to
111238fd1498Szrj preconditioning and the fact that the value must be valid at entry
111338fd1498Szrj of the loop. After passing through the above code, we see that
111438fd1498Szrj the correct new number of iterations is this: */
111538fd1498Szrj gcc_assert (!desc->const_iter);
111638fd1498Szrj desc->niter_expr =
111738fd1498Szrj simplify_gen_binary (UDIV, desc->mode, old_niter,
111838fd1498Szrj gen_int_mode (max_unroll + 1, desc->mode));
111938fd1498Szrj loop->nb_iterations_upper_bound
112038fd1498Szrj = wi::udiv_trunc (loop->nb_iterations_upper_bound, max_unroll + 1);
112138fd1498Szrj if (loop->any_estimate)
112238fd1498Szrj loop->nb_iterations_estimate
112338fd1498Szrj = wi::udiv_trunc (loop->nb_iterations_estimate, max_unroll + 1);
112438fd1498Szrj if (loop->any_likely_upper_bound)
112538fd1498Szrj loop->nb_iterations_likely_upper_bound
112638fd1498Szrj = wi::udiv_trunc (loop->nb_iterations_likely_upper_bound, max_unroll + 1);
112738fd1498Szrj if (exit_at_end)
112838fd1498Szrj {
112938fd1498Szrj desc->niter_expr =
113038fd1498Szrj simplify_gen_binary (MINUS, desc->mode, desc->niter_expr, const1_rtx);
113138fd1498Szrj desc->noloop_assumptions = NULL_RTX;
113238fd1498Szrj --loop->nb_iterations_upper_bound;
113338fd1498Szrj if (loop->any_estimate
113438fd1498Szrj && loop->nb_iterations_estimate != 0)
113538fd1498Szrj --loop->nb_iterations_estimate;
113638fd1498Szrj else
113738fd1498Szrj loop->any_estimate = false;
113838fd1498Szrj if (loop->any_likely_upper_bound
113938fd1498Szrj && loop->nb_iterations_likely_upper_bound != 0)
114038fd1498Szrj --loop->nb_iterations_likely_upper_bound;
114138fd1498Szrj else
114238fd1498Szrj loop->any_likely_upper_bound = false;
114338fd1498Szrj }
114438fd1498Szrj
114538fd1498Szrj if (dump_file)
114638fd1498Szrj fprintf (dump_file,
114738fd1498Szrj ";; Unrolled loop %d times, counting # of iterations "
114838fd1498Szrj "in runtime, %i insns\n",
114938fd1498Szrj max_unroll, num_loop_insns (loop));
115038fd1498Szrj }
115138fd1498Szrj
115238fd1498Szrj /* Decide whether to unroll LOOP stupidly and how much. */
115338fd1498Szrj static void
decide_unroll_stupid(struct loop * loop,int flags)115438fd1498Szrj decide_unroll_stupid (struct loop *loop, int flags)
115538fd1498Szrj {
115638fd1498Szrj unsigned nunroll, nunroll_by_av, i;
115738fd1498Szrj struct niter_desc *desc;
115838fd1498Szrj widest_int iterations;
115938fd1498Szrj
116038fd1498Szrj /* If we were not asked to unroll this loop, just return back silently. */
116138fd1498Szrj if (!(flags & UAP_UNROLL_ALL) && !loop->unroll)
116238fd1498Szrj return;
116338fd1498Szrj
116438fd1498Szrj if (dump_enabled_p ())
116538fd1498Szrj dump_printf (MSG_NOTE, "considering unrolling loop stupidly\n");
116638fd1498Szrj
116738fd1498Szrj /* nunroll = total number of copies of the original loop body in
116838fd1498Szrj unrolled loop (i.e. if it is 2, we have to duplicate loop body once. */
116938fd1498Szrj nunroll = PARAM_VALUE (PARAM_MAX_UNROLLED_INSNS) / loop->ninsns;
117038fd1498Szrj nunroll_by_av
117138fd1498Szrj = PARAM_VALUE (PARAM_MAX_AVERAGE_UNROLLED_INSNS) / loop->av_ninsns;
117238fd1498Szrj if (nunroll > nunroll_by_av)
117338fd1498Szrj nunroll = nunroll_by_av;
117438fd1498Szrj if (nunroll > (unsigned) PARAM_VALUE (PARAM_MAX_UNROLL_TIMES))
117538fd1498Szrj nunroll = PARAM_VALUE (PARAM_MAX_UNROLL_TIMES);
117638fd1498Szrj
117738fd1498Szrj if (targetm.loop_unroll_adjust)
117838fd1498Szrj nunroll = targetm.loop_unroll_adjust (nunroll, loop);
117938fd1498Szrj
118038fd1498Szrj if (loop->unroll > 0 && loop->unroll < USHRT_MAX)
118138fd1498Szrj nunroll = loop->unroll;
118238fd1498Szrj
118338fd1498Szrj /* Skip big loops. */
118438fd1498Szrj if (nunroll <= 1)
118538fd1498Szrj {
118638fd1498Szrj if (dump_file)
118738fd1498Szrj fprintf (dump_file, ";; Not considering loop, is too big\n");
118838fd1498Szrj return;
118938fd1498Szrj }
119038fd1498Szrj
119138fd1498Szrj /* Check for simple loops. */
119238fd1498Szrj desc = get_simple_loop_desc (loop);
119338fd1498Szrj
119438fd1498Szrj /* Check simpleness. */
119538fd1498Szrj if (desc->simple_p && !desc->assumptions)
119638fd1498Szrj {
119738fd1498Szrj if (dump_file)
119838fd1498Szrj fprintf (dump_file, ";; Loop is simple\n");
119938fd1498Szrj return;
120038fd1498Szrj }
120138fd1498Szrj
120238fd1498Szrj /* Do not unroll loops with branches inside -- it increases number
120338fd1498Szrj of mispredicts.
120438fd1498Szrj TODO: this heuristic needs tunning; call inside the loop body
120538fd1498Szrj is also relatively good reason to not unroll. */
120638fd1498Szrj if (num_loop_branches (loop) > 1)
120738fd1498Szrj {
120838fd1498Szrj if (dump_file)
120938fd1498Szrj fprintf (dump_file, ";; Not unrolling, contains branches\n");
121038fd1498Szrj return;
121138fd1498Szrj }
121238fd1498Szrj
121338fd1498Szrj /* Check whether the loop rolls. */
121438fd1498Szrj if ((get_estimated_loop_iterations (loop, &iterations)
121538fd1498Szrj || get_likely_max_loop_iterations (loop, &iterations))
121638fd1498Szrj && wi::ltu_p (iterations, 2 * nunroll))
121738fd1498Szrj {
121838fd1498Szrj if (dump_file)
121938fd1498Szrj fprintf (dump_file, ";; Not unrolling loop, doesn't roll\n");
122038fd1498Szrj return;
122138fd1498Szrj }
122238fd1498Szrj
122338fd1498Szrj /* Success. Now force nunroll to be power of 2, as it seems that this
122438fd1498Szrj improves results (partially because of better alignments, partially
122538fd1498Szrj because of some dark magic). */
122638fd1498Szrj for (i = 1; 2 * i <= nunroll; i *= 2)
122738fd1498Szrj continue;
122838fd1498Szrj
122938fd1498Szrj loop->lpt_decision.decision = LPT_UNROLL_STUPID;
123038fd1498Szrj loop->lpt_decision.times = i - 1;
123138fd1498Szrj }
123238fd1498Szrj
123338fd1498Szrj /* Unroll a LOOP LOOP->LPT_DECISION.TIMES times. The transformation does this:
123438fd1498Szrj
123538fd1498Szrj while (cond)
123638fd1498Szrj body;
123738fd1498Szrj
123838fd1498Szrj ==> (LOOP->LPT_DECISION.TIMES == 3)
123938fd1498Szrj
124038fd1498Szrj while (cond)
124138fd1498Szrj {
124238fd1498Szrj body;
124338fd1498Szrj if (!cond) break;
124438fd1498Szrj body;
124538fd1498Szrj if (!cond) break;
124638fd1498Szrj body;
124738fd1498Szrj if (!cond) break;
124838fd1498Szrj body;
124938fd1498Szrj }
125038fd1498Szrj */
125138fd1498Szrj static void
unroll_loop_stupid(struct loop * loop)125238fd1498Szrj unroll_loop_stupid (struct loop *loop)
125338fd1498Szrj {
125438fd1498Szrj unsigned nunroll = loop->lpt_decision.times;
125538fd1498Szrj struct niter_desc *desc = get_simple_loop_desc (loop);
125638fd1498Szrj struct opt_info *opt_info = NULL;
125738fd1498Szrj bool ok;
125838fd1498Szrj
125938fd1498Szrj if (flag_split_ivs_in_unroller
126038fd1498Szrj || flag_variable_expansion_in_unroller)
126138fd1498Szrj opt_info = analyze_insns_in_loop (loop);
126238fd1498Szrj
126338fd1498Szrj auto_sbitmap wont_exit (nunroll + 1);
126438fd1498Szrj bitmap_clear (wont_exit);
126538fd1498Szrj opt_info_start_duplication (opt_info);
126638fd1498Szrj
126738fd1498Szrj ok = duplicate_loop_to_header_edge (loop, loop_latch_edge (loop),
126838fd1498Szrj nunroll, wont_exit,
126938fd1498Szrj NULL, NULL,
127038fd1498Szrj DLTHE_FLAG_UPDATE_FREQ
127138fd1498Szrj | (opt_info
127238fd1498Szrj ? DLTHE_RECORD_COPY_NUMBER
127338fd1498Szrj : 0));
127438fd1498Szrj gcc_assert (ok);
127538fd1498Szrj
127638fd1498Szrj if (opt_info)
127738fd1498Szrj {
127838fd1498Szrj apply_opt_in_copies (opt_info, nunroll, true, true);
127938fd1498Szrj free_opt_info (opt_info);
128038fd1498Szrj }
128138fd1498Szrj
128238fd1498Szrj if (desc->simple_p)
128338fd1498Szrj {
128438fd1498Szrj /* We indeed may get here provided that there are nontrivial assumptions
128538fd1498Szrj for a loop to be really simple. We could update the counts, but the
128638fd1498Szrj problem is that we are unable to decide which exit will be taken
128738fd1498Szrj (not really true in case the number of iterations is constant,
128838fd1498Szrj but no one will do anything with this information, so we do not
128938fd1498Szrj worry about it). */
129038fd1498Szrj desc->simple_p = false;
129138fd1498Szrj }
129238fd1498Szrj
129338fd1498Szrj if (dump_file)
129438fd1498Szrj fprintf (dump_file, ";; Unrolled loop %d times, %i insns\n",
129538fd1498Szrj nunroll, num_loop_insns (loop));
129638fd1498Szrj }
129738fd1498Szrj
129838fd1498Szrj /* Returns true if REG is referenced in one nondebug insn in LOOP.
129938fd1498Szrj Set *DEBUG_USES to the number of debug insns that reference the
130038fd1498Szrj variable. */
130138fd1498Szrj
130238fd1498Szrj static bool
referenced_in_one_insn_in_loop_p(struct loop * loop,rtx reg,int * debug_uses)130338fd1498Szrj referenced_in_one_insn_in_loop_p (struct loop *loop, rtx reg,
130438fd1498Szrj int *debug_uses)
130538fd1498Szrj {
130638fd1498Szrj basic_block *body, bb;
130738fd1498Szrj unsigned i;
130838fd1498Szrj int count_ref = 0;
130938fd1498Szrj rtx_insn *insn;
131038fd1498Szrj
131138fd1498Szrj body = get_loop_body (loop);
131238fd1498Szrj for (i = 0; i < loop->num_nodes; i++)
131338fd1498Szrj {
131438fd1498Szrj bb = body[i];
131538fd1498Szrj
131638fd1498Szrj FOR_BB_INSNS (bb, insn)
131738fd1498Szrj if (!rtx_referenced_p (reg, insn))
131838fd1498Szrj continue;
131938fd1498Szrj else if (DEBUG_INSN_P (insn))
132038fd1498Szrj ++*debug_uses;
132138fd1498Szrj else if (++count_ref > 1)
132238fd1498Szrj break;
132338fd1498Szrj }
132438fd1498Szrj free (body);
132538fd1498Szrj return (count_ref == 1);
132638fd1498Szrj }
132738fd1498Szrj
132838fd1498Szrj /* Reset the DEBUG_USES debug insns in LOOP that reference REG. */
132938fd1498Szrj
133038fd1498Szrj static void
reset_debug_uses_in_loop(struct loop * loop,rtx reg,int debug_uses)133138fd1498Szrj reset_debug_uses_in_loop (struct loop *loop, rtx reg, int debug_uses)
133238fd1498Szrj {
133338fd1498Szrj basic_block *body, bb;
133438fd1498Szrj unsigned i;
133538fd1498Szrj rtx_insn *insn;
133638fd1498Szrj
133738fd1498Szrj body = get_loop_body (loop);
133838fd1498Szrj for (i = 0; debug_uses && i < loop->num_nodes; i++)
133938fd1498Szrj {
134038fd1498Szrj bb = body[i];
134138fd1498Szrj
134238fd1498Szrj FOR_BB_INSNS (bb, insn)
134338fd1498Szrj if (!DEBUG_INSN_P (insn) || !rtx_referenced_p (reg, insn))
134438fd1498Szrj continue;
134538fd1498Szrj else
134638fd1498Szrj {
134738fd1498Szrj validate_change (insn, &INSN_VAR_LOCATION_LOC (insn),
134838fd1498Szrj gen_rtx_UNKNOWN_VAR_LOC (), 0);
134938fd1498Szrj if (!--debug_uses)
135038fd1498Szrj break;
135138fd1498Szrj }
135238fd1498Szrj }
135338fd1498Szrj free (body);
135438fd1498Szrj }
135538fd1498Szrj
135638fd1498Szrj /* Determine whether INSN contains an accumulator
135738fd1498Szrj which can be expanded into separate copies,
135838fd1498Szrj one for each copy of the LOOP body.
135938fd1498Szrj
136038fd1498Szrj for (i = 0 ; i < n; i++)
136138fd1498Szrj sum += a[i];
136238fd1498Szrj
136338fd1498Szrj ==>
136438fd1498Szrj
136538fd1498Szrj sum += a[i]
136638fd1498Szrj ....
136738fd1498Szrj i = i+1;
136838fd1498Szrj sum1 += a[i]
136938fd1498Szrj ....
137038fd1498Szrj i = i+1
137138fd1498Szrj sum2 += a[i];
137238fd1498Szrj ....
137338fd1498Szrj
137438fd1498Szrj Return NULL if INSN contains no opportunity for expansion of accumulator.
137538fd1498Szrj Otherwise, allocate a VAR_TO_EXPAND structure, fill it with the relevant
137638fd1498Szrj information and return a pointer to it.
137738fd1498Szrj */
137838fd1498Szrj
137938fd1498Szrj static struct var_to_expand *
analyze_insn_to_expand_var(struct loop * loop,rtx_insn * insn)138038fd1498Szrj analyze_insn_to_expand_var (struct loop *loop, rtx_insn *insn)
138138fd1498Szrj {
138238fd1498Szrj rtx set, dest, src;
138338fd1498Szrj struct var_to_expand *ves;
138438fd1498Szrj unsigned accum_pos;
138538fd1498Szrj enum rtx_code code;
138638fd1498Szrj int debug_uses = 0;
138738fd1498Szrj
138838fd1498Szrj set = single_set (insn);
138938fd1498Szrj if (!set)
139038fd1498Szrj return NULL;
139138fd1498Szrj
139238fd1498Szrj dest = SET_DEST (set);
139338fd1498Szrj src = SET_SRC (set);
139438fd1498Szrj code = GET_CODE (src);
139538fd1498Szrj
139638fd1498Szrj if (code != PLUS && code != MINUS && code != MULT && code != FMA)
139738fd1498Szrj return NULL;
139838fd1498Szrj
139938fd1498Szrj if (FLOAT_MODE_P (GET_MODE (dest)))
140038fd1498Szrj {
140138fd1498Szrj if (!flag_associative_math)
140238fd1498Szrj return NULL;
140338fd1498Szrj /* In the case of FMA, we're also changing the rounding. */
140438fd1498Szrj if (code == FMA && !flag_unsafe_math_optimizations)
140538fd1498Szrj return NULL;
140638fd1498Szrj }
140738fd1498Szrj
140838fd1498Szrj /* Hmm, this is a bit paradoxical. We know that INSN is a valid insn
140938fd1498Szrj in MD. But if there is no optab to generate the insn, we can not
141038fd1498Szrj perform the variable expansion. This can happen if an MD provides
141138fd1498Szrj an insn but not a named pattern to generate it, for example to avoid
141238fd1498Szrj producing code that needs additional mode switches like for x87/mmx.
141338fd1498Szrj
141438fd1498Szrj So we check have_insn_for which looks for an optab for the operation
141538fd1498Szrj in SRC. If it doesn't exist, we can't perform the expansion even
141638fd1498Szrj though INSN is valid. */
141738fd1498Szrj if (!have_insn_for (code, GET_MODE (src)))
141838fd1498Szrj return NULL;
141938fd1498Szrj
142038fd1498Szrj if (!REG_P (dest)
142138fd1498Szrj && !(GET_CODE (dest) == SUBREG
142238fd1498Szrj && REG_P (SUBREG_REG (dest))))
142338fd1498Szrj return NULL;
142438fd1498Szrj
142538fd1498Szrj /* Find the accumulator use within the operation. */
142638fd1498Szrj if (code == FMA)
142738fd1498Szrj {
142838fd1498Szrj /* We only support accumulation via FMA in the ADD position. */
142938fd1498Szrj if (!rtx_equal_p (dest, XEXP (src, 2)))
143038fd1498Szrj return NULL;
143138fd1498Szrj accum_pos = 2;
143238fd1498Szrj }
143338fd1498Szrj else if (rtx_equal_p (dest, XEXP (src, 0)))
143438fd1498Szrj accum_pos = 0;
143538fd1498Szrj else if (rtx_equal_p (dest, XEXP (src, 1)))
143638fd1498Szrj {
143738fd1498Szrj /* The method of expansion that we are using; which includes the
143838fd1498Szrj initialization of the expansions with zero and the summation of
143938fd1498Szrj the expansions at the end of the computation will yield wrong
144038fd1498Szrj results for (x = something - x) thus avoid using it in that case. */
144138fd1498Szrj if (code == MINUS)
144238fd1498Szrj return NULL;
144338fd1498Szrj accum_pos = 1;
144438fd1498Szrj }
144538fd1498Szrj else
144638fd1498Szrj return NULL;
144738fd1498Szrj
144838fd1498Szrj /* It must not otherwise be used. */
144938fd1498Szrj if (code == FMA)
145038fd1498Szrj {
145138fd1498Szrj if (rtx_referenced_p (dest, XEXP (src, 0))
145238fd1498Szrj || rtx_referenced_p (dest, XEXP (src, 1)))
145338fd1498Szrj return NULL;
145438fd1498Szrj }
145538fd1498Szrj else if (rtx_referenced_p (dest, XEXP (src, 1 - accum_pos)))
145638fd1498Szrj return NULL;
145738fd1498Szrj
145838fd1498Szrj /* It must be used in exactly one insn. */
145938fd1498Szrj if (!referenced_in_one_insn_in_loop_p (loop, dest, &debug_uses))
146038fd1498Szrj return NULL;
146138fd1498Szrj
146238fd1498Szrj if (dump_file)
146338fd1498Szrj {
146438fd1498Szrj fprintf (dump_file, "\n;; Expanding Accumulator ");
146538fd1498Szrj print_rtl (dump_file, dest);
146638fd1498Szrj fprintf (dump_file, "\n");
146738fd1498Szrj }
146838fd1498Szrj
146938fd1498Szrj if (debug_uses)
147038fd1498Szrj /* Instead of resetting the debug insns, we could replace each
147138fd1498Szrj debug use in the loop with the sum or product of all expanded
147238fd1498Szrj accumulators. Since we'll only know of all expansions at the
147338fd1498Szrj end, we'd have to keep track of which vars_to_expand a debug
147438fd1498Szrj insn in the loop references, take note of each copy of the
147538fd1498Szrj debug insn during unrolling, and when it's all done, compute
147638fd1498Szrj the sum or product of each variable and adjust the original
147738fd1498Szrj debug insn and each copy thereof. What a pain! */
147838fd1498Szrj reset_debug_uses_in_loop (loop, dest, debug_uses);
147938fd1498Szrj
148038fd1498Szrj /* Record the accumulator to expand. */
148138fd1498Szrj ves = XNEW (struct var_to_expand);
148238fd1498Szrj ves->insn = insn;
148338fd1498Szrj ves->reg = copy_rtx (dest);
148438fd1498Szrj ves->var_expansions.create (1);
148538fd1498Szrj ves->next = NULL;
148638fd1498Szrj ves->op = GET_CODE (src);
148738fd1498Szrj ves->expansion_count = 0;
148838fd1498Szrj ves->reuse_expansion = 0;
148938fd1498Szrj return ves;
149038fd1498Szrj }
149138fd1498Szrj
/* Determine whether there is an induction variable in INSN that
   we would like to split during unrolling.

   I.e. replace

   i = i + 1;
   ...
   i = i + 1;
   ...
   i = i + 1;
   ...

   type chains by

   i0 = i + 1
   ...
   i = i0 + 1
   ...
   i = i0 + 2
   ...

   Return NULL if INSN contains no interesting IVs.  Otherwise, allocate
   an IV_TO_SPLIT structure, fill it with the relevant information and return a
   pointer to it.  */

static struct iv_to_split *
analyze_iv_to_split_insn (rtx_insn *insn)
{
  rtx set, dest;
  struct rtx_iv iv;
  struct iv_to_split *ivts;
  scalar_int_mode mode;
  bool ok;

  /* For now we just split the basic induction variables.  Later this may be
     extended for example by selecting also addresses of memory references.  */
  set = single_set (insn);
  if (!set)
    return NULL;

  /* Only registers of scalar integer mode can be basic induction
     variables that we know how to rewrite as base + delta * step.  */
  dest = SET_DEST (set);
  if (!REG_P (dest) || !is_a <scalar_int_mode> (GET_MODE (dest), &mode))
    return NULL;

  if (!biv_p (insn, mode, dest))
    return NULL;

  ok = iv_analyze_result (insn, dest, &iv);

  /* This used to be an assert under the assumption that if biv_p returns
     true that iv_analyze_result must also return true.  However, that
     assumption is not strictly correct as evidenced by pr25569.

     Returning NULL when iv_analyze_result returns false is safe and
     avoids the problems in pr25569 until the iv_analyze_* routines
     can be fixed, which is apparently hard and time consuming
     according to their author.  */
  if (! ok)
    return NULL;

  /* A zero step would make splitting pointless (the value is loop
     invariant), and a mode/extend_mode mismatch means the IV value is
     extended before use, which the simple rewrite does not model.  */
  if (iv.step == const0_rtx
      || iv.mode != iv.extend_mode)
    return NULL;

  /* Record the insn to split.  The base variable is allocated later,
     in allocate_basic_variable.  */
  ivts = XNEW (struct iv_to_split);
  ivts->insn = insn;
  ivts->orig_var = dest;
  ivts->base_var = NULL_RTX;
  ivts->step = iv.step;
  ivts->next = NULL;

  return ivts;
}
156638fd1498Szrj
/* Determines which of insns in LOOP can be optimized.
   Return a OPT_INFO struct with the relevant hash tables filled
   with all insns to be optimized.  The FIRST_NEW_BLOCK field
   is undefined for the return value.  */

static struct opt_info *
analyze_insns_in_loop (struct loop *loop)
{
  basic_block *body, bb;
  unsigned i;
  struct opt_info *opt_info = XCNEW (struct opt_info);
  rtx_insn *insn;
  struct iv_to_split *ivts = NULL;
  struct var_to_expand *ves = NULL;
  iv_to_split **slot1;
  var_to_expand **slot2;
  vec<edge> edges = get_loop_exit_edges (loop);
  edge exit;
  bool can_apply = false;

  iv_analysis_loop_init (loop);

  body = get_loop_body (loop);

  if (flag_split_ivs_in_unroller)
    {
      opt_info->insns_to_split
	= new hash_table<iv_split_hasher> (5 * loop->num_nodes);
      opt_info->iv_to_split_head = NULL;
      opt_info->iv_to_split_tail = &opt_info->iv_to_split_head;
    }

  /* Record the loop exit bb and loop preheader before the unrolling.  */
  opt_info->loop_preheader = loop_preheader_edge (loop)->src;

  /* Variable expansion requires a single non-complex exit edge, because
     the per-copy accumulators must be summed up in exactly one place
     after the loop; split that edge to get a dedicated block for it.  */
  if (edges.length () == 1)
    {
      exit = edges[0];
      if (!(exit->flags & EDGE_COMPLEX))
	{
	  opt_info->loop_exit = split_edge (exit);
	  can_apply = true;
	}
    }

  if (flag_variable_expansion_in_unroller
      && can_apply)
    {
      opt_info->insns_with_var_to_expand
	= new hash_table<var_expand_hasher> (5 * loop->num_nodes);
      opt_info->var_to_expand_head = NULL;
      opt_info->var_to_expand_tail = &opt_info->var_to_expand_head;
    }

  for (i = 0; i < loop->num_nodes; i++)
    {
      bb = body[i];
      /* Only transform insns in blocks executed on every iteration,
	 i.e. blocks that dominate the loop latch.  */
      if (!dominated_by_p (CDI_DOMINATORS, loop->latch, bb))
	continue;

      FOR_BB_INSNS (bb, insn)
	{
	  if (!INSN_P (insn))
	    continue;

	  if (opt_info->insns_to_split)
	    ivts = analyze_iv_to_split_insn (insn);

	  if (ivts)
	    {
	      slot1 = opt_info->insns_to_split->find_slot (ivts, INSERT);
	      gcc_assert (*slot1 == NULL);
	      *slot1 = ivts;
	      /* Also link onto the list so later passes can walk the
		 recorded IVs in a stable order.  */
	      *opt_info->iv_to_split_tail = ivts;
	      opt_info->iv_to_split_tail = &ivts->next;
	      /* An insn that is a split candidate is not also considered
		 for variable expansion.  */
	      continue;
	    }

	  if (opt_info->insns_with_var_to_expand)
	    ves = analyze_insn_to_expand_var (loop, insn);

	  if (ves)
	    {
	      slot2 = opt_info->insns_with_var_to_expand->find_slot (ves, INSERT);
	      gcc_assert (*slot2 == NULL);
	      *slot2 = ves;
	      *opt_info->var_to_expand_tail = ves;
	      opt_info->var_to_expand_tail = &ves->next;
	    }
	}
    }

  edges.release ();
  free (body);
  return opt_info;
}
166338fd1498Szrj
166438fd1498Szrj /* Called just before loop duplication. Records start of duplicated area
166538fd1498Szrj to OPT_INFO. */
166638fd1498Szrj
166738fd1498Szrj static void
opt_info_start_duplication(struct opt_info * opt_info)166838fd1498Szrj opt_info_start_duplication (struct opt_info *opt_info)
166938fd1498Szrj {
167038fd1498Szrj if (opt_info)
167138fd1498Szrj opt_info->first_new_block = last_basic_block_for_fn (cfun);
167238fd1498Szrj }
167338fd1498Szrj
/* Determine the number of iterations between initialization of the base
   variable and the current copy (N_COPY).  N_COPIES is the total number
   of newly created copies.  UNROLLING is true if we are unrolling
   (not peeling) the loop.  */

static unsigned
determine_split_iv_delta (unsigned n_copy, unsigned n_copies, bool unrolling)
{
  /* When unrolling, the initialization lives in the original loop body,
     which is copy number 0, so the distance is just N_COPY itself.  */
  if (unrolling)
    return n_copy;

  /* When peeling, the copy containing the initialization has number 1,
     and the original loop (number 0) comes last, after all N_COPIES
     peeled iterations.  */
  return n_copy ? n_copy - 1 : n_copies;
}
169838fd1498Szrj
169938fd1498Szrj /* Allocate basic variable for the induction variable chain. */
170038fd1498Szrj
170138fd1498Szrj static void
allocate_basic_variable(struct iv_to_split * ivts)170238fd1498Szrj allocate_basic_variable (struct iv_to_split *ivts)
170338fd1498Szrj {
170438fd1498Szrj rtx expr = SET_SRC (single_set (ivts->insn));
170538fd1498Szrj
170638fd1498Szrj ivts->base_var = gen_reg_rtx (GET_MODE (expr));
170738fd1498Szrj }
170838fd1498Szrj
/* Insert initialization of basic variable of IVTS before INSN, taking
   the initial value from INSN.  */

static void
insert_base_initialization (struct iv_to_split *ivts, rtx_insn *insn)
{
  /* Copy the IV insn's source expression; it yields the IV's value at
     this point, which becomes the base of the split chain.  */
  rtx expr = copy_rtx (SET_SRC (single_set (insn)));
  rtx_insn *seq;

  /* Build the initialization as an insn sequence so that force_operand
     may emit any auxiliary insns it needs.  */
  start_sequence ();
  expr = force_operand (expr, ivts->base_var);
  if (expr != ivts->base_var)
    emit_move_insn (ivts->base_var, expr);
  seq = get_insns ();
  end_sequence ();

  emit_insn_before (seq, insn);
}
172738fd1498Szrj
/* Replace the use of induction variable described in IVTS in INSN
   by base variable + DELTA * step.  */

static void
split_iv (struct iv_to_split *ivts, rtx_insn *insn, unsigned delta)
{
  rtx expr, *loc, incr, var;
  rtx_insn *seq;
  machine_mode mode = GET_MODE (ivts->base_var);
  rtx src, dest, set;

  /* Construct base + DELTA * step.  */
  if (!delta)
    expr = ivts->base_var;
  else
    {
      incr = simplify_gen_binary (MULT, mode,
				  copy_rtx (ivts->step),
				  gen_int_mode (delta, mode));
      expr = simplify_gen_binary (PLUS, GET_MODE (ivts->base_var),
				  ivts->base_var, incr);
    }

  /* Figure out where to do the replacement.  */
  loc = &SET_SRC (single_set (insn));

  /* If we can make the replacement right away, we're done.  */
  if (validate_change (insn, loc, expr, 0))
    return;

  /* Otherwise, force EXPR into a register and try again.  */
  start_sequence ();
  var = gen_reg_rtx (mode);
  expr = force_operand (expr, var);
  if (expr != var)
    emit_move_insn (var, expr);
  seq = get_insns ();
  end_sequence ();
  emit_insn_before (seq, insn);

  if (validate_change (insn, loc, var, 0))
    return;

  /* The last chance.  Try recreating the assignment in insn
     completely from scratch.  */
  set = single_set (insn);
  gcc_assert (set);

  start_sequence ();
  /* Patch the source in place (no validation this time) and re-expand
     the whole assignment via force_operand.  */
  *loc = var;
  src = copy_rtx (SET_SRC (set));
  dest = copy_rtx (SET_DEST (set));
  src = force_operand (src, dest);
  if (src != dest)
    emit_move_insn (dest, src);
  seq = get_insns ();
  end_sequence ();

  /* The freshly emitted sequence fully replaces INSN, whose pattern was
     modified in place above and is no longer valid.  */
  emit_insn_before (seq, insn);
  delete_insn (insn);
}
178938fd1498Szrj
179038fd1498Szrj
179138fd1498Szrj /* Return one expansion of the accumulator recorded in struct VE. */
179238fd1498Szrj
179338fd1498Szrj static rtx
get_expansion(struct var_to_expand * ve)179438fd1498Szrj get_expansion (struct var_to_expand *ve)
179538fd1498Szrj {
179638fd1498Szrj rtx reg;
179738fd1498Szrj
179838fd1498Szrj if (ve->reuse_expansion == 0)
179938fd1498Szrj reg = ve->reg;
180038fd1498Szrj else
180138fd1498Szrj reg = ve->var_expansions[ve->reuse_expansion - 1];
180238fd1498Szrj
180338fd1498Szrj if (ve->var_expansions.length () == (unsigned) ve->reuse_expansion)
180438fd1498Szrj ve->reuse_expansion = 0;
180538fd1498Szrj else
180638fd1498Szrj ve->reuse_expansion++;
180738fd1498Szrj
180838fd1498Szrj return reg;
180938fd1498Szrj }
181038fd1498Szrj
181138fd1498Szrj
/* Given INSN replace the uses of the accumulator recorded in VE
   with a new register.  */

static void
expand_var_during_unrolling (struct var_to_expand *ve, rtx_insn *insn)
{
  rtx new_reg, set;
  bool really_new_expansion = false;

  set = single_set (insn);
  gcc_assert (set);

  /* Generate a new register only if the expansion limit has not been
     reached.  Else reuse an already existing expansion.  */
  if (PARAM_VALUE (PARAM_MAX_VARIABLE_EXPANSIONS) > ve->expansion_count)
    {
      really_new_expansion = true;
      new_reg = gen_reg_rtx (GET_MODE (ve->reg));
    }
  else
    new_reg = get_expansion (ve);

  /* Replace the accumulator register throughout INSN as one atomic
     change group; only record the new register as an expansion if the
     whole group was accepted.  */
  validate_replace_rtx_group (SET_DEST (set), new_reg, insn);
  if (apply_change_group ())
    if (really_new_expansion)
      {
	ve->var_expansions.safe_push (new_reg);
	ve->expansion_count++;
      }
}
184238fd1498Szrj
/* Initialize the variable expansions in loop preheader.  PLACE is the
   loop-preheader basic block where the initialization of the
   expansions should take place.  The expansions are initialized with
   (-0) when the operation is plus or minus to honor sign zero.  This
   way we can prevent cases where the sign of the final result is
   effected by the sign of the expansion.  Here is an example to
   demonstrate this:

   for (i = 0 ; i < n; i++)
     sum += something;

   ==>

   sum += something
   ....
   i = i+1;
   sum1 += something
   ....
   i = i+1
   sum2 += something;
   ....

   When SUM is initialized with -zero and SOMETHING is also -zero; the
   final result of sum should be -zero thus the expansions sum1 and sum2
   should be initialized with -zero as well (otherwise we will get +zero
   as the final result).  */

static void
insert_var_expansion_initialization (struct var_to_expand *ve,
				     basic_block place)
{
  rtx_insn *seq;
  rtx var, zero_init;
  unsigned i;
  machine_mode mode = GET_MODE (ve->reg);
  bool honor_signed_zero_p = HONOR_SIGNED_ZEROS (mode);

  /* Nothing to initialize if no expansion registers were created.  */
  if (ve->var_expansions.length () == 0)
    return;

  start_sequence ();
  switch (ve->op)
    {
    case FMA:
      /* Note that we only accumulate FMA via the ADD operand.  */
      /* FALLTHRU */
    case PLUS:
    case MINUS:
      /* Additive accumulators start at zero; use -0.0 where signed
	 zeros matter (see the function comment above).  */
      FOR_EACH_VEC_ELT (ve->var_expansions, i, var)
	{
	  if (honor_signed_zero_p)
	    zero_init = simplify_gen_unary (NEG, mode, CONST0_RTX (mode), mode);
	  else
	    zero_init = CONST0_RTX (mode);
	  emit_move_insn (var, zero_init);
	}
      break;

    case MULT:
      /* Multiplicative accumulators start at one.  */
      FOR_EACH_VEC_ELT (ve->var_expansions, i, var)
	{
	  zero_init = CONST1_RTX (GET_MODE (var));
	  emit_move_insn (var, zero_init);
	}
      break;

    default:
      gcc_unreachable ();
    }

  seq = get_insns ();
  end_sequence ();

  emit_insn_after (seq, BB_END (place));
}
191738fd1498Szrj
/* Combine the variable expansions at the loop exit.  PLACE is the
   loop exit basic block where the summation of the expansions should
   take place.  */

static void
combine_var_copies_in_loop_exit (struct var_to_expand *ve, basic_block place)
{
  rtx sum = ve->reg;
  rtx expr, var;
  rtx_insn *seq, *insn;
  unsigned i;

  /* Nothing to combine if no expansion registers were created.  */
  if (ve->var_expansions.length () == 0)
    return;

  /* ve->reg might be SUBREG or some other non-shareable RTL, and we use
     it both here and as the destination of the assignment.  */
  sum = copy_rtx (sum);
  start_sequence ();
  switch (ve->op)
    {
    case FMA:
      /* Note that we only accumulate FMA via the ADD operand.  */
      /* FALLTHRU */
    case PLUS:
    case MINUS:
      /* MINUS accumulators are also combined with PLUS: each expansion
	 holds only the subtracted amounts, which add up.  */
      FOR_EACH_VEC_ELT (ve->var_expansions, i, var)
	sum = simplify_gen_binary (PLUS, GET_MODE (ve->reg), var, sum);
      break;

    case MULT:
      FOR_EACH_VEC_ELT (ve->var_expansions, i, var)
	sum = simplify_gen_binary (MULT, GET_MODE (ve->reg), var, sum);
      break;

    default:
      gcc_unreachable ();
    }

  expr = force_operand (sum, ve->reg);
  if (expr != ve->reg)
    emit_move_insn (ve->reg, expr);
  seq = get_insns ();
  end_sequence ();

  /* Emit the combination right after the basic block note of PLACE, so
     it executes before anything else in the exit block.  */
  insn = BB_HEAD (place);
  while (!NOTE_INSN_BASIC_BLOCK_P (insn))
    insn = NEXT_INSN (insn);

  emit_insn_after (seq, insn);
}
196838fd1498Szrj
196938fd1498Szrj /* Strip away REG_EQUAL notes for IVs we're splitting.
197038fd1498Szrj
197138fd1498Szrj Updating REG_EQUAL notes for IVs we split is tricky: We
197238fd1498Szrj cannot tell until after unrolling, DF-rescanning, and liveness
197338fd1498Szrj updating, whether an EQ_USE is reached by the split IV while
197438fd1498Szrj the IV reg is still live. See PR55006.
197538fd1498Szrj
197638fd1498Szrj ??? We cannot use remove_reg_equal_equiv_notes_for_regno,
197738fd1498Szrj because RTL loop-iv requires us to defer rescanning insns and
197838fd1498Szrj any notes attached to them. So resort to old techniques... */
197938fd1498Szrj
198038fd1498Szrj static void
maybe_strip_eq_note_for_split_iv(struct opt_info * opt_info,rtx_insn * insn)198138fd1498Szrj maybe_strip_eq_note_for_split_iv (struct opt_info *opt_info, rtx_insn *insn)
198238fd1498Szrj {
198338fd1498Szrj struct iv_to_split *ivts;
198438fd1498Szrj rtx note = find_reg_equal_equiv_note (insn);
198538fd1498Szrj if (! note)
198638fd1498Szrj return;
198738fd1498Szrj for (ivts = opt_info->iv_to_split_head; ivts; ivts = ivts->next)
198838fd1498Szrj if (reg_mentioned_p (ivts->orig_var, note))
198938fd1498Szrj {
199038fd1498Szrj remove_note (insn, note);
199138fd1498Szrj return;
199238fd1498Szrj }
199338fd1498Szrj }
199438fd1498Szrj
/* Apply loop optimizations in loop copies using the
   data which gathered during the unrolling.  Structure
   OPT_INFO record that data.

   UNROLLING is true if we unrolled (not peeled) the loop.
   REWRITE_ORIGINAL_BODY is true if we should also rewrite the original body of
   the loop (as it should happen in complete unrolling, but not in ordinary
   peeling of the loop).  */

static void
apply_opt_in_copies (struct opt_info *opt_info,
		     unsigned n_copies, bool unrolling,
		     bool rewrite_original_loop)
{
  unsigned i, delta;
  basic_block bb, orig_bb;
  rtx_insn *insn, *orig_insn, *next;
  struct iv_to_split ivts_templ, *ivts;
  struct var_to_expand ve_templ, *ves;

  /* Sanity check -- we need to put initialization in the original loop
     body.  */
  gcc_assert (!unrolling || rewrite_original_loop);

  /* Allocate the basic variables (i0).  */
  if (opt_info->insns_to_split)
    for (ivts = opt_info->iv_to_split_head; ivts; ivts = ivts->next)
      allocate_basic_variable (ivts);

  /* First pass: walk all blocks created by the duplication (their
     indices start at FIRST_NEW_BLOCK) and rewrite each copied insn
     using the analysis record of its original counterpart.  */
  for (i = opt_info->first_new_block;
       i < (unsigned) last_basic_block_for_fn (cfun);
       i++)
    {
      bb = BASIC_BLOCK_FOR_FN (cfun, i);
      orig_bb = get_bb_original (bb);

      /* bb->aux holds position in copy sequence initialized by
	 duplicate_loop_to_header_edge.  */
      delta = determine_split_iv_delta ((size_t)bb->aux, n_copies,
					unrolling);
      bb->aux = 0;
      orig_insn = BB_HEAD (orig_bb);
      FOR_BB_INSNS_SAFE (bb, insn, next)
	{
	  /* Skip non-insns and debug binds of LABEL_DECLs; the original
	     and the copy are kept in lock step below, so both walks must
	     skip the same kinds of insns.  */
	  if (!INSN_P (insn)
	      || (DEBUG_BIND_INSN_P (insn)
		  && INSN_VAR_LOCATION_DECL (insn)
		  && TREE_CODE (INSN_VAR_LOCATION_DECL (insn)) == LABEL_DECL))
	    continue;

	  /* Advance ORIG_INSN to the original counterpart of INSN.  */
	  while (!INSN_P (orig_insn)
		 || (DEBUG_BIND_INSN_P (orig_insn)
		     && INSN_VAR_LOCATION_DECL (orig_insn)
		     && (TREE_CODE (INSN_VAR_LOCATION_DECL (orig_insn))
			 == LABEL_DECL)))
	    orig_insn = NEXT_INSN (orig_insn);

	  /* The hash tables are keyed by the ORIGINAL insn, so look the
	     copy up through its counterpart.  */
	  ivts_templ.insn = orig_insn;
	  ve_templ.insn = orig_insn;

	  /* Apply splitting iv optimization.  */
	  if (opt_info->insns_to_split)
	    {
	      maybe_strip_eq_note_for_split_iv (opt_info, insn);

	      ivts = opt_info->insns_to_split->find (&ivts_templ);

	      if (ivts)
		{
		  gcc_assert (GET_CODE (PATTERN (insn))
			      == GET_CODE (PATTERN (orig_insn)));

		  /* Copy number 0 (delta 0) carries the initialization
		     of the base variable.  */
		  if (!delta)
		    insert_base_initialization (ivts, insn);
		  split_iv (ivts, insn, delta);
		}
	    }
	  /* Apply variable expansion optimization.  */
	  if (unrolling && opt_info->insns_with_var_to_expand)
	    {
	      ves = (struct var_to_expand *)
		opt_info->insns_with_var_to_expand->find (&ve_templ);
	      if (ves)
		{
		  gcc_assert (GET_CODE (PATTERN (insn))
			      == GET_CODE (PATTERN (orig_insn)));
		  expand_var_during_unrolling (ves, insn);
		}
	    }
	  orig_insn = NEXT_INSN (orig_insn);
	}
    }

  if (!rewrite_original_loop)
    return;

  /* Initialize the variable expansions in the loop preheader
     and take care of combining them at the loop exit.  */
  if (opt_info->insns_with_var_to_expand)
    {
      for (ves = opt_info->var_to_expand_head; ves; ves = ves->next)
	insert_var_expansion_initialization (ves, opt_info->loop_preheader);
      for (ves = opt_info->var_to_expand_head; ves; ves = ves->next)
	combine_var_copies_in_loop_exit (ves, opt_info->loop_exit);
    }

  /* Rewrite also the original loop body.  Find them as originals of the blocks
     in the last copied iteration, i.e. those that have
     get_bb_copy (get_bb_original (bb)) == bb.  */
  for (i = opt_info->first_new_block;
       i < (unsigned) last_basic_block_for_fn (cfun);
       i++)
    {
      bb = BASIC_BLOCK_FOR_FN (cfun, i);
      orig_bb = get_bb_original (bb);
      if (get_bb_copy (orig_bb) != bb)
	continue;

      /* The original body plays the role of copy number 0.  */
      delta = determine_split_iv_delta (0, n_copies, unrolling);
      for (orig_insn = BB_HEAD (orig_bb);
	   orig_insn != NEXT_INSN (BB_END (bb));
	   orig_insn = next)
	{
	  next = NEXT_INSN (orig_insn);

	  if (!INSN_P (orig_insn))
	    continue;

	  ivts_templ.insn = orig_insn;
	  if (opt_info->insns_to_split)
	    {
	      maybe_strip_eq_note_for_split_iv (opt_info, orig_insn);

	      ivts = (struct iv_to_split *)
		opt_info->insns_to_split->find (&ivts_templ);
	      if (ivts)
		{
		  if (!delta)
		    insert_base_initialization (ivts, orig_insn);
		  split_iv (ivts, orig_insn, delta);
		  continue;
		}
	    }

	}
    }
}
214238fd1498Szrj
214338fd1498Szrj /* Release OPT_INFO. */
214438fd1498Szrj
214538fd1498Szrj static void
free_opt_info(struct opt_info * opt_info)214638fd1498Szrj free_opt_info (struct opt_info *opt_info)
214738fd1498Szrj {
214838fd1498Szrj delete opt_info->insns_to_split;
214938fd1498Szrj opt_info->insns_to_split = NULL;
215038fd1498Szrj if (opt_info->insns_with_var_to_expand)
215138fd1498Szrj {
215238fd1498Szrj struct var_to_expand *ves;
215338fd1498Szrj
215438fd1498Szrj for (ves = opt_info->var_to_expand_head; ves; ves = ves->next)
215538fd1498Szrj ves->var_expansions.release ();
215638fd1498Szrj delete opt_info->insns_with_var_to_expand;
215738fd1498Szrj opt_info->insns_with_var_to_expand = NULL;
215838fd1498Szrj }
215938fd1498Szrj free (opt_info);
216038fd1498Szrj }
2161