138fd1498Szrj /* Loop unroll-and-jam.
238fd1498Szrj Copyright (C) 2017-2018 Free Software Foundation, Inc.
338fd1498Szrj
438fd1498Szrj This file is part of GCC.
538fd1498Szrj
638fd1498Szrj GCC is free software; you can redistribute it and/or modify it
738fd1498Szrj under the terms of the GNU General Public License as published by the
838fd1498Szrj Free Software Foundation; either version 3, or (at your option) any
938fd1498Szrj later version.
1038fd1498Szrj
1138fd1498Szrj GCC is distributed in the hope that it will be useful, but WITHOUT
1238fd1498Szrj ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
1338fd1498Szrj FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License
1438fd1498Szrj for more details.
1538fd1498Szrj
1638fd1498Szrj You should have received a copy of the GNU General Public License
1738fd1498Szrj along with GCC; see the file COPYING3. If not see
1838fd1498Szrj <http://www.gnu.org/licenses/>. */
1938fd1498Szrj
2038fd1498Szrj #include "config.h"
2138fd1498Szrj #include "system.h"
2238fd1498Szrj #include "coretypes.h"
2338fd1498Szrj #include "params.h"
2438fd1498Szrj #include "tree-pass.h"
2538fd1498Szrj #include "backend.h"
2638fd1498Szrj #include "tree.h"
2738fd1498Szrj #include "gimple.h"
2838fd1498Szrj #include "ssa.h"
2938fd1498Szrj #include "fold-const.h"
3038fd1498Szrj #include "tree-cfg.h"
3138fd1498Szrj #include "tree-ssa.h"
3238fd1498Szrj #include "tree-ssa-loop-niter.h"
3338fd1498Szrj #include "tree-ssa-loop.h"
3438fd1498Szrj #include "tree-ssa-loop-manip.h"
3538fd1498Szrj #include "cfgloop.h"
3638fd1498Szrj #include "tree-scalar-evolution.h"
3738fd1498Szrj #include "gimple-iterator.h"
3838fd1498Szrj #include "cfghooks.h"
3938fd1498Szrj #include "tree-data-ref.h"
4038fd1498Szrj #include "tree-ssa-loop-ivopts.h"
4138fd1498Szrj #include "tree-vectorizer.h"
4238fd1498Szrj
4338fd1498Szrj /* Unroll and Jam transformation
4438fd1498Szrj
4538fd1498Szrj This is a combination of two transformations, where the second
4638fd1498Szrj is not always valid. It's applicable if a loop nest has redundancies
4738fd1498Szrj over the iterations of an outer loop while not having that with
4838fd1498Szrj an inner loop.
4938fd1498Szrj
5038fd1498Szrj Given this nest:
5138fd1498Szrj for (i) {
5238fd1498Szrj for (j) {
5338fd1498Szrj B(i,j)
5438fd1498Szrj }
5538fd1498Szrj }
5638fd1498Szrj
5738fd1498Szrj first unroll:
5838fd1498Szrj for (i by 2) {
5938fd1498Szrj for (j) {
6038fd1498Szrj B(i,j)
6138fd1498Szrj }
6238fd1498Szrj for (j) {
6338fd1498Szrj B(i+1,j)
6438fd1498Szrj }
6538fd1498Szrj }
6638fd1498Szrj
6738fd1498Szrj then fuse the two adjacent inner loops resulting from that:
6838fd1498Szrj for (i by 2) {
6938fd1498Szrj for (j) {
7038fd1498Szrj B(i,j)
7138fd1498Szrj B(i+1,j)
7238fd1498Szrj }
7338fd1498Szrj }
7438fd1498Szrj
7538fd1498Szrj As the order of evaluations of the body B changes this is valid
7638fd1498Szrj only in certain situations: all distance vectors need to be forward.
7738fd1498Szrj    Additionally if there are multiple induction variables other than just
7838fd1498Szrj    a counting control IV (j above) we can also deal with some situations.
7938fd1498Szrj
8038fd1498Szrj The validity is checked by unroll_jam_possible_p, and the data-dep
8138fd1498Szrj testing below.
8238fd1498Szrj
8338fd1498Szrj A trivial example where the fusion is wrong would be when
8438fd1498Szrj B(i,j) == x[j-1] = x[j];
8538fd1498Szrj for (i by 2) {
8638fd1498Szrj for (j) {
8738fd1498Szrj x[j-1] = x[j];
8838fd1498Szrj }
8938fd1498Szrj for (j) {
9038fd1498Szrj x[j-1] = x[j];
9138fd1498Szrj }
9238fd1498Szrj } effect: move content to front by two elements
9338fd1498Szrj -->
9438fd1498Szrj for (i by 2) {
9538fd1498Szrj for (j) {
9638fd1498Szrj x[j-1] = x[j];
9738fd1498Szrj x[j-1] = x[j];
9838fd1498Szrj }
9938fd1498Szrj } effect: move content to front by one element
10038fd1498Szrj */
10138fd1498Szrj
10238fd1498Szrj /* Modify the loop tree for the fact that all code once belonging
10338fd1498Szrj to the OLD loop or the outer loop of OLD now is inside LOOP. */
10438fd1498Szrj
10538fd1498Szrj static void
merge_loop_tree(struct loop * loop,struct loop * old)10638fd1498Szrj merge_loop_tree (struct loop *loop, struct loop *old)
10738fd1498Szrj {
10838fd1498Szrj basic_block *bbs;
10938fd1498Szrj int i, n;
11038fd1498Szrj struct loop *subloop;
11138fd1498Szrj edge e;
11238fd1498Szrj edge_iterator ei;
11338fd1498Szrj
11438fd1498Szrj /* Find its nodes. */
11538fd1498Szrj bbs = XNEWVEC (basic_block, n_basic_blocks_for_fn (cfun));
11638fd1498Szrj n = get_loop_body_with_size (loop, bbs, n_basic_blocks_for_fn (cfun));
11738fd1498Szrj
11838fd1498Szrj for (i = 0; i < n; i++)
11938fd1498Szrj {
12038fd1498Szrj /* If the block was direct child of OLD loop it's now part
12138fd1498Szrj of LOOP. If it was outside OLD, then it moved into LOOP
12238fd1498Szrj as well. This avoids changing the loop father for BBs
12338fd1498Szrj in inner loops of OLD. */
12438fd1498Szrj if (bbs[i]->loop_father == old
12538fd1498Szrj || loop_depth (bbs[i]->loop_father) < loop_depth (old))
12638fd1498Szrj {
12738fd1498Szrj remove_bb_from_loops (bbs[i]);
12838fd1498Szrj add_bb_to_loop (bbs[i], loop);
12938fd1498Szrj continue;
13038fd1498Szrj }
13138fd1498Szrj
13238fd1498Szrj /* If we find a direct subloop of OLD, move it to LOOP. */
13338fd1498Szrj subloop = bbs[i]->loop_father;
13438fd1498Szrj if (loop_outer (subloop) == old && subloop->header == bbs[i])
13538fd1498Szrj {
13638fd1498Szrj flow_loop_tree_node_remove (subloop);
13738fd1498Szrj flow_loop_tree_node_add (loop, subloop);
13838fd1498Szrj }
13938fd1498Szrj }
14038fd1498Szrj
14138fd1498Szrj /* Update the information about loop exit edges. */
14238fd1498Szrj for (i = 0; i < n; i++)
14338fd1498Szrj {
14438fd1498Szrj FOR_EACH_EDGE (e, ei, bbs[i]->succs)
14538fd1498Szrj {
14638fd1498Szrj rescan_loop_exit (e, false, false);
14738fd1498Szrj }
14838fd1498Szrj }
14938fd1498Szrj
15038fd1498Szrj loop->num_nodes = n;
15138fd1498Szrj
15238fd1498Szrj free (bbs);
15338fd1498Szrj }
15438fd1498Szrj
15538fd1498Szrj /* BB is part of the outer loop of an unroll-and-jam situation.
15638fd1498Szrj Check if any statements therein would prevent the transformation. */
15738fd1498Szrj
15838fd1498Szrj static bool
bb_prevents_fusion_p(basic_block bb)15938fd1498Szrj bb_prevents_fusion_p (basic_block bb)
16038fd1498Szrj {
16138fd1498Szrj gimple_stmt_iterator gsi;
16238fd1498Szrj /* BB is duplicated by outer unrolling and then all N-1 first copies
16338fd1498Szrj move into the body of the fused inner loop. If BB exits the outer loop
164*58e805e6Szrj the last copy still does so, and the first N-1 copies are cancelled
16538fd1498Szrj by loop unrolling, so also after fusion it's the exit block.
16638fd1498Szrj But there might be other reasons that prevent fusion:
16738fd1498Szrj * stores or unknown side-effects prevent fusion
16838fd1498Szrj * loads don't
16938fd1498Szrj * computations into SSA names: these aren't problematic. Their
17038fd1498Szrj result will be unused on the exit edges of the first N-1 copies
17138fd1498Szrj (those aren't taken after unrolling). If they are used on the
17238fd1498Szrj other edge (the one leading to the outer latch block) they are
17338fd1498Szrj loop-carried (on the outer loop) and the Nth copy of BB will
17438fd1498Szrj compute them again (i.e. the first N-1 copies will be dead). */
17538fd1498Szrj for (gsi = gsi_start_bb (bb); !gsi_end_p (gsi); gsi_next (&gsi))
17638fd1498Szrj {
17738fd1498Szrj gimple *g = gsi_stmt (gsi);
17838fd1498Szrj if (gimple_vdef (g) || gimple_has_side_effects (g))
17938fd1498Szrj return true;
18038fd1498Szrj }
18138fd1498Szrj return false;
18238fd1498Szrj }
18338fd1498Szrj
/* Given an inner loop LOOP (of some OUTER loop) determine if
   we can safely fuse copies of it (generated by outer unrolling).
   If so return true, otherwise return false.  */

static bool
unroll_jam_possible_p (struct loop *outer, struct loop *loop)
{
  basic_block *bbs;
  int i, n;
  struct tree_niter_desc niter;

  /* When fusing the loops we skip the latch block
     of the first one, so it mustn't have any effects to
     preserve.  */
  if (!empty_block_p (loop->latch))
    return false;

  /* A unique exit is required so the iteration count and the exit
     PHIs below are well defined.  */
  if (!single_exit (loop))
    return false;

  /* We need a perfect nest.  Quick check for adjacent inner loops.  */
  if (outer->inner != loop || loop->next)
    return false;

  /* Prevent head-controlled inner loops, that we usually have.
     The guard block would need to be accepted
     (invariant condition either entering or skipping the loop),
     without also accepting arbitrary control flow.  When unswitching
     ran before us (as with -O3) this won't be a problem because its
     outer loop unswitching will have moved out the invariant condition.

     If we do that we need to extend fuse_loops() to cope with this
     by threading through the (still invariant) copied condition
     between the two loop copies.  */
  if (!dominated_by_p (CDI_DOMINATORS, outer->latch, loop->header))
    return false;

  /* The number of iterations of the inner loop must be loop invariant
     with respect to the outer loop.  Also require a provably non-zero
     trip count and a usable exit comparison (cmp != ERROR_MARK).  */
  if (!number_of_iterations_exit (loop, single_exit (loop), &niter,
				  false, true)
      || niter.cmp == ERROR_MARK
      || !integer_zerop (niter.may_be_zero)
      || !expr_invariant_in_loop_p (outer, niter.niter))
    return false;

  /* If the inner loop produces any values that are used inside the
     outer loop (except the virtual op) then it can flow
     back (perhaps indirectly) into the inner loop.  This prevents
     fusion: without fusion the value at the last iteration is used,
     with fusion the value after the initial iteration is used.

     If all uses are outside the outer loop this doesn't prevent fusion;
     the value of the last iteration is still used (and the values from
     all intermediate iterations are dead).  */
  gphi_iterator psi;
  for (psi = gsi_start_phis (single_exit (loop)->dest);
       !gsi_end_p (psi); gsi_next (&psi))
    {
      imm_use_iterator imm_iter;
      use_operand_p use_p;
      tree op = gimple_phi_result (psi.phi ());
      if (virtual_operand_p (op))
	continue;
      /* Walk all (non-debug) uses of this LCSSA result; any use that
	 lies inside OUTER rejects the transformation.  */
      FOR_EACH_IMM_USE_FAST (use_p, imm_iter, op)
	{
	  gimple *use_stmt = USE_STMT (use_p);
	  if (!is_gimple_debug (use_stmt)
	      && flow_bb_inside_loop_p (outer, gimple_bb (use_stmt)))
	    return false;
	}
    }

  /* And check blocks belonging to just outer loop.  */
  bbs = XNEWVEC (basic_block, n_basic_blocks_for_fn (cfun));
  n = get_loop_body_with_size (outer, bbs, n_basic_blocks_for_fn (cfun));

  for (i = 0; i < n; i++)
    if (bbs[i]->loop_father == outer && bb_prevents_fusion_p (bbs[i]))
      break;
  free (bbs);
  /* i != n means the scan above stopped early on a problematic block.  */
  if (i != n)
    return false;

  /* For now we can safely fuse copies of LOOP only if all
     loop carried variables are inductions (or the virtual op).

     We could handle reductions as well (the initial value in the second
     body would be the after-iter value of the first body) if it's over
     an associative and commutative operation.  We wouldn't
     be able to handle unknown cycles.  */
  for (psi = gsi_start_phis (loop->header); !gsi_end_p (psi); gsi_next (&psi))
    {
      affine_iv iv;
      tree op = gimple_phi_result (psi.phi ());

      if (virtual_operand_p (op))
	continue;
      if (!simple_iv (loop, loop, op, &iv, true))
	return false;
      /* The inductions must be regular, loop invariant step and initial
	 value.  */
      if (!expr_invariant_in_loop_p (outer, iv.step)
	  || !expr_invariant_in_loop_p (outer, iv.base))
	return false;
      /* XXX With more effort we could also be able to deal with inductions
	 where the initial value is loop variant but a simple IV in the
	 outer loop.  The initial value for the second body would be
	 the original initial value plus iv.base.step.  The next value
	 for the fused loop would be the original next value of the first
	 copy, _not_ the next value of the second body.  */
    }

  return true;
}
29938fd1498Szrj
/* Fuse LOOP with all further neighbors.  The loops are expected to
   be in appropriate form.  */

static void
fuse_loops (struct loop *loop)
{
  struct loop *next = loop->next;

  while (next)
    {
      edge e;

      /* Remove the edge (and its controlling branch) into LOOP's
	 latch; the fused body will instead fall through into NEXT.  */
      remove_branch (single_pred_edge (loop->latch));
      /* Make delete_basic_block not fiddle with the loop structure.  */
      basic_block oldlatch = loop->latch;
      loop->latch = NULL;
      delete_basic_block (oldlatch);
      /* NEXT's latch edge now becomes the back edge of the fused loop;
	 its source is the new latch of LOOP.  */
      e = redirect_edge_and_branch (loop_latch_edge (next),
				    loop->header);
      loop->latch = e->src;
      flush_pending_stmts (e);

      gcc_assert (EDGE_COUNT (next->header->preds) == 1);

      /* The PHI nodes of the second body (single-argument now)
	 need adjustments to use the right values: either directly
	 the value of the corresponding PHI in the first copy or
	 the one leaving the first body which unrolling did for us.

	 See also unroll_jam_possible_p() for further possibilities.  */
      gphi_iterator psi_first, psi_second;
      e = single_pred_edge (next->header);
      /* Walk the header PHIs of both bodies in lockstep; unrolling
	 created them as copies, so they correspond one-to-one.  */
      for (psi_first = gsi_start_phis (loop->header),
	   psi_second = gsi_start_phis (next->header);
	   !gsi_end_p (psi_first);
	   gsi_next (&psi_first), gsi_next (&psi_second))
	{
	  gphi *phi_first = psi_first.phi ();
	  gphi *phi_second = psi_second.phi ();
	  tree firstop = gimple_phi_result (phi_first);
	  /* The virtual operand is correct already as it's
	     always live at exit, hence has a LCSSA node and outer
	     loop unrolling updated SSA form.  */
	  if (virtual_operand_p (firstop))
	    continue;

	  /* Due to unroll_jam_possible_p() we know that this is
	     an induction.  The second body goes over the same
	     iteration space.  */
	  add_phi_arg (phi_second, firstop, e,
		       gimple_location (phi_first));
	}
      gcc_assert (gsi_end_p (psi_second));

      /* Absorb NEXT's blocks and subloops into LOOP, then dispose of
	 the now-empty loop structure and advance to the next sibling.  */
      merge_loop_tree (loop, next);
      gcc_assert (!next->num_nodes);
      struct loop *ln = next->next;
      delete_loop (next);
      next = ln;
    }
  /* Fusing may have moved uses across loop boundaries; restore
     loop-closed SSA form for LOOP.  */
  rewrite_into_loop_closed_ssa_1 (NULL, 0, SSA_OP_USE, loop);
}
36238fd1498Szrj
36338fd1498Szrj /* Returns true if the distance in DDR can be determined and adjusts
36438fd1498Szrj the unroll factor in *UNROLL to make unrolling valid for that distance.
36538fd1498Szrj Otherwise return false.
36638fd1498Szrj
36738fd1498Szrj If this data dep can lead to a removed memory reference, increment
36838fd1498Szrj *REMOVED and adjust *PROFIT_UNROLL to be the necessary unroll factor
36938fd1498Szrj for this to happen. */
37038fd1498Szrj
37138fd1498Szrj static bool
adjust_unroll_factor(struct data_dependence_relation * ddr,unsigned * unroll,unsigned * profit_unroll,unsigned * removed)37238fd1498Szrj adjust_unroll_factor (struct data_dependence_relation *ddr,
37338fd1498Szrj unsigned *unroll, unsigned *profit_unroll,
37438fd1498Szrj unsigned *removed)
37538fd1498Szrj {
37638fd1498Szrj bool ret = false;
37738fd1498Szrj if (DDR_ARE_DEPENDENT (ddr) != chrec_known)
37838fd1498Szrj {
37938fd1498Szrj if (DDR_NUM_DIST_VECTS (ddr) == 0)
38038fd1498Szrj return false;
38138fd1498Szrj unsigned i;
38238fd1498Szrj lambda_vector dist_v;
38338fd1498Szrj FOR_EACH_VEC_ELT (DDR_DIST_VECTS (ddr), i, dist_v)
38438fd1498Szrj {
38538fd1498Szrj /* A distance (a,b) is at worst transformed into (a/N,b) by the
38638fd1498Szrj unrolling (factor N), so the transformation is valid if
38738fd1498Szrj a >= N, or b > 0, or b is zero and a > 0. Otherwise the unroll
38838fd1498Szrj factor needs to be limited so that the first condition holds.
38938fd1498Szrj That may limit the factor down to zero in the worst case. */
39038fd1498Szrj int dist = dist_v[0];
39138fd1498Szrj if (dist < 0)
39238fd1498Szrj gcc_unreachable ();
39338fd1498Szrj else if ((unsigned)dist >= *unroll)
39438fd1498Szrj ;
39538fd1498Szrj else if (lambda_vector_lexico_pos (dist_v + 1, DDR_NB_LOOPS (ddr) - 1)
39638fd1498Szrj || (lambda_vector_zerop (dist_v + 1, DDR_NB_LOOPS (ddr) - 1)
39738fd1498Szrj && dist > 0))
39838fd1498Szrj ;
39938fd1498Szrj else
40038fd1498Szrj *unroll = dist;
40138fd1498Szrj
40238fd1498Szrj /* With a distance (a,0) it's always profitable to unroll-and-jam
40338fd1498Szrj (by a+1), because one memory reference will go away. With
40438fd1498Szrj (a,b) and b != 0 that's less clear. We will increase the
40538fd1498Szrj number of streams without lowering the number of mem refs.
40638fd1498Szrj So for now only handle the first situation. */
40738fd1498Szrj if (lambda_vector_zerop (dist_v + 1, DDR_NB_LOOPS (ddr) - 1))
40838fd1498Szrj {
40938fd1498Szrj *profit_unroll = MAX (*profit_unroll, (unsigned)dist + 1);
41038fd1498Szrj (*removed)++;
41138fd1498Szrj }
41238fd1498Szrj
41338fd1498Szrj ret = true;
41438fd1498Szrj }
41538fd1498Szrj }
41638fd1498Szrj return ret;
41738fd1498Szrj }
41838fd1498Szrj
41938fd1498Szrj /* Main entry point for the unroll-and-jam transformation
42038fd1498Szrj described above. */
42138fd1498Szrj
42238fd1498Szrj static unsigned int
tree_loop_unroll_and_jam(void)42338fd1498Szrj tree_loop_unroll_and_jam (void)
42438fd1498Szrj {
42538fd1498Szrj struct loop *loop;
42638fd1498Szrj bool changed = false;
42738fd1498Szrj
42838fd1498Szrj gcc_assert (scev_initialized_p ());
42938fd1498Szrj
43038fd1498Szrj /* Go through all innermost loops. */
43138fd1498Szrj FOR_EACH_LOOP (loop, LI_ONLY_INNERMOST)
43238fd1498Szrj {
43338fd1498Szrj struct loop *outer = loop_outer (loop);
43438fd1498Szrj
43538fd1498Szrj if (loop_depth (loop) < 2
43638fd1498Szrj || optimize_loop_nest_for_size_p (outer))
43738fd1498Szrj continue;
43838fd1498Szrj
43938fd1498Szrj if (!unroll_jam_possible_p (outer, loop))
44038fd1498Szrj continue;
44138fd1498Szrj
44238fd1498Szrj vec<data_reference_p> datarefs;
44338fd1498Szrj vec<ddr_p> dependences;
44438fd1498Szrj unsigned unroll_factor, profit_unroll, removed;
44538fd1498Szrj struct tree_niter_desc desc;
44638fd1498Szrj bool unroll = false;
44738fd1498Szrj
44838fd1498Szrj auto_vec<loop_p, 3> loop_nest;
44938fd1498Szrj dependences.create (10);
45038fd1498Szrj datarefs.create (10);
45138fd1498Szrj if (!compute_data_dependences_for_loop (outer, true, &loop_nest,
45238fd1498Szrj &datarefs, &dependences))
45338fd1498Szrj {
45438fd1498Szrj if (dump_file && (dump_flags & TDF_DETAILS))
45538fd1498Szrj fprintf (dump_file, "Cannot analyze data dependencies\n");
45638fd1498Szrj free_data_refs (datarefs);
45738fd1498Szrj free_dependence_relations (dependences);
458*58e805e6Szrj continue;
45938fd1498Szrj }
46038fd1498Szrj if (!datarefs.length ())
46138fd1498Szrj continue;
46238fd1498Szrj
46338fd1498Szrj if (dump_file && (dump_flags & TDF_DETAILS))
46438fd1498Szrj dump_data_dependence_relations (dump_file, dependences);
46538fd1498Szrj
46638fd1498Szrj unroll_factor = (unsigned)-1;
46738fd1498Szrj profit_unroll = 1;
46838fd1498Szrj removed = 0;
46938fd1498Szrj
47038fd1498Szrj /* Check all dependencies. */
47138fd1498Szrj unsigned i;
47238fd1498Szrj struct data_dependence_relation *ddr;
47338fd1498Szrj FOR_EACH_VEC_ELT (dependences, i, ddr)
47438fd1498Szrj {
47538fd1498Szrj struct data_reference *dra, *drb;
47638fd1498Szrj
47738fd1498Szrj /* If the refs are independend there's nothing to do. */
47838fd1498Szrj if (DDR_ARE_DEPENDENT (ddr) == chrec_known)
47938fd1498Szrj continue;
48038fd1498Szrj dra = DDR_A (ddr);
48138fd1498Szrj drb = DDR_B (ddr);
48238fd1498Szrj /* Nothing interesting for the self dependencies. */
48338fd1498Szrj if (dra == drb)
48438fd1498Szrj continue;
48538fd1498Szrj
48638fd1498Szrj /* Now check the distance vector, for determining a sensible
48738fd1498Szrj outer unroll factor, and for validity of merging the inner
48838fd1498Szrj loop copies. */
48938fd1498Szrj if (!adjust_unroll_factor (ddr, &unroll_factor, &profit_unroll,
49038fd1498Szrj &removed))
49138fd1498Szrj {
49238fd1498Szrj /* Couldn't get the distance vector. For two reads that's
49338fd1498Szrj harmless (we assume we should unroll). For at least
49438fd1498Szrj one write this means we can't check the dependence direction
49538fd1498Szrj and hence can't determine safety. */
49638fd1498Szrj
49738fd1498Szrj if (DR_IS_WRITE (dra) || DR_IS_WRITE (drb))
49838fd1498Szrj {
49938fd1498Szrj unroll_factor = 0;
50038fd1498Szrj break;
50138fd1498Szrj }
50238fd1498Szrj }
50338fd1498Szrj }
50438fd1498Szrj
50538fd1498Szrj /* We regard a user-specified minimum percentage of zero as a request
50638fd1498Szrj to ignore all profitability concerns and apply the transformation
50738fd1498Szrj always. */
50838fd1498Szrj if (!PARAM_VALUE (PARAM_UNROLL_JAM_MIN_PERCENT))
50938fd1498Szrj profit_unroll = 2;
51038fd1498Szrj else if (removed * 100 / datarefs.length ()
51138fd1498Szrj < (unsigned)PARAM_VALUE (PARAM_UNROLL_JAM_MIN_PERCENT))
51238fd1498Szrj profit_unroll = 1;
51338fd1498Szrj if (unroll_factor > profit_unroll)
51438fd1498Szrj unroll_factor = profit_unroll;
51538fd1498Szrj if (unroll_factor > (unsigned)PARAM_VALUE (PARAM_UNROLL_JAM_MAX_UNROLL))
51638fd1498Szrj unroll_factor = PARAM_VALUE (PARAM_UNROLL_JAM_MAX_UNROLL);
51738fd1498Szrj unroll = (unroll_factor > 1
51838fd1498Szrj && can_unroll_loop_p (outer, unroll_factor, &desc));
51938fd1498Szrj
52038fd1498Szrj if (unroll)
52138fd1498Szrj {
52238fd1498Szrj if (dump_enabled_p ())
52338fd1498Szrj dump_printf_loc (MSG_OPTIMIZED_LOCATIONS | TDF_DETAILS,
52438fd1498Szrj find_loop_location (outer),
52538fd1498Szrj "applying unroll and jam with factor %d\n",
52638fd1498Szrj unroll_factor);
52738fd1498Szrj initialize_original_copy_tables ();
52838fd1498Szrj tree_unroll_loop (outer, unroll_factor, single_dom_exit (outer),
52938fd1498Szrj &desc);
53038fd1498Szrj free_original_copy_tables ();
53138fd1498Szrj fuse_loops (outer->inner);
53238fd1498Szrj changed = true;
53338fd1498Szrj }
53438fd1498Szrj
53538fd1498Szrj loop_nest.release ();
53638fd1498Szrj free_dependence_relations (dependences);
53738fd1498Szrj free_data_refs (datarefs);
53838fd1498Szrj }
53938fd1498Szrj
54038fd1498Szrj if (changed)
54138fd1498Szrj {
54238fd1498Szrj scev_reset ();
54338fd1498Szrj free_dominance_info (CDI_DOMINATORS);
54438fd1498Szrj return TODO_cleanup_cfg;
54538fd1498Szrj }
54638fd1498Szrj return 0;
54738fd1498Szrj }
54838fd1498Szrj
/* Pass boilerplate */

namespace {

/* Static descriptor for the unroll-and-jam GIMPLE pass; it only
   requires a CFG and sets no TODO flags of its own (the execute
   function returns TODO_cleanup_cfg when needed).  */
const pass_data pass_data_loop_jam =
{
  GIMPLE_PASS, /* type */
  "unrolljam", /* name */
  OPTGROUP_LOOP, /* optinfo_flags */
  TV_LOOP_JAM, /* tv_id */
  PROP_cfg, /* properties_required */
  0, /* properties_provided */
  0, /* properties_destroyed */
  0, /* todo_flags_start */
  0, /* todo_flags_finish */
};

class pass_loop_jam : public gimple_opt_pass
{
public:
  pass_loop_jam (gcc::context *ctxt)
    : gimple_opt_pass (pass_data_loop_jam, ctxt)
  {}

  /* opt_pass methods: */
  /* The pass only runs when -funroll-and-jam is enabled.  */
  virtual bool gate (function *) { return flag_unroll_jam != 0; }
  virtual unsigned int execute (function *);

};

unsigned int
pass_loop_jam::execute (function *fun)
{
  /* With at most the single "fake" outermost loop there is no nest
     to transform.  */
  if (number_of_loops (fun) <= 1)
    return 0;

  return tree_loop_unroll_and_jam ();
}

}
58938fd1498Szrj
59038fd1498Szrj gimple_opt_pass *
make_pass_loop_jam(gcc::context * ctxt)59138fd1498Szrj make_pass_loop_jam (gcc::context *ctxt)
59238fd1498Szrj {
59338fd1498Szrj return new pass_loop_jam (ctxt);
59438fd1498Szrj }
595