138fd1498Szrj /* Loop unroll-and-jam.
238fd1498Szrj    Copyright (C) 2017-2018 Free Software Foundation, Inc.
338fd1498Szrj 
438fd1498Szrj This file is part of GCC.
538fd1498Szrj 
638fd1498Szrj GCC is free software; you can redistribute it and/or modify it
738fd1498Szrj under the terms of the GNU General Public License as published by the
838fd1498Szrj Free Software Foundation; either version 3, or (at your option) any
938fd1498Szrj later version.
1038fd1498Szrj 
1138fd1498Szrj GCC is distributed in the hope that it will be useful, but WITHOUT
1238fd1498Szrj ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
1338fd1498Szrj FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License
1438fd1498Szrj for more details.
1538fd1498Szrj 
1638fd1498Szrj You should have received a copy of the GNU General Public License
1738fd1498Szrj along with GCC; see the file COPYING3.  If not see
1838fd1498Szrj <http://www.gnu.org/licenses/>.  */
1938fd1498Szrj 
2038fd1498Szrj #include "config.h"
2138fd1498Szrj #include "system.h"
2238fd1498Szrj #include "coretypes.h"
2338fd1498Szrj #include "params.h"
2438fd1498Szrj #include "tree-pass.h"
2538fd1498Szrj #include "backend.h"
2638fd1498Szrj #include "tree.h"
2738fd1498Szrj #include "gimple.h"
2838fd1498Szrj #include "ssa.h"
2938fd1498Szrj #include "fold-const.h"
3038fd1498Szrj #include "tree-cfg.h"
3138fd1498Szrj #include "tree-ssa.h"
3238fd1498Szrj #include "tree-ssa-loop-niter.h"
3338fd1498Szrj #include "tree-ssa-loop.h"
3438fd1498Szrj #include "tree-ssa-loop-manip.h"
3538fd1498Szrj #include "cfgloop.h"
3638fd1498Szrj #include "tree-scalar-evolution.h"
3738fd1498Szrj #include "gimple-iterator.h"
3838fd1498Szrj #include "cfghooks.h"
3938fd1498Szrj #include "tree-data-ref.h"
4038fd1498Szrj #include "tree-ssa-loop-ivopts.h"
4138fd1498Szrj #include "tree-vectorizer.h"
4238fd1498Szrj 
4338fd1498Szrj /* Unroll and Jam transformation
4438fd1498Szrj 
4538fd1498Szrj    This is a combination of two transformations, where the second
4638fd1498Szrj    is not always valid.  It's applicable if a loop nest has redundancies
4738fd1498Szrj    over the iterations of an outer loop while not having that with
4838fd1498Szrj    an inner loop.
4938fd1498Szrj 
5038fd1498Szrj    Given this nest:
5138fd1498Szrj        for (i) {
5238fd1498Szrj 	 for (j) {
5338fd1498Szrj 	   B(i,j)
5438fd1498Szrj 	 }
5538fd1498Szrj        }
5638fd1498Szrj 
5738fd1498Szrj    first unroll:
5838fd1498Szrj        for (i by 2) {
5938fd1498Szrj 	 for (j) {
6038fd1498Szrj 	   B(i,j)
6138fd1498Szrj 	 }
6238fd1498Szrj 	 for (j) {
6338fd1498Szrj 	   B(i+1,j)
6438fd1498Szrj 	 }
6538fd1498Szrj        }
6638fd1498Szrj 
6738fd1498Szrj    then fuse the two adjacent inner loops resulting from that:
6838fd1498Szrj        for (i by 2) {
6938fd1498Szrj 	 for (j) {
7038fd1498Szrj 	   B(i,j)
7138fd1498Szrj 	   B(i+1,j)
7238fd1498Szrj 	 }
7338fd1498Szrj        }
7438fd1498Szrj 
7538fd1498Szrj    As the order of evaluations of the body B changes this is valid
7638fd1498Szrj    only in certain situations: all distance vectors need to be forward.
7738fd1498Szrj    Additionally if there are multiple induction variables than just
7838fd1498Szrj    a counting control IV (j above) we can also deal with some situations.
7938fd1498Szrj 
8038fd1498Szrj    The validity is checked by unroll_jam_possible_p, and the data-dep
8138fd1498Szrj    testing below.
8238fd1498Szrj 
8338fd1498Szrj    A trivial example where the fusion is wrong would be when
8438fd1498Szrj    B(i,j) == x[j-1] = x[j];
8538fd1498Szrj        for (i by 2) {
8638fd1498Szrj 	 for (j) {
8738fd1498Szrj 	   x[j-1] = x[j];
8838fd1498Szrj 	 }
8938fd1498Szrj 	 for (j) {
9038fd1498Szrj 	   x[j-1] = x[j];
9138fd1498Szrj 	 }
9238fd1498Szrj        }  effect: move content to front by two elements
9338fd1498Szrj        -->
9438fd1498Szrj        for (i by 2) {
9538fd1498Szrj 	 for (j) {
9638fd1498Szrj 	   x[j-1] = x[j];
9738fd1498Szrj 	   x[j-1] = x[j];
9838fd1498Szrj 	 }
9938fd1498Szrj        }  effect: move content to front by one element
10038fd1498Szrj */
10138fd1498Szrj 
10238fd1498Szrj /* Modify the loop tree for the fact that all code once belonging
10338fd1498Szrj    to the OLD loop or the outer loop of OLD now is inside LOOP.  */
10438fd1498Szrj 
10538fd1498Szrj static void
merge_loop_tree(struct loop * loop,struct loop * old)10638fd1498Szrj merge_loop_tree (struct loop *loop, struct loop *old)
10738fd1498Szrj {
10838fd1498Szrj   basic_block *bbs;
10938fd1498Szrj   int i, n;
11038fd1498Szrj   struct loop *subloop;
11138fd1498Szrj   edge e;
11238fd1498Szrj   edge_iterator ei;
11338fd1498Szrj 
11438fd1498Szrj   /* Find its nodes.  */
11538fd1498Szrj   bbs = XNEWVEC (basic_block, n_basic_blocks_for_fn (cfun));
11638fd1498Szrj   n = get_loop_body_with_size (loop, bbs, n_basic_blocks_for_fn (cfun));
11738fd1498Szrj 
11838fd1498Szrj   for (i = 0; i < n; i++)
11938fd1498Szrj     {
12038fd1498Szrj       /* If the block was direct child of OLD loop it's now part
12138fd1498Szrj          of LOOP.  If it was outside OLD, then it moved into LOOP
12238fd1498Szrj 	 as well.  This avoids changing the loop father for BBs
12338fd1498Szrj 	 in inner loops of OLD.  */
12438fd1498Szrj       if (bbs[i]->loop_father == old
12538fd1498Szrj 	  || loop_depth (bbs[i]->loop_father) < loop_depth (old))
12638fd1498Szrj 	{
12738fd1498Szrj 	  remove_bb_from_loops (bbs[i]);
12838fd1498Szrj 	  add_bb_to_loop (bbs[i], loop);
12938fd1498Szrj 	  continue;
13038fd1498Szrj 	}
13138fd1498Szrj 
13238fd1498Szrj       /* If we find a direct subloop of OLD, move it to LOOP.  */
13338fd1498Szrj       subloop = bbs[i]->loop_father;
13438fd1498Szrj       if (loop_outer (subloop) == old && subloop->header == bbs[i])
13538fd1498Szrj 	{
13638fd1498Szrj 	  flow_loop_tree_node_remove (subloop);
13738fd1498Szrj 	  flow_loop_tree_node_add (loop, subloop);
13838fd1498Szrj 	}
13938fd1498Szrj     }
14038fd1498Szrj 
14138fd1498Szrj   /* Update the information about loop exit edges.  */
14238fd1498Szrj   for (i = 0; i < n; i++)
14338fd1498Szrj     {
14438fd1498Szrj       FOR_EACH_EDGE (e, ei, bbs[i]->succs)
14538fd1498Szrj 	{
14638fd1498Szrj 	  rescan_loop_exit (e, false, false);
14738fd1498Szrj 	}
14838fd1498Szrj     }
14938fd1498Szrj 
15038fd1498Szrj   loop->num_nodes = n;
15138fd1498Szrj 
15238fd1498Szrj   free (bbs);
15338fd1498Szrj }
15438fd1498Szrj 
15538fd1498Szrj /* BB is part of the outer loop of an unroll-and-jam situation.
15638fd1498Szrj    Check if any statements therein would prevent the transformation.  */
15738fd1498Szrj 
15838fd1498Szrj static bool
bb_prevents_fusion_p(basic_block bb)15938fd1498Szrj bb_prevents_fusion_p (basic_block bb)
16038fd1498Szrj {
16138fd1498Szrj   gimple_stmt_iterator gsi;
16238fd1498Szrj   /* BB is duplicated by outer unrolling and then all N-1 first copies
16338fd1498Szrj      move into the body of the fused inner loop.  If BB exits the outer loop
164*58e805e6Szrj      the last copy still does so, and the first N-1 copies are cancelled
16538fd1498Szrj      by loop unrolling, so also after fusion it's the exit block.
16638fd1498Szrj      But there might be other reasons that prevent fusion:
16738fd1498Szrj        * stores or unknown side-effects prevent fusion
16838fd1498Szrj        * loads don't
16938fd1498Szrj        * computations into SSA names: these aren't problematic.  Their
17038fd1498Szrj          result will be unused on the exit edges of the first N-1 copies
17138fd1498Szrj 	 (those aren't taken after unrolling).  If they are used on the
17238fd1498Szrj 	 other edge (the one leading to the outer latch block) they are
17338fd1498Szrj 	 loop-carried (on the outer loop) and the Nth copy of BB will
17438fd1498Szrj 	 compute them again (i.e. the first N-1 copies will be dead).  */
17538fd1498Szrj   for (gsi = gsi_start_bb (bb); !gsi_end_p (gsi); gsi_next (&gsi))
17638fd1498Szrj     {
17738fd1498Szrj       gimple *g = gsi_stmt (gsi);
17838fd1498Szrj       if (gimple_vdef (g) || gimple_has_side_effects (g))
17938fd1498Szrj 	return true;
18038fd1498Szrj     }
18138fd1498Szrj   return false;
18238fd1498Szrj }
18338fd1498Szrj 
18438fd1498Szrj /* Given an inner loop LOOP (of some OUTER loop) determine if
18538fd1498Szrj    we can safely fuse copies of it (generated by outer unrolling).
18638fd1498Szrj    If so return true, otherwise return false.  */
18738fd1498Szrj 
18838fd1498Szrj static bool
unroll_jam_possible_p(struct loop * outer,struct loop * loop)18938fd1498Szrj unroll_jam_possible_p (struct loop *outer, struct loop *loop)
19038fd1498Szrj {
19138fd1498Szrj   basic_block *bbs;
19238fd1498Szrj   int i, n;
19338fd1498Szrj   struct tree_niter_desc niter;
19438fd1498Szrj 
19538fd1498Szrj   /* When fusing the loops we skip the latch block
19638fd1498Szrj      of the first one, so it mustn't have any effects to
19738fd1498Szrj      preserve.  */
19838fd1498Szrj   if (!empty_block_p (loop->latch))
19938fd1498Szrj     return false;
20038fd1498Szrj 
20138fd1498Szrj   if (!single_exit (loop))
20238fd1498Szrj     return false;
20338fd1498Szrj 
20438fd1498Szrj   /* We need a perfect nest.  Quick check for adjacent inner loops.  */
20538fd1498Szrj   if (outer->inner != loop || loop->next)
20638fd1498Szrj     return false;
20738fd1498Szrj 
20838fd1498Szrj   /* Prevent head-controlled inner loops, that we usually have.
20938fd1498Szrj      The guard block would need to be accepted
21038fd1498Szrj      (invariant condition either entering or skipping the loop),
21138fd1498Szrj      without also accepting arbitrary control flow.  When unswitching
21238fd1498Szrj      ran before us (as with -O3) this won't be a problem because its
21338fd1498Szrj      outer loop unswitching will have moved out the invariant condition.
21438fd1498Szrj 
21538fd1498Szrj      If we do that we need to extend fuse_loops() to cope with this
21638fd1498Szrj      by threading through the (still invariant) copied condition
21738fd1498Szrj      between the two loop copies.  */
21838fd1498Szrj   if (!dominated_by_p (CDI_DOMINATORS, outer->latch, loop->header))
21938fd1498Szrj     return false;
22038fd1498Szrj 
22138fd1498Szrj   /* The number of iterations of the inner loop must be loop invariant
22238fd1498Szrj      with respect to the outer loop.  */
22338fd1498Szrj   if (!number_of_iterations_exit (loop, single_exit (loop), &niter,
22438fd1498Szrj 				 false, true)
22538fd1498Szrj       || niter.cmp == ERROR_MARK
22638fd1498Szrj       || !integer_zerop (niter.may_be_zero)
22738fd1498Szrj       || !expr_invariant_in_loop_p (outer, niter.niter))
22838fd1498Szrj     return false;
22938fd1498Szrj 
230*58e805e6Szrj   /* If the inner loop produces any values that are used inside the
231*58e805e6Szrj      outer loop (except the virtual op) then it can flow
232*58e805e6Szrj      back (perhaps indirectly) into the inner loop.  This prevents
233*58e805e6Szrj      fusion: without fusion the value at the last iteration is used,
234*58e805e6Szrj      with fusion the value after the initial iteration is used.
235*58e805e6Szrj 
236*58e805e6Szrj      If all uses are outside the outer loop this doesn't prevent fusion;
237*58e805e6Szrj      the value of the last iteration is still used (and the values from
238*58e805e6Szrj      all intermediate iterations are dead).  */
239*58e805e6Szrj   gphi_iterator psi;
240*58e805e6Szrj   for (psi = gsi_start_phis (single_exit (loop)->dest);
241*58e805e6Szrj        !gsi_end_p (psi); gsi_next (&psi))
242*58e805e6Szrj     {
243*58e805e6Szrj       imm_use_iterator imm_iter;
244*58e805e6Szrj       use_operand_p use_p;
245*58e805e6Szrj       tree op = gimple_phi_result (psi.phi ());
246*58e805e6Szrj       if (virtual_operand_p (op))
247*58e805e6Szrj 	continue;
248*58e805e6Szrj       FOR_EACH_IMM_USE_FAST (use_p, imm_iter, op)
249*58e805e6Szrj 	{
250*58e805e6Szrj 	  gimple *use_stmt = USE_STMT (use_p);
251*58e805e6Szrj 	  if (!is_gimple_debug (use_stmt)
252*58e805e6Szrj 	      && flow_bb_inside_loop_p (outer, gimple_bb (use_stmt)))
253*58e805e6Szrj 	    return false;
254*58e805e6Szrj 	}
255*58e805e6Szrj     }
256*58e805e6Szrj 
25738fd1498Szrj   /* And check blocks belonging to just outer loop.  */
25838fd1498Szrj   bbs = XNEWVEC (basic_block, n_basic_blocks_for_fn (cfun));
25938fd1498Szrj   n = get_loop_body_with_size (outer, bbs, n_basic_blocks_for_fn (cfun));
26038fd1498Szrj 
26138fd1498Szrj   for (i = 0; i < n; i++)
26238fd1498Szrj     if (bbs[i]->loop_father == outer && bb_prevents_fusion_p (bbs[i]))
26338fd1498Szrj       break;
26438fd1498Szrj   free (bbs);
26538fd1498Szrj   if (i != n)
26638fd1498Szrj     return false;
26738fd1498Szrj 
26838fd1498Szrj   /* For now we can safely fuse copies of LOOP only if all
26938fd1498Szrj      loop carried variables are inductions (or the virtual op).
27038fd1498Szrj 
27138fd1498Szrj      We could handle reductions as well (the initial value in the second
27238fd1498Szrj      body would be the after-iter value of the first body) if it's over
27338fd1498Szrj      an associative and commutative operation.  We wouldn't
27438fd1498Szrj      be able to handle unknown cycles.  */
27538fd1498Szrj   for (psi = gsi_start_phis (loop->header); !gsi_end_p (psi); gsi_next (&psi))
27638fd1498Szrj     {
27738fd1498Szrj       affine_iv iv;
27838fd1498Szrj       tree op = gimple_phi_result (psi.phi ());
27938fd1498Szrj 
28038fd1498Szrj       if (virtual_operand_p (op))
28138fd1498Szrj 	continue;
28238fd1498Szrj       if (!simple_iv (loop, loop, op, &iv, true))
28338fd1498Szrj 	return false;
28438fd1498Szrj       /* The inductions must be regular, loop invariant step and initial
28538fd1498Szrj          value.  */
28638fd1498Szrj       if (!expr_invariant_in_loop_p (outer, iv.step)
28738fd1498Szrj 	  || !expr_invariant_in_loop_p (outer, iv.base))
28838fd1498Szrj 	return false;
28938fd1498Szrj       /* XXX With more effort we could also be able to deal with inductions
29038fd1498Szrj          where the initial value is loop variant but a simple IV in the
29138fd1498Szrj 	 outer loop.  The initial value for the second body would be
29238fd1498Szrj 	 the original initial value plus iv.base.step.  The next value
29338fd1498Szrj 	 for the fused loop would be the original next value of the first
29438fd1498Szrj 	 copy, _not_ the next value of the second body.  */
29538fd1498Szrj     }
29638fd1498Szrj 
29738fd1498Szrj   return true;
29838fd1498Szrj }
29938fd1498Szrj 
30038fd1498Szrj /* Fuse LOOP with all further neighbors.  The loops are expected to
30138fd1498Szrj    be in appropriate form.  */
30238fd1498Szrj 
30338fd1498Szrj static void
fuse_loops(struct loop * loop)30438fd1498Szrj fuse_loops (struct loop *loop)
30538fd1498Szrj {
30638fd1498Szrj   struct loop *next = loop->next;
30738fd1498Szrj 
30838fd1498Szrj   while (next)
30938fd1498Szrj     {
31038fd1498Szrj       edge e;
31138fd1498Szrj 
31238fd1498Szrj       remove_branch (single_pred_edge (loop->latch));
31338fd1498Szrj       /* Make delete_basic_block not fiddle with the loop structure.  */
31438fd1498Szrj       basic_block oldlatch = loop->latch;
31538fd1498Szrj       loop->latch = NULL;
31638fd1498Szrj       delete_basic_block (oldlatch);
31738fd1498Szrj       e = redirect_edge_and_branch (loop_latch_edge (next),
31838fd1498Szrj 				    loop->header);
31938fd1498Szrj       loop->latch = e->src;
32038fd1498Szrj       flush_pending_stmts (e);
32138fd1498Szrj 
32238fd1498Szrj       gcc_assert (EDGE_COUNT (next->header->preds) == 1);
32338fd1498Szrj 
32438fd1498Szrj       /* The PHI nodes of the second body (single-argument now)
32538fd1498Szrj          need adjustments to use the right values: either directly
32638fd1498Szrj 	 the value of the corresponding PHI in the first copy or
32738fd1498Szrj 	 the one leaving the first body which unrolling did for us.
32838fd1498Szrj 
32938fd1498Szrj 	 See also unroll_jam_possible_p() for further possibilities.  */
33038fd1498Szrj       gphi_iterator psi_first, psi_second;
33138fd1498Szrj       e = single_pred_edge (next->header);
33238fd1498Szrj       for (psi_first = gsi_start_phis (loop->header),
33338fd1498Szrj 	   psi_second = gsi_start_phis (next->header);
33438fd1498Szrj 	   !gsi_end_p (psi_first);
33538fd1498Szrj 	   gsi_next (&psi_first), gsi_next (&psi_second))
33638fd1498Szrj 	{
33738fd1498Szrj 	  gphi *phi_first = psi_first.phi ();
33838fd1498Szrj 	  gphi *phi_second = psi_second.phi ();
33938fd1498Szrj 	  tree firstop = gimple_phi_result (phi_first);
34038fd1498Szrj 	  /* The virtual operand is correct already as it's
34138fd1498Szrj 	     always live at exit, hence has a LCSSA node and outer
34238fd1498Szrj 	     loop unrolling updated SSA form.  */
34338fd1498Szrj 	  if (virtual_operand_p (firstop))
34438fd1498Szrj 	    continue;
34538fd1498Szrj 
34638fd1498Szrj 	  /* Due to unroll_jam_possible_p() we know that this is
34738fd1498Szrj 	     an induction.  The second body goes over the same
34838fd1498Szrj 	     iteration space.  */
34938fd1498Szrj 	  add_phi_arg (phi_second, firstop, e,
35038fd1498Szrj 		       gimple_location (phi_first));
35138fd1498Szrj 	}
35238fd1498Szrj       gcc_assert (gsi_end_p (psi_second));
35338fd1498Szrj 
35438fd1498Szrj       merge_loop_tree (loop, next);
35538fd1498Szrj       gcc_assert (!next->num_nodes);
35638fd1498Szrj       struct loop *ln = next->next;
35738fd1498Szrj       delete_loop (next);
35838fd1498Szrj       next = ln;
35938fd1498Szrj     }
36038fd1498Szrj   rewrite_into_loop_closed_ssa_1 (NULL, 0, SSA_OP_USE, loop);
36138fd1498Szrj }
36238fd1498Szrj 
36338fd1498Szrj /* Returns true if the distance in DDR can be determined and adjusts
36438fd1498Szrj    the unroll factor in *UNROLL to make unrolling valid for that distance.
36538fd1498Szrj    Otherwise return false.
36638fd1498Szrj 
36738fd1498Szrj    If this data dep can lead to a removed memory reference, increment
36838fd1498Szrj    *REMOVED and adjust *PROFIT_UNROLL to be the necessary unroll factor
36938fd1498Szrj    for this to happen.  */
37038fd1498Szrj 
37138fd1498Szrj static bool
adjust_unroll_factor(struct data_dependence_relation * ddr,unsigned * unroll,unsigned * profit_unroll,unsigned * removed)37238fd1498Szrj adjust_unroll_factor (struct data_dependence_relation *ddr,
37338fd1498Szrj 		      unsigned *unroll, unsigned *profit_unroll,
37438fd1498Szrj 		      unsigned *removed)
37538fd1498Szrj {
37638fd1498Szrj   bool ret = false;
37738fd1498Szrj   if (DDR_ARE_DEPENDENT (ddr) != chrec_known)
37838fd1498Szrj     {
37938fd1498Szrj       if (DDR_NUM_DIST_VECTS (ddr) == 0)
38038fd1498Szrj 	return false;
38138fd1498Szrj       unsigned i;
38238fd1498Szrj       lambda_vector dist_v;
38338fd1498Szrj       FOR_EACH_VEC_ELT (DDR_DIST_VECTS (ddr), i, dist_v)
38438fd1498Szrj 	{
38538fd1498Szrj 	  /* A distance (a,b) is at worst transformed into (a/N,b) by the
38638fd1498Szrj 	     unrolling (factor N), so the transformation is valid if
38738fd1498Szrj 	     a >= N, or b > 0, or b is zero and a > 0.  Otherwise the unroll
38838fd1498Szrj 	     factor needs to be limited so that the first condition holds.
38938fd1498Szrj 	     That may limit the factor down to zero in the worst case.  */
39038fd1498Szrj 	  int dist = dist_v[0];
39138fd1498Szrj 	  if (dist < 0)
39238fd1498Szrj 	    gcc_unreachable ();
39338fd1498Szrj 	  else if ((unsigned)dist >= *unroll)
39438fd1498Szrj 	    ;
39538fd1498Szrj 	  else if (lambda_vector_lexico_pos (dist_v + 1, DDR_NB_LOOPS (ddr) - 1)
39638fd1498Szrj 		   || (lambda_vector_zerop (dist_v + 1, DDR_NB_LOOPS (ddr) - 1)
39738fd1498Szrj 		       && dist > 0))
39838fd1498Szrj 	    ;
39938fd1498Szrj 	  else
40038fd1498Szrj 	    *unroll = dist;
40138fd1498Szrj 
40238fd1498Szrj 	  /* With a distance (a,0) it's always profitable to unroll-and-jam
40338fd1498Szrj 	     (by a+1), because one memory reference will go away.  With
40438fd1498Szrj 	     (a,b) and b != 0 that's less clear.  We will increase the
40538fd1498Szrj 	     number of streams without lowering the number of mem refs.
40638fd1498Szrj 	     So for now only handle the first situation.  */
40738fd1498Szrj 	  if (lambda_vector_zerop (dist_v + 1, DDR_NB_LOOPS (ddr) - 1))
40838fd1498Szrj 	    {
40938fd1498Szrj 	      *profit_unroll = MAX (*profit_unroll, (unsigned)dist + 1);
41038fd1498Szrj 	      (*removed)++;
41138fd1498Szrj 	    }
41238fd1498Szrj 
41338fd1498Szrj 	  ret = true;
41438fd1498Szrj 	}
41538fd1498Szrj     }
41638fd1498Szrj   return ret;
41738fd1498Szrj }
41838fd1498Szrj 
41938fd1498Szrj /* Main entry point for the unroll-and-jam transformation
42038fd1498Szrj    described above.  */
42138fd1498Szrj 
42238fd1498Szrj static unsigned int
tree_loop_unroll_and_jam(void)42338fd1498Szrj tree_loop_unroll_and_jam (void)
42438fd1498Szrj {
42538fd1498Szrj   struct loop *loop;
42638fd1498Szrj   bool changed = false;
42738fd1498Szrj 
42838fd1498Szrj   gcc_assert (scev_initialized_p ());
42938fd1498Szrj 
43038fd1498Szrj   /* Go through all innermost loops.  */
43138fd1498Szrj   FOR_EACH_LOOP (loop, LI_ONLY_INNERMOST)
43238fd1498Szrj     {
43338fd1498Szrj       struct loop *outer = loop_outer (loop);
43438fd1498Szrj 
43538fd1498Szrj       if (loop_depth (loop) < 2
43638fd1498Szrj 	  || optimize_loop_nest_for_size_p (outer))
43738fd1498Szrj 	continue;
43838fd1498Szrj 
43938fd1498Szrj       if (!unroll_jam_possible_p (outer, loop))
44038fd1498Szrj 	continue;
44138fd1498Szrj 
44238fd1498Szrj       vec<data_reference_p> datarefs;
44338fd1498Szrj       vec<ddr_p> dependences;
44438fd1498Szrj       unsigned unroll_factor, profit_unroll, removed;
44538fd1498Szrj       struct tree_niter_desc desc;
44638fd1498Szrj       bool unroll = false;
44738fd1498Szrj 
44838fd1498Szrj       auto_vec<loop_p, 3> loop_nest;
44938fd1498Szrj       dependences.create (10);
45038fd1498Szrj       datarefs.create (10);
45138fd1498Szrj       if (!compute_data_dependences_for_loop (outer, true, &loop_nest,
45238fd1498Szrj 					       &datarefs, &dependences))
45338fd1498Szrj 	{
45438fd1498Szrj 	  if (dump_file && (dump_flags & TDF_DETAILS))
45538fd1498Szrj 	    fprintf (dump_file, "Cannot analyze data dependencies\n");
45638fd1498Szrj 	  free_data_refs (datarefs);
45738fd1498Szrj 	  free_dependence_relations (dependences);
458*58e805e6Szrj 	  continue;
45938fd1498Szrj 	}
46038fd1498Szrj       if (!datarefs.length ())
46138fd1498Szrj 	continue;
46238fd1498Szrj 
46338fd1498Szrj       if (dump_file && (dump_flags & TDF_DETAILS))
46438fd1498Szrj 	dump_data_dependence_relations (dump_file, dependences);
46538fd1498Szrj 
46638fd1498Szrj       unroll_factor = (unsigned)-1;
46738fd1498Szrj       profit_unroll = 1;
46838fd1498Szrj       removed = 0;
46938fd1498Szrj 
47038fd1498Szrj       /* Check all dependencies.  */
47138fd1498Szrj       unsigned i;
47238fd1498Szrj       struct data_dependence_relation *ddr;
47338fd1498Szrj       FOR_EACH_VEC_ELT (dependences, i, ddr)
47438fd1498Szrj 	{
47538fd1498Szrj 	  struct data_reference *dra, *drb;
47638fd1498Szrj 
47738fd1498Szrj 	  /* If the refs are independend there's nothing to do.  */
47838fd1498Szrj 	  if (DDR_ARE_DEPENDENT (ddr) == chrec_known)
47938fd1498Szrj 	    continue;
48038fd1498Szrj 	  dra = DDR_A (ddr);
48138fd1498Szrj 	  drb = DDR_B (ddr);
48238fd1498Szrj 	  /* Nothing interesting for the self dependencies.  */
48338fd1498Szrj 	  if (dra == drb)
48438fd1498Szrj 	    continue;
48538fd1498Szrj 
48638fd1498Szrj 	  /* Now check the distance vector, for determining a sensible
48738fd1498Szrj 	     outer unroll factor, and for validity of merging the inner
48838fd1498Szrj 	     loop copies.  */
48938fd1498Szrj 	  if (!adjust_unroll_factor (ddr, &unroll_factor, &profit_unroll,
49038fd1498Szrj 				     &removed))
49138fd1498Szrj 	    {
49238fd1498Szrj 	      /* Couldn't get the distance vector.  For two reads that's
49338fd1498Szrj 	         harmless (we assume we should unroll).  For at least
49438fd1498Szrj 		 one write this means we can't check the dependence direction
49538fd1498Szrj 		 and hence can't determine safety.  */
49638fd1498Szrj 
49738fd1498Szrj 	      if (DR_IS_WRITE (dra) || DR_IS_WRITE (drb))
49838fd1498Szrj 		{
49938fd1498Szrj 		  unroll_factor = 0;
50038fd1498Szrj 		  break;
50138fd1498Szrj 		}
50238fd1498Szrj 	    }
50338fd1498Szrj 	}
50438fd1498Szrj 
50538fd1498Szrj       /* We regard a user-specified minimum percentage of zero as a request
50638fd1498Szrj          to ignore all profitability concerns and apply the transformation
50738fd1498Szrj 	 always.  */
50838fd1498Szrj       if (!PARAM_VALUE (PARAM_UNROLL_JAM_MIN_PERCENT))
50938fd1498Szrj 	profit_unroll = 2;
51038fd1498Szrj       else if (removed * 100 / datarefs.length ()
51138fd1498Szrj 	  < (unsigned)PARAM_VALUE (PARAM_UNROLL_JAM_MIN_PERCENT))
51238fd1498Szrj 	profit_unroll = 1;
51338fd1498Szrj       if (unroll_factor > profit_unroll)
51438fd1498Szrj 	unroll_factor = profit_unroll;
51538fd1498Szrj       if (unroll_factor > (unsigned)PARAM_VALUE (PARAM_UNROLL_JAM_MAX_UNROLL))
51638fd1498Szrj 	unroll_factor = PARAM_VALUE (PARAM_UNROLL_JAM_MAX_UNROLL);
51738fd1498Szrj       unroll = (unroll_factor > 1
51838fd1498Szrj 		&& can_unroll_loop_p (outer, unroll_factor, &desc));
51938fd1498Szrj 
52038fd1498Szrj       if (unroll)
52138fd1498Szrj 	{
52238fd1498Szrj 	  if (dump_enabled_p ())
52338fd1498Szrj 	    dump_printf_loc (MSG_OPTIMIZED_LOCATIONS | TDF_DETAILS,
52438fd1498Szrj 			     find_loop_location (outer),
52538fd1498Szrj 			     "applying unroll and jam with factor %d\n",
52638fd1498Szrj 			     unroll_factor);
52738fd1498Szrj 	  initialize_original_copy_tables ();
52838fd1498Szrj 	  tree_unroll_loop (outer, unroll_factor, single_dom_exit (outer),
52938fd1498Szrj 			    &desc);
53038fd1498Szrj 	  free_original_copy_tables ();
53138fd1498Szrj 	  fuse_loops (outer->inner);
53238fd1498Szrj 	  changed = true;
53338fd1498Szrj 	}
53438fd1498Szrj 
53538fd1498Szrj       loop_nest.release ();
53638fd1498Szrj       free_dependence_relations (dependences);
53738fd1498Szrj       free_data_refs (datarefs);
53838fd1498Szrj     }
53938fd1498Szrj 
54038fd1498Szrj   if (changed)
54138fd1498Szrj     {
54238fd1498Szrj       scev_reset ();
54338fd1498Szrj       free_dominance_info (CDI_DOMINATORS);
54438fd1498Szrj       return TODO_cleanup_cfg;
54538fd1498Szrj     }
54638fd1498Szrj   return 0;
54738fd1498Szrj }
54838fd1498Szrj 
54938fd1498Szrj /* Pass boilerplate */
55038fd1498Szrj 
55138fd1498Szrj namespace {
55238fd1498Szrj 
55338fd1498Szrj const pass_data pass_data_loop_jam =
55438fd1498Szrj {
55538fd1498Szrj   GIMPLE_PASS, /* type */
55638fd1498Szrj   "unrolljam", /* name */
55738fd1498Szrj   OPTGROUP_LOOP, /* optinfo_flags */
55838fd1498Szrj   TV_LOOP_JAM, /* tv_id */
55938fd1498Szrj   PROP_cfg, /* properties_required */
56038fd1498Szrj   0, /* properties_provided */
56138fd1498Szrj   0, /* properties_destroyed */
56238fd1498Szrj   0, /* todo_flags_start */
56338fd1498Szrj   0, /* todo_flags_finish */
56438fd1498Szrj };
56538fd1498Szrj 
56638fd1498Szrj class pass_loop_jam : public gimple_opt_pass
56738fd1498Szrj {
56838fd1498Szrj public:
pass_loop_jam(gcc::context * ctxt)56938fd1498Szrj   pass_loop_jam (gcc::context *ctxt)
57038fd1498Szrj     : gimple_opt_pass (pass_data_loop_jam, ctxt)
57138fd1498Szrj   {}
57238fd1498Szrj 
57338fd1498Szrj   /* opt_pass methods: */
gate(function *)57438fd1498Szrj   virtual bool gate (function *) { return flag_unroll_jam != 0; }
57538fd1498Szrj   virtual unsigned int execute (function *);
57638fd1498Szrj 
57738fd1498Szrj };
57838fd1498Szrj 
57938fd1498Szrj unsigned int
execute(function * fun)58038fd1498Szrj pass_loop_jam::execute (function *fun)
58138fd1498Szrj {
58238fd1498Szrj   if (number_of_loops (fun) <= 1)
58338fd1498Szrj     return 0;
58438fd1498Szrj 
58538fd1498Szrj   return tree_loop_unroll_and_jam ();
58638fd1498Szrj }
58738fd1498Szrj 
58838fd1498Szrj }
58938fd1498Szrj 
59038fd1498Szrj gimple_opt_pass *
make_pass_loop_jam(gcc::context * ctxt)59138fd1498Szrj make_pass_loop_jam (gcc::context *ctxt)
59238fd1498Szrj {
59338fd1498Szrj   return new pass_loop_jam (ctxt);
59438fd1498Szrj }
595