1 /* Loop unroll-and-jam.
2    Copyright (C) 2017-2018 Free Software Foundation, Inc.
3 
4 This file is part of GCC.
5 
6 GCC is free software; you can redistribute it and/or modify it
7 under the terms of the GNU General Public License as published by the
8 Free Software Foundation; either version 3, or (at your option) any
9 later version.
10 
11 GCC is distributed in the hope that it will be useful, but WITHOUT
12 ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
13 FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License
14 for more details.
15 
16 You should have received a copy of the GNU General Public License
17 along with GCC; see the file COPYING3.  If not see
18 <http://www.gnu.org/licenses/>.  */
19 
20 #include "config.h"
21 #include "system.h"
22 #include "coretypes.h"
23 #include "params.h"
24 #include "tree-pass.h"
25 #include "backend.h"
26 #include "tree.h"
27 #include "gimple.h"
28 #include "ssa.h"
29 #include "fold-const.h"
30 #include "tree-cfg.h"
31 #include "tree-ssa.h"
32 #include "tree-ssa-loop-niter.h"
33 #include "tree-ssa-loop.h"
34 #include "tree-ssa-loop-manip.h"
35 #include "cfgloop.h"
36 #include "tree-scalar-evolution.h"
37 #include "gimple-iterator.h"
38 #include "cfghooks.h"
39 #include "tree-data-ref.h"
40 #include "tree-ssa-loop-ivopts.h"
41 #include "tree-vectorizer.h"
42 
43 /* Unroll and Jam transformation
44 
45    This is a combination of two transformations, where the second
46    is not always valid.  It's applicable if a loop nest has redundancies
47    over the iterations of an outer loop while not having that with
48    an inner loop.
49 
50    Given this nest:
51        for (i) {
52 	 for (j) {
53 	   B(i,j)
54 	 }
55        }
56 
57    first unroll:
58        for (i by 2) {
59 	 for (j) {
60 	   B(i,j)
61 	 }
62 	 for (j) {
63 	   B(i+1,j)
64 	 }
65        }
66 
67    then fuse the two adjacent inner loops resulting from that:
68        for (i by 2) {
69 	 for (j) {
70 	   B(i,j)
71 	   B(i+1,j)
72 	 }
73        }
74 
75    As the order of evaluations of the body B changes this is valid
76    only in certain situations: all distance vectors need to be forward.
   Additionally, if there are induction variables other than just
   a counting control IV (j above), we can also deal with some situations.
79 
80    The validity is checked by unroll_jam_possible_p, and the data-dep
81    testing below.
82 
83    A trivial example where the fusion is wrong would be when
84    B(i,j) == x[j-1] = x[j];
85        for (i by 2) {
86 	 for (j) {
87 	   x[j-1] = x[j];
88 	 }
89 	 for (j) {
90 	   x[j-1] = x[j];
91 	 }
92        }  effect: move content to front by two elements
93        -->
94        for (i by 2) {
95 	 for (j) {
96 	   x[j-1] = x[j];
97 	   x[j-1] = x[j];
98 	 }
99        }  effect: move content to front by one element
100 */
101 
102 /* Modify the loop tree for the fact that all code once belonging
103    to the OLD loop or the outer loop of OLD now is inside LOOP.  */
104 
105 static void
merge_loop_tree(struct loop * loop,struct loop * old)106 merge_loop_tree (struct loop *loop, struct loop *old)
107 {
108   basic_block *bbs;
109   int i, n;
110   struct loop *subloop;
111   edge e;
112   edge_iterator ei;
113 
114   /* Find its nodes.  */
115   bbs = XNEWVEC (basic_block, n_basic_blocks_for_fn (cfun));
116   n = get_loop_body_with_size (loop, bbs, n_basic_blocks_for_fn (cfun));
117 
118   for (i = 0; i < n; i++)
119     {
120       /* If the block was direct child of OLD loop it's now part
121          of LOOP.  If it was outside OLD, then it moved into LOOP
122 	 as well.  This avoids changing the loop father for BBs
123 	 in inner loops of OLD.  */
124       if (bbs[i]->loop_father == old
125 	  || loop_depth (bbs[i]->loop_father) < loop_depth (old))
126 	{
127 	  remove_bb_from_loops (bbs[i]);
128 	  add_bb_to_loop (bbs[i], loop);
129 	  continue;
130 	}
131 
132       /* If we find a direct subloop of OLD, move it to LOOP.  */
133       subloop = bbs[i]->loop_father;
134       if (loop_outer (subloop) == old && subloop->header == bbs[i])
135 	{
136 	  flow_loop_tree_node_remove (subloop);
137 	  flow_loop_tree_node_add (loop, subloop);
138 	}
139     }
140 
141   /* Update the information about loop exit edges.  */
142   for (i = 0; i < n; i++)
143     {
144       FOR_EACH_EDGE (e, ei, bbs[i]->succs)
145 	{
146 	  rescan_loop_exit (e, false, false);
147 	}
148     }
149 
150   loop->num_nodes = n;
151 
152   free (bbs);
153 }
154 
155 /* BB is part of the outer loop of an unroll-and-jam situation.
156    Check if any statements therein would prevent the transformation.  */
157 
158 static bool
bb_prevents_fusion_p(basic_block bb)159 bb_prevents_fusion_p (basic_block bb)
160 {
161   gimple_stmt_iterator gsi;
162   /* BB is duplicated by outer unrolling and then all N-1 first copies
163      move into the body of the fused inner loop.  If BB exits the outer loop
164      the last copy still does so, and the first N-1 copies are cancelled
165      by loop unrolling, so also after fusion it's the exit block.
166      But there might be other reasons that prevent fusion:
167        * stores or unknown side-effects prevent fusion
168        * loads don't
169        * computations into SSA names: these aren't problematic.  Their
170          result will be unused on the exit edges of the first N-1 copies
171 	 (those aren't taken after unrolling).  If they are used on the
172 	 other edge (the one leading to the outer latch block) they are
173 	 loop-carried (on the outer loop) and the Nth copy of BB will
174 	 compute them again (i.e. the first N-1 copies will be dead).  */
175   for (gsi = gsi_start_bb (bb); !gsi_end_p (gsi); gsi_next (&gsi))
176     {
177       gimple *g = gsi_stmt (gsi);
178       if (gimple_vdef (g) || gimple_has_side_effects (g))
179 	return true;
180     }
181   return false;
182 }
183 
184 /* Given an inner loop LOOP (of some OUTER loop) determine if
185    we can safely fuse copies of it (generated by outer unrolling).
186    If so return true, otherwise return false.  */
187 
static bool
unroll_jam_possible_p (struct loop *outer, struct loop *loop)
{
  basic_block *bbs;
  int i, n;
  struct tree_niter_desc niter;

  /* When fusing the loops we skip the latch block
     of the first one, so it mustn't have any effects to
     preserve.  */
  if (!empty_block_p (loop->latch))
    return false;

  /* A unique exit is required by the niter analysis and the
     exit-PHI walk below.  */
  if (!single_exit (loop))
    return false;

  /* We need a perfect nest.  Quick check for adjacent inner loops.  */
  if (outer->inner != loop || loop->next)
    return false;

  /* Prevent head-controlled inner loops, that we usually have.
     The guard block would need to be accepted
     (invariant condition either entering or skipping the loop),
     without also accepting arbitrary control flow.  When unswitching
     ran before us (as with -O3) this won't be a problem because its
     outer loop unswitching will have moved out the invariant condition.

     If we do that we need to extend fuse_loops() to cope with this
     by threading through the (still invariant) copied condition
     between the two loop copies.  */
  if (!dominated_by_p (CDI_DOMINATORS, outer->latch, loop->header))
    return false;

  /* The number of iterations of the inner loop must be loop invariant
     with respect to the outer loop.  Reject unknown exit counts,
     counts that are only conditionally valid (may_be_zero not folded
     to zero), and counts varying in OUTER.  */
  if (!number_of_iterations_exit (loop, single_exit (loop), &niter,
				 false, true)
      || niter.cmp == ERROR_MARK
      || !integer_zerop (niter.may_be_zero)
      || !expr_invariant_in_loop_p (outer, niter.niter))
    return false;

  /* If the inner loop produces any values that are used inside the
     outer loop (except the virtual op) then it can flow
     back (perhaps indirectly) into the inner loop.  This prevents
     fusion: without fusion the value at the last iteration is used,
     with fusion the value after the initial iteration is used.

     If all uses are outside the outer loop this doesn't prevent fusion;
     the value of the last iteration is still used (and the values from
     all intermediate iterations are dead).  */
  gphi_iterator psi;
  for (psi = gsi_start_phis (single_exit (loop)->dest);
       !gsi_end_p (psi); gsi_next (&psi))
    {
      imm_use_iterator imm_iter;
      use_operand_p use_p;
      tree op = gimple_phi_result (psi.phi ());
      if (virtual_operand_p (op))
	continue;
      FOR_EACH_IMM_USE_FAST (use_p, imm_iter, op)
	{
	  gimple *use_stmt = USE_STMT (use_p);
	  /* Debug statements never constrain the transformation.  */
	  if (!is_gimple_debug (use_stmt)
	      && flow_bb_inside_loop_p (outer, gimple_bb (use_stmt)))
	    return false;
	}
    }

  /* And check blocks belonging to just outer loop.  */
  bbs = XNEWVEC (basic_block, n_basic_blocks_for_fn (cfun));
  n = get_loop_body_with_size (outer, bbs, n_basic_blocks_for_fn (cfun));

  /* Only blocks directly in OUTER (not in LOOP or its subloops) get
     duplicated into the fused body, so only those are checked.  */
  for (i = 0; i < n; i++)
    if (bbs[i]->loop_father == outer && bb_prevents_fusion_p (bbs[i]))
      break;
  free (bbs);
  if (i != n)
    return false;

  /* For now we can safely fuse copies of LOOP only if all
     loop carried variables are inductions (or the virtual op).

     We could handle reductions as well (the initial value in the second
     body would be the after-iter value of the first body) if it's over
     an associative and commutative operation.  We wouldn't
     be able to handle unknown cycles.  */
  for (psi = gsi_start_phis (loop->header); !gsi_end_p (psi); gsi_next (&psi))
    {
      affine_iv iv;
      tree op = gimple_phi_result (psi.phi ());

      if (virtual_operand_p (op))
	continue;
      if (!simple_iv (loop, loop, op, &iv, true))
	return false;
      /* The inductions must be regular, loop invariant step and initial
         value.  */
      if (!expr_invariant_in_loop_p (outer, iv.step)
	  || !expr_invariant_in_loop_p (outer, iv.base))
	return false;
      /* XXX With more effort we could also be able to deal with inductions
         where the initial value is loop variant but a simple IV in the
	 outer loop.  The initial value for the second body would be
	 the original initial value plus iv.base.step.  The next value
	 for the fused loop would be the original next value of the first
	 copy, _not_ the next value of the second body.  */
    }

  return true;
}
299 
300 /* Fuse LOOP with all further neighbors.  The loops are expected to
301    be in appropriate form.  */
302 
static void
fuse_loops (struct loop *loop)
{
  /* Fuse LOOP with each following sibling copy in turn.  */
  struct loop *next = loop->next;

  while (next)
    {
      edge e;

      /* Cut the branch into the first body's latch; that latch is
	 known to be empty (checked by unroll_jam_possible_p), so
	 nothing is lost by deleting it.  */
      remove_branch (single_pred_edge (loop->latch));
      /* Make delete_basic_block not fiddle with the loop structure.  */
      basic_block oldlatch = loop->latch;
      loop->latch = NULL;
      delete_basic_block (oldlatch);
      /* Re-route the second body's latch edge back to the first
	 header; that back edge now closes the fused loop.  */
      e = redirect_edge_and_branch (loop_latch_edge (next),
				    loop->header);
      loop->latch = e->src;
      flush_pending_stmts (e);

      gcc_assert (EDGE_COUNT (next->header->preds) == 1);

      /* The PHI nodes of the second body (single-argument now)
         need adjustments to use the right values: either directly
	 the value of the corresponding PHI in the first copy or
	 the one leaving the first body which unrolling did for us.

	 See also unroll_jam_possible_p() for further possibilities.  */
      gphi_iterator psi_first, psi_second;
      e = single_pred_edge (next->header);
      for (psi_first = gsi_start_phis (loop->header),
	   psi_second = gsi_start_phis (next->header);
	   !gsi_end_p (psi_first);
	   gsi_next (&psi_first), gsi_next (&psi_second))
	{
	  gphi *phi_first = psi_first.phi ();
	  gphi *phi_second = psi_second.phi ();
	  tree firstop = gimple_phi_result (phi_first);
	  /* The virtual operand is correct already as it's
	     always live at exit, hence has a LCSSA node and outer
	     loop unrolling updated SSA form.  */
	  if (virtual_operand_p (firstop))
	    continue;

	  /* Due to unroll_jam_possible_p() we know that this is
	     an induction.  The second body goes over the same
	     iteration space.  */
	  add_phi_arg (phi_second, firstop, e,
		       gimple_location (phi_first));
	}
      /* Both headers must carry the same set of PHIs (copies of
	 each other after unrolling).  */
      gcc_assert (gsi_end_p (psi_second));

      /* Move the second body's blocks and subloops under LOOP, then
	 discard its (now empty) loop structure and continue with any
	 further sibling.  */
      merge_loop_tree (loop, next);
      gcc_assert (!next->num_nodes);
      struct loop *ln = next->next;
      delete_loop (next);
      next = ln;
    }
  /* Fusion may have broken loop-closed SSA for uses of the first
     body's values; restore it for the fused loop.  */
  rewrite_into_loop_closed_ssa_1 (NULL, 0, SSA_OP_USE, loop);
}
362 
363 /* Returns true if the distance in DDR can be determined and adjusts
364    the unroll factor in *UNROLL to make unrolling valid for that distance.
365    Otherwise return false.
366 
367    If this data dep can lead to a removed memory reference, increment
368    *REMOVED and adjust *PROFIT_UNROLL to be the necessary unroll factor
369    for this to happen.  */
370 
371 static bool
adjust_unroll_factor(struct data_dependence_relation * ddr,unsigned * unroll,unsigned * profit_unroll,unsigned * removed)372 adjust_unroll_factor (struct data_dependence_relation *ddr,
373 		      unsigned *unroll, unsigned *profit_unroll,
374 		      unsigned *removed)
375 {
376   bool ret = false;
377   if (DDR_ARE_DEPENDENT (ddr) != chrec_known)
378     {
379       if (DDR_NUM_DIST_VECTS (ddr) == 0)
380 	return false;
381       unsigned i;
382       lambda_vector dist_v;
383       FOR_EACH_VEC_ELT (DDR_DIST_VECTS (ddr), i, dist_v)
384 	{
385 	  /* A distance (a,b) is at worst transformed into (a/N,b) by the
386 	     unrolling (factor N), so the transformation is valid if
387 	     a >= N, or b > 0, or b is zero and a > 0.  Otherwise the unroll
388 	     factor needs to be limited so that the first condition holds.
389 	     That may limit the factor down to zero in the worst case.  */
390 	  int dist = dist_v[0];
391 	  if (dist < 0)
392 	    gcc_unreachable ();
393 	  else if ((unsigned)dist >= *unroll)
394 	    ;
395 	  else if (lambda_vector_lexico_pos (dist_v + 1, DDR_NB_LOOPS (ddr) - 1)
396 		   || (lambda_vector_zerop (dist_v + 1, DDR_NB_LOOPS (ddr) - 1)
397 		       && dist > 0))
398 	    ;
399 	  else
400 	    *unroll = dist;
401 
402 	  /* With a distance (a,0) it's always profitable to unroll-and-jam
403 	     (by a+1), because one memory reference will go away.  With
404 	     (a,b) and b != 0 that's less clear.  We will increase the
405 	     number of streams without lowering the number of mem refs.
406 	     So for now only handle the first situation.  */
407 	  if (lambda_vector_zerop (dist_v + 1, DDR_NB_LOOPS (ddr) - 1))
408 	    {
409 	      *profit_unroll = MAX (*profit_unroll, (unsigned)dist + 1);
410 	      (*removed)++;
411 	    }
412 
413 	  ret = true;
414 	}
415     }
416   return ret;
417 }
418 
419 /* Main entry point for the unroll-and-jam transformation
420    described above.  */
421 
422 static unsigned int
tree_loop_unroll_and_jam(void)423 tree_loop_unroll_and_jam (void)
424 {
425   struct loop *loop;
426   bool changed = false;
427 
428   gcc_assert (scev_initialized_p ());
429 
430   /* Go through all innermost loops.  */
431   FOR_EACH_LOOP (loop, LI_ONLY_INNERMOST)
432     {
433       struct loop *outer = loop_outer (loop);
434 
435       if (loop_depth (loop) < 2
436 	  || optimize_loop_nest_for_size_p (outer))
437 	continue;
438 
439       if (!unroll_jam_possible_p (outer, loop))
440 	continue;
441 
442       vec<data_reference_p> datarefs;
443       vec<ddr_p> dependences;
444       unsigned unroll_factor, profit_unroll, removed;
445       struct tree_niter_desc desc;
446       bool unroll = false;
447 
448       auto_vec<loop_p, 3> loop_nest;
449       dependences.create (10);
450       datarefs.create (10);
451       if (!compute_data_dependences_for_loop (outer, true, &loop_nest,
452 					       &datarefs, &dependences))
453 	{
454 	  if (dump_file && (dump_flags & TDF_DETAILS))
455 	    fprintf (dump_file, "Cannot analyze data dependencies\n");
456 	  free_data_refs (datarefs);
457 	  free_dependence_relations (dependences);
458 	  continue;
459 	}
460       if (!datarefs.length ())
461 	continue;
462 
463       if (dump_file && (dump_flags & TDF_DETAILS))
464 	dump_data_dependence_relations (dump_file, dependences);
465 
466       unroll_factor = (unsigned)-1;
467       profit_unroll = 1;
468       removed = 0;
469 
470       /* Check all dependencies.  */
471       unsigned i;
472       struct data_dependence_relation *ddr;
473       FOR_EACH_VEC_ELT (dependences, i, ddr)
474 	{
475 	  struct data_reference *dra, *drb;
476 
477 	  /* If the refs are independend there's nothing to do.  */
478 	  if (DDR_ARE_DEPENDENT (ddr) == chrec_known)
479 	    continue;
480 	  dra = DDR_A (ddr);
481 	  drb = DDR_B (ddr);
482 	  /* Nothing interesting for the self dependencies.  */
483 	  if (dra == drb)
484 	    continue;
485 
486 	  /* Now check the distance vector, for determining a sensible
487 	     outer unroll factor, and for validity of merging the inner
488 	     loop copies.  */
489 	  if (!adjust_unroll_factor (ddr, &unroll_factor, &profit_unroll,
490 				     &removed))
491 	    {
492 	      /* Couldn't get the distance vector.  For two reads that's
493 	         harmless (we assume we should unroll).  For at least
494 		 one write this means we can't check the dependence direction
495 		 and hence can't determine safety.  */
496 
497 	      if (DR_IS_WRITE (dra) || DR_IS_WRITE (drb))
498 		{
499 		  unroll_factor = 0;
500 		  break;
501 		}
502 	    }
503 	}
504 
505       /* We regard a user-specified minimum percentage of zero as a request
506          to ignore all profitability concerns and apply the transformation
507 	 always.  */
508       if (!PARAM_VALUE (PARAM_UNROLL_JAM_MIN_PERCENT))
509 	profit_unroll = 2;
510       else if (removed * 100 / datarefs.length ()
511 	  < (unsigned)PARAM_VALUE (PARAM_UNROLL_JAM_MIN_PERCENT))
512 	profit_unroll = 1;
513       if (unroll_factor > profit_unroll)
514 	unroll_factor = profit_unroll;
515       if (unroll_factor > (unsigned)PARAM_VALUE (PARAM_UNROLL_JAM_MAX_UNROLL))
516 	unroll_factor = PARAM_VALUE (PARAM_UNROLL_JAM_MAX_UNROLL);
517       unroll = (unroll_factor > 1
518 		&& can_unroll_loop_p (outer, unroll_factor, &desc));
519 
520       if (unroll)
521 	{
522 	  if (dump_enabled_p ())
523 	    dump_printf_loc (MSG_OPTIMIZED_LOCATIONS | TDF_DETAILS,
524 			     find_loop_location (outer),
525 			     "applying unroll and jam with factor %d\n",
526 			     unroll_factor);
527 	  initialize_original_copy_tables ();
528 	  tree_unroll_loop (outer, unroll_factor, single_dom_exit (outer),
529 			    &desc);
530 	  free_original_copy_tables ();
531 	  fuse_loops (outer->inner);
532 	  changed = true;
533 	}
534 
535       loop_nest.release ();
536       free_dependence_relations (dependences);
537       free_data_refs (datarefs);
538     }
539 
540   if (changed)
541     {
542       scev_reset ();
543       free_dominance_info (CDI_DOMINATORS);
544       return TODO_cleanup_cfg;
545     }
546   return 0;
547 }
548 
549 /* Pass boilerplate */
550 
551 namespace {
552 
/* Static descriptor of the unroll-and-jam GIMPLE pass.  */
const pass_data pass_data_loop_jam =
{
  GIMPLE_PASS, /* type */
  "unrolljam", /* name */
  OPTGROUP_LOOP, /* optinfo_flags */
  TV_LOOP_JAM, /* tv_id */
  PROP_cfg, /* properties_required */
  0, /* properties_provided */
  0, /* properties_destroyed */
  0, /* todo_flags_start */
  0, /* todo_flags_finish */
};
565 
class pass_loop_jam : public gimple_opt_pass
{
public:
  pass_loop_jam (gcc::context *ctxt)
    : gimple_opt_pass (pass_data_loop_jam, ctxt)
  {}

  /* opt_pass methods: */
  /* Run only when the unroll-and-jam flag is enabled.  */
  virtual bool gate (function *) { return flag_unroll_jam != 0; }
  virtual unsigned int execute (function *);

};
578 
579 unsigned int
execute(function * fun)580 pass_loop_jam::execute (function *fun)
581 {
582   if (number_of_loops (fun) <= 1)
583     return 0;
584 
585   return tree_loop_unroll_and_jam ();
586 }
587 
588 }
589 
590 gimple_opt_pass *
make_pass_loop_jam(gcc::context * ctxt)591 make_pass_loop_jam (gcc::context *ctxt)
592 {
593   return new pass_loop_jam (ctxt);
594 }
595