1 /* Loop interchange.
2    Copyright (C) 2017-2021 Free Software Foundation, Inc.
3    Contributed by ARM Ltd.
4 
5 This file is part of GCC.
6 
7 GCC is free software; you can redistribute it and/or modify it
8 under the terms of the GNU General Public License as published by the
9 Free Software Foundation; either version 3, or (at your option) any
10 later version.
11 
12 GCC is distributed in the hope that it will be useful, but WITHOUT
13 ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
14 FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License
15 for more details.
16 
17 You should have received a copy of the GNU General Public License
18 along with GCC; see the file COPYING3.  If not see
19 <http://www.gnu.org/licenses/>.  */
20 
21 #include "config.h"
22 #include "system.h"
23 #include "coretypes.h"
24 #include "backend.h"
25 #include "is-a.h"
26 #include "tree.h"
27 #include "gimple.h"
28 #include "tree-pass.h"
29 #include "ssa.h"
30 #include "gimple-pretty-print.h"
31 #include "fold-const.h"
32 #include "gimplify.h"
33 #include "gimple-iterator.h"
34 #include "gimplify-me.h"
35 #include "cfgloop.h"
36 #include "tree-ssa.h"
37 #include "tree-scalar-evolution.h"
38 #include "tree-ssa-loop-manip.h"
39 #include "tree-ssa-loop-niter.h"
40 #include "tree-ssa-loop-ivopts.h"
41 #include "tree-ssa-dce.h"
42 #include "tree-data-ref.h"
43 #include "tree-vectorizer.h"
44 
45 /* This pass performs loop interchange: for example, the loop nest
46 
47    for (int j = 0; j < N; j++)
48      for (int k = 0; k < N; k++)
49        for (int i = 0; i < N; i++)
50 	 c[i][j] = c[i][j] + a[i][k]*b[k][j];
51 
52    is transformed to
53 
54    for (int i = 0; i < N; i++)
55      for (int j = 0; j < N; j++)
56        for (int k = 0; k < N; k++)
57 	 c[i][j] = c[i][j] + a[i][k]*b[k][j];
58 
59    This pass implements loop interchange in the following steps:
60 
61      1) Find perfect loop nest for each innermost loop and compute data
62 	dependence relations for it.  For above example, loop nest is
63 	<loop_j, loop_k, loop_i>.
64      2) From innermost to outermost loop, this pass tries to interchange
65 	each loop pair.  For above case, it firstly tries to interchange
66 	<loop_k, loop_i> and loop nest becomes <loop_j, loop_i, loop_k>.
67 	Then it tries to interchange <loop_j, loop_i> and loop nest becomes
68 	<loop_i, loop_j, loop_k>.  The overall effect is to move innermost
69 	loop to the outermost position.  For loop pair <loop_i, loop_j>
70 	to be interchanged, we:
71      3) Check if data dependence relations are valid for loop interchange.
72      4) Check if both loops can be interchanged in terms of transformation.
73      5) Check if interchanging the two loops is profitable.
74      6) Interchange the two loops by mapping induction variables.
75 
76    This pass also handles reductions in loop nest.  So far we only support
77    simple reduction of inner loop and double reduction of the loop nest.  */
78 
79 /* Maximum number of stmts in each loop that should be interchanged.  */
80 #define MAX_NUM_STMT    (param_loop_interchange_max_num_stmts)
81 /* Maximum number of data references in loop nest.  */
82 #define MAX_DATAREFS    (param_loop_max_datarefs_for_datadeps)
83 
84 /* Comparison ratio of access stride between inner/outer loops to be
85    interchanged.  This is the minimum stride ratio for loop interchange
86    to be profitable.  */
87 #define OUTER_STRIDE_RATIO  (param_loop_interchange_stride_ratio)
88 /* The same as above, but we require higher ratio for interchanging the
89    innermost two loops.  */
90 #define INNER_STRIDE_RATIO  ((OUTER_STRIDE_RATIO) + 1)
91 
92 /* Comparison ratio of stmt cost between inner/outer loops.  Loops won't
93    be interchanged if outer loop has too many stmts.  */
94 #define STMT_COST_RATIO     (3)
95 
96 /* Vector of strides that DR accesses in each level loop of a loop nest.  */
97 #define DR_ACCESS_STRIDE(dr) ((vec<tree> *) dr->aux)
98 
99 /* Structure recording loop induction variable.  */
100 typedef struct induction
101 {
102   /* IV itself.  */
103   tree var;
104   /* IV's initializing value, which is the init arg of the IV PHI node.  */
105   tree init_val;
106   /* IV's initializing expr, which is (the expanded result of) init_val.  */
107   tree init_expr;
108   /* IV's step.  */
109   tree step;
110 } *induction_p;
111 
112 /* Enum type for loop reduction variable.  */
113 enum reduction_type
114 {
115   UNKNOWN_RTYPE = 0,
116   SIMPLE_RTYPE,
117   DOUBLE_RTYPE
118 };
119 
120 /* Structure recording loop reduction variable.  */
121 typedef struct reduction
122 {
123   /* Reduction itself.  */
124   tree var;
125   /* PHI node defining reduction variable.  */
126   gphi *phi;
127   /* Init and next variables of the reduction.  */
128   tree init;
129   tree next;
130   /* Lcssa PHI node if reduction is used outside of its definition loop.  */
131   gphi *lcssa_phi;
132   /* Stmts defining init and next.  */
133   gimple *producer;
134   gimple *consumer;
135   /* If init is loaded from memory, this is the loading memory reference.  */
136   tree init_ref;
137   /* If reduction is finally stored to memory, this is the stored memory
138      reference.  */
139   tree fini_ref;
140   enum reduction_type type;
141 } *reduction_p;
142 
143 
144 /* Dump reduction RE.  */
145 
146 static void
dump_reduction(reduction_p re)147 dump_reduction (reduction_p re)
148 {
149   if (re->type == SIMPLE_RTYPE)
150     fprintf (dump_file, "  Simple reduction:  ");
151   else if (re->type == DOUBLE_RTYPE)
152     fprintf (dump_file, "  Double reduction:  ");
153   else
154     fprintf (dump_file, "  Unknown reduction:  ");
155 
156   print_gimple_stmt (dump_file, re->phi, 0);
157 }
158 
159 /* Dump LOOP's induction IV.  */
160 static void
dump_induction(class loop * loop,induction_p iv)161 dump_induction (class loop *loop, induction_p iv)
162 {
163   fprintf (dump_file, "  Induction:  ");
164   print_generic_expr (dump_file, iv->var, TDF_SLIM);
165   fprintf (dump_file, " = {");
166   print_generic_expr (dump_file, iv->init_expr, TDF_SLIM);
167   fprintf (dump_file, ", ");
168   print_generic_expr (dump_file, iv->step, TDF_SLIM);
169   fprintf (dump_file, "}_%d\n", loop->num);
170 }
171 
172 /* Loop candidate for interchange.  */
173 
174 class loop_cand
175 {
176 public:
177   loop_cand (class loop *, class loop *);
178   ~loop_cand ();
179 
180   reduction_p find_reduction_by_stmt (gimple *);
181   void classify_simple_reduction (reduction_p);
182   bool analyze_iloop_reduction_var (tree);
183   bool analyze_oloop_reduction_var (loop_cand *, tree);
184   bool analyze_induction_var (tree, tree);
185   bool analyze_carried_vars (loop_cand *);
186   bool analyze_lcssa_phis (void);
187   bool can_interchange_p (loop_cand *);
188   void undo_simple_reduction (reduction_p, bitmap);
189 
190   /* The loop itself.  */
191   class loop *m_loop;
192   /* The outer loop for interchange.  It equals to loop if this loop cand
193      itself represents the outer loop.  */
194   class loop *m_outer;
195   /* Vector of induction variables in loop.  */
196   vec<induction_p> m_inductions;
197   /* Vector of reduction variables in loop.  */
198   vec<reduction_p> m_reductions;
199   /* Lcssa PHI nodes of this loop.  */
200   vec<gphi *> m_lcssa_nodes;
201   /* Single exit edge of this loop.  */
202   edge m_exit;
203   /* Basic blocks of this loop.  */
204   basic_block *m_bbs;
205   /* Number of stmts of this loop.  Inner loops' stmts are not included.  */
206   int m_num_stmts;
207   /* Number of constant initialized simple reduction.  */
208   int m_const_init_reduc;
209 };
210 
211 /* Constructor.  */
212 
loop_cand(class loop * loop,class loop * outer)213 loop_cand::loop_cand (class loop *loop, class loop *outer)
214   : m_loop (loop), m_outer (outer), m_exit (single_exit (loop)),
215     m_bbs (get_loop_body (loop)), m_num_stmts (0), m_const_init_reduc (0)
216 {
217     m_inductions.create (3);
218     m_reductions.create (3);
219     m_lcssa_nodes.create (3);
220 }
221 
222 /* Destructor.  */
223 
~loop_cand()224 loop_cand::~loop_cand ()
225 {
226   induction_p iv;
227   for (unsigned i = 0; m_inductions.iterate (i, &iv); ++i)
228     free (iv);
229 
230   reduction_p re;
231   for (unsigned i = 0; m_reductions.iterate (i, &re); ++i)
232     free (re);
233 
234   m_inductions.release ();
235   m_reductions.release ();
236   m_lcssa_nodes.release ();
237   free (m_bbs);
238 }
239 
240 /* Return single use stmt of VAR in LOOP, otherwise return NULL.  */
241 
242 static gimple *
single_use_in_loop(tree var,class loop * loop)243 single_use_in_loop (tree var, class loop *loop)
244 {
245   gimple *stmt, *res = NULL;
246   use_operand_p use_p;
247   imm_use_iterator iterator;
248 
249   FOR_EACH_IMM_USE_FAST (use_p, iterator, var)
250     {
251       stmt = USE_STMT (use_p);
252       if (is_gimple_debug (stmt))
253 	continue;
254 
255       if (!flow_bb_inside_loop_p (loop, gimple_bb (stmt)))
256 	continue;
257 
258       if (res)
259 	return NULL;
260 
261       res = stmt;
262     }
263   return res;
264 }
265 
266 /* Return true if E is unsupported in loop interchange, i.e, E is a complex
267    edge or part of irreducible loop.  */
268 
269 static inline bool
unsupported_edge(edge e)270 unsupported_edge (edge e)
271 {
272   return (e->flags & (EDGE_COMPLEX | EDGE_IRREDUCIBLE_LOOP));
273 }
274 
275 /* Return the reduction if STMT is one of its lcssa PHI, producer or consumer
276    stmt.  */
277 
278 reduction_p
find_reduction_by_stmt(gimple * stmt)279 loop_cand::find_reduction_by_stmt (gimple *stmt)
280 {
281   gphi *phi = dyn_cast <gphi *> (stmt);
282   reduction_p re;
283 
284   for (unsigned i = 0; m_reductions.iterate (i, &re); ++i)
285     if ((phi != NULL && phi == re->lcssa_phi)
286 	|| (stmt == re->producer || stmt == re->consumer))
287       return re;
288 
289   return NULL;
290 }
291 
292 /* Return true if current loop_cand be interchanged.  ILOOP is not NULL if
293    current loop_cand is outer loop in loop nest.  */
294 
295 bool
can_interchange_p(loop_cand * iloop)296 loop_cand::can_interchange_p (loop_cand *iloop)
297 {
298   /* For now we only support at most one reduction.  */
299   unsigned allowed_reduction_num = 1;
300 
301   /* Only support reduction if the loop nest to be interchanged is the
302      innermostin two loops.  */
303   if ((iloop == NULL && m_loop->inner != NULL)
304        || (iloop != NULL && iloop->m_loop->inner != NULL))
305     allowed_reduction_num = 0;
306 
307   if (m_reductions.length () > allowed_reduction_num
308       || (m_reductions.length () == 1
309 	  && m_reductions[0]->type == UNKNOWN_RTYPE))
310     return false;
311 
312   /* Only support lcssa PHI node which is for reduction.  */
313   if (m_lcssa_nodes.length () > allowed_reduction_num)
314     return false;
315 
316   /* Check if basic block has any unsupported operation.  Note basic blocks
317      of inner loops are not checked here.  */
318   for (unsigned i = 0; i < m_loop->num_nodes; i++)
319     {
320       basic_block bb = m_bbs[i];
321       gphi_iterator psi;
322       gimple_stmt_iterator gsi;
323 
324       /* Skip basic blocks of inner loops.  */
325       if (bb->loop_father != m_loop)
326        continue;
327 
328       for (gsi = gsi_start_bb (bb); !gsi_end_p (gsi); gsi_next (&gsi))
329 	{
330 	  gimple *stmt = gsi_stmt (gsi);
331 	  if (is_gimple_debug (stmt))
332 	    continue;
333 
334 	  if (gimple_has_side_effects (stmt))
335 	    return false;
336 
337 	  m_num_stmts++;
338 	  if (gcall *call = dyn_cast <gcall *> (stmt))
339 	    {
340 	      /* In basic block of outer loop, the call should be cheap since
341 		 it will be moved to inner loop.  */
342 	      if (iloop != NULL
343 		  && !gimple_inexpensive_call_p (call))
344 		return false;
345 	      continue;
346 	    }
347 
348 	  if (!iloop || !gimple_vuse (stmt))
349 	    continue;
350 
351 	  /* Support stmt accessing memory in outer loop only if it is for
352 	     inner loop's reduction.  */
353 	  if (iloop->find_reduction_by_stmt (stmt))
354 	    continue;
355 
356 	  tree lhs;
357 	  /* Support loop invariant memory reference if it's only used once by
358 	     inner loop.  */
359 	  /* ???  How's this checking for invariantness?  */
360 	  if (gimple_assign_single_p (stmt)
361 	      && (lhs = gimple_assign_lhs (stmt)) != NULL_TREE
362 	      && TREE_CODE (lhs) == SSA_NAME
363 	      && single_use_in_loop (lhs, iloop->m_loop))
364 	    continue;
365 
366 	  return false;
367 	}
368       /* Check if loop has too many stmts.  */
369       if (m_num_stmts > MAX_NUM_STMT)
370 	return false;
371 
372       /* Allow PHI nodes in any basic block of inner loop, PHI nodes in outer
373 	 loop's header, or PHI nodes in dest bb of inner loop's exit edge.  */
374       if (!iloop || bb == m_loop->header
375 	  || bb == iloop->m_exit->dest)
376 	continue;
377 
378       /* Don't allow any other PHI nodes.  */
379       for (psi = gsi_start_phis (bb); !gsi_end_p (psi); gsi_next (&psi))
380 	if (!virtual_operand_p (PHI_RESULT (psi.phi ())))
381 	  return false;
382     }
383 
384   return true;
385 }
386 
387 /* Programmers and optimizers (like loop store motion) may optimize code:
388 
389      for (int i = 0; i < N; i++)
390        for (int j = 0; j < N; j++)
391 	 a[i] += b[j][i] * c[j][i];
392 
393    into reduction:
394 
395      for (int i = 0; i < N; i++)
396        {
397 	 // producer.  Note sum can be intitialized to a constant.
398 	 int sum = a[i];
399 	 for (int j = 0; j < N; j++)
400 	   {
401 	     sum += b[j][i] * c[j][i];
402 	   }
403 	 // consumer.
404 	 a[i] = sum;
405        }
406 
407    The result code can't be interchanged without undoing the optimization.
408    This function classifies this kind reduction and records information so
409    that we can undo the store motion during interchange.  */
410 
411 void
classify_simple_reduction(reduction_p re)412 loop_cand::classify_simple_reduction (reduction_p re)
413 {
414   gimple *producer, *consumer;
415 
416   /* Check init variable of reduction and how it is initialized.  */
417   if (TREE_CODE (re->init) == SSA_NAME)
418     {
419       producer = SSA_NAME_DEF_STMT (re->init);
420       re->producer = producer;
421       basic_block bb = gimple_bb (producer);
422       if (!bb || bb->loop_father != m_outer)
423 	return;
424 
425       if (!gimple_assign_load_p (producer))
426 	return;
427 
428       re->init_ref = gimple_assign_rhs1 (producer);
429     }
430   else if (CONSTANT_CLASS_P (re->init))
431     m_const_init_reduc++;
432   else
433     return;
434 
435   /* Check how reduction variable is used.  */
436   consumer = single_use_in_loop (PHI_RESULT (re->lcssa_phi), m_outer);
437   if (!consumer
438       || !gimple_store_p (consumer))
439     return;
440 
441   re->fini_ref = gimple_get_lhs (consumer);
442   re->consumer = consumer;
443 
444   /* Simple reduction with constant initializer.  */
445   if (!re->init_ref)
446     {
447       gcc_assert (CONSTANT_CLASS_P (re->init));
448       re->init_ref = unshare_expr (re->fini_ref);
449     }
450 
451   /* Require memory references in producer and consumer are the same so
452      that we can undo reduction during interchange.  */
453   if (re->init_ref && !operand_equal_p (re->init_ref, re->fini_ref, 0))
454     return;
455 
456   re->type = SIMPLE_RTYPE;
457 }
458 
459 /* Analyze reduction variable VAR for inner loop of the loop nest to be
460    interchanged.  Return true if analysis succeeds.  */
461 
462 bool
analyze_iloop_reduction_var(tree var)463 loop_cand::analyze_iloop_reduction_var (tree var)
464 {
465   gphi *phi = as_a <gphi *> (SSA_NAME_DEF_STMT (var));
466   gphi *lcssa_phi = NULL, *use_phi;
467   tree init = PHI_ARG_DEF_FROM_EDGE (phi, loop_preheader_edge (m_loop));
468   tree next = PHI_ARG_DEF_FROM_EDGE (phi, loop_latch_edge (m_loop));
469   reduction_p re;
470   gimple *stmt, *next_def, *single_use = NULL;
471   use_operand_p use_p;
472   imm_use_iterator iterator;
473 
474   if (TREE_CODE (next) != SSA_NAME)
475     return false;
476 
477   next_def = SSA_NAME_DEF_STMT (next);
478   basic_block bb = gimple_bb (next_def);
479   if (!bb || !flow_bb_inside_loop_p (m_loop, bb))
480     return false;
481 
482   /* In restricted reduction, the var is (and must be) used in defining
483      the updated var.  The process can be depicted as below:
484 
485 		var ;; = PHI<init, next>
486 		 |
487 		 |
488 		 v
489       +---------------------+
490       | reduction operators | <-- other operands
491       +---------------------+
492 		 |
493 		 |
494 		 v
495 		next
496 
497      In terms loop interchange, we don't change how NEXT is computed based
498      on VAR and OTHER OPERANDS.  In case of double reduction in loop nest
499      to be interchanged, we don't changed it at all.  In the case of simple
500      reduction in inner loop, we only make change how VAR/NEXT is loaded or
501      stored.  With these conditions, we can relax restrictions on reduction
502      in a way that reduction operation is seen as black box.  In general,
503      we can ignore reassociation of reduction operator; we can handle fake
504      reductions in which VAR is not even used to compute NEXT.  */
505   if (! single_imm_use (var, &use_p, &single_use)
506       || ! flow_bb_inside_loop_p (m_loop, gimple_bb (single_use)))
507     return false;
508 
509   /* Check the reduction operation.  We require a left-associative operation.
510      For FP math we also need to be allowed to associate operations.  */
511   if (gassign *ass = dyn_cast <gassign *> (single_use))
512     {
513       enum tree_code code = gimple_assign_rhs_code (ass);
514       if (! (associative_tree_code (code)
515 	     || (code == MINUS_EXPR
516 		 && use_p->use == gimple_assign_rhs1_ptr (ass)))
517 	  || (FLOAT_TYPE_P (TREE_TYPE (var))
518 	      && ! flag_associative_math))
519 	return false;
520     }
521   else
522     return false;
523 
524   /* Handle and verify a series of stmts feeding the reduction op.  */
525   if (single_use != next_def
526       && !check_reduction_path (dump_user_location_t (), m_loop, phi, next,
527 				gimple_assign_rhs_code (single_use)))
528     return false;
529 
530   /* Only support cases in which INIT is used in inner loop.  */
531   if (TREE_CODE (init) == SSA_NAME)
532     FOR_EACH_IMM_USE_FAST (use_p, iterator, init)
533       {
534 	stmt = USE_STMT (use_p);
535 	if (is_gimple_debug (stmt))
536 	  continue;
537 
538 	if (!flow_bb_inside_loop_p (m_loop, gimple_bb (stmt)))
539 	  return false;
540       }
541 
542   FOR_EACH_IMM_USE_FAST (use_p, iterator, next)
543     {
544       stmt = USE_STMT (use_p);
545       if (is_gimple_debug (stmt))
546 	continue;
547 
548       /* Or else it's used in PHI itself.  */
549       use_phi = dyn_cast <gphi *> (stmt);
550       if (use_phi == phi)
551 	continue;
552 
553       if (use_phi != NULL
554 	  && lcssa_phi == NULL
555 	  && gimple_bb (stmt) == m_exit->dest
556 	  && PHI_ARG_DEF_FROM_EDGE (use_phi, m_exit) == next)
557 	lcssa_phi = use_phi;
558       else
559 	return false;
560     }
561   if (!lcssa_phi)
562     return false;
563 
564   re = XCNEW (struct reduction);
565   re->var = var;
566   re->init = init;
567   re->next = next;
568   re->phi = phi;
569   re->lcssa_phi = lcssa_phi;
570 
571   classify_simple_reduction (re);
572 
573   if (dump_file && (dump_flags & TDF_DETAILS))
574     dump_reduction (re);
575 
576   m_reductions.safe_push (re);
577   return true;
578 }
579 
580 /* Analyze reduction variable VAR for outer loop of the loop nest to be
581    interchanged.  ILOOP is not NULL and points to inner loop.  For the
582    moment, we only support double reduction for outer loop, like:
583 
584      for (int i = 0; i < n; i++)
585        {
586 	 int sum = 0;
587 
588 	 for (int j = 0; j < n; j++)    // outer loop
589 	   for (int k = 0; k < n; k++)  // inner loop
590 	     sum += a[i][k]*b[k][j];
591 
592 	 s[i] = sum;
593        }
594 
595    Note the innermost two loops are the loop nest to be interchanged.
596    Return true if analysis succeeds.  */
597 
598 bool
analyze_oloop_reduction_var(loop_cand * iloop,tree var)599 loop_cand::analyze_oloop_reduction_var (loop_cand *iloop, tree var)
600 {
601   gphi *phi = as_a <gphi *> (SSA_NAME_DEF_STMT (var));
602   gphi *lcssa_phi = NULL, *use_phi;
603   tree init = PHI_ARG_DEF_FROM_EDGE (phi, loop_preheader_edge (m_loop));
604   tree next = PHI_ARG_DEF_FROM_EDGE (phi, loop_latch_edge (m_loop));
605   reduction_p re;
606   gimple *stmt, *next_def;
607   use_operand_p use_p;
608   imm_use_iterator iterator;
609 
610   if (TREE_CODE (next) != SSA_NAME)
611     return false;
612 
613   next_def = SSA_NAME_DEF_STMT (next);
614   basic_block bb = gimple_bb (next_def);
615   if (!bb || !flow_bb_inside_loop_p (m_loop, bb))
616     return false;
617 
618   /* Find inner loop's simple reduction that uses var as initializer.  */
619   reduction_p inner_re = NULL;
620   for (unsigned i = 0; iloop->m_reductions.iterate (i, &inner_re); ++i)
621     if (inner_re->init == var || operand_equal_p (inner_re->init, var, 0))
622       break;
623 
624   if (inner_re == NULL
625       || inner_re->type != UNKNOWN_RTYPE
626       || inner_re->producer != phi)
627     return false;
628 
629   /* In case of double reduction, outer loop's reduction should be updated
630      by inner loop's simple reduction.  */
631   if (next_def != inner_re->lcssa_phi)
632     return false;
633 
634   /* Outer loop's reduction should only be used to initialize inner loop's
635      simple reduction.  */
636   if (! single_imm_use (var, &use_p, &stmt)
637       || stmt != inner_re->phi)
638     return false;
639 
640   /* Check this reduction is correctly used outside of loop via lcssa phi.  */
641   FOR_EACH_IMM_USE_FAST (use_p, iterator, next)
642     {
643       stmt = USE_STMT (use_p);
644       if (is_gimple_debug (stmt))
645 	continue;
646 
647       /* Or else it's used in PHI itself.  */
648       use_phi = dyn_cast <gphi *> (stmt);
649       if (use_phi == phi)
650 	continue;
651 
652       if (lcssa_phi == NULL
653 	  && use_phi != NULL
654 	  && gimple_bb (stmt) == m_exit->dest
655 	  && PHI_ARG_DEF_FROM_EDGE (use_phi, m_exit) == next)
656 	lcssa_phi = use_phi;
657       else
658 	return false;
659     }
660   if (!lcssa_phi)
661     return false;
662 
663   re = XCNEW (struct reduction);
664   re->var = var;
665   re->init = init;
666   re->next = next;
667   re->phi = phi;
668   re->lcssa_phi = lcssa_phi;
669   re->type = DOUBLE_RTYPE;
670   inner_re->type = DOUBLE_RTYPE;
671 
672   if (dump_file && (dump_flags & TDF_DETAILS))
673     dump_reduction (re);
674 
675   m_reductions.safe_push (re);
676   return true;
677 }
678 
679 /* Return true if VAR is induction variable of current loop whose scev is
680    specified by CHREC.  */
681 
682 bool
analyze_induction_var(tree var,tree chrec)683 loop_cand::analyze_induction_var (tree var, tree chrec)
684 {
685   gphi *phi = as_a <gphi *> (SSA_NAME_DEF_STMT (var));
686   tree init = PHI_ARG_DEF_FROM_EDGE (phi, loop_preheader_edge (m_loop));
687 
688   /* Var is loop invariant, though it's unlikely to happen.  */
689   if (tree_does_not_contain_chrecs (chrec))
690     {
691       /* Punt on floating point invariants if honoring signed zeros,
692 	 representing that as + 0.0 would change the result if init
693 	 is -0.0.  Similarly for SNaNs it can raise exception.  */
694       if (HONOR_SIGNED_ZEROS (chrec) || HONOR_SNANS (chrec))
695 	return false;
696       struct induction *iv = XCNEW (struct induction);
697       iv->var = var;
698       iv->init_val = init;
699       iv->init_expr = chrec;
700       iv->step = build_zero_cst (TREE_TYPE (chrec));
701       m_inductions.safe_push (iv);
702       return true;
703     }
704 
705   if (TREE_CODE (chrec) != POLYNOMIAL_CHREC
706       || CHREC_VARIABLE (chrec) != (unsigned) m_loop->num
707       || tree_contains_chrecs (CHREC_LEFT (chrec), NULL)
708       || tree_contains_chrecs (CHREC_RIGHT (chrec), NULL))
709     return false;
710 
711   struct induction *iv = XCNEW (struct induction);
712   iv->var = var;
713   iv->init_val = init;
714   iv->init_expr = CHREC_LEFT (chrec);
715   iv->step = CHREC_RIGHT (chrec);
716 
717   if (dump_file && (dump_flags & TDF_DETAILS))
718     dump_induction (m_loop, iv);
719 
720   m_inductions.safe_push (iv);
721   return true;
722 }
723 
724 /* Return true if all loop carried variables defined in loop header can
725    be successfully analyzed.  */
726 
727 bool
analyze_carried_vars(loop_cand * iloop)728 loop_cand::analyze_carried_vars (loop_cand *iloop)
729 {
730   edge e = loop_preheader_edge (m_outer);
731   gphi_iterator gsi;
732 
733   if (dump_file && (dump_flags & TDF_DETAILS))
734     fprintf (dump_file, "\nLoop(%d) carried vars:\n", m_loop->num);
735 
736   for (gsi = gsi_start_phis (m_loop->header); !gsi_end_p (gsi); gsi_next (&gsi))
737     {
738       gphi *phi = gsi.phi ();
739 
740       tree var = PHI_RESULT (phi);
741       if (virtual_operand_p (var))
742 	continue;
743 
744       tree chrec = analyze_scalar_evolution (m_loop, var);
745       chrec = instantiate_scev (e, m_loop, chrec);
746 
747       /* Analyze var as reduction variable.  */
748       if (chrec_contains_undetermined (chrec)
749 	  || chrec_contains_symbols_defined_in_loop (chrec, m_outer->num))
750 	{
751 	  if (iloop && !analyze_oloop_reduction_var (iloop, var))
752 	    return false;
753 	  if (!iloop && !analyze_iloop_reduction_var (var))
754 	    return false;
755 	}
756       /* Analyze var as induction variable.  */
757       else if (!analyze_induction_var (var, chrec))
758 	return false;
759     }
760 
761   return true;
762 }
763 
764 /* Return TRUE if loop closed PHI nodes can be analyzed successfully.  */
765 
766 bool
analyze_lcssa_phis(void)767 loop_cand::analyze_lcssa_phis (void)
768 {
769   gphi_iterator gsi;
770   for (gsi = gsi_start_phis (m_exit->dest); !gsi_end_p (gsi); gsi_next (&gsi))
771     {
772       gphi *phi = gsi.phi ();
773 
774       if (virtual_operand_p (PHI_RESULT (phi)))
775 	continue;
776 
777       /* TODO: We only support lcssa phi for reduction for now.  */
778       if (!find_reduction_by_stmt (phi))
779 	return false;
780     }
781 
782   return true;
783 }
784 
785 /* CONSUMER is a stmt in BB storing reduction result into memory object.
786    When the reduction is intialized from constant value, we need to add
787    a stmt loading from the memory object to target basic block in inner
788    loop during undoing the reduction.  Problem is that memory reference
789    may use ssa variables not dominating the target basic block.  This
790    function finds all stmts on which CONSUMER depends in basic block BB,
791    records and returns them via STMTS.  */
792 
793 static void
find_deps_in_bb_for_stmt(gimple_seq * stmts,basic_block bb,gimple * consumer)794 find_deps_in_bb_for_stmt (gimple_seq *stmts, basic_block bb, gimple *consumer)
795 {
796   auto_vec<gimple *, 4> worklist;
797   use_operand_p use_p;
798   ssa_op_iter iter;
799   gimple *stmt, *def_stmt;
800   gimple_stmt_iterator gsi;
801 
802   /* First clear flag for stmts in bb.  */
803   for (gsi = gsi_start_bb (bb); !gsi_end_p (gsi); gsi_next (&gsi))
804     gimple_set_plf (gsi_stmt (gsi), GF_PLF_1, false);
805 
806   /* DFS search all depended stmts in bb and mark flag for these stmts.  */
807   worklist.safe_push (consumer);
808   while (!worklist.is_empty ())
809     {
810       stmt = worklist.pop ();
811       FOR_EACH_SSA_USE_OPERAND (use_p, stmt, iter, SSA_OP_USE)
812 	{
813 	  def_stmt = SSA_NAME_DEF_STMT (USE_FROM_PTR (use_p));
814 
815 	  if (is_a <gphi *> (def_stmt)
816 	      || gimple_bb (def_stmt) != bb
817 	      || gimple_plf (def_stmt, GF_PLF_1))
818 	    continue;
819 
820 	  worklist.safe_push (def_stmt);
821 	}
822       gimple_set_plf (stmt, GF_PLF_1, true);
823     }
824   for (gsi = gsi_start_nondebug_bb (bb);
825        !gsi_end_p (gsi) && (stmt = gsi_stmt (gsi)) != consumer;)
826     {
827       /* Move dep stmts to sequence STMTS.  */
828       if (gimple_plf (stmt, GF_PLF_1))
829 	{
830 	  gsi_remove (&gsi, false);
831 	  gimple_seq_add_stmt_without_update (stmts, stmt);
832 	}
833       else
834 	gsi_next_nondebug (&gsi);
835     }
836 }
837 
838 /* User can write, optimizers can generate simple reduction RE for inner
839    loop.  In order to make interchange valid, we have to undo reduction by
840    moving producer and consumer stmts into the inner loop.  For example,
841    below code:
842 
843      init = MEM_REF[idx];		//producer
844      loop:
845        var = phi<init, next>
846        next = var op ...
847      reduc_sum = phi<next>
848      MEM_REF[idx] = reduc_sum		//consumer
849 
850    is transformed into:
851 
852      loop:
853        new_var = MEM_REF[idx];		//producer after moving
854        next = new_var op ...
855        MEM_REF[idx] = next;		//consumer after moving
856 
857    Note if the reduction variable is initialized to constant, like:
858 
859      var = phi<0.0, next>
860 
861    we compute new_var as below:
862 
863      loop:
864        tmp = MEM_REF[idx];
865        new_var = !first_iteration ? tmp : 0.0;
866 
867    so that the initial const is used in the first iteration of loop.  Also
868    record ssa variables for dead code elimination in DCE_SEEDS.  */
869 
870 void
undo_simple_reduction(reduction_p re,bitmap dce_seeds)871 loop_cand::undo_simple_reduction (reduction_p re, bitmap dce_seeds)
872 {
873   gimple *stmt;
874   gimple_stmt_iterator from, to = gsi_after_labels (m_loop->header);
875   gimple_seq stmts = NULL;
876   tree new_var;
877 
878   /* Prepare the initialization stmts and insert it to inner loop.  */
879   if (re->producer != NULL)
880     {
881       gimple_set_vuse (re->producer, NULL_TREE);
882       update_stmt (re->producer);
883       from = gsi_for_stmt (re->producer);
884       gsi_remove (&from, false);
885       gimple_seq_add_stmt_without_update (&stmts, re->producer);
886       new_var = re->init;
887     }
888   else
889     {
890       /* Find all stmts on which expression "MEM_REF[idx]" depends.  */
891       find_deps_in_bb_for_stmt (&stmts, gimple_bb (re->consumer), re->consumer);
892       /* Because we generate new stmt loading from the MEM_REF to TMP.  */
893       tree cond, tmp = copy_ssa_name (re->var);
894       stmt = gimple_build_assign (tmp, re->init_ref);
895       gimple_seq_add_stmt_without_update (&stmts, stmt);
896 
897       /* Init new_var to MEM_REF or CONST depending on if it is the first
898 	 iteration.  */
899       induction_p iv = m_inductions[0];
900       cond = fold_build2 (NE_EXPR, boolean_type_node, iv->var, iv->init_val);
901       new_var = copy_ssa_name (re->var);
902       stmt = gimple_build_assign (new_var, COND_EXPR, cond, tmp, re->init);
903       gimple_seq_add_stmt_without_update (&stmts, stmt);
904     }
905   gsi_insert_seq_before (&to, stmts, GSI_SAME_STMT);
906 
907   /* Replace all uses of reduction var with new variable.  */
908   use_operand_p use_p;
909   imm_use_iterator iterator;
910   FOR_EACH_IMM_USE_STMT (stmt, iterator, re->var)
911     {
912       FOR_EACH_IMM_USE_ON_STMT (use_p, iterator)
913 	SET_USE (use_p, new_var);
914 
915       update_stmt (stmt);
916     }
917 
918   /* Move consumer stmt into inner loop, just after reduction next's def.  */
919   unlink_stmt_vdef (re->consumer);
920   release_ssa_name (gimple_vdef (re->consumer));
921   gimple_set_vdef (re->consumer, NULL_TREE);
922   gimple_set_vuse (re->consumer, NULL_TREE);
923   gimple_assign_set_rhs1 (re->consumer, re->next);
924   update_stmt (re->consumer);
925   from = gsi_for_stmt (re->consumer);
926   to = gsi_for_stmt (SSA_NAME_DEF_STMT (re->next));
927   gsi_move_after (&from, &to);
928 
929   /* Mark the reduction variables for DCE.  */
930   bitmap_set_bit (dce_seeds, SSA_NAME_VERSION (re->var));
931   bitmap_set_bit (dce_seeds, SSA_NAME_VERSION (PHI_RESULT (re->lcssa_phi)));
932 }
933 
934 /* Free DATAREFS and its auxiliary memory.  */
935 
936 static void
free_data_refs_with_aux(vec<data_reference_p> datarefs)937 free_data_refs_with_aux (vec<data_reference_p> datarefs)
938 {
939   data_reference_p dr;
940   for (unsigned i = 0; datarefs.iterate (i, &dr); ++i)
941     if (dr->aux != NULL)
942       {
943 	DR_ACCESS_STRIDE (dr)->release ();
944 	delete (vec<tree> *) dr->aux;
945       }
946 
947   free_data_refs (datarefs);
948 }
949 
950 /* Class for loop interchange transformation.  */
951 
952 class tree_loop_interchange
953 {
954 public:
tree_loop_interchange(vec<class loop * > loop_nest)955   tree_loop_interchange (vec<class loop *> loop_nest)
956     : m_loop_nest (loop_nest), m_niters_iv_var (NULL_TREE),
957       m_dce_seeds (BITMAP_ALLOC (NULL)) { }
~tree_loop_interchange()958   ~tree_loop_interchange () { BITMAP_FREE (m_dce_seeds); }
959   bool interchange (vec<data_reference_p>, vec<ddr_p>);
960 
961 private:
962   void update_data_info (unsigned, unsigned, vec<data_reference_p>, vec<ddr_p>);
963   bool valid_data_dependences (unsigned, unsigned, vec<ddr_p>);
964   void interchange_loops (loop_cand &, loop_cand &);
965   void map_inductions_to_loop (loop_cand &, loop_cand &);
966   void move_code_to_inner_loop (class loop *, class loop *, basic_block *);
967 
968   /* The whole loop nest in which interchange is ongoing.  */
969   vec<class loop *> m_loop_nest;
970   /* We create new IV which is only used in loop's exit condition check.
971      In case of 3-level loop nest interchange, when we interchange the
972      innermost two loops, new IV created in the middle level loop does
973      not need to be preserved in interchanging the outermost two loops
974      later.  We record the IV so that it can be skipped.  */
975   tree m_niters_iv_var;
976   /* Bitmap of seed variables for dead code elimination after interchange.  */
977   bitmap m_dce_seeds;
978 };
979 
980 /* Update data refs' access stride and dependence information after loop
981    interchange.  I_IDX/O_IDX gives indices of interchanged loops in loop
982    nest.  DATAREFS are data references.  DDRS are data dependences.  */
983 
984 void
update_data_info(unsigned i_idx,unsigned o_idx,vec<data_reference_p> datarefs,vec<ddr_p> ddrs)985 tree_loop_interchange::update_data_info (unsigned i_idx, unsigned o_idx,
986 					 vec<data_reference_p> datarefs,
987 					 vec<ddr_p> ddrs)
988 {
989   struct data_reference *dr;
990   struct data_dependence_relation *ddr;
991 
992   /* Update strides of data references.  */
993   for (unsigned i = 0; datarefs.iterate (i, &dr); ++i)
994     {
995       vec<tree> *stride = DR_ACCESS_STRIDE (dr);
996       gcc_assert (stride->length () > i_idx);
997       std::swap ((*stride)[i_idx], (*stride)[o_idx]);
998     }
999   /* Update data dependences.  */
1000   for (unsigned i = 0; ddrs.iterate (i, &ddr); ++i)
1001     if (DDR_ARE_DEPENDENT (ddr) != chrec_known)
1002       {
1003         for (unsigned j = 0; j < DDR_NUM_DIST_VECTS (ddr); ++j)
1004 	  {
1005 	    lambda_vector dist_vect = DDR_DIST_VECT (ddr, j);
1006 	    std::swap (dist_vect[i_idx], dist_vect[o_idx]);
1007 	  }
1008       }
1009 }
1010 
1011 /* Check data dependence relations, return TRUE if it's valid to interchange
1012    two loops specified by I_IDX/O_IDX.  Theoretically, interchanging the two
1013    loops is valid only if dist vector, after interchanging, doesn't have '>'
1014    as the leftmost non-'=' direction.  Practically, this function have been
1015    conservative here by not checking some valid cases.  */
1016 
1017 bool
valid_data_dependences(unsigned i_idx,unsigned o_idx,vec<ddr_p> ddrs)1018 tree_loop_interchange::valid_data_dependences (unsigned i_idx, unsigned o_idx,
1019 					       vec<ddr_p> ddrs)
1020 {
1021   struct data_dependence_relation *ddr;
1022 
1023   for (unsigned i = 0; ddrs.iterate (i, &ddr); ++i)
1024     {
1025       /* Skip no-dependence case.  */
1026       if (DDR_ARE_DEPENDENT (ddr) == chrec_known)
1027 	continue;
1028 
1029       for (unsigned j = 0; j < DDR_NUM_DIST_VECTS (ddr); ++j)
1030 	{
1031 	  lambda_vector dist_vect = DDR_DIST_VECT (ddr, j);
1032 	  unsigned level = dependence_level (dist_vect, m_loop_nest.length ());
1033 
1034 	  /* If there is no carried dependence.  */
1035 	  if (level == 0)
1036 	    continue;
1037 
1038 	  level --;
1039 
1040 	  /* If dependence is not carried by any loop in between the two
1041 	     loops [oloop, iloop] to interchange.  */
1042 	  if (level < o_idx || level > i_idx)
1043 	    continue;
1044 
1045 	  /* Be conservative, skip case if either direction at i_idx/o_idx
1046 	     levels is not '=' or '<'.  */
1047 	  if ((!DDR_REVERSED_P (ddr) && dist_vect[i_idx] < 0)
1048 	      || (DDR_REVERSED_P (ddr) && dist_vect[i_idx] > 0)
1049 	      || (!DDR_REVERSED_P (ddr) && dist_vect[o_idx] < 0)
1050 	      || (DDR_REVERSED_P (ddr) && dist_vect[o_idx] > 0))
1051 	    return false;
1052 	}
1053     }
1054 
1055   return true;
1056 }
1057 
1058 /* Interchange two loops specified by ILOOP and OLOOP.  */
1059 
1060 void
interchange_loops(loop_cand & iloop,loop_cand & oloop)1061 tree_loop_interchange::interchange_loops (loop_cand &iloop, loop_cand &oloop)
1062 {
1063   reduction_p re;
1064   gimple_stmt_iterator gsi;
1065   tree i_niters, o_niters, var_after;
1066 
1067   /* Undo inner loop's simple reduction.  */
1068   for (unsigned i = 0; iloop.m_reductions.iterate (i, &re); ++i)
1069     if (re->type != DOUBLE_RTYPE)
1070       {
1071 	if (re->producer)
1072 	  reset_debug_uses (re->producer);
1073 
1074 	iloop.undo_simple_reduction (re, m_dce_seeds);
1075       }
1076 
1077   /* Only need to reset debug uses for double reduction.  */
1078   for (unsigned i = 0; oloop.m_reductions.iterate (i, &re); ++i)
1079     {
1080       gcc_assert (re->type == DOUBLE_RTYPE);
1081       reset_debug_uses (SSA_NAME_DEF_STMT (re->var));
1082       reset_debug_uses (SSA_NAME_DEF_STMT (re->next));
1083     }
1084 
1085   /* Prepare niters for both loops.  */
1086   class loop *loop_nest = m_loop_nest[0];
1087   edge instantiate_below = loop_preheader_edge (loop_nest);
1088   gsi = gsi_last_bb (loop_preheader_edge (loop_nest)->src);
1089   i_niters = number_of_latch_executions (iloop.m_loop);
1090   i_niters = analyze_scalar_evolution (loop_outer (iloop.m_loop), i_niters);
1091   i_niters = instantiate_scev (instantiate_below, loop_outer (iloop.m_loop),
1092 			       i_niters);
1093   i_niters = force_gimple_operand_gsi (&gsi, unshare_expr (i_niters), true,
1094 				       NULL_TREE, false, GSI_CONTINUE_LINKING);
1095   o_niters = number_of_latch_executions (oloop.m_loop);
1096   if (oloop.m_loop != loop_nest)
1097     {
1098       o_niters = analyze_scalar_evolution (loop_outer (oloop.m_loop), o_niters);
1099       o_niters = instantiate_scev (instantiate_below, loop_outer (oloop.m_loop),
1100 				   o_niters);
1101     }
1102   o_niters = force_gimple_operand_gsi (&gsi, unshare_expr (o_niters), true,
1103 				       NULL_TREE, false, GSI_CONTINUE_LINKING);
1104 
1105   /* Move src's code to tgt loop.  This is necessary when src is the outer
1106      loop and tgt is the inner loop.  */
1107   move_code_to_inner_loop (oloop.m_loop, iloop.m_loop, oloop.m_bbs);
1108 
1109   /* Map outer loop's IV to inner loop, and vice versa.  */
1110   map_inductions_to_loop (oloop, iloop);
1111   map_inductions_to_loop (iloop, oloop);
1112 
1113   /* Create canonical IV for both loops.  Note canonical IV for outer/inner
1114      loop is actually from inner/outer loop.  Also we record the new IV
1115      created for the outer loop so that it can be skipped in later loop
1116      interchange.  */
1117   create_canonical_iv (oloop.m_loop, oloop.m_exit,
1118 		       i_niters, &m_niters_iv_var, &var_after);
1119   bitmap_set_bit (m_dce_seeds, SSA_NAME_VERSION (var_after));
1120   create_canonical_iv (iloop.m_loop, iloop.m_exit,
1121 		       o_niters, NULL, &var_after);
1122   bitmap_set_bit (m_dce_seeds, SSA_NAME_VERSION (var_after));
1123 
1124   /* Scrap niters estimation of interchanged loops.  */
1125   iloop.m_loop->any_upper_bound = false;
1126   iloop.m_loop->any_likely_upper_bound = false;
1127   free_numbers_of_iterations_estimates (iloop.m_loop);
1128   oloop.m_loop->any_upper_bound = false;
1129   oloop.m_loop->any_likely_upper_bound = false;
1130   free_numbers_of_iterations_estimates (oloop.m_loop);
1131 
1132   /* Clear all cached scev information.  This is expensive but shouldn't be
1133      a problem given we interchange in very limited times.  */
1134   scev_reset_htab ();
1135 
1136   /* ???  The association between the loop data structure and the
1137      CFG changed, so what was loop N at the source level is now
1138      loop M.  We should think of retaining the association or breaking
1139      it fully by creating a new loop instead of re-using the "wrong" one.  */
1140 }
1141 
1142 /* Map induction variables of SRC loop to TGT loop.  The function firstly
1143    creates the same IV of SRC loop in TGT loop, then deletes the original
1144    IV and re-initialize it using the newly created IV.  For example, loop
1145    nest:
1146 
1147      for (i = 0; i < N; i++)
1148        for (j = 0; j < M; j++)
1149 	 {
1150 	   //use of i;
1151 	   //use of j;
1152 	 }
1153 
1154    will be transformed into:
1155 
1156      for (jj = 0; jj < M; jj++)
1157        for (ii = 0; ii < N; ii++)
1158 	 {
1159 	   //use of ii;
1160 	   //use of jj;
1161 	 }
1162 
1163    after loop interchange.  */
1164 
1165 void
map_inductions_to_loop(loop_cand & src,loop_cand & tgt)1166 tree_loop_interchange::map_inductions_to_loop (loop_cand &src, loop_cand &tgt)
1167 {
1168   induction_p iv;
1169   edge e = tgt.m_exit;
1170   gimple_stmt_iterator incr_pos = gsi_last_bb (e->src), gsi;
1171 
1172   /* Map source loop's IV to target loop.  */
1173   for (unsigned i = 0; src.m_inductions.iterate (i, &iv); ++i)
1174     {
1175       gimple *use_stmt, *stmt = SSA_NAME_DEF_STMT (iv->var);
1176       gcc_assert (is_a <gphi *> (stmt));
1177 
1178       use_operand_p use_p;
1179       /* Only map original IV to target loop.  */
1180       if (m_niters_iv_var != iv->var)
1181 	{
1182 	  /* Map the IV by creating the same one in target loop.  */
1183 	  tree var_before, var_after;
1184 	  tree base = unshare_expr (iv->init_expr);
1185 	  tree step = unshare_expr (iv->step);
1186 	  create_iv (base, step, SSA_NAME_VAR (iv->var),
1187 		     tgt.m_loop, &incr_pos, false, &var_before, &var_after);
1188 	  bitmap_set_bit (m_dce_seeds, SSA_NAME_VERSION (var_before));
1189 	  bitmap_set_bit (m_dce_seeds, SSA_NAME_VERSION (var_after));
1190 
1191 	  /* Replace uses of the original IV var with newly created IV var.  */
1192 	  imm_use_iterator imm_iter;
1193 	  FOR_EACH_IMM_USE_STMT (use_stmt, imm_iter, iv->var)
1194 	    {
1195 	      FOR_EACH_IMM_USE_ON_STMT (use_p, imm_iter)
1196 		SET_USE (use_p, var_before);
1197 
1198 	      update_stmt (use_stmt);
1199 	    }
1200 	}
1201 
1202       /* Mark all uses for DCE.  */
1203       ssa_op_iter op_iter;
1204       FOR_EACH_PHI_OR_STMT_USE (use_p, stmt, op_iter, SSA_OP_USE)
1205 	{
1206 	  tree use = USE_FROM_PTR (use_p);
1207 	  if (TREE_CODE (use) == SSA_NAME
1208 	      && ! SSA_NAME_IS_DEFAULT_DEF (use))
1209 	    bitmap_set_bit (m_dce_seeds, SSA_NAME_VERSION (use));
1210 	}
1211 
1212       /* Delete definition of the original IV in the source loop.  */
1213       gsi = gsi_for_stmt (stmt);
1214       remove_phi_node (&gsi, true);
1215     }
1216 }
1217 
1218 /* Move stmts of outer loop to inner loop.  */
1219 
1220 void
move_code_to_inner_loop(class loop * outer,class loop * inner,basic_block * outer_bbs)1221 tree_loop_interchange::move_code_to_inner_loop (class loop *outer,
1222 						class loop *inner,
1223 						basic_block *outer_bbs)
1224 {
1225   basic_block oloop_exit_bb = single_exit (outer)->src;
1226   gimple_stmt_iterator gsi, to;
1227 
1228   for (unsigned i = 0; i < outer->num_nodes; i++)
1229     {
1230       basic_block bb = outer_bbs[i];
1231 
1232       /* Skip basic blocks of inner loop.  */
1233       if (flow_bb_inside_loop_p (inner, bb))
1234 	continue;
1235 
1236       /* Move code from header/latch to header/latch.  */
1237       if (bb == outer->header)
1238 	to = gsi_after_labels (inner->header);
1239       else if (bb == outer->latch)
1240 	to = gsi_after_labels (inner->latch);
1241       else
1242 	/* Otherwise, simply move to exit->src.  */
1243 	to = gsi_last_bb (single_exit (inner)->src);
1244 
1245       for (gsi = gsi_after_labels (bb); !gsi_end_p (gsi);)
1246 	{
1247 	  gimple *stmt = gsi_stmt (gsi);
1248 
1249 	  if (oloop_exit_bb == bb
1250 	      && stmt == gsi_stmt (gsi_last_bb (oloop_exit_bb)))
1251 	    {
1252 	      gsi_next (&gsi);
1253 	      continue;
1254 	    }
1255 
1256 	  if (gimple_vdef (stmt))
1257 	    {
1258 	      unlink_stmt_vdef (stmt);
1259 	      release_ssa_name (gimple_vdef (stmt));
1260 	      gimple_set_vdef (stmt, NULL_TREE);
1261 	    }
1262 	  if (gimple_vuse (stmt))
1263 	    {
1264 	      gimple_set_vuse (stmt, NULL_TREE);
1265 	      update_stmt (stmt);
1266 	    }
1267 
1268 	  reset_debug_uses (stmt);
1269 	  gsi_move_before (&gsi, &to);
1270 	}
1271     }
1272 }
1273 
1274 /* Given data reference DR in LOOP_NEST, the function computes DR's access
1275    stride at each level of loop from innermost LOOP to outer.  On success,
1276    it saves access stride at each level loop in a vector which is pointed
1277    by DR->aux.  For example:
1278 
1279      int arr[100][100][100];
1280      for (i = 0; i < 100; i++)       ;(DR->aux)strides[0] = 40000
1281        for (j = 100; j > 0; j--)     ;(DR->aux)strides[1] = 400
1282 	 for (k = 0; k < 100; k++)   ;(DR->aux)strides[2] = 4
1283 	   arr[i][j - 1][k] = 0;  */
1284 
1285 static void
compute_access_stride(class loop * & loop_nest,class loop * loop,data_reference_p dr)1286 compute_access_stride (class loop *&loop_nest, class loop *loop,
1287 		       data_reference_p dr)
1288 {
1289   vec<tree> *strides = new vec<tree> ();
1290   dr->aux = strides;
1291 
1292   basic_block bb = gimple_bb (DR_STMT (dr));
1293   if (!flow_bb_inside_loop_p (loop_nest, bb))
1294     return;
1295   while (!flow_bb_inside_loop_p (loop, bb))
1296     {
1297       strides->safe_push (build_int_cst (sizetype, 0));
1298       loop = loop_outer (loop);
1299     }
1300   gcc_assert (loop == bb->loop_father);
1301 
1302   tree ref = DR_REF (dr);
1303   if (TREE_CODE (ref) == COMPONENT_REF
1304       && DECL_BIT_FIELD (TREE_OPERAND (ref, 1)))
1305     {
1306       /* We can't take address of bitfields.  If the bitfield is at constant
1307 	 offset from the start of the struct, just use address of the
1308 	 struct, for analysis of the strides that shouldn't matter.  */
1309       if (!TREE_OPERAND (ref, 2)
1310 	  || TREE_CODE (TREE_OPERAND (ref, 2)) == INTEGER_CST)
1311 	ref = TREE_OPERAND (ref, 0);
1312       /* Otherwise, if we have a bit field representative, use that.  */
1313       else if (DECL_BIT_FIELD_REPRESENTATIVE (TREE_OPERAND (ref, 1))
1314 	       != NULL_TREE)
1315 	{
1316 	  tree repr = DECL_BIT_FIELD_REPRESENTATIVE (TREE_OPERAND (ref, 1));
1317 	  ref = build3 (COMPONENT_REF, TREE_TYPE (repr), TREE_OPERAND (ref, 0),
1318 			repr, TREE_OPERAND (ref, 2));
1319 	}
1320       /* Otherwise punt.  */
1321       else
1322 	return;
1323     }
1324   tree scev_base = build_fold_addr_expr (ref);
1325   tree scev = analyze_scalar_evolution (loop, scev_base);
1326   if (chrec_contains_undetermined (scev))
1327     return;
1328 
1329   tree orig_scev = scev;
1330   do
1331     {
1332       scev = instantiate_scev (loop_preheader_edge (loop_nest),
1333 			       loop, orig_scev);
1334       if (! chrec_contains_undetermined (scev))
1335 	break;
1336 
1337       /* If we couldn't instantiate for the desired nest, shrink it.  */
1338       if (loop_nest == loop)
1339 	return;
1340       loop_nest = loop_nest->inner;
1341     } while (1);
1342 
1343   tree sl = scev;
1344   class loop *expected = loop;
1345   while (TREE_CODE (sl) == POLYNOMIAL_CHREC)
1346     {
1347       class loop *sl_loop = get_chrec_loop (sl);
1348       while (sl_loop != expected)
1349 	{
1350 	  strides->safe_push (size_int (0));
1351 	  expected = loop_outer (expected);
1352 	}
1353       strides->safe_push (CHREC_RIGHT (sl));
1354       sl = CHREC_LEFT (sl);
1355       expected = loop_outer (expected);
1356     }
1357   if (! tree_contains_chrecs (sl, NULL))
1358     while (expected != loop_outer (loop_nest))
1359       {
1360 	strides->safe_push (size_int (0));
1361 	expected = loop_outer (expected);
1362       }
1363 }
1364 
1365 /* Given loop nest LOOP_NEST with innermost LOOP, the function computes
1366    access strides with respect to each level loop for all data refs in
1367    DATAREFS from inner loop to outer loop.  On success, it returns the
1368    outermost loop that access strides can be computed successfully for
1369    all data references.  If access strides cannot be computed at least
1370    for two levels of loop for any data reference, it returns NULL.  */
1371 
1372 static class loop *
compute_access_strides(class loop * loop_nest,class loop * loop,vec<data_reference_p> datarefs)1373 compute_access_strides (class loop *loop_nest, class loop *loop,
1374 			vec<data_reference_p> datarefs)
1375 {
1376   unsigned i, j, num_loops = (unsigned) -1;
1377   data_reference_p dr;
1378   vec<tree> *stride;
1379 
1380   class loop *interesting_loop_nest = loop_nest;
1381   for (i = 0; datarefs.iterate (i, &dr); ++i)
1382     {
1383       compute_access_stride (interesting_loop_nest, loop, dr);
1384       stride = DR_ACCESS_STRIDE (dr);
1385       if (stride->length () < num_loops)
1386 	{
1387 	  num_loops = stride->length ();
1388 	  if (num_loops < 2)
1389 	    return NULL;
1390 	}
1391     }
1392 
1393   for (i = 0; datarefs.iterate (i, &dr); ++i)
1394     {
1395       stride = DR_ACCESS_STRIDE (dr);
1396       if (stride->length () > num_loops)
1397 	stride->truncate (num_loops);
1398 
1399       for (j = 0; j < (num_loops >> 1); ++j)
1400 	std::swap ((*stride)[j], (*stride)[num_loops - j - 1]);
1401     }
1402 
1403   loop = superloop_at_depth (loop, loop_depth (loop) + 1 - num_loops);
1404   gcc_assert (loop_nest == loop || flow_loop_nested_p (loop_nest, loop));
1405   return loop;
1406 }
1407 
1408 /* Prune access strides for data references in DATAREFS by removing strides
1409    of loops that isn't in current LOOP_NEST.  */
1410 
1411 static void
prune_access_strides_not_in_loop(class loop * loop_nest,class loop * innermost,vec<data_reference_p> datarefs)1412 prune_access_strides_not_in_loop (class loop *loop_nest,
1413 				  class loop *innermost,
1414 				  vec<data_reference_p> datarefs)
1415 {
1416   data_reference_p dr;
1417   unsigned num_loops = loop_depth (innermost) - loop_depth (loop_nest) + 1;
1418   gcc_assert (num_loops > 1);
1419 
1420   /* Block remove strides of loops that is not in current loop nest.  */
1421   for (unsigned i = 0; datarefs.iterate (i, &dr); ++i)
1422     {
1423       vec<tree> *stride = DR_ACCESS_STRIDE (dr);
1424       if (stride->length () > num_loops)
1425 	stride->block_remove (0, stride->length () - num_loops);
1426     }
1427 }
1428 
1429 /* Dump access strides for all DATAREFS.  */
1430 
1431 static void
dump_access_strides(vec<data_reference_p> datarefs)1432 dump_access_strides (vec<data_reference_p> datarefs)
1433 {
1434   data_reference_p dr;
1435   fprintf (dump_file, "Access Strides for DRs:\n");
1436   for (unsigned i = 0; datarefs.iterate (i, &dr); ++i)
1437     {
1438       fprintf (dump_file, "  ");
1439       print_generic_expr (dump_file, DR_REF (dr), TDF_SLIM);
1440       fprintf (dump_file, ":\t\t<");
1441 
1442       vec<tree> *stride = DR_ACCESS_STRIDE (dr);
1443       unsigned num_loops = stride->length ();
1444       for (unsigned j = 0; j < num_loops; ++j)
1445 	{
1446 	  print_generic_expr (dump_file, (*stride)[j], TDF_SLIM);
1447 	  fprintf (dump_file, "%s", (j < num_loops - 1) ? ",\t" : ">\n");
1448 	}
1449     }
1450 }
1451 
1452 /* Return true if it's profitable to interchange two loops whose index
1453    in whole loop nest vector are I_IDX/O_IDX respectively.  The function
1454    computes and compares three types information from all DATAREFS:
1455      1) Access stride for loop I_IDX and O_IDX.
1456      2) Number of invariant memory references with respect to I_IDX before
1457 	and after loop interchange.
1458      3) Flags indicating if all memory references access sequential memory
1459 	in ILOOP, before and after loop interchange.
1460    If INNMOST_LOOP_P is true, the two loops for interchanging are the two
1461    innermost loops in loop nest.  This function also dumps information if
1462    DUMP_INFO_P is true.  */
1463 
1464 static bool
should_interchange_loops(unsigned i_idx,unsigned o_idx,vec<data_reference_p> datarefs,unsigned i_stmt_cost,unsigned o_stmt_cost,bool innermost_loops_p,bool dump_info_p=true)1465 should_interchange_loops (unsigned i_idx, unsigned o_idx,
1466 			  vec<data_reference_p> datarefs,
1467 			  unsigned i_stmt_cost, unsigned o_stmt_cost,
1468 			  bool innermost_loops_p, bool dump_info_p = true)
1469 {
1470   unsigned HOST_WIDE_INT ratio;
1471   unsigned i, j, num_old_inv_drs = 0, num_new_inv_drs = 0;
1472   struct data_reference *dr;
1473   bool all_seq_dr_before_p = true, all_seq_dr_after_p = true;
1474   widest_int iloop_strides = 0, oloop_strides = 0;
1475   unsigned num_unresolved_drs = 0;
1476   unsigned num_resolved_ok_drs = 0;
1477   unsigned num_resolved_not_ok_drs = 0;
1478 
1479   if (dump_info_p && dump_file && (dump_flags & TDF_DETAILS))
1480     fprintf (dump_file, "\nData ref strides:\n\tmem_ref:\t\tiloop\toloop\n");
1481 
1482   for (i = 0; datarefs.iterate (i, &dr); ++i)
1483     {
1484       vec<tree> *stride = DR_ACCESS_STRIDE (dr);
1485       tree iloop_stride = (*stride)[i_idx], oloop_stride = (*stride)[o_idx];
1486 
1487       bool subloop_stride_p = false;
1488       /* Data ref can't be invariant or sequential access at current loop if
1489 	 its address changes with respect to any subloops.  */
1490       for (j = i_idx + 1; j < stride->length (); ++j)
1491 	if (!integer_zerop ((*stride)[j]))
1492 	  {
1493 	    subloop_stride_p = true;
1494 	    break;
1495 	  }
1496 
1497       if (integer_zerop (iloop_stride))
1498 	{
1499 	  if (!subloop_stride_p)
1500 	    num_old_inv_drs++;
1501 	}
1502       if (integer_zerop (oloop_stride))
1503 	{
1504 	  if (!subloop_stride_p)
1505 	    num_new_inv_drs++;
1506 	}
1507 
1508       if (TREE_CODE (iloop_stride) == INTEGER_CST
1509 	  && TREE_CODE (oloop_stride) == INTEGER_CST)
1510 	{
1511 	  iloop_strides = wi::add (iloop_strides, wi::to_widest (iloop_stride));
1512 	  oloop_strides = wi::add (oloop_strides, wi::to_widest (oloop_stride));
1513 	}
1514       else if (multiple_of_p (TREE_TYPE (iloop_stride),
1515 			      iloop_stride, oloop_stride))
1516 	num_resolved_ok_drs++;
1517       else if (multiple_of_p (TREE_TYPE (iloop_stride),
1518 			      oloop_stride, iloop_stride))
1519 	num_resolved_not_ok_drs++;
1520       else
1521 	num_unresolved_drs++;
1522 
1523       /* Data ref can't be sequential access if its address changes in sub
1524 	 loop.  */
1525       if (subloop_stride_p)
1526 	{
1527 	  all_seq_dr_before_p = false;
1528 	  all_seq_dr_after_p = false;
1529 	  continue;
1530 	}
1531       /* Track if all data references are sequential accesses before/after loop
1532 	 interchange.  Note invariant is considered sequential here.  */
1533       tree access_size = TYPE_SIZE_UNIT (TREE_TYPE (DR_REF (dr)));
1534       if (all_seq_dr_before_p
1535 	  && ! (integer_zerop (iloop_stride)
1536 		|| operand_equal_p (access_size, iloop_stride, 0)))
1537 	all_seq_dr_before_p = false;
1538       if (all_seq_dr_after_p
1539 	  && ! (integer_zerop (oloop_stride)
1540 		|| operand_equal_p (access_size, oloop_stride, 0)))
1541 	all_seq_dr_after_p = false;
1542     }
1543 
1544   if (dump_info_p && dump_file && (dump_flags & TDF_DETAILS))
1545     {
1546       fprintf (dump_file, "\toverall:\t\t");
1547       print_decu (iloop_strides, dump_file);
1548       fprintf (dump_file, "\t");
1549       print_decu (oloop_strides, dump_file);
1550       fprintf (dump_file, "\n");
1551 
1552       fprintf (dump_file, "Invariant data ref: before(%d), after(%d)\n",
1553 	       num_old_inv_drs, num_new_inv_drs);
1554       fprintf (dump_file, "All consecutive stride: before(%s), after(%s)\n",
1555 	       all_seq_dr_before_p ? "true" : "false",
1556 	       all_seq_dr_after_p ? "true" : "false");
1557       fprintf (dump_file, "OK to interchage with variable strides: %d\n",
1558 	       num_resolved_ok_drs);
1559       fprintf (dump_file, "Not OK to interchage with variable strides: %d\n",
1560 	       num_resolved_not_ok_drs);
1561       fprintf (dump_file, "Variable strides we cannot decide: %d\n",
1562 	       num_unresolved_drs);
1563       fprintf (dump_file, "Stmt cost of inner loop: %d\n", i_stmt_cost);
1564       fprintf (dump_file, "Stmt cost of outer loop: %d\n", o_stmt_cost);
1565     }
1566 
1567   if (num_unresolved_drs != 0 || num_resolved_not_ok_drs != 0)
1568     return false;
1569 
1570   /* Stmts of outer loop will be moved to inner loop.  If there are two many
1571      such stmts, it could make inner loop costly.  Here we compare stmt cost
1572      between outer and inner loops.  */
1573   if (i_stmt_cost && o_stmt_cost
1574       && num_old_inv_drs + o_stmt_cost > num_new_inv_drs
1575       && o_stmt_cost * STMT_COST_RATIO > i_stmt_cost)
1576     return false;
1577 
1578   /* We use different stride comparison ratio for interchanging innermost
1579      two loops or not.  The idea is to be conservative in interchange for
1580      the innermost loops.  */
1581   ratio = innermost_loops_p ? INNER_STRIDE_RATIO : OUTER_STRIDE_RATIO;
1582   /* Do interchange if it gives better data locality behavior.  */
1583   if (wi::gtu_p (iloop_strides, wi::mul (oloop_strides, ratio)))
1584     return true;
1585   if (wi::gtu_p (iloop_strides, oloop_strides))
1586     {
1587       /* Or it creates more invariant memory references.  */
1588       if ((!all_seq_dr_before_p || all_seq_dr_after_p)
1589 	  && num_new_inv_drs > num_old_inv_drs)
1590 	return true;
1591       /* Or it makes all memory references sequential.  */
1592       if (num_new_inv_drs >= num_old_inv_drs
1593 	  && !all_seq_dr_before_p && all_seq_dr_after_p)
1594 	return true;
1595     }
1596 
1597   return false;
1598 }
1599 
1600 /* Try to interchange inner loop of a loop nest to outer level.  */
1601 
1602 bool
interchange(vec<data_reference_p> datarefs,vec<ddr_p> ddrs)1603 tree_loop_interchange::interchange (vec<data_reference_p> datarefs,
1604 				    vec<ddr_p> ddrs)
1605 {
1606   dump_user_location_t loc = find_loop_location (m_loop_nest[0]);
1607   bool changed_p = false;
1608   /* In each iteration we try to interchange I-th loop with (I+1)-th loop.
1609      The overall effect is to push inner loop to outermost level in whole
1610      loop nest.  */
1611   for (unsigned i = m_loop_nest.length (); i > 1; --i)
1612     {
1613       unsigned i_idx = i - 1, o_idx = i - 2;
1614 
1615       /* Check validity for loop interchange.  */
1616       if (!valid_data_dependences (i_idx, o_idx, ddrs))
1617 	break;
1618 
1619       loop_cand iloop (m_loop_nest[i_idx], m_loop_nest[o_idx]);
1620       loop_cand oloop (m_loop_nest[o_idx], m_loop_nest[o_idx]);
1621 
1622       /* Check if we can do transformation for loop interchange.  */
1623       if (!iloop.analyze_carried_vars (NULL)
1624 	  || !iloop.analyze_lcssa_phis ()
1625 	  || !oloop.analyze_carried_vars (&iloop)
1626 	  || !oloop.analyze_lcssa_phis ()
1627 	  || !iloop.can_interchange_p (NULL)
1628 	  || !oloop.can_interchange_p (&iloop))
1629 	break;
1630 
1631       /* Outer loop's stmts will be moved to inner loop during interchange.
1632 	 If there are many of them, it may make inner loop very costly.  We
1633 	 need to check number of outer loop's stmts in profit cost model of
1634 	 interchange.  */
1635       int stmt_cost = oloop.m_num_stmts;
1636       /* Count out the exit checking stmt of outer loop.  */
1637       stmt_cost --;
1638       /* Count out IV's increasing stmt, IVOPTs takes care if it.  */
1639       stmt_cost -= oloop.m_inductions.length ();
1640       /* Count in the additional load and cond_expr stmts caused by inner
1641 	 loop's constant initialized reduction.  */
1642       stmt_cost += iloop.m_const_init_reduc * 2;
1643       if (stmt_cost < 0)
1644 	stmt_cost = 0;
1645 
1646       /* Check profitability for loop interchange.  */
1647       if (should_interchange_loops (i_idx, o_idx, datarefs,
1648 				    (unsigned) iloop.m_num_stmts,
1649 				    (unsigned) stmt_cost,
1650 				    iloop.m_loop->inner == NULL))
1651 	{
1652 	  if (dump_file && (dump_flags & TDF_DETAILS))
1653 	    fprintf (dump_file,
1654 		     "Loop_pair<outer:%d, inner:%d> is interchanged\n\n",
1655 		     oloop.m_loop->num, iloop.m_loop->num);
1656 
1657 	  changed_p = true;
1658 	  interchange_loops (iloop, oloop);
1659 	  /* No need to update if there is no further loop interchange.  */
1660 	  if (o_idx > 0)
1661 	    update_data_info (i_idx, o_idx, datarefs, ddrs);
1662 	}
1663       else
1664 	{
1665 	  if (dump_file && (dump_flags & TDF_DETAILS))
1666 	    fprintf (dump_file,
1667 		     "Loop_pair<outer:%d, inner:%d> is not interchanged\n\n",
1668 		     oloop.m_loop->num, iloop.m_loop->num);
1669 	}
1670     }
1671   simple_dce_from_worklist (m_dce_seeds);
1672 
1673   if (changed_p && dump_enabled_p ())
1674     dump_printf_loc (MSG_OPTIMIZED_LOCATIONS, loc,
1675 		     "loops interchanged in loop nest\n");
1676 
1677   return changed_p;
1678 }
1679 
1680 
1681 /* Loop interchange pass.  */
1682 
1683 namespace {
1684 
1685 const pass_data pass_data_linterchange =
1686 {
1687   GIMPLE_PASS, /* type */
1688   "linterchange", /* name */
1689   OPTGROUP_LOOP, /* optinfo_flags */
1690   TV_LINTERCHANGE, /* tv_id */
1691   PROP_cfg, /* properties_required */
1692   0, /* properties_provided */
1693   0, /* properties_destroyed */
1694   0, /* todo_flags_start */
1695   0, /* todo_flags_finish */
1696 };
1697 
1698 class pass_linterchange : public gimple_opt_pass
1699 {
1700 public:
pass_linterchange(gcc::context * ctxt)1701   pass_linterchange (gcc::context *ctxt)
1702     : gimple_opt_pass (pass_data_linterchange, ctxt)
1703   {}
1704 
1705   /* opt_pass methods: */
clone()1706   opt_pass * clone () { return new pass_linterchange (m_ctxt); }
gate(function *)1707   virtual bool gate (function *) { return flag_loop_interchange; }
1708   virtual unsigned int execute (function *);
1709 
1710 }; // class pass_linterchange
1711 
1712 
1713 /* Return true if LOOP has proper form for interchange.  We check three
1714    conditions in the function:
1715      1) In general, a loop can be interchanged only if it doesn't have
1716 	basic blocks other than header, exit and latch besides possible
1717 	inner loop nest.  This basically restricts loop interchange to
1718 	below form loop nests:
1719 
1720           header<---+
1721             |       |
1722             v       |
1723         INNER_LOOP  |
1724             |       |
1725             v       |
1726           exit--->latch
1727 
1728      2) Data reference in basic block that executes in different times
1729 	than loop head/exit is not allowed.
1730      3) Record the innermost outer loop that doesn't form rectangle loop
1731 	nest with LOOP.  */
1732 
1733 static bool
proper_loop_form_for_interchange(class loop * loop,class loop ** min_outer)1734 proper_loop_form_for_interchange (class loop *loop, class loop **min_outer)
1735 {
1736   edge e0, e1, exit;
1737 
1738   /* Don't interchange if loop has unsupported information for the moment.  */
1739   if (loop->safelen > 0
1740       || loop->constraints != 0
1741       || loop->can_be_parallel
1742       || loop->dont_vectorize
1743       || loop->force_vectorize
1744       || loop->in_oacc_kernels_region
1745       || loop->orig_loop_num != 0
1746       || loop->simduid != NULL_TREE)
1747     return false;
1748 
1749   /* Don't interchange if outer loop has basic block other than header, exit
1750      and latch.  */
1751   if (loop->inner != NULL
1752       && loop->num_nodes != loop->inner->num_nodes + 3)
1753     return false;
1754 
1755   if ((exit = single_dom_exit (loop)) == NULL)
1756     return false;
1757 
1758   /* Check control flow on loop header/exit blocks.  */
1759   if (loop->header == exit->src
1760       && (EDGE_COUNT (loop->header->preds) != 2
1761 	  || EDGE_COUNT (loop->header->succs) != 2))
1762     return false;
1763   else if (loop->header != exit->src
1764 	   && (EDGE_COUNT (loop->header->preds) != 2
1765 	       || !single_succ_p (loop->header)
1766 	       || unsupported_edge (single_succ_edge (loop->header))
1767 	       || EDGE_COUNT (exit->src->succs) != 2
1768 	       || !single_pred_p (exit->src)
1769 	       || unsupported_edge (single_pred_edge (exit->src))))
1770     return false;
1771 
1772   e0 = EDGE_PRED (loop->header, 0);
1773   e1 = EDGE_PRED (loop->header, 1);
1774   if (unsupported_edge (e0) || unsupported_edge (e1)
1775       || (e0->src != loop->latch && e1->src != loop->latch)
1776       || (e0->src->loop_father == loop && e1->src->loop_father == loop))
1777     return false;
1778 
1779   e0 = EDGE_SUCC (exit->src, 0);
1780   e1 = EDGE_SUCC (exit->src, 1);
1781   if (unsupported_edge (e0) || unsupported_edge (e1)
1782       || (e0->dest != loop->latch && e1->dest != loop->latch)
1783       || (e0->dest->loop_father == loop && e1->dest->loop_father == loop))
1784     return false;
1785 
1786   /* Don't interchange if any reference is in basic block that doesn't
1787      dominate exit block.  */
1788   basic_block *bbs = get_loop_body (loop);
1789   for (unsigned i = 0; i < loop->num_nodes; i++)
1790     {
1791       basic_block bb = bbs[i];
1792 
1793       if (bb->loop_father != loop
1794 	  || bb == loop->header || bb == exit->src
1795 	  || dominated_by_p (CDI_DOMINATORS, exit->src, bb))
1796 	continue;
1797 
1798       for (gimple_stmt_iterator gsi = gsi_start_nondebug_bb (bb);
1799 	   !gsi_end_p (gsi); gsi_next_nondebug (&gsi))
1800 	if (gimple_vuse (gsi_stmt (gsi)))
1801 	  {
1802 	    free (bbs);
1803 	    return false;
1804 	  }
1805     }
1806   free (bbs);
1807 
1808   tree niters = number_of_latch_executions (loop);
1809   niters = analyze_scalar_evolution (loop_outer (loop), niters);
1810   if (!niters || chrec_contains_undetermined (niters))
1811     return false;
1812 
1813   /* Record the innermost outer loop that doesn't form rectangle loop nest.  */
1814   for (loop_p loop2 = loop_outer (loop);
1815        loop2 && flow_loop_nested_p (*min_outer, loop2);
1816        loop2 = loop_outer (loop2))
1817     {
1818       niters = instantiate_scev (loop_preheader_edge (loop2),
1819 				 loop_outer (loop), niters);
1820       if (!evolution_function_is_invariant_p (niters, loop2->num))
1821 	{
1822 	  *min_outer = loop2;
1823 	  break;
1824 	}
1825     }
1826   return true;
1827 }
1828 
1829 /* Return true if any two adjacent loops in loop nest [INNERMOST, LOOP_NEST]
1830    should be interchanged by looking into all DATAREFS.  */
1831 
1832 static bool
should_interchange_loop_nest(class loop * loop_nest,class loop * innermost,vec<data_reference_p> datarefs)1833 should_interchange_loop_nest (class loop *loop_nest, class loop *innermost,
1834 			      vec<data_reference_p> datarefs)
1835 {
1836   unsigned idx = loop_depth (innermost) - loop_depth (loop_nest);
1837   gcc_assert (idx > 0);
1838 
1839   /* Check if any two adjacent loops should be interchanged.  */
1840   for (class loop *loop = innermost;
1841        loop != loop_nest; loop = loop_outer (loop), idx--)
1842     if (should_interchange_loops (idx, idx - 1, datarefs, 0, 0,
1843 				  loop == innermost, false))
1844       return true;
1845 
1846   return false;
1847 }
1848 
1849 /* Given loop nest LOOP_NEST and data references DATAREFS, compute data
1850    dependences for loop interchange and store it in DDRS.  Note we compute
1851    dependences directly rather than call generic interface so that we can
1852    return on unknown dependence instantly.  */
1853 
1854 static bool
tree_loop_interchange_compute_ddrs(vec<loop_p> loop_nest,vec<data_reference_p> datarefs,vec<ddr_p> * ddrs)1855 tree_loop_interchange_compute_ddrs (vec<loop_p> loop_nest,
1856 				    vec<data_reference_p> datarefs,
1857 				    vec<ddr_p> *ddrs)
1858 {
1859   struct data_reference *a, *b;
1860   class loop *innermost = loop_nest.last ();
1861 
1862   for (unsigned i = 0; datarefs.iterate (i, &a); ++i)
1863     {
1864       bool a_outer_p = gimple_bb (DR_STMT (a))->loop_father != innermost;
1865       for (unsigned j = i + 1; datarefs.iterate (j, &b); ++j)
1866 	if (DR_IS_WRITE (a) || DR_IS_WRITE (b))
1867 	  {
1868 	    bool b_outer_p = gimple_bb (DR_STMT (b))->loop_father != innermost;
1869 	    /* Don't support multiple write references in outer loop.  */
1870 	    if (a_outer_p && b_outer_p && DR_IS_WRITE (a) && DR_IS_WRITE (b))
1871 	      return false;
1872 
1873 	    ddr_p ddr = initialize_data_dependence_relation (a, b, loop_nest);
1874 	    ddrs->safe_push (ddr);
1875 	    compute_affine_dependence (ddr, loop_nest[0]);
1876 
1877 	    /* Give up if ddr is unknown dependence or classic direct vector
1878 	       is not available.  */
1879 	    if (DDR_ARE_DEPENDENT (ddr) == chrec_dont_know
1880 		|| (DDR_ARE_DEPENDENT (ddr) == NULL_TREE
1881 		    && DDR_NUM_DIR_VECTS (ddr) == 0))
1882 	      return false;
1883 
1884 	    /* If either data references is in outer loop of nest, we require
1885 	       no dependence here because the data reference need to be moved
1886 	       into inner loop during interchange.  */
1887 	    if (a_outer_p && b_outer_p
1888 		&& operand_equal_p (DR_REF (a), DR_REF (b), 0))
1889 	      continue;
1890 	    if (DDR_ARE_DEPENDENT (ddr) != chrec_known
1891 		&& (a_outer_p || b_outer_p))
1892 	      return false;
1893 	}
1894     }
1895 
1896   return true;
1897 }
1898 
1899 /* Prune DATAREFS by removing any data reference not inside of LOOP.  */
1900 
1901 static inline void
prune_datarefs_not_in_loop(class loop * loop,vec<data_reference_p> datarefs)1902 prune_datarefs_not_in_loop (class loop *loop, vec<data_reference_p> datarefs)
1903 {
1904   unsigned i, j;
1905   struct data_reference *dr;
1906 
1907   for (i = 0, j = 0; datarefs.iterate (i, &dr); ++i)
1908     {
1909       if (flow_bb_inside_loop_p (loop, gimple_bb (DR_STMT (dr))))
1910 	datarefs[j++] = dr;
1911       else
1912 	{
1913 	  if (dr->aux)
1914 	    {
1915 	      DR_ACCESS_STRIDE (dr)->release ();
1916 	      delete (vec<tree> *) dr->aux;
1917 	    }
1918 	  free_data_ref (dr);
1919 	}
1920     }
1921   datarefs.truncate (j);
1922 }
1923 
1924 /* Find and store data references in DATAREFS for LOOP nest.  If there's
1925    difficult data reference in a basic block, we shrink the loop nest to
1926    inner loop of that basic block's father loop.  On success, return the
1927    outer loop of the result loop nest.  */
1928 
1929 static class loop *
prepare_data_references(class loop * loop,vec<data_reference_p> * datarefs)1930 prepare_data_references (class loop *loop, vec<data_reference_p> *datarefs)
1931 {
1932   class loop *loop_nest = loop;
1933   vec<data_reference_p> *bb_refs;
1934   basic_block bb, *bbs = get_loop_body_in_dom_order (loop);
1935 
1936   for (unsigned i = 0; i < loop->num_nodes; i++)
1937     bbs[i]->aux = NULL;
1938 
1939   /* Find data references for all basic blocks.  Shrink loop nest on difficult
1940      data reference.  */
1941   for (unsigned i = 0; loop_nest && i < loop->num_nodes; ++i)
1942     {
1943       bb = bbs[i];
1944       if (!flow_bb_inside_loop_p (loop_nest, bb))
1945 	continue;
1946 
1947       bb_refs = new vec<data_reference_p> ();
1948       if (find_data_references_in_bb (loop, bb, bb_refs) == chrec_dont_know)
1949         {
1950 	  loop_nest = bb->loop_father->inner;
1951 	  if (loop_nest && !loop_nest->inner)
1952 	    loop_nest = NULL;
1953 
1954 	  free_data_refs (*bb_refs);
1955           delete bb_refs;
1956         }
1957       else if (bb_refs->is_empty ())
1958 	{
1959 	  bb_refs->release ();
1960 	  delete bb_refs;
1961 	}
1962       else
1963 	bb->aux = bb_refs;
1964     }
1965 
1966   /* Collect all data references in loop nest.  */
1967   for (unsigned i = 0; i < loop->num_nodes; i++)
1968     {
1969       bb = bbs[i];
1970       if (!bb->aux)
1971 	continue;
1972 
1973       bb_refs = (vec<data_reference_p> *) bb->aux;
1974       if (loop_nest && flow_bb_inside_loop_p (loop_nest, bb))
1975 	{
1976 	  datarefs->safe_splice (*bb_refs);
1977 	  bb_refs->release ();
1978 	}
1979       else
1980 	free_data_refs (*bb_refs);
1981 
1982       delete bb_refs;
1983       bb->aux = NULL;
1984     }
1985   free (bbs);
1986 
1987   return loop_nest;
1988 }
1989 
1990 /* Given innermost LOOP, return true if perfect loop nest can be found and
1991    data dependences can be computed.  If succeed, record the perfect loop
1992    nest in LOOP_NEST; record all data references in DATAREFS and record all
1993    data dependence relations in DDRS.
1994 
1995    We do support a restricted form of imperfect loop nest, i.e, loop nest
1996    with load/store in outer loop initializing/finalizing simple reduction
1997    of the innermost loop.  For such outer loop reference, we require that
1998    it has no dependence with others sinve it will be moved to inner loop
1999    in interchange.  */
2000 
2001 static bool
prepare_perfect_loop_nest(class loop * loop,vec<loop_p> * loop_nest,vec<data_reference_p> * datarefs,vec<ddr_p> * ddrs)2002 prepare_perfect_loop_nest (class loop *loop, vec<loop_p> *loop_nest,
2003 			   vec<data_reference_p> *datarefs, vec<ddr_p> *ddrs)
2004 {
2005   class loop *start_loop = NULL, *innermost = loop;
2006   class loop *outermost = loops_for_fn (cfun)->tree_root;
2007 
2008   /* Find loop nest from the innermost loop.  The outermost is the innermost
2009      outer*/
2010   while (loop->num != 0 && loop->inner == start_loop
2011 	 && flow_loop_nested_p (outermost, loop))
2012     {
2013       if (!proper_loop_form_for_interchange (loop, &outermost))
2014 	break;
2015 
2016       start_loop = loop;
2017       /* If this loop has sibling loop, the father loop won't be in perfect
2018 	 loop nest.  */
2019       if (loop->next != NULL)
2020 	break;
2021 
2022       loop = loop_outer (loop);
2023     }
2024   if (!start_loop || !start_loop->inner)
2025     return false;
2026 
2027   /* Prepare the data reference vector for the loop nest, pruning outer
2028      loops we cannot handle.  */
2029   start_loop = prepare_data_references (start_loop, datarefs);
2030   if (!start_loop
2031       /* Check if there is no data reference.  */
2032       || datarefs->is_empty ()
2033       /* Check if there are too many of data references.  */
2034       || (int) datarefs->length () > MAX_DATAREFS)
2035     return false;
2036 
2037   /* Compute access strides for all data references, pruning outer
2038      loops we cannot analyze refs in.  */
2039   start_loop = compute_access_strides (start_loop, innermost, *datarefs);
2040   if (!start_loop)
2041     return false;
2042 
2043   /* Check if any interchange is profitable in the loop nest.  */
2044   if (!should_interchange_loop_nest (start_loop, innermost, *datarefs))
2045     return false;
2046 
2047   /* Check if data dependences can be computed for loop nest starting from
2048      start_loop.  */
2049   loop = start_loop;
2050   do {
2051     loop_nest->truncate (0);
2052 
2053     if (loop != start_loop)
2054       prune_datarefs_not_in_loop (start_loop, *datarefs);
2055 
2056     if (find_loop_nest (start_loop, loop_nest)
2057 	&& tree_loop_interchange_compute_ddrs (*loop_nest, *datarefs, ddrs))
2058       {
2059 	if (dump_file && (dump_flags & TDF_DETAILS))
2060 	  fprintf (dump_file,
2061 		   "\nConsider loop interchange for loop_nest<%d - %d>\n",
2062 		   start_loop->num, innermost->num);
2063 
2064 	if (loop != start_loop)
2065 	  prune_access_strides_not_in_loop (start_loop, innermost, *datarefs);
2066 
2067 	if (dump_file && (dump_flags & TDF_DETAILS))
2068 	  dump_access_strides (*datarefs);
2069 
2070 	return true;
2071       }
2072 
2073     free_dependence_relations (*ddrs);
2074     *ddrs = vNULL;
2075     /* Try to compute data dependences with the outermost loop stripped.  */
2076     loop = start_loop;
2077     start_loop = start_loop->inner;
2078   } while (start_loop && start_loop->inner);
2079 
2080   return false;
2081 }
2082 
2083 /* Main entry for loop interchange pass.  */
2084 
2085 unsigned int
execute(function * fun)2086 pass_linterchange::execute (function *fun)
2087 {
2088   if (number_of_loops (fun) <= 2)
2089     return 0;
2090 
2091   bool changed_p = false;
2092   for (auto loop : loops_list (cfun, LI_ONLY_INNERMOST))
2093     {
2094       vec<loop_p> loop_nest = vNULL;
2095       vec<data_reference_p> datarefs = vNULL;
2096       vec<ddr_p> ddrs = vNULL;
2097       if (prepare_perfect_loop_nest (loop, &loop_nest, &datarefs, &ddrs))
2098 	{
2099 	  tree_loop_interchange loop_interchange (loop_nest);
2100 	  changed_p |= loop_interchange.interchange (datarefs, ddrs);
2101 	}
2102       free_dependence_relations (ddrs);
2103       free_data_refs_with_aux (datarefs);
2104       loop_nest.release ();
2105     }
2106 
2107   if (changed_p)
2108     {
2109       unsigned todo = TODO_update_ssa_only_virtuals;
2110       todo |= loop_invariant_motion_in_fun (cfun, false);
2111       scev_reset ();
2112       return todo;
2113     }
2114   return 0;
2115 }
2116 
2117 } // anon namespace
2118 
2119 gimple_opt_pass *
make_pass_linterchange(gcc::context * ctxt)2120 make_pass_linterchange (gcc::context *ctxt)
2121 {
2122   return new pass_linterchange (ctxt);
2123 }
2124