1 /* Loop interchange.
2    Copyright (C) 2017-2018 Free Software Foundation, Inc.
3    Contributed by ARM Ltd.
4 
5 This file is part of GCC.
6 
7 GCC is free software; you can redistribute it and/or modify it
8 under the terms of the GNU General Public License as published by the
9 Free Software Foundation; either version 3, or (at your option) any
10 later version.
11 
12 GCC is distributed in the hope that it will be useful, but WITHOUT
13 ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
14 FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License
15 for more details.
16 
17 You should have received a copy of the GNU General Public License
18 along with GCC; see the file COPYING3.  If not see
19 <http://www.gnu.org/licenses/>.  */
20 
21 #include "config.h"
22 #include "system.h"
23 #include "coretypes.h"
24 #include "backend.h"
25 #include "is-a.h"
26 #include "tree.h"
27 #include "gimple.h"
28 #include "tree-pass.h"
29 #include "ssa.h"
30 #include "gimple-pretty-print.h"
31 #include "fold-const.h"
32 #include "gimplify.h"
33 #include "gimple-iterator.h"
34 #include "gimplify-me.h"
35 #include "cfgloop.h"
36 #include "params.h"
37 #include "tree-ssa.h"
38 #include "tree-scalar-evolution.h"
39 #include "tree-ssa-loop-manip.h"
40 #include "tree-ssa-loop-niter.h"
41 #include "tree-ssa-loop-ivopts.h"
42 #include "tree-ssa-dce.h"
43 #include "tree-data-ref.h"
44 #include "tree-vectorizer.h"
45 
46 /* This pass performs loop interchange: for example, the loop nest
47 
48    for (int j = 0; j < N; j++)
49      for (int k = 0; k < N; k++)
50        for (int i = 0; i < N; i++)
51 	 c[i][j] = c[i][j] + a[i][k]*b[k][j];
52 
53    is transformed to
54 
55    for (int i = 0; i < N; i++)
56      for (int j = 0; j < N; j++)
57        for (int k = 0; k < N; k++)
58 	 c[i][j] = c[i][j] + a[i][k]*b[k][j];
59 
60    This pass implements loop interchange in the following steps:
61 
62      1) Find perfect loop nest for each innermost loop and compute data
63 	dependence relations for it.  For above example, loop nest is
64 	<loop_j, loop_k, loop_i>.
65      2) From innermost to outermost loop, this pass tries to interchange
66 	each loop pair.  For above case, it firstly tries to interchange
67 	<loop_k, loop_i> and loop nest becomes <loop_j, loop_i, loop_k>.
68 	Then it tries to interchange <loop_j, loop_i> and loop nest becomes
69 	<loop_i, loop_j, loop_k>.  The overall effect is to move innermost
70 	loop to the outermost position.  For loop pair <loop_i, loop_j>
71 	to be interchanged, we:
72      3) Check if data dependence relations are valid for loop interchange.
73      4) Check if both loops can be interchanged in terms of transformation.
74      5) Check if interchanging the two loops is profitable.
75      6) Interchange the two loops by mapping induction variables.
76 
77    This pass also handles reductions in loop nest.  So far we only support
78    simple reduction of inner loop and double reduction of the loop nest.  */
79 
80 /* Maximum number of stmts in each loop that should be interchanged.  */
81 #define MAX_NUM_STMT    (PARAM_VALUE (PARAM_LOOP_INTERCHANGE_MAX_NUM_STMTS))
82 /* Maximum number of data references in loop nest.  */
83 #define MAX_DATAREFS    (PARAM_VALUE (PARAM_LOOP_MAX_DATAREFS_FOR_DATADEPS))
84 
85 /* Comparison ratio of access stride between inner/outer loops to be
86    interchanged.  This is the minimum stride ratio for loop interchange
87    to be profitable.  */
88 #define OUTER_STRIDE_RATIO  (PARAM_VALUE (PARAM_LOOP_INTERCHANGE_STRIDE_RATIO))
89 /* The same as above, but we require higher ratio for interchanging the
90    innermost two loops.  */
91 #define INNER_STRIDE_RATIO  ((OUTER_STRIDE_RATIO) + 1)
92 
93 /* Comparison ratio of stmt cost between inner/outer loops.  Loops won't
94    be interchanged if outer loop has too many stmts.  */
95 #define STMT_COST_RATIO     (3)
96 
97 /* Vector of strides that DR accesses in each level loop of a loop nest.  */
98 #define DR_ACCESS_STRIDE(dr) ((vec<tree> *) dr->aux)
99 
/* Structure recording a loop induction variable (IV).  Instances are
   allocated with XCNEW and owned by loop_cand::m_inductions, which frees
   them in the loop_cand destructor.  */
typedef struct induction
{
  /* IV itself.  */
  tree var;
  /* IV's initializing value, which is the init arg of the IV PHI node.  */
  tree init_val;
  /* IV's initializing expr, which is (the expanded result of) init_val.  */
  tree init_expr;
  /* IV's step.  Zero if VAR turned out to be loop invariant.  */
  tree step;
} *induction_p;
112 
/* Enum type for loop reduction variable.  */
enum reduction_type
{
  /* Not (yet) classified as one of the supported reduction kinds.  Zero so
     that XCNEW-allocated reductions start out unknown.  */
  UNKNOWN_RTYPE = 0,
  /* Simple reduction of the inner loop, e.g. one created by store motion;
     classified by loop_cand::classify_simple_reduction.  */
  SIMPLE_RTYPE,
  /* Double reduction: an outer loop reduction that is updated through an
     inner loop's simple reduction; see analyze_oloop_reduction_var.  */
  DOUBLE_RTYPE
};
120 
/* Structure recording loop reduction variable.  Instances are allocated
   with XCNEW and owned by loop_cand::m_reductions, which frees them in the
   loop_cand destructor.  */
typedef struct reduction
{
  /* Reduction itself.  */
  tree var;
  /* PHI node defining reduction variable.  */
  gphi *phi;
  /* Init and next variables of the reduction.  */
  tree init;
  tree next;
  /* Lcssa PHI node if reduction is used outside of its definition loop.  */
  gphi *lcssa_phi;
  /* Stmts defining init and next.  */
  gimple *producer;
  gimple *consumer;
  /* If init is loaded from memory, this is the loading memory reference.  */
  tree init_ref;
  /* If reduction is finally stored to memory, this is the stored memory
     reference.  */
  tree fini_ref;
  /* Classification of this reduction; UNKNOWN_RTYPE until analysis
     succeeds in classifying it.  */
  enum reduction_type type;
} *reduction_p;
143 
144 
145 /* Dump reduction RE.  */
146 
147 static void
148 dump_reduction (reduction_p re)
149 {
150   if (re->type == SIMPLE_RTYPE)
151     fprintf (dump_file, "  Simple reduction:  ");
152   else if (re->type == DOUBLE_RTYPE)
153     fprintf (dump_file, "  Double reduction:  ");
154   else
155     fprintf (dump_file, "  Unknown reduction:  ");
156 
157   print_gimple_stmt (dump_file, re->phi, 0);
158 }
159 
/* Dump LOOP's induction variable IV to the dump file in the chrec-like
   format "var = {init, step}_loopnum".  Callers are expected to have
   checked that dumping is enabled.  */

static void
dump_induction (struct loop *loop, induction_p iv)
{
  fprintf (dump_file, "  Induction:  ");
  print_generic_expr (dump_file, iv->var, TDF_SLIM);
  fprintf (dump_file, " = {");
  print_generic_expr (dump_file, iv->init_expr, TDF_SLIM);
  fprintf (dump_file, ", ");
  print_generic_expr (dump_file, iv->step, TDF_SLIM);
  fprintf (dump_file, "}_%d\n", loop->num);
}
172 
/* Loop candidate for interchange.  Wraps one loop of the nest together
   with the analysis results (inductions, reductions, lcssa PHIs) needed
   to decide and perform interchange.  */

struct loop_cand
{
  loop_cand (struct loop *, struct loop *);
  ~loop_cand ();

  reduction_p find_reduction_by_stmt (gimple *);
  void classify_simple_reduction (reduction_p);
  bool analyze_iloop_reduction_var (tree);
  bool analyze_oloop_reduction_var (loop_cand *, tree);
  bool analyze_induction_var (tree, tree);
  bool analyze_carried_vars (loop_cand *);
  bool analyze_lcssa_phis (void);
  bool can_interchange_p (loop_cand *);
  void undo_simple_reduction (reduction_p, bitmap);

  /* The loop itself.  */
  struct loop *m_loop;
  /* The outer loop for interchange.  It equals to loop if this loop cand
     itself represents the outer loop.  */
  struct loop *m_outer;
  /* Vector of induction variables in loop.  Entries are owned (freed in
     the destructor).  */
  vec<induction_p> m_inductions;
  /* Vector of reduction variables in loop.  Entries are owned (freed in
     the destructor).  */
  vec<reduction_p> m_reductions;
  /* Lcssa PHI nodes of this loop.  */
  vec<gphi *> m_lcssa_nodes;
  /* Single exit edge of this loop.  */
  edge m_exit;
  /* Basic blocks of this loop.  */
  basic_block *m_bbs;
  /* Number of stmts of this loop.  Inner loops' stmts are not included.
     Computed as a side effect of can_interchange_p.  */
  int m_num_stmts;
  /* Number of constant initialized simple reduction.  */
  int m_const_init_reduc;
};
210 
211 /* Constructor.  */
212 
213 loop_cand::loop_cand (struct loop *loop, struct loop *outer)
214   : m_loop (loop), m_outer (outer), m_exit (single_exit (loop)),
215     m_bbs (get_loop_body (loop)), m_num_stmts (0), m_const_init_reduc (0)
216 {
217     m_inductions.create (3);
218     m_reductions.create (3);
219     m_lcssa_nodes.create (3);
220 }
221 
222 /* Destructor.  */
223 
224 loop_cand::~loop_cand ()
225 {
226   induction_p iv;
227   for (unsigned i = 0; m_inductions.iterate (i, &iv); ++i)
228     free (iv);
229 
230   reduction_p re;
231   for (unsigned i = 0; m_reductions.iterate (i, &re); ++i)
232     free (re);
233 
234   m_inductions.release ();
235   m_reductions.release ();
236   m_lcssa_nodes.release ();
237   free (m_bbs);
238 }
239 
240 /* Return single use stmt of VAR in LOOP, otherwise return NULL.  */
241 
242 static gimple *
243 single_use_in_loop (tree var, struct loop *loop)
244 {
245   gimple *stmt, *res = NULL;
246   use_operand_p use_p;
247   imm_use_iterator iterator;
248 
249   FOR_EACH_IMM_USE_FAST (use_p, iterator, var)
250     {
251       stmt = USE_STMT (use_p);
252       if (is_gimple_debug (stmt))
253 	continue;
254 
255       if (!flow_bb_inside_loop_p (loop, gimple_bb (stmt)))
256 	continue;
257 
258       if (res)
259 	return NULL;
260 
261       res = stmt;
262     }
263   return res;
264 }
265 
266 /* Return true if E is unsupported in loop interchange, i.e, E is a complex
267    edge or part of irreducible loop.  */
268 
269 static inline bool
270 unsupported_edge (edge e)
271 {
272   return (e->flags & (EDGE_COMPLEX | EDGE_IRREDUCIBLE_LOOP));
273 }
274 
275 /* Return the reduction if STMT is one of its lcssa PHI, producer or consumer
276    stmt.  */
277 
278 reduction_p
279 loop_cand::find_reduction_by_stmt (gimple *stmt)
280 {
281   gphi *phi = dyn_cast <gphi *> (stmt);
282   reduction_p re;
283 
284   for (unsigned i = 0; m_reductions.iterate (i, &re); ++i)
285     if ((phi != NULL && phi == re->lcssa_phi)
286 	|| (stmt == re->producer || stmt == re->consumer))
287       return re;
288 
289   return NULL;
290 }
291 
/* Return true if the current loop_cand can be interchanged.  ILOOP is not
   NULL if the current loop_cand is the outer loop in the loop nest to be
   interchanged.  As a side effect, this counts the loop's own stmts into
   m_num_stmts.  */

bool
loop_cand::can_interchange_p (loop_cand *iloop)
{
  /* For now we only support at most one reduction.  */
  unsigned allowed_reduction_num = 1;

  /* Only support reduction if the loop nest to be interchanged is the
     innermost two loops.  */
  if ((iloop == NULL && m_loop->inner != NULL)
       || (iloop != NULL && iloop->m_loop->inner != NULL))
    allowed_reduction_num = 0;

  if (m_reductions.length () > allowed_reduction_num
      || (m_reductions.length () == 1
	  && m_reductions[0]->type == UNKNOWN_RTYPE))
    return false;

  /* Only support lcssa PHI node which is for reduction.  */
  if (m_lcssa_nodes.length () > allowed_reduction_num)
    return false;

  /* Check if basic block has any unsupported operation.  Note basic blocks
     of inner loops are not checked here.  */
  for (unsigned i = 0; i < m_loop->num_nodes; i++)
    {
      basic_block bb = m_bbs[i];
      gphi_iterator psi;
      gimple_stmt_iterator gsi;

      /* Skip basic blocks of inner loops.  */
      if (bb->loop_father != m_loop)
       continue;

      for (gsi = gsi_start_bb (bb); !gsi_end_p (gsi); gsi_next (&gsi))
	{
	  gimple *stmt = gsi_stmt (gsi);
	  if (is_gimple_debug (stmt))
	    continue;

	  /* Stmts with side effects can't be moved across loop
	     boundaries.  */
	  if (gimple_has_side_effects (stmt))
	    return false;

	  m_num_stmts++;
	  if (gcall *call = dyn_cast <gcall *> (stmt))
	    {
	      /* In basic block of outer loop, the call should be cheap since
		 it will be moved to inner loop.  */
	      if (iloop != NULL
		  && !gimple_inexpensive_call_p (call))
		return false;
	      continue;
	    }

	  /* The remaining checks only constrain memory accesses in the
	     outer loop of an interchange pair.  */
	  if (!iloop || !gimple_vuse (stmt))
	    continue;

	  /* Support stmt accessing memory in outer loop only if it is for
	     inner loop's reduction.  */
	  if (iloop->find_reduction_by_stmt (stmt))
	    continue;

	  tree lhs;
	  /* Support loop invariant memory reference if it's only used once by
	     inner loop.  */
	  /* ???  How's this checking for invariantness?  */
	  if (gimple_assign_single_p (stmt)
	      && (lhs = gimple_assign_lhs (stmt)) != NULL_TREE
	      && TREE_CODE (lhs) == SSA_NAME
	      && single_use_in_loop (lhs, iloop->m_loop))
	    continue;

	  return false;
	}
      /* Check if loop has too many stmts.  */
      if (m_num_stmts > MAX_NUM_STMT)
	return false;

      /* Allow PHI nodes in any basic block of inner loop, PHI nodes in outer
	 loop's header, or PHI nodes in dest bb of inner loop's exit edge.  */
      if (!iloop || bb == m_loop->header
	  || bb == iloop->m_exit->dest)
	continue;

      /* Don't allow any other PHI nodes.  */
      for (psi = gsi_start_phis (bb); !gsi_end_p (psi); gsi_next (&psi))
	if (!virtual_operand_p (PHI_RESULT (psi.phi ())))
	  return false;
    }

  return true;
}
386 
387 /* Programmers and optimizers (like loop store motion) may optimize code:
388 
389      for (int i = 0; i < N; i++)
390        for (int j = 0; j < N; j++)
391 	 a[i] += b[j][i] * c[j][i];
392 
393    into reduction:
394 
395      for (int i = 0; i < N; i++)
396        {
	 // producer.  Note sum can be initialized to a constant.
398 	 int sum = a[i];
399 	 for (int j = 0; j < N; j++)
400 	   {
401 	     sum += b[j][i] * c[j][i];
402 	   }
403 	 // consumer.
404 	 a[i] = sum;
405        }
406 
   The resulting code can't be interchanged without undoing the optimization.
   This function classifies this kind of reduction and records information so
   that we can undo the store motion during interchange.  */
410 
void
loop_cand::classify_simple_reduction (reduction_p re)
{
  gimple *producer, *consumer;

  /* Check init variable of reduction and how it is initialized.  */
  if (TREE_CODE (re->init) == SSA_NAME)
    {
      /* Init defined by a stmt: the producer must be a memory load
	 executed in the outer loop, otherwise leave RE unclassified.  */
      producer = SSA_NAME_DEF_STMT (re->init);
      re->producer = producer;
      basic_block bb = gimple_bb (producer);
      if (!bb || bb->loop_father != m_outer)
	return;

      if (!gimple_assign_load_p (producer))
	return;

      re->init_ref = gimple_assign_rhs1 (producer);
    }
  else if (CONSTANT_CLASS_P (re->init))
    m_const_init_reduc++;
  else
    return;

  /* Check how reduction variable is used.  The reduction result must be
     stored back to memory by a single consumer stmt in the outer loop.  */
  consumer = single_use_in_loop (PHI_RESULT (re->lcssa_phi), m_outer);
  if (!consumer
      || !gimple_store_p (consumer))
    return;

  re->fini_ref = gimple_get_lhs (consumer);
  re->consumer = consumer;

  /* Simple reduction with constant initializer.  Use a copy of the final
     store's memory reference as the (synthetic) initial load.  */
  if (!re->init_ref)
    {
      gcc_assert (CONSTANT_CLASS_P (re->init));
      re->init_ref = unshare_expr (re->fini_ref);
    }

  /* Require memory references in producer and consumer are the same so
     that we can undo reduction during interchange.  */
  if (re->init_ref && !operand_equal_p (re->init_ref, re->fini_ref, 0))
    return;

  /* All checks passed: this is a simple reduction we know how to undo.  */
  re->type = SIMPLE_RTYPE;
}
458 
/* Analyze reduction variable VAR for inner loop of the loop nest to be
   interchanged.  On success, record the reduction in m_reductions and
   return true.  */

bool
loop_cand::analyze_iloop_reduction_var (tree var)
{
  gphi *phi = as_a <gphi *> (SSA_NAME_DEF_STMT (var));
  gphi *lcssa_phi = NULL, *use_phi;
  tree init = PHI_ARG_DEF_FROM_EDGE (phi, loop_preheader_edge (m_loop));
  tree next = PHI_ARG_DEF_FROM_EDGE (phi, loop_latch_edge (m_loop));
  reduction_p re;
  gimple *stmt, *next_def, *single_use = NULL;
  use_operand_p use_p;
  imm_use_iterator iterator;

  if (TREE_CODE (next) != SSA_NAME)
    return false;

  /* The updated value must be computed inside this loop.  */
  next_def = SSA_NAME_DEF_STMT (next);
  basic_block bb = gimple_bb (next_def);
  if (!bb || !flow_bb_inside_loop_p (m_loop, bb))
    return false;

  /* In restricted reduction, the var is (and must be) used in defining
     the updated var.  The process can be depicted as below:

		var ;; = PHI<init, next>
		 |
		 |
		 v
      +---------------------+
      | reduction operators | <-- other operands
      +---------------------+
		 |
		 |
		 v
		next

     In terms of loop interchange, we don't change how NEXT is computed based
     on VAR and OTHER OPERANDS.  In case of double reduction in loop nest
     to be interchanged, we don't change it at all.  In the case of simple
     reduction in inner loop, we only change how VAR/NEXT is loaded or
     stored.  With these conditions, we can relax restrictions on reduction
     in a way that reduction operation is seen as black box.  In general,
     we can ignore reassociation of reduction operator; we can handle fake
     reductions in which VAR is not even used to compute NEXT.  */
  if (! single_imm_use (var, &use_p, &single_use)
      || ! flow_bb_inside_loop_p (m_loop, gimple_bb (single_use)))
    return false;

  /* Check the reduction operation.  We require a left-associative operation.
     For FP math we also need to be allowed to associate operations.  */
  if (gassign *ass = dyn_cast <gassign *> (single_use))
    {
      enum tree_code code = gimple_assign_rhs_code (ass);
      if (! (associative_tree_code (code)
	     || (code == MINUS_EXPR
		 && use_p->use == gimple_assign_rhs1_ptr (ass)))
	  || (FLOAT_TYPE_P (TREE_TYPE (var))
	      && ! flag_associative_math))
	return false;
    }
  else
    return false;

  /* Handle and verify a series of stmts feeding the reduction op.  */
  if (single_use != next_def
      && !check_reduction_path (UNKNOWN_LOCATION, m_loop, phi, next,
				gimple_assign_rhs_code (single_use)))
    return false;

  /* Only support cases in which INIT is used in inner loop.  */
  if (TREE_CODE (init) == SSA_NAME)
    FOR_EACH_IMM_USE_FAST (use_p, iterator, init)
      {
	stmt = USE_STMT (use_p);
	if (is_gimple_debug (stmt))
	  continue;

	if (!flow_bb_inside_loop_p (m_loop, gimple_bb (stmt)))
	  return false;
      }

  /* The only uses of NEXT may be the reduction PHI itself and a single
     lcssa PHI in the destination of the exit edge.  */
  FOR_EACH_IMM_USE_FAST (use_p, iterator, next)
    {
      stmt = USE_STMT (use_p);
      if (is_gimple_debug (stmt))
	continue;

      /* Or else it's used in PHI itself.  */
      use_phi = dyn_cast <gphi *> (stmt);
      if (use_phi == phi)
	continue;

      if (use_phi != NULL
	  && lcssa_phi == NULL
	  && gimple_bb (stmt) == m_exit->dest
	  && PHI_ARG_DEF_FROM_EDGE (use_phi, m_exit) == next)
	lcssa_phi = use_phi;
      else
	return false;
    }
  if (!lcssa_phi)
    return false;

  /* XCNEW zero-fills, so type starts as UNKNOWN_RTYPE and producer,
     consumer, init_ref and fini_ref start as NULL.  */
  re = XCNEW (struct reduction);
  re->var = var;
  re->init = init;
  re->next = next;
  re->phi = phi;
  re->lcssa_phi = lcssa_phi;

  classify_simple_reduction (re);

  if (dump_file && (dump_flags & TDF_DETAILS))
    dump_reduction (re);

  m_reductions.safe_push (re);
  return true;
}
579 
/* Analyze reduction variable VAR for outer loop of the loop nest to be
   interchanged.  ILOOP is not NULL and points to inner loop.  For the
   moment, we only support double reduction for outer loop, like:

     for (int i = 0; i < n; i++)
       {
	 int sum = 0;

	 for (int j = 0; j < n; j++)    // outer loop
	   for (int k = 0; k < n; k++)  // inner loop
	     sum += a[i][k]*b[k][j];

	 s[i] = sum;
       }

   Note the innermost two loops are the loop nest to be interchanged.
   On success, record the double reduction in m_reductions (and mark the
   inner loop's reduction as DOUBLE_RTYPE too) and return true.  */

bool
loop_cand::analyze_oloop_reduction_var (loop_cand *iloop, tree var)
{
  gphi *phi = as_a <gphi *> (SSA_NAME_DEF_STMT (var));
  gphi *lcssa_phi = NULL, *use_phi;
  tree init = PHI_ARG_DEF_FROM_EDGE (phi, loop_preheader_edge (m_loop));
  tree next = PHI_ARG_DEF_FROM_EDGE (phi, loop_latch_edge (m_loop));
  reduction_p re;
  gimple *stmt, *next_def;
  use_operand_p use_p;
  imm_use_iterator iterator;

  if (TREE_CODE (next) != SSA_NAME)
    return false;

  next_def = SSA_NAME_DEF_STMT (next);
  basic_block bb = gimple_bb (next_def);
  if (!bb || !flow_bb_inside_loop_p (m_loop, bb))
    return false;

  /* Find inner loop's simple reduction that uses var as initializer.  */
  reduction_p inner_re = NULL;
  for (unsigned i = 0; iloop->m_reductions.iterate (i, &inner_re); ++i)
    if (inner_re->init == var || operand_equal_p (inner_re->init, var, 0))
      break;

  /* The inner reduction must exist, must not already be classified (as
     simple or double), and its init must be defined by this PHI.  */
  if (inner_re == NULL
      || inner_re->type != UNKNOWN_RTYPE
      || inner_re->producer != phi)
    return false;

  /* In case of double reduction, outer loop's reduction should be updated
     by inner loop's simple reduction.  */
  if (next_def != inner_re->lcssa_phi)
    return false;

  /* Outer loop's reduction should only be used to initialize inner loop's
     simple reduction.  */
  if (! single_imm_use (var, &use_p, &stmt)
      || stmt != inner_re->phi)
    return false;

  /* Check this reduction is correctly used outside of loop via lcssa phi.  */
  FOR_EACH_IMM_USE_FAST (use_p, iterator, next)
    {
      stmt = USE_STMT (use_p);
      if (is_gimple_debug (stmt))
	continue;

      /* Or else it's used in PHI itself.  */
      use_phi = dyn_cast <gphi *> (stmt);
      if (use_phi == phi)
	continue;

      if (lcssa_phi == NULL
	  && use_phi != NULL
	  && gimple_bb (stmt) == m_exit->dest
	  && PHI_ARG_DEF_FROM_EDGE (use_phi, m_exit) == next)
	lcssa_phi = use_phi;
      else
	return false;
    }
  if (!lcssa_phi)
    return false;

  re = XCNEW (struct reduction);
  re->var = var;
  re->init = init;
  re->next = next;
  re->phi = phi;
  re->lcssa_phi = lcssa_phi;
  re->type = DOUBLE_RTYPE;
  inner_re->type = DOUBLE_RTYPE;

  if (dump_file && (dump_flags & TDF_DETAILS))
    dump_reduction (re);

  m_reductions.safe_push (re);
  return true;
}
678 
/* Return true if VAR is induction variable of current loop whose scev is
   specified by CHREC.  On success the IV is recorded in m_inductions.  */

bool
loop_cand::analyze_induction_var (tree var, tree chrec)
{
  gphi *phi = as_a <gphi *> (SSA_NAME_DEF_STMT (var));
  tree init = PHI_ARG_DEF_FROM_EDGE (phi, loop_preheader_edge (m_loop));

  /* Var is loop invariant, though it's unlikely to happen.  Record it as
     a degenerate IV with a zero step.  */
  if (tree_does_not_contain_chrecs (chrec))
    {
      struct induction *iv = XCNEW (struct induction);
      iv->var = var;
      iv->init_val = init;
      iv->init_expr = chrec;
      iv->step = build_int_cst (TREE_TYPE (chrec), 0);
      m_inductions.safe_push (iv);
      return true;
    }

  /* Otherwise require a simple affine IV of this very loop: a polynomial
     chrec whose init and step parts are themselves chrec-free.  */
  if (TREE_CODE (chrec) != POLYNOMIAL_CHREC
      || CHREC_VARIABLE (chrec) != (unsigned) m_loop->num
      || tree_contains_chrecs (CHREC_LEFT (chrec), NULL)
      || tree_contains_chrecs (CHREC_RIGHT (chrec), NULL))
    return false;

  struct induction *iv = XCNEW (struct induction);
  iv->var = var;
  iv->init_val = init;
  iv->init_expr = CHREC_LEFT (chrec);
  iv->step = CHREC_RIGHT (chrec);

  if (dump_file && (dump_flags & TDF_DETAILS))
    dump_induction (m_loop, iv);

  m_inductions.safe_push (iv);
  return true;
}
718 
719 /* Return true if all loop carried variables defined in loop header can
720    be successfully analyzed.  */
721 
722 bool
723 loop_cand::analyze_carried_vars (loop_cand *iloop)
724 {
725   edge e = loop_preheader_edge (m_outer);
726   gphi_iterator gsi;
727 
728   if (dump_file && (dump_flags & TDF_DETAILS))
729     fprintf (dump_file, "\nLoop(%d) carried vars:\n", m_loop->num);
730 
731   for (gsi = gsi_start_phis (m_loop->header); !gsi_end_p (gsi); gsi_next (&gsi))
732     {
733       gphi *phi = gsi.phi ();
734 
735       tree var = PHI_RESULT (phi);
736       if (virtual_operand_p (var))
737 	continue;
738 
739       tree chrec = analyze_scalar_evolution (m_loop, var);
740       chrec = instantiate_scev (e, m_loop, chrec);
741 
742       /* Analyze var as reduction variable.  */
743       if (chrec_contains_undetermined (chrec)
744 	  || chrec_contains_symbols_defined_in_loop (chrec, m_outer->num))
745 	{
746 	  if (iloop && !analyze_oloop_reduction_var (iloop, var))
747 	    return false;
748 	  if (!iloop && !analyze_iloop_reduction_var (var))
749 	    return false;
750 	}
751       /* Analyze var as induction variable.  */
752       else if (!analyze_induction_var (var, chrec))
753 	return false;
754     }
755 
756   return true;
757 }
758 
759 /* Return TRUE if loop closed PHI nodes can be analyzed successfully.  */
760 
761 bool
762 loop_cand::analyze_lcssa_phis (void)
763 {
764   gphi_iterator gsi;
765   for (gsi = gsi_start_phis (m_exit->dest); !gsi_end_p (gsi); gsi_next (&gsi))
766     {
767       gphi *phi = gsi.phi ();
768 
769       if (virtual_operand_p (PHI_RESULT (phi)))
770 	continue;
771 
772       /* TODO: We only support lcssa phi for reduction for now.  */
773       if (!find_reduction_by_stmt (phi))
774 	return false;
775     }
776 
777   return true;
778 }
779 
/* CONSUMER is a stmt in BB storing reduction result into memory object.
   When the reduction is initialized from a constant value, we need to add
   a stmt loading from the memory object to target basic block in inner
   loop during undoing the reduction.  Problem is that memory reference
   may use ssa variables not dominating the target basic block.  This
   function finds all stmts on which CONSUMER depends in basic block BB,
   records and returns them via STMTS.  The dep stmts are removed from BB
   and appended to STMTS in their original order.  */

static void
find_deps_in_bb_for_stmt (gimple_seq *stmts, basic_block bb, gimple *consumer)
{
  auto_vec<gimple *, 4> worklist;
  use_operand_p use_p;
  ssa_op_iter iter;
  gimple *stmt, *def_stmt;
  gimple_stmt_iterator gsi;

  /* First clear flag for stmts in bb.  GF_PLF_1 is used as the "is a
     dependency" mark below.  */
  for (gsi = gsi_start_bb (bb); !gsi_end_p (gsi); gsi_next (&gsi))
    gimple_set_plf (gsi_stmt (gsi), GF_PLF_1, false);

  /* DFS search all depended stmts in bb and mark flag for these stmts.
     PHIs and stmts outside BB terminate the walk.  */
  worklist.safe_push (consumer);
  while (!worklist.is_empty ())
    {
      stmt = worklist.pop ();
      FOR_EACH_SSA_USE_OPERAND (use_p, stmt, iter, SSA_OP_USE)
	{
	  def_stmt = SSA_NAME_DEF_STMT (USE_FROM_PTR (use_p));

	  if (is_a <gphi *> (def_stmt)
	      || gimple_bb (def_stmt) != bb
	      || gimple_plf (def_stmt, GF_PLF_1))
	    continue;

	  worklist.safe_push (def_stmt);
	}
      gimple_set_plf (stmt, GF_PLF_1, true);
    }
  /* Walk BB up to (but not including) CONSUMER, transferring each marked
     stmt into STMTS; the in-BB order is preserved by the append.  */
  for (gsi = gsi_start_nondebug_bb (bb);
       !gsi_end_p (gsi) && (stmt = gsi_stmt (gsi)) != consumer;)
    {
      /* Move dep stmts to sequence STMTS.  */
      if (gimple_plf (stmt, GF_PLF_1))
	{
	  gsi_remove (&gsi, false);
	  gimple_seq_add_stmt_without_update (stmts, stmt);
	}
      else
	gsi_next_nondebug (&gsi);
    }
}
832 
/* User can write, optimizers can generate simple reduction RE for inner
   loop.  In order to make interchange valid, we have to undo reduction by
   moving producer and consumer stmts into the inner loop.  For example,
   below code:

     init = MEM_REF[idx];		//producer
     loop:
       var = phi<init, next>
       next = var op ...
     reduc_sum = phi<next>
     MEM_REF[idx] = reduc_sum		//consumer

   is transformed into:

     loop:
       new_var = MEM_REF[idx];		//producer after moving
       next = new_var op ...
       MEM_REF[idx] = next;		//consumer after moving

   Note if the reduction variable is initialized to a constant, like:

     var = phi<0.0, next>

   we compute new_var as below:

     loop:
       tmp = MEM_REF[idx];
       new_var = !first_iteration ? tmp : 0.0;

   so that the initial const is used in the first iteration of loop.  Also
   record ssa variables for dead code elimination in DCE_SEEDS.  */

void
loop_cand::undo_simple_reduction (reduction_p re, bitmap dce_seeds)
{
  gimple *stmt;
  gimple_stmt_iterator from, to = gsi_after_labels (m_loop->header);
  gimple_seq stmts = NULL;
  tree new_var;

  /* Prepare the initialization stmts and insert it to inner loop.  */
  if (re->producer != NULL)
    {
      /* Reuse the existing load: detach it from its virtual operand and
	 move it into the inner loop header.  */
      gimple_set_vuse (re->producer, NULL_TREE);
      from = gsi_for_stmt (re->producer);
      gsi_remove (&from, false);
      gimple_seq_add_stmt_without_update (&stmts, re->producer);
      new_var = re->init;
    }
  else
    {
      /* Find all stmts on which expression "MEM_REF[idx]" depends.  */
      find_deps_in_bb_for_stmt (&stmts, gimple_bb (re->consumer), re->consumer);
      /* Because we generate new stmt loading from the MEM_REF to TMP.  */
      tree cond, tmp = copy_ssa_name (re->var);
      stmt = gimple_build_assign (tmp, re->init_ref);
      gimple_seq_add_stmt_without_update (&stmts, stmt);

      /* Init new_var to MEM_REF or CONST depending on if it is the first
	 iteration.  The first-iteration test compares the loop's first IV
	 against its initial value.  */
      induction_p iv = m_inductions[0];
      cond = fold_build2 (NE_EXPR, boolean_type_node, iv->var, iv->init_val);
      new_var = copy_ssa_name (re->var);
      stmt = gimple_build_assign (new_var, COND_EXPR, cond, tmp, re->init);
      gimple_seq_add_stmt_without_update (&stmts, stmt);
    }
  gsi_insert_seq_before (&to, stmts, GSI_SAME_STMT);

  /* Replace all uses of reduction var with new variable.  */
  use_operand_p use_p;
  imm_use_iterator iterator;
  FOR_EACH_IMM_USE_STMT (stmt, iterator, re->var)
    {
      FOR_EACH_IMM_USE_ON_STMT (use_p, iterator)
	SET_USE (use_p, new_var);

      update_stmt (stmt);
    }

  /* Move consumer stmt into inner loop, just after reduction next's def.
     The store now writes NEXT directly and no longer needs its virtual
     operands.  */
  unlink_stmt_vdef (re->consumer);
  release_ssa_name (gimple_vdef (re->consumer));
  gimple_set_vdef (re->consumer, NULL_TREE);
  gimple_set_vuse (re->consumer, NULL_TREE);
  gimple_assign_set_rhs1 (re->consumer, re->next);
  from = gsi_for_stmt (re->consumer);
  to = gsi_for_stmt (SSA_NAME_DEF_STMT (re->next));
  gsi_move_after (&from, &to);

  /* Mark the reduction variables for DCE.  */
  bitmap_set_bit (dce_seeds, SSA_NAME_VERSION (re->var));
  bitmap_set_bit (dce_seeds, SSA_NAME_VERSION (PHI_RESULT (re->lcssa_phi)));
}
926 
927 /* Free DATAREFS and its auxiliary memory.  */
928 
929 static void
930 free_data_refs_with_aux (vec<data_reference_p> datarefs)
931 {
932   data_reference_p dr;
933   for (unsigned i = 0; datarefs.iterate (i, &dr); ++i)
934     if (dr->aux != NULL)
935       {
936 	DR_ACCESS_STRIDE (dr)->release ();
937 	delete (vec<tree> *) dr->aux;
938       }
939 
940   free_data_refs (datarefs);
941 }
942 
/* Class for loop interchange transformation.  Owns the DCE seed bitmap
   for the lifetime of one interchange attempt over a loop nest.  */

class tree_loop_interchange
{
public:
  tree_loop_interchange (vec<struct loop *> loop_nest)
    : m_loop_nest (loop_nest), m_niters_iv_var (NULL_TREE),
      m_dce_seeds (BITMAP_ALLOC (NULL)) { }
  ~tree_loop_interchange () { BITMAP_FREE (m_dce_seeds); }
  bool interchange (vec<data_reference_p>, vec<ddr_p>);

private:
  void update_data_info (unsigned, unsigned, vec<data_reference_p>, vec<ddr_p>);
  bool valid_data_dependences (unsigned, unsigned, vec<ddr_p>);
  void interchange_loops (loop_cand &, loop_cand &);
  void map_inductions_to_loop (loop_cand &, loop_cand &);
  void move_code_to_inner_loop (struct loop *, struct loop *, basic_block *);

  /* The whole loop nest in which interchange is ongoing.  */
  vec<struct loop *> m_loop_nest;
  /* We create new IV which is only used in loop's exit condition check.
     In case of 3-level loop nest interchange, when we interchange the
     innermost two loops, new IV created in the middle level loop does
     not need to be preserved in interchanging the outermost two loops
     later.  We record the IV so that it can be skipped.  */
  tree m_niters_iv_var;
  /* Bitmap of seed variables for dead code elimination after interchange.  */
  bitmap m_dce_seeds;
};
972 
973 /* Update data refs' access stride and dependence information after loop
974    interchange.  I_IDX/O_IDX gives indices of interchanged loops in loop
975    nest.  DATAREFS are data references.  DDRS are data dependences.  */
976 
977 void
978 tree_loop_interchange::update_data_info (unsigned i_idx, unsigned o_idx,
979 					 vec<data_reference_p> datarefs,
980 					 vec<ddr_p> ddrs)
981 {
982   struct data_reference *dr;
983   struct data_dependence_relation *ddr;
984 
985   /* Update strides of data references.  */
986   for (unsigned i = 0; datarefs.iterate (i, &dr); ++i)
987     {
988       vec<tree> *stride = DR_ACCESS_STRIDE (dr);
989       gcc_assert (stride->length () > i_idx);
990       std::swap ((*stride)[i_idx], (*stride)[o_idx]);
991     }
992   /* Update data dependences.  */
993   for (unsigned i = 0; ddrs.iterate (i, &ddr); ++i)
994     if (DDR_ARE_DEPENDENT (ddr) != chrec_known)
995       {
996         for (unsigned j = 0; j < DDR_NUM_DIST_VECTS (ddr); ++j)
997 	  {
998 	    lambda_vector dist_vect = DDR_DIST_VECT (ddr, j);
999 	    std::swap (dist_vect[i_idx], dist_vect[o_idx]);
1000 	  }
1001       }
1002 }
1003 
1004 /* Check data dependence relations, return TRUE if it's valid to interchange
1005    two loops specified by I_IDX/O_IDX.  Theoretically, interchanging the two
1006    loops is valid only if dist vector, after interchanging, doesn't have '>'
1007    as the leftmost non-'=' direction.  Practically, this function have been
1008    conservative here by not checking some valid cases.  */
1009 
1010 bool
1011 tree_loop_interchange::valid_data_dependences (unsigned i_idx, unsigned o_idx,
1012 					       vec<ddr_p> ddrs)
1013 {
1014   struct data_dependence_relation *ddr;
1015 
1016   for (unsigned i = 0; ddrs.iterate (i, &ddr); ++i)
1017     {
1018       /* Skip no-dependence case.  */
1019       if (DDR_ARE_DEPENDENT (ddr) == chrec_known)
1020 	continue;
1021 
1022       for (unsigned j = 0; j < DDR_NUM_DIST_VECTS (ddr); ++j)
1023 	{
1024 	  lambda_vector dist_vect = DDR_DIST_VECT (ddr, j);
1025 	  unsigned level = dependence_level (dist_vect, m_loop_nest.length ());
1026 
1027 	  /* If there is no carried dependence.  */
1028 	  if (level == 0)
1029 	    continue;
1030 
1031 	  level --;
1032 
1033 	  /* If dependence is not carried by any loop in between the two
1034 	     loops [oloop, iloop] to interchange.  */
1035 	  if (level < o_idx || level > i_idx)
1036 	    continue;
1037 
1038 	  /* Be conservative, skip case if either direction at i_idx/o_idx
1039 	     levels is not '=' or '<'.  */
1040 	  if (dist_vect[i_idx] < 0 || dist_vect[o_idx] < 0)
1041 	    return false;
1042 	}
1043     }
1044 
1045   return true;
1046 }
1047 
/* Interchange two loops specified by ILOOP and OLOOP.  */

void
tree_loop_interchange::interchange_loops (loop_cand &iloop, loop_cand &oloop)
{
  reduction_p re;
  gimple_stmt_iterator gsi;
  tree i_niters, o_niters, var_after;

  /* Undo inner loop's simple reduction.  */
  for (unsigned i = 0; iloop.m_reductions.iterate (i, &re); ++i)
    if (re->type != DOUBLE_RTYPE)
      {
	if (re->producer)
	  reset_debug_uses (re->producer);

	iloop.undo_simple_reduction (re, m_dce_seeds);
      }

  /* Only need to reset debug uses for double reduction.  */
  for (unsigned i = 0; oloop.m_reductions.iterate (i, &re); ++i)
    {
      gcc_assert (re->type == DOUBLE_RTYPE);
      reset_debug_uses (SSA_NAME_DEF_STMT (re->var));
      reset_debug_uses (SSA_NAME_DEF_STMT (re->next));
    }

  /* Prepare niters for both loops.  Both expressions are instantiated
     below the preheader of the whole loop nest and gimplified at the end
     of the preheader block, so they are available to the canonical IVs
     created below.  */
  struct loop *loop_nest = m_loop_nest[0];
  edge instantiate_below = loop_preheader_edge (loop_nest);
  gsi = gsi_last_bb (loop_preheader_edge (loop_nest)->src);
  i_niters = number_of_latch_executions (iloop.m_loop);
  i_niters = analyze_scalar_evolution (loop_outer (iloop.m_loop), i_niters);
  i_niters = instantiate_scev (instantiate_below, loop_outer (iloop.m_loop),
			       i_niters);
  i_niters = force_gimple_operand_gsi (&gsi, unshare_expr (i_niters), true,
				       NULL_TREE, false, GSI_CONTINUE_LINKING);
  o_niters = number_of_latch_executions (oloop.m_loop);
  /* No analysis/instantiation step is needed when OLOOP is the outermost
     loop of the nest.  */
  if (oloop.m_loop != loop_nest)
    {
      o_niters = analyze_scalar_evolution (loop_outer (oloop.m_loop), o_niters);
      o_niters = instantiate_scev (instantiate_below, loop_outer (oloop.m_loop),
				   o_niters);
    }
  o_niters = force_gimple_operand_gsi (&gsi, unshare_expr (o_niters), true,
				       NULL_TREE, false, GSI_CONTINUE_LINKING);

  /* Move src's code to tgt loop.  This is necessary when src is the outer
     loop and tgt is the inner loop.  */
  move_code_to_inner_loop (oloop.m_loop, iloop.m_loop, oloop.m_bbs);

  /* Map outer loop's IV to inner loop, and vice versa.  */
  map_inductions_to_loop (oloop, iloop);
  map_inductions_to_loop (iloop, oloop);

  /* Create canonical IV for both loops.  Note canonical IV for outer/inner
     loop is actually from inner/outer loop.  Also we record the new IV
     created for the outer loop so that it can be skipped in later loop
     interchange.  */
  create_canonical_iv (oloop.m_loop, oloop.m_exit,
		       i_niters, &m_niters_iv_var, &var_after);
  bitmap_set_bit (m_dce_seeds, SSA_NAME_VERSION (var_after));
  create_canonical_iv (iloop.m_loop, iloop.m_exit,
		       o_niters, NULL, &var_after);
  bitmap_set_bit (m_dce_seeds, SSA_NAME_VERSION (var_after));

  /* Scrap niters estimation of interchanged loops.  The old bounds no
     longer describe the new iteration spaces.  */
  iloop.m_loop->any_upper_bound = false;
  iloop.m_loop->any_likely_upper_bound = false;
  free_numbers_of_iterations_estimates (iloop.m_loop);
  oloop.m_loop->any_upper_bound = false;
  oloop.m_loop->any_likely_upper_bound = false;
  free_numbers_of_iterations_estimates (oloop.m_loop);

  /* Clear all cached scev information.  This is expensive but shouldn't be
     a problem given we interchange in very limited times.  */
  scev_reset_htab ();

  /* ???  The association between the loop data structure and the
     CFG changed, so what was loop N at the source level is now
     loop M.  We should think of retaining the association or breaking
     it fully by creating a new loop instead of re-using the "wrong" one.  */
}
1131 
/* Map induction variables of SRC loop to TGT loop.  The function firstly
   creates the same IV of SRC loop in TGT loop, then deletes the original
   IV and re-initialize it using the newly created IV.  For example, loop
   nest:

     for (i = 0; i < N; i++)
       for (j = 0; j < M; j++)
	 {
	   //use of i;
	   //use of j;
	 }

   will be transformed into:

     for (jj = 0; jj < M; jj++)
       for (ii = 0; ii < N; ii++)
	 {
	   //use of ii;
	   //use of jj;
	 }

   after loop interchange.  */

void
tree_loop_interchange::map_inductions_to_loop (loop_cand &src, loop_cand &tgt)
{
  induction_p iv;
  edge e = tgt.m_exit;
  /* Increments of newly created IVs are inserted at the end of the block
     TGT exits from.  */
  gimple_stmt_iterator incr_pos = gsi_last_bb (e->src), gsi;

  /* Map source loop's IV to target loop.  */
  for (unsigned i = 0; src.m_inductions.iterate (i, &iv); ++i)
    {
      gimple *use_stmt, *stmt = SSA_NAME_DEF_STMT (iv->var);
      gcc_assert (is_a <gphi *> (stmt));

      use_operand_p use_p;
      /* Only map original IV to target loop.  Skip the niters-only IV
	 created by a previous interchange (see m_niters_iv_var); it is
	 deleted below without replacement.  */
      if (m_niters_iv_var != iv->var)
	{
	  /* Map the IV by creating the same one in target loop.  */
	  tree var_before, var_after;
	  tree base = unshare_expr (iv->init_expr);
	  tree step = unshare_expr (iv->step);
	  create_iv (base, step, SSA_NAME_VAR (iv->var),
		     tgt.m_loop, &incr_pos, false, &var_before, &var_after);
	  bitmap_set_bit (m_dce_seeds, SSA_NAME_VERSION (var_before));
	  bitmap_set_bit (m_dce_seeds, SSA_NAME_VERSION (var_after));

	  /* Replace uses of the original IV var with newly created IV var.  */
	  imm_use_iterator imm_iter;
	  FOR_EACH_IMM_USE_STMT (use_stmt, imm_iter, iv->var)
	    {
	      FOR_EACH_IMM_USE_ON_STMT (use_p, imm_iter)
		SET_USE (use_p, var_before);

	      update_stmt (use_stmt);
	    }
	}

      /* Mark all uses for DCE.  Operands feeding only the old IV PHI
	 become dead once the PHI is removed.  */
      ssa_op_iter op_iter;
      FOR_EACH_PHI_OR_STMT_USE (use_p, stmt, op_iter, SSA_OP_USE)
	{
	  tree use = USE_FROM_PTR (use_p);
	  if (TREE_CODE (use) == SSA_NAME
	      && ! SSA_NAME_IS_DEFAULT_DEF (use))
	    bitmap_set_bit (m_dce_seeds, SSA_NAME_VERSION (use));
	}

      /* Delete definition of the original IV in the source loop.  */
      gsi = gsi_for_stmt (stmt);
      remove_phi_node (&gsi, true);
    }
}
1207 
/* Move stmts of outer loop to inner loop.  OUTER_BBS is the array of
   OUTER's basic blocks; blocks belonging to INNER are skipped.  */

void
tree_loop_interchange::move_code_to_inner_loop (struct loop *outer,
						struct loop *inner,
						basic_block *outer_bbs)
{
  basic_block oloop_exit_bb = single_exit (outer)->src;
  gimple_stmt_iterator gsi, to;

  for (unsigned i = 0; i < outer->num_nodes; i++)
    {
      basic_block bb = outer_bbs[i];

      /* Skip basic blocks of inner loop.  */
      if (flow_bb_inside_loop_p (inner, bb))
	continue;

      /* Move code from header/latch to header/latch.  */
      if (bb == outer->header)
	to = gsi_after_labels (inner->header);
      else if (bb == outer->latch)
	to = gsi_after_labels (inner->latch);
      else
	/* Otherwise, simply move to exit->src.  */
	to = gsi_last_bb (single_exit (inner)->src);

      for (gsi = gsi_after_labels (bb); !gsi_end_p (gsi);)
	{
	  gimple *stmt = gsi_stmt (gsi);

	  /* Keep the outer loop's exit condition (the last stmt of its
	     exit block) in place; only the other stmts are moved.  */
	  if (oloop_exit_bb == bb
	      && stmt == gsi_stmt (gsi_last_bb (oloop_exit_bb)))
	    {
	      gsi_next (&gsi);
	      continue;
	    }

	  /* Drop virtual operands of moved stmts.  NOTE(review): this
	     assumes virtual SSA form is rebuilt/updated later for these
	     stmts -- confirm against the pass's execute function.  */
	  if (gimple_vuse (stmt))
	    gimple_set_vuse (stmt, NULL_TREE);
	  if (gimple_vdef (stmt))
	    {
	      unlink_stmt_vdef (stmt);
	      release_ssa_name (gimple_vdef (stmt));
	      gimple_set_vdef (stmt, NULL_TREE);
	    }

	  reset_debug_uses (stmt);
	  gsi_move_before (&gsi, &to);
	}
    }
}
1260 
/* Given data reference DR in LOOP_NEST, the function computes DR's access
   stride at each level of loop from innermost LOOP to outer.  On success,
   it saves access stride at each level loop in a vector which is pointed
   by DR->aux.  For example:

     int arr[100][100][100];
     for (i = 0; i < 100; i++)       ;(DR->aux)strides[0] = 40000
       for (j = 100; j > 0; j--)     ;(DR->aux)strides[1] = 400
	 for (k = 0; k < 100; k++)   ;(DR->aux)strides[2] = 4
	   arr[i][j - 1][k] = 0;

   Note strides are pushed innermost level first here; they are reversed
   into the outer-to-inner order shown above by compute_access_strides.
   On failure DR->aux holds a vector shorter than the nest depth.  */

static void
compute_access_stride (struct loop *loop_nest, struct loop *loop,
		       data_reference_p dr)
{
  vec<tree> *strides = new vec<tree> ();
  basic_block bb = gimple_bb (DR_STMT (dr));

  /* If DR's stmt sits in an outer level of the nest, its stride with
     respect to each level below it is zero.  */
  while (!flow_bb_inside_loop_p (loop, bb))
    {
      strides->safe_push (build_int_cst (sizetype, 0));
      loop = loop_outer (loop);
    }
  gcc_assert (loop == bb->loop_father);

  tree ref = DR_REF (dr);
  if (TREE_CODE (ref) == COMPONENT_REF
      && DECL_BIT_FIELD (TREE_OPERAND (ref, 1)))
    {
      /* We can't take address of bitfields.  If the bitfield is at constant
	 offset from the start of the struct, just use address of the
	 struct, for analysis of the strides that shouldn't matter.  */
      if (!TREE_OPERAND (ref, 2)
	  || TREE_CODE (TREE_OPERAND (ref, 2)) == INTEGER_CST)
	ref = TREE_OPERAND (ref, 0);
      /* Otherwise, if we have a bit field representative, use that.  */
      else if (DECL_BIT_FIELD_REPRESENTATIVE (TREE_OPERAND (ref, 1))
	       != NULL_TREE)
	{
	  tree repr = DECL_BIT_FIELD_REPRESENTATIVE (TREE_OPERAND (ref, 1));
	  ref = build3 (COMPONENT_REF, TREE_TYPE (repr), TREE_OPERAND (ref, 0),
			repr, TREE_OPERAND (ref, 2));
	}
      /* Otherwise punt.  */
      else
	{
	  dr->aux = strides;
	  return;
	}
    }
  /* Analyze the evolution of the reference's address to extract one
     stride (CHREC_RIGHT) per loop level.  */
  tree scev_base = build_fold_addr_expr (ref);
  tree scev = analyze_scalar_evolution (loop, scev_base);
  scev = instantiate_scev (loop_preheader_edge (loop_nest), loop, scev);
  if (! chrec_contains_undetermined (scev))
    {
      tree sl = scev;
      struct loop *expected = loop;
      while (TREE_CODE (sl) == POLYNOMIAL_CHREC)
	{
	  struct loop *sl_loop = get_chrec_loop (sl);
	  /* Levels the chrec skips over have zero stride.  */
	  while (sl_loop != expected)
	    {
	      strides->safe_push (size_int (0));
	      expected = loop_outer (expected);
	    }
	  strides->safe_push (CHREC_RIGHT (sl));
	  sl = CHREC_LEFT (sl);
	  expected = loop_outer (expected);
	}
      /* If the remaining base is invariant, the outer levels up to the
	 top of the nest all have zero stride as well.  */
      if (! tree_contains_chrecs (sl, NULL))
	while (expected != loop_outer (loop_nest))
	  {
	    strides->safe_push (size_int (0));
	    expected = loop_outer (expected);
	  }
    }

  dr->aux = strides;
}
1340 
1341 /* Given loop nest LOOP_NEST with innermost LOOP, the function computes
1342    access strides with respect to each level loop for all data refs in
1343    DATAREFS from inner loop to outer loop.  On success, it returns the
1344    outermost loop that access strides can be computed successfully for
1345    all data references.  If access strides cannot be computed at least
1346    for two levels of loop for any data reference, it returns NULL.  */
1347 
1348 static struct loop *
1349 compute_access_strides (struct loop *loop_nest, struct loop *loop,
1350 			vec<data_reference_p> datarefs)
1351 {
1352   unsigned i, j, num_loops = (unsigned) -1;
1353   data_reference_p dr;
1354   vec<tree> *stride;
1355 
1356   for (i = 0; datarefs.iterate (i, &dr); ++i)
1357     {
1358       compute_access_stride (loop_nest, loop, dr);
1359       stride = DR_ACCESS_STRIDE (dr);
1360       if (stride->length () < num_loops)
1361 	{
1362 	  num_loops = stride->length ();
1363 	  if (num_loops < 2)
1364 	    return NULL;
1365 	}
1366     }
1367 
1368   for (i = 0; datarefs.iterate (i, &dr); ++i)
1369     {
1370       stride = DR_ACCESS_STRIDE (dr);
1371       if (stride->length () > num_loops)
1372 	stride->truncate (num_loops);
1373 
1374       for (j = 0; j < (num_loops >> 1); ++j)
1375 	std::swap ((*stride)[j], (*stride)[num_loops - j - 1]);
1376     }
1377 
1378   loop = superloop_at_depth (loop, loop_depth (loop) + 1 - num_loops);
1379   gcc_assert (loop_nest == loop || flow_loop_nested_p (loop_nest, loop));
1380   return loop;
1381 }
1382 
1383 /* Prune access strides for data references in DATAREFS by removing strides
1384    of loops that isn't in current LOOP_NEST.  */
1385 
1386 static void
1387 prune_access_strides_not_in_loop (struct loop *loop_nest,
1388 				  struct loop *innermost,
1389 				  vec<data_reference_p> datarefs)
1390 {
1391   data_reference_p dr;
1392   unsigned num_loops = loop_depth (innermost) - loop_depth (loop_nest) + 1;
1393   gcc_assert (num_loops > 1);
1394 
1395   /* Block remove strides of loops that is not in current loop nest.  */
1396   for (unsigned i = 0; datarefs.iterate (i, &dr); ++i)
1397     {
1398       vec<tree> *stride = DR_ACCESS_STRIDE (dr);
1399       if (stride->length () > num_loops)
1400 	stride->block_remove (0, stride->length () - num_loops);
1401     }
1402 }
1403 
1404 /* Dump access strides for all DATAREFS.  */
1405 
1406 static void
1407 dump_access_strides (vec<data_reference_p> datarefs)
1408 {
1409   data_reference_p dr;
1410   fprintf (dump_file, "Access Strides for DRs:\n");
1411   for (unsigned i = 0; datarefs.iterate (i, &dr); ++i)
1412     {
1413       fprintf (dump_file, "  ");
1414       print_generic_expr (dump_file, DR_REF (dr), TDF_SLIM);
1415       fprintf (dump_file, ":\t\t<");
1416 
1417       vec<tree> *stride = DR_ACCESS_STRIDE (dr);
1418       unsigned num_loops = stride->length ();
1419       for (unsigned j = 0; j < num_loops; ++j)
1420 	{
1421 	  print_generic_expr (dump_file, (*stride)[j], TDF_SLIM);
1422 	  fprintf (dump_file, "%s", (j < num_loops - 1) ? ",\t" : ">\n");
1423 	}
1424     }
1425 }
1426 
1427 /* Return true if it's profitable to interchange two loops whose index
1428    in whole loop nest vector are I_IDX/O_IDX respectively.  The function
1429    computes and compares three types information from all DATAREFS:
1430      1) Access stride for loop I_IDX and O_IDX.
1431      2) Number of invariant memory references with respect to I_IDX before
1432 	and after loop interchange.
1433      3) Flags indicating if all memory references access sequential memory
1434 	in ILOOP, before and after loop interchange.
1435    If INNMOST_LOOP_P is true, the two loops for interchanging are the two
1436    innermost loops in loop nest.  This function also dumps information if
1437    DUMP_INFO_P is true.  */
1438 
1439 static bool
1440 should_interchange_loops (unsigned i_idx, unsigned o_idx,
1441 			  vec<data_reference_p> datarefs,
1442 			  unsigned i_stmt_cost, unsigned o_stmt_cost,
1443 			  bool innermost_loops_p, bool dump_info_p = true)
1444 {
1445   unsigned HOST_WIDE_INT ratio;
1446   unsigned i, j, num_old_inv_drs = 0, num_new_inv_drs = 0;
1447   struct data_reference *dr;
1448   bool all_seq_dr_before_p = true, all_seq_dr_after_p = true;
1449   widest_int iloop_strides = 0, oloop_strides = 0;
1450   unsigned num_unresolved_drs = 0;
1451   unsigned num_resolved_ok_drs = 0;
1452   unsigned num_resolved_not_ok_drs = 0;
1453 
1454   if (dump_info_p && dump_file && (dump_flags & TDF_DETAILS))
1455     fprintf (dump_file, "\nData ref strides:\n\tmem_ref:\t\tiloop\toloop\n");
1456 
1457   for (i = 0; datarefs.iterate (i, &dr); ++i)
1458     {
1459       vec<tree> *stride = DR_ACCESS_STRIDE (dr);
1460       tree iloop_stride = (*stride)[i_idx], oloop_stride = (*stride)[o_idx];
1461 
1462       bool subloop_stride_p = false;
1463       /* Data ref can't be invariant or sequential access at current loop if
1464 	 its address changes with respect to any subloops.  */
1465       for (j = i_idx + 1; j < stride->length (); ++j)
1466 	if (!integer_zerop ((*stride)[j]))
1467 	  {
1468 	    subloop_stride_p = true;
1469 	    break;
1470 	  }
1471 
1472       if (integer_zerop (iloop_stride))
1473 	{
1474 	  if (!subloop_stride_p)
1475 	    num_old_inv_drs++;
1476 	}
1477       if (integer_zerop (oloop_stride))
1478 	{
1479 	  if (!subloop_stride_p)
1480 	    num_new_inv_drs++;
1481 	}
1482 
1483       if (TREE_CODE (iloop_stride) == INTEGER_CST
1484 	  && TREE_CODE (oloop_stride) == INTEGER_CST)
1485 	{
1486 	  iloop_strides = wi::add (iloop_strides, wi::to_widest (iloop_stride));
1487 	  oloop_strides = wi::add (oloop_strides, wi::to_widest (oloop_stride));
1488 	}
1489       else if (multiple_of_p (TREE_TYPE (iloop_stride),
1490 			      iloop_stride, oloop_stride))
1491 	num_resolved_ok_drs++;
1492       else if (multiple_of_p (TREE_TYPE (iloop_stride),
1493 			      oloop_stride, iloop_stride))
1494 	num_resolved_not_ok_drs++;
1495       else
1496 	num_unresolved_drs++;
1497 
1498       /* Data ref can't be sequential access if its address changes in sub
1499 	 loop.  */
1500       if (subloop_stride_p)
1501 	{
1502 	  all_seq_dr_before_p = false;
1503 	  all_seq_dr_after_p = false;
1504 	  continue;
1505 	}
1506       /* Track if all data references are sequential accesses before/after loop
1507 	 interchange.  Note invariant is considered sequential here.  */
1508       tree access_size = TYPE_SIZE_UNIT (TREE_TYPE (DR_REF (dr)));
1509       if (all_seq_dr_before_p
1510 	  && ! (integer_zerop (iloop_stride)
1511 		|| operand_equal_p (access_size, iloop_stride, 0)))
1512 	all_seq_dr_before_p = false;
1513       if (all_seq_dr_after_p
1514 	  && ! (integer_zerop (oloop_stride)
1515 		|| operand_equal_p (access_size, oloop_stride, 0)))
1516 	all_seq_dr_after_p = false;
1517     }
1518 
1519   if (dump_info_p && dump_file && (dump_flags & TDF_DETAILS))
1520     {
1521       fprintf (dump_file, "\toverall:\t\t");
1522       print_decu (iloop_strides, dump_file);
1523       fprintf (dump_file, "\t");
1524       print_decu (oloop_strides, dump_file);
1525       fprintf (dump_file, "\n");
1526 
1527       fprintf (dump_file, "Invariant data ref: before(%d), after(%d)\n",
1528 	       num_old_inv_drs, num_new_inv_drs);
1529       fprintf (dump_file, "All consecutive stride: before(%s), after(%s)\n",
1530 	       all_seq_dr_before_p ? "true" : "false",
1531 	       all_seq_dr_after_p ? "true" : "false");
1532       fprintf (dump_file, "OK to interchage with variable strides: %d\n",
1533 	       num_resolved_ok_drs);
1534       fprintf (dump_file, "Not OK to interchage with variable strides: %d\n",
1535 	       num_resolved_not_ok_drs);
1536       fprintf (dump_file, "Variable strides we cannot decide: %d\n",
1537 	       num_unresolved_drs);
1538       fprintf (dump_file, "Stmt cost of inner loop: %d\n", i_stmt_cost);
1539       fprintf (dump_file, "Stmt cost of outer loop: %d\n", o_stmt_cost);
1540     }
1541 
1542   if (num_unresolved_drs != 0 || num_resolved_not_ok_drs != 0)
1543     return false;
1544 
1545   /* Stmts of outer loop will be moved to inner loop.  If there are two many
1546      such stmts, it could make inner loop costly.  Here we compare stmt cost
1547      between outer and inner loops.  */
1548   if (i_stmt_cost && o_stmt_cost
1549       && num_old_inv_drs + o_stmt_cost > num_new_inv_drs
1550       && o_stmt_cost * STMT_COST_RATIO > i_stmt_cost)
1551     return false;
1552 
1553   /* We use different stride comparison ratio for interchanging innermost
1554      two loops or not.  The idea is to be conservative in interchange for
1555      the innermost loops.  */
1556   ratio = innermost_loops_p ? INNER_STRIDE_RATIO : OUTER_STRIDE_RATIO;
1557   /* Do interchange if it gives better data locality behavior.  */
1558   if (wi::gtu_p (iloop_strides, wi::mul (oloop_strides, ratio)))
1559     return true;
1560   if (wi::gtu_p (iloop_strides, oloop_strides))
1561     {
1562       /* Or it creates more invariant memory references.  */
1563       if ((!all_seq_dr_before_p || all_seq_dr_after_p)
1564 	  && num_new_inv_drs > num_old_inv_drs)
1565 	return true;
1566       /* Or it makes all memory references sequential.  */
1567       if (num_new_inv_drs >= num_old_inv_drs
1568 	  && !all_seq_dr_before_p && all_seq_dr_after_p)
1569 	return true;
1570     }
1571 
1572   return false;
1573 }
1574 
/* Try to interchange inner loop of a loop nest to outer level.  Returns
   true if any pair of loops was interchanged.  */

bool
tree_loop_interchange::interchange (vec<data_reference_p> datarefs,
				    vec<ddr_p> ddrs)
{
  location_t loc = find_loop_location (m_loop_nest[0]);
  bool changed_p = false;
  /* In each iteration we try to interchange I-th loop with (I+1)-th loop.
     The overall effect is to push inner loop to outermost level in whole
     loop nest.  */
  for (unsigned i = m_loop_nest.length (); i > 1; --i)
    {
      unsigned i_idx = i - 1, o_idx = i - 2;

      /* Check validity for loop interchange.  Give up on the remaining
	 outer pairs as well if this pair is invalid.  */
      if (!valid_data_dependences (i_idx, o_idx, ddrs))
	break;

      loop_cand iloop (m_loop_nest[i_idx], m_loop_nest[o_idx]);
      loop_cand oloop (m_loop_nest[o_idx], m_loop_nest[o_idx]);

      /* Check if we can do transformation for loop interchange.  */
      if (!iloop.analyze_carried_vars (NULL)
	  || !iloop.analyze_lcssa_phis ()
	  || !oloop.analyze_carried_vars (&iloop)
	  || !oloop.analyze_lcssa_phis ()
	  || !iloop.can_interchange_p (NULL)
	  || !oloop.can_interchange_p (&iloop))
	break;

      /* Outer loop's stmts will be moved to inner loop during interchange.
	 If there are many of them, it may make inner loop very costly.  We
	 need to check number of outer loop's stmts in profit cost model of
	 interchange.  */
      int stmt_cost = oloop.m_num_stmts;
      /* Count out the exit checking stmt of outer loop.  */
      stmt_cost --;
      /* Count out IV's increasing stmt, IVOPTs takes care of it.  */
      stmt_cost -= oloop.m_inductions.length ();
      /* Count in the additional load and cond_expr stmts caused by inner
	 loop's constant initialized reduction.  */
      stmt_cost += iloop.m_const_init_reduc * 2;
      if (stmt_cost < 0)
	stmt_cost = 0;

      /* Check profitability for loop interchange.  */
      if (should_interchange_loops (i_idx, o_idx, datarefs,
				    (unsigned) iloop.m_num_stmts,
				    (unsigned) stmt_cost,
				    iloop.m_loop->inner == NULL))
	{
	  if (dump_file && (dump_flags & TDF_DETAILS))
	    fprintf (dump_file,
		     "Loop_pair<outer:%d, inner:%d> is interchanged\n\n",
		     oloop.m_loop->num, iloop.m_loop->num);

	  changed_p = true;
	  interchange_loops (iloop, oloop);
	  /* No need to update if there is no further loop interchange.  */
	  if (o_idx > 0)
	    update_data_info (i_idx, o_idx, datarefs, ddrs);
	}
      else
	{
	  if (dump_file && (dump_flags & TDF_DETAILS))
	    fprintf (dump_file,
		     "Loop_pair<outer:%d, inner:%d> is not interchanged\n\n",
		     oloop.m_loop->num, iloop.m_loop->num);
	}
    }
  /* Clean up code made dead by the interchanges (old IVs, reductions).  */
  simple_dce_from_worklist (m_dce_seeds);

  if (changed_p)
    dump_printf_loc (MSG_OPTIMIZED_LOCATIONS, loc,
		     "loops interchanged in loop nest\n");

  return changed_p;
}
1654 
1655 
1656 /* Loop interchange pass.  */
1657 
1658 namespace {
1659 
/* Static metadata describing the loop interchange pass to the pass
   manager.  */
const pass_data pass_data_linterchange =
{
  GIMPLE_PASS, /* type */
  "linterchange", /* name */
  OPTGROUP_LOOP, /* optinfo_flags */
  TV_LINTERCHANGE, /* tv_id */
  PROP_cfg, /* properties_required */
  0, /* properties_provided */
  0, /* properties_destroyed */
  0, /* todo_flags_start */
  0, /* todo_flags_finish */
};
1672 
/* Pass object implementing loop interchange.  */
class pass_linterchange : public gimple_opt_pass
{
public:
  pass_linterchange (gcc::context *ctxt)
    : gimple_opt_pass (pass_data_linterchange, ctxt)
  {}

  /* opt_pass methods: */
  /* The pass may be instantiated multiple times in the pipeline.  */
  opt_pass * clone () { return new pass_linterchange (m_ctxt); }
  /* Run only when loop interchange is enabled (-floop-interchange).  */
  virtual bool gate (function *) { return flag_loop_interchange; }
  virtual unsigned int execute (function *);

}; // class pass_linterchange
1686 
1687 
/* Return true if LOOP has proper form for interchange.  We check three
   conditions in the function:
     1) In general, a loop can be interchanged only if it doesn't have
	basic blocks other than header, exit and latch besides possible
	inner loop nest.  This basically restricts loop interchange to
	below form loop nests:

          header<---+
            |       |
            v       |
        INNER_LOOP  |
            |       |
            v       |
          exit--->latch

     2) Data reference in basic block that executes in different times
	than loop head/exit is not allowed.
     3) Record the innermost outer loop that doesn't form rectangle loop
	nest with LOOP.  */

static bool
proper_loop_form_for_interchange (struct loop *loop, struct loop **min_outer)
{
  edge e0, e1, exit;

  /* Don't interchange if loop has unsupported information for the moment.  */
  if (loop->safelen > 0
      || loop->constraints != 0
      || loop->can_be_parallel
      || loop->dont_vectorize
      || loop->force_vectorize
      || loop->in_oacc_kernels_region
      || loop->orig_loop_num != 0
      || loop->simduid != NULL_TREE)
    return false;

  /* Don't interchange if outer loop has basic block other than header, exit
     and latch.  */
  if (loop->inner != NULL
      && loop->num_nodes != loop->inner->num_nodes + 3)
    return false;

  if ((exit = single_dom_exit (loop)) == NULL)
    return false;

  /* Check control flow on loop header/exit blocks.  Either the header
     itself tests the exit condition, or header and exit block are
     distinct and connected by simple single-successor/predecessor
     edges.  */
  if (loop->header == exit->src
      && (EDGE_COUNT (loop->header->preds) != 2
	  || EDGE_COUNT (loop->header->succs) != 2))
    return false;
  else if (loop->header != exit->src
	   && (EDGE_COUNT (loop->header->preds) != 2
	       || !single_succ_p (loop->header)
	       || unsupported_edge (single_succ_edge (loop->header))
	       || EDGE_COUNT (exit->src->succs) != 2
	       || !single_pred_p (exit->src)
	       || unsupported_edge (single_pred_edge (exit->src))))
    return false;

  /* The header must be reached from the latch and from exactly one block
     outside the loop (the preheader).  */
  e0 = EDGE_PRED (loop->header, 0);
  e1 = EDGE_PRED (loop->header, 1);
  if (unsupported_edge (e0) || unsupported_edge (e1)
      || (e0->src != loop->latch && e1->src != loop->latch)
      || (e0->src->loop_father == loop && e1->src->loop_father == loop))
    return false;

  /* The exit block must branch to the latch and to exactly one block
     outside the loop.  */
  e0 = EDGE_SUCC (exit->src, 0);
  e1 = EDGE_SUCC (exit->src, 1);
  if (unsupported_edge (e0) || unsupported_edge (e1)
      || (e0->dest != loop->latch && e1->dest != loop->latch)
      || (e0->dest->loop_father == loop && e1->dest->loop_father == loop))
    return false;

  /* Don't interchange if any reference is in basic block that doesn't
     dominate exit block (condition 2 above).  */
  basic_block *bbs = get_loop_body (loop);
  for (unsigned i = 0; i < loop->num_nodes; i++)
    {
      basic_block bb = bbs[i];

      if (bb->loop_father != loop
	  || bb == loop->header || bb == exit->src
	  || dominated_by_p (CDI_DOMINATORS, exit->src, bb))
	continue;

      /* Any memory access (non-NULL vuse) in such a block blocks
	 interchange.  */
      for (gimple_stmt_iterator gsi = gsi_start_nondebug_bb (bb);
	   !gsi_end_p (gsi); gsi_next_nondebug (&gsi))
	if (gimple_vuse (gsi_stmt (gsi)))
	  {
	    free (bbs);
	    return false;
	  }
    }
  free (bbs);

  tree niters = number_of_latch_executions (loop);
  niters = analyze_scalar_evolution (loop_outer (loop), niters);
  if (!niters || chrec_contains_undetermined (niters))
    return false;

  /* Record the innermost outer loop that doesn't form rectangle loop nest,
     i.e. in which LOOP's niters is not invariant.  */
  for (loop_p loop2 = loop_outer (loop);
       loop2 && flow_loop_nested_p (*min_outer, loop2);
       loop2 = loop_outer (loop2))
    {
      niters = instantiate_scev (loop_preheader_edge (loop2),
				 loop_outer (loop), niters);
      if (!evolution_function_is_invariant_p (niters, loop2->num))
	{
	  *min_outer = loop2;
	  break;
	}
    }
  return true;
}
1803 
1804 /* Return true if any two adjacent loops in loop nest [INNERMOST, LOOP_NEST]
1805    should be interchanged by looking into all DATAREFS.  */
1806 
1807 static bool
1808 should_interchange_loop_nest (struct loop *loop_nest, struct loop *innermost,
1809 			      vec<data_reference_p> datarefs)
1810 {
1811   unsigned idx = loop_depth (innermost) - loop_depth (loop_nest);
1812   gcc_assert (idx > 0);
1813 
1814   /* Check if any two adjacent loops should be interchanged.  */
1815   for (struct loop *loop = innermost;
1816        loop != loop_nest; loop = loop_outer (loop), idx--)
1817     if (should_interchange_loops (idx, idx - 1, datarefs, 0, 0,
1818 				  loop == innermost, false))
1819       return true;
1820 
1821   return false;
1822 }
1823 
/* Given loop nest LOOP_NEST and data references DATAREFS, compute data
   dependences for loop interchange and store it in DDRS.  Note we compute
   dependences directly rather than call generic interface so that we can
   return on unknown dependence instantly.  Return true on success, false
   if any dependence prevents interchange.  */

static bool
tree_loop_interchange_compute_ddrs (vec<loop_p> loop_nest,
				    vec<data_reference_p> datarefs,
				    vec<ddr_p> *ddrs)
{
  struct data_reference *a, *b;
  struct loop *innermost = loop_nest.last ();

  /* Check every (read/write or write/write) pair of data references.  */
  for (unsigned i = 0; datarefs.iterate (i, &a); ++i)
    {
      /* A reference whose statement's basic block is not directly in
	 INNERMOST lives in an outer loop of the nest.  */
      bool a_outer_p = gimple_bb (DR_STMT (a))->loop_father != innermost;
      for (unsigned j = i + 1; datarefs.iterate (j, &b); ++j)
	if (DR_IS_WRITE (a) || DR_IS_WRITE (b))
	  {
	    bool b_outer_p = gimple_bb (DR_STMT (b))->loop_father != innermost;
	    /* Don't support multiple write references in outer loop.  */
	    if (a_outer_p && b_outer_p && DR_IS_WRITE (a) && DR_IS_WRITE (b))
	      return false;

	    /* The ddr is pushed before the dependence test so the caller
	       can free all recorded ddrs even on early failure.  */
	    ddr_p ddr = initialize_data_dependence_relation (a, b, loop_nest);
	    ddrs->safe_push (ddr);
	    compute_affine_dependence (ddr, loop_nest[0]);

	    /* Give up if ddr is unknown dependence or classic direct vector
	       is not available.  */
	    if (DDR_ARE_DEPENDENT (ddr) == chrec_dont_know
		|| (DDR_ARE_DEPENDENT (ddr) == NULL_TREE
		    && DDR_NUM_DIR_VECTS (ddr) == 0))
	      return false;

	    /* If either data references is in outer loop of nest, we require
	       no dependence here because the data reference need to be moved
	       into inner loop during interchange.  */
	    if (a_outer_p && b_outer_p
		&& operand_equal_p (DR_REF (a), DR_REF (b), 0))
	      continue;
	    if (DDR_ARE_DEPENDENT (ddr) != chrec_known
		&& (a_outer_p || b_outer_p))
	      return false;
	}
    }

  return true;
}
1873 
1874 /* Prune DATAREFS by removing any data reference not inside of LOOP.  */
1875 
1876 static inline void
1877 prune_datarefs_not_in_loop (struct loop *loop, vec<data_reference_p> datarefs)
1878 {
1879   unsigned i, j;
1880   struct data_reference *dr;
1881 
1882   for (i = 0, j = 0; datarefs.iterate (i, &dr); ++i)
1883     {
1884       if (flow_bb_inside_loop_p (loop, gimple_bb (DR_STMT (dr))))
1885 	datarefs[j++] = dr;
1886       else
1887 	{
1888 	  if (dr->aux)
1889 	    {
1890 	      DR_ACCESS_STRIDE (dr)->release ();
1891 	      delete (vec<tree> *) dr->aux;
1892 	    }
1893 	  free_data_ref (dr);
1894 	}
1895     }
1896   datarefs.truncate (j);
1897 }
1898 
/* Find and store data references in DATAREFS for LOOP nest.  If there's
   difficult data reference in a basic block, we shrink the loop nest to
   inner loop of that basic block's father loop.  On success, return the
   outer loop of the result loop nest; return NULL if no analyzable nest
   remains.  */

static struct loop *
prepare_data_references (struct loop *loop, vec<data_reference_p> *datarefs)
{
  struct loop *loop_nest = loop;
  vec<data_reference_p> *bb_refs;
  basic_block bb, *bbs = get_loop_body_in_dom_order (loop);

  /* Clear bb->aux so it can temporarily hold per-block ref vectors.  */
  for (unsigned i = 0; i < loop->num_nodes; i++)
    bbs[i]->aux = NULL;

  /* Find data references for all basic blocks.  Shrink loop nest on difficult
     data reference.  */
  for (unsigned i = 0; loop_nest && i < loop->num_nodes; ++i)
    {
      bb = bbs[i];
      /* Skip blocks already excluded by an earlier nest shrink.  */
      if (!flow_bb_inside_loop_p (loop_nest, bb))
	continue;

      bb_refs = new vec<data_reference_p> ();
      if (find_data_references_in_bb (loop, bb, bb_refs) == chrec_dont_know)
        {
	  /* Difficult ref here: restrict the nest to the inner loop of
	     this block's father loop.  A nest needs at least two loops,
	     so give up if the new outer loop has no inner loop.  */
	  loop_nest = bb->loop_father->inner;
	  if (loop_nest && !loop_nest->inner)
	    loop_nest = NULL;

	  free_data_refs (*bb_refs);
          delete bb_refs;
        }
      else if (bb_refs->is_empty ())
	delete bb_refs;
      else
	bb->aux = bb_refs;
    }

  /* Collect all data references in loop nest.  Walk all blocks again so
     bb->aux is cleared even when the nest was shrunk or dropped.  */
  for (unsigned i = 0; i < loop->num_nodes; i++)
    {
      bb = bbs[i];
      if (!bb->aux)
	continue;

      bb_refs = (vec<data_reference_p> *) bb->aux;
      if (loop_nest && flow_bb_inside_loop_p (loop_nest, bb))
	datarefs->safe_splice (*bb_refs);
      else
	free_data_refs (*bb_refs);

      delete bb_refs;
      bb->aux = NULL;
    }
  free (bbs);

  return loop_nest;
}
1958 
/* Given innermost LOOP, return true if perfect loop nest can be found and
   data dependences can be computed.  If succeed, record the perfect loop
   nest in LOOP_NEST; record all data references in DATAREFS and record all
   data dependence relations in DDRS.

   We do support a restricted form of imperfect loop nest, i.e, loop nest
   with load/store in outer loop initializing/finalizing simple reduction
   of the innermost loop.  For such outer loop reference, we require that
   it has no dependence with others since it will be moved to inner loop
   in interchange.  */

static bool
prepare_perfect_loop_nest (struct loop *loop, vec<loop_p> *loop_nest,
			   vec<data_reference_p> *datarefs, vec<ddr_p> *ddrs)
{
  struct loop *start_loop = NULL, *innermost = loop;
  struct loop *outermost = loops_for_fn (cfun)->tree_root;

  /* Walk outwards from the innermost loop to find the loop nest.
     OUTERMOST bounds the walk; proper_loop_form_for_interchange may
     tighten it to the innermost outer loop that doesn't form a rectangle
     loop nest.  */
  while (loop->num != 0 && loop->inner == start_loop
	 && flow_loop_nested_p (outermost, loop))
    {
      if (!proper_loop_form_for_interchange (loop, &outermost))
	break;

      start_loop = loop;
      /* If this loop has sibling loop, the father loop won't be in perfect
	 loop nest.  */
      if (loop->next != NULL)
	break;

      loop = loop_outer (loop);
    }
  /* A nest needs at least two loops.  */
  if (!start_loop || !start_loop->inner)
    return false;

  /* Prepare the data reference vector for the loop nest, pruning outer
     loops we cannot handle.  */
  start_loop = prepare_data_references (start_loop, datarefs);
  if (!start_loop
      /* Check if there is no data reference.  */
      || datarefs->is_empty ()
      /* Check if there are too many data references.  */
      || (int) datarefs->length () > MAX_DATAREFS)
    return false;

  /* Compute access strides for all data references, pruning outer
     loops we cannot analyze refs in.  */
  start_loop = compute_access_strides (start_loop, innermost, *datarefs);
  if (!start_loop)
    return false;

  /* Check if any interchange is profitable in the loop nest.  */
  if (!should_interchange_loop_nest (start_loop, innermost, *datarefs))
    return false;

  /* Check if data dependences can be computed for loop nest starting from
     start_loop.  */
  loop = start_loop;
  do {
    loop_nest->truncate (0);

    if (loop != start_loop)
      prune_datarefs_not_in_loop (start_loop, *datarefs);

    if (find_loop_nest (start_loop, loop_nest)
	&& tree_loop_interchange_compute_ddrs (*loop_nest, *datarefs, ddrs))
      {
	if (dump_file && (dump_flags & TDF_DETAILS))
	  fprintf (dump_file,
		   "\nConsider loop interchange for loop_nest<%d - %d>\n",
		   start_loop->num, innermost->num);

	if (loop != start_loop)
	  prune_access_strides_not_in_loop (start_loop, innermost, *datarefs);

	if (dump_file && (dump_flags & TDF_DETAILS))
	  dump_access_strides (*datarefs);

	return true;
      }

    free_dependence_relations (*ddrs);
    *ddrs = vNULL;
    /* Try to compute data dependences with the outermost loop stripped.  */
    loop = start_loop;
    start_loop = start_loop->inner;
  } while (start_loop && start_loop->inner);

  return false;
}
2051 
2052 /* Main entry for loop interchange pass.  */
2053 
2054 unsigned int
2055 pass_linterchange::execute (function *fun)
2056 {
2057   if (number_of_loops (fun) <= 2)
2058     return 0;
2059 
2060   bool changed_p = false;
2061   struct loop *loop;
2062   FOR_EACH_LOOP (loop, LI_ONLY_INNERMOST)
2063     {
2064       vec<loop_p> loop_nest = vNULL;
2065       vec<data_reference_p> datarefs = vNULL;
2066       vec<ddr_p> ddrs = vNULL;
2067       if (prepare_perfect_loop_nest (loop, &loop_nest, &datarefs, &ddrs))
2068 	{
2069 	  tree_loop_interchange loop_interchange (loop_nest);
2070 	  changed_p |= loop_interchange.interchange (datarefs, ddrs);
2071 	}
2072       free_dependence_relations (ddrs);
2073       free_data_refs_with_aux (datarefs);
2074       loop_nest.release ();
2075     }
2076 
2077   return changed_p ? (TODO_update_ssa_only_virtuals) : 0;
2078 }
2079 
2080 } // anon namespace
2081 
2082 gimple_opt_pass *
2083 make_pass_linterchange (gcc::context *ctxt)
2084 {
2085   return new pass_linterchange (ctxt);
2086 }
2087