xref: /openbsd/gnu/gcc/gcc/modulo-sched.c (revision 404b540a)
1 /* Swing Modulo Scheduling implementation.
2    Copyright (C) 2004, 2005, 2006
3    Free Software Foundation, Inc.
4    Contributed by Ayal Zaks and Mustafa Hagog <zaks,mustafa@il.ibm.com>
5 
6 This file is part of GCC.
7 
8 GCC is free software; you can redistribute it and/or modify it under
9 the terms of the GNU General Public License as published by the Free
10 Software Foundation; either version 2, or (at your option) any later
11 version.
12 
13 GCC is distributed in the hope that it will be useful, but WITHOUT ANY
14 WARRANTY; without even the implied warranty of MERCHANTABILITY or
15 FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License
16 for more details.
17 
18 You should have received a copy of the GNU General Public License
19 along with GCC; see the file COPYING.  If not, write to the Free
20 Software Foundation, 51 Franklin Street, Fifth Floor, Boston, MA
21 02110-1301, USA.  */
22 
23 
24 #include "config.h"
25 #include "system.h"
26 #include "coretypes.h"
27 #include "tm.h"
28 #include "toplev.h"
29 #include "rtl.h"
30 #include "tm_p.h"
31 #include "hard-reg-set.h"
32 #include "regs.h"
33 #include "function.h"
34 #include "flags.h"
35 #include "insn-config.h"
36 #include "insn-attr.h"
37 #include "except.h"
38 #include "toplev.h"
39 #include "recog.h"
40 #include "sched-int.h"
41 #include "target.h"
42 #include "cfglayout.h"
43 #include "cfgloop.h"
44 #include "cfghooks.h"
45 #include "expr.h"
46 #include "params.h"
47 #include "gcov-io.h"
48 #include "df.h"
49 #include "ddg.h"
50 #include "timevar.h"
51 #include "tree-pass.h"
52 
53 #ifdef INSN_SCHEDULING
54 
55 /* This file contains the implementation of the Swing Modulo Scheduler,
56    described in the following references:
57    [1] J. Llosa, A. Gonzalez, E. Ayguade, M. Valero., and J. Eckhardt.
58        Lifetime--sensitive modulo scheduling in a production environment.
59        IEEE Trans. on Comps., 50(3), March 2001
60    [2] J. Llosa, A. Gonzalez, E. Ayguade, and M. Valero.
61        Swing Modulo Scheduling: A Lifetime Sensitive Approach.
62        PACT '96 , pages 80-87, October 1996 (Boston - Massachusetts - USA).
63 
64    The basic structure is:
65    1. Build a data-dependence graph (DDG) for each loop.
66    2. Use the DDG to order the insns of a loop (not in topological order
67       necessarily, but rather) trying to place each insn after all its
68       predecessors _or_ after all its successors.
69    3. Compute MII: a lower bound on the number of cycles to schedule the loop.
70    4. Use the ordering to perform list-scheduling of the loop:
71       1. Set II = MII.  We will try to schedule the loop within II cycles.
72       2. Try to schedule the insns one by one according to the ordering.
73 	 For each insn compute an interval of cycles by considering already-
74 	 scheduled preds and succs (and associated latencies); try to place
75 	 the insn in the cycles of this window checking for potential
76 	 resource conflicts (using the DFA interface).
77 	 Note: this is different from the cycle-scheduling of schedule_insns;
78 	 here the insns are not scheduled monotonically top-down (nor bottom-
79 	 up).
80       3. If failed in scheduling all insns - bump II++ and try again, unless
81 	 II reaches an upper bound MaxII, in which case report failure.
82    5. If we succeeded in scheduling the loop within II cycles, we now
83       generate prolog and epilog, decrease the counter of the loop, and
84       perform modulo variable expansion for live ranges that span more than
85       II cycles (i.e. use register copies to prevent a def from overwriting
86       itself before reaching the use).
87 */
88 
89 
90 /* This page defines partial-schedule structures and functions for
91    modulo scheduling.  */
92 
typedef struct partial_schedule *partial_schedule_ptr;
typedef struct ps_insn *ps_insn_ptr;

/* The minimum (absolute) cycle that a node of ps was scheduled in.  */
#define PS_MIN_CYCLE(ps) (((partial_schedule_ptr)(ps))->min_cycle)

/* The maximum (absolute) cycle that a node of ps was scheduled in.  */
#define PS_MAX_CYCLE(ps) (((partial_schedule_ptr)(ps))->max_cycle)

/* Perform signed modulo, always returning a non-negative value.
   Note: X and Y are evaluated more than once, so the arguments must
   be side-effect free.  */
#define SMODULO(x,y) ((x) % (y) < 0 ? ((x) % (y) + (y)) : (x) % (y))

/* The number of different iterations the nodes in ps span, assuming
   the stage boundaries are placed efficiently.  This computes
   ceil ((max_cycle - min_cycle + 1) / ii) in integer arithmetic.  */
#define PS_STAGE_COUNT(ps) ((PS_MAX_CYCLE (ps) - PS_MIN_CYCLE (ps) \
			     + 1 + (ps)->ii - 1) / (ps)->ii)
109 
/* A single instruction in the partial schedule.  */
struct ps_insn
{
  /* The corresponding DDG_NODE.  */
  ddg_node_ptr node;

  /* The (absolute) cycle in which the PS instruction is scheduled.
     Same as SCHED_TIME (node).  */
  int cycle;

  /* The next/prev PS_INSN in the same row (rows are doubly linked
     lists, one per cycle modulo ii).  */
  ps_insn_ptr next_in_row,
	      prev_in_row;

  /* The number of nodes in the same row that come after this node.  */
  int row_rest_count;
};
127 
/* Holds the partial schedule as an array of II rows.  Each entry of the
   array points to a linked list of PS_INSNs, which represents the
   instructions that are scheduled for that row.  */
struct partial_schedule
{
  int ii;	/* Number of rows in the partial schedule.  */
  int history;  /* Threshold for conflict checking using DFA.  */

  /* rows[i] points to linked list of insns scheduled in row i (0<=i<ii).  */
  ps_insn_ptr *rows;

  /* The earliest absolute cycle of an insn in the partial schedule.  */
  int min_cycle;

  /* The latest absolute cycle of an insn in the partial schedule.  */
  int max_cycle;

  ddg_ptr g;	/* The DDG of the insns in the partial schedule.  */
};
147 
/* We use this to record all the register replacements we do in
   the kernel so we can undo SMS if it is not profitable.
   Entries form a singly linked list, newest first.  */
struct undo_replace_buff_elem
{
  rtx insn;      /* The insn in which the replacement was performed.  */
  rtx orig_reg;  /* The register originally used by INSN.  */
  rtx new_reg;   /* The reg-move copy that replaced ORIG_REG.  */
  struct undo_replace_buff_elem *next;
};
157 
158 
159 
/* Partial-schedule primitives.  */
static partial_schedule_ptr create_partial_schedule (int ii, ddg_ptr, int history);
static void free_partial_schedule (partial_schedule_ptr);
static void reset_partial_schedule (partial_schedule_ptr, int new_ii);
void print_partial_schedule (partial_schedule_ptr, FILE *);
static int kernel_number_of_cycles (rtx first_insn, rtx last_insn);
static ps_insn_ptr ps_add_node_check_conflicts (partial_schedule_ptr,
						ddg_node_ptr node, int cycle,
						sbitmap must_precede,
						sbitmap must_follow);
static void rotate_partial_schedule (partial_schedule_ptr, int);
void set_row_column_for_ps (partial_schedule_ptr);
static bool ps_unschedule_node (partial_schedule_ptr, ddg_node_ptr );
172 
173 
/* This page defines constants and structures for the modulo scheduling
   driver.  */

/* As in haifa-sched.c:  */
/* issue_rate is the number of insns that can be scheduled in the same
   machine cycle.  It can be defined in the config/mach/mach.h file,
   otherwise we set it to 1.  */

static int issue_rate;

static int sms_order_nodes (ddg_ptr, int, int * result);
static void set_node_sched_params (ddg_ptr);
static partial_schedule_ptr sms_schedule_by_order (ddg_ptr, int, int, int *);
static void permute_partial_schedule (partial_schedule_ptr ps, rtx last);
static void generate_prolog_epilog (partial_schedule_ptr ,struct loop * loop, rtx);
static void duplicate_insns_of_cycles (partial_schedule_ptr ps,
				       int from_stage, int to_stage,
				       int is_prolog);

/* Accessors for the scheduling parameters of a DDG node.  They assume
   the node's aux.info field points at its node_sched_params record,
   which is set up by set_node_sched_params.  */
#define SCHED_ASAP(x) (((node_sched_params_ptr)(x)->aux.info)->asap)
#define SCHED_TIME(x) (((node_sched_params_ptr)(x)->aux.info)->time)
#define SCHED_FIRST_REG_MOVE(x) \
	(((node_sched_params_ptr)(x)->aux.info)->first_reg_move)
#define SCHED_NREG_MOVES(x) \
	(((node_sched_params_ptr)(x)->aux.info)->nreg_moves)
#define SCHED_ROW(x) (((node_sched_params_ptr)(x)->aux.info)->row)
#define SCHED_STAGE(x) (((node_sched_params_ptr)(x)->aux.info)->stage)
#define SCHED_COLUMN(x) (((node_sched_params_ptr)(x)->aux.info)->column)
202 
/* The scheduling parameters held for each node (accessed through the
   SCHED_* macros above).  */
typedef struct node_sched_params
{
  int asap;	/* A lower-bound on the absolute scheduling cycle.  */
  int time;	/* The absolute scheduling cycle (time >= asap).  */

  /* The following field (first_reg_move) is a pointer to the first
     register-move instruction added to handle the modulo-variable-expansion
     of the register defined by this node.  This register-move copies the
     original register defined by the node.  */
  rtx first_reg_move;

  /* The number of register-move instructions added, immediately preceding
     first_reg_move.  */
  int nreg_moves;

  int row;    /* Holds time % ii.  */
  int stage;  /* Holds time / ii.  */

  /* The column of a node inside the ps.  If nodes u, v are on the same row,
     u will precede v if column (u) < column (v).  */
  int column;
} *node_sched_params_ptr;
226 
227 
228 /* The following three functions are copied from the current scheduler
229    code in order to use sched_analyze() for computing the dependencies.
230    They are used when initializing the sched_info structure.  */
231 static const char *
sms_print_insn(rtx insn,int aligned ATTRIBUTE_UNUSED)232 sms_print_insn (rtx insn, int aligned ATTRIBUTE_UNUSED)
233 {
234   static char tmp[80];
235 
236   sprintf (tmp, "i%4d", INSN_UID (insn));
237   return tmp;
238 }
239 
/* SMS needs no special handling of jump register dependencies; this
   empty stub only exists to fill the corresponding sched_info hook.  */
static void
compute_jump_reg_dependencies (rtx insn ATTRIBUTE_UNUSED,
			       regset cond_exec ATTRIBUTE_UNUSED,
			       regset used ATTRIBUTE_UNUSED,
			       regset set ATTRIBUTE_UNUSED)
{
}
247 
/* Scheduler hooks installed as current_sched_info before sched_init is
   called.  SMS only supplies the insn-printing and jump-dependency
   hooks; the remaining (NULL/0) members are unused here.  */
static struct sched_info sms_sched_info =
{
  NULL,
  NULL,
  NULL,
  NULL,
  NULL,
  sms_print_insn,
  NULL,
  compute_jump_reg_dependencies,
  NULL, NULL,
  NULL, NULL,
  0, 0, 0,

  NULL, NULL, NULL, NULL, NULL,
#ifdef ENABLE_CHECKING
  NULL,
#endif
  0
};
268 
269 
/* Return the register decremented and tested in INSN,
   or zero if it is not a decrement-and-branch insn.  */

static rtx
doloop_register_get (rtx insn ATTRIBUTE_UNUSED)
{
#ifdef HAVE_doloop_end
  rtx pattern, reg, condition;

  /* Only a jump can be a doloop closing branch.  */
  if (! JUMP_P (insn))
    return NULL_RTX;

  pattern = PATTERN (insn);
  condition = doloop_condition_get (pattern);
  if (! condition)
    return NULL_RTX;

  /* The count register is either the first operand of the condition
     directly, or appears inside a (plus (reg) ...) when the decrement
     is folded into the comparison.  doloop_condition_get is expected
     to produce one of these two forms, hence the gcc_unreachable.  */
  if (REG_P (XEXP (condition, 0)))
    reg = XEXP (condition, 0);
  else if (GET_CODE (XEXP (condition, 0)) == PLUS
	   && REG_P (XEXP (XEXP (condition, 0), 0)))
    reg = XEXP (XEXP (condition, 0), 0);
  else
    gcc_unreachable ();

  return reg;
#else
  /* This target has no doloop_end pattern.  */
  return NULL_RTX;
#endif
}
300 
301 /* Check if COUNT_REG is set to a constant in the PRE_HEADER block, so
302    that the number of iterations is a compile-time constant.  If so,
303    return the rtx that sets COUNT_REG to a constant, and set COUNT to
304    this constant.  Otherwise return 0.  */
305 static rtx
const_iteration_count(rtx count_reg,basic_block pre_header,HOST_WIDEST_INT * count)306 const_iteration_count (rtx count_reg, basic_block pre_header,
307 		       HOST_WIDEST_INT * count)
308 {
309   rtx insn;
310   rtx head, tail;
311 
312   if (! pre_header)
313     return NULL_RTX;
314 
315   get_ebb_head_tail (pre_header, pre_header, &head, &tail);
316 
317   for (insn = tail; insn != PREV_INSN (head); insn = PREV_INSN (insn))
318     if (INSN_P (insn) && single_set (insn) &&
319 	rtx_equal_p (count_reg, SET_DEST (single_set (insn))))
320       {
321 	rtx pat = single_set (insn);
322 
323 	if (GET_CODE (SET_SRC (pat)) == CONST_INT)
324 	  {
325 	    *count = INTVAL (SET_SRC (pat));
326 	    return insn;
327 	  }
328 
329 	return NULL_RTX;
330       }
331 
332   return NULL_RTX;
333 }
334 
335 /* A very simple resource-based lower bound on the initiation interval.
336    ??? Improve the accuracy of this bound by considering the
337    utilization of various units.  */
338 static int
res_MII(ddg_ptr g)339 res_MII (ddg_ptr g)
340 {
341   return (g->num_nodes / issue_rate);
342 }
343 
344 
345 /* Points to the array that contains the sched data for each node.  */
346 static node_sched_params_ptr node_sched_params;
347 
348 /* Allocate sched_params for each node and initialize it.  Assumes that
349    the aux field of each node contain the asap bound (computed earlier),
350    and copies it into the sched_params field.  */
351 static void
set_node_sched_params(ddg_ptr g)352 set_node_sched_params (ddg_ptr g)
353 {
354   int i;
355 
356   /* Allocate for each node in the DDG a place to hold the "sched_data".  */
357   /* Initialize ASAP/ALAP/HIGHT to zero.  */
358   node_sched_params = (node_sched_params_ptr)
359 		       xcalloc (g->num_nodes,
360 				sizeof (struct node_sched_params));
361 
362   /* Set the pointer of the general data of the node to point to the
363      appropriate sched_params structure.  */
364   for (i = 0; i < g->num_nodes; i++)
365     {
366       /* Watch out for aliasing problems?  */
367       node_sched_params[i].asap = g->nodes[i].aux.count;
368       g->nodes[i].aux.info = &node_sched_params[i];
369     }
370 }
371 
372 static void
print_node_sched_params(FILE * file,int num_nodes)373 print_node_sched_params (FILE * file, int num_nodes)
374 {
375   int i;
376 
377   if (! file)
378     return;
379   for (i = 0; i < num_nodes; i++)
380     {
381       node_sched_params_ptr nsp = &node_sched_params[i];
382       rtx reg_move = nsp->first_reg_move;
383       int j;
384 
385       fprintf (file, "Node %d:\n", i);
386       fprintf (file, " asap = %d:\n", nsp->asap);
387       fprintf (file, " time = %d:\n", nsp->time);
388       fprintf (file, " nreg_moves = %d:\n", nsp->nreg_moves);
389       for (j = 0; j < nsp->nreg_moves; j++)
390 	{
391 	  fprintf (file, " reg_move = ");
392 	  print_rtl_single (file, reg_move);
393 	  reg_move = PREV_INSN (reg_move);
394 	}
395     }
396 }
397 
398 /* Calculate an upper bound for II.  SMS should not schedule the loop if it
399    requires more cycles than this bound.  Currently set to the sum of the
400    longest latency edge for each node.  Reset based on experiments.  */
401 static int
calculate_maxii(ddg_ptr g)402 calculate_maxii (ddg_ptr g)
403 {
404   int i;
405   int maxii = 0;
406 
407   for (i = 0; i < g->num_nodes; i++)
408     {
409       ddg_node_ptr u = &g->nodes[i];
410       ddg_edge_ptr e;
411       int max_edge_latency = 0;
412 
413       for (e = u->out; e; e = e->next_out)
414 	max_edge_latency = MAX (max_edge_latency, e->latency);
415 
416       maxii += max_edge_latency;
417     }
418   return maxii;
419 }
420 
/*
   Breaking intra-loop register anti-dependences:
   Each intra-loop register anti-dependence implies a cross-iteration true
   dependence of distance 1. Therefore, we can remove such false dependencies
   and figure out if the partial schedule broke them by checking if (for a
   true-dependence of distance 1): SCHED_TIME (def) < SCHED_TIME (use) and
   if so generate a register move.   The number of such moves is equal to:
              SCHED_TIME (use) - SCHED_TIME (def)       { 0 broken
   nreg_moves = ----------------------------------- + 1 - {   dependence.
                            ii                          { 1 if not.

   Returns the list of replacements performed (for possible undoing),
   and records first_reg_move/nreg_moves on each node via SCHED_*.  */
static struct undo_replace_buff_elem *
generate_reg_moves (partial_schedule_ptr ps)
{
  ddg_ptr g = ps->g;
  int ii = ps->ii;
  int i;
  struct undo_replace_buff_elem *reg_move_replaces = NULL;

  for (i = 0; i < g->num_nodes; i++)
    {
      ddg_node_ptr u = &g->nodes[i];
      ddg_edge_ptr e;
      int nreg_moves = 0, i_reg_move;
      sbitmap *uses_of_defs;
      rtx last_reg_move;
      rtx prev_reg, old_reg;

      /* Compute the number of reg_moves needed for u, by looking at life
	 ranges started at u (excluding self-loops).  */
      for (e = u->out; e; e = e->next_out)
	if (e->type == TRUE_DEP && e->dest != e->src)
	  {
	    int nreg_moves4e = (SCHED_TIME (e->dest) - SCHED_TIME (e->src)) / ii;

            if (e->distance == 1)
              nreg_moves4e = (SCHED_TIME (e->dest) - SCHED_TIME (e->src) + ii) / ii;

	    /* If dest precedes src in the schedule of the kernel, then dest
	       will read before src writes and we can save one reg_copy.  */
	    if (SCHED_ROW (e->dest) == SCHED_ROW (e->src)
		&& SCHED_COLUMN (e->dest) < SCHED_COLUMN (e->src))
	      nreg_moves4e--;

	    nreg_moves = MAX (nreg_moves, nreg_moves4e);
	  }

      if (nreg_moves == 0)
	continue;

      /* Every use of the register defined by node may require a different
	 copy of this register, depending on the time the use is scheduled.
	 Set a bitmap vector, telling which nodes use each copy of this
	 register.  */
      uses_of_defs = sbitmap_vector_alloc (nreg_moves, g->num_nodes);
      sbitmap_vector_zero (uses_of_defs, nreg_moves);
      for (e = u->out; e; e = e->next_out)
	if (e->type == TRUE_DEP && e->dest != e->src)
	  {
	    int dest_copy = (SCHED_TIME (e->dest) - SCHED_TIME (e->src)) / ii;

	    if (e->distance == 1)
	      dest_copy = (SCHED_TIME (e->dest) - SCHED_TIME (e->src) + ii) / ii;

	    if (SCHED_ROW (e->dest) == SCHED_ROW (e->src)
		&& SCHED_COLUMN (e->dest) < SCHED_COLUMN (e->src))
	      dest_copy--;

	    if (dest_copy)
	      SET_BIT (uses_of_defs[dest_copy - 1], e->dest->cuid);
	  }

      /* Now generate the reg_moves, attaching relevant uses to them.  */
      SCHED_NREG_MOVES (u) = nreg_moves;
      old_reg = prev_reg = copy_rtx (SET_DEST (single_set (u->insn)));
      last_reg_move = u->insn;

      for (i_reg_move = 0; i_reg_move < nreg_moves; i_reg_move++)
	{
	  unsigned int i_use = 0;
	  rtx new_reg = gen_reg_rtx (GET_MODE (prev_reg));
	  rtx reg_move = gen_move_insn (new_reg, prev_reg);
	  sbitmap_iterator sbi;

	  /* Chain the copy (new_reg <- prev_reg) immediately before the
	     previous copy, or before the defining insn for the first one.  */
	  add_insn_before (reg_move, last_reg_move);
	  last_reg_move = reg_move;

	  if (!SCHED_FIRST_REG_MOVE (u))
	    SCHED_FIRST_REG_MOVE (u) = reg_move;

	  /* Redirect the uses that need this particular copy, recording
	     each replacement so undo_generate_reg_moves can revert it.  */
	  EXECUTE_IF_SET_IN_SBITMAP (uses_of_defs[i_reg_move], 0, i_use, sbi)
	    {
	      struct undo_replace_buff_elem *rep;

	      rep = (struct undo_replace_buff_elem *)
		    xcalloc (1, sizeof (struct undo_replace_buff_elem));
	      rep->insn = g->nodes[i_use].insn;
	      rep->orig_reg = old_reg;
	      rep->new_reg = new_reg;

	      if (! reg_move_replaces)
		reg_move_replaces = rep;
	      else
		{
		  rep->next = reg_move_replaces;
		  reg_move_replaces = rep;
		}

	      replace_rtx (g->nodes[i_use].insn, old_reg, new_reg);
	    }

	  prev_reg = new_reg;
	}
      sbitmap_vector_free (uses_of_defs);
    }
  return reg_move_replaces;
}
538 
/* We call this when we want to undo the SMS schedule for a given loop.
   One of the things that we do is to delete the register moves generated
   for the sake of SMS; this function deletes the register move instructions
   recorded in the undo buffer.  */
static void
undo_generate_reg_moves (partial_schedule_ptr ps,
			 struct undo_replace_buff_elem *reg_move_replaces)
{
  int i,j;

  /* Delete the SCHED_NREG_MOVES reg-move insns that immediately precede
     each node's first_reg_move (they were chained backwards by
     generate_reg_moves).  */
  for (i = 0; i < ps->g->num_nodes; i++)
    {
      ddg_node_ptr u = &ps->g->nodes[i];
      rtx prev;
      rtx crr = SCHED_FIRST_REG_MOVE (u);

      for (j = 0; j < SCHED_NREG_MOVES (u); j++)
	{
	  prev = PREV_INSN (crr);
	  delete_insn (crr);
	  crr = prev;
	}
      SCHED_FIRST_REG_MOVE (u) = NULL_RTX;
    }

  /* Put the original registers back into the insns that were rewritten
     to use the copies.  The buffer itself is freed separately by
     free_undo_replace_buff.  */
  while (reg_move_replaces)
    {
      struct undo_replace_buff_elem *rep = reg_move_replaces;

      reg_move_replaces = reg_move_replaces->next;
      replace_rtx (rep->insn, rep->new_reg, rep->orig_reg);
    }
}
572 
573 /* Free memory allocated for the undo buffer.  */
574 static void
free_undo_replace_buff(struct undo_replace_buff_elem * reg_move_replaces)575 free_undo_replace_buff (struct undo_replace_buff_elem *reg_move_replaces)
576 {
577 
578   while (reg_move_replaces)
579     {
580       struct undo_replace_buff_elem *rep = reg_move_replaces;
581 
582       reg_move_replaces = reg_move_replaces->next;
583       free (rep);
584     }
585 }
586 
587 /* Bump the SCHED_TIMEs of all nodes to start from zero.  Set the values
588    of SCHED_ROW and SCHED_STAGE.  */
589 static void
normalize_sched_times(partial_schedule_ptr ps)590 normalize_sched_times (partial_schedule_ptr ps)
591 {
592   int i;
593   ddg_ptr g = ps->g;
594   int amount = PS_MIN_CYCLE (ps);
595   int ii = ps->ii;
596 
597   /* Don't include the closing branch assuming that it is the last node.  */
598   for (i = 0; i < g->num_nodes - 1; i++)
599     {
600       ddg_node_ptr u = &g->nodes[i];
601       int normalized_time = SCHED_TIME (u) - amount;
602 
603       gcc_assert (normalized_time >= 0);
604 
605       SCHED_TIME (u) = normalized_time;
606       SCHED_ROW (u) = normalized_time % ii;
607       SCHED_STAGE (u) = normalized_time / ii;
608     }
609 }
610 
611 /* Set SCHED_COLUMN of each node according to its position in PS.  */
612 static void
set_columns_for_ps(partial_schedule_ptr ps)613 set_columns_for_ps (partial_schedule_ptr ps)
614 {
615   int row;
616 
617   for (row = 0; row < ps->ii; row++)
618     {
619       ps_insn_ptr cur_insn = ps->rows[row];
620       int column = 0;
621 
622       for (; cur_insn; cur_insn = cur_insn->next_in_row)
623 	SCHED_COLUMN (cur_insn->node) = column++;
624     }
625 }
626 
/* Permute the insns according to their order in PS, from row 0 to
   row ii-1, and position them right before LAST.  This schedules
   the insns of the loop kernel.  */
static void
permute_partial_schedule (partial_schedule_ptr ps, rtx last)
{
  int ii = ps->ii;
  int row;
  ps_insn_ptr ps_ij;

  for (row = 0; row < ii ; row++)
    for (ps_ij = ps->rows[row]; ps_ij; ps_ij = ps_ij->next_in_row)
      /* Move the insn together with its preceding notes, unless it is
	 already in place.  */
      if (PREV_INSN (last) != ps_ij->node->insn)
      	reorder_insns_nobb (ps_ij->node->first_note, ps_ij->node->insn,
			    PREV_INSN (last));
}
643 
/* As part of undoing SMS we return to the original ordering of the
   instructions inside the loop kernel.  Given the partial schedule PS, this
   function returns the ordering of the instruction according to their CUID
   in the DDG (PS->G), which is the original order of the instruction before
   performing SMS.  */
static void
undo_permute_partial_schedule (partial_schedule_ptr ps, rtx last)
{
  int i;

  /* Re-place the insns in CUID order right before LAST, stopping as
     soon as LAST itself (or its first note) is reached.  */
  for (i = 0 ; i < ps->g->num_nodes; i++)
    if (last == ps->g->nodes[i].insn
	|| last == ps->g->nodes[i].first_note)
      break;
    else if (PREV_INSN (last) != ps->g->nodes[i].insn)
      reorder_insns_nobb (ps->g->nodes[i].first_note, ps->g->nodes[i].insn,
			  PREV_INSN (last));
}
662 
/* Used to generate the prologue & epilogue.  Duplicate the subset of
   nodes whose stages are between FROM_STAGE and TO_STAGE (inclusive
   of both), together with a prefix/suffix of their reg_moves.
   FOR_PROLOG is nonzero when generating the prolog (the insns are
   emitted into the sequence opened by the caller).  */
static void
duplicate_insns_of_cycles (partial_schedule_ptr ps, int from_stage,
			   int to_stage, int for_prolog)
{
  int row;
  ps_insn_ptr ps_ij;

  for (row = 0; row < ps->ii; row++)
    for (ps_ij = ps->rows[row]; ps_ij; ps_ij = ps_ij->next_in_row)
      {
	ddg_node_ptr u_node = ps_ij->node;
	int j, i_reg_moves;
	rtx reg_move = NULL_RTX;

	if (for_prolog)
	  {
	    /* SCHED_STAGE (u_node) >= from_stage == 0.  Generate increasing
	       number of reg_moves starting with the second occurrence of
	       u_node, which is generated if its SCHED_STAGE <= to_stage.  */
	    i_reg_moves = to_stage - SCHED_STAGE (u_node) + 1;
	    i_reg_moves = MAX (i_reg_moves, 0);
	    i_reg_moves = MIN (i_reg_moves, SCHED_NREG_MOVES (u_node));

	    /* The reg_moves start from the *first* reg_move backwards.  */
	    if (i_reg_moves)
	      {
		reg_move = SCHED_FIRST_REG_MOVE (u_node);
		for (j = 1; j < i_reg_moves; j++)
		  reg_move = PREV_INSN (reg_move);
	      }
	  }
	else /* It's for the epilog.  */
	  {
	    /* SCHED_STAGE (u_node) <= to_stage.  Generate all reg_moves,
	       starting to decrease one stage after u_node no longer occurs;
	       that is, generate all reg_moves until
	       SCHED_STAGE (u_node) == from_stage - 1.  */
	    i_reg_moves = SCHED_NREG_MOVES (u_node)
	    	       - (from_stage - SCHED_STAGE (u_node) - 1);
	    i_reg_moves = MAX (i_reg_moves, 0);
	    i_reg_moves = MIN (i_reg_moves, SCHED_NREG_MOVES (u_node));

	    /* The reg_moves start from the *last* reg_move forwards.  */
	    if (i_reg_moves)
	      {
		reg_move = SCHED_FIRST_REG_MOVE (u_node);
		for (j = 1; j < SCHED_NREG_MOVES (u_node); j++)
		  reg_move = PREV_INSN (reg_move);
	      }
	  }

	/* Emit copies of the selected reg_moves, then (if the node's
	   stage is in range) a copy of the node's insn chain itself.  */
	for (j = 0; j < i_reg_moves; j++, reg_move = NEXT_INSN (reg_move))
	  emit_insn (copy_rtx (PATTERN (reg_move)));
	if (SCHED_STAGE (u_node) >= from_stage
	    && SCHED_STAGE (u_node) <= to_stage)
	  duplicate_insn_chain (u_node->first_note, u_node->insn);
      }
}
724 
725 
/* Generate the instructions (including reg_moves) for prolog & epilog.
   COUNT_REG, when non-NULL, is the doloop count register, which must be
   pre-decremented by the stage count to account for iterations consumed
   by the prolog/epilog.  */
static void
generate_prolog_epilog (partial_schedule_ptr ps, struct loop * loop, rtx count_reg)
{
  int i;
  int last_stage = PS_STAGE_COUNT (ps) - 1;
  edge e;

  /* Generate the prolog, inserting its insns on the loop-entry edge.  */
  start_sequence ();

  if (count_reg)
   /* Generate a subtract instruction at the beginning of the prolog to
      adjust the loop count by STAGE_COUNT.  */
   emit_insn (gen_sub2_insn (count_reg, GEN_INT (last_stage)));

  for (i = 0; i < last_stage; i++)
    duplicate_insns_of_cycles (ps, 0, i, 1);

  /* Put the prolog on the one and only entry edge.  */
  e = loop_preheader_edge (loop);
  loop_split_edge_with(e , get_insns());

  end_sequence ();

  /* Generate the epilog, inserting its insns on the loop-exit edge.  */
  start_sequence ();

  for (i = 0; i < last_stage; i++)
    duplicate_insns_of_cycles (ps, i + 1, last_stage, 0);

  /* Put the epilogue on the one and only one exit edge.  */
  gcc_assert (loop->single_exit);
  e = loop->single_exit;
  loop_split_edge_with(e , get_insns());
  end_sequence ();
}
763 
764 /* Return the line note insn preceding INSN, for debugging.  Taken from
765    emit-rtl.c.  */
766 static rtx
find_line_note(rtx insn)767 find_line_note (rtx insn)
768 {
769   for (; insn; insn = PREV_INSN (insn))
770     if (NOTE_P (insn)
771 	&& NOTE_LINE_NUMBER (insn) >= 0)
772       break;
773 
774   return insn;
775 }
776 
777 /* Return true if all the BBs of the loop are empty except the
778    loop header.  */
779 static bool
loop_single_full_bb_p(struct loop * loop)780 loop_single_full_bb_p (struct loop *loop)
781 {
782   unsigned i;
783   basic_block *bbs = get_loop_body (loop);
784 
785   for (i = 0; i < loop->num_nodes ; i++)
786     {
787       rtx head, tail;
788       bool empty_bb = true;
789 
790       if (bbs[i] == loop->header)
791         continue;
792 
793       /* Make sure that basic blocks other than the header
794          have only notes labels or jumps.  */
795       get_ebb_head_tail (bbs[i], bbs[i], &head, &tail);
796       for (; head != NEXT_INSN (tail); head = NEXT_INSN (head))
797         {
798           if (NOTE_P (head) || LABEL_P (head)
799  	      || (INSN_P (head) && JUMP_P (head)))
800  	    continue;
801  	  empty_bb = false;
802  	  break;
803         }
804 
805       if (! empty_bb)
806         {
807           free (bbs);
808           return false;
809         }
810     }
811   free (bbs);
812   return true;
813 }
814 
/* A simple loop from SMS point of view; it is a loop that is composed of
   either a single basic block or two BBs - a header and a latch.
   Note: LOOP is evaluated several times, so pass a side-effect-free
   expression.  */
#define SIMPLE_SMS_LOOP_P(loop) ((loop->num_nodes < 3 ) 		    \
				  && (EDGE_COUNT (loop->latch->preds) == 1) \
                                  && (EDGE_COUNT (loop->latch->succs) == 1))
820 
821 /* Return true if the loop is in its canonical form and false if not.
822    i.e. SIMPLE_SMS_LOOP_P and have one preheader block, and single exit.  */
823 static bool
loop_canon_p(struct loop * loop)824 loop_canon_p (struct loop *loop)
825 {
826 
827   if (loop->inner || ! loop->outer)
828     return false;
829 
830   if (!loop->single_exit)
831     {
832       if (dump_file)
833 	{
834 	  rtx line_note = find_line_note (BB_END (loop->header));
835 
836 	  fprintf (dump_file, "SMS loop many exits ");
837 	  if (line_note)
838 	    {
839 	      expanded_location xloc;
840 	      NOTE_EXPANDED_LOCATION (xloc, line_note);
841 	      fprintf (dump_file, " %s %d (file, line)\n",
842 		       xloc.file, xloc.line);
843 	    }
844 	}
845       return false;
846     }
847 
848   if (! SIMPLE_SMS_LOOP_P (loop) && ! loop_single_full_bb_p (loop))
849     {
850       if (dump_file)
851 	{
852 	  rtx line_note = find_line_note (BB_END (loop->header));
853 
854 	  fprintf (dump_file, "SMS loop many BBs. ");
855 	  if (line_note)
856 	    {
857 	      expanded_location xloc;
858   	      NOTE_EXPANDED_LOCATION (xloc, line_note);
859 	      fprintf (dump_file, " %s %d (file, line)\n",
860 		       xloc.file, xloc.line);
861 	    }
862 	}
863       return false;
864     }
865 
866     return true;
867 }
868 
/* If there are more than one entry for the loop,
   make it one by splitting the first entry edge and
   redirecting the others to the new BB.  */
static void
canon_loop (struct loop *loop)
{
  edge e;
  edge_iterator i;

  /* Avoid annoying special cases of edges going to exit
     block.  */
  FOR_EACH_EDGE (e, i, EXIT_BLOCK_PTR->preds)
    if ((e->flags & EDGE_FALLTHRU) && (EDGE_COUNT (e->src->succs) > 1))
      loop_split_edge_with (e, NULL_RTX);

  /* If the latch is also the header, or has multiple successors,
     split the latch->header back edge to give the loop a proper
     latch block.  */
  if (loop->latch == loop->header
      || EDGE_COUNT (loop->latch->succs) > 1)
    {
      FOR_EACH_EDGE (e, i, loop->header->preds)
        if (e->src == loop->latch)
          break;
      loop_split_edge_with (e, NULL_RTX);
    }
}
893 
/* Main entry point of the SMS pass: attempt Swing Modulo Scheduling on
   every candidate loop of the current function.  Candidates are
   innermost single-basic-block doloops (see loop_canon_p and the
   doloop_register_get check below).  Works in two passes over the
   loops: first build a DDG per candidate loop, then schedule each DDG
   and, when profitable, rewrite the loop into kernel + prolog/epilog.  */
static void
sms_schedule (void)
{
  static int passes = 0;
  rtx insn;
  ddg_ptr *g_arr, g;
  int * node_order;
  int maxii;
  unsigned i,num_loops;
  partial_schedule_ptr ps;
  struct df *df;
  struct loops *loops;
  basic_block bb = NULL;
  /* Variables used only if loop versioning is needed.  */
  struct loop * nloop;
  basic_block condition_bb = NULL;
  edge latch_edge;
  gcov_type trip_count = 0;

  loops = loop_optimizer_init (LOOPS_HAVE_PREHEADERS
			       | LOOPS_HAVE_MARKED_SINGLE_EXITS);
  if (!loops)
    return;  /* There are no loops to schedule.  */

  /* Initialize issue_rate.  */
  if (targetm.sched.issue_rate)
    {
      int temp = reload_completed;

      /* Temporarily pretend reload has run: some targets' issue_rate
	 hooks answer differently before register allocation.  */
      reload_completed = 1;
      issue_rate = targetm.sched.issue_rate ();
      reload_completed = temp;
    }
  else
    issue_rate = 1;

  /* Initialize the scheduler.  */
  current_sched_info = &sms_sched_info;
  sched_init ();

  /* Init Data Flow analysis, to be used in interloop dep calculation.  */
  df = df_init (DF_HARD_REGS | DF_EQUIV_NOTES | DF_SUBREGS);
  df_rd_add_problem (df, 0);
  df_ru_add_problem (df, 0);
  df_chain_add_problem (df, DF_DU_CHAIN | DF_UD_CHAIN);
  df_analyze (df);

  if (dump_file)
    df_dump (df, dump_file);

  /* Allocate memory to hold the DDG array one entry for each loop.
     We use loop->num as index into this array.  */
  g_arr = XCNEWVEC (ddg_ptr, loops->num);


  /* Pass 1: build DDGs for all the relevant loops and hold them in
     G_ARR indexed by the loop index.  */
  for (i = 0; i < loops->num; i++)
    {
      rtx head, tail;
      rtx count_reg;
      struct loop *loop = loops->parray[i];

      /* For debugging: cap the total number of loops SMS examines.  */
      if ((passes++ > MAX_SMS_LOOP_NUMBER) && (MAX_SMS_LOOP_NUMBER != -1))
        {
          if (dump_file)
            fprintf (dump_file, "SMS reached MAX_PASSES... \n");

          break;
        }

      if (! loop_canon_p (loop))
        continue;

      if (! loop_single_full_bb_p (loop))
	continue;

      bb = loop->header;

      get_ebb_head_tail (bb, bb, &head, &tail);
      latch_edge = loop_latch_edge (loop);
      gcc_assert (loop->single_exit);
      /* Average trip count = (times the back edge was taken) /
	 (times the loop was entered/exited).  */
      if (loop->single_exit->count)
	trip_count = latch_edge->count / loop->single_exit->count;

      /* Perform SMS only on loops whose average iteration count is
	 above the threshold.  */

      if ( latch_edge->count
          && (latch_edge->count < loop->single_exit->count * SMS_LOOP_AVERAGE_COUNT_THRESHOLD))
	{
	  if (dump_file)
	    {
	      rtx line_note = find_line_note (tail);

	      if (line_note)
		{
		  expanded_location xloc;
		  NOTE_EXPANDED_LOCATION (xloc, line_note);
		  fprintf (dump_file, "SMS bb %s %d (file, line)\n",
			   xloc.file, xloc.line);
		}
	      fprintf (dump_file, "SMS single-bb-loop\n");
	      if (profile_info && flag_branch_probabilities)
	    	{
	      	  fprintf (dump_file, "SMS loop-count ");
	      	  fprintf (dump_file, HOST_WIDEST_INT_PRINT_DEC,
	             	   (HOST_WIDEST_INT) bb->count);
	      	  fprintf (dump_file, "\n");
                  fprintf (dump_file, "SMS trip-count ");
                  fprintf (dump_file, HOST_WIDEST_INT_PRINT_DEC,
                           (HOST_WIDEST_INT) trip_count);
                  fprintf (dump_file, "\n");
	      	  fprintf (dump_file, "SMS profile-sum-max ");
	      	  fprintf (dump_file, HOST_WIDEST_INT_PRINT_DEC,
	          	   (HOST_WIDEST_INT) profile_info->sum_max);
	      	  fprintf (dump_file, "\n");
	    	}
	    }
          continue;
        }

      /* Make sure this is a doloop.  */
      if ( !(count_reg = doloop_register_get (tail)))
	continue;

      /* Don't handle BBs with calls or barriers, or !single_set insns.  */
      for (insn = head; insn != NEXT_INSN (tail); insn = NEXT_INSN (insn))
	if (CALL_P (insn)
	    || BARRIER_P (insn)
	    || (INSN_P (insn) && !JUMP_P (insn)
		&& !single_set (insn) && GET_CODE (PATTERN (insn)) != USE))
	  break;

      /* If the scan above stopped early, some insn disqualified the
	 loop; report why and skip it.  */
      if (insn != NEXT_INSN (tail))
	{
	  if (dump_file)
	    {
	      if (CALL_P (insn))
		fprintf (dump_file, "SMS loop-with-call\n");
	      else if (BARRIER_P (insn))
		fprintf (dump_file, "SMS loop-with-barrier\n");
	      else
		fprintf (dump_file, "SMS loop-with-not-single-set\n");
	      print_rtl_single (dump_file, insn);
	    }

	  continue;
	}

      if (! (g = create_ddg (bb, df, 0)))
        {
          if (dump_file)
	    fprintf (dump_file, "SMS doloop\n");
	  continue;
        }

      g_arr[i] = g;
    }

  /* Release Data Flow analysis data structures.  */
  df_finish (df);
  df = NULL;

  /* We don't want to perform SMS on new loops - created by versioning.  */
  num_loops = loops->num;
  /* Pass 2: go over the built DDGs and perform SMS for each one of them.  */
  for (i = 0; i < num_loops; i++)
    {
      rtx head, tail;
      rtx count_reg, count_init;
      int mii, rec_mii;
      unsigned stage_count = 0;
      HOST_WIDEST_INT loop_count = 0;
      struct loop *loop = loops->parray[i];

      if (! (g = g_arr[i]))
        continue;

      if (dump_file)
	print_ddg (dump_file, g);

      get_ebb_head_tail (loop->header, loop->header, &head, &tail);

      latch_edge = loop_latch_edge (loop);
      gcc_assert (loop->single_exit);
      if (loop->single_exit->count)
	trip_count = latch_edge->count / loop->single_exit->count;

      if (dump_file)
	{
	  rtx line_note = find_line_note (tail);

	  if (line_note)
	    {
	      expanded_location xloc;
	      NOTE_EXPANDED_LOCATION (xloc, line_note);
	      fprintf (dump_file, "SMS bb %s %d (file, line)\n",
		       xloc.file, xloc.line);
	    }
	  fprintf (dump_file, "SMS single-bb-loop\n");
	  if (profile_info && flag_branch_probabilities)
	    {
	      fprintf (dump_file, "SMS loop-count ");
	      fprintf (dump_file, HOST_WIDEST_INT_PRINT_DEC,
	               (HOST_WIDEST_INT) bb->count);
	      fprintf (dump_file, "\n");
	      fprintf (dump_file, "SMS profile-sum-max ");
	      fprintf (dump_file, HOST_WIDEST_INT_PRINT_DEC,
	               (HOST_WIDEST_INT) profile_info->sum_max);
	      fprintf (dump_file, "\n");
	    }
	  fprintf (dump_file, "SMS doloop\n");
	  fprintf (dump_file, "SMS built-ddg %d\n", g->num_nodes);
          fprintf (dump_file, "SMS num-loads %d\n", g->num_loads);
          fprintf (dump_file, "SMS num-stores %d\n", g->num_stores);
	}


      /* If the loop has a doloop register, try to extract a constant
	 iteration count from the pre-header.  */
      count_init = NULL_RTX;
      if ((count_reg = doloop_register_get (tail)))
	{
	  basic_block pre_header;

	  pre_header = loop_preheader_edge (loop)->src;
	  count_init = const_iteration_count (count_reg, pre_header,
					      &loop_count);
	}
      /* Pass 1 only recorded loops with a doloop register.  */
      gcc_assert (count_reg);

      if (dump_file && count_init)
        {
          fprintf (dump_file, "SMS const-doloop ");
          fprintf (dump_file, HOST_WIDEST_INT_PRINT_DEC,
		     loop_count);
          fprintf (dump_file, "\n");
        }

      node_order = XNEWVEC (int, g->num_nodes);

      mii = 1; /* Need to pass some estimate of mii.  */
      rec_mii = sms_order_nodes (g, mii, node_order);
      /* The initiation interval starts from the larger of the
	 resource-bound MII and the recurrence-bound MII.  */
      mii = MAX (res_MII (g), rec_mii);
      maxii = (calculate_maxii (g) * SMS_MAX_II_FACTOR) / 100;

      if (dump_file)
	fprintf (dump_file, "SMS iis %d %d %d (rec_mii, mii, maxii)\n",
		 rec_mii, mii, maxii);

      /* After sms_order_nodes and before sms_schedule_by_order, to copy over
	 ASAP.  */
      set_node_sched_params (g);

      ps = sms_schedule_by_order (g, mii, maxii, node_order);

      if (ps)
	stage_count = PS_STAGE_COUNT (ps);

      /* Stage count of 1 means that there is no interleaving between
         iterations, let the scheduling passes do the job.  Also bail
	 out when the (known or profiled) trip count is too small to
	 fill the pipeline.  */
      if (stage_count < 1
	  || (count_init && (loop_count <= stage_count))
	  || (flag_branch_probabilities && (trip_count <= stage_count)))
	{
	  if (dump_file)
	    {
	      fprintf (dump_file, "SMS failed... \n");
	      fprintf (dump_file, "SMS sched-failed (stage-count=%d, loop-count=", stage_count);
	      fprintf (dump_file, HOST_WIDEST_INT_PRINT_DEC, loop_count);
	      fprintf (dump_file, ", trip-count=");
	      fprintf (dump_file, HOST_WIDEST_INT_PRINT_DEC, trip_count);
	      fprintf (dump_file, ")\n");
	    }
	  continue;
	}
      else
	{
	  int orig_cycles = kernel_number_of_cycles (BB_HEAD (g->bb), BB_END (g->bb));
	  int new_cycles;
	  struct undo_replace_buff_elem *reg_move_replaces;

	  if (dump_file)
	    {
	      fprintf (dump_file,
		       "SMS succeeded %d %d (with ii, sc)\n", ps->ii,
		       stage_count);
	      print_partial_schedule (ps, dump_file);
	      fprintf (dump_file,
		       "SMS Branch (%d) will later be scheduled at cycle %d.\n",
		       g->closing_branch->cuid, PS_MIN_CYCLE (ps) - 1);
	    }

	  /* Set the stage boundaries.  If the DDG is built with closing_branch_deps,
	     the closing_branch was scheduled and should appear in the last (ii-1)
	     row.  Otherwise, we are free to schedule the branch, and we let nodes
	     that were scheduled at the first PS_MIN_CYCLE cycle appear in the first
	     row; this should reduce stage_count to minimum.  */
	  normalize_sched_times (ps);
	  rotate_partial_schedule (ps, PS_MIN_CYCLE (ps));
	  set_columns_for_ps (ps);

	  /* Generate the kernel just to be able to measure its cycles.  */
	  permute_partial_schedule (ps, g->closing_branch->first_note);
	  reg_move_replaces = generate_reg_moves (ps);

	  /* Get the number of cycles the new kernel expect to execute in.  */
	  new_cycles = kernel_number_of_cycles (BB_HEAD (g->bb), BB_END (g->bb));

	  /* Get back to the original loop so we can do loop versioning.  */
	  undo_permute_partial_schedule (ps, g->closing_branch->first_note);
	  if (reg_move_replaces)
	    undo_generate_reg_moves (ps, reg_move_replaces);

	  if ( new_cycles >= orig_cycles)
	    {
	      /* SMS is not profitable so undo the permutation and reg move generation
	         and return the kernel to its original state.  */
	      if (dump_file)
		fprintf (dump_file, "Undoing SMS because it is not profitable.\n");

	    }
	  else
	    {
	      canon_loop (loop);

              /* In case the BCT count is not known, do loop versioning:
		 keep an unscheduled copy of the loop for runtime trip
		 counts that are too small for the pipelined kernel.  */
	      if (count_reg && ! count_init)
		{
		  rtx comp_rtx = gen_rtx_fmt_ee (GT, VOIDmode, count_reg,
						 GEN_INT(stage_count));

		  /* NOTE(review): NLOOP (the versioned copy) is not
		     referenced afterwards; presumably only the side
		     effect of creating the guarded copy is needed —
		     confirm.  */
		  nloop = loop_version (loops, loop, comp_rtx, &condition_bb,
					true);
		}

	      /* Set new iteration count of loop kernel: the prolog and
		 epilog together execute stage_count - 1 iterations.  */
              if (count_reg && count_init)
		SET_SRC (single_set (count_init)) = GEN_INT (loop_count
							     - stage_count + 1);

	      /* Now apply the scheduled kernel to the RTL of the loop.  */
	      permute_partial_schedule (ps, g->closing_branch->first_note);

              /* Mark this loop as software pipelined so the later
	      scheduling passes don't touch it.  */
	      if (! flag_resched_modulo_sched)
		g->bb->flags |= BB_DISABLE_SCHEDULE;
	      /* The life-info is not valid any more.  */
	      g->bb->flags |= BB_DIRTY;

	      reg_move_replaces = generate_reg_moves (ps);
	      if (dump_file)
		print_node_sched_params (dump_file, g->num_nodes);
	      /* Generate prolog and epilog.  */
	      if (count_reg && !count_init)
		generate_prolog_epilog (ps, loop, count_reg);
	      else
	 	generate_prolog_epilog (ps, loop, NULL_RTX);
	    }
	  free_undo_replace_buff (reg_move_replaces);
	}

      free_partial_schedule (ps);
      free (node_sched_params);
      free (node_order);
      free_ddg (g);
    }

  free (g_arr);

  /* Release scheduler data, needed until now because of DFA.  */
  sched_finish ();
  loop_optimizer_finalize (loops);
}
1272 
1273 /* The SMS scheduling algorithm itself
1274    -----------------------------------
1275    Input: 'O' an ordered list of insns of a loop.
1276    Output: A scheduling of the loop - kernel, prolog, and epilogue.
1277 
1278    'Q' is the empty Set
1279    'PS' is the partial schedule; it holds the currently scheduled nodes with
1280 	their cycle/slot.
1281    'PSP' previously scheduled predecessors.
1282    'PSS' previously scheduled successors.
1283    't(u)' the cycle where u is scheduled.
1284    'l(u)' is the latency of u.
1285    'd(v,u)' is the dependence distance from v to u.
1286    'ASAP(u)' the earliest time at which u could be scheduled as computed in
1287 	     the node ordering phase.
1288    'check_hardware_resources_conflicts(u, PS, c)'
1289 			     run a trace around cycle/slot through DFA model
1290 			     to check resource conflicts involving instruction u
1291 			     at cycle c given the partial schedule PS.
1292    'add_to_partial_schedule_at_time(u, PS, c)'
1293 			     Add the node/instruction u to the partial schedule
1294 			     PS at time c.
1295    'calculate_register_pressure(PS)'
1296 			     Given a schedule of instructions, calculate the register
1297 			     pressure it implies.  One implementation could be the
1298 			     maximum number of overlapping live ranges.
1299    'maxRP' The maximum allowed register pressure, it is usually derived from the number
1300 	   registers available in the hardware.
1301 
1302    1. II = MII.
1303    2. PS = empty list
1304    3. for each node u in O in pre-computed order
1305    4.   if (PSP(u) != Q && PSS(u) == Q) then
   5.     Early_start(u) = max ( t(v) + l(v) - d(v,u)*II ) over all v in PSP(u).
   6.     start = Early_start; end = Early_start + II - 1; step = 1
   11.  else if (PSP(u) == Q && PSS(u) != Q) then
   12.     Late_start(u) = min ( t(v) - l(v) + d(v,u)*II ) over all v in PSS(u).
   13.     start = Late_start; end = Late_start - II + 1; step = -1
   14.  else if (PSP(u) != Q && PSS(u) != Q) then
   15.     Early_start(u) = max ( t(v) + l(v) - d(v,u)*II ) over all v in PSP(u).
   16.     Late_start(u) = min ( t(v) - l(v) + d(v,u)*II ) over all v in PSS(u).
1314    17.     start = Early_start;
1315    18.     end = min(Early_start + II - 1 , Late_start);
1316    19.     step = 1
1317    20.     else "if (PSP(u) == Q && PSS(u) == Q)"
1318    21.	  start = ASAP(u); end = start + II - 1; step = 1
1319    22.  endif
1320 
1321    23.  success = false
1322    24.  for (c = start ; c != end ; c += step)
1323    25.     if check_hardware_resources_conflicts(u, PS, c) then
1324    26.       add_to_partial_schedule_at_time(u, PS, c)
1325    27.       success = true
1326    28.       break
1327    29.     endif
1328    30.  endfor
1329    31.  if (success == false) then
1330    32.    II = II + 1
1331    33.    if (II > maxII) then
1332    34.       finish - failed to schedule
1333    35.	 endif
1334    36.    goto 2.
1335    37.  endif
1336    38. endfor
1337    39. if (calculate_register_pressure(PS) > maxRP) then
1338    40.    goto 32.
1339    41. endif
1340    42. compute epilogue & prologue
1341    43. finish - succeeded to schedule
1342 */
1343 
1344 /* A limit on the number of cycles that resource conflicts can span.  ??? Should
1345    be provided by DFA, and be dependent on the type of insn scheduled.  Currently
1346    set to 0 to save compile time.  */
1347 #define DFA_HISTORY SMS_DFA_HISTORY
1348 
/* Given the partial schedule PS, this function calculates and returns the
   cycles in which we can schedule the node with the given index I.
   NOTE: Here we do the backtracking in SMS, in some special cases.  We have
   noticed that there are several cases in which we fail to SMS the loop
   because the sched window of a node is empty due to tight data-deps.  In
   such cases we want to unschedule some of the predecessors/successors
   until we get a non-empty scheduling window.  It returns -1 if the
   scheduling window is empty and zero otherwise.  */
1357 
static int
get_sched_window (partial_schedule_ptr ps, int *nodes_order, int i,
		  sbitmap sched_nodes, int ii, int *start_p, int *step_p, int *end_p)
{
  int start, step, end;
  ddg_edge_ptr e;
  int u = nodes_order [i];
  ddg_node_ptr u_node = &ps->g->nodes[u];
  sbitmap psp = sbitmap_alloc (ps->g->num_nodes);
  sbitmap pss = sbitmap_alloc (ps->g->num_nodes);
  sbitmap u_node_preds = NODE_PREDECESSORS (u_node);
  sbitmap u_node_succs = NODE_SUCCESSORS (u_node);
  int psp_not_empty;
  int pss_not_empty;

  /* 1. compute sched window for u (start, end, step).  */
  sbitmap_zero (psp);
  sbitmap_zero (pss);
  /* PSP/PSS = already-scheduled predecessors/successors of U.  */
  psp_not_empty = sbitmap_a_and_b_cg (psp, u_node_preds, sched_nodes);
  pss_not_empty = sbitmap_a_and_b_cg (pss, u_node_succs, sched_nodes);

  /* Only predecessors scheduled: schedule U forward, as early as its
     scheduled predecessors allow.  */
  if (psp_not_empty && !pss_not_empty)
    {
      int early_start = INT_MIN;

      end = INT_MAX;
      for (e = u_node->in; e != 0; e = e->next_in)
	{
	  ddg_node_ptr v_node = e->src;
	  if (TEST_BIT (sched_nodes, v_node->cuid))
	    {
	      /* Earliest cycle honoring latency, minus II per
		 iteration of dependence distance.  */
	      int node_st = SCHED_TIME (v_node)
	      		    + e->latency - (e->distance * ii);

	      early_start = MAX (early_start, node_st);

	      /* Keep memory deps within one II of each other.  */
	      if (e->data_type == MEM_DEP)
		end = MIN (end, SCHED_TIME (v_node) + ii - 1);
	    }
	}
      start = early_start;
      end = MIN (end, early_start + ii);
      step = 1;
    }

  /* Only successors scheduled: schedule U backward, as late as its
     scheduled successors allow.  */
  else if (!psp_not_empty && pss_not_empty)
    {
      int late_start = INT_MAX;

      end = INT_MIN;
      for (e = u_node->out; e != 0; e = e->next_out)
	{
	  ddg_node_ptr v_node = e->dest;
	  if (TEST_BIT (sched_nodes, v_node->cuid))
	    {
	      late_start = MIN (late_start,
				SCHED_TIME (v_node) - e->latency
				+ (e->distance * ii));
	      if (e->data_type == MEM_DEP)
		end = MAX (end, SCHED_TIME (v_node) - ii + 1);
	    }
	}
      start = late_start;
      end = MAX (end, late_start - ii);
      step = -1;
    }

  /* Both sides scheduled: the window is squeezed between the
     early-start bound from predecessors and the late-start bound from
     successors.  */
  else if (psp_not_empty && pss_not_empty)
    {
      int early_start = INT_MIN;
      int late_start = INT_MAX;

      start = INT_MIN;
      end = INT_MAX;
      for (e = u_node->in; e != 0; e = e->next_in)
	{
	  ddg_node_ptr v_node = e->src;

	  if (TEST_BIT (sched_nodes, v_node->cuid))
	    {
	      early_start = MAX (early_start,
				 SCHED_TIME (v_node) + e->latency
				 - (e->distance * ii));
	      if (e->data_type == MEM_DEP)
		end = MIN (end, SCHED_TIME (v_node) + ii - 1);
	    }
	}
      for (e = u_node->out; e != 0; e = e->next_out)
	{
	  ddg_node_ptr v_node = e->dest;

	  if (TEST_BIT (sched_nodes, v_node->cuid))
	    {
	      late_start = MIN (late_start,
				SCHED_TIME (v_node) - e->latency
				+ (e->distance * ii));
	      if (e->data_type == MEM_DEP)
		start = MAX (start, SCHED_TIME (v_node) - ii + 1);
	    }
	}
      start = MAX (start, early_start);
      /* late_start + 1 because END is exclusive when step == 1.  */
      end = MIN (end, MIN (early_start + ii, late_start + 1));
      step = 1;
    }
  else /* psp is empty && pss is empty.  */
    {
      /* No constraints yet: start at the node's ASAP time.  */
      start = SCHED_ASAP (u_node);
      end = start + ii;
      step = 1;
    }

  *start_p = start;
  *step_p = step;
  *end_p = end;
  sbitmap_free (psp);
  sbitmap_free (pss);

  /* The window [start, end) (or (end, start] for step == -1) is
     traversed exclusively of END, so equality already means empty.  */
  if ((start >= end && step == 1) || (start <= end && step == -1))
    return -1;
  else
    return 0;
}
1480 
1481 /* This function implements the scheduling algorithm for SMS according to the
1482    above algorithm.  */
/* This function implements the scheduling algorithm for SMS according to the
   above algorithm: try to place every node of G (in the precomputed
   NODES_ORDER) into a partial schedule with initiation interval II,
   starting at MII and increasing II on failure up to MAXII.  Returns
   the partial schedule, or NULL if no II < MAXII succeeds.  */
static partial_schedule_ptr
sms_schedule_by_order (ddg_ptr g, int mii, int maxii, int *nodes_order)
{
  int ii = mii;
  int i, c, success;
  int try_again_with_larger_ii = true;
  int num_nodes = g->num_nodes;
  ddg_edge_ptr e;
  int start, end, step; /* Place together into one struct?  */
  sbitmap sched_nodes = sbitmap_alloc (num_nodes);
  sbitmap must_precede = sbitmap_alloc (num_nodes);
  sbitmap must_follow = sbitmap_alloc (num_nodes);
  sbitmap tobe_scheduled = sbitmap_alloc (num_nodes);

  partial_schedule_ptr ps = create_partial_schedule (ii, g, DFA_HISTORY);

  sbitmap_ones (tobe_scheduled);
  sbitmap_zero (sched_nodes);

  /* Loop until every schedulable node is placed, or II reaches its
     upper bound.  */
  while ((! sbitmap_equal (tobe_scheduled, sched_nodes)
	 || try_again_with_larger_ii ) && ii < maxii)
    {
      int j;
      bool unscheduled_nodes = false;

      if (dump_file)
	fprintf(dump_file, "Starting with ii=%d\n", ii);
      if (try_again_with_larger_ii)
	{
	  try_again_with_larger_ii = false;
	  sbitmap_zero (sched_nodes);
	}

      for (i = 0; i < num_nodes; i++)
	{
	  int u = nodes_order[i];
  	  ddg_node_ptr u_node = &ps->g->nodes[u];
	  rtx insn = u_node->insn;

	  /* Notes and other non-insns need no cycle.  */
	  if (!INSN_P (insn))
	    {
	      RESET_BIT (tobe_scheduled, u);
	      continue;
	    }

	  if (JUMP_P (insn)) /* Closing branch handled later.  */
	    {
	      RESET_BIT (tobe_scheduled, u);
	      continue;
	    }

	  if (TEST_BIT (sched_nodes, u))
	    continue;

	  /* Try to get non-empty scheduling window.  Backtrack by
	     unscheduling previously placed neighbors of U until the
	     window opens up or we run out of earlier nodes.  */
	  j = i;
	  while (get_sched_window (ps, nodes_order, i, sched_nodes, ii, &start, &step, &end) < 0
		 && j > 0)
	    {
	      unscheduled_nodes = true;
	      if (TEST_BIT (NODE_PREDECESSORS (u_node), nodes_order[j - 1])
		  || TEST_BIT (NODE_SUCCESSORS (u_node), nodes_order[j - 1]))
		{
		  ps_unschedule_node (ps, &ps->g->nodes[nodes_order[j - 1]]);
		  RESET_BIT (sched_nodes, nodes_order [j - 1]);
		}
	      j--;
	    }
	  /* NOTE(review): the while-loop guard keeps J >= 0, so this
	     branch looks unreachable as written; when J == 0 with a
	     still-empty window, the start/end guard before the
	     placement loop below is what forces the II bump —
	     confirm this was the intent.  */
	  if (j < 0)
	    {
	      /* ??? Try backtracking instead of immediately ii++?  */
	      ii++;
	      try_again_with_larger_ii = true;
	      reset_partial_schedule (ps, ii);
	      break;
	    }
	  /* 2. Try scheduling u in window.  */
	  if (dump_file)
	    fprintf(dump_file, "Trying to schedule node %d in (%d .. %d) step %d\n",
		    u, start, end, step);

          /* use must_follow & must_precede bitmaps to determine order
	     of nodes within the cycle.  A neighbor whose dependence is
	     tight (latency exactly covered by II * distance) and whose
	     cycle coincides with the window boundary constrains U's
	     position inside that cycle's row.  */
          sbitmap_zero (must_precede);
          sbitmap_zero (must_follow);
      	  for (e = u_node->in; e != 0; e = e->next_in)
            if (TEST_BIT (sched_nodes, e->src->cuid)
	        && e->latency == (ii * e->distance)
		&& start == SCHED_TIME (e->src))
             SET_BIT (must_precede, e->src->cuid);

	  for (e = u_node->out; e != 0; e = e->next_out)
            if (TEST_BIT (sched_nodes, e->dest->cuid)
	        && e->latency == (ii * e->distance)
		&& end == SCHED_TIME (e->dest))
             SET_BIT (must_follow, e->dest->cuid);

	  success = 0;
	  /* Walk the window (END exclusive) until a conflict-free
	     cycle is found.  */
	  if ((step > 0 && start < end) ||  (step < 0 && start > end))
	    for (c = start; c != end; c += step)
	      {
		ps_insn_ptr psi;

		psi = ps_add_node_check_conflicts (ps, u_node, c,
						   must_precede,
						   must_follow);

  		if (psi)
		  {
		    SCHED_TIME (u_node) = c;
		    SET_BIT (sched_nodes, u);
		    success = 1;
		    if (dump_file)
		      fprintf(dump_file, "Schedule in %d\n", c);
		    break;
		  }
	      }
	  if (!success)
	    {
	      /* ??? Try backtracking instead of immediately ii++?  */
	      ii++;
	      try_again_with_larger_ii = true;
	      reset_partial_schedule (ps, ii);
	      break;
	    }
	  /* Nodes were unscheduled during backtracking; restart the
	     ordered walk so they get placed again.  */
	  if (unscheduled_nodes)
	    break;

	  /* ??? If (success), check register pressure estimates.  */
	} /* Continue with next node.  */
    } /* While try_again_with_larger_ii.  */

  sbitmap_free (sched_nodes);
  sbitmap_free (must_precede);
  sbitmap_free (must_follow);
  sbitmap_free (tobe_scheduled);

  if (ii >= maxii)
    {
      free_partial_schedule (ps);
      ps = NULL;
    }
  return ps;
}
1627 
1628 
1629 /* This page implements the algorithm for ordering the nodes of a DDG
1630    for modulo scheduling, activated through the
1631    "int sms_order_nodes (ddg_ptr, int mii, int * result)" API.  */
1632 
/* Accessors for the per-node ordering parameters that
   calculate_order_params stashes in each DDG node's aux.info.  */
#define ORDER_PARAMS(x) ((struct node_order_params *) (x)->aux.info)
#define ASAP(x) (ORDER_PARAMS ((x))->asap)
#define ALAP(x) (ORDER_PARAMS ((x))->alap)
#define HEIGHT(x) (ORDER_PARAMS ((x))->height)
/* Mobility: slack between the earliest and latest start times.  */
#define MOB(x) (ALAP ((x)) - ASAP ((x)))
/* A node's depth is identified with its ASAP time here.  */
#define DEPTH(x) (ASAP ((x)))

typedef struct node_order_params * nopa;

static void order_nodes_of_sccs (ddg_all_sccs_ptr, int * result);
static int order_nodes_in_scc (ddg_ptr, sbitmap, sbitmap, int*, int);
static nopa  calculate_order_params (ddg_ptr, int mii);
static int find_max_asap (ddg_ptr, sbitmap);
static int find_max_hv_min_mob (ddg_ptr, sbitmap);
static int find_max_dv_min_mob (ddg_ptr, sbitmap);

/* Direction in which a connected component is traversed while
   ordering its nodes.  */
enum sms_direction {BOTTOMUP, TOPDOWN};

/* Ordering parameters computed per DDG node (see
   calculate_order_params; backarcs with nonzero distance are
   ignored in these computations).  */
struct node_order_params
{
  int asap;    /* Earliest possible start time.  */
  int alap;    /* Latest possible start time.  */
  int height;  /* Longest latency path to a sink.  */
};
1657 
1658 /* Check if NODE_ORDER contains a permutation of 0 .. NUM_NODES-1.  */
1659 static void
check_nodes_order(int * node_order,int num_nodes)1660 check_nodes_order (int *node_order, int num_nodes)
1661 {
1662   int i;
1663   sbitmap tmp = sbitmap_alloc (num_nodes);
1664 
1665   sbitmap_zero (tmp);
1666 
1667   for (i = 0; i < num_nodes; i++)
1668     {
1669       int u = node_order[i];
1670 
1671       gcc_assert (u < num_nodes && u >= 0 && !TEST_BIT (tmp, u));
1672 
1673       SET_BIT (tmp, u);
1674     }
1675 
1676   sbitmap_free (tmp);
1677 }
1678 
1679 /* Order the nodes of G for scheduling and pass the result in
1680    NODE_ORDER.  Also set aux.count of each node to ASAP.
1681    Return the recMII for the given DDG.  */
1682 static int
sms_order_nodes(ddg_ptr g,int mii,int * node_order)1683 sms_order_nodes (ddg_ptr g, int mii, int * node_order)
1684 {
1685   int i;
1686   int rec_mii = 0;
1687   ddg_all_sccs_ptr sccs = create_ddg_all_sccs (g);
1688 
1689   nopa nops = calculate_order_params (g, mii);
1690 
1691   order_nodes_of_sccs (sccs, node_order);
1692 
1693   if (sccs->num_sccs > 0)
1694     /* First SCC has the largest recurrence_length.  */
1695     rec_mii = sccs->sccs[0]->recurrence_length;
1696 
1697   /* Save ASAP before destroying node_order_params.  */
1698   for (i = 0; i < g->num_nodes; i++)
1699     {
1700       ddg_node_ptr v = &g->nodes[i];
1701       v->aux.count = ASAP (v);
1702     }
1703 
1704   free (nops);
1705   free_ddg_all_sccs (sccs);
1706   check_nodes_order (node_order, g->num_nodes);
1707 
1708   return rec_mii;
1709 }
1710 
1711 static void
order_nodes_of_sccs(ddg_all_sccs_ptr all_sccs,int * node_order)1712 order_nodes_of_sccs (ddg_all_sccs_ptr all_sccs, int * node_order)
1713 {
1714   int i, pos = 0;
1715   ddg_ptr g = all_sccs->ddg;
1716   int num_nodes = g->num_nodes;
1717   sbitmap prev_sccs = sbitmap_alloc (num_nodes);
1718   sbitmap on_path = sbitmap_alloc (num_nodes);
1719   sbitmap tmp = sbitmap_alloc (num_nodes);
1720   sbitmap ones = sbitmap_alloc (num_nodes);
1721 
1722   sbitmap_zero (prev_sccs);
1723   sbitmap_ones (ones);
1724 
1725   /* Perfrom the node ordering starting from the SCC with the highest recMII.
1726      For each SCC order the nodes according to their ASAP/ALAP/HEIGHT etc.  */
1727   for (i = 0; i < all_sccs->num_sccs; i++)
1728     {
1729       ddg_scc_ptr scc = all_sccs->sccs[i];
1730 
1731       /* Add nodes on paths from previous SCCs to the current SCC.  */
1732       find_nodes_on_paths (on_path, g, prev_sccs, scc->nodes);
1733       sbitmap_a_or_b (tmp, scc->nodes, on_path);
1734 
1735       /* Add nodes on paths from the current SCC to previous SCCs.  */
1736       find_nodes_on_paths (on_path, g, scc->nodes, prev_sccs);
1737       sbitmap_a_or_b (tmp, tmp, on_path);
1738 
1739       /* Remove nodes of previous SCCs from current extended SCC.  */
1740       sbitmap_difference (tmp, tmp, prev_sccs);
1741 
1742       pos = order_nodes_in_scc (g, prev_sccs, tmp, node_order, pos);
1743       /* Above call to order_nodes_in_scc updated prev_sccs |= tmp.  */
1744     }
1745 
1746   /* Handle the remaining nodes that do not belong to any scc.  Each call
1747      to order_nodes_in_scc handles a single connected component.  */
1748   while (pos < g->num_nodes)
1749     {
1750       sbitmap_difference (tmp, ones, prev_sccs);
1751       pos = order_nodes_in_scc (g, prev_sccs, tmp, node_order, pos);
1752     }
1753   sbitmap_free (prev_sccs);
1754   sbitmap_free (on_path);
1755   sbitmap_free (tmp);
1756   sbitmap_free (ones);
1757 }
1758 
1759 /* MII is needed if we consider backarcs (that do not close recursive cycles).  */
1760 static struct node_order_params *
calculate_order_params(ddg_ptr g,int mii ATTRIBUTE_UNUSED)1761 calculate_order_params (ddg_ptr g, int mii ATTRIBUTE_UNUSED)
1762 {
1763   int u;
1764   int max_asap;
1765   int num_nodes = g->num_nodes;
1766   ddg_edge_ptr e;
1767   /* Allocate a place to hold ordering params for each node in the DDG.  */
1768   nopa node_order_params_arr;
1769 
1770   /* Initialize of ASAP/ALAP/HEIGHT to zero.  */
1771   node_order_params_arr = (nopa) xcalloc (num_nodes,
1772 					  sizeof (struct node_order_params));
1773 
1774   /* Set the aux pointer of each node to point to its order_params structure.  */
1775   for (u = 0; u < num_nodes; u++)
1776     g->nodes[u].aux.info = &node_order_params_arr[u];
1777 
1778   /* Disregarding a backarc from each recursive cycle to obtain a DAG,
1779      calculate ASAP, ALAP, mobility, distance, and height for each node
1780      in the dependence (direct acyclic) graph.  */
1781 
1782   /* We assume that the nodes in the array are in topological order.  */
1783 
1784   max_asap = 0;
1785   for (u = 0; u < num_nodes; u++)
1786     {
1787       ddg_node_ptr u_node = &g->nodes[u];
1788 
1789       ASAP (u_node) = 0;
1790       for (e = u_node->in; e; e = e->next_in)
1791 	if (e->distance == 0)
1792 	  ASAP (u_node) = MAX (ASAP (u_node),
1793 			       ASAP (e->src) + e->latency);
1794       max_asap = MAX (max_asap, ASAP (u_node));
1795     }
1796 
1797   for (u = num_nodes - 1; u > -1; u--)
1798     {
1799       ddg_node_ptr u_node = &g->nodes[u];
1800 
1801       ALAP (u_node) = max_asap;
1802       HEIGHT (u_node) = 0;
1803       for (e = u_node->out; e; e = e->next_out)
1804 	if (e->distance == 0)
1805 	  {
1806 	    ALAP (u_node) = MIN (ALAP (u_node),
1807 				 ALAP (e->dest) - e->latency);
1808 	    HEIGHT (u_node) = MAX (HEIGHT (u_node),
1809 				   HEIGHT (e->dest) + e->latency);
1810 	  }
1811     }
1812 
1813   return node_order_params_arr;
1814 }
1815 
1816 static int
find_max_asap(ddg_ptr g,sbitmap nodes)1817 find_max_asap (ddg_ptr g, sbitmap nodes)
1818 {
1819   unsigned int u = 0;
1820   int max_asap = -1;
1821   int result = -1;
1822   sbitmap_iterator sbi;
1823 
1824   EXECUTE_IF_SET_IN_SBITMAP (nodes, 0, u, sbi)
1825     {
1826       ddg_node_ptr u_node = &g->nodes[u];
1827 
1828       if (max_asap < ASAP (u_node))
1829 	{
1830 	  max_asap = ASAP (u_node);
1831 	  result = u;
1832 	}
1833     }
1834   return result;
1835 }
1836 
1837 static int
find_max_hv_min_mob(ddg_ptr g,sbitmap nodes)1838 find_max_hv_min_mob (ddg_ptr g, sbitmap nodes)
1839 {
1840   unsigned int u = 0;
1841   int max_hv = -1;
1842   int min_mob = INT_MAX;
1843   int result = -1;
1844   sbitmap_iterator sbi;
1845 
1846   EXECUTE_IF_SET_IN_SBITMAP (nodes, 0, u, sbi)
1847     {
1848       ddg_node_ptr u_node = &g->nodes[u];
1849 
1850       if (max_hv < HEIGHT (u_node))
1851 	{
1852 	  max_hv = HEIGHT (u_node);
1853 	  min_mob = MOB (u_node);
1854 	  result = u;
1855 	}
1856       else if ((max_hv == HEIGHT (u_node))
1857 	       && (min_mob > MOB (u_node)))
1858 	{
1859 	  min_mob = MOB (u_node);
1860 	  result = u;
1861 	}
1862     }
1863   return result;
1864 }
1865 
1866 static int
find_max_dv_min_mob(ddg_ptr g,sbitmap nodes)1867 find_max_dv_min_mob (ddg_ptr g, sbitmap nodes)
1868 {
1869   unsigned int u = 0;
1870   int max_dv = -1;
1871   int min_mob = INT_MAX;
1872   int result = -1;
1873   sbitmap_iterator sbi;
1874 
1875   EXECUTE_IF_SET_IN_SBITMAP (nodes, 0, u, sbi)
1876     {
1877       ddg_node_ptr u_node = &g->nodes[u];
1878 
1879       if (max_dv < DEPTH (u_node))
1880 	{
1881 	  max_dv = DEPTH (u_node);
1882 	  min_mob = MOB (u_node);
1883 	  result = u;
1884 	}
1885       else if ((max_dv == DEPTH (u_node))
1886 	       && (min_mob > MOB (u_node)))
1887 	{
1888 	  min_mob = MOB (u_node);
1889 	  result = u;
1890 	}
1891     }
1892   return result;
1893 }
1894 
/* Places the nodes of SCC into the NODE_ORDER array starting
   at position POS, according to the SMS ordering algorithm.
   NODES_ORDERED (in&out parameter) holds the bitset of all nodes in
   the NODE_ORDER array, starting from position zero.  Returns the
   next free position in NODE_ORDER.  */
static int
order_nodes_in_scc (ddg_ptr g, sbitmap nodes_ordered, sbitmap scc,
		    int * node_order, int pos)
{
  enum sms_direction dir;
  int num_nodes = g->num_nodes;
  sbitmap workset = sbitmap_alloc (num_nodes);
  sbitmap tmp = sbitmap_alloc (num_nodes);
  sbitmap zero_bitmap = sbitmap_alloc (num_nodes);
  sbitmap predecessors = sbitmap_alloc (num_nodes);
  sbitmap successors = sbitmap_alloc (num_nodes);

  sbitmap_zero (predecessors);
  find_predecessors (predecessors, g, nodes_ordered);

  sbitmap_zero (successors);
  find_successors (successors, g, nodes_ordered);

  /* Choose the initial work set and sweep direction: if the already
     ordered nodes have predecessors inside SCC, start from those and
     go bottom-up; else if they have successors inside SCC, start from
     those and go top-down; otherwise seed with the SCC node of maximal
     ASAP and go bottom-up.  */
  sbitmap_zero (tmp);
  if (sbitmap_a_and_b_cg (tmp, predecessors, scc))
    {
      sbitmap_copy (workset, tmp);
      dir = BOTTOMUP;
    }
  else if (sbitmap_a_and_b_cg (tmp, successors, scc))
    {
      sbitmap_copy (workset, tmp);
      dir = TOPDOWN;
    }
  else
    {
      int u;

      sbitmap_zero (workset);
      if ((u = find_max_asap (g, scc)) >= 0)
	SET_BIT (workset, u);
      dir = BOTTOMUP;
    }

  /* Alternate top-down and bottom-up sweeps, draining the work set and
     appending each chosen node to NODE_ORDER, until no candidate
     remains.  */
  sbitmap_zero (zero_bitmap);
  while (!sbitmap_equal (workset, zero_bitmap))
    {
      int v;
      ddg_node_ptr v_node;
      sbitmap v_node_preds;
      sbitmap v_node_succs;

      if (dir == TOPDOWN)
	{
	  while (!sbitmap_equal (workset, zero_bitmap))
	    {
	      /* Top-down: prefer maximal height, ties broken by
		 minimal mobility (see find_max_hv_min_mob).  */
	      v = find_max_hv_min_mob (g, workset);
	      v_node = &g->nodes[v];
	      node_order[pos++] = v;
	      v_node_succs = NODE_SUCCESSORS (v_node);
	      sbitmap_a_and_b (tmp, v_node_succs, scc);

	      /* Don't consider the already ordered successors again.  */
	      sbitmap_difference (tmp, tmp, nodes_ordered);
	      sbitmap_a_or_b (workset, workset, tmp);
	      RESET_BIT (workset, v);
	      SET_BIT (nodes_ordered, v);
	    }
	  /* Switch direction: continue bottom-up from the in-SCC
	     predecessors of everything ordered so far.  */
	  dir = BOTTOMUP;
	  sbitmap_zero (predecessors);
	  find_predecessors (predecessors, g, nodes_ordered);
	  sbitmap_a_and_b (workset, predecessors, scc);
	}
      else
	{
	  while (!sbitmap_equal (workset, zero_bitmap))
	    {
	      /* Bottom-up: prefer maximal depth, ties broken by
		 minimal mobility (see find_max_dv_min_mob).  */
	      v = find_max_dv_min_mob (g, workset);
	      v_node = &g->nodes[v];
	      node_order[pos++] = v;
	      v_node_preds = NODE_PREDECESSORS (v_node);
	      sbitmap_a_and_b (tmp, v_node_preds, scc);

	      /* Don't consider the already ordered predecessors again.  */
	      sbitmap_difference (tmp, tmp, nodes_ordered);
	      sbitmap_a_or_b (workset, workset, tmp);
	      RESET_BIT (workset, v);
	      SET_BIT (nodes_ordered, v);
	    }
	  /* Switch direction: continue top-down from the in-SCC
	     successors of everything ordered so far.  */
	  dir = TOPDOWN;
	  sbitmap_zero (successors);
	  find_successors (successors, g, nodes_ordered);
	  sbitmap_a_and_b (workset, successors, scc);
	}
    }
  sbitmap_free (tmp);
  sbitmap_free (workset);
  sbitmap_free (zero_bitmap);
  sbitmap_free (predecessors);
  sbitmap_free (successors);
  return pos;
}
1996 
1997 
1998 /* This page contains functions for manipulating partial-schedules during
1999    modulo scheduling.  */
2000 
2001 /* Create a partial schedule and allocate a memory to hold II rows.  */
2002 
2003 static partial_schedule_ptr
create_partial_schedule(int ii,ddg_ptr g,int history)2004 create_partial_schedule (int ii, ddg_ptr g, int history)
2005 {
2006   partial_schedule_ptr ps = XNEW (struct partial_schedule);
2007   ps->rows = (ps_insn_ptr *) xcalloc (ii, sizeof (ps_insn_ptr));
2008   ps->ii = ii;
2009   ps->history = history;
2010   ps->min_cycle = INT_MAX;
2011   ps->max_cycle = INT_MIN;
2012   ps->g = g;
2013 
2014   return ps;
2015 }
2016 
2017 /* Free the PS_INSNs in rows array of the given partial schedule.
2018    ??? Consider caching the PS_INSN's.  */
2019 static void
free_ps_insns(partial_schedule_ptr ps)2020 free_ps_insns (partial_schedule_ptr ps)
2021 {
2022   int i;
2023 
2024   for (i = 0; i < ps->ii; i++)
2025     {
2026       while (ps->rows[i])
2027 	{
2028 	  ps_insn_ptr ps_insn = ps->rows[i]->next_in_row;
2029 
2030 	  free (ps->rows[i]);
2031 	  ps->rows[i] = ps_insn;
2032 	}
2033       ps->rows[i] = NULL;
2034     }
2035 }
2036 
2037 /* Free all the memory allocated to the partial schedule.  */
2038 
2039 static void
free_partial_schedule(partial_schedule_ptr ps)2040 free_partial_schedule (partial_schedule_ptr ps)
2041 {
2042   if (!ps)
2043     return;
2044   free_ps_insns (ps);
2045   free (ps->rows);
2046   free (ps);
2047 }
2048 
2049 /* Clear the rows array with its PS_INSNs, and create a new one with
2050    NEW_II rows.  */
2051 
2052 static void
reset_partial_schedule(partial_schedule_ptr ps,int new_ii)2053 reset_partial_schedule (partial_schedule_ptr ps, int new_ii)
2054 {
2055   if (!ps)
2056     return;
2057   free_ps_insns (ps);
2058   if (new_ii == ps->ii)
2059     return;
2060   ps->rows = (ps_insn_ptr *) xrealloc (ps->rows, new_ii
2061 						 * sizeof (ps_insn_ptr));
2062   memset (ps->rows, 0, new_ii * sizeof (ps_insn_ptr));
2063   ps->ii = new_ii;
2064   ps->min_cycle = INT_MAX;
2065   ps->max_cycle = INT_MIN;
2066 }
2067 
2068 /* Prints the partial schedule as an ii rows array, for each rows
2069    print the ids of the insns in it.  */
2070 void
print_partial_schedule(partial_schedule_ptr ps,FILE * dump)2071 print_partial_schedule (partial_schedule_ptr ps, FILE *dump)
2072 {
2073   int i;
2074 
2075   for (i = 0; i < ps->ii; i++)
2076     {
2077       ps_insn_ptr ps_i = ps->rows[i];
2078 
2079       fprintf (dump, "\n[CYCLE %d ]: ", i);
2080       while (ps_i)
2081 	{
2082 	  fprintf (dump, "%d, ",
2083 		   INSN_UID (ps_i->node->insn));
2084 	  ps_i = ps_i->next_in_row;
2085 	}
2086     }
2087 }
2088 
2089 /* Creates an object of PS_INSN and initializes it to the given parameters.  */
2090 static ps_insn_ptr
create_ps_insn(ddg_node_ptr node,int rest_count,int cycle)2091 create_ps_insn (ddg_node_ptr node, int rest_count, int cycle)
2092 {
2093   ps_insn_ptr ps_i = XNEW (struct ps_insn);
2094 
2095   ps_i->node = node;
2096   ps_i->next_in_row = NULL;
2097   ps_i->prev_in_row = NULL;
2098   ps_i->row_rest_count = rest_count;
2099   ps_i->cycle = cycle;
2100 
2101   return ps_i;
2102 }
2103 
2104 
2105 /* Removes the given PS_INSN from the partial schedule.  Returns false if the
2106    node is not found in the partial schedule, else returns true.  */
2107 static bool
remove_node_from_ps(partial_schedule_ptr ps,ps_insn_ptr ps_i)2108 remove_node_from_ps (partial_schedule_ptr ps, ps_insn_ptr ps_i)
2109 {
2110   int row;
2111 
2112   if (!ps || !ps_i)
2113     return false;
2114 
2115   row = SMODULO (ps_i->cycle, ps->ii);
2116   if (! ps_i->prev_in_row)
2117     {
2118       if (ps_i != ps->rows[row])
2119 	return false;
2120 
2121       ps->rows[row] = ps_i->next_in_row;
2122       if (ps->rows[row])
2123 	ps->rows[row]->prev_in_row = NULL;
2124     }
2125   else
2126     {
2127       ps_i->prev_in_row->next_in_row = ps_i->next_in_row;
2128       if (ps_i->next_in_row)
2129 	ps_i->next_in_row->prev_in_row = ps_i->prev_in_row;
2130     }
2131   free (ps_i);
2132   return true;
2133 }
2134 
/* Unlike what literature describes for modulo scheduling (which focuses
   on VLIW machines) the order of the instructions inside a cycle is
   important.  Given the bitmaps MUST_FOLLOW and MUST_PRECEDE we know
   where the current instruction should go relative to the already
   scheduled instructions in the given cycle.  Go over these
   instructions and find the first possible column to put it in.
   Returns true if a column was found, false otherwise.  */
static bool
ps_insn_find_column (partial_schedule_ptr ps, ps_insn_ptr ps_i,
		     sbitmap must_precede, sbitmap must_follow)
{
  ps_insn_ptr next_ps_i;
  ps_insn_ptr first_must_follow = NULL;
  ps_insn_ptr last_must_precede = NULL;
  int row;

  if (! ps_i)
    return false;

  row = SMODULO (ps_i->cycle, ps->ii);

  /* Find the first must-follow and the last must-precede insns of the
     row; PS_I will be inserted immediately after the last must-precede,
     provided no must-follow insn occurs before that point.  */
  for (next_ps_i = ps->rows[row];
       next_ps_i;
       next_ps_i = next_ps_i->next_in_row)
    {
      if (TEST_BIT (must_follow, next_ps_i->node->cuid)
	  && ! first_must_follow)
        first_must_follow = next_ps_i;
      if (TEST_BIT (must_precede, next_ps_i->node->cuid))
        {
          /* If we have already met a node that must follow, then
	     there is no possible column.  */
  	  if (first_must_follow)
            return false;
	  else
            last_must_precede = next_ps_i;
        }
    }

  /* Now splice PS_I into the doubly linked row list: at the head when
     there is no must-precede insn, else right after LAST_MUST_PRECEDE.  */

  if (! last_must_precede)
    {
      ps_i->next_in_row = ps->rows[row];
      ps_i->prev_in_row = NULL;
      if (ps_i->next_in_row)
    	ps_i->next_in_row->prev_in_row = ps_i;
      ps->rows[row] = ps_i;
    }
  else
    {
      ps_i->next_in_row = last_must_precede->next_in_row;
      last_must_precede->next_in_row = ps_i;
      ps_i->prev_in_row = last_must_precede;
      if (ps_i->next_in_row)
        ps_i->next_in_row->prev_in_row = ps_i;
    }

  return true;
}
2197 
/* Advances the PS_INSN one column in its current row; returns false
   in failure and true in success.  Bit N is set in MUST_FOLLOW if
   the node with cuid N must be come after the node pointed to by
   PS_I when scheduled in the same cycle.  */
static int
ps_insn_advance_column (partial_schedule_ptr ps, ps_insn_ptr ps_i,
			sbitmap must_follow)
{
  ps_insn_ptr prev, next;
  int row;
  ddg_node_ptr next_node;

  if (!ps || !ps_i)
    return false;

  row = SMODULO (ps_i->cycle, ps->ii);

  /* Already in the last column of the row; nowhere to advance.  */
  if (! ps_i->next_in_row)
    return false;

  next_node = ps_i->next_in_row->node;

  /* Check if next_in_row is dependent on ps_i, both having same sched
     times (typically ANTI_DEP).  If so, ps_i cannot skip over it.  */
  if (TEST_BIT (must_follow, next_node->cuid))
    return false;

  /* Advance PS_I over its next_in_row in the doubly linked list.  */
  prev = ps_i->prev_in_row;
  next = ps_i->next_in_row;

  /* If PS_I headed the row, NEXT becomes the new head.  */
  if (ps_i == ps->rows[row])
    ps->rows[row] = next;

  /* Take over NEXT's old successor link...  */
  ps_i->next_in_row = next->next_in_row;

  if (next->next_in_row)
    next->next_in_row->prev_in_row = ps_i;

  /* ...re-link PS_I immediately after NEXT...  */
  next->next_in_row = ps_i;
  ps_i->prev_in_row = next;

  /* ...and attach NEXT where PS_I used to be.  */
  next->prev_in_row = prev;
  if (prev)
    prev->next_in_row = next;

  return true;
}
2246 
2247 /* Inserts a DDG_NODE to the given partial schedule at the given cycle.
2248    Returns 0 if this is not possible and a PS_INSN otherwise.  Bit N is
2249    set in MUST_PRECEDE/MUST_FOLLOW if the node with cuid N must be come
2250    before/after (respectively) the node pointed to by PS_I when scheduled
2251    in the same cycle.  */
2252 static ps_insn_ptr
add_node_to_ps(partial_schedule_ptr ps,ddg_node_ptr node,int cycle,sbitmap must_precede,sbitmap must_follow)2253 add_node_to_ps (partial_schedule_ptr ps, ddg_node_ptr node, int cycle,
2254 		sbitmap must_precede, sbitmap must_follow)
2255 {
2256   ps_insn_ptr ps_i;
2257   int rest_count = 1;
2258   int row = SMODULO (cycle, ps->ii);
2259 
2260   if (ps->rows[row]
2261       && ps->rows[row]->row_rest_count >= issue_rate)
2262     return NULL;
2263 
2264   if (ps->rows[row])
2265     rest_count += ps->rows[row]->row_rest_count;
2266 
2267   ps_i = create_ps_insn (node, rest_count, cycle);
2268 
2269   /* Finds and inserts PS_I according to MUST_FOLLOW and
2270      MUST_PRECEDE.  */
2271   if (! ps_insn_find_column (ps, ps_i, must_precede, must_follow))
2272     {
2273       free (ps_i);
2274       return NULL;
2275     }
2276 
2277   return ps_i;
2278 }
2279 
2280 /* Advance time one cycle.  Assumes DFA is being used.  */
2281 static void
advance_one_cycle(void)2282 advance_one_cycle (void)
2283 {
2284   if (targetm.sched.dfa_pre_cycle_insn)
2285     state_transition (curr_state,
2286 		      targetm.sched.dfa_pre_cycle_insn ());
2287 
2288   state_transition (curr_state, NULL);
2289 
2290   if (targetm.sched.dfa_post_cycle_insn)
2291     state_transition (curr_state,
2292 		      targetm.sched.dfa_post_cycle_insn ());
2293 }
2294 
/* Given the kernel of a loop (from FIRST_INSN to LAST_INSN), finds
   the number of cycles according to DFA that the kernel fits in,
   we use this to check if we done well with SMS after we add
   register moves.  In some cases register moves overhead makes
   it even worse than the original loop.  We want SMS to be performed
   when it gives less cycles after register moves are added.  */
static int
kernel_number_of_cycles (rtx first_insn, rtx last_insn)
{
  int cycles = 0;
  rtx insn;
  int can_issue_more = issue_rate;

  state_reset (curr_state);

  /* NOTE(review): the walk stops when INSN reaches LAST_INSN without
     processing it — presumably LAST_INSN delimits the kernel; confirm
     at the callers.  */
  for (insn = first_insn;
       insn != NULL_RTX && insn != last_insn;
       insn = NEXT_INSN (insn))
    {
      /* Skip notes, labels and USEs — they occupy no DFA resources.  */
      if (! INSN_P (insn) || GET_CODE (PATTERN (insn)) == USE)
	continue;

      /* Check if there is room for the current insn.  */
      if (!can_issue_more || state_dead_lock_p (curr_state))
	{
	  cycles ++;
	  advance_one_cycle ();
	  can_issue_more = issue_rate;
	}

	/* Update the DFA state; if the DFA reports a resource conflict
	   (non-negative transition), start a new cycle and retry.  */
      if (state_transition (curr_state, insn) >= 0)
	{
	  cycles ++;
	  advance_one_cycle ();
	  can_issue_more = issue_rate;
	}

      /* Let the target decide how many issue slots the insn consumed;
	 otherwise charge one slot per real insn.  */
      if (targetm.sched.variable_issue)
	can_issue_more =
	  targetm.sched.variable_issue (sched_dump, sched_verbose,
					insn, can_issue_more);
      /* A naked CLOBBER or USE generates no instruction, so don't
	 let them consume issue slots.  */
      else if (GET_CODE (PATTERN (insn)) != USE
	       && GET_CODE (PATTERN (insn)) != CLOBBER)
	can_issue_more--;
    }
  return cycles;
}
2346 
/* Checks if PS has resource conflicts according to DFA, starting from
   FROM cycle to TO cycle; returns true if there are conflicts and false
   if there are no conflicts.  Assumes DFA is being used.  */
static int
ps_has_conflicts (partial_schedule_ptr ps, int from, int to)
{
  int cycle;

  state_reset (curr_state);

  for (cycle = from; cycle <= to; cycle++)
    {
      ps_insn_ptr crr_insn;
      /* Holds the remaining issue slots in the current row.  */
      int can_issue_more = issue_rate;

      /* Walk through the DFA for the current row.  */
      for (crr_insn = ps->rows[SMODULO (cycle, ps->ii)];
	   crr_insn;
	   crr_insn = crr_insn->next_in_row)
	{
	  rtx insn = crr_insn->node->insn;

	  if (!INSN_P (insn))
	    continue;

	  /* Check if there is room for the current insn.  */
	  if (!can_issue_more || state_dead_lock_p (curr_state))
	    return true;

	  /* Update the DFA state and return with failure if the DFA found
	     resource conflicts.  */
	  if (state_transition (curr_state, insn) >= 0)
	    return true;

	  /* Let the target decide how many issue slots the insn
	     consumed; otherwise charge one slot per real insn.  */
	  if (targetm.sched.variable_issue)
	    can_issue_more =
	      targetm.sched.variable_issue (sched_dump, sched_verbose,
					    insn, can_issue_more);
	  /* A naked CLOBBER or USE generates no instruction, so don't
	     let them consume issue slots.  */
	  else if (GET_CODE (PATTERN (insn)) != USE
		   && GET_CODE (PATTERN (insn)) != CLOBBER)
	    can_issue_more--;
	}

      /* Advance the DFA to the next cycle.  */
      advance_one_cycle ();
    }
  return false;
}
2398 
2399 /* Checks if the given node causes resource conflicts when added to PS at
2400    cycle C.  If not the node is added to PS and returned; otherwise zero
2401    is returned.  Bit N is set in MUST_PRECEDE/MUST_FOLLOW if the node with
2402    cuid N must be come before/after (respectively) the node pointed to by
2403    PS_I when scheduled in the same cycle.  */
2404 ps_insn_ptr
ps_add_node_check_conflicts(partial_schedule_ptr ps,ddg_node_ptr n,int c,sbitmap must_precede,sbitmap must_follow)2405 ps_add_node_check_conflicts (partial_schedule_ptr ps, ddg_node_ptr n,
2406    			     int c, sbitmap must_precede,
2407 			     sbitmap must_follow)
2408 {
2409   int has_conflicts = 0;
2410   ps_insn_ptr ps_i;
2411 
2412   /* First add the node to the PS, if this succeeds check for
2413      conflicts, trying different issue slots in the same row.  */
2414   if (! (ps_i = add_node_to_ps (ps, n, c, must_precede, must_follow)))
2415     return NULL; /* Failed to insert the node at the given cycle.  */
2416 
2417   has_conflicts = ps_has_conflicts (ps, c, c)
2418 		  || (ps->history > 0
2419 		      && ps_has_conflicts (ps,
2420 					   c - ps->history,
2421 					   c + ps->history));
2422 
2423   /* Try different issue slots to find one that the given node can be
2424      scheduled in without conflicts.  */
2425   while (has_conflicts)
2426     {
2427       if (! ps_insn_advance_column (ps, ps_i, must_follow))
2428 	break;
2429       has_conflicts = ps_has_conflicts (ps, c, c)
2430 		      || (ps->history > 0
2431 			  && ps_has_conflicts (ps,
2432 					       c - ps->history,
2433 					       c + ps->history));
2434     }
2435 
2436   if (has_conflicts)
2437     {
2438       remove_node_from_ps (ps, ps_i);
2439       return NULL;
2440     }
2441 
2442   ps->min_cycle = MIN (ps->min_cycle, c);
2443   ps->max_cycle = MAX (ps->max_cycle, c);
2444   return ps_i;
2445 }
2446 
2447 /* Rotate the rows of PS such that insns scheduled at time
2448    START_CYCLE will appear in row 0.  Updates max/min_cycles.  */
2449 void
rotate_partial_schedule(partial_schedule_ptr ps,int start_cycle)2450 rotate_partial_schedule (partial_schedule_ptr ps, int start_cycle)
2451 {
2452   int i, row, backward_rotates;
2453   int last_row = ps->ii - 1;
2454 
2455   if (start_cycle == 0)
2456     return;
2457 
2458   backward_rotates = SMODULO (start_cycle, ps->ii);
2459 
2460   /* Revisit later and optimize this into a single loop.  */
2461   for (i = 0; i < backward_rotates; i++)
2462     {
2463       ps_insn_ptr first_row = ps->rows[0];
2464 
2465       for (row = 0; row < last_row; row++)
2466 	ps->rows[row] = ps->rows[row+1];
2467 
2468       ps->rows[last_row] = first_row;
2469     }
2470 
2471   ps->max_cycle -= start_cycle;
2472   ps->min_cycle -= start_cycle;
2473 }
2474 
2475 /* Remove the node N from the partial schedule PS; because we restart the DFA
2476    each time we want to check for resource conflicts; this is equivalent to
2477    unscheduling the node N.  */
2478 static bool
ps_unschedule_node(partial_schedule_ptr ps,ddg_node_ptr n)2479 ps_unschedule_node (partial_schedule_ptr ps, ddg_node_ptr n)
2480 {
2481   ps_insn_ptr ps_i;
2482   int row = SMODULO (SCHED_TIME (n), ps->ii);
2483 
2484   if (row < 0 || row > ps->ii)
2485     return false;
2486 
2487   for (ps_i = ps->rows[row];
2488        ps_i &&  ps_i->node != n;
2489        ps_i = ps_i->next_in_row);
2490   if (!ps_i)
2491     return false;
2492 
2493   return remove_node_from_ps (ps, ps_i);
2494 }
2495 #endif /* INSN_SCHEDULING */
2496 
2497 static bool
gate_handle_sms(void)2498 gate_handle_sms (void)
2499 {
2500   return (optimize > 0 && flag_modulo_sched);
2501 }
2502 
2503 
/* Run instruction scheduler.  */
/* Perform SMS module scheduling.  Entry point of the pass; returns 0
   (no additional TODO flags).  */
static unsigned int
rest_of_handle_sms (void)
{
#ifdef INSN_SCHEDULING
  basic_block bb;

  /* We want to be able to create new pseudos.  */
  no_new_pseudos = 0;
  /* Collect loop information to be used in SMS.  */
  cfg_layout_initialize (CLEANUP_UPDATE_LIFE);
  sms_schedule ();

  /* Update the life information, because we add pseudos.  */
  max_regno = max_reg_num ();
  allocate_reg_info (max_regno, FALSE, FALSE);
  update_life_info (NULL, UPDATE_LIFE_GLOBAL_RM_NOTES,
                    (PROP_DEATH_NOTES
                     | PROP_REG_INFO
                     | PROP_KILL_DEAD_CODE
                     | PROP_SCAN_DEAD_CODE));

  /* Scheduling is done; disallow pseudo creation again.  */
  no_new_pseudos = 1;

  /* Finalize layout changes: chain the blocks in layout order through
     bb->aux for cfg_layout_finalize.  */
  FOR_EACH_BB (bb)
    if (bb->next_bb != EXIT_BLOCK_PTR)
      bb->aux = bb->next_bb;
  cfg_layout_finalize ();
  free_dominance_info (CDI_DOMINATORS);
#endif /* INSN_SCHEDULING */
  return 0;
}
2538 
/* Pass descriptor for swing modulo scheduling, registered with the
   pass manager.  Execution is guarded by gate_handle_sms (-fmodulo-sched
   and optimizing) and timed under TV_SMS; 'm' is the dump-file letter.  */
struct tree_opt_pass pass_sms =
{
  "sms",                                /* name */
  gate_handle_sms,                      /* gate */
  rest_of_handle_sms,                   /* execute */
  NULL,                                 /* sub */
  NULL,                                 /* next */
  0,                                    /* static_pass_number */
  TV_SMS,                               /* tv_id */
  0,                                    /* properties_required */
  0,                                    /* properties_provided */
  0,                                    /* properties_destroyed */
  TODO_dump_func,                       /* todo_flags_start */
  TODO_dump_func |
  TODO_ggc_collect,                     /* todo_flags_finish */
  'm'                                   /* letter */
};
2556 
2557