1 /* Swing Modulo Scheduling implementation.
2    Copyright (C) 2004-2013 Free Software Foundation, Inc.
3    Contributed by Ayal Zaks and Mustafa Hagog <zaks,mustafa@il.ibm.com>
4 
5 This file is part of GCC.
6 
7 GCC is free software; you can redistribute it and/or modify it under
8 the terms of the GNU General Public License as published by the Free
9 Software Foundation; either version 3, or (at your option) any later
10 version.
11 
12 GCC is distributed in the hope that it will be useful, but WITHOUT ANY
13 WARRANTY; without even the implied warranty of MERCHANTABILITY or
14 FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License
15 for more details.
16 
17 You should have received a copy of the GNU General Public License
18 along with GCC; see the file COPYING3.  If not see
19 <http://www.gnu.org/licenses/>.  */
20 
21 
22 #include "config.h"
23 #include "system.h"
24 #include "coretypes.h"
25 #include "tm.h"
26 #include "diagnostic-core.h"
27 #include "rtl.h"
28 #include "tm_p.h"
29 #include "hard-reg-set.h"
30 #include "regs.h"
31 #include "function.h"
32 #include "flags.h"
33 #include "insn-config.h"
34 #include "insn-attr.h"
35 #include "except.h"
36 #include "recog.h"
37 #include "sched-int.h"
38 #include "target.h"
39 #include "cfgloop.h"
40 #include "expr.h"
41 #include "params.h"
42 #include "gcov-io.h"
43 #include "ddg.h"
44 #include "tree-pass.h"
45 #include "dbgcnt.h"
46 #include "df.h"
47 
48 #ifdef INSN_SCHEDULING
49 
50 /* This file contains the implementation of the Swing Modulo Scheduler,
51    described in the following references:
52    [1] J. Llosa, A. Gonzalez, E. Ayguade, M. Valero., and J. Eckhardt.
53        Lifetime--sensitive modulo scheduling in a production environment.
54        IEEE Trans. on Comps., 50(3), March 2001
55    [2] J. Llosa, A. Gonzalez, E. Ayguade, and M. Valero.
56        Swing Modulo Scheduling: A Lifetime Sensitive Approach.
57        PACT '96 , pages 80-87, October 1996 (Boston - Massachusetts - USA).
58 
59    The basic structure is:
60    1. Build a data-dependence graph (DDG) for each loop.
61    2. Use the DDG to order the insns of a loop (not in topological order
62       necessarily, but rather) trying to place each insn after all its
63       predecessors _or_ after all its successors.
64    3. Compute MII: a lower bound on the number of cycles to schedule the loop.
65    4. Use the ordering to perform list-scheduling of the loop:
66       1. Set II = MII.  We will try to schedule the loop within II cycles.
67       2. Try to schedule the insns one by one according to the ordering.
68 	 For each insn compute an interval of cycles by considering already-
69 	 scheduled preds and succs (and associated latencies); try to place
70 	 the insn in the cycles of this window checking for potential
71 	 resource conflicts (using the DFA interface).
72 	 Note: this is different from the cycle-scheduling of schedule_insns;
73 	 here the insns are not scheduled monotonically top-down (nor bottom-
74 	 up).
75       3. If failed in scheduling all insns - bump II++ and try again, unless
76 	 II reaches an upper bound MaxII, in which case report failure.
77    5. If we succeeded in scheduling the loop within II cycles, we now
78       generate prolog and epilog, decrease the counter of the loop, and
79       perform modulo variable expansion for live ranges that span more than
80       II cycles (i.e. use register copies to prevent a def from overwriting
81       itself before reaching the use).
82 
83     SMS works with countable loops (1) whose control part can be easily
84     decoupled from the rest of the loop and (2) whose loop count can
85     be easily adjusted.  This is because we peel a constant number of
86     iterations into a prologue and epilogue for which we want to avoid
87     emitting the control part, and a kernel which is to iterate that
88     constant number of iterations less than the original loop.  So the
89     control part should be a set of insns clearly identified and having
90     its own iv, not otherwise used in the loop (at-least for now), which
91     initializes a register before the loop to the number of iterations.
92     Currently SMS relies on the do-loop pattern to recognize such loops,
93     where (1) the control part comprises of all insns defining and/or
94     using a certain 'count' register and (2) the loop count can be
95     adjusted by modifying this register prior to the loop.
96     TODO: Rely on cfgloop analysis instead.  */
97 
98 /* This page defines partial-schedule structures and functions for
99    modulo scheduling.  */
100 
101 typedef struct partial_schedule *partial_schedule_ptr;
102 typedef struct ps_insn *ps_insn_ptr;
103 
104 /* The minimum (absolute) cycle that a node of ps was scheduled in.  */
105 #define PS_MIN_CYCLE(ps) (((partial_schedule_ptr)(ps))->min_cycle)
106 
107 /* The maximum (absolute) cycle that a node of ps was scheduled in.  */
108 #define PS_MAX_CYCLE(ps) (((partial_schedule_ptr)(ps))->max_cycle)
109 
110 /* Perform signed modulo, always returning a non-negative value.  */
111 #define SMODULO(x,y) ((x) % (y) < 0 ? ((x) % (y) + (y)) : (x) % (y))
112 
113 /* The number of different iterations the nodes in ps span, assuming
114    the stage boundaries are placed efficiently.  */
115 #define CALC_STAGE_COUNT(max_cycle,min_cycle,ii) ((max_cycle - min_cycle \
116                          + 1 + ii - 1) / ii)
117 /* The stage count of ps.  */
118 #define PS_STAGE_COUNT(ps) (((partial_schedule_ptr)(ps))->stage_count)
119 
120 /* A single instruction in the partial schedule.  */
121 struct ps_insn
122 {
123   /* Identifies the instruction to be scheduled.  Values smaller than
124      the ddg's num_nodes refer directly to ddg nodes.  A value of
125      X - num_nodes refers to register move X.  */
126   int id;
127 
128   /* The (absolute) cycle in which the PS instruction is scheduled.
129      Same as SCHED_TIME (node).  */
130   int cycle;
131 
132   /* The next/prev PS_INSN in the same row.  */
133   ps_insn_ptr next_in_row,
134 	      prev_in_row;
135 
136 };
137 
138 /* Information about a register move that has been added to a partial
139    schedule.  */
140 struct ps_reg_move_info
141 {
142   /* The source of the move is defined by the ps_insn with id DEF.
143      The destination is used by the ps_insns with the ids in USES.  */
144   int def;
145   sbitmap uses;
146 
147   /* The original form of USES' instructions used OLD_REG, but they
148      should now use NEW_REG.  */
149   rtx old_reg;
150   rtx new_reg;
151 
152   /* The number of consecutive stages that the move occupies.  */
153   int num_consecutive_stages;
154 
155   /* An instruction that sets NEW_REG to the correct value.  The first
156      move associated with DEF will have an rhs of OLD_REG; later moves
157      use the result of the previous move.  */
158   rtx insn;
159 };
160 
161 typedef struct ps_reg_move_info ps_reg_move_info;
162 
163 /* Holds the partial schedule as an array of II rows.  Each entry of the
164    array points to a linked list of PS_INSNs, which represents the
165    instructions that are scheduled for that row.  */
166 struct partial_schedule
167 {
168   int ii;	/* Number of rows in the partial schedule.  */
169   int history;  /* Threshold for conflict checking using DFA.  */
170 
171   /* rows[i] points to linked list of insns scheduled in row i (0<=i<ii).  */
172   ps_insn_ptr *rows;
173 
174   /* All the moves added for this partial schedule.  Index X has
175      a ps_insn id of X + g->num_nodes.  */
176   vec<ps_reg_move_info> reg_moves;
177 
178   /*  rows_length[i] holds the number of instructions in the row.
179       It is used only (as an optimization) to back off quickly from
180       trying to schedule a node in a full row; that is, to avoid running
181       through futile DFA state transitions.  */
182   int *rows_length;
183 
184   /* The earliest absolute cycle of an insn in the partial schedule.  */
185   int min_cycle;
186 
187   /* The latest absolute cycle of an insn in the partial schedule.  */
188   int max_cycle;
189 
190   ddg_ptr g;	/* The DDG of the insns in the partial schedule.  */
191 
192   int stage_count;  /* The stage count of the partial schedule.  */
193 };
194 
195 
196 static partial_schedule_ptr create_partial_schedule (int ii, ddg_ptr, int history);
197 static void free_partial_schedule (partial_schedule_ptr);
198 static void reset_partial_schedule (partial_schedule_ptr, int new_ii);
199 void print_partial_schedule (partial_schedule_ptr, FILE *);
200 static void verify_partial_schedule (partial_schedule_ptr, sbitmap);
201 static ps_insn_ptr ps_add_node_check_conflicts (partial_schedule_ptr,
202 						int, int, sbitmap, sbitmap);
203 static void rotate_partial_schedule (partial_schedule_ptr, int);
204 void set_row_column_for_ps (partial_schedule_ptr);
205 static void ps_insert_empty_row (partial_schedule_ptr, int, sbitmap);
206 static int compute_split_row (sbitmap, int, int, int, ddg_node_ptr);
207 
208 
209 /* This page defines constants and structures for the modulo scheduling
210    driver.  */
211 
212 static int sms_order_nodes (ddg_ptr, int, int *, int *);
213 static void set_node_sched_params (ddg_ptr);
214 static partial_schedule_ptr sms_schedule_by_order (ddg_ptr, int, int, int *);
215 static void permute_partial_schedule (partial_schedule_ptr, rtx);
216 static void generate_prolog_epilog (partial_schedule_ptr, struct loop *,
217                                     rtx, rtx);
218 static int calculate_stage_count (partial_schedule_ptr, int);
219 static void calculate_must_precede_follow (ddg_node_ptr, int, int,
220 					   int, int, sbitmap, sbitmap, sbitmap);
221 static int get_sched_window (partial_schedule_ptr, ddg_node_ptr,
222 			     sbitmap, int, int *, int *, int *);
223 static bool try_scheduling_node_in_cycle (partial_schedule_ptr, int, int,
224 					  sbitmap, int *, sbitmap, sbitmap);
225 static void remove_node_from_ps (partial_schedule_ptr, ps_insn_ptr);
226 
227 #define NODE_ASAP(node) ((node)->aux.count)
228 
229 #define SCHED_PARAMS(x) (&node_sched_param_vec[x])
230 #define SCHED_TIME(x) (SCHED_PARAMS (x)->time)
231 #define SCHED_ROW(x) (SCHED_PARAMS (x)->row)
232 #define SCHED_STAGE(x) (SCHED_PARAMS (x)->stage)
233 #define SCHED_COLUMN(x) (SCHED_PARAMS (x)->column)
234 
235 /* The scheduling parameters held for each node.  */
236 typedef struct node_sched_params
237 {
238   int time;	/* The absolute scheduling cycle.  */
239 
240   int row;    /* Holds time % ii.  */
241   int stage;  /* Holds time / ii.  */
242 
243   /* The column of a node inside the ps.  If nodes u, v are on the same row,
244      u will precede v if column (u) < column (v).  */
245   int column;
246 } *node_sched_params_ptr;
247 
248 typedef struct node_sched_params node_sched_params;
249 
250 /* The following three functions are copied from the current scheduler
251    code in order to use sched_analyze() for computing the dependencies.
252    They are used when initializing the sched_info structure.  */
253 static const char *
sms_print_insn(const_rtx insn,int aligned ATTRIBUTE_UNUSED)254 sms_print_insn (const_rtx insn, int aligned ATTRIBUTE_UNUSED)
255 {
256   static char tmp[80];
257 
258   sprintf (tmp, "i%4d", INSN_UID (insn));
259   return tmp;
260 }
261 
262 static void
compute_jump_reg_dependencies(rtx insn ATTRIBUTE_UNUSED,regset used ATTRIBUTE_UNUSED)263 compute_jump_reg_dependencies (rtx insn ATTRIBUTE_UNUSED,
264 			       regset used ATTRIBUTE_UNUSED)
265 {
266 }
267 
268 static struct common_sched_info_def sms_common_sched_info;
269 
270 static struct sched_deps_info_def sms_sched_deps_info =
271   {
272     compute_jump_reg_dependencies,
273     NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL,
274     NULL,
275     0, 0, 0
276   };
277 
278 static struct haifa_sched_info sms_sched_info =
279 {
280   NULL,
281   NULL,
282   NULL,
283   NULL,
284   NULL,
285   sms_print_insn,
286   NULL,
287   NULL, /* insn_finishes_block_p */
288   NULL, NULL,
289   NULL, NULL,
290   0, 0,
291 
292   NULL, NULL, NULL, NULL,
293   NULL, NULL,
294   0
295 };
296 
297 /* Partial schedule instruction ID in PS is a register move.  Return
298    information about it.  */
299 static struct ps_reg_move_info *
ps_reg_move(partial_schedule_ptr ps,int id)300 ps_reg_move (partial_schedule_ptr ps, int id)
301 {
302   gcc_checking_assert (id >= ps->g->num_nodes);
303   return &ps->reg_moves[id - ps->g->num_nodes];
304 }
305 
306 /* Return the rtl instruction that is being scheduled by partial schedule
307    instruction ID, which belongs to schedule PS.  */
308 static rtx
ps_rtl_insn(partial_schedule_ptr ps,int id)309 ps_rtl_insn (partial_schedule_ptr ps, int id)
310 {
311   if (id < ps->g->num_nodes)
312     return ps->g->nodes[id].insn;
313   else
314     return ps_reg_move (ps, id)->insn;
315 }
316 
317 /* Partial schedule instruction ID, which belongs to PS, occurred in
318    the original (unscheduled) loop.  Return the first instruction
319    in the loop that was associated with ps_rtl_insn (PS, ID).
320    If the instruction had some notes before it, this is the first
321    of those notes.  */
322 static rtx
ps_first_note(partial_schedule_ptr ps,int id)323 ps_first_note (partial_schedule_ptr ps, int id)
324 {
325   gcc_assert (id < ps->g->num_nodes);
326   return ps->g->nodes[id].first_note;
327 }
328 
329 /* Return the number of consecutive stages that are occupied by
330    partial schedule instruction ID in PS.  */
331 static int
ps_num_consecutive_stages(partial_schedule_ptr ps,int id)332 ps_num_consecutive_stages (partial_schedule_ptr ps, int id)
333 {
334   if (id < ps->g->num_nodes)
335     return 1;
336   else
337     return ps_reg_move (ps, id)->num_consecutive_stages;
338 }
339 
340 /* Given HEAD and TAIL which are the first and last insns in a loop;
341    return the register which controls the loop.  Return zero if it has
342    more than one occurrence in the loop besides the control part or the
343    do-loop pattern is not of the form we expect.  */
344 static rtx
doloop_register_get(rtx head ATTRIBUTE_UNUSED,rtx tail ATTRIBUTE_UNUSED)345 doloop_register_get (rtx head ATTRIBUTE_UNUSED, rtx tail ATTRIBUTE_UNUSED)
346 {
347 #ifdef HAVE_doloop_end
348   rtx reg, condition, insn, first_insn_not_to_check;
349 
350   if (!JUMP_P (tail))
351     return NULL_RTX;
352 
353   /* TODO: Free SMS's dependence on doloop_condition_get.  */
354   condition = doloop_condition_get (tail);
355   if (! condition)
356     return NULL_RTX;
357 
358   if (REG_P (XEXP (condition, 0)))
359     reg = XEXP (condition, 0);
360   else if (GET_CODE (XEXP (condition, 0)) == PLUS
361 	   && REG_P (XEXP (XEXP (condition, 0), 0)))
362     reg = XEXP (XEXP (condition, 0), 0);
363   else
364     gcc_unreachable ();
365 
366   /* Check that the COUNT_REG has no other occurrences in the loop
367      until the decrement.  We assume the control part consists of
368      either a single (parallel) branch-on-count or a (non-parallel)
369      branch immediately preceded by a single (decrement) insn.  */
370   first_insn_not_to_check = (GET_CODE (PATTERN (tail)) == PARALLEL ? tail
371                              : prev_nondebug_insn (tail));
372 
373   for (insn = head; insn != first_insn_not_to_check; insn = NEXT_INSN (insn))
374     if (!DEBUG_INSN_P (insn) && reg_mentioned_p (reg, insn))
375       {
376         if (dump_file)
377         {
378           fprintf (dump_file, "SMS count_reg found ");
379           print_rtl_single (dump_file, reg);
380           fprintf (dump_file, " outside control in insn:\n");
381           print_rtl_single (dump_file, insn);
382         }
383 
384         return NULL_RTX;
385       }
386 
387   return reg;
388 #else
389   return NULL_RTX;
390 #endif
391 }
392 
393 /* Check if COUNT_REG is set to a constant in the PRE_HEADER block, so
394    that the number of iterations is a compile-time constant.  If so,
395    return the rtx that sets COUNT_REG to a constant, and set COUNT to
396    this constant.  Otherwise return 0.  */
397 static rtx
const_iteration_count(rtx count_reg,basic_block pre_header,HOST_WIDEST_INT * count)398 const_iteration_count (rtx count_reg, basic_block pre_header,
399 		       HOST_WIDEST_INT * count)
400 {
401   rtx insn;
402   rtx head, tail;
403 
404   if (! pre_header)
405     return NULL_RTX;
406 
407   get_ebb_head_tail (pre_header, pre_header, &head, &tail);
408 
409   for (insn = tail; insn != PREV_INSN (head); insn = PREV_INSN (insn))
410     if (NONDEBUG_INSN_P (insn) && single_set (insn) &&
411 	rtx_equal_p (count_reg, SET_DEST (single_set (insn))))
412       {
413 	rtx pat = single_set (insn);
414 
415 	if (CONST_INT_P (SET_SRC (pat)))
416 	  {
417 	    *count = INTVAL (SET_SRC (pat));
418 	    return insn;
419 	  }
420 
421 	return NULL_RTX;
422       }
423 
424   return NULL_RTX;
425 }
426 
427 /* A very simple resource-based lower bound on the initiation interval.
428    ??? Improve the accuracy of this bound by considering the
429    utilization of various units.  */
430 static int
res_MII(ddg_ptr g)431 res_MII (ddg_ptr g)
432 {
433   if (targetm.sched.sms_res_mii)
434     return targetm.sched.sms_res_mii (g);
435 
436   return ((g->num_nodes - g->num_debug) / issue_rate);
437 }
438 
439 
440 /* A vector that contains the sched data for each ps_insn.  */
441 static vec<node_sched_params> node_sched_param_vec;
442 
443 /* Allocate sched_params for each node and initialize it.  */
444 static void
set_node_sched_params(ddg_ptr g)445 set_node_sched_params (ddg_ptr g)
446 {
447   node_sched_param_vec.truncate (0);
448   node_sched_param_vec.safe_grow_cleared (g->num_nodes);
449 }
450 
451 /* Make sure that node_sched_param_vec has an entry for every move in PS.  */
452 static void
extend_node_sched_params(partial_schedule_ptr ps)453 extend_node_sched_params (partial_schedule_ptr ps)
454 {
455   node_sched_param_vec.safe_grow_cleared (ps->g->num_nodes
456 					  + ps->reg_moves.length ());
457 }
458 
459 /* Update the sched_params (time, row and stage) for node U using the II,
460    the CYCLE of U and MIN_CYCLE.
461    We're not simply taking the following
462    SCHED_STAGE (u) = CALC_STAGE_COUNT (SCHED_TIME (u), min_cycle, ii);
463    because the stages may not be aligned on cycle 0.  */
464 static void
update_node_sched_params(int u,int ii,int cycle,int min_cycle)465 update_node_sched_params (int u, int ii, int cycle, int min_cycle)
466 {
467   int sc_until_cycle_zero;
468   int stage;
469 
470   SCHED_TIME (u) = cycle;
471   SCHED_ROW (u) = SMODULO (cycle, ii);
472 
473   /* The calculation of stage count is done adding the number
474      of stages before cycle zero and after cycle zero.  */
475   sc_until_cycle_zero = CALC_STAGE_COUNT (-1, min_cycle, ii);
476 
477   if (SCHED_TIME (u) < 0)
478     {
479       stage = CALC_STAGE_COUNT (-1, SCHED_TIME (u), ii);
480       SCHED_STAGE (u) = sc_until_cycle_zero - stage;
481     }
482   else
483     {
484       stage = CALC_STAGE_COUNT (SCHED_TIME (u), 0, ii);
485       SCHED_STAGE (u) = sc_until_cycle_zero + stage - 1;
486     }
487 }
488 
489 static void
print_node_sched_params(FILE * file,int num_nodes,partial_schedule_ptr ps)490 print_node_sched_params (FILE *file, int num_nodes, partial_schedule_ptr ps)
491 {
492   int i;
493 
494   if (! file)
495     return;
496   for (i = 0; i < num_nodes; i++)
497     {
498       node_sched_params_ptr nsp = SCHED_PARAMS (i);
499 
500       fprintf (file, "Node = %d; INSN = %d\n", i,
501 	       INSN_UID (ps_rtl_insn (ps, i)));
502       fprintf (file, " asap = %d:\n", NODE_ASAP (&ps->g->nodes[i]));
503       fprintf (file, " time = %d:\n", nsp->time);
504       fprintf (file, " stage = %d:\n", nsp->stage);
505     }
506 }
507 
508 /* Set SCHED_COLUMN for each instruction in row ROW of PS.  */
509 static void
set_columns_for_row(partial_schedule_ptr ps,int row)510 set_columns_for_row (partial_schedule_ptr ps, int row)
511 {
512   ps_insn_ptr cur_insn;
513   int column;
514 
515   column = 0;
516   for (cur_insn = ps->rows[row]; cur_insn; cur_insn = cur_insn->next_in_row)
517     SCHED_COLUMN (cur_insn->id) = column++;
518 }
519 
520 /* Set SCHED_COLUMN for each instruction in PS.  */
521 static void
set_columns_for_ps(partial_schedule_ptr ps)522 set_columns_for_ps (partial_schedule_ptr ps)
523 {
524   int row;
525 
526   for (row = 0; row < ps->ii; row++)
527     set_columns_for_row (ps, row);
528 }
529 
530 /* Try to schedule the move with ps_insn identifier I_REG_MOVE in PS.
531    Its single predecessor has already been scheduled, as has its
532    ddg node successors.  (The move may have also another move as its
533    successor, in which case that successor will be scheduled later.)
534 
535    The move is part of a chain that satisfies register dependencies
536    between a producing ddg node and various consuming ddg nodes.
537    If some of these dependencies have a distance of 1 (meaning that
538    the use is upward-exposed) then DISTANCE1_USES is nonnull and
539    contains the set of uses with distance-1 dependencies.
540    DISTANCE1_USES is null otherwise.
541 
542    MUST_FOLLOW is a scratch bitmap that is big enough to hold
543    all current ps_insn ids.
544 
545    Return true on success.  */
546 static bool
schedule_reg_move(partial_schedule_ptr ps,int i_reg_move,sbitmap distance1_uses,sbitmap must_follow)547 schedule_reg_move (partial_schedule_ptr ps, int i_reg_move,
548 		   sbitmap distance1_uses, sbitmap must_follow)
549 {
550   unsigned int u;
551   int this_time, this_distance, this_start, this_end, this_latency;
552   int start, end, c, ii;
553   sbitmap_iterator sbi;
554   ps_reg_move_info *move;
555   rtx this_insn;
556   ps_insn_ptr psi;
557 
558   move = ps_reg_move (ps, i_reg_move);
559   ii = ps->ii;
560   if (dump_file)
561     {
562       fprintf (dump_file, "Scheduling register move INSN %d; ii = %d"
563 	       ", min cycle = %d\n\n", INSN_UID (move->insn), ii,
564 	       PS_MIN_CYCLE (ps));
565       print_rtl_single (dump_file, move->insn);
566       fprintf (dump_file, "\n%11s %11s %5s\n", "start", "end", "time");
567       fprintf (dump_file, "=========== =========== =====\n");
568     }
569 
570   start = INT_MIN;
571   end = INT_MAX;
572 
573   /* For dependencies of distance 1 between a producer ddg node A
574      and consumer ddg node B, we have a chain of dependencies:
575 
576         A --(T,L1,1)--> M1 --(T,L2,0)--> M2 ... --(T,Ln,0)--> B
577 
578      where Mi is the ith move.  For dependencies of distance 0 between
579      a producer ddg node A and consumer ddg node C, we have a chain of
580      dependencies:
581 
582         A --(T,L1',0)--> M1' --(T,L2',0)--> M2' ... --(T,Ln',0)--> C
583 
584      where Mi' occupies the same position as Mi but occurs a stage later.
585      We can only schedule each move once, so if we have both types of
586      chain, we model the second as:
587 
588         A --(T,L1',1)--> M1 --(T,L2',0)--> M2 ... --(T,Ln',-1)--> C
589 
590      First handle the dependencies between the previously-scheduled
591      predecessor and the move.  */
592   this_insn = ps_rtl_insn (ps, move->def);
593   this_latency = insn_latency (this_insn, move->insn);
594   this_distance = distance1_uses && move->def < ps->g->num_nodes ? 1 : 0;
595   this_time = SCHED_TIME (move->def) - this_distance * ii;
596   this_start = this_time + this_latency;
597   this_end = this_time + ii;
598   if (dump_file)
599     fprintf (dump_file, "%11d %11d %5d %d --(T,%d,%d)--> %d\n",
600 	     this_start, this_end, SCHED_TIME (move->def),
601 	     INSN_UID (this_insn), this_latency, this_distance,
602 	     INSN_UID (move->insn));
603 
604   if (start < this_start)
605     start = this_start;
606   if (end > this_end)
607     end = this_end;
608 
609   /* Handle the dependencies between the move and previously-scheduled
610      successors.  */
611   EXECUTE_IF_SET_IN_BITMAP (move->uses, 0, u, sbi)
612     {
613       this_insn = ps_rtl_insn (ps, u);
614       this_latency = insn_latency (move->insn, this_insn);
615       if (distance1_uses && !bitmap_bit_p (distance1_uses, u))
616 	this_distance = -1;
617       else
618 	this_distance = 0;
619       this_time = SCHED_TIME (u) + this_distance * ii;
620       this_start = this_time - ii;
621       this_end = this_time - this_latency;
622       if (dump_file)
623 	fprintf (dump_file, "%11d %11d %5d %d --(T,%d,%d)--> %d\n",
624 		 this_start, this_end, SCHED_TIME (u), INSN_UID (move->insn),
625 		 this_latency, this_distance, INSN_UID (this_insn));
626 
627       if (start < this_start)
628 	start = this_start;
629       if (end > this_end)
630 	end = this_end;
631     }
632 
633   if (dump_file)
634     {
635       fprintf (dump_file, "----------- ----------- -----\n");
636       fprintf (dump_file, "%11d %11d %5s %s\n", start, end, "", "(max, min)");
637     }
638 
639   bitmap_clear (must_follow);
640   bitmap_set_bit (must_follow, move->def);
641 
642   start = MAX (start, end - (ii - 1));
643   for (c = end; c >= start; c--)
644     {
645       psi = ps_add_node_check_conflicts (ps, i_reg_move, c,
646 					 move->uses, must_follow);
647       if (psi)
648 	{
649 	  update_node_sched_params (i_reg_move, ii, c, PS_MIN_CYCLE (ps));
650 	  if (dump_file)
651 	    fprintf (dump_file, "\nScheduled register move INSN %d at"
652 		     " time %d, row %d\n\n", INSN_UID (move->insn), c,
653 		     SCHED_ROW (i_reg_move));
654 	  return true;
655 	}
656     }
657 
658   if (dump_file)
659     fprintf (dump_file, "\nNo available slot\n\n");
660 
661   return false;
662 }
663 
664 /*
665    Breaking intra-loop register anti-dependences:
666    Each intra-loop register anti-dependence implies a cross-iteration true
667    dependence of distance 1. Therefore, we can remove such false dependencies
668    and figure out if the partial schedule broke them by checking if (for a
669    true-dependence of distance 1): SCHED_TIME (def) < SCHED_TIME (use) and
670    if so generate a register move.   The number of such moves is equal to:
671               SCHED_TIME (use) - SCHED_TIME (def)       { 0 broken
672    nreg_moves = ----------------------------------- + 1 - {   dependence.
673                             ii                          { 1 if not.
674 */
675 static bool
schedule_reg_moves(partial_schedule_ptr ps)676 schedule_reg_moves (partial_schedule_ptr ps)
677 {
678   ddg_ptr g = ps->g;
679   int ii = ps->ii;
680   int i;
681 
682   for (i = 0; i < g->num_nodes; i++)
683     {
684       ddg_node_ptr u = &g->nodes[i];
685       ddg_edge_ptr e;
686       int nreg_moves = 0, i_reg_move;
687       rtx prev_reg, old_reg;
688       int first_move;
689       int distances[2];
690       sbitmap must_follow;
691       sbitmap distance1_uses;
692       rtx set = single_set (u->insn);
693 
694       /* Skip instructions that do not set a register.  */
695       if ((set && !REG_P (SET_DEST (set))))
696         continue;
697 
698       /* Compute the number of reg_moves needed for u, by looking at life
699 	 ranges started at u (excluding self-loops).  */
700       distances[0] = distances[1] = false;
701       for (e = u->out; e; e = e->next_out)
702 	if (e->type == TRUE_DEP && e->dest != e->src)
703 	  {
704 	    int nreg_moves4e = (SCHED_TIME (e->dest->cuid)
705 				- SCHED_TIME (e->src->cuid)) / ii;
706 
707             if (e->distance == 1)
708               nreg_moves4e = (SCHED_TIME (e->dest->cuid)
709 			      - SCHED_TIME (e->src->cuid) + ii) / ii;
710 
711 	    /* If dest precedes src in the schedule of the kernel, then dest
712 	       will read before src writes and we can save one reg_copy.  */
713 	    if (SCHED_ROW (e->dest->cuid) == SCHED_ROW (e->src->cuid)
714 		&& SCHED_COLUMN (e->dest->cuid) < SCHED_COLUMN (e->src->cuid))
715 	      nreg_moves4e--;
716 
717             if (nreg_moves4e >= 1)
718 	      {
719 		/* !single_set instructions are not supported yet and
720 		   thus we do not except to encounter them in the loop
721 		   except from the doloop part.  For the latter case
722 		   we assume no regmoves are generated as the doloop
723 		   instructions are tied to the branch with an edge.  */
724 		gcc_assert (set);
725 		/* If the instruction contains auto-inc register then
726 		   validate that the regmov is being generated for the
727 		   target regsiter rather then the inc'ed register.	*/
728 		gcc_assert (!autoinc_var_is_used_p (u->insn, e->dest->insn));
729 	      }
730 
731 	    if (nreg_moves4e)
732 	      {
733 		gcc_assert (e->distance < 2);
734 		distances[e->distance] = true;
735 	      }
736 	    nreg_moves = MAX (nreg_moves, nreg_moves4e);
737 	  }
738 
739       if (nreg_moves == 0)
740 	continue;
741 
742       /* Create NREG_MOVES register moves.  */
743       first_move = ps->reg_moves.length ();
744       ps->reg_moves.safe_grow_cleared (first_move + nreg_moves);
745       extend_node_sched_params (ps);
746 
747       /* Record the moves associated with this node.  */
748       first_move += ps->g->num_nodes;
749 
750       /* Generate each move.  */
751       old_reg = prev_reg = SET_DEST (single_set (u->insn));
752       for (i_reg_move = 0; i_reg_move < nreg_moves; i_reg_move++)
753 	{
754 	  ps_reg_move_info *move = ps_reg_move (ps, first_move + i_reg_move);
755 
756 	  move->def = i_reg_move > 0 ? first_move + i_reg_move - 1 : i;
757 	  move->uses = sbitmap_alloc (first_move + nreg_moves);
758 	  move->old_reg = old_reg;
759 	  move->new_reg = gen_reg_rtx (GET_MODE (prev_reg));
760 	  move->num_consecutive_stages = distances[0] && distances[1] ? 2 : 1;
761 	  move->insn = gen_move_insn (move->new_reg, copy_rtx (prev_reg));
762 	  bitmap_clear (move->uses);
763 
764 	  prev_reg = move->new_reg;
765 	}
766 
767       distance1_uses = distances[1] ? sbitmap_alloc (g->num_nodes) : NULL;
768 
769       /* Every use of the register defined by node may require a different
770 	 copy of this register, depending on the time the use is scheduled.
771 	 Record which uses require which move results.  */
772       for (e = u->out; e; e = e->next_out)
773 	if (e->type == TRUE_DEP && e->dest != e->src)
774 	  {
775 	    int dest_copy = (SCHED_TIME (e->dest->cuid)
776 			     - SCHED_TIME (e->src->cuid)) / ii;
777 
778 	    if (e->distance == 1)
779 	      dest_copy = (SCHED_TIME (e->dest->cuid)
780 			   - SCHED_TIME (e->src->cuid) + ii) / ii;
781 
782 	    if (SCHED_ROW (e->dest->cuid) == SCHED_ROW (e->src->cuid)
783 		&& SCHED_COLUMN (e->dest->cuid) < SCHED_COLUMN (e->src->cuid))
784 	      dest_copy--;
785 
786 	    if (dest_copy)
787 	      {
788 		ps_reg_move_info *move;
789 
790 		move = ps_reg_move (ps, first_move + dest_copy - 1);
791 		bitmap_set_bit (move->uses, e->dest->cuid);
792 		if (e->distance == 1)
793 		  bitmap_set_bit (distance1_uses, e->dest->cuid);
794 	      }
795 	  }
796 
797       must_follow = sbitmap_alloc (first_move + nreg_moves);
798       for (i_reg_move = 0; i_reg_move < nreg_moves; i_reg_move++)
799 	if (!schedule_reg_move (ps, first_move + i_reg_move,
800 				distance1_uses, must_follow))
801 	  break;
802       sbitmap_free (must_follow);
803       if (distance1_uses)
804 	sbitmap_free (distance1_uses);
805       if (i_reg_move < nreg_moves)
806 	return false;
807     }
808   return true;
809 }
810 
811 /* Emit the moves associatied with PS.  Apply the substitutions
812    associated with them.  */
813 static void
apply_reg_moves(partial_schedule_ptr ps)814 apply_reg_moves (partial_schedule_ptr ps)
815 {
816   ps_reg_move_info *move;
817   int i;
818 
819   FOR_EACH_VEC_ELT (ps->reg_moves, i, move)
820     {
821       unsigned int i_use;
822       sbitmap_iterator sbi;
823 
824       EXECUTE_IF_SET_IN_BITMAP (move->uses, 0, i_use, sbi)
825 	{
826 	  replace_rtx (ps->g->nodes[i_use].insn, move->old_reg, move->new_reg);
827 	  df_insn_rescan (ps->g->nodes[i_use].insn);
828 	}
829     }
830 }
831 
832 /* Bump the SCHED_TIMEs of all nodes by AMOUNT.  Set the values of
833    SCHED_ROW and SCHED_STAGE.  Instruction scheduled on cycle AMOUNT
834    will move to cycle zero.  */
835 static void
reset_sched_times(partial_schedule_ptr ps,int amount)836 reset_sched_times (partial_schedule_ptr ps, int amount)
837 {
838   int row;
839   int ii = ps->ii;
840   ps_insn_ptr crr_insn;
841 
842   for (row = 0; row < ii; row++)
843     for (crr_insn = ps->rows[row]; crr_insn; crr_insn = crr_insn->next_in_row)
844       {
845 	int u = crr_insn->id;
846 	int normalized_time = SCHED_TIME (u) - amount;
847 	int new_min_cycle = PS_MIN_CYCLE (ps) - amount;
848 
849         if (dump_file)
850           {
851             /* Print the scheduling times after the rotation.  */
852 	    rtx insn = ps_rtl_insn (ps, u);
853 
854             fprintf (dump_file, "crr_insn->node=%d (insn id %d), "
855                      "crr_insn->cycle=%d, min_cycle=%d", u,
856                      INSN_UID (insn), normalized_time, new_min_cycle);
857             if (JUMP_P (insn))
858               fprintf (dump_file, " (branch)");
859             fprintf (dump_file, "\n");
860           }
861 
862 	gcc_assert (SCHED_TIME (u) >= ps->min_cycle);
863 	gcc_assert (SCHED_TIME (u) <= ps->max_cycle);
864 
865 	crr_insn->cycle = normalized_time;
866 	update_node_sched_params (u, ii, normalized_time, new_min_cycle);
867       }
868 }
869 
870 /* Permute the insns according to their order in PS, from row 0 to
871    row ii-1, and position them right before LAST.  This schedules
872    the insns of the loop kernel.  */
873 static void
permute_partial_schedule(partial_schedule_ptr ps,rtx last)874 permute_partial_schedule (partial_schedule_ptr ps, rtx last)
875 {
876   int ii = ps->ii;
877   int row;
878   ps_insn_ptr ps_ij;
879 
880   for (row = 0; row < ii ; row++)
881     for (ps_ij = ps->rows[row]; ps_ij; ps_ij = ps_ij->next_in_row)
882       {
883 	rtx insn = ps_rtl_insn (ps, ps_ij->id);
884 
885 	if (PREV_INSN (last) != insn)
886 	  {
887 	    if (ps_ij->id < ps->g->num_nodes)
888 	      reorder_insns_nobb (ps_first_note (ps, ps_ij->id), insn,
889 				  PREV_INSN (last));
890 	    else
891 	      add_insn_before (insn, last, NULL);
892 	  }
893       }
894 }
895 
896 /* Set bitmaps TMP_FOLLOW and TMP_PRECEDE to MUST_FOLLOW and MUST_PRECEDE
897    respectively only if cycle C falls on the border of the scheduling
898    window boundaries marked by START and END cycles.  STEP is the
899    direction of the window.  */
900 static inline void
set_must_precede_follow(sbitmap * tmp_follow,sbitmap must_follow,sbitmap * tmp_precede,sbitmap must_precede,int c,int start,int end,int step)901 set_must_precede_follow (sbitmap *tmp_follow, sbitmap must_follow,
902 			 sbitmap *tmp_precede, sbitmap must_precede, int c,
903 			 int start, int end, int step)
904 {
905   *tmp_precede = NULL;
906   *tmp_follow = NULL;
907 
908   if (c == start)
909     {
910       if (step == 1)
911 	*tmp_precede = must_precede;
912       else			/* step == -1.  */
913 	*tmp_follow = must_follow;
914     }
915   if (c == end - step)
916     {
917       if (step == 1)
918 	*tmp_follow = must_follow;
919       else			/* step == -1.  */
920 	*tmp_precede = must_precede;
921     }
922 
923 }
924 
925 /* Return True if the branch can be moved to row ii-1 while
926    normalizing the partial schedule PS to start from cycle zero and thus
927    optimize the SC.  Otherwise return False.  */
928 static bool
optimize_sc(partial_schedule_ptr ps,ddg_ptr g)929 optimize_sc (partial_schedule_ptr ps, ddg_ptr g)
930 {
931   int amount = PS_MIN_CYCLE (ps);
932   sbitmap sched_nodes = sbitmap_alloc (g->num_nodes);
933   int start, end, step;
934   int ii = ps->ii;
935   bool ok = false;
936   int stage_count, stage_count_curr;
937 
938   /* Compare the SC after normalization and SC after bringing the branch
939      to row ii-1.  If they are equal just bail out.  */
940   stage_count = calculate_stage_count (ps, amount);
941   stage_count_curr =
942     calculate_stage_count (ps, SCHED_TIME (g->closing_branch->cuid) - (ii - 1));
943 
944   if (stage_count == stage_count_curr)
945     {
946       if (dump_file)
947 	fprintf (dump_file, "SMS SC already optimized.\n");
948 
949       ok = false;
950       goto clear;
951     }
952 
953   if (dump_file)
954     {
955       fprintf (dump_file, "SMS Trying to optimize branch location\n");
956       fprintf (dump_file, "SMS partial schedule before trial:\n");
957       print_partial_schedule (ps, dump_file);
958     }
959 
960   /* First, normalize the partial scheduling.  */
961   reset_sched_times (ps, amount);
962   rotate_partial_schedule (ps, amount);
963   if (dump_file)
964     {
965       fprintf (dump_file,
966 	       "SMS partial schedule after normalization (ii, %d, SC %d):\n",
967 	       ii, stage_count);
968       print_partial_schedule (ps, dump_file);
969     }
970 
971   if (SMODULO (SCHED_TIME (g->closing_branch->cuid), ii) == ii - 1)
972     {
973       ok = true;
974       goto clear;
975     }
976 
977   bitmap_ones (sched_nodes);
978 
979   /* Calculate the new placement of the branch.  It should be in row
980      ii-1 and fall into it's scheduling window.  */
981   if (get_sched_window (ps, g->closing_branch, sched_nodes, ii, &start,
982 			&step, &end) == 0)
983     {
984       bool success;
985       ps_insn_ptr next_ps_i;
986       int branch_cycle = SCHED_TIME (g->closing_branch->cuid);
987       int row = SMODULO (branch_cycle, ps->ii);
988       int num_splits = 0;
989       sbitmap must_precede, must_follow, tmp_precede, tmp_follow;
990       int c;
991 
992       if (dump_file)
993 	fprintf (dump_file, "\nTrying to schedule node %d "
994 		 "INSN = %d  in (%d .. %d) step %d\n",
995 		 g->closing_branch->cuid,
996 		 (INSN_UID (g->closing_branch->insn)), start, end, step);
997 
998       gcc_assert ((step > 0 && start < end) || (step < 0 && start > end));
999       if (step == 1)
1000 	{
1001 	  c = start + ii - SMODULO (start, ii) - 1;
1002 	  gcc_assert (c >= start);
1003 	  if (c >= end)
1004 	    {
1005 	      ok = false;
1006 	      if (dump_file)
1007 		fprintf (dump_file,
1008 			 "SMS failed to schedule branch at cycle: %d\n", c);
1009 	      goto clear;
1010 	    }
1011 	}
1012       else
1013 	{
1014 	  c = start - SMODULO (start, ii) - 1;
1015 	  gcc_assert (c <= start);
1016 
1017 	  if (c <= end)
1018 	    {
1019 	      if (dump_file)
1020 		fprintf (dump_file,
1021 			 "SMS failed to schedule branch at cycle: %d\n", c);
1022 	      ok = false;
1023 	      goto clear;
1024 	    }
1025 	}
1026 
1027       must_precede = sbitmap_alloc (g->num_nodes);
1028       must_follow = sbitmap_alloc (g->num_nodes);
1029 
1030       /* Try to schedule the branch is it's new cycle.  */
1031       calculate_must_precede_follow (g->closing_branch, start, end,
1032 				     step, ii, sched_nodes,
1033 				     must_precede, must_follow);
1034 
1035       set_must_precede_follow (&tmp_follow, must_follow, &tmp_precede,
1036 			       must_precede, c, start, end, step);
1037 
1038       /* Find the element in the partial schedule related to the closing
1039          branch so we can remove it from it's current cycle.  */
1040       for (next_ps_i = ps->rows[row];
1041 	   next_ps_i; next_ps_i = next_ps_i->next_in_row)
1042 	if (next_ps_i->id == g->closing_branch->cuid)
1043 	  break;
1044 
1045       remove_node_from_ps (ps, next_ps_i);
1046       success =
1047 	try_scheduling_node_in_cycle (ps, g->closing_branch->cuid, c,
1048 				      sched_nodes, &num_splits,
1049 				      tmp_precede, tmp_follow);
1050       gcc_assert (num_splits == 0);
1051       if (!success)
1052 	{
1053 	  if (dump_file)
1054 	    fprintf (dump_file,
1055 		     "SMS failed to schedule branch at cycle: %d, "
1056 		     "bringing it back to cycle %d\n", c, branch_cycle);
1057 
1058 	  /* The branch was failed to be placed in row ii - 1.
1059 	     Put it back in it's original place in the partial
1060 	     schedualing.  */
1061 	  set_must_precede_follow (&tmp_follow, must_follow, &tmp_precede,
1062 				   must_precede, branch_cycle, start, end,
1063 				   step);
1064 	  success =
1065 	    try_scheduling_node_in_cycle (ps, g->closing_branch->cuid,
1066 					  branch_cycle, sched_nodes,
1067 					  &num_splits, tmp_precede,
1068 					  tmp_follow);
1069 	  gcc_assert (success && (num_splits == 0));
1070 	  ok = false;
1071 	}
1072       else
1073 	{
1074 	  /* The branch is placed in row ii - 1.  */
1075 	  if (dump_file)
1076 	    fprintf (dump_file,
1077 		     "SMS success in moving branch to cycle %d\n", c);
1078 
1079 	  update_node_sched_params (g->closing_branch->cuid, ii, c,
1080 				    PS_MIN_CYCLE (ps));
1081 	  ok = true;
1082 	}
1083 
1084       free (must_precede);
1085       free (must_follow);
1086     }
1087 
1088 clear:
1089   free (sched_nodes);
1090   return ok;
1091 }
1092 
1093 static void
duplicate_insns_of_cycles(partial_schedule_ptr ps,int from_stage,int to_stage,rtx count_reg)1094 duplicate_insns_of_cycles (partial_schedule_ptr ps, int from_stage,
1095 			   int to_stage, rtx count_reg)
1096 {
1097   int row;
1098   ps_insn_ptr ps_ij;
1099 
1100   for (row = 0; row < ps->ii; row++)
1101     for (ps_ij = ps->rows[row]; ps_ij; ps_ij = ps_ij->next_in_row)
1102       {
1103 	int u = ps_ij->id;
1104 	int first_u, last_u;
1105 	rtx u_insn;
1106 
1107         /* Do not duplicate any insn which refers to count_reg as it
1108            belongs to the control part.
1109            The closing branch is scheduled as well and thus should
1110            be ignored.
1111            TODO: This should be done by analyzing the control part of
1112            the loop.  */
1113 	u_insn = ps_rtl_insn (ps, u);
1114         if (reg_mentioned_p (count_reg, u_insn)
1115             || JUMP_P (u_insn))
1116           continue;
1117 
1118 	first_u = SCHED_STAGE (u);
1119 	last_u = first_u + ps_num_consecutive_stages (ps, u) - 1;
1120 	if (from_stage <= last_u && to_stage >= first_u)
1121 	  {
1122 	    if (u < ps->g->num_nodes)
1123 	      duplicate_insn_chain (ps_first_note (ps, u), u_insn);
1124 	    else
1125 	      emit_insn (copy_rtx (PATTERN (u_insn)));
1126 	  }
1127       }
1128 }
1129 
1130 
1131 /* Generate the instructions (including reg_moves) for prolog & epilog.  */
1132 static void
generate_prolog_epilog(partial_schedule_ptr ps,struct loop * loop,rtx count_reg,rtx count_init)1133 generate_prolog_epilog (partial_schedule_ptr ps, struct loop *loop,
1134                         rtx count_reg, rtx count_init)
1135 {
1136   int i;
1137   int last_stage = PS_STAGE_COUNT (ps) - 1;
1138   edge e;
1139 
1140   /* Generate the prolog, inserting its insns on the loop-entry edge.  */
1141   start_sequence ();
1142 
1143   if (!count_init)
1144     {
1145       /* Generate instructions at the beginning of the prolog to
1146          adjust the loop count by STAGE_COUNT.  If loop count is constant
1147          (count_init), this constant is adjusted by STAGE_COUNT in
1148          generate_prolog_epilog function.  */
1149       rtx sub_reg = NULL_RTX;
1150 
1151       sub_reg = expand_simple_binop (GET_MODE (count_reg), MINUS,
1152                                      count_reg, GEN_INT (last_stage),
1153                                      count_reg, 1, OPTAB_DIRECT);
1154       gcc_assert (REG_P (sub_reg));
1155       if (REGNO (sub_reg) != REGNO (count_reg))
1156         emit_move_insn (count_reg, sub_reg);
1157     }
1158 
1159   for (i = 0; i < last_stage; i++)
1160     duplicate_insns_of_cycles (ps, 0, i, count_reg);
1161 
1162   /* Put the prolog on the entry edge.  */
1163   e = loop_preheader_edge (loop);
1164   split_edge_and_insert (e, get_insns ());
1165   if (!flag_resched_modulo_sched)
1166     e->dest->flags |= BB_DISABLE_SCHEDULE;
1167 
1168   end_sequence ();
1169 
1170   /* Generate the epilog, inserting its insns on the loop-exit edge.  */
1171   start_sequence ();
1172 
1173   for (i = 0; i < last_stage; i++)
1174     duplicate_insns_of_cycles (ps, i + 1, last_stage, count_reg);
1175 
1176   /* Put the epilogue on the exit edge.  */
1177   gcc_assert (single_exit (loop));
1178   e = single_exit (loop);
1179   split_edge_and_insert (e, get_insns ());
1180   if (!flag_resched_modulo_sched)
1181     e->dest->flags |= BB_DISABLE_SCHEDULE;
1182 
1183   end_sequence ();
1184 }
1185 
1186 /* Mark LOOP as software pipelined so the later
1187    scheduling passes don't touch it.  */
1188 static void
mark_loop_unsched(struct loop * loop)1189 mark_loop_unsched (struct loop *loop)
1190 {
1191   unsigned i;
1192   basic_block *bbs = get_loop_body (loop);
1193 
1194   for (i = 0; i < loop->num_nodes; i++)
1195     bbs[i]->flags |= BB_DISABLE_SCHEDULE;
1196 
1197   free (bbs);
1198 }
1199 
1200 /* Return true if all the BBs of the loop are empty except the
1201    loop header.  */
1202 static bool
loop_single_full_bb_p(struct loop * loop)1203 loop_single_full_bb_p (struct loop *loop)
1204 {
1205   unsigned i;
1206   basic_block *bbs = get_loop_body (loop);
1207 
1208   for (i = 0; i < loop->num_nodes ; i++)
1209     {
1210       rtx head, tail;
1211       bool empty_bb = true;
1212 
1213       if (bbs[i] == loop->header)
1214         continue;
1215 
1216       /* Make sure that basic blocks other than the header
1217          have only notes labels or jumps.  */
1218       get_ebb_head_tail (bbs[i], bbs[i], &head, &tail);
1219       for (; head != NEXT_INSN (tail); head = NEXT_INSN (head))
1220         {
1221           if (NOTE_P (head) || LABEL_P (head)
1222  	      || (INSN_P (head) && (DEBUG_INSN_P (head) || JUMP_P (head))))
1223  	    continue;
1224  	  empty_bb = false;
1225  	  break;
1226         }
1227 
1228       if (! empty_bb)
1229         {
1230           free (bbs);
1231           return false;
1232         }
1233     }
1234   free (bbs);
1235   return true;
1236 }
1237 
1238 /* Dump file:line from INSN's location info to dump_file.  */
1239 
1240 static void
dump_insn_location(rtx insn)1241 dump_insn_location (rtx insn)
1242 {
1243   if (dump_file && INSN_LOCATION (insn))
1244     {
1245       const char *file = insn_file (insn);
1246       if (file)
1247 	fprintf (dump_file, " %s:%i", file, insn_line (insn));
1248     }
1249 }
1250 
1251 /* A simple loop from SMS point of view; it is a loop that is composed of
1252    either a single basic block or two BBs - a header and a latch.  */
1253 #define SIMPLE_SMS_LOOP_P(loop) ((loop->num_nodes < 3 ) 		    \
1254 				  && (EDGE_COUNT (loop->latch->preds) == 1) \
1255                                   && (EDGE_COUNT (loop->latch->succs) == 1))
1256 
1257 /* Return true if the loop is in its canonical form and false if not.
1258    i.e. SIMPLE_SMS_LOOP_P and have one preheader block, and single exit.  */
1259 static bool
loop_canon_p(struct loop * loop)1260 loop_canon_p (struct loop *loop)
1261 {
1262 
1263   if (loop->inner || !loop_outer (loop))
1264   {
1265     if (dump_file)
1266       fprintf (dump_file, "SMS loop inner or !loop_outer\n");
1267     return false;
1268   }
1269 
1270   if (!single_exit (loop))
1271     {
1272       if (dump_file)
1273 	{
1274 	  rtx insn = BB_END (loop->header);
1275 
1276 	  fprintf (dump_file, "SMS loop many exits");
1277 	  dump_insn_location (insn);
1278 	  fprintf (dump_file, "\n");
1279 	}
1280       return false;
1281     }
1282 
1283   if (! SIMPLE_SMS_LOOP_P (loop) && ! loop_single_full_bb_p (loop))
1284     {
1285       if (dump_file)
1286 	{
1287 	  rtx insn = BB_END (loop->header);
1288 
1289 	  fprintf (dump_file, "SMS loop many BBs.");
1290 	  dump_insn_location (insn);
1291 	  fprintf (dump_file, "\n");
1292 	}
1293       return false;
1294     }
1295 
1296     return true;
1297 }
1298 
1299 /* If there are more than one entry for the loop,
1300    make it one by splitting the first entry edge and
1301    redirecting the others to the new BB.  */
1302 static void
canon_loop(struct loop * loop)1303 canon_loop (struct loop *loop)
1304 {
1305   edge e;
1306   edge_iterator i;
1307 
1308   /* Avoid annoying special cases of edges going to exit
1309      block.  */
1310   FOR_EACH_EDGE (e, i, EXIT_BLOCK_PTR->preds)
1311     if ((e->flags & EDGE_FALLTHRU) && (EDGE_COUNT (e->src->succs) > 1))
1312       split_edge (e);
1313 
1314   if (loop->latch == loop->header
1315       || EDGE_COUNT (loop->latch->succs) > 1)
1316     {
1317       FOR_EACH_EDGE (e, i, loop->header->preds)
1318         if (e->src == loop->latch)
1319           break;
1320       split_edge (e);
1321     }
1322 }
1323 
1324 /* Setup infos.  */
1325 static void
setup_sched_infos(void)1326 setup_sched_infos (void)
1327 {
1328   memcpy (&sms_common_sched_info, &haifa_common_sched_info,
1329 	  sizeof (sms_common_sched_info));
1330   sms_common_sched_info.sched_pass_id = SCHED_SMS_PASS;
1331   common_sched_info = &sms_common_sched_info;
1332 
1333   sched_deps_info = &sms_sched_deps_info;
1334   current_sched_info = &sms_sched_info;
1335 }
1336 
1337 /* Probability in % that the sms-ed loop rolls enough so that optimized
1338    version may be entered.  Just a guess.  */
1339 #define PROB_SMS_ENOUGH_ITERATIONS 80
1340 
1341 /* Used to calculate the upper bound of ii.  */
1342 #define MAXII_FACTOR 2
1343 
1344 /* Main entry point, perform SMS scheduling on the loops of the function
1345    that consist of single basic blocks.  */
1346 static void
sms_schedule(void)1347 sms_schedule (void)
1348 {
1349   rtx insn;
1350   ddg_ptr *g_arr, g;
1351   int * node_order;
1352   int maxii, max_asap;
1353   loop_iterator li;
1354   partial_schedule_ptr ps;
1355   basic_block bb = NULL;
1356   struct loop *loop;
1357   basic_block condition_bb = NULL;
1358   edge latch_edge;
1359   gcov_type trip_count = 0;
1360 
1361   loop_optimizer_init (LOOPS_HAVE_PREHEADERS
1362 		       | LOOPS_HAVE_RECORDED_EXITS);
1363   if (number_of_loops () <= 1)
1364     {
1365       loop_optimizer_finalize ();
1366       return;  /* There are no loops to schedule.  */
1367     }
1368 
1369   /* Initialize issue_rate.  */
1370   if (targetm.sched.issue_rate)
1371     {
1372       int temp = reload_completed;
1373 
1374       reload_completed = 1;
1375       issue_rate = targetm.sched.issue_rate ();
1376       reload_completed = temp;
1377     }
1378   else
1379     issue_rate = 1;
1380 
1381   /* Initialize the scheduler.  */
1382   setup_sched_infos ();
1383   haifa_sched_init ();
1384 
1385   /* Allocate memory to hold the DDG array one entry for each loop.
1386      We use loop->num as index into this array.  */
1387   g_arr = XCNEWVEC (ddg_ptr, number_of_loops ());
1388 
1389   if (dump_file)
1390   {
1391     fprintf (dump_file, "\n\nSMS analysis phase\n");
1392     fprintf (dump_file, "===================\n\n");
1393   }
1394 
1395   /* Build DDGs for all the relevant loops and hold them in G_ARR
1396      indexed by the loop index.  */
1397   FOR_EACH_LOOP (li, loop, 0)
1398     {
1399       rtx head, tail;
1400       rtx count_reg;
1401 
1402       /* For debugging.  */
1403       if (dbg_cnt (sms_sched_loop) == false)
1404         {
1405           if (dump_file)
1406             fprintf (dump_file, "SMS reached max limit... \n");
1407 
1408 	  FOR_EACH_LOOP_BREAK (li);
1409         }
1410 
1411       if (dump_file)
1412 	{
1413 	  rtx insn = BB_END (loop->header);
1414 
1415 	  fprintf (dump_file, "SMS loop num: %d", loop->num);
1416 	  dump_insn_location (insn);
1417 	  fprintf (dump_file, "\n");
1418 	}
1419 
1420       if (! loop_canon_p (loop))
1421         continue;
1422 
1423       if (! loop_single_full_bb_p (loop))
1424       {
1425         if (dump_file)
1426           fprintf (dump_file, "SMS not loop_single_full_bb_p\n");
1427 	continue;
1428       }
1429 
1430       bb = loop->header;
1431 
1432       get_ebb_head_tail (bb, bb, &head, &tail);
1433       latch_edge = loop_latch_edge (loop);
1434       gcc_assert (single_exit (loop));
1435       if (single_exit (loop)->count)
1436 	trip_count = latch_edge->count / single_exit (loop)->count;
1437 
1438       /* Perform SMS only on loops that their average count is above threshold.  */
1439 
1440       if ( latch_edge->count
1441           && (latch_edge->count < single_exit (loop)->count * SMS_LOOP_AVERAGE_COUNT_THRESHOLD))
1442 	{
1443 	  if (dump_file)
1444 	    {
1445 	      dump_insn_location (tail);
1446 	      fprintf (dump_file, "\nSMS single-bb-loop\n");
1447 	      if (profile_info && flag_branch_probabilities)
1448 	    	{
1449 	      	  fprintf (dump_file, "SMS loop-count ");
1450 	      	  fprintf (dump_file, HOST_WIDEST_INT_PRINT_DEC,
1451 	             	   (HOST_WIDEST_INT) bb->count);
1452 	      	  fprintf (dump_file, "\n");
1453                   fprintf (dump_file, "SMS trip-count ");
1454                   fprintf (dump_file, HOST_WIDEST_INT_PRINT_DEC,
1455                            (HOST_WIDEST_INT) trip_count);
1456                   fprintf (dump_file, "\n");
1457 	      	  fprintf (dump_file, "SMS profile-sum-max ");
1458 	      	  fprintf (dump_file, HOST_WIDEST_INT_PRINT_DEC,
1459 	          	   (HOST_WIDEST_INT) profile_info->sum_max);
1460 	      	  fprintf (dump_file, "\n");
1461 	    	}
1462 	    }
1463           continue;
1464         }
1465 
1466       /* Make sure this is a doloop.  */
1467       if ( !(count_reg = doloop_register_get (head, tail)))
1468       {
1469         if (dump_file)
1470           fprintf (dump_file, "SMS doloop_register_get failed\n");
1471 	continue;
1472       }
1473 
1474       /* Don't handle BBs with calls or barriers
1475 	 or !single_set with the exception of instructions that include
1476 	 count_reg---these instructions are part of the control part
1477 	 that do-loop recognizes.
1478          ??? Should handle insns defining subregs.  */
1479      for (insn = head; insn != NEXT_INSN (tail); insn = NEXT_INSN (insn))
1480       {
1481          rtx set;
1482 
1483         if (CALL_P (insn)
1484             || BARRIER_P (insn)
1485             || (NONDEBUG_INSN_P (insn) && !JUMP_P (insn)
1486                 && !single_set (insn) && GET_CODE (PATTERN (insn)) != USE
1487                 && !reg_mentioned_p (count_reg, insn))
1488             || (INSN_P (insn) && (set = single_set (insn))
1489                 && GET_CODE (SET_DEST (set)) == SUBREG))
1490         break;
1491       }
1492 
1493       if (insn != NEXT_INSN (tail))
1494 	{
1495 	  if (dump_file)
1496 	    {
1497 	      if (CALL_P (insn))
1498 		fprintf (dump_file, "SMS loop-with-call\n");
1499 	      else if (BARRIER_P (insn))
1500 		fprintf (dump_file, "SMS loop-with-barrier\n");
1501               else if ((NONDEBUG_INSN_P (insn) && !JUMP_P (insn)
1502                 && !single_set (insn) && GET_CODE (PATTERN (insn)) != USE))
1503                 fprintf (dump_file, "SMS loop-with-not-single-set\n");
1504               else
1505                fprintf (dump_file, "SMS loop with subreg in lhs\n");
1506 	      print_rtl_single (dump_file, insn);
1507 	    }
1508 
1509 	  continue;
1510 	}
1511 
1512       /* Always schedule the closing branch with the rest of the
1513          instructions. The branch is rotated to be in row ii-1 at the
1514          end of the scheduling procedure to make sure it's the last
1515          instruction in the iteration.  */
1516       if (! (g = create_ddg (bb, 1)))
1517         {
1518           if (dump_file)
1519 	    fprintf (dump_file, "SMS create_ddg failed\n");
1520 	  continue;
1521         }
1522 
1523       g_arr[loop->num] = g;
1524       if (dump_file)
1525         fprintf (dump_file, "...OK\n");
1526 
1527     }
1528   if (dump_file)
1529   {
1530     fprintf (dump_file, "\nSMS transformation phase\n");
1531     fprintf (dump_file, "=========================\n\n");
1532   }
1533 
1534   /* We don't want to perform SMS on new loops - created by versioning.  */
1535   FOR_EACH_LOOP (li, loop, 0)
1536     {
1537       rtx head, tail;
1538       rtx count_reg, count_init;
1539       int mii, rec_mii, stage_count, min_cycle;
1540       HOST_WIDEST_INT loop_count = 0;
1541       bool opt_sc_p;
1542 
1543       if (! (g = g_arr[loop->num]))
1544         continue;
1545 
1546       if (dump_file)
1547 	{
1548 	  rtx insn = BB_END (loop->header);
1549 
1550 	  fprintf (dump_file, "SMS loop num: %d", loop->num);
1551 	  dump_insn_location (insn);
1552 	  fprintf (dump_file, "\n");
1553 
1554 	  print_ddg (dump_file, g);
1555 	}
1556 
1557       get_ebb_head_tail (loop->header, loop->header, &head, &tail);
1558 
1559       latch_edge = loop_latch_edge (loop);
1560       gcc_assert (single_exit (loop));
1561       if (single_exit (loop)->count)
1562 	trip_count = latch_edge->count / single_exit (loop)->count;
1563 
1564       if (dump_file)
1565 	{
1566 	  dump_insn_location (tail);
1567 	  fprintf (dump_file, "\nSMS single-bb-loop\n");
1568 	  if (profile_info && flag_branch_probabilities)
1569 	    {
1570 	      fprintf (dump_file, "SMS loop-count ");
1571 	      fprintf (dump_file, HOST_WIDEST_INT_PRINT_DEC,
1572 	               (HOST_WIDEST_INT) bb->count);
1573 	      fprintf (dump_file, "\n");
1574 	      fprintf (dump_file, "SMS profile-sum-max ");
1575 	      fprintf (dump_file, HOST_WIDEST_INT_PRINT_DEC,
1576 	               (HOST_WIDEST_INT) profile_info->sum_max);
1577 	      fprintf (dump_file, "\n");
1578 	    }
1579 	  fprintf (dump_file, "SMS doloop\n");
1580 	  fprintf (dump_file, "SMS built-ddg %d\n", g->num_nodes);
1581           fprintf (dump_file, "SMS num-loads %d\n", g->num_loads);
1582           fprintf (dump_file, "SMS num-stores %d\n", g->num_stores);
1583 	}
1584 
1585 
1586       /* In case of th loop have doloop register it gets special
1587 	 handling.  */
1588       count_init = NULL_RTX;
1589       if ((count_reg = doloop_register_get (head, tail)))
1590 	{
1591 	  basic_block pre_header;
1592 
1593 	  pre_header = loop_preheader_edge (loop)->src;
1594 	  count_init = const_iteration_count (count_reg, pre_header,
1595 					      &loop_count);
1596 	}
1597       gcc_assert (count_reg);
1598 
1599       if (dump_file && count_init)
1600         {
1601           fprintf (dump_file, "SMS const-doloop ");
1602           fprintf (dump_file, HOST_WIDEST_INT_PRINT_DEC,
1603 		     loop_count);
1604           fprintf (dump_file, "\n");
1605         }
1606 
1607       node_order = XNEWVEC (int, g->num_nodes);
1608 
1609       mii = 1; /* Need to pass some estimate of mii.  */
1610       rec_mii = sms_order_nodes (g, mii, node_order, &max_asap);
1611       mii = MAX (res_MII (g), rec_mii);
1612       maxii = MAX (max_asap, MAXII_FACTOR * mii);
1613 
1614       if (dump_file)
1615 	fprintf (dump_file, "SMS iis %d %d %d (rec_mii, mii, maxii)\n",
1616 		 rec_mii, mii, maxii);
1617 
1618       for (;;)
1619 	{
1620 	  set_node_sched_params (g);
1621 
1622 	  stage_count = 0;
1623 	  opt_sc_p = false;
1624 	  ps = sms_schedule_by_order (g, mii, maxii, node_order);
1625 
1626 	  if (ps)
1627 	    {
1628 	      /* Try to achieve optimized SC by normalizing the partial
1629 		 schedule (having the cycles start from cycle zero).
1630 		 The branch location must be placed in row ii-1 in the
1631 		 final scheduling.	If failed, shift all instructions to
1632 		 position the branch in row ii-1.  */
1633 	      opt_sc_p = optimize_sc (ps, g);
1634 	      if (opt_sc_p)
1635 		stage_count = calculate_stage_count (ps, 0);
1636 	      else
1637 		{
1638 		  /* Bring the branch to cycle ii-1.  */
1639 		  int amount = (SCHED_TIME (g->closing_branch->cuid)
1640 				- (ps->ii - 1));
1641 
1642 		  if (dump_file)
1643 		    fprintf (dump_file, "SMS schedule branch at cycle ii-1\n");
1644 
1645 		  stage_count = calculate_stage_count (ps, amount);
1646 		}
1647 
1648 	      gcc_assert (stage_count >= 1);
1649 	    }
1650 
1651 	  /* The default value of PARAM_SMS_MIN_SC is 2 as stage count of
1652 	     1 means that there is no interleaving between iterations thus
1653 	     we let the scheduling passes do the job in this case.  */
1654 	  if (stage_count < PARAM_VALUE (PARAM_SMS_MIN_SC)
1655 	      || (count_init && (loop_count <= stage_count))
1656 	      || (flag_branch_probabilities && (trip_count <= stage_count)))
1657 	    {
1658 	      if (dump_file)
1659 		{
1660 		  fprintf (dump_file, "SMS failed... \n");
1661 		  fprintf (dump_file, "SMS sched-failed (stage-count=%d,"
1662 			   " loop-count=", stage_count);
1663 		  fprintf (dump_file, HOST_WIDEST_INT_PRINT_DEC, loop_count);
1664 		  fprintf (dump_file, ", trip-count=");
1665 		  fprintf (dump_file, HOST_WIDEST_INT_PRINT_DEC, trip_count);
1666 		  fprintf (dump_file, ")\n");
1667 		}
1668 	      break;
1669 	    }
1670 
1671           if (!opt_sc_p)
1672             {
1673 	      /* Rotate the partial schedule to have the branch in row ii-1.  */
1674               int amount = SCHED_TIME (g->closing_branch->cuid) - (ps->ii - 1);
1675 
1676               reset_sched_times (ps, amount);
1677               rotate_partial_schedule (ps, amount);
1678             }
1679 
1680 	  set_columns_for_ps (ps);
1681 
1682 	  min_cycle = PS_MIN_CYCLE (ps) - SMODULO (PS_MIN_CYCLE (ps), ps->ii);
1683 	  if (!schedule_reg_moves (ps))
1684 	    {
1685 	      mii = ps->ii + 1;
1686 	      free_partial_schedule (ps);
1687 	      continue;
1688 	    }
1689 
1690 	  /* Moves that handle incoming values might have been added
1691 	     to a new first stage.  Bump the stage count if so.
1692 
1693 	     ??? Perhaps we could consider rotating the schedule here
1694 	     instead?  */
1695 	  if (PS_MIN_CYCLE (ps) < min_cycle)
1696 	    {
1697 	      reset_sched_times (ps, 0);
1698 	      stage_count++;
1699 	    }
1700 
1701 	  /* The stage count should now be correct without rotation.  */
1702 	  gcc_checking_assert (stage_count == calculate_stage_count (ps, 0));
1703 	  PS_STAGE_COUNT (ps) = stage_count;
1704 
1705 	  canon_loop (loop);
1706 
1707           if (dump_file)
1708             {
1709 	      dump_insn_location (tail);
1710 	      fprintf (dump_file, " SMS succeeded %d %d (with ii, sc)\n",
1711 		       ps->ii, stage_count);
1712 	      print_partial_schedule (ps, dump_file);
1713 	    }
1714 
1715           /* case the BCT count is not known , Do loop-versioning */
1716 	  if (count_reg && ! count_init)
1717             {
1718 	      rtx comp_rtx = gen_rtx_fmt_ee (GT, VOIDmode, count_reg,
1719 	  				     GEN_INT(stage_count));
1720 	      unsigned prob = (PROB_SMS_ENOUGH_ITERATIONS
1721 			       * REG_BR_PROB_BASE) / 100;
1722 
1723 	      loop_version (loop, comp_rtx, &condition_bb,
1724 	  		    prob, prob, REG_BR_PROB_BASE - prob,
1725 			    true);
1726 	     }
1727 
1728 	  /* Set new iteration count of loop kernel.  */
1729           if (count_reg && count_init)
1730 	    SET_SRC (single_set (count_init)) = GEN_INT (loop_count
1731 						     - stage_count + 1);
1732 
1733 	  /* Now apply the scheduled kernel to the RTL of the loop.  */
1734 	  permute_partial_schedule (ps, g->closing_branch->first_note);
1735 
1736           /* Mark this loop as software pipelined so the later
1737 	     scheduling passes don't touch it.  */
1738 	  if (! flag_resched_modulo_sched)
1739 	    mark_loop_unsched (loop);
1740 
1741 	  /* The life-info is not valid any more.  */
1742 	  df_set_bb_dirty (g->bb);
1743 
1744 	  apply_reg_moves (ps);
1745 	  if (dump_file)
1746 	    print_node_sched_params (dump_file, g->num_nodes, ps);
1747 	  /* Generate prolog and epilog.  */
1748           generate_prolog_epilog (ps, loop, count_reg, count_init);
1749 	  break;
1750 	}
1751 
1752       free_partial_schedule (ps);
1753       node_sched_param_vec.release ();
1754       free (node_order);
1755       free_ddg (g);
1756     }
1757 
1758   free (g_arr);
1759 
1760   /* Release scheduler data, needed until now because of DFA.  */
1761   haifa_sched_finish ();
1762   loop_optimizer_finalize ();
1763 }
1764 
1765 /* The SMS scheduling algorithm itself
1766    -----------------------------------
1767    Input: 'O' an ordered list of insns of a loop.
1768    Output: A scheduling of the loop - kernel, prolog, and epilogue.
1769 
1770    'Q' is the empty Set
1771    'PS' is the partial schedule; it holds the currently scheduled nodes with
1772 	their cycle/slot.
1773    'PSP' previously scheduled predecessors.
1774    'PSS' previously scheduled successors.
1775    't(u)' the cycle where u is scheduled.
1776    'l(u)' is the latency of u.
1777    'd(v,u)' is the dependence distance from v to u.
1778    'ASAP(u)' the earliest time at which u could be scheduled as computed in
1779 	     the node ordering phase.
1780    'check_hardware_resources_conflicts(u, PS, c)'
1781 			     run a trace around cycle/slot through DFA model
1782 			     to check resource conflicts involving instruction u
1783 			     at cycle c given the partial schedule PS.
1784    'add_to_partial_schedule_at_time(u, PS, c)'
1785 			     Add the node/instruction u to the partial schedule
1786 			     PS at time c.
1787    'calculate_register_pressure(PS)'
1788 			     Given a schedule of instructions, calculate the register
1789 			     pressure it implies.  One implementation could be the
1790 			     maximum number of overlapping live ranges.
1791    'maxRP' The maximum allowed register pressure, it is usually derived from the number
1792 	   registers available in the hardware.
1793 
1794    1. II = MII.
1795    2. PS = empty list
1796    3. for each node u in O in pre-computed order
1797    4.   if (PSP(u) != Q && PSS(u) == Q) then
1798    5.     Early_start(u) = max ( t(v) + l(v) - d(v,u)*II ) over all every v in PSP(u).
1799    6.     start = Early_start; end = Early_start + II - 1; step = 1
1800    11.  else if (PSP(u) == Q && PSS(u) != Q) then
1801    12.      Late_start(u) = min ( t(v) - l(v) + d(v,u)*II ) over all every v in PSS(u).
1802    13.     start = Late_start; end = Late_start - II + 1; step = -1
1803    14.  else if (PSP(u) != Q && PSS(u) != Q) then
1804    15.     Early_start(u) = max ( t(v) + l(v) - d(v,u)*II ) over all every v in PSP(u).
1805    16.     Late_start(u) = min ( t(v) - l(v) + d(v,u)*II ) over all every v in PSS(u).
1806    17.     start = Early_start;
1807    18.     end = min(Early_start + II - 1 , Late_start);
1808    19.     step = 1
1809    20.     else "if (PSP(u) == Q && PSS(u) == Q)"
1810    21.	  start = ASAP(u); end = start + II - 1; step = 1
1811    22.  endif
1812 
1813    23.  success = false
1814    24.  for (c = start ; c != end ; c += step)
1815    25.     if check_hardware_resources_conflicts(u, PS, c) then
1816    26.       add_to_partial_schedule_at_time(u, PS, c)
1817    27.       success = true
1818    28.       break
1819    29.     endif
1820    30.  endfor
1821    31.  if (success == false) then
1822    32.    II = II + 1
1823    33.    if (II > maxII) then
1824    34.       finish - failed to schedule
1825    35.	 endif
1826    36.    goto 2.
1827    37.  endif
1828    38. endfor
1829    39. if (calculate_register_pressure(PS) > maxRP) then
1830    40.    goto 32.
1831    41. endif
1832    42. compute epilogue & prologue
1833    43. finish - succeeded to schedule
1834 
1835    ??? The algorithm restricts the scheduling window to II cycles.
1836    In rare cases, it may be better to allow windows of II+1 cycles.
1837    The window would then start and end on the same row, but with
1838    different "must precede" and "must follow" requirements.  */
1839 
1840 /* A limit on the number of cycles that resource conflicts can span.  ??? Should
1841    be provided by DFA, and be dependent on the type of insn scheduled.  Currently
1842    set to 0 to save compile time.  */
1843 #define DFA_HISTORY SMS_DFA_HISTORY
1844 
1845 /* A threshold for the number of repeated unsuccessful attempts to insert
1846    an empty row, before we flush the partial schedule and start over.  */
1847 #define MAX_SPLIT_NUM 10
1848 /* Given the partial schedule PS, this function calculates and returns the
1849    cycles in which we can schedule the node with the given index I.
1850    NOTE: Here we do the backtracking in SMS, in some special cases. We have
1851    noticed that there are several cases in which we fail    to SMS the loop
1852    because the sched window of a node is empty    due to tight data-deps. In
1853    such cases we want to unschedule    some of the predecessors/successors
1854    until we get non-empty    scheduling window.  It returns -1 if the
1855    scheduling window is empty and zero otherwise.  */
1856 
1857 static int
get_sched_window(partial_schedule_ptr ps,ddg_node_ptr u_node,sbitmap sched_nodes,int ii,int * start_p,int * step_p,int * end_p)1858 get_sched_window (partial_schedule_ptr ps, ddg_node_ptr u_node,
1859 		  sbitmap sched_nodes, int ii, int *start_p, int *step_p,
1860 		  int *end_p)
1861 {
1862   int start, step, end;
1863   int early_start, late_start;
1864   ddg_edge_ptr e;
1865   sbitmap psp = sbitmap_alloc (ps->g->num_nodes);
1866   sbitmap pss = sbitmap_alloc (ps->g->num_nodes);
1867   sbitmap u_node_preds = NODE_PREDECESSORS (u_node);
1868   sbitmap u_node_succs = NODE_SUCCESSORS (u_node);
1869   int psp_not_empty;
1870   int pss_not_empty;
1871   int count_preds;
1872   int count_succs;
1873 
1874   /* 1. compute sched window for u (start, end, step).  */
1875   bitmap_clear (psp);
1876   bitmap_clear (pss);
1877   psp_not_empty = bitmap_and (psp, u_node_preds, sched_nodes);
1878   pss_not_empty = bitmap_and (pss, u_node_succs, sched_nodes);
1879 
1880   /* We first compute a forward range (start <= end), then decide whether
1881      to reverse it.  */
1882   early_start = INT_MIN;
1883   late_start = INT_MAX;
1884   start = INT_MIN;
1885   end = INT_MAX;
1886   step = 1;
1887 
1888   count_preds = 0;
1889   count_succs = 0;
1890 
1891   if (dump_file && (psp_not_empty || pss_not_empty))
1892     {
1893       fprintf (dump_file, "\nAnalyzing dependencies for node %d (INSN %d)"
1894 	       "; ii = %d\n\n", u_node->cuid, INSN_UID (u_node->insn), ii);
1895       fprintf (dump_file, "%11s %11s %11s %11s %5s\n",
1896 	       "start", "early start", "late start", "end", "time");
1897       fprintf (dump_file, "=========== =========== =========== ==========="
1898 	       " =====\n");
1899     }
1900   /* Calculate early_start and limit end.  Both bounds are inclusive.  */
1901   if (psp_not_empty)
1902     for (e = u_node->in; e != 0; e = e->next_in)
1903       {
1904 	int v = e->src->cuid;
1905 
1906 	if (bitmap_bit_p (sched_nodes, v))
1907 	  {
1908 	    int p_st = SCHED_TIME (v);
1909 	    int earliest = p_st + e->latency - (e->distance * ii);
1910 	    int latest = (e->data_type == MEM_DEP ? p_st + ii - 1 : INT_MAX);
1911 
1912 	    if (dump_file)
1913 	      {
1914 		fprintf (dump_file, "%11s %11d %11s %11d %5d",
1915 			 "", earliest, "", latest, p_st);
1916 		print_ddg_edge (dump_file, e);
1917 		fprintf (dump_file, "\n");
1918 	      }
1919 
1920 	    early_start = MAX (early_start, earliest);
1921 	    end = MIN (end, latest);
1922 
1923 	    if (e->type == TRUE_DEP && e->data_type == REG_DEP)
1924 	      count_preds++;
1925 	  }
1926       }
1927 
1928   /* Calculate late_start and limit start.  Both bounds are inclusive.  */
1929   if (pss_not_empty)
1930     for (e = u_node->out; e != 0; e = e->next_out)
1931       {
1932 	int v = e->dest->cuid;
1933 
1934 	if (bitmap_bit_p (sched_nodes, v))
1935 	  {
1936 	    int s_st = SCHED_TIME (v);
1937 	    int earliest = (e->data_type == MEM_DEP ? s_st - ii + 1 : INT_MIN);
1938 	    int latest = s_st - e->latency + (e->distance * ii);
1939 
1940 	    if (dump_file)
1941 	      {
1942 		fprintf (dump_file, "%11d %11s %11d %11s %5d",
1943 			 earliest, "", latest, "", s_st);
1944 		print_ddg_edge (dump_file, e);
1945 		fprintf (dump_file, "\n");
1946 	      }
1947 
1948 	    start = MAX (start, earliest);
1949 	    late_start = MIN (late_start, latest);
1950 
1951 	    if (e->type == TRUE_DEP && e->data_type == REG_DEP)
1952 	      count_succs++;
1953 	  }
1954       }
1955 
1956   if (dump_file && (psp_not_empty || pss_not_empty))
1957     {
1958       fprintf (dump_file, "----------- ----------- ----------- -----------"
1959 	       " -----\n");
1960       fprintf (dump_file, "%11d %11d %11d %11d %5s %s\n",
1961 	       start, early_start, late_start, end, "",
1962 	       "(max, max, min, min)");
1963     }
1964 
1965   /* Get a target scheduling window no bigger than ii.  */
1966   if (early_start == INT_MIN && late_start == INT_MAX)
1967     early_start = NODE_ASAP (u_node);
1968   else if (early_start == INT_MIN)
1969     early_start = late_start - (ii - 1);
1970   late_start = MIN (late_start, early_start + (ii - 1));
1971 
1972   /* Apply memory dependence limits.  */
1973   start = MAX (start, early_start);
1974   end = MIN (end, late_start);
1975 
1976   if (dump_file && (psp_not_empty || pss_not_empty))
1977     fprintf (dump_file, "%11s %11d %11d %11s %5s final window\n",
1978 	     "", start, end, "", "");
1979 
1980   /* If there are at least as many successors as predecessors, schedule the
1981      node close to its successors.  */
1982   if (pss_not_empty && count_succs >= count_preds)
1983     {
1984       int tmp = end;
1985       end = start;
1986       start = tmp;
1987       step = -1;
1988     }
1989 
1990   /* Now that we've finalized the window, make END an exclusive rather
1991      than an inclusive bound.  */
1992   end += step;
1993 
1994   *start_p = start;
1995   *step_p = step;
1996   *end_p = end;
1997   sbitmap_free (psp);
1998   sbitmap_free (pss);
1999 
2000   if ((start >= end && step == 1) || (start <= end && step == -1))
2001     {
2002       if (dump_file)
2003 	fprintf (dump_file, "\nEmpty window: start=%d, end=%d, step=%d\n",
2004 		 start, end, step);
2005       return -1;
2006     }
2007 
2008   return 0;
2009 }
2010 
2011 /* Calculate MUST_PRECEDE/MUST_FOLLOW bitmaps of U_NODE; which is the
2012    node currently been scheduled.  At the end of the calculation
2013    MUST_PRECEDE/MUST_FOLLOW contains all predecessors/successors of
2014    U_NODE which are (1) already scheduled in the first/last row of
2015    U_NODE's scheduling window, (2) whose dependence inequality with U
2016    becomes an equality when U is scheduled in this same row, and (3)
2017    whose dependence latency is zero.
2018 
2019    The first and last rows are calculated using the following parameters:
2020    START/END rows - The cycles that begins/ends the traversal on the window;
2021    searching for an empty cycle to schedule U_NODE.
2022    STEP - The direction in which we traverse the window.
2023    II - The initiation interval.  */
2024 
2025 static void
calculate_must_precede_follow(ddg_node_ptr u_node,int start,int end,int step,int ii,sbitmap sched_nodes,sbitmap must_precede,sbitmap must_follow)2026 calculate_must_precede_follow (ddg_node_ptr u_node, int start, int end,
2027 			       int step, int ii, sbitmap sched_nodes,
2028 			       sbitmap must_precede, sbitmap must_follow)
2029 {
2030   ddg_edge_ptr e;
2031   int first_cycle_in_window, last_cycle_in_window;
2032 
2033   gcc_assert (must_precede && must_follow);
2034 
2035   /* Consider the following scheduling window:
2036      {first_cycle_in_window, first_cycle_in_window+1, ...,
2037      last_cycle_in_window}.  If step is 1 then the following will be
2038      the order we traverse the window: {start=first_cycle_in_window,
2039      first_cycle_in_window+1, ..., end=last_cycle_in_window+1},
2040      or {start=last_cycle_in_window, last_cycle_in_window-1, ...,
2041      end=first_cycle_in_window-1} if step is -1.  */
2042   first_cycle_in_window = (step == 1) ? start : end - step;
2043   last_cycle_in_window = (step == 1) ? end - step : start;
2044 
2045   bitmap_clear (must_precede);
2046   bitmap_clear (must_follow);
2047 
2048   if (dump_file)
2049     fprintf (dump_file, "\nmust_precede: ");
2050 
2051   /* Instead of checking if:
2052       (SMODULO (SCHED_TIME (e->src), ii) == first_row_in_window)
2053       && ((SCHED_TIME (e->src) + e->latency - (e->distance * ii)) ==
2054              first_cycle_in_window)
2055       && e->latency == 0
2056      we use the fact that latency is non-negative:
2057       SCHED_TIME (e->src) - (e->distance * ii) <=
2058       SCHED_TIME (e->src) + e->latency - (e->distance * ii)) <=
2059       first_cycle_in_window
2060      and check only if
2061       SCHED_TIME (e->src) - (e->distance * ii) == first_cycle_in_window  */
2062   for (e = u_node->in; e != 0; e = e->next_in)
2063     if (bitmap_bit_p (sched_nodes, e->src->cuid)
2064 	&& ((SCHED_TIME (e->src->cuid) - (e->distance * ii)) ==
2065              first_cycle_in_window))
2066       {
2067 	if (dump_file)
2068 	  fprintf (dump_file, "%d ", e->src->cuid);
2069 
2070 	bitmap_set_bit (must_precede, e->src->cuid);
2071       }
2072 
2073   if (dump_file)
2074     fprintf (dump_file, "\nmust_follow: ");
2075 
2076   /* Instead of checking if:
2077       (SMODULO (SCHED_TIME (e->dest), ii) == last_row_in_window)
2078       && ((SCHED_TIME (e->dest) - e->latency + (e->distance * ii)) ==
2079              last_cycle_in_window)
2080       && e->latency == 0
2081      we use the fact that latency is non-negative:
2082       SCHED_TIME (e->dest) + (e->distance * ii) >=
2083       SCHED_TIME (e->dest) - e->latency + (e->distance * ii)) >=
2084       last_cycle_in_window
2085      and check only if
2086       SCHED_TIME (e->dest) + (e->distance * ii) == last_cycle_in_window  */
2087   for (e = u_node->out; e != 0; e = e->next_out)
2088     if (bitmap_bit_p (sched_nodes, e->dest->cuid)
2089 	&& ((SCHED_TIME (e->dest->cuid) + (e->distance * ii)) ==
2090              last_cycle_in_window))
2091       {
2092 	if (dump_file)
2093 	  fprintf (dump_file, "%d ", e->dest->cuid);
2094 
2095 	bitmap_set_bit (must_follow, e->dest->cuid);
2096       }
2097 
2098   if (dump_file)
2099     fprintf (dump_file, "\n");
2100 }
2101 
2102 /* Return 1 if U_NODE can be scheduled in CYCLE.  Use the following
2103    parameters to decide if that's possible:
2104    PS - The partial schedule.
2105    U - The serial number of U_NODE.
2106    NUM_SPLITS - The number of row splits made so far.
2107    MUST_PRECEDE - The nodes that must precede U_NODE. (only valid at
2108    the first row of the scheduling window)
2109    MUST_FOLLOW - The nodes that must follow U_NODE. (only valid at the
2110    last row of the scheduling window)  */
2111 
2112 static bool
try_scheduling_node_in_cycle(partial_schedule_ptr ps,int u,int cycle,sbitmap sched_nodes,int * num_splits,sbitmap must_precede,sbitmap must_follow)2113 try_scheduling_node_in_cycle (partial_schedule_ptr ps,
2114 			      int u, int cycle, sbitmap sched_nodes,
2115 			      int *num_splits, sbitmap must_precede,
2116 			      sbitmap must_follow)
2117 {
2118   ps_insn_ptr psi;
2119   bool success = 0;
2120 
2121   verify_partial_schedule (ps, sched_nodes);
2122   psi = ps_add_node_check_conflicts (ps, u, cycle, must_precede, must_follow);
2123   if (psi)
2124     {
2125       SCHED_TIME (u) = cycle;
2126       bitmap_set_bit (sched_nodes, u);
2127       success = 1;
2128       *num_splits = 0;
2129       if (dump_file)
2130 	fprintf (dump_file, "Scheduled w/o split in %d\n", cycle);
2131 
2132     }
2133 
2134   return success;
2135 }
2136 
2137 /* This function implements the scheduling algorithm for SMS according to the
2138    above algorithm.  */
2139 static partial_schedule_ptr
sms_schedule_by_order(ddg_ptr g,int mii,int maxii,int * nodes_order)2140 sms_schedule_by_order (ddg_ptr g, int mii, int maxii, int *nodes_order)
2141 {
2142   int ii = mii;
2143   int i, c, success, num_splits = 0;
2144   int flush_and_start_over = true;
2145   int num_nodes = g->num_nodes;
2146   int start, end, step; /* Place together into one struct?  */
2147   sbitmap sched_nodes = sbitmap_alloc (num_nodes);
2148   sbitmap must_precede = sbitmap_alloc (num_nodes);
2149   sbitmap must_follow = sbitmap_alloc (num_nodes);
2150   sbitmap tobe_scheduled = sbitmap_alloc (num_nodes);
2151 
2152   partial_schedule_ptr ps = create_partial_schedule (ii, g, DFA_HISTORY);
2153 
2154   bitmap_ones (tobe_scheduled);
2155   bitmap_clear (sched_nodes);
2156 
2157   while (flush_and_start_over && (ii < maxii))
2158     {
2159 
2160       if (dump_file)
2161 	fprintf (dump_file, "Starting with ii=%d\n", ii);
2162       flush_and_start_over = false;
2163       bitmap_clear (sched_nodes);
2164 
2165       for (i = 0; i < num_nodes; i++)
2166 	{
2167 	  int u = nodes_order[i];
2168   	  ddg_node_ptr u_node = &ps->g->nodes[u];
2169 	  rtx insn = u_node->insn;
2170 
2171 	  if (!NONDEBUG_INSN_P (insn))
2172 	    {
2173 	      bitmap_clear_bit (tobe_scheduled, u);
2174 	      continue;
2175 	    }
2176 
2177 	  if (bitmap_bit_p (sched_nodes, u))
2178 	    continue;
2179 
2180 	  /* Try to get non-empty scheduling window.  */
2181 	 success = 0;
2182          if (get_sched_window (ps, u_node, sched_nodes, ii, &start,
2183                                 &step, &end) == 0)
2184             {
2185               if (dump_file)
2186                 fprintf (dump_file, "\nTrying to schedule node %d "
2187 			 "INSN = %d  in (%d .. %d) step %d\n", u, (INSN_UID
2188                         (g->nodes[u].insn)), start, end, step);
2189 
2190               gcc_assert ((step > 0 && start < end)
2191                           || (step < 0 && start > end));
2192 
2193               calculate_must_precede_follow (u_node, start, end, step, ii,
2194                                              sched_nodes, must_precede,
2195                                              must_follow);
2196 
2197               for (c = start; c != end; c += step)
2198                 {
2199 		  sbitmap tmp_precede, tmp_follow;
2200 
2201                   set_must_precede_follow (&tmp_follow, must_follow,
2202 		                           &tmp_precede, must_precede,
2203                                            c, start, end, step);
2204                   success =
2205                     try_scheduling_node_in_cycle (ps, u, c,
2206                                                   sched_nodes,
2207                                                   &num_splits, tmp_precede,
2208                                                   tmp_follow);
2209                   if (success)
2210                     break;
2211                 }
2212 
2213               verify_partial_schedule (ps, sched_nodes);
2214             }
2215             if (!success)
2216             {
2217               int split_row;
2218 
2219               if (ii++ == maxii)
2220                 break;
2221 
2222               if (num_splits >= MAX_SPLIT_NUM)
2223                 {
2224                   num_splits = 0;
2225                   flush_and_start_over = true;
2226                   verify_partial_schedule (ps, sched_nodes);
2227                   reset_partial_schedule (ps, ii);
2228                   verify_partial_schedule (ps, sched_nodes);
2229                   break;
2230                 }
2231 
2232               num_splits++;
2233               /* The scheduling window is exclusive of 'end'
2234                  whereas compute_split_window() expects an inclusive,
2235                  ordered range.  */
2236               if (step == 1)
2237                 split_row = compute_split_row (sched_nodes, start, end - 1,
2238                                                ps->ii, u_node);
2239               else
2240                 split_row = compute_split_row (sched_nodes, end + 1, start,
2241                                                ps->ii, u_node);
2242 
2243               ps_insert_empty_row (ps, split_row, sched_nodes);
2244               i--;              /* Go back and retry node i.  */
2245 
2246               if (dump_file)
2247                 fprintf (dump_file, "num_splits=%d\n", num_splits);
2248             }
2249 
2250           /* ??? If (success), check register pressure estimates.  */
2251         }                       /* Continue with next node.  */
2252     }                           /* While flush_and_start_over.  */
2253   if (ii >= maxii)
2254     {
2255       free_partial_schedule (ps);
2256       ps = NULL;
2257     }
2258   else
2259     gcc_assert (bitmap_equal_p (tobe_scheduled, sched_nodes));
2260 
2261   sbitmap_free (sched_nodes);
2262   sbitmap_free (must_precede);
2263   sbitmap_free (must_follow);
2264   sbitmap_free (tobe_scheduled);
2265 
2266   return ps;
2267 }
2268 
2269 /* This function inserts a new empty row into PS at the position
2270    according to SPLITROW, keeping all already scheduled instructions
2271    intact and updating their SCHED_TIME and cycle accordingly.  */
2272 static void
ps_insert_empty_row(partial_schedule_ptr ps,int split_row,sbitmap sched_nodes)2273 ps_insert_empty_row (partial_schedule_ptr ps, int split_row,
2274 		     sbitmap sched_nodes)
2275 {
2276   ps_insn_ptr crr_insn;
2277   ps_insn_ptr *rows_new;
2278   int ii = ps->ii;
2279   int new_ii = ii + 1;
2280   int row;
2281   int *rows_length_new;
2282 
2283   verify_partial_schedule (ps, sched_nodes);
2284 
2285   /* We normalize sched_time and rotate ps to have only non-negative sched
2286      times, for simplicity of updating cycles after inserting new row.  */
2287   split_row -= ps->min_cycle;
2288   split_row = SMODULO (split_row, ii);
2289   if (dump_file)
2290     fprintf (dump_file, "split_row=%d\n", split_row);
2291 
2292   reset_sched_times (ps, PS_MIN_CYCLE (ps));
2293   rotate_partial_schedule (ps, PS_MIN_CYCLE (ps));
2294 
2295   rows_new = (ps_insn_ptr *) xcalloc (new_ii, sizeof (ps_insn_ptr));
2296   rows_length_new = (int *) xcalloc (new_ii, sizeof (int));
2297   for (row = 0; row < split_row; row++)
2298     {
2299       rows_new[row] = ps->rows[row];
2300       rows_length_new[row] = ps->rows_length[row];
2301       ps->rows[row] = NULL;
2302       for (crr_insn = rows_new[row];
2303 	   crr_insn; crr_insn = crr_insn->next_in_row)
2304 	{
2305 	  int u = crr_insn->id;
2306 	  int new_time = SCHED_TIME (u) + (SCHED_TIME (u) / ii);
2307 
2308 	  SCHED_TIME (u) = new_time;
2309 	  crr_insn->cycle = new_time;
2310 	  SCHED_ROW (u) = new_time % new_ii;
2311 	  SCHED_STAGE (u) = new_time / new_ii;
2312 	}
2313 
2314     }
2315 
2316   rows_new[split_row] = NULL;
2317 
2318   for (row = split_row; row < ii; row++)
2319     {
2320       rows_new[row + 1] = ps->rows[row];
2321       rows_length_new[row + 1] = ps->rows_length[row];
2322       ps->rows[row] = NULL;
2323       for (crr_insn = rows_new[row + 1];
2324 	   crr_insn; crr_insn = crr_insn->next_in_row)
2325 	{
2326 	  int u = crr_insn->id;
2327 	  int new_time = SCHED_TIME (u) + (SCHED_TIME (u) / ii) + 1;
2328 
2329 	  SCHED_TIME (u) = new_time;
2330 	  crr_insn->cycle = new_time;
2331 	  SCHED_ROW (u) = new_time % new_ii;
2332 	  SCHED_STAGE (u) = new_time / new_ii;
2333 	}
2334     }
2335 
2336   /* Updating ps.  */
2337   ps->min_cycle = ps->min_cycle + ps->min_cycle / ii
2338     + (SMODULO (ps->min_cycle, ii) >= split_row ? 1 : 0);
2339   ps->max_cycle = ps->max_cycle + ps->max_cycle / ii
2340     + (SMODULO (ps->max_cycle, ii) >= split_row ? 1 : 0);
2341   free (ps->rows);
2342   ps->rows = rows_new;
2343   free (ps->rows_length);
2344   ps->rows_length = rows_length_new;
2345   ps->ii = new_ii;
2346   gcc_assert (ps->min_cycle >= 0);
2347 
2348   verify_partial_schedule (ps, sched_nodes);
2349 
2350   if (dump_file)
2351     fprintf (dump_file, "min_cycle=%d, max_cycle=%d\n", ps->min_cycle,
2352 	     ps->max_cycle);
2353 }
2354 
2355 /* Given U_NODE which is the node that failed to be scheduled; LOW and
2356    UP which are the boundaries of it's scheduling window; compute using
2357    SCHED_NODES and II a row in the partial schedule that can be split
2358    which will separate a critical predecessor from a critical successor
2359    thereby expanding the window, and return it.  */
2360 static int
compute_split_row(sbitmap sched_nodes,int low,int up,int ii,ddg_node_ptr u_node)2361 compute_split_row (sbitmap sched_nodes, int low, int up, int ii,
2362 		   ddg_node_ptr u_node)
2363 {
2364   ddg_edge_ptr e;
2365   int lower = INT_MIN, upper = INT_MAX;
2366   int crit_pred = -1;
2367   int crit_succ = -1;
2368   int crit_cycle;
2369 
2370   for (e = u_node->in; e != 0; e = e->next_in)
2371     {
2372       int v = e->src->cuid;
2373 
2374       if (bitmap_bit_p (sched_nodes, v)
2375 	  && (low == SCHED_TIME (v) + e->latency - (e->distance * ii)))
2376 	if (SCHED_TIME (v) > lower)
2377 	  {
2378 	    crit_pred = v;
2379 	    lower = SCHED_TIME (v);
2380 	  }
2381     }
2382 
2383   if (crit_pred >= 0)
2384     {
2385       crit_cycle = SCHED_TIME (crit_pred) + 1;
2386       return SMODULO (crit_cycle, ii);
2387     }
2388 
2389   for (e = u_node->out; e != 0; e = e->next_out)
2390     {
2391       int v = e->dest->cuid;
2392 
2393       if (bitmap_bit_p (sched_nodes, v)
2394 	  && (up == SCHED_TIME (v) - e->latency + (e->distance * ii)))
2395 	if (SCHED_TIME (v) < upper)
2396 	  {
2397 	    crit_succ = v;
2398 	    upper = SCHED_TIME (v);
2399 	  }
2400     }
2401 
2402   if (crit_succ >= 0)
2403     {
2404       crit_cycle = SCHED_TIME (crit_succ);
2405       return SMODULO (crit_cycle, ii);
2406     }
2407 
2408   if (dump_file)
2409     fprintf (dump_file, "Both crit_pred and crit_succ are NULL\n");
2410 
2411   return SMODULO ((low + up + 1) / 2, ii);
2412 }
2413 
2414 static void
verify_partial_schedule(partial_schedule_ptr ps,sbitmap sched_nodes)2415 verify_partial_schedule (partial_schedule_ptr ps, sbitmap sched_nodes)
2416 {
2417   int row;
2418   ps_insn_ptr crr_insn;
2419 
2420   for (row = 0; row < ps->ii; row++)
2421     {
2422       int length = 0;
2423 
2424       for (crr_insn = ps->rows[row]; crr_insn; crr_insn = crr_insn->next_in_row)
2425 	{
2426 	  int u = crr_insn->id;
2427 
2428 	  length++;
2429 	  gcc_assert (bitmap_bit_p (sched_nodes, u));
2430 	  /* ??? Test also that all nodes of sched_nodes are in ps, perhaps by
2431 	     popcount (sched_nodes) == number of insns in ps.  */
2432 	  gcc_assert (SCHED_TIME (u) >= ps->min_cycle);
2433 	  gcc_assert (SCHED_TIME (u) <= ps->max_cycle);
2434 	}
2435 
2436       gcc_assert (ps->rows_length[row] == length);
2437     }
2438 }
2439 
2440 
2441 /* This page implements the algorithm for ordering the nodes of a DDG
2442    for modulo scheduling, activated through the
2443    "int sms_order_nodes (ddg_ptr, int mii, int * result)" API.  */
2444 
2445 #define ORDER_PARAMS(x) ((struct node_order_params *) (x)->aux.info)
2446 #define ASAP(x) (ORDER_PARAMS ((x))->asap)
2447 #define ALAP(x) (ORDER_PARAMS ((x))->alap)
2448 #define HEIGHT(x) (ORDER_PARAMS ((x))->height)
2449 #define MOB(x) (ALAP ((x)) - ASAP ((x)))
2450 #define DEPTH(x) (ASAP ((x)))
2451 
2452 typedef struct node_order_params * nopa;
2453 
2454 static void order_nodes_of_sccs (ddg_all_sccs_ptr, int * result);
2455 static int order_nodes_in_scc (ddg_ptr, sbitmap, sbitmap, int*, int);
2456 static nopa  calculate_order_params (ddg_ptr, int, int *);
2457 static int find_max_asap (ddg_ptr, sbitmap);
2458 static int find_max_hv_min_mob (ddg_ptr, sbitmap);
2459 static int find_max_dv_min_mob (ddg_ptr, sbitmap);
2460 
2461 enum sms_direction {BOTTOMUP, TOPDOWN};
2462 
2463 struct node_order_params
2464 {
2465   int asap;
2466   int alap;
2467   int height;
2468 };
2469 
2470 /* Check if NODE_ORDER contains a permutation of 0 .. NUM_NODES-1.  */
2471 static void
check_nodes_order(int * node_order,int num_nodes)2472 check_nodes_order (int *node_order, int num_nodes)
2473 {
2474   int i;
2475   sbitmap tmp = sbitmap_alloc (num_nodes);
2476 
2477   bitmap_clear (tmp);
2478 
2479   if (dump_file)
2480     fprintf (dump_file, "SMS final nodes order: \n");
2481 
2482   for (i = 0; i < num_nodes; i++)
2483     {
2484       int u = node_order[i];
2485 
2486       if (dump_file)
2487         fprintf (dump_file, "%d ", u);
2488       gcc_assert (u < num_nodes && u >= 0 && !bitmap_bit_p (tmp, u));
2489 
2490       bitmap_set_bit (tmp, u);
2491     }
2492 
2493   if (dump_file)
2494     fprintf (dump_file, "\n");
2495 
2496   sbitmap_free (tmp);
2497 }
2498 
2499 /* Order the nodes of G for scheduling and pass the result in
2500    NODE_ORDER.  Also set aux.count of each node to ASAP.
2501    Put maximal ASAP to PMAX_ASAP.  Return the recMII for the given DDG.  */
2502 static int
sms_order_nodes(ddg_ptr g,int mii,int * node_order,int * pmax_asap)2503 sms_order_nodes (ddg_ptr g, int mii, int * node_order, int *pmax_asap)
2504 {
2505   int i;
2506   int rec_mii = 0;
2507   ddg_all_sccs_ptr sccs = create_ddg_all_sccs (g);
2508 
2509   nopa nops = calculate_order_params (g, mii, pmax_asap);
2510 
2511   if (dump_file)
2512     print_sccs (dump_file, sccs, g);
2513 
2514   order_nodes_of_sccs (sccs, node_order);
2515 
2516   if (sccs->num_sccs > 0)
2517     /* First SCC has the largest recurrence_length.  */
2518     rec_mii = sccs->sccs[0]->recurrence_length;
2519 
2520   /* Save ASAP before destroying node_order_params.  */
2521   for (i = 0; i < g->num_nodes; i++)
2522     {
2523       ddg_node_ptr v = &g->nodes[i];
2524       v->aux.count = ASAP (v);
2525     }
2526 
2527   free (nops);
2528   free_ddg_all_sccs (sccs);
2529   check_nodes_order (node_order, g->num_nodes);
2530 
2531   return rec_mii;
2532 }
2533 
2534 static void
order_nodes_of_sccs(ddg_all_sccs_ptr all_sccs,int * node_order)2535 order_nodes_of_sccs (ddg_all_sccs_ptr all_sccs, int * node_order)
2536 {
2537   int i, pos = 0;
2538   ddg_ptr g = all_sccs->ddg;
2539   int num_nodes = g->num_nodes;
2540   sbitmap prev_sccs = sbitmap_alloc (num_nodes);
2541   sbitmap on_path = sbitmap_alloc (num_nodes);
2542   sbitmap tmp = sbitmap_alloc (num_nodes);
2543   sbitmap ones = sbitmap_alloc (num_nodes);
2544 
2545   bitmap_clear (prev_sccs);
2546   bitmap_ones (ones);
2547 
2548   /* Perform the node ordering starting from the SCC with the highest recMII.
2549      For each SCC order the nodes according to their ASAP/ALAP/HEIGHT etc.  */
2550   for (i = 0; i < all_sccs->num_sccs; i++)
2551     {
2552       ddg_scc_ptr scc = all_sccs->sccs[i];
2553 
2554       /* Add nodes on paths from previous SCCs to the current SCC.  */
2555       find_nodes_on_paths (on_path, g, prev_sccs, scc->nodes);
2556       bitmap_ior (tmp, scc->nodes, on_path);
2557 
2558       /* Add nodes on paths from the current SCC to previous SCCs.  */
2559       find_nodes_on_paths (on_path, g, scc->nodes, prev_sccs);
2560       bitmap_ior (tmp, tmp, on_path);
2561 
2562       /* Remove nodes of previous SCCs from current extended SCC.  */
2563       bitmap_and_compl (tmp, tmp, prev_sccs);
2564 
2565       pos = order_nodes_in_scc (g, prev_sccs, tmp, node_order, pos);
2566       /* Above call to order_nodes_in_scc updated prev_sccs |= tmp.  */
2567     }
2568 
2569   /* Handle the remaining nodes that do not belong to any scc.  Each call
2570      to order_nodes_in_scc handles a single connected component.  */
2571   while (pos < g->num_nodes)
2572     {
2573       bitmap_and_compl (tmp, ones, prev_sccs);
2574       pos = order_nodes_in_scc (g, prev_sccs, tmp, node_order, pos);
2575     }
2576   sbitmap_free (prev_sccs);
2577   sbitmap_free (on_path);
2578   sbitmap_free (tmp);
2579   sbitmap_free (ones);
2580 }
2581 
2582 /* MII is needed if we consider backarcs (that do not close recursive cycles).  */
2583 static struct node_order_params *
calculate_order_params(ddg_ptr g,int mii ATTRIBUTE_UNUSED,int * pmax_asap)2584 calculate_order_params (ddg_ptr g, int mii ATTRIBUTE_UNUSED, int *pmax_asap)
2585 {
2586   int u;
2587   int max_asap;
2588   int num_nodes = g->num_nodes;
2589   ddg_edge_ptr e;
2590   /* Allocate a place to hold ordering params for each node in the DDG.  */
2591   nopa node_order_params_arr;
2592 
2593   /* Initialize of ASAP/ALAP/HEIGHT to zero.  */
2594   node_order_params_arr = (nopa) xcalloc (num_nodes,
2595 					  sizeof (struct node_order_params));
2596 
2597   /* Set the aux pointer of each node to point to its order_params structure.  */
2598   for (u = 0; u < num_nodes; u++)
2599     g->nodes[u].aux.info = &node_order_params_arr[u];
2600 
2601   /* Disregarding a backarc from each recursive cycle to obtain a DAG,
2602      calculate ASAP, ALAP, mobility, distance, and height for each node
2603      in the dependence (direct acyclic) graph.  */
2604 
2605   /* We assume that the nodes in the array are in topological order.  */
2606 
2607   max_asap = 0;
2608   for (u = 0; u < num_nodes; u++)
2609     {
2610       ddg_node_ptr u_node = &g->nodes[u];
2611 
2612       ASAP (u_node) = 0;
2613       for (e = u_node->in; e; e = e->next_in)
2614 	if (e->distance == 0)
2615 	  ASAP (u_node) = MAX (ASAP (u_node),
2616 			       ASAP (e->src) + e->latency);
2617       max_asap = MAX (max_asap, ASAP (u_node));
2618     }
2619 
2620   for (u = num_nodes - 1; u > -1; u--)
2621     {
2622       ddg_node_ptr u_node = &g->nodes[u];
2623 
2624       ALAP (u_node) = max_asap;
2625       HEIGHT (u_node) = 0;
2626       for (e = u_node->out; e; e = e->next_out)
2627 	if (e->distance == 0)
2628 	  {
2629 	    ALAP (u_node) = MIN (ALAP (u_node),
2630 				 ALAP (e->dest) - e->latency);
2631 	    HEIGHT (u_node) = MAX (HEIGHT (u_node),
2632 				   HEIGHT (e->dest) + e->latency);
2633 	  }
2634     }
2635   if (dump_file)
2636   {
2637     fprintf (dump_file, "\nOrder params\n");
2638     for (u = 0; u < num_nodes; u++)
2639       {
2640         ddg_node_ptr u_node = &g->nodes[u];
2641 
2642         fprintf (dump_file, "node %d, ASAP: %d, ALAP: %d, HEIGHT: %d\n", u,
2643                  ASAP (u_node), ALAP (u_node), HEIGHT (u_node));
2644       }
2645   }
2646 
2647   *pmax_asap = max_asap;
2648   return node_order_params_arr;
2649 }
2650 
2651 static int
find_max_asap(ddg_ptr g,sbitmap nodes)2652 find_max_asap (ddg_ptr g, sbitmap nodes)
2653 {
2654   unsigned int u = 0;
2655   int max_asap = -1;
2656   int result = -1;
2657   sbitmap_iterator sbi;
2658 
2659   EXECUTE_IF_SET_IN_BITMAP (nodes, 0, u, sbi)
2660     {
2661       ddg_node_ptr u_node = &g->nodes[u];
2662 
2663       if (max_asap < ASAP (u_node))
2664 	{
2665 	  max_asap = ASAP (u_node);
2666 	  result = u;
2667 	}
2668     }
2669   return result;
2670 }
2671 
2672 static int
find_max_hv_min_mob(ddg_ptr g,sbitmap nodes)2673 find_max_hv_min_mob (ddg_ptr g, sbitmap nodes)
2674 {
2675   unsigned int u = 0;
2676   int max_hv = -1;
2677   int min_mob = INT_MAX;
2678   int result = -1;
2679   sbitmap_iterator sbi;
2680 
2681   EXECUTE_IF_SET_IN_BITMAP (nodes, 0, u, sbi)
2682     {
2683       ddg_node_ptr u_node = &g->nodes[u];
2684 
2685       if (max_hv < HEIGHT (u_node))
2686 	{
2687 	  max_hv = HEIGHT (u_node);
2688 	  min_mob = MOB (u_node);
2689 	  result = u;
2690 	}
2691       else if ((max_hv == HEIGHT (u_node))
2692 	       && (min_mob > MOB (u_node)))
2693 	{
2694 	  min_mob = MOB (u_node);
2695 	  result = u;
2696 	}
2697     }
2698   return result;
2699 }
2700 
2701 static int
find_max_dv_min_mob(ddg_ptr g,sbitmap nodes)2702 find_max_dv_min_mob (ddg_ptr g, sbitmap nodes)
2703 {
2704   unsigned int u = 0;
2705   int max_dv = -1;
2706   int min_mob = INT_MAX;
2707   int result = -1;
2708   sbitmap_iterator sbi;
2709 
2710   EXECUTE_IF_SET_IN_BITMAP (nodes, 0, u, sbi)
2711     {
2712       ddg_node_ptr u_node = &g->nodes[u];
2713 
2714       if (max_dv < DEPTH (u_node))
2715 	{
2716 	  max_dv = DEPTH (u_node);
2717 	  min_mob = MOB (u_node);
2718 	  result = u;
2719 	}
2720       else if ((max_dv == DEPTH (u_node))
2721 	       && (min_mob > MOB (u_node)))
2722 	{
2723 	  min_mob = MOB (u_node);
2724 	  result = u;
2725 	}
2726     }
2727   return result;
2728 }
2729 
2730 /* Places the nodes of SCC into the NODE_ORDER array starting
2731    at position POS, according to the SMS ordering algorithm.
2732    NODES_ORDERED (in&out parameter) holds the bitset of all nodes in
2733    the NODE_ORDER array, starting from position zero.  */
2734 static int
order_nodes_in_scc(ddg_ptr g,sbitmap nodes_ordered,sbitmap scc,int * node_order,int pos)2735 order_nodes_in_scc (ddg_ptr g, sbitmap nodes_ordered, sbitmap scc,
2736 		    int * node_order, int pos)
2737 {
2738   enum sms_direction dir;
2739   int num_nodes = g->num_nodes;
2740   sbitmap workset = sbitmap_alloc (num_nodes);
2741   sbitmap tmp = sbitmap_alloc (num_nodes);
2742   sbitmap zero_bitmap = sbitmap_alloc (num_nodes);
2743   sbitmap predecessors = sbitmap_alloc (num_nodes);
2744   sbitmap successors = sbitmap_alloc (num_nodes);
2745 
2746   bitmap_clear (predecessors);
2747   find_predecessors (predecessors, g, nodes_ordered);
2748 
2749   bitmap_clear (successors);
2750   find_successors (successors, g, nodes_ordered);
2751 
2752   bitmap_clear (tmp);
2753   if (bitmap_and (tmp, predecessors, scc))
2754     {
2755       bitmap_copy (workset, tmp);
2756       dir = BOTTOMUP;
2757     }
2758   else if (bitmap_and (tmp, successors, scc))
2759     {
2760       bitmap_copy (workset, tmp);
2761       dir = TOPDOWN;
2762     }
2763   else
2764     {
2765       int u;
2766 
2767       bitmap_clear (workset);
2768       if ((u = find_max_asap (g, scc)) >= 0)
2769 	bitmap_set_bit (workset, u);
2770       dir = BOTTOMUP;
2771     }
2772 
2773   bitmap_clear (zero_bitmap);
2774   while (!bitmap_equal_p (workset, zero_bitmap))
2775     {
2776       int v;
2777       ddg_node_ptr v_node;
2778       sbitmap v_node_preds;
2779       sbitmap v_node_succs;
2780 
2781       if (dir == TOPDOWN)
2782 	{
2783 	  while (!bitmap_equal_p (workset, zero_bitmap))
2784 	    {
2785 	      v = find_max_hv_min_mob (g, workset);
2786 	      v_node = &g->nodes[v];
2787 	      node_order[pos++] = v;
2788 	      v_node_succs = NODE_SUCCESSORS (v_node);
2789 	      bitmap_and (tmp, v_node_succs, scc);
2790 
2791 	      /* Don't consider the already ordered successors again.  */
2792 	      bitmap_and_compl (tmp, tmp, nodes_ordered);
2793 	      bitmap_ior (workset, workset, tmp);
2794 	      bitmap_clear_bit (workset, v);
2795 	      bitmap_set_bit (nodes_ordered, v);
2796 	    }
2797 	  dir = BOTTOMUP;
2798 	  bitmap_clear (predecessors);
2799 	  find_predecessors (predecessors, g, nodes_ordered);
2800 	  bitmap_and (workset, predecessors, scc);
2801 	}
2802       else
2803 	{
2804 	  while (!bitmap_equal_p (workset, zero_bitmap))
2805 	    {
2806 	      v = find_max_dv_min_mob (g, workset);
2807 	      v_node = &g->nodes[v];
2808 	      node_order[pos++] = v;
2809 	      v_node_preds = NODE_PREDECESSORS (v_node);
2810 	      bitmap_and (tmp, v_node_preds, scc);
2811 
2812 	      /* Don't consider the already ordered predecessors again.  */
2813 	      bitmap_and_compl (tmp, tmp, nodes_ordered);
2814 	      bitmap_ior (workset, workset, tmp);
2815 	      bitmap_clear_bit (workset, v);
2816 	      bitmap_set_bit (nodes_ordered, v);
2817 	    }
2818 	  dir = TOPDOWN;
2819 	  bitmap_clear (successors);
2820 	  find_successors (successors, g, nodes_ordered);
2821 	  bitmap_and (workset, successors, scc);
2822 	}
2823     }
2824   sbitmap_free (tmp);
2825   sbitmap_free (workset);
2826   sbitmap_free (zero_bitmap);
2827   sbitmap_free (predecessors);
2828   sbitmap_free (successors);
2829   return pos;
2830 }
2831 
2832 
2833 /* This page contains functions for manipulating partial-schedules during
2834    modulo scheduling.  */
2835 
2836 /* Create a partial schedule and allocate a memory to hold II rows.  */
2837 
2838 static partial_schedule_ptr
create_partial_schedule(int ii,ddg_ptr g,int history)2839 create_partial_schedule (int ii, ddg_ptr g, int history)
2840 {
2841   partial_schedule_ptr ps = XNEW (struct partial_schedule);
2842   ps->rows = (ps_insn_ptr *) xcalloc (ii, sizeof (ps_insn_ptr));
2843   ps->rows_length = (int *) xcalloc (ii, sizeof (int));
2844   ps->reg_moves.create (0);
2845   ps->ii = ii;
2846   ps->history = history;
2847   ps->min_cycle = INT_MAX;
2848   ps->max_cycle = INT_MIN;
2849   ps->g = g;
2850 
2851   return ps;
2852 }
2853 
2854 /* Free the PS_INSNs in rows array of the given partial schedule.
2855    ??? Consider caching the PS_INSN's.  */
2856 static void
free_ps_insns(partial_schedule_ptr ps)2857 free_ps_insns (partial_schedule_ptr ps)
2858 {
2859   int i;
2860 
2861   for (i = 0; i < ps->ii; i++)
2862     {
2863       while (ps->rows[i])
2864 	{
2865 	  ps_insn_ptr ps_insn = ps->rows[i]->next_in_row;
2866 
2867 	  free (ps->rows[i]);
2868 	  ps->rows[i] = ps_insn;
2869 	}
2870       ps->rows[i] = NULL;
2871     }
2872 }
2873 
2874 /* Free all the memory allocated to the partial schedule.  */
2875 
2876 static void
free_partial_schedule(partial_schedule_ptr ps)2877 free_partial_schedule (partial_schedule_ptr ps)
2878 {
2879   ps_reg_move_info *move;
2880   unsigned int i;
2881 
2882   if (!ps)
2883     return;
2884 
2885   FOR_EACH_VEC_ELT (ps->reg_moves, i, move)
2886     sbitmap_free (move->uses);
2887   ps->reg_moves.release ();
2888 
2889   free_ps_insns (ps);
2890   free (ps->rows);
2891   free (ps->rows_length);
2892   free (ps);
2893 }
2894 
2895 /* Clear the rows array with its PS_INSNs, and create a new one with
2896    NEW_II rows.  */
2897 
2898 static void
reset_partial_schedule(partial_schedule_ptr ps,int new_ii)2899 reset_partial_schedule (partial_schedule_ptr ps, int new_ii)
2900 {
2901   if (!ps)
2902     return;
2903   free_ps_insns (ps);
2904   if (new_ii == ps->ii)
2905     return;
2906   ps->rows = (ps_insn_ptr *) xrealloc (ps->rows, new_ii
2907 						 * sizeof (ps_insn_ptr));
2908   memset (ps->rows, 0, new_ii * sizeof (ps_insn_ptr));
2909   ps->rows_length = (int *) xrealloc (ps->rows_length, new_ii * sizeof (int));
2910   memset (ps->rows_length, 0, new_ii * sizeof (int));
2911   ps->ii = new_ii;
2912   ps->min_cycle = INT_MAX;
2913   ps->max_cycle = INT_MIN;
2914 }
2915 
2916 /* Prints the partial schedule as an ii rows array, for each rows
2917    print the ids of the insns in it.  */
2918 void
print_partial_schedule(partial_schedule_ptr ps,FILE * dump)2919 print_partial_schedule (partial_schedule_ptr ps, FILE *dump)
2920 {
2921   int i;
2922 
2923   for (i = 0; i < ps->ii; i++)
2924     {
2925       ps_insn_ptr ps_i = ps->rows[i];
2926 
2927       fprintf (dump, "\n[ROW %d ]: ", i);
2928       while (ps_i)
2929 	{
2930 	  rtx insn = ps_rtl_insn (ps, ps_i->id);
2931 
2932 	  if (JUMP_P (insn))
2933 	    fprintf (dump, "%d (branch), ", INSN_UID (insn));
2934 	  else
2935 	    fprintf (dump, "%d, ", INSN_UID (insn));
2936 
2937 	  ps_i = ps_i->next_in_row;
2938 	}
2939     }
2940 }
2941 
2942 /* Creates an object of PS_INSN and initializes it to the given parameters.  */
2943 static ps_insn_ptr
create_ps_insn(int id,int cycle)2944 create_ps_insn (int id, int cycle)
2945 {
2946   ps_insn_ptr ps_i = XNEW (struct ps_insn);
2947 
2948   ps_i->id = id;
2949   ps_i->next_in_row = NULL;
2950   ps_i->prev_in_row = NULL;
2951   ps_i->cycle = cycle;
2952 
2953   return ps_i;
2954 }
2955 
2956 
2957 /* Removes the given PS_INSN from the partial schedule.  */
2958 static void
remove_node_from_ps(partial_schedule_ptr ps,ps_insn_ptr ps_i)2959 remove_node_from_ps (partial_schedule_ptr ps, ps_insn_ptr ps_i)
2960 {
2961   int row;
2962 
2963   gcc_assert (ps && ps_i);
2964 
2965   row = SMODULO (ps_i->cycle, ps->ii);
2966   if (! ps_i->prev_in_row)
2967     {
2968       gcc_assert (ps_i == ps->rows[row]);
2969       ps->rows[row] = ps_i->next_in_row;
2970       if (ps->rows[row])
2971 	ps->rows[row]->prev_in_row = NULL;
2972     }
2973   else
2974     {
2975       ps_i->prev_in_row->next_in_row = ps_i->next_in_row;
2976       if (ps_i->next_in_row)
2977 	ps_i->next_in_row->prev_in_row = ps_i->prev_in_row;
2978     }
2979 
2980   ps->rows_length[row] -= 1;
2981   free (ps_i);
2982   return;
2983 }
2984 
2985 /* Unlike what literature describes for modulo scheduling (which focuses
2986    on VLIW machines) the order of the instructions inside a cycle is
2987    important.  Given the bitmaps MUST_FOLLOW and MUST_PRECEDE we know
2988    where the current instruction should go relative to the already
2989    scheduled instructions in the given cycle.  Go over these
2990    instructions and find the first possible column to put it in.  */
2991 static bool
ps_insn_find_column(partial_schedule_ptr ps,ps_insn_ptr ps_i,sbitmap must_precede,sbitmap must_follow)2992 ps_insn_find_column (partial_schedule_ptr ps, ps_insn_ptr ps_i,
2993 		     sbitmap must_precede, sbitmap must_follow)
2994 {
2995   ps_insn_ptr next_ps_i;
2996   ps_insn_ptr first_must_follow = NULL;
2997   ps_insn_ptr last_must_precede = NULL;
2998   ps_insn_ptr last_in_row = NULL;
2999   int row;
3000 
3001   if (! ps_i)
3002     return false;
3003 
3004   row = SMODULO (ps_i->cycle, ps->ii);
3005 
3006   /* Find the first must follow and the last must precede
3007      and insert the node immediately after the must precede
3008      but make sure that it there is no must follow after it.  */
3009   for (next_ps_i = ps->rows[row];
3010        next_ps_i;
3011        next_ps_i = next_ps_i->next_in_row)
3012     {
3013       if (must_follow
3014 	  && bitmap_bit_p (must_follow, next_ps_i->id)
3015 	  && ! first_must_follow)
3016         first_must_follow = next_ps_i;
3017       if (must_precede && bitmap_bit_p (must_precede, next_ps_i->id))
3018         {
3019           /* If we have already met a node that must follow, then
3020 	     there is no possible column.  */
3021   	  if (first_must_follow)
3022             return false;
3023 	  else
3024             last_must_precede = next_ps_i;
3025         }
3026       /* The closing branch must be the last in the row.  */
3027       if (must_precede
3028 	  && bitmap_bit_p (must_precede, next_ps_i->id)
3029 	  && JUMP_P (ps_rtl_insn (ps, next_ps_i->id)))
3030 	return false;
3031 
3032        last_in_row = next_ps_i;
3033     }
3034 
3035   /* The closing branch is scheduled as well.  Make sure there is no
3036      dependent instruction after it as the branch should be the last
3037      instruction in the row.  */
3038   if (JUMP_P (ps_rtl_insn (ps, ps_i->id)))
3039     {
3040       if (first_must_follow)
3041 	return false;
3042       if (last_in_row)
3043 	{
3044 	  /* Make the branch the last in the row.  New instructions
3045 	     will be inserted at the beginning of the row or after the
3046 	     last must_precede instruction thus the branch is guaranteed
3047 	     to remain the last instruction in the row.  */
3048 	  last_in_row->next_in_row = ps_i;
3049 	  ps_i->prev_in_row = last_in_row;
3050 	  ps_i->next_in_row = NULL;
3051 	}
3052       else
3053 	ps->rows[row] = ps_i;
3054       return true;
3055     }
3056 
3057   /* Now insert the node after INSERT_AFTER_PSI.  */
3058 
3059   if (! last_must_precede)
3060     {
3061       ps_i->next_in_row = ps->rows[row];
3062       ps_i->prev_in_row = NULL;
3063       if (ps_i->next_in_row)
3064     	ps_i->next_in_row->prev_in_row = ps_i;
3065       ps->rows[row] = ps_i;
3066     }
3067   else
3068     {
3069       ps_i->next_in_row = last_must_precede->next_in_row;
3070       last_must_precede->next_in_row = ps_i;
3071       ps_i->prev_in_row = last_must_precede;
3072       if (ps_i->next_in_row)
3073         ps_i->next_in_row->prev_in_row = ps_i;
3074     }
3075 
3076   return true;
3077 }
3078 
3079 /* Advances the PS_INSN one column in its current row; returns false
3080    in failure and true in success.  Bit N is set in MUST_FOLLOW if
3081    the node with cuid N must be come after the node pointed to by
3082    PS_I when scheduled in the same cycle.  */
3083 static int
ps_insn_advance_column(partial_schedule_ptr ps,ps_insn_ptr ps_i,sbitmap must_follow)3084 ps_insn_advance_column (partial_schedule_ptr ps, ps_insn_ptr ps_i,
3085 			sbitmap must_follow)
3086 {
3087   ps_insn_ptr prev, next;
3088   int row;
3089 
3090   if (!ps || !ps_i)
3091     return false;
3092 
3093   row = SMODULO (ps_i->cycle, ps->ii);
3094 
3095   if (! ps_i->next_in_row)
3096     return false;
3097 
3098   /* Check if next_in_row is dependent on ps_i, both having same sched
3099      times (typically ANTI_DEP).  If so, ps_i cannot skip over it.  */
3100   if (must_follow && bitmap_bit_p (must_follow, ps_i->next_in_row->id))
3101     return false;
3102 
3103   /* Advance PS_I over its next_in_row in the doubly linked list.  */
3104   prev = ps_i->prev_in_row;
3105   next = ps_i->next_in_row;
3106 
3107   if (ps_i == ps->rows[row])
3108     ps->rows[row] = next;
3109 
3110   ps_i->next_in_row = next->next_in_row;
3111 
3112   if (next->next_in_row)
3113     next->next_in_row->prev_in_row = ps_i;
3114 
3115   next->next_in_row = ps_i;
3116   ps_i->prev_in_row = next;
3117 
3118   next->prev_in_row = prev;
3119   if (prev)
3120     prev->next_in_row = next;
3121 
3122   return true;
3123 }
3124 
3125 /* Inserts a DDG_NODE to the given partial schedule at the given cycle.
3126    Returns 0 if this is not possible and a PS_INSN otherwise.  Bit N is
3127    set in MUST_PRECEDE/MUST_FOLLOW if the node with cuid N must be come
3128    before/after (respectively) the node pointed to by PS_I when scheduled
3129    in the same cycle.  */
3130 static ps_insn_ptr
add_node_to_ps(partial_schedule_ptr ps,int id,int cycle,sbitmap must_precede,sbitmap must_follow)3131 add_node_to_ps (partial_schedule_ptr ps, int id, int cycle,
3132 		sbitmap must_precede, sbitmap must_follow)
3133 {
3134   ps_insn_ptr ps_i;
3135   int row = SMODULO (cycle, ps->ii);
3136 
3137   if (ps->rows_length[row] >= issue_rate)
3138     return NULL;
3139 
3140   ps_i = create_ps_insn (id, cycle);
3141 
3142   /* Finds and inserts PS_I according to MUST_FOLLOW and
3143      MUST_PRECEDE.  */
3144   if (! ps_insn_find_column (ps, ps_i, must_precede, must_follow))
3145     {
3146       free (ps_i);
3147       return NULL;
3148     }
3149 
3150   ps->rows_length[row] += 1;
3151   return ps_i;
3152 }
3153 
3154 /* Advance time one cycle.  Assumes DFA is being used.  */
3155 static void
advance_one_cycle(void)3156 advance_one_cycle (void)
3157 {
3158   if (targetm.sched.dfa_pre_cycle_insn)
3159     state_transition (curr_state,
3160 		      targetm.sched.dfa_pre_cycle_insn ());
3161 
3162   state_transition (curr_state, NULL);
3163 
3164   if (targetm.sched.dfa_post_cycle_insn)
3165     state_transition (curr_state,
3166 		      targetm.sched.dfa_post_cycle_insn ());
3167 }
3168 
3169 
3170 
3171 /* Checks if PS has resource conflicts according to DFA, starting from
3172    FROM cycle to TO cycle; returns true if there are conflicts and false
3173    if there are no conflicts.  Assumes DFA is being used.  */
3174 static int
ps_has_conflicts(partial_schedule_ptr ps,int from,int to)3175 ps_has_conflicts (partial_schedule_ptr ps, int from, int to)
3176 {
3177   int cycle;
3178 
3179   state_reset (curr_state);
3180 
3181   for (cycle = from; cycle <= to; cycle++)
3182     {
3183       ps_insn_ptr crr_insn;
3184       /* Holds the remaining issue slots in the current row.  */
3185       int can_issue_more = issue_rate;
3186 
3187       /* Walk through the DFA for the current row.  */
3188       for (crr_insn = ps->rows[SMODULO (cycle, ps->ii)];
3189 	   crr_insn;
3190 	   crr_insn = crr_insn->next_in_row)
3191 	{
3192 	  rtx insn = ps_rtl_insn (ps, crr_insn->id);
3193 
3194 	  if (!NONDEBUG_INSN_P (insn))
3195 	    continue;
3196 
3197 	  /* Check if there is room for the current insn.  */
3198 	  if (!can_issue_more || state_dead_lock_p (curr_state))
3199 	    return true;
3200 
3201 	  /* Update the DFA state and return with failure if the DFA found
3202 	     resource conflicts.  */
3203 	  if (state_transition (curr_state, insn) >= 0)
3204 	    return true;
3205 
3206 	  if (targetm.sched.variable_issue)
3207 	    can_issue_more =
3208 	      targetm.sched.variable_issue (sched_dump, sched_verbose,
3209 					    insn, can_issue_more);
3210 	  /* A naked CLOBBER or USE generates no instruction, so don't
3211 	     let them consume issue slots.  */
3212 	  else if (GET_CODE (PATTERN (insn)) != USE
3213 		   && GET_CODE (PATTERN (insn)) != CLOBBER)
3214 	    can_issue_more--;
3215 	}
3216 
3217       /* Advance the DFA to the next cycle.  */
3218       advance_one_cycle ();
3219     }
3220   return false;
3221 }
3222 
3223 /* Checks if the given node causes resource conflicts when added to PS at
3224    cycle C.  If not the node is added to PS and returned; otherwise zero
3225    is returned.  Bit N is set in MUST_PRECEDE/MUST_FOLLOW if the node with
3226    cuid N must be come before/after (respectively) the node pointed to by
3227    PS_I when scheduled in the same cycle.  */
3228 ps_insn_ptr
ps_add_node_check_conflicts(partial_schedule_ptr ps,int n,int c,sbitmap must_precede,sbitmap must_follow)3229 ps_add_node_check_conflicts (partial_schedule_ptr ps, int n,
3230    			     int c, sbitmap must_precede,
3231 			     sbitmap must_follow)
3232 {
3233   int has_conflicts = 0;
3234   ps_insn_ptr ps_i;
3235 
3236   /* First add the node to the PS, if this succeeds check for
3237      conflicts, trying different issue slots in the same row.  */
3238   if (! (ps_i = add_node_to_ps (ps, n, c, must_precede, must_follow)))
3239     return NULL; /* Failed to insert the node at the given cycle.  */
3240 
3241   has_conflicts = ps_has_conflicts (ps, c, c)
3242 		  || (ps->history > 0
3243 		      && ps_has_conflicts (ps,
3244 					   c - ps->history,
3245 					   c + ps->history));
3246 
3247   /* Try different issue slots to find one that the given node can be
3248      scheduled in without conflicts.  */
3249   while (has_conflicts)
3250     {
3251       if (! ps_insn_advance_column (ps, ps_i, must_follow))
3252 	break;
3253       has_conflicts = ps_has_conflicts (ps, c, c)
3254 		      || (ps->history > 0
3255 			  && ps_has_conflicts (ps,
3256 					       c - ps->history,
3257 					       c + ps->history));
3258     }
3259 
3260   if (has_conflicts)
3261     {
3262       remove_node_from_ps (ps, ps_i);
3263       return NULL;
3264     }
3265 
3266   ps->min_cycle = MIN (ps->min_cycle, c);
3267   ps->max_cycle = MAX (ps->max_cycle, c);
3268   return ps_i;
3269 }
3270 
3271 /* Calculate the stage count of the partial schedule PS.  The calculation
3272    takes into account the rotation amount passed in ROTATION_AMOUNT.  */
3273 int
calculate_stage_count(partial_schedule_ptr ps,int rotation_amount)3274 calculate_stage_count (partial_schedule_ptr ps, int rotation_amount)
3275 {
3276   int new_min_cycle = PS_MIN_CYCLE (ps) - rotation_amount;
3277   int new_max_cycle = PS_MAX_CYCLE (ps) - rotation_amount;
3278   int stage_count = CALC_STAGE_COUNT (-1, new_min_cycle, ps->ii);
3279 
3280   /* The calculation of stage count is done adding the number of stages
3281      before cycle zero and after cycle zero.  */
3282   stage_count += CALC_STAGE_COUNT (new_max_cycle, 0, ps->ii);
3283 
3284   return stage_count;
3285 }
3286 
3287 /* Rotate the rows of PS such that insns scheduled at time
3288    START_CYCLE will appear in row 0.  Updates max/min_cycles.  */
3289 void
rotate_partial_schedule(partial_schedule_ptr ps,int start_cycle)3290 rotate_partial_schedule (partial_schedule_ptr ps, int start_cycle)
3291 {
3292   int i, row, backward_rotates;
3293   int last_row = ps->ii - 1;
3294 
3295   if (start_cycle == 0)
3296     return;
3297 
3298   backward_rotates = SMODULO (start_cycle, ps->ii);
3299 
3300   /* Revisit later and optimize this into a single loop.  */
3301   for (i = 0; i < backward_rotates; i++)
3302     {
3303       ps_insn_ptr first_row = ps->rows[0];
3304       int first_row_length = ps->rows_length[0];
3305 
3306       for (row = 0; row < last_row; row++)
3307 	{
3308 	  ps->rows[row] = ps->rows[row + 1];
3309 	  ps->rows_length[row] = ps->rows_length[row + 1];
3310 	}
3311 
3312       ps->rows[last_row] = first_row;
3313       ps->rows_length[last_row] = first_row_length;
3314     }
3315 
3316   ps->max_cycle -= start_cycle;
3317   ps->min_cycle -= start_cycle;
3318 }
3319 
3320 #endif /* INSN_SCHEDULING */
3321 
3322 static bool
gate_handle_sms(void)3323 gate_handle_sms (void)
3324 {
3325   return (optimize > 0 && flag_modulo_sched);
3326 }
3327 
3328 
3329 /* Run instruction scheduler.  */
3330 /* Perform SMS module scheduling.  */
3331 static unsigned int
rest_of_handle_sms(void)3332 rest_of_handle_sms (void)
3333 {
3334 #ifdef INSN_SCHEDULING
3335   basic_block bb;
3336 
3337   /* Collect loop information to be used in SMS.  */
3338   cfg_layout_initialize (0);
3339   sms_schedule ();
3340 
3341   /* Update the life information, because we add pseudos.  */
3342   max_regno = max_reg_num ();
3343 
3344   /* Finalize layout changes.  */
3345   FOR_EACH_BB (bb)
3346     if (bb->next_bb != EXIT_BLOCK_PTR)
3347       bb->aux = bb->next_bb;
3348   free_dominance_info (CDI_DOMINATORS);
3349   cfg_layout_finalize ();
3350 #endif /* INSN_SCHEDULING */
3351   return 0;
3352 }
3353 
3354 struct rtl_opt_pass pass_sms =
3355 {
3356  {
3357   RTL_PASS,
3358   "sms",                                /* name */
3359   OPTGROUP_NONE,                        /* optinfo_flags */
3360   gate_handle_sms,                      /* gate */
3361   rest_of_handle_sms,                   /* execute */
3362   NULL,                                 /* sub */
3363   NULL,                                 /* next */
3364   0,                                    /* static_pass_number */
3365   TV_SMS,                               /* tv_id */
3366   0,                                    /* properties_required */
3367   0,                                    /* properties_provided */
3368   0,                                    /* properties_destroyed */
3369   0,                                    /* todo_flags_start */
3370   TODO_df_finish
3371     | TODO_verify_flow
3372     | TODO_verify_rtl_sharing
3373     | TODO_ggc_collect                  /* todo_flags_finish */
3374  }
3375 };
3376