1 /* Swing Modulo Scheduling implementation.
2    Copyright (C) 2004-2019 Free Software Foundation, Inc.
3    Contributed by Ayal Zaks and Mustafa Hagog <zaks,mustafa@il.ibm.com>
4 
5 This file is part of GCC.
6 
7 GCC is free software; you can redistribute it and/or modify it under
8 the terms of the GNU General Public License as published by the Free
9 Software Foundation; either version 3, or (at your option) any later
10 version.
11 
12 GCC is distributed in the hope that it will be useful, but WITHOUT ANY
13 WARRANTY; without even the implied warranty of MERCHANTABILITY or
14 FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License
15 for more details.
16 
17 You should have received a copy of the GNU General Public License
18 along with GCC; see the file COPYING3.  If not see
19 <http://www.gnu.org/licenses/>.  */
20 
21 
22 #include "config.h"
23 #include "system.h"
24 #include "coretypes.h"
25 #include "backend.h"
26 #include "target.h"
27 #include "rtl.h"
28 #include "tree.h"
29 #include "cfghooks.h"
30 #include "df.h"
31 #include "memmodel.h"
32 #include "optabs.h"
33 #include "regs.h"
34 #include "emit-rtl.h"
35 #include "gcov-io.h"
36 #include "profile.h"
37 #include "insn-attr.h"
38 #include "cfgrtl.h"
39 #include "sched-int.h"
40 #include "cfgloop.h"
41 #include "expr.h"
42 #include "params.h"
43 #include "ddg.h"
44 #include "tree-pass.h"
45 #include "dbgcnt.h"
46 #include "loop-unroll.h"
47 #include "hard-reg-set.h"
48 
49 #ifdef INSN_SCHEDULING
50 
51 /* This file contains the implementation of the Swing Modulo Scheduler,
52    described in the following references:
53    [1] J. Llosa, A. Gonzalez, E. Ayguade, M. Valero., and J. Eckhardt.
54        Lifetime--sensitive modulo scheduling in a production environment.
55        IEEE Trans. on Comps., 50(3), March 2001
56    [2] J. Llosa, A. Gonzalez, E. Ayguade, and M. Valero.
57        Swing Modulo Scheduling: A Lifetime Sensitive Approach.
58        PACT '96 , pages 80-87, October 1996 (Boston - Massachusetts - USA).
59 
60    The basic structure is:
61    1. Build a data-dependence graph (DDG) for each loop.
62    2. Use the DDG to order the insns of a loop (not in topological order
63       necessarily, but rather) trying to place each insn after all its
64       predecessors _or_ after all its successors.
65    3. Compute MII: a lower bound on the number of cycles to schedule the loop.
66    4. Use the ordering to perform list-scheduling of the loop:
67       1. Set II = MII.  We will try to schedule the loop within II cycles.
68       2. Try to schedule the insns one by one according to the ordering.
69 	 For each insn compute an interval of cycles by considering already-
70 	 scheduled preds and succs (and associated latencies); try to place
71 	 the insn in the cycles of this window checking for potential
72 	 resource conflicts (using the DFA interface).
73 	 Note: this is different from the cycle-scheduling of schedule_insns;
74 	 here the insns are not scheduled monotonically top-down (nor bottom-
75 	 up).
76       3. If failed in scheduling all insns - bump II++ and try again, unless
77 	 II reaches an upper bound MaxII, in which case report failure.
78    5. If we succeeded in scheduling the loop within II cycles, we now
79       generate prolog and epilog, decrease the counter of the loop, and
80       perform modulo variable expansion for live ranges that span more than
81       II cycles (i.e. use register copies to prevent a def from overwriting
82       itself before reaching the use).
83 
84     SMS works with countable loops (1) whose control part can be easily
85     decoupled from the rest of the loop and (2) whose loop count can
86     be easily adjusted.  This is because we peel a constant number of
87     iterations into a prologue and epilogue for which we want to avoid
88     emitting the control part, and a kernel which is to iterate that
89     constant number of iterations less than the original loop.  So the
90     control part should be a set of insns clearly identified and having
91     its own iv, not otherwise used in the loop (at-least for now), which
92     initializes a register before the loop to the number of iterations.
93     Currently SMS relies on the do-loop pattern to recognize such loops,
94     where (1) the control part comprises of all insns defining and/or
95     using a certain 'count' register and (2) the loop count can be
96     adjusted by modifying this register prior to the loop.
97     TODO: Rely on cfgloop analysis instead.  */
98 
99 /* This page defines partial-schedule structures and functions for
100    modulo scheduling.  */
101 
102 typedef struct partial_schedule *partial_schedule_ptr;
103 typedef struct ps_insn *ps_insn_ptr;
104 
105 /* The minimum (absolute) cycle that a node of ps was scheduled in.  */
106 #define PS_MIN_CYCLE(ps) (((partial_schedule_ptr)(ps))->min_cycle)
107 
108 /* The maximum (absolute) cycle that a node of ps was scheduled in.  */
109 #define PS_MAX_CYCLE(ps) (((partial_schedule_ptr)(ps))->max_cycle)
110 
111 /* Perform signed modulo, always returning a non-negative value.  */
112 #define SMODULO(x,y) ((x) % (y) < 0 ? ((x) % (y) + (y)) : (x) % (y))
113 
114 /* The number of different iterations the nodes in ps span, assuming
115    the stage boundaries are placed efficiently.  */
116 #define CALC_STAGE_COUNT(max_cycle,min_cycle,ii) ((max_cycle - min_cycle \
117                          + 1 + ii - 1) / ii)
118 /* The stage count of ps.  */
119 #define PS_STAGE_COUNT(ps) (((partial_schedule_ptr)(ps))->stage_count)
120 
121 /* A single instruction in the partial schedule.  */
122 struct ps_insn
123 {
124   /* Identifies the instruction to be scheduled.  Values smaller than
125      the ddg's num_nodes refer directly to ddg nodes.  A value of
126      X - num_nodes refers to register move X.  */
127   int id;
128 
129   /* The (absolute) cycle in which the PS instruction is scheduled.
130      Same as SCHED_TIME (node).  */
131   int cycle;
132 
133   /* The next/prev PS_INSN in the same row.  */
134   ps_insn_ptr next_in_row,
135 	      prev_in_row;
136 
137 };
138 
139 /* Information about a register move that has been added to a partial
140    schedule.  */
141 struct ps_reg_move_info
142 {
143   /* The source of the move is defined by the ps_insn with id DEF.
144      The destination is used by the ps_insns with the ids in USES.  */
145   int def;
146   sbitmap uses;
147 
148   /* The original form of USES' instructions used OLD_REG, but they
149      should now use NEW_REG.  */
150   rtx old_reg;
151   rtx new_reg;
152 
153   /* The number of consecutive stages that the move occupies.  */
154   int num_consecutive_stages;
155 
156   /* An instruction that sets NEW_REG to the correct value.  The first
157      move associated with DEF will have an rhs of OLD_REG; later moves
158      use the result of the previous move.  */
159   rtx_insn *insn;
160 };
161 
162 /* Holds the partial schedule as an array of II rows.  Each entry of the
163    array points to a linked list of PS_INSNs, which represents the
164    instructions that are scheduled for that row.  */
165 struct partial_schedule
166 {
167   int ii;	/* Number of rows in the partial schedule.  */
168   int history;  /* Threshold for conflict checking using DFA.  */
169 
170   /* rows[i] points to linked list of insns scheduled in row i (0<=i<ii).  */
171   ps_insn_ptr *rows;
172 
173   /* All the moves added for this partial schedule.  Index X has
174      a ps_insn id of X + g->num_nodes.  */
175   vec<ps_reg_move_info> reg_moves;
176 
177   /*  rows_length[i] holds the number of instructions in the row.
178       It is used only (as an optimization) to back off quickly from
179       trying to schedule a node in a full row; that is, to avoid running
180       through futile DFA state transitions.  */
181   int *rows_length;
182 
183   /* The earliest absolute cycle of an insn in the partial schedule.  */
184   int min_cycle;
185 
186   /* The latest absolute cycle of an insn in the partial schedule.  */
187   int max_cycle;
188 
189   ddg_ptr g;	/* The DDG of the insns in the partial schedule.  */
190 
191   int stage_count;  /* The stage count of the partial schedule.  */
192 };
193 
194 
195 static partial_schedule_ptr create_partial_schedule (int ii, ddg_ptr, int history);
196 static void free_partial_schedule (partial_schedule_ptr);
197 static void reset_partial_schedule (partial_schedule_ptr, int new_ii);
198 void print_partial_schedule (partial_schedule_ptr, FILE *);
199 static void verify_partial_schedule (partial_schedule_ptr, sbitmap);
200 static ps_insn_ptr ps_add_node_check_conflicts (partial_schedule_ptr,
201 						int, int, sbitmap, sbitmap);
202 static void rotate_partial_schedule (partial_schedule_ptr, int);
203 void set_row_column_for_ps (partial_schedule_ptr);
204 static void ps_insert_empty_row (partial_schedule_ptr, int, sbitmap);
205 static int compute_split_row (sbitmap, int, int, int, ddg_node_ptr);
206 
207 
208 /* This page defines constants and structures for the modulo scheduling
209    driver.  */
210 
211 static int sms_order_nodes (ddg_ptr, int, int *, int *);
212 static void set_node_sched_params (ddg_ptr);
213 static partial_schedule_ptr sms_schedule_by_order (ddg_ptr, int, int, int *);
214 static void permute_partial_schedule (partial_schedule_ptr, rtx_insn *);
215 static int calculate_stage_count (partial_schedule_ptr, int);
216 static void calculate_must_precede_follow (ddg_node_ptr, int, int,
217 					   int, int, sbitmap, sbitmap, sbitmap);
218 static int get_sched_window (partial_schedule_ptr, ddg_node_ptr,
219 			     sbitmap, int, int *, int *, int *);
220 static bool try_scheduling_node_in_cycle (partial_schedule_ptr, int, int,
221 					  sbitmap, int *, sbitmap, sbitmap);
222 static void remove_node_from_ps (partial_schedule_ptr, ps_insn_ptr);
223 
224 #define NODE_ASAP(node) ((node)->aux.count)
225 
226 #define SCHED_PARAMS(x) (&node_sched_param_vec[x])
227 #define SCHED_TIME(x) (SCHED_PARAMS (x)->time)
228 #define SCHED_ROW(x) (SCHED_PARAMS (x)->row)
229 #define SCHED_STAGE(x) (SCHED_PARAMS (x)->stage)
230 #define SCHED_COLUMN(x) (SCHED_PARAMS (x)->column)
231 
232 /* The scheduling parameters held for each node.  */
233 typedef struct node_sched_params
234 {
235   int time;	/* The absolute scheduling cycle.  */
236 
237   int row;    /* Holds time % ii.  */
238   int stage;  /* Holds time / ii.  */
239 
240   /* The column of a node inside the ps.  If nodes u, v are on the same row,
241      u will precede v if column (u) < column (v).  */
242   int column;
243 } *node_sched_params_ptr;
244 
245 /* The following three functions are copied from the current scheduler
246    code in order to use sched_analyze() for computing the dependencies.
247    They are used when initializing the sched_info structure.  */
248 static const char *
sms_print_insn(const rtx_insn * insn,int aligned ATTRIBUTE_UNUSED)249 sms_print_insn (const rtx_insn *insn, int aligned ATTRIBUTE_UNUSED)
250 {
251   static char tmp[80];
252 
253   sprintf (tmp, "i%4d", INSN_UID (insn));
254   return tmp;
255 }
256 
257 static void
compute_jump_reg_dependencies(rtx insn ATTRIBUTE_UNUSED,regset used ATTRIBUTE_UNUSED)258 compute_jump_reg_dependencies (rtx insn ATTRIBUTE_UNUSED,
259 			       regset used ATTRIBUTE_UNUSED)
260 {
261 }
262 
263 static struct common_sched_info_def sms_common_sched_info;
264 
265 static struct sched_deps_info_def sms_sched_deps_info =
266   {
267     compute_jump_reg_dependencies,
268     NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL,
269     NULL,
270     0, 0, 0
271   };
272 
273 static struct haifa_sched_info sms_sched_info =
274 {
275   NULL,
276   NULL,
277   NULL,
278   NULL,
279   NULL,
280   sms_print_insn,
281   NULL,
282   NULL, /* insn_finishes_block_p */
283   NULL, NULL,
284   NULL, NULL,
285   0, 0,
286 
287   NULL, NULL, NULL, NULL,
288   NULL, NULL,
289   0
290 };
291 
292 /* Partial schedule instruction ID in PS is a register move.  Return
293    information about it.  */
294 static struct ps_reg_move_info *
ps_reg_move(partial_schedule_ptr ps,int id)295 ps_reg_move (partial_schedule_ptr ps, int id)
296 {
297   gcc_checking_assert (id >= ps->g->num_nodes);
298   return &ps->reg_moves[id - ps->g->num_nodes];
299 }
300 
301 /* Return the rtl instruction that is being scheduled by partial schedule
302    instruction ID, which belongs to schedule PS.  */
303 static rtx_insn *
ps_rtl_insn(partial_schedule_ptr ps,int id)304 ps_rtl_insn (partial_schedule_ptr ps, int id)
305 {
306   if (id < ps->g->num_nodes)
307     return ps->g->nodes[id].insn;
308   else
309     return ps_reg_move (ps, id)->insn;
310 }
311 
312 /* Partial schedule instruction ID, which belongs to PS, occurred in
313    the original (unscheduled) loop.  Return the first instruction
314    in the loop that was associated with ps_rtl_insn (PS, ID).
315    If the instruction had some notes before it, this is the first
316    of those notes.  */
317 static rtx_insn *
ps_first_note(partial_schedule_ptr ps,int id)318 ps_first_note (partial_schedule_ptr ps, int id)
319 {
320   gcc_assert (id < ps->g->num_nodes);
321   return ps->g->nodes[id].first_note;
322 }
323 
324 /* Return the number of consecutive stages that are occupied by
325    partial schedule instruction ID in PS.  */
326 static int
ps_num_consecutive_stages(partial_schedule_ptr ps,int id)327 ps_num_consecutive_stages (partial_schedule_ptr ps, int id)
328 {
329   if (id < ps->g->num_nodes)
330     return 1;
331   else
332     return ps_reg_move (ps, id)->num_consecutive_stages;
333 }
334 
335 /* Given HEAD and TAIL which are the first and last insns in a loop;
336    return the register which controls the loop.  Return zero if it has
337    more than one occurrence in the loop besides the control part or the
338    do-loop pattern is not of the form we expect.  */
339 static rtx
doloop_register_get(rtx_insn * head,rtx_insn * tail)340 doloop_register_get (rtx_insn *head, rtx_insn *tail)
341 {
342   rtx reg, condition;
343   rtx_insn *insn, *first_insn_not_to_check;
344 
345   if (!JUMP_P (tail))
346     return NULL_RTX;
347 
348   if (!targetm.code_for_doloop_end)
349     return NULL_RTX;
350 
351   /* TODO: Free SMS's dependence on doloop_condition_get.  */
352   condition = doloop_condition_get (tail);
353   if (! condition)
354     return NULL_RTX;
355 
356   if (REG_P (XEXP (condition, 0)))
357     reg = XEXP (condition, 0);
358   else if (GET_CODE (XEXP (condition, 0)) == PLUS
359 	   && REG_P (XEXP (XEXP (condition, 0), 0)))
360     reg = XEXP (XEXP (condition, 0), 0);
361   else
362     gcc_unreachable ();
363 
364   /* Check that the COUNT_REG has no other occurrences in the loop
365      until the decrement.  We assume the control part consists of
366      either a single (parallel) branch-on-count or a (non-parallel)
367      branch immediately preceded by a single (decrement) insn.  */
368   first_insn_not_to_check = (GET_CODE (PATTERN (tail)) == PARALLEL ? tail
369                              : prev_nondebug_insn (tail));
370 
371   for (insn = head; insn != first_insn_not_to_check; insn = NEXT_INSN (insn))
372     if (!DEBUG_INSN_P (insn) && reg_mentioned_p (reg, insn))
373       {
374         if (dump_file)
375         {
376           fprintf (dump_file, "SMS count_reg found ");
377           print_rtl_single (dump_file, reg);
378           fprintf (dump_file, " outside control in insn:\n");
379           print_rtl_single (dump_file, insn);
380         }
381 
382         return NULL_RTX;
383       }
384 
385   return reg;
386 }
387 
388 /* Check if COUNT_REG is set to a constant in the PRE_HEADER block, so
389    that the number of iterations is a compile-time constant.  If so,
390    return the rtx_insn that sets COUNT_REG to a constant, and set COUNT to
391    this constant.  Otherwise return 0.  */
392 static rtx_insn *
const_iteration_count(rtx count_reg,basic_block pre_header,int64_t * count,bool * adjust_inplace)393 const_iteration_count (rtx count_reg, basic_block pre_header,
394 		       int64_t *count, bool* adjust_inplace)
395 {
396   rtx_insn *insn;
397   rtx_insn *head, *tail;
398 
399   *adjust_inplace = false;
400   bool read_after = false;
401 
402   if (! pre_header)
403     return NULL;
404 
405   get_ebb_head_tail (pre_header, pre_header, &head, &tail);
406 
407   for (insn = tail; insn != PREV_INSN (head); insn = PREV_INSN (insn))
408     if (single_set (insn) && rtx_equal_p (count_reg,
409 					  SET_DEST (single_set (insn))))
410       {
411 	rtx pat = single_set (insn);
412 
413 	if (CONST_INT_P (SET_SRC (pat)))
414 	  {
415 	    *count = INTVAL (SET_SRC (pat));
416 	    *adjust_inplace = !read_after;
417 	    return insn;
418 	  }
419 
420 	return NULL;
421       }
422     else if (NONDEBUG_INSN_P (insn) && reg_mentioned_p (count_reg, insn))
423       {
424 	read_after = true;
425 	if (reg_set_p (count_reg, insn))
426 	   break;
427       }
428 
429   return NULL;
430 }
431 
432 /* A very simple resource-based lower bound on the initiation interval.
433    ??? Improve the accuracy of this bound by considering the
434    utilization of various units.  */
435 static int
res_MII(ddg_ptr g)436 res_MII (ddg_ptr g)
437 {
438   if (targetm.sched.sms_res_mii)
439     return targetm.sched.sms_res_mii (g);
440 
441   return ((g->num_nodes - g->num_debug) / issue_rate);
442 }
443 
444 
445 /* A vector that contains the sched data for each ps_insn.  */
446 static vec<node_sched_params> node_sched_param_vec;
447 
448 /* Allocate sched_params for each node and initialize it.  */
449 static void
set_node_sched_params(ddg_ptr g)450 set_node_sched_params (ddg_ptr g)
451 {
452   node_sched_param_vec.truncate (0);
453   node_sched_param_vec.safe_grow_cleared (g->num_nodes);
454 }
455 
456 /* Make sure that node_sched_param_vec has an entry for every move in PS.  */
457 static void
extend_node_sched_params(partial_schedule_ptr ps)458 extend_node_sched_params (partial_schedule_ptr ps)
459 {
460   node_sched_param_vec.safe_grow_cleared (ps->g->num_nodes
461 					  + ps->reg_moves.length ());
462 }
463 
464 /* Update the sched_params (time, row and stage) for node U using the II,
465    the CYCLE of U and MIN_CYCLE.
466    We're not simply taking the following
467    SCHED_STAGE (u) = CALC_STAGE_COUNT (SCHED_TIME (u), min_cycle, ii);
468    because the stages may not be aligned on cycle 0.  */
469 static void
update_node_sched_params(int u,int ii,int cycle,int min_cycle)470 update_node_sched_params (int u, int ii, int cycle, int min_cycle)
471 {
472   int sc_until_cycle_zero;
473   int stage;
474 
475   SCHED_TIME (u) = cycle;
476   SCHED_ROW (u) = SMODULO (cycle, ii);
477 
478   /* The calculation of stage count is done adding the number
479      of stages before cycle zero and after cycle zero.  */
480   sc_until_cycle_zero = CALC_STAGE_COUNT (-1, min_cycle, ii);
481 
482   if (SCHED_TIME (u) < 0)
483     {
484       stage = CALC_STAGE_COUNT (-1, SCHED_TIME (u), ii);
485       SCHED_STAGE (u) = sc_until_cycle_zero - stage;
486     }
487   else
488     {
489       stage = CALC_STAGE_COUNT (SCHED_TIME (u), 0, ii);
490       SCHED_STAGE (u) = sc_until_cycle_zero + stage - 1;
491     }
492 }
493 
494 static void
print_node_sched_params(FILE * file,int num_nodes,partial_schedule_ptr ps)495 print_node_sched_params (FILE *file, int num_nodes, partial_schedule_ptr ps)
496 {
497   int i;
498 
499   if (! file)
500     return;
501   for (i = 0; i < num_nodes; i++)
502     {
503       node_sched_params_ptr nsp = SCHED_PARAMS (i);
504 
505       fprintf (file, "Node = %d; INSN = %d\n", i,
506 	       INSN_UID (ps_rtl_insn (ps, i)));
507       fprintf (file, " asap = %d:\n", NODE_ASAP (&ps->g->nodes[i]));
508       fprintf (file, " time = %d:\n", nsp->time);
509       fprintf (file, " stage = %d:\n", nsp->stage);
510     }
511 }
512 
513 /* Set SCHED_COLUMN for each instruction in row ROW of PS.  */
514 static void
set_columns_for_row(partial_schedule_ptr ps,int row)515 set_columns_for_row (partial_schedule_ptr ps, int row)
516 {
517   ps_insn_ptr cur_insn;
518   int column;
519 
520   column = 0;
521   for (cur_insn = ps->rows[row]; cur_insn; cur_insn = cur_insn->next_in_row)
522     SCHED_COLUMN (cur_insn->id) = column++;
523 }
524 
525 /* Set SCHED_COLUMN for each instruction in PS.  */
526 static void
set_columns_for_ps(partial_schedule_ptr ps)527 set_columns_for_ps (partial_schedule_ptr ps)
528 {
529   int row;
530 
531   for (row = 0; row < ps->ii; row++)
532     set_columns_for_row (ps, row);
533 }
534 
535 /* Try to schedule the move with ps_insn identifier I_REG_MOVE in PS.
536    Its single predecessor has already been scheduled, as has its
537    ddg node successors.  (The move may have also another move as its
538    successor, in which case that successor will be scheduled later.)
539 
540    The move is part of a chain that satisfies register dependencies
541    between a producing ddg node and various consuming ddg nodes.
542    If some of these dependencies have a distance of 1 (meaning that
543    the use is upward-exposed) then DISTANCE1_USES is nonnull and
544    contains the set of uses with distance-1 dependencies.
545    DISTANCE1_USES is null otherwise.
546 
547    MUST_FOLLOW is a scratch bitmap that is big enough to hold
548    all current ps_insn ids.
549 
550    Return true on success.  */
551 static bool
schedule_reg_move(partial_schedule_ptr ps,int i_reg_move,sbitmap distance1_uses,sbitmap must_follow)552 schedule_reg_move (partial_schedule_ptr ps, int i_reg_move,
553 		   sbitmap distance1_uses, sbitmap must_follow)
554 {
555   unsigned int u;
556   int this_time, this_distance, this_start, this_end, this_latency;
557   int start, end, c, ii;
558   sbitmap_iterator sbi;
559   ps_reg_move_info *move;
560   rtx_insn *this_insn;
561   ps_insn_ptr psi;
562 
563   move = ps_reg_move (ps, i_reg_move);
564   ii = ps->ii;
565   if (dump_file)
566     {
567       fprintf (dump_file, "Scheduling register move INSN %d; ii = %d"
568 	       ", min cycle = %d\n\n", INSN_UID (move->insn), ii,
569 	       PS_MIN_CYCLE (ps));
570       print_rtl_single (dump_file, move->insn);
571       fprintf (dump_file, "\n%11s %11s %5s\n", "start", "end", "time");
572       fprintf (dump_file, "=========== =========== =====\n");
573     }
574 
575   start = INT_MIN;
576   end = INT_MAX;
577 
578   /* For dependencies of distance 1 between a producer ddg node A
579      and consumer ddg node B, we have a chain of dependencies:
580 
581         A --(T,L1,1)--> M1 --(T,L2,0)--> M2 ... --(T,Ln,0)--> B
582 
583      where Mi is the ith move.  For dependencies of distance 0 between
584      a producer ddg node A and consumer ddg node C, we have a chain of
585      dependencies:
586 
587         A --(T,L1',0)--> M1' --(T,L2',0)--> M2' ... --(T,Ln',0)--> C
588 
589      where Mi' occupies the same position as Mi but occurs a stage later.
590      We can only schedule each move once, so if we have both types of
591      chain, we model the second as:
592 
593         A --(T,L1',1)--> M1 --(T,L2',0)--> M2 ... --(T,Ln',-1)--> C
594 
595      First handle the dependencies between the previously-scheduled
596      predecessor and the move.  */
597   this_insn = ps_rtl_insn (ps, move->def);
598   this_latency = insn_latency (this_insn, move->insn);
599   this_distance = distance1_uses && move->def < ps->g->num_nodes ? 1 : 0;
600   this_time = SCHED_TIME (move->def) - this_distance * ii;
601   this_start = this_time + this_latency;
602   this_end = this_time + ii;
603   if (dump_file)
604     fprintf (dump_file, "%11d %11d %5d %d --(T,%d,%d)--> %d\n",
605 	     this_start, this_end, SCHED_TIME (move->def),
606 	     INSN_UID (this_insn), this_latency, this_distance,
607 	     INSN_UID (move->insn));
608 
609   if (start < this_start)
610     start = this_start;
611   if (end > this_end)
612     end = this_end;
613 
614   /* Handle the dependencies between the move and previously-scheduled
615      successors.  */
616   EXECUTE_IF_SET_IN_BITMAP (move->uses, 0, u, sbi)
617     {
618       this_insn = ps_rtl_insn (ps, u);
619       this_latency = insn_latency (move->insn, this_insn);
620       if (distance1_uses && !bitmap_bit_p (distance1_uses, u))
621 	this_distance = -1;
622       else
623 	this_distance = 0;
624       this_time = SCHED_TIME (u) + this_distance * ii;
625       this_start = this_time - ii;
626       this_end = this_time - this_latency;
627       if (dump_file)
628 	fprintf (dump_file, "%11d %11d %5d %d --(T,%d,%d)--> %d\n",
629 		 this_start, this_end, SCHED_TIME (u), INSN_UID (move->insn),
630 		 this_latency, this_distance, INSN_UID (this_insn));
631 
632       if (start < this_start)
633 	start = this_start;
634       if (end > this_end)
635 	end = this_end;
636     }
637 
638   if (dump_file)
639     {
640       fprintf (dump_file, "----------- ----------- -----\n");
641       fprintf (dump_file, "%11d %11d %5s %s\n", start, end, "", "(max, min)");
642     }
643 
644   bitmap_clear (must_follow);
645   bitmap_set_bit (must_follow, move->def);
646 
647   start = MAX (start, end - (ii - 1));
648   for (c = end; c >= start; c--)
649     {
650       psi = ps_add_node_check_conflicts (ps, i_reg_move, c,
651 					 move->uses, must_follow);
652       if (psi)
653 	{
654 	  update_node_sched_params (i_reg_move, ii, c, PS_MIN_CYCLE (ps));
655 	  if (dump_file)
656 	    fprintf (dump_file, "\nScheduled register move INSN %d at"
657 		     " time %d, row %d\n\n", INSN_UID (move->insn), c,
658 		     SCHED_ROW (i_reg_move));
659 	  return true;
660 	}
661     }
662 
663   if (dump_file)
664     fprintf (dump_file, "\nNo available slot\n\n");
665 
666   return false;
667 }
668 
669 /*
670    Breaking intra-loop register anti-dependences:
671    Each intra-loop register anti-dependence implies a cross-iteration true
672    dependence of distance 1. Therefore, we can remove such false dependencies
673    and figure out if the partial schedule broke them by checking if (for a
674    true-dependence of distance 1): SCHED_TIME (def) < SCHED_TIME (use) and
675    if so generate a register move.   The number of such moves is equal to:
676               SCHED_TIME (use) - SCHED_TIME (def)       { 0 broken
677    nreg_moves = ----------------------------------- + 1 - {   dependence.
678                             ii                          { 1 if not.
679 */
680 static bool
schedule_reg_moves(partial_schedule_ptr ps)681 schedule_reg_moves (partial_schedule_ptr ps)
682 {
683   ddg_ptr g = ps->g;
684   int ii = ps->ii;
685   int i;
686 
687   for (i = 0; i < g->num_nodes; i++)
688     {
689       ddg_node_ptr u = &g->nodes[i];
690       ddg_edge_ptr e;
691       int nreg_moves = 0, i_reg_move;
692       rtx prev_reg, old_reg;
693       int first_move;
694       int distances[2];
695       sbitmap distance1_uses;
696       rtx set = single_set (u->insn);
697 
698       /* Skip instructions that do not set a register.  */
699       if (set && !REG_P (SET_DEST (set)))
700         continue;
701 
702       /* Compute the number of reg_moves needed for u, by looking at life
703 	 ranges started at u (excluding self-loops).  */
704       distances[0] = distances[1] = false;
705       for (e = u->out; e; e = e->next_out)
706 	if (e->type == TRUE_DEP && e->dest != e->src)
707 	  {
708 	    int nreg_moves4e = (SCHED_TIME (e->dest->cuid)
709 				- SCHED_TIME (e->src->cuid)) / ii;
710 
711             if (e->distance == 1)
712               nreg_moves4e = (SCHED_TIME (e->dest->cuid)
713 			      - SCHED_TIME (e->src->cuid) + ii) / ii;
714 
715 	    /* If dest precedes src in the schedule of the kernel, then dest
716 	       will read before src writes and we can save one reg_copy.  */
717 	    if (SCHED_ROW (e->dest->cuid) == SCHED_ROW (e->src->cuid)
718 		&& SCHED_COLUMN (e->dest->cuid) < SCHED_COLUMN (e->src->cuid))
719 	      nreg_moves4e--;
720 
721             if (nreg_moves4e >= 1)
722 	      {
723 		/* !single_set instructions are not supported yet and
724 		   thus we do not except to encounter them in the loop
725 		   except from the doloop part.  For the latter case
726 		   we assume no regmoves are generated as the doloop
727 		   instructions are tied to the branch with an edge.  */
728 		gcc_assert (set);
729 		/* If the instruction contains auto-inc register then
730 		   validate that the regmov is being generated for the
731 		   target regsiter rather then the inc'ed register.	*/
732 		gcc_assert (!autoinc_var_is_used_p (u->insn, e->dest->insn));
733 	      }
734 
735 	    if (nreg_moves4e)
736 	      {
737 		gcc_assert (e->distance < 2);
738 		distances[e->distance] = true;
739 	      }
740 	    nreg_moves = MAX (nreg_moves, nreg_moves4e);
741 	  }
742 
743       if (nreg_moves == 0)
744 	continue;
745 
746       /* Create NREG_MOVES register moves.  */
747       first_move = ps->reg_moves.length ();
748       ps->reg_moves.safe_grow_cleared (first_move + nreg_moves);
749       extend_node_sched_params (ps);
750 
751       /* Record the moves associated with this node.  */
752       first_move += ps->g->num_nodes;
753 
754       /* Generate each move.  */
755       old_reg = prev_reg = SET_DEST (set);
756       if (HARD_REGISTER_P (old_reg))
757 	return false;
758 
759       for (i_reg_move = 0; i_reg_move < nreg_moves; i_reg_move++)
760 	{
761 	  ps_reg_move_info *move = ps_reg_move (ps, first_move + i_reg_move);
762 
763 	  move->def = i_reg_move > 0 ? first_move + i_reg_move - 1 : i;
764 	  move->uses = sbitmap_alloc (first_move + nreg_moves);
765 	  move->old_reg = old_reg;
766 	  move->new_reg = gen_reg_rtx (GET_MODE (prev_reg));
767 	  move->num_consecutive_stages = distances[0] && distances[1] ? 2 : 1;
768 	  move->insn = gen_move_insn (move->new_reg, copy_rtx (prev_reg));
769 	  bitmap_clear (move->uses);
770 
771 	  prev_reg = move->new_reg;
772 	}
773 
774       distance1_uses = distances[1] ? sbitmap_alloc (g->num_nodes) : NULL;
775 
776       if (distance1_uses)
777 	bitmap_clear (distance1_uses);
778 
779       /* Every use of the register defined by node may require a different
780 	 copy of this register, depending on the time the use is scheduled.
781 	 Record which uses require which move results.  */
782       for (e = u->out; e; e = e->next_out)
783 	if (e->type == TRUE_DEP && e->dest != e->src)
784 	  {
785 	    int dest_copy = (SCHED_TIME (e->dest->cuid)
786 			     - SCHED_TIME (e->src->cuid)) / ii;
787 
788 	    if (e->distance == 1)
789 	      dest_copy = (SCHED_TIME (e->dest->cuid)
790 			   - SCHED_TIME (e->src->cuid) + ii) / ii;
791 
792 	    if (SCHED_ROW (e->dest->cuid) == SCHED_ROW (e->src->cuid)
793 		&& SCHED_COLUMN (e->dest->cuid) < SCHED_COLUMN (e->src->cuid))
794 	      dest_copy--;
795 
796 	    if (dest_copy)
797 	      {
798 		ps_reg_move_info *move;
799 
800 		move = ps_reg_move (ps, first_move + dest_copy - 1);
801 		bitmap_set_bit (move->uses, e->dest->cuid);
802 		if (e->distance == 1)
803 		  bitmap_set_bit (distance1_uses, e->dest->cuid);
804 	      }
805 	  }
806 
807       auto_sbitmap must_follow (first_move + nreg_moves);
808       for (i_reg_move = 0; i_reg_move < nreg_moves; i_reg_move++)
809 	if (!schedule_reg_move (ps, first_move + i_reg_move,
810 				distance1_uses, must_follow))
811 	  break;
812       if (distance1_uses)
813 	sbitmap_free (distance1_uses);
814       if (i_reg_move < nreg_moves)
815 	return false;
816     }
817   return true;
818 }
819 
820 /* Emit the moves associated with PS.  Apply the substitutions
821    associated with them.  */
822 static void
apply_reg_moves(partial_schedule_ptr ps)823 apply_reg_moves (partial_schedule_ptr ps)
824 {
825   ps_reg_move_info *move;
826   int i;
827 
828   FOR_EACH_VEC_ELT (ps->reg_moves, i, move)
829     {
830       unsigned int i_use;
831       sbitmap_iterator sbi;
832 
833       EXECUTE_IF_SET_IN_BITMAP (move->uses, 0, i_use, sbi)
834 	{
835 	  replace_rtx (ps->g->nodes[i_use].insn, move->old_reg, move->new_reg);
836 	  df_insn_rescan (ps->g->nodes[i_use].insn);
837 	}
838     }
839 }
840 
841 /* Bump the SCHED_TIMEs of all nodes by AMOUNT.  Set the values of
842    SCHED_ROW and SCHED_STAGE.  Instruction scheduled on cycle AMOUNT
843    will move to cycle zero.  */
844 static void
reset_sched_times(partial_schedule_ptr ps,int amount)845 reset_sched_times (partial_schedule_ptr ps, int amount)
846 {
847   int row;
848   int ii = ps->ii;
849   ps_insn_ptr crr_insn;
850 
851   for (row = 0; row < ii; row++)
852     for (crr_insn = ps->rows[row]; crr_insn; crr_insn = crr_insn->next_in_row)
853       {
854 	int u = crr_insn->id;
855 	int normalized_time = SCHED_TIME (u) - amount;
856 	int new_min_cycle = PS_MIN_CYCLE (ps) - amount;
857 
858         if (dump_file)
859           {
860             /* Print the scheduling times after the rotation.  */
861 	    rtx_insn *insn = ps_rtl_insn (ps, u);
862 
863             fprintf (dump_file, "crr_insn->node=%d (insn id %d), "
864                      "crr_insn->cycle=%d, min_cycle=%d", u,
865                      INSN_UID (insn), normalized_time, new_min_cycle);
866             if (JUMP_P (insn))
867               fprintf (dump_file, " (branch)");
868             fprintf (dump_file, "\n");
869           }
870 
871 	gcc_assert (SCHED_TIME (u) >= ps->min_cycle);
872 	gcc_assert (SCHED_TIME (u) <= ps->max_cycle);
873 
874 	crr_insn->cycle = normalized_time;
875 	update_node_sched_params (u, ii, normalized_time, new_min_cycle);
876       }
877 }
878 
879 /* Permute the insns according to their order in PS, from row 0 to
880    row ii-1, and position them right before LAST.  This schedules
881    the insns of the loop kernel.  */
882 static void
permute_partial_schedule(partial_schedule_ptr ps,rtx_insn * last)883 permute_partial_schedule (partial_schedule_ptr ps, rtx_insn *last)
884 {
885   int ii = ps->ii;
886   int row;
887   ps_insn_ptr ps_ij;
888 
889   for (row = 0; row < ii ; row++)
890     for (ps_ij = ps->rows[row]; ps_ij; ps_ij = ps_ij->next_in_row)
891       {
892 	rtx_insn *insn = ps_rtl_insn (ps, ps_ij->id);
893 
894 	if (PREV_INSN (last) != insn)
895 	  {
896 	    if (ps_ij->id < ps->g->num_nodes)
897 	      reorder_insns_nobb (ps_first_note (ps, ps_ij->id), insn,
898 				  PREV_INSN (last));
899 	    else
900 	      add_insn_before (insn, last, NULL);
901 	  }
902       }
903 }
904 
905 /* Set bitmaps TMP_FOLLOW and TMP_PRECEDE to MUST_FOLLOW and MUST_PRECEDE
906    respectively only if cycle C falls on the border of the scheduling
907    window boundaries marked by START and END cycles.  STEP is the
908    direction of the window.  */
909 static inline void
set_must_precede_follow(sbitmap * tmp_follow,sbitmap must_follow,sbitmap * tmp_precede,sbitmap must_precede,int c,int start,int end,int step)910 set_must_precede_follow (sbitmap *tmp_follow, sbitmap must_follow,
911 			 sbitmap *tmp_precede, sbitmap must_precede, int c,
912 			 int start, int end, int step)
913 {
914   *tmp_precede = NULL;
915   *tmp_follow = NULL;
916 
917   if (c == start)
918     {
919       if (step == 1)
920 	*tmp_precede = must_precede;
921       else			/* step == -1.  */
922 	*tmp_follow = must_follow;
923     }
924   if (c == end - step)
925     {
926       if (step == 1)
927 	*tmp_follow = must_follow;
928       else			/* step == -1.  */
929 	*tmp_precede = must_precede;
930     }
931 
932 }
933 
934 /* Return True if the branch can be moved to row ii-1 while
935    normalizing the partial schedule PS to start from cycle zero and thus
936    optimize the SC.  Otherwise return False.  */
937 static bool
optimize_sc(partial_schedule_ptr ps,ddg_ptr g)938 optimize_sc (partial_schedule_ptr ps, ddg_ptr g)
939 {
940   int amount = PS_MIN_CYCLE (ps);
941   int start, end, step;
942   int ii = ps->ii;
943   bool ok = false;
944   int stage_count, stage_count_curr;
945 
946   /* Compare the SC after normalization and SC after bringing the branch
947      to row ii-1.  If they are equal just bail out.  */
948   stage_count = calculate_stage_count (ps, amount);
949   stage_count_curr =
950     calculate_stage_count (ps, SCHED_TIME (g->closing_branch->cuid) - (ii - 1));
951 
952   if (stage_count == stage_count_curr)
953     {
954       if (dump_file)
955 	fprintf (dump_file, "SMS SC already optimized.\n");
956 
957       return false;
958     }
959 
960   if (dump_file)
961     {
962       fprintf (dump_file, "SMS Trying to optimize branch location\n");
963       fprintf (dump_file, "SMS partial schedule before trial:\n");
964       print_partial_schedule (ps, dump_file);
965     }
966 
967   /* First, normalize the partial scheduling.  */
968   reset_sched_times (ps, amount);
969   rotate_partial_schedule (ps, amount);
970   if (dump_file)
971     {
972       fprintf (dump_file,
973 	       "SMS partial schedule after normalization (ii, %d, SC %d):\n",
974 	       ii, stage_count);
975       print_partial_schedule (ps, dump_file);
976     }
977 
978   if (SMODULO (SCHED_TIME (g->closing_branch->cuid), ii) == ii - 1)
979     return true;
980 
981   auto_sbitmap sched_nodes (g->num_nodes);
982   bitmap_ones (sched_nodes);
983 
984   /* Calculate the new placement of the branch.  It should be in row
985      ii-1 and fall into it's scheduling window.  */
986   if (get_sched_window (ps, g->closing_branch, sched_nodes, ii, &start,
987 			&step, &end) == 0)
988     {
989       bool success;
990       ps_insn_ptr next_ps_i;
991       int branch_cycle = SCHED_TIME (g->closing_branch->cuid);
992       int row = SMODULO (branch_cycle, ps->ii);
993       int num_splits = 0;
994       sbitmap tmp_precede, tmp_follow;
995       int min_cycle, c;
996 
997       if (dump_file)
998 	fprintf (dump_file, "\nTrying to schedule node %d "
999 		 "INSN = %d  in (%d .. %d) step %d\n",
1000 		 g->closing_branch->cuid,
1001 		 (INSN_UID (g->closing_branch->insn)), start, end, step);
1002 
1003       gcc_assert ((step > 0 && start < end) || (step < 0 && start > end));
1004       if (step == 1)
1005 	{
1006 	  c = start + ii - SMODULO (start, ii) - 1;
1007 	  gcc_assert (c >= start);
1008 	  if (c >= end)
1009 	    {
1010 	      if (dump_file)
1011 		fprintf (dump_file,
1012 			 "SMS failed to schedule branch at cycle: %d\n", c);
1013 	      return false;
1014 	    }
1015 	}
1016       else
1017 	{
1018 	  c = start - SMODULO (start, ii) - 1;
1019 	  gcc_assert (c <= start);
1020 
1021 	  if (c <= end)
1022 	    {
1023 	      if (dump_file)
1024 		fprintf (dump_file,
1025 			 "SMS failed to schedule branch at cycle: %d\n", c);
1026 	      return false;
1027 	    }
1028 	}
1029 
1030       auto_sbitmap must_precede (g->num_nodes);
1031       auto_sbitmap must_follow (g->num_nodes);
1032 
1033       /* Try to schedule the branch is it's new cycle.  */
1034       calculate_must_precede_follow (g->closing_branch, start, end,
1035 				     step, ii, sched_nodes,
1036 				     must_precede, must_follow);
1037 
1038       set_must_precede_follow (&tmp_follow, must_follow, &tmp_precede,
1039 			       must_precede, c, start, end, step);
1040 
1041       /* Find the element in the partial schedule related to the closing
1042          branch so we can remove it from it's current cycle.  */
1043       for (next_ps_i = ps->rows[row];
1044 	   next_ps_i; next_ps_i = next_ps_i->next_in_row)
1045 	if (next_ps_i->id == g->closing_branch->cuid)
1046 	  break;
1047 
1048       min_cycle = PS_MIN_CYCLE (ps) - SMODULO (PS_MIN_CYCLE (ps), ps->ii);
1049       remove_node_from_ps (ps, next_ps_i);
1050       success =
1051 	try_scheduling_node_in_cycle (ps, g->closing_branch->cuid, c,
1052 				      sched_nodes, &num_splits,
1053 				      tmp_precede, tmp_follow);
1054       gcc_assert (num_splits == 0);
1055       if (!success)
1056 	{
1057 	  if (dump_file)
1058 	    fprintf (dump_file,
1059 		     "SMS failed to schedule branch at cycle: %d, "
1060 		     "bringing it back to cycle %d\n", c, branch_cycle);
1061 
1062 	  /* The branch was failed to be placed in row ii - 1.
1063 	     Put it back in it's original place in the partial
1064 	     schedualing.  */
1065 	  set_must_precede_follow (&tmp_follow, must_follow, &tmp_precede,
1066 				   must_precede, branch_cycle, start, end,
1067 				   step);
1068 	  success =
1069 	    try_scheduling_node_in_cycle (ps, g->closing_branch->cuid,
1070 					  branch_cycle, sched_nodes,
1071 					  &num_splits, tmp_precede,
1072 					  tmp_follow);
1073 	  gcc_assert (success && (num_splits == 0));
1074 	  ok = false;
1075 	}
1076       else
1077 	{
1078 	  /* The branch is placed in row ii - 1.  */
1079 	  if (dump_file)
1080 	    fprintf (dump_file,
1081 		     "SMS success in moving branch to cycle %d\n", c);
1082 
1083 	  update_node_sched_params (g->closing_branch->cuid, ii, c,
1084 				    PS_MIN_CYCLE (ps));
1085 	  ok = true;
1086 	}
1087 
1088       /* This might have been added to a new first stage.  */
1089       if (PS_MIN_CYCLE (ps) < min_cycle)
1090 	reset_sched_times (ps, 0);
1091     }
1092 
1093   return ok;
1094 }
1095 
1096 static void
duplicate_insns_of_cycles(partial_schedule_ptr ps,int from_stage,int to_stage,rtx count_reg)1097 duplicate_insns_of_cycles (partial_schedule_ptr ps, int from_stage,
1098 			   int to_stage, rtx count_reg)
1099 {
1100   int row;
1101   ps_insn_ptr ps_ij;
1102 
1103   for (row = 0; row < ps->ii; row++)
1104     for (ps_ij = ps->rows[row]; ps_ij; ps_ij = ps_ij->next_in_row)
1105       {
1106 	int u = ps_ij->id;
1107 	int first_u, last_u;
1108 	rtx_insn *u_insn;
1109 
1110         /* Do not duplicate any insn which refers to count_reg as it
1111            belongs to the control part.
1112            The closing branch is scheduled as well and thus should
1113            be ignored.
1114            TODO: This should be done by analyzing the control part of
1115            the loop.  */
1116 	u_insn = ps_rtl_insn (ps, u);
1117         if (reg_mentioned_p (count_reg, u_insn)
1118             || JUMP_P (u_insn))
1119           continue;
1120 
1121 	first_u = SCHED_STAGE (u);
1122 	last_u = first_u + ps_num_consecutive_stages (ps, u) - 1;
1123 	if (from_stage <= last_u && to_stage >= first_u)
1124 	  {
1125 	    if (u < ps->g->num_nodes)
1126 	      duplicate_insn_chain (ps_first_note (ps, u), u_insn);
1127 	    else
1128 	      emit_insn (copy_rtx (PATTERN (u_insn)));
1129 	  }
1130       }
1131 }
1132 
1133 
1134 /* Generate the instructions (including reg_moves) for prolog & epilog.  */
1135 static void
generate_prolog_epilog(partial_schedule_ptr ps,struct loop * loop,rtx count_reg,bool adjust_init)1136 generate_prolog_epilog (partial_schedule_ptr ps, struct loop *loop,
1137 			rtx count_reg, bool adjust_init)
1138 {
1139   int i;
1140   int last_stage = PS_STAGE_COUNT (ps) - 1;
1141   edge e;
1142 
1143   /* Generate the prolog, inserting its insns on the loop-entry edge.  */
1144   start_sequence ();
1145 
1146   if (adjust_init)
1147     {
1148       /* Generate instructions at the beginning of the prolog to
1149 	 adjust the loop count by STAGE_COUNT.  If loop count is constant
1150 	 and it not used anywhere in prologue, this constant is adjusted by
1151 	 STAGE_COUNT outside of generate_prolog_epilog function.  */
1152       rtx sub_reg = NULL_RTX;
1153 
1154       sub_reg = expand_simple_binop (GET_MODE (count_reg), MINUS, count_reg,
1155 				     gen_int_mode (last_stage,
1156 						   GET_MODE (count_reg)),
1157                                      count_reg, 1, OPTAB_DIRECT);
1158       gcc_assert (REG_P (sub_reg));
1159       if (REGNO (sub_reg) != REGNO (count_reg))
1160         emit_move_insn (count_reg, sub_reg);
1161     }
1162 
1163   for (i = 0; i < last_stage; i++)
1164     duplicate_insns_of_cycles (ps, 0, i, count_reg);
1165 
1166   /* Put the prolog on the entry edge.  */
1167   e = loop_preheader_edge (loop);
1168   split_edge_and_insert (e, get_insns ());
1169   if (!flag_resched_modulo_sched)
1170     e->dest->flags |= BB_DISABLE_SCHEDULE;
1171 
1172   end_sequence ();
1173 
1174   /* Generate the epilog, inserting its insns on the loop-exit edge.  */
1175   start_sequence ();
1176 
1177   for (i = 0; i < last_stage; i++)
1178     duplicate_insns_of_cycles (ps, i + 1, last_stage, count_reg);
1179 
1180   /* Put the epilogue on the exit edge.  */
1181   gcc_assert (single_exit (loop));
1182   e = single_exit (loop);
1183   split_edge_and_insert (e, get_insns ());
1184   if (!flag_resched_modulo_sched)
1185     e->dest->flags |= BB_DISABLE_SCHEDULE;
1186 
1187   end_sequence ();
1188 }
1189 
1190 /* Mark LOOP as software pipelined so the later
1191    scheduling passes don't touch it.  */
1192 static void
mark_loop_unsched(struct loop * loop)1193 mark_loop_unsched (struct loop *loop)
1194 {
1195   unsigned i;
1196   basic_block *bbs = get_loop_body (loop);
1197 
1198   for (i = 0; i < loop->num_nodes; i++)
1199     bbs[i]->flags |= BB_DISABLE_SCHEDULE;
1200 
1201   free (bbs);
1202 }
1203 
1204 /* Return true if all the BBs of the loop are empty except the
1205    loop header.  */
1206 static bool
loop_single_full_bb_p(struct loop * loop)1207 loop_single_full_bb_p (struct loop *loop)
1208 {
1209   unsigned i;
1210   basic_block *bbs = get_loop_body (loop);
1211 
1212   for (i = 0; i < loop->num_nodes ; i++)
1213     {
1214       rtx_insn *head, *tail;
1215       bool empty_bb = true;
1216 
1217       if (bbs[i] == loop->header)
1218         continue;
1219 
1220       /* Make sure that basic blocks other than the header
1221          have only notes labels or jumps.  */
1222       get_ebb_head_tail (bbs[i], bbs[i], &head, &tail);
1223       for (; head != NEXT_INSN (tail); head = NEXT_INSN (head))
1224         {
1225           if (NOTE_P (head) || LABEL_P (head)
1226  	      || (INSN_P (head) && (DEBUG_INSN_P (head) || JUMP_P (head))))
1227  	    continue;
1228  	  empty_bb = false;
1229  	  break;
1230         }
1231 
1232       if (! empty_bb)
1233         {
1234           free (bbs);
1235           return false;
1236         }
1237     }
1238   free (bbs);
1239   return true;
1240 }
1241 
1242 /* Dump file:line from INSN's location info to dump_file.  */
1243 
1244 static void
dump_insn_location(rtx_insn * insn)1245 dump_insn_location (rtx_insn *insn)
1246 {
1247   if (dump_file && INSN_HAS_LOCATION (insn))
1248     {
1249       expanded_location xloc = insn_location (insn);
1250       fprintf (dump_file, " %s:%i", xloc.file, xloc.line);
1251     }
1252 }
1253 
1254 /* A simple loop from SMS point of view; it is a loop that is composed of
1255    either a single basic block or two BBs - a header and a latch.  */
1256 #define SIMPLE_SMS_LOOP_P(loop) ((loop->num_nodes < 3 ) 		    \
1257 				  && (EDGE_COUNT (loop->latch->preds) == 1) \
1258                                   && (EDGE_COUNT (loop->latch->succs) == 1))
1259 
1260 /* Return true if the loop is in its canonical form and false if not.
1261    i.e. SIMPLE_SMS_LOOP_P and have one preheader block, and single exit.  */
1262 static bool
loop_canon_p(struct loop * loop)1263 loop_canon_p (struct loop *loop)
1264 {
1265 
1266   if (loop->inner || !loop_outer (loop))
1267   {
1268     if (dump_file)
1269       fprintf (dump_file, "SMS loop inner or !loop_outer\n");
1270     return false;
1271   }
1272 
1273   if (!single_exit (loop))
1274     {
1275       if (dump_file)
1276 	{
1277 	  rtx_insn *insn = BB_END (loop->header);
1278 
1279 	  fprintf (dump_file, "SMS loop many exits");
1280 	  dump_insn_location (insn);
1281 	  fprintf (dump_file, "\n");
1282 	}
1283       return false;
1284     }
1285 
1286   if (! SIMPLE_SMS_LOOP_P (loop) && ! loop_single_full_bb_p (loop))
1287     {
1288       if (dump_file)
1289 	{
1290 	  rtx_insn *insn = BB_END (loop->header);
1291 
1292 	  fprintf (dump_file, "SMS loop many BBs.");
1293 	  dump_insn_location (insn);
1294 	  fprintf (dump_file, "\n");
1295 	}
1296       return false;
1297     }
1298 
1299     return true;
1300 }
1301 
1302 /* If there are more than one entry for the loop,
1303    make it one by splitting the first entry edge and
1304    redirecting the others to the new BB.  */
1305 static void
canon_loop(struct loop * loop)1306 canon_loop (struct loop *loop)
1307 {
1308   edge e;
1309   edge_iterator i;
1310 
1311   /* Avoid annoying special cases of edges going to exit
1312      block.  */
1313   FOR_EACH_EDGE (e, i, EXIT_BLOCK_PTR_FOR_FN (cfun)->preds)
1314     if ((e->flags & EDGE_FALLTHRU) && (EDGE_COUNT (e->src->succs) > 1))
1315       split_edge (e);
1316 
1317   if (loop->latch == loop->header
1318       || EDGE_COUNT (loop->latch->succs) > 1)
1319     {
1320       FOR_EACH_EDGE (e, i, loop->header->preds)
1321         if (e->src == loop->latch)
1322           break;
1323       split_edge (e);
1324     }
1325 }
1326 
1327 /* Setup infos.  */
1328 static void
setup_sched_infos(void)1329 setup_sched_infos (void)
1330 {
1331   memcpy (&sms_common_sched_info, &haifa_common_sched_info,
1332 	  sizeof (sms_common_sched_info));
1333   sms_common_sched_info.sched_pass_id = SCHED_SMS_PASS;
1334   common_sched_info = &sms_common_sched_info;
1335 
1336   sched_deps_info = &sms_sched_deps_info;
1337   current_sched_info = &sms_sched_info;
1338 }
1339 
1340 /* Probability in % that the sms-ed loop rolls enough so that optimized
1341    version may be entered.  Just a guess.  */
1342 #define PROB_SMS_ENOUGH_ITERATIONS 80
1343 
1344 /* Used to calculate the upper bound of ii.  */
1345 #define MAXII_FACTOR 2
1346 
1347 /* Main entry point, perform SMS scheduling on the loops of the function
1348    that consist of single basic blocks.  */
1349 static void
sms_schedule(void)1350 sms_schedule (void)
1351 {
1352   rtx_insn *insn;
1353   ddg_ptr *g_arr, g;
1354   int * node_order;
1355   int maxii, max_asap;
1356   partial_schedule_ptr ps;
1357   basic_block bb = NULL;
1358   struct loop *loop;
1359   basic_block condition_bb = NULL;
1360   edge latch_edge;
1361   HOST_WIDE_INT trip_count, max_trip_count;
1362   HARD_REG_SET prohibited_regs;
1363 
1364   loop_optimizer_init (LOOPS_HAVE_PREHEADERS
1365 		       | LOOPS_HAVE_RECORDED_EXITS);
1366   if (number_of_loops (cfun) <= 1)
1367     {
1368       loop_optimizer_finalize ();
1369       return;  /* There are no loops to schedule.  */
1370     }
1371 
1372   /* Initialize issue_rate.  */
1373   if (targetm.sched.issue_rate)
1374     {
1375       int temp = reload_completed;
1376 
1377       reload_completed = 1;
1378       issue_rate = targetm.sched.issue_rate ();
1379       reload_completed = temp;
1380     }
1381   else
1382     issue_rate = 1;
1383 
1384   /* Initialize the scheduler.  */
1385   setup_sched_infos ();
1386   haifa_sched_init ();
1387 
1388   /* Allocate memory to hold the DDG array one entry for each loop.
1389      We use loop->num as index into this array.  */
1390   g_arr = XCNEWVEC (ddg_ptr, number_of_loops (cfun));
1391 
1392   REG_SET_TO_HARD_REG_SET (prohibited_regs, &df->regular_block_artificial_uses);
1393 
1394   if (dump_file)
1395   {
1396     fprintf (dump_file, "\n\nSMS analysis phase\n");
1397     fprintf (dump_file, "===================\n\n");
1398   }
1399 
1400   /* Build DDGs for all the relevant loops and hold them in G_ARR
1401      indexed by the loop index.  */
1402   FOR_EACH_LOOP (loop, 0)
1403     {
1404       rtx_insn *head, *tail;
1405       rtx count_reg;
1406 
1407       /* For debugging.  */
1408       if (dbg_cnt (sms_sched_loop) == false)
1409         {
1410           if (dump_file)
1411             fprintf (dump_file, "SMS reached max limit... \n");
1412 
1413 	  break;
1414         }
1415 
1416       if (dump_file)
1417 	{
1418 	  rtx_insn *insn = BB_END (loop->header);
1419 
1420 	  fprintf (dump_file, "SMS loop num: %d", loop->num);
1421 	  dump_insn_location (insn);
1422 	  fprintf (dump_file, "\n");
1423 	}
1424 
1425       if (! loop_canon_p (loop))
1426         continue;
1427 
1428       if (! loop_single_full_bb_p (loop))
1429       {
1430         if (dump_file)
1431           fprintf (dump_file, "SMS not loop_single_full_bb_p\n");
1432 	continue;
1433       }
1434 
1435       bb = loop->header;
1436 
1437       get_ebb_head_tail (bb, bb, &head, &tail);
1438       latch_edge = loop_latch_edge (loop);
1439       gcc_assert (single_exit (loop));
1440       trip_count = get_estimated_loop_iterations_int (loop);
1441       max_trip_count = get_max_loop_iterations_int (loop);
1442 
1443       /* Perform SMS only on loops that their average count is above threshold.  */
1444 
1445       if ( latch_edge->count () > profile_count::zero ()
1446           && (latch_edge->count()
1447 	      < single_exit (loop)->count ().apply_scale
1448 				 (SMS_LOOP_AVERAGE_COUNT_THRESHOLD, 1)))
1449 	{
1450 	  if (dump_file)
1451 	    {
1452 	      dump_insn_location (tail);
1453 	      fprintf (dump_file, "\nSMS single-bb-loop\n");
1454 	      if (profile_info && flag_branch_probabilities)
1455 	    	{
1456 	      	  fprintf (dump_file, "SMS loop-count ");
1457 	      	  fprintf (dump_file, "%" PRId64,
1458 	             	   (int64_t) bb->count.to_gcov_type ());
1459 	      	  fprintf (dump_file, "\n");
1460                   fprintf (dump_file, "SMS trip-count ");
1461                   fprintf (dump_file, "%" PRId64 "max %" PRId64,
1462                            (int64_t) trip_count, (int64_t) max_trip_count);
1463                   fprintf (dump_file, "\n");
1464 	    	}
1465 	    }
1466           continue;
1467         }
1468 
1469       /* Make sure this is a doloop.  */
1470       if ( !(count_reg = doloop_register_get (head, tail)))
1471       {
1472         if (dump_file)
1473           fprintf (dump_file, "SMS doloop_register_get failed\n");
1474 	continue;
1475       }
1476 
1477       /* Don't handle BBs with calls or barriers
1478 	 or !single_set with the exception of do-loop control part insns.
1479          ??? Should handle insns defining subregs.  */
1480       for (insn = head; insn != NEXT_INSN (tail); insn = NEXT_INSN (insn))
1481 	{
1482 	  if (INSN_P (insn))
1483 	    {
1484 	      HARD_REG_SET regs;
1485 	      CLEAR_HARD_REG_SET (regs);
1486 	      note_stores (PATTERN (insn), record_hard_reg_sets, &regs);
1487 	      if (hard_reg_set_intersect_p (regs, prohibited_regs))
1488 		break;
1489 	    }
1490 
1491 	  if (CALL_P (insn)
1492 	      || BARRIER_P (insn)
1493 	      || (INSN_P (insn) && single_set (insn)
1494 		  && GET_CODE (SET_DEST (single_set (insn))) == SUBREG)
1495 	      /* Not a single set.  */
1496 	      || (NONDEBUG_INSN_P (insn) && !JUMP_P (insn)
1497 		  && !single_set (insn) && GET_CODE (PATTERN (insn)) != USE
1498 		  /* But non-single-set allowed in one special case.  */
1499 		  && (insn != prev_nondebug_insn (tail)
1500 		      || !reg_mentioned_p (count_reg, insn))))
1501 	    break;
1502 	}
1503 
1504       if (insn != NEXT_INSN (tail))
1505 	{
1506 	  if (dump_file)
1507 	    {
1508 	      if (CALL_P (insn))
1509 		fprintf (dump_file, "SMS loop-with-call\n");
1510 	      else if (BARRIER_P (insn))
1511 		fprintf (dump_file, "SMS loop-with-barrier\n");
1512 	      else if (INSN_P (insn) && single_set (insn)
1513 		       && GET_CODE (SET_DEST (single_set (insn))) == SUBREG)
1514 		fprintf (dump_file, "SMS loop with subreg in lhs\n");
1515 	      else
1516 		fprintf (dump_file,
1517 			 "SMS loop-with-not-single-set-or-prohibited-reg\n");
1518 
1519 	      print_rtl_single (dump_file, insn);
1520 	    }
1521 
1522 	  continue;
1523 	}
1524 
1525       /* Always schedule the closing branch with the rest of the
1526          instructions. The branch is rotated to be in row ii-1 at the
1527          end of the scheduling procedure to make sure it's the last
1528          instruction in the iteration.  */
1529       if (! (g = create_ddg (bb, 1)))
1530         {
1531           if (dump_file)
1532 	    fprintf (dump_file, "SMS create_ddg failed\n");
1533 	  continue;
1534         }
1535 
1536       g_arr[loop->num] = g;
1537       if (dump_file)
1538         fprintf (dump_file, "...OK\n");
1539 
1540     }
1541   if (dump_file)
1542   {
1543     fprintf (dump_file, "\nSMS transformation phase\n");
1544     fprintf (dump_file, "=========================\n\n");
1545   }
1546 
1547   /* We don't want to perform SMS on new loops - created by versioning.  */
1548   FOR_EACH_LOOP (loop, 0)
1549     {
1550       rtx_insn *head, *tail;
1551       rtx count_reg;
1552       rtx_insn *count_init;
1553       int mii, rec_mii, stage_count, min_cycle;
1554       int64_t loop_count = 0;
1555       bool opt_sc_p, adjust_inplace = false;
1556       basic_block pre_header;
1557 
1558       if (! (g = g_arr[loop->num]))
1559         continue;
1560 
1561       if (dump_file)
1562 	{
1563 	  rtx_insn *insn = BB_END (loop->header);
1564 
1565 	  fprintf (dump_file, "SMS loop num: %d", loop->num);
1566 	  dump_insn_location (insn);
1567 	  fprintf (dump_file, "\n");
1568 
1569 	  print_ddg (dump_file, g);
1570 	}
1571 
1572       get_ebb_head_tail (loop->header, loop->header, &head, &tail);
1573 
1574       latch_edge = loop_latch_edge (loop);
1575       gcc_assert (single_exit (loop));
1576       trip_count = get_estimated_loop_iterations_int (loop);
1577       max_trip_count = get_max_loop_iterations_int (loop);
1578 
1579       if (dump_file)
1580 	{
1581 	  dump_insn_location (tail);
1582 	  fprintf (dump_file, "\nSMS single-bb-loop\n");
1583 	  if (profile_info && flag_branch_probabilities)
1584 	    {
1585 	      fprintf (dump_file, "SMS loop-count ");
1586 	      fprintf (dump_file, "%" PRId64,
1587 	               (int64_t) bb->count.to_gcov_type ());
1588 	      fprintf (dump_file, "\n");
1589 	    }
1590 	  fprintf (dump_file, "SMS doloop\n");
1591 	  fprintf (dump_file, "SMS built-ddg %d\n", g->num_nodes);
1592           fprintf (dump_file, "SMS num-loads %d\n", g->num_loads);
1593           fprintf (dump_file, "SMS num-stores %d\n", g->num_stores);
1594 	}
1595 
1596 
1597       count_reg = doloop_register_get (head, tail);
1598       gcc_assert (count_reg);
1599 
1600       pre_header = loop_preheader_edge (loop)->src;
1601       count_init = const_iteration_count (count_reg, pre_header, &loop_count,
1602 					  &adjust_inplace);
1603 
1604       if (dump_file && count_init)
1605         {
1606           fprintf (dump_file, "SMS const-doloop ");
1607           fprintf (dump_file, "%" PRId64,
1608 		     loop_count);
1609           fprintf (dump_file, "\n");
1610         }
1611 
1612       node_order = XNEWVEC (int, g->num_nodes);
1613 
1614       mii = 1; /* Need to pass some estimate of mii.  */
1615       rec_mii = sms_order_nodes (g, mii, node_order, &max_asap);
1616       mii = MAX (res_MII (g), rec_mii);
1617       mii = MAX (mii, 1);
1618       maxii = MAX (max_asap, MAXII_FACTOR * mii);
1619 
1620       if (dump_file)
1621 	fprintf (dump_file, "SMS iis %d %d %d (rec_mii, mii, maxii)\n",
1622 		 rec_mii, mii, maxii);
1623 
1624       for (;;)
1625 	{
1626 	  set_node_sched_params (g);
1627 
1628 	  stage_count = 0;
1629 	  opt_sc_p = false;
1630 	  ps = sms_schedule_by_order (g, mii, maxii, node_order);
1631 
1632 	  if (ps)
1633 	    {
1634 	      /* Try to achieve optimized SC by normalizing the partial
1635 		 schedule (having the cycles start from cycle zero).
1636 		 The branch location must be placed in row ii-1 in the
1637 		 final scheduling.	If failed, shift all instructions to
1638 		 position the branch in row ii-1.  */
1639 	      opt_sc_p = optimize_sc (ps, g);
1640 	      if (opt_sc_p)
1641 		stage_count = calculate_stage_count (ps, 0);
1642 	      else
1643 		{
1644 		  /* Bring the branch to cycle ii-1.  */
1645 		  int amount = (SCHED_TIME (g->closing_branch->cuid)
1646 				- (ps->ii - 1));
1647 
1648 		  if (dump_file)
1649 		    fprintf (dump_file, "SMS schedule branch at cycle ii-1\n");
1650 
1651 		  stage_count = calculate_stage_count (ps, amount);
1652 		}
1653 
1654 	      gcc_assert (stage_count >= 1);
1655 	    }
1656 
1657 	  /* The default value of PARAM_SMS_MIN_SC is 2 as stage count of
1658 	     1 means that there is no interleaving between iterations thus
1659 	     we let the scheduling passes do the job in this case.  */
1660 	  if (stage_count < PARAM_VALUE (PARAM_SMS_MIN_SC)
1661 	      || (count_init && (loop_count <= stage_count))
1662 	      || (max_trip_count >= 0 && max_trip_count <= stage_count)
1663 	      || (trip_count >= 0 && trip_count <= stage_count))
1664 	    {
1665 	      if (dump_file)
1666 		{
1667 		  fprintf (dump_file, "SMS failed... \n");
1668 		  fprintf (dump_file, "SMS sched-failed (stage-count=%d,"
1669 			   " loop-count=", stage_count);
1670 		  fprintf (dump_file, "%" PRId64, loop_count);
1671 		  fprintf (dump_file, ", trip-count=");
1672 		  fprintf (dump_file, "%" PRId64 "max %" PRId64,
1673 			   (int64_t) trip_count, (int64_t) max_trip_count);
1674 		  fprintf (dump_file, ")\n");
1675 		}
1676 	      break;
1677 	    }
1678 
1679           if (!opt_sc_p)
1680             {
1681 	      /* Rotate the partial schedule to have the branch in row ii-1.  */
1682               int amount = SCHED_TIME (g->closing_branch->cuid) - (ps->ii - 1);
1683 
1684               reset_sched_times (ps, amount);
1685               rotate_partial_schedule (ps, amount);
1686             }
1687 
1688 	  set_columns_for_ps (ps);
1689 
1690 	  min_cycle = PS_MIN_CYCLE (ps) - SMODULO (PS_MIN_CYCLE (ps), ps->ii);
1691 	  if (!schedule_reg_moves (ps))
1692 	    {
1693 	      mii = ps->ii + 1;
1694 	      free_partial_schedule (ps);
1695 	      continue;
1696 	    }
1697 
1698 	  /* Moves that handle incoming values might have been added
1699 	     to a new first stage.  Bump the stage count if so.
1700 
1701 	     ??? Perhaps we could consider rotating the schedule here
1702 	     instead?  */
1703 	  if (PS_MIN_CYCLE (ps) < min_cycle)
1704 	    {
1705 	      reset_sched_times (ps, 0);
1706 	      stage_count++;
1707 	    }
1708 
1709 	  /* The stage count should now be correct without rotation.  */
1710 	  gcc_checking_assert (stage_count == calculate_stage_count (ps, 0));
1711 	  PS_STAGE_COUNT (ps) = stage_count;
1712 
1713 	  canon_loop (loop);
1714 
1715           if (dump_file)
1716             {
1717 	      dump_insn_location (tail);
1718 	      fprintf (dump_file, " SMS succeeded %d %d (with ii, sc)\n",
1719 		       ps->ii, stage_count);
1720 	      print_partial_schedule (ps, dump_file);
1721 	    }
1722 
1723 	  if (count_init)
1724 	    {
1725 	       if (adjust_inplace)
1726 		{
1727 		  /* When possible, set new iteration count of loop kernel in
1728 		     place.  Otherwise, generate_prolog_epilog creates an insn
1729 		     to adjust.  */
1730 		  SET_SRC (single_set (count_init)) = GEN_INT (loop_count
1731 							    - stage_count + 1);
1732 		}
1733 	    }
1734 	  else
1735             {
1736 	      /* case the BCT count is not known , Do loop-versioning */
1737 	      rtx comp_rtx = gen_rtx_GT (VOIDmode, count_reg,
1738 					 gen_int_mode (stage_count,
1739 						       GET_MODE (count_reg)));
1740 	      profile_probability prob = profile_probability::guessed_always ()
1741 				.apply_scale (PROB_SMS_ENOUGH_ITERATIONS, 100);
1742 
1743 	      loop_version (loop, comp_rtx, &condition_bb,
1744 	  		    prob, prob.invert (),
1745 			    prob, prob.invert (), true);
1746 	    }
1747 
1748 	  /* Now apply the scheduled kernel to the RTL of the loop.  */
1749 	  permute_partial_schedule (ps, g->closing_branch->first_note);
1750 
1751           /* Mark this loop as software pipelined so the later
1752 	     scheduling passes don't touch it.  */
1753 	  if (! flag_resched_modulo_sched)
1754 	    mark_loop_unsched (loop);
1755 
1756 	  /* The life-info is not valid any more.  */
1757 	  df_set_bb_dirty (g->bb);
1758 
1759 	  apply_reg_moves (ps);
1760 	  if (dump_file)
1761 	    print_node_sched_params (dump_file, g->num_nodes, ps);
1762 	  /* Generate prolog and epilog.  */
1763 	  generate_prolog_epilog (ps, loop, count_reg, !adjust_inplace);
1764 	  break;
1765 	}
1766 
1767       free_partial_schedule (ps);
1768       node_sched_param_vec.release ();
1769       free (node_order);
1770       free_ddg (g);
1771     }
1772 
1773   free (g_arr);
1774 
1775   /* Release scheduler data, needed until now because of DFA.  */
1776   haifa_sched_finish ();
1777   loop_optimizer_finalize ();
1778 }
1779 
1780 /* The SMS scheduling algorithm itself
1781    -----------------------------------
1782    Input: 'O' an ordered list of insns of a loop.
1783    Output: A scheduling of the loop - kernel, prolog, and epilogue.
1784 
1785    'Q' is the empty Set
1786    'PS' is the partial schedule; it holds the currently scheduled nodes with
1787 	their cycle/slot.
1788    'PSP' previously scheduled predecessors.
1789    'PSS' previously scheduled successors.
1790    't(u)' the cycle where u is scheduled.
1791    'l(u)' is the latency of u.
1792    'd(v,u)' is the dependence distance from v to u.
1793    'ASAP(u)' the earliest time at which u could be scheduled as computed in
1794 	     the node ordering phase.
1795    'check_hardware_resources_conflicts(u, PS, c)'
1796 			     run a trace around cycle/slot through DFA model
1797 			     to check resource conflicts involving instruction u
1798 			     at cycle c given the partial schedule PS.
1799    'add_to_partial_schedule_at_time(u, PS, c)'
1800 			     Add the node/instruction u to the partial schedule
1801 			     PS at time c.
1802    'calculate_register_pressure(PS)'
1803 			     Given a schedule of instructions, calculate the register
1804 			     pressure it implies.  One implementation could be the
1805 			     maximum number of overlapping live ranges.
1806    'maxRP' The maximum allowed register pressure, it is usually derived from the number
1807 	   registers available in the hardware.
1808 
1809    1. II = MII.
1810    2. PS = empty list
1811    3. for each node u in O in pre-computed order
1812    4.   if (PSP(u) != Q && PSS(u) == Q) then
1813    5.     Early_start(u) = max ( t(v) + l(v) - d(v,u)*II ) over all every v in PSP(u).
1814    6.     start = Early_start; end = Early_start + II - 1; step = 1
1815    11.  else if (PSP(u) == Q && PSS(u) != Q) then
1816    12.      Late_start(u) = min ( t(v) - l(v) + d(v,u)*II ) over all every v in PSS(u).
1817    13.     start = Late_start; end = Late_start - II + 1; step = -1
1818    14.  else if (PSP(u) != Q && PSS(u) != Q) then
1819    15.     Early_start(u) = max ( t(v) + l(v) - d(v,u)*II ) over all every v in PSP(u).
1820    16.     Late_start(u) = min ( t(v) - l(v) + d(v,u)*II ) over all every v in PSS(u).
1821    17.     start = Early_start;
1822    18.     end = min(Early_start + II - 1 , Late_start);
1823    19.     step = 1
1824    20.     else "if (PSP(u) == Q && PSS(u) == Q)"
1825    21.	  start = ASAP(u); end = start + II - 1; step = 1
1826    22.  endif
1827 
1828    23.  success = false
1829    24.  for (c = start ; c != end ; c += step)
1830    25.     if check_hardware_resources_conflicts(u, PS, c) then
1831    26.       add_to_partial_schedule_at_time(u, PS, c)
1832    27.       success = true
1833    28.       break
1834    29.     endif
1835    30.  endfor
1836    31.  if (success == false) then
1837    32.    II = II + 1
1838    33.    if (II > maxII) then
1839    34.       finish - failed to schedule
1840    35.	 endif
1841    36.    goto 2.
1842    37.  endif
1843    38. endfor
1844    39. if (calculate_register_pressure(PS) > maxRP) then
1845    40.    goto 32.
1846    41. endif
1847    42. compute epilogue & prologue
1848    43. finish - succeeded to schedule
1849 
1850    ??? The algorithm restricts the scheduling window to II cycles.
1851    In rare cases, it may be better to allow windows of II+1 cycles.
1852    The window would then start and end on the same row, but with
1853    different "must precede" and "must follow" requirements.  */
1854 
1855 /* A limit on the number of cycles that resource conflicts can span.  ??? Should
1856    be provided by DFA, and be dependent on the type of insn scheduled.  Currently
1857    set to 0 to save compile time.  */
1858 #define DFA_HISTORY SMS_DFA_HISTORY
1859 
1860 /* A threshold for the number of repeated unsuccessful attempts to insert
1861    an empty row, before we flush the partial schedule and start over.  */
1862 #define MAX_SPLIT_NUM 10
1863 /* Given the partial schedule PS, this function calculates and returns the
1864    cycles in which we can schedule the node with the given index I.
1865    NOTE: Here we do the backtracking in SMS, in some special cases. We have
1866    noticed that there are several cases in which we fail    to SMS the loop
1867    because the sched window of a node is empty    due to tight data-deps. In
1868    such cases we want to unschedule    some of the predecessors/successors
1869    until we get non-empty    scheduling window.  It returns -1 if the
1870    scheduling window is empty and zero otherwise.  */
1871 
1872 static int
get_sched_window(partial_schedule_ptr ps,ddg_node_ptr u_node,sbitmap sched_nodes,int ii,int * start_p,int * step_p,int * end_p)1873 get_sched_window (partial_schedule_ptr ps, ddg_node_ptr u_node,
1874 		  sbitmap sched_nodes, int ii, int *start_p, int *step_p,
1875 		  int *end_p)
1876 {
1877   int start, step, end;
1878   int early_start, late_start;
1879   ddg_edge_ptr e;
1880   auto_sbitmap psp (ps->g->num_nodes);
1881   auto_sbitmap pss (ps->g->num_nodes);
1882   sbitmap u_node_preds = NODE_PREDECESSORS (u_node);
1883   sbitmap u_node_succs = NODE_SUCCESSORS (u_node);
1884   int psp_not_empty;
1885   int pss_not_empty;
1886   int count_preds;
1887   int count_succs;
1888 
1889   /* 1. compute sched window for u (start, end, step).  */
1890   bitmap_clear (psp);
1891   bitmap_clear (pss);
1892   psp_not_empty = bitmap_and (psp, u_node_preds, sched_nodes);
1893   pss_not_empty = bitmap_and (pss, u_node_succs, sched_nodes);
1894 
1895   /* We first compute a forward range (start <= end), then decide whether
1896      to reverse it.  */
1897   early_start = INT_MIN;
1898   late_start = INT_MAX;
1899   start = INT_MIN;
1900   end = INT_MAX;
1901   step = 1;
1902 
1903   count_preds = 0;
1904   count_succs = 0;
1905 
1906   if (dump_file && (psp_not_empty || pss_not_empty))
1907     {
1908       fprintf (dump_file, "\nAnalyzing dependencies for node %d (INSN %d)"
1909 	       "; ii = %d\n\n", u_node->cuid, INSN_UID (u_node->insn), ii);
1910       fprintf (dump_file, "%11s %11s %11s %11s %5s\n",
1911 	       "start", "early start", "late start", "end", "time");
1912       fprintf (dump_file, "=========== =========== =========== ==========="
1913 	       " =====\n");
1914     }
1915   /* Calculate early_start and limit end.  Both bounds are inclusive.  */
1916   if (psp_not_empty)
1917     for (e = u_node->in; e != 0; e = e->next_in)
1918       {
1919 	int v = e->src->cuid;
1920 
1921 	if (bitmap_bit_p (sched_nodes, v))
1922 	  {
1923 	    int p_st = SCHED_TIME (v);
1924 	    int earliest = p_st + e->latency - (e->distance * ii);
1925 	    int latest = (e->data_type == MEM_DEP ? p_st + ii - 1 : INT_MAX);
1926 
1927 	    if (dump_file)
1928 	      {
1929 		fprintf (dump_file, "%11s %11d %11s %11d %5d",
1930 			 "", earliest, "", latest, p_st);
1931 		print_ddg_edge (dump_file, e);
1932 		fprintf (dump_file, "\n");
1933 	      }
1934 
1935 	    early_start = MAX (early_start, earliest);
1936 	    end = MIN (end, latest);
1937 
1938 	    if (e->type == TRUE_DEP && e->data_type == REG_DEP)
1939 	      count_preds++;
1940 	  }
1941       }
1942 
1943   /* Calculate late_start and limit start.  Both bounds are inclusive.  */
1944   if (pss_not_empty)
1945     for (e = u_node->out; e != 0; e = e->next_out)
1946       {
1947 	int v = e->dest->cuid;
1948 
1949 	if (bitmap_bit_p (sched_nodes, v))
1950 	  {
1951 	    int s_st = SCHED_TIME (v);
1952 	    int earliest = (e->data_type == MEM_DEP ? s_st - ii + 1 : INT_MIN);
1953 	    int latest = s_st - e->latency + (e->distance * ii);
1954 
1955 	    if (dump_file)
1956 	      {
1957 		fprintf (dump_file, "%11d %11s %11d %11s %5d",
1958 			 earliest, "", latest, "", s_st);
1959 		print_ddg_edge (dump_file, e);
1960 		fprintf (dump_file, "\n");
1961 	      }
1962 
1963 	    start = MAX (start, earliest);
1964 	    late_start = MIN (late_start, latest);
1965 
1966 	    if (e->type == TRUE_DEP && e->data_type == REG_DEP)
1967 	      count_succs++;
1968 	  }
1969       }
1970 
1971   if (dump_file && (psp_not_empty || pss_not_empty))
1972     {
1973       fprintf (dump_file, "----------- ----------- ----------- -----------"
1974 	       " -----\n");
1975       fprintf (dump_file, "%11d %11d %11d %11d %5s %s\n",
1976 	       start, early_start, late_start, end, "",
1977 	       "(max, max, min, min)");
1978     }
1979 
1980   /* Get a target scheduling window no bigger than ii.  */
1981   if (early_start == INT_MIN && late_start == INT_MAX)
1982     early_start = NODE_ASAP (u_node);
1983   else if (early_start == INT_MIN)
1984     early_start = late_start - (ii - 1);
1985   late_start = MIN (late_start, early_start + (ii - 1));
1986 
1987   /* Apply memory dependence limits.  */
1988   start = MAX (start, early_start);
1989   end = MIN (end, late_start);
1990 
1991   if (dump_file && (psp_not_empty || pss_not_empty))
1992     fprintf (dump_file, "%11s %11d %11d %11s %5s final window\n",
1993 	     "", start, end, "", "");
1994 
1995   /* If there are at least as many successors as predecessors, schedule the
1996      node close to its successors.  */
1997   if (pss_not_empty && count_succs >= count_preds)
1998     {
1999       std::swap (start, end);
2000       step = -1;
2001     }
2002 
2003   /* Now that we've finalized the window, make END an exclusive rather
2004      than an inclusive bound.  */
2005   end += step;
2006 
2007   *start_p = start;
2008   *step_p = step;
2009   *end_p = end;
2010 
2011   if ((start >= end && step == 1) || (start <= end && step == -1))
2012     {
2013       if (dump_file)
2014 	fprintf (dump_file, "\nEmpty window: start=%d, end=%d, step=%d\n",
2015 		 start, end, step);
2016       return -1;
2017     }
2018 
2019   return 0;
2020 }
2021 
2022 /* Calculate MUST_PRECEDE/MUST_FOLLOW bitmaps of U_NODE; which is the
2023    node currently been scheduled.  At the end of the calculation
2024    MUST_PRECEDE/MUST_FOLLOW contains all predecessors/successors of
2025    U_NODE which are (1) already scheduled in the first/last row of
2026    U_NODE's scheduling window, (2) whose dependence inequality with U
2027    becomes an equality when U is scheduled in this same row, and (3)
2028    whose dependence latency is zero.
2029 
2030    The first and last rows are calculated using the following parameters:
2031    START/END rows - The cycles that begins/ends the traversal on the window;
2032    searching for an empty cycle to schedule U_NODE.
2033    STEP - The direction in which we traverse the window.
2034    II - The initiation interval.  */
2035 
2036 static void
calculate_must_precede_follow(ddg_node_ptr u_node,int start,int end,int step,int ii,sbitmap sched_nodes,sbitmap must_precede,sbitmap must_follow)2037 calculate_must_precede_follow (ddg_node_ptr u_node, int start, int end,
2038 			       int step, int ii, sbitmap sched_nodes,
2039 			       sbitmap must_precede, sbitmap must_follow)
2040 {
2041   ddg_edge_ptr e;
2042   int first_cycle_in_window, last_cycle_in_window;
2043 
2044   gcc_assert (must_precede && must_follow);
2045 
2046   /* Consider the following scheduling window:
2047      {first_cycle_in_window, first_cycle_in_window+1, ...,
2048      last_cycle_in_window}.  If step is 1 then the following will be
2049      the order we traverse the window: {start=first_cycle_in_window,
2050      first_cycle_in_window+1, ..., end=last_cycle_in_window+1},
2051      or {start=last_cycle_in_window, last_cycle_in_window-1, ...,
2052      end=first_cycle_in_window-1} if step is -1.  */
2053   first_cycle_in_window = (step == 1) ? start : end - step;
2054   last_cycle_in_window = (step == 1) ? end - step : start;
2055 
2056   bitmap_clear (must_precede);
2057   bitmap_clear (must_follow);
2058 
2059   if (dump_file)
2060     fprintf (dump_file, "\nmust_precede: ");
2061 
2062   /* Instead of checking if:
2063       (SMODULO (SCHED_TIME (e->src), ii) == first_row_in_window)
2064       && ((SCHED_TIME (e->src) + e->latency - (e->distance * ii)) ==
2065              first_cycle_in_window)
2066       && e->latency == 0
2067      we use the fact that latency is non-negative:
2068       SCHED_TIME (e->src) - (e->distance * ii) <=
2069       SCHED_TIME (e->src) + e->latency - (e->distance * ii)) <=
2070       first_cycle_in_window
2071      and check only if
2072       SCHED_TIME (e->src) - (e->distance * ii) == first_cycle_in_window  */
2073   for (e = u_node->in; e != 0; e = e->next_in)
2074     if (bitmap_bit_p (sched_nodes, e->src->cuid)
2075 	&& ((SCHED_TIME (e->src->cuid) - (e->distance * ii)) ==
2076              first_cycle_in_window))
2077       {
2078 	if (dump_file)
2079 	  fprintf (dump_file, "%d ", e->src->cuid);
2080 
2081 	bitmap_set_bit (must_precede, e->src->cuid);
2082       }
2083 
2084   if (dump_file)
2085     fprintf (dump_file, "\nmust_follow: ");
2086 
2087   /* Instead of checking if:
2088       (SMODULO (SCHED_TIME (e->dest), ii) == last_row_in_window)
2089       && ((SCHED_TIME (e->dest) - e->latency + (e->distance * ii)) ==
2090              last_cycle_in_window)
2091       && e->latency == 0
2092      we use the fact that latency is non-negative:
2093       SCHED_TIME (e->dest) + (e->distance * ii) >=
2094       SCHED_TIME (e->dest) - e->latency + (e->distance * ii)) >=
2095       last_cycle_in_window
2096      and check only if
2097       SCHED_TIME (e->dest) + (e->distance * ii) == last_cycle_in_window  */
2098   for (e = u_node->out; e != 0; e = e->next_out)
2099     if (bitmap_bit_p (sched_nodes, e->dest->cuid)
2100 	&& ((SCHED_TIME (e->dest->cuid) + (e->distance * ii)) ==
2101              last_cycle_in_window))
2102       {
2103 	if (dump_file)
2104 	  fprintf (dump_file, "%d ", e->dest->cuid);
2105 
2106 	bitmap_set_bit (must_follow, e->dest->cuid);
2107       }
2108 
2109   if (dump_file)
2110     fprintf (dump_file, "\n");
2111 }
2112 
2113 /* Return 1 if U_NODE can be scheduled in CYCLE.  Use the following
2114    parameters to decide if that's possible:
2115    PS - The partial schedule.
2116    U - The serial number of U_NODE.
2117    NUM_SPLITS - The number of row splits made so far.
2118    MUST_PRECEDE - The nodes that must precede U_NODE. (only valid at
2119    the first row of the scheduling window)
2120    MUST_FOLLOW - The nodes that must follow U_NODE. (only valid at the
2121    last row of the scheduling window)  */
2122 
2123 static bool
try_scheduling_node_in_cycle(partial_schedule_ptr ps,int u,int cycle,sbitmap sched_nodes,int * num_splits,sbitmap must_precede,sbitmap must_follow)2124 try_scheduling_node_in_cycle (partial_schedule_ptr ps,
2125 			      int u, int cycle, sbitmap sched_nodes,
2126 			      int *num_splits, sbitmap must_precede,
2127 			      sbitmap must_follow)
2128 {
2129   ps_insn_ptr psi;
2130   bool success = 0;
2131 
2132   verify_partial_schedule (ps, sched_nodes);
2133   psi = ps_add_node_check_conflicts (ps, u, cycle, must_precede, must_follow);
2134   if (psi)
2135     {
2136       SCHED_TIME (u) = cycle;
2137       bitmap_set_bit (sched_nodes, u);
2138       success = 1;
2139       *num_splits = 0;
2140       if (dump_file)
2141 	fprintf (dump_file, "Scheduled w/o split in %d\n", cycle);
2142 
2143     }
2144 
2145   return success;
2146 }
2147 
2148 /* This function implements the scheduling algorithm for SMS according to the
2149    above algorithm.  */
2150 static partial_schedule_ptr
sms_schedule_by_order(ddg_ptr g,int mii,int maxii,int * nodes_order)2151 sms_schedule_by_order (ddg_ptr g, int mii, int maxii, int *nodes_order)
2152 {
2153   int ii = mii;
2154   int i, c, success, num_splits = 0;
2155   int flush_and_start_over = true;
2156   int num_nodes = g->num_nodes;
2157   int start, end, step; /* Place together into one struct?  */
2158   auto_sbitmap sched_nodes (num_nodes);
2159   auto_sbitmap must_precede (num_nodes);
2160   auto_sbitmap must_follow (num_nodes);
2161   auto_sbitmap tobe_scheduled (num_nodes);
2162 
2163   partial_schedule_ptr ps = create_partial_schedule (ii, g, DFA_HISTORY);
2164 
2165   bitmap_ones (tobe_scheduled);
2166   bitmap_clear (sched_nodes);
2167 
2168   while (flush_and_start_over && (ii < maxii))
2169     {
2170 
2171       if (dump_file)
2172 	fprintf (dump_file, "Starting with ii=%d\n", ii);
2173       flush_and_start_over = false;
2174       bitmap_clear (sched_nodes);
2175 
2176       for (i = 0; i < num_nodes; i++)
2177 	{
2178 	  int u = nodes_order[i];
2179   	  ddg_node_ptr u_node = &ps->g->nodes[u];
2180 	  rtx_insn *insn = u_node->insn;
2181 
2182 	  if (!NONDEBUG_INSN_P (insn))
2183 	    {
2184 	      bitmap_clear_bit (tobe_scheduled, u);
2185 	      continue;
2186 	    }
2187 
2188 	  if (bitmap_bit_p (sched_nodes, u))
2189 	    continue;
2190 
2191 	  /* Try to get non-empty scheduling window.  */
2192 	 success = 0;
2193          if (get_sched_window (ps, u_node, sched_nodes, ii, &start,
2194                                 &step, &end) == 0)
2195             {
2196               if (dump_file)
2197                 fprintf (dump_file, "\nTrying to schedule node %d "
2198 			 "INSN = %d  in (%d .. %d) step %d\n", u, (INSN_UID
2199                         (g->nodes[u].insn)), start, end, step);
2200 
2201               gcc_assert ((step > 0 && start < end)
2202                           || (step < 0 && start > end));
2203 
2204               calculate_must_precede_follow (u_node, start, end, step, ii,
2205                                              sched_nodes, must_precede,
2206                                              must_follow);
2207 
2208               for (c = start; c != end; c += step)
2209                 {
2210 		  sbitmap tmp_precede, tmp_follow;
2211 
2212                   set_must_precede_follow (&tmp_follow, must_follow,
2213 		                           &tmp_precede, must_precede,
2214                                            c, start, end, step);
2215                   success =
2216                     try_scheduling_node_in_cycle (ps, u, c,
2217                                                   sched_nodes,
2218                                                   &num_splits, tmp_precede,
2219                                                   tmp_follow);
2220                   if (success)
2221                     break;
2222                 }
2223 
2224               verify_partial_schedule (ps, sched_nodes);
2225             }
2226             if (!success)
2227             {
2228               int split_row;
2229 
2230               if (ii++ == maxii)
2231                 break;
2232 
2233               if (num_splits >= MAX_SPLIT_NUM)
2234                 {
2235                   num_splits = 0;
2236                   flush_and_start_over = true;
2237                   verify_partial_schedule (ps, sched_nodes);
2238                   reset_partial_schedule (ps, ii);
2239                   verify_partial_schedule (ps, sched_nodes);
2240                   break;
2241                 }
2242 
2243               num_splits++;
2244               /* The scheduling window is exclusive of 'end'
2245                  whereas compute_split_window() expects an inclusive,
2246                  ordered range.  */
2247               if (step == 1)
2248                 split_row = compute_split_row (sched_nodes, start, end - 1,
2249                                                ps->ii, u_node);
2250               else
2251                 split_row = compute_split_row (sched_nodes, end + 1, start,
2252                                                ps->ii, u_node);
2253 
2254               ps_insert_empty_row (ps, split_row, sched_nodes);
2255               i--;              /* Go back and retry node i.  */
2256 
2257               if (dump_file)
2258                 fprintf (dump_file, "num_splits=%d\n", num_splits);
2259             }
2260 
2261           /* ??? If (success), check register pressure estimates.  */
2262         }                       /* Continue with next node.  */
2263     }                           /* While flush_and_start_over.  */
2264   if (ii >= maxii)
2265     {
2266       free_partial_schedule (ps);
2267       ps = NULL;
2268     }
2269   else
2270     gcc_assert (bitmap_equal_p (tobe_scheduled, sched_nodes));
2271 
2272   return ps;
2273 }
2274 
2275 /* This function inserts a new empty row into PS at the position
2276    according to SPLITROW, keeping all already scheduled instructions
2277    intact and updating their SCHED_TIME and cycle accordingly.  */
2278 static void
ps_insert_empty_row(partial_schedule_ptr ps,int split_row,sbitmap sched_nodes)2279 ps_insert_empty_row (partial_schedule_ptr ps, int split_row,
2280 		     sbitmap sched_nodes)
2281 {
2282   ps_insn_ptr crr_insn;
2283   ps_insn_ptr *rows_new;
2284   int ii = ps->ii;
2285   int new_ii = ii + 1;
2286   int row;
2287   int *rows_length_new;
2288 
2289   verify_partial_schedule (ps, sched_nodes);
2290 
2291   /* We normalize sched_time and rotate ps to have only non-negative sched
2292      times, for simplicity of updating cycles after inserting new row.  */
2293   split_row -= ps->min_cycle;
2294   split_row = SMODULO (split_row, ii);
2295   if (dump_file)
2296     fprintf (dump_file, "split_row=%d\n", split_row);
2297 
2298   reset_sched_times (ps, PS_MIN_CYCLE (ps));
2299   rotate_partial_schedule (ps, PS_MIN_CYCLE (ps));
2300 
2301   rows_new = (ps_insn_ptr *) xcalloc (new_ii, sizeof (ps_insn_ptr));
2302   rows_length_new = (int *) xcalloc (new_ii, sizeof (int));
2303   for (row = 0; row < split_row; row++)
2304     {
2305       rows_new[row] = ps->rows[row];
2306       rows_length_new[row] = ps->rows_length[row];
2307       ps->rows[row] = NULL;
2308       for (crr_insn = rows_new[row];
2309 	   crr_insn; crr_insn = crr_insn->next_in_row)
2310 	{
2311 	  int u = crr_insn->id;
2312 	  int new_time = SCHED_TIME (u) + (SCHED_TIME (u) / ii);
2313 
2314 	  SCHED_TIME (u) = new_time;
2315 	  crr_insn->cycle = new_time;
2316 	  SCHED_ROW (u) = new_time % new_ii;
2317 	  SCHED_STAGE (u) = new_time / new_ii;
2318 	}
2319 
2320     }
2321 
2322   rows_new[split_row] = NULL;
2323 
2324   for (row = split_row; row < ii; row++)
2325     {
2326       rows_new[row + 1] = ps->rows[row];
2327       rows_length_new[row + 1] = ps->rows_length[row];
2328       ps->rows[row] = NULL;
2329       for (crr_insn = rows_new[row + 1];
2330 	   crr_insn; crr_insn = crr_insn->next_in_row)
2331 	{
2332 	  int u = crr_insn->id;
2333 	  int new_time = SCHED_TIME (u) + (SCHED_TIME (u) / ii) + 1;
2334 
2335 	  SCHED_TIME (u) = new_time;
2336 	  crr_insn->cycle = new_time;
2337 	  SCHED_ROW (u) = new_time % new_ii;
2338 	  SCHED_STAGE (u) = new_time / new_ii;
2339 	}
2340     }
2341 
2342   /* Updating ps.  */
2343   ps->min_cycle = ps->min_cycle + ps->min_cycle / ii
2344     + (SMODULO (ps->min_cycle, ii) >= split_row ? 1 : 0);
2345   ps->max_cycle = ps->max_cycle + ps->max_cycle / ii
2346     + (SMODULO (ps->max_cycle, ii) >= split_row ? 1 : 0);
2347   free (ps->rows);
2348   ps->rows = rows_new;
2349   free (ps->rows_length);
2350   ps->rows_length = rows_length_new;
2351   ps->ii = new_ii;
2352   gcc_assert (ps->min_cycle >= 0);
2353 
2354   verify_partial_schedule (ps, sched_nodes);
2355 
2356   if (dump_file)
2357     fprintf (dump_file, "min_cycle=%d, max_cycle=%d\n", ps->min_cycle,
2358 	     ps->max_cycle);
2359 }
2360 
2361 /* Given U_NODE which is the node that failed to be scheduled; LOW and
2362    UP which are the boundaries of it's scheduling window; compute using
2363    SCHED_NODES and II a row in the partial schedule that can be split
2364    which will separate a critical predecessor from a critical successor
2365    thereby expanding the window, and return it.  */
2366 static int
compute_split_row(sbitmap sched_nodes,int low,int up,int ii,ddg_node_ptr u_node)2367 compute_split_row (sbitmap sched_nodes, int low, int up, int ii,
2368 		   ddg_node_ptr u_node)
2369 {
2370   ddg_edge_ptr e;
2371   int lower = INT_MIN, upper = INT_MAX;
2372   int crit_pred = -1;
2373   int crit_succ = -1;
2374   int crit_cycle;
2375 
2376   for (e = u_node->in; e != 0; e = e->next_in)
2377     {
2378       int v = e->src->cuid;
2379 
2380       if (bitmap_bit_p (sched_nodes, v)
2381 	  && (low == SCHED_TIME (v) + e->latency - (e->distance * ii)))
2382 	if (SCHED_TIME (v) > lower)
2383 	  {
2384 	    crit_pred = v;
2385 	    lower = SCHED_TIME (v);
2386 	  }
2387     }
2388 
2389   if (crit_pred >= 0)
2390     {
2391       crit_cycle = SCHED_TIME (crit_pred) + 1;
2392       return SMODULO (crit_cycle, ii);
2393     }
2394 
2395   for (e = u_node->out; e != 0; e = e->next_out)
2396     {
2397       int v = e->dest->cuid;
2398 
2399       if (bitmap_bit_p (sched_nodes, v)
2400 	  && (up == SCHED_TIME (v) - e->latency + (e->distance * ii)))
2401 	if (SCHED_TIME (v) < upper)
2402 	  {
2403 	    crit_succ = v;
2404 	    upper = SCHED_TIME (v);
2405 	  }
2406     }
2407 
2408   if (crit_succ >= 0)
2409     {
2410       crit_cycle = SCHED_TIME (crit_succ);
2411       return SMODULO (crit_cycle, ii);
2412     }
2413 
2414   if (dump_file)
2415     fprintf (dump_file, "Both crit_pred and crit_succ are NULL\n");
2416 
2417   return SMODULO ((low + up + 1) / 2, ii);
2418 }
2419 
2420 static void
verify_partial_schedule(partial_schedule_ptr ps,sbitmap sched_nodes)2421 verify_partial_schedule (partial_schedule_ptr ps, sbitmap sched_nodes)
2422 {
2423   int row;
2424   ps_insn_ptr crr_insn;
2425 
2426   for (row = 0; row < ps->ii; row++)
2427     {
2428       int length = 0;
2429 
2430       for (crr_insn = ps->rows[row]; crr_insn; crr_insn = crr_insn->next_in_row)
2431 	{
2432 	  int u = crr_insn->id;
2433 
2434 	  length++;
2435 	  gcc_assert (bitmap_bit_p (sched_nodes, u));
2436 	  /* ??? Test also that all nodes of sched_nodes are in ps, perhaps by
2437 	     popcount (sched_nodes) == number of insns in ps.  */
2438 	  gcc_assert (SCHED_TIME (u) >= ps->min_cycle);
2439 	  gcc_assert (SCHED_TIME (u) <= ps->max_cycle);
2440 	}
2441 
2442       gcc_assert (ps->rows_length[row] == length);
2443     }
2444 }
2445 
2446 
2447 /* This page implements the algorithm for ordering the nodes of a DDG
2448    for modulo scheduling, activated through the
2449    "int sms_order_nodes (ddg_ptr, int mii, int * result)" API.  */
2450 
2451 #define ORDER_PARAMS(x) ((struct node_order_params *) (x)->aux.info)
2452 #define ASAP(x) (ORDER_PARAMS ((x))->asap)
2453 #define ALAP(x) (ORDER_PARAMS ((x))->alap)
2454 #define HEIGHT(x) (ORDER_PARAMS ((x))->height)
2455 #define MOB(x) (ALAP ((x)) - ASAP ((x)))
2456 #define DEPTH(x) (ASAP ((x)))
2457 
2458 typedef struct node_order_params * nopa;
2459 
2460 static void order_nodes_of_sccs (ddg_all_sccs_ptr, int * result);
2461 static int order_nodes_in_scc (ddg_ptr, sbitmap, sbitmap, int*, int);
2462 static nopa  calculate_order_params (ddg_ptr, int, int *);
2463 static int find_max_asap (ddg_ptr, sbitmap);
2464 static int find_max_hv_min_mob (ddg_ptr, sbitmap);
2465 static int find_max_dv_min_mob (ddg_ptr, sbitmap);
2466 
2467 enum sms_direction {BOTTOMUP, TOPDOWN};
2468 
2469 struct node_order_params
2470 {
2471   int asap;
2472   int alap;
2473   int height;
2474 };
2475 
2476 /* Check if NODE_ORDER contains a permutation of 0 .. NUM_NODES-1.  */
2477 static void
check_nodes_order(int * node_order,int num_nodes)2478 check_nodes_order (int *node_order, int num_nodes)
2479 {
2480   int i;
2481   auto_sbitmap tmp (num_nodes);
2482 
2483   bitmap_clear (tmp);
2484 
2485   if (dump_file)
2486     fprintf (dump_file, "SMS final nodes order: \n");
2487 
2488   for (i = 0; i < num_nodes; i++)
2489     {
2490       int u = node_order[i];
2491 
2492       if (dump_file)
2493         fprintf (dump_file, "%d ", u);
2494       gcc_assert (u < num_nodes && u >= 0 && !bitmap_bit_p (tmp, u));
2495 
2496       bitmap_set_bit (tmp, u);
2497     }
2498 
2499   if (dump_file)
2500     fprintf (dump_file, "\n");
2501 }
2502 
2503 /* Order the nodes of G for scheduling and pass the result in
2504    NODE_ORDER.  Also set aux.count of each node to ASAP.
2505    Put maximal ASAP to PMAX_ASAP.  Return the recMII for the given DDG.  */
2506 static int
sms_order_nodes(ddg_ptr g,int mii,int * node_order,int * pmax_asap)2507 sms_order_nodes (ddg_ptr g, int mii, int * node_order, int *pmax_asap)
2508 {
2509   int i;
2510   int rec_mii = 0;
2511   ddg_all_sccs_ptr sccs = create_ddg_all_sccs (g);
2512 
2513   nopa nops = calculate_order_params (g, mii, pmax_asap);
2514 
2515   if (dump_file)
2516     print_sccs (dump_file, sccs, g);
2517 
2518   order_nodes_of_sccs (sccs, node_order);
2519 
2520   if (sccs->num_sccs > 0)
2521     /* First SCC has the largest recurrence_length.  */
2522     rec_mii = sccs->sccs[0]->recurrence_length;
2523 
2524   /* Save ASAP before destroying node_order_params.  */
2525   for (i = 0; i < g->num_nodes; i++)
2526     {
2527       ddg_node_ptr v = &g->nodes[i];
2528       v->aux.count = ASAP (v);
2529     }
2530 
2531   free (nops);
2532   free_ddg_all_sccs (sccs);
2533   check_nodes_order (node_order, g->num_nodes);
2534 
2535   return rec_mii;
2536 }
2537 
2538 static void
order_nodes_of_sccs(ddg_all_sccs_ptr all_sccs,int * node_order)2539 order_nodes_of_sccs (ddg_all_sccs_ptr all_sccs, int * node_order)
2540 {
2541   int i, pos = 0;
2542   ddg_ptr g = all_sccs->ddg;
2543   int num_nodes = g->num_nodes;
2544   auto_sbitmap prev_sccs (num_nodes);
2545   auto_sbitmap on_path (num_nodes);
2546   auto_sbitmap tmp (num_nodes);
2547   auto_sbitmap ones (num_nodes);
2548 
2549   bitmap_clear (prev_sccs);
2550   bitmap_ones (ones);
2551 
2552   /* Perform the node ordering starting from the SCC with the highest recMII.
2553      For each SCC order the nodes according to their ASAP/ALAP/HEIGHT etc.  */
2554   for (i = 0; i < all_sccs->num_sccs; i++)
2555     {
2556       ddg_scc_ptr scc = all_sccs->sccs[i];
2557 
2558       /* Add nodes on paths from previous SCCs to the current SCC.  */
2559       find_nodes_on_paths (on_path, g, prev_sccs, scc->nodes);
2560       bitmap_ior (tmp, scc->nodes, on_path);
2561 
2562       /* Add nodes on paths from the current SCC to previous SCCs.  */
2563       find_nodes_on_paths (on_path, g, scc->nodes, prev_sccs);
2564       bitmap_ior (tmp, tmp, on_path);
2565 
2566       /* Remove nodes of previous SCCs from current extended SCC.  */
2567       bitmap_and_compl (tmp, tmp, prev_sccs);
2568 
2569       pos = order_nodes_in_scc (g, prev_sccs, tmp, node_order, pos);
2570       /* Above call to order_nodes_in_scc updated prev_sccs |= tmp.  */
2571     }
2572 
2573   /* Handle the remaining nodes that do not belong to any scc.  Each call
2574      to order_nodes_in_scc handles a single connected component.  */
2575   while (pos < g->num_nodes)
2576     {
2577       bitmap_and_compl (tmp, ones, prev_sccs);
2578       pos = order_nodes_in_scc (g, prev_sccs, tmp, node_order, pos);
2579     }
2580 }
2581 
2582 /* MII is needed if we consider backarcs (that do not close recursive cycles).  */
2583 static struct node_order_params *
calculate_order_params(ddg_ptr g,int mii ATTRIBUTE_UNUSED,int * pmax_asap)2584 calculate_order_params (ddg_ptr g, int mii ATTRIBUTE_UNUSED, int *pmax_asap)
2585 {
2586   int u;
2587   int max_asap;
2588   int num_nodes = g->num_nodes;
2589   ddg_edge_ptr e;
2590   /* Allocate a place to hold ordering params for each node in the DDG.  */
2591   nopa node_order_params_arr;
2592 
2593   /* Initialize of ASAP/ALAP/HEIGHT to zero.  */
2594   node_order_params_arr = (nopa) xcalloc (num_nodes,
2595 					  sizeof (struct node_order_params));
2596 
2597   /* Set the aux pointer of each node to point to its order_params structure.  */
2598   for (u = 0; u < num_nodes; u++)
2599     g->nodes[u].aux.info = &node_order_params_arr[u];
2600 
2601   /* Disregarding a backarc from each recursive cycle to obtain a DAG,
2602      calculate ASAP, ALAP, mobility, distance, and height for each node
2603      in the dependence (direct acyclic) graph.  */
2604 
2605   /* We assume that the nodes in the array are in topological order.  */
2606 
2607   max_asap = 0;
2608   for (u = 0; u < num_nodes; u++)
2609     {
2610       ddg_node_ptr u_node = &g->nodes[u];
2611 
2612       ASAP (u_node) = 0;
2613       for (e = u_node->in; e; e = e->next_in)
2614 	if (e->distance == 0)
2615 	  ASAP (u_node) = MAX (ASAP (u_node),
2616 			       ASAP (e->src) + e->latency);
2617       max_asap = MAX (max_asap, ASAP (u_node));
2618     }
2619 
2620   for (u = num_nodes - 1; u > -1; u--)
2621     {
2622       ddg_node_ptr u_node = &g->nodes[u];
2623 
2624       ALAP (u_node) = max_asap;
2625       HEIGHT (u_node) = 0;
2626       for (e = u_node->out; e; e = e->next_out)
2627 	if (e->distance == 0)
2628 	  {
2629 	    ALAP (u_node) = MIN (ALAP (u_node),
2630 				 ALAP (e->dest) - e->latency);
2631 	    HEIGHT (u_node) = MAX (HEIGHT (u_node),
2632 				   HEIGHT (e->dest) + e->latency);
2633 	  }
2634     }
2635   if (dump_file)
2636   {
2637     fprintf (dump_file, "\nOrder params\n");
2638     for (u = 0; u < num_nodes; u++)
2639       {
2640         ddg_node_ptr u_node = &g->nodes[u];
2641 
2642         fprintf (dump_file, "node %d, ASAP: %d, ALAP: %d, HEIGHT: %d\n", u,
2643                  ASAP (u_node), ALAP (u_node), HEIGHT (u_node));
2644       }
2645   }
2646 
2647   *pmax_asap = max_asap;
2648   return node_order_params_arr;
2649 }
2650 
2651 static int
find_max_asap(ddg_ptr g,sbitmap nodes)2652 find_max_asap (ddg_ptr g, sbitmap nodes)
2653 {
2654   unsigned int u = 0;
2655   int max_asap = -1;
2656   int result = -1;
2657   sbitmap_iterator sbi;
2658 
2659   EXECUTE_IF_SET_IN_BITMAP (nodes, 0, u, sbi)
2660     {
2661       ddg_node_ptr u_node = &g->nodes[u];
2662 
2663       if (max_asap < ASAP (u_node))
2664 	{
2665 	  max_asap = ASAP (u_node);
2666 	  result = u;
2667 	}
2668     }
2669   return result;
2670 }
2671 
2672 static int
find_max_hv_min_mob(ddg_ptr g,sbitmap nodes)2673 find_max_hv_min_mob (ddg_ptr g, sbitmap nodes)
2674 {
2675   unsigned int u = 0;
2676   int max_hv = -1;
2677   int min_mob = INT_MAX;
2678   int result = -1;
2679   sbitmap_iterator sbi;
2680 
2681   EXECUTE_IF_SET_IN_BITMAP (nodes, 0, u, sbi)
2682     {
2683       ddg_node_ptr u_node = &g->nodes[u];
2684 
2685       if (max_hv < HEIGHT (u_node))
2686 	{
2687 	  max_hv = HEIGHT (u_node);
2688 	  min_mob = MOB (u_node);
2689 	  result = u;
2690 	}
2691       else if ((max_hv == HEIGHT (u_node))
2692 	       && (min_mob > MOB (u_node)))
2693 	{
2694 	  min_mob = MOB (u_node);
2695 	  result = u;
2696 	}
2697     }
2698   return result;
2699 }
2700 
2701 static int
find_max_dv_min_mob(ddg_ptr g,sbitmap nodes)2702 find_max_dv_min_mob (ddg_ptr g, sbitmap nodes)
2703 {
2704   unsigned int u = 0;
2705   int max_dv = -1;
2706   int min_mob = INT_MAX;
2707   int result = -1;
2708   sbitmap_iterator sbi;
2709 
2710   EXECUTE_IF_SET_IN_BITMAP (nodes, 0, u, sbi)
2711     {
2712       ddg_node_ptr u_node = &g->nodes[u];
2713 
2714       if (max_dv < DEPTH (u_node))
2715 	{
2716 	  max_dv = DEPTH (u_node);
2717 	  min_mob = MOB (u_node);
2718 	  result = u;
2719 	}
2720       else if ((max_dv == DEPTH (u_node))
2721 	       && (min_mob > MOB (u_node)))
2722 	{
2723 	  min_mob = MOB (u_node);
2724 	  result = u;
2725 	}
2726     }
2727   return result;
2728 }
2729 
2730 /* Places the nodes of SCC into the NODE_ORDER array starting
2731    at position POS, according to the SMS ordering algorithm.
2732    NODES_ORDERED (in&out parameter) holds the bitset of all nodes in
2733    the NODE_ORDER array, starting from position zero.  */
2734 static int
order_nodes_in_scc(ddg_ptr g,sbitmap nodes_ordered,sbitmap scc,int * node_order,int pos)2735 order_nodes_in_scc (ddg_ptr g, sbitmap nodes_ordered, sbitmap scc,
2736 		    int * node_order, int pos)
2737 {
2738   enum sms_direction dir;
2739   int num_nodes = g->num_nodes;
2740   auto_sbitmap workset (num_nodes);
2741   auto_sbitmap tmp (num_nodes);
2742   sbitmap zero_bitmap = sbitmap_alloc (num_nodes);
2743   auto_sbitmap predecessors (num_nodes);
2744   auto_sbitmap successors (num_nodes);
2745 
2746   bitmap_clear (predecessors);
2747   find_predecessors (predecessors, g, nodes_ordered);
2748 
2749   bitmap_clear (successors);
2750   find_successors (successors, g, nodes_ordered);
2751 
2752   bitmap_clear (tmp);
2753   if (bitmap_and (tmp, predecessors, scc))
2754     {
2755       bitmap_copy (workset, tmp);
2756       dir = BOTTOMUP;
2757     }
2758   else if (bitmap_and (tmp, successors, scc))
2759     {
2760       bitmap_copy (workset, tmp);
2761       dir = TOPDOWN;
2762     }
2763   else
2764     {
2765       int u;
2766 
2767       bitmap_clear (workset);
2768       if ((u = find_max_asap (g, scc)) >= 0)
2769 	bitmap_set_bit (workset, u);
2770       dir = BOTTOMUP;
2771     }
2772 
2773   bitmap_clear (zero_bitmap);
2774   while (!bitmap_equal_p (workset, zero_bitmap))
2775     {
2776       int v;
2777       ddg_node_ptr v_node;
2778       sbitmap v_node_preds;
2779       sbitmap v_node_succs;
2780 
2781       if (dir == TOPDOWN)
2782 	{
2783 	  while (!bitmap_equal_p (workset, zero_bitmap))
2784 	    {
2785 	      v = find_max_hv_min_mob (g, workset);
2786 	      v_node = &g->nodes[v];
2787 	      node_order[pos++] = v;
2788 	      v_node_succs = NODE_SUCCESSORS (v_node);
2789 	      bitmap_and (tmp, v_node_succs, scc);
2790 
2791 	      /* Don't consider the already ordered successors again.  */
2792 	      bitmap_and_compl (tmp, tmp, nodes_ordered);
2793 	      bitmap_ior (workset, workset, tmp);
2794 	      bitmap_clear_bit (workset, v);
2795 	      bitmap_set_bit (nodes_ordered, v);
2796 	    }
2797 	  dir = BOTTOMUP;
2798 	  bitmap_clear (predecessors);
2799 	  find_predecessors (predecessors, g, nodes_ordered);
2800 	  bitmap_and (workset, predecessors, scc);
2801 	}
2802       else
2803 	{
2804 	  while (!bitmap_equal_p (workset, zero_bitmap))
2805 	    {
2806 	      v = find_max_dv_min_mob (g, workset);
2807 	      v_node = &g->nodes[v];
2808 	      node_order[pos++] = v;
2809 	      v_node_preds = NODE_PREDECESSORS (v_node);
2810 	      bitmap_and (tmp, v_node_preds, scc);
2811 
2812 	      /* Don't consider the already ordered predecessors again.  */
2813 	      bitmap_and_compl (tmp, tmp, nodes_ordered);
2814 	      bitmap_ior (workset, workset, tmp);
2815 	      bitmap_clear_bit (workset, v);
2816 	      bitmap_set_bit (nodes_ordered, v);
2817 	    }
2818 	  dir = TOPDOWN;
2819 	  bitmap_clear (successors);
2820 	  find_successors (successors, g, nodes_ordered);
2821 	  bitmap_and (workset, successors, scc);
2822 	}
2823     }
2824   sbitmap_free (zero_bitmap);
2825   return pos;
2826 }
2827 
2828 
2829 /* This page contains functions for manipulating partial-schedules during
2830    modulo scheduling.  */
2831 
2832 /* Create a partial schedule and allocate a memory to hold II rows.  */
2833 
2834 static partial_schedule_ptr
create_partial_schedule(int ii,ddg_ptr g,int history)2835 create_partial_schedule (int ii, ddg_ptr g, int history)
2836 {
2837   partial_schedule_ptr ps = XNEW (struct partial_schedule);
2838   ps->rows = (ps_insn_ptr *) xcalloc (ii, sizeof (ps_insn_ptr));
2839   ps->rows_length = (int *) xcalloc (ii, sizeof (int));
2840   ps->reg_moves.create (0);
2841   ps->ii = ii;
2842   ps->history = history;
2843   ps->min_cycle = INT_MAX;
2844   ps->max_cycle = INT_MIN;
2845   ps->g = g;
2846 
2847   return ps;
2848 }
2849 
2850 /* Free the PS_INSNs in rows array of the given partial schedule.
2851    ??? Consider caching the PS_INSN's.  */
2852 static void
free_ps_insns(partial_schedule_ptr ps)2853 free_ps_insns (partial_schedule_ptr ps)
2854 {
2855   int i;
2856 
2857   for (i = 0; i < ps->ii; i++)
2858     {
2859       while (ps->rows[i])
2860 	{
2861 	  ps_insn_ptr ps_insn = ps->rows[i]->next_in_row;
2862 
2863 	  free (ps->rows[i]);
2864 	  ps->rows[i] = ps_insn;
2865 	}
2866       ps->rows[i] = NULL;
2867     }
2868 }
2869 
2870 /* Free all the memory allocated to the partial schedule.  */
2871 
2872 static void
free_partial_schedule(partial_schedule_ptr ps)2873 free_partial_schedule (partial_schedule_ptr ps)
2874 {
2875   ps_reg_move_info *move;
2876   unsigned int i;
2877 
2878   if (!ps)
2879     return;
2880 
2881   FOR_EACH_VEC_ELT (ps->reg_moves, i, move)
2882     sbitmap_free (move->uses);
2883   ps->reg_moves.release ();
2884 
2885   free_ps_insns (ps);
2886   free (ps->rows);
2887   free (ps->rows_length);
2888   free (ps);
2889 }
2890 
2891 /* Clear the rows array with its PS_INSNs, and create a new one with
2892    NEW_II rows.  */
2893 
2894 static void
reset_partial_schedule(partial_schedule_ptr ps,int new_ii)2895 reset_partial_schedule (partial_schedule_ptr ps, int new_ii)
2896 {
2897   if (!ps)
2898     return;
2899   free_ps_insns (ps);
2900   if (new_ii == ps->ii)
2901     return;
2902   ps->rows = (ps_insn_ptr *) xrealloc (ps->rows, new_ii
2903 						 * sizeof (ps_insn_ptr));
2904   memset (ps->rows, 0, new_ii * sizeof (ps_insn_ptr));
2905   ps->rows_length = (int *) xrealloc (ps->rows_length, new_ii * sizeof (int));
2906   memset (ps->rows_length, 0, new_ii * sizeof (int));
2907   ps->ii = new_ii;
2908   ps->min_cycle = INT_MAX;
2909   ps->max_cycle = INT_MIN;
2910 }
2911 
2912 /* Prints the partial schedule as an ii rows array, for each rows
2913    print the ids of the insns in it.  */
2914 void
print_partial_schedule(partial_schedule_ptr ps,FILE * dump)2915 print_partial_schedule (partial_schedule_ptr ps, FILE *dump)
2916 {
2917   int i;
2918 
2919   for (i = 0; i < ps->ii; i++)
2920     {
2921       ps_insn_ptr ps_i = ps->rows[i];
2922 
2923       fprintf (dump, "\n[ROW %d ]: ", i);
2924       while (ps_i)
2925 	{
2926 	  rtx_insn *insn = ps_rtl_insn (ps, ps_i->id);
2927 
2928 	  if (JUMP_P (insn))
2929 	    fprintf (dump, "%d (branch), ", INSN_UID (insn));
2930 	  else
2931 	    fprintf (dump, "%d, ", INSN_UID (insn));
2932 
2933 	  ps_i = ps_i->next_in_row;
2934 	}
2935     }
2936 }
2937 
2938 /* Creates an object of PS_INSN and initializes it to the given parameters.  */
2939 static ps_insn_ptr
create_ps_insn(int id,int cycle)2940 create_ps_insn (int id, int cycle)
2941 {
2942   ps_insn_ptr ps_i = XNEW (struct ps_insn);
2943 
2944   ps_i->id = id;
2945   ps_i->next_in_row = NULL;
2946   ps_i->prev_in_row = NULL;
2947   ps_i->cycle = cycle;
2948 
2949   return ps_i;
2950 }
2951 
2952 
2953 /* Removes the given PS_INSN from the partial schedule.  */
2954 static void
remove_node_from_ps(partial_schedule_ptr ps,ps_insn_ptr ps_i)2955 remove_node_from_ps (partial_schedule_ptr ps, ps_insn_ptr ps_i)
2956 {
2957   int row;
2958 
2959   gcc_assert (ps && ps_i);
2960 
2961   row = SMODULO (ps_i->cycle, ps->ii);
2962   if (! ps_i->prev_in_row)
2963     {
2964       gcc_assert (ps_i == ps->rows[row]);
2965       ps->rows[row] = ps_i->next_in_row;
2966       if (ps->rows[row])
2967 	ps->rows[row]->prev_in_row = NULL;
2968     }
2969   else
2970     {
2971       ps_i->prev_in_row->next_in_row = ps_i->next_in_row;
2972       if (ps_i->next_in_row)
2973 	ps_i->next_in_row->prev_in_row = ps_i->prev_in_row;
2974     }
2975 
2976   ps->rows_length[row] -= 1;
2977   free (ps_i);
2978   return;
2979 }
2980 
2981 /* Unlike what literature describes for modulo scheduling (which focuses
2982    on VLIW machines) the order of the instructions inside a cycle is
2983    important.  Given the bitmaps MUST_FOLLOW and MUST_PRECEDE we know
2984    where the current instruction should go relative to the already
2985    scheduled instructions in the given cycle.  Go over these
2986    instructions and find the first possible column to put it in.  */
2987 static bool
ps_insn_find_column(partial_schedule_ptr ps,ps_insn_ptr ps_i,sbitmap must_precede,sbitmap must_follow)2988 ps_insn_find_column (partial_schedule_ptr ps, ps_insn_ptr ps_i,
2989 		     sbitmap must_precede, sbitmap must_follow)
2990 {
2991   ps_insn_ptr next_ps_i;
2992   ps_insn_ptr first_must_follow = NULL;
2993   ps_insn_ptr last_must_precede = NULL;
2994   ps_insn_ptr last_in_row = NULL;
2995   int row;
2996 
2997   if (! ps_i)
2998     return false;
2999 
3000   row = SMODULO (ps_i->cycle, ps->ii);
3001 
3002   /* Find the first must follow and the last must precede
3003      and insert the node immediately after the must precede
3004      but make sure that it there is no must follow after it.  */
3005   for (next_ps_i = ps->rows[row];
3006        next_ps_i;
3007        next_ps_i = next_ps_i->next_in_row)
3008     {
3009       if (must_follow
3010 	  && bitmap_bit_p (must_follow, next_ps_i->id)
3011 	  && ! first_must_follow)
3012         first_must_follow = next_ps_i;
3013       if (must_precede && bitmap_bit_p (must_precede, next_ps_i->id))
3014         {
3015           /* If we have already met a node that must follow, then
3016 	     there is no possible column.  */
3017   	  if (first_must_follow)
3018             return false;
3019 	  else
3020             last_must_precede = next_ps_i;
3021         }
3022       /* The closing branch must be the last in the row.  */
3023       if (JUMP_P (ps_rtl_insn (ps, next_ps_i->id)))
3024 	return false;
3025 
3026        last_in_row = next_ps_i;
3027     }
3028 
3029   /* The closing branch is scheduled as well.  Make sure there is no
3030      dependent instruction after it as the branch should be the last
3031      instruction in the row.  */
3032   if (JUMP_P (ps_rtl_insn (ps, ps_i->id)))
3033     {
3034       if (first_must_follow)
3035 	return false;
3036       if (last_in_row)
3037 	{
3038 	  /* Make the branch the last in the row.  New instructions
3039 	     will be inserted at the beginning of the row or after the
3040 	     last must_precede instruction thus the branch is guaranteed
3041 	     to remain the last instruction in the row.  */
3042 	  last_in_row->next_in_row = ps_i;
3043 	  ps_i->prev_in_row = last_in_row;
3044 	  ps_i->next_in_row = NULL;
3045 	}
3046       else
3047 	ps->rows[row] = ps_i;
3048       return true;
3049     }
3050 
3051   /* Now insert the node after INSERT_AFTER_PSI.  */
3052 
3053   if (! last_must_precede)
3054     {
3055       ps_i->next_in_row = ps->rows[row];
3056       ps_i->prev_in_row = NULL;
3057       if (ps_i->next_in_row)
3058     	ps_i->next_in_row->prev_in_row = ps_i;
3059       ps->rows[row] = ps_i;
3060     }
3061   else
3062     {
3063       ps_i->next_in_row = last_must_precede->next_in_row;
3064       last_must_precede->next_in_row = ps_i;
3065       ps_i->prev_in_row = last_must_precede;
3066       if (ps_i->next_in_row)
3067         ps_i->next_in_row->prev_in_row = ps_i;
3068     }
3069 
3070   return true;
3071 }
3072 
3073 /* Advances the PS_INSN one column in its current row; returns false
3074    in failure and true in success.  Bit N is set in MUST_FOLLOW if
3075    the node with cuid N must be come after the node pointed to by
3076    PS_I when scheduled in the same cycle.  */
3077 static int
ps_insn_advance_column(partial_schedule_ptr ps,ps_insn_ptr ps_i,sbitmap must_follow)3078 ps_insn_advance_column (partial_schedule_ptr ps, ps_insn_ptr ps_i,
3079 			sbitmap must_follow)
3080 {
3081   ps_insn_ptr prev, next;
3082   int row;
3083 
3084   if (!ps || !ps_i)
3085     return false;
3086 
3087   row = SMODULO (ps_i->cycle, ps->ii);
3088 
3089   if (! ps_i->next_in_row)
3090     return false;
3091 
3092   /* Check if next_in_row is dependent on ps_i, both having same sched
3093      times (typically ANTI_DEP).  If so, ps_i cannot skip over it.  */
3094   if (must_follow && bitmap_bit_p (must_follow, ps_i->next_in_row->id))
3095     return false;
3096 
3097   /* Advance PS_I over its next_in_row in the doubly linked list.  */
3098   prev = ps_i->prev_in_row;
3099   next = ps_i->next_in_row;
3100 
3101   if (ps_i == ps->rows[row])
3102     ps->rows[row] = next;
3103 
3104   ps_i->next_in_row = next->next_in_row;
3105 
3106   if (next->next_in_row)
3107     next->next_in_row->prev_in_row = ps_i;
3108 
3109   next->next_in_row = ps_i;
3110   ps_i->prev_in_row = next;
3111 
3112   next->prev_in_row = prev;
3113   if (prev)
3114     prev->next_in_row = next;
3115 
3116   return true;
3117 }
3118 
3119 /* Inserts a DDG_NODE to the given partial schedule at the given cycle.
3120    Returns 0 if this is not possible and a PS_INSN otherwise.  Bit N is
3121    set in MUST_PRECEDE/MUST_FOLLOW if the node with cuid N must be come
3122    before/after (respectively) the node pointed to by PS_I when scheduled
3123    in the same cycle.  */
3124 static ps_insn_ptr
add_node_to_ps(partial_schedule_ptr ps,int id,int cycle,sbitmap must_precede,sbitmap must_follow)3125 add_node_to_ps (partial_schedule_ptr ps, int id, int cycle,
3126 		sbitmap must_precede, sbitmap must_follow)
3127 {
3128   ps_insn_ptr ps_i;
3129   int row = SMODULO (cycle, ps->ii);
3130 
3131   if (ps->rows_length[row] >= issue_rate)
3132     return NULL;
3133 
3134   ps_i = create_ps_insn (id, cycle);
3135 
3136   /* Finds and inserts PS_I according to MUST_FOLLOW and
3137      MUST_PRECEDE.  */
3138   if (! ps_insn_find_column (ps, ps_i, must_precede, must_follow))
3139     {
3140       free (ps_i);
3141       return NULL;
3142     }
3143 
3144   ps->rows_length[row] += 1;
3145   return ps_i;
3146 }
3147 
3148 /* Advance time one cycle.  Assumes DFA is being used.  */
3149 static void
advance_one_cycle(void)3150 advance_one_cycle (void)
3151 {
3152   if (targetm.sched.dfa_pre_cycle_insn)
3153     state_transition (curr_state,
3154 		      targetm.sched.dfa_pre_cycle_insn ());
3155 
3156   state_transition (curr_state, NULL);
3157 
3158   if (targetm.sched.dfa_post_cycle_insn)
3159     state_transition (curr_state,
3160 		      targetm.sched.dfa_post_cycle_insn ());
3161 }
3162 
3163 
3164 
3165 /* Checks if PS has resource conflicts according to DFA, starting from
3166    FROM cycle to TO cycle; returns true if there are conflicts and false
3167    if there are no conflicts.  Assumes DFA is being used.  */
3168 static int
ps_has_conflicts(partial_schedule_ptr ps,int from,int to)3169 ps_has_conflicts (partial_schedule_ptr ps, int from, int to)
3170 {
3171   int cycle;
3172 
3173   state_reset (curr_state);
3174 
3175   for (cycle = from; cycle <= to; cycle++)
3176     {
3177       ps_insn_ptr crr_insn;
3178       /* Holds the remaining issue slots in the current row.  */
3179       int can_issue_more = issue_rate;
3180 
3181       /* Walk through the DFA for the current row.  */
3182       for (crr_insn = ps->rows[SMODULO (cycle, ps->ii)];
3183 	   crr_insn;
3184 	   crr_insn = crr_insn->next_in_row)
3185 	{
3186 	  rtx_insn *insn = ps_rtl_insn (ps, crr_insn->id);
3187 
3188 	  if (!NONDEBUG_INSN_P (insn))
3189 	    continue;
3190 
3191 	  /* Check if there is room for the current insn.  */
3192 	  if (!can_issue_more || state_dead_lock_p (curr_state))
3193 	    return true;
3194 
3195 	  /* Update the DFA state and return with failure if the DFA found
3196 	     resource conflicts.  */
3197 	  if (state_transition (curr_state, insn) >= 0)
3198 	    return true;
3199 
3200 	  if (targetm.sched.variable_issue)
3201 	    can_issue_more =
3202 	      targetm.sched.variable_issue (sched_dump, sched_verbose,
3203 					    insn, can_issue_more);
3204 	  /* A naked CLOBBER or USE generates no instruction, so don't
3205 	     let them consume issue slots.  */
3206 	  else if (GET_CODE (PATTERN (insn)) != USE
3207 		   && GET_CODE (PATTERN (insn)) != CLOBBER)
3208 	    can_issue_more--;
3209 	}
3210 
3211       /* Advance the DFA to the next cycle.  */
3212       advance_one_cycle ();
3213     }
3214   return false;
3215 }
3216 
3217 /* Checks if the given node causes resource conflicts when added to PS at
3218    cycle C.  If not the node is added to PS and returned; otherwise zero
3219    is returned.  Bit N is set in MUST_PRECEDE/MUST_FOLLOW if the node with
3220    cuid N must be come before/after (respectively) the node pointed to by
3221    PS_I when scheduled in the same cycle.  */
3222 ps_insn_ptr
ps_add_node_check_conflicts(partial_schedule_ptr ps,int n,int c,sbitmap must_precede,sbitmap must_follow)3223 ps_add_node_check_conflicts (partial_schedule_ptr ps, int n,
3224    			     int c, sbitmap must_precede,
3225 			     sbitmap must_follow)
3226 {
3227   int i, first, amount, has_conflicts = 0;
3228   ps_insn_ptr ps_i;
3229 
3230   /* First add the node to the PS, if this succeeds check for
3231      conflicts, trying different issue slots in the same row.  */
3232   if (! (ps_i = add_node_to_ps (ps, n, c, must_precede, must_follow)))
3233     return NULL; /* Failed to insert the node at the given cycle.  */
3234 
3235   while (1)
3236     {
3237       has_conflicts = ps_has_conflicts (ps, c, c);
3238       if (ps->history > 0 && !has_conflicts)
3239 	{
3240 	  /* Check all 2h+1 intervals, starting from c-2h..c up to c..2h,
3241 	     but not more than ii intervals.  */
3242 	  first = c - ps->history;
3243 	  amount = 2 * ps->history + 1;
3244 	  if (amount > ps->ii)
3245 	    amount = ps->ii;
3246 	  for (i = first; i < first + amount; i++)
3247 	    {
3248 	      has_conflicts = ps_has_conflicts (ps,
3249 						i - ps->history,
3250 						i + ps->history);
3251 	      if (has_conflicts)
3252 		break;
3253 	    }
3254 	}
3255       if (!has_conflicts)
3256 	break;
3257       /* Try different issue slots to find one that the given node can be
3258 	 scheduled in without conflicts.  */
3259       if (! ps_insn_advance_column (ps, ps_i, must_follow))
3260 	break;
3261     }
3262 
3263   if (has_conflicts)
3264     {
3265       remove_node_from_ps (ps, ps_i);
3266       return NULL;
3267     }
3268 
3269   ps->min_cycle = MIN (ps->min_cycle, c);
3270   ps->max_cycle = MAX (ps->max_cycle, c);
3271   return ps_i;
3272 }
3273 
3274 /* Calculate the stage count of the partial schedule PS.  The calculation
3275    takes into account the rotation amount passed in ROTATION_AMOUNT.  */
3276 int
calculate_stage_count(partial_schedule_ptr ps,int rotation_amount)3277 calculate_stage_count (partial_schedule_ptr ps, int rotation_amount)
3278 {
3279   int new_min_cycle = PS_MIN_CYCLE (ps) - rotation_amount;
3280   int new_max_cycle = PS_MAX_CYCLE (ps) - rotation_amount;
3281   int stage_count = CALC_STAGE_COUNT (-1, new_min_cycle, ps->ii);
3282 
3283   /* The calculation of stage count is done adding the number of stages
3284      before cycle zero and after cycle zero.  */
3285   stage_count += CALC_STAGE_COUNT (new_max_cycle, 0, ps->ii);
3286 
3287   return stage_count;
3288 }
3289 
3290 /* Rotate the rows of PS such that insns scheduled at time
3291    START_CYCLE will appear in row 0.  Updates max/min_cycles.  */
3292 void
rotate_partial_schedule(partial_schedule_ptr ps,int start_cycle)3293 rotate_partial_schedule (partial_schedule_ptr ps, int start_cycle)
3294 {
3295   int i, row, backward_rotates;
3296   int last_row = ps->ii - 1;
3297 
3298   if (start_cycle == 0)
3299     return;
3300 
3301   backward_rotates = SMODULO (start_cycle, ps->ii);
3302 
3303   /* Revisit later and optimize this into a single loop.  */
3304   for (i = 0; i < backward_rotates; i++)
3305     {
3306       ps_insn_ptr first_row = ps->rows[0];
3307       int first_row_length = ps->rows_length[0];
3308 
3309       for (row = 0; row < last_row; row++)
3310 	{
3311 	  ps->rows[row] = ps->rows[row + 1];
3312 	  ps->rows_length[row] = ps->rows_length[row + 1];
3313 	}
3314 
3315       ps->rows[last_row] = first_row;
3316       ps->rows_length[last_row] = first_row_length;
3317     }
3318 
3319   ps->max_cycle -= start_cycle;
3320   ps->min_cycle -= start_cycle;
3321 }
3322 
3323 #endif /* INSN_SCHEDULING */
3324 
3325 /* Run instruction scheduler.  */
3326 /* Perform SMS module scheduling.  */
3327 
3328 namespace {
3329 
3330 const pass_data pass_data_sms =
3331 {
3332   RTL_PASS, /* type */
3333   "sms", /* name */
3334   OPTGROUP_NONE, /* optinfo_flags */
3335   TV_SMS, /* tv_id */
3336   0, /* properties_required */
3337   0, /* properties_provided */
3338   0, /* properties_destroyed */
3339   0, /* todo_flags_start */
3340   TODO_df_finish, /* todo_flags_finish */
3341 };
3342 
3343 class pass_sms : public rtl_opt_pass
3344 {
3345 public:
pass_sms(gcc::context * ctxt)3346   pass_sms (gcc::context *ctxt)
3347     : rtl_opt_pass (pass_data_sms, ctxt)
3348   {}
3349 
3350   /* opt_pass methods: */
gate(function *)3351   virtual bool gate (function *)
3352 {
3353   return (optimize > 0 && flag_modulo_sched);
3354 }
3355 
3356   virtual unsigned int execute (function *);
3357 
3358 }; // class pass_sms
3359 
3360 unsigned int
execute(function * fun ATTRIBUTE_UNUSED)3361 pass_sms::execute (function *fun ATTRIBUTE_UNUSED)
3362 {
3363 #ifdef INSN_SCHEDULING
3364   basic_block bb;
3365 
3366   /* Collect loop information to be used in SMS.  */
3367   cfg_layout_initialize (0);
3368   sms_schedule ();
3369 
3370   /* Update the life information, because we add pseudos.  */
3371   max_regno = max_reg_num ();
3372 
3373   /* Finalize layout changes.  */
3374   FOR_EACH_BB_FN (bb, fun)
3375     if (bb->next_bb != EXIT_BLOCK_PTR_FOR_FN (fun))
3376       bb->aux = bb->next_bb;
3377   free_dominance_info (CDI_DOMINATORS);
3378   cfg_layout_finalize ();
3379 #endif /* INSN_SCHEDULING */
3380   return 0;
3381 }
3382 
3383 } // anon namespace
3384 
3385 rtl_opt_pass *
make_pass_sms(gcc::context * ctxt)3386 make_pass_sms (gcc::context *ctxt)
3387 {
3388   return new pass_sms (ctxt);
3389 }
3390