1 /* Swing Modulo Scheduling implementation.
2    Copyright (C) 2004-2021 Free Software Foundation, Inc.
3    Contributed by Ayal Zaks and Mustafa Hagog <zaks,mustafa@il.ibm.com>
4 
5 This file is part of GCC.
6 
7 GCC is free software; you can redistribute it and/or modify it under
8 the terms of the GNU General Public License as published by the Free
9 Software Foundation; either version 3, or (at your option) any later
10 version.
11 
12 GCC is distributed in the hope that it will be useful, but WITHOUT ANY
13 WARRANTY; without even the implied warranty of MERCHANTABILITY or
14 FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License
15 for more details.
16 
17 You should have received a copy of the GNU General Public License
18 along with GCC; see the file COPYING3.  If not see
19 <http://www.gnu.org/licenses/>.  */
20 
21 
22 #include "config.h"
23 #include "system.h"
24 #include "coretypes.h"
25 #include "backend.h"
26 #include "target.h"
27 #include "rtl.h"
28 #include "tree.h"
29 #include "cfghooks.h"
30 #include "df.h"
31 #include "memmodel.h"
32 #include "optabs.h"
33 #include "regs.h"
34 #include "emit-rtl.h"
35 #include "gcov-io.h"
36 #include "profile.h"
37 #include "insn-attr.h"
38 #include "cfgrtl.h"
39 #include "sched-int.h"
40 #include "cfgloop.h"
41 #include "expr.h"
42 #include "ddg.h"
43 #include "tree-pass.h"
44 #include "dbgcnt.h"
45 #include "loop-unroll.h"
46 #include "hard-reg-set.h"
47 
48 #ifdef INSN_SCHEDULING
49 
50 /* This file contains the implementation of the Swing Modulo Scheduler,
51    described in the following references:
52    [1] J. Llosa, A. Gonzalez, E. Ayguade, M. Valero., and J. Eckhardt.
53        Lifetime--sensitive modulo scheduling in a production environment.
54        IEEE Trans. on Comps., 50(3), March 2001
55    [2] J. Llosa, A. Gonzalez, E. Ayguade, and M. Valero.
56        Swing Modulo Scheduling: A Lifetime Sensitive Approach.
57        PACT '96 , pages 80-87, October 1996 (Boston - Massachusetts - USA).
58 
59    The basic structure is:
60    1. Build a data-dependence graph (DDG) for each loop.
61    2. Use the DDG to order the insns of a loop (not in topological order
62       necessarily, but rather) trying to place each insn after all its
63       predecessors _or_ after all its successors.
64    3. Compute MII: a lower bound on the number of cycles to schedule the loop.
65    4. Use the ordering to perform list-scheduling of the loop:
66       1. Set II = MII.  We will try to schedule the loop within II cycles.
67       2. Try to schedule the insns one by one according to the ordering.
68 	 For each insn compute an interval of cycles by considering already-
69 	 scheduled preds and succs (and associated latencies); try to place
70 	 the insn in the cycles of this window checking for potential
71 	 resource conflicts (using the DFA interface).
72 	 Note: this is different from the cycle-scheduling of schedule_insns;
73 	 here the insns are not scheduled monotonically top-down (nor bottom-
74 	 up).
75       3. If failed in scheduling all insns - bump II++ and try again, unless
76 	 II reaches an upper bound MaxII, in which case report failure.
77    5. If we succeeded in scheduling the loop within II cycles, we now
78       generate prolog and epilog, decrease the counter of the loop, and
79       perform modulo variable expansion for live ranges that span more than
80       II cycles (i.e. use register copies to prevent a def from overwriting
81       itself before reaching the use).
82 
83     SMS works with countable loops (1) whose control part can be easily
84     decoupled from the rest of the loop and (2) whose loop count can
85     be easily adjusted.  This is because we peel a constant number of
86     iterations into a prologue and epilogue for which we want to avoid
87     emitting the control part, and a kernel which is to iterate that
88     constant number of iterations less than the original loop.  So the
89     control part should be a set of insns clearly identified and having
90     its own iv, not otherwise used in the loop (at-least for now), which
91     initializes a register before the loop to the number of iterations.
92     Currently SMS relies on the do-loop pattern to recognize such loops,
93     where (1) the control part comprises of all insns defining and/or
94     using a certain 'count' register and (2) the loop count can be
95     adjusted by modifying this register prior to the loop.
96     TODO: Rely on cfgloop analysis instead.  */
97 
98 /* This page defines partial-schedule structures and functions for
99    modulo scheduling.  */
100 
101 typedef struct partial_schedule *partial_schedule_ptr;
102 typedef struct ps_insn *ps_insn_ptr;
103 
104 /* The minimum (absolute) cycle that a node of ps was scheduled in.  */
105 #define PS_MIN_CYCLE(ps) (((partial_schedule_ptr)(ps))->min_cycle)
106 
107 /* The maximum (absolute) cycle that a node of ps was scheduled in.  */
108 #define PS_MAX_CYCLE(ps) (((partial_schedule_ptr)(ps))->max_cycle)
109 
110 /* Perform signed modulo, always returning a non-negative value.  */
111 #define SMODULO(x,y) ((x) % (y) < 0 ? ((x) % (y) + (y)) : (x) % (y))
112 
113 /* The number of different iterations the nodes in ps span, assuming
114    the stage boundaries are placed efficiently.  */
115 #define CALC_STAGE_COUNT(max_cycle,min_cycle,ii) ((max_cycle - min_cycle \
116                          + 1 + ii - 1) / ii)
117 /* The stage count of ps.  */
118 #define PS_STAGE_COUNT(ps) (((partial_schedule_ptr)(ps))->stage_count)
119 
120 /* A single instruction in the partial schedule.  */
121 struct ps_insn
122 {
123   /* Identifies the instruction to be scheduled.  Values smaller than
124      the ddg's num_nodes refer directly to ddg nodes.  A value of
125      X - num_nodes refers to register move X.  */
126   int id;
127 
128   /* The (absolute) cycle in which the PS instruction is scheduled.
129      Same as SCHED_TIME (node).  */
130   int cycle;
131 
132   /* The next/prev PS_INSN in the same row.  */
133   ps_insn_ptr next_in_row,
134 	      prev_in_row;
135 
136 };
137 
138 /* Information about a register move that has been added to a partial
139    schedule.  */
140 struct ps_reg_move_info
141 {
142   /* The source of the move is defined by the ps_insn with id DEF.
143      The destination is used by the ps_insns with the ids in USES.  */
144   int def;
145   sbitmap uses;
146 
147   /* The original form of USES' instructions used OLD_REG, but they
148      should now use NEW_REG.  */
149   rtx old_reg;
150   rtx new_reg;
151 
152   /* The number of consecutive stages that the move occupies.  */
153   int num_consecutive_stages;
154 
155   /* An instruction that sets NEW_REG to the correct value.  The first
156      move associated with DEF will have an rhs of OLD_REG; later moves
157      use the result of the previous move.  */
158   rtx_insn *insn;
159 };
160 
161 /* Holds the partial schedule as an array of II rows.  Each entry of the
162    array points to a linked list of PS_INSNs, which represents the
163    instructions that are scheduled for that row.  */
164 struct partial_schedule
165 {
166   int ii;	/* Number of rows in the partial schedule.  */
167   int history;  /* Threshold for conflict checking using DFA.  */
168 
169   /* rows[i] points to linked list of insns scheduled in row i (0<=i<ii).  */
170   ps_insn_ptr *rows;
171 
172   /* All the moves added for this partial schedule.  Index X has
173      a ps_insn id of X + g->num_nodes.  */
174   vec<ps_reg_move_info> reg_moves;
175 
176   /*  rows_length[i] holds the number of instructions in the row.
177       It is used only (as an optimization) to back off quickly from
178       trying to schedule a node in a full row; that is, to avoid running
179       through futile DFA state transitions.  */
180   int *rows_length;
181 
182   /* The earliest absolute cycle of an insn in the partial schedule.  */
183   int min_cycle;
184 
185   /* The latest absolute cycle of an insn in the partial schedule.  */
186   int max_cycle;
187 
188   ddg_ptr g;	/* The DDG of the insns in the partial schedule.  */
189 
190   int stage_count;  /* The stage count of the partial schedule.  */
191 };
192 
193 
194 static partial_schedule_ptr create_partial_schedule (int ii, ddg_ptr, int history);
195 static void free_partial_schedule (partial_schedule_ptr);
196 static void reset_partial_schedule (partial_schedule_ptr, int new_ii);
197 void print_partial_schedule (partial_schedule_ptr, FILE *);
198 static void verify_partial_schedule (partial_schedule_ptr, sbitmap);
199 static ps_insn_ptr ps_add_node_check_conflicts (partial_schedule_ptr,
200 						int, int, sbitmap, sbitmap);
201 static void rotate_partial_schedule (partial_schedule_ptr, int);
202 void set_row_column_for_ps (partial_schedule_ptr);
203 static void ps_insert_empty_row (partial_schedule_ptr, int, sbitmap);
204 static int compute_split_row (sbitmap, int, int, int, ddg_node_ptr);
205 
206 
207 /* This page defines constants and structures for the modulo scheduling
208    driver.  */
209 
210 static int sms_order_nodes (ddg_ptr, int, int *, int *);
211 static void set_node_sched_params (ddg_ptr);
212 static partial_schedule_ptr sms_schedule_by_order (ddg_ptr, int, int, int *);
213 static void permute_partial_schedule (partial_schedule_ptr, rtx_insn *);
214 static int calculate_stage_count (partial_schedule_ptr, int);
215 static void calculate_must_precede_follow (ddg_node_ptr, int, int,
216 					   int, int, sbitmap, sbitmap, sbitmap);
217 static int get_sched_window (partial_schedule_ptr, ddg_node_ptr,
218 			     sbitmap, int, int *, int *, int *);
219 static bool try_scheduling_node_in_cycle (partial_schedule_ptr, int, int,
220 					  sbitmap, int *, sbitmap, sbitmap);
221 static void remove_node_from_ps (partial_schedule_ptr, ps_insn_ptr);
222 
223 #define NODE_ASAP(node) ((node)->aux.count)
224 
225 #define SCHED_PARAMS(x) (&node_sched_param_vec[x])
226 #define SCHED_TIME(x) (SCHED_PARAMS (x)->time)
227 #define SCHED_ROW(x) (SCHED_PARAMS (x)->row)
228 #define SCHED_STAGE(x) (SCHED_PARAMS (x)->stage)
229 #define SCHED_COLUMN(x) (SCHED_PARAMS (x)->column)
230 
231 /* The scheduling parameters held for each node.  */
232 typedef struct node_sched_params
233 {
234   int time;	/* The absolute scheduling cycle.  */
235 
236   int row;    /* Holds time % ii.  */
237   int stage;  /* Holds time / ii.  */
238 
239   /* The column of a node inside the ps.  If nodes u, v are on the same row,
240      u will precede v if column (u) < column (v).  */
241   int column;
242 } *node_sched_params_ptr;
243 
244 /* The following three functions are copied from the current scheduler
245    code in order to use sched_analyze() for computing the dependencies.
246    They are used when initializing the sched_info structure.  */
247 static const char *
sms_print_insn(const rtx_insn * insn,int aligned ATTRIBUTE_UNUSED)248 sms_print_insn (const rtx_insn *insn, int aligned ATTRIBUTE_UNUSED)
249 {
250   static char tmp[80];
251 
252   sprintf (tmp, "i%4d", INSN_UID (insn));
253   return tmp;
254 }
255 
256 static void
compute_jump_reg_dependencies(rtx insn ATTRIBUTE_UNUSED,regset used ATTRIBUTE_UNUSED)257 compute_jump_reg_dependencies (rtx insn ATTRIBUTE_UNUSED,
258 			       regset used ATTRIBUTE_UNUSED)
259 {
260 }
261 
262 static struct common_sched_info_def sms_common_sched_info;
263 
264 static struct sched_deps_info_def sms_sched_deps_info =
265   {
266     compute_jump_reg_dependencies,
267     NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL,
268     NULL,
269     0, 0, 0
270   };
271 
272 static struct haifa_sched_info sms_sched_info =
273 {
274   NULL,
275   NULL,
276   NULL,
277   NULL,
278   NULL,
279   sms_print_insn,
280   NULL,
281   NULL, /* insn_finishes_block_p */
282   NULL, NULL,
283   NULL, NULL,
284   0, 0,
285 
286   NULL, NULL, NULL, NULL,
287   NULL, NULL,
288   0
289 };
290 
291 /* Partial schedule instruction ID in PS is a register move.  Return
292    information about it.  */
293 static struct ps_reg_move_info *
ps_reg_move(partial_schedule_ptr ps,int id)294 ps_reg_move (partial_schedule_ptr ps, int id)
295 {
296   gcc_checking_assert (id >= ps->g->num_nodes);
297   return &ps->reg_moves[id - ps->g->num_nodes];
298 }
299 
300 /* Return the rtl instruction that is being scheduled by partial schedule
301    instruction ID, which belongs to schedule PS.  */
302 static rtx_insn *
ps_rtl_insn(partial_schedule_ptr ps,int id)303 ps_rtl_insn (partial_schedule_ptr ps, int id)
304 {
305   if (id < ps->g->num_nodes)
306     return ps->g->nodes[id].insn;
307   else
308     return ps_reg_move (ps, id)->insn;
309 }
310 
311 /* Partial schedule instruction ID, which belongs to PS, occurred in
312    the original (unscheduled) loop.  Return the first instruction
313    in the loop that was associated with ps_rtl_insn (PS, ID).
314    If the instruction had some notes before it, this is the first
315    of those notes.  */
316 static rtx_insn *
ps_first_note(partial_schedule_ptr ps,int id)317 ps_first_note (partial_schedule_ptr ps, int id)
318 {
319   gcc_assert (id < ps->g->num_nodes);
320   return ps->g->nodes[id].first_note;
321 }
322 
323 /* Return the number of consecutive stages that are occupied by
324    partial schedule instruction ID in PS.  */
325 static int
ps_num_consecutive_stages(partial_schedule_ptr ps,int id)326 ps_num_consecutive_stages (partial_schedule_ptr ps, int id)
327 {
328   if (id < ps->g->num_nodes)
329     return 1;
330   else
331     return ps_reg_move (ps, id)->num_consecutive_stages;
332 }
333 
334 /* Given HEAD and TAIL which are the first and last insns in a loop;
335    return the register which controls the loop.  Return zero if it has
336    more than one occurrence in the loop besides the control part or the
337    do-loop pattern is not of the form we expect.  */
338 static rtx
doloop_register_get(rtx_insn * head,rtx_insn * tail)339 doloop_register_get (rtx_insn *head, rtx_insn *tail)
340 {
341   rtx reg, condition;
342   rtx_insn *insn, *first_insn_not_to_check;
343 
344   if (!JUMP_P (tail))
345     return NULL_RTX;
346 
347   if (!targetm.code_for_doloop_end)
348     return NULL_RTX;
349 
350   /* TODO: Free SMS's dependence on doloop_condition_get.  */
351   condition = doloop_condition_get (tail);
352   if (! condition)
353     return NULL_RTX;
354 
355   if (REG_P (XEXP (condition, 0)))
356     reg = XEXP (condition, 0);
357   else if (GET_CODE (XEXP (condition, 0)) == PLUS
358 	   && REG_P (XEXP (XEXP (condition, 0), 0)))
359     reg = XEXP (XEXP (condition, 0), 0);
360   else
361     gcc_unreachable ();
362 
363   /* Check that the COUNT_REG has no other occurrences in the loop
364      until the decrement.  We assume the control part consists of
365      either a single (parallel) branch-on-count or a (non-parallel)
366      branch immediately preceded by a single (decrement) insn.  */
367   first_insn_not_to_check = (GET_CODE (PATTERN (tail)) == PARALLEL ? tail
368                              : prev_nondebug_insn (tail));
369 
370   for (insn = head; insn != first_insn_not_to_check; insn = NEXT_INSN (insn))
371     if (NONDEBUG_INSN_P (insn) && reg_mentioned_p (reg, insn))
372       {
373         if (dump_file)
374         {
375           fprintf (dump_file, "SMS count_reg found ");
376           print_rtl_single (dump_file, reg);
377           fprintf (dump_file, " outside control in insn:\n");
378           print_rtl_single (dump_file, insn);
379         }
380 
381         return NULL_RTX;
382       }
383 
384   return reg;
385 }
386 
387 /* Check if COUNT_REG is set to a constant in the PRE_HEADER block, so
388    that the number of iterations is a compile-time constant.  If so,
389    return the rtx_insn that sets COUNT_REG to a constant, and set COUNT to
390    this constant.  Otherwise return 0.  */
391 static rtx_insn *
const_iteration_count(rtx count_reg,basic_block pre_header,int64_t * count,bool * adjust_inplace)392 const_iteration_count (rtx count_reg, basic_block pre_header,
393 		       int64_t *count, bool* adjust_inplace)
394 {
395   rtx_insn *insn;
396   rtx_insn *head, *tail;
397 
398   *adjust_inplace = false;
399   bool read_after = false;
400 
401   if (! pre_header)
402     return NULL;
403 
404   get_ebb_head_tail (pre_header, pre_header, &head, &tail);
405 
406   for (insn = tail; insn != PREV_INSN (head); insn = PREV_INSN (insn))
407     if (single_set (insn) && rtx_equal_p (count_reg,
408 					  SET_DEST (single_set (insn))))
409       {
410 	rtx pat = single_set (insn);
411 
412 	if (CONST_INT_P (SET_SRC (pat)))
413 	  {
414 	    *count = INTVAL (SET_SRC (pat));
415 	    *adjust_inplace = !read_after;
416 	    return insn;
417 	  }
418 
419 	return NULL;
420       }
421     else if (NONDEBUG_INSN_P (insn) && reg_mentioned_p (count_reg, insn))
422       {
423 	read_after = true;
424 	if (reg_set_p (count_reg, insn))
425 	   break;
426       }
427 
428   return NULL;
429 }
430 
431 /* A very simple resource-based lower bound on the initiation interval.
432    ??? Improve the accuracy of this bound by considering the
433    utilization of various units.  */
434 static int
res_MII(ddg_ptr g)435 res_MII (ddg_ptr g)
436 {
437   if (targetm.sched.sms_res_mii)
438     return targetm.sched.sms_res_mii (g);
439 
440   return g->num_nodes / issue_rate;
441 }
442 
443 
444 /* A vector that contains the sched data for each ps_insn.  */
445 static vec<node_sched_params> node_sched_param_vec;
446 
447 /* Allocate sched_params for each node and initialize it.  */
448 static void
set_node_sched_params(ddg_ptr g)449 set_node_sched_params (ddg_ptr g)
450 {
451   node_sched_param_vec.truncate (0);
452   node_sched_param_vec.safe_grow_cleared (g->num_nodes, true);
453 }
454 
455 /* Make sure that node_sched_param_vec has an entry for every move in PS.  */
456 static void
extend_node_sched_params(partial_schedule_ptr ps)457 extend_node_sched_params (partial_schedule_ptr ps)
458 {
459   node_sched_param_vec.safe_grow_cleared (ps->g->num_nodes
460 					  + ps->reg_moves.length (), true);
461 }
462 
463 /* Update the sched_params (time, row and stage) for node U using the II,
464    the CYCLE of U and MIN_CYCLE.
465    We're not simply taking the following
466    SCHED_STAGE (u) = CALC_STAGE_COUNT (SCHED_TIME (u), min_cycle, ii);
467    because the stages may not be aligned on cycle 0.  */
468 static void
update_node_sched_params(int u,int ii,int cycle,int min_cycle)469 update_node_sched_params (int u, int ii, int cycle, int min_cycle)
470 {
471   int sc_until_cycle_zero;
472   int stage;
473 
474   SCHED_TIME (u) = cycle;
475   SCHED_ROW (u) = SMODULO (cycle, ii);
476 
477   /* The calculation of stage count is done adding the number
478      of stages before cycle zero and after cycle zero.  */
479   sc_until_cycle_zero = CALC_STAGE_COUNT (-1, min_cycle, ii);
480 
481   if (SCHED_TIME (u) < 0)
482     {
483       stage = CALC_STAGE_COUNT (-1, SCHED_TIME (u), ii);
484       SCHED_STAGE (u) = sc_until_cycle_zero - stage;
485     }
486   else
487     {
488       stage = CALC_STAGE_COUNT (SCHED_TIME (u), 0, ii);
489       SCHED_STAGE (u) = sc_until_cycle_zero + stage - 1;
490     }
491 }
492 
493 static void
print_node_sched_params(FILE * file,int num_nodes,partial_schedule_ptr ps)494 print_node_sched_params (FILE *file, int num_nodes, partial_schedule_ptr ps)
495 {
496   int i;
497 
498   if (! file)
499     return;
500   for (i = 0; i < num_nodes; i++)
501     {
502       node_sched_params_ptr nsp = SCHED_PARAMS (i);
503 
504       fprintf (file, "Node = %d; INSN = %d\n", i,
505 	       INSN_UID (ps_rtl_insn (ps, i)));
506       fprintf (file, " asap = %d:\n", NODE_ASAP (&ps->g->nodes[i]));
507       fprintf (file, " time = %d:\n", nsp->time);
508       fprintf (file, " stage = %d:\n", nsp->stage);
509     }
510 }
511 
512 /* Set SCHED_COLUMN for each instruction in row ROW of PS.  */
513 static void
set_columns_for_row(partial_schedule_ptr ps,int row)514 set_columns_for_row (partial_schedule_ptr ps, int row)
515 {
516   ps_insn_ptr cur_insn;
517   int column;
518 
519   column = 0;
520   for (cur_insn = ps->rows[row]; cur_insn; cur_insn = cur_insn->next_in_row)
521     SCHED_COLUMN (cur_insn->id) = column++;
522 }
523 
524 /* Set SCHED_COLUMN for each instruction in PS.  */
525 static void
set_columns_for_ps(partial_schedule_ptr ps)526 set_columns_for_ps (partial_schedule_ptr ps)
527 {
528   int row;
529 
530   for (row = 0; row < ps->ii; row++)
531     set_columns_for_row (ps, row);
532 }
533 
534 /* Try to schedule the move with ps_insn identifier I_REG_MOVE in PS.
535    Its single predecessor has already been scheduled, as has its
536    ddg node successors.  (The move may have also another move as its
537    successor, in which case that successor will be scheduled later.)
538 
539    The move is part of a chain that satisfies register dependencies
540    between a producing ddg node and various consuming ddg nodes.
541    If some of these dependencies have a distance of 1 (meaning that
542    the use is upward-exposed) then DISTANCE1_USES is nonnull and
543    contains the set of uses with distance-1 dependencies.
544    DISTANCE1_USES is null otherwise.
545 
546    MUST_FOLLOW is a scratch bitmap that is big enough to hold
547    all current ps_insn ids.
548 
549    Return true on success.  */
550 static bool
schedule_reg_move(partial_schedule_ptr ps,int i_reg_move,sbitmap distance1_uses,sbitmap must_follow)551 schedule_reg_move (partial_schedule_ptr ps, int i_reg_move,
552 		   sbitmap distance1_uses, sbitmap must_follow)
553 {
554   unsigned int u;
555   int this_time, this_distance, this_start, this_end, this_latency;
556   int start, end, c, ii;
557   sbitmap_iterator sbi;
558   ps_reg_move_info *move;
559   rtx_insn *this_insn;
560   ps_insn_ptr psi;
561 
562   move = ps_reg_move (ps, i_reg_move);
563   ii = ps->ii;
564   if (dump_file)
565     {
566       fprintf (dump_file, "Scheduling register move INSN %d; ii = %d"
567 	       ", min cycle = %d\n\n", INSN_UID (move->insn), ii,
568 	       PS_MIN_CYCLE (ps));
569       print_rtl_single (dump_file, move->insn);
570       fprintf (dump_file, "\n%11s %11s %5s\n", "start", "end", "time");
571       fprintf (dump_file, "=========== =========== =====\n");
572     }
573 
574   start = INT_MIN;
575   end = INT_MAX;
576 
577   /* For dependencies of distance 1 between a producer ddg node A
578      and consumer ddg node B, we have a chain of dependencies:
579 
580         A --(T,L1,1)--> M1 --(T,L2,0)--> M2 ... --(T,Ln,0)--> B
581 
582      where Mi is the ith move.  For dependencies of distance 0 between
583      a producer ddg node A and consumer ddg node C, we have a chain of
584      dependencies:
585 
586         A --(T,L1',0)--> M1' --(T,L2',0)--> M2' ... --(T,Ln',0)--> C
587 
588      where Mi' occupies the same position as Mi but occurs a stage later.
589      We can only schedule each move once, so if we have both types of
590      chain, we model the second as:
591 
592         A --(T,L1',1)--> M1 --(T,L2',0)--> M2 ... --(T,Ln',-1)--> C
593 
594      First handle the dependencies between the previously-scheduled
595      predecessor and the move.  */
596   this_insn = ps_rtl_insn (ps, move->def);
597   this_latency = insn_latency (this_insn, move->insn);
598   this_distance = distance1_uses && move->def < ps->g->num_nodes ? 1 : 0;
599   this_time = SCHED_TIME (move->def) - this_distance * ii;
600   this_start = this_time + this_latency;
601   this_end = this_time + ii;
602   if (dump_file)
603     fprintf (dump_file, "%11d %11d %5d %d --(T,%d,%d)--> %d\n",
604 	     this_start, this_end, SCHED_TIME (move->def),
605 	     INSN_UID (this_insn), this_latency, this_distance,
606 	     INSN_UID (move->insn));
607 
608   if (start < this_start)
609     start = this_start;
610   if (end > this_end)
611     end = this_end;
612 
613   /* Handle the dependencies between the move and previously-scheduled
614      successors.  */
615   EXECUTE_IF_SET_IN_BITMAP (move->uses, 0, u, sbi)
616     {
617       this_insn = ps_rtl_insn (ps, u);
618       this_latency = insn_latency (move->insn, this_insn);
619       if (distance1_uses && !bitmap_bit_p (distance1_uses, u))
620 	this_distance = -1;
621       else
622 	this_distance = 0;
623       this_time = SCHED_TIME (u) + this_distance * ii;
624       this_start = this_time - ii;
625       this_end = this_time - this_latency;
626       if (dump_file)
627 	fprintf (dump_file, "%11d %11d %5d %d --(T,%d,%d)--> %d\n",
628 		 this_start, this_end, SCHED_TIME (u), INSN_UID (move->insn),
629 		 this_latency, this_distance, INSN_UID (this_insn));
630 
631       if (start < this_start)
632 	start = this_start;
633       if (end > this_end)
634 	end = this_end;
635     }
636 
637   if (dump_file)
638     {
639       fprintf (dump_file, "----------- ----------- -----\n");
640       fprintf (dump_file, "%11d %11d %5s %s\n", start, end, "", "(max, min)");
641     }
642 
643   bitmap_clear (must_follow);
644   bitmap_set_bit (must_follow, move->def);
645 
646   start = MAX (start, end - (ii - 1));
647   for (c = end; c >= start; c--)
648     {
649       psi = ps_add_node_check_conflicts (ps, i_reg_move, c,
650 					 move->uses, must_follow);
651       if (psi)
652 	{
653 	  update_node_sched_params (i_reg_move, ii, c, PS_MIN_CYCLE (ps));
654 	  if (dump_file)
655 	    fprintf (dump_file, "\nScheduled register move INSN %d at"
656 		     " time %d, row %d\n\n", INSN_UID (move->insn), c,
657 		     SCHED_ROW (i_reg_move));
658 	  return true;
659 	}
660     }
661 
662   if (dump_file)
663     fprintf (dump_file, "\nNo available slot\n\n");
664 
665   return false;
666 }
667 
668 /*
669    Breaking intra-loop register anti-dependences:
670    Each intra-loop register anti-dependence implies a cross-iteration true
671    dependence of distance 1. Therefore, we can remove such false dependencies
672    and figure out if the partial schedule broke them by checking if (for a
673    true-dependence of distance 1): SCHED_TIME (def) < SCHED_TIME (use) and
674    if so generate a register move.   The number of such moves is equal to:
675               SCHED_TIME (use) - SCHED_TIME (def)       { 0 broken
676    nreg_moves = ----------------------------------- + 1 - {   dependence.
677                             ii                          { 1 if not.
678 */
679 static bool
schedule_reg_moves(partial_schedule_ptr ps)680 schedule_reg_moves (partial_schedule_ptr ps)
681 {
682   ddg_ptr g = ps->g;
683   int ii = ps->ii;
684   int i;
685 
686   for (i = 0; i < g->num_nodes; i++)
687     {
688       ddg_node_ptr u = &g->nodes[i];
689       ddg_edge_ptr e;
690       int nreg_moves = 0, i_reg_move;
691       rtx prev_reg, old_reg;
692       int first_move;
693       int distances[2];
694       sbitmap distance1_uses;
695       rtx set = single_set (u->insn);
696 
697       /* Skip instructions that do not set a register.  */
698       if (set && !REG_P (SET_DEST (set)))
699         continue;
700 
701       /* Compute the number of reg_moves needed for u, by looking at life
702 	 ranges started at u (excluding self-loops).  */
703       distances[0] = distances[1] = false;
704       for (e = u->out; e; e = e->next_out)
705 	if (e->type == TRUE_DEP && e->dest != e->src)
706 	  {
707 	    int nreg_moves4e = (SCHED_TIME (e->dest->cuid)
708 				- SCHED_TIME (e->src->cuid)) / ii;
709 
710             if (e->distance == 1)
711               nreg_moves4e = (SCHED_TIME (e->dest->cuid)
712 			      - SCHED_TIME (e->src->cuid) + ii) / ii;
713 
714 	    /* If dest precedes src in the schedule of the kernel, then dest
715 	       will read before src writes and we can save one reg_copy.  */
716 	    if (SCHED_ROW (e->dest->cuid) == SCHED_ROW (e->src->cuid)
717 		&& SCHED_COLUMN (e->dest->cuid) < SCHED_COLUMN (e->src->cuid))
718 	      nreg_moves4e--;
719 
720             if (nreg_moves4e >= 1)
721 	      {
722 		/* !single_set instructions are not supported yet and
723 		   thus we do not except to encounter them in the loop
724 		   except from the doloop part.  For the latter case
725 		   we assume no regmoves are generated as the doloop
726 		   instructions are tied to the branch with an edge.  */
727 		gcc_assert (set);
728 		/* If the instruction contains auto-inc register then
729 		   validate that the regmov is being generated for the
730 		   target regsiter rather then the inc'ed register.	*/
731 		gcc_assert (!autoinc_var_is_used_p (u->insn, e->dest->insn));
732 	      }
733 
734 	    if (nreg_moves4e)
735 	      {
736 		gcc_assert (e->distance < 2);
737 		distances[e->distance] = true;
738 	      }
739 	    nreg_moves = MAX (nreg_moves, nreg_moves4e);
740 	  }
741 
742       if (nreg_moves == 0)
743 	continue;
744 
745       /* Create NREG_MOVES register moves.  */
746       first_move = ps->reg_moves.length ();
747       ps->reg_moves.safe_grow_cleared (first_move + nreg_moves, true);
748       extend_node_sched_params (ps);
749 
750       /* Record the moves associated with this node.  */
751       first_move += ps->g->num_nodes;
752 
753       /* Generate each move.  */
754       old_reg = prev_reg = SET_DEST (set);
755       if (HARD_REGISTER_P (old_reg))
756 	return false;
757 
758       for (i_reg_move = 0; i_reg_move < nreg_moves; i_reg_move++)
759 	{
760 	  ps_reg_move_info *move = ps_reg_move (ps, first_move + i_reg_move);
761 
762 	  move->def = i_reg_move > 0 ? first_move + i_reg_move - 1 : i;
763 	  move->uses = sbitmap_alloc (first_move + nreg_moves);
764 	  move->old_reg = old_reg;
765 	  move->new_reg = gen_reg_rtx (GET_MODE (prev_reg));
766 	  move->num_consecutive_stages = distances[0] && distances[1] ? 2 : 1;
767 	  move->insn = gen_move_insn (move->new_reg, copy_rtx (prev_reg));
768 	  bitmap_clear (move->uses);
769 
770 	  prev_reg = move->new_reg;
771 	}
772 
773       distance1_uses = distances[1] ? sbitmap_alloc (g->num_nodes) : NULL;
774 
775       if (distance1_uses)
776 	bitmap_clear (distance1_uses);
777 
778       /* Every use of the register defined by node may require a different
779 	 copy of this register, depending on the time the use is scheduled.
780 	 Record which uses require which move results.  */
781       for (e = u->out; e; e = e->next_out)
782 	if (e->type == TRUE_DEP && e->dest != e->src)
783 	  {
784 	    int dest_copy = (SCHED_TIME (e->dest->cuid)
785 			     - SCHED_TIME (e->src->cuid)) / ii;
786 
787 	    if (e->distance == 1)
788 	      dest_copy = (SCHED_TIME (e->dest->cuid)
789 			   - SCHED_TIME (e->src->cuid) + ii) / ii;
790 
791 	    if (SCHED_ROW (e->dest->cuid) == SCHED_ROW (e->src->cuid)
792 		&& SCHED_COLUMN (e->dest->cuid) < SCHED_COLUMN (e->src->cuid))
793 	      dest_copy--;
794 
795 	    if (dest_copy)
796 	      {
797 		ps_reg_move_info *move;
798 
799 		move = ps_reg_move (ps, first_move + dest_copy - 1);
800 		bitmap_set_bit (move->uses, e->dest->cuid);
801 		if (e->distance == 1)
802 		  bitmap_set_bit (distance1_uses, e->dest->cuid);
803 	      }
804 	  }
805 
806       auto_sbitmap must_follow (first_move + nreg_moves);
807       for (i_reg_move = 0; i_reg_move < nreg_moves; i_reg_move++)
808 	if (!schedule_reg_move (ps, first_move + i_reg_move,
809 				distance1_uses, must_follow))
810 	  break;
811       if (distance1_uses)
812 	sbitmap_free (distance1_uses);
813       if (i_reg_move < nreg_moves)
814 	return false;
815     }
816   return true;
817 }
818 
819 /* Emit the moves associated with PS.  Apply the substitutions
820    associated with them.  */
821 static void
apply_reg_moves(partial_schedule_ptr ps)822 apply_reg_moves (partial_schedule_ptr ps)
823 {
824   ps_reg_move_info *move;
825   int i;
826 
827   FOR_EACH_VEC_ELT (ps->reg_moves, i, move)
828     {
829       unsigned int i_use;
830       sbitmap_iterator sbi;
831 
832       EXECUTE_IF_SET_IN_BITMAP (move->uses, 0, i_use, sbi)
833 	{
834 	  replace_rtx (ps->g->nodes[i_use].insn, move->old_reg, move->new_reg);
835 	  df_insn_rescan (ps->g->nodes[i_use].insn);
836 	}
837     }
838 }
839 
840 /* Bump the SCHED_TIMEs of all nodes by AMOUNT.  Set the values of
841    SCHED_ROW and SCHED_STAGE.  Instruction scheduled on cycle AMOUNT
842    will move to cycle zero.  */
843 static void
reset_sched_times(partial_schedule_ptr ps,int amount)844 reset_sched_times (partial_schedule_ptr ps, int amount)
845 {
846   int row;
847   int ii = ps->ii;
848   ps_insn_ptr crr_insn;
849 
850   for (row = 0; row < ii; row++)
851     for (crr_insn = ps->rows[row]; crr_insn; crr_insn = crr_insn->next_in_row)
852       {
853 	int u = crr_insn->id;
854 	int normalized_time = SCHED_TIME (u) - amount;
855 	int new_min_cycle = PS_MIN_CYCLE (ps) - amount;
856 
857         if (dump_file)
858           {
859             /* Print the scheduling times after the rotation.  */
860 	    rtx_insn *insn = ps_rtl_insn (ps, u);
861 
862             fprintf (dump_file, "crr_insn->node=%d (insn id %d), "
863                      "crr_insn->cycle=%d, min_cycle=%d", u,
864                      INSN_UID (insn), normalized_time, new_min_cycle);
865             if (JUMP_P (insn))
866               fprintf (dump_file, " (branch)");
867             fprintf (dump_file, "\n");
868           }
869 
870 	gcc_assert (SCHED_TIME (u) >= ps->min_cycle);
871 	gcc_assert (SCHED_TIME (u) <= ps->max_cycle);
872 
873 	crr_insn->cycle = normalized_time;
874 	update_node_sched_params (u, ii, normalized_time, new_min_cycle);
875       }
876 }
877 
878 /* Permute the insns according to their order in PS, from row 0 to
879    row ii-1, and position them right before LAST.  This schedules
880    the insns of the loop kernel.  */
881 static void
permute_partial_schedule(partial_schedule_ptr ps,rtx_insn * last)882 permute_partial_schedule (partial_schedule_ptr ps, rtx_insn *last)
883 {
884   int ii = ps->ii;
885   int row;
886   ps_insn_ptr ps_ij;
887 
888   for (row = 0; row < ii ; row++)
889     for (ps_ij = ps->rows[row]; ps_ij; ps_ij = ps_ij->next_in_row)
890       {
891 	rtx_insn *insn = ps_rtl_insn (ps, ps_ij->id);
892 
893 	if (PREV_INSN (last) != insn)
894 	  {
895 	    if (ps_ij->id < ps->g->num_nodes)
896 	      reorder_insns_nobb (ps_first_note (ps, ps_ij->id), insn,
897 				  PREV_INSN (last));
898 	    else
899 	      add_insn_before (insn, last, NULL);
900 	  }
901       }
902 }
903 
904 /* Set bitmaps TMP_FOLLOW and TMP_PRECEDE to MUST_FOLLOW and MUST_PRECEDE
905    respectively only if cycle C falls on the border of the scheduling
906    window boundaries marked by START and END cycles.  STEP is the
907    direction of the window.  */
908 static inline void
set_must_precede_follow(sbitmap * tmp_follow,sbitmap must_follow,sbitmap * tmp_precede,sbitmap must_precede,int c,int start,int end,int step)909 set_must_precede_follow (sbitmap *tmp_follow, sbitmap must_follow,
910 			 sbitmap *tmp_precede, sbitmap must_precede, int c,
911 			 int start, int end, int step)
912 {
913   *tmp_precede = NULL;
914   *tmp_follow = NULL;
915 
916   if (c == start)
917     {
918       if (step == 1)
919 	*tmp_precede = must_precede;
920       else			/* step == -1.  */
921 	*tmp_follow = must_follow;
922     }
923   if (c == end - step)
924     {
925       if (step == 1)
926 	*tmp_follow = must_follow;
927       else			/* step == -1.  */
928 	*tmp_precede = must_precede;
929     }
930 
931 }
932 
933 /* Return True if the branch can be moved to row ii-1 while
934    normalizing the partial schedule PS to start from cycle zero and thus
935    optimize the SC.  Otherwise return False.  */
936 static bool
optimize_sc(partial_schedule_ptr ps,ddg_ptr g)937 optimize_sc (partial_schedule_ptr ps, ddg_ptr g)
938 {
939   int amount = PS_MIN_CYCLE (ps);
940   int start, end, step;
941   int ii = ps->ii;
942   bool ok = false;
943   int stage_count, stage_count_curr;
944 
945   /* Compare the SC after normalization and SC after bringing the branch
946      to row ii-1.  If they are equal just bail out.  */
947   stage_count = calculate_stage_count (ps, amount);
948   stage_count_curr =
949     calculate_stage_count (ps, SCHED_TIME (g->closing_branch->cuid) - (ii - 1));
950 
951   if (stage_count == stage_count_curr)
952     {
953       if (dump_file)
954 	fprintf (dump_file, "SMS SC already optimized.\n");
955 
956       return false;
957     }
958 
959   if (dump_file)
960     {
961       fprintf (dump_file, "SMS Trying to optimize branch location\n");
962       fprintf (dump_file, "SMS partial schedule before trial:\n");
963       print_partial_schedule (ps, dump_file);
964     }
965 
966   /* First, normalize the partial scheduling.  */
967   reset_sched_times (ps, amount);
968   rotate_partial_schedule (ps, amount);
969   if (dump_file)
970     {
971       fprintf (dump_file,
972 	       "SMS partial schedule after normalization (ii, %d, SC %d):\n",
973 	       ii, stage_count);
974       print_partial_schedule (ps, dump_file);
975     }
976 
977   if (SMODULO (SCHED_TIME (g->closing_branch->cuid), ii) == ii - 1)
978     return true;
979 
980   auto_sbitmap sched_nodes (g->num_nodes);
981   bitmap_ones (sched_nodes);
982 
983   /* Calculate the new placement of the branch.  It should be in row
984      ii-1 and fall into it's scheduling window.  */
985   if (get_sched_window (ps, g->closing_branch, sched_nodes, ii, &start,
986 			&step, &end) == 0)
987     {
988       bool success;
989       ps_insn_ptr next_ps_i;
990       int branch_cycle = SCHED_TIME (g->closing_branch->cuid);
991       int row = SMODULO (branch_cycle, ps->ii);
992       int num_splits = 0;
993       sbitmap tmp_precede, tmp_follow;
994       int min_cycle, c;
995 
996       if (dump_file)
997 	fprintf (dump_file, "\nTrying to schedule node %d "
998 		 "INSN = %d  in (%d .. %d) step %d\n",
999 		 g->closing_branch->cuid,
1000 		 (INSN_UID (g->closing_branch->insn)), start, end, step);
1001 
1002       gcc_assert ((step > 0 && start < end) || (step < 0 && start > end));
1003       if (step == 1)
1004 	{
1005 	  c = start + ii - SMODULO (start, ii) - 1;
1006 	  gcc_assert (c >= start);
1007 	  if (c >= end)
1008 	    {
1009 	      if (dump_file)
1010 		fprintf (dump_file,
1011 			 "SMS failed to schedule branch at cycle: %d\n", c);
1012 	      return false;
1013 	    }
1014 	}
1015       else
1016 	{
1017 	  c = start - SMODULO (start, ii) - 1;
1018 	  gcc_assert (c <= start);
1019 
1020 	  if (c <= end)
1021 	    {
1022 	      if (dump_file)
1023 		fprintf (dump_file,
1024 			 "SMS failed to schedule branch at cycle: %d\n", c);
1025 	      return false;
1026 	    }
1027 	}
1028 
1029       auto_sbitmap must_precede (g->num_nodes);
1030       auto_sbitmap must_follow (g->num_nodes);
1031 
1032       /* Try to schedule the branch is it's new cycle.  */
1033       calculate_must_precede_follow (g->closing_branch, start, end,
1034 				     step, ii, sched_nodes,
1035 				     must_precede, must_follow);
1036 
1037       set_must_precede_follow (&tmp_follow, must_follow, &tmp_precede,
1038 			       must_precede, c, start, end, step);
1039 
1040       /* Find the element in the partial schedule related to the closing
1041          branch so we can remove it from it's current cycle.  */
1042       for (next_ps_i = ps->rows[row];
1043 	   next_ps_i; next_ps_i = next_ps_i->next_in_row)
1044 	if (next_ps_i->id == g->closing_branch->cuid)
1045 	  break;
1046 
1047       min_cycle = PS_MIN_CYCLE (ps) - SMODULO (PS_MIN_CYCLE (ps), ps->ii);
1048       remove_node_from_ps (ps, next_ps_i);
1049       success =
1050 	try_scheduling_node_in_cycle (ps, g->closing_branch->cuid, c,
1051 				      sched_nodes, &num_splits,
1052 				      tmp_precede, tmp_follow);
1053       gcc_assert (num_splits == 0);
1054       if (!success)
1055 	{
1056 	  if (dump_file)
1057 	    fprintf (dump_file,
1058 		     "SMS failed to schedule branch at cycle: %d, "
1059 		     "bringing it back to cycle %d\n", c, branch_cycle);
1060 
1061 	  /* The branch was failed to be placed in row ii - 1.
1062 	     Put it back in it's original place in the partial
1063 	     schedualing.  */
1064 	  set_must_precede_follow (&tmp_follow, must_follow, &tmp_precede,
1065 				   must_precede, branch_cycle, start, end,
1066 				   step);
1067 	  success =
1068 	    try_scheduling_node_in_cycle (ps, g->closing_branch->cuid,
1069 					  branch_cycle, sched_nodes,
1070 					  &num_splits, tmp_precede,
1071 					  tmp_follow);
1072 	  gcc_assert (success && (num_splits == 0));
1073 	  ok = false;
1074 	}
1075       else
1076 	{
1077 	  /* The branch is placed in row ii - 1.  */
1078 	  if (dump_file)
1079 	    fprintf (dump_file,
1080 		     "SMS success in moving branch to cycle %d\n", c);
1081 
1082 	  update_node_sched_params (g->closing_branch->cuid, ii, c,
1083 				    PS_MIN_CYCLE (ps));
1084 	  ok = true;
1085 	}
1086 
1087       /* This might have been added to a new first stage.  */
1088       if (PS_MIN_CYCLE (ps) < min_cycle)
1089 	reset_sched_times (ps, 0);
1090     }
1091 
1092   return ok;
1093 }
1094 
1095 static void
duplicate_insns_of_cycles(partial_schedule_ptr ps,int from_stage,int to_stage,rtx count_reg,class loop * loop)1096 duplicate_insns_of_cycles (partial_schedule_ptr ps, int from_stage,
1097 			   int to_stage, rtx count_reg, class loop *loop)
1098 {
1099   int row;
1100   ps_insn_ptr ps_ij;
1101   copy_bb_data id;
1102 
1103   for (row = 0; row < ps->ii; row++)
1104     for (ps_ij = ps->rows[row]; ps_ij; ps_ij = ps_ij->next_in_row)
1105       {
1106 	int u = ps_ij->id;
1107 	int first_u, last_u;
1108 	rtx_insn *u_insn;
1109 
1110         /* Do not duplicate any insn which refers to count_reg as it
1111            belongs to the control part.
1112            The closing branch is scheduled as well and thus should
1113            be ignored.
1114            TODO: This should be done by analyzing the control part of
1115            the loop.  */
1116 	u_insn = ps_rtl_insn (ps, u);
1117         if (reg_mentioned_p (count_reg, u_insn)
1118             || JUMP_P (u_insn))
1119           continue;
1120 
1121 	first_u = SCHED_STAGE (u);
1122 	last_u = first_u + ps_num_consecutive_stages (ps, u) - 1;
1123 	if (from_stage <= last_u && to_stage >= first_u)
1124 	  {
1125 	    if (u < ps->g->num_nodes)
1126 	      duplicate_insn_chain (ps_first_note (ps, u), u_insn,
1127 				    loop, &id);
1128 	    else
1129 	      emit_insn (copy_rtx (PATTERN (u_insn)));
1130 	  }
1131       }
1132 }
1133 
1134 
1135 /* Generate the instructions (including reg_moves) for prolog & epilog.  */
1136 static void
generate_prolog_epilog(partial_schedule_ptr ps,class loop * loop,rtx count_reg,bool adjust_init)1137 generate_prolog_epilog (partial_schedule_ptr ps, class loop *loop,
1138 			rtx count_reg, bool adjust_init)
1139 {
1140   int i;
1141   int last_stage = PS_STAGE_COUNT (ps) - 1;
1142   edge e;
1143 
1144   /* Generate the prolog, inserting its insns on the loop-entry edge.  */
1145   start_sequence ();
1146 
1147   if (adjust_init)
1148     {
1149       /* Generate instructions at the beginning of the prolog to
1150 	 adjust the loop count by STAGE_COUNT.  If loop count is constant
1151 	 and it not used anywhere in prologue, this constant is adjusted by
1152 	 STAGE_COUNT outside of generate_prolog_epilog function.  */
1153       rtx sub_reg = NULL_RTX;
1154 
1155       sub_reg = expand_simple_binop (GET_MODE (count_reg), MINUS, count_reg,
1156 				     gen_int_mode (last_stage,
1157 						   GET_MODE (count_reg)),
1158                                      count_reg, 1, OPTAB_DIRECT);
1159       gcc_assert (REG_P (sub_reg));
1160       if (REGNO (sub_reg) != REGNO (count_reg))
1161         emit_move_insn (count_reg, sub_reg);
1162     }
1163 
1164   for (i = 0; i < last_stage; i++)
1165     duplicate_insns_of_cycles (ps, 0, i, count_reg, loop);
1166 
1167   /* Put the prolog on the entry edge.  */
1168   e = loop_preheader_edge (loop);
1169   split_edge_and_insert (e, get_insns ());
1170   if (!flag_resched_modulo_sched)
1171     e->dest->flags |= BB_DISABLE_SCHEDULE;
1172 
1173   end_sequence ();
1174 
1175   /* Generate the epilog, inserting its insns on the loop-exit edge.  */
1176   start_sequence ();
1177 
1178   for (i = 0; i < last_stage; i++)
1179     duplicate_insns_of_cycles (ps, i + 1, last_stage, count_reg, loop);
1180 
1181   /* Put the epilogue on the exit edge.  */
1182   gcc_assert (single_exit (loop));
1183   e = single_exit (loop);
1184   split_edge_and_insert (e, get_insns ());
1185   if (!flag_resched_modulo_sched)
1186     e->dest->flags |= BB_DISABLE_SCHEDULE;
1187 
1188   end_sequence ();
1189 }
1190 
1191 /* Mark LOOP as software pipelined so the later
1192    scheduling passes don't touch it.  */
1193 static void
mark_loop_unsched(class loop * loop)1194 mark_loop_unsched (class loop *loop)
1195 {
1196   unsigned i;
1197   basic_block *bbs = get_loop_body (loop);
1198 
1199   for (i = 0; i < loop->num_nodes; i++)
1200     bbs[i]->flags |= BB_DISABLE_SCHEDULE;
1201 
1202   free (bbs);
1203 }
1204 
1205 /* Return true if all the BBs of the loop are empty except the
1206    loop header.  */
1207 static bool
loop_single_full_bb_p(class loop * loop)1208 loop_single_full_bb_p (class loop *loop)
1209 {
1210   unsigned i;
1211   basic_block *bbs = get_loop_body (loop);
1212 
1213   for (i = 0; i < loop->num_nodes ; i++)
1214     {
1215       rtx_insn *head, *tail;
1216       bool empty_bb = true;
1217 
1218       if (bbs[i] == loop->header)
1219         continue;
1220 
1221       /* Make sure that basic blocks other than the header
1222          have only notes labels or jumps.  */
1223       get_ebb_head_tail (bbs[i], bbs[i], &head, &tail);
1224       for (; head != NEXT_INSN (tail); head = NEXT_INSN (head))
1225         {
1226           if (NOTE_P (head) || LABEL_P (head)
1227  	      || (INSN_P (head) && (DEBUG_INSN_P (head) || JUMP_P (head))))
1228  	    continue;
1229  	  empty_bb = false;
1230  	  break;
1231         }
1232 
1233       if (! empty_bb)
1234         {
1235           free (bbs);
1236           return false;
1237         }
1238     }
1239   free (bbs);
1240   return true;
1241 }
1242 
1243 /* Dump file:line from INSN's location info to dump_file.  */
1244 
1245 static void
dump_insn_location(rtx_insn * insn)1246 dump_insn_location (rtx_insn *insn)
1247 {
1248   if (dump_file && INSN_HAS_LOCATION (insn))
1249     {
1250       expanded_location xloc = insn_location (insn);
1251       fprintf (dump_file, " %s:%i", xloc.file, xloc.line);
1252     }
1253 }
1254 
1255 /* A simple loop from SMS point of view; it is a loop that is composed of
1256    either a single basic block or two BBs - a header and a latch.  */
1257 #define SIMPLE_SMS_LOOP_P(loop) ((loop->num_nodes < 3 ) 		    \
1258 				  && (EDGE_COUNT (loop->latch->preds) == 1) \
1259                                   && (EDGE_COUNT (loop->latch->succs) == 1))
1260 
1261 /* Return true if the loop is in its canonical form and false if not.
1262    i.e. SIMPLE_SMS_LOOP_P and have one preheader block, and single exit.  */
1263 static bool
loop_canon_p(class loop * loop)1264 loop_canon_p (class loop *loop)
1265 {
1266 
1267   if (loop->inner || !loop_outer (loop))
1268   {
1269     if (dump_file)
1270       fprintf (dump_file, "SMS loop inner or !loop_outer\n");
1271     return false;
1272   }
1273 
1274   if (!single_exit (loop))
1275     {
1276       if (dump_file)
1277 	{
1278 	  rtx_insn *insn = BB_END (loop->header);
1279 
1280 	  fprintf (dump_file, "SMS loop many exits");
1281 	  dump_insn_location (insn);
1282 	  fprintf (dump_file, "\n");
1283 	}
1284       return false;
1285     }
1286 
1287   if (! SIMPLE_SMS_LOOP_P (loop) && ! loop_single_full_bb_p (loop))
1288     {
1289       if (dump_file)
1290 	{
1291 	  rtx_insn *insn = BB_END (loop->header);
1292 
1293 	  fprintf (dump_file, "SMS loop many BBs.");
1294 	  dump_insn_location (insn);
1295 	  fprintf (dump_file, "\n");
1296 	}
1297       return false;
1298     }
1299 
1300     return true;
1301 }
1302 
1303 /* If there are more than one entry for the loop,
1304    make it one by splitting the first entry edge and
1305    redirecting the others to the new BB.  */
1306 static void
canon_loop(class loop * loop)1307 canon_loop (class loop *loop)
1308 {
1309   edge e;
1310   edge_iterator i;
1311 
1312   /* Avoid annoying special cases of edges going to exit
1313      block.  */
1314   FOR_EACH_EDGE (e, i, EXIT_BLOCK_PTR_FOR_FN (cfun)->preds)
1315     if ((e->flags & EDGE_FALLTHRU) && (EDGE_COUNT (e->src->succs) > 1))
1316       split_edge (e);
1317 
1318   if (loop->latch == loop->header
1319       || EDGE_COUNT (loop->latch->succs) > 1)
1320     {
1321       FOR_EACH_EDGE (e, i, loop->header->preds)
1322         if (e->src == loop->latch)
1323           break;
1324       split_edge (e);
1325     }
1326 }
1327 
1328 /* Setup infos.  */
1329 static void
setup_sched_infos(void)1330 setup_sched_infos (void)
1331 {
1332   memcpy (&sms_common_sched_info, &haifa_common_sched_info,
1333 	  sizeof (sms_common_sched_info));
1334   sms_common_sched_info.sched_pass_id = SCHED_SMS_PASS;
1335   common_sched_info = &sms_common_sched_info;
1336 
1337   sched_deps_info = &sms_sched_deps_info;
1338   current_sched_info = &sms_sched_info;
1339 }
1340 
1341 /* Probability in % that the sms-ed loop rolls enough so that optimized
1342    version may be entered.  Just a guess.  */
1343 #define PROB_SMS_ENOUGH_ITERATIONS 80
1344 
1345 /* Main entry point, perform SMS scheduling on the loops of the function
1346    that consist of single basic blocks.  */
1347 static void
sms_schedule(void)1348 sms_schedule (void)
1349 {
1350   rtx_insn *insn;
1351   ddg_ptr *g_arr, g;
1352   int * node_order;
1353   int maxii, max_asap;
1354   partial_schedule_ptr ps;
1355   basic_block bb = NULL;
1356   basic_block condition_bb = NULL;
1357   edge latch_edge;
1358   HOST_WIDE_INT trip_count, max_trip_count;
1359   HARD_REG_SET prohibited_regs;
1360 
1361   loop_optimizer_init (LOOPS_HAVE_PREHEADERS
1362 		       | LOOPS_HAVE_RECORDED_EXITS);
1363   if (number_of_loops (cfun) <= 1)
1364     {
1365       loop_optimizer_finalize ();
1366       return;  /* There are no loops to schedule.  */
1367     }
1368 
1369   /* Initialize issue_rate.  */
1370   if (targetm.sched.issue_rate)
1371     {
1372       int temp = reload_completed;
1373 
1374       reload_completed = 1;
1375       issue_rate = targetm.sched.issue_rate ();
1376       reload_completed = temp;
1377     }
1378   else
1379     issue_rate = 1;
1380 
1381   /* Initialize the scheduler.  */
1382   setup_sched_infos ();
1383   haifa_sched_init ();
1384 
1385   /* Allocate memory to hold the DDG array one entry for each loop.
1386      We use loop->num as index into this array.  */
1387   g_arr = XCNEWVEC (ddg_ptr, number_of_loops (cfun));
1388 
1389   REG_SET_TO_HARD_REG_SET (prohibited_regs, &df->regular_block_artificial_uses);
1390 
1391   if (dump_file)
1392   {
1393     fprintf (dump_file, "\n\nSMS analysis phase\n");
1394     fprintf (dump_file, "===================\n\n");
1395   }
1396 
1397   /* Build DDGs for all the relevant loops and hold them in G_ARR
1398      indexed by the loop index.  */
1399   for (auto loop : loops_list (cfun, 0))
1400     {
1401       rtx_insn *head, *tail;
1402       rtx count_reg;
1403 
1404       /* For debugging.  */
1405       if (dbg_cnt (sms_sched_loop) == false)
1406         {
1407           if (dump_file)
1408             fprintf (dump_file, "SMS reached max limit... \n");
1409 
1410 	  break;
1411         }
1412 
1413       if (dump_file)
1414 	{
1415 	  rtx_insn *insn = BB_END (loop->header);
1416 
1417 	  fprintf (dump_file, "SMS loop num: %d", loop->num);
1418 	  dump_insn_location (insn);
1419 	  fprintf (dump_file, "\n");
1420 	}
1421 
1422       if (! loop_canon_p (loop))
1423         continue;
1424 
1425       if (! loop_single_full_bb_p (loop))
1426       {
1427         if (dump_file)
1428           fprintf (dump_file, "SMS not loop_single_full_bb_p\n");
1429 	continue;
1430       }
1431 
1432       bb = loop->header;
1433 
1434       get_ebb_head_tail (bb, bb, &head, &tail);
1435       latch_edge = loop_latch_edge (loop);
1436       gcc_assert (single_exit (loop));
1437       trip_count = get_estimated_loop_iterations_int (loop);
1438       max_trip_count = get_max_loop_iterations_int (loop);
1439 
1440       /* Perform SMS only on loops that their average count is above threshold.  */
1441 
1442       if ( latch_edge->count () > profile_count::zero ()
1443           && (latch_edge->count()
1444 	      < single_exit (loop)->count ().apply_scale
1445 				 (param_sms_loop_average_count_threshold, 1)))
1446 	{
1447 	  if (dump_file)
1448 	    {
1449 	      dump_insn_location (tail);
1450 	      fprintf (dump_file, "\nSMS single-bb-loop\n");
1451 	      if (profile_info && flag_branch_probabilities)
1452 	    	{
1453 	      	  fprintf (dump_file, "SMS loop-count ");
1454 	      	  fprintf (dump_file, "%" PRId64,
1455 	             	   (int64_t) bb->count.to_gcov_type ());
1456 	      	  fprintf (dump_file, "\n");
1457                   fprintf (dump_file, "SMS trip-count ");
1458                   fprintf (dump_file, "%" PRId64 "max %" PRId64,
1459                            (int64_t) trip_count, (int64_t) max_trip_count);
1460                   fprintf (dump_file, "\n");
1461 	    	}
1462 	    }
1463           continue;
1464         }
1465 
1466       /* Make sure this is a doloop.  */
1467       if ( !(count_reg = doloop_register_get (head, tail)))
1468       {
1469         if (dump_file)
1470           fprintf (dump_file, "SMS doloop_register_get failed\n");
1471 	continue;
1472       }
1473 
1474       /* Don't handle BBs with calls or barriers
1475 	 or !single_set with the exception of do-loop control part insns.
1476          ??? Should handle insns defining subregs.  */
1477       for (insn = head; insn != NEXT_INSN (tail); insn = NEXT_INSN (insn))
1478 	{
1479 	  if (INSN_P (insn))
1480 	    {
1481 	      HARD_REG_SET regs;
1482 	      CLEAR_HARD_REG_SET (regs);
1483 	      note_stores (insn, record_hard_reg_sets, &regs);
1484 	      if (hard_reg_set_intersect_p (regs, prohibited_regs))
1485 		break;
1486 	    }
1487 
1488 	  if (CALL_P (insn)
1489 	      || BARRIER_P (insn)
1490 	      || (INSN_P (insn) && single_set (insn)
1491 		  && GET_CODE (SET_DEST (single_set (insn))) == SUBREG)
1492 	      /* Not a single set.  */
1493 	      || (NONDEBUG_INSN_P (insn) && !JUMP_P (insn)
1494 		  && !single_set (insn) && GET_CODE (PATTERN (insn)) != USE
1495 		  /* But non-single-set allowed in one special case.  */
1496 		  && (insn != prev_nondebug_insn (tail)
1497 		      || !reg_mentioned_p (count_reg, insn))))
1498 	    break;
1499 	}
1500 
1501       if (insn != NEXT_INSN (tail))
1502 	{
1503 	  if (dump_file)
1504 	    {
1505 	      if (CALL_P (insn))
1506 		fprintf (dump_file, "SMS loop-with-call\n");
1507 	      else if (BARRIER_P (insn))
1508 		fprintf (dump_file, "SMS loop-with-barrier\n");
1509 	      else if (INSN_P (insn) && single_set (insn)
1510 		       && GET_CODE (SET_DEST (single_set (insn))) == SUBREG)
1511 		fprintf (dump_file, "SMS loop with subreg in lhs\n");
1512 	      else
1513 		fprintf (dump_file,
1514 			 "SMS loop-with-not-single-set-or-prohibited-reg\n");
1515 
1516 	      print_rtl_single (dump_file, insn);
1517 	    }
1518 
1519 	  continue;
1520 	}
1521 
1522       /* Always schedule the closing branch with the rest of the
1523          instructions. The branch is rotated to be in row ii-1 at the
1524          end of the scheduling procedure to make sure it's the last
1525          instruction in the iteration.  */
1526       if (! (g = create_ddg (bb, 1)))
1527         {
1528           if (dump_file)
1529 	    fprintf (dump_file, "SMS create_ddg failed\n");
1530 	  continue;
1531         }
1532 
1533       g_arr[loop->num] = g;
1534       if (dump_file)
1535         fprintf (dump_file, "...OK\n");
1536 
1537     }
1538   if (dump_file)
1539   {
1540     fprintf (dump_file, "\nSMS transformation phase\n");
1541     fprintf (dump_file, "=========================\n\n");
1542   }
1543 
1544   /* We don't want to perform SMS on new loops - created by versioning.  */
1545   for (auto loop : loops_list (cfun, 0))
1546     {
1547       rtx_insn *head, *tail;
1548       rtx count_reg;
1549       rtx_insn *count_init;
1550       int mii, rec_mii, stage_count, min_cycle;
1551       int64_t loop_count = 0;
1552       bool opt_sc_p, adjust_inplace = false;
1553       basic_block pre_header;
1554 
1555       if (! (g = g_arr[loop->num]))
1556         continue;
1557 
1558       if (dump_file)
1559 	{
1560 	  rtx_insn *insn = BB_END (loop->header);
1561 
1562 	  fprintf (dump_file, "SMS loop num: %d", loop->num);
1563 	  dump_insn_location (insn);
1564 	  fprintf (dump_file, "\n");
1565 
1566 	  print_ddg (dump_file, g);
1567 	}
1568 
1569       get_ebb_head_tail (loop->header, loop->header, &head, &tail);
1570 
1571       latch_edge = loop_latch_edge (loop);
1572       gcc_assert (single_exit (loop));
1573       trip_count = get_estimated_loop_iterations_int (loop);
1574       max_trip_count = get_max_loop_iterations_int (loop);
1575 
1576       if (dump_file)
1577 	{
1578 	  dump_insn_location (tail);
1579 	  fprintf (dump_file, "\nSMS single-bb-loop\n");
1580 	  if (profile_info && flag_branch_probabilities)
1581 	    {
1582 	      fprintf (dump_file, "SMS loop-count ");
1583 	      fprintf (dump_file, "%" PRId64,
1584 	               (int64_t) bb->count.to_gcov_type ());
1585 	      fprintf (dump_file, "\n");
1586 	    }
1587 	  fprintf (dump_file, "SMS doloop\n");
1588 	  fprintf (dump_file, "SMS built-ddg %d\n", g->num_nodes);
1589           fprintf (dump_file, "SMS num-loads %d\n", g->num_loads);
1590           fprintf (dump_file, "SMS num-stores %d\n", g->num_stores);
1591 	}
1592 
1593 
1594       count_reg = doloop_register_get (head, tail);
1595       gcc_assert (count_reg);
1596 
1597       pre_header = loop_preheader_edge (loop)->src;
1598       count_init = const_iteration_count (count_reg, pre_header, &loop_count,
1599 					  &adjust_inplace);
1600 
1601       if (dump_file && count_init)
1602         {
1603           fprintf (dump_file, "SMS const-doloop ");
1604           fprintf (dump_file, "%" PRId64,
1605 		     loop_count);
1606           fprintf (dump_file, "\n");
1607         }
1608 
1609       node_order = XNEWVEC (int, g->num_nodes);
1610 
1611       mii = 1; /* Need to pass some estimate of mii.  */
1612       rec_mii = sms_order_nodes (g, mii, node_order, &max_asap);
1613       mii = MAX (res_MII (g), rec_mii);
1614       mii = MAX (mii, 1);
1615       maxii = MAX (max_asap, param_sms_max_ii_factor * mii);
1616 
1617       if (dump_file)
1618 	fprintf (dump_file, "SMS iis %d %d %d (rec_mii, mii, maxii)\n",
1619 		 rec_mii, mii, maxii);
1620 
1621       for (;;)
1622 	{
1623 	  set_node_sched_params (g);
1624 
1625 	  stage_count = 0;
1626 	  opt_sc_p = false;
1627 	  ps = sms_schedule_by_order (g, mii, maxii, node_order);
1628 
1629 	  if (ps)
1630 	    {
1631 	      /* Try to achieve optimized SC by normalizing the partial
1632 		 schedule (having the cycles start from cycle zero).
1633 		 The branch location must be placed in row ii-1 in the
1634 		 final scheduling.	If failed, shift all instructions to
1635 		 position the branch in row ii-1.  */
1636 	      opt_sc_p = optimize_sc (ps, g);
1637 	      if (opt_sc_p)
1638 		stage_count = calculate_stage_count (ps, 0);
1639 	      else
1640 		{
1641 		  /* Bring the branch to cycle ii-1.  */
1642 		  int amount = (SCHED_TIME (g->closing_branch->cuid)
1643 				- (ps->ii - 1));
1644 
1645 		  if (dump_file)
1646 		    fprintf (dump_file, "SMS schedule branch at cycle ii-1\n");
1647 
1648 		  stage_count = calculate_stage_count (ps, amount);
1649 		}
1650 
1651 	      gcc_assert (stage_count >= 1);
1652 	    }
1653 
1654 	  /* The default value of param_sms_min_sc is 2 as stage count of
1655 	     1 means that there is no interleaving between iterations thus
1656 	     we let the scheduling passes do the job in this case.  */
1657 	  if (stage_count < param_sms_min_sc
1658 	      || (count_init && (loop_count <= stage_count))
1659 	      || (max_trip_count >= 0 && max_trip_count <= stage_count)
1660 	      || (trip_count >= 0 && trip_count <= stage_count))
1661 	    {
1662 	      if (dump_file)
1663 		{
1664 		  fprintf (dump_file, "SMS failed... \n");
1665 		  fprintf (dump_file, "SMS sched-failed (stage-count=%d,"
1666 			   " loop-count=", stage_count);
1667 		  fprintf (dump_file, "%" PRId64, loop_count);
1668 		  fprintf (dump_file, ", trip-count=");
1669 		  fprintf (dump_file, "%" PRId64 "max %" PRId64,
1670 			   (int64_t) trip_count, (int64_t) max_trip_count);
1671 		  fprintf (dump_file, ")\n");
1672 		}
1673 	      break;
1674 	    }
1675 
1676           if (!opt_sc_p)
1677             {
1678 	      /* Rotate the partial schedule to have the branch in row ii-1.  */
1679               int amount = SCHED_TIME (g->closing_branch->cuid) - (ps->ii - 1);
1680 
1681               reset_sched_times (ps, amount);
1682               rotate_partial_schedule (ps, amount);
1683             }
1684 
1685 	  set_columns_for_ps (ps);
1686 
1687 	  min_cycle = PS_MIN_CYCLE (ps) - SMODULO (PS_MIN_CYCLE (ps), ps->ii);
1688 	  if (!schedule_reg_moves (ps))
1689 	    {
1690 	      mii = ps->ii + 1;
1691 	      free_partial_schedule (ps);
1692 	      continue;
1693 	    }
1694 
1695 	  /* Moves that handle incoming values might have been added
1696 	     to a new first stage.  Bump the stage count if so.
1697 
1698 	     ??? Perhaps we could consider rotating the schedule here
1699 	     instead?  */
1700 	  if (PS_MIN_CYCLE (ps) < min_cycle)
1701 	    {
1702 	      reset_sched_times (ps, 0);
1703 	      stage_count++;
1704 	    }
1705 
1706 	  /* The stage count should now be correct without rotation.  */
1707 	  gcc_checking_assert (stage_count == calculate_stage_count (ps, 0));
1708 	  PS_STAGE_COUNT (ps) = stage_count;
1709 
1710 	  canon_loop (loop);
1711 
1712           if (dump_file)
1713             {
1714 	      dump_insn_location (tail);
1715 	      fprintf (dump_file, " SMS succeeded %d %d (with ii, sc)\n",
1716 		       ps->ii, stage_count);
1717 	      print_partial_schedule (ps, dump_file);
1718 	    }
1719 
1720 	  if (count_init)
1721 	    {
1722 	       if (adjust_inplace)
1723 		{
1724 		  /* When possible, set new iteration count of loop kernel in
1725 		     place.  Otherwise, generate_prolog_epilog creates an insn
1726 		     to adjust.  */
1727 		  SET_SRC (single_set (count_init)) = GEN_INT (loop_count
1728 							    - stage_count + 1);
1729 		}
1730 	    }
1731 	  else
1732             {
1733 	      /* case the BCT count is not known , Do loop-versioning */
1734 	      rtx comp_rtx = gen_rtx_GT (VOIDmode, count_reg,
1735 					 gen_int_mode (stage_count,
1736 						       GET_MODE (count_reg)));
1737 	      profile_probability prob = profile_probability::guessed_always ()
1738 				.apply_scale (PROB_SMS_ENOUGH_ITERATIONS, 100);
1739 
1740 	      loop_version (loop, comp_rtx, &condition_bb,
1741 	  		    prob, prob.invert (),
1742 			    prob, prob.invert (), true);
1743 	    }
1744 
1745 	  /* Now apply the scheduled kernel to the RTL of the loop.  */
1746 	  permute_partial_schedule (ps, g->closing_branch->first_note);
1747 
1748           /* Mark this loop as software pipelined so the later
1749 	     scheduling passes don't touch it.  */
1750 	  if (! flag_resched_modulo_sched)
1751 	    mark_loop_unsched (loop);
1752 
1753 	  /* The life-info is not valid any more.  */
1754 	  df_set_bb_dirty (g->bb);
1755 
1756 	  apply_reg_moves (ps);
1757 	  if (dump_file)
1758 	    print_node_sched_params (dump_file, g->num_nodes, ps);
1759 	  /* Generate prolog and epilog.  */
1760 	  generate_prolog_epilog (ps, loop, count_reg, !adjust_inplace);
1761 	  break;
1762 	}
1763 
1764       free_partial_schedule (ps);
1765       node_sched_param_vec.release ();
1766       free (node_order);
1767       free_ddg (g);
1768     }
1769 
1770   free (g_arr);
1771 
1772   /* Release scheduler data, needed until now because of DFA.  */
1773   haifa_sched_finish ();
1774   loop_optimizer_finalize ();
1775 }
1776 
1777 /* The SMS scheduling algorithm itself
1778    -----------------------------------
1779    Input: 'O' an ordered list of insns of a loop.
1780    Output: A scheduling of the loop - kernel, prolog, and epilogue.
1781 
1782    'Q' is the empty Set
1783    'PS' is the partial schedule; it holds the currently scheduled nodes with
1784 	their cycle/slot.
1785    'PSP' previously scheduled predecessors.
1786    'PSS' previously scheduled successors.
1787    't(u)' the cycle where u is scheduled.
1788    'l(u)' is the latency of u.
1789    'd(v,u)' is the dependence distance from v to u.
1790    'ASAP(u)' the earliest time at which u could be scheduled as computed in
1791 	     the node ordering phase.
1792    'check_hardware_resources_conflicts(u, PS, c)'
1793 			     run a trace around cycle/slot through DFA model
1794 			     to check resource conflicts involving instruction u
1795 			     at cycle c given the partial schedule PS.
1796    'add_to_partial_schedule_at_time(u, PS, c)'
1797 			     Add the node/instruction u to the partial schedule
1798 			     PS at time c.
1799    'calculate_register_pressure(PS)'
1800 			     Given a schedule of instructions, calculate the register
1801 			     pressure it implies.  One implementation could be the
1802 			     maximum number of overlapping live ranges.
1803    'maxRP' The maximum allowed register pressure, it is usually derived from the number
1804 	   registers available in the hardware.
1805 
1806    1. II = MII.
1807    2. PS = empty list
1808    3. for each node u in O in pre-computed order
1809    4.   if (PSP(u) != Q && PSS(u) == Q) then
1810    5.     Early_start(u) = max ( t(v) + l(v) - d(v,u)*II ) over all every v in PSP(u).
1811    6.     start = Early_start; end = Early_start + II - 1; step = 1
1812    11.  else if (PSP(u) == Q && PSS(u) != Q) then
1813    12.      Late_start(u) = min ( t(v) - l(v) + d(v,u)*II ) over all every v in PSS(u).
1814    13.     start = Late_start; end = Late_start - II + 1; step = -1
1815    14.  else if (PSP(u) != Q && PSS(u) != Q) then
1816    15.     Early_start(u) = max ( t(v) + l(v) - d(v,u)*II ) over all every v in PSP(u).
1817    16.     Late_start(u) = min ( t(v) - l(v) + d(v,u)*II ) over all every v in PSS(u).
1818    17.     start = Early_start;
1819    18.     end = min(Early_start + II - 1 , Late_start);
1820    19.     step = 1
1821    20.     else "if (PSP(u) == Q && PSS(u) == Q)"
1822    21.	  start = ASAP(u); end = start + II - 1; step = 1
1823    22.  endif
1824 
1825    23.  success = false
1826    24.  for (c = start ; c != end ; c += step)
1827    25.     if check_hardware_resources_conflicts(u, PS, c) then
1828    26.       add_to_partial_schedule_at_time(u, PS, c)
1829    27.       success = true
1830    28.       break
1831    29.     endif
1832    30.  endfor
1833    31.  if (success == false) then
1834    32.    II = II + 1
1835    33.    if (II > maxII) then
1836    34.       finish - failed to schedule
1837    35.	 endif
1838    36.    goto 2.
1839    37.  endif
1840    38. endfor
1841    39. if (calculate_register_pressure(PS) > maxRP) then
1842    40.    goto 32.
1843    41. endif
1844    42. compute epilogue & prologue
1845    43. finish - succeeded to schedule
1846 
1847    ??? The algorithm restricts the scheduling window to II cycles.
1848    In rare cases, it may be better to allow windows of II+1 cycles.
1849    The window would then start and end on the same row, but with
1850    different "must precede" and "must follow" requirements.  */
1851 
1852 /* A threshold for the number of repeated unsuccessful attempts to insert
1853    an empty row, before we flush the partial schedule and start over.  */
1854 #define MAX_SPLIT_NUM 10
1855 /* Given the partial schedule PS, this function calculates and returns the
1856    cycles in which we can schedule the node with the given index I.
1857    NOTE: Here we do the backtracking in SMS, in some special cases. We have
1858    noticed that there are several cases in which we fail    to SMS the loop
1859    because the sched window of a node is empty    due to tight data-deps. In
1860    such cases we want to unschedule    some of the predecessors/successors
1861    until we get non-empty    scheduling window.  It returns -1 if the
1862    scheduling window is empty and zero otherwise.  */
1863 
1864 static int
get_sched_window(partial_schedule_ptr ps,ddg_node_ptr u_node,sbitmap sched_nodes,int ii,int * start_p,int * step_p,int * end_p)1865 get_sched_window (partial_schedule_ptr ps, ddg_node_ptr u_node,
1866 		  sbitmap sched_nodes, int ii, int *start_p, int *step_p,
1867 		  int *end_p)
1868 {
1869   int start, step, end;
1870   int early_start, late_start;
1871   ddg_edge_ptr e;
1872   auto_sbitmap psp (ps->g->num_nodes);
1873   auto_sbitmap pss (ps->g->num_nodes);
1874   sbitmap u_node_preds = NODE_PREDECESSORS (u_node);
1875   sbitmap u_node_succs = NODE_SUCCESSORS (u_node);
1876   int psp_not_empty;
1877   int pss_not_empty;
1878   int count_preds;
1879   int count_succs;
1880 
1881   /* 1. compute sched window for u (start, end, step).  */
1882   bitmap_clear (psp);
1883   bitmap_clear (pss);
1884   psp_not_empty = bitmap_and (psp, u_node_preds, sched_nodes);
1885   pss_not_empty = bitmap_and (pss, u_node_succs, sched_nodes);
1886 
1887   /* We first compute a forward range (start <= end), then decide whether
1888      to reverse it.  */
1889   early_start = INT_MIN;
1890   late_start = INT_MAX;
1891   start = INT_MIN;
1892   end = INT_MAX;
1893   step = 1;
1894 
1895   count_preds = 0;
1896   count_succs = 0;
1897 
1898   if (dump_file && (psp_not_empty || pss_not_empty))
1899     {
1900       fprintf (dump_file, "\nAnalyzing dependencies for node %d (INSN %d)"
1901 	       "; ii = %d\n\n", u_node->cuid, INSN_UID (u_node->insn), ii);
1902       fprintf (dump_file, "%11s %11s %11s %11s %5s\n",
1903 	       "start", "early start", "late start", "end", "time");
1904       fprintf (dump_file, "=========== =========== =========== ==========="
1905 	       " =====\n");
1906     }
1907   /* Calculate early_start and limit end.  Both bounds are inclusive.  */
1908   if (psp_not_empty)
1909     for (e = u_node->in; e != 0; e = e->next_in)
1910       {
1911 	int v = e->src->cuid;
1912 
1913 	if (bitmap_bit_p (sched_nodes, v))
1914 	  {
1915 	    int p_st = SCHED_TIME (v);
1916 	    int earliest = p_st + e->latency - (e->distance * ii);
1917 	    int latest = (e->data_type == MEM_DEP ? p_st + ii - 1 : INT_MAX);
1918 
1919 	    if (dump_file)
1920 	      {
1921 		fprintf (dump_file, "%11s %11d %11s %11d %5d",
1922 			 "", earliest, "", latest, p_st);
1923 		print_ddg_edge (dump_file, e);
1924 		fprintf (dump_file, "\n");
1925 	      }
1926 
1927 	    early_start = MAX (early_start, earliest);
1928 	    end = MIN (end, latest);
1929 
1930 	    if (e->type == TRUE_DEP && e->data_type == REG_DEP)
1931 	      count_preds++;
1932 	  }
1933       }
1934 
1935   /* Calculate late_start and limit start.  Both bounds are inclusive.  */
1936   if (pss_not_empty)
1937     for (e = u_node->out; e != 0; e = e->next_out)
1938       {
1939 	int v = e->dest->cuid;
1940 
1941 	if (bitmap_bit_p (sched_nodes, v))
1942 	  {
1943 	    int s_st = SCHED_TIME (v);
1944 	    int earliest = (e->data_type == MEM_DEP ? s_st - ii + 1 : INT_MIN);
1945 	    int latest = s_st - e->latency + (e->distance * ii);
1946 
1947 	    if (dump_file)
1948 	      {
1949 		fprintf (dump_file, "%11d %11s %11d %11s %5d",
1950 			 earliest, "", latest, "", s_st);
1951 		print_ddg_edge (dump_file, e);
1952 		fprintf (dump_file, "\n");
1953 	      }
1954 
1955 	    start = MAX (start, earliest);
1956 	    late_start = MIN (late_start, latest);
1957 
1958 	    if (e->type == TRUE_DEP && e->data_type == REG_DEP)
1959 	      count_succs++;
1960 	  }
1961       }
1962 
1963   if (dump_file && (psp_not_empty || pss_not_empty))
1964     {
1965       fprintf (dump_file, "----------- ----------- ----------- -----------"
1966 	       " -----\n");
1967       fprintf (dump_file, "%11d %11d %11d %11d %5s %s\n",
1968 	       start, early_start, late_start, end, "",
1969 	       "(max, max, min, min)");
1970     }
1971 
1972   /* Get a target scheduling window no bigger than ii.  */
1973   if (early_start == INT_MIN && late_start == INT_MAX)
1974     early_start = NODE_ASAP (u_node);
1975   else if (early_start == INT_MIN)
1976     early_start = late_start - (ii - 1);
1977   late_start = MIN (late_start, early_start + (ii - 1));
1978 
1979   /* Apply memory dependence limits.  */
1980   start = MAX (start, early_start);
1981   end = MIN (end, late_start);
1982 
1983   if (dump_file && (psp_not_empty || pss_not_empty))
1984     fprintf (dump_file, "%11s %11d %11d %11s %5s final window\n",
1985 	     "", start, end, "", "");
1986 
1987   /* If there are at least as many successors as predecessors, schedule the
1988      node close to its successors.  */
1989   if (pss_not_empty && count_succs >= count_preds)
1990     {
1991       std::swap (start, end);
1992       step = -1;
1993     }
1994 
1995   /* Now that we've finalized the window, make END an exclusive rather
1996      than an inclusive bound.  */
1997   end += step;
1998 
1999   *start_p = start;
2000   *step_p = step;
2001   *end_p = end;
2002 
2003   if ((start >= end && step == 1) || (start <= end && step == -1))
2004     {
2005       if (dump_file)
2006 	fprintf (dump_file, "\nEmpty window: start=%d, end=%d, step=%d\n",
2007 		 start, end, step);
2008       return -1;
2009     }
2010 
2011   return 0;
2012 }
2013 
2014 /* Calculate MUST_PRECEDE/MUST_FOLLOW bitmaps of U_NODE; which is the
2015    node currently been scheduled.  At the end of the calculation
2016    MUST_PRECEDE/MUST_FOLLOW contains all predecessors/successors of
2017    U_NODE which are (1) already scheduled in the first/last row of
2018    U_NODE's scheduling window, (2) whose dependence inequality with U
2019    becomes an equality when U is scheduled in this same row, and (3)
2020    whose dependence latency is zero.
2021 
2022    The first and last rows are calculated using the following parameters:
2023    START/END rows - The cycles that begins/ends the traversal on the window;
2024    searching for an empty cycle to schedule U_NODE.
2025    STEP - The direction in which we traverse the window.
2026    II - The initiation interval.  */
2027 
2028 static void
calculate_must_precede_follow(ddg_node_ptr u_node,int start,int end,int step,int ii,sbitmap sched_nodes,sbitmap must_precede,sbitmap must_follow)2029 calculate_must_precede_follow (ddg_node_ptr u_node, int start, int end,
2030 			       int step, int ii, sbitmap sched_nodes,
2031 			       sbitmap must_precede, sbitmap must_follow)
2032 {
2033   ddg_edge_ptr e;
2034   int first_cycle_in_window, last_cycle_in_window;
2035 
2036   gcc_assert (must_precede && must_follow);
2037 
2038   /* Consider the following scheduling window:
2039      {first_cycle_in_window, first_cycle_in_window+1, ...,
2040      last_cycle_in_window}.  If step is 1 then the following will be
2041      the order we traverse the window: {start=first_cycle_in_window,
2042      first_cycle_in_window+1, ..., end=last_cycle_in_window+1},
2043      or {start=last_cycle_in_window, last_cycle_in_window-1, ...,
2044      end=first_cycle_in_window-1} if step is -1.  */
2045   first_cycle_in_window = (step == 1) ? start : end - step;
2046   last_cycle_in_window = (step == 1) ? end - step : start;
2047 
2048   bitmap_clear (must_precede);
2049   bitmap_clear (must_follow);
2050 
2051   if (dump_file)
2052     fprintf (dump_file, "\nmust_precede: ");
2053 
2054   /* Instead of checking if:
2055       (SMODULO (SCHED_TIME (e->src), ii) == first_row_in_window)
2056       && ((SCHED_TIME (e->src) + e->latency - (e->distance * ii)) ==
2057              first_cycle_in_window)
2058       && e->latency == 0
2059      we use the fact that latency is non-negative:
2060       SCHED_TIME (e->src) - (e->distance * ii) <=
2061       SCHED_TIME (e->src) + e->latency - (e->distance * ii)) <=
2062       first_cycle_in_window
2063      and check only if
2064       SCHED_TIME (e->src) - (e->distance * ii) == first_cycle_in_window  */
2065   for (e = u_node->in; e != 0; e = e->next_in)
2066     if (bitmap_bit_p (sched_nodes, e->src->cuid)
2067 	&& ((SCHED_TIME (e->src->cuid) - (e->distance * ii)) ==
2068              first_cycle_in_window))
2069       {
2070 	if (dump_file)
2071 	  fprintf (dump_file, "%d ", e->src->cuid);
2072 
2073 	bitmap_set_bit (must_precede, e->src->cuid);
2074       }
2075 
2076   if (dump_file)
2077     fprintf (dump_file, "\nmust_follow: ");
2078 
2079   /* Instead of checking if:
2080       (SMODULO (SCHED_TIME (e->dest), ii) == last_row_in_window)
2081       && ((SCHED_TIME (e->dest) - e->latency + (e->distance * ii)) ==
2082              last_cycle_in_window)
2083       && e->latency == 0
2084      we use the fact that latency is non-negative:
2085       SCHED_TIME (e->dest) + (e->distance * ii) >=
2086       SCHED_TIME (e->dest) - e->latency + (e->distance * ii)) >=
2087       last_cycle_in_window
2088      and check only if
2089       SCHED_TIME (e->dest) + (e->distance * ii) == last_cycle_in_window  */
2090   for (e = u_node->out; e != 0; e = e->next_out)
2091     if (bitmap_bit_p (sched_nodes, e->dest->cuid)
2092 	&& ((SCHED_TIME (e->dest->cuid) + (e->distance * ii)) ==
2093              last_cycle_in_window))
2094       {
2095 	if (dump_file)
2096 	  fprintf (dump_file, "%d ", e->dest->cuid);
2097 
2098 	bitmap_set_bit (must_follow, e->dest->cuid);
2099       }
2100 
2101   if (dump_file)
2102     fprintf (dump_file, "\n");
2103 }
2104 
2105 /* Return 1 if U_NODE can be scheduled in CYCLE.  Use the following
2106    parameters to decide if that's possible:
2107    PS - The partial schedule.
2108    U - The serial number of U_NODE.
2109    NUM_SPLITS - The number of row splits made so far.
2110    MUST_PRECEDE - The nodes that must precede U_NODE. (only valid at
2111    the first row of the scheduling window)
2112    MUST_FOLLOW - The nodes that must follow U_NODE. (only valid at the
2113    last row of the scheduling window)  */
2114 
2115 static bool
try_scheduling_node_in_cycle(partial_schedule_ptr ps,int u,int cycle,sbitmap sched_nodes,int * num_splits,sbitmap must_precede,sbitmap must_follow)2116 try_scheduling_node_in_cycle (partial_schedule_ptr ps,
2117 			      int u, int cycle, sbitmap sched_nodes,
2118 			      int *num_splits, sbitmap must_precede,
2119 			      sbitmap must_follow)
2120 {
2121   ps_insn_ptr psi;
2122   bool success = 0;
2123 
2124   verify_partial_schedule (ps, sched_nodes);
2125   psi = ps_add_node_check_conflicts (ps, u, cycle, must_precede, must_follow);
2126   if (psi)
2127     {
2128       SCHED_TIME (u) = cycle;
2129       bitmap_set_bit (sched_nodes, u);
2130       success = 1;
2131       *num_splits = 0;
2132       if (dump_file)
2133 	fprintf (dump_file, "Scheduled w/o split in %d\n", cycle);
2134 
2135     }
2136 
2137   return success;
2138 }
2139 
2140 /* This function implements the scheduling algorithm for SMS according to the
2141    above algorithm.  */
2142 static partial_schedule_ptr
sms_schedule_by_order(ddg_ptr g,int mii,int maxii,int * nodes_order)2143 sms_schedule_by_order (ddg_ptr g, int mii, int maxii, int *nodes_order)
2144 {
2145   int ii = mii;
2146   int i, c, success, num_splits = 0;
2147   int flush_and_start_over = true;
2148   int num_nodes = g->num_nodes;
2149   int start, end, step; /* Place together into one struct?  */
2150   auto_sbitmap sched_nodes (num_nodes);
2151   auto_sbitmap must_precede (num_nodes);
2152   auto_sbitmap must_follow (num_nodes);
2153   auto_sbitmap tobe_scheduled (num_nodes);
2154 
2155   /* Value of param_sms_dfa_history is a limit on the number of cycles that
2156      resource conflicts can span.  ??? Should be provided by DFA, and be
2157      dependent on the type of insn scheduled.  Set to 0 by default to save
2158      compile time.  */
2159   partial_schedule_ptr ps = create_partial_schedule (ii, g,
2160 						     param_sms_dfa_history);
2161 
2162   bitmap_ones (tobe_scheduled);
2163   bitmap_clear (sched_nodes);
2164 
2165   while (flush_and_start_over && (ii < maxii))
2166     {
2167 
2168       if (dump_file)
2169 	fprintf (dump_file, "Starting with ii=%d\n", ii);
2170       flush_and_start_over = false;
2171       bitmap_clear (sched_nodes);
2172 
2173       for (i = 0; i < num_nodes; i++)
2174 	{
2175 	  int u = nodes_order[i];
2176   	  ddg_node_ptr u_node = &ps->g->nodes[u];
2177 	  rtx_insn *insn = u_node->insn;
2178 
2179 	  gcc_checking_assert (NONDEBUG_INSN_P (insn));
2180 
2181 	  if (bitmap_bit_p (sched_nodes, u))
2182 	    continue;
2183 
2184 	  /* Try to get non-empty scheduling window.  */
2185 	 success = 0;
2186          if (get_sched_window (ps, u_node, sched_nodes, ii, &start,
2187                                 &step, &end) == 0)
2188             {
2189               if (dump_file)
2190                 fprintf (dump_file, "\nTrying to schedule node %d "
2191 			 "INSN = %d  in (%d .. %d) step %d\n", u, (INSN_UID
2192                         (g->nodes[u].insn)), start, end, step);
2193 
2194               gcc_assert ((step > 0 && start < end)
2195                           || (step < 0 && start > end));
2196 
2197               calculate_must_precede_follow (u_node, start, end, step, ii,
2198                                              sched_nodes, must_precede,
2199                                              must_follow);
2200 
2201               for (c = start; c != end; c += step)
2202                 {
2203 		  sbitmap tmp_precede, tmp_follow;
2204 
2205                   set_must_precede_follow (&tmp_follow, must_follow,
2206 		                           &tmp_precede, must_precede,
2207                                            c, start, end, step);
2208                   success =
2209                     try_scheduling_node_in_cycle (ps, u, c,
2210                                                   sched_nodes,
2211                                                   &num_splits, tmp_precede,
2212                                                   tmp_follow);
2213                   if (success)
2214                     break;
2215                 }
2216 
2217               verify_partial_schedule (ps, sched_nodes);
2218             }
2219             if (!success)
2220             {
2221               int split_row;
2222 
2223               if (ii++ == maxii)
2224                 break;
2225 
2226               if (num_splits >= MAX_SPLIT_NUM)
2227                 {
2228                   num_splits = 0;
2229                   flush_and_start_over = true;
2230                   verify_partial_schedule (ps, sched_nodes);
2231                   reset_partial_schedule (ps, ii);
2232                   verify_partial_schedule (ps, sched_nodes);
2233                   break;
2234                 }
2235 
2236               num_splits++;
2237               /* The scheduling window is exclusive of 'end'
2238                  whereas compute_split_window() expects an inclusive,
2239                  ordered range.  */
2240               if (step == 1)
2241                 split_row = compute_split_row (sched_nodes, start, end - 1,
2242                                                ps->ii, u_node);
2243               else
2244                 split_row = compute_split_row (sched_nodes, end + 1, start,
2245                                                ps->ii, u_node);
2246 
2247               ps_insert_empty_row (ps, split_row, sched_nodes);
2248               i--;              /* Go back and retry node i.  */
2249 
2250               if (dump_file)
2251                 fprintf (dump_file, "num_splits=%d\n", num_splits);
2252             }
2253 
2254           /* ??? If (success), check register pressure estimates.  */
2255         }                       /* Continue with next node.  */
2256     }                           /* While flush_and_start_over.  */
2257   if (ii >= maxii)
2258     {
2259       free_partial_schedule (ps);
2260       ps = NULL;
2261     }
2262   else
2263     gcc_assert (bitmap_equal_p (tobe_scheduled, sched_nodes));
2264 
2265   return ps;
2266 }
2267 
2268 /* This function inserts a new empty row into PS at the position
2269    according to SPLITROW, keeping all already scheduled instructions
2270    intact and updating their SCHED_TIME and cycle accordingly.  */
2271 static void
ps_insert_empty_row(partial_schedule_ptr ps,int split_row,sbitmap sched_nodes)2272 ps_insert_empty_row (partial_schedule_ptr ps, int split_row,
2273 		     sbitmap sched_nodes)
2274 {
2275   ps_insn_ptr crr_insn;
2276   ps_insn_ptr *rows_new;
2277   int ii = ps->ii;
2278   int new_ii = ii + 1;
2279   int row;
2280   int *rows_length_new;
2281 
2282   verify_partial_schedule (ps, sched_nodes);
2283 
2284   /* We normalize sched_time and rotate ps to have only non-negative sched
2285      times, for simplicity of updating cycles after inserting new row.  */
2286   split_row -= ps->min_cycle;
2287   split_row = SMODULO (split_row, ii);
2288   if (dump_file)
2289     fprintf (dump_file, "split_row=%d\n", split_row);
2290 
2291   reset_sched_times (ps, PS_MIN_CYCLE (ps));
2292   rotate_partial_schedule (ps, PS_MIN_CYCLE (ps));
2293 
2294   rows_new = (ps_insn_ptr *) xcalloc (new_ii, sizeof (ps_insn_ptr));
2295   rows_length_new = (int *) xcalloc (new_ii, sizeof (int));
2296   for (row = 0; row < split_row; row++)
2297     {
2298       rows_new[row] = ps->rows[row];
2299       rows_length_new[row] = ps->rows_length[row];
2300       ps->rows[row] = NULL;
2301       for (crr_insn = rows_new[row];
2302 	   crr_insn; crr_insn = crr_insn->next_in_row)
2303 	{
2304 	  int u = crr_insn->id;
2305 	  int new_time = SCHED_TIME (u) + (SCHED_TIME (u) / ii);
2306 
2307 	  SCHED_TIME (u) = new_time;
2308 	  crr_insn->cycle = new_time;
2309 	  SCHED_ROW (u) = new_time % new_ii;
2310 	  SCHED_STAGE (u) = new_time / new_ii;
2311 	}
2312 
2313     }
2314 
2315   rows_new[split_row] = NULL;
2316 
2317   for (row = split_row; row < ii; row++)
2318     {
2319       rows_new[row + 1] = ps->rows[row];
2320       rows_length_new[row + 1] = ps->rows_length[row];
2321       ps->rows[row] = NULL;
2322       for (crr_insn = rows_new[row + 1];
2323 	   crr_insn; crr_insn = crr_insn->next_in_row)
2324 	{
2325 	  int u = crr_insn->id;
2326 	  int new_time = SCHED_TIME (u) + (SCHED_TIME (u) / ii) + 1;
2327 
2328 	  SCHED_TIME (u) = new_time;
2329 	  crr_insn->cycle = new_time;
2330 	  SCHED_ROW (u) = new_time % new_ii;
2331 	  SCHED_STAGE (u) = new_time / new_ii;
2332 	}
2333     }
2334 
2335   /* Updating ps.  */
2336   ps->min_cycle = ps->min_cycle + ps->min_cycle / ii
2337     + (SMODULO (ps->min_cycle, ii) >= split_row ? 1 : 0);
2338   ps->max_cycle = ps->max_cycle + ps->max_cycle / ii
2339     + (SMODULO (ps->max_cycle, ii) >= split_row ? 1 : 0);
2340   free (ps->rows);
2341   ps->rows = rows_new;
2342   free (ps->rows_length);
2343   ps->rows_length = rows_length_new;
2344   ps->ii = new_ii;
2345   gcc_assert (ps->min_cycle >= 0);
2346 
2347   verify_partial_schedule (ps, sched_nodes);
2348 
2349   if (dump_file)
2350     fprintf (dump_file, "min_cycle=%d, max_cycle=%d\n", ps->min_cycle,
2351 	     ps->max_cycle);
2352 }
2353 
2354 /* Given U_NODE which is the node that failed to be scheduled; LOW and
2355    UP which are the boundaries of it's scheduling window; compute using
2356    SCHED_NODES and II a row in the partial schedule that can be split
2357    which will separate a critical predecessor from a critical successor
2358    thereby expanding the window, and return it.  */
2359 static int
compute_split_row(sbitmap sched_nodes,int low,int up,int ii,ddg_node_ptr u_node)2360 compute_split_row (sbitmap sched_nodes, int low, int up, int ii,
2361 		   ddg_node_ptr u_node)
2362 {
2363   ddg_edge_ptr e;
2364   int lower = INT_MIN, upper = INT_MAX;
2365   int crit_pred = -1;
2366   int crit_succ = -1;
2367   int crit_cycle;
2368 
2369   for (e = u_node->in; e != 0; e = e->next_in)
2370     {
2371       int v = e->src->cuid;
2372 
2373       if (bitmap_bit_p (sched_nodes, v)
2374 	  && (low == SCHED_TIME (v) + e->latency - (e->distance * ii)))
2375 	if (SCHED_TIME (v) > lower)
2376 	  {
2377 	    crit_pred = v;
2378 	    lower = SCHED_TIME (v);
2379 	  }
2380     }
2381 
2382   if (crit_pred >= 0)
2383     {
2384       crit_cycle = SCHED_TIME (crit_pred) + 1;
2385       return SMODULO (crit_cycle, ii);
2386     }
2387 
2388   for (e = u_node->out; e != 0; e = e->next_out)
2389     {
2390       int v = e->dest->cuid;
2391 
2392       if (bitmap_bit_p (sched_nodes, v)
2393 	  && (up == SCHED_TIME (v) - e->latency + (e->distance * ii)))
2394 	if (SCHED_TIME (v) < upper)
2395 	  {
2396 	    crit_succ = v;
2397 	    upper = SCHED_TIME (v);
2398 	  }
2399     }
2400 
2401   if (crit_succ >= 0)
2402     {
2403       crit_cycle = SCHED_TIME (crit_succ);
2404       return SMODULO (crit_cycle, ii);
2405     }
2406 
2407   if (dump_file)
2408     fprintf (dump_file, "Both crit_pred and crit_succ are NULL\n");
2409 
2410   return SMODULO ((low + up + 1) / 2, ii);
2411 }
2412 
2413 static void
verify_partial_schedule(partial_schedule_ptr ps,sbitmap sched_nodes)2414 verify_partial_schedule (partial_schedule_ptr ps, sbitmap sched_nodes)
2415 {
2416   int row;
2417   ps_insn_ptr crr_insn;
2418 
2419   for (row = 0; row < ps->ii; row++)
2420     {
2421       int length = 0;
2422 
2423       for (crr_insn = ps->rows[row]; crr_insn; crr_insn = crr_insn->next_in_row)
2424 	{
2425 	  int u = crr_insn->id;
2426 
2427 	  length++;
2428 	  gcc_assert (bitmap_bit_p (sched_nodes, u));
2429 	  /* ??? Test also that all nodes of sched_nodes are in ps, perhaps by
2430 	     popcount (sched_nodes) == number of insns in ps.  */
2431 	  gcc_assert (SCHED_TIME (u) >= ps->min_cycle);
2432 	  gcc_assert (SCHED_TIME (u) <= ps->max_cycle);
2433 	}
2434 
2435       gcc_assert (ps->rows_length[row] == length);
2436     }
2437 }
2438 
2439 
2440 /* This page implements the algorithm for ordering the nodes of a DDG
2441    for modulo scheduling, activated through the
2442    "int sms_order_nodes (ddg_ptr, int mii, int * result)" API.  */
2443 
2444 #define ORDER_PARAMS(x) ((struct node_order_params *) (x)->aux.info)
2445 #define ASAP(x) (ORDER_PARAMS ((x))->asap)
2446 #define ALAP(x) (ORDER_PARAMS ((x))->alap)
2447 #define HEIGHT(x) (ORDER_PARAMS ((x))->height)
2448 #define MOB(x) (ALAP ((x)) - ASAP ((x)))
2449 #define DEPTH(x) (ASAP ((x)))
2450 
2451 typedef struct node_order_params * nopa;
2452 
2453 static void order_nodes_of_sccs (ddg_all_sccs_ptr, int * result);
2454 static int order_nodes_in_scc (ddg_ptr, sbitmap, sbitmap, int*, int);
2455 static nopa  calculate_order_params (ddg_ptr, int, int *);
2456 static int find_max_asap (ddg_ptr, sbitmap);
2457 static int find_max_hv_min_mob (ddg_ptr, sbitmap);
2458 static int find_max_dv_min_mob (ddg_ptr, sbitmap);
2459 
2460 enum sms_direction {BOTTOMUP, TOPDOWN};
2461 
2462 struct node_order_params
2463 {
2464   int asap;
2465   int alap;
2466   int height;
2467 };
2468 
2469 /* Check if NODE_ORDER contains a permutation of 0 .. NUM_NODES-1.  */
2470 static void
check_nodes_order(int * node_order,int num_nodes)2471 check_nodes_order (int *node_order, int num_nodes)
2472 {
2473   int i;
2474   auto_sbitmap tmp (num_nodes);
2475 
2476   bitmap_clear (tmp);
2477 
2478   if (dump_file)
2479     fprintf (dump_file, "SMS final nodes order: \n");
2480 
2481   for (i = 0; i < num_nodes; i++)
2482     {
2483       int u = node_order[i];
2484 
2485       if (dump_file)
2486         fprintf (dump_file, "%d ", u);
2487       gcc_assert (u < num_nodes && u >= 0 && !bitmap_bit_p (tmp, u));
2488 
2489       bitmap_set_bit (tmp, u);
2490     }
2491 
2492   if (dump_file)
2493     fprintf (dump_file, "\n");
2494 }
2495 
2496 /* Order the nodes of G for scheduling and pass the result in
2497    NODE_ORDER.  Also set aux.count of each node to ASAP.
2498    Put maximal ASAP to PMAX_ASAP.  Return the recMII for the given DDG.  */
2499 static int
sms_order_nodes(ddg_ptr g,int mii,int * node_order,int * pmax_asap)2500 sms_order_nodes (ddg_ptr g, int mii, int * node_order, int *pmax_asap)
2501 {
2502   int i;
2503   int rec_mii = 0;
2504   ddg_all_sccs_ptr sccs = create_ddg_all_sccs (g);
2505 
2506   nopa nops = calculate_order_params (g, mii, pmax_asap);
2507 
2508   if (dump_file)
2509     print_sccs (dump_file, sccs, g);
2510 
2511   order_nodes_of_sccs (sccs, node_order);
2512 
2513   if (sccs->num_sccs > 0)
2514     /* First SCC has the largest recurrence_length.  */
2515     rec_mii = sccs->sccs[0]->recurrence_length;
2516 
2517   /* Save ASAP before destroying node_order_params.  */
2518   for (i = 0; i < g->num_nodes; i++)
2519     {
2520       ddg_node_ptr v = &g->nodes[i];
2521       v->aux.count = ASAP (v);
2522     }
2523 
2524   free (nops);
2525   free_ddg_all_sccs (sccs);
2526   check_nodes_order (node_order, g->num_nodes);
2527 
2528   return rec_mii;
2529 }
2530 
2531 static void
order_nodes_of_sccs(ddg_all_sccs_ptr all_sccs,int * node_order)2532 order_nodes_of_sccs (ddg_all_sccs_ptr all_sccs, int * node_order)
2533 {
2534   int i, pos = 0;
2535   ddg_ptr g = all_sccs->ddg;
2536   int num_nodes = g->num_nodes;
2537   auto_sbitmap prev_sccs (num_nodes);
2538   auto_sbitmap on_path (num_nodes);
2539   auto_sbitmap tmp (num_nodes);
2540   auto_sbitmap ones (num_nodes);
2541 
2542   bitmap_clear (prev_sccs);
2543   bitmap_ones (ones);
2544 
2545   /* Perform the node ordering starting from the SCC with the highest recMII.
2546      For each SCC order the nodes according to their ASAP/ALAP/HEIGHT etc.  */
2547   for (i = 0; i < all_sccs->num_sccs; i++)
2548     {
2549       ddg_scc_ptr scc = all_sccs->sccs[i];
2550 
2551       /* Add nodes on paths from previous SCCs to the current SCC.  */
2552       find_nodes_on_paths (on_path, g, prev_sccs, scc->nodes);
2553       bitmap_ior (tmp, scc->nodes, on_path);
2554 
2555       /* Add nodes on paths from the current SCC to previous SCCs.  */
2556       find_nodes_on_paths (on_path, g, scc->nodes, prev_sccs);
2557       bitmap_ior (tmp, tmp, on_path);
2558 
2559       /* Remove nodes of previous SCCs from current extended SCC.  */
2560       bitmap_and_compl (tmp, tmp, prev_sccs);
2561 
2562       pos = order_nodes_in_scc (g, prev_sccs, tmp, node_order, pos);
2563       /* Above call to order_nodes_in_scc updated prev_sccs |= tmp.  */
2564     }
2565 
2566   /* Handle the remaining nodes that do not belong to any scc.  Each call
2567      to order_nodes_in_scc handles a single connected component.  */
2568   while (pos < g->num_nodes)
2569     {
2570       bitmap_and_compl (tmp, ones, prev_sccs);
2571       pos = order_nodes_in_scc (g, prev_sccs, tmp, node_order, pos);
2572     }
2573 }
2574 
2575 /* MII is needed if we consider backarcs (that do not close recursive cycles).  */
2576 static struct node_order_params *
calculate_order_params(ddg_ptr g,int mii ATTRIBUTE_UNUSED,int * pmax_asap)2577 calculate_order_params (ddg_ptr g, int mii ATTRIBUTE_UNUSED, int *pmax_asap)
2578 {
2579   int u;
2580   int max_asap;
2581   int num_nodes = g->num_nodes;
2582   ddg_edge_ptr e;
2583   /* Allocate a place to hold ordering params for each node in the DDG.  */
2584   nopa node_order_params_arr;
2585 
2586   /* Initialize of ASAP/ALAP/HEIGHT to zero.  */
2587   node_order_params_arr = (nopa) xcalloc (num_nodes,
2588 					  sizeof (struct node_order_params));
2589 
2590   /* Set the aux pointer of each node to point to its order_params structure.  */
2591   for (u = 0; u < num_nodes; u++)
2592     g->nodes[u].aux.info = &node_order_params_arr[u];
2593 
2594   /* Disregarding a backarc from each recursive cycle to obtain a DAG,
2595      calculate ASAP, ALAP, mobility, distance, and height for each node
2596      in the dependence (direct acyclic) graph.  */
2597 
2598   /* We assume that the nodes in the array are in topological order.  */
2599 
2600   max_asap = 0;
2601   for (u = 0; u < num_nodes; u++)
2602     {
2603       ddg_node_ptr u_node = &g->nodes[u];
2604 
2605       ASAP (u_node) = 0;
2606       for (e = u_node->in; e; e = e->next_in)
2607 	if (e->distance == 0)
2608 	  ASAP (u_node) = MAX (ASAP (u_node),
2609 			       ASAP (e->src) + e->latency);
2610       max_asap = MAX (max_asap, ASAP (u_node));
2611     }
2612 
2613   for (u = num_nodes - 1; u > -1; u--)
2614     {
2615       ddg_node_ptr u_node = &g->nodes[u];
2616 
2617       ALAP (u_node) = max_asap;
2618       HEIGHT (u_node) = 0;
2619       for (e = u_node->out; e; e = e->next_out)
2620 	if (e->distance == 0)
2621 	  {
2622 	    ALAP (u_node) = MIN (ALAP (u_node),
2623 				 ALAP (e->dest) - e->latency);
2624 	    HEIGHT (u_node) = MAX (HEIGHT (u_node),
2625 				   HEIGHT (e->dest) + e->latency);
2626 	  }
2627     }
2628   if (dump_file)
2629   {
2630     fprintf (dump_file, "\nOrder params\n");
2631     for (u = 0; u < num_nodes; u++)
2632       {
2633         ddg_node_ptr u_node = &g->nodes[u];
2634 
2635         fprintf (dump_file, "node %d, ASAP: %d, ALAP: %d, HEIGHT: %d\n", u,
2636                  ASAP (u_node), ALAP (u_node), HEIGHT (u_node));
2637       }
2638   }
2639 
2640   *pmax_asap = max_asap;
2641   return node_order_params_arr;
2642 }
2643 
2644 static int
find_max_asap(ddg_ptr g,sbitmap nodes)2645 find_max_asap (ddg_ptr g, sbitmap nodes)
2646 {
2647   unsigned int u = 0;
2648   int max_asap = -1;
2649   int result = -1;
2650   sbitmap_iterator sbi;
2651 
2652   EXECUTE_IF_SET_IN_BITMAP (nodes, 0, u, sbi)
2653     {
2654       ddg_node_ptr u_node = &g->nodes[u];
2655 
2656       if (max_asap < ASAP (u_node))
2657 	{
2658 	  max_asap = ASAP (u_node);
2659 	  result = u;
2660 	}
2661     }
2662   return result;
2663 }
2664 
2665 static int
find_max_hv_min_mob(ddg_ptr g,sbitmap nodes)2666 find_max_hv_min_mob (ddg_ptr g, sbitmap nodes)
2667 {
2668   unsigned int u = 0;
2669   int max_hv = -1;
2670   int min_mob = INT_MAX;
2671   int result = -1;
2672   sbitmap_iterator sbi;
2673 
2674   EXECUTE_IF_SET_IN_BITMAP (nodes, 0, u, sbi)
2675     {
2676       ddg_node_ptr u_node = &g->nodes[u];
2677 
2678       if (max_hv < HEIGHT (u_node))
2679 	{
2680 	  max_hv = HEIGHT (u_node);
2681 	  min_mob = MOB (u_node);
2682 	  result = u;
2683 	}
2684       else if ((max_hv == HEIGHT (u_node))
2685 	       && (min_mob > MOB (u_node)))
2686 	{
2687 	  min_mob = MOB (u_node);
2688 	  result = u;
2689 	}
2690     }
2691   return result;
2692 }
2693 
2694 static int
find_max_dv_min_mob(ddg_ptr g,sbitmap nodes)2695 find_max_dv_min_mob (ddg_ptr g, sbitmap nodes)
2696 {
2697   unsigned int u = 0;
2698   int max_dv = -1;
2699   int min_mob = INT_MAX;
2700   int result = -1;
2701   sbitmap_iterator sbi;
2702 
2703   EXECUTE_IF_SET_IN_BITMAP (nodes, 0, u, sbi)
2704     {
2705       ddg_node_ptr u_node = &g->nodes[u];
2706 
2707       if (max_dv < DEPTH (u_node))
2708 	{
2709 	  max_dv = DEPTH (u_node);
2710 	  min_mob = MOB (u_node);
2711 	  result = u;
2712 	}
2713       else if ((max_dv == DEPTH (u_node))
2714 	       && (min_mob > MOB (u_node)))
2715 	{
2716 	  min_mob = MOB (u_node);
2717 	  result = u;
2718 	}
2719     }
2720   return result;
2721 }
2722 
2723 /* Places the nodes of SCC into the NODE_ORDER array starting
2724    at position POS, according to the SMS ordering algorithm.
2725    NODES_ORDERED (in&out parameter) holds the bitset of all nodes in
2726    the NODE_ORDER array, starting from position zero.  */
2727 static int
order_nodes_in_scc(ddg_ptr g,sbitmap nodes_ordered,sbitmap scc,int * node_order,int pos)2728 order_nodes_in_scc (ddg_ptr g, sbitmap nodes_ordered, sbitmap scc,
2729 		    int * node_order, int pos)
2730 {
2731   enum sms_direction dir;
2732   int num_nodes = g->num_nodes;
2733   auto_sbitmap workset (num_nodes);
2734   auto_sbitmap tmp (num_nodes);
2735   sbitmap zero_bitmap = sbitmap_alloc (num_nodes);
2736   auto_sbitmap predecessors (num_nodes);
2737   auto_sbitmap successors (num_nodes);
2738 
2739   bitmap_clear (predecessors);
2740   find_predecessors (predecessors, g, nodes_ordered);
2741 
2742   bitmap_clear (successors);
2743   find_successors (successors, g, nodes_ordered);
2744 
2745   bitmap_clear (tmp);
2746   if (bitmap_and (tmp, predecessors, scc))
2747     {
2748       bitmap_copy (workset, tmp);
2749       dir = BOTTOMUP;
2750     }
2751   else if (bitmap_and (tmp, successors, scc))
2752     {
2753       bitmap_copy (workset, tmp);
2754       dir = TOPDOWN;
2755     }
2756   else
2757     {
2758       int u;
2759 
2760       bitmap_clear (workset);
2761       if ((u = find_max_asap (g, scc)) >= 0)
2762 	bitmap_set_bit (workset, u);
2763       dir = BOTTOMUP;
2764     }
2765 
2766   bitmap_clear (zero_bitmap);
2767   while (!bitmap_equal_p (workset, zero_bitmap))
2768     {
2769       int v;
2770       ddg_node_ptr v_node;
2771       sbitmap v_node_preds;
2772       sbitmap v_node_succs;
2773 
2774       if (dir == TOPDOWN)
2775 	{
2776 	  while (!bitmap_equal_p (workset, zero_bitmap))
2777 	    {
2778 	      v = find_max_hv_min_mob (g, workset);
2779 	      v_node = &g->nodes[v];
2780 	      node_order[pos++] = v;
2781 	      v_node_succs = NODE_SUCCESSORS (v_node);
2782 	      bitmap_and (tmp, v_node_succs, scc);
2783 
2784 	      /* Don't consider the already ordered successors again.  */
2785 	      bitmap_and_compl (tmp, tmp, nodes_ordered);
2786 	      bitmap_ior (workset, workset, tmp);
2787 	      bitmap_clear_bit (workset, v);
2788 	      bitmap_set_bit (nodes_ordered, v);
2789 	    }
2790 	  dir = BOTTOMUP;
2791 	  bitmap_clear (predecessors);
2792 	  find_predecessors (predecessors, g, nodes_ordered);
2793 	  bitmap_and (workset, predecessors, scc);
2794 	}
2795       else
2796 	{
2797 	  while (!bitmap_equal_p (workset, zero_bitmap))
2798 	    {
2799 	      v = find_max_dv_min_mob (g, workset);
2800 	      v_node = &g->nodes[v];
2801 	      node_order[pos++] = v;
2802 	      v_node_preds = NODE_PREDECESSORS (v_node);
2803 	      bitmap_and (tmp, v_node_preds, scc);
2804 
2805 	      /* Don't consider the already ordered predecessors again.  */
2806 	      bitmap_and_compl (tmp, tmp, nodes_ordered);
2807 	      bitmap_ior (workset, workset, tmp);
2808 	      bitmap_clear_bit (workset, v);
2809 	      bitmap_set_bit (nodes_ordered, v);
2810 	    }
2811 	  dir = TOPDOWN;
2812 	  bitmap_clear (successors);
2813 	  find_successors (successors, g, nodes_ordered);
2814 	  bitmap_and (workset, successors, scc);
2815 	}
2816     }
2817   sbitmap_free (zero_bitmap);
2818   return pos;
2819 }
2820 
2821 
2822 /* This page contains functions for manipulating partial-schedules during
2823    modulo scheduling.  */
2824 
2825 /* Create a partial schedule and allocate a memory to hold II rows.  */
2826 
2827 static partial_schedule_ptr
create_partial_schedule(int ii,ddg_ptr g,int history)2828 create_partial_schedule (int ii, ddg_ptr g, int history)
2829 {
2830   partial_schedule_ptr ps = XNEW (struct partial_schedule);
2831   ps->rows = (ps_insn_ptr *) xcalloc (ii, sizeof (ps_insn_ptr));
2832   ps->rows_length = (int *) xcalloc (ii, sizeof (int));
2833   ps->reg_moves.create (0);
2834   ps->ii = ii;
2835   ps->history = history;
2836   ps->min_cycle = INT_MAX;
2837   ps->max_cycle = INT_MIN;
2838   ps->g = g;
2839 
2840   return ps;
2841 }
2842 
2843 /* Free the PS_INSNs in rows array of the given partial schedule.
2844    ??? Consider caching the PS_INSN's.  */
2845 static void
free_ps_insns(partial_schedule_ptr ps)2846 free_ps_insns (partial_schedule_ptr ps)
2847 {
2848   int i;
2849 
2850   for (i = 0; i < ps->ii; i++)
2851     {
2852       while (ps->rows[i])
2853 	{
2854 	  ps_insn_ptr ps_insn = ps->rows[i]->next_in_row;
2855 
2856 	  free (ps->rows[i]);
2857 	  ps->rows[i] = ps_insn;
2858 	}
2859       ps->rows[i] = NULL;
2860     }
2861 }
2862 
2863 /* Free all the memory allocated to the partial schedule.  */
2864 
2865 static void
free_partial_schedule(partial_schedule_ptr ps)2866 free_partial_schedule (partial_schedule_ptr ps)
2867 {
2868   ps_reg_move_info *move;
2869   unsigned int i;
2870 
2871   if (!ps)
2872     return;
2873 
2874   FOR_EACH_VEC_ELT (ps->reg_moves, i, move)
2875     sbitmap_free (move->uses);
2876   ps->reg_moves.release ();
2877 
2878   free_ps_insns (ps);
2879   free (ps->rows);
2880   free (ps->rows_length);
2881   free (ps);
2882 }
2883 
2884 /* Clear the rows array with its PS_INSNs, and create a new one with
2885    NEW_II rows.  */
2886 
2887 static void
reset_partial_schedule(partial_schedule_ptr ps,int new_ii)2888 reset_partial_schedule (partial_schedule_ptr ps, int new_ii)
2889 {
2890   if (!ps)
2891     return;
2892   free_ps_insns (ps);
2893   if (new_ii == ps->ii)
2894     return;
2895   ps->rows = (ps_insn_ptr *) xrealloc (ps->rows, new_ii
2896 						 * sizeof (ps_insn_ptr));
2897   memset (ps->rows, 0, new_ii * sizeof (ps_insn_ptr));
2898   ps->rows_length = (int *) xrealloc (ps->rows_length, new_ii * sizeof (int));
2899   memset (ps->rows_length, 0, new_ii * sizeof (int));
2900   ps->ii = new_ii;
2901   ps->min_cycle = INT_MAX;
2902   ps->max_cycle = INT_MIN;
2903 }
2904 
2905 /* Prints the partial schedule as an ii rows array, for each rows
2906    print the ids of the insns in it.  */
2907 void
print_partial_schedule(partial_schedule_ptr ps,FILE * dump)2908 print_partial_schedule (partial_schedule_ptr ps, FILE *dump)
2909 {
2910   int i;
2911 
2912   for (i = 0; i < ps->ii; i++)
2913     {
2914       ps_insn_ptr ps_i = ps->rows[i];
2915 
2916       fprintf (dump, "\n[ROW %d ]: ", i);
2917       while (ps_i)
2918 	{
2919 	  rtx_insn *insn = ps_rtl_insn (ps, ps_i->id);
2920 
2921 	  if (JUMP_P (insn))
2922 	    fprintf (dump, "%d (branch), ", INSN_UID (insn));
2923 	  else
2924 	    fprintf (dump, "%d, ", INSN_UID (insn));
2925 
2926 	  ps_i = ps_i->next_in_row;
2927 	}
2928     }
2929 }
2930 
2931 /* Creates an object of PS_INSN and initializes it to the given parameters.  */
2932 static ps_insn_ptr
create_ps_insn(int id,int cycle)2933 create_ps_insn (int id, int cycle)
2934 {
2935   ps_insn_ptr ps_i = XNEW (struct ps_insn);
2936 
2937   ps_i->id = id;
2938   ps_i->next_in_row = NULL;
2939   ps_i->prev_in_row = NULL;
2940   ps_i->cycle = cycle;
2941 
2942   return ps_i;
2943 }
2944 
2945 
2946 /* Removes the given PS_INSN from the partial schedule.  */
2947 static void
remove_node_from_ps(partial_schedule_ptr ps,ps_insn_ptr ps_i)2948 remove_node_from_ps (partial_schedule_ptr ps, ps_insn_ptr ps_i)
2949 {
2950   int row;
2951 
2952   gcc_assert (ps && ps_i);
2953 
2954   row = SMODULO (ps_i->cycle, ps->ii);
2955   if (! ps_i->prev_in_row)
2956     {
2957       gcc_assert (ps_i == ps->rows[row]);
2958       ps->rows[row] = ps_i->next_in_row;
2959       if (ps->rows[row])
2960 	ps->rows[row]->prev_in_row = NULL;
2961     }
2962   else
2963     {
2964       ps_i->prev_in_row->next_in_row = ps_i->next_in_row;
2965       if (ps_i->next_in_row)
2966 	ps_i->next_in_row->prev_in_row = ps_i->prev_in_row;
2967     }
2968 
2969   ps->rows_length[row] -= 1;
2970   free (ps_i);
2971   return;
2972 }
2973 
2974 /* Unlike what literature describes for modulo scheduling (which focuses
2975    on VLIW machines) the order of the instructions inside a cycle is
2976    important.  Given the bitmaps MUST_FOLLOW and MUST_PRECEDE we know
2977    where the current instruction should go relative to the already
2978    scheduled instructions in the given cycle.  Go over these
2979    instructions and find the first possible column to put it in.  */
2980 static bool
ps_insn_find_column(partial_schedule_ptr ps,ps_insn_ptr ps_i,sbitmap must_precede,sbitmap must_follow)2981 ps_insn_find_column (partial_schedule_ptr ps, ps_insn_ptr ps_i,
2982 		     sbitmap must_precede, sbitmap must_follow)
2983 {
2984   ps_insn_ptr next_ps_i;
2985   ps_insn_ptr first_must_follow = NULL;
2986   ps_insn_ptr last_must_precede = NULL;
2987   ps_insn_ptr last_in_row = NULL;
2988   int row;
2989 
2990   if (! ps_i)
2991     return false;
2992 
2993   row = SMODULO (ps_i->cycle, ps->ii);
2994 
2995   /* Find the first must follow and the last must precede
2996      and insert the node immediately after the must precede
2997      but make sure that it there is no must follow after it.  */
2998   for (next_ps_i = ps->rows[row];
2999        next_ps_i;
3000        next_ps_i = next_ps_i->next_in_row)
3001     {
3002       if (must_follow
3003 	  && bitmap_bit_p (must_follow, next_ps_i->id)
3004 	  && ! first_must_follow)
3005         first_must_follow = next_ps_i;
3006       if (must_precede && bitmap_bit_p (must_precede, next_ps_i->id))
3007         {
3008           /* If we have already met a node that must follow, then
3009 	     there is no possible column.  */
3010   	  if (first_must_follow)
3011             return false;
3012 	  else
3013             last_must_precede = next_ps_i;
3014         }
3015       /* The closing branch must be the last in the row.  */
3016       if (JUMP_P (ps_rtl_insn (ps, next_ps_i->id)))
3017 	return false;
3018 
3019        last_in_row = next_ps_i;
3020     }
3021 
3022   /* The closing branch is scheduled as well.  Make sure there is no
3023      dependent instruction after it as the branch should be the last
3024      instruction in the row.  */
3025   if (JUMP_P (ps_rtl_insn (ps, ps_i->id)))
3026     {
3027       if (first_must_follow)
3028 	return false;
3029       if (last_in_row)
3030 	{
3031 	  /* Make the branch the last in the row.  New instructions
3032 	     will be inserted at the beginning of the row or after the
3033 	     last must_precede instruction thus the branch is guaranteed
3034 	     to remain the last instruction in the row.  */
3035 	  last_in_row->next_in_row = ps_i;
3036 	  ps_i->prev_in_row = last_in_row;
3037 	  ps_i->next_in_row = NULL;
3038 	}
3039       else
3040 	ps->rows[row] = ps_i;
3041       return true;
3042     }
3043 
3044   /* Now insert the node after INSERT_AFTER_PSI.  */
3045 
3046   if (! last_must_precede)
3047     {
3048       ps_i->next_in_row = ps->rows[row];
3049       ps_i->prev_in_row = NULL;
3050       if (ps_i->next_in_row)
3051     	ps_i->next_in_row->prev_in_row = ps_i;
3052       ps->rows[row] = ps_i;
3053     }
3054   else
3055     {
3056       ps_i->next_in_row = last_must_precede->next_in_row;
3057       last_must_precede->next_in_row = ps_i;
3058       ps_i->prev_in_row = last_must_precede;
3059       if (ps_i->next_in_row)
3060         ps_i->next_in_row->prev_in_row = ps_i;
3061     }
3062 
3063   return true;
3064 }
3065 
3066 /* Advances the PS_INSN one column in its current row; returns false
3067    in failure and true in success.  Bit N is set in MUST_FOLLOW if
3068    the node with cuid N must be come after the node pointed to by
3069    PS_I when scheduled in the same cycle.  */
3070 static int
ps_insn_advance_column(partial_schedule_ptr ps,ps_insn_ptr ps_i,sbitmap must_follow)3071 ps_insn_advance_column (partial_schedule_ptr ps, ps_insn_ptr ps_i,
3072 			sbitmap must_follow)
3073 {
3074   ps_insn_ptr prev, next;
3075   int row;
3076 
3077   if (!ps || !ps_i)
3078     return false;
3079 
3080   row = SMODULO (ps_i->cycle, ps->ii);
3081 
3082   if (! ps_i->next_in_row)
3083     return false;
3084 
3085   /* Check if next_in_row is dependent on ps_i, both having same sched
3086      times (typically ANTI_DEP).  If so, ps_i cannot skip over it.  */
3087   if (must_follow && bitmap_bit_p (must_follow, ps_i->next_in_row->id))
3088     return false;
3089 
3090   /* Advance PS_I over its next_in_row in the doubly linked list.  */
3091   prev = ps_i->prev_in_row;
3092   next = ps_i->next_in_row;
3093 
3094   if (ps_i == ps->rows[row])
3095     ps->rows[row] = next;
3096 
3097   ps_i->next_in_row = next->next_in_row;
3098 
3099   if (next->next_in_row)
3100     next->next_in_row->prev_in_row = ps_i;
3101 
3102   next->next_in_row = ps_i;
3103   ps_i->prev_in_row = next;
3104 
3105   next->prev_in_row = prev;
3106   if (prev)
3107     prev->next_in_row = next;
3108 
3109   return true;
3110 }
3111 
3112 /* Inserts a DDG_NODE to the given partial schedule at the given cycle.
3113    Returns 0 if this is not possible and a PS_INSN otherwise.  Bit N is
3114    set in MUST_PRECEDE/MUST_FOLLOW if the node with cuid N must be come
3115    before/after (respectively) the node pointed to by PS_I when scheduled
3116    in the same cycle.  */
3117 static ps_insn_ptr
add_node_to_ps(partial_schedule_ptr ps,int id,int cycle,sbitmap must_precede,sbitmap must_follow)3118 add_node_to_ps (partial_schedule_ptr ps, int id, int cycle,
3119 		sbitmap must_precede, sbitmap must_follow)
3120 {
3121   ps_insn_ptr ps_i;
3122   int row = SMODULO (cycle, ps->ii);
3123 
3124   if (ps->rows_length[row] >= issue_rate)
3125     return NULL;
3126 
3127   ps_i = create_ps_insn (id, cycle);
3128 
3129   /* Finds and inserts PS_I according to MUST_FOLLOW and
3130      MUST_PRECEDE.  */
3131   if (! ps_insn_find_column (ps, ps_i, must_precede, must_follow))
3132     {
3133       free (ps_i);
3134       return NULL;
3135     }
3136 
3137   ps->rows_length[row] += 1;
3138   return ps_i;
3139 }
3140 
3141 /* Advance time one cycle.  Assumes DFA is being used.  */
3142 static void
advance_one_cycle(void)3143 advance_one_cycle (void)
3144 {
3145   if (targetm.sched.dfa_pre_cycle_insn)
3146     state_transition (curr_state,
3147 		      targetm.sched.dfa_pre_cycle_insn ());
3148 
3149   state_transition (curr_state, NULL);
3150 
3151   if (targetm.sched.dfa_post_cycle_insn)
3152     state_transition (curr_state,
3153 		      targetm.sched.dfa_post_cycle_insn ());
3154 }
3155 
3156 
3157 
3158 /* Checks if PS has resource conflicts according to DFA, starting from
3159    FROM cycle to TO cycle; returns true if there are conflicts and false
3160    if there are no conflicts.  Assumes DFA is being used.  */
3161 static int
ps_has_conflicts(partial_schedule_ptr ps,int from,int to)3162 ps_has_conflicts (partial_schedule_ptr ps, int from, int to)
3163 {
3164   int cycle;
3165 
3166   state_reset (curr_state);
3167 
3168   for (cycle = from; cycle <= to; cycle++)
3169     {
3170       ps_insn_ptr crr_insn;
3171       /* Holds the remaining issue slots in the current row.  */
3172       int can_issue_more = issue_rate;
3173 
3174       /* Walk through the DFA for the current row.  */
3175       for (crr_insn = ps->rows[SMODULO (cycle, ps->ii)];
3176 	   crr_insn;
3177 	   crr_insn = crr_insn->next_in_row)
3178 	{
3179 	  rtx_insn *insn = ps_rtl_insn (ps, crr_insn->id);
3180 
3181 	  /* Check if there is room for the current insn.  */
3182 	  if (!can_issue_more || state_dead_lock_p (curr_state))
3183 	    return true;
3184 
3185 	  /* Update the DFA state and return with failure if the DFA found
3186 	     resource conflicts.  */
3187 	  if (state_transition (curr_state, insn) >= 0)
3188 	    return true;
3189 
3190 	  if (targetm.sched.variable_issue)
3191 	    can_issue_more =
3192 	      targetm.sched.variable_issue (sched_dump, sched_verbose,
3193 					    insn, can_issue_more);
3194 	  /* A naked CLOBBER or USE generates no instruction, so don't
3195 	     let them consume issue slots.  */
3196 	  else if (GET_CODE (PATTERN (insn)) != USE
3197 		   && GET_CODE (PATTERN (insn)) != CLOBBER)
3198 	    can_issue_more--;
3199 	}
3200 
3201       /* Advance the DFA to the next cycle.  */
3202       advance_one_cycle ();
3203     }
3204   return false;
3205 }
3206 
3207 /* Checks if the given node causes resource conflicts when added to PS at
3208    cycle C.  If not the node is added to PS and returned; otherwise zero
3209    is returned.  Bit N is set in MUST_PRECEDE/MUST_FOLLOW if the node with
3210    cuid N must be come before/after (respectively) the node pointed to by
3211    PS_I when scheduled in the same cycle.  */
3212 ps_insn_ptr
ps_add_node_check_conflicts(partial_schedule_ptr ps,int n,int c,sbitmap must_precede,sbitmap must_follow)3213 ps_add_node_check_conflicts (partial_schedule_ptr ps, int n,
3214    			     int c, sbitmap must_precede,
3215 			     sbitmap must_follow)
3216 {
3217   int i, first, amount, has_conflicts = 0;
3218   ps_insn_ptr ps_i;
3219 
3220   /* First add the node to the PS, if this succeeds check for
3221      conflicts, trying different issue slots in the same row.  */
3222   if (! (ps_i = add_node_to_ps (ps, n, c, must_precede, must_follow)))
3223     return NULL; /* Failed to insert the node at the given cycle.  */
3224 
3225   while (1)
3226     {
3227       has_conflicts = ps_has_conflicts (ps, c, c);
3228       if (ps->history > 0 && !has_conflicts)
3229 	{
3230 	  /* Check all 2h+1 intervals, starting from c-2h..c up to c..2h,
3231 	     but not more than ii intervals.  */
3232 	  first = c - ps->history;
3233 	  amount = 2 * ps->history + 1;
3234 	  if (amount > ps->ii)
3235 	    amount = ps->ii;
3236 	  for (i = first; i < first + amount; i++)
3237 	    {
3238 	      has_conflicts = ps_has_conflicts (ps,
3239 						i - ps->history,
3240 						i + ps->history);
3241 	      if (has_conflicts)
3242 		break;
3243 	    }
3244 	}
3245       if (!has_conflicts)
3246 	break;
3247       /* Try different issue slots to find one that the given node can be
3248 	 scheduled in without conflicts.  */
3249       if (! ps_insn_advance_column (ps, ps_i, must_follow))
3250 	break;
3251     }
3252 
3253   if (has_conflicts)
3254     {
3255       remove_node_from_ps (ps, ps_i);
3256       return NULL;
3257     }
3258 
3259   ps->min_cycle = MIN (ps->min_cycle, c);
3260   ps->max_cycle = MAX (ps->max_cycle, c);
3261   return ps_i;
3262 }
3263 
3264 /* Calculate the stage count of the partial schedule PS.  The calculation
3265    takes into account the rotation amount passed in ROTATION_AMOUNT.  */
3266 int
calculate_stage_count(partial_schedule_ptr ps,int rotation_amount)3267 calculate_stage_count (partial_schedule_ptr ps, int rotation_amount)
3268 {
3269   int new_min_cycle = PS_MIN_CYCLE (ps) - rotation_amount;
3270   int new_max_cycle = PS_MAX_CYCLE (ps) - rotation_amount;
3271   int stage_count = CALC_STAGE_COUNT (-1, new_min_cycle, ps->ii);
3272 
3273   /* The calculation of stage count is done adding the number of stages
3274      before cycle zero and after cycle zero.  */
3275   stage_count += CALC_STAGE_COUNT (new_max_cycle, 0, ps->ii);
3276 
3277   return stage_count;
3278 }
3279 
3280 /* Rotate the rows of PS such that insns scheduled at time
3281    START_CYCLE will appear in row 0.  Updates max/min_cycles.  */
3282 void
rotate_partial_schedule(partial_schedule_ptr ps,int start_cycle)3283 rotate_partial_schedule (partial_schedule_ptr ps, int start_cycle)
3284 {
3285   int i, row, backward_rotates;
3286   int last_row = ps->ii - 1;
3287 
3288   if (start_cycle == 0)
3289     return;
3290 
3291   backward_rotates = SMODULO (start_cycle, ps->ii);
3292 
3293   /* Revisit later and optimize this into a single loop.  */
3294   for (i = 0; i < backward_rotates; i++)
3295     {
3296       ps_insn_ptr first_row = ps->rows[0];
3297       int first_row_length = ps->rows_length[0];
3298 
3299       for (row = 0; row < last_row; row++)
3300 	{
3301 	  ps->rows[row] = ps->rows[row + 1];
3302 	  ps->rows_length[row] = ps->rows_length[row + 1];
3303 	}
3304 
3305       ps->rows[last_row] = first_row;
3306       ps->rows_length[last_row] = first_row_length;
3307     }
3308 
3309   ps->max_cycle -= start_cycle;
3310   ps->min_cycle -= start_cycle;
3311 }
3312 
3313 #endif /* INSN_SCHEDULING */
3314 
3315 /* Run instruction scheduler.  */
3316 /* Perform SMS module scheduling.  */
3317 
3318 namespace {
3319 
3320 const pass_data pass_data_sms =
3321 {
3322   RTL_PASS, /* type */
3323   "sms", /* name */
3324   OPTGROUP_NONE, /* optinfo_flags */
3325   TV_SMS, /* tv_id */
3326   0, /* properties_required */
3327   0, /* properties_provided */
3328   0, /* properties_destroyed */
3329   0, /* todo_flags_start */
3330   TODO_df_finish, /* todo_flags_finish */
3331 };
3332 
3333 class pass_sms : public rtl_opt_pass
3334 {
3335 public:
pass_sms(gcc::context * ctxt)3336   pass_sms (gcc::context *ctxt)
3337     : rtl_opt_pass (pass_data_sms, ctxt)
3338   {}
3339 
3340   /* opt_pass methods: */
gate(function *)3341   virtual bool gate (function *)
3342 {
3343   return (optimize > 0 && flag_modulo_sched);
3344 }
3345 
3346   virtual unsigned int execute (function *);
3347 
3348 }; // class pass_sms
3349 
3350 unsigned int
execute(function * fun ATTRIBUTE_UNUSED)3351 pass_sms::execute (function *fun ATTRIBUTE_UNUSED)
3352 {
3353 #ifdef INSN_SCHEDULING
3354   basic_block bb;
3355 
3356   /* Collect loop information to be used in SMS.  */
3357   cfg_layout_initialize (0);
3358   sms_schedule ();
3359 
3360   /* Update the life information, because we add pseudos.  */
3361   max_regno = max_reg_num ();
3362 
3363   /* Finalize layout changes.  */
3364   FOR_EACH_BB_FN (bb, fun)
3365     if (bb->next_bb != EXIT_BLOCK_PTR_FOR_FN (fun))
3366       bb->aux = bb->next_bb;
3367   free_dominance_info (CDI_DOMINATORS);
3368   cfg_layout_finalize ();
3369 #endif /* INSN_SCHEDULING */
3370   return 0;
3371 }
3372 
3373 } // anon namespace
3374 
3375 rtl_opt_pass *
make_pass_sms(gcc::context * ctxt)3376 make_pass_sms (gcc::context *ctxt)
3377 {
3378   return new pass_sms (ctxt);
3379 }
3380