1 /* Swing Modulo Scheduling implementation.
2    Copyright (C) 2004-2019 Free Software Foundation, Inc.
3    Contributed by Ayal Zaks and Mustafa Hagog <zaks,mustafa@il.ibm.com>
4 
5 This file is part of GCC.
6 
7 GCC is free software; you can redistribute it and/or modify it under
8 the terms of the GNU General Public License as published by the Free
9 Software Foundation; either version 3, or (at your option) any later
10 version.
11 
12 GCC is distributed in the hope that it will be useful, but WITHOUT ANY
13 WARRANTY; without even the implied warranty of MERCHANTABILITY or
14 FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License
15 for more details.
16 
17 You should have received a copy of the GNU General Public License
18 along with GCC; see the file COPYING3.  If not see
19 <http://www.gnu.org/licenses/>.  */
20 
21 
22 #include "config.h"
23 #include "system.h"
24 #include "coretypes.h"
25 #include "backend.h"
26 #include "target.h"
27 #include "rtl.h"
28 #include "tree.h"
29 #include "cfghooks.h"
30 #include "df.h"
31 #include "memmodel.h"
32 #include "optabs.h"
33 #include "regs.h"
34 #include "emit-rtl.h"
35 #include "gcov-io.h"
36 #include "profile.h"
37 #include "insn-attr.h"
38 #include "cfgrtl.h"
39 #include "sched-int.h"
40 #include "cfgloop.h"
41 #include "expr.h"
42 #include "params.h"
43 #include "ddg.h"
44 #include "tree-pass.h"
45 #include "dbgcnt.h"
46 #include "loop-unroll.h"
47 
48 #ifdef INSN_SCHEDULING
49 
50 /* This file contains the implementation of the Swing Modulo Scheduler,
51    described in the following references:
52    [1] J. Llosa, A. Gonzalez, E. Ayguade, M. Valero., and J. Eckhardt.
53        Lifetime--sensitive modulo scheduling in a production environment.
54        IEEE Trans. on Comps., 50(3), March 2001
55    [2] J. Llosa, A. Gonzalez, E. Ayguade, and M. Valero.
56        Swing Modulo Scheduling: A Lifetime Sensitive Approach.
57        PACT '96 , pages 80-87, October 1996 (Boston - Massachusetts - USA).
58 
59    The basic structure is:
60    1. Build a data-dependence graph (DDG) for each loop.
61    2. Use the DDG to order the insns of a loop (not in topological order
62       necessarily, but rather) trying to place each insn after all its
63       predecessors _or_ after all its successors.
64    3. Compute MII: a lower bound on the number of cycles to schedule the loop.
65    4. Use the ordering to perform list-scheduling of the loop:
66       1. Set II = MII.  We will try to schedule the loop within II cycles.
67       2. Try to schedule the insns one by one according to the ordering.
68 	 For each insn compute an interval of cycles by considering already-
69 	 scheduled preds and succs (and associated latencies); try to place
70 	 the insn in the cycles of this window checking for potential
71 	 resource conflicts (using the DFA interface).
72 	 Note: this is different from the cycle-scheduling of schedule_insns;
73 	 here the insns are not scheduled monotonically top-down (nor bottom-
74 	 up).
75       3. If failed in scheduling all insns - bump II++ and try again, unless
76 	 II reaches an upper bound MaxII, in which case report failure.
77    5. If we succeeded in scheduling the loop within II cycles, we now
78       generate prolog and epilog, decrease the counter of the loop, and
79       perform modulo variable expansion for live ranges that span more than
80       II cycles (i.e. use register copies to prevent a def from overwriting
81       itself before reaching the use).
82 
83     SMS works with countable loops (1) whose control part can be easily
84     decoupled from the rest of the loop and (2) whose loop count can
85     be easily adjusted.  This is because we peel a constant number of
86     iterations into a prologue and epilogue for which we want to avoid
87     emitting the control part, and a kernel which is to iterate that
88     constant number of iterations less than the original loop.  So the
89     control part should be a set of insns clearly identified and having
90     its own iv, not otherwise used in the loop (at-least for now), which
91     initializes a register before the loop to the number of iterations.
92     Currently SMS relies on the do-loop pattern to recognize such loops,
93     where (1) the control part comprises of all insns defining and/or
94     using a certain 'count' register and (2) the loop count can be
95     adjusted by modifying this register prior to the loop.
96     TODO: Rely on cfgloop analysis instead.  */
97 
98 /* This page defines partial-schedule structures and functions for
99    modulo scheduling.  */
100 
101 typedef struct partial_schedule *partial_schedule_ptr;
102 typedef struct ps_insn *ps_insn_ptr;
103 
104 /* The minimum (absolute) cycle that a node of ps was scheduled in.  */
105 #define PS_MIN_CYCLE(ps) (((partial_schedule_ptr)(ps))->min_cycle)
106 
107 /* The maximum (absolute) cycle that a node of ps was scheduled in.  */
108 #define PS_MAX_CYCLE(ps) (((partial_schedule_ptr)(ps))->max_cycle)
109 
110 /* Perform signed modulo, always returning a non-negative value.  */
111 #define SMODULO(x,y) ((x) % (y) < 0 ? ((x) % (y) + (y)) : (x) % (y))
112 
113 /* The number of different iterations the nodes in ps span, assuming
114    the stage boundaries are placed efficiently.  */
115 #define CALC_STAGE_COUNT(max_cycle,min_cycle,ii) ((max_cycle - min_cycle \
116                          + 1 + ii - 1) / ii)
117 /* The stage count of ps.  */
118 #define PS_STAGE_COUNT(ps) (((partial_schedule_ptr)(ps))->stage_count)
119 
120 /* A single instruction in the partial schedule.  */
121 struct ps_insn
122 {
123   /* Identifies the instruction to be scheduled.  Values smaller than
124      the ddg's num_nodes refer directly to ddg nodes.  A value of
125      X - num_nodes refers to register move X.  */
126   int id;
127 
128   /* The (absolute) cycle in which the PS instruction is scheduled.
129      Same as SCHED_TIME (node).  */
130   int cycle;
131 
132   /* The next/prev PS_INSN in the same row.  */
133   ps_insn_ptr next_in_row,
134 	      prev_in_row;
135 
136 };
137 
138 /* Information about a register move that has been added to a partial
139    schedule.  */
140 struct ps_reg_move_info
141 {
142   /* The source of the move is defined by the ps_insn with id DEF.
143      The destination is used by the ps_insns with the ids in USES.  */
144   int def;
145   sbitmap uses;
146 
147   /* The original form of USES' instructions used OLD_REG, but they
148      should now use NEW_REG.  */
149   rtx old_reg;
150   rtx new_reg;
151 
152   /* The number of consecutive stages that the move occupies.  */
153   int num_consecutive_stages;
154 
155   /* An instruction that sets NEW_REG to the correct value.  The first
156      move associated with DEF will have an rhs of OLD_REG; later moves
157      use the result of the previous move.  */
158   rtx_insn *insn;
159 };
160 
161 /* Holds the partial schedule as an array of II rows.  Each entry of the
162    array points to a linked list of PS_INSNs, which represents the
163    instructions that are scheduled for that row.  */
164 struct partial_schedule
165 {
166   int ii;	/* Number of rows in the partial schedule.  */
167   int history;  /* Threshold for conflict checking using DFA.  */
168 
169   /* rows[i] points to linked list of insns scheduled in row i (0<=i<ii).  */
170   ps_insn_ptr *rows;
171 
172   /* All the moves added for this partial schedule.  Index X has
173      a ps_insn id of X + g->num_nodes.  */
174   vec<ps_reg_move_info> reg_moves;
175 
176   /*  rows_length[i] holds the number of instructions in the row.
177       It is used only (as an optimization) to back off quickly from
178       trying to schedule a node in a full row; that is, to avoid running
179       through futile DFA state transitions.  */
180   int *rows_length;
181 
182   /* The earliest absolute cycle of an insn in the partial schedule.  */
183   int min_cycle;
184 
185   /* The latest absolute cycle of an insn in the partial schedule.  */
186   int max_cycle;
187 
188   ddg_ptr g;	/* The DDG of the insns in the partial schedule.  */
189 
190   int stage_count;  /* The stage count of the partial schedule.  */
191 };
192 
193 
194 static partial_schedule_ptr create_partial_schedule (int ii, ddg_ptr, int history);
195 static void free_partial_schedule (partial_schedule_ptr);
196 static void reset_partial_schedule (partial_schedule_ptr, int new_ii);
197 void print_partial_schedule (partial_schedule_ptr, FILE *);
198 static void verify_partial_schedule (partial_schedule_ptr, sbitmap);
199 static ps_insn_ptr ps_add_node_check_conflicts (partial_schedule_ptr,
200 						int, int, sbitmap, sbitmap);
201 static void rotate_partial_schedule (partial_schedule_ptr, int);
202 void set_row_column_for_ps (partial_schedule_ptr);
203 static void ps_insert_empty_row (partial_schedule_ptr, int, sbitmap);
204 static int compute_split_row (sbitmap, int, int, int, ddg_node_ptr);
205 
206 
207 /* This page defines constants and structures for the modulo scheduling
208    driver.  */
209 
210 static int sms_order_nodes (ddg_ptr, int, int *, int *);
211 static void set_node_sched_params (ddg_ptr);
212 static partial_schedule_ptr sms_schedule_by_order (ddg_ptr, int, int, int *);
213 static void permute_partial_schedule (partial_schedule_ptr, rtx_insn *);
214 static void generate_prolog_epilog (partial_schedule_ptr, struct loop *,
215                                     rtx, rtx);
216 static int calculate_stage_count (partial_schedule_ptr, int);
217 static void calculate_must_precede_follow (ddg_node_ptr, int, int,
218 					   int, int, sbitmap, sbitmap, sbitmap);
219 static int get_sched_window (partial_schedule_ptr, ddg_node_ptr,
220 			     sbitmap, int, int *, int *, int *);
221 static bool try_scheduling_node_in_cycle (partial_schedule_ptr, int, int,
222 					  sbitmap, int *, sbitmap, sbitmap);
223 static void remove_node_from_ps (partial_schedule_ptr, ps_insn_ptr);
224 
225 #define NODE_ASAP(node) ((node)->aux.count)
226 
227 #define SCHED_PARAMS(x) (&node_sched_param_vec[x])
228 #define SCHED_TIME(x) (SCHED_PARAMS (x)->time)
229 #define SCHED_ROW(x) (SCHED_PARAMS (x)->row)
230 #define SCHED_STAGE(x) (SCHED_PARAMS (x)->stage)
231 #define SCHED_COLUMN(x) (SCHED_PARAMS (x)->column)
232 
233 /* The scheduling parameters held for each node.  */
234 typedef struct node_sched_params
235 {
236   int time;	/* The absolute scheduling cycle.  */
237 
238   int row;    /* Holds time % ii.  */
239   int stage;  /* Holds time / ii.  */
240 
241   /* The column of a node inside the ps.  If nodes u, v are on the same row,
242      u will precede v if column (u) < column (v).  */
243   int column;
244 } *node_sched_params_ptr;
245 
246 /* The following three functions are copied from the current scheduler
247    code in order to use sched_analyze() for computing the dependencies.
248    They are used when initializing the sched_info structure.  */
249 static const char *
sms_print_insn(const rtx_insn * insn,int aligned ATTRIBUTE_UNUSED)250 sms_print_insn (const rtx_insn *insn, int aligned ATTRIBUTE_UNUSED)
251 {
252   static char tmp[80];
253 
254   sprintf (tmp, "i%4d", INSN_UID (insn));
255   return tmp;
256 }
257 
258 static void
compute_jump_reg_dependencies(rtx insn ATTRIBUTE_UNUSED,regset used ATTRIBUTE_UNUSED)259 compute_jump_reg_dependencies (rtx insn ATTRIBUTE_UNUSED,
260 			       regset used ATTRIBUTE_UNUSED)
261 {
262 }
263 
264 static struct common_sched_info_def sms_common_sched_info;
265 
266 static struct sched_deps_info_def sms_sched_deps_info =
267   {
268     compute_jump_reg_dependencies,
269     NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL,
270     NULL,
271     0, 0, 0
272   };
273 
274 static struct haifa_sched_info sms_sched_info =
275 {
276   NULL,
277   NULL,
278   NULL,
279   NULL,
280   NULL,
281   sms_print_insn,
282   NULL,
283   NULL, /* insn_finishes_block_p */
284   NULL, NULL,
285   NULL, NULL,
286   0, 0,
287 
288   NULL, NULL, NULL, NULL,
289   NULL, NULL,
290   0
291 };
292 
293 /* Partial schedule instruction ID in PS is a register move.  Return
294    information about it.  */
295 static struct ps_reg_move_info *
ps_reg_move(partial_schedule_ptr ps,int id)296 ps_reg_move (partial_schedule_ptr ps, int id)
297 {
298   gcc_checking_assert (id >= ps->g->num_nodes);
299   return &ps->reg_moves[id - ps->g->num_nodes];
300 }
301 
302 /* Return the rtl instruction that is being scheduled by partial schedule
303    instruction ID, which belongs to schedule PS.  */
304 static rtx_insn *
ps_rtl_insn(partial_schedule_ptr ps,int id)305 ps_rtl_insn (partial_schedule_ptr ps, int id)
306 {
307   if (id < ps->g->num_nodes)
308     return ps->g->nodes[id].insn;
309   else
310     return ps_reg_move (ps, id)->insn;
311 }
312 
313 /* Partial schedule instruction ID, which belongs to PS, occurred in
314    the original (unscheduled) loop.  Return the first instruction
315    in the loop that was associated with ps_rtl_insn (PS, ID).
316    If the instruction had some notes before it, this is the first
317    of those notes.  */
318 static rtx_insn *
ps_first_note(partial_schedule_ptr ps,int id)319 ps_first_note (partial_schedule_ptr ps, int id)
320 {
321   gcc_assert (id < ps->g->num_nodes);
322   return ps->g->nodes[id].first_note;
323 }
324 
325 /* Return the number of consecutive stages that are occupied by
326    partial schedule instruction ID in PS.  */
327 static int
ps_num_consecutive_stages(partial_schedule_ptr ps,int id)328 ps_num_consecutive_stages (partial_schedule_ptr ps, int id)
329 {
330   if (id < ps->g->num_nodes)
331     return 1;
332   else
333     return ps_reg_move (ps, id)->num_consecutive_stages;
334 }
335 
336 /* Given HEAD and TAIL which are the first and last insns in a loop;
337    return the register which controls the loop.  Return zero if it has
338    more than one occurrence in the loop besides the control part or the
339    do-loop pattern is not of the form we expect.  */
340 static rtx
doloop_register_get(rtx_insn * head,rtx_insn * tail)341 doloop_register_get (rtx_insn *head, rtx_insn *tail)
342 {
343   rtx reg, condition;
344   rtx_insn *insn, *first_insn_not_to_check;
345 
346   if (!JUMP_P (tail))
347     return NULL_RTX;
348 
349   if (!targetm.code_for_doloop_end)
350     return NULL_RTX;
351 
352   /* TODO: Free SMS's dependence on doloop_condition_get.  */
353   condition = doloop_condition_get (tail);
354   if (! condition)
355     return NULL_RTX;
356 
357   if (REG_P (XEXP (condition, 0)))
358     reg = XEXP (condition, 0);
359   else if (GET_CODE (XEXP (condition, 0)) == PLUS
360 	   && REG_P (XEXP (XEXP (condition, 0), 0)))
361     reg = XEXP (XEXP (condition, 0), 0);
362   else
363     gcc_unreachable ();
364 
365   /* Check that the COUNT_REG has no other occurrences in the loop
366      until the decrement.  We assume the control part consists of
367      either a single (parallel) branch-on-count or a (non-parallel)
368      branch immediately preceded by a single (decrement) insn.  */
369   first_insn_not_to_check = (GET_CODE (PATTERN (tail)) == PARALLEL ? tail
370                              : prev_nondebug_insn (tail));
371 
372   for (insn = head; insn != first_insn_not_to_check; insn = NEXT_INSN (insn))
373     if (!DEBUG_INSN_P (insn) && reg_mentioned_p (reg, insn))
374       {
375         if (dump_file)
376         {
377           fprintf (dump_file, "SMS count_reg found ");
378           print_rtl_single (dump_file, reg);
379           fprintf (dump_file, " outside control in insn:\n");
380           print_rtl_single (dump_file, insn);
381         }
382 
383         return NULL_RTX;
384       }
385 
386   return reg;
387 }
388 
389 /* Check if COUNT_REG is set to a constant in the PRE_HEADER block, so
390    that the number of iterations is a compile-time constant.  If so,
391    return the rtx_insn that sets COUNT_REG to a constant, and set COUNT to
392    this constant.  Otherwise return 0.  */
393 static rtx_insn *
const_iteration_count(rtx count_reg,basic_block pre_header,int64_t * count)394 const_iteration_count (rtx count_reg, basic_block pre_header,
395 		       int64_t * count)
396 {
397   rtx_insn *insn;
398   rtx_insn *head, *tail;
399 
400   if (! pre_header)
401     return NULL;
402 
403   get_ebb_head_tail (pre_header, pre_header, &head, &tail);
404 
405   for (insn = tail; insn != PREV_INSN (head); insn = PREV_INSN (insn))
406     if (NONDEBUG_INSN_P (insn) && single_set (insn) &&
407 	rtx_equal_p (count_reg, SET_DEST (single_set (insn))))
408       {
409 	rtx pat = single_set (insn);
410 
411 	if (CONST_INT_P (SET_SRC (pat)))
412 	  {
413 	    *count = INTVAL (SET_SRC (pat));
414 	    return insn;
415 	  }
416 
417 	return NULL;
418       }
419 
420   return NULL;
421 }
422 
423 /* A very simple resource-based lower bound on the initiation interval.
424    ??? Improve the accuracy of this bound by considering the
425    utilization of various units.  */
426 static int
res_MII(ddg_ptr g)427 res_MII (ddg_ptr g)
428 {
429   if (targetm.sched.sms_res_mii)
430     return targetm.sched.sms_res_mii (g);
431 
432   return ((g->num_nodes - g->num_debug) / issue_rate);
433 }
434 
435 
436 /* A vector that contains the sched data for each ps_insn.  */
437 static vec<node_sched_params> node_sched_param_vec;
438 
439 /* Allocate sched_params for each node and initialize it.  */
440 static void
set_node_sched_params(ddg_ptr g)441 set_node_sched_params (ddg_ptr g)
442 {
443   node_sched_param_vec.truncate (0);
444   node_sched_param_vec.safe_grow_cleared (g->num_nodes);
445 }
446 
447 /* Make sure that node_sched_param_vec has an entry for every move in PS.  */
448 static void
extend_node_sched_params(partial_schedule_ptr ps)449 extend_node_sched_params (partial_schedule_ptr ps)
450 {
451   node_sched_param_vec.safe_grow_cleared (ps->g->num_nodes
452 					  + ps->reg_moves.length ());
453 }
454 
455 /* Update the sched_params (time, row and stage) for node U using the II,
456    the CYCLE of U and MIN_CYCLE.
457    We're not simply taking the following
458    SCHED_STAGE (u) = CALC_STAGE_COUNT (SCHED_TIME (u), min_cycle, ii);
459    because the stages may not be aligned on cycle 0.  */
460 static void
update_node_sched_params(int u,int ii,int cycle,int min_cycle)461 update_node_sched_params (int u, int ii, int cycle, int min_cycle)
462 {
463   int sc_until_cycle_zero;
464   int stage;
465 
466   SCHED_TIME (u) = cycle;
467   SCHED_ROW (u) = SMODULO (cycle, ii);
468 
469   /* The calculation of stage count is done adding the number
470      of stages before cycle zero and after cycle zero.  */
471   sc_until_cycle_zero = CALC_STAGE_COUNT (-1, min_cycle, ii);
472 
473   if (SCHED_TIME (u) < 0)
474     {
475       stage = CALC_STAGE_COUNT (-1, SCHED_TIME (u), ii);
476       SCHED_STAGE (u) = sc_until_cycle_zero - stage;
477     }
478   else
479     {
480       stage = CALC_STAGE_COUNT (SCHED_TIME (u), 0, ii);
481       SCHED_STAGE (u) = sc_until_cycle_zero + stage - 1;
482     }
483 }
484 
485 static void
print_node_sched_params(FILE * file,int num_nodes,partial_schedule_ptr ps)486 print_node_sched_params (FILE *file, int num_nodes, partial_schedule_ptr ps)
487 {
488   int i;
489 
490   if (! file)
491     return;
492   for (i = 0; i < num_nodes; i++)
493     {
494       node_sched_params_ptr nsp = SCHED_PARAMS (i);
495 
496       fprintf (file, "Node = %d; INSN = %d\n", i,
497 	       INSN_UID (ps_rtl_insn (ps, i)));
498       fprintf (file, " asap = %d:\n", NODE_ASAP (&ps->g->nodes[i]));
499       fprintf (file, " time = %d:\n", nsp->time);
500       fprintf (file, " stage = %d:\n", nsp->stage);
501     }
502 }
503 
504 /* Set SCHED_COLUMN for each instruction in row ROW of PS.  */
505 static void
set_columns_for_row(partial_schedule_ptr ps,int row)506 set_columns_for_row (partial_schedule_ptr ps, int row)
507 {
508   ps_insn_ptr cur_insn;
509   int column;
510 
511   column = 0;
512   for (cur_insn = ps->rows[row]; cur_insn; cur_insn = cur_insn->next_in_row)
513     SCHED_COLUMN (cur_insn->id) = column++;
514 }
515 
516 /* Set SCHED_COLUMN for each instruction in PS.  */
517 static void
set_columns_for_ps(partial_schedule_ptr ps)518 set_columns_for_ps (partial_schedule_ptr ps)
519 {
520   int row;
521 
522   for (row = 0; row < ps->ii; row++)
523     set_columns_for_row (ps, row);
524 }
525 
526 /* Try to schedule the move with ps_insn identifier I_REG_MOVE in PS.
527    Its single predecessor has already been scheduled, as has its
528    ddg node successors.  (The move may have also another move as its
529    successor, in which case that successor will be scheduled later.)
530 
531    The move is part of a chain that satisfies register dependencies
532    between a producing ddg node and various consuming ddg nodes.
533    If some of these dependencies have a distance of 1 (meaning that
534    the use is upward-exposed) then DISTANCE1_USES is nonnull and
535    contains the set of uses with distance-1 dependencies.
536    DISTANCE1_USES is null otherwise.
537 
538    MUST_FOLLOW is a scratch bitmap that is big enough to hold
539    all current ps_insn ids.
540 
541    Return true on success.  */
542 static bool
schedule_reg_move(partial_schedule_ptr ps,int i_reg_move,sbitmap distance1_uses,sbitmap must_follow)543 schedule_reg_move (partial_schedule_ptr ps, int i_reg_move,
544 		   sbitmap distance1_uses, sbitmap must_follow)
545 {
546   unsigned int u;
547   int this_time, this_distance, this_start, this_end, this_latency;
548   int start, end, c, ii;
549   sbitmap_iterator sbi;
550   ps_reg_move_info *move;
551   rtx_insn *this_insn;
552   ps_insn_ptr psi;
553 
554   move = ps_reg_move (ps, i_reg_move);
555   ii = ps->ii;
556   if (dump_file)
557     {
558       fprintf (dump_file, "Scheduling register move INSN %d; ii = %d"
559 	       ", min cycle = %d\n\n", INSN_UID (move->insn), ii,
560 	       PS_MIN_CYCLE (ps));
561       print_rtl_single (dump_file, move->insn);
562       fprintf (dump_file, "\n%11s %11s %5s\n", "start", "end", "time");
563       fprintf (dump_file, "=========== =========== =====\n");
564     }
565 
566   start = INT_MIN;
567   end = INT_MAX;
568 
569   /* For dependencies of distance 1 between a producer ddg node A
570      and consumer ddg node B, we have a chain of dependencies:
571 
572         A --(T,L1,1)--> M1 --(T,L2,0)--> M2 ... --(T,Ln,0)--> B
573 
574      where Mi is the ith move.  For dependencies of distance 0 between
575      a producer ddg node A and consumer ddg node C, we have a chain of
576      dependencies:
577 
578         A --(T,L1',0)--> M1' --(T,L2',0)--> M2' ... --(T,Ln',0)--> C
579 
580      where Mi' occupies the same position as Mi but occurs a stage later.
581      We can only schedule each move once, so if we have both types of
582      chain, we model the second as:
583 
584         A --(T,L1',1)--> M1 --(T,L2',0)--> M2 ... --(T,Ln',-1)--> C
585 
586      First handle the dependencies between the previously-scheduled
587      predecessor and the move.  */
588   this_insn = ps_rtl_insn (ps, move->def);
589   this_latency = insn_latency (this_insn, move->insn);
590   this_distance = distance1_uses && move->def < ps->g->num_nodes ? 1 : 0;
591   this_time = SCHED_TIME (move->def) - this_distance * ii;
592   this_start = this_time + this_latency;
593   this_end = this_time + ii;
594   if (dump_file)
595     fprintf (dump_file, "%11d %11d %5d %d --(T,%d,%d)--> %d\n",
596 	     this_start, this_end, SCHED_TIME (move->def),
597 	     INSN_UID (this_insn), this_latency, this_distance,
598 	     INSN_UID (move->insn));
599 
600   if (start < this_start)
601     start = this_start;
602   if (end > this_end)
603     end = this_end;
604 
605   /* Handle the dependencies between the move and previously-scheduled
606      successors.  */
607   EXECUTE_IF_SET_IN_BITMAP (move->uses, 0, u, sbi)
608     {
609       this_insn = ps_rtl_insn (ps, u);
610       this_latency = insn_latency (move->insn, this_insn);
611       if (distance1_uses && !bitmap_bit_p (distance1_uses, u))
612 	this_distance = -1;
613       else
614 	this_distance = 0;
615       this_time = SCHED_TIME (u) + this_distance * ii;
616       this_start = this_time - ii;
617       this_end = this_time - this_latency;
618       if (dump_file)
619 	fprintf (dump_file, "%11d %11d %5d %d --(T,%d,%d)--> %d\n",
620 		 this_start, this_end, SCHED_TIME (u), INSN_UID (move->insn),
621 		 this_latency, this_distance, INSN_UID (this_insn));
622 
623       if (start < this_start)
624 	start = this_start;
625       if (end > this_end)
626 	end = this_end;
627     }
628 
629   if (dump_file)
630     {
631       fprintf (dump_file, "----------- ----------- -----\n");
632       fprintf (dump_file, "%11d %11d %5s %s\n", start, end, "", "(max, min)");
633     }
634 
635   bitmap_clear (must_follow);
636   bitmap_set_bit (must_follow, move->def);
637 
638   start = MAX (start, end - (ii - 1));
639   for (c = end; c >= start; c--)
640     {
641       psi = ps_add_node_check_conflicts (ps, i_reg_move, c,
642 					 move->uses, must_follow);
643       if (psi)
644 	{
645 	  update_node_sched_params (i_reg_move, ii, c, PS_MIN_CYCLE (ps));
646 	  if (dump_file)
647 	    fprintf (dump_file, "\nScheduled register move INSN %d at"
648 		     " time %d, row %d\n\n", INSN_UID (move->insn), c,
649 		     SCHED_ROW (i_reg_move));
650 	  return true;
651 	}
652     }
653 
654   if (dump_file)
655     fprintf (dump_file, "\nNo available slot\n\n");
656 
657   return false;
658 }
659 
660 /*
661    Breaking intra-loop register anti-dependences:
662    Each intra-loop register anti-dependence implies a cross-iteration true
663    dependence of distance 1. Therefore, we can remove such false dependencies
664    and figure out if the partial schedule broke them by checking if (for a
665    true-dependence of distance 1): SCHED_TIME (def) < SCHED_TIME (use) and
666    if so generate a register move.   The number of such moves is equal to:
667               SCHED_TIME (use) - SCHED_TIME (def)       { 0 broken
668    nreg_moves = ----------------------------------- + 1 - {   dependence.
669                             ii                          { 1 if not.
670 */
671 static bool
schedule_reg_moves(partial_schedule_ptr ps)672 schedule_reg_moves (partial_schedule_ptr ps)
673 {
674   ddg_ptr g = ps->g;
675   int ii = ps->ii;
676   int i;
677 
678   for (i = 0; i < g->num_nodes; i++)
679     {
680       ddg_node_ptr u = &g->nodes[i];
681       ddg_edge_ptr e;
682       int nreg_moves = 0, i_reg_move;
683       rtx prev_reg, old_reg;
684       int first_move;
685       int distances[2];
686       sbitmap distance1_uses;
687       rtx set = single_set (u->insn);
688 
689       /* Skip instructions that do not set a register.  */
690       if (set && !REG_P (SET_DEST (set)))
691         continue;
692 
693       /* Compute the number of reg_moves needed for u, by looking at life
694 	 ranges started at u (excluding self-loops).  */
695       distances[0] = distances[1] = false;
696       for (e = u->out; e; e = e->next_out)
697 	if (e->type == TRUE_DEP && e->dest != e->src)
698 	  {
699 	    int nreg_moves4e = (SCHED_TIME (e->dest->cuid)
700 				- SCHED_TIME (e->src->cuid)) / ii;
701 
702             if (e->distance == 1)
703               nreg_moves4e = (SCHED_TIME (e->dest->cuid)
704 			      - SCHED_TIME (e->src->cuid) + ii) / ii;
705 
706 	    /* If dest precedes src in the schedule of the kernel, then dest
707 	       will read before src writes and we can save one reg_copy.  */
708 	    if (SCHED_ROW (e->dest->cuid) == SCHED_ROW (e->src->cuid)
709 		&& SCHED_COLUMN (e->dest->cuid) < SCHED_COLUMN (e->src->cuid))
710 	      nreg_moves4e--;
711 
712             if (nreg_moves4e >= 1)
713 	      {
714 		/* !single_set instructions are not supported yet and
715 		   thus we do not except to encounter them in the loop
716 		   except from the doloop part.  For the latter case
717 		   we assume no regmoves are generated as the doloop
718 		   instructions are tied to the branch with an edge.  */
719 		gcc_assert (set);
720 		/* If the instruction contains auto-inc register then
721 		   validate that the regmov is being generated for the
722 		   target regsiter rather then the inc'ed register.	*/
723 		gcc_assert (!autoinc_var_is_used_p (u->insn, e->dest->insn));
724 	      }
725 
726 	    if (nreg_moves4e)
727 	      {
728 		gcc_assert (e->distance < 2);
729 		distances[e->distance] = true;
730 	      }
731 	    nreg_moves = MAX (nreg_moves, nreg_moves4e);
732 	  }
733 
734       if (nreg_moves == 0)
735 	continue;
736 
737       /* Create NREG_MOVES register moves.  */
738       first_move = ps->reg_moves.length ();
739       ps->reg_moves.safe_grow_cleared (first_move + nreg_moves);
740       extend_node_sched_params (ps);
741 
742       /* Record the moves associated with this node.  */
743       first_move += ps->g->num_nodes;
744 
745       /* Generate each move.  */
746       old_reg = prev_reg = SET_DEST (set);
747       if (HARD_REGISTER_P (old_reg))
748 	return false;
749 
750       for (i_reg_move = 0; i_reg_move < nreg_moves; i_reg_move++)
751 	{
752 	  ps_reg_move_info *move = ps_reg_move (ps, first_move + i_reg_move);
753 
754 	  move->def = i_reg_move > 0 ? first_move + i_reg_move - 1 : i;
755 	  move->uses = sbitmap_alloc (first_move + nreg_moves);
756 	  move->old_reg = old_reg;
757 	  move->new_reg = gen_reg_rtx (GET_MODE (prev_reg));
758 	  move->num_consecutive_stages = distances[0] && distances[1] ? 2 : 1;
759 	  move->insn = gen_move_insn (move->new_reg, copy_rtx (prev_reg));
760 	  bitmap_clear (move->uses);
761 
762 	  prev_reg = move->new_reg;
763 	}
764 
765       distance1_uses = distances[1] ? sbitmap_alloc (g->num_nodes) : NULL;
766 
767       if (distance1_uses)
768 	bitmap_clear (distance1_uses);
769 
770       /* Every use of the register defined by node may require a different
771 	 copy of this register, depending on the time the use is scheduled.
772 	 Record which uses require which move results.  */
773       for (e = u->out; e; e = e->next_out)
774 	if (e->type == TRUE_DEP && e->dest != e->src)
775 	  {
776 	    int dest_copy = (SCHED_TIME (e->dest->cuid)
777 			     - SCHED_TIME (e->src->cuid)) / ii;
778 
779 	    if (e->distance == 1)
780 	      dest_copy = (SCHED_TIME (e->dest->cuid)
781 			   - SCHED_TIME (e->src->cuid) + ii) / ii;
782 
783 	    if (SCHED_ROW (e->dest->cuid) == SCHED_ROW (e->src->cuid)
784 		&& SCHED_COLUMN (e->dest->cuid) < SCHED_COLUMN (e->src->cuid))
785 	      dest_copy--;
786 
787 	    if (dest_copy)
788 	      {
789 		ps_reg_move_info *move;
790 
791 		move = ps_reg_move (ps, first_move + dest_copy - 1);
792 		bitmap_set_bit (move->uses, e->dest->cuid);
793 		if (e->distance == 1)
794 		  bitmap_set_bit (distance1_uses, e->dest->cuid);
795 	      }
796 	  }
797 
798       auto_sbitmap must_follow (first_move + nreg_moves);
799       for (i_reg_move = 0; i_reg_move < nreg_moves; i_reg_move++)
800 	if (!schedule_reg_move (ps, first_move + i_reg_move,
801 				distance1_uses, must_follow))
802 	  break;
803       if (distance1_uses)
804 	sbitmap_free (distance1_uses);
805       if (i_reg_move < nreg_moves)
806 	return false;
807     }
808   return true;
809 }
810 
811 /* Emit the moves associated with PS.  Apply the substitutions
812    associated with them.  */
813 static void
apply_reg_moves(partial_schedule_ptr ps)814 apply_reg_moves (partial_schedule_ptr ps)
815 {
816   ps_reg_move_info *move;
817   int i;
818 
819   FOR_EACH_VEC_ELT (ps->reg_moves, i, move)
820     {
821       unsigned int i_use;
822       sbitmap_iterator sbi;
823 
824       EXECUTE_IF_SET_IN_BITMAP (move->uses, 0, i_use, sbi)
825 	{
826 	  replace_rtx (ps->g->nodes[i_use].insn, move->old_reg, move->new_reg);
827 	  df_insn_rescan (ps->g->nodes[i_use].insn);
828 	}
829     }
830 }
831 
832 /* Bump the SCHED_TIMEs of all nodes by AMOUNT.  Set the values of
833    SCHED_ROW and SCHED_STAGE.  Instruction scheduled on cycle AMOUNT
834    will move to cycle zero.  */
835 static void
reset_sched_times(partial_schedule_ptr ps,int amount)836 reset_sched_times (partial_schedule_ptr ps, int amount)
837 {
838   int row;
839   int ii = ps->ii;
840   ps_insn_ptr crr_insn;
841 
842   for (row = 0; row < ii; row++)
843     for (crr_insn = ps->rows[row]; crr_insn; crr_insn = crr_insn->next_in_row)
844       {
845 	int u = crr_insn->id;
846 	int normalized_time = SCHED_TIME (u) - amount;
847 	int new_min_cycle = PS_MIN_CYCLE (ps) - amount;
848 
849         if (dump_file)
850           {
851             /* Print the scheduling times after the rotation.  */
852 	    rtx_insn *insn = ps_rtl_insn (ps, u);
853 
854             fprintf (dump_file, "crr_insn->node=%d (insn id %d), "
855                      "crr_insn->cycle=%d, min_cycle=%d", u,
856                      INSN_UID (insn), normalized_time, new_min_cycle);
857             if (JUMP_P (insn))
858               fprintf (dump_file, " (branch)");
859             fprintf (dump_file, "\n");
860           }
861 
862 	gcc_assert (SCHED_TIME (u) >= ps->min_cycle);
863 	gcc_assert (SCHED_TIME (u) <= ps->max_cycle);
864 
865 	crr_insn->cycle = normalized_time;
866 	update_node_sched_params (u, ii, normalized_time, new_min_cycle);
867       }
868 }
869 
870 /* Permute the insns according to their order in PS, from row 0 to
871    row ii-1, and position them right before LAST.  This schedules
872    the insns of the loop kernel.  */
873 static void
permute_partial_schedule(partial_schedule_ptr ps,rtx_insn * last)874 permute_partial_schedule (partial_schedule_ptr ps, rtx_insn *last)
875 {
876   int ii = ps->ii;
877   int row;
878   ps_insn_ptr ps_ij;
879 
880   for (row = 0; row < ii ; row++)
881     for (ps_ij = ps->rows[row]; ps_ij; ps_ij = ps_ij->next_in_row)
882       {
883 	rtx_insn *insn = ps_rtl_insn (ps, ps_ij->id);
884 
885 	if (PREV_INSN (last) != insn)
886 	  {
887 	    if (ps_ij->id < ps->g->num_nodes)
888 	      reorder_insns_nobb (ps_first_note (ps, ps_ij->id), insn,
889 				  PREV_INSN (last));
890 	    else
891 	      add_insn_before (insn, last, NULL);
892 	  }
893       }
894 }
895 
896 /* Set bitmaps TMP_FOLLOW and TMP_PRECEDE to MUST_FOLLOW and MUST_PRECEDE
897    respectively only if cycle C falls on the border of the scheduling
898    window boundaries marked by START and END cycles.  STEP is the
899    direction of the window.  */
900 static inline void
set_must_precede_follow(sbitmap * tmp_follow,sbitmap must_follow,sbitmap * tmp_precede,sbitmap must_precede,int c,int start,int end,int step)901 set_must_precede_follow (sbitmap *tmp_follow, sbitmap must_follow,
902 			 sbitmap *tmp_precede, sbitmap must_precede, int c,
903 			 int start, int end, int step)
904 {
905   *tmp_precede = NULL;
906   *tmp_follow = NULL;
907 
908   if (c == start)
909     {
910       if (step == 1)
911 	*tmp_precede = must_precede;
912       else			/* step == -1.  */
913 	*tmp_follow = must_follow;
914     }
915   if (c == end - step)
916     {
917       if (step == 1)
918 	*tmp_follow = must_follow;
919       else			/* step == -1.  */
920 	*tmp_precede = must_precede;
921     }
922 
923 }
924 
925 /* Return True if the branch can be moved to row ii-1 while
926    normalizing the partial schedule PS to start from cycle zero and thus
927    optimize the SC.  Otherwise return False.  */
928 static bool
optimize_sc(partial_schedule_ptr ps,ddg_ptr g)929 optimize_sc (partial_schedule_ptr ps, ddg_ptr g)
930 {
931   int amount = PS_MIN_CYCLE (ps);
932   int start, end, step;
933   int ii = ps->ii;
934   bool ok = false;
935   int stage_count, stage_count_curr;
936 
937   /* Compare the SC after normalization and SC after bringing the branch
938      to row ii-1.  If they are equal just bail out.  */
939   stage_count = calculate_stage_count (ps, amount);
940   stage_count_curr =
941     calculate_stage_count (ps, SCHED_TIME (g->closing_branch->cuid) - (ii - 1));
942 
943   if (stage_count == stage_count_curr)
944     {
945       if (dump_file)
946 	fprintf (dump_file, "SMS SC already optimized.\n");
947 
948       return false;
949     }
950 
951   if (dump_file)
952     {
953       fprintf (dump_file, "SMS Trying to optimize branch location\n");
954       fprintf (dump_file, "SMS partial schedule before trial:\n");
955       print_partial_schedule (ps, dump_file);
956     }
957 
958   /* First, normalize the partial scheduling.  */
959   reset_sched_times (ps, amount);
960   rotate_partial_schedule (ps, amount);
961   if (dump_file)
962     {
963       fprintf (dump_file,
964 	       "SMS partial schedule after normalization (ii, %d, SC %d):\n",
965 	       ii, stage_count);
966       print_partial_schedule (ps, dump_file);
967     }
968 
969   if (SMODULO (SCHED_TIME (g->closing_branch->cuid), ii) == ii - 1)
970     return true;
971 
972   auto_sbitmap sched_nodes (g->num_nodes);
973   bitmap_ones (sched_nodes);
974 
975   /* Calculate the new placement of the branch.  It should be in row
976      ii-1 and fall into it's scheduling window.  */
977   if (get_sched_window (ps, g->closing_branch, sched_nodes, ii, &start,
978 			&step, &end) == 0)
979     {
980       bool success;
981       ps_insn_ptr next_ps_i;
982       int branch_cycle = SCHED_TIME (g->closing_branch->cuid);
983       int row = SMODULO (branch_cycle, ps->ii);
984       int num_splits = 0;
985       sbitmap tmp_precede, tmp_follow;
986       int min_cycle, c;
987 
988       if (dump_file)
989 	fprintf (dump_file, "\nTrying to schedule node %d "
990 		 "INSN = %d  in (%d .. %d) step %d\n",
991 		 g->closing_branch->cuid,
992 		 (INSN_UID (g->closing_branch->insn)), start, end, step);
993 
994       gcc_assert ((step > 0 && start < end) || (step < 0 && start > end));
995       if (step == 1)
996 	{
997 	  c = start + ii - SMODULO (start, ii) - 1;
998 	  gcc_assert (c >= start);
999 	  if (c >= end)
1000 	    {
1001 	      if (dump_file)
1002 		fprintf (dump_file,
1003 			 "SMS failed to schedule branch at cycle: %d\n", c);
1004 	      return false;
1005 	    }
1006 	}
1007       else
1008 	{
1009 	  c = start - SMODULO (start, ii) - 1;
1010 	  gcc_assert (c <= start);
1011 
1012 	  if (c <= end)
1013 	    {
1014 	      if (dump_file)
1015 		fprintf (dump_file,
1016 			 "SMS failed to schedule branch at cycle: %d\n", c);
1017 	      return false;
1018 	    }
1019 	}
1020 
1021       auto_sbitmap must_precede (g->num_nodes);
1022       auto_sbitmap must_follow (g->num_nodes);
1023 
1024       /* Try to schedule the branch is it's new cycle.  */
1025       calculate_must_precede_follow (g->closing_branch, start, end,
1026 				     step, ii, sched_nodes,
1027 				     must_precede, must_follow);
1028 
1029       set_must_precede_follow (&tmp_follow, must_follow, &tmp_precede,
1030 			       must_precede, c, start, end, step);
1031 
1032       /* Find the element in the partial schedule related to the closing
1033          branch so we can remove it from it's current cycle.  */
1034       for (next_ps_i = ps->rows[row];
1035 	   next_ps_i; next_ps_i = next_ps_i->next_in_row)
1036 	if (next_ps_i->id == g->closing_branch->cuid)
1037 	  break;
1038 
1039       min_cycle = PS_MIN_CYCLE (ps) - SMODULO (PS_MIN_CYCLE (ps), ps->ii);
1040       remove_node_from_ps (ps, next_ps_i);
1041       success =
1042 	try_scheduling_node_in_cycle (ps, g->closing_branch->cuid, c,
1043 				      sched_nodes, &num_splits,
1044 				      tmp_precede, tmp_follow);
1045       gcc_assert (num_splits == 0);
1046       if (!success)
1047 	{
1048 	  if (dump_file)
1049 	    fprintf (dump_file,
1050 		     "SMS failed to schedule branch at cycle: %d, "
1051 		     "bringing it back to cycle %d\n", c, branch_cycle);
1052 
1053 	  /* The branch was failed to be placed in row ii - 1.
1054 	     Put it back in it's original place in the partial
1055 	     schedualing.  */
1056 	  set_must_precede_follow (&tmp_follow, must_follow, &tmp_precede,
1057 				   must_precede, branch_cycle, start, end,
1058 				   step);
1059 	  success =
1060 	    try_scheduling_node_in_cycle (ps, g->closing_branch->cuid,
1061 					  branch_cycle, sched_nodes,
1062 					  &num_splits, tmp_precede,
1063 					  tmp_follow);
1064 	  gcc_assert (success && (num_splits == 0));
1065 	  ok = false;
1066 	}
1067       else
1068 	{
1069 	  /* The branch is placed in row ii - 1.  */
1070 	  if (dump_file)
1071 	    fprintf (dump_file,
1072 		     "SMS success in moving branch to cycle %d\n", c);
1073 
1074 	  update_node_sched_params (g->closing_branch->cuid, ii, c,
1075 				    PS_MIN_CYCLE (ps));
1076 	  ok = true;
1077 	}
1078 
1079       /* This might have been added to a new first stage.  */
1080       if (PS_MIN_CYCLE (ps) < min_cycle)
1081 	reset_sched_times (ps, 0);
1082     }
1083 
1084   return ok;
1085 }
1086 
1087 static void
duplicate_insns_of_cycles(partial_schedule_ptr ps,int from_stage,int to_stage,rtx count_reg)1088 duplicate_insns_of_cycles (partial_schedule_ptr ps, int from_stage,
1089 			   int to_stage, rtx count_reg)
1090 {
1091   int row;
1092   ps_insn_ptr ps_ij;
1093 
1094   for (row = 0; row < ps->ii; row++)
1095     for (ps_ij = ps->rows[row]; ps_ij; ps_ij = ps_ij->next_in_row)
1096       {
1097 	int u = ps_ij->id;
1098 	int first_u, last_u;
1099 	rtx_insn *u_insn;
1100 
1101         /* Do not duplicate any insn which refers to count_reg as it
1102            belongs to the control part.
1103            The closing branch is scheduled as well and thus should
1104            be ignored.
1105            TODO: This should be done by analyzing the control part of
1106            the loop.  */
1107 	u_insn = ps_rtl_insn (ps, u);
1108         if (reg_mentioned_p (count_reg, u_insn)
1109             || JUMP_P (u_insn))
1110           continue;
1111 
1112 	first_u = SCHED_STAGE (u);
1113 	last_u = first_u + ps_num_consecutive_stages (ps, u) - 1;
1114 	if (from_stage <= last_u && to_stage >= first_u)
1115 	  {
1116 	    if (u < ps->g->num_nodes)
1117 	      duplicate_insn_chain (ps_first_note (ps, u), u_insn);
1118 	    else
1119 	      emit_insn (copy_rtx (PATTERN (u_insn)));
1120 	  }
1121       }
1122 }
1123 
1124 
1125 /* Generate the instructions (including reg_moves) for prolog & epilog.  */
1126 static void
generate_prolog_epilog(partial_schedule_ptr ps,struct loop * loop,rtx count_reg,rtx count_init)1127 generate_prolog_epilog (partial_schedule_ptr ps, struct loop *loop,
1128                         rtx count_reg, rtx count_init)
1129 {
1130   int i;
1131   int last_stage = PS_STAGE_COUNT (ps) - 1;
1132   edge e;
1133 
1134   /* Generate the prolog, inserting its insns on the loop-entry edge.  */
1135   start_sequence ();
1136 
1137   if (!count_init)
1138     {
1139       /* Generate instructions at the beginning of the prolog to
1140          adjust the loop count by STAGE_COUNT.  If loop count is constant
1141          (count_init), this constant is adjusted by STAGE_COUNT in
1142          generate_prolog_epilog function.  */
1143       rtx sub_reg = NULL_RTX;
1144 
1145       sub_reg = expand_simple_binop (GET_MODE (count_reg), MINUS, count_reg,
1146 				     gen_int_mode (last_stage,
1147 						   GET_MODE (count_reg)),
1148                                      count_reg, 1, OPTAB_DIRECT);
1149       gcc_assert (REG_P (sub_reg));
1150       if (REGNO (sub_reg) != REGNO (count_reg))
1151         emit_move_insn (count_reg, sub_reg);
1152     }
1153 
1154   for (i = 0; i < last_stage; i++)
1155     duplicate_insns_of_cycles (ps, 0, i, count_reg);
1156 
1157   /* Put the prolog on the entry edge.  */
1158   e = loop_preheader_edge (loop);
1159   split_edge_and_insert (e, get_insns ());
1160   if (!flag_resched_modulo_sched)
1161     e->dest->flags |= BB_DISABLE_SCHEDULE;
1162 
1163   end_sequence ();
1164 
1165   /* Generate the epilog, inserting its insns on the loop-exit edge.  */
1166   start_sequence ();
1167 
1168   for (i = 0; i < last_stage; i++)
1169     duplicate_insns_of_cycles (ps, i + 1, last_stage, count_reg);
1170 
1171   /* Put the epilogue on the exit edge.  */
1172   gcc_assert (single_exit (loop));
1173   e = single_exit (loop);
1174   split_edge_and_insert (e, get_insns ());
1175   if (!flag_resched_modulo_sched)
1176     e->dest->flags |= BB_DISABLE_SCHEDULE;
1177 
1178   end_sequence ();
1179 }
1180 
1181 /* Mark LOOP as software pipelined so the later
1182    scheduling passes don't touch it.  */
1183 static void
mark_loop_unsched(struct loop * loop)1184 mark_loop_unsched (struct loop *loop)
1185 {
1186   unsigned i;
1187   basic_block *bbs = get_loop_body (loop);
1188 
1189   for (i = 0; i < loop->num_nodes; i++)
1190     bbs[i]->flags |= BB_DISABLE_SCHEDULE;
1191 
1192   free (bbs);
1193 }
1194 
1195 /* Return true if all the BBs of the loop are empty except the
1196    loop header.  */
1197 static bool
loop_single_full_bb_p(struct loop * loop)1198 loop_single_full_bb_p (struct loop *loop)
1199 {
1200   unsigned i;
1201   basic_block *bbs = get_loop_body (loop);
1202 
1203   for (i = 0; i < loop->num_nodes ; i++)
1204     {
1205       rtx_insn *head, *tail;
1206       bool empty_bb = true;
1207 
1208       if (bbs[i] == loop->header)
1209         continue;
1210 
1211       /* Make sure that basic blocks other than the header
1212          have only notes labels or jumps.  */
1213       get_ebb_head_tail (bbs[i], bbs[i], &head, &tail);
1214       for (; head != NEXT_INSN (tail); head = NEXT_INSN (head))
1215         {
1216           if (NOTE_P (head) || LABEL_P (head)
1217  	      || (INSN_P (head) && (DEBUG_INSN_P (head) || JUMP_P (head))))
1218  	    continue;
1219  	  empty_bb = false;
1220  	  break;
1221         }
1222 
1223       if (! empty_bb)
1224         {
1225           free (bbs);
1226           return false;
1227         }
1228     }
1229   free (bbs);
1230   return true;
1231 }
1232 
1233 /* Dump file:line from INSN's location info to dump_file.  */
1234 
1235 static void
dump_insn_location(rtx_insn * insn)1236 dump_insn_location (rtx_insn *insn)
1237 {
1238   if (dump_file && INSN_HAS_LOCATION (insn))
1239     {
1240       expanded_location xloc = insn_location (insn);
1241       fprintf (dump_file, " %s:%i", xloc.file, xloc.line);
1242     }
1243 }
1244 
1245 /* A simple loop from SMS point of view; it is a loop that is composed of
1246    either a single basic block or two BBs - a header and a latch.  */
1247 #define SIMPLE_SMS_LOOP_P(loop) ((loop->num_nodes < 3 ) 		    \
1248 				  && (EDGE_COUNT (loop->latch->preds) == 1) \
1249                                   && (EDGE_COUNT (loop->latch->succs) == 1))
1250 
1251 /* Return true if the loop is in its canonical form and false if not.
1252    i.e. SIMPLE_SMS_LOOP_P and have one preheader block, and single exit.  */
1253 static bool
loop_canon_p(struct loop * loop)1254 loop_canon_p (struct loop *loop)
1255 {
1256 
1257   if (loop->inner || !loop_outer (loop))
1258   {
1259     if (dump_file)
1260       fprintf (dump_file, "SMS loop inner or !loop_outer\n");
1261     return false;
1262   }
1263 
1264   if (!single_exit (loop))
1265     {
1266       if (dump_file)
1267 	{
1268 	  rtx_insn *insn = BB_END (loop->header);
1269 
1270 	  fprintf (dump_file, "SMS loop many exits");
1271 	  dump_insn_location (insn);
1272 	  fprintf (dump_file, "\n");
1273 	}
1274       return false;
1275     }
1276 
1277   if (! SIMPLE_SMS_LOOP_P (loop) && ! loop_single_full_bb_p (loop))
1278     {
1279       if (dump_file)
1280 	{
1281 	  rtx_insn *insn = BB_END (loop->header);
1282 
1283 	  fprintf (dump_file, "SMS loop many BBs.");
1284 	  dump_insn_location (insn);
1285 	  fprintf (dump_file, "\n");
1286 	}
1287       return false;
1288     }
1289 
1290     return true;
1291 }
1292 
1293 /* If there are more than one entry for the loop,
1294    make it one by splitting the first entry edge and
1295    redirecting the others to the new BB.  */
1296 static void
canon_loop(struct loop * loop)1297 canon_loop (struct loop *loop)
1298 {
1299   edge e;
1300   edge_iterator i;
1301 
1302   /* Avoid annoying special cases of edges going to exit
1303      block.  */
1304   FOR_EACH_EDGE (e, i, EXIT_BLOCK_PTR_FOR_FN (cfun)->preds)
1305     if ((e->flags & EDGE_FALLTHRU) && (EDGE_COUNT (e->src->succs) > 1))
1306       split_edge (e);
1307 
1308   if (loop->latch == loop->header
1309       || EDGE_COUNT (loop->latch->succs) > 1)
1310     {
1311       FOR_EACH_EDGE (e, i, loop->header->preds)
1312         if (e->src == loop->latch)
1313           break;
1314       split_edge (e);
1315     }
1316 }
1317 
1318 /* Setup infos.  */
1319 static void
setup_sched_infos(void)1320 setup_sched_infos (void)
1321 {
1322   memcpy (&sms_common_sched_info, &haifa_common_sched_info,
1323 	  sizeof (sms_common_sched_info));
1324   sms_common_sched_info.sched_pass_id = SCHED_SMS_PASS;
1325   common_sched_info = &sms_common_sched_info;
1326 
1327   sched_deps_info = &sms_sched_deps_info;
1328   current_sched_info = &sms_sched_info;
1329 }
1330 
1331 /* Probability in % that the sms-ed loop rolls enough so that optimized
1332    version may be entered.  Just a guess.  */
1333 #define PROB_SMS_ENOUGH_ITERATIONS 80
1334 
1335 /* Used to calculate the upper bound of ii.  */
1336 #define MAXII_FACTOR 2
1337 
1338 /* Main entry point, perform SMS scheduling on the loops of the function
1339    that consist of single basic blocks.  */
1340 static void
sms_schedule(void)1341 sms_schedule (void)
1342 {
1343   rtx_insn *insn;
1344   ddg_ptr *g_arr, g;
1345   int * node_order;
1346   int maxii, max_asap;
1347   partial_schedule_ptr ps;
1348   basic_block bb = NULL;
1349   struct loop *loop;
1350   basic_block condition_bb = NULL;
1351   edge latch_edge;
1352   HOST_WIDE_INT trip_count, max_trip_count;
1353 
1354   loop_optimizer_init (LOOPS_HAVE_PREHEADERS
1355 		       | LOOPS_HAVE_RECORDED_EXITS);
1356   if (number_of_loops (cfun) <= 1)
1357     {
1358       loop_optimizer_finalize ();
1359       return;  /* There are no loops to schedule.  */
1360     }
1361 
1362   /* Initialize issue_rate.  */
1363   if (targetm.sched.issue_rate)
1364     {
1365       int temp = reload_completed;
1366 
1367       reload_completed = 1;
1368       issue_rate = targetm.sched.issue_rate ();
1369       reload_completed = temp;
1370     }
1371   else
1372     issue_rate = 1;
1373 
1374   /* Initialize the scheduler.  */
1375   setup_sched_infos ();
1376   haifa_sched_init ();
1377 
1378   /* Allocate memory to hold the DDG array one entry for each loop.
1379      We use loop->num as index into this array.  */
1380   g_arr = XCNEWVEC (ddg_ptr, number_of_loops (cfun));
1381 
1382   if (dump_file)
1383   {
1384     fprintf (dump_file, "\n\nSMS analysis phase\n");
1385     fprintf (dump_file, "===================\n\n");
1386   }
1387 
1388   /* Build DDGs for all the relevant loops and hold them in G_ARR
1389      indexed by the loop index.  */
1390   FOR_EACH_LOOP (loop, 0)
1391     {
1392       rtx_insn *head, *tail;
1393       rtx count_reg;
1394 
1395       /* For debugging.  */
1396       if (dbg_cnt (sms_sched_loop) == false)
1397         {
1398           if (dump_file)
1399             fprintf (dump_file, "SMS reached max limit... \n");
1400 
1401 	  break;
1402         }
1403 
1404       if (dump_file)
1405 	{
1406 	  rtx_insn *insn = BB_END (loop->header);
1407 
1408 	  fprintf (dump_file, "SMS loop num: %d", loop->num);
1409 	  dump_insn_location (insn);
1410 	  fprintf (dump_file, "\n");
1411 	}
1412 
1413       if (! loop_canon_p (loop))
1414         continue;
1415 
1416       if (! loop_single_full_bb_p (loop))
1417       {
1418         if (dump_file)
1419           fprintf (dump_file, "SMS not loop_single_full_bb_p\n");
1420 	continue;
1421       }
1422 
1423       bb = loop->header;
1424 
1425       get_ebb_head_tail (bb, bb, &head, &tail);
1426       latch_edge = loop_latch_edge (loop);
1427       gcc_assert (single_exit (loop));
1428       trip_count = get_estimated_loop_iterations_int (loop);
1429       max_trip_count = get_max_loop_iterations_int (loop);
1430 
1431       /* Perform SMS only on loops that their average count is above threshold.  */
1432 
1433       if ( latch_edge->count () > profile_count::zero ()
1434           && (latch_edge->count()
1435 	      < single_exit (loop)->count ().apply_scale
1436 				 (SMS_LOOP_AVERAGE_COUNT_THRESHOLD, 1)))
1437 	{
1438 	  if (dump_file)
1439 	    {
1440 	      dump_insn_location (tail);
1441 	      fprintf (dump_file, "\nSMS single-bb-loop\n");
1442 	      if (profile_info && flag_branch_probabilities)
1443 	    	{
1444 	      	  fprintf (dump_file, "SMS loop-count ");
1445 	      	  fprintf (dump_file, "%" PRId64,
1446 	             	   (int64_t) bb->count.to_gcov_type ());
1447 	      	  fprintf (dump_file, "\n");
1448                   fprintf (dump_file, "SMS trip-count ");
1449                   fprintf (dump_file, "%" PRId64 "max %" PRId64,
1450                            (int64_t) trip_count, (int64_t) max_trip_count);
1451                   fprintf (dump_file, "\n");
1452 	    	}
1453 	    }
1454           continue;
1455         }
1456 
1457       /* Make sure this is a doloop.  */
1458       if ( !(count_reg = doloop_register_get (head, tail)))
1459       {
1460         if (dump_file)
1461           fprintf (dump_file, "SMS doloop_register_get failed\n");
1462 	continue;
1463       }
1464 
1465       /* Don't handle BBs with calls or barriers
1466 	 or !single_set with the exception of instructions that include
1467 	 count_reg---these instructions are part of the control part
1468 	 that do-loop recognizes.
1469          ??? Should handle insns defining subregs.  */
1470      for (insn = head; insn != NEXT_INSN (tail); insn = NEXT_INSN (insn))
1471       {
1472          rtx set;
1473 
1474         if (CALL_P (insn)
1475             || BARRIER_P (insn)
1476             || (NONDEBUG_INSN_P (insn) && !JUMP_P (insn)
1477                 && !single_set (insn) && GET_CODE (PATTERN (insn)) != USE
1478                 && !reg_mentioned_p (count_reg, insn))
1479             || (INSN_P (insn) && (set = single_set (insn))
1480                 && GET_CODE (SET_DEST (set)) == SUBREG))
1481         break;
1482       }
1483 
1484       if (insn != NEXT_INSN (tail))
1485 	{
1486 	  if (dump_file)
1487 	    {
1488 	      if (CALL_P (insn))
1489 		fprintf (dump_file, "SMS loop-with-call\n");
1490 	      else if (BARRIER_P (insn))
1491 		fprintf (dump_file, "SMS loop-with-barrier\n");
1492               else if ((NONDEBUG_INSN_P (insn) && !JUMP_P (insn)
1493                 && !single_set (insn) && GET_CODE (PATTERN (insn)) != USE))
1494                 fprintf (dump_file, "SMS loop-with-not-single-set\n");
1495               else
1496                fprintf (dump_file, "SMS loop with subreg in lhs\n");
1497 	      print_rtl_single (dump_file, insn);
1498 	    }
1499 
1500 	  continue;
1501 	}
1502 
1503       /* Always schedule the closing branch with the rest of the
1504          instructions. The branch is rotated to be in row ii-1 at the
1505          end of the scheduling procedure to make sure it's the last
1506          instruction in the iteration.  */
1507       if (! (g = create_ddg (bb, 1)))
1508         {
1509           if (dump_file)
1510 	    fprintf (dump_file, "SMS create_ddg failed\n");
1511 	  continue;
1512         }
1513 
1514       g_arr[loop->num] = g;
1515       if (dump_file)
1516         fprintf (dump_file, "...OK\n");
1517 
1518     }
1519   if (dump_file)
1520   {
1521     fprintf (dump_file, "\nSMS transformation phase\n");
1522     fprintf (dump_file, "=========================\n\n");
1523   }
1524 
1525   /* We don't want to perform SMS on new loops - created by versioning.  */
1526   FOR_EACH_LOOP (loop, 0)
1527     {
1528       rtx_insn *head, *tail;
1529       rtx count_reg;
1530       rtx_insn *count_init;
1531       int mii, rec_mii, stage_count, min_cycle;
1532       int64_t loop_count = 0;
1533       bool opt_sc_p;
1534 
1535       if (! (g = g_arr[loop->num]))
1536         continue;
1537 
1538       if (dump_file)
1539 	{
1540 	  rtx_insn *insn = BB_END (loop->header);
1541 
1542 	  fprintf (dump_file, "SMS loop num: %d", loop->num);
1543 	  dump_insn_location (insn);
1544 	  fprintf (dump_file, "\n");
1545 
1546 	  print_ddg (dump_file, g);
1547 	}
1548 
1549       get_ebb_head_tail (loop->header, loop->header, &head, &tail);
1550 
1551       latch_edge = loop_latch_edge (loop);
1552       gcc_assert (single_exit (loop));
1553       trip_count = get_estimated_loop_iterations_int (loop);
1554       max_trip_count = get_max_loop_iterations_int (loop);
1555 
1556       if (dump_file)
1557 	{
1558 	  dump_insn_location (tail);
1559 	  fprintf (dump_file, "\nSMS single-bb-loop\n");
1560 	  if (profile_info && flag_branch_probabilities)
1561 	    {
1562 	      fprintf (dump_file, "SMS loop-count ");
1563 	      fprintf (dump_file, "%" PRId64,
1564 	               (int64_t) bb->count.to_gcov_type ());
1565 	      fprintf (dump_file, "\n");
1566 	    }
1567 	  fprintf (dump_file, "SMS doloop\n");
1568 	  fprintf (dump_file, "SMS built-ddg %d\n", g->num_nodes);
1569           fprintf (dump_file, "SMS num-loads %d\n", g->num_loads);
1570           fprintf (dump_file, "SMS num-stores %d\n", g->num_stores);
1571 	}
1572 
1573 
1574       /* In case of th loop have doloop register it gets special
1575 	 handling.  */
1576       count_init = NULL;
1577       if ((count_reg = doloop_register_get (head, tail)))
1578 	{
1579 	  basic_block pre_header;
1580 
1581 	  pre_header = loop_preheader_edge (loop)->src;
1582 	  count_init = const_iteration_count (count_reg, pre_header,
1583 					      &loop_count);
1584 	}
1585       gcc_assert (count_reg);
1586 
1587       if (dump_file && count_init)
1588         {
1589           fprintf (dump_file, "SMS const-doloop ");
1590           fprintf (dump_file, "%" PRId64,
1591 		     loop_count);
1592           fprintf (dump_file, "\n");
1593         }
1594 
1595       node_order = XNEWVEC (int, g->num_nodes);
1596 
1597       mii = 1; /* Need to pass some estimate of mii.  */
1598       rec_mii = sms_order_nodes (g, mii, node_order, &max_asap);
1599       mii = MAX (res_MII (g), rec_mii);
1600       mii = MAX (mii, 1);
1601       maxii = MAX (max_asap, MAXII_FACTOR * mii);
1602 
1603       if (dump_file)
1604 	fprintf (dump_file, "SMS iis %d %d %d (rec_mii, mii, maxii)\n",
1605 		 rec_mii, mii, maxii);
1606 
1607       for (;;)
1608 	{
1609 	  set_node_sched_params (g);
1610 
1611 	  stage_count = 0;
1612 	  opt_sc_p = false;
1613 	  ps = sms_schedule_by_order (g, mii, maxii, node_order);
1614 
1615 	  if (ps)
1616 	    {
1617 	      /* Try to achieve optimized SC by normalizing the partial
1618 		 schedule (having the cycles start from cycle zero).
1619 		 The branch location must be placed in row ii-1 in the
1620 		 final scheduling.	If failed, shift all instructions to
1621 		 position the branch in row ii-1.  */
1622 	      opt_sc_p = optimize_sc (ps, g);
1623 	      if (opt_sc_p)
1624 		stage_count = calculate_stage_count (ps, 0);
1625 	      else
1626 		{
1627 		  /* Bring the branch to cycle ii-1.  */
1628 		  int amount = (SCHED_TIME (g->closing_branch->cuid)
1629 				- (ps->ii - 1));
1630 
1631 		  if (dump_file)
1632 		    fprintf (dump_file, "SMS schedule branch at cycle ii-1\n");
1633 
1634 		  stage_count = calculate_stage_count (ps, amount);
1635 		}
1636 
1637 	      gcc_assert (stage_count >= 1);
1638 	    }
1639 
1640 	  /* The default value of PARAM_SMS_MIN_SC is 2 as stage count of
1641 	     1 means that there is no interleaving between iterations thus
1642 	     we let the scheduling passes do the job in this case.  */
1643 	  if (stage_count < PARAM_VALUE (PARAM_SMS_MIN_SC)
1644 	      || (count_init && (loop_count <= stage_count))
1645 	      || (max_trip_count >= 0 && max_trip_count <= stage_count)
1646 	      || (trip_count >= 0 && trip_count <= stage_count))
1647 	    {
1648 	      if (dump_file)
1649 		{
1650 		  fprintf (dump_file, "SMS failed... \n");
1651 		  fprintf (dump_file, "SMS sched-failed (stage-count=%d,"
1652 			   " loop-count=", stage_count);
1653 		  fprintf (dump_file, "%" PRId64, loop_count);
1654 		  fprintf (dump_file, ", trip-count=");
1655 		  fprintf (dump_file, "%" PRId64 "max %" PRId64,
1656 			   (int64_t) trip_count, (int64_t) max_trip_count);
1657 		  fprintf (dump_file, ")\n");
1658 		}
1659 	      break;
1660 	    }
1661 
1662           if (!opt_sc_p)
1663             {
1664 	      /* Rotate the partial schedule to have the branch in row ii-1.  */
1665               int amount = SCHED_TIME (g->closing_branch->cuid) - (ps->ii - 1);
1666 
1667               reset_sched_times (ps, amount);
1668               rotate_partial_schedule (ps, amount);
1669             }
1670 
1671 	  set_columns_for_ps (ps);
1672 
1673 	  min_cycle = PS_MIN_CYCLE (ps) - SMODULO (PS_MIN_CYCLE (ps), ps->ii);
1674 	  if (!schedule_reg_moves (ps))
1675 	    {
1676 	      mii = ps->ii + 1;
1677 	      free_partial_schedule (ps);
1678 	      continue;
1679 	    }
1680 
1681 	  /* Moves that handle incoming values might have been added
1682 	     to a new first stage.  Bump the stage count if so.
1683 
1684 	     ??? Perhaps we could consider rotating the schedule here
1685 	     instead?  */
1686 	  if (PS_MIN_CYCLE (ps) < min_cycle)
1687 	    {
1688 	      reset_sched_times (ps, 0);
1689 	      stage_count++;
1690 	    }
1691 
1692 	  /* The stage count should now be correct without rotation.  */
1693 	  gcc_checking_assert (stage_count == calculate_stage_count (ps, 0));
1694 	  PS_STAGE_COUNT (ps) = stage_count;
1695 
1696 	  canon_loop (loop);
1697 
1698           if (dump_file)
1699             {
1700 	      dump_insn_location (tail);
1701 	      fprintf (dump_file, " SMS succeeded %d %d (with ii, sc)\n",
1702 		       ps->ii, stage_count);
1703 	      print_partial_schedule (ps, dump_file);
1704 	    }
1705 
1706           /* case the BCT count is not known , Do loop-versioning */
1707 	  if (count_reg && ! count_init)
1708             {
1709 	      rtx comp_rtx = gen_rtx_GT (VOIDmode, count_reg,
1710 					 gen_int_mode (stage_count,
1711 						       GET_MODE (count_reg)));
1712 	      profile_probability prob = profile_probability::guessed_always ()
1713 				.apply_scale (PROB_SMS_ENOUGH_ITERATIONS, 100);
1714 
1715 	      loop_version (loop, comp_rtx, &condition_bb,
1716 	  		    prob, prob.invert (),
1717 			    prob, prob.invert (), true);
1718 	     }
1719 
1720 	  /* Set new iteration count of loop kernel.  */
1721           if (count_reg && count_init)
1722 	    SET_SRC (single_set (count_init)) = GEN_INT (loop_count
1723 						     - stage_count + 1);
1724 
1725 	  /* Now apply the scheduled kernel to the RTL of the loop.  */
1726 	  permute_partial_schedule (ps, g->closing_branch->first_note);
1727 
1728           /* Mark this loop as software pipelined so the later
1729 	     scheduling passes don't touch it.  */
1730 	  if (! flag_resched_modulo_sched)
1731 	    mark_loop_unsched (loop);
1732 
1733 	  /* The life-info is not valid any more.  */
1734 	  df_set_bb_dirty (g->bb);
1735 
1736 	  apply_reg_moves (ps);
1737 	  if (dump_file)
1738 	    print_node_sched_params (dump_file, g->num_nodes, ps);
1739 	  /* Generate prolog and epilog.  */
1740           generate_prolog_epilog (ps, loop, count_reg, count_init);
1741 	  break;
1742 	}
1743 
1744       free_partial_schedule (ps);
1745       node_sched_param_vec.release ();
1746       free (node_order);
1747       free_ddg (g);
1748     }
1749 
1750   free (g_arr);
1751 
1752   /* Release scheduler data, needed until now because of DFA.  */
1753   haifa_sched_finish ();
1754   loop_optimizer_finalize ();
1755 }
1756 
1757 /* The SMS scheduling algorithm itself
1758    -----------------------------------
1759    Input: 'O' an ordered list of insns of a loop.
1760    Output: A scheduling of the loop - kernel, prolog, and epilogue.
1761 
1762    'Q' is the empty Set
1763    'PS' is the partial schedule; it holds the currently scheduled nodes with
1764 	their cycle/slot.
1765    'PSP' previously scheduled predecessors.
1766    'PSS' previously scheduled successors.
1767    't(u)' the cycle where u is scheduled.
1768    'l(u)' is the latency of u.
1769    'd(v,u)' is the dependence distance from v to u.
1770    'ASAP(u)' the earliest time at which u could be scheduled as computed in
1771 	     the node ordering phase.
1772    'check_hardware_resources_conflicts(u, PS, c)'
1773 			     run a trace around cycle/slot through DFA model
1774 			     to check resource conflicts involving instruction u
1775 			     at cycle c given the partial schedule PS.
1776    'add_to_partial_schedule_at_time(u, PS, c)'
1777 			     Add the node/instruction u to the partial schedule
1778 			     PS at time c.
1779    'calculate_register_pressure(PS)'
1780 			     Given a schedule of instructions, calculate the register
1781 			     pressure it implies.  One implementation could be the
1782 			     maximum number of overlapping live ranges.
1783    'maxRP' The maximum allowed register pressure, it is usually derived from the number
1784 	   registers available in the hardware.
1785 
1786    1. II = MII.
1787    2. PS = empty list
1788    3. for each node u in O in pre-computed order
1789    4.   if (PSP(u) != Q && PSS(u) == Q) then
1790    5.     Early_start(u) = max ( t(v) + l(v) - d(v,u)*II ) over all every v in PSP(u).
1791    6.     start = Early_start; end = Early_start + II - 1; step = 1
1792    11.  else if (PSP(u) == Q && PSS(u) != Q) then
1793    12.      Late_start(u) = min ( t(v) - l(v) + d(v,u)*II ) over all every v in PSS(u).
1794    13.     start = Late_start; end = Late_start - II + 1; step = -1
1795    14.  else if (PSP(u) != Q && PSS(u) != Q) then
1796    15.     Early_start(u) = max ( t(v) + l(v) - d(v,u)*II ) over all every v in PSP(u).
1797    16.     Late_start(u) = min ( t(v) - l(v) + d(v,u)*II ) over all every v in PSS(u).
1798    17.     start = Early_start;
1799    18.     end = min(Early_start + II - 1 , Late_start);
1800    19.     step = 1
1801    20.     else "if (PSP(u) == Q && PSS(u) == Q)"
1802    21.	  start = ASAP(u); end = start + II - 1; step = 1
1803    22.  endif
1804 
1805    23.  success = false
1806    24.  for (c = start ; c != end ; c += step)
1807    25.     if check_hardware_resources_conflicts(u, PS, c) then
1808    26.       add_to_partial_schedule_at_time(u, PS, c)
1809    27.       success = true
1810    28.       break
1811    29.     endif
1812    30.  endfor
1813    31.  if (success == false) then
1814    32.    II = II + 1
1815    33.    if (II > maxII) then
1816    34.       finish - failed to schedule
1817    35.	 endif
1818    36.    goto 2.
1819    37.  endif
1820    38. endfor
1821    39. if (calculate_register_pressure(PS) > maxRP) then
1822    40.    goto 32.
1823    41. endif
1824    42. compute epilogue & prologue
1825    43. finish - succeeded to schedule
1826 
1827    ??? The algorithm restricts the scheduling window to II cycles.
1828    In rare cases, it may be better to allow windows of II+1 cycles.
1829    The window would then start and end on the same row, but with
1830    different "must precede" and "must follow" requirements.  */
1831 
1832 /* A limit on the number of cycles that resource conflicts can span.  ??? Should
1833    be provided by DFA, and be dependent on the type of insn scheduled.  Currently
1834    set to 0 to save compile time.  */
1835 #define DFA_HISTORY SMS_DFA_HISTORY
1836 
1837 /* A threshold for the number of repeated unsuccessful attempts to insert
1838    an empty row, before we flush the partial schedule and start over.  */
1839 #define MAX_SPLIT_NUM 10
1840 /* Given the partial schedule PS, this function calculates and returns the
1841    cycles in which we can schedule the node with the given index I.
1842    NOTE: Here we do the backtracking in SMS, in some special cases. We have
1843    noticed that there are several cases in which we fail    to SMS the loop
1844    because the sched window of a node is empty    due to tight data-deps. In
1845    such cases we want to unschedule    some of the predecessors/successors
1846    until we get non-empty    scheduling window.  It returns -1 if the
1847    scheduling window is empty and zero otherwise.  */
1848 
1849 static int
get_sched_window(partial_schedule_ptr ps,ddg_node_ptr u_node,sbitmap sched_nodes,int ii,int * start_p,int * step_p,int * end_p)1850 get_sched_window (partial_schedule_ptr ps, ddg_node_ptr u_node,
1851 		  sbitmap sched_nodes, int ii, int *start_p, int *step_p,
1852 		  int *end_p)
1853 {
1854   int start, step, end;
1855   int early_start, late_start;
1856   ddg_edge_ptr e;
1857   auto_sbitmap psp (ps->g->num_nodes);
1858   auto_sbitmap pss (ps->g->num_nodes);
1859   sbitmap u_node_preds = NODE_PREDECESSORS (u_node);
1860   sbitmap u_node_succs = NODE_SUCCESSORS (u_node);
1861   int psp_not_empty;
1862   int pss_not_empty;
1863   int count_preds;
1864   int count_succs;
1865 
1866   /* 1. compute sched window for u (start, end, step).  */
1867   bitmap_clear (psp);
1868   bitmap_clear (pss);
1869   psp_not_empty = bitmap_and (psp, u_node_preds, sched_nodes);
1870   pss_not_empty = bitmap_and (pss, u_node_succs, sched_nodes);
1871 
1872   /* We first compute a forward range (start <= end), then decide whether
1873      to reverse it.  */
1874   early_start = INT_MIN;
1875   late_start = INT_MAX;
1876   start = INT_MIN;
1877   end = INT_MAX;
1878   step = 1;
1879 
1880   count_preds = 0;
1881   count_succs = 0;
1882 
1883   if (dump_file && (psp_not_empty || pss_not_empty))
1884     {
1885       fprintf (dump_file, "\nAnalyzing dependencies for node %d (INSN %d)"
1886 	       "; ii = %d\n\n", u_node->cuid, INSN_UID (u_node->insn), ii);
1887       fprintf (dump_file, "%11s %11s %11s %11s %5s\n",
1888 	       "start", "early start", "late start", "end", "time");
1889       fprintf (dump_file, "=========== =========== =========== ==========="
1890 	       " =====\n");
1891     }
1892   /* Calculate early_start and limit end.  Both bounds are inclusive.  */
1893   if (psp_not_empty)
1894     for (e = u_node->in; e != 0; e = e->next_in)
1895       {
1896 	int v = e->src->cuid;
1897 
1898 	if (bitmap_bit_p (sched_nodes, v))
1899 	  {
1900 	    int p_st = SCHED_TIME (v);
1901 	    int earliest = p_st + e->latency - (e->distance * ii);
1902 	    int latest = (e->data_type == MEM_DEP ? p_st + ii - 1 : INT_MAX);
1903 
1904 	    if (dump_file)
1905 	      {
1906 		fprintf (dump_file, "%11s %11d %11s %11d %5d",
1907 			 "", earliest, "", latest, p_st);
1908 		print_ddg_edge (dump_file, e);
1909 		fprintf (dump_file, "\n");
1910 	      }
1911 
1912 	    early_start = MAX (early_start, earliest);
1913 	    end = MIN (end, latest);
1914 
1915 	    if (e->type == TRUE_DEP && e->data_type == REG_DEP)
1916 	      count_preds++;
1917 	  }
1918       }
1919 
1920   /* Calculate late_start and limit start.  Both bounds are inclusive.  */
1921   if (pss_not_empty)
1922     for (e = u_node->out; e != 0; e = e->next_out)
1923       {
1924 	int v = e->dest->cuid;
1925 
1926 	if (bitmap_bit_p (sched_nodes, v))
1927 	  {
1928 	    int s_st = SCHED_TIME (v);
1929 	    int earliest = (e->data_type == MEM_DEP ? s_st - ii + 1 : INT_MIN);
1930 	    int latest = s_st - e->latency + (e->distance * ii);
1931 
1932 	    if (dump_file)
1933 	      {
1934 		fprintf (dump_file, "%11d %11s %11d %11s %5d",
1935 			 earliest, "", latest, "", s_st);
1936 		print_ddg_edge (dump_file, e);
1937 		fprintf (dump_file, "\n");
1938 	      }
1939 
1940 	    start = MAX (start, earliest);
1941 	    late_start = MIN (late_start, latest);
1942 
1943 	    if (e->type == TRUE_DEP && e->data_type == REG_DEP)
1944 	      count_succs++;
1945 	  }
1946       }
1947 
1948   if (dump_file && (psp_not_empty || pss_not_empty))
1949     {
1950       fprintf (dump_file, "----------- ----------- ----------- -----------"
1951 	       " -----\n");
1952       fprintf (dump_file, "%11d %11d %11d %11d %5s %s\n",
1953 	       start, early_start, late_start, end, "",
1954 	       "(max, max, min, min)");
1955     }
1956 
1957   /* Get a target scheduling window no bigger than ii.  */
1958   if (early_start == INT_MIN && late_start == INT_MAX)
1959     early_start = NODE_ASAP (u_node);
1960   else if (early_start == INT_MIN)
1961     early_start = late_start - (ii - 1);
1962   late_start = MIN (late_start, early_start + (ii - 1));
1963 
1964   /* Apply memory dependence limits.  */
1965   start = MAX (start, early_start);
1966   end = MIN (end, late_start);
1967 
1968   if (dump_file && (psp_not_empty || pss_not_empty))
1969     fprintf (dump_file, "%11s %11d %11d %11s %5s final window\n",
1970 	     "", start, end, "", "");
1971 
1972   /* If there are at least as many successors as predecessors, schedule the
1973      node close to its successors.  */
1974   if (pss_not_empty && count_succs >= count_preds)
1975     {
1976       std::swap (start, end);
1977       step = -1;
1978     }
1979 
1980   /* Now that we've finalized the window, make END an exclusive rather
1981      than an inclusive bound.  */
1982   end += step;
1983 
1984   *start_p = start;
1985   *step_p = step;
1986   *end_p = end;
1987 
1988   if ((start >= end && step == 1) || (start <= end && step == -1))
1989     {
1990       if (dump_file)
1991 	fprintf (dump_file, "\nEmpty window: start=%d, end=%d, step=%d\n",
1992 		 start, end, step);
1993       return -1;
1994     }
1995 
1996   return 0;
1997 }
1998 
1999 /* Calculate MUST_PRECEDE/MUST_FOLLOW bitmaps of U_NODE; which is the
2000    node currently been scheduled.  At the end of the calculation
2001    MUST_PRECEDE/MUST_FOLLOW contains all predecessors/successors of
2002    U_NODE which are (1) already scheduled in the first/last row of
2003    U_NODE's scheduling window, (2) whose dependence inequality with U
2004    becomes an equality when U is scheduled in this same row, and (3)
2005    whose dependence latency is zero.
2006 
2007    The first and last rows are calculated using the following parameters:
2008    START/END rows - The cycles that begins/ends the traversal on the window;
2009    searching for an empty cycle to schedule U_NODE.
2010    STEP - The direction in which we traverse the window.
2011    II - The initiation interval.  */
2012 
2013 static void
calculate_must_precede_follow(ddg_node_ptr u_node,int start,int end,int step,int ii,sbitmap sched_nodes,sbitmap must_precede,sbitmap must_follow)2014 calculate_must_precede_follow (ddg_node_ptr u_node, int start, int end,
2015 			       int step, int ii, sbitmap sched_nodes,
2016 			       sbitmap must_precede, sbitmap must_follow)
2017 {
2018   ddg_edge_ptr e;
2019   int first_cycle_in_window, last_cycle_in_window;
2020 
2021   gcc_assert (must_precede && must_follow);
2022 
2023   /* Consider the following scheduling window:
2024      {first_cycle_in_window, first_cycle_in_window+1, ...,
2025      last_cycle_in_window}.  If step is 1 then the following will be
2026      the order we traverse the window: {start=first_cycle_in_window,
2027      first_cycle_in_window+1, ..., end=last_cycle_in_window+1},
2028      or {start=last_cycle_in_window, last_cycle_in_window-1, ...,
2029      end=first_cycle_in_window-1} if step is -1.  */
2030   first_cycle_in_window = (step == 1) ? start : end - step;
2031   last_cycle_in_window = (step == 1) ? end - step : start;
2032 
2033   bitmap_clear (must_precede);
2034   bitmap_clear (must_follow);
2035 
2036   if (dump_file)
2037     fprintf (dump_file, "\nmust_precede: ");
2038 
2039   /* Instead of checking if:
2040       (SMODULO (SCHED_TIME (e->src), ii) == first_row_in_window)
2041       && ((SCHED_TIME (e->src) + e->latency - (e->distance * ii)) ==
2042              first_cycle_in_window)
2043       && e->latency == 0
2044      we use the fact that latency is non-negative:
2045       SCHED_TIME (e->src) - (e->distance * ii) <=
2046       SCHED_TIME (e->src) + e->latency - (e->distance * ii)) <=
2047       first_cycle_in_window
2048      and check only if
2049       SCHED_TIME (e->src) - (e->distance * ii) == first_cycle_in_window  */
2050   for (e = u_node->in; e != 0; e = e->next_in)
2051     if (bitmap_bit_p (sched_nodes, e->src->cuid)
2052 	&& ((SCHED_TIME (e->src->cuid) - (e->distance * ii)) ==
2053              first_cycle_in_window))
2054       {
2055 	if (dump_file)
2056 	  fprintf (dump_file, "%d ", e->src->cuid);
2057 
2058 	bitmap_set_bit (must_precede, e->src->cuid);
2059       }
2060 
2061   if (dump_file)
2062     fprintf (dump_file, "\nmust_follow: ");
2063 
2064   /* Instead of checking if:
2065       (SMODULO (SCHED_TIME (e->dest), ii) == last_row_in_window)
2066       && ((SCHED_TIME (e->dest) - e->latency + (e->distance * ii)) ==
2067              last_cycle_in_window)
2068       && e->latency == 0
2069      we use the fact that latency is non-negative:
2070       SCHED_TIME (e->dest) + (e->distance * ii) >=
2071       SCHED_TIME (e->dest) - e->latency + (e->distance * ii)) >=
2072       last_cycle_in_window
2073      and check only if
2074       SCHED_TIME (e->dest) + (e->distance * ii) == last_cycle_in_window  */
2075   for (e = u_node->out; e != 0; e = e->next_out)
2076     if (bitmap_bit_p (sched_nodes, e->dest->cuid)
2077 	&& ((SCHED_TIME (e->dest->cuid) + (e->distance * ii)) ==
2078              last_cycle_in_window))
2079       {
2080 	if (dump_file)
2081 	  fprintf (dump_file, "%d ", e->dest->cuid);
2082 
2083 	bitmap_set_bit (must_follow, e->dest->cuid);
2084       }
2085 
2086   if (dump_file)
2087     fprintf (dump_file, "\n");
2088 }
2089 
2090 /* Return 1 if U_NODE can be scheduled in CYCLE.  Use the following
2091    parameters to decide if that's possible:
2092    PS - The partial schedule.
2093    U - The serial number of U_NODE.
2094    NUM_SPLITS - The number of row splits made so far.
2095    MUST_PRECEDE - The nodes that must precede U_NODE. (only valid at
2096    the first row of the scheduling window)
2097    MUST_FOLLOW - The nodes that must follow U_NODE. (only valid at the
2098    last row of the scheduling window)  */
2099 
2100 static bool
try_scheduling_node_in_cycle(partial_schedule_ptr ps,int u,int cycle,sbitmap sched_nodes,int * num_splits,sbitmap must_precede,sbitmap must_follow)2101 try_scheduling_node_in_cycle (partial_schedule_ptr ps,
2102 			      int u, int cycle, sbitmap sched_nodes,
2103 			      int *num_splits, sbitmap must_precede,
2104 			      sbitmap must_follow)
2105 {
2106   ps_insn_ptr psi;
2107   bool success = 0;
2108 
2109   verify_partial_schedule (ps, sched_nodes);
2110   psi = ps_add_node_check_conflicts (ps, u, cycle, must_precede, must_follow);
2111   if (psi)
2112     {
2113       SCHED_TIME (u) = cycle;
2114       bitmap_set_bit (sched_nodes, u);
2115       success = 1;
2116       *num_splits = 0;
2117       if (dump_file)
2118 	fprintf (dump_file, "Scheduled w/o split in %d\n", cycle);
2119 
2120     }
2121 
2122   return success;
2123 }
2124 
2125 /* This function implements the scheduling algorithm for SMS according to the
2126    above algorithm.  */
2127 static partial_schedule_ptr
sms_schedule_by_order(ddg_ptr g,int mii,int maxii,int * nodes_order)2128 sms_schedule_by_order (ddg_ptr g, int mii, int maxii, int *nodes_order)
2129 {
2130   int ii = mii;
2131   int i, c, success, num_splits = 0;
2132   int flush_and_start_over = true;
2133   int num_nodes = g->num_nodes;
2134   int start, end, step; /* Place together into one struct?  */
2135   auto_sbitmap sched_nodes (num_nodes);
2136   auto_sbitmap must_precede (num_nodes);
2137   auto_sbitmap must_follow (num_nodes);
2138   auto_sbitmap tobe_scheduled (num_nodes);
2139 
2140   partial_schedule_ptr ps = create_partial_schedule (ii, g, DFA_HISTORY);
2141 
2142   bitmap_ones (tobe_scheduled);
2143   bitmap_clear (sched_nodes);
2144 
2145   while (flush_and_start_over && (ii < maxii))
2146     {
2147 
2148       if (dump_file)
2149 	fprintf (dump_file, "Starting with ii=%d\n", ii);
2150       flush_and_start_over = false;
2151       bitmap_clear (sched_nodes);
2152 
2153       for (i = 0; i < num_nodes; i++)
2154 	{
2155 	  int u = nodes_order[i];
2156   	  ddg_node_ptr u_node = &ps->g->nodes[u];
2157 	  rtx_insn *insn = u_node->insn;
2158 
2159 	  if (!NONDEBUG_INSN_P (insn))
2160 	    {
2161 	      bitmap_clear_bit (tobe_scheduled, u);
2162 	      continue;
2163 	    }
2164 
2165 	  if (bitmap_bit_p (sched_nodes, u))
2166 	    continue;
2167 
2168 	  /* Try to get non-empty scheduling window.  */
2169 	 success = 0;
2170          if (get_sched_window (ps, u_node, sched_nodes, ii, &start,
2171                                 &step, &end) == 0)
2172             {
2173               if (dump_file)
2174                 fprintf (dump_file, "\nTrying to schedule node %d "
2175 			 "INSN = %d  in (%d .. %d) step %d\n", u, (INSN_UID
2176                         (g->nodes[u].insn)), start, end, step);
2177 
2178               gcc_assert ((step > 0 && start < end)
2179                           || (step < 0 && start > end));
2180 
2181               calculate_must_precede_follow (u_node, start, end, step, ii,
2182                                              sched_nodes, must_precede,
2183                                              must_follow);
2184 
2185               for (c = start; c != end; c += step)
2186                 {
2187 		  sbitmap tmp_precede, tmp_follow;
2188 
2189                   set_must_precede_follow (&tmp_follow, must_follow,
2190 		                           &tmp_precede, must_precede,
2191                                            c, start, end, step);
2192                   success =
2193                     try_scheduling_node_in_cycle (ps, u, c,
2194                                                   sched_nodes,
2195                                                   &num_splits, tmp_precede,
2196                                                   tmp_follow);
2197                   if (success)
2198                     break;
2199                 }
2200 
2201               verify_partial_schedule (ps, sched_nodes);
2202             }
2203             if (!success)
2204             {
2205               int split_row;
2206 
2207               if (ii++ == maxii)
2208                 break;
2209 
2210               if (num_splits >= MAX_SPLIT_NUM)
2211                 {
2212                   num_splits = 0;
2213                   flush_and_start_over = true;
2214                   verify_partial_schedule (ps, sched_nodes);
2215                   reset_partial_schedule (ps, ii);
2216                   verify_partial_schedule (ps, sched_nodes);
2217                   break;
2218                 }
2219 
2220               num_splits++;
2221               /* The scheduling window is exclusive of 'end'
2222                  whereas compute_split_window() expects an inclusive,
2223                  ordered range.  */
2224               if (step == 1)
2225                 split_row = compute_split_row (sched_nodes, start, end - 1,
2226                                                ps->ii, u_node);
2227               else
2228                 split_row = compute_split_row (sched_nodes, end + 1, start,
2229                                                ps->ii, u_node);
2230 
2231               ps_insert_empty_row (ps, split_row, sched_nodes);
2232               i--;              /* Go back and retry node i.  */
2233 
2234               if (dump_file)
2235                 fprintf (dump_file, "num_splits=%d\n", num_splits);
2236             }
2237 
2238           /* ??? If (success), check register pressure estimates.  */
2239         }                       /* Continue with next node.  */
2240     }                           /* While flush_and_start_over.  */
2241   if (ii >= maxii)
2242     {
2243       free_partial_schedule (ps);
2244       ps = NULL;
2245     }
2246   else
2247     gcc_assert (bitmap_equal_p (tobe_scheduled, sched_nodes));
2248 
2249   return ps;
2250 }
2251 
2252 /* This function inserts a new empty row into PS at the position
2253    according to SPLITROW, keeping all already scheduled instructions
2254    intact and updating their SCHED_TIME and cycle accordingly.  */
2255 static void
ps_insert_empty_row(partial_schedule_ptr ps,int split_row,sbitmap sched_nodes)2256 ps_insert_empty_row (partial_schedule_ptr ps, int split_row,
2257 		     sbitmap sched_nodes)
2258 {
2259   ps_insn_ptr crr_insn;
2260   ps_insn_ptr *rows_new;
2261   int ii = ps->ii;
2262   int new_ii = ii + 1;
2263   int row;
2264   int *rows_length_new;
2265 
2266   verify_partial_schedule (ps, sched_nodes);
2267 
2268   /* We normalize sched_time and rotate ps to have only non-negative sched
2269      times, for simplicity of updating cycles after inserting new row.  */
2270   split_row -= ps->min_cycle;
2271   split_row = SMODULO (split_row, ii);
2272   if (dump_file)
2273     fprintf (dump_file, "split_row=%d\n", split_row);
2274 
2275   reset_sched_times (ps, PS_MIN_CYCLE (ps));
2276   rotate_partial_schedule (ps, PS_MIN_CYCLE (ps));
2277 
2278   rows_new = (ps_insn_ptr *) xcalloc (new_ii, sizeof (ps_insn_ptr));
2279   rows_length_new = (int *) xcalloc (new_ii, sizeof (int));
2280   for (row = 0; row < split_row; row++)
2281     {
2282       rows_new[row] = ps->rows[row];
2283       rows_length_new[row] = ps->rows_length[row];
2284       ps->rows[row] = NULL;
2285       for (crr_insn = rows_new[row];
2286 	   crr_insn; crr_insn = crr_insn->next_in_row)
2287 	{
2288 	  int u = crr_insn->id;
2289 	  int new_time = SCHED_TIME (u) + (SCHED_TIME (u) / ii);
2290 
2291 	  SCHED_TIME (u) = new_time;
2292 	  crr_insn->cycle = new_time;
2293 	  SCHED_ROW (u) = new_time % new_ii;
2294 	  SCHED_STAGE (u) = new_time / new_ii;
2295 	}
2296 
2297     }
2298 
2299   rows_new[split_row] = NULL;
2300 
2301   for (row = split_row; row < ii; row++)
2302     {
2303       rows_new[row + 1] = ps->rows[row];
2304       rows_length_new[row + 1] = ps->rows_length[row];
2305       ps->rows[row] = NULL;
2306       for (crr_insn = rows_new[row + 1];
2307 	   crr_insn; crr_insn = crr_insn->next_in_row)
2308 	{
2309 	  int u = crr_insn->id;
2310 	  int new_time = SCHED_TIME (u) + (SCHED_TIME (u) / ii) + 1;
2311 
2312 	  SCHED_TIME (u) = new_time;
2313 	  crr_insn->cycle = new_time;
2314 	  SCHED_ROW (u) = new_time % new_ii;
2315 	  SCHED_STAGE (u) = new_time / new_ii;
2316 	}
2317     }
2318 
2319   /* Updating ps.  */
2320   ps->min_cycle = ps->min_cycle + ps->min_cycle / ii
2321     + (SMODULO (ps->min_cycle, ii) >= split_row ? 1 : 0);
2322   ps->max_cycle = ps->max_cycle + ps->max_cycle / ii
2323     + (SMODULO (ps->max_cycle, ii) >= split_row ? 1 : 0);
2324   free (ps->rows);
2325   ps->rows = rows_new;
2326   free (ps->rows_length);
2327   ps->rows_length = rows_length_new;
2328   ps->ii = new_ii;
2329   gcc_assert (ps->min_cycle >= 0);
2330 
2331   verify_partial_schedule (ps, sched_nodes);
2332 
2333   if (dump_file)
2334     fprintf (dump_file, "min_cycle=%d, max_cycle=%d\n", ps->min_cycle,
2335 	     ps->max_cycle);
2336 }
2337 
2338 /* Given U_NODE which is the node that failed to be scheduled; LOW and
2339    UP which are the boundaries of it's scheduling window; compute using
2340    SCHED_NODES and II a row in the partial schedule that can be split
2341    which will separate a critical predecessor from a critical successor
2342    thereby expanding the window, and return it.  */
2343 static int
compute_split_row(sbitmap sched_nodes,int low,int up,int ii,ddg_node_ptr u_node)2344 compute_split_row (sbitmap sched_nodes, int low, int up, int ii,
2345 		   ddg_node_ptr u_node)
2346 {
2347   ddg_edge_ptr e;
2348   int lower = INT_MIN, upper = INT_MAX;
2349   int crit_pred = -1;
2350   int crit_succ = -1;
2351   int crit_cycle;
2352 
2353   for (e = u_node->in; e != 0; e = e->next_in)
2354     {
2355       int v = e->src->cuid;
2356 
2357       if (bitmap_bit_p (sched_nodes, v)
2358 	  && (low == SCHED_TIME (v) + e->latency - (e->distance * ii)))
2359 	if (SCHED_TIME (v) > lower)
2360 	  {
2361 	    crit_pred = v;
2362 	    lower = SCHED_TIME (v);
2363 	  }
2364     }
2365 
2366   if (crit_pred >= 0)
2367     {
2368       crit_cycle = SCHED_TIME (crit_pred) + 1;
2369       return SMODULO (crit_cycle, ii);
2370     }
2371 
2372   for (e = u_node->out; e != 0; e = e->next_out)
2373     {
2374       int v = e->dest->cuid;
2375 
2376       if (bitmap_bit_p (sched_nodes, v)
2377 	  && (up == SCHED_TIME (v) - e->latency + (e->distance * ii)))
2378 	if (SCHED_TIME (v) < upper)
2379 	  {
2380 	    crit_succ = v;
2381 	    upper = SCHED_TIME (v);
2382 	  }
2383     }
2384 
2385   if (crit_succ >= 0)
2386     {
2387       crit_cycle = SCHED_TIME (crit_succ);
2388       return SMODULO (crit_cycle, ii);
2389     }
2390 
2391   if (dump_file)
2392     fprintf (dump_file, "Both crit_pred and crit_succ are NULL\n");
2393 
2394   return SMODULO ((low + up + 1) / 2, ii);
2395 }
2396 
2397 static void
verify_partial_schedule(partial_schedule_ptr ps,sbitmap sched_nodes)2398 verify_partial_schedule (partial_schedule_ptr ps, sbitmap sched_nodes)
2399 {
2400   int row;
2401   ps_insn_ptr crr_insn;
2402 
2403   for (row = 0; row < ps->ii; row++)
2404     {
2405       int length = 0;
2406 
2407       for (crr_insn = ps->rows[row]; crr_insn; crr_insn = crr_insn->next_in_row)
2408 	{
2409 	  int u = crr_insn->id;
2410 
2411 	  length++;
2412 	  gcc_assert (bitmap_bit_p (sched_nodes, u));
2413 	  /* ??? Test also that all nodes of sched_nodes are in ps, perhaps by
2414 	     popcount (sched_nodes) == number of insns in ps.  */
2415 	  gcc_assert (SCHED_TIME (u) >= ps->min_cycle);
2416 	  gcc_assert (SCHED_TIME (u) <= ps->max_cycle);
2417 	}
2418 
2419       gcc_assert (ps->rows_length[row] == length);
2420     }
2421 }
2422 
2423 
2424 /* This page implements the algorithm for ordering the nodes of a DDG
2425    for modulo scheduling, activated through the
2426    "int sms_order_nodes (ddg_ptr, int mii, int * result)" API.  */
2427 
2428 #define ORDER_PARAMS(x) ((struct node_order_params *) (x)->aux.info)
2429 #define ASAP(x) (ORDER_PARAMS ((x))->asap)
2430 #define ALAP(x) (ORDER_PARAMS ((x))->alap)
2431 #define HEIGHT(x) (ORDER_PARAMS ((x))->height)
2432 #define MOB(x) (ALAP ((x)) - ASAP ((x)))
2433 #define DEPTH(x) (ASAP ((x)))
2434 
2435 typedef struct node_order_params * nopa;
2436 
2437 static void order_nodes_of_sccs (ddg_all_sccs_ptr, int * result);
2438 static int order_nodes_in_scc (ddg_ptr, sbitmap, sbitmap, int*, int);
2439 static nopa  calculate_order_params (ddg_ptr, int, int *);
2440 static int find_max_asap (ddg_ptr, sbitmap);
2441 static int find_max_hv_min_mob (ddg_ptr, sbitmap);
2442 static int find_max_dv_min_mob (ddg_ptr, sbitmap);
2443 
2444 enum sms_direction {BOTTOMUP, TOPDOWN};
2445 
2446 struct node_order_params
2447 {
2448   int asap;
2449   int alap;
2450   int height;
2451 };
2452 
2453 /* Check if NODE_ORDER contains a permutation of 0 .. NUM_NODES-1.  */
2454 static void
check_nodes_order(int * node_order,int num_nodes)2455 check_nodes_order (int *node_order, int num_nodes)
2456 {
2457   int i;
2458   auto_sbitmap tmp (num_nodes);
2459 
2460   bitmap_clear (tmp);
2461 
2462   if (dump_file)
2463     fprintf (dump_file, "SMS final nodes order: \n");
2464 
2465   for (i = 0; i < num_nodes; i++)
2466     {
2467       int u = node_order[i];
2468 
2469       if (dump_file)
2470         fprintf (dump_file, "%d ", u);
2471       gcc_assert (u < num_nodes && u >= 0 && !bitmap_bit_p (tmp, u));
2472 
2473       bitmap_set_bit (tmp, u);
2474     }
2475 
2476   if (dump_file)
2477     fprintf (dump_file, "\n");
2478 }
2479 
2480 /* Order the nodes of G for scheduling and pass the result in
2481    NODE_ORDER.  Also set aux.count of each node to ASAP.
2482    Put maximal ASAP to PMAX_ASAP.  Return the recMII for the given DDG.  */
2483 static int
sms_order_nodes(ddg_ptr g,int mii,int * node_order,int * pmax_asap)2484 sms_order_nodes (ddg_ptr g, int mii, int * node_order, int *pmax_asap)
2485 {
2486   int i;
2487   int rec_mii = 0;
2488   ddg_all_sccs_ptr sccs = create_ddg_all_sccs (g);
2489 
2490   nopa nops = calculate_order_params (g, mii, pmax_asap);
2491 
2492   if (dump_file)
2493     print_sccs (dump_file, sccs, g);
2494 
2495   order_nodes_of_sccs (sccs, node_order);
2496 
2497   if (sccs->num_sccs > 0)
2498     /* First SCC has the largest recurrence_length.  */
2499     rec_mii = sccs->sccs[0]->recurrence_length;
2500 
2501   /* Save ASAP before destroying node_order_params.  */
2502   for (i = 0; i < g->num_nodes; i++)
2503     {
2504       ddg_node_ptr v = &g->nodes[i];
2505       v->aux.count = ASAP (v);
2506     }
2507 
2508   free (nops);
2509   free_ddg_all_sccs (sccs);
2510   check_nodes_order (node_order, g->num_nodes);
2511 
2512   return rec_mii;
2513 }
2514 
2515 static void
order_nodes_of_sccs(ddg_all_sccs_ptr all_sccs,int * node_order)2516 order_nodes_of_sccs (ddg_all_sccs_ptr all_sccs, int * node_order)
2517 {
2518   int i, pos = 0;
2519   ddg_ptr g = all_sccs->ddg;
2520   int num_nodes = g->num_nodes;
2521   auto_sbitmap prev_sccs (num_nodes);
2522   auto_sbitmap on_path (num_nodes);
2523   auto_sbitmap tmp (num_nodes);
2524   auto_sbitmap ones (num_nodes);
2525 
2526   bitmap_clear (prev_sccs);
2527   bitmap_ones (ones);
2528 
2529   /* Perform the node ordering starting from the SCC with the highest recMII.
2530      For each SCC order the nodes according to their ASAP/ALAP/HEIGHT etc.  */
2531   for (i = 0; i < all_sccs->num_sccs; i++)
2532     {
2533       ddg_scc_ptr scc = all_sccs->sccs[i];
2534 
2535       /* Add nodes on paths from previous SCCs to the current SCC.  */
2536       find_nodes_on_paths (on_path, g, prev_sccs, scc->nodes);
2537       bitmap_ior (tmp, scc->nodes, on_path);
2538 
2539       /* Add nodes on paths from the current SCC to previous SCCs.  */
2540       find_nodes_on_paths (on_path, g, scc->nodes, prev_sccs);
2541       bitmap_ior (tmp, tmp, on_path);
2542 
2543       /* Remove nodes of previous SCCs from current extended SCC.  */
2544       bitmap_and_compl (tmp, tmp, prev_sccs);
2545 
2546       pos = order_nodes_in_scc (g, prev_sccs, tmp, node_order, pos);
2547       /* Above call to order_nodes_in_scc updated prev_sccs |= tmp.  */
2548     }
2549 
2550   /* Handle the remaining nodes that do not belong to any scc.  Each call
2551      to order_nodes_in_scc handles a single connected component.  */
2552   while (pos < g->num_nodes)
2553     {
2554       bitmap_and_compl (tmp, ones, prev_sccs);
2555       pos = order_nodes_in_scc (g, prev_sccs, tmp, node_order, pos);
2556     }
2557 }
2558 
2559 /* MII is needed if we consider backarcs (that do not close recursive cycles).  */
2560 static struct node_order_params *
calculate_order_params(ddg_ptr g,int mii ATTRIBUTE_UNUSED,int * pmax_asap)2561 calculate_order_params (ddg_ptr g, int mii ATTRIBUTE_UNUSED, int *pmax_asap)
2562 {
2563   int u;
2564   int max_asap;
2565   int num_nodes = g->num_nodes;
2566   ddg_edge_ptr e;
2567   /* Allocate a place to hold ordering params for each node in the DDG.  */
2568   nopa node_order_params_arr;
2569 
2570   /* Initialize of ASAP/ALAP/HEIGHT to zero.  */
2571   node_order_params_arr = (nopa) xcalloc (num_nodes,
2572 					  sizeof (struct node_order_params));
2573 
2574   /* Set the aux pointer of each node to point to its order_params structure.  */
2575   for (u = 0; u < num_nodes; u++)
2576     g->nodes[u].aux.info = &node_order_params_arr[u];
2577 
2578   /* Disregarding a backarc from each recursive cycle to obtain a DAG,
2579      calculate ASAP, ALAP, mobility, distance, and height for each node
2580      in the dependence (direct acyclic) graph.  */
2581 
2582   /* We assume that the nodes in the array are in topological order.  */
2583 
2584   max_asap = 0;
2585   for (u = 0; u < num_nodes; u++)
2586     {
2587       ddg_node_ptr u_node = &g->nodes[u];
2588 
2589       ASAP (u_node) = 0;
2590       for (e = u_node->in; e; e = e->next_in)
2591 	if (e->distance == 0)
2592 	  ASAP (u_node) = MAX (ASAP (u_node),
2593 			       ASAP (e->src) + e->latency);
2594       max_asap = MAX (max_asap, ASAP (u_node));
2595     }
2596 
2597   for (u = num_nodes - 1; u > -1; u--)
2598     {
2599       ddg_node_ptr u_node = &g->nodes[u];
2600 
2601       ALAP (u_node) = max_asap;
2602       HEIGHT (u_node) = 0;
2603       for (e = u_node->out; e; e = e->next_out)
2604 	if (e->distance == 0)
2605 	  {
2606 	    ALAP (u_node) = MIN (ALAP (u_node),
2607 				 ALAP (e->dest) - e->latency);
2608 	    HEIGHT (u_node) = MAX (HEIGHT (u_node),
2609 				   HEIGHT (e->dest) + e->latency);
2610 	  }
2611     }
2612   if (dump_file)
2613   {
2614     fprintf (dump_file, "\nOrder params\n");
2615     for (u = 0; u < num_nodes; u++)
2616       {
2617         ddg_node_ptr u_node = &g->nodes[u];
2618 
2619         fprintf (dump_file, "node %d, ASAP: %d, ALAP: %d, HEIGHT: %d\n", u,
2620                  ASAP (u_node), ALAP (u_node), HEIGHT (u_node));
2621       }
2622   }
2623 
2624   *pmax_asap = max_asap;
2625   return node_order_params_arr;
2626 }
2627 
2628 static int
find_max_asap(ddg_ptr g,sbitmap nodes)2629 find_max_asap (ddg_ptr g, sbitmap nodes)
2630 {
2631   unsigned int u = 0;
2632   int max_asap = -1;
2633   int result = -1;
2634   sbitmap_iterator sbi;
2635 
2636   EXECUTE_IF_SET_IN_BITMAP (nodes, 0, u, sbi)
2637     {
2638       ddg_node_ptr u_node = &g->nodes[u];
2639 
2640       if (max_asap < ASAP (u_node))
2641 	{
2642 	  max_asap = ASAP (u_node);
2643 	  result = u;
2644 	}
2645     }
2646   return result;
2647 }
2648 
2649 static int
find_max_hv_min_mob(ddg_ptr g,sbitmap nodes)2650 find_max_hv_min_mob (ddg_ptr g, sbitmap nodes)
2651 {
2652   unsigned int u = 0;
2653   int max_hv = -1;
2654   int min_mob = INT_MAX;
2655   int result = -1;
2656   sbitmap_iterator sbi;
2657 
2658   EXECUTE_IF_SET_IN_BITMAP (nodes, 0, u, sbi)
2659     {
2660       ddg_node_ptr u_node = &g->nodes[u];
2661 
2662       if (max_hv < HEIGHT (u_node))
2663 	{
2664 	  max_hv = HEIGHT (u_node);
2665 	  min_mob = MOB (u_node);
2666 	  result = u;
2667 	}
2668       else if ((max_hv == HEIGHT (u_node))
2669 	       && (min_mob > MOB (u_node)))
2670 	{
2671 	  min_mob = MOB (u_node);
2672 	  result = u;
2673 	}
2674     }
2675   return result;
2676 }
2677 
2678 static int
find_max_dv_min_mob(ddg_ptr g,sbitmap nodes)2679 find_max_dv_min_mob (ddg_ptr g, sbitmap nodes)
2680 {
2681   unsigned int u = 0;
2682   int max_dv = -1;
2683   int min_mob = INT_MAX;
2684   int result = -1;
2685   sbitmap_iterator sbi;
2686 
2687   EXECUTE_IF_SET_IN_BITMAP (nodes, 0, u, sbi)
2688     {
2689       ddg_node_ptr u_node = &g->nodes[u];
2690 
2691       if (max_dv < DEPTH (u_node))
2692 	{
2693 	  max_dv = DEPTH (u_node);
2694 	  min_mob = MOB (u_node);
2695 	  result = u;
2696 	}
2697       else if ((max_dv == DEPTH (u_node))
2698 	       && (min_mob > MOB (u_node)))
2699 	{
2700 	  min_mob = MOB (u_node);
2701 	  result = u;
2702 	}
2703     }
2704   return result;
2705 }
2706 
2707 /* Places the nodes of SCC into the NODE_ORDER array starting
2708    at position POS, according to the SMS ordering algorithm.
2709    NODES_ORDERED (in&out parameter) holds the bitset of all nodes in
2710    the NODE_ORDER array, starting from position zero.  */
2711 static int
order_nodes_in_scc(ddg_ptr g,sbitmap nodes_ordered,sbitmap scc,int * node_order,int pos)2712 order_nodes_in_scc (ddg_ptr g, sbitmap nodes_ordered, sbitmap scc,
2713 		    int * node_order, int pos)
2714 {
2715   enum sms_direction dir;
2716   int num_nodes = g->num_nodes;
2717   auto_sbitmap workset (num_nodes);
2718   auto_sbitmap tmp (num_nodes);
2719   sbitmap zero_bitmap = sbitmap_alloc (num_nodes);
2720   auto_sbitmap predecessors (num_nodes);
2721   auto_sbitmap successors (num_nodes);
2722 
2723   bitmap_clear (predecessors);
2724   find_predecessors (predecessors, g, nodes_ordered);
2725 
2726   bitmap_clear (successors);
2727   find_successors (successors, g, nodes_ordered);
2728 
2729   bitmap_clear (tmp);
2730   if (bitmap_and (tmp, predecessors, scc))
2731     {
2732       bitmap_copy (workset, tmp);
2733       dir = BOTTOMUP;
2734     }
2735   else if (bitmap_and (tmp, successors, scc))
2736     {
2737       bitmap_copy (workset, tmp);
2738       dir = TOPDOWN;
2739     }
2740   else
2741     {
2742       int u;
2743 
2744       bitmap_clear (workset);
2745       if ((u = find_max_asap (g, scc)) >= 0)
2746 	bitmap_set_bit (workset, u);
2747       dir = BOTTOMUP;
2748     }
2749 
2750   bitmap_clear (zero_bitmap);
2751   while (!bitmap_equal_p (workset, zero_bitmap))
2752     {
2753       int v;
2754       ddg_node_ptr v_node;
2755       sbitmap v_node_preds;
2756       sbitmap v_node_succs;
2757 
2758       if (dir == TOPDOWN)
2759 	{
2760 	  while (!bitmap_equal_p (workset, zero_bitmap))
2761 	    {
2762 	      v = find_max_hv_min_mob (g, workset);
2763 	      v_node = &g->nodes[v];
2764 	      node_order[pos++] = v;
2765 	      v_node_succs = NODE_SUCCESSORS (v_node);
2766 	      bitmap_and (tmp, v_node_succs, scc);
2767 
2768 	      /* Don't consider the already ordered successors again.  */
2769 	      bitmap_and_compl (tmp, tmp, nodes_ordered);
2770 	      bitmap_ior (workset, workset, tmp);
2771 	      bitmap_clear_bit (workset, v);
2772 	      bitmap_set_bit (nodes_ordered, v);
2773 	    }
2774 	  dir = BOTTOMUP;
2775 	  bitmap_clear (predecessors);
2776 	  find_predecessors (predecessors, g, nodes_ordered);
2777 	  bitmap_and (workset, predecessors, scc);
2778 	}
2779       else
2780 	{
2781 	  while (!bitmap_equal_p (workset, zero_bitmap))
2782 	    {
2783 	      v = find_max_dv_min_mob (g, workset);
2784 	      v_node = &g->nodes[v];
2785 	      node_order[pos++] = v;
2786 	      v_node_preds = NODE_PREDECESSORS (v_node);
2787 	      bitmap_and (tmp, v_node_preds, scc);
2788 
2789 	      /* Don't consider the already ordered predecessors again.  */
2790 	      bitmap_and_compl (tmp, tmp, nodes_ordered);
2791 	      bitmap_ior (workset, workset, tmp);
2792 	      bitmap_clear_bit (workset, v);
2793 	      bitmap_set_bit (nodes_ordered, v);
2794 	    }
2795 	  dir = TOPDOWN;
2796 	  bitmap_clear (successors);
2797 	  find_successors (successors, g, nodes_ordered);
2798 	  bitmap_and (workset, successors, scc);
2799 	}
2800     }
2801   sbitmap_free (zero_bitmap);
2802   return pos;
2803 }
2804 
2805 
2806 /* This page contains functions for manipulating partial-schedules during
2807    modulo scheduling.  */
2808 
2809 /* Create a partial schedule and allocate a memory to hold II rows.  */
2810 
2811 static partial_schedule_ptr
create_partial_schedule(int ii,ddg_ptr g,int history)2812 create_partial_schedule (int ii, ddg_ptr g, int history)
2813 {
2814   partial_schedule_ptr ps = XNEW (struct partial_schedule);
2815   ps->rows = (ps_insn_ptr *) xcalloc (ii, sizeof (ps_insn_ptr));
2816   ps->rows_length = (int *) xcalloc (ii, sizeof (int));
2817   ps->reg_moves.create (0);
2818   ps->ii = ii;
2819   ps->history = history;
2820   ps->min_cycle = INT_MAX;
2821   ps->max_cycle = INT_MIN;
2822   ps->g = g;
2823 
2824   return ps;
2825 }
2826 
2827 /* Free the PS_INSNs in rows array of the given partial schedule.
2828    ??? Consider caching the PS_INSN's.  */
2829 static void
free_ps_insns(partial_schedule_ptr ps)2830 free_ps_insns (partial_schedule_ptr ps)
2831 {
2832   int i;
2833 
2834   for (i = 0; i < ps->ii; i++)
2835     {
2836       while (ps->rows[i])
2837 	{
2838 	  ps_insn_ptr ps_insn = ps->rows[i]->next_in_row;
2839 
2840 	  free (ps->rows[i]);
2841 	  ps->rows[i] = ps_insn;
2842 	}
2843       ps->rows[i] = NULL;
2844     }
2845 }
2846 
2847 /* Free all the memory allocated to the partial schedule.  */
2848 
2849 static void
free_partial_schedule(partial_schedule_ptr ps)2850 free_partial_schedule (partial_schedule_ptr ps)
2851 {
2852   ps_reg_move_info *move;
2853   unsigned int i;
2854 
2855   if (!ps)
2856     return;
2857 
2858   FOR_EACH_VEC_ELT (ps->reg_moves, i, move)
2859     sbitmap_free (move->uses);
2860   ps->reg_moves.release ();
2861 
2862   free_ps_insns (ps);
2863   free (ps->rows);
2864   free (ps->rows_length);
2865   free (ps);
2866 }
2867 
2868 /* Clear the rows array with its PS_INSNs, and create a new one with
2869    NEW_II rows.  */
2870 
2871 static void
reset_partial_schedule(partial_schedule_ptr ps,int new_ii)2872 reset_partial_schedule (partial_schedule_ptr ps, int new_ii)
2873 {
2874   if (!ps)
2875     return;
2876   free_ps_insns (ps);
2877   if (new_ii == ps->ii)
2878     return;
2879   ps->rows = (ps_insn_ptr *) xrealloc (ps->rows, new_ii
2880 						 * sizeof (ps_insn_ptr));
2881   memset (ps->rows, 0, new_ii * sizeof (ps_insn_ptr));
2882   ps->rows_length = (int *) xrealloc (ps->rows_length, new_ii * sizeof (int));
2883   memset (ps->rows_length, 0, new_ii * sizeof (int));
2884   ps->ii = new_ii;
2885   ps->min_cycle = INT_MAX;
2886   ps->max_cycle = INT_MIN;
2887 }
2888 
2889 /* Prints the partial schedule as an ii rows array, for each rows
2890    print the ids of the insns in it.  */
2891 void
print_partial_schedule(partial_schedule_ptr ps,FILE * dump)2892 print_partial_schedule (partial_schedule_ptr ps, FILE *dump)
2893 {
2894   int i;
2895 
2896   for (i = 0; i < ps->ii; i++)
2897     {
2898       ps_insn_ptr ps_i = ps->rows[i];
2899 
2900       fprintf (dump, "\n[ROW %d ]: ", i);
2901       while (ps_i)
2902 	{
2903 	  rtx_insn *insn = ps_rtl_insn (ps, ps_i->id);
2904 
2905 	  if (JUMP_P (insn))
2906 	    fprintf (dump, "%d (branch), ", INSN_UID (insn));
2907 	  else
2908 	    fprintf (dump, "%d, ", INSN_UID (insn));
2909 
2910 	  ps_i = ps_i->next_in_row;
2911 	}
2912     }
2913 }
2914 
2915 /* Creates an object of PS_INSN and initializes it to the given parameters.  */
2916 static ps_insn_ptr
create_ps_insn(int id,int cycle)2917 create_ps_insn (int id, int cycle)
2918 {
2919   ps_insn_ptr ps_i = XNEW (struct ps_insn);
2920 
2921   ps_i->id = id;
2922   ps_i->next_in_row = NULL;
2923   ps_i->prev_in_row = NULL;
2924   ps_i->cycle = cycle;
2925 
2926   return ps_i;
2927 }
2928 
2929 
2930 /* Removes the given PS_INSN from the partial schedule.  */
2931 static void
remove_node_from_ps(partial_schedule_ptr ps,ps_insn_ptr ps_i)2932 remove_node_from_ps (partial_schedule_ptr ps, ps_insn_ptr ps_i)
2933 {
2934   int row;
2935 
2936   gcc_assert (ps && ps_i);
2937 
2938   row = SMODULO (ps_i->cycle, ps->ii);
2939   if (! ps_i->prev_in_row)
2940     {
2941       gcc_assert (ps_i == ps->rows[row]);
2942       ps->rows[row] = ps_i->next_in_row;
2943       if (ps->rows[row])
2944 	ps->rows[row]->prev_in_row = NULL;
2945     }
2946   else
2947     {
2948       ps_i->prev_in_row->next_in_row = ps_i->next_in_row;
2949       if (ps_i->next_in_row)
2950 	ps_i->next_in_row->prev_in_row = ps_i->prev_in_row;
2951     }
2952 
2953   ps->rows_length[row] -= 1;
2954   free (ps_i);
2955   return;
2956 }
2957 
2958 /* Unlike what literature describes for modulo scheduling (which focuses
2959    on VLIW machines) the order of the instructions inside a cycle is
2960    important.  Given the bitmaps MUST_FOLLOW and MUST_PRECEDE we know
2961    where the current instruction should go relative to the already
2962    scheduled instructions in the given cycle.  Go over these
2963    instructions and find the first possible column to put it in.  */
2964 static bool
ps_insn_find_column(partial_schedule_ptr ps,ps_insn_ptr ps_i,sbitmap must_precede,sbitmap must_follow)2965 ps_insn_find_column (partial_schedule_ptr ps, ps_insn_ptr ps_i,
2966 		     sbitmap must_precede, sbitmap must_follow)
2967 {
2968   ps_insn_ptr next_ps_i;
2969   ps_insn_ptr first_must_follow = NULL;
2970   ps_insn_ptr last_must_precede = NULL;
2971   ps_insn_ptr last_in_row = NULL;
2972   int row;
2973 
2974   if (! ps_i)
2975     return false;
2976 
2977   row = SMODULO (ps_i->cycle, ps->ii);
2978 
2979   /* Find the first must follow and the last must precede
2980      and insert the node immediately after the must precede
2981      but make sure that it there is no must follow after it.  */
2982   for (next_ps_i = ps->rows[row];
2983        next_ps_i;
2984        next_ps_i = next_ps_i->next_in_row)
2985     {
2986       if (must_follow
2987 	  && bitmap_bit_p (must_follow, next_ps_i->id)
2988 	  && ! first_must_follow)
2989         first_must_follow = next_ps_i;
2990       if (must_precede && bitmap_bit_p (must_precede, next_ps_i->id))
2991         {
2992           /* If we have already met a node that must follow, then
2993 	     there is no possible column.  */
2994   	  if (first_must_follow)
2995             return false;
2996 	  else
2997             last_must_precede = next_ps_i;
2998         }
2999       /* The closing branch must be the last in the row.  */
3000       if (JUMP_P (ps_rtl_insn (ps, next_ps_i->id)))
3001 	return false;
3002 
3003        last_in_row = next_ps_i;
3004     }
3005 
3006   /* The closing branch is scheduled as well.  Make sure there is no
3007      dependent instruction after it as the branch should be the last
3008      instruction in the row.  */
3009   if (JUMP_P (ps_rtl_insn (ps, ps_i->id)))
3010     {
3011       if (first_must_follow)
3012 	return false;
3013       if (last_in_row)
3014 	{
3015 	  /* Make the branch the last in the row.  New instructions
3016 	     will be inserted at the beginning of the row or after the
3017 	     last must_precede instruction thus the branch is guaranteed
3018 	     to remain the last instruction in the row.  */
3019 	  last_in_row->next_in_row = ps_i;
3020 	  ps_i->prev_in_row = last_in_row;
3021 	  ps_i->next_in_row = NULL;
3022 	}
3023       else
3024 	ps->rows[row] = ps_i;
3025       return true;
3026     }
3027 
3028   /* Now insert the node after INSERT_AFTER_PSI.  */
3029 
3030   if (! last_must_precede)
3031     {
3032       ps_i->next_in_row = ps->rows[row];
3033       ps_i->prev_in_row = NULL;
3034       if (ps_i->next_in_row)
3035     	ps_i->next_in_row->prev_in_row = ps_i;
3036       ps->rows[row] = ps_i;
3037     }
3038   else
3039     {
3040       ps_i->next_in_row = last_must_precede->next_in_row;
3041       last_must_precede->next_in_row = ps_i;
3042       ps_i->prev_in_row = last_must_precede;
3043       if (ps_i->next_in_row)
3044         ps_i->next_in_row->prev_in_row = ps_i;
3045     }
3046 
3047   return true;
3048 }
3049 
3050 /* Advances the PS_INSN one column in its current row; returns false
3051    in failure and true in success.  Bit N is set in MUST_FOLLOW if
3052    the node with cuid N must be come after the node pointed to by
3053    PS_I when scheduled in the same cycle.  */
3054 static int
ps_insn_advance_column(partial_schedule_ptr ps,ps_insn_ptr ps_i,sbitmap must_follow)3055 ps_insn_advance_column (partial_schedule_ptr ps, ps_insn_ptr ps_i,
3056 			sbitmap must_follow)
3057 {
3058   ps_insn_ptr prev, next;
3059   int row;
3060 
3061   if (!ps || !ps_i)
3062     return false;
3063 
3064   row = SMODULO (ps_i->cycle, ps->ii);
3065 
3066   if (! ps_i->next_in_row)
3067     return false;
3068 
3069   /* Check if next_in_row is dependent on ps_i, both having same sched
3070      times (typically ANTI_DEP).  If so, ps_i cannot skip over it.  */
3071   if (must_follow && bitmap_bit_p (must_follow, ps_i->next_in_row->id))
3072     return false;
3073 
3074   /* Advance PS_I over its next_in_row in the doubly linked list.  */
3075   prev = ps_i->prev_in_row;
3076   next = ps_i->next_in_row;
3077 
3078   if (ps_i == ps->rows[row])
3079     ps->rows[row] = next;
3080 
3081   ps_i->next_in_row = next->next_in_row;
3082 
3083   if (next->next_in_row)
3084     next->next_in_row->prev_in_row = ps_i;
3085 
3086   next->next_in_row = ps_i;
3087   ps_i->prev_in_row = next;
3088 
3089   next->prev_in_row = prev;
3090   if (prev)
3091     prev->next_in_row = next;
3092 
3093   return true;
3094 }
3095 
3096 /* Inserts a DDG_NODE to the given partial schedule at the given cycle.
3097    Returns 0 if this is not possible and a PS_INSN otherwise.  Bit N is
3098    set in MUST_PRECEDE/MUST_FOLLOW if the node with cuid N must be come
3099    before/after (respectively) the node pointed to by PS_I when scheduled
3100    in the same cycle.  */
3101 static ps_insn_ptr
add_node_to_ps(partial_schedule_ptr ps,int id,int cycle,sbitmap must_precede,sbitmap must_follow)3102 add_node_to_ps (partial_schedule_ptr ps, int id, int cycle,
3103 		sbitmap must_precede, sbitmap must_follow)
3104 {
3105   ps_insn_ptr ps_i;
3106   int row = SMODULO (cycle, ps->ii);
3107 
3108   if (ps->rows_length[row] >= issue_rate)
3109     return NULL;
3110 
3111   ps_i = create_ps_insn (id, cycle);
3112 
3113   /* Finds and inserts PS_I according to MUST_FOLLOW and
3114      MUST_PRECEDE.  */
3115   if (! ps_insn_find_column (ps, ps_i, must_precede, must_follow))
3116     {
3117       free (ps_i);
3118       return NULL;
3119     }
3120 
3121   ps->rows_length[row] += 1;
3122   return ps_i;
3123 }
3124 
3125 /* Advance time one cycle.  Assumes DFA is being used.  */
3126 static void
advance_one_cycle(void)3127 advance_one_cycle (void)
3128 {
3129   if (targetm.sched.dfa_pre_cycle_insn)
3130     state_transition (curr_state,
3131 		      targetm.sched.dfa_pre_cycle_insn ());
3132 
3133   state_transition (curr_state, NULL);
3134 
3135   if (targetm.sched.dfa_post_cycle_insn)
3136     state_transition (curr_state,
3137 		      targetm.sched.dfa_post_cycle_insn ());
3138 }
3139 
3140 
3141 
3142 /* Checks if PS has resource conflicts according to DFA, starting from
3143    FROM cycle to TO cycle; returns true if there are conflicts and false
3144    if there are no conflicts.  Assumes DFA is being used.  */
3145 static int
ps_has_conflicts(partial_schedule_ptr ps,int from,int to)3146 ps_has_conflicts (partial_schedule_ptr ps, int from, int to)
3147 {
3148   int cycle;
3149 
3150   state_reset (curr_state);
3151 
3152   for (cycle = from; cycle <= to; cycle++)
3153     {
3154       ps_insn_ptr crr_insn;
3155       /* Holds the remaining issue slots in the current row.  */
3156       int can_issue_more = issue_rate;
3157 
3158       /* Walk through the DFA for the current row.  */
3159       for (crr_insn = ps->rows[SMODULO (cycle, ps->ii)];
3160 	   crr_insn;
3161 	   crr_insn = crr_insn->next_in_row)
3162 	{
3163 	  rtx_insn *insn = ps_rtl_insn (ps, crr_insn->id);
3164 
3165 	  if (!NONDEBUG_INSN_P (insn))
3166 	    continue;
3167 
3168 	  /* Check if there is room for the current insn.  */
3169 	  if (!can_issue_more || state_dead_lock_p (curr_state))
3170 	    return true;
3171 
3172 	  /* Update the DFA state and return with failure if the DFA found
3173 	     resource conflicts.  */
3174 	  if (state_transition (curr_state, insn) >= 0)
3175 	    return true;
3176 
3177 	  if (targetm.sched.variable_issue)
3178 	    can_issue_more =
3179 	      targetm.sched.variable_issue (sched_dump, sched_verbose,
3180 					    insn, can_issue_more);
3181 	  /* A naked CLOBBER or USE generates no instruction, so don't
3182 	     let them consume issue slots.  */
3183 	  else if (GET_CODE (PATTERN (insn)) != USE
3184 		   && GET_CODE (PATTERN (insn)) != CLOBBER)
3185 	    can_issue_more--;
3186 	}
3187 
3188       /* Advance the DFA to the next cycle.  */
3189       advance_one_cycle ();
3190     }
3191   return false;
3192 }
3193 
3194 /* Checks if the given node causes resource conflicts when added to PS at
3195    cycle C.  If not the node is added to PS and returned; otherwise zero
3196    is returned.  Bit N is set in MUST_PRECEDE/MUST_FOLLOW if the node with
3197    cuid N must be come before/after (respectively) the node pointed to by
3198    PS_I when scheduled in the same cycle.  */
3199 ps_insn_ptr
ps_add_node_check_conflicts(partial_schedule_ptr ps,int n,int c,sbitmap must_precede,sbitmap must_follow)3200 ps_add_node_check_conflicts (partial_schedule_ptr ps, int n,
3201    			     int c, sbitmap must_precede,
3202 			     sbitmap must_follow)
3203 {
3204   int has_conflicts = 0;
3205   ps_insn_ptr ps_i;
3206 
3207   /* First add the node to the PS, if this succeeds check for
3208      conflicts, trying different issue slots in the same row.  */
3209   if (! (ps_i = add_node_to_ps (ps, n, c, must_precede, must_follow)))
3210     return NULL; /* Failed to insert the node at the given cycle.  */
3211 
3212   has_conflicts = ps_has_conflicts (ps, c, c)
3213 		  || (ps->history > 0
3214 		      && ps_has_conflicts (ps,
3215 					   c - ps->history,
3216 					   c + ps->history));
3217 
3218   /* Try different issue slots to find one that the given node can be
3219      scheduled in without conflicts.  */
3220   while (has_conflicts)
3221     {
3222       if (! ps_insn_advance_column (ps, ps_i, must_follow))
3223 	break;
3224       has_conflicts = ps_has_conflicts (ps, c, c)
3225 		      || (ps->history > 0
3226 			  && ps_has_conflicts (ps,
3227 					       c - ps->history,
3228 					       c + ps->history));
3229     }
3230 
3231   if (has_conflicts)
3232     {
3233       remove_node_from_ps (ps, ps_i);
3234       return NULL;
3235     }
3236 
3237   ps->min_cycle = MIN (ps->min_cycle, c);
3238   ps->max_cycle = MAX (ps->max_cycle, c);
3239   return ps_i;
3240 }
3241 
3242 /* Calculate the stage count of the partial schedule PS.  The calculation
3243    takes into account the rotation amount passed in ROTATION_AMOUNT.  */
3244 int
calculate_stage_count(partial_schedule_ptr ps,int rotation_amount)3245 calculate_stage_count (partial_schedule_ptr ps, int rotation_amount)
3246 {
3247   int new_min_cycle = PS_MIN_CYCLE (ps) - rotation_amount;
3248   int new_max_cycle = PS_MAX_CYCLE (ps) - rotation_amount;
3249   int stage_count = CALC_STAGE_COUNT (-1, new_min_cycle, ps->ii);
3250 
3251   /* The calculation of stage count is done adding the number of stages
3252      before cycle zero and after cycle zero.  */
3253   stage_count += CALC_STAGE_COUNT (new_max_cycle, 0, ps->ii);
3254 
3255   return stage_count;
3256 }
3257 
3258 /* Rotate the rows of PS such that insns scheduled at time
3259    START_CYCLE will appear in row 0.  Updates max/min_cycles.  */
3260 void
rotate_partial_schedule(partial_schedule_ptr ps,int start_cycle)3261 rotate_partial_schedule (partial_schedule_ptr ps, int start_cycle)
3262 {
3263   int i, row, backward_rotates;
3264   int last_row = ps->ii - 1;
3265 
3266   if (start_cycle == 0)
3267     return;
3268 
3269   backward_rotates = SMODULO (start_cycle, ps->ii);
3270 
3271   /* Revisit later and optimize this into a single loop.  */
3272   for (i = 0; i < backward_rotates; i++)
3273     {
3274       ps_insn_ptr first_row = ps->rows[0];
3275       int first_row_length = ps->rows_length[0];
3276 
3277       for (row = 0; row < last_row; row++)
3278 	{
3279 	  ps->rows[row] = ps->rows[row + 1];
3280 	  ps->rows_length[row] = ps->rows_length[row + 1];
3281 	}
3282 
3283       ps->rows[last_row] = first_row;
3284       ps->rows_length[last_row] = first_row_length;
3285     }
3286 
3287   ps->max_cycle -= start_cycle;
3288   ps->min_cycle -= start_cycle;
3289 }
3290 
3291 #endif /* INSN_SCHEDULING */
3292 
3293 /* Run instruction scheduler.  */
3294 /* Perform SMS module scheduling.  */
3295 
3296 namespace {
3297 
3298 const pass_data pass_data_sms =
3299 {
3300   RTL_PASS, /* type */
3301   "sms", /* name */
3302   OPTGROUP_NONE, /* optinfo_flags */
3303   TV_SMS, /* tv_id */
3304   0, /* properties_required */
3305   0, /* properties_provided */
3306   0, /* properties_destroyed */
3307   0, /* todo_flags_start */
3308   TODO_df_finish, /* todo_flags_finish */
3309 };
3310 
3311 class pass_sms : public rtl_opt_pass
3312 {
3313 public:
pass_sms(gcc::context * ctxt)3314   pass_sms (gcc::context *ctxt)
3315     : rtl_opt_pass (pass_data_sms, ctxt)
3316   {}
3317 
3318   /* opt_pass methods: */
gate(function *)3319   virtual bool gate (function *)
3320 {
3321   return (optimize > 0 && flag_modulo_sched);
3322 }
3323 
3324   virtual unsigned int execute (function *);
3325 
3326 }; // class pass_sms
3327 
3328 unsigned int
execute(function * fun ATTRIBUTE_UNUSED)3329 pass_sms::execute (function *fun ATTRIBUTE_UNUSED)
3330 {
3331 #ifdef INSN_SCHEDULING
3332   basic_block bb;
3333 
3334   /* Collect loop information to be used in SMS.  */
3335   cfg_layout_initialize (0);
3336   sms_schedule ();
3337 
3338   /* Update the life information, because we add pseudos.  */
3339   max_regno = max_reg_num ();
3340 
3341   /* Finalize layout changes.  */
3342   FOR_EACH_BB_FN (bb, fun)
3343     if (bb->next_bb != EXIT_BLOCK_PTR_FOR_FN (fun))
3344       bb->aux = bb->next_bb;
3345   free_dominance_info (CDI_DOMINATORS);
3346   cfg_layout_finalize ();
3347 #endif /* INSN_SCHEDULING */
3348   return 0;
3349 }
3350 
3351 } // anon namespace
3352 
3353 rtl_opt_pass *
make_pass_sms(gcc::context * ctxt)3354 make_pass_sms (gcc::context *ctxt)
3355 {
3356   return new pass_sms (ctxt);
3357 }
3358