1% This file is part of the MMIXware package (c) Donald E Knuth 1999
2@i boilerplate.w %<< legal stuff: PLEASE READ IT BEFORE MAKING ANY CHANGES!
3
4\def\title{MMIX-PIPE}
5\def\MMIX{\.{MMIX}}
6\def\NNIX{\hbox{\mc NNIX}}
7\def\Hex#1{\hbox{$^{\scriptscriptstyle\#}$\tt#1}} % experimental hex constant
8@s and normal @q unreserve a C++ keyword @>
9@s or normal @q unreserve a C++ keyword @>
10@s bool normal @q unreserve a C++ keyword @>
11@s xor normal @q unreserve a C++ keyword @>
12
13@* Introduction. This program is the heart of the meta-simulator for the
14ultra-configurable \MMIX\ pipeline: It defines the |MMIX_run| routine, which
15does most of the
16work. Another routine, |MMIX_init|, is also defined here, and so is a header
file called \.{mmix-pipe.h}. The header file is used by the main routine and
18by other routines like |MMIX_config|, which are compiled separately.
19
20Readers of this program should be familiar with the explanation of \MMIX\
21architecture as presented in the main program module for {\mc MMMIX}.
22
23A lot of subtle things can happen when instructions are executed in parallel.
24Therefore this simulator ranks among the most interesting and instructive
25programs in the author's experience. The author has tried his best to make
26everything correct \dots\ but the chances for error are great. Anyone who
27discovers a bug is therefore urged to report it as soon as possible;
28please see \.{http:/\kern-.1em/mmix.cs.hm.edu/bugs/} for instructions.
29
30It sort of boggles the mind when one realizes that the present program might
31someday be translated by a \CEE/~compiler for \MMIX\ and used to simulate
32{\it itself}.
33
34@ This high-performance prototype of \MMIX\ achieves its efficiency by
35means of ``pipelining,'' a technique of overlapping that is explained
36for the related \.{DLX} computer in Chapter~3 of Hennessy \char`\&\ Patterson's
37@^Hennessy, John LeRoy@>
38@^Patterson, David Andrew@>
39book {\sl Computer Architecture\/} (second edition). Other techniques
40such as ``dynamic scheduling'' and ``multiple issue,'' explained in
41Chapter~4 of that book, are used too.
42
43One good way to visualize the procedure is to imagine that somebody has
44organized a high-tech car repair shop according to similar principles.
45There are eight independent functional units, which we can think of as
46eight groups of auto mechanics, each specializing in a particular task;
47each group has its own workspace with room to deal with one car at a time.
48Group~F (the ``fetch'' group) is in charge of rounding up customers and
49getting them to enter the assembly-line garage in an orderly fashion.
50Group~D (the ``decode and dispatch'' group) does the initial vehicle
51inspection and
52writes up an order that explains what kind of servicing is required.
53The vehicles go next to one of the four ``execution'' groups:
54Group~X handles routine maintenance, while groups XF, XM, and XD are
55specialists in more complex tasks that tend to take longer. (The XF
56people are good at floating the points, while the XM and XD groups are
57experts in multilink suspensions and differentials.) When the relevant X~group
58has finished its work, cars drive to M~station, where they send or receive
59messages and possibly pay money to members of the ``memory'' group. Finally
60all necessary parts are installed by members of group~W, the ``write''
61group, and the car leaves the shop. Everything is tightly organized so
62that in most cases the cars move in synchronized fashion from station
63to station, at regular 100-nanocentury intervals. % about 5.3 minutes
64
65In a similar way, most \MMIX\ instructions can be handled in a five-stage
66pipeline, F--D--X--M--W, with X replaced by XF for floating-point
67addition or conversion, or by XM for multiplication, or by XD for
68division or square root. Each stage ideally takes one clock cycle,
69although XF, XM, and (especially) XD are slower. If the instructions enter
70in a suitable pattern, we might see one instruction being fetched,
71another being decoded, and up to four being executed, while another is accessing
72memory, and yet another is finishing up by writing new information into
73registers; all this is going on simultaneously during one clock cycle. Pipelining
74with eight separate stages might therefore make the machine run
75up to 8 times as fast as it could if each instruction were being dealt with
76individually and without overlap. (Well, perfect speedup turns out to
77be impossible, because of the shared M and~W stages; the theory of
78knapsack programming, to be discussed in Section~7.7 of {\sl The Art
79of Computer Programming}, tells us that the maximal achievable speedup is
80at most $8-1/p-1/q-1/r$ when XF, XM, and~XD have delays bounded by $p$,
81$q$, and~$r$ cycles. But we can achieve a factor of more than~7
82if we are very lucky.)
83
84Consider, for example, the \.{ADD} instruction. This instruction enters
85the computer's processing unit in F stage, taking only one clock cycle if
86it is in the cache of instructions recently seen. Then the D~stage
87recognizes the command as an \.{ADD} and acquires the current values
88of \$Y and \$Z; meanwhile, of course, another instruction is being fetched
89by~F.
90On the next clock cycle, the X stage adds the values together.
91This prepares the way for the M stage to watch for overflow and to
92get ready for any exceptional action that might be needed with respect
93to the settings of special register~rA\null.
94Finally, on the fifth clock cycle, the sum is either written into~\$X
95or the trip handler for integer overflow is invoked.
96Although this process has taken five clock
97cycles (that is, $5\upsilon$),
98the net increase in running time has been only~$1\upsilon$.
99
100Of course congestion can occur, inside a computer as in a repair shop.
101For example, auto parts might not be readily available; or a car might
102have to sit in D station while waiting to move to XM, thereby blocking
103somebody else from moving from F to~D.  Sometimes there won't
104necessarily be a steady stream of customers.  In such cases the
105employees in some parts of the shop will occasionally be idle.  But we
106assume that they always do their jobs as fast as possible, given the
107sequence of customers that they encounter. With a clever person
108setting up appointments---translation: with a clever
109programmer and/or compiler arranging \MMIX\ instructions---the
110organization can often be expected to run at nearly peak capacity.
111
112In fact, this program is designed for experiments with many kinds of
113pipelines, potentially using additional functional units (such as
114several independent X~groups), and potentially fetching, dispatching, and
115executing several nonconflicting instructions simultaneously.
116Such complications
117make this program more difficult than a simple pipeline simulator
118would be, but they also make it a lot more instructive because we
119can get a better understanding of the issues involved if we are
120required to treat them in greater generality.
121
122@ Here's the overall structure of the present program module.
123
124@c
125#include <stdio.h>
126#include <stdlib.h>
127#include <math.h>
128#include "abstime.h"
129@h@#
130@<Header definitions@>@;
131@<Type definitions@>@;
132@<Global variables@>@;
133@<External variables@>@;
134@<Internal prototypes@>@;
135@<External prototypes@>@;
136@<Subroutines@>@;
137@<External routines@>@;
138
139@ The identifier \&{Extern} is used in {\mc MMIX-PIPE} to
140declare variables that are accessed in other modules. Actually
141all appearances of `\&{Extern}' are defined to be blank here, but
142`\&{Extern}' will become `\&{extern}' in the header file.
143
144@d Extern  /* blank for us, \&{extern} for them */
145@f Extern extern
146
147@<External variables@>=
148Extern int verbose; /* controls the level of diagnostic output */
149
150@ The header file repeats the basic definitions and declarations.
151
152@(mmix-pipe.h@>=
153#define Extern extern
154@<Header definitions@>@;
155@<Type definitions@>@;
156@<External variables@>@;
157@<External prototypes@>@;
158
159@ Subroutines of this program are declared first with a prototype,
160as in {\mc ANSI C}, then with an old-style \CEE/ function definition.
161The following preprocessor commands make this work correctly with both
162new-style and old-style compilers.
163@^prototypes for functions@>
164
165@<Header def...@>=
#ifdef __STDC__
#define ARGS(list) list
   /* new-style compilers see the parameter list in each prototype */
#else
#define ARGS(list) ()
   /* old-style compilers see only an empty pair of parentheses */
#endif
171
172@ Some of the names that are natural for this program are in
173conflict with library names on at least
174one of the host computers in the author's tests. So we
175bypass the library names here.
176
177@<Header def...@>=
178#define random my_random
179#define fsqrt my_fsqrt
180#define div my_div
181
182@ The amount of verbosity depends on the following bit codes.
183
184@<Header def...@>=
185#define issue_bit (1<<0)
186   /* show control blocks when issued, deissued, committed */
187#define pipe_bit (1<<1)
188   /* show the pipeline and locks on every cycle */
189#define coroutine_bit (1<<2)
190   /* show the coroutines when started on every cycle */
191#define schedule_bit (1<<3)
192   /* show the coroutines when scheduled */
193#define uninit_mem_bit (1<<4)
194   /* complain when reading from an uninitialized chunk of memory */
195#define interactive_read_bit (1<<5)
196   /* prompt user when reading from I/O location */
197#define show_spec_bit (1<<6)
198   /* display special read/write transactions as they happen */
199#define show_pred_bit (1<<7)
200   /* display branch prediction details */
201#define show_wholecache_bit (1<<8)
202   /* display cache blocks even when their key tag is invalid */
203
204@ The |MMIX_init()| routine should be called exactly once, after
205|MMIX_config()| has done its work but before the simulator starts to execute
206any programs. Then |MMIX_run()| can be called as often as the user likes.
207
The |MMIX_silent()| routine is a noninteractive variant of |MMIX_run()|:
It keeps running until a \.{TRAP} \.{0,Halt,0} instruction halts the
simulated program; then it returns the low tetrabyte of global
register~255, namely |specval(&g[255]).o.l|.
211
212@s octa int
213
214@<External proto...@>=
215Extern void MMIX_init @,@,@[ARGS((void))@];
216Extern void MMIX_run @,@,@[ARGS((int cycs, octa breakpoint))@];
217Extern int MMIX_silent @,@,@[ARGS((void))@];
218
219@ @<External routines@>=
void MMIX_init() /* call exactly once, after |MMIX_config| and before |MMIX_run| */
{
  register int i,j; /* scratch variables for the initialization code */
  @<Initialize everything@>;
}
225@#
int MMIX_silent() /* simulate cycles until |halted| becomes true */
{
  octa breakpoint=neg_one;
    /* no breakpoint is wanted, but the value must not be indeterminate:
       the cycle code shared with |MMIX_run| presumably examines it.
       A location of all ones is not tetra-aligned */
  @<Local variables@>;
  while (true) {
    @<Perform one machine cycle@>;
    if (halted) return specval(&g[255]).o.l;
      /* the low tetrabyte of register 255 is our result */
  }
}
235@#
void MMIX_run(cycs,breakpoint)
  int cycs; /* the number of cycles to simulate; counted down to zero */
  octa breakpoint; /* presumably the location whose fetch sets |breakpoint_hit| */
{
  @<Local variables@>;
  while (cycs) {
    if (verbose&(issue_bit|pipe_bit|coroutine_bit|schedule_bit))
      printf("*** Cycle %d\n", ticks.l); /* heading for this cycle's diagnostics */
    @<Perform one machine cycle@>;
    if (verbose&pipe_bit) {
      print_pipe();@+ print_locks();
    }
    if (breakpoint_hit||halted) { /* stop early, saying why */
      if (breakpoint_hit)
        printf("Breakpoint instruction fetched at time %d\n",ticks.l-1);
      if (halted) printf("Halted at time %d\n", ticks.l-1);
      break;
    }
    cycs--; /* one more cycle has been used up */
  }
 cease:; /* no |goto| to this label is visible in the routine itself */
}
258
259@ @<Type...@>=
260typedef enum {@!false, @!true, @!wow}@+bool; /* slightly extended booleans */
261
262@ @<Local var...@>=
263register int i,j,m;
264bool breakpoint_hit=false;
265bool halted=false;
266
267@ Error messages that abort this program are called panic messages.
268The macro called |confusion| will never be needed unless this program is
269internally inconsistent.
270
@d errprint0(f) fprintf(stderr,f) /* print a fixed message on |stderr| */
@d errprint1(f,a) fprintf(stderr,f,a) /* ditto, with one argument */
@d errprint2(f,a,b) fprintf(stderr,f,a,b) /* ditto, with two arguments */
@d panic(x)@+ {@+errprint0("Panic: ");@+x;@+errprint0("!\n");@+expire();@+}
  /* |x| prints the details; then |expire| terminates the program */
@d confusion(m) errprint1("This can't happen: %s",m)
  /* prints the details of an internal inconsistency; used inside |panic| */
276@.This can't happen@>
277
278@<Internal proto...@>=
279static void expire @,@,@[ARGS((void))@];
280
281@ @<Sub...@>=
static void expire() /* report the clock time on |stderr|, then give up */
{
  if (!ticks.h) errprint1("(Clock time is %d.)\n",ticks.l);
  else errprint2("(Clock time is %dH+%d.)\n",ticks.h,ticks.l);
    /* the high half is shown only when it is nonzero */
@.Clock time is...@>
  exit(-2);
}
289
290@ The data structures of this program are not precisely equivalent to
291logical gates that could be implemented directly in silicon;
292we will use data structures and
293algorithms appropriate to the \CEE/ programming language. For example,
294we'll use pointers and arrays, instead of buses and ports and latches. However,
295the net effect of our data structures and algorithms is intended to
296be equivalent to the net effect of a silicon implementation. The methods
297used below are essentially equivalent to those used in real machines today,
298except that diagnostic facilities are added so that we can readily
299watch what is happening.
300
301Each functional unit in the \MMIX\ pipeline is programmed here as a coroutine
302in~\CEE/. At every clock cycle, we will call on each active coroutine to do one
303phase of its operation; in terms of the repair-station analogy
304described in the main program,
305this corresponds to getting each group of
306auto mechanics to do one unit of operation on a car.
307The coroutines are performed sequentially, although
308a real pipeline would have them act in parallel.
309We will not ``cheat'' by letting one coroutine access a value early in its
310cycle that another one computes late in its cycle, unless computer hardware
311could ``cheat'' in an equivalent way.
312
313@* Low-level routines. Where should we begin? It is tempting to start with a
314global view of the simulator and then to break it down into component parts.
315But that task is too daunting, because there are so many unknowns about what
316basic ingredients ought to be combined when we construct the larger
317components. So let us look first at the primitive operations on which
318the superstructure will be built. Once we have created some infrastructure,
319we'll be able to proceed with confidence to the larger tasks ahead.
320
321@ This program for the 64-bit \MMIX\ architecture is based on 32-bit integer
322arithmetic, because nearly every computer available to the author at the time
323of writing (1998--1999) was limited in that way.
324Details of the basic arithmetic appear in a separate program module
325called {\mc MMIX-ARITH}, because the same routines are needed also
326for the assembler and for the non-pipelined simulator. The
327definition of type \&{tetra} should be changed, if necessary, to conform with
328the definitions found there.
329@^system dependencies@>
330
331@<Type...@>=
332typedef unsigned int tetra;
333  /* for systems conforming to the LP-64 data model */
334typedef struct { tetra h,l;} octa; /* two tetrabytes make one octabyte */
335
336@ @<Internal proto...@>=
337static void print_octa @,@,@[ARGS((octa))@];
338
339@ @<Sub...@>=
static void print_octa(o) /* print an octabyte in hexadecimal */
  octa o;
{
  if (!o.h) { /* the low half alone suffices, without leading zeros */
    printf("%x",o.l);
    return;
  }
  printf("%x%08x",o.h,o.l); /* the low half is padded to eight digits */
}
346
347@ @<Glob...@>=
348extern octa zero_octa; /* |zero_octa.h=zero_octa.l=0| */
349extern octa neg_one; /* |neg_one.h=neg_one.l=-1| */
350extern octa aux; /* auxiliary output of a subroutine */
351extern bool overflow; /* set by certain subroutines for signed arithmetic */
352extern int exceptions; /* bits set by floating point operations */
353extern int cur_round; /* the current rounding mode */
354
355@ Most of the subroutines in {\mc MMIX-ARITH} return an octabyte as
356a function of two octabytes; for example, |oplus(y,z)| returns the
357sum of octabytes |y| and~|z|. Multiplication returns the high
358half of a product in the global variable~|aux|; division returns
359the remainder in~|aux|.
360
361@<Sub...@>=
362extern octa oplus @,@,@[ARGS((octa y,octa z))@];
363  /* unsigned $y+z$ */
364extern octa ominus @,@,@[ARGS((octa y,octa z))@];
365  /* unsigned $y-z$ */
366extern octa incr @,@,@[ARGS((octa y,int delta))@];
367  /* unsigned $y+\delta$ ($\delta$ is signed) */
368extern octa oand @,@,@[ARGS((octa y,octa z))@];
369  /* $y\land z$ */
370extern octa oandn @,@,@[ARGS((octa y,octa z))@];
371  /* $y\land \bar z$ */
372extern octa shift_left @,@,@[ARGS((octa y,int s))@];
373  /* $y\LL s$, $0\le s\le64$ */
374extern octa shift_right @,@,@[ARGS((octa y,int s,int u))@];
375  /* $y\GG s$, signed if |!u| */
376extern octa omult @,@,@[ARGS((octa y,octa z))@];
377  /* unsigned $(|aux|,x)=y\times z$ */
378extern octa signed_omult @,@,@[ARGS((octa y,octa z))@];
379  /* signed $x=y\times z$, setting |overflow| */
380extern octa odiv @,@,@[ARGS((octa x,octa y,octa z))@];
381  /* unsigned $(x,y)/z$; $|aux|=(x,y)\bmod z$ */
382extern octa signed_odiv @,@,@[ARGS((octa y,octa z))@];
383  /* signed $y/z$, when $z\ne0$; $|aux|=y\bmod z$ */
384extern int count_bits @,@,@[ARGS((tetra z))@];
385  /* $x=\nu(z)$ */
386extern tetra byte_diff @,@,@[ARGS((tetra y,tetra z))@];
387  /* half of \.{BDIF} */
388extern tetra wyde_diff @,@,@[ARGS((tetra y,tetra z))@];
389  /* half of \.{WDIF} */
390extern octa bool_mult @,@,@[ARGS((octa y,octa z,bool xor))@];
391  /* \.{MOR} or \.{MXOR} */
392extern octa load_sf @,@,@[ARGS((tetra z))@];
393  /* load short float */
394extern tetra store_sf @,@,@[ARGS((octa x))@];
395  /* store short float */
396extern octa fplus @,@,@[ARGS((octa y,octa z))@];
397  /* floating point $x=y\oplus z$ */
398extern octa fmult @,@,@[ARGS((octa y ,octa z))@];
399  /* floating point $x=y\otimes z$ */
400extern octa fdivide @,@,@[ARGS((octa y,octa z))@];
401  /* floating point $x=y\oslash z$ */
402extern octa froot @,@,@[ARGS((octa,int))@];
403  /* floating point $x=\sqrt z$ */
404extern octa fremstep @,@,@[ARGS((octa y,octa z,int delta))@];
405  /* floating point $x\,{\rm rem}\,z=y\,{\rm rem}\,z$ */
406extern octa fintegerize @,@,@[ARGS((octa z,int mode))@];
407  /* floating point $x={\rm round}(z)$ */
408extern int fcomp @,@,@[ARGS((octa y,octa z))@];
409  /* $-1$, 0, 1, or 2 if $y<z$, $y=z$, $y>z$, $y\parallel z$ */
410extern int fepscomp @,@,@[ARGS((octa y,octa z,octa eps,int sim))@];
411  /* $x=|sim|?\ [y\sim z\ (\epsilon)]:\ [y\approx z\ (\epsilon)]$ */
412extern octa floatit @,@,@[ARGS((octa z,int mode,int unsgnd,int shrt))@];
413  /* fix to float */
414extern octa fixit @,@,@[ARGS((octa z,int mode))@];
415  /* float to fix */
416
417@ We had better check that our 32-bit assumption holds.
418
419@<Initialize e...@>=
420if (shift_left(neg_one,1).h!=0xffffffff)
421  panic(errprint0("Incorrect implementation of type tetra"));
422@.Incorrect implementation...@>
423
424@* Coroutines. As stated earlier, this program can be regarded as a system of
425interacting coroutines. Coroutines---sometimes called threads---are more or
426less independent processes that share and pass data and control back and
427forth. They correspond to the individual workers in an organization.
428
429We don't need the full power of recursive coroutines, in which new threads are
430spawned dynamically and have independent stacks for computation; we are, after
431all, simulating a fixed piece of hardware. The total number of coroutines we
432deal with is established once and for all by the |MMIX_config| routine, and
433each coroutine has a fixed amount of local data.
434
435The simulation operates one clock tick at a time, by executing all
436coroutines scheduled for time~$t$ before advancing to time~$t+1$. The
437coroutines at time~$t$ may decide to become dormant or they may reschedule
438themselves and/or other coroutines for future times.
439
440Each coroutine has a symbolic |name| for diagnostic purposes (e.g.,
441\.{ALU1}); a nonnegative |stage| number (e.g., 2~for the second stage
442of a pipeline); a pointer to the next coroutine scheduled at the same time (or
443|NULL| if the coroutine is unscheduled); a pointer to a lock variable
444(or |NULL| if no lock is currently relevant);
445and a reference to a control block containing the data to be processed.
446
447@s control_struct int
448
449@<Type...@>=
typedef struct coroutine_struct {
 char *name; /* symbolic identification of a coroutine, for diagnostics */
 int stage; /* its rank; higher stages have priority in a queue */
 struct coroutine_struct *next;
   /* its successor in a queue, or |NULL| if it is unscheduled */
 struct coroutine_struct **lockloc;
   /* the lock variable it might be locking, or |NULL| if none */
 struct control_struct *ctl; /* its data */
} coroutine;
457
458@ @<Internal proto...@>=
459static void print_coroutine_id @,@,@[ARGS((coroutine*))@];
460static void errprint_coroutine_id @,@,@[ARGS((coroutine*))@];
461
462@ @<Sub...@>=
static void print_coroutine_id(c) /* show name and stage on standard output */
  coroutine *c;
{
  if (!c) { /* there is nothing to identify */
    printf("??");
    return;
  }
  printf("%s:%d",c->name,c->stage);
}
469@#
static void errprint_coroutine_id(c) /* show name and stage on |stderr| */
  coroutine *c;
{
  if (!c) { /* there is nothing to identify */
    errprint0("??");
@.??@>
    return;
  }
  errprint2("%s:%d",c->name,c->stage);
}
477
478@ Coroutine control is masterminded by a ring of queues, one each for
479times $t$, $t+1$, \dots, $t+|ring_size|-1$, when $t$ is the current
480clock time.
481
482All scheduling is first-come-first-served, except that coroutines with higher
483|stage| numbers have priority. We want to process the later stages of a
484pipeline first, in this sequential implementation, for the same reason that a
485car must drive from M~station into W~station before another car can enter
486M~station.
487
488Each queue is a circular list of \&{coroutine} nodes, linked together by their
489|next| fields. A list head~$h$ with |stage=max_stage| comes at the end and the
490beginning of the queue. (All |stage| numbers of legitimate coroutines
491are less than~|max_stage|.) The queued items are |h->next|, |h->next->next|,
492etc., from back to front, and we have |c->stage<=c->next->stage| unless |c=h|.
493
494Initially all queues are empty.
495
496@<Initialize e...@>=
{@+register int k;
  for (k=0;k<ring_size;k++) ring[k].next=&ring[k];
    /* a list head that points to itself represents an empty queue */
}
500
501@ To schedule a coroutine |c| with positive delay |d<ring_size|, we call
502|schedule(c,d,s)|. (The |s| parameter is used only if scheduling is
503being logged; it does not affect the computation, but we will
504generally set |s| to the state at which the scheduled coroutine will begin.)
505
506@<Internal proto...@>=
507static void schedule @,@,@[ARGS((coroutine*,int,int))@];
508
509@ @<Sub...@>=
static void schedule(c,d,s)
  coroutine *c; /* the coroutine to be queued */
  int d,s; /* the delay, and the state to mention when logging */
{
  register int tt; /* the ring slot for time |cur_time+d| */
  register coroutine *p; /* runs through the queue */
  if (d<=0 || d>=ring_size) /* do a sanity check before touching |ring| */
   panic(confusion("Scheduling ");errprint_coroutine_id(c);
         errprint1(" with delay %d",d));
  tt=(cur_time+d)%ring_size; /* safe now, since |0<d<ring_size| */
  p=&ring[tt]; /* start at the list head */
  while (p->next->stage<c->stage) p=p->next;
    /* pass over the entries of lower stage */
  c->next = p->next;
  p->next = c;
  if (verbose&schedule_bit) {
    printf(" scheduling ");@+print_coroutine_id(c);
    printf(" at time %d, state %d\n",ticks.l+d,s);
  }
}
527
528@ @<External var...@>=
529Extern int ring_size; /* set by |MMIX_config|, must be sufficiently large */
530Extern coroutine *ring;
531Extern int cur_time;
532
533@ The all-important |ctl| field of a coroutine, which contains the
534data being manipulated, will be explained below. One of its key
535components is the |state| field, which helps to specify the next
536actions the coroutine will perform. When we schedule a coroutine for
537a new task, we often want it to begin in state~0.
538
539@<Internal proto...@>=
540static void startup @,@,@[ARGS((coroutine*,int))@];
541
542@ @<Sub...@>=
static void startup(c,d) /* schedule |c| to begin a new task after delay |d| */
  coroutine *c;
  int d;
{
  register control *work=c->ctl; /* the data that |c| will process */
  work->state=0; /* a new task begins in state 0 */
  schedule(c,d,0);
}
550
551@ The following routine removes a coroutine from whatever queue it's in.
552The case |c->next=c| is also permitted; such a self-loop can occur when a
553coroutine goes to sleep and expects to be awakened (that is, scheduled)
554by another coroutine. Sleeping coroutines have important data in their
555|ctl| field; they are therefore quite different from unscheduled
556or ``unemployed'' coroutines, which have |c->next=NULL|. An unemployed
557coroutine is not assumed to have any valid data in its |ctl| field.
558
559@<Internal proto...@>=
560static void unschedule @,@,@[ARGS((coroutine*))@];
561
562@ @<Sub...@>=
static void unschedule(c) /* remove |c| from whatever queue it is in */
  coroutine *c;
{@+register coroutine *pred;
  if (!c->next) return; /* an unscheduled coroutine is in no queue */
  pred=c;
  while (pred->next!=c) pred=pred->next;
    /* go around the circle to the predecessor; a self-loop yields |c| itself */
  pred->next = c->next;
  c->next=NULL;
  if (verbose&schedule_bit) {
    printf(" unscheduling ");@+print_coroutine_id(c);@+printf("\n");
  }
}
575
576@ When it is time to process all coroutines that have queued up for a
577particular time~|t|, we empty the queue called |ring[t]| and link its items in
578the opposite order (from front to back). The following subroutine uses the
579well known algorithm discussed in exercise 2.2.3--7 of {\sl The Art
580of Computer Programming}.
581
582@<Internal proto...@>=
583static coroutine *queuelist @,@,@[ARGS((int))@];
584
585@ @<Sub...@>=
static coroutine* queuelist(t) /* empty |ring[t]|, returning its items reversed */
  int t;
{@+register coroutine *cur=ring[t].next, *rev=&sentinel, *succ;
  while (cur!=&ring[t]) { /* move |cur| to the front of the reversed list */
    succ=cur->next;
    cur->next=rev;
    rev=cur;
    cur=succ;
  }
  ring[t].next=&ring[t]; /* the queue is now empty */
  sentinel.next=rev; /* the reversed list ends at |sentinel| */
  return rev;
}
598
599@ @<Glob...@>=
600coroutine sentinel; /* dummy coroutine at origin of circular list */
601
602@ Coroutines often start working on tasks that are {\it speculative}, in the
603sense that we want certain results to be ready if they prove to be
604useful; we understand that speculative computations might not actually
605be needed. Therefore a coroutine might need to be aborted before it
606has finished its work.
607
608All coroutines must be written in such a way that important data structures
609remain intact even when the coroutine is abruptly terminated. In particular,
610we need to be sure that ``locks'' on shared resources are restored to
611an unlocked state when a coroutine holding the lock is aborted.
612
613A \&{lockvar} variable is |NULL| when it is unlocked; otherwise it
614points to the coroutine responsible for unlocking~it.
615
@d set_lock(c,l) {@+(l)=(c);@+(c)->lockloc=&(l);@+}
  /* lock |l| on behalf of coroutine |c|; note that |c| is evaluated twice */
@d release_lock(c,l) {@+(l)=NULL;@+ (c)->lockloc=NULL;@+}
  /* unlock |l| and make |c| forget about it */
618
619@<Type...@>=
620typedef coroutine *lockvar;
621
622@ @<External proto...@>=
623Extern void print_locks @,@,@[ARGS((void))@];
624
625@ @<External r...@>=
void print_locks() /* show the cache locks, then each other lock that is set */
{
  register coroutine *holder; /* the coroutine named by a lock, if any */
  print_cache_locks(ITcache);
  print_cache_locks(DTcache);
  print_cache_locks(Icache);
  print_cache_locks(Dcache);
  print_cache_locks(Scache);
  holder=mem_lock;
  if (holder) printf("mem locked by %s:%d\n",holder->name,holder->stage);
  holder=dispatch_lock;
  if (holder) printf("dispatch locked by %s:%d\n",
                    holder->name,holder->stage);
  holder=wbuf_lock;
  if (holder) printf("head of write buffer locked by %s:%d\n",
                    holder->name,holder->stage);
  holder=clean_lock;
  if (holder) printf("cleaner locked by %s:%d\n",
                    holder->name,holder->stage);
  holder=speed_lock;
  if (holder) printf("write buffer flush locked by %s:%d\n",
                    holder->name,holder->stage);
}
643
644@ Many of the quantities we deal with are speculative values
645that might not yet have been certified as part of the ``real''
646calculation; in fact, they might not yet have been calculated.
647
648A \&{spec} consists of a 64-bit quantity |o| and a pointer~|p| to
649a \&{specnode}. The value~|o| is meaningful only if the
650pointer~|p| is~|NULL|; otherwise |p| points to a source of further information.
651
652A \&{specnode} is a 64-bit quantity |o| together with links to other
653\&{specnode}s
654that are above it or below it in a doubly linked list. An additional
655|known| bit tells whether the |o|~field has been calculated. There also is
656a 64-bit |addr| field, to identify the list and give further information.
657A \&{specnode} list keeps track of speculative values related to a specific
658register or to all of main memory; we will discuss such lists in detail~later.
659
660@s specnode_struct int
661
662@<Type...@>=
typedef struct {
  octa o; /* the value, meaningful only if |p==NULL| */
  struct specnode_struct *p; /* source of further information, or |NULL| */
} spec;
@#
typedef struct specnode_struct {
  octa o; /* the speculative value */
  bool known; /* has |o| been calculated? */
  octa addr; /* identifies the list and gives further information */
  struct specnode_struct *up,*down;
    /* the nodes above and below this one in a doubly linked list */
} specnode;
674
675@ @<Glob...@>=
676spec zero_spec; /* |zero_spec.o.h=zero_spec.o.l=0| and |zero_spec.p=NULL| */
677
678@ @<Internal proto...@>=
679static void print_spec @,@,@[ARGS((spec))@];
680
681@ @<Sub...@>=
static void print_spec(s) /* show a value, or a reference to where it will come from */
  spec s;
{
  if (s.p) { /* the value is still speculative, so we identify its source */
    printf(">");@+ print_specnode_id(s.p->addr);
  }
  else print_octa(s.o);
}
690@#
static void print_specnode(s) /* show a value, its status, and its identity */
  specnode s;
{
  if (s.known || s.o.h || s.o.l) { /* there is a value worth showing */
    print_octa(s.o);
    if (s.known) printf("!");
    else printf("?");
  }
  else printf("?"); /* an unknown zero is shown by the question mark alone */
  print_specnode_id(s.addr);
}
699
700@ The analog of an automobile in our simulator is a block of data called
701\&{control}, which represents all the relevant facts about an \MMIX\
702instruction.  We can think of it as the work order attached to a car's
703windshield. Each group of employees updates the work order as the car moves
704through the shop.
705
706A \&{control} record contains the original location of an instruction,
707and its four bytes OP~X~Y~Z. An instruction has up to four inputs, which are
708\&{spec} records called |y|, |z|, |b| and~|ra|; it also has up to three
709outputs, which are \&{specnode} records called |x|, |a|, and~|rl|.
710(We usually don't mention the special input~|ra| or the special output~|rl|,
711which refer to \.{MMIX}'s internal registers rA and~rL.) For example, the
712main inputs to a \.{DIVU} command are \$Y, \$Z, and~rD; the outputs are the
713quotient~\$X and the remainder~rR. The inputs to a
714\.{STO} command are \$Y, \$Z, and~\$X; there is one ``output,'' and
715the field~|x.addr| will be set to the physical address of the memory location
716corresponding to virtual address $\rm \$Y+\$Z$.
717
718Each \&{control} block also points to the coroutine that owns it, if any.
719And it has various other fields that contain other tidbits of information;
720for example, we have already mentioned
721the |state|~field, which often governs a coroutine's actions. The |i|~field,
722which contains an internal operation code number, is generally used together
723with |state| to switch between alternative computational steps. If, for
724example, the |op|~field is \.{SUB} or \.{SUBI} or \.{NEG} or \.{NEGI},
725the internal opcode~|i| will be simply~|sub|.
726We shall define all the fields of \&{control} records
727now and discuss them later.
728
729An actual hardware implementation of \MMIX\ wouldn't need all the information
730we are putting into a \&{control} block. Some of that information would
731typically be latched between stages of a pipeline; other portions would
732probably appear in so-called ``rename registers.''
733@^rename registers@>
734We simulate rename registers only indirectly,
735by counting how many registers of that
736kind would be in use if we were mimicking low-level hardware details more
737precisely. The |go| field is a \&{specnode} for convenience in programming,
738although we use only its |known| and |o| subfields. It generally contains
739the address of the subsequent instruction.
740
741@s mmix_opcode int
742@s internal_opcode int
743
744@<Type...@>=
745@<Declare \&{mmix\_opcode} and \&{internal\_opcode}@>@;
typedef struct control_struct {
 octa loc; /* virtual address where an instruction originated */
 mmix_opcode op;@+ unsigned char xx,yy,zz; /* the original instruction bytes */
 spec y,z,b,ra; /* inputs */
 specnode x,a,go,rl;
   /* outputs; only the |known| and |o| subfields of |go| are used */
 coroutine *owner; /* a coroutine whose |ctl| this is, if any */
 internal_opcode i; /* internal opcode, used together with |state| */
 int state; /* internal mindset; |startup| sets it to zero */
 bool usage; /* should rU be increased? */
 bool need_b; /* should we stall until |b.p==NULL|? */
 bool need_ra; /* should we stall until |ra.p==NULL|? */
 bool ren_x; /* does |x| correspond to a rename register? */
 bool mem_x; /* does |x| correspond to a memory write? */
 bool ren_a; /* does |a| correspond to a rename register? */
 bool set_l; /* does |rl| correspond to a new value of rL? */
 bool interim; /* does this instruction need to be reissued on interrupt? */
 bool stack_alert; /* is there potential for stack overflow? */
 unsigned int arith_exc; /* arithmetic exceptions for event bits of rA */
 unsigned int hist; /* history bits for use in branch prediction */
 int denin,denout; /* execution time penalties for subnormal handling */
 octa cur_O,cur_S; /* speculative rO and rS before this instruction */
 unsigned int interrupt; /* does this instruction generate an interrupt? */
 void *ptr_a, *ptr_b, *ptr_c; /* generic pointers for miscellaneous use */
} control;
770
771@ @<Internal proto...@>=
772static void print_control_block @,@,@[ARGS((control*))@];
773
774@ @<Sub...@>=
/* Print a one-line diagnostic summary of control block |c|; fields that
are zero or inactive are mostly left out. No newline is printed. */
static void print_control_block(c)
  control *c;
{
  octa seq; /* the location four bytes after |c->loc| */
  /* location and instruction bytes, unless all of these fields are zero */
  if (c->owner || c->loc.h || c->loc.l || c->op || c->xx || c->yy || c->zz) {
    print_octa(c->loc);
    printf(": %02x%02x%02x%02x(%s)",c->op,c->xx,c->yy,c->zz,
              internal_op_name[c->i]);
  }
  if (c->usage) printf("*");
  if (c->interim) printf("+");
  /* the inputs */
  if (c->y.p || c->y.o.h || c->y.o.l) {
    printf(" y=");
    print_spec(c->y);
  }
  if (c->z.p || c->z.o.h || c->z.o.l) {
    printf(" z=");
    print_spec(c->z);
  }
  if (c->need_b || c->b.p || c->b.o.h || c->b.o.l) {
    printf(" b=");
    print_spec(c->b);
    if (c->need_b) printf("*");
  }
  if (c->need_ra) {
    printf(" rA=");
    print_spec(c->ra);
  }
  /* the outputs; |x| is shown in full only when |ren_x| or |mem_x| is set */
  if (c->ren_x || c->mem_x || c->x.o.h || c->x.o.l) {
    printf(" x=");
    if (c->ren_x || c->mem_x) print_specnode(c->x);
    else {
      print_octa(c->x.o);
      if (c->x.known) printf("!");
      else printf("?");
    }
  }
  if (c->ren_a) {
    printf(" a=");
    print_specnode(c->a);
  }
  if (c->set_l) {
    printf(" rL=");
    print_specnode(c->rl);
  }
  if (c->interrupt) {
    printf(" int=");
    print_bits(c->interrupt);
  }
  if (c->arith_exc) {
    printf(" exc=");
    print_bits(c->arith_exc<<8); /* shifted left by 8 for |print_bits| */
  }
  /* |go| is mentioned only when it differs from |loc+4| */
  seq=incr(c->loc,4);
  if (!(c->go.o.h==seq.h && c->go.o.l==seq.l)) {
    printf(" ->");
    print_octa(c->go.o);
  }
  if (verbose&show_pred_bit) printf(" hist=%x",c->hist);
  if (c->i==pop) {
    printf(" rS=");
    print_octa(c->cur_S);
    printf(" rO=");
    print_octa(c->cur_O);
  }
  printf(" state=%d",c->state);
}
812
813@* Lists. Here is a (boring) list of all the \MMIX\ opcodes, in order.
814
815@<Declare \&{mmix\_opcode} and \&{internal\_opcode}@>=
/* Eight opcodes per row; enumerators count up from zero, so each row's
numeric range appears at its right. The prefixes \.{II}, \.{IV}, \.{VIII},
\.{XVI} replace the leading digits of \.{2ADDU}, \.{4ADDU}, \.{8ADDU},
\.{16ADDU}; compare |opcode_name|. */
typedef enum{@/
@!TRAP,@!FCMP,@!FUN,@!FEQL,@!FADD,@!FIX,@!FSUB,@!FIXU, /* \Hex{00}--\Hex{07} */
@!FLOT,@!FLOTI,@!FLOTU,@!FLOTUI,@!SFLOT,@!SFLOTI,@!SFLOTU,@!SFLOTUI, /* \Hex{08}--\Hex{0f} */
@!FMUL,@!FCMPE,@!FUNE,@!FEQLE,@!FDIV,@!FSQRT,@!FREM,@!FINT, /* \Hex{10}--\Hex{17} */
@!MUL,@!MULI,@!MULU,@!MULUI,@!DIV,@!DIVI,@!DIVU,@!DIVUI, /* \Hex{18}--\Hex{1f} */
@!ADD,@!ADDI,@!ADDU,@!ADDUI,@!SUB,@!SUBI,@!SUBU,@!SUBUI, /* \Hex{20}--\Hex{27} */
@!IIADDU,@!IIADDUI,@!IVADDU,@!IVADDUI,@!VIIIADDU,@!VIIIADDUI,@!XVIADDU,@!XVIADDUI, /* \Hex{28}--\Hex{2f} */
@!CMP,@!CMPI,@!CMPU,@!CMPUI,@!NEG,@!NEGI,@!NEGU,@!NEGUI, /* \Hex{30}--\Hex{37} */
@!SL,@!SLI,@!SLU,@!SLUI,@!SR,@!SRI,@!SRU,@!SRUI, /* \Hex{38}--\Hex{3f} */
@!BN,@!BNB,@!BZ,@!BZB,@!BP,@!BPB,@!BOD,@!BODB, /* \Hex{40}--\Hex{47} */
@!BNN,@!BNNB,@!BNZ,@!BNZB,@!BNP,@!BNPB,@!BEV,@!BEVB, /* \Hex{48}--\Hex{4f} */
@!PBN,@!PBNB,@!PBZ,@!PBZB,@!PBP,@!PBPB,@!PBOD,@!PBODB, /* \Hex{50}--\Hex{57} */
@!PBNN,@!PBNNB,@!PBNZ,@!PBNZB,@!PBNP,@!PBNPB,@!PBEV,@!PBEVB, /* \Hex{58}--\Hex{5f} */
@!CSN,@!CSNI,@!CSZ,@!CSZI,@!CSP,@!CSPI,@!CSOD,@!CSODI, /* \Hex{60}--\Hex{67} */
@!CSNN,@!CSNNI,@!CSNZ,@!CSNZI,@!CSNP,@!CSNPI,@!CSEV,@!CSEVI, /* \Hex{68}--\Hex{6f} */
@!ZSN,@!ZSNI,@!ZSZ,@!ZSZI,@!ZSP,@!ZSPI,@!ZSOD,@!ZSODI, /* \Hex{70}--\Hex{77} */
@!ZSNN,@!ZSNNI,@!ZSNZ,@!ZSNZI,@!ZSNP,@!ZSNPI,@!ZSEV,@!ZSEVI, /* \Hex{78}--\Hex{7f} */
@!LDB,@!LDBI,@!LDBU,@!LDBUI,@!LDW,@!LDWI,@!LDWU,@!LDWUI, /* \Hex{80}--\Hex{87} */
@!LDT,@!LDTI,@!LDTU,@!LDTUI,@!LDO,@!LDOI,@!LDOU,@!LDOUI, /* \Hex{88}--\Hex{8f} */
@!LDSF,@!LDSFI,@!LDHT,@!LDHTI,@!CSWAP,@!CSWAPI,@!LDUNC,@!LDUNCI, /* \Hex{90}--\Hex{97} */
@!LDVTS,@!LDVTSI,@!PRELD,@!PRELDI,@!PREGO,@!PREGOI,@!GO,@!GOI, /* \Hex{98}--\Hex{9f} */
@!STB,@!STBI,@!STBU,@!STBUI,@!STW,@!STWI,@!STWU,@!STWUI, /* \Hex{a0}--\Hex{a7} */
@!STT,@!STTI,@!STTU,@!STTUI,@!STO,@!STOI,@!STOU,@!STOUI, /* \Hex{a8}--\Hex{af} */
@!STSF,@!STSFI,@!STHT,@!STHTI,@!STCO,@!STCOI,@!STUNC,@!STUNCI, /* \Hex{b0}--\Hex{b7} */
@!SYNCD,@!SYNCDI,@!PREST,@!PRESTI,@!SYNCID,@!SYNCIDI,@!PUSHGO,@!PUSHGOI, /* \Hex{b8}--\Hex{bf} */
@!OR,@!ORI,@!ORN,@!ORNI,@!NOR,@!NORI,@!XOR,@!XORI, /* \Hex{c0}--\Hex{c7} */
@!AND,@!ANDI,@!ANDN,@!ANDNI,@!NAND,@!NANDI,@!NXOR,@!NXORI, /* \Hex{c8}--\Hex{cf} */
@!BDIF,@!BDIFI,@!WDIF,@!WDIFI,@!TDIF,@!TDIFI,@!ODIF,@!ODIFI, /* \Hex{d0}--\Hex{d7} */
@!MUX,@!MUXI,@!SADD,@!SADDI,@!MOR,@!MORI,@!MXOR,@!MXORI, /* \Hex{d8}--\Hex{df} */
@!SETH,@!SETMH,@!SETML,@!SETL,@!INCH,@!INCMH,@!INCML,@!INCL, /* \Hex{e0}--\Hex{e7} */
@!ORH,@!ORMH,@!ORML,@!ORL,@!ANDNH,@!ANDNMH,@!ANDNML,@!ANDNL, /* \Hex{e8}--\Hex{ef} */
@!JMP,@!JMPB,@!PUSHJ,@!PUSHJB,@!GETA,@!GETAB,@!PUT,@!PUTI, /* \Hex{f0}--\Hex{f7} */
@!POP,@!RESUME,@!SAVE,@!UNSAVE,@!SYNC,@!SWYM,@!GET,@!TRIP /* \Hex{f8}--\Hex{ff} */
}@+@!mmix_opcode;
849
850@ @<Glob...@>=
/* Printable mnemonics in the same order as the |mmix_opcode| list, eight
per row; the range of array indices in each row appears at its right. */
char *opcode_name[]={
"TRAP","FCMP","FUN","FEQL","FADD","FIX","FSUB","FIXU", /* \Hex{00}--\Hex{07} */
"FLOT","FLOTI","FLOTU","FLOTUI","SFLOT","SFLOTI","SFLOTU","SFLOTUI", /* \Hex{08}--\Hex{0f} */
"FMUL","FCMPE","FUNE","FEQLE","FDIV","FSQRT","FREM","FINT", /* \Hex{10}--\Hex{17} */
"MUL","MULI","MULU","MULUI","DIV","DIVI","DIVU","DIVUI", /* \Hex{18}--\Hex{1f} */
"ADD","ADDI","ADDU","ADDUI","SUB","SUBI","SUBU","SUBUI", /* \Hex{20}--\Hex{27} */
"2ADDU","2ADDUI","4ADDU","4ADDUI","8ADDU","8ADDUI","16ADDU","16ADDUI", /* \Hex{28}--\Hex{2f} */
"CMP","CMPI","CMPU","CMPUI","NEG","NEGI","NEGU","NEGUI", /* \Hex{30}--\Hex{37} */
"SL","SLI","SLU","SLUI","SR","SRI","SRU","SRUI", /* \Hex{38}--\Hex{3f} */
"BN","BNB","BZ","BZB","BP","BPB","BOD","BODB", /* \Hex{40}--\Hex{47} */
"BNN","BNNB","BNZ","BNZB","BNP","BNPB","BEV","BEVB", /* \Hex{48}--\Hex{4f} */
"PBN","PBNB","PBZ","PBZB","PBP","PBPB","PBOD","PBODB", /* \Hex{50}--\Hex{57} */
"PBNN","PBNNB","PBNZ","PBNZB","PBNP","PBNPB","PBEV","PBEVB", /* \Hex{58}--\Hex{5f} */
"CSN","CSNI","CSZ","CSZI","CSP","CSPI","CSOD","CSODI", /* \Hex{60}--\Hex{67} */
"CSNN","CSNNI","CSNZ","CSNZI","CSNP","CSNPI","CSEV","CSEVI", /* \Hex{68}--\Hex{6f} */
"ZSN","ZSNI","ZSZ","ZSZI","ZSP","ZSPI","ZSOD","ZSODI", /* \Hex{70}--\Hex{77} */
"ZSNN","ZSNNI","ZSNZ","ZSNZI","ZSNP","ZSNPI","ZSEV","ZSEVI", /* \Hex{78}--\Hex{7f} */
"LDB","LDBI","LDBU","LDBUI","LDW","LDWI","LDWU","LDWUI", /* \Hex{80}--\Hex{87} */
"LDT","LDTI","LDTU","LDTUI","LDO","LDOI","LDOU","LDOUI", /* \Hex{88}--\Hex{8f} */
"LDSF","LDSFI","LDHT","LDHTI","CSWAP","CSWAPI","LDUNC","LDUNCI", /* \Hex{90}--\Hex{97} */
"LDVTS","LDVTSI","PRELD","PRELDI","PREGO","PREGOI","GO","GOI", /* \Hex{98}--\Hex{9f} */
"STB","STBI","STBU","STBUI","STW","STWI","STWU","STWUI", /* \Hex{a0}--\Hex{a7} */
"STT","STTI","STTU","STTUI","STO","STOI","STOU","STOUI", /* \Hex{a8}--\Hex{af} */
"STSF","STSFI","STHT","STHTI","STCO","STCOI","STUNC","STUNCI", /* \Hex{b0}--\Hex{b7} */
"SYNCD","SYNCDI","PREST","PRESTI","SYNCID","SYNCIDI","PUSHGO","PUSHGOI", /* \Hex{b8}--\Hex{bf} */
"OR","ORI","ORN","ORNI","NOR","NORI","XOR","XORI", /* \Hex{c0}--\Hex{c7} */
"AND","ANDI","ANDN","ANDNI","NAND","NANDI","NXOR","NXORI", /* \Hex{c8}--\Hex{cf} */
"BDIF","BDIFI","WDIF","WDIFI","TDIF","TDIFI","ODIF","ODIFI", /* \Hex{d0}--\Hex{d7} */
"MUX","MUXI","SADD","SADDI","MOR","MORI","MXOR","MXORI", /* \Hex{d8}--\Hex{df} */
"SETH","SETMH","SETML","SETL","INCH","INCMH","INCML","INCL", /* \Hex{e0}--\Hex{e7} */
"ORH","ORMH","ORML","ORL","ANDNH","ANDNMH","ANDNML","ANDNL", /* \Hex{e8}--\Hex{ef} */
"JMP","JMPB","PUSHJ","PUSHJB","GETA","GETAB","PUT","PUTI", /* \Hex{f0}--\Hex{f7} */
"POP","RESUME","SAVE","UNSAVE","SYNC","SWYM","GET","TRIP"}; /* \Hex{f8}--\Hex{ff} */
884
885@ And here is a (likewise boring) list of all the internal opcodes.
886The smallest numbers, less than or equal to |max_pipe_op|, correspond
887to operations for which arbitrary pipeline delays can be configured
888with |MMIX_config|. The largest numbers, greater than |max_real_command|,
889correspond to internally
890generated operations that have no official OP code; for example,
891there are internal operations to shift the $\gamma$ pointer in the
892register stack, and to compute page table entries.
893
894@<Declare \&{mmix\_opcode} and \&{internal\_opcode}@>=
#define max_pipe_op feps /* last opcode whose pipeline delays |MMIX_config| can set */
#define max_real_command trip /* later opcodes are internally generated */

typedef enum{@/
 /* first come the operations with configurable pipelines, through |max_pipe_op| */
@!mul0, /* multiplication by zero */
@!mul1, /* multiplication by 1--8 bits */
@!mul2, /* multiplication by 9--16 bits */
@!mul3, /* multiplication by 17--24 bits */
@!mul4, /* multiplication by 25--32 bits */
@!mul5, /* multiplication by 33--40 bits */
@!mul6, /* multiplication by 41--48 bits */
@!mul7, /* multiplication by 49--56 bits */
@!mul8, /* multiplication by 57--64 bits */
@!div, /* \.{DIV[U][I]} */
@!sh, /* \.{S[L,R][U][I]} */
@!mux, /* \.{MUX[I]} */
@!sadd, /* \.{SADD[I]} */
@!mor, /* \.{M[X]OR[I]} */
@!fadd, /* \.{FADD}, \.{FSUB} */
@!fmul, /* \.{FMUL} */
@!fdiv, /* \.{FDIV} */
@!fsqrt, /* \.{FSQRT} */
@!fint, /* \.{FINT} */
@!fix, /* \.{FIX[U]} */
@!flot, /* \.{[S]FLOT[U][I]} */
@!feps, /* \.{FCMPE}, \.{FUNE}, \.{FEQLE} */
 /* then the remaining operations, through |max_real_command| */
@!fcmp, /* \.{FCMP} */
@!funeq, /* \.{FUN}, \.{FEQL} */
@!fsub, /* \.{FSUB} */
@!frem, /* \.{FREM} */
@!mul, /* \.{MUL[I]} */
@!mulu, /* \.{MULU[I]} */
@!divu, /* \.{DIVU[I]} */
@!add, /* \.{ADD[I]} */
@!addu, /* \.{[2,4,8,16,]ADDU[I]}, \.{INC[M][H,L]} */
@!sub, /* \.{SUB[I]}, \.{NEG[I]} */
@!subu, /* \.{SUBU[I]}, \.{NEGU[I]} */
@!set, /* \.{SET[M][H,L]}, \.{GETA[B]} */
@!or, /* \.{OR[I]}, \.{OR[M][H,L]} */
@!orn, /* \.{ORN[I]} */
@!nor, /* \.{NOR[I]} */
@!and, /* \.{AND[I]} */
@!andn, /* \.{ANDN[I]}, \.{ANDN[M][H,L]} */
@!nand, /* \.{NAND[I]} */
@!xor, /* \.{XOR[I]} */
@!nxor, /* \.{NXOR[I]} */
@!shlu, /* \.{SLU[I]} */
@!shru, /* \.{SRU[I]} */
@!shl, /* \.{SL[I]} */
@!shr, /* \.{SR[I]} */
@!cmp, /* \.{CMP[I]} */
@!cmpu, /* \.{CMPU[I]} */
@!bdif, /* \.{BDIF[I]} */
@!wdif, /* \.{WDIF[I]} */
@!tdif, /* \.{TDIF[I]} */
@!odif, /* \.{ODIF[I]} */
@!zset, /* \.{ZS[N][N,Z,P][I]}, \.{ZSEV[I]}, \.{ZSOD[I]} */
@!cset, /* \.{CS[N][N,Z,P][I]}, \.{CSEV[I]}, \.{CSOD[I]} */
@!get, /* \.{GET} */
@!put, /* \.{PUT[I]} */
@!ld, /* \.{LD[B,W,T,O][U][I]}, \.{LDHT[I]}, \.{LDSF[I]} */
@!ldptp, /* load page table pointer */
@!ldpte, /* load page table entry */
@!ldunc, /* \.{LDUNC[I]} */
@!ldvts, /* \.{LDVTS[I]} */
@!preld, /* \.{PRELD[I]} */
@!prest, /* \.{PREST[I]} */
@!st, /* \.{STO[U][I]}, \.{STCO[I]}, \.{STUNC[I]} */
@!syncd, /* \.{SYNCD[I]} */
@!syncid, /* \.{SYNCID[I]} */
@!pst, /* \.{ST[B,W,T][U][I]}, \.{STHT[I]} */
@!stunc, /* \.{STUNC[I]}, in write buffer */
@!cswap, /* \.{CSWAP[I]} */
@!br, /* \.{B[N][N,Z,P][B]} */
@!pbr, /* \.{PB[N][N,Z,P][B]} */
@!pushj, /* \.{PUSHJ[B]} */
@!go, /* \.{GO[I]} */
@!prego, /* \.{PREGO[I]} */
@!pushgo, /* \.{PUSHGO[I]} */
@!pop, /* \.{POP} */
@!resume, /* \.{RESUME} */
@!save, /* \.{SAVE} */
@!unsave, /* \.{UNSAVE} */
@!sync, /* \.{SYNC} */
@!jmp, /* \.{JMP[B]} */
@!noop, /* \.{SWYM} */
@!trap, /* \.{TRAP} */
@!trip, /* \.{TRIP} */
 /* finally the internally generated operations, beyond |max_real_command| */
@!incgamma, /* increase $\gamma$ pointer */
@!decgamma, /* decrease $\gamma$ pointer */
@!incrl, /* increase rL and $\beta$ */
@!sav, /* intermediate stage of \.{SAVE} */
@!unsav, /* intermediate stage of \.{UNSAVE} */
@!resum /* intermediate stage of \.{RESUME} */
}@! internal_opcode;
990
991@ @<Glob...@>=
/* Printable names of the internal opcodes, in exactly the order of the
|internal_opcode| enumeration, so that |internal_op_name[i]| names~|i|. */
char *internal_op_name[]={
"mul0","mul1","mul2","mul3","mul4","mul5","mul6","mul7","mul8", /* multiplication, by operand size */
"div","sh","mux","sadd","mor",
"fadd","fmul","fdiv","fsqrt","fint","fix","flot","feps", /* |feps| is |max_pipe_op| */
"fcmp","funeq","fsub","frem",
"mul","mulu","divu","add","addu","sub","subu","set",
"or","orn","nor","and","andn","nand","xor","nxor",
"shlu","shru","shl","shr","cmp","cmpu",
"bdif","wdif","tdif","odif","zset","cset","get","put",
"ld","ldptp","ldpte","ldunc","ldvts","preld","prest",
"st","syncd","syncid","pst","stunc","cswap",
"br","pbr","pushj","go","prego","pushgo","pop",
"resume","save","unsave","sync","jmp","noop","trap","trip", /* |trip| is |max_real_command| */
"incgamma","decgamma","incrl","sav","unsav","resum"}; /* internally generated */
1083
1084@ We need a table to convert the external opcodes to
1085internal ones.
1086
1087@<Glob...@>=
/* Indexed by external opcode: |internal_op[op]| is the internal opcode
for |op|. There are eight entries per row, in the order of the |mmix_opcode|
list; the range of external opcodes in each row appears at its right. */
internal_opcode internal_op[256]={@/
  trap,fcmp,funeq,funeq,fadd,fix,fsub,fix, /* \Hex{00}--\Hex{07} */
  flot,flot,flot,flot,flot,flot,flot,flot, /* \Hex{08}--\Hex{0f} */
  fmul,feps,feps,feps,fdiv,fsqrt,frem,fint, /* \Hex{10}--\Hex{17} */
  mul,mul,mulu,mulu,div,div,divu,divu, /* \Hex{18}--\Hex{1f} */
  add,add,addu,addu,sub,sub,subu,subu, /* \Hex{20}--\Hex{27} */
  addu,addu,addu,addu,addu,addu,addu,addu, /* \Hex{28}--\Hex{2f} */
  cmp,cmp,cmpu,cmpu,sub,sub,subu,subu, /* \Hex{30}--\Hex{37} */
  shl,shl,shlu,shlu,shr,shr,shru,shru, /* \Hex{38}--\Hex{3f} */
  br,br,br,br,br,br,br,br, /* \Hex{40}--\Hex{47} */
  br,br,br,br,br,br,br,br, /* \Hex{48}--\Hex{4f} */
  pbr,pbr,pbr,pbr,pbr,pbr,pbr,pbr, /* \Hex{50}--\Hex{57} */
  pbr,pbr,pbr,pbr,pbr,pbr,pbr,pbr, /* \Hex{58}--\Hex{5f} */
  cset,cset,cset,cset,cset,cset,cset,cset, /* \Hex{60}--\Hex{67} */
  cset,cset,cset,cset,cset,cset,cset,cset, /* \Hex{68}--\Hex{6f} */
  zset,zset,zset,zset,zset,zset,zset,zset, /* \Hex{70}--\Hex{77} */
  zset,zset,zset,zset,zset,zset,zset,zset, /* \Hex{78}--\Hex{7f} */
  ld,ld,ld,ld,ld,ld,ld,ld, /* \Hex{80}--\Hex{87} */
  ld,ld,ld,ld,ld,ld,ld,ld, /* \Hex{88}--\Hex{8f} */
  ld,ld,ld,ld,cswap,cswap,ldunc,ldunc, /* \Hex{90}--\Hex{97} */
  ldvts,ldvts,preld,preld,prego,prego,go,go, /* \Hex{98}--\Hex{9f} */
  pst,pst,pst,pst,pst,pst,pst,pst, /* \Hex{a0}--\Hex{a7} */
  pst,pst,pst,pst,st,st,st,st, /* \Hex{a8}--\Hex{af} */
  pst,pst,pst,pst,st,st,st,st, /* \Hex{b0}--\Hex{b7} */
  syncd,syncd,prest,prest,syncid,syncid,pushgo,pushgo, /* \Hex{b8}--\Hex{bf} */
  or,or,orn,orn,nor,nor,xor,xor, /* \Hex{c0}--\Hex{c7} */
  and,and,andn,andn,nand,nand,nxor,nxor, /* \Hex{c8}--\Hex{cf} */
  bdif,bdif,wdif,wdif,tdif,tdif,odif,odif, /* \Hex{d0}--\Hex{d7} */
  mux,mux,sadd,sadd,mor,mor,mor,mor, /* \Hex{d8}--\Hex{df} */
  set,set,set,set,addu,addu,addu,addu, /* \Hex{e0}--\Hex{e7} */
  or,or,or,or,andn,andn,andn,andn, /* \Hex{e8}--\Hex{ef} */
  jmp,jmp,pushj,pushj,set,set,put,put, /* \Hex{f0}--\Hex{f7} */
  pop,resume,save,unsave,sync,noop,get,trip}; /* \Hex{f8}--\Hex{ff} */
1121
1122@ While we're into boring lists, we might as well define all the
1123special register numbers, together with an inverse table for
1124use in diagnostic outputs. These codes have been designed so that
1125special registers 0--7 are unencumbered, 9--11 can't be \.{PUT} by anybody,
11268 and 12--18 can't be \.{PUT} by the user. Pipeline delays might occur
1127when \.{GET} is applied to special registers 21--31 or when
1128\.{PUT} is applied to special registers 8 or 15--20. The \.{SAVE} and
1129\.{UNSAVE} commands store and restore special registers 0--6 and 23--27.
1130
1131@<Header def...@>=
/* codes 0--7: unencumbered */
#define rB 0 /* bootstrap register (trip) */
#define rD 1 /* dividend register */
#define rE 2 /* epsilon register */
#define rH 3 /* himult register */
#define rJ 4 /* return-jump register */
#define rM 5 /* multiplex mask register */
#define rR 6 /* remainder register */
#define rBB 7 /* bootstrap register (trap) */
/* code 8: can't be \.{PUT} by the user */
#define rC 8 /* continuation register */
/* codes 9--11: can't be \.{PUT} by anybody */
#define rN 9 /* serial number */
#define rO 10 /* register stack offset */
#define rS 11 /* register stack pointer */
/* codes 12--18: can't be \.{PUT} by the user */
#define rI 12 /* interval counter */
#define rT 13 /* trap address register */
#define rTT 14 /* dynamic trap address register */
#define rK 15 /* interrupt mask register */
#define rQ 16 /* interrupt request register */
#define rU 17 /* usage counter */
#define rV 18 /* virtual translation register */
/* codes 19--31 */
#define rG 19 /* global threshold register */
#define rL 20 /* local threshold register */
#define rA 21 /* arithmetic status register */
#define rF 22 /* failure location register */
#define rP 23 /* prediction register */
#define rW 24 /* where-interrupted register (trip) */
#define rX 25 /* execution register (trip) */
#define rY 26 /* Y operand (trip) */
#define rZ 27 /* Z operand (trip) */
#define rWW 28 /* where-interrupted register (trap) */
#define rXX 29 /* execution register (trap) */
#define rYY 30 /* Y operand (trap) */
#define rZZ 31 /* Z operand (trap) */
1164
1165@ @<Glob...@>=
/* The inverse table: |special_name[k]| is the name of special register
code~|k|. */
char *special_name[32]={
 "rB","rD","rE","rH","rJ","rM","rR","rBB", /* codes 0--7 */
 "rC","rN","rO","rS","rI","rT","rTT","rK", /* codes 8--15 */
 "rQ","rU","rV","rG","rL","rA","rF","rP", /* codes 16--23 */
 "rW","rX","rY","rZ","rWW","rXX","rYY","rZZ"}; /* codes 24--31 */
1169
1170@ Here are the bit codes that affect trips and traps. The first eight
1171cases also apply to the upper half of~rQ; the next eight apply to~rA.
1172
1173@d P_BIT (1<<0) /* instruction in privileged location */
1174@d S_BIT (1<<1) /* security violation */
1175@d B_BIT (1<<2) /* instruction breaks the rules */
1176@d K_BIT (1<<3) /* instruction for kernel only */
1177@d N_BIT (1<<4) /* virtual translation bypassed */
1178@d PX_BIT (1<<5) /* permission lacking to execute from page */
1179@d PW_BIT (1<<6) /* permission lacking to write on page */
1180@d PR_BIT (1<<7) /* permission lacking to read from page */
1181@d PROT_OFFSET 5 /* distance from |PR_BIT| to protection code position */
1182@d X_BIT (1<<8) /* floating inexact */
1183@d Z_BIT (1<<9) /* floating division by zero */
1184@d U_BIT (1<<10) /* floating underflow */
1185@d O_BIT (1<<11) /* floating overflow */
1186@d I_BIT (1<<12) /* floating invalid operation */
1187@d W_BIT (1<<13) /* float-to-fix overflow */
1188@d V_BIT (1<<14) /* integer overflow */
1189@d D_BIT (1<<15) /* integer divide check */
1190@d H_BIT (1<<16) /* trip handler bit */
1191@d F_BIT (1<<17) /* forced trap bit */
1192@d E_BIT (1<<18) /* external (dynamic) trap bit */
1193
1194@<Glob...@>=
char bit_code_map[]="EFHDVWIOUZXrwxnkbsp"; /* one letter per bit code, from |E_BIT| down to |P_BIT| */
1196
1197@ @<Internal proto...@>=
1198static void print_bits @,@,@[ARGS((int))@];
1199
1200@ @<Subr...@>=
/* Print the letter from |bit_code_map| for every bit of |x| that is set,
scanning from |E_BIT| downward. Bits above |E_BIT| are ignored, and the
scan stops as soon as no set bits remain at or below the current position. */
static void print_bits(x)
  int x;
{
  register int mask=E_BIT; /* the bit now being examined */
  register int pos=0; /* the index of its letter in |bit_code_map| */
  while (mask && (x&(mask+mask-1))) {
    if (x&mask) printf("%c",bit_code_map[pos]);
    mask>>=1;
    pos++;
  }
}
1208
1209@ The lower half of rQ holds external interrupts of highest priority.
1210Most of them are implementation-dependent, but a few are defined in general.
1211
1212@<Header def...@>=
1213#define POWER_FAILURE (1<<0) /* try to shut down calmly and quickly */
1214#define PARITY_ERROR (1<<1) /* try to save the file systems */
1215#define NONEXISTENT_MEMORY (1<<2) /* a memory address can't be used */
1216#define REBOOT_SIGNAL (1<<4) /* it's time to start over */
1217#define INTERVAL_TIMEOUT (1<<6) /* the timer register, rI, has reached zero */
1218#define STACK_OVERFLOW (1<<7) /* data has been stored on the rC page */
1219
1220@* Dynamic speculation.
1221Now that we understand some basic low-level structures,
1222we're ready to look at the larger picture.
1223
1224This simulator is based on the idea of ``dynamic scheduling with register
1225renaming,'' as introduced in the 1960s by R.~M. Tomasulo [{\sl IBM Journal
1226@^Tomasulo, Robert Marco@>
1227of Research and Development\/ \bf11} (1967), 25--33]. Moreover, the dynamic
1228scheduling method is extended here to ``speculative execution,'' as
1229implemented in several processors of the 1990s and described in section~4.6 of
1230Hennessy and Patterson's {\sl Computer Architecture}, second edition (1995).
1231@^Hennessy, John LeRoy@>
1232@^Patterson, David Andrew@>
1233The essential idea is to keep track of the pipeline contents by recording all
1234dependencies between unfinished computations in a queue called the {\it
1235reorder buffer}. An entry in the reorder buffer might, for example, correspond
1236to an instruction that adds together two numbers whose values are still being
1237computed; those numbers have been allocated space in earlier positions of the
1238reorder buffer. The addition will take place as soon as both of its operands
1239are known, but the sum won't be written immediately into the destination
1240register. It will stay in the reorder buffer until reaching the {\it hot
1241seat\/} at the front of the queue. Finally, the addition leaves the
1242hot seat and is said to be {\it committed}.
1243
1244Some instructions in the reorder buffer may in fact be executed only
1245on speculation, meaning that they won't really be called for unless a prior
1246branch instruction has the predicted outcome. Indeed, we can say that
1247all instructions not yet in the hot seat are being executed speculatively,
1248because an external interrupt might occur at any time and change the entire
1249course of computation. Organizing the pipeline as a reorder buffer allows us
1250to look ahead and keep busy computing values that have a good chance of being
1251needed later, instead of waiting for slow instructions or slow memory
1252references to be completed.
1253
1254The reorder buffer is in fact a queue of \&{control} records, conceptually
1255forming part of a circle of such records inside the simulator, corresponding
1256to all instructions that have been dispatched or {\it issued\/} but not yet
1257committed, in strict program order.
1258
1259The best way to get an understanding of speculative execution is perhaps to
1260imagine that the reorder buffer is large enough to hold hundreds of
1261instructions in various stages of execution, and to think of an implementation
1262of \MMIX\ that has dozens of functional units---more than would ever actually
1263@^thinking big@>
1264be built into a chip. Then one can readily visualize the kinds of control
1265structures and checks that must be made to ensure correct execution. Without
1266such a broad viewpoint, a programmer or hardware designer will be inclined to
1267think only of the simple cases and to devise algorithms that lack the proper
1268generality. Thus we have a somewhat paradoxical situation in which a difficult
1269general problem turns out to be easier to solve than its simpler special cases,
1270because it enforces clarity of thinking.
1271
1272Instructions that have completed execution and have not yet been committed are
1273analogous to cars that have gone through our hypothetical repair shop and are
1274waiting for their owners to pick them up. However, all analogies break down,
1275and the world of automobiles does not have a natural counterpart for the
1276notion of speculative execution. That notion corresponds roughly to situations
1277in which people are led to believe that their cars need a new piece of
1278equipment, but they suddenly change their mind once they see the price tag,
1279and they insist on having the equipment removed even after it has been
1280partially or completely installed.
1281
1282Speculatively executed instructions might make no sense: They might divide
1283by zero or refer to protected memory areas, etc. Such anomalies are not
1284considered catastrophic or even exceptional until the instruction reaches the
1285hot~seat.
1286
1287The person who designs a computer with speculative execution is an optimist,
1288who has faith that the vast majority of the machine's predictions will come
1289true. The person who designs a reliable implementation of such a computer
1290is a pessimist, who understands that all predictions might come to naught.
1291The pessimist does, however, take pains to optimize the cases that do turn out
1292well.
1293
1294@ Let's consider what happens to a single instruction, say
1295\.{ADD} \.{\$1,\$2,\$3}, as it travels through the pipeline in a normal
1296situation. The first time this instruction is encountered, it is placed into
1297the I-cache (that is, the instruction cache), so that we won't have to access
1298memory when we need to perform it again. We will assume for simplicity in this
1299discussion that each I-cache access takes one clock cycle, although other
1300possibilities are allowed by |MMIX_config|.
1301
1302Suppose the simulated machine fetches the example \.{ADD} instruction
1303at time 1000. Fetching is done by a coroutine whose |stage| number is~0.
1304A cache block typically contains 8 or 16 instructions. The fetch unit
1305of our machine is able to fetch up to |fetch_max| instructions on each clock
1306cycle and place them in the fetch buffer, provided that there is room in the
1307buffer and that all the instructions belong to the same cache block.
1308
1309The dispatch unit of our simulator is able to issue up to |dispatch_max|
1310instructions on each clock cycle and move them from the fetch buffer to the
1311reorder buffer, provided that functional units are available for those
1312instructions and there is room in the reorder buffer. A functional unit that
1313handles \.{ADD} is usually called an ALU (arithmetic logic unit), and our
1314simulated machine might have several of them. If they aren't all stalled
1315in stage~1 of their pipelines, and if the reorder buffer isn't full, and if
1316the machine isn't in the process of deissuing instructions that were
1317mispredicted, and if
1318fewer than |dispatch_max| instructions are ahead of the \.{ADD} in the fetch
1319buffer, and if all such prior instructions can be issued without using up all
1320the free ALUs, our \.{ADD} instruction will be issued at time 1001.
1321(In fact, all of these conditions are usually true.)
1322
1323We assume that $\rm L>3$, so that \$1, \$2, and~\$3 are local registers.
1324For simplicity we'll assume in fact that the register stack is empty, so that
1325the \.{ADD} instruction is supposed to set $\rm l[1]\gets l[2]+l[3]$. The
1326operands l[2] and~l[3] might not be known at time 1001; they are \&{spec}
1327values, which might point to \&{specnode} entries in the reorder buffer for
1328previous instructions whose destinations are l[2] and~l[3].
1329The dispatcher fills the next available control block of the reorder buffer
1330with information for the \.{ADD}, containing appropriate \&{spec} values
1331corresponding to l[2] and~l[3] in its |y| and~|z| fields. The |x|~field of
1332this control block will be inserted into a doubly linked list of \&{specnode}
1333records, corresponding to l[1] and to all instructions in the reorder buffer
1334that have l[1] as a destination. The boolean value |x.known| will be set to
1335|false|, meaning that this speculative value still needs to be
1336computed. Subsequent instructions that need l[1] as a source will point to
1337|x|, if they are issued before the sum |x.o| has been computed. Double
1338linking is used in the \&{specnode} list because the \.{ADD} instruction might
1339be cancelled before it is finally committed; thus deletions might occur
1340at either end of the list for~l[1].
1341
1342At time 1002, the ALU handling the \.{ADD} will stall if its inputs |y|
1343and~|z| are not both known (namely if |y.p!=NULL| or |z.p!=NULL|).
1344In fact, it will also stall if its third input rA is not known;
1345the current speculative value of rA, except for its event bits,
1346is represented in the |ra|~field of the control block, and we must
1347have |ra.p==NULL|. In such a case the ALU will look to see if the
1348\&{spec} values pointed to by |y.p| and/or |z.p| and/or |ra.p| become
1349defined on this clock cycle, and it will update its own input values
1350accordingly.
1351
1352But let's assume that |y|, |z|, and |ra| are already known at time 1002.
1353Then |x.o| will be set to |y.o+z.o| and |x.known| will become~|true|.
1354This will make the result destined for~l[1] available to be used in other
1355commands at time~1003.
1356
1357If no overflow occurs when adding |y.o| to |z.o|, the |interrupt| and
1358|arith_exc| fields of the control block for \.{ADD} are set to zero.  But when
1359overflow does occur (shudder), there are two cases, based on the V-enable bit
of rA, which is found in field |ra.o| of the control block. If this bit is~0,
1361the V-bit of the |arith_exc| field in the control block is set to~1; the
1362|arith_exc| field will be ored into~rA when the \.{ADD} instruction is
1363eventually committed.  But if the V-enable bit is~1, the trip handler should
1364be called, interrupting the normal sequence. In such a case, the |interrupt|
1365field of the control block is set to specify a trip, and the fetcher and
1366dispatcher are told to forget what they have been doing; all instructions
1367following the \.{ADD} in the reorder buffer must now be deissued. The virtual starting
1368address of the overflow trip handler, namely location~32, is hastily passed to
1369the fetch routine, and instructions will be fetched from that location
1370as soon as possible. (Of course the overflow and the trip handler are
1371still speculative until the \.{ADD} instruction is committed. Other exceptional
1372conditions might cause the \.{ADD} itself to be terminated before it
1373gets to the hot seat. But the pipeline keeps charging ahead, always trying to
1374guess the most probable outcome.)
1375
1376The commission unit of this simulator is able to commit and/or deissue up to
1377|commit_max| instructions on each clock cycle. With luck, fewer than
1378|commit_max| instructions will be ahead of our \.{ADD} instruction at
1379time~1003, and they will all be completed normally. Then l[1]~can be set
1380to |x.o|, and the event bits of~rA can be updated from |arith_exc|,
1381and the \.{ADD} command can pass through the hot seat and out of the
1382reorder buffer.
1383
1384@<External var...@>=
1385Extern int fetch_max, dispatch_max, peekahead, commit_max;
1386 /* limits on instructions that can be handled per clock cycle */
1387
1388@ The instruction currently occupying the hot seat is the only
1389issued-but-not-yet-committed instruction that is guaranteed to be truly
1390essential to the machine's computation. All other instructions in the reorder
1391buffer are being executed on speculation; if they prove to be needed, well and
1392good, but we might want to jettison them all if, say, an external interrupt
1393occurs.
1394
1395Thus all instructions that change the global state in complicated ways---like
1396\.{LDVTS}, which changes the virtual address translation caches---are
1397performed only when they reach the hot seat. Fortunately the vast majority
1398of instructions are sufficiently simple that we can deal with them more
1399efficiently while other computations are taking place.
1400
1401In this implementation the reorder buffer is simply housed in an array of
1402control records. The first array element is |reorder_bot|, and the last is
1403|reorder_top|. Variable |hot| points to the control block in the hot seat, and
1404|hot-1| to its predecessor, etc. Variable |cool| points to the next control
1405block that will be filled in the reorder buffer. If |hot==cool| the reorder
1406buffer is empty; otherwise it contains the control records |hot|, |hot-1|,
1407\dots,~|cool+1|, except of course that we wrap around from |reorder_bot| to
1408|reorder_top| when moving down in the buffer.
1409
1410@<External var...@>=
1411Extern control *reorder_bot, *reorder_top; /* least and greatest
1412                   entries in the ring containing the reorder buffer */
1413Extern control *hot, *cool; /* front and rear of the reorder buffer */
1414Extern control *old_hot; /* value of |hot| at beginning of cycle */
1415Extern int deissues; /* the number of instructions that need to be deissued */
1416
1417@ @<Initialize e...@>=
1418hot=cool=reorder_top;
1419deissues=0;
1420
1421@ @<Internal proto...@>=
1422static void print_reorder_buffer @,@,@[ARGS((void))@];
1423
1424@ @<Sub...@>=
/* Print the contents of the reorder buffer, one control block per line
from |hot| down to (but not including) |cool|, and then the current
values of |rename_regs| and |mem_slots|. */
static void print_reorder_buffer()
{
  register control *q;
  printf("Reorder buffer");
  if (hot!=cool) {
    if (deissues) printf(" (%d to be deissued)",deissues);
    if (doing_interrupt) printf(" (interrupt state %d)",doing_interrupt);
    printf(":\n");
    q=hot;
    while (q!=cool) {
      print_control_block(q);
      if (q->owner) {
        printf(" ");
        print_coroutine_id(q->owner);
      }
      printf("\n");
      if (q==reorder_bot) q=reorder_top; /* wrap around the ring */
      else q--;
    }
  }
  else printf(" (empty)\n");
  printf(" %d available rename register%s, %d memory slot%s\n",
     rename_regs, rename_regs==1? "": "s",
     mem_slots, mem_slots==1? "": "s");
}
1445
1446@ Here is an overview of what happens on each clock cycle.
1447
1448@<Perform one machine cycle@>=
{
  @<Check for external interrupt@>;
  old_hot=hot; /* the hot seat position as this cycle begins */
  old_tail=tail; /* the fetch buffer contents as this cycle begins */
  dispatch_count=0; /* nothing has been dispatched yet on this cycle */
  suppress_dispatch=(dispatch_lock || deissues);
    /* no dispatching while the lock is held or deissues are pending */
  if (doing_interrupt) @<Perform one cycle of the interrupt preparations@>@;
  else @<Commit and/or deissue up to |commit_max| instructions@>;
  @<Execute all coroutines scheduled for the current time@>;
  if (!suppress_dispatch) @<Dispatch one cycle's worth of instructions@>;
  ticks=incr(ticks,1); /* advance the clock */
  dispatch_stat[dispatch_count]++; /* tally how many were dispatched */
}
1462
1463@ @<Glob...@>=
1464int dispatch_count; /* how many dispatched on this cycle */
1465bool suppress_dispatch; /* should dispatching be bypassed? */
1466int doing_interrupt; /* how many cycles of interrupt preparations remain */
1467lockvar dispatch_lock; /* lock to prevent instruction issues */
1468
1469@ @<External v...@>=
1470Extern int *dispatch_stat;
1471  /* how often did we dispatch 0, 1, ... instructions? */
1472Extern bool security_disabled; /* omit security checks for testing purposes? */
1473
1474@ @<Commit and/or deissue up to |commit_max| instructions@>=
1475{
1476  for (m=commit_max;m>0 && deissues>0; m--)
1477    @<Deissue the coolest instruction@>;
1478  for (;m>0;m--) {
1479    if (hot==cool) break; /* reorder buffer is empty */
1480    if (!security_disabled) @<Check for security violation, |break| if so@>;
1481    if (hot->owner) break; /* hot seat instruction isn't finished */
1482    @<Commit the hottest instruction, or |break| if it's not ready@>;
1483    i=hot->i;
1484    if (hot==reorder_bot) hot=reorder_top;
1485    else hot--;
1486    if (i==resum) break; /* allow the resumed instruction to see the new rK */
1487  }
1488}
1489
1490@* The dispatch stage. It would be nice to present the parts of this simulator
1491by dealing with the fetching, dispatching, executing, and committing
1492stages in that order. After all, instructions are first fetched,
1493then dispatched, then executed, and finally committed.
1494However, the fetch stage depends heavily on difficult questions of
1495memory management that are best deferred until we have looked at
1496the simpler parts of simulation. Therefore we will take our initial
1497plunge into the details of this program by looking first at the dispatch phase,
1498assuming that instructions have somehow appeared magically in the fetch buffer.
1499
1500The fetch buffer, like the circular priority queue of all coroutines
1501and the circular queue used for the reorder buffer, lives in an
1502array that is best regarded as a ring of elements. The elements
1503are structures of type \&{fetch}, which have five fields:
1504A 32-bit |inst|, which is an \MMIX\ instruction; a 64-bit |loc|,
1505which is the virtual address of that instruction; an |interrupt| field,
1506which is nonzero if, for example, the protection bits in the relevant page
1507table entry for this address do not permit execution access; a boolean
1508|noted| field, which becomes |true| after the dispatch unit has peeked
1509at the instruction to see whether it is a jump or probable branch;
1510and a |hist| field, which records the recent branch history.
1511(The least significant bits of~|hist| correspond to the most recent branches.)
1512
1513@<Type...@>=
1514typedef struct {
1515  octa loc; /* virtual address of instruction */
1516  tetra inst; /* the instruction itself */
1517  unsigned int interrupt; /* bit codes that might cause interruption */
1518  bool noted; /* have we peeked at this instruction? */
1519  unsigned int hist; /* if we peeked, this was the |peek_hist| */
1520} fetch;
1521
1522@ The oldest and youngest entries in the fetch buffer are pointed
1523to by |head| and |tail|, just as the oldest and youngest entries in the
1524reorder buffer are called |hot| and |cool|. The fetch coroutine will
1525be adding entries at the |tail| position, which starts at |old_tail|
1526when a cycle begins, in parallel with the actions simulated by
1527the dispatcher. Therefore the dispatcher is allowed to look only at
1528instructions in |head|, |head-1|, \dots,~|old_tail+1|, although a few
1529more recently fetched instructions will usually be present in the fetch
1530buffer by the time this part of the program is executed.
1531
1532@<External v...@>=
1533Extern fetch *fetch_bot, *fetch_top; /* least and greatest
1534                   entries in the ring containing the fetch buffer */
1535Extern fetch *head, *tail; /* front and rear of the fetch buffer */
1536
1537@ @<Glob...@>=
1538fetch *old_tail; /* rear of the fetch buffer available on the current cycle */
1539
1540@ @d UNKNOWN_SPEC ((specnode*)1)
1541
1542@<Initialize e...@>=
1543head=tail=fetch_top;
1544inst_ptr.p=UNKNOWN_SPEC;
1545
1546@ @<Internal proto...@>=
1547static void print_fetch_buffer @,@,@[ARGS((void))@];
1548
1549@ @<Sub...@>=
static void print_fetch_buffer()
  /* displays the fetch buffer from |head| down to |tail|, then the
     state of the instruction pointer */
{@+register fetch *p;
  printf("Fetch buffer");
  if (head!=tail) {
    if (resuming) printf(" (resumption state %d)",resuming);
    printf(":\n");
    p=head;
    while (p!=tail) {
      print_octa(p->loc);
      printf(": %08x(%s)",p->inst,opcode_name[p->inst>>24]);
      if (p->interrupt) print_bits(p->interrupt);
      if (p->noted) printf("*"); /* the dispatcher has peeked at this one */
      printf("\n");
      if (p==fetch_bot) p=fetch_top;@+else p--; /* move down the ring */
    }
  }@+else printf(" (empty)\n");
  printf("Instruction pointer is ");
  if (inst_ptr.p!=NULL) { /* the address is not yet known */
    printf("waiting for ");
    if (inst_ptr.p==UNKNOWN_SPEC) printf("dispatch");
    else if (inst_ptr.p->addr.h!=(tetra)-1) print_specnode_id(inst_ptr.p->addr);
    else print_coroutine_id(((control*)inst_ptr.p->up)->owner);
      /* here |up| is really a pointer to a control block */
  }@+else print_octa(inst_ptr.o);
  printf("\n");
}
1576
1577@ The best way to understand the dispatching process is once again
1578to ``think big,'' by imagining a huge fetch buffer and the
1579@^thinking big@>
1580potential ability to issue dozens of instructions per cycle, although
1581the actual numbers are typically quite small.
1582
1583If the fetch buffer is not empty after |dispatch_max| instructions have
1584been dispatched, the dispatcher also looks at up to |peekahead| further
1585instructions to see if they are jumps or other commands that change the
1586flow of control. Much of this action would happen in parallel on a
1587real machine, but our simulator works sequentially.
1588
1589In the following program, |true_head| records the head of the fetch buffer as
1590instructions are actually dispatched, while |head| refers to the position
1591currently being examined (possibly peeking into the future).
1592
1593If the fetch buffer is empty at the beginning of the current clock
1594cycle, a ``dispatch bypass'' allows the dispatcher to issue the
1595first instruction that enters the fetch buffer on this cycle. Otherwise
1596the dispatcher is restricted to previously fetched instructions.
1597
1598@s func int
1599
1600@<Dispatch one cycle's worth of instructions@>=
{@+register fetch *true_head, *new_head;
  true_head=head; /* nothing has been dispatched yet on this cycle */
  if (head==old_tail && head!=tail)
    /* dispatch bypass: the buffer was empty when the cycle began,
       but something has been fetched since then */
    old_tail=(head==fetch_bot? fetch_top: head-1);
      /* make just the first new entry visible */
  peek_hist=cool_hist; /* peeking starts from the history at |cool| */
  for (j=0;j<dispatch_max+peekahead;j++)
    @<Look at the |head| instruction, and try
              to dispatch it if |j<dispatch_max|@>;
  head=true_head; /* back up past any entries that were merely peeked at */
}
1611
1612@ @<Look at the |head| instruction...@>=
1613{
1614  register mmix_opcode op;
1615  register int yz,f;
1616  register bool freeze_dispatch=false;
1617  register func *u=NULL;
1618  if (head==old_tail) break; /* fetch buffer empty */
1619  if (head==fetch_bot) new_head=fetch_top;@+else new_head=head-1;
1620  op=head->inst>>24; @+yz=head->inst&0xffff;
1621  @<Determine the flags, |f|, and the internal opcode, |i|@>;
1622  @<Install default fields in the |cool| block@>;
1623  if (f&rel_addr_bit) @<Convert relative address to absolute address@>;
1624  if (head->noted) peek_hist=head->hist;
1625  else @<Redirect the fetch if control changes at this inst@>;
1626  if (j>=dispatch_max || dispatch_lock || nullifying) {
1627    head=new_head;@+ continue; /* can't dispatch, but can peek ahead */
1628  }
1629  if (cool==reorder_bot) new_cool=reorder_top;@+else new_cool=cool-1;
1630  @<Dispatch an instruction to the |cool| block if possible,
1631    otherwise |goto stall|@>;
1632  @<Assign a functional unit if available, otherwise |goto stall|@>;
1633  @<Check for sufficient rename registers and memory slots, or |goto stall|@>;
1634  if ((op&0xe0)==0x40) @<Record the result of branch prediction@>;
1635  @<Issue the |cool| instruction@>;
1636  cool=new_cool;@+ cool_O=new_O;@+ cool_S=new_S;
1637  cool_hist=peek_hist;@+ continue;
1638stall: @<Undo data structures set prematurely in the |cool| block
1639    and |break|@>;
1640}
1641
1642@ An instruction can be dispatched only if a functional unit
1643is available to handle it. A functional unit consists of a 256-bit
1644vector that specifies a subset of \MMIX's opcodes, and an array
1645of coroutines for the pipeline stages. There are $k$ coroutines in the
1646array, where $k$ is the maximum number of stages needed by any of the opcodes
1647supported.
1648
1649@<Type...@>=
1650typedef struct func_struct{
1651  char name[16]; /* symbolic designation */
1652  tetra ops[8]; /* big-endian bitmap for the opcodes supported */
1653  int k; /* number of pipeline stages */
1654  coroutine *co; /* pointer to the first of $k$ consecutive coroutines */
1655} @!func;
1656
1657@ @<External v...@>=
1658Extern func *funit; /* pointer to array of functional units */
1659Extern int funit_count; /* the number of functional units */
1660
1661@ It is convenient to have
1662a 256-bit vector of all the supported opcodes, because we need to
1663shut off a lot of special actions when an opcode is not supported.
1664
1665@<Glob...@>=
1666control *new_cool; /* the reorder position following |cool| */
1667int resuming; /* set nonzero if resuming an interrupted instruction */
1668tetra support[8]; /* big-endian bitmap for all opcodes supported */
1669
1670@ @<Initialize...@>=
{@+register func *u;
  /* |support| becomes the union of the |ops| bitmaps of all units; the
     upper limit is inclusive so that the unit at |funit+funit_count|
     takes part too */
  for (u=funit;u<=funit+funit_count;u++)
    for (i=0;i<8;i++) support[i] |= u->ops[i];
}
1675
1676@ @d sign_bit ((unsigned)0x80000000)
1677
1678@<Determine the flags, |f|, and the internal opcode, |i|@>=
/* word |op>>5| of |support|, bit |op&31| counting from the left,
   tells whether some unit supports |op| */
if (!(support[op>>5]&(sign_bit>>(op&31)))) {
  /* oops, this opcode isn't supported by any functional unit */
  f=flags[TRAP], i=trap; /* so it gets the flags and internal code of \.{TRAP} */
}@+else f=flags[op], i=internal_op[op];
if (i==trip && (head->loc.h&sign_bit)) f=0,i=noop;
  /* a |trip| at a location whose sign bit is set is turned into a |noop| */
1684
1685@ @<Issue the |cool| instruction@>=
if (cool->interim) {
  /* an interim step: note that the fetch buffer is not advanced here */
  cool->usage=false;
  if (cool->op==SAVE) @<Get ready for the next step of \.{SAVE}@>@;
  else if (cool->op==UNSAVE) @<Get ready for the next step of \.{UNSAVE}@>@;
  else if (cool->i==preld || cool->i==prest)
     @<Get ready for the next step of \.{PRELD} or \.{PREST}@>@;
  else if (cool->i==prego) @<Get ready for the next step of \.{PREGO}@>@;
}
else if (cool->i<=max_real_command) {
  if ((flags[cool->op]&ctl_change_bit)||cool->i==pbr)
    if (inst_ptr.p==NULL && (inst_ptr.o.h&sign_bit) && !(cool->loc.h&sign_bit)
           && cool->i!=trap)
      cool->interrupt|=P_BIT; /* jumping from nonnegative to negative */
  true_head=head=new_head; /* delete instruction from fetch buffer */
  resuming=0;
}
if (freeze_dispatch) set_lock(u->co,dispatch_lock);
cool->owner=u->co;@+ u->co->ctl=cool;
  /* the control block and the unit's first coroutine now point to each other */
startup(u->co,1); /* schedule execution of the new inst */
if (verbose&issue_bit) {
  printf("Issuing ");@+print_control_block(cool);
  printf(" ");@+print_coroutine_id(u->co);@+printf("\n");
}
dispatch_count++; /* one more for this cycle's statistics */
1710
1711@ We assign the first functional unit that supports |op| and is
1712totally unoccupied, if possible; otherwise we assign the first
1713functional unit that supports |op| and has stage~1 unoccupied.
1714
1715@<Assign a functional unit if available...@>=
{@+register int t=op>>5, b=sign_bit>>(op&31);
    /* word index and bit mask for |op| in an |ops| bitmap */
  if (cool->i==trap && op!=TRAP) { /* opcode needs to be emulated */
    u=funit+funit_count; /* this unit supports just \.{TRIP} and \.{TRAP} */
    goto unit_found;
  }
  /* first choice: a unit that supports |op| and has no stage in use;
     this search includes the unit at |funit+funit_count| */
  for (u=funit;u<=funit+funit_count;u++) if (u->ops[t]&b) {
    for (i=0;i<u->k;i++) if (u->co[i].next) goto unit_busy;
    goto unit_found;
  unit_busy: ;
  }
  /* second choice: a unit that supports |op| and whose first stage is free;
     this search stops before |funit+funit_count| */
  for (u=funit;u<funit+funit_count;u++)
    if ((u->ops[t]&b) && (u->co->next==NULL)) goto unit_found;
  goto stall; /* all units for this |op| are busy */
}
unit_found:
1731
1732@ The |flags| table records special properties of each operation code
1733in binary notation: \Hex{1}~means Z~is an immediate value, \Hex{2}~means rZ is
1734a source operand, \Hex{4}~means Y~is an immediate value, \Hex{8}~means rY is a
1735source operand, \Hex{10}~means rX is a source operand, \Hex{20}~means
1736rX is a destination, \Hex{40}~means YZ is part of a relative address,
1737\Hex{80}~means the control changes at this point.
1738
1739@d X_is_dest_bit 0x20
1740@d rel_addr_bit 0x40
1741@d ctl_change_bit 0x80
1742
1743@<Glob...@>=
1744unsigned char flags[256]={
17450x8a, 0x2a, 0x2a, 0x2a, 0x2a, 0x26, 0x2a, 0x26, /* \.{TRAP}, \dots\ */
17460x26, 0x25, 0x26, 0x25, 0x26, 0x25, 0x26, 0x25, /* \.{FLOT}, \dots\ */
17470x2a, 0x2a, 0x2a, 0x2a, 0x2a, 0x26, 0x2a, 0x26, /* \.{FMUL}, \dots\ */
17480x2a, 0x29, 0x2a, 0x29, 0x2a, 0x29, 0x2a, 0x29, /* \.{MUL}, \dots\ */
17490x2a, 0x29, 0x2a, 0x29, 0x2a, 0x29, 0x2a, 0x29, /* \.{ADD}, \dots\ */
17500x2a, 0x29, 0x2a, 0x29, 0x2a, 0x29, 0x2a, 0x29, /* \.{2ADDU}, \dots\ */
17510x2a, 0x29, 0x2a, 0x29, 0x26, 0x25, 0x26, 0x25, /* \.{CMP}, \dots\ */
17520x2a, 0x29, 0x2a, 0x29, 0x2a, 0x29, 0x2a, 0x29, /* \.{SL}, \dots\ */
17530x50, 0x50, 0x50, 0x50, 0x50, 0x50, 0x50, 0x50, /* \.{BN}, \dots\ */
17540x50, 0x50, 0x50, 0x50, 0x50, 0x50, 0x50, 0x50, /* \.{BNN}, \dots\ */
17550x50, 0x50, 0x50, 0x50, 0x50, 0x50, 0x50, 0x50, /* \.{PBN}, \dots\ */
17560x50, 0x50, 0x50, 0x50, 0x50, 0x50, 0x50, 0x50, /* \.{PBNN}, \dots\ */
17570x3a, 0x39, 0x3a, 0x39, 0x3a, 0x39, 0x3a, 0x39, /* \.{CSN}, \dots\ */
17580x3a, 0x39, 0x3a, 0x39, 0x3a, 0x39, 0x3a, 0x39, /* \.{CSNN}, \dots\ */
17590x2a, 0x29, 0x2a, 0x29, 0x2a, 0x29, 0x2a, 0x29, /* \.{ZSN}, \dots\ */
17600x2a, 0x29, 0x2a, 0x29, 0x2a, 0x29, 0x2a, 0x29, /* \.{ZSNN}, \dots\ */
17610x2a, 0x29, 0x2a, 0x29, 0x2a, 0x29, 0x2a, 0x29, /* \.{LDB}, \dots\ */
17620x2a, 0x29, 0x2a, 0x29, 0x2a, 0x29, 0x2a, 0x29, /* \.{LDT}, \dots\ */
17630x2a, 0x29, 0x2a, 0x29, 0x3a, 0x39, 0x2a, 0x29, /* \.{LDSF}, \dots\ */
17640x2a, 0x29, 0x0a, 0x09, 0x0a, 0x09, 0xaa, 0xa9, /* \.{LDVTS}, \dots\ */
17650x1a, 0x19, 0x1a, 0x19, 0x1a, 0x19, 0x1a, 0x19, /* \.{STB}, \dots\ */
17660x1a, 0x19, 0x1a, 0x19, 0x1a, 0x19, 0x1a, 0x19, /* \.{STT}, \dots\ */
17670x1a, 0x19, 0x1a, 0x19, 0x0a, 0x09, 0x1a, 0x19, /* \.{STSF}, \dots\ */
17680x0a, 0x09, 0x0a, 0x09, 0x0a, 0x09, 0xaa, 0xa9, /* \.{SYNCD}, \dots\ */
17690x2a, 0x29, 0x2a, 0x29, 0x2a, 0x29, 0x2a, 0x29, /* \.{OR}, \dots\ */
17700x2a, 0x29, 0x2a, 0x29, 0x2a, 0x29, 0x2a, 0x29, /* \.{AND}, \dots\ */
17710x2a, 0x29, 0x2a, 0x29, 0x2a, 0x29, 0x2a, 0x29, /* \.{BDIF}, \dots\ */
17720x2a, 0x29, 0x2a, 0x29, 0x2a, 0x29, 0x2a, 0x29, /* \.{MUX}, \dots\ */
17730x20, 0x20, 0x20, 0x20, 0x30, 0x30, 0x30, 0x30, /* \.{SETH}, \dots\ */
17740x30, 0x30, 0x30, 0x30, 0x30, 0x30, 0x30, 0x30, /* \.{ORH}, \dots\ */
17750xc0, 0xc0, 0xe0, 0xe0, 0x60, 0x60, 0x02, 0x01, /* \.{JMP}, \dots\ */
17760x80, 0x80, 0x00, 0x02, 0x01, 0x00, 0x20, 0x8a}; /* \.{POP}, \dots\ */
1777
1778@ @<Convert relative...@>=
1779{
1780  if (i==jmp) yz=head->inst&0xffffff;
1781  if (op&1) yz-=(i==jmp? 0x1000000: 0x10000);
1782  cool->y.o=incr(head->loc,4), cool->y.p=NULL;
1783  cool->z.o=incr(head->loc,yz<<2), cool->z.p=NULL;
1784}
1785
1786@ The location of the next instruction to be fetched is in a \&{spec} variable
1787called |inst_ptr|. A slightly tricky optimization of the \.{POP} instruction
1788is made in the common case that the speculative value of~rJ is known.
1789
1790@<Redirect the fetch if control changes at this inst@>=
{@+register int predicted=0;
  if ((op&0xe0)==0x40) @<Predict a branch outcome@>;
  head->noted=true; /* this entry need not be examined again */
  head->hist=peek_hist; /* and this is the history to use next time */
  if (predicted||(f&ctl_change_bit) || (i==syncid&&!(cool->loc.h&sign_bit))) {
    old_tail=tail=new_head; /* discard all remaining fetches */
    @<Restart the fetch coroutine@>;
    switch (i) { /* now decide where fetching should resume */
 case jmp: case br: case pbr: case pushj: inst_ptr=cool->z;@+ break;
 case pop:@+if (g[rJ].up->known &&
          j<dispatch_max && !dispatch_lock && !nullifying) {
      inst_ptr.o=incr(g[rJ].up->o,yz<<2), inst_ptr.p=NULL;@+break;
      } /* otherwise fall through, will wait on |cool->go| */
 case go: case pushgo: case trap: case resume: case syncid:
    inst_ptr.p=UNKNOWN_SPEC;@+ break; /* the fetch must wait for dispatch */
 case trip: inst_ptr=zero_spec;@+ break;
    }
  }
}
1810
1811@ At any given time the simulated machine is in two main states, the
1812``hot state'' corresponding to instructions that have been committed and the
1813``cool state'' corresponding to all the speculative changes currently
1814being considered. The dispatcher works with cool instructions and puts them
1815into the reorder buffer, where they gradually get warmer and warmer.
1816Intermediate instructions, between |hot| and |cool|, have intermediate
1817temperatures.
1818
1819A machine register like l[101] or g[250] is represented by a specnode whose
1820|o|~field is the current hot value of the register. If the |up| and |down|
1821fields of this specnode point to the node itself,
1822the hot and cool values of the register are
1823identical. Otherwise |up| and |down| are pointers to the coolest and hottest
1824ends of a doubly linked list of specnodes, representing intermediate
1825speculative values (sometimes called ``rename registers'').
1826@^rename registers@>
1827The rename registers are implemented as the |x| or~|a| specnodes inside control
1828blocks, for speculative instructions that use this register as a
1829destination. Speculative instructions that use the register as a
1830source operand point to the next-hottest specnode on the list, until
1831the value becomes known. The doubly linked list of specnodes is an
1832input-restricted deque: A node is inserted at the cool end when the
1833dispatcher issues an instruction with this register as destination;
1834a node is removed from the cool end if an instruction needs to be deissued;
1835a node is removed from the hot end when an instruction is committed.
1836
1837The special registers rA, rB, \dots\ occupy the same array as the
1838global registers g[32], g[33], \dots~\thinspace. For example,
1839rB is internally the same as g[0], because |rB=0|.
1840
1841@<External v...@>=
1842Extern specnode g[256]; /* global registers and special registers */
1843Extern specnode *l; /* the ring of local registers */
1844Extern int lring_size; /* the number of on-chip local registers
1845         (must be a power of~2) */
1846Extern int max_rename_regs, max_mem_slots; /* capacity of reorder buffer */
1847Extern int rename_regs, mem_slots; /* currently unused capacity */
1848
1849@ Special register rC was the clock in the original definition of \MMIX.
1850But now the clock is just an external variable, called |ticks|.
1851
1852@<External v...@>=
1853Extern octa ticks; /* the internal clock */
1854
1855@ @<Glob...@>=
1856int lring_mask; /* for calculations modulo |lring_size| */
1857
1858@ The |addr| fields in the specnode lists for registers are used
1859to identify that register in diagnostic messages. Such addresses
1860are negative; memory addresses are positive.
1861
1862All registers are initially zero except rG, which is initially 255,
1863and rN, which has a constant value identifying the time of compilation.
1864(The macro \.{ABSTIME} is defined externally in the file \.{abstime.h},
1865which should have just been created by {\mc ABSTIME}\kern.05em;
1866{\mc ABSTIME} is
1867a trivial program that computes the value of the standard library function
1868|time(NULL)|. We assume that this number, which is the number of seconds in
1869the ``{\mc UNIX} epoch,'' is less than~$2^{32}$. Beware: Our assumption will
1870fail in February of 2106.)
1871@^system dependencies@>
1872
1873@d VERSION 1 /* version of the \MMIX\ architecture that we support */
1874@d SUBVERSION 0 /* secondary byte of version number */
1875@d SUBSUBVERSION 0 /* further qualification to version number */
1876
1877@<Initialize everything@>=
rename_regs=max_rename_regs; /* all of the capacity is unused at first */
mem_slots=max_mem_slots;
lring_mask=lring_size-1; /* valid as a mask because |lring_size| is a power of 2 */
for (j=0;j<256;j++) {
  g[j].addr.h=sign_bit, g[j].addr.l=j, g[j].known=true;
    /* the sign bit marks a register; |addr.l| is the register number */
  g[j].up=g[j].down=&g[j]; /* a list with no speculative values */
}
g[rG].o.l=255;
g[rN].o.h=(VERSION<<24)+(SUBVERSION<<16)+(SUBSUBVERSION<<8);
g[rN].o.l=ABSTIME; /* see comment and warning above */
for (j=0;j<lring_size;j++) {
  l[j].addr.h=sign_bit, l[j].addr.l=256+j, l[j].known=true;
    /* local registers are numbered from 256 upward */
  l[j].up=l[j].down=&l[j];
}
1892
1893@ @<Internal proto...@>=
1894static void print_specnode_id @,@,@[ARGS((octa))@];
1895
1896@ @<Sub...@>=
static void print_specnode_id(a)
  octa a; /* the |addr| field of a specnode */
{
  /* prints the name of the register or memory location that |a| identifies;
     nothing is printed when the high half of |a| is all 1s */
  if (a.h==sign_bit) { /* a register */
    /* the name is data, so it is passed as an argument, not as the format */
    if (a.l<32) printf("%s",special_name[a.l]);
    else if (a.l<256) printf("g[%d]",a.l);
    else printf("l[%d]",a.l-256); /* local registers are numbered from 256 */
  }@+else if (a.h!=(tetra)-1) {
    printf("m[");@+print_octa(a);@+printf("]");
  }
}
1908
1909@ The |specval| subroutine produces a \&{spec} corresponding to the
1910currently coolest value of a given local or global register.
1911
1912@<Internal proto...@>=
1913static spec specval @,@,@[ARGS((specnode*))@];
1914
1915@ @<Sub...@>=
static spec specval(r)
  specnode *r; /* the list head of a register */
{@+spec res;
  register specnode *coolest=r->up; /* the cool end of the list */
  if (!coolest->known) res.p=coolest; /* the value is still being computed */
  else {
    res.p=NULL;
    res.o=coolest->o;
  }
  return res;
}
1923
1924@ The |spec_install| subroutine introduces a new speculative value at
1925the cool end of a given doubly linked~list.
1926
1927@<Internal proto...@>=
1928static void spec_install @,@,@[ARGS((specnode*,specnode*))@];
1929
1930@ @<Sub...@>=
static void spec_install(r,t) /* put |t| at the cool end of list |r| */
  specnode *r,*t;
{@+register specnode *prev=r->up; /* the node that was at the cool end */
  t->addr=r->addr; /* |t| describes the same register or location as |r| */
  t->up=prev;@+ t->down=r;
  prev->down=t;@+ r->up=t;
}
1940
1941@ Conversely, |spec_rem| takes such a value out.
1942
1943@<Internal proto...@>=
1944static void spec_rem @,@,@[ARGS((specnode*))@];
1945
1946@ @<Sub...@>=
static void spec_rem(t) /* unlink |t| from the list that contains it */
  specnode *t;
{
  t->up->down=t->down; /* the |up| neighbor now bypasses |t| */
  t->down->up=t->up; /* and so does the |down| neighbor */
}
1952
1953@ Some special registers are so central to \MMIX's operation, they are
1954carried along with each control block in the reorder buffer instead of being
1955treated as source and destination registers of each instruction. For example,
1956the register stack pointers rO and~rS are treated in this way.
1957The normal specnodes for rO and~rS, namely |g[rO]| and~|g[rS]|,
1958are not actually used;
1959the cool values are called |cool_O| and |cool_S|.
1960(Actually |cool_O| and |cool_S| correspond to the register
1961values divided by~8, since rO and~rS are always multiples of~8.)
1962
1963The arithmetic status register, rA, is also treated specially. Its
1964event bits are kept up to date only at the ``hot'' end, by accumulating
1965values of |arith_exc|; an instruction
1966to \.{GET} the value of~rA will be executed only in the hot seat.
1967The other bits of~rA, which are needed to control trip handlers and
1968floating point rounding, are treated in the normal way.
1969
1970@<External v...@>=
1971Extern octa cool_O,cool_S; /* values of rO, rS before the |cool| instruction */
1972
1973@ @<Glob...@>=
1974int cool_L,cool_G; /* values of rL and rG before the |cool| instruction */
1975unsigned int cool_hist,peek_hist; /* history bits for branch prediction */
1976octa new_O,new_S; /* values of rO, rS after |cool| */
1977
1978@ @<Install default fields in the |cool| block@>=
cool->op=op; @+cool->i=i;
cool->xx=(head->inst>>16)&0xff;@+
cool->yy=(head->inst>>8)&0xff;@+
cool->zz=(head->inst)&0xff; /* the X, Y, and Z bytes of the instruction */
cool->loc=head->loc;
cool->y=cool->z=cool->b=cool->ra=zero_spec; /* operands default to zero */
cool->x.o=cool->a.o=cool->rl.o=zero_octa;
cool->x.known=false; cool->x.up=NULL;
cool->a.known=false; cool->a.up=NULL;
cool->rl.known=true; cool->rl.up=NULL;
cool->need_b=cool->need_ra=
  cool->ren_x=cool->mem_x=cool->ren_a=cool->set_l=false;
cool->arith_exc=cool->denin=cool->denout=0;
if ((head->loc.h&sign_bit) && !(g[rU].o.h&0x8000)) cool->usage=false;
else cool->usage=((op&(g[rU].o.h>>16))==g[rU].o.h>>24? true: false);
  /* |op|, masked by the second byte of rU, is compared with the first byte */
new_O=cool->cur_O=cool_O;@+ new_S=cool->cur_S=cool_S;
  /* rO and rS are assumed not to change, until we learn otherwise */
cool->interrupt=head->interrupt;
cool->hist=peek_hist;
cool->go.o=incr(cool->loc,4); /* the location of the following instruction */
cool->go.known=false, cool->go.addr.h=-1,cool->go.up=(specnode*)cool;
  /* |go.up| points back to this control block, not to a specnode */
cool->interim=cool->stack_alert=false;
2000
2001@ @<Dispatch an inst...@>=
if (new_cool==hot) goto stall; /* reorder buffer is full */
@<Make sure |cool_L| and |cool_G| are up to date@>;
@<Install the operand fields of the |cool| block@>;
if (f&X_is_dest_bit) @<Install register X as the destination, or insert
  an internal command and |goto dispatch_done| if X is marginal@>;
switch (i) { /* a few internal opcodes need more preparation */
@<Special cases of instruction dispatch@>@;
default: break;
}
dispatch_done:@; /* inserted internal commands jump directly to this point */
2012
2013@ The \.{UNSAVE} operation begins by loading register~rG from memory.
2014We don't really need to know the value of~rG until twelve other registers
2015have been unsaved, so we aren't fussy about it here.
2016
2017@<Make sure |cool_L| and |cool_G| are up to date@>=
if (!g[rL].up->known) goto stall; /* the coolest rL is still being computed */
cool_L=g[rL].up->o.l;
if (!g[rG].up->known && !(op==UNSAVE && cool->xx==1)) goto stall;
  /* an unknown rG is tolerated only when |op| is \.{UNSAVE} and X is 1 */
cool_G=g[rG].up->o.l;
2022
2023@ @<Install the operand fields of the |cool| block@>=
if (resuming)
  @<Insert special operands when resuming an interrupted operation@>@;
else{
  /* otherwise the bits of |f| tell which fields of the instruction are operands */
  if (f&0x10) @<Set |cool->b| from register X@>@; /* rX is a source */
  if (third_operand[op] && (cool->i!=trap))
    @<Set |cool->b| and/or |cool->ra| from special register@>;
  if (f&0x1) cool->z.o.l=cool->zz; /* Z is immediate */
  else if (f&0x2) @<Set |cool->z| from register Z@>@; /* rZ is a source */
  else if ((op&0xf0)==0xe0) @<Set |cool->z| as an immediate wyde@>;
    /* the sixteen opcodes that begin with \.{SETH} */
  if (f&0x4) cool->y.o.l=cool->yy; /* Y is immediate */
  else if (f&0x8) @<Set |cool->y| from register Y@>@; /* rY is a source */
}
2036
2037@ @<Set |cool->z| from register Z@>=
2038{
2039  if (cool->zz>=cool_G) cool->z=specval(&g[cool->zz]);
2040  else if (cool->zz<cool_L) cool->z=specval(&l[(cool_O.l+cool->zz)&lring_mask]);
2041}
2042
2043@ @<Set |cool->y| from register Y@>=
2044{
2045  if (cool->yy>=cool_G) cool->y=specval(&g[cool->yy]);
2046  else if (cool->yy<cool_L) cool->y=specval(&l[(cool_O.l+cool->yy)&lring_mask]);
2047}
2048
2049@ @<Set |cool->b| from register X@>=
2050{
2051  if (cool->xx>=cool_G) cool->b=specval(&g[cool->xx]);
2052  else if (cool->xx<cool_L)
2053    cool->b=specval(&l[(cool_O.l+cool->xx)&lring_mask]);
2054  if (f&rel_addr_bit) cool->need_b=true; /* |br|, |pbr| */
2055}
2056
2057@ If an operation requires a special register as third operand,
2058that register is listed in the |third_operand| table.
2059
2060@<Glob...@>=
2061unsigned char third_operand[256]={@/
2062  0,rA,0,0,rA,rA,rA,rA, /* \.{TRAP}, \dots\ */
2063  rA,rA,rA,rA,rA,rA,rA,rA, /* \.{FLOT}, \dots\ */
2064  rA,rE,rE,rE,rA,rA,rA,rA, /* \.{FMUL}, \dots\ */
2065  rA,rA,0,0,rA,rA,rD,rD, /* \.{MUL}, \dots\ */
2066  rA,rA,0,0,rA,rA,0,0, /* \.{ADD}, \dots\ */
2067  0,0,0,0,0,0,0,0, /* \.{2ADDU}, \dots\ */
2068  0,0,0,0,rA,rA,0,0, /* \.{CMP}, \dots\ */
2069  rA,rA,0,0,0,0,0,0, /* \.{SL}, \dots\ */
2070  0,0,0,0,0,0,0,0, /* \.{BN}, \dots\ */
2071  0,0,0,0,0,0,0,0, /* \.{BNN}, \dots\ */
2072  0,0,0,0,0,0,0,0, /* \.{PBN}, \dots\ */
2073  0,0,0,0,0,0,0,0, /* \.{PBNN}, \dots\ */
2074  0,0,0,0,0,0,0,0, /* \.{CSN}, \dots\ */
2075  0,0,0,0,0,0,0,0, /* \.{CSNN}, \dots\ */
2076  0,0,0,0,0,0,0,0, /* \.{ZSN}, \dots\ */
2077  0,0,0,0,0,0,0,0, /* \.{ZSNN}, \dots\ */
2078  0,0,0,0,0,0,0,0, /* \.{LDB}, \dots\ */
2079  0,0,0,0,0,0,0,0, /* \.{LDT}, \dots\ */
2080  0,0,0,0,0,0,0,0, /* \.{LDSF}, \dots\ */
2081  0,0,0,0,0,0,0,0, /* \.{LDVTS}, \dots\ */
2082  rA,rA,0,0,rA,rA,0,0, /* \.{STB}, \dots\ */
2083  rA,rA,0,0,0,0,0,0, /* \.{STT}, \dots\ */
2084  rA,rA,0,0,0,0,0,0, /* \.{STSF}, \dots\ */
2085  0,0,0,0,0,0,0,0, /* \.{SYNCD}, \dots\ */
2086  0,0,0,0,0,0,0,0, /* \.{OR}, \dots\ */
2087  0,0,0,0,0,0,0,0, /* \.{AND}, \dots\ */
2088  0,0,0,0,0,0,0,0, /* \.{BDIF}, \dots\ */
2089  rM,rM,0,0,0,0,0,0, /* \.{MUX}, \dots\ */
2090  0,0,0,0,0,0,0,0, /* \.{SETH}, \dots\ */
2091  0,0,0,0,0,0,0,0, /* \.{ORH}, \dots\ */
2092  0,0,0,0,0,0,0,0, /* \.{JMP}, \dots\ */
2093  rJ,0,0,0,0,0,0,255}; /* \.{POP}, \dots\ */
2094
2095@ The |cool->b| field is busy in operations like \.{STB} or \.{STSF},
2096which need~rA. So we use |cool->ra| instead, when rA is needed.
2097
2098@<Set |cool->b| and/or |cool->ra| from special register@>=
2099{
2100  if (third_operand[op]==rA || third_operand[op]==rE)
2101    cool->need_ra=true, cool->ra=specval(&g[rA]);
2102  if (third_operand[op]!=rA)
2103    cool->need_b=true, cool->b=specval(&g[third_operand[op]]);
2104}
2105
2106@ @<Set |cool->z| as an immediate wyde@>=
{  switch (op&3) { /* the low two bits of |op| select one of the four wydes */
case 0: cool->z.o.h=yz<<16;@+break; /* the leftmost wyde */
case 1: cool->z.o.h=yz;@+break;
case 2: cool->z.o.l=yz<<16;@+break;
case 3: cool->z.o.l=yz;@+break; /* the rightmost wyde */
}
  if (i!=set) { /* register X should also be the Y operand */
    cool->y=cool->b; cool->b=zero_spec;
  }
}
2117
2118@ @<Install register X...@>=
2119{
2120  if (cool->xx>=cool_G) {
2121    if (i!=pushgo && i!=pushj && i!=cswap)
2122      cool->ren_x=true,spec_install(&g[cool->xx],&cool->x);
2123  }@+else if (cool->xx<cool_L) {
2124    if (i!=cswap) cool->ren_x=true,
2125      spec_install(&l[(cool_O.l+cool->xx)&lring_mask],&cool->x);
2126  }@+else { /* we need to increase L before issuing |head->inst| */
2127 increase_L:@+ if (((cool_S.l-cool_O.l-cool_L-1)&lring_mask)==0)
2128      @<Insert an instruction to advance gamma@>@;
2129    else @<Insert an instruction to advance beta and L@>;
2130  }
2131}
2132
2133@ @<Check for sufficient rename registers...@>=
if (rename_regs<cool->ren_x+cool->ren_a) goto stall;
  /* |ren_x| and |ren_a| each count for one rename register when true */
if (cool->mem_x)
  if (mem_slots) mem_slots--;@+else goto stall;
rename_regs-=cool->ren_x+cool->ren_a;
  /* the rename registers are claimed only after both tests have passed */
2138
2139@ The |incrl| instruction
2140advances $\beta$ and~rL by~1 at a time when we know that $\beta\ne\gamma$,
2141in the ring of local registers.
2142
2143@<Insert an instruction to advance beta and L@>=
2144{
2145  cool->i=incrl;
2146  spec_install(&l[(cool_O.l+cool_L)&lring_mask],&cool->x);
2147  cool->need_b=cool->need_ra=false;
2148  cool->y=cool->z=zero_spec;
2149  cool->x.known=true; /* |cool->x.o=zero_octa| */
2150  spec_install(&g[rL],&cool->rl);
2151  cool->rl.o.l=cool_L+1;
2152  cool->ren_x=cool->set_l=true;
2153  op=SETH; /* this instruction to be handled by the simplest units */
2154  cool->interim=true;
2155  goto dispatch_done;
2156}
2157
2158@ The |incgamma| instruction advances $\gamma$ and rS by storing an octabyte
2159from the local register ring to virtual memory location |cool_S<<3|.
2160
2161@<Insert an instruction to advance gamma@>=
2162{
2163  cool->need_b=cool->need_ra=false;
2164  cool->i=incgamma;
2165  new_S=incr(cool_S,1);
2166  cool->b=specval(&l[cool_S.l&lring_mask]);
2167  cool->y.p=NULL, cool->y.o=shift_left(cool_S,3);
2168  cool->z=zero_spec;
2169  cool->mem_x=true, spec_install(&mem,&cool->x);
2170  op=STOU; /* this instruction needs to be handled by load/store unit */
2171  cool->interim=true;
2172  cool->stack_alert=!(cool->y.o.h&sign_bit);
2173  goto dispatch_done;
2174}
2175
2176@ The |decgamma| instruction decreases $\gamma$ and rS by loading an octabyte
2177from virtual memory location |(cool_S-1)<<3| into the local register ring.
2178The value of $\beta$ may need to be decreased too (by decreasing~rL).
2179
2180@<Insert an instruction to decrease gamma@>=
2181{
2182  if (cool_O.l+cool_L==cool_S.l+lring_size) { /* true when |cool_O.l+cool_L| lies exactly |lring_size| beyond |cool_S.l| */
2183      /* don't let $\gamma$ pass $\beta$ */
2184    if (cool->i==pop && cool->xx==cool_L && cool_L>1) {
2185      cool->i=or; /* we'll preserve the main result by moving it down */
2186      head->inst-=0x10000; /* decrease X field of \.{POP} in fetch buffer */
2187      op=OR;
2188      cool->y=specval(&l[(cool_O.l+cool->xx-1)&lring_mask]); /* the source is the ring entry at offset |cool->xx-1| */
2189      spec_install(&l[(cool_O.l+cool->xx-2)&lring_mask],&cool->x); /* the destination is one position lower */
2190    }@+else { /* decrease rL by 1 */
2191      spec_install(&g[rL],&cool->rl);
2192      cool->rl.o.l=cool_L-1;
2193      cool->set_l=true;
2194    }
2195  }
2196  if (cool->i!=or) { /* unless the interim |or| was chosen above, insert the actual |decgamma| */
2197    cool->i=decgamma;
2198    new_S=incr(cool_S,-1); /* |new_S| is |cool_S| minus one */
2199    cool->y.p=NULL, cool->y.o=shift_left(new_S,3); /* the load address is |new_S| shifted left by 3 */
2200    spec_install(&l[new_S.l&lring_mask],&cool->x); /* the output goes to the ring entry indexed by |new_S.l| */
2201    op=LDOU; /* this instruction needs to be handled by load/store unit */
2202    cool->ptr_a=(void*)mem.up; /* record the current |mem.up|, as the ordinary load cases do */
2203  }
2204  cool->z=cool->b=zero_spec; cool->need_b=false; /* the remaining setup is shared by both variants */
2205  cool->ren_x=cool->interim=true;
2206  goto dispatch_done;
2207}
2208
2209@ Storing into memory requires a doubly linked data list of specnodes
2210like the lists we use for local and global registers. In this case
2211the head of the list is called |mem|, and the |addr| fields are
2212physical addresses in memory.
2213
2214@<External v...@>=
2215Extern specnode mem; /* head of the doubly linked list of memory specnodes */
2216
2217@ The |addr| field of a memory specnode
2218is all 1s until the physical address has been computed.
2219
2220@<Initialize e...@>=
2221mem.addr.h=mem.addr.l=-1; /* both halves of |mem.addr| start out as all 1s */
2222mem.up=mem.down=&mem; /* the list head initially links to itself in both directions */
2223
2224@ The \.{CSWAP} operation is treated as a partial store, with \$X
2225as a secondary output. Partial store (|pst|) commands read an octabyte
2226from memory before they write it.
2227
2228@<Special cases of instruction dispatch@>=
2229case cswap: cool->ren_a=true; /* the secondary output |a| needs a rename register */
2230  spec_install(cool->xx>=cool_G? &g[cool->xx]: /* |a| is installed on register X, global or local as appropriate */
2231      &l[(cool_O.l+cool->xx)&lring_mask],&cool->a);
2232  cool->i=pst; /* from here on it is a partial store; control falls through the next two labels */
2233case st:@+ if ((op&0xfe)==STCO) cool->b.o.l=cool->xx; /* for the |STCO| opcode pair the X field itself becomes |b.o.l|; then fall through */
2234case pst:
2235 cool->mem_x=true, spec_install(&mem,&cool->x);@+ break; /* every store puts its output |x| on the |mem| list */
2236case ld: case ldunc: cool->ptr_a=(void *)mem.up;@+ break; /* loads record the current |mem.up| in |ptr_a| */
2237
2238@ When new data is \.{PUT} into special registers 8 or 15--20 (namely rC, rK,
2239rQ, rU, rV, rG, or~rL) it can affect many things. Therefore we stop
2240issuing further instructions until such \.{PUT}s are committed.
2241Moreover, we will see later that such drastic \.{PUT}s defer execution until
2242they reach the hot seat.
2243
2244@<Special cases of instruction dispatch@>=
2245case put:@+ if (cool->yy!=0 || cool->xx>=32) goto illegal_inst; /* Y must be zero and X must be less than 32 */
2246 if (cool->xx>=8) {
2247   if (cool->xx<=11 && cool->xx!=8) goto illegal_inst; /* X from 9 through 11 is illegal */
2248   if (cool->xx<=18 && !(cool->loc.h&sign_bit)) goto privileged_inst; /* X equal to 8, or from 12 through 18, needs the sign bit of |loc| */
2249 }
2250 if (cool->xx==8 || (cool->xx>=15 && cool->xx<=20)) freeze_dispatch=true; /* the drastic cases stop further dispatching */
2251 cool->ren_x=true, spec_install(&g[cool->xx],&cool->x);@+break;
2252@#
2253case get:@+ if (cool->yy || cool->zz>=32) goto illegal_inst; /* Y must be zero and Z must be less than 32 */
2254 if (cool->zz==rO) cool->z.o=shift_left(cool_O,3); /* rO is obtained from |cool_O| shifted left by 3 */
2255 else if (cool->zz==rS) cool->z.o=shift_left(cool_S,3); /* rS is obtained likewise from |cool_S| */
2256 else cool->z=specval(&g[cool->zz]);@+break; /* any other register is read with |specval| */
2257illegal_inst: cool->interrupt |= B_BIT;@+goto noop_inst;
2258case ldvts:@+ if (cool->loc.h&sign_bit) break; /* otherwise control falls into |privileged_inst| */
2259privileged_inst:  cool->interrupt |= K_BIT;
2260noop_inst: cool->i=noop;@+break; /* the offending instruction is replaced by |noop| */
2261
2262@ A \.{PUSHGO} instruction with $\rm X\ge G$ causes L to increase
2263momentarily by~1, even if $\rm L=G$.
2264But the value of~L will be decreased before the \.{PUSHGO}
2265is complete, so it will never actually exceed~G. Moreover, we needn't
2266insert an~|incrl| command.
2267
2268@<Special cases of instruction dispatch@>=
2269case pushgo: inst_ptr.p=&cool->go; /* then fall through to the code shared with |pushj| */
2270case pushj: {@+register int x=cool->xx;
2271  if (x>=cool_G) {
2272    if (((cool_S.l-cool_O.l-cool_L-1)&lring_mask)==0) /* the same test that the |increase_L| code makes before advancing gamma */
2273      @<Insert an instruction to advance gamma@>@;
2274    x=cool_L;@+ cool_L++; /* X is replaced by the old L, and L grows by one */
2275    cool->ren_x=true, spec_install(&l[(cool_O.l+x)&lring_mask],&cool->x); /* |x| is installed at ring offset |cool_O.l+x| */
2276  }
2277  cool->x.known=true, cool->x.o.h=0, cool->x.o.l=x; /* the value of |cool->x| is simply |x|, known at once */
2278  cool->ren_a=true, spec_install(&g[rJ],&cool->a);
2279  cool->a.known=true, cool->a.o=incr(cool->loc,4); /* rJ receives |loc| plus 4 */
2280  cool->set_l=true, spec_install(&g[rL],&cool->rl);
2281  cool->rl.o.l=cool_L-x-1; /* the new rL is |cool_L-x-1| */
2282  new_O=incr(cool_O,x+1); /* rO advances by |x+1| */
2283}@+break;
2284case syncid:@+if (cool->loc.h&sign_bit) break; /* otherwise control falls through to |go| */
2285case go: inst_ptr.p=&cool->go;@+break;
2286
2287@ We need to know the topmost ``hidden'' element of the register stack
2288when a \.{POP} instruction is dispatched. This element is usually
2289present in the local register ring, unless $\gamma=\alpha$.
2290
2291Once it is known, let $x$ be its least significant byte. We will
2292be decreasing rO by $x+1$, so we may have to decrease $\gamma$ repeatedly
2293in order to maintain the condition $\rm rS\le rO$.
2294
2295@<Special cases of instruction dispatch@>=
2296case pop:@+if (cool->xx && cool_L>=cool->xx)
2297      cool->y=specval(&l[(cool_O.l+cool->xx-1)&lring_mask]); /* |y| is the ring entry at offset |cool->xx-1|, used when X is nonzero and at most L */
2298pop_unsave:@+if (cool_S.l==cool_O.l)
2299    @<Insert an instruction to decrease gamma@>;
2300  {@+register tetra x; register int new_L;
2301    register specnode *p=l[(cool_O.l-1)&lring_mask].up; /* |p| is the |up| link of the ring entry just below |cool_O| */
2302    if (p->known) x=(p->o.l)&0xff;@+ else goto stall; /* |x| is the low byte of that value; stall until it is known */
2303    if ((tetra)(cool_O.l-cool_S.l)<=x)
2304      @<Insert an instruction to decrease gamma@>;
2305    new_O=incr(cool_O,-x-1); /* rO decreases by |x+1| */
2306    if (cool->i==pop) new_L=x+(cool->xx<=cool_L? cool->xx: cool_L+1);
2307    else new_L=x; /* entry at |pop_unsave| with another |cool->i| uses |x| alone */
2308    if (new_L>cool_G) new_L=cool_G; /* the new L never exceeds G */
2309    if (x<new_L)
2310      cool->ren_x=true, spec_install(&l[(cool_O.l-1)&lring_mask],&cool->x);
2311    cool->set_l=true, spec_install(&g[rL],&cool->rl);
2312    cool->rl.o.l=new_L;
2313    if (cool->i==pop) {
2314      cool->z.o.l=yz<<2; /* |yz| times 4 */
2315      if (inst_ptr.p==UNKNOWN_SPEC && new_head==tail) inst_ptr.p=&cool->go;
2316    }
2317    break;
2318  }
2319
2320@ @<Special cases of instruction dispatch@>=
2321case mulu: cool->ren_a=true, spec_install(&g[rH],&cool->a);@+break; /* the auxiliary output |a| is destined for rH */
2322case div: case divu: cool->ren_a=true, spec_install(&g[rR],&cool->a);@+break; /* the auxiliary output |a| is destined for rR */
2323
2324@ It's tempting to say that we could avoid taking up space in the reorder
2325buffer when no operation needs to be done.
2326A \.{JMP} instruction qualifies as a no-op in this sense,
2327because the change of control occurs before the execution stage.
2328However, even a no-op might have to be counted in the usage register~rU,
2329so it might get into the execution stage for that reason.
2330A no-op can also cause a protection interrupt, if it appears in a negative
2331location. Even more importantly, a program might get into a loop that consists
2332entirely of jumps and no-ops; then we wouldn't be able to interrupt it,
2333because the interruption mechanism needs to find the current location
2334in the reorder buffer! At least one functional unit therefore needs to provide
2335explicit support for \.{JMP}, \.{JMPB}, and \.{SWYM}.
2336
2337The \.{SWYM} instruction with |F_BIT| set is a special case: This is
2338a request from the fetch coroutine for an update to the IT-cache,
2339when the page table method isn't implemented in hardware.
2340
2341@<Special cases of instruction dispatch@>=
2342case noop:@+if (cool->interrupt&F_BIT) { /* only the special case with |F_BIT| needs any setup */
2343   cool->go.o=cool->y.o=cool->loc; /* both |go| and |y| receive the current location */
2344   inst_ptr=specval(&g[rT]); /* |inst_ptr| takes |specval| of rT */
2345 }
2346 break;
2347
2348@ @<Undo data structures set prematurely in the |cool| block...@>=
2349if (cool->ren_x || cool->mem_x) spec_rem(&cool->x); /* |x| was installed either as a register output or as a memory output */
2350if (cool->ren_a) spec_rem(&cool->a);
2351if (cool->set_l) spec_rem(&cool->rl);
2352if (inst_ptr.p==&cool->go) inst_ptr.p=UNKNOWN_SPEC; /* forget any reference from |inst_ptr| to |cool->go| */
2353break;
2354
2355@* The execution stages. \MMIX's {\it raison d'\^etre\/} is its ability
2356to execute instructions. So now we want to simulate the behavior of its
2357functional units.
2358
2359Each coroutine scheduled for action at the current tick of the clock has a
2360|stage| number corresponding to a particular subset of the \MMIX\ hardware.
2361For example, the coroutines with |stage=2| are the second stages in the
2362pipelines of the functional units. A coroutine with |stage=0| works
2363in the fetch unit. Several artificially large stage numbers
2364are used to control special coroutines that do things like write data
2365from buffers into memory.
2366
2367In this program the current coroutine of interest is called |self|; hence
2368|self->stage| is the current stage number of interest. Another key variable,
2369|self->ctl|, is called~|data|; this is the control block being operated on by
2370the current coroutine. We typically are simulating an operation in which
2371|data->x| is being computed as a function of |data->y| and |data->z|.
2372The |data| record has many fields, as described earlier when we defined
2373\&{control} structures; for example, |data->owner| is the same as
2374|self|, during the execution stage, if it is nonnull.
2375
2376This part of the simulator is written as if each functional unit is able to
2377handle all 256 operations. In practice, of course, a functional unit tends to
2378be much more specialized; the actual specialization is governed by the
2379dispatcher, which issues an instruction only to a functional unit that
2380supports it. Once an instruction has been dispatched, however, we can simulate
2381it most easily if we imagine that its functional unit is universal.
2382
2383Coroutines with higher |stage| numbers are processed first.
2384The three most important variables that govern a coroutine's behavior, once
2385|self->stage| is given, are the external operation code |data->op|, the
2386internal operation code |data->i|, and the value of |data->state|. We
2387typically have |data->state=0| when a coroutine is first fired~up.
2388
2389@<Local var...@>=
2390register coroutine *self; /* the current coroutine being executed */
2391register control *data; /* the |control| block of the current coroutine, namely |self->ctl| */
2392
2393@ When a coroutine has done all it wants to on a single cycle,
2394it says |goto done|. It will not be scheduled to do any further work
2395unless the |schedule| routine has been called since it began execution.
2396The |wait| macro is a convenient way to say ``Please schedule me to resume
2397again at the current |data->state|'' after a specified time; for example,
2398|wait(1)| will restart a coroutine on the next clock tick.
2399
2400@d wait(t)@+ {@+schedule(self,t,data->state);@+ goto done;@+} /* reschedule |self| after |t| ticks in its current state, and stop for now */
2401@d pass_after(t)  schedule(self+1,t,data->state) /* schedule the next stage, |self+1|, after |t| ticks */
2402@d sleep@+ {@+self->next=self;@+ goto done;@+} /* wait forever */
2403@d awaken(c,t)  schedule(c,t,c->ctl->state) /* schedule coroutine |c| after |t| ticks in the state of its own control block */
2404
2405@<Execute all coroutines scheduled for the current time@>=
2406cur_time++;@+ if (cur_time==ring_size) cur_time=0; /* advance the clock, wrapping at |ring_size| */
2407for (self=queuelist(cur_time);self!=&sentinel;self=sentinel.next) { /* |sentinel.next| is refreshed below, so it always names the next coroutine to run */
2408  sentinel.next=self->next;@+self->next=NULL; /* unschedule this coroutine */
2409  data=self->ctl;
2410  if (verbose&coroutine_bit) {
2411    printf(" running ");@+print_coroutine_id(self);@+printf(" ");
2412    print_control_block(data);@+printf("\n");
2413  }
2414  switch(self->stage) { /* the stage number selects the kind of action */
2415 case 0:@<Simulate an action of the fetch coroutine@>;
2416 case 1:@<Simulate the first stage of an execution pipeline@>;
2417 default:@<Simulate later stages of an execution pipeline@>;
2418 @t\4@>@<Cases for control of special coroutines@>;
2419  }
2420 terminate:@+if (self->lockloc) *(self->lockloc)=NULL,self->lockloc=NULL; /* release the lock, if any, that |self| holds */
2421 done:; /* target of |goto done|; the loop moves on to the next coroutine */
2422}
2423
2424@ A special coroutine whose |stage| number is |vanish| simply goes away
2425at its scheduled time.
2426
2427@<Cases for control of special...@>=
2428case vanish: goto terminate; /* nothing to do except release any lock at |terminate| */
2429
2430@ @<Glob...@>=
2431coroutine mem_locker; /* trivial coroutine that vanishes */
2432coroutine Dlocker; /* another trivial coroutine that vanishes */
2433control vanish_ctl; /* such coroutines share a common control block */
2434
2435@ @<Init...@>=
2436mem_locker.name="Locker";
2437mem_locker.ctl=&vanish_ctl; /* both lockers use the shared block |vanish_ctl| */
2438mem_locker.stage=vanish;
2439Dlocker.name="Dlocker";
2440Dlocker.ctl=&vanish_ctl;
2441Dlocker.stage=vanish;
2442vanish_ctl.go.o.l=4;
2443for (j=0;j<DTcache->ports;j++) DTcache->reader[j].ctl=&vanish_ctl; /* every reader port of every cache starts out with |vanish_ctl| */
2444if (Dcache) for (j=0;j<Dcache->ports;j++) Dcache->reader[j].ctl=&vanish_ctl; /* |Dcache| is tested because it may be null */
2445for (j=0;j<ITcache->ports;j++) ITcache->reader[j].ctl=&vanish_ctl;
2446if (Icache) for (j=0;j<Icache->ports;j++) Icache->reader[j].ctl=&vanish_ctl; /* |Icache| is tested likewise */
2447
2448@ Here is a list of the |stage| numbers for special coroutines to be
2449defined below.
2450
2451@<Header def...@>=
2452#define max_stage 99 /* exceeds all |stage| numbers; the special stages below count down from 98 */
2453#define vanish 98 /* special coroutine that just goes away */
2454#define flush_to_mem 97 /* coroutine for flushing from a cache to memory */
2455#define flush_to_S 96 /* coroutine for flushing from a cache to the S-cache */
2456#define fill_from_mem 95 /* coroutine for filling a cache from memory */
2457#define fill_from_S 94 /* coroutine for filling a cache from the S-cache */
2458#define fill_from_virt 93 /* coroutine for filling a translation cache */
2459#define write_from_wbuf 92 /* coroutine for emptying the write buffer */
2460#define cleanup 91 /* coroutine for cleaning the caches */
2461
2462@ At the very beginning of stage 1, a functional unit will stall if necessary
2463until its operands are available. As soon as the operands are all present, the
2464|state| is set nonzero and execution proper begins.
2465
2466@<Simulate the first stage of an execution pipeline@>=
2467switch1:@+ switch(data->state) { /* states 0 through 3 mean waiting for operands, starting, passing onward, and finishing; |goto switch1| re-enters here */
2468 case 0: @<Wait for input data if necessary; set |state=1| if it's there@>;
2469 case 1: @<Begin execution of an operation@>;
2470 case 2: @<Pass |data| to the next stage of the pipeline@>;
2471 case 3: @<Finish execution of an operation@>;
2472  @<Special cases for states in the first stage@>;
2473}
2474
2475@ If some of our input data has been computed by another coroutine on the
2476current cycle, we grab it now but wait for the next cycle. (An actual machine
2477wouldn't have latched the data until then.)
2478
2479@<Wait for input data if necessary; set |state=1| if it's there@>=
2480j=0; /* |j| gains 1 for each counted operand that still had a pointer, and 10 more for each needed one not yet known */
2481if (data->y.p) {
2482  j++;
2483  if (data->y.p->known) data->y.o=data->y.p->o, data->y.p=NULL; /* latch the value and drop the pointer */
2484  else j+=10;
2485}
2486if (data->z.p) {
2487  j++;
2488  if (data->z.p->known) data->z.o=data->z.p->o, data->z.p=NULL;
2489  else j+=10;
2490}
2491if (data->b.p) {
2492  if (data->need_b) j++; /* |b| is counted only when |need_b| is set */
2493  if (data->b.p->known) data->b.o=data->b.p->o, data->b.p=NULL; /* but it is latched whenever it is known */
2494  else if (data->need_b) j+=10;
2495}
2496if (data->ra.p) {
2497  if (data->need_ra) j++; /* |ra| is treated just like |b| */
2498  if (data->ra.p->known) data->ra.o=data->ra.p->o, data->ra.p=NULL;
2499  else if (data->need_ra) j+=10;
2500}
2501if (j<10) data->state=1; /* no needed operand is missing */
2502if (j) wait(1); /* otherwise we fall through to case 1 */
2503
2504@ Simple register-to-register instructions like \.{ADD} are assumed to take
2505just one cycle, but others like \.{FADD} almost certainly require more time.
2506This simulator can be configured so that \.{FADD} might take, say, four
2507pipeline stages of one cycle each ($1+1+1+1$), or two pipeline stages of two
2508cycles each ($2+2$), or a single unpipelined stage lasting four cycles (4),
2509etc. In any case the simulator computes the results now, for simplicity,
2510placing them in |data->x| and possibly also in |data->a| and/or
2511|data->interrupt|. The results will not be officially made |known| until
2512the proper time.
2513
2514@<Begin execution of an operation@>=
2515switch (data->i) { /* the internal opcode selects the computation */
2516  @<Cases to compute the results of register-to-register operation@>;
2517  @<Cases to compute the virtual address of a memory operation@>;
2518  @<Cases for stage 1 execution@>;
2519}
2520@<Set things up so that the results become |known| when they should@>; /* reached by the cases above that end with |break| */
2521
2522@ If the internal opcode |data->i| is |max_pipe_op| or less, a special
2523pipeline sequence like $1+1+1+1$ or $2+2$ or $15+10$, etc., has been
2524configured. Otherwise we assume that the pipeline sequence is simply~1.
2525
2526Suppose the pipeline sequence is $t_1+t_2+\cdots+t_k$. Each $t_j$ is
2527positive and less than~256, so we represent the sequence as a
2528string |pipe_seq[data->i]| of unsigned ``characters,'' terminated by~0.
2529Given such a string, we want to do the following: Wait $(t_1-1)$ cycles
2530and pass |data| to stage~2; wait $t_2$ cycles and pass |data| to stage~3;
2531\dots; wait $t_{k-1}$ cycles and pass |data| to stage~$k$; wait $t_k$ cycles
2532and make the results |known|.
2533
2534The value of |denin| is added to $t_1$; the value of |denout| is
2535added to~$t_k$.
2536
2537@<Set things up so that the results become |known| when they should@>=
2538data->state=3; /* by default the next step is to finish */
2539if (data->i<=max_pipe_op) {@+register unsigned char *s=pipe_seq[data->i]; /* |s| is the configured sequence of stage times */
2540  j=s[0]+data->denin; /* the first stage time plus |denin| */
2541  if (s[1]) data->state=2; /* more than one stage */
2542  else j+=data->denout; /* a single stage also absorbs |denout| */
2543  if (j>1) wait(j-1); /* resume after |j-1| ticks in the state just chosen */
2544}
2545goto switch1; /* no waiting is needed, so act on the new state at once */
2546
2547@ When we're in stage $j$, the coroutine for stage $j+1$ of the same functional
2548unit is |self+1|.
2549
2550@<Pass |data| to the next stage of the pipeline@>=
2551pass_data:@+
2552if ((self+1)->next) wait(1); /* stall if the next stage is occupied */
2553{@+register unsigned char *s=pipe_seq[data->i];
2554  j=s[self->stage]; /* time for the next stage; |s| is indexed from zero while stages count from one */
2555  if (s[self->stage+1]==0) j+=data->denout,data->state=3;
2556          /* the next stage is the last */
2557  pass_after(j); /* schedule |self+1| after |j| ticks */
2558}
2559passit: (self+1)->ctl=data; /* hand the control block to the next stage */
2560data->owner=self+1; /* and record that stage as the new owner */
2561goto done;
2562
2563@ @<Simulate later stages of an execution pipeline@>=
2564switch2:@+if (data->b.p && data->b.p->known) /* pick up |b| if it has become known in the meantime */
2565    data->b.o=data->b.p->o, data->b.p=NULL;
2566 switch(data->state) {
2567 case 0: panic(confusion("switch2")); /* state 0 is treated as an internal error in these stages */
2568 case 1: @<Begin execution of a stage-two operation@>;
2569 case 2: goto pass_data; /* shared with the first stage */
2570 case 3: goto fin_ex; /* shared with the first stage */
2571  @<Special cases for states in later stages@>;
2572}
2573
2574@ The default pipeline times use only one stage; they
2575can be overridden by |MMIX_config|. The total number of stages
2576supported by this simulator is limited to 90, since
2577it must never interfere with the |stage| numbers for special coroutines
2578defined below. (The author doesn't feel guilty about making this restriction.)
2579
2580@<External v...@>=
2581#define pipe_limit 90 /* maximum number of pipeline stages */
2582Extern unsigned char pipe_seq[max_pipe_op+1][pipe_limit+1]; /* each row is a sequence of stage times terminated by 0 */
2583
2584@ The simplest of all register-to-register operations is |set|,
2585which occurs for commands like \.{SETH} as well as for commands
2586like \.{GETA}. (We might as well start with the easy cases and work our
2587way up.)
2588
2589@<Cases to compute the results...@>=
2590case set: data->x.o=data->z.o;@+break; /* the result is simply a copy of |z| */
2591
2592@ Here are the basic boolean operations, which account for 24 of \MMIX's
2593256 opcodes.
2594
2595@<Cases to compute the results...@>=
2596case or: data->x.o.h=data->y.o.h | data->z.o.h; /* each operation is done separately on the high and low tetrabytes */
2597   data->x.o.l=data->y.o.l | data->z.o.l; break;
2598case orn: data->x.o.h=data->y.o.h |~data->z.o.h; /* |y| or the complement of |z| */
2599   data->x.o.l=data->y.o.l |~data->z.o.l; break;
2600case nor: data->x.o.h=~(data->y.o.h | data->z.o.h); /* complement of |y| or |z| */
2601   data->x.o.l=~(data->y.o.l | data->z.o.l); break;
2602case and: data->x.o.h=data->y.o.h & data->z.o.h;
2603   data->x.o.l=data->y.o.l & data->z.o.l; break;
2604case andn: data->x.o.h=data->y.o.h &~data->z.o.h; /* |y| and the complement of |z| */
2605   data->x.o.l=data->y.o.l &~data->z.o.l; break;
2606case nand: data->x.o.h=~(data->y.o.h & data->z.o.h); /* complement of |y| and |z| */
2607   data->x.o.l=~(data->y.o.l & data->z.o.l); break;
2608case xor: data->x.o.h=data->y.o.h ^ data->z.o.h;
2609   data->x.o.l=data->y.o.l ^ data->z.o.l; break;
2610case nxor: data->x.o.h=data->y.o.h ^~data->z.o.h; /* |y| exclusive-or the complement of |z| */
2611   data->x.o.l=data->y.o.l ^~data->z.o.l; break;
2612
2613@ The implementation of \.{ADDU} is only slightly more difficult.
2614It would be trivial except for the fact that internal opcode
2615|addu| is used not only for the \.{ADDU[I]} and \.{INC[M][H,L]} operations,
2616in which we simply want to add |data->y.o| to |data->z.o|, but also for
2617operations like \.{4ADDU}.
2618
2619@<Cases to compute the results...@>=
2620case addu: data->x.o=oplus((data->op&0xf8)==0x28?@| /* opcodes |0x28| through |0x2f| scale |y| before the addition */
2621          shift_left(data->y.o,1+((data->op>>1)&0x3)): data->y.o, data->z.o); /* the shift is by 1, 2, 3, or 4 places, chosen by two opcode bits */
2622 break;
2623case subu: data->x.o=ominus(data->y.o,data->z.o);@+ break; /* plain difference, with no overflow test */
2624
2625@ Signed addition and subtraction produce the same results as their
2626unsigned counterparts, but overflow must also be detected. Overflow
2627occurs when adding |y| to~|z| if and only if |y| and~|z| have the
2628same sign but their sum has a different sign. Overflow occurs in
2629the calculation |x=y-z| if and only if it occurs in the calculation~|y=x+z|.
2630
2631@<Cases to compute the results...@>=
2632case add: data->x.o=oplus(data->y.o,data->z.o);
2633  if (((data->y.o.h ^ data->z.o.h)&sign_bit)==0 && /* |y| and |z| agree in sign */
2634      ((data->y.o.h ^ data->x.o.h)&sign_bit)!=0) data->interrupt|=V_BIT; /* but the sum does not: overflow */
2635  break;
2636case sub: data->x.o=ominus(data->y.o,data->z.o);
2637  if (((data->x.o.h ^ data->z.o.h)&sign_bit)==0 && /* overflow of |x=y-z| is tested as overflow of |y=x+z| */
2638      ((data->y.o.h ^ data->x.o.h)&sign_bit)!=0) data->interrupt|=V_BIT;
2639  break;
2640
2641@ The shift commands might take more than one cycle, or they might even be
2642pipelined, if the default value of |pipe_seq[sh]| is changed. But we compute
2643shifts all at once here, because other parts of the simulator will take care
2644of the pipeline timing. (Notice that |shlu| is changed to |sh|, for this
2645reason. Similar changes to the internal op codes are made for other operators
2646below.)
2647
2648@d shift_amt (data->z.o.h || data->z.o.l>=64? 64: data->z.o.l) /* every shift amount of 64 or more is treated as 64 */
2649
2650@<Cases to compute the results...@>=
2651case shlu: data->x.o=shift_left(data->y.o,shift_amt);@+data->i=sh;@+ break;
2652case shl: data->x.o=shift_left(data->y.o,shift_amt);@+data->i=sh;
2653 {@+octa tmpo;
2654    tmpo=shift_right(data->x.o,shift_amt,0); /* shifting back must reproduce |y| */
2655   if (tmpo.h!=data->y.o.h || tmpo.l!=data->y.o.l) data->interrupt|=V_BIT; /* otherwise overflow is signalled */
2656 }@+break;
2657case shru: data->x.o=shift_right(data->y.o,shift_amt,1);@+data->i=sh;@+ break; /* the third argument is 1 here but 0 for |shr| */
2658case shr:  data->x.o=shift_right(data->y.o,shift_amt,0);@+data->i=sh;@+ break;
2659
2660@ The \.{MUX} operation has three operands, namely |data->y|, |data->z|,
2661and |data->b|; the third operand is the current (speculative) value of~rM, the
2662special mask register. Otherwise \.{MUX} is unexceptional.
2663
2664@<Cases to compute the results...@>=
2665case mux: data->x.o.h=(data->y.o.h&data->b.o.h)+(data->z.o.h&~data->b.o.h); /* the two masked parts occupy disjoint bits, so adding them acts like a bitwise or */
2666          data->x.o.l=(data->y.o.l&data->b.o.l)+(data->z.o.l&~data->b.o.l);
2667  break;
2668
2669@ Comparisons are a breeze.
2670
2671@<Cases to compute the results...@>=
2672case cmp:@+if ((data->y.o.h&sign_bit)>(data->z.o.h&sign_bit)) goto cmp_neg; /* only |y| has its sign bit set */
2673  if ((data->y.o.h&sign_bit)<(data->z.o.h&sign_bit)) goto cmp_pos; /* only |z| has; equal signs fall through to the unsigned comparison */
2674case cmpu:@+if (data->y.o.h<data->z.o.h) goto cmp_neg; /* compare the high halves first */
2675  if (data->y.o.h>data->z.o.h) goto cmp_pos;
2676  if (data->y.o.l<data->z.o.l) goto cmp_neg; /* then the low halves */
2677  if (data->y.o.l>data->z.o.l) goto cmp_pos;
2678 cmp_zero: break; /* |data->x| is zero */
2679 cmp_pos: data->x.o.l=1;@+ break; /* |data->x.o.h| is zero */
2680 cmp_neg: data->x.o=neg_one;@+ break;
2681
2682@ The other operations will be deferred until later, now that we understand
2683the basic ideas. But one more piece of code ought to be
2684written before we move on, because
2685it completes the execution stage for the simple cases already considered.
2686
2687The |ren_x| and |ren_a| fields tell us whether the |x| and/or |a|
2688fields contain valid information that should become officially known.
2689
2690@<Finish execution of an operation@>=
2691fin_ex:@+if (data->ren_x) data->x.known=true; /* a register result becomes officially known */
2692else if (data->mem_x) {
2693  data->x.known=true; /* so does a memory result */
2694  if (!(data->x.addr.h&0xffff0000)) data->x.addr.l&=-8; /* clear the low three address bits when the upper 16 bits of |addr.h| are zero */
2695}
2696if (data->ren_a) data->a.known=true;
2697if (data->loc.h&sign_bit)
2698  data->ra.o.l=0; /* no trips enabled for the operating system */
2699if (data->interrupt&0xffff) @<Handle interrupt at end of execution stage@>; /* only the low 16 bits of |interrupt| are examined here */
2700die: data->owner=NULL;@+goto terminate; /* this coroutine now fades away */
2701
2702@* The commission/deissue stage. Control blocks leave the reorder buffer
2703either at the hot end (when they're committed) or at the cool end
2704(when they're deissued). We hope most of them are committed, but
2705from time to time our speculation is incorrect and we must deissue
2706a sequence of instructions that prove to be unwanted. Deissuing must
2707take priority over committing, because the dispatcher cannot do anything
2708until the machine's cool state has stabilized.
2709
2710Deissuing changes the cool state by undoing the most recently issued
2711instructions, in reverse order. Committing changes the hot state by
2712doing the least recently issued instructions, in their original order.
2713Both operations are similar, so we assume that they take the same time;
2714at most |commit_max| instructions are deissued and/or committed on
2715each clock cycle.
2716
2717@<Deissue the coolest instruction@>=
2718{
2719  cool=(cool==reorder_top? reorder_bot: cool+1); /* move |cool| to the next higher slot, wrapping from |reorder_top| to |reorder_bot| */
2720  if (verbose&issue_bit) {
2721    printf("Deissuing ");@+print_control_block(cool);
2722    if (cool->owner) {@+printf(" ");@+print_coroutine_id(cool->owner);@+}
2723    printf("\n");
2724  }
2725  if (cool->ren_x) rename_regs++,spec_rem(&cool->x); /* give back each rename register and unlink its specnode */
2726  if (cool->ren_a) rename_regs++,spec_rem(&cool->a);
2727  if (cool->mem_x) mem_slots++,spec_rem(&cool->x); /* likewise for a memory slot */
2728  if (cool->set_l) spec_rem(&cool->rl);
2729  if (cool->owner) {
2730    if (cool->owner->lockloc) /* release any lock held by the owning coroutine */
2731      *(cool->owner->lockloc)=NULL, cool->owner->lockloc=NULL;
2732    if (cool->owner->next) unschedule(cool->owner); /* and cancel its pending schedule */
2733  }
2734  cool_O=cool->cur_O;@+ cool_S=cool->cur_S; /* restore the values of |cool_O| and |cool_S| saved in this block */
2735  deissues--;
2736}
2737
2738@ @<Commit the hottest instruction...@>=
2739{
2740  if (nullifying) @<Nullify the hottest instruction@>@;
2741  else {
2742    if (hot->i==get && hot->zz==rQ)
2743      new_Q=oandn(g[rQ].o,hot->x.o); /* |new_Q| is |oandn| of the current rQ and the value just fetched */
2744    else if (hot->i==put && hot->xx==rQ)
2745      hot->x.o.h |= new_Q.h, hot->x.o.l |= new_Q.l; /* a \.{PUT} to rQ keeps every bit recorded in |new_Q| */
2746    if (hot->mem_x) @<Commit to memory if possible, otherwise |break|@>;
2747    if (hot->stack_alert) stack_overflow=true; /* remember the alert until it can be reported */
2748    else if (stack_overflow && !hot->interim) { /* report it at the first committed instruction that is not interim */
2749      g[rQ].o.l|=STACK_OVERFLOW, new_Q.l|=STACK_OVERFLOW,stack_overflow=false;
2750      if (verbose&issue_bit) {
2751        printf(" setting rQ=");@+print_octa(g[rQ].o);@+printf("\n");
2752      }
2753    }
2754    if (verbose&issue_bit) {
2755      printf("Committing ");@+print_control_block(hot);@+printf("\n");
2756    }
2757    if (hot->ren_x) rename_regs++,hot->x.up->o=hot->x.o,spec_rem(&(hot->x)); /* copy the result into the node reached by |up|, free the rename register, and unlink */
2758    if (hot->ren_a) rename_regs++,hot->a.up->o=hot->a.o,spec_rem(&(hot->a));
2759    if (hot->set_l) hot->rl.up->o=hot->rl.o,spec_rem(&(hot->rl)); /* the same for rL, but without counting a rename register */
2760    if (hot->arith_exc) g[rA].o.l |= hot->arith_exc;
2761    if (hot->usage) {
2762      g[rU].o.l++;@+ if (g[rU].o.l==0) { /* carry from the low half into the high half */
2763        g[rU].o.h++;@+ if ((g[rU].o.h&0x7fff)==0) g[rU].o.h-=0x8000; /* when the low 15 bits of |o.h| wrap to zero, the carry into the next bit is undone */
2764      }
2765    }
2766  }
2767  if (hot->interrupt>=H_BIT) @<Begin an interruption and |break|@>; /* tested whether or not the instruction was nullified */
2768}
2769
2770@ A load or store instruction is ``nullified'' if it is about to be captured
2771by a trap interrupt. In such cases it will be the only item in the reorder
2772buffer; thus nullifying is sort of a cross between deissuing and
2773committing. (It is important to have stopped dispatching when nullification
2774is necessary, because instructions such as |incgamma| and
2775|decgamma| change~rS, and we need to change it back when an unexpected
2776interruption occurs.)
2777
2778@<Nullify the hottest instruction@>=
2779{
2780  if (verbose&issue_bit) {
2781    printf("Nullifying ");@+print_control_block(hot);@+printf("\n");
2782  }
2783  if (hot->ren_x) rename_regs++,spec_rem(&hot->x); /* resources are given back without copying any results */
2784  if (hot->ren_a) rename_regs++,spec_rem(&hot->a);
2785  if (hot->mem_x) mem_slots++,spec_rem(&hot->x);
2786  if (hot->set_l) spec_rem(&hot->rl);
2787  cool_O=hot->cur_O, cool_S=hot->cur_S; /* restore the values of |cool_O| and |cool_S| saved in |hot| */
2788  nullifying=false; /* the request has now been honored */
2789}
2790
2791@ Interrupt bits in rQ might be lost if they are set between a \.{GET}
2792and a~\.{PUT}. Therefore we don't allow \.{PUT} to zero out bits that
2793have become~1 since the most recently committed \.{GET}.
2794
2795@<Glob...@>=
2796octa new_Q; /* when rQ increases in any bit position, so should this */
2797bool stack_overflow; /* stack overflow seen but not yet reported in rQ */
2798
2799@ An instruction will not be committed immediately if it violates the basic
2800security rule of \MMIX: An instruction in a nonnegative location
2801should not be performed unless all eight of the internal interrupts
2802have been enabled in the interrupt mask register~rK.
2803Conversely, an instruction in a negative location should not be performed
2804if the |P_BIT| is enabled in~rK.
2805
2806Such instructions take one extra cycle before they are committed.
2807The nonnegative-location case turns on the |S_BIT| of both rK and~rQ\null,
2808leading to an immediate interrupt (unless the current instruction
2809is |trap|, |put|, or~|resume|).
2810
2811@<Check for security violation, |break| if so@>=
2812{
2813  if (hot->loc.h&sign_bit) { /* the instruction is in a negative location */
2814    if ((g[rK].o.h&P_BIT) && !(hot->interrupt&P_BIT)) { /* rK has |P_BIT| and this instruction has not yet been flagged */
2815      hot->interrupt |= P_BIT;
2816      g[rQ].o.h |= P_BIT;
2817      new_Q.h |= P_BIT; /* keep |new_Q| in step with rQ */
2818      if (verbose&issue_bit) {
2819        printf(" setting rQ=");@+print_octa(g[rQ].o);@+printf("\n");
2820      }
2821      break; /* the commit is postponed */
2822    }
2823  }@+else if ((g[rK].o.h&0xff)!=0xff && !(hot->interrupt&S_BIT)) { /* nonnegative location, and the low eight bits of |g[rK].o.h| are not all on */
2824    hot->interrupt |= S_BIT;
2825    g[rQ].o.h |= S_BIT;
2826    new_Q.h |= S_BIT;
2827    g[rK].o.h |= S_BIT; /* in this case rK gets the bit too */
2828    if (verbose&issue_bit) {
2829      printf(" setting rQ=");@+print_octa(g[rQ].o);
2830      printf(", rK=");@+print_octa(g[rK].o);@+printf("\n");
2831    }
2832    break; /* the commit is postponed */
2833  }
2834}
2835
2836@* Branch prediction. An \MMIX\ programmer distinguishes statically between
2837``branches'' and ``probable branches,'' but many modern computers attempt to
2838do better by implementing dynamic branch prediction. (See, for example,
2839section~4.3 of Hennessy and Patterson's {\sl Computer Architecture},
2840second edition.) Experience has shown that dynamic branch prediction can
2841@^Hennessy, John LeRoy@>
2842@^Patterson, David Andrew@>
2843significantly improve the performance of speculative execution, by
2844reducing the number of instructions that need to be deissued.
2845
2846This simulator has an optional |bp_table| containing $2^{\mkern1mua+b+c}$ entries of
2847$n$~bits each, where $n$ is between 1 and~8. Usually $n$ is 1 or~2 in
2848practice, but 8 bits are allocated per entry for convenience in this program.
2849The |bp_table| is consulted and updated on every branch instruction
2850(every \.{B}~or \.{PB} instruction, but not~\.{JMP}), for advice on
2851past history of similar situations. It is indexed by the $a$ least
2852significant bits of the address of the instruction, the $b$ most recent
2853bits of global branch history, and the next $c$ bits of both address
2854and history (exclusive-ored).
2855
2856A |bp_table| entry begins at zero and is regarded as a signed $n$-bit number.
2857If it is nonnegative, we will follow the prediction in the instruction,
2858namely to predict a branch taken only in the \.{PB} case. If it is
2859negative, we will predict the opposite of the instruction's recommendation.
2860The $n$-bit number is increased (if possible) if the instruction's
2861prediction was correct, decreased (if possible) if the instruction's
2862prediction was incorrect.
2863
2864(Incidentally, a large value of~$n$ is not necessarily a good idea.
2865For example, if $n=8$ the machine might need 128 steps to
2866recognize that a branch taken the first 150 times is not taken
2867the next 150 times. And if we modify the update criteria to avoid this
2868problem, we obtain a scheme that is rarely better than a simple scheme
2869with smaller~$n$.)
2870
2871The values $a$, $b$, $c$, and $n$ in this discussion are called
2872|bp_a|, |bp_b|, |bp_c|, and |bp_n| in the program.
2873
2874@<External v...@>=
2875Extern int bp_a,bp_b,bp_c,bp_n; /* parameters for branch prediction */
2876Extern char *bp_table; /* either |NULL| or an array of $2^{\mkern1mua+b+c}$ items */
2877
2878@ Branch prediction is made when we are either about to issue an
2879instruction or peeking ahead. We look at the |bp_table|, but we
2880don't want to update it yet.
2881
2882@<Predict a branch outcome@>=
2883{
2884  predicted=op&0x10; /* start with the instruction's recommendation */
2885  if (bp_table) {@+register int h;
2886    m=((head->loc.l&bp_cmask)<<bp_b)+(head->loc.l&bp_amask);
2887    m=((cool_hist&bp_bcmask)<<bp_a)^(m>>2);
2888    h=bp_table[m];
2889    if (h&bp_npower) predicted^=0x10;
2890  }
2891  if (predicted) peek_hist=(peek_hist<<1)+1;
2892  else peek_hist<<=1;
2893}
2894
2895@ We update the |bp_table| when an instruction is issued.
2896And we store the opposite table
2897value in |cool->x.o.l|, just in case our prediction turns out to be wrong.
2898
2899@<Record the result of branch prediction@>=
if (bp_table) {@+register int reversed,h,h_up,h_down;
  reversed=op&0x10;
  if (peek_hist&1) reversed^=0x10;
    /* now |reversed| is nonzero if the prediction differs from the
       instruction's recommendation */
  m=((head->loc.l&bp_cmask)<<bp_b)+(head->loc.l&bp_amask);
  m=((cool_hist&bp_bcmask)<<bp_a)^(m>>2);
    /* the same index computation as in the prediction step */
  h=bp_table[m];
  h_up=(h+1)&bp_nmask;@+ if (h_up==bp_npower) h_up=h;
    /* increase, but stay put at $2^{n-1}-1$ */
  if (h==bp_npower) h_down=h;@+ else h_down=(h-1)&bp_nmask;
    /* decrease, but stay put at $-2^{n-1}$ */
  if (reversed) {
    bp_table[m]=h_down, cool->x.o.l=h_up;
    cool->i=pbr+br-cool->i; /* reverse the sense */
    bp_rev_stat++;
  }@+else {
    bp_table[m]=h_up, cool->x.o.l=h_down; /* go with the flow */
    bp_ok_stat++;
  }
  if (verbose&show_pred_bit) {
    printf(" predicting ");@+print_octa(cool->loc);
    printf(" %s; bp[%x]=%d\n",reversed? "NG": "OK",m,
          bp_table[m]-((bp_table[m]&bp_npower)<<1));
      /* the last argument sign-extends the $n$-bit entry for display */
  }
  cool->x.o.h=m; /* remember the index, in case the entry must be corrected */
}
2923
2924@ The calculations in the previous sections need several precomputed constants,
2925depending on the parameters $a$, $b$, $c$, and~$n$.
2926
2927@<Initialize e...@>=
2928bp_amask=((1<<bp_a)-1)<<2; /* least $a$ bits of instruction address */
2929bp_cmask=((1<<bp_c)-1)<<(bp_a+2); /* the next $c$ address bits */
2930bp_bcmask=(1<<(bp_b+bp_c))-1; /* least $b+c$ bits of history info */
2931bp_nmask=(1<<bp_n)-1; /* least significant $n$ bits */
2932bp_npower=1<<(bp_n-1); /* $2^{n-1}$, the sign bit of an $n$-bit number */
2933
2934@ @<Glob...@>=
int bp_amask,bp_cmask,bp_bcmask,bp_nmask,bp_npower;
  /* constants precomputed from |bp_a|, |bp_b|, |bp_c|, and |bp_n| */
int bp_rev_stat,bp_ok_stat; /* how often we overrode and agreed */
int bp_bad_stat,bp_good_stat; /* how often we failed and succeeded */
2938
2939@ After a branch or probable branch instruction has been issued and
2940the value of the relevant register has been computed in the
2941reorder buffer as |data->b.o|, we're ready to determine if the
2942prediction was correct or not.
2943
2944@<Cases for stage 1 execution@>=
case br: case pbr: j=register_truth(data->b.o,data->op);
    /* does the register satisfy the condition of the opcode? */
  if (j) data->go.o=data->z.o;@+ else data->go.o=data->y.o;
    /* continue at |z| if it does, at |y| if it doesn't */
  /* |data->i| was possibly reversed when the instruction was issued,
     so |pbr| here means ``predicted to be taken'' */
  if (j==(data->i==pbr)) bp_good_stat++;
  else { /* oops, misprediction */
    bp_bad_stat++;
    @<Recover from incorrect branch prediction@>;
  }
  goto fin_ex;
2953
2954@ The |register_truth| subroutine is used by \.B, \.{PB}, \.{CS}, and
2955\.{ZS} commands to decide whether an octabyte satisfies the
2956conditions of the opcode, |data->op|.
2957
2958@<Internal proto...@>=
2959static int register_truth @,@,@[ARGS((octa,mmix_opcode))@];
2960
2961@ @<Sub...@>=
2962static int register_truth(o,op)
2963  octa o;
2964  mmix_opcode op;
2965{@+register int b;
2966  switch ((op>>1) & 0x3) {
2967 case 0: b=o.h>>31;@+break; /* negative? */
2968 case 1: b=(o.h==0 && o.l==0);@+break; /* zero? */
2969 case 2: b=(o.h<sign_bit && (o.h||o.l));@+break; /* positive? */
2970 case 3: b=o.l&0x1;@+break; /* odd? */
2971}
2972  if (op&0x8) return b^1;
2973  else return b;
2974}
2975
2976@ The |issued_between| subroutine determines how many speculative instructions
2977were issued between a given control block in the reorder buffer and
2978the current |cool| pointer, when |cc=cool|.
2979
2980@<Internal proto...@>=
2981static int issued_between @,@,@[ARGS((control*,control*))@];
2982
2983@ @<Sub...@>=
2984static int issued_between(c,cc)
2985  control *c,*cc;
2986{
2987  if (c>cc) return c-1-cc;
2988  return (c-reorder_bot)+(reorder_top-cc);
2989}
2990
2991@ If more than one functional unit is able to process branch instructions and
2992if two of them simultaneously discover misprediction, or if misprediction is
2993detected by one unit just as another unit is generating an interrupt, we
2994assume that an arbitration takes place so that only the hottest one actually
2995deissues the cooler instructions.
2996
2997Changes to the |bp_table| aren't undone when they were made on speculation in
2998an instruction being deissued; nor do we worry about cases where the same
2999|bp_table| entry is being updated by two or more active coroutines. After all,
3000the |bp_table| is just a heuristic, not part of the real computation.
3001We correct the |bp_table| only if we discover that a prediction was wrong, so
3002that we will be less likely to make the same mistake later.
3003
3004@<Recover from incorrect branch prediction@>=
i=issued_between(data,cool);
if (i<deissues) goto die;
  /* another unit has already requested more deissuing than we would */
deissues=i;
old_tail=tail=head;@+resuming=0; /* clear the fetch buffer */
@<Restart the fetch coroutine@>;
inst_ptr.o=data->go.o, inst_ptr.p=NULL; /* fetch from the correct address next */
if (!(data->loc.h&sign_bit)) {
  if (inst_ptr.o.h&sign_bit) data->interrupt |= P_BIT;
  else data->interrupt &=~P_BIT;
    /* a branch at a nonnegative location gets |P_BIT| exactly when its
       destination is negative */
}
if (bp_table) {
  bp_table[data->x.o.h]=data->x.o.l; /* this is what we should have stored */
  if (verbose&show_pred_bit) {
    printf(" mispredicted ");@+print_octa(data->loc);
    printf("; bp[%x]=%d\n",data->x.o.h,
          data->x.o.l-((data->x.o.l&bp_npower)<<1));
      /* the last argument sign-extends the $n$-bit entry for display */
  }
}
cool_hist=(j? (data->hist<<1)+1: data->hist<<1);
  /* |data->hist| (presumably the history when this branch was issued),
     followed by the true outcome |j| */
3024
3025@ @<External proto...@>=
3026Extern void print_stats @,@,@[ARGS((void))@];
3027
3028@ @<External r...@>=
3029void print_stats()
3030{
3031  register int j;
3032  if (bp_table)
3033    printf("Predictions: %d in agreement, %d in opposition; %d good, %d bad\n",
3034                 bp_ok_stat,bp_rev_stat,bp_good_stat,bp_bad_stat);
3035  else printf("Predictions: %d good, %d bad\n",bp_good_stat,bp_bad_stat);
3036  printf("Instructions issued per cycle:\n");
3037  for (j=0;j<=dispatch_max;j++)
3038    printf("  %d   %d\n",j,dispatch_stat[j]);
3039}
3040
3041@* Cache memory. It's time now to consider \MMIX's MMU, the memory management
3042unit. This part of the machine deals with the critical problem of getting data
3043to and from the computational units. In a RISC architecture all interaction
3044between main memory and the computer registers is specified by load and store
3045instructions; thus memory accesses are much easier to deal with than they
3046would be on a machine with more complex kinds of interaction. But memory
3047management is still difficult, if we want to do it well, because main memory
3048typically operates at a much slower speed than the registers do. High-speed
3049implementations of \MMIX\ introduce intermediate ``caches'' of storage in
3050order to keep the most important data accessible, and cache maintenance can be
3051complicated when all the details are taken into account.
3052(See, for example, Chapter 5 of Hennessy and Patterson's
3053{\sl Computer Architecture}, second edition.)
3054@^Hennessy, John LeRoy@>
3055@^Patterson, David Andrew@>
3056@^caches@>
3057
3058This simulator can be configured to have up to three auxiliary caches between
3059registers and memory: An I-cache for instructions, a D-cache for data, and an
3060S-cache for both instructions and data. The S-cache, also called a {\it
3061secondary cache}, is supported only if both I-cache and D-cache are present.
3062Arbitrary access times for each cache can be specified independently;
3063we might assume, for example, that data items in the I-cache or D-cache can
3064be sent to a register in one or two clock cycles, but the access time for the
3065S-cache might be say 5 cycles, and main memory might require 20 cycles or more.
3066Our speculative pipeline can have many functional units handling load
3067and store instructions, but only one load or store instruction can be
3068updating the D-cache or S-cache or main memory at a time. (However, the
3069D-cache can have several read ports; furthermore, data might
3070be passing between the S-cache and memory while other data is passing
3071between the reorder buffer and the D-cache.)
3072
3073Besides the optional I-cache, D-cache, and S-cache, there are required caches
3074called the IT-cache and DT-cache, for translation of virtual addresses to
3075physical addresses. A translation cache is often called a ``translation
3076@^TLB@>
3077@^translation caches@>
3078lookaside buffer'' or TLB; but we call it a cache since it is implemented in
3079nearly the same way as an I-cache.
3080
3081@ Consider a cache that has blocks of $2^b$~bytes each and
3082associativity~$2^a$; here $b\ge3$ and $a\ge0$. The I-cache, D-cache, and
3083S-cache are addressed by 48-bit physical addresses, as if they were part of
3084main memory; but the IT and DT caches are addressed by 64-bit keys, obtained
3085from a virtual address by blanking out the lower $s$ bits and inserting the
3086value of~$n$, where the page size~$s$ and the process number~$n$ are found
3087in~rV. We will consider all caches to be addressed by 64-bit keys, so that
3088both cases are handled with the same basic methods.
3089
3090Given a 64-bit key,
3091we ignore the low-order $b$~bits and use the next $c$~bits
3092to address the {\it cache set\/}; then the remaining $64-b-c$ bits should
3093match one of $2^a$ {\it tags\/} in that set. The case $a=0$ corresponds to a
3094so-called {\it direct-mapped\/} cache; the case $c=0$ corresponds to a
3095so-called {\it fully associative\/} cache. With $2^c$ sets of $2^a$ blocks
3096each, and $2^b$ bytes per block, the cache contains $2^{a+b+c}$ bytes of data,
3097in addition to the space needed for tags. Translation caches have $b=3$ and
3098they also usually have $c=0$.
3099
3100If a tag matches the specified bits, we ``hit'' in the cache and can
3101use and/or update the data found there. Otherwise we ``miss,'' and
3102we probably want to replace one of the cache blocks by the block containing
3103the item sought. The item chosen for replacement is called a {\it victim}.
3104The choice of victim is forced when the cache is direct-mapped, but four
3105strategies for victim selection are available when we must choose from
3106among $2^a$ entries for $a>0$:
3107
3108\smallskip\textindent{$\bullet$} ``Random'' selection chooses the victim
3109by extracting the least significant $a$~bits of the clock.
3110
3111\smallskip\textindent{$\bullet$} ``Serial'' selection chooses 0, 1, \dots,
3112$2^a-1$, 0, 1, \dots, $2^a-1$, 0, \dots~on successive trials.
3113
3114\smallskip\textindent{$\bullet$} ``LRU (Least Recently Used)'' selection
3115chooses the victim that ranks last if items are ranked inversely to the time
3116that has elapsed since their previous use.
3117
3118\smallskip\textindent{$\bullet$} ``Pseudo-LRU'' selection chooses the
3119victim by a rough approximation to LRU that is simpler to implement
3120in hardware. It requires a bit table $r_1\ldots r_{2^a-1}$.
3121Whenever we use an item
3122with binary address $(i_1\ldots i_a)_2$ in the set, we adjust the
3123bit table as follows:
3124$$r_1\gets1-i_1,\quad r_{1i_1}\gets1-i_2,\quad\ldots,\quad
3125r_{1i_1\ldots i_{a-1}}\gets1-i_a;$$
3126here the subscripts on~$r$ are binary numbers. (For example, when $a=3$,
3127the use of element $(010)_2$ sets $r_1\gets1$, $r_{10}\gets0$, $r_{101}\gets1$,
3128where $r_{101}$ means the same as $r_5$.) To select a victim, we start with
3129$l\gets1$ and then repeatedly set $l\gets2l+r_l$, $a$~times; then we
3130choose element $l-2^a$. When $a=1$, this scheme is equivalent to LRU.
3131When $a=2$, this scheme was implemented in the Intel 80486 chip.
3132
3133@<Type...@>=
typedef enum {@!random,@!serial,@!pseudo_lru,@!lru} replace_policy;
  /* the order matters: |note_usage| and |demote_usage| test |policy<=serial| */
3135
3136@ A cache might also include a ``victim'' area, which contains the
3137last $2^v$ victim blocks removed from the main cache area. The victim
3138area can be searched in parallel with the specified cache set, thereby
3139increasing the chance of a hit without making the search go slower.
Each of the four replacement policies can be used also in the victim cache.
3141
3142@ A cache also has a {\it granularity\/} $2^g$, where $b\ge g\ge3$.  This
3143means that we maintain, for each cache block, a set of $2^{b-g}$ ``dirty
3144bits,'' which identify the $2^g$-byte groups that have possibly changed since
3145they were last read from memory. Thus if $g=b$, an entire cache block is
3146either dirty or clean; if $g=3$, the dirtiness of each octabyte is maintained
3147separately.
3148
3149Two policies are available when new data is written into all or part
3150of a cache block. We can {\it write-through}, meaning that we send all new data
3151to memory immediately and never mark anything dirty; or we can {\it
3152write-back}, meaning that we update the memory from the cache only when
3153absolutely necessary. Furthermore we can {\it write-allocate},
3154meaning that we keep the new data in the cache, even if the cache block being
3155written has to be fetched first because of a miss; or we can {\it
3156write-around}, meaning that we keep the new data only if it was part of an
3157existing cache block.
3158
3159(In this discussion, ``memory'' is shorthand for ``the next level
3160of the memory hierarchy''; if there is an S-cache, the I-cache and
3161D-cache write new data to the S-cache, not directly to memory. The I-cache,
3162IT-cache, and DT-cache are read-only, so they do not need the facilities
3163discussed in this section. Moreover, the D-cache and S-cache can be assumed to
3164have the same granularity.)
3165
3166@<Header def...@>=
3167#define WRITE_BACK 1 /* use this if not write-through */
3168#define WRITE_ALLOC 2 /* use this if not write-around */
3169
3170@ We have seen that many flavors of cache can be simulated. They are
3171represented by \&{cache} structures, containing arrays of \&{cacheset}
3172structures that contain arrays of \&{cacheblock} structures
3173for the individual blocks. We use a full byte to store each |dirty| bit,
3174and we use full integer words to store |rank| fields for LRU processing, etc.;
3175memory economy is less important than simplicity in this simulator.
3176
3177@<Type...@>=
typedef struct{
  octa tag; /* bits of key not included in the cache block address */
  char *dirty; /* array of $2^{b-g}$ dirty bits, one per granule */
  octa *data; /* array of $2^{b-3}$ octabytes, the data in a cache block */
  int rank; /* auxiliary information for non-|random| policies */
} cacheblock;
3184@#
3185typedef cacheblock *cacheset; /* array of $2^a$ or $2^v$ blocks */
3186@#
3187typedef struct{
3188  int a,b,c,g,v; /* lg of associativity, blocksize, setsize, granularity,
3189         and victimsize */
3190  int aa,bb,cc,gg,vv; /* associativity, blocksize, setsize, granularity,
3191         and victimsize (all powers of~2) */
3192  int tagmask; /* $-2^{b+c}$ */
3193  replace_policy repl,vrepl; /* how to choose victims and victim-victims */
3194  int mode; /* optional |WRITE_BACK| and/or |WRITE_ALLOC| */
3195  int access_time; /* cycles to know if there's a hit */
3196  int copy_in_time; /* cycles to copy a new block into the cache */
3197  int copy_out_time; /* cycles to copy an old block from the cache */
3198  cacheset *set; /* array of $2^c$ sets of arrays of cache blocks */
3199  cacheset victim; /* the victim cache, if present */
3200  coroutine filler; /* a coroutine for copying new blocks into the cache */
3201  control filler_ctl; /* its control block */
3202  coroutine flusher; /* a coroutine for writing dirty old data
3203                           from the cache */
3204  control flusher_ctl; /* its control block */
3205  cacheblock inbuf; /* filling comes from here */
3206  cacheblock outbuf; /* flushing goes to here */
3207  lockvar lock; /* nonzero when the cache is being changed significantly */
3208  lockvar fill_lock; /* nonzero when filler should pass data back */
3209  int ports; /* how many coroutines can be reading the cache? */
3210  coroutine *reader; /* array of coroutines that might be reading
3211                                    simultaneously */
3212  char *name; /* |"Icache"|, for example */
3213} cache;
3214
3215@ @<External v...@>=
3216Extern cache *Icache, *Dcache, *Scache, *ITcache, *DTcache;
3217
3218@ Now we are ready to define some basic subroutines for cache maintenance.
3219Let's begin with a trivial routine that tests if a given cache block is dirty.
3220
3221@<Internal proto...@>=
3222static bool is_dirty @,@,@[ARGS((cache*,cacheblock*))@];
3223
3224@ @<Sub...@>=
3225static bool is_dirty(c,p)
3226  cache *c; /* the cache containing it */
3227  cacheblock *p; /* a cache block */
3228{
3229  register int j;
3230  register char *d=p->dirty;
3231  for (j=0;j<c->bb;d++,j+=c->gg) if (*d) return true;
3232  return false;
3233}
3234
3235@ For diagnostic purposes we might want to display an entire cache block.
3236
3237@<Internal proto...@>=
3238static void print_cache_block @,@,@[ARGS((cacheblock,cache*))@];
3239
3240@ @<Sub...@>=
3241static void print_cache_block(p,c)
3242  cacheblock p;
3243  cache *c;
3244{@+register int i,j,b=c->bb>>3,g=c->gg>>3;
3245  printf("%08x%08x: ",p.tag.h,p.tag.l);
3246  for (i=j=0; j<b;j++,i+=((j&(g-1))?0:1))
3247    printf("%08x%08x%c",p.data[j].h,p.data[j].l,p.dirty[i]?'*':' ');
3248  printf(" (%d)\n",p.rank);
3249}
3250
3251@ @<Internal proto...@>=
3252static void print_cache_locks @,@,@[ARGS((cache*))@];
3253
3254@ @<Sub...@>=
3255static void print_cache_locks(c)
3256  cache *c;
3257{
3258  if (c) {
3259    if (c->lock) printf("%s locked by %s:%d\n",
3260                    c->name,c->lock->name,c->lock->stage);
3261    if (c->fill_lock) printf("%sfill locked by %s:%d\n",
3262                    c->name,c->fill_lock->name,c->fill_lock->stage);
3263  }
3264}
3265
3266@ The |print_cache| routine prints the entire contents of a cache. This can be
3267a huge amount of data, but it can be very useful when debugging. Fortunately,
3268the task of debugging favors the use of small caches, since interesting cases
3269arise more often when a cache is fairly small.
3270
3271@<External proto...@>=
3272Extern void print_cache @,@,@[ARGS((cache*,bool))@];
3273
3274@ @<External r...@>=
3275void print_cache(c,dirty_only)
3276  cache *c;
3277  bool dirty_only;
3278{
3279  if (c) {@+register int i,j;
3280    printf("%s of %s:",dirty_only?"Dirty blocks":"Contents",c->name);
3281    if (c->filler.next) {
3282      printf(" (filling ");
3283      print_octa(c->name[1]=='T'? c->filler_ctl.y.o: c->filler_ctl.z.o);
3284      printf(")");
3285    }
3286    if (c->flusher.next) {
3287      printf(" (flushing ");
3288      print_octa(c->outbuf.tag);
3289      printf(")");
3290    }
3291    printf("\n");
3292    @<Print all of |c|'s cache blocks@>;
3293  }
3294}
3295
3296@ We don't print the cache blocks that have an invalid tag, unless
3297requested to be verbose.
3298
3299@<Print all of |c|'s cache blocks@>=
/* first the main area: block |j| of set |i|; a tag with its sign bit on is invalid */
for (i=0;i<c->cc;i++) for (j=0;j<c->aa;j++)
  if ((!(c->set[i][j].tag.h&sign_bit)||(verbose&show_wholecache_bit))&&@|
       (!dirty_only || is_dirty(c,&c->set[i][j]))) {
    printf("[%d][%d] ",i,j);
    print_cache_block(c->set[i][j],c);
  }
/* then the |vv| blocks of the victim area, filtered in the same way */
for (j=0;j<c->vv;j++)
  if ((!(c->victim[j].tag.h&sign_bit)||(verbose&show_wholecache_bit))&&@|
       (!dirty_only || is_dirty(c,&c->victim[j]))) {
    printf("V[%d] ",j);
    print_cache_block(c->victim[j],c);
  }
3312
3313@ The |clean_block| routine simply initializes a given cache block.
3314
3315@<External proto...@>=
3316Extern void clean_block @,@,@[ARGS((cache*,cacheblock*))@];
3317
3318@ @<External r...@>=
3319void clean_block(c,p)
3320  cache *c;
3321  cacheblock *p;
3322{
3323  register int j;
3324  p->tag.h=sign_bit, p->tag.l=0;
3325  for (j=0;j<c->bb>>3;j++) p->data[j]=zero_octa;
3326  for (j=0;j<c->bb>>c->g;j++) p->dirty[j]=false;
3327}
3328
3329@ The |zap_cache| routine invalidates all tags of a given cache,
3330effectively restoring it to its initial condition.
3331
3332@<External proto...@>=
3333Extern void zap_cache @,@,@[ARGS((cache*))@];
3334
3335@ We clear the |dirty| entries here, just to be tidy, although
3336they could actually be left in arbitrary condition when the tags are invalid.
3337
3338@<External r...@>=
3339void zap_cache(c)
3340  cache *c;
3341{
3342  register int i,j;
3343  for (i=0;i<c->cc;i++) for (j=0;j<c->aa;j++) {
3344    clean_block(c,&(c->set[i][j]));
3345  }
3346  for (j=0;j<c->vv;j++) {
3347    clean_block(c,&(c->victim[j]));
3348  }
3349}
3350
3351@ The |get_reader| subroutine finds the index of
3352an available reader coroutine for a given cache, or returns a negative value
3353if no readers are available.
3354
3355@<Internal proto...@>=
3356static int get_reader @,@,@[ARGS((cache*))@];
3357
3358@ @<Sub...@>=
3359static int get_reader(c)
3360  cache *c;
3361{@+ register int j;
3362  for (j=0;j<c->ports;j++)
3363    if (c->reader[j].next==NULL) return j;
3364  return -1;
3365}
3366
3367@ The subroutine |copy_block(c,p,cc,pp)| copies the dirty
3368items from block~|p| of cache~|c| into block~|pp| of cache~|cc|, assuming
3369that the destination cache has a sufficiently large block size.
3370(In other words, we assume that |cc->b>=c->b|.) We also assume that both
3371blocks have compatible tags, and that both caches have the same granularity.
3372
3373@<Internal proto...@>=
3374static void copy_block @,@,@[ARGS((cache*,cacheblock*,cache*,cacheblock*))@];
3375
3376@ @<Sub...@>=
static void copy_block(c,p,cc,pp)
  cache *c,*cc;
  cacheblock *p,*pp;
{
  register int j,jj,i,ii,lim; register int off=p->tag.l&(cc->bb-1);
    /* |off| is the byte offset of block |p| within the larger block |pp| */
  if (c->g!=cc->g || p->tag.h!=pp->tag.h || p->tag.l-off!=pp->tag.l)
    panic(confusion("copy block")); /* granularities or tags are incompatible */
  for (j=0,jj=off>>c->g;j<c->bb>>c->g;j++,jj++) if (p->dirty[j]) {
      /* granule |j| of |p| corresponds to granule |jj| of |pp| */
    pp->dirty[jj]=true;
    for (i=j<<(c->g-3),ii=jj<<(c->g-3),lim=(j+1)<<(c->g-3);
              i<lim;i++,ii++) pp->data[ii]=p->data[i];
      /* copy the $2^{g-3}$ octabytes of that granule */
  }
}
3390
3391@ The |choose_victim| subroutine selects the victim to be replaced when we
3392need to change a cache~set. We need only one bit of the |rank| fields to
3393implement the $r$~table when |policy=pseudo_lru|,
3394and we don't need |rank| at all when |policy=random|. Of course we use an
3395$a$-bit counter to implement |policy=serial|. In the other case,
3396|policy=lru|, we need an $a$-bit |rank| field; the least recently used entry
3397has rank~0, and the most recently used entry has rank~$2^a-1=|aa|-1$.
3398
3399@<Internal proto...@>=
3400static cacheblock* choose_victim @,@,@[ARGS((cacheset,int,replace_policy))@];
3401
3402@ @<Sub...@>=
static cacheblock* choose_victim(s,aa,policy)
  cacheset s;
  int aa; /* setsize */
  replace_policy policy;
{
  register cacheblock *p;
  register int l,m;
  switch (policy) {
 case random: return &s[ticks.l&(aa-1)]; /* low-order bits of the clock */
 case serial: l=s[0].rank;@+ s[0].rank=(l+1)&(aa-1);@+ return &s[l];
    /* the counter lives in the |rank| field of block~0 */
 case lru: for (p=s;p<s+aa;p++)
    if (p->rank==0) return p; /* rank 0 is the least recently used */
  panic(confusion("lru victim")); /* what happened? nobody has rank zero */
 case pseudo_lru: for (l=1,m=aa>>1; m; m>>=1) l=l+l+s[l].rank;
    /* that loop descends the bit table, $l\gets2l+r_l$, once per level */
   return &s[l-aa];
  }
}
3420
3421@ The |note_usage| subroutine updates the |rank| entries to record the
3422fact that a particular block in a cache set is now being used.
3423
3424@<Internal proto...@>=
3425static void note_usage @,@,@[ARGS((cacheblock*,cacheset,int,replace_policy))@];
3426
3427@ @<Sub...@>=
3428static void note_usage(l,s,aa,policy)
3429  cacheblock *l; /* a cache block that's probably worth preserving */
3430  cacheset s; /* the set that contains $l$ */
3431  int aa; /* setsize */
3432  replace_policy policy;
3433{
3434  register cacheblock *p;
3435  register int j,m,r;
3436  if (aa==1 || policy<=serial) return;
3437  if (policy==lru) {
3438    r=l->rank;
3439    for (p=s;p<s+aa;p++) if (p->rank>r) p->rank--;
3440    l->rank=aa-1;
3441  } else { /* |policy==pseudo_lru| */
3442    r=l-s;
3443    for (j=1,m=aa>>1;m;m>>=1)
3444      if (r&m) s[j].rank=0,j=j+j+1;
3445      else s[j].rank=1, j=j+j;
3446  }
3447  return;
3448}
3449
3450@ The |demote_usage| subroutine is sort of the opposite of |note_usage|;
3451it changes the rank of a given block to {\it least\/} recently used.
3452
3453@<Internal proto...@>=
3454static void demote_usage @,@,@[ARGS((cacheblock*,cacheset,int,replace_policy))@];
3455
3456@ @<Sub...@>=
3457static void demote_usage(l,s,aa,policy)
3458  cacheblock *l; /* a cache block we probably don't need */
3459  cacheset s; /* the set that contains $l$ */
3460  int aa; /* setsize */
3461  replace_policy policy;
3462{
3463  register cacheblock *p;
3464  register int j,m,r;
3465  if (aa==1 || policy<=serial) return;
3466  if (policy==lru) {
3467    r=l->rank;
3468    for (p=s;p<s+aa;p++) if (p->rank<r) p->rank++;
3469    l->rank=0;
3470  } else { /* |policy==pseudo_lru| */
3471    r=l-s;
3472    for (j=1,m=aa>>1;m;m>>=1)
3473      if (r&m) s[j].rank=1,j=j+j+1;
3474      else s[j].rank=0, j=j+j;
3475  }
3476  return;
3477}
3478
3479@ The |cache_search| routine looks for a given key $\alpha$
3480in a given cache, and returns a cache block if there's a hit; otherwise
3481it returns~|NULL|. If the search hits, the set in which the block was
3482found is stored in global variable |hit_set|. Notice that we need to check
3483more bits of the tag when we search in the victim area.
3484
3485@d cache_addr(c,alf) c->set[(alf.l&~(c->tagmask))>>c->b]
3486
3487@<Internal proto...@>=
3488static cacheblock* cache_search @,@,@[ARGS((cache*,octa))@];
3489
3490@ @<Sub...@>=
3491static cacheblock* cache_search(c,alf)
3492  cache *c; /* the cache to be searched */
3493  octa alf; /* the key */
3494{
3495  register cacheset s;
3496  register cacheblock* p;
3497  s=cache_addr(c,alf); /* the set corresponding to |alf| */
3498  for (p=s;p<s+c->aa;p++)
3499    if (((p->tag.l ^ alf.l)&c->tagmask)==0 && p->tag.h==alf.h) goto hit;
3500  s=c->victim;
3501  if (!s) return NULL; /* cache miss, and no victim area */
3502  for (p=s;p<s+c->vv;p++)
3503    if (((p->tag.l^alf.l)&(-c->bb))==0 && p->tag.h==alf.h) goto hit;
3504  return NULL; /* double miss */
3505 hit: hit_set=s;@+ return p;
3506}
3507
3508@ @<Glob...@>=
3509cacheset hit_set;
3510
3511@ If |p=cache_search(c,alf)| hits and if we call |use_and_fix(c,p)|
3512immediately afterwards, cache~|c| is updated to record the usage of
3513key~|alf|. A hit in the victim area moves the cache block to the main area,
3514unless the |filler| routine of cache~|c| is active.
3515A pointer to the (possibly moved) cache block is returned.
3516
3517@<Internal proto...@>=
3518static cacheblock* use_and_fix @,@,@[ARGS((cache*,cacheblock*))@];
3519
3520@ @<Sub...@>=
3521static cacheblock *use_and_fix(c,p)
3522  cache *c;
3523  cacheblock *p;
3524{
3525  if (hit_set!=c->victim) note_usage(p,hit_set,c->aa,c->repl);
3526  else { note_usage(p,hit_set,c->vv,c->vrepl); /* found in victim cache */
3527    if (!c->filler.next) {
3528      register cacheset s=cache_addr(c,p->tag);
3529      register cacheblock *q=choose_victim(s,c->aa,c->repl);
3530      note_usage(q,s,c->aa,c->repl);
3531      @<Swap cache blocks |p| and |q|@>;
3532      return q;
3533    }
3534  }
3535  return p;
3536}
3537
3538@ We can simply permute the pointers inside the cacheblock structures of a
3539cache, instead of copying the data, if we are careful not to let any of those
3540pointers escape into other data structures.
3541
3542@<Swap cache blocks |p| and |q|@>=
3543{
3544  octa t;
3545  register char *d=p->dirty;
3546  register octa *dd=p->data;
3547  t=p->tag;@+p->tag=q->tag;@+q->tag=t;
3548  p->dirty=q->dirty;@+q->dirty=d;
3549  p->data=q->data;@+q->data=dd;
3550}
3551
3552@ The |demote_and_fix| routine is analogous to |use_and_fix|,
3553except that we don't want to promote the data we found.
3554
3555@<Internal proto...@>=
3556static cacheblock* demote_and_fix @,@,@[ARGS((cache*,cacheblock*))@];
3557
3558@ @<Sub...@>=
3559static cacheblock *demote_and_fix(c,p)
3560  cache *c;
3561  cacheblock *p;
3562{
3563  if (hit_set!=c->victim) demote_usage(p,hit_set,c->aa,c->repl);
3564  else demote_usage(p,hit_set,c->vv,c->vrepl);
3565  return p;
3566}
3567
3568@ The subroutine |load_cache(c,p)| is called at a moment when
3569|c->lock| has been set and |c->inbuf| has been filled with clean data
3570to be placed in the cache block~|p|.
3571
3572@<Internal proto...@>=
3573static void load_cache @,@,@[ARGS((cache*,cacheblock*))@];
3574
3575@ @<Sub...@>=
3576static void load_cache(c,p)
3577  cache *c;
3578  cacheblock *p;
3579{
3580  register int i;
3581  register octa *d;
3582  for (i=0;i<c->bb>>c->g;i++) p->dirty[i]=false;
3583  d=p->data;@+ p->data=c->inbuf.data;@+ c->inbuf.data=d;
3584  p->tag=c->inbuf.tag;
3585  hit_set=cache_addr(c,p->tag);@+
3586  use_and_fix(c,p); /* |p| not moved */
3587}
3588
3589@ The subroutine |flush_cache(c,p,keep)| is called at a ``quiet''
3590moment when |c->flusher.next=NULL|.
3591It puts cache block~|p| into |c->outbuf| and
3592fires up the |c->flusher| coroutine, which will take care of
3593sending the data to lower levels of the memory hierarchy.
3594Cache block~|p| is also marked clean.
3595
3596@<Internal proto...@>=
3597static void flush_cache @,@,@[ARGS((cache*,cacheblock*,bool))@];
3598
3599@ @<Sub...@>=
3600static void flush_cache(c,p,keep)
3601  cache *c;
3602  cacheblock *p; /* a block inside cache |c| */
3603  bool keep; /* should we preserve the data in |p|? */
3604{
3605    register octa *d;
3606    register char *dd;
3607    register int j;
3608    c->outbuf.tag=p->tag;
3609    if (keep)@+ for (j=0;j<c->bb>>3;j++) c->outbuf.data[j]=p->data[j];
3610    else d=c->outbuf.data, c->outbuf.data=p->data, p->data=d;
3611    dd=c->outbuf.dirty, c->outbuf.dirty=p->dirty, p->dirty=dd;
3612    for (j=0;j<c->bb>>c->g;j++) p->dirty[j]=false;
3613    p->rank=c->bb; /* this many valid bytes */
3614    startup(&c->flusher,c->copy_out_time); /* will not be aborted */
3615}
3616
3617@ The |alloc_slot| routine is called when we wish to put new information
3618into a cache after a cache miss. It returns a pointer to a cache block
3619in the main area where the new information should be put. The tag of
3620that cache block is invalidated; the calling routine should take care
3621of filling it and giving it a valid tag in due time. The cache's |filler|
3622routine should not be active when |alloc_slot| is called.
3623
3624Inserting new information might also require writing old information
3625into the next level of the memory hierarchy, if the block being replaced
3626is dirty. This routine returns |NULL| in such cases if the cache is
3627flushing a previously discarded block.
3628Otherwise it schedules the |flusher| coroutine.
3629
3630This routine returns |NULL| also if the given key happens to be in the
3631cache. Such cases are rare, but the following scenario shows that
3632they aren't impossible: Suppose the DT-cache access time is 5, the D-cache
3633access time is~1, and two processes simultaneously look for the
3634same physical address. One process hits in DT-cache but misses in D-cache,
3635waiting 5 cycles before trying |alloc_slot| in the D-cache; meanwhile
3636the other process missed in D-cache but didn't need to use the DT-cache,
3637so it might have updated the D-cache.
3638
3639A key value is never negative. Therefore we can invalidate the tag in
3640the chosen slot by forcing it to be negative.
3641
3642@<Internal proto...@>=
3643static cacheblock* alloc_slot @,@,@[ARGS((cache*,octa))@];
3644
3645@ @<Sub...@>=
3646static cacheblock* alloc_slot(c,alf)
3647  cache *c;
3648  octa alf; /* key that probably isn't in the cache */
3649{
3650  register cacheset s;
3651  register cacheblock *p,*q;
3652  if (cache_search(c,alf)) return NULL;
3653  if (c->flusher.next && c->outbuf.tag.h==alf.h &&
3654        !((c->outbuf.tag.l^alf.l)&-c->bb)) return NULL;
3655  s=cache_addr(c,alf); /* the set corresponding to |alf| */
3656  if (c->victim) p=choose_victim(c->victim,c->vv,c->vrepl);
3657  else p=choose_victim(s,c->aa,c->repl);
3658  if (is_dirty(c,p)) {
3659    if (c->flusher.next) return NULL;
3660    flush_cache(c,p,false);
3661  }
3662  if (c->victim) {
3663    q=choose_victim(s,c->aa,c->repl);
3664    @<Swap cache blocks...@>;
3665    q->tag.h |= sign_bit; /* invalidate the tag */
3666    return q;
3667  }
3668  p->tag.h |= sign_bit;@+ return p;
3669}
3670
3671@* Simulated memory. How should we deal with the potentially gigantic
3672memory of~\MMIX? We can't simply declare an array~$m$ that has
3673$2^{48}$ bytes. (Indeed, up to $2^{63}$ bytes are needed, if we
3674consider also the physical addresses $\ge2^{48}$ that are reserved for
3675memory-mapped input/output.)
3676
3677We could regard memory as a special kind of cache,
3678in which every access is required to hit. For example, such an ``M-cache''
3679could be fully associative, with $2^a$ blocks each
3680having a different tag; simulation could proceed until more than~$2^a-1$ tags
3681are required. But then the predefined value of~$a$ might well be so large that
3682the sequential search of our |cache_search| routine would be too slow.
3683
3684Instead, we will allocate memory in chunks of $2^{16}$ bytes at a time,
3685as needed, and we will use hashing to search for the relevant chunk
3686whenever a physical address is given. If the address is $2^{48}$ or greater,
3687special routines called |spec_read| and |spec_write|, supplied by the
3688user, will be called upon to do the reading or writing. Otherwise
3689the 48-bit address consists of a 32-bit {\it chunk address\/} and a
369016-bit {\it chunk offset}.
3691
3692Chunk addresses that are not used take no space in this simulator. But if,
3693say, 1000 such patterns occur, the simulator will dynamically allocate
3694approximately 65MB for the portions of main memory that are used.
3695Parameter |mem_chunks_max| specifies the largest number of different chunk
3696addresses that are supported. This parameter does not constrain the range of
3697simulated physical addresses, which cover the entire 256 large-terabyte range
3698permitted by~\MMIX.
3699
3700@<Type...@>=
typedef struct {
  tetra tag; /* 32-bit chunk address; |mem_read| and |mem_write| compare
                it with their hash key */
  octa *chunk; /* either |NULL| or an array of $2^{13}$ octabytes */
} chunknode;
3705
3706@ The parameter |hash_prime| should be a prime number larger than the
3707parameter
3708|mem_chunks_max|, preferably more than twice as large but not much bigger
3709than~that. The default values |mem_chunks_max=1000| and |hash_prime=2003| are
3710set by |MMIX_config| unless the user specifies otherwise.
3711
3712@<External v...@>=
3713Extern int mem_chunks; /* this many chunks are allocated so far */
3714Extern int mem_chunks_max; /* up to this many different chunks per run */
3715Extern int hash_prime; /* larger than |mem_chunks_max|, but not enormous */
3716Extern chunknode *mem_hash; /* the simulated main memory */
3717
3718@ The separately compiled procedures |spec_read()| and |spec_write()| have the
3719same calling conventions as the general procedures
3720|mem_read()| and |mem_write()|, but with an additional |size| parameter,
3721which specifies that |1<<size| bytes should be read or written.
3722
3723@<Sub...@>=
3724extern octa spec_read @,@,@[ARGS((octa addr,int size))@];
3725 /* for memory mapped I/O */
3726extern void spec_write @,@,@[ARGS((octa addr,octa val,int size))@];
3727 /* likewise */
3728
3729@ If the program tries to read from a chunk that hasn't been allocated,
3730the value zero is returned, optionally with a comment to the user.
3731
3732Chunk address 0 is always allocated first. Then we can assume that
3733a matching chunk tag implies a nonnull |chunk| pointer.
3734
3735This routine sets |last_h| to the chunk found, so that we can rapidly read
3736other words that we know must belong to the same chunk. For this purpose
3737it is convenient to let |mem_hash[hash_prime]| be a chunk full of zeros,
3738representing uninitialized memory.
3739
3740@<External proto...@>=
3741Extern octa mem_read @,@,@[ARGS((octa addr))@];
3742
3743@ @<External r...@>=
3744octa mem_read(addr)
3745  octa addr;
3746{
3747  register tetra off,key;
3748  register int h;
3749  off=(addr.l&0xffff)>>3;
3750  key=(addr.l&0xffff0000)+addr.h;
3751  for (h=key%hash_prime;mem_hash[h].tag!=key;h--) {
3752    if (mem_hash[h].chunk==NULL) {
3753      if (verbose&uninit_mem_bit)
3754        errprint2("uninitialized memory read at %08x%08x",addr.h,addr.l);
3755@.uninitialized memory...@>
3756      h=hash_prime;@+ break; /* zero will be returned */
3757    }
3758    if (h==0) h=hash_prime;
3759  }
3760  last_h=h;
3761  return mem_hash[h].chunk[off];
3762}
3763
3764@ @<External v...@>=
3765Extern int last_h; /* the hash index that was most recently correct */
3766
3767@ @<External proto...@>=
3768Extern void mem_write @,@,@[ARGS((octa addr,octa val))@];
3769
3770@ @<External r...@>=
3771void mem_write(addr,val)
3772  octa addr,val;
3773{
3774  register tetra off,key;
3775  register int h;
3776  off=(addr.l&0xffff)>>3;
3777  key=(addr.l&0xffff0000)+addr.h;
3778  for (h=key%hash_prime;mem_hash[h].tag!=key;h--) {
3779    if (mem_hash[h].chunk==NULL) {
3780      if (++mem_chunks>mem_chunks_max)
3781        panic(errprint1("More than %d memory chunks are needed",
3782@.More...chunks are needed@>
3783                 mem_chunks_max));
3784      mem_hash[h].chunk=(octa *)calloc(1<<13,sizeof(octa));
3785      if (mem_hash[h].chunk==NULL)
3786        panic(errprint1("I can't allocate memory chunk number %d",
3787@.I can't allocate...@>
3788                 mem_chunks));
3789      mem_hash[h].tag=key;
3790      break;
3791    }
3792    if (h==0) h=hash_prime;
3793  }
3794  last_h=h;
3795  mem_hash[h].chunk[off]=val;
3796}
3797
3798@ The memory is characterized by several parameters, depending on the
3799characteristics of the memory bus being simulated. Let |bus_words|
3800be the number of octabytes read or written simultaneously (usually
3801|bus_words| is 1 or~2; it must be a power of~2). The number of clock
3802cycles needed to read or write |c*bus_words| octabytes that all belong to the
3803same cache block is assumed to be |mem_addr_time+c*mem_read_time| or
3804|mem_addr_time+c*mem_write_time|, respectively.
3805
3806@<External v...@>=
3807Extern int mem_addr_time; /* cycles to transmit an address on memory bus */
3808Extern int bus_words; /* width of memory bus, in octabytes */
3809Extern int mem_read_time; /* cycles to read from main memory */
3810Extern int mem_write_time; /* cycles to write to main memory */
3811Extern lockvar mem_lock; /* is nonnull when the bus is busy */
3812
3813@ One of the principal ways to write memory is to invoke
3814a |flush_to_mem| coroutine,
3815which is the |Scache->flusher| if there is an S-cache, or the
3816|Dcache->flusher| if there is a D-cache but no S-cache.
3817
3818When such a coroutine is started, its |data->ptr_a| will be |Scache|
3819or~|Dcache|. The data to be written will just have been copied to the cache's
3820|outbuf|.
3821
3822@<Cases for control of special coroutines@>=
case flush_to_mem: {@+register cache *c=(cache *)data->ptr_a;
   /* |c| is the cache whose |outbuf| will be written to memory */
 switch (data->state) {
  case 0:@+ if (mem_lock) wait(1); /* the memory bus is busy */
    data->state=1;
  case 1: set_lock(self,mem_lock); /* now the bus is ours */
    data->state=2; /* state 2 follows the bus delay */
    @<Write the dirty data of |c->outbuf| and wait for the bus@>;
  case 2: goto terminate; /* this frees |mem_lock| and |c->outbuf| */
 }
}
3833
3834@ @<Write the dirty data of |c->outbuf| and wait for the bus@>=
{
  register int off,last_off,count,first,ii;
  register int del=c->gg>>3; /* octabytes per granule */
  octa addr;
  addr=c->outbuf.tag;@+ off=(addr.l&0xffff)>>3;
    /* |off| is the octabyte index of |addr| within its chunk */
  for (i=j=0,first=1,count=0;j<c->bb>>c->g;j++) {
      /* |j| counts granules; |i| counts octabytes of |c->outbuf.data| */
    ii=i+del; /* index of the first octabyte after granule |j| */
    if (!c->outbuf.dirty[j]) i=ii,off+=del,addr.l+=del<<3;
      /* a clean granule is skipped without writing */
    else@+ while (i<ii) {
      if (first) { /* the first octabyte goes through |mem_write|,
                      which also sets |last_h| */
        count++;@+ last_off=off;@+ first=0;
        mem_write(addr,c->outbuf.data[i]);
      }@+else {
        if ((off^last_off)&(-bus_words)) count++;
          /* |off| and |last_off| differ above the low bits that
             distinguish octabytes within one group of |bus_words|,
             so another bus transfer is counted */
        last_off=off;
        mem_hash[last_h].chunk[off]=c->outbuf.data[i];
          /* later octabytes are stored directly in the chunk found
             by the first write */
      }
      i++;@+ off++;@+ addr.l+=8;
    }
  }
  wait(mem_addr_time+count*mem_write_time);
    /* one address time, plus one write time per counted transfer */
}
3857
3858@* Cache transfers. We have seen that the |Dcache->flusher| sends
3859data directly to the main memory if there is no S-cache.
3860But if both D-cache and S-cache exist, the |Dcache->flusher| is a
3861more complicated coroutine of type |flush_to_S|. In this case we need
3862to deal with the fact that the S-cache blocks might be larger than
3863the D-cache blocks; furthermore, the S-cache might have a
3864write-around and/or write-through policy, etc. But one simplifying
3865fact does help us: We know that the flusher coroutine will not be
3866aborted until it has run to completion.
3867
3868Some machines, such as the Alpha 21164, have an additional cache between
3869@^Alpha computers@>
3870the S-cache and memory, called the B-cache (the ``backup cache''). A B-cache
3871could be simulated by extending the logic used here; but such extensions
3872of the present program are left to the interested reader.
3873
3874@<Cases for control of special coroutines@>=
case flush_to_S: {@+register cache *c=(cache *)data->ptr_a;
  register int block_diff=Scache->bb-c->outbuf.rank;
    /* S-cache block size minus the number of valid bytes in |c->outbuf| */
  p=(cacheblock*)data->ptr_b;
    /* the S-cache block saved by an earlier state, if any */
 switch (data->state) {
  case 0:@+ if (Scache->lock) wait(1);
    data->state=1;
  case 1: set_lock(self,Scache->lock);
    data->ptr_b=(void*)cache_search(Scache,c->outbuf.tag);
    if (data->ptr_b) data->state=4; /* hit: copy into the block found */
    else if (Scache->mode & WRITE_ALLOC) data->state=(block_diff? 2: 3);
      /* miss with |WRITE_ALLOC|: state 2 first reads memory when
         |block_diff| is nonzero; otherwise go straight to state 3 */
    else data->state=6; /* miss without |WRITE_ALLOC|: write around */
    wait(Scache->access_time);
  case 2: @<Fill |Scache->inbuf| with clean memory data@>;
  case 3: @<Allocate a slot |p| in the S-cache@>;
    if (block_diff) @<Copy |Scache->inbuf| to slot |p|@>@;
    else@+for (j=0;j<Scache->bb>>3;j++) p->data[j]=c->outbuf.data[j];
      /* with |block_diff| zero, |outbuf| supplies the whole block */
    for (j=0;j<Scache->bb>>Scache->g;j++) p->dirty[j]=false;
      /* the new block starts out clean */
  case 4: copy_block(c,&(c->outbuf),Scache,p);
    hit_set=cache_addr(Scache,c->outbuf.tag);@+ use_and_fix(Scache,p);
                   /* |p| not moved */
    data->state=5;@+ wait(Scache->copy_in_time);
  case 5:@+ if ((Scache->mode&WRITE_BACK)==0) { /* write-through */
      if (Scache->flusher.next) wait(1);
        /* the S-cache flusher is still active */
      flush_cache(Scache,p,true); /* the data stays in |p| */
    }
    goto terminate;
  case 6:@<Handle write-around when flushing to the S-cache@>;
 }
}
3904
3905@ @<Allocate a slot |p| in the S-cache@>=
if (Scache->filler.next) wait(1); /* perhaps an unnecessary precaution? */
p=alloc_slot(Scache,c->outbuf.tag);
if (!p) wait(1); /* no slot could be allocated on this attempt */
data->ptr_b=(void*)p; /* saved so that later states can recover |p| */
p->tag=c->outbuf.tag;@+ p->tag.l=c->outbuf.tag.l&(-Scache->bb);
  /* the new tag is the address rounded down to a multiple of |Scache->bb| */
3911
3912@ We only need to read |block_diff| bytes, but it's easier to
3913read them all and to charge only for reading the ones we needed.
3914
3915@<Fill |Scache->inbuf| with clean memory data@>=
{@+register int count=block_diff>>3;
    /* the number of octabytes whose reading is charged for */
  register int off,delay;
  octa addr;
  if (mem_lock) wait(1); /* the memory bus is busy */
  addr.h=c->outbuf.tag.h;@+ addr.l=c->outbuf.tag.l&-Scache->bb;
    /* |addr| is the tag rounded down to a multiple of |Scache->bb| */
  off=(addr.l&0xffff)>>3; /* octabyte index of |addr| within its chunk */
  for (j=0;j<Scache->bb>>3;j++)
    if (j==0) Scache->inbuf.data[j]=mem_read(addr);
      /* the first read also sets |last_h| */
    else Scache->inbuf.data[j]=mem_hash[last_h].chunk[j+off];
  set_lock(&mem_locker,mem_lock);
    /* |mem_lock| is given to |mem_locker|, which is started with
       the same delay */
  delay=mem_addr_time+(int)((count+bus_words-1)/(bus_words))*mem_read_time;
    /* |count| is rounded up to a whole number of bus transfers */
  startup(&mem_locker,delay);
  data->state=3;@+ wait(delay);
}
3930
3931@ @<Copy |Scache->inbuf| to slot |p|@>=
{
  register octa *d=p->data;
  p->data=Scache->inbuf.data;@+Scache->inbuf.data=d;
    /* no octabytes are copied; |p| and |Scache->inbuf| simply exchange
       their data arrays */
}
3936
3937@ Here we assume that the granularity is~8.
3938
3939@<Handle write-around when flushing to the S-cache@>=
if (Scache->flusher.next) wait(1); /* the S-cache flusher is still active */
Scache->outbuf.tag.h=c->outbuf.tag.h;
Scache->outbuf.tag.l=c->outbuf.tag.l&(-Scache->bb);
  /* the tag is rounded down to a multiple of |Scache->bb| */
for (j=0;j<Scache->bb>>Scache->g;j++) Scache->outbuf.dirty[j]=false;
  /* start with nothing dirty, before |copy_block| is applied */
copy_block(c,&(c->outbuf),Scache,&(Scache->outbuf));
startup(&Scache->flusher,Scache->copy_out_time);
  /* the S-cache flusher takes over from here */
goto terminate;
3947
3948@ The S-cache gets new data from memory by invoking a |fill_from_mem|
3949coroutine; the I-cache or D-cache may also invoke a |fill_from_mem| coroutine,
3950if there is no S-cache. When such a coroutine is invoked, it holds
3951|mem_lock|, and its caller has gone to sleep.
3952A physical memory address is given in |data->z.o|,
3953and |data->ptr_a| specifies either |Icache|, |Dcache|, or |Scache|.
3954Furthermore, |data->ptr_b| specifies a block within that
3955cache, determined by the |alloc_slot| routine. The coroutine
3956simulates reading the contents of the specified memory location,
3957places the result in the |x.o| field of its caller's control block,
3958and wakes up the caller. It proceeds to fill the cache's |inbuf| and,
3959ultimately, the specified cache block, before waking the caller again.
3960
3961Let |c=data->ptr_a|. The caller is then |c->fill_lock|, if this variable is
3962nonnull. However, the caller might not wish to be awoken or to receive
3963the data (for example, if it has been aborted). In such cases |c->fill_lock|
3964will be~|NULL|; the filling action continues without the wakeup calls.
3965If |c=Scache|, the S-cache will be locked and the caller will not
3966have been aborted.
3967
3968@<Cases for control of special coroutines@>=
case fill_from_mem: {@+register cache *c=(cache *)data->ptr_a;
  register coroutine *cc=c->fill_lock;
    /* the caller, or |NULL| if nobody wants to be awakened */
 switch (data->state) {
  case 0: data->x.o=mem_read(data->z.o); /* this also sets |last_h| */
    if (cc) {
      cc->ctl->x.o=data->x.o; /* the caller gets the requested octabyte */
      awaken(cc,mem_read_time); /* the first wakeup call */
    }
    data->state=1;
    @<Read data into |c->inbuf| and wait for the bus@>;
  case 1: release_lock(self,mem_lock);
    data->state=2;
  case 2:@+if (c!=Scache) {
      if (c->lock) wait(1);
      set_lock(self,c->lock);
    }
    if (cc) awaken(cc,c->copy_in_time); /* the second wakeup call */
    load_cache(c,(cacheblock*)data->ptr_b);
      /* |data->ptr_b| is the block chosen by |alloc_slot| */
    data->state=3;@+ wait(c->copy_in_time);
  case 3: goto terminate;
 }
}
3991
@ If |c|'s block size is no larger than the width of the memory bus, we wait
an extra cycle, so that there will be two wakeup calls.
3994
3995@<Read data into |c->inbuf|...@>=
{
  register int count, off;
  c->inbuf.tag=data->z.o;@+ c->inbuf.tag.l &= -c->bb;
    /* the tag is the address rounded down to a multiple of |c->bb| */
  count=c->bb>>3, off=(c->inbuf.tag.l&0xffff)>>3;
    /* octabytes per block, and the block's octabyte index in its chunk */
  for (i=0;i<count;i++,off++) c->inbuf.data[i]=mem_hash[last_h].chunk[off];
    /* |last_h| was set by the |mem_read| in state 0 */
  if (count<=bus_words) wait(1+mem_read_time)@;
  else wait((int)(count/bus_words)*mem_read_time);
}
4004
4005@ The |fill_from_S| coroutine has the same conventions as |fill_from_mem|,
4006except that the data comes directly from the S-cache if it is present there.
4007This is the |filler| coroutine for the I-cache and D-cache if an S-cache
4008is present.
4009
4010@<Cases for control of special coroutines@>=
case fill_from_S: {@+register cache *c=(cache *)data->ptr_a;
  register coroutine *cc=c->fill_lock;
    /* the caller, or |NULL| if nobody wants to be awakened */
  p=(cacheblock*)data->ptr_c;
    /* the S-cache block saved when the S-cache filler was started */
  switch (data->state) {
  case 0: p=cache_search(Scache,data->z.o);
    if (p) goto S_non_miss; /* the S-cache already has the data */
    data->state=1;
  case 1: @<Start the S-cache filler@>;
    data->state=2;@+sleep;
  case 2:@+if (cc) {
      cc->ctl->x.o=data->x.o;
            /* this data has been supplied by |Scache->filler| */
      awaken(cc,Scache->access_time); /* we propagate it back */
    }
    data->state=3;@+sleep; /* when we awake, the S-cache will have our data */
  S_non_miss:@+if (cc) {
      cc->ctl->x.o=p->data[(data->z.o.l&(Scache->bb-1))>>3];
        /* the requested octabyte, selected by its offset in the block */
      awaken(cc,Scache->access_time);
    }
  case 3: @<Copy data from |p| into |c->inbuf|@>;
    data->state=4;@+wait(Scache->access_time);
  case 4: Scache->lock=NULL; /* we had been holding that lock */
    data->state=5;
  case 5:@+ if (c->lock) wait(1);
    set_lock(self,c->lock);
    load_cache(c,(cacheblock*)data->ptr_b);
      /* |data->ptr_b| is the block of |c| that receives the new data */
    data->state=6;@+ wait(c->copy_in_time);
  case 6:@+if (cc) awaken(cc,1); /* second wakeup call */
    goto terminate;
  }
}
4042
4043@ We are already holding the |Scache->lock|, but we're about to take on the
4044|Scache->fill_lock| too (with the understanding that one is ``stronger''
4045than the other). For a short time the |Scache->lock| will point to us
4046but we will point to |Scache->fill_lock|; this will not cause difficulty,
4047because the present coroutine is not abortable.
4048
4049@<Start the S-cache filler@>=
if (Scache->filler.next || mem_lock) wait(1);
  /* we need both an idle S-cache filler and a free memory bus */
p=alloc_slot(Scache,data->z.o);
if (!p) wait(1); /* no slot could be allocated on this attempt */
set_lock(&Scache->filler,mem_lock); /* the filler will hold |mem_lock| */
set_lock(self,Scache->fill_lock); /* and we become the filler's caller */
data->ptr_c=Scache->filler_ctl.ptr_b=(void *)p;
  /* both the filler and our own later states need to know |p| */
Scache->filler_ctl.z.o=data->z.o; /* the address to be read */
startup(&Scache->filler,mem_addr_time);
4058
4059@ The S-cache blocks might be wider than the blocks of the I-cache or
4060D-cache, so the copying in this step isn't quite trivial.
4061
4062@<Copy data from |p| into |c->inbuf|@>=
{@+register int off;
  c->inbuf.tag=data->z.o;@+c->inbuf.tag.l &=-c->bb;
    /* the tag is the address rounded down to a multiple of |c->bb| */
  for (j=0,off=(c->inbuf.tag.l&(Scache->bb-1))>>3;j<c->bb>>3;j++,off++)
    c->inbuf.data[j]=p->data[off];
    /* |off| starts at the octabyte position of |c|'s block
       inside the S-cache block |p| */
  release_lock(self,Scache->fill_lock);
  set_lock(self,Scache->lock);
}
4070
@ The instruction \.{PRELD} \.{X,\$Y,\$Z} generates
$\lfloor{\rm X}/2^b\rfloor+1$
commands if there are $2^b$ bytes per block in the D-cache. These
commands will try to preload blocks $\rm\$Y+\$Z$, ${\rm\$Y}+{\rm\$Z}+2^b$,
\dots, into the cache if it is not too busy.
4075
4076Similar considerations apply to the instructions \.{PREGO} \.{X,\$Y,\$Z}
4077and \.{PREST} \.{X,\$Y,\$Z}.
4078
4079@<Special cases of instruction dispatch@>=
case preld: case prest:@+ if (!Dcache) goto noop_inst;
    /* without a D-cache there is nothing to preload into */
  if (cool->xx>=Dcache->bb) cool->interim=true;
    /* the X field is at least one D-cache block size */
  cool->ptr_a=(void *)mem.up;@+ break;
case prego:@+ if (!Icache) goto noop_inst;
    /* likewise for the I-cache */
  if (cool->xx>=Icache->bb) cool->interim=true;
    /* the X field is at least one I-cache block size */
  cool->ptr_a=(void *)mem.up;@+ break;
4086
4087@ If the block size is 64, a command like \.{PREST}~\.{200,\$Y,\$Z}
4088is actually issued as four commands \.{PREST}~\.{200,\$Y,\$Z;}
4089\.{PREST}~\.{191,\$Y,\$Z;}  \.{PREST}~\.{127,\$Y,\$Z;}
4090\.{PREST}~\.{63,\$Y,\$Z}. An interruption will then be able to resume
4091properly. In the pipeline, the instruction \.{PREST}~\.{200,\$Y,\$Z}
4092is considered to affect bytes $\rm\$Y+\$Z+192$ through $\rm\$Y+\$Z+200$,
4093or fewer bytes if $\rm\$Y+\$Z$ is not a multiple of~64. (Remember that
4094these instructions are only hints; we act on them only if it is
4095reasonably convenient to do so.)
4096
4097@<Get ready for the next step of \.{PRELD} or \.{PREST}@>=
/* clear the low bits of the field that begins at bit 16, making it a
   multiple of |Dcache->bb|, then decrease that field by 1
   (for example, 200 becomes 191 when the block size is 64) */
head->inst = (head->inst&~((Dcache->bb-1)<<16))-0x10000;
4099
4100@ @<Get ready for the next step of \.{PREGO}@>=
/* clear the low bits of the field that begins at bit 16, making it a
   multiple of |Icache->bb|, then decrease that field by 1 */
head->inst = (head->inst&~((Icache->bb-1)<<16))-0x10000;
4102
4103@ Another coroutine, called |cleanup|, is occasionally called into
4104action to remove dirty data from the D-cache and S-cache. If it is
4105invoked by starting in state 0, with its |i| field set to |sync|, it
4106will clean everything. It can also be
4107invoked in state~4, with its |i| field set to |syncd| and with a physical
4108address in its |z.o| field; then it simply makes sure that no D-cache
4109or S-cache blocks associated with that address are dirty.
4110
4111Field |x.o.h| should be set to zero if items are expected to remain
4112in the cache after being cleaned; otherwise field |x.o.h| should be
4113set to |sign_bit|.
4114
4115The coroutine that invokes |cleanup| should hold |clean_lock|. If that
4116coroutine dies, because of an interruption, the |cleanup| coroutine
4117will terminate prematurely.
4118
4119We assume that the D-cache and S-cache have some sort of way to
4120identify their first dirty block, if any, in |access_time| cycles.
4121
4122@<Glob...@>=
4123coroutine clean_co;
4124control clean_ctl;
4125lockvar clean_lock;
4126
4127@ @<Initialize e...@>=
clean_co.ctl=&clean_ctl; /* the |cleanup| coroutine has its own control block */
clean_co.name="Clean";
clean_co.stage=cleanup;
clean_ctl.go.o.l=4; /* NOTE(review): the purpose of this constant is not
                       evident from this part of the program; please confirm */
4132
4133@ @<Cases for control of special...@>=
case cleanup: p=(cacheblock*)data->ptr_b;
    /* the block saved by an earlier state, if any */
  switch(data->state) {
@<Cases 0 through 4, for the D-cache@>;
@<Cases 5 through 9, for the S-cache@>;
case 10: goto terminate; /* both caches have been dealt with */
}
4140
4141@ @<Cases 0 through 4, for the D-cache@>=
case 0:@+ if (Dcache->lock || (j=get_reader(Dcache))<0) wait(1);
    /* we need the D-cache lock and one of its readers */
  startup(&Dcache->reader[j],Dcache->access_time);
  set_lock(self,Dcache->lock);
  i=j=0; /* the scan begins with the first block of the first set */
Dclean_loop: p=(i<Dcache->cc? &(Dcache->set[i][j]): &(Dcache->victim[j]));
    /* block |j| of set |i|, or victim block |j| once the sets are done */
  if (p->tag.h&sign_bit) goto Dclean_inc; /* an invalid tag is skipped */
  if (!is_dirty(Dcache,p)) {
    p->tag.h|=data->x.o.h;@+goto Dclean_inc;
      /* a clean block is invalidated if |x.o.h| is |sign_bit| */
  }
  data->y.o.h=i, data->y.o.l=j; /* state 3 resumes the scan from here */
Dclean: data->state=1;@+
  data->ptr_b=(void*)p;@+
  wait(Dcache->access_time);
case 1:@+if (Dcache->flusher.next) wait(1);
    /* the D-cache flusher is still active */
  flush_cache(Dcache,p,data->x.o.h==0);
    /* the data is kept in |p| only if the block will stay in the cache */
  p->tag.h|=data->x.o.h;
  release_lock(self,Dcache->lock);
  data->state=2;@+
  wait(Dcache->copy_out_time);
case 2:@+ if (!clean_lock) goto done; /* premature termination */
  if (Dcache->flusher.next) wait(1);
  if (data->i!=sync) goto Sprep;
    /* only a single address was wanted; go on to the S-cache */
  data->state=3;
case 3:@+ if (Dcache->lock || (j=get_reader(Dcache))<0) wait(1);
  startup(&Dcache->reader[j],Dcache->access_time);
  set_lock(self,Dcache->lock);
  i=data->y.o.h, j=data->y.o.l; /* the position that was saved earlier */
Dclean_inc: j++;
  if (i<Dcache->cc && j==Dcache->aa) j=0, i++; /* advance to the next set */
  if (i==Dcache->cc && j==Dcache->vv) {
      /* the sets and the victim blocks have all been examined */
    data->state=5;@+
    wait(Dcache->access_time);
  }
  goto Dclean_loop;
case 4:@+ if (Dcache->lock || (j=get_reader(Dcache))<0) wait(1);
    /* the entry point for cleaning the single address |data->z.o| */
  startup(&Dcache->reader[j],Dcache->access_time);
  set_lock(self,Dcache->lock);
  p=cache_search(Dcache,data->z.o);
  if (p) {
    demote_and_fix(Dcache,p);
    if (is_dirty(Dcache,p)) goto Dclean;
  }
  data->state=9;@+
  wait(Dcache->access_time);
4186
4187@ @<Cases 5 through 9...@>=
case 5:@+ if (self->lockloc) *(self->lockloc)=NULL, self->lockloc=NULL;
    /* give up whatever lock we are still holding */
  if (!Scache) goto done; /* there is no S-cache to clean */
  if (Scache->lock) wait(1);
  set_lock(self,Scache->lock);
  i=j=0; /* the scan begins with the first block of the first set */
Sclean_loop: p=(i<Scache->cc? &(Scache->set[i][j]): &(Scache->victim[j]));
    /* block |j| of set |i|, or victim block |j| once the sets are done */
  if (p->tag.h&sign_bit) goto Sclean_inc; /* an invalid tag is skipped */
  if (!is_dirty(Scache,p)) {
    p->tag.h|=data->x.o.h;@+goto Sclean_inc;
      /* a clean block is invalidated if |x.o.h| is |sign_bit| */
  }
  data->y.o.h=i, data->y.o.l=j; /* state 8 resumes the scan from here */
Sclean: data->state=6;@+
  data->ptr_b=(void*)p;@+
  wait(Scache->access_time);
case 6:@+if (Scache->flusher.next) wait(1);
    /* the S-cache flusher is still active */
  flush_cache(Scache,p,data->x.o.h==0);
    /* the data is kept in |p| only if the block will stay in the cache */
  p->tag.h|=data->x.o.h;
  release_lock(self,Scache->lock);
  data->state=7;@+
  wait(Scache->copy_out_time);
case 7:@+ if (!clean_lock) goto done; /* premature termination */
  if (Scache->flusher.next) wait(1);
  if (data->i!=sync) goto done; /* only a single address was wanted */
  data->state=8;
case 8:@+ if (Scache->lock) wait(1);
  set_lock(self,Scache->lock);
  i=data->y.o.h, j=data->y.o.l; /* the position that was saved earlier */
Sclean_inc: j++;
  if (i<Scache->cc && j==Scache->aa) j=0, i++; /* advance to the next set */
  if (i==Scache->cc && j==Scache->vv) {
      /* the sets and the victim blocks have all been examined */
    data->state=10;@+
    wait(Scache->access_time);
  }
  goto Sclean_loop;
Sprep: data->state=9;
case 9:@+if (self->lockloc) release_lock(self,Dcache->lock);
    /* the D-cache lock may still be ours when we arrive from state 4 */
  if (!Scache) goto done;
  if (Scache->lock) wait(1);
  set_lock(self,Scache->lock);
  p=cache_search(Scache,data->z.o); /* the single address to be cleaned */
  if (p) {
    demote_and_fix(Scache,p);
    if (is_dirty(Scache,p)) goto Sclean;
  }
  data->state=10;@+
  wait(Scache->access_time);
4234
4235@* Virtual address translation. Special arrays of coroutines and control
4236blocks come into play when we need to implement \MMIX's rather complicated
4237page table mechanism for virtual address translation. In effect, we have up to
4238ten control blocks {\it outside\/} of the reorder buffer that are capable of
4239executing instructions just as if they were part of that buffer. The
4240``opcodes'' of these non-abortable instructions are special internal
4241operations called |ldptp| and |ldpte|, for loading page table pointers and
4242page table entries.
4243
4244Suppose, for example, that we need to translate a virtual address for the
4245DT-cache in which the virtual page address $(a_4a_3a_2a_1a_0)_{1024}$ of
4246segment~$i$ has $a_4=a_3=0$ and $a_2\ne0$. Then the rules say that we should
4247first find a page table pointer $p_2$ in physical location
4248$2^{13}(r+b_i+2)+8a_2$, then another page table pointer~$p_1$ in location
4249$p_2+8a_1$, and finally the page table entry~$p_0$ in location $p_1+8a_0$. The
4250simulator achieves this by setting up three coroutines $c_0$, $c_1$, $c_2$
4251whose control blocks correspond to the pseudo-instructions
4252$$\vbox{\halign{\tt#\hfil\cr
4253LDPTP $x$,[$2^{63}+2^{13}(r+b_i+2)$],$8a_2$\cr
4254LDPTP $x$,$x$,$8a_1$\cr
4255LDPTE $x$,$x$,$8a_0$\cr}}$$
4256where $x$ is a hidden internal register and the other quantities are immediate
4257values. Slight changes to the normal functionality of \.{LDO} give us the
4258actions needed to implement \.{LDPTP} and \.{LDPTE}. Coroutine~$c_j$
4259corresponds to the instruction that involves $a_j$ and computes~$p_j$; when
4260$c_0$ has computed its value~$p_0$, we know how to translate the original
4261virtual address.
4262
4263The \.{LDPTP} and \.{LDPTE} commands return zero
4264if their $y$~operand is zero or if the page table does not properly match~rV.
4265
4266@d LDPTP PREGO /* internally this won't cause confusion */
4267@d LDPTE GO
4268
4269@<Global...@>=
4270control IPTctl[5], DPTctl[5]; /* control blocks for I and D page translation */
4271coroutine IPTco[10], DPTco[10]; /* each coroutine is a two-stage pipeline */
4272char *IPTname[5]={"IPT0","IPT1","IPT2","IPT3","IPT4"};
4273char *DPTname[5]={"DPT0","DPT1","DPT2","DPT3","DPT4"};
4274
4275@ @<Initialize e...@>=
4276for (j=0;j<5;j++) {
4277  DPTco[2*j].ctl=&DPTctl[j];@+  IPTco[2*j].ctl=&IPTctl[j];
4278  if (j>0) DPTctl[j].op=IPTctl[j].op=LDPTP,DPTctl[j].i=IPTctl[j].i=ldptp;
4279  else DPTctl[0].op=IPTctl[0].op=LDPTE,DPTctl[0].i=IPTctl[0].i=ldpte;
4280  IPTctl[j].loc=DPTctl[j].loc=neg_one;
4281  IPTctl[j].go.o=DPTctl[j].go.o=incr(neg_one,4);
4282  IPTctl[j].ptr_a=DPTctl[j].ptr_a=(void*)&mem;
4283  IPTctl[j].ren_x=DPTctl[j].ren_x=true;
4284  IPTctl[j].x.addr.h=DPTctl[j].x.addr.h=-1;
4285  IPTco[2*j].stage=DPTco[2*j].stage=1;
4286  IPTco[2*j+1].stage=DPTco[2*j+1].stage=2;
4287  IPTco[2*j].name=IPTco[2*j+1].name=IPTname[j];
4288  DPTco[2*j].name=DPTco[2*j+1].name=DPTname[j];
4289}
4290ITcache->filler_ctl.ptr_c=(void*)&IPTco[0];@+
4291DTcache->filler_ctl.ptr_c=(void*)&DPTco[0];
4292
4293@ Page table calculations are invoked by a coroutine of type |fill_from_virt|,
4294which is used to fill the IT-cache or DT-cache. The calling conventions of
4295|fill_from_virt| are analogous to those of |fill_from_mem| or |fill_from_S|:
4296A virtual address is supplied in |data->y.o|, and |data->ptr_a| points
4297to a cache (|ITcache| or |DTcache|), while |data->ptr_b| is a block in that
4298cache. We wake up the caller, who holds the cache's |fill_lock|, as soon as
4299the translation of the given address has been calculated, unless the caller
4300has been aborted. (No second wakeup call is necessary.)
4301
4302@<Cases for control of special coroutines@>=
case fill_from_virt: {@+register cache *c=(cache *)data->ptr_a;
  register coroutine *cc=c->fill_lock;
    /* the caller, or |NULL| if nobody wants to be awakened */
  register coroutine *co=(coroutine*)data->ptr_c;
                          /* |&IPTco[0]| or |&DPTco[0]| */
  octa aaaaa;
 switch (data->state) {
  case 0: @<Start up auxiliary coroutines to compute the page table entry@>;
    data->state=1;
  case 1:@+if (data->b.p) { /* a page table entry is still awaited */
      if (data->b.p->known) data->b.o=data->b.p->o, data->b.p=NULL;
        /* it has arrived; we keep the value and drop the pointer */
      else wait(1);
    }
    @<Compute the new entry for |c->inbuf| and give the caller a sneak
              preview@>;
    data->state=2;
  case 2:@+if (c->lock) wait(1);
    set_lock(self,c->lock);
    load_cache(c,(cacheblock*)data->ptr_b);
      /* |data->ptr_b| is the block of |c| that receives the translation */
    data->state=3;@+ wait(c->copy_in_time);
  case 3: data->b.o=zero_octa;@+goto terminate;
    /* |b.o| is zero again when we finish */
 }
}
4325
4326@ The current contents of rV, the special virtual translation register, are
4327kept unpacked in several global variables |page_r|, |page_s|, etc., for
4328convenience. Whenever rV changes, we recompute all these variables.
4329
4330@<Glob...@>=
4331int page_n; /* the 10-bit |n| field of rV, times 8 */
4332int page_r; /* the 27-bit |r| field of rV */
4333int page_s; /* the 8-bit |s| field of rV */
4334int page_f; /* the 3-bit |f| field of rV */
4335int page_b[5]; /* the 4-bit |b| fields of rV; |page_b[0]=0| */
4336octa page_mask; /* the least significant |s| bits */
4337bool page_bad=true; /* does rV violate the rules? */
4338
4339@ @<Update the \\{page} variables@>=
4340{@+octa rv;
4341  rv=data->z.o;
4342  page_f=rv.l&7, page_bad=(page_f>1);
4343  page_n=rv.l&0x1ff8;
4344  rv=shift_right(rv,13,1);
4345  page_r=rv.l&0x7ffffff;
4346  rv=shift_right(rv,27,1);
4347  page_s=rv.l&0xff;
4348  if (page_s<13 || page_s>48) page_bad=true;
4349  else if (page_s<32) page_mask.h=0,page_mask.l=(1<<page_s)-1;
4350  else page_mask.h=(1<<(page_s-32))-1,page_mask.l=0xffffffff;
4351  page_b[4]=(rv.l>>8)&0xf;
4352  page_b[3]=(rv.l>>12)&0xf;
4353  page_b[2]=(rv.l>>16)&0xf;
4354  page_b[1]=(rv.l>>20)&0xf;
4355}
4356
4357@ Here's how we compute a tag of the IT-cache or DT-cache
4358from a virtual address, and how we compute a physical address
4359from a translation found in the cache.
4360
4361@d trans_key(addr) incr(oandn(addr,page_mask),page_n)
4362
4363@<Internal proto...@>=
4364static octa phys_addr @,@,@[ARGS((octa,octa))@];
4365
4366@ @<Sub...@>=
4367static octa phys_addr(virt,trans)
4368  octa virt,trans;
4369{@+octa t;
4370  t=oandn(trans,page_mask); /* zero out the \\{ynp} fields of a PTE */
4371  return oplus(t,oand(virt,page_mask));
4372}
4373
4374@ Cheap (and slow) versions of \MMIX\ leave the page table calculations
4375to software. If the global variable |no_hardware_PT| is set true,
4376|fill_from_virt| begins its actions in state~1, not state~0. (See the
4377|RESUME_TRANS| operation.)
4378
4379@<External v...@>=
4380Extern bool no_hardware_PT;
4381
4382@ Note: The operating system is supposed to ensure that changes to the page
4383table entries do not appear in the pipeline when a translation cache is being
4384updated. The internal \.{LDPTP} and \.{LDPTE} instructions use only the
4385``hot state'' of the memory system.
4386@^operating system@>
4387
4388@<Start up auxiliary coroutines to compute the page table entry@>=
aaaaa=data->y.o; /* the virtual address to be translated */
i=aaaaa.h>>29; /* the segment number */
aaaaa.h&=0x1fffffff; /* the address within segment $i$ */
aaaaa=shift_right(aaaaa,page_s,1); /* the page address */
for (j=0;aaaaa.l!=0 || aaaaa.h!=0; j++) {
    /* peel off the 10-bit digits $a_0$, $a_1$, \dots\ of the page address */
  co[2*j].ctl->z.o.h=0, co[2*j].ctl->z.o.l=(aaaaa.l&0x3ff)<<3;
    /* the immediate operand $8a_j$ of pseudo-instruction $j$ */
  aaaaa=shift_right(aaaaa,10,1);
}
if (page_b[i+1]<page_b[i]+j) /* address too large */
  ; /* nothing needs to be done, since |data->b.o| is zero */
else {
  if (j==0) j=1,co[0].ctl->z.o=zero_octa;
    /* page address zero still needs one lookup, with $a_0=0$ */
  @<Issue $j$ pseudo-instructions to compute a page table entry@>;
}
4403
4404@ The first stage of coroutine $c_j$ is |co[2*j]|. It will pass the $j$th
4405control block to the second stage, |co[2*j+1]|, which will load page table
4406information from memory (or hopefully from the D-cache).
4407
4408@<Issue $j$ pseudo-instructions to compute a page table entry@>=
4409j--; /* from here on |j| is the index of the highest coroutine pair in use */
4410aaaaa.l=page_r+page_b[i]+j; /* |aaaaa| was zero on entry, so only the low half is set */
4411co[2*j].ctl->y.p=NULL; /* the topmost |y| is given directly, not awaited */
4412co[2*j].ctl->y.o=shift_left(aaaaa,13);
4413co[2*j].ctl->y.o.h+=sign_bit; /* turn on the sign bit of that address */
4414for (;;j--) { /* start the coroutines from the highest index down to 0 */
4415  co[2*j].ctl->x.o=zero_octa;@+ co[2*j].ctl->x.known=false;
4416  co[2*j].ctl->owner=&co[2*j];
4417  startup(&co[2*j],1);
4418  if (j==0) break;
4419  co[2*(j-1)].ctl->y.p=&co[2*j].ctl->x; /* the next lower one waits for this |x| */
4420}
4421data->b.p=&co[0].ctl->x; /* our own |b| will come from coroutine 0 */
4422
4423@ At this point the translation of the given virtual address |data->y.o| is
4424the octabyte |data->b.o|. Its least significant three bits are the
4425protection code~$p=p_rp_wp_x$; its page address field is scaled by~$2^s$. It
4426is entirely zero, including the protection bits, if there was a
4427page table failure.
4428
4429The |z| field of the caller receives this translation.
4430
4431@<Compute the new entry for |c->inbuf| and give the caller a sneak preview@>=
4432c->inbuf.tag=trans_key(data->y.o); /* the key comes from the virtual address */
4433c->inbuf.data[0]=data->b.o; /* the translation, or zero after a page table failure */
4434if (cc) { /* there is a caller to be informed */
4435  cc->ctl->z.o=data->b.o;
4436  awaken(cc,1);
4437}
4438
4439@* The write buffer. The dispatcher has arranged things so that speculative
4440stores into memory are recorded in a doubly linked list leading upward from
4441|mem|. When such instructions finally are committed, they enter the ``write
4442buffer,'' which holds octabytes that are ready to be written into designated
4443physical memory addresses (or into the D-cache and/or S-cache). The ``hot
4444state'' of the computation is reflected not only by the registers and caches
4445but also by the instructions that are pending in the write buffer.
4446
4447@<Type...@>=
4448typedef struct{ /* one item of the write buffer */
4449  octa o; /* data to be stored */
4450  octa addr; /* its physical address */
4451  tetra stamp; /* when last committed (mod $2^{32}$) */
4452  internal_opcode i; /* is this write special? */
4453  int size; /* parameter for |spec_write| */
4454} write_node;
4455
4456@ We represent the buffer in the usual way as a circular list, with elements
4457|write_tail+1|, |write_tail+2|, \dots,~|write_head|.
4458
4459The data will sit at least |holding_time| cycles before it leaves
4460the write buffer. This speeds things up when different fields of the same
4461octabyte are being stored by different instructions.
4462
4463@<External v...@>=
4464Extern write_node *wbuf_bot, *wbuf_top;
4465 /* least and greatest write buffer nodes */
4466Extern write_node *write_head, *write_tail;
4467 /* front and rear of the write buffer */
4468Extern lockvar wbuf_lock; /* is the data in |write_head| being written? */
4469Extern int holding_time; /* minimum holding time */
4470Extern lockvar speed_lock; /* should we ignore |holding_time|? */
4471
4472@ @<Glob...@>=
4473coroutine write_co; /* coroutine that empties the write buffer */
4474control write_ctl; /* its control block */
4475
4476@ @<Initialize e...@>=
4477write_co.ctl=&write_ctl;
4478write_co.name="Write";
4479write_co.stage=write_from_wbuf;
4480write_ctl.ptr_a=(void*)&mem;
4481write_ctl.go.o.l=4; /* NOTE(review): the role of this constant is not visible in this part of the program */
4482startup(&write_co,1);
4483write_head=write_tail=wbuf_top; /* equal pointers mean an empty buffer */
4484
4485@ @<Internal proto...@>=
4486static void print_write_buffer @,@,@[ARGS((void))@];
4487
4488@ @<Sub...@>=
4489static void print_write_buffer()
4490{ /* list the pending writes, beginning with the item at |write_head| */
4491  printf("Write buffer");
4492  if (write_head==write_tail) printf(" (empty)\n");
4493  else {@+register write_node *p;
4494    printf(":\n");
4495    for (p=write_head;p!=write_tail; p=(p==wbuf_bot? wbuf_top: p-1)) {
4496      printf("m[");@+print_octa(p->addr);@+printf("]=");@+print_octa(p->o);
4497      if (p->i==stunc) printf(" unc");
4498      else if (p->i==sync) printf(" sync");
4499      printf(" (age %lu)\n",(unsigned long)(ticks.l-p->stamp));
      /* the age is an unsigned difference mod $2^{32}$, so it is printed as an unsigned value */
4500    }
4501  }
4502}
4503
4504@ The entire present state of the pipeline computation can be visualized
4505by printing first the write buffer, then the reorder buffer, then the
4506fetch buffer. This shows the progression of results from oldest to youngest,
4507from sizzling hot to ice cold.
4508
4509@<External proto...@>=
4510Extern void print_pipe @,@,@[ARGS((void))@];
4511
4512@ @<External r...@>=
4513void print_pipe()
4514{ /* the three parts appear in order from oldest results to youngest */
4515  print_write_buffer();
4516  print_reorder_buffer();
4517  print_fetch_buffer();
4518}
4519
4520@ The |write_search| routine looks to see if any instructions ahead of a given
4521place in the |mem| list of the reorder buffer are storing into a given
4522physical address, or if there's a pending instruction in the write buffer for
4523that address. If so, it returns a pointer to the value to be written. If not,
4524it returns~|NULL|. If the answer is currently unknown, because at least one
4525possibly relevant physical address has not yet been computed, the subroutine
4526returns the special code value~|DUNNO|.
4527
4528The search starts at the |x.up| field of a control block for a store
4529instruction, otherwise at the |ptr_a| field of the control block,
4530unless |ptr_a| points to a committed instruction.
4531
4532The |i| field in the write buffer is usually |st| or |pst|, inherited from
4533a store or partial store command. It may also be |sync| (from \.{SYNC}~\.1
4534or \.{SYNC}~\.3) or |stunc| (from \.{STUNC}).
4535
4536@d DUNNO ((octa *)1) /* an impossible non-|NULL| pointer */
4537
4538@<Internal proto...@>=
4539static octa* write_search @,@,@[ARGS((control*,octa))@];
4540
4541@ @<Sub...@>=
4542static octa *write_search(ctl,addr)
4543  control *ctl; /* the instruction on whose behalf we search */
4544  octa addr; /* the physical address of interest */
4545{@+register specnode *p=(ctl->mem_x? ctl->x.up: (specnode*)ctl->ptr_a);
4546  register write_node *q=write_tail;
4547  addr.l &=-8; /* compare octabyte addresses only */
4548  if (p==&mem) goto qloop; /* no speculative stores lie ahead */
4549  if (p > &hot->x && ctl <= hot) goto qloop; /* already committed */
4550  if (p < &ctl->x && (ctl <= hot || p > &hot->x)) goto qloop;
4551  for (; p!=&mem; p=p->up) { /* follow the |up| links until |mem| is reached */
4552    if (p->addr.h==(tetra)-1) return DUNNO; /* presumably this marks an address not yet computed */
4553    if ((p->addr.l&-8)==addr.l && p->addr.h==addr.h)
4554      return (p->known? &(p->o): DUNNO); /* a matching store helps only if its data is known */
4555  }
4556qloop:@+ for (;;) { /* now try the write buffer, from |write_tail+1| to |write_head| */
4557    if (q==write_head) return NULL;
4558    if (q==wbuf_top) q=wbuf_bot;@+ else q++;
4559    if (q->addr.l==addr.l && q->addr.h==addr.h) return &(q->o);
4560  }
4561}
4562
4563@ When we're committing new data to memory, we can update an existing item in
4564the write buffer if it has the same physical address, unless that item is
4565already in the process of being written out. Increasing the value of
4566|holding_time| will increase the chance that this economy is possible, but
4567it will also increase the number of buffered items when writes are to
4568different locations.
4569
4570A store instruction that sets any of the eight interrupt bits
4571\.{rwxnkbsp} will not affect memory, even if it doesn't cause an interrupt.
4572
4573When ``store'' is followed by ``store uncached'' at the same address,
4574or vice versa, we believe the most recent hint.
4575
4576@<Commit to memory...@>=
4577{@+register write_node *q=write_tail;
4578  if (hot->interrupt&(F_BIT+0xff)) goto done_with_write; /* such a store leaves memory alone */
4579  if (hot->x.addr.h&0xffff0000) { /* an address with any of its top 16 bits set will go to |spec_write| */
4580    if (hot->op>=STB && hot->op<STSF) q->size=(hot->op&0xf)>>2;
4581    else if (hot->op>=STSF && hot->op<STCO) q->size=2;
4582    else q->size=3;
4583  } /* NOTE(review): |size| lands in the node at |write_tail|; a node reached via |addr_found| keeps its old |size|. Confirm that this is intended */
4584  if (hot->i!=sync) for (;;) { /* look for an existing item with the same address */
4585    if (q==write_head) break;
4586    if (q==wbuf_top) q=wbuf_bot;@+ else q++;
4587    if (q->i==sync) break; /* the search stops at a pending |sync| */
4588    if (q->addr.l==hot->x.addr.l && q->addr.h==hot->x.addr.h
4589             && (q!=write_head || !wbuf_lock)) goto addr_found;
4590  }
4591  {@+ register write_node *p=(write_tail==wbuf_bot? wbuf_top: write_tail-1);
4592    if (p==write_head) break; /* the write buffer is full */
4593    q=write_tail;@+ write_tail=p; /* the old tail position becomes the new item */
4594    q->addr=hot->x.addr;
4595  }
4596addr_found: q->o=hot->x.o;
4597  q->stamp=ticks.l;
4598  q->i=hot->i;
4599done_with_write: spec_rem(&(hot->x));
4600  mem_slots++;
4601}
4602
4603@ A special coroutine whose duty is to empty the write buffer is always
4604active. It holds the |wbuf_lock| while it is writing the contents of
4605|write_head|. It holds |Dcache->fill_lock| while waiting for the D-cache
4606to fill a block.
4607
4608@<Cases for control...@>=
4609case write_from_wbuf:
4610  p=(cacheblock*)data->ptr_b; /* the cache block saved by an earlier state, if any */
4611  switch(data->state) {
4612  case 4: @<Forward the new data past the D-cache if it is write-through@>;
4613    data->state=5;
4614  case 5:@+if (write_head==wbuf_bot) write_head=wbuf_top;@+ else write_head--; /* the head item is done */
4615 write_restart: data->state=0;
4616  case 0:@+ if (self->lockloc) *(self->lockloc)=NULL,self->lockloc=NULL; /* give up any lock we hold */
4617    if (write_head==write_tail) wait(1); /* write buffer is empty */
4618    if (write_head->i==sync) @<Ignore the item in |write_head|@>;
4619    if (write_head->addr.h&0xffff0000) goto mem_direct; /* no holding time and no cache for such addresses */
4620    if (ticks.l-write_head->stamp<holding_time && !speed_lock)
4621      wait(1); /* data too raw */
4622    if (!Dcache) goto mem_direct; /* not cached */
4623    if (Dcache->lock || (j=get_reader(Dcache))<0) wait(1); /* D-cache busy */
4624    startup(&Dcache->reader[j],Dcache->access_time);
4625    @<Write the data into the D-cache and set |state=4|,
4626                if there's a cache hit@>;
4627    data->state=((Dcache->mode&WRITE_ALLOC) && write_head->i!=stunc? 1: 3); /* we get here only after a miss */
4628    wait(Dcache->access_time);
4629  case 1: @<Try to put the contents of location |write_head->addr|
4630           into the D-cache@>;
4631    data->state=2;@+sleep;
4632  case 2: data->state=0;@+sleep; /* wake up when the D-cache has the block */
4633  case 3: @<Handle write-around when writing to the D-cache@>;
4634  mem_direct: @<Write directly from |write_head| to memory@>;
4635}
4636
4637@ @<Local var...@>=
4638register cacheblock *p,*q;
4639
4640@ The granularity is guaranteed to be 8 in write-around mode
4641(see |MMIX_config|). Although an uncached store will not be stored in the
4642D-cache (unless it hits in the D-cache), it will go into a secondary cache.
4643
4644@<Handle write-around when writing to the D-cache@>=
4645if (Dcache->filler.next) goto write_restart; /* the D-cache filler is active */
4646if ((Scache&&Scache->lock) || (!Scache&&mem_lock)) goto write_restart; /* the next level is locked */
4647if (Dcache->flusher.next) wait(1); /* the flusher is still active */
4648Dcache->outbuf.tag.h=write_head->addr.h;
4649Dcache->outbuf.tag.l=write_head->addr.l&(-Dcache->bb); /* the address rounded down to a multiple of |bb| */
4650for (j=0;j<Dcache->bb>>Dcache->g;j++) Dcache->outbuf.dirty[j]=false;
4651Dcache->outbuf.data[(write_head->addr.l&(Dcache->bb-1))>>3]=write_head->o;
4652Dcache->outbuf.dirty[(write_head->addr.l&(Dcache->bb-1))>>Dcache->g]=true; /* only this part is marked dirty */
4653Dcache->outbuf.rank=Dcache->gg; /* this many valid bytes */
4654set_lock(self,wbuf_lock); /* the item in |write_head| is now being written */
4655startup(&Dcache->flusher,Dcache->copy_out_time);
4656data->state=5;@+ wait(Dcache->copy_out_time);
4657
4658@ @<Write directly from |write_head| to memory@>=
4659if (mem_lock) wait(1); /* memory is in use */
4660set_lock(self,wbuf_lock); /* the item in |write_head| is now being written */
4661set_lock(&mem_locker,mem_lock); /* a coroutine of type |vanish| */
4662startup(&mem_locker,mem_addr_time+mem_write_time);
4663if (write_head->addr.h&0xffff0000) /* same test that recorded |size| at commit time */
4664  spec_write(write_head->addr,write_head->o,write_head->size);
4665else mem_write(write_head->addr,write_head->o);
4666data->state=5;@+ wait(mem_addr_time+mem_write_time); /* state 5 will remove the item */
4667
4668@ A subtlety needs to be mentioned here: While we're trying to
4669update the D-cache, another instruction might be filling the
4670same cache block (although not because of the same physical address).
4671Therefore we |goto write_restart| here instead of saying |wait(1)|.
4672
4673@<Try to put the contents of location |write_head->addr| into the D-cache@>=
4674if (Dcache->filler.next) goto write_restart; /* the D-cache filler is active */
4675if ((Scache&&Scache->lock) || (!Scache&&mem_lock)) goto write_restart; /* the source of the block is locked */
4676p=alloc_slot(Dcache,write_head->addr);
4677if (!p) goto write_restart; /* no slot could be allocated */
4678if (Scache) set_lock(&Dcache->filler,Scache->lock)@;
4679else set_lock(&Dcache->filler,mem_lock);
4680set_lock(self,Dcache->fill_lock); /* held while we wait for the fill */
4681data->ptr_b=Dcache->filler_ctl.ptr_b=(void *)p; /* both coroutines remember the slot */
4682Dcache->filler_ctl.z.o=write_head->addr;
4683startup(&Dcache->filler,Scache? Scache->access_time: mem_addr_time);
4684
4685@ Here it is assumed that |Dcache->access_time| is enough to search
4686the D-cache and update one octabyte in case of a hit. The D-cache is
4687not locked, since other coroutines that might be simultaneously reading
4688the D-cache are not going to use the octabyte that changes.
4689Perhaps the simulator is being too lenient here.
4690
4691@<Write the data into the D-cache...@>=
4692p=cache_search(Dcache,write_head->addr);
4693if (p) { /* a hit: update the octabyte in place */
4694  p=use_and_fix(Dcache,p);
4695  set_lock(self,wbuf_lock); /* the item in |write_head| is now being written */
4696  data->ptr_b=(void *)p; /* state 4 will find the block here */
4697  p->data[(write_head->addr.l&(Dcache->bb-1))>>3]=write_head->o;
4698  p->dirty[(write_head->addr.l&(Dcache->bb-1))>>Dcache->g]=true;
4699  data->state=4;@+ wait(Dcache->access_time);
4700}
4701
4702@ @<Forward the new data past the D-cache if it is write-through@>=
4703if ((Dcache->mode&WRITE_BACK)==0) { /* write-through */
4704  if (Dcache->flusher.next) wait(1); /* try again when the flusher is free */
4705  flush_cache(Dcache,p,true); /* |p| is the block that was just updated */
4706}
4707
4708@ @<Ignore the item in |write_head|@>=
4709{ /* nothing is written for this item; state 5 will remove it one cycle later */
4710  set_lock(self,wbuf_lock);
4711  data->state=5;
4712  wait(1);
4713}
4714
4715@* Loading and storing. A RISC machine is often said to have a ``load/store
4716architecture,'' perhaps because loading and storing are among the most
4717difficult things a RISC machine is called upon to do.
4718
4719We want memory accesses
4720to be efficient, so we try to access the D-cache at the same time as we are
4721translating a virtual address via the DT-cache. Usually we hit in both
4722caches, but numerous cases must be dealt with when we miss. Is there
4723an elegant way to handle all the contingencies? Alas, the author of this
4724program was unable to think of anything better than to throw lots
4725of code at the problem --- knowing full well that such a spaghetti-like
4726approach is fraught with possibilities for error.
4727
4728Instructions like \.{LDO} $x,y,z$ operate in two pipeline stages. The first
4729stage computes the virtual address $y+z$, waiting if necessary until $y$
4730and~$z$ are both known; then it starts to access the necessary caches.
4731In the second stage we ascertain the corresponding physical address and
4732hopefully find the data in the cache (or in the speculative |mem| list or the
4733write buffer).
4734
4735An instruction like \.{STB} $x,y,z$ shares some of the computation of
4736\.{LDO}~$x,y,z$, because only one byte is being stored but the other seven
4737bytes must be found in the cache. In this case, however, $x$~is treated as an
4738input, and |mem| is the output. The second stage of a store command can begin
4739even though $x$ is not known during the first stage.
4740
4741Here's what we do at the beginning of stage~1.
4742
4743@d ld_st_launch 7 /* state when load/store command has its memory address */
4744
4745@<Cases to compute the virtual...@>=
4746case preld: case prest: case prego:
4747  data->z.o=incr(data->z.o,data->xx&-(data->i==prego? Icache: Dcache)->bb);
4748  /* (I hope the adder is fast enough) */
  /* NOTE(review): |Icache| or |Dcache| is dereferenced here without the null test used elsewhere; confirm that an absent cache cannot reach this point */
4749case ld: case ldunc: case ldvts:
4750case st: case pst: case syncd: case syncid:
4751start_ld_st: data->y.o=oplus(data->y.o,data->z.o); /* the virtual address is the sum of |y| and |z| */
4752  data->state=ld_st_launch;@+ goto switch1;
4753case ldptp: case ldpte:@+if (data->y.o.h) goto start_ld_st; /* proceed only if the high half of |y| is nonzero */
4754  data->x.o=zero_octa;@+ data->x.known=true;@+ goto die; /* page table fault */
4755
4756@ @d PRW_BITS (data->i<st? PR_BIT: data->i==pst? PR_BIT+PW_BIT:
4757                  (data->i==syncid && (data->loc.h&sign_bit))? 0: PW_BIT)
  /* the permission bits that the current operation must find in a translation */
4758
4759@<Special cases for states in the first stage@>=
4760case ld_st_launch:@+if ((self+1)->next)
4761    wait(1); /* second stage must be clear */
4762  @<Handle special cases for operations like |prego| and |ldvts|@>;
4763  if (data->y.o.h&sign_bit) /* an address with the sign bit needs no translation */
4764    @<Do load/store stage~1 with known physical address@>;
4765  if (page_bad) { /* the \\{page} variables were rejected, so nothing can be translated */
4766    if (data->i<preld || data->i==st || data->i==pst)
4767       data->interrupt|=PRW_BITS;
4768    goto fin_ex;
4769  }
4770  if (DTcache->lock || (j=get_reader(DTcache))<0) wait(1); /* DT-cache busy */
4771  startup(&DTcache->reader[j],DTcache->access_time);
4772  @<Look up the address in the DT-cache, and also in the D-cache if possible@>;
4773  pass_after(DTcache->access_time);@+ goto passit; /* reached only with |state=DT_miss| */
4774
4775@ When stage 2 of a load/store command begins, the state will depend
4776on what transpired in stage~1.
4777For example, |data->state| will be |DT_miss| if the virtual address key
4778can't be found in the DT-cache; then stage~2 will have to compute the
4779physical address the hard way.
4780
4781The |data->state| will be |DT_hit| if
4782the physical address is known via the DT-cache, but the data may or may not
4783be in the D-cache. The |data->state| will be |hit_and_miss| if the DT-cache
4784hits and the D-cache doesn't. And |data->state| will be |ld_ready| if
4785|data->x.o| is the desired octabyte (for example, if both caches hit).
4786
4787@d DT_miss 10 /* second stage |state| when DT-cache doesn't hold the key */
4788@d DT_hit 11 /* second stage |state| when physical address is known */
4789@d hit_and_miss 12 /* second stage |state| when D-cache misses */
4790@d ld_ready 13 /* second stage |state| when data has been read */
4791@d st_ready 14 /* second stage |state| when data needn't be read */
4792@d prest_win 15 /* second stage |state| when we can fill a block with zeroes */
4793
4794@<Look up the address in the DT-cache...@>=
4795p=cache_search(DTcache,trans_key(data->y.o)); /* |p| is null if the DT-cache lacks the key */
4796if (!Dcache || Dcache->lock || (j=get_reader(Dcache))<0 ||
4797     (data->i>=st && data->i<=syncid)) /* no D-cache reader is available, or none is wanted */
4798  @<Do load/store stage 1 without D-cache lookup@>;
4799startup(&Dcache->reader[j],Dcache->access_time);
4800if (p) @<Do a simultaneous lookup in the D-cache@>@;
4801else data->state=DT_miss;
4802
4803@ We assume that it is possible to look up a virtual address in the DT-cache
4804at the same time as we look for a corresponding physical address in the
4805D-cache, provided that the lower $b+c$ bits of the two addresses are the same.
4806(They will always be the same if |b+c<=page_s|; otherwise the operating system
4807can try to make them the same by ``page coloring'' whenever possible.) If both
4808caches hit, the physical address is known in
4809@^page coloring@>
4810max(|DTcache->access_time,Dcache->access_time|) cycles.
4811
4812If the lower $b+c$ bits of the virtual and physical addresses differ,
4813the machine will not know this until the DT-cache has hit.
4814Therefore we simulate the operation of accessing the D-cache, but we go to
4815|DT_hit| instead of to |hit_and_miss| because the D-cache will
4816experience a spurious miss.
4817
4818@d max(x,y) ((x)<(y)? (y):(x))
4819
4820@<Do a simultaneous lookup in the D-cache@>=
4821{@+octa *m; /* result of |write_search| */
4822  p=use_and_fix(DTcache,p), data->z.o=p->data[0]; /* |z| temporarily holds the translation */
4823  @<Check the protection bits and get the physical address@>;
4824  m=write_search(data,data->z.o);
4825  if (m==DUNNO) data->state=DT_hit; /* stage 2 will search the pending writes again */
4826  else if (m) data->x.o=*m, data->state=ld_ready; /* the data comes from a pending write */
4827  else if (Dcache->b+Dcache->c>page_s &&@|
4828      ((data->y.o.l^data->z.o.l)&((Dcache->bb<<Dcache->c)-(1<<page_s))))
4829    data->state=DT_hit; /* spurious D-cache lookup */
4830  else {
4831    q=cache_search(Dcache,data->z.o);
4832    if (q) {
4833      if (data->i==ldunc) q=demote_and_fix(Dcache,q);
4834      else q=use_and_fix(Dcache,q);
4835      data->x.o=q->data[(data->z.o.l&(Dcache->bb-1))>>3]; /* the octabyte within the block */
4836      data->state=ld_ready;
4837    }@+else data->state=hit_and_miss;
4838  }
4839  pass_after(max(DTcache->access_time,Dcache->access_time));
4840  goto passit;
4841}
4842
4843@ The protection bits $p_rp_wp_x$ in a translation cache are shifted
4844four positions right from the interrupt codes |PR_BIT|, |PW_BIT|, |PX_BIT|.
4845If the data is protected, we abort the load/store operation immediately;
4846this protects the privacy of other users.
4847
4848@<Check the protection bits and get the physical address@>=
4849if (data->stack_alert) {
4850  if (data->z.o.l&(PW_BIT>>PROT_OFFSET)) data->stack_alert=false; /* the page is writable, so no alert is needed */
4851  else data->z.o=g[rC].o; /* use the continuation page for stack overflow */
4852}
4853j=PRW_BITS; /* the permissions that this operation needs */
4854if (((data->z.o.l<<PROT_OFFSET)&j)!=j) { /* at least one of them is missing */
4855  if (data->i==syncd || data->i==syncid) goto sync_check;
4856  if (data->i!=preld && data->i!=prest)
4857    data->interrupt|=j&~(data->z.o.l<<PROT_OFFSET); /* report exactly the missing ones */
4858  data->stack_alert=false;
4859  goto fin_ex;
4860}
4861data->z.o=phys_addr(data->y.o,data->z.o); /* from now on |z| is the physical address */
4862
4863@ @<Do load/store stage 1 without D-cache lookup@>=
4864{@+octa *m; /* result of |write_search| */
4865  if (p) { /* the DT-cache had the key */
4866    p=use_and_fix(DTcache,p), data->z.o=p->data[0];
4867    @<Check the protection bits and get the physical address@>;
4868    if (data->i>=st && data->i<=syncid) data->state=st_ready;
4869    else {
4870      m=write_search(data,data->z.o);
4871      if (m && m!= DUNNO) data->x.o=*m, data->state=ld_ready; /* the data comes from a pending write */
4872      else data->state=DT_hit; /* stage 2 must look for the data */
4873    }
4874  }@+ else data->state=DT_miss;
4875  pass_after(DTcache->access_time);@+ goto passit;
4876}
4877
4878@ @<Do load/store stage~1 with known physical address@>=
4879{@+octa *m; /* result of |write_search| */
4880  if (!(data->loc.h&sign_bit)) { /* the instruction itself does not lie at an address with the sign bit */
4881    if (data->i==syncd || data->i==syncid) goto sync_check;
4882    if (data->i!=preld && data->i!=prest) data->interrupt |= N_BIT;
4883    goto fin_ex;
4884  }
4885  data->z.o=data->y.o;@+ data->z.o.h -= sign_bit; /* the physical address is |y| without its sign bit */
4886  if (data->z.o.h&0xffff0000) { /* any of the top 16 bits set: use |spec_read|, not memory or cache */
4887    switch (data->i) {
4888  case ldvts: case preld: case prest: case prego: case syncd: case syncid:
4889      goto fin_ex;
4890  case ld: case ldunc:@+if (mem_lock) wait(1);
4891    if (data->op<LDSF) i=(data->op&0xf)>>2; /* |i| is the size parameter for |spec_read| */
4892    else if (data->op<CSWAP) i=2;
4893    else i=3;
4894    data->x.o=spec_read(data->z.o,i);
4895    goto make_ld_ready;
4896  case pst: if ((data->op^CSWAP)<=1) { /* the opcode is |CSWAP| or |CSWAP+1| */
4897     data->x.o=spec_read(data->z.o,3);@+goto make_ld_ready;
4898    }
4899    data->x.o=zero_octa; /* other partial stores start from zero, then continue as |st| */
4900  case st: data->state=st_ready;@+pass_after(1);@+goto passit;
4901    }
4902  }@+else if (data->i>=st && data->i<=syncid) {
4903    data->state=st_ready;@+pass_after(1);@+goto passit;
4904  }
4905  m=write_search(data,data->z.o);
4906  if (m) {
4907    if (m==DUNNO) data->state=DT_hit; /* stage 2 will search again */
4908    else data->x.o=*m, data->state=ld_ready;
4909    pass_after(1);@+goto passit;
4910  }@+ else if (!Dcache) { /* without a D-cache we read memory directly */
4911    if (mem_lock) wait(1);
4912    data->x.o=mem_read(data->z.o);
4913make_ld_ready: set_lock(&mem_locker,mem_lock);
4914    data->state=ld_ready;
4915    startup(&mem_locker,mem_addr_time+mem_read_time);
4916    pass_after(mem_addr_time+mem_read_time);@+ goto passit;
4917  }
4918  if (Dcache->lock || (j=get_reader(Dcache))<0) { /* D-cache busy; let stage 2 do the lookup */
4919    data->state=DT_hit;@+pass_after(1);@+ goto passit;
4920  }
4921  startup(&Dcache->reader[j],Dcache->access_time);
4922  q=cache_search(Dcache,data->z.o);
4923  if (q) {
4924    if (data->i==ldunc) q=demote_and_fix(Dcache,q);
4925    else q=use_and_fix(Dcache,q);
4926    data->x.o=q->data[(data->z.o.l&(Dcache->bb-1))>>3]; /* the octabyte within the block */
4927    data->state=ld_ready;
4928  }@+else data->state=hit_and_miss;
4929  pass_after(Dcache->access_time);@+ goto passit;
4930}
4931
4932@ The program for the second stage is, likewise, rather long-winded, yet quite
4933similar to the cache manipulations we have already seen several times.
4934
4935Several instructions might be trying to fill the DT-cache for the same page.
4936(A similar situation faced us in the |write_from_wbuf| coroutine.)
4937The second stage therefore needs to do some
4938translation cache searching just as the first stage did. In this
4939stage, however, we don't go all out for speed, because DT-cache misses
4940are rare.
4941
4942@d DT_retry 8 /* second stage |state| when DT-cache should be searched again */
4943@d got_DT 9   /* second stage |state| when DT-cache entry has been computed */
4944
4945@<Special cases for states in later stages@>=
4946square_one: data->state=DT_retry;
4947 case DT_retry:@+if (DTcache->lock || (j=get_reader(DTcache))<0) wait(1); /* DT-cache busy */
4948   startup(&DTcache->reader[j],DTcache->access_time);
4949   p=cache_search(DTcache,trans_key(data->y.o));
4950   if (p) {
4951     p=use_and_fix(DTcache,p), data->z.o=p->data[0];
4952     @<Check the protection bits and get the physical address@>;
4953     if (data->i>=st && data->i<=syncid) data->state=st_ready;
4954     else data->state=DT_hit;
4955   }@+ else data->state=DT_miss;
4956   wait(DTcache->access_time);
4957 case DT_miss:@+if (DTcache->filler.next) /* somebody is already filling the DT-cache */
4958     if (data->i==preld || data->i==prest) goto fin_ex;@+ else goto square_one;
4959   if (no_hardware_PT || page_f) /* the translation must be done by software */
4960     if (data->i==preld || data->i==prest) goto fin_ex;@+else goto emulate_virt;
4961   p=alloc_slot(DTcache,trans_key(data->y.o));
4962   if (!p) goto square_one; /* no slot could be allocated */
4963   data->ptr_b=DTcache->filler_ctl.ptr_b=(void *)p;
4964   DTcache->filler_ctl.y.o=data->y.o; /* the filler gets the virtual address */
4965   set_lock(self,DTcache->fill_lock);
4966   startup(&DTcache->filler,1);
4967   data->state=got_DT;
4968   if (data->i==preld || data->i==prest) goto fin_ex;@+else sleep;
4969 case got_DT: release_lock(self,DTcache->fill_lock); /* the translation has arrived in |data->z.o| */
4970   @<Check the protection bits and get the physical address@>;
4971   if (data->i>=st && data->i<=syncid) goto finish_store;
4972    /* otherwise we fall through to |ld_retry| below */
4973
4974@ The second stage might also want to fill the D-cache (and perhaps
4975the S-cache) as we get the data.
4976
4977Several load instructions might be trying to fill the same cache block.
4978So we should go back and look in the D-cache again if we miss and
4979cannot allocate a slot immediately.
4980
4981A \.{PRELD} or \.{PREST} instruction, which is just a ``hint,'' doesn't do
4982anything more if the caches are already busy.
4983
4984@<Special cases for states in later stages@>=
4985ld_retry: data->state=DT_hit;
4986 case DT_hit:@+ if (data->i==preld || data->i==prest) goto fin_ex;
4987  @<Check for a hit in pending writes@>;
4988  if ((data->z.o.h&0xffff0000) || !Dcache) /* no D-cache applies to this address */
4989      @<Do load/store stage 2 without D-cache lookup@>;
4990  if (Dcache->lock || (j=get_reader(Dcache))<0) wait(1); /* D-cache busy */
4991  startup(&Dcache->reader[j],Dcache->access_time);
4992  q=cache_search(Dcache,data->z.o);
4993  if (q) {
4994    if (data->i==ldunc) q=demote_and_fix(Dcache,q);
4995    else q=use_and_fix(Dcache,q);
4996    data->x.o=q->data[(data->z.o.l&(Dcache->bb-1))>>3]; /* the octabyte within the block */
4997    data->state=ld_ready;
4998  }@+else data->state=hit_and_miss;
4999  wait(Dcache->access_time);
5000 case hit_and_miss:@+if (data->i==ldunc) goto avoid_D; /* an uncached load does not fill the D-cache */
5001    @<Try to get the contents of location |data->z.o| in the D-cache@>;
5002
5003@ @<Try to get the contents of location |data->z.o| in the D-cache@>=
5004@<Check for |prest| with a fully spanned cache block@>;
5005if (Dcache->filler.next) goto ld_retry; /* the D-cache filler is active */
5006if ((Scache&&Scache->lock) || (!Scache&&mem_lock)) goto ld_retry; /* the source of the block is locked */
5007q=alloc_slot(Dcache,data->z.o);
5008if (!q) goto ld_retry; /* no slot could be allocated */
5009if (Scache) set_lock(&Dcache->filler,Scache->lock)@;
5010else set_lock(&Dcache->filler,mem_lock);
5011set_lock(self,Dcache->fill_lock); /* held while we wait for the fill */
5012data->ptr_b=Dcache->filler_ctl.ptr_b=(void *)q;
5013Dcache->filler_ctl.z.o=data->z.o;
5014startup(&Dcache->filler,Scache? Scache->access_time: mem_addr_time);
5015data->state=ld_ready; /* the state in which we will resume */
5016if (data->i==preld || data->i==prest) goto fin_ex;@+else sleep;
5017
5018@ If a |prest| instruction makes it to the hot seat,
5019we have been assured by the user of \.{PREST} that the current
5020values of bytes in virtual addresses |data->y.o-(data->xx&-Dcache->bb)| through
5021|data->y.o+(data->xx&(Dcache->bb-1))|
5022are irrelevant. Hence we can pretend that we know they are zero. This
5023is advantageous if it saves us from filling a cache block from
5024the S-cache or from memory.
5025
5026@<Check for |prest| with a fully spanned cache block@>=
5027if (data->i==prest &&@|
5028   (data->xx>=Dcache->bb || ((data->y.o.l&(Dcache->bb-1))==0)) &&@|
5029   ((data->y.o.l+(data->xx&(Dcache->bb-1))+1)^data->y.o.l)>=Dcache->bb)
5030  goto prest_span; /* the whole cache block may be treated as zero */
5031
5032@ @<Special cases for states in later stages@>=
5033prest_span: data->state=prest_win;
5034case prest_win:@+ if (data!=old_hot || Dlocker.next) wait(1); /* wait for the hot seat and for |Dlocker| */
5035  if (Dcache->lock) goto fin_ex; /* give up if the D-cache is locked */
5036  q=alloc_slot(Dcache,data->z.o); /* OK if |Dcache->filler| is busy */
5037  if (q) {
5038    clean_block(Dcache,q);
5039    q->tag=data->z.o;@+q->tag.l &=-Dcache->bb; /* the tag is the block-aligned physical address */
5040    set_lock(&Dlocker,Dcache->lock);
5041    startup(&Dlocker,Dcache->copy_in_time);
5042  }
5043  goto fin_ex;
5044
5045@ @<Do load/store stage 2 without D-cache lookup@>=
5046{ /* read the octabyte straight from memory */
5047avoid_D:@+ if (mem_lock) wait(1); /* memory is in use */
5048  set_lock(&mem_locker,mem_lock);
5049  startup(&mem_locker,mem_addr_time+mem_read_time);
5050  data->x.o=mem_read(data->z.o);
5051  data->state=ld_ready;@+ wait(mem_addr_time+mem_read_time);
5052}
5053
5054@ @<Check for a hit in pending writes@>=
5055{
5056  octa *m=write_search(data,data->z.o);
5057  if (m==DUNNO) wait(1); /* try again next cycle */
5058  if (m) { /* a pending write supplies the data */
5059    data->x.o=*m;
5060    data->state=ld_ready;
5061    wait(1);
5062  }
5063}
5064
5065@ The requested octabyte will arrive sooner or later in |data->x.o|.
5066Then a load instruction is almost done, except that we might need
5067to massage the input a little bit.
5068
5069@<Special cases for states in later stages@>=
5070case ld_ready:@+if (self->lockloc)
5071    *(self->lockloc)=NULL, self->lockloc=NULL; /* give up any lock we hold */
5072  if (data->i>=st) goto finish_store;
5073  switch(data->op>>1) { /* each case covers an opcode and its immediate form */
5074    case LDB>>1: case LDBU>>1: j=(data->z.o.l&0x7)<<3;@+i=56;@+goto fin_ld;
5075    case LDW>>1: case LDWU>>1: j=(data->z.o.l&0x6)<<3;@+i=48;@+goto fin_ld;
5076    case LDT>>1: case LDTU>>1: j=(data->z.o.l&0x4)<<3;@+i=32;
5077 fin_ld: data->x.o=shift_right(shift_left(data->x.o,j),i,data->op&0x2);
  /* shift the field left by |j| to the top, then right by |i|; opcode bit 1 picks the kind of right shift */
5078    default: goto fin_ex;
5079    case LDHT>>1:@+if (data->z.o.l&4) data->x.o.h=data->x.o.l;
5080      data->x.o.l=0;@+ goto fin_ex; /* the chosen tetra ends up in the high half */
5081    case LDSF>>1:@+if (data->z.o.l&4) data->x.o.h=data->x.o.l;
5082      if ((data->x.o.h&0x7f800000)==0 && (data->x.o.h&0x7fffff)) { /* these inputs cost |denin_penalty| extra cycles */
5083        data->x.o=load_sf(data->x.o.h);
5084        data->state=3;@+wait(denin_penalty);
5085      }
5086      else data->x.o=load_sf(data->x.o.h);@+goto fin_ex;
5087    case LDPTP>>1:@+
5088      if ((data->x.o.h&sign_bit)==0 || (data->x.o.l&0x1ff8)!=page_n)
5089        data->x.o=zero_octa; /* not acceptable as a page table pointer */
5090      else data->x.o.l &= -(1<<13);
5091      goto fin_ex;
5092    case LDPTE>>1:@+if ((data->x.o.l&0x1ff8)!=page_n) data->x.o=zero_octa;
5093      else data->x.o=incr(oandn(data->x.o,page_mask),data->x.o.l&0x7); /* keep only the low three bits below the page address */
5094      data->x.o.h &= 0xffff;@+ goto fin_ex;
5095    case UNSAVE>>1: @<Handle an internal \.{UNSAVE} when it's time to load@>;
5096  }
5097
5098@ @<Special cases for states in later stages@>=
5099 finish_store: data->state=st_ready;
5100case st_ready:@+ switch (data->i) {
5101 case st: case pst: @<Finish a store command@>;
5102 case syncd: data->b.o.l=(Dcache? Dcache->bb: 8192);@+goto do_syncd; /* 8192 is used when there is no D-cache */
5103 case syncid: data->b.o.l=(Icache? Icache->bb: 8192);
5104   if (Dcache && Dcache->bb<data->b.o.l) data->b.o.l=Dcache->bb; /* take the smaller of the two block sizes */
5105   goto do_syncid;
5106}
5107
5108@ Store instructions have an extra complication, because some of them need
5109to check for overflow.
5110
5111@<Finish a store command@>=
5112data->x.addr=data->z.o; /* the physical address becomes known to later instructions */
5113if (data->b.p) wait(1); /* the value to be stored is not yet available */
5114switch(data->op>>1) {
5115 case STUNC>>1: data->i=stunc; /* then continue as a full octabyte store */
5116 default: data->x.o=data->b.o;@+goto fin_ex;
5117 case STSF>>1: set_round;@+ data->b.o.h=store_sf(data->b.o);
5118    data->interrupt |= exceptions;
5119    if ((data->b.o.h&0x7f800000)==0 && (data->b.o.h&0x7fffff)) { /* these results cost |denout_penalty| extra cycles */
5120      if (data->z.o.l&4) data->x.o.l=data->b.o.h;
5121      else data->x.o.h=data->b.o.h;
5122      data->state=3;@+wait(denout_penalty);
5123    }
5124 case STHT>>1:@+if (data->z.o.l&4) data->x.o.l=data->b.o.h; /* bit 2 of the address selects the tetra */
5125  else data->x.o.h=data->b.o.h;
5126  goto fin_ex;
5127 case STB>>1: case STBU>>1: j=(data->z.o.l&0x7)<<3;@+i=56;@+goto fin_st;
5128 case STW>>1: case STWU>>1: j=(data->z.o.l&0x6)<<3;@+i=48;@+goto fin_st;
5129 case STT>>1: case STTU>>1: j=(data->z.o.l&0x4)<<3;@+i=32;
5130  fin_st: @<Insert |data->b.o| into the proper field of |data->x.o|,
5131                 checking for arithmetic exceptions if signed@>;
5132  goto fin_ex;
5133 case CSWAP>>1: @<Finish a \.{CSWAP}@>;
5134 case SAVE>>1: @<Handle an internal \.{SAVE} when it's time to store@>;
5135  }
5136
5137@ @<Insert |data->b.o| into the proper field...@>=
5138{
5139  octa mask;
/* when bit 1 of |data->op| is clear (the signed opcodes), set |V_BIT| unless the value survives a left shift by |i| followed by a right shift by |i| with last argument 0 (presumably the sign-propagating form of |shift_right|) */
5140  if (!(data->op&2)) {@+octa before,after;
5141    before=data->b.o;@+after=shift_right(shift_left(data->b.o,i),i,0);
5142    if (before.l!=after.l || before.h!=after.h) data->interrupt|=V_BIT;
5143  }
/* |mask| marks the destination field: all ones shifted left by |i|, then right by |j| */
5144  mask=shift_right(shift_left(neg_one,i),j,1);
/* move the low-order bits of |data->b.o| into that same position */
5145  data->b.o=shift_right(shift_left(data->b.o,i),j,1);
/* replace the masked bits of |data->x.o| by those of |data->b.o|; bits outside the mask are unchanged */
5146  data->x.o.h^=mask.h&(data->x.o.h^data->b.o.h);
5147  data->x.o.l^=mask.l&(data->x.o.l^data->b.o.l);
5148}
5149
5150@ The \.{CSWAP} operation has four inputs $\rm(\$X, \$Y, \$Z, rP)$ as well as
5151three outputs $\rm(\$X,M_8[A],rP)$. To keep from exceeding the capacity
5152of the control blocks in our pipeline, we wait until this instruction reaches
5153the hot seat, thereby allowing us non-speculative access to~rP.
5154
5155@<Finish a \.{CSWAP}@>=
/* proceed only in the hot seat, where rP can be read and written non-speculatively */
5156if (data!=old_hot) wait(1);
/* if |data->x.o| equals rP, report 1 in |data->a.o| and let |data->b.o| become the new |data->x.o|; otherwise copy |data->x.o| into rP */
5157if (data->x.o.h==g[rP].o.h && data->x.o.l==g[rP].o.l) {
5158  data->a.o.l=1; /* |data->a.o.h| is zero */
5159  data->x.o=data->b.o;
5160}@+else {
5161  g[rP].o=data->x.o; /* |data->a.o| is zero */
5162  if (verbose&issue_bit) {
5163    printf(" setting rP=");@+print_octa(g[rP].o);@+printf("\n");
5164  }
5165}
5166data->i=cswap; /* cosmetic change, affects the trace output only */
5167goto fin_ex;
5168
5169@* The fetch stage. Now that we've mastered the most difficult memory
5170operations, we can relax and apply our knowledge to the slightly simpler task
5171of filling the fetch buffer. Fetching is like loading/storing, except that we
5172use the I-cache instead of the D-cache. It's slightly simpler because the
5173I-cache is read-only. Further simplifications would be possible if there
5174were no \.{PREGO} instruction, because there is only one fetch unit.
5175However, we want to implement \.{PREGO} with reasonable efficiency, in order
5176to see if that instruction is worthwhile; so we include the complications of
5177simultaneous I-cache and IT-cache readers, which we
5178have already implemented for the D-cache and DT-cache.
5179
5180The fetch coroutine is always present, as the one and only coroutine with
5181|stage| number~zero.
5182
5183In normal circumstances, the fetch coroutine accesses a cache block containing
5184the instruction whose virtual address is given by |inst_ptr| (the instruction
5185pointer), and transfers up to |fetch_max| instructions from that block to the
5186fetch buffer. Complications arise if the instruction isn't in the cache, or if
5187we can't translate the virtual address because of a miss in the IT-cache.
5188Moreover, |inst_ptr| is a \&{spec} variable whose value might not even be
5189known; if |inst_ptr.p| is nonnull, we don't know what to fetch.
5190@^program counter@>
5191
5192@<External v...@>=
5193Extern spec inst_ptr; /* the instruction pointer (aka program counter); its value |inst_ptr.o| is not known while |inst_ptr.p| is nonnull */
5194Extern octa *fetched; /* buffer for incoming instructions, two per octabyte (the |h| half, then the |l| half) */
5195
5196@ The fetch coroutine usually begins a cycle in state |fetch_ready|, with
5197the most recently fetched octabytes in positions |fetch_lo|, |fetch_lo+1|,
5198\dots, |fetch_hi-1| of a buffer called |fetched|. Once that buffer has been
5199exhausted, the coroutine reverts to state~0; with luck, the buffer might have
5200more data by the time the next cycle rolls around.
5201
5202@<Glob...@>=
5203int fetch_lo, fetch_hi; /* the active region of that buffer */
5204coroutine fetch_co; /* the fetch coroutine itself */
5205control fetch_ctl; /* its control block, attached through |fetch_co.ctl| during initialization */
5206
5207@ @<Initialize e...@>=
/* give the fetch coroutine its control block and a printable name, then start it */
5208fetch_co.ctl=&fetch_ctl;
5209fetch_co.name="Fetch";
5210fetch_ctl.go.o.l=4; /* NOTE(review): the reason for the initial value 4 is not visible in this part of the program */
5211startup(&fetch_co,1);
5212
5213@ @<Restart the fetch coroutine@>=
5214if (fetch_co.lockloc) *(fetch_co.lockloc)=NULL,fetch_co.lockloc=NULL;
5215unschedule(&fetch_co);
5216startup(&fetch_co,1);
5217
5218@ Some of the actions here are done not only by the fetcher but also by the
5219first and second stages of a |prego| operation.
5220
5221@d wait_or_pass(t) if (data->i==prego) {@+pass_after(t);@+goto passit;@+}
5222                   else wait(t) /* a |prego| command uses |pass_after| and goes to |passit|; anything else simply waits |t| cycles */
5223
5224@<Simulate an action of the fetch coroutine@>=
/* state 0 waits for a known instruction pointer and copies it to |data->y.o|; state 1 starts the address translation and the cache lookup */
5225switch0:@+ switch(data->state) {
5226 new_fetch: data->state=0;
5227 case 0: @<Wait, if necessary, until the instruction pointer is known@>;
5228   data->y.o=inst_ptr.o;
5229   data->state=1;@+ data->interrupt=0;@+ data->x.o=data->z.o=zero_octa;
/* an address whose sign bit is set is treated as a known physical address */
5230 case 1: start_fetch:@+ if (data->y.o.h&sign_bit)
5231    @<Begin fetch with known physical address@>;
5232  if (page_bad) goto bad_fetch;
/* try again next cycle unless the IT-cache is unlocked and |get_reader| returns a nonnegative reader index */
5233  if (ITcache->lock || (j=get_reader(ITcache))<0) wait(1);
5234  startup(&ITcache->reader[j],ITcache->access_time);
5235  @<Look up the address in the IT-cache, and also in the I-cache if possible@>;
5236  wait_or_pass(ITcache->access_time);
5237  @<Other cases for the fetch coroutine@>@;
5238}
5239
5240@ @<Handle special cases for operations like |prego| and |ldvts|@>=
5241if (data->i==prego) goto start_fetch; /* a |prego| joins the fetch code at label |start_fetch|, which is state 1 of the fetch switch */
5242
5243@ @<Wait, if necessary, until the instruction pointer is known@>=
5244if (inst_ptr.p) {
5245  if (inst_ptr.p!=UNKNOWN_SPEC && inst_ptr.p->known)
5246    inst_ptr.o=inst_ptr.p->o, inst_ptr.p=NULL;
5247  wait(1);
5248}
5249
5250@ @d got_IT 19   /* |state| when IT-cache entry has been computed */
5251@d IT_miss 20 /* |state| when IT-cache doesn't hold the key */
5252@d IT_hit 21 /* |state| when physical instruction address is known */
5253@d Ihit_and_miss 22 /* |state| when I-cache misses */
5254@d fetch_ready 23 /* |state| when instructions have been read */
5255@d got_one 24 /* |state| when a ``preview'' octabyte is ready */
5256
5257@<Look up the address in the IT-cache...@>=
/* search the IT-cache for the translation key of the virtual address |data->y.o| */
5258p=cache_search(ITcache,trans_key(data->y.o));
/* with no I-cache, or when it is locked or has no free reader, only the translation is done now */
5259if (!Icache || Icache->lock || (j=get_reader(Icache))<0)
5260  @<Begin fetch without I-cache lookup@>;
5261startup(&Icache->reader[j],Icache->access_time);
5262if (p) @<Do a simultaneous lookup in the I-cache@>@;
5263else data->state=IT_miss;
5264
5265@ We assume that it is possible to look up a virtual address in the IT-cache
5266at the same time as we look for a corresponding physical address in the
5267I-cache, provided that the lower $b+c$ bits of the two addresses are the same.
5268(See the remarks about ``page coloring,'' when we made similar assumptions
5269about the DT-cache and D-cache.)
5270@^page coloring@>
5271
5272@<Do a simultaneous lookup in the I-cache@>=
5273{
5274  @<Update IT-cache usage and check the protection bits@>;
5275  data->z.o=phys_addr(data->y.o,p->data[0]); /* physical address derived from the IT-cache entry */
/* when |b+c| exceeds |page_s|, the address bits from position |page_s| up to position |b+c| must agree in the virtual and physical addresses; if they differ, the I-cache lookup was wasted */
5276  if (Icache->b+Icache->c>page_s &&@|
5277      ((data->y.o.l^data->z.o.l)&((Icache->bb<<Icache->c)-(1<<page_s))))
5278    data->state=IT_hit; /* spurious I-cache lookup */
5279  else {
5280    q=cache_search(Icache,data->z.o);
5281    if (q) {
5282      q=use_and_fix(Icache,q);
5283      @<Copy the data from block~|q| to |fetched|@>;
5284      data->state=fetch_ready;
5285    }@+else data->state=Ihit_and_miss;
5286  }
5287  wait_or_pass(max(ITcache->access_time,Icache->access_time)); /* the two lookups are simultaneous, so the slower one determines the delay */
5288}
5289
5290@ @<Update IT-cache usage and check the protection bits@>=
5291p=use_and_fix(ITcache,p);
5292if (!(p->data[0].l&(PX_BIT>>PROT_OFFSET))) goto bad_fetch; /* the entry lacks the protection bit that corresponds to |PX_BIT| */
5293
5294@ At this point |inst_ptr.o| equals |data->y.o|.
5295
5296@<Copy the data from block~|q| to |fetched|@>=
5297if (data->i!=prego) {
5298  for (j=0;j<Icache->bb>>3;j++) fetched[j]=q->data[j];
5299  fetch_lo=(inst_ptr.o.l&(Icache->bb-1))>>3;
5300  fetch_hi=Icache->bb>>3;
5301}
5302
5303@ @<Begin fetch without I-cache lookup@>=
5304{
/* an IT-cache hit yields the physical address and state |IT_hit|; a miss leads to state |IT_miss| */
5305  if (p) {
5306    @<Update IT-cache usage and check the protection bits@>;
5307    data->z.o=phys_addr(data->y.o,p->data[0]);
5308    data->state=IT_hit;
5309  }@+ else data->state=IT_miss;
5310  wait_or_pass(ITcache->access_time);
5311}
5312
5313@ @<Begin fetch with known physical address@>=
5314{
/* a |prego| whose own location lacks the sign bit is abandoned here */
5315  if (data->i==prego && !(data->loc.h&sign_bit)) goto fin_ex;
5316  data->z.o=data->y.o;@+ data->z.o.h -= sign_bit; /* remove the sign bit to obtain the physical address */
/* label |known_phys| is also reached from state |IT_hit|; an address with any of the upper 16 bits set is rejected */
5317 known_phys:@+  if (data->z.o.h&0xffff0000) goto bad_fetch;
5318  if (!Icache) @<Read from memory into |fetched|@>;
/* remain in state |IT_hit| and retry next cycle while the I-cache is locked or has no free reader */
5319  if (Icache->lock || (j=get_reader(Icache))<0) {
5320    data->state=IT_hit;@+ wait_or_pass(1);
5321  }
5322  startup(&Icache->reader[j],Icache->access_time);
5323  q=cache_search(Icache,data->z.o);
5324  if (q) {
5325    q=use_and_fix(Icache,q);
5326    @<Copy the data from block~|q| to |fetched|@>;
5327    data->state=fetch_ready;
5328  }@+else data->state=Ihit_and_miss;
5329  wait_or_pass(Icache->access_time);
5330}
5331
5332@ @<Read from memory into |fetched|@>=
5333{@+octa addr;
5334  addr=data->z.o;
5335  if (mem_lock) wait(1); /* memory is locked; try again next cycle */
5336  set_lock(&mem_locker,mem_lock);
5337  startup(&mem_locker,mem_addr_time+mem_read_time);
5338  addr.l&=-(bus_words<<3); /* clear the low address bits; this rounds down to a group of |bus_words| octabytes, assuming |bus_words| is a power of two */
5339  fetched[0]=mem_read(addr);
/* the remaining octabytes come straight from the chunk |mem_hash[last_h]|; presumably |mem_read| has just set |last_h| */
5340  for (j=1;j<bus_words;j++)
5341    fetched[j]=mem_hash[last_h].chunk[((addr.l&0xffff)>>3)+j];
5342  fetch_lo=(data->z.o.l>>3)&(bus_words-1);@+ fetch_hi=bus_words; /* the active region starts at the octabyte that was asked for */
5343  data->state=fetch_ready;
5344  wait(mem_addr_time+mem_read_time);
5345}
5346
5347@ @<Other cases for the fetch coroutine@>=
/* |IT_miss|: while |ITcache->filler.next| is nonnull (the filler is apparently busy), a |prego| gives up and the fetcher retries next cycle */
5348 case IT_miss:@+if (ITcache->filler.next)
5349     if (data->i==prego) goto fin_ex;@+else wait(1);
5350   if (no_hardware_PT || page_f)
5351     @<Insert dummy instruction for page table emulation@>;
5352   p=alloc_slot(ITcache,trans_key(data->y.o));
5353   if (!p) /* hey, it was present after all */
5354     if (data->i==prego) goto fin_ex;@+else goto new_fetch;
/* give the new slot and the virtual address to the IT-cache filler, take the fill lock, and start the filler */
5355   data->ptr_b=ITcache->filler_ctl.ptr_b=(void *)p;
5356   ITcache->filler_ctl.y.o=data->y.o;
5357   set_lock(self,ITcache->fill_lock);
5358   startup(&ITcache->filler,1);
5359   data->state=got_IT;
5360   if (data->i==prego) goto fin_ex;@+else sleep;
/* |got_IT|: |data->z.o| is tested for the |PX_BIT| protection bit and then converted to a physical address; presumably the filler has put the translation there */
5361 case got_IT: release_lock(self,ITcache->fill_lock);
5362   if (!(data->z.o.l&(PX_BIT>>PROT_OFFSET))) goto bad_fetch;
5363   data->z.o=phys_addr(data->y.o,data->z.o);
5364 fetch_retry: data->state=IT_hit;
5365 case IT_hit:@+if (data->i==prego) goto fin_ex;@+else goto known_phys;
5366 case Ihit_and_miss:
5367    @<Try to get the contents of location |data->z.o| in the I-cache@>;
5368
5369@ @<Special cases for states in later stages@>=
5370case IT_miss: case Ihit_and_miss: case IT_hit: case fetch_ready: goto switch0; /* these states are handled by the switch of the fetch coroutine */
5371
5372@ @<Try to get the contents of location |data->z.o| in the I-cache@>=
/* retry through |fetch_retry| whenever the I-cache filler, its data source (the S-cache, or memory when there is no S-cache) or a cache slot is unavailable */
5373if (Icache->filler.next) goto fetch_retry;
5374if ((Scache&&Scache->lock) || (!Scache&&mem_lock)) goto fetch_retry;
5375q=alloc_slot(Icache,data->z.o);
5376if (!q) goto fetch_retry;
/* lock the data source on behalf of the filler, and the I-cache fill lock on our own behalf */
5377if (Scache) set_lock(&Icache->filler,Scache->lock)@;
5378else set_lock(&Icache->filler,mem_lock);
5379set_lock(self,Icache->fill_lock);
/* tell the filler which slot to fill and which address to read, then start it */
5380data->ptr_b=Icache->filler_ctl.ptr_b=(void *)q;
5381Icache->filler_ctl.z.o=data->z.o;
5382startup(&Icache->filler,Scache? Scache->access_time: mem_addr_time);
5383data->state=got_one;
5384if (data->i==prego) goto fin_ex;@+else sleep;
5385
5386@ The I-cache filler will wake us up with the octabyte we want, before
5387it has filled the entire cache block. In that case we can fetch one
5388or two instructions before the rest of the block has been loaded.
5389
5390@<Other cases for the fetch coroutine@>=
/* a bad fetch simply ends a |prego|; otherwise a dummy instruction marked with |PX_BIT| is supplied */
5391bad_fetch:@+ if (data->i==prego) goto fin_ex;
5392  data->interrupt |= PX_BIT;
5393swym_one: fetched[0].h=fetched[0].l=SWYM<<24; /* both halves hold a |SWYM| opcode with zero operand bytes */
5394  goto fetch_one;
5395case got_one: fetched[0]=data->x.o; /* a ``preview'' of the new cache data */
5396fetch_one:  fetch_lo=0;@+fetch_hi=1; /* exactly one octabyte is available */
5397  data->state=fetch_ready;
/* |fetch_ready|: clear the location recorded in |self->lockloc|, if any, then move up to |fetch_max| instructions into the fetch buffer */
5398case fetch_ready:@+if (self->lockloc)
5399    *(self->lockloc)=NULL, self->lockloc=NULL;
5400  if (data->i==prego) goto fin_ex;
5401  for (j=0;j<fetch_max;j++) {
5402    register fetch *new_tail;
/* the buffer is circular: |tail| moves downward and wraps from |fetch_bot| to |fetch_top| */
5403    if (tail==fetch_bot) new_tail=fetch_top;
5404    else new_tail=tail-1;
5405    if (new_tail==head) break; /* fetch buffer is full */
5406    @<Install a new instruction into the |tail| position@>;
5407    tail=new_tail;
/* after installing the dummy instruction for page table emulation, the fetcher goes to sleep */
5408    if (sleepy) {
5409      sleepy=false;@+ sleep;
5410    }
5411    inst_ptr.o=incr(inst_ptr.o,4);
5412    if (fetch_lo==fetch_hi) goto new_fetch; /* |fetched| has been exhausted */
5413  }
5414  wait(1);
5415
5416@ @<Insert dummy instruction for page table emulation@>=
5417{
/* if the translation is in the IT-cache after all, simply start the fetch over */
5418  if (cache_search(ITcache,trans_key(inst_ptr.o))) goto new_fetch;
/* otherwise supply a dummy |SWYM| flagged with |F_BIT|, and remember to sleep once it has been installed */
5419  data->interrupt|=F_BIT;
5420  sleepy=true;
5421  goto swym_one;
5422}
5423
5424@ @<Glob...@>=
5425bool sleepy; /* have we just emitted the page table emulation call? (if so, the fetcher sleeps right after installing it) */
5426
5427@ At this point we check for egregiously invalid instructions. (Sometimes
5428the dispatcher will actually allow such instructions to occupy
5429the fetch buffer, for internally generated commands.)
5430
5431@<Install a new instruction into the |tail| position@>=
5432tail->loc=inst_ptr.o;
/* an address with bit 2 set takes the |l| half and thereby uses up the current octabyte of |fetched|; otherwise the |h| half is taken */
5433if (inst_ptr.o.l&4) tail->inst=fetched[fetch_lo++].l;
5434else tail->inst=fetched[fetch_lo].h;
5435@^big-endian versus little-endian@>
5436@^little-endian versus big-endian@>
5437tail->interrupt=data->interrupt;
5438i=tail->inst>>24; /* the opcode byte */
/* opcodes from |RESUME| through |SYNC| get |B_BIT| when the instruction has a 1 in any position of its |bad_inst_mask| entry */
5439if (i>=RESUME && i<=SYNC && (tail->inst&bad_inst_mask[i-RESUME]))
5440  tail->interrupt |= B_BIT;
5441tail->noted=false;
5442if (inst_ptr.o.l==breakpoint.l && inst_ptr.o.h==breakpoint.h)
5443  breakpoint_hit=true;
5444
5445@ The commands \.{RESUME}, \.{SAVE}, \.{UNSAVE}, and \.{SYNC} should not have
5446nonzero bits in the positions defined here.
5447
5448@<Global...@>=
5449int bad_inst_mask[4]={0xfffffe,0xffff,0xffff00,0xfffff8}; /* indexed by opcode minus |RESUME| */
5450
5451@* Interrupts. The scariest thing about the design of a pipelined machine is
5452the existence of interrupts, which disrupt the smooth flow of a computation in
5453ways that are difficult to anticipate. Fortunately, however, the discipline of
5454a reorder buffer, which forces instructions to be committed in order,
5455allows us to deal with interrupts in a fairly natural way. Our solution to the
5456problems of dynamic scheduling and speculative execution therefore solves the
5457interrupt problem as well.
5458@^interrupts@>
5459
5460\MMIX\ has three kinds of interrupts, which show up as bit codes in the
5461|interrupt| field when an instruction is ready to be committed:
5462|H_BIT| invokes a trip handler, for \.{TRIP} instructions and
5463arithmetic exceptions; |F_BIT| invokes a forced-trap handler, for \.{TRAP}
5464instructions and unimplemented instructions that need to be emulated
5465in software; |E_BIT| invokes a dynamic-trap handler, for external
5466interrupts like I/O signals or for internal interrupts caused by
5467improper instructions.
5468In all three cases, the pipeline control has already been redirected to fetch
5469new instructions starting at the correct handler address by the time an
5470interrupted instruction is ready to be committed.
5471
5472@ Most instructions come to the following part of the program, if they
5473have finished execution with any~1s among the eight trip bits or the
5474eight trap bits.
5475
5476If the trip bits aren't all zero, we want to update the event bits
5477of~rA, or perform an enabled trip handler, or both. If the trap bits
5478are nonzero, we need to hold onto them until we get to the hot seat,
5479when they will be joined with the bits of~rQ and probably cause an interrupt.
5480A load or store instruction with nonzero trap bits will be nullified,
5481not committed.
5482
5483Underflow that is exact and not enabled is ignored, in accordance with
5484the IEEE standard conventions. (This applies also to underflow
5485triggered by |RESUME_SET|.)
5486
5487@d is_load_store(i) ((i)>=ld && (i)<=cswap) /* true for internal opcodes from |ld| through |cswap|; the argument is parenthesized so that any expression can be passed */
5488
5489@<Handle interrupt at end of execution stage@>=
5490{
/* a load or store with any trap bit (low byte of |interrupt|) goes directly to state 5 */
5491  if ((data->interrupt&0xff) && is_load_store(data->i)) goto state_5;
/* |j| receives the trip bits (second byte), which are removed from |data->interrupt| */
5492  j=data->interrupt&0xff00;
5493  data->interrupt -= j;
/* underflow without inexact is dropped unless the underflow bit is set in |data->ra| */
5494  if ((j&(U_BIT+X_BIT))==U_BIT && !(data->ra.o.l & U_BIT)) j&=~U_BIT;
/* trip bits that are not set in |data->ra| are recorded in |arith_exc| */
5495  data->arith_exc=(j&~data->ra.o.l)>>8;
5496  if (j&data->ra.o.l) @<Prepare for exceptional trip handler@>;
5497  if (data->interrupt&0xff) goto state_5;
5498}
5499
5500@ Since execution is speculative, an exceptional condition might not
5501be part of the ``real'' computation. Indeed, the present coroutine
5502might have already been deissued.
5503
5504@<Prepare for exceptional trip handler@>=
5505{
5506  i=issued_between(data,cool);
5507  if (i<deissues) goto die; /* this command has itself already been deissued */
5508  deissues=i;
5509  old_tail=tail=head;@+resuming=0; /* clear the fetch buffer */
5510  @<Restart the fetch coroutine@>;
5511  cool_hist=data->hist;
/* scan the enabled trip bits, shifting left until |D_BIT| is reached; |m| starts at 16 and grows by 16 for every bit skipped */
5512  for (i=j&data->ra.o.l,m=16;!(i&D_BIT);i<<=1,m+=16);
5513  data->arith_exc |= (j & ~(0x10000 >> (m >> 4))) >> 8;
5514      /* trips taken are not logged as events */
5515  data->go.o.h=0, data->go.o.l=m;
5516  inst_ptr.o=data->go.o, inst_ptr.p=NULL; /* fetching continues at address |m| */
5517  data->interrupt |= H_BIT;
5518  goto state_4;
5519}
5520
5521@ @<Prepare to emulate the page translation@>=
5522i=issued_between(data,cool);
5523if (i<deissues) goto die; /* this command has itself already been deissued */
5524deissues=i;
5525old_tail=tail=head;@+resuming=0; /* clear the fetch buffer */
5526@<Restart the fetch coroutine@>;
5527cool_hist=data->hist;
5528inst_ptr.p=UNKNOWN_SPEC; /* the fetcher has to wait; state 5 supplies rT as the new instruction pointer */
5529data->interrupt |= F_BIT;
5530
5531@ We need to stop dispatching when calling a trip handler from within
5532the reorder buffer,
5533lest we issue an instruction that uses
5534|g[255]| or |rB| as an operand.
5535
5536@<Special cases for states in the first stage@>=
5537emulate_virt: @<Prepare to emulate the page translation@>;
5538state_4: data->state=4;
/* state 4: acquire |dispatch_lock|, retrying each cycle while it is held */
5539case 4:@+if (dispatch_lock) wait(1);
5540  set_lock(self,dispatch_lock);
5541state_5: data->state=5;
/* state 5: wait until this command is in the hot seat */
5542case 5:@+if (data!=old_hot) wait(1);
/* with |F_BIT| set on a command other than |trap|, fetching is redirected to rT, and a load or store sets |nullifying| */
5543  if ((data->interrupt&F_BIT) && data->i!=trap) {
5544    inst_ptr.o=g[rT].o, inst_ptr.p=NULL;
5545    if (is_load_store(data->i)) nullifying=true;
5546  }
/* the trap bits are merged into the high half of rQ and of |new_Q| */
5547  if (data->interrupt&0xff) {
5548    g[rQ].o.h |= data->interrupt&0xff;
5549    new_Q.h |= data->interrupt&0xff;
5550    if (verbose&issue_bit) {
5551      printf(" setting rQ=");@+print_octa(g[rQ].o);@+printf("\n");
5552    }
5553  }
5554  goto die;
5555
5556@ The instructions of the previous section appear in the switch for
5557coroutine stage~1 only. We need to use them also in later stages.
5558
5559@<Special cases for states in later stages@>=
5560case 4: goto state_4; /* reuse the code that appears in the stage-1 switch */
5561case 5: goto state_5;
5562
5563@ @<Special cases of instruction dispatch@>=
/* a destination register numbered at least |cool_L| but less than |cool_G| sends control to |increase_L| first */
5564case trap:@+ if ((flags[op]&X_is_dest_bit) &&
5565                cool->xx<cool_G && cool->xx>=cool_L)
5566    goto increase_L;
5567  if (!g[rT].up->known || !g[rJ].up->known) goto stall; /* both rT and rJ must be known */
5568  inst_ptr=specval(&g[rT]); /* traps and emulated ops */
5569  cool->need_b=true, cool->b=specval(&g[255]);
/* |trap| falls into |trip|: |g[255]| is renamed to receive the current rJ, and rBB (for |trap|) or rB (for |trip|) is renamed as output |a| */
5570case trip: if (!g[rJ].up->known) goto stall;
5571  cool->ren_x=true, spec_install(&g[255],&cool->x);
5572  cool->x.known=true, cool->x.o=g[rJ].up->o;
5573  if (i==trip) cool->go.o=zero_octa;
5574  cool->ren_a=true, spec_install(&g[i==trap? rBB: rB],&cool->a);@+break;
5575
5576@ @<Cases for stage 1 execution@>=
/* both commands raise their interrupt bit and copy operand |b| to output |a| */
5577case trap: data->interrupt |= F_BIT;@+ data->a.o=data->b.o;@+ goto fin_ex;
5578case trip: data->interrupt |= H_BIT;@+ data->a.o=data->b.o;@+ goto fin_ex;
5579
5580@ The following check is performed at the beginning of every cycle.
5581An instruction in the hot seat can be externally interrupted only if
5582it is ready to be committed and not already marked for tripping
5583or trapping.
5584
5585@<Check for external interrupt@>=
5586g[rI].o=incr(g[rI].o,-1); /* rI decreases by 1 each time this check is made */
5587if (g[rI].o.l==0 && g[rI].o.h==0) { /* it has just reached zero */
5588  g[rQ].o.l |= INTERVAL_TIMEOUT, new_Q.l |= INTERVAL_TIMEOUT;
5589    if (verbose&issue_bit) {
5590      printf(" setting rQ=");@+print_octa(g[rQ].o);@+printf("\n");
5591    }
5592  }
5593trying_to_interrupt=false;
/* interrupt when some bit of rQ is also set in rK, |cool| differs from |hot|, the hot command is not already marked with |E_BIT|, |F_BIT| or |H_BIT| and is not a |resum|, and no interruption is in progress */
5594if (((g[rQ].o.h&g[rK].o.h)||(g[rQ].o.l&g[rK].o.l)) && cool!=hot &&@|
5595     !(hot->interrupt&(E_BIT+F_BIT+H_BIT)) && !doing_interrupt &&@|
5596     !(hot->i==resum)) {
5597  if (hot->owner) trying_to_interrupt=true; /* the hot command still has an owner, so it is only asked to pause */
5598  else {
5599    hot->interrupt |= E_BIT;
5600    @<Deissue all but the hottest command@>;
5601    inst_ptr.o=g[rTT].o;@+inst_ptr.p=NULL; /* fetching continues at the address in rTT */
5602  }
5603}
5604
5605@ @<Glob...@>=
5606bool trying_to_interrupt; /* encouraging interruptible operations to pause */
5607bool nullifying; /* stopping dispatch to nullify a load/store command */
5608
5609@ It's possible that the command in the hot seat has been deissued,
5610but only if the simulator has done so at the user's request. Otherwise
5611the test `|i>=deissues|' here will always succeed.
5612
5613The value of |cool_hist| becomes flaky here. We could try to keep it
5614strictly up to date, but the unpredictable nature of external interrupts
5615suggests that we are better off leaving it alone. (It's only a heuristic
5616for branch prediction, and a sufficiently strong prediction will survive
5617one-time glitches due to interrupts.)
5618
5619@<Deissue all but the hottest command@>=
/* count the commands issued between |hot| and |cool|; they are deissued unless an even larger deissue request is already pending */
5620i=issued_between(hot,cool);
5621if (i>=deissues) {
5622  deissues=i;
5623  tail=head;@+resuming=0; /* clear the fetch buffer */
5624  @<Restart the fetch coroutine@>;
5625  if (is_load_store(hot->i)) nullifying=true; /* an interrupted load or store sets |nullifying| */
5626}
5627
5628@ Even though an interrupted instruction has officially been either
5629``committed'' or ``nullified,'' it stays in the hot seat for
5630two or three extra cycles,
5631while we save enough of the machine state to resume the computation later.
5632
5633%Notice, incidentally, that |H_BIT| and |E_BIT| might both be present
5634%simultaneously. In such cases we first prepare for a trip handler, but
5635%interrupt that for a dynamic trap handler. (Ah, the joys of computer
5636%architecture.)
5637
5638@<Begin an interruption and |break|@>=
5639{
5640  if (!(hot->interrupt&H_BIT)) g[rK].o=zero_octa; /* trap */
/* three preparation cycles (with dispatching suppressed) are needed, except for a |trip| or |trap| command, which needs only two */
5641  if (((hot->interrupt&H_BIT)&&hot->i!=trip) ||@|
5642      ((hot->interrupt&F_BIT)&&hot->i!=trap) ||@|
5643      (hot->interrupt&E_BIT)) doing_interrupt=3, suppress_dispatch=true;
5644  else doing_interrupt=2; /* trip or trap started by dispatcher */
5645  break;
5646}
5647
5648@ If a memory failure occurs, we should set rF here, either in
5649case~2 or case~1. The simulator doesn't do anything with~rF at present.
5650
5651@<Perform one cycle of the interrupt preparations@>=
/* |doing_interrupt| counts down 3, 2, 1, one step per call; the last step also moves |hot| to the next position of the reorder buffer */
5652switch (doing_interrupt--) {
5653 case 3: @<Set resumption registers $\rm(rB,\$255)$ or $\rm(rBB,\$255)$@>;
5654  @+break;
5655 case 2: @<Set resumption registers $\rm(rW,rX)$ or $\rm(rWW,rXX)$@>;@+break;
5656 case 1: @<Set resumption registers $\rm(rY,rZ)$ or $\rm(rYY,rZZ)$@>;
5657  if (hot==reorder_bot) hot=reorder_top;@+ else hot--; /* circular decrement */
5658  break;
5659}
5660
5661@ @<Set resumption registers $\rm(rB,\$255)$ or $\rm(rBB,\$255)$@>=
5662j=hot->interrupt&H_BIT;
5663g[j?rB:rBB].o=g[255].o;
5664g[255].o=g[rJ].o;
5665if (verbose&issue_bit) {
5666  if (j) {
5667    printf(" setting rB=");@+print_octa(g[rB].o);
5668  }@+else {
5669    printf(" setting rBB=");@+print_octa(g[rBB].o);
5670  }
5671  printf(", $255=");@+print_octa(g[255].o);@+printf("\n");
5672}
5673
5674@ Here's where we manufacture the ``ropcodes'' for resumption.
5675
5676@d RESUME_AGAIN 0 /* repeat the command in rX as if in location $\rm rW-4$ */
5677@d RESUME_CONT 1 /* same, but substitute rY and rZ for operands */
5678@d RESUME_SET 2 /* set register \$X to rZ */
5679@d RESUME_TRANS 3 /* install $\rm(rY,rZ)$ into IT-cache or DT-cache,
5680        then |RESUME_AGAIN| */
5681@d pack_bytes(a,b,c,d) (((((((unsigned)(a)<<8)+(b))<<8)+(c))<<8)+(d)) /* four bytes packed into one tetrabyte, |a| first; fully parenthesized so that the result can be used inside a larger expression */
5682
5683@<Set resumption registers $\rm(rW,rX)$ or $\rm(rWW,rXX)$@>=
5684j=pack_bytes(hot->op,hot->xx,hot->yy,hot->zz); /* the interrupted instruction, reassembled from its four bytes */
5685if (hot->interrupt&H_BIT) { /* trip */
5686  g[rW].o=incr(hot->loc,4); /* the location following the interrupted instruction */
5687  g[rX].o.h=sign_bit, g[rX].o.l=j; /* the sign bit means that no instruction is to be inserted when resuming */
5688  if (verbose&issue_bit) {
5689    printf(" setting rW=");@+print_octa(g[rW].o);
5690    printf(", rX=");@+print_octa(g[rX].o);@+printf("\n");
5691  }
5692}@+else { /* trap */
5693  g[rWW].o=hot->go.o;
5694  g[rXX].o.l=j;
/* from here on |j| is the top byte of rXX: a ropcode, or |0x80| (the sign bit) when nothing is to be inserted */
5695  if (hot->interrupt&F_BIT) { /* forced */
5696    if (hot->i!=trap) j=RESUME_TRANS; /* emulate page translation */
5697    else if (hot->op==TRAP) j=0x80; /* |TRAP| */
5698    else if (flags[internal_op[hot->op]]&X_is_dest_bit)
5699      j=RESUME_SET; /* emulation */
5700    else j=0x80; /* emulation when r[X] is not a destination */
5701  }@+else { /* dynamic */
5702    if (hot->interim)
5703      j=(hot->i==frem || hot->i==syncd || hot->i==syncid? RESUME_CONT:
5704             RESUME_AGAIN);
5705    else if (is_load_store(hot->i)) j=RESUME_AGAIN;
5706    else j=0x80; /* normal external interruption */
5707  }
5708  g[rXX].o.h=(j<<24)+(hot->interrupt&0xff); /* ropcode byte on top, trap bits in the low byte */
5709  if (verbose&issue_bit) {
5710    printf(" setting rWW=");@+print_octa(g[rWW].o);
5711    printf(", rXX=");@+print_octa(g[rXX].o);@+printf("\n");
5712  }
5713}
5714
5715@ @<Set resumption registers $\rm(rY,rZ)$ or $\rm(rYY,rZZ)$@>=
5716j=hot->interrupt&H_BIT;
5717if ((hot->interrupt&F_BIT) && hot->op==SWYM) g[rYY].o=hot->go.o;
5718else g[j?rY:rYY].o=hot->y.o;
5719if (hot->i==st || hot->i==pst) g[j?rZ:rZZ].o=hot->x.o;
5720else g[j?rZ:rZZ].o=hot->z.o;
5721if (verbose&issue_bit) {
5722  if (j) {
5723    printf(" setting rY=");@+print_octa(g[rY].o);
5724    printf(", rZ=");@+print_octa(g[rZ].o);@+printf("\n");
5725  }@+else {
5726    printf(" setting rYY=");@+print_octa(g[rYY].o);
5727    printf(", rZZ=");@+print_octa(g[rZZ].o);@+printf("\n");
5728  }
5729}
5730
5731@ Whew; we've successfully interrupted the computation. The remaining
5732task is to restart it again, as transparently as possible.
5733
5734The \.{RESUME} instruction waits for the pipeline to drain, because
5735it has to do such drastic things. For example, an interrupt may be
5736occurring at this very moment, changing the registers needed for resumption.
5737
5738@<Special cases of instruction dispatch@>=
5739case resume:@+ if (cool!=old_hot) goto stall; /* stall until this command is in the hot seat, i.e., the pipeline has drained */
5740  inst_ptr=specval(&g[cool->zz? rWW:rW]); /* tentative resumption address: rWW when Z is nonzero, else rW */
/* when the resume instruction itself is at a location without the sign bit, nonzero Z sets |K_BIT|, and a resumption address with the sign bit sets |P_BIT| */
5741  if (!(cool->loc.h&sign_bit)) {
5742    if (cool->zz) cool->interrupt |= K_BIT;
5743    else if (inst_ptr.o.h&sign_bit) cool->interrupt |= P_BIT;
5744  }
5745  if (cool->interrupt) { /* a faulty resume becomes a |noop|, and fetching continues with the next instruction */
5746    inst_ptr.o=incr(cool->loc,4);@+cool->i=noop;
5747  }@+ else {
5748    cool->go.o=inst_ptr.o;
/* nonzero Z: rK receives the value of |g[255]|, and |g[255]| receives rBB */
5749    if (cool->zz) {
5750      @<Magically do an I/O operation, if |cool->loc| is rT@>;
5751      cool->ren_a=true, spec_install(&g[rK],&cool->a);
5752      cool->a.known=true, cool->a.o=g[255].o;
5753      cool->ren_x=true, spec_install(&g[255],&cool->x);
5754      cool->x.known=true, cool->x.o=g[rBB].o;
5755    }
5756    cool->b= specval(&g[cool->zz? rXX:rX]);
5757    if (!(cool->b.o.h&sign_bit)) @<Resume an interrupted operation@>; /* sign bit clear: the top byte is a ropcode to be obeyed */
5758  }@+break;
5759
5760@ Here we set |cool->i=resum|, since we want to issue another instruction
5761after the \.{RESUME} itself.
5762
5763The restrictions on inserted instructions are designed to ensure that
5764those instructions will be the very next ones issued. (If, for example,
5765an |incgamma| instruction were necessary, it might cause a page fault
5766and we'd lose the operand values for |RESUME_SET| or |RESUME_CONT|.)
5767
5768A subtle point arises here: If |RESUME_TRANS| is being used to compute
5769the page translation of virtual address zero, we don't want to execute
5770the dummy \.{SWYM} instruction from virtual address $-4$! So we avoid
5771the \.{SWYM} altogether.
5772
5773@<Resume an interrupted operation@>=
5774{
5775  cool->xx=cool->b.o.h>>24, cool->i=resum; /* the ropcode is the top byte of rX or rXX */
5776  head->loc=incr(inst_ptr.o,-4); /* the inserted instruction is given the location just before the resumption address */
5777  switch(cool->xx) {
/* |RESUME_SET|: the inserted instruction becomes a |SETH| that keeps only the X byte, and the second byte of |cool->b.o.h| is passed on as interrupt bits; control then falls into |RESUME_CONT| */
5778 case RESUME_SET: cool->b.o.l=(SETH<<24)+(cool->b.o.l&0xff0000);
5779  head->interrupt|=cool->b.o.h&0xff00;
5780  resuming=2;
/* |resuming| grows by 1, or by 2 when Z is 1; later code tests its parity and whether it is at least 3 */
5781 case RESUME_CONT: resuming+=1+cool->zz;
/* reject opcodes whose leading hexadecimal digit is 4, 5, 8, 9, A, B or F, and X fields that are at least |cool_L| but less than |cool_G|; otherwise fall into |RESUME_AGAIN| */
5782  if (((cool->b.o.l>>24)&0xfa)!=0xb8) { /* not |syncd| or |syncid| */
5783    m=cool->b.o.l>>28;
5784    if ((1<<m)&0x8f30) goto bad_resume;
5785    m=(cool->b.o.l>>16)&0xff;
5786    if (m>=cool_L && m<cool_G) goto bad_resume;
5787  }
/* |RESUME_AGAIN|: the low half of rX or rXX is placed at the head of the fetch buffer as the next instruction */
5788 case RESUME_AGAIN: resume_again: head->inst=cool->b.o.l;
5789  m=head->inst>>24;
5790  if (m==RESUME) goto bad_resume; /* avoid uninterruptible loop */
5791  if (!cool->zz &&
5792    m>RESUME && m<=SYNC && (head->inst&bad_inst_mask[m-RESUME]))
5793      head->interrupt|=B_BIT;
5794  head->noted=false;@+break;
/* |RESUME_TRANS| is honored only when Z is nonzero; with Z zero it falls into the error case */
5795 case RESUME_TRANS:@+if (cool->zz) {
5796    cool->y=specval(&g[rYY]), cool->z=specval(&g[rZZ]);
5797    if ((cool->b.o.l>>24)!=SWYM) goto resume_again;
5798    cool->i=resume;@+break; /* see ``subtle point'' above */
5799  }
5800 default: bad_resume: cool->interrupt |= B_BIT, cool->i=noop;
5801  resuming=0;@+break;
5802  }
5803}
5804
5805@ @<Insert special operands when resuming an interrupted operation@>=
5806{
5807  if (resuming&1) {
5808    cool->y=specval(&g[rY]);
5809    cool->z=specval(&g[rZ]);
5810  }@+else {
5811    cool->y=specval(&g[rYY]);
5812    cool->z=specval(&g[rZZ]);
5813  }
5814  if (resuming>=3) { /* |RESUME_SET| */
5815    cool->need_ra=true, cool->ra=specval(&g[rA]);
5816  }
5817  cool->usage=false;
5818}
5819
5820@ @d do_resume_trans 17 /* |state| for performing |RESUME_TRANS| actions */
5821
5822@<Cases for stage 1 execution@>=
5823case resume: case resum:@+if (data->xx!=RESUME_TRANS) goto fin_ex; /* only |RESUME_TRANS| has work to do here */
5824 data->ptr_a=(void*)((data->b.o.l>>24)==SWYM? ITcache: DTcache); /* a |SWYM| opcode in operand |b| selects the IT-cache, anything else the DT-cache */
5825 data->state=do_resume_trans;
/* clear the bits of |z| selected by |page_mask| but keep its low three bits, then keep only the low 16 bits of the high half */
5826 data->z.o=incr(oandn(data->z.o,page_mask),data->z.o.l&7);
5827 data->z.o.h &= 0xffff;
5828 goto resume_trans;
5829
5830@ @<Special cases for states in the first stage@>=
/* install the translation |data->z.o| for virtual address |data->y.o| into the cache chosen earlier, by way of that cache's filler */
5831case do_resume_trans: resume_trans: {@+register cache*c=(cache*)data->ptr_a;
5832   if (c->lock) wait(1); /* the cache is locked; retry next cycle */
5833   if (c->filler.next) wait(1); /* the filler is apparently busy; retry next cycle */
5834   p=alloc_slot(c,trans_key(data->y.o));
5835   if (p) { /* when |alloc_slot| returns null, nothing is installed */
5836     c->filler_ctl.ptr_b=(void*)p;
5837     c->filler_ctl.y.o=data->y.o;
5838     c->filler_ctl.b.o=data->z.o;
5839     c->filler_ctl.state=1;
5840     schedule(&c->filler,c->access_time,1);
5841   }
5842   goto fin_ex;
5843 }
5844
5845
5846@* Administrative operations.
5847The internal instructions that handle the register stack simply reduce
5848to things we already know how to do. (Well, the internal instructions
5849for saving and unsaving do sometimes lead to special cases, based on
5850|data->op|; for the most part, though, the necessary mechanisms are
5851already present.)
5852
5853@<Cases for stage 1 execution@>=
5854case noop:@+if (data->interrupt&F_BIT) goto emulate_virt; /* otherwise falls into the next case */
5855case incrl: case unsave: goto fin_ex;
5856case jmp: case pushj: data->go.o=data->z.o; goto fin_ex;
5857case sav:@+if (!(data->mem_x)) goto fin_ex; /* otherwise falls through and continues as a store */
5858case incgamma: case save: data->i=st; goto switch1; /* continue as a store */
5859case decgamma: case unsav: data->i=ld; goto switch1; /* continue as a load */
5860
5861@ We can \.{GET} special registers $\ge21$ (that is, rA, rF, rP, rW--rZ,
5862or rWW--rZZ) only in the hot seat, because those registers are
5863implicit outputs of many instructions.
5864
5865The same applies to rK, since it is changed by \.{TRAP} and
5866by emulated instructions.
5867
5868Likewise, rQ must not be prematurely gotten.
5869
5870@<Cases for stage 1...@>=
/* registers numbered 21 or more, and also rK and rQ, are read from |g| only when this command is in the hot seat */
5871case get:@+ if (data->zz>=21 || data->zz==rK || data->zz==rQ) {
5872   if (data!=old_hot) wait(1);
5873   data->z.o=g[data->zz].o;
5874 }
5875 data->x.o=data->z.o;@+goto fin_ex;
5876
5877@ A \.{PUT} is, similarly, delayed in the cases that hold |dispatch_lock|.
5878This program does not restrict the 1~bits that might be
5879\.{PUT} into~rQ, although the contents of that register can have
5880drastic implications.
5881
5882@<Cases for stage 1...@>=
/* special registers 8 and 15 through 20 are handled only in the hot seat */
5883case put:@+if (data->xx==8 || (data->xx>=15 && data->xx<=20)) {
5884   if (data!=old_hot) wait(1);
5885   switch (data->xx) {
5886  case rV: @<Update the \\{page} variables@>;@+break;
/* rQ: bits that this command newly sets are remembered in |new_Q|, and every bit of |new_Q| is included in the value written */
5887  case rQ: new_Q.h |= data->z.o.h &~ g[rQ].o.h;@+
5888           new_Q.l |= data->z.o.l &~ g[rQ].o.l;
5889           data->z.o.l |= new_Q.l;@+
5890           data->z.o.h |= new_Q.h;@+break;
/* rL: a value larger than the current rL is replaced by the current rL */
5891  case rL:@+ if (data->z.o.h!=0) data->z.o.h=0, data->z.o.l=g[rL].o.l;
5892     else if (data->z.o.l>g[rL].o.l) data->z.o.l=g[rL].o.l;
5893  default: break;
5894  case rG: @<Update rG@>;@+break;
5895   }
/* rA: a value of |0x40000| or more raises |B_BIT| and is cut down to its low 18 bits */
5896 }@+else if (data->xx==rA && (data->z.o.h!=0 || data->z.o.l>=0x40000))
5897   data->interrupt |= B_BIT, data->z.o.h=0, data->z.o.l&=0x3ffff;
5898 data->x.o=data->z.o;@+goto fin_ex;
5899
5900@ When rG decreases, we assume that up to |commit_max| marginal registers can
5901be zeroed during each clock cycle. (Remember that we're currently in the hot
5902seat, and holding |dispatch_lock|.)
5903
5904@<Update rG@>=
  /* an unacceptable value (too large, below rL, or below 32) sets |B_BIT|,
     and the current rG is substituted for it */
5905if (data->z.o.h!=0 || data->z.o.l>=256 ||
5906      data->z.o.l<g[rL].o.l || data->z.o.l<32)
5907  data->interrupt |= B_BIT, data->z.o=g[rG].o;
5908else if (data->z.o.l<g[rG].o.l) {
5909    data->interim=true; /* potentially interruptible */
  /* lower rG one step at a time, clearing each register that becomes global,
     but take at most |commit_max| steps in this pass */
5910    for (j=0;j<commit_max;j++) {
5911      g[rG].o.l--;
5912      g[g[rG].o.l].o=zero_octa;
5913      if (data->z.o.l==g[rG].o.l) break;
5914    }
5915    if (j==commit_max) { /* the loop ran out before reaching the target */
5916      if (!trying_to_interrupt) wait(1);
5917    }@+else data->interim=false; /* target reached; nothing remains to do */
5918  }
5919
5920@ Computed jumps put the desired destination address into the |go| field.
5921
5922@<Cases for stage 1...@>=
5923case go: data->x.o=data->go.o;@+ goto add_go;
  /* |x| gets the previous contents of the |go| field, which is then redone */
5924case pop: data->x.o=data->y.o; data->y.o=data->b.o; /* move rJ to |y| field */
  /* |pop| falls through, so its target is also the sum of |y| and |z| */
5925case pushgo: add_go: data->go.o=oplus(data->y.o,data->z.o);
5926  if ((data->go.o.h&sign_bit) && !(data->loc.h&sign_bit))
5927    data->interrupt |= P_BIT;
  /* |P_BIT| marks a target with |sign_bit| on when |data->loc| has it off */
5928  data->go.known=true;@+goto fin_ex;
5929
5930@ The instruction \.{UNSAVE} $z$ generates a sequence of internal instructions
5931that accomplish the actual unsaving. This sequence is controlled by the
5932instruction currently in the fetch buffer, which changes its X and~Y fields
5933until all global registers have been loaded. The first instructions of the
5934sequence are \.{UNSAVE}~$0,0,z$; \.{UNSAVE}~$1,rZ,z-8$;
5935\.{UNSAVE}~$1,rY,z-16$; \dots;
5936\.{UNSAVE}~$1,rB,z-96$; \.{UNSAVE}~$2,255,z-104$; \.{UNSAVE}~$2,254,z-112$;
5937etc. If an interrupt occurs before these instructions have all been committed,
5938the execution register will contain enough information to restart the process.
5939
5940After the global registers have all been loaded, \.{UNSAVE} continues by
5941acting rather like~\.{POP}. An interrupt occurring during this last stage
5942will find $\rm rS<rO$; a context switch might then take us back to
5943restoring the local registers again. But no information will be lost,
5944even though the register from which we began unsaving has long since
5945been replaced.
5946
5947@<Special cases of instruction dispatch@>=
  /* the X field |cool->xx| tells which phase of the unsaving sequence
     the instruction at hand belongs to */
5948case unsave:@+if (cool->interrupt&B_BIT) cool->i=noop;
5949 else {
5950   cool->interim=true;
5951   op=LDOU; /* this instruction needs to be handled by load/store unit */
5952   cool->i=unsav;
5953   switch(cool->xx) {
5954 case 0:@+ if (cool->z.p) goto stall; /* |cool->z.p| must be null first */
5955  @<Set up the first phase of unsaving@>;@+break;
5956 case 1: case 2: @<Generate an instruction to unsave |g[yy]|@>;@+break;
5957 case 3: cool->i=unsave, cool->interim=false, op=UNSAVE;
5958   goto pop_unsave; /* the last phase is completed at |pop_unsave| */
5959 default: cool->interim=false,cool->i=noop,cool->interrupt|=B_BIT;@+break;
  /* any other X value is flagged with |B_BIT| and turned into a |noop| */
5960   }
5961 }
5962break; /* this takes us to |dispatch_done| */
5963
@ @<Generate an instruction to unsave |g[yy]|@>=
cool->ren_x=true; /* the |x| field is being installed on |g[cool->yy]| */
spec_install(&g[cool->yy],&cool->x);
new_S=incr(cool_O,-1); /* both pointers move back by one */
new_O=new_S;
cool->z.o=shift_left(new_O,3); /* |z| is the new pointer shifted left by 3 */
cool->ptr_a=(void*)mem.up;
5969
@ @<Set up the first phase of unsaving@>=
cool->ren_x=true; /* |x| is installed on rG */
spec_install(&g[rG],&cool->x);
cool->ren_a=true; /* |a| is installed on rA */
spec_install(&g[rA],&cool->a);
new_S=shift_right(cool->z.o,3,1); /* both pointers come from |z| shifted right by 3 */
new_O=new_S;
cool->set_l=true; /* |rl| is installed on rL */
spec_install(&g[rL],&cool->rl);
cool->ptr_a=(void*)mem.up;
5976
5977@ @<Get ready for the next step of \.{UNSAVE}@>=
  /* rewrite |head->inst| so that it encodes the step that comes next;
     the bytes packed are the opcode, the phase, the register, and zero */
5978switch (cool->xx) {
5979 case 0: head->inst=pack_bytes(UNSAVE,1,rZ,0);@+ break;
  /* phase 1 counts the register number down, going from rP directly to rR;
     after register 0 it switches to phase 2, beginning with register 255 */
5980 case 1:@+ if (cool->yy==rP) head->inst=pack_bytes(UNSAVE,1,rR,0);
5981  else if (cool->yy==0) head->inst=pack_bytes(UNSAVE,2,255,0);
5982  else head->inst=pack_bytes(UNSAVE,1,cool->yy-1,0);@+ break;
  /* phase 2 counts down until register |cool_G| is done; then comes phase 3 */
5983 case 2:@+ if (cool->yy==cool_G) head->inst=pack_bytes(UNSAVE,3,0,0);
5984  else head->inst=pack_bytes(UNSAVE,2,cool->yy-1,0);@+ break;
5985}
5986
@ @<Handle an internal \.{UNSAVE} when it's time to load@>=
if (data->xx==0) {
  /* in phase 0 the loaded octabyte in |x| holds rG in its top byte
     and rA in the remaining bytes */
  data->a.o=data->x.o;
  data->a.o.h&=0xffffff; /* unsaved rA */
  data->x.o.l=data->x.o.h>>24;
  data->x.o.h=0; /* unsaved rG */
  if (data->a.o.h!=0 || (data->a.o.l&0xfffc0000)!=0) {
    data->interrupt|=B_BIT; /* rA was too big; keep only its low 18 bits */
    data->a.o.l&=0x3ffff;
    data->a.o.h=0;
  }
  if (data->x.o.l<32) {
    data->interrupt|=B_BIT; /* rG was too small; raise it to 32 */
    data->x.o.l=32;
  }
}
goto fin_ex;
5999
6000@ Of course \.{SAVE} is handled essentially like \.{UNSAVE}, but backwards.
6001
6002@<Special cases of instruction dispatch@>=
  /* register X must not be below |cool_G|, otherwise |B_BIT| is set;
     the Z field |cool->zz| tells which phase of the saving sequence we are in */
6003case save:@+if (cool->xx<cool_G) cool->interrupt|=B_BIT;
6004 if (cool->interrupt&B_BIT) cool->i=noop;
6005 else if (((cool_S.l-cool_O.l-cool_L-1)&lring_mask)==0)
6006      @<Insert an instruction to advance gamma@>@;
6007 else {
6008   cool->interim=true;
6009   cool->i=sav;
6010   switch(cool->zz) {
6011 case 0: @<Set up the first phase of saving@>;@+break;
6012 case 1:@+if (cool_O.l!=cool_S.l) @<Insert an instruction to advance gamma@>@;
6013   cool->zz=2;@+ cool->yy=cool_G;
  /* phase 1 has now become phase 2 for register |cool_G|;
     there is no |break|, so the code for phase 2 comes next */
6014 case 2: case 3: @<Generate an instruction to save |g[yy]|@>;@+break;
6015 default: cool->interim=false,cool->i=noop,cool->interrupt|=B_BIT;@+break;
  /* any other Z value is flagged with |B_BIT| and turned into a |noop| */
6016   }
6017 }
6018break;
6019
6020@ If an interrupt occurs during the first phase, say between two |incgamma|
6021instructions, the value |cool->zz=1| will get things restarted properly.
6022(Indeed, if context is saved and unsaved during the interrupt, many
6023|incgamma| instructions may no longer be necessary.)
6024
@<Set up the first phase of saving@>=
cool->zz=1; /* the next attempt will start in phase 1 */
cool->ren_x=true;
spec_install(&l[(cool_O.l+cool_L)&lring_mask],&cool->x);
cool->x.o.l=cool_L; /* the value for that local register is |cool_L| */
cool->x.o.h=0;
cool->x.known=true; /* and it is available immediately */
cool->set_l=true;
spec_install(&g[rL],&cool->rl);
new_O=incr(cool_O,cool_L+1);
6031
@ @<Generate an instruction to save |g[yy]|@>=
op=STOU; /* this instruction needs to be handled by load/store unit */
cool->mem_x=true; /* the |x| field is installed on |mem| */
spec_install(&mem,&cool->x);
cool->z.o=shift_left(cool_O,3); /* |z| is the old pointer shifted left by 3 */
new_S=incr(cool_O,1); /* both pointers advance by one */
new_O=new_S;
if (cool->zz!=3 || cool->yy<=rZ) cool->b=specval(&g[cool->yy]);
else @<Do the final \.{SAVE}@>@;
6039
6040@ The final \.{SAVE} instruction not only stores rG and rA, it also
6041places the final address in global register~X.
6042
@<Do the final \.{SAVE}@>=
{
  cool->interim=false; /* nothing follows this step */
  cool->i=save;
  cool->ren_a=true; /* the |a| field is installed on global register X */
  spec_install(&g[cool->xx],&cool->a);
}
6049
6050@ @<Get ready for the next step of \.{SAVE}@>=
  /* rewrite |head->inst| so that it encodes the step that comes next;
     the bytes packed are the opcode, X, the register, and the phase */
6051switch (cool->zz) {
6052 case 1: head->inst=pack_bytes(SAVE,cool->xx,0,1);@+ break;
  /* phase 2 counts the register number up; after 255 comes phase 3,
     which begins again at register 0 */
6053 case 2:@+ if (cool->yy==255) head->inst=pack_bytes(SAVE,cool->xx,0,3);
6054  else head->inst=pack_bytes(SAVE,cool->xx,cool->yy+1,2);@+break;
  /* phase 3 also counts up, except that rR is followed directly by rP */
6055 case 3:@+ if (cool->yy==rR) head->inst=pack_bytes(SAVE,cool->xx,rP,3);
6056  else head->inst=pack_bytes(SAVE,cool->xx,cool->yy+1,3);@+break;
6057}
6058
@ @<Handle an internal \.{SAVE} when it's time to store@>=
{
  if (!data->interim) { /* the final step stores rG and rA in one octabyte */
    if (data!=old_hot) wait(1); /* we need the hottest value of rA */
    data->x.o.l=g[rA].o.l;
    data->x.o.h=g[rG].o.l<<24; /* rG occupies the top byte */
    data->a.o=data->y.o;
  }@+else data->x.o=data->b.o; /* an ordinary step stores the |b| operand */
  goto fin_ex;
}
6070
6071@* More register-to-register ops.
6072Now that we've finished most of the hard stuff,
6073we can relax and fill in the holes that we left in the
6074all-register parts of the execution stages.
6075
6076First let's complete the fixed point arithmetic operations,
6077by dispensing with multiplication and division.
6078
6079@<Cases to compute the results of reg...@>=
6080case mulu: data->x.o=omult(data->y.o,data->z.o);
6081  data->a.o=aux; /* the |a| output is taken from the global |aux| */
6082  goto quantify_mul;
6083case mul: data->x.o=signed_omult(data->y.o,data->z.o);
6084  if (overflow) data->interrupt |= V_BIT;
  /* the operation code is now refined according to the size of |z|:
     |j| advances once for each byte needed to hold the value of |z| */
6085quantify_mul: aux=data->z.o;
6086  for (j=mul0;aux.l||aux.h;j++) aux=shift_right(aux,8,1);
6087  data->i=j;@+break; /* |j| is |mul0| or |mul1| or \dots~or |mul8| */
6088case divu: data->x.o=odiv(data->b.o,data->y.o,data->z.o);
6089  data->a.o=aux;@+data->i=div;@+break; /* continue as if this were |div| */
6090case div:@+ if (data->z.o.l==0 && data->z.o.h==0) {
6091    data->interrupt |= D_BIT;@+ data->a.o=data->y.o;
6092    data->i=set; /* divide by zero needn't wait in the pipeline */
6093  }@+else {
6094    data->x.o=signed_odiv(data->y.o,data->z.o);
6095    if (overflow) data->interrupt |= V_BIT;
6096    data->a.o=aux; /* again the |a| output comes from |aux| */
6097  }@+break;
6098
6099@ Next let's polish off the bitwise and bytewise operations.
6100
6101@<Cases to compute the results of reg...@>=
  /* |sadd| counts the bit positions that are 1 in |y| but 0 in |z| */
6102case sadd: data->x.o.l=count_bits(data->y.o.h&~data->z.o.h)
6103                      +count_bits(data->y.o.l&~data->z.o.l);@+ break;
6104case mor: data->x.o=bool_mult(data->y.o,data->z.o,data->op&0x2);@+ break;
6105case bdif: data->x.o.h=byte_diff(data->y.o.h,data->z.o.h);
6106           data->x.o.l=byte_diff(data->y.o.l,data->z.o.l);@+ break;
6107case wdif: data->x.o.h=wyde_diff(data->y.o.h,data->z.o.h);
6108           data->x.o.l=wyde_diff(data->y.o.l,data->z.o.l);@+ break;
  /* in the next two cases a half of |x| is assigned only when the difference
     is positive; otherwise it keeps its prior contents (presumably zero) */
6109case tdif:@+ if (data->y.o.h>data->z.o.h)
6110             data->x.o.h=data->y.o.h-data->z.o.h;
6111 tdif_l:@+ if (data->y.o.l>data->z.o.l)
6112             data->x.o.l=data->y.o.l-data->z.o.l;@+ break;
  /* for |odif|, equal high halves leave the decision to the low halves */
6113case odif:@+ if (data->y.o.h>data->z.o.h)
6114    data->x.o=ominus(data->y.o,data->z.o);
6115  else if (data->y.o.h==data->z.o.h) goto tdif_l;
6116  break;
6117
6118
6119@ The conditional set (\.{CS}) instructions are, rather surprisingly,
6120more difficult to implement than the zero~set (\.{ZS}) instructions,
6121although the \.{ZS} instructions do more. The reason is that dynamic
6122instruction dependencies are more complicated with \.{CS}. Consider, for
6123example, the instructions
6124$$\advance\abovedisplayskip-.5\baselineskip
6125  \advance\belowdisplayskip-.5\baselineskip
6126\hbox{\tt LDO x,a,b; \ FDIV y,c,d; \ CSZ y,x,0; \ INCL y,1.}$$
6127If the value of \.x is zero, the \.{INCL} instruction need not wait for the
6128division to be completed. (We do not, however, abort the division in such a
6129case; it might invoke a trip handler, or change the inexact bit, etc. Our
6130policy is to treat common cases efficiently and to treat all cases correctly,
6131but not to treat all cases with maximum efficiency.)
6132
6133@<Cases to compute the results...@>=
6134case zset:@+if (register_truth(data->y.o,data->op)) data->x.o=data->z.o;
6135  /* otherwise |data->x.o| is already zero */
6136  goto fin_ex;
  /* |cset| has three outcomes: condition true, so the result is |z| and the
     pointer |b.p| is cleared; condition false with |b.p| already null, so
     the result is |b.o|; otherwise start over at |switch1| with |need_b| set */
6137case cset:@+if (register_truth(data->y.o,data->op))
6138    data->x.o=data->z.o, data->b.p=NULL;
6139  else if (data->b.p==NULL) data->x.o=data->b.o;
6140  else {
6141    data->state=0;@+data->need_b=true;@+goto switch1;
6142  }@+break;
6143
6144@ Floating point computations are mostly handled by the routines in
6145{\mc MMIX-ARITH}, which record anomalous events in the global
6146variable |exceptions|. But we consider the operation trivial if an
6147input is infinite or NaN; and we may need to increase the execution
6148time when subnormals are present.
6149
6150@d ROUND_OFF 1
6151@d ROUND_UP 2
6152@d ROUND_DOWN 3
6153@d ROUND_NEAR 4 /* this one and the three before it are codes for |cur_round| */
6154@d is_subnormal(x) ((x.h&0x7ff00000)==0 && ((x.h&0xfffff) || x.l))
  /* none of the bits |0x7ff00000| of |x.h| is on, yet some lower bit of |x| is */
6155@d is_trivial(x) ((x.h&0x7ff00000)==0x7ff00000)
  /* all of the bits |0x7ff00000| are on: the operand is infinite or NaN */
6156@d set_round cur_round=(data->ra.o.l<0x10000? ROUND_NEAR: data->ra.o.l>>16)
  /* the code comes from the bits of |data->ra.o.l| above the lowest 16,
     with |ROUND_NEAR| used when all of those bits are zero */
6157
6158@<Cases to compute the results of reg...@>=
  /* the labels |fin_bflot|, |fin_uflot|, |fin_flot| form a chain:
     entering at |fin_bflot| checks |y|, the result |x|, and |z| for subnormals;
     entering at |fin_uflot| skips the check of |y|;
     entering at |fin_flot| checks only |z| */
6159case fadd: set_round;@+data->x.o=fplus(data->y.o,data->z.o);
6160 fin_bflot:@+ if (is_subnormal(data->y.o)) data->denin=denin_penalty;
6161 fin_uflot:@+ if (is_subnormal(data->x.o)) data->denout=denout_penalty;
6162 fin_flot:@+ if (is_subnormal(data->z.o)) data->denin=denin_penalty;
6163   data->interrupt|=exceptions;
6164   if (is_trivial(data->y.o) || is_trivial(data->z.o)) goto fin_ex;
6165   if (data->i==fsqrt && (data->z.o.h&sign_bit)) goto fin_ex;
  /* those two tests let trivial operations finish at once via |fin_ex| */
6166   break;
  /* subtraction adds a copy of |z| whose sign bit is flipped, unless
     |fcomp| of |z| with zero yields 2 (presumably meaning unordered) */
6167case fsub: data->a.o=data->z.o;
6168  if (fcomp(data->z.o,zero_octa)!=2) data->a.o.h ^= sign_bit;
6169  set_round;@+data->x.o=fplus(data->y.o,data->a.o);
6170  data->i=fadd; /* use pipeline times for addition */
6171  goto fin_bflot;
6172case fmul: set_round;@+ data->x.o=fmult(data->y.o,data->z.o);@+ goto fin_bflot;
6173case fdiv: set_round;@+ data->x.o=fdivide(data->y.o,data->z.o);@+
6174  goto fin_bflot;
  /* in the remaining cases |data->y.o.l| is passed as a second argument
     instead of being a floating point operand */
6175case fsqrt: set_round;@+ data->x.o=froot(data->z.o,data->y.o.l);@+
6176  goto fin_uflot;
6177case fint: set_round;@+ data->x.o=fintegerize(data->z.o,data->y.o.l);@+
6178  goto fin_uflot;
6179case fix: set_round;@+ data->x.o=fixit(data->z.o,data->y.o.l);
6180  if (data->op&0x2) exceptions&=~W_BIT; /* unsigned case doesn't overflow */
6181  goto fin_flot;
6182case flot: set_round;@+
6183  data->x.o=floatit(data->z.o,data->y.o.l,data->op&0x2, data->op&0x4);
6184  data->interrupt|=exceptions;@+break;
6185
6186@ @<Special cases of instruction dispatch@>=
  /* these four operations are illegal when |cool->y.o.l| exceeds 4 */
6187case fsqrt: case fint: case fix: case flot:@+ if (cool->y.o.l>4)
6188    goto illegal_inst;
6189  break;
6190
6191@ @<Cases to compute the results of reg...@>=
  /* |j| is the outcome of the comparison; the value 2 is the one that
     raises |I_BIT| below, and it also makes |feps| finish as an |fcmp| */
6192case feps: j=fepscomp(data->y.o,data->z.o,data->b.o,data->op!=FEQLE);
6193  if (j==2) data->i=fcmp;
6194  else if (is_subnormal(data->y.o) || is_subnormal(data->z.o))
6195    data->denin=denin_penalty;
6196  switch (data->op) {
6197 case FUNE:@+ if (j==2) goto cmp_pos;@+ else goto cmp_zero;
6198 case FEQLE: goto cmp_fin;
6199 case FCMPE:@+ if (j) goto cmp_zero_or_invalid;
6200  }
  /* when |FCMPE| finds |j| zero, or when the opcode is none of those three,
     control falls through to the ordinary comparison */
6201case fcmp: j=fcomp(data->y.o,data->z.o);
6202  if (j<0) goto cmp_neg;
6203 cmp_fin:@+ if (j==1) goto cmp_pos;
6204 cmp_zero_or_invalid:@+ if (j==2) data->interrupt |= I_BIT;
6205  goto cmp_zero;
  /* |funeq| is positive when |fcomp| yields 2 for |FUN|, or 0 otherwise */
6206case funeq:@+ if (fcomp(data->y.o,data->z.o)==(data->op==FUN? 2:0))
6207    goto cmp_pos;
6208  else goto cmp_zero;
6209
6210@ @<External v...@>=
6211Extern int frem_max; /* third argument given to |fremstep| in stage two */
6212Extern int denin_penalty, denout_penalty;
  /* values stored in |denin| and |denout|, or added to a delay,
     when an input or an output is subnormal */
6213
6214@ The floating point remainder operation is especially interesting
6215because it can be interrupted when it's in the hot seat.
6216
6217@<Cases to compute the results of reg...@>=
  /* with an infinite or NaN operand the remainder is computed on the spot */
6218case frem:@+if(is_trivial(data->y.o) || is_trivial(data->z.o))
6219    {
6220      data->x.o=fremstep(data->y.o,data->z.o,2500);
6221      data->interrupt |= exceptions;@+ goto fin_ex;
6222    }
6223  if ((self+1)->next) wait(1); /* the coroutine after |self| is still scheduled */
6224  data->interim=true;
6225  j=1;
6226  if (is_subnormal(data->y.o)||is_subnormal(data->z.o)) j+=denin_penalty;
6227  pass_after(j); /* the delay is 1, plus the penalty for subnormal input */
6228  goto passit;
6229
6230
6231@ @<Begin execution of a stage-two operation@>=
6232j=1;
6233if (data->i==frem) {
6234  data->x.o=fremstep(data->y.o,data->z.o,frem_max);
6235  if (exceptions&E_BIT) {
  /* |E_BIT| is on: the partial result replaces |y| for another round,
     and an interruption is accepted if we are in the hot seat */
6236    data->y.o=data->x.o;
6237    if (trying_to_interrupt && data==old_hot) goto fin_ex;
6238  }@+else {
  /* |E_BIT| is off: record the exceptions and move on to state 3 */
6239    data->state=3;
6240    data->interim=false;
6241    data->interrupt |= exceptions;
6242    if (is_subnormal(data->x.o)) j+=denout_penalty;
6243  }
6244  wait(j);
6245}
6246
6247@* System operations. Finally we need to implement some operations for the
6248operating system; then the hardware simulation will be done!
6249
6250A \.{LDVTS} instruction is delayed until it reaches the hot seat, because
6251it changes the IT and DT caches. The operating system should use \.{SYNC}
6252after \.{LDVTS} if the effects are needed immediately; the system is also
6253responsible for ensuring that the page table permission bits agree with
6254the \.{LDVTS} permission bits when the latter are nonzero. (Also, if
6255write permission is taken away from a page, the operating system must
6256have previously used \.{SYNCD} to write out any dirty bytes that might
6257have been cached from that page; \.{SYNCD} will be inoperative after write
6258permission goes away.)
6259
6260@<Handle special cases for operations like |prego| and |ldvts|@>=
6261if (data->i==ldvts) @<Do stage 1 of \.{LDVTS}@>; /* |ldvts| has its own code */
6262
6263@ @<Do stage 1 of \.{LDVTS}@>=
6264{
6265  if (data!=old_hot) wait(1); /* not yet in the hot seat */
6266  if (DTcache->lock || (j=get_reader(DTcache))<0) wait(1);
6267  startup(&DTcache->reader[j],DTcache->access_time);
6268  data->z.o.h=0, data->z.o.l=data->y.o.l&0x7; /* |z| keeps the low 3 bits of |y| */
6269  p=cache_search(DTcache,data->y.o); /* N.B.: Not |trans_key(data->y.o)| */
6270  if (p) {
6271    data->x.o.l=2; /* record that the DT-cache had this key */
6272    if (data->z.o.l) {
  /* nonzero bits: they replace the low 3 bits of the first data word */
6273      p=use_and_fix(DTcache,p);
6274      p->data[0].l=(p->data[0].l&-8)+data->z.o.l;
6275    }@+else {
6276      p=demote_and_fix(DTcache,p);
6277      p->tag.h|=sign_bit; /* invalidate the tag */
6278    }
6279  }
6280  pass_after(DTcache->access_time);@+goto passit;
6281}
6282
6283@ @<Special cases for states in later stages@>=
  /* the same treatment is now given to the IT-cache, using the bits
     already placed in |data->z.o.l| */
6284case ld_st_launch:@+ if (ITcache->lock || (j=get_reader(ITcache))<0) wait(1);
6285  startup(&ITcache->reader[j],ITcache->access_time);
6286  p=cache_search(ITcache,data->y.o); /* N.B.: Not |trans_key(data->y.o)| */
6287  if (p) {
6288    data->x.o.l|=1; /* record that the IT-cache had this key */
6289    if (data->z.o.l) {
6290      p=use_and_fix(ITcache,p);
6291      p->data[0].l=(p->data[0].l&-8)+data->z.o.l;
6292    }@+else {
6293      p=demote_and_fix(ITcache,p);
6294      p->tag.h|=sign_bit; /* invalidate the tag */
6295    }
6296  }
6297  data->state=3;@+wait(ITcache->access_time);
6298
6299@ The \.{SYNC} operation interacts with the pipeline in interesting ways.
6300\.{SYNC}~\.0 and \.{SYNC}~\.4 are the simplest; they just lock the
6301dispatch and wait until they get to the hot seat, after which the
6302pipeline has drained. \.{SYNC}~\.1 and \.{SYNC}~\.3 put a ``barrier''
6303into the write buffer so that subsequent store instructions will not merge with
6304previous stores. \.{SYNC}~\.2 and \.{SYNC}~\.3 lock the dispatch until
6305all previous load instructions have left the pipeline. \.{SYNC}~\.5,
6306\.{SYNC}~\.6, and \.{SYNC}~\.7 remove things from caches once they
6307get to the hot seat.
6308
6309@<Special cases of instruction dispatch@>=
  /* Z values above 3 are privileged: they require |sign_bit| in |cool->loc.h| */
6310case sync:@+ if (cool->zz>3) {
6311  if (!(cool->loc.h&sign_bit)) goto privileged_inst;
6312  if (cool->zz==4) freeze_dispatch=true;
6313}@+else {
6314  if (cool->zz!=1) freeze_dispatch=true; /* that is, for Z equal to 0, 2, or 3 */
6315  if (cool->zz&1) cool->mem_x=true, spec_install(&mem,&cool->x);
  /* Z equal to 1 or 3 installs the |x| field on |mem| */
6316}@+break;
6317
6318@ @<Cases for stage 1 execution@>=
  /* The Z field selects the action. Cases 0 and 4 wait for the hot seat,
     and case 4 also sets |halted|. Cases 2 and 3 wait for earlier loads,
     release |dispatch_lock|, and then fall through to case 1, which clears
     |data->x.addr|. Cases 5, 6, and 7 wait for the hot seat and then run
     the section named in the case; NOTE(review): whether control can fall
     from one of those sections into the next case depends on how |wait|
     and |goto| end them, so please confirm before reordering. */
6319case sync:@+ switch (data->zz) {
6320 case 0: case 4:@+ if (data!=old_hot) wait(1);
6321  halted=(data->zz!=0);@+goto fin_ex;
6322 case 2: case 3: @<Wait if there's an unfinished load ahead of us@>;
6323  release_lock(self,dispatch_lock);
6324 case 1: data->x.addr=zero_octa;@+goto fin_ex;
6325 case 5:@+ if (data!=old_hot) wait(1);
6326  @<Clean the data caches@>;
6327 case 6:@+ if (data!=old_hot) wait(1);
6328  @<Zap the translation caches@>;
6329 case 7:@+ if (data!=old_hot) wait(1);
6330  @<Zap the instruction and data caches@>;
6331}
6332
@ @<Wait if there's an unfinished load ahead of us@>=
{
  register control *older; /* runs from |data| through the buffer to |hot| */
  older=data;
  while (older!=hot) {
    older=(older==reorder_top? reorder_bot: older+1); /* step circularly */
    if (older->owner && (older->i==ld || older->i==ldunc || older->i==pst))
      wait(1); /* an owned |ld|, |ldunc|, or |pst| is still present */
  }
}
6341
6342@ Perhaps the delay should be longer here.
6343
6344@<Zap the translation caches@>=
  /* first the DT-cache; state 10 will then deal with the IT-cache */
6345if (DTcache->lock || (j=get_reader(DTcache))<0) wait(1);
6346startup(&DTcache->reader[j],DTcache->access_time);
6347set_lock(self,DTcache->lock);
6348zap_cache(DTcache);
6349data->state=10;@+wait(DTcache->access_time);
6350
6351@ @<Zap the instruction and data caches@>=
  /* first the I-cache, if there is one; state 11 then continues the job */
6352if (!Icache) {
6353  data->state=11;@+goto switch1;
6354}
6355if (Icache->lock || (j=get_reader(Icache))<0) wait(1);
6356startup(&Icache->reader[j],Icache->access_time);
6357set_lock(self,Icache->lock);
6358zap_cache(Icache);
6359data->state=11;@+wait(Icache->access_time);
6360
6361@ @<Special cases for states in the first stage@>=
  /* each of these states begins by giving up the lock recorded in
     |self->lockloc|, if any, and ends by scheduling the state that follows */
  /* state 10: zap the IT-cache, then go to state 3 */
6362case 10:@+ if (self->lockloc) *(self->lockloc)=NULL,self->lockloc=NULL;
6363 if (ITcache->lock || (j=get_reader(ITcache))<0) wait(1);
6364 startup(&ITcache->reader[j],ITcache->access_time);
6365 set_lock(self,ITcache->lock);
6366 zap_cache(ITcache);
6367 data->state=3;@+wait(ITcache->access_time);
  /* state 11: empty the write buffer and zap the D-cache, if present */
6368case 11:@+ if (self->lockloc) *(self->lockloc)=NULL,self->lockloc=NULL;
6369 if (wbuf_lock) wait(1);
6370 write_head=write_tail, write_ctl.state=0; /* zap the write buffer */
6371 if (!Dcache) {
6372   data->state=12;@+ goto switch1;
6373 }
6374 if (Dcache->lock || (j=get_reader(Dcache))<0) wait(1);
6375 startup(&Dcache->reader[j],Dcache->access_time);
6376 set_lock(self,Dcache->lock);
6377 zap_cache(Dcache);
6378 data->state=12;@+wait(Dcache->access_time);
  /* state 12: zap the S-cache, if present, then go to state 3 */
6379case 12:@+ if (self->lockloc) *(self->lockloc)=NULL,self->lockloc=NULL;
6380 if (!Scache) goto fin_ex;
6381 if (Scache->lock) wait(1);
6382 set_lock(self,Scache->lock);
6383 zap_cache(Scache);
6384 data->state=3;@+wait(Scache->access_time);
6385
6386@ @<Clean the data caches@>=
6387if (self->lockloc) *(self->lockloc)=NULL,self->lockloc=NULL;
6388@<Wait till write buffer is empty@>;
6389if (clean_co.next || clean_lock) wait(1); /* the cleaner is busy or locked */
6390set_lock(self,clean_lock);
  /* set up the control block of the cleaner and start it one cycle from now */
6391clean_ctl.i=sync;@+
6392clean_ctl.state=0;@+
6393clean_ctl.x.o.h=0;
6394startup(&clean_co,1);
6395data->state=13; /* state 13 watches for the cleaner to finish */
6396data->interim=true;
6397wait(1);
6398
6399@ @<Wait till write buffer is empty@>=
  /* the buffer is treated as empty when |write_head| equals |write_tail| */
6400if (write_head!=write_tail) {
6401  if (!speed_lock) set_lock(self,speed_lock); /* take |speed_lock| if it is free */
6402  wait(1);
6403}
6404
6405@ The cleanup process might take a huge amount of time, so we must allow
6406it to be interrupted. (Servicing the interruption might, of course,
6407put more stuff into the cache.)
6408
6409@<Special cases for states in the first stage@>=
  /* state 13: the cleaner has finished when |clean_co.next| is null */
6410case 13:@+ if (!clean_co.next) {
6411   data->interim=false;@+ goto fin_ex; /* it's done! */
6412 }
6413 if (trying_to_interrupt) goto fin_ex; /* accept an interruption */
6414 wait(1); /* otherwise look again on the next cycle */
6415
6416@ Now we consider \.{SYNCD} and \.{SYNCID}. When control comes to this
6417part of the program, |data->y.o| is a virtual address and |data->z.o|
6418is the corresponding physical address; |data->xx+1| is the number of
6419bytes we are supposed to be syncing; |data->b.o.l| is the number of
6420bytes we can handle at once (either |Icache->bb| or |Dcache->bb| or 8192).
6421
6422We need a more elaborate scheme to implement \.{SYNCD} and \.{SYNCID}
6423than we have used for the ``hint'' instructions \.{PRELD}, \.{PREGO},
6424and \.{PREST}, because \.{SYNCD} and \.{SYNCID} are not merely hints.
6425They cannot be converted into a sequence of cache-block-size commands at
6426dispatch time, because we cannot be sure that the starting virtual address
6427will be aligned with the beginning of a cache block. We need to realize
6428that the bytes specified by \.{SYNCD} or \.{SYNCID} might cross a
6429virtual page boundary---possibly with different protection bits
6430on each page. We need to allow for interrupts. And we also need to
6431keep the fetch buffer empty until a user's \.{SYNCID} has completely
6432brought the memory up to date.
6433
6434@<Special cases for states in later stages@>=
  /* States 30 to 35 handle one cache block at a time. State 30 treats the
     I-cache and chooses state 31 when |data->loc.h| has |sign_bit|, else 33.
     States 31 and 32 clean the D-cache and S-cache blocks directly.
     States 33 and 34 use the cleaner coroutine instead. State 35 either
     continues with the next cache block or finishes. */
6435do_syncid: data->state=30;
6436case 30:@+ if (data!=old_hot) wait(1);
6437 if (!Icache) {
6438   data->state=(data->loc.h&sign_bit? 31:33);@+goto switch2;
6439 }
6440 @<Clean the I-cache block for |data->z.o|, if any@>;
6441 data->state=(data->loc.h&sign_bit? 31: 33);@+wait(Icache->access_time);
6442case 31:@+ if (self->lockloc) *(self->lockloc)=NULL,self->lockloc=NULL;
6443 @<Wait till write buffer is empty@>;
  /* |interim| is set when the bytes requested extend past this cache block */
6444 if (((data->b.o.l-1)&~data->y.o.l)<data->xx) data->interim=true;
6445 if (!Dcache) goto next_sync;
6446 @<Clean the D-cache block for |data->z.o|, if any@>;
6447 data->state=32;@+wait(Dcache->access_time);
6448case 32:@+ if (self->lockloc) *(self->lockloc)=NULL,self->lockloc=NULL;
6449 if (!Scache) goto next_sync;
6450 @<Clean the S-cache block for |data->z.o|, if any@>;
6451 data->state=35;@+wait(Scache->access_time);
6452do_syncd: data->state=33;
6453case 33:@+ if (data!=old_hot) wait(1);
6454 if (self->lockloc) *(self->lockloc)=NULL,self->lockloc=NULL;
6455 @<Wait till write buffer is empty@>;
6456 if (((data->b.o.l-1)&~data->y.o.l)<data->xx) data->interim=true;
6457 if (!Dcache)
6458   if (data->i==syncd) goto fin_ex;@+ else goto next_sync;
6459 @<Use |cleanup| on the cache blocks for |data->z.o|, if any@>;
6460 data->state=34; /* and fall through to state 34 */
6461case 34:@+if (!clean_co.next) goto next_sync; /* the cleaner has finished */
6462 if (trying_to_interrupt && data->interim && data==old_hot) {
6463   data->z.o=zero_octa; /* anticipate |RESUME_CONT| */
6464   goto fin_ex; /* accept an interruption */
6465 }
6466 wait(1);
6467next_sync: data->state=35;
6468case 35:@+ if (self->lockloc) *(self->lockloc)=NULL,self->lockloc=NULL;
6469 if (data->interim) @<Continue this command on the next cache block@>;
6470 data->go.known=true; /* nothing remains to be synchronized */
6471 goto fin_ex;
6472
6473@ @<Clean the I-cache block for |data->z.o|, if any@>=
  /* get a reader and the lock; if the block is present, demote and clean it */
6474if (Icache->lock || (j=get_reader(Icache))<0) wait(1);
6475startup(&Icache->reader[j],Icache->access_time);
6476set_lock(self,Icache->lock);
6477p=cache_search(Icache,data->z.o);
6478if (p) {
6479  demote_and_fix(Icache,p);
6480  clean_block(Icache,p);
6481}
6482
6483@ @<Clean the D-cache block for |data->z.o|, if any@>=
  /* get a reader and the lock; if the block is present, demote and clean it */
6484if (Dcache->lock || (j=get_reader(Dcache))<0) wait(1);
6485startup(&Dcache->reader[j],Dcache->access_time);
6486set_lock(self,Dcache->lock);
6487p=cache_search(Dcache,data->z.o);
6488if (p) {
6489  demote_and_fix(Dcache,p);
6490  clean_block(Dcache,p);
6491}
6492
6493@ @<Clean the S-cache block for |data->z.o|, if any@>=
  /* no reader is requested here; only the lock is needed */
6494if (Scache->lock) wait(1);
6495set_lock(self,Scache->lock);
6496p=cache_search(Scache,data->z.o);
6497if (p) {
6498  demote_and_fix(Scache,p);
6499  clean_block(Scache,p);
6500}
6501
6502@ @<Use |cleanup| on the cache blocks for |data->z.o|, if any@>=
6503if (clean_co.next || clean_lock) wait(1); /* the cleaner is busy or locked */
6504set_lock(self,clean_lock);
  /* tell the cleaner what to do: |syncd| on the block at |data->z.o|;
     its |x.o.h| conveys whether |data->loc.h| has |sign_bit| */
6505clean_ctl.i=syncd;
6506clean_ctl.state=4;
6507clean_ctl.x.o.h=data->loc.h&sign_bit;
6508clean_ctl.z.o=data->z.o;
6509schedule(&clean_co,1,4);
6510
6511@ We use the fact that cache block sizes are divisors of 8192.
6512
6513@<Continue this command on the next cache block@>=
6514{
6515  data->interim=false;
  /* subtract the number of bytes just handled, namely those from |y|
     to the end of its cache block */
6516  data->xx -= ((data->b.o.l-1)&~data->y.o.l)+1;
  /* advance the virtual address |y| to the start of the next block */
6517  data->y.o=incr(data->y.o,data->b.o.l);
6518  data->y.o.l &= -data->b.o.l;
  /* the physical address |z| keeps its page and takes the offset from |y| */
6519  data->z.o.l = (data->z.o.l&-8192)+(data->y.o.l&8191);
6520  if ((data->y.o.l&8191)==0) goto square_one;
6521      /* maybe crossed a page boundary */
6522  if (data->i==syncd) goto do_syncd;@+else goto do_syncid;
6523}
6524
6525@ If the first page lacks proper protection, we still must try the
6526second, in the rare case that a page boundary is spanned.
6527
6528@<Special cases for states in later stages@>=
  /* if the last byte requested lies in a different 8192-byte page,
     drop the bytes of the current page and start over on the next page */
6529sync_check:@+ if ((data->y.o.l ^ (data->y.o.l+data->xx))>=8192) {
6530   data->xx -= (8191&~data->y.o.l)+1;
6531   data->y.o=incr(data->y.o,8192);
6532   data->y.o.l &= -8192;
6533   goto square_one;
6534 }
6535 goto fin_ex;
6536
6537@* Input and output. We're done implementing the hardware, but there's
6538still a small matter of software remaining, because we sometimes
6539want to pretend that a real operating
6540system is present without actually having one loaded. This simulator
6541therefore implements a special feature: If \.{RESUME}~\.1 is issued in
6542location~rT, the ten special I/O traps of {\mc MMIX-SIM} are performed
6543instantaneously behind the scenes.
6544
6545Of course all claims of accurate simulation go out the door when this
6546feature is used.
6547
6548@d max_sys_call Ftell /* the last code in |sys_call| */
6549
6550@<Type...@>=
  /* codes of the special operations; the same numbers index |arg_count| */
6551typedef enum{
6552@!Halt,@!Fopen,@!Fclose,@!Fread,@!Fgets,@!Fgetws,
6553@!Fwrite,@!Fputs,@!Fputws,@!Fseek,@!Ftell} @!sys_call;
6554
6555@ @<Magically do an I/O operation, if |cool->loc| is rT@>=
6556if (cool->loc.l==g[rT].o.l && cool->loc.h==g[rT].o.h) {
6557  register unsigned char yy,zz; octa ma,mb;
  /* nothing magical is done unless the upper 16 bits of |g[rXX].o.l| are
     zero; its next byte is |yy|, the operation code, and its low byte is |zz| */
6558  if (g[rXX].o.l&0xffff0000) goto magic_done;
6559  yy=g[rXX].o.l>>8, zz=g[rXX].o.l&0xff;
6560  if (yy>max_sys_call) goto magic_done;
6561   @<Prepare memory arguments $|ma|={\rm M}[a]$ and $|mb|={\rm M}[b]$
6562           if needed@>;
  /* every operation except |Halt| leaves its result in |g[rBB]| */
6563  switch (yy) {
6564case Halt: @<Either halt or print warning@>;@+break;
6565case Fopen: g[rBB].o=mmix_fopen(zz,mb,ma);@+break;
6566case Fclose: g[rBB].o=mmix_fclose(zz);@+break;
6567case Fread: g[rBB].o=mmix_fread(zz,mb,ma);@+break;
6568case Fgets: g[rBB].o=mmix_fgets(zz,mb,ma);@+break;
6569case Fgetws: g[rBB].o=mmix_fgetws(zz,mb,ma);@+break;
6570case Fwrite: g[rBB].o=mmix_fwrite(zz,mb,ma);@+break;
6571case Fputs: g[rBB].o=mmix_fputs(zz,g[rBB].o);@+break;
6572case Fputws: g[rBB].o=mmix_fputws(zz,g[rBB].o);@+break;
6573case Fseek: g[rBB].o=mmix_fseek(zz,g[rBB].o);@+break;
6574case Ftell: g[rBB].o=mmix_ftell(zz);@+break;
6575}
6576magic_done: g[255].o=neg_one; /* this will enable interrupts */
6577}
6578
@ @<Either halt or print warning@>=
switch (zz) {
 case 0: halted=true;@+break; /* a zero argument stops the simulation */
 case 1: {
   octa trap_loc; /* four less than the contents of rWW */
   trap_loc=incr(g[rWW].o,-4);
   if (trap_loc.h==0 && trap_loc.l<0xf0) /* only small locations qualify */
     print_trip_warning(trap_loc.l>>4,incr(g[rW].o,-4));
 }@+break;
 default: break; /* other arguments are ignored */
}
6587
6588@ @<Glob...@>=
6589char arg_count[]={1,3,1,3,3,3,3,2,2,2,1};
  /* one entry per |sys_call| code, in the order of that enumeration;
     an entry of 3 means that |ma| and |mb| must be fetched from memory */
6590
6591@ The input/output operations invoked by \.{TRAP}s are
6592done by subroutines in an auxiliary program module called {\mc MMIX-IO}.
6593Here we need only declare those subroutines, and write three primitive
6594interfaces on which they depend.
6595
6596@ @<Glob...@>=
  /* subroutines of the separate module MMIX-IO; each of the first ten takes
     the byte |zz| first, followed by whatever octabytes it needs */
6597extern octa mmix_fopen @,@,@[ARGS((unsigned char,octa,octa))@];
6598extern octa mmix_fclose @,@,@[ARGS((unsigned char))@];
6599extern octa mmix_fread @,@,@[ARGS((unsigned char,octa,octa))@];
6600extern octa mmix_fgets @,@,@[ARGS((unsigned char,octa,octa))@];
6601extern octa mmix_fgetws @,@,@[ARGS((unsigned char,octa,octa))@];
6602extern octa mmix_fwrite @,@,@[ARGS((unsigned char,octa,octa))@];
6603extern octa mmix_fputs @,@,@[ARGS((unsigned char,octa))@];
6604extern octa mmix_fputws @,@,@[ARGS((unsigned char,octa))@];
6605extern octa mmix_fseek @,@,@[ARGS((unsigned char,octa))@];
6606extern octa mmix_ftell @,@,@[ARGS((unsigned char))@];
6607extern void print_trip_warning @,@,@[ARGS((int,octa))@];
6608
6609@ @<Internal proto...@>=
  /* the interface routines defined in this part of the program */
6610int mmgetchars @,@,@[ARGS((char*,int,octa,int))@];
6611void mmputchars @,@,@[ARGS((unsigned char*,int,octa))@];
6612char stdin_chr @,@,@[ARGS((void))@];
6613octa magic_read @,@,@[ARGS((octa))@]; /* fetch an octabyte, given its address */
6614void magic_write @,@,@[ARGS((octa,octa))@]; /* store a value at an address */
6615
6616@ We need to cut through all the complications of buffers and
6617caches in order to do magical I/O. The |magic_read| routine finds
6618the current octabyte in a given physical address by looking at the
6619write buffer, D-cache, S-cache, and memory until finding it.
6620
@<Sub...@>=
octa magic_read(addr)
  octa addr; /* physical address of the octabyte wanted */
{
  register write_node *w; /* runs through the write buffer */
  register cacheblock *blk; /* the cache block for |addr|, if there is one */
  w=write_tail;
  while (w!=write_head) {
    w=(w==wbuf_top? wbuf_bot: w+1); /* step circularly */
    if (w->addr.h==addr.h && ((w->addr.l^addr.l)&-8)==0) return w->o;
  }
  if (!Dcache) return mem_read(addr); /* without a D-cache, go to memory */
  blk=cache_search(Dcache,addr);
  if (blk) return blk->data[(addr.l&(Dcache->bb-1))>>3];
  if (Dcache->outbuf.tag.h==addr.h &&
        ((Dcache->outbuf.tag.l^addr.l)&-Dcache->bb)==0)
    return Dcache->outbuf.data[(addr.l&(Dcache->bb-1))>>3];
  if (Scache) { /* the S-cache and its output buffer come next */
    blk=cache_search(Scache,addr);
    if (blk) return blk->data[(addr.l&(Scache->bb-1))>>3];
    if (Scache->outbuf.tag.h==addr.h &&
          ((Scache->outbuf.tag.l^addr.l)&-Scache->bb)==0)
      return Scache->outbuf.data[(addr.l&(Scache->bb-1))>>3];
  }
  return mem_read(addr); /* nothing closer had it */
}
6648
6649@ The |magic_write| routine changes the octabyte in a given physical
6650address by changing it wherever it appears in a buffer or cache.
6651Any ``dirty'' or ``least recently used'' status remains unchanged.
6652(Yes, this {\it is\/} magic.)
6653
6654@<Sub...@>=
6655void magic_write(addr,val)
6656  octa addr,val;
6657{
6658  register write_node *q;
6659  register cacheblock *p;
6660  for (q=write_tail;;) {
6661    if (q==write_head) break;
6662    if (q==wbuf_top) q=wbuf_bot;@+ else q++;
6663    if ((q->addr.l&-8)==(addr.l&-8) && q->addr.h==addr.h) q->o=val;
6664  }
6665  if (Dcache) {
6666    p=cache_search(Dcache,addr);
6667    if (p) p->data[(addr.l&(Dcache->bb-1))>>3]=val;
6668    if (((Dcache->inbuf.tag.l^addr.l)&-Dcache->bb)==0 &&
6669          Dcache->inbuf.tag.h==addr.h)
6670      Dcache->inbuf.data[(addr.l&(Dcache->bb-1))>>3]=val;
6671    if (((Dcache->outbuf.tag.l^addr.l)&-Dcache->bb)==0 &&
6672          Dcache->outbuf.tag.h==addr.h)
6673      Dcache->outbuf.data[(addr.l&(Dcache->bb-1))>>3]=val;
6674    if (Scache) {
6675      p=cache_search(Scache,addr);
6676      if (p) p->data[(addr.l&(Scache->bb-1))>>3]=val;
6677      if (((Scache->inbuf.tag.l^addr.l)&-Scache->bb)==0 &&
6678            Scache->inbuf.tag.h==addr.h)
6679        Scache->inbuf.data[(addr.l&(Scache->bb-1))>>3]=val;
6680      if (((Scache->outbuf.tag.l^addr.l)&-Scache->bb)==0 &&
6681            Scache->outbuf.tag.h==addr.h)
6682        Scache->outbuf.data[(addr.l&(Scache->bb-1))>>3]=val;
6683    }
6684  }
6685  mem_write(addr,val);
6686}
6687
6688@ The conventions of our imaginary operating system require us to
6689apply the trivial memory mapping in which segment~$i$ appears in
6690a $2^{32}$-byte page of physical addresses starting at $2^{32}i$.
6691
6692@<Prepare memory arguments...@>=
6693if (arg_count[yy]==3) {
6694  octa arg_loc;
6695  arg_loc=g[rBB].o;
6696  if (arg_loc.h&0x9fffffff) mb=zero_octa;
6697  else arg_loc.h>>=29, mb=magic_read(arg_loc);
6698  arg_loc=incr(g[rBB].o,8);
6699  if (arg_loc.h&0x9fffffff) ma=zero_octa;
6700  else arg_loc.h>>=29, ma=magic_read(arg_loc);
6701}
6702
6703@ The subroutine |mmgetchars(buf,size,addr,stop)| reads characters
6704starting at address |addr| in the simulated memory and stores them
6705in |buf|, continuing until |size| characters have been read or
6706some other stopping criterion has been met. If |stop<0| there is
6707no other criterion; if |stop=0| a null character will also terminate
6708the process; otherwise |addr| is even, and two consecutive null bytes
6709starting at an even address will terminate the process. The number
6710of bytes read and stored, exclusive of terminating nulls, is returned.
6711
6712@<Sub...@>=
6713int mmgetchars(buf,size,addr,stop)
6714  char *buf;
6715  int size;
6716  octa addr;
6717  int stop;
6718{
6719  register char *p;
6720  register int m;
6721  octa a,x;
6722  if (((addr.h&0x9fffffff)||(incr(addr,size-1).h&0x9fffffff))&&size) {
6723    fprintf(stderr,"Attempt to get characters from off the page!\n");
6724@.Attempt to get characters...@>
6725    return 0;
6726  }
6727  for (p=buf,m=0,a=addr,a.h>>=29; m<size;) {
6728    x=magic_read(a);
6729    if ((a.l&0x7) || m>size-8) @<Read and store one byte; |return| if done@>@;
6730    else @<Read and store up to eight bytes; |return| if done@>@;
6731  }
6732  return size;
6733}
6734
6735@ @<Read and store one byte...@>=
6736{
6737  if (a.l&0x4) *p=(x.l>>(8*((~a.l)&0x3)))&0xff;
6738  else *p=(x.h>>(8*((~a.l)&0x3)))&0xff;
6739  if (!*p && stop>=0) {
6740    if (stop==0) return m;
6741    if ((a.l&0x1) && *(p-1)=='\0') return m-1;
6742  }
6743  p++,m++,a=incr(a,1);
6744}
6745
6746@ @<Read and store up to eight bytes...@>=
6747{
6748  *p=x.h>>24;
6749  if (!*p && (stop==0 || (stop>0 && x.h<0x10000))) return m;
6750  *(p+1)=(x.h>>16)&0xff;
6751  if (!*(p+1) && stop==0) return m+1;
6752  *(p+2)=(x.h>>8)&0xff;
6753  if (!*(p+2) && (stop==0 || (stop>0 && (x.h&0xffff)==0))) return m+2;
6754  *(p+3)=x.h&0xff;
6755  if (!*(p+3) && stop==0) return m+3;
6756  *(p+4)=x.l>>24;
6757  if (!*(p+4) && (stop==0 || (stop>0 && x.l<0x10000))) return m+4;
6758  *(p+5)=(x.l>>16)&0xff;
6759  if (!*(p+5) && stop==0) return m+5;
6760  *(p+6)=(x.l>>8)&0xff;
6761  if (!*(p+6) && (stop==0 || (stop>0 && (x.l&0xffff)==0))) return m+6;
6762  *(p+7)=x.l&0xff;
6763  if (!*(p+7) && stop==0) return m+7;
6764  p+=8,m+=8,a=incr(a,8);
6765}
6766
6767@ The subroutine |mmputchars(buf,size,addr)| puts |size| characters
6768into the simulated memory starting at address |addr|.
6769
6770@<Sub...@>=
6771void mmputchars(buf,size,addr)
6772  unsigned char *buf;
6773  int size;
6774  octa addr;
6775{
6776  register unsigned char *p;
6777  register int m;
6778  octa a,x;
6779  if (((addr.h&0x9fffffff)||(incr(addr,size-1).h&0x9fffffff))&&size) {
6780    fprintf(stderr,"Attempt to put characters off the page!\n");
6781@.Attempt to put characters...@>
6782    return;
6783  }
6784  for (p=buf,m=0,a=addr,a.h>>=29; m<size;) {
6785    if ((a.l&0x7) || m>size-8) @<Load and write one byte@>@;
6786    else @<Load and write eight bytes@>;
6787  }
6788}
6789
6790@ @<Load and write one byte@>=
6791{
6792  register int s=8*((~a.l)&0x3);
6793  x=magic_read(a);
6794  if (a.l&0x4) x.l^=(((x.l>>s)^*p)&0xff)<<s;
6795  else x.h^=(((x.h>>s)^*p)&0xff)<<s;
6796  magic_write(a,x);
6797  p++,m++,a=incr(a,1);
6798}
6799
6800@ @<Load and write eight bytes@>=
6801{
6802  x.h=(*p<<24)+(*(p+1)<<16)+(*(p+2)<<8)+*(p+3);
6803  x.l=(*(p+4)<<24)+(*(p+5)<<16)+(*(p+6)<<8)+*(p+7);
6804  magic_write(a,x);
6805  p+=8,m+=8,a=incr(a,8);
6806}
6807
6808@ When standard input is being read by the simulated program at the same time
6809as it is being used for interaction, we try to keep the two uses separate
6810by maintaining a private buffer for the simulated program's \.{StdIn}.
6811Online input is usually transmitted from the keyboard to a \CEE/ program
6812a line at a time; therefore an
6813|fgets| operation works much better than |fread| when we prompt
6814for new input. But there is a slight complication, because |fgets|
6815might read a null character before coming to a newline character.
6816We cannot deduce the number of characters read by |fgets| simply
6817by looking at |strlen(stdin_buf)|.
6818
6819@<Sub...@>=
6820char stdin_chr()
6821{
6822  register char* p;
6823  while (stdin_buf_start==stdin_buf_end) {
6824    printf("StdIn> ");@+fflush(stdout);
6825@.StdIn>@>
6826    fgets(stdin_buf,256,stdin);
6827    stdin_buf_start=stdin_buf;
6828    for (p=stdin_buf;p<stdin_buf+254;p++) if(*p=='\n') break;
6829    stdin_buf_end=p+1;
6830  }
6831  return *stdin_buf_start++;
6832}
6833
6834@ @<Glob...@>=
char stdin_buf[256]; /* private line buffer for the simulated program's \.{StdIn} */
char *stdin_buf_start, *stdin_buf_end;
  /* the unread part of that buffer; it is empty when these are equal */
6838
6839@* Index.
6840