1 /* asm.c: primitive redcode assembler
2  * $Id: asm.c,v 1.5 2003/08/30 16:18:30 varfar Exp $
3  */
4 
5 /* This file is part of `exhaust', a memory array redcode simulator.
6  * Author: M Joonas Pihlaja
7  * Public Domain.
8  */
9 
10 /* The format of lines with instructions should be:
11  *
12  * [START]	OPCODE.MODIFIER   A-MODE INT , B-MODE INT
13  *
 * `ORG START' is ignored, as is the label after an optional END
 * (if given); `ORG <int>' sets the start offset of the warrior.
 * The only label recognised is START.  No fuss over the amount of
 * white space, as long as it exists where required.
17  *
18  * Comments are recognised and discarded as is any line starting
19  * with "Program".  The output from `pmars -r 0 Your_Real_Source.red'
20  * should assemble fine with this tiny assembler.
21  *
22  *
23  * Functions in this file:
24  *
25  *     asm_line(), asm_file(), asm_fname(), dis1(),
26  *     discore()
27  */
28 #include <stdio.h>
29 #ifdef SYSV
30 #include <strings.h>
31 #else
32 #include <string.h>
33 #endif
34 #include <stdlib.h>
35 #include <ctype.h>
36 
37 #include "exhaust.h"
38 #include "insn.h"
39 #include "asm.h"
40 
/* str_toks_t: container for tokens we identify.
 *
 * Each entry pairs the upper-case spelling of a multicharacter token
 * with its token code.  The name is only ever read (it points at a
 * string literal in the str_toks[] table), hence const.
 */
typedef struct str_toks_st {
  const char *s;		/* name of token */
  int c;			/* token code */
} str_toks_t;
47 
48 /* Data
49  *
50  * tok_buf[]: globally used to keep the contents of string tokens
51  * tok_int:   if the token was a TOK_INT, the value of the token is here
52  *
53  * str_toks[]: table of multicharacter tokens we identify
54  *
55  */
56 
/* Size of tok_buf[] and of the line buffers used in this file. */
#define MAX_ALL_CHARS 256

/* Value of the most recently read integer token. */
static int tok_int;

/* Text of the most recently read string or single-character token. */
static char tok_buf[MAX_ALL_CHARS];
60 
61 static str_toks_t str_toks[] = {
62     { "DAT", TOK_DAT },		/* opcodes */
63     { "SPL", TOK_SPL },
64     { "MOV", TOK_MOV },
65     { "DJN", TOK_DJN },
66     { "ADD", TOK_ADD },
67     { "JMZ", TOK_JMZ },
68     { "SUB", TOK_SUB },
69     { "MOD", TOK_MOD },
70     { "CMP", TOK_SEQ },
71     { "SEQ", TOK_SEQ },
72     { "JMP", TOK_JMP },
73     { "JMN", TOK_JMN },
74     { "SNE", TOK_SNE },
75     { "MUL", TOK_MUL },
76     { "DIV", TOK_DIV },
77     { "SLT", TOK_SLT },
78     { "NOP", TOK_NOP },
79     { "LDP", TOK_LDP },
80     { "STP", TOK_STP },
81 
82     { "ORG", TOK_ORG },		/* pseudo-ops */
83     { "END", TOK_END },
84     { "PIN", TOK_PIN },
85     { "START", TOK_START },
86 
87     { "F", TOK_mF },		/* modifiers */
88     { "A", TOK_mA },
89     { "B", TOK_mB },
90     { "AB", TOK_mAB },
91     { "BA", TOK_mBA },
92     { "X", TOK_mX },
93     { "I", TOK_mI },
94     { NULL, 0 }			/* sentinel */
95 };
96 
97 /* NAME
98  *     get_tok -- read the next token from a string
99  *
100  * SYNOPSIS
101  *     const char *get_tok( const char *s, int *tok );
102  *
103  * INPUTS
104  *     s -- string to read token from
105  *     tok -- where we store the token code of the read token
106  *
107  * RESULTS
108  *     The token code of the read token is stored into *tok,
109  *     with 0 signifying end of input.
110  *
111  *     If the token was an integer, its value is stored into
 *     the global `tok_int'.  Integers are parsed by strtol() with
 *     base 0, so decimal, octal (leading 0) and hexadecimal
 *     (leading 0x) notations are accepted.
114  *
 *     String tokens are converted to upper case when storing
 *     them into the global `tok_buf[]'.  At most 255 characters
 *     are stored; any further characters are left in the input
 *     and start the next token.
118  *
119  * RETURN VALUE
120  *      Pointer to the character past the read token, or
121  *	to the nul character if at end of input.
122  *
123  * GLOBALS
124  *     tok_buf[]    -- a string or char token is copied here
125  *     tok_int      -- the value of an integer token
126  *     str_toks[]   -- used to identify string tokens
127  */
128 
129 /* skip_white(): returns ptr. to next non-whitespace char in s */
static
const char *
skip_white(const char *s)
{
  /* <ctype.h> functions take an int holding an unsigned char value (or
   * EOF); passing a negative plain char is undefined, hence the cast. */
  while ( isspace((unsigned char)*s) ) s++;
  return s;
}
137 
138 static
139 const char *
get_tok(const char * s,int * tok)140 get_tok( const char *s, int *tok )
141 {
142   char *tok_str = tok_buf;
143   int i;
144 
145   s = skip_white(s);
146   if ( *s == 0 )    return (*tok = 0, s);
147 
148   /*
149    * Tokenize strings.
150    *
151    * String tokens must start with a letter and consist of
152    * letters, digits, and underscores.  Strings are
153    * converted to upper case.
154    */
155   tok_buf[1] = tok_buf[0] = 0;
156 
157   i = 0;
158   if ( isalpha(*s) )
159     while ( (isalnum(*s) || *s == '_') && ++i < MAX_ALL_CHARS )
160       *tok_str++ = toupper(*s++);
161   *tok_str = 0;
162 
163   if ( tok_str > tok_buf ) {
164     /*
165      * was a string token -- identify it by searching through
166      * the str_toks[] array.
167      */
168     for ( i = 0; str_toks[i].s ; i++ ) {
169       if ( 0 == strcmp( str_toks[i].s, tok_buf ) ) {
170 	*tok = str_toks[i].c;
171 	return s;
172       }
173     }
174     *tok = TOK_STR;		/* normal string, not special */
175     return s;
176   }
177 
178 
179   /*
180    * Tokenize ints.
181    * Must match /-?[0-9]/
182    */
183   if ( isdigit(*s) ||  ( *s == '-' && isdigit(*(s+1)) )) {
184     char *endptr;
185     tok_int = strtol( s, &endptr, 0 );
186     *tok = TOK_INT;
187     return endptr;
188   }
189 
190 
191   /*
192    * Tokenize addressing modes and pass single chars
193    */
194 
195   tok_buf[0] = *s;		/* store char value as single */
196   tok_buf[1] = 0;		/* char string. */
197 
198   switch ( *tok = *s++ ) {
199   case '$': *tok = TOK_DIRECT;          break;
200   case '#': *tok = TOK_IMMEDIATE;	break;
201   case '*': *tok = TOK_AINDIRECT;	break;
202   case '@': *tok = TOK_BINDIRECT;	break;
203   case '{': *tok = TOK_APREDEC;		break;
204   case '<': *tok = TOK_BPREDEC;		break;
205   case '}': *tok = TOK_APOSTINC;	break;
206   case '>': *tok = TOK_BPOSTINC;	break;
207   }
208 
209   return s;
210 }
211 
212 
213 
214 /* NAME
215  *     panic_bad_token -- issue an error message for a bad token and exit(1)
216  *
217  * SYNOPSIS
218  *     void panic_bad_token( int tok, const char *expected );
219  *
220  * INPUTS
221  *     tok -- token code of unexpected token
222  *     expected -- a string describing what kind of token
223  *		   was expected.  e.g. "a modifier".
224  *
225  * RESULTS
 *     A message informing the user of the unexpected token,
 *     its possible semantic value, and what type of token
 *     was expected instead is printed to stderr.
229  *
230  * GLOBALS
231  *     tok_buf, tok_int -- if the token has semantic value we look
232  *                         for it here.
233  * BUGS
234  *     The error message should be much better -- not even location
235  *     in the source is given here. *sigh*
236  */
237 static
238 void
panic_bad_token(int tok,const char * expected)239 panic_bad_token( int tok, const char *expected )
240 {
241   char *errstr = NULL;
242   char buf[30];
243 
244   memset(buf, 0, 30);
245 
246   /* make an errstr
247    */
248   if ( tok_buf[0] )
249     errstr = tok_buf;
250   if ( tok == TOK_INT ) {
251     sprintf(buf, "%d", tok_int );
252     errstr = buf;
253   }
254 
255   /* complain and exit with error code
256    */
257   fprintf(stderr, "token '%s' not %s\n", errstr, expected );
258   exit(1);
259 }
260 
261 
262 /* NAME
263  *     asm_line -- assemble a line to an instruction
264  *
265  * SYNOPSIS
266  *     int asm_line( const char *line, insn_t *in, unsigned int CORESIZE );
267  *
268  * INPUTS
269  *     line -- line to assemble
270  *     in   -- instruction to assemble into
271  *     CORESIZE -- size of core
272  *
273  * RESULTS
274  *     If there was anything to assemble, it is assembled into
275  *     `in'.  If there was a START label, the corresponding flag
276  *     is set in the instructions flags.  Incomplete or erroneous
277  *     input prompt a quick error message and exit(1).
278  *
279  *     If the 'ORG start-address' construct is encountered where
280  *     `start-address' is an integer, then the `in->a' field contains
281  *     the offset in instructions from the start of the warrior
282  *     where the warrior should start execution.
283  *
284  *     If 'PIN id' is encountered, where `id' is an integer, then the
285  *     `in->a' field contains the `id'.
286  *
287  * RETURN VALUE
288  *     ASMLINE_PIN  : pseudo-op 'PIN' encountered, id saved in `in->a'.
289  *     ASMLINE_ORG  : pseudo-op 'ORG' encountered, warrior start
290  *                    saved in `in->a'.
291  *     ASMLINE_DONE : done assembling, END opcode found, nothing assembled.
292  *     ASMLINE_NONE : nothing to assemble on this line.
293  *     ASMLINE_OK   : assembled instruction into `in' OK.
294  *
295  * GLOBALS
296  *     tok_int, tok_buf[], str_toks[] somewhere down the line.
297  */
298 
299 int
asm_line(const char * line,insn_t * in,unsigned int CORESIZE)300 asm_line( const char *line, insn_t *in, unsigned int CORESIZE  )
301 {
302   const char *s = line;
303   int tok;
304   int flags = 0;
305   int op, m, ma, mb;		/* opcode, modifier, a-mode, b-mode */
306 
307   s = get_tok( s, &tok );
308   if ( tok == 0 ) return ASMLINE_NONE;
309 
310   /*
311    * Ignore string lines '^Program.*' and comments.
312    */
313   if ( tok == TOK_STR && 0 == strcmp( "PROGRAM", tok_buf ))
314   {
315     return ASMLINE_NONE;
316   }
317   if ( tok == ';' ) return ASMLINE_NONE;
318 
319   /*
320    * Now match the instruction's various components:
321    *   [START label,] opcode, modifier, a-mode, a-value, b-mode, b-value
322    */
323 
324   /* Match possible start label
325    */
326   if ( tok == TOK_START ) {
327     flags |= fl_START;
328     s = get_tok( s, &tok );
329   }
330 
331   /* Match opcode
332    */
333   if ( is_tok_pseudoop(tok) ) {
334     switch ( tok ) {
335     case TOK_END:
336       return ASMLINE_DONE;	/* signal done assembling */
337 
338     case TOK_ORG:
339       s = get_tok( s, &tok );	/* get the next token */
340 
341       if ( tok == TOK_START )	/* ignore: */
342 	return ASMLINE_NONE;	/* start label already matched and processed */
343 
344       if ( tok != TOK_INT ) {
345 	panic_bad_token( tok, "an integer -- an int or \"START\" "
346 			 "follows ORG" );
347       }
348       in->a = tok_int;
349       return ASMLINE_ORG;
350 
351     case TOK_PIN:
352       s = get_tok( s, &tok );
353       if ( tok != TOK_INT ) {
354 	panic_bad_token( tok, "an integer -- PIN must be an unsigned integer");
355       }
356       in->a = tok_int;
357       return ASMLINE_PIN;
358 
359     default:
360       panic_bad_token( tok, "a pseudo-op (internal assembler error)" );
361     }
362   }
363   if (!( is_tok_opcode(tok)))
364     panic_bad_token( tok, "an opcode" );
365 
366   op = DAT;
367   switch(tok) {
368   case TOK_DAT: op = DAT; break;
369   case TOK_SPL: op = SPL; break;
370   case TOK_MOV: op = MOV; break;
371   case TOK_JMP: op = JMP; break;
372   case TOK_JMZ: op = JMZ; break;
373   case TOK_JMN: op = JMN; break;
374   case TOK_ADD: op = ADD; break;
375   case TOK_SUB: op = SUB; break;
376   case TOK_SEQ: op = SEQ; break;
377   case TOK_SNE: op = SNE; break;
378   case TOK_MUL: op = MUL; break;
379   case TOK_DIV: op = DIV; break;
380   case TOK_DJN: op = DJN; break;
381   case TOK_SLT: op = SLT; break;
382   case TOK_MOD: op = MODM; break;
383   case TOK_NOP: op = NOP; break;
384   case TOK_LDP: op = LDP; break;
385   case TOK_STP: op = STP; break;
386   default:
387     panic_bad_token( tok, "an opcode" );
388   }
389 
390   /* Match modifier
391    */
392   s = get_tok( s, &tok );	/* first the '.' */
393   if ( tok != '.' )
394     panic_bad_token( tok, "'.'" );
395 
396   s = get_tok( s, &tok );	/* then the modifier itself */
397   if ( ! is_tok_modifier(tok) )
398     panic_bad_token( tok, "a modifier");
399   m = tok - TOK_mF;
400 
401   /* Match a-field addressing mode and a-field
402    */
403   s = get_tok( s, &tok );
404   if ( ! is_tok_mode(tok) )
405     panic_bad_token( tok, "an addressing mode specifier");
406   ma = tok - TOK_DIRECT;
407 
408   s = get_tok( s, &tok );
409   if ( tok != TOK_INT )
410     panic_bad_token( tok, "an integer");
411   in->a = MODS(tok_int,CORESIZE);
412 
413   /* Match comma
414    */
415   s = get_tok( s, &tok );
416   if ( tok != ',' )
417     panic_bad_token( tok, "','" );
418 
419   /* Match b-field addressing mode and a-field
420    */
421   s = get_tok( s, &tok );
422   if ( ! is_tok_mode(tok) )
423     panic_bad_token( tok, "an addressing mode specifier");
424   mb = tok - TOK_DIRECT;
425 
426   s = get_tok( s, &tok );
427   if ( tok != TOK_INT )
428     panic_bad_token( tok, "an integer");
429   in->b = MODS(tok_int,CORESIZE);
430 
431 
432   /*
433    * Set flags and ignore the rest of the line
434    */
435   in->in = (flags << flPOS) | OP( op, m, ma, mb );
436   return ASMLINE_OK;
437 }
438 
439 
440 
441 
442 /* NAME
443  *     asm_file, asm_fname -- assemble a FILE into a warrior
444  *
445  * SYNOPSIS
446  *     void asm_file( FILE *F, warrior_t *w, unsigned int CORESIZE );
447  *     void asm_fname( const char *filename, warrior_t *w,
448  *    	               unsigned int CORESIZE );
449  *
450  * INPUTS
451  *     w        -- warrior_t to assemble into.
452  *     F        -- stream to read warrior source from
453  *     filename -- path to source file.  May be '-'
454  *		   which is interpreted as stdin.
455  *     CORESIZE -- just that
456  *
457  * DESCRIPTION
458  *     These functions assemble a source file into a
459  *     warrior_t setting all the non-info fields.
460  *
461  * RESULTS
462  *    If the warrior assembled correctly, then warrior_t
463  *    contains its code and starting offset.  If an error
 *    occurred during assembly, an error message is issued
465  *    and the program exit()s.
466  *
467  * GLOBALS
468  *     none as such, subroutines use tok_buf[], tok_int, str_toks[],
469  *     MAXLENGTH constant
470  *
471  * SEE ALSO
472  *     asm_line()
473  *
474  * BUGS
 *     It's not really acceptable to exit() on an assembly error.
476  */
477 void
asm_file(FILE * F,warrior_t * w,unsigned int CORESIZE)478 asm_file( FILE *F, warrior_t *w, unsigned int CORESIZE )
479 {
480   char line[MAX_ALL_CHARS];
481   insn_t *c;
482   int ret;			/* return code from asm_line() */
483 
484   w->len = w->start = 0;
485   w->have_pin = 0;
486   w->pin = 0;
487   c = w->code;
488 
489   while ( fgets(line, MAX_ALL_CHARS, F) ) {
490     ret = asm_line( line, c, CORESIZE );
491     if ( ret == ASMLINE_DONE ) break;
492 
493     switch ( ret ) {
494     case ASMLINE_OK:
495       if ( get_flags( c->in ) & fl_START ) {
496 	w->start = w->len;
497 	clr_flags( c->in, fl_START );
498       }
499       if ( w->len < MAXLENGTH) c++;
500       w->len++;
501       break;
502 
503     case ASMLINE_ORG:
504       w->start =  c->a;		/* was `ORG int', get the starting address */
505       break;
506 
507     case ASMLINE_NONE:
508       break;			/* nop */
509 
510     case ASMLINE_PIN:
511       w->have_pin = 1;
512       w->pin = c->a;		/* save PIN. */
513       break;
514 
515     default:
516       fprintf(stderr,"asm.c/asm_file(): illegal return code from asm_line()\n");
517       exit(1);
518     }
519     if ( w->len > MAXLENGTH ) {
520       fprintf(stderr, "too many instructions in warrior %d\n", w->no);
521       exit(1);
522     }
523   }
524   if ( w->start >= w->len ) {
525     fprintf(stderr, "starting address must be inside warrior body\n" );
526     exit(1);
527   }
528 }
529 
530 
531 void
asm_fname(const char * fname,warrior_t * w,unsigned int CORESIZE)532 asm_fname(const char *fname, warrior_t *w, unsigned int CORESIZE )
533 {
534   FILE *F;
535   int is_stdin = 0;
536 
537   if ( strcmp( fname, "-" ) == 0 ) {
538     F = stdin;
539     is_stdin = 1;
540   }
541   else
542     if (!( F = fopen(fname, "r") )) {
543       fprintf(stderr, "can't open file %s\n", fname);
544       exit(1);
545     }
546 
547   asm_file(F, w, CORESIZE);
548 
549   if ( !is_stdin ) fclose(F);
550 }
551 
552 
553 
554 /* NAME
 *     dis1 -- disassemble an instruction
 *     discore -- disassemble a segment of core
557  *
558  * SYNOPSIS
 *     void dis1( char *buf, insn_t in, unsigned int CORESIZE );
 *     void discore( const insn_t *core, int start, int end,
 * 		     unsigned int CORESIZE );
562  *
563  * INPUTS
 *     buf -- string to print disassembled instruction to. A string
565  *          of length 60 should be more than sufficient.
566  *     in -- instruction to disassemble
567  *     core -- pointer to start of core
568  *     start -- core segment start offset
569  *     end -- core segment end offset (excluded)
570  *
571  * RESULTS
572  *     dis1 -- The disassembled instruction is printed to `s'.
 *     discore -- A segment of core is disassembled and printed
574  *                to stdout with core addresses.
575  */
576 
577 void
dis1(char * buf,insn_t in,unsigned int CORESIZE)578 dis1(char *buf, insn_t in, unsigned int CORESIZE)
579 {
580   int x;
581   char *op_s, *mo_s, *ma_s, *mb_s;
582   int af, bf;
583 
584   x = (in.in >> opPOS) & opMASK;
585   switch( x ) {
586   case DAT: op_s = "dat"; break;
587   case SPL: op_s = "spl"; break;
588   case MOV: op_s = "mov"; break;
589   case JMP: op_s = "jmp"; break;
590   case JMZ: op_s = "jmz"; break;
591   case JMN: op_s = "jmn"; break;
592   case ADD: op_s = "add"; break;
593   case SUB: op_s = "sub"; break;
594   case SEQ: op_s = "seq"; break;
595   case SNE: op_s = "sne"; break;
596   case MUL: op_s = "mul"; break;
597   case DIV: op_s = "div"; break;
598   case DJN: op_s = "djn"; break;
599   case SLT: op_s = "slt"; break;
600   case MODM: op_s = "mod"; break;
601   case NOP: op_s = "nop"; break;
602   case LDP: op_s = "ldp"; break;
603   case STP: op_s = "stp"; break;
604   default:
605     op_s = "???";
606   }
607 
608   x = (in.in >> moPOS) & moMASK;
609   switch ( x ) {
610   case mF:  mo_s = "f "; break;
611   case mA:  mo_s = "a "; break;
612   case mB:  mo_s = "b "; break;
613   case mAB: mo_s = "ab"; break;
614   case mBA: mo_s = "ba"; break;
615   case mX:  mo_s = "x "; break;
616   case mI:  mo_s = "i "; break;
617   default:
618     mo_s = "?";
619   }
620 
621 
622   x = (in.in >> maPOS) & mMASK;
623   switch (x) {
624   case DIRECT: ma_s = "$"; break;
625   case IMMEDIATE: ma_s = "#"; break;
626   case AINDIRECT: ma_s = "*"; break;
627   case BINDIRECT: ma_s = "@"; break;
628   case APREDEC: ma_s = "{"; break;
629   case APOSTINC: ma_s = "}"; break;
630   case BPREDEC: ma_s = "<"; break;
631   case BPOSTINC: ma_s = ">"; break;
632   default: ma_s = "?";
633   }
634 
635   x = (in.in >> mbPOS) & mMASK;
636   switch (x) {
637   case DIRECT:    mb_s = "$"; break;
638   case IMMEDIATE: mb_s = "#"; break;
639   case AINDIRECT: mb_s = "*"; break;
640   case BINDIRECT: mb_s = "@"; break;
641   case APREDEC:   mb_s = "{"; break;
642   case APOSTINC:  mb_s = "}"; break;
643   case BPREDEC:   mb_s = "<"; break;
644   case BPOSTINC:  mb_s = ">"; break;
645   default: mb_s = "?";
646   }
647 
648   af = in.a <= CORESIZE/2 ? in.a : in.a - CORESIZE;
649   bf = in.b <= CORESIZE/2 ? in.b : in.b - CORESIZE;
650 
651   sprintf(buf,"%s.%s %s%5d , %s%5d", op_s, mo_s, ma_s, af, mb_s, bf);
652 }
653 
654 
655 void
discore(const insn_t * core,int start,int end,unsigned int CORESIZE)656 discore( const insn_t *core,
657 	 int start,
658 	 int end,
659 	 unsigned int CORESIZE )
660 {
661   int adr;
662   char line[MAX_ALL_CHARS];
663   for ( adr = start; adr < end; adr++ ) {
664     int i = MODS(adr, CORESIZE);
665     dis1( line, core[i], CORESIZE );
666     printf("%4d    %s\n", adr, line);
667   }
668 }
669