1 /* asm.c: primitive redcode assembler
2  * $Id: asm.c,v 1.1.1.1 2003/08/26 16:57:02 varfar Exp $
3  */
4 
5 /* This file is part of `exhaust', a memory array redcode simulator.
6  * Author: M Joonas Pihlaja
7  * Public Domain.
8  */
9 
10 /* The format of lines with instructions should be:
11  *
12  * [START]	OPCODE.MODIFIER   A-MODE INT , B-MODE INT
13  *
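 * `ORG START' is ignored (the START label itself marks the entry
 * point), while `ORG <int>' sets the start offset and `PIN <int>'
 * sets the warrior's PIN.  Anything after an optional END is ignored.
 * The only label recognised is START.  No fuss over the amount of
 * white space, as long as it exists where required.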
17  *
18  * Comments are recognised and discarded as is any line starting
19  * with "Program".  The output from `pmars -r 0 Your_Real_Source.red'
20  * should assemble fine with this tiny assembler.
21  *
22  *
23  * Functions in this file:
24  *
25  *     asm_line(), asm_file(), asm_fname(), dis1(),
26  *     discore()
27  */
28 #include <stdio.h>
29 #ifdef SYSV
30 #include <strings.h>
31 #else
32 #include <string.h>
33 #endif
34 #include <stdlib.h>
35 #include <ctype.h>
36 
37 #include "exhaust.h"
38 #include "insn.h"
39 #include "asm.h"
40 
/* str_toks_t: one entry of the keyword table -- pairs the upper-case
 * spelling of a multi-character token with its token code.
 */
typedef struct str_toks_st {
  const char *s;		/* name of token; NULL marks the table's end */
  int c;			/* token code */
} str_toks_t;
47 
/* Data
 *
 * MAX_ALL_CHARS: size of tok_buf[] and of the line buffers used by
 *            asm_file() and discore()
 * tok_buf[]: globally used to keep the contents of string tokens;
 *            written by get_tok(), read by asm_line() and
 *            panic_bad_token()
 * tok_int:   if the token was a TOK_INT, the value of the token is here
 *
 * str_toks[]: table of multicharacter tokens we identify
 *
 */

#define MAX_ALL_CHARS 256
static char tok_buf[MAX_ALL_CHARS];
static int tok_int;
60 
61 const char
62 	*MNEMONIC_OPCODE[OPCODE_LAST] = {
63 		"DAT","SPL","MOV","DJN","ADD","JMZ","SUB","SEQ","SNE","SLT","JMN","JMP","NOP","MUL","MOD","DIV","LDP","STP" },
64 	*MNEMONIC_MODIFIER[MODIFIER_LAST] = {
65 		"F","A","B","AB","BA","X","I"},
66 	MNEMONIC_ADDRMODE[ADDRMODE_LAST] = {
67 		'$','#','@','<','>','*','{','}'};
68 
69 static str_toks_t str_toks[] = {
70     { "DAT", TOK_DAT },		/* opcodes */
71     { "SPL", TOK_SPL },
72     { "MOV", TOK_MOV },
73     { "DJN", TOK_DJN },
74     { "ADD", TOK_ADD },
75     { "JMZ", TOK_JMZ },
76     { "SUB", TOK_SUB },
77     { "MOD", TOK_MOD },
78     { "CMP", TOK_SEQ },
79     { "SEQ", TOK_SEQ },
80     { "JMP", TOK_JMP },
81     { "JMN", TOK_JMN },
82     { "SNE", TOK_SNE },
83     { "MUL", TOK_MUL },
84     { "DIV", TOK_DIV },
85     { "SLT", TOK_SLT },
86     { "NOP", TOK_NOP },
87     { "LDP", TOK_LDP },
88     { "STP", TOK_STP },
89 
90     { "ORG", TOK_ORG },		/* pseudo-ops */
91     { "END", TOK_END },
92     { "PIN", TOK_PIN },
93     { "START", TOK_START },
94 
95     { "F", TOK_mF },		/* modifiers */
96     { "A", TOK_mA },
97     { "B", TOK_mB },
98     { "AB", TOK_mAB },
99     { "BA", TOK_mBA },
100     { "X", TOK_mX },
101     { "I", TOK_mI },
102     { NULL, 0 }			/* sentinel */
103 };
104 
105 
106 
107 /* NAME
108  *     get_tok -- read the next token from a string
109  *
110  * SYNOPSIS
111  *     const char *get_tok( const char *s, int *tok );
112  *
113  * INPUTS
114  *     s -- string to read token from
115  *     tok -- where we store the token code of the read token
116  *
117  * RESULTS
118  *     The token code of the read token is stored into *tok,
119  *     with 0 signifying end of input.
120  *
 *     If the token was an integer, its value is stored into
 *     the global `tok_int'.  Integers are parsed by strtol() with
 *     base 0, so decimal, octal (leading 0) and hexadecimal
 *     (leading 0x) notations are all accepted.
 *
 *     String tokens are converted to upper case when storing
 *     them into the global `tok_buf[]'.  They are truncated
 *     at 255 characters.
128  *
129  * RETURN VALUE
130  *      Pointer to the character past the read token, or
131  *	to the nul character if at end of input.
132  *
133  * GLOBALS
134  *     tok_buf[]    -- a string or char token is copied here
135  *     tok_int      -- the value of an integer token
136  *     str_toks[]   -- used to identify string tokens
137  */
138 
/* skip_white(): returns ptr. to next non-whitespace char in s, or to
 * the terminating nul if only white space remains.
 */
static
const char *
skip_white(const char *s)
{
  /* The <ctype.h> classifiers require an argument representable as
   * unsigned char (or EOF); a plain char can be negative for bytes
   * >= 0x80, so convert before classifying.
   */
  while ( isspace((unsigned char)*s) ) s++;
  return s;
}
147 
148 static
149 const char *
get_tok(const char * s,int * tok)150 get_tok( const char *s, int *tok )
151 {
152   char *tok_str = tok_buf;
153   int i;
154 
155   s = skip_white(s);
156   if ( *s == 0 )    return (*tok = 0, s);
157 
158   /*
159    * Tokenize strings.
160    *
161    * String tokens must start with a letter and consist of
162    * letters, digits, and underscores.  Strings are
163    * converted to upper case.
164    */
165   tok_buf[1] = tok_buf[0] = 0;
166 
167   i = 0;
168   if ( isalpha(*s) )
169     while ( (isalnum(*s) || *s == '_') && ++i < MAX_ALL_CHARS )
170       *tok_str++ = toupper(*s++);
171   *tok_str = 0;
172 
173   if ( tok_str > tok_buf ) {
174     /*
175      * was a string token -- identify it by searching through
176      * the str_toks[] array.
177      */
178     for ( i = 0; str_toks[i].s ; i++ ) {
179       if ( 0 == strcmp( str_toks[i].s, tok_buf ) ) {
180 	*tok = str_toks[i].c;
181 	return s;
182       }
183     }
184     *tok = TOK_STR;		/* normal string, not special */
185     return s;
186   }
187 
188 
189   /*
190    * Tokenize ints.
191    * Must match /-?[0-9]/
192    */
193   if ( isdigit(*s) ||  ( *s == '-' && isdigit(*(s+1)) )) {
194     char *endptr;
195     tok_int = strtol( s, &endptr, 0 );
196     *tok = TOK_INT;
197     return endptr;
198   }
199 
200 
201   /*
202    * Tokenize addressing modes and pass single chars
203    */
204 
205   tok_buf[0] = *s;		/* store char value as single */
206   tok_buf[1] = 0;		/* char string. */
207 
208   switch ( *tok = *s++ ) {
209   case '$': *tok = TOK_DIRECT;          break;
210   case '#': *tok = TOK_IMMEDIATE;	break;
211   case '*': *tok = TOK_AINDIRECT;	break;
212   case '@': *tok = TOK_BINDIRECT;	break;
213   case '{': *tok = TOK_APREDEC;		break;
214   case '<': *tok = TOK_BPREDEC;		break;
215   case '}': *tok = TOK_APOSTINC;	break;
216   case '>': *tok = TOK_BPOSTINC;	break;
217   }
218 
219   return s;
220 }
221 
222 
223 
224 /* NAME
225  *     panic_bad_token -- issue an error message for a bad token and exit(1)
226  *
227  * SYNOPSIS
228  *     void panic_bad_token( int tok, const char *expected );
229  *
230  * INPUTS
231  *     tok -- token code of unexpected token
232  *     expected -- a string describing what kind of token
233  *		   was expected.  e.g. "a modifier".
234  *
235  * RESULTS
236  *     A message Informing the user of the unexpected token,
237  *     its possible semantic value, and what type of token
238  *     was expected instead.
239  *
240  * GLOBALS
241  *     tok_buf, tok_int -- if the token has semantic value we look
242  *                         for it here.
243  * BUGS
244  *     The error message should be much better -- not even location
245  *     in the source is given here. *sigh*
246  */
247 static
248 void
panic_bad_token(int tok,const char * expected)249 panic_bad_token( int tok, const char *expected )
250 {
251   char *errstr = NULL;
252   char buf[30];
253 
254   memset(buf, 0, 30);
255 
256   /* make an errstr
257    */
258   if ( tok_buf[0] )
259     errstr = tok_buf;
260   if ( tok == TOK_INT ) {
261     sprintf(buf, "%d", tok_int );
262     errstr = buf;
263   }
264 
265   /* complain and exit with error code
266    */
267   fprintf(stderr, "token '%s' not %s\n", errstr, expected );
268   exit(1);
269 }
270 
271 
272 /* NAME
273  *     asm_line -- assemble a line to an instruction
274  *
275  * SYNOPSIS
276  *     int asm_line( const char *line, insn_t *in, unsigned int CORESIZE );
277  *
278  * INPUTS
279  *     line -- line to assemble
280  *     in   -- instruction to assemble into
281  *     CORESIZE -- size of core
282  *
283  * RESULTS
284  *     If there was anything to assemble, it is assembled into
285  *     `in'.  If there was a START label, the corresponding flag
286  *     is set in the instructions flags.  Incomplete or erroneous
287  *     input prompt a quick error message and exit(1).
288  *
289  *     If the 'ORG start-address' construct is encountered where
290  *     `start-address' is an integer, then the `in->a' field contains
291  *     the offset in instructions from the start of the warrior
292  *     where the warrior should start execution.
293  *
294  *     If 'PIN id' is encountered, where `id' is an integer, then the
295  *     `in->a' field contains the `id'.
296  *
297  * RETURN VALUE
298  *     ASMLINE_PIN  : pseudo-op 'PIN' encountered, id saved in `in->a'.
299  *     ASMLINE_ORG  : pseudo-op 'ORG' encountered, warrior start
300  *                    saved in `in->a'.
301  *     ASMLINE_DONE : done assembling, END opcode found, nothing assembled.
302  *     ASMLINE_NONE : nothing to assemble on this line.
303  *     ASMLINE_OK   : assembled instruction into `in' OK.
304  *
305  * GLOBALS
306  *     tok_int, tok_buf[], str_toks[] somewhere down the line.
307  */
308 
309 int
asm_line(const char * line,insn_t * in,unsigned int CORESIZE)310 asm_line( const char *line, insn_t *in, unsigned int CORESIZE  )
311 {
312   const char *s = line;
313   int tok;
314   int flags = 0;
315   int op, m, ma, mb;		/* opcode, modifier, a-mode, b-mode */
316 
317   s = get_tok( s, &tok );
318   if ( tok == 0 ) return ASMLINE_NONE;
319 
320   /*
321    * Ignore string lines '^Program.*' and comments.
322    */
323   if ( tok == TOK_STR && 0 == strcmp( "PROGRAM", tok_buf ))
324   {
325     return ASMLINE_NONE;
326   }
327   if ( tok == ';' ) return ASMLINE_NONE;
328 
329   /*
330    * Now match the instruction's various components:
331    *   [START label,] opcode, modifier, a-mode, a-value, b-mode, b-value
332    */
333 
334   /* Match possible start label
335    */
336   if ( tok == TOK_START ) {
337     flags |= fl_START;
338     s = get_tok( s, &tok );
339   }
340 
341   /* Match opcode
342    */
343   if ( is_tok_pseudoop(tok) ) {
344     switch ( tok ) {
345     case TOK_END:
346       return ASMLINE_DONE;	/* signal done assembling */
347 
348     case TOK_ORG:
349       s = get_tok( s, &tok );	/* get the next token */
350 
351       if ( tok == TOK_START )	/* ignore: */
352 	return ASMLINE_NONE;	/* start label already matched and processed */
353 
354       if ( tok != TOK_INT ) {
355 	panic_bad_token( tok, "an integer -- an int or \"START\" "
356 			 "follows ORG" );
357       }
358       in->a = tok_int;
359       return ASMLINE_ORG;
360 
361     case TOK_PIN:
362       s = get_tok( s, &tok );
363       if ( tok != TOK_INT ) {
364 	panic_bad_token( tok, "an integer -- PIN must be an unsigned integer");
365       }
366       in->a = tok_int;
367       return ASMLINE_PIN;
368 
369     default:
370       panic_bad_token( tok, "a pseudo-op (internal assembler error)" );
371     }
372   }
373   if (!( is_tok_opcode(tok)))
374     panic_bad_token( tok, "an opcode" );
375 
376   op = DAT;
377   switch(tok) {
378   case TOK_DAT: op = DAT; break;
379   case TOK_SPL: op = SPL; break;
380   case TOK_MOV: op = MOV; break;
381   case TOK_JMP: op = JMP; break;
382   case TOK_JMZ: op = JMZ; break;
383   case TOK_JMN: op = JMN; break;
384   case TOK_ADD: op = ADD; break;
385   case TOK_SUB: op = SUB; break;
386   case TOK_SEQ: op = SEQ; break;
387   case TOK_SNE: op = SNE; break;
388   case TOK_MUL: op = MUL; break;
389   case TOK_DIV: op = DIV; break;
390   case TOK_DJN: op = DJN; break;
391   case TOK_SLT: op = SLT; break;
392   case TOK_MOD: op = MODM; break;
393   case TOK_NOP: op = NOP; break;
394   case TOK_LDP: op = LDP; break;
395   case TOK_STP: op = STP; break;
396   default:
397     panic_bad_token( tok, "an opcode" );
398   }
399 
400   /* Match modifier
401    */
402   s = get_tok( s, &tok );	/* first the '.' */
403   if ( tok != '.' )
404     panic_bad_token( tok, "'.'" );
405 
406   s = get_tok( s, &tok );	/* then the modifier itself */
407   if ( ! is_tok_modifier(tok) )
408     panic_bad_token( tok, "a modifier");
409   m = tok - TOK_mF;
410 
411   /* Match a-field addressing mode and a-field
412    */
413   s = get_tok( s, &tok );
414   if ( ! is_tok_mode(tok) )
415     panic_bad_token( tok, "an addressing mode specifier");
416   ma = tok - TOK_DIRECT;
417 
418   s = get_tok( s, &tok );
419   if ( tok != TOK_INT )
420     panic_bad_token( tok, "an integer");
421   in->a = MODS(tok_int,CORESIZE);
422 
423   /* Match comma
424    */
425   s = get_tok( s, &tok );
426   if ( tok != ',' )
427     panic_bad_token( tok, "','" );
428 
429   /* Match b-field addressing mode and a-field
430    */
431   s = get_tok( s, &tok );
432   if ( ! is_tok_mode(tok) )
433     panic_bad_token( tok, "an addressing mode specifier");
434   mb = tok - TOK_DIRECT;
435 
436   s = get_tok( s, &tok );
437   if ( tok != TOK_INT )
438     panic_bad_token( tok, "an integer");
439   in->b = MODS(tok_int,CORESIZE);
440 
441 
442   /*
443    * Set flags and ignore the rest of the line
444    */
445   in->in = (flags << flPOS) | OP( op, m, ma, mb );
446   return ASMLINE_OK;
447 }
448 
449 
450 
451 
452 /* NAME
453  *     asm_file, asm_fname -- assemble a FILE into a warrior
454  *
455  * SYNOPSIS
456  *     void asm_file( FILE *F, warrior_t *w, unsigned int CORESIZE );
457  *     void asm_fname( const char *filename, warrior_t *w,
458  *    	               unsigned int CORESIZE );
459  *
460  * INPUTS
461  *     w        -- warrior_t to assemble into.
462  *     F        -- stream to read warrior source from
463  *     filename -- path to source file.  May be '-'
464  *		   which is interpreted as stdin.
465  *     CORESIZE -- just that
466  *
467  * DESCRIPTION
468  *     These functions assemble a source file into a
469  *     warrior_t setting all the non-info fields.
470  *
471  * RESULTS
 *    If the warrior assembled correctly, then warrior_t
 *    contains its code and starting offset.  If an error
 *    occurred during assembly, an error message is issued
 *    and the program exit()s.
476  *
477  * GLOBALS
478  *     none as such, subroutines use tok_buf[], tok_int, str_toks[],
479  *     MAXLENGTH constant
480  *
481  * SEE ALSO
482  *     asm_line()
483  *
484  * BUGS
 *     It's not really acceptable to exit() on an assembly error.
486  */
487 void
asm_file(FILE * F,warrior_t * w,unsigned int CORESIZE)488 asm_file( FILE *F, warrior_t *w, unsigned int CORESIZE )
489 {
490   char line[MAX_ALL_CHARS];
491   insn_t *c;
492   int ret;			/* return code from asm_line() */
493 
494   w->len = w->start = 0;
495   w->have_pin = 0;
496   w->pin = 0;
497   c = w->code;
498 
499   while ( fgets(line, MAX_ALL_CHARS, F) ) {
500     ret = asm_line( line, c, CORESIZE );
501     if ( ret == ASMLINE_DONE ) break;
502 
503     switch ( ret ) {
504     case ASMLINE_OK:
505       if ( get_flags( c->in ) & fl_START ) {
506 	w->start = w->len;
507 	clr_flags( c->in, fl_START );
508       }
509       if ( w->len < MAXLENGTH) c++;
510       w->len++;
511       break;
512 
513     case ASMLINE_ORG:
514       w->start =  c->a;		/* was `ORG int', get the starting address */
515       break;
516 
517     case ASMLINE_NONE:
518       break;			/* nop */
519 
520     case ASMLINE_PIN:
521       w->have_pin = 1;
522       w->pin = c->a;		/* save PIN. */
523       break;
524 
525     default:
526       fprintf(stderr,"asm.c/asm_file(): illegal return code from asm_line()\n");
527       exit(1);
528     }
529     if ( w->len > MAXLENGTH ) {
530       fprintf(stderr, "too many instructions in warrior %d\n", w->no);
531       exit(1);
532     }
533   }
534   if ( w->start >= w->len ) {
535     fprintf(stderr, "starting address must be inside warrior body\n" );
536     exit(1);
537   }
538 }
539 
540 
541 void
asm_fname(const char * fname,warrior_t * w,unsigned int CORESIZE)542 asm_fname( const char *fname, warrior_t *w, unsigned int CORESIZE )
543 {
544   FILE *F;
545   int is_stdin = 0;
546 
547   if ( strcmp( fname, "-" ) == 0 ) {
548     F = stdin;
549     is_stdin = 1;
550   }
551   else
552     if (!( F = fopen(fname, "r") )) {
553       fprintf(stderr, "can't open file %s\n", fname);
554       exit(1);
555     }
556 
557   asm_file(F, w, CORESIZE);
558 
559   if ( !is_stdin ) fclose(F);
560 }
561 
562 
563 
564 /* NAME
 *     dis1 -- disassemble an instruction
 *     discore -- disassemble a segment of core
 *
 * SYNOPSIS
 *     void dis1( char *s, insn_t in, unsigned int CORESIZE );
 *     void discore( const insn_t *core, int start, int end,
 * 		     unsigned int CORESIZE );
572  *
573  * INPUTS
574  *     s -- string to print disassembled instruction to. A string
575  *          of length 60 should be more than sufficient.
576  *     in -- instruction to disassemble
577  *     core -- pointer to start of core
578  *     start -- core segment start offset
579  *     end -- core segment end offset (excluded)
580  *
581  * RESULTS
 *     dis1 -- The disassembled instruction is printed to `s'.
 *     discore -- A segment of core is disassembled and printed
 *                to stdout with core addresses.
585  */
586 
587 void
dis1(char * buf,insn_t in,unsigned int CORESIZE)588 dis1(char *buf, insn_t in, unsigned int CORESIZE)
589 {
590   int x;
591   char *op_s, *mo_s, *ma_s, *mb_s;
592   int af, bf;
593 
594   x = (in.in >> opPOS) & opMASK;
595   switch( x ) {
596   case DAT: op_s = "dat"; break;
597   case SPL: op_s = "spl"; break;
598   case MOV: op_s = "mov"; break;
599   case JMP: op_s = "jmp"; break;
600   case JMZ: op_s = "jmz"; break;
601   case JMN: op_s = "jmn"; break;
602   case ADD: op_s = "add"; break;
603   case SUB: op_s = "sub"; break;
604   case SEQ: op_s = "seq"; break;
605   case SNE: op_s = "sne"; break;
606   case MUL: op_s = "mul"; break;
607   case DIV: op_s = "div"; break;
608   case DJN: op_s = "djn"; break;
609   case SLT: op_s = "slt"; break;
610   case MODM: op_s = "mod"; break;
611   case NOP: op_s = "nop"; break;
612   case LDP: op_s = "ldp"; break;
613   case STP: op_s = "stp"; break;
614   default:
615     op_s = "???";
616   }
617 
618   x = (in.in >> moPOS) & moMASK;
619   switch ( x ) {
620   case mF:  mo_s = "f "; break;
621   case mA:  mo_s = "a "; break;
622   case mB:  mo_s = "b "; break;
623   case mAB: mo_s = "ab"; break;
624   case mBA: mo_s = "ba"; break;
625   case mX:  mo_s = "x "; break;
626   case mI:  mo_s = "i "; break;
627   default:
628     mo_s = "?";
629   }
630 
631 
632   x = (in.in >> maPOS) & mMASK;
633   switch (x) {
634   case DIRECT: ma_s = "$"; break;
635   case IMMEDIATE: ma_s = "#"; break;
636   case AINDIRECT: ma_s = "*"; break;
637   case BINDIRECT: ma_s = "@"; break;
638   case APREDEC: ma_s = "{"; break;
639   case APOSTINC: ma_s = "}"; break;
640   case BPREDEC: ma_s = "<"; break;
641   case BPOSTINC: ma_s = ">"; break;
642   default: ma_s = "?";
643   }
644 
645   x = (in.in >> mbPOS) & mMASK;
646   switch (x) {
647   case DIRECT:    mb_s = "$"; break;
648   case IMMEDIATE: mb_s = "#"; break;
649   case AINDIRECT: mb_s = "*"; break;
650   case BINDIRECT: mb_s = "@"; break;
651   case APREDEC:   mb_s = "{"; break;
652   case APOSTINC:  mb_s = "}"; break;
653   case BPREDEC:   mb_s = "<"; break;
654   case BPOSTINC:  mb_s = ">"; break;
655   default: mb_s = "?";
656   }
657 
658   af = in.a <= CORESIZE/2 ? in.a : in.a - CORESIZE;
659   bf = in.b <= CORESIZE/2 ? in.b : in.b - CORESIZE;
660 
661   sprintf(buf,"%s.%s %s%5d , %s%5d", op_s, mo_s, ma_s, af, mb_s, bf);
662 }
663 
664 
665 void
discore(const insn_t * core,int start,int end,unsigned int CORESIZE)666 discore( const insn_t *core,
667 	 int start,
668 	 int end,
669 	 unsigned int CORESIZE )
670 {
671   int adr;
672   char line[MAX_ALL_CHARS];
673   for ( adr = start; adr < end; adr++ ) {
674     int i = MODS(adr, CORESIZE);
675     dis1( line, core[i], CORESIZE );
676     printf("%4d    %s\n", adr, line);
677   }
678 }
679