1% This file is part of the Stanford GraphBase (c) Stanford University 1993
2@i boilerplate.w %<< legal stuff: PLEASE READ IT BEFORE MAKING ANY CHANGES!
3
4\def\title{GB\_\,IO}
5
6@* Introduction. This is {\sc GB\_\,IO}, the input/output module used
7by all GraphBase routines to access data~files. It doesn't actually do
8any output; but somehow `input/output' sounds like a more useful title
9than just `input'.
10
11All files of GraphBase data are designed to produce identical results on
12almost all existing computers and operating systems. Each line of each file
13contains at most 79 characters. Each character is either a blank or a
14digit or an uppercase letter or a lowercase letter or a standard punctuation
15mark. Blank characters at the end of each line are ``invisible''; that is,
16they have no perceivable effect. Hence identical results will be obtained on
17record-oriented systems that pad every line with blanks.
18
19The data is carefully sum-checked so that defective input files have little
20chance of being accepted.
21
22@ Changes might be needed when these routines are ported to different
23systems. Sections of the program that are most likely to require such changes
24are listed under `system dependencies' in the index.
25
26A validation program is provided so that installers can tell if {\sc GB\_\,IO}
27is working properly. To make the test, simply run \.{test\_io}.
28
29@(test_io.c@>=
30#include "gb_io.h"
31  /* all users of {\sc GB\_\,IO} should include this header file */
32#define exit_test(m) /* we invoke this macro if something goes wrong */\
33 {@+fprintf(stderr,"%s!\n(Error code = %ld)\n",m,io_errors);@+return -1;@+}
34@t\2@>@/
35int main()
36{
37  @<Test the |gb_open| routine; exit if there's trouble@>;
38  @<Test the sample data lines; exit if there's trouble@>;
39  @<Test the |gb_close| routine; exit if there's trouble@>;
40  printf("OK, the gb_io routines seem to work!\n");
41  return 0;
42}
43
44@ The external variable |io_errors| mentioned in the previous section
45will be set nonzero if any anomalies are detected. Errors won't occur
46in normal use of GraphBase programs, so no attempt has been made to
47provide a user-friendly way to decode the nonzero values that
48|io_errors| might assume.  Information is simply gathered in binary
49form; system wizards who might need to do a bit of troubleshooting
50should be able to decode |io_errors| without great pain.
51
52@d cant_open_file 0x1 /* bit set in |io_errors| if |fopen| fails */
53@d cant_close_file 0x2 /* bit set if |fclose| fails */
54@d bad_first_line 0x4 /* bit set if the data file's first line isn't legit */
55@d bad_second_line 0x8 /* bit set if the second line doesn't pass muster */
56@d bad_third_line 0x10 /* bit set if the third line is awry */
57@d bad_fourth_line 0x20 /* guess when this bit is set */
58@d file_ended_prematurely 0x40 /* bit set if |fgets| fails */
59@d missing_newline 0x80 /* bit set if line is too long or |'\n'| is missing */
60@d wrong_number_of_lines 0x100 /* bit set if the line count is wrong */
61@d wrong_checksum 0x200 /* bit set if the checksum is wrong */
62@d no_file_open 0x400 /* bit set if user tries to close an unopened file */
63@d bad_last_line 0x800 /* bit set if final line has incorrect form */
64
65@ The \CEE/ code for {\sc GB\_\,IO} doesn't have a main routine; it's just a
66bunch of subroutines to be incorporated into programs at a higher level
67via the system loading routine. Here is the general outline of \.{gb\_io.c}:
68
69@p
70@<Header files to include@>@;
71@h
72@<External declarations@>@;
73@<Private declarations@>@;
74@<Internal functions@>@;
75@<External functions@>
76
77@ Every external variable is declared twice in this \.{CWEB} file:
78once for {\sc GB\_\,IO} itself (the ``real'' declaration for storage
79allocation purposes) and once in \.{gb\_io.h} (for cross-references
80by {\sc GB\_\,IO} users).
81
82@<External declarations@>=
83long io_errors; /* record of anomalies noted by {\sc GB\_\,IO} routines */
84
85@ @(gb_io.h@>=
86@<Header...@>@;
87extern long io_errors;
88 /* record of anomalies noted by {\sc GB\_\,IO} routines */
89
90@ We will stick to standard \CEE/-type input conventions. We'll also have
91occasion to use some of the standard string operations.
92
93@<Header...@>=
94#include <stdio.h>
95#ifdef SYSV
96#include <string.h>
97#else
98#include <strings.h>
99#endif
100
101@* Inputting a line. The {\sc GB\_\,IO} routines get their input from
102an array called |buffer|. This array is internal to {\sc
103GB\_\,IO}---its contents are hidden from user programs. We make it 81
104characters long, since the data is supposed to have at most 79
105characters per line, followed by newline and null.
106
107@<Private...@>=
108static char buffer[81]; /* the current line of input */
109static char *cur_pos=buffer; /* the current character of interest */
110static FILE *cur_file; /* current file, or |NULL| if none is open */
111
112@ Here's a basic subroutine to fill the |buffer|. The main feature of interest
113is the removal of trailing blanks. We assume that |cur_file| is open.
114
115Notice that a line of 79 characters (followed by |'\n'|) will just fit into
116the buffer, and will cause no errors. A line of 80 characters will
117be split into two lines and the |missing_newline|
118message will occur, because of the way |fgets| is defined. A |missing_newline|
119error will also occur if the file ends in the middle of a line, or if
120a null character (|'\0'|) occurs within a line.
121
122@<Internal...@>=
123static void fill_buf()
124{@+register char *p;
125  if (!fgets(buffer,sizeof(buffer),cur_file)) {
126    io_errors |= file_ended_prematurely; buffer[0]=more_data=0;
127  }
128  for (p=buffer; *p; p++) ; /* advance to first null character */
129  if (p--==buffer || *p!='\n') {
130    io_errors |= missing_newline; p++;
131  }
132  while (--p>=buffer && *p==' ') ; /* move back over trailing blanks */
133  *++p='\n'; *++p=0; /* newline and null are always present at end of line */
134  cur_pos=buffer; /* get ready to read |buffer[0]| */
135}
136
137@* Checksums. Each data file has a ``magic number,'' which is defined to be
138$$\biggl(\sum_l 2^l c_l\biggr) \bmod p\,.$$
139Here $p$ is a large prime number, and $c_l$ denotes the internal code
140corresponding to the $l$th-from-last
141data character read (including newlines but not nulls).
142
143The ``internal codes'' $c_l$ are computed in a system-independent way:
144Each character |c| in the actual encoding scheme being used has a
145corresponding |icode|, which is the same on all systems. For example,
146the |icode| of |'0'| is zero, regardless of whether |'0'| is actually
147represented in ASCII or EBCDIC or some other scheme. (We assume that
148every modern computer system is capable of printing at least 95
149different characters, including a blank space.)
150
151We will accept a data file as error-free if it has the correct number of
152lines and ends with the proper magic number.
153
154@<Private...@>=
155static char icode[256]; /* mapping of characters to internal codes */
156static long checksum_prime=(1L<<30)-83;
157  /* large prime such that $2p+|unexpected_char|$ won't overflow */
158static long magic; /* current checksum value */
159static long line_no; /* current line number in file */
160static long final_magic; /* desired final magic number */
161static long tot_lines; /* total number of data lines */
162static char more_data; /* is there data still waiting to be read? */
163
164@ The |icode| mapping is defined by a single string, |imap|, such that
165character |imap[k]| has |icode| value~|k|. There are 96 characters
166in |imap|, namely the 94 standard visible ASCII codes plus space
167and newline. If EBCDIC code is used instead of ASCII, the
168cents sign \rlap{\.{\kern.05em/}}\.c should take the place of single-left-quote
169\.{\char`\`}, and \.{\char5}~should take the place of\/~\.{\char`\~}.
170
171All characters that don't appear in |imap| are given the same |icode|
172value, called |unexpected_char|. Such characters should be avoided in
173GraphBase files whenever possible. (If they do appear, they can still
174get into a user's data, but we don't distinguish them from each other
175for checksumming purposes.)
176
177The |icode| table actually plays a dual role, because we've rigged it so that
178codes 0--15 come from the characters |"0123456789ABCDEF"|. This facilitates
179conversion of decimal and hexadecimal data. We can also use it for
180radices higher than 16.
181
182@d unexpected_char 127 /* default |icode| value */
183
184@<Private...@>=
185static char *imap="0123456789ABCDEFGHIJKLMNOPQRSTUVWXYZ\
186abcdefghijklmnopqrstuvwxyz_^~&@@,;.:?!%#$+-*/|\\<=>()[]{}`'\" \n";
187
188@ Users of {\sc GB\_\,IO} can look at the |imap|, but they can't change it.
189
190@<External fun...@>=
191char imap_chr(d)
192  long d;
193{
194  return d<0 || d>strlen(imap)? '\0': imap[d];
195}
196@#
197long imap_ord(c)
198  char c;
199{
200  @<Make sure that |icode| has been initialized@>;
201  return (c<0||c>255)? unexpected_char: icode[c];
202}
203
204@ @(gb_io.h@>=
205#define unexpected_char @t\quad@> 127
206extern char imap_chr(); /* the character that maps to a given character */
207extern long imap_ord(); /* the ordinal number of a given character */
208
209@ @<Make sure that |icode| has been initialized@>=
210if (!icode['1']) icode_setup();
211
212@ @<Internal...@>=
213static void icode_setup()
214{@+register long k;
215  register char *p;
216  for (k=0;k<256;k++) icode[k]=unexpected_char;
217  for (p=imap,k=0; *p; p++,k++) icode[*p]=k;
218}
219
220@ Now we're ready to specify some external subroutines that do
221input.  Calling |gb_newline()| will read the next line of
222data into |buffer| and update the magic number accordingly.
223
224@(gb_io.h@>=
225extern void gb_newline(); /* advance to next line of the data file */
226extern long new_checksum(); /* compute change in magic number */
227
228@ Users can compute checksums as |gb_newline| does, but they can't
229change the (private) value of |magic|.
230
231@<External f...@>=
232long new_checksum(s,old_checksum)
233  char *s; /* a string */
234  long old_checksum;
235{@+register long a=old_checksum;
236  register char*p;
237  for (p=s; *p; p++)
238    a=(a+a+imap_ord(*p)) % checksum_prime;
239  return a;
240}
241
242@ The magic checksum is not affected by lines that begin with \.*.
243
244@<External f...@>=
245void gb_newline()
246{
247  if (++line_no>tot_lines) more_data=0;
248  if (more_data) {
249    fill_buf();
250    if (buffer[0]!='*')
251      magic=new_checksum(buffer,magic);
252  }
253}
254
255@ Another simple routine allows a user to read (but not write) the
256variable |more_data|.
257
258@(gb_io.h@>=
259extern long gb_eof(); /* has the data all been read? */
260
261@ @<External f...@>=
262long gb_eof() { return !more_data; }
263
264@* Parsing a line. The user can input characters from the buffer in several
265ways. First, there's a basic |gb_char()| routine, which returns
266a single character. The character is |'\n'| if the last character on the
267line has already been read (and it continues to be |'\n'| until the user calls
268|gb_newline|).
269
270The current position in the line, |cur_pos|, always advances when |gb_char|
271is called, unless |cur_pos| was already at the end of the line.
272There's also a |gb_backup()| routine, which moves |cur_pos| one place
273to the left unless it was already at the beginning.
274
275@(gb_io.h@>=
276extern char gb_char(); /* get next character of current line, or |'\n'| */
277extern void gb_backup(); /* move back ready to scan a character again */
278
279@ @<External f...@>=
280char gb_char()
281{
282  if (*cur_pos) return (*cur_pos++);
283  return '\n';
284}
285@#
286void gb_backup()
287{
288  if (cur_pos>buffer)
289    cur_pos--;
290}
291
292@ There are two ways to read numerical data. The first, |gb_digit(d)|,
293expects to read a single character in radix~|d|, using |icode| values
294to specify digits greater than~9. (Thus, for example, |'A'| represents
295the hexadecimal digit for decimal~10.)
296If the next character is a valid |d|-git,
297|cur_pos| moves to the next character and the numerical value is returned.
298Otherwise |cur_pos| stays in the same place and $-1$ is returned.
299
300The second routine, |gb_number(d)|, reads characters and forms an
301unsigned radix-|d| number until the first non-digit is encountered.
302The resulting number is returned; it is zero if no digits were found.
303No errors are possible with this routine, because it uses
304|unsigned long| arithmetic.
305
306@(gb_io.h@>=
307extern long gb_digit(); /* |gb_digit(d)| reads a digit between 0 and |d-1| */
308extern unsigned long gb_number(); /* |gb_number(d)| reads a radix-|d| number */
309
310@ The value of |d| should be at most 127, if users want their programs to be
311portable, because \CEE/ does not treat larger |char| values in a
312well-defined manner. In most applications, |d| is of course either 10 or 16.
313
314@<External f...@>=
315long gb_digit(d)
316    char d;
317{
318  icode[0]=d; /* make sure |'\0'| is a nondigit */
319  if (imap_ord(*cur_pos)<d) return icode[*cur_pos++];
320  return -1;
321}
322@#
323unsigned long gb_number(d)
324    char d;
325{@+register unsigned long a=0;
326  icode[0]=d; /* make sure |'\0'| is a nondigit */
327  while (imap_ord(*cur_pos)<d)
328    a=a*d+icode[*cur_pos++];
329  return a;
330}
331
332@ The final subroutine for fetching data is |gb_string(p,c)|, which
333stores a null-terminated string into locations starting at~|p|.
334The string starts at |cur_pos| and ends just before the first appearance
335of character |c|. If |c=='\n'|, the string will stop at the end of the line.
336If |c| doesn't appear in the buffer at or after |cur_pos|, the last character
337of the string will be the |'\n'| that is always inserted at the end
338of a line, unless the entire line has already been read. (If the entire
339line has previously been read, the empty string is always returned.)
340After the string has been copied, |cur_pos| advances past it.
341
342In order to use this routine safely, the user should first check that
343there is room to store up to 81 characters beginning at location~|p|.
344A suitable place to put the result, called |str_buf|, is provided
345for the user's convenience.
346
347The location following the stored string is returned. Thus, if the
348stored string has length~|l| (not counting the null character that is
349stored at the end), the value returned will be |p+l+1|.
350
351@(gb_io.h@>=
352#define STR_BUF_LENGTH 160
353extern char str_buf[]; /* safe place to receive output of |gb_string| */
354extern char *gb_string(); /* |gb_string(p,c)| reads a string delimited by |c|
355  into bytes starting at |p| */
356
357@ @d STR_BUF_LENGTH 160
358
359@<External f...@>=
360char str_buf[STR_BUF_LENGTH]; /* users can put strings here if they wish */
361char *gb_string(p,c)
362    char *p; /* where to put the result */
363    char c; /* character following the string */
364{
365  while (*cur_pos && *cur_pos!=c)
366    *p++=*cur_pos++;
367  *p++=0;
368  return p;
369}
370
371@ Here's how we test those routines in \.{test\_io}: The first line of test
372data consists of 79 characters, beginning with 64 zeroes and ending with
373`\.{123456789ABCDEF}'. The second line is completely blank. The third
374and final line says `\.{Oops:(intentional mistake)}'.
375
376@<Test the sample data lines...@>=
377if (gb_number(10)!=123456789)
378  io_errors |= 1L<<20; /* decimal number not working */
379if (gb_digit(16)!=10)
380  io_errors |= 1L<<21; /* we missed the \.A following the decimal number */
381gb_backup();@+ gb_backup(); /* get set to read `\.{9A}' again */
382if (gb_number(16)!=0x9ABCDEF)
383  io_errors |= 1L<<22; /* hexadecimal number not working */
384gb_newline(); /* now we should be scanning a blank line */
385if (gb_char()!='\n')
386  io_errors |= 1L<<23; /* newline not inserted at end */
387if (gb_char()!='\n')
388  io_errors |= 1L<<24; /* newline not implied after end */
389if (gb_number(60)!=0)
390  io_errors |= 1L<<25; /* number should stop at null character */
391{@+char temp[100];
392  if (gb_string(temp,'\n')!=temp+1)
393    io_errors |= 1L<<26; /* string should be null after end of line */
394  gb_newline();
395  if (gb_string(temp,':')!=temp+5 || strcmp(temp,"Oops"))
396    io_errors |= 1L<<27; /* string not read properly */
397}
398if (io_errors)
399  exit_test("Sorry, it failed. Look at the error code for clues");
400if (gb_digit(10)!=-1) exit_test("Digit error not detected");
401if (gb_char()!=':')
402  io_errors |= 1L<<28; /* lost synch after |gb_string| and |gb_digit| */
403if (gb_eof())
404  io_errors |= 1L<<29; /* premature end-of-file indication */
405gb_newline();
406if (!gb_eof())
407  io_errors |= 1L<<30; /* postmature end-of-file indication */
408
409@* Opening a file. The call |gb_raw_open("foo")| will open file |"foo"| and
410initialize the checksumming process. If the file cannot be opened,
411|io_errors| will be set to |cant_open_file|, otherwise
412|io_errors| will be initialized to zero.
413
414The call |gb_open("foo")| is a stronger version of |gb_raw_open|, which
415is used for standard GraphBase data files like |"words.dat"| to make
416doubly sure that they have not been corrupted. It returns the current value
417of |io_errors|, which will be nonzero if any problems were detected
418at the beginning of the file.
419
420@<Test the |gb_open| routine...@>=
421if (gb_open("test.dat")!=0)
422  exit_test("Can't open test.dat");
423
424@ @d gb_raw_open gb_r_open /* abbreviation for Procrustean external linkage */
425
426@(gb_io.h@>=
427#define gb_raw_open gb_r_open
428extern void gb_raw_open(); /* open a file for GraphBase input */
429extern long gb_open(); /* open a GraphBase data file; return 0 if OK */
430
431@ @<External f...@>=
432void gb_raw_open(f)
433    char *f;
434{
435  @<Make sure that |icode|...@>;
436  @<Try to open |f|@>;
437  if (cur_file) {
438    io_errors=0;
439    more_data=1;
440    line_no=magic=0;
441    tot_lines=0x7fffffff; /* allow ``infinitely many'' lines */
442    fill_buf();
443  }@+else io_errors=cant_open_file;
444}
445
446@ Here's a possibly system-dependent part of the code: We try first to
447open the data file by using the file name itself as the path name;
448failing that, we try to prefix the file name with the name of the
449standard directory for GraphBase data, if the program has been compiled
450with |DATA_DIRECTORY| defined.
451@^system dependencies@>
452
453@<Try to open |f|@>=
454cur_file=fopen(f,"r");
455@^system dependencies@>
456#ifdef DATA_DIRECTORY
457if (!cur_file && (strlen(DATA_DIRECTORY)+strlen(f)<STR_BUF_LENGTH)) {
458  sprintf(str_buf,"%s%s",DATA_DIRECTORY,f);
459  cur_file=fopen(str_buf,"r");
460}
461#endif
462
463@ @<External f...@>=
464long gb_open(f)
465    char *f;
466{
467  strncpy(file_name,f,sizeof(file_name)-1);
468     /* save the name for use by |gb_close| */
469  gb_raw_open(f);
470  if (cur_file) {
471    @<Check the first line; return if unsuccessful@>;
472    @<Check the second line; return if unsuccessful@>;
473    @<Check the third line; return if unsuccessful@>;
474    @<Check the fourth line; return if unsuccessful@>;
475    gb_newline(); /* the first line of real data is now in the buffer */
476  }
477  return io_errors;
478}
479
480@ @<Private...@>=
481static char file_name[20]; /* name of the data file, without a prefix */
482
483@ The first four lines of a typical data file should look something like this:
484$$\halign{\hskip5em\.{#}\hfill\cr
485 * File "words.dat" from the Stanford GraphBase (C) 1993 Stanford University\cr
486 * A database of English five-letter words\cr
487 * This file may be freely copied but please do not change it in any way!\cr
488 * (Checksum parameters 5757,526296596)\cr}$$
489We actually verify only that the first four lines of a data file named |"foo"|
490begin respectively with the characters
491$$\halign{\hskip5em\.{#}\hfill\cr
492 * File "foo"\cr
493 *\cr
494 *\cr
495 * (Checksum parameters $l$,$m$)\cr}$$
496where $l$ and $m$ are decimal numbers. The values of $l$ and~$m$
497are stored away as |tot_lines| and |final_magic|, to be matched at the
498end of the file.
499
500@<Check the first line...@>=
501sprintf(str_buf,"* File \"%s\"",f);
502if (strncmp(buffer,str_buf,strlen(str_buf)))
503  return (io_errors |= bad_first_line);
504
505@ @<Check the second line...@>=
506fill_buf();
507if (*buffer!='*') return (io_errors |= bad_second_line);
508
509@ @<Check the third line...@>=
510fill_buf();
511if (*buffer!='*') return (io_errors |= bad_third_line);
512
513@ @<Check the fourth line; return if unsuccessful@>=
514fill_buf();
515if (strncmp(buffer,"* (Checksum parameters ",23))
516  return (io_errors |= bad_fourth_line);
517cur_pos +=23;
518tot_lines=gb_number(10);
519if (gb_char()!=',')
520  return (io_errors |= bad_fourth_line);
521final_magic=gb_number(10);
522if (gb_char()!=')')
523  return (io_errors |= bad_fourth_line);
524
525@* Closing a file. After all data has been input, or should have been input,
526we check that the file was open and that it had the correct number of
527lines, the correct magic number, and a correct final line.  The
528subroutine |gb_close|, like |gb_open|, returns the value of
529|io_errors|, which will be nonzero if at least one problem was noticed.
530
531@<Test the |gb_close| routine; exit if there's trouble@>=
532if (gb_close()!=0)
533  exit_test("Bad checksum, or difficulty closing the file");
534
535@ @<External f...@>=
536long gb_close()
537{
538  if (!cur_file)
539    return (io_errors |= no_file_open);
540  fill_buf();
541  sprintf(str_buf,"* End of file \"%s\"",file_name);
542  if (strncmp(buffer,str_buf,strlen(str_buf)))
543    io_errors |= bad_last_line;
544  more_data=buffer[0]=0;
545   /* now the {\sc GB\_\,IO} routines are effectively shut down */
546   /* we have |cur_pos=buffer| */
547  if (fclose(cur_file)!=0)
548    return (io_errors |= cant_close_file);
549  cur_file=NULL;
550  if (line_no!=tot_lines+1)
551    return (io_errors |= wrong_number_of_lines);
552  if (magic!=final_magic)
553    return (io_errors |= wrong_checksum);
554  return io_errors;
555}
556
557@ There is also a less paranoid routine, |gb_raw_close|, that
558closes user-generated files. It simply closes the current file, if any,
559and returns the value of the |magic| checksum.
560
561Example: The |restore_graph| subroutine in {\sc GB\_\,SAVE} uses
562|gb_raw_open| and |gb_raw_close| to provide system-independent input
563that is almost as foolproof as the reading of standard GraphBase data.
564
565@ @d gb_raw_close gb_r_close /* for Procrustean external linkage */
566
567@(gb_io.h@>=
568#define gb_raw_close gb_r_close
569extern long gb_close(); /* close a GraphBase data file; return 0 if OK */
570extern long gb_raw_close(); /* close file and return the checksum */
571
572@ @<External f...@>=
573long gb_raw_close()
574{
575  if (cur_file) {
576    fclose(cur_file);
577    more_data=buffer[0]=0;
578    cur_pos=buffer;
579    cur_file=NULL;
580  }
581  return magic;
582}
583
584@* Index. Here is a list that shows where the identifiers of this program are
585defined and used.
586