1% This file is part of the Stanford GraphBase (c) Stanford University 1993 2@i boilerplate.w %<< legal stuff: PLEASE READ IT BEFORE MAKING ANY CHANGES! 3 4\def\title{GB\_\,IO} 5 6@* Introduction. This is {\sc GB\_\,IO}, the input/output module used 7by all GraphBase routines to access data~files. It doesn't actually do 8any output; but somehow `input/output' sounds like a more useful title 9than just `input'. 10 11All files of GraphBase data are designed to produce identical results on 12almost all existing computers and operating systems. Each line of each file 13contains at most 79 characters. Each character is either a blank or a 14digit or an uppercase letter or a lowercase letter or a standard punctuation 15mark. Blank characters at the end of each line are ``invisible''; that is, 16they have no perceivable effect. Hence identical results will be obtained on 17record-oriented systems that pad every line with blanks. 18 19The data is carefully sum-checked so that defective input files have little 20chance of being accepted. 21 22@ Changes might be needed when these routines are ported to different 23systems. Sections of the program that are most likely to require such changes 24are listed under `system dependencies' in the index. 25 26A validation program is provided so that installers can tell if {\sc GB\_\,IO} 27is working properly. To make the test, simply run \.{test\_io}. 28 29@(test_io.c@>= 30#include "gb_io.h" 31 /* all users of {\sc GB\_\,IO} should include this header file */ 32#define exit_test(m) /* we invoke this macro if something goes wrong */\ 33 {@+fprintf(stderr,"%s!\n(Error code = %ld)\n",m,io_errors);@+return -1;@+} 34@t\2@>@/ 35int main() 36{ 37 @<Test the |gb_open| routine; exit if there's trouble@>; 38 @<Test the sample data lines; exit if there's trouble@>; 39 @<Test the |gb_close| routine; exit if there's trouble@>; 40 printf("OK, the gb_io routines seem to work!\n"); 41 return 0; 42} 43 44@ The external variable |io_errors| mentioned in the previous section 45will be set nonzero if any anomalies are detected. Errors won't occur 46in normal use of GraphBase programs, so no attempt has been made to 47provide a user-friendly way to decode the nonzero values that 48|io_errors| might assume. Information is simply gathered in binary 49form; system wizards who might need to do a bit of troubleshooting 50should be able to decode |io_errors| without great pain. 51 52@d cant_open_file 0x1 /* bit set in |io_errors| if |fopen| fails */ 53@d cant_close_file 0x2 /* bit set if |fclose| fails */ 54@d bad_first_line 0x4 /* bit set if the data file's first line isn't legit */ 55@d bad_second_line 0x8 /* bit set if the second line doesn't pass muster */ 56@d bad_third_line 0x10 /* bit set if the third line is awry */ 57@d bad_fourth_line 0x20 /* guess when this bit is set */ 58@d file_ended_prematurely 0x40 /* bit set if |fgets| fails */ 59@d missing_newline 0x80 /* bit set if line is too long or |'\n'| is missing */ 60@d wrong_number_of_lines 0x100 /* bit set if the line count is wrong */ 61@d wrong_checksum 0x200 /* bit set if the checksum is wrong */ 62@d no_file_open 0x400 /* bit set if user tries to close an unopened file */ 63@d bad_last_line 0x800 /* bit set if final line has incorrect form */ 64 65@ The \CEE/ code for {\sc GB\_\,IO} doesn't have a main routine; it's just a 66bunch of subroutines to be incorporated into programs at a higher level 67via the system loading routine. Here is the general outline of \.{gb\_io.c}: 68 69@p 70@<Header files to include@>@; 71@h 72@<External declarations@>@; 73@<Private declarations@>@; 74@<Internal functions@>@; 75@<External functions@> 76 77@ Every external variable is declared twice in this \.{CWEB} file: 78once for {\sc GB\_\,IO} itself (the ``real'' declaration for storage 79allocation purposes) and once in \.{gb\_io.h} (for cross-references 80by {\sc GB\_\,IO} users). 81 82@<External declarations@>= 83long io_errors; /* record of anomalies noted by {\sc GB\_\,IO} routines */ 84 85@ @(gb_io.h@>= 86@<Header...@>@; 87extern long io_errors; 88 /* record of anomalies noted by {\sc GB\_\,IO} routines */ 89 90@ We will stick to standard \CEE/-type input conventions. We'll also have 91occasion to use some of the standard string operations. 92 93@<Header...@>= 94#include <stdio.h> 95#ifdef SYSV 96#include <string.h> 97#else 98#include <strings.h> 99#endif 100 101@* Inputting a line. The {\sc GB\_\,IO} routines get their input from 102an array called |buffer|. This array is internal to {\sc 103GB\_\,IO}---its contents are hidden from user programs. We make it 81 104characters long, since the data is supposed to have at most 79 105characters per line, followed by newline and null. 106 107@<Private...@>= 108static char buffer[81]; /* the current line of input */ 109static char *cur_pos=buffer; /* the current character of interest */ 110static FILE *cur_file; /* current file, or |NULL| if none is open */ 111 112@ Here's a basic subroutine to fill the |buffer|. The main feature of interest 113is the removal of trailing blanks. We assume that |cur_file| is open. 114 115Notice that a line of 79 characters (followed by |'\n'|) will just fit into 116the buffer, and will cause no errors. A line of 80 characters will 117be split into two lines and the |missing_newline| 118message will occur, because of the way |fgets| is defined. A |missing_newline| 119error will also occur if the file ends in the middle of a line, or if 120a null character (|'\0'|) occurs within a line. 121 122@<Internal...@>= 123static void fill_buf() 124{@+register char *p; 125 if (!fgets(buffer,sizeof(buffer),cur_file)) { 126 io_errors |= file_ended_prematurely; buffer[0]=more_data=0; 127 } 128 for (p=buffer; *p; p++) ; /* advance to first null character */ 129 if (p--==buffer || *p!='\n') { 130 io_errors |= missing_newline; p++; 131 } 132 while (--p>=buffer && *p==' ') ; /* move back over trailing blanks */ 133 *++p='\n'; *++p=0; /* newline and null are always present at end of line */ 134 cur_pos=buffer; /* get ready to read |buffer[0]| */ 135} 136 137@* Checksums. Each data file has a ``magic number,'' which is defined to be 138$$\biggl(\sum_l 2^l c_l\biggr) \bmod p\,.$$ 139Here $p$ is a large prime number, and $c_l$ denotes the internal code 140corresponding to the $l$th-from-last 141data character read (including newlines but not nulls). 142 143The ``internal codes'' $c_l$ are computed in a system-independent way: 144Each character |c| in the actual encoding scheme being used has a 145corresponding |icode|, which is the same on all systems. For example, 146the |icode| of |'0'| is zero, regardless of whether |'0'| is actually 147represented in ASCII or EBCDIC or some other scheme. (We assume that 148every modern computer system is capable of printing at least 95 149different characters, including a blank space.) 150 151We will accept a data file as error-free if it has the correct number of 152lines and ends with the proper magic number. 153 154@<Private...@>= 155static char icode[256]; /* mapping of characters to internal codes */ 156static long checksum_prime=(1L<<30)-83; 157 /* large prime such that $2p+|unexpected_char|$ won't overflow */ 158static long magic; /* current checksum value */ 159static long line_no; /* current line number in file */ 160static long final_magic; /* desired final magic number */ 161static long tot_lines; /* total number of data lines */ 162static char more_data; /* is there data still waiting to be read? */ 163 164@ The |icode| mapping is defined by a single string, |imap|, such that 165character |imap[k]| has |icode| value~|k|. There are 96 characters 166in |imap|, namely the 94 standard visible ASCII codes plus space 167and newline. If EBCDIC code is used instead of ASCII, the 168cents sign \rlap{\.{\kern.05em/}}\.c should take the place of single-left-quote 169\.{\char`\`}, and \.{\char5}~should take the place of\/~\.{\char`\~}. 170 171All characters that don't appear in |imap| are given the same |icode| 172value, called |unexpected_char|. Such characters should be avoided in 173GraphBase files whenever possible. (If they do appear, they can still 174get into a user's data, but we don't distinguish them from each other 175for checksumming purposes.) 176 177The |icode| table actually plays a dual role, because we've rigged it so that 178codes 0--15 come from the characters |"0123456789ABCDEF"|. This facilitates 179conversion of decimal and hexadecimal data. We can also use it for 180radices higher than 16. 181 182@d unexpected_char 127 /* default |icode| value */ 183 184@<Private...@>= 185static char *imap="0123456789ABCDEFGHIJKLMNOPQRSTUVWXYZ\ 186abcdefghijklmnopqrstuvwxyz_^~&@@,;.:?!%#$+-*/|\\<=>()[]{}`'\" \n"; 187 188@ Users of {\sc GB\_\,IO} can look at the |imap|, but they can't change it. 189 190@<External fun...@>= 191char imap_chr(d) 192 long d; 193{ 194 return d<0 || d>strlen(imap)? '\0': imap[d]; 195} 196@# 197long imap_ord(c) 198 char c; 199{ 200 @<Make sure that |icode| has been initialized@>; 201 return (c<0||c>255)? unexpected_char: icode[c]; 202} 203 204@ @(gb_io.h@>= 205#define unexpected_char @t\quad@> 127 206extern char imap_chr(); /* the character that maps to a given character */ 207extern long imap_ord(); /* the ordinal number of a given character */ 208 209@ @<Make sure that |icode| has been initialized@>= 210if (!icode['1']) icode_setup(); 211 212@ @<Internal...@>= 213static void icode_setup() 214{@+register long k; 215 register char *p; 216 for (k=0;k<256;k++) icode[k]=unexpected_char; 217 for (p=imap,k=0; *p; p++,k++) icode[*p]=k; 218} 219 220@ Now we're ready to specify some external subroutines that do 221input. Calling |gb_newline()| will read the next line of 222data into |buffer| and update the magic number accordingly. 223 224@(gb_io.h@>= 225extern void gb_newline(); /* advance to next line of the data file */ 226extern long new_checksum(); /* compute change in magic number */ 227 228@ Users can compute checksums as |gb_newline| does, but they can't 229change the (private) value of |magic|. 230 231@<External f...@>= 232long new_checksum(s,old_checksum) 233 char *s; /* a string */ 234 long old_checksum; 235{@+register long a=old_checksum; 236 register char*p; 237 for (p=s; *p; p++) 238 a=(a+a+imap_ord(*p)) % checksum_prime; 239 return a; 240} 241 242@ The magic checksum is not affected by lines that begin with \.*. 243 244@<External f...@>= 245void gb_newline() 246{ 247 if (++line_no>tot_lines) more_data=0; 248 if (more_data) { 249 fill_buf(); 250 if (buffer[0]!='*') 251 magic=new_checksum(buffer,magic); 252 } 253} 254 255@ Another simple routine allows a user to read (but not write) the 256variable |more_data|. 257 258@(gb_io.h@>= 259extern long gb_eof(); /* has the data all been read? */ 260 261@ @<External f...@>= 262long gb_eof() { return !more_data; } 263 264@* Parsing a line. The user can input characters from the buffer in several 265ways. First, there's a basic |gb_char()| routine, which returns 266a single character. The character is |'\n'| if the last character on the 267line has already been read (and it continues to be |'\n'| until the user calls 268|gb_newline|). 269 270The current position in the line, |cur_pos|, always advances when |gb_char| 271is called, unless |cur_pos| was already at the end of the line. 272There's also a |gb_backup()| routine, which moves |cur_pos| one place 273to the left unless it was already at the beginning. 274 275@(gb_io.h@>= 276extern char gb_char(); /* get next character of current line, or |'\n'| */ 277extern void gb_backup(); /* move back ready to scan a character again */ 278 279@ @<External f...@>= 280char gb_char() 281{ 282 if (*cur_pos) return (*cur_pos++); 283 return '\n'; 284} 285@# 286void gb_backup() 287{ 288 if (cur_pos>buffer) 289 cur_pos--; 290} 291 292@ There are two ways to read numerical data. The first, |gb_digit(d)|, 293expects to read a single character in radix~|d|, using |icode| values 294to specify digits greater than~9. (Thus, for example, |'A'| represents 295the hexadecimal digit for decimal~10.) 296If the next character is a valid |d|-git, 297|cur_pos| moves to the next character and the numerical value is returned. 298Otherwise |cur_pos| stays in the same place and $-1$ is returned. 299 300The second routine, |gb_number(d)|, reads characters and forms an 301unsigned radix-|d| number until the first non-digit is encountered. 302The resulting number is returned; it is zero if no digits were found. 303No errors are possible with this routine, because it uses 304|unsigned long| arithmetic. 305 306@(gb_io.h@>= 307extern long gb_digit(); /* |gb_digit(d)| reads a digit between 0 and |d-1| */ 308extern unsigned long gb_number(); /* |gb_number(d)| reads a radix-|d| number */ 309 310@ The value of |d| should be at most 127, if users want their programs to be 311portable, because \CEE/ does not treat larger |char| values in a 312well-defined manner. In most applications, |d| is of course either 10 or 16. 313 314@<External f...@>= 315long gb_digit(d) 316 char d; 317{ 318 icode[0]=d; /* make sure |'\0'| is a nondigit */ 319 if (imap_ord(*cur_pos)<d) return icode[*cur_pos++]; 320 return -1; 321} 322@# 323unsigned long gb_number(d) 324 char d; 325{@+register unsigned long a=0; 326 icode[0]=d; /* make sure |'\0'| is a nondigit */ 327 while (imap_ord(*cur_pos)<d) 328 a=a*d+icode[*cur_pos++]; 329 return a; 330} 331 332@ The final subroutine for fetching data is |gb_string(p,c)|, which 333stores a null-terminated string into locations starting at~|p|. 334The string starts at |cur_pos| and ends just before the first appearance 335of character |c|. If |c=='\n'|, the string will stop at the end of the line. 336If |c| doesn't appear in the buffer at or after |cur_pos|, the last character 337of the string will be the |'\n'| that is always inserted at the end 338of a line, unless the entire line has already been read. (If the entire 339line has previously been read, the empty string is always returned.) 340After the string has been copied, |cur_pos| advances past it. 341 342In order to use this routine safely, the user should first check that 343there is room to store up to 81 characters beginning at location~|p|. 344A suitable place to put the result, called |str_buf|, is provided 345for the user's convenience. 346 347The location following the stored string is returned. Thus, if the 348stored string has length~|l| (not counting the null character that is 349stored at the end), the value returned will be |p+l+1|. 350 351@(gb_io.h@>= 352#define STR_BUF_LENGTH 160 353extern char str_buf[]; /* safe place to receive output of |gb_string| */ 354extern char *gb_string(); /* |gb_string(p,c)| reads a string delimited by |c| 355 into bytes starting at |p| */ 356 357@ @d STR_BUF_LENGTH 160 358 359@<External f...@>= 360char str_buf[STR_BUF_LENGTH]; /* users can put strings here if they wish */ 361char *gb_string(p,c) 362 char *p; /* where to put the result */ 363 char c; /* character following the string */ 364{ 365 while (*cur_pos && *cur_pos!=c) 366 *p++=*cur_pos++; 367 *p++=0; 368 return p; 369} 370 371@ Here's how we test those routines in \.{test\_io}: The first line of test 372data consists of 79 characters, beginning with 64 zeroes and ending with 373`\.{123456789ABCDEF}'. The second line is completely blank. The third 374and final line says `\.{Oops:(intentional mistake)}'. 375 376@<Test the sample data lines...@>= 377if (gb_number(10)!=123456789) 378 io_errors |= 1L<<20; /* decimal number not working */ 379if (gb_digit(16)!=10) 380 io_errors |= 1L<<21; /* we missed the \.A following the decimal number */ 381gb_backup();@+ gb_backup(); /* get set to read `\.{9A}' again */ 382if (gb_number(16)!=0x9ABCDEF) 383 io_errors |= 1L<<22; /* hexadecimal number not working */ 384gb_newline(); /* now we should be scanning a blank line */ 385if (gb_char()!='\n') 386 io_errors |= 1L<<23; /* newline not inserted at end */ 387if (gb_char()!='\n') 388 io_errors |= 1L<<24; /* newline not implied after end */ 389if (gb_number(60)!=0) 390 io_errors |= 1L<<25; /* number should stop at null character */ 391{@+char temp[100]; 392 if (gb_string(temp,'\n')!=temp+1) 393 io_errors |= 1L<<26; /* string should be null after end of line */ 394 gb_newline(); 395 if (gb_string(temp,':')!=temp+5 || strcmp(temp,"Oops")) 396 io_errors |= 1L<<27; /* string not read properly */ 397} 398if (io_errors) 399 exit_test("Sorry, it failed. Look at the error code for clues"); 400if (gb_digit(10)!=-1) exit_test("Digit error not detected"); 401if (gb_char()!=':') 402 io_errors |= 1L<<28; /* lost synch after |gb_string| and |gb_digit| */ 403if (gb_eof()) 404 io_errors |= 1L<<29; /* premature end-of-file indication */ 405gb_newline(); 406if (!gb_eof()) 407 io_errors |= 1L<<30; /* postmature end-of-file indication */ 408 409@* Opening a file. The call |gb_raw_open("foo")| will open file |"foo"| and 410initialize the checksumming process. If the file cannot be opened, 411|io_errors| will be set to |cant_open_file|, otherwise 412|io_errors| will be initialized to zero. 413 414The call |gb_open("foo")| is a stronger version of |gb_raw_open|, which 415is used for standard GraphBase data files like |"words.dat"| to make 416doubly sure that they have not been corrupted. It returns the current value 417of |io_errors|, which will be nonzero if any problems were detected 418at the beginning of the file. 419 420@<Test the |gb_open| routine...@>= 421if (gb_open("test.dat")!=0) 422 exit_test("Can't open test.dat"); 423 424@ @d gb_raw_open gb_r_open /* abbreviation for Procrustean external linkage */ 425 426@(gb_io.h@>= 427#define gb_raw_open gb_r_open 428extern void gb_raw_open(); /* open a file for GraphBase input */ 429extern long gb_open(); /* open a GraphBase data file; return 0 if OK */ 430 431@ @<External f...@>= 432void gb_raw_open(f) 433 char *f; 434{ 435 @<Make sure that |icode|...@>; 436 @<Try to open |f|@>; 437 if (cur_file) { 438 io_errors=0; 439 more_data=1; 440 line_no=magic=0; 441 tot_lines=0x7fffffff; /* allow ``infinitely many'' lines */ 442 fill_buf(); 443 }@+else io_errors=cant_open_file; 444} 445 446@ Here's a possibly system-dependent part of the code: We try first to 447open the data file by using the file name itself as the path name; 448failing that, we try to prefix the file name with the name of the 449standard directory for GraphBase data, if the program has been compiled 450with |DATA_DIRECTORY| defined. 451@^system dependencies@> 452 453@<Try to open |f|@>= 454cur_file=fopen(f,"r"); 455@^system dependencies@> 456#ifdef DATA_DIRECTORY 457if (!cur_file && (strlen(DATA_DIRECTORY)+strlen(f)<STR_BUF_LENGTH)) { 458 sprintf(str_buf,"%s%s",DATA_DIRECTORY,f); 459 cur_file=fopen(str_buf,"r"); 460} 461#endif 462 463@ @<External f...@>= 464long gb_open(f) 465 char *f; 466{ 467 strncpy(file_name,f,sizeof(file_name)-1); 468 /* save the name for use by |gb_close| */ 469 gb_raw_open(f); 470 if (cur_file) { 471 @<Check the first line; return if unsuccessful@>; 472 @<Check the second line; return if unsuccessful@>; 473 @<Check the third line; return if unsuccessful@>; 474 @<Check the fourth line; return if unsuccessful@>; 475 gb_newline(); /* the first line of real data is now in the buffer */ 476 } 477 return io_errors; 478} 479 480@ @<Private...@>= 481static char file_name[20]; /* name of the data file, without a prefix */ 482 483@ The first four lines of a typical data file should look something like this: 484$$\halign{\hskip5em\.{#}\hfill\cr 485 * File "words.dat" from the Stanford GraphBase (C) 1993 Stanford University\cr 486 * A database of English five-letter words\cr 487 * This file may be freely copied but please do not change it in any way!\cr 488 * (Checksum parameters 5757,526296596)\cr}$$ 489We actually verify only that the first four lines of a data file named |"foo"| 490begin respectively with the characters 491$$\halign{\hskip5em\.{#}\hfill\cr 492 * File "foo"\cr 493 *\cr 494 *\cr 495 * (Checksum parameters $l$,$m$)\cr}$$ 496where $l$ and $m$ are decimal numbers. The values of $l$ and~$m$ 497are stored away as |tot_lines| and |final_magic|, to be matched at the 498end of the file. 499 500@<Check the first line...@>= 501sprintf(str_buf,"* File \"%s\"",f); 502if (strncmp(buffer,str_buf,strlen(str_buf))) 503 return (io_errors |= bad_first_line); 504 505@ @<Check the second line...@>= 506fill_buf(); 507if (*buffer!='*') return (io_errors |= bad_second_line); 508 509@ @<Check the third line...@>= 510fill_buf(); 511if (*buffer!='*') return (io_errors |= bad_third_line); 512 513@ @<Check the fourth line; return if unsuccessful@>= 514fill_buf(); 515if (strncmp(buffer,"* (Checksum parameters ",23)) 516 return (io_errors |= bad_fourth_line); 517cur_pos +=23; 518tot_lines=gb_number(10); 519if (gb_char()!=',') 520 return (io_errors |= bad_fourth_line); 521final_magic=gb_number(10); 522if (gb_char()!=')') 523 return (io_errors |= bad_fourth_line); 524 525@* Closing a file. After all data has been input, or should have been input, 526we check that the file was open and that it had the correct number of 527lines, the correct magic number, and a correct final line. The 528subroutine |gb_close|, like |gb_open|, returns the value of 529|io_errors|, which will be nonzero if at least one problem was noticed. 530 531@<Test the |gb_close| routine; exit if there's trouble@>= 532if (gb_close()!=0) 533 exit_test("Bad checksum, or difficulty closing the file"); 534 535@ @<External f...@>= 536long gb_close() 537{ 538 if (!cur_file) 539 return (io_errors |= no_file_open); 540 fill_buf(); 541 sprintf(str_buf,"* End of file \"%s\"",file_name); 542 if (strncmp(buffer,str_buf,strlen(str_buf))) 543 io_errors |= bad_last_line; 544 more_data=buffer[0]=0; 545 /* now the {\sc GB\_\,IO} routines are effectively shut down */ 546 /* we have |cur_pos=buffer| */ 547 if (fclose(cur_file)!=0) 548 return (io_errors |= cant_close_file); 549 cur_file=NULL; 550 if (line_no!=tot_lines+1) 551 return (io_errors |= wrong_number_of_lines); 552 if (magic!=final_magic) 553 return (io_errors |= wrong_checksum); 554 return io_errors; 555} 556 557@ There is also a less paranoid routine, |gb_raw_close|, that 558closes user-generated files. It simply closes the current file, if any, 559and returns the value of the |magic| checksum. 560 561Example: The |restore_graph| subroutine in {\sc GB\_\,SAVE} uses 562|gb_raw_open| and |gb_raw_close| to provide system-independent input 563that is almost as foolproof as the reading of standard GraphBase data. 564 565@ @d gb_raw_close gb_r_close /* for Procrustean external linkage */ 566 567@(gb_io.h@>= 568#define gb_raw_close gb_r_close 569extern long gb_close(); /* close a GraphBase data file; return 0 if OK */ 570extern long gb_raw_close(); /* close file and return the checksum */ 571 572@ @<External f...@>= 573long gb_raw_close() 574{ 575 if (cur_file) { 576 fclose(cur_file); 577 more_data=buffer[0]=0; 578 cur_pos=buffer; 579 cur_file=NULL; 580 } 581 return magic; 582} 583 584@* Index. Here is a list that shows where the identifiers of this program are 585defined and used. 586