1% This file is part of the Stanford GraphBase (c) Stanford University 1993 2@i boilerplate.w %<< legal stuff: PLEASE READ IT BEFORE MAKING ANY CHANGES! 3@i gb_types.w 4 5\def\title{GB\_\,ROGET} 6 7\prerequisites{GB\_\,GRAPH}{GB\_\,IO} 8@* Introduction. This GraphBase module contains the |roget| subroutine, 9which creates a family of graphs based on Roget's Thesaurus. An example 10of the use of this procedure can be found in the demo program 11{\sc ROGET\_\,COMPONENTS}. 12 13@(gb_roget.h@>= 14extern Graph *roget(); 15 16@ The subroutine call |roget(n,min_distance,prob,seed)| 17constructs a graph based on the information in \.{roget.dat}. 18Each vertex of the graph corresponds to one of the 1022 categories in 19the 1879 edition of Peter Mark Roget's {\sl Thesaurus of English Words 20@^Roget, Peter Mark@>@^Roget, John Lewis@> 21and Phrases}, edited by John Lewis Roget. 22An arc goes from one category to another if Roget gave a 23reference to the latter among the words and phrases of the former, 24or if the two categories were directly related to each other by their 25positions in Roget's book. For example, the vertex for category 312 26(`ascent') has arcs to the vertices for categories 224 (`obliquity'), 27313 (`descent'), and 316 (`leap'), because Roget gave explicit 28cross-references from 312 to 224 and~316, and because category 312 29was implicitly paired with 313 in his scheme. 30 31The constructed graph will have $\min(n,1022)$ vertices; however, the 32default value |n=1022| is substituted when |n=0|. If |n| is less 33than 1022, the |n| categories will be selected at random, 34and all arcs to unselected categories will be omitted. 35Arcs will also be omitted if they correspond to categories whose 36numbers differ by less than |min_distance|. For example, if 37|min_distance>1|, the arc between categories 312 and~313 will not 38be included. (Roget sometimes formed clusters of three interrelated 39categories; to avoid cross-references within all such clusters, you can set 40|min_distance=3|.) 41 42If |prob>0|, arcs that would ordinarily be included in the graph are 43rejected with probability |prob/65536|. This provides a way 44to obtain sparser graphs. 45 46The vertices will appear in random order. However, all ``randomness'' 47in GraphBase graphs is reproducible; it depends only on the value of 48a given |seed|, which can be any nonnegative integer less than~$2^{31}$. 49For example, everyone who asks for |roget(1000,3,32768,50)| will 50obtain exactly the same graph, regardless of their computer system. 51 52Changing the value of |prob| will affect only the arcs of the 53generated graph; it will change neither the choice of vertices 54nor the vertex order. 55 56@d MAX_N 1022 /* the number of categories in Roget's book */ 57 58@ If the |roget| routine encounters a problem, it returns |NULL| 59(\.{NULL}), after putting a code number into the external variable 60|panic_code|. This code number identifies the type of failure. 61Otherwise |roget| returns a pointer to the newly created graph, which 62will be represented with the data structures explained in {\sc GB\_\,GRAPH}. 63(The external variable |panic_code| is itself defined in {\sc GB\_\,GRAPH}.) 64 65@d panic(c) @+{@+panic_code=c;@+gb_trouble_code=0;@+return NULL;@+} 66 67@ The \CEE/ file \.{gb\_roget.c} has the following general shape: 68 69@p 70#include "gb_io.h" /* we will use the {\sc GB\_\,IO} routines for input */ 71#include "gb_flip.h" 72 /* we will use the {\sc GB\_\,FLIP} routines for random numbers */ 73#include "gb_graph.h" 74 /* and we will use the {\sc GB\_\,GRAPH} data structures */ 75@h@# 76@<Private variables@>@; 77@# 78Graph *roget(n,min_distance,prob,seed) 79 unsigned long n; /* number of vertices desired */ 80 unsigned long min_distance; /* smallest inter-category distance allowed 81 in an arc */ 82 unsigned long prob; /* 65536 times the probability of rejecting an arc */ 83 long seed; /* random number seed */ 84{@+@<Local variables@>@;@# 85 gb_init_rand(seed); 86 if (n==0 || n>MAX_N) n=MAX_N; 87 @<Set up a graph with |n| vertices@>; 88 @<Determine the |n| categories to use in the graph@>; 89 @<Input \.{roget.dat} and build the graph@>; 90 if (gb_trouble_code) { 91 gb_recycle(new_graph); 92 panic(alloc_fault); /* oops, we ran out of memory somewhere back there */ 93 } 94 return new_graph; 95} 96 97@ @<Local var...@>= 98Graph *new_graph; /* the graph constructed by |roget| */ 99 100@* Vertices. 101 102@<Set up a graph with |n| vertices@>= 103new_graph=gb_new_graph(n); 104if (new_graph==NULL) 105 panic(no_room); /* out of memory before we're even started */ 106sprintf(new_graph->id,"roget(%lu,%lu,%lu,%ld)",n,min_distance,prob,seed); 107strcpy(new_graph->util_types,"IZZZZZZZZZZZZZ"); 108 109@ The first nontrivial thing we need to do is find a random selection and 110permutation of |n| vertices. We will compute a |mapping| table such that 111|mapping[k]| is non-|NULL| for exactly |n| randomly selected 112category numbers~|k|. 113Moreover, these non-|NULL| values will be a random permutation of the 114vertices of the graph. 115 116@<Priv...@>= 117static Vertex *mapping[MAX_N+1]; 118 /* the vertex corresponding to a given category */ 119static long cats[MAX_N]; 120 /* table of category numbers that have not yet been used */ 121 122@ During the loop on |v| in this step, |k| is the number of categories 123whose |mapping| value is still~|NULL|. 124The first |k| entries of |cats| will contain 125those category numbers in some order. 126 127@<Determine the |n| categories to use in the graph@>= 128for (k=0; k<MAX_N; k++) 129 cats[k]=k+1,@,mapping[k+1]=NULL; 130for (v=new_graph->vertices+n-1; v>=new_graph->vertices; v--) { 131 j=gb_unif_rand(k); 132 mapping[cats[j]]=v; cats[j]=cats[--k]; 133} 134 135@ @<Local...@>= 136register long j,k; /* all-purpose indices */ 137register Vertex *v; /* current vertex */ 138 139@* Arcs. The data in \.{roget.dat} appears in 1022 lines, one for each 140category. For example, the line 141$$\hbox{\tt 312ascent:224 313 316}$$ 142specifies the arcs from category 312 as explained earlier. First comes the 143category number, then the category name, then a colon, then zero or more 144numbers specifying arcs to other categories; the numbers are 145separated by spaces. 146 147Some categories have too many arcs to fit on a single line; the data 148for these categories can be found on two lines, the first line ending 149with a backslash and the second line beginning with a space. 150 151@<Input \.{roget.dat} and build the graph@>= 152if (gb_open("roget.dat")!=0) 153 panic(early_data_fault); 154 /* couldn't open |"roget.dat"| using GraphBase conventions */ 155for (k=1; !gb_eof(); k++) 156 @<Read the data for category |k|, and put it in the graph if it 157 has been selected@>; 158if (gb_close()!=0) 159 panic(late_data_fault); 160 /* something's wrong with |"roget.dat"|; see |io_errors| */ 161if (k!=MAX_N+1) panic(impossible); 162 /* we don't have the right value of |MAX_N| */ 163 164@ We check that the data isn't garbled, except that we don't 165bother to look at unselected categories. 166 167The original category number is stored in vertex utility field |cat_no|, 168in case anybody wants to see it. 169 170@d cat_no u.I /* utility field |u| of each vertex holds the category number */ 171 172@<Read the data for category |k|, and put it in the graph if it 173 has been selected@>= 174{ 175 if (mapping[k]) { /* yes, this category has been selected */ 176 if (gb_number(10)!=k) panic(syntax_error); /* out of synch */ 177 (void)gb_string(str_buf,':'); 178 if (gb_char()!=':') panic(syntax_error+1); /* no colon found */ 179 v=mapping[k]; 180 v->name=gb_save_string(str_buf); 181 v->cat_no=k; 182 @<Add arcs from |v| for every category that's both listed on the line 183 and selected@>; 184 }@+else @<Skip past the data for one category@>; 185} 186 187@ @(gb_roget.h@>= 188#define cat_no @t\quad@> u.I 189 /* definition of |cat_no| is repeated in the header file */ 190 191@ @d iabs(x) ((x)<0? -(x): (x)) 192 193@<Add arcs from |v| for every...@>= 194j=gb_number(10); 195if (j==0) goto done; /* some categories lead to no arcs at all */ 196while (1) { 197 if (j>MAX_N) panic(syntax_error+2); /* category code out of range */ 198 if (mapping[j] && iabs(j-k)>=min_distance && 199 (prob==0 || ((gb_next_rand()>>15)>=prob))) 200 gb_new_arc(v,mapping[j],1L); 201 switch (gb_char()) { 202 case '\\': gb_newline(); 203 if (gb_char()!=' ') 204 panic(syntax_error+3); /* space should begin a continuation line */ 205 /* fall through to the space case */ 206 case ' ': j=gb_number(10);@+break; 207 case '\n': goto done; 208 default: panic(syntax_error+4); 209 /* illegal character following category number */ 210 } 211} 212done: gb_newline(); 213 214@ We want to call |gb_newline()| twice if the current line ends with a 215backslash; otherwise we want to call it just once. There's an obvious 216way to do that, and there's also a faster and trickier way. The 217author apologizes here for succumbing to some old-fashioned impulses. 218(Recall that |gb_string| returns the location just following the 219|'\0'| it places at the end of a scanned string.) 220 221@<Skip past the data for one category@>= 222{ 223 if (*(gb_string(str_buf,'\n')-2)=='\\') 224 gb_newline(); /* the first line ended with backslash */ 225 gb_newline(); 226} 227 228@* Index. Here is a list that shows where the identifiers of this program are 229defined and used. 230