1% This file is part of the Stanford GraphBase (c) Stanford University 1993
2@i boilerplate.w %<< legal stuff: PLEASE READ IT BEFORE MAKING ANY CHANGES!
3@i gb_types.w
4
5\def\title{GB\_\,ROGET}
6
7\prerequisites{GB\_\,GRAPH}{GB\_\,IO}
8@* Introduction. This GraphBase module contains the |roget| subroutine,
9which creates a family of graphs based on Roget's Thesaurus. An example
10of the use of this procedure can be found in the demo program
11{\sc ROGET\_\,COMPONENTS}.
12
13@(gb_roget.h@>=
14extern Graph *roget();
15
16@ The subroutine call |roget(n,min_distance,prob,seed)|
17constructs a graph based on the information in \.{roget.dat}.
18Each vertex of the graph corresponds to one of the 1022 categories in
19the 1879 edition of Peter Mark Roget's {\sl Thesaurus of English Words
20@^Roget, Peter Mark@>@^Roget, John Lewis@>
21and Phrases}, edited by John Lewis Roget.
22An arc goes from one category to another if Roget gave a
23reference to the latter among the words and phrases of the former,
24or if the two categories were directly related to each other by their
25positions in Roget's book. For example, the vertex for category 312
26(`ascent') has arcs to the vertices for categories 224 (`obliquity'),
27313 (`descent'), and 316 (`leap'), because Roget gave explicit
28cross-references from 312 to 224 and~316, and because category 312
29was implicitly paired with 313 in his scheme.
30
31The constructed graph will have $\min(n,1022)$ vertices; however, the
32default value |n=1022| is substituted when |n=0|. If |n| is less
33than 1022, the |n| categories will be selected at random,
34and all arcs to unselected categories will be omitted.
35Arcs will also be omitted if they correspond to categories whose
36numbers differ by less than |min_distance|. For example, if
37|min_distance>1|, the arc between categories 312 and~313 will not
38be included. (Roget sometimes formed clusters of three interrelated
39categories; to avoid cross-references within all such clusters, you can set
40|min_distance=3|.)
41
42If |prob>0|, arcs that would ordinarily be included in the graph are
43rejected with probability |prob/65536|. This provides a way
44to obtain sparser graphs.
45
46The vertices will appear in random order. However, all ``randomness''
47in GraphBase graphs is reproducible; it depends only on the value of
48a given |seed|, which can be any nonnegative integer less than~$2^{31}$.
49For example, everyone who asks for |roget(1000,3,32768,50)| will
50obtain exactly the same graph, regardless of their computer system.
51
52Changing the value of |prob| will affect only the arcs of the
53generated graph; it will change neither the choice of vertices
54nor the vertex order.
55
56@d MAX_N 1022 /* the number of categories in Roget's book */
57
58@ If the |roget| routine encounters a problem, it returns |NULL|
59(\.{NULL}), after putting a code number into the external variable
60|panic_code|. This code number identifies the type of failure.
61Otherwise |roget| returns a pointer to the newly created graph, which
62will be represented with the data structures explained in {\sc GB\_\,GRAPH}.
63(The external variable |panic_code| is itself defined in {\sc GB\_\,GRAPH}.)
64
65@d panic(c) @+{@+panic_code=c;@+gb_trouble_code=0;@+return NULL;@+}
66
67@ The \CEE/ file \.{gb\_roget.c} has the following general shape:
68
69@p
70#include "gb_io.h" /* we will use the {\sc GB\_\,IO} routines for input */
71#include "gb_flip.h"
72 /* we will use the {\sc GB\_\,FLIP} routines for random numbers */
73#include "gb_graph.h"
74 /* and we will use the {\sc GB\_\,GRAPH} data structures */
75@h@#
76@<Private variables@>@;
77@#
78Graph *roget(n,min_distance,prob,seed)
79  unsigned long n; /* number of vertices desired */
80  unsigned long min_distance; /* smallest inter-category distance allowed
81                            in an arc */
82  unsigned long prob; /* 65536 times the probability of rejecting an arc */
83  long seed; /* random number seed */
84{@+@<Local variables@>@;@#
85  gb_init_rand(seed);
86  if (n==0 || n>MAX_N) n=MAX_N;
87  @<Set up a graph with |n| vertices@>;
88  @<Determine the |n| categories to use in the graph@>;
89  @<Input \.{roget.dat} and build the graph@>;
90  if (gb_trouble_code) {
91    gb_recycle(new_graph);
92    panic(alloc_fault); /* oops, we ran out of memory somewhere back there */
93  }
94  return new_graph;
95}
96
97@ @<Local var...@>=
98Graph *new_graph; /* the graph constructed by |roget| */
99
100@* Vertices.
101
102@<Set up a graph with |n| vertices@>=
103new_graph=gb_new_graph(n);
104if (new_graph==NULL)
105  panic(no_room); /* out of memory before we're even started */
106sprintf(new_graph->id,"roget(%lu,%lu,%lu,%ld)",n,min_distance,prob,seed);
107strcpy(new_graph->util_types,"IZZZZZZZZZZZZZ");
108
109@ The first nontrivial thing we need to do is find a random selection and
110permutation of |n| vertices. We will compute a |mapping| table such that
111|mapping[k]| is non-|NULL| for exactly |n| randomly selected
112category numbers~|k|.
113Moreover, these non-|NULL| values will be a random permutation of the
114vertices of the graph.
115
116@<Priv...@>=
117static Vertex *mapping[MAX_N+1];
118 /* the vertex corresponding to a given category */
119static long cats[MAX_N];
120 /* table of category numbers that have not yet been used */
121
122@ During the loop on |v| in this step, |k| is the number of categories
123whose |mapping| value is still~|NULL|.
124The first |k| entries of |cats| will contain
125those category numbers in some order.
126
127@<Determine the |n| categories to use in the graph@>=
128for (k=0; k<MAX_N; k++)
129  cats[k]=k+1,@,mapping[k+1]=NULL;
130for (v=new_graph->vertices+n-1; v>=new_graph->vertices; v--) {
131  j=gb_unif_rand(k);
132  mapping[cats[j]]=v; cats[j]=cats[--k];
133}
134
135@ @<Local...@>=
136register long j,k; /* all-purpose indices */
137register Vertex *v; /* current vertex */
138
139@* Arcs. The data in \.{roget.dat} appears in 1022 lines, one for each
140category. For example, the line
141$$\hbox{\tt 312ascent:224 313 316}$$
142specifies the arcs from category 312 as explained earlier. First comes the
143category number, then the category name, then a colon, then zero or more
144numbers specifying arcs to other categories; the numbers are
145separated by spaces.
146
147Some categories have too many arcs to fit on a single line; the data
148for these categories can be found on two lines, the first line ending
149with a backslash and the second line beginning with a space.
150
151@<Input \.{roget.dat} and build the graph@>=
152if (gb_open("roget.dat")!=0)
153  panic(early_data_fault);
154    /* couldn't open |"roget.dat"| using GraphBase conventions */
155for (k=1; !gb_eof(); k++)
156  @<Read the data for category |k|, and put it in the graph if it
157    has been selected@>;
158if (gb_close()!=0)
159  panic(late_data_fault);
160    /* something's wrong with |"roget.dat"|; see |io_errors| */
161if (k!=MAX_N+1) panic(impossible);
162  /* we don't have the right value of |MAX_N| */
163
164@ We check that the data isn't garbled, except that we don't
165bother to look at unselected categories.
166
167The original category number is stored in vertex utility field |cat_no|,
168in case anybody wants to see it.
169
170@d cat_no u.I /* utility field |u| of each vertex holds the category number */
171
172@<Read the data for category |k|, and put it in the graph if it
173   has been selected@>=
174{
175  if (mapping[k]) { /* yes, this category has been selected */
176    if (gb_number(10)!=k) panic(syntax_error); /* out of synch */
177    (void)gb_string(str_buf,':');
178    if (gb_char()!=':') panic(syntax_error+1); /* no colon found */
179    v=mapping[k];
180    v->name=gb_save_string(str_buf);
181    v->cat_no=k;
182    @<Add arcs from |v| for every category that's both listed on the line
183          and selected@>;
184  }@+else @<Skip past the data for one category@>;
185}
186
187@ @(gb_roget.h@>=
188#define cat_no @t\quad@> u.I
189 /* definition of |cat_no| is repeated in the header file */
190
191@ @d iabs(x) ((x)<0? -(x): (x))
192
193@<Add arcs from |v| for every...@>=
194j=gb_number(10);
195if (j==0) goto done; /* some categories lead to no arcs at all */
196while (1) {
197  if (j>MAX_N) panic(syntax_error+2); /* category code out of range */
198  if (mapping[j] && iabs(j-k)>=min_distance &&
199       (prob==0 || ((gb_next_rand()>>15)>=prob)))
200    gb_new_arc(v,mapping[j],1L);
201  switch (gb_char()) {
202  case '\\': gb_newline();
203    if (gb_char()!=' ')
204      panic(syntax_error+3); /* space should begin a continuation line */
205    /* fall through to the space case */
206  case ' ': j=gb_number(10);@+break;
207  case '\n': goto done;
208  default: panic(syntax_error+4);
209    /* illegal character following category number */
210  }
211}
212done: gb_newline();
213
214@ We want to call |gb_newline()| twice if the current line ends with a
215backslash; otherwise we want to call it just once. There's an obvious
216way to do that, and there's also a faster and trickier way.  The
217author apologizes here for succumbing to some old-fashioned impulses.
218(Recall that |gb_string| returns the location just following the
219|'\0'| it places at the end of a scanned string.)
220
221@<Skip past the data for one category@>=
222{
223  if (*(gb_string(str_buf,'\n')-2)=='\\')
224    gb_newline(); /* the first line ended with backslash */
225  gb_newline();
226}
227
228@* Index. Here is a list that shows where the identifiers of this program are
229defined and used.
230