1;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
2;;;                                                                       ;;
3;;;                Centre for Speech Technology Research                  ;;
4;;;                     University of Edinburgh, UK                       ;;
5;;;                       Copyright (c) 1996,1997                         ;;
6;;;                        All Rights Reserved.                           ;;
7;;;                                                                       ;;
8;;;  Permission is hereby granted, free of charge, to use and distribute  ;;
9;;;  this software and its documentation without restriction, including   ;;
10;;;  without limitation the rights to use, copy, modify, merge, publish,  ;;
11;;;  distribute, sublicense, and/or sell copies of this work, and to      ;;
12;;;  permit persons to whom this work is furnished to do so, subject to   ;;
13;;;  the following conditions:                                            ;;
14;;;   1. The code must retain the above copyright notice, this list of    ;;
15;;;      conditions and the following disclaimer.                         ;;
16;;;   2. Any modifications must be clearly marked as such.                ;;
17;;;   3. Original authors' names are not deleted.                         ;;
18;;;   4. The authors' names are not used to endorse or promote products   ;;
19;;;      derived from this software without specific prior written        ;;
20;;;      permission.                                                      ;;
21;;;                                                                       ;;
22;;;  THE UNIVERSITY OF EDINBURGH AND THE CONTRIBUTORS TO THIS WORK        ;;
23;;;  DISCLAIM ALL WARRANTIES WITH REGARD TO THIS SOFTWARE, INCLUDING      ;;
24;;;  ALL IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS, IN NO EVENT   ;;
25;;;  SHALL THE UNIVERSITY OF EDINBURGH NOR THE CONTRIBUTORS BE LIABLE     ;;
26;;;  FOR ANY SPECIAL, INDIRECT OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES    ;;
27;;;  WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, WHETHER IN   ;;
28;;;  AN ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION,          ;;
29;;;  ARISING OUT OF OR IN CONNECTION WITH THE USE OR PERFORMANCE OF       ;;
30;;;  THIS SOFTWARE.                                                       ;;
31;;;                                                                       ;;
32;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
33;;;
34;;;  A search for token occurrences in buckets of text
35;;;
36;;;  This is only an example to aid you, this actually depends on
37;;;  the availability of databases we don't have permission to
38;;;  distribute.
39
40(set! text_dir "/home/awb/data/text/")
41
42;;;  The databases themselves are identified by a file which names all
43;;;  the files in that databases.  e.g. This expects bin/gutenberg.files
44;;;  to exists which should contain something like
45;;;      gutenberg/etext90/bill11.txt
46;;;      gutenberg/etext90/const11.txt
47;;;      gutenberg/etext90/getty11.txt
48
49(set! db_names
50      '("gutenberg"     ;; books from gutenberg              21906570
51        "desktopshop"   ;; books, documents etc              23090463
52        "time"          ;; Time Magazine 1990-1994            6770175
53        "hutch"         ;; Hutchinson Encyclopedia            1715268
54        "dicts"         ;; Dictionaries and Encyclopedias     4248109
55        "stw-ref"       ;; Standard Reference libraries       3330448
56	"treebank"      ;; WSJ articles from PENN treebank    1109895
57	"email"         ;; awb's email
58       ))
59
60;;; Identify the tokens you want extracted
61;;; Tokens may be regular expressions
62(set! desired_tokens
63      '(lead wound tear axes Jan bass Nice Begin Chi Colon
64        St Dr III IV V X VII II "[0-9]+"))
65
66;;; First pass: to get examples and context for labelling
67(set! desired_feats
68      '(filepos
69	p.p.p.p.name p.p.p.name p.p.name p.name
70	name
71	n.name nn.name n.n.n.name n.n.n.n.name))
72;;; Second: pass to get desried features for tree building
73;;; Typically this has to be specific for a particular homograph
74;;; so you'll probably want to do multiple second passes one for each
75;;; homograph type
76;(set! desired_feats
77;      '(filepos
78;	lisp_tok_rex
79;	p.punc
80;	punc
81;	n.punc
82;	pp.cap p.cap n.cap nn.cap
83;	))
84
85(define (tok_search_db dbname)
86"Search through DB for named tokens and save found occurrences."
87  (let ((outfile (string-append text_dir "fullhgs/" dbname ".out")))
88    (delete-file outfile)
89    (mapcar
90     (lambda (fname)  ;; for each file in the database
91       (extract_tokens  ;; call internal function to extract tokens
92	(string-append text_dir fname)  ;; full pathname to extract from
93	(mapcar                         ;; list of tokens and features
94	 (lambda (t)                    ;;    to extract
95	   (cons t desired_feats))
96	 desired_tokens)
97	outfile))
98     (load (string-append text_dir "bin/" dbname ".files") t))
99    t))
100
101(define (tok_do_all)
102"Search all dbs for desired tokens."
103  (mapcar
104   (lambda (db)
105     (print db)
106     (tok_search_db db))
107   db_names)
108  t)
109
110