1;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; 2;;; ;; 3;;; Centre for Speech Technology Research ;; 4;;; University of Edinburgh, UK ;; 5;;; Copyright (c) 1996,1997 ;; 6;;; All Rights Reserved. ;; 7;;; ;; 8;;; Permission is hereby granted, free of charge, to use and distribute ;; 9;;; this software and its documentation without restriction, including ;; 10;;; without limitation the rights to use, copy, modify, merge, publish, ;; 11;;; distribute, sublicense, and/or sell copies of this work, and to ;; 12;;; permit persons to whom this work is furnished to do so, subject to ;; 13;;; the following conditions: ;; 14;;; 1. The code must retain the above copyright notice, this list of ;; 15;;; conditions and the following disclaimer. ;; 16;;; 2. Any modifications must be clearly marked as such. ;; 17;;; 3. Original authors' names are not deleted. ;; 18;;; 4. The authors' names are not used to endorse or promote products ;; 19;;; derived from this software without specific prior written ;; 20;;; permission. ;; 21;;; ;; 22;;; THE UNIVERSITY OF EDINBURGH AND THE CONTRIBUTORS TO THIS WORK ;; 23;;; DISCLAIM ALL WARRANTIES WITH REGARD TO THIS SOFTWARE, INCLUDING ;; 24;;; ALL IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS, IN NO EVENT ;; 25;;; SHALL THE UNIVERSITY OF EDINBURGH NOR THE CONTRIBUTORS BE LIABLE ;; 26;;; FOR ANY SPECIAL, INDIRECT OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES ;; 27;;; WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, WHETHER IN ;; 28;;; AN ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION, ;; 29;;; ARISING OUT OF OR IN CONNECTION WITH THE USE OR PERFORMANCE OF ;; 30;;; THIS SOFTWARE. ;; 31;;; ;; 32;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; 33;;; 34;;; A search for token occurrences in buckets of text 35;;; 36;;; This is only an example to aid you, this actually depends on 37;;; the availability of databases we don't have permission to 38;;; distribute. 39 40(set! text_dir "/home/awb/data/text/") 41 42;;; The databases themselves are identified by a file which names all 43;;; the files in that databases. e.g. This expects bin/gutenberg.files 44;;; to exists which should contain something like 45;;; gutenberg/etext90/bill11.txt 46;;; gutenberg/etext90/const11.txt 47;;; gutenberg/etext90/getty11.txt 48 49(set! db_names 50 '("gutenberg" ;; books from gutenberg 21906570 51 "desktopshop" ;; books, documents etc 23090463 52 "time" ;; Time Magazine 1990-1994 6770175 53 "hutch" ;; Hutchinson Encyclopedia 1715268 54 "dicts" ;; Dictionaries and Encyclopedias 4248109 55 "stw-ref" ;; Standard Reference libraries 3330448 56 "treebank" ;; WSJ articles from PENN treebank 1109895 57 "email" ;; awb's email 58 )) 59 60;;; Identify the tokens you want extracted 61;;; Tokens may be regular expressions 62(set! desired_tokens 63 '(lead wound tear axes Jan bass Nice Begin Chi Colon 64 St Dr III IV V X VII II "[0-9]+")) 65 66;;; First pass: to get examples and context for labelling 67(set! desired_feats 68 '(filepos 69 p.p.p.p.name p.p.p.name p.p.name p.name 70 name 71 n.name nn.name n.n.n.name n.n.n.n.name)) 72;;; Second: pass to get desried features for tree building 73;;; Typically this has to be specific for a particular homograph 74;;; so you'll probably want to do multiple second passes one for each 75;;; homograph type 76;(set! desired_feats 77; '(filepos 78; lisp_tok_rex 79; p.punc 80; punc 81; n.punc 82; pp.cap p.cap n.cap nn.cap 83; )) 84 85(define (tok_search_db dbname) 86"Search through DB for named tokens and save found occurrences." 87 (let ((outfile (string-append text_dir "fullhgs/" dbname ".out"))) 88 (delete-file outfile) 89 (mapcar 90 (lambda (fname) ;; for each file in the database 91 (extract_tokens ;; call internal function to extract tokens 92 (string-append text_dir fname) ;; full pathname to extract from 93 (mapcar ;; list of tokens and features 94 (lambda (t) ;; to extract 95 (cons t desired_feats)) 96 desired_tokens) 97 outfile)) 98 (load (string-append text_dir "bin/" dbname ".files") t)) 99 t)) 100 101(define (tok_do_all) 102"Search all dbs for desired tokens." 103 (mapcar 104 (lambda (db) 105 (print db) 106 (tok_search_db db)) 107 db_names) 108 t) 109 110