1 /*************************************************************************/
2 /* */
3 /* Centre for Speech Technology Research */
4 /* University of Edinburgh, UK */
5 /* Copyright (c) 1996,1997 */
6 /* All Rights Reserved. */
7 /* */
8 /* Permission is hereby granted, free of charge, to use and distribute */
9 /* this software and its documentation without restriction, including */
10 /* without limitation the rights to use, copy, modify, merge, publish, */
11 /* distribute, sublicense, and/or sell copies of this work, and to */
12 /* permit persons to whom this work is furnished to do so, subject to */
13 /* the following conditions: */
14 /* 1. The code must retain the above copyright notice, this list of */
15 /* conditions and the following disclaimer. */
16 /* 2. Any modifications must be clearly marked as such. */
17 /* 3. Original authors' names are not deleted. */
18 /* 4. The authors' names are not used to endorse or promote products */
19 /* derived from this software without specific prior written */
20 /* permission. */
21 /* */
22 /* THE UNIVERSITY OF EDINBURGH AND THE CONTRIBUTORS TO THIS WORK */
23 /* DISCLAIM ALL WARRANTIES WITH REGARD TO THIS SOFTWARE, INCLUDING */
24 /* ALL IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS, IN NO EVENT */
25 /* SHALL THE UNIVERSITY OF EDINBURGH NOR THE CONTRIBUTORS BE LIABLE */
26 /* FOR ANY SPECIAL, INDIRECT OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES */
27 /* WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, WHETHER IN */
28 /* AN ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION, */
29 /* ARISING OUT OF OR IN CONNECTION WITH THE USE OR PERFORMANCE OF */
30 /* THIS SOFTWARE. */
31 /* */
32 /*************************************************************************/
33 /* Author : Alan W Black */
34 /* Date : November 1996 */
35 /*-----------------------------------------------------------------------*/
36 /* */
37 /* Tokenizing */
38 /* */
/* This provides tokenizing methods for tokens into words. All that   */
/* special rules stuff for analyzing numbers, dates, acronyms etc.    */
/* Much of this is still too specific, and although it is easy to add */
/* to, it would be better if the rules could be specified externally  */
43 /* */
44 /* Note only English tokenization has any substance at present */
45 /* */
46 /*=======================================================================*/
47 #include <cstdio>
48
49 using namespace std;
50
51 #include "festival.h"
52 #include "lexicon.h"
53 #include "modules.h"
54 #include "text.h"
55 #include "tokenP.h"
56
// Regexes used to classify raw token strings (EST_Regex uses the old
// emacs-style syntax where grouping/alternation are backslashed).
static EST_Regex numpointnum("[0-9]*\\.[0-9]+");   // decimal number, e.g. 3.14 or .5
// Integer with comma-separated thousands, optionally with a decimal part,
// e.g. 1,234,567 or 12,345.67
static EST_Regex RXintcommaed("[0-9][0-9]?[0-9]?,\\([0-9][0-9][0-9],\\)*[0-9][0-9][0-9]\\(\\.[0-9]+\\)?");
// Ordinal numerals: 1st, 2nd, 3rd, Nth (with optional leading digits)
static EST_Regex RXintord("[0-9]*\\(1st\\|2nd\\|3rd\\|[0-9]th\\)");
// Dotted abbreviations such as U.S.A. or e.g.
static EST_Regex RXdottedabbrev("\\([A-Za-z]\\.\\)+[A-Za-z]\\.?");
// Possessive forms ending in 's or 'S
static EST_Regex RXapostropheS(".*'[sS]$");
// The set of characters treated as punctuation by only_punc()
static EST_String PunctuationChars("'`.,:;!?{}[]()-\"");
// Runs of punctuation, stripped by remove_punct()
static EST_Regex RXpunctuation("\\(\\]\\|[-[.,!?]\\)+");
static EST_String remove_punct(const EST_String &tok);
static int only_punc(const EST_String &tok);
static LISP num_2_words(int iword);
static LISP say_num_as_words(const EST_String &num);
static LISP word_it(EST_Item *t,const EST_String tok);
static LISP builtin_word_it(EST_Item *token, EST_String tok);
static LISP say_as_letters(const EST_String &word);
static LISP say_num_as_ordinal(const EST_String &num);
static LISP say_num_as_year(const EST_String &num);
static LISP say_as_digits(const EST_String &word);

LISP FT_English_Token_Utt(LISP utt);
LISP FT_Welsh_Token_Utt(LISP utt);
LISP FT_Spanish_Token_Utt(LISP utt);
LISP FT_Any_Token_Utt(LISP utt);

// Per-utterance hook: value of the LISP variable token_to_words while a
// token module is running, NIL otherwise.  word_it() consults this.
static LISP user_token_to_word_func = NIL;
81
FT_Welsh_Token_Utt(LISP utt)82 LISP FT_Welsh_Token_Utt(LISP utt)
83 {
84 return FT_Any_Token_Utt(utt);
85 }
86
FT_Spanish_Token_Utt(LISP utt)87 LISP FT_Spanish_Token_Utt(LISP utt)
88 {
89 (void)utt;
90 cerr << "TOKEN: Spanish tokenization not yet supported\n";
91 festival_error();
92
93 // never happens
94 return NULL;
95 }
96
FT_Any_Token_Utt(LISP utt)97 LISP FT_Any_Token_Utt(LISP utt)
98 {
99 // Language independent EST_Token to Word module. Uses user specified
100 // token to word function of simply creates a word for each token.
101 EST_Utterance *u = get_c_utt(utt);
102 LISP words,w;
103 EST_Item *t;
104 EST_Item *new_word;
105
106 user_token_to_word_func = siod_get_lval("token_to_words",NULL);
107 u->create_relation("Word");
108
109 for (t=u->relation("Token")->first(); t != 0; t = t->next())
110 {
111 if (user_token_to_word_func != NIL)
112 {
113 words = word_it(t,t->name());
114 for (w=words; w != NIL; w=cdr(w))
115 {
116 new_word = add_word(u,car(w));
117 append_daughter(t,"Token",new_word);
118 }
119 }
120 else
121 { // No user token_to_word function so just do it directly
122 new_word = add_word(u,t->name());
123 append_daughter(t,"Token",new_word);
124 }
125 }
126 user_token_to_word_func = NIL; // reset this
127
128 return utt;
129 }
130
LISP FT_English_Token_Utt(LISP utt)
{
    // This module generates a word stream from a token stream
    // Tokens may go to zero or more words, tokens retain information
    // about their punctuation and spacing on the page.
    // Preceding and succeeding punctuation become words
    EST_Utterance *u = get_c_utt(utt);
    EST_Item *t;
    LISP words,w,eou_tree,l;
    EST_Item *new_word;

    *cdebug << "Token module (English)" << endl;

    // CART tree used to decide whether a final "." really ends a sentence
    eou_tree = siod_get_lval("eou_tree","No end of utterance tree");
    user_token_to_word_func = siod_get_lval("token_to_words",NULL);
    u->create_relation("Word");

    for (t=u->relation("Token")->first(); t != 0; t = t->next())
    {
	// Expand the token text first (may consult user rules)
	words = word_it(t,t->name());
	// Initial punctuation becomes words
	new_word = 0;
	if ((t->f("prepunctuation") != "0") &&
	    (t->f("prepunctuation") != ""))
	{
	    // One word per prepunctuation character
	    l = symbolexplode(strintern(t->f("prepunctuation").string()));
	    for (w=l; w != NIL; w=cdr(w))
	    {
		new_word = add_word(u,car(w));
		append_daughter(t,"Token",new_word);
	    }
	}

	// Words become words
	for (w=words; w != NIL; w=cdr(w))
	{
	    new_word = add_word(u,car(w));
	    append_daughter(t,"Token",new_word);
	}

	// final word gets punctuation marking
	// (only if the token actually produced at least one word)
	if ((new_word != 0) && (ffeature(t,"punc") != "0"))
	{
	    if ((ffeature(t,"punc") == ".") &&
		(wagon_predict(t,eou_tree) == 0))
	    {   // It wasn't a really punctuation mark
		t->set("punc","0");
	    }
	    else
	    {   // One word per trailing punctuation character
		l = symbolexplode(strintern(ffeature(t,"punc").string()));
		for (w=l; w != NIL; w=cdr(w))
		{
		    new_word = add_word(u,car(w));
		    append_daughter(t,"Token",new_word);
		}
	    }
	}
    }

    user_token_to_word_func = NIL;

    return utt;

}
196
word_it(EST_Item * token,const EST_String tok)197 static LISP word_it(EST_Item *token, const EST_String tok)
198 {
199 // The user may specify their own addition;a token to word rules
200 // through the variable user_token_to_word_func if so we must
201 // call that, which may or may not call the builtin version
202 // The builtin version if bound in LISP so that recursion works
203 // properly
204 // This takes a LISP utt as an argument as creating a new wraparound
205 // will cause gc to fail.
206 LISP tok_string = strcons(tok.length(),tok);
207
208 if (user_token_to_word_func != NIL) // check user's rules
209 return leval(cons(user_token_to_word_func,
210 cons(siod(token),
211 cons(tok_string,NIL))),NIL);
212 else
213 return builtin_word_it(token,tok);
214 }
215
static LISP builtin_word_it(EST_Item *token, EST_String tok)
{
    // Return a list of words for this token.
    // An ordered cascade of rules: the first matching rule wins, and
    // many rules strip one piece off and recurse on the remainder,
    // so the order of the tests below is significant.
    EST_String token_pos;

    if (tok == "")
	return NIL;
    else if (in_current_lexicon(downcase(tok),NIL)) // if in lexicon use as is
    {
	if ((tok != token->name()) &&  // mainly to catch internal "a"
	    (tok.length() == 1))
	{
	    // Single letters get token.letter_pos so e.g. "a" isn't
	    // treated as the (schwa'd) determiner
	    LISP let_pos = siod_get_lval("token.letter_pos",NULL);
	    return cons(cons(make_param_str("name",tok),
			     cons(make_param_lisp("pos",let_pos),NIL)),
			NIL);
	}
	else
	    return cons(strintern(tok),NIL);
    }
    // token_pos is set by the Token_POS module's CART trees
    else if ((token_pos = (EST_String)ffeature(token,"token_pos")) == "ordinal")
	return say_num_as_ordinal(tok);
    else if (token_pos == "year")
	return say_num_as_year(tok);
    else if ((token_pos == "digits") ||
	     (tok.matches(make_regex("0[0-9]+"))))  // leading zero => digits
	return say_as_digits(tok);
    else if (tok.matches(RXint))
	return say_num_as_words(tok);
    else if (tok.matches(RXintord))  // 1st/2nd/3rd/Nth: drop the suffix
	return say_num_as_ordinal(tok.at(0,tok.length()-2));
    else if (tok.matches(RXintcommaed))  // containing commas at thousands
    {
	if (tok.contains("."))
	    // strip commas from the integer part, keep the decimal part
	    return word_it(token,remove_punct(tok.before("."))+
			   "."+tok.after("."));
	else
	    return say_num_as_words(remove_punct(tok));
    }
    else if (tok.matches(RXapostropheS))
    {
	// expand the stem then append a literal "'s" word
	return append(word_it(token,tok.at(0,tok.length()-2)),
		      cons(strintern("'s"),NIL));
    }
    else if (tok.matches(numpointnum))
    {
	// decimal: integer part, "point", then digits one at a time
	EST_String afterpoint = tok.after(".");
	LISP ap = NIL;
	int i;
	for (i=0; i < afterpoint.length(); i++)
	    ap = append(say_num_as_words(afterpoint.at(i,1)),ap);
	return append(say_num_as_words(tok.before(".")),
		      cons(strintern("point"),reverse(ap)));
    }
    // All-caps with no vowels, or short with no CVC pattern => acronym
    else if ((tok.matches(make_regex("[A-Z][A-Z]+"))) &&
	     ((!tok.contains(make_regex("[AEIOUY]"))) ||
	      ((!tok.contains(make_regex("[^AEIOU][AEIOU][^AEIOU]"))) &&
	       (tok.length() < 5)))) // an acronym
	return say_as_letters(tok);
    else if (tok.matches(RXdottedabbrev))  // U.S.A. etc: spell it
	return say_as_letters(remove_punct(tok));
    else if ((tok.matches(RXalpha)) &&
	     !(tok.matches(make_regex(".*[AEIOUYaeiouy].*"))))
	return say_as_letters(tok);  // no vowels so spell it
    else if (tok.matches(RXalpha))  // as is, some sort of word
	return cons(strintern(tok),NIL);
    else if (only_punc(tok))
	return stringexplode(tok);
    // The remaining rules split on embedded separators and recurse on
    // the pieces; some separators become spoken words (dot/slash/...)
    else if (tok.contains("-"))
	return append(word_it(token,tok.before("-")),
		      word_it(token,tok.after("-")));
    else if (tok.contains("."))  // internet address
    {
	LISP a=NIL;
	EST_String tok2 = tok;
	for ( ; tok2.contains("."); tok2 = tok2.after("."))
	    a = append(a,
		       append(word_it(token,tok2.before(".")),
			      cons(strintern("dot"),NIL)));
	a = append(a,word_it(token,tok2));
	return a;
    }
    else if (tok.contains("/"))
	return append(word_it(token,tok.before("/")),
		      cons(strintern("slash"),
			   word_it(token,tok.after("/"))));
    else if (tok.contains("&"))
	return append(word_it(token,tok.before("&")),
		      cons(strintern("ampersand"),
			   word_it(token,tok.after("&"))));
    else if (tok.contains("_"))
	return append(word_it(token,tok.before("_")),
		      cons(strintern("underscore"),
			   word_it(token,tok.after("_"))));
    else if (tok.contains("'"))  // apostrophe just dropped
	return word_it(token,tok.before("'")+tok.after("'"));
    else if (tok.contains("`"))
	return append(word_it(token,tok.before("`")),
		      word_it(token,tok.after("`")));
    else if (tok.contains("\""))
	return append(word_it(token,tok.before("\"")),
		      word_it(token,tok.after("\"")));
    else if (tok.contains(","))
	return append(word_it(token,tok.before(",")),
		      word_it(token,tok.after(",")));
    else if (tok.contains("("))
	return append(word_it(token,tok.before("(")),
		      word_it(token,tok.after("(")));
    else if (tok.contains(")"))
	return append(word_it(token,tok.before(")")),
		      word_it(token,tok.after(")")));
    else if (tok.matches(make_regex("^[^a-zA-Z].+")))  // incrementally remove
	return append(say_as_letters(tok.at(0,1)),  // num/symbol from front
		      word_it(token,tok.at(1,tok.length()-1)));
    else if (tok.matches(make_regex(".+[^a-zA-Z]$")))  // incrementally remove rear
	return append(word_it(token,tok.at(0,tok.length()-1)),
		      say_as_letters(tok.at((int)tok.length()-1,1)));
    else  // could try harder
	return say_as_letters(remove_punct(tok));
}
336
say_as_digits(const EST_String & word)337 static LISP say_as_digits(const EST_String &word)
338 {
339 // Should be string of digits, but I wont require it
340 // This isn't really correct for telephone numbers (oh/zero/double)
341 LISP l;
342 LISP lets = stringexplode(word);
343 LISP let_pos = siod_get_lval("token.letter_pos",NULL);
344
345 for (l=lets; l != NIL; l=cdr(l))
346 {
347 if (streq(get_c_string(car(l)),"0"))
348 CAR(l) = strintern("zero");
349 else if (streq(get_c_string(car(l)),"1"))
350 CAR(l) = strintern("one");
351 else if (streq(get_c_string(car(l)),"2"))
352 CAR(l) = strintern("two");
353 else if (streq(get_c_string(car(l)),"3"))
354 CAR(l) = strintern("three");
355 else if (streq(get_c_string(car(l)),"4"))
356 CAR(l) = strintern("four");
357 else if (streq(get_c_string(car(l)),"5"))
358 CAR(l) = strintern("five");
359 else if (streq(get_c_string(car(l)),"6"))
360 CAR(l) = strintern("six");
361 else if (streq(get_c_string(car(l)),"7"))
362 CAR(l) = strintern("seven");
363 else if (streq(get_c_string(car(l)),"8"))
364 CAR(l) = strintern("eight");
365 else if (streq(get_c_string(car(l)),"9"))
366 CAR(l) = strintern("nine");
367 else
368 CAR(l) = cons(make_param_lisp("name",car(l)),
369 cons(make_param_lisp("pos",let_pos),NIL));
370 }
371
372 return lets;
373 }
374
say_as_letters(const EST_String & word)375 static LISP say_as_letters(const EST_String &word)
376 {
377 // Explode letters in word and say them, marking them as nouns
378 // This is particularly designed so that A/a doesn't come out as
379 // the determiner a which in typically schwa'd
380 LISP l;
381 LISP lets = stringexplode(word);
382 LISP let_pos = siod_get_lval("token.letter_pos",NULL);
383
384 for (l=lets; l != NIL; l=cdr(l))
385 {
386 EST_String name = EST_String(get_c_string(car(l)));
387 if (name.matches(make_regex("[0-9]")))
388 CAR(l) = car(say_as_digits(get_c_string(car(l))));
389 // else if (name.matches(make_regex("[^a-zA-Z]")))
390 // // Not sure, probably a bug to get here
391 // CAR(l) = cons(make_param_str("name","symbol"),
392 // cons(make_param_lisp("pos",let_pos),NIL));
393 else
394 CAR(l) = cons(make_param_lisp("name",car(l)),
395 cons(make_param_lisp("pos",let_pos),NIL));
396 }
397
398 return lets;
399 }
400
only_punc(const EST_String & tok)401 static int only_punc(const EST_String &tok)
402 {
403 // If this token consists solely of punctuation chars
404 // If this is true I'm probably suppose to say some of them
405 int i;
406 EST_String np;
407 const char *tokch = tok;
408
409 for (i=0; i<tok.length(); i++)
410 if (strchr((const char *)PunctuationChars,tokch[i]) == NULL)
411 return FALSE;
412
413 return TRUE;
414 }
415
static EST_String remove_punct(const EST_String &tok)
{
    // Return a copy of tok with all punctuation (RXpunctuation)
    // stripped out.
    EST_String stripped(tok);

    stripped.make_updatable();
    stripped.gsub(RXpunctuation, "");

    return stripped;
}
426
427
say_num_as_ordinal(const EST_String & num)428 static LISP say_num_as_ordinal(const EST_String &num)
429 {
430 LISP numwords = num_2_words(atoi(num));
431 LISP last;
432
433 // Now change the last word to the appropriate ordinal
434 for (last=numwords; cdr(last) != NIL; last=cdr(last));
435 const char *lastword = get_c_string(car(last));
436
437 if (streq(lastword,"zero"))
438 CAR(last) = strintern("zeroth");
439 else if (streq(lastword,"one"))
440 CAR(last) = strintern("first");
441 else if (streq(lastword,"two"))
442 CAR(last) = strintern("second");
443 else if (streq(lastword,"three"))
444 CAR(last) = strintern("third");
445 else if (streq(lastword,"four"))
446 CAR(last) = strintern("fourth");
447 else if (streq(lastword,"five"))
448 CAR(last) = strintern("fifth");
449 else if (streq(lastword,"six"))
450 CAR(last) = strintern("sixth");
451 else if (streq(lastword,"seven"))
452 CAR(last) = strintern("seventh");
453 else if (streq(lastword,"eight"))
454 CAR(last) = strintern("eighth");
455 else if (streq(lastword,"nine"))
456 CAR(last) = strintern("ninth");
457 else if (streq(lastword,"ten"))
458 CAR(last) = strintern("tenth");
459 else if (streq(lastword,"eleven"))
460 CAR(last) = strintern("eleventh");
461 else if (streq(lastword,"twelve"))
462 CAR(last) = strintern("twelfth");
463 else if (streq(&lastword[strlen(lastword)-4],"teen"))
464 CAR(last) = strintern(EST_String(lastword)+"th");
465 else if (streq(&lastword[strlen(lastword)-2],"ty"))
466 CAR(last) = strintern(EST_String(lastword).before("ty")+"tieth");
467 else if (streq(lastword,"hundred"))
468 CAR(last) = strintern("hundredth");
469 else if (streq(lastword,"thousand"))
470 CAR(last) = strintern("thousandth");
471 else if (streq(&lastword[strlen(lastword)-6],"illion"))
472 CAR(last) = strintern(EST_String(lastword)+"th");
473 else
474 {
475 // I don't think I've forgotten anything
476 *cdebug << "Token: can't make ordinal from \"" << lastword
477 << "\"" << endl;
478 CAR(last) = strintern(EST_String(lastword)+"th");
479 }
480
481 return numwords;
482 }
483
say_num_as_words(const EST_String & num)484 static LISP say_num_as_words(const EST_String &num)
485 {
486 if (num.length() > 9)
487 {
488 if (num(0) == '-')
489 return cons(strintern("minus"),say_as_digits(num.after("-")));
490 else
491 return say_as_digits(num);
492 }
493 else
494 return num_2_words(atoi(num));
495 }
496
static LISP say_num_as_year(const EST_String &num)
{
    // Expand num in the style used for years: "1996" -> "nineteen
    // ninety six", "1905" -> "nineteen o five", "2000" -> "twenty
    // hundred" style splits, etc.
    int iword = atoi(num);

    if (num.length() > 4)       // too long for a year, read as a number
	return say_num_as_words(num);
    else if (num.matches(make_regex("00")))
    {   // literal "00" -> "o o"
	return cons(strintern("o"),
		    cons(strintern("o"),NIL));
    }
    else if (num.matches(make_regex("0[0-9]")))
    {   // "05" -> "o five"
	return cons(strintern("o"),
		    num_2_words(iword));
    }
    else if (iword < 100)
	return num_2_words(iword);
    else if ((iword % 1000) < 10)
    {   // e.g. 2000 -> "two thousand", 2005 -> "two thousand and five"
	if ((iword % 1000) == 0)
	    return append(num_2_words(iword/1000),
			  cons(strintern("thousand"),NIL));
	else
	    return append(num_2_words(iword/1000),
			  cons(strintern("thousand"),
			       cons(strintern("and"),
				    num_2_words(iword%1000))));
    }
    else if ((iword % 100) == 0)  // e.g. 1900 -> "nineteen hundred"
	return append(num_2_words(iword/100),
		      cons(strintern("hundred"),NIL));
    else if ((iword % 100) < 10)  // e.g. 1905 -> "nineteen o five"
	return append(num_2_words(iword/100),
		      cons(strintern("o"),
			   num_2_words(iword%100)));
    else                          // e.g. 1996 -> "nineteen ninety six"
	return append(num_2_words(iword/100),
		      num_2_words(iword%100));
}
537
static LISP num_2_words(int iword)
{
    // Convert number to a list of words, recursing on the remainder at
    // each scale (tens, hundreds, thousands, millions).  British
    // English inserts "and" after "hundred"/"thousand" where American
    // English does not (decided by the Language parameter).
    int tens, units;
    LISP s_tens, lang_stype=NIL;

    if (iword < 0)
	return cons(strintern("minus"),num_2_words(-iword));
    else if (iword < 20)
	switch (iword) {
	  case 0: return cons(strintern("zero"),NIL);
	  case 1: return cons(strintern("one"),NIL);
	  case 2: return cons(strintern("two"),NIL);
	  case 3: return cons(strintern("three"),NIL);
	  case 4: return cons(strintern("four"),NIL);
	  case 5: return cons(strintern("five"),NIL);
	  case 6: return cons(strintern("six"),NIL);
	  case 7: return cons(strintern("seven"),NIL);
	  case 8: return cons(strintern("eight"),NIL);
	  case 9: return cons(strintern("nine"),NIL);
	  case 10: return cons(strintern("ten"),NIL);
	  case 11: return cons(strintern("eleven"),NIL);
	  case 12: return cons(strintern("twelve"),NIL);
	  case 13: return cons(strintern("thirteen"),NIL);
	  case 14: return cons(strintern("fourteen"),NIL);
	  case 15: return cons(strintern("fifteen"),NIL);
	  case 16: return cons(strintern("sixteen"),NIL);
	  case 17: return cons(strintern("seventeen"),NIL);
	  case 18: return cons(strintern("eighteen"),NIL);
	  case 19: return cons(strintern("nineteen"),NIL);
	  // can't happen for 0<=iword<20; kept as a safe default
	  default: return cons(siod_get_lval("token.unknown_word_name",NULL),
			       NIL);
	}
    else if (iword < 100)
    {
	tens = iword / 10;
	units = iword % 10;
	switch (tens)
	{
	  case 2: s_tens = strintern("twenty"); break;
	  case 3: s_tens = strintern("thirty"); break;
	  case 4: s_tens = strintern("forty"); break;
	  case 5: s_tens = strintern("fifty"); break;
	  case 6: s_tens = strintern("sixty"); break;
	  case 7: s_tens = strintern("seventy"); break;
	  case 8: s_tens = strintern("eighty"); break;
	  case 9: s_tens = strintern("ninety"); break;
	  default: return cons(siod_get_lval("token.unknown_word_name",NULL),
			       NIL);
	}
	if (units != 0)
	    return cons(s_tens,num_2_words(units));
	else
	    return cons(s_tens,NIL);
    }
    else if (iword < 1000)
    {
	// "and" only in British English: "one hundred and one" vs
	// "one hundred one"
	lang_stype = ft_get_param("Language");
	if (streq("americanenglish",get_c_string(lang_stype)))
	    return append(num_2_words(iword/100),
			  cons(strintern("hundred"),
			       (((iword % 100) != 0) ?
				num_2_words(iword % 100) :
				NIL)));
	else
	    return append(num_2_words(iword/100),
			  cons(strintern("hundred"),
			       (((iword % 100) != 0) ?
				cons(strintern("and"),
				     num_2_words(iword % 100)) :
				NIL)));
    }
#if 0
    // We don't depend on this hack now.
    else if ((iword > 1910) &&
	     (iword < 2000))   // hacky date condition
	return append(num_2_words(iword/100),
		      num_2_words(iword%100));
#endif
    else if (iword < 1000000)
	// "and" when the remainder has no hundreds digit:
	// "one thousand and five" but "one thousand two hundred"
	return append(num_2_words(iword/1000),
		      cons(strintern("thousand"),
			   (((iword % 1000) != 0) ?
			    ((((iword % 1000)/100) == 0) ?
			     cons(strintern("and"),num_2_words(iword % 1000)):
			     num_2_words(iword % 1000)) :
			    NIL)));
    else if (iword >= 1000000)
	return append(num_2_words(iword/1000000),
		      cons(strintern("million"),
			   (((iword % 1000000) != 0) ?
			    num_2_words(iword % 1000000) :
			    NIL)));
    else
	// unreachable: the branches above cover all ints
	return cons(strintern("bignum"),NIL);
}
634
l_word_it(LISP token,LISP tok)635 static LISP l_word_it(LISP token, LISP tok)
636 {
637 // Lisp wrap around for word_it
638 EST_Item *t = get_c_item(token);
639 EST_String tok_name = get_c_string(tok);
640
641 return builtin_word_it(t,tok_name);
642 }
643
void festival_token_init(void)
{
    // Register the token-to-word utterance modules and the LISP
    // binding for the builtin English expander.  (The doc strings
    // below are runtime data shown by the LISP help system.)
    festival_def_utt_module("Token_English",FT_English_Token_Utt,
    "(Token_English UTT)\n\
  Build a Word stream from the Token stream, for English (American and\n\
  British English), analyzing compound words, numbers, etc. as tokens\n\
  into words.");
    festival_def_utt_module("Token_Welsh",FT_Welsh_Token_Utt,
    "(Token_Welsh UTT)\n\
  Build a Word stream from the Token stream, for Welsh, analyzing\n\
  compound words, numbers etc as tokens into words.");
    festival_def_utt_module("Token_Spanish",FT_Spanish_Token_Utt,
    "(Token_Spanish UTT)\n\
  Build a Word stream from the Token stream, for Castillian Spanish,\n\
  analyzing compound words, numbers etc as tokens into words.");
    festival_def_utt_module("Token_Any",FT_Any_Token_Utt,
    "(Token_Any UTT)\n\
  Build a Word stream from the Token stream, in a language independent way,\n\
  which means that all simple tokens should be in the lexicon, or analysed\n\
  by letter to sound rules.");
    // FT_Token_POS_Utt is defined elsewhere in this file/module
    festival_def_utt_module("Token_POS",FT_Token_POS_Utt,
    "(Token_POS UTT)\n\
  Assign feature token_pos to tokens thats match CART trees in the\n\
  variable token_pos_cart_trees.  These are used for gross level pos\n\
  such as identifying how numbers should be pronunced.");
    init_subr_2("builtin_english_token_to_words",l_word_it,
    "(english_token_to_words TOKENSTREAM TOKENNAME)\n\
  Returns a list of words expanded from TOKENNAME.  Note that as this\n\
  function may be called recursively TOKENNAME may not be the name of\n\
  TOKENSTREAM.");
}
675