1 /*************************************************************************/
2 /*                                                                       */
3 /*                Centre for Speech Technology Research                  */
4 /*                     University of Edinburgh, UK                       */
5 /*                       Copyright (c) 1996,1997                         */
6 /*                        All Rights Reserved.                           */
7 /*                                                                       */
8 /*  Permission is hereby granted, free of charge, to use and distribute  */
9 /*  this software and its documentation without restriction, including   */
10 /*  without limitation the rights to use, copy, modify, merge, publish,  */
11 /*  distribute, sublicense, and/or sell copies of this work, and to      */
12 /*  permit persons to whom this work is furnished to do so, subject to   */
13 /*  the following conditions:                                            */
14 /*   1. The code must retain the above copyright notice, this list of    */
15 /*      conditions and the following disclaimer.                         */
16 /*   2. Any modifications must be clearly marked as such.                */
17 /*   3. Original authors' names are not deleted.                         */
18 /*   4. The authors' names are not used to endorse or promote products   */
19 /*      derived from this software without specific prior written        */
20 /*      permission.                                                      */
21 /*                                                                       */
22 /*  THE UNIVERSITY OF EDINBURGH AND THE CONTRIBUTORS TO THIS WORK        */
23 /*  DISCLAIM ALL WARRANTIES WITH REGARD TO THIS SOFTWARE, INCLUDING      */
24 /*  ALL IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS, IN NO EVENT   */
25 /*  SHALL THE UNIVERSITY OF EDINBURGH NOR THE CONTRIBUTORS BE LIABLE     */
26 /*  FOR ANY SPECIAL, INDIRECT OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES    */
27 /*  WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, WHETHER IN   */
28 /*  AN ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION,          */
29 /*  ARISING OUT OF OR IN CONNECTION WITH THE USE OR PERFORMANCE OF       */
30 /*  THIS SOFTWARE.                                                       */
31 /*                                                                       */
32 /*************************************************************************/
33 /*             Author :  Alan W Black                                    */
34 /*             Date   :  November 1996                                   */
35 /*-----------------------------------------------------------------------*/
36 /*                                                                       */
37 /* Tokenizing                                                            */
38 /*                                                                       */
/* This provides tokenizing methods for tokens into words.  All that     */
/* special rules stuff for analysing numbers, dates, acronyms etc.       */
/* Much of this is still too specific and, although it is easy to add    */
/* to, it would be better if the rules could be specified externally.    */
43 /*                                                                       */
44 /* Note only English tokenization has any substance at present           */
45 /*                                                                       */
46 /*=======================================================================*/
47 #include <cstdio>
48 
49 using namespace std;
50 
51 #include "festival.h"
52 #include "lexicon.h"
53 #include "modules.h"
54 #include "text.h"
55 #include "tokenP.h"
56 
57 static EST_Regex numpointnum("[0-9]*\\.[0-9]+");
58 static EST_Regex RXintcommaed("[0-9][0-9]?[0-9]?,\\([0-9][0-9][0-9],\\)*[0-9][0-9][0-9]\\(\\.[0-9]+\\)?");
59 static EST_Regex RXintord("[0-9]*\\(1st\\|2nd\\|3rd\\|[0-9]th\\)");
60 static EST_Regex RXdottedabbrev("\\([A-Za-z]\\.\\)+[A-Za-z]\\.?");
61 static EST_Regex RXapostropheS(".*'[sS]$");
62 static EST_String PunctuationChars("'`.,:;!?{}[]()-\"");
63 static EST_Regex RXpunctuation("\\(\\]\\|[-[.,!?]\\)+");
64 static EST_String remove_punct(const EST_String &tok);
65 static int only_punc(const EST_String &tok);
66 static LISP num_2_words(int iword);
67 static LISP say_num_as_words(const EST_String &num);
68 static LISP word_it(EST_Item *t,const EST_String tok);
69 static LISP builtin_word_it(EST_Item *token, EST_String tok);
70 static LISP say_as_letters(const EST_String &word);
71 static LISP say_num_as_ordinal(const EST_String &num);
72 static LISP say_num_as_year(const EST_String &num);
73 static LISP say_as_digits(const EST_String &word);
74 
75 LISP FT_English_Token_Utt(LISP utt);
76 LISP FT_Welsh_Token_Utt(LISP utt);
77 LISP FT_Spanish_Token_Utt(LISP utt);
78 LISP FT_Any_Token_Utt(LISP utt);
79 
80 static LISP user_token_to_word_func = NIL;
81 
FT_Welsh_Token_Utt(LISP utt)82 LISP FT_Welsh_Token_Utt(LISP utt)
83 {
84     return FT_Any_Token_Utt(utt);
85 }
86 
FT_Spanish_Token_Utt(LISP utt)87 LISP FT_Spanish_Token_Utt(LISP utt)
88 {
89     (void)utt;
90     cerr << "TOKEN: Spanish tokenization not yet supported\n";
91     festival_error();
92 
93     // never happens
94     return NULL;
95 }
96 
FT_Any_Token_Utt(LISP utt)97 LISP FT_Any_Token_Utt(LISP utt)
98 {
99     // Language independent EST_Token to Word module.  Uses user specified
100     // token to word function of simply creates a word for each token.
101     EST_Utterance *u = get_c_utt(utt);
102     LISP words,w;
103     EST_Item *t;
104     EST_Item *new_word;
105 
106     user_token_to_word_func = siod_get_lval("token_to_words",NULL);
107     u->create_relation("Word");
108 
109     for (t=u->relation("Token")->first(); t != 0; t = t->next())
110     {
111 	if (user_token_to_word_func != NIL)
112 	{
113 	    words = word_it(t,t->name());
114 	    for (w=words; w != NIL; w=cdr(w))
115 	    {
116 		new_word = add_word(u,car(w));
117 		append_daughter(t,"Token",new_word);
118 	    }
119 	}
120 	else
121 	{   // No user token_to_word function so just do it directly
122 	    new_word = add_word(u,t->name());
123 	    append_daughter(t,"Token",new_word);
124 	}
125     }
126     user_token_to_word_func = NIL;  // reset this
127 
128     return utt;
129 }
130 
FT_English_Token_Utt(LISP utt)131 LISP FT_English_Token_Utt(LISP utt)
132 {
133     // This module generates a word stream from a token stream
134     // Tokens may go to zero or more words, tokens retain information
135     // about their punctuation and spacing on the page.
136     //  Preceeding and succeeding punctuation become words
137     EST_Utterance *u = get_c_utt(utt);
138     EST_Item *t;
139     LISP words,w,eou_tree,l;
140     EST_Item *new_word;
141 
142     *cdebug << "Token module (English)" << endl;
143 
144     eou_tree = siod_get_lval("eou_tree","No end of utterance tree");
145     user_token_to_word_func = siod_get_lval("token_to_words",NULL);
146     u->create_relation("Word");
147 
148     for (t=u->relation("Token")->first(); t != 0; t = t->next())
149     {
150 	words = word_it(t,t->name());
151 	// Initial punctuation becomes words
152 	new_word = 0;
153 	if ((t->f("prepunctuation") != "0") &&
154 	    (t->f("prepunctuation") != ""))
155 	{
156 	    l = symbolexplode(strintern(t->f("prepunctuation").string()));
157 	    for (w=l; w != NIL; w=cdr(w))
158 	    {
159 		new_word = add_word(u,car(w));
160 		append_daughter(t,"Token",new_word);
161 	    }
162 	}
163 
164 	// Words become words
165 	for (w=words; w != NIL; w=cdr(w))
166 	{
167 	    new_word = add_word(u,car(w));
168 	    append_daughter(t,"Token",new_word);
169 	}
170 
171 	// final word gets punctuation marking
172 	if ((new_word != 0) && (ffeature(t,"punc") != "0"))
173 	{
174 	    if ((ffeature(t,"punc") == ".") &&
175 		(wagon_predict(t,eou_tree) == 0))
176 	    {   // It wasn't a really punctuation mark
177 		t->set("punc","0");
178 	    }
179 	    else
180 	    {
181 		l = symbolexplode(strintern(ffeature(t,"punc").string()));
182 		for (w=l; w != NIL; w=cdr(w))
183 		{
184 		    new_word = add_word(u,car(w));
185 		    append_daughter(t,"Token",new_word);
186 		}
187 	    }
188 	}
189     }
190 
191     user_token_to_word_func = NIL;
192 
193     return utt;
194 
195 }
196 
word_it(EST_Item * token,const EST_String tok)197 static LISP word_it(EST_Item *token, const EST_String tok)
198 {
199     // The user may specify their own addition;a token to word rules
200     // through the variable user_token_to_word_func if so we must
201     // call that, which may or may not call the builtin version
202     // The builtin version if bound in LISP so that recursion works
203     // properly
204     // This takes a LISP utt as an argument as creating a new wraparound
205     // will cause gc to fail.
206     LISP tok_string = strcons(tok.length(),tok);
207 
208     if (user_token_to_word_func != NIL)            // check user's rules
209 	return leval(cons(user_token_to_word_func,
210 			  cons(siod(token),
211 			       cons(tok_string,NIL))),NIL);
212     else
213 	return builtin_word_it(token,tok);
214 }
215 
builtin_word_it(EST_Item * token,EST_String tok)216 static LISP builtin_word_it(EST_Item *token, EST_String tok)
217 {
218     // Return a list of words for this token
219     EST_String token_pos;
220 
221     if (tok == "")
222 	return NIL;
223     else if (in_current_lexicon(downcase(tok),NIL))  // if in lexicon use as is
224     {
225 	if ((tok != token->name()) && // mainly to catch internal "a"
226 	    (tok.length() == 1))
227 	{
228 	    LISP let_pos = siod_get_lval("token.letter_pos",NULL);
229 	    return cons(cons(make_param_str("name",tok),
230 			     cons(make_param_lisp("pos",let_pos),NIL)),
231 			NIL);
232 	}
233 	else
234 	    return cons(strintern(tok),NIL);
235     }
236     else if ((token_pos = (EST_String)ffeature(token,"token_pos")) == "ordinal")
237 	return say_num_as_ordinal(tok);
238     else if (token_pos == "year")
239 	return say_num_as_year(tok);
240     else if ((token_pos == "digits") ||
241 	     (tok.matches(make_regex("0[0-9]+"))))
242 	return say_as_digits(tok);
243     else if (tok.matches(RXint))
244 	return say_num_as_words(tok);
245     else if (tok.matches(RXintord))
246 	return say_num_as_ordinal(tok.at(0,tok.length()-2));
247     else if (tok.matches(RXintcommaed))  // containing commas at thousands
248     {
249 	if (tok.contains("."))
250 	    return word_it(token,remove_punct(tok.before("."))+
251 			   "."+tok.after("."));
252 	else
253 	    return say_num_as_words(remove_punct(tok));
254     }
255     else if (tok.matches(RXapostropheS))
256     {
257 	return append(word_it(token,tok.at(0,tok.length()-2)),
258 			cons(strintern("'s"),NIL));
259     }
260     else if (tok.matches(numpointnum))
261     {
262 	EST_String afterpoint = tok.after(".");
263 	LISP ap = NIL;
264 	int i;
265 	for (i=0; i < afterpoint.length(); i++)
266 	     ap = append(say_num_as_words(afterpoint.at(i,1)),ap);
267 	return append(say_num_as_words(tok.before(".")),
268 			cons(strintern("point"),reverse(ap)));
269     }
270     else if ((tok.matches(make_regex("[A-Z][A-Z]+"))) &&
271 	     ((!tok.contains(make_regex("[AEIOUY]"))) ||
272 	      ((!tok.contains(make_regex("[^AEIOU][AEIOU][^AEIOU]"))) &&
273 	       (tok.length() < 5))))       // an acronym
274 	return say_as_letters(tok);
275     else if (tok.matches(RXdottedabbrev))
276 	return say_as_letters(remove_punct(tok));
277     else if ((tok.matches(RXalpha)) &&
278 	     !(tok.matches(make_regex(".*[AEIOUYaeiouy].*"))))
279 	return say_as_letters(tok);   // no vowels so spell it
280     else if (tok.matches(RXalpha))   // as is, some sort of word
281 	return cons(strintern(tok),NIL);
282     else if (only_punc(tok))
283 	return stringexplode(tok);
284     else if (tok.contains("-"))
285 	return append(word_it(token,tok.before("-")),
286 		      word_it(token,tok.after("-")));
287     else if (tok.contains(".")) // internet address
288     {
289 	LISP a=NIL;
290 	EST_String tok2 = tok;
291 	for ( ; tok2.contains("."); tok2 = tok2.after("."))
292 	    a = append(a,
293 			 append(word_it(token,tok2.before(".")),
294 				  cons(strintern("dot"),NIL)));
295 	a = append(a,word_it(token,tok2));
296 	return a;
297     }
298     else if (tok.contains("/"))
299 	return append(word_it(token,tok.before("/")),
300 		      cons(strintern("slash"),
301 			   word_it(token,tok.after("/"))));
302     else if (tok.contains("&"))
303 	return append(word_it(token,tok.before("&")),
304 		      cons(strintern("ampersand"),
305 			   word_it(token,tok.after("&"))));
306     else if (tok.contains("_"))
307 	return append(word_it(token,tok.before("_")),
308 		      cons(strintern("underscore"),
309 			   word_it(token,tok.after("_"))));
310     else if (tok.contains("'"))
311 	return word_it(token,tok.before("'")+tok.after("'"));
312     else if (tok.contains("`"))
313 	return append(word_it(token,tok.before("`")),
314 			word_it(token,tok.after("`")));
315     else if (tok.contains("\""))
316 	return append(word_it(token,tok.before("\"")),
317 			word_it(token,tok.after("\"")));
318     else if (tok.contains(","))
319 	return append(word_it(token,tok.before(",")),
320 			word_it(token,tok.after(",")));
321     else if (tok.contains("("))
322 	return append(word_it(token,tok.before("(")),
323 			word_it(token,tok.after("(")));
324     else if (tok.contains(")"))
325 	return append(word_it(token,tok.before(")")),
326 			word_it(token,tok.after(")")));
327     else if (tok.matches(make_regex("^[^a-zA-Z].+"))) // incrementally remove
328 	return append(say_as_letters(tok.at(0,1)),// num/symbol from front
329 			word_it(token,tok.at(1,tok.length()-1)));
330     else if (tok.matches(make_regex(".+[^a-zA-Z]$"))) // incrementally remove rear
331 	return append(word_it(token,tok.at(0,tok.length()-1)),
332 			say_as_letters(tok.at((int)tok.length()-1,1)));
333     else  // could try harder
334 	return say_as_letters(remove_punct(tok));
335 }
336 
say_as_digits(const EST_String & word)337 static LISP say_as_digits(const EST_String &word)
338 {
339     // Should be string of digits, but I wont require it
340     // This isn't really correct for telephone numbers (oh/zero/double)
341     LISP l;
342     LISP lets = stringexplode(word);
343     LISP let_pos = siod_get_lval("token.letter_pos",NULL);
344 
345     for (l=lets; l != NIL; l=cdr(l))
346     {
347 	if (streq(get_c_string(car(l)),"0"))
348 	    CAR(l) = strintern("zero");
349 	else if (streq(get_c_string(car(l)),"1"))
350 	    CAR(l) = strintern("one");
351 	else if (streq(get_c_string(car(l)),"2"))
352 	    CAR(l) = strintern("two");
353 	else if (streq(get_c_string(car(l)),"3"))
354 	    CAR(l) = strintern("three");
355 	else if (streq(get_c_string(car(l)),"4"))
356 	    CAR(l) = strintern("four");
357 	else if (streq(get_c_string(car(l)),"5"))
358 	    CAR(l) = strintern("five");
359 	else if (streq(get_c_string(car(l)),"6"))
360 	    CAR(l) = strintern("six");
361 	else if (streq(get_c_string(car(l)),"7"))
362 	    CAR(l) = strintern("seven");
363 	else if (streq(get_c_string(car(l)),"8"))
364 	    CAR(l) = strintern("eight");
365 	else if (streq(get_c_string(car(l)),"9"))
366 	    CAR(l) = strintern("nine");
367 	else
368 	    CAR(l) = cons(make_param_lisp("name",car(l)),
369 			  cons(make_param_lisp("pos",let_pos),NIL));
370     }
371 
372     return lets;
373 }
374 
say_as_letters(const EST_String & word)375 static LISP say_as_letters(const EST_String &word)
376 {
377     // Explode letters in word and say them, marking them as nouns
378     // This is particularly designed so that A/a doesn't come out as
379     // the determiner a which in typically schwa'd
380     LISP l;
381     LISP lets = stringexplode(word);
382     LISP let_pos = siod_get_lval("token.letter_pos",NULL);
383 
384     for (l=lets; l != NIL; l=cdr(l))
385     {
386 	EST_String name = EST_String(get_c_string(car(l)));
387 	if (name.matches(make_regex("[0-9]")))
388 	    CAR(l) = car(say_as_digits(get_c_string(car(l))));
389 //	else if (name.matches(make_regex("[^a-zA-Z]")))
390 //	    // Not sure, probably a bug to get here
391 //	    CAR(l) = cons(make_param_str("name","symbol"),
392 //			  cons(make_param_lisp("pos",let_pos),NIL));
393 	else
394 	    CAR(l) = cons(make_param_lisp("name",car(l)),
395 			  cons(make_param_lisp("pos",let_pos),NIL));
396     }
397 
398     return lets;
399 }
400 
only_punc(const EST_String & tok)401 static int only_punc(const EST_String &tok)
402 {
403     // If this token consists solely of punctuation chars
404     // If this is true I'm probably suppose to say some of them
405     int i;
406     EST_String np;
407     const char *tokch = tok;
408 
409     for (i=0; i<tok.length(); i++)
410       if (strchr((const char *)PunctuationChars,tokch[i]) == NULL)
411 	    return FALSE;
412 
413     return TRUE;
414 }
415 
remove_punct(const EST_String & tok)416 static EST_String remove_punct(const EST_String &tok)
417 {
418     EST_String np(tok);
419 
420     np.make_updatable();
421 
422     np.gsub(RXpunctuation, "");
423 
424     return np;
425 }
426 
427 
say_num_as_ordinal(const EST_String & num)428 static LISP say_num_as_ordinal(const EST_String &num)
429 {
430     LISP numwords = num_2_words(atoi(num));
431     LISP last;
432 
433     // Now change the last word to the appropriate ordinal
434     for (last=numwords; cdr(last) != NIL; last=cdr(last));
435     const char *lastword = get_c_string(car(last));
436 
437     if (streq(lastword,"zero"))
438 	CAR(last) = strintern("zeroth");
439     else if (streq(lastword,"one"))
440 	CAR(last) = strintern("first");
441     else if (streq(lastword,"two"))
442 	CAR(last) = strintern("second");
443     else if (streq(lastword,"three"))
444 	CAR(last) = strintern("third");
445     else if (streq(lastword,"four"))
446 	CAR(last) = strintern("fourth");
447     else if (streq(lastword,"five"))
448 	CAR(last) = strintern("fifth");
449     else if (streq(lastword,"six"))
450 	CAR(last) = strintern("sixth");
451     else if (streq(lastword,"seven"))
452 	CAR(last) = strintern("seventh");
453     else if (streq(lastword,"eight"))
454 	CAR(last) = strintern("eighth");
455     else if (streq(lastword,"nine"))
456 	CAR(last) = strintern("ninth");
457     else if (streq(lastword,"ten"))
458 	CAR(last) = strintern("tenth");
459     else if (streq(lastword,"eleven"))
460 	CAR(last) = strintern("eleventh");
461     else if (streq(lastword,"twelve"))
462 	CAR(last) = strintern("twelfth");
463     else if (streq(&lastword[strlen(lastword)-4],"teen"))
464 	CAR(last) = strintern(EST_String(lastword)+"th");
465     else if (streq(&lastword[strlen(lastword)-2],"ty"))
466 	CAR(last) = strintern(EST_String(lastword).before("ty")+"tieth");
467     else if (streq(lastword,"hundred"))
468 	CAR(last) = strintern("hundredth");
469     else if (streq(lastword,"thousand"))
470 	CAR(last) = strintern("thousandth");
471     else if (streq(&lastword[strlen(lastword)-6],"illion"))
472 	CAR(last) = strintern(EST_String(lastword)+"th");
473     else
474     {
475 	// I don't think I've forgotten anything
476 	*cdebug << "Token: can't make ordinal from \"" << lastword
477 	    << "\"" << endl;
478 	CAR(last) = strintern(EST_String(lastword)+"th");
479     }
480 
481     return numwords;
482 }
483 
say_num_as_words(const EST_String & num)484 static LISP say_num_as_words(const EST_String &num)
485 {
486     if (num.length() > 9)
487     {
488 	if (num(0) == '-')
489 	    return cons(strintern("minus"),say_as_digits(num.after("-")));
490 	else
491 	    return say_as_digits(num);
492     }
493     else
494 	return num_2_words(atoi(num));
495 }
496 
say_num_as_year(const EST_String & num)497 static LISP say_num_as_year(const EST_String &num)
498 {
499     int iword = atoi(num);
500 
501     if (num.length() > 4)
502 	return say_num_as_words(num);
503     else if (num.matches(make_regex("00")))
504     {
505 	return cons(strintern("o"),
506 		    cons(strintern("o"),NIL));
507     }
508     else if (num.matches(make_regex("0[0-9]")))
509     {
510 	return cons(strintern("o"),
511 		    num_2_words(iword));
512     }
513     else if (iword < 100)
514 	return num_2_words(iword);
515     else if ((iword % 1000) < 10)
516     {
517 	if ((iword % 1000) == 0)
518 	    return append(num_2_words(iword/1000),
519 			    cons(strintern("thousand"),NIL));
520 	else
521 	    return append(num_2_words(iword/1000),
522 			    cons(strintern("thousand"),
523 				 cons(strintern("and"),
524 				      num_2_words(iword%1000))));
525     }
526     else if ((iword % 100) == 0)
527 	return append(num_2_words(iword/100),
528 			cons(strintern("hundred"),NIL));
529     else if ((iword % 100) < 10)
530 	return append(num_2_words(iword/100),
531 			cons(strintern("o"),
532 			     num_2_words(iword%100)));
533     else
534 	return append(num_2_words(iword/100),
535 			num_2_words(iword%100));
536 }
537 
num_2_words(int iword)538 static LISP num_2_words(int iword)
539 {
540     // Convert number of list of words
541     int tens, units;
542     LISP s_tens, lang_stype=NIL;
543 
544     if (iword < 0)
545 	return cons(strintern("minus"),num_2_words(-iword));
546     else if (iword < 20)
547 	switch (iword) 	{
548 	  case 0: return cons(strintern("zero"),NIL);
549           case 1: return cons(strintern("one"),NIL);
550 	  case 2: return cons(strintern("two"),NIL);
551 	  case 3: return cons(strintern("three"),NIL);
552 	  case 4: return cons(strintern("four"),NIL);
553 	  case 5: return cons(strintern("five"),NIL);
554 	  case 6: return cons(strintern("six"),NIL);
555 	  case 7: return cons(strintern("seven"),NIL);
556 	  case 8: return cons(strintern("eight"),NIL);
557 	  case 9: return cons(strintern("nine"),NIL);
558 	  case 10: return cons(strintern("ten"),NIL);
559 	  case 11: return cons(strintern("eleven"),NIL);
560 	  case 12: return cons(strintern("twelve"),NIL);
561 	  case 13: return cons(strintern("thirteen"),NIL);
562 	  case 14: return cons(strintern("fourteen"),NIL);
563 	  case 15: return cons(strintern("fifteen"),NIL);
564 	  case 16: return cons(strintern("sixteen"),NIL);
565 	  case 17: return cons(strintern("seventeen"),NIL);
566 	  case 18: return cons(strintern("eighteen"),NIL);
567 	  case 19: return cons(strintern("nineteen"),NIL);
568 	  default: return cons(siod_get_lval("token.unknown_word_name",NULL),
569 			       NIL);
570       }
571     else if (iword < 100)
572     {
573 	tens = iword / 10;
574 	units = iword % 10;
575 	switch (tens)
576 	{
577 	  case 2: s_tens = strintern("twenty"); break;
578 	  case 3: s_tens = strintern("thirty"); break;
579 	  case 4: s_tens = strintern("forty"); break;
580 	  case 5: s_tens = strintern("fifty"); break;
581 	  case 6: s_tens = strintern("sixty"); break;
582 	  case 7: s_tens = strintern("seventy"); break;
583 	  case 8: s_tens = strintern("eighty"); break;
584 	  case 9: s_tens = strintern("ninety"); break;
585 	  default: return cons(siod_get_lval("token.unknown_word_name",NULL),
586 			       NIL);
587 	}
588         if (units != 0)
589 	    return cons(s_tens,num_2_words(units));
590 	else
591 	    return cons(s_tens,NIL);
592     }
593     else if (iword < 1000)
594     {
595 	lang_stype = ft_get_param("Language");
596 	if (streq("americanenglish",get_c_string(lang_stype)))
597 	    return append(num_2_words(iword/100),
598 			    cons(strintern("hundred"),
599 				 (((iword % 100) != 0) ?
600 				  num_2_words(iword % 100) :
601 				  NIL)));
602 	else
603 	    return append(num_2_words(iword/100),
604 			    cons(strintern("hundred"),
605 				 (((iword % 100) != 0) ?
606 				  cons(strintern("and"),
607 				       num_2_words(iword % 100)) :
608 				  NIL)));
609     }
610 #if 0
611     // We don't depend on this hack now.
612     else if ((iword > 1910) &&
613 	     (iword < 2000))   // hacky date condition
614 	return append(num_2_words(iword/100),
615 			num_2_words(iword%100));
616 #endif
617     else if (iword < 1000000)
618 	return append(num_2_words(iword/1000),
619 			cons(strintern("thousand"),
620 			     (((iword % 1000) != 0) ?
621 			      ((((iword % 1000)/100) == 0) ?
622 			       cons(strintern("and"),num_2_words(iword % 1000)):
623 			       num_2_words(iword % 1000)) :
624 			      NIL)));
625     else if (iword >= 1000000)
626 	return append(num_2_words(iword/1000000),
627 			cons(strintern("million"),
628 			     (((iword % 1000000) != 0) ?
629 			      num_2_words(iword % 1000000) :
630 			      NIL)));
631     else
632 	return cons(strintern("bignum"),NIL);
633 }
634 
l_word_it(LISP token,LISP tok)635 static LISP l_word_it(LISP token, LISP tok)
636 {
637     // Lisp wrap around for word_it
638     EST_Item *t = get_c_item(token);
639     EST_String tok_name = get_c_string(tok);
640 
641     return builtin_word_it(t,tok_name);
642 }
643 
festival_token_init(void)644 void festival_token_init(void)
645 {
646     festival_def_utt_module("Token_English",FT_English_Token_Utt,
647     "(Token_English UTT)\n\
648   Build a Word stream from the Token stream, for English (American and\n\
649   British English), analyzing compound words, numbers, etc. as tokens\n\
650   into words.");
651     festival_def_utt_module("Token_Welsh",FT_Welsh_Token_Utt,
652     "(Token_Welsh UTT)\n\
653   Build a Word stream from the Token stream, for Welsh, analyzing\n\
654   compound words, numbers etc as tokens into words.");
655     festival_def_utt_module("Token_Spanish",FT_Spanish_Token_Utt,
656     "(Token_Spanish UTT)\n\
657   Build a Word stream from the Token stream, for Castillian Spanish,\n\
658   analyzing compound words, numbers etc as tokens into words.");
659     festival_def_utt_module("Token_Any",FT_Any_Token_Utt,
660     "(Token_Any UTT)\n\
661   Build a Word stream from the Token stream, in a language independent way,\n\
662   which means that all simple tokens should be in the lexicon, or analysed\n\
663   by letter to sound rules.");
664     festival_def_utt_module("Token_POS",FT_Token_POS_Utt,
665     "(Token_POS UTT)\n\
666   Assign feature token_pos to tokens thats match CART trees in the\n\
667   variable token_pos_cart_trees.  These are used for gross level pos\n\
668   such as identifying how numbers should be pronunced.");
669     init_subr_2("builtin_english_token_to_words",l_word_it,
670     "(english_token_to_words TOKENSTREAM TOKENNAME)\n\
671   Returns a list of words expanded from TOKENNAME.  Note that as this\n\
672   function may be called recursively TOKENNAME may not be the name of\n\
673   TOKENSTREAM.");
674 }
675