1 /*************************************************************************/
2 /*                                                                       */
3 /*                  Language Technologies Institute                      */
4 /*                     Carnegie Mellon University                        */
5 /*                        Copyright (c) 1999                             */
6 /*                        All Rights Reserved.                           */
7 /*                                                                       */
8 /*  Permission is hereby granted, free of charge, to use and distribute  */
9 /*  this software and its documentation without restriction, including   */
10 /*  without limitation the rights to use, copy, modify, merge, publish,  */
11 /*  distribute, sublicense, and/or sell copies of this work, and to      */
12 /*  permit persons to whom this work is furnished to do so, subject to   */
13 /*  the following conditions:                                            */
14 /*   1. The code must retain the above copyright notice, this list of    */
15 /*      conditions and the following disclaimer.                         */
16 /*   2. Any modifications must be clearly marked as such.                */
17 /*   3. Original authors' names are not deleted.                         */
18 /*   4. The authors' names are not used to endorse or promote products   */
19 /*      derived from this software without specific prior written        */
20 /*      permission.                                                      */
21 /*                                                                       */
22 /*  CARNEGIE MELLON UNIVERSITY AND THE CONTRIBUTORS TO THIS WORK         */
23 /*  DISCLAIM ALL WARRANTIES WITH REGARD TO THIS SOFTWARE, INCLUDING      */
24 /*  ALL IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS, IN NO EVENT   */
25 /*  SHALL CARNEGIE MELLON UNIVERSITY NOR THE CONTRIBUTORS BE LIABLE      */
26 /*  FOR ANY SPECIAL, INDIRECT OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES    */
27 /*  WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, WHETHER IN   */
28 /*  AN ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION,          */
29 /*  ARISING OUT OF OR IN CONNECTION WITH THE USE OR PERFORMANCE OF       */
30 /*  THIS SOFTWARE.                                                       */
31 /*                                                                       */
32 /*************************************************************************/
33 /*             Author:  Alan W Black (awb@cs.cmu.edu)                    */
34 /*               Date:  July 1999                                        */
35 /*************************************************************************/
36 /*                                                                       */
37 /*  Tokenizer for strings and files                                      */
38 /*                                                                       */
39 /*************************************************************************/
40 
41 /* ----------------------------------------------------------------- */
42 /*           The English TTS System "Flite+hts_engine"               */
43 /*           developed by HTS Working Group                          */
44 /*           http://hts-engine.sourceforge.net/                      */
45 /* ----------------------------------------------------------------- */
46 /*                                                                   */
47 /*  Copyright (c) 2005-2013  Nagoya Institute of Technology          */
48 /*                           Department of Computer Science          */
49 /*                                                                   */
50 /*                2005-2008  Tokyo Institute of Technology           */
51 /*                           Interdisciplinary Graduate School of    */
52 /*                           Science and Engineering                 */
53 /*                                                                   */
54 /* All rights reserved.                                              */
55 /*                                                                   */
56 /* Redistribution and use in source and binary forms, with or        */
57 /* without modification, are permitted provided that the following   */
58 /* conditions are met:                                               */
59 /*                                                                   */
60 /* - Redistributions of source code must retain the above copyright  */
61 /*   notice, this list of conditions and the following disclaimer.   */
62 /* - Redistributions in binary form must reproduce the above         */
63 /*   copyright notice, this list of conditions and the following     */
64 /*   disclaimer in the documentation and/or other materials provided */
65 /*   with the distribution.                                          */
66 /* - Neither the name of the HTS working group nor the names of its  */
67 /*   contributors may be used to endorse or promote products derived */
68 /*   from this software without specific prior written permission.   */
69 /*                                                                   */
70 /* THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND            */
71 /* CONTRIBUTORS "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES,       */
72 /* INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF          */
73 /* MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE          */
74 /* DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS */
75 /* BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,          */
76 /* EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED   */
77 /* TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,     */
78 /* DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON */
79 /* ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,   */
80 /* OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY    */
81 /* OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE           */
82 /* POSSIBILITY OF SUCH DAMAGE.                                       */
83 /* ----------------------------------------------------------------- */
84 
85 #include "cst_tokenstream.h"
86 
87 const cst_string * const cst_ts_default_whitespacesymbols = " \t\n\r";
88 const cst_string * const cst_ts_default_singlecharsymbols = "(){}[]";
89 const cst_string * const cst_ts_default_prepunctuationsymbols = "\"'`({[";
90 const cst_string * const cst_ts_default_postpunctuationsymbols = "\"'`.,:;!?(){}[]";
91 
92 #define TS_BUFFER_SIZE 256
93 #define TS_EOF -1
94 
95 static cst_string ts_getc(cst_tokenstream *ts);
96 
static void set_charclass_table(cst_tokenstream *ts)
{
    /* Rebuild ts->charclass from the four symbol strings held in ts.
       Each table entry is a bitmask, so one character may belong to
       several classes at once. */
    const cst_string *sym;

    memset(ts->charclass,0,256);  /* clear the 256 bytes of the table */

    for (sym = ts->p_whitespacesymbols; *sym; sym++)
        ts->charclass[(unsigned char)*sym] |= TS_CHARCLASS_WHITESPACE;

    for (sym = ts->p_singlecharsymbols; *sym; sym++)
        ts->charclass[(unsigned char)*sym] |= TS_CHARCLASS_SINGLECHAR;

    for (sym = ts->p_prepunctuationsymbols; *sym; sym++)
        ts->charclass[(unsigned char)*sym] |= TS_CHARCLASS_PREPUNCT;

    for (sym = ts->p_postpunctuationsymbols; *sym; sym++)
        ts->charclass[(unsigned char)*sym] |= TS_CHARCLASS_POSTPUNCT;
}
112 
void set_charclasses(cst_tokenstream *ts,
                     const cst_string *whitespace,
                     const cst_string *singlecharsymbols,
                     const cst_string *prepunctuation,
                     const cst_string *postpunctuation)
{
    /* Install the four symbol sets on ts and rebuild its class table.
       A NULL argument selects the matching cst_ts_default_* string.
       Only the pointers are stored; the strings are not copied. */
    if (!whitespace)
        whitespace = cst_ts_default_whitespacesymbols;
    if (!singlecharsymbols)
        singlecharsymbols = cst_ts_default_singlecharsymbols;
    if (!prepunctuation)
        prepunctuation = cst_ts_default_prepunctuationsymbols;
    if (!postpunctuation)
        postpunctuation = cst_ts_default_postpunctuationsymbols;

    ts->p_whitespacesymbols = whitespace;
    ts->p_singlecharsymbols = singlecharsymbols;
    ts->p_prepunctuationsymbols = prepunctuation;
    ts->p_postpunctuationsymbols = postpunctuation;

    set_charclass_table(ts);
}
131 
static void extend_buffer(cst_string **buffer,int *buffer_max)
{
    /* Replace *buffer with a larger one (roughly 20% bigger) that keeps
       the old contents, and update *buffer_max to the new capacity.
       The capacity always grows by at least one element, so callers
       that retry until the buffer is big enough are sure to finish. */
    int grow, new_max;
    cst_string *new_buffer;

    grow = (*buffer_max)/5;
    if (grow < 1)
        grow = 1;   /* capacities below 5 would otherwise never grow */
    new_max = (*buffer_max)+grow;

    new_buffer = cst_alloc(cst_string,new_max);
    if (*buffer)
    {   /* nothing to copy or release if there was no buffer yet */
        if (*buffer_max > 0)
            memmove(new_buffer,*buffer,*buffer_max);
        cst_free(*buffer);
    }
    *buffer = new_buffer;
    *buffer_max = new_max;
}
144 
new_tokenstream(const cst_string * whitespace,const cst_string * singlechars,const cst_string * prepunct,const cst_string * postpunct)145 static cst_tokenstream *new_tokenstream(const cst_string *whitespace,
146 					const cst_string *singlechars,
147 					const cst_string *prepunct,
148 					const cst_string *postpunct)
149 {   /* Constructor function */
150     cst_tokenstream *ts = cst_alloc(cst_tokenstream,1);
151     ts->fd = NULL;
152     ts->file_pos = 0;
153     ts->line_number = 0;
154     ts->string_buffer = NULL;
155     ts->token_pos = 0;
156     ts->whitespace = cst_alloc(cst_string,TS_BUFFER_SIZE);
157     ts->ws_max = TS_BUFFER_SIZE;
158     if (prepunct && prepunct[0])
159     {
160         ts->prepunctuation = cst_alloc(cst_string,TS_BUFFER_SIZE);
161         ts->prep_max = TS_BUFFER_SIZE;
162     }
163     ts->token = cst_alloc(cst_string,TS_BUFFER_SIZE);
164     ts->token_max = TS_BUFFER_SIZE;
165     if (postpunct && postpunct[0])
166     {
167         ts->postpunctuation = cst_alloc(cst_string,TS_BUFFER_SIZE);
168         ts->postp_max = TS_BUFFER_SIZE;
169     }
170 
171     set_charclasses(ts,whitespace,singlechars,prepunct,postpunct);
172     ts->current_char = 0;
173 
174     return ts;
175 }
176 
void delete_tokenstream(cst_tokenstream *ts)
{
    /* Free the token part buffers and then the stream structure itself.
       The input (fd / string_buffer) is not touched here; ts_close()
       deals with that before calling this function. */
    if (ts->postpunctuation)
        cst_free(ts->postpunctuation);
    if (ts->prepunctuation)
        cst_free(ts->prepunctuation);
    cst_free(ts->token);
    cst_free(ts->whitespace);

    cst_free(ts);
}
185 
186 #ifndef FLITE_PLUS_HTS_ENGINE
ts_open(const char * filename,const cst_string * whitespace,const cst_string * singlechars,const cst_string * prepunct,const cst_string * postpunct)187 cst_tokenstream *ts_open(const char *filename,
188 			 const cst_string *whitespace,
189 			 const cst_string *singlechars,
190 			 const cst_string *prepunct,
191 			 const cst_string *postpunct)
192 {
193     cst_tokenstream *ts = new_tokenstream(whitespace,
194 					  singlechars,
195 					  prepunct,
196 					  postpunct);
197 
198 #ifndef UNDER_CE
199     if (cst_streq("-",filename))
200 	ts->fd = stdin;
201     else
202 #endif
203 	ts->fd = cst_fopen(filename,CST_OPEN_READ|CST_OPEN_BINARY);
204     ts_getc(ts);
205 
206     if (ts->fd == NULL)
207     {
208 	delete_tokenstream(ts);
209 	return NULL;
210     }
211     else
212 	return ts;
213 }
214 #endif /* !FLITE_PLUS_HTS_ENGINE*/
215 
ts_open_string(const cst_string * string,const cst_string * whitespace,const cst_string * singlechars,const cst_string * prepunct,const cst_string * postpunct)216 cst_tokenstream *ts_open_string(const cst_string *string,
217 				const cst_string *whitespace,
218 				const cst_string *singlechars,
219 				const cst_string *prepunct,
220 				const cst_string *postpunct)
221 {
222     cst_tokenstream *ts = new_tokenstream(whitespace,
223 					  singlechars,
224 					  prepunct,
225 					  postpunct);
226 
227     ts->string_buffer = cst_strdup(string);
228     ts_getc(ts);
229 
230     return ts;
231 }
232 
void ts_close(cst_tokenstream *ts)
{
    /* Release whatever input the stream owns (string copy and/or file),
       then destroy the stream itself via delete_tokenstream(). */
    if (ts->string_buffer)
    {
        cst_free(ts->string_buffer);
        ts->string_buffer = NULL;
    }

    if (ts->fd)
    {
#ifndef FLITE_PLUS_HTS_ENGINE
#ifndef UNDER_CE
        /* stdin was not opened by us, so leave it open */
        if (ts->fd != stdin)
#endif
            cst_fclose(ts->fd);
#endif /* !FLITE_PLUS_HTS_ENGINE */
        ts->fd = NULL; /* just in case close gets called twice */
    }

    delete_tokenstream(ts);
}
252 
static void get_token_sub_part(cst_tokenstream *ts,
                               int charclass,
                               cst_string **buffer,
                               int *buffer_max)
{
    /* Starting at the current character, copy into *buffer the run of
       characters that are in charclass and are not single character
       symbols, then '\0'-terminate it.  The stream is left positioned
       on the first character that did not qualify (or at end of
       input).  *buffer is grown through extend_buffer() as needed. */
    int p;

    for (p=0; ((ts->current_char != TS_EOF) &&
               (ts_charclass(ts->current_char,charclass,ts)) &&
               (!ts_charclass(ts->current_char,
                              TS_CHARCLASS_SINGLECHAR,ts))); p++)
    {
        /* Keep one slot spare so the terminating '\0' written after
           the loop always lands inside the buffer */
        if (p+1 >= *buffer_max) extend_buffer(buffer,buffer_max);
        (*buffer)[p] = ts->current_char;
        ts_getc(ts);
    }
    (*buffer)[p] = '\0';
}
271 
/* Can't afford to dynamically generate this char class, so have a separate function */
static void get_token_sub_part_2(cst_tokenstream *ts,
                                 int endclass1,
                                 cst_string **buffer,
                                 int *buffer_max)
{
    /* Starting at the current character, copy characters into *buffer
       until one is reached that is in endclass1 or is a single
       character symbol (or the input ends), then '\0'-terminate the
       result.  The terminating character is left as current_char.
       *buffer is grown through extend_buffer() as needed. */
    int p;

    for (p=0; ((ts->current_char != TS_EOF) &&
               (!ts_charclass(ts->current_char,endclass1,ts)) &&
               (!ts_charclass(ts->current_char,
                              TS_CHARCLASS_SINGLECHAR,ts)));
         p++)
    {
        /* Keep one slot spare so the terminating '\0' written after
           the loop always lands inside the buffer */
        if (p+1 >= *buffer_max) extend_buffer(buffer,buffer_max);
        (*buffer)[p] = ts->current_char;
        ts_getc(ts);
    }
    (*buffer)[p] = '\0';
}
292 
static void get_token_postpunctuation(cst_tokenstream *ts)
{
    /* Split trailing post-punctuation off ts->token: the trailing run
       of post-punctuation characters is moved to ts->postpunctuation
       and the token is truncated in front of it.  The first character
       of the token is never moved, so a token made up of punctuation
       only keeps its first character. */
    int p,t,n,old_max;

    t = cst_strlen(ts->token);
    /* Walk back from the terminator to the last character that stays
       in the token */
    for (p=t;
         (p > 0) &&
             ((ts->token[p] == '\0') ||
              (ts_charclass(ts->token[p],TS_CHARCLASS_POSTPUNCT,ts)));
         p--);

    if (t == p)
    {   /* Empty token: make sure the punctuation of an earlier token
           is not reported again */
        if (ts->postpunctuation)
            ts->postpunctuation[0] = '\0';
        return;
    }

    n = t-p;  /* characters after token[p], plus the terminating '\0' */

    /* One extension adds only about 20%, so keep extending until the
       whole run fits */
    while (n >= ts->postp_max)
    {
        old_max = ts->postp_max;
        extend_buffer(&ts->postpunctuation,&ts->postp_max);
        if (ts->postp_max <= old_max)
            return;  /* buffer cannot grow; leave the token untouched */
    }

    /* Copy postpunctuation from token */
    memmove(ts->postpunctuation,&ts->token[p+1],n);
    /* truncate token at postpunctuation */
    ts->token[p+1] = '\0';
}
314 
int ts_eof(cst_tokenstream *ts)
{
    /* TRUE once the stream's current character is the end-of-input
       marker, FALSE otherwise */
    return (ts->current_char == TS_EOF) ? TRUE : FALSE;
}
322 
int ts_set_stream_pos(cst_tokenstream *ts, int pos)
{
    /* Move the stream to position pos and return the position that was
       actually recorded in ts->file_pos.  For string streams pos is
       clamped to the range [0, length of the string].  current_char is
       reset to a space afterwards. */
    /* Note this doesn't preserve line_number */
    int new_pos, l;

    if (ts->fd)
    {
#ifdef FLITE_PLUS_HTS_ENGINE
        /* No file seeking in this build: keep the recorded position
           rather than storing an uninitialized value */
        new_pos = ts->file_pos;
#else
        new_pos = (int)cst_fseek(ts->fd,(long)pos,CST_SEEK_ABSOLUTE);
#endif /* !FLITE_PLUS_HTS_ENGINE */
    }
    else if (ts->string_buffer)
    {
        l = cst_strlen(ts->string_buffer);
        if (pos > l)
            new_pos = l;
        else if (pos < 0)
            new_pos = 0;
        else
            new_pos = pos;
    }
    else
        new_pos = pos;  /* not sure it can get here */
    ts->file_pos = new_pos;
    ts->current_char = ' ';  /* To be safe */

    return ts->file_pos;
}
351 
ts_get_stream_pos(cst_tokenstream * ts)352 int ts_get_stream_pos(cst_tokenstream *ts)
353 {
354     return ts->file_pos;
355 }
356 
ts_getc(cst_tokenstream * ts)357 static cst_string ts_getc(cst_tokenstream *ts)
358 {
359     if (ts->fd)
360     {
361 #ifndef FLITE_PLUS_HTS_ENGINE
362 	ts->current_char = cst_fgetc(ts->fd);
363 #endif /* !FLITE_PLUS_HTS_ENGINE */
364     }
365     else if (ts->string_buffer)
366     {
367 	if (ts->string_buffer[ts->file_pos] == '\0')
368 	    ts->current_char = TS_EOF;
369 	else
370 	    ts->current_char = ts->string_buffer[ts->file_pos];
371     }
372 
373     if (ts->current_char != TS_EOF)
374 	ts->file_pos++;
375     if (ts->current_char == '\n')
376 	ts->line_number++;
377     return ts->current_char;
378 }
379 
ts_get_quoted_token(cst_tokenstream * ts,char quote,char escape)380 const cst_string *ts_get_quoted_token(cst_tokenstream *ts,
381 					 char quote,
382 					 char escape)
383 {
384     /* for reading the next quoted token that starts with quote and
385        ends with quote, quote may appear only if preceded by escape */
386     int l, p;
387 
388     /* Hmm can't change quotes within a ts */
389     ts->charclass[(unsigned int)quote] |= TS_CHARCLASS_QUOTE;
390     ts->charclass[(unsigned int)escape] |= TS_CHARCLASS_QUOTE;
391 
392     /* skipping whitespace */
393     get_token_sub_part(ts,TS_CHARCLASS_WHITESPACE,
394 		       &ts->whitespace,
395 		       &ts->ws_max);
396     ts->token_pos = ts->file_pos - 1;
397 
398     if (ts->current_char == quote)
399     {   /* go until quote */
400 	ts_getc(ts);
401 	l=0;
402         for (p=0; ((ts->current_char != TS_EOF) &&
403                    (ts->current_char != quote));
404              p++)
405         {
406             if (p >= ts->token_max)
407                 extend_buffer(&ts->token,&ts->token_max);
408             ts->token[p] = ts->current_char;
409             ts_getc(ts);
410             if (ts->current_char == escape)
411             {
412                 ts_get(ts);
413                 if (p >= ts->token_max)
414                     extend_buffer(&ts->token,&ts->token_max);
415                 ts->token[p] = ts->current_char;
416                 ts_get(ts);
417             }
418         }
419         ts->token[p] = '\0';
420 	ts_getc(ts);
421     }
422     else /* its not quotes, like to be careful dont you */
423     {    /* treat is as standard token                  */
424 	/* Get prepunctuation */
425 	get_token_sub_part(ts,TS_CHARCLASS_PREPUNCT,
426 			   &ts->prepunctuation,
427 			   &ts->prep_max);
428 	/* Get the symbol itself */
429 	if (!ts_charclass(ts->current_char,TS_CHARCLASS_SINGLECHAR,ts))
430 	{
431 	    if (2 >= ts->token_max) extend_buffer(&ts->token,&ts->token_max);
432 	    ts->token[0] = ts->current_char;
433 	    ts->token[1] = '\0';
434 	    ts_getc(ts);
435 	}
436 	else
437 	    get_token_sub_part_2(ts,
438 				 TS_CHARCLASS_WHITESPACE,    /* end class1 */
439 				 &ts->token,
440 				 &ts->token_max);
441 	/* This'll have token *plus* post punctuation in ts->token */
442 	/* Get postpunctuation */
443 	get_token_postpunctuation(ts);
444     }
445 
446     return ts->token;
447 }
448 
ts_get(cst_tokenstream * ts)449 const cst_string *ts_get(cst_tokenstream *ts)
450 {
451     /* Get next token */
452 
453     /* Skip whitespace */
454     get_token_sub_part(ts,
455 		       TS_CHARCLASS_WHITESPACE,
456 		       &ts->whitespace,
457 		       &ts->ws_max);
458 
459     /* quoted strings currently ignored */
460     ts->token_pos = ts->file_pos - 1;
461 
462     /* Get prepunctuation */
463     if (ts->current_char != TS_EOF &&
464         ts_charclass(ts->current_char,TS_CHARCLASS_PREPUNCT,ts))
465 	get_token_sub_part(ts,
466 			   TS_CHARCLASS_PREPUNCT,
467 			   &ts->prepunctuation,
468 			   &ts->prep_max);
469     else if (ts->prepunctuation)
470 	ts->prepunctuation[0] = '\0';
471     /* Get the symbol itself */
472     if (ts->current_char != TS_EOF &&
473         ts_charclass(ts->current_char,TS_CHARCLASS_SINGLECHAR,ts))
474     {
475 	if (2 >= ts->token_max) extend_buffer(&ts->token,&ts->token_max);
476 	ts->token[0] = ts->current_char;
477 	ts->token[1] = '\0';
478 	ts_getc(ts);
479     }
480     else
481 	get_token_sub_part_2(ts,
482 			     TS_CHARCLASS_WHITESPACE,       /* end class1 */
483 			     &ts->token,
484 			     &ts->token_max);
485     /* This'll have token *plus* post punctuation in ts->token */
486     /* Get postpunctuation */
487     if (ts->p_postpunctuationsymbols[0])
488         get_token_postpunctuation(ts);
489 
490     return ts->token;
491 }
492 
int ts_read(void *buff, int size, int num, cst_tokenstream *ts)
{
    /* Fill buff with num items of size characters each, taken one at a
       time with ts_getc().  End of input is not detected: the value
       ts_getc() returns at that point is stored like any other
       character.  Returns num (0 if num is not positive). */
    /* people should complain about the speed here */
    /* people will complain about EOF as end of file */
    cst_string *out = (cst_string *)buff;
    int item = 0;
    int byte;

    while (item < num)
    {
        for (byte = 0; byte < size; byte++)
            *out++ = ts_getc(ts);
        item++;
    }

    return item;
}
508