1 /*************************************************************************/
2 /* */
3 /* Language Technologies Institute */
4 /* Carnegie Mellon University */
5 /* Copyright (c) 1999 */
6 /* All Rights Reserved. */
7 /* */
8 /* Permission is hereby granted, free of charge, to use and distribute */
9 /* this software and its documentation without restriction, including */
10 /* without limitation the rights to use, copy, modify, merge, publish, */
11 /* distribute, sublicense, and/or sell copies of this work, and to */
12 /* permit persons to whom this work is furnished to do so, subject to */
13 /* the following conditions: */
14 /* 1. The code must retain the above copyright notice, this list of */
15 /* conditions and the following disclaimer. */
16 /* 2. Any modifications must be clearly marked as such. */
17 /* 3. Original authors' names are not deleted. */
18 /* 4. The authors' names are not used to endorse or promote products */
19 /* derived from this software without specific prior written */
20 /* permission. */
21 /* */
22 /* CARNEGIE MELLON UNIVERSITY AND THE CONTRIBUTORS TO THIS WORK */
23 /* DISCLAIM ALL WARRANTIES WITH REGARD TO THIS SOFTWARE, INCLUDING */
24 /* ALL IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS, IN NO EVENT */
25 /* SHALL CARNEGIE MELLON UNIVERSITY NOR THE CONTRIBUTORS BE LIABLE */
26 /* FOR ANY SPECIAL, INDIRECT OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES */
27 /* WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, WHETHER IN */
28 /* AN ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION, */
29 /* ARISING OUT OF OR IN CONNECTION WITH THE USE OR PERFORMANCE OF */
30 /* THIS SOFTWARE. */
31 /* */
32 /*************************************************************************/
33 /* Author: Alan W Black (awb@cs.cmu.edu) */
34 /* Date: July 1999 */
35 /*************************************************************************/
36 /* */
37 /* Tokenizer for strings and files */
38 /* */
39 /*************************************************************************/
40
41 /* ----------------------------------------------------------------- */
42 /* The English TTS System "Flite+hts_engine" */
43 /* developed by HTS Working Group */
44 /* http://hts-engine.sourceforge.net/ */
45 /* ----------------------------------------------------------------- */
46 /* */
47 /* Copyright (c) 2005-2013 Nagoya Institute of Technology */
48 /* Department of Computer Science */
49 /* */
50 /* 2005-2008 Tokyo Institute of Technology */
51 /* Interdisciplinary Graduate School of */
52 /* Science and Engineering */
53 /* */
54 /* All rights reserved. */
55 /* */
56 /* Redistribution and use in source and binary forms, with or */
57 /* without modification, are permitted provided that the following */
58 /* conditions are met: */
59 /* */
60 /* - Redistributions of source code must retain the above copyright */
61 /* notice, this list of conditions and the following disclaimer. */
62 /* - Redistributions in binary form must reproduce the above */
63 /* copyright notice, this list of conditions and the following */
64 /* disclaimer in the documentation and/or other materials provided */
65 /* with the distribution. */
66 /* - Neither the name of the HTS working group nor the names of its */
67 /* contributors may be used to endorse or promote products derived */
68 /* from this software without specific prior written permission. */
69 /* */
70 /* THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND */
71 /* CONTRIBUTORS "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, */
72 /* INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF */
73 /* MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE */
74 /* DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS */
75 /* BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, */
76 /* EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED */
77 /* TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, */
78 /* DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON */
79 /* ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, */
80 /* OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY */
81 /* OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE */
82 /* POSSIBILITY OF SUCH DAMAGE. */
83 /* ----------------------------------------------------------------- */
84
85 #include "cst_tokenstream.h"
86
87 const cst_string * const cst_ts_default_whitespacesymbols = " \t\n\r";
88 const cst_string * const cst_ts_default_singlecharsymbols = "(){}[]";
89 const cst_string * const cst_ts_default_prepunctuationsymbols = "\"'`({[";
90 const cst_string * const cst_ts_default_postpunctuationsymbols = "\"'`.,:;!?(){}[]";
91
92 #define TS_BUFFER_SIZE 256
93 #define TS_EOF -1
94
95 static cst_string ts_getc(cst_tokenstream *ts);
96
/* Rebuild ts->charclass from the four symbol strings currently attached
   to the stream.  The table is cleared first, then every character that
   appears in a symbol string gets the matching class bit OR-ed in, so a
   character listed in several strings ends up with several bits set. */
static void set_charclass_table(cst_tokenstream *ts)
{
    const cst_string *sym;

    memset(ts->charclass,0,256);  /* no character belongs to any class yet */

    /* Index through unsigned char so that high-bit characters do not
       produce a negative subscript */
    for (sym = ts->p_whitespacesymbols; *sym; sym++)
        ts->charclass[(unsigned char)*sym] |= TS_CHARCLASS_WHITESPACE;

    for (sym = ts->p_singlecharsymbols; *sym; sym++)
        ts->charclass[(unsigned char)*sym] |= TS_CHARCLASS_SINGLECHAR;

    for (sym = ts->p_prepunctuationsymbols; *sym; sym++)
        ts->charclass[(unsigned char)*sym] |= TS_CHARCLASS_PREPUNCT;

    for (sym = ts->p_postpunctuationsymbols; *sym; sym++)
        ts->charclass[(unsigned char)*sym] |= TS_CHARCLASS_POSTPUNCT;
}
112
/* Attach the four symbol sets to a token stream and rebuild its character
   class table.  A NULL argument selects the corresponding
   cst_ts_default_* string.  The pointers are stored as given (not
   copied). */
void set_charclasses(cst_tokenstream *ts,
                     const cst_string *whitespace,
                     const cst_string *singlecharsymbols,
                     const cst_string *prepunctuation,
                     const cst_string *postpunctuation)
{
    if (!whitespace)
        whitespace = cst_ts_default_whitespacesymbols;
    if (!singlecharsymbols)
        singlecharsymbols = cst_ts_default_singlecharsymbols;
    if (!prepunctuation)
        prepunctuation = cst_ts_default_prepunctuationsymbols;
    if (!postpunctuation)
        postpunctuation = cst_ts_default_postpunctuationsymbols;

    ts->p_whitespacesymbols = whitespace;
    ts->p_singlecharsymbols = singlecharsymbols;
    ts->p_prepunctuationsymbols = prepunctuation;
    ts->p_postpunctuationsymbols = postpunctuation;

    /* The lookup table is derived from the strings just stored */
    set_charclass_table(ts);
}
131
/* Grow *buffer, preserving its contents, and update *buffer_max to the new
   capacity.  The capacity grows by 20%, but never by fewer than 16
   characters: the bare 20% rule yields no growth at all for capacities
   below 5 (in particular 0, which is what a never-allocated buffer has),
   and callers write into the buffer immediately after this returns.
   *buffer may be NULL, in which case a fresh buffer is allocated. */
static void extend_buffer(cst_string **buffer,int *buffer_max)
{
    int old_max, new_max;
    cst_string *new_buffer;

    /* Treat a missing buffer or a nonsensical capacity as "empty" */
    old_max = ((*buffer != NULL) && (*buffer_max > 0)) ? *buffer_max : 0;

    new_max = old_max + old_max/5;
    if (new_max < old_max + 16)
        new_max = old_max + 16;

    new_buffer = cst_alloc(cst_string,new_max);
    if (old_max > 0)
        memmove(new_buffer,*buffer,old_max);
    if (*buffer != NULL)
        cst_free(*buffer);
    *buffer = new_buffer;
    *buffer_max = new_max;
}
144
new_tokenstream(const cst_string * whitespace,const cst_string * singlechars,const cst_string * prepunct,const cst_string * postpunct)145 static cst_tokenstream *new_tokenstream(const cst_string *whitespace,
146 const cst_string *singlechars,
147 const cst_string *prepunct,
148 const cst_string *postpunct)
149 { /* Constructor function */
150 cst_tokenstream *ts = cst_alloc(cst_tokenstream,1);
151 ts->fd = NULL;
152 ts->file_pos = 0;
153 ts->line_number = 0;
154 ts->string_buffer = NULL;
155 ts->token_pos = 0;
156 ts->whitespace = cst_alloc(cst_string,TS_BUFFER_SIZE);
157 ts->ws_max = TS_BUFFER_SIZE;
158 if (prepunct && prepunct[0])
159 {
160 ts->prepunctuation = cst_alloc(cst_string,TS_BUFFER_SIZE);
161 ts->prep_max = TS_BUFFER_SIZE;
162 }
163 ts->token = cst_alloc(cst_string,TS_BUFFER_SIZE);
164 ts->token_max = TS_BUFFER_SIZE;
165 if (postpunct && postpunct[0])
166 {
167 ts->postpunctuation = cst_alloc(cst_string,TS_BUFFER_SIZE);
168 ts->postp_max = TS_BUFFER_SIZE;
169 }
170
171 set_charclasses(ts,whitespace,singlechars,prepunct,postpunct);
172 ts->current_char = 0;
173
174 return ts;
175 }
176
/* Release the stream's text buffers and the stream structure itself.
   The input source (fd / string_buffer) is not touched here. */
void delete_tokenstream(cst_tokenstream *ts)
{
    /* The punctuation buffers are optional and may never have been
       allocated */
    if (ts->postpunctuation != NULL)
        cst_free(ts->postpunctuation);
    if (ts->prepunctuation != NULL)
        cst_free(ts->prepunctuation);

    cst_free(ts->token);
    cst_free(ts->whitespace);

    cst_free(ts);
}
185
186 #ifndef FLITE_PLUS_HTS_ENGINE
ts_open(const char * filename,const cst_string * whitespace,const cst_string * singlechars,const cst_string * prepunct,const cst_string * postpunct)187 cst_tokenstream *ts_open(const char *filename,
188 const cst_string *whitespace,
189 const cst_string *singlechars,
190 const cst_string *prepunct,
191 const cst_string *postpunct)
192 {
193 cst_tokenstream *ts = new_tokenstream(whitespace,
194 singlechars,
195 prepunct,
196 postpunct);
197
198 #ifndef UNDER_CE
199 if (cst_streq("-",filename))
200 ts->fd = stdin;
201 else
202 #endif
203 ts->fd = cst_fopen(filename,CST_OPEN_READ|CST_OPEN_BINARY);
204 ts_getc(ts);
205
206 if (ts->fd == NULL)
207 {
208 delete_tokenstream(ts);
209 return NULL;
210 }
211 else
212 return ts;
213 }
214 #endif /* !FLITE_PLUS_HTS_ENGINE*/
215
ts_open_string(const cst_string * string,const cst_string * whitespace,const cst_string * singlechars,const cst_string * prepunct,const cst_string * postpunct)216 cst_tokenstream *ts_open_string(const cst_string *string,
217 const cst_string *whitespace,
218 const cst_string *singlechars,
219 const cst_string *prepunct,
220 const cst_string *postpunct)
221 {
222 cst_tokenstream *ts = new_tokenstream(whitespace,
223 singlechars,
224 prepunct,
225 postpunct);
226
227 ts->string_buffer = cst_strdup(string);
228 ts_getc(ts);
229
230 return ts;
231 }
232
/* Close a token stream: release its input source (file handle or string
   copy) and then free the stream itself.  stdin is never closed.  The
   pointer must not be used after this call, since the structure is freed
   by delete_tokenstream(). */
void ts_close(cst_tokenstream *ts)
{
    if (ts->fd != NULL)
    {
#ifndef FLITE_PLUS_HTS_ENGINE
#ifdef UNDER_CE
        cst_fclose(ts->fd);
#else
        if (ts->fd != stdin)
            cst_fclose(ts->fd);
#endif
#endif /* !FLITE_PLUS_HTS_ENGINE */
        ts->fd = NULL;
    }

    if (ts->string_buffer != NULL)
    {
        cst_free(ts->string_buffer);
        ts->string_buffer = NULL;
    }

    delete_tokenstream(ts);
}
252
/* Consume the run of characters, starting at the current character, that
   belong to `charclass` and are not single-character symbols, storing
   them NUL-terminated in *buffer.  The buffer is grown through
   extend_buffer() as needed; *buffer and *buffer_max are updated.
   On return the current character is the first one not consumed. */
static void get_token_sub_part(cst_tokenstream *ts,
                               int charclass,
                               cst_string **buffer,
                               int *buffer_max)
{
    int p;

    for (p=0; ((ts->current_char != TS_EOF) &&
               (ts_charclass(ts->current_char,charclass,ts)) &&
               (!ts_charclass(ts->current_char,
                              TS_CHARCLASS_SINGLECHAR,ts))); p++)
    {
        /* Grow while one slot is still free: the terminator written after
           the loop lands at index p+1 at the latest, so checking only
           `p >= max` would let it fall one past the end of the buffer */
        if (p+1 >= *buffer_max) extend_buffer(buffer,buffer_max);
        (*buffer)[p] = ts->current_char;
        ts_getc(ts);
    }
    /* An optional buffer that was never allocated stays NULL when nothing
       was consumed; there is nowhere to put a terminator then */
    if (*buffer != NULL)
        (*buffer)[p] = '\0';
}
271
272 /* Can't afford dynamically generate this char class so have separater func */
get_token_sub_part_2(cst_tokenstream * ts,int endclass1,cst_string ** buffer,int * buffer_max)273 static void get_token_sub_part_2(cst_tokenstream *ts,
274 int endclass1,
275 cst_string **buffer,
276 int *buffer_max)
277 {
278 int p;
279
280 for (p=0; ((ts->current_char != TS_EOF) &&
281 (!ts_charclass(ts->current_char,endclass1,ts)) &&
282 (!ts_charclass(ts->current_char,
283 TS_CHARCLASS_SINGLECHAR,ts)));
284 p++)
285 {
286 if (p >= *buffer_max) extend_buffer(buffer,buffer_max);
287 (*buffer)[p] = ts->current_char;
288 ts_getc(ts);
289 }
290 (*buffer)[p] = '\0';
291 }
292
/* Split trailing post-punctuation off ts->token into ts->postpunctuation.
   The scan runs backwards from the end of the token and never removes the
   first character, so a token made up entirely of punctuation keeps its
   first character as the token.  For an empty token the postpunctuation
   is cleared, so a value left over from an earlier token is not reported
   again. */
static void get_token_postpunctuation(cst_tokenstream *ts)
{
    int p,t,len,before;

    t = cst_strlen(ts->token);
    for (p=t;
         (p > 0) &&
             ((ts->token[p] == '\0') ||
              (ts_charclass(ts->token[p],TS_CHARCLASS_POSTPUNCT,ts)));
         p--);

    if (t == p)
    {   /* only reachable with an empty token */
        if (ts->postpunctuation != NULL)
            ts->postpunctuation[0] = '\0';
        return;
    }

    len = t-p;  /* characters after token[p], terminating NUL included */

    /* One extension step may be smaller than the punctuation run, so keep
       growing until it fits.  Stop if the buffer fails to grow, rather
       than looping forever or copying past its end; the token is then
       left unsplit. */
    while (len >= ts->postp_max)
    {
        before = ts->postp_max;
        extend_buffer(&ts->postpunctuation,&ts->postp_max);
        if (ts->postp_max <= before)
            return;
    }

    /* Copy postpunctuation (and its NUL) from token */
    memmove(ts->postpunctuation,&ts->token[p+1],len);
    /* truncate token at postpunctuation */
    ts->token[p+1] = '\0';
}
314
/* Report whether the stream is at end of input, i.e. whether the current
   character is the TS_EOF marker.  Returns TRUE or FALSE. */
int ts_eof(cst_tokenstream *ts)
{
    return (ts->current_char == TS_EOF) ? TRUE : FALSE;
}
322
/* Move the stream to position `pos` and return the position recorded in
   ts->file_pos afterwards.
   - File streams: the position is whatever cst_fseek() returns.  In a
     FLITE_PLUS_HTS_ENGINE build no seek is performed and the recorded
     position is left unchanged.
   - String streams: `pos` is clamped to [0, length of the string].
   The current character is reset to a space, so the next token read
   starts by skipping it as whitespace (with the default symbol sets).
   Note this doesn't preserve line_number. */
int ts_set_stream_pos(cst_tokenstream *ts, int pos)
{
    /* Start from the present position so that a branch which does nothing
       (file stream in an HTS build) does not store an indeterminate
       value */
    int new_pos = ts->file_pos;
    int l;

    if (ts->fd)
    {
#ifndef FLITE_PLUS_HTS_ENGINE
        /* NOTE(review): this assumes cst_fseek() returns the resulting
           offset rather than a status code; confirm against its
           definition */
        new_pos = (int)cst_fseek(ts->fd,(long)pos,CST_SEEK_ABSOLUTE);
#endif /* !FLITE_PLUS_HTS_ENGINE */
    }
    else if (ts->string_buffer)
    {
        l = cst_strlen(ts->string_buffer);
        if (pos > l)
            new_pos = l;
        else if (pos < 0)
            new_pos = 0;
        else
            new_pos = pos;
    }
    else
        new_pos = pos;  /* not sure it can get here */

    ts->file_pos = new_pos;
    ts->current_char = ' ';  /* To be safe */

    return ts->file_pos;
}
351
/* Return the stream's recorded position (ts->file_pos), which ts_getc()
   advances by one for every character it reads. */
int ts_get_stream_pos(cst_tokenstream *ts)
{
    int position = ts->file_pos;

    return position;
}
356
ts_getc(cst_tokenstream * ts)357 static cst_string ts_getc(cst_tokenstream *ts)
358 {
359 if (ts->fd)
360 {
361 #ifndef FLITE_PLUS_HTS_ENGINE
362 ts->current_char = cst_fgetc(ts->fd);
363 #endif /* !FLITE_PLUS_HTS_ENGINE */
364 }
365 else if (ts->string_buffer)
366 {
367 if (ts->string_buffer[ts->file_pos] == '\0')
368 ts->current_char = TS_EOF;
369 else
370 ts->current_char = ts->string_buffer[ts->file_pos];
371 }
372
373 if (ts->current_char != TS_EOF)
374 ts->file_pos++;
375 if (ts->current_char == '\n')
376 ts->line_number++;
377 return ts->current_char;
378 }
379
ts_get_quoted_token(cst_tokenstream * ts,char quote,char escape)380 const cst_string *ts_get_quoted_token(cst_tokenstream *ts,
381 char quote,
382 char escape)
383 {
384 /* for reading the next quoted token that starts with quote and
385 ends with quote, quote may appear only if preceded by escape */
386 int l, p;
387
388 /* Hmm can't change quotes within a ts */
389 ts->charclass[(unsigned int)quote] |= TS_CHARCLASS_QUOTE;
390 ts->charclass[(unsigned int)escape] |= TS_CHARCLASS_QUOTE;
391
392 /* skipping whitespace */
393 get_token_sub_part(ts,TS_CHARCLASS_WHITESPACE,
394 &ts->whitespace,
395 &ts->ws_max);
396 ts->token_pos = ts->file_pos - 1;
397
398 if (ts->current_char == quote)
399 { /* go until quote */
400 ts_getc(ts);
401 l=0;
402 for (p=0; ((ts->current_char != TS_EOF) &&
403 (ts->current_char != quote));
404 p++)
405 {
406 if (p >= ts->token_max)
407 extend_buffer(&ts->token,&ts->token_max);
408 ts->token[p] = ts->current_char;
409 ts_getc(ts);
410 if (ts->current_char == escape)
411 {
412 ts_get(ts);
413 if (p >= ts->token_max)
414 extend_buffer(&ts->token,&ts->token_max);
415 ts->token[p] = ts->current_char;
416 ts_get(ts);
417 }
418 }
419 ts->token[p] = '\0';
420 ts_getc(ts);
421 }
422 else /* its not quotes, like to be careful dont you */
423 { /* treat is as standard token */
424 /* Get prepunctuation */
425 get_token_sub_part(ts,TS_CHARCLASS_PREPUNCT,
426 &ts->prepunctuation,
427 &ts->prep_max);
428 /* Get the symbol itself */
429 if (!ts_charclass(ts->current_char,TS_CHARCLASS_SINGLECHAR,ts))
430 {
431 if (2 >= ts->token_max) extend_buffer(&ts->token,&ts->token_max);
432 ts->token[0] = ts->current_char;
433 ts->token[1] = '\0';
434 ts_getc(ts);
435 }
436 else
437 get_token_sub_part_2(ts,
438 TS_CHARCLASS_WHITESPACE, /* end class1 */
439 &ts->token,
440 &ts->token_max);
441 /* This'll have token *plus* post punctuation in ts->token */
442 /* Get postpunctuation */
443 get_token_postpunctuation(ts);
444 }
445
446 return ts->token;
447 }
448
ts_get(cst_tokenstream * ts)449 const cst_string *ts_get(cst_tokenstream *ts)
450 {
451 /* Get next token */
452
453 /* Skip whitespace */
454 get_token_sub_part(ts,
455 TS_CHARCLASS_WHITESPACE,
456 &ts->whitespace,
457 &ts->ws_max);
458
459 /* quoted strings currently ignored */
460 ts->token_pos = ts->file_pos - 1;
461
462 /* Get prepunctuation */
463 if (ts->current_char != TS_EOF &&
464 ts_charclass(ts->current_char,TS_CHARCLASS_PREPUNCT,ts))
465 get_token_sub_part(ts,
466 TS_CHARCLASS_PREPUNCT,
467 &ts->prepunctuation,
468 &ts->prep_max);
469 else if (ts->prepunctuation)
470 ts->prepunctuation[0] = '\0';
471 /* Get the symbol itself */
472 if (ts->current_char != TS_EOF &&
473 ts_charclass(ts->current_char,TS_CHARCLASS_SINGLECHAR,ts))
474 {
475 if (2 >= ts->token_max) extend_buffer(&ts->token,&ts->token_max);
476 ts->token[0] = ts->current_char;
477 ts->token[1] = '\0';
478 ts_getc(ts);
479 }
480 else
481 get_token_sub_part_2(ts,
482 TS_CHARCLASS_WHITESPACE, /* end class1 */
483 &ts->token,
484 &ts->token_max);
485 /* This'll have token *plus* post punctuation in ts->token */
486 /* Get postpunctuation */
487 if (ts->p_postpunctuationsymbols[0])
488 get_token_postpunctuation(ts);
489
490 return ts->token;
491 }
492
/* Read `num` items of `size` characters each from the stream into `buff`,
   one character at a time through ts_getc().  End of input is not
   detected here: whatever ts_getc() returns is stored.  Returns the
   number of items processed, which is `num` (or 0 when `num` is not
   positive). */
int ts_read(void *buff, int size, int num, cst_tokenstream *ts)
{
    /* people should complain about the speed here */
    /* people will complain about EOF as end of file */
    cst_string *out = (cst_string *)buff;
    int item, byte;

    for (item=0; item < num; item++)
    {
        for (byte=0; byte < size; byte++)
        {
            *out = ts_getc(ts);
            out++;
        }
    }

    return item;
}
508