1 /*************************************************************************/
2 /*                                                                       */
3 /*                Centre for Speech Technology Research                  */
4 /*                     University of Edinburgh, UK                       */
5 /*                         Copyright (c) 1996                            */
6 /*                        All Rights Reserved.                           */
7 /*                                                                       */
8 /*  Permission is hereby granted, free of charge, to use and distribute  */
9 /*  this software and its documentation without restriction, including   */
10 /*  without limitation the rights to use, copy, modify, merge, publish,  */
11 /*  distribute, sublicense, and/or sell copies of this work, and to      */
12 /*  permit persons to whom this work is furnished to do so, subject to   */
13 /*  the following conditions:                                            */
14 /*   1. The code must retain the above copyright notice, this list of    */
15 /*      conditions and the following disclaimer.                         */
16 /*   2. Any modifications must be clearly marked as such.                */
17 /*   3. Original authors' names are not deleted.                         */
18 /*   4. The authors' names are not used to endorse or promote products   */
19 /*      derived from this software without specific prior written        */
20 /*      permission.                                                      */
21 /*                                                                       */
22 /*  THE UNIVERSITY OF EDINBURGH AND THE CONTRIBUTORS TO THIS WORK        */
23 /*  DISCLAIM ALL WARRANTIES WITH REGARD TO THIS SOFTWARE, INCLUDING      */
24 /*  ALL IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS, IN NO EVENT   */
25 /*  SHALL THE UNIVERSITY OF EDINBURGH NOR THE CONTRIBUTORS BE LIABLE     */
26 /*  FOR ANY SPECIAL, INDIRECT OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES    */
27 /*  WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, WHETHER IN   */
28 /*  AN ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION,          */
29 /*  ARISING OUT OF OR IN CONNECTION WITH THE USE OR PERFORMANCE OF       */
30 /*  THIS SOFTWARE.                                                       */
31 /*                                                                       */
32 /*************************************************************************/
33 /*                         Author :  Alan W Black                        */
34 /*                         Date   :  April 1996                          */
35 /*-----------------------------------------------------------------------*/
36 /*                                                                       */
37 /* A Tokenize class, both for Tokens (Strings plus alpha)                */
38 /* EST_TokenStream for strings, FILE *, files, pipes etc                 */
39 /*                                                                       */
40 /*=======================================================================*/
41 #include <cstdio>
42 #include <iostream>
43 #include "EST_unix.h"
44 #include <cstdlib>
45 #include <climits>
46 #include <cstring>
47 #include "EST_math.h"
48 #include "EST_Token.h"
49 #include "EST_string_aux.h"
50 #include "EST_cutils.h"
51 #include "EST_error.h"
52 
// Default character-class strings offered to users of the tokenizer.
// Note that default_values() below only installs the whitespace set;
// the other three classes start out empty on a fresh stream.
const EST_String EST_Token_Default_WhiteSpaceChars = " \t\n\r";
const EST_String EST_Token_Default_SingleCharSymbols = "(){}[]";
const EST_String EST_Token_Default_PrePunctuationSymbols = "\"'`({[";
const EST_String EST_Token_Default_PunctuationSymbols = "\"'`.,:;!?]})";
// Placeholder values stored in Origin when the stream was not opened by
// file name (see open(FILE *), open(istream &) and open_string()).
const EST_String Token_Origin_FD = "existing file descriptor";
const EST_String Token_Origin_Stream = "existing istream";
const EST_String Token_Origin_String = "existing string";

// Matches any single whitespace character; used by quote_string() to
// decide whether a string needs quoting.
static EST_Regex RXanywhitespace("[ \t\n\r]");
62 
static inline char *check_extend_str_in(char *str, int pos, int *max)
{
    // Make sure index `pos` is writable in the heap buffer `str`, whose
    // current capacity is *max bytes.
    //
    // If pos < *max the buffer is returned untouched.  Otherwise a larger
    // buffer is allocated (twice the old capacity, or twice pos when pos
    // is already beyond the old capacity), the old contents are copied
    // across, the old buffer is freed, *max is updated and the new buffer
    // is returned.  `str` must have been allocated with new char[].
    if (pos < *max)
        return str;

    const int old_max = *max;
    int new_max = (pos > old_max) ? 2 * pos : 2 * old_max;
    if (new_max <= pos)          // e.g. old_max == 0 and pos == 0
        new_max = pos + 1;

    char *newstuff = new char[new_max];

    // Copy raw bytes: token data may contain embedded NULs, which strncpy
    // would stop at (zero-filling the rest).  Never read more than the old
    // buffer actually holds, even if pos lies beyond it.
    const int keep = (old_max > 0) ? old_max : 0;
    if (keep > 0)
        memcpy(newstuff, str, keep);
    // Zero the remainder so the new space has defined contents.
    memset(newstuff + keep, 0, new_max - keep);

    delete [] str;
    *max = new_max;
    return newstuff;
}
83 
// Fast-path wrapper around check_extend_str_in(): only makes the function
// call when POS has reached the capacity stored in *MAX, otherwise it
// evaluates to STR unchanged.  The result must be assigned back to the
// buffer pointer because the buffer may have been reallocated.
// NOTE: STR, POS and MAX are each evaluated more than once, so the
// arguments must not have side effects.
#define check_extend_str(STR, POS, MAX) \
	(((POS)>= *(MAX))?check_extend_str_in((STR),(POS),(MAX)):(STR))
86 
operator <<(ostream & s,const EST_Token & p)87 ostream& operator<<(ostream& s, const EST_Token &p)
88 {
89     s << "[TOKEN " << p.pname << "]";
90     return s;
91 }
92 
93 
EST_Token &EST_Token::operator = (const EST_Token &a)
{
    // Member-wise copy of another token.

    // The textual parts of the token ...
    this->space = a.space;
    this->prepunc = a.prepunc;
    this->pname = a.pname;
    this->punc = a.punc;

    // ... and where it came from / how it was read.
    this->linenum = a.linenum;
    this->linepos = a.linepos;
    this->p_filepos = a.p_filepos;
    this->p_quoted = a.p_quoted;

    return *this;
}
106 
pos_description() const107 const EST_String EST_Token::pos_description() const
108 {
109     return "line "+itoString(linenum)+" char "+itoString(linepos);
110 }
111 
operator =(const EST_String & a)112 EST_Token &EST_Token::operator = (const EST_String &a)
113 {
114     pname = a;
115     return *this;
116 }
117 
EST_TokenStream()118 EST_TokenStream::EST_TokenStream()
119 {
120     tok_wspacelen = 64;  // will grow if necessary
121     tok_wspace = new char[tok_wspacelen];
122     tok_stufflen = 512;  // will grow if necessary
123     tok_stuff = new char[tok_stufflen];
124     tok_prepuncslen = 32;  // will grow if necessary
125     tok_prepuncs = new char[tok_prepuncslen];
126 
127     default_values();
128 }
129 
EST_TokenStream(EST_TokenStream & s)130 EST_TokenStream::EST_TokenStream(EST_TokenStream &s)
131 {
132     (void)s;
133 
134     cerr << "TokenStream: warning passing TokenStream not as reference"
135 	<< endl;
136 
137     // You *really* shouldn't use this AT ALL unless you
138     // fully understand its consequences, you'll be copying open
139     // files and moving file pointers all over the place
140     // basically *DON'T* do this, pass the stream by reference
141 
142     // Now there may be occasions when you do want to do this for example
143     // when you need to do far look ahead or check point as you read
144     // but they are obscure and I'm not sure how to do that for all
145     // the file forms supported by the TokenStream.  If you do
146     // I can write a clone function that might do it.
147 
148 }
149 
default_values()150 void EST_TokenStream::default_values()
151 {
152     type = tst_none;
153     peeked_tokp = FALSE;
154     peeked_charp = FALSE;
155     eof_flag = FALSE;
156     quotes = FALSE;
157     p_filepos = 0;
158     linepos = 1;
159     WhiteSpaceChars = EST_Token_Default_WhiteSpaceChars;
160     SingleCharSymbols = EST_String::Empty;
161     PrePunctuationSymbols = EST_String::Empty;
162     PunctuationSymbols = EST_String::Empty;
163     build_table();
164     close_at_end=TRUE;
165 }
166 
~EST_TokenStream()167 EST_TokenStream::~EST_TokenStream()
168 {
169     if (type != tst_none)
170 	close();
171     delete [] tok_wspace;
172     delete [] tok_stuff;
173     delete [] tok_prepuncs;
174 
175 }
176 
operator <<(ostream & s,EST_TokenStream & p)177 ostream& operator<<(ostream& s, EST_TokenStream &p)
178 {
179     s << "[TOKENSTREAM ";
180     switch (p.type)
181     {
182       case tst_none:
183 	cerr << "UNSET"; break;
184       case tst_file:
185 	cerr << "FILE"; break;
186       case tst_pipe:
187 	cerr << "PIPE";	break;
188       case tst_istream:
189 	cerr << "ISTREAM"; break;
190       case tst_string:
191 	cerr << "STRING"; break;
192       default:
193 	cerr << "UNKNOWN" << endl;
194     }
195     s << "]";
196 
197     return s;
198 }
199 
open(const EST_String & filename)200 int EST_TokenStream::open(const EST_String &filename)
201 {
202     if (type != tst_none)
203 	close();
204     default_values();
205     fp = fopen(filename,"rb");
206     if (fp == NULL)
207     {
208 	cerr << "Cannot open file " << filename << " as tokenstream"
209 	    << endl;
210 	return -1;
211     }
212     Origin = filename;
213     type = tst_file;
214 
215     return 0;
216 }
217 
open(FILE * ofp,int close_when_finished)218 int EST_TokenStream::open(FILE *ofp, int close_when_finished)
219 {
220     // absorb already open stream
221     if (type != tst_none)
222 	close();
223     default_values();
224     fp = ofp;
225     if (fp == NULL)
226     {
227 	cerr << "Cannot absorb NULL filestream as tokenstream" << endl;
228 	return -1;
229     }
230     Origin = Token_Origin_FD;
231     type = tst_file;
232 
233     close_at_end = close_when_finished;
234 
235     return 0;
236 }
237 
open(istream & newis)238 int EST_TokenStream::open(istream &newis)
239 {
240     // absorb already open istream
241     if (type != tst_none)
242 	close();
243     default_values();
244     is = &newis;
245     Origin = Token_Origin_Stream;
246     type = tst_istream;
247 
248     return 0;
249 }
250 
open_string(const EST_String & newbuffer)251 int EST_TokenStream::open_string(const EST_String &newbuffer)
252 {
253     // Make a tokenstream from an internal existing string/buffer
254     const char *buf;
255     if (type != tst_none)
256 	close();
257     default_values();
258     buf = (const char *)newbuffer;
259     buffer_length = newbuffer.length();
260     buffer = new char[buffer_length+1];
261     memmove(buffer,buf,buffer_length+1);
262     pos = 0;
263     Origin = Token_Origin_String;
264     type = tst_string;
265 
266     return 0;
267 }
268 
seek_end()269 int EST_TokenStream::seek_end()
270 {
271     // This isn't actually useful but people expect it
272     peeked_charp = FALSE;
273     peeked_tokp = FALSE;
274 
275     switch (type)
276     {
277       case tst_none:
278 	cerr << "EST_TokenStream unset" << endl;
279 	return -1;
280 	break;
281       case tst_file:
282 	fseek(fp,0,SEEK_END);
283 	p_filepos = ftell(fp);
284 	return p_filepos;
285       case tst_pipe:
286 	cerr << "EST_TokenStream seek on pipe not supported" << endl;
287 	return -1;
288 	break;
289       case tst_istream:
290 	cerr << "EST_TokenStream seek on istream not yet supported" << endl;
291 	return -1;
292 	break;
293       case tst_string:
294 	pos = buffer_length;
295 	return pos;
296       default:
297 	cerr << "EST_TokenStream: unknown type" << endl;
298 	return -1;
299     }
300 
301     return -1;  // can't get here
302 }
303 
seek(int position)304 int EST_TokenStream::seek(int position)
305 {
306     peeked_charp = FALSE;
307     peeked_tokp = FALSE;
308 
309     switch (type)
310     {
311       case tst_none:
312 	cerr << "EST_TokenStream unset" << endl;
313 	return -1;
314 	break;
315       case tst_file:
316 	p_filepos = position;
317 	return fseek(fp,position,SEEK_SET);
318       case tst_pipe:
319 	cerr << "EST_TokenStream seek on pipe not supported" << endl;
320 	return -1;
321 	break;
322       case tst_istream:
323 	cerr << "EST_TokenStream seek on istream not yet supported" << endl;
324 	return -1;
325 	break;
326       case tst_string:
327 	if (position >= pos)
328 	{
329 	    pos = position;
330 	    return -1;
331 	}
332 	else
333 	{
334 	    pos = position;
335 	    return 0;
336 	}
337 	break;
338       default:
339 	cerr << "EST_TokenStream: unknown type" << endl;
340 	return -1;
341     }
342 
343     return -1;  // can't get here
344 
345 }
346 
static int stdio_fread(void *buff,int size,int nitems,FILE *fp)
{
    // Thin forwarder to the C library fread().  It exists so that code
    // inside EST_TokenStream can reach the stdio function, which is
    // otherwise hidden there by the member function of the same name.
    // Returns the number of complete items read.
    const size_t item_size = static_cast<size_t>(size);
    const size_t item_count = static_cast<size_t>(nitems);
    const size_t got = fread(buff,item_size,item_count,fp);
    return static_cast<int>(got);
}
352 
fread(void * buff,int size,int nitems)353 int EST_TokenStream::fread(void *buff, int size, int nitems)
354 {
355     // switching into binary mode for current position
356     int items_read;
357 
358     // so we can continue to read afterwards
359     if (peeked_tokp)
360     {
361 	cerr << "ERROR " << pos_description()
362 	    << " peeked into binary data" << endl;
363 	return 0;
364     }
365 
366     peeked_charp = FALSE;
367     peeked_tokp = FALSE;
368 
369     switch (type)
370     {
371       case tst_none:
372 	cerr << "EST_TokenStream unset" << endl;
373 	return 0;
374 	break;
375       case tst_file:
376 	items_read = stdio_fread(buff,(size_t)size,(size_t)nitems,fp);
377 	p_filepos += items_read*size;
378 	return items_read;
379       case tst_pipe:
380 	cerr << "EST_TokenStream fread pipe not yet supported" << endl;
381 	return 0;
382 	break;
383       case tst_istream:
384 	cerr << "EST_TokenStream fread istream not yet supported" << endl;
385 	return 0;
386       case tst_string:
387 	if ((buffer_length-pos)/size < nitems)
388 	    items_read = (buffer_length-pos)/size;
389 	else
390 	    items_read = nitems;
391 	memcpy(buff,&buffer[pos],items_read*size);
392 	pos += items_read*size;
393 	return items_read;
394       default:
395 	cerr << "EST_TokenStream: unknown type" << endl;
396 	return EOF;
397     }
398 
399     return 0;  // can't get here
400 
401 }
402 
close(void)403 void EST_TokenStream::close(void)
404 {
405     // close any files (if they were used)
406 
407     switch (type)
408     {
409       case tst_none:
410 	break;
411       case tst_file:
412 	if (close_at_end)
413 	  fclose(fp);
414       case tst_pipe:
415 	// close(fd);
416 	break;
417       case tst_istream:
418 	break;
419       case tst_string:
420 	delete [] buffer;
421 	buffer = 0;
422 	break;
423       default:
424 	cerr << "EST_TokenStream: unknown type" << endl;
425 	break;
426     }
427 
428     type = tst_none;
429     peeked_charp = FALSE;
430     peeked_tokp = FALSE;
431 
432 }
433 
restart(void)434 int EST_TokenStream::restart(void)
435 {
436     // For paul, the only person I know who uses this
437 
438     switch (type)
439     {
440       case tst_none:
441 	break;
442       case tst_file:
443         fp = freopen(Origin,"rb",fp);
444 	p_filepos = 0;
445 	break;
446       case tst_pipe:
447 	cerr << "EST_TokenStream: can't rewind pipe" << endl;
448 	return -1;
449 	break;
450       case tst_istream:
451 	cerr << "EST_TokenStream: can't rewind istream" << endl;
452 	break;
453       case tst_string:
454 	pos = 0;
455 	break;
456       default:
457 	cerr << "EST_TokenStream: unknown type" << endl;
458 	break;
459     }
460 
461     linepos = 1;
462     peeked_charp = FALSE;
463     peeked_tokp = FALSE;
464     eof_flag = FALSE;
465 
466     return 0;
467 }
468 
operator >>(EST_Token & p)469 EST_TokenStream & EST_TokenStream::operator >>(EST_Token &p)
470 {
471     return get(p);
472 }
473 
operator >>(EST_String & p)474 EST_TokenStream & EST_TokenStream::operator >>(EST_String &p)
475 {
476     EST_Token t;
477 
478     get(t);
479     p = t.string();
480     return *this;
481 }
482 
get(EST_Token & tok)483 EST_TokenStream &EST_TokenStream::get(EST_Token &tok)
484 {
485     tok = get();
486     return *this;
487 }
488 
get_upto(const EST_String & s)489 EST_Token EST_TokenStream::get_upto(const EST_String &s)
490 {
491     // Returns a concatenated token form here to next symbol that matches s
492     // including s (though not adding s on the result)
493     // Not really for the purist but lots of times very handy
494     // Note this is not very efficient
495     EST_String result;
496     EST_Token t;
497 
498     for (result=EST_String::Empty; (t=get()) != s; )
499     {
500 	result += t.whitespace() + t.prepunctuation() +
501 	    t.string() + t.punctuation();
502 	if (eof())
503 	{
504 	    cerr << "EST_TokenStream: end of file when looking for \"" <<
505 		s << "\"" << endl;
506 	    break;
507 	}
508     }
509 
510     return EST_Token(result);
511 }
512 
get_upto_eoln(void)513 EST_Token EST_TokenStream::get_upto_eoln(void)
514 {
515     // Swallow the lot up to end of line
516     // assumes \n is a whitespace character
517 
518     EST_String result(EST_String::Empty);
519 
520     while (!eoln())
521     {
522 	EST_Token &t=get();
523 	result += t.whitespace() + t.prepunctuation();
524 
525 	if (quotes)
526 	    result += quote_string(t.string());
527 	else
528 	    result += t.string();
529 
530 	result += t.punctuation();
531 
532 	if (eof())
533 	{
534 //	    cerr << "EST_TokenStream: end of file when looking for end of line"
535 //		<< endl;
536 	    break;
537 	}
538     }
539     // So that the next call works I have to step over the eoln condition
540     // That involves removing the whitespace upto and including the next
541     // \n in the peek token.
542 
543     char *w = wstrdup(peek().whitespace());
544     int i;
545     for (i=0; w[i] != 0; i++)
546 	if (w[i] == '\n')   // maybe not portable
547 	    peek().set_whitespace(&w[i+1]);
548 
549     wfree(w);
550 
551     static EST_Token result_t;
552 
553     result_t.set_token(result);
554 
555     return result_t;
556 }
557 
must_get(EST_String expected,bool * ok)558 EST_Token &EST_TokenStream::must_get(EST_String expected, bool *ok)
559 {
560     EST_Token &tok = get();
561 
562     if (tok != expected)
563     {
564         if (ok != NULL)
565         {
566             *ok=FALSE;
567             return tok;
568         }
569         else
570             EST_error("Expected '%s' got '%s' at %s",
571                       (const char *)expected,
572                       (const char *)(EST_String)tok,
573                       (const char *)pos_description());
574     }
575 
576     if (ok != NULL)
577         *ok=TRUE;
578     return tok;
579 }
580 
build_table()581 void EST_TokenStream::build_table()
582 {
583     int i;
584     const char *p;
585     unsigned char c;
586 
587     for (i=0; i<256; ++i)
588 	p_table[i]=0;
589 
590     for (p=WhiteSpaceChars; *p; ++p)
591 	if (p_table[c=(unsigned char)*p])
592 	    EST_warning("Character '%c' has two classes, '%c' and '%c'",
593 			*p, c, ' ');
594 	else
595 	    p_table[c] = ' ';
596 
597     for (p=SingleCharSymbols; *p; ++p)
598 	if (p_table[c=(unsigned char)*p])
599 	    EST_warning("Character '%c' has two classes, '%c' and '%c'",
600 			*p, p_table[c], '!');
601 	else
602 	    p_table[c] = '@';
603 
604     for (p=PunctuationSymbols; *p; ++p)
605 	if (p_table[c=(unsigned char)*p] == '@')
606 	    continue;
607 	else if (p_table[c])
608 	    EST_warning("Character '%c' has two classes, '%c' and '%c'",
609 			*p, p_table[c], '.');
610 	else
611 	    p_table[c] = '.';
612 
613     for(p=PrePunctuationSymbols; *p; ++p)
614 	if (p_table[c=(unsigned char)*p] == '@')
615 	    continue;
616 	else if (p_table[c] == '.')
617 	    p_table[c] = '"';
618 	else if (p_table[c])
619 	    EST_warning("Character '%c' has two classes, '%c' and '%c'",
620 			*p, p_table[c], '$');
621 	else
622 	    p_table[c] = '$';
623 
624     p_table_wrong=0;
625 }
626 
getpeeked_internal(void)627 inline int EST_TokenStream::getpeeked_internal(void)
628 {
629   peeked_charp = FALSE;
630   return peeked_char;
631 }
632 
633 inline
getch_internal()634 int EST_TokenStream::getch_internal()
635 {
636     // Return next character in stream
637     if (EST_TokenStream::peeked_charp)
638     {
639       return getpeeked_internal();
640     }
641 
642     switch (type)
643     {
644       case tst_none:
645 	cerr << "EST_TokenStream unset" << endl;
646 	return EOF;
647 	break;
648       case tst_file:
649 	p_filepos++;
650 	{
651 	    char lc;
652 	    if (stdio_fread(&lc,1,1,fp) == 0)
653 		return EOF;
654 	    else
655 		return (int)lc;
656 	}
657 /*	return getc(fp); */
658       case tst_pipe:
659 	cerr << "EST_TokenStream pipe not yet supported" << endl;
660 	return EOF;
661 	break;
662       case tst_istream:
663 	p_filepos++;
664 	return is->get();
665       case tst_string:
666 	if (pos < buffer_length)
667 	{
668 	    p_filepos++;
669 	    return buffer[pos++];
670 	}
671 	else
672 	    return EOF;
673       default:
674 	cerr << "EST_TokenStream: unknown type" << endl;
675 	return EOF;
676     }
677 
678     return EOF;  // can't get here
679 }
680 
getch(void)681 int EST_TokenStream::getch(void)
682 {
683   return getch_internal();
684 }
685 
peekch_internal()686 inline int EST_TokenStream::peekch_internal()
687 {
688     // Return next character in stream (without reading it)
689 
690     if (!peeked_charp)
691 	peeked_char = getch_internal();
692     peeked_charp = TRUE;
693     return peeked_char;
694 }
695 
696 
peekch(void)697 int EST_TokenStream::peekch(void)
698 {
699   return peekch_internal();
700 
701 }
702 
// Character class tests against p_table (filled in by build_table()).
// C is converted to unsigned char before indexing, so any int character
// value maps onto one of the 256 table entries.
// CLASS(C,CL) is true if character C has class code CL.
#define CLASS(C,CL) (p_table[(unsigned char)(C)]==(CL))

// CLASS2(C,CL1,CL2) is true if character C has class code CL1 or CL2.
// NOTE: C is evaluated twice, so it must not have side effects.
#define CLASS2(C,CL1,CL2) (p_table[(unsigned char)(C)]==(CL1)||p_table[(unsigned char)(C)]==(CL2))
706 
get(void)707 EST_Token &EST_TokenStream::get(void)
708 {
709     if (peeked_tokp)
710     {
711 	peeked_tokp = FALSE;
712 	return current_tok;
713     }
714 
715     if (p_table_wrong)
716       build_table();
717 
718     char *word;
719     int c,i,j;
720 
721     for (i=0; (CLASS(c=getch_internal(),' ') &&
722 	       ( c != EOF )); i++)
723     {
724 	if (c == '\n') linepos++;
725 	tok_wspace = check_extend_str(tok_wspace,i,&tok_wspacelen);
726 	tok_wspace[i] = c;
727     }
728     tok_wspace[i] = '\0';
729 
730     current_tok.init();
731 
732     if (c != EOF)
733     {
734 	current_tok.set_filepos(p_filepos-1);
735 
736 	if ((quotes) &&  // quoted strings (with escapes) are allowed
737 	    (c == quote))
738 	{
739 	    for (i=0;
740 		 ((c = getch_internal()) != EOF)
741 		 ;)
742 	    {
743 		if (c == quote)
744 		    break;
745 		tok_stuff = check_extend_str(tok_stuff,i,&tok_stufflen);
746 		if (c == escape)
747 		    c = getch_internal();
748 		tok_stuff[i++] = c;
749 	    }
750 	    current_tok.set_quoted(TRUE);
751 	}
752 	else            // standard whitespace separated tokens
753 	{
754 	    for (i=0,tok_stuff[i++]=c;
755 		 (
756 		  !CLASS(c,'@') &&
757 		  !CLASS(c=peekch_internal(),' ') &&
758 		  !CLASS(c,'@') &&
759 		  ( c != EOF )) ;)
760 	    {
761 		tok_stuff = check_extend_str(tok_stuff,i,&tok_stufflen);
762 		// note, we must have peeked to get here.
763 		tok_stuff[i++] = getpeeked_internal();
764 	    }
765 	}
766 	tok_stuff[i] = '\0';
767 	// Are there any punctuation symbols at the start?
768 	for (j=0;
769 	     ((j < i) && CLASS2(tok_stuff[j], '$', '"'));
770 	     j++);
771 	if ((j > 0) && (j < i))  // there are
772 	{
773 	    tok_prepuncs = check_extend_str(tok_prepuncs,j+1,&tok_prepuncslen);
774 	    memmove(tok_prepuncs,tok_stuff,j);
775 	    tok_prepuncs[j] = '\0';
776 	    current_tok.set_prepunctuation(tok_prepuncs);
777 	    word=&tok_stuff[j];
778 	    i-=j;  // reduce size by number of prepuncs
779 	}
780 	else
781 	{
782 	    current_tok.set_prepunctuation(EST_String::Empty);
783 	    word = tok_stuff;
784 	}
785 	// Are there any punctuation symbols at the end
786 	for (j=i-1;
787 	     ((j > 0) && CLASS2(word[j],'.','"'));
788 	     j--);
789 	if (word[j+1] != '\0')
790 	{
791 	    current_tok.set_punctuation(&word[j+1]);
792 	    word[j+1] = '\0';
793 	}
794 	else
795 	    current_tok.set_punctuation(EST_String::Empty);
796 
797 	current_tok.set_token(word);
798 	if (tok_wspace[0] == '\0') // feature paths will have null whitespace
799 	    current_tok.set_whitespace(EST_String::Empty);
800 	else
801 	    current_tok.set_whitespace(tok_wspace);
802     }
803     else
804     {
805 	current_tok.set_token(EST_String::Empty);
806 	current_tok.set_whitespace(tok_wspace);
807 	current_tok.set_punctuation(EST_String::Empty);
808 	current_tok.set_prepunctuation(EST_String::Empty);
809 	eof_flag = TRUE;
810     }
811 
812     return current_tok;
813 }
814 
eoln(void)815 int EST_TokenStream::eoln(void)
816 {
817     // This doesn't really work if there are blank lines (and you want
818     // to know about them)
819 
820     if ((peek().whitespace().contains("\n")) ||	eof())
821 	return TRUE;
822     else
823 	return FALSE;
824 
825 }
826 
EST_String quote_string(const EST_String &s,
			const EST_String &quote,
			const EST_String &escape,
			int force)
{
    // Return s wrapped in quote characters, with every quote or escape
    // character inside it preceded by the escape character.
    //
    // Quoting is done when force is true, or when s is empty or contains
    // whitespace, the quote character or the escape character; otherwise
    // s is returned unchanged.
    // Note quote and escape are assumed to be strings of length 1.
    const bool needs_quoting =
        (force) ||
        (s.contains(quote)) ||
        (s.contains(escape)) ||
        (s.contains(RXanywhitespace)) ||
        (s.length() == 0);

    if (!needs_quoting)
        return s;

    // bigger than the quoted form could ever be
    char *quoted = new char[s.length()*(quote.length()+escape.length())+
                            1+quote.length()+quote.length()];
    const char q = quote(0);
    int out = 0;

    quoted[out++] = q;                       // opening quote
    for (int in = 0; in < s.length(); ++in)
    {
        const char ch = s(in);
        if ((ch == q) || (ch == escape(0)))  // needs protecting
            quoted[out++] = escape(0);
        quoted[out++] = ch;
    }
    quoted[out++] = q;                       // closing quote
    quoted[out] = '\0';

    EST_String quoted_form(quoted);
    delete [] quoted;
    return quoted_form;
}
864 
pos_description()865 const EST_String EST_TokenStream::pos_description()
866 {
867     return Origin+":"+itoString(linepos);
868 }
869