1 //	Copyright (C) 1999-2003 Paul O. Lewis
2 //
3 //	This file is part of NCL (Nexus Class Library) version 2.0.
4 //
5 //	NCL is free software; you can redistribute it and/or modify
6 //	it under the terms of the GNU General Public License as published by
7 //	the Free Software Foundation; either version 2 of the License, or
8 //	(at your option) any later version.
9 //
10 //	NCL is distributed in the hope that it will be useful,
11 //	but WITHOUT ANY WARRANTY; without even the implied warranty of
12 //	MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
13 //	GNU General Public License for more details.
14 //
15 //	You should have received a copy of the GNU General Public License
16 //	along with NCL; if not, write to the Free Software Foundation, Inc.,
17 //	59 Temple Place, Suite 330, Boston, MA 02111-1307 USA
18 //
19 
20 #ifndef NCL_NXSTOKEN_H
21 #define NCL_NXSTOKEN_H
22 
23 /*----------------------------------------------------------------------------------------------------------------------
24 |	NxsToken objects are used by NxsReader to extract words (tokens) from a NEXUS data file. NxsToken objects know to
25 |	correctly skip NEXUS comments and understand NEXUS punctuation, making reading a NEXUS file as simple as repeatedly
26 |	calling the GetNextToken() function and then interpreting the token returned. If the token object is not attached
27 |	to an input stream, calls to GetNextToken() will have no effect. If the token object is not attached to an output
28 |	stream, output comments will be discarded (i.e., not output anywhere) and calls to Write or Writeln will be
29 |	ineffective. If input and output streams have been attached to the token object, however, tokens are read one at a
30 |	time from the input stream, and comments are correctly read and either written to the output stream (if an output
31 |	comment) or ignored (if not an output comment). Sequences of characters surrounded by single quotes are read in as
32 |	single tokens. A pair of adjacent single quotes are stored as a single quote, and underscore characters are stored
33 |	as blanks.
34 */
35 class NxsToken
36 	{
37 	public:
38 
39 		enum NxsTokenFlags	/* For use with the variable labileFlags */
40 			{
41 			saveCommandComments		= 0x0001,	/* if set, command comments of the form [&X] are not ignored but are instead saved as regular tokens (without the square brackets, however) */
42 			parentheticalToken		= 0x0002,	/* if set, and if next character encountered is a left parenthesis, token will include everything up to the matching right parenthesis */
43 			curlyBracketedToken		= 0x0004,	/* if set, and if next character encountered is a left curly bracket, token will include everything up to the matching right curly bracket */
44 			doubleQuotedToken		= 0x0008,	/* if set, grabs entire phrase surrounded by double quotes */
45 			singleCharacterToken	= 0x0010,	/* if set, next non-whitespace character returned as token */
46 			newlineIsToken			= 0x0020,	/* if set, newline character treated as a token and atEOL set if newline encountered */
47 			tildeIsPunctuation		= 0x0040,	/* if set, tilde character treated as punctuation and returned as a separate token */
48 			useSpecialPunctuation	= 0x0080,	/* if set, character specified by the data member special is treated as punctuation and returned as a separate token */
49 			hyphenNotPunctuation	= 0x0100,	/* if set, the hyphen character is not treated as punctutation (it is normally returned as a separate token) */
50 			preserveUnderscores		= 0x0200,	/* if set, underscore characters inside tokens are not converted to blank spaces (normally, all underscores are automatically converted to blanks) */
51 			ignorePunctuation		= 0x0400	/* if set, the normal punctuation symbols are treated the same as any other darkspace characters */
52 			};
53 
54 		NxsString		errormsg;
55 
56 						NxsToken(istream &i);
57 		virtual			~NxsToken();
58 
59 		bool			AtEOF();
60 		bool			AtEOL();
61 		bool			Abbreviation(NxsString s);
62 		bool			Begins(NxsString s, bool respect_case = false);
63 		void			BlanksToUnderscores();
64 		bool			Equals(NxsString s, bool respect_case = false);
65 		long			GetFileColumn() const;
66 		file_pos		GetFilePosition() const;
67 		long			GetFileLine() const;
68 		void			GetNextToken();
69 		NxsString		GetToken(bool respect_case = true);
70 		const char		*GetTokenAsCStr(bool respect_case = true);
71 		const NxsString	&GetTokenReference();
72 		int				GetTokenLength() const;
73 		bool			IsPlusMinusToken();
74 		bool			IsPunctuationToken();
75 		bool			IsWhitespaceToken();
76 		void			ReplaceToken(const NxsString s);
77 		void			ResetToken();
78 		void			SetSpecialPunctuationCharacter(char c);
79 		void			SetLabileFlagBit(int bit);
80 		bool			StoppedOn(char ch);
81 		void			StripWhitespace();
82 		void			ToUpper();
83 		void			Write(ostream &out);
84 		void			Writeln(ostream &out);
85 
86 		virtual void	OutputComment(const NxsString &msg);
87 		void GetNextContiguousToken(char stop_char); // Added by BQM
88 	protected:
89 
90 		void			AppendToComment(char ch);
91 		void			AppendToToken(char ch);
92 		char			GetNextChar();
93 		void			GetComment();
94 		void			GetCurlyBracketedToken();
95 		void			GetDoubleQuotedToken();
96 		void			GetQuoted();
97 		void			GetParentheticalToken();
98 		bool			IsPunctuation(char ch);
99 		bool			IsWhitespace(char ch);
100 
101 	private:
102 
103 		istream			∈				/* reference to input stream from which tokens will be read */
104 		file_pos		filepos;			/* current file position (for Metrowerks compiler, type is streampos rather than long) */
105 		long			fileline;			/* current file line */
106 		long			filecol;			/* current column in current line (refers to column immediately following token just read) */
107 		NxsString		token;				/* the character buffer used to store the current token */
108 		NxsString		comment;			/* temporary buffer used to store output comments while they are being built */
109 		char			saved;				/* either '\0' or is last character read from input stream */
110 		bool			atEOF;				/* true if end of file has been encountered */
111 		bool			atEOL;				/* true if newline encountered while newlineIsToken labile flag set */
112 		char			special;			/* ad hoc punctuation character; default value is '\0' */
113 		int				labileFlags;		/* storage for flags in the NxsTokenFlags enum */
114 		char			punctuation[21];	/* stores the 20 NEXUS punctuation characters */
115 		char			whitespace[4];		/* stores the 3 whitespace characters: blank space, tab and newline */
116 	};
117 
118 typedef NxsToken NexusToken;
119 
120 /*----------------------------------------------------------------------------------------------------------------------
121 |	Returns the token for functions that only need read only access - faster than GetToken.
122 */
GetTokenReference()123 inline const NxsString &NxsToken::GetTokenReference()
124 	{
125 	return token;
126 	}
127 
128 /*----------------------------------------------------------------------------------------------------------------------
129 |	This function is called whenever an output comment (i.e., a comment beginning with an exclamation point) is found
130 |	in the data file. This version of OutputComment does nothing; override this virtual function to display the output
131 |	comment in the most appropriate way for the platform you are supporting.
132 */
OutputComment(const NxsString & msg)133 inline void NxsToken::OutputComment(
134   const NxsString &msg)	/* the contents of the printable comment discovered in the NEXUS data file */
135 	{
136 #	if defined(HAVE_PRAGMA_UNUSED)
137 #		pragma unused(msg)
138 #	endif
139 	}
140 
141 /*----------------------------------------------------------------------------------------------------------------------
142 |	Adds `ch' to end of comment NxsString.
143 */
AppendToComment(char ch)144 inline void NxsToken::AppendToComment(
145   char ch)	/* character to be appended to comment */
146 	{
147 	comment += ch;
148 	}
149 
150 /*----------------------------------------------------------------------------------------------------------------------
151 |	Adds `ch' to end of current token.
152 */
AppendToToken(char ch)153 inline void NxsToken::AppendToToken(
154   char ch)	/* character to be appended to token */
155 	{
156 	// First three lines proved necessary to keep Borland's implementation of STL from crashing
157 	// under some circumstances (may no longer be necessary)
158 	//
159 	char s[2];
160 	s[0] = ch;
161 	s[1] = '\0';
162 
163 	token += s;
164 	}
165 
166 /*----------------------------------------------------------------------------------------------------------------------
167 |	Reads next character from in and does all of the following before returning it to the calling function:
168 |~
169 |	o if character read is either a carriage return or line feed, the variable line is incremented by one and the
170 |	  variable col is reset to zero
171 |	o if character read is a carriage return, and a peek at the next character to be read reveals that it is a line
172 |	  feed, then the next (line feed) character is read
173 |	o if either a carriage return or line feed is read, the character returned to the calling function is '\n' if
174 |	  character read is neither a carriage return nor a line feed, col is incremented by one and the character is
175 |	  returned as is to the calling function
176 |	o in all cases, the variable filepos is updated using a call to the tellg function of istream.
177 |~
178 */
GetNextChar()179 inline char NxsToken::GetNextChar()
180 	{
181 	int ch = in.get();
182 	int failed = in.bad();
183 	if (failed)
184 		{
185 		errormsg = "Unknown error reading data file (check to make sure file exists)";
186 		throw NxsException(errormsg);
187 		}
188 
189 	if (ch == 13 || ch == 10)
190 		{
191 		fileline++;
192 		filecol = 1L;
193 
194 		if (ch == 13 && (int)in.peek() == 10)
195 			ch = in.get();
196 
197 		atEOL = 1;
198 		}
199 	else if (ch == EOF)
200 		atEOF = 1;
201 	else
202 		{
203 		filecol++;
204 		atEOL = 0;
205 		}
206 
207 #	if defined(__DECCXX)
208 		filepos = 0L;
209 #	else
210     // BQM this cause crash compiling with clang under Windows!
211 //		filepos = in.tellg();
212     filepos += 1;
213 #	endif
214 
215 	if (atEOF)
216 		return '\0';
217 	else if (atEOL)
218 		return '\n';
219 	else
220 		return (char)ch;
221 	}
222 
223 /*----------------------------------------------------------------------------------------------------------------------
224 |	Returns true if character supplied is considered a punctuation character. The following twenty characters are
225 |	considered punctuation characters:
226 |>
227 |	()[]{}/\,;:=*'"`+-<>
228 |>
229 |	Exceptions:
230 |~
231 |	o The tilde character ('~') is also considered punctuation if the tildeIsPunctuation labile flag is set
232 |	o The special punctuation character (specified using the SetSpecialPunctuationCharacter) is also considered
233 |	  punctuation if the useSpecialPunctuation labile flag is set
234 |	o The hyphen (i.e., minus sign) character ('-') is not considered punctuation if the hyphenNotPunctuation
235 |	  labile flag is set
236 |~
237 |	Use the SetLabileFlagBit method to set one or more NxsLabileFlags flags in `labileFlags'
238 */
IsPunctuation(char ch)239 inline bool NxsToken::IsPunctuation(
240   char ch)	/* the character in question */
241 	{
242 	// PAUP 4.0b10
243 	//  o allows ]`<> inside taxon names
244 	//  o allows `<> inside taxset names
245 	//
246 	bool is_punctuation = false;
247 	if (strchr(punctuation, ch))
248 		is_punctuation = true;
249 	if (labileFlags & tildeIsPunctuation  && ch == '~')
250 		is_punctuation = true;
251 	if (labileFlags & useSpecialPunctuation  && ch == special)
252 		is_punctuation = true;
253 	if (labileFlags & hyphenNotPunctuation  && ch == '-')
254 		is_punctuation = false;
255 
256 	return is_punctuation;
257 	}
258 
259 /*----------------------------------------------------------------------------------------------------------------------
260 |	Returns true if character supplied is considered a whitespace character. Note: treats '\n' as darkspace if labile
261 |	flag newlineIsToken is in effect.
262 */
IsWhitespace(char ch)263 inline bool NxsToken::IsWhitespace(
264   char ch)	/* the character in question */
265 	{
266 	bool ws = false;
267 
268 	// If ch is found in the whitespace array, it's whitespace
269 	//
270 	if (strchr(whitespace, ch))
271 		ws = true;
272 
273 	// Unless of course ch is the newline character and we're currently
274 	// treating newlines as darkspace!
275 	//
276 	if (labileFlags & newlineIsToken && ch == '\n')
277 		ws = false;
278 
279 	return ws;
280 	}
281 
282 /*----------------------------------------------------------------------------------------------------------------------
283 |	Returns true if and only if last call to GetNextToken encountered the end-of-file character (or for some reason the
284 |	input stream is now out of commission).
285 */
AtEOF()286 inline bool NxsToken::AtEOF()
287 	{
288 	return atEOF;
289 	}
290 
291 /*----------------------------------------------------------------------------------------------------------------------
292 |	Returns true if and only if last call to GetNextToken encountered the newline character while the newlineIsToken
293 |	labile flag was in effect.
294 */
AtEOL()295 inline bool NxsToken::AtEOL()
296 	{
297 	return atEOL;
298 	}
299 
300 /*----------------------------------------------------------------------------------------------------------------------
301 |	Converts all blanks in token to underscore characters. Normally, underscores found in the tokens read from a NEXUS
302 |	file are converted to blanks automatically as they are read; this function reverts the blanks back to underscores.
303 */
BlanksToUnderscores()304 inline void NxsToken::BlanksToUnderscores()
305 	{
306 	token.BlanksToUnderscores();
307 	}
308 
309 /*----------------------------------------------------------------------------------------------------------------------
310 |	Returns value stored in `filecol', which keeps track of the current column in the data file (i.e., number of
311 |	characters since the last new line was encountered).
312 */
GetFileColumn()313 inline long  NxsToken::GetFileColumn() const
314 	{
315 	return filecol;
316 	}
317 
318 /*----------------------------------------------------------------------------------------------------------------------
319 |	Returns value stored in filepos, which keeps track of the current position in the data file (i.e., number of
320 |	characters since the beginning of the file).  Note: for Metrowerks compiler, you must use the offset() method of
321 |	the streampos class to use the value returned.
322 */
GetFilePosition()323 inline file_pos  NxsToken::GetFilePosition() const
324 	{
325 	return filepos;
326 	}
327 
328 /*----------------------------------------------------------------------------------------------------------------------
329 |	Returns value stored in `fileline', which keeps track of the current line in the data file (i.e., number of new
330 |	lines encountered thus far).
331 */
GetFileLine()332 inline long  NxsToken::GetFileLine() const
333 	{
334 	return fileline;
335 	}
336 
337 /*----------------------------------------------------------------------------------------------------------------------
338 |	Returns the data member `token'. Specifying false for`respect_case' parameter causes all characters in `token'
339 |	to be converted to upper case before `token' is returned. Specifying true results in GetToken returning exactly
340 |	what it read from the file.
341 */
GetToken(bool respect_case)342 inline NxsString NxsToken::GetToken(
343   bool respect_case)	/* determines whether token is converted to upper case before being returned */
344 	{
345 	if (!respect_case)
346 		ToUpper();
347 
348 	return token;
349 	}
350 
351 /*----------------------------------------------------------------------------------------------------------------------
352 |	Returns the data member `token' as a C-style string. Specifying false for`respect_case' parameter causes all
353 |	characters in `token' to be converted to upper case before the `token' C-string is returned. Specifying true
354 |	results in GetTokenAsCStr returning exactly what it read from the file.
355 */
GetTokenAsCStr(bool respect_case)356 inline const char *NxsToken::GetTokenAsCStr(
357   bool respect_case)	/* determines whether token is converted to upper case before being returned */
358 	{
359 	if (!respect_case)
360 		ToUpper();
361 
362 	return token.c_str();
363 	}
364 
365 /*----------------------------------------------------------------------------------------------------------------------
366 |	Returns token.size().
367 */
GetTokenLength()368 inline int NxsToken::GetTokenLength() const
369 	{
370 	return token.size();
371 	}
372 
373 /*----------------------------------------------------------------------------------------------------------------------
374 |	Returns true if current token is a single character and this character is either '+' or '-'.
375 */
IsPlusMinusToken()376 inline bool NxsToken::IsPlusMinusToken()
377 	{
378 	if (token.size() == 1 && ( token[0] == '+' || token[0] == '-') )
379 		return true;
380 	else
381 		return false;
382 	}
383 
384 /*----------------------------------------------------------------------------------------------------------------------
385 |	Returns true if current token is a single character and this character is a punctuation character (as defined in
386 |	IsPunctuation function).
387 */
IsPunctuationToken()388 inline bool NxsToken::IsPunctuationToken()
389 	{
390 	if (token.size() == 1 && IsPunctuation( token[0]))
391 		return true;
392 	else
393 		return false;
394 	}
395 
396 /*----------------------------------------------------------------------------------------------------------------------
397 |	Returns true if current token is a single character and this character is a whitespace character (as defined in
398 |	IsWhitespace function).
399 */
IsWhitespaceToken()400 inline bool NxsToken::IsWhitespaceToken()
401 	{
402 	if (token.size() == 1 && IsWhitespace( token[0]))
403 		return true;
404 	else
405 		return false;
406 	}
407 
408 /*----------------------------------------------------------------------------------------------------------------------
409 |	Replaces current token NxsString with s.
410 */
ReplaceToken(const NxsString s)411 inline void NxsToken::ReplaceToken(
412   const NxsString s)	/* NxsString to replace current token NxsString */
413 	{
414 	token = s;
415 	}
416 
417 /*----------------------------------------------------------------------------------------------------------------------
418 |	Sets token to the empty NxsString ("").
419 */
ResetToken()420 inline void NxsToken::ResetToken()
421 	{
422 	token.clear();
423 	}
424 
425 /*----------------------------------------------------------------------------------------------------------------------
426 |	Sets the special punctuation character to `c'. If the labile bit useSpecialPunctuation is set, this character will
427 |	be added to the standard list of punctuation symbols, and will be returned as a separate token like the other
428 |	punctuation characters.
429 */
SetSpecialPunctuationCharacter(char c)430 inline void NxsToken::SetSpecialPunctuationCharacter(
431   char c)	/* the character to which `special' is set */
432 	{
433 	special = c;
434 	}
435 
436 /*----------------------------------------------------------------------------------------------------------------------
437 |	Sets the bit specified in the variable `labileFlags'. The available bits are specified in the NxsTokenFlags enum.
438 |	All bits in `labileFlags' are cleared after each token is read.
439 */
SetLabileFlagBit(int bit)440 inline void NxsToken::SetLabileFlagBit(
441   int bit)	/* the bit (see NxsTokenFlags enum) to set in `labileFlags' */
442 	{
443 	labileFlags |= bit;
444 	}
445 
446 /*----------------------------------------------------------------------------------------------------------------------
447 |	Checks character stored in the variable saved to see if it matches supplied character `ch'. Good for checking such
448 |	things as whether token stopped reading characters because it encountered a newline (and labileFlags bit
449 |	newlineIsToken was set):
450 |>
451 |	StoppedOn('\n');
452 |>
453 |	or whether token stopped reading characters because of a punctuation character such as a comma:
454 |>
455 |	StoppedOn(',');
456 |>
457 */
StoppedOn(char ch)458 inline bool NxsToken::StoppedOn(
459   char ch)	/* the character to compare with saved character */
460 	{
461 	if (saved == ch)
462 		return true;
463 	else
464 		return false;
465 	}
466 
467 /*----------------------------------------------------------------------------------------------------------------------
468 |	Simply outputs the current NxsString stored in `token' to the output stream `out'. Does not send a newline to the
469 |	output stream afterwards.
470 */
Write(ostream & out)471 inline void NxsToken::Write(
472   ostream &out)	/* the output stream to which to write token NxsString */
473 	{
474 	out << token;
475 	}
476 
477 /*----------------------------------------------------------------------------------------------------------------------
478 |	Simply outputs the current NxsString stored in `token' to the output stream `out'. Sends a newline to the output
479 |	stream afterwards.
480 */
Writeln(ostream & out)481 inline void NxsToken::Writeln(
482   ostream &out)	/* the output stream to which to write `token' */
483 	{
484 	out << token << endl;
485 	}
486 
487 /**
488  * Added by BQM: return the contiguous string (including white space) as token
489  * until hitting stop_char
490  * @param stop_char a character to stop reading in
491  */
GetNextContiguousToken(char stop_char)492 inline void NxsToken::GetNextContiguousToken(char stop_char) {
493 	ResetToken();
494 
495 	char ch = ' ';
496 	if (saved == '\0' || IsWhitespace(saved))
497 	{
498 		// Skip leading whitespace
499 		//
500 		while( IsWhitespace(ch) && !atEOF)
501 			ch = GetNextChar();
502 		saved = ch;
503 	}
504 	for (;;) {
505 
506 		// Get next character either from saved or from input stream.
507 		//
508 		if (saved != '\0')
509 			{
510 			ch = saved;
511 			saved = '\0';
512 			}
513 		else
514 			ch = GetNextChar();
515 
516 		// Break now if we've hit EOF.
517 		//
518 		if (atEOF)
519 			break;
520 		if (ch == stop_char) {
521 			saved = ch;
522 			break;
523 		}
524 		AppendToToken(ch);
525 	}
526 	// Skip ending whitespace
527 	if (token.empty()) return;
528 	NxsString::iterator last = token.end();
529 	while (last != token.begin() && IsWhitespace(*(last-1))) {
530 		last--;
531 	}
532 	if (last != token.end()) token.erase(last, token.end());
533 }
534 
535 #endif
536