1 // Copyright (C) 1999-2003 Paul O. Lewis
2 //
3 // This file is part of NCL (Nexus Class Library) version 2.0.
4 //
5 // NCL is free software; you can redistribute it and/or modify
6 // it under the terms of the GNU General Public License as published by
7 // the Free Software Foundation; either version 2 of the License, or
8 // (at your option) any later version.
9 //
10 // NCL is distributed in the hope that it will be useful,
11 // but WITHOUT ANY WARRANTY; without even the implied warranty of
12 // MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
13 // GNU General Public License for more details.
14 //
15 // You should have received a copy of the GNU General Public License
16 // along with NCL; if not, write to the Free Software Foundation, Inc.,
17 // 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA
18 //
19
20 #ifndef NCL_NXSTOKEN_H
21 #define NCL_NXSTOKEN_H
22
23 /*----------------------------------------------------------------------------------------------------------------------
24 | NxsToken objects are used by NxsReader to extract words (tokens) from a NEXUS data file. NxsToken objects know to
25 | correctly skip NEXUS comments and understand NEXUS punctuation, making reading a NEXUS file as simple as repeatedly
26 | calling the GetNextToken() function and then interpreting the token returned. If the token object is not attached
27 | to an input stream, calls to GetNextToken() will have no effect. If the token object is not attached to an output
28 | stream, output comments will be discarded (i.e., not output anywhere) and calls to Write or Writeln will be
29 | ineffective. If input and output streams have been attached to the token object, however, tokens are read one at a
30 | time from the input stream, and comments are correctly read and either written to the output stream (if an output
31 | comment) or ignored (if not an output comment). Sequences of characters surrounded by single quotes are read in as
32 | single tokens. A pair of adjacent single quotes are stored as a single quote, and underscore characters are stored
33 | as blanks.
34 */
35 class NxsToken
36 {
37 public:
38
39 enum NxsTokenFlags /* For use with the variable labileFlags */
40 {
41 saveCommandComments = 0x0001, /* if set, command comments of the form [&X] are not ignored but are instead saved as regular tokens (without the square brackets, however) */
42 parentheticalToken = 0x0002, /* if set, and if next character encountered is a left parenthesis, token will include everything up to the matching right parenthesis */
43 curlyBracketedToken = 0x0004, /* if set, and if next character encountered is a left curly bracket, token will include everything up to the matching right curly bracket */
44 doubleQuotedToken = 0x0008, /* if set, grabs entire phrase surrounded by double quotes */
45 singleCharacterToken = 0x0010, /* if set, next non-whitespace character returned as token */
46 newlineIsToken = 0x0020, /* if set, newline character treated as a token and atEOL set if newline encountered */
47 tildeIsPunctuation = 0x0040, /* if set, tilde character treated as punctuation and returned as a separate token */
48 useSpecialPunctuation = 0x0080, /* if set, character specified by the data member special is treated as punctuation and returned as a separate token */
49 hyphenNotPunctuation = 0x0100, /* if set, the hyphen character is not treated as punctutation (it is normally returned as a separate token) */
50 preserveUnderscores = 0x0200, /* if set, underscore characters inside tokens are not converted to blank spaces (normally, all underscores are automatically converted to blanks) */
51 ignorePunctuation = 0x0400 /* if set, the normal punctuation symbols are treated the same as any other darkspace characters */
52 };
53
54 NxsString errormsg;
55
56 NxsToken(istream &i);
57 virtual ~NxsToken();
58
59 bool AtEOF();
60 bool AtEOL();
61 bool Abbreviation(NxsString s);
62 bool Begins(NxsString s, bool respect_case = false);
63 void BlanksToUnderscores();
64 bool Equals(NxsString s, bool respect_case = false);
65 long GetFileColumn() const;
66 file_pos GetFilePosition() const;
67 long GetFileLine() const;
68 void GetNextToken();
69 NxsString GetToken(bool respect_case = true);
70 const char *GetTokenAsCStr(bool respect_case = true);
71 const NxsString &GetTokenReference();
72 int GetTokenLength() const;
73 bool IsPlusMinusToken();
74 bool IsPunctuationToken();
75 bool IsWhitespaceToken();
76 void ReplaceToken(const NxsString s);
77 void ResetToken();
78 void SetSpecialPunctuationCharacter(char c);
79 void SetLabileFlagBit(int bit);
80 bool StoppedOn(char ch);
81 void StripWhitespace();
82 void ToUpper();
83 void Write(ostream &out);
84 void Writeln(ostream &out);
85
86 virtual void OutputComment(const NxsString &msg);
87 void GetNextContiguousToken(char stop_char); // Added by BQM
88 protected:
89
90 void AppendToComment(char ch);
91 void AppendToToken(char ch);
92 char GetNextChar();
93 void GetComment();
94 void GetCurlyBracketedToken();
95 void GetDoubleQuotedToken();
96 void GetQuoted();
97 void GetParentheticalToken();
98 bool IsPunctuation(char ch);
99 bool IsWhitespace(char ch);
100
101 private:
102
103 istream ∈ /* reference to input stream from which tokens will be read */
104 file_pos filepos; /* current file position (for Metrowerks compiler, type is streampos rather than long) */
105 long fileline; /* current file line */
106 long filecol; /* current column in current line (refers to column immediately following token just read) */
107 NxsString token; /* the character buffer used to store the current token */
108 NxsString comment; /* temporary buffer used to store output comments while they are being built */
109 char saved; /* either '\0' or is last character read from input stream */
110 bool atEOF; /* true if end of file has been encountered */
111 bool atEOL; /* true if newline encountered while newlineIsToken labile flag set */
112 char special; /* ad hoc punctuation character; default value is '\0' */
113 int labileFlags; /* storage for flags in the NxsTokenFlags enum */
114 char punctuation[21]; /* stores the 20 NEXUS punctuation characters */
115 char whitespace[4]; /* stores the 3 whitespace characters: blank space, tab and newline */
116 };
117
118 typedef NxsToken NexusToken;
119
120 /*----------------------------------------------------------------------------------------------------------------------
121 | Returns the token for functions that only need read only access - faster than GetToken.
122 */
GetTokenReference()123 inline const NxsString &NxsToken::GetTokenReference()
124 {
125 return token;
126 }
127
128 /*----------------------------------------------------------------------------------------------------------------------
129 | This function is called whenever an output comment (i.e., a comment beginning with an exclamation point) is found
130 | in the data file. This version of OutputComment does nothing; override this virtual function to display the output
131 | comment in the most appropriate way for the platform you are supporting.
132 */
OutputComment(const NxsString & msg)133 inline void NxsToken::OutputComment(
134 const NxsString &msg) /* the contents of the printable comment discovered in the NEXUS data file */
135 {
136 # if defined(HAVE_PRAGMA_UNUSED)
137 # pragma unused(msg)
138 # endif
139 }
140
141 /*----------------------------------------------------------------------------------------------------------------------
142 | Adds `ch' to end of comment NxsString.
143 */
AppendToComment(char ch)144 inline void NxsToken::AppendToComment(
145 char ch) /* character to be appended to comment */
146 {
147 comment += ch;
148 }
149
150 /*----------------------------------------------------------------------------------------------------------------------
151 | Adds `ch' to end of current token.
152 */
AppendToToken(char ch)153 inline void NxsToken::AppendToToken(
154 char ch) /* character to be appended to token */
155 {
156 // First three lines proved necessary to keep Borland's implementation of STL from crashing
157 // under some circumstances (may no longer be necessary)
158 //
159 char s[2];
160 s[0] = ch;
161 s[1] = '\0';
162
163 token += s;
164 }
165
166 /*----------------------------------------------------------------------------------------------------------------------
167 | Reads next character from in and does all of the following before returning it to the calling function:
168 |~
169 | o if character read is either a carriage return or line feed, the variable line is incremented by one and the
170 | variable col is reset to zero
171 | o if character read is a carriage return, and a peek at the next character to be read reveals that it is a line
172 | feed, then the next (line feed) character is read
173 | o if either a carriage return or line feed is read, the character returned to the calling function is '\n' if
174 | character read is neither a carriage return nor a line feed, col is incremented by one and the character is
175 | returned as is to the calling function
176 | o in all cases, the variable filepos is updated using a call to the tellg function of istream.
177 |~
178 */
GetNextChar()179 inline char NxsToken::GetNextChar()
180 {
181 int ch = in.get();
182 int failed = in.bad();
183 if (failed)
184 {
185 errormsg = "Unknown error reading data file (check to make sure file exists)";
186 throw NxsException(errormsg);
187 }
188
189 if (ch == 13 || ch == 10)
190 {
191 fileline++;
192 filecol = 1L;
193
194 if (ch == 13 && (int)in.peek() == 10)
195 ch = in.get();
196
197 atEOL = 1;
198 }
199 else if (ch == EOF)
200 atEOF = 1;
201 else
202 {
203 filecol++;
204 atEOL = 0;
205 }
206
207 # if defined(__DECCXX)
208 filepos = 0L;
209 # else
210 // BQM this cause crash compiling with clang under Windows!
211 // filepos = in.tellg();
212 filepos += 1;
213 # endif
214
215 if (atEOF)
216 return '\0';
217 else if (atEOL)
218 return '\n';
219 else
220 return (char)ch;
221 }
222
223 /*----------------------------------------------------------------------------------------------------------------------
224 | Returns true if character supplied is considered a punctuation character. The following twenty characters are
225 | considered punctuation characters:
226 |>
227 | ()[]{}/\,;:=*'"`+-<>
228 |>
229 | Exceptions:
230 |~
231 | o The tilde character ('~') is also considered punctuation if the tildeIsPunctuation labile flag is set
232 | o The special punctuation character (specified using the SetSpecialPunctuationCharacter) is also considered
233 | punctuation if the useSpecialPunctuation labile flag is set
234 | o The hyphen (i.e., minus sign) character ('-') is not considered punctuation if the hyphenNotPunctuation
235 | labile flag is set
236 |~
237 | Use the SetLabileFlagBit method to set one or more NxsLabileFlags flags in `labileFlags'
238 */
IsPunctuation(char ch)239 inline bool NxsToken::IsPunctuation(
240 char ch) /* the character in question */
241 {
242 // PAUP 4.0b10
243 // o allows ]`<> inside taxon names
244 // o allows `<> inside taxset names
245 //
246 bool is_punctuation = false;
247 if (strchr(punctuation, ch))
248 is_punctuation = true;
249 if (labileFlags & tildeIsPunctuation && ch == '~')
250 is_punctuation = true;
251 if (labileFlags & useSpecialPunctuation && ch == special)
252 is_punctuation = true;
253 if (labileFlags & hyphenNotPunctuation && ch == '-')
254 is_punctuation = false;
255
256 return is_punctuation;
257 }
258
259 /*----------------------------------------------------------------------------------------------------------------------
260 | Returns true if character supplied is considered a whitespace character. Note: treats '\n' as darkspace if labile
261 | flag newlineIsToken is in effect.
262 */
IsWhitespace(char ch)263 inline bool NxsToken::IsWhitespace(
264 char ch) /* the character in question */
265 {
266 bool ws = false;
267
268 // If ch is found in the whitespace array, it's whitespace
269 //
270 if (strchr(whitespace, ch))
271 ws = true;
272
273 // Unless of course ch is the newline character and we're currently
274 // treating newlines as darkspace!
275 //
276 if (labileFlags & newlineIsToken && ch == '\n')
277 ws = false;
278
279 return ws;
280 }
281
282 /*----------------------------------------------------------------------------------------------------------------------
283 | Returns true if and only if last call to GetNextToken encountered the end-of-file character (or for some reason the
284 | input stream is now out of commission).
285 */
AtEOF()286 inline bool NxsToken::AtEOF()
287 {
288 return atEOF;
289 }
290
291 /*----------------------------------------------------------------------------------------------------------------------
292 | Returns true if and only if last call to GetNextToken encountered the newline character while the newlineIsToken
293 | labile flag was in effect.
294 */
AtEOL()295 inline bool NxsToken::AtEOL()
296 {
297 return atEOL;
298 }
299
300 /*----------------------------------------------------------------------------------------------------------------------
301 | Converts all blanks in token to underscore characters. Normally, underscores found in the tokens read from a NEXUS
302 | file are converted to blanks automatically as they are read; this function reverts the blanks back to underscores.
303 */
BlanksToUnderscores()304 inline void NxsToken::BlanksToUnderscores()
305 {
306 token.BlanksToUnderscores();
307 }
308
309 /*----------------------------------------------------------------------------------------------------------------------
310 | Returns value stored in `filecol', which keeps track of the current column in the data file (i.e., number of
311 | characters since the last new line was encountered).
312 */
GetFileColumn()313 inline long NxsToken::GetFileColumn() const
314 {
315 return filecol;
316 }
317
318 /*----------------------------------------------------------------------------------------------------------------------
319 | Returns value stored in filepos, which keeps track of the current position in the data file (i.e., number of
320 | characters since the beginning of the file). Note: for Metrowerks compiler, you must use the offset() method of
321 | the streampos class to use the value returned.
322 */
GetFilePosition()323 inline file_pos NxsToken::GetFilePosition() const
324 {
325 return filepos;
326 }
327
328 /*----------------------------------------------------------------------------------------------------------------------
329 | Returns value stored in `fileline', which keeps track of the current line in the data file (i.e., number of new
330 | lines encountered thus far).
331 */
GetFileLine()332 inline long NxsToken::GetFileLine() const
333 {
334 return fileline;
335 }
336
337 /*----------------------------------------------------------------------------------------------------------------------
338 | Returns the data member `token'. Specifying false for`respect_case' parameter causes all characters in `token'
339 | to be converted to upper case before `token' is returned. Specifying true results in GetToken returning exactly
340 | what it read from the file.
341 */
GetToken(bool respect_case)342 inline NxsString NxsToken::GetToken(
343 bool respect_case) /* determines whether token is converted to upper case before being returned */
344 {
345 if (!respect_case)
346 ToUpper();
347
348 return token;
349 }
350
351 /*----------------------------------------------------------------------------------------------------------------------
352 | Returns the data member `token' as a C-style string. Specifying false for`respect_case' parameter causes all
353 | characters in `token' to be converted to upper case before the `token' C-string is returned. Specifying true
354 | results in GetTokenAsCStr returning exactly what it read from the file.
355 */
GetTokenAsCStr(bool respect_case)356 inline const char *NxsToken::GetTokenAsCStr(
357 bool respect_case) /* determines whether token is converted to upper case before being returned */
358 {
359 if (!respect_case)
360 ToUpper();
361
362 return token.c_str();
363 }
364
365 /*----------------------------------------------------------------------------------------------------------------------
366 | Returns token.size().
367 */
GetTokenLength()368 inline int NxsToken::GetTokenLength() const
369 {
370 return token.size();
371 }
372
373 /*----------------------------------------------------------------------------------------------------------------------
374 | Returns true if current token is a single character and this character is either '+' or '-'.
375 */
IsPlusMinusToken()376 inline bool NxsToken::IsPlusMinusToken()
377 {
378 if (token.size() == 1 && ( token[0] == '+' || token[0] == '-') )
379 return true;
380 else
381 return false;
382 }
383
384 /*----------------------------------------------------------------------------------------------------------------------
385 | Returns true if current token is a single character and this character is a punctuation character (as defined in
386 | IsPunctuation function).
387 */
IsPunctuationToken()388 inline bool NxsToken::IsPunctuationToken()
389 {
390 if (token.size() == 1 && IsPunctuation( token[0]))
391 return true;
392 else
393 return false;
394 }
395
396 /*----------------------------------------------------------------------------------------------------------------------
397 | Returns true if current token is a single character and this character is a whitespace character (as defined in
398 | IsWhitespace function).
399 */
IsWhitespaceToken()400 inline bool NxsToken::IsWhitespaceToken()
401 {
402 if (token.size() == 1 && IsWhitespace( token[0]))
403 return true;
404 else
405 return false;
406 }
407
408 /*----------------------------------------------------------------------------------------------------------------------
409 | Replaces current token NxsString with s.
410 */
ReplaceToken(const NxsString s)411 inline void NxsToken::ReplaceToken(
412 const NxsString s) /* NxsString to replace current token NxsString */
413 {
414 token = s;
415 }
416
417 /*----------------------------------------------------------------------------------------------------------------------
418 | Sets token to the empty NxsString ("").
419 */
ResetToken()420 inline void NxsToken::ResetToken()
421 {
422 token.clear();
423 }
424
425 /*----------------------------------------------------------------------------------------------------------------------
426 | Sets the special punctuation character to `c'. If the labile bit useSpecialPunctuation is set, this character will
427 | be added to the standard list of punctuation symbols, and will be returned as a separate token like the other
428 | punctuation characters.
429 */
SetSpecialPunctuationCharacter(char c)430 inline void NxsToken::SetSpecialPunctuationCharacter(
431 char c) /* the character to which `special' is set */
432 {
433 special = c;
434 }
435
436 /*----------------------------------------------------------------------------------------------------------------------
437 | Sets the bit specified in the variable `labileFlags'. The available bits are specified in the NxsTokenFlags enum.
438 | All bits in `labileFlags' are cleared after each token is read.
439 */
SetLabileFlagBit(int bit)440 inline void NxsToken::SetLabileFlagBit(
441 int bit) /* the bit (see NxsTokenFlags enum) to set in `labileFlags' */
442 {
443 labileFlags |= bit;
444 }
445
446 /*----------------------------------------------------------------------------------------------------------------------
447 | Checks character stored in the variable saved to see if it matches supplied character `ch'. Good for checking such
448 | things as whether token stopped reading characters because it encountered a newline (and labileFlags bit
449 | newlineIsToken was set):
450 |>
451 | StoppedOn('\n');
452 |>
453 | or whether token stopped reading characters because of a punctuation character such as a comma:
454 |>
455 | StoppedOn(',');
456 |>
457 */
StoppedOn(char ch)458 inline bool NxsToken::StoppedOn(
459 char ch) /* the character to compare with saved character */
460 {
461 if (saved == ch)
462 return true;
463 else
464 return false;
465 }
466
467 /*----------------------------------------------------------------------------------------------------------------------
468 | Simply outputs the current NxsString stored in `token' to the output stream `out'. Does not send a newline to the
469 | output stream afterwards.
470 */
Write(ostream & out)471 inline void NxsToken::Write(
472 ostream &out) /* the output stream to which to write token NxsString */
473 {
474 out << token;
475 }
476
477 /*----------------------------------------------------------------------------------------------------------------------
478 | Simply outputs the current NxsString stored in `token' to the output stream `out'. Sends a newline to the output
479 | stream afterwards.
480 */
Writeln(ostream & out)481 inline void NxsToken::Writeln(
482 ostream &out) /* the output stream to which to write `token' */
483 {
484 out << token << endl;
485 }
486
487 /**
488 * Added by BQM: return the contiguous string (including white space) as token
489 * until hitting stop_char
490 * @param stop_char a character to stop reading in
491 */
GetNextContiguousToken(char stop_char)492 inline void NxsToken::GetNextContiguousToken(char stop_char) {
493 ResetToken();
494
495 char ch = ' ';
496 if (saved == '\0' || IsWhitespace(saved))
497 {
498 // Skip leading whitespace
499 //
500 while( IsWhitespace(ch) && !atEOF)
501 ch = GetNextChar();
502 saved = ch;
503 }
504 for (;;) {
505
506 // Get next character either from saved or from input stream.
507 //
508 if (saved != '\0')
509 {
510 ch = saved;
511 saved = '\0';
512 }
513 else
514 ch = GetNextChar();
515
516 // Break now if we've hit EOF.
517 //
518 if (atEOF)
519 break;
520 if (ch == stop_char) {
521 saved = ch;
522 break;
523 }
524 AppendToToken(ch);
525 }
526 // Skip ending whitespace
527 if (token.empty()) return;
528 NxsString::iterator last = token.end();
529 while (last != token.begin() && IsWhitespace(*(last-1))) {
530 last--;
531 }
532 if (last != token.end()) token.erase(last, token.end());
533 }
534
535 #endif
536