1 /*
2 * Portable Agile C++ Classes (PACC)
3 * Copyright (C) 2001-2003 by Marc Parizeau
4 * http://manitou.gel.ulaval.ca/~parizeau/PACC
5 *
6 * This library is free software; you can redistribute it and/or
7 * modify it under the terms of the GNU Lesser General Public
8 * License as published by the Free Software Foundation; either
9 * version 2.1 of the License, or (at your option) any later version.
10 *
11 * This library is distributed in the hope that it will be useful,
12 * but WITHOUT ANY WARRANTY; without even the implied warranty of
13 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
14 * Lesser General Public License for more details.
15 *
16 * You should have received a copy of the GNU Lesser General Public
17 * License along with this library; if not, write to the Free Software
18 * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA
19 *
20 * Contact:
21 * Laboratoire de Vision et Systemes Numeriques
22 * Departement de genie electrique et de genie informatique
23 * Universite Laval, Quebec, Canada, G1K 7P4
24 * http://vision.gel.ulaval.ca
25 *
26 */
27
28 /*!
29 * \file PACC/Util/Tokenizer.cpp
30 * \brief Class methods for the input stream tokenizer.
31 * \author Marc Parizeau, Laboratoire de vision et systèmes numériques, Université Laval
32 * $Revision: 1.7.2.1 $
33 * $Date: 2007/09/10 18:24:10 $
34 */
35
36 #include "Util/Tokenizer.hpp"
37 #include "Util/Assert.hpp"
38 #include <stdexcept>
39 #include <cstring>
40
41 using namespace std;
42 using namespace PACC;
43
44 /*!
45 The internal read buffer size can be set with argument \c inBufSize (default=1024). This buffer can also be disactivated by setting this argument to 0. The internal read buffer can greatly accelerate the parse of the stream. A size between 512 and 1024 appears to give good results in most circumstances.
46
47 \attention It should be noted that the use of such a buffer implies that the stream must be fully parsed by this tokenizer, because there is no way to put it's content back into the stream.
48 */
Tokenizer(unsigned int inBufSize)49 Tokenizer::Tokenizer(unsigned int inBufSize)
50 : mLine(1), mStream(0), mBuffer(0), mBufSize(0), mBufPtr(0), mBufCount(0)
51 {
52 setDelimiters(" \t\n\r", "");
53 setBufferSize(inBufSize);
54 }
55
56 /*!
57 The internal read buffer size can be set with argument \c inBufSize (default=1024). This buffer can also be disactivated by setting this argument to 0. The internal read buffer can greatly accelerate the parse of the stream. A size between 512 and 1024 appears to give the best results in most circumstances.
58
59 \attention It should be noted that the use of such a buffer implies that the stream must be fully parsed by this tokenizer, because there is no way to put it's content back into the stream.
60 */
Tokenizer(istream & inStream,unsigned int inBufSize)61 Tokenizer::Tokenizer(istream& inStream, unsigned int inBufSize)
62 : mLine(1), mStream(&inStream), mBuffer(0), mBufSize(0), mBufPtr(0), mBufCount(0)
63 {
64 setDelimiters(" \t\n\r", "");
65 setBufferSize(inBufSize);
66 }
67
68 /*!
69 */
~Tokenizer(void)70 Tokenizer::~Tokenizer(void) {
71 if(mBuffer != 0) delete[] mBuffer;
72 }
73
74 /*!
75 \return String of next token found.
76
77 This method finds the next token in the default input stream, after removing any leading white space. An empty token (string) means that end of stream was reached.
78 \attention This method is depricated.
79 */
getNextToken(void)80 string Tokenizer::getNextToken(void)
81 {
82 string lToken;
83 getNextToken(lToken);
84 return lToken;
85 }
86
87 /*!
88 \return True if a valid token was found, false otherwise.
89
90 This method finds the next token in the default input stream, after removing any leading white space. The token is returned through argument \c outToken. An empty token (string) means that end of stream was reached.
91 */
getNextToken(string & outToken)92 bool Tokenizer::getNextToken(string& outToken)
93 {
94 PACC_AssertM(mStream, "undefined input stream!");
95 if(!mTokens.empty()) {
96 // use putback tokens if available
97 outToken = mTokens.top();
98 mTokens.pop();
99 } else if(mBufSize == 0) {
100 // DO NOT use the input read buffer
101 register unsigned char lChar;
102 // get rid of leading white space
103 do {
104 lChar = mStream->get();
105 if(mStream->eof()) {
106 outToken.clear();
107 return false;
108 }
109 if(lChar == '\n') ++mLine;
110 } while(mDelimiters[lChar] == eWhiteSpace);
111 outToken = lChar;
112 // append until next white space or single char token
113 char lOutBuffer[100];
114 while(mDelimiters[lChar] == 0 && !mStream->eof()) {
115 unsigned lOutCount = 0;
116 while(lOutCount < sizeof(lOutBuffer)) {
117 lChar = mStream->get();
118 if(mStream->eof()) break;
119 if(mDelimiters[lChar] != 0) {
120 // put character back into stream
121 mStream->putback(lChar);
122 break;
123 }
124 lOutBuffer[lOutCount++] = lChar;
125 // check for end-of-line counter
126 if(lChar == '\n') ++mLine;
127 }
128 outToken.append(lOutBuffer, lOutCount);
129 }
130 } else {
131 // otherwise, use the input read buffer
132 register unsigned char lChar;
133 // get rid of leading white space
134 do {
135 if(mBufCount == 0 && fillBuffer() == 0) {
136 outToken.clear();
137 return false;
138 }
139 lChar = *(mBufPtr++); --mBufCount;
140 if(lChar == '\n') ++mLine;
141 } while(mDelimiters[lChar] == eWhiteSpace);
142 outToken = lChar;
143 // append until next white space or single char token
144 char lOutBuffer[100];
145 while(mDelimiters[lChar] == 0 && mBufPtr != mBuffer) {
146 unsigned lOutCount = 0;
147 while(lOutCount < sizeof(lOutBuffer)) {
148 if(mBufCount == 0 && fillBuffer() == 0) break;
149 lChar = *(mBufPtr++); --mBufCount;
150 if(mDelimiters[lChar] != 0) {
151 // put character back into buffer
152 --mBufPtr; ++mBufCount;
153 break;
154 }
155 lOutBuffer[lOutCount++] = lChar;
156 // check for end-of-line counter
157 if(lChar == '\n') ++mLine;
158 }
159 outToken.append(lOutBuffer, lOutCount);
160 }
161 }
162 return !outToken.empty();
163 }
164
165 /*
166 */
getSingleCharTokens(void) const167 string Tokenizer::getSingleCharTokens(void) const
168 {
169 string lSingleCharTokens;
170 for(unsigned int i=0; i < 256; ++i) if(mDelimiters[i] == 2) lSingleCharTokens += (char) i;
171 return lSingleCharTokens;
172 }
173
174 /*
175 */
getWhiteSpace(void) const176 string Tokenizer::getWhiteSpace(void) const
177 {
178 string lWhiteSpace;
179 for(unsigned int i=0; i < 256; ++i) if(mDelimiters[i] == 1) lWhiteSpace += (char) i;
180 return lWhiteSpace;
181 }
182
183 /*! \return -1 if end-of-stream.
184
185 This method returns the next character without removing it from the input stream.
186 */
peekNextChar(void)187 int Tokenizer::peekNextChar(void)
188 {
189 PACC_AssertM(mStream, "undefined input stream!");
190 // check for putback tokens
191 if(!mTokens.empty()) return mTokens.top()[0];
192 else if(mBufSize == 0) return mStream->peek();
193 else {
194 // otherwise, use buffer
195 if(mBufCount == 0 && fillBuffer() == 0) return -1;
196 return *mBufPtr;
197 }
198 }
199
200 /*
201 \attention Any number of tokens can be put back. However, take note that these WILL NOT be parsed again, if the user decides to change delimiters.
202 */
putbackToken(const string & inToken)203 void Tokenizer::putbackToken(const string& inToken)
204 {
205 PACC_AssertM(!inToken.empty(), "cannot put back an empty string!");
206 mTokens.push(inToken);
207 }
208
209 /*
210 The minimum buffer size is set to 10. A smaller buffer size will disable the use of the internal read buffer.
211
212 \attention This method should be called prior to the first call of method Tokenizer::getNextToken, because it is an error to resize a buffer that is not empty. Method Tokenizer::setStream should be called explicitely to flush the buffer .
213 */
setBufferSize(unsigned int inSize)214 void Tokenizer::setBufferSize(unsigned int inSize)
215 {
216 if(mBuffer != 0) delete[] mBuffer;
217 if(inSize < 10) inSize = 0;
218 if(inSize > 0) mBuffer = new char[inSize];
219 else mBuffer = 0;
220 mBufSize = inSize;
221 mBufCount = 0;
222 }
223
224 /*!
225 The white space and single character delimiters are set to the characters contained in strings \c inWhiteSpace and \c inSingleCharTokens, respectively. The white space characters delimit tokens but are not tokens themselves. Single character tokens are tokens that delimit other tokens.
226 */
setDelimiters(const string & inWhiteSpace,const string & inSingleCharTokens)227 void Tokenizer::setDelimiters(const string &inWhiteSpace, const string &inSingleCharTokens)
228 {
229 memset(mDelimiters, 0, sizeof(mDelimiters));
230 for(string::const_iterator i = inWhiteSpace.begin(); i != inWhiteSpace.end(); ++i) {
231 mDelimiters[(unsigned)*i] = eWhiteSpace;
232 }
233 for(string::const_iterator i = inSingleCharTokens.begin(); i != inSingleCharTokens.end(); ++i) {
234 PACC_AssertM(mDelimiters[(unsigned)*i] == 0, "a delimiter cannot be both white space and single char token!");
235 mDelimiters[(unsigned)*i] = eSingleChar;
236 }
237 }
238
239 /*!
240 This method sets a new stream to be tokenized. It also flushes the internal read buffer.
241 */
setStream(istream & inStream)242 void Tokenizer::setStream(istream& inStream)
243 {
244 mStream = &inStream;
245 mLine = 1;
246 mBufCount = 0;
247 mTokens = stack<string>();
248 }
249