1 /*
2  *  Portable Agile C++ Classes (PACC)
3  *  Copyright (C) 2001-2003 by Marc Parizeau
4  *  http://manitou.gel.ulaval.ca/~parizeau/PACC
5  *
6  *  This library is free software; you can redistribute it and/or
7  *  modify it under the terms of the GNU Lesser General Public
8  *  License as published by the Free Software Foundation; either
9  *  version 2.1 of the License, or (at your option) any later version.
10  *
11  *  This library is distributed in the hope that it will be useful,
12  *  but WITHOUT ANY WARRANTY; without even the implied warranty of
13  *  MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
14  *  Lesser General Public License for more details.
15  *
16  *  You should have received a copy of the GNU Lesser General Public
17  *  License along with this library; if not, write to the Free Software
18  *  Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA  02111-1307  USA
19  *
20  *  Contact:
21  *  Laboratoire de Vision et Systemes Numeriques
22  *  Departement de genie electrique et de genie informatique
23  *  Universite Laval, Quebec, Canada, G1K 7P4
24  *  http://vision.gel.ulaval.ca
25  *
26  */
27 
28 /*!
29  * \file PACC/Util/Tokenizer.cpp
30  * \brief Class methods for the input stream tokenizer.
31  * \author Marc Parizeau, Laboratoire de vision et systèmes numériques, Université Laval
32  * $Revision: 1.7.2.1 $
33  * $Date: 2007/09/10 18:24:10 $
34  */
35 
36 #include "Util/Tokenizer.hpp"
37 #include "Util/Assert.hpp"
38 #include <stdexcept>
39 #include <cstring>
40 
41 using namespace std;
42 using namespace PACC;
43 
44 /*!
45 The internal read buffer size can be set with argument \c inBufSize (default=1024). This buffer can also be disactivated by setting this argument to 0. The internal read buffer can greatly accelerate the parse of the stream. A size between 512 and 1024 appears to give good results in most circumstances.
46 
47  \attention It should be noted that the use of such a buffer implies that the stream must be fully parsed by this tokenizer, because there is no way to put it's content back into the stream.
48  */
Tokenizer(unsigned int inBufSize)49 Tokenizer::Tokenizer(unsigned int inBufSize)
50 : mLine(1), mStream(0), mBuffer(0), mBufSize(0), mBufPtr(0), mBufCount(0)
51 {
52 	setDelimiters(" \t\n\r", "");
53 	setBufferSize(inBufSize);
54 }
55 
56 /*!
57 The internal read buffer size can be set with argument \c inBufSize (default=1024). This buffer can also be disactivated by setting this argument to 0. The internal read buffer can greatly accelerate the parse of the stream. A size between 512 and 1024 appears to give the best results in most circumstances.
58 
59  \attention It should be noted that the use of such a buffer implies that the stream must be fully parsed by this tokenizer, because there is no way to put it's content back into the stream.
60  */
Tokenizer(istream & inStream,unsigned int inBufSize)61 Tokenizer::Tokenizer(istream& inStream, unsigned int inBufSize)
62 : mLine(1), mStream(&inStream), mBuffer(0), mBufSize(0), mBufPtr(0), mBufCount(0)
63 {
64 	setDelimiters(" \t\n\r", "");
65 	setBufferSize(inBufSize);
66 }
67 
68 /*!
69  */
~Tokenizer(void)70 Tokenizer::~Tokenizer(void) {
71 	if(mBuffer != 0) delete[] mBuffer;
72 }
73 
74 /*!
75 \return String of next token found.
76 
77  This method finds the next token in the default input stream, after removing any leading white space. An empty token (string) means that end of stream was reached.
78  \attention This method is depricated.
79  */
getNextToken(void)80 string Tokenizer::getNextToken(void)
81 {
82 	string lToken;
83 	getNextToken(lToken);
84 	return lToken;
85 }
86 
87 /*!
88 \return True if a valid token was found, false otherwise.
89 
90  This method finds the next token in the default input stream, after removing any leading white space. The token is returned through argument \c outToken. An empty token (string) means that end of stream was reached.
91  */
getNextToken(string & outToken)92 bool Tokenizer::getNextToken(string& outToken)
93 {
94 	PACC_AssertM(mStream, "undefined input stream!");
95 	if(!mTokens.empty()) {
96 		// use putback tokens if available
97 		outToken = mTokens.top();
98 		mTokens.pop();
99 	} else if(mBufSize == 0) {
100 		// DO NOT use the input read buffer
101 		register unsigned char lChar;
102 		// get rid of leading white space
103 		do {
104 			lChar = mStream->get();
105 			if(mStream->eof()) {
106 				outToken.clear();
107 				return false;
108 			}
109 			if(lChar == '\n') ++mLine;
110 		} while(mDelimiters[lChar] == eWhiteSpace);
111 		outToken = lChar;
112 		// append until next white space or single char token
113 		char lOutBuffer[100];
114 		while(mDelimiters[lChar] == 0 && !mStream->eof()) {
115 			unsigned lOutCount = 0;
116 			while(lOutCount < sizeof(lOutBuffer)) {
117 				lChar = mStream->get();
118 				if(mStream->eof()) break;
119 				if(mDelimiters[lChar] != 0) {
120 					// put character back into stream
121 					mStream->putback(lChar);
122 					break;
123 				}
124 				lOutBuffer[lOutCount++] = lChar;
125 				// check for end-of-line counter
126 				if(lChar == '\n') ++mLine;
127 			}
128 			outToken.append(lOutBuffer, lOutCount);
129 		}
130 	} else {
131 		// otherwise, use the input read buffer
132 		register unsigned char lChar;
133 		// get rid of leading white space
134 		do {
135 			if(mBufCount == 0 && fillBuffer() == 0) {
136 				outToken.clear();
137 				return false;
138 			}
139 			lChar = *(mBufPtr++); --mBufCount;
140 			if(lChar == '\n') ++mLine;
141 		} while(mDelimiters[lChar] == eWhiteSpace);
142 		outToken = lChar;
143 		// append until next white space or single char token
144 		char lOutBuffer[100];
145 		while(mDelimiters[lChar] == 0 && mBufPtr != mBuffer) {
146 			unsigned lOutCount = 0;
147 			while(lOutCount < sizeof(lOutBuffer)) {
148 				if(mBufCount == 0 && fillBuffer() == 0) break;
149 				lChar = *(mBufPtr++); --mBufCount;
150 				if(mDelimiters[lChar] != 0) {
151 					// put character back into buffer
152 					--mBufPtr; ++mBufCount;
153 					break;
154 				}
155 				lOutBuffer[lOutCount++] = lChar;
156 				// check for end-of-line counter
157 				if(lChar == '\n') ++mLine;
158 			}
159 			outToken.append(lOutBuffer, lOutCount);
160 		}
161 	}
162 	return !outToken.empty();
163 }
164 
165 /*
166  */
getSingleCharTokens(void) const167 string Tokenizer::getSingleCharTokens(void) const
168 {
169 	string lSingleCharTokens;
170 	for(unsigned int i=0; i < 256; ++i) if(mDelimiters[i] == 2) lSingleCharTokens += (char) i;
171 	return lSingleCharTokens;
172 }
173 
174 /*
175  */
getWhiteSpace(void) const176 string Tokenizer::getWhiteSpace(void) const
177 {
178 	string lWhiteSpace;
179 	for(unsigned int i=0; i < 256; ++i) if(mDelimiters[i] == 1) lWhiteSpace += (char) i;
180 	return lWhiteSpace;
181 }
182 
183 /*! \return -1 if end-of-stream.
184 
185  This method returns the next character without removing it from the input stream.
186  */
peekNextChar(void)187 int Tokenizer::peekNextChar(void)
188 {
189 	PACC_AssertM(mStream, "undefined input stream!");
190 	// check for putback tokens
191 	if(!mTokens.empty()) return mTokens.top()[0];
192 	else if(mBufSize == 0) return mStream->peek();
193 	else {
194 		// otherwise, use buffer
195 		if(mBufCount == 0 && fillBuffer() == 0) return -1;
196 		return *mBufPtr;
197 	}
198 }
199 
200 /*
201  \attention Any number of tokens can be put back. However, take note that these WILL NOT be parsed again, if the user decides to change delimiters.
202  */
putbackToken(const string & inToken)203 void Tokenizer::putbackToken(const string& inToken)
204 {
205 	PACC_AssertM(!inToken.empty(), "cannot put back an empty string!");
206 	mTokens.push(inToken);
207 }
208 
209 /*
210  The minimum buffer size is set to 10. A smaller buffer size will disable the use of the internal read buffer.
211 
212 \attention This method should be called prior to the first call of method Tokenizer::getNextToken, because it is an error to resize a buffer that is not empty. Method Tokenizer::setStream should be called explicitely to flush the buffer .
213  */
setBufferSize(unsigned int inSize)214 void Tokenizer::setBufferSize(unsigned int inSize)
215 {
216 	if(mBuffer != 0) delete[] mBuffer;
217 	if(inSize < 10) inSize = 0;
218 	if(inSize > 0) mBuffer = new char[inSize];
219 	else mBuffer = 0;
220 	mBufSize = inSize;
221 	mBufCount = 0;
222 }
223 
224 /*!
225 The white space and single character delimiters are set to the characters contained in strings \c inWhiteSpace and \c inSingleCharTokens, respectively. The white space characters delimit tokens but are not tokens themselves. Single character tokens are tokens that delimit other tokens.
226  */
setDelimiters(const string & inWhiteSpace,const string & inSingleCharTokens)227 void Tokenizer::setDelimiters(const string &inWhiteSpace, const string &inSingleCharTokens)
228 {
229 	memset(mDelimiters, 0, sizeof(mDelimiters));
230 	for(string::const_iterator i = inWhiteSpace.begin(); i != inWhiteSpace.end(); ++i) {
231 		mDelimiters[(unsigned)*i] = eWhiteSpace;
232 	}
233 	for(string::const_iterator i = inSingleCharTokens.begin(); i != inSingleCharTokens.end(); ++i) {
234 		PACC_AssertM(mDelimiters[(unsigned)*i] == 0, "a delimiter cannot be both white space and single char token!");
235 		mDelimiters[(unsigned)*i] = eSingleChar;
236 	}
237 }
238 
239 /*!
240 This method sets a new stream to be tokenized. It also flushes the internal read buffer.
241  */
setStream(istream & inStream)242 void Tokenizer::setStream(istream& inStream)
243 {
244 	mStream = &inStream;
245 	mLine = 1;
246 	mBufCount = 0;
247 	mTokens = stack<string>();
248 }
249