1 ///###////////////////////////////////////////////////////////////////////////
2 //
3 // Burton Computer Corporation
4 // http://www.burton-computer.com
5 // http://www.cooldevtools.com
6 // $Id: HtmlTokenizer.cc 272 2007-01-06 19:37:27Z brian $
7 //
8 // Copyright (C) 2007 Burton Computer Corporation
9 // ALL RIGHTS RESERVED
10 //
11 // This program is open source software; you can redistribute it
12 // and/or modify it under the terms of the Q Public License (QPL)
13 // version 1.0. Use of this software in whole or in part, including
14 // linking it (modified or unmodified) into other programs is
15 // subject to the terms of the QPL.
16 //
17 // This program is distributed in the hope that it will be useful,
18 // but WITHOUT ANY WARRANTY; without even the implied warranty of
19 // MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
20 // Q Public License for more details.
21 //
22 // You should have received a copy of the Q Public License
23 // along with this program; see the file LICENSE.txt. If not, visit
24 // the Burton Computer Corporation or CoolDevTools web site
25 // QPL pages at:
26 //
27 // http://www.burton-computer.com/qpl.html
28 // http://www.cooldevtools.com/qpl.html
29 //
30
31 #include <cstdlib>
32 #include "AbstractTokenReceiver.h"
33 #include "StringReader.h"
34 #include "RegularExpression.h"
35 #include "TempPtr.h"
36 #include "HtmlTokenizer.h"
37
38 static const char SEPARATOR = ' ';
39 static const int MAX_ENTITY_LENGTH = 6;
40 static const string URL_REGEX("[^a-z0-9_](href|src)[ \t\r\n]*=[ \t\r\n]*('[^>' \t\r\n]+|\"[^>\" \t\r\n]+|[^> \t\r\n]+)");
41 static const string TAG_BASE_PREFIX("T");
42 static const string TAG_BASE_SUFFIX("_");
43 static const string URL_PREFIX("U_");
44
HtmlTokenizer(AbstractTokenizer * textTokenizer,AbstractTokenizer * tagTokenizer,int maxTagLength)45 HtmlTokenizer::HtmlTokenizer(AbstractTokenizer *textTokenizer,
46 AbstractTokenizer *tagTokenizer,
47 int maxTagLength)
48 : m_textTokenizer(textTokenizer),
49 m_tagTokenizer(tagTokenizer),
50 m_maxTagLength(maxTagLength),
51 m_reader(0),
52 m_receiver(0)
53 {
54 }
55
~HtmlTokenizer()56 HtmlTokenizer::~HtmlTokenizer()
57 {
58 }
59
tokenize(AbstractTokenReceiver * receiver,AbstractCharReader * reader,const string & prefix)60 void HtmlTokenizer::tokenize(AbstractTokenReceiver *receiver,
61 AbstractCharReader *reader,
62 const string &prefix)
63 {
64 TempPtr<AbstractCharReader> tmp_reader(m_reader, reader);
65 TempPtr<AbstractTokenReceiver> tmp_receiver(m_receiver, receiver);
66 m_prefix = prefix;
67 m_textTokenizer->tokenize(m_receiver, this, m_prefix);
68 }
69
// Expands %XX escapes in url.
//
//   url    - text to decode.
//   buffer - cleared, then filled with the decoded text.
//
// A '%' is treated as an escape only when two hexadecimal digits follow it
// inside url; any other character, including a lone '%', is copied as is.
// Returns a reference to buffer so the call can be used inline.
const string &HtmlTokenizer::decodeUrl(const string &url,
                                       string &buffer)
{
  const char *const text = url.c_str();
  const int length = url.length();

  buffer.erase();
  buffer.reserve(length);

  int pos = 0;
  while (pos < length) {
    // Both digits of an escape must lie within the string.
    const bool is_escape = text[pos] == '%'
                           && pos + 2 < length
                           && is_xdigit(text[pos + 1])
                           && is_xdigit(text[pos + 2]);
    if (is_escape) {
      buffer += static_cast<char>(hex_to_int(text[pos + 1]) << 4 | hex_to_int(text[pos + 2]));
      pos += 3;
    } else {
      buffer += text[pos];
      pos += 1;
    }
  }

  if (is_debug) {
    cerr << "ORIG URL '" << url << "' DECODED '" << buffer << "'" << endl;
  }

  return buffer;
}
92
processTagUrls(const string & tag)93 void HtmlTokenizer::processTagUrls(const string &tag)
94 {
95 static RegularExpression url_regex(URL_REGEX, 3, true);
96
97 if (isCommentTag(tag)) {
98 return;
99 }
100
101 int offset = 0;
102 string url, decoded_url;
103 RegularExpression::MatchData match;
104 while (url_regex.match(tag.c_str() + offset)) {
105 url_regex.getMatch(2, url);
106 url_regex.getMatch(2, match);
107 StringReader reader(decodeUrl(url, decoded_url));
108 m_tagTokenizer->tokenize(m_receiver, &reader, URL_PREFIX);
109 offset += match.end_pos;
110 assert(offset <= tag.length());
111 }
112 }
113
// Builds the token prefix for a tag body.
//
//   tag_body - tag text; must be non-empty and must not start with
//              whitespace (both are asserted).
//   prefix   - overwritten with the result.
//
// The result is TAG_BASE_PREFIX, then "CMT" for a body starting with "!--"
// or otherwise the lower-cased alphanumeric characters of the first word,
// then TAG_BASE_SUFFIX.  Copying from the first word stops once the prefix
// is 8 characters long.  Returns a reference to prefix.
static const string &make_tag_prefix(const string &tag_body,
                                     string &prefix)
{
  assert(tag_body.length() > 0);
  assert(!is_space(tag_body[0]));

  // Length at which copying characters of the tag name stops.
  const string::size_type name_limit = 8;

  prefix = TAG_BASE_PREFIX;

  if (starts_with(tag_body, "!--")) {
    prefix += "CMT";
    prefix += TAG_BASE_SUFFIX;
    return prefix;
  }

  const char *cursor = tag_body.c_str();
  while (*cursor && !is_space(*cursor) && prefix.length() < name_limit) {
    // Non-alphanumeric characters (such as the '/' of a closing tag) are
    // passed over without being copied.
    if (is_alnum(*cursor)) {
      prefix += to_lower(*cursor);
    }
    ++cursor;
  }

  prefix += TAG_BASE_SUFFIX;
  return prefix;
}
133
processTagBody(const string & tag)134 void HtmlTokenizer::processTagBody(const string &tag)
135 {
136 if (is_debug) {
137 cerr << "PROCESSING TAG BODY: " << tag << endl;
138 }
139 string tag_prefix;
140 StringReader reader(tag);
141 m_tagTokenizer->tokenize(m_receiver, &reader, make_tag_prefix(tag, tag_prefix));
142 // TODO: leaving this in a for a while as a backward compatibility - remove after 3/31/2006
143 m_tagTokenizer->tokenize(m_receiver, &reader, URL_PREFIX);
144 // TODO: end backward compatibility hook
145 if (is_debug) {
146 cerr << "PROCESSED TAG BODY: " << tag << endl;
147 }
148 }
149
isCommentTag(const string & tag)150 bool HtmlTokenizer::isCommentTag(const string &tag)
151 {
152 return starts_with(tag, "!--") && ends_with(tag, "--");
153 }
154
isInvisibleTag(const string & tag)155 bool HtmlTokenizer::isInvisibleTag(const string &tag)
156 {
157 return isCommentTag(tag);
158 }
159
isOpenCommentTag(const string & tag)160 bool HtmlTokenizer::isOpenCommentTag(const string &tag)
161 {
162 return starts_with(tag, "!--");
163 }
164
isIncompleteCommentTag(const string & tag)165 bool HtmlTokenizer::isIncompleteCommentTag(const string &tag)
166 {
167 return starts_with(tag, "!--") && !ends_with(tag, "--");
168 }
169
processedTag(string & tag)170 bool HtmlTokenizer::processedTag(string &tag)
171 {
172 if (m_reader->currentChar() != '<') {
173 return false;
174 }
175
176 tag.erase();
177 Ptr<AbstractCharReaderPosition> pos(m_reader->createMark());
178 while (m_reader->forward() && (static_cast<int>(tag.length()) < m_maxTagLength || isOpenCommentTag(tag))) {
179 if (m_reader->currentChar() == '>' && !isIncompleteCommentTag(tag)) {
180 if (tag.length() > 0) {
181 processTagBody(tag);
182 processTagUrls(tag);
183 return true;
184 } else {
185 break;
186 }
187 }
188
189 char ch = processedEntity() ? currentChar() : m_reader->currentChar();
190 if (tag.length() > 0 || !is_space(ch)) {
191 tag += ch;
192 }
193 }
194
195 m_reader->returnToMark(pos.get());
196 return false;
197 }
198
parseEntityInteger(const string & entity)199 char HtmlTokenizer::parseEntityInteger(const string &entity)
200 {
201 char answer = ' ';
202 if (entity.length() > 1 && (entity[1] == 'x' || entity[1] == 'X')) {
203 answer = (char)hex_to_int(entity.c_str() + 1);
204 } else {
205 answer = (char)atoi(entity.c_str());
206 }
207 return answer;
208 }
209
processEntity(const string & entity)210 bool HtmlTokenizer::processEntity(const string &entity)
211 {
212 if (entity == "amp") {
213 setCurrentChar('&');
214 return true;
215 }
216
217 if (entity == "apos") {
218 setCurrentChar('\'');
219 return true;
220 }
221
222 if (entity == "quot") {
223 setCurrentChar('"');
224 return true;
225 }
226
227 if (entity == "lt") {
228 setCurrentChar('<');
229 return true;
230 }
231
232 if (entity == "gt") {
233 setCurrentChar('>');
234 return true;
235 }
236
237 if (entity == "nbsp") {
238 setCurrentChar(' ');
239 return true;
240 }
241
242 if (entity[0] == '#') {
243 setCurrentChar(parseEntityInteger(entity));
244 return true;
245 }
246
247 return false;
248 }
249
processedEntity()250 bool HtmlTokenizer::processedEntity()
251 {
252 if (m_reader->currentChar() != '&') {
253 return false;
254 }
255
256 Ptr<AbstractCharReaderPosition> startPos(m_reader->createMark());
257
258 string entity;
259 while (m_reader->forward() && static_cast<int>(entity.length()) < MAX_ENTITY_LENGTH) {
260 if (m_reader->currentChar() == ';') {
261 if (!processEntity(entity)) {
262 break;
263 }
264 return true;
265 }
266 entity += m_reader->currentChar();
267 }
268
269 m_reader->returnToMark(startPos.get());
270 return false;
271 }
272
forward()273 bool HtmlTokenizer::forward()
274 {
275 while (true) {
276 if (!m_reader->forward()) {
277 return false;
278 }
279
280 if (processedEntity()) {
281 return true;
282 }
283
284 string tagText;
285 if (!processedTag(tagText)) {
286 setCurrentChar(m_reader->currentChar());
287 return true;
288 }
289
290 if (!isInvisibleTag(tagText)) {
291 setCurrentChar(' ');
292 return true;
293 }
294 }
295 }
296
hasChar()297 bool HtmlTokenizer::hasChar()
298 {
299 return m_reader->hasChar();
300 }
301
atEnd()302 bool HtmlTokenizer::atEnd()
303 {
304 return m_reader->atEnd();
305 }
306
skip(int nchars)307 bool HtmlTokenizer::skip(int nchars)
308 {
309 bool have_char = true;
310 while (have_char && nchars-- > 0) {
311 have_char = forward();
312 }
313 return have_char;
314 }
315