1 ///###////////////////////////////////////////////////////////////////////////
2 //
3 // Burton Computer Corporation
4 // http://www.burton-computer.com
5 // http://www.cooldevtools.com
6 // $Id: HtmlTokenizer.cc 272 2007-01-06 19:37:27Z brian $
7 //
8 // Copyright (C) 2007 Burton Computer Corporation
9 // ALL RIGHTS RESERVED
10 //
11 // This program is open source software; you can redistribute it
12 // and/or modify it under the terms of the Q Public License (QPL)
13 // version 1.0. Use of this software in whole or in part, including
14 // linking it (modified or unmodified) into other programs is
15 // subject to the terms of the QPL.
16 //
17 // This program is distributed in the hope that it will be useful,
18 // but WITHOUT ANY WARRANTY; without even the implied warranty of
19 // MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
20 // Q Public License for more details.
21 //
22 // You should have received a copy of the Q Public License
23 // along with this program; see the file LICENSE.txt.  If not, visit
24 // the Burton Computer Corporation or CoolDevTools web site
25 // QPL pages at:
26 //
27 //    http://www.burton-computer.com/qpl.html
28 //    http://www.cooldevtools.com/qpl.html
29 //
30 
31 #include <cstdlib>
32 #include "AbstractTokenReceiver.h"
33 #include "StringReader.h"
34 #include "RegularExpression.h"
35 #include "TempPtr.h"
36 #include "HtmlTokenizer.h"
37 
38 static const char SEPARATOR = ' ';
39 static const int MAX_ENTITY_LENGTH = 6;
40 static const string URL_REGEX("[^a-z0-9_](href|src)[ \t\r\n]*=[ \t\r\n]*('[^>' \t\r\n]+|\"[^>\" \t\r\n]+|[^> \t\r\n]+)");
41 static const string TAG_BASE_PREFIX("T");
42 static const string TAG_BASE_SUFFIX("_");
43 static const string URL_PREFIX("U_");
44 
HtmlTokenizer(AbstractTokenizer * textTokenizer,AbstractTokenizer * tagTokenizer,int maxTagLength)45 HtmlTokenizer::HtmlTokenizer(AbstractTokenizer *textTokenizer,
46                              AbstractTokenizer *tagTokenizer,
47                              int maxTagLength)
48 : m_textTokenizer(textTokenizer),
49   m_tagTokenizer(tagTokenizer),
50   m_maxTagLength(maxTagLength),
51   m_reader(0),
52   m_receiver(0)
53 {
54 }
55 
~HtmlTokenizer()56 HtmlTokenizer::~HtmlTokenizer()
57 {
58 }
59 
tokenize(AbstractTokenReceiver * receiver,AbstractCharReader * reader,const string & prefix)60 void HtmlTokenizer::tokenize(AbstractTokenReceiver *receiver,
61                              AbstractCharReader *reader,
62                              const string &prefix)
63 {
64   TempPtr<AbstractCharReader> tmp_reader(m_reader, reader);
65   TempPtr<AbstractTokenReceiver> tmp_receiver(m_receiver, receiver);
66   m_prefix = prefix;
67   m_textTokenizer->tokenize(m_receiver, this, m_prefix);
68 }
69 
// Decodes %XX escapes in `url` into `buffer` and returns `buffer`.
//
// A '%' is treated as an escape only when it is followed by two
// hexadecimal digits; otherwise it is copied through unchanged, as is
// every other character.  Any previous contents of `buffer` are discarded.
const string &HtmlTokenizer::decodeUrl(const string &url,
                                       string &buffer)
{
  const int length = url.length();
  const char *src = url.c_str();

  buffer.erase();
  buffer.reserve(length);

  int pos = 0;
  while (pos < length) {
    // An escape needs two more characters after the '%'.
    const bool is_escape = src[pos] == '%'
                           && pos + 2 < length
                           && is_xdigit(src[pos + 1])
                           && is_xdigit(src[pos + 2]);
    if (is_escape) {
      const int high = hex_to_int(src[pos + 1]);
      const int low = hex_to_int(src[pos + 2]);
      buffer += static_cast<char>((high << 4) | low);
      pos += 3;
    } else {
      buffer += src[pos];
      pos += 1;
    }
  }

  if (is_debug) {
    cerr << "ORIG URL '" << url << "' DECODED '" << buffer << "'" << endl;
  }
  return buffer;
}
92 
processTagUrls(const string & tag)93 void HtmlTokenizer::processTagUrls(const string &tag)
94 {
95   static RegularExpression url_regex(URL_REGEX, 3, true);
96 
97   if (isCommentTag(tag)) {
98     return;
99   }
100 
101   int offset = 0;
102   string url, decoded_url;
103   RegularExpression::MatchData match;
104   while (url_regex.match(tag.c_str() + offset)) {
105     url_regex.getMatch(2, url);
106     url_regex.getMatch(2, match);
107     StringReader reader(decodeUrl(url, decoded_url));
108     m_tagTokenizer->tokenize(m_receiver, &reader, URL_PREFIX);
109     offset += match.end_pos;
110     assert(offset <= tag.length());
111   }
112 }
113 
// Builds the token prefix for a tag body into `prefix` and returns it.
//
// Comments (bodies starting with "!--") get the fixed name "CMT".  For any
// other tag, the lower-cased alphanumeric characters of the first word are
// used, stopping at whitespace or once the prefix has reached 8 characters.
// The result is TAG_BASE_PREFIX + name + TAG_BASE_SUFFIX.
//
// `tag_body` must be non-empty and must not start with whitespace.
static const string &make_tag_prefix(const string &tag_body,
                                     string &prefix)
{
  assert(tag_body.length() > 0);
  assert(!is_space(tag_body[0]));

  prefix = TAG_BASE_PREFIX;

  if (starts_with(tag_body, "!--")) {
    prefix += "CMT";
    prefix += TAG_BASE_SUFFIX;
    return prefix;
  }

  const char *cursor = tag_body.c_str();
  while (*cursor && !is_space(*cursor) && prefix.length() < 8) {
    // Non-alphanumerics (such as the '/' of a closing tag) are skipped.
    if (is_alnum(*cursor)) {
      prefix += to_lower(*cursor);
    }
    ++cursor;
  }

  prefix += TAG_BASE_SUFFIX;
  return prefix;
}
133 
processTagBody(const string & tag)134 void HtmlTokenizer::processTagBody(const string &tag)
135 {
136   if (is_debug) {
137     cerr << "PROCESSING TAG BODY: " << tag << endl;
138   }
139   string tag_prefix;
140   StringReader reader(tag);
141   m_tagTokenizer->tokenize(m_receiver, &reader, make_tag_prefix(tag, tag_prefix));
142   // TODO: leaving this in a for a while as a backward compatibility - remove after 3/31/2006
143   m_tagTokenizer->tokenize(m_receiver, &reader, URL_PREFIX);
144   // TODO: end backward compatibility hook
145   if (is_debug) {
146     cerr << "PROCESSED TAG BODY: " << tag << endl;
147   }
148 }
149 
isCommentTag(const string & tag)150 bool HtmlTokenizer::isCommentTag(const string &tag)
151 {
152   return starts_with(tag, "!--") && ends_with(tag, "--");
153 }
154 
isInvisibleTag(const string & tag)155 bool HtmlTokenizer::isInvisibleTag(const string &tag)
156 {
157   return isCommentTag(tag);
158 }
159 
isOpenCommentTag(const string & tag)160 bool HtmlTokenizer::isOpenCommentTag(const string &tag)
161 {
162   return starts_with(tag, "!--");
163 }
164 
isIncompleteCommentTag(const string & tag)165 bool HtmlTokenizer::isIncompleteCommentTag(const string &tag)
166 {
167   return starts_with(tag, "!--") && !ends_with(tag, "--");
168 }
169 
processedTag(string & tag)170 bool HtmlTokenizer::processedTag(string &tag)
171 {
172   if (m_reader->currentChar() != '<') {
173     return false;
174   }
175 
176   tag.erase();
177   Ptr<AbstractCharReaderPosition> pos(m_reader->createMark());
178   while (m_reader->forward() && (static_cast<int>(tag.length()) < m_maxTagLength || isOpenCommentTag(tag))) {
179     if (m_reader->currentChar() == '>' && !isIncompleteCommentTag(tag)) {
180       if (tag.length() > 0) {
181         processTagBody(tag);
182         processTagUrls(tag);
183         return true;
184       } else {
185         break;
186       }
187     }
188 
189     char ch = processedEntity() ? currentChar() : m_reader->currentChar();
190     if (tag.length() > 0 || !is_space(ch)) {
191       tag += ch;
192     }
193   }
194 
195   m_reader->returnToMark(pos.get());
196   return false;
197 }
198 
parseEntityInteger(const string & entity)199 char HtmlTokenizer::parseEntityInteger(const string &entity)
200 {
201   char answer = ' ';
202   if (entity.length() > 1 && (entity[1] == 'x' || entity[1] == 'X')) {
203     answer = (char)hex_to_int(entity.c_str() + 1);
204   } else {
205     answer = (char)atoi(entity.c_str());
206   }
207   return answer;
208 }
209 
processEntity(const string & entity)210 bool HtmlTokenizer::processEntity(const string &entity)
211 {
212   if (entity == "amp") {
213     setCurrentChar('&');
214     return true;
215   }
216 
217   if (entity == "apos") {
218     setCurrentChar('\'');
219     return true;
220   }
221 
222   if (entity == "quot") {
223     setCurrentChar('"');
224     return true;
225   }
226 
227   if (entity == "lt") {
228     setCurrentChar('<');
229     return true;
230   }
231 
232   if (entity == "gt") {
233     setCurrentChar('>');
234     return true;
235   }
236 
237   if (entity == "nbsp") {
238     setCurrentChar(' ');
239     return true;
240   }
241 
242   if (entity[0] == '#') {
243     setCurrentChar(parseEntityInteger(entity));
244     return true;
245   }
246 
247   return false;
248 }
249 
processedEntity()250 bool HtmlTokenizer::processedEntity()
251 {
252   if (m_reader->currentChar() != '&') {
253     return false;
254   }
255 
256   Ptr<AbstractCharReaderPosition> startPos(m_reader->createMark());
257 
258   string entity;
259   while (m_reader->forward() && static_cast<int>(entity.length()) < MAX_ENTITY_LENGTH) {
260     if (m_reader->currentChar() == ';') {
261       if (!processEntity(entity)) {
262         break;
263       }
264       return true;
265     }
266     entity += m_reader->currentChar();
267   }
268 
269   m_reader->returnToMark(startPos.get());
270   return false;
271 }
272 
forward()273 bool HtmlTokenizer::forward()
274 {
275   while (true) {
276     if (!m_reader->forward()) {
277       return false;
278     }
279 
280     if (processedEntity()) {
281       return true;
282     }
283 
284     string tagText;
285     if (!processedTag(tagText)) {
286       setCurrentChar(m_reader->currentChar());
287       return true;
288     }
289 
290     if (!isInvisibleTag(tagText)) {
291       setCurrentChar(' ');
292       return true;
293     }
294   }
295 }
296 
hasChar()297 bool HtmlTokenizer::hasChar()
298 {
299   return m_reader->hasChar();
300 }
301 
atEnd()302 bool HtmlTokenizer::atEnd()
303 {
304   return m_reader->atEnd();
305 }
306 
skip(int nchars)307 bool HtmlTokenizer::skip(int nchars)
308 {
309   bool have_char = true;
310   while (have_char && nchars-- > 0) {
311     have_char = forward();
312   }
313   return have_char;
314 }
315