1 /*
2 * Copyright (C) 2010 Google, Inc. All Rights Reserved.
3 *
4 * Redistribution and use in source and binary forms, with or without
5 * modification, are permitted provided that the following conditions
6 * are met:
7 * 1. Redistributions of source code must retain the above copyright
8 * notice, this list of conditions and the following disclaimer.
9 * 2. Redistributions in binary form must reproduce the above copyright
10 * notice, this list of conditions and the following disclaimer in the
11 * documentation and/or other materials provided with the distribution.
12 *
13 * THIS SOFTWARE IS PROVIDED BY APPLE INC. ``AS IS'' AND ANY
14 * EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
15 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
16 * PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL APPLE INC. OR
17 * CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
18 * EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
19 * PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
20 * PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY
21 * OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
22 * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
23 * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
24 */
25
26 #ifndef HTMLToken_h
27 #define HTMLToken_h
28
29 #include "NamedNodeMap.h"
30 #include <wtf/PassOwnPtr.h>
31 #include <wtf/Vector.h>
32
33 namespace WebCore {
34
35 class HTMLToken {
36 WTF_MAKE_NONCOPYABLE(HTMLToken); WTF_MAKE_FAST_ALLOCATED;
37 public:
38 enum Type {
39 Uninitialized,
40 DOCTYPE,
41 StartTag,
42 EndTag,
43 Comment,
44 Character,
45 EndOfFile,
46 };
47
48 class Range {
49 public:
50 int m_start;
51 int m_end;
52 };
53
54 class Attribute {
55 public:
56 Range m_nameRange;
57 Range m_valueRange;
58 WTF::Vector<UChar, 32> m_name;
59 WTF::Vector<UChar, 32> m_value;
60 };
61
62 typedef WTF::Vector<Attribute, 10> AttributeList;
63 typedef WTF::Vector<UChar, 1024> DataVector;
64
HTMLToken()65 HTMLToken() { clear(); }
66
clear()67 void clear()
68 {
69 m_type = Uninitialized;
70 m_range.m_start = 0;
71 m_range.m_end = 0;
72 m_baseOffset = 0;
73 m_data.clear();
74 }
75
isUninitialized()76 bool isUninitialized() { return m_type == Uninitialized; }
77
startIndex()78 int startIndex() const { return m_range.m_start; }
endIndex()79 int endIndex() const { return m_range.m_end; }
80
setBaseOffset(int offset)81 void setBaseOffset(int offset)
82 {
83 m_baseOffset = offset;
84 }
85
end(int endOffset)86 void end(int endOffset)
87 {
88 m_range.m_end = endOffset - m_baseOffset;
89 }
90
makeEndOfFile()91 void makeEndOfFile()
92 {
93 ASSERT(m_type == Uninitialized);
94 m_type = EndOfFile;
95 }
96
beginStartTag(UChar character)97 void beginStartTag(UChar character)
98 {
99 ASSERT(character);
100 ASSERT(m_type == Uninitialized);
101 m_type = StartTag;
102 m_selfClosing = false;
103 m_currentAttribute = 0;
104 m_attributes.clear();
105
106 m_data.append(character);
107 }
108
109 template<typename T>
beginEndTag(T characters)110 void beginEndTag(T characters)
111 {
112 ASSERT(m_type == Uninitialized);
113 m_type = EndTag;
114 m_selfClosing = false;
115 m_currentAttribute = 0;
116 m_attributes.clear();
117
118 m_data.append(characters);
119 }
120
121 // Starting a character token works slightly differently than starting
122 // other types of tokens because we want to save a per-character branch.
ensureIsCharacterToken()123 void ensureIsCharacterToken()
124 {
125 ASSERT(m_type == Uninitialized || m_type == Character);
126 m_type = Character;
127 }
128
beginComment()129 void beginComment()
130 {
131 ASSERT(m_type == Uninitialized);
132 m_type = Comment;
133 }
134
beginDOCTYPE()135 void beginDOCTYPE()
136 {
137 ASSERT(m_type == Uninitialized);
138 m_type = DOCTYPE;
139 m_doctypeData = adoptPtr(new DoctypeData());
140 }
141
beginDOCTYPE(UChar character)142 void beginDOCTYPE(UChar character)
143 {
144 ASSERT(character);
145 beginDOCTYPE();
146 m_data.append(character);
147 }
148
appendToName(UChar character)149 void appendToName(UChar character)
150 {
151 ASSERT(character);
152 ASSERT(m_type == StartTag || m_type == EndTag || m_type == DOCTYPE);
153 m_data.append(character);
154 }
155
156 template<typename T>
appendToCharacter(T characters)157 void appendToCharacter(T characters)
158 {
159 ASSERT(m_type == Character);
160 m_data.append(characters);
161 }
162
appendToComment(UChar character)163 void appendToComment(UChar character)
164 {
165 ASSERT(character);
166 ASSERT(m_type == Comment);
167 m_data.append(character);
168 }
169
addNewAttribute()170 void addNewAttribute()
171 {
172 ASSERT(m_type == StartTag || m_type == EndTag);
173 m_attributes.grow(m_attributes.size() + 1);
174 m_currentAttribute = &m_attributes.last();
175 #ifndef NDEBUG
176 m_currentAttribute->m_nameRange.m_start = 0;
177 m_currentAttribute->m_nameRange.m_end = 0;
178 m_currentAttribute->m_valueRange.m_start = 0;
179 m_currentAttribute->m_valueRange.m_end = 0;
180 #endif
181 }
182
beginAttributeName(int offset)183 void beginAttributeName(int offset)
184 {
185 m_currentAttribute->m_nameRange.m_start = offset - m_baseOffset;
186 }
187
endAttributeName(int offset)188 void endAttributeName(int offset)
189 {
190 int index = offset - m_baseOffset;
191 m_currentAttribute->m_nameRange.m_end = index;
192 m_currentAttribute->m_valueRange.m_start = index;
193 m_currentAttribute->m_valueRange.m_end = index;
194 }
195
beginAttributeValue(int offset)196 void beginAttributeValue(int offset)
197 {
198 m_currentAttribute->m_valueRange.m_start = offset - m_baseOffset;
199 #ifndef NDEBUG
200 m_currentAttribute->m_valueRange.m_end = 0;
201 #endif
202 }
203
endAttributeValue(int offset)204 void endAttributeValue(int offset)
205 {
206 m_currentAttribute->m_valueRange.m_end = offset - m_baseOffset;
207 }
208
appendToAttributeName(UChar character)209 void appendToAttributeName(UChar character)
210 {
211 ASSERT(character);
212 ASSERT(m_type == StartTag || m_type == EndTag);
213 // FIXME: We should be able to add the following ASSERT once we fix
214 // https://bugs.webkit.org/show_bug.cgi?id=62971
215 // ASSERT(m_currentAttribute->m_nameRange.m_start);
216 m_currentAttribute->m_name.append(character);
217 }
218
appendToAttributeValue(UChar character)219 void appendToAttributeValue(UChar character)
220 {
221 ASSERT(character);
222 ASSERT(m_type == StartTag || m_type == EndTag);
223 ASSERT(m_currentAttribute->m_valueRange.m_start);
224 m_currentAttribute->m_value.append(character);
225 }
226
appendToAttributeValue(size_t i,const String & value)227 void appendToAttributeValue(size_t i, const String& value)
228 {
229 ASSERT(!value.isEmpty());
230 ASSERT(m_type == StartTag || m_type == EndTag);
231 m_attributes[i].m_value.append(value.characters(), value.length());
232 }
233
type()234 Type type() const { return m_type; }
235
selfClosing()236 bool selfClosing() const
237 {
238 ASSERT(m_type == StartTag || m_type == EndTag);
239 return m_selfClosing;
240 }
241
setSelfClosing()242 void setSelfClosing()
243 {
244 ASSERT(m_type == HTMLToken::StartTag || m_type == HTMLToken::EndTag);
245 m_selfClosing = true;
246 }
247
attributes()248 const AttributeList& attributes() const
249 {
250 ASSERT(m_type == StartTag || m_type == EndTag);
251 return m_attributes;
252 }
253
name()254 const DataVector& name() const
255 {
256 ASSERT(m_type == StartTag || m_type == EndTag || m_type == DOCTYPE);
257 return m_data;
258 }
259
eraseCharacters()260 void eraseCharacters()
261 {
262 ASSERT(m_type == Character);
263 m_data.clear();
264 }
265
eraseValueOfAttribute(size_t i)266 void eraseValueOfAttribute(size_t i)
267 {
268 ASSERT(m_type == StartTag || m_type == EndTag);
269 m_attributes[i].m_value.clear();
270 }
271
characters()272 const DataVector& characters() const
273 {
274 ASSERT(m_type == Character);
275 return m_data;
276 }
277
comment()278 const DataVector& comment() const
279 {
280 ASSERT(m_type == Comment);
281 return m_data;
282 }
283
284 // FIXME: Distinguish between a missing public identifer and an empty one.
publicIdentifier()285 const WTF::Vector<UChar>& publicIdentifier() const
286 {
287 ASSERT(m_type == DOCTYPE);
288 return m_doctypeData->m_publicIdentifier;
289 }
290
291 // FIXME: Distinguish between a missing system identifer and an empty one.
systemIdentifier()292 const WTF::Vector<UChar>& systemIdentifier() const
293 {
294 ASSERT(m_type == DOCTYPE);
295 return m_doctypeData->m_systemIdentifier;
296 }
297
setPublicIdentifierToEmptyString()298 void setPublicIdentifierToEmptyString()
299 {
300 ASSERT(m_type == DOCTYPE);
301 m_doctypeData->m_hasPublicIdentifier = true;
302 m_doctypeData->m_publicIdentifier.clear();
303 }
304
setSystemIdentifierToEmptyString()305 void setSystemIdentifierToEmptyString()
306 {
307 ASSERT(m_type == DOCTYPE);
308 m_doctypeData->m_hasSystemIdentifier = true;
309 m_doctypeData->m_systemIdentifier.clear();
310 }
311
forceQuirks()312 bool forceQuirks() const
313 {
314 ASSERT(m_type == DOCTYPE);
315 return m_doctypeData->m_forceQuirks;
316 }
317
setForceQuirks()318 void setForceQuirks()
319 {
320 ASSERT(m_type == DOCTYPE);
321 m_doctypeData->m_forceQuirks = true;
322 }
323
appendToPublicIdentifier(UChar character)324 void appendToPublicIdentifier(UChar character)
325 {
326 ASSERT(character);
327 ASSERT(m_type == DOCTYPE);
328 ASSERT(m_doctypeData->m_hasPublicIdentifier);
329 m_doctypeData->m_publicIdentifier.append(character);
330 }
331
appendToSystemIdentifier(UChar character)332 void appendToSystemIdentifier(UChar character)
333 {
334 ASSERT(character);
335 ASSERT(m_type == DOCTYPE);
336 ASSERT(m_doctypeData->m_hasSystemIdentifier);
337 m_doctypeData->m_systemIdentifier.append(character);
338 }
339
340 private:
341 // FIXME: I'm not sure what the final relationship between HTMLToken and
342 // AtomicHTMLToken will be. I'm marking this a friend for now, but we'll
343 // want to end up with a cleaner interface between the two classes.
344 friend class AtomicHTMLToken;
345
346 class DoctypeData {
347 WTF_MAKE_NONCOPYABLE(DoctypeData);
348 public:
DoctypeData()349 DoctypeData()
350 : m_hasPublicIdentifier(false)
351 , m_hasSystemIdentifier(false)
352 , m_forceQuirks(false)
353 {
354 }
355
356 bool m_hasPublicIdentifier;
357 bool m_hasSystemIdentifier;
358 bool m_forceQuirks;
359 WTF::Vector<UChar> m_publicIdentifier;
360 WTF::Vector<UChar> m_systemIdentifier;
361 };
362
363 Type m_type;
364 Range m_range; // Always starts at zero.
365 int m_baseOffset;
366
367 // "name" for DOCTYPE, StartTag, and EndTag
368 // "characters" for Character
369 // "data" for Comment
370 DataVector m_data;
371
372 // For DOCTYPE
373 OwnPtr<DoctypeData> m_doctypeData;
374
375 // For StartTag and EndTag
376 bool m_selfClosing;
377 AttributeList m_attributes;
378
379 // A pointer into m_attributes used during lexing.
380 Attribute* m_currentAttribute;
381 };
382
// FIXME: This class should eventually be named HTMLToken once we move the
// existing HTMLToken to be internal to the HTMLTokenizer.
385 class AtomicHTMLToken {
386 WTF_MAKE_NONCOPYABLE(AtomicHTMLToken);
387 public:
AtomicHTMLToken(HTMLToken & token)388 AtomicHTMLToken(HTMLToken& token)
389 : m_type(token.type())
390 {
391 switch (m_type) {
392 case HTMLToken::Uninitialized:
393 ASSERT_NOT_REACHED();
394 break;
395 case HTMLToken::DOCTYPE:
396 m_name = AtomicString(token.name().data(), token.name().size());
397 m_doctypeData = token.m_doctypeData.release();
398 break;
399 case HTMLToken::EndOfFile:
400 break;
401 case HTMLToken::StartTag:
402 case HTMLToken::EndTag: {
403 m_selfClosing = token.selfClosing();
404 m_name = AtomicString(token.name().data(), token.name().size());
405 initializeAttributes(token.attributes());
406 break;
407 }
408 case HTMLToken::Comment:
409 m_data = String(token.comment().data(), token.comment().size());
410 break;
411 case HTMLToken::Character:
412 m_externalCharacters = &token.characters();
413 break;
414 }
415 }
416
417 AtomicHTMLToken(HTMLToken::Type type, AtomicString name, PassRefPtr<NamedNodeMap> attributes = 0)
m_type(type)418 : m_type(type)
419 , m_name(name)
420 , m_attributes(attributes)
421 {
422 ASSERT(usesName());
423 }
424
type()425 HTMLToken::Type type() const { return m_type; }
426
name()427 const AtomicString& name() const
428 {
429 ASSERT(usesName());
430 return m_name;
431 }
432
setName(const AtomicString & name)433 void setName(const AtomicString& name)
434 {
435 ASSERT(usesName());
436 m_name = name;
437 }
438
selfClosing()439 bool selfClosing() const
440 {
441 ASSERT(m_type == HTMLToken::StartTag || m_type == HTMLToken::EndTag);
442 return m_selfClosing;
443 }
444
getAttributeItem(const QualifiedName & attributeName)445 Attribute* getAttributeItem(const QualifiedName& attributeName)
446 {
447 ASSERT(usesAttributes());
448 if (!m_attributes)
449 return 0;
450 return m_attributes->getAttributeItem(attributeName);
451 }
452
attributes()453 NamedNodeMap* attributes() const
454 {
455 ASSERT(usesAttributes());
456 return m_attributes.get();
457 }
458
takeAtributes()459 PassRefPtr<NamedNodeMap> takeAtributes()
460 {
461 ASSERT(usesAttributes());
462 return m_attributes.release();
463 }
464
characters()465 const HTMLToken::DataVector& characters() const
466 {
467 ASSERT(m_type == HTMLToken::Character);
468 return *m_externalCharacters;
469 }
470
comment()471 const String& comment() const
472 {
473 ASSERT(m_type == HTMLToken::Comment);
474 return m_data;
475 }
476
477 // FIXME: Distinguish between a missing public identifer and an empty one.
publicIdentifier()478 WTF::Vector<UChar>& publicIdentifier() const
479 {
480 ASSERT(m_type == HTMLToken::DOCTYPE);
481 return m_doctypeData->m_publicIdentifier;
482 }
483
484 // FIXME: Distinguish between a missing system identifer and an empty one.
systemIdentifier()485 WTF::Vector<UChar>& systemIdentifier() const
486 {
487 ASSERT(m_type == HTMLToken::DOCTYPE);
488 return m_doctypeData->m_systemIdentifier;
489 }
490
forceQuirks()491 bool forceQuirks() const
492 {
493 ASSERT(m_type == HTMLToken::DOCTYPE);
494 return m_doctypeData->m_forceQuirks;
495 }
496
497 private:
498 HTMLToken::Type m_type;
499
500 void initializeAttributes(const HTMLToken::AttributeList& attributes);
501
usesName()502 bool usesName() const
503 {
504 return m_type == HTMLToken::StartTag || m_type == HTMLToken::EndTag || m_type == HTMLToken::DOCTYPE;
505 }
506
usesAttributes()507 bool usesAttributes() const
508 {
509 return m_type == HTMLToken::StartTag || m_type == HTMLToken::EndTag;
510 }
511
512 // "name" for DOCTYPE, StartTag, and EndTag
513 AtomicString m_name;
514
515 // "data" for Comment
516 String m_data;
517
518 // "characters" for Character
519 //
520 // We don't want to copy the the characters out of the HTMLToken, so we
521 // keep a pointer to its buffer instead. This buffer is owned by the
522 // HTMLToken and causes a lifetime dependence between these objects.
523 //
524 // FIXME: Add a mechanism for "internalizing" the characters when the
525 // HTMLToken is destructed.
526 const HTMLToken::DataVector* m_externalCharacters;
527
528 // For DOCTYPE
529 OwnPtr<HTMLToken::DoctypeData> m_doctypeData;
530
531 // For StartTag and EndTag
532 bool m_selfClosing;
533
534 RefPtr<NamedNodeMap> m_attributes;
535 };
536
initializeAttributes(const HTMLToken::AttributeList & attributes)537 inline void AtomicHTMLToken::initializeAttributes(const HTMLToken::AttributeList& attributes)
538 {
539 size_t size = attributes.size();
540 if (!size)
541 return;
542
543 m_attributes = NamedNodeMap::create();
544 m_attributes->reserveInitialCapacity(size);
545 for (size_t i = 0; i < size; ++i) {
546 const HTMLToken::Attribute& attribute = attributes[i];
547 if (attribute.m_name.isEmpty())
548 continue;
549
550 // FIXME: We should be able to add the following ASSERT once we fix
551 // https://bugs.webkit.org/show_bug.cgi?id=62971
552 // ASSERT(attribute.m_nameRange.m_start);
553 ASSERT(attribute.m_nameRange.m_end);
554 ASSERT(attribute.m_valueRange.m_start);
555 ASSERT(attribute.m_valueRange.m_end);
556
557 String name(attribute.m_name.data(), attribute.m_name.size());
558 String value(attribute.m_value.data(), attribute.m_value.size());
559 m_attributes->insertAttribute(Attribute::createMapped(name, value), false);
560 }
561 }
562
563 }
564
565 #endif
566