1 /*
2  * Copyright (C) 2010 Google, Inc. All Rights Reserved.
3  *
4  * Redistribution and use in source and binary forms, with or without
5  * modification, are permitted provided that the following conditions
6  * are met:
7  * 1. Redistributions of source code must retain the above copyright
8  *    notice, this list of conditions and the following disclaimer.
9  * 2. Redistributions in binary form must reproduce the above copyright
10  *    notice, this list of conditions and the following disclaimer in the
11  *    documentation and/or other materials provided with the distribution.
12  *
13  * THIS SOFTWARE IS PROVIDED BY APPLE INC. ``AS IS'' AND ANY
14  * EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
15  * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
16  * PURPOSE ARE DISCLAIMED.  IN NO EVENT SHALL APPLE INC. OR
17  * CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
18  * EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
19  * PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
20  * PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY
21  * OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
22  * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
23  * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
24  */
25 
26 #ifndef HTMLToken_h
27 #define HTMLToken_h
28 
29 #include "NamedNodeMap.h"
30 #include <wtf/PassOwnPtr.h>
31 #include <wtf/Vector.h>
32 
33 namespace WebCore {
34 
35 class HTMLToken {
36     WTF_MAKE_NONCOPYABLE(HTMLToken); WTF_MAKE_FAST_ALLOCATED;
37 public:
38     enum Type {
39         Uninitialized,
40         DOCTYPE,
41         StartTag,
42         EndTag,
43         Comment,
44         Character,
45         EndOfFile,
46     };
47 
48     class Range {
49     public:
50         int m_start;
51         int m_end;
52     };
53 
54     class Attribute {
55     public:
56         Range m_nameRange;
57         Range m_valueRange;
58         WTF::Vector<UChar, 32> m_name;
59         WTF::Vector<UChar, 32> m_value;
60     };
61 
62     typedef WTF::Vector<Attribute, 10> AttributeList;
63     typedef WTF::Vector<UChar, 1024> DataVector;
64 
HTMLToken()65     HTMLToken() { clear(); }
66 
clear()67     void clear()
68     {
69         m_type = Uninitialized;
70         m_range.m_start = 0;
71         m_range.m_end = 0;
72         m_baseOffset = 0;
73         m_data.clear();
74     }
75 
isUninitialized()76     bool isUninitialized() { return m_type == Uninitialized; }
77 
startIndex()78     int startIndex() const { return m_range.m_start; }
endIndex()79     int endIndex() const { return m_range.m_end; }
80 
setBaseOffset(int offset)81     void setBaseOffset(int offset)
82     {
83         m_baseOffset = offset;
84     }
85 
end(int endOffset)86     void end(int endOffset)
87     {
88         m_range.m_end = endOffset - m_baseOffset;
89     }
90 
makeEndOfFile()91     void makeEndOfFile()
92     {
93         ASSERT(m_type == Uninitialized);
94         m_type = EndOfFile;
95     }
96 
beginStartTag(UChar character)97     void beginStartTag(UChar character)
98     {
99         ASSERT(character);
100         ASSERT(m_type == Uninitialized);
101         m_type = StartTag;
102         m_selfClosing = false;
103         m_currentAttribute = 0;
104         m_attributes.clear();
105 
106         m_data.append(character);
107     }
108 
109     template<typename T>
beginEndTag(T characters)110     void beginEndTag(T characters)
111     {
112         ASSERT(m_type == Uninitialized);
113         m_type = EndTag;
114         m_selfClosing = false;
115         m_currentAttribute = 0;
116         m_attributes.clear();
117 
118         m_data.append(characters);
119     }
120 
121     // Starting a character token works slightly differently than starting
122     // other types of tokens because we want to save a per-character branch.
ensureIsCharacterToken()123     void ensureIsCharacterToken()
124     {
125         ASSERT(m_type == Uninitialized || m_type == Character);
126         m_type = Character;
127     }
128 
beginComment()129     void beginComment()
130     {
131         ASSERT(m_type == Uninitialized);
132         m_type = Comment;
133     }
134 
beginDOCTYPE()135     void beginDOCTYPE()
136     {
137         ASSERT(m_type == Uninitialized);
138         m_type = DOCTYPE;
139         m_doctypeData = adoptPtr(new DoctypeData());
140     }
141 
beginDOCTYPE(UChar character)142     void beginDOCTYPE(UChar character)
143     {
144         ASSERT(character);
145         beginDOCTYPE();
146         m_data.append(character);
147     }
148 
appendToName(UChar character)149     void appendToName(UChar character)
150     {
151         ASSERT(character);
152         ASSERT(m_type == StartTag || m_type == EndTag || m_type == DOCTYPE);
153         m_data.append(character);
154     }
155 
156     template<typename T>
appendToCharacter(T characters)157     void appendToCharacter(T characters)
158     {
159         ASSERT(m_type == Character);
160         m_data.append(characters);
161     }
162 
appendToComment(UChar character)163     void appendToComment(UChar character)
164     {
165         ASSERT(character);
166         ASSERT(m_type == Comment);
167         m_data.append(character);
168     }
169 
addNewAttribute()170     void addNewAttribute()
171     {
172         ASSERT(m_type == StartTag || m_type == EndTag);
173         m_attributes.grow(m_attributes.size() + 1);
174         m_currentAttribute = &m_attributes.last();
175 #ifndef NDEBUG
176         m_currentAttribute->m_nameRange.m_start = 0;
177         m_currentAttribute->m_nameRange.m_end = 0;
178         m_currentAttribute->m_valueRange.m_start = 0;
179         m_currentAttribute->m_valueRange.m_end = 0;
180 #endif
181     }
182 
beginAttributeName(int offset)183     void beginAttributeName(int offset)
184     {
185         m_currentAttribute->m_nameRange.m_start = offset - m_baseOffset;
186     }
187 
endAttributeName(int offset)188     void endAttributeName(int offset)
189     {
190         int index = offset - m_baseOffset;
191         m_currentAttribute->m_nameRange.m_end = index;
192         m_currentAttribute->m_valueRange.m_start = index;
193         m_currentAttribute->m_valueRange.m_end = index;
194     }
195 
beginAttributeValue(int offset)196     void beginAttributeValue(int offset)
197     {
198         m_currentAttribute->m_valueRange.m_start = offset - m_baseOffset;
199 #ifndef NDEBUG
200         m_currentAttribute->m_valueRange.m_end = 0;
201 #endif
202     }
203 
endAttributeValue(int offset)204     void endAttributeValue(int offset)
205     {
206         m_currentAttribute->m_valueRange.m_end = offset - m_baseOffset;
207     }
208 
appendToAttributeName(UChar character)209     void appendToAttributeName(UChar character)
210     {
211         ASSERT(character);
212         ASSERT(m_type == StartTag || m_type == EndTag);
213         // FIXME: We should be able to add the following ASSERT once we fix
214         // https://bugs.webkit.org/show_bug.cgi?id=62971
215         //   ASSERT(m_currentAttribute->m_nameRange.m_start);
216         m_currentAttribute->m_name.append(character);
217     }
218 
appendToAttributeValue(UChar character)219     void appendToAttributeValue(UChar character)
220     {
221         ASSERT(character);
222         ASSERT(m_type == StartTag || m_type == EndTag);
223         ASSERT(m_currentAttribute->m_valueRange.m_start);
224         m_currentAttribute->m_value.append(character);
225     }
226 
appendToAttributeValue(size_t i,const String & value)227     void appendToAttributeValue(size_t i, const String& value)
228     {
229         ASSERT(!value.isEmpty());
230         ASSERT(m_type == StartTag || m_type == EndTag);
231         m_attributes[i].m_value.append(value.characters(), value.length());
232     }
233 
type()234     Type type() const { return m_type; }
235 
selfClosing()236     bool selfClosing() const
237     {
238         ASSERT(m_type == StartTag || m_type == EndTag);
239         return m_selfClosing;
240     }
241 
setSelfClosing()242     void setSelfClosing()
243     {
244         ASSERT(m_type == HTMLToken::StartTag || m_type == HTMLToken::EndTag);
245         m_selfClosing = true;
246     }
247 
attributes()248     const AttributeList& attributes() const
249     {
250         ASSERT(m_type == StartTag || m_type == EndTag);
251         return m_attributes;
252     }
253 
name()254     const DataVector& name() const
255     {
256         ASSERT(m_type == StartTag || m_type == EndTag || m_type == DOCTYPE);
257         return m_data;
258     }
259 
eraseCharacters()260     void eraseCharacters()
261     {
262         ASSERT(m_type == Character);
263         m_data.clear();
264     }
265 
eraseValueOfAttribute(size_t i)266     void eraseValueOfAttribute(size_t i)
267     {
268         ASSERT(m_type == StartTag || m_type == EndTag);
269         m_attributes[i].m_value.clear();
270     }
271 
characters()272     const DataVector& characters() const
273     {
274         ASSERT(m_type == Character);
275         return m_data;
276     }
277 
comment()278     const DataVector& comment() const
279     {
280         ASSERT(m_type == Comment);
281         return m_data;
282     }
283 
284     // FIXME: Distinguish between a missing public identifer and an empty one.
publicIdentifier()285     const WTF::Vector<UChar>& publicIdentifier() const
286     {
287         ASSERT(m_type == DOCTYPE);
288         return m_doctypeData->m_publicIdentifier;
289     }
290 
291     // FIXME: Distinguish between a missing system identifer and an empty one.
systemIdentifier()292     const WTF::Vector<UChar>& systemIdentifier() const
293     {
294         ASSERT(m_type == DOCTYPE);
295         return m_doctypeData->m_systemIdentifier;
296     }
297 
setPublicIdentifierToEmptyString()298     void setPublicIdentifierToEmptyString()
299     {
300         ASSERT(m_type == DOCTYPE);
301         m_doctypeData->m_hasPublicIdentifier = true;
302         m_doctypeData->m_publicIdentifier.clear();
303     }
304 
setSystemIdentifierToEmptyString()305     void setSystemIdentifierToEmptyString()
306     {
307         ASSERT(m_type == DOCTYPE);
308         m_doctypeData->m_hasSystemIdentifier = true;
309         m_doctypeData->m_systemIdentifier.clear();
310     }
311 
forceQuirks()312     bool forceQuirks() const
313     {
314         ASSERT(m_type == DOCTYPE);
315         return m_doctypeData->m_forceQuirks;
316     }
317 
setForceQuirks()318     void setForceQuirks()
319     {
320         ASSERT(m_type == DOCTYPE);
321         m_doctypeData->m_forceQuirks = true;
322     }
323 
appendToPublicIdentifier(UChar character)324     void appendToPublicIdentifier(UChar character)
325     {
326         ASSERT(character);
327         ASSERT(m_type == DOCTYPE);
328         ASSERT(m_doctypeData->m_hasPublicIdentifier);
329         m_doctypeData->m_publicIdentifier.append(character);
330     }
331 
appendToSystemIdentifier(UChar character)332     void appendToSystemIdentifier(UChar character)
333     {
334         ASSERT(character);
335         ASSERT(m_type == DOCTYPE);
336         ASSERT(m_doctypeData->m_hasSystemIdentifier);
337         m_doctypeData->m_systemIdentifier.append(character);
338     }
339 
340 private:
341     // FIXME: I'm not sure what the final relationship between HTMLToken and
342     // AtomicHTMLToken will be.  I'm marking this a friend for now, but we'll
343     // want to end up with a cleaner interface between the two classes.
344     friend class AtomicHTMLToken;
345 
346     class DoctypeData {
347         WTF_MAKE_NONCOPYABLE(DoctypeData);
348     public:
DoctypeData()349         DoctypeData()
350             : m_hasPublicIdentifier(false)
351             , m_hasSystemIdentifier(false)
352             , m_forceQuirks(false)
353         {
354         }
355 
356         bool m_hasPublicIdentifier;
357         bool m_hasSystemIdentifier;
358         bool m_forceQuirks;
359         WTF::Vector<UChar> m_publicIdentifier;
360         WTF::Vector<UChar> m_systemIdentifier;
361     };
362 
363     Type m_type;
364     Range m_range; // Always starts at zero.
365     int m_baseOffset;
366 
367     // "name" for DOCTYPE, StartTag, and EndTag
368     // "characters" for Character
369     // "data" for Comment
370     DataVector m_data;
371 
372     // For DOCTYPE
373     OwnPtr<DoctypeData> m_doctypeData;
374 
375     // For StartTag and EndTag
376     bool m_selfClosing;
377     AttributeList m_attributes;
378 
379     // A pointer into m_attributes used during lexing.
380     Attribute* m_currentAttribute;
381 };
382 
383 // FIXME: This class should eventually be named HTMLToken once we move the
384 // exiting HTMLToken to be internal to the HTMLTokenizer.
385 class AtomicHTMLToken {
386     WTF_MAKE_NONCOPYABLE(AtomicHTMLToken);
387 public:
AtomicHTMLToken(HTMLToken & token)388     AtomicHTMLToken(HTMLToken& token)
389         : m_type(token.type())
390     {
391         switch (m_type) {
392         case HTMLToken::Uninitialized:
393             ASSERT_NOT_REACHED();
394             break;
395         case HTMLToken::DOCTYPE:
396             m_name = AtomicString(token.name().data(), token.name().size());
397             m_doctypeData = token.m_doctypeData.release();
398             break;
399         case HTMLToken::EndOfFile:
400             break;
401         case HTMLToken::StartTag:
402         case HTMLToken::EndTag: {
403             m_selfClosing = token.selfClosing();
404             m_name = AtomicString(token.name().data(), token.name().size());
405             initializeAttributes(token.attributes());
406             break;
407         }
408         case HTMLToken::Comment:
409             m_data = String(token.comment().data(), token.comment().size());
410             break;
411         case HTMLToken::Character:
412             m_externalCharacters = &token.characters();
413             break;
414         }
415     }
416 
417     AtomicHTMLToken(HTMLToken::Type type, AtomicString name, PassRefPtr<NamedNodeMap> attributes = 0)
m_type(type)418         : m_type(type)
419         , m_name(name)
420         , m_attributes(attributes)
421     {
422         ASSERT(usesName());
423     }
424 
type()425     HTMLToken::Type type() const { return m_type; }
426 
name()427     const AtomicString& name() const
428     {
429         ASSERT(usesName());
430         return m_name;
431     }
432 
setName(const AtomicString & name)433     void setName(const AtomicString& name)
434     {
435         ASSERT(usesName());
436         m_name = name;
437     }
438 
selfClosing()439     bool selfClosing() const
440     {
441         ASSERT(m_type == HTMLToken::StartTag || m_type == HTMLToken::EndTag);
442         return m_selfClosing;
443     }
444 
getAttributeItem(const QualifiedName & attributeName)445     Attribute* getAttributeItem(const QualifiedName& attributeName)
446     {
447         ASSERT(usesAttributes());
448         if (!m_attributes)
449             return 0;
450         return m_attributes->getAttributeItem(attributeName);
451     }
452 
attributes()453     NamedNodeMap* attributes() const
454     {
455         ASSERT(usesAttributes());
456         return m_attributes.get();
457     }
458 
takeAtributes()459     PassRefPtr<NamedNodeMap> takeAtributes()
460     {
461         ASSERT(usesAttributes());
462         return m_attributes.release();
463     }
464 
characters()465     const HTMLToken::DataVector& characters() const
466     {
467         ASSERT(m_type == HTMLToken::Character);
468         return *m_externalCharacters;
469     }
470 
comment()471     const String& comment() const
472     {
473         ASSERT(m_type == HTMLToken::Comment);
474         return m_data;
475     }
476 
477     // FIXME: Distinguish between a missing public identifer and an empty one.
publicIdentifier()478     WTF::Vector<UChar>& publicIdentifier() const
479     {
480         ASSERT(m_type == HTMLToken::DOCTYPE);
481         return m_doctypeData->m_publicIdentifier;
482     }
483 
484     // FIXME: Distinguish between a missing system identifer and an empty one.
systemIdentifier()485     WTF::Vector<UChar>& systemIdentifier() const
486     {
487         ASSERT(m_type == HTMLToken::DOCTYPE);
488         return m_doctypeData->m_systemIdentifier;
489     }
490 
forceQuirks()491     bool forceQuirks() const
492     {
493         ASSERT(m_type == HTMLToken::DOCTYPE);
494         return m_doctypeData->m_forceQuirks;
495     }
496 
497 private:
498     HTMLToken::Type m_type;
499 
500     void initializeAttributes(const HTMLToken::AttributeList& attributes);
501 
usesName()502     bool usesName() const
503     {
504         return m_type == HTMLToken::StartTag || m_type == HTMLToken::EndTag || m_type == HTMLToken::DOCTYPE;
505     }
506 
usesAttributes()507     bool usesAttributes() const
508     {
509         return m_type == HTMLToken::StartTag || m_type == HTMLToken::EndTag;
510     }
511 
512     // "name" for DOCTYPE, StartTag, and EndTag
513     AtomicString m_name;
514 
515     // "data" for Comment
516     String m_data;
517 
518     // "characters" for Character
519     //
520     // We don't want to copy the the characters out of the HTMLToken, so we
521     // keep a pointer to its buffer instead.  This buffer is owned by the
522     // HTMLToken and causes a lifetime dependence between these objects.
523     //
524     // FIXME: Add a mechanism for "internalizing" the characters when the
525     //        HTMLToken is destructed.
526     const HTMLToken::DataVector* m_externalCharacters;
527 
528     // For DOCTYPE
529     OwnPtr<HTMLToken::DoctypeData> m_doctypeData;
530 
531     // For StartTag and EndTag
532     bool m_selfClosing;
533 
534     RefPtr<NamedNodeMap> m_attributes;
535 };
536 
initializeAttributes(const HTMLToken::AttributeList & attributes)537 inline void AtomicHTMLToken::initializeAttributes(const HTMLToken::AttributeList& attributes)
538 {
539     size_t size = attributes.size();
540     if (!size)
541         return;
542 
543     m_attributes = NamedNodeMap::create();
544     m_attributes->reserveInitialCapacity(size);
545     for (size_t i = 0; i < size; ++i) {
546         const HTMLToken::Attribute& attribute = attributes[i];
547         if (attribute.m_name.isEmpty())
548             continue;
549 
550         // FIXME: We should be able to add the following ASSERT once we fix
551         // https://bugs.webkit.org/show_bug.cgi?id=62971
552         //   ASSERT(attribute.m_nameRange.m_start);
553         ASSERT(attribute.m_nameRange.m_end);
554         ASSERT(attribute.m_valueRange.m_start);
555         ASSERT(attribute.m_valueRange.m_end);
556 
557         String name(attribute.m_name.data(), attribute.m_name.size());
558         String value(attribute.m_value.data(), attribute.m_value.size());
559         m_attributes->insertAttribute(Attribute::createMapped(name, value), false);
560     }
561 }
562 
563 }
564 
565 #endif
566