1/*
2 * Copyright (C) 2009 Google Inc. All rights reserved.
3 *
4 * Redistribution and use in source and binary forms, with or without
5 * modification, are permitted provided that the following conditions are
6 * met:
7 *
8 *     * Redistributions of source code must retain the above copyright
9 * notice, this list of conditions and the following disclaimer.
10 *     * Redistributions in binary form must reproduce the above
11 * copyright notice, this list of conditions and the following disclaimer
12 * in the documentation and/or other materials provided with the
13 * distribution.
14 *     * Neither the name of Google Inc. nor the names of its
15 * contributors may be used to endorse or promote products derived from
16 * this software without specific prior written permission.
17 *
18 * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
19 * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
20 * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
21 * A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
22 * OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
23 * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
24 * LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
25 * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
26 * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
27 * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
28 * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
29 */
30
31// Generate js file as follows:
32//
33// re2c -isc WebCore/inspector/front-end/SourceHTMLTokenizer.re2js \
34// | sed 's|^yy\([^:]*\)*\:|case \1:|' \
35// | sed 's|[*]cursor[+][+]|this._charAt(cursor++)|' \
36// | sed 's|[[*][+][+]cursor|this._charAt(++cursor)|' \
37// | sed 's|[*]cursor|this._charAt(cursor)|' \
38// | sed 's|yych = \*\([^;]*\)|yych = this._charAt\1|' \
39// | sed 's|goto case \([^;]*\)|{ gotoCase = \1; continue; }|' \
40// | sed 's|unsigned\ int|var|' \
41// | sed 's|var\ yych|case 1: var yych|'
42
// Constructor of the HTML source tokenizer: runs the SourceTokenizer constructor on
// this object and then sets up the lexer and parser state tables.
WebInspector.SourceHTMLTokenizer = function()
{
    WebInspector.SourceTokenizer.call(this);

    // Lexer states, listed in the order required by the generated code; the index of
    // a name in this list is its condition value.
    var lexStateNames = ["INITIAL", "COMMENT", "DOCTYPE", "TAG", "DSTRING", "SSTRING"];

    var lexConditions = {};
    for (var i = 0; i < lexStateNames.length; ++i)
        lexConditions[lexStateNames[i]] = i;
    this._lexConditions = lexConditions;

    // One case_<STATE> property per lexer state, numbered from 1000 in the same order.
    for (var j = 0; j < lexStateNames.length; ++j)
        this["case_" + lexStateNames[j]] = 1000 + j;

    // Parser flags; each non-INITIAL flag occupies its own bit so flags can be combined.
    this._parseConditions = {
        INITIAL: 0,
        ATTRIBUTE: 1 << 0,
        ATTRIBUTE_VALUE: 1 << 1,
        LINKIFY: 1 << 2,
        A_NODE: 1 << 3,
        SCRIPT: 1 << 4,
        STYLE: 1 << 5
    };

    // Tokenizing starts with both the lexer and the parser in their INITIAL state.
    this.initialCondition = {
        lexCondition: this._lexConditions.INITIAL,
        parseCondition: this._parseConditions.INITIAL
    };
    this.condition = this.initialCondition;
}
76
77WebInspector.SourceHTMLTokenizer.prototype = {
78    set line(line) {
79        if (this._internalJavaScriptTokenizer) {
80            var match = /<\/script/i.exec(line);
81            if (match) {
82                this._internalJavaScriptTokenizer.line = line.substring(0, match.index);
83            } else
84                this._internalJavaScriptTokenizer.line = line;
85        } else if (this._internalCSSTokenizer) {
86            var match = /<\/style/i.exec(line);
87            if (match) {
88                this._internalCSSTokenizer.line = line.substring(0, match.index);
89            } else
90                this._internalCSSTokenizer.line = line;
91        }
92        this._line = line;
93    },
94
95    _isExpectingAttribute: function()
96    {
97        return this._condition.parseCondition & this._parseConditions.ATTRIBUTE;
98    },
99
100    _isExpectingAttributeValue: function()
101    {
102        return this._condition.parseCondition & this._parseConditions.ATTRIBUTE_VALUE;
103    },
104
105    _setExpectingAttribute: function()
106    {
107        if (this._isExpectingAttributeValue())
108            this._condition.parseCondition ^= this._parseConditions.ATTRIBUTE_VALUE;
109        this._condition.parseCondition |= this._parseConditions.ATTRIBUTE;
110    },
111
112    _setExpectingAttributeValue: function()
113    {
114        if (this._isExpectingAttribute())
115            this._condition.parseCondition ^= this._parseConditions.ATTRIBUTE;
116        this._condition.parseCondition |= this._parseConditions.ATTRIBUTE_VALUE;
117    },
118
119    _stringToken: function(cursor, stringEnds)
120    {
121        if (!this._isExpectingAttributeValue()) {
122            this.tokenType = null;
123            return cursor;
124        }
125        this.tokenType = this._attrValueTokenType();
126        if (stringEnds)
127            this._setExpectingAttribute();
128        return cursor;
129    },
130
131    _attrValueTokenType: function()
132    {
133        if (this._condition.parseCondition & this._parseConditions.LINKIFY) {
134            if (this._condition.parseCondition & this._parseConditions.A_NODE)
135                return "html-external-link";
136            return "html-resource-link";
137        }
138        return "html-attribute-value";
139    },
140
141    scriptStarted: function(cursor)
142    {
143        if (!this._internalJavaScriptTokenizer) {
144            this._internalJavaScriptTokenizer = WebInspector.SourceTokenizer.Registry.getInstance().getTokenizer("text/javascript");
145            this._condition.internalJavaScriptTokenizerCondition = this._internalJavaScriptTokenizer.initialCondition;
146        }
147    },
148
    // Hook invoked by the lexer when a closing "</script" has been recognized; it is
    // passed the cursor position minus 8, the length of "</script".
    // Intentionally empty in this tokenizer.
    scriptEnded: function(cursor)
    {
    },
152
153    styleSheetStarted: function(cursor)
154    {
155        if (!this._internalCSSTokenizer) {
156            this._internalCSSTokenizer = WebInspector.SourceTokenizer.Registry.getInstance().getTokenizer("text/css");
157            this._condition.internalCSSTokenizerCondition = this._internalCSSTokenizer.initialCondition;
158        }
159    },
160
    // Counterpart of styleSheetStarted for the end of an embedded style sheet.
    // Intentionally empty in this tokenizer; the cursor argument is not used.
    styleSheetEnded: function(cursor)
    {
    },
164
165    nextToken: function(cursor)
166    {
167        if (this._internalJavaScriptTokenizer) {
168            // Re-set line to force </script> detection first.
169            this.line = this._line;
170            if (cursor !== this._internalJavaScriptTokenizer._line.length) {
171                // Tokenizer is stateless, so restore its condition before tokenizing and save it after.
172                this._internalJavaScriptTokenizer.condition = this._condition.internalJavaScriptTokenizerCondition;
173                var result = this._internalJavaScriptTokenizer.nextToken(cursor);
174                this.tokenType = this._internalJavaScriptTokenizer.tokenType;
175                this._condition.internalJavaScriptTokenizerCondition = this._internalJavaScriptTokenizer.condition;
176                return result;
177            } else if (cursor !== this._line.length)
178                delete this._internalJavaScriptTokenizer;
179        } else if (this._internalCSSTokenizer) {
180            // Re-set line to force </style> detection first.
181            this.line = this._line;
182            if (cursor !== this._internalCSSTokenizer._line.length) {
183                // Tokenizer is stateless, so restore its condition before tokenizing and save it after.
184                this._internalCSSTokenizer.condition = this._condition.internalCSSTokenizerCondition;
185                var result = this._internalCSSTokenizer.nextToken(cursor);
186                this.tokenType = this._internalCSSTokenizer.tokenType;
187                this._condition.internalCSSTokenizerCondition = this._internalCSSTokenizer.condition;
188                return result;
189            } else if (cursor !== this._line.length)
190                delete this._internalCSSTokenizer;
191        }
192
193        var cursorOnEnter = cursor;
194        var gotoCase = 1;
195        while (1) {
196            switch (gotoCase)
197            // Following comment is replaced with generated state machine.
198            /*!re2c
199                re2c:define:YYCTYPE  = "var";
200                re2c:define:YYCURSOR = cursor;
201                re2c:define:YYGETCONDITION = "this.getLexCondition";
202                re2c:define:YYSETCONDITION = "this.setLexCondition";
203                re2c:condprefix = "case this.case_";
204                re2c:condenumprefix = "this._lexConditions.";
205                re2c:yyfill:enable = 0;
206                re2c:labelprefix = "case ";
207                re2c:indent:top = 2;
208                re2c:indent:string = "    ";
209
210                CommentContent = ([^-\r\n] | ("--" [^>]))*;
211                Comment = "<!--" CommentContent "-->";
212                CommentStart = "<!--" CommentContent [\r\n];
213                CommentEnd = CommentContent "-->";
214
215                DocTypeStart = "<!" [Dd] [Oo] [Cc] [Tt] [Yy] [Pp] [Ee];
216                DocTypeContent = [^\r\n>]*;
217
218                ScriptStart = "<" [Ss] [Cc] [Rr] [Ii] [Pp] [Tt];
219                ScriptEnd = "</" [Ss] [Cc] [Rr] [Ii] [Pp] [Tt];
220
221                StyleStart = "<" [Ss] [Tt] [Yy] [Ll] [Ee];
222                StyleEnd = "</" [Ss] [Tt] [Yy] [Ll] [Ee];
223
224                LT = "<" | "</";
225                GT = ">";
226                EqualSign = "=";
227
228                DoubleStringContent = [^\r\n\"]*;
229                SingleStringContent = [^\r\n\']*;
230                StringLiteral = "\"" DoubleStringContent "\"" | "'" SingleStringContent "'";
231                DoubleStringStart = "\"" DoubleStringContent [\r\n];
232                DoubleStringEnd = DoubleStringContent "\"";
233                SingleStringStart = "'" SingleStringContent [\r\n];
234                SingleStringEnd = SingleStringContent "'";
235
236                Identifier = [^ \r\n"'<>\[\]=]+;
237
238                <INITIAL> Comment { this.tokenType = "html-comment"; return cursor; }
239                <INITIAL> CommentStart => COMMENT { this.tokenType = "html-comment"; return cursor; }
240                <COMMENT> CommentContent => COMMENT { this.tokenType = "html-comment"; return cursor; }
241                <COMMENT> CommentEnd => INITIAL { this.tokenType = "html-comment"; return cursor; }
242
243                <INITIAL> DocTypeStart => DOCTYPE { this.tokenType = "html-doctype"; return cursor; }
244                <DOCTYPE> DocTypeContent => DOCTYPE { this.tokenType = "html-doctype"; return cursor; }
245                <DOCTYPE> GT => INITIAL { this.tokenType = "html-doctype"; return cursor; }
246
247                <INITIAL> ScriptStart => TAG
248                {
249                    if (this._condition.parseCondition & this._parseConditions.SCRIPT) {
250                        // Do not tokenize script tag contents, keep lexer state, even though processing "<".
251                        this.setLexCondition(this._lexConditions.INITIAL);
252                        this.tokenType = null;
253                        return cursor;
254                    }
255                    this.tokenType = "html-tag";
256                    this._condition.parseCondition = this._parseConditions.SCRIPT;
257                    this._setExpectingAttribute();
258                    return cursor;
259                }
260
261                <INITIAL> ScriptEnd => TAG
262                {
263                    this.tokenType = "html-tag";
264                    this._condition.parseCondition = this._parseConditions.INITIAL;
265                    this.scriptEnded(cursor - 8);
266                    return cursor;
267                }
268
269                <INITIAL> StyleStart => TAG
270                {
271                    if (this._condition.parseCondition & this._parseConditions.STYLE) {
272                        // Do not tokenize style tag contents, keep lexer state, even though processing "<".
273                        this.setLexCondition(this._lexConditions.INITIAL);
274                        this.tokenType = null;
275                        return cursor;
276                    }
277                    this.tokenType = "html-tag";
278                    this._condition.parseCondition = this._parseConditions.STYLE;
279                    this._setExpectingAttribute();
280                    return cursor;
281                }
282
283                <INITIAL> StyleEnd => TAG
284                {
285                    this.tokenType = "html-tag";
286                    this._condition.parseCondition = this._parseConditions.INITIAL;
287                    this.styleEnded(cursor - 7);
288                    return cursor;
289                }
290
291                <INITIAL> LT => TAG
292                {
293                    if (this._condition.parseCondition & (this._parseConditions.SCRIPT | this._parseConditions.STYLE)) {
294                        // Do not tokenize script and style tag contents, keep lexer state, even though processing "<".
295                        this.setLexCondition(this._lexConditions.INITIAL);
296                        this.tokenType = null;
297                        return cursor;
298                    }
299
300                    this._condition.parseCondition = this._parseConditions.INITIAL;
301                    this.tokenType = "html-tag";
302                    return cursor;
303                }
304
305                <TAG> GT => INITIAL
306                {
307                    this.tokenType = "html-tag";
308                    if (this._condition.parseCondition & this._parseConditions.SCRIPT) {
309                        this.scriptStarted(cursor);
310                        // Do not tokenize script tag contents.
311                        return cursor;
312                    }
313
314                    if (this._condition.parseCondition & this._parseConditions.STYLE) {
315                        this.styleSheetStarted(cursor);
316                        // Do not tokenize style tag contents.
317                        return cursor;
318                    }
319
320                    this._condition.parseCondition = this._parseConditions.INITIAL;
321                    return cursor;
322                }
323
324                <TAG> StringLiteral { return this._stringToken(cursor, true); }
325                <TAG> DoubleStringStart => DSTRING { return this._stringToken(cursor); }
326                <DSTRING> DoubleStringContent => DSTRING { return this._stringToken(cursor); }
327                <DSTRING> DoubleStringEnd => TAG { return this._stringToken(cursor, true); }
328                <TAG> SingleStringStart => SSTRING { return this._stringToken(cursor); }
329                <SSTRING> SingleStringContent => SSTRING { return this._stringToken(cursor); }
330                <SSTRING> SingleStringEnd => TAG { return this._stringToken(cursor, true); }
331
332                <TAG> EqualSign => TAG
333                {
334                    if (this._isExpectingAttribute())
335                        this._setExpectingAttributeValue();
336                    this.tokenType = null;
337                    return cursor;
338                }
339
340                <TAG> Identifier
341                {
342                    if (this._condition.parseCondition === this._parseConditions.SCRIPT || this._condition.parseCondition === this._parseConditions.STYLE) {
343                        // Fall through if expecting attributes.
344                        this.tokenType = null;
345                        return cursor;
346                    }
347
348                    if (this._condition.parseCondition === this._parseConditions.INITIAL) {
349                        this.tokenType = "html-tag";
350                        this._setExpectingAttribute();
351                        var token = this._line.substring(cursorOnEnter, cursor);
352                        if (token === "a")
353                            this._condition.parseCondition |= this._parseConditions.A_NODE;
354                        else if (this._condition.parseCondition & this._parseConditions.A_NODE)
355                            this._condition.parseCondition ^= this._parseConditions.A_NODE;
356                    } else if (this._isExpectingAttribute()) {
357                        var token = this._line.substring(cursorOnEnter, cursor);
358                        if (token === "href" || token === "src")
359                            this._condition.parseCondition |= this._parseConditions.LINKIFY;
360                        else if (this._condition.parseCondition |= this._parseConditions.LINKIFY)
361                            this._condition.parseCondition ^= this._parseConditions.LINKIFY;
362                        this.tokenType = "html-attribute-name";
363                    } else if (this._isExpectingAttributeValue())
364                        this.tokenType = this._attrValueTokenType();
365                    else
366                        this.tokenType = null;
367                    return cursor;
368                }
369                <*> [^] { this.tokenType = null; return cursor; }
370            */
371        }
372    }
373}
374
// Chain the prototypes so that property lookups which miss on the HTML tokenizer's
// prototype continue on WebInspector.SourceTokenizer.prototype.
WebInspector.SourceHTMLTokenizer.prototype.__proto__ = WebInspector.SourceTokenizer.prototype;
376