# The MIT License (MIT)
#
# Copyright (c) 2007-2018 Einar Lielmanis, Liam Newman, and contributors.
#
# Permission is hereby granted, free of charge, to any person
# obtaining a copy of this software and associated documentation files
# (the "Software"), to deal in the Software without restriction,
# including without limitation the rights to use, copy, modify, merge,
# publish, distribute, sublicense, and/or sell copies of the Software,
# and to permit persons to whom the Software is furnished to do so,
# subject to the following conditions:
#
# The above copyright notice and this permission notice shall be
# included in all copies or substantial portions of the Software.
#
# THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
# EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
# MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
# NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS
# BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN
# ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN
# CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
# SOFTWARE.

import re
from ..core.inputscanner import InputScanner
from ..core.tokenizer import TokenTypes as BaseTokenTypes
from ..core.tokenizer import Tokenizer as BaseTokenizer
from ..core.tokenizer import TokenizerPatterns as BaseTokenizerPatterns
from ..core.directives import Directives

from ..core.pattern import Pattern
from ..core.templatablepattern import TemplatablePattern


__all__ = ["TOKEN", "Tokenizer", "TokenTypes"]
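
# Minimal usage sketch (illustrative; assumes the package's BeautifierOptions
# and the tokenize() entry point inherited from the core tokenizer):
#
#     from jsbeautifier.javascript.options import BeautifierOptions
#     tokens = Tokenizer("var a = 1;", BeautifierOptions()).tokenize()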

class TokenTypes(BaseTokenTypes):
    START_EXPR = 'TK_START_EXPR'
    END_EXPR = 'TK_END_EXPR'
    START_BLOCK = 'TK_START_BLOCK'
    END_BLOCK = 'TK_END_BLOCK'
    WORD = 'TK_WORD'
    RESERVED = 'TK_RESERVED'
    SEMICOLON = 'TK_SEMICOLON'
    STRING = 'TK_STRING'
    EQUALS = 'TK_EQUALS'
    OPERATOR = 'TK_OPERATOR'
    COMMA = 'TK_COMMA'
    BLOCK_COMMENT = 'TK_BLOCK_COMMENT'
    COMMENT = 'TK_COMMENT'
    DOT = 'TK_DOT'
    UNKNOWN = 'TK_UNKNOWN'

    def __init__(self):
        pass


TOKEN = TokenTypes()

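# Guard used by _read_singles: '.' is a member-access DOT only when the next
# char is neither a digit nor another '.', leaving '.5' to the number reader
# and '...' to the punctuation reader.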
dot_pattern = re.compile(r'[^\d\.]')

number_pattern = re.compile(
    r'0[xX][0-9a-fA-F]*|0[oO][0-7]*|0[bB][01]*|\d+n|(?:\.\d+|\d+\.?\d*)(?:[eE][+-]?\d+)?')
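# Illustrative literals this accepts: 0xFF, 0o17, 0b101, 42n (BigInt),
# .5, 3., and 6.02e23.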
digit = re.compile(r'[0-9]')


positionable_operators = frozenset(
    (">>> === !== " +
    "<< && >= ** != == <= >> || |> " +
    "< / - + > : & % ? ^ | *").split(' '))

punct = (">>>= " +
    "... >>= <<= === >>> !== **= " +
    "=> ^= :: /= << <= == && -= >= >> != -- += ** || ++ %= &= *= |= |> " +
    "= ! ? > < : / ^ - + * & % ~ |")

punct = re.compile(r'([-[\]{}()*+?.,\\^$|#])').sub(r'\\\1', punct)
# optional chaining '?.', but not when followed by a digit
# (so 'a?.5:b' still parses as a ternary)
punct = '\\?\\.(?!\\d) ' + punct
punct = punct.replace(' ', '|')

punct_pattern = re.compile(punct)
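# The operator list above is ordered longest-first (e.g. '>>>=' before '>>>'
# and '>>'), so the regex alternation always takes the longest match.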

# Words that should always start on a new line
line_starters = frozenset(
    ('continue,try,throw,return,var,let,const,if,switch,case,default,for,' +
    'while,break,function,import,export').split(','))
reserved_words = line_starters | frozenset([
    'do', 'in', 'of', 'else', 'get', 'set', 'new', 'catch', 'finally',
    'typeof', 'yield', 'async', 'await', 'from', 'as'])

reserved_word_pattern = re.compile(r'^(?:' + '|'.join(reserved_words) + r')$')

directives_core = Directives(r'/\*', r'\*/')

xmlRegExp = re.compile(
    r'[\s\S]*?<(\/?)([-a-zA-Z:0-9_.]+|{[\s\S]+?}|!\[CDATA\[[\s\S]*?\]\])(\s+{[\s\S]+?}|\s+[-a-zA-Z:0-9_.]+|\s+[-a-zA-Z:0-9_.]+\s*=\s*(\'[^\']*\'|"[^"]*"|{[\s\S]+?}))*\s*(/?)\s*>')
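# Groups: (1) '/' for a closing tag, (2) the tag name / {expression} / CDATA
# opener, (3)-(4) attributes, and the last group captures '/' when the tag is
# self-closing; _read_xml below relies on groups 1, 2, and the last.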

class TokenizerPatterns(BaseTokenizerPatterns):
    def __init__(self, input_scanner, acorn, options):
        BaseTokenizerPatterns.__init__(self, input_scanner)

        # This is not pretty, but given how we did the version import
        # it is the only way to do this without having setup.py fail on a
        # missing six dependency.
        six = __import__("six")

        # IMPORTANT: This string must be run through six to handle \u chars
        self.whitespace = self.whitespace.matching(
            six.u(r'\u00A0\u1680\u180e\u2000-\u200a\u202f\u205f\u3000\ufeff'),
            six.u(r'\u2028\u2029'))

        pattern = Pattern(input_scanner)
        templatable = TemplatablePattern(input_scanner) \
            .read_options(options)

        self.identifier = templatable.starting_with(
            acorn.identifier).matching(acorn.identifierMatch)
        self.number = pattern.matching(number_pattern)
        self.punct = pattern.matching(punct_pattern)
        self.comment = pattern.starting_with(r'//').until(
            six.u(r'[\n\r\u2028\u2029]'))
        self.block_comment = pattern.starting_with(r'/\*').until_after(r'\*/')
        self.html_comment_start = pattern.matching(r'<!--')
        self.html_comment_end = pattern.matching(r'-->')
        self.include = pattern.starting_with(
            r'#include').until_after(acorn.lineBreak)
        self.shebang = pattern.starting_with(
            r'#!').until_after(acorn.lineBreak)

        self.xml = pattern.matching(xmlRegExp)

        self.single_quote = templatable.until(six.u(r"['\\\n\r\u2028\u2029]"))
        self.double_quote = templatable.until(six.u(r'["\\\n\r\u2028\u2029]'))
        self.template_text = templatable.until(r'[`\\$]')
        self.template_expression = templatable.until(r'[`}\\]')


class Tokenizer(BaseTokenizer):
    positionable_operators = positionable_operators
    line_starters = line_starters

    def __init__(self, input_string, opts):
        BaseTokenizer.__init__(self, input_string, opts)

        import jsbeautifier.javascript.acorn as acorn
        self.acorn = acorn

        self.in_html_comment = False
        self.has_char_escapes = False

        self._patterns = TokenizerPatterns(self._input, self.acorn, opts)

    def _reset(self):
        self.in_html_comment = False

    def _is_comment(self, current_token):
        return current_token.type == TOKEN.COMMENT or \
            current_token.type == TOKEN.BLOCK_COMMENT or \
            current_token.type == TOKEN.UNKNOWN

    def _is_opening(self, current_token):
        return current_token.type == TOKEN.START_BLOCK or \
            current_token.type == TOKEN.START_EXPR

    def _is_closing(self, current_token, open_token):
        return (current_token.type == TOKEN.END_BLOCK or
                current_token.type == TOKEN.END_EXPR) and \
            (open_token is not None and (
                (current_token.text == ']' and open_token.text == '[') or
                (current_token.text == ')' and open_token.text == '(') or
                (current_token.text == '}' and open_token.text == '{')))

    def _get_next_token(self, previous_token, open_token):
        token = None
        self._readWhitespace()

        c = self._input.peek()
        if c is None:
            token = self._create_token(TOKEN.EOF, '')

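        # Try each reader in order; the first that recognizes the input
        # produces the token and the remaining readers are skipped.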
        token = token or self._read_non_javascript(c)
        token = token or self._read_string(c)
        token = token or self._read_word(previous_token)
        token = token or self._read_singles(c)
        token = token or self._read_comment(c)
        token = token or self._read_regexp(c, previous_token)
        token = token or self._read_xml(c, previous_token)
        token = token or self._read_punctuation()
        token = token or self._create_token(TOKEN.UNKNOWN, self._input.next())

        return token

    def _read_singles(self, c):
        token = None

        if c == '(' or c == '[':
            token = self._create_token(TOKEN.START_EXPR, c)
        elif c == ')' or c == ']':
            token = self._create_token(TOKEN.END_EXPR, c)
        elif c == '{':
            token = self._create_token(TOKEN.START_BLOCK, c)
        elif c == '}':
            token = self._create_token(TOKEN.END_BLOCK, c)
        elif c == ';':
            token = self._create_token(TOKEN.SEMICOLON, c)
        elif c == '.' and self._input.peek(1) is not None and \
                bool(dot_pattern.match(self._input.peek(1))):
            token = self._create_token(TOKEN.DOT, c)
        elif c == ',':
            token = self._create_token(TOKEN.COMMA, c)

        if token is not None:
            self._input.next()

        return token

    def _read_word(self, previous_token):
        resulting_string = self._patterns.identifier.read()

        if bool(resulting_string):
            resulting_string = re.sub(
                self.acorn.allLineBreaks, '\n', resulting_string)
            if not (previous_token.type == TOKEN.DOT or (
                    previous_token.type == TOKEN.RESERVED and (
                        previous_token.text == 'set' or
                        previous_token.text == 'get')
                    )) and reserved_word_pattern.match(resulting_string):
                if resulting_string == 'in' or resulting_string == 'of':
                    # 'in' and 'of' act as operators, so tokenize them as such
                    return self._create_token(TOKEN.OPERATOR, resulting_string)

                return self._create_token(TOKEN.RESERVED, resulting_string)

            return self._create_token(TOKEN.WORD, resulting_string)

        resulting_string = self._patterns.number.read()
        if resulting_string != '':
            return self._create_token(TOKEN.WORD, resulting_string)

    def _read_comment(self, c):
        token = None
        if c == '/':
            comment = ''
            if self._input.peek(1) == '*':  # peek /* .. */ comment
                comment = self._patterns.block_comment.read()

                directives = directives_core.get_directives(comment)
                if directives and directives.get('ignore') == 'start':
                    comment += directives_core.readIgnored(self._input)
                comment = re.sub(self.acorn.allLineBreaks, '\n', comment)
                token = self._create_token(TOKEN.BLOCK_COMMENT, comment)
                token.directives = directives

            elif self._input.peek(1) == '/':  # peek // comment
                comment = self._patterns.comment.read()
                token = self._create_token(TOKEN.COMMENT, comment)

        return token

    def _read_string(self, c):
        if c == '`' or c == "'" or c == '"':
            resulting_string = self._input.next()
            self.has_char_escapes = False

            if c == '`':
                resulting_string += self.parse_string('`', True, '${')
            else:
                resulting_string += self.parse_string(c)

            if self.has_char_escapes and self._options.unescape_strings:
                resulting_string = self.unescape_string(resulting_string)

            if self._input.peek() == c:
                resulting_string += self._input.next()

            resulting_string = re.sub(
                self.acorn.allLineBreaks, '\n', resulting_string)

            return self._create_token(TOKEN.STRING, resulting_string)

        return None

    def _read_regexp(self, c, previous_token):
        if c == '/' and self.allowRegExOrXML(previous_token):
            # handle regexp
            resulting_string = self._input.next()
            esc = False

            in_char_class = False
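            # Consume until the closing '/', tracking backslash escapes and
            # [...] character classes so an unescaped '/' inside a class
            # does not terminate the literal.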
            while self._input.hasNext() and \
                    (esc or in_char_class or self._input.peek() != c) and \
                    not self._input.testChar(self.acorn.newline):
                resulting_string += self._input.peek()
                if not esc:
                    esc = self._input.peek() == '\\'
                    if self._input.peek() == '[':
                        in_char_class = True
                    elif self._input.peek() == ']':
                        in_char_class = False
                else:
                    esc = False
                self._input.next()

            if self._input.peek() == c:
                resulting_string += self._input.next()

                if c == '/':
                    # regexps may have modifiers /regexp/MOD, so fetch those
                    # too. Only a few flag letters are actually valid, but if
                    # the user puts in garbage, do what we can to take it.
                    resulting_string += self._input.read(
                        self.acorn.identifier)

            return self._create_token(TOKEN.STRING, resulting_string)

        return None

    def _read_xml(self, c, previous_token):
        if self._options.e4x and c == "<" and \
                self.allowRegExOrXML(previous_token):
            # handle e4x xml literals
            xmlStr = ""
            match = self._patterns.xml.read_match()
            if match and not match.group(1):
                rootTag = match.group(2)
                rootTag = re.sub(r'^{\s+', '{', re.sub(r'\s+}$', '}', rootTag))
                isCurlyRoot = rootTag.startswith('{')
                depth = 0
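                # Count opening/closing occurrences of the root tag; the
                # literal ends when the root element closes (depth falls
                # back to zero).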
                while bool(match):
                    isEndTag = match.group(1)
                    tagName = match.group(2)
                    isSingletonTag = (match.groups()[-1] != "") or \
                        (match.group(2)[0:8] == "![CDATA[")
                    if not isSingletonTag and (tagName == rootTag or (
                            isCurlyRoot and rootTag == re.sub(
                                r'^{\s+', '{',
                                re.sub(r'\s+}$', '}', tagName)))):
                        if isEndTag:
                            depth -= 1
                        else:
                            depth += 1

                    xmlStr += match.group(0)
                    if depth <= 0:
                        break

                    match = self._patterns.xml.read_match()

                # if we didn't close correctly, keep unformatted.
                if not match:
                    xmlStr += self._input.match(re.compile(r'[\s\S]*')).group(0)

                xmlStr = re.sub(self.acorn.allLineBreaks, '\n', xmlStr)
                return self._create_token(TOKEN.STRING, xmlStr)

        return None

    def _read_non_javascript(self, c):
        resulting_string = ''

        if c == '#':
            # she-bang
            if self._is_first_token():
                resulting_string = self._patterns.shebang.read()
                if resulting_string:
                    return self._create_token(
                        TOKEN.UNKNOWN, resulting_string.strip() + '\n')

            # handles extendscript #includes
            resulting_string = self._patterns.include.read()

            if resulting_string:
                return self._create_token(
                    TOKEN.UNKNOWN, resulting_string.strip() + '\n')

            c = self._input.next()

            # Spidermonkey-specific sharp variables for circular references
            # https://developer.mozilla.org/En/Sharp_variables_in_JavaScript
            # http://mxr.mozilla.org/mozilla-central/source/js/src/jsscan.cpp
            # around line 1935
            sharp = '#'
            if self._input.hasNext() and self._input.testChar(digit):
                while True:
                    c = self._input.next()
                    sharp += c
                    if (not self._input.hasNext()) or c == '#' or c == '=':
                        break
                if c == '#':
                    pass
                elif self._input.peek() == '[' and self._input.peek(1) == ']':
                    sharp += '[]'
                    self._input.next()
                    self._input.next()
                elif self._input.peek() == '{' and self._input.peek(1) == '}':
                    sharp += '{}'
                    self._input.next()
                    self._input.next()

                return self._create_token(TOKEN.WORD, sharp)

            self._input.back()

        elif c == '<' and self._is_first_token():
            if self._patterns.html_comment_start.read():
                c = '<!--'
                while self._input.hasNext() and \
                        not self._input.testChar(self.acorn.newline):
                    c += self._input.next()

                self.in_html_comment = True
                return self._create_token(TOKEN.COMMENT, c)

        elif c == '-' and self.in_html_comment and \
                self._patterns.html_comment_end.read():
            self.in_html_comment = False
            return self._create_token(TOKEN.COMMENT, '-->')

        return None

    def _read_punctuation(self):
        token = None
        resulting_string = self._patterns.punct.read()
        if resulting_string != '':
            if resulting_string == '=':
                token = self._create_token(TOKEN.EQUALS, resulting_string)
            elif resulting_string == '?.':
                token = self._create_token(TOKEN.DOT, resulting_string)
            else:
                token = self._create_token(TOKEN.OPERATOR, resulting_string)

        return token

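    # Token types after which a '/' begins a regex literal (and '<' may begin
    # an e4x XML literal) rather than a division or comparison operator.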
    __regexTokens = {TOKEN.COMMENT, TOKEN.START_EXPR, TOKEN.START_BLOCK,
        TOKEN.START, TOKEN.END_BLOCK, TOKEN.OPERATOR,
        TOKEN.EQUALS, TOKEN.EOF, TOKEN.SEMICOLON, TOKEN.COMMA}

    def allowRegExOrXML(self, previous_token):
        return (previous_token.type == TOKEN.RESERVED and
                previous_token.text in {'return', 'case', 'throw', 'else',
                                        'do', 'typeof', 'yield'}) or \
            (previous_token.type == TOKEN.END_EXPR and
             previous_token.text == ')' and
             previous_token.opened.previous.type == TOKEN.RESERVED and
             previous_token.opened.previous.text in {'if', 'while', 'for'}) or \
            (previous_token.type in self.__regexTokens)

    def parse_string(
            self,
            delimiter,
            allow_unescaped_newlines=False,
            start_sub=None):
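        """Read a string body up to, but not including, the closing delimiter.

        Template literals recurse: backtick text containing '${' hands off to
        a '}'-delimited expression parse, which may itself contain nested
        backtick strings.
        """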
        if delimiter == '\'':
            pattern = self._patterns.single_quote
        elif delimiter == '"':
            pattern = self._patterns.double_quote
        elif delimiter == '`':
            pattern = self._patterns.template_text
        elif delimiter == '}':
            pattern = self._patterns.template_expression

        resulting_string = pattern.read()
        next = ''
        while self._input.hasNext():
            next = self._input.next()
            if next == delimiter or \
                    (not allow_unescaped_newlines and
                        self.acorn.newline.match(next)):
                self._input.back()
                break
            elif next == '\\' and self._input.hasNext():
                current_char = self._input.peek()
                if current_char == 'x' or current_char == 'u':
                    self.has_char_escapes = True
                elif current_char == '\r' and self._input.peek(1) == '\n':
                    self._input.next()

                next += self._input.next()
            elif start_sub is not None:
                if start_sub == '${' and next == '$' and \
                        self._input.peek() == '{':
                    next += self._input.next()

                if start_sub == next:
                    if delimiter == '`':
                        next += self.parse_string(
                            '}', allow_unescaped_newlines, '`')
                    else:
                        next += self.parse_string(
                            '`', allow_unescaped_newlines, '${')

                    if self._input.hasNext():
                        next += self._input.next()

            next += pattern.read()
            resulting_string += next
        return resulting_string

    def unescape_string(self, s):
        # You'd think a regex would work for this:
        # return s.replace(/\\x([0-9a-f]{2})/gi, function(match, val) {
        #         return String.fromCharCode(parseInt(val, 16));
        #     })
        # However, dealing with '\xff', '\\xff', '\\\xff' makes this more fun.
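        # e.g. r'\x41' -> 'A'; control escapes (\x00-\x1f) stay escaped, and
        # any \x7f-\xff escape makes the whole string come back untouched.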
        out = self.acorn.six.u('')
        escaped = 0

        input_scan = InputScanner(s)
        matched = None

        while input_scan.hasNext():
            # Keep whitespace and any non-backslash characters,
            # and keep backslash pairs ('\\') as-is.
            matched = input_scan.match(re.compile(r'([\s]|[^\\]|\\\\)+'))

            if matched:
                out += matched.group(0)

            if input_scan.peek() != '\\':
                continue

            input_scan.next()
            if input_scan.peek() == 'x':
                matched = input_scan.match(re.compile(r'x([0-9A-Fa-f]{2})'))
            elif input_scan.peek() == 'u':
                matched = input_scan.match(re.compile(r'u([0-9A-Fa-f]{4})'))
            else:
                out += '\\'
                if input_scan.hasNext():
                    out += input_scan.next()
                continue

            # If there's some error decoding, return the original string
            if not matched:
                return s

            escaped = int(matched.group(1), 16)

            if escaped > 0x7e and escaped <= 0xff and matched.group(
                    0).startswith('x'):
                # we bail out on \x7f..\xff,
                # leaving the whole string escaped,
                # as it's probably completely binary
                return s
            elif escaped >= 0x00 and escaped < 0x20:
                # leave 0x00...0x1f escaped
                out += '\\' + matched.group(0)
                continue
            elif escaped == 0x22 or escaped == 0x27 or escaped == 0x5c:
                # double-quote, single-quote, backslash - keep these escaped
                out += ('\\' + chr(escaped))
            else:
                out += self.acorn.six.unichr(escaped)

        return out