# The MIT License (MIT)
#
# Copyright (c) 2007-2018 Einar Lielmanis, Liam Newman, and contributors.
#
# Permission is hereby granted, free of charge, to any person
# obtaining a copy of this software and associated documentation files
# (the "Software"), to deal in the Software without restriction,
# including without limitation the rights to use, copy, modify, merge,
# publish, distribute, sublicense, and/or sell copies of the Software,
# and to permit persons to whom the Software is furnished to do so,
# subject to the following conditions:
#
# The above copyright notice and this permission notice shall be
# included in all copies or substantial portions of the Software.
#
# THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
# EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
# MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
# NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS
# BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN
# ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN
# CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
# SOFTWARE.
24 25import re 26from ..core.inputscanner import InputScanner 27from ..core.tokenizer import TokenTypes as BaseTokenTypes 28from ..core.tokenizer import Tokenizer as BaseTokenizer 29from ..core.tokenizer import TokenizerPatterns as BaseTokenizerPatterns 30from ..core.directives import Directives 31 32from ..core.pattern import Pattern 33from ..core.templatablepattern import TemplatablePattern 34 35 36__all__ = ["TOKEN", "Tokenizer", "TokenTypes"] 37 38class TokenTypes(BaseTokenTypes): 39 START_EXPR = 'TK_START_EXPR' 40 END_EXPR = 'TK_END_EXPR' 41 START_BLOCK = 'TK_START_BLOCK' 42 END_BLOCK = 'TK_END_BLOCK' 43 WORD = 'TK_WORD' 44 RESERVED = 'TK_RESERVED' 45 SEMICOLON = 'TK_SEMICOLON' 46 STRING = 'TK_STRING' 47 EQUALS = 'TK_EQUALS' 48 OPERATOR = 'TK_OPERATOR' 49 COMMA = 'TK_COMMA' 50 BLOCK_COMMENT = 'TK_BLOCK_COMMENT' 51 COMMENT = 'TK_COMMENT' 52 DOT = 'TK_DOT' 53 UNKNOWN = 'TK_UNKNOWN' 54 55 def __init__(self): 56 pass 57 58 59TOKEN = TokenTypes() 60 61dot_pattern = re.compile(r'[^\d\.]') 62 63number_pattern = re.compile( 64 r'0[xX][0123456789abcdefABCDEF]*|0[oO][01234567]*|0[bB][01]*|\d+n|(?:\.\d+|\d+\.?\d*)(?:[eE][+-]?\d+)?') 65digit = re.compile(r'[0-9]') 66 67 68positionable_operators = frozenset( 69 (">>> === !== " + 70 "<< && >= ** != == <= >> || |> " + 71 "< / - + > : & % ? ^ | *").split(' ')) 72 73punct = (">>>= " + 74 "... >>= <<= === >>> !== **= " + 75 "=> ^= :: /= << <= == && -= >= >> != -- += ** || ++ %= &= *= |= |> " + 76 "= ! ? > < : / ^ - + * & % ~ |") 77 78punct = re.compile(r'([-[\]{}()*+?.,\\^$|#])').sub(r'\\\1', punct) 79# ?. 
but not if followed by a number 80punct = '\\?\\.(?!\\d) ' + punct 81punct = punct.replace(' ', '|') 82 83punct_pattern = re.compile(punct) 84 85# Words which always should start on a new line 86line_starters = frozenset( 87 ('continue,try,throw,return,var,let,const,if,switch,case,default,for,' + 88 'while,break,function,import,export').split(',')) 89reserved_words = line_starters | frozenset(['do', 90 'in', 91 'of', 92 'else', 93 'get', 94 'set', 95 'new', 96 'catch', 97 'finally', 98 'typeof', 99 'yield', 100 'async', 101 'await', 102 'from', 103 'as']) 104 105reserved_word_pattern = re.compile(r'^(?:' + '|'.join(reserved_words) + r')$') 106 107directives_core = Directives(r'/\*', r'\*/') 108 109xmlRegExp = re.compile( 110 r'[\s\S]*?<(\/?)([-a-zA-Z:0-9_.]+|{[\s\S]+?}|!\[CDATA\[[\s\S]*?\]\])(\s+{[\s\S]+?}|\s+[-a-zA-Z:0-9_.]+|\s+[-a-zA-Z:0-9_.]+\s*=\s*(\'[^\']*\'|"[^"]*"|{[\s\S]+?}))*\s*(/?)\s*>') 111 112class TokenizerPatterns(BaseTokenizerPatterns): 113 def __init__(self, input_scanner, acorn, options): 114 BaseTokenizerPatterns.__init__(self, input_scanner) 115 116 # This is not pretty, but given how we did the version import 117 # it is the only way to do this without having setup.py fail on a missing 118 # six dependency. 
119 six = __import__("six") 120 121 # IMPORTANT: This string must be run through six to handle \u chars 122 self.whitespace = self.whitespace.matching( 123 six.u(r'\u00A0\u1680\u180e\u2000-\u200a\u202f\u205f\u3000\ufeff'), 124 six.u(r'\u2028\u2029')) 125 126 pattern = Pattern(input_scanner) 127 templatable = TemplatablePattern(input_scanner) \ 128 .read_options(options) 129 130 self.identifier = templatable.starting_with(acorn.identifier \ 131 ).matching(acorn.identifierMatch) 132 self.number = pattern.matching(number_pattern) 133 self.punct = pattern.matching(punct_pattern) 134 self.comment = pattern.starting_with(r'//').until( 135 six.u(r'[\n\r\u2028\u2029]')) 136 self.block_comment = pattern.starting_with(r'/\*').until_after(r'\*/') 137 self.html_comment_start = pattern.matching(r'<!--') 138 self.html_comment_end = pattern.matching(r'-->') 139 self.include = pattern.starting_with(r'#include' \ 140 ).until_after(acorn.lineBreak) 141 self.shebang = pattern.starting_with(r'#!' \ 142 ).until_after(acorn.lineBreak) 143 144 self.xml = pattern.matching(xmlRegExp) 145 146 self.single_quote = templatable.until(six.u(r"['\\\n\r\u2028\u2029]")) 147 self.double_quote = templatable.until(six.u(r'["\\\n\r\u2028\u2029]')) 148 self.template_text = templatable.until(r'[`\\$]') 149 self.template_expression = templatable.until(r'[`}\\]') 150 151 152 153class Tokenizer(BaseTokenizer): 154 positionable_operators = positionable_operators 155 line_starters = line_starters 156 157 def __init__(self, input_string, opts): 158 BaseTokenizer.__init__(self, input_string, opts) 159 160 import jsbeautifier.javascript.acorn as acorn 161 self.acorn = acorn 162 163 self.in_html_comment = False 164 self.has_char_escapes = False 165 166 self._patterns = TokenizerPatterns(self._input, self.acorn, opts) 167 168 169 def _reset(self): 170 self.in_html_comment = False 171 172 def _is_comment(self, current_token): 173 return current_token.type == TOKEN.COMMENT or \ 174 current_token.type == 
TOKEN.BLOCK_COMMENT or \ 175 current_token.type == TOKEN.UNKNOWN 176 177 178 def _is_opening(self, current_token): 179 return current_token.type == TOKEN.START_BLOCK or current_token.type == TOKEN.START_EXPR 180 181 def _is_closing(self, current_token, open_token): 182 return (current_token.type == TOKEN.END_BLOCK or current_token.type == TOKEN.END_EXPR) and \ 183 (open_token is not None and ( 184 (current_token.text == ']' and open_token.text == '[') or 185 (current_token.text == ')' and open_token.text == '(') or 186 (current_token.text == '}' and open_token.text == '{'))) 187 188 def _get_next_token(self, previous_token, open_token): 189 token = None 190 self._readWhitespace() 191 192 c = self._input.peek() 193 if c is None: 194 token = self._create_token(TOKEN.EOF, '') 195 196 token = token or self._read_non_javascript(c) 197 token = token or self._read_string(c) 198 token = token or self._read_word(previous_token) 199 token = token or self._read_singles(c) 200 token = token or self._read_comment(c) 201 token = token or self._read_regexp(c, previous_token) 202 token = token or self._read_xml(c, previous_token) 203 token = token or self._read_punctuation() 204 token = token or self._create_token(TOKEN.UNKNOWN, self._input.next()) 205 206 return token 207 208 def _read_singles(self, c): 209 token = None 210 211 if c == '(' or c == '[': 212 token = self._create_token(TOKEN.START_EXPR, c) 213 elif c == ')' or c == ']': 214 token = self._create_token(TOKEN.END_EXPR, c) 215 elif c == '{': 216 token = self._create_token(TOKEN.START_BLOCK, c) 217 elif c == '}': 218 token = self._create_token(TOKEN.END_BLOCK, c) 219 elif c == ';': 220 token = self._create_token(TOKEN.SEMICOLON, c) 221 elif c == '.' 
and self._input.peek(1) is not None and \ 222 bool(dot_pattern.match(self._input.peek(1))): 223 token = self._create_token(TOKEN.DOT, c) 224 elif c == ',': 225 token = self._create_token(TOKEN.COMMA, c) 226 227 if token is not None: 228 self._input.next() 229 230 return token 231 232 def _read_word(self, previous_token): 233 resulting_string = self._patterns.identifier.read() 234 235 if bool(resulting_string): 236 resulting_string = re.sub(self.acorn.allLineBreaks, '\n', resulting_string) 237 if not (previous_token.type == TOKEN.DOT or ( 238 previous_token.type == TOKEN.RESERVED and ( 239 previous_token.text == 'set' or previous_token.text == 'get') 240 )) and reserved_word_pattern.match(resulting_string): 241 if resulting_string == 'in' or resulting_string == 'of': 242 # in and of are operators, need to hack 243 return self._create_token(TOKEN.OPERATOR, resulting_string) 244 245 return self._create_token(TOKEN.RESERVED, resulting_string) 246 247 return self._create_token(TOKEN.WORD, resulting_string) 248 249 resulting_string = self._patterns.number.read() 250 if resulting_string != '': 251 return self._create_token(TOKEN.WORD, resulting_string) 252 253 def _read_comment(self, c): 254 token = None 255 if c == '/': 256 comment = '' 257 if self._input.peek(1) == '*': # peek /* .. 
*/ comment 258 comment = self._patterns.block_comment.read() 259 260 directives = directives_core.get_directives(comment) 261 if directives and directives.get('ignore') == 'start': 262 comment += directives_core.readIgnored(self._input) 263 comment = re.sub(self.acorn.allLineBreaks, '\n', comment) 264 token = self._create_token(TOKEN.BLOCK_COMMENT, comment) 265 token.directives = directives 266 267 elif self._input.peek(1) == '/': # peek // comment 268 comment = self._patterns.comment.read() 269 token = self._create_token(TOKEN.COMMENT, comment) 270 271 return token 272 273 274 def _read_string(self, c): 275 if c == '`' or c == "'" or c == '"': 276 resulting_string = self._input.next() 277 self.has_char_escapes = False 278 279 if c == '`': 280 resulting_string += self.parse_string('`', True, '${') 281 else: 282 resulting_string += self.parse_string(c) 283 284 if self.has_char_escapes and self._options.unescape_strings: 285 resulting_string = self.unescape_string(resulting_string) 286 287 if self._input.peek() == c: 288 resulting_string += self._input.next() 289 290 resulting_string = re.sub( 291 self.acorn.allLineBreaks, '\n', resulting_string) 292 293 return self._create_token(TOKEN.STRING, resulting_string) 294 295 return None 296 297 def _read_regexp(self, c, previous_token): 298 299 if c == '/' and self.allowRegExOrXML(previous_token): 300 # handle regexp 301 resulting_string = self._input.next() 302 esc = False 303 304 in_char_class = False 305 while self._input.hasNext() and \ 306 (esc or in_char_class or self._input.peek() != c) and \ 307 not self._input.testChar(self.acorn.newline): 308 resulting_string += self._input.peek() 309 if not esc: 310 esc = self._input.peek() == '\\' 311 if self._input.peek() == '[': 312 in_char_class = True 313 elif self._input.peek() == ']': 314 in_char_class = False 315 else: 316 esc = False 317 self._input.next() 318 319 if self._input.peek() == c: 320 resulting_string += self._input.next() 321 322 if c == '/': 323 # regexps 
may have modifiers /regexp/MOD, so fetch those too 324 # Only [gim] are valid, but if the user puts in garbage, do 325 # what we can to take it. 326 resulting_string += self._input.read( 327 self.acorn.identifier) 328 329 return self._create_token(TOKEN.STRING, resulting_string) 330 331 return None 332 333 334 def _read_xml(self, c, previous_token): 335 if self._options.e4x and c == "<" and self.allowRegExOrXML(previous_token): 336 # handle e4x xml literals 337 xmlStr = "" 338 match = self._patterns.xml.read_match() 339 if match and not match.group(1): 340 rootTag = match.group(2) 341 rootTag = re.sub(r'^{\s+', '{', re.sub(r'\s+}$', '}', rootTag)) 342 isCurlyRoot = rootTag.startswith('{') 343 depth = 0 344 while bool(match): 345 isEndTag = match.group(1) 346 tagName = match.group(2) 347 isSingletonTag = ( 348 match.groups()[-1] != "") or (match.group(2)[0:8] == "![CDATA[") 349 if not isSingletonTag and (tagName == rootTag or ( 350 isCurlyRoot and re.sub(r'^{\s+', '{', re.sub(r'\s+}$', '}', tagName)))): 351 if isEndTag: 352 depth -= 1 353 else: 354 depth += 1 355 356 xmlStr += match.group(0) 357 if depth <= 0: 358 break 359 360 match = self._patterns.xml.read_match() 361 362 # if we didn't close correctly, keep unformatted. 
363 if not match: 364 xmlStr += self._input.match(re.compile(r'[\s\S]*')).group(0) 365 366 xmlStr = re.sub(self.acorn.allLineBreaks, '\n', xmlStr) 367 return self._create_token(TOKEN.STRING, xmlStr) 368 369 return None 370 371 def _read_non_javascript(self, c): 372 resulting_string = '' 373 374 if c == '#': 375 376 # she-bang 377 if self._is_first_token(): 378 resulting_string = self._patterns.shebang.read() 379 if resulting_string: 380 return self._create_token(TOKEN.UNKNOWN, resulting_string.strip() + '\n') 381 382 # handles extendscript #includes 383 resulting_string = self._patterns.include.read() 384 385 if resulting_string: 386 return self._create_token(TOKEN.UNKNOWN, resulting_string.strip() + '\n') 387 388 c = self._input.next() 389 390 # Spidermonkey-specific sharp variables for circular references 391 # https://developer.mozilla.org/En/Sharp_variables_in_JavaScript 392 # http://mxr.mozilla.org/mozilla-central/source/js/src/jsscan.cpp 393 # around line 1935 394 sharp = '#' 395 if self._input.hasNext() and self._input.testChar(digit): 396 while True: 397 c = self._input.next() 398 sharp += c 399 if (not self._input.hasNext()) or c == '#' or c == '=': 400 break 401 if c == '#': 402 pass 403 elif self._input.peek() == '[' and self._input.peek(1) == ']': 404 sharp += '[]' 405 self._input.next() 406 self._input.next() 407 elif self._input.peek() == '{' and self._input.peek(1) == '}': 408 sharp += '{}' 409 self._input.next() 410 self._input.next() 411 412 return self._create_token(TOKEN.WORD, sharp) 413 414 self._input.back() 415 416 elif c == '<' and self._is_first_token(): 417 418 if self._patterns.html_comment_start.read(): 419 c = '<!--' 420 while self._input.hasNext() and not self._input.testChar(self.acorn.newline): 421 c += self._input.next() 422 423 self.in_html_comment = True 424 return self._create_token(TOKEN.COMMENT, c) 425 426 elif c == '-' and self.in_html_comment and \ 427 self._patterns.html_comment_end.read(): 428 self.in_html_comment = False 
429 return self._create_token(TOKEN.COMMENT, '-->') 430 431 return None 432 433 def _read_punctuation(self): 434 token = None 435 resulting_string = self._patterns.punct.read() 436 if resulting_string != '': 437 if resulting_string == '=': 438 token = self._create_token(TOKEN.EQUALS, resulting_string) 439 elif resulting_string == '?.': 440 token = self._create_token(TOKEN.DOT, resulting_string) 441 else: 442 token = self._create_token(TOKEN.OPERATOR, resulting_string) 443 444 return token 445 446 __regexTokens = { TOKEN.COMMENT, TOKEN.START_EXPR, TOKEN.START_BLOCK, 447 TOKEN.START, TOKEN.END_BLOCK, TOKEN.OPERATOR, 448 TOKEN.EQUALS, TOKEN.EOF, TOKEN.SEMICOLON, TOKEN.COMMA } 449 def allowRegExOrXML(self, previous_token): 450 return (previous_token.type == TOKEN.RESERVED and previous_token.text in {'return', 'case', 'throw', 'else', 'do', 'typeof', 'yield'}) or \ 451 (previous_token.type == TOKEN.END_EXPR and previous_token.text == ')' and 452 previous_token.opened.previous.type == TOKEN.RESERVED and previous_token.opened.previous.text in {'if', 'while', 'for'}) or \ 453 (previous_token.type in self.__regexTokens ) 454 455 def parse_string( 456 self, 457 delimiter, 458 allow_unescaped_newlines=False, 459 start_sub=None): 460 if delimiter == '\'': 461 pattern = self._patterns.single_quote 462 elif delimiter == '"': 463 pattern = self._patterns.double_quote 464 elif delimiter == '`': 465 pattern = self._patterns.template_text 466 elif delimiter == '}': 467 pattern = self._patterns.template_expression 468 resulting_string = pattern.read() 469 next = '' 470 while self._input.hasNext(): 471 next = self._input.next() 472 if next == delimiter or \ 473 (not allow_unescaped_newlines and 474 self.acorn.newline.match(next)): 475 self._input.back() 476 break 477 elif next == '\\' and self._input.hasNext(): 478 current_char = self._input.peek() 479 if current_char == 'x' or current_char == 'u': 480 self.has_char_escapes = True 481 elif current_char == '\r' and self._input.peek(1) 
== '\n': 482 self._input.next() 483 484 next += self._input.next() 485 elif start_sub is not None: 486 if start_sub == '${' and next == '$' and \ 487 self._input.peek() == '{': 488 next += self._input.next() 489 490 if start_sub == next: 491 if delimiter == '`': 492 next += self.parse_string( 493 '}', allow_unescaped_newlines, '`') 494 else: 495 next += self.parse_string( 496 '`', allow_unescaped_newlines, '${') 497 498 if self._input.hasNext(): 499 next += self._input.next() 500 501 next += pattern.read() 502 resulting_string += next 503 return resulting_string 504 505 506 def unescape_string(self, s): 507 # You think that a regex would work for this 508 # return s.replace(/\\x([0-9a-f]{2})/gi, function(match, val) { 509 # return String.fromCharCode(parseInt(val, 16)); 510 # }) 511 # However, dealing with '\xff', '\\xff', '\\\xff' makes this more fun. 512 out = self.acorn.six.u('') 513 escaped = 0 514 515 input_scan = InputScanner(s) 516 matched = None 517 518 while input_scan.hasNext(): 519 # Keep any whitespace, non-slash characters 520 # also keep slash pairs. 
521 matched = input_scan.match(re.compile(r'([\s]|[^\\]|\\\\)+')) 522 523 if matched: 524 out += matched.group(0) 525 526 if input_scan.peek() != '\\': 527 continue 528 529 input_scan.next() 530 if input_scan.peek() == 'x': 531 matched = input_scan.match(re.compile(r'x([0-9A-Fa-f]{2})')) 532 elif input_scan.peek() == 'u': 533 matched = input_scan.match(re.compile(r'u([0-9A-Fa-f]{4})')) 534 else: 535 out += '\\' 536 if input_scan.hasNext(): 537 out += input_scan.next() 538 continue 539 540 # If there's some error decoding, return the original string 541 if not matched: 542 return s 543 544 escaped = int(matched.group(1), 16) 545 546 if escaped > 0x7e and escaped <= 0xff and matched.group( 547 0).startswith('x'): 548 # we bail out on \x7f..\xff, 549 # leaving whole string escaped, 550 # as it's probably completely binary 551 return s 552 elif escaped >= 0x00 and escaped < 0x20: 553 # leave 0x00...0x1f escaped 554 out += '\\' + matched.group(0) 555 continue 556 elif escaped == 0x22 or escaped == 0x27 or escaped == 0x5c: 557 # single-quote, apostrophe, backslash - escape these 558 out += ('\\' + chr(escaped)) 559 else: 560 out += self.acorn.six.unichr(escaped) 561 562 return out 563