# Copyright 2015 The Chromium Authors. All rights reserved.
# Use of this source code is governed by a BSD-style license that can be
# found in the LICENSE file.

from __future__ import print_function

import parser
import symbol
import sys
import token
import tokenize

from py_utils.refactor import offset_token


class Snippet(object):
  """A node in the Python parse tree.

  The Python grammar is defined at:
  https://docs.python.org/2/reference/grammar.html

  There are two types of Snippets:
    TokenSnippets are leaf nodes containing actual text.
    Symbols are internal nodes representing higher-level groupings, and are
      defined by the left-hand sides of the BNFs in the above link.
  """
  @property
  def type(self):
    raise NotImplementedError()

  @property
  def type_name(self):
    raise NotImplementedError()

  @property
  def children(self):
    """Return a list of this node's children."""
    raise NotImplementedError()

  @property
  def tokens(self):
    """Return a tuple of the tokens this Snippet contains."""
    raise NotImplementedError()

  def PrintTree(self, indent=0, stream=sys.stdout):
    """Spew a pretty-printed parse tree. Mostly useful for debugging."""
    raise NotImplementedError()

  def __str__(self):
    return offset_token.Untokenize(self.tokens)

  def FindAll(self, snippet_type):
    """Yield this node and every descendant matching the given type.

    Args:
      snippet_type: Either a token/symbol type (int) or a Snippet subclass.
    """
    if isinstance(snippet_type, int):
      if self.type == snippet_type:
        yield self
    else:
      if isinstance(self, snippet_type):
        yield self

    for child in self.children:
      for snippet in child.FindAll(snippet_type):
        yield snippet

  def FindChild(self, snippet_type, **kwargs):
    """Return the first direct child matching the type and attribute values.

    Args:
      snippet_type: Either a token/symbol type (int) or a Snippet subclass.
      **kwargs: Attribute name/value pairs the child must also match.

    Raises:
      ValueError: If no direct child matches.
    """
    for child in self.children:
      if isinstance(snippet_type, int):
        if child.type != snippet_type:
          continue
      else:
        if not isinstance(child, snippet_type):
          continue

      # BUGFIX: iterate the dict's items. Iterating `kwargs` directly yields
      # only the attribute *names*, so the (attribute, value) unpacking would
      # raise ValueError (or silently misbehave for 2-character names).
      for attribute, value in kwargs.items():
        if getattr(child, attribute) != value:
          break
      else:
        return child
    raise ValueError('%s is not in %s. Children are: %s' %
                     (snippet_type, self, self.children))

  def FindChildren(self, snippet_type):
    """Yield all direct children matching the given type."""
    if isinstance(snippet_type, int):
      for child in self.children:
        if child.type == snippet_type:
          yield child
    else:
      for child in self.children:
        if isinstance(child, snippet_type):
          yield child


class TokenSnippet(Snippet):
  """A Snippet containing a list of tokens.

  A list of tokens may start with any number of comments and non-terminating
  newlines, but must end with a syntactically meaningful token.
  """

  def __init__(self, token_type, tokens):
    # For operators and delimiters, the TokenSnippet's type may be more
    # specific than the type of the constituent token. E.g. the TokenSnippet
    # type is token.DOT, but the token type is token.OP. This is because the
    # parser has more context than the tokenizer.
    self._type = token_type
    self._tokens = tokens
    self._modified = False

  @classmethod
  def Create(cls, token_type, string, offset=(0, 0)):
    """Build a TokenSnippet holding a single token with the given text."""
    return cls(token_type,
               [offset_token.OffsetToken(token_type, string, offset)])

  @property
  def type(self):
    return self._type

  @property
  def type_name(self):
    return token.tok_name[self.type]

  @property
  def value(self):
    """The text of the syntactically meaningful (last) token."""
    return self._tokens[-1].string

  @value.setter
  def value(self, value):
    self._tokens[-1].string = value
    self._modified = True

  @property
  def children(self):
    return []

  @property
  def tokens(self):
    return tuple(self._tokens)

  @property
  def modified(self):
    """Whether this Snippet's value has been changed since construction."""
    return self._modified

  def PrintTree(self, indent=0, stream=sys.stdout):
    stream.write(' ' * indent)
    if not self.tokens:
      print(self.type_name, file=stream)
      return

    print('%-4s' % self.type_name, repr(self.tokens[0].string), file=stream)
    for tok in self.tokens[1:]:
      stream.write(' ' * indent)
      print(' ' * max(len(self.type_name), 4), repr(tok.string), file=stream)


class Symbol(Snippet):
  """A Snippet containing sub-Snippets.

  The possible types and type_names are defined in Python's symbol module."""

  def __init__(self, symbol_type, children):
    self._type = symbol_type
    self._children = children

  @property
  def type(self):
    return self._type

  @property
  def type_name(self):
    return symbol.sym_name[self.type]

  @property
  def children(self):
    return self._children

  @children.setter
  def children(self, value):  # pylint: disable=arguments-differ
    self._children = value

  @property
  def tokens(self):
    # Concatenate the tokens of all children, in order.
    tokens = []
    for child in self.children:
      tokens += child.tokens
    return tuple(tokens)

  @property
  def modified(self):
    """Whether any descendant TokenSnippet has been modified."""
    return any(child.modified for child in self.children)

  def PrintTree(self, indent=0, stream=sys.stdout):
    stream.write(' ' * indent)

    # If there's only one child, collapse it onto the same line.
    node = self
    while len(node.children) == 1 and len(node.children[0].children) == 1:
      print(node.type_name, end=' ', file=stream)
      node = node.children[0]

    print(node.type_name, file=stream)
    for child in node.children:
      child.PrintTree(indent + 2, stream)


def Snippetize(f):
  """Return the syntax tree of the given file."""
  f.seek(0)
  syntax_tree = parser.st2list(parser.suite(f.read()))
  tokens = offset_token.Tokenize(f)

  snippet = _SnippetizeNode(syntax_tree, tokens)
  assert not tokens
  return snippet


def _SnippetizeNode(node, tokens):
  # The parser module gives a syntax tree that discards comments,
  # non-terminating newlines, and whitespace information. Use the tokens given
  # by the tokenize module to annotate the syntax tree with the information
  # needed to exactly reproduce the original source code.
  node_type = node[0]

  if node_type >= token.NT_OFFSET:
    # Symbol: recurse into each grammar child.
    children = tuple(_SnippetizeNode(child, tokens) for child in node[1:])
    return Symbol(node_type, children)
  else:
    # Token: attach any leading comments/non-terminating newlines, which the
    # parse tree discards, to the next meaningful token.
    grabbed_tokens = []
    while tokens and (
        tokens[0].type == tokenize.COMMENT or tokens[0].type == tokenize.NL):
      grabbed_tokens.append(tokens.popleft())

    # parser has 2 NEWLINEs right before the end.
    # tokenize has 0 or 1 depending on if the file has one.
    # Create extra nodes without consuming tokens to account for this.
    if node_type == token.NEWLINE:
      for tok in tokens:
        if tok.type == token.ENDMARKER:
          return TokenSnippet(node_type, grabbed_tokens)
        if tok.type != token.DEDENT:
          break

    assert tokens[0].type == token.OP or node_type == tokens[0].type

    grabbed_tokens.append(tokens.popleft())
    return TokenSnippet(node_type, grabbed_tokens)