1# Copyright 2015 The Chromium Authors. All rights reserved.
2# Use of this source code is governed by a BSD-style license that can be
3# found in the LICENSE file.
4
5from __future__ import print_function
6
7import parser
8import symbol
9import sys
10import token
11import tokenize
12
13from py_utils.refactor import offset_token
14
15
16class Snippet(object):
17  """A node in the Python parse tree.
18
19  The Python grammar is defined at:
20  https://docs.python.org/2/reference/grammar.html
21
22  There are two types of Snippets:
23    TokenSnippets are leaf nodes containing actual text.
24    Symbols are internal nodes representing higher-level groupings, and are
25        defined by the left-hand sides of the BNFs in the above link.
26  """
27  @property
28  def type(self):
29    raise NotImplementedError()
30
31  @property
32  def type_name(self):
33    raise NotImplementedError()
34
35  @property
36  def children(self):
37    """Return a list of this node's children."""
38    raise NotImplementedError()
39
40  @property
41  def tokens(self):
42    """Return a tuple of the tokens this Snippet contains."""
43    raise NotImplementedError()
44
45  def PrintTree(self, indent=0, stream=sys.stdout):
46    """Spew a pretty-printed parse tree. Mostly useful for debugging."""
47    raise NotImplementedError()
48
49  def __str__(self):
50    return offset_token.Untokenize(self.tokens)
51
52  def FindAll(self, snippet_type):
53    if isinstance(snippet_type, int):
54      if self.type == snippet_type:
55        yield self
56    else:
57      if isinstance(self, snippet_type):
58        yield self
59
60    for child in self.children:
61      for snippet in child.FindAll(snippet_type):
62        yield snippet
63
64  def FindChild(self, snippet_type, **kwargs):
65    for child in self.children:
66      if isinstance(snippet_type, int):
67        if child.type != snippet_type:
68          continue
69      else:
70        if not isinstance(child, snippet_type):
71          continue
72
73      for attribute, value in kwargs:
74        if getattr(child, attribute) != value:
75          break
76      else:
77        return child
78    raise ValueError('%s is not in %s. Children are: %s' %
79                     (snippet_type, self, self.children))
80
81  def FindChildren(self, snippet_type):
82    if isinstance(snippet_type, int):
83      for child in self.children:
84        if child.type == snippet_type:
85          yield child
86    else:
87      for child in self.children:
88        if isinstance(child, snippet_type):
89          yield child
90
91
class TokenSnippet(Snippet):
  """A leaf Snippet wrapping a sequence of tokens.

  The token list may begin with any number of comments and non-terminating
  newlines, but its final token is always syntactically meaningful.
  """

  def __init__(self, token_type, tokens):
    # The parser has more context than the tokenizer, so this Snippet's type
    # may be more specific than that of its constituent tokens. E.g. the
    # TokenSnippet type may be token.DOT while the token's type is token.OP.
    self._type = token_type
    self._tokens = tokens
    self._modified = False

  @classmethod
  def Create(cls, token_type, string, offset=(0, 0)):
    """Build a TokenSnippet holding a single freshly-created token."""
    new_token = offset_token.OffsetToken(token_type, string, offset)
    return cls(token_type, [new_token])

  @property
  def type(self):
    return self._type

  @property
  def type_name(self):
    return token.tok_name[self.type]

  @property
  def value(self):
    # The last token is the syntactically meaningful one.
    return self._tokens[-1].string

  @value.setter
  def value(self, value):
    self._tokens[-1].string = value
    self._modified = True

  @property
  def children(self):
    return []

  @property
  def tokens(self):
    return tuple(self._tokens)

  @property
  def modified(self):
    return self._modified

  def PrintTree(self, indent=0, stream=sys.stdout):
    pad = ' ' * indent
    stream.write(pad)
    if not self.tokens:
      print(self.type_name, file=stream)
      return

    head, rest = self.tokens[0], self.tokens[1:]
    print('%-4s' % self.type_name, repr(head.string), file=stream)
    name_width = max(len(self.type_name), 4)
    for tok in rest:
      stream.write(pad)
      print(' ' * name_width, repr(tok.string), file=stream)
152
153
class Symbol(Snippet):
  """An internal Snippet whose children are other Snippets.

  The valid types and type_names come from Python's symbol module."""

  def __init__(self, symbol_type, children):
    self._type = symbol_type
    self._children = children

  @property
  def type(self):
    return self._type

  @property
  def type_name(self):
    return symbol.sym_name[self.type]

  @property
  def children(self):
    return self._children

  @children.setter
  def children(self, value):  # pylint: disable=arguments-differ
    self._children = value

  @property
  def tokens(self):
    # A Symbol's tokens are the concatenation of its children's tokens.
    return tuple(tok for child in self.children for tok in child.tokens)

  @property
  def modified(self):
    for child in self.children:
      if child.modified:
        return True
    return False

  def PrintTree(self, indent=0, stream=sys.stdout):
    stream.write(' ' * indent)

    # Collapse chains of single-child nodes onto one line for readability.
    current = self
    while len(current.children) == 1 and len(current.children[0].children) == 1:
      print(current.type_name, end=' ', file=stream)
      current = current.children[0]

    print(current.type_name, file=stream)
    for child in current.children:
      child.PrintTree(indent + 2, stream)
202
203
def Snippetize(f):
  """Return the syntax tree of the given file.

  Args:
    f: An open file-like object containing Python source code.

  Returns:
    The root Snippet of the token-annotated parse tree.
  """
  f.seek(0)
  # NOTE(review): the parser module implements the legacy Python 2 grammar
  # and was removed in Python 3.10; this code requires an older interpreter.
  syntax_tree = parser.st2list(parser.suite(f.read()))
  # NOTE(review): read() above leaves f at EOF, so offset_token.Tokenize
  # presumably rewinds f itself — confirm against its implementation.
  tokens = offset_token.Tokenize(f)

  snippet = _SnippetizeNode(syntax_tree, tokens)
  # Every token must have been consumed; leftovers mean the parse tree and
  # the token stream disagree about the file's contents.
  assert not tokens
  return snippet
213
214
def _SnippetizeNode(node, tokens):
  """Recursively convert a parser syntax-tree node into a Snippet.

  Args:
    node: A list-form parse tree node as produced by parser.st2list; the
        first element is the grammar type and the rest are child nodes.
    tokens: A deque-like sequence (supports popleft) of tokens from
        offset_token.Tokenize; consumed tokens are popped from the left.

  Returns:
    A Symbol (for internal nodes) or TokenSnippet (for leaves).
  """
  # The parser module gives a syntax tree that discards comments,
  # non-terminating newlines, and whitespace information. Use the tokens given
  # by the tokenize module to annotate the syntax tree with the information
  # needed to exactly reproduce the original source code.
  node_type = node[0]

  if node_type >= token.NT_OFFSET:
    # Symbol: recurse into each child, consuming tokens left to right.
    children = tuple(_SnippetizeNode(child, tokens) for child in node[1:])
    return Symbol(node_type, children)
  else:
    # Token: first attach any leading comments and blank lines to this
    # token so they survive round-tripping back to source text.
    grabbed_tokens = []
    while tokens and (
        tokens[0].type == tokenize.COMMENT or tokens[0].type == tokenize.NL):
      grabbed_tokens.append(tokens.popleft())

    # parser has 2 NEWLINEs right before the end.
    # tokenize has 0 or 1 depending on if the file has one.
    # Create extra nodes without consuming tokens to account for this.
    if node_type == token.NEWLINE:
      for tok in tokens:
        if tok.type == token.ENDMARKER:
          return TokenSnippet(node_type, grabbed_tokens)
        if tok.type != token.DEDENT:
          break

    # The parser may report a more specific type (e.g. token.DOT) where the
    # tokenizer reports token.OP; otherwise the two types must agree.
    assert tokens[0].type == token.OP or node_type == tokens[0].type

    grabbed_tokens.append(tokens.popleft())
    return TokenSnippet(node_type, grabbed_tokens)
247