1#!/usr/bin/env python
2#
3# Natural Language Toolkit: Deprecated Function & Class Finder
4#
5# Copyright (C) 2001-2019 NLTK Project
6# Author: Edward Loper <edloper@gmail.com>
7# URL: <http://nltk.org/>
8# For license information, see LICENSE.TXT
9
10from __future__ import print_function
11
12"""
13This command-line tool takes a list of python files or directories,
14and searches them for calls to deprecated NLTK functions, or uses of
15deprecated NLTK classes.  For each use of a deprecated object it
16finds, it will print out a warning containing the offending line, as
17well as its line number and containing file name.  If the terminal has
18color support (and if epydoc is installed), then the offending
19identifier will be highlighted in red.
20"""
21
22######################################################################
23# Imports
24######################################################################
25
26import os
27import re
28import sys
29import tokenize
30import textwrap
31import nltk.corpus
32from doctest import DocTestParser, register_optionflag
33from cStringIO import StringIO
34from nltk import defaultdict
35
36######################################################################
37# Regexps
38######################################################################
39
#: Matches a single string literal (optional ``u``/``r`` prefix; single,
#: double or triple quotes) together with any surrounding whitespace.
#: A little over-simplified, but it'll do.
#: Every fragment is a raw string so that ``\s``/``\S`` reach the regex
#: engine untouched instead of being treated as (invalid) string escapes.
STRING_PAT = (
    r'\s*[ur]{0,2}(?:'
    r'"""[\s\S]*?"""|'
    r'"[^"\n]+?"|'
    r"'''[\s\S]*?'''|"
    r"'[^'\n]+?'"
    r")\s*"
)
STRING_RE = re.compile(STRING_PAT)

#: One string literal followed by any number of further literals, each
#: optionally joined with ``+``.
STRINGS_PAT = '{}(?:[+]?{})*'.format(STRING_PAT, STRING_PAT)
STRINGS_RE = re.compile(STRINGS_PAT)

# Define a regexp to search for deprecated definitions.
#   group 1: the string literal(s) passed to @deprecated(...)
#   group 2: the name of the decorated ``def``
#   group 3: the name of a class whose base list mentions ``Deprecated``
# Either groups 1-2 or group 3 are set, depending on which branch matched.
DEPRECATED_DEF_PAT = (
    r'^\s*@deprecated\s*\(\s*({})\s*\)\s*\n+'.format(STRINGS_PAT)
    + r'\s*def\s*(\w+).*'
    + r'|'
    + r'^\s*class\s+(\w+)\s*\(.*Deprecated.*\):\s*'
)
DEPRECATED_DEF_RE = re.compile(DEPRECATED_DEF_PAT, re.MULTILINE)
62
# Matches "<name>.read(" where <name> is any attribute name listed by
# dir(nltk.corpus).  The template is a raw string so that "\." and "\("
# are regex escapes rather than invalid string escapes.
CORPUS_READ_METHOD_RE = re.compile(
    r'({})\.read\('.format('|'.join(re.escape(n) for n in dir(nltk.corpus)))
)
66
# Matches a line that opens a class statement; group 1 is the class name.
# Written as a raw string so the backslash escapes belong to the regex.
CLASS_DEF_RE = re.compile(r'^\s*class\s+(\w+)\s*[:\(]')
68
69######################################################################
70# Globals
71######################################################################
# Yes, it's bad programming practice, but this is a little hack
# script. :)  These tables start out empty and get filled in by
# find_deprecated_defs.  Each one maps a deprecated name to a set of
# (message, prefix, suffix) tuples; main() prints every entry as
# prefix + name + suffix followed by the message.

# Entries recorded as plain functions: prefix is '' and suffix is '()'.
deprecated_funcs = defaultdict(set)
# Entries recorded as classes: prefix and suffix are both ''.
deprecated_classes = defaultdict(set)
# Entries recorded as methods: prefix is whatever find_class returns (a
# class name plus '.', or '?.' when no class line is found); suffix is '()'.
deprecated_methods = defaultdict(set)
78
# Colour support is optional: use epydoc's terminal controller when it
# can be imported, otherwise fall back to a stub that emits no escapes.
try:
    from epydoc.cli import TerminalController
except ImportError:

    class TerminalController:
        """Stand-in used when epydoc is not importable."""

        def __getattr__(self, name):
            # Any attribute that is looked up (RED, BOLD, NORMAL, ...)
            # resolves to the empty string, so output is left unstyled.
            return ''


# Shared controller used by all of the printing code in this script.
term = TerminalController()
89
90######################################################################
91# Code
92######################################################################
93
# Interpreters older than Python 2.5 (i.e. py24) get a 'SKIP' option flag
# registered here so that the +SKIP directive is simply ignored.
if sys.version_info < (2, 5):
    register_optionflag('SKIP')
97
98
def strip_quotes(s):
    """
    Return the text inside a single string literal.

    Surrounding whitespace, an optional run of ``u``/``r`` prefix
    characters and exactly one pair of delimiters (triple or single,
    either quote character) are removed, and the result is stripped of
    whitespace again.  Quote characters that belong to the content are
    kept.  Input that does not look like a quoted literal is returned
    with only the surrounding whitespace removed.

    :param s: source text of a string literal, e.g. ``u"some text"``.
    :return: the literal's content as a string.
    """
    s = s.strip()
    # Skip over any u/r prefix characters.
    prefix_len = 0
    while prefix_len < len(s) and s[prefix_len] in 'ur':
        prefix_len += 1
    literal = s[prefix_len:]
    # Try triple-quote delimiters first so '"""x"""' is not mistaken for
    # a single-quoted literal whose content happens to contain quotes.
    for quote in ('"""', "'''", '"', "'"):
        if (
            len(literal) >= 2 * len(quote)
            and literal.startswith(quote)
            and literal.endswith(quote)
        ):
            return literal[len(quote) : -len(quote)].strip()
    return s
107
108
def find_class(s, index):
    """
    Return the name of the nearest class statement that precedes
    position ``index`` in the source text ``s``, with a trailing dot.

    The text before ``index`` is scanned line by line, last line first,
    and the first line matching ``CLASS_DEF_RE`` wins; indentation is
    not taken into account.  ``'?.'`` is returned when no line matches.
    """
    preceding = s[:index].split('\n')
    for text in reversed(preceding):
        hit = CLASS_DEF_RE.match(text)
        if hit:
            return hit.group(1) + '.'
    return '?.'
117
118
def find_deprecated_defs(pkg_dir):
    """
    Scan all Python files under the given directory for functions and
    methods marked with the @deprecated decorator, and for classes with
    an immediate Deprecated base class.

    Nothing is returned.  Every definition found is added to one of the
    module-level tables ``deprecated_funcs``, ``deprecated_methods`` or
    ``deprecated_classes``, keyed by name, as a
    ``(message, prefix, suffix)`` tuple.

    :param pkg_dir: directory that is walked recursively.
    """
    # Walk through the directory, finding python files.
    for root, dirs, files in os.walk(pkg_dir):
        for filename in files:
            if not filename.endswith('.py'):
                continue
            # Read the whole file, making sure the handle gets closed.
            with open(os.path.join(root, filename)) as infile:
                s = infile.read()
            # Search the file for any deprecated definitions.
            for m in DEPRECATED_DEF_RE.finditer(s):
                if m.group(2):
                    # "@deprecated(...)" followed by a "def".
                    name = m.group(2)
                    msg = ' '.join(
                        strip_quotes(part) for part in STRING_RE.findall(m.group(1))
                    )
                    msg = ' '.join(msg.split())
                    # The pattern's leading "\s*" also swallows blank lines
                    # in front of the decorator, so the first matched
                    # character may be a newline.  Decide between method
                    # and function from the indentation of the decorator's
                    # own line instead.
                    matched = m.group()
                    leading = matched[: matched.index('@')]
                    indent = leading.rpartition('\n')[2]
                    if indent.startswith((' ', '\t')):
                        cls = find_class(s, m.start())
                        deprecated_methods[name].add((msg, cls, '()'))
                    else:
                        deprecated_funcs[name].add((msg, '', '()'))
                else:
                    # A class with Deprecated among its bases; a string
                    # literal right after the class line (the docstring)
                    # is used as the message.
                    name = m.group(3)
                    m2 = STRING_RE.match(s, m.end())
                    if m2:
                        msg = strip_quotes(m2.group())
                    else:
                        msg = ''
                    msg = ' '.join(msg.split())
                    deprecated_classes[name].add((msg, '', ''))
152
153
def print_deprecated_uses(paths):
    """
    Report uses of deprecated names in the given files and directories.

    Directories are searched recursively.  ``.py`` files are tokenized
    directly; for ``.doctest`` and ``.txt`` files each doctest example
    is tokenized on its own.  Other files are ignored.

    :param paths: iterable of file and directory names.
    :return: the set of deprecated names for which a use was reported.
    """
    dep_names = set()
    dep_files = set()
    for path in sorted(paths):
        if os.path.isdir(path):
            dep_names.update(
                print_deprecated_uses([os.path.join(path, f) for f in os.listdir(path)])
            )
        elif path.endswith('.py'):
            # Keep the file open only while it is being tokenized.
            with open(path) as infile:
                print_deprecated_uses_in(
                    infile.readline, path, dep_files, dep_names, 0
                )
        elif path.endswith(('.doctest', '.txt')):
            with open(path) as infile:
                text = infile.read()
            for example in DocTestParser().get_examples(text):
                ex = StringIO(example.source)
                try:
                    print_deprecated_uses_in(
                        ex.readline, path, dep_files, dep_names, example.lineno
                    )
                except tokenize.TokenError:
                    # Best effort: report the broken example and carry on.
                    print(
                        term.RED + 'Caught TokenError -- '
                        'malformatted doctest?' + term.NORMAL
                    )
    return dep_names
177
178
def print_deprecated_uses_in(readline, path, dep_files, dep_names, lineno_offset):
    """
    Tokenize one source stream and print every line that appears to use
    a deprecated class, function or method.

    :param readline: callable returning successive source lines, as
        taken by ``tokenize.generate_tokens``.
    :param path: file name shown in the per-file header.
    :param dep_files: set of paths whose header has already been
        printed; ``path`` is added on the first reported use.
    :param dep_names: set to which every reported name is added.
    :param lineno_offset: value added to each reported line number.

    Errors raised by the tokenizer are not caught here.
    """
    tokiter = tokenize.generate_tokens(readline)
    context = ['']
    for (typ, tok, start, end, line) in tokiter:
        # Remember the most recent lines (at most ten) -- they might
        # contain the @deprecated decorator.  The identity test adds a
        # line once even though each of its tokens reports it; this
        # presumes tokens of one line share a single string object.
        if line is not context[-1]:
            context.append(line)
            if len(context) > 10:
                del context[0]
        esctok = re.escape(tok)
        # Ignore all tokens except deprecated names.  A function must be
        # followed by "("; a method must be called as ".name(" on
        # something other than ``self`` (negative lookbehind).
        if not (
            tok in deprecated_classes
            or (
                tok in deprecated_funcs and re.search(r'\b{}\s*\('.format(esctok), line)
            )
            or (
                tok in deprecated_methods
                and re.search(r'(?<!\bself)[.]\s*{}\s*\('.format(esctok), line)
            )
        ):
            continue
        # Hack: only complain about read if it's used after a corpus.
        if tok == 'read' and not CORPUS_READ_METHOD_RE.search(line):
            continue
        # Ignore deprecated definitions:
        if DEPRECATED_DEF_RE.search(''.join(context)):
            continue
        # Print a header for the first use in a file:
        if path not in dep_files:
            print('\n' + term.BOLD + path + term.NORMAL)
            print('  {}linenum{}'.format(term.YELLOW, term.NORMAL))
            dep_files.add(path)
        # Mark the offending token: red if available, else bold, else
        # plain-text << >> markers.
        dep_names.add(tok)
        if term.RED:
            sub = term.RED + tok + term.NORMAL
        elif term.BOLD:
            sub = term.BOLD + tok + term.NORMAL
        else:
            sub = '<<' + tok + '>>'
        line = re.sub(r'\b{}\b'.format(esctok), sub, line)
        # Print the offending line.
        print(
            '  {}[{:5d}]{} {}'.format(
                term.YELLOW, start[0] + lineno_offset, term.NORMAL, line.rstrip()
            )
        )
228
229
def main():
    """
    Command-line entry point.

    Collects the deprecated definitions from the installed nltk package,
    reports their uses in the paths given on the command line (the
    current directory when none are given), and finally prints the
    replacement advice recorded for every name that was found.
    """
    paths = sys.argv[1:] or ['.']

    print('Importing nltk...')
    try:
        import nltk
    except ImportError:
        print('Unable to import nltk -- check your PYTHONPATH.')
        sys.exit(-1)

    print('Finding definitions of deprecated functions & classes in nltk...')
    find_deprecated_defs(nltk.__path__[0])

    print('Looking for possible uses of deprecated funcs & classes...')
    dep_names = print_deprecated_uses(paths)

    if not dep_names:
        print('No deprecated funcs or classes found!')
    else:
        print("\n" + term.BOLD + "What you should use instead:" + term.NORMAL)
        for name in sorted(dep_names):
            msgs = (
                deprecated_funcs[name]
                .union(deprecated_classes[name])
                .union(deprecated_methods[name])
            )
            # Sets iterate in arbitrary order; sort the entries so the
            # report is the same from one run to the next.
            for msg, prefix, suffix in sorted(msgs):
                print(
                    textwrap.fill(
                        term.RED + prefix + name + suffix + term.NORMAL + ': ' + msg,
                        width=75,
                        initial_indent=' ' * 2,
                        subsequent_indent=' ' * 6,
                    )
                )


if __name__ == '__main__':
    main()
269