1# Copyright (c) 2009-2014 LOGILAB S.A. (Paris, FRANCE) <contact@logilab.fr>
2# Copyright (c) 2010 Daniel Harding <dharding@gmail.com>
3# Copyright (c) 2012-2014 Google, Inc.
4# Copyright (c) 2013-2020 Claudiu Popa <pcmanticore@gmail.com>
5# Copyright (c) 2014 Brett Cannon <brett@python.org>
6# Copyright (c) 2014 Arun Persaud <arun@nubati.net>
7# Copyright (c) 2015 Rene Zhang <rz99@cornell.edu>
8# Copyright (c) 2015 Ionel Cristian Maries <contact@ionelmc.ro>
9# Copyright (c) 2016, 2018 Jakub Wilk <jwilk@jwilk.net>
10# Copyright (c) 2016 Peter Dawyndt <Peter.Dawyndt@UGent.be>
11# Copyright (c) 2017 Łukasz Rogalski <rogalski.91@gmail.com>
12# Copyright (c) 2017 Ville Skyttä <ville.skytta@iki.fi>
13# Copyright (c) 2018, 2020 Anthony Sottile <asottile@umich.edu>
14# Copyright (c) 2018-2019 Lucas Cimon <lucas.cimon@gmail.com>
15# Copyright (c) 2018 Alan Chan <achan961117@gmail.com>
16# Copyright (c) 2018 Yury Gribov <tetra2005@gmail.com>
17# Copyright (c) 2018 ssolanki <sushobhitsolanki@gmail.com>
18# Copyright (c) 2018 Nick Drozd <nicholasdrozd@gmail.com>
19# Copyright (c) 2019-2021 Pierre Sassoulas <pierre.sassoulas@gmail.com>
20# Copyright (c) 2019 Wes Turner <westurner@google.com>
21# Copyright (c) 2019 Djailla <bastien.vallet@gmail.com>
22# Copyright (c) 2019 Hugo van Kemenade <hugovk@users.noreply.github.com>
23# Copyright (c) 2020 Matthew Suozzo <msuozzo@google.com>
24# Copyright (c) 2020 hippo91 <guillaume.peillex@gmail.com>
25# Copyright (c) 2020 谭九鼎 <109224573@qq.com>
26# Copyright (c) 2020 Anthony <tanant@users.noreply.github.com>
27# Copyright (c) 2021 Marc Mueller <30130371+cdce8p@users.noreply.github.com>
28# Copyright (c) 2021 Tushar Sadhwani <tushar.sadhwani000@gmail.com>
29# Copyright (c) 2021 Jaehoon Hwang <jaehoonhwang@users.noreply.github.com>
30# Copyright (c) 2021 Daniël van Noord <13665637+DanielNoord@users.noreply.github.com>
31# Copyright (c) 2021 Peter Kolbus <peter.kolbus@garmin.com>
32
33
34# Licensed under the GPL: https://www.gnu.org/licenses/old-licenses/gpl-2.0.html
35# For details: https://github.com/PyCQA/pylint/blob/main/LICENSE
36
37"""Checker for string formatting operations.
38"""
39
40import collections
41import numbers
42import re
43import tokenize
44from typing import Counter, Iterable
45
46import astroid
47from astroid import nodes
48
49from pylint.checkers import BaseChecker, BaseTokenChecker, utils
50from pylint.checkers.utils import check_messages
51from pylint.interfaces import IAstroidChecker, IRawChecker, ITokenChecker
52
_AST_NODE_STR_TYPES = ("__builtin__.unicode", "__builtin__.str", "builtins.str")

# Every prefix that may precede a string or bytes literal, as listed in
# https://docs.python.org/3/reference/lexical_analysis.html#string-and-bytes-literals
_PREFIXES = {
    # str literals: raw, unicode and formatted
    "r", "u", "R", "U", "f", "F",
    # raw formatted str literals, in both prefix orders
    "fr", "Fr", "fR", "FR", "rf", "rF", "Rf", "RF",
    # bytes literals
    "b", "B",
    # raw bytes literals, in both prefix orders
    "br", "Br", "bR", "BR", "rb", "rB", "Rb", "RB",
}

# Each pattern is an optional prefix (captured as group 1) followed by the
# opening quote(s) of the literal.
SINGLE_QUOTED_REGEX = re.compile("(" + "|".join(_PREFIXES) + ")?'''")
DOUBLE_QUOTED_REGEX = re.compile("(" + "|".join(_PREFIXES) + ')?"""')
QUOTE_DELIMITER_REGEX = re.compile(
    "(" + "|".join(_PREFIXES) + ")?(\"|')", re.DOTALL
)
85
# Message table for the string format checker.  Each entry maps a message id
# to a tuple of (message template, symbolic name, description).
MSGS = {  # pylint: disable=consider-using-namedtuple-or-dataclass
    "E1300": (
        "Unsupported format character %r (%#02x) at index %d",
        "bad-format-character",
        "Used when an unsupported format character is used in a format string.",
    ),
    "E1301": (
        "Format string ends in middle of conversion specifier",
        "truncated-format-string",
        "Used when a format string terminates before the end of a "
        "conversion specifier.",
    ),
    "E1302": (
        "Mixing named and unnamed conversion specifiers in format string",
        "mixed-format-string",
        "Used when a format string contains both named (e.g. '%(foo)d') "
        "and unnamed (e.g. '%d') conversion specifiers.  This is also "
        "used when a named conversion specifier contains * for the "
        "minimum field width and/or precision.",
    ),
    "E1303": (
        "Expected mapping for format string, not %s",
        "format-needs-mapping",
        "Used when a format string that uses named conversion specifiers "
        "is used with an argument that is not a mapping.",
    ),
    "W1300": (
        "Format string dictionary key should be a string, not %s",
        "bad-format-string-key",
        "Used when a format string that uses named conversion specifiers "
        "is used with a dictionary whose keys are not all strings.",
    ),
    "W1301": (
        "Unused key %r in format string dictionary",
        "unused-format-string-key",
        "Used when a format string that uses named conversion specifiers "
        "is used with a dictionary that contains keys not required by the "
        "format string.",
    ),
    "E1304": (
        "Missing key %r in format string dictionary",
        "missing-format-string-key",
        "Used when a format string that uses named conversion specifiers "
        "is used with a dictionary that doesn't contain all the keys "
        "required by the format string.",
    ),
    "E1305": (
        "Too many arguments for format string",
        "too-many-format-args",
        "Used when a format string that uses unnamed conversion "
        "specifiers is given too many arguments.",
    ),
    "E1306": (
        "Not enough arguments for format string",
        "too-few-format-args",
        "Used when a format string that uses unnamed conversion "
        "specifiers is given too few arguments",
    ),
    "E1307": (
        "Argument %r does not match format type %r",
        "bad-string-format-type",
        "Used when a type required by format string "
        "is not suitable for actual argument type",
    ),
    "E1310": (
        "Suspicious argument in %s.%s call",
        "bad-str-strip-call",
        # The description used to stop mid-sentence on a dangling ", ".
        "The argument to a str.{l,r,}strip call contains a duplicate character.",
    ),
    "W1302": (
        "Invalid format string",
        "bad-format-string",
        "Used when a PEP 3101 format string is invalid.",
    ),
    "W1303": (
        "Missing keyword argument %r for format string",
        "missing-format-argument-key",
        "Used when a PEP 3101 format string that uses named fields "
        "doesn't receive one or more required keywords.",
    ),
    "W1304": (
        "Unused format argument %r",
        "unused-format-string-argument",
        "Used when a PEP 3101 format string that uses named "
        "fields is used with an argument that "
        "is not required by the format string.",
    ),
    "W1305": (
        "Format string contains both automatic field numbering "
        "and manual field specification",
        "format-combined-specification",
        "Used when a PEP 3101 format string contains both automatic "
        "field numbering (e.g. '{}') and manual field "
        "specification (e.g. '{0}').",
    ),
    "W1306": (
        "Missing format attribute %r in format specifier %r",
        "missing-format-attribute",
        "Used when a PEP 3101 format string uses an "
        "attribute specifier ({0.length}), but the argument "
        "passed for formatting doesn't have that attribute.",
    ),
    "W1307": (
        "Using invalid lookup key %r in format specifier %r",
        "invalid-format-index",
        "Used when a PEP 3101 format string uses a lookup specifier "
        "({a[1]}), but the argument passed for formatting "
        "doesn't contain or doesn't have that key as an attribute.",
    ),
    "W1308": (
        "Duplicate string formatting argument %r, consider passing as named argument",
        "duplicate-string-formatting-argument",
        "Used when we detect that a string formatting is "
        "repeating an argument instead of using named string arguments",
    ),
    "W1309": (
        "Using an f-string that does not have any interpolated variables",
        "f-string-without-interpolation",
        "Used when we detect an f-string that does not use any interpolation variables, "
        "in which case it can be either a normal string or a bug in the code.",
    ),
    "W1310": (
        "Using formatting for a string that does not have any interpolated variables",
        "format-string-without-interpolation",
        "Used when we detect a string that does not have any interpolation variables, "
        "in which case it can be either a normal string without formatting or a bug in the code.",
    ),
}
214
# AST node types tested with ``isinstance`` against the right-hand side of a
# ``%`` formatting operation in ``StringFormatChecker.visit_binop``: when the
# format string uses named specifiers such an operand is reported as not being
# a mapping, and when it uses unnamed specifiers it counts as one argument.
OTHER_NODES = (
    nodes.Const,
    nodes.List,
    nodes.Lambda,
    nodes.FunctionDef,
    nodes.ListComp,
    nodes.SetComp,
    nodes.GeneratorExp,
)
224
225
def get_access_path(key, parts):
    """Render a format field and its access specifiers as one path string.

    ``parts`` is an iterable of ``(is_attribute, specifier)`` pairs.  An
    attribute access is rendered as ``.name`` and an item access as
    ``[repr(index)]``; the pieces are appended to ``str(key)``, giving
    something like ``a.b.c[0][1]``.
    """
    rendered = [
        f".{specifier}" if is_attribute else f"[{specifier!r}]"
        for is_attribute, specifier in parts
    ]
    return f"{key!s}{''.join(rendered)}"
237
238
def arg_matches_format_type(arg_type, format_type):
    """Tell whether an inferred argument is acceptable for a ``%`` conversion.

    Args:
        arg_type: The inferred value of the argument.  Only
            ``astroid.Instance`` objects are actually type-checked; anything
            else is accepted.
        format_type: The conversion character taken from the format string
            (for example ``"d"`` or ``"s"``).

    Returns:
        ``True`` when the argument may be used with the conversion (or when
        nothing can be said about it), ``False`` when the types are known
        not to match.
    """
    if format_type in "sra":
        # %s, %r and %a apply str(), repr() and ascii() to the argument, so
        # every object is acceptable.
        return True
    if isinstance(arg_type, astroid.Instance):
        arg_type = arg_type.pytype()
        if arg_type == "builtins.str":
            return format_type == "c"
        if arg_type == "builtins.float":
            return format_type in "deEfFgGn%"
        if arg_type == "builtins.int":
            # Integers allow all types
            return True
        return False
    return True
254
255
class StringFormatChecker(BaseChecker):
    """Checks string formatting operations to ensure that the format string
    is valid and the arguments match the format string.

    The visitor methods cover ``%`` formatting (``visit_binop``), f-strings
    (``visit_joinedstr``) and ``str`` method calls such as ``format`` and
    ``strip`` (``visit_call``).
    """

    __implements__ = (IAstroidChecker,)
    name = "string"
    # Messages this checker may emit: the module-level MSGS table.
    msgs = MSGS
264
265    # pylint: disable=too-many-branches
266    @check_messages(
267        "bad-format-character",
268        "truncated-format-string",
269        "mixed-format-string",
270        "bad-format-string-key",
271        "missing-format-string-key",
272        "unused-format-string-key",
273        "bad-string-format-type",
274        "format-needs-mapping",
275        "too-many-format-args",
276        "too-few-format-args",
277        "bad-string-format-type",
278        "format-string-without-interpolation",
279    )
280    def visit_binop(self, node: nodes.BinOp) -> None:
281        if node.op != "%":
282            return
283        left = node.left
284        args = node.right
285
286        if not (isinstance(left, nodes.Const) and isinstance(left.value, str)):
287            return
288        format_string = left.value
289        try:
290            (
291                required_keys,
292                required_num_args,
293                required_key_types,
294                required_arg_types,
295            ) = utils.parse_format_string(format_string)
296        except utils.UnsupportedFormatCharacter as exc:
297            formatted = format_string[exc.index]
298            self.add_message(
299                "bad-format-character",
300                node=node,
301                args=(formatted, ord(formatted), exc.index),
302            )
303            return
304        except utils.IncompleteFormatString:
305            self.add_message("truncated-format-string", node=node)
306            return
307        if not required_keys and not required_num_args:
308            self.add_message("format-string-without-interpolation", node=node)
309            return
310        if required_keys and required_num_args:
311            # The format string uses both named and unnamed format
312            # specifiers.
313            self.add_message("mixed-format-string", node=node)
314        elif required_keys:
315            # The format string uses only named format specifiers.
316            # Check that the RHS of the % operator is a mapping object
317            # that contains precisely the set of keys required by the
318            # format string.
319            if isinstance(args, nodes.Dict):
320                keys = set()
321                unknown_keys = False
322                for k, _ in args.items:
323                    if isinstance(k, nodes.Const):
324                        key = k.value
325                        if isinstance(key, str):
326                            keys.add(key)
327                        else:
328                            self.add_message(
329                                "bad-format-string-key", node=node, args=key
330                            )
331                    else:
332                        # One of the keys was something other than a
333                        # constant.  Since we can't tell what it is,
334                        # suppress checks for missing keys in the
335                        # dictionary.
336                        unknown_keys = True
337                if not unknown_keys:
338                    for key in required_keys:
339                        if key not in keys:
340                            self.add_message(
341                                "missing-format-string-key", node=node, args=key
342                            )
343                for key in keys:
344                    if key not in required_keys:
345                        self.add_message(
346                            "unused-format-string-key", node=node, args=key
347                        )
348                for key, arg in args.items:
349                    if not isinstance(key, nodes.Const):
350                        continue
351                    format_type = required_key_types.get(key.value, None)
352                    arg_type = utils.safe_infer(arg)
353                    if (
354                        format_type is not None
355                        and arg_type
356                        and arg_type != astroid.Uninferable
357                        and not arg_matches_format_type(arg_type, format_type)
358                    ):
359                        self.add_message(
360                            "bad-string-format-type",
361                            node=node,
362                            args=(arg_type.pytype(), format_type),
363                        )
364            elif isinstance(args, (OTHER_NODES, nodes.Tuple)):
365                type_name = type(args).__name__
366                self.add_message("format-needs-mapping", node=node, args=type_name)
367            # else:
368            # The RHS of the format specifier is a name or
369            # expression.  It may be a mapping object, so
370            # there's nothing we can check.
371        else:
372            # The format string uses only unnamed format specifiers.
373            # Check that the number of arguments passed to the RHS of
374            # the % operator matches the number required by the format
375            # string.
376            args_elts = []
377            if isinstance(args, nodes.Tuple):
378                rhs_tuple = utils.safe_infer(args)
379                num_args = None
380                if isinstance(rhs_tuple, nodes.BaseContainer):
381                    args_elts = rhs_tuple.elts
382                    num_args = len(args_elts)
383            elif isinstance(args, (OTHER_NODES, (nodes.Dict, nodes.DictComp))):
384                args_elts = [args]
385                num_args = 1
386            else:
387                # The RHS of the format specifier is a name or
388                # expression.  It could be a tuple of unknown size, so
389                # there's nothing we can check.
390                num_args = None
391            if num_args is not None:
392                if num_args > required_num_args:
393                    self.add_message("too-many-format-args", node=node)
394                elif num_args < required_num_args:
395                    self.add_message("too-few-format-args", node=node)
396                for arg, format_type in zip(args_elts, required_arg_types):
397                    if not arg:
398                        continue
399                    arg_type = utils.safe_infer(arg)
400                    if (
401                        arg_type
402                        and arg_type != astroid.Uninferable
403                        and not arg_matches_format_type(arg_type, format_type)
404                    ):
405                        self.add_message(
406                            "bad-string-format-type",
407                            node=node,
408                            args=(arg_type.pytype(), format_type),
409                        )
410
411    @check_messages("f-string-without-interpolation")
412    def visit_joinedstr(self, node: nodes.JoinedStr) -> None:
413        self._check_interpolation(node)
414
415    def _check_interpolation(self, node: nodes.JoinedStr) -> None:
416        if isinstance(node.parent, nodes.FormattedValue):
417            return
418        for value in node.values:
419            if isinstance(value, nodes.FormattedValue):
420                return
421        self.add_message("f-string-without-interpolation", node=node)
422
423    @check_messages(*MSGS)
424    def visit_call(self, node: nodes.Call) -> None:
425        func = utils.safe_infer(node.func)
426        if (
427            isinstance(func, astroid.BoundMethod)
428            and isinstance(func.bound, astroid.Instance)
429            and func.bound.name in {"str", "unicode", "bytes"}
430        ):
431            if func.name in {"strip", "lstrip", "rstrip"} and node.args:
432                arg = utils.safe_infer(node.args[0])
433                if not isinstance(arg, nodes.Const) or not isinstance(arg.value, str):
434                    return
435                if len(arg.value) != len(set(arg.value)):
436                    self.add_message(
437                        "bad-str-strip-call",
438                        node=node,
439                        args=(func.bound.name, func.name),
440                    )
441            elif func.name == "format":
442                self._check_new_format(node, func)
443
444    def _detect_vacuous_formatting(self, node, positional_arguments):
445        counter = collections.Counter(
446            arg.name for arg in positional_arguments if isinstance(arg, nodes.Name)
447        )
448        for name, count in counter.items():
449            if count == 1:
450                continue
451            self.add_message(
452                "duplicate-string-formatting-argument", node=node, args=(name,)
453            )
454
455    def _check_new_format(self, node, func):
456        """Check the new string formatting."""
457        # Skip format nodes which don't have an explicit string on the
458        # left side of the format operation.
459        # We do this because our inference engine can't properly handle
460        # redefinitions of the original string.
461        # Note that there may not be any left side at all, if the format method
462        # has been assigned to another variable. See issue 351. For example:
463        #
464        #    fmt = 'some string {}'.format
465        #    fmt('arg')
466        if isinstance(node.func, nodes.Attribute) and not isinstance(
467            node.func.expr, nodes.Const
468        ):
469            return
470        if node.starargs or node.kwargs:
471            return
472        try:
473            strnode = next(func.bound.infer())
474        except astroid.InferenceError:
475            return
476        if not (isinstance(strnode, nodes.Const) and isinstance(strnode.value, str)):
477            return
478        try:
479            call_site = astroid.arguments.CallSite.from_call(node)
480        except astroid.InferenceError:
481            return
482
483        try:
484            fields, num_args, manual_pos = utils.parse_format_method_string(
485                strnode.value
486            )
487        except utils.IncompleteFormatString:
488            self.add_message("bad-format-string", node=node)
489            return
490
491        positional_arguments = call_site.positional_arguments
492        named_arguments = call_site.keyword_arguments
493        named_fields = {field[0] for field in fields if isinstance(field[0], str)}
494        if num_args and manual_pos:
495            self.add_message("format-combined-specification", node=node)
496            return
497
498        check_args = False
499        # Consider "{[0]} {[1]}" as num_args.
500        num_args += sum(1 for field in named_fields if field == "")
501        if named_fields:
502            for field in named_fields:
503                if field and field not in named_arguments:
504                    self.add_message(
505                        "missing-format-argument-key", node=node, args=(field,)
506                    )
507            for field in named_arguments:
508                if field not in named_fields:
509                    self.add_message(
510                        "unused-format-string-argument", node=node, args=(field,)
511                    )
512            # num_args can be 0 if manual_pos is not.
513            num_args = num_args or manual_pos
514            if positional_arguments or num_args:
515                empty = any(field == "" for field in named_fields)
516                if named_arguments or empty:
517                    # Verify the required number of positional arguments
518                    # only if the .format got at least one keyword argument.
519                    # This means that the format strings accepts both
520                    # positional and named fields and we should warn
521                    # when one of the them is missing or is extra.
522                    check_args = True
523        else:
524            check_args = True
525        if check_args:
526            # num_args can be 0 if manual_pos is not.
527            num_args = num_args or manual_pos
528            if not num_args:
529                self.add_message("format-string-without-interpolation", node=node)
530                return
531            if len(positional_arguments) > num_args:
532                self.add_message("too-many-format-args", node=node)
533            elif len(positional_arguments) < num_args:
534                self.add_message("too-few-format-args", node=node)
535
536        self._detect_vacuous_formatting(node, positional_arguments)
537        self._check_new_format_specifiers(node, fields, named_arguments)
538
    def _check_new_format_specifiers(self, node, fields, named):
        """
        Check attribute and index access in the format
        string ("{0.a}" and "{0[a]}").

        ``fields`` is iterated as ``(key, specifiers)`` pairs, where
        ``specifiers`` yields ``(is_attribute, specifier)`` pairs.  ``named``
        maps keyword names to the argument passed for them.  For each field
        the matching argument is inferred and the chain of accesses is
        followed step by step; the first step that cannot be resolved is
        reported as ``missing-format-attribute`` or ``invalid-format-index``.
        Fields whose argument cannot be obtained or inferred are skipped.
        """
        for key, specifiers in fields:
            # Obtain the argument. If it can't be obtained
            # or inferred, skip this check.
            if key == "":
                # {[0]} will have an unnamed argument, defaulting
                # to 0. It will not be present in `named`, so use the value
                # 0 for it.
                key = 0
            if isinstance(key, numbers.Number):
                # Numeric key: look the argument up by position in the call.
                try:
                    argname = utils.get_argument_from_call(node, key)
                except utils.NoSuchArgumentError:
                    continue
            else:
                # Named key: look the argument up among the keyword arguments.
                if key not in named:
                    continue
                argname = named[key]
            if argname in (astroid.Uninferable, None):
                continue
            try:
                argument = utils.safe_infer(argname)
            except astroid.InferenceError:
                continue
            if not specifiers or not argument:
                # No need to check this key if it doesn't
                # use attribute / item access
                continue
            if argument.parent and isinstance(argument.parent, nodes.Arguments):
                # Ignore any object coming from an argument,
                # because we can't infer its value properly.
                continue
            # `previous` is the object reached so far along the access chain;
            # `parsed` collects the steps already taken, for the message path.
            previous = argument
            parsed = []
            for is_attribute, specifier in specifiers:
                if previous is astroid.Uninferable:
                    break
                parsed.append((is_attribute, specifier))
                if is_attribute:
                    try:
                        previous = previous.getattr(specifier)[0]
                    except astroid.NotFoundError:
                        if (
                            hasattr(previous, "has_dynamic_getattr")
                            and previous.has_dynamic_getattr()
                        ):
                            # Don't warn if the object has a custom __getattr__
                            break
                        path = get_access_path(key, parsed)
                        self.add_message(
                            "missing-format-attribute",
                            args=(specifier, path),
                            node=node,
                        )
                        break
                else:
                    warn_error = False
                    if hasattr(previous, "getitem"):
                        try:
                            previous = previous.getitem(nodes.Const(specifier))
                        except (
                            astroid.AstroidIndexError,
                            astroid.AstroidTypeError,
                            astroid.AttributeInferenceError,
                        ):
                            # The lookup itself failed: report it below.
                            warn_error = True
                        except astroid.InferenceError:
                            break
                        if previous is astroid.Uninferable:
                            break
                    else:
                        try:
                            # Lookup __getitem__ in the current node,
                            # but skip further checks, because we can't
                            # retrieve the looked object
                            previous.getattr("__getitem__")
                            break
                        except astroid.NotFoundError:
                            warn_error = True
                    if warn_error:
                        path = get_access_path(key, parsed)
                        self.add_message(
                            "invalid-format-index", args=(specifier, path), node=node
                        )
                        break

                # Infer the object just reached before taking the next step.
                try:
                    previous = next(previous.infer())
                except astroid.InferenceError:
                    # can't check further if we can't infer it
                    break
634
635
class StringConstantChecker(BaseTokenChecker):
    """Check string literals"""

    __implements__ = (IAstroidChecker, ITokenChecker, IRawChecker)
    name = "string"
    # Message id -> (message template, symbolic name, description[, extras]).
    msgs = {
        "W1401": (
            "Anomalous backslash in string: '%s'. "
            "String constant might be missing an r prefix.",
            "anomalous-backslash-in-string",
            "Used when a backslash is in a literal string but not as an escape.",
        ),
        "W1402": (
            "Anomalous Unicode escape in byte string: '%s'. "
            "String constant might be missing an r or u prefix.",
            "anomalous-unicode-escape-in-string",
            "Used when an escape like \\u is encountered in a byte "
            "string where it has no effect.",
        ),
        "W1404": (
            "Implicit string concatenation found in %s",
            "implicit-str-concat",
            "String literals are implicitly concatenated in a "
            "literal iterable definition : "
            "maybe a comma is missing ?",
            {"old_names": [("W1403", "implicit-str-concat-in-sequence")]},
        ),
        "W1405": (
            "Quote delimiter %s is inconsistent with the rest of the file",
            "inconsistent-quotes",
            "Quote delimiters are not used consistently throughout a module "
            "(with allowances made for avoiding unnecessary escaping).",
        ),
        "W1406": (
            "The u prefix for strings is no longer necessary in Python >=3.0",
            "redundant-u-string-prefix",
            "Used when we detect a string with a u prefix. These prefixes were necessary "
            "in Python 2 to indicate a string was Unicode, but since Python 3.0 strings "
            "are Unicode by default.",
        ),
    }
    # Two yes/no options, both off by default; their help texts describe
    # what each one controls.
    options = (
        (
            "check-str-concat-over-line-jumps",
            {
                "default": False,
                "type": "yn",
                "metavar": "<y or n>",
                "help": "This flag controls whether the "
                "implicit-str-concat should generate a warning "
                "on implicit string concatenation in sequences defined over "
                "several lines.",
            },
        ),
        (
            "check-quote-consistency",
            {
                "default": False,
                "type": "yn",
                "metavar": "<y or n>",
                "help": "This flag controls whether inconsistent-quotes generates a "
                "warning when the character used as a quote delimiter is used "
                "inconsistently within a module.",
            },
        ),
    )

    # Characters that have a special meaning after a backslash in either
    # Unicode or byte strings.
    ESCAPE_CHARACTERS = "abfnrtvx\n\r\t\\'\"01234567"

    # Characters that have a special meaning after a backslash but only in
    # Unicode strings.
    UNICODE_ESCAPE_CHARACTERS = "uUN"
710
    def __init__(self, *args, **kwargs):
        """Forward all arguments to the base class and start with no recorded
        string tokens.
        """
        super().__init__(*args, **kwargs)
        self.string_tokens = {}  # token position -> (token value, next token)
714
    def process_module(self, node: nodes.Module) -> None:
        """Record on ``self._unicode_literals`` whether ``unicode_literals``
        is among the module's future imports.
        """
        self._unicode_literals = "unicode_literals" in node.future_imports
717
718    def process_tokens(self, tokens):
719        encoding = "ascii"
720        for i, (tok_type, token, start, _, line) in enumerate(tokens):
721            if tok_type == tokenize.ENCODING:
722                # this is always the first token processed
723                encoding = token
724            elif tok_type == tokenize.STRING:
725                # 'token' is the whole un-parsed token; we can look at the start
726                # of it to see whether it's a raw or unicode string etc.
727                self.process_string_token(token, start[0], start[1])
728                # We figure the next token, ignoring comments & newlines:
729                j = i + 1
730                while j < len(tokens) and tokens[j].type in (
731                    tokenize.NEWLINE,
732                    tokenize.NL,
733                    tokenize.COMMENT,
734                ):
735                    j += 1
736                next_token = tokens[j] if j < len(tokens) else None
737                if encoding != "ascii":
738                    # We convert `tokenize` character count into a byte count,
739                    # to match with astroid `.col_offset`
740                    start = (start[0], len(line[: start[1]].encode(encoding)))
741                self.string_tokens[start] = (str_eval(token), next_token)
742
743        if self.config.check_quote_consistency:
744            self.check_for_consistent_string_delimiters(tokens)
745
746    @check_messages("implicit-str-concat")
747    def visit_list(self, node: nodes.List) -> None:
748        self.check_for_concatenated_strings(node.elts, "list")
749
750    @check_messages("implicit-str-concat")
751    def visit_set(self, node: nodes.Set) -> None:
752        self.check_for_concatenated_strings(node.elts, "set")
753
754    @check_messages("implicit-str-concat")
755    def visit_tuple(self, node: nodes.Tuple) -> None:
756        self.check_for_concatenated_strings(node.elts, "tuple")
757
758    def visit_assign(self, node: nodes.Assign) -> None:
759        if isinstance(node.value, nodes.Const) and isinstance(node.value.value, str):
760            self.check_for_concatenated_strings([node.value], "assignment")
761
762    def check_for_consistent_string_delimiters(
763        self, tokens: Iterable[tokenize.TokenInfo]
764    ) -> None:
765        """Adds a message for each string using inconsistent quote delimiters.
766
767        Quote delimiters are used inconsistently if " and ' are mixed in a module's
768        shortstrings without having done so to avoid escaping an internal quote
769        character.
770
771        Args:
772          tokens: The tokens to be checked against for consistent usage.
773        """
774        string_delimiters: Counter[str] = collections.Counter()
775
776        # First, figure out which quote character predominates in the module
777        for tok_type, token, _, _, _ in tokens:
778            if tok_type == tokenize.STRING and _is_quote_delimiter_chosen_freely(token):
779                string_delimiters[_get_quote_delimiter(token)] += 1
780
781        if len(string_delimiters) > 1:
782            # Ties are broken arbitrarily
783            most_common_delimiter = string_delimiters.most_common(1)[0][0]
784            for tok_type, token, start, _, _ in tokens:
785                if tok_type != tokenize.STRING:
786                    continue
787                quote_delimiter = _get_quote_delimiter(token)
788                if (
789                    _is_quote_delimiter_chosen_freely(token)
790                    and quote_delimiter != most_common_delimiter
791                ):
792                    self.add_message(
793                        "inconsistent-quotes", line=start[0], args=(quote_delimiter,)
794                    )
795
796    def check_for_concatenated_strings(self, elements, iterable_type):
797        for elt in elements:
798            if not (
799                isinstance(elt, nodes.Const) and elt.pytype() in _AST_NODE_STR_TYPES
800            ):
801                continue
802            if elt.col_offset < 0:
803                # This can happen in case of escaped newlines
804                continue
805            if (elt.lineno, elt.col_offset) not in self.string_tokens:
806                # This may happen with Latin1 encoding
807                # cf. https://github.com/PyCQA/pylint/issues/2610
808                continue
809            matching_token, next_token = self.string_tokens[
810                (elt.lineno, elt.col_offset)
811            ]
812            # We detect string concatenation: the AST Const is the
813            # combination of 2 string tokens
814            if matching_token != elt.value and next_token is not None:
815                if next_token.type == tokenize.STRING and (
816                    next_token.start[0] == elt.lineno
817                    or self.config.check_str_concat_over_line_jumps
818                ):
819                    self.add_message(
820                        "implicit-str-concat", line=elt.lineno, args=(iterable_type,)
821                    )
822
823    def process_string_token(self, token, start_row, start_col):
824        quote_char = None
825        index = None
826        for index, char in enumerate(token):
827            if char in "'\"":
828                quote_char = char
829                break
830        if quote_char is None:
831            return
832
833        prefix = token[:index].lower()  # markers like u, b, r.
834        after_prefix = token[index:]
835        # Chop off quotes
836        quote_length = (
837            3 if after_prefix[:3] == after_prefix[-3:] == 3 * quote_char else 1
838        )
839        string_body = after_prefix[quote_length:-quote_length]
840        # No special checks on raw strings at the moment.
841        if "r" not in prefix:
842            self.process_non_raw_string_token(
843                prefix,
844                string_body,
845                start_row,
846                start_col + len(prefix) + quote_length,
847            )
848
849    def process_non_raw_string_token(
850        self, prefix, string_body, start_row, string_start_col
851    ):
852        """check for bad escapes in a non-raw string.
853
854        prefix: lowercase string of eg 'ur' string prefix markers.
855        string_body: the un-parsed body of the string, not including the quote
856        marks.
857        start_row: integer line number in the source.
858        string_start_col: integer col number of the string start in the source.
859        """
860        # Walk through the string; if we see a backslash then escape the next
861        # character, and skip over it.  If we see a non-escaped character,
862        # alert, and continue.
863        #
864        # Accept a backslash when it escapes a backslash, or a quote, or
865        # end-of-line, or one of the letters that introduce a special escape
866        # sequence <https://docs.python.org/reference/lexical_analysis.html>
867        #
868        index = 0
869        while True:
870            index = string_body.find("\\", index)
871            if index == -1:
872                break
873            # There must be a next character; having a backslash at the end
874            # of the string would be a SyntaxError.
875            next_char = string_body[index + 1]
876            match = string_body[index : index + 2]
877            # The column offset will vary depending on whether the string token
878            # is broken across lines. Calculate relative to the nearest line
879            # break or relative to the start of the token's line.
880            last_newline = string_body.rfind("\n", 0, index)
881            if last_newline == -1:
882                line = start_row
883                col_offset = index + string_start_col
884            else:
885                line = start_row + string_body.count("\n", 0, index)
886                col_offset = index - last_newline - 1
887            if next_char in self.UNICODE_ESCAPE_CHARACTERS:
888                if "u" in prefix:
889                    pass
890                elif "b" not in prefix:
891                    pass  # unicode by default
892                else:
893                    self.add_message(
894                        "anomalous-unicode-escape-in-string",
895                        line=line,
896                        args=(match,),
897                        col_offset=col_offset,
898                    )
899            elif next_char not in self.ESCAPE_CHARACTERS:
900                self.add_message(
901                    "anomalous-backslash-in-string",
902                    line=line,
903                    args=(match,),
904                    col_offset=col_offset,
905                )
906            # Whether it was a valid escape or not, backslash followed by
907            # another character can always be consumed whole: the second
908            # character can never be the start of a new backslash escape.
909            index += 2
910
911    @check_messages("redundant-u-string-prefix")
912    def visit_const(self, node: nodes.Const) -> None:
913        if node.pytype() == "builtins.str" and not isinstance(
914            node.parent, nodes.JoinedStr
915        ):
916            self._detect_u_string_prefix(node)
917
918    def _detect_u_string_prefix(self, node: nodes.Const):
919        """Check whether strings include a 'u' prefix like u'String'"""
920        if node.kind == "u":
921            self.add_message(
922                "redundant-u-string-prefix",
923                line=node.lineno,
924                col_offset=node.col_offset,
925            )
926
927
def register(linter):
    """Register the checkers of this module (required for auto registration)."""
    for checker_class in (StringFormatChecker, StringConstantChecker):
        linter.register_checker(checker_class(linter))
932
933
def str_eval(token):
    """
    Mostly replicate `ast.literal_eval(token)` manually to avoid any performance hit.
    This supports f-strings, contrary to `ast.literal_eval`.
    We have to support all string literal notations:
    https://docs.python.org/3/reference/lexical_analysis.html#string-and-bytes-literals

    The literal's prefix (any case; one of r, u, f, b, fr, rf, br, rb) and its
    single or triple quotes are removed. The text in between is returned as
    written in the source: escape sequences are not decoded. An empty token
    yields an empty string.
    """
    # Slicing rather than indexing keeps an empty token from raising IndexError.
    prefix = token[:2].lower()
    if prefix in {"fr", "rf", "br", "rb"}:
        token = token[2:]
    elif prefix[:1] in {"r", "u", "f", "b"}:
        token = token[1:]
    if token[:3] in {'"""', "'''"}:
        return token[3:-3]
    return token[1:-1]
948
949
def _is_long_string(string_token: str) -> bool:
    """Tell whether this string token is a "longstring", i.e. triple-quoted.

    Long strings are triple-quoted as defined in
    https://docs.python.org/3/reference/lexical_analysis.html#string-and-bytes-literals

    Only the beginning of the token is matched against the two longstring
    patterns. The token is expected to be a string literal already, so no
    attempt is made to find the closing quotes.

    Args:
        string_token: The string token to be parsed.

    Returns:
        True when the token matches one of the longstring regexes, False
        otherwise.
    """
    if SINGLE_QUOTED_REGEX.match(string_token) is not None:
        return True
    return DOUBLE_QUOTED_REGEX.match(string_token) is not None
971
972
def _get_quote_delimiter(string_token: str) -> str:
    """Extract the quote character that delimits this token string.

    Very little validation of the token is performed beyond matching it
    against the quote delimiter pattern.

    Args:
        string_token: The token to be parsed.

    Returns:
        The second group captured by the quote delimiter regex, i.e. the
        first quote delimiter character of the passed string.

    Raises:
      ValueError: The token does not match the quote delimiter regex.
    """
    delimiter_match = QUOTE_DELIMITER_REGEX.match(string_token)
    if delimiter_match is None:
        raise ValueError(f"string token {string_token} is not a well-formed string")
    return delimiter_match.group(2)
993
994
def _is_quote_delimiter_chosen_freely(string_token: str) -> bool:
    """Tell whether the other quote character would have worked just as well.

    Args:
        string_token: The quoted string whose delimiters are to be checked.

    Returns:
        True when the token is not a long string and its body does not contain
        the quote character that was not used as delimiter, so that either
        delimiter could have been picked without backslash-escaping. Long
        strings always give False, under the assumption that their quote
        characters are set by policy.
    """
    quote_delimiter = _get_quote_delimiter(string_token)
    if not quote_delimiter:
        return False
    if _is_long_string(string_token):
        return False
    unchosen_delimiter = '"' if quote_delimiter == "'" else "'"
    return unchosen_delimiter not in str_eval(string_token)
1014