1# Copyright (c) 2009-2014 LOGILAB S.A. (Paris, FRANCE) <contact@logilab.fr> 2# Copyright (c) 2010 Daniel Harding <dharding@gmail.com> 3# Copyright (c) 2012-2014 Google, Inc. 4# Copyright (c) 2013-2020 Claudiu Popa <pcmanticore@gmail.com> 5# Copyright (c) 2014 Brett Cannon <brett@python.org> 6# Copyright (c) 2014 Arun Persaud <arun@nubati.net> 7# Copyright (c) 2015 Rene Zhang <rz99@cornell.edu> 8# Copyright (c) 2015 Ionel Cristian Maries <contact@ionelmc.ro> 9# Copyright (c) 2016, 2018 Jakub Wilk <jwilk@jwilk.net> 10# Copyright (c) 2016 Peter Dawyndt <Peter.Dawyndt@UGent.be> 11# Copyright (c) 2017 Łukasz Rogalski <rogalski.91@gmail.com> 12# Copyright (c) 2017 Ville Skyttä <ville.skytta@iki.fi> 13# Copyright (c) 2018, 2020 Anthony Sottile <asottile@umich.edu> 14# Copyright (c) 2018-2019 Lucas Cimon <lucas.cimon@gmail.com> 15# Copyright (c) 2018 Alan Chan <achan961117@gmail.com> 16# Copyright (c) 2018 Yury Gribov <tetra2005@gmail.com> 17# Copyright (c) 2018 ssolanki <sushobhitsolanki@gmail.com> 18# Copyright (c) 2018 Nick Drozd <nicholasdrozd@gmail.com> 19# Copyright (c) 2019-2021 Pierre Sassoulas <pierre.sassoulas@gmail.com> 20# Copyright (c) 2019 Wes Turner <westurner@google.com> 21# Copyright (c) 2019 Djailla <bastien.vallet@gmail.com> 22# Copyright (c) 2019 Hugo van Kemenade <hugovk@users.noreply.github.com> 23# Copyright (c) 2020 Matthew Suozzo <msuozzo@google.com> 24# Copyright (c) 2020 hippo91 <guillaume.peillex@gmail.com> 25# Copyright (c) 2020 谭九鼎 <109224573@qq.com> 26# Copyright (c) 2020 Anthony <tanant@users.noreply.github.com> 27# Copyright (c) 2021 Marc Mueller <30130371+cdce8p@users.noreply.github.com> 28# Copyright (c) 2021 Tushar Sadhwani <tushar.sadhwani000@gmail.com> 29# Copyright (c) 2021 Jaehoon Hwang <jaehoonhwang@users.noreply.github.com> 30# Copyright (c) 2021 Daniël van Noord <13665637+DanielNoord@users.noreply.github.com> 31# Copyright (c) 2021 Peter Kolbus <peter.kolbus@garmin.com> 32 33 34# Licensed under the GPL: https://www.gnu.org/licenses/old-licenses/gpl-2.0.html 35# For details: https://github.com/PyCQA/pylint/blob/main/LICENSE 36 37"""Checker for string formatting operations. 38""" 39 40import collections 41import numbers 42import re 43import tokenize 44from typing import Counter, Iterable 45 46import astroid 47from astroid import nodes 48 49from pylint.checkers import BaseChecker, BaseTokenChecker, utils 50from pylint.checkers.utils import check_messages 51from pylint.interfaces import IAstroidChecker, IRawChecker, ITokenChecker 52 53_AST_NODE_STR_TYPES = ("__builtin__.unicode", "__builtin__.str", "builtins.str") 54# Prefixes for both strings and bytes literals per 55# https://docs.python.org/3/reference/lexical_analysis.html#string-and-bytes-literals 56_PREFIXES = { 57 "r", 58 "u", 59 "R", 60 "U", 61 "f", 62 "F", 63 "fr", 64 "Fr", 65 "fR", 66 "FR", 67 "rf", 68 "rF", 69 "Rf", 70 "RF", 71 "b", 72 "B", 73 "br", 74 "Br", 75 "bR", 76 "BR", 77 "rb", 78 "rB", 79 "Rb", 80 "RB", 81} 82SINGLE_QUOTED_REGEX = re.compile(f"({'|'.join(_PREFIXES)})?'''") 83DOUBLE_QUOTED_REGEX = re.compile(f"({'|'.join(_PREFIXES)})?\"\"\"") 84QUOTE_DELIMITER_REGEX = re.compile(f"({'|'.join(_PREFIXES)})?(\"|')", re.DOTALL) 85 86MSGS = { # pylint: disable=consider-using-namedtuple-or-dataclass 87 "E1300": ( 88 "Unsupported format character %r (%#02x) at index %d", 89 "bad-format-character", 90 "Used when an unsupported format character is used in a format string.", 91 ), 92 "E1301": ( 93 "Format string ends in middle of conversion specifier", 94 "truncated-format-string", 95 "Used when a format string terminates before the end of a " 96 "conversion specifier.", 97 ), 98 "E1302": ( 99 "Mixing named and unnamed conversion specifiers in format string", 100 "mixed-format-string", 101 "Used when a format string contains both named (e.g. '%(foo)d') " 102 "and unnamed (e.g. '%d') conversion specifiers. This is also " 103 "used when a named conversion specifier contains * for the " 104 "minimum field width and/or precision.", 105 ), 106 "E1303": ( 107 "Expected mapping for format string, not %s", 108 "format-needs-mapping", 109 "Used when a format string that uses named conversion specifiers " 110 "is used with an argument that is not a mapping.", 111 ), 112 "W1300": ( 113 "Format string dictionary key should be a string, not %s", 114 "bad-format-string-key", 115 "Used when a format string that uses named conversion specifiers " 116 "is used with a dictionary whose keys are not all strings.", 117 ), 118 "W1301": ( 119 "Unused key %r in format string dictionary", 120 "unused-format-string-key", 121 "Used when a format string that uses named conversion specifiers " 122 "is used with a dictionary that contains keys not required by the " 123 "format string.", 124 ), 125 "E1304": ( 126 "Missing key %r in format string dictionary", 127 "missing-format-string-key", 128 "Used when a format string that uses named conversion specifiers " 129 "is used with a dictionary that doesn't contain all the keys " 130 "required by the format string.", 131 ), 132 "E1305": ( 133 "Too many arguments for format string", 134 "too-many-format-args", 135 "Used when a format string that uses unnamed conversion " 136 "specifiers is given too many arguments.", 137 ), 138 "E1306": ( 139 "Not enough arguments for format string", 140 "too-few-format-args", 141 "Used when a format string that uses unnamed conversion " 142 "specifiers is given too few arguments", 143 ), 144 "E1307": ( 145 "Argument %r does not match format type %r", 146 "bad-string-format-type", 147 "Used when a type required by format string " 148 "is not suitable for actual argument type", 149 ), 150 "E1310": ( 151 "Suspicious argument in %s.%s call", 152 "bad-str-strip-call", 153 "The argument to a str.{l,r,}strip call contains a duplicate character, ", 154 ), 155 "W1302": ( 156 "Invalid format string", 157 "bad-format-string", 158 "Used when a PEP 3101 format string is invalid.", 159 ), 160 "W1303": ( 161 "Missing keyword argument %r for format string", 162 "missing-format-argument-key", 163 "Used when a PEP 3101 format string that uses named fields " 164 "doesn't receive one or more required keywords.", 165 ), 166 "W1304": ( 167 "Unused format argument %r", 168 "unused-format-string-argument", 169 "Used when a PEP 3101 format string that uses named " 170 "fields is used with an argument that " 171 "is not required by the format string.", 172 ), 173 "W1305": ( 174 "Format string contains both automatic field numbering " 175 "and manual field specification", 176 "format-combined-specification", 177 "Used when a PEP 3101 format string contains both automatic " 178 "field numbering (e.g. '{}') and manual field " 179 "specification (e.g. '{0}').", 180 ), 181 "W1306": ( 182 "Missing format attribute %r in format specifier %r", 183 "missing-format-attribute", 184 "Used when a PEP 3101 format string uses an " 185 "attribute specifier ({0.length}), but the argument " 186 "passed for formatting doesn't have that attribute.", 187 ), 188 "W1307": ( 189 "Using invalid lookup key %r in format specifier %r", 190 "invalid-format-index", 191 "Used when a PEP 3101 format string uses a lookup specifier " 192 "({a[1]}), but the argument passed for formatting " 193 "doesn't contain or doesn't have that key as an attribute.", 194 ), 195 "W1308": ( 196 "Duplicate string formatting argument %r, consider passing as named argument", 197 "duplicate-string-formatting-argument", 198 "Used when we detect that a string formatting is " 199 "repeating an argument instead of using named string arguments", 200 ), 201 "W1309": ( 202 "Using an f-string that does not have any interpolated variables", 203 "f-string-without-interpolation", 204 "Used when we detect an f-string that does not use any interpolation variables, " 205 "in which case it can be either a normal string or a bug in the code.", 206 ), 207 "W1310": ( 208 "Using formatting for a string that does not have any interpolated variables", 209 "format-string-without-interpolation", 210 "Used when we detect a string that does not have any interpolation variables, " 211 "in which case it can be either a normal string without formatting or a bug in the code.", 212 ), 213} 214 215OTHER_NODES = ( 216 nodes.Const, 217 nodes.List, 218 nodes.Lambda, 219 nodes.FunctionDef, 220 nodes.ListComp, 221 nodes.SetComp, 222 nodes.GeneratorExp, 223) 224 225 226def get_access_path(key, parts): 227 """Given a list of format specifiers, returns 228 the final access path (e.g. a.b.c[0][1]). 229 """ 230 path = [] 231 for is_attribute, specifier in parts: 232 if is_attribute: 233 path.append(f".{specifier}") 234 else: 235 path.append(f"[{specifier!r}]") 236 return str(key) + "".join(path) 237 238 239def arg_matches_format_type(arg_type, format_type): 240 if format_type in "sr": 241 # All types can be printed with %s and %r 242 return True 243 if isinstance(arg_type, astroid.Instance): 244 arg_type = arg_type.pytype() 245 if arg_type == "builtins.str": 246 return format_type == "c" 247 if arg_type == "builtins.float": 248 return format_type in "deEfFgGn%" 249 if arg_type == "builtins.int": 250 # Integers allow all types 251 return True 252 return False 253 return True 254 255 256class StringFormatChecker(BaseChecker): 257 """Checks string formatting operations to ensure that the format string 258 is valid and the arguments match the format string. 259 """ 260 261 __implements__ = (IAstroidChecker,) 262 name = "string" 263 msgs = MSGS 264 265 # pylint: disable=too-many-branches 266 @check_messages( 267 "bad-format-character", 268 "truncated-format-string", 269 "mixed-format-string", 270 "bad-format-string-key", 271 "missing-format-string-key", 272 "unused-format-string-key", 273 "bad-string-format-type", 274 "format-needs-mapping", 275 "too-many-format-args", 276 "too-few-format-args", 277 "bad-string-format-type", 278 "format-string-without-interpolation", 279 ) 280 def visit_binop(self, node: nodes.BinOp) -> None: 281 if node.op != "%": 282 return 283 left = node.left 284 args = node.right 285 286 if not (isinstance(left, nodes.Const) and isinstance(left.value, str)): 287 return 288 format_string = left.value 289 try: 290 ( 291 required_keys, 292 required_num_args, 293 required_key_types, 294 required_arg_types, 295 ) = utils.parse_format_string(format_string) 296 except utils.UnsupportedFormatCharacter as exc: 297 formatted = format_string[exc.index] 298 self.add_message( 299 "bad-format-character", 300 node=node, 301 args=(formatted, ord(formatted), exc.index), 302 ) 303 return 304 except utils.IncompleteFormatString: 305 self.add_message("truncated-format-string", node=node) 306 return 307 if not required_keys and not required_num_args: 308 self.add_message("format-string-without-interpolation", node=node) 309 return 310 if required_keys and required_num_args: 311 # The format string uses both named and unnamed format 312 # specifiers. 313 self.add_message("mixed-format-string", node=node) 314 elif required_keys: 315 # The format string uses only named format specifiers. 316 # Check that the RHS of the % operator is a mapping object 317 # that contains precisely the set of keys required by the 318 # format string. 319 if isinstance(args, nodes.Dict): 320 keys = set() 321 unknown_keys = False 322 for k, _ in args.items: 323 if isinstance(k, nodes.Const): 324 key = k.value 325 if isinstance(key, str): 326 keys.add(key) 327 else: 328 self.add_message( 329 "bad-format-string-key", node=node, args=key 330 ) 331 else: 332 # One of the keys was something other than a 333 # constant. Since we can't tell what it is, 334 # suppress checks for missing keys in the 335 # dictionary. 336 unknown_keys = True 337 if not unknown_keys: 338 for key in required_keys: 339 if key not in keys: 340 self.add_message( 341 "missing-format-string-key", node=node, args=key 342 ) 343 for key in keys: 344 if key not in required_keys: 345 self.add_message( 346 "unused-format-string-key", node=node, args=key 347 ) 348 for key, arg in args.items: 349 if not isinstance(key, nodes.Const): 350 continue 351 format_type = required_key_types.get(key.value, None) 352 arg_type = utils.safe_infer(arg) 353 if ( 354 format_type is not None 355 and arg_type 356 and arg_type != astroid.Uninferable 357 and not arg_matches_format_type(arg_type, format_type) 358 ): 359 self.add_message( 360 "bad-string-format-type", 361 node=node, 362 args=(arg_type.pytype(), format_type), 363 ) 364 elif isinstance(args, (OTHER_NODES, nodes.Tuple)): 365 type_name = type(args).__name__ 366 self.add_message("format-needs-mapping", node=node, args=type_name) 367 # else: 368 # The RHS of the format specifier is a name or 369 # expression. It may be a mapping object, so 370 # there's nothing we can check. 371 else: 372 # The format string uses only unnamed format specifiers. 373 # Check that the number of arguments passed to the RHS of 374 # the % operator matches the number required by the format 375 # string. 376 args_elts = [] 377 if isinstance(args, nodes.Tuple): 378 rhs_tuple = utils.safe_infer(args) 379 num_args = None 380 if isinstance(rhs_tuple, nodes.BaseContainer): 381 args_elts = rhs_tuple.elts 382 num_args = len(args_elts) 383 elif isinstance(args, (OTHER_NODES, (nodes.Dict, nodes.DictComp))): 384 args_elts = [args] 385 num_args = 1 386 else: 387 # The RHS of the format specifier is a name or 388 # expression. It could be a tuple of unknown size, so 389 # there's nothing we can check. 390 num_args = None 391 if num_args is not None: 392 if num_args > required_num_args: 393 self.add_message("too-many-format-args", node=node) 394 elif num_args < required_num_args: 395 self.add_message("too-few-format-args", node=node) 396 for arg, format_type in zip(args_elts, required_arg_types): 397 if not arg: 398 continue 399 arg_type = utils.safe_infer(arg) 400 if ( 401 arg_type 402 and arg_type != astroid.Uninferable 403 and not arg_matches_format_type(arg_type, format_type) 404 ): 405 self.add_message( 406 "bad-string-format-type", 407 node=node, 408 args=(arg_type.pytype(), format_type), 409 ) 410 411 @check_messages("f-string-without-interpolation") 412 def visit_joinedstr(self, node: nodes.JoinedStr) -> None: 413 self._check_interpolation(node) 414 415 def _check_interpolation(self, node: nodes.JoinedStr) -> None: 416 if isinstance(node.parent, nodes.FormattedValue): 417 return 418 for value in node.values: 419 if isinstance(value, nodes.FormattedValue): 420 return 421 self.add_message("f-string-without-interpolation", node=node) 422 423 @check_messages(*MSGS) 424 def visit_call(self, node: nodes.Call) -> None: 425 func = utils.safe_infer(node.func) 426 if ( 427 isinstance(func, astroid.BoundMethod) 428 and isinstance(func.bound, astroid.Instance) 429 and func.bound.name in {"str", "unicode", "bytes"} 430 ): 431 if func.name in {"strip", "lstrip", "rstrip"} and node.args: 432 arg = utils.safe_infer(node.args[0]) 433 if not isinstance(arg, nodes.Const) or not isinstance(arg.value, str): 434 return 435 if len(arg.value) != len(set(arg.value)): 436 self.add_message( 437 "bad-str-strip-call", 438 node=node, 439 args=(func.bound.name, func.name), 440 ) 441 elif func.name == "format": 442 self._check_new_format(node, func) 443 444 def _detect_vacuous_formatting(self, node, positional_arguments): 445 counter = collections.Counter( 446 arg.name for arg in positional_arguments if isinstance(arg, nodes.Name) 447 ) 448 for name, count in counter.items(): 449 if count == 1: 450 continue 451 self.add_message( 452 "duplicate-string-formatting-argument", node=node, args=(name,) 453 ) 454 455 def _check_new_format(self, node, func): 456 """Check the new string formatting.""" 457 # Skip format nodes which don't have an explicit string on the 458 # left side of the format operation. 459 # We do this because our inference engine can't properly handle 460 # redefinitions of the original string. 461 # Note that there may not be any left side at all, if the format method 462 # has been assigned to another variable. See issue 351. For example: 463 # 464 # fmt = 'some string {}'.format 465 # fmt('arg') 466 if isinstance(node.func, nodes.Attribute) and not isinstance( 467 node.func.expr, nodes.Const 468 ): 469 return 470 if node.starargs or node.kwargs: 471 return 472 try: 473 strnode = next(func.bound.infer()) 474 except astroid.InferenceError: 475 return 476 if not (isinstance(strnode, nodes.Const) and isinstance(strnode.value, str)): 477 return 478 try: 479 call_site = astroid.arguments.CallSite.from_call(node) 480 except astroid.InferenceError: 481 return 482 483 try: 484 fields, num_args, manual_pos = utils.parse_format_method_string( 485 strnode.value 486 ) 487 except utils.IncompleteFormatString: 488 self.add_message("bad-format-string", node=node) 489 return 490 491 positional_arguments = call_site.positional_arguments 492 named_arguments = call_site.keyword_arguments 493 named_fields = {field[0] for field in fields if isinstance(field[0], str)} 494 if num_args and manual_pos: 495 self.add_message("format-combined-specification", node=node) 496 return 497 498 check_args = False 499 # Consider "{[0]} {[1]}" as num_args. 500 num_args += sum(1 for field in named_fields if field == "") 501 if named_fields: 502 for field in named_fields: 503 if field and field not in named_arguments: 504 self.add_message( 505 "missing-format-argument-key", node=node, args=(field,) 506 ) 507 for field in named_arguments: 508 if field not in named_fields: 509 self.add_message( 510 "unused-format-string-argument", node=node, args=(field,) 511 ) 512 # num_args can be 0 if manual_pos is not. 513 num_args = num_args or manual_pos 514 if positional_arguments or num_args: 515 empty = any(field == "" for field in named_fields) 516 if named_arguments or empty: 517 # Verify the required number of positional arguments 518 # only if the .format got at least one keyword argument. 519 # This means that the format strings accepts both 520 # positional and named fields and we should warn 521 # when one of the them is missing or is extra. 522 check_args = True 523 else: 524 check_args = True 525 if check_args: 526 # num_args can be 0 if manual_pos is not. 527 num_args = num_args or manual_pos 528 if not num_args: 529 self.add_message("format-string-without-interpolation", node=node) 530 return 531 if len(positional_arguments) > num_args: 532 self.add_message("too-many-format-args", node=node) 533 elif len(positional_arguments) < num_args: 534 self.add_message("too-few-format-args", node=node) 535 536 self._detect_vacuous_formatting(node, positional_arguments) 537 self._check_new_format_specifiers(node, fields, named_arguments) 538 539 def _check_new_format_specifiers(self, node, fields, named): 540 """ 541 Check attribute and index access in the format 542 string ("{0.a}" and "{0[a]}"). 543 """ 544 for key, specifiers in fields: 545 # Obtain the argument. If it can't be obtained 546 # or inferred, skip this check. 547 if key == "": 548 # {[0]} will have an unnamed argument, defaulting 549 # to 0. It will not be present in `named`, so use the value 550 # 0 for it. 551 key = 0 552 if isinstance(key, numbers.Number): 553 try: 554 argname = utils.get_argument_from_call(node, key) 555 except utils.NoSuchArgumentError: 556 continue 557 else: 558 if key not in named: 559 continue 560 argname = named[key] 561 if argname in (astroid.Uninferable, None): 562 continue 563 try: 564 argument = utils.safe_infer(argname) 565 except astroid.InferenceError: 566 continue 567 if not specifiers or not argument: 568 # No need to check this key if it doesn't 569 # use attribute / item access 570 continue 571 if argument.parent and isinstance(argument.parent, nodes.Arguments): 572 # Ignore any object coming from an argument, 573 # because we can't infer its value properly. 574 continue 575 previous = argument 576 parsed = [] 577 for is_attribute, specifier in specifiers: 578 if previous is astroid.Uninferable: 579 break 580 parsed.append((is_attribute, specifier)) 581 if is_attribute: 582 try: 583 previous = previous.getattr(specifier)[0] 584 except astroid.NotFoundError: 585 if ( 586 hasattr(previous, "has_dynamic_getattr") 587 and previous.has_dynamic_getattr() 588 ): 589 # Don't warn if the object has a custom __getattr__ 590 break 591 path = get_access_path(key, parsed) 592 self.add_message( 593 "missing-format-attribute", 594 args=(specifier, path), 595 node=node, 596 ) 597 break 598 else: 599 warn_error = False 600 if hasattr(previous, "getitem"): 601 try: 602 previous = previous.getitem(nodes.Const(specifier)) 603 except ( 604 astroid.AstroidIndexError, 605 astroid.AstroidTypeError, 606 astroid.AttributeInferenceError, 607 ): 608 warn_error = True 609 except astroid.InferenceError: 610 break 611 if previous is astroid.Uninferable: 612 break 613 else: 614 try: 615 # Lookup __getitem__ in the current node, 616 # but skip further checks, because we can't 617 # retrieve the looked object 618 previous.getattr("__getitem__") 619 break 620 except astroid.NotFoundError: 621 warn_error = True 622 if warn_error: 623 path = get_access_path(key, parsed) 624 self.add_message( 625 "invalid-format-index", args=(specifier, path), node=node 626 ) 627 break 628 629 try: 630 previous = next(previous.infer()) 631 except astroid.InferenceError: 632 # can't check further if we can't infer it 633 break 634 635 636class StringConstantChecker(BaseTokenChecker): 637 """Check string literals""" 638 639 __implements__ = (IAstroidChecker, ITokenChecker, IRawChecker) 640 name = "string" 641 msgs = { 642 "W1401": ( 643 "Anomalous backslash in string: '%s'. " 644 "String constant might be missing an r prefix.", 645 "anomalous-backslash-in-string", 646 "Used when a backslash is in a literal string but not as an escape.", 647 ), 648 "W1402": ( 649 "Anomalous Unicode escape in byte string: '%s'. " 650 "String constant might be missing an r or u prefix.", 651 "anomalous-unicode-escape-in-string", 652 "Used when an escape like \\u is encountered in a byte " 653 "string where it has no effect.", 654 ), 655 "W1404": ( 656 "Implicit string concatenation found in %s", 657 "implicit-str-concat", 658 "String literals are implicitly concatenated in a " 659 "literal iterable definition : " 660 "maybe a comma is missing ?", 661 {"old_names": [("W1403", "implicit-str-concat-in-sequence")]}, 662 ), 663 "W1405": ( 664 "Quote delimiter %s is inconsistent with the rest of the file", 665 "inconsistent-quotes", 666 "Quote delimiters are not used consistently throughout a module " 667 "(with allowances made for avoiding unnecessary escaping).", 668 ), 669 "W1406": ( 670 "The u prefix for strings is no longer necessary in Python >=3.0", 671 "redundant-u-string-prefix", 672 "Used when we detect a string with a u prefix. These prefixes were necessary " 673 "in Python 2 to indicate a string was Unicode, but since Python 3.0 strings " 674 "are Unicode by default.", 675 ), 676 } 677 options = ( 678 ( 679 "check-str-concat-over-line-jumps", 680 { 681 "default": False, 682 "type": "yn", 683 "metavar": "<y or n>", 684 "help": "This flag controls whether the " 685 "implicit-str-concat should generate a warning " 686 "on implicit string concatenation in sequences defined over " 687 "several lines.", 688 }, 689 ), 690 ( 691 "check-quote-consistency", 692 { 693 "default": False, 694 "type": "yn", 695 "metavar": "<y or n>", 696 "help": "This flag controls whether inconsistent-quotes generates a " 697 "warning when the character used as a quote delimiter is used " 698 "inconsistently within a module.", 699 }, 700 ), 701 ) 702 703 # Characters that have a special meaning after a backslash in either 704 # Unicode or byte strings. 705 ESCAPE_CHARACTERS = "abfnrtvx\n\r\t\\'\"01234567" 706 707 # Characters that have a special meaning after a backslash but only in 708 # Unicode strings. 709 UNICODE_ESCAPE_CHARACTERS = "uUN" 710 711 def __init__(self, *args, **kwargs): 712 super().__init__(*args, **kwargs) 713 self.string_tokens = {} # token position -> (token value, next token) 714 715 def process_module(self, node: nodes.Module) -> None: 716 self._unicode_literals = "unicode_literals" in node.future_imports 717 718 def process_tokens(self, tokens): 719 encoding = "ascii" 720 for i, (tok_type, token, start, _, line) in enumerate(tokens): 721 if tok_type == tokenize.ENCODING: 722 # this is always the first token processed 723 encoding = token 724 elif tok_type == tokenize.STRING: 725 # 'token' is the whole un-parsed token; we can look at the start 726 # of it to see whether it's a raw or unicode string etc. 727 self.process_string_token(token, start[0], start[1]) 728 # We figure the next token, ignoring comments & newlines: 729 j = i + 1 730 while j < len(tokens) and tokens[j].type in ( 731 tokenize.NEWLINE, 732 tokenize.NL, 733 tokenize.COMMENT, 734 ): 735 j += 1 736 next_token = tokens[j] if j < len(tokens) else None 737 if encoding != "ascii": 738 # We convert `tokenize` character count into a byte count, 739 # to match with astroid `.col_offset` 740 start = (start[0], len(line[: start[1]].encode(encoding))) 741 self.string_tokens[start] = (str_eval(token), next_token) 742 743 if self.config.check_quote_consistency: 744 self.check_for_consistent_string_delimiters(tokens) 745 746 @check_messages("implicit-str-concat") 747 def visit_list(self, node: nodes.List) -> None: 748 self.check_for_concatenated_strings(node.elts, "list") 749 750 @check_messages("implicit-str-concat") 751 def visit_set(self, node: nodes.Set) -> None: 752 self.check_for_concatenated_strings(node.elts, "set") 753 754 @check_messages("implicit-str-concat") 755 def visit_tuple(self, node: nodes.Tuple) -> None: 756 self.check_for_concatenated_strings(node.elts, "tuple") 757 758 def visit_assign(self, node: nodes.Assign) -> None: 759 if isinstance(node.value, nodes.Const) and isinstance(node.value.value, str): 760 self.check_for_concatenated_strings([node.value], "assignment") 761 762 def check_for_consistent_string_delimiters( 763 self, tokens: Iterable[tokenize.TokenInfo] 764 ) -> None: 765 """Adds a message for each string using inconsistent quote delimiters. 766 767 Quote delimiters are used inconsistently if " and ' are mixed in a module's 768 shortstrings without having done so to avoid escaping an internal quote 769 character. 770 771 Args: 772 tokens: The tokens to be checked against for consistent usage. 773 """ 774 string_delimiters: Counter[str] = collections.Counter() 775 776 # First, figure out which quote character predominates in the module 777 for tok_type, token, _, _, _ in tokens: 778 if tok_type == tokenize.STRING and _is_quote_delimiter_chosen_freely(token): 779 string_delimiters[_get_quote_delimiter(token)] += 1 780 781 if len(string_delimiters) > 1: 782 # Ties are broken arbitrarily 783 most_common_delimiter = string_delimiters.most_common(1)[0][0] 784 for tok_type, token, start, _, _ in tokens: 785 if tok_type != tokenize.STRING: 786 continue 787 quote_delimiter = _get_quote_delimiter(token) 788 if ( 789 _is_quote_delimiter_chosen_freely(token) 790 and quote_delimiter != most_common_delimiter 791 ): 792 self.add_message( 793 "inconsistent-quotes", line=start[0], args=(quote_delimiter,) 794 ) 795 796 def check_for_concatenated_strings(self, elements, iterable_type): 797 for elt in elements: 798 if not ( 799 isinstance(elt, nodes.Const) and elt.pytype() in _AST_NODE_STR_TYPES 800 ): 801 continue 802 if elt.col_offset < 0: 803 # This can happen in case of escaped newlines 804 continue 805 if (elt.lineno, elt.col_offset) not in self.string_tokens: 806 # This may happen with Latin1 encoding 807 # cf. https://github.com/PyCQA/pylint/issues/2610 808 continue 809 matching_token, next_token = self.string_tokens[ 810 (elt.lineno, elt.col_offset) 811 ] 812 # We detect string concatenation: the AST Const is the 813 # combination of 2 string tokens 814 if matching_token != elt.value and next_token is not None: 815 if next_token.type == tokenize.STRING and ( 816 next_token.start[0] == elt.lineno 817 or self.config.check_str_concat_over_line_jumps 818 ): 819 self.add_message( 820 "implicit-str-concat", line=elt.lineno, args=(iterable_type,) 821 ) 822 823 def process_string_token(self, token, start_row, start_col): 824 quote_char = None 825 index = None 826 for index, char in enumerate(token): 827 if char in "'\"": 828 quote_char = char 829 break 830 if quote_char is None: 831 return 832 833 prefix = token[:index].lower() # markers like u, b, r. 834 after_prefix = token[index:] 835 # Chop off quotes 836 quote_length = ( 837 3 if after_prefix[:3] == after_prefix[-3:] == 3 * quote_char else 1 838 ) 839 string_body = after_prefix[quote_length:-quote_length] 840 # No special checks on raw strings at the moment. 841 if "r" not in prefix: 842 self.process_non_raw_string_token( 843 prefix, 844 string_body, 845 start_row, 846 start_col + len(prefix) + quote_length, 847 ) 848 849 def process_non_raw_string_token( 850 self, prefix, string_body, start_row, string_start_col 851 ): 852 """check for bad escapes in a non-raw string. 853 854 prefix: lowercase string of eg 'ur' string prefix markers. 855 string_body: the un-parsed body of the string, not including the quote 856 marks. 857 start_row: integer line number in the source. 858 string_start_col: integer col number of the string start in the source. 859 """ 860 # Walk through the string; if we see a backslash then escape the next 861 # character, and skip over it. If we see a non-escaped character, 862 # alert, and continue. 863 # 864 # Accept a backslash when it escapes a backslash, or a quote, or 865 # end-of-line, or one of the letters that introduce a special escape 866 # sequence <https://docs.python.org/reference/lexical_analysis.html> 867 # 868 index = 0 869 while True: 870 index = string_body.find("\\", index) 871 if index == -1: 872 break 873 # There must be a next character; having a backslash at the end 874 # of the string would be a SyntaxError. 875 next_char = string_body[index + 1] 876 match = string_body[index : index + 2] 877 # The column offset will vary depending on whether the string token 878 # is broken across lines. Calculate relative to the nearest line 879 # break or relative to the start of the token's line. 880 last_newline = string_body.rfind("\n", 0, index) 881 if last_newline == -1: 882 line = start_row 883 col_offset = index + string_start_col 884 else: 885 line = start_row + string_body.count("\n", 0, index) 886 col_offset = index - last_newline - 1 887 if next_char in self.UNICODE_ESCAPE_CHARACTERS: 888 if "u" in prefix: 889 pass 890 elif "b" not in prefix: 891 pass # unicode by default 892 else: 893 self.add_message( 894 "anomalous-unicode-escape-in-string", 895 line=line, 896 args=(match,), 897 col_offset=col_offset, 898 ) 899 elif next_char not in self.ESCAPE_CHARACTERS: 900 self.add_message( 901 "anomalous-backslash-in-string", 902 line=line, 903 args=(match,), 904 col_offset=col_offset, 905 ) 906 # Whether it was a valid escape or not, backslash followed by 907 # another character can always be consumed whole: the second 908 # character can never be the start of a new backslash escape. 909 index += 2 910 911 @check_messages("redundant-u-string-prefix") 912 def visit_const(self, node: nodes.Const) -> None: 913 if node.pytype() == "builtins.str" and not isinstance( 914 node.parent, nodes.JoinedStr 915 ): 916 self._detect_u_string_prefix(node) 917 918 def _detect_u_string_prefix(self, node: nodes.Const): 919 """Check whether strings include a 'u' prefix like u'String'""" 920 if node.kind == "u": 921 self.add_message( 922 "redundant-u-string-prefix", 923 line=node.lineno, 924 col_offset=node.col_offset, 925 ) 926 927 928def register(linter): 929 """required method to auto register this checker""" 930 linter.register_checker(StringFormatChecker(linter)) 931 linter.register_checker(StringConstantChecker(linter)) 932 933 934def str_eval(token): 935 """ 936 Mostly replicate `ast.literal_eval(token)` manually to avoid any performance hit. 937 This supports f-strings, contrary to `ast.literal_eval`. 938 We have to support all string literal notations: 939 https://docs.python.org/3/reference/lexical_analysis.html#string-and-bytes-literals 940 """ 941 if token[0:2].lower() in {"fr", "rf"}: 942 token = token[2:] 943 elif token[0].lower() in {"r", "u", "f"}: 944 token = token[1:] 945 if token[0:3] in {'"""', "'''"}: 946 return token[3:-3] 947 return token[1:-1] 948 949 950def _is_long_string(string_token: str) -> bool: 951 """Is this string token a "longstring" (is it triple-quoted)? 952 953 Long strings are triple-quoted as defined in 954 https://docs.python.org/3/reference/lexical_analysis.html#string-and-bytes-literals 955 956 This function only checks characters up through the open quotes. Because it's meant 957 to be applied only to tokens that represent string literals, it doesn't bother to 958 check for close-quotes (demonstrating that the literal is a well-formed string). 959 960 Args: 961 string_token: The string token to be parsed. 962 963 Returns: 964 A boolean representing whether or not this token matches a longstring 965 regex. 966 """ 967 return bool( 968 SINGLE_QUOTED_REGEX.match(string_token) 969 or DOUBLE_QUOTED_REGEX.match(string_token) 970 ) 971 972 973def _get_quote_delimiter(string_token: str) -> str: 974 """Returns the quote character used to delimit this token string. 975 976 This function does little checking for whether the token is a well-formed 977 string. 978 979 Args: 980 string_token: The token to be parsed. 981 982 Returns: 983 A string containing solely the first quote delimiter character in the passed 984 string. 985 986 Raises: 987 ValueError: No quote delimiter characters are present. 988 """ 989 match = QUOTE_DELIMITER_REGEX.match(string_token) 990 if not match: 991 raise ValueError(f"string token {string_token} is not a well-formed string") 992 return match.group(2) 993 994 995def _is_quote_delimiter_chosen_freely(string_token: str) -> bool: 996 """Was there a non-awkward option for the quote delimiter? 997 998 Args: 999 string_token: The quoted string whose delimiters are to be checked. 1000 1001 Returns: 1002 Whether there was a choice in this token's quote character that would 1003 not have involved backslash-escaping an interior quote character. Long 1004 strings are excepted from this analysis under the assumption that their 1005 quote characters are set by policy. 1006 """ 1007 quote_delimiter = _get_quote_delimiter(string_token) 1008 unchosen_delimiter = '"' if quote_delimiter == "'" else "'" 1009 return bool( 1010 quote_delimiter 1011 and not _is_long_string(string_token) 1012 and unchosen_delimiter not in str_eval(string_token) 1013 ) 1014