# Copyright (C) 2012-2020 Ben Kurtovic <ben.kurtovic@gmail.com>
#
# Permission is hereby granted, free of charge, to any person obtaining a copy
# of this software and associated documentation files (the "Software"), to deal
# in the Software without restriction, including without limitation the rights
# to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
# copies of the Software, and to permit persons to whom the Software is
# furnished to do so, subject to the following conditions:
#
# The above copyright notice and this permission notice shall be included in
# all copies or substantial portions of the Software.
#
# THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
# IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
# FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
# AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
# LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
# OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
# SOFTWARE.

"""
Contains data about certain markup, like HTML tags and external links.

When updating this file, please also update the C tokenizer version:
- mwparserfromhell/parser/ctokenizer/definitions.c
- mwparserfromhell/parser/ctokenizer/definitions.h
"""

__all__ = [
    "get_html_tag",
    "is_parsable",
    "is_visible",
    "is_single",
    "is_single_only",
    "is_scheme",
]

# Known external-link schemes. A scheme mapped to True is only accepted by
# is_scheme() when it is called with a true *slashes* argument; a scheme
# mapped to False is accepted either way.
URI_SCHEMES = {
    # [wikimedia/mediawiki.git]/includes/DefaultSettings.php @ 5c660de5d0
    "bitcoin": False,
    "ftp": True,
    "ftps": True,
    "geo": False,
    "git": True,
    "gopher": True,
    "http": True,
    "https": True,
    "irc": True,
    "ircs": True,
    "magnet": False,
    "mailto": False,
    "mms": True,
    "news": False,
    "nntp": True,
    "redis": True,
    "sftp": True,
    "sip": False,
    "sips": False,
    "sms": False,
    "ssh": True,
    "svn": True,
    "tel": False,
    "telnet": True,
    "urn": False,
    "worldwind": True,
    "xmpp": False,
}

# Tags that is_parsable() reports as not to be passed to the parser.
PARSER_BLACKLIST = [
    # https://www.mediawiki.org/wiki/Parser_extension_tags @ 2020-12-21
    "categorytree",
    "ce",
    "chem",
    "gallery",
    "graph",
    "hiero",
    "imagemap",
    "inputbox",
    "math",
    "nowiki",
    "pre",
    "score",
    "section",
    "source",
    "syntaxhighlight",
    "templatedata",
    "timeline",
]

# Tags that is_visible() reports as not containing visible text.
INVISIBLE_TAGS = [
    # https://www.mediawiki.org/wiki/Parser_extension_tags @ 2020-12-21
    "categorytree",
    "gallery",
    "graph",
    "imagemap",
    "inputbox",
    "math",
    "score",
    "section",
    "templatedata",
    "timeline",
]

# [wikimedia/mediawiki.git]/includes/parser/Sanitizer.php @ 95e17ee645
# SINGLE_ONLY: tags that must exist without a close tag.
# SINGLE: every SINGLE_ONLY tag plus those that merely can omit the close tag.
SINGLE_ONLY = ["br", "wbr", "hr", "meta", "link", "img"]
SINGLE = [*SINGLE_ONLY, "li", "dt", "dd", "th", "td", "tr"]

# Wiki-markup characters and the HTML tag each one corresponds to.
MARKUP_TO_HTML = {
    "#": "li",
    "*": "li",
    ";": "dt",
    ":": "dd",
}


def _listed(tag, names):
    """Return whether *tag*, compared in lowercase, is a member of *names*."""
    lowered = tag.lower()
    return lowered in names


def get_html_tag(markup):
    """Look up the HTML tag that corresponds to the wiki-markup *markup*.

    Raises KeyError when *markup* is not a key of MARKUP_TO_HTML.
    """
    html_tag = MARKUP_TO_HTML[markup]
    return html_tag


def is_parsable(tag):
    """Return whether the contents of *tag* should be handed to the parser.

    The comparison is case-insensitive; any tag listed in PARSER_BLACKLIST is
    reported as not parsable.
    """
    return not _listed(tag, PARSER_BLACKLIST)


def is_visible(tag):
    """Return whether *tag* holds visible text.

    The comparison is case-insensitive; any tag listed in INVISIBLE_TAGS is
    reported as not visible.
    """
    return not _listed(tag, INVISIBLE_TAGS)


def is_single(tag):
    """Return whether *tag* is allowed to appear without a close tag.

    The comparison is case-insensitive and checks membership in SINGLE.
    """
    return _listed(tag, SINGLE)


def is_single_only(tag):
    """Return whether *tag* is required to appear without a close tag.

    The comparison is case-insensitive and checks membership in SINGLE_ONLY.
    """
    return _listed(tag, SINGLE_ONLY)


def is_scheme(scheme, slashes=True):
    """Return whether *scheme* is a valid scheme for external links.

    *scheme* is matched case-insensitively against URI_SCHEMES. When *slashes*
    is true, every known scheme is valid. When it is false, only schemes whose
    URI_SCHEMES entry is False are valid (presumably those that may be written
    without the ``//`` after the colon; confirm against the tokenizer).
    """
    try:
        slashes_required = URI_SCHEMES[scheme.lower()]
    except KeyError:
        # Unknown schemes are never valid, whatever *slashes* says.
        return False
    if slashes:
        return True
    return not slashes_required