1# Copyright (C) 2012-2020 Ben Kurtovic <ben.kurtovic@gmail.com>
2#
3# Permission is hereby granted, free of charge, to any person obtaining a copy
4# of this software and associated documentation files (the "Software"), to deal
5# in the Software without restriction, including without limitation the rights
6# to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
7# copies of the Software, and to permit persons to whom the Software is
8# furnished to do so, subject to the following conditions:
9#
10# The above copyright notice and this permission notice shall be included in
11# all copies or substantial portions of the Software.
12#
13# THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
14# IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
15# FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
16# AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
17# LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
18# OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
19# SOFTWARE.
20
21"""
22Contains data about certain markup, like HTML tags and external links.
23
When updating this file, please also update the C tokenizer version:
25- mwparserfromhell/parser/ctokenizer/definitions.c
26- mwparserfromhell/parser/ctokenizer/definitions.h
27"""
28
# Names exported by a star-import of this module: only the lookup helper
# functions are listed; the data tables they consult are not.
__all__ = [
    "get_html_tag",
    "is_parsable",
    "is_visible",
    "is_single",
    "is_single_only",
    "is_scheme",
]
37
# URI schemes that is_scheme() accepts for external links, keyed by lowercase
# scheme name. The boolean controls what is_scheme() answers when it is called
# with slashes=False: a scheme mapped to True is rejected in that case, while
# a scheme mapped to False is accepted regardless of the slashes argument.
URI_SCHEMES = {
    # [wikimedia/mediawiki.git]/includes/DefaultSettings.php @ 5c660de5d0
    "bitcoin": False,
    "ftp": True,
    "ftps": True,
    "geo": False,
    "git": True,
    "gopher": True,
    "http": True,
    "https": True,
    "irc": True,
    "ircs": True,
    "magnet": False,
    "mailto": False,
    "mms": True,
    "news": False,
    "nntp": True,
    "redis": True,
    "sftp": True,
    "sip": False,
    "sips": False,
    "sms": False,
    "ssh": True,
    "svn": True,
    "tel": False,
    "telnet": True,
    "urn": False,
    "worldwind": True,
    "xmpp": False,
}
68
# Lowercase tag names for which is_parsable() returns False, i.e. tags whose
# contents should not be passed to the parser.
PARSER_BLACKLIST = [
    # https://www.mediawiki.org/wiki/Parser_extension_tags @ 2020-12-21
    "categorytree",
    "ce",
    "chem",
    "gallery",
    "graph",
    "hiero",
    "imagemap",
    "inputbox",
    "math",
    "nowiki",
    "pre",
    "score",
    "section",
    "source",
    "syntaxhighlight",
    "templatedata",
    "timeline",
]
89
# Lowercase tag names for which is_visible() returns False, i.e. tags that are
# treated as not containing visible text.
INVISIBLE_TAGS = [
    # https://www.mediawiki.org/wiki/Parser_extension_tags @ 2020-12-21
    "categorytree",
    "gallery",
    "graph",
    "imagemap",
    "inputbox",
    "math",
    "score",
    "section",
    "templatedata",
    "timeline",
]
103
# [wikimedia/mediawiki.git]/includes/parser/Sanitizer.php @ 95e17ee645
# Tags that is_single_only() reports as having to exist without a close tag.
SINGLE_ONLY = ["br", "wbr", "hr", "meta", "link", "img"]
# Tags that is_single() reports as able to exist without a close tag: every
# SINGLE_ONLY tag followed by the ones whose close tag is merely optional.
SINGLE = [*SINGLE_ONLY, "li", "dt", "dd", "th", "td", "tr"]
107
# Wiki-markup characters and the HTML tag name each one corresponds to.
# get_html_tag() indexes this table directly, so only these keys are valid.
MARKUP_TO_HTML = {
    "#": "li",
    "*": "li",
    ";": "dt",
    ":": "dd",
}
114
115
def get_html_tag(markup):
    """Look up the HTML tag name that corresponds to a piece of wiki-markup.

    *markup* must be one of the keys of ``MARKUP_TO_HTML``; any other value
    raises :exc:`KeyError`.
    """
    html_tag = MARKUP_TO_HTML[markup]
    return html_tag
119
120
def is_parsable(tag):
    """Return whether the contents of the given *tag* should be parsed.

    The tag name is lowercased before it is compared against
    ``PARSER_BLACKLIST``; a blacklisted tag yields ``False``.
    """
    blacklisted = tag.lower() in PARSER_BLACKLIST
    return not blacklisted
124
125
def is_visible(tag):
    """Return whether the given *tag* is considered to contain visible text.

    The tag name is lowercased before it is compared against
    ``INVISIBLE_TAGS``; a tag listed there yields ``False``.
    """
    hidden = tag.lower() in INVISIBLE_TAGS
    return not hidden
129
130
def is_single(tag):
    """Return whether the given *tag* may appear without a close tag.

    The lookup against ``SINGLE`` is done on the lowercased tag name.
    """
    name = tag.lower()
    return name in SINGLE
134
135
def is_single_only(tag):
    """Return whether the given *tag* must appear without a close tag.

    The lookup against ``SINGLE_ONLY`` is done on the lowercased tag name.
    """
    name = tag.lower()
    return name in SINGLE_ONLY
139
140
def is_scheme(scheme, slashes=True):
    """Return whether *scheme* is a valid URI scheme for external links.

    The scheme name is matched case-insensitively against ``URI_SCHEMES``.
    When *slashes* is true, every known scheme is valid. When it is false,
    a known scheme is valid only if its ``URI_SCHEMES`` entry is false.
    """
    name = scheme.lower()
    known = name in URI_SCHEMES
    # With slashes, or for an unknown scheme, membership alone is the answer.
    if slashes or not known:
        return known
    # Known scheme without slashes: valid only when its table flag is false.
    return not URI_SCHEMES[name]
147