1 /*
2 Copyright (C) 2012-2020 Ben Kurtovic <ben.kurtovic@gmail.com>
3
4 Permission is hereby granted, free of charge, to any person obtaining a copy of
5 this software and associated documentation files (the "Software"), to deal in
6 the Software without restriction, including without limitation the rights to
7 use, copy, modify, merge, publish, distribute, sublicense, and/or sell copies
8 of the Software, and to permit persons to whom the Software is furnished to do
9 so, subject to the following conditions:
10
11 The above copyright notice and this permission notice shall be included in all
12 copies or substantial portions of the Software.
13
14 THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
15 IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
16 FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
17 AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
18 LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
19 OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
20 SOFTWARE.
21 */
22
23 #include "definitions.h"
24
25 /*
26 This file should be kept up to date with mwparserfromhell/definitions.py.
27 See the Python version for data sources.
28 */
29
30 // clang-format off
/*
    Lowercase URI schemes that is_scheme() checks against when its `slashes`
    argument is nonzero. Entries are grouped by initial letter; the trailing
    NULL is the sentinel that list-walking code stops at.
*/
static const char *URI_SCHEMES[] = {
    "bitcoin",
    "ftp", "ftps",
    "geo", "git", "gopher",
    "http", "https",
    "irc", "ircs",
    "magnet", "mailto", "mms",
    "news", "nntp",
    "redis",
    "sftp", "sip", "sips", "sms", "ssh", "svn",
    "tel", "telnet",
    "urn",
    "worldwind",
    "xmpp",
    NULL,
};
61
/*
    Lowercase URI schemes that is_scheme() checks against when its `slashes`
    argument is zero. Entries are grouped by initial letter; the trailing NULL
    is the sentinel that list-walking code stops at.
*/
static const char *URI_SCHEMES_AUTHORITY_OPTIONAL[] = {
    "bitcoin",
    "geo",
    "magnet", "mailto",
    "news",
    "sip", "sips", "sms",
    "tel",
    "urn",
    "xmpp",
    NULL,
};
76
/*
    Lowercase tag names that is_parsable() reports as NOT parsable. Entries are
    grouped by initial letter; the trailing NULL is the sentinel that
    list-walking code stops at.
*/
static const char *PARSER_BLACKLIST[] = {
    "categorytree", "ce", "chem",
    "gallery", "graph",
    "hiero",
    "imagemap", "inputbox",
    "math",
    "nowiki",
    "pre",
    "score", "section", "source", "syntaxhighlight",
    "templatedata", "timeline",
    NULL,
};
97 // clang-format on
98
/*
    Lowercase tag names that is_single() reports as able to exist without a
    close tag. NULL-terminated.
*/
static const char *SINGLE[] = {
    "br", "wbr", "hr", "meta", "link", "img", /* also listed in SINGLE_ONLY */
    "li", "dt", "dd", "th", "td", "tr",
    NULL,
};
101
/*
    Lowercase tag names that is_single_only() reports as having to exist
    without a close tag. NULL-terminated.
*/
static const char *SINGLE_ONLY[] = {
    "br",
    "wbr",
    "hr",
    "meta",
    "link",
    "img",
    NULL,
};
103
104 /*
105 Convert a PyUnicodeObject to a lowercase ASCII char* array and store it in
106 the second argument. The caller must free the return value when finished.
107 If the return value is NULL, the conversion failed and *string is not set.
108 */
109 static PyObject *
unicode_to_lcase_ascii(PyObject * input,const char ** string)110 unicode_to_lcase_ascii(PyObject *input, const char **string)
111 {
112 PyObject *lower = PyObject_CallMethod(input, "lower", NULL), *bytes;
113
114 if (!lower) {
115 return NULL;
116 }
117 bytes = PyUnicode_AsASCIIString(lower);
118 Py_DECREF(lower);
119 if (!bytes) {
120 if (PyErr_Occurred() && PyErr_ExceptionMatches(PyExc_UnicodeEncodeError)) {
121 PyErr_Clear();
122 }
123 return NULL;
124 }
125 *string = PyBytes_AS_STRING(bytes);
126 return bytes;
127 }
128
129 /*
130 Return whether a PyUnicodeObject is in a list of lowercase ASCII strings.
131 */
132 static int
unicode_in_string_list(PyObject * input,const char ** list)133 unicode_in_string_list(PyObject *input, const char **list)
134 {
135 const char *string;
136 PyObject *temp = unicode_to_lcase_ascii(input, &string);
137 int retval = 0;
138
139 if (!temp) {
140 return 0;
141 }
142
143 while (*list) {
144 if (!strcmp(*(list++), string)) {
145 retval = 1;
146 goto end;
147 }
148 }
149
150 end:
151 Py_DECREF(temp);
152 return retval;
153 }
154
155 /*
156 Return if the given tag's contents should be passed to the parser.
157 */
158 int
is_parsable(PyObject * tag)159 is_parsable(PyObject *tag)
160 {
161 return !unicode_in_string_list(tag, PARSER_BLACKLIST);
162 }
163
164 /*
165 Return whether or not the given tag can exist without a close tag.
166 */
167 int
is_single(PyObject * tag)168 is_single(PyObject *tag)
169 {
170 return unicode_in_string_list(tag, SINGLE);
171 }
172
173 /*
174 Return whether or not the given tag must exist without a close tag.
175 */
176 int
is_single_only(PyObject * tag)177 is_single_only(PyObject *tag)
178 {
179 return unicode_in_string_list(tag, SINGLE_ONLY);
180 }
181
182 /*
183 Return whether the given scheme is valid for external links.
184 */
185 int
is_scheme(PyObject * scheme,int slashes)186 is_scheme(PyObject *scheme, int slashes)
187 {
188 if (slashes) {
189 return unicode_in_string_list(scheme, URI_SCHEMES);
190 } else {
191 return unicode_in_string_list(scheme, URI_SCHEMES_AUTHORITY_OPTIONAL);
192 }
193 }
194