1 /*
2 Copyright (C) 2012-2020 Ben Kurtovic <ben.kurtovic@gmail.com>
3 
4 Permission is hereby granted, free of charge, to any person obtaining a copy of
5 this software and associated documentation files (the "Software"), to deal in
6 the Software without restriction, including without limitation the rights to
7 use, copy, modify, merge, publish, distribute, sublicense, and/or sell copies
8 of the Software, and to permit persons to whom the Software is furnished to do
9 so, subject to the following conditions:
10 
11 The above copyright notice and this permission notice shall be included in all
12 copies or substantial portions of the Software.
13 
14 THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
15 IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
16 FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
17 AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
18 LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
19 OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
20 SOFTWARE.
21 */
22 
23 #include "definitions.h"
24 
25 /*
26     This file should be kept up to date with mwparserfromhell/definitions.py.
27     See the Python version for data sources.
28 */
29 
30 // clang-format off
/*
    URI schemes that is_scheme() accepts when it is called with a nonzero
    `slashes` argument. Entries are lowercase ASCII, and the array ends with a
    NULL sentinel, which is where unicode_in_string_list() stops scanning.
*/
static const char *URI_SCHEMES[] = {
    "bitcoin",
    "ftp",
    "ftps",
    "geo",
    "git",
    "gopher",
    "http",
    "https",
    "irc",
    "ircs",
    "magnet",
    "mailto",
    "mms",
    "news",
    "nntp",
    "redis",
    "sftp",
    "sip",
    "sips",
    "sms",
    "ssh",
    "svn",
    "tel",
    "telnet",
    "urn",
    "worldwind",
    "xmpp",
    NULL,
};
61 
/*
    URI schemes that is_scheme() accepts when it is called with `slashes` equal
    to zero. Every entry here also appears in URI_SCHEMES. Going by the name,
    these are presumably the schemes that do not require a "//" authority part
    after the colon (see the Python version for the data sources).
    NULL-terminated.
*/
static const char *URI_SCHEMES_AUTHORITY_OPTIONAL[] = {
    "bitcoin",
    "geo",
    "magnet",
    "mailto",
    "news",
    "sip",
    "sips",
    "sms",
    "tel",
    "urn",
    "xmpp",
    NULL,
};
76 
/*
    Lowercase tag names for which is_parsable() returns 0, i.e. tags whose
    contents should not be passed to the parser. NULL-terminated.
*/
static const char *PARSER_BLACKLIST[] = {
    "categorytree",
    "ce",
    "chem",
    "gallery",
    "graph",
    "hiero",
    "imagemap",
    "inputbox",
    "math",
    "nowiki",
    "pre",
    "score",
    "section",
    "source",
    "syntaxhighlight",
    "templatedata",
    "timeline",
    NULL,
};
97 // clang-format on
98 
/*
    Lowercase tag names that can exist without a close tag; this is the list
    consulted by is_single(). It contains every SINGLE_ONLY entry plus the
    list/table tags whose close tag is optional. NULL-terminated.
*/
static const char *SINGLE[] = {
    "br", "wbr", "hr", "meta", "link", "img", "li", "dt", "dd", "th", "td", "tr", NULL};
101 
/*
    Lowercase tag names that must exist without a close tag; this is the list
    consulted by is_single_only(). NULL-terminated.
*/
static const char *SINGLE_ONLY[] = {"br", "wbr", "hr", "meta", "link", "img", NULL};
103 
104 /*
105     Convert a PyUnicodeObject to a lowercase ASCII char* array and store it in
106     the second argument. The caller must free the return value when finished.
107     If the return value is NULL, the conversion failed and *string is not set.
108 */
109 static PyObject *
unicode_to_lcase_ascii(PyObject * input,const char ** string)110 unicode_to_lcase_ascii(PyObject *input, const char **string)
111 {
112     PyObject *lower = PyObject_CallMethod(input, "lower", NULL), *bytes;
113 
114     if (!lower) {
115         return NULL;
116     }
117     bytes = PyUnicode_AsASCIIString(lower);
118     Py_DECREF(lower);
119     if (!bytes) {
120         if (PyErr_Occurred() && PyErr_ExceptionMatches(PyExc_UnicodeEncodeError)) {
121             PyErr_Clear();
122         }
123         return NULL;
124     }
125     *string = PyBytes_AS_STRING(bytes);
126     return bytes;
127 }
128 
129 /*
130     Return whether a PyUnicodeObject is in a list of lowercase ASCII strings.
131 */
132 static int
unicode_in_string_list(PyObject * input,const char ** list)133 unicode_in_string_list(PyObject *input, const char **list)
134 {
135     const char *string;
136     PyObject *temp = unicode_to_lcase_ascii(input, &string);
137     int retval = 0;
138 
139     if (!temp) {
140         return 0;
141     }
142 
143     while (*list) {
144         if (!strcmp(*(list++), string)) {
145             retval = 1;
146             goto end;
147         }
148     }
149 
150 end:
151     Py_DECREF(temp);
152     return retval;
153 }
154 
155 /*
156     Return if the given tag's contents should be passed to the parser.
157 */
158 int
is_parsable(PyObject * tag)159 is_parsable(PyObject *tag)
160 {
161     return !unicode_in_string_list(tag, PARSER_BLACKLIST);
162 }
163 
164 /*
165     Return whether or not the given tag can exist without a close tag.
166 */
167 int
is_single(PyObject * tag)168 is_single(PyObject *tag)
169 {
170     return unicode_in_string_list(tag, SINGLE);
171 }
172 
173 /*
174     Return whether or not the given tag must exist without a close tag.
175 */
176 int
is_single_only(PyObject * tag)177 is_single_only(PyObject *tag)
178 {
179     return unicode_in_string_list(tag, SINGLE_ONLY);
180 }
181 
182 /*
183     Return whether the given scheme is valid for external links.
184 */
185 int
is_scheme(PyObject * scheme,int slashes)186 is_scheme(PyObject *scheme, int slashes)
187 {
188     if (slashes) {
189         return unicode_in_string_list(scheme, URI_SCHEMES);
190     } else {
191         return unicode_in_string_list(scheme, URI_SCHEMES_AUTHORITY_OPTIONAL);
192     }
193 }
194