1 /*
2  * python-wrapper.c
3  * Copyright (C) 2017 Kovid Goyal <kovid at kovidgoyal.net>
4  *
5  * Distributed under terms of the Apache 2.0 license.
6  */
7 
8 
9 #define PY_SSIZE_T_CLEAN
10 #include <Python.h>
11 
12 #include "../gumbo/gumbo.h"
13 #include "as-libxml.h"
14 #include "as-python-tree.h"
15 
/* Version of this extension; each component is added to the module as an
 * integer constant by the module init function. */
#define MAJOR 0
#define MINOR 4
#define PATCH 10

/* Name given to every capsule created by encapsulate(). */
static char *NAME =  "libxml2:xmlDoc";
/* Context marker stored on the capsule. free_encapsulated_doc() frees the
 * wrapped document only while the capsule's context is this exact pointer. */
static char *DESTRUCTOR = "destructor:xmlFreeDoc";
22 
23 static inline libxml_doc*
convert_tree(GumboOutput * output,Options * opts)24 convert_tree(GumboOutput *output, Options *opts) {
25     char *errmsg = NULL;
26     libxml_doc *doc = NULL;
27 
28     Py_BEGIN_ALLOW_THREADS;
29     doc = convert_gumbo_tree_to_libxml_tree(output, opts, &errmsg);
30     Py_END_ALLOW_THREADS;
31     if (doc == NULL) {
32         if (errmsg) PyErr_SetString(PyExc_Exception, errmsg);
33         else PyErr_NoMemory();
34     }
35     return doc;
36 }
37 
38 static libxml_doc*
parse_with_options(const char * buffer,size_t buffer_length,Options * opts,const GumboTag context,GumboNamespaceEnum context_namespace)39 parse_with_options(const char* buffer, size_t buffer_length, Options *opts, const GumboTag context, GumboNamespaceEnum context_namespace) {
40     GumboOutput *output = NULL;
41     libxml_doc* doc = NULL;
42     Py_BEGIN_ALLOW_THREADS;
43     output = gumbo_parse_fragment(&(opts->gumbo_opts), buffer, buffer_length, context, context_namespace);
44     Py_END_ALLOW_THREADS;
45     if (output == NULL) PyErr_NoMemory();
46     else {
47         doc = convert_tree(output, opts);
48         gumbo_destroy_output(output);
49     }
50     return doc;
51 }
52 
53 static void
free_encapsulated_doc(PyObject * capsule)54 free_encapsulated_doc(PyObject *capsule) {
55     libxml_doc *doc = (libxml_doc*)PyCapsule_GetPointer(capsule, NAME);
56     if (doc != NULL) {
57         char *ctx = PyCapsule_GetContext(capsule);
58         if (ctx == DESTRUCTOR) free_libxml_doc(doc);
59     }
60 }
61 
62 static inline PyObject*
encapsulate(libxml_doc * doc)63 encapsulate(libxml_doc* doc) {
64     PyObject *ans = NULL;
65     ans = PyCapsule_New(doc, NAME, free_encapsulated_doc);
66     if (ans == NULL) { free_libxml_doc(doc); return NULL; }
67     if (PyCapsule_SetContext(ans, DESTRUCTOR) != 0) { Py_DECREF(ans); return NULL; }
68     return ans;
69 }
70 
71 static PyObject *
parse(PyObject UNUSED * self,PyObject * args,PyObject * kwds)72 parse(PyObject UNUSED *self, PyObject *args, PyObject *kwds) {
73     libxml_doc *doc = NULL;
74     const char *buffer = NULL;
75     Py_ssize_t sz = 0;
76     Options opts = {0};
77     opts.stack_size = 16 * 1024;
78     PyObject *kd = Py_True, *mx = Py_False, *ne = Py_False, *sn = Py_True;
79     char *fragment_context = NULL; Py_ssize_t fragment_context_sz = 0;
80     opts.gumbo_opts = kGumboDefaultOptions;
81     opts.gumbo_opts.max_errors = 0;  // We discard errors since we are not reporting them anyway
82     GumboNamespaceEnum fragment_namespace = GUMBO_NAMESPACE_HTML;
83 
84     static char *kwlist[] = {"data", "namespace_elements", "keep_doctype", "maybe_xhtml", "line_number_attr", "sanitize_names", "stack_size", "fragment_context", "fragment_namespace", NULL};
85 
86     if (!PyArg_ParseTupleAndKeywords(args, kwds, "s#|OOOzOIz#i", kwlist, &buffer, &sz, &ne, &kd, &mx, &(opts.line_number_attr), &sn, &(opts.stack_size), &fragment_context, &fragment_context_sz, &fragment_namespace)) return NULL;
87     opts.namespace_elements = PyObject_IsTrue(ne);
88     opts.keep_doctype = PyObject_IsTrue(kd);
89     opts.sanitize_names = PyObject_IsTrue(sn);
90     opts.gumbo_opts.use_xhtml_rules = PyObject_IsTrue(mx);
91     GumboTag context = GUMBO_TAG_LAST;
92     if (fragment_context && fragment_context_sz > 0) {
93         context = gumbo_tagn_enum(fragment_context, fragment_context_sz);
94         if (context == GUMBO_TAG_UNKNOWN) {
95             PyErr_Format(PyExc_KeyError, "Unknown fragment_context tag name: %s", fragment_context);
96             return NULL;
97         }
98     }
99     if (fragment_namespace != GUMBO_NAMESPACE_HTML) {
100         // causes infinite loops in gumbo, enable the non html fragment context tests
101         // in html5lib_adapter.py to trigger
102         PyErr_SetString(PyExc_KeyError, "Fragment parsing with non-HTML namespaces is not supported");
103         return NULL;
104     }
105     doc = parse_with_options(buffer, (size_t)sz, &opts, context, fragment_namespace);
106     if (!doc) return NULL;
107     return encapsulate(doc);
108 }
109 
110 
111 static PyObject *
parse_and_build(PyObject UNUSED * self,PyObject * args)112 parse_and_build(PyObject UNUSED *self, PyObject *args) {
113     const char *buffer = NULL;
114     Py_ssize_t sz = 0;
115     GumboOutput *output = NULL;
116     PyObject *new_tag, *new_comment, *ans, *new_doctype, *append, *new_string, *ret;
117     Options opts = {0};
118     opts.stack_size = 16 * 1024;
119     opts.gumbo_opts = kGumboDefaultOptions;
120     opts.gumbo_opts.max_errors = 0;  // We discard errors since we are not reporting them anyway
121 
122     if (!PyArg_ParseTuple(args, "s#OOOOO|I", &buffer, &sz, &new_tag, &new_comment, &new_string, &append, &new_doctype, &(opts.stack_size))) return NULL;
123     Py_BEGIN_ALLOW_THREADS;
124     output = gumbo_parse_with_options(&(opts.gumbo_opts), buffer, (size_t)sz);
125     Py_END_ALLOW_THREADS;
126     if (output == NULL) PyErr_NoMemory();
127     GumboDocument* document = &(output->document->v.document);
128 
129     if (new_doctype != Py_None && document->has_doctype) {
130         ret = PyObject_CallFunction(new_doctype, "sss", document->name, document->public_identifier, document->system_identifier);
131         if (ret == NULL) { gumbo_destroy_output(output); return NULL; }
132         Py_CLEAR(ret);
133     }
134     ans = as_python_tree(output, &opts, new_tag, new_comment, new_string, append);
135     gumbo_destroy_output(output);
136     return ans;
137 }
138 
139 
140 static PyObject *
clone_doc(PyObject UNUSED * self,PyObject * capsule)141 clone_doc(PyObject UNUSED *self, PyObject *capsule) {
142     if (!PyCapsule_CheckExact(capsule)) { PyErr_SetString(PyExc_TypeError, "Must specify a capsule as the argument"); return NULL; }
143     libxml_doc *sdoc = PyCapsule_GetPointer(capsule, PyCapsule_GetName(capsule)), *doc;
144     if (sdoc == NULL) return NULL;
145     doc = copy_libxml_doc(sdoc);
146     if (doc == NULL) return PyErr_NoMemory();
147     return encapsulate(doc);
148 }
149 
150 static PyMethodDef
151 methods[] = {
152     {"parse", (PyCFunction)(void(*)(void))(PyCFunctionWithKeywords)(parse), METH_VARARGS | METH_KEYWORDS,
153         "parse()\n\nParse specified bytestring which must be in the UTF-8 encoding."
154     },
155 
156     {"parse_and_build", (PyCFunction)parse_and_build, METH_VARARGS,
157         "parse_and_build()\n\nParse specified bytestring which must be in the UTF-8 encoding and build a tree using the specified functions."
158     },
159 
160     {"clone_doc", clone_doc, METH_O,
161         "clone_doc()\n\nClone the specified document. Which must be a document returned by the parse() function."
162     },
163 
164     {NULL, NULL, 0, NULL}
165 };
166 
167 #define MODULE_NAME "html_parser"
168 #define MODULE_DOC "HTML parser in C for speed."
169 
170 #if PY_MAJOR_VERSION >= 3
171 
172 static struct PyModuleDef
173 moduledef = {
174         PyModuleDef_HEAD_INIT,
175         MODULE_NAME,
176         MODULE_DOC,
177         0,
178         methods,
179         NULL,
180         NULL,
181         NULL,
182         NULL
183 };
184 
185 #define INITERROR return NULL
186 
187 EXPORTED PyMODINIT_FUNC
PyInit_html_parser(void)188 PyInit_html_parser(void) {
189 
190 #else
191 #define INITERROR return
192 EXPORTED PyMODINIT_FUNC
193 inithtml_parser(void) {
194 #endif
195     PyObject *m, *known_tag_names, *known_attr_names;
196 #if PY_MAJOR_VERSION >= 3
197     m = PyModule_Create(&moduledef);
198 #else
199     m = Py_InitModule3(MODULE_NAME, methods, MODULE_DOC);
200 #endif
201     if (m == NULL) INITERROR;
202     if (PyModule_AddIntMacro(m, MAJOR) != 0) INITERROR;
203     if (PyModule_AddIntMacro(m, MINOR) != 0) INITERROR;
204     if (PyModule_AddIntMacro(m, PATCH) != 0) INITERROR;
205     if (PyModule_AddIntMacro(m, GUMBO_NAMESPACE_HTML) != 0) INITERROR;
206     if (PyModule_AddIntMacro(m, GUMBO_NAMESPACE_SVG) != 0) INITERROR;
207     if (PyModule_AddIntMacro(m, GUMBO_NAMESPACE_MATHML) != 0) INITERROR;
208     if (PyModule_AddIntConstant(m, "LIBXML_VERSION", get_libxml_version()) != 0) INITERROR;
209     known_tag_names = PyTuple_New(GUMBO_TAG_UNKNOWN);
210     if (known_tag_names == NULL) INITERROR;
211     if (PyModule_AddObject(m, "KNOWN_TAG_NAMES", known_tag_names) != 0) { Py_CLEAR(known_tag_names); INITERROR; }
212     known_attr_names = PyTuple_New(HTML_ATTR_LAST);
213     if (known_attr_names == NULL) INITERROR;
214     if (PyModule_AddObject(m, "KNOWN_ATTR_NAMES", known_attr_names) != 0) { Py_CLEAR(known_attr_names); INITERROR; }
215     if (!set_known_tag_names(known_tag_names, known_attr_names)) { Py_CLEAR(known_tag_names); Py_CLEAR(known_attr_names); INITERROR; }
216 #if PY_MAJOR_VERSION >= 3
217     return m;
218 #endif
219 }
220