1 /*
2 * python-wrapper.c
3 * Copyright (C) 2017 Kovid Goyal <kovid at kovidgoyal.net>
4 *
5 * Distributed under terms of the Apache 2.0 license.
6 */
7
8
9 #define PY_SSIZE_T_CLEAN
10 #include <Python.h>
11
12 #include "../gumbo/gumbo.h"
13 #include "as-libxml.h"
14 #include "as-python-tree.h"
15
// Version of this extension; exported as integer constants on the module
#define MAJOR 0
#define MINOR 4
#define PATCH 10

// Name under which encapsulate() registers every capsule it creates
static char *NAME = "libxml2:xmlDoc";
// Context pointer stored on those capsules. free_encapsulated_doc() frees the
// wrapped document only while the capsule context is still this exact pointer.
static char *DESTRUCTOR = "destructor:xmlFreeDoc";
22
23 static inline libxml_doc*
convert_tree(GumboOutput * output,Options * opts)24 convert_tree(GumboOutput *output, Options *opts) {
25 char *errmsg = NULL;
26 libxml_doc *doc = NULL;
27
28 Py_BEGIN_ALLOW_THREADS;
29 doc = convert_gumbo_tree_to_libxml_tree(output, opts, &errmsg);
30 Py_END_ALLOW_THREADS;
31 if (doc == NULL) {
32 if (errmsg) PyErr_SetString(PyExc_Exception, errmsg);
33 else PyErr_NoMemory();
34 }
35 return doc;
36 }
37
38 static libxml_doc*
parse_with_options(const char * buffer,size_t buffer_length,Options * opts,const GumboTag context,GumboNamespaceEnum context_namespace)39 parse_with_options(const char* buffer, size_t buffer_length, Options *opts, const GumboTag context, GumboNamespaceEnum context_namespace) {
40 GumboOutput *output = NULL;
41 libxml_doc* doc = NULL;
42 Py_BEGIN_ALLOW_THREADS;
43 output = gumbo_parse_fragment(&(opts->gumbo_opts), buffer, buffer_length, context, context_namespace);
44 Py_END_ALLOW_THREADS;
45 if (output == NULL) PyErr_NoMemory();
46 else {
47 doc = convert_tree(output, opts);
48 gumbo_destroy_output(output);
49 }
50 return doc;
51 }
52
53 static void
free_encapsulated_doc(PyObject * capsule)54 free_encapsulated_doc(PyObject *capsule) {
55 libxml_doc *doc = (libxml_doc*)PyCapsule_GetPointer(capsule, NAME);
56 if (doc != NULL) {
57 char *ctx = PyCapsule_GetContext(capsule);
58 if (ctx == DESTRUCTOR) free_libxml_doc(doc);
59 }
60 }
61
62 static inline PyObject*
encapsulate(libxml_doc * doc)63 encapsulate(libxml_doc* doc) {
64 PyObject *ans = NULL;
65 ans = PyCapsule_New(doc, NAME, free_encapsulated_doc);
66 if (ans == NULL) { free_libxml_doc(doc); return NULL; }
67 if (PyCapsule_SetContext(ans, DESTRUCTOR) != 0) { Py_DECREF(ans); return NULL; }
68 return ans;
69 }
70
71 static PyObject *
parse(PyObject UNUSED * self,PyObject * args,PyObject * kwds)72 parse(PyObject UNUSED *self, PyObject *args, PyObject *kwds) {
73 libxml_doc *doc = NULL;
74 const char *buffer = NULL;
75 Py_ssize_t sz = 0;
76 Options opts = {0};
77 opts.stack_size = 16 * 1024;
78 PyObject *kd = Py_True, *mx = Py_False, *ne = Py_False, *sn = Py_True;
79 char *fragment_context = NULL; Py_ssize_t fragment_context_sz = 0;
80 opts.gumbo_opts = kGumboDefaultOptions;
81 opts.gumbo_opts.max_errors = 0; // We discard errors since we are not reporting them anyway
82 GumboNamespaceEnum fragment_namespace = GUMBO_NAMESPACE_HTML;
83
84 static char *kwlist[] = {"data", "namespace_elements", "keep_doctype", "maybe_xhtml", "line_number_attr", "sanitize_names", "stack_size", "fragment_context", "fragment_namespace", NULL};
85
86 if (!PyArg_ParseTupleAndKeywords(args, kwds, "s#|OOOzOIz#i", kwlist, &buffer, &sz, &ne, &kd, &mx, &(opts.line_number_attr), &sn, &(opts.stack_size), &fragment_context, &fragment_context_sz, &fragment_namespace)) return NULL;
87 opts.namespace_elements = PyObject_IsTrue(ne);
88 opts.keep_doctype = PyObject_IsTrue(kd);
89 opts.sanitize_names = PyObject_IsTrue(sn);
90 opts.gumbo_opts.use_xhtml_rules = PyObject_IsTrue(mx);
91 GumboTag context = GUMBO_TAG_LAST;
92 if (fragment_context && fragment_context_sz > 0) {
93 context = gumbo_tagn_enum(fragment_context, fragment_context_sz);
94 if (context == GUMBO_TAG_UNKNOWN) {
95 PyErr_Format(PyExc_KeyError, "Unknown fragment_context tag name: %s", fragment_context);
96 return NULL;
97 }
98 }
99 if (fragment_namespace != GUMBO_NAMESPACE_HTML) {
100 // causes infinite loops in gumbo, enable the non html fragment context tests
101 // in html5lib_adapter.py to trigger
102 PyErr_SetString(PyExc_KeyError, "Fragment parsing with non-HTML namespaces is not supported");
103 return NULL;
104 }
105 doc = parse_with_options(buffer, (size_t)sz, &opts, context, fragment_namespace);
106 if (!doc) return NULL;
107 return encapsulate(doc);
108 }
109
110
111 static PyObject *
parse_and_build(PyObject UNUSED * self,PyObject * args)112 parse_and_build(PyObject UNUSED *self, PyObject *args) {
113 const char *buffer = NULL;
114 Py_ssize_t sz = 0;
115 GumboOutput *output = NULL;
116 PyObject *new_tag, *new_comment, *ans, *new_doctype, *append, *new_string, *ret;
117 Options opts = {0};
118 opts.stack_size = 16 * 1024;
119 opts.gumbo_opts = kGumboDefaultOptions;
120 opts.gumbo_opts.max_errors = 0; // We discard errors since we are not reporting them anyway
121
122 if (!PyArg_ParseTuple(args, "s#OOOOO|I", &buffer, &sz, &new_tag, &new_comment, &new_string, &append, &new_doctype, &(opts.stack_size))) return NULL;
123 Py_BEGIN_ALLOW_THREADS;
124 output = gumbo_parse_with_options(&(opts.gumbo_opts), buffer, (size_t)sz);
125 Py_END_ALLOW_THREADS;
126 if (output == NULL) PyErr_NoMemory();
127 GumboDocument* document = &(output->document->v.document);
128
129 if (new_doctype != Py_None && document->has_doctype) {
130 ret = PyObject_CallFunction(new_doctype, "sss", document->name, document->public_identifier, document->system_identifier);
131 if (ret == NULL) { gumbo_destroy_output(output); return NULL; }
132 Py_CLEAR(ret);
133 }
134 ans = as_python_tree(output, &opts, new_tag, new_comment, new_string, append);
135 gumbo_destroy_output(output);
136 return ans;
137 }
138
139
140 static PyObject *
clone_doc(PyObject UNUSED * self,PyObject * capsule)141 clone_doc(PyObject UNUSED *self, PyObject *capsule) {
142 if (!PyCapsule_CheckExact(capsule)) { PyErr_SetString(PyExc_TypeError, "Must specify a capsule as the argument"); return NULL; }
143 libxml_doc *sdoc = PyCapsule_GetPointer(capsule, PyCapsule_GetName(capsule)), *doc;
144 if (sdoc == NULL) return NULL;
145 doc = copy_libxml_doc(sdoc);
146 if (doc == NULL) return PyErr_NoMemory();
147 return encapsulate(doc);
148 }
149
150 static PyMethodDef
151 methods[] = {
152 {"parse", (PyCFunction)(void(*)(void))(PyCFunctionWithKeywords)(parse), METH_VARARGS | METH_KEYWORDS,
153 "parse()\n\nParse specified bytestring which must be in the UTF-8 encoding."
154 },
155
156 {"parse_and_build", (PyCFunction)parse_and_build, METH_VARARGS,
157 "parse_and_build()\n\nParse specified bytestring which must be in the UTF-8 encoding and build a tree using the specified functions."
158 },
159
160 {"clone_doc", clone_doc, METH_O,
161 "clone_doc()\n\nClone the specified document. Which must be a document returned by the parse() function."
162 },
163
164 {NULL, NULL, 0, NULL}
165 };
166
167 #define MODULE_NAME "html_parser"
168 #define MODULE_DOC "HTML parser in C for speed."
169
170 #if PY_MAJOR_VERSION >= 3
171
172 static struct PyModuleDef
173 moduledef = {
174 PyModuleDef_HEAD_INIT,
175 MODULE_NAME,
176 MODULE_DOC,
177 0,
178 methods,
179 NULL,
180 NULL,
181 NULL,
182 NULL
183 };
184
185 #define INITERROR return NULL
186
187 EXPORTED PyMODINIT_FUNC
PyInit_html_parser(void)188 PyInit_html_parser(void) {
189
190 #else
191 #define INITERROR return
192 EXPORTED PyMODINIT_FUNC
193 inithtml_parser(void) {
194 #endif
195 PyObject *m, *known_tag_names, *known_attr_names;
196 #if PY_MAJOR_VERSION >= 3
197 m = PyModule_Create(&moduledef);
198 #else
199 m = Py_InitModule3(MODULE_NAME, methods, MODULE_DOC);
200 #endif
201 if (m == NULL) INITERROR;
202 if (PyModule_AddIntMacro(m, MAJOR) != 0) INITERROR;
203 if (PyModule_AddIntMacro(m, MINOR) != 0) INITERROR;
204 if (PyModule_AddIntMacro(m, PATCH) != 0) INITERROR;
205 if (PyModule_AddIntMacro(m, GUMBO_NAMESPACE_HTML) != 0) INITERROR;
206 if (PyModule_AddIntMacro(m, GUMBO_NAMESPACE_SVG) != 0) INITERROR;
207 if (PyModule_AddIntMacro(m, GUMBO_NAMESPACE_MATHML) != 0) INITERROR;
208 if (PyModule_AddIntConstant(m, "LIBXML_VERSION", get_libxml_version()) != 0) INITERROR;
209 known_tag_names = PyTuple_New(GUMBO_TAG_UNKNOWN);
210 if (known_tag_names == NULL) INITERROR;
211 if (PyModule_AddObject(m, "KNOWN_TAG_NAMES", known_tag_names) != 0) { Py_CLEAR(known_tag_names); INITERROR; }
212 known_attr_names = PyTuple_New(HTML_ATTR_LAST);
213 if (known_attr_names == NULL) INITERROR;
214 if (PyModule_AddObject(m, "KNOWN_ATTR_NAMES", known_attr_names) != 0) { Py_CLEAR(known_attr_names); INITERROR; }
215 if (!set_known_tag_names(known_tag_names, known_attr_names)) { Py_CLEAR(known_tag_names); Py_CLEAR(known_attr_names); INITERROR; }
216 #if PY_MAJOR_VERSION >= 3
217 return m;
218 #endif
219 }
220