1import ctypes
2import io
3import os
4import os.path
5import weakref
6from typing import Optional, Tuple
7
8from tidy.error import InvalidOptionError, OptionArgError
9
# Candidate shared-library names for libtidy.  Loader passes them to
# ctypes.CDLL one at a time, in this order, and keeps the first that loads.
LIBNAMES = (
    # FreeBSD
    "libtidy5.so",
    # Linux
    "libtidy.so",
    # MacOS
    "libtidy.dylib",
    # Windows
    "tidy",
    # Cygwin
    "cygtidy-0-99-0",
    # Linux, full soname
    "libtidy-0.99.so.0",
    # Linux, full soname
    "libtidy-0.99.so.0.0.0",
    # HTML tidy
    "libtidy.so.5",
    # Debian changed soname
    "libtidy.so.5deb1",
    # Windows? (platform not confirmed)
    "libtidy",
    # Windows? (platform not confirmed)
    "tidylib",
)
34
35
class Loader:
    """I am a trivial wrapper that eliminates the need for tidy.tidyFoo,
    so you can just access tidy.Foo

    On construction the candidate library names are tried in order with
    :class:`ctypes.CDLL`; the first one that loads is kept in ``lib``.

    :param libnames: library names to try; ``LIBNAMES`` is used when omitted
                     or empty.
    :raises OSError: when no library could be loaded and the
                     ``IGNORE_MISSING_TIDY`` environment variable is not set.
    """

    def __init__(self, libnames: Optional[Tuple[str, ...]] = None):
        self.lib = None
        self.libnames = libnames or LIBNAMES

        # Add package directory to search path.  PATH may be missing from the
        # environment altogether, so do not index os.environ directly.
        package_dir = os.path.dirname(__file__)
        search_path = os.environ.get("PATH")
        if search_path:
            os.environ["PATH"] = os.pathsep.join((package_dir, search_path))
        else:
            os.environ["PATH"] = package_dir

        # Try loading library, first match wins
        for libname in self.libnames:
            try:
                self.lib = ctypes.CDLL(libname)
            except OSError:
                continue
            break

        if self.lib is None:
            # Fail in case we could not load it, unless explicitly told not to
            if "IGNORE_MISSING_TIDY" not in os.environ:
                raise OSError(
                    "Couldn't find libtidy, please make sure it is installed."
                )
            return

        # Adjust some types: tidyCreate returns a pointer, not the default int
        self.Create.restype = ctypes.POINTER(ctypes.c_void_p)

    def __getattr__(self, name):
        """Resolve ``loader.Foo`` to the ``tidyFoo`` symbol of the library.

        :raises AttributeError: when no library is loaded or the symbol
                                does not exist.
        """
        # Read ``lib`` from the instance dict: going through ``self.lib`` would
        # re-enter __getattr__ forever if the attribute has not been set yet.
        lib = self.__dict__.get("lib")
        if lib is None:
            raise AttributeError(
                f"libtidy is not loaded, cannot resolve 'tidy{name}'"
            )
        return getattr(lib, f"tidy{name}")
68
69
# Module-wide loader; constructing it tries to load libtidy at import time.
_tidy = Loader()
71
72
# ctypes prototype of the byte-output callback: returns c_int and receives
# (c_int sink handle, c_char byte).
_putByteFunction = ctypes.CFUNCTYPE(ctypes.c_int, ctypes.c_int, ctypes.c_char)
74
75
# Callback stored in every _OutputSink structure handed to Tidylib
@_putByteFunction
def putByte(handle, char):
    """Forward one byte to the sink registered under ``handle``.

    Always returns 0 to the C caller.
    """
    target = sinkfactory[handle]
    target.putByte(char)
    return 0
82
83
class _OutputSink(ctypes.Structure):
    """C structure passed by reference to ``tidySetErrorSink``.

    ``sinkData`` carries the integer handle of the Python sink and
    ``putByte`` the callback that receives the output bytes.
    Presumably mirrors libtidy's ``TidyOutputSink`` layout - TODO confirm.
    """

    _fields_ = [("sinkData", ctypes.c_int), ("putByte", _putByteFunction)]
86
87
88class _Sink:
89    def __init__(self, handle):
90        self._data = io.BytesIO()
91        self.struct = _OutputSink()
92        self.struct.putByte = putByte
93        self.handle = handle
94
95    def putByte(self, byte):
96        self._data.write(byte)
97
98    def getvalue(self):
99        return self._data.getvalue()
100
101
class ReportItem:
    """
    Error report item as returned by tidy.

    Two layouts are understood: ``line N column M - Severity: message`` and
    ``Severity: message``.  A line that starts with ``line`` but does not
    follow the first layout is parsed with the second one instead of raising.

    :attribute severity: First character of the severity word
                         (D, W, E or C for the known severities)
    :attribute full_severity: Severity word as it appeared in the report
    :attribute line: Line where error was fired (can be None)
    :attribute col: Column where error was fired (can be None)
    :attribute message: Error message itself
    :attribute err: Whole error message as returned by tidy
    """

    severities = {"W": "Warning", "E": "Error", "C": "Config", "D": "Document"}

    def __init__(self, err):
        # TODO - parse emacs mode
        self.err = err
        if err.startswith("line") and self._parse_located(err):
            return

        # Generic layout: first word is the severity, the rest is the message.
        # partition() keeps this safe for input without any space at all.
        head, _, rest = err.partition(" ")
        self.full_severity = head
        self.severity = head[:1]
        self.message = rest
        self.line = None
        self.col = None

    def _parse_located(self, err):
        """Parse the ``line N column M - Severity: message`` layout.

        Sets the attributes and returns True on success; returns False and
        leaves the instance untouched when ``err`` does not fit the layout.
        """
        tokens = err.split(" ", 6)
        try:
            line = int(tokens[1])
            col = int(tokens[3])
            full_severity = tokens[5]
            message = tokens[6]
        except (IndexError, ValueError):
            return False
        if not full_severity:
            return False

        self.full_severity = full_severity
        self.severity = full_severity[0]  # W, E or C
        self.line = line
        self.col = col
        self.message = message
        return True

    def get_severity(self):
        """Return the human readable severity name.

        Unknown severities fall back to the severity word from the report
        with surrounding whitespace and the trailing colon removed.
        """
        label = self.severities.get(self.severity)
        if label is None:
            return self.full_severity.strip().rstrip(":")
        return label

    def __str__(self):
        if self.line is not None:
            return "line {} col {} - {}: {}".format(
                self.line, self.col, self.get_severity(), self.message
            )
        return f"{self.get_severity()}: {self.message}"

    def __repr__(self):
        escaped = str(self).replace("'", "\\'")
        return f"{self.__class__.__name__}('{escaped}')"
148
149
class FactoryDict(dict):
    """A dict that rejects ``d[key] = value`` so it controls its own keys.

    New entries are meant to come from :meth:`create`; factory code stores
    them through :meth:`_setitem`.
    """

    def create(self):
        """Subclasses should implement me to generate a new item"""

    def __setitem__(self, name, value):
        # Plain item assignment is deliberately blocked.
        raise TypeError("Use create() to get a new object")

    def _setitem(self, name, value):
        """Store ``value`` under ``name``, bypassing the blocked assignment."""
        super().__setitem__(name, value)
163
164
class SinkFactory(FactoryDict):
    """Mapping for lookup of sinks by handle

    Handles are consecutive integers starting at 0; ``lastsink`` holds the
    handle the next created sink will receive.
    """

    def __init__(self):
        super().__init__()
        self.lastsink = 0

    def create(self):
        """Build a new sink, register it under a fresh handle and return it."""
        handle = self.lastsink
        new_sink = _Sink(handle)
        # The C side only sees the integer handle stored in the structure.
        new_sink.struct.sinkData = handle
        FactoryDict._setitem(self, handle, new_sink)
        self.lastsink = handle + 1
        return new_sink
178
179
# Module-wide registry of sinks, keyed by the integer handle given to each one.
sinkfactory = SinkFactory()
181
182
class Document:
    """
    Document object as returned by :func:`parseString` or :func:`parse`.

    Holds one tidy document handle together with the sink that collects the
    report text tidy emits for it.

    :attribute cdoc: handle returned by ``tidyCreate``
    :attribute options: mapping of option names to values given at creation
    :attribute errsink: sink collecting the report output for this document
    """

    def __init__(self, options):
        """
        :param options: mapping of tidy option names to values; underscores
                        in names are sent to tidy as dashes.

        Raises the exception class ``ERROR_MAP`` associates with tidy's
        report when an option is rejected.
        """
        self.cdoc = _tidy.Create()
        self.options = options
        self.errsink = sinkfactory.create()
        _tidy.SetErrorSink(self.cdoc, ctypes.byref(self.errsink.struct))
        try:
            self._set_options()
        except BaseException:
            # Construction failed, so nobody else will ever see this handle:
            # release it here instead of leaking it.
            _tidy.Release(self.cdoc)
            raise

    def _set_options(self):
        """Send every option to tidy and raise on the first rejected one."""
        for key, value in self.options.items():

            # this will flush out most argument type errors...
            if value is None:
                value = ""

            _tidy.OptParseValue(
                self.cdoc,
                key.replace("_", "-").encode("utf-8"),
                str(value).encode("utf-8"),
            )

            # The ``errors`` property re-parses the whole sink on each access,
            # so read it once per option.
            errors = self.errors
            if not errors:
                continue
            message = errors[-1].message
            for prefix, exc_class in ERROR_MAP.items():
                if message.startswith(prefix):
                    raise exc_class(message)

    def __del__(self):
        # __init__ may have failed before the sink was created, and the sink
        # may already be gone from the registry; neither should raise here.
        errsink = getattr(self, "errsink", None)
        if errsink is None:
            return
        if sinkfactory is not None:
            sinkfactory.pop(errsink.handle, None)

    def write(self, stream):
        """
        :param stream: Writable file like object.

        Writes document to the stream.
        """
        stream.write(self.getvalue())

    def get_errors(self):
        """
        Returns list of errors as a list of :class:`ReportItem`.
        """
        report = self.errsink.getvalue().decode("utf-8")
        stripped = (line.strip() for line in report.splitlines())
        return [ReportItem(line) for line in stripped if line]

    errors = property(get_errors)

    def getvalue(self):
        """Raw string as returned by tidy."""
        stlen = ctypes.c_int(8192)
        string_buffer = ctypes.create_string_buffer(stlen.value)
        result = _tidy.SaveString(self.cdoc, string_buffer, ctypes.byref(stlen))
        if result == -12:  # buffer too small
            # tidy stored the required size in stlen; retry with that size.
            string_buffer = ctypes.create_string_buffer(stlen.value)
            _tidy.SaveString(self.cdoc, string_buffer, ctypes.byref(stlen))
        # Slice by the reported length rather than using ``.value``, which
        # stops at the first NUL byte and would truncate e.g. UTF-16 output.
        return string_buffer.raw[: stlen.value]

    def gettext(self):
        """Unicode text for output returned by tidy."""
        return self.getvalue().decode(self.options["output_encoding"])

    def __str__(self):
        return self.gettext()
253
254
# Maps the leading text of a tidy report message to the exception class that
# Document._set_options raises when the latest report starts with that text.
ERROR_MAP = {
    "missing or malformed argument for option: ": OptionArgError,
    "unknown option: ": InvalidOptionError,
}
259
260
class DocumentFactory(FactoryDict):
    """Creates :class:`Document` objects and releases their tidy handles.

    Each created document is tracked through a weak reference mapped to its
    tidy handle; when the document is garbage collected the handle is
    released and the entry is dropped.
    """

    @staticmethod
    def load(doc, arg, loader):
        """Run ``loader(doc.cdoc, arg)`` and clean the document afterwards.

        :param doc: the :class:`Document` to fill
        :param arg: bytes argument for the loader (file name or markup)
        :param loader: tidy parse function to call
        """
        status = loader(doc.cdoc, arg)
        # NOTE(review): libtidy's documented usage runs tidyCleanAndRepair
        # whenever parsing returns >= 0; here a status of 0 skips it and a
        # negative status is ignored silently - confirm this is intended.
        if status > 0:
            _tidy.CleanAndRepair(doc.cdoc)

    def loadFile(self, doc, filename):
        """Parse the file named ``filename`` into ``doc``."""
        self.load(doc, filename.encode("utf-8"), _tidy.ParseFile)

    def loadString(self, doc, text):
        """Parse the bytes ``text`` into ``doc``."""
        self.load(doc, text, _tidy.ParseString)

    def _create(self, **kwargs):
        """Build a :class:`Document` for ``kwargs`` and start tracking it.

        ``input_encoding`` and ``output_encoding`` default to the character
        encoding option, or to ``utf8`` when that is not given either.
        """
        # Keyword callers can only spell the option ``char_encoding``; the
        # dashed spelling is still honoured for callers passing a dict.
        enc = kwargs.get("char_encoding") or kwargs.get("char-encoding") or "utf8"
        kwargs.setdefault("output_encoding", enc)
        kwargs.setdefault("input_encoding", enc)
        doc = Document(kwargs)
        ref = weakref.ref(doc, self.releaseDoc)
        FactoryDict._setitem(self, ref, doc.cdoc)
        return doc

    def parse(self, filename, **kwargs):
        """
        :param kwargs: named options to pass to TidyLib for processing the
                       input file.
        :param filename: the name of a file to process
        :return: a :class:`Document` object

        Open and process filename as an HTML file, returning a
        processed document object.
        """
        doc = self._create(**kwargs)
        self.loadFile(doc, filename)
        return doc

    def parseString(self, text, **kwargs):
        """
        :param kwargs: named options to pass to TidyLib for processing the
                       input file.
        :param text: the string to parse
        :return: a :class:`Document` object

        Use text as an HTML file, and process it, returning a
        document object.
        """
        doc = self._create(**kwargs)
        if isinstance(text, str):
            text = text.encode(doc.options["input_encoding"])
        self.loadString(doc, text)
        return doc

    def releaseDoc(self, ref):
        """Weak reference callback: release the tidy handle behind ``ref``.

        The entry is removed as well, otherwise the registry would keep one
        dead reference per document ever created.
        """
        cdoc = self.pop(ref, None)
        if cdoc is not None:
            _tidy.Release(cdoc)
317
318
# Module-wide factory and the public entry points bound to it.
docfactory = DocumentFactory()
parse = docfactory.parse
parseString = docfactory.parseString
322