"""ctypes-based wrapper around the HTML Tidy shared library (libtidy)."""
import ctypes
import io
import os
import os.path
import weakref
from typing import Optional, Tuple

from tidy.error import InvalidOptionError, OptionArgError

LIBNAMES = (
    # FreeBSD
    "libtidy5.so",
    # Linux
    "libtidy.so",
    # MacOS
    "libtidy.dylib",
    # Windows
    "tidy",
    # Cygwin
    "cygtidy-0-99-0",
    # Linux, full soname
    "libtidy-0.99.so.0",
    # Linux, full soname
    "libtidy-0.99.so.0.0.0",
    # HTML tidy
    "libtidy.so.5",
    # Debian changed soname
    "libtidy.so.5deb1",
    # Windows?
    "libtidy",
    # Windows?
    "tidylib",
)


class Loader:
    """I am a trivial wrapper that eliminates the need for tidy.tidyFoo,
    so you can just access tidy.Foo

    :param libnames: Candidate library names tried in order; defaults to
        :data:`LIBNAMES` when omitted or empty.
    :raises OSError: If none of the candidates can be loaded and the
        ``IGNORE_MISSING_TIDY`` environment variable is not set.
    """

    def __init__(self, libnames: Optional[Tuple[str, ...]] = None):
        self.lib = None
        self.libnames = libnames or LIBNAMES

        # Add package directory to search path. PATH may be absent from the
        # environment entirely, so do not index os.environ directly.
        package_dir = os.path.dirname(__file__)
        search_path = os.environ.get("PATH")
        os.environ["PATH"] = (
            package_dir
            if search_path is None
            else "".join((package_dir, os.pathsep, search_path))
        )

        # Try loading library; the first name that loads wins
        for libname in self.libnames:
            try:
                self.lib = ctypes.CDLL(libname)
                break
            except OSError:
                continue

        # Fail in case we could not load it
        if self.lib is None and "IGNORE_MISSING_TIDY" not in os.environ:
            raise OSError("Couldn't find libtidy, please make sure it is installed.")

        # Adjust some types
        if self.lib is not None:
            self.Create.restype = ctypes.POINTER(ctypes.c_void_p)

    def __getattr__(self, name):
        """Resolve ``Foo`` to the ``tidyFoo`` symbol of the loaded library.

        :raises AttributeError: If the library is not loaded or the symbol
            lookup on it fails.
        """
        # __getattr__ only runs when normal lookup fails; if our own
        # attributes are missing, bail out instead of recursing via self.lib.
        if name in ("lib", "libnames"):
            raise AttributeError(name)
        if self.lib is None:
            raise AttributeError(f"libtidy is not loaded; cannot resolve 'tidy{name}'")
        return getattr(self.lib, f"tidy{name}")


_tidy = Loader()


_putByteFunction = ctypes.CFUNCTYPE(ctypes.c_int, ctypes.c_int, ctypes.c_char)


# define a callback to pass to Tidylib
@_putByteFunction
def putByte(handle, char):
    """Lookup sink by handle and call its putByte method"""
    sinkfactory[handle].putByte(char)
    return 0


class _OutputSink(ctypes.Structure):
    # sinkData carries the integer handle used to find the Python-side sink;
    # putByte is the callback invoked with that handle and one byte.
    _fields_ = [("sinkData", ctypes.c_int), ("putByte", _putByteFunction)]


class _Sink:
    """In-memory byte collector paired with the C structure handed to tidy."""

    def __init__(self, handle):
        self._data = io.BytesIO()
        self.struct = _OutputSink()
        self.struct.putByte = putByte
        self.handle = handle

    def putByte(self, byte):
        """Append ``byte`` to the collected data."""
        self._data.write(byte)

    def getvalue(self):
        """Return everything collected so far as ``bytes``."""
        return self._data.getvalue()


class ReportItem:
    """
    Error report item as returned by tidy.

    :attribute severity: D, W, E or C indicating severity
    :attribute line: Line where error was fired (can be None)
    :attribute col: Column where error was fired (can be None)
    :attribute message: Error message itself
    :attribute err: Whole error message as returned by tidy
    """

    severities = {"W": "Warning", "E": "Error", "C": "Config", "D": "Document"}

    def __init__(self, err):
        # TODO - parse emacs mode
        self.err = err
        if err.startswith("line") and self._parse_located(err):
            return
        # Generic form: "<Severity>: <message>". partition() tolerates a
        # report that contains no space at all (message becomes "").
        severity_token, _, message = err.partition(" ")
        self.full_severity = severity_token
        self.severity = severity_token[:1]
        self.message = message
        self.line = None
        self.col = None

    def _parse_located(self, err):
        """Parse ``line N column M - Severity: message``.

        Sets the instance attributes and returns True on success; returns
        False, leaving the instance untouched, when ``err`` does not have
        that shape.
        """
        tokens = err.split(" ", 6)
        try:
            line = int(tokens[1])
            col = int(tokens[3])
            full_severity = tokens[5]
            severity = full_severity[0]  # W, E or C
            message = tokens[6]
        except (IndexError, ValueError):
            return False
        self.full_severity = full_severity
        self.severity = severity
        self.line = line
        self.col = col
        self.message = message
        return True

    def get_severity(self):
        """Return the long severity name, or the raw token when unknown."""
        try:
            return self.severities[self.severity]
        except KeyError:
            return self.full_severity.strip().rstrip(":")

    def __str__(self):
        if self.line is not None:
            return "line {} col {} - {}: {}".format(
                self.line, self.col, self.get_severity(), self.message
            )
        return f"{self.get_severity()}: {self.message}"

    def __repr__(self):
        return "{}('{}')".format(self.__class__.__name__, str(self).replace("'", "\\'"))


class FactoryDict(dict):
    """I am a dict with a create method and no __setitem__.  This allows
    me to control my own keys.
    """

    def create(self):
        """Subclasses should implement me to generate a new item"""

    def _setitem(self, name, value):
        # Internal back door used by subclasses; bypasses the blocked
        # public __setitem__.
        dict.__setitem__(self, name, value)

    def __setitem__(self, name, value):
        raise TypeError("Use create() to get a new object")


class SinkFactory(FactoryDict):
    """Mapping for lookup of sinks by handle"""

    def __init__(self):
        super().__init__()
        # Next handle to hand out; incremented on every create().
        self.lastsink = 0

    def create(self):
        """Create a sink, register it under a fresh handle and return it."""
        sink = _Sink(self.lastsink)
        sink.struct.sinkData = self.lastsink
        FactoryDict._setitem(self, self.lastsink, sink)
        self.lastsink = self.lastsink + 1
        return sink


sinkfactory = SinkFactory()


class Document:
    """
    Document object as returned by :func:`parseString` or :func:`parse`.
    """

    def __init__(self, options):
        self.cdoc = _tidy.Create()
        self.options = options
        self.errsink = sinkfactory.create()
        _tidy.SetErrorSink(self.cdoc, ctypes.byref(self.errsink.struct))
        self._set_options()

    def _set_options(self):
        """Pass every option to tidy, raising on a reported option error.

        :raises OptionArgError, InvalidOptionError: When the most recent
            report message starts with a prefix listed in ``ERROR_MAP``.
        """
        for key, value in self.options.items():

            # this will flush out most argument type errors...
            if value is None:
                value = ""

            _tidy.OptParseValue(
                self.cdoc,
                key.replace("_", "-").encode("utf-8"),
                str(value).encode("utf-8"),
            )
            # The errors property re-parses the whole sink on each access,
            # so read it once per option.
            errors = self.errors
            if errors:
                last_message = errors[-1].message
                for prefix, exc_class in ERROR_MAP.items():
                    if last_message.startswith(prefix):
                        raise exc_class(last_message)

    def __del__(self):
        # __init__ may have failed before errsink was assigned, and module
        # globals may already be torn down at interpreter exit; a finalizer
        # must not raise in either case.
        errsink = getattr(self, "errsink", None)
        if errsink is None or sinkfactory is None:
            return
        sinkfactory.pop(errsink.handle, None)

    def write(self, stream):
        """
        :param stream: Writable file like object.

        Writes document to the stream.
        """
        stream.write(self.getvalue())

    def get_errors(self):
        """
        Returns list of errors as a list of :class:`ReportItem`.
        """
        report = self.errsink.getvalue().decode("utf-8")
        stripped = (line.strip() for line in report.splitlines())
        return [ReportItem(line) for line in stripped if line]

    errors = property(get_errors)

    def getvalue(self):
        """Raw string as returned by tidy."""
        stlen = ctypes.c_int(8192)
        string_buffer = ctypes.create_string_buffer(stlen.value)
        result = _tidy.SaveString(self.cdoc, string_buffer, ctypes.byref(stlen))
        if result == -12:  # buffer too small
            # stlen is passed by reference; presumably tidy has written the
            # required size into it - retry with a buffer of that size.
            string_buffer = ctypes.create_string_buffer(stlen.value)
            _tidy.SaveString(self.cdoc, string_buffer, ctypes.byref(stlen))
        return string_buffer.value

    def gettext(self):
        """Unicode text for output returned by tidy."""
        return self.getvalue().decode(self.options["output_encoding"])

    def __str__(self):
        return self.gettext()


ERROR_MAP = {
    "missing or malformed argument for option: ": OptionArgError,
    "unknown option: ": InvalidOptionError,
}


class DocumentFactory(FactoryDict):
    """Creates :class:`Document` objects and releases their tidy handles.

    Maps a weak reference to each live document onto its ``cdoc`` handle so
    the handle can be released once the document is garbage collected.
    """

    @staticmethod
    def load(doc, arg, loader):
        """Run ``loader`` on the document; clean and repair if status > 0."""
        status = loader(doc.cdoc, arg)
        if status > 0:
            _tidy.CleanAndRepair(doc.cdoc)

    def loadFile(self, doc, filename):
        """Parse the file named ``filename`` into ``doc``."""
        self.load(doc, filename.encode("utf-8"), _tidy.ParseFile)

    def loadString(self, doc, text):
        """Parse ``text`` into ``doc``."""
        self.load(doc, text, _tidy.ParseString)

    def _create(self, **kwargs):
        """Build a :class:`Document`, defaulting the input/output encodings."""
        # NOTE(review): only the hyphenated "char-encoding" key is consulted;
        # callers using keyword syntax would pass char_encoding - confirm
        # whether that spelling should be honoured here too.
        enc = kwargs.get("char-encoding", "utf8")
        kwargs.setdefault("output_encoding", enc)
        kwargs.setdefault("input_encoding", enc)
        doc = Document(kwargs)
        # The weakref callback fires when doc is collected and releases cdoc.
        ref = weakref.ref(doc, self.releaseDoc)
        FactoryDict._setitem(self, ref, doc.cdoc)
        return doc

    def parse(self, filename, **kwargs):
        """
        :param kwargs: named options to pass to TidyLib for processing the
                       input file.
        :param filename: the name of a file to process
        :return: a :class:`Document` object

        Open and process filename as an HTML file, returning a
        processed document object.
        """
        doc = self._create(**kwargs)
        self.loadFile(doc, filename)
        return doc

    def parseString(self, text, **kwargs):
        """
        :param kwargs: named options to pass to TidyLib for processing the
                       input file.
        :param text: the string to parse
        :return: a :class:`Document` object

        Use text as an HTML file, and process it, returning a
        document object.
        """
        doc = self._create(**kwargs)
        if isinstance(text, str):
            text = text.encode(doc.options["input_encoding"])
        self.loadString(doc, text)
        return doc

    def releaseDoc(self, ref):
        """Release the tidy handle registered under the dead weakref ``ref``."""
        # Pop rather than index so the mapping does not keep one stale entry
        # for every document ever created.
        _tidy.Release(self.pop(ref))


docfactory = DocumentFactory()
parse = docfactory.parse
parseString = docfactory.parseString