# Licensed under a 3-clause BSD style license - see LICENSE.rst
"""An extensible HTML table reader and writer.

html.py:
  Classes to read and write HTML tables

`BeautifulSoup <http://www.crummy.com/software/BeautifulSoup/>`_
must be installed to read HTML tables.
"""

import warnings

from . import core
from astropy.table import Column
from astropy.utils.xml import writer

from copy import deepcopy


class SoupString(str):
    """
    Allows for strings to hold BeautifulSoup data.

    The instance behaves like an ordinary ``str`` built from the
    constructor arguments, and additionally keeps the original value
    in the ``soup`` attribute.
    """

    def __new__(cls, *args, **kwargs):
        """Build the underlying ``str`` from the given arguments."""
        return str.__new__(cls, *args, **kwargs)

    def __init__(self, val):
        """Keep a reference to the original value as ``self.soup``."""
        self.soup = val


class ListWriter:
    """
    Allows for XMLWriter to write to a list instead of a file.
    """

    def __init__(self, out):
        """Store ``out``, the list-like object that receives the data."""
        self.out = out

    def write(self, data):
        """Append ``data`` to the wrapped list."""
        self.out.append(data)


def identify_table(soup, htmldict, numtable):
    """
    Checks whether the given BeautifulSoup tag is the table
    the user intends to process.

    Parameters
    ----------
    soup : tag or None
        Candidate tag; anything that is ``None`` or whose ``name`` is not
        ``'table'`` is rejected.
    htmldict : dict
        Reader options; only the ``'table_id'`` key is consulted.
    numtable : int
        1-based position of ``soup`` among the tables of the document.

    Returns
    -------
    bool
        True if ``soup`` is the requested table.
    """

    if soup is None or soup.name != 'table':
        return False  # Tag is not a <table>

    elif 'table_id' not in htmldict:
        # No explicit selection: take the first table
        return numtable == 1
    table_id = htmldict['table_id']

    if isinstance(table_id, str):
        # String IDs are matched against the HTML ``id`` attribute
        return 'id' in soup.attrs and soup['id'] == table_id
    elif isinstance(table_id, int):
        # Integer IDs are matched against the 1-based table position
        return table_id == numtable

    # Return False if an invalid parameter is given
    return False


class HTMLInputter(core.BaseInputter):
    """
    Input lines of HTML in a valid form.

    This requires `BeautifulSoup
    <http://www.crummy.com/software/BeautifulSoup/>`_ to be installed.
    """

    def process_lines(self, lines):
        """
        Convert the given input into a list of SoupString rows
        for further processing.

        Parameters
        ----------
        lines : list of str
            Lines of the HTML document; they are joined with newlines
            before being parsed.

        Returns
        -------
        list of SoupString
            One entry per ``<tr>`` element of the selected table.

        Raises
        ------
        core.OptionalTableImportError
            If BeautifulSoup (``bs4``) cannot be imported.
        core.InconsistentTableError
            If no table matching ``self.html['table_id']`` is found.
        """

        try:
            from bs4 import BeautifulSoup
        except ImportError as err:
            # Chain the original error so the import failure stays visible
            raise core.OptionalTableImportError('BeautifulSoup must be '
                                                'installed to read HTML tables') from err

        if 'parser' not in self.html:
            with warnings.catch_warnings():
                # Ignore bs4 parser warning #4550.
                warnings.filterwarnings('ignore', '.*no parser was explicitly specified.*')
                soup = BeautifulSoup('\n'.join(lines))
        else:  # use a custom backend parser
            soup = BeautifulSoup('\n'.join(lines), self.html['parser'])
        tables = soup.find_all('table')
        for i, possible_table in enumerate(tables):
            if identify_table(possible_table, self.html, i + 1):
                table = possible_table  # Find the correct table
                break
        else:
            # Loop finished without ``break``: no table matched
            if isinstance(self.html['table_id'], int):
                err_descr = f"number {self.html['table_id']}"
            else:
                err_descr = f"id '{self.html['table_id']}'"
            raise core.InconsistentTableError(
                f'ERROR: HTML table {err_descr} not found')

        # Get all table rows
        soup_list = [SoupString(x) for x in table.find_all('tr')]

        return soup_list


class HTMLSplitter(core.BaseSplitter):
    """
    Split HTML table data.
    """

    def __call__(self, lines):
        """
        Return HTML data from lines as a generator.

        For every row, a list of stripped ``<th>`` texts is yielded if the
        row has header cells (cells with a ``colspan`` attribute are
        yielded as ``(text, colspan)`` tuples), followed by a list of
        stripped ``<td>`` texts if the row has data cells.

        Raises
        ------
        TypeError
            If an element of ``lines`` is not a `SoupString`.
        core.InconsistentTableError
            If ``lines`` is empty.
        """
        for line in lines:
            if not isinstance(line, SoupString):
                raise TypeError('HTML lines should be of type SoupString')
            soup = line.soup
            header_elements = soup.find_all('th')
            if header_elements:
                # Return multicolumns as tuples for HTMLHeader handling
                yield [(el.text.strip(), el['colspan']) if el.has_attr('colspan')
                       else el.text.strip() for el in header_elements]
            data_elements = soup.find_all('td')
            if data_elements:
                yield [el.text.strip() for el in data_elements]
        if len(lines) == 0:
            raise core.InconsistentTableError('HTML tables must contain data '
                                              'in a <table> tag')


class HTMLOutputter(core.TableOutputter):
    """
    Output the HTML data as an ``astropy.table.Table`` object.

    This subclass allows for the final table to contain
    multidimensional columns (defined using the colspan attribute
    of <th>).
    """

    default_converters = [core.convert_numpy(int),
                          core.convert_numpy(float),
                          core.convert_numpy(str)]

    def __call__(self, cols, meta):
        """
        Process the data in multidimensional columns.

        Columns carrying a ``colspan`` attribute are merged with the
        following ``colspan - 1`` columns into a single column whose string
        values are tuples; all other columns are passed through unchanged.
        The resulting list is handed to the parent outputter.
        """
        new_cols = []
        col_num = 0

        while col_num < len(cols):
            col = cols[col_num]
            if hasattr(col, 'colspan'):
                # Join elements of spanned columns together into list of tuples
                span_cols = cols[col_num:col_num + col.colspan]
                new_col = core.Column(col.name)
                new_col.str_vals = list(zip(*[x.str_vals for x in span_cols]))
                new_cols.append(new_col)
                col_num += col.colspan
            else:
                new_cols.append(col)
                col_num += 1

        return super().__call__(new_cols, meta)


class HTMLHeader(core.BaseHeader):
    """Header handling for HTML tables, based on ``<th>`` cells."""

    splitter_class = HTMLSplitter

    def start_line(self, lines):
        """
        Return the line number at which header data begins.

        This is the index of the first row containing a ``<th>`` element,
        or None if there is no such row.

        Raises
        ------
        TypeError
            If an element of ``lines`` is not a `SoupString`.
        """

        for i, line in enumerate(lines):
            if not isinstance(line, SoupString):
                raise TypeError('HTML lines should be of type SoupString')
            soup = line.soup
            if soup.th is not None:
                return i

        return None

    def _set_cols_from_names(self):
        """
        Set columns from header names, handling multicolumns appropriately.

        A name given as a ``(name, colspan)`` tuple produces one column
        tagged with ``colspan`` followed by ``colspan - 1`` empty-named
        placeholder columns. ``self.names`` is rewritten to the flat list
        of resulting names.
        """
        self.cols = []
        new_names = []

        for name in self.names:
            if isinstance(name, tuple):
                col = core.Column(name=name[0])
                col.colspan = int(name[1])
                self.cols.append(col)
                new_names.append(name[0])
                for i in range(1, int(name[1])):
                    # Add dummy columns
                    self.cols.append(core.Column(''))
                    new_names.append('')
            else:
                self.cols.append(core.Column(name=name))
                new_names.append(name)

        self.names = new_names


class HTMLData(core.BaseData):
    """Data handling for HTML tables, based on ``<td>`` cells."""

    splitter_class = HTMLSplitter

    def start_line(self, lines):
        """
        Return the line number at which table data begins.

        Raises
        ------
        TypeError
            If an element of ``lines`` is not a `SoupString`.
        core.InconsistentTableError
            If the first row with ``<td>`` cells also has ``<th>`` cells,
            or if no row contains a ``<td>`` cell.
        """

        for i, line in enumerate(lines):
            if not isinstance(line, SoupString):
                raise TypeError('HTML lines should be of type SoupString')
            soup = line.soup

            if soup.td is not None:
                if soup.th is not None:
                    raise core.InconsistentTableError('HTML tables cannot '
                                                      'have headings and data in the same row')
                return i

        raise core.InconsistentTableError('No start line found for HTML data')

    def end_line(self, lines):
        """
        Return the line number at which table data ends.

        This is one past the index of the last row containing a ``<td>``
        element, or None if no row contains one.

        Raises
        ------
        TypeError
            If an element of ``lines`` is not a `SoupString`.
        """
        last_index = -1

        for i, line in enumerate(lines):
            if not isinstance(line, SoupString):
                raise TypeError('HTML lines should be of type SoupString')
            soup = line.soup
            if soup.td is not None:
                last_index = i

        if last_index == -1:
            return None
        return last_index + 1


class HTML(core.BaseReader):
    """HTML format table.

    In order to customize input and output, a dict of parameters may
    be passed to this class holding specific customizations.

    **htmldict** : Dictionary of parameters for HTML input/output.

        * css : Customized styling
            If present, this parameter will be included in a <style>
            tag and will define stylistic attributes of the output.

        * table_id : ID for the input table
            If a string, this defines the HTML id of the table to be processed.
            If an integer, this specifies the index of the input table in the
            available tables. Unless this parameter is given, the reader will
            use the first table found in the input file.

        * multicol : Use multi-dimensional columns for output
            The writer will output tuples as elements of multi-dimensional
            columns if this parameter is true, and if not then it will
            use the syntax 1.36583e-13 .. 1.36583e-13 for output. If not
            present, this parameter will be true by default.

        * raw_html_cols : column name or list of names with raw HTML content
            This allows one to include raw HTML content in the column output,
            for instance to include link references in a table.  This option
            requires that the bleach package be installed.  Only whitelisted
            tags are allowed through for security reasons (see the
            raw_html_clean_kwargs arg).

        * raw_html_clean_kwargs : dict of keyword args controlling HTML cleaning
            Raw HTML will be cleaned to prevent unsafe HTML from ending up in
            the table output.  This is done by calling ``bleach.clean(data,
            **raw_html_clean_kwargs)``.  For details on the available options
            (e.g. tag whitelist) see:
            https://bleach.readthedocs.io/en/latest/clean.html

        * parser : Specific HTML parsing library to use
            If specified, this specifies which HTML parsing library
            BeautifulSoup should use as a backend. The options to choose
            from are 'html.parser' (the standard library parser), 'lxml'
            (the recommended parser), 'xml' (lxml's XML parser), and
            'html5lib'. html5lib is a highly lenient parser and therefore
            might work correctly for unusual input if a different parser
            fails.

        * jsfiles : list of js files to include when writing table.

        * cssfiles : list of css files to include when writing table.

        * js : js script to include in the body when writing table.

        * table_class : css class for the table

    """

    _format_name = 'html'
    _io_registry_format_aliases = ['html']
    _io_registry_suffix = '.html'
    _description = 'HTML table'

    header_class = HTMLHeader
    data_class = HTMLData
    inputter_class = HTMLInputter

    max_ndim = 2  # HTML supports writing 2-d columns with shape (n, m)

    def __init__(self, htmldict={}):
        """
        Initialize classes for HTML reading and writing.

        Parameters
        ----------
        htmldict : dict, optional
            Options described in the class docstring. The dict is
            deep-copied, so neither the caller's dict nor the shared
            default is ever modified. ``multicol`` defaults to True and
            ``table_id`` to 1 when absent.
        """
        super().__init__()
        self.html = deepcopy(htmldict)
        if 'multicol' not in htmldict:
            self.html['multicol'] = True
        if 'table_id' not in htmldict:
            self.html['table_id'] = 1
        self.inputter.html = self.html

    def read(self, table):
        """
        Read the ``table`` in HTML format and return a resulting ``Table``.
        """

        self.outputter = HTMLOutputter()
        return super().read(table)

    def write(self, table):
        """
        Return data in ``table`` converted to HTML as a list of strings.

        Parameters
        ----------
        table : table-like
            Table to serialize; its columns are taken from
            ``table.columns.values()``.

        Returns
        -------
        list of str
            A single-element list holding the whole HTML document.
        """
        # Check that table has only 1-d or 2-d columns. Above that fails.
        self._check_multidim_table(table)

        cols = list(table.columns.values())

        self.data.header.cols = cols
        # The data object needs the columns too, so that the per-column
        # ``formats`` option can be applied below.
        self.data.cols = cols

        if isinstance(self.data.fill_values, tuple):
            self.data.fill_values = [self.data.fill_values]

        self.data._set_fill_values(cols)
        # Apply user-supplied ``formats`` to the columns before their string
        # values are generated; without this the option is silently ignored.
        self.data._set_col_formats()

        lines = []

        # Set HTML escaping to False for any column in the raw_html_cols input
        raw_html_cols = self.html.get('raw_html_cols', [])
        if isinstance(raw_html_cols, str):
            raw_html_cols = [raw_html_cols]  # Allow for a single string as input
        cols_escaped = [col.info.name not in raw_html_cols for col in cols]

        # Kwargs that get passed on to bleach.clean() if that is available.
        raw_html_clean_kwargs = self.html.get('raw_html_clean_kwargs', {})

        # Use XMLWriter to output HTML to lines
        w = writer.XMLWriter(ListWriter(lines))

        with w.tag('html'):
            with w.tag('head'):
                # Declare encoding and set CSS style for table
                with w.tag('meta', attrib={'charset': 'utf-8'}):
                    pass
                with w.tag('meta', attrib={'http-equiv': 'Content-type',
                                           'content': 'text/html;charset=UTF-8'}):
                    pass
                if 'css' in self.html:
                    with w.tag('style'):
                        w.data(self.html['css'])
                if 'cssfiles' in self.html:
                    for filename in self.html['cssfiles']:
                        with w.tag('link', rel="stylesheet", href=filename, type='text/css'):
                            pass
                if 'jsfiles' in self.html:
                    for filename in self.html['jsfiles']:
                        with w.tag('script', src=filename):
                            w.data('')  # need this instead of pass to get <script></script>
            with w.tag('body'):
                if 'js' in self.html:
                    with w.xml_cleaning_method('none'):
                        with w.tag('script'):
                            w.data(self.html['js'])
                # Only a string table_id is emitted as the HTML id attribute
                if isinstance(self.html['table_id'], str):
                    html_table_id = self.html['table_id']
                else:
                    html_table_id = None
                if 'table_class' in self.html:
                    html_table_class = self.html['table_class']
                    attrib = {"class": html_table_class}
                else:
                    attrib = {}
                with w.tag('table', id=html_table_id, attrib=attrib):
                    with w.tag('thead'):
                        with w.tag('tr'):
                            for col in cols:
                                if len(col.shape) > 1 and self.html['multicol']:
                                    # Set colspan attribute for multicolumns
                                    w.start('th', colspan=col.shape[1])
                                else:
                                    w.start('th')
                                w.data(col.info.name.strip())
                                w.end(indent=False)

                    col_str_iters = []
                    new_cols_escaped = []

                    # Make a container to hold any new_col objects created
                    # below for multicolumn elements.  This is purely to
                    # maintain a reference for these objects during
                    # subsequent iteration to format column values.  This
                    # requires that the weakref info._parent be maintained.
                    new_cols = []

                    for col, col_escaped in zip(cols, cols_escaped):
                        if len(col.shape) > 1 and self.html['multicol']:
                            span = col.shape[1]
                            for i in range(span):
                                # Split up multicolumns into separate columns
                                # NOTE(review): new_col is a fresh Column, so it
                                # presumably does not inherit col.info.format --
                                # confirm whether formats should apply here.
                                new_col = Column([el[i] for el in col])

                                new_col_iter_str_vals = self.fill_values(
                                    col, new_col.info.iter_str_vals())
                                col_str_iters.append(new_col_iter_str_vals)
                                new_cols_escaped.append(col_escaped)
                                new_cols.append(new_col)
                        else:

                            col_iter_str_vals = self.fill_values(col, col.info.iter_str_vals())
                            col_str_iters.append(col_iter_str_vals)

                            new_cols_escaped.append(col_escaped)

                    # Walk all per-column string iterators in lockstep, one
                    # table row at a time.
                    for row in zip(*col_str_iters):
                        with w.tag('tr'):
                            for el, col_escaped in zip(row, new_cols_escaped):
                                # Potentially disable HTML escaping for column
                                method = ('escape_xml' if col_escaped else 'bleach_clean')
                                with w.xml_cleaning_method(method, **raw_html_clean_kwargs):
                                    w.start('td')
                                    w.data(el.strip())
                                    w.end(indent=False)

        # Fixes XMLWriter's insertion of unwanted line breaks
        return [''.join(lines)]

    def fill_values(self, col, col_str_iters):
        """
        Return an iterator of the values with replacements based on fill_values

        Parameters
        ----------
        col : column-like
            Column whose optional ``mask`` and ``fill_values`` attributes
            drive the replacement.
        col_str_iters : iterable of str
            String values to filter, in row order.

        Yields
        ------
        str
            For a masked entry of a column that has both ``mask`` and
            ``fill_values``, the value ``col.fill_values[core.masked]``;
            otherwise the mapped value if the string is a key of
            ``col.fill_values``; otherwise the string unchanged.
        """
        # check if the col is a masked column and has fill values
        is_masked_column = hasattr(col, 'mask')
        has_fill_values = hasattr(col, 'fill_values')

        for idx, col_str in enumerate(col_str_iters):
            if is_masked_column and has_fill_values:
                if col.mask[idx]:
                    yield col.fill_values[core.masked]
                    continue

            if has_fill_values:
                if col_str in col.fill_values:
                    yield col.fill_values[col_str]
                    continue

            yield col_str