1# Licensed under a 3-clause BSD style license - see LICENSE.rst
2"""An extensible HTML table reader and writer.
3
4html.py:
5  Classes to read and write HTML tables
6
7`BeautifulSoup <http://www.crummy.com/software/BeautifulSoup/>`_
8must be installed to read HTML tables.
9"""
10
11import warnings
12
13from . import core
14from astropy.table import Column
15from astropy.utils.xml import writer
16
17from copy import deepcopy
18
19
class SoupString(str):
    """
    A ``str`` subclass that also carries the object it was built from.

    The text of the string is whatever ``str`` makes of the constructor
    arguments, while the first argument itself is kept, unmodified, on
    the ``soup`` attribute.  In this module that object is a
    BeautifulSoup tag for one table row.
    """

    def __new__(cls, *args, **kwargs):
        # ``str`` is immutable, so its value has to be fixed here rather
        # than in ``__init__``.
        return super().__new__(cls, *args, **kwargs)

    def __init__(self, val):
        # Keep a handle on the source object so later stages can query it
        # directly instead of re-parsing the text.
        self.soup = val
30
31
class ListWriter:
    """
    Minimal file-like adapter that collects written chunks in a list.

    Only ``write`` is provided, so that XMLWriter can emit its output
    into a caller-supplied list rather than a real file.
    """

    def __init__(self, out):
        # The caller's container is stored by reference, not copied, so
        # the caller sees every chunk as soon as it is written.
        self.out = out

    def write(self, data):
        """Append ``data`` as one more item of the backing list."""
        self.out.append(data)
42
43
def identify_table(soup, htmldict, numtable):
    """
    Decide whether ``soup`` is the table the user asked for.

    Parameters
    ----------
    soup : tag-like object or None
        Candidate element; must expose ``name``, ``attrs`` and item
        access to its attributes.
    htmldict : dict
        Reader options; only the ``'table_id'`` entry is consulted.
    numtable : int
        1-based position of the candidate among the document's tables.

    Returns
    -------
    bool
        `True` if the candidate is a ``<table>`` matching the request.
    """
    # Anything that is not a <table> element can never match.
    if soup is None or soup.name != 'table':
        return False

    # Without an explicit request, the first table is the one to use.
    if 'table_id' not in htmldict:
        return numtable == 1

    wanted = htmldict['table_id']

    # A string selects the table by its HTML ``id`` attribute.
    if isinstance(wanted, str):
        if 'id' not in soup.attrs:
            return False
        return soup['id'] == wanted

    # An integer selects the table by its position.
    if isinstance(wanted, int):
        return wanted == numtable

    # Any other type is an invalid selector and matches nothing.
    return False
64
65
class HTMLInputter(core.BaseInputter):
    """
    Input lines of HTML in a valid form.

    This requires `BeautifulSoup
    <http://www.crummy.com/software/BeautifulSoup/>`_ to be installed.
    """

    def process_lines(self, lines):
        """
        Parse the input and return the rows of the requested table.

        The lines are joined with newlines and parsed with BeautifulSoup.
        Each ``<table>`` found is checked with ``identify_table`` against
        the options in ``self.html``; the ``<tr>`` elements of the first
        match are returned as a list of ``SoupString`` objects.

        Raises ``core.OptionalTableImportError`` if BeautifulSoup cannot
        be imported and ``core.InconsistentTableError`` if no table
        matches.
        """
        try:
            from bs4 import BeautifulSoup
        except ImportError:
            raise core.OptionalTableImportError('BeautifulSoup must be '
                                                'installed to read HTML tables')

        markup = '\n'.join(lines)

        if 'parser' in self.html:
            # The caller chose a specific backend parser.
            soup = BeautifulSoup(markup, self.html['parser'])
        else:
            with warnings.catch_warnings():
                # Ignore bs4 parser warning #4550.
                warnings.filterwarnings('ignore', '.*no parser was explicitly specified.*')
                soup = BeautifulSoup(markup)

        # Walk the tables in document order, counting from 1, and stop at
        # the first one matching the user's request.
        table = None
        for position, candidate in enumerate(soup.find_all('table'), start=1):
            if identify_table(candidate, self.html, position):
                table = candidate
                break

        if table is None:
            wanted = self.html['table_id']
            if isinstance(wanted, int):
                err_descr = f"number {wanted}"
            else:
                err_descr = f"id '{wanted}'"
            raise core.InconsistentTableError(
                f'ERROR: HTML table {err_descr} not found')

        # One SoupString per table row.
        return [SoupString(row) for row in table.find_all('tr')]
110
111
class HTMLSplitter(core.BaseSplitter):
    """
    Split HTML table data.
    """

    def __call__(self, lines):
        """
        Generate the cell values of each row in ``lines``.

        Every item of ``lines`` must be a ``SoupString``, otherwise a
        `TypeError` is raised.  For a row with ``<th>`` cells a list of
        stripped header texts is yielded; a header cell carrying a
        ``colspan`` attribute is yielded as a ``(text, colspan)`` tuple.
        For a row with ``<td>`` cells a list of stripped texts is
        yielded.  After the rows are exhausted,
        ``core.InconsistentTableError`` is raised if ``lines`` is empty.
        """
        for row in lines:
            if not isinstance(row, SoupString):
                raise TypeError('HTML lines should be of type SoupString')
            tag = row.soup

            header_cells = tag.find_all('th')
            if header_cells:
                # Multicolumns are returned as tuples for HTMLHeader handling
                header_vals = []
                for cell in header_cells:
                    if cell.has_attr('colspan'):
                        header_vals.append((cell.text.strip(), cell['colspan']))
                    else:
                        header_vals.append(cell.text.strip())
                yield header_vals

            data_cells = tag.find_all('td')
            if data_cells:
                yield [cell.text.strip() for cell in data_cells]

        if len(lines) == 0:
            raise core.InconsistentTableError('HTML tables must contain data '
                                              'in a <table> tag')
136
137
class HTMLOutputter(core.TableOutputter):
    """
    Output the HTML data as an ``astropy.table.Table`` object.

    This subclass allows for the final table to contain
    multidimensional columns (defined using the colspan attribute
    of <th>).
    """

    # Converters built, in this order, for int, float and str.
    default_converters = [core.convert_numpy(kind) for kind in (int, float, str)]

    def __call__(self, cols, meta):
        """
        Merge spanned columns, then delegate to the parent outputter.

        A column with a ``colspan`` attribute is combined with the
        ``colspan - 1`` columns that follow it into one new column whose
        ``str_vals`` are tuples, one tuple per row.  Columns without that
        attribute pass through unchanged.
        """
        merged = []
        total = len(cols)
        pos = 0

        while pos < total:
            first = cols[pos]

            if not hasattr(first, 'colspan'):
                merged.append(first)
                pos += 1
                continue

            # Join elements of spanned columns together into list of tuples
            width = first.colspan
            group = cols[pos:pos + width]
            combined = core.Column(first.name)
            combined.str_vals = list(zip(*(member.str_vals for member in group)))
            merged.append(combined)
            pos += width

        return super().__call__(merged, meta)
172
173
class HTMLHeader(core.BaseHeader):
    splitter_class = HTMLSplitter

    def start_line(self, lines):
        """
        Return the index of the first row containing a ``<th>`` element.

        Returns `None` if no row has one.  A `TypeError` is raised for
        any row examined that is not a ``SoupString``.
        """
        for idx, row in enumerate(lines):
            if not isinstance(row, SoupString):
                raise TypeError('HTML lines should be of type SoupString')
            if row.soup.th is not None:
                return idx

        return None

    def _set_cols_from_names(self):
        """
        Build ``self.cols`` from ``self.names``, expanding multicolumns.

        A name given as a ``(name, colspan)`` tuple produces one column
        tagged with an integer ``colspan`` attribute, followed by
        ``colspan - 1`` unnamed placeholder columns.  ``self.names`` is
        replaced by the matching flat list of names.
        """
        self.cols = cols = []
        flat_names = []

        for name in self.names:
            if not isinstance(name, tuple):
                cols.append(core.Column(name=name))
                flat_names.append(name)
                continue

            head = core.Column(name=name[0])
            head.colspan = int(name[1])
            cols.append(head)
            flat_names.append(name[0])

            # Add dummy columns for the remaining spanned positions
            for _ in range(int(name[1]) - 1):
                cols.append(core.Column(''))
                flat_names.append('')

        self.names = flat_names
213
214
class HTMLData(core.BaseData):
    splitter_class = HTMLSplitter

    def start_line(self, lines):
        """
        Return the index of the first row containing a ``<td>`` element.

        Raises ``core.InconsistentTableError`` if that row also has a
        ``<th>`` element, or if no row has a ``<td>`` at all.  A
        `TypeError` is raised for any row examined that is not a
        ``SoupString``.
        """
        for idx, row in enumerate(lines):
            if not isinstance(row, SoupString):
                raise TypeError('HTML lines should be of type SoupString')
            tag = row.soup

            if tag.td is None:
                continue

            if tag.th is not None:
                raise core.InconsistentTableError('HTML tables cannot '
                                                  'have headings and data in the same row')
            return idx

        raise core.InconsistentTableError('No start line found for HTML data')

    def end_line(self, lines):
        """
        Return one past the index of the last row containing a ``<td>``.

        Returns `None` if no row has one.  A `TypeError` is raised for
        any row that is not a ``SoupString``.
        """
        last_data_row = None

        for idx, row in enumerate(lines):
            if not isinstance(row, SoupString):
                raise TypeError('HTML lines should be of type SoupString')
            if row.soup.td is not None:
                last_data_row = idx

        return None if last_data_row is None else last_data_row + 1
252
253
class HTML(core.BaseReader):
    """HTML format table.

    In order to customize input and output, a dict of parameters may
    be passed to this class holding specific customizations.

    **htmldict** : Dictionary of parameters for HTML input/output.

        * css : Customized styling
            If present, this parameter will be included in a <style>
            tag and will define stylistic attributes of the output.

        * table_id : ID for the input table
            If a string, this defines the HTML id of the table to be processed.
            If an integer, this specifies the index of the input table in the
            available tables. Unless this parameter is given, the reader will
            use the first table found in the input file.

        * multicol : Use multi-dimensional columns for output
            The writer will output tuples as elements of multi-dimensional
            columns if this parameter is true, and if not then it will
            use the syntax 1.36583e-13 .. 1.36583e-13 for output. If not
            present, this parameter will be true by default.

        * raw_html_cols : column name or list of names with raw HTML content
            This allows one to include raw HTML content in the column output,
            for instance to include link references in a table.  This option
            requires that the bleach package be installed.  Only whitelisted
            tags are allowed through for security reasons (see the
            raw_html_clean_kwargs arg).

        * raw_html_clean_kwargs : dict of keyword args controlling HTML cleaning
            Raw HTML will be cleaned to prevent unsafe HTML from ending up in
            the table output.  This is done by calling ``bleach.clean(data,
            **raw_html_clean_kwargs)``.  For details on the available options
            (e.g. tag whitelist) see:
            https://bleach.readthedocs.io/en/latest/clean.html

        * parser : Specific HTML parsing library to use
            If specified, this specifies which HTML parsing library
            BeautifulSoup should use as a backend. The options to choose
            from are 'html.parser' (the standard library parser), 'lxml'
            (the recommended parser), 'xml' (lxml's XML parser), and
            'html5lib'. html5lib is a highly lenient parser and therefore
            might work correctly for unusual input if a different parser
            fails.

        * jsfiles : list of js files to include when writing table.

        * cssfiles : list of css files to include when writing table.

        * js : js script to include in the body when writing table.

        * table_class : css class for the table

    """

    _format_name = 'html'
    _io_registry_format_aliases = ['html']
    _io_registry_suffix = '.html'
    _description = 'HTML table'

    header_class = HTMLHeader
    data_class = HTMLData
    inputter_class = HTMLInputter

    max_ndim = 2  # HTML supports writing 2-d columns with shape (n, m)

    def __init__(self, htmldict={}):
        """
        Initialize classes for HTML reading and writing.

        Parameters
        ----------
        htmldict : dict, optional
            Options described in the class docstring.  The dict is
            deep-copied, so neither the caller's dict nor the shared
            default is ever modified.
        """
        super().__init__()
        self.html = deepcopy(htmldict)
        # Fill in defaults on the private copy only.
        self.html.setdefault('multicol', True)
        self.html.setdefault('table_id', 1)
        # The inputter needs the options to pick the parser and the table.
        self.inputter.html = self.html

    def read(self, table):
        """
        Read the ``table`` in HTML format and return a resulting ``Table``.
        """

        # Use the outputter that understands colspan multicolumns.
        self.outputter = HTMLOutputter()
        return super().read(table)

    def write(self, table):
        """
        Return data in ``table`` converted to HTML as a list of strings.

        Parameters
        ----------
        table : table-like
            Object exposing ``columns``; each column must provide
            ``shape`` and ``info`` (``name``, ``iter_str_vals``).

        Returns
        -------
        list of str
            A one-element list holding the complete HTML document.
        """
        # Check that table has only 1-d or 2-d columns. Above that fails.
        self._check_multidim_table(table)

        cols = list(table.columns.values())

        self.data.header.cols = cols
        # The data object must know the columns so that the per-column
        # format handling below can find them.
        self.data.cols = cols

        if isinstance(self.data.fill_values, tuple):
            self.data.fill_values = [self.data.fill_values]

        self.data._set_fill_values(cols)
        # Apply any user-supplied ``formats`` to the columns before their
        # string values are generated; without this call the ``formats``
        # argument has no effect on HTML output.  (The helper is defined
        # on the data class in ``core``.)
        self.data._set_col_formats()

        lines = []

        # Set HTML escaping to False for any column in the raw_html_cols input
        raw_html_cols = self.html.get('raw_html_cols', [])
        if isinstance(raw_html_cols, str):
            raw_html_cols = [raw_html_cols]  # Allow for a single string as input
        cols_escaped = [col.info.name not in raw_html_cols for col in cols]

        # Kwargs that get passed on to bleach.clean() if that is available.
        raw_html_clean_kwargs = self.html.get('raw_html_clean_kwargs', {})

        # Use XMLWriter to output HTML to lines
        w = writer.XMLWriter(ListWriter(lines))

        with w.tag('html'):
            with w.tag('head'):
                # Declare encoding and set CSS style for table
                with w.tag('meta', attrib={'charset': 'utf-8'}):
                    pass
                with w.tag('meta', attrib={'http-equiv': 'Content-type',
                                           'content': 'text/html;charset=UTF-8'}):
                    pass
                if 'css' in self.html:
                    with w.tag('style'):
                        w.data(self.html['css'])
                if 'cssfiles' in self.html:
                    for filename in self.html['cssfiles']:
                        with w.tag('link', rel="stylesheet", href=filename, type='text/css'):
                            pass
                if 'jsfiles' in self.html:
                    for filename in self.html['jsfiles']:
                        with w.tag('script', src=filename):
                            w.data('')  # need this instead of pass to get <script></script>
            with w.tag('body'):
                if 'js' in self.html:
                    # The script text is written without any cleaning.
                    with w.xml_cleaning_method('none'):
                        with w.tag('script'):
                            w.data(self.html['js'])
                # Only a string table_id is emitted as the HTML id; an
                # integer is a positional selector used when reading.
                if isinstance(self.html['table_id'], str):
                    html_table_id = self.html['table_id']
                else:
                    html_table_id = None
                if 'table_class' in self.html:
                    html_table_class = self.html['table_class']
                    attrib = {"class": html_table_class}
                else:
                    attrib = {}
                with w.tag('table', id=html_table_id, attrib=attrib):
                    with w.tag('thead'):
                        with w.tag('tr'):
                            for col in cols:
                                if len(col.shape) > 1 and self.html['multicol']:
                                    # Set colspan attribute for multicolumns
                                    w.start('th', colspan=col.shape[1])
                                else:
                                    w.start('th')
                                w.data(col.info.name.strip())
                                w.end(indent=False)
                        col_str_iters = []
                        new_cols_escaped = []

                        # Make a container to hold any new_col objects created
                        # below for multicolumn elements.  This is purely to
                        # maintain a reference for these objects during
                        # subsequent iteration to format column values.  This
                        # requires that the weakref info._parent be maintained.
                        new_cols = []

                        for col, col_escaped in zip(cols, cols_escaped):
                            if len(col.shape) > 1 and self.html['multicol']:
                                span = col.shape[1]
                                for i in range(span):
                                    # Split up multicolumns into separate columns
                                    new_col = Column([el[i] for el in col])

                                    # NOTE(review): the parent ``col`` is
                                    # passed so its fill values apply to
                                    # each sub-column; confirm behaviour
                                    # for masked 2-d columns.
                                    new_col_iter_str_vals = self.fill_values(
                                        col, new_col.info.iter_str_vals())
                                    col_str_iters.append(new_col_iter_str_vals)
                                    new_cols_escaped.append(col_escaped)
                                    new_cols.append(new_col)
                            else:

                                col_iter_str_vals = self.fill_values(col, col.info.iter_str_vals())
                                col_str_iters.append(col_iter_str_vals)

                                new_cols_escaped.append(col_escaped)

                    # One <tr> per table row, pulling one value from each
                    # column iterator in lockstep.
                    for row in zip(*col_str_iters):
                        with w.tag('tr'):
                            for el, col_escaped in zip(row, new_cols_escaped):
                                # Potentially disable HTML escaping for column
                                method = ('escape_xml' if col_escaped else 'bleach_clean')
                                with w.xml_cleaning_method(method, **raw_html_clean_kwargs):
                                    w.start('td')
                                    w.data(el.strip())
                                    w.end(indent=False)

        # Fixes XMLWriter's insertion of unwanted line breaks
        return [''.join(lines)]

    def fill_values(self, col, col_str_iters):
        """
        Return an iterator of the values with replacements based on fill_values

        Parameters
        ----------
        col : column-like
            Column whose optional ``mask`` and ``fill_values`` attributes
            control the replacements.
        col_str_iters : iterable of str
            String values to filter, in row order.

        Yields
        ------
        str
            The fill value registered under ``core.masked`` for a masked
            entry, else the fill value registered for the string itself,
            else the string unchanged.
        """
        # check if the col is a masked column and has fill values
        is_masked_column = hasattr(col, 'mask')
        has_fill_values = hasattr(col, 'fill_values')

        for idx, col_str in enumerate(col_str_iters):
            # A masked entry takes precedence over a string match.
            if is_masked_column and has_fill_values:
                if col.mask[idx]:
                    yield col.fill_values[core.masked]
                    continue

            if has_fill_values:
                if col_str in col.fill_values:
                    yield col.fill_values[col_str]
                    continue

            yield col_str
478