1# Licensed under a 3-clause BSD style license - see LICENSE.rst
2"""Classes to read AAS MRT table format
3
4Ref: https://journals.aas.org/mrt-standards
5
6:Copyright: Smithsonian Astrophysical Observatory (2021)
7:Author: Tom Aldcroft (aldcroft@head.cfa.harvard.edu), \
8         Suyog Garg (suyog7130@gmail.com)
9"""
10
11import re
12import math
13import warnings
14import numpy as np
15from io import StringIO
16
17from . import core
18from . import fixedwidth, cds
19
20from astropy import units as u
21
22from astropy.table import Table
23from astropy.table import Column, MaskedColumn
24from string import Template
25from textwrap import wrap
26
27MAX_SIZE_README_LINE = 80
28MAX_COL_INTLIMIT = 100000
29
30
31__doctest_skip__ = ['*']
32
33
34BYTE_BY_BYTE_TEMPLATE = [
35    "Byte-by-byte Description of file: $file",
36    "--------------------------------------------------------------------------------",
37    " Bytes Format Units  Label     Explanations",
38    "--------------------------------------------------------------------------------",
39    "$bytebybyte",
40    "--------------------------------------------------------------------------------"]
41
42MRT_TEMPLATE = [
43    "Title:",
44    "Authors:",
45    "Table:",
46    "================================================================================",
47    "$bytebybyte",
48    "Notes:",
49    "--------------------------------------------------------------------------------"]
50
51
52class MrtSplitter(fixedwidth.FixedWidthSplitter):
53    """
54    Contains the join function to left align the MRT columns
55    when writing to a file.
56    """
57    def join(self, vals, widths):
58        vals = [val + ' ' * (width - len(val)) for val, width in zip(vals, widths)]
59        return self.delimiter.join(vals)
60
61
62class MrtHeader(cds.CdsHeader):
63    _subfmt = 'MRT'
64
65    def _split_float_format(self, value):
66        """
67        Splits a Float string into different parts to find number
68        of digits after decimal and check if the value is in Scientific
69        notation.
70
71        Parameters
72        ----------
73        value : str
74            String containing the float value to split.
75
76        Returns
77        -------
78        fmt: (int, int, int, bool, bool)
79            List of values describing the Float sting.
80            (size, dec, ent, sign, exp)
81            size, length of the given string.
82            ent, number of digits before decimal point.
83            dec, number of digits after decimal point.
84            sign, whether or not given value signed.
85            exp, is value in Scientific notation?
86        """
87        regfloat = re.compile(r"""(?P<sign> [+-]*)
88                                  (?P<ent> [^eE.]+)
89                                  (?P<deciPt> [.]*)
90                                  (?P<decimals> [0-9]*)
91                                  (?P<exp> [eE]*-*)[0-9]*""",
92                              re.VERBOSE)
93        mo = regfloat.match(value)
94
95        if mo is None:
96            raise Exception(f'{value} is not a float number')
97        return (len(value),
98                len(mo.group('ent')),
99                len(mo.group('decimals')),
100                mo.group('sign') != "",
101                mo.group('exp') != "")
102
103    def _set_column_val_limits(self, col):
104        """
105        Sets the ``col.min`` and ``col.max`` column attributes,
106        taking into account columns with Null values.
107        """
108        col.max = max(col)
109        col.min = min(col)
110        if col.max is np.ma.core.MaskedConstant:
111            col.max = None
112        if col.min is np.ma.core.MaskedConstant:
113            col.min = None
114
115    def column_float_formatter(self, col):
116        """
117        String formatter function for a column containing Float values.
118        Checks if the values in the given column are in Scientific notation,
119        by spliting the value string. It is assumed that the column either has
120        float values or Scientific notation.
121
122        A ``col.formatted_width`` attribute is added to the column. It is not added
123        if such an attribute is already present, say when the ``formats`` argument
124        is passed to the writer. A properly formatted format string is also added as
125        the ``col.format`` attribute.
126
127        Parameters
128        ----------
129        col : A ``Table.Column`` object.
130        """
131        # maxsize: maximum length of string containing the float value.
132        # maxent: maximum number of digits places before decimal point.
133        # maxdec: maximum number of digits places after decimal point.
134        # maxprec: maximum precision of the column values, sum of maxent and maxdec.
135        maxsize, maxprec, maxent, maxdec = 1, 0, 1, 0
136        sign = False
137        fformat = 'F'
138
139        # Find maximum sized value in the col
140        for val in col.str_vals:
141            # Skip null values
142            if val is None or val == '':
143                continue
144
145            # Find format of the Float string
146            fmt = self._split_float_format(val)
147            # If value is in Scientific notation
148            if fmt[4] is True:
149                # if the previous column value was in normal Float format
150                # set maxsize, maxprec and maxdec to default.
151                if fformat == 'F':
152                    maxsize, maxprec, maxdec = 1, 0, 0
153                # Designate the column to be in Scientific notation.
154                fformat = 'E'
155            else:
156                # Move to next column value if
157                # current value is not in Scientific notation
158                # but the column is designated as such because
159                # one of the previous values was.
160                if fformat == 'E':
161                    continue
162
163            if maxsize < fmt[0]:
164                maxsize = fmt[0]
165            if maxent < fmt[1]:
166                maxent = fmt[1]
167            if maxdec < fmt[2]:
168                maxdec = fmt[2]
169            if fmt[3]:
170                sign = True
171
172            if maxprec < fmt[1] + fmt[2]:
173                maxprec = fmt[1] + fmt[2]
174
175        if fformat == 'E':
176            if getattr(col, 'formatted_width', None) is None:  # If ``formats`` not passed.
177                col.formatted_width = maxsize
178                if sign:
179                    col.formatted_width += 1
180            # Number of digits after decimal is replaced by the precision
181            # for values in Scientific notation, when writing that Format.
182            col.fortran_format = fformat + str(col.formatted_width) + "." + str(maxprec)
183            col.format = str(col.formatted_width) + "." + str(maxdec) + "e"
184        else:
185            lead = ''
186            if getattr(col, 'formatted_width', None) is None:  # If ``formats`` not passed.
187                col.formatted_width = maxent + maxdec + 1
188                if sign:
189                    col.formatted_width += 1
190            elif col.format.startswith('0'):
191                # Keep leading zero, if already set in format - primarily for `seconds` columns
192                # in coordinates; may need extra case if this is to be also supported with `sign`.
193                lead = '0'
194            col.fortran_format = fformat + str(col.formatted_width) + "." + str(maxdec)
195            col.format = lead + col.fortran_format[1:] + "f"
196
197    def write_byte_by_byte(self):
198        """
199        Writes the Byte-By-Byte description of the table.
200
201        Columns that are `astropy.coordinates.SkyCoord` or `astropy.time.TimeSeries`
202        objects or columns with values that are such objects are recognized as such,
203        and some predefined labels and description is used for them.
204        See the Vizier MRT Standard documentation in the link below for more details
205        on these. An example Byte-By-Byte table is shown here.
206
207        See: http://vizier.u-strasbg.fr/doc/catstd-3.1.htx
208
209        Example::
210
211        --------------------------------------------------------------------------------
212        Byte-by-byte Description of file: table.dat
213        --------------------------------------------------------------------------------
214        Bytes Format Units  Label     Explanations
215        --------------------------------------------------------------------------------
216         1- 8  A8     ---    names   Description of names
217        10-14  E5.1   ---    e       [-3160000.0/0.01] Description of e
218        16-23  F8.5   ---    d       [22.25/27.25] Description of d
219        25-31  E7.1   ---    s       [-9e+34/2.0] Description of s
220        33-35  I3     ---    i       [-30/67] Description of i
221        37-39  F3.1   ---    sameF   [5.0/5.0] Description of sameF
222        41-42  I2     ---    sameI   [20] Description of sameI
223        44-45  I2     h      RAh     Right Ascension (hour)
224        47-48  I2     min    RAm     Right Ascension (minute)
225        50-67  F18.15 s      RAs     Right Ascension (second)
226           69  A1     ---    DE-     Sign of Declination
227        70-71  I2     deg    DEd     Declination (degree)
228        73-74  I2     arcmin DEm     Declination (arcmin)
229        76-91  F16.13 arcsec DEs     Declination (arcsec)
230
231        --------------------------------------------------------------------------------
232        """
233        # Get column widths
234        vals_list = []
235        col_str_iters = self.data.str_vals()
236        for vals in zip(*col_str_iters):
237            vals_list.append(vals)
238
239        for i, col in enumerate(self.cols):
240            col.width = max([len(vals[i]) for vals in vals_list])
241            if self.start_line is not None:
242                col.width = max(col.width, len(col.info.name))
243        widths = [col.width for col in self.cols]
244
245        startb = 1  # Byte count starts at 1.
246
247        # Set default width of the Bytes count column of the Byte-By-Byte table.
248        # This ``byte_count_width`` value helps align byte counts with respect
249        # to the hyphen using a format string.
250        byte_count_width = len(str(sum(widths) + len(self.cols) - 1))
251
252        # Format string for Start Byte and End Byte
253        singlebfmt = "{:" + str(byte_count_width) + "d}"
254        fmtb = singlebfmt + "-" + singlebfmt
255        # Add trailing single whitespaces to Bytes column for better visibility.
256        singlebfmt += " "
257        fmtb += " "
258
259        # Set default width of Label and Description Byte-By-Byte columns.
260        max_label_width, max_descrip_size = 7, 16
261
262        bbb = Table(names=['Bytes', 'Format', 'Units', 'Label', 'Explanations'],
263                    dtype=[str] * 5)
264
265        # Iterate over the columns to write Byte-By-Byte rows.
266        for i, col in enumerate(self.cols):
267            # Check if column is MaskedColumn
268            col.has_null = isinstance(col, MaskedColumn)
269
270            if col.format is not None:
271                col.formatted_width = max([len(sval) for sval in col.str_vals])
272
273            # Set MRTColumn type, size and format.
274            if np.issubdtype(col.dtype, np.integer):
275                # Integer formatter
276                self._set_column_val_limits(col)
277                if getattr(col, 'formatted_width', None) is None:  # If ``formats`` not passed.
278                    col.formatted_width = max(len(str(col.max)), len(str(col.min)))
279                col.fortran_format = "I" + str(col.formatted_width)
280                if col.format is None:
281                    col.format = ">" + col.fortran_format[1:]
282
283            elif np.issubdtype(col.dtype, np.dtype(float).type):
284                # Float formatter
285                self._set_column_val_limits(col)
286                self.column_float_formatter(col)
287
288            else:
289                # String formatter, ``np.issubdtype(col.dtype, str)`` is ``True``.
290                dtype = col.dtype.str
291                if col.has_null:
292                    mcol = col
293                    mcol.fill_value = ""
294                    coltmp = Column(mcol.filled(), dtype=str)
295                    dtype = coltmp.dtype.str
296                if getattr(col, 'formatted_width', None) is None:  # If ``formats`` not passed.
297                    col.formatted_width = int(re.search(r'(\d+)$', dtype).group(1))
298                col.fortran_format = "A" + str(col.formatted_width)
299                col.format = str(col.formatted_width) + "s"
300
301            endb = col.formatted_width + startb - 1
302
303            # ``mixin`` columns converted to string valued columns will not have a name
304            # attribute. In those cases, a ``Unknown`` column label is put, indicating that
305            # such columns can be better formatted with some manipulation before calling
306            # the MRT writer.
307            if col.name is None:
308                col.name = "Unknown"
309
310            # Set column description.
311            if col.description is not None:
312                description = col.description
313            else:
314                description = "Description of " + col.name
315
316            # Set null flag in column description
317            nullflag = ""
318            if col.has_null:
319                nullflag = "?"
320
321            # Set column unit
322            if col.unit is not None:
323                col_unit = col.unit.to_string("cds")
324            elif col.name.lower().find("magnitude") > -1:
325                # ``col.unit`` can still be ``None``, if the unit of column values
326                # is ``Magnitude``, because ``astropy.units.Magnitude`` is actually a class.
327                # Unlike other units which are instances of ``astropy.units.Unit``,
328                # application of the ``Magnitude`` unit calculates the logarithm
329                # of the values. Thus, the only way to check for if the column values
330                # have ``Magnitude`` unit is to check the column name.
331                col_unit = "mag"
332            else:
333                col_unit = "---"
334
335            # Add col limit values to col description
336            lim_vals = ""
337            if (col.min and col.max and
338                    not any(x in col.name for x in ['RA', 'DE', 'LON', 'LAT', 'PLN', 'PLT'])):
339                # No col limit values for coordinate columns.
340                if col.fortran_format[0] == 'I':
341                    if abs(col.min) < MAX_COL_INTLIMIT and abs(col.max) < MAX_COL_INTLIMIT:
342                        if col.min == col.max:
343                            lim_vals = "[{0}]".format(col.min)
344                        else:
345                            lim_vals = "[{0}/{1}]".format(col.min, col.max)
346                elif col.fortran_format[0] in ('E', 'F'):
347                    lim_vals = "[{0}/{1}]".format(math.floor(col.min * 100) / 100.,
348                                                  math.ceil(col.max * 100) / 100.)
349
350            if lim_vals != '' or nullflag != '':
351                description = "{0}{1} {2}".format(lim_vals, nullflag, description)
352
353            # Find the maximum label and description column widths.
354            if len(col.name) > max_label_width:
355                max_label_width = len(col.name)
356            if len(description) > max_descrip_size:
357                max_descrip_size = len(description)
358
359            # Add a row for the Sign of Declination in the bbb table
360            if col.name == 'DEd':
361                bbb.add_row([singlebfmt.format(startb),
362                             "A1", "---", "DE-",
363                             "Sign of Declination"])
364                col.fortran_format = 'I2'
365                startb += 1
366
367            # Add Byte-By-Byte row to bbb table
368            bbb.add_row([singlebfmt.format(startb) if startb == endb
369                         else fmtb.format(startb, endb),
370                         "" if col.fortran_format is None else col.fortran_format,
371                         col_unit,
372                         "" if col.name is None else col.name,
373                         description])
374            startb = endb + 2
375
376        # Properly format bbb columns
377        bbblines = StringIO()
378        bbb.write(bbblines, format='ascii.fixed_width_no_header',
379                  delimiter=' ', bookend=False, delimiter_pad=None,
380                  formats={'Format': '<6s',
381                           'Units': '<6s',
382                           'Label': '<' + str(max_label_width) + 's',
383                           'Explanations': '' + str(max_descrip_size) + 's'})
384
385        # Get formatted bbb lines
386        bbblines = bbblines.getvalue().splitlines()
387
388        # ``nsplit`` is the number of whitespaces to prefix to long description
389        # lines in order to wrap them. It is the sum of the widths of the
390        # previous 4 columns plus the number of single spacing between them.
391        # The hyphen in the Bytes column is also counted.
392        nsplit = byte_count_width * 2 + 1 + 12 + max_label_width + 4
393
394        # Wrap line if it is too long
395        buff = ""
396        for newline in bbblines:
397            if len(newline) > MAX_SIZE_README_LINE:
398                buff += ("\n").join(wrap(newline,
399                                         subsequent_indent=" " * nsplit,
400                                         width=MAX_SIZE_README_LINE))
401                buff += "\n"
402            else:
403                buff += newline + "\n"
404
405        # Last value of ``endb`` is the sum of column widths after formatting.
406        self.linewidth = endb
407
408        # Remove the last extra newline character from Byte-By-Byte.
409        buff = buff[:-1]
410        return buff
411
412    def write(self, lines):
413        """
414        Writes the Header of the MRT table, aka ReadMe, which
415        also contains the Byte-By-Byte description of the table.
416        """
417        from astropy.coordinates import SkyCoord
418
419        # Recognised ``SkyCoord.name`` forms with their default column names (helio* require SunPy).
420        coord_systems = {'galactic': ('GLAT', 'GLON', 'b', 'l'),
421                         'ecliptic': ('ELAT', 'ELON', 'lat', 'lon'),      # 'geocentric*ecliptic'
422                         'heliographic': ('HLAT', 'HLON', 'lat', 'lon'),  # '_carrington|stonyhurst'
423                         'helioprojective': ('HPLT', 'HPLN', 'Ty', 'Tx')}
424        eqtnames = ['RAh', 'RAm', 'RAs', 'DEd', 'DEm', 'DEs']
425
426        # list to store indices of columns that are modified.
427        to_pop = []
428
429        # For columns that are instances of ``SkyCoord`` and other ``mixin`` columns
430        # or whose values are objects of these classes.
431        for i, col in enumerate(self.cols):
432            # If col is a ``Column`` object but its values are ``SkyCoord`` objects,
433            # convert the whole column to ``SkyCoord`` object, which helps in applying
434            # SkyCoord methods directly.
435            if not isinstance(col, SkyCoord) and isinstance(col[0], SkyCoord):
436                try:
437                    col = SkyCoord(col)
438                except (ValueError, TypeError):
439                    # If only the first value of the column is a ``SkyCoord`` object,
440                    # the column cannot be converted to a ``SkyCoord`` object.
441                    # These columns are converted to ``Column`` object and then converted
442                    # to string valued column.
443                    if not isinstance(col, Column):
444                        col = Column(col)
445                    col = Column([str(val) for val in col])
446                    self.cols[i] = col
447                    continue
448
449            # Replace single ``SkyCoord`` column by its coordinate components if no coordinate
450            # columns of the correspoding type exist yet.
451            if isinstance(col, SkyCoord):
452                # If coordinates are given in RA/DEC, divide each them into hour/deg,
453                # minute/arcminute, second/arcsecond columns.
454                if ('ra' in col.representation_component_names.keys() and
455                        len(set(eqtnames) - set(self.colnames)) == 6):
456                    ra_c, dec_c = col.ra.hms, col.dec.dms
457                    coords = [ra_c.h.round().astype('i1'), ra_c.m.round().astype('i1'), ra_c.s,
458                              dec_c.d.round().astype('i1'), dec_c.m.round().astype('i1'), dec_c.s]
459                    coord_units = [u.h, u.min, u.second,
460                                   u.deg, u.arcmin, u.arcsec]
461                    coord_descrip = ['Right Ascension (hour)', 'Right Ascension (minute)',
462                                     'Right Ascension (second)', 'Declination (degree)',
463                                     'Declination (arcmin)', 'Declination (arcsec)']
464                    for coord, name, coord_unit, descrip in zip(
465                            coords, eqtnames, coord_units, coord_descrip):
466                        # Have Sign of Declination only in the DEd column.
467                        if name in ['DEm', 'DEs']:
468                            coord_col = Column(list(np.abs(coord)), name=name,
469                                               unit=coord_unit, description=descrip)
470                        else:
471                            coord_col = Column(list(coord), name=name, unit=coord_unit,
472                                               description=descrip)
473                        # Set default number of digits after decimal point for the
474                        # second values, and deg-min to (signed) 2-digit zero-padded integer.
475                        if name == 'RAs':
476                            coord_col.format = '013.10f'
477                        elif name == 'DEs':
478                            coord_col.format = '012.9f'
479                        elif name == 'RAh':
480                            coord_col.format = '2d'
481                        elif name == 'DEd':
482                            coord_col.format = '+03d'
483                        elif name.startswith(('RA', 'DE')):
484                            coord_col.format = '02d'
485                        self.cols.append(coord_col)
486                    to_pop.append(i)   # Delete original ``SkyCoord`` column.
487
488                # For all other coordinate types, simply divide into two columns
489                # for latitude and longitude resp. with the unit used been as it is.
490
491                else:
492                    frminfo = ''
493                    for frame, latlon in coord_systems.items():
494                        if frame in col.name and len(set(latlon[:2]) - set(self.colnames)) == 2:
495                            if frame != col.name:
496                                frminfo = f' ({col.name})'
497                            lon_col = Column(getattr(col, latlon[3]), name=latlon[1],
498                                             description=f'{frame.capitalize()} Longitude{frminfo}',
499                                             unit=col.representation_component_units[latlon[3]],
500                                             format='.12f')
501                            lat_col = Column(getattr(col, latlon[2]), name=latlon[0],
502                                             description=f'{frame.capitalize()} Latitude{frminfo}',
503                                             unit=col.representation_component_units[latlon[2]],
504                                             format='+.12f')
505                            self.cols.append(lon_col)
506                            self.cols.append(lat_col)
507                            to_pop.append(i)   # Delete original ``SkyCoord`` column.
508
509                # Convert all other ``SkyCoord`` columns that are not in the above three
510                # representations to string valued columns. Those could either be types not
511                # supported yet (e.g. 'helioprojective'), or already present and converted.
512                # If there were any extra ``SkyCoord`` columns of one kind after the first one,
513                # then their decomposition into their component columns has been skipped.
514                # This is done in order to not create duplicate component columns.
515                # Explicit renaming of the extra coordinate component columns by appending some
516                # suffix to their name, so as to distinguish them, is not yet implemented.
517                if i not in to_pop:
518                    warnings.warn(f"Coordinate system of type '{col.name}' already stored in table "
519                                  f"as CDS/MRT-syle columns or of unrecognized type. So column {i} "
520                                  f"is being skipped with designation of a string valued column "
521                                  f"`{self.colnames[i]}`.", UserWarning)
522                    self.cols.append(Column(col.to_string(), name=self.colnames[i]))
523                    to_pop.append(i)   # Delete original ``SkyCoord`` column.
524
525            # Convert all other ``mixin`` columns to ``Column`` objects.
526            # Parsing these may still lead to errors!
527            elif not isinstance(col, Column):
528                col = Column(col)
529                # If column values are ``object`` types, convert them to string.
530                if np.issubdtype(col.dtype, np.dtype(object).type):
531                    col = Column([str(val) for val in col])
532                self.cols[i] = col
533
534        # Delete original ``SkyCoord`` columns, if there were any.
535        for i in to_pop[::-1]:
536            self.cols.pop(i)
537
538        # Check for any left over extra coordinate columns.
539        if any(x in self.colnames for x in ['RAh', 'DEd', 'ELON', 'GLAT']):
540            # At this point any extra ``SkyCoord`` columns should have been converted to string
541            # valued columns, together with issuance of a warning, by the coordinate parser above.
542            # This test is just left here as a safeguard.
543            for i, col in enumerate(self.cols):
544                if isinstance(col, SkyCoord):
545                    self.cols[i] = Column(col.to_string(), name=self.colnames[i])
546                    message = ('Table already has coordinate system in CDS/MRT-syle columns. '
547                               f'So column {i} should have been replaced already with '
548                               f'a string valued column `{self.colnames[i]}`.')
549                    raise core.InconsistentTableError(message)
550
551        # Get Byte-By-Byte description and fill the template
552        bbb_template = Template('\n'.join(BYTE_BY_BYTE_TEMPLATE))
553        byte_by_byte = bbb_template.substitute({'file': 'table.dat',
554                                                'bytebybyte': self.write_byte_by_byte()})
555
556        # Fill up the full ReadMe
557        rm_template = Template('\n'.join(MRT_TEMPLATE))
558        readme_filled = rm_template.substitute({'bytebybyte': byte_by_byte})
559        lines.append(readme_filled)
560
561
562class MrtData(cds.CdsData):
563    """MRT table data reader
564    """
565    _subfmt = 'MRT'
566    splitter_class = MrtSplitter
567
568    def write(self, lines):
569        self.splitter.delimiter = ' '
570        fixedwidth.FixedWidthData.write(self, lines)
571
572
573class Mrt(core.BaseReader):
574    """AAS MRT (Machine-Readable Table) format table.
575
576    **Reading**
577    ::
578
579      >>> from astropy.io import ascii
580      >>> table = ascii.read('data.mrt', format='mrt')
581
582    **Writing**
583
584    Use ``ascii.write(table, 'data.mrt', format='mrt')`` to  write tables to
585    Machine Readable Table (MRT) format.
586
587    Note that the metadata of the table, apart from units, column names and
588    description, will not be written. These have to be filled in by hand later.
589
590    See also: :ref:`cds_mrt_format`.
591
592    Caveats:
593
594    * The Units and Explanations are available in the column ``unit`` and
595      ``description`` attributes, respectively.
596    * The other metadata defined by this format is not available in the output table.
597    """
598    _format_name = 'mrt'
599    _io_registry_format_aliases = ['mrt']
600    _io_registry_can_write = True
601    _description = 'MRT format table'
602
603    data_class = MrtData
604    header_class = MrtHeader
605
606    def write(self, table=None):
607        # Construct for writing empty table is not yet done.
608        if len(table) == 0:
609            raise NotImplementedError
610
611        self.data.header = self.header
612        self.header.position_line = None
613        self.header.start_line = None
614
615        # Create a copy of the ``table``, so that it the copy gets modified and
616        # written to the file, while the original table remains as it is.
617        table = table.copy()
618        return super().write(table)
619