1# Licensed under a 3-clause BSD style license - see LICENSE.rst 2""" An extensible ASCII table reader and writer. 3 4core.py: 5 Core base classes and functions for reading and writing tables. 6 7:Copyright: Smithsonian Astrophysical Observatory (2010) 8:Author: Tom Aldcroft (aldcroft@head.cfa.harvard.edu) 9""" 10 11 12import copy 13import csv 14import functools 15import itertools 16import operator 17import os 18import re 19import warnings 20import inspect 21import fnmatch 22 23from collections import OrderedDict 24from contextlib import suppress 25from io import StringIO 26 27import numpy 28 29from astropy.utils.exceptions import AstropyWarning 30 31from astropy.table import Table 32from astropy.utils.data import get_readable_fileobj 33from . import connect 34from .docs import READ_DOCSTRING, WRITE_DOCSTRING 35 36# Global dictionary mapping format arg to the corresponding Reader class 37FORMAT_CLASSES = {} 38 39# Similar dictionary for fast readers 40FAST_CLASSES = {} 41 42 43def _check_multidim_table(table, max_ndim): 44 """Check that ``table`` has only columns with ndim <= ``max_ndim`` 45 46 Currently ECSV is the only built-in format that supports output of arbitrary 47 N-d columns, but HTML supports 2-d. 48 """ 49 # No limit? 50 if max_ndim is None: 51 return 52 53 # Check for N-d columns 54 nd_names = [col.info.name for col in table.itercols() if len(col.shape) > max_ndim] 55 if nd_names: 56 raise ValueError(f'column(s) with dimension > {max_ndim} ' 57 "cannot be be written with this format, try using 'ecsv' " 58 "(Enhanced CSV) format") 59 60 61class CsvWriter: 62 """ 63 Internal class to replace the csv writer ``writerow`` and ``writerows`` 64 functions so that in the case of ``delimiter=' '`` and 65 ``quoting=csv.QUOTE_MINIMAL``, the output field value is quoted for empty 66 fields (when value == ''). 67 68 This changes the API slightly in that the writerow() and writerows() 69 methods return the output written string instead of the length of 70 that string. 71 72 Examples 73 -------- 74 75 >>> from astropy.io.ascii.core import CsvWriter 76 >>> writer = CsvWriter(delimiter=' ') 77 >>> print(writer.writerow(['hello', '', 'world'])) 78 hello "" world 79 """ 80 # Random 16-character string that gets injected instead of any 81 # empty fields and is then replaced post-write with doubled-quotechar. 82 # Created with: 83 # ''.join(random.choice(string.printable[:90]) for _ in range(16)) 84 replace_sentinel = '2b=48Av%0-V3p>bX' 85 86 def __init__(self, csvfile=None, **kwargs): 87 self.csvfile = csvfile 88 89 # Temporary StringIO for catching the real csv.writer() object output 90 self.temp_out = StringIO() 91 self.writer = csv.writer(self.temp_out, **kwargs) 92 93 dialect = self.writer.dialect 94 self.quotechar2 = dialect.quotechar * 2 95 self.quote_empty = (dialect.quoting == csv.QUOTE_MINIMAL) and (dialect.delimiter == ' ') 96 97 def writerow(self, values): 98 """ 99 Similar to csv.writer.writerow but with the custom quoting behavior. 100 Returns the written string instead of the length of that string. 101 """ 102 has_empty = False 103 104 # If QUOTE_MINIMAL and space-delimited then replace empty fields with 105 # the sentinel value. 106 if self.quote_empty: 107 for i, value in enumerate(values): 108 if value == '': 109 has_empty = True 110 values[i] = self.replace_sentinel 111 112 return self._writerow(self.writer.writerow, values, has_empty) 113 114 def writerows(self, values_list): 115 """ 116 Similar to csv.writer.writerows but with the custom quoting behavior. 117 Returns the written string instead of the length of that string. 118 """ 119 has_empty = False 120 121 # If QUOTE_MINIMAL and space-delimited then replace empty fields with 122 # the sentinel value. 123 if self.quote_empty: 124 for values in values_list: 125 for i, value in enumerate(values): 126 if value == '': 127 has_empty = True 128 values[i] = self.replace_sentinel 129 130 return self._writerow(self.writer.writerows, values_list, has_empty) 131 132 def _writerow(self, writerow_func, values, has_empty): 133 """ 134 Call ``writerow_func`` (either writerow or writerows) with ``values``. 135 If it has empty fields that have been replaced then change those 136 sentinel strings back to quoted empty strings, e.g. ``""``. 137 """ 138 # Clear the temporary StringIO buffer that self.writer writes into and 139 # then call the real csv.writer().writerow or writerows with values. 140 self.temp_out.seek(0) 141 self.temp_out.truncate() 142 writerow_func(values) 143 144 row_string = self.temp_out.getvalue() 145 146 if self.quote_empty and has_empty: 147 row_string = re.sub(self.replace_sentinel, self.quotechar2, row_string) 148 149 # self.csvfile is defined then write the output. In practice the pure 150 # Python writer calls with csvfile=None, while the fast writer calls with 151 # a file-like object. 152 if self.csvfile: 153 self.csvfile.write(row_string) 154 155 return row_string 156 157 158class MaskedConstant(numpy.ma.core.MaskedConstant): 159 """A trivial extension of numpy.ma.masked 160 161 We want to be able to put the generic term ``masked`` into a dictionary. 162 The constant ``numpy.ma.masked`` is not hashable (see 163 https://github.com/numpy/numpy/issues/4660), so we need to extend it 164 here with a hash value. 165 166 See https://github.com/numpy/numpy/issues/11021 for rationale for 167 __copy__ and __deepcopy__ methods. 168 """ 169 170 def __hash__(self): 171 '''All instances of this class shall have the same hash.''' 172 # Any large number will do. 173 return 1234567890 174 175 def __copy__(self): 176 """This is a singleton so just return self.""" 177 return self 178 179 def __deepcopy__(self, memo): 180 return self 181 182 183masked = MaskedConstant() 184 185 186class InconsistentTableError(ValueError): 187 """ 188 Indicates that an input table is inconsistent in some way. 189 190 The default behavior of ``BaseReader`` is to throw an instance of 191 this class if a data row doesn't match the header. 192 """ 193 194 195class OptionalTableImportError(ImportError): 196 """ 197 Indicates that a dependency for table reading is not present. 198 199 An instance of this class is raised whenever an optional reader 200 with certain required dependencies cannot operate because of 201 an ImportError. 202 """ 203 204 205class ParameterError(NotImplementedError): 206 """ 207 Indicates that a reader cannot handle a passed parameter. 208 209 The C-based fast readers in ``io.ascii`` raise an instance of 210 this error class upon encountering a parameter that the 211 C engine cannot handle. 212 """ 213 214 215class FastOptionsError(NotImplementedError): 216 """ 217 Indicates that one of the specified options for fast 218 reading is invalid. 219 """ 220 221 222class NoType: 223 """ 224 Superclass for ``StrType`` and ``NumType`` classes. 225 226 This class is the default type of ``Column`` and provides a base 227 class for other data types. 228 """ 229 230 231class StrType(NoType): 232 """ 233 Indicates that a column consists of text data. 234 """ 235 236 237class NumType(NoType): 238 """ 239 Indicates that a column consists of numerical data. 240 """ 241 242 243class FloatType(NumType): 244 """ 245 Describes floating-point data. 246 """ 247 248 249class BoolType(NoType): 250 """ 251 Describes boolean data. 252 """ 253 254 255class IntType(NumType): 256 """ 257 Describes integer data. 258 """ 259 260 261class AllType(StrType, FloatType, IntType): 262 """ 263 Subclass of all other data types. 264 265 This type is returned by ``convert_numpy`` if the given numpy 266 type does not match ``StrType``, ``FloatType``, or ``IntType``. 267 """ 268 269 270class Column: 271 """Table column. 272 273 The key attributes of a Column object are: 274 275 * **name** : column name 276 * **type** : column type (NoType, StrType, NumType, FloatType, IntType) 277 * **dtype** : numpy dtype (optional, overrides **type** if set) 278 * **str_vals** : list of column values as strings 279 * **fill_values** : dict of fill values 280 * **shape** : list of element shape (default [] => scalar) 281 * **data** : list of converted column values 282 * **subtype** : actual datatype for columns serialized with JSON 283 """ 284 285 def __init__(self, name): 286 self.name = name 287 self.type = NoType # Generic type (Int, Float, Str etc) 288 self.dtype = None # Numpy dtype if available 289 self.str_vals = [] 290 self.fill_values = {} 291 self.shape = [] 292 self.subtype = None 293 294 295class BaseInputter: 296 """ 297 Get the lines from the table input and return a list of lines. 298 299 """ 300 301 encoding = None 302 """Encoding used to read the file""" 303 304 def get_lines(self, table, newline=None): 305 """ 306 Get the lines from the ``table`` input. The input table can be one of: 307 308 * File name 309 * String (newline separated) with all header and data lines (must have at least 2 lines) 310 * File-like object with read() method 311 * List of strings 312 313 Parameters 314 ---------- 315 table : str, file-like, list 316 Can be either a file name, string (newline separated) with all header and data 317 lines (must have at least 2 lines), a file-like object with a 318 ``read()`` method, or a list of strings. 319 newline: line separator, if `None` use OS default from ``splitlines()``. 320 321 Returns 322 ------- 323 lines : list 324 List of lines 325 """ 326 try: 327 if (hasattr(table, 'read') 328 or ('\n' not in table + '' and '\r' not in table + '')): 329 with get_readable_fileobj(table, 330 encoding=self.encoding) as fileobj: 331 table = fileobj.read() 332 if newline is None: 333 lines = table.splitlines() 334 else: 335 lines = table.split(newline) 336 except TypeError: 337 try: 338 # See if table supports indexing, slicing, and iteration 339 table[0] 340 table[0:1] 341 iter(table) 342 if len(table) > 1: 343 lines = table 344 else: 345 # treat single entry as if string had been passed directly 346 if newline is None: 347 lines = table[0].splitlines() 348 else: 349 lines = table[0].split(newline) 350 351 except TypeError: 352 raise TypeError( 353 'Input "table" must be a string (filename or data) or an iterable') 354 355 return self.process_lines(lines) 356 357 def process_lines(self, lines): 358 """Process lines for subsequent use. In the default case do nothing. 359 This routine is not generally intended for removing comment lines or 360 stripping whitespace. These are done (if needed) in the header and 361 data line processing. 362 363 Override this method if something more has to be done to convert raw 364 input lines to the table rows. For example the 365 ContinuationLinesInputter derived class accounts for continuation 366 characters if a row is split into lines.""" 367 return lines 368 369 370class BaseSplitter: 371 """ 372 Base splitter that uses python's split method to do the work. 373 374 This does not handle quoted values. A key feature is the formulation of 375 __call__ as a generator that returns a list of the split line values at 376 each iteration. 377 378 There are two methods that are intended to be overridden, first 379 ``process_line()`` to do pre-processing on each input line before splitting 380 and ``process_val()`` to do post-processing on each split string value. By 381 default these apply the string ``strip()`` function. These can be set to 382 another function via the instance attribute or be disabled entirely, for 383 example:: 384 385 reader.header.splitter.process_val = lambda x: x.lstrip() 386 reader.data.splitter.process_val = None 387 388 """ 389 390 delimiter = None 391 """ one-character string used to separate fields """ 392 393 def process_line(self, line): 394 """Remove whitespace at the beginning or end of line. This is especially useful for 395 whitespace-delimited files to prevent spurious columns at the beginning or end.""" 396 return line.strip() 397 398 def process_val(self, val): 399 """Remove whitespace at the beginning or end of value.""" 400 return val.strip() 401 402 def __call__(self, lines): 403 if self.process_line: 404 lines = (self.process_line(x) for x in lines) 405 for line in lines: 406 vals = line.split(self.delimiter) 407 if self.process_val: 408 yield [self.process_val(x) for x in vals] 409 else: 410 yield vals 411 412 def join(self, vals): 413 if self.delimiter is None: 414 delimiter = ' ' 415 else: 416 delimiter = self.delimiter 417 return delimiter.join(str(x) for x in vals) 418 419 420class DefaultSplitter(BaseSplitter): 421 """Default class to split strings into columns using python csv. The class 422 attributes are taken from the csv Dialect class. 423 424 Typical usage:: 425 426 # lines = .. 427 splitter = ascii.DefaultSplitter() 428 for col_vals in splitter(lines): 429 for col_val in col_vals: 430 ... 431 432 """ 433 delimiter = ' ' 434 """ one-character string used to separate fields. """ 435 quotechar = '"' 436 """ control how instances of *quotechar* in a field are quoted """ 437 doublequote = True 438 """ character to remove special meaning from following character """ 439 escapechar = None 440 """ one-character stringto quote fields containing special characters """ 441 quoting = csv.QUOTE_MINIMAL 442 """ control when quotes are recognized by the reader """ 443 skipinitialspace = True 444 """ ignore whitespace immediately following the delimiter """ 445 csv_writer = None 446 csv_writer_out = StringIO() 447 448 def process_line(self, line): 449 """Remove whitespace at the beginning or end of line. This is especially useful for 450 whitespace-delimited files to prevent spurious columns at the beginning or end. 451 If splitting on whitespace then replace unquoted tabs with space first""" 452 if self.delimiter == r'\s': 453 line = _replace_tab_with_space(line, self.escapechar, self.quotechar) 454 return line.strip() 455 456 def __call__(self, lines): 457 """Return an iterator over the table ``lines``, where each iterator output 458 is a list of the split line values. 459 460 Parameters 461 ---------- 462 lines : list 463 List of table lines 464 465 Yields 466 ------ 467 line : list of str 468 Each line's split values. 469 470 """ 471 if self.process_line: 472 lines = [self.process_line(x) for x in lines] 473 474 delimiter = ' ' if self.delimiter == r'\s' else self.delimiter 475 476 csv_reader = csv.reader(lines, 477 delimiter=delimiter, 478 doublequote=self.doublequote, 479 escapechar=self.escapechar, 480 quotechar=self.quotechar, 481 quoting=self.quoting, 482 skipinitialspace=self.skipinitialspace 483 ) 484 for vals in csv_reader: 485 if self.process_val: 486 yield [self.process_val(x) for x in vals] 487 else: 488 yield vals 489 490 def join(self, vals): 491 492 delimiter = ' ' if self.delimiter is None else str(self.delimiter) 493 494 if self.csv_writer is None: 495 self.csv_writer = CsvWriter(delimiter=delimiter, 496 doublequote=self.doublequote, 497 escapechar=self.escapechar, 498 quotechar=self.quotechar, 499 quoting=self.quoting, 500 lineterminator='') 501 if self.process_val: 502 vals = [self.process_val(x) for x in vals] 503 out = self.csv_writer.writerow(vals) 504 505 return out 506 507 508def _replace_tab_with_space(line, escapechar, quotechar): 509 """Replace tabs with spaces in given string, preserving quoted substrings 510 511 Parameters 512 ---------- 513 line : str 514 String containing tabs to be replaced with spaces. 515 escapechar : str 516 Character in ``line`` used to escape special characters. 517 quotechar : str 518 Character in ``line`` indicating the start/end of a substring. 519 520 Returns 521 ------- 522 line : str 523 A copy of ``line`` with tabs replaced by spaces, preserving quoted substrings. 524 """ 525 newline = [] 526 in_quote = False 527 lastchar = 'NONE' 528 for char in line: 529 if char == quotechar and lastchar != escapechar: 530 in_quote = not in_quote 531 if char == '\t' and not in_quote: 532 char = ' ' 533 lastchar = char 534 newline.append(char) 535 return ''.join(newline) 536 537 538def _get_line_index(line_or_func, lines): 539 """Return the appropriate line index, depending on ``line_or_func`` which 540 can be either a function, a positive or negative int, or None. 541 """ 542 543 if hasattr(line_or_func, '__call__'): 544 return line_or_func(lines) 545 elif line_or_func: 546 if line_or_func >= 0: 547 return line_or_func 548 else: 549 n_lines = sum(1 for line in lines) 550 return n_lines + line_or_func 551 else: 552 return line_or_func 553 554 555class BaseHeader: 556 """ 557 Base table header reader 558 """ 559 auto_format = 'col{}' 560 """ format string for auto-generating column names """ 561 start_line = None 562 """ None, int, or a function of ``lines`` that returns None or int """ 563 comment = None 564 """ regular expression for comment lines """ 565 splitter_class = DefaultSplitter 566 """ Splitter class for splitting data lines into columns """ 567 names = None 568 """ list of names corresponding to each data column """ 569 write_comment = False 570 write_spacer_lines = ['ASCII_TABLE_WRITE_SPACER_LINE'] 571 572 def __init__(self): 573 self.splitter = self.splitter_class() 574 575 def _set_cols_from_names(self): 576 self.cols = [Column(name=x) for x in self.names] 577 578 def update_meta(self, lines, meta): 579 """ 580 Extract any table-level metadata, e.g. keywords, comments, column metadata, from 581 the table ``lines`` and update the OrderedDict ``meta`` in place. This base 582 method extracts comment lines and stores them in ``meta`` for output. 583 """ 584 if self.comment: 585 re_comment = re.compile(self.comment) 586 comment_lines = [x for x in lines if re_comment.match(x)] 587 else: 588 comment_lines = [] 589 comment_lines = [re.sub('^' + self.comment, '', x).strip() 590 for x in comment_lines] 591 if comment_lines: 592 meta.setdefault('table', {})['comments'] = comment_lines 593 594 def get_cols(self, lines): 595 """Initialize the header Column objects from the table ``lines``. 596 597 Based on the previously set Header attributes find or create the column names. 598 Sets ``self.cols`` with the list of Columns. 599 600 Parameters 601 ---------- 602 lines : list 603 List of table lines 604 605 """ 606 607 start_line = _get_line_index(self.start_line, self.process_lines(lines)) 608 if start_line is None: 609 # No header line so auto-generate names from n_data_cols 610 # Get the data values from the first line of table data to determine n_data_cols 611 try: 612 first_data_vals = next(self.data.get_str_vals()) 613 except StopIteration: 614 raise InconsistentTableError('No data lines found so cannot autogenerate ' 615 'column names') 616 n_data_cols = len(first_data_vals) 617 self.names = [self.auto_format.format(i) 618 for i in range(1, n_data_cols + 1)] 619 620 else: 621 for i, line in enumerate(self.process_lines(lines)): 622 if i == start_line: 623 break 624 else: # No header line matching 625 raise ValueError('No header line found in table') 626 627 self.names = next(self.splitter([line])) 628 629 self._set_cols_from_names() 630 631 def process_lines(self, lines): 632 """Generator to yield non-blank and non-comment lines""" 633 re_comment = re.compile(self.comment) if self.comment else None 634 # Yield non-comment lines 635 for line in lines: 636 if line.strip() and (not self.comment or not re_comment.match(line)): 637 yield line 638 639 def write_comments(self, lines, meta): 640 if self.write_comment not in (False, None): 641 for comment in meta.get('comments', []): 642 lines.append(self.write_comment + comment) 643 644 def write(self, lines): 645 if self.start_line is not None: 646 for i, spacer_line in zip(range(self.start_line), 647 itertools.cycle(self.write_spacer_lines)): 648 lines.append(spacer_line) 649 lines.append(self.splitter.join([x.info.name for x in self.cols])) 650 651 @property 652 def colnames(self): 653 """Return the column names of the table""" 654 return tuple(col.name if isinstance(col, Column) else col.info.name 655 for col in self.cols) 656 657 def remove_columns(self, names): 658 """ 659 Remove several columns from the table. 660 661 Parameters 662 ---------- 663 names : list 664 A list containing the names of the columns to remove 665 """ 666 colnames = self.colnames 667 for name in names: 668 if name not in colnames: 669 raise KeyError(f"Column {name} does not exist") 670 671 self.cols = [col for col in self.cols if col.name not in names] 672 673 def rename_column(self, name, new_name): 674 """ 675 Rename a column. 676 677 Parameters 678 ---------- 679 name : str 680 The current name of the column. 681 new_name : str 682 The new name for the column 683 """ 684 try: 685 idx = self.colnames.index(name) 686 except ValueError: 687 raise KeyError(f"Column {name} does not exist") 688 689 col = self.cols[idx] 690 691 # For writing self.cols can contain cols that are not Column. Raise 692 # exception in that case. 693 if isinstance(col, Column): 694 col.name = new_name 695 else: 696 raise TypeError(f'got column type {type(col)} instead of required ' 697 f'{Column}') 698 699 def get_type_map_key(self, col): 700 return col.raw_type 701 702 def get_col_type(self, col): 703 try: 704 type_map_key = self.get_type_map_key(col) 705 return self.col_type_map[type_map_key.lower()] 706 except KeyError: 707 raise ValueError('Unknown data type ""{}"" for column "{}"'.format( 708 col.raw_type, col.name)) 709 710 def check_column_names(self, names, strict_names, guessing): 711 """ 712 Check column names. 713 714 This must be done before applying the names transformation 715 so that guessing will fail appropriately if ``names`` is supplied. 716 For instance if the basic reader is given a table with no column header 717 row. 718 719 Parameters 720 ---------- 721 names : list 722 User-supplied list of column names 723 strict_names : bool 724 Whether to impose extra requirements on names 725 guessing : bool 726 True if this method is being called while guessing the table format 727 """ 728 if strict_names: 729 # Impose strict requirements on column names (normally used in guessing) 730 bads = [" ", ",", "|", "\t", "'", '"'] 731 for name in self.colnames: 732 if (_is_number(name) or len(name) == 0 733 or name[0] in bads or name[-1] in bads): 734 raise InconsistentTableError( 735 f'Column name {name!r} does not meet strict name requirements') 736 # When guessing require at least two columns, except for ECSV which can 737 # reliably be guessed from the header requirements. 738 if guessing and len(self.colnames) <= 1 and self.__class__.__name__ != 'EcsvHeader': 739 raise ValueError('Table format guessing requires at least two columns, got {}' 740 .format(list(self.colnames))) 741 742 if names is not None and len(names) != len(self.colnames): 743 raise InconsistentTableError( 744 'Length of names argument ({}) does not match number' 745 ' of table columns ({})'.format(len(names), len(self.colnames))) 746 747 748class BaseData: 749 """ 750 Base table data reader. 751 """ 752 start_line = None 753 """ None, int, or a function of ``lines`` that returns None or int """ 754 end_line = None 755 """ None, int, or a function of ``lines`` that returns None or int """ 756 comment = None 757 """ Regular expression for comment lines """ 758 splitter_class = DefaultSplitter 759 """ Splitter class for splitting data lines into columns """ 760 write_spacer_lines = ['ASCII_TABLE_WRITE_SPACER_LINE'] 761 fill_include_names = None 762 fill_exclude_names = None 763 fill_values = [(masked, '')] 764 formats = {} 765 766 def __init__(self): 767 # Need to make sure fill_values list is instance attribute, not class attribute. 768 # On read, this will be overwritten by the default in the ui.read (thus, in 769 # the current implementation there can be no different default for different 770 # Readers). On write, ui.py does not specify a default, so this line here matters. 771 self.fill_values = copy.copy(self.fill_values) 772 self.formats = copy.copy(self.formats) 773 self.splitter = self.splitter_class() 774 775 def process_lines(self, lines): 776 """ 777 READ: Strip out comment lines and blank lines from list of ``lines`` 778 779 Parameters 780 ---------- 781 lines : list 782 All lines in table 783 784 Returns 785 ------- 786 lines : list 787 List of lines 788 789 """ 790 nonblank_lines = (x for x in lines if x.strip()) 791 if self.comment: 792 re_comment = re.compile(self.comment) 793 return [x for x in nonblank_lines if not re_comment.match(x)] 794 else: 795 return [x for x in nonblank_lines] 796 797 def get_data_lines(self, lines): 798 """READ: Set ``data_lines`` attribute to lines slice comprising table data values. 799 """ 800 data_lines = self.process_lines(lines) 801 start_line = _get_line_index(self.start_line, data_lines) 802 end_line = _get_line_index(self.end_line, data_lines) 803 804 if start_line is not None or end_line is not None: 805 self.data_lines = data_lines[slice(start_line, end_line)] 806 else: # Don't copy entire data lines unless necessary 807 self.data_lines = data_lines 808 809 def get_str_vals(self): 810 """Return a generator that returns a list of column values (as strings) 811 for each data line.""" 812 return self.splitter(self.data_lines) 813 814 def masks(self, cols): 815 """READ: Set fill value for each column and then apply that fill value 816 817 In the first step it is evaluated with value from ``fill_values`` applies to 818 which column using ``fill_include_names`` and ``fill_exclude_names``. 819 In the second step all replacements are done for the appropriate columns. 820 """ 821 if self.fill_values: 822 self._set_fill_values(cols) 823 self._set_masks(cols) 824 825 def _set_fill_values(self, cols): 826 """READ, WRITE: Set fill values of individual cols based on fill_values of BaseData 827 828 fill values has the following form: 829 <fill_spec> = (<bad_value>, <fill_value>, <optional col_name>...) 830 fill_values = <fill_spec> or list of <fill_spec>'s 831 832 """ 833 if self.fill_values: 834 # when we write tables the columns may be astropy.table.Columns 835 # which don't carry a fill_values by default 836 for col in cols: 837 if not hasattr(col, 'fill_values'): 838 col.fill_values = {} 839 840 # if input is only one <fill_spec>, then make it a list 841 with suppress(TypeError): 842 self.fill_values[0] + '' 843 self.fill_values = [self.fill_values] 844 845 # Step 1: Set the default list of columns which are affected by 846 # fill_values 847 colnames = set(self.header.colnames) 848 if self.fill_include_names is not None: 849 colnames.intersection_update(self.fill_include_names) 850 if self.fill_exclude_names is not None: 851 colnames.difference_update(self.fill_exclude_names) 852 853 # Step 2a: Find out which columns are affected by this tuple 854 # iterate over reversed order, so last condition is set first and 855 # overwritten by earlier conditions 856 for replacement in reversed(self.fill_values): 857 if len(replacement) < 2: 858 raise ValueError("Format of fill_values must be " 859 "(<bad>, <fill>, <optional col1>, ...)") 860 elif len(replacement) == 2: 861 affect_cols = colnames 862 else: 863 affect_cols = replacement[2:] 864 865 for i, key in ((i, x) for i, x in enumerate(self.header.colnames) 866 if x in affect_cols): 867 cols[i].fill_values[replacement[0]] = str(replacement[1]) 868 869 def _set_masks(self, cols): 870 """READ: Replace string values in col.str_vals and set masks""" 871 if self.fill_values: 872 for col in (col for col in cols if col.fill_values): 873 col.mask = numpy.zeros(len(col.str_vals), dtype=bool) 874 for i, str_val in ((i, x) for i, x in enumerate(col.str_vals) 875 if x in col.fill_values): 876 col.str_vals[i] = col.fill_values[str_val] 877 col.mask[i] = True 878 879 def _replace_vals(self, cols): 880 """WRITE: replace string values in col.str_vals""" 881 if self.fill_values: 882 for col in (col for col in cols if col.fill_values): 883 for i, str_val in ((i, x) for i, x in enumerate(col.str_vals) 884 if x in col.fill_values): 885 col.str_vals[i] = col.fill_values[str_val] 886 if masked in col.fill_values and hasattr(col, 'mask'): 887 mask_val = col.fill_values[masked] 888 for i in col.mask.nonzero()[0]: 889 col.str_vals[i] = mask_val 890 891 def str_vals(self): 892 """WRITE: convert all values in table to a list of lists of strings 893 894 This sets the fill values and possibly column formats from the input 895 formats={} keyword, then ends up calling table.pprint._pformat_col_iter() 896 by a circuitous path. That function does the real work of formatting. 897 Finally replace anything matching the fill_values. 898 899 Returns 900 ------- 901 values : list of list of str 902 """ 903 self._set_fill_values(self.cols) 904 self._set_col_formats() 905 for col in self.cols: 906 col.str_vals = list(col.info.iter_str_vals()) 907 self._replace_vals(self.cols) 908 return [col.str_vals for col in self.cols] 909 910 def write(self, lines): 911 """Write ``self.cols`` in place to ``lines``. 912 913 Parameters 914 ---------- 915 lines : list 916 List for collecting output of writing self.cols. 917 """ 918 if hasattr(self.start_line, '__call__'): 919 raise TypeError('Start_line attribute cannot be callable for write()') 920 else: 921 data_start_line = self.start_line or 0 922 923 while len(lines) < data_start_line: 924 lines.append(itertools.cycle(self.write_spacer_lines)) 925 926 col_str_iters = self.str_vals() 927 for vals in zip(*col_str_iters): 928 lines.append(self.splitter.join(vals)) 929 930 def _set_col_formats(self): 931 """WRITE: set column formats.""" 932 for col in self.cols: 933 if col.info.name in self.formats: 934 col.info.format = self.formats[col.info.name] 935 936 937def convert_numpy(numpy_type): 938 """Return a tuple containing a function which converts a list into a numpy 939 array and the type produced by the converter function. 940 941 Parameters 942 ---------- 943 numpy_type : numpy data-type 944 The numpy type required of an array returned by ``converter``. Must be a 945 valid `numpy type <https://numpy.org/doc/stable/user/basics.types.html>`_ 946 (e.g., numpy.uint, numpy.int8, numpy.int64, numpy.float64) or a python 947 type covered by a numpy type (e.g., int, float, str, bool). 948 949 Returns 950 ------- 951 converter : callable 952 ``converter`` is a function which accepts a list and converts it to a 953 numpy array of type ``numpy_type``. 954 converter_type : type 955 ``converter_type`` tracks the generic data type produced by the 956 converter function. 957 958 Raises 959 ------ 960 ValueError 961 Raised by ``converter`` if the list elements could not be converted to 962 the required type. 963 """ 964 965 # Infer converter type from an instance of numpy_type. 966 type_name = numpy.array([], dtype=numpy_type).dtype.name 967 if 'int' in type_name: 968 converter_type = IntType 969 elif 'float' in type_name: 970 converter_type = FloatType 971 elif 'bool' in type_name: 972 converter_type = BoolType 973 elif 'str' in type_name: 974 converter_type = StrType 975 else: 976 converter_type = AllType 977 978 def bool_converter(vals): 979 """ 980 Convert values "False" and "True" to bools. Raise an exception 981 for any other string values. 982 """ 983 if len(vals) == 0: 984 return numpy.array([], dtype=bool) 985 986 # Try a smaller subset first for a long array 987 if len(vals) > 10000: 988 svals = numpy.asarray(vals[:1000]) 989 if not numpy.all((svals == 'False') 990 | (svals == 'True') 991 | (svals == '0') 992 | (svals == '1')): 993 raise ValueError('bool input strings must be False, True, 0, 1, or ""') 994 vals = numpy.asarray(vals) 995 996 trues = (vals == 'True') | (vals == '1') 997 falses = (vals == 'False') | (vals == '0') 998 if not numpy.all(trues | falses): 999 raise ValueError('bool input strings must be only False, True, 0, 1, or ""') 1000 1001 return trues 1002 1003 def generic_converter(vals): 1004 return numpy.array(vals, numpy_type) 1005 1006 converter = bool_converter if converter_type is BoolType else generic_converter 1007 1008 return converter, converter_type 1009 1010 1011class BaseOutputter: 1012 """Output table as a dict of column objects keyed on column name. The 1013 table data are stored as plain python lists within the column objects. 1014 """ 1015 converters = {} 1016 # Derived classes must define default_converters and __call__ 1017 1018 @staticmethod 1019 def _validate_and_copy(col, converters): 1020 """Validate the format for the type converters and then copy those 1021 which are valid converters for this column (i.e. converter type is 1022 a subclass of col.type)""" 1023 converters_out = [] 1024 try: 1025 for converter in converters: 1026 converter_func, converter_type = converter 1027 if not issubclass(converter_type, NoType): 1028 raise ValueError() 1029 if issubclass(converter_type, col.type): 1030 converters_out.append((converter_func, converter_type)) 1031 1032 except (ValueError, TypeError): 1033 raise ValueError('Error: invalid format for converters, see ' 1034 'documentation\n{}'.format(converters)) 1035 return converters_out 1036 1037 def _convert_vals(self, cols): 1038 for col in cols: 1039 for key, converters in self.converters.items(): 1040 if fnmatch.fnmatch(col.name, key): 1041 break 1042 else: 1043 if col.dtype is not None: 1044 converters = [convert_numpy(col.dtype)] 1045 else: 1046 converters = self.default_converters 1047 1048 col.converters = self._validate_and_copy(col, converters) 1049 1050 # Catch the last error in order to provide additional information 1051 # in case all attempts at column conversion fail. The initial 1052 # value of of last_error will apply if no converters are defined 1053 # and the first col.converters[0] access raises IndexError. 1054 last_err = 'no converters defined' 1055 1056 while not hasattr(col, 'data'): 1057 # Try converters, popping the unsuccessful ones from the list. 1058 # If there are no converters left here then fail. 1059 if not col.converters: 1060 raise ValueError(f'Column {col.name} failed to convert: {last_err}') 1061 1062 converter_func, converter_type = col.converters[0] 1063 if not issubclass(converter_type, col.type): 1064 raise TypeError('converter type does not match column type') 1065 1066 try: 1067 col.data = converter_func(col.str_vals) 1068 col.type = converter_type 1069 except (TypeError, ValueError) as err: 1070 col.converters.pop(0) 1071 last_err = err 1072 except OverflowError as err: 1073 # Overflow during conversion (most likely an int that 1074 # doesn't fit in native C long). Put string at the top of 1075 # the converters list for the next while iteration. 1076 warnings.warn( 1077 "OverflowError converting to {} in column {}, reverting to String." 1078 .format(converter_type.__name__, col.name), AstropyWarning) 1079 col.converters.insert(0, convert_numpy(numpy.str)) 1080 last_err = err 1081 1082 1083def _deduplicate_names(names): 1084 """Ensure there are no duplicates in ``names`` 1085 1086 This is done by iteratively adding ``_<N>`` to the name for increasing N 1087 until the name is unique. 1088 """ 1089 new_names = [] 1090 existing_names = set() 1091 1092 for name in names: 1093 base_name = name + '_' 1094 i = 1 1095 while name in existing_names: 1096 # Iterate until a unique name is found 1097 name = base_name + str(i) 1098 i += 1 1099 new_names.append(name) 1100 existing_names.add(name) 1101 1102 return new_names 1103 1104 1105class TableOutputter(BaseOutputter): 1106 """ 1107 Output the table as an astropy.table.Table object. 1108 """ 1109 1110 default_converters = [convert_numpy(int), 1111 convert_numpy(float), 1112 convert_numpy(str)] 1113 1114 def __call__(self, cols, meta): 1115 # Sets col.data to numpy array and col.type to io.ascii Type class (e.g. 1116 # FloatType) for each col. 1117 self._convert_vals(cols) 1118 1119 t_cols = [numpy.ma.MaskedArray(x.data, mask=x.mask) 1120 if hasattr(x, 'mask') and numpy.any(x.mask) 1121 else x.data for x in cols] 1122 out = Table(t_cols, names=[x.name for x in cols], meta=meta['table']) 1123 1124 for col, out_col in zip(cols, out.columns.values()): 1125 for attr in ('format', 'unit', 'description'): 1126 if hasattr(col, attr): 1127 setattr(out_col, attr, getattr(col, attr)) 1128 if hasattr(col, 'meta'): 1129 out_col.meta.update(col.meta) 1130 1131 return out 1132 1133 1134class MetaBaseReader(type): 1135 def __init__(cls, name, bases, dct): 1136 super().__init__(name, bases, dct) 1137 1138 format = dct.get('_format_name') 1139 if format is None: 1140 return 1141 1142 fast = dct.get('_fast') 1143 if fast is not None: 1144 FAST_CLASSES[format] = cls 1145 1146 FORMAT_CLASSES[format] = cls 1147 1148 io_formats = ['ascii.' + format] + dct.get('_io_registry_format_aliases', []) 1149 1150 if dct.get('_io_registry_suffix'): 1151 func = functools.partial(connect.io_identify, dct['_io_registry_suffix']) 1152 connect.io_registry.register_identifier(io_formats[0], Table, func) 1153 1154 for io_format in io_formats: 1155 func = functools.partial(connect.io_read, io_format) 1156 header = f"ASCII reader '{io_format}' details\n" 1157 func.__doc__ = (inspect.cleandoc(READ_DOCSTRING).strip() + '\n\n' 1158 + header + re.sub('.', '=', header) + '\n') 1159 func.__doc__ += inspect.cleandoc(cls.__doc__).strip() 1160 connect.io_registry.register_reader(io_format, Table, func) 1161 1162 if dct.get('_io_registry_can_write', True): 1163 func = functools.partial(connect.io_write, io_format) 1164 header = f"ASCII writer '{io_format}' details\n" 1165 func.__doc__ = (inspect.cleandoc(WRITE_DOCSTRING).strip() + '\n\n' 1166 + header + re.sub('.', '=', header) + '\n') 1167 func.__doc__ += inspect.cleandoc(cls.__doc__).strip() 1168 connect.io_registry.register_writer(io_format, Table, func) 1169 1170 1171def _is_number(x): 1172 with suppress(ValueError): 1173 x = float(x) 1174 return True 1175 return False 1176 1177 1178def _apply_include_exclude_names(table, names, include_names, exclude_names): 1179 """ 1180 Apply names, include_names and exclude_names to a table or BaseHeader. 1181 1182 For the latter this relies on BaseHeader implementing ``colnames``, 1183 ``rename_column``, and ``remove_columns``. 1184 1185 Parameters 1186 ---------- 1187 table : `~astropy.table.Table`, `~astropy.io.ascii.BaseHeader` 1188 Input table or BaseHeader subclass instance 1189 names : list 1190 List of names to override those in table (set to None to use existing names) 1191 include_names : list 1192 List of names to include in output 1193 exclude_names : list 1194 List of names to exclude from output (applied after ``include_names``) 1195 1196 """ 1197 def rename_columns(table, names): 1198 # Rename table column names to those passed by user 1199 # Temporarily rename with names that are not in `names` or `table.colnames`. 1200 # This ensures that rename succeeds regardless of existing names. 1201 xxxs = 'x' * max(len(name) for name in list(names) + list(table.colnames)) 1202 for ii, colname in enumerate(table.colnames): 1203 table.rename_column(colname, xxxs + str(ii)) 1204 1205 for ii, name in enumerate(names): 1206 table.rename_column(xxxs + str(ii), name) 1207 1208 if names is not None: 1209 rename_columns(table, names) 1210 else: 1211 colnames_uniq = _deduplicate_names(table.colnames) 1212 if colnames_uniq != list(table.colnames): 1213 rename_columns(table, colnames_uniq) 1214 1215 names_set = set(table.colnames) 1216 1217 if include_names is not None: 1218 names_set.intersection_update(include_names) 1219 if exclude_names is not None: 1220 names_set.difference_update(exclude_names) 1221 if names_set != set(table.colnames): 1222 remove_names = set(table.colnames) - names_set 1223 table.remove_columns(remove_names) 1224 1225 1226class BaseReader(metaclass=MetaBaseReader): 1227 """Class providing methods to read and write an ASCII table using the specified 1228 header, data, inputter, and outputter instances. 1229 1230 Typical usage is to instantiate a Reader() object and customize the 1231 ``header``, ``data``, ``inputter``, and ``outputter`` attributes. Each 1232 of these is an object of the corresponding class. 1233 1234 There is one method ``inconsistent_handler`` that can be used to customize the 1235 behavior of ``read()`` in the event that a data row doesn't match the header. 1236 The default behavior is to raise an InconsistentTableError. 1237 1238 """ 1239 1240 names = None 1241 include_names = None 1242 exclude_names = None 1243 strict_names = False 1244 guessing = False 1245 encoding = None 1246 1247 header_class = BaseHeader 1248 data_class = BaseData 1249 inputter_class = BaseInputter 1250 outputter_class = TableOutputter 1251 1252 # Max column dimension that writer supports for this format. Exceptions 1253 # include ECSV (no limit) and HTML (max_ndim=2). 1254 max_ndim = 1 1255 1256 def __init__(self): 1257 self.header = self.header_class() 1258 self.data = self.data_class() 1259 self.inputter = self.inputter_class() 1260 self.outputter = self.outputter_class() 1261 # Data and Header instances benefit from a little cross-coupling. Header may need to 1262 # know about number of data columns for auto-column name generation and Data may 1263 # need to know about header (e.g. for fixed-width tables where widths are spec'd in header. 1264 self.data.header = self.header 1265 self.header.data = self.data 1266 1267 # Metadata, consisting of table-level meta and column-level meta. The latter 1268 # could include information about column type, description, formatting, etc, 1269 # depending on the table meta format. 1270 self.meta = OrderedDict(table=OrderedDict(), 1271 cols=OrderedDict()) 1272 1273 def _check_multidim_table(self, table): 1274 """Check that the dimensions of columns in ``table`` are acceptable. 1275 1276 The reader class attribute ``max_ndim`` defines the maximum dimension of 1277 columns that can be written using this format. The base value is ``1``, 1278 corresponding to normal scalar columns with just a length. 1279 1280 Parameters 1281 ---------- 1282 table : `~astropy.table.Table` 1283 Input table. 1284 1285 Raises 1286 ------ 1287 ValueError 1288 If any column exceeds the number of allowed dimensions 1289 """ 1290 _check_multidim_table(table, self.max_ndim) 1291 1292 def read(self, table): 1293 """Read the ``table`` and return the results in a format determined by 1294 the ``outputter`` attribute. 1295 1296 The ``table`` parameter is any string or object that can be processed 1297 by the instance ``inputter``. For the base Inputter class ``table`` can be 1298 one of: 1299 1300 * File name 1301 * File-like object 1302 * String (newline separated) with all header and data lines (must have at least 2 lines) 1303 * List of strings 1304 1305 Parameters 1306 ---------- 1307 table : str, file-like, list 1308 Input table. 1309 1310 Returns 1311 ------- 1312 table : `~astropy.table.Table` 1313 Output table 1314 1315 """ 1316 # If ``table`` is a file then store the name in the ``data`` 1317 # attribute. The ``table`` is a "file" if it is a string 1318 # without the new line specific to the OS. 1319 with suppress(TypeError): 1320 # Strings only 1321 if os.linesep not in table + '': 1322 self.data.table_name = os.path.basename(table) 1323 1324 # If one of the newline chars is set as field delimiter, only 1325 # accept the other one as line splitter 1326 if self.header.splitter.delimiter == '\n': 1327 newline = '\r' 1328 elif self.header.splitter.delimiter == '\r': 1329 newline = '\n' 1330 else: 1331 newline = None 1332 1333 # Get a list of the lines (rows) in the table 1334 self.lines = self.inputter.get_lines(table, newline=newline) 1335 1336 # Set self.data.data_lines to a slice of lines contain the data rows 1337 self.data.get_data_lines(self.lines) 1338 1339 # Extract table meta values (e.g. keywords, comments, etc). Updates self.meta. 1340 self.header.update_meta(self.lines, self.meta) 1341 1342 # Get the table column definitions 1343 self.header.get_cols(self.lines) 1344 1345 # Make sure columns are valid 1346 self.header.check_column_names(self.names, self.strict_names, self.guessing) 1347 1348 self.cols = cols = self.header.cols 1349 self.data.splitter.cols = cols 1350 n_cols = len(cols) 1351 1352 for i, str_vals in enumerate(self.data.get_str_vals()): 1353 if len(str_vals) != n_cols: 1354 str_vals = self.inconsistent_handler(str_vals, n_cols) 1355 1356 # if str_vals is None, we skip this row 1357 if str_vals is None: 1358 continue 1359 1360 # otherwise, we raise an error only if it is still inconsistent 1361 if len(str_vals) != n_cols: 1362 errmsg = ('Number of header columns ({}) inconsistent with' 1363 ' data columns ({}) at data line {}\n' 1364 'Header values: {}\n' 1365 'Data values: {}'.format( 1366 n_cols, len(str_vals), i, 1367 [x.name for x in cols], str_vals)) 1368 1369 raise InconsistentTableError(errmsg) 1370 1371 for j, col in enumerate(cols): 1372 col.str_vals.append(str_vals[j]) 1373 1374 self.data.masks(cols) 1375 if hasattr(self.header, 'table_meta'): 1376 self.meta['table'].update(self.header.table_meta) 1377 1378 _apply_include_exclude_names(self.header, self.names, 1379 self.include_names, self.exclude_names) 1380 1381 table = self.outputter(self.header.cols, self.meta) 1382 self.cols = self.header.cols 1383 1384 return table 1385 1386 def inconsistent_handler(self, str_vals, ncols): 1387 """ 1388 Adjust or skip data entries if a row is inconsistent with the header. 1389 1390 The default implementation does no adjustment, and hence will always trigger 1391 an exception in read() any time the number of data entries does not match 1392 the header. 1393 1394 Note that this will *not* be called if the row already matches the header. 1395 1396 Parameters 1397 ---------- 1398 str_vals : list 1399 A list of value strings from the current row of the table. 1400 ncols : int 1401 The expected number of entries from the table header. 1402 1403 Returns 1404 ------- 1405 str_vals : list 1406 List of strings to be parsed into data entries in the output table. If 1407 the length of this list does not match ``ncols``, an exception will be 1408 raised in read(). Can also be None, in which case the row will be 1409 skipped. 1410 """ 1411 # an empty list will always trigger an InconsistentTableError in read() 1412 return str_vals 1413 1414 @property 1415 def comment_lines(self): 1416 """Return lines in the table that match header.comment regexp""" 1417 if not hasattr(self, 'lines'): 1418 raise ValueError('Table must be read prior to accessing the header comment lines') 1419 if self.header.comment: 1420 re_comment = re.compile(self.header.comment) 1421 comment_lines = [x for x in self.lines if re_comment.match(x)] 1422 else: 1423 comment_lines = [] 1424 return comment_lines 1425 1426 def update_table_data(self, table): 1427 """ 1428 Update table columns in place if needed. 1429 1430 This is a hook to allow updating the table columns after name 1431 filtering but before setting up to write the data. This is currently 1432 only used by ECSV and is otherwise just a pass-through. 1433 1434 Parameters 1435 ---------- 1436 table : `astropy.table.Table` 1437 Input table for writing 1438 1439 Returns 1440 ------- 1441 table : `astropy.table.Table` 1442 Output table for writing 1443 """ 1444 return table 1445 1446 def write_header(self, lines, meta): 1447 self.header.write_comments(lines, meta) 1448 self.header.write(lines) 1449 1450 def write(self, table): 1451 """ 1452 Write ``table`` as list of strings. 1453 1454 Parameters 1455 ---------- 1456 table : `~astropy.table.Table` 1457 Input table data. 1458 1459 Returns 1460 ------- 1461 lines : list 1462 List of strings corresponding to ASCII table 1463 1464 """ 1465 1466 # Check column names before altering 1467 self.header.cols = list(table.columns.values()) 1468 self.header.check_column_names(self.names, self.strict_names, False) 1469 1470 # In-place update of columns in input ``table`` to reflect column 1471 # filtering. Note that ``table`` is guaranteed to be a copy of the 1472 # original user-supplied table. 1473 _apply_include_exclude_names(table, self.names, self.include_names, self.exclude_names) 1474 1475 # This is a hook to allow updating the table columns after name 1476 # filtering but before setting up to write the data. This is currently 1477 # only used by ECSV and is otherwise just a pass-through. 1478 table = self.update_table_data(table) 1479 1480 # Check that table column dimensions are supported by this format class. 1481 # Most formats support only 1-d columns, but some like ECSV support N-d. 1482 self._check_multidim_table(table) 1483 1484 # Now use altered columns 1485 new_cols = list(table.columns.values()) 1486 # link information about the columns to the writer object (i.e. self) 1487 self.header.cols = new_cols 1488 self.data.cols = new_cols 1489 self.header.table_meta = table.meta 1490 1491 # Write header and data to lines list 1492 lines = [] 1493 self.write_header(lines, table.meta) 1494 self.data.write(lines) 1495 1496 return lines 1497 1498 1499class ContinuationLinesInputter(BaseInputter): 1500 """Inputter where lines ending in ``continuation_char`` are joined 1501 with the subsequent line. Example:: 1502 1503 col1 col2 col3 1504 1 \ 1505 2 3 1506 4 5 \ 1507 6 1508 """ 1509 1510 continuation_char = '\\' 1511 replace_char = ' ' 1512 # If no_continue is not None then lines matching this regex are not subject 1513 # to line continuation. The initial use case here is Daophot. In this 1514 # case the continuation character is just replaced with replace_char. 1515 no_continue = None 1516 1517 def process_lines(self, lines): 1518 re_no_continue = re.compile(self.no_continue) if self.no_continue else None 1519 1520 parts = [] 1521 outlines = [] 1522 for line in lines: 1523 if re_no_continue and re_no_continue.match(line): 1524 line = line.replace(self.continuation_char, self.replace_char) 1525 if line.endswith(self.continuation_char): 1526 parts.append(line.replace(self.continuation_char, self.replace_char)) 1527 else: 1528 parts.append(line) 1529 outlines.append(''.join(parts)) 1530 parts = [] 1531 1532 return outlines 1533 1534 1535class WhitespaceSplitter(DefaultSplitter): 1536 def process_line(self, line): 1537 """Replace tab with space within ``line`` while respecting quoted substrings""" 1538 newline = [] 1539 in_quote = False 1540 lastchar = None 1541 for char in line: 1542 if char == self.quotechar and (self.escapechar is None 1543 or lastchar != self.escapechar): 1544 in_quote = not in_quote 1545 if char == '\t' and not in_quote: 1546 char = ' ' 1547 lastchar = char 1548 newline.append(char) 1549 1550 return ''.join(newline) 1551 1552 1553extra_reader_pars = ('Reader', 'Inputter', 'Outputter', 1554 'delimiter', 'comment', 'quotechar', 'header_start', 1555 'data_start', 'data_end', 'converters', 'encoding', 1556 'data_Splitter', 'header_Splitter', 1557 'names', 'include_names', 'exclude_names', 'strict_names', 1558 'fill_values', 'fill_include_names', 'fill_exclude_names') 1559 1560 1561def _get_reader(Reader, Inputter=None, Outputter=None, **kwargs): 1562 """Initialize a table reader allowing for common customizations. See ui.get_reader() 1563 for param docs. This routine is for internal (package) use only and is useful 1564 because it depends only on the "core" module. 1565 """ 1566 1567 from .fastbasic import FastBasic 1568 if issubclass(Reader, FastBasic): # Fast readers handle args separately 1569 if Inputter is not None: 1570 kwargs['Inputter'] = Inputter 1571 return Reader(**kwargs) 1572 1573 # If user explicitly passed a fast reader with enable='force' 1574 # (e.g. by passing non-default options), raise an error for slow readers 1575 if 'fast_reader' in kwargs: 1576 if kwargs['fast_reader']['enable'] == 'force': 1577 raise ParameterError('fast_reader required with ' 1578 '{}, but this is not a fast C reader: {}' 1579 .format(kwargs['fast_reader'], Reader)) 1580 else: 1581 del kwargs['fast_reader'] # Otherwise ignore fast_reader parameter 1582 1583 reader_kwargs = dict([k, v] for k, v in kwargs.items() if k not in extra_reader_pars) 1584 reader = Reader(**reader_kwargs) 1585 1586 if Inputter is not None: 1587 reader.inputter = Inputter() 1588 1589 if Outputter is not None: 1590 reader.outputter = Outputter() 1591 1592 # Issue #855 suggested to set data_start to header_start + default_header_length 1593 # Thus, we need to retrieve this from the class definition before resetting these numbers. 1594 try: 1595 default_header_length = reader.data.start_line - reader.header.start_line 1596 except TypeError: # Start line could be None or an instancemethod 1597 default_header_length = None 1598 1599 # csv.reader is hard-coded to recognise either '\r' or '\n' as end-of-line, 1600 # therefore DefaultSplitter cannot handle these as delimiters. 1601 if 'delimiter' in kwargs: 1602 if kwargs['delimiter'] in ('\n', '\r', '\r\n'): 1603 reader.header.splitter = BaseSplitter() 1604 reader.data.splitter = BaseSplitter() 1605 reader.header.splitter.delimiter = kwargs['delimiter'] 1606 reader.data.splitter.delimiter = kwargs['delimiter'] 1607 if 'comment' in kwargs: 1608 reader.header.comment = kwargs['comment'] 1609 reader.data.comment = kwargs['comment'] 1610 if 'quotechar' in kwargs: 1611 reader.header.splitter.quotechar = kwargs['quotechar'] 1612 reader.data.splitter.quotechar = kwargs['quotechar'] 1613 if 'data_start' in kwargs: 1614 reader.data.start_line = kwargs['data_start'] 1615 if 'data_end' in kwargs: 1616 reader.data.end_line = kwargs['data_end'] 1617 if 'header_start' in kwargs: 1618 if (reader.header.start_line is not None): 1619 reader.header.start_line = kwargs['header_start'] 1620 # For FixedWidthTwoLine the data_start is calculated relative to the position line. 1621 # However, position_line is given as absolute number and not relative to header_start. 1622 # So, ignore this Reader here. 1623 if (('data_start' not in kwargs) and (default_header_length is not None) 1624 and reader._format_name not in ['fixed_width_two_line', 'commented_header']): 1625 reader.data.start_line = reader.header.start_line + default_header_length 1626 elif kwargs['header_start'] is not None: 1627 # User trying to set a None header start to some value other than None 1628 raise ValueError('header_start cannot be modified for this Reader') 1629 if 'converters' in kwargs: 1630 reader.outputter.converters = kwargs['converters'] 1631 if 'data_Splitter' in kwargs: 1632 reader.data.splitter = kwargs['data_Splitter']() 1633 if 'header_Splitter' in kwargs: 1634 reader.header.splitter = kwargs['header_Splitter']() 1635 if 'names' in kwargs: 1636 reader.names = kwargs['names'] 1637 if None in reader.names: 1638 raise TypeError('Cannot have None for column name') 1639 if len(set(reader.names)) != len(reader.names): 1640 raise ValueError('Duplicate column names') 1641 if 'include_names' in kwargs: 1642 reader.include_names = kwargs['include_names'] 1643 if 'exclude_names' in kwargs: 1644 reader.exclude_names = kwargs['exclude_names'] 1645 # Strict names is normally set only within the guessing process to 1646 # indicate that column names cannot be numeric or have certain 1647 # characters at the beginning or end. It gets used in 1648 # BaseHeader.check_column_names(). 1649 if 'strict_names' in kwargs: 1650 reader.strict_names = kwargs['strict_names'] 1651 if 'fill_values' in kwargs: 1652 reader.data.fill_values = kwargs['fill_values'] 1653 if 'fill_include_names' in kwargs: 1654 reader.data.fill_include_names = kwargs['fill_include_names'] 1655 if 'fill_exclude_names' in kwargs: 1656 reader.data.fill_exclude_names = kwargs['fill_exclude_names'] 1657 if 'encoding' in kwargs: 1658 reader.encoding = kwargs['encoding'] 1659 reader.inputter.encoding = kwargs['encoding'] 1660 1661 return reader 1662 1663 1664extra_writer_pars = ('delimiter', 'comment', 'quotechar', 'formats', 1665 'strip_whitespace', 1666 'names', 'include_names', 'exclude_names', 1667 'fill_values', 'fill_include_names', 1668 'fill_exclude_names') 1669 1670 1671def _get_writer(Writer, fast_writer, **kwargs): 1672 """Initialize a table writer allowing for common customizations. This 1673 routine is for internal (package) use only and is useful because it depends 1674 only on the "core" module. """ 1675 1676 from .fastbasic import FastBasic 1677 1678 # A value of None for fill_values imply getting the default string 1679 # representation of masked values (depending on the writer class), but the 1680 # machinery expects a list. The easiest here is to just pop the value off, 1681 # i.e. fill_values=None is the same as not providing it at all. 1682 if 'fill_values' in kwargs and kwargs['fill_values'] is None: 1683 del kwargs['fill_values'] 1684 1685 if issubclass(Writer, FastBasic): # Fast writers handle args separately 1686 return Writer(**kwargs) 1687 elif fast_writer and f'fast_{Writer._format_name}' in FAST_CLASSES: 1688 # Switch to fast writer 1689 kwargs['fast_writer'] = fast_writer 1690 return FAST_CLASSES[f'fast_{Writer._format_name}'](**kwargs) 1691 1692 writer_kwargs = dict([k, v] for k, v in kwargs.items() if k not in extra_writer_pars) 1693 writer = Writer(**writer_kwargs) 1694 1695 if 'delimiter' in kwargs: 1696 writer.header.splitter.delimiter = kwargs['delimiter'] 1697 writer.data.splitter.delimiter = kwargs['delimiter'] 1698 if 'comment' in kwargs: 1699 writer.header.write_comment = kwargs['comment'] 1700 writer.data.write_comment = kwargs['comment'] 1701 if 'quotechar' in kwargs: 1702 writer.header.splitter.quotechar = kwargs['quotechar'] 1703 writer.data.splitter.quotechar = kwargs['quotechar'] 1704 if 'formats' in kwargs: 1705 writer.data.formats = kwargs['formats'] 1706 if 'strip_whitespace' in kwargs: 1707 if kwargs['strip_whitespace']: 1708 # Restore the default SplitterClass process_val method which strips 1709 # whitespace. This may have been changed in the Writer 1710 # initialization (e.g. Rdb and Tab) 1711 writer.data.splitter.process_val = operator.methodcaller('strip') 1712 else: 1713 writer.data.splitter.process_val = None 1714 if 'names' in kwargs: 1715 writer.header.names = kwargs['names'] 1716 if 'include_names' in kwargs: 1717 writer.include_names = kwargs['include_names'] 1718 if 'exclude_names' in kwargs: 1719 writer.exclude_names = kwargs['exclude_names'] 1720 if 'fill_values' in kwargs: 1721 # Prepend user-specified values to the class default. 1722 with suppress(TypeError, IndexError): 1723 # Test if it looks like (match, replace_string, optional_colname), 1724 # in which case make it a list 1725 kwargs['fill_values'][1] + '' 1726 kwargs['fill_values'] = [kwargs['fill_values']] 1727 writer.data.fill_values = kwargs['fill_values'] + writer.data.fill_values 1728 if 'fill_include_names' in kwargs: 1729 writer.data.fill_include_names = kwargs['fill_include_names'] 1730 if 'fill_exclude_names' in kwargs: 1731 writer.data.fill_exclude_names = kwargs['fill_exclude_names'] 1732 return writer 1733