import sys
import petl
import typer
import json as pyjson
import yaml as pyyaml
from typing import List
from ..detector import Detector
from ..extract import extract
from ..layout import Layout
from .main import program
from .. import helpers
from . import common


@program.command(name="extract")
def program_extract(
    # Source
    source: List[str] = common.source,
    type: str = common.type,
    # File
    path: str = common.path,
    scheme: str = common.scheme,
    format: str = common.format,
    hashing: str = common.hashing,
    encoding: str = common.encoding,
    innerpath: str = common.innerpath,
    compression: str = common.compression,
    # Control
    control: str = common.control,
    # Dialect
    dialect: str = common.dialect,
    # Layout
    header_rows: str = common.header_rows,
    header_join: str = common.header_join,
    pick_fields: str = common.pick_fields,
    skip_fields: str = common.skip_fields,
    limit_fields: int = common.limit_fields,
    offset_fields: int = common.offset_fields,
    pick_rows: str = common.pick_rows,
    skip_rows: str = common.skip_rows,
    limit_rows: int = common.limit_rows,
    offset_rows: int = common.offset_rows,
    # Schema
    schema: str = common.schema,
    # Detector
    buffer_size: int = common.buffer_size,
    sample_size: int = common.sample_size,
    field_type: str = common.field_type,
    field_names: str = common.field_names,
    field_confidence: float = common.field_confidence,
    field_float_numbers: bool = common.field_float_numbers,
    field_missing_values: str = common.field_missing_values,
    schema_sync: bool = common.schema_sync,
    # Command
    basepath: str = common.basepath,
    # NOTE: these three flags shadow the `yaml`/`json` module names, which is
    # why the modules are imported as `pyyaml`/`pyjson` above.
    yaml: bool = common.yaml,
    json: bool = common.json,
    csv: bool = common.csv,
):
    """
    Extract a data source.

    Based on the inferred data source type it will return resource or package data.
    Default output format is tabulated with a front matter.
    """

    # Support stdin: when neither "source" nor "path" was given and stdin is
    # piped (not a TTY), read the whole stream as a single in-memory source.
    is_stdin = False
    if not source and not path:
        if not sys.stdin.isatty():
            is_stdin = True
            source = [sys.stdin.buffer.read()]

    # Validate input: at this point at least one of "source"/"path" must exist
    # (stdin fallback above may have filled "source"). Exit code 1 on failure.
    if not source and not path:
        message = 'Providing "source" or "path" is required'
        typer.secho(message, err=True, fg=typer.colors.RED, bold=True)
        raise typer.Exit(1)

    # Normalize parameters: collapse a single-item source list to a scalar,
    # parse JSON-string options into dicts, and CSV-string options into lists.
    # fallback=True presumably keeps the raw string when int conversion fails
    # (e.g. "<blank>" row selectors) — semantics live in helpers; verify there.
    source = list(source) if len(source) > 1 else (source[0] if source else None)
    control = helpers.parse_json_string(control)
    dialect = helpers.parse_json_string(dialect)
    header_rows = helpers.parse_csv_string(header_rows, convert=int)
    pick_fields = helpers.parse_csv_string(pick_fields, convert=int, fallback=True)
    skip_fields = helpers.parse_csv_string(skip_fields, convert=int, fallback=True)
    pick_rows = helpers.parse_csv_string(pick_rows, convert=int, fallback=True)
    skip_rows = helpers.parse_csv_string(skip_rows, convert=int, fallback=True)
    field_names = helpers.parse_csv_string(field_names)
    field_missing_values = helpers.parse_csv_string(field_missing_values)

    # Prepare layout.
    # NOTE(review): "or None" drops the Layout when no layout option was set —
    # this relies on an empty Layout being falsy (dict-like truthiness);
    # confirm against the Layout/Metadata implementation.
    layout = (
        Layout(
            header_rows=header_rows,
            header_join=header_join,
            pick_fields=pick_fields,
            skip_fields=skip_fields,
            limit_fields=limit_fields,
            offset_fields=offset_fields,
            pick_rows=pick_rows,
            skip_rows=skip_rows,
            limit_rows=limit_rows,
            offset_rows=offset_rows,
        )
        or None
    )

    # Prepare detector: pass only the options the user actually provided so
    # Detector's own defaults apply to the rest.
    detector = Detector(
        **helpers.remove_non_values(
            dict(
                buffer_size=buffer_size,
                sample_size=sample_size,
                field_type=field_type,
                field_names=field_names,
                field_confidence=field_confidence,
                field_float_numbers=field_float_numbers,
                field_missing_values=field_missing_values,
                schema_sync=schema_sync,
            )
        )
    )

    # Prepare options for extract(), again dropping unset values.
    options = helpers.remove_non_values(
        dict(
            type=type,
            # Spec
            path=path,
            scheme=scheme,
            format=format,
            hashing=hashing,
            encoding=encoding,
            innerpath=innerpath,
            compression=compression,
            control=control,
            dialect=dialect,
            layout=layout,
            schema=schema,
            # Extra
            basepath=basepath,
            detector=detector,
        )
    )

    # Extract data. For JSON/YAML output each row is pre-converted to a plain
    # dict so it can be serialized. Any failure is reported and exits with 1.
    try:
        process = (lambda row: row.to_dict(json=True)) if json or yaml else None
        data = extract(source, process=process, **options)
    except Exception as exception:
        typer.secho(str(exception), err=True, fg=typer.colors.RED, bold=True)
        raise typer.Exit(1)

    # Normalize data to a mapping of name -> rows for the CSV/default renderers.
    # NOTE(review): when multiple sources were given, "source" is a list here
    # and would be unhashable as a dict key — presumably extract() returns a
    # dict (package) in that case so this branch is not taken; verify.
    normdata = data
    if isinstance(data, list):
        normdata = {source: data}

    # Return JSON (serializes the un-normalized data).
    if json:
        content = pyjson.dumps(data, indent=2, ensure_ascii=False)
        typer.secho(content)
        raise typer.Exit()

    # Return YAML (serializes the un-normalized data).
    if yaml:
        content = pyyaml.safe_dump(data, allow_unicode=True).strip()
        typer.secho(content)
        raise typer.Exit()

    # Return CSV: header is emitted once per table, triggered by the first row.
    # NOTE(review): assumes row.row_number == 1 identifies the first data row —
    # confirm the framework's row numbering convention.
    if csv:
        for number, rows in enumerate(normdata.values(), start=1):
            for row in rows:
                if row.row_number == 1:
                    typer.secho(helpers.stringify_csv_string(row.field_names))
                typer.secho(row.to_str())
            # Blank line between tables, but not after the last one
            if number < len(normdata):
                typer.secho("")
        raise typer.Exit()

    # Return default: a "# data: <name>" front matter banner followed by a
    # petl-rendered table, one section per extracted table.
    for number, (name, rows) in enumerate(normdata.items(), start=1):
        if is_stdin:
            name = "stdin"
        prefix = "data"
        typer.secho(f"# {'-'*len(prefix)}", bold=True)
        typer.secho(f"# {prefix}: {name}", bold=True)
        typer.secho(f"# {'-'*len(prefix)}", bold=True)
        typer.secho("")
        subdata = helpers.rows_to_data(rows)
        typer.secho(str(petl.util.vis.lookall(subdata, vrepr=str, style="simple")))
        # Blank line between sections, but not after the last one
        if number < len(normdata):
            typer.secho("")