1import sys
2import petl
3import typer
4import json as pyjson
5import yaml as pyyaml
6from typing import List
7from ..detector import Detector
8from ..extract import extract
9from ..layout import Layout
10from .main import program
11from .. import helpers
12from . import common
13
14
@program.command(name="extract")
def program_extract(
    # Source
    source: List[str] = common.source,
    type: str = common.type,
    # File
    path: str = common.path,
    scheme: str = common.scheme,
    format: str = common.format,
    hashing: str = common.hashing,
    encoding: str = common.encoding,
    innerpath: str = common.innerpath,
    compression: str = common.compression,
    # Control
    control: str = common.control,
    # Dialect
    dialect: str = common.dialect,
    # Layout
    header_rows: str = common.header_rows,
    header_join: str = common.header_join,
    pick_fields: str = common.pick_fields,
    skip_fields: str = common.skip_fields,
    limit_fields: int = common.limit_fields,
    offset_fields: int = common.offset_fields,
    pick_rows: str = common.pick_rows,
    skip_rows: str = common.skip_rows,
    limit_rows: int = common.limit_rows,
    offset_rows: int = common.offset_rows,
    # Schema
    schema: str = common.schema,
    # Detector
    buffer_size: int = common.buffer_size,
    sample_size: int = common.sample_size,
    field_type: str = common.field_type,
    field_names: str = common.field_names,
    field_confidence: float = common.field_confidence,
    field_float_numbers: bool = common.field_float_numbers,
    field_missing_values: str = common.field_missing_values,
    schema_sync: bool = common.schema_sync,
    # Command
    basepath: str = common.basepath,
    yaml: bool = common.yaml,
    json: bool = common.json,
    csv: bool = common.csv,
) -> None:
    """
    Extract a data source.

    Based on the inferred data source type it will return resource or package data.
    Default output format is tabulated with a front matter.
    """
    # NOTE: this docstring doubles as the CLI help text rendered by typer,
    # and several parameters intentionally shadow builtins (type/format/json/
    # yaml/csv) because their names are the user-facing option names.

    # Support stdin: when no positional source and no --path were given and
    # stdin is piped (not an interactive terminal), read the whole stream
    # once and use its raw bytes as the single source.
    is_stdin = False
    if not source and not path:
        if not sys.stdin.isatty():
            is_stdin = True
            source = [sys.stdin.buffer.read()]

    # Validate input: with no source argument, no --path, and no piped stdin
    # there is nothing to extract — report and exit with a non-zero status.
    if not source and not path:
        message = 'Providing "source" or "path" is required'
        typer.secho(message, err=True, fg=typer.colors.RED, bold=True)
        raise typer.Exit(1)

    # Normalize parameters: collapse a single-element source list to a scalar
    # (or None when empty), and parse the JSON-/CSV-encoded string options
    # into the structures the framework objects below expect. The CLI
    # parameters are deliberately reassigned in place.
    source = list(source) if len(source) > 1 else (source[0] if source else None)
    control = helpers.parse_json_string(control)
    dialect = helpers.parse_json_string(dialect)
    header_rows = helpers.parse_csv_string(header_rows, convert=int)
    # fallback=True: presumably keeps the raw string item when int conversion
    # fails, since pick/skip options accept names as well as positions —
    # TODO confirm against helpers.parse_csv_string.
    pick_fields = helpers.parse_csv_string(pick_fields, convert=int, fallback=True)
    skip_fields = helpers.parse_csv_string(skip_fields, convert=int, fallback=True)
    pick_rows = helpers.parse_csv_string(pick_rows, convert=int, fallback=True)
    skip_rows = helpers.parse_csv_string(skip_rows, convert=int, fallback=True)
    field_names = helpers.parse_csv_string(field_names)
    field_missing_values = helpers.parse_csv_string(field_missing_values)

    # Prepare layout
    # NOTE(review): "or None" only has an effect if a Layout built purely
    # from unset options is falsy (dict-like emptiness) — confirm; otherwise
    # layout is always a Layout instance and the "or None" is dead code.
    layout = (
        Layout(
            header_rows=header_rows,
            header_join=header_join,
            pick_fields=pick_fields,
            skip_fields=skip_fields,
            limit_fields=limit_fields,
            offset_fields=offset_fields,
            pick_rows=pick_rows,
            skip_rows=skip_rows,
            limit_rows=limit_rows,
            offset_rows=offset_rows,
        )
        or None
    )

    # Prepare detector
    # remove_non_values drops the options the user did not set so the
    # Detector falls back to its own defaults for them.
    detector = Detector(
        **helpers.remove_non_values(
            dict(
                buffer_size=buffer_size,
                sample_size=sample_size,
                field_type=field_type,
                field_names=field_names,
                field_confidence=field_confidence,
                field_float_numbers=field_float_numbers,
                field_missing_values=field_missing_values,
                schema_sync=schema_sync,
            )
        )
    )

    # Prepare options
    # Keyword arguments forwarded to extract(); unset options are dropped
    # so extract() applies its own defaults.
    options = helpers.remove_non_values(
        dict(
            type=type,
            # Spec
            path=path,
            scheme=scheme,
            format=format,
            hashing=hashing,
            encoding=encoding,
            innerpath=innerpath,
            compression=compression,
            control=control,
            dialect=dialect,
            layout=layout,
            schema=schema,
            # Extra
            basepath=basepath,
            detector=detector,
        )
    )

    # Extract data
    # For machine-readable output (--json/--yaml) each row is converted to a
    # plain dict up front so the serializers below can handle it; any failure
    # is reported to stderr and mapped to a non-zero exit status.
    try:
        process = (lambda row: row.to_dict(json=True)) if json or yaml else None
        data = extract(source, process=process, **options)
    except Exception as exception:
        typer.secho(str(exception), err=True, fg=typer.colors.RED, bold=True)
        raise typer.Exit(1)

    # Normalize data
    # A bare row list (single resource) is wrapped into a mapping keyed by
    # the source, matching the package-shaped (dict) result.
    # NOTE(review): if source is still a list here (multiple sources) this
    # key is unhashable and raises TypeError, and for stdin the key is raw
    # bytes — confirm extract() never returns a list in those cases.
    normdata = data
    if isinstance(data, list):
        normdata = {source: data}

    # Return JSON
    # Serializes the un-normalized data, so a single resource prints as a
    # plain row array rather than a one-key mapping.
    if json:
        content = pyjson.dumps(data, indent=2, ensure_ascii=False)
        typer.secho(content)
        raise typer.Exit()

    # Return YAML
    # Same as JSON: emits the un-normalized data.
    if yaml:
        content = pyyaml.safe_dump(data, allow_unicode=True).strip()
        typer.secho(content)
        raise typer.Exit()

    # Return CSV
    # One CSV table per resource, separated by a blank line; the header is
    # printed once, just before the first row of each resource.
    if csv:
        for number, rows in enumerate(normdata.values(), start=1):
            for row in rows:
                if row.row_number == 1:
                    typer.secho(helpers.stringify_csv_string(row.field_names))
                typer.secho(row.to_str())
            if number < len(normdata):
                typer.secho("")
        raise typer.Exit()

    # Return default
    # Human-readable output: a bold front-matter header naming each resource
    # (or "stdin" for piped input) followed by a petl-rendered table.
    for number, (name, rows) in enumerate(normdata.items(), start=1):
        if is_stdin:
            name = "stdin"
        prefix = "data"
        typer.secho(f"# {'-'*len(prefix)}", bold=True)
        typer.secho(f"# {prefix}: {name}", bold=True)
        typer.secho(f"# {'-'*len(prefix)}", bold=True)
        typer.secho("")
        subdata = helpers.rows_to_data(rows)
        typer.secho(str(petl.util.vis.lookall(subdata, vrepr=str, style="simple")))
        if number < len(normdata):
            typer.secho("")
196