1# =================================================================
2#
3# Terms and Conditions of Use
4#
5# Unless otherwise noted, computer program source code of this
# distribution is covered under Crown Copyright, Government of
7# Canada, and is distributed under the MIT License.
8#
9# The Canada wordmark and related graphics associated with this
10# distribution are protected under trademark law and copyright law.
11# No permission is granted to use them outside the parameters of
12# the Government of Canada's corporate identity program. For
13# more information, see
14# http://www.tbs-sct.gc.ca/fip-pcim/index-eng.asp
15#
16# Copyright title to all 3rd party software distributed with this
17# software is held by the respective copyright holders as noted in
18# those files. Users are asked to read the 3rd Party Licenses
19# referenced with those assets.
20#
21# Copyright (c) 2016 Government of Canada
22# Copyright (c) 2020 Tom Kralidis
23#
24# Permission is hereby granted, free of charge, to any person
25# obtaining a copy of this software and associated documentation
26# files (the "Software"), to deal in the Software without
27# restriction, including without limitation the rights to use,
28# copy, modify, merge, publish, distribute, sublicense, and/or sell
29# copies of the Software, and to permit persons to whom the
30# Software is furnished to do so, subject to the following
31# conditions:
32#
33# The above copyright notice and this permission notice shall be
34# included in all copies or substantial portions of the Software.
35#
36# THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
37# EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES
38# OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
39# NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT
40# HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY,
41# WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
42# FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR
43# OTHER DEALINGS IN THE SOFTWARE.
44#
45# =================================================================
46
47from collections.abc import Mapping
48from datetime import date, datetime
49import io
50import json
51import logging
52import os
53import pkg_resources
54import re
55from typing import Union
56from xml.dom import minidom
57
58import click
59from jinja2 import Environment, FileSystemLoader
60from jinja2.exceptions import TemplateNotFound
61from jsonschema import validate as jsonschema_validate
62from jsonschema.exceptions import ValidationError
63import yaml
64
65from pygeometa import cli_options
66from pygeometa.helpers import json_serial
67from pygeometa.schemas import get_supported_schemas, load_schema
68
# module-level logger, named after this module
LOGGER = logging.getLogger(__name__)

# path of the ``schemas`` directory located next to this file
# (directory of the resolved module path + os.sep + 'schemas')
SCHEMAS = '{}{}schemas'.format(os.path.dirname(os.path.realpath(__file__)),
                               os.sep)

# version string of the 'pygeometa' distribution as reported by
# pkg_resources; looked up once, when this module is imported
VERSION = pkg_resources.require('pygeometa')[0].version
75
76
def get_charstring(option: Union[str, dict], language: str,
                   language_alternate: str = None) -> list:
    """
    convenience function to return unilingual or multilingual value(s)

    The result is always a two-item list: the primary value followed by
    the alternate-language value (``None`` when there is none).

    :param option: option value (str or list if unilingual,
                   dict keyed by language if multilingual)
    :param language: language
    :param language_alternate: alternate language

    :returns: list of unilingual or multilingual values
    """

    # nothing supplied: both slots are empty
    if option is None:
        return [None, None]

    # plain strings and lists are passed through as the primary value
    if isinstance(option, (str, list)):
        return [option, None]

    # anything else is looked up by language key
    primary = option.get(language)
    alternate = option.get(language_alternate)
    return [primary, alternate]
97
98
def get_distribution_language(section: str) -> str:
    """
    derive language of a given distribution construct

    The language is the second ``_``-separated token of the section
    name; ``'en'`` is returned when the name holds no ``_``.

    :param section: section name

    :returns: distribution language
    """

    tokens = section.split('_')
    if len(tokens) > 1:
        return tokens[1]
    return 'en'
112
113
def normalize_datestring(datestring: str, format_: str = 'default') -> str:
    """
    groks date string into ISO8601

    :param datestring: date in string representation; ``date`` /
                       ``datetime`` objects and ints whose string form
                       is 4 characters long (years) are also accepted
    :param format_: datestring format ('year' or default [full]); only
                    consulted for svn ``$Date`` keywords

    :returns: string of properly formatted datestring; input matching
              none of the handled patterns is returned unchanged

    :raises RuntimeError: if the value cannot be processed (e.g. ``None``
                          or a malformed svn ``$Date`` keyword)
    """

    # local import: the module header only imports ``date``/``datetime``
    from datetime import timezone

    # timezone-aware replacement for the deprecated datetime.utcnow();
    # the strftime() patterns used below render identically
    today_and_now = datetime.now(timezone.utc)

    re1 = r'\$Date: (?P<year>\d{4})'
    re2 = r'\$Date: (?P<date>\d{4}-\d{2}-\d{2}) (?P<time>\d{2}:\d{2}:\d{2})'
    re3 = r'(?P<start>.*)\$Date: (?P<year>\d{4}).*\$(?P<end>.*)'

    try:
        if isinstance(datestring, date):
            if datestring.year < 1900:
                # years before 1900 are rendered as DD.MM.YYYY
                datestring2 = '{0.day:02d}.{0.month:02d}.{0.year:4d}'.format(
                    datestring)
            else:
                datestring2 = datestring.strftime('%Y-%m-%dT%H:%M:%SZ')
            # a midnight time component is dropped, leaving only the date
            if datestring2.endswith('T00:00:00Z'):
                datestring2 = datestring2.replace('T00:00:00Z', '')
            return datestring2
        elif isinstance(datestring, int) and len(str(datestring)) == 4:  # year
            return str(datestring)
        if datestring == '$date$':  # $date$ magic keyword
            return today_and_now.strftime('%Y-%m-%d')
        elif datestring == '$datetime$':  # $datetime$ magic keyword
            return today_and_now.strftime('%Y-%m-%dT%H:%M:%SZ')
        elif datestring == '$year$':  # $year$ magic keyword
            return today_and_now.strftime('%Y')
        elif '$year$' in datestring:  # $year$ magic keyword embedded
            return datestring.replace('$year$', today_and_now.strftime('%Y'))
        elif datestring.startswith('$Date'):  # svn Date keyword
            if format_ == 'year':
                mo = re.match(re1, datestring)
                return mo.group('year')
            else:  # default
                mo = re.match(re2, datestring)
                return '{}T{}'.format(mo.group('date'), mo.group('time'))
        elif '$Date' in datestring:  # svn Date keyword embedded
            # only the 'year' format is rewritten here; any other format
            # falls through and the input is returned unchanged
            if format_ == 'year':
                mo = re.match(re3, datestring)
                return '{}{}{}'.format(mo.group('start'),
                                       mo.group('year'), mo.group('end'))
    except (AttributeError, TypeError) as err:
        # AttributeError: a regex did not match (mo is None) or the value
        # lacks str methods; TypeError: the value does not support ``in``
        raise RuntimeError(
            'Invalid datestring: {}'.format(datestring)) from err
    return datestring
165
166
def prune_distribution_formats(formats: dict) -> list:
    """
    derive a unique list of distribution formats

    Each distribution is reduced to the keys that start with ``format``;
    the reduced dicts are then de-duplicated, keeping the order of first
    appearance.

    :param formats: distribution formats (dict of dicts)

    :returns: unique distribution formats list
    """

    unique_formats = []

    for distribution in formats.values():
        row = {key: value for key, value in distribution.items()
               if key.startswith('format')}
        # dicts are unhashable, so uniqueness is a list membership test
        if row not in unique_formats:
            unique_formats.append(row)

    return unique_formats
196
197
def prune_transfer_option(formats: dict, language: str) -> list:
    """
    derive a unique list of transfer options.
    The unique character is based on identification language

    An option is kept when the part of ``language`` before the first
    ``;`` occurs in its key. When ``language`` is itself one of the nil
    reasons, every option is kept.

    :param formats: list of transfer options
    :param language: identification language (or a nil reason)

    :returns: unique transfer options list
    """

    nil_reasons = ('missing', 'withheld', 'inapplicable',
                   'unknown', 'template')

    language_is_nil = language in nil_reasons

    return [option for name, option in formats.items()
            if language_is_nil or language.split(";")[0] in name]
221
222
def read_mcf(mcf: Union[dict, str]) -> dict:
    """
    returns dict of YAML file from filepath, string or dict

    ``base_mcf`` keys found while walking the MCF are replaced by the
    content of the file they reference; values already present in the
    MCF take precedence over values from the base file.

    :param mcf: str, dict or filepath of MCF data

    :returns: dict of MCF data

    :raises MCFReadError: on YAML scanner errors, when the MCF does not
                          parse into a dict, when no MCF version is
                          specified, or when the version is unsupported
    """

    mcf_versions = ['1.0']

    def __to_dict(mcf_object):
        """normalize mcf input into dict"""

        dict_ = None

        try:
            if isinstance(mcf_object, dict):
                LOGGER.debug('mcf object is already a dict')
                dict_ = mcf_object
            elif 'metadata:' in mcf_object:
                # a string containing 'metadata:' is treated as YAML text
                LOGGER.debug('mcf object is a string')
                dict_ = yaml.load(mcf_object, Loader=yaml.FullLoader)
            else:
                LOGGER.debug('mcf object is likely a filepath')
                with io.open(mcf_object, encoding='utf-8') as fh:
                    dict_ = yaml.load(fh, Loader=yaml.FullLoader)
        except yaml.scanner.ScannerError as err:
            msg = 'YAML parsing error: {}'.format(err)
            LOGGER.debug(msg)
            raise MCFReadError(msg) from err

        return dict_

    # from https://gist.github.com/angstwad/bf22d1822c38a92ec0a9
    def __dict_merge(dct, merge_dct):
        """
        Recursive dict merge. ``merge_dct`` is merged into ``dct``,
        recursing into dicts nested to an arbitrary depth. Keys already
        present in ``dct`` keep their value; only missing keys are added.

        :param dct: dict onto which the merge is executed
        :param merge_dct: dct merged into dct

        :returns: None
        """
        for k, v in merge_dct.items():
            if (k in dct and isinstance(dct[k], dict)
                    and isinstance(v, Mapping)):
                __dict_merge(dct[k], v)
            elif k not in dct:
                dct[k] = v

    def __parse_mcf_dict_recursive(dict2):
        # iterate over a copy: dict2 is modified while walking it
        for k, v in dict2.copy().items():
            if isinstance(v, dict):
                __parse_mcf_dict_recursive(v)
            else:
                if k == 'base_mcf':
                    # NOTE(review): the base path is resolved relative to
                    # ``mcf``, which presumably has to be a filepath for
                    # base_mcf to work -- confirm against callers
                    base_mcf_dict = __to_dict(get_abspath(mcf, v))
                    # a top-level base_mcf inside the base file is
                    # resolved as well (one further level only)
                    for k2, v2 in base_mcf_dict.copy().items():
                        if k2 == 'base_mcf':
                            base_mcf_dict2 = __to_dict(get_abspath(mcf, v2))
                            __dict_merge(base_mcf_dict, base_mcf_dict2)
                            base_mcf_dict.pop(k2, None)
                    __dict_merge(dict2, base_mcf_dict)
                    dict2.pop(k, None)
        return dict2

    LOGGER.debug('reading {}'.format(mcf))
    mcf_dict = __to_dict(mcf)

    # e.g. an empty YAML document parses to None; fail with a clear error
    # instead of an AttributeError further down
    if not isinstance(mcf_dict, dict):
        msg = 'MCF did not parse into a dict'
        LOGGER.error(msg)
        raise MCFReadError(msg)

    LOGGER.debug('recursively parsing dict')

    mcf_dict = __parse_mcf_dict_recursive(mcf_dict)

    LOGGER.debug('Fully parsed MCF: {}'.format(mcf_dict))

    try:
        mcf_version = str(mcf_dict['mcf']['version'])
    except (KeyError, TypeError) as err:
        # TypeError: the 'mcf' section exists but is not a mapping
        msg = 'no MCF version specified'
        LOGGER.error(msg)
        raise MCFReadError(msg) from err

    LOGGER.info('MCF version: {}'.format(mcf_version))

    # the version is valid when it is a prefix of ANY supported version
    # (the previous loop rejected it unless it matched EVERY one)
    if not any(supported.startswith(mcf_version)
               for supported in mcf_versions):
        msg = 'invalid / unsupported version {}'.format(mcf_version)
        LOGGER.error(msg)
        raise MCFReadError(msg)

    return mcf_dict
321
322
def pretty_print(xml: str) -> str:
    """
    clean up indentation and spacing

    The document is re-serialized with a two-space indent and every
    whitespace-only line is dropped from the result.

    :param xml: str of XML data

    :returns: str of pretty-printed XML data
    """

    LOGGER.debug('pretty-printing XML')

    document = minidom.parseString(xml)
    indented = document.toprettyxml(indent='  ')

    kept_lines = []
    for line in indented.split('\n'):
        if line.strip():
            kept_lines.append(line)

    return '\n'.join(kept_lines)
335
336
def render_j2_template(mcf: dict, template_dir: str = None) -> str:
    """
    convenience function to render Jinja2 template given
    an mcf file, string, or dict

    The ``main.j2`` template is looked up in ``template_dir`` first and
    in the ``SCHEMAS`` directory second, then rendered and pretty-printed.

    :param mcf: dict of MCF data
    :param template_dir: directory of schema templates

    :returns: str of metadata output

    :raises RuntimeError: if ``template_dir`` is not given or the
                          ``main.j2`` template cannot be found
    """

    LOGGER.debug('Evaluating template directory')
    if template_dir is None:
        msg = 'template_dir or schema_local required'
        LOGGER.error(msg)
        raise RuntimeError(msg)

    LOGGER.debug('Setting up template environment {}'.format(template_dir))
    search_path = [template_dir, SCHEMAS]
    env = Environment(loader=FileSystemLoader(search_path))

    LOGGER.debug('Adding template filters')
    env.filters.update({
        'normalize_datestring': normalize_datestring,
        'get_distribution_language': get_distribution_language,
        'get_charstring': get_charstring,
        'prune_distribution_formats': prune_distribution_formats,
        'prune_transfer_option': prune_transfer_option,
    })

    # helpers that templates can also call as plain functions
    env.globals.update(
        zip=zip,
        get_charstring=get_charstring,
        normalize_datestring=normalize_datestring,
        prune_distribution_formats=prune_distribution_formats,
        prune_transfer_option=prune_transfer_option,
    )

    LOGGER.debug('Loading template')
    try:
        template = env.get_template('main.j2')
    except TemplateNotFound:
        msg = 'Missing metadata template'
        LOGGER.error(msg)
        raise RuntimeError(msg)

    LOGGER.debug('Processing template')
    rendered = template.render(record=mcf, pygeometa_version=VERSION)
    return pretty_print(rendered.encode('utf-8'))
381
382
def validate_mcf(instance_dict: dict) -> bool:
    """
    Validate an MCF document against the MCF schema

    The schema is read from ``mcf/core.yml`` under the ``SCHEMAS``
    directory.

    :param instance_dict: dict of MCF instance

    :returns: `bool` of validation (``True``; failures raise instead)

    :raises MCFValidationError: if the instance does not satisfy the schema
    """

    schema_file = os.path.join(SCHEMAS, 'mcf', 'core.yml')

    # logged rather than printed: library code should not write to stdout
    LOGGER.debug('Validating against schema {}'.format(schema_file))

    # explicit encoding, consistent with the other file reads in this module
    with open(schema_file, encoding='utf-8') as fh2:
        schema_dict = yaml.load(fh2, Loader=yaml.FullLoader)

    try:
        jsonschema_validate(instance_dict, schema_dict)
    except ValidationError as err:
        raise MCFValidationError(err) from err

    return True
405
406
def get_abspath(mcf, filepath):
    """
    helper function absolute file access

    :param mcf: path of the MCF file used as reference location
    :param filepath: path to join onto the directory holding ``mcf``

    :returns: ``filepath`` joined to the directory of the resolved
              ``mcf`` path
    """

    resolved_mcf = os.path.realpath(mcf)
    mcf_directory = os.path.dirname(resolved_mcf)
    return os.path.join(mcf_directory, filepath)
412
413
class MCFReadError(Exception):
    """Raised when an MCF cannot be read (exception stub, adds nothing
    to ``Exception``)"""
417
418
class MCFValidationError(Exception):
    """Raised when an MCF fails validation (exception stub, adds nothing
    to ``Exception``)"""
422
423
# NOTE: OPTION_VERBOSITY is applied exactly once; it used to be stacked
# twice, which registered the same option on the command twice
@click.command()
@click.pass_context
@cli_options.ARGUMENT_MCF
@cli_options.OPTION_OUTPUT
@cli_options.OPTION_VERBOSITY
@click.option('--schema',
              type=click.Choice(get_supported_schemas()),
              help='Metadata schema')
@click.option('--schema_local',
              type=click.Path(exists=True, resolve_path=True,
                              dir_okay=True, file_okay=False),
              help='Locally defined metadata schema')
def generate(ctx, mcf, schema, schema_local, output, verbosity):
    """generate metadata"""

    # the docstring above is the command's help text and is kept short
    #
    # mcf: MCF to read (passed to read_mcf)
    # schema: name of a supported schema, or None
    # schema_local: directory of a locally defined schema, or None
    # output: object with a write() method, or None to echo the result
    # verbosity: name of a logging level, or None

    if verbosity is not None:
        logging.basicConfig(level=getattr(logging, verbosity))

    # exactly one of schema / schema_local must be given
    if schema is None and schema_local is None:
        raise click.UsageError('Missing arguments')
    elif None not in [schema, schema_local]:
        raise click.UsageError('schema / schema_local are mutually exclusive')

    mcf_dict = read_mcf(mcf)

    if schema is not None:
        LOGGER.info('Processing {} into {}'.format(mcf, schema))
        schema_object = load_schema(schema)
        content = schema_object.write(mcf_dict)
    else:
        content = render_j2_template(mcf_dict, template_dir=schema_local)

    if output is None:
        click.echo(content)
    else:
        output.write(content)
461
462
@click.command()
@click.pass_context
@cli_options.ARGUMENT_MCF
@cli_options.OPTION_VERBOSITY
def info(ctx, mcf, verbosity):
    """provide information about an MCF"""

    # the docstring above is the command's help text and is kept as-is

    if verbosity is not None:
        log_level = getattr(logging, verbosity)
        logging.basicConfig(level=log_level)

    LOGGER.info('Processing {}'.format(mcf))

    # (label, section, key) of each value shown, in display order
    overview_fields = (
        ('version', 'mcf', 'version'),
        ('identifier', 'metadata', 'identifier'),
        ('language', 'metadata', 'language'),
    )

    try:
        content = read_mcf(mcf)

        click.echo('MCF overview')
        # values are looked up one at a time so that lines preceding a
        # missing key are still printed
        for label, section, key in overview_fields:
            click.echo('  {}: {}'.format(label, content[section][key]))
    except Exception as err:
        # any failure is reported as a click error
        raise click.ClickException(err)
485
486
@click.command()
@click.pass_context
def schemas(ctx):
    """list supported schemas"""

    # one schema name per output line
    supported = get_supported_schemas()
    listing = '\n'.join(supported)
    click.echo(listing)
492
493
@click.command()
@click.pass_context
@cli_options.ARGUMENT_MCF
def validate(ctx, mcf):
    """Validate MCF Document"""

    # the docstring above is the command's help text and is kept as-is

    click.echo('Validating {}'.format(mcf))

    with open(mcf, encoding='utf8') as fh:
        parsed = yaml.load(fh, Loader=yaml.FullLoader)
        # round-trip through JSON (json_serial handles values the json
        # module cannot encode itself) so the validator sees JSON types
        as_json = json.dumps(parsed, default=json_serial)
        instance = json.loads(as_json)

    validate_mcf(instance)

    click.echo('Valid MCF document')
508