# =================================================================
#
# Terms and Conditions of Use
#
# Unless otherwise noted, computer program source code of this
# distribution # is covered under Crown Copyright, Government of
# Canada, and is distributed under the MIT License.
#
# The Canada wordmark and related graphics associated with this
# distribution are protected under trademark law and copyright law.
# No permission is granted to use them outside the parameters of
# the Government of Canada's corporate identity program. For
# more information, see
# http://www.tbs-sct.gc.ca/fip-pcim/index-eng.asp
#
# Copyright title to all 3rd party software distributed with this
# software is held by the respective copyright holders as noted in
# those files. Users are asked to read the 3rd Party Licenses
# referenced with those assets.
#
# Copyright (c) 2016 Government of Canada
# Copyright (c) 2020 Tom Kralidis
#
# Permission is hereby granted, free of charge, to any person
# obtaining a copy of this software and associated documentation
# files (the "Software"), to deal in the Software without
# restriction, including without limitation the rights to use,
# copy, modify, merge, publish, distribute, sublicense, and/or sell
# copies of the Software, and to permit persons to whom the
# Software is furnished to do so, subject to the following
# conditions:
#
# The above copyright notice and this permission notice shall be
# included in all copies or substantial portions of the Software.
#
# THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
# EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES
# OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
# NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT
# HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY,
# WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
# FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR
# OTHER DEALINGS IN THE SOFTWARE.
#
# =================================================================

from collections.abc import Mapping
from datetime import date, datetime, timezone
import io
import json
import logging
import os
import pkg_resources
import re
from typing import Union
from xml.dom import minidom

import click
from jinja2 import Environment, FileSystemLoader
from jinja2.exceptions import TemplateNotFound
from jsonschema import validate as jsonschema_validate
from jsonschema.exceptions import ValidationError
import yaml

from pygeometa import cli_options
from pygeometa.helpers import json_serial
from pygeometa.schemas import get_supported_schemas, load_schema

LOGGER = logging.getLogger(__name__)

# directory holding the schema templates, located next to this module
SCHEMAS = '{}{}schemas'.format(os.path.dirname(os.path.realpath(__file__)),
                               os.sep)

VERSION = pkg_resources.require('pygeometa')[0].version


def get_charstring(option: Union[str, dict], language: str,
                   language_alternate: str = None) -> list:
    """
    convenience function to return unilingual or multilingual value(s)

    :param option: option value (str if unilingual, dict keyed by language
                   if multilingual; a list or None is also accepted)
    :param language: language
    :param language_alternate: alternate language

    :returns: two-item list of [value, alternate value]; the alternate
              value is None unless option is a dict
    """

    if option is None:
        return [None, None]
    elif isinstance(option, str):  # unilingual
        return [option, None]
    elif isinstance(option, list):  # multilingual list
        return [option, None]
    else:  # multilingual
        return [option.get(language), option.get(language_alternate)]


def get_distribution_language(section: str) -> str:
    """
    derive language of a given distribution construct

    :param section: section name

    :returns: distribution language (the token after the first underscore
              of the section name, or 'en' when there is no underscore)
    """

    try:
        return section.split('_')[1]
    except IndexError:
        return 'en'


def normalize_datestring(datestring: str, format_: str = 'default') -> str:
    """
    groks date string into ISO8601

    Besides plain strings, date/datetime objects and 4-digit integers
    (years) are accepted.  The magic keywords ``$date$``, ``$datetime$``
    and ``$year$`` are expanded using the current UTC time, and svn
    ``$Date ...$`` keywords are parsed.

    :param datestring: date in string representation
    :param format_: datestring format ('year' or default [full])

    :returns: string of properly formatted datestring

    :raises RuntimeError: if the datestring cannot be interpreted
    """

    # timezone-aware replacement for the deprecated datetime.utcnow();
    # the strftime patterns used below render identically
    today_and_now = datetime.now(timezone.utc)

    re1 = r'\$Date: (?P<year>\d{4})'
    re2 = r'\$Date: (?P<date>\d{4}-\d{2}-\d{2}) (?P<time>\d{2}:\d{2}:\d{2})'
    re3 = r'(?P<start>.*)\$Date: (?P<year>\d{4}).*\$(?P<end>.*)'

    try:
        if isinstance(datestring, date):
            if datestring.year < 1900:
                # years before 1900 are formatted by hand as DD.MM.YYYY
                datestring2 = '{0.day:02d}.{0.month:02d}.{0.year:4d}'.format(
                    datestring)
            else:
                datestring2 = datestring.strftime('%Y-%m-%dT%H:%M:%SZ')
                # a midnight time component is dropped, leaving date only
                if datestring2.endswith('T00:00:00Z'):
                    datestring2 = datestring2.replace('T00:00:00Z', '')
            return datestring2
        elif isinstance(datestring, int) and len(str(datestring)) == 4:  # year
            return str(datestring)
        if datestring == '$date$':  # $date$ magic keyword
            return today_and_now.strftime('%Y-%m-%d')
        elif datestring == '$datetime$':  # $datetime$ magic keyword
            return today_and_now.strftime('%Y-%m-%dT%H:%M:%SZ')
        elif datestring == '$year$':  # $year$ magic keyword
            return today_and_now.strftime('%Y')
        elif '$year$' in datestring:  # $year$ magic keyword embedded
            return datestring.replace('$year$', today_and_now.strftime('%Y'))
        elif datestring.startswith('$Date'):  # svn Date keyword
            if format_ == 'year':
                mo = re.match(re1, datestring)
                return mo.group('year')
            else:  # default
                mo = re.match(re2, datestring)
                return '{}T{}'.format(mo.group('date'), mo.group('time'))
        elif '$Date' in datestring:  # svn Date keyword embedded
            if format_ == 'year':
                mo = re.match(re3, datestring)
                return '{}{}{}'.format(mo.group('start'),
                                       mo.group('year'), mo.group('end'))
    except (AttributeError, TypeError) as err:
        # AttributeError: a regex did not match (mo is None)
        # TypeError: datestring is of an unsupported type
        raise RuntimeError(
            'Invalid datestring: {}'.format(datestring)) from err
    return datestring


def prune_distribution_formats(formats: dict) -> list:
    """
    derive a unique list of distribution formats

    :param formats: distribution formats (dict of dicts)

    :returns: unique distribution formats list; each item holds only the
              keys of a distribution that start with 'format'
    """

    unique_formats = []

    for distribution in formats.values():
        row = {k: v for k, v in distribution.items()
               if k.startswith('format')}
        # dicts are unhashable, so de-duplicate by list membership
        if row not in unique_formats:
            unique_formats.append(row)

    return unique_formats


def prune_transfer_option(formats: dict, language: str) -> list:
    """
    derive a unique list of transfer options.
    The unique character is based on identification language

    :param formats: dict of transfer options
    :param language: identification language; only the part before the
                     first ';' is matched against the keys of formats.
                     If it is a nil reason, all options are returned

    :returns: unique transfer options list
    """

    nil_reasons = ['missing',
                   'withheld',
                   'inapplicable',
                   'unknown',
                   'template']

    language_is_nil = language in nil_reasons

    unique_transfer = []

    for k, v in formats.items():
        if language_is_nil or language.split(';')[0] in k:
            unique_transfer.append(v)

    return unique_transfer


def read_mcf(mcf: Union[dict, str]) -> dict:
    """
    returns dict of YAML file from filepath, string or dict

    Any ``base_mcf`` reference found in the MCF is resolved relative to
    ``mcf`` and merged in; keys already present in the MCF are kept.

    :param mcf: str, dict or filepath of MCF data

    :returns: dict of MCF data

    :raises MCFReadError: on YAML parsing errors, or if the MCF version
                          is missing or unsupported
    """

    mcf_versions = ['1.0']

    def __to_dict(mcf_object):
        """normalize mcf input into dict"""

        dict_ = None

        # NOTE(review): yaml.FullLoader resolves more than plain data
        # types; consider yaml.SafeLoader if MCFs can come from untrusted
        # sources - confirm which YAML tags are in use first
        try:
            if isinstance(mcf_object, dict):
                LOGGER.debug('mcf object is already a dict')
                dict_ = mcf_object
            elif 'metadata:' in mcf_object:
                LOGGER.debug('mcf object is a string')
                dict_ = yaml.load(mcf_object, Loader=yaml.FullLoader)
            else:
                LOGGER.debug('mcf object is likely a filepath')
                with io.open(mcf_object, encoding='utf-8') as fh:
                    dict_ = yaml.load(fh, Loader=yaml.FullLoader)
        except yaml.scanner.ScannerError as err:
            msg = 'YAML parsing error: {}'.format(err)
            LOGGER.debug(msg)
            raise MCFReadError(msg) from err

        return dict_

    # from https://gist.github.com/angstwad/bf22d1822c38a92ec0a9
    def __dict_merge(dct, merge_dct):
        """
        Recursive dict merge. Inspired by :meth:``dict.update()``, instead of
        updating only top-level keys, __dict_merge recurses down into dicts
        nested to an arbitrary depth, adding keys. The ``merge_dct`` is
        merged into ``dct``; values already present in ``dct`` are kept.

        :param dct: dict onto which the merge is executed
        :param merge_dct: dct merged into dct

        :returns: None
        """
        for k, v in merge_dct.items():
            if (k in dct and isinstance(dct[k], dict)
                    and isinstance(v, Mapping)):
                __dict_merge(dct[k], v)
            elif k not in dct:
                dct[k] = v

    def __parse_mcf_dict_recursive(dict2):
        """resolve base_mcf references in dict2 (in place) and return it"""

        # iterate over a copy as dict2 is modified during the walk
        for k, v in dict2.copy().items():
            if isinstance(v, dict):
                __parse_mcf_dict_recursive(v)
            elif k == 'base_mcf':
                base_mcf_dict = __to_dict(get_abspath(mcf, v))
                # a base MCF may itself reference one further base MCF
                for k2, v2 in base_mcf_dict.copy().items():
                    if k2 == 'base_mcf':
                        base_mcf_dict2 = __to_dict(get_abspath(mcf, v2))
                        __dict_merge(base_mcf_dict, base_mcf_dict2)
                        base_mcf_dict.pop(k2, None)
                __dict_merge(dict2, base_mcf_dict)
                dict2.pop(k, None)
        return dict2

    LOGGER.debug('reading {}'.format(mcf))
    mcf_dict = __to_dict(mcf)

    LOGGER.debug('recursively parsing dict')

    mcf_dict = __parse_mcf_dict_recursive(mcf_dict)

    LOGGER.debug('Fully parsed MCF: {}'.format(mcf_dict))

    try:
        mcf_version = str(mcf_dict['mcf']['version'])
        LOGGER.info('MCF version: {}'.format(mcf_version))
    except (KeyError, TypeError) as err:
        # TypeError: the 'mcf' section exists but is not a mapping
        msg = 'no MCF version specified'
        LOGGER.error(msg)
        raise MCFReadError(msg) from err

    # accept the MCF if ANY supported version matches (a prefix such as
    # '1' matches '1.0')
    if not any(supported.startswith(mcf_version)
               for supported in mcf_versions):
        msg = 'invalid / unsupported version {}'.format(mcf_version)
        LOGGER.error(msg)
        raise MCFReadError(msg)

    return mcf_dict


def pretty_print(xml: str) -> str:
    """
    clean up indentation and spacing

    :param xml: str of XML data

    :returns: str of pretty-printed XML data (2-space indent, blank
              lines removed)
    """

    LOGGER.debug('pretty-printing XML')
    dom = minidom.parseString(xml)
    lines = dom.toprettyxml(indent=' ' * 2).split('\n')
    return '\n'.join(line for line in lines if line.strip())


def render_j2_template(mcf: dict, template_dir: str = None) -> str:
    """
    convenience function to render Jinja2 template given
    a dict of MCF data

    :param mcf: dict of MCF data
    :param template_dir: directory of schema templates (must contain
                         a main.j2 template)

    :returns: str of metadata output

    :raises RuntimeError: if template_dir is not given or the main.j2
                          template cannot be found
    """

    LOGGER.debug('Evaluating template directory')
    if template_dir is None:
        msg = 'template_dir or schema_local required'
        LOGGER.error(msg)
        raise RuntimeError(msg)

    LOGGER.debug('Setting up template environment {}'.format(template_dir))
    env = Environment(loader=FileSystemLoader([template_dir, SCHEMAS]))

    LOGGER.debug('Adding template filters')
    env.filters['normalize_datestring'] = normalize_datestring
    env.filters['get_distribution_language'] = get_distribution_language
    env.filters['get_charstring'] = get_charstring
    env.filters['prune_distribution_formats'] = prune_distribution_formats
    env.filters['prune_transfer_option'] = prune_transfer_option
    env.globals.update(zip=zip)
    env.globals.update(get_charstring=get_charstring)
    env.globals.update(normalize_datestring=normalize_datestring)
    env.globals.update(prune_distribution_formats=prune_distribution_formats)
    env.globals.update(prune_transfer_option=prune_transfer_option)

    try:
        LOGGER.debug('Loading template')
        template = env.get_template('main.j2')
    except TemplateNotFound as err:
        msg = 'Missing metadata template'
        LOGGER.error(msg)
        raise RuntimeError(msg) from err

    LOGGER.debug('Processing template')
    xml = template.render(record=mcf,
                          pygeometa_version=VERSION).encode('utf-8')
    return pretty_print(xml)


def validate_mcf(instance_dict: dict) -> bool:
    """
    Validate an MCF document against the MCF schema

    :param instance_dict: dict of MCF instance

    :returns: `bool` of validation (True when valid)

    :raises MCFValidationError: if the instance does not validate
    """

    schema_file = os.path.join(SCHEMAS, 'mcf', 'core.yml')

    # logged rather than printed so library users get no stray stdout
    LOGGER.debug('Validating against schema {}'.format(schema_file))

    with open(schema_file, encoding='utf-8') as fh2:
        schema_dict = yaml.load(fh2, Loader=yaml.FullLoader)

    try:
        jsonschema_validate(instance_dict, schema_dict)
    except ValidationError as err:
        raise MCFValidationError(err) from err

    return True


def get_abspath(mcf: str, filepath: str) -> str:
    """
    helper function absolute file access

    :param mcf: filepath of the MCF
    :param filepath: path relative to the directory of the MCF

    :returns: filepath joined onto the real directory of mcf
    """

    abspath = os.path.dirname(os.path.realpath(mcf))
    return os.path.join(abspath, filepath)


class MCFReadError(Exception):
    """Exception stub for format reading errors"""
    pass


class MCFValidationError(Exception):
    """Exception stub for validation errors"""
    pass


@click.command()
@click.pass_context
@cli_options.ARGUMENT_MCF
@cli_options.OPTION_OUTPUT
@cli_options.OPTION_VERBOSITY
@click.option('--schema',
              type=click.Choice(get_supported_schemas()),
              help='Metadata schema')
@click.option('--schema_local',
              type=click.Path(exists=True, resolve_path=True,
                              dir_okay=True, file_okay=False),
              help='Locally defined metadata schema')
def generate(ctx, mcf, schema, schema_local, output, verbosity):
    """generate metadata"""

    if verbosity is not None:
        logging.basicConfig(level=getattr(logging, verbosity))

    # exactly one of --schema / --schema_local must be given
    if schema is None and schema_local is None:
        raise click.UsageError('Missing arguments')
    elif None not in [schema, schema_local]:
        raise click.UsageError('schema / schema_local are mutually exclusive')

    mcf_dict = read_mcf(mcf)

    if schema is not None:
        LOGGER.info('Processing {} into {}'.format(mcf, schema))
        schema_object = load_schema(schema)
        content = schema_object.write(mcf_dict)
    else:
        content = render_j2_template(mcf_dict, template_dir=schema_local)

    if output is None:
        click.echo(content)
    else:
        output.write(content)


@click.command()
@click.pass_context
@cli_options.ARGUMENT_MCF
@cli_options.OPTION_VERBOSITY
def info(ctx, mcf, verbosity):
    """provide information about an MCF"""

    if verbosity is not None:
        logging.basicConfig(level=getattr(logging, verbosity))

    LOGGER.info('Processing {}'.format(mcf))
    try:
        content = read_mcf(mcf)

        click.echo('MCF overview')
        click.echo('  version: {}'.format(content['mcf']['version']))
        click.echo('  identifier: {}'.format(
            content['metadata']['identifier']))
        click.echo('  language: {}'.format(
            content['metadata']['language']))
    except Exception as err:
        # CLI boundary: report any failure as a click error message
        raise click.ClickException(str(err)) from err


@click.command()
@click.pass_context
def schemas(ctx):
    """list supported schemas"""
    click.echo('\n'.join(get_supported_schemas()))


@click.command()
@click.pass_context
@cli_options.ARGUMENT_MCF
def validate(ctx, mcf):
    """Validate MCF Document"""

    click.echo('Validating {}'.format(mcf))

    with open(mcf, encoding='utf8') as fh:
        # round-trip through JSON so YAML-native values (handled by
        # json_serial) become JSON-compatible before schema validation
        instance = json.loads(json.dumps(yaml.load(fh, Loader=yaml.FullLoader),
                                         default=json_serial))
        validate_mcf(instance)

        click.echo('Valid MCF document')