# Licensed under a 3-clause BSD style license - see LICENSE.rst
"""
This file contains routines to verify the correctness of UCD strings.
"""


# STDLIB
import re

# LOCAL
from astropy.utils import data

__all__ = ['parse_ucd', 'check_ucd']


# Single-letter type codes (first column of the word list) that allow a
# word to be used in the primary (first) position, respectively in a
# secondary (any later) position, of a UCD.
_PRIMARY_TYPES = frozenset('QPEVC')
_SECONDARY_TYPES = frozenset('QSEVC')

# A word component starts with an alphanumeric character; a word is one or
# more components joined by dots; a namespace is a single component.
_WORD_COMPONENT = r'[A-Za-z0-9][A-Za-z0-9\-_]*'
_NAMESPACE_RE = re.compile(_WORD_COMPONENT)
_WORD_RE = re.compile(fr'{_WORD_COMPONENT}(\.{_WORD_COMPONENT})*')

# Any character outside the permitted alphabet, without / with the colon
# that separates a namespace from its word.
_INVALID_CHAR_RE = re.compile(r'[^A-Za-z0-9_.;\-]')
_INVALID_CHAR_COLON_RE = re.compile(r'[^A-Za-z0-9_.:;\-]')


class UCDWords:
    """
    Manages a list of acceptable UCD words.

    Works by reading in a data file exactly as provided by IVOA.  This
    file resides in data/ucd1p-words.txt.

    Each non-blank line of that file is expected to hold three
    ``|``-separated fields: a type code, the word itself and its
    description.  All lookups are case-insensitive.
    """

    def __init__(self):
        self._primary = set()
        self._secondary = set()
        self._descriptions = {}
        self._capitalization = {}

        with data.get_pkg_data_fileobj(
                "data/ucd1p-words.txt", encoding='ascii') as fd:
            for line in fd:
                # Tolerate blank lines (e.g. a trailing newline) instead of
                # failing on the three-way unpack below.
                if not line.strip():
                    continue
                kind, name, descr = [
                    x.strip() for x in line.split('|')]
                name_lower = name.lower()
                # Set membership rather than a substring test, so that an
                # empty or multi-character type field never matches.
                if kind in _PRIMARY_TYPES:
                    self._primary.add(name_lower)
                if kind in _SECONDARY_TYPES:
                    self._secondary.add(name_lower)
                self._descriptions[name_lower] = descr
                self._capitalization[name_lower] = name

    def is_primary(self, name):
        """
        Returns True if *name* is a valid primary name.
        """
        return name.lower() in self._primary

    def is_secondary(self, name):
        """
        Returns True if *name* is a valid secondary name.
        """
        return name.lower() in self._secondary

    def get_description(self, name):
        """
        Returns the official English description of the given UCD
        *name*.

        Raises `KeyError` if *name* is not in the word list.
        """
        return self._descriptions[name.lower()]

    def normalize_capitalization(self, name):
        """
        Returns the standard capitalization form of the given name.

        Raises `KeyError` if *name* is not in the word list.
        """
        return self._capitalization[name.lower()]


# Lazily created by `parse_ucd` on first use, so that the word list is only
# read when a UCD is actually parsed.
_ucd_singleton = None


def parse_ucd(ucd, check_controlled_vocabulary=False, has_colon=False):
    """
    Parse the UCD into its component parts.

    Parameters
    ----------
    ucd : str
        The UCD string

    check_controlled_vocabulary : bool, optional
        If `True`, then each word in the UCD will be verified against
        the UCD1+ controlled vocabulary, (as required by the VOTable
        specification version 1.2), otherwise not.

    has_colon : bool, optional
        If `True`, the UCD may contain a colon (as defined in earlier
        versions of the standard).

    Returns
    -------
    parts : list
        The result is a list of tuples of the form:

            (*namespace*, *word*)

        If no namespace was explicitly specified, *namespace* will be
        returned as ``'ivoa'`` (i.e., the default namespace).

        Words found in the controlled vocabulary are returned with their
        standard capitalization; other words are returned unchanged.

    Raises
    ------
    ValueError
        if *ucd* is invalid
    """
    global _ucd_singleton
    if _ucd_singleton is None:
        _ucd_singleton = UCDWords()

    invalid_char_re = _INVALID_CHAR_COLON_RE if has_colon else _INVALID_CHAR_RE
    m = invalid_char_re.search(ucd)
    if m is not None:
        raise ValueError(f"UCD has invalid character '{m.group(0)}' in '{ucd}'")

    words = []
    for i, word in enumerate(ucd.split(';')):
        colon_count = word.count(':')
        if colon_count == 1:
            ns, word = word.split(':', 1)
            # The whole namespace must be a single word component; a
            # prefix-only match would let e.g. 'a.b' through.
            if not _NAMESPACE_RE.fullmatch(ns):
                raise ValueError(f"Invalid namespace '{ns}'")
            ns = ns.lower()
        elif colon_count > 1:
            raise ValueError(f"Too many colons in '{word}'")
        else:
            ns = 'ivoa'

        # Match the entire word: a prefix-only match would accept malformed
        # words such as 'pos..eq' or 'pos.' on the strength of 'pos'.
        if not _WORD_RE.fullmatch(word):
            raise ValueError(f"Invalid word '{word}'")

        # Only words in the default namespace are checked against the
        # controlled vocabulary.
        if ns == 'ivoa' and check_controlled_vocabulary:
            if i == 0:
                if not _ucd_singleton.is_primary(word):
                    if _ucd_singleton.is_secondary(word):
                        raise ValueError(
                            f"Secondary word '{word}' is not valid as a primary word")
                    else:
                        raise ValueError(f"Unknown word '{word}'")
            else:
                if not _ucd_singleton.is_secondary(word):
                    if _ucd_singleton.is_primary(word):
                        raise ValueError(
                            f"Primary word '{word}' is not valid as a secondary word")
                    else:
                        raise ValueError(f"Unknown word '{word}'")

        # Words outside the vocabulary keep the capitalization they came with.
        try:
            normalized_word = _ucd_singleton.normalize_capitalization(word)
        except KeyError:
            normalized_word = word
        words.append((ns, normalized_word))

    return words


def check_ucd(ucd, check_controlled_vocabulary=False, has_colon=False):
    """
    Returns False if *ucd* is not a valid `unified content descriptor`_.

    Parameters
    ----------
    ucd : str
        The UCD string.  `None` is treated as valid.

    check_controlled_vocabulary : bool, optional
        If `True`, then each word in the UCD will be verified against
        the UCD1+ controlled vocabulary, (as required by the VOTable
        specification version 1.2), otherwise not.

    has_colon : bool, optional
        If `True`, the UCD may contain a colon (as defined in earlier
        versions of the standard).

    Returns
    -------
    valid : bool
    """
    if ucd is None:
        return True

    try:
        parse_ucd(ucd,
                  check_controlled_vocabulary=check_controlled_vocabulary,
                  has_colon=has_colon)
    except ValueError:
        return False
    return True