1# Licensed under a 3-clause BSD style license - see LICENSE.rst
2"""
3This file contains routines to verify the correctness of UCD strings.
4"""
5
6
7# STDLIB
8import re
9
10# LOCAL
11from astropy.utils import data
12
# Names exported by a star-import of this module.
__all__ = ['parse_ucd', 'check_ucd']
14
15
class UCDWords:
    """
    Manages a list of acceptable UCD words.

    Works by reading in a data file exactly as provided by IVOA.  This
    file resides in data/ucd1p-words.txt.

    Every record in that file is one line of three ``|``-separated
    fields: a usage code, the word itself, and its English description.
    Words are indexed by their lower-case form, so every lookup method
    is case-insensitive.
    """

    # Usage codes that mark a word as a valid primary word.
    _PRIMARY_CODES = frozenset('QPEVC')
    # Usage codes that mark a word as a valid secondary word.
    _SECONDARY_CODES = frozenset('QSEVC')

    def __init__(self):
        self._primary = set()
        self._secondary = set()
        self._descriptions = {}
        self._capitalization = {}

        with data.get_pkg_data_fileobj(
                "data/ucd1p-words.txt", encoding='ascii') as fd:
            self._load(fd)

    def _load(self, lines):
        """
        Add every word record found in *lines* to the lookup tables.

        Parameters
        ----------
        lines : iterable of str
            Records of the form ``code | word | description``.  Blank
            lines and lines starting with ``#`` are ignored.

        Raises
        ------
        ValueError
            if a record has fewer than three ``|``-separated fields
        """
        for line in lines:
            record = line.strip()
            # Blank lines and comment lines carry no word.
            if not record or record.startswith('#'):
                continue

            # maxsplit=2 keeps any '|' that occurs inside the description.
            code, name, descr = [
                x.strip() for x in record.split('|', 2)]
            name_lower = name.lower()
            # Set membership (rather than a substring test) so that an
            # empty or multi-character code never matches by accident.
            if code in self._PRIMARY_CODES:
                self._primary.add(name_lower)
            if code in self._SECONDARY_CODES:
                self._secondary.add(name_lower)
            self._descriptions[name_lower] = descr
            self._capitalization[name_lower] = name

    def is_primary(self, name):
        """
        Returns True if *name* is a valid primary name.
        """
        return name.lower() in self._primary

    def is_secondary(self, name):
        """
        Returns True if *name* is a valid secondary name.
        """
        return name.lower() in self._secondary

    def get_description(self, name):
        """
        Returns the official English description of the given UCD
        *name*.

        Raises `KeyError` if *name* is not a known word.
        """
        return self._descriptions[name.lower()]

    def normalize_capitalization(self, name):
        """
        Returns the standard capitalization form of the given name.

        Raises `KeyError` if *name* is not a known word.
        """
        return self._capitalization[name.lower()]
67
68
# Shared UCDWords instance.  It stays None until parse_ucd() is first
# called, which creates it and stores it here for later calls.
_ucd_singleton = None
70
71
def parse_ucd(ucd, check_controlled_vocabulary=False, has_colon=False):
    """
    Parse the UCD into its component parts.

    Parameters
    ----------
    ucd : str
        The UCD string

    check_controlled_vocabulary : bool, optional
        If `True`, then each word in the UCD will be verified against
        the UCD1+ controlled vocabulary, (as required by the VOTable
        specification version 1.2), otherwise not.

    has_colon : bool, optional
        If `True`, the UCD may contain a colon (as defined in earlier
        versions of the standard).

    Returns
    -------
    parts : list
        The result is a list of tuples of the form:

            (*namespace*, *word*)

        If no namespace was explicitly specified, *namespace* will be
        returned as ``'ivoa'`` (i.e., the default namespace).
        An explicit namespace is returned in lower case.  A word that
        is present in the controlled vocabulary is returned in its
        standard capitalization; any other word is returned unchanged.

    Raises
    ------
    ValueError
        if *ucd* is invalid
    """
    global _ucd_singleton
    if _ucd_singleton is None:
        # The vocabulary is only loaded the first time it is needed.
        _ucd_singleton = UCDWords()

    # Reject any character outside the allowed alphabet up front; the
    # colon is only part of that alphabet when has_colon is set.
    if has_colon:
        m = re.search(r'[^A-Za-z0-9_.:;\-]', ucd)
    else:
        m = re.search(r'[^A-Za-z0-9_.;\-]', ucd)
    if m is not None:
        raise ValueError(f"UCD has invalid character '{m.group(0)}' in '{ucd}'")

    word_component_re = r'[A-Za-z0-9][A-Za-z0-9\-_]*'
    word_re = fr'{word_component_re}(\.{word_component_re})*'

    words = []
    for i, word in enumerate(ucd.split(';')):
        colon_count = word.count(':')
        if colon_count > 1:
            raise ValueError(f"Too many colons in '{word}'")
        if colon_count == 1:
            ns, word = word.split(':', 1)
            # fullmatch: the whole namespace must be a single component,
            # not merely start with one (e.g. 'a.b' is rejected).
            if not re.fullmatch(word_component_re, ns):
                raise ValueError(f"Invalid namespace '{ns}'")
            ns = ns.lower()
        else:
            ns = 'ivoa'

        # fullmatch: a valid prefix is not enough, so words such as
        # 'pos.' or 'pos..eq' are rejected.
        if not re.fullmatch(word_re, word):
            raise ValueError(f"Invalid word '{word}'")

        # Only words in the default namespace are checked against the
        # controlled vocabulary.
        if ns == 'ivoa' and check_controlled_vocabulary:
            if i == 0:
                # The first word must be usable as a primary word.
                if not _ucd_singleton.is_primary(word):
                    if _ucd_singleton.is_secondary(word):
                        raise ValueError(
                            f"Secondary word '{word}' is not valid as a primary word")
                    raise ValueError(f"Unknown word '{word}'")
            elif not _ucd_singleton.is_secondary(word):
                # Every later word must be usable as a secondary word.
                if _ucd_singleton.is_primary(word):
                    raise ValueError(
                        f"Primary word '{word}' is not valid as a secondary word")
                raise ValueError(f"Unknown word '{word}'")

        try:
            normalized_word = _ucd_singleton.normalize_capitalization(word)
        except KeyError:
            # Not in the vocabulary: keep the word as it was written.
            normalized_word = word
        words.append((ns, normalized_word))

    return words
159
160
def check_ucd(ucd, check_controlled_vocabulary=False, has_colon=False):
    """
    Report whether *ucd* is a valid `unified content descriptor`_.

    This is a boolean wrapper around `parse_ucd`: the string is parsed
    and a `ValueError` from the parser is turned into `False`.

    Parameters
    ----------
    ucd : str
        The UCD string.  `None` is accepted and treated as valid.

    check_controlled_vocabulary : bool, optional
        If `True`, then each word in the UCD will be verified against
        the UCD1+ controlled vocabulary, (as required by the VOTable
        specification version 1.2), otherwise not.

    has_colon : bool, optional
        If `True`, the UCD may contain a colon (as defined in earlier
        versions of the standard).

    Returns
    -------
    valid : bool
        `False` if parsing *ucd* raised `ValueError`, `True` otherwise.
    """
    # Nothing to parse when no UCD was given; that counts as valid.
    if ucd is not None:
        options = {
            'check_controlled_vocabulary': check_controlled_vocabulary,
            'has_colon': has_colon,
        }
        try:
            parse_ucd(ucd, **options)
        except ValueError:
            return False
    return True
193