1# -*- coding: utf-8 -*-
2
3# This Source Code Form is subject to the terms of the Mozilla Public
4# License, v. 2.0. If a copy of the MPL was not distributed with this
5# file, You can obtain one at http://mozilla.org/MPL/2.0/.
6
7from collections import OrderedDict
8import datetime
9import functools
10import json
11from pathlib import Path
12import sys
13import textwrap
14from typing import Any, Callable, Iterable, Sequence, Tuple, Union
15import urllib.request
16
17import appdirs  # type: ignore
18import diskcache  # type: ignore
19import jinja2
20import jsonschema  # type: ignore
21from jsonschema import _utils  # type: ignore
22import yaml
23
24if sys.version_info < (3, 7):
25    import iso8601  # type: ignore
26
27
# True when running under py.test; enables test-only conveniences such as
# accepting literal dict content in `load_yaml_or_json`.
TESTING_MODE = "pytest" in sys.modules


JSONType = Union[list, dict, str, int, float, None]
"""
The types supported by JSON.

This is only an approximation -- this should really be a recursive type.
"""
37
38# Adapted from
39# https://stackoverflow.com/questions/34667108/ignore-dates-and-times-while-parsing-yaml
40
41
class _NoDatesSafeLoader(yaml.SafeLoader):
    @classmethod
    def remove_implicit_resolver(cls, tag_to_remove):
        """
        Drop every implicit resolver for ``tag_to_remove`` from this loader.

        Takes care not to modify resolvers in super classes: the resolver
        table is copied onto this class before being mutated.

        We want to load datetimes as strings, not dates, because we
        go on to serialise as json which doesn't have the advanced types
        of yaml, and leads to incompatibilities down the track.
        """
        # Shadow the inherited resolver table with a copy before mutating it.
        if "yaml_implicit_resolvers" not in cls.__dict__:
            cls.yaml_implicit_resolvers = cls.yaml_implicit_resolvers.copy()

        resolvers = cls.yaml_implicit_resolvers
        for first_letter in resolvers:
            resolvers[first_letter] = [
                pair for pair in resolvers[first_letter] if pair[0] != tag_to_remove
            ]


# Since we use JSON schema to validate, and JSON schema doesn't support
# datetimes, we don't want the YAML loader to give us datetimes -- just
# strings.
_NoDatesSafeLoader.remove_implicit_resolver("tag:yaml.org,2002:timestamp")
67
68
def yaml_load(stream):
    """
    Map line number to yaml nodes, and preserve the order
    of metrics as they appear in the metrics.yaml file.
    """

    class SafeLineLoader(_NoDatesSafeLoader):
        pass

    def _mapping_with_line(loader, node):
        # Record the source line of each mapping so later error messages can
        # point back into the YAML file.
        loader.flatten_mapping(node)
        result = OrderedDict(loader.construct_pairs(node))
        result.defined_in = {"line": node.start_mark.line}
        return result

    SafeLineLoader.add_constructor(
        yaml.resolver.BaseResolver.DEFAULT_MAPPING_TAG, _mapping_with_line
    )
    return yaml.load(stream, SafeLineLoader)
88
89
def ordered_yaml_dump(data, **kwargs):
    """
    Dump `data` to YAML, emitting OrderedDicts as plain mappings in
    insertion order rather than as python/object tags.
    """

    class _OrderedDumper(yaml.Dumper):
        pass

    _OrderedDumper.add_representer(
        OrderedDict,
        lambda dumper, value: dumper.represent_mapping(
            yaml.resolver.BaseResolver.DEFAULT_MAPPING_TAG, value.items()
        ),
    )
    return yaml.dump(data, Dumper=_OrderedDumper, **kwargs)
101
102
def load_yaml_or_json(path: Path):
    """
    Load the content from either a .json or .yaml file, based on the filename
    extension.

    :param path: `pathlib.Path` object
    :rtype object: The tree of objects as a result of parsing the file.
    :raises ValueError: The file is neither a .json, .yml or .yaml file.
    :raises FileNotFoundError: The file does not exist.
    """
    # If in py.test, support bits of literal JSON/YAML content
    if TESTING_MODE and isinstance(path, dict):
        return path

    if path.suffix == ".json":
        with path.open("r", encoding="utf-8") as fd:
            return json.load(fd)

    if path.suffix in (".yml", ".yaml", ".yamlx"):
        with path.open("r", encoding="utf-8") as fd:
            return yaml_load(fd)

    raise ValueError(f"Unknown file extension {path.suffix}")
125
126
def ensure_list(value: Any) -> Sequence[Any]:
    """
    Ensures that the value is a list. If it is anything but a list or tuple, a
    list with a single element containing only value is returned.
    """
    return value if isinstance(value, (list, tuple)) else [value]
135
136
def to_camel_case(input: str, capitalize_first_letter: bool) -> str:
    """
    Convert the value to camelCase.

    This additionally replaces any '.' with '_'. The first letter is capitalized
    depending on `capitalize_first_letter`.

    :param input: the string to convert.
    :param capitalize_first_letter: whether the first token is capitalized
        (CamelCase) rather than lowercased (camelCase).
    :return: the camel-cased string; "" when `input` has no word characters.
    """
    sanitized_input = input.replace(".", "_").replace("-", "_")
    # Filter out any empty token. This could happen due to leading '_' or
    # consecutive '__'.
    tokens = [s.capitalize() for s in sanitized_input.split("_") if len(s) != 0]
    # An input consisting only of separators (or an empty string) produces no
    # tokens at all; return "" instead of raising IndexError below.
    if not tokens:
        return ""
    # If we're not meant to capitalize the first letter, then lowercase it.
    if not capitalize_first_letter:
        tokens[0] = tokens[0].lower()
    # Finally join the tokens and capitalize.
    return "".join(tokens)
153
154
def camelize(value: str) -> str:
    """
    Convert the value to camelCase (with a lower case first letter).

    This is a thin wrapper around inflection.camelize that handles dots in
    addition to underscores.
    """
    return to_camel_case(value, capitalize_first_letter=False)
163
164
def Camelize(value: str) -> str:
    """
    Convert the value to CamelCase (with an upper case first letter).

    This is a thin wrapper around inflection.camelize that handles dots in
    addition to underscores.
    """
    return to_camel_case(value, capitalize_first_letter=True)
173
174
def snake_case(value: str) -> str:
    """
    Convert the value to snake_case.

    Dots and dashes become underscores; everything is lowercased.
    """
    return value.replace(".", "_").replace("-", "_").lower()
180
181
@functools.lru_cache()
def get_jinja2_template(
    template_name: str, filters: Iterable[Tuple[str, Callable]] = ()
):
    """
    Get a Jinja2 template that ships with glean_parser.

    The template has extra filters for camel-casing identifiers.

    :param template_name: Name of a file in ``glean_parser/templates``
    :param filters: tuple of 2-tuple. A tuple of (name, func) pairs defining
        additional filters.
    """
    env = jinja2.Environment(
        loader=jinja2.PackageLoader("glean_parser", "templates"),
        trim_blocks=True,
        lstrip_blocks=True,
    )

    # Built-in casing filters first, then any caller-supplied filters (which
    # may therefore override the built-ins, as before).
    builtin_filters = [("camelize", camelize), ("Camelize", Camelize)]
    for filter_name, filter_func in builtin_filters + list(filters):
        env.filters[filter_name] = filter_func

    return env.get_template(template_name)
207
208
def keep_value(f):
    """
    Wrap a generator so the value it returns (rather than yields), will be
    accessible on the .value attribute when the generator is exhausted.
    """

    class _ReturnCapturingIterator(object):
        # Re-yields everything from the wrapped generator, then stashes the
        # generator's `return` value on `.value` once it is exhausted.
        def __init__(self, generator):
            self.g = generator
            self.value = None

        def __iter__(self):
            self.value = yield from self.g

    @functools.wraps(f)
    def wrapper(*args, **kwargs):
        return _ReturnCapturingIterator(f(*args, **kwargs))

    return wrapper
228
229
def get_null_resolver(schema):
    """
    Returns a JSON Pointer resolver that does nothing.

    This lets us handle the moz: URLs in our schemas.
    """

    class NullResolver(jsonschema.RefResolver):
        def resolve_remote(self, uri):
            # Serve from the local store; the empty URI maps to the referring
            # schema itself; everything else resolves to None (ignored).
            if uri in self.store:
                return self.store[uri]
            return self.referrer if uri == "" else None

    return NullResolver.from_schema(schema)
245
246
def fetch_remote_url(url: str, cache: bool = True):
    """
    Fetches the contents from an HTTP url or local file path, and optionally
    caches it to disk.

    :param url: an ``http(s)://`` URL, or a local file path.
    :param cache: when True, HTTP responses are cached in the user cache dir.
    :return: the text content, as `str` for both branches.
    """
    # Include the Python version in the cache key, since caches aren't
    # sharable across Python versions.
    key = (url, str(sys.version_info))

    is_http = url.startswith("http")

    if not is_http:
        with open(url, "r", encoding="utf-8") as fd:
            return fd.read()

    if cache:
        cache_dir = appdirs.user_cache_dir("glean_parser", "mozilla")
        with diskcache.Cache(cache_dir) as dc:
            if key in dc:
                return dc[key]

    # `read()` returns bytes; decode so the HTTP branch returns `str` like
    # the local-file branch does and like the annotation promises.
    contents: str = urllib.request.urlopen(url).read().decode("utf-8")

    if cache:
        with diskcache.Cache(cache_dir) as dc:
            dc[key] = contents

    return contents
275
276
# Sentinel used by `pprint_validation_error` to detect members missing from a
# jsonschema ValidationError.
_unset = _utils.Unset()
278
279
def pprint_validation_error(error) -> str:
    """
    A version of jsonschema's ValidationError __str__ method that doesn't
    include the schema fragment that failed.  This makes the error messages
    much more succinct.

    It also shows any subschemas of anyOf/allOf that failed, if any (what
    jsonschema calls "context").

    :param error: a `jsonschema.exceptions.ValidationError`.
    :return: a human-readable, multi-line error message.
    """
    # If any of these members is unset, fall back to the short message only.
    essential_for_verbose = (
        error.validator,
        error.validator_value,
        error.instance,
        error.schema,
    )
    if any(m is _unset for m in essential_for_verbose):
        return textwrap.fill(error.message)

    # Re-wrap the offending instance in its enclosing keys/indices (innermost
    # first) so the YAML snippet below shows where in the document it lives.
    instance = error.instance
    for path in list(error.relative_path)[::-1]:
        if isinstance(path, str):
            instance = {path: instance}
        else:
            instance = [instance]

    yaml_instance = ordered_yaml_dump(instance, width=72, default_flow_style=False)

    parts = ["```", yaml_instance.rstrip(), "```", "", textwrap.fill(error.message)]
    if error.context:
        # Sub-errors from failed anyOf/allOf subschemas, indented under the
        # main message.
        parts.extend(
            textwrap.fill(x.message, initial_indent="    ", subsequent_indent="    ")
            for x in error.context
        )

    description = error.schema.get("description")
    if description:
        parts.extend(["", "Documentation for this node:", _utils.indent(description)])

    return "\n".join(parts)
319
320
def format_error(filepath: Union[str, Path], header: str, content: str) -> str:
    """
    Format a jsonschema validation error.

    Non-`Path` filepaths (literal content, as used in tests) are reported as
    "<string>".
    """
    location = filepath.resolve() if isinstance(filepath, Path) else "<string>"
    if header:
        return f"{location}: {header}\n{_utils.indent(content)}"
    return f"{location}:\n{_utils.indent(content)}"
333
334
def parse_expires(expires: str) -> datetime.date:
    """
    Parses the expired field date (yyyy-mm-dd) as a date.
    Raises a ValueError in case the string is not properly formatted.
    """
    try:
        if sys.version_info >= (3, 7):
            return datetime.date.fromisoformat(expires)
        # Python 3.6 has no `date.fromisoformat`; use the iso8601 package.
        try:
            return iso8601.parse_date(expires).date()
        except iso8601.ParseError:
            raise ValueError()
    except ValueError:
        raise ValueError(
            f"Invalid expiration date '{expires}'. "
            "Must be of the form yyyy-mm-dd in UTC."
        )
353
354
def is_expired(expires: str) -> bool:
    """
    Parses the `expires` field in a metric or ping and returns whether
    the object should be considered expired.
    """
    if expires == "never":
        return False
    if expires == "expired":
        return True
    # Anything else must be a yyyy-mm-dd date: expired once today reaches it.
    return parse_expires(expires) <= datetime.datetime.utcnow().date()
367
368
def validate_expires(expires: str) -> None:
    """
    Raises a ValueError in case the `expires` is not ISO8601 parseable,
    or in case the date is more than 730 days (~2 years) in the future.

    :param expires: "never", "expired", or a yyyy-mm-dd date string.
    :raises ValueError: if the date is malformed or too far in the future.
    """
    # The sentinel values are always acceptable.
    if expires in ("never", "expired"):
        return

    date = parse_expires(expires)
    max_date = datetime.datetime.now() + datetime.timedelta(days=730)
    if date > max_date.date():
        raise ValueError(
            f"'{expires}' is more than 730 days (~2 years) in the future.",
            "Please make sure this is intentional.",
            # Typo fix in the lint hint: "supress" -> "suppress".
            "You can suppress this warning by adding EXPIRATION_DATE_TOO_FAR to no_lint",
            "See: https://mozilla.github.io/glean_parser/metrics-yaml.html#no_lint",
        )
386
387
def report_validation_errors(all_objects):
    """
    Report any validation errors found to the console.

    :param all_objects: iterable of error values to print to stderr.
    :return: True if at least one error was printed, False otherwise.
    """
    had_errors = False
    separator = "=" * 78
    for error in all_objects:
        had_errors = True
        print(separator, file=sys.stderr)
        print(error, file=sys.stderr)
    return had_errors
398
399
def remove_output_params(d, output_params):
    """
    Remove output-only params, such as "defined_in",
    in order to validate the output against the input schema.

    :param d: the mapping to filter.
    :param output_params: the name of the output-only parameter to drop.
    :return: a new dict without that key.
    """
    # Compare by equality, not identity: the original `key is not
    # output_params` only worked when the two equal strings happened to be
    # the same (interned) object, so a dynamically-built key could slip
    # through.
    return {key: value for key, value in d.items() if key != output_params}
410
411
# Names of parameters to pass to all metrics constructors.
common_metric_args = [
    "category",
    "name",
    "send_in_pings",
    "lifetime",
    "disabled",
]


# Names of parameters that only apply to some of the metrics types.
# **CAUTION**: This list needs to be in the order the Swift & Rust type constructors
# expects them. (The other language bindings don't care about the order).
extra_metric_args = [
    "time_unit",
    "memory_unit",
    "allowed_extra_keys",
    "reason_codes",
    "range_min",
    "range_max",
    "bucket_count",
    "histogram_type",
    "numerators",
]


# This includes only things that the language bindings care about, not things
# that are metadata-only or are resolved into other parameters at parse time.
# **CAUTION**: This list needs to be in the order the Swift & Rust type constructors
# expects them. (The other language bindings don't care about the order). The
# `test_order_of_fields` test checks that the generated code is valid.
# **DO NOT CHANGE THE ORDER OR ADD NEW FIELDS IN THE MIDDLE**
metric_args = common_metric_args + extra_metric_args


# Names of ping parameters to pass to constructors.
ping_args = [
    "include_client_id",
    "send_if_empty",
    "name",
    "reason_codes",
]


# Names of parameters to pass to both metric and ping constructors (no duplicates).
# Metric args keep their relative order; ping-only args are appended at the end.
extra_args = metric_args + [v for v in ping_args if v not in metric_args]
458