1# Copyright 2008-2020 pydicom authors. See LICENSE file for details.
2"""Management of pydicom's data files.
3
4
5External Data Sources
6---------------------
7
8*pydicom* can also search third-party data sources for matching data. To do so
9your project should register its entry points in its `setup.py` file. For
10example, a project named "mydata" with the interface class ``MyInterface``
11should register:
12
.. code-block:: python
14
15    from setuptools import setup
16
17    setup(
18        ...,
19        entry_points={
20            "pydicom.data.external_sources": "mydata = mydata:MyInterface",
21        },
22    )
23
24The interface class should have, at a minimum, the following two methods:
25
26* ``get_path(self, name: str, dtype: int) -> str`` - returns the absolute path
27  to the first file with a filename `name` or raises a ``ValueError`` if no
28  matching file found.
29* ``get_paths(self, pattern: str, dtype: int) -> List[str]`` - returns a list
30  of absolute paths to filenames matching `pattern`.
31
Where `name` is the filename to search for, `dtype` is an int
33that indicates the type of data to search for and should be one of the
34following:
35
36* ``0`` - DICOM dataset
37* ``1`` - Character set file
38* ``2`` - Palette file
39* ``3`` - DICOMDIR file
40* ``4`` - JPEG file
41
42And lastly, `pattern` is a str used to filter files against when searching.
43
44For a real-life example of an external data source you can look at the
45`pydicom-data <https://github.com/pydicom/pydicom-data>`_ repository.
46"""
47
48from enum import IntEnum
49import fnmatch
50import os
51from pathlib import Path
52from typing import Dict, List, Union, Optional, TYPE_CHECKING
53import warnings
54
55from pydicom.data.download import (
56    data_path_with_download, calculate_file_hash, get_cached_filehash,
57    get_url_map, get_data_dir
58)
59
60if TYPE_CHECKING:  # pragma: no cover
61    from pydicom import Dataset
62
63
# Computed once at import time from this module's own location and stored as
# a plain ``str`` (via ``os.fspath``) rather than as a ``Path`` object.
DATA_ROOT = os.fspath(Path(__file__).parent.resolve())
"""The absolute path to the pydicom/data directory."""
66
67
class DataTypes(IntEnum):
    """Constants for data types.

    The members are the ``dtype`` values passed to the ``get_path()`` and
    ``get_paths()`` methods of external data sources to indicate which type
    of data is being searched for.
    """
    DATASET = 0  # DICOM dataset
    CHARSET = 1  # Character set file
    PALETTE = 2  # Palette file
    DICOMDIR = 3  # DICOMDIR file
    JPEG = 4  # JPEG file
75
76
def _check_data_hash(fpath: str) -> bool:
    """Return ``True`` if the file at `fpath` has the expected checksum.

    The hash calculated for the file is compared against the reference hash
    stored for a file with the same name.

    Parameters
    ----------
    fpath : str
        The absolute path to the file to be checked.

    Returns
    -------
    bool
        ``True`` if the calculated hash is equal to the reference hash,
        ``False`` otherwise.

    Raises
    ------
    pydicom.data.download.NoHashFound
        If there is no reference hash available for the file.
    """
    target = Path(fpath)
    calculated = calculate_file_hash(target)

    # The reference hashes are keyed by file name only, not by full path
    return calculated == get_cached_filehash(target.name)
101
102
def get_external_sources() -> Dict:
    """Return a :class:`dict` of external data source interfaces.

    The entry points registered under the ``pydicom.data.external_sources``
    group are discovered, loaded and instantiated. The standard library's
    :mod:`importlib.metadata` is used when it's available, which avoids
    requiring *setuptools* (and its deprecated ``pkg_resources`` module) at
    runtime. ``pkg_resources`` is only used as a fallback for Python versions
    without :mod:`importlib.metadata`.

    Returns
    -------
    dict
        A dict of ``{'source name': <interface class instance>}``. If a
        source named ``"pydicom-data"`` is available it will always be the
        first item.
    """
    group = "pydicom.data.external_sources"

    try:
        from importlib.metadata import entry_points
    except ImportError:
        # Python < 3.8: no importlib.metadata, fall back to setuptools
        from pkg_resources import iter_entry_points

        found = list(iter_entry_points(group))
    else:
        available = entry_points()
        if hasattr(available, "select"):
            # Python 3.10+: selectable entry points
            found = list(available.select(group=group))
        else:
            # Python 3.8 and 3.9: a dict of {group: entry points}
            found = list(available.get(group, ()))

    # Each entry point refers to an interface class, which gets instantiated
    sources = {ep.name: ep.load()() for ep in found}

    # Prefer pydicom-data as the source by making sure it's inserted first,
    # update() doesn't change the position of keys that are already present
    out = {}
    if "pydicom-data" in sources:
        out["pydicom-data"] = sources["pydicom-data"]

    out.update(sources)

    return out
124
125
# Module-level cache of the external sources, ``None`` until the first lookup
_EXTERNAL_DATA_SOURCES: Optional[Dict] = None


def external_data_sources() -> Dict:
    """Return the available external data sources.

    The sources are only looked up the first time the function is called,
    subsequent calls return the same cached :class:`dict`.
    """
    global _EXTERNAL_DATA_SOURCES

    if _EXTERNAL_DATA_SOURCES is not None:
        return _EXTERNAL_DATA_SOURCES

    _EXTERNAL_DATA_SOURCES = get_external_sources()

    return _EXTERNAL_DATA_SOURCES
135
136
def online_test_file_dummy_paths() -> Dict[str, str]:
    """Return a :class:`dict` of dummy paths to the downloadable test files.

    Each downloadable file is given a path as if it were stored directly in
    the ``test_files`` directory under :attr:`DATA_ROOT`.

    Returns
    -------
    dict
        A dict of ``{dummy path: filename}`` for the test files available via
        download.
    """
    url_map = get_url_map()
    root = os.path.join(DATA_ROOT, 'test_files')

    return {os.path.join(root, fname): fname for fname in url_map}
155
156
def fetch_data_files() -> None:
    """Download missing test files to the local cache.

    Raises
    ------
    RuntimeError
        If an error occurred for one or more of the files, raised only after
        all the files have been attempted.
    """
    cache_dir = get_data_dir()
    targets = dict.fromkeys(
        cache_dir / fname for fname in get_url_map().keys()
    )

    failed = []
    for target in targets:
        # Download missing files or files that don't match the hash
        try:
            data_path_with_download(target.name)
        except Exception:
            failed.append(target.name)

    if not failed:
        return

    raise RuntimeError(
        "An error occurred downloading the following files: "
        + ', '.join(failed)
    )
175
176
def get_files(
        base: Union[str, os.PathLike],
        pattern: str = "**/*",
        dtype: int = DataTypes.DATASET
) -> List[str]:
    """Return all matching file paths from the available data sources.

    The search order is: the local directory `base`, then any locally
    available external sources, and finally the files available for download
    from the pydicom/pydicom-data repository (or already in the local cache).

    .. versionchanged:: 2.1

        Added the `dtype` keyword parameter, modified to search locally
        available external data sources and the pydicom/pydicom-data repository

    Parameters
    ----------
    base : str or os.PathLike
        Base directory to recursively search.
    pattern : str, optional
        The pattern to pass to :meth:`~pathlib.Path.glob`, default
        (``'**/*'``).
    dtype : int, optional
        The type of data to search for when using an external source, one of:

        * ``0`` - DICOM dataset
        * ``1`` - Character set file
        * ``2`` - Palette file
        * ``3`` - DICOMDIR file
        * ``4`` - JPEG file

    Returns
    -------
    list of str
        A list of absolute paths to matching files.

    Warns
    -----
    UserWarning
        If one or more of the matching downloadable files couldn't be
        retrieved, in which case the returned list may be incomplete.
    """
    search_root = Path(base)

    # 1. The local directory
    matches = [os.fspath(hit) for hit in search_root.glob(pattern)]

    # 2. The external sources
    for lib, source in external_data_sources().items():
        found = source.get_paths(pattern, dtype)
        if lib == "pydicom-data":
            # Only keep pydicom-data files whose hash matches hashes.json
            found = [fpath for fpath in found if _check_data_hash(fpath)]

        matches.extend(found)

    # 3. http://github.com/pydicom/pydicom-data or the local cache
    # To preserve backwards compatibility the downloadable files are filtered
    # as if they are stored within DATA_ROOT/test_files/*.dcm
    dummy_map = online_test_file_dummy_paths()
    full_pattern = os.path.join(search_root, pattern)
    wanted = fnmatch.filter(dummy_map.keys(), full_pattern)
    download_names = [os.fspath(dummy_map[dummy]) for dummy in wanted]

    downloaded = []
    any_failed = False
    for fname in download_names:
        try:
            real_path = data_path_with_download(fname)
            downloaded.append(os.fspath(real_path))
        except Exception:
            # Best effort: keep going and warn once at the end
            any_failed = True

    matches += downloaded

    if any_failed:
        warnings.warn(
            "One or more download failures occurred, the list of matching "
            "file paths may be incomplete"
        )

    return matches
259
260
def get_palette_files(pattern: str = "**/*") -> List[str]:
    """Return a list of absolute paths to palettes with filenames matching
    `pattern`.

    .. versionadded:: 1.4

    Parameters
    ----------
    pattern : str, optional
        The pattern to pass to :meth:`~pathlib.Path.glob`, default
        (``'**/*'``).

    Returns
    -------
    list of str
        A list of absolute paths to matching files, excluding any paths
        ending in ``.py``.
    """
    palette_dir = Path(DATA_ROOT) / 'palettes'
    found = get_files(
        base=palette_dir, pattern=pattern, dtype=DataTypes.PALETTE
    )

    # Python source files aren't data files
    return [fpath for fpath in found if not fpath.endswith('.py')]
284
285
def get_testdata_file(
    name: str, read: bool = False
) -> Union[str, "Dataset", None]:
    """Return an absolute path to the first matching dataset with filename
    `name`.

    .. versionadded:: 1.4

    The search order is: the local *pydicom* data store, then any locally
    available external sources, and finally the files available in the
    pydicom/pydicom-data repository.

    .. versionchanged:: 2.1

        Modified to search locally available external data sources and the
        pydicom/pydicom-data repository

    .. versionchanged:: 2.2

        Added the `read` keyword parameter.

    Parameters
    ----------
    name : str
        The full file name (without path)
    read : bool, optional
        If ``True`` then use :func:`~pydicom.filereader.dcmread` to read the
        file and return the corresponding
        :class:`~pydicom.dataset.FileDataset`. Default ``False``.

    Returns
    -------
    str, pydicom.dataset.Dataset or None
        The absolute path of the file if found, the dataset itself if `read` is
        ``True``, or ``None`` if the file is not found.
    """
    from pydicom.filereader import dcmread

    def _as_result(found: str) -> Union[str, "Dataset"]:
        """Return `found`, or the dataset read from it if `read` is set."""
        if read:
            return dcmread(found, force=True)

        return found

    # 1. The local pydicom data store
    local_root = Path(DATA_ROOT) / 'test_files'
    local_hits = list(local_root.rglob(name))
    if local_hits:
        return _as_result(os.fspath(local_hits[0]))

    # 2. The external data sources
    for lib, source in external_data_sources().items():
        try:
            fpath = source.get_path(name, dtype=DataTypes.DATASET)
        except ValueError:
            # This source has no matching file
            continue

        if not fpath:
            continue

        # For pydicom-data, the hash must also match hashes.json
        if lib != "pydicom-data" or _check_data_hash(fpath):
            return _as_result(fpath)

    # 3. The downloadable files
    if name in get_url_map():
        try:
            return _as_result(os.fspath(data_path_with_download(name)))
        except Exception:
            warnings.warn(
                f"A download failure occurred while attempting to "
                f"retrieve {name}"
            )

    return None
358
359
def get_testdata_files(pattern: str = "**/*") -> List[str]:
    """Return a list of absolute paths to datasets with filenames matching
    `pattern`.

    Parameters
    ----------
    pattern : str, optional
        The pattern to pass to :meth:`~pathlib.Path.glob`, default
        (``'**/*'``).

    Returns
    -------
    list of str
        A list of absolute paths to matching files, excluding any paths
        ending in ``.py``.
    """
    dataset_dir = Path(DATA_ROOT) / 'test_files'
    found = get_files(
        base=dataset_dir, pattern=pattern, dtype=DataTypes.DATASET
    )

    # Python source files aren't data files
    return [fpath for fpath in found if not fpath.endswith('.py')]
381
382
def get_charset_files(pattern: str = "**/*") -> List[str]:
    """Return a list of absolute paths to charsets with filenames matching
    `pattern`.

    Parameters
    ----------
    pattern : str, optional
        The pattern to pass to :meth:`~pathlib.Path.glob`, default
        (``'**/*'``).

    Returns
    -------
    list of str
        A list of absolute paths to matching files, excluding any paths
        ending in ``.py``.
    """
    charset_dir = Path(DATA_ROOT) / 'charset_files'
    found = get_files(
        base=charset_dir, pattern=pattern, dtype=DataTypes.CHARSET
    )

    # Python source files aren't data files
    return [fpath for fpath in found if not fpath.endswith('.py')]
404