# Copyright 2008-2020 pydicom authors. See LICENSE file for details.
"""Management of pydicom's data files.


External Data Sources
---------------------

*pydicom* can also search third-party data sources for matching data. To do so
your project should register its entry points in its `setup.py` file. For
example, a project named "mydata" with the interface class ``MyInterface``
should register:

.. code-block:: python

    from setuptools import setup

    setup(
        ...,
        entry_points={
            "pydicom.data.external_sources": "mydata = mydata:MyInterface",
        },
    )

The interface class should have, at a minimum, the following two methods:

* ``get_path(self, name: str, dtype: int) -> str`` - returns the absolute path
  to the first file with a filename `name` or raises a ``ValueError`` if no
  matching file is found.
* ``get_paths(self, pattern: str, dtype: int) -> List[str]`` - returns a list
  of absolute paths to filenames matching `pattern`.

Where `name` is the name of the filename to search for, `dtype` is an int
that indicates the type of data to search for and should be one of the
following:

* ``0`` - DICOM dataset
* ``1`` - Character set file
* ``2`` - Palette file
* ``3`` - DICOMDIR file
* ``4`` - JPEG file

And lastly, `pattern` is a str used to filter files against when searching.

For a real-life example of an external data source you can look at the
`pydicom-data <https://github.com/pydicom/pydicom-data>`_ repository.
"""

from enum import IntEnum
import fnmatch
import os
from pathlib import Path
from typing import Dict, List, Union, Optional, TYPE_CHECKING
import warnings

from pydicom.data.download import (
    data_path_with_download, calculate_file_hash, get_cached_filehash,
    get_url_map, get_data_dir
)

if TYPE_CHECKING:  # pragma: no cover
    from pydicom import Dataset


DATA_ROOT = os.fspath(Path(__file__).parent.resolve())
"""The absolute path to the pydicom/data directory."""


class DataTypes(IntEnum):
    """Constants for data types.

    The values match the ``dtype`` integers passed to the ``get_path()`` and
    ``get_paths()`` methods of external data source interfaces.
    """
    DATASET = 0
    CHARSET = 1
    PALETTE = 2
    DICOMDIR = 3
    JPEG = 4


def _check_data_hash(fpath: str) -> bool:
    """Return ``True`` if the SHA256 checksum of the file at ``fpath`` is OK.

    Parameters
    ----------
    fpath : str
        The absolute path to the file to perform the checksum for.

    Returns
    -------
    bool
        ``True`` if the checksum matches those in ``hashes.json``, ``False``
        otherwise.

    Raises
    ------
    pydicom.data.download.NoHashFound
        If the file is missing from ``hashes.json``.
    """
    p = Path(fpath)
    ext_hash = calculate_file_hash(p)
    # The reference hash is looked up by file name only, not by full path
    ref_hash = get_cached_filehash(p.name)

    return ext_hash == ref_hash


def get_external_sources() -> Dict:
    """Return a :class:`dict` of external data source interfaces.

    Every entry point registered under ``"pydicom.data.external_sources"``
    is loaded and instantiated. If a source named ``"pydicom-data"`` is
    present it is placed first in the returned dict so that it is searched
    before any other source.

    Returns
    -------
    dict
        A dict of ``{'source name': <interface class instance>}``.
    """
    # Imported locally so the dependency is only required when external
    # sources are actually queried
    from pkg_resources import iter_entry_points

    entry_point = "pydicom.data.external_sources"
    sources = {vv.name: vv.load()() for vv in iter_entry_points(entry_point)}

    # Prefer pydicom-data as the source: insert it first, the update() below
    # keeps its position and appends the remaining sources after it
    out = {}
    if "pydicom-data" in sources:
        out["pydicom-data"] = sources["pydicom-data"]

    out.update(sources)

    return out


_EXTERNAL_DATA_SOURCES: Optional[Dict] = None


def external_data_sources() -> Dict:
    """Return the available external data sources - loaded once.

    The result of :func:`get_external_sources` is cached in the module-level
    ``_EXTERNAL_DATA_SOURCES`` on first use and reused afterwards.
    """
    global _EXTERNAL_DATA_SOURCES
    if _EXTERNAL_DATA_SOURCES is None:
        _EXTERNAL_DATA_SOURCES = get_external_sources()
    return _EXTERNAL_DATA_SOURCES


def online_test_file_dummy_paths() -> Dict[str, str]:
    """Return a :class:`dict` of dummy paths to the downloadable test files.

    Returns
    -------
    dict
        A dict of ``{dummy path: filename}`` where each dummy path is the
        downloadable file's name joined onto ``DATA_ROOT/test_files``.
    """
    test_files_root = os.path.join(DATA_ROOT, 'test_files')

    return {
        os.path.join(test_files_root, filename): filename
        for filename in get_url_map()
    }


def fetch_data_files() -> None:
    """Download missing test files to the local cache.

    Raises
    ------
    RuntimeError
        If one or more files could not be retrieved; the message lists the
        names of the affected files.
    """
    cache = get_data_dir()
    paths = [cache / fname for fname in get_url_map()]

    error = []
    for p in paths:
        # Download missing files or files that don't match the hash
        try:
            data_path_with_download(p.name)
        except Exception:
            # Keep going so that every failure is reported at once
            error.append(p.name)

    if error:
        raise RuntimeError(
            "An error occurred downloading the following files: "
            f"{', '.join(error)}"
        )


def get_files(
    base: Union[str, os.PathLike],
    pattern: str = "**/*",
    dtype: int = DataTypes.DATASET
) -> List[str]:
    """Return all matching file paths from the available data sources.

    First searches the local *pydicom* data store, then any locally available
    external sources, and finally the files available in the
    pydicom/pydicom-data repository.

    .. versionchanged:: 2.1

        Added the `dtype` keyword parameter, modified to search locally
        available external data sources and the pydicom/pydicom-data repository

    Parameters
    ----------
    base : str or os.PathLike
        Base directory to recursively search.
    pattern : str, optional
        The pattern to pass to :meth:`~pathlib.Path.glob`, default
        (``'**/*'``).
    dtype : int, optional
        The type of data to search for when using an external source, one of:

        * ``0`` - DICOM dataset
        * ``1`` - Character set file
        * ``2`` - Palette file
        * ``3`` - DICOMDIR file
        * ``4`` - JPEG file

    Returns
    -------
    list of str
        A list of absolute paths to matching files.

    Warns
    -----
    UserWarning
        If one or more of the matching downloadable files could not be
        retrieved, in which case the returned list may be incomplete.
    """
    base = Path(base)

    # Search locally
    files = [os.fspath(m) for m in base.glob(pattern)]

    # Search external sources
    for lib, source in external_data_sources().items():
        fpaths = source.get_paths(pattern, dtype)
        if lib == "pydicom-data":
            # For pydicom-data, check the hash against hashes.json
            fpaths = [p for p in fpaths if _check_data_hash(p)]

        files.extend(fpaths)

    # Search http://github.com/pydicom/pydicom-data or local cache
    # To preserve backwards compatibility filter the downloaded files
    # as if they are stored within DATA_ROOT/test_files/*.dcm
    dummy_path_map = online_test_file_dummy_paths()
    matching_dummy_paths = fnmatch.filter(
        dummy_path_map, os.path.join(base, pattern)
    )
    download_names = [
        os.fspath(dummy_path_map[dummy_path])
        for dummy_path in matching_dummy_paths
    ]

    download_error = False
    for filename in download_names:
        try:
            files.append(os.fspath(data_path_with_download(filename)))
        except Exception:
            # Best effort: skip the file and warn once after the loop
            download_error = True

    if download_error:
        warnings.warn(
            "One or more download failures occurred, the list of matching "
            "file paths may be incomplete"
        )

    return files


def _drop_python_files(files: List[str]) -> List[str]:
    """Return `files` without the paths that end in ``'.py'``."""
    return [filename for filename in files if not filename.endswith('.py')]


def get_palette_files(pattern: str = "**/*") -> List[str]:
    """Return a list of absolute paths to palettes with filenames matching
    `pattern`.

    .. versionadded:: 1.4

    Parameters
    ----------
    pattern : str, optional
        The pattern to pass to :meth:`~pathlib.Path.glob`, default
        (``'**/*'``).

    Returns
    -------
    list of str
        A list of absolute paths to matching files.
    """
    data_path = Path(DATA_ROOT) / 'palettes'

    files = get_files(base=data_path, pattern=pattern, dtype=DataTypes.PALETTE)

    return _drop_python_files(files)


def _find_testdata_path(name: str) -> Optional[str]:
    """Return the path to the first dataset named `name`, or ``None``.

    Searches, in order: the local ``DATA_ROOT/test_files`` directory, the
    registered external data sources and finally the downloadable files.

    Parameters
    ----------
    name : str
        The full file name (without path).

    Returns
    -------
    str or None
        The path to the first match, or ``None`` if nothing matched or the
        download of a matching file failed (a warning is issued in the
        latter case).
    """
    # Check pydicom local - only the first match is needed so don't build
    # the complete list of matches
    data_path = Path(DATA_ROOT) / 'test_files'
    local_match = next(data_path.rglob(name), None)
    if local_match is not None:
        return os.fspath(local_match)

    # Check external data sources
    for lib, source in external_data_sources().items():
        try:
            fpath = source.get_path(name, dtype=DataTypes.DATASET)
        except ValueError:
            # The source has no matching file
            continue

        if not fpath:
            continue

        # For pydicom-data, check the hash against hashes.json; a file that
        # fails the check is ignored and the search continues
        if lib != "pydicom-data" or _check_data_hash(fpath):
            return fpath

    # Try online
    if name in get_url_map():
        try:
            return os.fspath(data_path_with_download(name))
        except Exception:
            warnings.warn(
                f"A download failure occurred while attempting to "
                f"retrieve {name}"
            )

    return None


def get_testdata_file(
    name: str, read: bool = False
) -> Union[str, "Dataset", None]:
    """Return an absolute path to the first matching dataset with filename
    `name`.

    .. versionadded:: 1.4

    First searches the local *pydicom* data store, then any locally available
    external sources, and finally the files available in the
    pydicom/pydicom-data repository.

    .. versionchanged:: 2.1

        Modified to search locally available external data sources and the
        pydicom/pydicom-data repository

    .. versionchanged:: 2.2

        Added the `read` keyword parameter.

    Parameters
    ----------
    name : str
        The full file name (without path)
    read : bool, optional
        If ``True`` then use :func:`~pydicom.filereader.dcmread` to read the
        file and return the corresponding
        :class:`~pydicom.dataset.FileDataset`. Default ``False``.

    Returns
    -------
    str, pydicom.dataset.Dataset or None
        The absolute path of the file if found, the dataset itself if `read` is
        ``True``, or ``None`` if the file is not found.
    """
    # Imported here rather than at module level, as in the original code
    from pydicom.filereader import dcmread

    path = _find_testdata_path(name)
    if path is None or not read:
        return path

    # Reading happens outside of the download error handling so that a read
    # failure is never reported as a download failure
    return dcmread(path, force=True)


def get_testdata_files(pattern: str = "**/*") -> List[str]:
    """Return a list of absolute paths to datasets with filenames matching
    `pattern`.

    Parameters
    ----------
    pattern : str, optional
        The pattern to pass to :meth:`~pathlib.Path.glob`, default
        (``'**/*'``).

    Returns
    -------
    list of str
        A list of absolute paths to matching files.
    """
    data_path = Path(DATA_ROOT) / 'test_files'

    files = get_files(base=data_path, pattern=pattern, dtype=DataTypes.DATASET)

    return _drop_python_files(files)


def get_charset_files(pattern: str = "**/*") -> List[str]:
    """Return a list of absolute paths to charsets with filenames matching
    `pattern`.

    Parameters
    ----------
    pattern : str, optional
        The pattern to pass to :meth:`~pathlib.Path.glob`, default
        (``'**/*'``).

    Returns
    -------
    list of str
        A list of absolute paths to matching files.
    """
    data_path = Path(DATA_ROOT) / 'charset_files'

    files = get_files(base=data_path, pattern=pattern, dtype=DataTypes.CHARSET)

    return _drop_python_files(files)