1"""Auto data preparation."""
2import os
3import re
4from pathlib import Path
5import yaml
6from ...utils.download import download
7from ...utils.filesystem import unzip, untar, PathTree
8
def url_data(url, path=None, overwrite=False, overwrite_folder=False, sha1_hash=None, root=None, disp_depth=1):
    """Download the archive at ``url`` and extract it.

    Parameters
    ----------
    url : str
        URL to download.
    path : str, optional
        Destination path to store the downloaded file. When omitted, the
        location is resolved with ``URLs.path(url, c_key='archive')``.
    overwrite : bool, optional
        Whether to overwrite destination file if already exists
        (forwarded to ``download``).
    overwrite_folder : bool, optional
        Forwarded to the extraction helper as ``strict``. You may use this
        option if you suspect the destination is corrupted or some files
        are missing.
    sha1_hash : str, optional
        Expected sha1 hash in hexadecimal digits (forwarded to ``download``).
    root : str, optional
        Destination root dir to store extracted files. When omitted, a
        directory named after the archive (file name without its last
        suffix) is created next to ``URLs.path(url, c_key='data')``.
    disp_depth : int, optional
        If set > 0, will print out the tree structure of extracted dataset
        folder with maximum `disp_depth`.

    Returns
    -------
    pathlib.Path
        Path built from the value returned by the extraction helper.

    Raises
    ------
    ValueError
        If the downloaded file name ends with neither ``.zip`` nor ``gz``.
    """
    archive = Path(path or URLs.path(url, c_key='archive'))
    archive.parent.mkdir(parents=True, exist_ok=True)
    fname = download(url, path=str(archive.resolve()), overwrite=overwrite, sha1_hash=sha1_hash)

    # Pick the extractor before touching the filesystem so an unsupported
    # archive does not leave an empty extraction directory behind.
    archive_name = str(fname)
    if archive_name.endswith('.zip'):
        extract = unzip
    elif archive_name.endswith('gz'):
        extract = untar
    else:
        raise ValueError('Unknown url data with file: {}'.format(fname))

    if root:
        # The caller chose the destination; the default directory is not needed.
        extract_root = root
    else:
        default_root = URLs.path(url, c_key='data')
        extract_root = default_root.parent.joinpath(default_root.stem)
        extract_root.mkdir(parents=True, exist_ok=True)

    folder = extract(fname, root=extract_root, strict=overwrite_folder)

    if disp_depth > 0:
        print(PathTree(folder, disp_depth))

    return Path(folder)
56
57
class Config:
    """Settings persisted as YAML in `config.yml` inside the config directory.

    The config directory is `$MXNET_HOME` when that variable is set and
    `~/.gluoncv` otherwise. Instantiating the class creates the directory and
    a default config file when they are missing, then loads the file into `d`.
    """
    config_path = Path(os.getenv('MXNET_HOME', '~/.gluoncv')).expanduser()
    config_file = config_path/'config.yml'

    def __init__(self):
        self.config_path.mkdir(parents=True, exist_ok=True)
        if not self.config_file.exists():
            self.create_config()
        self.d = self.load_config()

    def __getitem__(self, k):
        "Return entry `k` (lower-cased; `'_path'` is appended when `k` is not a key) as a `Path`."
        k = k.lower()
        if k not in self.d:
            k = k + '_path'
        return Path(self.d[k])

    def __getattr__(self, k):
        "Attribute-style access to config entries; unknown names raise `AttributeError`."
        # `d` itself must never be resolved through `self[k]`: before `__init__`
        # assigns it, that lookup would recurse back into `__getattr__`.
        if k == 'd':
            raise AttributeError(k)
        try:
            return self[k]
        except KeyError:
            # hasattr()/getattr(default) and copy/pickle probes expect
            # AttributeError, not KeyError, for a missing attribute.
            raise AttributeError(k) from None

    def __setitem__(self, k, v):
        "Store `v` under `k` as a string (in memory only; call `save` to persist)."
        self.d[k] = str(v)

    def __contains__(self, k):
        return k in self.d

    def _read_config(self):
        "Parse the config file and return whatever YAML document it contains."
        with self.config_file.open('r') as f:
            return yaml.safe_load(f)

    def _build_config(self, cfg=None):
        "Return the default config, overlaid with `cfg` when given, with `version` forced to 2."
        config = {'data_path':    str(self.config_path/'datasets'),
                  'archive_path': str(self.config_path/'archive'),
                  'storage_path': '/tmp',
                  'model_path':   str(self.config_path/'models'),
                  'version':      2}
        if cfg is not None:
            # Entries from `cfg` win over the defaults; `cfg` itself is not modified.
            config = {**config, **cfg, 'version': 2}
        return config

    def load_config(self):
        "load and return config if version equals 2 in existing, else create new config."
        config = self._read_config()
        # An empty file parses to None and arbitrary YAML may not be a mapping.
        is_mapping = isinstance(config, dict)
        if is_mapping and config.get('version') == 2:
            return config
        # The file handle is already closed here, so rewriting the file is safe.
        if is_mapping and 'version' in config:
            self.create_config(config)
        else:
            self.create_config()
        return self._read_config()

    def create_config(self, cfg=None):
        "create new config with default paths and set `version` to 2."
        self.save_file(self._build_config(cfg))

    def save(self):
        "Write the in-memory config `d` back to the config file."
        self.save_file(self.d)

    def save_file(self, config):
        "save config file at default config location `~/.gluoncv/config.yml`."
        with self.config_file.open('w') as f:
            yaml.dump(config, f, default_flow_style=False)
116
117
118# pylint: disable=bad-whitespace
class URLs():
    """Global constants for dataset and model URLs.

    `LOCAL_PATH` is the working directory at class-definition time; `path`
    looks there first before falling back to the directories held in `Config`.
    """
    LOCAL_PATH = Path.cwd()
    MDL = 'http://files.fast.ai/models/'
    S3  = 'https://s3.amazonaws.com/fast-ai-'
    URL = f'{S3}sample/'

    S3_IMAGE    = f'{S3}imageclas/'
    S3_IMAGELOC = f'{S3}imagelocal/'
    S3_COCO     = f'{S3}coco/'

    # main datasets
    ADULT_SAMPLE        = f'{URL}adult_sample.tgz'
    BIWI_SAMPLE         = f'{URL}biwi_sample.tgz'
    CIFAR               = f'{URL}cifar10.tgz'
    COCO_SAMPLE         = f'{S3_COCO}coco_sample.tgz'
    COCO_TINY           = f'{S3_COCO}coco_tiny.tgz'
    HUMAN_NUMBERS       = f'{URL}human_numbers.tgz'
    # IMDB                = f'{S3_NLP}imdb.tgz'
    IMDB_SAMPLE         = f'{URL}imdb_sample.tgz'
    ML_SAMPLE           = f'{URL}movie_lens_sample.tgz'
    ML_100k             = 'http://files.grouplens.org/datasets/movielens/ml-100k.zip'
    MNIST_SAMPLE        = f'{URL}mnist_sample.tgz'
    MNIST_TINY          = f'{URL}mnist_tiny.tgz'
    MNIST_VAR_SIZE_TINY = f'{S3_IMAGE}mnist_var_size_tiny.tgz'
    PLANET_SAMPLE       = f'{URL}planet_sample.tgz'
    PLANET_TINY         = f'{URL}planet_tiny.tgz'
    IMAGENETTE          = f'{S3_IMAGE}imagenette2.tgz'
    IMAGENETTE_160      = f'{S3_IMAGE}imagenette2-160.tgz'
    IMAGENETTE_320      = f'{S3_IMAGE}imagenette2-320.tgz'
    IMAGEWOOF           = f'{S3_IMAGE}imagewoof2.tgz'
    IMAGEWOOF_160       = f'{S3_IMAGE}imagewoof2-160.tgz'
    IMAGEWOOF_320       = f'{S3_IMAGE}imagewoof2-320.tgz'
    IMAGEWANG           = f'{S3_IMAGE}imagewang.tgz'
    IMAGEWANG_160       = f'{S3_IMAGE}imagewang-160.tgz'
    IMAGEWANG_320       = f'{S3_IMAGE}imagewang-320.tgz'

    # kaggle competitions download dogs-vs-cats -p {DOGS.absolute()}
    DOGS = f'{URL}dogscats.tgz'

    # image classification datasets
    CALTECH_101  = f'{S3_IMAGE}caltech_101.tgz'
    CARS         = f'{S3_IMAGE}stanford-cars.tgz'
    CIFAR_100    = f'{S3_IMAGE}cifar100.tgz'
    CUB_200_2011 = f'{S3_IMAGE}CUB_200_2011.tgz'
    FLOWERS      = f'{S3_IMAGE}oxford-102-flowers.tgz'
    FOOD         = f'{S3_IMAGE}food-101.tgz'
    MNIST        = f'{S3_IMAGE}mnist_png.tgz'
    PETS         = f'{S3_IMAGE}oxford-iiit-pet.tgz'

    # Image localization datasets
    BIWI_HEAD_POSE     = f"{S3_IMAGELOC}biwi_head_pose.tgz"
    CAMVID             = f'{S3_IMAGELOC}camvid.tgz'
    CAMVID_TINY        = f'{URL}camvid_tiny.tgz'
    LSUN_BEDROOMS      = f'{S3_IMAGE}bedroom.tgz'
    PASCAL_2007        = f'{S3_IMAGELOC}pascal_2007.tgz'
    PASCAL_2012        = f'{S3_IMAGELOC}pascal_2012.tgz'

    # Medical Imaging datasets
    #SKIN_LESION        = f'{S3_IMAGELOC}skin_lesion.tgz'
    SIIM_SMALL         = f'{S3_IMAGELOC}siim_small.tgz'

    @staticmethod
    def path(url='.', c_key='archive'):
        """Return local path where to download based on `c_key`.

        The file name is the last `/`-separated component of `url`. If a file
        of that name already exists under `LOCAL_PATH/models` (model keys) or
        `LOCAL_PATH/datasets` (any other key), that path is returned;
        otherwise the result is `Config()[c_key]/fname`.

        Both `'model'` and `'models'` are accepted as the model key. `Config`
        only has a `model_path` entry, so the plural spelling is mapped onto
        it instead of failing with a `KeyError`.
        """
        fname = url.split('/')[-1]
        is_model = c_key in ('model', 'models')
        local_path = URLs.LOCAL_PATH / ('models' if is_model else 'datasets') / fname
        if local_path.exists():
            return local_path
        return Config()['model' if is_model else c_key] / fname
189
# Pattern for absolute http(s)/ftp(s) URLs, compiled case-insensitively.
_URL_REGEX = re.compile(
    r'^(?:http|ftp)s?://' # scheme: http, https, ftp or ftps
    r'(?:(?:[A-Z0-9](?:[A-Z0-9-]{0,61}[A-Z0-9])?\.)+(?:[A-Z]{2,6}\.?|[A-Z0-9-]{2,}\.?)|' #domain...
    r'localhost|' #localhost...
    r'\d{1,3}\.\d{1,3}\.\d{1,3}\.\d{1,3})' # ...or ip
    r'(?::\d+)?' # optional port
    r'(?:/?|[/?]\S+)$', re.IGNORECASE)

def is_url(url_like):
    """Return True when `url_like` is a string that is entirely a URL.

    Non-string inputs return False. A URL here means an http, https, ftp or
    ftps scheme followed by a domain name, `localhost` or a dotted IPv4
    address, with an optional port and an optional whitespace-free
    path/query part.
    """
    if not isinstance(url_like, str):
        return False
    # fullmatch instead of match: `$` on its own also succeeds just before a
    # trailing newline, which would accept strings such as 'http://a.com\n'.
    return _URL_REGEX.fullmatch(url_like) is not None
202