"""Auto data preparation."""
import os
import re
from pathlib import Path
import yaml
from ...utils.download import download
from ...utils.filesystem import unzip, untar, PathTree


def url_data(url, path=None, overwrite=False, overwrite_folder=False, sha1_hash=None, root=None, disp_depth=1):
    """Download a given URL and extract the downloaded archive.

    Parameters
    ----------
    url : str
        URL to download
    path : str, optional
        Destination path to store downloaded file. By default the file keeps the name it has
        in the url and is stored under the `archive_path` configured in the config file
        (``~/.gluoncv/config.yml`` unless the ``MXNET_HOME`` environment variable is set).
        You can change the default behavior by editing that config file.
    overwrite : bool, optional
        Whether to overwrite destination file if already exists.
    overwrite_folder : bool, optional
        Whether to extract file to destination folder if already exists. You may use this option if you suspect
        the destination is corrupted or some files are missing. Forwarded to `unzip`/`untar` as `strict`.
    sha1_hash : str, optional
        Expected sha1 hash in hexadecimal digits. Will ignore existing file when hash is specified
        but doesn't match.
    root : str, optional
        Destination root dir to store extracted files. By default a folder named after the archive
        (file name without its last suffix) under the configured `data_path` is used.
    disp_depth : int, optional
        If set > 0, will print out the tree structure of extracted dataset folder with maximum `disp_depth`.

    Returns
    -------
    pathlib.Path
        The folder returned by the extraction helper (`unzip` or `untar`).

    Raises
    ------
    ValueError
        If the downloaded file name ends with neither ``.zip`` nor ``gz``.
    """
    fname = Path(path or URLs.path(url, c_key='archive'))
    fname.parent.mkdir(parents=True, exist_ok=True)
    # NOTE(review): `download` is assumed to return the local file name as a str,
    # since `.endswith` is called on the result below - confirm against utils.download.
    fname = download(url, path=str(fname.resolve()), overwrite=overwrite, sha1_hash=sha1_hash)

    # Default extraction folder: <data_path>/<archive name without its last suffix>.
    extract_root = URLs.path(url, c_key='data')
    extract_root = extract_root.parent.joinpath(extract_root.stem)
    extract_root.mkdir(parents=True, exist_ok=True)
    dest_root = root if root else extract_root

    if fname.endswith('.zip'):
        folder = unzip(fname, root=dest_root, strict=overwrite_folder)
    elif fname.endswith('gz'):
        # Matches both `.tgz` and `.tar.gz` archives.
        folder = untar(fname, root=dest_root, strict=overwrite_folder)
    else:
        raise ValueError('Unknown url data with file: {}'.format(fname))

    if disp_depth > 0:
        path_tree = PathTree(folder, disp_depth)
        print(path_tree)

    return Path(folder)


class Config:
    """Setup config at `~/.gluoncv` unless it exists already.

    The config directory is taken from the ``MXNET_HOME`` environment variable when it is
    set, otherwise ``~/.gluoncv``. Settings live in ``config.yml`` inside that directory and
    are exposed through item access (``cfg['data']``) and attribute access (``cfg.data``);
    both return `pathlib.Path` objects.
    """
    config_path = Path(os.getenv('MXNET_HOME', '~/.gluoncv')).expanduser()
    config_file = config_path/'config.yml'

    def __init__(self):
        self.config_path.mkdir(parents=True, exist_ok=True)
        if not self.config_file.exists():
            self.create_config()
        self.d = self.load_config()

    def __getitem__(self, k):
        "Return the entry for `k` (or `k + '_path'` when `k` itself is absent) as a `Path`."
        k = k.lower()
        if k not in self.d:
            k = k + '_path'
        return Path(self.d[k])

    def __getattr__(self, k):
        "Attribute-style access to config entries, e.g. `cfg.data` for `cfg['data']`."
        if k == 'd':
            # `d` is not set yet (e.g. during `__init__`); avoid recursing through `self[k]`.
            raise AttributeError(k)
        # NOTE(review): an unknown name surfaces as KeyError from `self[k]`, not AttributeError.
        return self[k]

    def __setitem__(self, k, v):
        "Store `v` under key `k` as a string (in memory only; call `save` to persist)."
        self.d[k] = str(v)

    def __contains__(self, k):
        return k in self.d

    def load_config(self):
        "load and return config if version equals 2 in existing, else create new config."
        with open(self.config_file, 'r') as f:
            config = yaml.safe_load(f)
        if not isinstance(config, dict):
            # An empty file makes `yaml.safe_load` return None; treat empty or non-mapping
            # content like a missing config so that the defaults are regenerated.
            config = {}
        if config.get('version') == 2:
            return config
        if 'version' in config:
            # Older version: upgrade while keeping the user's existing entries.
            self.create_config(config)
        else:
            self.create_config()
        return self.load_config()

    def create_config(self, cfg=None):
        """create new config with default paths and set `version` to 2.

        When `cfg` is given, its `version` entry is set to 2 (in place) and its
        entries take precedence over the defaults.
        """
        config = {
            'data_path': str(self.config_path/'datasets'),
            'archive_path': str(self.config_path/'archive'),
            'storage_path': '/tmp',
            'model_path': str(self.config_path/'models'),
            'version': 2,
        }
        if cfg is not None:
            cfg['version'] = 2
            # Entries from `cfg` override the defaults above.
            config = {**config, **cfg}
        self.save_file(config)

    def save(self):
        "Persist the in-memory config to the config file."
        self.save_file(self.d)

    def save_file(self, config):
        "save config file at default config location `~/.gluoncv/config.yml`."
        with self.config_file.open('w') as f:
            yaml.dump(config, f, default_flow_style=False)


# pylint: disable=bad-whitespace
class URLs():
    "Global constants for dataset and model URLs."
    # Evaluated once, when the class body is executed at import time.
    LOCAL_PATH = Path.cwd()
    MDL = 'http://files.fast.ai/models/'
    S3 = 'https://s3.amazonaws.com/fast-ai-'
    URL = f'{S3}sample/'

    S3_IMAGE = f'{S3}imageclas/'
    S3_IMAGELOC = f'{S3}imagelocal/'
    S3_COCO = f'{S3}coco/'

    # main datasets
    ADULT_SAMPLE = f'{URL}adult_sample.tgz'
    BIWI_SAMPLE = f'{URL}biwi_sample.tgz'
    CIFAR = f'{URL}cifar10.tgz'
    COCO_SAMPLE = f'{S3_COCO}coco_sample.tgz'
    COCO_TINY = f'{S3_COCO}coco_tiny.tgz'
    HUMAN_NUMBERS = f'{URL}human_numbers.tgz'
    # IMDB = f'{S3_NLP}imdb.tgz'
    IMDB_SAMPLE = f'{URL}imdb_sample.tgz'
    ML_SAMPLE = f'{URL}movie_lens_sample.tgz'
    ML_100k = 'http://files.grouplens.org/datasets/movielens/ml-100k.zip'
    MNIST_SAMPLE = f'{URL}mnist_sample.tgz'
    MNIST_TINY = f'{URL}mnist_tiny.tgz'
    MNIST_VAR_SIZE_TINY = f'{S3_IMAGE}mnist_var_size_tiny.tgz'
    PLANET_SAMPLE = f'{URL}planet_sample.tgz'
    PLANET_TINY = f'{URL}planet_tiny.tgz'
    IMAGENETTE = f'{S3_IMAGE}imagenette2.tgz'
    IMAGENETTE_160 = f'{S3_IMAGE}imagenette2-160.tgz'
    IMAGENETTE_320 = f'{S3_IMAGE}imagenette2-320.tgz'
    IMAGEWOOF = f'{S3_IMAGE}imagewoof2.tgz'
    IMAGEWOOF_160 = f'{S3_IMAGE}imagewoof2-160.tgz'
    IMAGEWOOF_320 = f'{S3_IMAGE}imagewoof2-320.tgz'
    IMAGEWANG = f'{S3_IMAGE}imagewang.tgz'
    IMAGEWANG_160 = f'{S3_IMAGE}imagewang-160.tgz'
    IMAGEWANG_320 = f'{S3_IMAGE}imagewang-320.tgz'

    # kaggle competitions download dogs-vs-cats -p {DOGS.absolute()}
    DOGS = f'{URL}dogscats.tgz'

    # image classification datasets
    CALTECH_101 = f'{S3_IMAGE}caltech_101.tgz'
    CARS = f'{S3_IMAGE}stanford-cars.tgz'
    CIFAR_100 = f'{S3_IMAGE}cifar100.tgz'
    CUB_200_2011 = f'{S3_IMAGE}CUB_200_2011.tgz'
    FLOWERS = f'{S3_IMAGE}oxford-102-flowers.tgz'
    FOOD = f'{S3_IMAGE}food-101.tgz'
    MNIST = f'{S3_IMAGE}mnist_png.tgz'
    PETS = f'{S3_IMAGE}oxford-iiit-pet.tgz'

    # Image localization datasets
    BIWI_HEAD_POSE = f"{S3_IMAGELOC}biwi_head_pose.tgz"
    CAMVID = f'{S3_IMAGELOC}camvid.tgz'
    CAMVID_TINY = f'{URL}camvid_tiny.tgz'
    LSUN_BEDROOMS = f'{S3_IMAGE}bedroom.tgz'
    PASCAL_2007 = f'{S3_IMAGELOC}pascal_2007.tgz'
    PASCAL_2012 = f'{S3_IMAGELOC}pascal_2012.tgz'

    # Medical Imaging datasets
    #SKIN_LESION = f'{S3_IMAGELOC}skin_lesion.tgz'
    SIIM_SMALL = f'{S3_IMAGELOC}siim_small.tgz'

    @staticmethod
    def path(url='.', c_key='archive'):
        """Return local path where to download based on `c_key`

        The file name is the last ``/``-separated component of `url`. If a file with that
        name already exists under ``LOCAL_PATH/models`` (when `c_key` is ``'models'``) or
        ``LOCAL_PATH/datasets`` (any other `c_key`), that path is returned; otherwise the
        path is built from the `c_key` entry of `Config`.
        """
        fname = url.split('/')[-1]
        # NOTE(review): the comparison is against 'models' while the config key is
        # 'model_path' (reachable as c_key='model') - confirm which spelling callers use.
        local_path = URLs.LOCAL_PATH / ('models' if c_key == 'models' else 'datasets')/fname
        if local_path.exists():
            return local_path
        return Config()[c_key]/fname


_URL_REGEX = re.compile(
    r'^(?:http|ftp)s?://' # http://, https://, ftp:// or ftps://
    r'(?:(?:[A-Z0-9](?:[A-Z0-9-]{0,61}[A-Z0-9])?\.)+(?:[A-Z]{2,6}\.?|[A-Z0-9-]{2,}\.?)|' #domain...
    r'localhost|' #localhost...
    r'\d{1,3}\.\d{1,3}\.\d{1,3}\.\d{1,3})' # ...or ip
    r'(?::\d+)?' # optional port
    r'(?:/?|[/?]\S+)$', re.IGNORECASE)


def is_url(url_like):
    """Return True if `url_like` is a str that matches `_URL_REGEX`, else False.

    Non-string inputs are never treated as URLs.
    """
    if not isinstance(url_like, str):
        return False
    return _URL_REGEX.match(url_like) is not None