1"""Helpers for other modules.""" 2 3from __future__ import unicode_literals 4 5import yaml 6from dvc.utils.compat import str, builtin_str, open, cast_bytes_py2 7 8import os 9import sys 10import stat 11import math 12import json 13import shutil 14import hashlib 15import nanotime 16import time 17 18LOCAL_CHUNK_SIZE = 1024 * 1024 19LARGE_FILE_SIZE = 1024 * 1024 * 1024 20LARGE_DIR_SIZE = 100 21 22 23def dos2unix(data): 24 return data.replace(b"\r\n", b"\n") 25 26 27def file_md5(fname): 28 """ get the (md5 hexdigest, md5 digest) of a file """ 29 import dvc.logger as logger 30 from dvc.progress import progress 31 from dvc.istextfile import istextfile 32 33 if os.path.exists(fname): 34 hash_md5 = hashlib.md5() 35 binary = not istextfile(fname) 36 size = os.path.getsize(fname) 37 bar = False 38 if size >= LARGE_FILE_SIZE: 39 bar = True 40 msg = "Computing md5 for a large file {}. This is only done once." 41 logger.info(msg.format(os.path.relpath(fname))) 42 name = os.path.relpath(fname) 43 total = 0 44 45 with open(fname, "rb") as fobj: 46 while True: 47 data = fobj.read(LOCAL_CHUNK_SIZE) 48 if not data: 49 break 50 51 if bar: 52 total += len(data) 53 progress.update_target(name, total, size) 54 55 if binary: 56 chunk = data 57 else: 58 chunk = dos2unix(data) 59 60 hash_md5.update(chunk) 61 62 if bar: 63 progress.finish_target(name) 64 65 return (hash_md5.hexdigest(), hash_md5.digest()) 66 else: 67 return (None, None) 68 69 70def bytes_md5(byts): 71 hasher = hashlib.md5() 72 hasher.update(byts) 73 return hasher.hexdigest() 74 75 76def dict_filter(d, exclude=[]): 77 """ 78 Exclude specified keys from a nested dict 79 """ 80 81 if isinstance(d, list): 82 ret = [] 83 for e in d: 84 ret.append(dict_filter(e, exclude)) 85 return ret 86 elif isinstance(d, dict): 87 ret = {} 88 for k, v in d.items(): 89 if isinstance(k, builtin_str): 90 k = str(k) 91 92 assert isinstance(k, str) 93 if k in exclude: 94 continue 95 ret[k] = dict_filter(v, exclude) 96 return ret 97 98 return d 99 
def dict_md5(d, exclude=None):
    """Return the md5 hexdigest of `d` with the `exclude` keys filtered out.

    The filtered structure is serialized as JSON with sorted keys and
    encoded as UTF-8 before hashing.

    Args:
        d: the (possibly nested) dict to hash.
        exclude: container of keys to drop; `None` means drop nothing.
    """
    # Always hand dict_filter a real container; avoids a shared mutable
    # default argument here.
    filtered = dict_filter(d, [] if exclude is None else exclude)
    byts = json.dumps(filtered, sort_keys=True).encode("utf-8")
    return bytes_md5(byts)


def copyfile(src, dest, no_progress_bar=False, name=None):
    """Copy file with progress bar.

    Args:
        src: path of the file to copy.
        dest: destination path; if it is an existing directory the file is
            written inside it under the basename of `src`.
        no_progress_bar: when true, progress is not reported.
        name: progress target name; defaults to the basename of `dest`.
    """
    from dvc.progress import progress

    copied = 0
    name = name if name else os.path.basename(dest)
    total = os.stat(src).st_size

    if os.path.isdir(dest):
        dest = os.path.join(dest, os.path.basename(src))

    # Context managers guarantee both handles are closed even if a read
    # or write fails part-way through.
    with open(src, "rb") as fsrc, open(dest, "wb+") as fdest:
        while True:
            buf = fsrc.read(LOCAL_CHUNK_SIZE)
            if not buf:
                break
            fdest.write(buf)
            copied += len(buf)
            if not no_progress_bar:
                progress.update_target(name, copied, total)

    if not no_progress_bar:
        progress.finish_target(name)


def move(src, dst):
    """Move `src` to `dst`, creating the parent directory of `dst` if needed.

    If `src` is a symlink, the file it points to is copied to `dst` and
    only the link itself is removed.
    """
    dst = os.path.abspath(dst)
    dname = os.path.dirname(dst)
    if not os.path.exists(dname):
        os.makedirs(dname)

    if os.path.islink(src):
        target = os.readlink(src)
        # A relative link target is relative to the directory holding the
        # link, not to the current working directory.
        if not os.path.isabs(target):
            target = os.path.join(os.path.dirname(src), target)
        shutil.copy(target, dst)
        os.unlink(src)
        return

    shutil.move(src, dst)


def remove(path):
    """Remove the file or directory tree at `path`; no-op if it is missing."""
    import dvc.logger as logger

    if not os.path.exists(path):
        return

    logger.debug("Removing '{}'".format(os.path.relpath(path)))

    def _chmod(func, p, excinfo):
        # Add write permission to `p`, then run the removal function on it.
        perm = os.stat(p).st_mode
        perm |= stat.S_IWRITE
        os.chmod(p, perm)
        func(p)

    if os.path.isfile(path):
        _chmod(os.unlink, path, None)
    else:
        # For trees, _chmod is only invoked on entries rmtree fails on.
        shutil.rmtree(path, onerror=_chmod)


def to_chunks(l, jobs):
    """Split the sequence `l` into consecutive slices of ceil(len(l) / jobs).

    A single-element sequence is returned as one chunk; an empty sequence
    yields an empty list.
    """
    # float() forces true division on Python 2 as well, where int / int
    # would floor and make the ceil() a no-op.
    n = int(math.ceil(float(len(l)) / jobs))

    if len(l) == 1:
        return [l]

    if n == 0:
        n = 1

    return [l[x : x + n] for x in range(0, len(l), n)]


# NOTE: Check if we are in a bundle
# https://pythonhosted.org/PyInstaller/runtime-information.html
def is_binary():
    """Return the value of `sys.frozen`, or False when it is not set."""
    return getattr(sys, "frozen", False)


# NOTE: Fix
# env variables modified by PyInstaller
# http://pyinstaller.readthedocs.io/en/stable/runtime-information.html
def fix_env(env=None):
    """Return a copy of `env` with LD_LIBRARY_PATH restored for bundles.

    `os.environ` is copied when `env` is None. Outside a frozen binary the
    copy is returned untouched. Inside one, LD_LIBRARY_PATH is set from
    LD_LIBRARY_PATH_ORIG when that is present, and removed otherwise.
    """
    source = os.environ if env is None else env
    fixed = source.copy()

    if not is_binary():
        return fixed

    key = "LD_LIBRARY_PATH"
    original = fixed.get(key + "_ORIG", None)
    if original is None:
        fixed.pop(key, None)
    else:
        # NOTE: py2 doesn't like unicode strings in environ
        fixed[cast_bytes_py2(key)] = cast_bytes_py2(original)

    return fixed


def convert_to_unicode(data):
    """Recursively convert `builtin_str` values inside `data` with `str`.

    Dicts, lists and tuples are rebuilt with converted contents (lists and
    tuples keep their own type); anything else is returned unchanged.
    """
    if isinstance(data, builtin_str):
        return str(data)

    if isinstance(data, dict):
        # Each (key, value) pair is a tuple, so it goes through the
        # tuple branch below.
        return dict(convert_to_unicode(pair) for pair in data.items())

    if isinstance(data, (list, tuple)):
        return type(data)(convert_to_unicode(item) for item in data)

    return data


def tmp_fname(fname):
    """ Temporary name for a partial download """
    from uuid import uuid4

    suffix = "." + str(uuid4()) + ".tmp"
    return fname + suffix


def current_timestamp():
    """Return `nanotime.timestamp(time.time())` converted to an int."""
    now = time.time()
    return int(nanotime.timestamp(now))


def load_stage_file(path):
    """Load the YAML file at `path`; a falsy result yields an empty dict."""
    with open(path, "r") as fobj:
        loaded = yaml.safe_load(fobj)
    return loaded if loaded else {}