1#!/bin/sh 2"""": # -*-python-*- 3# https://sourceware.org/bugzilla/show_bug.cgi?id=26034 4export "BUP_ARGV_0"="$0" 5arg_i=1 6for arg in "$@"; do 7 export "BUP_ARGV_${arg_i}"="$arg" 8 shift 9 arg_i=$((arg_i + 1)) 10done 11# Here to end of preamble replaced during install 12bup_python="$(dirname "$0")/../../config/bin/python" || exit $? 13exec "$bup_python" "$0" 14""" 15# end of bup preamble 16 17from __future__ import absolute_import, print_function 18from binascii import hexlify 19import glob, math, os, resource, struct, sys, tempfile 20 21sys.path[:0] = [os.path.dirname(os.path.realpath(__file__)) + '/..'] 22 23from bup import compat, options, git, midx, _helpers, xstat 24from bup.compat import argv_bytes, hexstr, range 25from bup.helpers import (Sha1, add_error, atomically_replaced_file, debug1, fdatasync, 26 handle_ctrl_c, log, mmap_readwrite, qprogress, 27 saved_errors, unlink) 28from bup.io import byte_stream, path_msg 29 30 31PAGE_SIZE=4096 32SHA_PER_PAGE=PAGE_SIZE/20. 33 34optspec = """ 35bup midx [options...] <idxnames...> 36-- 37o,output= output midx filename (default: auto-generated) 38a,auto automatically use all existing .midx/.idx files as input 39f,force merge produce exactly one .midx containing all objects 40p,print print names of generated midx files 41check validate contents of the given midx files (with -a, all midx files) 42max-files= maximum number of idx files to open at once [-1] 43d,dir= directory containing idx/midx files 44""" 45 46merge_into = _helpers.merge_into 47 48 49def _group(l, count): 50 for i in range(0, len(l), count): 51 yield l[i:i+count] 52 53 54def max_files(): 55 mf = min(resource.getrlimit(resource.RLIMIT_NOFILE)) 56 if mf > 32: 57 mf -= 20 # just a safety margin 58 else: 59 mf -= 6 # minimum safety margin 60 return mf 61 62 63def check_midx(name): 64 nicename = git.repo_rel(name) 65 log('Checking %s.\n' % path_msg(nicename)) 66 try: 67 ix = git.open_idx(name) 68 except git.GitError as e: 69 add_error('%s: %s' % (pathmsg(name), e)) 70 return 71 for count,subname in enumerate(ix.idxnames): 72 sub = git.open_idx(os.path.join(os.path.dirname(name), subname)) 73 for ecount,e in enumerate(sub): 74 if not (ecount % 1234): 75 qprogress(' %d/%d: %s %d/%d\r' 76 % (count, len(ix.idxnames), 77 git.shorten_hash(subname).decode('ascii'), 78 ecount, len(sub))) 79 if not sub.exists(e): 80 add_error("%s: %s: %s missing from idx" 81 % (path_msg(nicename), 82 git.shorten_hash(subname).decode('ascii'), 83 hexstr(e))) 84 if not ix.exists(e): 85 add_error("%s: %s: %s missing from midx" 86 % (path_msg(nicename), 87 git.shorten_hash(subname).decode('ascii'), 88 hexstr(e))) 89 prev = None 90 for ecount,e in enumerate(ix): 91 if not (ecount % 1234): 92 qprogress(' Ordering: %d/%d\r' % (ecount, len(ix))) 93 if e and prev and not e >= prev: 94 add_error('%s: ordering error: %s < %s' 95 % (nicename, hexstr(e), hexstr(prev))) 96 prev = e 97 98 99_first = None 100def _do_midx(outdir, outfilename, infilenames, prefixstr): 101 global _first 102 if not outfilename: 103 assert(outdir) 104 sum = hexlify(Sha1(b'\0'.join(infilenames)).digest()) 105 outfilename = b'%s/midx-%s.midx' % (outdir, sum) 106 107 inp = [] 108 total = 0 109 allfilenames = [] 110 midxs = [] 111 try: 112 for name in infilenames: 113 ix = git.open_idx(name) 114 midxs.append(ix) 115 inp.append(( 116 ix.map, 117 len(ix), 118 ix.sha_ofs, 119 isinstance(ix, midx.PackMidx) and ix.which_ofs or 0, 120 len(allfilenames), 121 )) 122 for n in ix.idxnames: 123 allfilenames.append(os.path.basename(n)) 124 total += len(ix) 125 inp.sort(reverse=True, key=lambda x: x[0][x[2] : x[2] + 20]) 126 127 if not _first: _first = outdir 128 dirprefix = (_first != outdir) and git.repo_rel(outdir) + b': ' or b'' 129 debug1('midx: %s%screating from %d files (%d objects).\n' 130 % (dirprefix, prefixstr, len(infilenames), total)) 131 if (opt.auto and (total < 1024 and len(infilenames) < 3)) \ 132 or ((opt.auto or opt.force) and len(infilenames) < 2) \ 133 or (opt.force and not total): 134 debug1('midx: nothing to do.\n') 135 return 136 137 pages = int(total/SHA_PER_PAGE) or 1 138 bits = int(math.ceil(math.log(pages, 2))) 139 entries = 2**bits 140 debug1('midx: table size: %d (%d bits)\n' % (entries*4, bits)) 141 142 unlink(outfilename) 143 with atomically_replaced_file(outfilename, 'wb') as f: 144 f.write(b'MIDX') 145 f.write(struct.pack('!II', midx.MIDX_VERSION, bits)) 146 assert(f.tell() == 12) 147 148 f.truncate(12 + 4*entries + 20*total + 4*total) 149 f.flush() 150 fdatasync(f.fileno()) 151 152 fmap = mmap_readwrite(f, close=False) 153 count = merge_into(fmap, bits, total, inp) 154 del fmap # Assume this calls msync() now. 155 f.seek(0, os.SEEK_END) 156 f.write(b'\0'.join(allfilenames)) 157 finally: 158 for ix in midxs: 159 if isinstance(ix, midx.PackMidx): 160 ix.close() 161 midxs = None 162 inp = None 163 164 165 # This is just for testing (if you enable this, don't clear inp above) 166 if 0: 167 p = midx.PackMidx(outfilename) 168 assert(len(p.idxnames) == len(infilenames)) 169 log(repr(p.idxnames) + '\n') 170 assert(len(p) == total) 171 for pe, e in p, git.idxmerge(inp, final_progress=False): 172 pin = next(pi) 173 assert(i == pin) 174 assert(p.exists(i)) 175 176 return total, outfilename 177 178 179def do_midx(outdir, outfilename, infilenames, prefixstr, prout): 180 rv = _do_midx(outdir, outfilename, infilenames, prefixstr) 181 if rv and opt['print']: 182 prout.write(rv[1] + b'\n') 183 184 185def do_midx_dir(path, outfilename, prout): 186 already = {} 187 sizes = {} 188 if opt.force and not opt.auto: 189 midxs = [] # don't use existing midx files 190 else: 191 midxs = glob.glob(b'%s/*.midx' % path) 192 contents = {} 193 for mname in midxs: 194 m = git.open_idx(mname) 195 contents[mname] = [(b'%s/%s' % (path,i)) for i in m.idxnames] 196 sizes[mname] = len(m) 197 198 # sort the biggest+newest midxes first, so that we can eliminate 199 # smaller (or older) redundant ones that come later in the list 200 midxs.sort(key=lambda ix: (-sizes[ix], -xstat.stat(ix).st_mtime)) 201 202 for mname in midxs: 203 any = 0 204 for iname in contents[mname]: 205 if not already.get(iname): 206 already[iname] = 1 207 any = 1 208 if not any: 209 debug1('%r is redundant\n' % mname) 210 unlink(mname) 211 already[mname] = 1 212 213 midxs = [k for k in midxs if not already.get(k)] 214 idxs = [k for k in glob.glob(b'%s/*.idx' % path) if not already.get(k)] 215 216 for iname in idxs: 217 i = git.open_idx(iname) 218 sizes[iname] = len(i) 219 220 all = [(sizes[n],n) for n in (midxs + idxs)] 221 222 # FIXME: what are the optimal values? Does this make sense? 223 DESIRED_HWM = opt.force and 1 or 5 224 DESIRED_LWM = opt.force and 1 or 2 225 existed = dict((name,1) for sz,name in all) 226 debug1('midx: %d indexes; want no more than %d.\n' 227 % (len(all), DESIRED_HWM)) 228 if len(all) <= DESIRED_HWM: 229 debug1('midx: nothing to do.\n') 230 while len(all) > DESIRED_HWM: 231 all.sort() 232 part1 = [name for sz,name in all[:len(all)-DESIRED_LWM+1]] 233 part2 = all[len(all)-DESIRED_LWM+1:] 234 all = list(do_midx_group(path, outfilename, part1)) + part2 235 if len(all) > DESIRED_HWM: 236 debug1('\nStill too many indexes (%d > %d). Merging again.\n' 237 % (len(all), DESIRED_HWM)) 238 239 if opt['print']: 240 for sz,name in all: 241 if not existed.get(name): 242 prout.write(name + b'\n') 243 244 245def do_midx_group(outdir, outfilename, infiles): 246 groups = list(_group(infiles, opt.max_files)) 247 gprefix = '' 248 for n,sublist in enumerate(groups): 249 if len(groups) != 1: 250 gprefix = 'Group %d: ' % (n+1) 251 rv = _do_midx(outdir, outfilename, sublist, gprefix) 252 if rv: 253 yield rv 254 255 256handle_ctrl_c() 257 258o = options.Options(optspec) 259opt, flags, extra = o.parse(compat.argv[1:]) 260opt.dir = argv_bytes(opt.dir) if opt.dir else None 261opt.output = argv_bytes(opt.output) if opt.output else None 262 263if extra and (opt.auto or opt.force): 264 o.fatal("you can't use -f/-a and also provide filenames") 265if opt.check and (not extra and not opt.auto): 266 o.fatal("if using --check, you must provide filenames or -a") 267 268git.check_repo_or_die() 269 270if opt.max_files < 0: 271 opt.max_files = max_files() 272assert(opt.max_files >= 5) 273 274extra = [argv_bytes(x) for x in extra] 275 276if opt.check: 277 # check existing midx files 278 if extra: 279 midxes = extra 280 else: 281 midxes = [] 282 paths = opt.dir and [opt.dir] or git.all_packdirs() 283 for path in paths: 284 debug1('midx: scanning %s\n' % path) 285 midxes += glob.glob(os.path.join(path, b'*.midx')) 286 for name in midxes: 287 check_midx(name) 288 if not saved_errors: 289 log('All tests passed.\n') 290else: 291 if extra: 292 sys.stdout.flush() 293 do_midx(git.repo(b'objects/pack'), opt.output, extra, b'', 294 byte_stream(sys.stdout)) 295 elif opt.auto or opt.force: 296 sys.stdout.flush() 297 paths = opt.dir and [opt.dir] or git.all_packdirs() 298 for path in paths: 299 debug1('midx: scanning %s\n' % path_msg(path)) 300 do_midx_dir(path, opt.output, byte_stream(sys.stdout)) 301 else: 302 o.fatal("you must use -f or -a or provide input filenames") 303 304if saved_errors: 305 log('WARNING: %d errors encountered.\n' % len(saved_errors)) 306 sys.exit(1) 307