#!/bin/sh
"""": # -*-python-*-
# This file is read by both sh and Python.  For sh, the line above is
# a no-op ':' command and the lines below re-execute this same file
# with the configured python.  For Python, everything from the line
# above through the closing quotes is one string literal.
#
# The original arguments are handed to Python through BUP_ARGV_<n>
# environment variables rather than on the python command line; see:
# https://sourceware.org/bugzilla/show_bug.cgi?id=26034
export "BUP_ARGV_0"="$0"
arg_i=1
for arg in "$@"; do
    export "BUP_ARGV_${arg_i}"="$arg"
    shift
    arg_i=$((arg_i + 1))
done
# Here to end of preamble replaced during install
bup_python="$(dirname "$0")/../../config/bin/python" || exit $?
exec "$bup_python" "$0"
"""
# end of bup preamble
16
17from __future__ import absolute_import, print_function
18from binascii import hexlify
19from errno import EACCES
20from io import BytesIO
21import math, os, stat, sys, time
22
23sys.path[:0] = [os.path.dirname(os.path.realpath(__file__)) + '/..']
24
25from bup import compat, hashsplit, git, options, index, client, metadata
26from bup import hlinkdb
27from bup.compat import argv_bytes, environ
28from bup.hashsplit import GIT_MODE_TREE, GIT_MODE_FILE, GIT_MODE_SYMLINK
29from bup.helpers import (add_error, grafted_path_components, handle_ctrl_c,
30                         hostname, istty2, log, parse_date_or_fatal, parse_num,
31                         path_components, progress, qprogress, resolve_parent,
32                         saved_errors, stripped_path_components,
33                         valid_save_name)
34from bup.io import byte_stream, path_msg
35from bup.pwdgrp import userfullname, username
36
37
# Command-line specification handed to options.Options below: a usage
# line, a '--' separator, then one line per option.
optspec = """
bup save [-tc] [-n name] <filenames...>
--
r,remote=  hostname:/path/to/repo of remote repository
t,tree     output a tree id
c,commit   output a commit id
n,name=    name of backup set to update (if any)
d,date=    date for the commit (seconds since the epoch)
v,verbose  increase log output (can be used more than once)
q,quiet    don't show progress meter
smaller=   only back up files smaller than n bytes
bwlimit=   maximum bytes/sec to transmit to server
f,indexfile=  the name of the index file (normally BUP_DIR/bupindex)
strip      strips the path to every filename given
strip-path= path-prefix to be stripped when saving
graft=     a graft point *old_path*=*new_path* (can be used more than once)
#,compress=  set compression level to # (0-9, 9 is highest) [1]
"""
o = options.Options(optspec)
# opt: parsed option values; flags: (option, parameter) pairs (walked
# later to collect every --graft); extra: the remaining arguments,
# i.e. the filenames to save.
opt, flags, extra = o.parse(compat.argv[1:])
58
# Option values that name paths, refs or hosts are converted to bytes
# when they were supplied.
for _attr in ('indexfile', 'name', 'remote', 'strip_path'):
    _value = getattr(opt, _attr)
    if _value:
        setattr(opt, _attr, argv_bytes(_value))

git.check_repo_or_die()
if not opt.tree and not opt.commit and not opt.name:
    o.fatal("use one or more of -t, -c, -n")
if not extra:
    o.fatal("no filenames given")

# The filenames to save, as bytes.
extra = list(map(argv_bytes, extra))

# Only show the progress meter when istty2 is set and -q was not given.
opt.progress = (istty2 and not opt.quiet)
opt.smaller = parse_num(opt.smaller or 0)
if opt.bwlimit:
    client.bwlimit = parse_num(opt.bwlimit)

# Commit date: the one given with -d, otherwise "now".
date = parse_date_or_fatal(opt.date, o.fatal) if opt.date else time.time()

if opt.strip and opt.strip_path:
    o.fatal("--strip is incompatible with --strip-path")
88
# List of (old_path, new_path) pairs, one per --graft option given.
graft_points = []
if opt.graft:
    if opt.strip:
        o.fatal("--strip is incompatible with --graft")

    if opt.strip_path:
        o.fatal("--strip-path is incompatible with --graft")

    for (flag, value) in flags:
        if flag != "--graft":
            continue
        # Each graft must contain exactly one '=' with a non-empty path
        # on both sides.
        pieces = argv_bytes(value).split(b'=')
        if len(pieces) != 2:
            o.fatal("a graft point must be of the form old_path=new_path")
        graft_from, graft_to = pieces
        if not graft_from or not graft_to:
            o.fatal("a graft point cannot be empty")
        graft_points.append((resolve_parent(graft_from),
                             resolve_parent(graft_to)))
108
# BUP_SERVER_REVERSE in the environment selects reverse mode, in which
# a client connection is always used and -r must not be given.
is_reverse = environ.get(b'BUP_SERVER_REVERSE')
if is_reverse and opt.remote:
    o.fatal("don't use -r in reverse mode; it's automatic")

name = opt.name
if name and not valid_save_name(name):
    o.fatal("'%s' is not a valid branch name" % path_msg(name))
# Ref to update when a save name was given, otherwise None.
refname = name and b'refs/heads/%s' % name or None
if opt.remote or is_reverse:
    try:
        cli = client.Client(opt.remote)
    except client.ClientError as e:
        # Terminate the message with a newline, as every other log()
        # call in this file does explicitly.
        log('error: %s\n' % e)
        sys.exit(1)
    # oldref: current value of the ref (None when there is no refname
    # or read_ref returns nothing); passed on to new_commit/update_ref.
    oldref = refname and cli.read_ref(refname) or None
    w = cli.new_packwriter(compression_level=opt.compress)
else:
    cli = None
    oldref = refname and git.read_ref(refname) or None
    w = git.PackWriter(compression_level=opt.compress)

handle_ctrl_c()
131
132
133# Metadata is stored in a file named .bupm in each directory.  The
134# first metadata entry will be the metadata for the current directory.
135# The remaining entries will be for each of the other directory
136# elements, in the order they're listed in the index.
137#
138# Since the git tree elements are sorted according to
139# git.shalist_item_sort_key, the metalist items are accumulated as
140# (sort_key, metadata) tuples, and then sorted when the .bupm file is
141# created.  The sort_key should have been computed using the element's
142# mangled name and git mode (after hashsplitting), but the code isn't
143# actually doing that but rather uses the element's real name and mode.
144# This makes things a bit more difficult when reading it back, see
145# vfs.ordered_tree_entries().
146
# Maintain a stack of information representing the current location in
# the archive being constructed.  The current path is recorded in
# parts, which will be something like ['', 'home', 'someuser'], and
# the accumulated content and metadata for each of the dirs in parts
# is stored in the parallel stacks shalists and metalists.
152
parts = [] # Current archive position (stack of dir names).
shalists = [] # One list of (mode, name, id) tree entries per dir in parts.
metalists = [] # One list of (sort_key, metadata) pairs per dir in parts.
156
157
def _push(part, metadata):
    """Enter a new archive directory, making it the current directory.

    part is the directory's name within the archive and metadata is
    the metadata to record for the directory itself.  A new level is
    added to each of the parts, shalists and metalists stacks.
    """
    # The directory's own metadata is stored first, under an empty name.
    own_entry = (b'', metadata)
    for stack, item in ((parts, part),
                        (shalists, []),
                        (metalists, [own_entry])):
        stack.append(item)
163
164
def _pop(force_tree, dir_metadata=None):
    """Leave the current archive directory and add its tree to its parent.

    When force_tree is true it is used as the directory's tree id
    as-is.  Otherwise a tree is written from the accumulated entries
    (dropping duplicate names, which are reported through add_error)
    plus a .bupm entry holding the directory's metadata.  A true
    dir_metadata replaces the metadata originally pushed for the
    directory.  Returns the tree id.
    """
    assert parts
    part = parts.pop()
    entries = shalists.pop()
    meta_entries = metalists.pop()
    # FIXME: only test if collision is possible (i.e. given --strip, etc.)?
    if not force_tree:
        seen = set()
        unique_entries = []
        meta_pos = 1 # entry at 0 is for the dir
        for entry in entries:
            entry_name = entry[1]
            if entry_name not in seen:
                seen.add(entry_name)
                unique_entries.append(entry)
                if not stat.S_ISDIR(entry[0]):
                    meta_pos += 1
                continue
            parent_path = b'/'.join(parts) + b'/'
            add_error('error: ignoring duplicate path %s in %s'
                      % (path_msg(entry_name), path_msg(parent_path)))
            # Only entries whose mode is not a directory mode have a
            # slot in meta_entries to discard.
            if not stat.S_ISDIR(entry[0]):
                del meta_entries[meta_pos]

        if dir_metadata: # Override the original metadata pushed for this dir.
            meta_entries = [(b'', dir_metadata)] + meta_entries[1:]
        meta_entries.sort(key=lambda pair: pair[0])
        encoded = b''.join(pair[1].encode() for pair in meta_entries)
        bupm_mode, bupm_id = hashsplit.split_to_blob_or_tree(
            w.new_blob, w.new_tree, [BytesIO(encoded)],
            keep_boundaries=False)
        unique_entries.append((bupm_mode, b'.bupm', bupm_id))

        tree = w.new_tree(unique_entries)
    else:
        tree = force_tree
    if shalists:
        # Record this directory as a tree entry of its parent.
        mangled = git.mangle_name(part, GIT_MODE_TREE, GIT_MODE_TREE)
        shalists[-1].append((GIT_MODE_TREE, mangled, tree))
    return tree
209
210
lastremain = None
def progress_report(n):
    """Account for n more bytes and refresh the 'Saving:' progress line.

    Adds n to the global subcount and reports percentage, byte and
    file counts through qprogress().  Once 30 seconds have elapsed
    since tstart, the line also shows an estimate of the remaining
    time and the transfer rate.
    """
    global count, subcount, lastremain
    subcount += n
    done = count + subcount
    if total:
        pct = done*100.0/total
    else:
        pct = 0
    elapsed = time.time() - tstart
    kps = int(done/1024./elapsed) if elapsed else elapsed
    # Round the rate down to its two leading digits.
    magnitude = 10 ** int(math.log(kps+1, 10) - 1)
    kps = int(kps/magnitude)*magnitude
    remain = (elapsed*1.0/done * (total-done)) if done else 0.0
    # Keep showing the previous estimate when the new one is larger by
    # less than 5%.
    small_increase = (lastremain and (remain > lastremain)
                      and ((remain - lastremain)/lastremain < 0.05))
    if small_increase:
        remain = lastremain
    else:
        lastremain = remain
    remainstr = kpsstr = ''
    if elapsed >= 30:
        hours = int(remain/60/60)
        mins = int(remain/60 - hours*60)
        secs = int(remain - hours*60*60 - mins*60)
        kpsstr = '%dk/s' % kps
        if hours:
            remainstr = '%dh%dm' % (hours, mins)
        elif mins:
            remainstr = '%dm%d' % (mins, secs)
        else:
            remainstr = '%ds' % secs
    qprogress('Saving: %.2f%% (%d/%dk, %d/%d files) %s %s\r'
              % (pct, done/1024, total/1024, fcount, ftotal,
                 remainstr, kpsstr))
248
249
# Open the index together with its companion '.meta' metadata store
# and '.hlink' hardlink database, all named after the index file.
indexfile = opt.indexfile or git.repo(b'bupindex')
r = index.Reader(indexfile)
try:
    msr = index.MetaStoreReader(indexfile + b'.meta')
except IOError as ex:
    # Only a permission error gets the friendly message; any other
    # IOError propagates.
    # NOTE(review): the "have you run bup index?" hint suggests ENOENT
    # may have been intended here as well -- confirm.
    if ex.errno != EACCES:
        raise
    # Terminate the message with a newline, as the other log() calls in
    # this file do explicitly.
    log('error: cannot access %r; have you run bup index?\n'
        % path_msg(indexfile))
    sys.exit(1)
hlink_db = hlinkdb.HLinkDB(indexfile + b'.hlink')
261
def already_saved(ent):
    """Return ent.sha if ent is valid and the pack writer w already has
    that object; otherwise return the false value that ruled it out."""
    valid = ent.is_valid()
    if not valid:
        return valid
    present = w.exists(ent.sha)
    if not present:
        return present
    return ent.sha
264
def wantrecurse_pre(ent):
    """Recursion filter for the pre-scan: true unless ent is already saved."""
    saved = already_saved(ent)
    return not saved
267
def wantrecurse_during(ent):
    """Recursion filter for the save pass: true when ent is not already
    saved; otherwise whatever ent.sha_missing() reports."""
    if not already_saved(ent):
        return True
    return ent.sha_missing()
270
def find_hardlink_target(hlink_db, ent):
    """Return the first path hlink_db lists for ent's (dev, ino), or None.

    None is returned when there is no hlink_db, when ent is a
    directory, when ent has at most one link, or when hlink_db has no
    paths for the node.
    """
    if not hlink_db or stat.S_ISDIR(ent.mode) or ent.nlink <= 1:
        return None
    candidates = hlink_db.node_paths(ent.dev, ent.ino)
    return candidates[0] if candidates else None
276
# Pre-scan the index (only when the progress meter is shown) to count
# the entries (ftotal) and the bytes that still need saving (total),
# and to flag each entry whose hash is missing.
total = ftotal = 0
if opt.progress:
    for (transname, ent) in r.filter(extra, wantrecurse=wantrecurse_pre):
        if ftotal % 10024 == 0:
            qprogress('Reading index: %d\r' % ftotal)
        exists = ent.exists()
        hashvalid = already_saved(ent)
        ent.set_sha_missing(not hashvalid)
        within_limit = not opt.smaller or ent.size < opt.smaller
        if within_limit and exists and not hashvalid:
            total += ent.size
        ftotal += 1
    progress('Reading index: %d, done.\n' % ftotal)
    hashsplit.progress_callback = progress_report
291
292# Root collisions occur when strip or graft options map more than one
293# path to the same directory (paths which originally had separate
294# parents).  When that situation is detected, use empty metadata for
295# the parent.  Otherwise, use the metadata for the common parent.
296# Collision example: "bup save ... --strip /foo /foo/bar /bar".
297
298# FIXME: Add collision tests, or handle collisions some other way.
299
300# FIXME: Detect/handle strip/graft name collisions (other than root),
301# i.e. if '/foo/bar' and '/bar' both map to '/'.
302
# Main save pass: walk the index entries selected by the given
# filenames, write the content of everything not already saved, and
# build the archive trees through _push()/_pop().
first_root = None      # First dirp[0] seen; used to detect root collisions.
root_collision = None  # Set to True when entries map to differing roots.
tstart = time.time()   # Start time, read by progress_report().
count = subcount = fcount = 0  # Byte and file counters for progress.
lastskip_name = None   # Name most recently skipped or failed (see below).
lastdir = b''          # Last directory considered by the -v logging.
for (transname,ent) in r.filter(extra, wantrecurse=wantrecurse_during):
    (dir, file) = os.path.split(ent.name)
    exists = (ent.flags & index.IX_EXISTS)
    hashvalid = already_saved(ent)
    wasmissing = ent.sha_missing()
    oldsize = ent.size
    if opt.verbose:
        # Status letter: 'D' entry no longer exists, 'A' hash not valid
        # and sha is index.EMPTY_SHA, 'M' hash not valid otherwise,
        # ' ' already saved.
        if not exists:
            status = 'D'
        elif not hashvalid:
            if ent.sha == index.EMPTY_SHA:
                status = 'A'
            else:
                status = 'M'
        else:
            status = ' '
        if opt.verbose >= 2:
            log('%s %-70s\n' % (status, path_msg(ent.name)))
        elif not stat.S_ISDIR(ent.mode) and lastdir != dir:
            # With a single -v, log directories rather than every entry.
            if not lastdir.startswith(dir):
                log('%s %-70s\n' % (status, path_msg(os.path.join(dir, b''))))
            lastdir = dir

    if opt.progress:
        progress_report(0)
    fcount += 1

    if not exists:
        continue
    if opt.smaller and ent.size >= opt.smaller:
        if exists and not hashvalid:
            if opt.verbose:
                log('skipping large file "%s"\n' % path_msg(ent.name))
            lastskip_name = ent.name
        continue

    assert(dir.startswith(b'/'))
    if opt.strip:
        dirp = stripped_path_components(dir, extra)
    elif opt.strip_path:
        dirp = stripped_path_components(dir, [opt.strip_path])
    elif graft_points:
        dirp = grafted_path_components(graft_points, dir)
    else:
        dirp = path_components(dir)

    # At this point, dirp contains a representation of the archive
    # path that looks like [(archive_dir_name, real_fs_path), ...].
    # So given "bup save ... --strip /foo/bar /foo/bar/baz", dirp
    # might look like this at some point:
    #   [('', '/foo/bar'), ('baz', '/foo/bar/baz'), ...].

    # This dual representation supports stripping/grafting, where the
    # archive path may not have a direct correspondence with the
    # filesystem.  The root directory is represented by an initial
    # component named '', and any component that doesn't have a
    # corresponding filesystem directory (due to grafting, for
    # example) will have a real_fs_path of None, i.e. [('', None),
    # ...].

    if first_root is None:
        first_root = dirp[0]
    elif first_root != dirp[0]:
        root_collision = True

    # If switching to a new sub-tree, finish the current sub-tree.
    while parts > [x[0] for x in dirp]:
        _pop(force_tree = None)

    # If switching to a new sub-tree, start a new sub-tree.
    for path_component in dirp[len(parts):]:
        dir_name, fs_path = path_component
        # Not indexed, so just grab the FS metadata or use empty metadata.
        try:
            meta = metadata.from_path(fs_path, normalized=True) \
                if fs_path else metadata.Metadata()
        except (OSError, IOError) as e:
            add_error(e)
            lastskip_name = dir_name
            meta = metadata.Metadata()
        _push(dir_name, meta)

    if not file:
        if len(parts) == 1:
            continue # We're at the top level -- keep the current root dir
        # Since there's no filename, this is a subdir -- finish it.
        oldtree = already_saved(ent) # may be None
        newtree = _pop(force_tree = oldtree)
        if not oldtree:
            # Leave the directory entry invalid when the last skipped
            # name starts with this directory's name.
            if lastskip_name and lastskip_name.startswith(ent.name):
                ent.invalidate()
            else:
                ent.validate(GIT_MODE_TREE, newtree)
            ent.repack()
        if exists and wasmissing:
            count += oldsize
        continue

    # it's not a directory
    if hashvalid:
        # Already saved: reuse the recorded id and the index metadata.
        id = ent.sha
        git_name = git.mangle_name(file, ent.mode, ent.gitmode)
        git_info = (ent.gitmode, git_name, id)
        shalists[-1].append(git_info)
        sort_key = git.shalist_item_sort_key((ent.mode, file, id))
        meta = msr.metadata_at(ent.meta_ofs)
        meta.hardlink_target = find_hardlink_target(hlink_db, ent)
        # Restore the times that were cleared to 0 in the metastore.
        (meta.atime, meta.mtime, meta.ctime) = (ent.atime, ent.mtime, ent.ctime)
        metalists[-1].append((sort_key, meta))
    else:
        # id stays None when the content could not be read, in which
        # case the entry is left out of the tree.
        id = None
        if stat.S_ISREG(ent.mode):
            try:
                with hashsplit.open_noatime(ent.name) as f:
                    (mode, id) = hashsplit.split_to_blob_or_tree(
                                            w.new_blob, w.new_tree, [f],
                                            keep_boundaries=False)
            except (IOError, OSError) as e:
                # ent.name is bytes; format it with path_msg() like the
                # other messages here rather than embedding a b'...'
                # repr in the error text.
                add_error('%s: %s' % (path_msg(ent.name), e))
                lastskip_name = ent.name
        elif stat.S_ISDIR(ent.mode):
            assert(0)  # handled above
        elif stat.S_ISLNK(ent.mode):
            try:
                rl = os.readlink(ent.name)
            except (OSError, IOError) as e:
                add_error(e)
                lastskip_name = ent.name
            else:
                (mode, id) = (GIT_MODE_SYMLINK, w.new_blob(rl))
        else:
            # Everything else should be fully described by its
            # metadata, so just record an empty blob, so the paths
            # in the tree and .bupm will match up.
            (mode, id) = (GIT_MODE_FILE, w.new_blob(b''))

        if id:
            ent.validate(mode, id)
            ent.repack()
            git_name = git.mangle_name(file, ent.mode, ent.gitmode)
            git_info = (mode, git_name, id)
            shalists[-1].append(git_info)
            sort_key = git.shalist_item_sort_key((ent.mode, file, id))
            hlink = find_hardlink_target(hlink_db, ent)
            try:
                meta = metadata.from_path(ent.name, hardlink_target=hlink,
                                          normalized=True)
            except (OSError, IOError) as e:
                add_error(e)
                lastskip_name = ent.name
                meta = metadata.Metadata()
            metalists[-1].append((sort_key, meta))

    if exists and wasmissing:
        count += oldsize
        subcount = 0
466
467
if opt.progress:
    # Use a conditional expression instead of the "and ... or" idiom,
    # which would report 100% whenever the computed percentage is 0.0.
    # With nothing to save (total == 0) the run counts as 100% done.
    pct = count*100.0/total if total else 100
    progress('Saving: %.2f%% (%d/%dk, %d/%d files), done.    \n'
             % (pct, count/1024, total/1024, fcount, ftotal))
472
# _pop() all the parts above the root; each call removes one level.
for _unused in range(len(parts) - 1):
    _pop(force_tree=None)
assert(len(shalists) == 1)
assert(len(metalists) == 1)

# Finish the root directory.  When there's a collision, use empty
# metadata for the root.
root_metadata = metadata.Metadata() if root_collision else None
tree = _pop(force_tree=None, dir_metadata=root_metadata)
482
# Ids are written to stdout as bytes, so flush any pending text first.
sys.stdout.flush()
out = byte_stream(sys.stdout)

if opt.tree:
    out.write(hexlify(tree) + b'\n')
if opt.commit or name:
    if compat.py_maj > 2:
        # Strip b prefix from python 3 bytes reprs to preserve previous format
        argv_reprs = [repr(argv_bytes(arg))[1:].encode('ascii')
                      for arg in compat.argv]
        msgcmd = b'[%s]' % b', '.join(argv_reprs)
    else:
        msgcmd = repr(compat.argv)
    msg = b'bup save\n\nGenerated by command:\n%s\n' % msgcmd
    userline = (b'%s <%s@%s>' % (userfullname(), username(), hostname()))
    # The same identity and date are passed twice to new_commit.
    commit = w.new_commit(tree, oldref, userline, date, None,
                          userline, date, None, msg)
    if opt.commit:
        out.write(hexlify(commit) + b'\n')
503
msr.close()
w.close()  # must close before we can update the ref

if opt.name:
    # Update the ref through the client when one is in use, otherwise
    # in the local repository.
    update_ref = cli.update_ref if cli else git.update_ref
    update_ref(refname, commit, oldref)

if cli:
    cli.close()

# Exit non-zero if any errors were recorded during the save.
if saved_errors:
    log('WARNING: %d errors encountered while saving.\n' % len(saved_errors))
    sys.exit(1)
519