1#!/bin/sh
2"""": # -*-python-*-
3# https://sourceware.org/bugzilla/show_bug.cgi?id=26034
4export "BUP_ARGV_0"="$0"
5arg_i=1
6for arg in "$@"; do
7    export "BUP_ARGV_${arg_i}"="$arg"
8    shift
9    arg_i=$((arg_i + 1))
10done
11# Here to end of preamble replaced during install
12bup_python="$(dirname "$0")/../../config/bin/python" || exit $?
13exec "$bup_python" "$0"
14"""
15# end of bup preamble
16
17from __future__ import absolute_import, division, print_function
18from binascii import hexlify
19import os, sys, time
20
21sys.path[:0] = [os.path.dirname(os.path.realpath(__file__)) + '/..']
22
23from bup import compat, hashsplit, git, options, client
24from bup.compat import argv_bytes, environ
25from bup.helpers import (add_error, handle_ctrl_c, hostname, log, parse_num,
26                         qprogress, reprogress, saved_errors,
27                         valid_save_name,
28                         parse_date_or_fatal)
29from bup.io import byte_stream
30from bup.pwdgrp import userfullname, username
31
32
# Command-line specification handed to options.Options() below.  The lines
# before '--' are the usage synopsis; the lines after it list the accepted
# options, grouped under the ' Modes:' and ' Options:' headings.
optspec = """
bup split [-t] [-c] [-n name] OPTIONS [--git-ids | filenames...]
bup split -b OPTIONS [--git-ids | filenames...]
bup split --copy OPTIONS [--git-ids | filenames...]
bup split --noop [-b|-t] OPTIONS [--git-ids | filenames...]
--
 Modes:
b,blobs    output a series of blob ids.  Implies --fanout=0.
t,tree     output a tree id
c,commit   output a commit id
n,name=    save the result under the given name
noop       split the input, but throw away the result
copy       split the input, copy it to stdout, don't save to repo
 Options:
r,remote=  remote repository path
d,date=    date for the commit (seconds since the epoch)
q,quiet    don't print progress messages
v,verbose  increase log output (can be used more than once)
git-ids    read a list of git object ids from stdin and split their contents
keep-boundaries  don't let one chunk span two input files
bench      print benchmark timings to stderr
max-pack-size=  maximum bytes in a single pack
max-pack-objects=  maximum number of objects in a single pack
fanout=    average number of blobs in a single tree
bwlimit=   maximum bytes/sec to transmit to server
#,compress=  set compression level to # (0-9, 9 is highest) [1]
"""
handle_ctrl_c()

# Parse the command line according to optspec.
o = options.Options(optspec)
opt, flags, extra = o.parse(compat.argv[1:])

# Normalize the values used below: name/remote go through argv_bytes(),
# and an absent -v counts as verbosity 0.
if opt.name:
    opt.name = argv_bytes(opt.name)
if opt.remote:
    opt.remote = argv_bytes(opt.remote)
if opt.verbose is None:
    opt.verbose = 0

# Reject missing or contradictory mode selections.  The checks run in a
# fixed order, so the first failing one determines the error message.
if not any((opt.blobs, opt.tree, opt.commit, opt.name, opt.noop, opt.copy)):
    o.fatal("use one or more of -b, -t, -c, -n, --noop, --copy")
if opt.copy:
    if opt.blobs or opt.tree:
        o.fatal('--copy is incompatible with -b, -t')
if opt.noop or opt.copy:
    if opt.commit or opt.name:
        o.fatal('--noop and --copy are incompatible with -c, -n')
if opt.blobs:
    if opt.tree or opt.commit or opt.name:
        o.fatal('-b is incompatible with -t, -c, -n')
if opt.git_ids and extra:
    o.fatal("don't provide filenames when using --git-ids")
79
# Two or more -v flags also raise git's verbosity and switch on --bench.
if opt.verbose >= 2:
    git.verbose = opt.verbose - 1
    opt.bench = 1

# Pack limits stay None unless given on the command line.
max_pack_size = parse_num(opt.max_pack_size) if opt.max_pack_size else None
max_pack_objects = (parse_num(opt.max_pack_objects)
                    if opt.max_pack_objects else None)

# --fanout is parsed first so that it is still validated when -b then
# overrides it with 0.
if opt.fanout:
    hashsplit.fanout = parse_num(opt.fanout)
if opt.blobs:
    hashsplit.fanout = 0

if opt.bwlimit:
    client.bwlimit = parse_num(opt.bwlimit)

# Commit date: the -d value if given, otherwise the current time.
date = parse_date_or_fatal(opt.date, o.fatal) if opt.date else time.time()
101
# Running count of bytes reported through prog().
total_bytes = 0


def prog(filenum, nbytes):
    """Progress callback: add nbytes to the running total and report it.

    filenum -- index of the input file being split (presumably zero-based,
               since it is displayed as filenum + 1); the file number is
               only shown once it is greater than 0.
    nbytes  -- number of bytes processed since the previous call.
    """
    global total_bytes
    total_bytes += nbytes
    kbytes = total_bytes // 1024
    if filenum > 0:
        message = 'Splitting: file #%d, %d kbytes\r' % (filenum + 1, kbytes)
    else:
        message = 'Splitting: %d kbytes\r' % kbytes
    qprogress(message)
111
112
# In reverse mode (BUP_SERVER_REVERSE set in the environment) the remote is
# implied, so an explicit -r is rejected.
is_reverse = environ.get(b'BUP_SERVER_REVERSE')
if opt.remote and is_reverse:
    o.fatal("don't use -r in reverse mode; it's automatic")
start_time = time.time()

# -n names a branch; validate it and derive the full ref name.
if opt.name and not valid_save_name(opt.name):
    o.fatal("'%r' is not a valid branch name." % opt.name)
refname = b'refs/heads/%s' % opt.name if opt.name else None

if opt.noop or opt.copy:
    # Nothing is stored in these modes, so no repository access is set up.
    cli = pack_writer = oldref = None
else:
    git.check_repo_or_die()
    writer_args = dict(compression_level=opt.compress,
                       max_pack_size=max_pack_size,
                       max_pack_objects=max_pack_objects)
    if opt.remote or is_reverse:
        # Objects and refs go through a client connection.
        cli = client.Client(opt.remote)
        oldref = (cli.read_ref(refname) or None) if refname else None
        pack_writer = cli.new_packwriter(**writer_args)
    else:
        # Objects and refs go straight to the local repository.
        cli = None
        oldref = (git.read_ref(refname) or None) if refname else None
        pack_writer = git.PackWriter(**writer_args)
138
# Binary view of stdin: either the data to split or, with --git-ids, the
# list of object ids.
input = byte_stream(sys.stdin)

if not opt.git_ids:
    # The data comes from the named files (opened lazily in binary mode), or
    # from stdin when no filenames were given.
    if extra:
        files = (open(argv_bytes(fn), 'rb') for fn in extra)
    else:
        files = [input]
else:
    # stdin carries one git object id per line; each object's content is
    # fetched through a CatPipe and split.
    #
    # CatPipe.get() hands back an iterator while the splitter wants objects
    # with a read() method, hence the small adapter class below.
    cp = git.CatPipe()

    class IterToFile:
        """Adapter exposing an iterator of byte chunks through read()."""

        def __init__(self, it):
            self.it = iter(it)

        def read(self, size):
            # size is ignored: each call returns the iterator's next chunk,
            # or b'' once the iterator is exhausted (or the chunk is empty).
            return next(self.it, None) or b''

    def read_ids():
        """Yield one IterToFile per object id read from stdin.

        Ids that make CatPipe.get() raise KeyError are recorded with
        add_error() and skipped.
        """
        # readline() returns an empty value at EOF, which ends the loop.
        for line in iter(input.readline, b''):
            oid = line.strip()
            try:
                it = cp.get(oid)
                next(it, None)  # skip the file info
            except KeyError as e:
                add_error('error: %s' % e)
                continue
            yield IterToFile(it)

    files = read_ids()
175
# Choose how blobs and trees get "written": through the pack writer when
# there is one, otherwise (--noop with -b/-t) by only computing their hashes.
if pack_writer:
    new_blob, new_tree = pack_writer.new_blob, pack_writer.new_tree
elif opt.blobs or opt.tree:
    # --noop mode
    def new_blob(content):
        return git.calc_hash(b'blob', content)

    def new_tree(shalist):
        return git.calc_hash(b'tree', git.tree_encode(shalist))

# Flush the text-mode stdout before switching to the byte-oriented stream.
sys.stdout.flush()
out = byte_stream(sys.stdout)
186
# Run the split in the form selected by the mode options.
if opt.blobs:
    # -b: store (or just hash) every chunk and print one blob id per line.
    shalist = hashsplit.split_to_blobs(new_blob, files,
                                       keep_boundaries=opt.keep_boundaries,
                                       progress=prog)
    for (sha, size, level) in shalist:
        out.write(hexlify(sha) + b'\n')
        reprogress()
elif opt.tree or opt.commit or opt.name:
    # -t/-c/-n: build a tree; its id is kept in `tree` for the output and
    # commit steps that follow.
    if opt.name: # insert dummy_name which may be used as a restore target
        mode, sha = \
            hashsplit.split_to_blob_or_tree(new_blob, new_tree, files,
                                            keep_boundaries=opt.keep_boundaries,
                                            progress=prog)
        splitfile_name = git.mangle_name(b'data', hashsplit.GIT_MODE_FILE, mode)
        shalist = [(mode, splitfile_name, sha)]
    else:
        shalist = hashsplit.split_to_shalist(
                      new_blob, new_tree, files,
                      keep_boundaries=opt.keep_boundaries, progress=prog)
    tree = new_tree(shalist)
else:
    # --noop or --copy without -b/-t: only iterate over the chunks, keeping
    # hashsplit.total_split up to date for the --bench report.
    it = hashsplit.hashsplit_iter(files,
                                  keep_boundaries=opt.keep_boundaries,
                                  progress=prog)
    for (blob, level) in it:
        hashsplit.total_split += len(blob)
        if opt.copy:
            # Write the raw chunk to the binary stdout stream.  Going through
            # str() and the text stream emits a textual repr of the chunk on
            # Python 3 instead of its bytes.
            out.write(blob)
219
# Finish the progress display with a newline when running verbosely.
if opt.verbose:
    log('\n')
# -t: print the id of the tree built by the split.
if opt.tree:
    out.write(hexlify(tree) + b'\n')
# -c/-n: wrap the tree in a commit on top of the ref's previous value.
if opt.commit or opt.name:
    msg = b'bup split\n\nGenerated by command:\n%r\n' % compat.argvb
    userline = b'%s <%s@%s>' % (userfullname(), username(), hostname())
    # The same identity and date are passed twice (presumably author and
    # committer -- confirm against new_commit()).
    commit = pack_writer.new_commit(tree, oldref, userline, date, None,
                                    userline, date, None, msg)
    if opt.commit:
        out.write(hexlify(commit) + b'\n')
232
# The pack writer has to be closed before the ref can be updated.
if pack_writer:
    pack_writer.close()

# -n: move the branch to the new commit, through the client connection when
# there is one and directly in the local repository otherwise.
if opt.name:
    (cli or git).update_ref(refname, commit, oldref)

if cli:
    cli.close()
244
# Throughput summary for --bench (also enabled by -vv).
secs = time.time() - start_time
size = hashsplit.total_split
kbytes = size / 1024
if opt.bench:
    log('bup: %.2f kbytes in %.2f secs = %.2f kbytes/sec\n'
        % (kbytes, secs, kbytes / secs))

# Any error recorded along the way makes the command exit with status 1.
error_count = len(saved_errors)
if error_count:
    log('WARNING: %d errors encountered while saving.\n' % error_count)
    sys.exit(1)
254