1# Copyright 2002, 2003, 2004, 2005 Ben Escoto
2#
3# This file is part of rdiff-backup.
4#
5# rdiff-backup is free software; you can redistribute it and/or modify
6# under the terms of the GNU General Public License as published by the
7# Free Software Foundation; either version 2 of the License, or (at your
8# option) any later version.
9#
10# rdiff-backup is distributed in the hope that it will be useful, but
11# WITHOUT ANY WARRANTY; without even the implied warranty of
12# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
13# General Public License for more details.
14#
15# You should have received a copy of the GNU General Public License
16# along with rdiff-backup; if not, write to the Free Software
17# Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA
18# 02110-1301, USA
19"""Perform various kinds of comparisons.
20
21For instance, full-file compare, compare by hash, and metadata-only
22compare.  This uses elements of the backup and restore modules.
23
24"""
25
26import os
27from . import Globals, restore, rorpiter, log, backup, rpath, hash, robust, Hardlink
28
29
def Compare(src_rp, mirror_rp, inc_rp, compare_time):
    """Report metadata differences between src_rp and the mirror at time

    Returns 1 if any changed files were found, 0 otherwise (the value
    returned by print_reports).
    """
    repo = mirror_rp.conn.compare.RepoSide
    data = src_rp.conn.compare.DataSide

    # Iterate repository metadata at compare_time and diff against source
    report_iter = data.compare_fast(
        repo.init_and_get_iter(mirror_rp, inc_rp, compare_time))
    result = print_reports(report_iter)
    repo.close_rf_cache()
    return result
39
40
def Compare_hash(src_rp, mirror_rp, inc_rp, compare_time):
    """Compare files at src_rp with repo at compare_time

    Note metadata differences, but also check to see if file data is
    different.  If two regular files have the same size, hash the
    source and compare to the hash presumably already present in repo.

    """
    repo = mirror_rp.conn.compare.RepoSide
    data = src_rp.conn.compare.DataSide

    # Same pipeline as Compare(), but with hash comparison of file data
    report_iter = data.compare_hash(
        repo.init_and_get_iter(mirror_rp, inc_rp, compare_time))
    result = print_reports(report_iter)
    repo.close_rf_cache()
    return result
56
57
def Compare_full(src_rp, mirror_rp, inc_rp, compare_time):
    """Compare full data of files at src_rp with repo at compare_time

    Like Compare_hash, but do not rely on hashes, instead copy full
    data over.

    """
    repo = mirror_rp.conn.compare.RepoSide
    data = src_rp.conn.compare.DataSide

    # Attach repository file data to candidates, then compare byte-by-byte
    attached_iter = repo.attach_files(data.get_source_select(), mirror_rp,
                                      inc_rp, compare_time)
    result = print_reports(data.compare_full(src_rp, attached_iter))
    repo.close_rf_cache()
    return result
75
76
def Verify(mirror_rp, inc_rp, verify_time):
    """Compute SHA1 sums of repository files and check against metadata

    Returns 0 when every regular file verifies, 2 when at least one
    file's computed digest differs from the recorded one.
    """
    # Verification always runs locally on the repository side
    assert mirror_rp.conn is Globals.local_connection
    repo_iter = RepoSide.init_and_get_iter(mirror_rp, inc_rp, verify_time)
    base_index = RepoSide.mirror_base.index

    bad_count = 0
    for repo_rorp in repo_iter:
        # Only regular files carry data that can be hashed
        if not repo_rorp.isreg():
            continue
        recorded_sha1 = get_hash(repo_rorp)
        if not recorded_sha1:
            # Older repositories (pre-1.1.1) have no digests in metadata
            log.Log(
                "Warning: Cannot find SHA1 digest for file %s,\n"
                "perhaps because this feature was added in v1.1.1" %
                (repo_rorp.get_safeindexpath(), ), 2)
            continue
        fileobj = RepoSide.rf_cache.get_fp(base_index + repo_rorp.index,
                                           repo_rorp)
        actual_sha1 = hash.compute_sha1_fp(fileobj)
        if actual_sha1 != recorded_sha1:
            bad_count += 1
            log.Log(
                "Warning: Computed SHA1 digest of %s\n   %s\n"
                "doesn't match recorded digest of\n   %s\n"
                "Your backup repository may be corrupted!" %
                (repo_rorp.get_safeindexpath(), actual_sha1,
                 recorded_sha1), 2)
        else:
            log.Log(
                "Verified SHA1 digest of %s" % repo_rorp.get_safeindexpath(),
                5)
    RepoSide.close_rf_cache()
    if bad_count:
        log.Log("Not all files could be verified.", 3)
        return 2
    log.Log("Every file verified successfully.", 3)
    return 0
114
115
def get_hash(repo_rorp):
    """Return the recorded sha1 digest of repo_rorp, or None if absent

    If hardlinks are saved in the metadata, the sha1 is taken from the
    first hardlink in the group.
    """
    Hardlink.add_rorp(repo_rorp)
    if Hardlink.islinked(repo_rorp):
        digest = Hardlink.get_sha1(repo_rorp)
    else:
        digest = repo_rorp.get_sha1() if repo_rorp.has_sha1() else None
    Hardlink.del_rorp(repo_rorp)
    return digest
128
129
def print_reports(report_iter):
    """Print each CompareReport in report_iter to screen

    Returns 1 if any report was seen, 0 otherwise.  Must only run on
    the client side, where stdout reaches the user.
    """
    assert not Globals.server
    changed_files_found = 0
    for report in report_iter:
        changed_files_found = 1
        # An empty index means the base directory itself
        if report.index:
            indexpath = b"/".join(report.index)
        else:
            indexpath = b"."
        print("%s: %s" % (report.reason, os.fsdecode(indexpath)))

    if not changed_files_found:
        log.Log("No changes found.  Directory matches archive data.", 3)
    return changed_files_found
142
143
def get_basic_report(src_rp, repo_rorp, comp_data_func=None):
    """Compare src_rp and repo_rorp, return CompareReport or None

    comp_data_func should be a function that accepts (src_rp,
    repo_rorp) as arguments, and return 1 if they have the same data,
    0 otherwise.  If comp_data_func is false, don't compare file data,
    only metadata.

    Returns None when the two sides match completely.
    """
    index = src_rp.index if src_rp else repo_rorp.index

    # Present on one side only: new on source, or deleted from source
    if not repo_rorp or not repo_rorp.lstat():
        return CompareReport(index, "new")
    if not src_rp or not src_rp.lstat():
        return CompareReport(index, "deleted")

    if comp_data_func and src_rp.isreg() and repo_rorp.isreg():
        # Report metadata and data differences separately
        meta_changed = not (src_rp == repo_rorp)
        data_changed = comp_data_func(src_rp, repo_rorp)
        if not meta_changed and not data_changed:
            return None
        meta_string = ("metadata changed, " if meta_changed
                       else "metadata the same, ")
        data_string = "data changed" if data_changed else "data the same"
        return CompareReport(index, meta_string + data_string)

    if src_rp == repo_rorp:
        return None
    return CompareReport(index, "changed")
183
184
def log_success(src_rorp, mir_rorp=None):
    """Log that src_rorp and mir_rorp compare successfully"""
    # Prefer the source path; fall back to the mirror's when source is absent
    if src_rorp:
        path = src_rorp.get_safeindexpath()
    else:
        path = mir_rorp.get_safeindexpath()
    log.Log("Successful compare: %s" % (path, ), 5)
190
191
class RepoSide(restore.MirrorStruct):
    """On the repository side, comparing is like restoring"""

    @classmethod
    def init_and_get_iter(cls, mirror_rp, inc_rp, compare_time):
        """Return rorp iter at given compare time"""
        cls.set_mirror_and_rest_times(compare_time)
        cls.initialize_rf_cache(mirror_rp, inc_rp)
        mirror_iter = cls.get_mirror_rorp_iter()
        return cls.subtract_indices(cls.mirror_base.index, mirror_iter)

    @classmethod
    def attach_files(cls, src_iter, mirror_rp, inc_rp, compare_time):
        """Attach data to all the files that need checking

        Return an iterator of repo rorps that includes all the files
        that may have changed, and has the fileobj set on all rorps
        that need it.

        """
        repo_iter = cls.init_and_get_iter(mirror_rp, inc_rp, compare_time)
        base_index = cls.mirror_base.index
        for src_rorp, mir_rorp in rorpiter.Collate2Iters(src_iter, repo_iter):
            if src_rorp:
                index = src_rorp.index
            else:
                index = mir_rorp.index

            if src_rorp and mir_rorp:
                if not src_rorp.isreg() and src_rorp == mir_rorp:
                    # Equal non-regular files carry no data to check
                    log_success(src_rorp, mir_rorp)
                    continue
                # Same-sized regular files need a full data comparison,
                # so attach the repository data to the mirror rorp
                both_reg = src_rorp.isreg() and mir_rorp.isreg()
                if both_reg and src_rorp.getsize() == mir_rorp.getsize():
                    fp = cls.rf_cache.get_fp(base_index + index, mir_rorp)
                    mir_rorp.setfile(fp)
                    mir_rorp.set_attached_filetype('snapshot')

            if not mir_rorp:
                yield rpath.RORPath(index)  # indicate deleted mir_rorp
            else:
                yield mir_rorp
230
231
class DataSide(backup.SourceStruct):
    """On the side that has the current data, compare is like backing up"""

    @classmethod
    def compare_fast(cls, repo_iter):
        """Compare rorps (metadata only) quickly, return report iter"""
        pairs = rorpiter.Collate2Iters(cls.get_source_select(), repo_iter)
        for src_rorp, mir_rorp in pairs:
            report = get_basic_report(src_rorp, mir_rorp)
            if not report:
                log_success(src_rorp, mir_rorp)
            else:
                yield report

    @classmethod
    def compare_hash(cls, repo_iter):
        """Like above, but also compare sha1 sums of any regular files"""

        def hashes_changed(src_rp, mir_rorp):
            """Return 0 if their data hashes same, 1 otherwise"""
            verify_sha1 = get_hash(mir_rorp)
            if not verify_sha1:
                # No recorded digest: treat the data as unchanged
                log.Log(
                    "Warning: Metadata file has no digest for %s, "
                    "unable to compare." % (mir_rorp.get_safeindexpath(), ), 2)
                return 0
            if (src_rp.getsize() == mir_rorp.getsize()
                    and hash.compute_sha1(src_rp) == verify_sha1):
                return 0
            return 1

        pairs = rorpiter.Collate2Iters(cls.get_source_select(), repo_iter)
        for src_rp, mir_rorp in pairs:
            report = get_basic_report(src_rp, mir_rorp, hashes_changed)
            if not report:
                log_success(src_rp, mir_rorp)
            else:
                yield report

    @classmethod
    def compare_full(cls, src_root, repo_iter):
        """Given repo iter with full data attached, return report iter"""

        def error_handler(exc, src_rp, repo_rorp):
            log.Log("Error reading file %s" % src_rp.get_safepath(), 2)
            return 0  # They aren't the same if we get an error

        def data_changed(src_rp, repo_rorp):
            """Return 0 if full compare of data matches, 1 otherwise"""
            if src_rp.getsize() != repo_rorp.getsize():
                return 1
            same = robust.check_common_error(error_handler, rpath.cmp,
                                             (src_rp, repo_rorp))
            return not same

        for repo_rorp in repo_iter:
            src_rp = src_root.new_index(repo_rorp.index)
            report = get_basic_report(src_rp, repo_rorp, data_changed)
            if not report:
                log_success(repo_rorp)
            else:
                yield report
293
294
class CompareReport:
    """Describes how two files failed to match

    This is necessary because the system that is doing the actual
    comparing may not be the one printing out the reports.  For speed
    the compare information can be pipelined back to the client
    connection as an iter of CompareReports.

    """
    # A file attribute lets a CompareReport masquerade as a RORPath
    # when travelling in an iterator, so it can be pipelined.
    file = None

    def __init__(self, index, reason):
        self.index = index    # tuple of path components
        self.reason = reason  # human-readable difference description
311