# Copyright 2002, 2003, 2004, 2005 Ben Escoto
#
# This file is part of rdiff-backup.
#
# rdiff-backup is free software; you can redistribute it and/or modify
# it under the terms of the GNU General Public License as published by the
# Free Software Foundation; either version 2 of the License, or (at your
# option) any later version.
#
# rdiff-backup is distributed in the hope that it will be useful, but
# WITHOUT ANY WARRANTY; without even the implied warranty of
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
# General Public License for more details.
#
# You should have received a copy of the GNU General Public License
# along with rdiff-backup; if not, write to the Free Software
# Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA
# 02110-1301, USA
"""Perform various kinds of comparisons.

For instance, full-file compare, compare by hash, and metadata-only
compare. This uses elements of the backup and restore modules.

The three public Compare* functions share one shape: fetch the
RepoSide/DataSide classes through each rpath's connection object
(presumably so each side runs on the host that holds its data --
TODO confirm against the connection framework), stream compare
reports from the data side, print them locally, then release the
repository-side file cache.
"""

import os
from . import Globals, restore, rorpiter, log, backup, rpath, hash, robust, Hardlink


def Compare(src_rp, mirror_rp, inc_rp, compare_time):
    """Compares metadata in src_rp dir with metadata in mirror_rp at time

    Metadata-only comparison: file data is never read.  Returns the
    result of print_reports -- 1 if any difference was found, 0 if the
    directory matches the archive.
    """
    # Resolve each side's worker class on the connection that owns the
    # corresponding rpath, so the work happens where the data lives.
    repo_side = mirror_rp.conn.compare.RepoSide
    data_side = src_rp.conn.compare.DataSide

    repo_iter = repo_side.init_and_get_iter(mirror_rp, inc_rp, compare_time)
    return_val = print_reports(data_side.compare_fast(repo_iter))
    # Release cached repository file resources opened by init_and_get_iter.
    repo_side.close_rf_cache()
    return return_val


def Compare_hash(src_rp, mirror_rp, inc_rp, compare_time):
    """Compare files at src_rp with repo at compare_time

    Note metadata differences, but also check to see if file data is
    different. If two regular files have the same size, hash the
    source and compare to the hash presumably already present in repo.

    Returns 1 if any difference was found, 0 otherwise (see
    print_reports).
    """
    repo_side = mirror_rp.conn.compare.RepoSide
    data_side = src_rp.conn.compare.DataSide

    repo_iter = repo_side.init_and_get_iter(mirror_rp, inc_rp, compare_time)
    return_val = print_reports(data_side.compare_hash(repo_iter))
    repo_side.close_rf_cache()
    return return_val


def Compare_full(src_rp, mirror_rp, inc_rp, compare_time):
    """Compare full data of files at src_rp with repo at compare_time

    Like Compare_hash, but do not rely on hashes, instead copy full
    data over.

    Returns 1 if any difference was found, 0 otherwise (see
    print_reports).
    """
    repo_side = mirror_rp.conn.compare.RepoSide
    data_side = src_rp.conn.compare.DataSide

    # Unlike the other Compare functions, the repo side needs to see the
    # source selection first so it can attach file objects to every repo
    # rorp whose data may have to be byte-compared.
    src_iter = data_side.get_source_select()
    attached_repo_iter = repo_side.attach_files(src_iter, mirror_rp, inc_rp,
                                                compare_time)
    report_iter = data_side.compare_full(src_rp, attached_repo_iter)
    return_val = print_reports(report_iter)
    repo_side.close_rf_cache()
    return return_val


def Verify(mirror_rp, inc_rp, verify_time):
    """Compute SHA1 sums of repository files and check against metadata

    Returns 0 if every regular file's recorded SHA1 digest matches the
    digest recomputed from the stored data, 2 if any file failed
    verification.  Files without a recorded digest are skipped with a
    warning.  Must run on the repository's local connection.
    """
    assert mirror_rp.conn is Globals.local_connection
    repo_iter = RepoSide.init_and_get_iter(mirror_rp, inc_rp, verify_time)
    base_index = RepoSide.mirror_base.index

    bad_files = 0
    for repo_rorp in repo_iter:
        # Only regular files carry data (and hence a digest) to verify.
        if not repo_rorp.isreg():
            continue
        verify_sha1 = get_hash(repo_rorp)
        if not verify_sha1:
            # Old repositories may predate digest recording entirely.
            log.Log(
                "Warning: Cannot find SHA1 digest for file %s,\n"
                "perhaps because this feature was added in v1.1.1" %
                (repo_rorp.get_safeindexpath(), ), 2)
            continue
        # Open the file's data as stored at verify_time and re-hash it.
        fp = RepoSide.rf_cache.get_fp(base_index + repo_rorp.index, repo_rorp)
        computed_hash = hash.compute_sha1_fp(fp)
        if computed_hash == verify_sha1:
            log.Log(
                "Verified SHA1 digest of %s" % repo_rorp.get_safeindexpath(),
                5)
        else:
            bad_files += 1
            log.Log(
                "Warning: Computed SHA1 digest of %s\n %s\n"
                "doesn't match recorded digest of\n %s\n"
                "Your backup repository may be corrupted!" %
                (repo_rorp.get_safeindexpath(), computed_hash,
                 verify_sha1), 2)
    RepoSide.close_rf_cache()
    if bad_files:
        log.Log("Not all files could be verified.", 3)
        return 2
    log.Log("Every file verified successfully.", 3)
    return 0


def get_hash(repo_rorp):
    """ Try to get a sha1 digest from the repository. If hardlinks
    are saved in the metadata, get the sha1 from the first hardlink """
    # Register/unregister the rorp with the hardlink tracker around the
    # lookup; linked files share the digest of the first link in the group.
    Hardlink.add_rorp(repo_rorp)
    if Hardlink.islinked(repo_rorp):
        verify_sha1 = Hardlink.get_sha1(repo_rorp)
    elif repo_rorp.has_sha1():
        verify_sha1 = repo_rorp.get_sha1()
    else:
        # No digest recorded anywhere; caller must handle None.
        verify_sha1 = None
    Hardlink.del_rorp(repo_rorp)
    return verify_sha1


def print_reports(report_iter):
    """Given an iter of CompareReport objects, print them to screen

    Returns 1 if at least one report (i.e. one difference) was seen,
    0 otherwise.  Must run on the client side, not the server.
    """
    assert not Globals.server
    changed_files_found = 0
    for report in report_iter:
        changed_files_found = 1
        # An empty index tuple means the base directory itself.
        indexpath = report.index and b"/".join(report.index) or b"."
        print("%s: %s" % (report.reason, os.fsdecode(indexpath)))

    if not changed_files_found:
        log.Log("No changes found. Directory matches archive data.", 3)
    return changed_files_found


def get_basic_report(src_rp, repo_rorp, comp_data_func=None):
    """Compare src_rp and repo_rorp, return CompareReport

    comp_data_func should be a function that accepts (src_rp,
    repo_rorp) as arguments, and return 1 if they have the same data,
    0 otherwise. If comp_data_func is false, don't compare file data,
    only metadata.

    NOTE(review): the comment above says comp_data_func returns 1 when
    data is the SAME, but the code below treats a true result as "data
    changed" -- the callers in this file (hashes_changed, data_changed)
    indeed return 1 on difference, so the docstring wording appears to
    be inverted; confirm before relying on it.

    Returns None when the two entries compare equal (no report needed).
    """
    if src_rp:
        index = src_rp.index
    else:
        index = repo_rorp.index
    # Missing on one side entirely: "new" (only in source) or "deleted"
    # (only in repository).
    if not repo_rorp or not repo_rorp.lstat():
        return CompareReport(index, "new")
    elif not src_rp or not src_rp.lstat():
        return CompareReport(index, "deleted")
    elif comp_data_func and src_rp.isreg() and repo_rorp.isreg():
        # Data comparison requested and both sides are regular files:
        # report metadata and data differences independently.
        if src_rp == repo_rorp:
            meta_changed = 0
        else:
            meta_changed = 1
        data_changed = comp_data_func(src_rp, repo_rorp)

        if not meta_changed and not data_changed:
            return None
        if meta_changed:
            meta_string = "metadata changed, "
        else:
            meta_string = "metadata the same, "
        if data_changed:
            data_string = "data changed"
        else:
            data_string = "data the same"
        return CompareReport(index, meta_string + data_string)
    elif src_rp == repo_rorp:
        return None
    else:
        return CompareReport(index, "changed")


def log_success(src_rorp, mir_rorp=None):
    """Log that src_rorp and mir_rorp compare successfully"""
    # Either argument may be None; use whichever side is present.
    path = src_rorp and src_rorp.get_safeindexpath(
    ) or mir_rorp.get_safeindexpath()
    log.Log("Successful compare: %s" % (path, ), 5)


class RepoSide(restore.MirrorStruct):
    """On the repository side, comparing is like restoring"""

    @classmethod
    def init_and_get_iter(cls, mirror_rp, inc_rp, compare_time):
        """Return rorp iter at given compare time

        Sets up the restore-time machinery (mirror/restore times and the
        repository file cache) inherited from restore.MirrorStruct; the
        caller is responsible for close_rf_cache() afterwards.
        """
        cls.set_mirror_and_rest_times(compare_time)
        cls.initialize_rf_cache(mirror_rp, inc_rp)
        return cls.subtract_indices(cls.mirror_base.index,
                                    cls.get_mirror_rorp_iter())

    @classmethod
    def attach_files(cls, src_iter, mirror_rp, inc_rp, compare_time):
        """Attach data to all the files that need checking

        Return an iterator of repo rorps that includes all the files
        that may have changed, and has the fileobj set on all rorps
        that need it.

        """
        repo_iter = cls.init_and_get_iter(mirror_rp, inc_rp, compare_time)
        base_index = cls.mirror_base.index
        for src_rorp, mir_rorp in rorpiter.Collate2Iters(src_iter, repo_iter):
            index = src_rorp and src_rorp.index or mir_rorp.index
            if src_rorp and mir_rorp:
                if not src_rorp.isreg() and src_rorp == mir_rorp:
                    log_success(src_rorp, mir_rorp)
                    continue  # They must be equal, nothing else to check
                if (src_rorp.isreg() and mir_rorp.isreg()
                        and src_rorp.getsize() == mir_rorp.getsize()):
                    # Same-size regular files are the only case where a
                    # byte-for-byte data comparison is still needed, so
                    # attach the repository file object here.
                    fp = cls.rf_cache.get_fp(base_index + index, mir_rorp)
                    mir_rorp.setfile(fp)
                    mir_rorp.set_attached_filetype('snapshot')

            if mir_rorp:
                yield mir_rorp
            else:
                yield rpath.RORPath(index)  # indicate deleted mir_rorp


class DataSide(backup.SourceStruct):
    """On the side that has the current data, compare is like backing up"""

    @classmethod
    def compare_fast(cls, repo_iter):
        """Compare rorps (metadata only) quickly, return report iter"""
        src_iter = cls.get_source_select()
        for src_rorp, mir_rorp in rorpiter.Collate2Iters(src_iter, repo_iter):
            report = get_basic_report(src_rorp, mir_rorp)
            if report:
                yield report
            else:
                log_success(src_rorp, mir_rorp)

    @classmethod
    def compare_hash(cls, repo_iter):
        """Like above, but also compare sha1 sums of any regular files"""

        def hashes_changed(src_rp, mir_rorp):
            """Return 0 if their data hashes same, 1 otherwise"""
            verify_sha1 = get_hash(mir_rorp)
            if not verify_sha1:
                # Without a recorded digest we cannot tell; treat the
                # data as unchanged rather than raising a false alarm.
                log.Log(
                    "Warning: Metadata file has no digest for %s, "
                    "unable to compare." % (mir_rorp.get_safeindexpath(), ), 2)
                return 0
            elif (src_rp.getsize() == mir_rorp.getsize()
                  and hash.compute_sha1(src_rp) == verify_sha1):
                return 0
            return 1

        src_iter = cls.get_source_select()
        for src_rp, mir_rorp in rorpiter.Collate2Iters(src_iter, repo_iter):
            report = get_basic_report(src_rp, mir_rorp, hashes_changed)
            if report:
                yield report
            else:
                log_success(src_rp, mir_rorp)

    @classmethod
    def compare_full(cls, src_root, repo_iter):
        """Given repo iter with full data attached, return report iter

        repo_iter should come from RepoSide.attach_files so regular
        files carry a readable file object for rpath.cmp.
        """

        def error_handler(exc, src_rp, repo_rorp):
            log.Log("Error reading file %s" % src_rp.get_safepath(), 2)
            return 0  # They aren't the same if we get an error

        def data_changed(src_rp, repo_rorp):
            """Return 0 if full compare of data matches, 1 otherwise"""
            if src_rp.getsize() != repo_rorp.getsize():
                return 1
            # rpath.cmp is truthy when contents match; an error yields 0
            # from error_handler, which is reported as "changed".
            return not robust.check_common_error(error_handler, rpath.cmp,
                                                 (src_rp, repo_rorp))

        for repo_rorp in repo_iter:
            src_rp = src_root.new_index(repo_rorp.index)
            report = get_basic_report(src_rp, repo_rorp, data_changed)
            if report:
                yield report
            else:
                log_success(repo_rorp)


class CompareReport:
    """When two files don't match, this tells you how they don't match

    This is necessary because the system that is doing the actual
    comparing may not be the one printing out the reports. For speed
    the compare information can be pipelined back to the client
    connection as an iter of CompareReports.

    """
    # self.file is added so that CompareReports can masquerade as
    # RORPaths when in an iterator, and thus get pipelined.
    file = None

    def __init__(self, index, reason):
        # index: tuple of path components (bytes) relative to the base;
        # reason: human-readable description of how the files differ.
        self.index = index
        self.reason = reason