1#============================================================= -*-perl-*- 2# 3# BackupPC::PoolWrite package 4# 5# DESCRIPTION 6# 7# This library defines a BackupPC::PoolWrite class for writing 8# files to disk that are candidates for pooling. One instance 9# of this class is used to write each file. The following steps 10# are executed: 11# 12# - As the incoming data arrives, the first 1MB is buffered 13# in memory so the MD5 digest can be computed. 14# 15# - A running comparison against all the candidate pool files 16# (ie: those with the same MD5 digest, usually at most a single 17# file) is done as new incoming data arrives. Up to $MaxFiles 18# simultaneous files can be compared in parallel. This 19# involves reading and uncompressing one or more pool files. 20# 21# - When a pool file no longer matches it is discarded from 22# the search. If there are more than $MaxFiles candidates, one of 23# the new candidates is added to the search, first checking 24# that it matches up to the current point (this requires 25# re-reading one of the other pool files). 26# 27# - When or if no pool files match then the new file is written 28# to disk. This could occur many MB into the file. We don't 29# need to buffer all this data in memory since we can copy it 30# from the last matching pool file, up to the point where it 31# fully matched. 32# 33# - When all the new data is complete, if a pool file exactly 34# matches then the file is simply created as a hardlink to 35# the pool file. 36# 37# AUTHOR 38# Craig Barratt <cbarratt@users.sourceforge.net> 39# 40# COPYRIGHT 41# Copyright (C) 2001-2017 Craig Barratt 42# 43# This program is free software; you can redistribute it and/or modify 44# it under the terms of the GNU General Public License as published by 45# the Free Software Foundation; either version 2 of the License, or 46# (at your option) any later version. 47# 48# This program is distributed in the hope that it will be useful, 49# but WITHOUT ANY WARRANTY; without even the implied warranty of 50# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the 51# GNU General Public License for more details. 52# 53# You should have received a copy of the GNU General Public License 54# along with this program; if not, write to the Free Software 55# Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA 56# 57#======================================================================== 58# 59# Version 3.3.2, released 25 Jan 2017. 60# 61# See http://backuppc.sourceforge.net. 62# 63#======================================================================== 64 65package BackupPC::PoolWrite; 66 67use strict; 68 69use File::Path; 70use Digest::MD5; 71use BackupPC::FileZIO; 72 73sub new 74{ 75 my($class, $bpc, $fileName, $fileSize, $compress) = @_; 76 77 my $self = bless { 78 fileName => $fileName, 79 fileSize => $fileSize, 80 bpc => $bpc, 81 compress => $compress, 82 nWrite => 0, 83 digest => undef, 84 files => [], 85 fileCnt => -1, 86 fhOut => undef, 87 errors => [], 88 data => "", 89 eof => undef, 90 }, $class; 91 92 $self->{hardLinkMax} = $bpc->ConfValue("HardLinkMax"); 93 94 # 95 # Always unlink any current file in case it is already linked 96 # 97 unlink($fileName) if ( -f $fileName ); 98 if ( $fileName =~ m{(.*)/.+} && !-d $1 ) { 99 my $newDir = $1; 100 eval { mkpath($newDir, 0, 0777) }; 101 if ( $@ ) { 102 push(@{$self->{errors}}, "Unable to create directory $newDir for $self->{fileName}"); 103 } 104 } 105 return $self; 106} 107 108my $BufSize = 1048576; # 1MB or 2^20 109my $MaxFiles = 20; # max number of compare files open at one time 110 111sub write 112{ 113 my($a, $dataRef) = @_; 114 115 return if ( $a->{eof} ); 116 $a->{data} .= $$dataRef if ( defined($dataRef) ); 117 return if ( length($a->{data}) < $BufSize && defined($dataRef) ); 118 119 # 120 # Correct the fileSize if it is wrong (rsync might transfer 121 # a file whose length is different to the length sent with the 122 # file list if the file changes between the file list sending 123 # and the file sending). Here we only catch the case where 124 # we haven't computed the digest (ie: we have written no more 125 # than $BufSize). We catch the big file case below. 126 # 127 if ( !defined($dataRef) && !defined($a->{digest}) 128 && $a->{fileSize} != length($a->{data}) ) { 129 #my $newSize = length($a->{data}); 130 #print("Fixing file size from $a->{fileSize} to $newSize\n"); 131 $a->{fileSize} = length($a->{data}); 132 } 133 134 if ( !defined($a->{digest}) && length($a->{data}) > 0 ) { 135 # 136 # build a list of all the candidate matching files 137 # 138 my $md5 = Digest::MD5->new; 139 $a->{fileSize} = length($a->{data}) 140 if ( $a->{fileSize} < length($a->{data}) ); 141 $a->{digest} = $a->{bpc}->Buffer2MD5($md5, $a->{fileSize}, \$a->{data}); 142 if ( !defined($a->{base} = $a->{bpc}->MD52Path($a->{digest}, 143 $a->{compress})) ) { 144 push(@{$a->{errors}}, "Unable to get path from '$a->{digest}'" 145 . " for $a->{fileName}"); 146 } else { 147 while ( @{$a->{files}} < $MaxFiles ) { 148 my $fh; 149 my $fileName = $a->{fileCnt} < 0 ? $a->{base} 150 : "$a->{base}_$a->{fileCnt}"; 151 last if ( !-f $fileName ); 152 # 153 # Don't attempt to match pool files that already 154 # have too many hardlinks. Also, don't match pool 155 # files with only one link since starting in 156 # BackupPC v3.0, BackupPC_nightly could be running 157 # in parallel (and removing those files). This doesn't 158 # eliminate all possible race conditions, but just 159 # reduces the odds. Other design steps eliminate 160 # the remaining race conditions of linking vs 161 # removing. 162 # 163 if ( (stat(_))[3] >= $a->{hardLinkMax} 164 || (stat(_))[3] <= 1 165 || !defined($fh = BackupPC::FileZIO->open($fileName, 0, 166 $a->{compress})) ) { 167 $a->{fileCnt}++; 168 next; 169 } 170 push(@{$a->{files}}, { 171 name => $fileName, 172 fh => $fh, 173 }); 174 $a->{fileCnt}++; 175 } 176 } 177 # 178 # if there are no candidate files then we must write 179 # the new file to disk 180 # 181 if ( !@{$a->{files}} ) { 182 $a->{fhOut} = BackupPC::FileZIO->open($a->{fileName}, 183 1, $a->{compress}); 184 if ( !defined($a->{fhOut}) ) { 185 push(@{$a->{errors}}, "Unable to open $a->{fileName}" 186 . " for writing"); 187 } 188 } 189 } 190 my $dataLen = length($a->{data}); 191 if ( !defined($a->{fhOut}) && length($a->{data}) > 0 ) { 192 # 193 # See if the new chunk of data continues to match the 194 # candidate files. 195 # 196 for ( my $i = 0 ; $i < @{$a->{files}} ; $i++ ) { 197 my($d, $match); 198 my $fileName = $a->{fileCnt} < 0 ? $a->{base} 199 : "$a->{base}_$a->{fileCnt}"; 200 if ( $dataLen > 0 ) { 201 # verify next $dataLen bytes from candidate file 202 my $n = $a->{files}[$i]->{fh}->read(\$d, $dataLen); 203 next if ( $n == $dataLen && $d eq $a->{data} ); 204 } else { 205 # verify candidate file is at EOF 206 my $n = $a->{files}[$i]->{fh}->read(\$d, 100); 207 next if ( $n == 0 ); 208 } 209 #print(" File $a->{files}[$i]->{name} doesn't match\n"); 210 # 211 # this candidate file didn't match. Replace it 212 # with a new candidate file. We have to qualify 213 # any new candidate file by making sure that its 214 # first $a->{nWrite} bytes match, plus the next $dataLen 215 # bytes match $a->{data}. 216 # 217 while ( -f $fileName ) { 218 my $fh; 219 if ( (stat(_))[3] >= $a->{hardLinkMax} 220 || !defined($fh = BackupPC::FileZIO->open($fileName, 0, 221 $a->{compress})) ) { 222 $a->{fileCnt}++; 223 #print(" Discarding $fileName (open failed)\n"); 224 $fileName = "$a->{base}_$a->{fileCnt}"; 225 next; 226 } 227 if ( !$a->{files}[$i]->{fh}->rewind() ) { 228 push(@{$a->{errors}}, 229 "Unable to rewind $a->{files}[$i]->{name}" 230 . " for compare"); 231 } 232 $match = $a->filePartialCompare($a->{files}[$i]->{fh}, $fh, 233 $a->{nWrite}, $dataLen, \$a->{data}); 234 if ( $match ) { 235 $a->{files}[$i]->{fh}->close(); 236 $a->{files}[$i]->{fh} = $fh, 237 $a->{files}[$i]->{name} = $fileName; 238 #print(" Found new candidate $fileName\n"); 239 $a->{fileCnt}++; 240 last; 241 } else { 242 #print(" Discarding $fileName (no match)\n"); 243 } 244 $fh->close(); 245 $a->{fileCnt}++; 246 $fileName = "$a->{base}_$a->{fileCnt}"; 247 } 248 if ( !$match ) { 249 # 250 # We couldn't find another candidate file 251 # 252 if ( @{$a->{files}} == 1 ) { 253 #print(" Exhausted matches, now writing\n"); 254 $a->{fhOut} = BackupPC::FileZIO->open($a->{fileName}, 255 1, $a->{compress}); 256 if ( !defined($a->{fhOut}) ) { 257 push(@{$a->{errors}}, 258 "Unable to open $a->{fileName}" 259 . " for writing"); 260 } else { 261 if ( !$a->{files}[$i]->{fh}->rewind() ) { 262 push(@{$a->{errors}}, 263 "Unable to rewind" 264 . " $a->{files}[$i]->{name} for copy"); 265 } 266 $a->filePartialCopy($a->{files}[$i]->{fh}, $a->{fhOut}, 267 $a->{nWrite}); 268 } 269 } 270 $a->{files}[$i]->{fh}->close(); 271 splice(@{$a->{files}}, $i, 1); 272 $i--; 273 } 274 } 275 } 276 if ( defined($a->{fhOut}) && $dataLen > 0 ) { 277 # 278 # if we are in writing mode then just write the data 279 # 280 my $n = $a->{fhOut}->write(\$a->{data}); 281 if ( $n != $dataLen ) { 282 push(@{$a->{errors}}, "Unable to write $dataLen bytes to" 283 . " $a->{fileName} (got $n)"); 284 } 285 } 286 $a->{nWrite} += $dataLen; 287 $a->{data} = ""; 288 return if ( defined($dataRef) ); 289 290 # 291 # We are at EOF, so finish up 292 # 293 $a->{eof} = 1; 294 295 # 296 # Make sure the fileSize was correct. See above for comments about 297 # rsync. 298 # 299 if ( $a->{nWrite} != $a->{fileSize} ) { 300 # 301 # Oops, fileSize was wrong, so our MD5 digest was wrong and our 302 # effort to match files likely failed. This is ugly, but our 303 # only choice at this point is to re-write the entire file with 304 # the correct length. We need to rename the file, open it for 305 # reading, and then re-write the file with the correct length. 306 # 307 308 #print("Doing big file fixup ($a->{fileSize} != $a->{nWrite})\n"); 309 310 my($fh, $fileName); 311 $a->{fileSize} = $a->{nWrite}; 312 313 if ( defined($a->{fhOut}) ) { 314 if ( $a->{fileName} =~ /(.*)\// ) { 315 $fileName = $1; 316 } else { 317 $fileName = "."; 318 } 319 # 320 # Find a unique target temporary file name 321 # 322 my $i = 0; 323 while ( -f "$fileName/t$$.$i" ) { 324 $i++; 325 } 326 $fileName = "$fileName/t$$.$i"; 327 $a->{fhOut}->close(); 328 if ( !rename($a->{fileName}, $fileName) 329 || !defined($fh = BackupPC::FileZIO->open($fileName, 0, 330 $a->{compress})) ) { 331 push(@{$a->{errors}}, "Can't rename $a->{fileName} -> $fileName" 332 . " or open during size fixup"); 333 } 334 #print("Using temporary name $fileName\n"); 335 } elsif ( defined($a->{files}) && defined($a->{files}[0]) ) { 336 # 337 # We haven't written anything yet, so just use the 338 # compare file to copy from. 339 # 340 $fh = $a->{files}[0]->{fh}; 341 $fh->rewind; 342 #print("Using compare file $a->{files}[0]->{name}\n"); 343 } 344 if ( defined($fh) ) { 345 my $poolWrite = BackupPC::PoolWrite->new($a->{bpc}, $a->{fileName}, 346 $a->{fileSize}, $a->{compress}); 347 my $nRead = 0; 348 349 while ( $nRead < $a->{fileSize} ) { 350 my $thisRead = $a->{fileSize} - $nRead < $BufSize 351 ? $a->{fileSize} - $nRead : $BufSize; 352 my $data; 353 my $n = $fh->read(\$data, $thisRead); 354 if ( $n != $thisRead ) { 355 push(@{$a->{errors}}, 356 "Unable to read $thisRead bytes during resize" 357 . " from temp $fileName (got $n)"); 358 last; 359 } 360 $poolWrite->write(\$data); 361 $nRead += $thisRead; 362 } 363 $fh->close; 364 unlink($fileName) if ( defined($fileName) ); 365 if ( @{$a->{errors}} ) { 366 $poolWrite->close; 367 return (0, $a->{digest}, -s $a->{fileName}, $a->{errors}); 368 } else { 369 return $poolWrite->close; 370 } 371 } 372 } 373 374 if ( $a->{fileSize} == 0 ) { 375 # 376 # Simply create an empty file 377 # 378 local(*OUT); 379 if ( !open(OUT, ">", $a->{fileName}) ) { 380 push(@{$a->{errors}}, "Can't open $a->{fileName} for empty" 381 . " output"); 382 } else { 383 close(OUT); 384 } 385 # 386 # Close the compare files 387 # 388 foreach my $f ( @{$a->{files}} ) { 389 $f->{fh}->close(); 390 } 391 return (1, $a->{digest}, -s $a->{fileName}, $a->{errors}); 392 } elsif ( defined($a->{fhOut}) ) { 393 $a->{fhOut}->close(); 394 # 395 # Close the compare files 396 # 397 foreach my $f ( @{$a->{files}} ) { 398 $f->{fh}->close(); 399 } 400 return (0, $a->{digest}, -s $a->{fileName}, $a->{errors}); 401 } else { 402 if ( @{$a->{files}} == 0 ) { 403 push(@{$a->{errors}}, "Botch, no matches on $a->{fileName}" 404 . " ($a->{digest})"); 405 } elsif ( @{$a->{files}} > 1 ) { 406 # 407 # This is no longer a real error because $Conf{HardLinkMax} 408 # could be hit, thereby creating identical pool files 409 # 410 #my $str = "Unexpected multiple matches on" 411 # . " $a->{fileName} ($a->{digest})\n"; 412 #for ( my $i = 0 ; $i < @{$a->{files}} ; $i++ ) { 413 # $str .= " -> $a->{files}[$i]->{name}\n"; 414 #} 415 #push(@{$a->{errors}}, $str); 416 } 417 for ( my $i = 0 ; $i < @{$a->{files}} ; $i++ ) { 418 if ( link($a->{files}[$i]->{name}, $a->{fileName}) ) { 419 #print(" Linked $a->{fileName} to $a->{files}[$i]->{name}\n"); 420 # 421 # Close the compare files 422 # 423 foreach my $f ( @{$a->{files}} ) { 424 $f->{fh}->close(); 425 } 426 return (1, $a->{digest}, -s $a->{fileName}, $a->{errors}); 427 } 428 } 429 # 430 # We were unable to link to the pool. Either we're at the 431 # hardlink max, or the pool file got deleted. Recover by 432 # writing the matching file, since we still have an open 433 # handle. 434 # 435 for ( my $i = 0 ; $i < @{$a->{files}} ; $i++ ) { 436 if ( !$a->{files}[$i]->{fh}->rewind() ) { 437 push(@{$a->{errors}}, 438 "Unable to rewind $a->{files}[$i]->{name}" 439 . " for copy after link fail"); 440 next; 441 } 442 $a->{fhOut} = BackupPC::FileZIO->open($a->{fileName}, 443 1, $a->{compress}); 444 if ( !defined($a->{fhOut}) ) { 445 push(@{$a->{errors}}, 446 "Unable to open $a->{fileName}" 447 . " for writing after link fail"); 448 } else { 449 $a->filePartialCopy($a->{files}[$i]->{fh}, $a->{fhOut}, 450 $a->{nWrite}); 451 $a->{fhOut}->close; 452 } 453 last; 454 } 455 # 456 # Close the compare files 457 # 458 foreach my $f ( @{$a->{files}} ) { 459 $f->{fh}->close(); 460 } 461 return (0, $a->{digest}, -s $a->{fileName}, $a->{errors}); 462 } 463} 464 465# 466# Finish writing: pass undef dataRef to write so it can do all 467# the work. Returns a 4 element array: 468# 469# (existingFlag, digestString, outputFileLength, errorList) 470# 471sub close 472{ 473 my($a) = @_; 474 475 return $a->write(undef); 476} 477 478# 479# Abort a pool write 480# 481sub abort 482{ 483 my($a) = @_; 484 485 if ( defined($a->{fhOut}) ) { 486 $a->{fhOut}->close(); 487 unlink($a->{fileName}); 488 } 489 foreach my $f ( @{$a->{files}} ) { 490 $f->{fh}->close(); 491 } 492 $a->{files} = []; 493} 494 495# 496# Copy $nBytes from files $fhIn to $fhOut. 497# 498sub filePartialCopy 499{ 500 my($a, $fhIn, $fhOut, $nBytes) = @_; 501 my($nRead); 502 503 while ( $nRead < $nBytes ) { 504 my $thisRead = $nBytes - $nRead < $BufSize 505 ? $nBytes - $nRead : $BufSize; 506 my $data; 507 my $n = $fhIn->read(\$data, $thisRead); 508 if ( $n != $thisRead ) { 509 push(@{$a->{errors}}, 510 "Unable to read $thisRead bytes from " 511 . $fhIn->name . " (got $n)"); 512 return; 513 } 514 $n = $fhOut->write(\$data, $thisRead); 515 if ( $n != $thisRead ) { 516 push(@{$a->{errors}}, 517 "Unable to write $thisRead bytes to " 518 . $fhOut->name . " (got $n)"); 519 return; 520 } 521 $nRead += $thisRead; 522 } 523} 524 525# 526# Compare $nBytes from files $fh0 and $fh1, and also compare additional 527# $extra bytes from $fh1 to $$extraData. 528# 529sub filePartialCompare 530{ 531 my($a, $fh0, $fh1, $nBytes, $extra, $extraData) = @_; 532 my($nRead, $n); 533 my($data0, $data1); 534 535 while ( $nRead < $nBytes ) { 536 my $thisRead = $nBytes - $nRead < $BufSize 537 ? $nBytes - $nRead : $BufSize; 538 $n = $fh0->read(\$data0, $thisRead); 539 if ( $n != $thisRead ) { 540 push(@{$a->{errors}}, "Unable to read $thisRead bytes from " 541 . $fh0->name . " (got $n)"); 542 return; 543 } 544 $n = $fh1->read(\$data1, $thisRead); 545 return 0 if ( $n < $thisRead || $data0 ne $data1 ); 546 $nRead += $thisRead; 547 } 548 if ( $extra > 0 ) { 549 # verify additional bytes 550 $n = $fh1->read(\$data1, $extra); 551 return 0 if ( $n != $extra || $data1 ne $$extraData ); 552 } else { 553 # verify EOF 554 $n = $fh1->read(\$data1, 100); 555 return 0 if ( $n != 0 ); 556 } 557 return 1; 558} 559 560# 561# LinkOrCopy() does a hardlink from oldFile to newFile. 562# 563# If that fails (because there are too many links on oldFile) 564# then oldFile is copied to newFile, and the pool stats are 565# returned to be added to the new file list. That allows 566# BackupPC_link to try again, and to create a new pool file 567# if necessary. 568# 569sub LinkOrCopy 570{ 571 my($bpc, $oldFile, $oldFileComp, $newFile, $newFileComp) = @_; 572 my($nRead, $data); 573 574 unlink($newFile) if ( -f $newFile ); 575 # 576 # Try to link if hardlink limit is ok, and compression types 577 # are the same 578 # 579 return (1, undef) if ( (stat($oldFile))[3] < $bpc->{Conf}{HardLinkMax} 580 && !$oldFileComp == !$newFileComp 581 && link($oldFile, $newFile) ); 582 # 583 # There are too many links on oldFile, or compression 584 # type if different, so now we have to copy it. 585 # 586 # We need to compute the file size, which is expensive 587 # since we need to read the file twice. That's probably 588 # ok since the hardlink limit is rarely hit. 589 # 590 my $readFd = BackupPC::FileZIO->open($oldFile, 0, $oldFileComp); 591 if ( !defined($readFd) ) { 592 return (0, undef, undef, undef, ["LinkOrCopy: can't open $oldFile"]); 593 } 594 while ( $readFd->read(\$data, $BufSize) > 0 ) { 595 $nRead += length($data); 596 } 597 $readFd->rewind(); 598 599 my $poolWrite = BackupPC::PoolWrite->new($bpc, $newFile, 600 $nRead, $newFileComp); 601 while ( $readFd->read(\$data, $BufSize) > 0 ) { 602 $poolWrite->write(\$data); 603 } 604 my($exists, $digest, $outSize, $errs) = $poolWrite->close; 605 606 return ($exists, $digest, $nRead, $outSize, $errs); 607} 608 6091; 610