package Tie::File;

use strict;
use warnings;

use Carp ':DEFAULT', 'confess';
use POSIX 'SEEK_SET';
use Fcntl 'O_CREAT', 'O_RDWR', 'LOCK_EX', 'LOCK_SH', 'O_WRONLY', 'O_RDONLY';

# Mask that isolates the access-mode bits of an open() mode value;
# used below to detect a read-only handle.
use constant O_ACCMODE => O_RDONLY | O_RDWR | O_WRONLY;


our $VERSION = "1.09";
my $DEFAULT_MEMORY_SIZE = 1<<21;                    # 2 megabytes
my $DEFAULT_AUTODEFER_THRESHHOLD = 3;               # 3 records
my $DEFAULT_AUTODEFER_FILELEN_THRESHHOLD = 65536;   # 16 disk blocksful

# Recognized TIEARRAY options, accepted with or without a leading '-'.
my %good_opt = map {$_ => 1, "-$_" => 1}
    qw(memory dw_size mode recsep discipline
       autodefer autochomp autodefer_threshhold concurrent);

our $DIAGNOSTIC = 0;
our @OFF;   # used as a temporary alias in some subroutines.
our @H;     # used as a temporary alias in _annotate_ad_history

# tie() entry point:  tie @array, 'Tie::File', $file, %options
# $file may be a filename or an already-open filehandle (glob ref).
# Returns the blessed object on success; returns nothing if sysopen fails.
sub TIEARRAY {
    if (@_ % 2 != 0) {
        croak "usage: tie \@array, $_[0], filename, [option => value]...";
    }
    my ($pack, $file, %opts) = @_;

    # transform '-foo' keys into 'foo' keys
    for my $key (keys %opts) {
        unless ($good_opt{$key}) {
            croak("$pack: Unrecognized option '$key'\n");
        }
        my $okey = $key;
        if ($key =~ s/^-+//) {
            $opts{$key} = delete $opts{$okey};
        }
    }

    if ($opts{concurrent}) {
        croak("$pack: concurrent access not supported yet\n");
    }

    unless (defined $opts{memory}) {
        # default is the larger of the default cache size and the
        # deferred-write buffer size (if specified)
        $opts{memory} = $DEFAULT_MEMORY_SIZE;
        $opts{memory} = $opts{dw_size}
            if defined $opts{dw_size} && $opts{dw_size} > $DEFAULT_MEMORY_SIZE;
        # Dora Winifred Read
    }
    $opts{dw_size} = $opts{memory} unless defined $opts{dw_size};
    if ($opts{dw_size} > $opts{memory}) {
        croak("$pack: dw_size may not be larger than total memory allocation\n");
    }

    # are we in deferred-write mode?
    $opts{defer} = 0 unless defined $opts{defer};
    $opts{deferred} = {};       # no records are presently deferred
    $opts{deferred_s} = 0;      # count of total bytes in ->{deferred}
    $opts{deferred_max} = -1;   # empty

    # What's a good way to arrange that this class can be overridden?
    $opts{cache} = Tie::File::Cache->new($opts{memory});

    # autodeferment is enabled by default
    $opts{autodefer} = 1 unless defined $opts{autodefer};
    $opts{autodeferring} = 0;   # but is not initially active
    $opts{ad_history} = [];
    $opts{autodefer_threshhold} = $DEFAULT_AUTODEFER_THRESHHOLD
        unless defined $opts{autodefer_threshhold};
    $opts{autodefer_filelen_threshhold} = $DEFAULT_AUTODEFER_FILELEN_THRESHHOLD
        unless defined $opts{autodefer_filelen_threshhold};

    $opts{offsets} = [0];
    # BUG FIX: this read "$opts(unknown) = $file", which is not valid Perl;
    # the file argument must be recorded under the 'filename' key.
    $opts{filename} = $file;
    unless (defined $opts{recsep}) {
        $opts{recsep} = _default_recsep();
    }
    $opts{recseplen} = length($opts{recsep});
    if ($opts{recseplen} == 0) {
        croak "Empty record separator not supported by $pack";
    }

    $opts{autochomp} = 1 unless defined $opts{autochomp};

    $opts{mode} = O_CREAT|O_RDWR unless defined $opts{mode};
    $opts{rdonly} = (($opts{mode} & O_ACCMODE) == O_RDONLY);
    $opts{sawlastrec} = undef;

    my $fh;

    if (UNIVERSAL::isa($file, 'GLOB')) {
        # We use 1 here on the theory that some systems
        # may not indicate failure if we use 0.
        # MSWin32 does not indicate failure with 0, but I don't know if
        # it will indicate failure with 1 or not.
        unless (seek $file, 1, SEEK_SET) {
            croak "$pack: your filehandle does not appear to be seekable";
        }
        seek $file, 0, SEEK_SET;    # put it back
        $fh = $file;                # setting binmode is the user's problem
    } elsif (ref $file) {
        croak "usage: tie \@array, $pack, filename, [option => value]...";
    } else {
        # We opened the handle ourselves, so we are responsible for
        # closing it in DESTROY; 'ourfh' records that fact.
        sysopen $fh, $file, $opts{mode}, 0666 or return;
        binmode $fh;
        ++$opts{ourfh};
    }
    { my $ofh = select $fh; $| = 1; select $ofh }   # autoflush on write
    if (defined $opts{discipline}) {
        # This eval() is intended to fail on pre-layer perls
        eval { binmode($fh, $opts{discipline}) };
        croak $@ if $@ =~ /Unknown discipline|IO layers .* unavailable/;
        die if $@;
    }
    $opts{fh} = $fh;

    bless \%opts => $pack;
}

# FETCH($n): return record #$n, consulting the deferred-write buffer
# first, then the cache/file.  Chomps the record separator if autochomp.
sub FETCH {
    my ($self, $n) = @_;
    my $rec;

    # check the defer buffer
    $rec = $self->{deferred}{$n} if exists $self->{deferred}{$n};
    $rec = $self->_fetch($n) unless defined $rec;

    # inlined _chomp1
    substr($rec, - $self->{recseplen}) = ""
        if defined $rec && $self->{autochomp};
    $rec;
}

# Chomp many records in-place; return nothing useful
sub _chomp {
    my $self = shift;
    return unless $self->{autochomp};
    if ($self->{autochomp}) {
        for (@_) {
            next unless defined;
            substr($_, - $self->{recseplen}) = "";
        }
    }
}

# Chomp one record in-place; return modified record
sub _chomp1 {
    my ($self, $rec) = @_;
    return $rec unless $self->{autochomp};
    return unless defined $rec;
    substr($rec, - $self->{recseplen}) = "";
    $rec;
}

# Fetch record #$n from the cache if present, otherwise from the file
# (populating the offsets table as needed).  Returns undef if there is
# no such record.  Records read while flushing are not cached.
sub _fetch {
    my ($self, $n) = @_;

    # check the record cache
    { my $cached = $self->{cache}->lookup($n);
      return $cached if defined $cached;
    }

    if ($#{$self->{offsets}} < $n) {
        return if $self->{eof};    # request for record beyond end of file
        my $o = $self->_fill_offsets_to($n);
        # If it's still undefined, there is no such record, so return 'undef'
        return unless defined $o;
    }

    # (A vestigial "my $fh = $self->{FH}" was removed here: the lexical was
    # never used, and 'FH' is not even the key the handle is stored under.)
    $self->_seek($n);    # we can do this now that offsets is populated
    my $rec = $self->_read_record;

# If we happen to have just read the first record, check to see if
# the length of the record matches what 'tell' says.  If not, Tie::File
# won't work, and should drop dead.
#
#  if ($n == 0 && defined($rec) && tell($self->{fh}) != length($rec)) {
#    if (defined $self->{discipline}) {
#      croak "I/O discipline $self->{discipline} not supported";
#    } else {
#      croak "File encoding not supported";
#    }
#  }

    $self->{cache}->insert($n, $rec) if defined $rec && not $self->{flushing};
    $rec;
}

# STORE($n, $rec): replace record #$n.  May defer the write, extend the
# file for out-of-range $n, and updates offsets table and read cache.
sub STORE {
    my ($self, $n, $rec) = @_;
    die "STORE called from _check_integrity!" if $DIAGNOSTIC;

    $self->_fixrecs($rec);

    if ($self->{autodefer}) {
        $self->_annotate_ad_history($n);
    }

    return $self->_store_deferred($n, $rec) if $self->_is_deferring;


    # We need this to decide whether the new record will fit
    # It incidentally populates the offsets table
    # Note we have to do this before we alter the cache
    # 20020324 Wait, but this DOES alter the cache.  TODO BUG?
    my $oldrec = $self->_fetch($n);

    if (not defined $oldrec) {
        # We're storing a record beyond the end of the file
        $self->_extend_file_to($n+1);
        $oldrec = $self->{recsep};
    }
#  return if $oldrec eq $rec;    # don't bother
    # (An unused "$len_diff" computation was removed here; _mtwrite and
    # _oadjust derive the length change themselves.)

    # length($oldrec) here is not consistent with text mode  TODO XXX BUG
    $self->_mtwrite($rec, $self->{offsets}[$n], length($oldrec));
    $self->_oadjust([$n, 1, $rec]);
    $self->{cache}->update($n, $rec);
}

# Record a pending write for record #$n in the deferred-write buffer,
# flushing the buffer (or trimming the cache) if memory limits are hit.
sub _store_deferred {
    my ($self, $n, $rec) = @_;
    $self->{cache}->remove($n);
    my $old_deferred = $self->{deferred}{$n};

    if (defined $self->{deferred_max} && $n > $self->{deferred_max}) {
        $self->{deferred_max} = $n;
    }
    $self->{deferred}{$n} = $rec;

    my $len_diff = length($rec);
    $len_diff -= length($old_deferred) if defined $old_deferred;
    $self->{deferred_s} += $len_diff;
    $self->{cache}->adj_limit(-$len_diff);
    if ($self->{deferred_s} > $self->{dw_size}) {
        $self->_flush;
    } elsif ($self->_cache_too_full) {
        $self->_cache_flush;
    }
}

# Remove a single record from the deferred-write buffer without writing it
# The record need not be present
sub _delete_deferred {
    my ($self, $n) = @_;
    my $rec = delete $self->{deferred}{$n};
    return unless defined $rec;

    if (defined $self->{deferred_max}
        && $n == $self->{deferred_max}) {
        # deferred_max is now unknown; _defer_max recomputes it lazily
        undef $self->{deferred_max};
    }

    $self->{deferred_s} -= length $rec;
    $self->{cache}->adj_limit(length $rec);
}

# Number of records in the array: the larger of the record count in the
# file and (highest deferred record number + 1).
sub FETCHSIZE {
    my $self = shift;
    my $n = $self->{eof} ? $#{$self->{offsets}} : $self->_fill_offsets;

    my $top_deferred = $self->_defer_max;
    $n = $top_deferred+1 if defined $top_deferred && $n < $top_deferred+1;
    $n;
}

# STORESIZE($len): grow (with empty records) or truncate the array,
# keeping the deferred buffer, offsets table, and cache consistent.
sub STORESIZE {
    my ($self, $len) = @_;

    if ($self->{autodefer}) {
        $self->_annotate_ad_history('STORESIZE');
    }

    my $olen = $self->FETCHSIZE;
    return if $len == $olen;    # Woo-hoo!

    # file gets longer
    if ($len > $olen) {
        if ($self->_is_deferring) {
            for ($olen .. $len-1) {
                $self->_store_deferred($_, $self->{recsep});
            }
        } else {
            $self->_extend_file_to($len);
        }
        return;
    }

    # file gets shorter
    if ($self->_is_deferring) {
        # TODO maybe replace this with map-plus-assignment?
        for (grep $_ >= $len, keys %{$self->{deferred}}) {
            $self->_delete_deferred($_);
        }
        $self->{deferred_max} = $len-1;
    }

    $self->_seek($len);
    $self->_chop_file;
    $#{$self->{offsets}} = $len;
#  $self->{offsets}[0] = 0;      # in case we just chopped this

    $self->{cache}->remove(grep $_ >= $len, $self->{cache}->ckeys);
}

### OPTIMIZE ME
### It should not be necessary to do FETCHSIZE
### Just seek to the end of the file.
# Append @_ to the end of the array.
sub PUSH {
    my $self = shift;
    $self->SPLICE($self->FETCHSIZE, scalar(@_), @_);

    # No need to return:
    # $self->FETCHSIZE;    # because av.c takes care of this for me
}

# Remove and return the last record; returns nothing if the array is empty.
sub POP {
    my $self = shift;
    my $size = $self->FETCHSIZE;
    return if $size == 0;
#  print STDERR "# POPPITY POP POP POP\n";
    scalar $self->SPLICE($size-1, 1);
}

# Remove and return the first record.
sub SHIFT {
    my $self = shift;
    scalar $self->SPLICE(0, 1);
}

# Insert @_ at the front of the array.
sub UNSHIFT {
    my $self = shift;
    $self->SPLICE(0, 0, @_);
    # $self->FETCHSIZE;    # av.c takes care of this for me
}

# Discard all records: truncate the file and reset the cache, offsets
# table, and deferred-write buffer to their initial states.
sub CLEAR {
    my $self = shift;

    if ($self->{autodefer}) {
        $self->_annotate_ad_history('CLEAR');
    }

    $self->_seekb(0);
    $self->_chop_file;
    $self->{cache}->set_limit($self->{memory});
    $self->{cache}->empty;
    @{$self->{offsets}} = (0);
    %{$self->{deferred}}= ();
    $self->{deferred_s} = 0;
    $self->{deferred_max} = -1;
}

# Pre-extend the array to $n records (invoked for e.g. "$#a = $n").
sub EXTEND {
    my ($self, $n) = @_;

    # No need to pre-extend anything in this case
    return if $self->_is_deferring;

    $self->_fill_offsets_to($n);
    $self->_extend_file_to($n);
}

# delete $a[$n]: return the old record.  Only the very last record is
# physically removed from the file; interior deletes store "" instead.
sub DELETE {
    my ($self, $n) = @_;

    if ($self->{autodefer}) {
        $self->_annotate_ad_history('DELETE');
    }

    my $lastrec = $self->FETCHSIZE-1;
    my $rec = $self->FETCH($n);
    $self->_delete_deferred($n) if $self->_is_deferring;
    if ($n == $lastrec) {
        $self->_seek($n);
        $self->_chop_file;
        $#{$self->{offsets}}--;
        $self->{cache}->remove($n);
        # perhaps in this case I should also remove trailing null records?
        # 20020316
        # Note that delete @a[-3..-1] deletes the records in the wrong order,
        # so we only chop the very last one out of the file.  We could repair this
        # by tracking deleted records inside the object.
    } elsif ($n < $lastrec) {
        $self->STORE($n, "");
    }
    $rec;
}

# exists $a[$n]: true if $n is deferred or within the current array size.
sub EXISTS {
    my ($self, $n) = @_;
    return 1 if exists $self->{deferred}{$n};
    $n < $self->FETCHSIZE;
}

# splice() entry point; flushes deferred writes first, then delegates to
# _splice and chomps the results per the autochomp setting.
sub SPLICE {
    my $self = shift;

    if ($self->{autodefer}) {
        $self->_annotate_ad_history('SPLICE');
    }

    $self->_flush if $self->_is_deferring;    # move this up?
    if (wantarray) {
        $self->_chomp(my @a = $self->_splice(@_));
        @a;
    } else {
        $self->_chomp1(scalar $self->_splice(@_));
    }
}

# Flush pending writes, break the cache's circular reference, and close
# the filehandle if we opened it ourselves.
sub DESTROY {
    my $self = shift;
    $self->flush if $self->_is_deferring;
    $self->{cache}->delink if defined $self->{cache};    # break circular link
    if ($self->{fh} and $self->{ourfh}) {
        delete $self->{ourfh};
        close delete $self->{fh};
    }
}

# Core splice implementation: normalizes $pos/$nrecs the way perl does
# for ordinary arrays, rewrites the affected byte range in one _mtwrite
# call, then repairs the offsets table and read cache.  Returns the
# removed records (with separators still attached).
sub _splice {
    my ($self, $pos, $nrecs, @data) = @_;
    my @result;

    $pos = 0 unless defined $pos;

    # Deal with negative and other out-of-range positions
    # Also set default for $nrecs
    {
        my $oldsize = $self->FETCHSIZE;
        $nrecs = $oldsize unless defined $nrecs;
        my $oldpos = $pos;

        if ($pos < 0) {
            $pos += $oldsize;
            if ($pos < 0) {
                croak "Modification of non-creatable array value attempted, " .
                      "subscript $oldpos";
            }
        }

        if ($pos > $oldsize) {
            return unless @data;
            $pos = $oldsize;    # This is what perl does for normal arrays
        }

        # The manual is very unclear here
        if ($nrecs < 0) {
            $nrecs = $oldsize - $pos + $nrecs;
            $nrecs = 0 if $nrecs < 0;
        }

        # nrecs is too big---it really means "until the end"
        # 20030507
        if ($nrecs + $pos > $oldsize) {
            $nrecs = $oldsize - $pos;
        }
    }

    $self->_fixrecs(@data);
    my $data = join '', @data;
    my $datalen = length $data;
    my $oldlen = 0;

    # compute length of data being removed
    for ($pos .. $pos+$nrecs-1) {
        last unless defined $self->_fill_offsets_to($_);
        my $rec = $self->_fetch($_);
        last unless defined $rec;
        push @result, $rec;

        # Why don't we just use length($rec) here?
        # Because that record might have come from the cache.  _splice
        # might have been called to flush out the deferred-write records,
        # and in this case length($rec) is the length of the record to be
        # *written*, not the length of the actual record in the file.  But
        # the offsets are still true. 20020322
        $oldlen += $self->{offsets}[$_+1] - $self->{offsets}[$_]
            if defined $self->{offsets}[$_+1];
    }
    $self->_fill_offsets_to($pos+$nrecs);

    # Modify the file
    $self->_mtwrite($data, $self->{offsets}[$pos], $oldlen);
    # Adjust the offsets table
    $self->_oadjust([$pos, $nrecs, @data]);

    { # Take this read cache stuff out into a separate function
        # You made a half-attempt to put it into _oadjust.
        # Finish something like that up eventually.
        # STORE also needs to do something similarish

        # update the read cache, part 1
        # modified records
        for ($pos .. $pos+$nrecs-1) {
            my $new = $data[$_-$pos];
            if (defined $new) {
                $self->{cache}->update($_, $new);
            } else {
                $self->{cache}->remove($_);
            }
        }

        # update the read cache, part 2
        # moved records - records past the site of the change
        # need to be renumbered
        # Maybe merge this with the previous block?
        {
            my @oldkeys = grep $_ >= $pos + $nrecs, $self->{cache}->ckeys;
            my @newkeys = map $_-$nrecs+@data, @oldkeys;
            $self->{cache}->rekey(\@oldkeys, \@newkeys);
        }

        # Now there might be too much data in the cache, if we spliced out
        # some short records and spliced in some long ones.  If so, flush
        # the cache.
        $self->_cache_flush;
    }

    # Yes, the return value of 'splice' *is* actually this complicated
    wantarray ? @result : @result ? $result[-1] : undef;
}


# write data into the file
# $data is the data to be written.
# it should be written at position $pos, and should overwrite
# exactly $len of the following bytes.
# Note that if length($data) > $len, the subsequent bytes will have to
# be moved up, and if length($data) < $len, they will have to
# be moved down
sub _twrite {
    my ($self, $data, $pos, $len) = @_;

    unless (defined $pos) {
        die "\$pos was undefined in _twrite";
    }

    my $len_diff = length($data) - $len;

    if ($len_diff == 0) {    # Woo-hoo!
        my $fh = $self->{fh};
        $self->_seekb($pos);
        $self->_write_record($data);
        return;    # well, that was easy.
    }

    # the two records are of different lengths
    # our strategy here: rewrite the tail of the file,
    # reading ahead one buffer at a time
    # $bufsize is required to be at least as large as the data we're overwriting
    my $bufsize = _bufsize($len_diff);
    my ($writepos, $readpos) = ($pos, $pos+$len);
    my $next_block;
    my $more_data;

    # Seems like there ought to be a way to avoid the repeated code
    # and the special case here.  The read(1) is also a little weird.
    # Think about this.
    do {
        $self->_seekb($readpos);
        my $br = read $self->{fh}, $next_block, $bufsize;
        # probe one extra byte to learn whether more data follows
        $more_data = read $self->{fh}, my($dummy), 1;
        $self->_seekb($writepos);
        $self->_write_record($data);
        $readpos += $br;
        $writepos += length $data;
        $data = $next_block;
    } while $more_data;
    $self->_seekb($writepos);
    $self->_write_record($next_block);

    # There might be leftover data at the end of the file
    $self->_chop_file if $len_diff < 0;
}

# _iwrite(D, S, E)
# Insert text D at position S.
# Let C = E-S-|D|.  If C < 0; die.
# Data in [S,S+C) is copied to [S+D,S+D+C) = [S+D,E).
# Data in [S+C = E-D, E) is returned.  Data in [E, oo) is untouched.
#
# In a later version, don't read the entire intervening area into
# memory at once; do the copying block by block.
sub _iwrite {
    my $self = shift;
    my ($D, $s, $e) = @_;
    my $d = length $D;
    my $c = $e-$s-$d;    # bytes of [s,e) that survive the insertion
    local *FH = $self->{fh};
    confess "Not enough space to insert $d bytes between $s and $e"
        if $c < 0;
    confess "[$s,$e) is an invalid insertion range" if $e < $s;

    $self->_seekb($s);
    read FH, my $buf, $e-$s;

    # keep the surviving prefix after the inserted data; the displaced
    # tail remains in $buf and is returned to the caller
    $D .= substr($buf, 0, $c, "");

    $self->_seekb($s);
    $self->_write_record($D);

    return $buf;
}

# Like _twrite, but the data-pos-len triple may be repeated; you may
# write several chunks.  All the writing will be done in
# one pass.   Chunks SHALL be in ascending order and SHALL NOT overlap.
sub _mtwrite {
    my $self = shift;
    my $unwritten = "";    # data displaced by a previous segment, still pending
    my $delta = 0;         # net displacement of file contents so far

    @_ % 3 == 0
        or die "Arguments to _mtwrite did not come in groups of three";

    while (@_) {
        my ($data, $pos, $len) = splice @_, 0, 3;
        my $end = $pos + $len;    # The OLD end of the segment to be replaced
        $data = $unwritten . $data;
        $delta -= length($unwritten);
        $unwritten = "";
        $pos += $delta;           # This is where the data goes now
        my $dlen = length $data;
        $self->_seekb($pos);
        if ($len >= $dlen) {      # the data will fit
            $self->_write_record($data);
            $delta += ($dlen - $len);    # everything following moves down by this much
            $data = "";    # All the data in the buffer has been written
        } else {                  # won't fit
            my $writable = substr($data, 0, $len - $delta, "");
            $self->_write_record($writable);
            $delta += ($dlen - $len);    # everything following moves down by this much
        }

        # At this point we've written some but maybe not all of the data.
        # There might be a gap to close up, or $data might still contain a
        # bunch of unwritten data that didn't fit.
        my $ndlen = length $data;
        if ($delta == 0) {
            $self->_write_record($data);
        } elsif ($delta < 0) {
            # upcopy (close up gap)
            if (@_) {
                # only copy up to the start of the next segment
                $self->_upcopy($end, $end + $delta, $_[1] - $end);
            } else {
                $self->_upcopy($end, $end + $delta);
            }
        } else {
            # downcopy (insert data that didn't fit; replace this data in memory
            # with _later_ data that doesn't fit)
            if (@_) {
                $unwritten = $self->_downcopy($data, $end, $_[1] - $end);
            } else {
                # Make the file longer to accommodate the last segment that doesn't
                $unwritten = $self->_downcopy($data, $end);
            }
        }
    }
}

# Copy block of data of length $len from position $spos to position $dpos
# $dpos must be <= $spos
#
# If $len is undefined, go all the way to the end of the file
# and then truncate it ($spos - $dpos bytes will be removed)
sub _upcopy {
    my $blocksize = 8192;
    my ($self, $spos, $dpos, $len) = @_;
    if ($dpos > $spos) {
        die "source ($spos) was upstream of destination ($dpos) in _upcopy";
    } elsif ($dpos == $spos) {
        return;
    }

    while (! defined ($len) || $len > 0) {
        my $readsize = ! defined($len) ? $blocksize
                     : $len > $blocksize ? $blocksize
                     : $len;

        my $fh = $self->{fh};
        $self->_seekb($spos);
        my $bytes_read = read $fh, my($data), $readsize;
        $self->_seekb($dpos);
        if ($data eq "") {
            # reached end of file; trim off the now-unused tail
            $self->_chop_file;
            last;
        }
        $self->_write_record($data);
        $spos += $bytes_read;
        $dpos += $bytes_read;
        $len -= $bytes_read if defined $len;
    }
}

# Write $data into a block of length $len at position $pos,
# moving everything in the block forwards to make room.
# Instead of writing the last length($data) bytes from the block
# (because there isn't room for them any longer) return them.
#
# Undefined $len means 'until the end of the file'
sub _downcopy {
    my $blocksize = 8192;
    my ($self, $data, $pos, $len) = @_;
    my $fh = $self->{fh};

    while (! defined $len || $len > 0) {
        my $readsize = ! defined($len) ? $blocksize
                     : $len > $blocksize? $blocksize : $len;
        $self->_seekb($pos);
        read $fh, my($old), $readsize;
        my $last_read_was_short = length($old) < $readsize;
        $data .= $old;
        my $writable;
        if ($last_read_was_short) {
            # If last read was short, then $data now contains the entire rest
            # of the file, so there's no need to write only one block of it
            $writable = $data;
            $data = "";
        } else {
            $writable = substr($data, 0, $readsize, "");
        }
        last if $writable eq "";
        $self->_seekb($pos);
        $self->_write_record($writable);
        last if $last_read_was_short && $data eq "";
        $len -= $readsize if defined $len;
        $pos += $readsize;
    }
    return $data;    # the displaced bytes that no longer fit in [pos, pos+len)
}

# Adjust the object data structures following an '_mtwrite'
# Arguments are
#  [$pos, $nrecs, @length]  items
# indicating that $nrecs records were removed at $recpos (a record offset)
# and replaced with records of length @length...
# Arguments guarantee that $recpos is strictly increasing.
# No return value
sub _oadjust {
    my $self = shift;
    my $delta = 0;         # running byte displacement caused by earlier batches
    my $delta_recs = 0;    # running change in record count
    my $prev_end = -1;     # index of the last record replaced so far

    for (@_) {
        my ($pos, $nrecs, @data) = @$_;
        $pos += $delta_recs;

        # Adjust the offsets of the records after the previous batch up
        # to the first new one of this batch
        for my $i ($prev_end+2 .. $pos - 1) {
            $self->{offsets}[$i] += $delta;
        }

        $prev_end = $pos + @data - 1;    # last record moved on this pass

        # Remove the offsets for the removed records;
        # replace with the offsets for the inserted records
        my @newoff = ($self->{offsets}[$pos] + $delta);
        for my $i (0 .. $#data) {
            my $newlen = length $data[$i];
            push @newoff, $newoff[$i] + $newlen;
            $delta += $newlen;
        }

        for my $i ($pos .. $pos+$nrecs-1) {
            last if $i+1 > $#{$self->{offsets}};
            my $oldlen = $self->{offsets}[$i+1] - $self->{offsets}[$i];
            $delta -= $oldlen;
        }

        # replace old offsets with new
        splice @{$self->{offsets}}, $pos, $nrecs+1, @newoff;
        # What if we just spliced out the end of the offsets table?
        # shouldn't we clear $self->{eof}?  Test for this XXX BUG TODO

        $delta_recs += @data - $nrecs;    # net change in total number of records
    }

    # The trailing records at the very end of the file
    if ($delta) {
        for my $i ($prev_end+2 .. $#{$self->{offsets}}) {
            $self->{offsets}[$i] += $delta;
        }
    }

    # If we scrubbed out all known offsets, regenerate the trivial table
    # that knows that the file does indeed start at 0.
    $self->{offsets}[0] = 0 unless @{$self->{offsets}};
    # If the file got longer, the offsets table is no longer complete
    # $self->{eof} = 0 if $delta_recs > 0;

    # Now there might be too much data in the cache, if we spliced out
    # some short records and spliced in some long ones.  If so, flush
    # the cache.
    $self->_cache_flush;
}

# If a record does not already end with the appropriate terminator
# string, append one.
sub _fixrecs {
    my $self = shift;
    for (@_) {
        # undef records become empty, and every record gets a terminator
        $_ = "" unless defined $_;
        $_ .= $self->{recsep}
            unless substr($_, - $self->{recseplen}) eq $self->{recsep};
    }
}


################################################################
#
# Basic read, write, and seek
#

# seek to the beginning of record #$n
# Assumes that the offsets table is already correctly populated
#
# Note that $n=-1 has a special meaning here: It means the start of
# the last known record; this may or may not be the very last record
# in the file, depending on whether the offsets table is fully populated.
#
sub _seek {
    my ($self, $n) = @_;
    my $o = $self->{offsets}[$n];
    defined($o)
        or confess("logic error: undefined offset for record $n");
    seek $self->{fh}, $o, SEEK_SET
        or confess "Couldn't seek filehandle: $!";    # "Should never happen."
}

# seek to byte $b in the file
sub _seekb {
    my ($self, $b) = @_;
    seek $self->{fh}, $b, SEEK_SET
        or die "Couldn't seek filehandle: $!";    # "Should never happen."
}

# populate the offsets table up to the beginning of record $n
# return the offset of record $n
sub _fill_offsets_to {
    my ($self, $n) = @_;

    # once eof is known, the table is complete; just index into it
    return $self->{offsets}[$n] if $self->{eof};

    my $fh = $self->{fh};
    local *OFF = $self->{offsets};
    my $rec;

    until ($#OFF >= $n) {
        $self->_seek(-1);    # tricky -- see comment at _seek
        $rec = $self->_read_record;
        if (defined $rec) {
            push @OFF, int(tell $fh);    # Tels says that int() saves memory here
        } else {
            $self->{eof} = 1;
            return;    # It turns out there is no such record
        }
    }

    # we have now read all the records up to record n-1,
    # so we can return the offset of record n
    $OFF[$n];
}

# Read to end of file, populating the entire offsets table.
# Returns the index of the last known record.
sub _fill_offsets {
    my ($self) = @_;

    my $fh = $self->{fh};
    local *OFF = $self->{offsets};

    $self->_seek(-1);    # tricky -- see comment at _seek

    # Tels says that inlining read_record() would make this loop
    # five times faster. 20030508
    while ( defined $self->_read_record()) {
        # int() saves us memory here
        push @OFF, int(tell $fh);
    }

    $self->{eof} = 1;
    $#OFF;
}

# assumes that $rec is already suitably terminated
sub _write_record {
    my ($self, $rec) = @_;
    my $fh = $self->{fh};
    local $\ = "";    # disable any global output record separator
    print $fh $rec
        or die "Couldn't write record: $!";    # "Should never happen."
#  $self->{_written} += length($rec);
}

# Read one record (up to and including the record separator) from the
# current file position.  A final record missing its separator is
# repaired in the file (unless read-only) and in the returned value.
sub _read_record {
    my $self = shift;
    my $rec;
    { local $/ = $self->{recsep};
      my $fh = $self->{fh};
      $rec = <$fh>;
    }
    return unless defined $rec;
    if (substr($rec, -$self->{recseplen}) ne $self->{recsep}) {
        # improperly terminated final record --- quietly fix it.
#    my $ac = substr($rec, -$self->{recseplen});
#    $ac =~ s/\n/\\n/g;
        $self->{sawlastrec} = 1;
        unless ($self->{rdonly}) {
            local $\ = "";
            my $fh = $self->{fh};
            print $fh $self->{recsep};
        }
        $rec .= $self->{recsep};
    }
#  $self->{_read} += length($rec) if defined $rec;
    $rec;
}

# Debugging aid: return the (commented-out) read/write byte counters.
sub _rw_stats {
    my $self = shift;
    @{$self}{'_read', '_written'};
}

################################################################
#
# Read cache management

# Shrink the read cache so that cache + deferred buffer fit in {memory}.
sub _cache_flush {
    my ($self) = @_;
    $self->{cache}->reduce_size_to($self->{memory} - $self->{deferred_s});
}

# True if cache + deferred buffer have reached the memory limit.
sub _cache_too_full {
    my $self = shift;
    $self->{cache}->bytes + $self->{deferred_s} >= $self->{memory};
}

################################################################
#
# File custodial services
#


# We have read to the end of the file and have the offsets table
# entirely populated.  Now we need to write a new record beyond
# the end of the file.  We prepare for this by writing
# empty records into the file up to the position we want
#
# assumes that the offsets table already contains the offset of record $n,
# if it exists, and extends to the end of the file if not.
sub _extend_file_to {
    my ($self, $n) = @_;
    $self->_seek(-1);    # position after the end of the last record
    my $pos = $self->{offsets}[-1];

    # the offsets table has one entry more than the total number of records
    my $extras = $n - $#{$self->{offsets}};

    # Todo : just use $self->{recsep} x $extras here?
    while ($extras-- > 0) {
        $self->_write_record($self->{recsep});
        push @{$self->{offsets}}, int(tell $self->{fh});
    }
}

# Truncate the file at the current position
sub _chop_file {
    my $self = shift;
    truncate $self->{fh}, tell($self->{fh});
}


# compute the size of a buffer suitable for moving
# all the data in a file forward $n bytes
# ($n may be negative)
# The result should be at least $n.
sub _bufsize {
    my $n = shift;
    return 8192 if $n <= 0;
    # round up to the next multiple of 8192
    my $b = $n & ~8191;
    $b += 8192 if $n & 8191;
    $b;
}

################################################################
#
# Miscellaneous public methods
#

# Lock the file
sub flock {
    my ($self, $op) = @_;
    unless (@_ <= 3) {
        my $pack = ref $self;
        croak "Usage: $pack\->flock([OPERATION])";
    }
    my $fh = $self->{fh};
    $op = LOCK_EX unless defined $op;
    my $locked = flock $fh, $op;

    if ($locked && ($op & (LOCK_EX | LOCK_SH))) {
        # If you're locking the file, then presumably it's because
        # there might have been a write access by another process.
        # In that case, the read cache contents and the offsets table
        # might be invalid, so discard them. 20030508
        $self->{offsets} = [0];
        $self->{cache}->empty;
    }

    $locked;
}

# Get/set autochomp option
sub autochomp {
    my $self = shift;
    if (@_) {
        my $old = $self->{autochomp};
        $self->{autochomp} = shift;
        $old;
    } else {
        $self->{autochomp};
    }
}

# Get offset table entries; returns offset of nth record
sub offset {
    my ($self, $n) = @_;

    if ($#{$self->{offsets}} < $n) {
        return if $self->{eof};    # request for record beyond the end of file
        my $o = $self->_fill_offsets_to($n);
        # If it's still undefined, there is no such record, so return 'undef'
        return unless defined $o;
    }

    $self->{offsets}[$n];
}

# Forget all cached offsets (e.g. after external modification of the file).
sub discard_offsets {
    my $self = shift;
    $self->{offsets} = [0];
}

################################################################
#
# Matters related to deferred writing
#

# Defer writes
sub defer {
    my $self = shift;
    $self->_stop_autodeferring;
    @{$self->{ad_history}} = ();
    $self->{defer} = 1;
}

# Flush deferred writes
#
# This could be better optimized to write the file in one pass, instead
# of one pass per block of records.  But that will require modifications
# to _twrite, so I should have a good _twrite test suite first.
# Public flush: write out all deferred records and leave deferred-write
# mode disabled.
sub flush {
    my $self = shift;

    $self->_flush;
    $self->{defer} = 0;
}

# Superseded flush implementation, kept for reference: writes each run
# of consecutive deferred records with a separate _splice call.
sub _old_flush {
    my $self = shift;
    my @writable = sort {$a<=>$b} (keys %{$self->{deferred}});

    while (@writable) {
        # gather all consecutive records from the front of @writable
        my $first_rec = shift @writable;
        my $last_rec = $first_rec+1;
        ++$last_rec, shift @writable while @writable && $last_rec == $writable[0];
        --$last_rec;
        $self->_fill_offsets_to($last_rec);
        $self->_extend_file_to($last_rec);
        $self->_splice($first_rec, $last_rec-$first_rec+1,
                       @{$self->{deferred}}{$first_rec .. $last_rec});
    }

    $self->_discard;    # clear out defered-write-cache
}

# Write out all deferred records in a single _mtwrite pass: gather each
# run of consecutive record numbers into one (data, pos, len) chunk,
# then fix up the offsets table with one _oadjust call.
sub _flush {
    my $self = shift;
    my @writable = sort {$a<=>$b} (keys %{$self->{deferred}});
    my @args;      # (data, position, length) triples for _mtwrite
    my @adjust;    # [pos, nrecs, @recs] items for _oadjust

    while (@writable) {
        # gather all consecutive records from the front of @writable
        my $first_rec = shift @writable;
        my $last_rec = $first_rec+1;
        ++$last_rec, shift @writable while @writable && $last_rec == $writable[0];
        --$last_rec;
        my $end = $self->_fill_offsets_to($last_rec+1);
        if (not defined $end) {
            # run extends past end of file; pad the file with empty records
            $self->_extend_file_to($last_rec);
            $end = $self->{offsets}[$last_rec];
        }
        my ($start) = $self->{offsets}[$first_rec];
        push @args,
            join("", @{$self->{deferred}}{$first_rec .. $last_rec}),    # data
            $start,         # position
            $end-$start;    # length
        push @adjust, [$first_rec,    # starting at this position...
                       $last_rec-$first_rec+1,    # this many records...
                       # are replaced with these...
                       @{$self->{deferred}}{$first_rec .. $last_rec},
                      ];
    }

    $self->_mtwrite(@args);    # write multiple record groups
    $self->_discard;           # clear out defered-write-cache
    $self->_oadjust(@adjust);
}

# Discard deferred writes and disable future deferred writes
sub discard {
    my $self = shift;
    $self->_discard;
    $self->{defer} = 0;
}

# Discard deferred writes, but retain old deferred writing mode
sub _discard {
    my $self = shift;
    %{$self->{deferred}} = ();
    $self->{deferred_s} = 0;
    $self->{deferred_max} = -1;
    $self->{cache}->set_limit($self->{memory});
}

# Deferred writing is enabled, either explicitly ($self->{defer})
# or automatically ($self->{autodeferring})
sub _is_deferring {
    my $self = shift;
    $self->{defer} || $self->{autodeferring};
}

# The largest record number of any deferred record
sub _defer_max {
    my $self = shift;
    return $self->{deferred_max} if defined $self->{deferred_max};
    # deferred_max was invalidated (undef); recompute it from the buffer
    my $max = -1;
    for my $key (keys %{$self->{deferred}}) {
        $max = $key if $key > $max;
    }
    $self->{deferred_max} = $max;
    $max;
}

################################################################
#
# Matters related to autodeferment
#

# Get/set autodefer option
sub autodefer {
    my $self = shift;
    if (@_) {
        my $old = $self->{autodefer};
        $self->{autodefer} = shift;
        if ($old) {
            # turning the feature off (or toggling it) flushes any active
            # autodeferment and resets the history
            $self->_stop_autodeferring;
            @{$self->{ad_history}} = ();
        }
        $old;
    } else {
        $self->{autodefer};
    }
}

# The user is trying to store record #$n Record that in the history,
# and then enable (or disable) autodeferment if that seems useful.
# Note that it's OK for $n to be a non-number, as long as the function
# is prepared to deal with that.  Nobody else looks at the ad_history.
#
# Now, what does the ad_history mean, and what is this function doing?
# Essentially, the idea is to enable autodeferring when we see that the
# user has made three consecutive STORE calls to three consecutive records.
# ("Three" is actually ->{autodefer_threshhold}.)
# A STORE call for record #$n inserts $n into the autodefer history,
# and if the history contains three consecutive records, we enable
# autodeferment.  An ad_history of [X, Y] means that the most recent
# STOREs were for records X, X+1, ..., Y, in that order.
#
# Inserting a nonconsecutive number erases the history and starts over.
#
# Performing a special operation like SPLICE erases the history.
#
# There's one special case: CLEAR means that CLEAR was just called.
# In this case, we prime the history with [-2, -1] so that if the next
# write is for record 0, autodeferring goes on immediately.  This is for
# the common special case of "@a = (...)".
#
sub _annotate_ad_history {
  my ($self, $n) = @_;
  return unless $self->{autodefer}; # feature is disabled
  return if $self->{defer};     # already in explicit defer mode
  # Only bother for files big enough that deferring pays off;
  # offsets[-1] is the largest known byte offset in the file.
  return unless $self->{offsets}[-1] >= $self->{autodefer_filelen_threshhold};

  # Alias the package array @H to this object's history for brevity.
  local *H = $self->{ad_history};
  if ($n eq 'CLEAR') {
    @H = (-2, -1);              # prime the history with fake records
    $self->_stop_autodeferring;
  } elsif ($n =~ /^\d+$/) {
    if (@H == 0) {
      @H = ($n, $n);            # first entry: run of length 1
    } else {                    # @H == 2
      if ($H[1] == $n-1) {      # another consecutive record
        $H[1]++;
        if ($H[1] - $H[0] + 1 >= $self->{autodefer_threshhold}) {
          $self->{autodeferring} = 1;
        }
      } else {                  # nonconsecutive- erase and start over
        @H = ($n, $n);
        $self->_stop_autodeferring;
      }
    }
  } else {                      # SPLICE or STORESIZE or some such
    @H = ();
    $self->_stop_autodeferring;
  }
}

# If autodeferring was enabled, cut it out and discard the history.
# Flushes first so no buffered writes are lost when deferring stops.
sub _stop_autodeferring {
  my $self = shift;
  if ($self->{autodeferring}) {
    $self->_flush;
  }
  $self->{autodeferring} = 0;
}

################################################################


# This is NOT a method.  It is here for two reasons:
#  1. To factor a fairly complicated block out of the constructor
#  2. To provide access for the test suite, which need to be sure
#     files are being written properly.
# Returns the platform's default record separator: $/, except on
# Windows where each \n is expanded to \r\n.
sub _default_recsep {
  my $recsep = $/;
  if ($^O eq 'MSWin32') {       # Dos too?
    # Windows users expect files to be terminated with \r\n
    # But $/ is set to \n instead
    # Note that this also transforms \n\n into \r\n\r\n.
    # That is a feature.
    $recsep =~ s/\n/\r\n/g;
  }
  $recsep;
}

# Utility function for _check_integrity: print a diagnostic line with
# newlines/carriage returns made visible, in TAP-comment form.
sub _ci_warn {
  my $msg = shift;
  $msg =~ s/\n/\\n/g;
  $msg =~ s/\r/\\r/g;
  print "# $msg\n";
}

# Given a file, make sure the cache is consistent with the
# file contents and the internal data structures are consistent with
# each other.  Returns true if everything checks out, false if not
#
# The $file argument is no longer used.  It is retained for compatibility
# with the existing test suite.
sub _check_integrity {
  my ($self, $file, $warn) = @_;
  my $rsl = $self->{recseplen};
  my $rs  = $self->{recsep};
  my $good = 1;                 # set to 0 by any failed check
  local *_;                     # local $_ does not work here
  local $DIAGNOSTIC = 1;

  # recsep / recseplen must agree and be nonempty.
  if (not defined $rs) {
    _ci_warn("recsep is undef!");
    $good = 0;
  } elsif ($rs eq "") {
    _ci_warn("recsep is empty!");
    $good = 0;
  } elsif ($rsl != length $rs) {
    my $ln = length $rs;
    _ci_warn("recsep <$rs> has length $ln, should be $rsl");
    $good = 0;
  }

  # Record 0 always starts at byte offset 0.
  if (not defined $self->{offsets}[0]) {
    _ci_warn("offset 0 is missing!");
    $good = 0;

  } elsif ($self->{offsets}[0] != 0) {
    _ci_warn("rec 0: offset <$self->{offsets}[0]> s/b 0!");
    $good = 0;
  }

  my $cached = 0;               # total bytes of cached data (tallied below)
  {
    # Re-read the whole file and compare against the offset table,
    # the read cache, and the deferbuffer, record by record.
    local *F = $self->{fh};
    seek F, 0, SEEK_SET;
    local $. = 0;               # $. is the current record number
    local $/ = $rs;

    while (<F>) {
      my $n = $. - 1;
      # NOTE: this inner $cached (one record's cached copy) deliberately
      # shadows the outer byte total of the same name.
      my $cached = $self->{cache}->_produce($n);
      my $offset = $self->{offsets}[$.];
      my $ao = tell F;
      if (defined $offset && $offset != $ao) {
        _ci_warn("rec $n: offset <$offset> actual <$ao>");
        $good = 0;
      }
      # A cached record may legitimately differ from the file if a
      # deferred write for it is pending.
      if (defined $cached && $_ ne $cached && ! $self->{deferred}{$n}) {
        $good = 0;
        _ci_warn("rec $n: cached <$cached> actual <$_>");
      }
      if (defined $cached && substr($cached, -$rsl) ne $rs) {
        $good = 0;
        _ci_warn("rec $n in the cache is missing the record separator");
      }
      if (! defined $offset && $self->{eof}) {
        $good = 0;
        _ci_warn("The offset table was marked complete, but it is missing " .
                 "element $.");
      }
    }
    # The offset table may not claim more records than the file holds.
    if (@{$self->{offsets}} > $.+1) {
      $good = 0;
      my $n = @{$self->{offsets}};
      _ci_warn("The offset table has $n items, but the file has only $.");
    }

    # NOTE(review): $deferring is computed but never used in this loop —
    # confirm whether a deferral-aware check was intended here.
    my $deferring = $self->_is_deferring;
    for my $n ($self->{cache}->ckeys) {
      my $r = $self->{cache}->_produce($n);
      $cached += length($r);
      next if $n+1 <= $.;       # checked this already
      _ci_warn("spurious caching of record $n");
      $good = 0;
    }
    # The cache's own byte count must match what we just tallied.
    my $b = $self->{cache}->bytes;
    if ($cached != $b) {
      _ci_warn("cache size is $b, should be $cached");
      $good = 0;
    }
  }

  # That cache has its own set of tests
  $good = 0 unless $self->{cache}->_check_integrity;

  # Now let's check the deferbuffer
  # Unless deferred writing is enabled, it should be empty
  if (! $self->_is_deferring && %{$self->{deferred}}) {
    _ci_warn("deferred writing disabled, but deferbuffer nonempty");
    $good = 0;
  }

  # Any record in the deferbuffer should *not* be present in the readcache
  my $deferred_s = 0;           # recomputed deferbuffer byte total
  while (my ($n, $r) = each %{$self->{deferred}}) {
    $deferred_s += length($r);
    if (defined $self->{cache}->_produce($n)) {
      _ci_warn("record $n is in the deferbuffer *and* the readcache");
      $good = 0;
    }
    if (substr($r, -$rsl) ne $rs) {
      _ci_warn("rec $n in the deferbuffer is missing the record separator");
      $good = 0;
    }
  }

  # Total size of deferbuffer should match internal total
  if ($deferred_s != $self->{deferred_s}) {
    _ci_warn("buffer size is $self->{deferred_s}, should be $deferred_s");
    $good = 0;
  }

  # Total size of deferbuffer should not exceed the specified limit
  if ($deferred_s > $self->{dw_size}) {
    _ci_warn("buffer size is $self->{deferred_s} which exceeds the limit " .
             "of $self->{dw_size}");
    $good = 0;
  }

  # Total size of cached data should not exceed the specified limit
  if ($deferred_s + $cached > $self->{memory}) {
    my $total = $deferred_s + $cached;
    _ci_warn("total stored data size is $total which exceeds the limit " .
             "of $self->{memory}");
    $good = 0;
  }

  # Stuff related to autodeferment
  if (!$self->{autodefer} && @{$self->{ad_history}}) {
    _ci_warn("autodefer is disabled, but ad_history is nonempty");
    $good = 0;
  }
  if ($self->{autodeferring} && $self->{defer}) {
    _ci_warn("both autodeferring and explicit deferring are active");
    $good = 0;
  }
  # The history is either empty or a [first, last] pair of integers
  # with first <= last.
  if (@{$self->{ad_history}} == 0) {
    # That's OK, no additional tests required
  } elsif (@{$self->{ad_history}} == 2) {
    my @non_number = grep !/^-?\d+$/, @{$self->{ad_history}};
    if (@non_number) {
      my $msg;
      { local $" = ')(';
        $msg = "ad_history contains non-numbers (@{$self->{ad_history}})";
      }
      _ci_warn($msg);
      $good = 0;
    } elsif ($self->{ad_history}[1] < $self->{ad_history}[0]) {
      _ci_warn("ad_history has nonsensical values @{$self->{ad_history}}");
      $good = 0;
    }
  } else {
    _ci_warn("ad_history has bad length <@{$self->{ad_history}}>");
    $good = 0;
  }

  $good;
}

################################################################
#
# Tie::File::Cache
#
# Read cache

package Tie::File::Cache;
$Tie::File::Cache::VERSION = $Tie::File::VERSION;
use Carp ':DEFAULT', 'confess';

# Field indices for the cache object (a blessed array).
use constant {
  HEAP => 0,                    # Tie::File::Heap holding the cached data
  HASH => 1,                    # maps record number => heap index
  MAX => 2,                     # byte limit for cached data
  BYTES => 3,                   # current total bytes of cached data
  #STAT => 4,                   # Array with request statistics for each record
  #MISS => 5,                   # Total number of cache misses
  #REQ => 6,                    # Total number of cache requests
};

# Constructor: build an empty cache with byte limit $max.
sub new {
  my ($pack, $max) = @_;
  local *_;
  croak "missing argument to ->new" unless defined $max;
  my $self =
[]; 1460 bless $self => $pack; 1461 @$self = (Tie::File::Heap->new($self), {}, $max, 0); 1462 $self; 1463} 1464 1465sub adj_limit { 1466 my ($self, $n) = @_; 1467 $self->[MAX] += $n; 1468} 1469 1470sub set_limit { 1471 my ($self, $n) = @_; 1472 $self->[MAX] = $n; 1473} 1474 1475# For internal use only 1476# Will be called by the heap structure to notify us that a certain 1477# piece of data has moved from one heap element to another. 1478# $k is the hash key of the item 1479# $n is the new index into the heap at which it is stored 1480# If $n is undefined, the item has been removed from the heap. 1481sub _heap_move { 1482 my ($self, $k, $n) = @_; 1483 if (defined $n) { 1484 $self->[HASH]{$k} = $n; 1485 } else { 1486 delete $self->[HASH]{$k}; 1487 } 1488} 1489 1490sub insert { 1491 my ($self, $key, $val) = @_; 1492 local *_; 1493 croak "missing argument to ->insert" unless defined $key; 1494 unless (defined $self->[MAX]) { 1495 confess "undefined max" ; 1496 } 1497 confess "undefined val" unless defined $val; 1498 return if length($val) > $self->[MAX]; 1499 1500# if ($self->[STAT]) { 1501# $self->[STAT][$key] = 1; 1502# return; 1503# } 1504 1505 my $oldnode = $self->[HASH]{$key}; 1506 if (defined $oldnode) { 1507 my $oldval = $self->[HEAP]->set_val($oldnode, $val); 1508 $self->[BYTES] -= length($oldval); 1509 } else { 1510 $self->[HEAP]->insert($key, $val); 1511 } 1512 $self->[BYTES] += length($val); 1513 $self->flush if $self->[BYTES] > $self->[MAX]; 1514} 1515 1516sub expire { 1517 my $self = shift; 1518 my $old_data = $self->[HEAP]->popheap; 1519 return unless defined $old_data; 1520 $self->[BYTES] -= length $old_data; 1521 $old_data; 1522} 1523 1524sub remove { 1525 my ($self, @keys) = @_; 1526 my @result; 1527 1528# if ($self->[STAT]) { 1529# for my $key (@keys) { 1530# $self->[STAT][$key] = 0; 1531# } 1532# return; 1533# } 1534 1535 for my $key (@keys) { 1536 next unless exists $self->[HASH]{$key}; 1537 my $old_data = 
$self->[HEAP]->remove($self->[HASH]{$key}); 1538 $self->[BYTES] -= length $old_data; 1539 push @result, $old_data; 1540 } 1541 @result; 1542} 1543 1544sub lookup { 1545 my ($self, $key) = @_; 1546 local *_; 1547 croak "missing argument to ->lookup" unless defined $key; 1548 1549# if ($self->[STAT]) { 1550# $self->[MISS]++ if $self->[STAT][$key]++ == 0; 1551# $self->[REQ]++; 1552# my $hit_rate = 1 - $self->[MISS] / $self->[REQ]; 1553# # Do some testing to determine this threshhold 1554# $#$self = STAT - 1 if $hit_rate > 0.20; 1555# } 1556 1557 if (exists $self->[HASH]{$key}) { 1558 $self->[HEAP]->lookup($self->[HASH]{$key}); 1559 } else { 1560 return; 1561 } 1562} 1563 1564# For internal use only 1565sub _produce { 1566 my ($self, $key) = @_; 1567 my $loc = $self->[HASH]{$key}; 1568 return unless defined $loc; 1569 $self->[HEAP][$loc][2]; 1570} 1571 1572# For internal use only 1573sub _promote { 1574 my ($self, $key) = @_; 1575 $self->[HEAP]->promote($self->[HASH]{$key}); 1576} 1577 1578sub empty { 1579 my ($self) = @_; 1580 %{$self->[HASH]} = (); 1581 $self->[BYTES] = 0; 1582 $self->[HEAP]->empty; 1583# @{$self->[STAT]} = (); 1584# $self->[MISS] = 0; 1585# $self->[REQ] = 0; 1586} 1587 1588sub is_empty { 1589 my ($self) = @_; 1590 keys %{$self->[HASH]} == 0; 1591} 1592 1593sub update { 1594 my ($self, $key, $val) = @_; 1595 local *_; 1596 croak "missing argument to ->update" unless defined $key; 1597 if (length($val) > $self->[MAX]) { 1598 my ($oldval) = $self->remove($key); 1599 $self->[BYTES] -= length($oldval) if defined $oldval; 1600 } elsif (exists $self->[HASH]{$key}) { 1601 my $oldval = $self->[HEAP]->set_val($self->[HASH]{$key}, $val); 1602 $self->[BYTES] += length($val); 1603 $self->[BYTES] -= length($oldval) if defined $oldval; 1604 } else { 1605 $self->[HEAP]->insert($key, $val); 1606 $self->[BYTES] += length($val); 1607 } 1608 $self->flush; 1609} 1610 1611sub rekey { 1612 my ($self, $okeys, $nkeys) = @_; 1613 local *_; 1614 my %map; 1615 @map{@$okeys} = 
           @$nkeys;
  croak "missing argument to ->rekey" unless defined $nkeys;
  croak "length mismatch in ->rekey arguments" unless @$nkeys == @$okeys;
  my %adjusted;                 # map new keys to heap indices
  # You should be able to cut this to one loop TODO XXX
  for (0 .. $#$okeys) {
    $adjusted{$nkeys->[$_]} = delete $self->[HASH]{$okeys->[$_]};
  }
  # Re-register every moved record under its new key, in both the
  # heap node and our key-to-index hash.
  while (my ($nk, $ix) = each %adjusted) {
    # @{$self->[HASH]}{keys %adjusted} = values %adjusted;
    $self->[HEAP]->rekey($ix, $nk);
    $self->[HASH]{$nk} = $ix;
  }
}

# Return the list of cached record numbers.
sub ckeys {
  my $self = shift;
  my @a = keys %{$self->[HASH]};
  @a;
}

# Return total amount of cached data
sub bytes {
  my $self = shift;
  $self->[BYTES];
}

# Expire oldest item from cache until cache size is smaller than $max
sub reduce_size_to {
  my ($self, $max) = @_;
  until ($self->[BYTES] <= $max) {
    # Note that Tie::File::Cache::expire has been inlined here
    my $old_data = $self->[HEAP]->popheap;
    return unless defined $old_data;
    $self->[BYTES] -= length $old_data;
  }
}

# Why not just $self->reduce_size_to($self->[MAX])?
# Try this when things stabilize   TODO XXX
# If the cache is too full, expire the oldest records
sub flush {
  my $self = shift;
  $self->reduce_size_to($self->[MAX]) if $self->[BYTES] > $self->[MAX];
}

# For internal use only
# Record keys in LRU-expiration order (oldest first).
sub _produce_lru {
  my $self = shift;
  $self->[HEAP]->expire_order;
}

# Borrow the diagnostic printer from the main package.
BEGIN { *_ci_warn = \&Tie::File::_ci_warn }

# Verify the cache's internal invariants: the heap is well-formed,
# every HASH entry points at a live heap node carrying the same key,
# and BYTES matches the actual total and respects MAX.
sub _check_integrity {          # For CACHE
  my $self = shift;
  my $good = 1;

  # Test HEAP
  $self->[HEAP]->_check_integrity or $good = 0;

  # Test HASH
  my $bytes = 0;                # recomputed total of cached data
  for my $k (keys %{$self->[HASH]}) {
    if ($k ne '0' && $k !~ /^[1-9][0-9]*$/) {
      $good = 0;
      _ci_warn "Cache hash key <$k> is non-numeric";
    }

    my $h = $self->[HASH]{$k};
    if (! defined $h) {
      $good = 0;
      _ci_warn "Heap index number for key $k is undefined";
    } elsif ($h == 0) {
      # Heap slot 0 is the header node, never a data node.
      $good = 0;
      _ci_warn "Heap index number for key $k is zero";
    } else {
      my $j = $self->[HEAP][$h];
      if (! defined $j) {
        $good = 0;
        _ci_warn "Heap contents key $k (=> $h) are undefined";
      } else {
        $bytes += length($j->[2]);   # [2] is the node's DAT slot
        if ($k ne $j->[1]) {         # [1] is the node's KEY slot
          $good = 0;
          _ci_warn "Heap contents key $k (=> $h) is $j->[1], should be $k";
        }
      }
    }
  }

  # Test BYTES
  if ($bytes != $self->[BYTES]) {
    $good = 0;
    _ci_warn "Total data in cache is $bytes, expected $self->[BYTES]";
  }

  # Test MAX
  if ($bytes > $self->[MAX]) {
    $good = 0;
    _ci_warn "Total data in cache is $bytes, exceeds maximum $self->[MAX]";
  }

  return $good;
}

# Break the cache <-> heap reference cycle so both can be collected.
sub delink {
  my $self = shift;
  $self->[HEAP] = undef;        # Bye bye heap
}

################################################################
#
# Tie::File::Heap
#
# Heap data structure for use by cache LRU routines

package Tie::File::Heap;
use Carp ':DEFAULT', 'confess';
$Tie::File::Heap::VERSION = $Tie::File::Cache::VERSION;
# Slot indices within each heap node [SEQ, KEY, DAT].
use constant {
  SEQ => 0,                     # insertion sequence number (LRU age)
  KEY => 1,                     # owning cache's hash key
  DAT => 2,                     # the cached data itself
};

# Constructor.  Element 0 of the heap is a header node holding
# [next sequence number, parent cache object, element count].
sub new {
  my ($pack, $cache) = @_;
  die "$pack: Parent cache object $cache does not support _heap_move method"
    unless eval { $cache->can('_heap_move') };
  my $self = [[0,$cache,0]];
  bless $self => $pack;
}

# Allocate a new sequence number, larger than all previously allocated numbers
sub _nseq {
  my $self = shift;
  $self->[0][0]++;
}

# The parent cache object.
sub _cache {
  my $self = shift;
  $self->[0][1];
}

# Number of elements currently in the heap.
sub _nelts {
  my $self = shift;
  $self->[0][2];
}

# Bump the element count.
sub _nelts_inc {
  my $self = shift;
  ++$self->[0][2];
}

# Decrement the element count.
sub _nelts_dec {
  my $self = shift;
  --$self->[0][2];
}

# True when the heap holds no data nodes.
sub is_empty {
  my $self = shift;
  $self->_nelts == 0;
}

# Discard every data node, keeping only the header.
sub empty {
  my $self = shift;
  $#$self = 0;
  $self->[0][2] = 0;
  $self->[0][0] = 0;            # might as well reset the sequence numbers
}

# notify the parent cache object that we moved something
sub _heap_move {
  my $self = shift;
  $self->_cache->_heap_move(@_);
}

# Insert a piece of data into the heap with the indicated sequence number.
# The item with the smallest sequence number is always at the top.
# If no sequence number is specified, allocate a new one and insert the
# item at the bottom.
sub insert {
  my ($self, $key, $data, $seq) = @_;
  $seq = $self->_nseq unless defined $seq;
  $self->_insert_new([$seq, $key, $data]);
}

# Insert a new, fresh item at the bottom of the heap
sub _insert_new {
  my ($self, $item) = @_;
  my $i = @$self;
  # Walk up toward the root until we find a slot whose parent exists,
  # so the new node is never orphaned.
  $i = int($i/2) until defined $self->[$i/2];
  $self->[$i] = $item;
  # Tell the parent cache where this key now lives.
  $self->[0][1]->_heap_move($self->[$i][KEY], $i);
  $self->_nelts_inc;
}

# Insert [$data, $seq] pair at or below item $i in the heap.
# If $i is omitted, default to 1 (the top element.)
sub _insert {
  my ($self, $item, $i) = @_;
#  $self->_check_loc($i) if defined $i;
  $i = 1 unless defined $i;
  until (! defined $self->[$i]) {
    if ($self->[$i][SEQ] > $item->[SEQ]) { # inserted item is older
      # Swap: the older (smaller-SEQ) item stays at this level.
      ($self->[$i], $item) = ($item, $self->[$i]);
      $self->[0][1]->_heap_move($self->[$i][KEY], $i);
    }
    # If either is undefined, go that way.  Otherwise, choose at random
    my $dir;
    $dir = 0 if !defined $self->[2*$i];
    $dir = 1 if !defined $self->[2*$i+1];
    $dir = int(rand(2)) unless defined $dir;
    $i = 2*$i + $dir;
  }
  $self->[$i] = $item;
  $self->[0][1]->_heap_move($self->[$i][KEY], $i);
  $self->_nelts_inc;
}

# Remove the item at node $i from the heap, moving child items upwards.
# The item with the smallest sequence number is always at the top.
# Moving items upwards maintains this condition.
# Return the removed item.  Return undef if there was no item at node $i.
sub remove {
  my ($self, $i) = @_;
  $i = 1 unless defined $i;
  my $top = $self->[$i];
  return unless defined $top;
  while (1) {
    my $ii;
    my ($L, $R) = (2*$i, 2*$i+1);

    # If either is undefined, go the other way.
    # Otherwise, go towards the smallest.
    last unless defined $self->[$L] || defined $self->[$R];
    $ii = $R if not defined $self->[$L];
    $ii = $L if not defined $self->[$R];
    unless (defined $ii) {
      $ii = $self->[$L][SEQ] < $self->[$R][SEQ] ? $L : $R;
    }

    $self->[$i] = $self->[$ii]; # Promote child to fill vacated spot
    $self->[0][1]->_heap_move($self->[$i][KEY], $i);
    $i = $ii;                   # Fill new vacated spot
  }
  # Tell the cache the key is gone, then vacate the final leaf slot.
  $self->[0][1]->_heap_move($top->[KEY], undef);
  undef $self->[$i];
  $self->_nelts_dec;
  return $top->[DAT];
}

# Remove and return the oldest (top) item's data.
sub popheap {
  my $self = shift;
  $self->remove(1);
}

# set the sequence number of the indicated item to a higher number
# than any other item in the heap, and bubble the item down to the
# bottom.
# Mark node $n most-recently-used: give it a fresh (maximal) sequence
# number and sift it down to the bottom of the heap, keeping the parent
# cache's key-to-index map up to date at every swap.
sub promote {
  my ($self, $n) = @_;
#  $self->_check_loc($n);
  $self->[$n][SEQ] = $self->_nseq;
  my $i = $n;
  while (1) {
    my ($L, $R) = (2*$i, 2*$i+1);
    my $dir;
    last unless defined $self->[$L] || defined $self->[$R];
    $dir = $R unless defined $self->[$L];
    $dir = $L unless defined $self->[$R];
    unless (defined $dir) {
      # Both children exist: descend toward the older (smaller SEQ) one.
      $dir = $self->[$L][SEQ] < $self->[$R][SEQ] ? $L : $R;
    }
    @{$self}[$i, $dir] = @{$self}[$dir, $i];
    for ($i, $dir) {
      $self->[0][1]->_heap_move($self->[$_][KEY], $_) if defined $self->[$_];
    }
    $i = $dir;
  }
}

# Return item $n from the heap, promoting its LRU status
sub lookup {
  my ($self, $n) = @_;
#  $self->_check_loc($n);
  my $val = $self->[$n];
  $self->promote($n);
  $val->[DAT];
}


# Assign a new value for node $n, promoting it to the bottom of the heap.
# Returns the old value.
sub set_val {
  my ($self, $n, $val) = @_;
#  $self->_check_loc($n);
  my $oval = $self->[$n][DAT];
  $self->[$n][DAT] = $val;
  $self->promote($n);
  return $oval;
}

# The hash key has changed for an item;
# alter the heap's record of the hash key
sub rekey {
  my ($self, $n, $new_key) = @_;
#  $self->_check_loc($n);
  $self->[$n][KEY] = $new_key;
}

# Debugging assertion that node $n exists (currently disabled by the
# "1 ||" short-circuit).
sub _check_loc {
  my ($self, $n) = @_;
  unless (1 || defined $self->[$n]) {
    confess "_check_loc($n) failed";
  }
}

# Borrow the diagnostic printer from the main package.
BEGIN { *_ci_warn = \&Tie::File::_ci_warn }

# Verify heap invariants: a valid header node, the heap (min-SEQ)
# condition, no orphaned children, and no duplicate sequence numbers.
sub _check_integrity {
  my $self = shift;
  my $good = 1;
  my %seq;                      # SEQ => node index, for duplicate detection

  unless (eval {$self->[0][1]->isa("Tie::File::Cache")}) {
    _ci_warn "Element 0 of heap corrupt";
    $good = 0;
  }
  $good = 0 unless $self->_satisfies_heap_condition(1);
  for my $i (2 .. $#{$self}) {
    my $p = int($i/2);          # index of parent node
    if (defined $self->[$i] && ! defined $self->[$p]) {
      _ci_warn "Element $i of heap defined, but parent $p isn't";
      $good = 0;
    }

    if (defined $self->[$i]) {
      if ($seq{$self->[$i][SEQ]}) {
        my $seq = $self->[$i][SEQ];
        _ci_warn "Nodes $i and $seq{$seq} both have SEQ=$seq";
        $good = 0;
      } else {
        $seq{$self->[$i][SEQ]} = $i;
      }
    }
  }

  return $good;
}

# Recursively check the heap condition from node $n down: every node's
# sequence number must be strictly smaller than both children's.
sub _satisfies_heap_condition {
  my $self = shift;
  my $n = shift || 1;
  my $good = 1;
  for (0, 1) {
    my $c = $n*2 + $_;
    next unless defined $self->[$c];
    # BUG FIX: compare against the child's SEQ slot, not the child node
    # itself.  $self->[$c] is an array reference, so the old numeric
    # comparison ($self->[$n][SEQ] >= $self->[$c]) compared the sequence
    # number against the reference's address, and heap-condition
    # violations were never reliably detected.
    if ($self->[$n][SEQ] >= $self->[$c][SEQ]) {
      _ci_warn "Node $n of heap does not predate node $c";
      $good = 0 ;
    }
    $good = 0 unless $self->_satisfies_heap_condition($c);
  }
  return $good;
}

# Return a list of all the keys, sorted by expiration order (oldest first)
sub expire_order {
  my $self = shift;
  my @nodes = sort {$a->[SEQ] <=> $b->[SEQ]} $self->_nodes;
  map { $_->[KEY] } @nodes;
}

# Return the list of defined nodes in the subtree rooted at $i
# (defaults to the whole heap).
sub _nodes {
  my $self = shift;
  my $i = shift || 1;
  return unless defined $self->[$i];
  ($self->[$i], $self->_nodes($i*2), $self->_nodes($i*2+1));
}

1;

__END__

=head1 NAME

Tie::File - Access the lines of a disk file via a Perl array

=head1 SYNOPSIS

 use Tie::File;

 tie @array, 'Tie::File', filename or die ...;

 $array[0] = 'blah';      # first line of the file is now 'blah'
                          # (line numbering starts at 0)
 print $array[42];        # display line 43 of the file

 $n_recs = @array;        # how many records are in the file?
2015 $#array -= 2; # chop two records off the end 2016 2017 2018 for (@array) { 2019 s/PERL/Perl/g; # Replace PERL with Perl everywhere in the file 2020 } 2021 2022 # These are just like regular push, pop, unshift, shift, and splice 2023 # Except that they modify the file in the way you would expect 2024 2025 push @array, new recs...; 2026 my $r1 = pop @array; 2027 unshift @array, new recs...; 2028 my $r2 = shift @array; 2029 @old_recs = splice @array, 3, 7, new recs...; 2030 2031 untie @array; # all finished 2032 2033 2034=head1 DESCRIPTION 2035 2036C<Tie::File> represents a regular text file as a Perl array. Each 2037element in the array corresponds to a record in the file. The first 2038line of the file is element 0 of the array; the second line is element 20391, and so on. 2040 2041The file is I<not> loaded into memory, so this will work even for 2042gigantic files. 2043 2044Changes to the array are reflected in the file immediately. 2045 2046Lazy people and beginners may now stop reading the manual. 2047 2048=head2 C<unicode> 2049 2050You can read a unicode (UTF-8) file by providing a file handle opened with 2051the desired encoding. It is not safe to write to one because 2052the length in bytes and in characters is often different, Tie::File 2053will miscalculate the length of writes, overwriting parts of other records. 2054 2055=head2 C<recsep> 2056 2057What is a 'record'? By default, the meaning is the same as for the 2058C<E<lt>...E<gt>> operator: It's a string terminated by C<$/>, which is 2059probably C<"\n">. (Minor exception: on DOS and Win32 systems, a 2060'record' is a string terminated by C<"\r\n">.) You may change the 2061definition of "record" by supplying the C<recsep> option in the C<tie> 2062call: 2063 2064 tie @array, 'Tie::File', $file, recsep => 'es'; 2065 2066This says that records are delimited by the string C<es>. 
If the file 2067contained the following data: 2068 2069 Curse these pesky flies!\n 2070 2071then the C<@array> would appear to have four elements: 2072 2073 "Curse th" 2074 "e p" 2075 "ky fli" 2076 "!\n" 2077 2078An undefined value is not permitted as a record separator. Perl's 2079special "paragraph mode" semantics (E<agrave> la C<$/ = "">) are not 2080emulated. 2081 2082Records read from the tied array do not have the record separator 2083string on the end; this is to allow 2084 2085 $array[17] .= "extra"; 2086 2087to work as expected. 2088 2089(See L<"autochomp">, below.) Records stored into the array will have 2090the record separator string appended before they are written to the 2091file, if they don't have one already. For example, if the record 2092separator string is C<"\n">, then the following two lines do exactly 2093the same thing: 2094 2095 $array[17] = "Cherry pie"; 2096 $array[17] = "Cherry pie\n"; 2097 2098The result is that the contents of line 17 of the file will be 2099replaced with "Cherry pie"; a newline character will separate line 17 2100from line 18. This means that this code will do nothing: 2101 2102 chomp $array[17]; 2103 2104Because the C<chomp>ed value will have the separator reattached when 2105it is written back to the file. There is no way to create a file 2106whose trailing record separator string is missing. 2107 2108Inserting records that I<contain> the record separator string is not 2109supported by this module. It will probably produce a reasonable 2110result, but what this result will be may change in a future version. 2111Use 'splice' to insert records or to replace one record with several. 2112 2113=head2 C<autochomp> 2114 2115Normally, array elements have the record separator removed, so that if 2116the file contains the text 2117 2118 Gold 2119 Frankincense 2120 Myrrh 2121 2122the tied array will appear to contain C<("Gold", "Frankincense", 2123"Myrrh")>. 
If you set C<autochomp> to a false value, the record 2124separator will not be removed. If the file above was tied with 2125 2126 tie @gifts, "Tie::File", $gifts, autochomp => 0; 2127 2128then the array C<@gifts> would appear to contain C<("Gold\n", 2129"Frankincense\n", "Myrrh\n")>, or (on Win32 systems) C<("Gold\r\n", 2130"Frankincense\r\n", "Myrrh\r\n")>. 2131 2132=head2 C<mode> 2133 2134Normally, the specified file will be opened for read and write access, 2135and will be created if it does not exist. (That is, the flags 2136C<O_RDWR | O_CREAT> are supplied in the C<open> call.) If you want to 2137change this, you may supply alternative flags in the C<mode> option. 2138See L<Fcntl> for a listing of available flags. 2139For example: 2140 2141 # open the file if it exists, but fail if it does not exist 2142 use Fcntl 'O_RDWR'; 2143 tie @array, 'Tie::File', $file, mode => O_RDWR; 2144 2145 # create the file if it does not exist 2146 use Fcntl 'O_RDWR', 'O_CREAT'; 2147 tie @array, 'Tie::File', $file, mode => O_RDWR | O_CREAT; 2148 2149 # open an existing file in read-only mode 2150 use Fcntl 'O_RDONLY'; 2151 tie @array, 'Tie::File', $file, mode => O_RDONLY; 2152 2153Opening the data file in write-only or append mode is not supported. 2154 2155=head2 C<memory> 2156 2157This is an upper limit on the amount of memory that C<Tie::File> will 2158consume at any time while managing the file. This is used for two 2159things: managing the I<read cache> and managing the I<deferred write 2160buffer>. 2161 2162Records read in from the file are cached, to avoid having to re-read 2163them repeatedly. If you read the same record twice, the first time it 2164will be stored in memory, and the second time it will be fetched from 2165the I<read cache>. The amount of data in the read cache will not 2166exceed the value you specified for C<memory>. 
If C<Tie::File> wants
to cache a new record, but the read cache is full, it will make room
by expiring the least-recently visited records from the read cache.

The default memory limit is 2MiB.  You can adjust the maximum read
cache size by supplying the C<memory> option.  The argument is the
desired cache size, in bytes.

 # I have a lot of memory, so use a large cache to speed up access
 tie @array, 'Tie::File', $file, memory => 20_000_000;

Setting the memory limit to 0 will inhibit caching; records will be
fetched from disk every time you examine them.

The C<memory> value is not an absolute or exact limit on the memory
used.  C<Tie::File> objects contain some structures besides the read
cache and the deferred write buffer, whose sizes are not charged
against C<memory>.

The cache itself consumes about 310 bytes per cached record, so if
your file has many short records, you may want to decrease the cache
memory limit, or else the cache overhead may exceed the size of the
cached data.


=head2 C<dw_size>

(This is an advanced feature.  Skip this section on first reading.)

If you use deferred writing (See L<"Deferred Writing">, below) then
data you write into the array will not be written directly to the
file; instead, it will be saved in the I<deferred write buffer> to be
written out later.  Data in the deferred write buffer is also charged
against the memory limit you set with the C<memory> option.

You may set the C<dw_size> option to limit the amount of data that can
be saved in the deferred write buffer.  This limit may not exceed the
total memory limit.  For example, if you set C<dw_size> to 1000 and
C<memory> to 2500, that means that no more than 1000 bytes of deferred
writes will be saved up.
The space available for the read cache will 2206vary, but it will always be at least 1500 bytes (if the deferred write 2207buffer is full) and it could grow as large as 2500 bytes (if the 2208deferred write buffer is empty.) 2209 2210If you don't specify a C<dw_size>, it defaults to the entire memory 2211limit. 2212 2213=head2 Option Format 2214 2215C<-mode> is a synonym for C<mode>. C<-recsep> is a synonym for 2216C<recsep>. C<-memory> is a synonym for C<memory>. You get the 2217idea. 2218 2219=head1 Public Methods 2220 2221The C<tie> call returns an object, say C<$o>. You may call 2222 2223 $rec = $o->FETCH($n); 2224 $o->STORE($n, $rec); 2225 2226to fetch or store the record at line C<$n>, respectively; similarly 2227the other tied array methods. (See L<perltie> for details.) You may 2228also call the following methods on this object: 2229 2230=head2 C<flock> 2231 2232 $o->flock(MODE) 2233 2234will lock the tied file. C<MODE> has the same meaning as the second 2235argument to the Perl built-in C<flock> function; for example 2236C<LOCK_SH> or C<LOCK_EX | LOCK_NB>. (These constants are provided by 2237the C<use Fcntl ':flock'> declaration.) 2238 2239C<MODE> is optional; the default is C<LOCK_EX>. 2240 2241C<Tie::File> maintains an internal table of the byte offset of each 2242record it has seen in the file. 2243 2244When you use C<flock> to lock the file, C<Tie::File> assumes that the 2245read cache is no longer trustworthy, because another process might 2246have modified the file since the last time it was read. Therefore, a 2247successful call to C<flock> discards the contents of the read cache 2248and the internal record offset table. 2249 2250C<Tie::File> promises that the following sequence of operations will 2251be safe: 2252 2253 my $o = tie @array, "Tie::File", $filename; 2254 $o->flock; 2255 2256In particular, C<Tie::File> will I<not> read or write the file during 2257the C<tie> call. 
(Exception: Using C<mode =E<gt> O_TRUNC> will, of 2258course, erase the file during the C<tie> call. If you want to do this 2259safely, then open the file without C<O_TRUNC>, lock the file, and use 2260C<@array = ()>.) 2261 2262The best way to unlock a file is to discard the object and untie the 2263array. It is probably unsafe to unlock the file without also untying 2264it, because if you do, changes may remain unwritten inside the object. 2265That is why there is no shortcut for unlocking. If you really want to 2266unlock the file prematurely, you know what to do; if you don't know 2267what to do, then don't do it. 2268 2269All the usual warnings about file locking apply here. In particular, 2270note that file locking in Perl is B<advisory>, which means that 2271holding a lock will not prevent anyone else from reading, writing, or 2272erasing the file; it only prevents them from getting another lock at 2273the same time. Locks are analogous to green traffic lights: If you 2274have a green light, that does not prevent the idiot coming the other 2275way from plowing into you sideways; it merely guarantees to you that 2276the idiot does not also have a green light at the same time. 2277 2278=head2 C<autochomp> 2279 2280 my $old_value = $o->autochomp(0); # disable autochomp option 2281 my $old_value = $o->autochomp(1); # enable autochomp option 2282 2283 my $ac = $o->autochomp(); # recover current value 2284 2285See L<"autochomp">, above. 2286 2287=head2 C<defer>, C<flush>, C<discard>, and C<autodefer> 2288 2289See L<"Deferred Writing">, below. 2290 2291=head2 C<offset> 2292 2293 $off = $o->offset($n); 2294 2295This method returns the byte offset of the start of the C<$n>th record 2296in the file. If there is no such record, it returns an undefined 2297value. 
2298 2299=head1 Tying to an already-opened filehandle 2300 2301If C<$fh> is a filehandle, such as is returned by C<IO::File> or one 2302of the other C<IO> modules, you may use: 2303 2304 tie @array, 'Tie::File', $fh, ...; 2305 2306Similarly if you opened that handle C<FH> with regular C<open> or 2307C<sysopen>, you may use: 2308 2309 tie @array, 'Tie::File', \*FH, ...; 2310 2311Handles that were opened write-only won't work. Handles that were 2312opened read-only will work as long as you don't try to modify the 2313array. Handles must be attached to seekable sources of data---that 2314means no pipes or sockets. If C<Tie::File> can detect that you 2315supplied a non-seekable handle, the C<tie> call will throw an 2316exception. (On Unix systems, it can detect this.) 2317 2318Note that Tie::File will only close any filehandles that it opened 2319internally. If you passed it a filehandle as above, you "own" the 2320filehandle, and are responsible for closing it after you have untied 2321the @array. 2322 2323Tie::File calls C<binmode> on filehandles that it opens internally, 2324but not on filehandles passed in by the user. For consistency, 2325especially if using the tied files cross-platform, you may wish to 2326call C<binmode> on the filehandle prior to tying the file. 2327 2328=head1 Deferred Writing 2329 2330(This is an advanced feature. Skip this section on first reading.) 2331 2332Normally, modifying a C<Tie::File> array writes to the underlying file 2333immediately. Every assignment like C<$a[3] = ...> rewrites as much of 2334the file as is necessary; typically, everything from line 3 through 2335the end will need to be rewritten. This is the simplest and most 2336transparent behavior. Performance even for large files is reasonably 2337good. 2338 2339However, under some circumstances, this behavior may be excessively 2340slow. 
For example, suppose you have a million-record file, and you 2341want to do: 2342 2343 for (@FILE) { 2344 $_ = "> $_"; 2345 } 2346 2347The first time through the loop, you will rewrite the entire file, 2348from line 0 through the end. The second time through the loop, you 2349will rewrite the entire file from line 1 through the end. The third 2350time through the loop, you will rewrite the entire file from line 2 to 2351the end. And so on. 2352 2353If the performance in such cases is unacceptable, you may defer the 2354actual writing, and then have it done all at once. The following loop 2355will perform much better for large files: 2356 2357 (tied @a)->defer; 2358 for (@a) { 2359 $_ = "> $_"; 2360 } 2361 (tied @a)->flush; 2362 2363If C<Tie::File>'s memory limit is large enough, all the writing will be 2364done in memory. Then, when you call C<-E<gt>flush>, the entire file 2365will be rewritten in a single pass. 2366 2367(Actually, the preceding discussion is something of a fib. You don't 2368need to enable deferred writing to get good performance for this 2369common case, because C<Tie::File> will do it for you automatically 2370unless you specifically tell it not to. See L</Autodeferring>, 2371below.) 2372 2373Calling C<-E<gt>flush> returns the array to immediate-write mode. If 2374you wish to discard the deferred writes, you may call C<-E<gt>discard> 2375instead of C<-E<gt>flush>. Note that in some cases, some of the data 2376will have been written already, and it will be too late for 2377C<-E<gt>discard> to discard all the changes. Support for 2378C<-E<gt>discard> may be withdrawn in a future version of C<Tie::File>. 2379 2380Deferred writes are cached in memory up to the limit specified by the 2381C<dw_size> option (see above). If the deferred-write buffer is full 2382and you try to write still more deferred data, the buffer will be 2383flushed. 
All buffered data will be written immediately, the buffer 2384will be emptied, and the now-empty space will be used for future 2385deferred writes. 2386 2387If the deferred-write buffer isn't yet full, but the total size of the 2388buffer and the read cache would exceed the C<memory> limit, the oldest 2389records will be expired from the read cache until the total size is 2390under the limit. 2391 2392C<push>, C<pop>, C<shift>, C<unshift>, and C<splice> cannot be 2393deferred. When you perform one of these operations, any deferred data 2394is written to the file and the operation is performed immediately. 2395This may change in a future version. 2396 2397If you resize the array with deferred writing enabled, the file will 2398be resized immediately, but deferred records will not be written. 2399This has a surprising consequence: C<@a = (...)> erases the file 2400immediately, but the writing of the actual data is deferred. This 2401might be a bug. If it is a bug, it will be fixed in a future version. 2402 2403=head2 Autodeferring 2404 2405C<Tie::File> tries to guess when deferred writing might be helpful, 2406and to turn it on and off automatically. 2407 2408 for (@a) { 2409 $_ = "> $_"; 2410 } 2411 2412In this example, only the first two assignments will be done 2413immediately; after this, all the changes to the file will be deferred 2414up to the user-specified memory limit. 2415 2416You should usually be able to ignore this and just use the module 2417without thinking about deferring. However, special applications may 2418require fine control over which writes are deferred, or may require 2419that all writes be immediate. To disable the autodeferment feature, 2420use 2421 2422 (tied @o)->autodefer(0); 2423 2424or 2425 2426 tie @array, 'Tie::File', $file, autodefer => 0; 2427 2428 2429Similarly, C<-E<gt>autodefer(1)> re-enables autodeferment, and 2430C<-E<gt>autodefer()> recovers the current value of the autodefer setting. 
2431 2432 2433=head1 CONCURRENT ACCESS TO FILES 2434 2435Caching and deferred writing are inappropriate if you want the same 2436file to be accessed simultaneously from more than one process. Other 2437optimizations performed internally by this module are also 2438incompatible with concurrent access. A future version of this module will 2439support a C<concurrent =E<gt> 1> option that enables safe concurrent access. 2440 2441Previous versions of this documentation suggested using C<memory 2442=E<gt> 0> for safe concurrent access. This was mistaken. Tie::File 2443will not support safe concurrent access before version 0.96. 2444 2445=head1 CAVEATS 2446 2447(That's Latin for 'warnings'.) 2448 2449=over 4 2450 2451=item * 2452 2453Reasonable effort was made to make this module efficient. Nevertheless, 2454changing the size of a record in the middle of a large file will 2455always be fairly slow, because everything after the new record must be 2456moved. 2457 2458=item * 2459 2460The behavior of tied arrays is not precisely the same as for regular 2461arrays. For example: 2462 2463 # This DOES print "How unusual!" 2464 undef $a[10]; print "How unusual!\n" if defined $a[10]; 2465 2466C<undef>-ing a C<Tie::File> array element just blanks out the 2467corresponding record in the file. When you read it back again, you'll 2468get the empty string, so the supposedly-C<undef>'ed value will be 2469defined. Similarly, if you have C<autochomp> disabled, then 2470 2471 # This DOES print "How unusual!" if 'autochomp' is disabled 2472 undef $a[10]; 2473 print "How unusual!\n" if $a[10]; 2474 2475Because when C<autochomp> is disabled, C<$a[10]> will read back as 2476C<"\n"> (or whatever the record separator string is.) 2477 2478There are other minor differences, particularly regarding C<exists> 2479and C<delete>, but in general, the correspondence is extremely close. 
2480 2481=item * 2482 2483I have supposed that since this module is concerned with file I/O, 2484almost all normal use of it will be heavily I/O bound. This means 2485that the time to maintain complicated data structures inside the 2486module will be dominated by the time to actually perform the I/O. 2487When there was an opportunity to spend CPU time to avoid doing I/O, I 2488usually tried to take it. 2489 2490=item * 2491 2492You might be tempted to think that deferred writing is like 2493transactions, with C<flush> as C<commit> and C<discard> as 2494C<rollback>, but it isn't, so don't. 2495 2496=item * 2497 2498There is a large memory overhead for each record offset and for each 2499cache entry: about 310 bytes per cached data record, and about 21 bytes 2500per offset table entry. 2501 2502The per-record overhead will limit the maximum number of records you 2503can access per file. Note that I<accessing> the length of the array 2504via C<$x = scalar @tied_file> accesses B<all> records and stores their 2505offsets. The same for C<foreach (@tied_file)>, even if you exit the 2506loop early. 2507 2508=back 2509 2510=head1 SUBCLASSING 2511 2512This version promises absolutely nothing about the internals, which 2513may change without notice. A future version of the module will have a 2514well-defined and stable subclassing API. 2515 2516=head1 WHAT ABOUT C<DB_File>? 2517 2518People sometimes point out that L<DB_File> will do something similar, 2519and ask why the C<Tie::File> module is necessary. 2520 2521There are a number of reasons that you might prefer C<Tie::File>. 2522A list is available at C<L<http://perl.plover.com/TieFile/why-not-DB_File>>. 2523 2524=head1 AUTHOR 2525 2526Mark Jason Dominus 2527 2528To contact the author, send email to: C<mjd-perl-tiefile+@plover.com> 2529 2530To receive an announcement whenever a new version of this module is 2531released, send a blank email message to 2532C<mjd-perl-tiefile-subscribe@plover.com>. 
2533 2534The most recent version of this module, including documentation and 2535any news of importance, will be available at 2536 2537 http://perl.plover.com/TieFile/ 2538 2539 2540=head1 LICENSE 2541 2542C<Tie::File> version 1.09 is copyright (C) 2003 Mark Jason Dominus. 2543 2544This library is free software; you may redistribute it and/or modify 2545it under the same terms as Perl itself. 2546 2547These terms are your choice of any of (1) the Perl Artistic Licence, 2548or (2) version 2 of the GNU General Public License as published by the 2549Free Software Foundation, or (3) any later version of the GNU General 2550Public License. 2551 2552This library is distributed in the hope that it will be useful, 2553but WITHOUT ANY WARRANTY; without even the implied warranty of 2554MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the 2555GNU General Public License for more details. 2556 2557You should have received a copy of the GNU General Public License 2558along with this library program; it should be in the file C<COPYING>. 2559If not, write to the Free Software Foundation, Inc., 51 Franklin Street, 2560Fifth Floor, Boston, MA 02110-1301, USA 2561 2562For licensing inquiries, contact the author at: 2563 2564 Mark Jason Dominus 2565 255 S. Warnock St. 2566 Philadelphia, PA 19107 2567 2568=head1 WARRANTY 2569 2570C<Tie::File> version 1.09 comes with ABSOLUTELY NO WARRANTY. 2571For details, see the license. 2572 2573=head1 THANKS 2574 2575Gigantic thanks to Jarkko Hietaniemi, for agreeing to put this in the 2576core when I hadn't written it yet, and for generally being helpful, 2577supportive, and competent. (Usually the rule is "choose any one.") 2578Also big thanks to Abhijit Menon-Sen for all of the same things. 
2579 2580Special thanks to Craig Berry and Peter Prymmer (for VMS portability 2581help), Randy Kobes (for Win32 portability help), Clinton Pierce and 2582Autrijus Tang (for heroic eleventh-hour Win32 testing above and beyond 2583the call of duty), Michael G Schwern (for testing advice), and the 2584rest of the CPAN testers (for testing generally). 2585 2586Special thanks to Tels for suggesting several speed and memory 2587optimizations. 2588 2589Additional thanks to: 2590Edward Avis / 2591Mattia Barbon / 2592Tom Christiansen / 2593Gerrit Haase / 2594Gurusamy Sarathy / 2595Jarkko Hietaniemi (again) / 2596Nikola Knezevic / 2597John Kominetz / 2598Nick Ing-Simmons / 2599Tassilo von Parseval / 2600H. Dieter Pearcey / 2601Slaven Rezic / 2602Eric Roode / 2603Peter Scott / 2604Peter Somu / 2605Autrijus Tang (again) / 2606Tels (again) / 2607Juerd Waalboer / 2608Todd Rinaldo 2609 2610=head1 TODO 2611 2612More tests. (Stuff I didn't think of yet.) 2613 2614Paragraph mode? 2615 2616Fixed-length mode. Leave-blanks mode. 2617 2618Maybe an autolocking mode? 2619 2620For many common uses of the module, the read cache is a liability. 2621For example, a program that inserts a single record, or that scans the 2622file once, will have a cache hit rate of zero. This suggests a major 2623optimization: The cache should be initially disabled. Here's a hybrid 2624approach: Initially, the cache is disabled, but the cache code 2625maintains statistics about how high the hit rate would be *if* it were 2626enabled. When it sees the hit rate get high enough, it enables 2627itself. The STAT comments in this code are the beginning of an 2628implementation of this. 2629 2630Record locking with fcntl()? Then the module might support an undo 2631log and get real transactions. What a tour de force that would be. 2632 2633Keeping track of the highest cached record. 
This would allow reads-in-a-row 2634to skip the cache lookup faster (if reading from 1..N with empty cache at 2635start, the last cached value will be always N-1). 2636 2637More tests. 2638 2639=cut 2640 2641