1package Tie::File;
2
3use strict;
4use warnings;
5
6use Carp ':DEFAULT', 'confess';
7use POSIX 'SEEK_SET';
8use Fcntl 'O_CREAT', 'O_RDWR', 'LOCK_EX', 'LOCK_SH', 'O_WRONLY', 'O_RDONLY';
9use constant O_ACCMODE => O_RDONLY | O_RDWR | O_WRONLY;
10
11
12our $VERSION = "1.09";
13my $DEFAULT_MEMORY_SIZE = 1<<21;    # 2 megabytes
14my $DEFAULT_AUTODEFER_THRESHHOLD = 3; # 3 records
15my $DEFAULT_AUTODEFER_FILELEN_THRESHHOLD = 65536; # 16 disk blocksful
16
17my %good_opt = map {$_ => 1, "-$_" => 1}
18                 qw(memory dw_size mode recsep discipline
19                    autodefer autochomp autodefer_threshhold concurrent);
20
21our $DIAGNOSTIC = 0;
22our @OFF; # used as a temporary alias in some subroutines.
23our @H; # used as a temporary alias in _annotate_ad_history
24
25sub TIEARRAY {
26  if (@_ % 2 != 0) {
27    croak "usage: tie \@array, $_[0], filename, [option => value]...";
28  }
29  my ($pack, $file, %opts) = @_;
30
31  # transform '-foo' keys into 'foo' keys
32  for my $key (keys %opts) {
33    unless ($good_opt{$key}) {
34      croak("$pack: Unrecognized option '$key'\n");
35    }
36    my $okey = $key;
37    if ($key =~ s/^-+//) {
38      $opts{$key} = delete $opts{$okey};
39    }
40  }
41
42  if ($opts{concurrent}) {
43    croak("$pack: concurrent access not supported yet\n");
44  }
45
46  unless (defined $opts{memory}) {
47    # default is the larger of the default cache size and the
48    # deferred-write buffer size (if specified)
49    $opts{memory} = $DEFAULT_MEMORY_SIZE;
50    $opts{memory} = $opts{dw_size}
51      if defined $opts{dw_size} && $opts{dw_size} > $DEFAULT_MEMORY_SIZE;
52    # Dora Winifred Read
53  }
54  $opts{dw_size} = $opts{memory} unless defined $opts{dw_size};
55  if ($opts{dw_size} > $opts{memory}) {
56      croak("$pack: dw_size may not be larger than total memory allocation\n");
57  }
58  # are we in deferred-write mode?
59  $opts{defer} = 0 unless defined $opts{defer};
60  $opts{deferred} = {};         # no records are presently deferred
61  $opts{deferred_s} = 0;        # count of total bytes in ->{deferred}
62  $opts{deferred_max} = -1;     # empty
63
64  # What's a good way to arrange that this class can be overridden?
65  $opts{cache} = Tie::File::Cache->new($opts{memory});
66
67  # autodeferment is enabled by default
68  $opts{autodefer} = 1 unless defined $opts{autodefer};
69  $opts{autodeferring} = 0;     # but is not initially active
70  $opts{ad_history} = [];
71  $opts{autodefer_threshhold} = $DEFAULT_AUTODEFER_THRESHHOLD
72    unless defined $opts{autodefer_threshhold};
73  $opts{autodefer_filelen_threshhold} = $DEFAULT_AUTODEFER_FILELEN_THRESHHOLD
74    unless defined $opts{autodefer_filelen_threshhold};
75
76  $opts{offsets} = [0];
77  $opts{filename} = $file;
78  unless (defined $opts{recsep}) {
79    $opts{recsep} = _default_recsep();
80  }
81  $opts{recseplen} = length($opts{recsep});
82  if ($opts{recseplen} == 0) {
83    croak "Empty record separator not supported by $pack";
84  }
85
86  $opts{autochomp} = 1 unless defined $opts{autochomp};
87
88  $opts{mode} = O_CREAT|O_RDWR unless defined $opts{mode};
89  $opts{rdonly} = (($opts{mode} & O_ACCMODE) == O_RDONLY);
90  $opts{sawlastrec} = undef;
91
92  my $fh;
93
94  if (UNIVERSAL::isa($file, 'GLOB')) {
95    # We use 1 here on the theory that some systems
96    # may not indicate failure if we use 0.
97    # MSWin32 does not indicate failure with 0, but I don't know if
98    # it will indicate failure with 1 or not.
99    unless (seek $file, 1, SEEK_SET) {
100      croak "$pack: your filehandle does not appear to be seekable";
101    }
102    seek $file, 0, SEEK_SET;    # put it back
103    $fh = $file;                # setting binmode is the user's problem
104  } elsif (ref $file) {
105    croak "usage: tie \@array, $pack, filename, [option => value]...";
106  } else {
107    sysopen $fh, $file, $opts{mode}, 0666 or return;
108    binmode $fh;
109    ++$opts{ourfh};
110  }
111  { my $ofh = select $fh; $| = 1; select $ofh } # autoflush on write
112  if (defined $opts{discipline}) {
113    eval { binmode($fh, $opts{discipline}) };
114    croak $@ if $@ =~ /Unknown discipline|IO layers .* unavailable/;
115    die if $@;
116  }
117  $opts{fh} = $fh;
118
119  bless \%opts => $pack;
120}
121
122sub FETCH {
123  my ($self, $n) = @_;
124  my $rec;
125
126  # check the defer buffer
127  $rec = $self->{deferred}{$n} if exists $self->{deferred}{$n};
128  $rec = $self->_fetch($n) unless defined $rec;
129
130  # inlined _chomp1
131  substr($rec, - $self->{recseplen}) = ""
132    if defined $rec && $self->{autochomp};
133  $rec;
134}
135
136# Chomp many records in-place; return nothing useful
137sub _chomp {
138  my $self = shift;
139  return unless $self->{autochomp};
140  if ($self->{autochomp}) {
141    for (@_) {
142      next unless defined;
143      substr($_, - $self->{recseplen}) = "";
144    }
145  }
146}
147
148# Chomp one record in-place; return modified record
149sub _chomp1 {
150  my ($self, $rec) = @_;
151  return $rec unless $self->{autochomp};
152  return unless defined $rec;
153  substr($rec, - $self->{recseplen}) = "";
154  $rec;
155}
156
157sub _fetch {
158  my ($self, $n) = @_;
159
160  # check the record cache
161  { my $cached = $self->{cache}->lookup($n);
162    return $cached if defined $cached;
163  }
164
165  if ($#{$self->{offsets}} < $n) {
166    return if $self->{eof};  # request for record beyond end of file
167    my $o = $self->_fill_offsets_to($n);
168    # If it's still undefined, there is no such record, so return 'undef'
169    return unless defined $o;
170  }
171
172  my $fh = $self->{FH};
173  $self->_seek($n);             # we can do this now that offsets is populated
174  my $rec = $self->_read_record;
175
176# If we happen to have just read the first record, check to see if
177# the length of the record matches what 'tell' says.  If not, Tie::File
178# won't work, and should drop dead.
179#
180#  if ($n == 0 && defined($rec) && tell($self->{fh}) != length($rec)) {
181#    if (defined $self->{discipline}) {
182#      croak "I/O discipline $self->{discipline} not supported";
183#    } else {
184#      croak "File encoding not supported";
185#    }
186#  }
187
188  $self->{cache}->insert($n, $rec) if defined $rec && not $self->{flushing};
189  $rec;
190}
191
192sub STORE {
193  my ($self, $n, $rec) = @_;
194  die "STORE called from _check_integrity!" if $DIAGNOSTIC;
195
196  $self->_fixrecs($rec);
197
198  if ($self->{autodefer}) {
199    $self->_annotate_ad_history($n);
200  }
201
202  return $self->_store_deferred($n, $rec) if $self->_is_deferring;
203
204
205  # We need this to decide whether the new record will fit
206  # It incidentally populates the offsets table
207  # Note we have to do this before we alter the cache
208  # 20020324 Wait, but this DOES alter the cache.  TODO BUG?
209  my $oldrec = $self->_fetch($n);
210
211  if (not defined $oldrec) {
212    # We're storing a record beyond the end of the file
213    $self->_extend_file_to($n+1);
214    $oldrec = $self->{recsep};
215  }
216#  return if $oldrec eq $rec;    # don't bother
217  my $len_diff = length($rec) - length($oldrec);
218
219  # length($oldrec) here is not consistent with text mode  TODO XXX BUG
220  $self->_mtwrite($rec, $self->{offsets}[$n], length($oldrec));
221  $self->_oadjust([$n, 1, $rec]);
222  $self->{cache}->update($n, $rec);
223}
224
225sub _store_deferred {
226  my ($self, $n, $rec) = @_;
227  $self->{cache}->remove($n);
228  my $old_deferred = $self->{deferred}{$n};
229
230  if (defined $self->{deferred_max} && $n > $self->{deferred_max}) {
231    $self->{deferred_max} = $n;
232  }
233  $self->{deferred}{$n} = $rec;
234
235  my $len_diff = length($rec);
236  $len_diff -= length($old_deferred) if defined $old_deferred;
237  $self->{deferred_s} += $len_diff;
238  $self->{cache}->adj_limit(-$len_diff);
239  if ($self->{deferred_s} > $self->{dw_size}) {
240    $self->_flush;
241  } elsif ($self->_cache_too_full) {
242    $self->_cache_flush;
243  }
244}
245
246# Remove a single record from the deferred-write buffer without writing it
247# The record need not be present
248sub _delete_deferred {
249  my ($self, $n) = @_;
250  my $rec = delete $self->{deferred}{$n};
251  return unless defined $rec;
252
253  if (defined $self->{deferred_max}
254      && $n == $self->{deferred_max}) {
255    undef $self->{deferred_max};
256  }
257
258  $self->{deferred_s} -= length $rec;
259  $self->{cache}->adj_limit(length $rec);
260}
261
262sub FETCHSIZE {
263  my $self = shift;
264  my $n = $self->{eof} ? $#{$self->{offsets}} : $self->_fill_offsets;
265
266  my $top_deferred = $self->_defer_max;
267  $n = $top_deferred+1 if defined $top_deferred && $n < $top_deferred+1;
268  $n;
269}
270
271sub STORESIZE {
272  my ($self, $len) = @_;
273
274  if ($self->{autodefer}) {
275    $self->_annotate_ad_history('STORESIZE');
276  }
277
278  my $olen = $self->FETCHSIZE;
279  return if $len == $olen;      # Woo-hoo!
280
281  # file gets longer
282  if ($len > $olen) {
283    if ($self->_is_deferring) {
284      for ($olen .. $len-1) {
285        $self->_store_deferred($_, $self->{recsep});
286      }
287    } else {
288      $self->_extend_file_to($len);
289    }
290    return;
291  }
292
293  # file gets shorter
294  if ($self->_is_deferring) {
295    # TODO maybe replace this with map-plus-assignment?
296    for (grep $_ >= $len, keys %{$self->{deferred}}) {
297      $self->_delete_deferred($_);
298    }
299    $self->{deferred_max} = $len-1;
300  }
301
302  $self->_seek($len);
303  $self->_chop_file;
304  $#{$self->{offsets}} = $len;
305#  $self->{offsets}[0] = 0;      # in case we just chopped this
306
307  $self->{cache}->remove(grep $_ >= $len, $self->{cache}->ckeys);
308}
309
310### OPTIMIZE ME
311### It should not be necessary to do FETCHSIZE
312### Just seek to the end of the file.
313sub PUSH {
314  my $self = shift;
315  $self->SPLICE($self->FETCHSIZE, scalar(@_), @_);
316
317  # No need to return:
318  #  $self->FETCHSIZE;  # because av.c takes care of this for me
319}
320
321sub POP {
322  my $self = shift;
323  my $size = $self->FETCHSIZE;
324  return if $size == 0;
325#  print STDERR "# POPPITY POP POP POP\n";
326  scalar $self->SPLICE($size-1, 1);
327}
328
329sub SHIFT {
330  my $self = shift;
331  scalar $self->SPLICE(0, 1);
332}
333
334sub UNSHIFT {
335  my $self = shift;
336  $self->SPLICE(0, 0, @_);
337  # $self->FETCHSIZE; # av.c takes care of this for me
338}
339
340sub CLEAR {
341  my $self = shift;
342
343  if ($self->{autodefer}) {
344    $self->_annotate_ad_history('CLEAR');
345  }
346
347  $self->_seekb(0);
348  $self->_chop_file;
349    $self->{cache}->set_limit($self->{memory});
350    $self->{cache}->empty;
351  @{$self->{offsets}} = (0);
352  %{$self->{deferred}}= ();
353    $self->{deferred_s} = 0;
354    $self->{deferred_max} = -1;
355}
356
357sub EXTEND {
358  my ($self, $n) = @_;
359
360  # No need to pre-extend anything in this case
361  return if $self->_is_deferring;
362
363  $self->_fill_offsets_to($n);
364  $self->_extend_file_to($n);
365}
366
367sub DELETE {
368  my ($self, $n) = @_;
369
370  if ($self->{autodefer}) {
371    $self->_annotate_ad_history('DELETE');
372  }
373
374  my $lastrec = $self->FETCHSIZE-1;
375  my $rec = $self->FETCH($n);
376  $self->_delete_deferred($n) if $self->_is_deferring;
377  if ($n == $lastrec) {
378    $self->_seek($n);
379    $self->_chop_file;
380    $#{$self->{offsets}}--;
381    $self->{cache}->remove($n);
382    # perhaps in this case I should also remove trailing null records?
383    # 20020316
384    # Note that delete @a[-3..-1] deletes the records in the wrong order,
385    # so we only chop the very last one out of the file.  We could repair this
386    # by tracking deleted records inside the object.
387  } elsif ($n < $lastrec) {
388    $self->STORE($n, "");
389  }
390  $rec;
391}
392
393sub EXISTS {
394  my ($self, $n) = @_;
395  return 1 if exists $self->{deferred}{$n};
396  $n < $self->FETCHSIZE;
397}
398
399sub SPLICE {
400  my $self = shift;
401
402  if ($self->{autodefer}) {
403    $self->_annotate_ad_history('SPLICE');
404  }
405
406  $self->_flush if $self->_is_deferring; # move this up?
407  if (wantarray) {
408    $self->_chomp(my @a = $self->_splice(@_));
409    @a;
410  } else {
411    $self->_chomp1(scalar $self->_splice(@_));
412  }
413}
414
415sub DESTROY {
416  my $self = shift;
417  $self->flush if $self->_is_deferring;
418  $self->{cache}->delink if defined $self->{cache}; # break circular link
419  if ($self->{fh} and $self->{ourfh}) {
420      delete $self->{ourfh};
421      close delete $self->{fh};
422  }
423}
424
425sub _splice {
426  my ($self, $pos, $nrecs, @data) = @_;
427  my @result;
428
429  $pos = 0 unless defined $pos;
430
431  # Deal with negative and other out-of-range positions
432  # Also set default for $nrecs
433  {
434    my $oldsize = $self->FETCHSIZE;
435    $nrecs = $oldsize unless defined $nrecs;
436    my $oldpos = $pos;
437
438    if ($pos < 0) {
439      $pos += $oldsize;
440      if ($pos < 0) {
441        croak "Modification of non-creatable array value attempted, " .
442              "subscript $oldpos";
443      }
444    }
445
446    if ($pos > $oldsize) {
447      return unless @data;
448      $pos = $oldsize;          # This is what perl does for normal arrays
449    }
450
451    # The manual is very unclear here
452    if ($nrecs < 0) {
453      $nrecs = $oldsize - $pos + $nrecs;
454      $nrecs = 0 if $nrecs < 0;
455    }
456
457    # nrecs is too big---it really means "until the end"
458    # 20030507
459    if ($nrecs + $pos > $oldsize) {
460      $nrecs = $oldsize - $pos;
461    }
462  }
463
464  $self->_fixrecs(@data);
465  my $data = join '', @data;
466  my $datalen = length $data;
467  my $oldlen = 0;
468
469  # compute length of data being removed
470  for ($pos .. $pos+$nrecs-1) {
471    last unless defined $self->_fill_offsets_to($_);
472    my $rec = $self->_fetch($_);
473    last unless defined $rec;
474    push @result, $rec;
475
476    # Why don't we just use length($rec) here?
477    # Because that record might have come from the cache.  _splice
478    # might have been called to flush out the deferred-write records,
479    # and in this case length($rec) is the length of the record to be
480    # *written*, not the length of the actual record in the file.  But
481    # the offsets are still true. 20020322
482    $oldlen += $self->{offsets}[$_+1] - $self->{offsets}[$_]
483      if defined $self->{offsets}[$_+1];
484  }
485  $self->_fill_offsets_to($pos+$nrecs);
486
487  # Modify the file
488  $self->_mtwrite($data, $self->{offsets}[$pos], $oldlen);
489  # Adjust the offsets table
490  $self->_oadjust([$pos, $nrecs, @data]);
491
492  { # Take this read cache stuff out into a separate function
493    # You made a half-attempt to put it into _oadjust.
494    # Finish something like that up eventually.
495    # STORE also needs to do something similarish
496
497    # update the read cache, part 1
498    # modified records
499    for ($pos .. $pos+$nrecs-1) {
500      my $new = $data[$_-$pos];
501      if (defined $new) {
502        $self->{cache}->update($_, $new);
503      } else {
504        $self->{cache}->remove($_);
505      }
506    }
507
508    # update the read cache, part 2
509    # moved records - records past the site of the change
510    # need to be renumbered
511    # Maybe merge this with the previous block?
512    {
513      my @oldkeys = grep $_ >= $pos + $nrecs, $self->{cache}->ckeys;
514      my @newkeys = map $_-$nrecs+@data, @oldkeys;
515      $self->{cache}->rekey(\@oldkeys, \@newkeys);
516    }
517
518    # Now there might be too much data in the cache, if we spliced out
519    # some short records and spliced in some long ones.  If so, flush
520    # the cache.
521    $self->_cache_flush;
522  }
523
524  # Yes, the return value of 'splice' *is* actually this complicated
525  wantarray ? @result : @result ? $result[-1] : undef;
526}
527
528
529# write data into the file
530# $data is the data to be written.
531# it should be written at position $pos, and should overwrite
532# exactly $len of the following bytes.
533# Note that if length($data) > $len, the subsequent bytes will have to
534# be moved up, and if length($data) < $len, they will have to
535# be moved down
536sub _twrite {
537  my ($self, $data, $pos, $len) = @_;
538
539  unless (defined $pos) {
540    die "\$pos was undefined in _twrite";
541  }
542
543  my $len_diff = length($data) - $len;
544
545  if ($len_diff == 0) {          # Woo-hoo!
546    my $fh = $self->{fh};
547    $self->_seekb($pos);
548    $self->_write_record($data);
549    return;                     # well, that was easy.
550  }
551
552  # the two records are of different lengths
553  # our strategy here: rewrite the tail of the file,
554  # reading ahead one buffer at a time
555  # $bufsize is required to be at least as large as the data we're overwriting
556  my $bufsize = _bufsize($len_diff);
557  my ($writepos, $readpos) = ($pos, $pos+$len);
558  my $next_block;
559  my $more_data;
560
561  # Seems like there ought to be a way to avoid the repeated code
562  # and the special case here.  The read(1) is also a little weird.
563  # Think about this.
564  do {
565    $self->_seekb($readpos);
566    my $br = read $self->{fh}, $next_block, $bufsize;
567    $more_data = read $self->{fh}, my($dummy), 1;
568    $self->_seekb($writepos);
569    $self->_write_record($data);
570    $readpos += $br;
571    $writepos += length $data;
572    $data = $next_block;
573  } while $more_data;
574  $self->_seekb($writepos);
575  $self->_write_record($next_block);
576
577  # There might be leftover data at the end of the file
578  $self->_chop_file if $len_diff < 0;
579}
580
581# _iwrite(D, S, E)
582# Insert text D at position S.
583# Let C = E-S-|D|.  If C < 0; die.
584# Data in [S,S+C) is copied to [S+D,S+D+C) = [S+D,E).
585# Data in [S+C = E-D, E) is returned.  Data in [E, oo) is untouched.
586#
587# In a later version, don't read the entire intervening area into
588# memory at once; do the copying block by block.
589sub _iwrite {
590  my $self = shift;
591  my ($D, $s, $e) = @_;
592  my $d = length $D;
593  my $c = $e-$s-$d;
594  local *FH = $self->{fh};
595  confess "Not enough space to insert $d bytes between $s and $e"
596    if $c < 0;
597  confess "[$s,$e) is an invalid insertion range" if $e < $s;
598
599  $self->_seekb($s);
600  read FH, my $buf, $e-$s;
601
602  $D .= substr($buf, 0, $c, "");
603
604  $self->_seekb($s);
605  $self->_write_record($D);
606
607  return $buf;
608}
609
610# Like _twrite, but the data-pos-len triple may be repeated; you may
611# write several chunks.  All the writing will be done in
612# one pass.   Chunks SHALL be in ascending order and SHALL NOT overlap.
613sub _mtwrite {
614  my $self = shift;
615  my $unwritten = "";
616  my $delta = 0;
617
618  @_ % 3 == 0
619    or die "Arguments to _mtwrite did not come in groups of three";
620
621  while (@_) {
622    my ($data, $pos, $len) = splice @_, 0, 3;
623    my $end = $pos + $len;  # The OLD end of the segment to be replaced
624    $data = $unwritten . $data;
625    $delta -= length($unwritten);
626    $unwritten  = "";
627    $pos += $delta;             # This is where the data goes now
628    my $dlen = length $data;
629    $self->_seekb($pos);
630    if ($len >= $dlen) {        # the data will fit
631      $self->_write_record($data);
632      $delta += ($dlen - $len); # everything following moves down by this much
633      $data = ""; # All the data in the buffer has been written
634    } else {                    # won't fit
635      my $writable = substr($data, 0, $len - $delta, "");
636      $self->_write_record($writable);
637      $delta += ($dlen - $len); # everything following moves down by this much
638    }
639
640    # At this point we've written some but maybe not all of the data.
641    # There might be a gap to close up, or $data might still contain a
642    # bunch of unwritten data that didn't fit.
643    my $ndlen = length $data;
644    if ($delta == 0) {
645      $self->_write_record($data);
646    } elsif ($delta < 0) {
647      # upcopy (close up gap)
648      if (@_) {
649        $self->_upcopy($end, $end + $delta, $_[1] - $end);
650      } else {
651        $self->_upcopy($end, $end + $delta);
652      }
653    } else {
654      # downcopy (insert data that didn't fit; replace this data in memory
655      # with _later_ data that doesn't fit)
656      if (@_) {
657        $unwritten = $self->_downcopy($data, $end, $_[1] - $end);
658      } else {
659        # Make the file longer to accommodate the last segment that doesn't
660        $unwritten = $self->_downcopy($data, $end);
661      }
662    }
663  }
664}
665
666# Copy block of data of length $len from position $spos to position $dpos
667# $dpos must be <= $spos
668#
669# If $len is undefined, go all the way to the end of the file
670# and then truncate it ($spos - $dpos bytes will be removed)
671sub _upcopy {
672  my $blocksize = 8192;
673  my ($self, $spos, $dpos, $len) = @_;
674  if ($dpos > $spos) {
675    die "source ($spos) was upstream of destination ($dpos) in _upcopy";
676  } elsif ($dpos == $spos) {
677    return;
678  }
679
680  while (! defined ($len) || $len > 0) {
681    my $readsize = ! defined($len) ? $blocksize
682               : $len > $blocksize ? $blocksize
683               : $len;
684
685    my $fh = $self->{fh};
686    $self->_seekb($spos);
687    my $bytes_read = read $fh, my($data), $readsize;
688    $self->_seekb($dpos);
689    if ($data eq "") {
690      $self->_chop_file;
691      last;
692    }
693    $self->_write_record($data);
694    $spos += $bytes_read;
695    $dpos += $bytes_read;
696    $len -= $bytes_read if defined $len;
697  }
698}
699
700# Write $data into a block of length $len at position $pos,
701# moving everything in the block forwards to make room.
702# Instead of writing the last length($data) bytes from the block
703# (because there isn't room for them any longer) return them.
704#
705# Undefined $len means 'until the end of the file'
706sub _downcopy {
707  my $blocksize = 8192;
708  my ($self, $data, $pos, $len) = @_;
709  my $fh = $self->{fh};
710
711  while (! defined $len || $len > 0) {
712    my $readsize = ! defined($len) ? $blocksize
713      : $len > $blocksize? $blocksize : $len;
714    $self->_seekb($pos);
715    read $fh, my($old), $readsize;
716    my $last_read_was_short = length($old) < $readsize;
717    $data .= $old;
718    my $writable;
719    if ($last_read_was_short) {
720      # If last read was short, then $data now contains the entire rest
721      # of the file, so there's no need to write only one block of it
722      $writable = $data;
723      $data = "";
724    } else {
725      $writable = substr($data, 0, $readsize, "");
726    }
727    last if $writable eq "";
728    $self->_seekb($pos);
729    $self->_write_record($writable);
730    last if $last_read_was_short && $data eq "";
731    $len -= $readsize if defined $len;
732    $pos += $readsize;
733  }
734  return $data;
735}
736
737# Adjust the object data structures following an '_mtwrite'
738# Arguments are
739#  [$pos, $nrecs, @length]  items
740# indicating that $nrecs records were removed at $recpos (a record offset)
741# and replaced with records of length @length...
742# Arguments guarantee that $recpos is strictly increasing.
743# No return value
744sub _oadjust {
745  my $self = shift;
746  my $delta = 0;
747  my $delta_recs = 0;
748  my $prev_end = -1;
749
750  for (@_) {
751    my ($pos, $nrecs, @data) = @$_;
752    $pos += $delta_recs;
753
754    # Adjust the offsets of the records after the previous batch up
755    # to the first new one of this batch
756    for my $i ($prev_end+2 .. $pos - 1) {
757      $self->{offsets}[$i] += $delta;
758    }
759
760    $prev_end = $pos + @data - 1; # last record moved on this pass
761
762    # Remove the offsets for the removed records;
763    # replace with the offsets for the inserted records
764    my @newoff = ($self->{offsets}[$pos] + $delta);
765    for my $i (0 .. $#data) {
766      my $newlen = length $data[$i];
767      push @newoff, $newoff[$i] + $newlen;
768      $delta += $newlen;
769    }
770
771    for my $i ($pos .. $pos+$nrecs-1) {
772      last if $i+1 > $#{$self->{offsets}};
773      my $oldlen = $self->{offsets}[$i+1] - $self->{offsets}[$i];
774      $delta -= $oldlen;
775    }
776
777    # replace old offsets with new
778    splice @{$self->{offsets}}, $pos, $nrecs+1, @newoff;
779    # What if we just spliced out the end of the offsets table?
780    # shouldn't we clear $self->{eof}?   Test for this XXX BUG TODO
781
782    $delta_recs += @data - $nrecs; # net change in total number of records
783  }
784
785  # The trailing records at the very end of the file
786  if ($delta) {
787    for my $i ($prev_end+2 .. $#{$self->{offsets}}) {
788      $self->{offsets}[$i] += $delta;
789    }
790  }
791
792  # If we scrubbed out all known offsets, regenerate the trivial table
793  # that knows that the file does indeed start at 0.
794  $self->{offsets}[0] = 0 unless @{$self->{offsets}};
795  # If the file got longer, the offsets table is no longer complete
796  # $self->{eof} = 0 if $delta_recs > 0;
797
798  # Now there might be too much data in the cache, if we spliced out
799  # some short records and spliced in some long ones.  If so, flush
800  # the cache.
801  $self->_cache_flush;
802}
803
804# If a record does not already end with the appropriate terminator
805# string, append one.
806sub _fixrecs {
807  my $self = shift;
808  for (@_) {
809    $_ = "" unless defined $_;
810    $_ .= $self->{recsep}
811      unless substr($_, - $self->{recseplen}) eq $self->{recsep};
812  }
813}
814
815
816################################################################
817#
818# Basic read, write, and seek
819#
820
821# seek to the beginning of record #$n
822# Assumes that the offsets table is already correctly populated
823#
824# Note that $n=-1 has a special meaning here: It means the start of
825# the last known record; this may or may not be the very last record
826# in the file, depending on whether the offsets table is fully populated.
827#
828sub _seek {
829  my ($self, $n) = @_;
830  my $o = $self->{offsets}[$n];
831  defined($o)
832    or confess("logic error: undefined offset for record $n");
833  seek $self->{fh}, $o, SEEK_SET
834    or confess "Couldn't seek filehandle: $!";  # "Should never happen."
835}
836
837# seek to byte $b in the file
838sub _seekb {
839  my ($self, $b) = @_;
840  seek $self->{fh}, $b, SEEK_SET
841    or die "Couldn't seek filehandle: $!";  # "Should never happen."
842}
843
844# populate the offsets table up to the beginning of record $n
845# return the offset of record $n
846sub _fill_offsets_to {
847  my ($self, $n) = @_;
848
849  return $self->{offsets}[$n] if $self->{eof};
850
851  my $fh = $self->{fh};
852  local *OFF = $self->{offsets};
853  my $rec;
854
855  until ($#OFF >= $n) {
856    $self->_seek(-1);           # tricky -- see comment at _seek
857    $rec = $self->_read_record;
858    if (defined $rec) {
859      push @OFF, int(tell $fh);  # Tels says that int() saves memory here
860    } else {
861      $self->{eof} = 1;
862      return;                   # It turns out there is no such record
863    }
864  }
865
866  # we have now read all the records up to record n-1,
867  # so we can return the offset of record n
868  $OFF[$n];
869}
870
871sub _fill_offsets {
872  my ($self) = @_;
873
874  my $fh = $self->{fh};
875  local *OFF = $self->{offsets};
876
877  $self->_seek(-1);           # tricky -- see comment at _seek
878
879  # Tels says that inlining read_record() would make this loop
880  # five times faster. 20030508
881  while ( defined $self->_read_record()) {
882    # int() saves us memory here
883    push @OFF, int(tell $fh);
884  }
885
886  $self->{eof} = 1;
887  $#OFF;
888}
889
890# assumes that $rec is already suitably terminated
891sub _write_record {
892  my ($self, $rec) = @_;
893  my $fh = $self->{fh};
894  local $\ = "";
895  print $fh $rec
896    or die "Couldn't write record: $!";  # "Should never happen."
897#  $self->{_written} += length($rec);
898}
899
900sub _read_record {
901  my $self = shift;
902  my $rec;
903  { local $/ = $self->{recsep};
904    my $fh = $self->{fh};
905    $rec = <$fh>;
906  }
907  return unless defined $rec;
908  if (substr($rec, -$self->{recseplen}) ne $self->{recsep}) {
909    # improperly terminated final record --- quietly fix it.
910#    my $ac = substr($rec, -$self->{recseplen});
911#    $ac =~ s/\n/\\n/g;
912    $self->{sawlastrec} = 1;
913    unless ($self->{rdonly}) {
914      local $\ = "";
915      my $fh = $self->{fh};
916      print $fh $self->{recsep};
917    }
918    $rec .= $self->{recsep};
919  }
920#  $self->{_read} += length($rec) if defined $rec;
921  $rec;
922}
923
924sub _rw_stats {
925  my $self = shift;
926  @{$self}{'_read', '_written'};
927}
928
929################################################################
930#
931# Read cache management
932
933sub _cache_flush {
934  my ($self) = @_;
935  $self->{cache}->reduce_size_to($self->{memory} - $self->{deferred_s});
936}
937
938sub _cache_too_full {
939  my $self = shift;
940  $self->{cache}->bytes + $self->{deferred_s} >= $self->{memory};
941}
942
943################################################################
944#
945# File custodial services
946#
947
948
949# We have read to the end of the file and have the offsets table
950# entirely populated.  Now we need to write a new record beyond
951# the end of the file.  We prepare for this by writing
952# empty records into the file up to the position we want
953#
954# assumes that the offsets table already contains the offset of record $n,
955# if it exists, and extends to the end of the file if not.
956sub _extend_file_to {
957  my ($self, $n) = @_;
958  $self->_seek(-1);             # position after the end of the last record
959  my $pos = $self->{offsets}[-1];
960
961  # the offsets table has one entry more than the total number of records
962  my $extras = $n - $#{$self->{offsets}};
963
964  # Todo : just use $self->{recsep} x $extras here?
965  while ($extras-- > 0) {
966    $self->_write_record($self->{recsep});
967    push @{$self->{offsets}}, int(tell $self->{fh});
968  }
969}
970
971# Truncate the file at the current position
972sub _chop_file {
973  my $self = shift;
974  truncate $self->{fh}, tell($self->{fh});
975}
976
977
978# compute the size of a buffer suitable for moving
979# all the data in a file forward $n bytes
980# ($n may be negative)
981# The result should be at least $n.
982sub _bufsize {
983  my $n = shift;
984  return 8192 if $n <= 0;
985  my $b = $n & ~8191;
986  $b += 8192 if $n & 8191;
987  $b;
988}
989
990################################################################
991#
992# Miscellaneous public methods
993#
994
995# Lock the file
996sub flock {
997  my ($self, $op) = @_;
998  unless (@_ <= 3) {
999    my $pack = ref $self;
1000    croak "Usage: $pack\->flock([OPERATION])";
1001  }
1002  my $fh = $self->{fh};
1003  $op = LOCK_EX unless defined $op;
1004  my $locked = flock $fh, $op;
1005
1006  if ($locked && ($op & (LOCK_EX | LOCK_SH))) {
1007    # If you're locking the file, then presumably it's because
1008    # there might have been a write access by another process.
1009    # In that case, the read cache contents and the offsets table
1010    # might be invalid, so discard them.  20030508
1011    $self->{offsets} = [0];
1012    $self->{cache}->empty;
1013  }
1014
1015  $locked;
1016}
1017
1018# Get/set autochomp option
1019sub autochomp {
1020  my $self = shift;
1021  if (@_) {
1022    my $old = $self->{autochomp};
1023    $self->{autochomp} = shift;
1024    $old;
1025  } else {
1026    $self->{autochomp};
1027  }
1028}
1029
1030# Get offset table entries; returns offset of nth record
1031sub offset {
1032  my ($self, $n) = @_;
1033
1034  if ($#{$self->{offsets}} < $n) {
1035    return if $self->{eof};     # request for record beyond the end of file
1036    my $o = $self->_fill_offsets_to($n);
1037    # If it's still undefined, there is no such record, so return 'undef'
1038    return unless defined $o;
1039   }
1040
1041  $self->{offsets}[$n];
1042}
1043
1044sub discard_offsets {
1045  my $self = shift;
1046  $self->{offsets} = [0];
1047}
1048
1049################################################################
1050#
1051# Matters related to deferred writing
1052#
1053
1054# Defer writes
1055sub defer {
1056  my $self = shift;
1057  $self->_stop_autodeferring;
1058  @{$self->{ad_history}} = ();
1059  $self->{defer} = 1;
1060}
1061
1062# Flush deferred writes
1063#
1064# This could be better optimized to write the file in one pass, instead
1065# of one pass per block of records.  But that will require modifications
1066# to _twrite, so I should have a good _twrite test suite first.
1067sub flush {
1068  my $self = shift;
1069
1070  $self->_flush;
1071  $self->{defer} = 0;
1072}
1073
1074sub _old_flush {
1075  my $self = shift;
1076  my @writable = sort {$a<=>$b} (keys %{$self->{deferred}});
1077
1078  while (@writable) {
1079    # gather all consecutive records from the front of @writable
1080    my $first_rec = shift @writable;
1081    my $last_rec = $first_rec+1;
1082    ++$last_rec, shift @writable while @writable && $last_rec == $writable[0];
1083    --$last_rec;
1084    $self->_fill_offsets_to($last_rec);
1085    $self->_extend_file_to($last_rec);
1086    $self->_splice($first_rec, $last_rec-$first_rec+1,
1087                   @{$self->{deferred}}{$first_rec .. $last_rec});
1088  }
1089
1090  $self->_discard;               # clear out defered-write-cache
1091}
1092
1093sub _flush {
1094  my $self = shift;
1095  my @writable = sort {$a<=>$b} (keys %{$self->{deferred}});
1096  my @args;
1097  my @adjust;
1098
1099  while (@writable) {
1100    # gather all consecutive records from the front of @writable
1101    my $first_rec = shift @writable;
1102    my $last_rec = $first_rec+1;
1103    ++$last_rec, shift @writable while @writable && $last_rec == $writable[0];
1104    --$last_rec;
1105    my $end = $self->_fill_offsets_to($last_rec+1);
1106    if (not defined $end) {
1107      $self->_extend_file_to($last_rec);
1108      $end = $self->{offsets}[$last_rec];
1109    }
1110    my ($start) = $self->{offsets}[$first_rec];
1111    push @args,
1112         join("", @{$self->{deferred}}{$first_rec .. $last_rec}), # data
1113         $start,                                                  # position
1114         $end-$start;                                             # length
1115    push @adjust, [$first_rec, # starting at this position...
1116                   $last_rec-$first_rec+1,  # this many records...
1117                   # are replaced with these...
1118                   @{$self->{deferred}}{$first_rec .. $last_rec},
1119                  ];
1120  }
1121
1122  $self->_mtwrite(@args);  # write multiple record groups
1123  $self->_discard;               # clear out defered-write-cache
1124  $self->_oadjust(@adjust);
1125}
1126
1127# Discard deferred writes and disable future deferred writes
1128sub discard {
1129  my $self = shift;
1130  $self->_discard;
1131  $self->{defer} = 0;
1132}
1133
1134# Discard deferred writes, but retain old deferred writing mode
1135sub _discard {
1136  my $self = shift;
1137  %{$self->{deferred}} = ();
1138  $self->{deferred_s}  = 0;
1139  $self->{deferred_max}  = -1;
1140  $self->{cache}->set_limit($self->{memory});
1141}
1142
1143# Deferred writing is enabled, either explicitly ($self->{defer})
1144# or automatically ($self->{autodeferring})
1145sub _is_deferring {
1146  my $self = shift;
1147  $self->{defer} || $self->{autodeferring};
1148}
1149
1150# The largest record number of any deferred record
1151sub _defer_max {
1152  my $self = shift;
1153  return $self->{deferred_max} if defined $self->{deferred_max};
1154  my $max = -1;
1155  for my $key (keys %{$self->{deferred}}) {
1156    $max = $key if $key > $max;
1157  }
1158  $self->{deferred_max} = $max;
1159  $max;
1160}
1161
1162################################################################
1163#
1164# Matters related to autodeferment
1165#
1166
1167# Get/set autodefer option
1168sub autodefer {
1169  my $self = shift;
1170  if (@_) {
1171    my $old = $self->{autodefer};
1172    $self->{autodefer} = shift;
1173    if ($old) {
1174      $self->_stop_autodeferring;
1175      @{$self->{ad_history}} = ();
1176    }
1177    $old;
1178  } else {
1179    $self->{autodefer};
1180  }
1181}
1182
1183# The user is trying to store record #$n Record that in the history,
1184# and then enable (or disable) autodeferment if that seems useful.
1185# Note that it's OK for $n to be a non-number, as long as the function
1186# is prepared to deal with that.  Nobody else looks at the ad_history.
1187#
1188# Now, what does the ad_history mean, and what is this function doing?
1189# Essentially, the idea is to enable autodeferring when we see that the
1190# user has made three consecutive STORE calls to three consecutive records.
1191# ("Three" is actually ->{autodefer_threshhold}.)
1192# A STORE call for record #$n inserts $n into the autodefer history,
1193# and if the history contains three consecutive records, we enable
1194# autodeferment.  An ad_history of [X, Y] means that the most recent
1195# STOREs were for records X, X+1, ..., Y, in that order.
1196#
1197# Inserting a nonconsecutive number erases the history and starts over.
1198#
1199# Performing a special operation like SPLICE erases the history.
1200#
1201# There's one special case: CLEAR means that CLEAR was just called.
1202# In this case, we prime the history with [-2, -1] so that if the next
1203# write is for record 0, autodeferring goes on immediately.  This is for
1204# the common special case of "@a = (...)".
1205#
1206sub _annotate_ad_history {
1207  my ($self, $n) = @_;
1208  return unless $self->{autodefer}; # feature is disabled
1209  return if $self->{defer};     # already in explicit defer mode
1210  return unless $self->{offsets}[-1] >= $self->{autodefer_filelen_threshhold};
1211
1212  local *H = $self->{ad_history};
1213  if ($n eq 'CLEAR') {
1214    @H = (-2, -1);              # prime the history with fake records
1215    $self->_stop_autodeferring;
1216  } elsif ($n =~ /^\d+$/) {
1217    if (@H == 0) {
1218      @H =  ($n, $n);
1219    } else {                    # @H == 2
1220      if ($H[1] == $n-1) {      # another consecutive record
1221        $H[1]++;
1222        if ($H[1] - $H[0] + 1 >= $self->{autodefer_threshhold}) {
1223          $self->{autodeferring} = 1;
1224        }
1225      } else {                  # nonconsecutive- erase and start over
1226        @H = ($n, $n);
1227        $self->_stop_autodeferring;
1228      }
1229    }
1230  } else {                      # SPLICE or STORESIZE or some such
1231    @H = ();
1232    $self->_stop_autodeferring;
1233  }
1234}
1235
1236# If autodeferring was enabled, cut it out and discard the history
1237sub _stop_autodeferring {
1238  my $self = shift;
1239  if ($self->{autodeferring}) {
1240    $self->_flush;
1241  }
1242  $self->{autodeferring} = 0;
1243}
1244
1245################################################################
1246
1247
1248# This is NOT a method.  It is here for two reasons:
1249#  1. To factor a fairly complicated block out of the constructor
1250#  2. To provide access for the test suite, which need to be sure
1251#     files are being written properly.
1252sub _default_recsep {
1253  my $recsep = $/;
1254  if ($^O eq 'MSWin32') {       # Dos too?
1255    # Windows users expect files to be terminated with \r\n
1256    # But $/ is set to \n instead
1257    # Note that this also transforms \n\n into \r\n\r\n.
1258    # That is a feature.
1259    $recsep =~ s/\n/\r\n/g;
1260  }
1261  $recsep;
1262}
1263
1264# Utility function for _check_integrity
1265sub _ci_warn {
1266  my $msg = shift;
1267  $msg =~ s/\n/\\n/g;
1268  $msg =~ s/\r/\\r/g;
1269  print "# $msg\n";
1270}
1271
1272# Given a file, make sure the cache is consistent with the
1273# file contents and the internal data structures are consistent with
1274# each other.  Returns true if everything checks out, false if not
1275#
1276# The $file argument is no longer used.  It is retained for compatibility
1277# with the existing test suite.
1278sub _check_integrity {
1279  my ($self, $file, $warn) = @_;
1280  my $rsl = $self->{recseplen};
1281  my $rs  = $self->{recsep};
1282  my $good = 1;
1283  local *_;                     # local $_ does not work here
1284  local $DIAGNOSTIC = 1;
1285
1286  if (not defined $rs) {
1287    _ci_warn("recsep is undef!");
1288    $good = 0;
1289  } elsif ($rs eq "") {
1290    _ci_warn("recsep is empty!");
1291    $good = 0;
1292  } elsif ($rsl != length $rs) {
1293    my $ln = length $rs;
1294    _ci_warn("recsep <$rs> has length $ln, should be $rsl");
1295    $good = 0;
1296  }
1297
1298  if (not defined $self->{offsets}[0]) {
1299    _ci_warn("offset 0 is missing!");
1300    $good = 0;
1301
1302  } elsif ($self->{offsets}[0] != 0) {
1303    _ci_warn("rec 0: offset <$self->{offsets}[0]> s/b 0!");
1304    $good = 0;
1305  }
1306
1307  my $cached = 0;
1308  {
1309    local *F = $self->{fh};
1310    seek F, 0, SEEK_SET;
1311    local $. = 0;
1312    local $/ = $rs;
1313
1314    while (<F>) {
1315      my $n = $. - 1;
1316      my $cached = $self->{cache}->_produce($n);
1317      my $offset = $self->{offsets}[$.];
1318      my $ao = tell F;
1319      if (defined $offset && $offset != $ao) {
1320        _ci_warn("rec $n: offset <$offset> actual <$ao>");
1321        $good = 0;
1322      }
1323      if (defined $cached && $_ ne $cached && ! $self->{deferred}{$n}) {
1324        $good = 0;
1325        _ci_warn("rec $n: cached <$cached> actual <$_>");
1326      }
1327      if (defined $cached && substr($cached, -$rsl) ne $rs) {
1328        $good = 0;
1329        _ci_warn("rec $n in the cache is missing the record separator");
1330      }
1331      if (! defined $offset && $self->{eof}) {
1332        $good = 0;
1333        _ci_warn("The offset table was marked complete, but it is missing " .
1334                 "element $.");
1335      }
1336    }
1337    if (@{$self->{offsets}} > $.+1) {
1338        $good = 0;
1339        my $n = @{$self->{offsets}};
1340        _ci_warn("The offset table has $n items, but the file has only $.");
1341    }
1342
1343    my $deferring = $self->_is_deferring;
1344    for my $n ($self->{cache}->ckeys) {
1345      my $r = $self->{cache}->_produce($n);
1346      $cached += length($r);
1347      next if $n+1 <= $.;         # checked this already
1348      _ci_warn("spurious caching of record $n");
1349      $good = 0;
1350    }
1351    my $b = $self->{cache}->bytes;
1352    if ($cached != $b) {
1353      _ci_warn("cache size is $b, should be $cached");
1354      $good = 0;
1355    }
1356  }
1357
1358  # That cache has its own set of tests
1359  $good = 0 unless $self->{cache}->_check_integrity;
1360
1361  # Now let's check the deferbuffer
1362  # Unless deferred writing is enabled, it should be empty
1363  if (! $self->_is_deferring && %{$self->{deferred}}) {
1364    _ci_warn("deferred writing disabled, but deferbuffer nonempty");
1365    $good = 0;
1366  }
1367
1368  # Any record in the deferbuffer should *not* be present in the readcache
1369  my $deferred_s = 0;
1370  while (my ($n, $r) = each %{$self->{deferred}}) {
1371    $deferred_s += length($r);
1372    if (defined $self->{cache}->_produce($n)) {
1373      _ci_warn("record $n is in the deferbuffer *and* the readcache");
1374      $good = 0;
1375    }
1376    if (substr($r, -$rsl) ne $rs) {
1377      _ci_warn("rec $n in the deferbuffer is missing the record separator");
1378      $good = 0;
1379    }
1380  }
1381
1382  # Total size of deferbuffer should match internal total
1383  if ($deferred_s != $self->{deferred_s}) {
1384    _ci_warn("buffer size is $self->{deferred_s}, should be $deferred_s");
1385    $good = 0;
1386  }
1387
1388  # Total size of deferbuffer should not exceed the specified limit
1389  if ($deferred_s > $self->{dw_size}) {
1390    _ci_warn("buffer size is $self->{deferred_s} which exceeds the limit " .
1391             "of $self->{dw_size}");
1392    $good = 0;
1393  }
1394
1395  # Total size of cached data should not exceed the specified limit
1396  if ($deferred_s + $cached > $self->{memory}) {
1397    my $total = $deferred_s + $cached;
1398    _ci_warn("total stored data size is $total which exceeds the limit " .
1399             "of $self->{memory}");
1400    $good = 0;
1401  }
1402
1403  # Stuff related to autodeferment
1404  if (!$self->{autodefer} && @{$self->{ad_history}}) {
1405    _ci_warn("autodefer is disabled, but ad_history is nonempty");
1406    $good = 0;
1407  }
1408  if ($self->{autodeferring} && $self->{defer}) {
1409    _ci_warn("both autodeferring and explicit deferring are active");
1410    $good = 0;
1411  }
1412  if (@{$self->{ad_history}} == 0) {
1413    # That's OK, no additional tests required
1414  } elsif (@{$self->{ad_history}} == 2) {
1415    my @non_number = grep !/^-?\d+$/, @{$self->{ad_history}};
1416    if (@non_number) {
1417      my $msg;
1418      { local $" = ')(';
1419        $msg = "ad_history contains non-numbers (@{$self->{ad_history}})";
1420      }
1421      _ci_warn($msg);
1422      $good = 0;
1423    } elsif ($self->{ad_history}[1] < $self->{ad_history}[0]) {
1424      _ci_warn("ad_history has nonsensical values @{$self->{ad_history}}");
1425      $good = 0;
1426    }
1427  } else {
1428    _ci_warn("ad_history has bad length <@{$self->{ad_history}}>");
1429    $good = 0;
1430  }
1431
1432  $good;
1433}
1434
1435################################################################
1436#
1437# Tie::File::Cache
1438#
1439# Read cache
1440
1441package Tie::File::Cache;
1442$Tie::File::Cache::VERSION = $Tie::File::VERSION;
1443use Carp ':DEFAULT', 'confess';
1444
1445use constant {
1446    HEAP  => 0,
1447    HASH  => 1,
1448    MAX   => 2,
1449    BYTES => 3,
1450    #STAT  => 4, # Array with request statistics for each record
1451    #MISS  => 5, # Total number of cache misses
1452    #REQ   => 6, # Total number of cache requests
1453};
1454
1455sub new {
1456  my ($pack, $max) = @_;
1457  local *_;
1458  croak "missing argument to ->new" unless defined $max;
1459  my $self = [];
1460  bless $self => $pack;
1461  @$self = (Tie::File::Heap->new($self), {}, $max, 0);
1462  $self;
1463}
1464
1465sub adj_limit {
1466  my ($self, $n) = @_;
1467  $self->[MAX] += $n;
1468}
1469
1470sub set_limit {
1471  my ($self, $n) = @_;
1472  $self->[MAX] = $n;
1473}
1474
1475# For internal use only
1476# Will be called by the heap structure to notify us that a certain
1477# piece of data has moved from one heap element to another.
1478# $k is the hash key of the item
1479# $n is the new index into the heap at which it is stored
1480# If $n is undefined, the item has been removed from the heap.
1481sub _heap_move {
1482  my ($self, $k, $n) = @_;
1483  if (defined $n) {
1484    $self->[HASH]{$k} = $n;
1485  } else {
1486    delete $self->[HASH]{$k};
1487  }
1488}
1489
1490sub insert {
1491  my ($self, $key, $val) = @_;
1492  local *_;
1493  croak "missing argument to ->insert" unless defined $key;
1494  unless (defined $self->[MAX]) {
1495    confess "undefined max" ;
1496  }
1497  confess "undefined val" unless defined $val;
1498  return if length($val) > $self->[MAX];
1499
1500#  if ($self->[STAT]) {
1501#    $self->[STAT][$key] = 1;
1502#    return;
1503#  }
1504
1505  my $oldnode = $self->[HASH]{$key};
1506  if (defined $oldnode) {
1507    my $oldval = $self->[HEAP]->set_val($oldnode, $val);
1508    $self->[BYTES] -= length($oldval);
1509  } else {
1510    $self->[HEAP]->insert($key, $val);
1511  }
1512  $self->[BYTES] += length($val);
1513  $self->flush if $self->[BYTES] > $self->[MAX];
1514}
1515
1516sub expire {
1517  my $self = shift;
1518  my $old_data = $self->[HEAP]->popheap;
1519  return unless defined $old_data;
1520  $self->[BYTES] -= length $old_data;
1521  $old_data;
1522}
1523
1524sub remove {
1525  my ($self, @keys) = @_;
1526  my @result;
1527
1528#  if ($self->[STAT]) {
1529#    for my $key (@keys) {
1530#      $self->[STAT][$key] = 0;
1531#    }
1532#    return;
1533#  }
1534
1535  for my $key (@keys) {
1536    next unless exists $self->[HASH]{$key};
1537    my $old_data = $self->[HEAP]->remove($self->[HASH]{$key});
1538    $self->[BYTES] -= length $old_data;
1539    push @result, $old_data;
1540  }
1541  @result;
1542}
1543
1544sub lookup {
1545  my ($self, $key) = @_;
1546  local *_;
1547  croak "missing argument to ->lookup" unless defined $key;
1548
1549#  if ($self->[STAT]) {
1550#    $self->[MISS]++  if $self->[STAT][$key]++ == 0;
1551#    $self->[REQ]++;
1552#    my $hit_rate = 1 - $self->[MISS] / $self->[REQ];
1553#    # Do some testing to determine this threshhold
1554#    $#$self = STAT - 1 if $hit_rate > 0.20;
1555#  }
1556
1557  if (exists $self->[HASH]{$key}) {
1558    $self->[HEAP]->lookup($self->[HASH]{$key});
1559  } else {
1560    return;
1561  }
1562}
1563
1564# For internal use only
1565sub _produce {
1566  my ($self, $key) = @_;
1567  my $loc = $self->[HASH]{$key};
1568  return unless defined $loc;
1569  $self->[HEAP][$loc][2];
1570}
1571
1572# For internal use only
1573sub _promote {
1574  my ($self, $key) = @_;
1575  $self->[HEAP]->promote($self->[HASH]{$key});
1576}
1577
1578sub empty {
1579  my ($self) = @_;
1580  %{$self->[HASH]} = ();
1581    $self->[BYTES] = 0;
1582    $self->[HEAP]->empty;
1583#  @{$self->[STAT]} = ();
1584#    $self->[MISS] = 0;
1585#    $self->[REQ] = 0;
1586}
1587
1588sub is_empty {
1589  my ($self) = @_;
1590  keys %{$self->[HASH]} == 0;
1591}
1592
1593sub update {
1594  my ($self, $key, $val) = @_;
1595  local *_;
1596  croak "missing argument to ->update" unless defined $key;
1597  if (length($val) > $self->[MAX]) {
1598    my ($oldval) = $self->remove($key);
1599    $self->[BYTES] -= length($oldval) if defined $oldval;
1600  } elsif (exists $self->[HASH]{$key}) {
1601    my $oldval = $self->[HEAP]->set_val($self->[HASH]{$key}, $val);
1602    $self->[BYTES] += length($val);
1603    $self->[BYTES] -= length($oldval) if defined $oldval;
1604  } else {
1605    $self->[HEAP]->insert($key, $val);
1606    $self->[BYTES] += length($val);
1607  }
1608  $self->flush;
1609}
1610
1611sub rekey {
1612  my ($self, $okeys, $nkeys) = @_;
1613  local *_;
1614  my %map;
1615  @map{@$okeys} = @$nkeys;
1616  croak "missing argument to ->rekey" unless defined $nkeys;
1617  croak "length mismatch in ->rekey arguments" unless @$nkeys == @$okeys;
1618  my %adjusted;                 # map new keys to heap indices
1619  # You should be able to cut this to one loop TODO XXX
1620  for (0 .. $#$okeys) {
1621    $adjusted{$nkeys->[$_]} = delete $self->[HASH]{$okeys->[$_]};
1622  }
1623  while (my ($nk, $ix) = each %adjusted) {
1624    # @{$self->[HASH]}{keys %adjusted} = values %adjusted;
1625    $self->[HEAP]->rekey($ix, $nk);
1626    $self->[HASH]{$nk} = $ix;
1627  }
1628}
1629
1630sub ckeys {
1631  my $self = shift;
1632  my @a = keys %{$self->[HASH]};
1633  @a;
1634}
1635
1636# Return total amount of cached data
1637sub bytes {
1638  my $self = shift;
1639  $self->[BYTES];
1640}
1641
1642# Expire oldest item from cache until cache size is smaller than $max
1643sub reduce_size_to {
1644  my ($self, $max) = @_;
1645  until ($self->[BYTES] <= $max) {
1646    # Note that Tie::File::Cache::expire has been inlined here
1647    my $old_data = $self->[HEAP]->popheap;
1648    return unless defined $old_data;
1649    $self->[BYTES] -= length $old_data;
1650  }
1651}
1652
1653# Why not just $self->reduce_size_to($self->[MAX])?
1654# Try this when things stabilize   TODO XXX
1655# If the cache is too full, expire the oldest records
1656sub flush {
1657  my $self = shift;
1658  $self->reduce_size_to($self->[MAX]) if $self->[BYTES] > $self->[MAX];
1659}
1660
1661# For internal use only
1662sub _produce_lru {
1663  my $self = shift;
1664  $self->[HEAP]->expire_order;
1665}
1666
1667BEGIN { *_ci_warn = \&Tie::File::_ci_warn }
1668
1669sub _check_integrity {          # For CACHE
1670  my $self = shift;
1671  my $good = 1;
1672
1673  # Test HEAP
1674  $self->[HEAP]->_check_integrity or $good = 0;
1675
1676  # Test HASH
1677  my $bytes = 0;
1678  for my $k (keys %{$self->[HASH]}) {
1679    if ($k ne '0' && $k !~ /^[1-9][0-9]*$/) {
1680      $good = 0;
1681      _ci_warn "Cache hash key <$k> is non-numeric";
1682    }
1683
1684    my $h = $self->[HASH]{$k};
1685    if (! defined $h) {
1686      $good = 0;
1687      _ci_warn "Heap index number for key $k is undefined";
1688    } elsif ($h == 0) {
1689      $good = 0;
1690      _ci_warn "Heap index number for key $k is zero";
1691    } else {
1692      my $j = $self->[HEAP][$h];
1693      if (! defined $j) {
1694        $good = 0;
1695        _ci_warn "Heap contents key $k (=> $h) are undefined";
1696      } else {
1697        $bytes += length($j->[2]);
1698        if ($k ne $j->[1]) {
1699          $good = 0;
1700          _ci_warn "Heap contents key $k (=> $h) is $j->[1], should be $k";
1701        }
1702      }
1703    }
1704  }
1705
1706  # Test BYTES
1707  if ($bytes != $self->[BYTES]) {
1708    $good = 0;
1709    _ci_warn "Total data in cache is $bytes, expected $self->[BYTES]";
1710  }
1711
1712  # Test MAX
1713  if ($bytes > $self->[MAX]) {
1714    $good = 0;
1715    _ci_warn "Total data in cache is $bytes, exceeds maximum $self->[MAX]";
1716  }
1717
1718  return $good;
1719}
1720
1721sub delink {
1722  my $self = shift;
1723  $self->[HEAP] = undef;        # Bye bye heap
1724}
1725
1726################################################################
1727#
1728# Tie::File::Heap
1729#
1730# Heap data structure for use by cache LRU routines
1731
1732package Tie::File::Heap;
1733use Carp ':DEFAULT', 'confess';
1734$Tie::File::Heap::VERSION = $Tie::File::Cache::VERSION;
1735use constant {
1736    SEQ => 0,
1737    KEY => 1,
1738    DAT => 2,
1739};
1740
1741sub new {
1742  my ($pack, $cache) = @_;
1743  die "$pack: Parent cache object $cache does not support _heap_move method"
1744    unless eval { $cache->can('_heap_move') };
1745  my $self = [[0,$cache,0]];
1746  bless $self => $pack;
1747}
1748
1749# Allocate a new sequence number, larger than all previously allocated numbers
1750sub _nseq {
1751  my $self = shift;
1752  $self->[0][0]++;
1753}
1754
1755sub _cache {
1756  my $self = shift;
1757  $self->[0][1];
1758}
1759
1760sub _nelts {
1761  my $self = shift;
1762  $self->[0][2];
1763}
1764
1765sub _nelts_inc {
1766  my $self = shift;
1767  ++$self->[0][2];
1768}
1769
1770sub _nelts_dec {
1771  my $self = shift;
1772  --$self->[0][2];
1773}
1774
1775sub is_empty {
1776  my $self = shift;
1777  $self->_nelts == 0;
1778}
1779
1780sub empty {
1781  my $self = shift;
1782  $#$self = 0;
1783  $self->[0][2] = 0;
1784  $self->[0][0] = 0;            # might as well reset the sequence numbers
1785}
1786
1787# notify the parent cache object that we moved something
1788sub _heap_move {
1789  my $self = shift;
1790  $self->_cache->_heap_move(@_);
1791}
1792
1793# Insert a piece of data into the heap with the indicated sequence number.
1794# The item with the smallest sequence number is always at the top.
1795# If no sequence number is specified, allocate a new one and insert the
1796# item at the bottom.
1797sub insert {
1798  my ($self, $key, $data, $seq) = @_;
1799  $seq = $self->_nseq unless defined $seq;
1800  $self->_insert_new([$seq, $key, $data]);
1801}
1802
1803# Insert a new, fresh item at the bottom of the heap
1804sub _insert_new {
1805  my ($self, $item) = @_;
1806  my $i = @$self;
1807  $i = int($i/2) until defined $self->[$i/2];
1808  $self->[$i] = $item;
1809  $self->[0][1]->_heap_move($self->[$i][KEY], $i);
1810  $self->_nelts_inc;
1811}
1812
1813# Insert [$data, $seq] pair at or below item $i in the heap.
1814# If $i is omitted, default to 1 (the top element.)
1815sub _insert {
1816  my ($self, $item, $i) = @_;
1817#  $self->_check_loc($i) if defined $i;
1818  $i = 1 unless defined $i;
1819  until (! defined $self->[$i]) {
1820    if ($self->[$i][SEQ] > $item->[SEQ]) { # inserted item is older
1821      ($self->[$i], $item) = ($item, $self->[$i]);
1822      $self->[0][1]->_heap_move($self->[$i][KEY], $i);
1823    }
1824    # If either is undefined, go that way.  Otherwise, choose at random
1825    my $dir;
1826    $dir = 0 if !defined $self->[2*$i];
1827    $dir = 1 if !defined $self->[2*$i+1];
1828    $dir = int(rand(2)) unless defined $dir;
1829    $i = 2*$i + $dir;
1830  }
1831  $self->[$i] = $item;
1832  $self->[0][1]->_heap_move($self->[$i][KEY], $i);
1833  $self->_nelts_inc;
1834}
1835
1836# Remove the item at node $i from the heap, moving child items upwards.
1837# The item with the smallest sequence number is always at the top.
1838# Moving items upwards maintains this condition.
1839# Return the removed item.  Return undef if there was no item at node $i.
1840sub remove {
1841  my ($self, $i) = @_;
1842  $i = 1 unless defined $i;
1843  my $top = $self->[$i];
1844  return unless defined $top;
1845  while (1) {
1846    my $ii;
1847    my ($L, $R) = (2*$i, 2*$i+1);
1848
1849    # If either is undefined, go the other way.
1850    # Otherwise, go towards the smallest.
1851    last unless defined $self->[$L] || defined $self->[$R];
1852    $ii = $R if not defined $self->[$L];
1853    $ii = $L if not defined $self->[$R];
1854    unless (defined $ii) {
1855      $ii = $self->[$L][SEQ] < $self->[$R][SEQ] ? $L : $R;
1856    }
1857
1858    $self->[$i] = $self->[$ii]; # Promote child to fill vacated spot
1859    $self->[0][1]->_heap_move($self->[$i][KEY], $i);
1860    $i = $ii; # Fill new vacated spot
1861  }
1862  $self->[0][1]->_heap_move($top->[KEY], undef);
1863  undef $self->[$i];
1864  $self->_nelts_dec;
1865  return $top->[DAT];
1866}
1867
1868sub popheap {
1869  my $self = shift;
1870  $self->remove(1);
1871}
1872
1873# set the sequence number of the indicated item to a higher number
1874# than any other item in the heap, and bubble the item down to the
1875# bottom.
1876sub promote {
1877  my ($self, $n) = @_;
1878#  $self->_check_loc($n);
1879  $self->[$n][SEQ] = $self->_nseq;
1880  my $i = $n;
1881  while (1) {
1882    my ($L, $R) = (2*$i, 2*$i+1);
1883    my $dir;
1884    last unless defined $self->[$L] || defined $self->[$R];
1885    $dir = $R unless defined $self->[$L];
1886    $dir = $L unless defined $self->[$R];
1887    unless (defined $dir) {
1888      $dir = $self->[$L][SEQ] < $self->[$R][SEQ] ? $L : $R;
1889    }
1890    @{$self}[$i, $dir] = @{$self}[$dir, $i];
1891    for ($i, $dir) {
1892      $self->[0][1]->_heap_move($self->[$_][KEY], $_) if defined $self->[$_];
1893    }
1894    $i = $dir;
1895  }
1896}
1897
1898# Return item $n from the heap, promoting its LRU status
1899sub lookup {
1900  my ($self, $n) = @_;
1901#  $self->_check_loc($n);
1902  my $val = $self->[$n];
1903  $self->promote($n);
1904  $val->[DAT];
1905}
1906
1907
1908# Assign a new value for node $n, promoting it to the bottom of the heap
1909sub set_val {
1910  my ($self, $n, $val) = @_;
1911#  $self->_check_loc($n);
1912  my $oval = $self->[$n][DAT];
1913  $self->[$n][DAT] = $val;
1914  $self->promote($n);
1915  return $oval;
1916}
1917
1918# The hash key has changed for an item;
1919# alter the heap's record of the hash key
1920sub rekey {
1921  my ($self, $n, $new_key) = @_;
1922#  $self->_check_loc($n);
1923  $self->[$n][KEY] = $new_key;
1924}
1925
1926sub _check_loc {
1927  my ($self, $n) = @_;
1928  unless (1 || defined $self->[$n]) {
1929    confess "_check_loc($n) failed";
1930  }
1931}
1932
1933BEGIN { *_ci_warn = \&Tie::File::_ci_warn }
1934
1935sub _check_integrity {
1936  my $self = shift;
1937  my $good = 1;
1938  my %seq;
1939
1940  unless (eval {$self->[0][1]->isa("Tie::File::Cache")}) {
1941    _ci_warn "Element 0 of heap corrupt";
1942    $good = 0;
1943  }
1944  $good = 0 unless $self->_satisfies_heap_condition(1);
1945  for my $i (2 .. $#{$self}) {
1946    my $p = int($i/2);          # index of parent node
1947    if (defined $self->[$i] && ! defined $self->[$p]) {
1948      _ci_warn "Element $i of heap defined, but parent $p isn't";
1949      $good = 0;
1950    }
1951
1952    if (defined $self->[$i]) {
1953      if ($seq{$self->[$i][SEQ]}) {
1954        my $seq = $self->[$i][SEQ];
1955        _ci_warn "Nodes $i and $seq{$seq} both have SEQ=$seq";
1956        $good = 0;
1957      } else {
1958        $seq{$self->[$i][SEQ]} = $i;
1959      }
1960    }
1961  }
1962
1963  return $good;
1964}
1965
1966sub _satisfies_heap_condition {
1967  my $self = shift;
1968  my $n = shift || 1;
1969  my $good = 1;
1970  for (0, 1) {
1971    my $c = $n*2 + $_;
1972    next unless defined $self->[$c];
1973    if ($self->[$n][SEQ] >= $self->[$c]) {
1974      _ci_warn "Node $n of heap does not predate node $c";
1975      $good = 0 ;
1976    }
1977    $good = 0 unless $self->_satisfies_heap_condition($c);
1978  }
1979  return $good;
1980}
1981
1982# Return a list of all the values, sorted by expiration order
1983sub expire_order {
1984  my $self = shift;
1985  my @nodes = sort {$a->[SEQ] <=> $b->[SEQ]} $self->_nodes;
1986  map { $_->[KEY] } @nodes;
1987}
1988
1989sub _nodes {
1990  my $self = shift;
1991  my $i = shift || 1;
1992  return unless defined $self->[$i];
1993  ($self->[$i], $self->_nodes($i*2), $self->_nodes($i*2+1));
1994}
1995
19961;
1997
1998__END__
1999
2000=head1 NAME
2001
2002Tie::File - Access the lines of a disk file via a Perl array
2003
2004=head1 SYNOPSIS
2005
2006 use Tie::File;
2007
2008 tie @array, 'Tie::File', filename or die ...;
2009
2010 $array[0] = 'blah';      # first line of the file is now 'blah'
2011                            # (line numbering starts at 0)
2012 print $array[42];        # display line 43 of the file
2013
2014 $n_recs = @array;        # how many records are in the file?
2015 $#array -= 2;            # chop two records off the end
2016
2017
2018 for (@array) {
2019   s/PERL/Perl/g;        # Replace PERL with Perl everywhere in the file
2020 }
2021
2022 # These are just like regular push, pop, unshift, shift, and splice
2023 # Except that they modify the file in the way you would expect
2024
2025 push @array, new recs...;
2026 my $r1 = pop @array;
2027 unshift @array, new recs...;
2028 my $r2 = shift @array;
2029 @old_recs = splice @array, 3, 7, new recs...;
2030
2031 untie @array;            # all finished
2032
2033
2034=head1 DESCRIPTION
2035
2036C<Tie::File> represents a regular text file as a Perl array.  Each
2037element in the array corresponds to a record in the file.  The first
2038line of the file is element 0 of the array; the second line is element
20391, and so on.
2040
2041The file is I<not> loaded into memory, so this will work even for
2042gigantic files.
2043
2044Changes to the array are reflected in the file immediately.
2045
2046Lazy people and beginners may now stop reading the manual.
2047
2048=head2 C<unicode>
2049
2050You can read a unicode (UTF-8) file by providing a file handle opened with
2051the desired encoding. It is not safe to write to one because
2052the length in bytes and in characters is often different, Tie::File
2053will miscalculate the length of writes, overwriting parts of other records.
2054
2055=head2 C<recsep>
2056
2057What is a 'record'?  By default, the meaning is the same as for the
2058C<E<lt>...E<gt>> operator: It's a string terminated by C<$/>, which is
2059probably C<"\n">.  (Minor exception: on DOS and Win32 systems, a
2060'record' is a string terminated by C<"\r\n">.)  You may change the
2061definition of "record" by supplying the C<recsep> option in the C<tie>
2062call:
2063
2064	tie @array, 'Tie::File', $file, recsep => 'es';
2065
2066This says that records are delimited by the string C<es>.  If the file
2067contained the following data:
2068
2069	Curse these pesky flies!\n
2070
2071then the C<@array> would appear to have four elements:
2072
2073	"Curse th"
2074	"e p"
2075	"ky fli"
2076	"!\n"
2077
2078An undefined value is not permitted as a record separator.  Perl's
2079special "paragraph mode" semantics (E<agrave> la C<$/ = "">) are not
2080emulated.
2081
2082Records read from the tied array do not have the record separator
2083string on the end; this is to allow
2084
2085	$array[17] .= "extra";
2086
2087to work as expected.
2088
2089(See L<"autochomp">, below.)  Records stored into the array will have
2090the record separator string appended before they are written to the
2091file, if they don't have one already.  For example, if the record
2092separator string is C<"\n">, then the following two lines do exactly
2093the same thing:
2094
2095	$array[17] = "Cherry pie";
2096	$array[17] = "Cherry pie\n";
2097
2098The result is that the contents of line 17 of the file will be
2099replaced with "Cherry pie"; a newline character will separate line 17
2100from line 18.  This means that this code will do nothing:
2101
2102	chomp $array[17];
2103
2104Because the C<chomp>ed value will have the separator reattached when
2105it is written back to the file.  There is no way to create a file
2106whose trailing record separator string is missing.
2107
2108Inserting records that I<contain> the record separator string is not
2109supported by this module.  It will probably produce a reasonable
2110result, but what this result will be may change in a future version.
2111Use 'splice' to insert records or to replace one record with several.
2112
2113=head2 C<autochomp>
2114
2115Normally, array elements have the record separator removed, so that if
2116the file contains the text
2117
2118	Gold
2119	Frankincense
2120	Myrrh
2121
2122the tied array will appear to contain C<("Gold", "Frankincense",
2123"Myrrh")>.  If you set C<autochomp> to a false value, the record
2124separator will not be removed.  If the file above was tied with
2125
2126	tie @gifts, "Tie::File", $gifts, autochomp => 0;
2127
2128then the array C<@gifts> would appear to contain C<("Gold\n",
2129"Frankincense\n", "Myrrh\n")>, or (on Win32 systems) C<("Gold\r\n",
2130"Frankincense\r\n", "Myrrh\r\n")>.
2131
2132=head2 C<mode>
2133
2134Normally, the specified file will be opened for read and write access,
2135and will be created if it does not exist.  (That is, the flags
2136C<O_RDWR | O_CREAT> are supplied in the C<open> call.)  If you want to
2137change this, you may supply alternative flags in the C<mode> option.
2138See L<Fcntl> for a listing of available flags.
2139For example:
2140
2141	# open the file if it exists, but fail if it does not exist
2142	use Fcntl 'O_RDWR';
2143	tie @array, 'Tie::File', $file, mode => O_RDWR;
2144
2145	# create the file if it does not exist
2146	use Fcntl 'O_RDWR', 'O_CREAT';
2147	tie @array, 'Tie::File', $file, mode => O_RDWR | O_CREAT;
2148
2149	# open an existing file in read-only mode
2150	use Fcntl 'O_RDONLY';
2151	tie @array, 'Tie::File', $file, mode => O_RDONLY;
2152
2153Opening the data file in write-only or append mode is not supported.
2154
2155=head2 C<memory>
2156
2157This is an upper limit on the amount of memory that C<Tie::File> will
2158consume at any time while managing the file.  This is used for two
2159things: managing the I<read cache> and managing the I<deferred write
2160buffer>.
2161
2162Records read in from the file are cached, to avoid having to re-read
2163them repeatedly.  If you read the same record twice, the first time it
2164will be stored in memory, and the second time it will be fetched from
2165the I<read cache>.  The amount of data in the read cache will not
2166exceed the value you specified for C<memory>.  If C<Tie::File> wants
2167to cache a new record, but the read cache is full, it will make room
2168by expiring the least-recently visited records from the read cache.
2169
2170The default memory limit is 2Mib.  You can adjust the maximum read
2171cache size by supplying the C<memory> option.  The argument is the
2172desired cache size, in bytes.
2173
2174 # I have a lot of memory, so use a large cache to speed up access
2175 tie @array, 'Tie::File', $file, memory => 20_000_000;
2176
2177Setting the memory limit to 0 will inhibit caching; records will be
2178fetched from disk every time you examine them.
2179
2180The C<memory> value is not an absolute or exact limit on the memory
2181used.  C<Tie::File> objects contains some structures besides the read
2182cache and the deferred write buffer, whose sizes are not charged
2183against C<memory>.
2184
2185The cache itself consumes about 310 bytes per cached record, so if
2186your file has many short records, you may want to decrease the cache
2187memory limit, or else the cache overhead may exceed the size of the
2188cached data.
2189
2190
2191=head2 C<dw_size>
2192
2193(This is an advanced feature.  Skip this section on first reading.)
2194
2195If you use deferred writing (See L<"Deferred Writing">, below) then
2196data you write into the array will not be written directly to the
2197file; instead, it will be saved in the I<deferred write buffer> to be
2198written out later.  Data in the deferred write buffer is also charged
2199against the memory limit you set with the C<memory> option.
2200
2201You may set the C<dw_size> option to limit the amount of data that can
2202be saved in the deferred write buffer.  This limit may not exceed the
2203total memory limit.  For example, if you set C<dw_size> to 1000 and
2204C<memory> to 2500, that means that no more than 1000 bytes of deferred
2205writes will be saved up.  The space available for the read cache will
2206vary, but it will always be at least 1500 bytes (if the deferred write
2207buffer is full) and it could grow as large as 2500 bytes (if the
2208deferred write buffer is empty.)
2209
2210If you don't specify a C<dw_size>, it defaults to the entire memory
2211limit.
2212
2213=head2 Option Format
2214
2215C<-mode> is a synonym for C<mode>.  C<-recsep> is a synonym for
2216C<recsep>.  C<-memory> is a synonym for C<memory>.  You get the
2217idea.
2218
2219=head1 Public Methods
2220
2221The C<tie> call returns an object, say C<$o>.  You may call
2222
2223	$rec = $o->FETCH($n);
2224	$o->STORE($n, $rec);
2225
2226to fetch or store the record at line C<$n>, respectively; similarly
2227the other tied array methods.  (See L<perltie> for details.)  You may
2228also call the following methods on this object:
2229
2230=head2 C<flock>
2231
2232	$o->flock(MODE)
2233
2234will lock the tied file.  C<MODE> has the same meaning as the second
2235argument to the Perl built-in C<flock> function; for example
2236C<LOCK_SH> or C<LOCK_EX | LOCK_NB>.  (These constants are provided by
2237the C<use Fcntl ':flock'> declaration.)
2238
2239C<MODE> is optional; the default is C<LOCK_EX>.
2240
2241C<Tie::File> maintains an internal table of the byte offset of each
2242record it has seen in the file.
2243
2244When you use C<flock> to lock the file, C<Tie::File> assumes that the
2245read cache is no longer trustworthy, because another process might
2246have modified the file since the last time it was read.  Therefore, a
2247successful call to C<flock> discards the contents of the read cache
2248and the internal record offset table.
2249
2250C<Tie::File> promises that the following sequence of operations will
2251be safe:
2252
2253	my $o = tie @array, "Tie::File", $filename;
2254	$o->flock;
2255
2256In particular, C<Tie::File> will I<not> read or write the file during
2257the C<tie> call.  (Exception: Using C<mode =E<gt> O_TRUNC> will, of
2258course, erase the file during the C<tie> call.  If you want to do this
2259safely, then open the file without C<O_TRUNC>, lock the file, and use
2260C<@array = ()>.)
2261
2262The best way to unlock a file is to discard the object and untie the
2263array.  It is probably unsafe to unlock the file without also untying
2264it, because if you do, changes may remain unwritten inside the object.
2265That is why there is no shortcut for unlocking.  If you really want to
2266unlock the file prematurely, you know what to do; if you don't know
2267what to do, then don't do it.
2268
2269All the usual warnings about file locking apply here.  In particular,
2270note that file locking in Perl is B<advisory>, which means that
2271holding a lock will not prevent anyone else from reading, writing, or
2272erasing the file; it only prevents them from getting another lock at
2273the same time.  Locks are analogous to green traffic lights: If you
2274have a green light, that does not prevent the idiot coming the other
2275way from plowing into you sideways; it merely guarantees to you that
2276the idiot does not also have a green light at the same time.
2277
2278=head2 C<autochomp>
2279
2280	my $old_value = $o->autochomp(0);    # disable autochomp option
2281	my $old_value = $o->autochomp(1);    #  enable autochomp option
2282
2283	my $ac = $o->autochomp();   # recover current value
2284
2285See L<"autochomp">, above.
2286
2287=head2 C<defer>, C<flush>, C<discard>, and C<autodefer>
2288
2289See L<"Deferred Writing">, below.
2290
2291=head2 C<offset>
2292
2293	$off = $o->offset($n);
2294
2295This method returns the byte offset of the start of the C<$n>th record
2296in the file.  If there is no such record, it returns an undefined
2297value.
2298
2299=head1 Tying to an already-opened filehandle
2300
2301If C<$fh> is a filehandle, such as is returned by C<IO::File> or one
2302of the other C<IO> modules, you may use:
2303
2304	tie @array, 'Tie::File', $fh, ...;
2305
2306Similarly if you opened that handle C<FH> with regular C<open> or
2307C<sysopen>, you may use:
2308
2309	tie @array, 'Tie::File', \*FH, ...;
2310
2311Handles that were opened write-only won't work.  Handles that were
2312opened read-only will work as long as you don't try to modify the
2313array.  Handles must be attached to seekable sources of data---that
2314means no pipes or sockets.  If C<Tie::File> can detect that you
2315supplied a non-seekable handle, the C<tie> call will throw an
2316exception.  (On Unix systems, it can detect this.)
2317
2318Note that Tie::File will only close any filehandles that it opened
2319internally.  If you passed it a filehandle as above, you "own" the
2320filehandle, and are responsible for closing it after you have untied
2321the @array.
2322
2323Tie::File calls C<binmode> on filehandles that it opens internally,
2324but not on filehandles passed in by the user. For consistency,
2325especially if using the tied files cross-platform, you may wish to
2326call C<binmode> on the filehandle prior to tying the file.
2327
2328=head1 Deferred Writing
2329
2330(This is an advanced feature.  Skip this section on first reading.)
2331
2332Normally, modifying a C<Tie::File> array writes to the underlying file
2333immediately.  Every assignment like C<$a[3] = ...> rewrites as much of
2334the file as is necessary; typically, everything from line 3 through
2335the end will need to be rewritten.  This is the simplest and most
2336transparent behavior.  Performance even for large files is reasonably
2337good.
2338
2339However, under some circumstances, this behavior may be excessively
2340slow.  For example, suppose you have a million-record file, and you
2341want to do:
2342
2343	for (@FILE) {
2344	  $_ = "> $_";
2345	}
2346
2347The first time through the loop, you will rewrite the entire file,
2348from line 0 through the end.  The second time through the loop, you
2349will rewrite the entire file from line 1 through the end.  The third
2350time through the loop, you will rewrite the entire file from line 2 to
2351the end.  And so on.
2352
2353If the performance in such cases is unacceptable, you may defer the
2354actual writing, and then have it done all at once.  The following loop
2355will perform much better for large files:
2356
2357	(tied @a)->defer;
2358	for (@a) {
2359	  $_ = "> $_";
2360	}
2361	(tied @a)->flush;
2362
2363If C<Tie::File>'s memory limit is large enough, all the writing will
2364done in memory.  Then, when you call C<-E<gt>flush>, the entire file
2365will be rewritten in a single pass.
2366
2367(Actually, the preceding discussion is something of a fib.  You don't
2368need to enable deferred writing to get good performance for this
2369common case, because C<Tie::File> will do it for you automatically
2370unless you specifically tell it not to.  See L</Autodeferring>,
2371below.)
2372
2373Calling C<-E<gt>flush> returns the array to immediate-write mode.  If
2374you wish to discard the deferred writes, you may call C<-E<gt>discard>
2375instead of C<-E<gt>flush>.  Note that in some cases, some of the data
2376will have been written already, and it will be too late for
2377C<-E<gt>discard> to discard all the changes.  Support for
2378C<-E<gt>discard> may be withdrawn in a future version of C<Tie::File>.
2379
2380Deferred writes are cached in memory up to the limit specified by the
2381C<dw_size> option (see above).  If the deferred-write buffer is full
2382and you try to write still more deferred data, the buffer will be
2383flushed.  All buffered data will be written immediately, the buffer
2384will be emptied, and the now-empty space will be used for future
2385deferred writes.
2386
2387If the deferred-write buffer isn't yet full, but the total size of the
2388buffer and the read cache would exceed the C<memory> limit, the oldest
2389records will be expired from the read cache until the total size is
2390under the limit.
2391
2392C<push>, C<pop>, C<shift>, C<unshift>, and C<splice> cannot be
2393deferred.  When you perform one of these operations, any deferred data
2394is written to the file and the operation is performed immediately.
2395This may change in a future version.
2396
2397If you resize the array with deferred writing enabled, the file will
2398be resized immediately, but deferred records will not be written.
2399This has a surprising consequence: C<@a = (...)> erases the file
2400immediately, but the writing of the actual data is deferred.  This
2401might be a bug.  If it is a bug, it will be fixed in a future version.
2402
2403=head2 Autodeferring
2404
2405C<Tie::File> tries to guess when deferred writing might be helpful,
2406and to turn it on and off automatically.
2407
2408	for (@a) {
2409	  $_ = "> $_";
2410	}
2411
2412In this example, only the first two assignments will be done
2413immediately; after this, all the changes to the file will be deferred
2414up to the user-specified memory limit.
2415
2416You should usually be able to ignore this and just use the module
2417without thinking about deferring.  However, special applications may
2418require fine control over which writes are deferred, or may require
2419that all writes be immediate.  To disable the autodeferment feature,
2420use
2421
2422	(tied @o)->autodefer(0);
2423
2424or
2425
2426       	tie @array, 'Tie::File', $file, autodefer => 0;
2427
2428
2429Similarly, C<-E<gt>autodefer(1)> re-enables autodeferment, and
2430C<-E<gt>autodefer()> recovers the current value of the autodefer setting.
2431
2432
2433=head1 CONCURRENT ACCESS TO FILES
2434
2435Caching and deferred writing are inappropriate if you want the same
2436file to be accessed simultaneously from more than one process.  Other
2437optimizations performed internally by this module are also
2438incompatible with concurrent access.  A future version of this module will
2439support a C<concurrent =E<gt> 1> option that enables safe concurrent access.
2440
2441Previous versions of this documentation suggested using C<memory
2442=E<gt> 0> for safe concurrent access.  This was mistaken.  Tie::File
2443will not support safe concurrent access before version 0.96.
2444
2445=head1 CAVEATS
2446
2447(That's Latin for 'warnings'.)
2448
2449=over 4
2450
2451=item *
2452
2453Reasonable effort was made to make this module efficient.  Nevertheless,
2454changing the size of a record in the middle of a large file will
2455always be fairly slow, because everything after the new record must be
2456moved.
2457
2458=item *
2459
2460The behavior of tied arrays is not precisely the same as for regular
2461arrays.  For example:
2462
2463	# This DOES print "How unusual!"
2464	undef $a[10];  print "How unusual!\n" if defined $a[10];
2465
2466C<undef>-ing a C<Tie::File> array element just blanks out the
2467corresponding record in the file.  When you read it back again, you'll
2468get the empty string, so the supposedly-C<undef>'ed value will be
2469defined.  Similarly, if you have C<autochomp> disabled, then
2470
2471	# This DOES print "How unusual!" if 'autochomp' is disabled
2472	undef $a[10];
2473        print "How unusual!\n" if $a[10];
2474
2475Because when C<autochomp> is disabled, C<$a[10]> will read back as
2476C<"\n"> (or whatever the record separator string is.)
2477
2478There are other minor differences, particularly regarding C<exists>
2479and C<delete>, but in general, the correspondence is extremely close.
2480
2481=item *
2482
2483I have supposed that since this module is concerned with file I/O,
2484almost all normal use of it will be heavily I/O bound.  This means
2485that the time to maintain complicated data structures inside the
2486module will be dominated by the time to actually perform the I/O.
2487When there was an opportunity to spend CPU time to avoid doing I/O, I
2488usually tried to take it.
2489
2490=item *
2491
2492You might be tempted to think that deferred writing is like
2493transactions, with C<flush> as C<commit> and C<discard> as
2494C<rollback>, but it isn't, so don't.
2495
2496=item *
2497
2498There is a large memory overhead for each record offset and for each
2499cache entry: about 310 bytes per cached data record, and about 21 bytes
2500per offset table entry.
2501
2502The per-record overhead will limit the maximum number of records you
2503can access per file. Note that I<accessing> the length of the array
2504via C<$x = scalar @tied_file> accesses B<all> records and stores their
2505offsets.  The same for C<foreach (@tied_file)>, even if you exit the
2506loop early.
2507
2508=back
2509
2510=head1 SUBCLASSING
2511
2512This version promises absolutely nothing about the internals, which
2513may change without notice.  A future version of the module will have a
2514well-defined and stable subclassing API.
2515
2516=head1 WHAT ABOUT C<DB_File>?
2517
2518People sometimes point out that L<DB_File> will do something similar,
2519and ask why C<Tie::File> module is necessary.
2520
2521There are a number of reasons that you might prefer C<Tie::File>.
2522A list is available at C<L<http://perl.plover.com/TieFile/why-not-DB_File>>.
2523
2524=head1 AUTHOR
2525
2526Mark Jason Dominus
2527
2528To contact the author, send email to: C<mjd-perl-tiefile+@plover.com>
2529
2530To receive an announcement whenever a new version of this module is
2531released, send a blank email message to
2532C<mjd-perl-tiefile-subscribe@plover.com>.
2533
2534The most recent version of this module, including documentation and
2535any news of importance, will be available at
2536
2537	http://perl.plover.com/TieFile/
2538
2539
2540=head1 LICENSE
2541
2542C<Tie::File> version 0.96 is copyright (C) 2003 Mark Jason Dominus.
2543
2544This library is free software; you may redistribute it and/or modify
2545it under the same terms as Perl itself.
2546
2547These terms are your choice of any of (1) the Perl Artistic Licence,
2548or (2) version 2 of the GNU General Public License as published by the
2549Free Software Foundation, or (3) any later version of the GNU General
2550Public License.
2551
2552This library is distributed in the hope that it will be useful,
2553but WITHOUT ANY WARRANTY; without even the implied warranty of
2554MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
2555GNU General Public License for more details.
2556
2557You should have received a copy of the GNU General Public License
2558along with this library program; it should be in the file C<COPYING>.
2559If not, write to the Free Software Foundation, Inc., 51 Franklin Street,
2560Fifth Floor, Boston, MA  02110-1301, USA
2561
2562For licensing inquiries, contact the author at:
2563
2564	Mark Jason Dominus
2565	255 S. Warnock St.
2566	Philadelphia, PA 19107
2567
2568=head1 WARRANTY
2569
2570C<Tie::File> version 0.98 comes with ABSOLUTELY NO WARRANTY.
2571For details, see the license.
2572
2573=head1 THANKS
2574
2575Gigantic thanks to Jarkko Hietaniemi, for agreeing to put this in the
2576core when I hadn't written it yet, and for generally being helpful,
2577supportive, and competent.  (Usually the rule is "choose any one.")
2578Also big thanks to Abhijit Menon-Sen for all of the same things.
2579
2580Special thanks to Craig Berry and Peter Prymmer (for VMS portability
2581help), Randy Kobes (for Win32 portability help), Clinton Pierce and
2582Autrijus Tang (for heroic eleventh-hour Win32 testing above and beyond
2583the call of duty), Michael G Schwern (for testing advice), and the
2584rest of the CPAN testers (for testing generally).
2585
2586Special thanks to Tels for suggesting several speed and memory
2587optimizations.
2588
2589Additional thanks to:
2590Edward Avis /
2591Mattia Barbon /
2592Tom Christiansen /
2593Gerrit Haase /
2594Gurusamy Sarathy /
2595Jarkko Hietaniemi (again) /
2596Nikola Knezevic /
2597John Kominetz /
2598Nick Ing-Simmons /
2599Tassilo von Parseval /
2600H. Dieter Pearcey /
2601Slaven Rezic /
2602Eric Roode /
2603Peter Scott /
2604Peter Somu /
2605Autrijus Tang (again) /
2606Tels (again) /
2607Juerd Waalboer /
2608Todd Rinaldo
2609
2610=head1 TODO
2611
2612More tests.  (Stuff I didn't think of yet.)
2613
2614Paragraph mode?
2615
2616Fixed-length mode.  Leave-blanks mode.
2617
2618Maybe an autolocking mode?
2619
2620For many common uses of the module, the read cache is a liability.
2621For example, a program that inserts a single record, or that scans the
2622file once, will have a cache hit rate of zero.  This suggests a major
2623optimization: The cache should be initially disabled.  Here's a hybrid
2624approach: Initially, the cache is disabled, but the cache code
2625maintains statistics about how high the hit rate would be *if* it were
2626enabled.  When it sees the hit rate get high enough, it enables
2627itself.  The STAT comments in this code are the beginning of an
2628implementation of this.
2629
2630Record locking with fcntl()?  Then the module might support an undo
2631log and get real transactions.  What a tour de force that would be.
2632
2633Keeping track of the highest cached record. This would allow reads-in-a-row
2634to skip the cache lookup faster (if reading from 1..N with empty cache at
2635start, the last cached value will be always N-1).
2636
2637More tests.
2638
2639=cut
2640
2641