1# Paranoid::IO::Line -- Paranoid Line-based I/O functions
2#
3# $Id: lib/Paranoid/IO/Line.pm, 2.08 2020/12/31 12:10:06 acorliss Exp $
4#
5# This software is free software.  Similar to Perl, you can redistribute it
6# and/or modify it under the terms of either:
7#
8#   a)     the GNU General Public License
9#          <https://www.gnu.org/licenses/gpl-1.0.html> as published by the
10#          Free Software Foundation <http://www.fsf.org/>; either version 1
11#          <https://www.gnu.org/licenses/gpl-1.0.html>, or any later version
12#          <https://www.gnu.org/licenses/license-list.html#GNUGPL>, or
13#   b)     the Artistic License 2.0
14#          <https://opensource.org/licenses/Artistic-2.0>,
15#
16# subject to the following additional term:  No trademark rights to
17# "Paranoid" have been or are conveyed under any of the above licenses.
18# However, "Paranoid" may be used fairly to describe this unmodified
19# software, in good faith, but not as a trademark.
20#
21# (c) 2005 - 2020, Arthur Corliss (corliss@digitalmages.com)
22# (tm) 2008 - 2020, Paranoid Inc. (www.paranoid.com)
23#
24#####################################################################
25
26#####################################################################
27#
28# Environment definitions
29#
30#####################################################################
31
32package Paranoid::IO::Line;
33
34use 5.008;
35
36use strict;
37use warnings;
38use vars qw($VERSION @EXPORT @EXPORT_OK %EXPORT_TAGS);
39use base qw(Exporter);
40use Fcntl qw(:DEFAULT :seek :flock :mode);
41use Paranoid qw(:all);
42use Paranoid::Debug qw(:all);
43use Paranoid::IO qw(:all);
44use Paranoid::Input qw(:all);
45
46($VERSION) = ( q$Revision: 2.08 $ =~ /(\d+(?:\.\d+)+)/sm );
47
48@EXPORT      = qw(sip nlsip tailf nltailf slurp nlslurp piolClose);
49@EXPORT_OK   = ( @EXPORT, qw(PIOMAXLNSIZE) );
50%EXPORT_TAGS = ( all => [@EXPORT_OK], );
51
52use constant STAT_INO => 1;
53use constant STAT_SIZ => 7;
54use constant PDEFLNSZ => 2048;
55
56use constant PBFLAG => 0;
57use constant PBBUFF => 1;
58
59use constant PBF_DRAIN  => 0;
60use constant PBF_NORMAL => 1;
61use constant PBF_DELETE => -1;
62
63#####################################################################
64#
65# Module code follows
66#
67#####################################################################
68
69{
70    my $mlnsz = PDEFLNSZ;
71
72    sub PIOMAXLNSIZE : lvalue {
73
74        # Purpose:  Gets/sets default line size of I/O
75        # Returns:  $mlnsz
76        # Usage:    $limit = PIOMAXLNSIZE;
77        # Usage:    FSZLIMIT = 100;
78
79        $mlnsz;
80    }
81
82    # Manage buffers: $buffers{$name} => [$flag, $content ];
83    my %buffers;
84
85    sub _chkBuffer { return exists $buffers{ $_[0] } }
86
87    sub _chkStat {
88
89        # Purpose:  Checks stat data to see if the underlying
90        #           file has changed
91        # Returns:  Boolean
92        # Usage:    $rv = _chkStat($file);
93
94        my $file = shift;
95        my $rv   = 0;
96        my ( $fh, $fpos, @fstat, @fhstat );
97
98        pdebug( 'entering w/(%s)', PDLEVEL3, $file );
99        pIn();
100
101        # Check to see if we can get a valid file handle
102        if ( defined( $fh = popen( $file, O_RDONLY ) ) ) {
103            @fhstat = stat $fh;
104            $fpos   = ptell($fh);
105
106            if ( @fhstat and $fpos < $fhstat[STAT_SIZ] ) {
107
108                # Still have content to read, continue on
109                pdebug( 'still have content to drain', PDLEVEL3 );
110                $rv = 1;
111
112            } else {
113
114                # Check the file system to see if we're still
115                # operating on the same file
116                @fstat = stat $file;
117
118                if ( scalar @fstat ) {
119
120                    # Check inode
121                    if ( $fhstat[STAT_INO] != $fstat[STAT_INO] ) {
122                        pdebug( 'file was replaced with a new file',
123                            PDLEVEL3 );
124                    } else {
125                        if ( $fstat[STAT_SIZ] < $fpos ) {
126                            pdebug( 'file was truncated', PDLEVEL3 );
127                        } else {
128                            pdebug( 'file is unchanged', PDLEVEL3 );
129                            $rv = 1;
130                        }
131                    }
132
133                } else {
134                    pdebug( 'file was deleted', PDLEVEL3 );
135                }
136            }
137        } else {
138            pdebug( 'invalid/non-existent file', PDLEVEL3 );
139        }
140
141        pOut();
142        pdebug( 'leaving w/rv: %s', PDLEVEL3, $rv );
143
144        return $rv;
145    }
146
147    sub piolClose {
148
149        # Purpose:  Closes file handles and deletes the associated
150        #           buffer
151        # Returns:  Boolean
152        # Usage:    $rv = piolClose($file);
153
154        my $file = shift;
155
156        delete $buffers{$file};
157
158        return pclose($file);
159    }
160
161    sub sip ($\@;$$) {
162
163        # Purpose:  Reads a chunk from the passwed handle or file name
164        # Returns:  Number of lines read or undef critical failures
165        # Usage:    $nlines = sip($fh, @lines);
166        # Usage:    $nlines = sip($filename, @lines);
167        # Usage:    $nlines = sip($filename, @lines, 1);
168
169        my $file    = shift;
170        my $aref    = shift;
171        my $doChomp = shift;
172        my $noLocks = shift;
173        my $rv      = 1;
174        my ( $buffer, $bflag, $in, $content, $bread, $irv, @tmp, $line );
175
176        pdebug( 'entering w/(%s)(%s)(%s)', PDLEVEL1, $file, $aref, $doChomp );
177        pIn();
178
179        @$aref = ();
180
181        # Check the file
182        piolClose($file) unless _chkStat($file);
183
184        # Get/initialize buffer
185        if ( exists $buffers{$file} ) {
186            $bflag  = $buffers{$file}[PBFLAG];
187            $buffer = $buffers{$file}[PBBUFF];
188        } else {
189            $buffers{$file} = [ PBF_NORMAL, '' ];
190            $buffer         = '';
191            $bflag          = PBF_NORMAL;
192        }
193
194        # Read what we can
195        $content = '';
196        $bread   = 0;
197        while ( $bread < PIOMAXFSIZE ) {
198            $irv = $noLocks ? pnlread( $file, $in ) : pread( $file, $in );
199            if ( defined $irv ) {
200                $bread += $irv;
201                $content .= $in;
202                last if $irv < PIOBLKSIZE;
203            } else {
204                $rv = undef;
205                last;
206            }
207        }
208
209        # Post processing
210        if ($rv) {
211
212            if ( length $content ) {
213
214                # Add the buffer
215                $content = "$buffer$content";
216
217                # Process buffer drain conditions
218                pdebug( 'starting buffer flag: (%s)', PDLEVEL4, $bflag );
219                pdebug( 'starting buffer: (%s)',      PDLEVEL4, $buffer );
220                if ( !$bflag and $content =~ /@{[NEWLINE_REGEX]}/so ) {
221                    pdebug( 'draining to next newline', PDLEVEL4 );
222                    $content =~ s/^.*?@{[NEWLINE_REGEX]}//so;
223                    $bflag  = PBF_NORMAL;
224                    $buffer = '';
225                }
226
227                # Check for newlines
228                if ( $content =~ /@{[NEWLINE_REGEX]}/so ) {
229
230                    # Split lines along newline boundaries
231                    @tmp = split m/(@{[NEWLINE_REGEX]})/so, $content;
232                    while ( scalar @tmp > 1 ) {
233                        if ( length $tmp[0] > PIOMAXLNSIZE ) {
234                            splice @tmp, 0, 2;
235                            $line = undef;
236                        } else {
237                            $line = join '', splice @tmp, 0, 2;
238                        }
239                        push @$aref, $line;
240                    }
241
242                    # Check for undefined lines
243                    $rv = scalar @$aref;
244                    @$aref = grep {defined} @$aref;
245                    if ( $rv != scalar @$aref ) {
246                        Paranoid::ERROR =
247                            pdebug( 'found %s lines over PIOMAXLNSIZE',
248                            PDLEVEL1, $rv - @$aref );
249                        $rv = undef;
250                    }
251
252                    # Check for an unterminated line at the end and
253                    # buffer appropriately
254                    if ( scalar @tmp ) {
255
256                        # Content left over, update the buffer
257                        if ( length $tmp[0] > PIOMAXLNSIZE ) {
258                            $buffer = '';
259                            $bflag  = PBF_DRAIN;
260                            $rv     = undef;
261                            Paranoid::ERROR =
262                                pdebug( 'buffer is over PIOMAXLNSIZE',
263                                PDLEVEL1 );
264                        } else {
265                            $buffer = $tmp[0];
266                            $bflag  = PBF_NORMAL;
267                        }
268                    } else {
269
270                        # Nothing left over, make sure the buffer is empty
271                        $buffer = '';
272                        $bflag  = PBF_NORMAL;
273                    }
274
275                } else {
276
277                    # Check buffered block for PIOILNSIZE limit
278                    if ( length $content > PIOMAXLNSIZE ) {
279                        $buffer = '';
280                        $bflag  = PBF_DRAIN;
281                        $rv     = undef;
282                        Paranoid::ERROR =
283                            pdebug( 'block is over PIOMAXLNSIZE', PDLEVEL1 );
284                    } else {
285                        $rv     = 0;
286                        $buffer = $content;
287                        $bflag  = PBF_NORMAL;
288                    }
289                }
290                pdebug( 'ending buffer flag: (%s)', PDLEVEL4, $bflag );
291                pdebug( 'ending buffer: (%s)',      PDLEVEL4, $buffer );
292
293            } else {
294                $rv = 0;
295            }
296        }
297
298        # Set PTRUE_ZERO if needed
299        $rv = PTRUE_ZERO if defined $rv and $rv == 0;
300
301        # Save the buffer
302        $buffers{$file}[PBFLAG] = $bflag;
303        $buffers{$file}[PBBUFF] = $buffer;
304
305        # Chomp if necessary
306        pchomp(@$aref) if $doChomp and scalar @$aref;
307
308        pdebug( 'returning %s lines', PDLEVEL2, scalar @$aref );
309
310        pOut();
311        pdebug( 'leaving w/rv: %s', PDLEVEL1, $rv );
312
313        return $rv;
314    }
315
316}
317
318sub nlsip {
319
320    # Purpose:  Wrapper for sip that enables non-locking reads
321    # Returns:  Return value from sip
322    # Usage:    $nlines = nlsip($file, @lines);
323
324    my $file    = shift;
325    my $aref    = shift;
326    my $doChomp = shift;
327
328    return sip( $file, @$aref, $doChomp, 1 );
329}
330
331sub tailf ($\@;$$$) {
332
333    # Purpose:  Augments sip's tailing abilities by seeking to
334    #           the end (or, optionally, backwards)
335    # Returns:  Number of lines tailed
336    # Usage:    $nlines = tail($filename, @lines);
337    # Usage:    $nlines = tail($filename, @lines, $chomp);
338    # Usage:    $nlines = tail($filename, @lines, $lnOffset);
339
340    my $file    = shift;
341    my $aref    = shift;
342    my $doChomp = shift || 0;
343    my $offset  = shift || -10;
344    my $noLocks = shift;
345    my ( $rv, $ofsb, @lines );
346
347    pdebug( 'entering w/(%s)(%s)(%s)(%s)',
348        PDLEVEL1, $file, $aref, $doChomp, $offset );
349    pIn();
350
351    @$aref = ();
352
353    # Check to see if we've already opened this file
354    if ( _chkBuffer($file) ) {
355
356        # Offset is only used on the initial open
357        $offset = 0;
358
359    } else {
360
361        # TODO: At some point we might want to honor positive offsets to mimic
362        # the behavior of UNIX tail
363
364        # Calculate how far back we need to go from the end
365        $ofsb = $offset * ( PIOMAXLNSIZE +1 );
366        Paranoid::ERROR =
367            pdebug( 'WARNING:  called with a positive line offset', PDLEVEL1 )
368            unless $ofsb < 0;
369
370        # Open the file and move the cursor
371        pseek( $file, $ofsb, SEEK_END ) if popen( $file, O_RDONLY );
372
373    }
374
375    # If $offset is set we have trailing lines to handle
376    if ($offset) {
377
378        # Consume everything to the end of the file
379        do {
380            $noLocks
381                ? nlsip( $file, @lines, $doChomp )
382                : sip( $file, @lines, $doChomp );
383            push @$aref, @lines;
384        } while scalar @lines;
385
386        # Trim list to the request size
387        if ( scalar @$aref > abs $offset ) {
388            splice @$aref, 0, @$aref - abs $offset;
389        }
390        $rv = scalar @$aref;
391        $rv = PTRUE_ZERO unless $rv;
392
393    } else {
394
395        # Do a single sip
396        $rv =
397            $noLocks
398            ? nlsip( $file, @$aref, $doChomp )
399            : sip( $file, @$aref, $doChomp );
400    }
401
402    pOut();
403    pdebug( 'leaving w/rv: %s', PDLEVEL1, $rv );
404
405    return $rv;
406}
407
408sub nltailf ($\@;$$$) {
409
410    # Purpose:  Wrapper for sip that enables non-locking reads
411    # Returns:  Return value from sip
412    # Usage:    $nlines = nlsip($file, @lines);
413
414    my $file    = shift;
415    my $aref    = shift;
416    my $doChomp = shift;
417    my $offset  = shift;
418
419    return tailf( $file, @$aref, $doChomp, $offset, 1 );
420}
421
422sub slurp ($\@;$$) {
423
424    # Purpose:  Reads a file into memory
425    # Returns:  Number of lines read/undef
426    # Usage:    $nlines = slurp($filename, @lines;
427    # Usage:    $nlines = slurp($filename, @lines, 1);
428
429    my $file    = shift;
430    my $aref    = shift;
431    my $doChomp = shift || 0;
432    my $noLocks = shift;
433    my $rv      = 1;
434    my @fstat;
435
436    pdebug( 'entering w/(%s)(%s)(%s)', PDLEVEL1, $file, $aref, $doChomp );
437    pIn();
438
439    # Start sipping
440    $rv = sip( $file, @$aref, $doChomp, $noLocks );
441    if ( ref $file eq 'GLOB' ) {
442        @fstat = stat $file if fileno $file;
443    } else {
444        @fstat = stat $file;
445    }
446    if ( scalar @fstat and $fstat[STAT_SIZ] > PIOMAXFSIZE ) {
447        Paranoid::ERROR = pdebug( 'file size exceeds PIOMAXFSIZE', PDLEVEL1 );
448        $rv = undef;
449    }
450
451    # Count lins if sip never complained
452    $rv = scalar @$aref if defined $rv;
453
454    # Close everything out
455    piolClose($file);
456
457    pOut();
458    pdebug( 'leaving w/rv: %s', PDLEVEL1, $rv );
459
460    return $rv;
461}
462
463sub nlslurp ($\@;$$) {
464
465    # Purpose:  Performs a non-locking slurp
466    # Returns:  Number of lines/undef
467    # Usage:    $nlines = nlslurp($filename, @lines);
468    # Usage:    $nlines = nlslurp($filename, @lines, 1);
469
470    my $file    = shift;
471    my $aref    = shift;
472    my $doChomp = shift || 0;
473
474    return slurp( $file, @$aref, $doChomp, 1 );
475}
476
4771;
478
479__END__
480
481=head1 NAME
482
483Paranoid::IO::Line - Paranoid Line-based I/O functions
484
485=head1 VERSION
486
487$Id: lib/Paranoid/IO/Line.pm, 2.08 2020/12/31 12:10:06 acorliss Exp $
488
489=head1 SYNOPSIS
490
491  use Paranoid::IO::Line;
492
493  PIOMAXLNSIZE = 4096;
494
495  $nlines = sip($filename, @lines);
496  $nlines = sip($filename, @lines, 1);
497  $nlines = tailf($filename, @lines);
498  $nlines = tailf($filename, @lines, 1);
499  $nlines = tailf($filename, @lines, 1, -100);
500
501  piolClose($filename);
502
503  $nlines = slurp($filename, @lines);
504
505=head1 DESCRIPTION
506
507This module extends and leverages L<Paranoid::IO>'s capabilities with an eye
508towards line-based text files, such as log files.  It does so while
509maintaining a paranoid stance towards I/O.  For that reason the functions here
510only work on limited chunks of data at a time, both in terms of maximum memory
511kept in memory at a time and the maximum record length.  L<Paranoid::IO>
512provides I<PIOMAXFSIZE> which controls the former, but this module provides
513I<PIOMAXLNSIZE> which controls the latter.
514
515Even with the paranoid slant of these functions they should really be treated
516as convenience functions which can simplify higher level code without
517incurring any significant risk to the developer or system.  They inherit not
518only opportunistic I/O but platform-agnostic record separators via internal
519use of I<pchomp> from L<Paranoid::Input>.
520
521B<NOTE:> while this does build off the foundation provided by L<Paranoid::IO>
522it is important to note that you should not work on the same files using
523:<Paranoid::IO>'s functions while also using the functions in this module.
524While the former works from raw I/O the latter has to manage buffers in order
525to identify record boundaries.  If you were to, say, I<sip> from a file, then
526I<pread> or I<pseek> elsewhere it would render those buffers not only useless,
527but corrupt.  This is important to note since the functions here do leverage
528the file handle caching features provided by I<popen>.
529
530It should also be noted that since we're anticipating line-based records we
531expect every line, even the last line in a file, to be properly terminated
532with a record separator (new line sequence).
533
534As with all L<Paranoid> modules string descriptions of errors can be retrieved
535from L<Paranoid::ERROR> as they occur.
536
537=head1 IMPORT LISTS
538
539This module exports the following symbols by default:
540
541    sip nlsip tailf nltailf slurp nlslurp piolClose
542
543The following specialized import lists also exist:
544
545    List        Members
546    --------------------------------------------------------
547    all         @defaults PIOMAXLNSIZE
548
549=head1 SUBROUTINES/METHODS
550
551=head2 PIOMAXLNSIZE
552
553The valute returned/set by this lvalue function is the maximum line length
554supported by functions like B<sip> (documented below).  Unless explicitly set
555this defaults to 2KB.  Any lines found which exceed this are discarded.
556
557=head2 sip
558
559    $nlines = sip($filename, @lines);
560    $nlines = sip($filename, @lines, 1);
561
562This function allows you to read a text file into memory in chunks, the
563lines of which are placed into the passed array reference.  The chunks are
564read in at up to L<PIOMAXFSIZE> in size at a time.  File locking is used
565and autochomping is also supported.
566
567This returns the number of lines extracted or boolean false if any errors
568occurred, such as lines exceeding I<PIOMAXLNSIZE> or other I/O errors.  If
569there were no errors but also no content it will return B<0 but true>, which
570will satisfy boolean tests.
571
572The passed array is always purged prior to execution.  This can potentially
573help differentiate types of errors:
574
575    $nlines = sip($filename, @lines);
576
577    warn "successfully extracted lines"
578        if $nlines and scalar @lines;
579    warn "no errors, but no lines"
580        if $nlines and ! scalar @lines;
581    warn "line length exceeded on some lines"
582        if !$nlines and scalar @lines;
583    warn "I/O errors or all lines exceeded line length"
584        if !$nlines and ! scalar @lines;
585
586Typically, if all one cares about is extracting good lines and discarding bad
587ones all you need is:
588
589    warn "good to go" if scalar @lines or $nlines;
590
591    # or, more likely:
592    if (@lines) {
593        # process input...
594    }
595
596B<NOTE:> I<sip> does try to check the file stat with every call.  This allows
597us to automatically flush buffers and reopen files in the event that the file
598you're sipping from was truncated, deleted, or overwritten.
599
600The third argument is a boolean option which controls whether lines are
601automatically chomped or not.  It defaults to not.
602
603=head2 nlsip
604
605    $nlines = nlsip($filename, @lines);
606    $nlines = nlsip($filename, @lines, 1);
607
608A very thin wrapper for I<sip> that disables file locking.
609
610=head2 tailf
611
612    $nlines = tailf($filename, @lines);
613    $nlines = tailf($filename, @lines, 1);
614    $nlines = tailf($filename, @lines, 1, -100);
615
616The only difference between this function and B<sip> is that tailf opens the
617file and immediately seeks to the end.  If an optional fourth argument is
618passed it will seek backwards to extract and return that number of lines (if
619possible).  Depending on the number passed one must be prepared for enough
620memory to be allocated to store B<PIOMAXLNSIZE> * that number. If no number is
621specified it is assumed to be B<-10>.  Specifying this argument on a file
622already opened by I<sip> or I<tailf> will have no effect.
623
624Return values are identical to I<sip>.
625
626=head2 nltailf
627
628    $nlines = nltailf($filename, @lines);
629    $nlines = nltailf($filename, @lines, -100);
630    $nlines = nltailf($filename, @lines, -100, 1);
631
632A very thin wrapper for I<tailf> that disables file locking.
633
634=head2 slurp
635
636  $nlines = slurp($filename, @lines);
637  $nlines = slurp($filename, @lines, 1);
638
639This function is essentially another wrapper for I<sip>, but with some
640different behavior.  While I<sip> was written from the expectation that the
641developer would be either working on chunks from a very large file or a file
642that may grow while being accessed.  I<slurp>, on the other hand, expects to
643work exclusively on small files that can safely fit into memory.  It also sees
644no need to cache file handles since all operations will subsequently be done
645in memory.
646
647Files with slurp are explicitly closed after the read.  All the normal
648safeguards apply:  I<PIOMAXFSIZE> is the largest amount of data that will be
649read into memory, and all lines must be within I<PIOMAXLNSIZE>.
650
651The third argument is a boolean option which controls whether lines are
652automatically chomped or not.  It defaults to not.
653
654=head2 nlslurp
655
656  $nlines = nlslurp($filename, @lines);
657  $nlines = nlslurp($filename, @lines, 1);
658
659A very thin wrapper for I<slurp> that disables file locking.
660
661=head2 piolClose
662
663  $rv = piolClose($filename);
664
665This closes all file handles and deletes any existing buffers.  Works
666indiscriminatley and returns the exit value of I<pclose>.
667
668=head1 DEPENDENCIES
669
670=over
671
672=item o
673
674L<Fcntl>
675
676=item o
677
678L<Paranoid>
679
680=item o
681
682L<Paranoid::Debug>
683
684=item o
685
686L<Paranoid::Input>
687
688=item o
689
690L<Paranoid::IO>
691
692=back
693
694=head1 BUGS AND LIMITATIONS
695
696While all of these functions will just as happily accept file handles as well
697as file names doing will almost certainly cause any number of bugs.  Beyond
698the inherited L<Paranoid::IO> issues (like not getting the fork-safe features
699for any file handle opened directly by the developer) there are other issues.
700Buffers, for instance, can only be managed by one consistent name, there is no
701way to correlate them and make them interchangeable.  There are other
702subtleties as well, but there is no need to detail them all.
703
704Suffice it to say that when using this module one should only use file names,
705and use them consistently.
706
707=head1 AUTHOR
708
709Arthur Corliss (corliss@digitalmages.com)
710
711=head1 LICENSE AND COPYRIGHT
712
713This software is free software.  Similar to Perl, you can redistribute it
714and/or modify it under the terms of either:
715
716  a)     the GNU General Public License
717         <https://www.gnu.org/licenses/gpl-1.0.html> as published by the
718         Free Software Foundation <http://www.fsf.org/>; either version 1
719         <https://www.gnu.org/licenses/gpl-1.0.html>, or any later version
720         <https://www.gnu.org/licenses/license-list.html#GNUGPL>, or
721  b)     the Artistic License 2.0
722         <https://opensource.org/licenses/Artistic-2.0>,
723
724subject to the following additional term:  No trademark rights to
725"Paranoid" have been or are conveyed under any of the above licenses.
726However, "Paranoid" may be used fairly to describe this unmodified
727software, in good faith, but not as a trademark.
728
729(c) 2005 - 2020, Arthur Corliss (corliss@digitalmages.com)
730(tm) 2008 - 2020, Paranoid Inc. (www.paranoid.com)
731
732