1# Paranoid::IO::Line -- Paranoid Line-based I/O functions 2# 3# $Id: lib/Paranoid/IO/Line.pm, 2.08 2020/12/31 12:10:06 acorliss Exp $ 4# 5# This software is free software. Similar to Perl, you can redistribute it 6# and/or modify it under the terms of either: 7# 8# a) the GNU General Public License 9# <https://www.gnu.org/licenses/gpl-1.0.html> as published by the 10# Free Software Foundation <http://www.fsf.org/>; either version 1 11# <https://www.gnu.org/licenses/gpl-1.0.html>, or any later version 12# <https://www.gnu.org/licenses/license-list.html#GNUGPL>, or 13# b) the Artistic License 2.0 14# <https://opensource.org/licenses/Artistic-2.0>, 15# 16# subject to the following additional term: No trademark rights to 17# "Paranoid" have been or are conveyed under any of the above licenses. 18# However, "Paranoid" may be used fairly to describe this unmodified 19# software, in good faith, but not as a trademark. 20# 21# (c) 2005 - 2020, Arthur Corliss (corliss@digitalmages.com) 22# (tm) 2008 - 2020, Paranoid Inc. (www.paranoid.com) 23# 24##################################################################### 25 26##################################################################### 27# 28# Environment definitions 29# 30##################################################################### 31 32package Paranoid::IO::Line; 33 34use 5.008; 35 36use strict; 37use warnings; 38use vars qw($VERSION @EXPORT @EXPORT_OK %EXPORT_TAGS); 39use base qw(Exporter); 40use Fcntl qw(:DEFAULT :seek :flock :mode); 41use Paranoid qw(:all); 42use Paranoid::Debug qw(:all); 43use Paranoid::IO qw(:all); 44use Paranoid::Input qw(:all); 45 46($VERSION) = ( q$Revision: 2.08 $ =~ /(\d+(?:\.\d+)+)/sm ); 47 48@EXPORT = qw(sip nlsip tailf nltailf slurp nlslurp piolClose); 49@EXPORT_OK = ( @EXPORT, qw(PIOMAXLNSIZE) ); 50%EXPORT_TAGS = ( all => [@EXPORT_OK], ); 51 52use constant STAT_INO => 1; 53use constant STAT_SIZ => 7; 54use constant PDEFLNSZ => 2048; 55 56use constant PBFLAG => 0; 57use constant PBBUFF => 1; 58 59use constant PBF_DRAIN => 0; 60use constant PBF_NORMAL => 1; 61use constant PBF_DELETE => -1; 62 63##################################################################### 64# 65# Module code follows 66# 67##################################################################### 68 69{ 70 my $mlnsz = PDEFLNSZ; 71 72 sub PIOMAXLNSIZE : lvalue { 73 74 # Purpose: Gets/sets default line size of I/O 75 # Returns: $mlnsz 76 # Usage: $limit = PIOMAXLNSIZE; 77 # Usage: FSZLIMIT = 100; 78 79 $mlnsz; 80 } 81 82 # Manage buffers: $buffers{$name} => [$flag, $content ]; 83 my %buffers; 84 85 sub _chkBuffer { return exists $buffers{ $_[0] } } 86 87 sub _chkStat { 88 89 # Purpose: Checks stat data to see if the underlying 90 # file has changed 91 # Returns: Boolean 92 # Usage: $rv = _chkStat($file); 93 94 my $file = shift; 95 my $rv = 0; 96 my ( $fh, $fpos, @fstat, @fhstat ); 97 98 pdebug( 'entering w/(%s)', PDLEVEL3, $file ); 99 pIn(); 100 101 # Check to see if we can get a valid file handle 102 if ( defined( $fh = popen( $file, O_RDONLY ) ) ) { 103 @fhstat = stat $fh; 104 $fpos = ptell($fh); 105 106 if ( @fhstat and $fpos < $fhstat[STAT_SIZ] ) { 107 108 # Still have content to read, continue on 109 pdebug( 'still have content to drain', PDLEVEL3 ); 110 $rv = 1; 111 112 } else { 113 114 # Check the file system to see if we're still 115 # operating on the same file 116 @fstat = stat $file; 117 118 if ( scalar @fstat ) { 119 120 # Check inode 121 if ( $fhstat[STAT_INO] != $fstat[STAT_INO] ) { 122 pdebug( 'file was replaced with a new file', 123 PDLEVEL3 ); 124 } else { 125 if ( $fstat[STAT_SIZ] < $fpos ) { 126 pdebug( 'file was truncated', PDLEVEL3 ); 127 } else { 128 pdebug( 'file is unchanged', PDLEVEL3 ); 129 $rv = 1; 130 } 131 } 132 133 } else { 134 pdebug( 'file was deleted', PDLEVEL3 ); 135 } 136 } 137 } else { 138 pdebug( 'invalid/non-existent file', PDLEVEL3 ); 139 } 140 141 pOut(); 142 pdebug( 'leaving w/rv: %s', PDLEVEL3, $rv ); 143 144 return $rv; 145 } 146 147 sub piolClose { 148 149 # Purpose: Closes file handles and deletes the associated 150 # buffer 151 # Returns: Boolean 152 # Usage: $rv = piolClose($file); 153 154 my $file = shift; 155 156 delete $buffers{$file}; 157 158 return pclose($file); 159 } 160 161 sub sip ($\@;$$) { 162 163 # Purpose: Reads a chunk from the passwed handle or file name 164 # Returns: Number of lines read or undef critical failures 165 # Usage: $nlines = sip($fh, @lines); 166 # Usage: $nlines = sip($filename, @lines); 167 # Usage: $nlines = sip($filename, @lines, 1); 168 169 my $file = shift; 170 my $aref = shift; 171 my $doChomp = shift; 172 my $noLocks = shift; 173 my $rv = 1; 174 my ( $buffer, $bflag, $in, $content, $bread, $irv, @tmp, $line ); 175 176 pdebug( 'entering w/(%s)(%s)(%s)', PDLEVEL1, $file, $aref, $doChomp ); 177 pIn(); 178 179 @$aref = (); 180 181 # Check the file 182 piolClose($file) unless _chkStat($file); 183 184 # Get/initialize buffer 185 if ( exists $buffers{$file} ) { 186 $bflag = $buffers{$file}[PBFLAG]; 187 $buffer = $buffers{$file}[PBBUFF]; 188 } else { 189 $buffers{$file} = [ PBF_NORMAL, '' ]; 190 $buffer = ''; 191 $bflag = PBF_NORMAL; 192 } 193 194 # Read what we can 195 $content = ''; 196 $bread = 0; 197 while ( $bread < PIOMAXFSIZE ) { 198 $irv = $noLocks ? pnlread( $file, $in ) : pread( $file, $in ); 199 if ( defined $irv ) { 200 $bread += $irv; 201 $content .= $in; 202 last if $irv < PIOBLKSIZE; 203 } else { 204 $rv = undef; 205 last; 206 } 207 } 208 209 # Post processing 210 if ($rv) { 211 212 if ( length $content ) { 213 214 # Add the buffer 215 $content = "$buffer$content"; 216 217 # Process buffer drain conditions 218 pdebug( 'starting buffer flag: (%s)', PDLEVEL4, $bflag ); 219 pdebug( 'starting buffer: (%s)', PDLEVEL4, $buffer ); 220 if ( !$bflag and $content =~ /@{[NEWLINE_REGEX]}/so ) { 221 pdebug( 'draining to next newline', PDLEVEL4 ); 222 $content =~ s/^.*?@{[NEWLINE_REGEX]}//so; 223 $bflag = PBF_NORMAL; 224 $buffer = ''; 225 } 226 227 # Check for newlines 228 if ( $content =~ /@{[NEWLINE_REGEX]}/so ) { 229 230 # Split lines along newline boundaries 231 @tmp = split m/(@{[NEWLINE_REGEX]})/so, $content; 232 while ( scalar @tmp > 1 ) { 233 if ( length $tmp[0] > PIOMAXLNSIZE ) { 234 splice @tmp, 0, 2; 235 $line = undef; 236 } else { 237 $line = join '', splice @tmp, 0, 2; 238 } 239 push @$aref, $line; 240 } 241 242 # Check for undefined lines 243 $rv = scalar @$aref; 244 @$aref = grep {defined} @$aref; 245 if ( $rv != scalar @$aref ) { 246 Paranoid::ERROR = 247 pdebug( 'found %s lines over PIOMAXLNSIZE', 248 PDLEVEL1, $rv - @$aref ); 249 $rv = undef; 250 } 251 252 # Check for an unterminated line at the end and 253 # buffer appropriately 254 if ( scalar @tmp ) { 255 256 # Content left over, update the buffer 257 if ( length $tmp[0] > PIOMAXLNSIZE ) { 258 $buffer = ''; 259 $bflag = PBF_DRAIN; 260 $rv = undef; 261 Paranoid::ERROR = 262 pdebug( 'buffer is over PIOMAXLNSIZE', 263 PDLEVEL1 ); 264 } else { 265 $buffer = $tmp[0]; 266 $bflag = PBF_NORMAL; 267 } 268 } else { 269 270 # Nothing left over, make sure the buffer is empty 271 $buffer = ''; 272 $bflag = PBF_NORMAL; 273 } 274 275 } else { 276 277 # Check buffered block for PIOILNSIZE limit 278 if ( length $content > PIOMAXLNSIZE ) { 279 $buffer = ''; 280 $bflag = PBF_DRAIN; 281 $rv = undef; 282 Paranoid::ERROR = 283 pdebug( 'block is over PIOMAXLNSIZE', PDLEVEL1 ); 284 } else { 285 $rv = 0; 286 $buffer = $content; 287 $bflag = PBF_NORMAL; 288 } 289 } 290 pdebug( 'ending buffer flag: (%s)', PDLEVEL4, $bflag ); 291 pdebug( 'ending buffer: (%s)', PDLEVEL4, $buffer ); 292 293 } else { 294 $rv = 0; 295 } 296 } 297 298 # Set PTRUE_ZERO if needed 299 $rv = PTRUE_ZERO if defined $rv and $rv == 0; 300 301 # Save the buffer 302 $buffers{$file}[PBFLAG] = $bflag; 303 $buffers{$file}[PBBUFF] = $buffer; 304 305 # Chomp if necessary 306 pchomp(@$aref) if $doChomp and scalar @$aref; 307 308 pdebug( 'returning %s lines', PDLEVEL2, scalar @$aref ); 309 310 pOut(); 311 pdebug( 'leaving w/rv: %s', PDLEVEL1, $rv ); 312 313 return $rv; 314 } 315 316} 317 318sub nlsip { 319 320 # Purpose: Wrapper for sip that enables non-locking reads 321 # Returns: Return value from sip 322 # Usage: $nlines = nlsip($file, @lines); 323 324 my $file = shift; 325 my $aref = shift; 326 my $doChomp = shift; 327 328 return sip( $file, @$aref, $doChomp, 1 ); 329} 330 331sub tailf ($\@;$$$) { 332 333 # Purpose: Augments sip's tailing abilities by seeking to 334 # the end (or, optionally, backwards) 335 # Returns: Number of lines tailed 336 # Usage: $nlines = tail($filename, @lines); 337 # Usage: $nlines = tail($filename, @lines, $chomp); 338 # Usage: $nlines = tail($filename, @lines, $lnOffset); 339 340 my $file = shift; 341 my $aref = shift; 342 my $doChomp = shift || 0; 343 my $offset = shift || -10; 344 my $noLocks = shift; 345 my ( $rv, $ofsb, @lines ); 346 347 pdebug( 'entering w/(%s)(%s)(%s)(%s)', 348 PDLEVEL1, $file, $aref, $doChomp, $offset ); 349 pIn(); 350 351 @$aref = (); 352 353 # Check to see if we've already opened this file 354 if ( _chkBuffer($file) ) { 355 356 # Offset is only used on the initial open 357 $offset = 0; 358 359 } else { 360 361 # TODO: At some point we might want to honor positive offsets to mimic 362 # the behavior of UNIX tail 363 364 # Calculate how far back we need to go from the end 365 $ofsb = $offset * ( PIOMAXLNSIZE +1 ); 366 Paranoid::ERROR = 367 pdebug( 'WARNING: called with a positive line offset', PDLEVEL1 ) 368 unless $ofsb < 0; 369 370 # Open the file and move the cursor 371 pseek( $file, $ofsb, SEEK_END ) if popen( $file, O_RDONLY ); 372 373 } 374 375 # If $offset is set we have trailing lines to handle 376 if ($offset) { 377 378 # Consume everything to the end of the file 379 do { 380 $noLocks 381 ? nlsip( $file, @lines, $doChomp ) 382 : sip( $file, @lines, $doChomp ); 383 push @$aref, @lines; 384 } while scalar @lines; 385 386 # Trim list to the request size 387 if ( scalar @$aref > abs $offset ) { 388 splice @$aref, 0, @$aref - abs $offset; 389 } 390 $rv = scalar @$aref; 391 $rv = PTRUE_ZERO unless $rv; 392 393 } else { 394 395 # Do a single sip 396 $rv = 397 $noLocks 398 ? nlsip( $file, @$aref, $doChomp ) 399 : sip( $file, @$aref, $doChomp ); 400 } 401 402 pOut(); 403 pdebug( 'leaving w/rv: %s', PDLEVEL1, $rv ); 404 405 return $rv; 406} 407 408sub nltailf ($\@;$$$) { 409 410 # Purpose: Wrapper for sip that enables non-locking reads 411 # Returns: Return value from sip 412 # Usage: $nlines = nlsip($file, @lines); 413 414 my $file = shift; 415 my $aref = shift; 416 my $doChomp = shift; 417 my $offset = shift; 418 419 return tailf( $file, @$aref, $doChomp, $offset, 1 ); 420} 421 422sub slurp ($\@;$$) { 423 424 # Purpose: Reads a file into memory 425 # Returns: Number of lines read/undef 426 # Usage: $nlines = slurp($filename, @lines; 427 # Usage: $nlines = slurp($filename, @lines, 1); 428 429 my $file = shift; 430 my $aref = shift; 431 my $doChomp = shift || 0; 432 my $noLocks = shift; 433 my $rv = 1; 434 my @fstat; 435 436 pdebug( 'entering w/(%s)(%s)(%s)', PDLEVEL1, $file, $aref, $doChomp ); 437 pIn(); 438 439 # Start sipping 440 $rv = sip( $file, @$aref, $doChomp, $noLocks ); 441 if ( ref $file eq 'GLOB' ) { 442 @fstat = stat $file if fileno $file; 443 } else { 444 @fstat = stat $file; 445 } 446 if ( scalar @fstat and $fstat[STAT_SIZ] > PIOMAXFSIZE ) { 447 Paranoid::ERROR = pdebug( 'file size exceeds PIOMAXFSIZE', PDLEVEL1 ); 448 $rv = undef; 449 } 450 451 # Count lins if sip never complained 452 $rv = scalar @$aref if defined $rv; 453 454 # Close everything out 455 piolClose($file); 456 457 pOut(); 458 pdebug( 'leaving w/rv: %s', PDLEVEL1, $rv ); 459 460 return $rv; 461} 462 463sub nlslurp ($\@;$$) { 464 465 # Purpose: Performs a non-locking slurp 466 # Returns: Number of lines/undef 467 # Usage: $nlines = nlslurp($filename, @lines); 468 # Usage: $nlines = nlslurp($filename, @lines, 1); 469 470 my $file = shift; 471 my $aref = shift; 472 my $doChomp = shift || 0; 473 474 return slurp( $file, @$aref, $doChomp, 1 ); 475} 476 4771; 478 479__END__ 480 481=head1 NAME 482 483Paranoid::IO::Line - Paranoid Line-based I/O functions 484 485=head1 VERSION 486 487$Id: lib/Paranoid/IO/Line.pm, 2.08 2020/12/31 12:10:06 acorliss Exp $ 488 489=head1 SYNOPSIS 490 491 use Paranoid::IO::Line; 492 493 PIOMAXLNSIZE = 4096; 494 495 $nlines = sip($filename, @lines); 496 $nlines = sip($filename, @lines, 1); 497 $nlines = tailf($filename, @lines); 498 $nlines = tailf($filename, @lines, 1); 499 $nlines = tailf($filename, @lines, 1, -100); 500 501 piolClose($filename); 502 503 $nlines = slurp($filename, @lines); 504 505=head1 DESCRIPTION 506 507This module extends and leverages L<Paranoid::IO>'s capabilities with an eye 508towards line-based text files, such as log files. It does so while 509maintaining a paranoid stance towards I/O. For that reason the functions here 510only work on limited chunks of data at a time, both in terms of maximum memory 511kept in memory at a time and the maximum record length. L<Paranoid::IO> 512provides I<PIOMAXFSIZE> which controls the former, but this module provides 513I<PIOMAXLNSIZE> which controls the latter. 514 515Even with the paranoid slant of these functions they should really be treated 516as convenience functions which can simplify higher level code without 517incurring any significant risk to the developer or system. They inherit not 518only opportunistic I/O but platform-agnostic record separators via internal 519use of I<pchomp> from L<Paranoid::Input>. 520 521B<NOTE:> while this does build off the foundation provided by L<Paranoid::IO> 522it is important to note that you should not work on the same files using 523:<Paranoid::IO>'s functions while also using the functions in this module. 524While the former works from raw I/O the latter has to manage buffers in order 525to identify record boundaries. If you were to, say, I<sip> from a file, then 526I<pread> or I<pseek> elsewhere it would render those buffers not only useless, 527but corrupt. This is important to note since the functions here do leverage 528the file handle caching features provided by I<popen>. 529 530It should also be noted that since we're anticipating line-based records we 531expect every line, even the last line in a file, to be properly terminated 532with a record separator (new line sequence). 533 534As with all L<Paranoid> modules string descriptions of errors can be retrieved 535from L<Paranoid::ERROR> as they occur. 536 537=head1 IMPORT LISTS 538 539This module exports the following symbols by default: 540 541 sip nlsip tailf nltailf slurp nlslurp piolClose 542 543The following specialized import lists also exist: 544 545 List Members 546 -------------------------------------------------------- 547 all @defaults PIOMAXLNSIZE 548 549=head1 SUBROUTINES/METHODS 550 551=head2 PIOMAXLNSIZE 552 553The valute returned/set by this lvalue function is the maximum line length 554supported by functions like B<sip> (documented below). Unless explicitly set 555this defaults to 2KB. Any lines found which exceed this are discarded. 556 557=head2 sip 558 559 $nlines = sip($filename, @lines); 560 $nlines = sip($filename, @lines, 1); 561 562This function allows you to read a text file into memory in chunks, the 563lines of which are placed into the passed array reference. The chunks are 564read in at up to L<PIOMAXFSIZE> in size at a time. File locking is used 565and autochomping is also supported. 566 567This returns the number of lines extracted or boolean false if any errors 568occurred, such as lines exceeding I<PIOMAXLNSIZE> or other I/O errors. If 569there were no errors but also no content it will return B<0 but true>, which 570will satisfy boolean tests. 571 572The passed array is always purged prior to execution. This can potentially 573help differentiate types of errors: 574 575 $nlines = sip($filename, @lines); 576 577 warn "successfully extracted lines" 578 if $nlines and scalar @lines; 579 warn "no errors, but no lines" 580 if $nlines and ! scalar @lines; 581 warn "line length exceeded on some lines" 582 if !$nlines and scalar @lines; 583 warn "I/O errors or all lines exceeded line length" 584 if !$nlines and ! scalar @lines; 585 586Typically, if all one cares about is extracting good lines and discarding bad 587ones all you need is: 588 589 warn "good to go" if scalar @lines or $nlines; 590 591 # or, more likely: 592 if (@lines) { 593 # process input... 594 } 595 596B<NOTE:> I<sip> does try to check the file stat with every call. This allows 597us to automatically flush buffers and reopen files in the event that the file 598you're sipping from was truncated, deleted, or overwritten. 599 600The third argument is a boolean option which controls whether lines are 601automatically chomped or not. It defaults to not. 602 603=head2 nlsip 604 605 $nlines = nlsip($filename, @lines); 606 $nlines = nlsip($filename, @lines, 1); 607 608A very thin wrapper for I<sip> that disables file locking. 609 610=head2 tailf 611 612 $nlines = tailf($filename, @lines); 613 $nlines = tailf($filename, @lines, 1); 614 $nlines = tailf($filename, @lines, 1, -100); 615 616The only difference between this function and B<sip> is that tailf opens the 617file and immediately seeks to the end. If an optional fourth argument is 618passed it will seek backwards to extract and return that number of lines (if 619possible). Depending on the number passed one must be prepared for enough 620memory to be allocated to store B<PIOMAXLNSIZE> * that number. If no number is 621specified it is assumed to be B<-10>. Specifying this argument on a file 622already opened by I<sip> or I<tailf> will have no effect. 623 624Return values are identical to I<sip>. 625 626=head2 nltailf 627 628 $nlines = nltailf($filename, @lines); 629 $nlines = nltailf($filename, @lines, -100); 630 $nlines = nltailf($filename, @lines, -100, 1); 631 632A very thin wrapper for I<tailf> that disables file locking. 633 634=head2 slurp 635 636 $nlines = slurp($filename, @lines); 637 $nlines = slurp($filename, @lines, 1); 638 639This function is essentially another wrapper for I<sip>, but with some 640different behavior. While I<sip> was written from the expectation that the 641developer would be either working on chunks from a very large file or a file 642that may grow while being accessed. I<slurp>, on the other hand, expects to 643work exclusively on small files that can safely fit into memory. It also sees 644no need to cache file handles since all operations will subsequently be done 645in memory. 646 647Files with slurp are explicitly closed after the read. All the normal 648safeguards apply: I<PIOMAXFSIZE> is the largest amount of data that will be 649read into memory, and all lines must be within I<PIOMAXLNSIZE>. 650 651The third argument is a boolean option which controls whether lines are 652automatically chomped or not. It defaults to not. 653 654=head2 nlslurp 655 656 $nlines = nlslurp($filename, @lines); 657 $nlines = nlslurp($filename, @lines, 1); 658 659A very thin wrapper for I<slurp> that disables file locking. 660 661=head2 piolClose 662 663 $rv = piolClose($filename); 664 665This closes all file handles and deletes any existing buffers. Works 666indiscriminatley and returns the exit value of I<pclose>. 667 668=head1 DEPENDENCIES 669 670=over 671 672=item o 673 674L<Fcntl> 675 676=item o 677 678L<Paranoid> 679 680=item o 681 682L<Paranoid::Debug> 683 684=item o 685 686L<Paranoid::Input> 687 688=item o 689 690L<Paranoid::IO> 691 692=back 693 694=head1 BUGS AND LIMITATIONS 695 696While all of these functions will just as happily accept file handles as well 697as file names doing will almost certainly cause any number of bugs. Beyond 698the inherited L<Paranoid::IO> issues (like not getting the fork-safe features 699for any file handle opened directly by the developer) there are other issues. 700Buffers, for instance, can only be managed by one consistent name, there is no 701way to correlate them and make them interchangeable. There are other 702subtleties as well, but there is no need to detail them all. 703 704Suffice it to say that when using this module one should only use file names, 705and use them consistently. 706 707=head1 AUTHOR 708 709Arthur Corliss (corliss@digitalmages.com) 710 711=head1 LICENSE AND COPYRIGHT 712 713This software is free software. Similar to Perl, you can redistribute it 714and/or modify it under the terms of either: 715 716 a) the GNU General Public License 717 <https://www.gnu.org/licenses/gpl-1.0.html> as published by the 718 Free Software Foundation <http://www.fsf.org/>; either version 1 719 <https://www.gnu.org/licenses/gpl-1.0.html>, or any later version 720 <https://www.gnu.org/licenses/license-list.html#GNUGPL>, or 721 b) the Artistic License 2.0 722 <https://opensource.org/licenses/Artistic-2.0>, 723 724subject to the following additional term: No trademark rights to 725"Paranoid" have been or are conveyed under any of the above licenses. 726However, "Paranoid" may be used fairly to describe this unmodified 727software, in good faith, but not as a trademark. 728 729(c) 2005 - 2020, Arthur Corliss (corliss@digitalmages.com) 730(tm) 2008 - 2020, Paranoid Inc. (www.paranoid.com) 731 732