###############################################################################
#
#   Class: NaturalDocs::LineReader
#
###############################################################################
#
#   An object to handle reading text files line by line in a cross platform manner.  Using this class instead of the standard
#   angle brackets approach has the following benefits:
#
#   - It strips all three types of line breaks automatically: CR/LF (Windows), LF (Unix), and CR (Classic Mac).  You do not
#     need to call chomp().  Perl's chomp() fails when parsing Windows-format line breaks on a Unix platform anyway.  It
#     leaves the \r on, which breaks later processing.
#   - It reads Classic Mac files line by line correctly, whereas the Perl version returns it all as one line.
#   - It abstracts away ignoring the Unicode BOM on the first line, if present.
#   - When there is no BOM, it checks whether the content is valid UTF-8 and sets the file handle's encoding accordingly.
#
###############################################################################

# This file is part of Natural Docs, which is Copyright © 2003-2010 Greg Valure
# Natural Docs is licensed under version 3 of the GNU Affero General Public License (AGPL)
# Refer to License.txt for the complete details

use strict;
use integer;

use Encode;


package NaturalDocs::LineReader;

#
#   Constants: Members
#
#   LINEREADER_FILEHANDLE - The file handle being used to read the file.  Has the LINEREADER_ prefix to make sure it doesn't
#                           conflict with any actual filehandles named FILEHANDLE in the program.
#   CACHED_LINES - An arrayref of lines already read into memory.
#
use NaturalDocs::DefineMembers 'LINEREADER_FILEHANDLE',
                               'CACHED_LINES';

#
#   Function: New
#
#   Creates and returns a new object.  The file handle's layers are reset and then configured according to the detected
#   encoding: a UTF-8, UTF-16BE, or UTF-16LE BOM selects that encoding and is skipped; without a BOM the beginning of the
#   file is tested for strict UTF-8 validity and the handle is rewound to the start.
#
#   Parameters:
#
#       filehandle - The file handle being used to read the file.  It must be seekable when the file has no BOM.
#
#   Returns:
#
#       The new NaturalDocs::LineReader object.
#
sub New #(filehandle)
    {
    my ($selfPackage, $filehandle) = @_;

    my $object = [ ];

    $object->[LINEREADER_FILEHANDLE] = $filehandle;
    $object->[CACHED_LINES] = [ ];

    # Work on raw bytes first so the BOM and encoding checks see the file exactly as stored.
    binmode($filehandle, ':raw');

    my $hasBOM = 0;
    my $possibleBOM = '';
    read($filehandle, $possibleBOM, 2);

    if (!defined $possibleBOM)
        {  $possibleBOM = '';  }

    if ($possibleBOM eq "\xEF\xBB")
        {
        # The UTF-8 BOM is three bytes, so the third one has to be confirmed separately.
        read($filehandle, $possibleBOM, 1);

        if (defined $possibleBOM && $possibleBOM eq "\xBF")
            {
            binmode($filehandle, ':crlf:encoding(UTF-8)');  # Strict UTF-8, not Perl's lax version.
            $hasBOM = 1;
            }
        }
    elsif ($possibleBOM eq "\xFE\xFF")
        {
        binmode($filehandle, ':crlf:encoding(UTF-16BE)');
        $hasBOM = 1;
        }
    elsif ($possibleBOM eq "\xFF\xFE")
        {
        binmode($filehandle, ':crlf:encoding(UTF-16LE)');
        $hasBOM = 1;
        }

    if (!$hasBOM)
        {
        seek($filehandle, 0, 0);

        # Since we're only reading the data to determine if it's UTF-8, sanity check the file length.  We may run
        # across a huge extensionless system file and we don't want to load the whole thing.  Half a meg should
        # be good enough to encompass giant source files while not bogging things down on system files.
        my $maxSampleLength = 512 * 1024;
        my $readLength = -s $filehandle;

        # -s is undefined when the handle can't be stat'ed (in-memory handles for example), so fall back to the cap.
        if (!defined $readLength || $readLength > $maxSampleLength)
            {  $readLength = $maxSampleLength;  }

        my $rawData = '';
        read($filehandle, $rawData, $readLength);

        if (!defined $rawData)
            {  $rawData = '';  }

        # If the sample stops before the end of the file it may end in the middle of a multi-byte character, which
        # would make otherwise valid UTF-8 fail the strict check below.  Drop an incomplete trailing sequence first.
        if (!eof($filehandle))
            {  $rawData =~ s/(?:[\xC2-\xDF]|[\xE0-\xEF][\x80-\xBF]?|[\xF0-\xF4][\x80-\xBF]{0,2})\z//;  }

        # Test eval's own return value instead of a global error variable so the result doesn't depend on what the
        # main program has imported.  The decoded text itself isn't needed, only whether decoding succeeded.
        my $isValidUTF8 = eval
            {
            Encode::decode("UTF-8", $rawData, Encode::FB_CROAK);
            1;
            };

        if ($isValidUTF8)
            {
            # Theoretically, since this is valid UTF-8 data we should be able to split it on line breaks and feed them
            # into CACHED_LINES instead of setting the encoding to UTF-8 and seeking back to zero just to read it all
            # again.  That did not work in practice, for a reason that was never tracked down, so the file is simply
            # read again and the file cache absorbs the hit.  Anyone revisiting this would also have to account for
            # the length cap above.
            binmode($filehandle, ':crlf:encoding(UTF-8)');
            }
        else
            {  binmode($filehandle, ':crlf');  }

        seek($filehandle, 0, 0);
        }

    bless $object, $selfPackage;
    return $object;
    }


#
#   Function: Chomp
#
#   Removes a single line break (CR/LF, CR, or LF) from the very end of a value.  It does not remove any that are in the
#   middle of it.
#
#   Parameters:
#
#       lineRef - A *reference* to the line to chomp.  The referenced value is modified in place.
#
sub Chomp #(lineRef)
    {
    my ($self, $lineRef) = @_;

    # \z instead of $ because $ also matches just before a final \n, which would let the pattern remove a line break
    # that isn't actually the last thing in the string.
    $$lineRef =~ s/(?:\r\n|\r|\n)\z//;
    }


#
#   Function: Get
#
#   Returns the next line of text from the file, or undef if there are no more.  The line break will be removed automatically.
#   If the first line contains a Unicode BOM, that will also be removed automatically.
#
sub Get
    {
    my $self = shift;
    my $cachedLines = $self->[CACHED_LINES];

    # Lines left over from a previous Classic Mac split are handed out before reading anything new.
    if (scalar @$cachedLines)
        {  return shift @$cachedLines;  }

    my $filehandle = $self->[LINEREADER_FILEHANDLE];
    my $rawLine = <$filehandle>;

    if (!defined $rawLine)
        {  return undef;  }

    $self->Chomp(\$rawLine);

    if ($rawLine !~ /\r/)
        {  return $rawLine;  }

    # Split for Classic Mac.  The -1 limit keeps trailing empty fields; without it blank lines at the end would be lost
    # and a value made up only of \r characters would split into nothing, returning undef as if the file had ended.
    push @$cachedLines, split(/\r/, $rawLine, -1);

    return shift @$cachedLines;
    }


#
#   Function: GetAll
#
#   Returns an array of all the remaining lines from the file.  The line breaks will be removed automatically.  If the first
#   line contains a Unicode BOM, that will also be removed automatically.  Lines that were already buffered by an earlier
#   call to <Get> but not yet returned are included first.
#
sub GetAll
    {
    my $self = shift;

    my $filehandle = $self->[LINEREADER_FILEHANDLE];

    # Hand back anything Get() already pulled off the file handle so no lines are skipped, and empty the cache so they
    # aren't returned twice.
    my @lines = @{$self->[CACHED_LINES]};
    $self->[CACHED_LINES] = [ ];

    # Slurp the rest rather than sizing the read with -s, which counts bytes for the whole file and is unavailable for
    # handles that can't be stat'ed.
    my $rawContent = do { local $/; <$filehandle> };

    if (defined $rawContent)
        {  push @lines, split(/\r\n|\n|\r/, $rawContent);  }

    return @lines;
    }

1;