1###############################################################################
2#
3#   Class: NaturalDocs::LineReader
4#
5###############################################################################
6#
7#   An object to handle reading text files line by line in a cross platform manner.  Using this class instead of the standard
8#	angle brackets approach has the following benefits:
9#
10#	- It strips all three types of line breaks automatically: CR/LF (Windows) LF (Unix) and CR (Classic Mac).  You do not need to
#	  call chomp().  Perl's chomp() fails when parsing Windows-format line breaks on a Unix platform anyway.  It leaves the \r on,
12#	  which screws everything up.
13#	- It reads Classic Mac files line by line correctly, whereas the Perl version returns it all as one line.
14#	- It abstracts away ignoring the Unicode BOM on the first line, if present.
15#
16###############################################################################
17
# This file is part of Natural Docs, which is Copyright (C) 2003-2010 Greg Valure
19# Natural Docs is licensed under version 3 of the GNU Affero General Public License (AGPL)
20# Refer to License.txt for the complete details
21
22use strict;
23use integer;
24
25use Encode;
26
27
28package NaturalDocs::LineReader;
29
30#
31#	Constants: Members
32#
33#	LINEREADER_FILEHANDLE - The file handle being used to read the file.  Has the LINEREADER_ prefix to make sure it doesn't
34#											 conflict with any actual filehandles named FILEHANDLE in the program.
35#	CACHED_LINES - An arrayref of lines already read into memory.
36#
37use NaturalDocs::DefineMembers 'LINEREADER_FILEHANDLE',
38                                                 'CACHED_LINES';
39
40#
41#   Function: New
42#
43#   Creates and returns a new object.
44#
45#   Parameters:
46#
47#       filehandle - The file handle being used to read the file.
48#
sub New #(filehandle)
    {
    my ($selfPackage, $filehandle) = @_;

    my $object = [ ];

    $object->[LINEREADER_FILEHANDLE] = $filehandle;
    $object->[CACHED_LINES] = [ ];

    # Look at the raw bytes first so a byte order mark can be recognized before any decoding layer is applied.
    binmode($filehandle, ':raw');

    my $hasBOM = 0;
    my $possibleBOM = undef;
    read($filehandle, $possibleBOM, 2);

    if ($possibleBOM eq "\xEF\xBB")
        {
        # The UTF-8 BOM is three bytes long, so the third one has to be checked as well.  If it doesn't match, $hasBOM stays
        # zero and the detection code below rewinds the file and starts over.
        read($filehandle, $possibleBOM, 1);
        if ($possibleBOM eq "\xBF")
            {
            binmode($filehandle, ':crlf:encoding(UTF-8)');  # Strict UTF-8, not Perl's lax version.
            $hasBOM = 1;
            }
        }
    elsif ($possibleBOM eq "\xFE\xFF")
        {
        binmode($filehandle, ':crlf:encoding(UTF-16BE)');
        $hasBOM = 1;
        }
    elsif ($possibleBOM eq "\xFF\xFE")
        {
        binmode($filehandle, ':crlf:encoding(UTF-16LE)');
        $hasBOM = 1;
        }

    # When a BOM was found the file position is left just past it, so it never shows up in the lines returned later.

    if (!$hasBOM)
        {
        seek($filehandle, 0, 0);

        my $rawData = undef;
        my $readLength = -s $filehandle;
        my $isTruncated = 0;

        # Since we're only reading the data to determine if it's UTF-8, sanity check the file length.  We may run
        # across a huge extensionless system file and we don't want to load the whole thing.  Half a meg should
        # be good enough to encompass giant source files while not bogging things down on system files.
        if ($readLength > 512 * 1024)
            {
            $readLength = 512 * 1024;
            $isTruncated = 1;
            }

        read($filehandle, $rawData, $readLength);

        # If the sample was capped it may end in the middle of a multi-byte character, which the strict decoder would
        # reject even though the file itself is valid.  Drop a trailing lead byte plus up to two continuation bytes so the
        # sample ends on a character boundary.  This may also drop one complete two or three byte character, which is
        # harmless because the data is only used for detection.
        if ($isTruncated)
            {  $rawData =~ s/[\xC0-\xFF][\x80-\xBF]{0,2}\z//;  }

        # The block returns 1 only if decode() did not croak.  Checking eval's return value this way avoids depending on
        # $@ or on package main having imported the English module's $EVAL_ERROR alias.
        my $isValidUTF8 = eval
            {
            Encode::decode("UTF-8", $rawData, Encode::FB_CROAK);
            1;
            };

        if (!$isValidUTF8)
            {  binmode($filehandle, ':crlf');  }
        else
            {
            # Theoretically, since this is valid UTF-8 data we should be able to split it on line breaks and feed them into
            # CACHED_LINES instead of setting the encoding to UTF-8 and seeking back to zero just to read it all again.
            # Alas, this doesn't work for some reason that isn't easily identifiable.  I'm sure there is one, but I couldn't
            # figure it out before my patience ran out so I'm just letting the file cache absorb the hit instead.  If we
            # were ever to do this in the future you'd have to handle the file length capping code above too.
            binmode($filehandle, ':crlf:encoding(UTF-8)');
            }

        # Rewind so the detection sample is read again through the newly applied layers.
        seek($filehandle, 0, 0);
        }

    bless $object, $selfPackage;
    return $object;
    };
120
121
122#
123#   Function: Chomp
124#
125#   Removes any line breaks from the end of a value.  It does not remove any that are in the middle of it.
126#
127#   Parameters:
128#
129#       lineRef - A *reference* to the line to chomp.
130#
sub Chomp #(lineRef)
    {
    my $self = shift;
    my $lineRef = shift;

    # The CR/LF pair has to be tried before a lone CR.  Otherwise the CR alone would match in front of the final LF and
    # the LF would be left behind.  The substitution result is the implicit return value.
    ${$lineRef} =~ s/(?:\r\n|\r|\n)$//;
    };
136
137
138#
139#	Function: Get
140#
141#	Returns the next line of text from the file, or undef if there are no more.  The line break will be removed automatically.  If
142#	the first line contains a Unicode BOM, that will also be removed automatically.
143#
sub Get
    {
    my $self = shift;
    my $cachedLines = $self->[CACHED_LINES];

    # Lines left over from an earlier chunk that contained several CR-separated lines are handed out first.
    if (scalar @$cachedLines)
        {  return shift @$cachedLines;  }

    my $filehandle = $self->[LINEREADER_FILEHANDLE];
    my $rawLine = <$filehandle>;

    # Nothing left in the file.
    if (!defined $rawLine)
        {  return undef;  }

    $self->Chomp(\$rawLine);

    if ($rawLine !~ /\r/)
        {  return $rawLine;  }

    # Split for Classic Mac.  The chunk holds several lines separated by bare CRs.  The -1 limit keeps trailing empty
    # fields.  Without it a chunk made up only of CRs would split into an empty list and this function would return
    # undef, which callers read as end of file.  Blank lines at the end of the chunk would also be lost.
    push @$cachedLines, split(/\r/, $rawLine, -1);
    return shift @$cachedLines;
    }
172
173
174#
175#	Function: GetAll
176#
177#	Returns an array of all the lines from the file.  The line breaks will be removed automatically.  If the first line contains a
178#	Unicode BOM, that will also be removed automatically.
179#
sub GetAll
    {
    my $self = shift;

    my $filehandle = $self->[LINEREADER_FILEHANDLE];
    my $rawContent;

    # The file size in bytes is an upper bound on what is left to read, so this reads through to the end of the file.
    read($filehandle, $rawContent, -s $filehandle);

    # An earlier call to Get() may have pulled several lines out of the file and left some of them in the cache.  They
    # come before anything still unread, so return them first and empty the cache so they aren't handed out twice.
    my @lines = splice(@{$self->[CACHED_LINES]});

    # Split on all three line break styles.  Trailing empty fields are dropped, so the final line break doesn't produce
    # an extra empty line.
    push @lines, split(/\r\n|\n|\r/, $rawContent);

    return @lines;
    }
191
1921;
193