1package HTML::Lint;
2
3use warnings;
4use strict;
5
6use HTML::Lint::Error;
7use HTML::Lint::Parser ();
8
9use HTML::Entities ();
10
11=head1 NAME
12
13HTML::Lint - check for HTML errors in a string or file
14
15=head1 VERSION
16
17Version 2.32
18
19=cut
20
# Module version; the "VERSION" POD section above states the same number.
our $VERSION = '2.32';
22
23=head1 SYNOPSIS
24
25    my $lint = HTML::Lint->new;
26    $lint->only_types( HTML::Lint::Error::STRUCTURE );
27
28    # Parse lines of data.
29    $lint->newfile( $filename );
30    while ( my $line = <> ) {
31        $lint->parse( $line );
32    }
33    $lint->eof();
34
35    # Or, parse an entire file at once.
36    $lint->parse_file( $filename );
37
38    # Fetch the errors that the linter found.
39    my $error_count = $lint->errors;
40
41    foreach my $error ( $lint->errors ) {
42        print $error->as_string, "\n";
43    }
44
45HTML::Lint also comes with a wrapper program called F<weblint> that handles
46linting from the command line:
47
48    $ weblint http://www.cnn.com/
49    http://www.cnn.com/ (395:83) <IMG SRC="spacer.gif"> tag has no HEIGHT and WIDTH attributes.
50    http://www.cnn.com/ (395:83) <IMG SRC="goofus.gif"> does not have ALT text defined
51    http://www.cnn.com/ (396:217) Unknown element <nobr>
52    http://www.cnn.com/ (396:241) </nobr> with no opening <nobr>
53    http://www.cnn.com/ (842:7) target attribute in <a> is repeated
54
And finally, there is also L<Apache::HTML::Lint>, which passes any
mod_perl-generated code through HTML::Lint and dumps the results into your
Apache F<error_log>.
58
59    [Mon Jun  3 14:03:31 2002] [warn] /foo.pl (1:45) </p> with no opening <p>
60    [Mon Jun  3 14:03:31 2002] [warn] /foo.pl (1:49) Unknown element <gronk>
61    [Mon Jun  3 14:03:31 2002] [warn] /foo.pl (1:56) Unknown attribute "x" for tag <table>
62
63=cut
64
65=head1 METHODS
66
67NOTE: Some of these methods mirror L<HTML::Parser>'s methods, but HTML::Lint
68is not a subclass of HTML::Parser.
69
70=head2 new()
71
Create an HTML::Lint object.
You may pass the types of errors you want to check for in the
C<only_types> parameter.
75
76    my $lint = HTML::Lint->new( only_types => HTML::Lint::Error::STRUCTURE );
77
78If you want more than one, you must pass an arrayref:
79
80    my $lint = HTML::Lint->new(
81        only_types => [HTML::Lint::Error::STRUCTURE, HTML::Lint::Error::FLUFF] );
82
83=cut
84
sub new {
    my ( $class, %args ) = @_;

    # Start with an empty error list and no type filter (all types kept).
    my $self = bless {
        _errors => [],
        _types  => [],
    }, $class;

    # only_types may be a single type or an arrayref of types.  A false
    # value is left in %args on purpose so that it is reported below.
    my $only = $args{only_types};
    if ( $only ) {
        my @wanted = ref $only eq 'ARRAY' ? @{$only} : ( $only );
        $self->only_types( @wanted );
        delete $args{only_types};
    }

    # Anything still left over is not an argument this constructor knows.
    for my $unknown ( keys %args ) {
        warn "Unknown argument $unknown\n";
    }

    return $self;
}
104
105=head2 $lint->parser()
106
107Returns the parser object for this object, creating one if necessary.
108
109=cut
110
sub parser {
    my ( $self ) = @_;

    # Build the parser on first use and cache it in the object; eof() and
    # newfile() drop the cached copy so the next call builds a fresh one.
    unless ( $self->{_parser} ) {
        # The parser reports problems through this callback, which simply
        # forwards its arguments to our own gripe() method.
        my $report = sub { $self->gripe( @_ ) };

        my $parser = $self->{_parser} = HTML::Lint::Parser->new( $report );
        $parser->ignore_elements( qw(script style) );
    }

    return $self->{_parser};
}
121
122=head2 $lint->parse( $text )
123
124=head2 $lint->parse( $code_ref )
125
126Passes in a chunk of HTML to be linted, either as a piece of text,
127or a code reference.
See L<HTML::Parser>'s C<parse> method for details.
129
130=cut
131
sub parse {
    my $self = shift;

    # Hand the chunk (text or code reference) straight to the underlying
    # parser; the remaining arguments are passed through untouched.
    my $parser = $self->parser;
    my $result = $parser->parse( @_ );

    # Remember that parsing happened so errors() does not report
    # 'api-parse-not-called'.
    $self->{_parse_called} = 1;

    return $result;
}
141
142=head2 $lint->parse_file( $file )
143
144Analyzes HTML directly from a file. The C<$file> argument can be a filename,
145an open file handle, or a reference to an open file handle.
146See L<HTML::Parser>'s C<parse_file> method for details.
147
148=cut
149
sub parse_file {
    my $self = shift;

    # Let the underlying parser read the file (name or handle), passing
    # the caller's arguments through untouched.
    my $parser = $self->parser;
    my $result = $parser->parse_file( @_ );

    # A whole file has been consumed: record the parse and finish up via
    # our own eof(), so callers need not call eof() themselves.
    $self->{_parse_called} = 1;
    $self->eof;

    return $result;
}
160
161=head2 $lint->eof()
162
163Signals the end of a block of text getting passed in.  This must be
164called to make sure that all parsing is complete before looking at errors.
165
166Any parameters (and there shouldn't be any) are passed through to
167HTML::Parser's eof() method.
168
169=cut
170
sub eof {   ## no critic ( Subroutines::ProhibitBuiltinHomonyms )
    my $self = shift;

    # Note that parser() creates a parser if none is cached, so this is
    # normally always a true value.
    my $parser = $self->parser;

    my $result;
    if ( $parser ) {
        # Flush the parser, passing any caller arguments through.
        $result = $parser->eof( @_ );

        # Record completion for errors(), and discard the parser so the
        # next parse starts with a fresh one.
        $self->{_eof_called} = 1;
        delete $self->{_parser};
    }

    return $result;
}
184
185=head2 $lint->errors()
186
187In list context, C<errors> returns all of the errors found in the
188parsed text.  Each error is an object of the type L<HTML::Lint::Error>.
189
190In scalar context, it returns the number of errors found.
191
192=cut
193
sub errors {
    my $self = shift;

    # Work out which API-misuse error (if any) applies right now: parse()
    # or parse_file() was never called, or eof() was never called.
    my $misuse
        = !$self->{_parse_called} ? 'api-parse-not-called'
        : !$self->{_eof_called}   ? 'api-eof-not-called'
        :                           undef;

    if ( defined $misuse ) {
        require Scalar::Util;

        # Report each misuse only once.  Previously every call to errors()
        # pushed another copy, so asking for the count and then for the
        # list (as the SYNOPSIS does) produced duplicate entries.
        #
        # The misuse counts as already reported when the error object we
        # recorded last time is for the same code and file AND is still in
        # the error list.  The last condition makes the report come back
        # after clear_errors(); the file check makes it come back after
        # newfile() switches to a differently named file.
        my $file = defined $self->{_file} ? $self->{_file} : q{};
        my $last = $self->{_api_gripe};

        my $already_reported = 0;
        if ( $last && $last->{code} eq $misuse && $last->{file} eq $file ) {
            my $addr = Scalar::Util::refaddr( $last->{error} );
            $already_reported = grep {
                ref($_) && Scalar::Util::refaddr($_) == $addr
            } @{ $self->{_errors} };
        }

        if ( !$already_reported ) {
            my $before = scalar @{ $self->{_errors} };
            $self->gripe( $misuse );

            # gripe() may drop the error because of the only_types filter;
            # only remember it when it really was added to the list.
            my $errors = $self->{_errors};
            if ( @{$errors} > $before ) {
                $self->{_api_gripe} = {
                    code  => $misuse,
                    file  => $file,
                    error => $errors->[-1],
                };
            }
        }
    }

    # List context: the error objects.  Scalar context: how many there are.
    return wantarray ? @{ $self->{_errors} } : scalar @{ $self->{_errors} };
}
211
212=head2 $lint->clear_errors()
213
214Clears the list of errors, in case you want to print and clear, print and clear.
215
216=cut
217
sub clear_errors {
    my ( $self ) = @_;

    # Replace the list with a brand-new empty array rather than emptying
    # the existing one in place.
    $self->{_errors} = [];

    return;
}
225
226=head2 $lint->only_types( $type1[, $type2...] )
227
Restricts the errors that are kept to the given types.
229
230    $lint->only_types( HTML::Lint::Error::STRUCTURE );
231
232Calling this without parameters makes the object return all possible
233errors.
234
235The error types are C<STRUCTURE>, C<HELPER> and C<FLUFF>.
236See L<HTML::Lint::Error> for details on these types.
237
238=cut
239
sub only_types {
    my ( $self, @types ) = @_;

    # Store a private copy of the wanted types.  An empty list means
    # "no filter": gripe() then keeps every error.
    $self->{_types} = \@types;

    return;
}
247
248=head2 $lint->gripe( $errcode, [$key1=>$val1, ...] )
249
250Adds an error message, in the form of an L<HTML::Lint::Error> object,
251to the list of error messages for the current object.  The file,
252line and column are automatically passed to the L<HTML::Lint::Error>
253constructor, as well as whatever other key value pairs are passed.
254
255For example:
256
257    $lint->gripe( 'attr-repeated', tag => $tag, attr => $attr );
258
259Usually, the user of the object won't call this directly, but just
260in case, here you go.
261
262=cut
263
sub gripe {
    my $self = shift;

    # The error's position comes from the parser object (created on demand
    # by parser()); the file name comes from this object.
    my $file   = $self->{_file};
    my $parser = $self->parser;
    my $line   = $parser->{_line};
    my $column = $parser->{_column};

    # The remaining arguments (error code plus key/value pairs) are handed
    # to the error constructor unchanged.
    my $error = HTML::Lint::Error->new( $file, $line, $column, @_ );

    # With a type filter in place, silently drop errors of other types.
    my $wanted = $self->{_types};
    return if @{$wanted} && !$error->is_type( @{$wanted} );

    push @{ $self->{_errors} }, $error;

    return;
}
277
278
279=head2 $lint->newfile( $filename )
280
281Call C<newfile()> whenever you switch to another file in a batch
282of linting.  Otherwise, the object thinks everything is from the
283same file.  Note that the list of errors is NOT cleared.
284
285Note that I<$filename> does NOT need to match what's put into C<parse()>
286or C<parse_file()>.  It can be a description, a URL, or whatever.
287
288You should call C<newfile()> even if you are only validating one file. If
289you do not call C<newfile()> then your errors will not have a filename
290attached to them.
291
292=cut
293
sub newfile {
    my ( $self, $file ) = @_;

    # Forget the previous file's parser and its parse/eof bookkeeping.
    # The accumulated error list is deliberately left alone.
    delete @{$self}{ qw( _parser _parse_called _eof_called ) };

    # Start the per-file state afresh under the new name.
    @{$self}{ qw( _file _line _column ) } = ( $file, 0, 0 );
    $self->{_first_seen} = {};

    return $self->{_file};
} # newfile
308
# A module file must end its code by returning a true value.
1;
310
311=head1 MODIFYING HTML::LINT'S BEHAVIOR
312
313Sometimes you'll have HTML that for some reason cannot conform to
314HTML::Lint's expectations.  For those instances, you can use HTML
315comments to modify HTML::Lint's behavior.
316
317Say you have an image where for whatever reason you can't get
318dimensions for the image.  This HTML snippet:
319
320    <img src="logo.png" height="120" width="50" alt="Company logo">
321    <img src="that.png">
322
323causes this error:
324
325    foo.html (14:20) <img src="that.png"> tag has no HEIGHT and WIDTH attributes
326
327But if for some reason you can't get those dimensions when you build
328the page, you can at least stop HTML::Lint complaining about it.
329
    <img src="logo.png" height="120" width="50" alt="Company logo">
    <!-- html-lint elem-img-sizes-missing: off, elem-img-alt-missing: off -->
    <img src="that.png">
    <!-- html-lint elem-img-sizes-missing: on, elem-img-alt-missing: on -->
334
335If you want to turn off all HTML::Lint warnings for a block of code, use
336
337    <!-- html-lint all: off -->
338
339And turn them back on with
340
341    <!-- html-lint all: on -->
342
343You don't have to use "on" and "off".  For "on", you can use "true"
344or "1".  For "off", you can use "0" or "false".
345
346For a list of possible errors and their codes, see L<HTML::Lint::Error>,
347or run F<perldoc HTML::Lint::Error>.
348
349=head1 BUGS, WISHES AND CORRESPONDENCE
350
351All bugs and requests are now being handled through GitHub.
352
353    https://github.com/petdance/html-lint/issues
354
355DO NOT send bug reports to http://rt.cpan.org/ or http://code.google.com/
356
357=head1 TODO
358
359=over 4
360
361=item * Check for attributes that require values
362
363=item * <TABLE>s that have no rows.
364
365=item * Form fields that aren't in a FORM
366
367=item * DIVs with nothing in them.
368
369=item * HEIGHT= that have percents in them.
370
371=item * Check for goofy stuff like:
372
373    <b><li></b><b>Hello Reader - Spanish Level 1 (K-3)</b>
374
375=back
376
377=head1 COPYRIGHT & LICENSE
378
379Copyright 2005-2018 Andy Lester.
380
381This program is free software; you can redistribute it and/or modify it
382under the terms of the Artistic License v2.0.
383
384http://www.opensource.org/licenses/Artistic-2.0
385
386Please note that these modules are not products of or supported by the
387employers of the various contributors to the code.
388
389=head1 AUTHOR
390
391Andy Lester, andy at petdance.com
392
393=cut
394
# Second true value; redundant with the "1;" that precedes the trailing POD.
1;
396