1#!/usr/bin/env perl
2# ts=4
3# Warren Block
4# special thanks to Glen Barber for limitless
5# patience and the use of his svn repository
6
7# igor: check man pages and DocBook
8# needs Perl 5.8 or higher
9
10use strict;
11use warnings;
12use locale;
13
14#  Copyright (c) 2012, 2013, 2014 Warren Block
15#  All rights reserved.
16#
17#  Redistribution and use in source and binary forms, with or without
18#  modification, are permitted provided that the following conditions
19#  are met:
20#  1. Redistributions of source code must retain the above copyright
21#     notice, this list of conditions and the following disclaimer.
22#  2. Redistributions in binary form must reproduce the above copyright
23#     notice, this list of conditions and the following disclaimer in the
24#     documentation and/or other materials provided with the distribution.
25#
26#  THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND
27#  ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
28#  IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
29#  ARE DISCLAIMED.  IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE
30#  FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
31#  DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
32#  OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
33#  HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
34#  LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
35#  OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
36#  SUCH DAMAGE.
37
38use Getopt::Std;
39use File::Basename;
40use POSIX qw/strftime/;
41
# external programs used for type detection, decompression, and rendering
my $file  = "/usr/bin/file";
my $gzcat = "/usr/bin/gzcat";
my $bzcat = "/usr/bin/bzcat";
my $man   = "/usr/bin/man";

# directory where temporary copies of the input are written
my $tmpdir = "/tmp";

# revision keyword; usage() derives the version number from it
my $rev = '$Revision: 596 $';

# current file handle, temporary file names, and document date
my $fh;
my $tmpfile;
my $stdinfile;
my $docdate;

# state carried from line to line while a file is being checked
my $prevline     = '';
my $prevnonblank = '';
my $origline;
my $ignoreblock;
my $titleblock = 0;
my $today;

# settings derived from command line options
my $linelensgml;
my $startline;
my $stopline;
my $vid;

# word lists and regular expressions filled in by the init_* subs
my $ignoreblockstart;
my $ignoreblockend;
my %misspelled_words;
my @badphrases;
my @contractions;
my @freebsdobs;
my $lc_regex;
my $uc_regex;
my $fixedcase_regex;
my $ignoreregex;
my $indent_regex;
my $inline_regex;
my $redundantword_regex;
my $redundanttagword_regex;
my @straggler_tags;
my $literalblock_regex;
my $eos_regex;
my @openclose_tags;
my $openclose_regex;
my %opentag;
my $list_regex;
my $parawrap_regex;

# base name and detected type of the file being checked
my $bname;
my $type;

# name this program was invoked as, used in messages and temp file names
my $prog = basename($0);
76
# usage: print the version and a summary of the command line options to
# stdout, then exit with status 0.
# The version is "1." followed by the number in the $rev keyword string.
sub usage {
	# take the revision number from the match itself rather than trusting
	# $1 after an unchecked match
	my ($revnum) = $rev =~ /Revision: (\d+)/;
	my $version = defined $revnum ? "1.$revnum" : "1";
	# the option summary below must agree with the getopts() string and
	# with the -D expansion done in initialize()
	print <<USAGE;
$prog $version
usage: $prog -h
       $prog [-abcdefgilmnoprstuvwxyzDERSWXZ] [-C range] [-L n] [-V vid] file [file ...]

    -h  show summary of command line options and exit

    Output options
        -R        ANSI highlights (use with 'less -R')
        -C range  Restrict output to a range of lines from the source file
        -v        Verbose output
        -V vid    Restrict output to a VID (or 'latest') in a VuXML file
        -X        XML output (overrides -R)

    Tests
        If individual test options are given, only those tests are done.

    Shortcuts
        -z  all standard non-whitespace tests
        -Z  all standard whitespace tests

    Tests for all files
        -a  abbreviations like "e.g.," and "i.e.,"
        -b  bad phrases
        -f  FreeBSD obsolete features
        -r  repeated words
        -s  spelling
        -u  contractions
        -w  whitespace
        -y  style suggestions (off by default)

    mdoc(7) tests
        -d  document date (.Dd)
        -e  sentences should begin on a new line
        -g  See Also xrefs are not duplicated
        -m  mdoc structure requirements
        -p  mdoc whitespace requirements
        -x  additional xref (.Xr) tests (off by default, implies -m)
        -D  all but document date (same as -abefgmprsuw)

    DocBook tests
        -c  title capitalization
        -i  indentation
        -l  long lines (see -L below)
        -n  sentences start with two spaces
        -o  open/close tags match
        -t  tag usage style
        -E  writing style
        -S  straggler tags with undesired content whitespace
        -W  whitespace on SGML indentation

    DocBook test options
        -L n  set line length used in long line test (default 70)

    EXAMPLES

        $prog -R gpart.8.gz | less -R -S
        $prog -R -D -y /usr/share/man/man7/tuning.7.gz | less -R -S
        cat /usr/share/man/man1/csh.1.gz | $prog -D
        $prog -Rz chapter.sgml | less -RS
        $prog -R `find /usr/doc/en_US.ISO8859-1/ -name "*.xml"` | less -RS
        $prog -RD /usr/share/man/man8/* | less -RS

    gzip and bzip2 files are automatically decompressed.
USAGE
	exit 0;
}
147
# option flags filled in by getopts(); one package variable per letter
our ($opt_a, $opt_b, $opt_c, $opt_d, $opt_e, $opt_f, $opt_g, $opt_h,
	 $opt_i, $opt_l, $opt_m, $opt_n, $opt_o, $opt_p, $opt_r, $opt_s,
	 $opt_t, $opt_u, $opt_v, $opt_w, $opt_x, $opt_y, $opt_z, $opt_C,
	 $opt_E, $opt_D, $opt_L, $opt_R, $opt_S, $opt_V, $opt_W, $opt_X,
	 $opt_Z);

# -C, -L, and -V take an argument; all other options are plain flags
getopts('abcdefghilmnoprstuvwxyzC:DEL:RSV:WXZ');

usage() if $opt_h;

# declare first, then assign: "my $x = ... if COND" has undefined
# behaviour according to perlsyn
my $verbose;
$verbose = 1 if $opt_v;
159
# ANSI color names; initialize() builds the escape codes in %ansi
my @colors = qw/ red green yellow blue magenta cyan /;
my %ansi;
my $inverse = "\033[7m";
my $reset   = "\033[0;24;27m";

# left/right markers wrapped around parts of each output line;
# these are the plain-text defaults, initialize() replaces them when
# ANSI highlights are requested
my ($lf, $rf) = ('',  '');	# filename
my ($ll, $lr) = ('',  '');	# line number
my ($lh, $rh) = ('[', ']');	# highlight
my ($li, $ri) = ('[', ']');	# whitespace

# mdoc SEE ALSO section flag and xrefs
my $seealso = 0;
my %seealsoxrefs;

# mdoc macros
my @macros = (qw/ Dd Dt Os Sh_NAME Nm Nd Sh_SYNOPSIS Sh_DESCRIPTION /);
my %macroval;
181
# INT_handler: signal handler installed by initialize() for INT and PIPE.
# Closes the current file handle if one is open, removes temporary files,
# and exits with status 0.
sub INT_handler {
	# $fh is undefined until a file has been opened, and fileno() returns
	# undef for a closed handle; test definedness so that a valid
	# descriptor 0 is not mistaken for "not open"
	if ( defined $fh && defined fileno($fh) ) {
		close $fh or die "could not close filehandle:$!\n";
	}
	removetempfiles();
	exit 0;
}
187
# initialize: one-time setup before any file is processed.
# Computes today's date, builds the ANSI color table, applies the
# command line options (-R, -L, -C, -V and the test selections),
# runs the init_* subs, installs signal handlers, and defaults to
# reading stdin when no file names were given.
sub initialize {
	# today's date; %e pads single-digit days with a space, so
	# collapse the resulting double space
	$today = strftime("%B %e, %Y", localtime);
	$today =~ s/  / /g;

	# ANSI color codes: foreground colors start at 31, in @colors order
	my $code = 31;
	for my $color (@colors) {
		$ansi{"dark$color"} = "\033["   . $code . "m";
		$ansi{"$color"}     = "\033[1;" . $code . "m";
		$code++;
	}
	# minor hackery: darkblue is so dark it needs a white background
	$ansi{"darkblue"} .= "\033[47m";

	# use ANSI highlights
	if ( $opt_R ) {
		($lf, $rf) = ($ansi{darkyellow}, $reset);	# filename
		($ll, $lr) = ($ansi{darkcyan},   $reset);	# line number
		($lh, $rh) = ($ansi{darkgreen},  $reset);	# highlight
		($li, $ri) = ($inverse,          $reset);	# whitespace
	}

	# SGML line length: default 70, overridden by a positive -L value
	$linelensgml = 70;
	if ( defined($opt_L) && ($opt_L =~ /(\d+)/) ) {
		$linelensgml = $1 if $1 > 0;
	}

	# -C start-end limits output to a range of lines
	if ( $opt_C ) {
		($startline, $stopline) = split(':|-', $opt_C);
		unless ( $startline || $stopline ) {
			die "-C option requires a line number range (start- | start-end | -end)\n";
		}
	}

	# -V vid limits output to a range of lines
	if ( $opt_V ) {
		$vid = $opt_V;
		unless ($vid eq 'latest' || $vid =~/.*-.*-/) {
			die "-V requires vulnerability ID like 348bfa69-25a2-11e5-ade1-0011d823eebd\n";
		}
		# start with a line range far past the end of any file
		$startline=999999;
		$stopline = $startline;
	}

	# -D equals -abefgmprsuw
	if ( $opt_D ) {
		$_ = 1 for ($opt_a, $opt_b, $opt_e, $opt_f, $opt_g, $opt_m,
					$opt_p, $opt_r, $opt_s, $opt_u, $opt_w);
	}

	# -z: all non-whitespace tests
	if ( $opt_z ) {
		$_ = 1 for ($opt_a, $opt_b, $opt_c, $opt_d, $opt_e, $opt_f,
					$opt_g, $opt_m, $opt_o, $opt_p, $opt_r, $opt_s,
					$opt_u, $opt_E, $opt_S);
	}

	# -Z: all whitespace tests
	if ( $opt_Z ) {
		$_ = 1 for ($opt_i, $opt_l, $opt_n, $opt_t, $opt_w, $opt_W);
	}

	# -x implies -m
	$opt_m = 1 if $opt_x;

	# if no tests are chosen, do them all (except the off-by-default
	# -x and -y)
	my $chosen = grep { $_ } ($opt_a, $opt_b, $opt_c, $opt_d, $opt_e,
							  $opt_f, $opt_g, $opt_i, $opt_l, $opt_m,
							  $opt_n, $opt_o, $opt_p, $opt_r, $opt_s,
							  $opt_t, $opt_u, $opt_w, $opt_x, $opt_y,
							  $opt_E, $opt_S, $opt_W);
	unless ( $chosen ) {
		$_ = 1 for ($opt_a, $opt_b, $opt_c, $opt_d, $opt_e,
					$opt_f, $opt_g, $opt_i, $opt_l, $opt_m,
					$opt_n, $opt_o, $opt_p, $opt_r, $opt_s,
					$opt_t, $opt_u, $opt_w, $opt_E, $opt_S,
					$opt_W);
		$opt_x = $opt_y = 0;
	}

	# build the word lists and regular expressions used by the tests
	init_ignoreblocks();
	init_spellingerrors();
	init_badphrases();
	init_contractions();
	init_freebsdobs();
	init_doc_titles();
	init_doc_indentation();
	init_doc_sentence();
	init_doc_openclose();
	init_literalblock_regex();
	init_doc_writestyle();
	init_doc_stragglers();

	# ctrl-c handler; do the same thing if the pipe closes
	$SIG{$_} = 'INT_handler' for ('INT', 'PIPE');

	# autoflush
	$| = 1;

	# allow stdin
	push @ARGV, "stdin" unless @ARGV;
}
295
# firstext: return the first extension of a file name, i.e. the text
# between the first dot of the base name and the next dot (or the end).
# Returns an empty string when the base name contains no dot.
sub firstext {
	my ($path) = @_;
	my ($first) = basename($path) =~ /\.(.*?)(?:\.|$)/;
	return defined $first ? $first : '';
}
304
# lastext: return the last extension of a file name, i.e. the text after
# the final dot of the base name.
# Returns an empty string when the base name contains no dot.
sub lastext {
	my ($path) = @_;
	my ($last) = basename($path) =~ /\.([^.]*?)$/;
	return defined $last ? $last : '';
}
313
# baseonly: return the base name of a path with everything from the
# first dot onward removed.
sub baseonly {
	my ($path) = @_;
	(my $stem = basename($path)) =~ s/\..*$//;
	return $stem;
}
320
# tmpfilename: build the name of the temporary file used for an
# uncompressed copy of $fname. The name is made of the temporary
# directory, the program name, the process ID, and the base name and
# first extension of the original file.
sub tmpfilename {
	my ($fname) = @_;
	my $suffix = firstext($fname);
	my $stem   = baseonly($fname);
	return join('', $tmpdir, '/', $prog, '-tmp-', $$, '-', $stem, '.', $suffix);
}
327
# filetype: determine the type of a file.
# Returns one of "troff", "bzip", "gzip", "sgml", "xml", or "unknown".
# The last extension of the name decides when there is one; otherwise
# file(1) is run and the first two words of its output are examined.
sub filetype {
	my $fname = shift;
	# detect type from extension if possible
	my $ext = lastext($fname);
	if ( $ext ) {
		print "detecting file type by extension: '$ext'\n" if $verbose;
		# named extensions are tested before the "contains a digit" man
		# section test; otherwise "bz2" is mistaken for troff because of
		# its trailing digit
		return "bzip"  if $ext =~ /bz2/i;
		return "gzip"  if $ext =~ /gz/i;
		return "sgml"  if $ext =~ /sgml/i;
		return "xml"   if $ext =~ /xml/i;
		return "troff" if $ext =~ /\d{1}/;
		return "unknown";
	}
	# fall back to file(1)
	print "detecting file type with file(1)\n" if $verbose;
	# list-form pipe open bypasses the shell, so file names containing
	# spaces or shell metacharacters are passed through unchanged
	my $out = '';
	if ( open my $pipe, '-|', $file, '-b', $fname ) {
		local $/;
		my $slurp = <$pipe>;
		$out = $slurp if defined $slurp;
		# exit status of file(1) is not significant here
		close $pipe;
	}
	# first two words; only use $1 when this match succeeded
	if ( $out =~ /^(\S+\s+\S+)/ ) {
		my $id = $1;
		return "troff" if $id =~ /^troff/;
		return "sgml"  if $id =~ /^exported SGML/;
		# some DocBook documents are detected as "Lisp/Scheme"
		return "sgml"  if $id =~ /^Lisp\/Scheme/;
		return "gzip"  if $id =~ /^gzip/;
		return "bzip"  if $id =~ /^bzip/;
	}
	return "unknown";
}
361
# uncompress: decompress $fname into a temporary file.
# $type is the compression type as returned by filetype(); it must
# contain "gzip" or "bzip".
# Returns the name of the temporary file. Dies when the type is not
# recognized, when the decompressor cannot be run or fails, or when the
# temporary file cannot be written.
sub uncompress {
	my ($fname, $type) = @_;
	my $tmpfile = tmpfilename($fname);
	print "uncompressing '$fname' to '$tmpfile'\n" if $verbose;

	my $cat;
	if ( $type =~ /gzip/ ) {
		$cat = $gzcat;
	}
	elsif ( $type =~ /bzip/ ) {
		$cat = $bzcat;
	}
	else {
		die "unknown compression type '$type'\n";
	}

	# list-form pipe open bypasses the shell, so file names containing
	# spaces or shell metacharacters are safe
	open my $in, '-|', $cat, $fname
		or die "could not run '$cat':$!\n";
	open my $out, '>', $tmpfile
		or die "could not create '$tmpfile':$!\n";
	binmode $in;
	binmode $out;
	my $buf;
	while ( read($in, $buf, 65536) ) {
		print {$out} $buf or die "could not write '$tmpfile':$!\n";
	}
	close $out or die "could not close '$tmpfile':$!\n";
	# closing a pipe reports the exit status of the child in $?
	close $in
		or die "could not create '$tmpfile': '$cat' failed (status $?)\n";
	return $tmpfile;
}
381
# writestdinfile: copy everything on standard input into a temporary
# file named after the program and the process ID.
# Sets the global $stdinfile and returns the file name.
sub writestdinfile {
	$stdinfile = join('', $tmpdir, '/', $prog, '-stdin.', $$);
	open $fh, ">", $stdinfile or die "could not create '$stdinfile':$!\n";
	# read all of stdin first, then write it out in one print
	my @input = <STDIN>;
	print $fh @input;
	close $fh or die "could not close '$stdinfile':$!\n";
	return $stdinfile;
}
389
# removetempfiles: delete the stdin copy and the uncompressed copy if
# they exist as plain files. Dies when a file cannot be removed.
sub removetempfiles {
	for my $entry ( [ 'stdinfile', $stdinfile ], [ 'tmpfile', $tmpfile ] ) {
		my ($label, $path) = @$entry;
		next unless $path && -f $path;
		print "deleting $label '$path'\n" if $verbose;
		unlink $path or die "could not remove '$path':$!\n";
	}
}
400
# xmlize: escape text for use inside an XML attribute value.
# Replaces the five characters that are special in XML with their
# predefined entities and returns the escaped copy.
sub xmlize {
	my $txt = shift;
	# ampersand must be escaped first, otherwise the entities produced
	# by the following substitutions would be escaped again
	$txt =~ s/&/&amp;/g;
	$txt =~ s/'/&apos;/g;
	$txt =~ s/"/&quot;/g;
	$txt =~ s/</&lt;/g;
	$txt =~ s/>/&gt;/g;
	return $txt;
}
409
# showline: print one message about a line of the file being checked.
#   $bname     name of the file
#   $linenum   line number to report
#   $color     ANSI color for the description, used only with -R
#   $errordesc short description of the problem
#   $txt       text of the line, usually with highlights added
# With -X an XML <error> element is printed instead; it carries only the
# line number and the description.
# Nothing is printed for lines before the start of a -C/-V range.
sub showline {
	my ($bname, $linenum, $color, $errordesc, $txt) = @_;
	# limit output to line number range
	return if $startline && ($. < $startline);

	if ( $opt_X ) {
		print "    <error ";
		print "line=\"$linenum\" ";
		# column and severity are not presently implemented in igor
		print "column=\"1\" ", "severity=\"warning\" ";
		print "message=\"", xmlize($errordesc), "\" ";
		print "source=\"$prog\"", "/>\n";
		return;
	}

	print "$lf$bname$rf:", "$ll$linenum$lr:";
	if ( $opt_R ) {
		print $color;
		print "$errordesc";
		print $reset;
	}
	else {
		print "$errordesc";
	}
	print ":$txt\n";
}
433
# is_lowercase: true when the first character of the word is a
# lowercase letter in the range a-z.
sub is_lowercase {
	my ($candidate) = @_;
	return $candidate =~ /^[a-z]{1}/;
}
438
# is_uppercase: true when the first character of the word is an
# uppercase letter in the range A-Z.
sub is_uppercase {
	my ($candidate) = @_;
	return $candidate =~ /^[A-Z]{1}/;
}
443
# highlight_word: return a copy of $txt in which every literal
# occurrence of $word is wrapped in the highlight markers.
sub highlight_word {
	my ($txt, $word) = @_;
	my $marked = "$lh$word$rh";
	$txt =~ s/\Q$word\E/$marked/g;
	return $txt;
}
449
# highlight_string: return the whole string wrapped in the highlight
# markers.
sub highlight_string {
	my ($txt) = @_;
	return $lh . $txt . $rh;
}
454
# expand_tabs: return a copy of the text with every tab replaced by
# eight spaces (a fixed width, independent of the column).
sub expand_tabs {
	my ($txt) = @_;
	my $eight = ' ' x 8;
	$txt =~ s/\t/$eight/g;
	return $txt;
}
460
# leading_space: return the whitespace at the start of the text with
# tabs expanded by expand_tabs(), or an empty string when the text does
# not start with whitespace.
sub leading_space {
	my $txt = shift;
	# use $1 only when this match succeeded; after a failed match $1
	# still holds the capture from an earlier match, possibly the
	# caller's
	my $leading = ( $txt =~ /^(\s+)/ ) ? $1 : '';
	return expand_tabs($leading);
}
469
# splitter: split a line at the tags that start or end an ignored block.
# Returns the pieces, including the tags themselves, with blank pieces
# removed. A line containing no such tag is returned unchanged as a
# single-element list.
sub splitter {
	my ($txt) = @_;
	if ( $txt =~ /$ignoreblockstart|$ignoreblockend/ ) {
		# the capturing parentheses make split return the delimiters too
		my @pieces = split /($ignoreblockstart|$ignoreblockend)/, $txt;
		return grep { ! /^\s*$/ } @pieces;
	}
	return ($txt);
}
476
# init_ignoreblocks: build the regex source strings that match the start
# and the end of a block to be ignored: SGML comments, marked sections,
# and the literallayout, screen, and programlisting elements.
# Sets the globals $ignoreblockstart and $ignoreblockend.
sub init_ignoreblocks {
	print "initializing ignoreblocks\n" if $verbose;
	# create regex for sgml block start and end
	my @ignoreblock_tags = qw/ literallayout screen programlisting /;

	my @starts = ('<!--', '<!\[', map { "<$_.*?>" } @ignoreblock_tags);
	$ignoreblockstart = '(?:' . join('|', @starts) . ')';

	my @ends = ('-->', '\]\]>', map { "</$_>" } @ignoreblock_tags);
	$ignoreblockend = '(?:' . join('|', @ends) . ')';
}
492
# showwhitespace: return a copy of the text with every tab replaced by
# the visible marker "{tab}".
sub showwhitespace {
	my ($txt) = @_;
	# limit -1 keeps trailing empty fields so trailing tabs are shown too
	return join('{tab}', split(/\t/, $txt, -1));
}
498
499# global tests
500
# abbrevs: check one line for problem abbreviations and report each kind
# found with showline().
#   $bname  name of the file
#   $line   line number
#   $txt    text of the line
# Blank lines and lines inside an ignored block are skipped.
sub abbrevs {
	my ($bname, $line, $txt) = @_;
	return if $txt =~ /^\s*$/;
	return if $ignoreblock;

	# Each entry: pattern and description. In every pattern $1 is the
	# character before the abbreviation (empty at the start of the line)
	# and $2 is the abbreviation to highlight. Detection and
	# highlighting use the same pattern, so the highlight always lands
	# on the occurrence that triggered the message.
	my @checks = (
		[ qr/(\W|^)(c\.f\.)/i,               'use "cf."' ],
		# "e.g." and "i.e." must be followed by a comma (or colon)
		[ qr/(\W|^)(e\.?g\.)(?=[^,:]|$)/,    'no comma after "e.g."' ],
		[ qr/(\W|^)(i\.?e\.)(?=[^,:]|$)/,    'no comma after "i.e."' ],
		[ qr/(\W|^)(a\.k\.a\.)/i,            'use "aka" (AP style)' ],
		# "vs" with or without periods; the trailing period is optional
		# so that a bare "vs" is highlighted too
		[ qr/(\W|^)(v\.?s(?:\.|(?=\s|$)))/i, '"versus" abbreviated' ],
	);

	for my $check (@checks) {
		my ($regex, $desc) = @$check;
		my $marked = $txt;
		if ( $marked =~ s/$regex/$1$lh$2$rh/ ) {
			showline($bname, $line, $ansi{darkmagenta}, $desc, $marked);
		}
	}
}
536
# init_badphrases: fill the global @badphrases with word sequences that
# are reported by badphrases().
sub init_badphrases {
	print "initializing badphrases\n" if $verbose;
	@badphrases = (
		'2nd', '3rd', '3way', '4th', '5th',
		'allow to', 'allows to',
		'become gain', 'be also', 'been also', 'being build',
		'can not', "chroot'd", "compress'd", 'could might', 'could of',
		'equally as',
		'for to', "ftp'd",
		'get take', "gzip'd",
		'in on', 'it self',
		'may will', "mfc'ed", 'might could',
		'often are', "or'ing",
		'that without', 'the a', 'the each', 'the to',
		'this mean that',
		'to can', 'to for', 'to of', 'to performs',
		'will has', 'with to', 'would of',
	);
}
550
# badphrases: report every phrase from @badphrases found on the line,
# and phrases that begin at the end of the previous line ($prevline) and
# continue on this one.
#   $bname  name of the file
#   $line   line number
#   $txt    text of the line
sub badphrases {
	my ($bname, $line, $txt) = @_;
	my $txtbak = $txt;
	return if $txt =~ /^\s*$/;

	PHRASE:
	for my $bad (@badphrases) {
		# phrase entirely on this line:
		# check for a loose but fast match first
		my $marked = $txtbak;
		if ( $marked =~ /\Q$bad\E/i
			 && $marked =~ s/\b(\Q$bad\E)\b/$lh$1$rh/i ) {
			showline($bname, $line, $ansi{yellow}, 'bad phrase', $marked);
		}

		# detect bad phrases wrapping over two lines
		# skip this test if the phrase was all on the previous line
		next PHRASE if $prevline =~ /\Q$bad\E\b/i;

		my $joined = "$prevline $txtbak";
		next PHRASE unless $joined =~ /\Q$bad\E\b/i;

		# move words from the right part to the left part until the
		# left part is what the previous line ends with
		my @right   = split /\s/, $bad;
		my @left    = ();
		my $leftstr = '';
		while ( @right ) {
			push @left, shift @right;
			$leftstr = join ' ', @left;
			last if $prevline =~ /(\Q$leftstr\E)\s*$/i;
		}

		# the left part grew into the whole phrase: no wrap was found
		next PHRASE if $leftstr =~ /\Q$bad\E/;

		showline($bname, $line - 1, $ansi{yellow}, 'bad phrase',
			"... $lh$leftstr$rh");
		my $rightstr = join ' ', @right;
		my $current  = $txtbak;
		$current =~ s/(\Q$rightstr\E)/$lh$1$rh/i;
		showline($bname, $line, $ansi{yellow}, 'bad phrase', $current);
	}
}
590
# init_contractions: fill the global @contractions with the contractions
# that are reported by contractions().
sub init_contractions {
	print "initializing contractions\n" if $verbose;
	@contractions = (
		"aren't", "can't",
		"doesn't", "don't",
		"hasn't",
		"i'll", "i'm", "isn't", "it's", "i've",
		"let's",
		"shouldn't",
		"that's", "they'll",
		"you're", "you've",
		"we'd", "we'll", "we're", "we've",
		"won't", "would've",
	);
}
599
# contractions: report every contraction from @contractions found on the
# line, matching case-insensitively on word boundaries.
#   $bname  name of the file
#   $line   line number
#   $txt    text of the line
sub contractions {
	my ($bname, $line, $txt) = @_;
	my $txtbak = $txt;
	return if $txt =~ /^\s*$/;

	for my $con (@contractions) {
		# loose but fast test before the word-boundary substitution
		next unless $txtbak =~ /\Q$con\E/i;
		my $marked = $txtbak;
		next unless $marked =~ s/\b(\Q$con\E)\b/$lh$1$rh/i;
		showline($bname, $line, $ansi{yellow}, 'contraction', $marked);
	}
}
614
# init_freebsdobs: fill the global @freebsdobs with the names of
# obsolete FreeBSD features that are reported by freebsdobsolete().
sub init_freebsdobs {
	print "initializing FreeBSDobs\n" if $verbose;
	@freebsdobs = ('cvsup');
}
619
# freebsdobsolete: report words from @freebsdobs found on the line.
#   $bname  name of the file
#   $line   line number
#   $txt    text of the line
# A word is reported only when it follows whitespace and is followed by
# at least one character other than a period.
sub freebsdobsolete {
	my ($bname, $line, $txt) = @_;
	return if $txt =~ /^\s*$/;

	for my $word (@freebsdobs) {
		# close the highlight with $rh; $lr is the line number marker
		# and is empty without -R, which left the highlight unclosed
		if ( $txt =~ s/(\s+)($word)([^.]+.*)$/$1$lh$2$rh$3/ ) {
			showline($bname, $line, $ansi{darkgreen}, 'freebsd-obsolete', $txt);
		}
	}
}
630
# repeatedwords: report words repeated next to each other on the line,
# and a word at the start of the line that repeats the last word of the
# previous line ($prevline).
#   $bname  name of the file
#   $line   line number
#   $txt    text of the line
sub repeatedwords {
	my ($bname, $line, $txt) = @_;
	return if $txt =~ /^\s*$/;

	my $txtbak = $txt;
	# count how often each token occurs on this line
	my %count = ();
	my @words = grep(! /^\s*$/, split /\b/, $txt);
	$count{$_}++ for @words;
	my @multiples = grep { $count{$_} > 1 } keys %count;
	for my $word (@multiples) {
		# skip special cases
		# repeated numbers
		next if $word =~ /\d{1}/;
		# repeated slashes
		next if $word eq '/';
		# repeated rows of dashes
		next if $word =~ /-+/;
		# repeated rows of underscores
		next if $word =~ /_+/;
		# skip some mdoc commands
		next if $word =~ /Fl|Ns|Oc|Oo/;
		$txt = $txtbak;
		if ( $txt =~ s/\b(\Q$word\E\s+\Q$word\E)\b/$lh$1$rh/i ) {
			print "repeatedwords: repeat found:'$word'\n" if $verbose;
			showline($bname, $line, $ansi{darkred}, 'repeated', $txt);
		}
	}

	# check for repeated word from the end of the previous line
	# to the beginning of the current line
	# start from the unmodified text: highlight markers left over from
	# the loop above would keep the anchored match below from matching
	$txt = $txtbak;
	# use the captures only when the match succeeded; after a failed
	# match $1 and $2 still hold values from an earlier match
	my ($cmd, $prevlastword) = ('', '');
	if ( $prevline =~ m%(\w+\s+)*(\S+\s*)$% ) {
		$cmd          = $1 if $1;
		$prevlastword = $2 if $2;
	}
	# short-circuit when the previous line...
	# had no last word
	return unless $prevlastword;
	# didn't repeat any of the words on the current line
	$count{$prevlastword}++;
	return unless $count{$prevlastword} > 1;
	# was a groff(7) comment
	return if $prevlastword eq '.c';
	# was a groff(7) zero-space character for tables (\&.)
	return if $prevlastword eq '\&.';
	# was a single non-word character
	return if $prevlastword =~ /^\W{1}$/;
	# was an mdoc(7) or nroff(7) comment
	return if $prevlastword =~ /^\W{1}\\\"/;
	# was an mdoc command
	return if $prevlastword =~ /\.(?:Ar|Oo|Nm|Tp)/i;
	# when the next-to-last word was an mdoc command
	return if $cmd =~ /Ar |Cm |Fa |Em |Ic |Ip |It |Li |Pa |Ss /i;
	if ( $txt =~ s/^\s*(\Q$prevlastword\E)(\s+.*)$/$lh$1$rh$2/ ) {
		showline($bname, $line - 1, $ansi{darkred}, 'repeated',
			"... $cmd$lh$prevlastword$rh");
		showline($bname, $line, $ansi{darkred}, 'repeated', $txt);
	}
}
689
690# read an external file of spelling errors
691# the misspelled word is the first sequence of \w or ' characters
692# up to a non-word character
# readspelling: add the words from an external file of spelling errors
# to the global %misspelled_words.
# Empty lines and lines starting with "#" are ignored. On every other
# line the misspelled word is the first run of word characters or
# apostrophes, which must be followed by a non-word character.
# Dies when the file cannot be opened or closed.
sub readspelling {
	my ($spname) = @_;
	print "adding spelling file '$spname'\n" if $verbose;
	open my $sf, '<', $spname or die "cannot open '$spname':$!\n";
	my $added = 0;
	while ( <$sf> ) {
		# skip empty lines and comments
		next if /^$/ || /^\s*#/;
		next unless /^\s*((?:\w|\')+)\W+/;
		$misspelled_words{$1} = 1;
		$added++;
	}
	close $sf or die "could not close '$spname':$!\n";
	print "added misspellings: $added\n" if $verbose;
}
709
# list of commonly misspelled words
711sub init_spellingerrors {
712	print "initializing spellingerrors\n" if $verbose;
713	for my $word (qw/ &nbps; aan abel abismal abjectely ablve abondan abotu abour abouy abscence absense
714		absolue absolut absolutelly absolutly absoulte abuttes accelleration acceptible acces accesable
715		accesed accesing accessable accidentaly accidently acclerate acclerating accomadate accomodate
716		accoring accound accpeted accroding accross accuarate acculate acess achitecture achive
717		acknowledgent acquisions acse actal actaully activly actuall actualy actyually acutally adavnce
718		adddress adde addesses addiotional additionnal additonal additonally addres addreses addressess
719		addresss addtions adecuate adhear adhearance adherance adiministration adjustement administator
720		adminstrator adminstrators admited adress adressed adresses advence adventerous advertisment
721		advetise advetised adviasory advices aer afterall afternoont agai agains ageing aggree
722		aggregatable aggresive aggresively agian agregate agregation agressive agressively agrivating
723		agument ahold ahte ahve akses aksually alaram albel albels alergic algoritm alignement allign
724		alligned allmost alloacted allos allready allright allthough allways alot alreday alredy alright
725		altenrative althought althougn altough amasing amke ammend ammount amn amybe analasys analyizing
726		ancestory ande anderstand andthe ane anf annonymous annotatation annotatations annoucement
727		annoucing announcment annoyting anonnyed anonymus anormalous ansamble answeres antiq anual anyay
728		anyhitng anyhoo anymore anyonw anyore anythign anyways anywere aobut apac apllay apllication
729		apparant apparantly apparentely apparentry apparnetly appart appartment appearence appearred
730		appendencies apperantly appercaite appers appicable appleances appleis appliabce applicatin
731		applieds applogize appraently appriciate appriciated appropiate approprate apreciate apreciated
732		apropriate aproval aptch aqueue arbitary arbitrafy arbritrary archiecture architectual arent
733		arguements arguemnt aritmetic aritmetics arledy arond aroudn aroung arrisen arround arrray
734		artikels aslo asoc asparin assigenments assocation assoicated assotiations assumtion aswers
735		asychronous asynchonously asynchroneous atack athalon athe athentication athough atleast atrget
736		atribute attachements attatude attemps attemts atuomatic atuomatically augus autentication
737		autheinticating authenticatication authention authetnication authoratative authorative authorty
738		automaticall automaticaly automaticly autonimous avaiable avaialble avaible availabe availabel
739		availablity availbility availible availiblity avaliable avalibale avalible avilable aweful
740		awhile awlso awsome axatly axcuse bannana bartition basec basicly basse bateria baybe beachmark
741		beacuse beated becasue becease beceause becuase becuse beeing beffer beggining begining beginnig
742		behaiver behauvier behvaiour beign beleive belive belived benefitial benfit benifit beoken
743		beowser ber berak bercause berkley beseuse besure beter bettr beurocratic beween beyonf bgack
744		bgiger bheve bikesheding bince bineary birght birt blatently bloatwed bloging bnechmark boostrap
745		boostrapping bootabe bootleneck bootlenecks bootsrap boradband bordism borken borre borred
746		borring boting bottem bottonm boundries boundry boxd bradband branche briner bringign brocessor
747		broked brokeness broser brower browesable browseable browswer btit buch bugzills buidl buildling
748		buildt buile buillt buld bulding bulds bultin burried bycicle bysect bysected byt cacheing
749		calatog calcualted cale calender calles caluclate caluclated camllia campatibility cange cannnot
750		cannonical cant capabilites capabilties capabiltiy capabily capitzliation captial captialism
751		caracteristics casse casues catagory catched cathegory ceep ceratin ceratinly cerificate
752		certaintly certian certifcate certifcates certificat certifictate certiin certiinly chace
753		chacing chaged challange challanging chane chang changable changess changs chaning chanse
754		charakteristic charakteristics cheapter chech checkng checksuming chek chekc cheked cheking
755		chhosing chian chipest choise choosed choosen choses chronologocal chunck chuncking cince
756		cirruption claimst clal clarifynig classifcation cleand cleandepened clearification clearl clen
757		cliens cluter cmmit cmopile cmopiles cmplain cmplaining cna cnanot cnditions cobsidered
758		cofiguration colision colisions colom comands comapred combersome comemnts comiling comit
759		comiters comitted comitter comlplex commandline commen commenly commer commerical commericial
760		comming commited commiter commiters commiting committment committs commnad commnads commnand
761		commnications communciation communciations comooil comooiled compability comparision
762		comparisions compatability compatabilty compatablity compatiable compatibilty compatiblity
763		compentens compiel compilcated compilling compiltaion complaing complainig comples complet
764		completly completness complie componet componetn compontens comprimise compromiseable
765		comptemporary comsume comsumed comsumption comunication comunity concatanated concensus
766		conctacted conect conected conection conerter conerters configrable configration configuation
767		configuraiton configurate confimation confiuration confiused confugure confussion congraturation
768		congraturations conitinue conjob conjuction connecion connecs connecter connecters connectin
769		connenctions connet conneting connnects conntact conntect conpact conputer conreoller consensu
770		consept consequtive conservatie considerd consistant consistentency consitute conslusion
771		construcgtor consuption contai containg contect conteins contens continously continu continus
772		contiune contol contrained contribuition contributer contributers controled controler controll
773		conujunction conut conuter conuters conveinently convelient conveniece convertion convesation
774		convienient convinience coordinatory coorparative copiedd copmiler copmilers coppied corectly
775		correced correctely correcture correleate corresponsding corrsponding cosnole costantly couldnt
776		cound cource courious courve coyping crach craching crahs crahsed crasch crasching crassing
777		crasy crazyness creapage creapt creat creatopm credentail credentails creeate crnuch crnuching
778		csvup cuase curcuit currenly currentlu currnetly currrently curser customaril custommer
779		custommers cuttoff cuty cvould cvs2vn damange damanged datas dayt dbout deactive deaemon
780		deaemons deafult dealocates deamon deamons deault debuf debuging decendant decentant decicission
781		decidely decission declerations decliens decompresssion decribed decriptor ded defalt defaut
782		defautl deffirent definate definately definiately definitiely definitly definitons defintion
783		degradate degugging dehaviour deicde deine deines deivce dekstop delcared delending deley
784		deliever delievers dellicious delste demnstrate depcreation depdendency depedancy depedencies
785		depedency depeding depednent depencdny dependacies dependancies dependancy dependancys dependant
786		dependding dependeancy dependeant dependecies dependecy dependend dependendencies dependiences
787		dependiency dependig depenesis deploies deprechated deprectated depricated derivats derrivates
788		desapointed desaster desasters descendand descendents desciptors descirption descrete describd
789		descrpition descrption desease deseases desing desireable desperatly despert desprate
790		destinatino destine destory detatched detec detecing detemine deterined devdeloper deveation
791		deveices develoeprs developement developeminet developped developper developpers developre
792		developvers devestate devestating devide devided devies devinces devisions devives devleop devot
793		diablog dictaded dictonary did'n didicated didnt didsk didunt dieing diferent diffence
794		differenciate differencies differenlty differents differnce differnces differnece differnetiates
795		differnt diffrent diffrently diffsof dificult dificulty diging dilema diliver dilligence dind't
796		dindt diphthongs dircet dirctory directorys diretly diretories diretory dirft dirver diry disabe
797		disappered disasterous disclamier discourraged discoverd discuessed dismouted dispair dispalay
798		dispaly dissable dissabled dissapeared dissapointment dissillusioned distain distiguish distord
799		distorded distribition distribitions distribtue distributted distribvution distrubute
800		distrubuted dnow docuentary documantation documenation documentaiton documentatino
801		documentiation documention documetation documtns doen't doesen doesent doesnot doesnt doest
802		domainmame domani donatiosn donde donn't donot dont donw dor dotally doues droped droping
803		drustrating ducplications duplictiy duratoin duratoins durign durning durring dwsktop dynaic ean
804		eanble earler easely eather ebeen ecah eceived ecourage ecouraged ect ecurrent effecive effetive
805		effetively efficancy efficency efficent efficently effor efford eficciently efter ehere elememt
806		elipsis elliminates emaling embaress embaressing eme emial emporer enabe enbale enchanced
807		enclousure enconter encrypion encyrpt encyrpted ende endianess endoresed endtdate enior
808		enivorement enoountering enought enourmous enow enscrambled ensute enteries enterprse
809		enthusiatic entierly entites enviorement enviornment enviornmental envirionment enviroment
810		enviroments environement environnement equipted equivalen equivilent erebuild erlier erliere
811		errore errorneusly erros escolated esier esiest esle esome essense estracting ethenret etherenet
812		ething ethings etnry evenning eventaul eventaully eventhough everthing everythign everytime
813		everyting evet eveyr evne evreyone ewhich exagerate examble exapnd excactly excat exceedes
814		excelent excellant excercize excersise excert excesive exclusivly execept execption execptions
815		exectable exectables exectuable execuation exellent exemple exemtion exeption exercice exibits
816		exisiting exisitng existance existsing exmaple expalin expecially experied experince expession
817		expiremental expirience expirt explaination explainations explaned explans explantation
818		explatnation explicitely exponentionally exquse exsits exstra extemely exteneded extenstions
819		extentensible extention extentions extranious extreemly extremly facilites facter faield failded
820		faile failes failur faimiliar faliure falsh familar farwarding fase faught feasable febuary
821		fecth feebsd feelt fgights fianlly fids fiel fiels fien fienw figureing fileame filewall filks
822		filname finaly firmwares firmwrae fisrt fitler fixe fixen fixztion flages flasg flexable focuss
823		folkz folllowed follwo follwoing follwong folow folowed folowing fomr forbiden forcable forece
824		foreignphrse forgoten formate formated formost fornated forsee forthermore forusers fot foto
825		fotos foudn foward fowarding fractoinal fraemwork fragemented fragmentated fragmentatio
826		frameowkr fransisco frebsd freedback freeed freezed freind frequence freze frezze frome fthernet
827		fucntion fuction fulfil funcational funcition functionmames functionnality functoin functuion
828		funtion furhter furthur fush futher futur fysical gaint garanties gatherd gauging gaurd geeting
829		generaly generat genertaes geniue geograhically gernal gernerates gettign ghostscrip giove
830		givent glas gnerated gnoime godo gohostscrip goiung gonna gonne goot gotta grafic grammer grap
831		grapics gratefull grately graub greaet greate greatful greatfully greif grpahs gruop gthe guage
832		guarateed guarentee guarenteed guarentees guarranteed guidence gurantees hackyness hade haed hai
833		haing halp hanbook handeling hapen hapilly happend happended happends happing happpens hardisk
834		hardwares hardwrae harmpless harrass harrassment harsch hashs hatered hav havent havfe havn't
835		headup healt heavly heirarchy hellon helpfuk helpfull hep hereon hessitate hessitation
836		hexadecimals hexidecimal hibarnate hibarnating hiearchy hierachy hierarchial hierarhy higest
837		hight higly hinderences hiuge hobbiest hodling homours honets honnest honnestly honnor honnorr
838		honnorred honnors honst hookled hopful hopfully horiztonal horiztonally hounderd hounderds
839		howeber howevrr hsotname hsotnames htat hte hter htere hthe htink htis hunderts hypens
840		hypervisior hypocracy ibn idee identially identifer identifers identifiy identiy idff
841		idosyncracies iea ifhghting im imagen imagening imatating imbeded imeplementation immanent
842		immediatly immenent immidiatly immitating impariment impedence impelment implemenation
843		implementaitons implementating implementng implemetation implemetn implentor implicitely
844		implicits impliment implimentation implmentation imporant imporvement imposable imposible
845		improbe improove improoved improvments imprted inacativity inaccesible inadvertant inadvertantly
846		incase incedent incldue incluseion incomming incompeents incomptaible inconsistancy inconsitent
847		inconvienent incopatible incrase incrimental incrment incrmental indefinately indefinitly
848		indended indendently indentical indentifier indentifiers indention indentions indepedently
849		independant independantly independendly independet indepth indestrcteble indiate indiciations
850		indicies indivual indivudual indstalled inetersting infact infavour infomation informations
851		informatoin infrastcture infrasture infromation inherity inital initalise initalization
852		initalize initalized initiatior initiliased initilize inititialization inport inpossible inpunt
853		inputed inquiery insall insatll insatlled insensivite instace instal instalation instaled
854		instaler installad installaed installaing installatio installtion installtions instanciation
855		insted insteresting instractions instructuions instuctions intall intallation integerate
856		integreated integrituy intendend intepretation interal interations interchangable interchangably
857		interconverts interes interesitng interesst interessting intereting interfactive interfer
858		interferring intergrated interist interisting intermal intermittant intermittantly internaly
859		internat interneal interogate interpretedt interpretted interpretter interpretting interressing
860		interrest interresting interrestingly interrim interrups intersting interupt intial
861		intialization intialize intolerate intregate intrest intresting introduceing intruction
862		invarients invicible invole involes involvemnt invoplved invovle irt isnt isntall isoltation
863		isonly issueing ist istead isuus isystem ita iteinerant itelf ith itnel itseld ive iwll jailes
864		joing jornal jounal jsut juged juste kenrel kerel kerenel kerenels kerenl kerle kernal kernell
865		kernl keybaord killled kno knowlegde knowlege knowlodgeable knwo kust kwyrod labes lable lables
866		laeyer lagacy lanaguage langage languge laods larged lastest laterly latley latre laught
867		laughted layed layput lazyness leasure leat leav legitimite lemme lenght leson leter lettesrs
868		lexicographal lgertimately libararies libary librairies libraray libraris libraru licencing
869		licene liek lien liesure lightnig ligned liinux likeing likly liks limtations lineair linerly
870		ling liniarly lised lisens listet listning lite literrally littel litteral litterally liviness
871		llow lniux loadeded loally locak localy loccked locically loder loged loggoued loggs loging
872		loink lok loke lokking loks looh lookig lookking looksy loopack loosing loosly losseless lpatop
873		lpdng lter ltieral mabe maby mabye macademia machien machiens machin machince machinew maching
874		machne macrow macrows maek mahually mailling maintainace maintainance maintaince maintanance
875		maintaned maintanence maintenable maintence maintened maintener mames manageement managemnet
876		managent managment mananged manangement manaul manditory mangagement manged mangment manpage
877		manpages manuallying manualy manuell manufactring manyally marcro marcros markkup maschine maske
878		mater mathced maun maxaximum maximium maximun mdorn meaninful meantine measusre mechanim
879		mechanims mechiansm mechnism mechnisms memeber memery memroy ment mentined mentionned menue meny
880		mergeing mericracy meriticracy merrits mes messege messgae messgaes metada methode mfcd
881		micrcontroller microbnechmark mininum minmum minumum minut minuts miror mis miscelleneous
882		miscellenious mising misprediced missign missinc missking misspeling missplelling misterious
883		mistery mistypted misunterstood mkaes mke moble modifing modifiy modifiyng modiying momment
884		monalithic moniter monolitic mont montherboard montor montoring monut monuted mooved moter
885		motercycle motercycles motiviation moudels mountign mpre mssing multile multipled multipy
886		mutiple mutualy mvoed mysefl myst myt namming natioal natsy ncessary ncie nderstand necassary
887		neccasary neccesary neccesery neccessary necesary necessairely necessarely needto neet neetwork
888		neglegt negociate negociated neightbor nemisis nescessarily nescessary nessesery nimber nintees
889		nobady noet noice noipe nomally nonexistant noone normanlly notaions notavailable notefection
890		nothern nothin noticable notied notofocations notquite nouvou numberic numer numner nusance
891		nutrual obejct obfascated objejcts obselete obsolote obsticles obvoius ocassionaly occassion
892		occassionally occassions occation occations occurance occured occurence occurences occuried
893		occuring ocure oether ofcourse offenseive offical ofr oftem okey om ommisions ommit ommited
894		ommitt ommitted omre omrning onfigured ongoin onl onle onlne onlt onsult onthe ontop onts onw
895		ony oparation operationg opertunity opion opperation oppertunity oppinion oppions oppisite
896		oprations oprion oprions optial optiion optionsal optoin ordenary orginal orginally originaes
897		origine orignal ot otehr otsuts ouf ouput ouputing outher outout outstaning outtage overhall
898		overidden overlaping overlayed overrided overriden overritten overwritting ovre owkr pacakge
899		pacakges pachae packge packges padd padds paert painfull panices parallell paramenter parametr
900		parametrs paramtere paramters paranthesis paremeter parenticies parhaps parition paritioning
901		paritions parntheses parrallel parrellel partameters partialy particualar particulary partion
902		partions partionting partiton partitoning partitons passprhase passtrough passwrd pasto patche
903		patchex pathalogical pathces pathes peaople peform peformance peformed peice peices pengiun
904		peopel pepetual pepetually perfecly perfom perfomance perfoms perfor perfored performace
905		performancing performence performend perhas periperal peripherial peripherials permanant
906		permantly peroid persisent persistant personnal personnally personnaly persoon pertubation
907		peticular pevious pfew pgk phabriator pheraps phisical phoneix phorase phyiscall physcal physial
908		physicaly piblic pitty placte plaing plateform platfrom platorms pleae plin plisss poatch poblem
909		poblematic poeple pofessional poinitng poirts poitn poitner politley poluting polution pople
910		popularuity pordriere porevious porject porrtability porst portes portupgrde posible positiv
911		positve possability possbile posseses possibillity possilbe possition postion postitions postive
912		postress poting potr potupgrade poud poudirere poudrier poudrierre pourdiere pourdriere
913		pouridere poweful powerfull poyrts prameter pratcice preatty preblem preceed preceeded
914		preceeding preceeds precice precidence predictibly preemtive prefere prefered prefering
915		preferrable preferrably preffer preffered prefferred preform preformance premissions
916		preoblematic prepair prepairing preperation preperations preprend preprietary preprocesor
917		presense presidence presonally presumeably prety pretyt preume prevelent previos previouse
918		previousely previus prevoius pricipal primative primatives princial principes priorisation
919		priotity prirority pritn pritnf pritnfs privelege priveleged priviledge priviledges privilige
920		privledged privleges probabilly probabyl probaly probbaly probblem probem problaly proble
921		problen problme problmes probobly proccess proccesses proceedure proces proceses procols
922		proctect proepr proeprly profesional profesionals proffesional profie profilier profissional
923		progam progams progess programable programatic programlistning programm programms progrtam
924		projcet projecte prolematic prolonges promiscous promiscuos promisive promissed promissing
925		prompot promt proove propaged proped propegation propigate propogate propogation propolsal
926		proporion proporty propper propreitary propreitery propsing prorammer prorgram prosessor prot
927		protcol protcols protec prots provde provent provice providre pseuuedo pshycial pssword psuedo
928		ptach ptiner pudate puncing puroses pursache pursached puting qeustion quandries quard quater
929		quaterly queestion querys quesston questionr questoin questsions queueing qui quickier quiety
930		quirck quire quitted quoteas rabase rabased rabmling rae rans rapidely rase rasing raspberri
931		rater reactoin readd readning realated realloacted realy realyl reaosn reasoably reasonnable
932		reassambled reate reboote rebove rebuilded rebuitling rebult reccomended receieve recevied
933		recieve recieved recinded recive recoide recomend recomendation recomended recommand recommanded
934		recommanding recommened recommented reconigize recrusively redable redering rediculous
935		redundantcy reeated reelvent reember refered referes refering refernce refernces refernece
936		refferance refreind refridgerator refulat regardes regened regularely regularlly regulat reguls
937		reivew reized relaly relase relases relavent releated relese relesed relevent reloation
938		reloations relply rember remdial remebered remebers rememver remmeber remobal remvoe remvoed
939		rendtion repare reparing repative repetion repitition repititions replaceing replacemnet
940		replases replce repleaced reponding reponse reponses reponsible reposotory repostory
941		reprecussion reprecussions reproducable reproducibily reproductible reprository repy requiment
942		requireing requiretd requirments requistes requred rerurn resampeling resaonnable resemblence
943		resently resetart resetted resiilver resilliancy resillience resilliency resillient resise
944		resistnace resitor resitors resivoir reslove resloving resolf resonable resonably resons resouce
945		respecitively responce respository respresentation resseler ressources restaring restartet
946		restaurnat restaurnats resuce resuerrect resuerrecting resurections resusccitate rethnik retnia
947		retreive retrive returs reuild revalent reveiw reversse revison revisons rewcursion rewite
948		rewriten rezervation riddens rigth riht rmeoval rmore rmove roken roling rott roughy rreally
949		rreplace rrquest rudamentary runing runinig runnig runnign runnnig runnning ruote ruter sacn
950		saerch safed sahred saif saior sais salavge satsify saturage scenartio sched scheduld
951		schedulling scritp scrubing scrupt seached secction secend secion secions secondes secttion
952		secturity secund securiy seemless seemlessly seens seether senarios sence sendt sepaking
953		separatly separe separtely sepcial sepcific sepcifies sepcify seperate seperated seperately
954		seperates seperating seperation seperator seprate sequencially serching sercurity serie
955		seriosuly serius serivce serveral servicability servise sesion setable setiing seting setings
956		settt sevice sexond sey shae shaer shaers sheding shepard shepards shephard shepharding shooping
957		shoping shoud shoudl shoudn't shoulld shrinked shuld shure shuting shyed siginificant
958		significnat signle siilar sile sime similat simillar simpel simpl simplfied simplier simpliifed
959		simplyfies simular simultanious simultaniously singel singeling singels singnificant sinificant
960		sinse sintax sistems sitll skiped sligh slighly slove sloved slpw slue smaler smebody smeone
961		snapshoted snoflake snopped soe soehow soemone soemones soemthing soething softaware softner
962		softwae sofware sohuld soif soley solition solusion someoene somes somethign somethng sometime
963		someting sometjhing someway somoene somthing somwhere sonud sonuds soo soruce sparce spearator
964		specfic specifes specifi specifiaction specificially specificly specifig specifing specifiy
965		specifiying specifyed spectacte speficy sperate spesific spindels spititng splic spliting
966		splitted spose spreadth srews srtuff srync ssorted sspares ssytem stabalization stadnard
967		stairing standart standerd stantdard startet starup staticlly statuc steller steping stilla
968		stiring stkicks stoll stollen stoped stoping stoppe stoppped straigh strang strangly strat
969		strategie strenght striaght stricktly strippped stroage structurees stucture stuf stystem subet
970		submited submiter submitt substaintally substition substract substraction subsytem subsytems
971		subverion succed succeded succeds succesful succesfully successfull successfuly suceeding
972		sucesfully sucess sucessful sucessfull sucessors suckser sucksers suddently sudirectories
973		suffecient sufficent suficient sugesstion sugest sugested suggesiton suggestsions suggetion
974		suggetions sugroup suject sumbit sume superceed superiour supoose suposed suposedly suppor
975		suppotr suppotred supprts supress supressed supresses suprise suprised suprising surpise
976		surpised surpressed surprice surpriced surprize surprized surronded surroudn surroudning
977		susbtitute suspec suspection sutiable swape swepped swich switche swith swithc switich
978		switiching swop syas symetrical symtom symtoms synchronisaton syncrhonous syncrhonously
979		synonomous sysem sysetm syslodg systeam systme systmes sytem sytems sytsem taged taging taht
980		tahts talkes targer tat te teamm techer techical techincally techncially teh tehre tehse tehy
981		tempaltes temperatire templaitize temporarely tenticles tere terirrlbe termal termonology
982		termperature termporary tey th tha thaat thaknk thakns thank's thankje thansk thanx thatis thats
983		thay theese thefirst themeing thems themself theoraticly theorethically ther therads therefor
984		theres therory thets theyre theyve thie thier thign thigns thingking thinke thinkg thinkw
985		thinling thirs thnak thnig thnk tho thos thouch thoug thougt thouogh threated thremal throgh
986		throtteling throug throughly throught throuhg throwed thru thrugh tht thta thudner thwo thye ti
987		tiems tihngs tihs timestatmp tinket tinketing tipycal tirck tird titeled tlak tlaking tnan
988		todays todl togehter togethe tohers tols tomake tommorow toolcain toolchian topick totaly
989		tottaly tought tougue tpage tpye traafic tradeing traditoin traditoinal tradtional trafic
990		trailling tranalation tranalations tranfer tranfered tranfers transfered transfering
991		translateion translater translaters transltion transmision traslate traslation treatement trid
992		triede triewd trigonmetric tring tripple trival trnaslate trnaslated trofy troublehsooting
993		troubleshoute troughout trow trows trpi trrue trry trubolsome truely trully tryed tryied tryign
994		tsable tsart tsill tsrarted tthe tthis tu tunning tunr turend turnt tutoriales tye tyhrow
995		typicall typicaly udnerstand udnerstandable udpate udpates uesd uisng umounted uncapable
996		unchaged unchange uncoment unconsistent undefinied undefinitely undeflowed undersatnd understadn
997		understadning understandlable understandood understaning underway undescores undesireable
998		undestand undustrialized unecessary unecrypted unfortauntely unfortenately unfortuante
999		unfortunatelly unfortunatly unfortuntelly unfrastructure unfreezed uniion uniquily unitentinally
1000		unknwn unkown unlinke unmouting unnceccessary unneccessary unnecssary unprivilegded unrelevant
1001		unresolveable unreversable unsubstanciated unsuccesful unsucessfully unsutiable untill untis
1002		unuseable upate updaing updateing updte updtes upgade upgaded upgarde upo uppon uprade upsteam
1003		upstrewams upto ur usally useable useage usedul usefull useing usesd usign ussage usse ussually
1004		usully utilites utilties uttrerly vagrand varaible varanty varialbe varialbes varliable varois
1005		varoius vender vengeace veresion verion verison verry versionned versionning versoin verty
1006		veryify virtial virutal visable voa volenteer volenteers voltave vontinues votlage vulnability
1007		waas waht wahtever wakupe wantd warant waranted wass webupage wecam wehre wek wel wer wether
1008		whanever whats whcih whe whene whereever wheres whewn whhich whie whihc whilte whinning whish
1009		whit whith whne wht wich wierd wiht wihtout wilde wirh wirtten wistle wistles witdh withe
1010		withhin withing withme withough withouth witk witout witt wlll wnat wnats wnet wo wonderfull
1011		woner wont wor worflows workint workoad workoads workstion worng worrty woth woud woudl wouldbe
1012		wouldnt wouls wranty wraper wriatble writen writtend writting wroking wroute wsouse wuch
1013		xontains ycould yea yeild yeilds yesm yhe youd youi youll youre yu yuo yut /) {
1014		$misspelled_words{$word} = 1;
1015	}
1016	print "spellingerrors: ", scalar (keys %misspelled_words), " misspellings known\n" if $verbose;
1017	my @spellfiles;
1018	# IGORSPELLFILES environment variable is a whitespace-separated list of files
1019	push (@spellfiles, split /\s/, $ENV{'IGORSPELLFILES'}) if defined($ENV{'IGORSPELLFILES'});
1020	# all files found in /usr/local/etc/igor/spelling
1021	push (@spellfiles, split /\s/, `ls /usr/local/etc/igor/spelling/*`) if -d '/usr/local/etc/igor/spelling';
1022	for my $spellfile (@spellfiles) {
1023		readspelling($spellfile);
1024	}
1025}
1026
# Report known misspellings found on one line.
#   $bname - file name passed through to showline
#   $line  - line number passed through to showline
#   $txt   - text of the line
# Every word of the line that is a key of %misspelled_words (compared in
# lowercase) is passed to highlight_word; the line is shown only when
# that changed the text.
sub spellingerrors {
	my ($bname, $line, $txt) = @_;
	return if $txt =~ /^\s*$/;

	# the word list comes from the unmodified line
	my @suspects = grep { $misspelled_words{lc($_)} } split /\W+/, $txt;

	my $marked = $txt;
	for my $suspect (@suspects) {
		$marked = highlight_word($marked, $suspect);
	}

	showline($bname, $line, $ansi{darkmagenta}, 'spelling', $marked)
		if $marked ne $txt;
}
1042
# Report whitespace problems on one line: a line holding only
# whitespace, trailing whitespace, and a tab following spaces.
#   $bname - file name passed through to showline
#   $line  - line number passed through to showline
#   $txt   - text of the line
# Each test works on its own fresh copy of the line, so the markers
# added by one test never affect the next.
sub whitespace {
	my ($bname, $line, $txt) = @_;
	return if $txt =~ /^$/;

	# each entry: marker substitution (edits its argument in place),
	# color, message
	my @checks = (
		[ sub { $_[0] =~ s/^(\s+)$/$li$1$ri/ },
		  $ansi{darkblue}, 'blank line with whitespace' ],
		[ sub { $_[0] =~ s/(\S+)(\s+)$/$1$li$2$ri/ },
		  $ansi{darkblue}, 'trailing whitespace' ],
		[ sub { $_[0] =~ s/( +)\t+/$li$1$ri/ },
		  $ansi{darkmagenta}, 'tab after space' ],
	);

	for my $check (@checks) {
		my ($mark, $color, $message) = @{$check};
		my $copy = $txt;
		next unless $mark->($copy);
		showline($bname, $line, $color, $message, $copy);
	}
}
1060
1061
1062# global batch tests
# Batch style check: count informal, redundant, or jargon phrases in the
# text of a whole document and print advice for each one that is found.
#   $bname - name printed in the report header
#   $txt   - complete text of the document
# Reads the globals $lf/$rf and $lh/$rh (printed around the header and
# around each finding) and $type (selects the troff-only test at the end).
#
# All counting is done without modifying $txt.  Counting with s///g
# rewrote the text as it went; in particular the "the" test removed the
# period and whitespace in front of each match, which made the sentence
# count too low and hid every "e.g. the" and "i.e. the" from later tests.
sub style {
	my ($bname, $txt) = @_;
	print "$lf$bname style check:$rf\n";

	# number of non-overlapping matches of a qr// pattern in $txt,
	# always a plain number (0 when there are none)
	my $count = sub {
		my ($re) = @_;
		my $n = () = $txt =~ /$re/g;
		return $n;
	};

	# print the standard "<phrase> used N time(s)" finding
	my $used = sub {
		my ($phrase, $n) = @_;
		print "  $lh\"$phrase\" used $n time", ($n == 1 ? '' : 's'), "$rh\n";
	};

	my $you  = $count->(qr/you\b/i);
	my $your = $count->(qr/your/i);
	if ( $you || $your ) {
		$used->('you', $you)   if $you;
		$used->('your', $your) if $your;
		print "    \"You\" and \"your\" are informal and subjective.\n";
		print "    Attempt to be formal and objective: \"the file\" rather than \"your file\".\n";
	}

	my $should = $count->(qr/should/i);
	if ( $should ) {
		$used->('should', $should);
		print "    Use \"should\" sparingly, it is feeble and suggests unsureness.\n";
		print "    Attempt to be imperative: \"do this\" rather than \"you should do this\".\n";
	}

	my $obviously = $count->(qr/obviously/i);
	if ( $obviously ) {
		$used->('obviously', $obviously);
		print "    If it is really obvious, it does not need to be pointed out.\n";
	}

	my $needless = $count->(qr/needless to say/i);
	if ( $needless ) {
		$used->('needless to say', $needless);
		print "    If it doesn't need to be said, why say it?\n";
	}

	my $thefollowing = $count->(qr/the following/i);
	my $asfollows    = $count->(qr/as follows/i);
	if ( $thefollowing || $asfollows ) {
		$used->('the following', $thefollowing) if $thefollowing;
		$used->('as follows', $asfollows)       if $asfollows;
		print "    If something is following, the reader can see it without being told.\n";
	}

	my $followingexample = $count->(qr/following example/i);
	if ( $followingexample ) {
		$used->('following example', $followingexample);
		print "    If an example is following, the reader can see it without being told.\n";
	}

	# each word gets its own advice, printed only when that word occurs
	my $simply    = $count->(qr/simply/i);
	my $basically = $count->(qr/basically/i);
	if ( $simply ) {
		$used->('simply', $simply);
		print "    Use \"simply\" to mean \"in a simple manner\", \"just\", or \"merely\", not the\n";
		print "    patronizing \"details omitted because they are not simple enough for you\".\n";
	}
	if ( $basically ) {
		$used->('basically', $basically);
		print "    Use \"basically\" to mean \"essentially\" or \"fundamentally\", not \"only the\n";
		print "    basics are shown because anything more will be too complicated for you\".\n";
	}

	# share of sentences that start with "the"; both counts are taken
	# from the same unmodified text
	my $the  = $count->(qr/(?:^the|\.\s+the)\b/i);
	my $sent = $count->(qr/[^.]+\.\s+/i);
	my $percent = ($sent > 0 ? int($the/$sent*100) : 0);
	if ( $the && ($percent > 19) ) {
		print "  $lh\"The\" used to start a sentence $the time", ($the==1 ? '':'s'), " in $sent sentence", ($sent==1 ? '':'s'), " ($percent%)$rh\n";
		print "    Starting too many sentences with \"the\" can be repetitive\n";
		print "    and dull to read.\n";
	}

	my $cf = $count->(qr/\Wcf\./i);
	my $eg = $count->(qr/e\.g\./i);
	my $ie = $count->(qr/i\.e\./i);
	my $nb = $count->(qr/n\.b\./i);
	if ( $cf ) {
		$used->('cf.', $cf);
		print "    \"Cf.\" (Latin \"confer\") means \"${lf}compare$rf\" and is mostly used in academic\n";
		print "    and scientific writing.  Consider replacing with the more common English\n";
		print "    words.\n";
	}
	if ( $eg ) {
		$used->('e.g.', $eg);
		print "    \"E.g.\" (Latin \"exempli gratia\") means \"${lf}for example$rf\" and is mostly\n";
		print "    used in academic and scientific writing.  Consider replacing with the\n";
		print "    more common English words.  Both forms are usually followed by a\n";
		print "    comma for a verbal pause:  \"e.g., a b c\" or \"for example, a b c\"\n";
	}
	if ( $ie ) {
		$used->('i.e.', $ie);
		print "    \"I.e.\" (Latin \"id est\") means \"${lf}that is$rf\" and is mostly used in academic\n";
		print "    and scientific writing.  Consider replacing with the more common\n";
		print "    English words.  Both forms are usually followed by a comma for\n";
		print "    a verbal pause:  \"i.e., a b c\" or \"that is, a b c\"\n";
	}
	if ( $nb ) {
		$used->('n.b.', $nb);
		print "    \"N.b.\" (Latin \"nota bene\") means \"${lf}note$rf\" or \"${lf}take notice${rf}\" and is mostly\n";
		print "    used in academic and scientific writing.  Consider replacing with\n";
		print "    the more common English words.\n";
	}

	my $inorderto = $count->(qr/in order to/i);
	if ( $inorderto ) {
		$used->('in order to', $inorderto);
		print "    Unless \"in order to\" has some special meaning here, \"to\" is simpler.\n";
	}

	my $invoke = $count->(qr/invoke/i);
	if ( $invoke ) {
		$used->('invoke', $invoke);
		print "    Unless \"invoke\" has some special meaning in context, \"run\" is simpler.\n";
	}

	my $parenplural = $count->(qr/\(s\)/i);
	if ( $parenplural ) {
		$used->('(s)', $parenplural);
		print "    Please do not form plurals this way.  It is a holdover from lazy\n";
		print "    programming practices, is difficult to read, and almost always\n";
		print "    unnecessary.  A plural formed with a plain \"s\" is usually correct\n";
		print "    when speaking about numbers of one or more.\n";
	}

	# type-specific tests
	if ( $type eq "troff" ) {
		my $examples = ($txt =~ /\n\.\s*Sh\s+EXAMPLES/i);
		unless ( $examples ) {
			print "  ${lh}no \"EXAMPLES\" section found$rh\n";
			print "    Even trivial examples can improve clarity.\n";
			print "    Common-use examples are better yet.\n";
		}
	}
}
1191
1192# mdoc line-by-line tests
# stack of open .Bd displays; an entry is true when that display was
# opened with -literal or -unfilled
my @md_displays;

# Report blank lines in mdoc source, except while any open display on
# the stack is literal or unfilled.
#   $bname - file name passed through to showline
#   $line  - line number passed through to showline
#   $txt   - text of the line
sub mdoc_whitespace {
	my ($bname, $line, $txt) = @_;

	# display start: remember what kind it is
	if ( $txt =~ /^\.\s*Bd\s/ ) {
		my $verbatim = ($txt =~ /-(?:literal|unfilled)/) ? 1 : 0;
		push @md_displays, $verbatim;
		return;
	}

	# display end: forget the innermost display
	if ( $txt =~ /^\.\s*Ed\b/ ) {
		pop @md_displays;
		return;
	}

	# only completely empty lines are of interest from here on
	return if length $txt;

	for my $verbatim (@md_displays) {
		return if $verbatim;
	}
	showline($bname, $line, $ansi{darkblue}, "blank line", $txt);
}
1205
# Check the document date on a .Dd line.
#   $bname - file name passed through to showline
#   $line  - line number passed through to showline
#   $txt   - text of the line
# The date found is stored in the file-level $docdate; the line is
# shown when that date differs from $today.
sub mdoc_date {
	my ($bname, $line, $txt) = @_;
	return if $txt =~ /^\s*$/;

	# not a .Dd line: nothing to do
	return unless $txt =~ s/^(\.\s*Dd\s+)(.*)$/$1$lh$2$rh/;

	$docdate = $2;
	return if $docdate eq $today;

	showline($bname, $line, $ansi{darkyellow}, "date not today, $today", $txt);
}
1215
# Report a line of mdoc text in which a second sentence starts after
# the end of the first, rather than on a line of its own.
#   $bname - file name passed through to showline
#   $line  - line number passed through to showline
#   $txt   - text of the line
sub mdoc_sentence {
	my ($bname, $line, $txt) = @_;
	return if $txt =~ /^\s*$/;

	# mark everything from the start of the second sentence onward
	my $found = ( $txt =~ s/^(\w{2,}.*?[^ .]{2,}\.\s+)(A |I |\w{2,})(.*)$/$1$lh$2$3$rh/ );
	return unless $found;

	showline($bname, $line, $ansi{darkcyan}, 'sentence not on new line', $txt);
}
1224
# Reset the table of cross references already seen by mdoc_uniqxrefs.
sub init_mdoc_uniqxrefs {
	if ( $verbose ) {
		print "initializing mdoc_uniqxrefs\n";
	}

	# forget every reference recorded so far
	%seealsoxrefs = ();
}
1229
# Report .Xr references that are repeated inside a SEE ALSO section.
#   $bname - file name passed through to showline
#   $line  - line number passed through to showline
#   $txt   - text of the line
# Uses the globals $seealso (true while inside the list of references)
# and %seealsoxrefs (references seen so far, keyed "name-section").
sub mdoc_uniqxrefs {
	my ($bname, $line, $txt) = @_;
	return if $txt =~ /^\s*$/;

	# set a flag to indicate when a .Sh SEE ALSO section is found
	if ( $txt =~ /^\.Sh\s+(.*)/i ) {
		$seealso = ( $1 =~ /SEE ALSO/i );
		# announce only the section this message names, not every .Sh
		print "mdoc_uniqxrefs: SEE ALSO section found\n" if $verbose && $seealso;
		return;
	}

	# only check xrefs for repeats inside a SEE ALSO section
	return unless $seealso;

	# if inside a SEE ALSO section, stop looking for duplicates
	# after non-.Xr macros.  These would probably be text sections
	# talking about the external references, not included in the list.
	if ( ($txt =~ /^\./) && ($txt !~ /^\.Xr/i) ) {
		$seealso = 0;
		return;
	}

	# allow both valid mdoc formats (.Xr umount 8 ,)
	# and bad ones (.Xr xorg.conf(5),)
	if ( $txt =~ /\.Xr\s+(.*)(?:\s|\()(\d{1}\w?)/i ) {
		my $xrefname = $1;
		my $xrefsect = $2;
		my $key = "$xrefname-$xrefsect";
		if ( $seealsoxrefs{$key} ) {
			# the name is text from the document, not a pattern: quote it
			# so names such as "[" or "c++" cannot break or change the regex
			$txt =~ s/(\Q$xrefname\E.*\Q$xrefsect\E)/$lh$1$rh/g;
			showline($bname, $line, $ansi{yellow}, "duplicate SEE ALSO reference", $txt);
		} else {
			$seealsoxrefs{$key} = 1;
		}
	}
}
1265
# Report every macro listed in @macros ahead of $lastmacro that has no
# value recorded in %macroval yet.
#   $lastmacro - macro that was just found
#   $bname     - file name passed through to showline
#   $line      - line number passed through to showline
sub showmacvals {
	my ($lastmacro, $bname, $line) = @_;

	for my $earlier (@macros) {
		# stop at the macro that was just found
		return if $earlier eq $lastmacro;
		next if $macroval{$earlier};
		showline($bname, $line, $ansi{red}, ".$lastmacro used here", "but .$earlier has not been defined");
	}
}
1275
# Prepare the tables used by mdoc_structure: underscores in the entries
# of @macros are turned into spaces (changing @macros itself), and every
# macro gets an empty value in %macroval.
sub init_mdoc_structure {
	if ( $verbose ) {
		print "initializing mdoc_structure\n";
	}

	# edits the elements of @macros in place
	tr/_/ / for @macros;

	# one empty string per macro
	@macroval{@macros} = ('') x @macros;
}
1283
# Check the structure of an mdoc page one line at a time.
#   $bname - file name passed through to showline
#   $line  - line number passed through to showline
#   $txt   - text of the line
# Two groups of tests are made on lines that start with a macro:
#   - the first time a macro from @macros is seen, its parameter is stored
#     in %macroval and showmacvals reports macros listed before it that
#     have not been seen yet
#   - .Xr external references are checked for a missing name, a section
#     number inside the name, an out-of-range section, a page that
#     "man -w" cannot find (only when $opt_x is set), and a reference to
#     the page itself
sub mdoc_structure {
	my ($bname, $line, $txt) = @_;
	return if $txt =~ /^\s*$/;

	# only lines that start with an mdoc macro are checked
	# technically, whitespace is allowed before macros
	return unless $txt =~ /^\s*\./;

	# check for required minimum macros
	for my $macro (@macros) {
		$macro =~ tr/_/ /;
		next if $macroval{$macro};
		if ( $txt =~ /^\.\s*\Q$macro\E\s*(.*)/i ) {
			my $parm = $1;
			# provide a blank parameter for macros with optional parameters
			$parm = ' ' if ($macro =~ /^Os|Sh NAME|Sh SYNOPSIS|Sh DESCRIPTION/) && (!$parm);
			$macroval{$macro} = $parm;
			showmacvals($macro, $bname, $line);
			last;
		}
	}

	# check external refs (.Xr)
	# suggested by Glen Barber
	return unless $txt =~ /^\.Xr/;

	# characters to treat as whitespace in an Xr macro
	my $wspace = '[ (),.:]';
	# character class for section numbers
	# an initial number possibly followed by a letter
	my $sect = '\d{1}[A-Za-z]?';

	my $xname = '';
	$xname = $1 if $txt =~ /^\.Xr$wspace+(\S+)/;
	my $xsection = '';
	$xsection = $1 if $txt =~ /^\.Xr$wspace+\S+$wspace+($sect)/;

	if ( ! $xname ) {
		showline($bname, $line, $ansi{yellow}, 'xref name missing', $txt);
		return;
	}

	if ( $xname =~ /\($sect\)/ ) {
		# the name contains parentheses here, so it must be quoted to be
		# matched as literal text
		$txt =~ s/(\Q$xname\E)/$lh$1$rh/;
		showline($bname, $line, $ansi{yellow}, 'section number in name', $txt);
		return;
	}

	# NOTE(review): $sect captures a single digit, so a multi-digit
	# section such as 10 is seen as its first digit and is not flagged
	# by this string comparison -- confirm whether that is intended
	if ( $xsection && ($xsection gt "9") ) {
		$txt =~ s/^(\.Xr$wspace+\S+$wspace+)($sect)/$1$lh$2$rh/;
		showline($bname, $line, $ansi{yellow}, 'section higher than 9', $txt);
		# no point in checking for sections higher than 9
		return;
	}

	if ( $opt_x ) {
		# the name is text from the document being checked and goes to a
		# shell: single-quote it so shell metacharacters are not executed
		(my $quoted = $xname) =~ s/'/'\\''/g;
		system("$man -w $xsection '$quoted' >/dev/null 2>&1");
		if ( $? ) {
			if ( $xsection ) {
				$txt =~ s/^(\.Xr$wspace+)(\S+$wspace+$sect)/$1$lh$2$rh/;
			} else {
				$txt =~ s/^(\.Xr$wspace+)(\S+)/$1$lh$2$rh/;
			}
			showline($bname, $line, $ansi{darkmagenta}, 'external man page not found', $txt);
			# not found, no point in checking if it's this one
			return;
		}
	}

	# is this external reference referring to itself?
	# skip if the .Nm macro has no value
	return if $macroval{'Nm'} ne $xname;
	my $currsection = '';
	if ( $macroval{'Dt'} =~ /^\S+\s+($sect)/ ) {
		$currsection = $1;
	}
	return if $xsection ne $currsection;
	if ( $xsection && $currsection ) {
		$txt =~ s/^(\.Xr$wspace+)(\S+$wspace+$sect)/$1$lh$2$rh/;
	} else {
		$txt =~ s/^(\.Xr$wspace+)(\S+)/$1$lh$2$rh/;
	}
	showline($bname, $line, $ansi{darkmagenta}, 'xref refers to *this* page (use .Nm)', $txt);
}
1370
1371
1372# DocBook line-by-line tests
1373
# Prepare the word lists and tag patterns used by doc_titles().
# Sets the file-level $lc_regex, $uc_regex, $fixedcase_regex, and
# $ignoreregex strings.
sub init_doc_titles {
	if ( $verbose ) {
		print "initializing doc_titles\n";
	}

	# small words that should stay lowercase inside a title
	my $lowercase_alternatives = join('|', qw/
		a an and at by down for from in into like near
		nor of off on onto or over past the to upon with /);
	$lc_regex = "(?:$lowercase_alternatives)";

	# short words that should be capitalized anyway
	my $uppercase_alternatives = join('|', qw/
		about are how log new not set tag use
		one two three four five six seven eight nine /);
	$uc_regex = "(?:$uppercase_alternatives)";

	# words whose case must never be changed
	my $fixedcase_alternatives = join('|', qw/ amd64 i386 iSCSI x86 /);
	$fixedcase_regex = "(?:$fixedcase_alternatives)";

	# DocBook elements whose tagged content is skipped in titles,
	# like <command>ls</command>; each name becomes a pattern that
	# matches the open tag, the content, and the close tag
	my @tagged_patterns = map { sprintf('<%s.*?>.*?</%s>', $_, $_) } qw/
		acronym application command filename function
		link literal varname replaceable systemitem tag /;
	$ignoreregex = join('|', '<anchor.*?>', @tagged_patterns);
}
1395
# Check capitalization of the words in a DocBook title and report the
# line with the offending words highlighted.
# Arguments: file basename, line number, text of the line or segment.
# The file-level $titleblock flag remembers that a <title> has been
# opened but not yet closed, so titles spanning lines are followed.
sub doc_titles {
	my ($bname, $line, $txt) = @_;

	# blank lines and ignorable blocks are never titles
	return if $txt =~ /^\s*$/;
	return if $ignoreblock;

	if ( $txt =~ /<title/ ) {
		$titleblock = 1;
	}
	return unless $titleblock;

	print "doc_titles: '$txt'\n" if $verbose;

	my $original = $txt;

	# the words to check are the text between the title tags, or
	# the whole line when neither form of title tag is found
	my $titletext = $txt;
	if ( ($txt =~ /<title.*?>(.*?)(?:<\/title>|$)/)
		 || ($txt =~ /(.*)(?:<\/title>)/) ) {
		$titletext = $1;
	}

	# split into words, keeping whitespace and ignorable DocBook
	# tags (with their content) as separate list elements
	my @words = split /($ignoreregex|\s+)/, $titletext;

	# the line text does not change while the words are examined
	my $has_open_tag  = ($txt =~ /<title/);
	my $has_close_tag = ($txt =~ /<\/title/);

	# use AP style: capitalize words longer than three letters; see also
	# http://www.freebsd.org/cgi/cvsweb.cgi/doc/en_US.ISO8859-1/books/handbook/linuxemu/chapter.sgml#rev1.48
	WORD: for my $i (0..$#words) {
		my $word = $words[$i];

		print "doc_titles: analyzing '$word'\n" if $verbose;

		# whitespace separators and ignorable tagged text
		next WORD if $word =~ /\s+/;
		next WORD if $word =~ /$ignoreregex/;

		# special case: skip the contents of some unfinished tags
		# <title>Configuring <acronym role="Domain Name
		#   System">DNS</acronym></title>
		next WORD if $word =~ /(?:role)=/;

		# special case: allow single lowercase "s" for plurals
		next WORD if $word eq 's';

		# special case words that should not be capitalized
		next WORD if $word =~ /^$fixedcase_regex$/;

		# the first word of a title is always capitalized and is
		# exempt from the remaining tests
		if ( $has_open_tag && ($i == 0) ) {
			$words[$i] = highlight_string($word) if is_lowercase($word);
			next WORD;
		}

		# likewise the last word; nothing follows it
		if ( $has_close_tag && ($i == $#words) ) {
			$words[$i] = highlight_string($word) if is_lowercase($word);
			last WORD;
		}

		# capitalized words that should be lower case
		if ( is_uppercase($word) && ($word =~ /^$lc_regex$/i) ) {
			$words[$i] = highlight_string($word);
			next WORD;
		}

		# lowercase words that should be capitalized: anything
		# longer than three letters that is not in the lowercase
		# list, plus the listed short words
		next WORD unless is_lowercase($word);
		if ( ( ($word !~ /^$lc_regex$/i) && (length($word) > 3) )
			 || ($word =~ /^$uc_regex$/i) ) {
			$words[$i] = highlight_string($word);
		}
	}

	# put the line back together around the now-highlighted words
	my $prefix = ($original =~ /^(.*<title.*?>)/) ? $1 : '';
	my $suffix = ($original =~ /(<\/title.*?>)/)  ? $1 : '';
	$txt = $prefix . join('', @words) . $suffix;

	if ( $txt ne $original ) {
		print "doc_titles:\n     original='$original'\n  highlighted='$txt'\n" if $verbose;
		showline($bname, $line, $ansi{blue}, 'capitalization', $txt);
	}

	# a close tag ends the title block
	if ( $txt =~ /<\/title>/ ) {
		$titleblock = 0;
	}
}
1496
# Build the tag-name regexes used by doc_indentation().
# Sets the file-level $indent_regex (tags that begin or end an
# indented section) and $inline_regex (inline tags).  Names are
# sorted longest first before being joined into an alternation.
sub init_doc_indentation {
	if ( $verbose ) {
		print "initializing doc_indentation\n";
	}

	# DocBook tags that begin or end an indented section
	my @block_tags = qw/ abstract answer appendix article articleinfo
						 author authorgroup biblioentry bibliography
						 biblioset blockquote book bookinfo callout
						 calloutlist category chapter chapterinfo colophon
						 caution contrib date day entry event example
						 figure formalpara funcdef funcsynopsis
						 funcprototype glossary glossdef glossdiv
						 glossentry glossterm important imageobject
						 imageobjectco info informaltable
						 informalexample itemizedlist legalnotice
						 listitem mediaobject mediaobjectco month name
						 note orderedlist para paramdef partintro
						 personname preface procedure qandadiv
						 qandaentry qandaset question row screenco
						 sect1 sect2 sect3 sect4 sect5 section
						 seglistitem segmentedlist sidebar step
						 stepalternatives surname table tbody tgroup
						 thead tip title variablelist varlistentry
						 warning year /;
	# VuXML tags
	push @block_tags, qw/ affects body cvename dates
						  description discovery head html li name p range
						  references topic ul vuln vuxml /;
	@block_tags = sort { length($b) <=> length($a) } @block_tags;
	print "indentation tags: @block_tags\n" if $verbose;
	$indent_regex = '(?:' . join('|', @block_tags) . ')';
	print "indentation regex: $indent_regex\n" if $verbose;

	# inline tags like
	# <filename>blah</filename>
	my @phrase_tags = qw/ a acronym application citetitle command
						  computeroutput devicename emphasis envar
						  errorname filename firstterm footnote function
						  guimenu guimenuitem hostid imagedata indexterm
						  keycap keycombo link literal makevar option
						  optional package parameter primary quote
						  remark replaceable secondary see seg sgmltag
						  simpara strong structname systemitem term tt
						  ulink uri varname /;
	# VuXML tags
	push @phrase_tags, qw/ ge gt le lt url /;
	@phrase_tags = sort { length($b) <=> length($a) } @phrase_tags;
	print "inline tags: @phrase_tags\n" if $verbose;
	$inline_regex = '(?:' . join('|', @phrase_tags) . ')';
	print "inline regex: $inline_regex\n" if $verbose;
}
1546
# Compare the indent of the current DocBook line with the indent
# expected from the previous nonblank line, and report a mismatch.
# Arguments: file basename, line number, text of the current line.
# The expected indent starts as the previous line's indent and is
# adjusted by two columns for each open or close tag rule below.
sub doc_indentation {
	my ($bname, $line, $currline) = @_;

	# blank lines are not checked
	return if $currline =~ /^\s*$/;

	# indents are not significant inside ignorable SGML blocks.
	return if $ignoreblock;

	# neither are one-line comments
	return if $currline =~ /^\s*<!--.*-->\s*$/;

	# \b is needed here to prevent <parameter> being detected as <para>
	return unless $prevnonblank =~ /<\/*$indent_regex\b.*?>/;

	# measured values are kept unchanged for verbose reporting
	my $measured_prev = length(leading_space($prevnonblank));
	my $measured_curr = length(leading_space($currline));
	my $expected = $measured_prev;
	my $actual   = $measured_curr;

	# indent once for open tag on previous line
	if ( $prevnonblank =~ /<$indent_regex\b/ ) {
		$expected += 2;
	}

	# allow for inline tag indenting, like
	# <link
	#   url=
	# or
	# <makevar>xyz
	#   abc</makevar>
	# each unclosed inline tag on the previous line adds an indent
	my $inline_opens  = () = $prevnonblank =~ /<$inline_regex\b/g;
	my $inline_closes = () = $prevnonblank =~ /<\/$inline_regex\b/g;
	$expected += 2 * ($inline_opens - $inline_closes);

	# if previous line ends in an open xref, indent
	if ( $prevnonblank =~ /<xref\s*$/ ) {
		$expected += 2;
	}

	# <xref> has no close tag, but uses "linkend=" the same as <link>
	# which *does* have a close tag... so if there's a linkend= on
	# previous line but no </ulink> or </link> on either previous
	# or current lines, assume it's an xref and outdent
	my $broken_regex = '(?:(?:linkend|url)=)';
	if ( ($prevnonblank =~ /^\s*$broken_regex/)
		&& ($prevnonblank !~ /<\/(?:link|ulink)/)
		&& ($currline !~ /<\/(?:link|ulink)/) ) {
		$expected -= 2;
	}

	# outdent for close tag at end of previous line
	if ( $prevnonblank =~ /\S+.*<\/$indent_regex>\s*$/ ) {
		$expected -= 2;
	}

	# outdent for close tag at the start of this line
	if ( $currline =~ /^\s*<\/$indent_regex/ ) {
		$expected -= 2;
	}

	# outdent after footnote
	if ( $prevnonblank =~ /<\/para><\/footnote>/ ) {
		$expected -= 2;
	}

	# singleton tags like <entry/> are really just an empty
	# open/close tag, <entry></entry>, allow for them
	if ( $prevnonblank =~ /\/>$/ ) {
		$expected -= 2;
	}

	# close tags after long sections of nonindented blocks,
	# like the end of a programlisting, cannot be correctly
	# checked for indentation in this hacky way, so ignore them
	if ( ($prevnonblank =~ /$ignoreblockstart|$ignoreblockend/)
		|| ($currline =~ /$ignoreblockend/) ) {
		$actual = $expected;
	}

	return if $actual == $expected;

	if ( $verbose ) {
		print "doc_indentation:\n";
		my $shown_prev = showwhitespace($prevnonblank);
		my $shown_curr = showwhitespace($currline);
		print "previous nonblank line: '$shown_prev\'\n";
		print "          current line: '$shown_curr\'\n";
		print "\t\t\t\tinitial\tfinal\n";
		print "previous nonblank indent:\t$measured_prev\t$expected\n";
		print "          current indent:\t$measured_curr\t$actual\n";
	}

	# highlight the leading whitespace of the unsplit original line
	my $out = $origline;
	$out =~ s/(^\s+)/$li$1$ri/;
	showline($bname, $line, $ansi{darkred}, 'bad tag indent', $out);
}
1635
# Split a line into its leading whitespace and the remaining content.
# Argument: the text to split (an undefined value is treated as '').
# Returns a two-element list: (leading whitespace, content).  Either
# element may be the empty string.  Content stops at the first
# newline after the leading whitespace, because "." does not match
# a newline.
sub splitleading {
	my $txt = shift;
	$txt = '' unless defined $txt;
	# both \s* and .* may match nothing, so this pattern always
	# matches and both captures are defined.  The captures are used
	# directly rather than tested for truth, so content of "0" is
	# returned as-is and a whitespace-only line yields empty content
	# instead of the indent repeated as content.
	my ($inspace, $content) = $txt =~ /^(\s*)(.*)/;
	return ($inspace, $content);
}
1647
# Report DocBook lines longer than $linelensgml columns (after tab
# expansion) that contain a space where they could be wrapped.
# Arguments: file basename, line number, text of the line or segment.
sub doc_longlines {
	my ($bname, $line, $txt) = @_;

	return if $ignoreblock;
	return if $txt =~ /^\s*$/;

	# this should be smarter, like seeing if the part before the space
	# will benefit from wrapping

	# long lines with these tags are left alone
	return if $txt =~ /<(?:!DOCTYPE|!ENTITY|pubdate|releaseinfo)/;

	my $expanded = expand_tabs($txt);
	return unless length($expanded) > $linelensgml;

	# $head is the part of the content that fits within the limit
	# once the indent is counted, $tail is whatever is left over
	my ($indent, $body) = splitleading($expanded);
	my $head = substr($body, 0, $linelensgml - length($indent));
	my $tail = substr($body, length($head));

	# prefer the last space that fits within the limit, otherwise
	# the first space past it; no space at all means no report
	my $wrappable = 0;
	if ( $head =~ / / ) {
		$head =~ s/^(.*)? (.*)$/$1$li $ri$2/;
		$wrappable = 1;
	} elsif ( $tail =~ s/ /$li $ri/ ) {
		$wrappable = 1;
	}

	if ( $wrappable ) {
		showline($bname, $line, $ansi{green}, 'wrap long line', "$indent$head$tail");
	}
}
1674
# Set the file-level $eos_regex used by doc_sentence() to recognize
# end-of-sentence punctuation.
sub init_doc_sentence {
	print "initializing doc_sentence\n" if $verbose;
	# end of sentence characters: literal dot, question mark, or
	# exclamation point.  Each character is its own alternative;
	# without the second "|" the pattern would only match a dot or
	# the two-character sequence "?!".
	$eos_regex = '\.|\?|\!';
}
1680
# Report sentences that are followed by only one space.
# Arguments: file basename, line number, text of the line or segment.
# The content is split into chunks that end in end-of-sentence
# punctuation plus whitespace (tags become chunks of their own).  A
# chunk ending in a dot and exactly one space is reported unless it
# matches one of the exceptions below.
sub doc_sentence {
	my ($bname, $line, $txt) = @_;

	return if $txt =~ /^\s*$/;
	return if $ignoreblock;

	# skip if there is no end-of-sentence character
	return unless $txt =~ /(?:$eos_regex)/;

	my $errcount = 0;
	my ($inspace, $content) = splitleading($txt);
	# the capturing group makes split return the chunks themselves;
	# empty fields between adjacent chunks are dropped
	my @sentences = grep (! /^$/, split /((?:.*?(?:$eos_regex)+\s+)|(?:<.*?>))/, $content);

	# $s aliases the list element, so the highlight added at the
	# end of the loop body ends up in @sentences
	for my $s (@sentences) {
		# skip unless it has a one-space possible sentence start
		next unless $s =~ /\. $/;

		# SGML markup, like "<emphasis>bold</emphasis>."
		#next if $s =~ />\. $/;

		# single dots, like from "find . -name '*.sgml'"
		next if $s =~ / \. $/;

		# initials
		next if $s =~ /[A-Z]{1}\. $/;

		# common abbreviations
		next if $s =~ /(?:Ave|Dr|Ed|etc|Inc|Jr|Mass|Pub|Sp|St|Str|str|o\.o)\. $/;

		# ignore misuse of cf., e.g., i.e., and v.s., they are not
		# end of sentence errors
		next if $s =~ /(?:cf|e(?:\.)*g|i\.e|v\.s)\. $/i;

		# months
		next if $s =~ /(?:Jan|Feb|Mar|Apr|May|Jun|Jul|Aug|Sep|Oct|Nov|Dec)\. $/;

		# numbers, like "... and 1997."
		next if $s =~ /\d+\. $/;

		# ellipsis
		next if $s =~ /\.\.\. $/;

		# it must be a single-space sentence start
		$s =~ s/ $/$li $ri/;
		$errcount++;
	}

	if ( $errcount ) {
		# reassemble the now-highlighted string
		$txt = $inspace . join('', @sentences);
		showline($bname, $line, $ansi{darkblue}, 'use two spaces at sentence start', $txt);
	}
}
1734
# Reset the state used by doc_openclose(): the tags being tracked,
# the open/closed flag for each of them, and the alternations for
# tracked tags, list tags, and tags that wrap paragraphs.
sub init_doc_openclose {
	if ( $verbose ) {
		print "initializing doc_openclose\n";
	}

	# tags that are tracked, all of them closed to begin with
	@openclose_tags = qw/ callout entry filename footnote li listitem literal p para row step /;
	@opentag{@openclose_tags} = (0) x @openclose_tags;
	$openclose_regex = join('|', @openclose_tags);

	$list_regex     = join('|', qw/ itemizedlist orderedlist variablelist /);
	$parawrap_regex = join('|', qw/ footnote listitem /);
}
1747
# Track open and close tags for the elements in @openclose_tags and
# report a tag opened again before it was closed, or closed without
# having been opened.
# Arguments: file basename, line number, text of the line or segment.
# State is kept in the file-level %opentag hash between calls.
sub doc_openclose {
	my ($bname, $line, $txt) = @_;

	return if $ignoreblock;
	return if $txt =~ /^\s*$/;
	return unless $txt =~ /</;

	my ($inspace, $content) = splitleading($txt);

	# break the content into tags and the text between them; a tag
	# left unfinished at the end of the line still counts as a tag
	my @chunks = grep (! /^\s*$/, split(/(<.*?(?:>|$))/, $content));

	# $chunk aliases the list element, so highlights added below
	# appear when @chunks is joined for display
	for my $chunk (@chunks) {
		next unless $chunk =~ /</;

		# only look at individual tags when at least one tracked
		# tag name appears in this chunk
		if ( $chunk =~ /(?:$openclose_regex)/ ) {
			for my $tag (@openclose_tags) {
				next unless $chunk =~ /$tag/;

				# check for open without close
				if ( $opentag{$tag} && $chunk =~ /<$tag\b/ ) {
					$chunk =~ s/(<$tag\b)/$lh$1$rh/;
					showline($bname, $line, $ansi{red}, "open <$tag> without closing", $inspace . join('', @chunks));
				}

				# check for close without open
				if ( ! $opentag{$tag} && $chunk =~ /<\/$tag>/ ) {
					$chunk =~ s/(<\/$tag\W)/$lh$1$rh/;
					showline($bname, $line, $ansi{red}, "close </$tag> without opening", $inspace . join('', @chunks));
				}

				# evaluate closes, then opens
				if ( $chunk =~ /<\/$tag>/ ) {
					$opentag{$tag} = 0;
				}
				if ( $chunk =~ /<$tag\b/ ) {
					$opentag{$tag} = 1;
				}
			}
		}

		# classify the chunk once for the special cases below
		my $opens_wrapper  = ($chunk =~ /<(?:$parawrap_regex)\b/);
		my $closes_wrapper = ($chunk =~ /<\/(?:$parawrap_regex)>/);
		my $opens_list     = ($chunk =~ /<(?:$list_regex)\b/);
		my $closes_list    = ($chunk =~ /<\/(?:$list_regex)>/);
		my $opens_proc     = ($chunk =~ /<procedure\b/);
		my $closes_proc    = ($chunk =~ /<\/procedure\b/);

		# special-case closes
		# <para> can be inside footnotes or lists
		$opentag{'para'} = 0 if $opens_wrapper || $closes_list;

		# list tags like <itemizedlist> start a new list
		# so 'listitem' is no longer open
		$opentag{'listitem'} = 0 if $opens_list;

		# procedures can be nested, so <procedure> closes <step>
		$opentag{'step'} = 0 if $opens_proc;

		# special-case opens; these are applied after the closes
		# above, so they win when both apply to the same chunk
		$opentag{'para'} = 1 if $closes_wrapper || $opens_list;

		# list tags like </itemizedlist> end a list
		# so 'listitem' is open again
		$opentag{'listitem'} = 1 if $closes_list;

		# procedures can be nested, so </procedure> opens <step>
		$opentag{'step'} = 1 if $closes_proc;
	}
}
1809
# Set the file-level $literalblock_regex, an alternation of the
# names of literal block elements; it is shared by several tests.
sub init_literalblock_regex {
	if ( $verbose ) {
		print "initializing literalblock_regex\n";
	}
	$literalblock_regex = join('|', qw/ literallayout programlisting screen /);
}
1815
# Check the placement of certain DocBook tags relative to line
# breaks and blank lines.
# Arguments: file basename, line number, text of the current line.
# Each check highlights its finding in $currline, reports it, and
# then restores $currline from the backup so that later checks and
# reports start from the unmodified line.
sub doc_tagstyle_whitespace {
	my ($bname, $line, $currline) = @_;
	return if $ignoreblock;

	my $currlinebak = $currline;

	# <title>
	if ( $currline =~ s/^(\s*\S+.*?)(<title)/$1$lh$2$rh/ ) {
		showline($bname, $line, $ansi{darkcyan}, 'put <title> on new line', $currline);
		$currline = $currlinebak;
	}

	# <para>
	# a <para> at the end of the line has its content on the next
	# line; highlight the tag and restore the line like the other
	# checks do
	if ( $currline =~ s/(<para>)\s*$/$lh$1$rh/ ) {
		showline($bname, $line, $ansi{red}, 'start <para> content on same line', $currline);
		$currline = $currlinebak;
	}
	if ( $currline =~ s/(<\/para>)([^< ]+)$/$1$lh$2$rh/ ) {
		showline($bname, $line, $ansi{red}, 'character data is not allowed here', $currline);
		$currline = $currlinebak;
	}

	# <programlisting>
	if ( $currline =~ /<programlisting/ ) {
		# <programlisting> should not be used as an inline tag
		if ( $currline =~ s/(\S+\s*<programlisting.*?>)/$lh$1$rh/ ) {
			showline($bname, $line, $ansi{red}, 'do not use <programlisting> inline in other elements', $currline);
			$currline = $currlinebak;
		} elsif ( ($currline =~ /\s*<programlisting/)
			&& ($prevnonblank !~ /<\/(?:entry|formalpara|indexterm|note|para|programlisting|screen|title)>\s*$/) ) {
			# <programlisting> allowed inside these elements
			# (this return also skips the blank line checks below)
			return if $prevnonblank =~ /<(?:example|informalexample)>/;
			$currline =~ s/(<programlisting.*?>)/$lh$1$rh/;
			showline($bname, $line, $ansi{red}, 'do not use <programlisting> inside other elements', $currline);
			$currline = $currlinebak;
		}
	}

	# elements that should be preceded by a blank line
	if ( $prevline =~ /\S+/ ) {
		# an open tag like <informalexample> is okay, otherwise
		# there should be a blank line before these tags
		# ($1 in the message is the tag captured by the substitution)
		if ( ($prevline !~ /<.*?>\s*$/) && ($currline =~ s/(<(?:$literalblock_regex).*?(?:>|$))/$lh$1$rh/) ) {
			showline($bname, $line, $ansi{darkcyan}, "precede $1 with a blank line", $currline);
			$currline = $currlinebak;
		}
	}

	# elements that should be followed by a blank line
	if ( $currline =~ /\S+/ ) {
		# a close tag like </note> is okay, otherwise there
		# should be a blank line after these tags
		# unless they are followed by another close tag on the same line
		# example: </literallayout></entry>
		# if ( ($currline !~ /^\s*<\//) && ($prevline =~ /(<\/(?:$literalblock_regex|row|step|title)>)/) ) {
		# ($1 in the message is the close tag found on the previous
		# line; the final test only passes when its pattern does not
		# match, which leaves that capture in place)
		if ( ($currline !~ /^\s*<\//) && ($prevline =~ /(<\/(?:$literalblock_regex|row|step|title)>)/) && ($prevline !~ /<\/entry>$/) ) {
			showline($bname, $line, $ansi{darkcyan}, "add blank line after $1 on previous line", "$lh$currline$rh");
		}
	}
}
1875
# Set the patterns used by doc_writestyle(): $redundantword_regex is
# an alternation of element names, and $redundanttagword_regex
# matches a close tag followed by a word that repeats what the
# markup already says, like "</filename> file".
sub init_doc_writestyle {
	if ( $verbose ) {
		print "initializing doc_writestyle\n";
	}
	$redundantword_regex = join('|', qw/ command filename keycap option /);
	$redundanttagword_regex = '(<\/(?:command> command|filename> file|keycap> key|option> option))\b';
}
1881
# Report a word that repeats the meaning of the markup before it,
# like "<filename>/etc/rc.conf</filename> file".
# Arguments: file basename, line number, text of the line or segment.
# Both the case where the close tag ends the previous line and the
# case where tag and word are on the current line are checked.
sub doc_writestyle {
	my ($bname, $line, $currline) = @_;
	return if $ignoreblock;

	my $currlinebak = $currline;

	# test for redundant markup and words starting on the previous line
	if ( $prevline =~ /(<\/(?:$redundantword_regex)>*\s*$)/ ) {
		my $prevend = $1;
		# the first argument of split is a pattern, so the "|"
		# separator must be escaped; an unescaped "|" is an empty
		# alternation that splits the string into single characters
		for my $word (split /\|/, $redundantword_regex) {
			next unless $prevend =~ /$word/;
			next unless $currline =~ /^\s*>*\s*(\w+)\s*(?:\W+|$)/;
			my $firstword = $1;
			# join the end of the previous line to the first word of
			# this one and test them as if they were on one line
			if ( "$prevend $firstword" =~ /$redundanttagword_regex/ ) {
				$currline =~ s/^(\s*)($firstword)\b/$1$lh$2$rh/;
				showline($bname, $line-1, $ansi{darkmagenta}, 'redundant markup and word', "... $lh$prevend$rh");
				showline($bname, $line,   $ansi{darkmagenta}, 'redundant markup and word', $currline);
				$currline = $currlinebak;
				last;
			}
		}
	}

	# test for redundant markup and words on the current line
	if ( $currline =~ /$redundantword_regex/ ) {
		if ( $currline =~ s/$redundanttagword_regex/$lh$1$rh/ ) {
			showline($bname, $line, $ansi{darkmagenta}, 'redundant markup and word', $currline);
			$currline = $currlinebak;
		}
	}
}
1913
# Set the file-level @straggler_tags list of elements checked by
# doc_stragglers().
sub init_doc_stragglers {
	if ( $verbose ) {
		print "initializing doc_stragglers\n";
	}
	@straggler_tags = qw/
		application command entry filename
		guibutton guimenu keycap link literal para
		title ulink uri varname
	/;
}
1920
# Report whitespace between a tag and its content, literal block
# tags separated from their listing, and close tags left alone on a
# line ("stragglers").
# Arguments: file basename, line number, text of the current line.
sub doc_stragglers {
	my ($bname, $line, $txt) = @_;
	return if $txt =~ /^\s*$/;

	# check for spaces after open tags or before close tags
	# like <title> Something</title>
	# or <filename>/etc/rc.conf </filename>

	# these tags should not have spaces or tabs around content
	# opening tags (this will not catch link tags with attributes)
	for my $tag (@straggler_tags) {
		next if $tag eq 'entry';
		if ( $txt =~ /(<$tag>\s+)/ ) {
			print "doc_stragglers opening tags: tag='$tag', found='$1'\n" if $verbose;
			$txt = highlight_word($txt, $1);
			showline($bname, $line, $ansi{yellow}, "space before content", $txt);
		}
	}
	# closing tags
	for my $tag (@straggler_tags) {
		next if $tag eq 'entry';
		if ( $txt =~ /(\s+<\/$tag>)/ ) {
			print "doc_stragglers closing tags: tag='$tag', found='$1'\n" if $verbose;
			$txt = highlight_word($txt, $1);
			showline($bname, $line, $ansi{yellow}, "space after content", $txt);
		}
	}
	# special case: link tags
	# like <link xlink:href="&url.articles.gjournal-desktop;">
	# ignore the opening < and just key off of xlink:href
	if ( $txt =~ /(xlink:href\S+?>)(.)/ ) {
		my $lastchar = $2;
		if ( $lastchar eq ' ' || $lastchar eq "\t" ) {
			print "doc_stragglers xlink:href, found='$1$lastchar'\n" if $verbose;
			$txt = highlight_word($txt, $1);
			showline($bname, $line, $ansi{yellow}, "space before content", $txt);
		}
	}

	# check for literal start tags without listing on the same line
	my $tag;
	if ( $txt =~ />\s*$/ ) {
		if ( $txt =~ /<($literalblock_regex)[^<]?>$/ ) {
			$tag = $1;
			$txt =~ s/(<$tag[^<]?>)$/$lh$1$rh/;
			showline($bname, $line, $ansi{yellow}, "put <$tag> listing on same line", $txt);
			return;
		} elsif ( $txt =~ /^\s*<\/($literalblock_regex)[^<]?>/ ) {
			$tag = $1;
			$txt =~ s/(<\/$tag[^<]?>)$/$lh$1$rh/;
			showline($bname, $line, $ansi{yellow}, "straggling </$tag>", $txt);
			return;
		}
	}

	# the following tests are only for close tags at the start of a line
	return unless $txt =~ /^\s*<\//;

	return if $ignoreblock;

	# stragglers can't be detected when coming out of an ignore block
	return if ( $prevline =~ /$ignoreblockstart|$ignoreblockend/ );

	# more special-case hackery to handle
	#   </table>
	# </para>
	if ( ($prevline =~ /<\/table>\s*$/)
		&& ($txt =~ /^\s*<\/para>\s*$/) ) {
		return;
	}

	# even more special-case hackery to handle
	#   <para>...</para>
	#   <note>...</note>
	# </entry>
	# the alternation is grouped so that both close tags are only
	# accepted at the end of the previous line
	if ( ($prevline =~ /<\/(?:para|note)>\s*$/)
		&& ($txt =~ /^\s*<\/entry>\s*$/) ) {
		return;
	}

	for my $tag (@straggler_tags) {
		if ( $txt =~ /^\s*(<\/$tag>)\s*$/ ) {
			$txt = highlight_word($txt, $1);
			showline($bname, $line, $ansi{yellow}, "straggling </$tag>", $txt);
		}
	}
}
2008
# Check the whitespace of a DocBook line: runs of eight spaces that
# should be tabs, tabs inside content, and indents with an odd
# number of spaces.
# Arguments: file basename, line number, text of the line or segment.
sub doc_whitespace {
	my ($bname, $line, $txt) = @_;
	my $txtbak = $txt;

	# indents and tabs/spaces are not significant inside
	# ignorable SGML blocks
	return if $ignoreblock;

	# multiples of eight spaces at the start a line
	# (after zero or more tabs) should be a tab
	# the repetition is inside the capture so that $1 holds the
	# whole run; a quantified capture only keeps its last
	# repetition, which drops the rest from the displayed line
	if ( $txt =~ s/^((?:\t* {8})+)/$li$1$ri/ ) {
		showline($bname, $line, $ansi{darkmagenta}, 'use tabs instead of spaces', $txt);
	}

	# tabs hidden in paragraphs is also bad
	# (the greedy .* means the last tab on the line is highlighted)
	$txt = $txtbak;
	if ( $txt =~ s/^(\s*\S+)(.*)(\t)/$1$2$li$3$ri/ ) {
		showline($bname, $line, $ansi{darkmagenta}, 'tab in content', $txt);
	}

	# if coming out of an ignoreblock, odd spaces are
	# an artifact of splitting the line and can't be checked
	return if ( $prevline =~ /$ignoreblockstart|$ignoreblockend/ );

	# one or more occurrences of single tabs or double spaces,
	# followed by a single space, is a bad indent
	# if ( $txt =~ s/^((?:(?:  )+|(?:\t+))* )\b/$li$1$ri/ ) {

	# but simpler just to expand tabs to 8 spaces
	# and check for an odd number of spaces
	# (\b means only lines whose content starts with a word
	# character are tested here)
	$txt = $txtbak;
	$txt = expand_tabs($txt);
	if ( $txt =~ s/^((?:  )* )\b/$li$1$ri/ ) {
		showline($bname, $line, $ansi{darkred}, 'bad indent', $txt);
	}
}
2045
2046
2047# DocBook batch tests
2048
2049
2050
# Remember a line for comparison with the one that follows it.
# Argument: the line just processed.
# $prevline always receives the line.  $prevnonblank receives it only
# when it contains non-whitespace and is not treated as a comment.
sub saveprevline {
	my ($latest) = @_;

	$prevline = $latest;

	# blank lines never become the previous nonblank line
	return unless $latest =~ /\S+/;

	# treat comments as blank lines: anything containing a comment
	# start or ending with a comment end is not remembered
	return if ($latest =~ /\s*<!--/) || ($latest =~ /-->\s*$/);

	$prevnonblank = $latest;
}
2062
2063
# Main program: initialize, then run the selected tests on each file
# named on the command line.  With $opt_X set, output is wrapped in
# a checkstyle XML document with one <file> element per input file.
initialize();

if ( $opt_X ) {
	print "<?xml version=\"1.0\" encoding=\"UTF-8\"?>\n";
	print "<checkstyle version=\"7.1\">\n";
}

# main loop
foreach my $fname (@ARGV) {
	# the name 'stdin' is replaced by a file written from standard
	# input below, so it is not tested here
	if ( $fname ne 'stdin' ) {
		next if -d $fname;
		unless ( -f $fname ) {
			print "$fname: not found\n";
			next;
		}
		unless ( -r $fname ) {
			print "$fname: not readable\n";
			next;
		}
	}

	unless ( $opt_X ) {
		print "$fname:\n" if $#ARGV > 0;
	} else {
		print "  <file name=\"", xmlize($fname), "\">\n";
	}
	$fname = writestdinfile() if $fname eq "stdin";

	$bname = basename($fname);
	$tmpfile = '';
	$type = filetype($fname);

	# compressed files are tested from an uncompressed copy
	if ( $type =~ /gzip|bzip/ ) {
		$tmpfile = uncompress($fname, $type);
		$type = filetype($tmpfile);
	}

	print "detected file type:$type\n" if $verbose;

	# the file actually read; named in the error message so that a
	# failure on an uncompressed file does not report an empty name
	my $readname = $tmpfile ? $tmpfile : $fname;
	open $fh, '<', $readname or die "cannot open '$readname':$!\n";

	# reset for each new document
	init_mdoc_uniqxrefs() if $opt_g;	# mdoc see also xrefs
	init_mdoc_structure() if $opt_m;	# mdoc tag presence
	$ignoreblock = 0;		# ignore SGML block
	my $saveindent = '';	# SGML indent level

	# line-by-line tests
	while (<$fh>) {
		# limit output to one vulnerability ID
		if ( $vid ) {
			if ( $_ =~ /<vuln/ ) {
				print "checking for VID in '$_'\n" if $verbose;
				if ( $vid eq 'latest' ) {
					$startline = $. if $_ =~ /<vuln vid=/;
				} else {
					# the ID is matched literally, not as a pattern
					$startline = $. if $_ =~ /<vuln vid=\"\Q$vid\E\"/;
				}
				$stopline  = $. if $_ =~ /<\/vuln/;
				print "VID: startline=$startline, stopline=$stopline\n" if $verbose;
			}
		}

		# end if past specified ending line number
		last if $stopline && ($. > $stopline);

		chomp;

		# global tests
		abbrevs($bname, $., $_)         if $opt_a;
		badphrases($bname, $., $_)      if $opt_b;
		contractions($bname, $., $_)    if $opt_u;
		freebsdobsolete($bname, $., $_) if $opt_f;
		repeatedwords($bname, $., $_)   if $opt_r;
		spellingerrors($bname, $., $_)  if $opt_s;
		whitespace($bname, $., $_)      if $opt_w;

		# mdoc line tests
		if ( $type eq "troff" ) {
			# (this also skips saveprevline() for comment lines)
			next if /^\.\\\"/;	# ignore comments for these tests

			mdoc_whitespace($bname, $., $_) if $opt_p;
			mdoc_date($bname, $., $_)       if $opt_d;
			mdoc_sentence($bname, $., $_)   if $opt_e;
			mdoc_uniqxrefs($bname, $., $_)  if $opt_g;
			mdoc_structure($bname, $., $_)  if $opt_m;
		}

		# DocBook line tests
		if ( $type =~ /sgml|xml/ ) {
			$origline = $_;
			doc_stragglers($bname, $., $_)          if $opt_S;
			doc_tagstyle_whitespace($bname, $., $_) if $opt_t;
			for my $segment (splitter($_)) {
				if ( $segment =~ /($ignoreblockstart)/ ) {
					# when entering an ignore block, test the full
					# line for indentation unless it is a comment
					unless ( $origline =~ /^\s*<!--/ ) {
						doc_indentation($bname, $., $origline) if $opt_i;
						# test just the indent for whitespace
						my ($origindent, undef) = splitleading($origline);
						doc_whitespace($bname, $., $origindent) if $opt_W;
						$saveindent = leading_space($origline);
						# save the same state information as the main loop would
						saveprevline($saveindent . $1);
						# test just the leading whitespace
					}
					$ignoreblock++;
					next;
				} elsif ( $segment =~ /($ignoreblockend)/ ) {
					# restore the indent level at the end of an ignore block
					$ignoreblock--;
					$prevline = substr($saveindent,0,length($saveindent)-2) . $1;
					next;
				}
				doc_titles($bname, $., $segment)      if $opt_c;
				doc_indentation($bname, $., $segment) if $opt_i;
				doc_longlines($bname, $., $segment)   if $opt_l;
				doc_sentence($bname, $., $segment)    if $opt_n;
				doc_openclose($bname, $., $segment)   if $opt_o;
				doc_writestyle($bname, $., $segment)  if $opt_E;
				doc_whitespace($bname, $., $segment)  if $opt_W;
			}
		}
		saveprevline($_);
	}

	close $fh or die "could not close file:$!\n";

	# batch tests are skipped if a line range is set; this is part
	# of the condition because "last" here would leave the loop over
	# files, skipping the closing </file> tag, temporary file
	# removal, and all remaining files
	if ( ($opt_d || $opt_y) && !$opt_C ) {
		# slurp the whole file
		open $fh, '<', $readname or die "cannot open '$readname':$!\n";
		my $fulltext = do { local($/); <$fh> };
		close $fh or die "could not close file:$!\n";

		# global batch tests
		style($bname, $fulltext) if $opt_y;

		# mdoc batch tests
		# NOTE(review): every other showline() call visible here
		# passes a color as the third argument and the message as
		# the fourth; this call passes the message third.  Confirm
		# against the definition of showline().
		if ( ($type eq "troff") && ($opt_d) && (!$docdate) ) {
			showline($bname, '-', '.Dd date not set', '', '');
		}
	}

	if ( $opt_X ) {
		print "  </file>\n";
	}

	removetempfiles();
}

if ( $opt_X ) {
	print "</checkstyle>\n";
}
2221