#!/usr/bin/env perl
# ts=4
# Warren Block
# special thanks to Glen Barber for limitless
# patience and the use of his svn repository

# igor: check man pages and DocBook
# needs Perl 5.8 or higher

use strict;
use warnings;

#  Copyright (c) 2012 Warren Block
#  All rights reserved.
#
#  Redistribution and use in source and binary forms, with or without
#  modification, are permitted provided that the following conditions
#  are met:
#  1. Redistributions of source code must retain the above copyright
#     notice, this list of conditions and the following disclaimer.
#  2. Redistributions in binary form must reproduce the above copyright
#     notice, this list of conditions and the following disclaimer in the
#     documentation and/or other materials provided with the distribution.
#
#  THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND
#  ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
#  IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
#  ARE DISCLAIMED.  IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE
#  FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
#  DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
#  OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
#  HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
#  LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
#  OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
#  SUCH DAMAGE.

use Getopt::Std;
use File::Basename;
use POSIX qw/strftime/;

my $file  = "/usr/bin/file";
my $gzcat = "/usr/bin/gzcat";
my $bzcat = "/usr/bin/bzcat";
my $man   = "/usr/bin/man";

my $tmpdir = "/tmp";

my $rev = '$Revision$';

my ($fh, $tmpfile, $stdinfile, $docdate);

my ($prevline, $origline) = ('', '');
my $ignoreblock;
my $titleblock = 0;
my $today;

my ($ignoreblockstart, $ignoreblockend);
my %misspelled_words;
my @badphrases;
my @freebsdobs;
my ($lc_regex, $uc_regex, $ignoreregex);
my ($indent_regex, $inline_regex);
my (@straggler_tags, $literalblock_regex);
my (@openclose_tags, %opentag, $list_regex, $parawrap_regex);

my ($bname, $type);

my $prog = basename($0);

sub usage {
	$rev =~ /Revision: (\d+)/;
	my $version = "1.$1";
	print <<USAGE;
$prog $version
usage: $prog [-h] | [-abcdefilmnorstwyDRSW] file [file ...]

    -h  show summary of command line options and exit

    Output options
        -R  ANSI highlights (use with 'less -R')

    Tests
    	If individual test options are given, only those tests are done.

    Shortcuts
        -z  all standard non-whitespace tests
        -Z  all standard whitespace tests

    Tests for all files
        -a  abbreviations like "e.g.," and "i.e.,"
        -b  bad phrases
        -f  FreeBSD obsolete features
        -r  repeated words
        -s  spelling
        -w  whitespace
        -y  style suggestions (off by default)

    mdoc(7) tests
        -d  mdoc document date
        -e  sentences should begin on a new line
        -m  mdoc structure requirements
        -D  all but document date (same as -befmrsw)

    DocBook tests
        -c  title capitalization
        -i  indentation
        -l  long lines
        -n  sentences start with two spaces
        -o  open/close tags match
        -t  tag usage style
        -S  straggler tags on lines after content
        -W  whitespace on SGML indentation

    EXAMPLES

        $prog -R gpart.8.gz | less -R -S
        $prog -R -D -y /usr/share/man/man7/tuning.7.gz | less -R -S
        cat /usr/share/man/man1/csh.1.gz | $prog -D
        $prog -Rz chapter.sgml | less -RS
        $prog -R `find /usr/doc/en_US.ISO8859-1/ -name "*.sgml"` | less -RS
        $prog -RD /usr/share/man/man8/* | less -RS

    gzip and bzip2 files are automatically decompressed.
USAGE
	exit 0;
}

our ($opt_a, $opt_b, $opt_c, $opt_d, $opt_e, $opt_f, $opt_h, $opt_i,
	 $opt_l, $opt_m, $opt_n, $opt_o, $opt_r, $opt_s, $opt_t, $opt_w,
	 $opt_y, $opt_z, $opt_D, $opt_R, $opt_S, $opt_W, $opt_Z);

getopts('abcdefhilmnorstwyzDRSWZ');

usage() if $opt_h;

# ANSI color codes
my @colors = qw/ red green yellow blue magenta cyan /;
my %ansi;
my $inverse  = "\033[7m";
my $reset    = "\033[0;24;27m";
my $lf = '';	# filename
my $rf = '';
my $ll = '';	# line number
my $lr = '';
my $lh = '[';	# highlight
my $rh = ']';
my $li = '[';	# whitespace
my $ri = ']';

# mdoc macros
my @macros = (qw/ Dd Dt Os Sh_NAME Nm Nd Sh_SYNOPSIS Sh_DESCRIPTION /);
my %macroval;

sub INT_handler {
	( close $fh or die "could not close filehandle:$!\n" ) if fileno($fh);
	removetempfiles();
	exit 0;
}

sub initialize {
	$today = strftime("%B %e, %Y", localtime);
	$today =~ s/  / /g;

	# ANSI color codes
	for my $i (0..@colors-1) {
		$ansi{"dark$colors[$i]"} = "\033["   . ($i+31) . "m";
		$ansi{"$colors[$i]"}     = "\033[1;" . ($i+31) . "m";
	}

	# use ANSI highlights
	if ( $opt_R ) {
		$lf = $ansi{darkyellow};	# filename
		$rf = $reset;
		$ll = $ansi{darkcyan};		# line number
		$lr = $reset;
		$lh = $ansi{darkgreen};		# highlight
		$rh = $reset;
		$li = $inverse;				# whitespace
		$ri = $reset;
	}

	# -D equals -befmrsw
	if ( $opt_D ) {
		$opt_b = $opt_e = $opt_f = $opt_m = $opt_r = $opt_s
		= $opt_w = 1;
	} elsif ( $opt_z ) {
		# all non-whitespace tests
		$opt_a = $opt_b = $opt_c = $opt_d = $opt_e = $opt_f = $opt_m
		= $opt_o = $opt_r = $opt_s = $opt_t = 1;
	} elsif ( $opt_Z ) {
		# all whitespace tests
		$opt_i = $opt_l = $opt_n = $opt_t = $opt_w = $opt_S = $opt_W = 1;
	} else {
		# if no tests are chosen, do them all
		unless ( $opt_a || $opt_b || $opt_c || $opt_d || $opt_e
			  || $opt_f || $opt_i || $opt_l || $opt_m || $opt_n
			  || $opt_o || $opt_r || $opt_s || $opt_t || $opt_w
			  || $opt_y || $opt_S || $opt_W ) {
			$opt_a = $opt_b = $opt_c = $opt_d = $opt_e
				   = $opt_f = $opt_i = $opt_l = $opt_m = $opt_n
				   = $opt_o = $opt_r = $opt_s = $opt_t = $opt_w
				   = $opt_S = $opt_W = 1;
			$opt_y = 0;
		}
	}

	init_ignoreblocks();
	init_spellingerrors();
	init_badphrases();
	init_freebsdobs();
	init_mdoc_structure();
	init_doc_titles();
	init_doc_indentation();
	init_doc_openclose();
	init_doc_stragglers();

	# ctrl-c handler
	$SIG{'INT'} = 'INT_handler';
	# do the same thing if the pipe closes
	$SIG{'PIPE'} = 'INT_handler';

	# autoflush
	$| = 1;

	# allow stdin
	push @ARGV, "stdin" if $#ARGV < 0;
}

sub filetype {
	my $fname = shift;
	my $out = `$file -b $fname`;
	$out =~ /^(\S+\s+\S+)/;	# first two words
	if ( $1 ) {
		my $id = $1;
		for ( $id ) {
			if    ( /^troff/ )         { return "troff"   }
			elsif ( /^exported SGML/ ) { return "sgml"    }
			elsif ( /^gzip/ )          { return "gzip"    }
			elsif ( /^bzip/ )          { return "bzip"    }
			else                       { return "unknown" }
		}
	}
	return "unknown";
}

sub uncompress {
	my ($fname, $type) = @_;
	my $tmpfile = "$tmpdir/$prog-tmp.$$";
	for ( $type ) {
		if ( /gzip/ ) {
			system("$gzcat $fname > $tmpfile") == 0
				or die "could not create '$tmpfile':$!\n";
		}
		elsif ( /bzip/ ) {
			system("$bzcat $fname > $tmpfile") == 0
				or die "could not create '$tmpfile':$!\n";
		}
		else {
			die "unknown compression type '$type'\n";
		}
	}
	return $tmpfile;
}

sub writestdinfile {
	$stdinfile = "$prog-stdin.$$";
	open $fh, ">", $stdinfile or die "could not create '$stdinfile':$!\n";
	print $fh <STDIN>;
	close $fh or die "could not close '$stdinfile':$!\n";
	return $stdinfile;
}

sub removetempfiles {
	if ( $stdinfile && -f $stdinfile ) {
		unlink $stdinfile or die "could not remove '$stdinfile':$!\n";
	}
	if ( $tmpfile && -f $tmpfile ) {
		unlink $tmpfile   or die "could not remove '$tmpfile':$!\n";
	}
}

sub showline {
	my ($bname, $linenum, $color, $errordesc, $txt) = @_;
	print "$lf$bname$rf:";
	print "$ll$linenum$lr:";
	print $color if $opt_R;
	print "$errordesc";
	print $reset if $opt_R;
	print ":$txt\n";
}

sub init_ignoreblocks {
	# create regex for sgml block start and end
	my @ignoreblock_tags = qw/ literallayout screen programlisting /;
	$ignoreblockstart = '(?:<!--|<!\[';
	for my $tag (@ignoreblock_tags) {
		$ignoreblockstart .= "|<$tag.*?>";
	}
	$ignoreblockstart .= ')';
	$ignoreblockend = '(?:-->|\]\]>';
	for my $tag (@ignoreblock_tags) {
		$ignoreblockend .= "|<\/$tag>";
	}
	$ignoreblockend .= ')';
}

# global tests

sub abbrevs {
	my ($bname, $line, $txt) = @_;
	return if $ignoreblock;
	my $txtbak = $txt;;

	if ( $txt =~ /\We\.?g\.[^,:]/i ) {
		$txt =~ s/(e\.?g\.)/$lh$1$rh/i;
		showline($bname, $line, $ansi{darkmagenta}, 'no comma after "e.g."', $txt);
	}

	$txt = $txtbak;
	if ( $txt =~ /\Wi\.?e\.[^,:]/i ) {
		$txt =~ s/(i\.?e\.)/$lh$1$rh/i;
		showline($bname, $line, $ansi{darkmagenta}, 'no comma after "i.e."', $txt);
	}

	$txt = $txtbak;
	if ( $txt =~ /\Wa\.k\.a\./i ) {
		$txt =~ s/(a\.k\.a\.)/$lh$1$rh/i;
		showline($bname, $line, $ansi{darkmagenta}, 'use "aka"', $txt);
	}

	$txt = $txtbak;
	if ( $txt =~ /\Wvs(?:\.|\s)/i ) {
		$txt =~ s/(vs\.)/$lh$1$rh/i;
		showline($bname, $line, $ansi{darkmagenta}, '"versus" abbreviated', $txt);
	}
}

sub init_badphrases {
	@badphrases = ("chroot'd", "compress'd", 'equally as', "gzip'd",
				   'the to', 'this mean that', 'to for');
}

sub badphrases {
	my ($bname, $line, $txt) = @_;
	my $txtbak = $txt;
	for my $bad (@badphrases) {
		$txt = $txtbak;
		if ( $txt =~ s/(\b\Q$bad\E)\b/$lh$1$rh/i ) {
			showline($bname, $line, $ansi{yellow}, 'bad phrase', $txt);
		}
		# detect bad phrases wrapping over two lines
		# skip this test if the phrase was all on one line
		next if ( $prevline =~ /\Q$bad\E\b/i );
		$txt = "$prevline $txtbak";
		if ( $txt =~ /\Q$bad\E\b/i ) {
			my @right = split /\s/, $bad;
			my @left  = ();
			my $leftstr = '';
			while ( @right ) {
				push @left, shift @right;
				$leftstr = join ' ',@left;
				last if ( $prevline =~ /(\Q$leftstr\E)\s*$/i );
			}
			unless ( $leftstr =~ /\Q$bad\E/ ) {
				showline($bname, $line - 1, $ansi{yellow}, 'bad phrase',
					"... $lh$leftstr$rh");
				$txt = $txtbak;
				my $rightstr = join ' ', @right;
				$txt =~ s/(\Q$rightstr\E)/$lh$1$rh/i;
				showline($bname, $line, $ansi{yellow}, 'bad phrase', $txt);
			}
		}
	}
}

sub init_freebsdobs {
	@freebsdobs = qw/ cvsup /;
}

sub freebsdobsolete {
	my ($bname, $line, $txt) = @_;
	for my $word (@freebsdobs) {
		if ( $txt =~ s/(\s+)($word)([^.]+.*)$/$1$lh$2$lr$3/ ) {
			showline($bname, $line, $ansi{darkgreen}, 'freebsd-obsolete', $txt);
		}
	}
}

sub repeatedwords {
	my ($bname, $line, $txt) = @_;
	my $txtbak = $txt;
	my %count = ();
	map { $count{$_}++ } (split /\W+/, $txt);
	for my $word (keys %count) {
		next if $count{$word} < 2;
		next if $word =~ /\d{1}/;
		# special cases
		# skip Fl mdoc command
		next if $word =~ /Fl|Ns|Oc|Oo/;
		$txt = $txtbak;
		if ( $txt =~ s/(^|\s+)(\Q$word\E)(\s+)(\Q$word\E)(\W+)/$1$lh$2$3$4$rh$5/i ) {
			showline($bname, $line, $ansi{darkred}, 'repeated', $txt);
		}
	}
	# check for repeated word from the end of the previous line
	# to the beginning of the current line
	# $prevline =~ m%(\w+\s+)*([^ *.#|+-]+\s*)$%;
	$prevline =~ m%(\w+\s+)*(\S+\s*)$%;
	my $cmd = ($1 ? $1 : '');
	my $prevlastword = ($2 ? $2 : '');
	# short-circuit when the previous line...
	# had no last word
	return unless $prevlastword;
	# didn't repeat any of the words on the current line
	$count{$prevlastword}++;
	return unless $count{$prevlastword} > 1;
	# was a groff(7) comment
	return if $prevlastword eq '.c';
	# was a groff(7) zero-space character for tables (\&.)
	return if $prevlastword eq '\&.';
	# was a single non-word character
	return if $prevlastword =~ /^\W{1}$/;
	# was an mdoc(7) or nroff(7) comment
	return if $prevlastword =~ /^\W{1}\\\"/;
	# was an mdoc command
	return if $prevlastword =~ /\.(?:Ar|Oo|Nm|Tp)/i;
	# when the next-to-last word was an mdoc command
	return if $cmd =~ /Ar |Cm |Fa |Em |Ic |Ip |It |Li |Pa |Ss /i;
	if ( $txt =~ s/^\s*(\Q$prevlastword\E)(\s+.*)$/$lh$1$rh$2/ ) {
		showline($bname, $line - 1, $ansi{darkred}, 'repeated',
			"... $cmd$lh$prevlastword$rh");
		showline($bname, $line, $ansi{darkred}, 'repeated', $txt);
	}
}

sub init_spellingerrors {
	for my $word (qw/ abondan acceptible acces accesed accesing accomodate accross achitecture
			acknowledgent addresss adhearance adminstrator advices aggregatable albel albels
			alot alright ammount ande annonymous anonymus anormalous approprate aqueue arbitary
			arguements aritmetic aritmetics assocation assoicated asynchroneous authorty
			automaticaly availabe availible availablity begining belive besure boostrap
			bootsrap boundries boundry brower browseable calcualted cannonical cant captial
			caracteristics certian certificat choise chronologocal cince colision colisions
			comiters comming commited commiter commiters commiting comparision compability
			compatability compatiblity conjuction connecter connecters connectin connnects
			consistant coordinatory credentail credentails currenly deamon debuging decidely
			deicde decribed dependant dependancy dependancys dependendencies dependiency
			descendents detec detecing developement devide dictonary dieing differenciate
			diffrent diffsof directorys diretories diretory documenation doesen ect efficent
			emporer enviornment esle everytime exagerate excercize excercize exibits existance
			explicitely exponentionally extemely extention extentions extremly facilites
			failback formated forusers foward freind fucntion fuction fulfil funtion futher
			guarentee guarentees hapen hereon hierachy hiearchy hte idosyncracies improvments
			incomming indended indentical independant indepth inital inititialization inputed
			installtion intepretation interchangably intial isonly issueing intall kernal
			knowlege lable lables langage languge layed libary libraru linerly loosing
			managment manaul mangagement maintainance minumum multipled myst neccessary
			negociated nomally noone occured ommit ommited ommitt ommitted optiion ouput
			overriden paramtere paramters partion partions partiton partitons pathes pepetual
			pepetually perfom perfoms peripherial peripherials phoneix physial platfrom
			posseses preceed preceeded preceeds prefered prefferred preform presense pricipal
			proceedure progam progams propogate protcol protcols provde recieved refering
			refulat relevent reloation reloations resemblence respecitively rewriten
			sepcifies sepcify seperate seperated seperates seperation setable seting setings
			settt shuting significnat simultanious slighly sofware soley someway spearator
			specifes specifiy specifiying splitted stantdard steping subet subsytems succeds
			succesful supressed supresses surpressed synonomous sytem sytems teh termporary
			thefirst therefor throughly todays tpye tradtional transfered transfering
			translater translaters truely typicaly undesireable unfreezed unlinke
			unresolveable upto usally usefull usse wich wierd withough wor wsouse /) {
		$misspelled_words{$word} = 1;
	}
}

sub spellingerrors {
	my ($bname, $line, $txt) = @_;
	my $txtbak = $txt;
	my @words = split /\W+/, lc($txt);
	for my $currentword (@words) {
		if ( $misspelled_words{$currentword} ) {
			$txt = $txtbak;
			$txt =~ s/\b(\Q$currentword\E)\b/$lh$1$rh/i;
			showline($bname, $line, $ansi{darkmagenta}, 'spelling', $txt);
		}
	}
}

sub whitespace {
	my ($bname, $line, $txt) = @_;
	my $txtbak = $txt;
	if ( $txt =~ s/^(\s+)$/$li$1$ri/ ) {
		showline($bname, $line, $ansi{darkblue}, 'blank line with whitespace', $txt);
	}
	$txt = $txtbak;
	if ( $txt =~ s/(\S+)(\s+)$/$1$li$2$ri/ ) {
		showline($bname, $line, $ansi{darkblue}, 'trailing whitespace', $txt);
	}
	$txt = $txtbak;
	if ( $txt =~ s/( +)\t+/$li$1$ri/ ) {
		showline($bname, $line, $ansi{darkmagenta}, 'tab after space', $txt);
	}
}


# global batch tests
sub style {
	my ($bname, $txt) = @_;
	print "$lf$bname style check:$rf\n";

	my $you = ($txt =~ s/you\b/you/gi);
	my $your = ($txt =~ s/your/your/gi);
	if ( $you || $your ) {
		print "  $lh\"you\" used $you time", ($you==1 ? '':'s'), "$rh\n" if $you;
		print "  $lh\"your\" used $your time", ($your==1 ? '':'s'), "$rh\n" if $your;
		print "    \"You\" and \"your\" are informal and subjective.\n";
		print "    Try to be formal and objective: \"the file\" rather than \"your file\".\n";
	}

	my $should = ($txt =~ s/should/should/gi);
	if ( $should ) {
		print "  $lh\"should\" used $should time", ($should==1 ? '':'s'), "$rh\n";
		print "    Use \"should\" sparingly, it is feeble.\n";
		print "    Try to be imperative: \"do this\" rather than \"you should do this\".\n";
	}

	my $obviously = ($txt =~ s/obviously/obviously/gi);
	if ( $obviously ) {
		print "  $lh\"obviously\" used $obviously time", ($obviously==1 ? '':'s'), "$rh\n";
		print "    If it is really obvious, it does not need to be pointed out.\n";
	}

	my $needless  = ($txt =~ s/needless to say/needless to say/gi);
	if ( $needless ) {
		print "  $lh\"needless to say\" used $needless time", ($needless==1 ? '':'s'), "$rh\n";
		print "    If it doesn't need to be said, why say it?\n";
	}

	my $simply = ($txt =~ s/simply/simply/gi);
	my $basically = ($txt =~ s/basically/basically/gi);
	if ( $simply || $basically ) {
		print "  $lh\"simply\" used $simply time", ($simply==1 ? '':'s'), "$rh\n" if $simply;
		print "  $lh\"basically\" used $basically time", ($basically==1 ? '':'s'), ".$rh\n" if $basically;
		print "    Can be read as patronizing.\n";
	}

	my $the = ($txt =~ s/(?:^the|\.\s+the)\b/the/gi);
	my $sent = ($txt =~ s/([^.]+\.\s+)/$1/gi);
	my $percent = ($sent > 0 ? int($the/$sent*100) : 0);
	if ( $the && ($percent > 19) ) {
		print "  $lh\"The\" used to start a sentence $the time", ($the==1 ? '':'s'), " in $sent sentence", ($sent==1 ? '':'s'), " ($percent%)$rh\n";
		print "    Starting too many sentences with \"the\" can be repetitive\n";
		print "    and dull to read.\n";
	}

	my $eg = ($txt =~ s/e\.g\./e.g./gi);
	my $ie = ($txt =~ s/i\.e\./i.e./gi);
	if ( $eg ) {
		print "  $lh\"e.g.\" used $eg time", ($eg==1 ? '':'s'), "$rh\n";
		print "    \"E.g.\" (Latin \"exempli gratia\") means \"${lf}for example$rf\" and is mostly\n";
		print "    used in academic and scientific writing.  Consider replacing with the\n";
		print "    more common English words.  Both forms are usually followed by a\n";
		print "    comma for a verbal pause:  \"e.g., a b c\" or \"for example, a b c\"\n";
	}
	if ( $ie ) {
		print "  $lh\"i.e.\" used $ie time", ($ie==1 ? '':'s'), "$rh\n";
		print "    \"I.e.\" (Latin \"id est\") means \"${lf}that is$rf\" and is mostly used in academic\n";
		print "    and scientific writing.  Consider replacing with the more common\n";
		print "    English words.  Both forms are usually followed by a comma for\n";
		print "    a verbal pause:  \"i.e., a b c\" or \"that is, a b c\"\n";
	}

	# type-specific tests
	if ( $type eq "troff" ) {
		my $examples = ($txt =~ /\n\.\s*Sh\s+EXAMPLES/i);
		unless ( $examples ) {
			print "  ${lh}no \"EXAMPLES\" section found$rh\n";
			print "    Even trivial examples can improve clarity.\n";
			print "    Common-use examples are better yet.\n";
		}
	}
}

# mdoc line-by-line tests
sub mdoc_date {
	my ($bname, $line, $txt) = @_;
	if ( $txt =~ s/^\.\s*Dd\s+(.*)$/$lh$1$rh/ ) {
		$docdate = $1 if $1;
		showline($bname, $line, $ansi{darkyellow}, 'date not today', $txt) if $docdate ne $today;
	}
}

sub mdoc_sentence {
	my ($bname, $line, $txt) = @_;
	if ( $txt =~ s/^(\w{2,}.*?[^ .]{2,}\.\s+)(A |I |\w{2,})(.*)$/$1$lh$2$3$rh/ ) {
		showline($bname, $line, $ansi{darkcyan}, 'sentence not on new line', $txt);
	}
}

sub showmacvals {
	my ($lastmacro, $bname, $line) = @_;
	for my $macro (@macros) {
		last if $macro eq $lastmacro;
		unless ( $macroval{$macro} ) {
			showline($bname, $line, $ansi{red}, ".$lastmacro used here", "but .$macro has not been defined");
		}
	}
}

sub init_mdoc_structure {
	for my $macro (@macros) {
		$macro =~ tr/_/ /;
		$macroval{$macro} = '';
	}
}

sub mdoc_structure {
	my ($bname, $line, $txt) = @_;
	return unless substr($txt, 0, 1) eq '.';

	# check for required minimum macros
	my $parm;
	for my $macro (@macros) {
		$parm = '';
		$macro =~ tr/_/ /;
		next if $macroval{$macro};
		if ( $txt =~ /^\.\s*\Q$macro\E\s*(.*)/i ) {
			my $parm = $1;
			# provide a blank parameter for macros with optional parameters
			$parm = ' ' if ($macro =~ /^Os|Sh NAME|Sh SYNOPSIS|Sh DESCRIPTION/) && (!$parm);
			$macroval{$macro} = $parm;
			showmacvals($macro, $bname, $line);
			last;
		}
	}

	# check external refs (.Xr)
	# suggested by Glen Barber
	return unless $txt =~ /^.Xr/;

	# characters to treat as whitespace in an Xr macro
	my $wspace = '[ (),.:]';
	# character class for section numbers
	# an initial number possibly followed by a letter
	my $sect = '\d{1}[A-Za-z]?';

	my $xname = '';
	$xname = $1 if $txt =~ /^.Xr$wspace+(\S+)/;
	my $xsection = '';
	$xsection = $1 if $txt =~ /^.Xr$wspace+\S+$wspace+($sect)/;

	if ( ! $xname ) {
		showline($bname, $line, $ansi{yellow}, 'xref name missing', $txt);
		return;
	}

	if ( $xname =~ /\($sect\)/ ) {
		$txt =~ s/($xname)/$lh$1$rh/;
		showline($bname, $line, $ansi{yellow}, 'section number in name', $txt);
		return;
	}

	if ( $xsection && ($xsection gt "9") ) {
		$txt =~ s/^(.Xr$wspace+\S+$wspace+)($sect)/$1$lh$2$rh/;
		showline($bname, $line, $ansi{yellow}, 'section higher than 9', $txt);
		# no point in checking for sections higher than 9
		return;
	}

	system("$man -w $xsection $xname >/dev/null 2>&1");
	if ( $? ) {
		if ( $xsection ) {
			$txt =~ s/^(.Xr$wspace+)(\S+$wspace+$sect)/$1$lh$2$rh/;
		} else {
			$txt =~ s/^(.Xr$wspace+)(\S+)/$1$lh$2$rh/;
		}
		showline($bname, $line, $ansi{darkmagenta}, 'external man page not found', $txt);
		# not found, no point in checking if it's this one
		return;
	}

	# is this external reference referring to itself?
	# skip if the .Nm macro has no value
	return if $macroval{'Nm'} ne $xname;
	my $currsection = '';
	if ( $macroval{'Dt'} =~ /^\S+\s+($sect)/ ) {
		$currsection = $1;
	}
	return if $xsection ne $currsection;
	if ( $xsection && $currsection ) {
			$txt =~ s/^(.Xr$wspace+)(\S+$wspace+$sect)/$1$lh$2$rh/;
		} else {
			$txt =~ s/^(.Xr$wspace+)(\S+)/$1$lh$2$rh/;
		}
	showline($bname, $line, $ansi{darkmagenta}, 'xref refers to *this* page', $txt);
}


# DocBook line-by-line tests

sub is_lowercase {
	my $word = shift;
	return $word =~ /^[a-z]{1}/;
}

sub is_uppercase {
	my $word = shift;
	return $word =~ /^[A-Z]{1}/;
}

sub highlight_word {
	my ($txt, $word) = @_;
	$txt =~ s/\Q$word\E/$lh$word$rh/g;
	return $txt;
}

sub highlight_string {
	my $txt = shift;
	return "$lh$txt$rh";
}

sub expand_tabs {
	my $txt = shift;
	$txt =~ s/\t/        /g;
	return $txt;
}

sub leading_space {
	my $txt = shift;
	my $leading;
	$txt =~ /^(\s+)/;
	$leading = ($1 ? $1 : '');
	$leading = expand_tabs($leading);
	return $leading;
}

sub splitter {
	my $txt = shift;
	return ($txt) unless ( $txt =~ /$ignoreblockstart|$ignoreblockend/ );
	my @split = split /($ignoreblockstart|$ignoreblockend)/, $txt;
	return grep { ! /^$/ } @split;
}

sub init_doc_titles {
	# build regex of words that should be lowercase in titles
	my @lc_words = qw/ a an and at by down for from in into like near
					   of off on onto over past the to upon with /;
	$lc_regex = '(?:' . join('|', @lc_words) . ')';
	my @uc_words = qw/ new use one two three four five six seven eight nine /;
	$uc_regex = '(?:' . join('|', @uc_words) . ')';

	# build regex for ignoring DocBook tagged words in titles
	# like <command>ls</command>
	my @ignoretags = qw/ acronym application command filename function
						 hostid literal makevar replaceable /;
	for my $tag (@ignoretags) {
		$tag = "<$tag.*?>.*?<\/$tag>";
	}
	$ignoreregex = '<anchor.*?>|' . join('|', @ignoretags)
}

sub doc_titles {
	my ($bname, $line, $txt) = @_;
	my $txtbak = $txt;

	return if $ignoreblock;
	$titleblock = 1 if $txt =~ /<title/;
	return unless $titleblock;

	my @words;

	# take the text from between title tags, or the
	# whole line if a title tag is not present
	# split the result into an array of words, keeping
	# ignorable DocBook tags wrapped around text
	if ( ($txt =~ /<title.*?>(.*?)(?:<\/title>|$)/)
		 || ($txt =~ /(.*)(?:<\/title>)/) ) {
		# @words = grep (! /^\s*$/, split /($ignoreregex|\s+)/, $1);
		@words = split /($ignoreregex|\s+)/, $1;
	} else {
		# @words = grep (! /^\s*$/, split /($ignoreregex|\s+)/, $txt);
		@words = split /($ignoreregex|\s+)/, $txt;
	}

	# filter out single tags like <anchor id="something">
	@words = grep { ! /<anchor.*?>/ } @words;

	# use AP style: capitalize words longer than three letters
	WORD: for my $i (0..$#words) {
		my $word = $words[$i];

		next WORD if $word =~ /$ignoreregex/;

		# special case: skip the contents of some unfinished tags
		# <title>Configuring <acronym role="Domain Name
		#   System">DNS</acronym></title>
		next WORD if $word =~ /(?:role)=/;

		# special case: allow single lowercase "s" for plurals
		next WORD if $word eq 's';

		# special case words that should not be capitalized
		next WORD if $word =~ /^(?:amd64|i386|x86)$/;

		# first word should be capitalized
		if ( ($txt =~ /<title/) && ($i == 0) ) {
			if ( is_lowercase($word) ) {
				$words[$i] = highlight_string($word);
			}
			# first word is special, skip other tests
			next WORD;
		}

		# last word should be capitalized
		if ( ($txt =~ /<\/title/) && ($i == $#words) ) {
			if ( is_lowercase($word) ) {
				$words[$i] = highlight_string($word);
			}
			# last word is special, skip other tests
			last WORD;
		}

		# words that should be lower case
		if ( is_uppercase($word) ) {
			if ( $word =~ /^$lc_regex$/i ) {
				$words[$i] = highlight_string($word);
				next WORD;
			}
		}

		# words that should be upper case
		if ( is_lowercase($word) ) {
			if ( $word !~ /^$lc_regex$/i ) {
				if ( (length($word) > 3) ) {
					$words[$i] = highlight_string($word);
					next WORD;
				}
			}
			if ( $word =~ /^$uc_regex$/i ) {
				$words[$i] = highlight_string($word);
				next WORD;
			}
		}
	}

	# reconstruct the now-capitalized title
	$txt = '';
	$txt = $1 if $txtbak =~ /^(.*<title.*?>)/;
	$txt .= join('', @words);
	$txt .= $1 if $txtbak =~ /(<\/title.*?>)/;

	if ( $txt ne $txtbak ) {
		showline($bname, $line, $ansi{blue}, 'capitalization', $txt);
	}

	$titleblock = 0 if $txt =~ /<\/title>/;
}

sub init_doc_indentation {
	# build regex for detecting DocBook tags that begin or
	# end an indented section
	my @indent_tags = qw/ abstract answer appendix article articleinfo
						  author authorgroup book bookinfo callout
						  calloutlist chapter chapterinfo caution entry
						  example formalpara important indexterm
						  informaltable itemizedlist keycombo listitem
						  note orderedlist para procedure qandaentry
						  qandaset question row screenco sect1 sect2
						  sect3 sect4 sect5 step surname table tbody
						  tgroup thead tip title variablelist
						  varlistentry warning /;
	$indent_regex = '(?:' . join('|', @indent_tags) . ')';
	# build regex for inline tags like
	# <filename>blah</filename>
	my @inline_tags = qw/ acronym application citetitle command devicename
						  emphasis errorname filename firstterm footnote
						  guimenu hostid imagedata link literal makevar
						  optional parameter primary quote replaceable
						  secondary see term ulink /;
	$inline_regex = '(?:' . join('|', @inline_tags) . ')';
}

sub doc_indentation {
	my ($bname, $line, $currline) = @_;

	# indents are not significant inside ignorable SGML blocks.
	return if $ignoreblock;

	return if $currline =~ /^\s*<!--.*-->\s*$/;

	# \b is needed here to prevent <parameter> being detected as <para>
	return unless $prevline =~ /<\/*$indent_regex\b.*?>/;

	my $prev_indent = length(leading_space($prevline));
	my $curr_indent = length(leading_space($currline));

	# indent once for open tag on previous line
	$prev_indent += 2 if $prevline =~ /<$indent_regex/;

	# allow for inline tag indenting, like
	# <link
	#   url=
	# or
	# <makevar>xyz
	#   abc</makevar>
	my $count = 0;
	$count += ($prevline =~ s/(<$inline_regex)\b/$1/g);
	$count -= ($prevline =~ s/(<\/$inline_regex)\b/$1/g);
	$prev_indent += (2 * $count);

	# if previous line ends in an open xref, indent
	$prev_indent += 2 if ($prevline =~ /<xref\s*$/);

	# <xref> has no close tag, but uses "linkend=" the same as <link>
	# which *does* have a close tag... so if there's a linkend= on
	# previous line but no </ulink> or </link> on either previous
	# or current lines, assume it's an xref and outdent
	my $broken_regex = '(?:(?:linkend|url)=)';
	if ( $prevline =~ /^\s*$broken_regex/ ) {
		if ($prevline !~ /<\/(?:link|ulink)/) {
			if ($currline !~ /<\/(?:link|ulink)/) {
				$prev_indent -= 2;
			}
		}
	}

	# outdent for close tag at end of previous line
	$prev_indent -= 2 if ($prevline =~ /\S+.*<\/$indent_regex>\s*$/);

	# outdent for close tag at the start of this line
	$prev_indent -= 2 if ($currline =~ /^\s*<\/$indent_regex/);

	# outdent after footnote
	$prev_indent -=2 if $prevline =~ /<\/para><\/footnote>/;

	# close tags after long sections of nonindented blocks,
	# like the end of a programlisting, cannot be correctly
	# checked for indentation in this hacky way, so ignore them
	if ( ($prevline =~ /$ignoreblockstart|$ignoreblockend/)
		|| ($currline =~ /$ignoreblockend/) ) {
		$curr_indent = $prev_indent;
	}

	if ( $curr_indent != $prev_indent ) {
		my $out = $origline;
		$out =~ s/(^\s+)/$li$1$ri/;
		showline($bname, $line, $ansi{darkred}, 'bad tag indent', $out);
	}
}

sub doc_longlines {
	my ($bname, $line, $txt) = @_;
	return if $ignoreblock;
	# if line is longer than 70 chars
	# and the part after the indent has spaces
	# this should be smarter, like seeing if the part before the space
	# will benefit from wrapping

	# special-case
	return if $txt =~ /<!DOCTYPE|<!ENTITY/;

	$txt = expand_tabs($txt);

	if ( length($txt) > 70 ) {
		$txt =~ /^(\s*)(.*)/;
		my $inspace = ($1 ? $1 : '');
		my $content = ($2 ? $2 : '');
		my $currline = substr($content, 0, 70 - length($inspace));
		my $nextline = substr($content, length($currline));
		if ( $currline =~ / / ) {
			$currline =~ s/^(.*)? (.*)$/$1$li $ri$2/;
			showline($bname, $line, $ansi{green}, 'wrap long line', "$inspace$currline$nextline");
		} elsif ( $nextline =~ s/ /$li $ri/ ) {
			showline($bname, $line, $ansi{green}, 'wrap long line', "$inspace$currline$nextline");
		}
	}
}

sub doc_sentence {
	my ($bname, $line, $txt) = @_;
	return if $ignoreblock;
	return unless $txt =~ /(?:.|\?|!)/;

	if ( $txt =~ /\. \S/ ) {
		# avoid detecting initials and other abbreviations as sentence endings
		if ( $txt !~ /(?: [A-Z]{1}|Dr|Ed|Inc|Jr|Mass|Pub|Sp|Jan|Feb|Mar|Apr|May|Jul|Aug|Sep|Oct|Nov|Dec|\..{1})\./i ) {
			$txt =~ s/(?:\.|\?|!)( )(\S)/.$li$1$ri$2/;
			showline($bname, $line, $ansi{darkblue}, 'use two spaces at sentence start', $txt);
		}
	}
}

sub init_doc_openclose {
	@openclose_tags = qw/ entry footnote listitem para row /;
	for my $tag (@openclose_tags) {
		$opentag{$tag} = 0;
	}
	my @list_tags = qw/ itemizedlist orderedlist variablelist /;
	$list_regex = join('|', @list_tags);
	my @parawrap_tags = qw/ footnote listitem /;
	$parawrap_regex = join('|', @parawrap_tags);
}

sub doc_openclose {
	my ($bname, $line, $txt) = @_;
	return if $ignoreblock;
	return unless $txt =~ /</;

	my @chunks = split(/(<\S*?>)/, $txt);
	@chunks = grep (! /^\s*$/, @chunks);

	for my $chunk (@chunks) {
		$chunk =~ s/^\s*//;
		next unless $chunk =~ /</;

		for my $tag (@openclose_tags) {

			# check for open without close
			if ( $opentag{$tag} && $chunk =~ /<$tag\W/ ) {
				$txt =~ s/(<$tag\W)/$lh$1$rh/;
				showline($bname, $line, $ansi{red}, "open <$tag> without closing", $txt);
			}

			# check for close without open
			if ( ! $opentag{$tag} && $chunk =~ /<\/$tag\W/ ) {
				$txt =~ s/(<\/$tag\W)/$lh$1$rh/;
				showline($bname, $line, $ansi{red}, "close </$tag> without opening", $txt);
			}

			# evaluate closes
			$opentag{$tag} = 0 if $chunk =~ /<\/$tag\W/;

			# special-case closes
			# <para> can be inside footnotes or listitems
			$opentag{'para'} = 0 if $chunk =~ /<(?:$parawrap_regex)\W/;
			$opentag{'para'} = 0 if $chunk =~ /<\/(?:$list_regex)/;

			# list tags like <itemizedlist> start a new list
			# so 'listitem' is no longer open
			$opentag{'listitem'} = 0 if $chunk =~ /<(?:$list_regex)/;


			# evaluate opens
			$opentag{$tag} = 1 if $chunk =~ /<$tag\W/;

			# special-case opens
			$opentag{'para'} = 1 if $chunk =~ /<\/(?:$parawrap_regex)\W/;
			$opentag{'para'} = 1 if $chunk =~ /<(?:$list_regex)/;

			# list tags like </itemizedlist> end a list
			# so 'listitem' is open again
			$opentag{'listitem'} = 1 if $chunk =~ /<\/(?:$list_regex)/;
		}
	}
}

sub doc_tagstyle {
	my ($bname, $line, $currline) = @_;
	return if $ignoreblock;

	if ( $currline =~ /^\s*\S+\s*<title/ ) {
		showline($bname, $line, $ansi{yellow}, 'put <title> on new line', $currline);
	}

	# <programlisting> should not be used as an inline tag
	if ( $currline =~ s/(\S+\s*<programlisting.*?>)/$lh$1$rh/ ) {
		showline($bname, $line, $ansi{yellow}, 'do not use <programlisting> inline in other elements', $currline);
	} elsif ( ($currline =~ /\s*<programlisting/)
		 && ($prevline !~ /<\/(?:entry|formalpara|indexterm|para|programlisting|screen|title)>\s*$/) ) {
		# <programlisting> should not be used inside elements
		$currline =~ s/(<programlisting.*?>)/$lh$1$rh/;
		showline($bname, $line, $ansi{yellow}, 'do not use <programlisting> inside other elements', $currline);
	}
}

sub init_doc_stragglers {
	@straggler_tags = qw/ command entry literal para title /;
	$literalblock_regex = 'literallayout|programlisting|screen';
}

sub doc_stragglers {
	my ($bname, $line, $txt) = @_;

	# check for literal start tags without listing on the same line
	my $tag;
	if ( $txt =~ />\s*$/ ) {
		if ( $txt =~ /<($literalblock_regex)[^<]?>$/ ) {
			$tag = $1;
			$txt =~ s/(<$tag[^<]?>)$/$lh$1$rh/;
			showline($bname, $line, $ansi{yellow}, "put <$tag> listing on same line", $txt);
			return;
		} elsif ( $txt =~ /^\s*<\/($literalblock_regex)[^<]?>/ ) {
			$tag = $1;
			$txt =~ s/(<\/$tag[^<]?>)$/$lh$1$rh/;
			showline($bname, $line, $ansi{yellow}, "straggling </$tag>", $txt);
			return;
		}
	}

	# the following tests are only for close tags at the start of a line
	return unless $txt =~ /^\s*<\//;

	return if $ignoreblock;

	# stragglers can't be detected when coming out of an ignore block
	return if ( $prevline =~ /$ignoreblockstart|$ignoreblockend/ );

	# more special-case hackery to handle
	#   </table>
	# </para>
	if ( ($prevline =~ /<\/table>\s*$/)
		&& ($txt =~ /^\s*<\/para>\s*$/) ) {
		return;
	}

	for my $tag (@straggler_tags) {
		if ( $txt =~ /^\s*(<\/$tag>)\s*$/ ) {
			$txt = highlight_word($txt, $1);
			showline($bname, $line, $ansi{yellow}, "straggling </$tag>", $txt);
		}
	}
}

sub doc_whitespace {
	my ($bname, $line, $txt) = @_;
	my $txtbak = $txt;

	# indents and tabs/spaces are not significant inside
	# ignorable SGML blocks
	return if $ignoreblock;

	# multiples of eight spaces at the start a line
	# (after zero or more tabs) should be a tab
	if ( $txt =~ s/^(\t* {8})+/$li$1$ri/g ) {
		showline($bname, $line, $ansi{darkmagenta}, 'use tabs instead of spaces', $txt);
	}

	# tabs hidden in paragraphs is also bad
	$txt = $txtbak;
	if ( $txt =~ s/^(\s*\S+)(.*)(\t)/$1$2$li$3$ri/ ) {
		showline($bname, $line, $ansi{darkmagenta}, 'tab in content', $txt);
	}

	# if coming out of an ignoreblock, odd spaces are
	# an artifact of splitting the line and can't be checked
	return if ( $prevline =~ /$ignoreblockstart|$ignoreblockend/ );

	# one or more occurrences of single tabs or double spaces,
	# followed by a single space, is a bad indent
	# if ( $txt =~ s/^((?:(?:  )+|(?:\t+))* )\b/$li$1$ri/ ) {

	# but simpler just to expand tabs to 8 spaces
	# and check for an odd number of spaces
	$txt = $txtbak;
	$txt = expand_tabs($txt);
	if ( $txt =~ s/^((?:  )* )\b/$li$1$ri/ ) {
		showline($bname, $line, $ansi{darkred}, 'bad indent', $txt);
	}
}


# DocBook batch tests


initialize();


# main loop
foreach my $fname (@ARGV) {
	next unless -f $fname;
	print "$fname:\n" if $#ARGV > 0;
	$fname = writestdinfile() if $fname eq "stdin";

	$bname = basename($fname);
	$tmpfile = '';
	$type = filetype($fname);

	if ( $type =~ /gzip|bzip/ ) {
		$tmpfile = uncompress($fname, $type);
		$type = filetype($tmpfile);
	}

	open $fh, '<', ($tmpfile ? $tmpfile : $fname) or die "cannot open '$tmpfile':$!\n";

	# reset for each new document
	init_mdoc_structure();	# mdoc tag presence
	$ignoreblock = 0;		# ignore SGML block
	my $saveindent = '';	# SGML indent level

	# line-by-line tests
	while (<$fh>) {
		chomp;
		next if /^$/;

		# global tests
		abbrevs($bname, $., $_)         if $opt_a;
		badphrases($bname, $., $_)      if $opt_b;
		freebsdobsolete($bname, $., $_) if $opt_f;
		repeatedwords($bname, $., $_)   if $opt_r;
		spellingerrors($bname, $., $_)  if $opt_s;
		whitespace($bname, $., $_)      if $opt_w;

		# mdoc line tests
		next if /^\.\\\"/;	# ignore comments for these tests

		if ( $type eq "troff" ) {
			mdoc_date($bname, $., $_)      if $opt_d;
			mdoc_sentence($bname, $., $_)  if $opt_e;
			mdoc_structure($bname, $., $_) if $opt_m;
		}

		# DocBook line tests
		if ( $type eq "sgml" ) {
			$origline = $_;
			doc_stragglers($bname, $., $_)  if $opt_S;
			for my $segment (splitter($_)) {
				next if /^\s*$/;	# ignore blank segments

				if ( $segment =~ /($ignoreblockstart)/ ) {
					# when entering an ignore block, save the indent level
					$ignoreblock = 1;
					$saveindent = leading_space($origline);
					$prevline = $saveindent . $1;
					next;
				} elsif ( $segment =~ /($ignoreblockend)/ ) {
					# restore the indent level at the end of an ignore block
					$ignoreblock = 0;
					$prevline = substr($saveindent,0,length($saveindent)-2) . $1;
					next;
				}
				doc_titles($bname, $., $segment)      if $opt_c;
				doc_indentation($bname, $., $segment) if $opt_i;
				doc_longlines($bname, $., $segment)   if $opt_l;
				doc_sentence($bname, $., $segment)    if $opt_n;
				doc_openclose($bname, $., $segment)   if $opt_o;
				doc_tagstyle($bname, $., $segment)    if $opt_t;
				doc_whitespace($bname, $., $segment)  if $opt_W;
			}
		}
		$prevline = $_;
	}

	close $fh or die "could not close file:$!\n";

	if ( $opt_d || $opt_o || $opt_y ) {
		# slurp the whole file
		open $fh, '<', ($tmpfile ? $tmpfile : $fname) or die "cannot open '$tmpfile':$!\n";
		my $fulltext = do { local($/); <$fh> };
		close $fh or die "could not close file:$!\n";

		# global batch tests
		style($bname, $fulltext) if $opt_y;

		# mdoc batch tests
		if ( ($type eq "troff") && ($opt_d) && (!$docdate) ) {
			showline($bname, '-', '.Dd date not set', '', '');
		}
	}

	removetempfiles();
}