#!/usr/bin/env perl # ts=4 # Warren Block # special thanks to Glen Barber for limitless # patience and the use of his svn repository # igor: check man pages and DocBook # needs Perl 5.8 or higher use strict; use warnings; use locale; # Copyright (c) 2012, 2013, 2014 Warren Block # All rights reserved. # # Redistribution and use in source and binary forms, with or without # modification, are permitted provided that the following conditions # are met: # 1. Redistributions of source code must retain the above copyright # notice, this list of conditions and the following disclaimer. # 2. Redistributions in binary form must reproduce the above copyright # notice, this list of conditions and the following disclaimer in the # documentation and/or other materials provided with the distribution. # # THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND # ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE # IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE # ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE # FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL # DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS # OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) # HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT # LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY # OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF # SUCH DAMAGE. use Getopt::Std; use File::Basename; use POSIX qw/strftime/; my $file = "/usr/bin/file"; my $gzcat = "/usr/bin/gzcat"; my $bzcat = "/usr/bin/bzcat"; my $man = "/usr/bin/man"; my $tmpdir = "/tmp"; my $rev = '$Revision$'; my ($fh, $tmpfile, $stdinfile, $docdate); my ($prevline, $prevnonblank, $origline) = ('', ''); my $ignoreblock; my $titleblock = 0; my $today; my $linelensgml; my ($startline, $stopline); my $vid; my ($ignoreblockstart, $ignoreblockend); my %misspelled_words; my @badphrases; my @contractions; my @freebsdobs; my ($lc_regex, $uc_regex, $fixedcase_regex, $ignoreregex); my ($indent_regex, $inline_regex); my ($redundantword_regex, $redundanttagword_regex); my (@straggler_tags, $literalblock_regex); my $eos_regex; my (@openclose_tags, $openclose_regex, %opentag, $list_regex, $parawrap_regex); my ($bname, $type); my $prog = basename($0); sub usage { $rev =~ /Revision: (\d+)/; my $version = "1.$1"; print < 0; } # -C start-end limits output to a range of lines if ( $opt_C ) { ($startline, $stopline) = split(':|-', $opt_C); die "-C option requires a line number range (start- | start-end | -end)\n" unless $startline || $stopline; } # -V vid limits output to a range of lines if ( $opt_V ) { $vid = $opt_V; unless ($vid eq 'latest' || $vid =~/.*-.*-/) { die "-V requires vulnerability ID like 348bfa69-25a2-11e5-ade1-0011d823eebd\n"; } $startline=999999; $stopline = $startline; } # -D equals -abefgmprsuw if ( $opt_D ) { $opt_a = $opt_b = $opt_e = $opt_f = $opt_g = $opt_m = $opt_p = $opt_r = $opt_s = $opt_u = $opt_w = 1; } if ( $opt_z ) { # all non-whitespace tests $opt_a = $opt_b = $opt_c = $opt_d = $opt_e = $opt_f = $opt_g = $opt_m = $opt_o = $opt_p = $opt_r = $opt_s = $opt_u = $opt_E = 1; } if ( $opt_Z ) { # all whitespace tests $opt_i = $opt_l = $opt_n = $opt_t = $opt_w = $opt_S = $opt_W = 1; } if ( $opt_x ) { # -x implies -m $opt_m = 1; } # if no tests are chosen, do them all unless ( $opt_a || $opt_b || $opt_c || $opt_d || $opt_e || $opt_f || $opt_g || $opt_i || $opt_l || $opt_m || $opt_n || $opt_o || $opt_p || $opt_r || $opt_s || $opt_t || $opt_u || $opt_w || $opt_x || $opt_y || $opt_E || $opt_S || $opt_W ) { $opt_a = $opt_b = $opt_c = $opt_d = $opt_e = $opt_f = $opt_g = $opt_i = $opt_l = $opt_m = $opt_n = $opt_o = $opt_p = $opt_r = $opt_s = $opt_t = $opt_u = $opt_w = $opt_E = $opt_S = $opt_W = 1; $opt_x = $opt_y = 0; } init_ignoreblocks(); init_spellingerrors(); init_badphrases(); init_contractions(); init_freebsdobs(); init_doc_titles(); init_doc_indentation(); init_doc_sentence(); init_doc_openclose(); init_literalblock_regex(); init_doc_writestyle(); init_doc_stragglers(); # ctrl-c handler $SIG{'INT'} = 'INT_handler'; # do the same thing if the pipe closes $SIG{'PIPE'} = 'INT_handler'; # autoflush $| = 1; # allow stdin push @ARGV, "stdin" if $#ARGV < 0; } sub firstext { my $fname = shift; my $ext = ''; if ( basename($fname) =~ /\.(.*?)(?:\.|$)/ ) { $ext = $1; } return $ext; } sub lastext { my $fname = shift; my $ext = ''; if ( basename($fname) =~ /\.([^.]*?)$/ ) { $ext = $1; } return $ext; } sub baseonly { my $fname = shift; $fname = basename($fname); $fname =~ s/\..*$//; return $fname; } sub tmpfilename { my $fname = shift; my $ext = firstext($fname); my $name = baseonly($fname); return "$tmpdir/$prog-tmp-$$-$name.$ext"; } sub filetype { my $fname = shift; # detect type from extension if possible my $ext = lastext($fname); if ( $ext ) { print "detecting file type by extension: '$ext'\n" if $verbose; for ( $ext ) { if ( /\d{1}/ ) { return "troff" } elsif ( /bz2/i ) { return "bzip" } elsif ( /gz/i ) { return "gzip" } elsif ( /sgml/i ) { return "sgml" } elsif ( /xml/i ) { return "xml" } else { return "unknown" } } } # fall back to file(1) print "detecting file type with file(1)\n" if $verbose; my $out = `$file -b $fname`; $out =~ /^(\S+\s+\S+)/; # first two words if ( $1 ) { my $id = $1; for ( $id ) { if ( /^troff/ ) { return "troff" } elsif ( /^exported SGML/ ) { return "sgml" } # some DocBook documents are detected as "Lisp/Scheme" elsif ( /^Lisp\/Scheme/ ) { return "sgml" } elsif ( /^gzip/ ) { return "gzip" } elsif ( /^bzip/ ) { return "bzip" } else { return "unknown" } } } return "unknown"; } sub uncompress { my ($fname, $type) = @_; my $tmpfile = tmpfilename($fname); print "uncompressing '$fname' to '$tmpfile'\n" if $verbose; for ( $type ) { if ( /gzip/ ) { system("$gzcat $fname > $tmpfile") == 0 or die "could not create '$tmpfile':$!\n"; } elsif ( /bzip/ ) { system("$bzcat $fname > $tmpfile") == 0 or die "could not create '$tmpfile':$!\n"; } else { die "unknown compression type '$type'\n"; } } return $tmpfile; } sub writestdinfile { $stdinfile = "$tmpdir/$prog-stdin.$$"; open $fh, ">", $stdinfile or die "could not create '$stdinfile':$!\n"; print $fh ; close $fh or die "could not close '$stdinfile':$!\n"; return $stdinfile; } sub removetempfiles { if ( $stdinfile && -f $stdinfile ) { print "deleting stdinfile '$stdinfile'\n" if $verbose; unlink $stdinfile or die "could not remove '$stdinfile':$!\n"; } if ( $tmpfile && -f $tmpfile ) { print "deleting tmpfile '$tmpfile'\n" if $verbose; unlink $tmpfile or die "could not remove '$tmpfile':$!\n"; } } sub xmlize { my $txt = shift; $txt =~ s/'/'/g; $txt =~ s/"/"/g; $txt =~ s//>/g; return $txt; } sub showline { my ($bname, $linenum, $color, $errordesc, $txt) = @_; # limit output to line number range return if $startline && ($. < $startline); if ( !$opt_X ) { print "$lf$bname$rf:"; print "$ll$linenum$lr:"; print $color if $opt_R; print "$errordesc"; print $reset if $opt_R; print ":$txt\n"; } else { print " \n"; } } sub is_lowercase { my $word = shift; return $word =~ /^[a-z]{1}/; } sub is_uppercase { my $word = shift; return $word =~ /^[A-Z]{1}/; } sub highlight_word { my ($txt, $word) = @_; $txt =~ s/\Q$word\E/$lh$word$rh/g; return $txt; } sub highlight_string { my $txt = shift; return "$lh$txt$rh"; } sub expand_tabs { my $txt = shift; $txt =~ s/\t/ /g; return $txt; } sub leading_space { my $txt = shift; my $leading; $txt =~ /^(\s+)/; $leading = ($1 ? $1 : ''); $leading = expand_tabs($leading); return $leading; } sub splitter { my $txt = shift; return ($txt) unless ( $txt =~ /$ignoreblockstart|$ignoreblockend/ ); my @split = split /($ignoreblockstart|$ignoreblockend)/, $txt; return grep { ! /^\s*$/ } @split; } sub init_ignoreblocks { print "initializing ignoreblocks\n" if $verbose; # create regex for sgml block start and end my @ignoreblock_tags = qw/ literallayout screen programlisting /; $ignoreblockstart = '(?:|\]\]>'; for my $tag (@ignoreblock_tags) { $ignoreblockend .= "|<\/$tag>"; } $ignoreblockend .= ')'; } sub showwhitespace { my $txt = shift; $txt =~ s/\t/{tab}/g; return $txt; } # global tests sub abbrevs { my ($bname, $line, $txt) = @_; return if $txt =~ /^\s*$/; return if $ignoreblock; my $txtbak = $txt;; if ( $txt =~ /(?:\W|^)c\.f\./i ) { $txt =~ s/(c\.f\.)/$lh$1$rh/i; showline($bname, $line, $ansi{darkmagenta}, 'use "cf."', $txt); } $txt = $txtbak; if ( $txt =~ /(?:\W|^)e\.?g\.(?:[^,:]|$)/ ) { $txt =~ s/(e\.?g\.)/$lh$1$rh/; showline($bname, $line, $ansi{darkmagenta}, 'no comma after "e.g."', $txt); } $txt = $txtbak; if ( $txt =~ /(?:\W|^)i\.?e\.(?:[^,:]|$)/ ) { $txt =~ s/(i\.?e\.)/$lh$1$rh/; showline($bname, $line, $ansi{darkmagenta}, 'no comma after "i.e."', $txt); } $txt = $txtbak; if ( $txt =~ /(?:\W|^)a\.k\.a\./i ) { $txt =~ s/(a\.k\.a\.)/$lh$1$rh/i; showline($bname, $line, $ansi{darkmagenta}, 'use "aka" (AP style)', $txt); } $txt = $txtbak; if ( $txt =~ /(?:\W|^)v\.?s(?:\.|\s|$)/i ) { $txt =~ s/(v\.?s\.)/$lh$1$rh/i; showline($bname, $line, $ansi{darkmagenta}, '"versus" abbreviated', $txt); } } sub init_badphrases { print "initializing badphrases\n" if $verbose; @badphrases = ('2nd', '3rd', '3way', '4th', '5th','allow to', 'allows to', 'become gain', 'be also', 'been also', 'can not', "chroot'd", "compress'd", 'could might', 'could of', 'equally as', 'for to', "ftp'd", 'get take', "gzip'd", 'in on', 'it self', 'may will', "mfc'ed", 'might could', 'often are',"or'ing", 'that without', 'the a', 'the each', 'the to', 'this mean that', 'to can', 'to for', 'to of', 'to performs', 'will has', 'with to', 'would of',); } sub badphrases { my ($bname, $line, $txt) = @_; my $txtbak = $txt; return if $txt =~ /^\s*$/; for my $bad (@badphrases) { $txt = $txtbak; # check for a loose but fast match first if ( $txt =~ /\Q$bad\E/i ) { if ( $txt =~ s/\b(\Q$bad\E)\b/$lh$1$rh/i ) { showline($bname, $line, $ansi{yellow}, 'bad phrase', $txt); } } # detect bad phrases wrapping over two lines # skip this test if the phrase was all on the previous line next if ( $prevline =~ /\Q$bad\E\b/i ); $txt = "$prevline $txtbak"; if ( $txt =~ /\Q$bad\E\b/i ) { my @right = split /\s/, $bad; my @left = (); my $leftstr = ''; while ( @right ) { push @left, shift @right; $leftstr = join ' ',@left; last if ( $prevline =~ /(\Q$leftstr\E)\s*$/i ); } unless ( $leftstr =~ /\Q$bad\E/ ) { showline($bname, $line - 1, $ansi{yellow}, 'bad phrase', "... $lh$leftstr$rh"); $txt = $txtbak; my $rightstr = join ' ', @right; $txt =~ s/(\Q$rightstr\E)/$lh$1$rh/i; showline($bname, $line, $ansi{yellow}, 'bad phrase', $txt); } } } } sub init_contractions { print "initializing contractions\n" if $verbose; @contractions = ("aren't", "can't", "doesn't", "don't", "hasn't", "i'll", "i'm", "isn't", "it's", "i've", "let's", "shouldn't", "that's", "they'll", "you're", "you've", "we'd", "we'll", "we're", "we've", "won't", "would've"); } sub contractions { my ($bname, $line, $txt) = @_; my $txtbak = $txt; return if $txt =~ /^\s*$/; for my $con (@contractions) { $txt = $txtbak; if ( $txt =~ /\Q$con\E/i ) { if ( $txt =~ s/\b(\Q$con\E)\b/$lh$1$rh/i ) { showline($bname, $line, $ansi{yellow}, 'contraction', $txt); } } } } sub init_freebsdobs { print "initializing FreeBSDobs\n" if $verbose; @freebsdobs = qw/ cvsup /; } sub freebsdobsolete { my ($bname, $line, $txt) = @_; return if $txt =~ /^\s*$/; for my $word (@freebsdobs) { if ( $txt =~ s/(\s+)($word)([^.]+.*)$/$1$lh$2$lr$3/ ) { showline($bname, $line, $ansi{darkgreen}, 'freebsd-obsolete', $txt); } } } sub repeatedwords { my ($bname, $line, $txt) = @_; return if $txt =~ /^\s*$/; my $txtbak = $txt; my %count = (); my @words = grep(! /^\s*$/, split /\b/, $txt); map { $count{$_}++ } @words; my @multiples = grep { $count{$_} > 1 } keys %count; #for my $word (keys %count) { for my $word (@multiples) { # skip special cases # repeated numbers next if $word =~ /\d{1}/; # repeated slashes next if $word eq '/'; # repeated rows of dashes next if $word =~ /-+/; # repeated rows of underscores next if $word =~ /_+/; # skip some mdoc commands next if $word =~ /Fl|Ns|Oc|Oo/; $txt = $txtbak; if ( $txt =~ s/\b(\Q$word\E\s+\Q$word\E)\b/$lh$1$rh/i ) { showline($bname, $line, $ansi{darkred}, 'repeated', $txt); } } # check for repeated word from the end of the previous line # to the beginning of the current line # $prevline =~ m%(\w+\s+)*([^ *.#|+-]+\s*)$%; $prevline =~ m%(\w+\s+)*(\S+\s*)$%; my $cmd = ($1 ? $1 : ''); my $prevlastword = ($2 ? $2 : ''); # short-circuit when the previous line... # had no last word return unless $prevlastword; # didn't repeat any of the words on the current line $count{$prevlastword}++; return unless $count{$prevlastword} > 1; # was a groff(7) comment return if $prevlastword eq '.c'; # was a groff(7) zero-space character for tables (\&.) return if $prevlastword eq '\&.'; # was a single non-word character return if $prevlastword =~ /^\W{1}$/; # was an mdoc(7) or nroff(7) comment return if $prevlastword =~ /^\W{1}\\\"/; # was an mdoc command return if $prevlastword =~ /\.(?:Ar|Oo|Nm|Tp)/i; # when the next-to-last word was an mdoc command return if $cmd =~ /Ar |Cm |Fa |Em |Ic |Ip |It |Li |Pa |Ss /i; if ( $txt =~ s/^\s*(\Q$prevlastword\E)(\s+.*)$/$lh$1$rh$2/ ) { showline($bname, $line - 1, $ansi{darkred}, 'repeated', "... $cmd$lh$prevlastword$rh"); showline($bname, $line, $ansi{darkred}, 'repeated', $txt); } } # read an external file of spelling errors # the misspelled word is the first sequence of \w or ' characters # up to a non-word character sub readspelling { my $spname = shift; my $added = 0; print "adding spelling file '$spname'\n" if $verbose; open my $sf, '<', $spname or die "cannot open '$spname':$!\n"; while ( <$sf> ) { next if /^$/; next if /^\s*#/; if ( /^\s*((?:\w|\')+)\W+/ ) { $misspelled_words{$1} = 1; $added++; } } close $sf or die "could not close '$spname':$!\n"; print "added misspellings: $added\n" if $verbose; } # list of common spellingwords sub init_spellingerrors { print "initializing spellingerrors\n" if $verbose; for my $word (qw/ &nbps; aan abismal abjectely abondan abotu abour abscence absolut absolutly abuttes accelleration acceptible acces accesable accesed accesing accessable accidently accomadate accomodate accpeted accroding accross acess achitecture achive acknowledgent acquisions acse actal actaully actuall actualy actyually acutally adavnce adddress adde addesses addiotional additionnal additonal additonally addreses addressess addresss addtions adecuate adhearance adiministration adminstrator admited adressed adresses advence advertisment advetise advetised advices aer agai ageing aggree aggregatable agregate agregation ahold ahte ahve akses alaram albel albels alergic alignement alligned alloacted allready allright allthough allways alot alredy alright althougn altough ammend ammount amn amybe analasys ande andthe ane anf annonymous annoucement annoucing announcment annoyting anonnyed anonymus anormalous answeres anyay anyhitng anyhoo anymore anyonw anyore anyways anywere aobut apac apllication apparentry appart appendencies appers appicable appleances appleis appraently appriciated appropiate approprate apreciated apropriate aptch aqueue arbitary arbitrafy arbritrary archiecture architectual arent arguements arguemnt aritmetic aritmetics arledy arond aroung arrisen arround arrray aslo asparin assigenments assocation assoicated assotiations assumtion asychronous asynchonously asynchroneous athalon athe athentication atleast atrget attachements attemps attemts atuomatic atuomatically autentication autheinticating authention authetnication authorty automaticaly automaticly autonimous avaiable avaialble availabe availablity availbility availible availiblity avalible aweful awhile awsome axatly bannana basec basicly baybe beachmark beacuse beated becease beceause becuase becuse beeing beggining begining behaiver behvaiour beign beleive belive benefitial beoken ber bercause berkley beseuse besure beter beyonf bgack bgiger bheve bikesheding bince bineary birght birt bloatwed bloging bnechmark boostrap boostrapping bootabe bootleneck bootlenecks bootsrap boradband bordism borken borre borred boting bottem bottonm boundries boundry boxd bradband branche briner bringign brocessor broked broser brower browesable browseable browswer buch bugzills buidl buildling buile buillt buld bulding bulds burried byt calatog calcualted cale calles camllia campatibility cannonical cant capabilites capabilties capabiltiy capabily capitzliation captial captialism caracteristics catched cathegory ceratin ceratinly cerificate certian certifcate certifcates certificat certifictate chaged chane changable changess changs chaning charakteristic charakteristics cheapter chech checksuming chek cheked cheking chipest choise choosed choosen choses chronologocal cince cirruption clal clarifynig classifcation cleand clen cliens cluter cmplain cmplaining cna cnanot cobsidered colision colisions comands comemnts comiling comiters comitted comlplex commandline commen commenly commer commericial comming commited commiter commiters commiting commnad commnads commnand commnications communciation communciations comooil comooiled compability comparision compatability compatabilty compatablity compatiable compatibilty compatiblity compentens compilcated compiltaion complaing comples complet completly completness complie componet componetn compontens compromiseable comptemporary comsume comsumed comsumption comunication comunity concatanated conctacted conected conection configrable configration configuation confimation confiused congraturation congraturations conitinue conjob conjuction connecion connecs connecter connecters connectin connenctions connet conneting connnects conpact conputer conreoller consensu conservatie consistant consistentency construcgtor consuption contai containg contect conteins continously contiune contrained contribuition controled controll conujunction conveinently convelient convinience coordinatory coppied correced correctely correcture correleate corresponsding corrsponding cosnole couldnt coyping crasy creapt creat creatopm credentail credentails creeate csvup cuase currenly currentlu currrently curser custommer custommers cuttoff cuty cvould cvs2vn damange damanged datas dayt dbout deactive deafult dealocates deamon deamons deault debuging decendant decentant decicission decidely declerations decliens decompresssion decribed ded defalt defaut defautl deffirent definately definitiely definitly definitons degradate degugging dehaviour deicde deivce dekstop delcared delending dellicious demnstrate depcreation depdendency depedancy depedencies depeding depencdny dependacies dependancies dependancy dependancys dependant dependding dependeancy dependeant dependecies dependecy dependend dependendencies dependiency dependig deploies deprectated depricated derivats derrivates desapointed desaster desasters descendents desciptors describd descrpition descrption despert destinatino destine destory detec detecing detemine deterined devdeloper develoeprs developement developped developper developvers devide devinces devives devleop diablog dictaded dictonary did'n didnt didsk dieing diferent diffence differenciate differencies differents differnetiates differnt diffrent diffrently diffsof diging diliver dilligence dind't dindt directorys diretly diretories diretory diry disappered disasterous discourraged discuessed dismouted dispair dispaly dissabled dissapeared dissapointment dissillusioned distiguish distribtue docuentary documantation documenation documentaiton documentatino documentiation documetation doen't doesen doesent doesnot doesnt domainmame domani donatiosn donn't donot dont droping drustrating durning dynaic ean eanble earler easely eather ebeen ecah eceived ecourage ecouraged ect ecurrent effecive effetive effetively efficent effor efford ehere elipsis emaling eme emporer enabe enbale enchanced enclousure encrypion encyrpt encyrpted endoresed enior enivorement enoountering enought enourmous enow enscrambled ensute enteries enthusiatic entierly enviornment enviroment enviroments environnement equivalen erebuild erliere errorneusly erros escolated esle esome etherenet etnry evenning eventaul eventaully eventhough everthing everytime everyting evet eveyr evreyone ewhich exagerate examble excactly excat exceedes excellant excercize excert excesive exclusivly execption execptions exectable exectables exectuable execuation exellent exemple exemtion exeption exercice exibits exisiting exisitng existance existsing experied experince expession expirience expirt explaination explainations explaned explans explantation explicitely exponentionally exquse exsits extemely exteneded extenstions extentensible extention extentions extranious extreemly extremly facilites facter faile failes faimiliar faliure falsh familar farwarding fase faught feebsd feelt fgights fianlly fiel fien fienw figureing fileame filks filname finaly firmwares firmwrae fisrt fitler fixen fixztion flasg focuss folkz follwo follwoing follwong folowing fomr forbiden forcable foreignphrse forgoten formated fornated forthermore forusers fot foto fotos foudn foward fowarding fragemented fragmentated fragmentatio frameowkr frebsd freedback freeed freezed freind frequence frezze fthernet fucntion fuction fulfil funcition functionmames functionnality functuion funtion furthur fush futher gaint garanties geeting generat genertaes geniue gernal gernerates ghostscrip givent gohostscrip goiung gonna gonne goot gotta grafic grammer grap grapics gratefull grately graub greate greatful greatfully greif grpahs gthe guage guarateed guarentee guarentees guarranteed guidence gurantees hackyness hade haed hai halp hanbook handeling hapen happend happended happpens hardisk hardwares harmpless harrass harrassment hashs hatered hav havent havfe headup healt heavly heirarchy hellon helpfuk helpfull hep hereon hessitate hessitation hexadecimals hibarnate hibarnating hiearchy hierachy hierarchial hierarhy hight higly hinderences hodling homours honets honnor honnors hookled hopful hopfully horiztonal horiztonally hounderd hounderds howeber howevrr htat hte hter htere hthe htink htis hunderts hypens hypervisior ibn idee identially identifiy idosyncracies iea ifhghting im imagening imbeded imeplementation immediatly immenent immidiatly immitating impariment implemenation implementaitons implementating implemetn implentor implicits impliment implmentation imporant imposible improbe improvments imprted inacativity inaccesible inadvertant inadvertantly incase incluseion incomming incompeents incomptaible inconvienent incrase indefinately indended indentical indentifiers indention indentions independant independantly independendly independet indepth indiate indiciations indicies indivual indivudual inetersting infact infavour infomation informations infrastcture infrasture infromation inherity inital initalize initalized initiatior initiliased inititialization inpunt inputed insall insensivite instace instal instalation instaled instaler installaed installaing installatio installtion insted insteresting instractions instuctions intall integreated integrituy intepretation interations interchangably interconverts interes interesitng interfactive interfer interferring intermal interneal interogate interpretedt interpretted interpretting interressing interresting interrestingly interrups intersting intial intregate intrest intresting intruction invarients invicible invole involes involvemnt invoplved invovle irt isnt isntall isonly issueing ist isuus isystem ita ith ive iwll joing jsut juged juste kenrel kerenel kerenels kernal kernell keybaord killled knowlegde knowlege knowlodgeable knwo kust kwyrod labes lable lables laeyer langage languge laods larged lastest laterly layed layput lazyness leasure leav legitimite lemme leson leter lettesrs lgertimately libararies libary librairies libraris libraru licencing licene liek ligned liinux likeing limtations lineair linerly liniarly lised listning lite litteral litterally llow lniux loally localy loder loggoued loging loink loke loks lookking looksy loopack loosing loosly losseless lpatop lpdng ltieral mabe maby mabye macademia machin machince machinew maek mahually mailling maintainace maintainance maintaince maintanance maintanence maintence maintener mames manageement managemnet managment mananged manangement manaul manditory mangagement manged manuallying manualy manyally markkup maschine maske maun maxaximum maximium mdorn meantine measusre mechanim mechanims mechiansm mechnism mechnisms memeber ment mentined mentionned mergeing mericracy meriticracy merrits mes messege metada mfcd micrcontroller microbnechmark mininum minumum miror mis misprediced missplelling misterious mistery mistypted misunterstood moble modiying momment monolitic mont montherboard monut monuted moter motercycle motercycles mountign mpre mssing multile multipled multipy mutiple mysefl myst myt natioal natsy ncessary ncie nderstand neccasary neccesary neccesery neccessary necesary necessarely needto neet neetwork negociated neightbor nintees nobady noipe nomally noone notaions notavailable notefection nothern nothin noticable notied notofocations notquite nouvou numberic numer numner nusance nutrual obfascated objejcts obsticles obvoius ocassionaly occassionally occation occations occurance occured occurence occurences occuring oether ofcourse offical ofr oftem om ommisions ommit ommited ommitt ommitted omrning onfigured onl onle onlne onsult onthe ontop onts ony operationg opion opperation oppertunity oppinion oppions oppisite oprations oprion oprions optial optiion optionsal originaes ot otehr otsuts ouf ouput ouputing outher outout overhall overidden overlaping overrided overriden overritten overwritting ovre owkr pacakge pacakges pachae packge packges paert painfull panices parallell paramenter parametr parametrs paramtere paramters parenticies parition paritioning paritions parntheses parrellel partameters partion partions partionting partiton partitoning partitons passtrough passwrd pasto pathes peformed pengiun peopel pepetual pepetually perfecly perfom perfomance perfoms perfor perfored performace performancing performence performend periperal peripherial peripherials permanant permantly persisent personnal persoon peticular pfew pgk phabriator phoneix phyiscall physcal physial pitty platfrom platorms pleae plisss poatch poeple pofessional poitn poitner poluting porevious portes portupgrde posible positiv positve possability posseses possilbe possition postitions postive postress poting potr potupgrade poud poudirere pourdiere pourdriere pouridere poyrts prameter pratcice preatty preblem preceed preceeded preceeding preceeds precice predictibly prefere prefered prefering preferrable preferrably prefferred preform premissions preoblematic prepair prepairing preperation preperations preprend preprietary preprocesor presense presidence presonally presumeably preume prevelent previos previouse prevoius pricipal princial principes priorisation prirority privelege priveleged privilige privledged privleges probabilly probabyl probaly probbaly problaly proble problmes proccess proccesses proceedure proceses procols proffesional profie progam progams programable programatic programlistning programm programms projcet projecte prolematic promiscuos promissed prompot promt propaged proped propegation propogate propogation proporion proporty propper protcol protcols provde provent providre pseuuedo pshycial pssword psuedo ptach puncing puroses quandries quard queestion querys questionr questoin questsions queueing qui quickier quiety quire quitted quoteas rabase rabased rae rans rase rasing raspberri readd readning realated realloacted realy reaosn reasoably reasonnable reassambled reate reboote rebove rebuilded rebuitling rebult receieve recevied recieve recieved recinded recoide recomend recomendation recommand recommanded recommanding recommened recommented reconigize recrusively redable redering rediculous reeated reelvent reember referes refering refernece refferance refreind refulat regened regularlly reguls reivew reized relaly relase relavent releated relesed relevent reloation reloations rember remdial remebers rememver remvoe remvoed rendtion repative repitition repititions replacemnet replce repleaced reponding reponsible reposotory repostory reprecussion reprecussions reprository repy requireing requirments requistes requred resampeling resaonnable resemblence resently resetart resetted resiilver resilliancy resillience resilliency resillient resise resitor reslove resloving resolf resonable resouce respecitively responce respository respresentation resseler ressources restaring restaurnat restaurnats resurections resusccitate rethnik retnia retreive retrive returs revison revisons rewcursion rewite rewriten riddens rigth riht rmore roken roling rreally rreplace rrquest runing runnig runnign runnning ruter sacn safed sahred saif saior saturage scenartio sched schedulling scritp scrubing scrupt seached secend secion secions securiy seemless seemlessly seens seether senarios sence sendt sepcial sepcific sepcifies sepcify seperate seperated seperately seperates seperating seperation seperator seprate serching serveral sesion setable seting setings settt sexond shae sheding shepard shepards shephard shepharding shooping shoping shoud shoudn't shoulld shrinked shure shuting shyed significnat sile sime simillar simplfied simultanious simultaniously singel singeling singels singnificant sinse sistems sitll sligh slighly slue smebody snapshoted snoflake snopped soe soehow soemone soemones soemthing soething softner sofware sohuld soley solition someoene somes somethng someting someway somthing somwhere sonud sonuds soo soruce sparce spearator specfic specifes specifi specifiaction specificly specifig specifing specifiy specifiying specifyed speficy spindels spititng splic splitted spose srews srtuff ssorted sspares stabalization stadnard standart stantdard staticlly statuc steping stilla stiring stkicks stoll stollen stoped stoping stoppped strang strat strenght striaght stricktly strippped structurees stuf stystem subet submited submiter submitt substaintally substition substraction subsytems subverion succed succeded succeds succesful succesfully successfull successfuly suceeding sucesfully sucess sucessors suckser sucksers suddently sudirectories suficient sugesstion sugested suggesiton suggestsions suggetion sugroup suject sume superiour supprts supressed supresses surpised surpressed surprice surpriced surprize surprized susbtitute suspection swepped switche symetrical symtoms synchronisaton synonomous sysem sysetm syslodg sytem sytems sytsem taged taging taht tahts talkes targer tat te teamm techer techical techincally techncially teh tehse tehy templaitize temporarely tere termal termonology termporary th tha thaat thaknk thakns thank's thankje thansk thanx thatis thats thay thefirst thems themself ther therads therefor theres therory thets theyre theyve thie thier thigns thingking thinke thinkg thinkw thinling thirs thnak thnig tho thos thouch thoug thougt thouogh threated throgh throtteling throughly throuhg throwed thru tht thta thwo ti timestatmp tipycal tirck titeled tlak tlaking tnan todays todl togethe tols tomake toolcain tottaly tought tougue tpage tpye traafic tradeing tradtional trafic trailling tranalation tranalations transfered transfering translateion translater translaters transmision trid triede triewd trigonmetric troubleshoute troughout trow trows trpi trrue trry trubolsome truely trully tryed tryied tryign tsable tsart tsill tsrarted tthe tthis tu tunning turnt tutoriales tye typicall typicaly udnerstand udnerstandable uesd uisng umounted uncapable unchaged unchange uncoment unconsistent undefinitely undeflowed understandlable understandood understaning underway undescores undesireable undestand undustrialized unecessary unecrypted unfortauntely unfortunatly unfortuntelly unfrastructure unfreezed uniion uniquily unknwn unlinke unmouting unnceccessary unneccessary unprivilegded unrelevant unresolveable unreversable unsucessfully untill untis updaing updateing updte upgarde upsteam upstrewams upto ur usally useable useage usedul usefull useing usign usse utilites utilties uttrerly vagrand varialbe varialbes varois vender verion verison versionned versionning verty veryify virtial virutal volenteer volenteers voltave vontinues vulnability waas waht wahtever wakupe warant waranted wass wek wel wer wether whanever whats whcih whe whene whereever wheres whewn whie whihc whilte whinning whith wich wierd wiht wilde wirh wirtten wistle wistles witdh withing withme withough withouth witk witt wnet wo wont wor worflows workoad workoads workstion worng woth woud wouldnt wouls wranty wriatble writen writtend writting wroute wsouse wuch ycould yeild yeilds yesm yhe youd youi youll youre yu yuo yut /) { $misspelled_words{$word} = 1; } print "spellingerrors: ", scalar (keys %misspelled_words), " misspellings known\n" if $verbose; my @spellfiles; # IGORSPELLFILES environment variable is a whitespace-separated list of files push (@spellfiles, split /\s/, $ENV{'IGORSPELLFILES'}) if defined($ENV{'IGORSPELLFILES'}); # all files found in /usr/local/etc/igor/spelling push (@spellfiles, split /\s/, `ls /usr/local/etc/igor/spelling/*`) if -d '/usr/local/etc/igor/spelling'; for my $spellfile (@spellfiles) { readspelling($spellfile); } } sub spellingerrors { my ($bname, $line, $txt) = @_; return if $txt =~ /^\s*$/; my $txtbak = $txt; my @words = split /\W+/, $txt; for my $currentword (@words) { if ( $misspelled_words{lc($currentword)} ) { $txt = highlight_word($txt, $currentword); } } if ( $txt ne $txtbak ) { showline($bname, $line, $ansi{darkmagenta}, 'spelling', $txt); } } sub whitespace { my ($bname, $line, $txt) = @_; return if $txt =~ /^$/; my $txtbak = $txt; if ( $txt =~ s/^(\s+)$/$li$1$ri/ ) { showline($bname, $line, $ansi{darkblue}, 'blank line with whitespace', $txt); } $txt = $txtbak; if ( $txt =~ s/(\S+)(\s+)$/$1$li$2$ri/ ) { showline($bname, $line, $ansi{darkblue}, 'trailing whitespace', $txt); } $txt = $txtbak; if ( $txt =~ s/( +)\t+/$li$1$ri/ ) { showline($bname, $line, $ansi{darkmagenta}, 'tab after space', $txt); } } # global batch tests sub style { my ($bname, $txt) = @_; print "$lf$bname style check:$rf\n"; my $you = ($txt =~ s/you\b/you/gi); my $your = ($txt =~ s/your/your/gi); if ( $you || $your ) { print " $lh\"you\" used $you time", ($you==1 ? '':'s'), "$rh\n" if $you; print " $lh\"your\" used $your time", ($your==1 ? '':'s'), "$rh\n" if $your; print " \"You\" and \"your\" are informal and subjective.\n"; print " Try to be formal and objective: \"the file\" rather than \"your file\".\n"; } my $should = ($txt =~ s/should/should/gi); if ( $should ) { print " $lh\"should\" used $should time", ($should==1 ? '':'s'), "$rh\n"; print " Use \"should\" sparingly, it is feeble.\n"; print " Try to be imperative: \"do this\" rather than \"you should do this\".\n"; } my $obviously = ($txt =~ s/obviously/obviously/gi); if ( $obviously ) { print " $lh\"obviously\" used $obviously time", ($obviously==1 ? '':'s'), "$rh\n"; print " If it is really obvious, it does not need to be pointed out.\n"; } my $needless = ($txt =~ s/needless to say/needless to say/gi); if ( $needless ) { print " $lh\"needless to say\" used $needless time", ($needless==1 ? '':'s'), "$rh\n"; print " If it doesn't need to be said, why say it?\n"; } my $thefollowing = ($txt =~ s/the following/the following/gi); my $asfollows = ($txt =~ s/as follows/as follows/gi); if ( $thefollowing || $asfollows ) { print " $lh\"the following\" used $thefollowing time", ($thefollowing==1 ? '':'s'), "$rh\n" if $thefollowing; print " $lh\"as follows\" used $asfollows time", ($asfollows==1 ? '':'s'), "$rh\n" if $asfollows; print " If something is following, the reader can see it without being told.\n"; } my $followingexample = ($txt =~ s/following example/following example/gi); if ( $followingexample ) { print " $lh\"following example\" used $followingexample time", ($followingexample==1 ? '':'s'), "$rh\n"; print " If an example is following, the reader can see it without being told.\n"; } my $simply = ($txt =~ s/simply/simply/gi); my $basically = ($txt =~ s/basically/basically/gi); if ( $simply || $basically ) { print " $lh\"simply\" used $simply time", ($simply==1 ? '':'s'), "$rh\n" if $simply; print " Use \"simply\" to mean \"in a simple manner\", \"just\", or \"merely\", not the\n"; print " patronizing \"details omitted because they are not simple enough for you\".\n"; print " $lh\"basically\" used $basically time", ($basically==1 ? '':'s'), "$rh\n" if $basically; print " Use \"basically\" to mean \"essentially\" or \"fundamentally\", not \"only the\n"; print " basics are shown because anything more will be too complicated for you\".\n"; } my $the = ($txt =~ s/(?:^the|\.\s+the)\b/the/gi); my $sent = ($txt =~ s/([^.]+\.\s+)/$1/gi); my $percent = ($sent > 0 ? int($the/$sent*100) : 0); if ( $the && ($percent > 19) ) { print " $lh\"The\" used to start a sentence $the time", ($the==1 ? '':'s'), " in $sent sentence", ($sent==1 ? '':'s'), " ($percent%)$rh\n"; print " Starting too many sentences with \"the\" can be repetitive\n"; print " and dull to read.\n"; } my $cf = ($txt =~ s/\Wcf\./cf./gi); my $eg = ($txt =~ s/e\.g\./e.g./gi); my $ie = ($txt =~ s/i\.e\./i.e./gi); my $nb = ($txt =~ s/n\.b\./n.b./gi); if ( $cf ) { print " $lh\"cf.\" used $cf time", ($cf==1 ? '':'s'), "$rh\n"; print " \"Cf.\" (Latin \"confer\") means \"${lf}compare$rf\" and is mostly used in academic\n"; print " and scientific writing. Consider replacing with the more common English\n"; print " words.\n"; } if ( $eg ) { print " $lh\"e.g.\" used $eg time", ($eg==1 ? '':'s'), "$rh\n"; print " \"E.g.\" (Latin \"exempli gratia\") means \"${lf}for example$rf\" and is mostly\n"; print " used in academic and scientific writing. Consider replacing with the\n"; print " more common English words. Both forms are usually followed by a\n"; print " comma for a verbal pause: \"e.g., a b c\" or \"for example, a b c\"\n"; } if ( $ie ) { print " $lh\"i.e.\" used $ie time", ($ie==1 ? '':'s'), "$rh\n"; print " \"I.e.\" (Latin \"id est\") means \"${lf}that is$rf\" and is mostly used in academic\n"; print " and scientific writing. Consider replacing with the more common\n"; print " English words. Both forms are usually followed by a comma for\n"; print " a verbal pause: \"i.e., a b c\" or \"that is, a b c\"\n"; } if ( $nb ) { print " $lh\"n.b.\" used $nb time", ($nb==1 ? '':'s'), "$rh\n"; print " \"N.b.\" (Latin \"nota bene\") means \"${lf}note$rf\" or \"${lf}take notice${rf}\" and is mostly\n"; print " used in academic and scientific writing. Consider replacing with\n"; print " the more common English words.\n"; } my $inorderto = ($txt =~ s/in order to/in order to/gi); if ( $inorderto ) { print " $lh\"in order to\" used $inorderto time", ($inorderto==1 ? '':'s'), "$rh\n"; print " Unless \"in order to\" has some special meaning here, \"to\" is simpler.\n"; } my $invoke = ($txt =~ s/invoke/invoke/gi); if ( $invoke ) { print " $lh\"invoke\" used $invoke time", ($invoke==1 ? '':'s'), "$rh\n"; print " Unless \"invoke\" has some special meaning in context, \"run\" is simpler.\n"; } # type-specific tests if ( $type eq "troff" ) { my $examples = ($txt =~ /\n\.\s*Sh\s+EXAMPLES/i); unless ( $examples ) { print " ${lh}no \"EXAMPLES\" section found$rh\n"; print " Even trivial examples can improve clarity.\n"; print " Common-use examples are better yet.\n"; } } } # mdoc line-by-line tests my @md_displays; sub mdoc_whitespace { my ($bname, $line, $txt) = @_; if ( $txt =~ /^\.\s*Bd\s/ ) { push @md_displays, ($txt =~ /-(?:literal|unfilled)/ || 0); } elsif ( $txt =~ /^\.\s*Ed\b/ ) { pop @md_displays; } elsif ( ! length $txt && ! grep $_, @md_displays ) { showline($bname, $line, $ansi{darkblue}, "blank line", $txt); } } sub mdoc_date { my ($bname, $line, $txt) = @_; return if $txt =~ /^\s*$/; if ( $txt =~ s/^(\.\s*Dd\s+)(.*)$/$1$lh$2$rh/ ) { $docdate = $2; showline($bname, $line, $ansi{darkyellow}, "date not today, $today", $txt) if $docdate ne $today; } } sub mdoc_sentence { my ($bname, $line, $txt) = @_; return if $txt =~ /^\s*$/; if ( $txt =~ s/^(\w{2,}.*?[^ .]{2,}\.\s+)(A |I |\w{2,})(.*)$/$1$lh$2$3$rh/ ) { showline($bname, $line, $ansi{darkcyan}, 'sentence not on new line', $txt); } } sub init_mdoc_uniqxrefs { print "initializing mdoc_uniqxrefs\n" if $verbose; %seealsoxrefs = (); } sub mdoc_uniqxrefs { my ($bname, $line, $txt) = @_; return if $txt =~ /^\s*$/; # set a flag to indicate when a .Sh SEE ALSO section is found if ( $txt =~ /^\.Sh\s+(.*)/i ) { $seealso = ( $1 =~ /SEE ALSO/i ); print "mdoc_uniqxrefs: SEE ALSO section found\n" if $verbose; return; } # only check xrefs for repeats inside a SEE ALSO section if ( $seealso ) { # if inside a SEE ALSO section, stop looking for duplicates # after non-.Xr macros. These would probably be text sections # talking about the external references, not included in the list. if ( ($txt =~ /^\./) && ($txt !~ /^\.Xr/i) ) { $seealso = 0; return; } # allow both valid mdoc formats (.Xr umount 8 ,) # and bad ones (.Xr xorg.conf(5),) if ( $txt =~ /\.Xr\s+(.*)(?:\s|\()(\d{1}\w?)/i ) { my $xrefname = $1; my $xrefsect = $2; if ( $seealsoxrefs{"$xrefname-$xrefsect"} ) { $txt =~ s/($xrefname.*$xrefsect)/$lh$1$rh/g; showline($bname, $line, $ansi{yellow}, "duplicate SEE ALSO reference", $txt); } else { $seealsoxrefs{"$xrefname-$xrefsect"} = 1; } } } } sub showmacvals { my ($lastmacro, $bname, $line) = @_; for my $macro (@macros) { last if $macro eq $lastmacro; unless ( $macroval{$macro} ) { showline($bname, $line, $ansi{red}, ".$lastmacro used here", "but .$macro has not been defined"); } } } sub init_mdoc_structure { print "initializing mdoc_structure\n" if $verbose; for my $macro (@macros) { $macro =~ tr/_/ /; $macroval{$macro} = ''; } } sub mdoc_structure { my ($bname, $line, $txt) = @_; return if $txt =~ /^\s*$/; # skip if the line starts with an mdoc macro # technically, whitespace is allowed before macros return unless $txt =~ /^\s*\./; # check for required minimum macros my $parm; for my $macro (@macros) { $parm = ''; $macro =~ tr/_/ /; next if $macroval{$macro}; if ( $txt =~ /^\.\s*\Q$macro\E\s*(.*)/i ) { my $parm = $1; # provide a blank parameter for macros with optional parameters $parm = ' ' if ($macro =~ /^Os|Sh NAME|Sh SYNOPSIS|Sh DESCRIPTION/) && (!$parm); $macroval{$macro} = $parm; showmacvals($macro, $bname, $line); last; } } # check external refs (.Xr) # suggested by Glen Barber return unless $txt =~ /^.Xr/; # characters to treat as whitespace in an Xr macro my $wspace = '[ (),.:]'; # character class for section numbers # an initial number possibly followed by a letter my $sect = '\d{1}[A-Za-z]?'; my $xname = ''; $xname = $1 if $txt =~ /^.Xr$wspace+(\S+)/; my $xsection = ''; $xsection = $1 if $txt =~ /^.Xr$wspace+\S+$wspace+($sect)/; if ( ! $xname ) { showline($bname, $line, $ansi{yellow}, 'xref name missing', $txt); return; } if ( $xname =~ /\($sect\)/ ) { $txt =~ s/($xname)/$lh$1$rh/; showline($bname, $line, $ansi{yellow}, 'section number in name', $txt); return; } if ( $xsection && ($xsection gt "9") ) { $txt =~ s/^(.Xr$wspace+\S+$wspace+)($sect)/$1$lh$2$rh/; showline($bname, $line, $ansi{yellow}, 'section higher than 9', $txt); # no point in checking for sections higher than 9 return; } if ( $opt_x ) { system("$man -w $xsection $xname >/dev/null 2>&1"); if ( $? ) { if ( $xsection ) { $txt =~ s/^(.Xr$wspace+)(\S+$wspace+$sect)/$1$lh$2$rh/; } else { $txt =~ s/^(.Xr$wspace+)(\S+)/$1$lh$2$rh/; } showline($bname, $line, $ansi{darkmagenta}, 'external man page not found', $txt); # not found, no point in checking if it's this one return; } } # is this external reference referring to itself? # skip if the .Nm macro has no value return if $macroval{'Nm'} ne $xname; my $currsection = ''; if ( $macroval{'Dt'} =~ /^\S+\s+($sect)/ ) { $currsection = $1; } return if $xsection ne $currsection; if ( $xsection && $currsection ) { $txt =~ s/^(.Xr$wspace+)(\S+$wspace+$sect)/$1$lh$2$rh/; } else { $txt =~ s/^(.Xr$wspace+)(\S+)/$1$lh$2$rh/; } showline($bname, $line, $ansi{darkmagenta}, 'xref refers to *this* page (use .Nm)', $txt); } # DocBook line-by-line tests sub init_doc_titles { print "initializing doc_titles\n" if $verbose; # build regex of words that should be lowercase in titles my @lc_words = qw/ a an and at by down for from in into like near nor of off on onto or over past the to upon with /; $lc_regex = '(?:' . join('|', @lc_words) . ')'; my @uc_words = qw/ about are how log new not set tag use one two three four five six seven eight nine /; $uc_regex = '(?:' . join('|', @uc_words) . ')'; my @fixedcase_words = qw/ amd64 i386 iSCSI x86 /; $fixedcase_regex = '(?:' . join('|', @fixedcase_words) . ')'; # build regex for ignoring DocBook tagged words in titles # like ls my @ignoretags = qw/ acronym application command filename function link literal varname replaceable systemitem tag /; for my $tag (@ignoretags) { $tag = "<$tag.*?>.*?<\/$tag>"; } $ignoreregex = '|' . join('|', @ignoretags) } sub doc_titles { my ($bname, $line, $txt) = @_; return if $txt =~ /^\s*$/; my $txtbak = $txt; return if $ignoreblock; $titleblock = 1 if $txt =~ /(.*?)(?:<\/title>|$)/) || ($txt =~ /(.*)(?:<\/title>)/) ) { @words = split /($ignoreregex|\s+)/, $1; } else { @words = split /($ignoreregex|\s+)/, $txt; } # use AP style: capitalize words longer than three letters; see also # http://www.freebsd.org/cgi/cvsweb.cgi/doc/en_US.ISO8859-1/books/handbook/linuxemu/chapter.sgml#rev1.48 WORD: for my $i (0..$#words) { my $word = $words[$i]; print "doc_titles: analyzing '$word'\n" if $verbose; next WORD if $word =~ /\s+/; next WORD if $word =~ /$ignoreregex/; # special case: skip the contents of some unfinished tags # <title>Configuring <acronym role="Domain Name # System">DNS</acronym> next WORD if $word =~ /(?:role)=/; # special case: allow single lowercase "s" for plurals next WORD if $word eq 's'; # special case words that should not be capitalized next WORD if $word =~ /^$fixedcase_regex$/; # first word should be capitalized if ( ($txt =~ / 3) ) { $words[$i] = highlight_string($word); next WORD; } } if ( $word =~ /^$uc_regex$/i ) { $words[$i] = highlight_string($word); next WORD; } } } # reconstruct the now-capitalized title $txt = ''; $txt = $1 if $txtbak =~ /^(.*<title.*?>)/; $txt .= join('', @words); $txt .= $1 if $txtbak =~ /(<\/title.*?>)/; if ( $txt ne $txtbak ) { print "doc_titles:\n original='$txtbak'\n highlighted='$txt'\n" if $verbose; showline($bname, $line, $ansi{blue}, 'capitalization', $txt); } $titleblock = 0 if $txt =~ /<\/title>/; } sub init_doc_indentation { print "initializing doc_indentation\n" if $verbose; # build regex for detecting DocBook tags that begin or # end an indented section my @indent_tags = qw/ abstract answer appendix article articleinfo author authorgroup biblioentry bibliography biblioset blockquote book bookinfo callout calloutlist category chapter chapterinfo colophon caution contrib date day entry event example figure formalpara funcdef funcsynopsis funcprototype glossary glossdef glossdiv glossentry glossterm important imageobject imageobjectco info informaltable informalexample itemizedlist legalnotice listitem mediaobject mediaobjectco month name note orderedlist para paramdef partintro personname preface procedure qandadiv qandaentry qandaset question row screenco sect1 sect2 sect3 sect4 sect5 section seglistitem segmentedlist sidebar step stepalternatives surname table tbody tgroup thead tip title variablelist varlistentry warning year /; # add VuXML tags @indent_tags = (@indent_tags, qw/ affects body cvename dates description discovery head html li name p range references topic ul vuln vuxml /); @indent_tags = (sort {length($b) <=> length($a)} @indent_tags); print "indentation tags: @indent_tags\n" if $verbose; $indent_regex = '(?:' . join('|', @indent_tags) . ')'; print "indentation regex: $indent_regex\n" if $verbose; # build regex for inline tags like # <filename>blah</filename> my @inline_tags = qw/ a acronym application citetitle command computeroutput devicename emphasis envar errorname filename firstterm footnote function guimenu guimenuitem hostid imagedata indexterm keycap keycombo link literal makevar option optional package parameter primary quote remark replaceable secondary see seg sgmltag simpara strong structname systemitem term tt ulink uri varname /; # add VuXML tags @inline_tags = (@inline_tags, qw/ ge gt le lt url /); @inline_tags = (sort {length($b) <=> length($a)} @inline_tags); print "inline tags: @inline_tags\n" if $verbose; $inline_regex = '(?:' . join('|', @inline_tags) . ')'; print "inline regex: $inline_regex\n" if $verbose; } sub doc_indentation { my ($bname, $line, $currline) = @_; my ($init_prev_indent, $init_curr_indent); return if $currline =~ /^\s*$/; # indents are not significant inside ignorable SGML blocks. return if $ignoreblock; return if $currline =~ /^\s*<!--.*-->\s*$/; # \b is needed here to prevent <parameter> being detected as <para> return unless $prevnonblank =~ /<\/*$indent_regex\b.*?>/; my $prev_indent = length(leading_space($prevnonblank)); my $curr_indent = length(leading_space($currline)); if ( $verbose ) { # save initial values for later verbose reporting $init_prev_indent = $prev_indent; $init_curr_indent = $curr_indent; } # indent once for open tag on previous line $prev_indent += 2 if $prevnonblank =~ /<$indent_regex\b/; # allow for inline tag indenting, like # <link # url= # or # <makevar>xyz # abc</makevar> my $count = 0; $count += ($prevnonblank =~ s/(<$inline_regex)\b/$1/g); $count -= ($prevnonblank =~ s/(<\/$inline_regex)\b/$1/g); $prev_indent += (2 * $count); # if previous line ends in an open xref, indent $prev_indent += 2 if ($prevnonblank =~ /<xref\s*$/); # <xref> has no close tag, but uses "linkend=" the same as <link> # which *does* have a close tag... so if there's a linkend= on # previous line but no </ulink> or </link> on either previous # or current lines, assume it's an xref and outdent my $broken_regex = '(?:(?:linkend|url)=)'; if ( $prevnonblank =~ /^\s*$broken_regex/ ) { if ($prevnonblank !~ /<\/(?:link|ulink)/) { if ($currline !~ /<\/(?:link|ulink)/) { $prev_indent -= 2; } } } # outdent for close tag at end of previous line $prev_indent -= 2 if ($prevnonblank =~ /\S+.*<\/$indent_regex>\s*$/); # outdent for close tag at the start of this line $prev_indent -= 2 if ($currline =~ /^\s*<\/$indent_regex/); # outdent after footnote $prev_indent -=2 if $prevnonblank =~ /<\/para><\/footnote>/; # singleton tags like <entry/> are really just an empty # open/close tag, <entry></entry>, allow for them $prev_indent -=2 if $prevnonblank =~ /\/>$/; # close tags after long sections of nonindented blocks, # like the end of a programlisting, cannot be correctly # checked for indentation in this hacky way, so ignore them if ( ($prevnonblank =~ /$ignoreblockstart|$ignoreblockend/) || ($currline =~ /$ignoreblockend/) ) { $curr_indent = $prev_indent; } if ( $curr_indent != $prev_indent ) { if ( $verbose ) { print "doc_indentation:\n"; my $vprev = showwhitespace($prevnonblank); my $vcurr = showwhitespace($currline); print "previous nonblank line: '$vprev\'\n"; print " current line: '$vcurr\'\n"; print "\t\t\t\tinitial\tfinal\n"; print "previous nonblank indent:\t$init_prev_indent\t$prev_indent\n"; print " current indent:\t$init_curr_indent\t$curr_indent\n"; } my $out = $origline; $out =~ s/(^\s+)/$li$1$ri/; showline($bname, $line, $ansi{darkred}, 'bad tag indent', $out); } } # split and return leading space and content sub splitleading { my $txt = shift; my $inspace = ''; my $content = $txt; if ( $txt =~ /^(\s*)(.*)/ ) { $inspace = $1 if $1; $content = $2 if $2; } return ($inspace, $content); } sub doc_longlines { my ($bname, $line, $txt) = @_; return if $txt =~ /^\s*$/; return if $ignoreblock; # if line is longer than $linelensgml (normally 70) chars # and the part after the indent has spaces # this should be smarter, like seeing if the part before the space # will benefit from wrapping # ignore long lines with these tags return if $txt =~ /<(?:!DOCTYPE|!ENTITY|pubdate|releaseinfo)/; $txt = expand_tabs($txt); if ( length($txt) > $linelensgml ) { my ($inspace, $content) = splitleading($txt); my $currline = substr($content, 0, $linelensgml - length($inspace)); my $nextline = substr($content, length($currline)); if ( $currline =~ / / ) { $currline =~ s/^(.*)? (.*)$/$1$li $ri$2/; showline($bname, $line, $ansi{green}, 'wrap long line', "$inspace$currline$nextline"); } elsif ( $nextline =~ s/ /$li $ri/ ) { showline($bname, $line, $ansi{green}, 'wrap long line', "$inspace$currline$nextline"); } } } sub init_doc_sentence { print "initializing doc_sentence\n" if $verbose; # end of sentence characters: literal dot, question mark, exclamation point $eos_regex = '\.|\?\!'; } sub doc_sentence { my ($bname, $line, $txt) = @_; return if $txt =~ /^\s*$/; return if $ignoreblock; # skip if there is no end-of-sentence character return unless $txt =~ /(?:$eos_regex)/; my $errcount = 0; my ($inspace, $content) = splitleading($txt); my @sentences = grep (! /^$/, split /((?:.*?(?:$eos_regex)+\s+)|(?:<.*?>))/, $content); for my $s (@sentences) { # skip unless it has a one-space possible sentence start next unless $s =~ /\. $/; # SGML markup, like "<emphasis>bold</emphasis>." #next if $s =~ />\. $/; # single dots, like from "find . -name '*.sgml'" next if $s =~ / \. $/; # initials next if $s =~ /[A-Z]{1}\. $/; # common abbreviations next if $s =~ /(?:Ave|Dr|Ed|etc|Inc|Jr|Mass|Pub|Sp|St|Str|str|o\.o)\. $/; # ignore misuse of cf., e.g., i.e., and v.s., they are not # end of sentence errors next if $s =~ /(?:cf|e(?:\.)*g|i\.e|v\.s)\. $/i; # months next if $s =~ /(?:Jan|Feb|Mar|Apr|May|Jul|Aug|Sep|Oct|Nov|Dec)\. $/; # numbers, like "... and 1997." next if $s =~ /\d+\. $/; # ellipsis next if $s =~ /\.\.\. $/; # it must be a single-space sentence start $s =~ s/ $/$li $ri/; $errcount++; } if ( $errcount ) { # reassemble the now-highlighted string $txt = $inspace . join('', @sentences); showline($bname, $line, $ansi{darkblue}, 'use two spaces at sentence start', $txt); } } sub init_doc_openclose { print "initializing doc_openclose\n" if $verbose; @openclose_tags = qw/ callout entry filename footnote li listitem literal p para row step /; for my $tag (@openclose_tags) { $opentag{$tag} = 0; } $openclose_regex = join('|', @openclose_tags); my @list_tags = qw/ itemizedlist orderedlist variablelist /; $list_regex = join('|', @list_tags); my @parawrap_tags = qw/ footnote listitem /; $parawrap_regex = join('|', @parawrap_tags); } sub doc_openclose { my ($bname, $line, $txt) = @_; return if $txt =~ /^\s*$/; return if $ignoreblock; return unless $txt =~ /</; my $errcount = 0; my ($inspace, $content) = splitleading($txt); my @chunks = split(/(<.*?(?:>|$))/, $content); @chunks = grep (! /^\s*$/, @chunks); for my $chunk (@chunks) { next unless $chunk =~ /</; for my $tag (@openclose_tags) { next unless $chunk =~ /(?:$openclose_regex)/; if ( $chunk =~ /$tag/ ) { # check for open without close if ( $opentag{$tag} && $chunk =~ /<$tag\b/ ) { $chunk =~ s/(<$tag\b)/$lh$1$rh/; showline($bname, $line, $ansi{red}, "open <$tag> without closing", $inspace . join('', @chunks)); } # check for close without open if ( ! $opentag{$tag} && $chunk =~ /<\/$tag>/ ) { $chunk =~ s/(<\/$tag\W)/$lh$1$rh/; showline($bname, $line, $ansi{red}, "close </$tag> without opening", $inspace . join('', @chunks)); } # evaluate closes $opentag{$tag} = 0 if $chunk =~ /<\/$tag>/; # evaluate opens $opentag{$tag} = 1 if $chunk =~ /<$tag\b/; } } # special-case closes # <para> can be inside footnotes or lists $opentag{'para'} = 0 if $chunk =~ /<(?:$parawrap_regex)\b/; $opentag{'para'} = 0 if $chunk =~ /<\/(?:$list_regex)>/; # list tags like <itemizedlist> start a new list # so 'listitem' is no longer open $opentag{'listitem'} = 0 if $chunk =~ /<(?:$list_regex)\b/; # procedures can be nested, so <procedure> closes <step> $opentag{'step'} = 0 if $chunk =~ /<procedure\b/; # special-case opens $opentag{'para'} = 1 if $chunk =~ /<\/(?:$parawrap_regex)>/; $opentag{'para'} = 1 if $chunk =~ /<(?:$list_regex)\b/; # list tags like </itemizedlist> end a list # so 'listitem' is open again $opentag{'listitem'} = 1 if $chunk =~ /<\/(?:$list_regex)>/; # procedures can be nested, so </procedure> opens <step> $opentag{'step'} = 1 if $chunk =~ /<\/procedure\b/; } } sub init_literalblock_regex { print "initializing literalblock_regex\n" if $verbose; # used by multiple tests $literalblock_regex = 'literallayout|programlisting|screen'; } sub doc_tagstyle_whitespace { my ($bname, $line, $currline) = @_; return if $ignoreblock; my $currlinebak = $currline; # <title> if ( $currline =~ s/^(\s*\S+.*?)(<title)/$1$lh$2$rh/ ) { showline($bname, $line, $ansi{darkcyan}, 'put <title> on new line', $currline); $currline = $currlinebak; } # <para> if ( $currline =~ s/(<\/para>)([^< ]+)$/$1$lh$2$rh/ ) { showline($bname, $line, $ansi{red}, 'character data is not allowed here', $currline); $currline = $currlinebak; } # (programlisting> if ( $currline =~ /<programlisting/ ) { # <programlisting> should not be used as an inline tag if ( $currline =~ s/(\S+\s*<programlisting.*?>)/$lh$1$rh/ ) { showline($bname, $line, $ansi{red}, 'do not use <programlisting> inline in other elements', $currline); $currline = $currlinebak; } elsif ( ($currline =~ /\s*<programlisting/) && ($prevnonblank !~ /<\/(?:entry|formalpara|indexterm|note|para|programlisting|screen|title)>\s*$/) ) { # <programlisting> allowed inside these elements return if $prevnonblank =~ /<(?:example|informalexample)>/; $currline =~ s/(<programlisting.*?>)/$lh$1$rh/; showline($bname, $line, $ansi{red}, 'do not use <programlisting> inside other elements', $currline); $currline = $currlinebak; } } # elements that should be preceded by a blank line if ( $prevline =~ /\S+/ ) { # an open tag like <informalexample> is okay, otherwise # there should be a blank line before these tags if ( ($prevline !~ /<.*?>\s*$/) && ($currline =~ s/(<(?:$literalblock_regex).*?(?:>|$))/$lh$1$rh/) ) { showline($bname, $line, $ansi{darkcyan}, "precede $1 with a blank line", $currline); $currline = $currlinebak; } } # elements that should be followed by a blank line if ( $currline =~ /\S+/ ) { # a close tag like </note> is okay, otherwise there # should be a blank line after these tags # unless they are followed by another close tag on the same line # example: </literallayout></entry> # if ( ($currline !~ /^\s*<\//) && ($prevline =~ /(<\/(?:$literalblock_regex|row|step|title)>)/) ) { if ( ($currline !~ /^\s*<\//) && ($prevline =~ /(<\/(?:$literalblock_regex|row|step|title)>)/) && ($prevline !~ /<\/entry>$/) ) { showline($bname, $line, $ansi{darkcyan}, "add blank line after $1 on previous line", "$lh$currline$rh"); } } } sub init_doc_writestyle { print "initializing doc_writestyle\n" if $verbose; $redundantword_regex = 'command|filename|keycap|option'; $redundanttagword_regex = '(<\/(?:command> command|filename> file|keycap> key|option> option))\b'; } sub doc_writestyle { my ($bname, $line, $currline) = @_; return if $ignoreblock; my $currlinebak = $currline; # test for redundant markup and words starting on the previous line if ( $prevline =~ /(<\/(?:$redundantword_regex)>*\s*$)/ ) { my $prevend = $1; for my $word (split('|', $redundantword_regex)) { next unless $prevend =~ /$word/; next unless $currline =~ /^\s*>*\s*(\w+)\s*(?:\W+|$)/; my $firstword = $1; if ( "$prevend $firstword" =~ /$redundanttagword_regex/ ) { $currline =~ s/^(\s*)($firstword)\b/$1$lh$2$rh/; showline($bname, $line-1, $ansi{darkmagenta}, 'redundant markup and word', "... $lh$prevend$rh"); showline($bname, $line, $ansi{darkmagenta}, 'redundant markup and word', $currline); $currline = $currlinebak; last; } } } # test for redundant markup and words on the current line if ( $currline =~ /$redundantword_regex/ ) { if ( $currline =~ s/$redundanttagword_regex/$lh$1$rh/ ) { showline($bname, $line, $ansi{darkmagenta}, 'redundant markup and word', $currline); $currline = $currlinebak; } } } sub init_doc_stragglers { print "initializing doc_stragglers\n" if $verbose; @straggler_tags = qw/ command entry literal para title /; } sub doc_stragglers { my ($bname, $line, $txt) = @_; return if $txt =~ /^\s*$/; # check for literal start tags without listing on the same line my $tag; if ( $txt =~ />\s*$/ ) { if ( $txt =~ /<($literalblock_regex)[^<]?>$/ ) { $tag = $1; $txt =~ s/(<$tag[^<]?>)$/$lh$1$rh/; showline($bname, $line, $ansi{yellow}, "put <$tag> listing on same line", $txt); return; } elsif ( $txt =~ /^\s*<\/($literalblock_regex)[^<]?>/ ) { $tag = $1; $txt =~ s/(<\/$tag[^<]?>)$/$lh$1$rh/; showline($bname, $line, $ansi{yellow}, "straggling </$tag>", $txt); return; } } # the following tests are only for close tags at the start of a line return unless $txt =~ /^\s*<\//; return if $ignoreblock; # stragglers can't be detected when coming out of an ignore block return if ( $prevline =~ /$ignoreblockstart|$ignoreblockend/ ); # more special-case hackery to handle # </table> # </para> if ( ($prevline =~ /<\/table>\s*$/) && ($txt =~ /^\s*<\/para>\s*$/) ) { return; } for my $tag (@straggler_tags) { if ( $txt =~ /^\s*(<\/$tag>)\s*$/ ) { $txt = highlight_word($txt, $1); showline($bname, $line, $ansi{yellow}, "straggling </$tag>", $txt); } } } sub doc_whitespace { my ($bname, $line, $txt) = @_; my $txtbak = $txt; # indents and tabs/spaces are not significant inside # ignorable SGML blocks return if $ignoreblock; # multiples of eight spaces at the start a line # (after zero or more tabs) should be a tab if ( $txt =~ s/^(\t* {8})+/$li$1$ri/g ) { showline($bname, $line, $ansi{darkmagenta}, 'use tabs instead of spaces', $txt); } # tabs hidden in paragraphs is also bad $txt = $txtbak; if ( $txt =~ s/^(\s*\S+)(.*)(\t)/$1$2$li$3$ri/ ) { showline($bname, $line, $ansi{darkmagenta}, 'tab in content', $txt); } # if coming out of an ignoreblock, odd spaces are # an artifact of splitting the line and can't be checked return if ( $prevline =~ /$ignoreblockstart|$ignoreblockend/ ); # one or more occurrences of single tabs or double spaces, # followed by a single space, is a bad indent # if ( $txt =~ s/^((?:(?: )+|(?:\t+))* )\b/$li$1$ri/ ) { # but simpler just to expand tabs to 8 spaces # and check for an odd number of spaces $txt = $txtbak; $txt = expand_tabs($txt); if ( $txt =~ s/^((?: )* )\b/$li$1$ri/ ) { showline($bname, $line, $ansi{darkred}, 'bad indent', $txt); } } # DocBook batch tests # remember previous line for comparison sub saveprevline { my $pline = shift; $prevline = $pline; if ( $pline =~ /\S+/ ) { # treat comments as blank lines return if $pline =~ /\s*<!--/; return if $pline =~ /-->\s*$/; $prevnonblank = $pline; } } initialize(); # main loop foreach my $fname (@ARGV) { if ( $fname ne 'stdin' ) { next if -d $fname; unless ( -f $fname ) { print "$fname: not found\n"; next; } unless ( -r $fname ) { print "$fname: not readable\n"; next; } } unless ( $opt_X ) { print "$fname:\n" if $#ARGV > 0; } else { print "<file name=\"", xmlize($fname), "\">\n"; } $fname = writestdinfile() if $fname eq "stdin"; $bname = basename($fname); $tmpfile = ''; $type = filetype($fname); if ( $type =~ /gzip|bzip/ ) { $tmpfile = uncompress($fname, $type); $type = filetype($tmpfile); } print "detected file type:$type\n" if $verbose; open $fh, '<', ($tmpfile ? $tmpfile : $fname) or die "cannot open '$tmpfile':$!\n"; # reset for each new document init_mdoc_uniqxrefs() if $opt_g; # mdoc see also xrefs init_mdoc_structure() if $opt_m; # mdoc tag presence $ignoreblock = 0; # ignore SGML block my $saveindent = ''; # SGML indent level # line-by-line tests while (<$fh>) { # limit output to one vulnerability ID if ( $vid ) { if ( $_ =~ /<vuln/ ) { print "checking for VID in '$_'\n" if $verbose; if ( $vid eq 'latest' ) { $startline = $. if $_ =~ /<vuln vid=/; } else { $startline = $. if $_ =~ /<vuln vid=\"$vid\"/; } $stopline = $. if $_ =~ /<\/vuln/; print "VID: startline=$startline, stopline=$stopline\n" if $verbose; } } # end if past specified ending line number last if $stopline && ($. > $stopline); chomp; # global tests abbrevs($bname, $., $_) if $opt_a; badphrases($bname, $., $_) if $opt_b; contractions($bname, $., $_) if $opt_u; freebsdobsolete($bname, $., $_) if $opt_f; repeatedwords($bname, $., $_) if $opt_r; spellingerrors($bname, $., $_) if $opt_s; whitespace($bname, $., $_) if $opt_w; # mdoc line tests if ( $type eq "troff" ) { next if /^\.\\\"/; # ignore comments for these tests mdoc_whitespace($bname, $., $_) if $opt_p; mdoc_date($bname, $., $_) if $opt_d; mdoc_sentence($bname, $., $_) if $opt_e; mdoc_uniqxrefs($bname, $., $_) if $opt_g; mdoc_structure($bname, $., $_) if $opt_m; } # DocBook line tests if ( $type =~ /sgml|xml/ ) { $origline = $_; doc_stragglers($bname, $., $_) if $opt_S; doc_tagstyle_whitespace($bname, $., $_) if $opt_t; for my $segment (splitter($_)) { if ( $segment =~ /($ignoreblockstart)/ ) { # when entering an ignore block, test the full # line for indentation unless it is a comment unless ( $origline =~ /^\s*<!--/ ) { doc_indentation($bname, $., $origline) if $opt_i; # test just the indent for whitespace my ($origindent, undef) = splitleading($origline); doc_whitespace($bname, $., $origindent) if $opt_W; $saveindent = leading_space($origline); # save the same state information as the main loop would saveprevline($saveindent . $1); # test just the leading whitespace } $ignoreblock = 1; next; } elsif ( $segment =~ /($ignoreblockend)/ ) { # restore the indent level at the end of an ignore block $ignoreblock = 0; $prevline = substr($saveindent,0,length($saveindent)-2) . $1; next; } doc_titles($bname, $., $segment) if $opt_c; doc_indentation($bname, $., $segment) if $opt_i; doc_longlines($bname, $., $segment) if $opt_l; doc_sentence($bname, $., $segment) if $opt_n; doc_openclose($bname, $., $segment) if $opt_o; doc_writestyle($bname, $., $segment) if $opt_E; doc_whitespace($bname, $., $segment) if $opt_W; } } saveprevline($_); } close $fh or die "could not close file:$!\n"; if ( $opt_d || $opt_y ) { # skip batch tests if a line range is set last if $opt_C; # slurp the whole file open $fh, '<', ($tmpfile ? $tmpfile : $fname) or die "cannot open '$tmpfile':$!\n"; my $fulltext = do { local($/); <$fh> }; close $fh or die "could not close file:$!\n"; # global batch tests style($bname, $fulltext) if $opt_y; # mdoc batch tests if ( ($type eq "troff") && ($opt_d) && (!$docdate) ) { showline($bname, '-', '.Dd date not set', '', ''); } } print "</file>\n" if $opt_X; removetempfiles(); }