#htmlchek.awk: Syntactically checks HTML files for a number of possible errors.
#
# Typical use:
#
# awk -f htmlchek.awk [options] infile.html > outfile.check
#
# Where options have the form "option=value", and are detailed in the
# documentation.
#
# This program is written in the ``awk'' programming language (on Sun systems
# and some others, non-archaic ``awk'' is called ``nawk'', so that ``nawk''
# should be used instead of ``awk''). Also, a freely-redistributable ``awk''
# interpreter called ``gawk'', which is free of the bugs that some of the
# vendor-supplied ``awk''/``nawk'' programs suffer from, is available for most
# platforms, and as source from the FSF GNU project. Use the separate file
# htmlchek.sh, distributed with the htmlchek package, to avoid running this
# program under incompatible ``old awk'' on Unix.
#
# Copyright H. Churchyard 1994, 1995 -- freely redistributable.
#
# Version 1.0 10/15/94 -- Tested with gawk 2.15 and nawk on SunOS, gawk 2.14
# and nawk on DEC Alpha OSF/1, and gawk 2.11 on 16-bit MS-DOS.
#
# Version 1.1 10/25/94 -- Fixed a tag option parsing bug that I had detected,
# but not corrected, before posting 1.0. Added checks against <head>-only
# elements in <body>...</body>, against <dt> and <dd> not in <dl>..</dl>, and
# for the presence of <title> and <h1>, and <img> with ``alt'' option. A user
# has reported success with gawk 2.15 on VMS. Multiple files on the command
# line are now supported, and the syntax ``<A B = "c">'' generates a warning,
# rather than errors.
#
# Version 1.2 10/30/94 -- Some code cleanup, was too restrictive on possible
# ampersand codes. Handle <head>...<body>...</body> syntax, support <!doctype>
# and <meta> tags, warn about whitespace after ``<'' (and after ``>'' of some
# opening pairing tags), warn about some empty <X></X> elements. Almost too
# big for 16-bit MS-DOS.
#
# Version 2.0 11/20/94 -- Clarified status of U and DFN tags, added rudimentary
# cross-reference checking ability, support HTMLPlus and Netscape extensions,
# added checks for location option values which are null or missing or have
# embedded whitespace, check tag names with embedded `=' or `"', miscellaneous
# non-whitespace and <dl> checks, separated docs from program source, improved
# docs, added perl port and Unix shell scripts.
#
# Version 3.0 12/1/94 -- Added automatic file-local reference checking,
# recognize separate behavior of optionally-pairing and obligatorily-pairing
# tags, allow redefinition of the language checked for through command-line
# switches or an external configuration file, cut down on some repeated
# redundant error messages, updated HTML 3.0 preliminary specification that
# is checked for, do not treat `<' followed by whitespace as a tag beginning,
# check for allowed and required tag options, check for quoted URL's, numerous
# minor fixes and enhancements. Tested under VMS Posix.
#
# Version 3.01 12/11/94 -- Allow <!--...--> in low-level mark up, unquoted
# non-alphanumeric option values are now recognized as errors, fixed a typo in
# one error message.
#
# Version 4.0 1/17/95 -- Added internal cross-reference checking (not as hard
# as I thought it would be!); added option of generating dependency map; added
# command-line options to allow `<' and`>' characters within quoted attribute
# values and <!-- --> comments, and `>' characters outside tags; added sample
# configuration files; added check for content of <ADDRESS> element; now detect
# multiple <HEAD> elements in document; <OPTION>, <TEXTAREA>, and <TITLE>
# elements should not contain any tags; <INPUT>, <SELECT> and <TEXTAREA> do not
# have to be _immediately_ contained within a <FORM> (inclusion exception);
# allow reqopts= command-line option to specify multiple required attributes
# for a single tag; added dlstrict= option and changed default strictness to
# that of dlstrict=1; differentiated novalopts= from tagopts=; added
# subtract="..." command-line option (to facilitate checking files outside
# current directory); updated Arena/HTML3 language definition; tinkered with
# the Netscape language definition (in the absence of any definitive
# documentation); improved internal htmlchek.pl options checking; other minor
# fixes and enhancements.
#
# Version 4.1 2/20/1995 -- Don't warn about null <TEXTAREA></TEXTAREA>
# element; only check for inappropriate whitespace within elements
# commonly rendered as underlined (<A> and <U>); check ordering of head
# tags before body tags even in absence of explicit <head>...</head>;
# allow comments between list items; only output non-numeric unquoted
# option values in each file; corrected processing of HTML3 <LH>;
# updated HTML 3 language definition to January 19 1995 draft; tinkered
# with Netscape extensions language-definition yet again; added inline=1
# command-line parameter; added listfile=/lf= command-line parameter
# (especially for greater MS-DOS convenience); allow cf= as abbreviation
# of configfile=; ampersands followed by non-alphabetics generate
# warnings rather than errors.
#
BEGIN{
# Build the tables of known HTML tagwords.  Each table is an associative
# array keyed by the upper-case tag name; the presence of a key marks the tag
# as a member of that class.  The member names are written out as
# blank-separated lists and loaded with split(); bgs, bgt, bgp, bgn and bgi
# are scratch variables used only for that loading.
#
# Tags are divided into non-pairing tags (<X> occurs without a following
# </X>), optionally-pairing tags, and pairing tags (<X>...</X>).  The union
# of ``unpair'', ``canpair'' and ``pair'' is the set of all tags known to
# this program.
#
# Non-pairing:
#
  bgn=split("!-- !DOCTYPE BASE BR COMMENT HR IMG INPUT ISINDEX LINK META NEXTID",bgt," ");
  for (bgi=1;bgi<=bgn;++bgi) {unpair[bgt[bgi]]=1};
#
# Optionally-pairing:
#
  bgn=split("DD DT LI OPTION P PLAINTEXT",bgt," ");
  for (bgi=1;bgi<=bgn;++bgi) {canpair[bgt[bgi]]=1};
#
# Pairing:
#
  bgs="A ADDRESS B BLOCKQUOTE BODY CITE CODE DFN DIR DL EM FORM H1 H2 H3";
  bgs=(bgs " H4 H5 H6 HEAD HTML I KBD KEY LISTING MENU OL PRE S SAMP SELECT");
  bgs=(bgs " STRONG TEXTAREA TITLE TT U UL VAR XMP");
  bgn=split(bgs,bgt," ");
  for (bgi=1;bgi<=bgn;++bgi) {pair[bgt[bgi]]=1};
#
# Deprecated:
#
  bgn=split("COMMENT LISTING PLAINTEXT XMP",bgt," ");
  for (bgi=1;bgi<=bgn;++bgi) {deprec[bgt[bgi]]=1};
#
# Proposed and/or used, but not part of the HTML 1.24 DTD:
#
  bgn=split("DFN KEY U S",bgt," ");
  for (bgi=1;bgi<=bgn;++bgi) {nonstd[bgt[bgi]]=1};
#
# Allowed in the <head>...</head> element:
#
  bgn=split("ISINDEX HEAD !--",bgt," ");
  for (bgi=1;bgi<=bgn;++bgi) {inhead[bgt[bgi]]=1};
#
# ... and also not allowed in <body>...</body>:
#
  bgn=split("BASE LINK META NEXTID TITLE",bgt," ");
  for (bgi=1;bgi<=bgn;++bgi) {headonly[bgt[bgi]]=1};
#
# Allowed only in context of a form -- OPTION only in context of SELECT:
#
  bgn=split("INPUT SELECT TEXTAREA",bgt," ");
  for (bgi=1;bgi<=bgn;++bgi) {formonly[bgt[bgi]]=1};
#
# Lists -- all <LI> must be first order daughter of these and vice versa:
#
  bgn=split("DIR MENU OL UL",bgt," ");
  for (bgi=1;bgi<=bgn;++bgi) {list[bgt[bgi]]=1};
#
# Lists that do not involve <LI> -- this is almost only used for the "Maximum
# depth of list embedding" diagnostic:
#
  nonlilist["DL"]=1;
#
# Lists whose <LI> can only contain low-level markup:
#
  bgn=split("DIR MENU",bgt," ");
  for (bgi=1;bgi<=bgn;++bgi) {lowlvlist[bgt[bgi]]=1};
#
# Elements that can't contain _any_other_ tags within them:
#
  bgn=split("TITLE OPTION TEXTAREA",bgt," ");
  for (bgi=1;bgi<=bgn;++bgi) {pcdata[bgt[bgi]]=1};
#
# Tags that require the presence of some option, written as TAG,OPTION
# (A is checked separately):
#
  bgs="BASE,HREF IMG,SRC LINK,HREF META,CONTENT NEXTID,N SELECT,NAME";
  bgs=(bgs " TEXTAREA,NAME TEXTAREA,ROWS TEXTAREA,COLS");
  bgn=split(bgs,bgt," ");
  for (bgi=1;bgi<=bgn;++bgi)
    {split(bgt[bgi],bgp,",");
     rqopt[bgp[1],bgp[2]]=1};
#
# Allowed options, written as TAG,OPTION,N where N becomes the value of
# opt["TAG","OPTION"]; if that value is 1, the option does not require a
# value.
#
  bgs="A,HREF,2 A,METHODS,2 A,NAME,2 A,REL,2 A,REV,2 A,TITLE,2 A,URN,2";
  bgs=(bgs " BASE,HREF,2 DIR,COMPACT,1 DL,COMPACT,1 FORM,ACTION,2");
  bgs=(bgs " FORM,ENCTYPE,2 FORM,METHOD,1 HTML,VERSION,2 IMG,ALIGN,2");
  bgs=(bgs " IMG,ALT,2 IMG,ISMAP,1 IMG,SRC,2 INPUT,ALIGN,2 INPUT,CHECKED,1");
  bgs=(bgs " INPUT,MAXLENGTH,2 INPUT,NAME,2 INPUT,SIZE,2 INPUT,SRC,2");
  bgs=(bgs " INPUT,TYPE,2 INPUT,VALUE,2 LINK,HREF,2 LINK,METHODS,2");
  bgs=(bgs " LINK,REL,2 LINK,REV,2 LINK,TITLE,2 LINK,URN,2 MENU,COMPACT,1");
  bgs=(bgs " META,CONTENT,2 META,HTTP-EQUIV,2 META,NAME,2 NEXTID,N,2");
  bgs=(bgs " OL,COMPACT,1 OPTION,SELECTED,1 OPTION,VALUE,2 PRE,WIDTH,2");
  bgs=(bgs " SELECT,MULTIPLE,1 SELECT,NAME,2 SELECT,SIZE,2 TEXTAREA,COLS,2");
  bgs=(bgs " TEXTAREA,NAME,2 TEXTAREA,ROWS,2 UL,COMPACT,1");
  bgn=split(bgs,bgt," ");
  for (bgi=1;bgi<=bgn;++bgi)
    {split(bgt[bgi],bgp,",");
     opt[bgp[1],bgp[2]]=(bgp[3]+0)};
#
# These elements -- and also <LI> in MENU or DIR -- can only contain low-level
# markup (ADDRESS is hard-wired separately because it can contain <P>):
#
  bgn=split("DT H1 H2 H3 H4 H5 H6 PRE",bgt," ");
  for (bgi=1;bgi<=bgn;++bgi) {text[bgt[bgi]]=1};
#
# Low-level markup elements that can only contain other low-level markup.
# Special coding elsewhere allows headings in <A>, and <HR> in <PRE>.
#
  bgn=split("A B CITE CODE DFN EM I KBD S SAMP STRONG TT U VAR",bgt," ");
  for (bgi=1;bgi<=bgn;++bgi) {lowlv[bgt[bgi]]=1};
#
# Non-pairing low-level markup tags:
#
  bgn=split("BR IMG !--",bgt," ");
  for (bgi=1;bgi<=bgn;++bgi) {lwlvunp[bgt[bgi]]=1};
#
# Pairing but non-self-nesting tags -- i.e. one occurrence of <x>...</x> can
# never occur inside another occurrence of <x>...</x>, no matter how many
# intervening levels of embedding.  This is stricter than the standard,
# since such self-nesting is almost certain to be by mistake, and this is a
# powerful error-detecting technique.  BODY and HEAD are deliberately left
# out: there are separate checks for those.
#
# In official specification:
  bgs="A ADDRESS FORM DIR H1 H2 H3 H4 H5 H6 HTML MENU PRE SELECT TEXTAREA TITLE";
# Added by the author:
  bgs=(bgs " B CITE CODE DFN EM I KBD LISTING S SAMP STRONG TT U VAR XMP");
  bgn=split(bgs,bgt," ");
  for (bgi=1;bgi<=bgn;++bgi) {nonnest[bgt[bgi]]=1};
#
# Document-enclosing tag:
  html["HTML"]=1;
#
# List-file handling.  In Posix-compliant awk, the lf= or listfile= option
# will have to be preceded by the -v parameter (-v lf=names.lis) so that it is
# already set here.  Manipulating ARGV and ARGC will not work on older awks.
#
  if (lf)
    {if (listfile)
      {print "Error: both lf= and listfile= specified";
       err=1;
       exit(1)};
     listfile=lf};
  if (listfile)
    {args=0;
     for (;;)
       {status=(getline listfiline < listfile);
        if (status!=1) {break};
        ++args;
# Leave command-line operands containing `=' in place; file names go into
# the remaining ARGV slots.
        while (ARGV[args]~/\075/) {++args};
        sub(/[ \t]+$/,"",listfiline);
        sub(/^[ \t]+/,"",listfiline);
        ARGV[args]=listfiline};
     if ((status!=-1)&&(args!=0))
       {ARGC=(args+1);
        lf="";
        listfile=""}
     else
       {print "Error opening list file!";
        err=1;
        exit(1)}}}
# startit() -- one-time initialisation; the main rule calls it when the first
# input record is read (NR==1).  Takes no arguments and returns nothing; it
# works entirely through global variables:
#  - rejects lf=/listfile= values that only became set after the BEGIN block;
#  - calls initscalrs(), then applies the configuration file (cf= or
#    configfile=) by passing each name=value line to setoption();
#  - adds the HTML 3.0 (arena=/html3=/htmlplus=) and Netscape (netscape=)
#    language extensions to the tag tables when those switches are on;
#  - forwards the language-redefinition switches to setoption();
#  - defaults and validates dlstrict= and metachar=;
#  - creates (or, with append=, appends a blank line to) the refsfile= output
#    files;
#  - aborts if a tag is defined as both pairing and non-pairing.
# On any error it prints a message, sets err=1 and calls exit(1).
function startit() {
# A still-set lf/listfile here means it was not available to the BEGIN block,
# which clears both once the list file has been read.
if ((lf)||(listfile))
{print "Error: lf= or listfile= variable came into effect after BEGIN block;";
print "use `-v lf=...' or `-v listfile=...' as first command line parameters.";
err=1;exit(1)}
xxllm="(which should only include low-level markup)";
namechars="other than `A-Z', `a-z', `0-9', `-', or `.'";
initscalrs();
#Configuration file
if (cf)
{if (configfile)
{print "Error: both cf= and configfile= specified";err=1;exit(1)}
else {configfile=cf}};
if (configfile)
{readit=0;
# All blanks and tabs are removed from each line before it is split on `=';
# lines without any `=' are silently skipped.
while ((status=(getline configline < configfile))==1)
{readit=1;gsub(/[ \t]+/,"",configline);x=split(configline,cfgarr,"=");
if (x==2) {setoption(cfgarr[1],cfgarr[2])}
else {if (x>2) {print "Invalid line in config file:",configline}}};
if ((status==-1)||(readit==0))
{print "Error opening configuration file!";err=1;exit(1)}}
#
# HTML 3.0 extensions according to Jan. 19 1995 Arena document:
#
#idlgs["TAG"]=1 means that "ID", "LANG", and "CLASS" are allowed options.
#
h3=0;#controls LH allowed in lists
if (((arena)||(html3)||(htmlplus))&&(!((html3=="off")||(htmlplus=="off")||(arena=="off"))))
{pair["ABBREV"]=1;pair["ABOVE"]=1;pair["ACRONYM"]=1;pair["ARRAY"]=1;
pair["AU"]=1;pair["BELOW"]=1;pair["BIG"]=1;pair["BOX"]=1;pair["BQ"]=1;
pair["CAPTION"]=1;pair["DFN"]=1;pair["FIG"]=1;pair["FN"]=1;pair["LANG"]=1;
pair["MATH"]=1;pair["NOTE"]=1;pair["PERSON"]=1;pair["Q"]=1;pair["ROOT"]=1;
pair["S"]=1;pair["SMALL"]=1;pair["SUB"]=1;pair["SUP"]=1;pair["TABLE"]=1;
pair["U"]=1;unpair["ATOP"]=1;unpair["LEFT"]=1;unpair["OVER"]=1;
unpair["OVERLAY"]=1;unpair["RIGHT"]=1;unpair["TAB"]=1;canpair["AROW"]=1;
canpair["ITEM"]=1;canpair["LH"]=1;canpair["TD"]=1;canpair["TH"]=1;
canpair["TR"]=1;lowlv["ABBREV"]=1;lowlv["ACRONYM"]=1;lowlv["AU"]=1;
lowlv["BIG"]=1;lowlv["LANG"]=1;lowlv["PERSON"]=1;lowlv["Q"]=1;
lowlv["SMALL"]=1;lowlv["SUB"]=1;lowlv["SUP"]=1;lwlvunp["TAB"]=1;
text["LH"]=1;text["CAPTION"]=1;idlgs["A"]=1;idlgs["ABBREV"]=1;
idlgs["ACRONYM"]=1;idlgs["ADDRESS"]=1;idlgs["AU"]=1;idlgs["B"]=1;
idlgs["BIG"]=1;idlgs["BLOCKQUOTE"]=1;idlgs["BODY"]=1;idlgs["BQ"]=1;
idlgs["BR"]=1;idlgs["CAPTION"]=1;idlgs["CITE"]=1;idlgs["CODE"]=1;
idlgs["DD"]=1;idlgs["DFN"]=1;idlgs["DL"]=1;idlgs["DT"]=1;idlgs["EM"]=1;
idlgs["FIG"]=1;idlgs["FN"]=1;idlgs["H1"]=1;idlgs["H2"]=1;idlgs["H3"]=1;
idlgs["H4"]=1;idlgs["H5"]=1;idlgs["H6"]=1;idlgs["I"]=1;idlgs["IMG"]=1;
idlgs["INPUT"]=1;idlgs["KBD"]=1;idlgs["LANG"]=1;idlgs["LH"]=1;
idlgs["LI"]=1;idlgs["NOTE"]=1;idlgs["OL"]=1;idlgs["OPTION"]=1;idlgs["P"]=1;
idlgs["PERSON"]=1;idlgs["PRE"]=1;idlgs["Q"]=1;idlgs["S"]=1;idlgs["SAMP"]=1;
idlgs["SELECT"]=1;idlgs["SMALL"]=1;idlgs["STRONG"]=1;idlgs["SUB"]=1;
idlgs["SUP"]=1;idlgs["TABLE"]=1;idlgs["TD"]=1;idlgs["TEXTAREA"]=1;
idlgs["TH"]=1;idlgs["TR"]=1;idlgs["TT"]=1;idlgs["U"]=1;idlgs["UL"]=1;
idlgs["VAR"]=1;opt["A","BASE"]=2;opt["A","MD"]=2;opt["A","SHAPE"]=2;
opt["ABOVE","SYMBOL"]=2;opt["ARRAY","COLDEF"]=2;opt["ARRAY","DELIM"]=2;
opt["ARRAY","LABELS"]=1;opt["BASE","ID"]=2;opt["BELOW","SYMBOL"]=2;
opt["BODY","POSITION"]=2;opt["BOX","DELIM"]=2;opt["BOX","SIZE"]=2;
opt["BR","ALIGN"]=2;opt["CAPTION","ALIGN"]=2;opt["FIG","ALIGN"]=2;
opt["FIG","BASE"]=2;opt["FIG","HEIGHT"]=2;opt["FIG","HSPACE"]=2;
opt["FIG","ISMAP"]=1;opt["FIG","MD"]=2;opt["FIG","SRC"]=2;
opt["FIG","UNITS"]=2;opt["FIG","URN"]=2;opt["FIG","VSPACE"]=2;
opt["FIG","WIDTH"]=2;opt["H1","ALIGN"]=2;opt["H1","NOFOLD"]=1;
opt["H1","NOWRAP"]=1;opt["H2","ALIGN"]=2;opt["H2","NOFOLD"]=1;
opt["H2","NOWRAP"]=1;opt["H3","ALIGN"]=2;opt["H3","NOFOLD"]=1;
opt["H3","NOWRAP"]=1;opt["H4","ALIGN"]=2;opt["H4","NOFOLD"]=1;
opt["H4","NOWRAP"]=1;opt["H5","ALIGN"]=2;opt["H5","NOFOLD"]=1;
opt["H5","NOWRAP"]=1;opt["H6","ALIGN"]=2;opt["H6","NOFOLD"]=1;
opt["H6","NOWRAP"]=1;opt["HR","ALIGN"]=2;opt["HR","BASE"]=2;
opt["HR","MD"]=2;opt["HR","SRC"]=2;opt["HR","URN"]=2;opt["HR","WIDTH"]=2;
opt["IMG","BASE"]=2;opt["IMG","HEIGHT"]=2;opt["IMG","MD"]=2;
opt["IMG","UNITS"]=2;opt["IMG","URN"]=2;opt["IMG","WIDTH"]=2;
opt["INPUT","BASE"]=2;opt["INPUT","MD"]=2;opt["INPUT","URN"]=2;
opt["ISINDEX","HREF"]=2;opt["ISINDEX","PROMPT"]=2;opt["ITEM","ALIGN"]=2;
opt["ITEM","COLSPAN"]=2;opt["ITEM","ROWSPAN"]=2;opt["LI","BASE"]=2;
opt["LI","DINGBAT"]=2;opt["LI","MD"]=2;opt["LI","SKIP"]=2;opt["LI","SRC"]=2;
opt["LI","URN"]=2;opt["MATH","ID"]=2;opt["MATH","MODEL"]=2;
opt["NOTE","BASE"]=2;opt["NOTE","MD"]=2;opt["NOTE","ROLE"]=2;
opt["NOTE","SRC"]=2;opt["NOTE","URN"]=2;opt["OL","CONTINUE"]=1;
opt["OL","INHERIT"]=1;opt["OL","START"]=2;opt["OL","TYPE"]=2;
opt["OPTION","SHAPE"]=2;opt["OVER","SYMBOL"]=2;opt["OVERLAY","BASE"]=2;
opt["OVERLAY","HEIGHT"]=2;opt["OVERLAY","ISMAP"]=1;opt["OVERLAY","MD"]=2;
opt["OVERLAY","SEQ"]=2;opt["OVERLAY","SRC"]=2;opt["OVERLAY","UNITS"]=2;
opt["OVERLAY","URN"]=2;opt["OVERLAY","WIDTH"]=2;opt["OVERLAY","X"]=2;
opt["OVERLAY","Y"]=2;opt["P","ALIGN"]=2;opt["P","NOFOLD"]=1;
opt["P","NOWRAP"]=1;opt["ROOT","ROOT"]=2;opt["SELECT","BASE"]=2;
opt["SELECT","MD"]=2;opt["SELECT","SRC"]=2;opt["SELECT","URN"]=2;
opt["SUB","ALIGN"]=2;opt["SUP","ALIGN"]=2;opt["TAB","AFTER"]=2;
opt["TAB","BEFORE"]=2;opt["TAB","CENTER"]=1;opt["TAB","ID"]=2;
opt["TAB","RIGHT"]=1;opt["TAB","TO"]=2;opt["TABLE","ALIGN"]=2;
opt["TABLE","BORDER"]=1;opt["TABLE","COLSPEC"]=2;opt["TABLE","UNITS"]=2;
opt["TD","ALIGN"]=2;opt["TD","AXES"]=2;opt["TD","AXIS"]=2;
opt["TD","COLSPAN"]=2;opt["TD","NOWRAP"]=1;opt["TD","ROWSPAN"]=2;
opt["TD","VALIGN"]=2;opt["TH","ALIGN"]=2;opt["TH","AXES"]=2;
opt["TH","AXIS"]=2;opt["TH","COLSPAN"]=2;opt["TH","NOWRAP"]=1;
opt["TH","ROWSPAN"]=2;opt["TH","VALIGN"]=2;opt["TR","ALIGN"]=2;
opt["TR","VALIGN"]=2;opt["UL","BASE"]=2;opt["UL","DINGBAT"]=2;
opt["UL","MD"]=2;opt["UL","PLAIN"]=1;opt["UL","SRC"]=2;opt["UL","URN"]=2;
opt["UL","WRAP"]=2;txtf["ADDRESS"]=1;txtf["BLOCKQUOTE"]=1;txtf["BQ"]=1;
txtf["BR"]=1;txtf["DD"]=1;txtf["DL"]=1;txtf["DT"]=1;txtf["FIG"]=1;
txtf["H1"]=1;txtf["H2"]=1;txtf["H3"]=1;txtf["H4"]=1;txtf["H5"]=1;
txtf["H6"]=1;txtf["HR"]=1;txtf["LI"]=1;txtf["NOTE"]=1;txtf["OL"]=1;
txtf["P"]=1;txtf["PRE"]=1;txtf["TABLE"]=1;txtf["UL"]=1;
rqopt["ARRAY","COLDEF"]=1;rqopt["FIG","SRC"]=1;rqopt["NOTE","SRC"]=1;
rqopt["OVERLAY","SRC"]=1;inidlgs["ID"]=1;inidlgs["LANG"]=1;
intxtf["CLEAR"]=1;intxtf["NEEDS"]=1;html["HTMLPLUS"]=1;
#latest HTML3 patches
inidlgs["CLASS"]=1;headonly["STYLE"]=1;headonly["STYLES"]=1;
pcdata["STYLE"]=1;pair["STYLES"]=1;canpair["STYLE"]=1;
opt["BODY","BACKGROUND"]=2;opt["IMG","BASELINE"]=2;opt["STYLE","ID"]=2;
opt["STYLES","NOTATION"]=2;opt["HTML","ROLE"]=2;opt["HTML","URN"]=2;
# Required options are kept in rqopt (the array scanned when a tag closes);
# these two were formerly stored under the misspelt name reqopt and so were
# never enforced.
rqopt["STYLE","ID"]=1;rqopt["STYLES","NOTATION"]=1;
#
deprec["HTMLPLUS"]=1;deprec["DIR"]=1;deprec["MENU"]=1;deprec["NEXTID"]=1;
deprec["BLOCKQUOTE"]=1;lwlvunp["MATH"]=1;lwlvunp["FN"]=1;h3=1;
# With the HTML 3.0 definition active no tag is flagged as non-standard.
for (x in nonstd) {delete nonstd[x]}};
#
#Netscape extensions (I go strictly by the documentation, such as there is, so
#no BLINK):
#
if ((netscape)&&(netscape!="off"))
{pair["CENTER"]=1;pair["NOBR"]=1;pair["FONT"]=1;canpair["BASEFONT"]=1;
unpair["WBR"]=1;opt["ISINDEX","PROMPT"]=1;opt["HR","SIZE"]=2;
opt["HR","WIDTH"]=2;opt["HR","ALIGN"]=2;opt["HR","NOSHADE"]=1;
opt["UL","TYPE"]=2;opt["OL","TYPE"]=2;opt["OL","START"]=2;
opt["LI","TYPE"]=2;opt["LI","VALUE"]=2;opt["IMG","WIDTH"]=2;
opt["IMG","HEIGHT"]=2;opt["IMG","BORDER"]=2;opt["IMG","VSPACE"]=2;
opt["IMG","HSPACE"]=2;opt["BR","CLEAR"]=2;opt["FONT","SIZE"]=2;
opt["BASEFONT","SIZE"]=2;opt["P","ALIGN"]=2;opt["H1","ALIGN"]=2;
opt["H2","ALIGN"]=2;opt["H3","ALIGN"]=2;opt["H4","ALIGN"]=2;
opt["H5","ALIGN"]=2;opt["H6","ALIGN"]=2;opt["IMG","LOWSRC"]=2;
lwlvunp["WBR"]=1;lwlvunp["CENTER"]=1;lowlv["FONT"]=1;lowlv["NOBR"]=1};
#
# Language-redefinition switches are handed to setoption() under their own
# names, after the built-in definitions above have been loaded.
#
if (nonrecurpair) {setoption("nonrecurpair",nonrecurpair)};
if (strictpair) {setoption("strictpair",strictpair)};
if (loosepair) {setoption("loosepair",loosepair)};
if (nonpair) {setoption("nonpair",nonpair)};
if (nonblock) {setoption("nonblock",nonblock)};
if (lowlevelpair) {setoption("lowlevelpair",lowlevelpair)};
if (lowlevelnonpair) {setoption("lowlevelnonpair",lowlevelnonpair)};
if (deprecated) {setoption("deprecated",deprecated)};
if (tagopts) {setoption("tagopts",tagopts)};
if (novalopts) {setoption("novalopts",novalopts)};
if (reqopts) {setoption("reqopts",reqopts)};
# dlstrict defaults to 1 and metachar to 2; any other value must be 1, 2 or 3.
if (!dlstrict) {dlstrict=1}
else {if (dlstrict!~/^[123]$/)
{print "Config error: dlstrict= must be 1, 2, or 3";err=1;exit(1)}};
if (!metachar) {metachar=2}
else {if (metachar!~/^[123]$/)
{print "Config error: metachar= must be 1, 2, or 3";err=1;exit(1)}};
# Reference output files: <refsfile>.SRC, .NAME and .HREF, plus .MAP when both
# xref= and map= are set.  Each gets one empty line written to it, either
# appended (append=) or truncating the file.
if (refsfile)
{currf[1]=(refsfile ".SRC");currf[2]=(refsfile ".NAME");
currf[3]=(refsfile ".HREF");x=3;
if ((xref)&&(map)) {currf[4]=(refsfile ".MAP");x=4};
if (append)
{for (i=1;i<=x;++i) {print "" >> currf[i]}}
else {for (i=1;i<=x;++i) {print "" > currf[i]}}};
# Sanity check on the final tables (built-in plus user-supplied definitions).
for (x in unpair) {if (x in pair)
{print "Internal logical inconsistency:",x,"defined as both pairing and non-pairing tag";
err=1;exit(1)}}}
#
# Main
#
# Runs once per input line.  On the first line of each file it finishes the
# previous file with endit() (or, for the very first line read, performs the
# one-time setup with startit()), derives the reporting name ``fn'' from
# FILENAME and prints the "Diagnostics for file" banner.  It then scans the
# line for `<' and `>' characters, handing the text of each tag to parsetag()
# and printing diagnostics, and finally checks the ampersand codes on the
# line.
#
# Every diagnostic is prefixed with ``s'': "HTMLCHEK:" when inline= is set
# (the input line itself is echoed first), "file: line: " when sugar= is set,
# and otherwise whatever ``s'' already holds.
#
# NOTE: the ampersand check at the end edits $0 in place (well-formed codes
# are deleted), so it must stay the last thing done with the line.
#
{if (FNR==1) {if (NR!=1) {endit();
print "\n========================================\n"}
else {startit()};
# Reporting name: FILENAME minus a leading "./" and minus the subtract= prefix.
fn=FILENAME;
# Next line is Unix-specific
sub(/^\.\//,"",fn);
if (subtract)
{if (index(fn,subtract)==1)
{fn=substr(fn,(length(subtract)+1))}
else
{print "Filename",fn,"does not have \042" subtract "\042 prefix specified in subtract= option";
print "Exiting prematurely...";err=1;exit(1)}};
# Prefixes built from dirprefix and fn; fromroot is the directory part of fn
# (up to and including its last `/'), or empty when fn has none.
nampref=(dirprefix fn "\043");lochpref=(dirprefix fn);
if (fn~/.\//) {fromroot=fn;sub(/\/[^\057]*$/,"/",fromroot)}
else {fromroot=""};fromroot=(dirprefix fromroot);
if (fn!="-") {if (inline) {printf "HTMLCHEK:"};
print "Diagnostics for file \042" fn "\042:"}};
if (inline) {print;s="HTMLCHEK:"}
else {if (sugar) {s=(fn ": " FNR ": ")}};
# Scan the line.  After each match currsrch is the position just past the
# `<' or `>' that was found; lastbeg marks where the current tag's text
# starts and txtbeg where the text after the last completed tag starts.
lastbeg=0;currsrch=1;txtbeg=1;
while (match(substr($0,currsrch),/[<>]/)!=0)
{currsrch=(currsrch+RSTART);
if (substr($0,(currsrch-1),1)=="<")
{if (state)
{parsetag(currsrch-1);lastbeg=(currsrch-1);
state=1;continuation=1;if (!nxrdo) {redo=1};
if ((metachar!=3)||((!inquote)&&(lasttag!="!--")))
{print s "Multiple `<' without `>' ERROR!",crl()}}
else {if ((currsrch>length($0))||(substr($0,currsrch,1)~/^[ \t]$/))
{print s "Whitespace after `<': Incorrect SGML tag syntax ERROR!",crl() ",Ignoring";
wastext=1}
else {if (!wastext)
{if (substr($0,txtbeg,(currsrch-(txtbeg+1)))!~/^[ \t]*$/)
{wastext=1}};
if (wastext)
{headbody=hedbodarr[hedbodvar];
if ((!bodywarn)&&(!headbody)&&((!nestvar)||(nestarr[nestvar]=="HTML")))
{print s "Was non-whitespace outside <body>...</body> Warning!",crl();
bodywarn=1}
else {if ((headbody=="HEAD")&&(nestarr[nestvar]=="HEAD"))
{print s "Was non-whitespace in <head>...</head> outside any element ERROR!",crl()}}};
if ((currsrch==2)||(substr($0,(currsrch-2),1)~/^[ \t]$/))
{prews=1};
lastbeg=currsrch;state=1;prevtag=lasttag;lasttag="";lastopt=""}}}
else {if (substr($0,(currsrch-1),1)==">")
{if (state==0)
{if (!nogtwarn) {print s "`>' without `<' Warning!",crl()};
wastext=1}
else {parsetag(currsrch-1);
if ((metachar==3)&&((inquote)||((lasttag=="!--")&&(!comterr)&&(lastcomt!="--"))))
{lastbeg=(currsrch-1);continuation=1;
if (!nxrdo) {redo=1}}
else {if ((inquote)||(inequal)) {malft()};
if (optfree) {misstest()};
if ((lasttag=="!--")&&(lastcomt!="--"))
{print s "!-- comment not terminated by \042--\042 ERROR!",crl()};
if ((lasttag=="IMG")&&(alt==0))
{print s "IMG tag without ALT option Warning!",crl();++wasnoalt};
if ((lasttag=="LINK")&&(linkone==1)&&(linktwo==1))
{++linkrmhm};
if ((lasttag=="A")&&(!wasname)&&(!washref))
{print s "<A> tag occurred without reference (NAME,HREF,ID) option ERROR!",crl()};
head=("^" lasttag SUBSEP);
for (x in rqopt)
{if (x~head)
{split(x,optx,SUBSEP);
if (!(optx[2] in curtagopts))
{print s "<" lasttag "> tag occurred without",optx[2],"option ERROR!",crl()}}};
if ((wasname>1)||(washref>1))
{print s "Multiple reference (NAME,ID;HREF,SRC,BULLET) options ERROR!",crl(),"on tag",lasttag};
if ((!wastext)&&(lasttag==("/" prevtag))&&(lasttag!="/TEXTAREA"))
{print s "Null <x>...</x> element Warning!",crl(),"on tag",lasttag};
if ((lasttag~/^[AU]$/)&&((currsrch>length($0))||(substr($0,currsrch,1)~/^[ \t]$/))&&(!nowswarn))
{print s "Whitespace after `>' of underline markup opening tag Warning!",crl(),"on tag",lasttag;
++wswarn};
wastext=0;txtbeg=currsrch;prews=0;
state=0;continuation=0}}}
else {print s "Internal error",crl(),"ignore"}}};
# End of line: a tag that is still open continues onto the next line;
# otherwise note any trailing non-whitespace text.
if ((state==1)||((lastbeg==0)&&(continuation==1)))
{parsetag(length($0)+1);continuation=1}
else {if ((!state)&&($0!~/^[ \t]*$/)&&($0!~/>[ \t]*$/))
{wastext=1}};
if ($0~/&/) # Don't actually check against the list of &xxx; codes.
{gsub(/&[A-Za-z][-A-Za-z0-9.]*;/,"");gsub(/&\043[0-9][0-9]*;/,"");
# x counts ampersands followed by a non-letter or standing at end of line.
x=0;x=gsub(/&+[^a-zA-Z&]/,"");x=(x+gsub(/&+$/,""));
# Prefixed with s like every other diagnostic, so that inline=/sugar= output
# stays uniformly tagged.
if (x) {print s "Loose ampersand (may be OK) Warning!",crl()};
if ($0~/&/)
{print s "Apparent non-complying ampersand code ERROR!",crl()}}}
#
#
# parsetag() communicates with main() through these global variables:
# - lastbeg (zero if no `<' occurred on line, otherwise points to character
# immediately after the last `<' encountered).
# - state (one if unresolved `<', zero otherwise).
# - continuation (one if unresolved `<' from previous line, zero otherwise),
# - inquote (one if inside option quotes <tag opt="....">).
#
function parsetag(inp) {
if (!lastbeg) {lastbeg=1};
numf=split(substr($0,lastbeg,(inp-lastbeg)),arr);
if (substr($0,lastbeg,(inp-lastbeg))~/[ \t]$/) {nxrdo=1}
else {nxrdo=0}
if (numf==0)
{if (!continuation)
{print s "Null tagname ERROR!",crl();state=0;
inquote=0;inequal=0;optfree=0;wasopt=0;linkone=0;linktwo=0;
wasname=0;washref=0;for (x in curtagopts) {delete curtagopts[x]}};
return}
else {if (!continuation)
{arr[1]=upcase(arr[1]);if (arr[1]~/^!--/) {raw=arr[1];arr[1]="!--"}
else {raw=""};
if (arr[1]~/[\075\042]/)
{print s "Bad tagname ERROR!",crl(),"on tag",arr[1]};
lasttag=arr[1];alt=0;
if (arr[1]~/^\//) # </TAG> found
{sub(/^\//,"",arr[1]);
if ((prews)&&(arr[1]~/^[AU]$/)&&(!nowswarn))
{print s "Whitespace before `<' of underline closing tag Warning!",crl(),"on tag",lasttag;
++wswarn};
if (arr[1] in unpair)
{print s "Closing tag on empty element (non-pairing tag) ERROR!",crl(),"on tag /" arr[1]}
else {poppdstak=0;
if ((arr[1] in pair)||(arr[1] in canpair))
{if ((nestvar<=0)||(lev[arr[1]]<=0))
{print s "Extraneous /" arr[1],"tag without preceding",arr[1],"tag ERROR!",crl() ", Ignoring"}
else {if (nestarr[nestvar]!=arr[1])
{if ((nestvar>2)&&(nestarr[(nestvar-2)]==arr[1]))
{if ((nestarr[nestvar] in canpair)&&((nestarr[(nestvar-1)]~/^L[HI]$/)||(nestarr[(nestvar-1)]~/^D[TD]$/)))
{--lev[nestarr[nestvar]];--nestvar;
poppdstak=1}};
if ((nestvar>1)&&(nestarr[(nestvar-1)]==arr[1]))
{if (!(nestarr[nestvar] in canpair))
# Implicit end of optionally-pairing element
{print s "Missing /" nestarr[nestvar],"tag (should be located before /" arr[1],"tag) ERROR!",crl()};
--lev[nestarr[nestvar]];
--nestvar;poppdstak=1;--lev[arr[1]]}
else
{print s "Improper nesting ERROR!",crl() ": /" nestarr[nestvar],"expected, /" arr[1],"found";
--lev[arr[1]]}}
else {--lev[arr[1]]};
if (nestarr[nestvar] in list)
{if (!isli[nestvar])
{print s "Empty list (without <LI>) ERROR!",crl(),"on tag /" arr[1]};
if ((wastext)&&(!poppdstak))
{print s "Non-whitespace outside <LI> in list ERROR!",crl(),"on tag",arr[1]}};
if (nestarr[nestvar]=="DL")
{if (!isdtdd[nestvar])
{print s "Empty DL list (without <dt>/<dd>) ERROR!",crl()};
if ((wastext)&&(!poppdstak))
{print s "Non-whitespace outside <dt>/<dd> in <dl> list ERROR!",crl(),"on tag",arr[1]}};
--nestvar}}
else {revusarr[arr[1]]=1;
if ((!lev[arr[1]])||(lev[arr[1]]<=0))
{print s "Extraneous closing tag </x> ERROR!",crl(),"on unknown tag /" arr[1]}
else {--lev[arr[1]]}}};
if (arr[1]=="HEAD")
{if (title==0)
{print s "No <TITLE> in <head>...</head> ERROR!",crl()};
base=0;title=0;--hedbodvar};
if (arr[1]=="BODY")
{if (headone==0)
{print s "No <H1> in <body>...</body> Warning!",crl()};
headone=0;bodywarn=0;--hedbodvar};
if ((arr[1] in list)||(arr[1] in nonlilist))
{--listdep}}
else
# <TAG> found
{if ((nestarr[nestvar] in pcdata)&&(arr[1]!=nestarr[nestvar]))
{print s "Tag inside",nestarr[nestvar],"element ERROR!",crl(),"on tag",lasttag};
if ((arr[1] in pair)||(arr[1] in canpair)||(arr[1] in unpair))
{known=1}
else {known=0};
if (!((arr[1] in lowlv)||(arr[1] in lwlvunp)))
{curnest="";
if ((nestvar>1)&&(arr[1]!="LI")&&(nestarr[nestvar]=="LI")&&(nestarr[(nestvar-1)] in lowlvlist))
{curnest=("LI in " nestarr[(nestvar-1)])}
else
{if ((nestarr[nestvar] in text)||(nestarr[nestvar] in lowlv))
{if ((arr[1]~/^H[1-6]$/)&&(nestarr[nestvar]=="A"))
{print s arr[1],"heading in <A>...</A> element Warning!",crl()}
else
{if ((arr[1]!=nestarr[nestvar])&&(!((arr[1]=="HR")&&(nestarr[nestvar]=="PRE"))))
# inclusion exceptions
{if (!((arr[1] in formonly)&&(lev["FORM"]>0)))
{curnest=nestarr[nestvar]}}}}
else {if ((arr[1]!="P")&&(lev["ADDRESS"]>0))
{curnest="ADDRESS"}}};
if (curnest)
{if (known)
{if (!(((arr[1]=="LI")||(arr[1]~/^D[DT]$/))&&((nestarr[nestvar]=="DT")||(nestarr[nestvar]=="LH"))))
{print s arr[1],"tag, which is not low-level markup, nested in",curnest,"element ERROR!",crl()}}
else
{print s "Unknown tag",arr[1],"nested in",curnest, "element",xxllm,"Warning!",crl()}}};
if (arr[1] in html) {++lev["HTML"]}
else {++lev[arr[1]]};
# Not necessarily immediately contained in FORM
if ((arr[1] in formonly)&&(lev["FORM"]<=0))
{print s "<" arr[1] "> outside of <form>...</form> ERROR!",crl()};
if ((arr[1]=="OPTION")&&(nestarr[nestvar]!="SELECT")&&(nestarr[nestvar]!="OPTION"))
{print s "<" arr[1] "> outside of <select>...</select> ERROR!",crl()};
if ((arr[1]=="STYLE")&&(nestarr[nestvar]!~/^STYLES?$/))
{print s "<" arr[1] "> outside of <styles>...</styles> ERROR!",crl()};
if (nestarr[nestvar] in list)
{if (wastext)
{print s "Non-whitespace outside <LI> in list ERROR!",crl(),"on tag",arr[1]};
if ((arr[1]!="LI")&&(!((arr[1]=="LH")&&(h3)))&&(arr[1]!="!--"))
{print s "Tag in list occurred outside <LI> ERROR!",crl(),"on tag",arr[1]}};
if (nestarr[nestvar]=="DL")
{if (wastext)
{print s "Non-whitespace outside <dt>/<dd> in <dl> list ERROR!",crl(),"on tag",arr[1]};
if ((arr[1]!~/^D[DT]$/)&&(!((arr[1]=="LH")&&(h3)))&&(arr[1]!="!--"))
{print s "Tag in <dl> list occurred outside <dt>/<dd> ERROR!",crl(),"on tag",arr[1]}};
headbody="";implicit=0;
if ((arr[1] in pair)||(arr[1] in canpair))
{if ((arr[1]=="HEAD")||(arr[1]=="BODY"))
{if ((!("HTML" in lev))||(lev["HTML"]==0))
{print s "HEAD or BODY outside of <HTML>...</HTML> Warning!",crl()};
if (hedbodvar>0)
{if ((hedbodarr[hedbodvar]=="HEAD")&&(arr[1]=="BODY"))
{hedbodarr[hedbodvar]=arr[1];--lev["HEAD"];
print s "Assumed an implicit `</HEAD>' before <BODY> Warning!",crl();
if ((nestarr[nestvar]!="HEAD")&&(nestarr[nestvar]in pair))
{print s "Improper nesting on implicit </HEAD> ERROR!",crl() ", tag /" nestarr[nestvar],"expected"};
nestarr[nestvar]=arr[1];implicit=1;
if (title==0)
{print s "No <TITLE> in <head>...</head> ERROR!",crl()}}
else
{print s "HEAD or BODY nested inside HEAD or BODY element ERROR!",crl()}}
else {if ((arr[1]=="BODY")&&(!("HEAD" in usarr)))
{print "<body> without preceding <head>...</head> Warning!",crl()};
if ((nestvar>0)&&(nestarr[nestvar]!="HTML"))
{print s "HEAD or BODY contained inside non-HTML element ERROR!",crl()}};
hbwarn=0;base=0;title=0;headone=0;loosbtag=0;
if (arr[1]=="HEAD") {++numheads};
if (!implicit) {++hedbodvar;
hedbodarr[hedbodvar]=arr[1]}};
if (!implicit)
{if ((!(nestarr[nestvar] in canpair))||(!(arr[1] in canpair)))
{++nestvar}
else
{xx=0;
if ((nestarr[nestvar]=="LH")&&(arr[1]!="LI")&&(arr[1]!~/^D[TD]$/)) {xx=1};
if (((nestarr[nestvar]=="LI")&&(arr[1]!="LI"))||((nestarr[nestvar]~/^D[TD]$/)&&(arr[1]!~/^D[TD]$/))) {xx=1};
if (xx)
{++nestvar}
else
{if ((nestvar>2)&&((nestarr[(nestvar-1)]~/^L[HI]$/)||(nestarr[(nestvar-1)]~/^D[TD]$/)))
{if (((nestarr[nestvar]!="LI")&&(arr[1]=="LI"))||((nestarr[nestvar]!~/^D[TD]$/)&&(arr[1]~/^D[TD]$/)))
{--nestvar}}}};
if (arr[1] in html) {nestarr[nestvar]="HTML"}
else {nestarr[nestvar]=arr[1]}};
isli[nestvar]=0;isdtdd[nestvar]=0;isdt[nestvar]=0};
if (hedbodvar) {headbody=hedbodarr[hedbodvar]};
if ((arr[1] in list)||(arr[1] in nonlilist))
{++listdep;if (listdep>maxlist) {maxlist=listdep}};
if (arr[1]=="LI")
{isli[(nestvar-1)]=1;
if ((nestvar<2)||(!(nestarr[(nestvar-1)] in list)))
{print s "<LI> outside of list ERROR!",crl()}};
if (arr[1]~/^D[DT]$/)
{isdtdd[(nestvar-1)]=1;
if ((nestvar<2)||(nestarr[(nestvar-1)]!="DL"))
{print s "<dt>/<dd> outside of <dl> list ERROR!",crl(),"on tag",arr[1]}
else {if (arr[1]=="DT")
{isdt[(nestvar-1)]=1}
else {if (dlstrict>1)
{if (!isdt[(nestvar-1)])
{print s "<DD> without preceding <DT> in <DL> list Warning!",crl()};
if (dlstrict>2)
{isdt[nestvar-1]=0}
else {isdt[nestvar-1]=1}}}}};
if (!headbody)
{if ((lasttag!="!--")&&(lasttag!="!DOCTYPE")&&(!(lasttag in html))&&(!hbwarn))
{print s "Tag outside of HEAD or BODY element Warning!",crl(),"on tag",arr[1];
hbwarn=1}}
else {if (arr[1]=="PLAINTEXT")
{print s "<PLAINTEXT> in <head>...</head> or <body>...</body> ERROR!",crl()}};
if (headbody=="HEAD")
{if (!((arr[1] in inhead)||(arr[1] in headonly)))
{print s "Disallowed tag in <head>...</head> ERROR!",crl(),"on tag",arr[1]};
if (arr[1]=="TITLE")
{++title;
if (title>1)
{print s "Multiple <TITLE> tags in <head> ERROR!",crl()}}
if (arr[1]=="BASE")
{++base;
if (base>1)
{print s "Multiple <BASE> tags in <head> Warning!",crl()}}};
if (arr[1] in headonly)
{if (headbody=="BODY")
{print s "Disallowed tag in <body>...</body> ERROR!",crl(),"on tag",arr[1]}
else {if ((headbody!="HEAD")&&(loosbtag))
{print s "Tag",arr[1],"that belongs in HEAD occurred after a tag that belongs in BODY ERROR!",crl()}}}
else
{if ((!(arr[1] in inhead))&&(!headbody)&&(known)&&(arr[1]!="!DOCTYPE"))
{loosbtag=1}};
if (arr[1]~/^H[1-6]$/)
{newheadlev=substr(arr[1],2,1);
if (newheadlev>(headlevel+1))
{print s "Warning! Jump from header level H" headlevel, "to level H" newheadlev,crl()};
headlevel=newheadlev;
if (headlevel==1)
{++headone;
if (headone>1)
{print s "Multiple <H1> headings Warning!",crl()}}};
if ((arr[1]=="!DOCTYPE")&&(nestvar))
{print s "<!DOCTYPE...> enclosed within <x>...</x> ERROR!",crl()};
if (arr[1] in html) {if (nestvar>1)
{print s "<HTML> enclosed within <x>...</x> ERROR!",crl()};
bodywarn=0;hbwarn=0;headone=0;
loosbtag=0};
if ((arr[1] in nonnest)&&(lev[arr[1]]>1))
{print s "Self-nesting of unselfnestable tag ERROR!",crl() ", of level",lev[arr[1]],"on tag",arr[1]}};
if (arr[1] in html) {usarr["HTML"]=1}
else {usarr[arr[1]]=1};
if (arr[1]=="!--") {startf=1;comterr=0;cmplxcmt=0;lastcomt=""}
else {startf=2};
inquote=0;inequal=0;optfree=0;wasopt=0;linkone=0;linktwo=0;
wasname=0;washref=0;for (x in curtagopts) {delete curtagopts[x]}}
else {startf=1};
if (lasttag!~/^!/) # Remainder of stuff in <...> after tag word
{for (i=startf;i<=numf;++i)
{if ((!inequal)&&(!inquote))
{if ((arr[i]~/^[^=\042]*(=\042[^\042]*\042)?$/)||(arr[i]~/^[^=\042]*=(\042)?[^\042]*$/))
{if ((optfree)&&((arr[i]~/^=[^=\042][^=\042]*$/)||(arr[i]~/^=\042[^\042]*\042$/)))
{if (!malftag) {sub(/^\075/,"",arr[i]);
if (arr[i]~/\042/)
{optvalproc(arr[i],1)}
else {optvalproc(arr[i],0)}};
optfree=0}
else {if ((optfree)&&((arr[i]~/^=\042/)||(arr[i]=="="))) {inequal=1};
split(arr[i],arr2,"=");
if (arr2[1]=="")
{if (!inequal)
{print s "Null tag option ERROR!",crl(),"on tag",lasttag;malftag=1}}
else {if (optfree) {misstest()};
arr2[1]=upcase(arr2[1]);optfree=1;++wasopt;
malftag=0;optvalstr="";redo=0;
if (lasttag~/^\//)
{print s "Option on closing tag",lasttag,"ERROR!",crl()}
else {optarr[lasttag,arr2[1]]=1;
lastopt=arr2[1];
if ((lastopt!~/^[A-Z][-A-Z0-9.]*$/)&&(lastopt!="<"))
{print s "Option name \042" lastopt "\042 is not alphanumeric Warning!",crl(),"on tag",lasttag};
curtagopts[lastopt]=1;
if ((known)&&(!((lasttag,lastopt) in opt)))
{if (!(((lasttag in idlgs)&&(lastopt in inidlgs))||((lasttag in txtf)&&(lastopt in intxtf))))
{print s lastopt,"not recognized as an option for",lasttag,"tag Warning!",crl()}};
if ((lasttag=="IMG")&&(arr2[1]=="ALT"))
{alt=1}}};
if (arr[i]~/^[^=\042][^=\042]*=$/)
{inequal=1};
if (arr[i]~/[\075]/) {optvalstr=arr[i];
gsub(/^[^=]*=/,"",optvalstr)};
q=gsub(/\042/,"",arr[i])
if (q==1)
{inquote=1};
if ((optvalstr)&&(!inequal)&&(!inquote))
{optfree=0;
if (!malftag) {optvalproc(optvalstr,q)}}}}
else {malft()}}
else {if ((inequal)&&(!inquote))
{if (arr[i]~/\042/)
{if (arr[i]~/^\042[^\042]*(\042)?$/)
{if (gsub(/\042/,"",arr[i])==2)
{if (!malftag) {sub(/^\075/,"",arr[i]);
optvalproc(arr[i],1)};
inequal=0;optfree=0}
else {optvalstr=arr[i];inquote=1}}
else {malft()}}
else {if (arr[i]!~/\075/)
{if (!malftag) {optvalproc(arr[i],0)};
inequal=0;optfree=0}
else {malft()}}}
else {if (arr[i]~/\042/)
{inquote=0;inequal=0;optfree=0;
if (arr[i]!~/^[^\042]*\042$/)
{malft()}
else {if (redo)
{optvalstr=(optvalstr arr[i]);redo=0}
else {optvalstr=(optvalstr " " arr[i])};
if (!malftag) {optvalproc(optvalstr,1)}}}
else {if (redo)
{optvalstr=(optvalstr arr[i]);redo=0}
else {optvalstr=(optvalstr " " arr[i])}}}}}}
else {if (lasttag=="!--")
{if (!continuation) {sub(/^!--/,"",raw);arr[1]=raw}
else {if ((metachar==1)&&(!cmplxcmt))
{print s "Complex comment Warning!",crl();
cmplxcmt=1};
if (lastcomt=="--")
{print s "Apparent \042--\042 embedded in comment Warning!",crl();comterr=1}};
for (i=startf;i<=numf;++i)
{if (((arr[i]~/--/)&&(i<numf))||((arr[i]~/--./)&&(i==numf)))
{print s "Apparent \042--\042 embedded in comment Warning!",crl();comterr=1}};
if (arr[numf]~/--$/)
{lastcomt="--"}
else {lastcomt=""}}};
return}}
#
#
# Return as much location information as possible in diagnostics:
#
# Current location:
function crl() {
# Build the "where are we now" suffix appended to diagnostics.
# When fn is empty/zero or is "-", only the overall record number NR is given.
if ((!fn)||(fn=="-")) {return ("at line " NR)};
# Otherwise give the per-file record number FNR plus the quoted file name.
return ("at line " FNR " of file \042" fn "\042")}
# End of file location:
function ndl() {
# Build the end-of-file location suffix appended to diagnostics.
# When fn is empty/zero or is "-", no file name is mentioned.
if ((!fn)||(fn=="-")) {return "at END"};
return ("at END of file \042" fn "\042")}
#
#
# Error message returned from numerous places in the program...
#
function malft() {
# Set the global malftag flag and report a malformed option on the current
# tag (lasttag), using the message prefix s and the current location.
malftag=1;
print s "Malformed tag option ERROR!",crl(),"on tag",lasttag}
#
#
#Check for non-kosher null options:
#
# Report an option (lastopt on tag lasttag) that was given without a value
# although one is needed.  Uses the globals lasttag, lastopt, s, opt, idlgs,
# inidlgs, txtf and intxtf; prints at most one diagnostic and returns nothing.
function misstest()
{if (((lasttag=="A")&&(lastopt=="NAME"))||(lastopt=="HREF")||(lastopt=="ID"))
{print s "Missing reference option value ERROR!",crl(),"on tag",lasttag ", option",lastopt}
else
# Test membership with "in" BEFORE looking at the value: merely reading
# opt[lasttag,lastopt] would create that element (with a null value), and a
# later "(lasttag,lastopt) in opt" test would then treat an unknown option
# as a recognized one.
{if ((((lasttag,lastopt) in opt)&&(opt[lasttag,lastopt]==2))||((lasttag in idlgs)&&(lastopt in inidlgs))||((lasttag in txtf)&&(lastopt in intxtf)))
{print s "Missing option value ERROR!",crl(),"on tag",lasttag ", option",lastopt}}}
#
#
#Set property arrays from command line variable or configuration file.
#
function setoption(inname,invalu,invarr) {
# Apply one configuration setting: inname is the option name, invalu its raw
# value, invarr a scratch array that receives the split-up value.
#
# Switches below are only assigned while still unset (null or zero), so a
# value given earlier (e.g. on the command line) is not overridden.
if (inname=="htmlplus") {if (!htmlplus) {htmlplus=invalu}; return};
if (inname=="html3") {if (!html3) {html3=invalu}; return};
if (inname=="arena") {if (!arena) {arena=invalu}; return};
if (inname=="netscape") {if (!netscape) {netscape=invalu}; return};
if (inname=="dlstrict") {if (!dlstrict) {dlstrict=invalu}; return};
if (inname=="metachar") {if (!metachar) {metachar=invalu}; return};
# These two are always assigned.
if (inname=="nogtwarn") {nogtwarn=invalu; return};
if (inname=="nowswarn") {nowswarn=invalu; return};
# Every remaining option takes a list; an "=" inside the value is rejected.
if (invalu~/\075/)
  {print "Invalid syntax on",inname "= configuration option, ignoring";
   return};
# Colon-separated list of "TAG,OPTION" pairs.
if ((inname=="novalopts")||(inname=="tagopts")||(inname=="reqopts"))
  {numf=split(invalu,invarr,":");
   for (i=1;i<=numf;++i)
     {numf2=split(invarr[i],invarr2,",");
      if (numf2!=2)
        {print "Invalid syntax on",inname "= configuration option, ignoring";
         continue};
      if (inname=="novalopts")
        {opt[upcase(invarr2[1]),upcase(invarr2[2])]=1;
         continue};
      # tagopts and reqopts both store 2 in opt; reqopts also marks rqopt.
      if (inname=="reqopts")
        {rqopt[upcase(invarr2[1]),upcase(invarr2[2])]=1};
      opt[upcase(invarr2[1]),upcase(invarr2[2])]=2};
   return};
# Comma-separated list of tag names; each name is uppercased in place.
numf=split(invalu,invarr,",");
for (i=1;i<=numf;++i)
  {invarr[i]=upcase(invarr[i]);
   if (inname=="nonrecurpair")
     {pair[invarr[i]]=1;
      strictclean(invarr[i]);
      nonnest[invarr[i]]=1}
   else if (inname=="strictpair")
     {pair[invarr[i]]=1;
      strictclean(invarr[i]);
      delete nonnest[invarr[i]]}
   else if (inname=="loosepair")
     {if (notredef(invarr[i]))
        {canpair[invarr[i]]=1;
         delete unpair[invarr[i]];
         nonstrictclean(invarr[i])}}
   else if (inname=="nonpair")
     {if (notredef(invarr[i]))
        {unpair[invarr[i]]=1;
         delete canpair[invarr[i]];
         nonstrictclean(invarr[i])}}
   else if (inname=="nonblock")
     {text[invarr[i]]=1;
      delete unpair[invarr[i]]}
   else if (inname=="lowlevelpair")
     {lowlv[invarr[i]]=1;
      strictclean(invarr[i])}
   else if (inname=="lowlevelnonpair")
     {if (notredef(invarr[i]))
        {text[invarr[i]]=1;
         nonstrictclean(invarr[i])}}
   else if (inname=="deprecated")
     {deprec[invarr[i]]=1}
   else
     {print "Unrecognized configuration option",inname;
      return}}}
#
function strictclean(param) {
# Remove tag name param from the nonstd, unpair, canpair and lwlvunp arrays.
delete lwlvunp[param];
delete canpair[param];
delete unpair[param];
delete nonstd[param]}
#
function nonstrictclean(param) {
# Remove tag name param from the nonstd, pair, nonnest and lowlv arrays.
delete lowlv[param];
delete nonnest[param];
delete pair[param];
delete nonstd[param]}
#
#Stuff which has special hard-wired processing; don't allow user to redefine
#
function notredef(param) {
# Returns 0 when tag name param is HEAD, BODY, or a member of the list,
# nonlilist or html arrays; returns 1 for any other name.
if ((param=="HEAD")||(param=="BODY")) {return 0};
if (param in list) {return 0};
if (param in nonlilist) {return 0};
if (param in html) {return 0};
return 1}
#
#
# This subroutine receives the raw option value string, for every tag option
# that does have a value. It does some errorchecking and cleanup, and writes
# to the .NAME, .HREF, and .SRC files when requested.
#
# Parameters:
#   val    - the option value text for option lastopt of tag lasttag.
#   quoted - non-zero when the value was written with double quotes; in that
#            case the quote characters and one leading/trailing blank are
#            stripped from val before any checks.
# Works through globals; in particular currfn classifies the option:
#   0 = not a reference, 1 = SRC/BULLET, 2 = NAME (on <A>) or ID, 3 = HREF.
# currfn is also the index into currf[] used when writing reference files.
function optvalproc(val,quoted)
{currfn=0;if (quoted) {gsub(/\042/,"",val);sub(/^ /,"",val);sub(/ $/,"",val)};
# On <LINK>, count REV="made" (case-insensitive) and HREF="mailto:..." values.
if (lasttag=="LINK")
{xxx=upcase(val);
if ((lastopt=="REV")&&(xxx=="MADE")) {++linkone};
if ((lastopt=="HREF")&&(val~/^mailto:/)) {++linktwo}};
# With usebase set, a well-formed quoted <BASE HREF="..."> replaces the
# prefixes (nampref, lochpref, fromroot) later prepended to references;
# such a value is not processed further as an ordinary HREF.
if ((usebase)&&(lasttag=="BASE")&&(lastopt=="HREF"))
{if ((quoted)&&(val)&&(val!="=")&&(val!~/[^ ] [^ ]/))
{nampref=(val "\043");lochpref=val;
if (val~/.\//) {fromroot=val;sub(/\/[^\057]*$/,"/",fromroot)}
else {fromroot=""}}
else {print s "Bad <BASE HREF=\042...\042>",crl() ", Ignoring"}}
else {if (((lasttag=="A")&&(lastopt=="NAME"))||(lastopt=="ID"))
{currfn=2;++wasname;
if ((val)&&(val!="="))
{if (("\043" val) in namearr)
{print s "Duplicate location \042\043" val "\042 ERROR!",crl(),"on tag",lasttag,"option",lastopt}
else {if (val~/^\043/)
{print s "Invalid \043-initial location \042" val "\042 ERROR!",crl(),"on tag",lasttag,"option",lastopt}
else {namearr[("\043" val)]=1}}}}
else {if ((lastopt=="SRC")||(lastopt=="BULLET"))
{currfn=1;++washref}
else {if (lastopt=="HREF")
{currfn=3;++washref;
if (val~/^\043/)
{loclhrefarr[val]=1}}}}};
# Reference-type options (currfn non-zero): quoting and whitespace checks,
# then optional recording of the resolved reference.
if (currfn)
{if (val~/[^-a-zA-Z0-9.]/)
{if (!quoted)
{print s "Unquoted non-alphanumeric reference option value ERROR!",crl(),"on tag",lasttag ", option",lastopt}
else {if (currfn==2)
# NOTE(review): namechars is not assigned anywhere in the visible part of
# this file -- confirm it is defined elsewhere, otherwise it prints empty.
{print s "Character",namechars,"in location name Warning!",crl(),"on tag",lasttag ", option",lastopt}}}
else {if (!quoted)
{print s "Unquoted reference option value Warning!",crl(),"on tag",lasttag ", option",lastopt}};
if (val~/[^ ] [^ ]/)
{print s "Whitespace in reference option value Warning!",crl(),"on tag",lasttag ", option",lastopt}
else {if (val=="")
{print s "Null reference option value ERROR!",crl(),"on tag",lasttag,"option",lastopt}
else {
# Skip the residue of Malformed Tag Option cases; OK to do
# this, since "=" is not a valid URL; However, a minor bug
# is that <A NAME="="> will not be checked, and will not
# result in any errormessage.
if (((refsfile)||(xref))&&(val!="="))
{if (currfn==2) {val=(nampref val)}
else {if ((currfn==3)&&(val~/^\043/))
{val=(lochpref val)}
else {if (val~/^http:[^\057]*$/)
sub(/^http:/,"",val);
if ((val!~/^[^\057]*:/)&&(val!~/^\//))
{if (val~/^~/)
{print s "Relative URL beginning with '~' Warning!",crl(),"on tag",lasttag,"option",lastopt}
else {val=(fromroot val)}}}};
# This monstrosity supports "../" in URL's:
while (val~/\057[^\057]*[^\057]\057\.\.\057/)
{sub(/\057[^\057]*[^\057]\057\.\.\057/,"\057",val)};
if ((val~/[:\057]\.\.\057/)||(val~/^\.\.\057/))
{print s "Unresolved \042../\042 in URL Warning!",crl(),"on tag",lasttag,"option",lastopt};
# Without xref the value is written straight to the file named by
# currf[currfn]; with xref it is collected in the x*arr arrays instead.
if (!xref) {print val > currf[currfn]}
else {if (currfn==1)
{xsrcarr[val]=1;
if (map) {xmaparr[lochpref,val]=1}}
else {if (currfn==2) {xnamearr[val]=1}
else {if (currfn==3)
{xhrefarr[val]=1;
if (map) {if (val~/\043[^\057\043]*$/) {sub(/\043[^\057\043]*$/,"",val)};
xmaparr[lochpref,val]=1}}}}}}}}}
else {if ((!quoted)&&(val!="\075"))
# Non-reference option with an unquoted value: warn if it contains anything
# besides letters, digits, "-" and "."; otherwise remember it in unqopt
# unless it consists only of digits, "-" and ".".
{if (val~/[^-a-zA-Z0-9.]/)
{print s "Unquoted non-alphanumeric option value \042" val "\042 Warning!",crl(),"on tag option",lastopt}
else {if (val~/[^-0-9.]/)
{unqopt[(lastopt "=" upcase(val))]=1}}}}}
#
#
# Start each file with a clean slate.
#
function initscalrs() {
# Reset the per-file scalar globals: numeric ones to 0 ...
bodywarn=br=continuation=hbwarn=headlevel=headone=hedbodvar=0;
linkrmhm=listdep=loosbtag=maxlist=nestvar=numheads=0;
prews=state=wasnoalt=wastext=wswarn=0;
# ... and string ones to the empty string.
lasttag="";
prevtag="";
s=""}
#
#
# Uppercasing routine; in GAWK can replace upcase() with built-in function
# toupper() for a speed boost.
#
BEGIN{
# Lookup table mapping each ASCII lowercase letter to its uppercase form;
# consulted by upcase().
upc["a"]="A"; upc["b"]="B"; upc["c"]="C"; upc["d"]="D";
upc["e"]="E"; upc["f"]="F"; upc["g"]="G"; upc["h"]="H";
upc["i"]="I"; upc["j"]="J"; upc["k"]="K"; upc["l"]="L";
upc["m"]="M"; upc["n"]="N"; upc["o"]="O"; upc["p"]="P";
upc["q"]="Q"; upc["r"]="R"; upc["s"]="S"; upc["t"]="T";
upc["u"]="U"; upc["v"]="V"; upc["w"]="W"; upc["x"]="X";
upc["y"]="Y"; upc["z"]="Z";
}
#
function upcase(upcins,k) {
# Return upcins with every letter a-z replaced through the upc[] table.
# k is a local loop variable, not an argument callers pass.
if (upcins!~/[a-z]/) {return upcins};
# gsub() leaves the string untouched for letters that do not occur in it.
for (k in upc) {gsub(k,upc[k],upcins)};
return upcins}
#
#
# End-of-file routine.
#
END{
# Runs only when at least one record was read and err is not set.
if ((NR>0)&&(!err))
  {endit();
   if (xref)
     {# A name that is also referenced cancels out of both arrays.
      for (x in xhrefarr)
        {if (x in xnamearr)
           {delete xhrefarr[x];
            delete xnamearr[x]}};
      # Collapse the (file, reference) pairs into one text list per file.
      if (map)
        {for (x in xmaparr)
           {split(x,mapx,SUBSEP);
            xdeparr[mapx[1]]=(xdeparr[mapx[1]] "\n\t" mapx[2])}};
      if (!refsfile)
        {# No reference files requested: print a report instead.
         print "\n========================================\n";
         print "<A NAME=\042...\042> and ID=\042...\042 locations not referenced from within the files checked:\n";
         for (x in xnamearr) {print x};
         print "\n----------------------------------------\n";
         print "HREF=\042...\042 references not found in the files checked:\n";
         for (x in xhrefarr) {print x};
         print "\n----------------------------------------\n";
         print "SRC=\042...\042 (and BULLET=\042...\042) references:\n";
         for (x in xsrcarr) {print x};
         if (map)
           {print "\n----------------------------------------\n";
            print "Reference dependencies:\n";
            for (x in xdeparr)
              {print "File",x,"references:" xdeparr[x]}}}
      else
        {# Write what is left to the files named in currf[].
         for (x in xnamearr) {print x > currf[2]};
         for (x in xhrefarr) {print x > currf[3]};
         for (x in xsrcarr) {print x > currf[1]};
         if (map)
           {for (x in xdeparr)
              {print "File",x,"references:" xdeparr[x] > currf[4]}}}}}}
#
# File-final global errors and tag diagnostics.
# Information is passed here through arrays:
# - usarr[x]: The tag <x> was used.
# - revusarr[x]: The reverse tag </x> was used.
# - lev[x]: Current degree of self-nesting of paired tag <x>...</x>.
# - optarr[x,y]: The option y was used with tag <x>.
#and also through the variables maxlist and continuation.
#
# Print the end-of-file diagnostics and the per-tag usage summary for the
# file just finished, then reset the per-file scalars and arrays so the next
# file starts clean.  Takes no arguments and returns nothing; all input comes
# from globals (usarr, revusarr, lev, optarr, unqopt, namearr, loclhrefarr,
# maxlist, continuation, ...).
function endit() {
# Record the file's own location: written to currf[2] when reference files
# are produced without xref, otherwise entered into xnamearr.
if (!xref) {if ((currf[2])&&(refsfile)) {print lochpref > currf[2]}}
else {xnamearr[lochpref]=1};
# Choose the message prefix used for the end-of-file diagnostics.
if (inline) {s="HTMLCHEK:"}
else {if (sugar) {s=(fn ": END: ")}};
if (continuation)
  {print s "Was awaiting a `>' ERROR!",ndl()};
if ((wastext)&&(!bodywarn))
  {print s "File-final uncontained non-whitespace Warning!",ndl()};
for (x in usarr)
  {if ((x in pair)&&(lev[x]>0))
     {print s "Pending unresolved <x> without </x> ERROR! of level",lev[x],ndl(),"on tag",x}};
if (!("HTML" in usarr)) {print s "<HTML> not used in document Warning!",ndl()};
if (!("HEAD" in usarr)) {print s "<HEAD> not used in document Warning!",ndl()};
if (!("BODY" in usarr)) {print s "<BODY> not used in document Warning!",ndl()};
if (linkrmhm==0)
  {print s "<LINK REV=\042made\042 HREF=\042mailto:...\042> not used in document Warning!",ndl()};
if (numheads>1)
  {print s "<HEAD> used multiple (" numheads ") times ERROR!",ndl()};
if (!("TITLE" in usarr)) {print s "<TITLE> not used in document ERROR!",ndl()};
if (wasnoalt)
  {print s "<IMG> tags were found without ALT option",wasnoalt,"times Warning!",ndl();
   print "Advice: Add ALT=\042\042 to purely decorative images, and meaningful text to others."};
if (wswarn)
  {print s "Whitespace separated underlining tags from enclosed element",wswarn,"times Warning!",ndl();
   print "Advice: Change ``<X> text </X>'' syntax to preferred ``<X>text</X>'' syntax."};
for (x in loclhrefarr)
  {if (!(x in namearr))
     {print s "Was a dangling file-local reference \042" x "\042 ERROR!",ndl()}};
# List of unquoted option=value pairs; br records that a line break/header
# has already been printed.
for (x in unqopt)
  {if (!br) {printf "\n";if (inline) {printf "HTMLCHEK:"};
             printf "Unquoted tag option=value pairs:";br=1};
   printf " %s",x};
if (br) {printf "\n"};
# One summary line per tag that occurred.
for (x in usarr)
  {options="";head=(x SUBSEP);
   # Collect the options seen with this tag.  optarr keys are "tag SUBSEP
   # option"; compare the prefix literally with index() rather than through
   # a dynamic regex, because tag names may contain regex metacharacters.
   for (z in optarr)
     {if (index(z,head)==1)
        {split(z,optx,SUBSEP);
         options=(options " " optx[2])}};
   unknown=0;if (!br) {print "";br=1};
   if (inline) {printf "HTMLCHEK:"};
   printf "%s %s %s","Tag",x,"occurred";
   if (options)
     {printf "%s%s",", with options",options};
   # The texts below embed the file name (via ndl()) or the tag name, so they
   # are passed as data to a fixed "%s" format, never as the format itself.
   if (!((x in pair)||(x in canpair)||(x in unpair)))
     {printf "%s",("; Warning! tag is unknown " ndl());unknown=1;
      if (x!~/^[A-Z!][-A-Z0-9.]*$/)
        {printf "%s",("; Warning! tag is not alphanumeric " ndl())}};
   if (x in deprec)
     {printf "%s",("; Warning! tag is obsolescent and deprecated " ndl())}
   else {if (x in nonstd)
     {printf "%s",("; Warning! tag is not (yet) a part of HTML standard " ndl())}};
   if ((unknown)&&(x in revusarr)&&(lev[x]!=0))
     {printf "%s",("; Closing tag </" x "> of unknown tag " x " encountered and ");
      printf "%s",("balance of <" x "> minus </" x "> nonzero (" lev[x] ") Warning! " ndl())};
   printf "\n"};
if (maxlist) {if (inline) {printf "HTMLCHEK:"};
              print "Maximum depth of list embedding was",maxlist};
#Reinitialize for next file
initscalrs();
for (x in lev) {delete lev[x]};
for (x in usarr) {delete usarr[x]};
for (x in optarr) {delete optarr[x]};
for (x in unqopt) {delete unqopt[x]};
for (x in namearr) {delete namearr[x]};
for (x in revusarr) {delete revusarr[x]};
for (x in loclhrefarr) {delete loclhrefarr[x]}}
#-=- -=- -=- -=- -=- -=- -=- -=- -=- -=- -=- -=- -=- -=- -=- -=-
##EOF