home *** CD-ROM | disk | FTP | other *** search
- #!/usr/local/bin/perl
- #xtraclnk.pl: Extracts hypertext links from HTML files; isolates text contained
- # in <A> and <TITLE> elements.
- #
- # Typical use:
- #
- # perl xtraclnk.pl [options] infiles.html > outfile
- #
- # Where options have the form "option=value", as discussed below (command line
- # options other than ``title='' and ``loc='' work the same way as those of the
- # htmlchek program in this distribution).
- #
- # Whenever xtraclnk.pl encounters an <A HREF="URL">Text</A> link in an input
- # file, it copies this to the output. Whenever xtraclnk.pl encounters an
- # <A NAME="name">Text</A> anchor in an input file, it copies this as an
- # <A HREF="currentfile.html#name">Text</A> link _to_ the current input file.
- # Finally, the contents of a <TITLE>Text</TITLE> element are copied as an
- # <A HREF="currentfile.html">Text</A> link _to_ the current input file.
- # Each link in the ouput occupies exactly one line.
- #
- # This program was suggested by an idea of John Harper; what he had in mind,
- # I think, was to use this as part of a CGI script which would dynamically
- # construct an HTML document with links to all files with a title or anchors
- # that contain text matching a user-specified search pattern. However,
- # xtraclnk.pl also has some value as an HTML style debugging tool: if you have
- # used a lot of context-dependent titles like "Intro" and meaningless anchor
- # text like "Click Here", this will be very apparent when you view the HTML
- # document (derived with xtraclnk.pl using the ``title='' option) which
- # contains only the text inside titles and anchors in your other HTML
- # documents. This program can also be used to enforce consistency in link text:
- # if there is random variation between different <A HREF="...">LinkText</A>
- # elements which all point towards the same resource, this will be apparent
- # when the output of xtraclnk.pl is sorted. Also, by looking over the sorted
- # output of <tt>xtraclnk.pl</tt>, it becomes relatively easy to detect mistaken
- # links, that point to someplace other than what was intended.
- #
- # If you apply xtraclnk.pl to a list of filenames that are all specified
- # relative to the current directory then all the references to files in
- # subordinate directories will be expressed from the point of view of the top
- # directory (i.e. relative URL pathnames will have the current directory as
- # starting point). Under Unix, you can use:
- #
- # perl xtraclnk.pl `find . -name \*.html -print` > output
- #
- # Since xtraclnk.pl is a hacked-down version of the htmlchek error checker, it
- # is rather robust in its handling of incorrect HTML code (but it generally has
- # the same limitations that htmlchek does with metachar=2). Though it is not
- # a general-purpose error checker like htmlchek, xtraclnk.pl does return
- # errormessages about HTML errors connected with its functioning (note that it
- # ignores all tags in a file except <A>, <BASE>, <TITLE>, and the
- # ALT="..." attribute valuex of <IMG>).
- #
- # Command-line
- # options:
- #
- # dirprefix=... A string to be prefixed to URL's in the output links, in
- # order to resolve relative URL's into absolute URL's.
- # (See the htmlchek documentation for the complexities of use.)
- #
- # usebase=1 Take the prefix from a <BASE HREF="..."> tag in each file.
- #
- # sugar=1 Use the Unix ``filename: linenumber:'' format in reporting
- # errors.
- #
- # title=... Make the output file a valid HTML document, with <br> at
- # the end of each line, and a title as specified. Error
- # messages (if any) appear as HTML comments in the outputfile.
- # (If this title= option is not specified on the command line,
- # the output will tailored for human readability, and will not
- # really be an HTML file.)
- # Note that the output with title= will still be a HTML file
- # if you run it though the ``sort'' and ``uniq'' filters. It
- # will also remain HTML if you run it through ``grep'' -- as
- # long as you keep the first and last lines; for example (under
- # Unix):
- # perl xtraclnk.pl title="Link Stuff" *.html > out
- # head -1 out > linkfile.html
- # egrep 'pattern' out >> linkfile.html
- # tail -1 out >> linkfile.html
- #
- # loc=... Whether or not to include the location (input filename and
- # linenumber) from which each output link is derived. If you
- # don't include locations, it's hard to tell where bad links
- # came from; if you do include locations, the output will be
- # larger, and running the output though sort and uniq won't be
- # as useful for detecting inconsistent link text.
- # By default, source locations are not included in the
- # output. A value of loc=1 causes locations to be included.
- # A value of loc=hide (or anything beginning with the three
- # characters "hid...") will include locations as HTML comments,
- # if the title= option has alson been specified.
- #
- # Copyright 1994, 1995 by H. Churchyard, churchh@uts.cc.utexas.edu -- freely
- # redistributable.
- #
- # Version 1.0 12/15/94
- # Version 1.1 12/18/94 -- improve HTML-icity of "title=" option output, etc.
- # Version 1.11 12/19/94 -- squashed minor bugs. Was informally made
- # available by HTTP from uts.cc.utexas.edu.
- # Version 1.2 1/9/95 -- Added loc= option, include <IMG ALT="..."> text in
- # links. Included in htmlchek 4.0 release.
- #
- eval "exec /usr/local/bin/perl -S $0 $*"
- if $running_under_some_shell; # This emulates #! processing on NIH machines
- #
- # Setup:
- #
- $known{'A'} = 1; $known{'IMG'} = 1; $known{'TITLE'} = 1; $known{'/A'} = 1;
- $known{'/TITLE'} = 1; $known{'BASE'} = 1; $pair{'A'} = 1; $pair{'TITLE'} = 1;
- #
- &initscalrs();
- $usebase = 0; $dirprefix = ''; $sugar = 0; $title = ''; $loc = 0;
- #process any FOO=bar switches
- eval '$'.$1.'$2;' while $ARGV[0] =~ /^(usebase=|dirprefix=|sugar=|title=|loc=)(.*)/ && shift;
- $[ = 1; # set array base to 1
- $, = ' '; # set output field separator
- $\ = "\n"; # set output record separator
- foreach $X (@ARGV) {
- if ($X =~ /^[^=]+=/) {
- print STDERR "Apparent misspelled or badly-placed command-line option $&";
- print STDERR "Attempting to continue anyway...";}}
- #
- if ($title) {
- print " <html><head><title>$title</title></head><body><h1>$title</h1>";
- $E = '<br>'; $gt = '>'; $lt = '<'; $A = '<!-- '; $Z = ' -->';
- if ($loc =~ /^HID/i)
- {$AA = '<!-- '; $ZZ = ' -->';}
- else
- {$AA = ''; $ZZ = '';}}
- else {
- $E = ''; $gt = '>'; $lt = '<'; $A = ''; $Z = ''; $AA = ''; $ZZ = '';}
- #
- # Main
- #
- $stuperlRS = $/;
- while (<>) {
- if ($_ =~ /$stuperlRS$/o) { # strip record separator, allow for last line to
- chop;} # be unterminated.
- if (($.-$FNRbase) == 1) {
- $fn = $ARGV;
- # Next line is Unix-specific
- $fn =~ s/^\.\///;
- $nampref = ($dirprefix . $fn . '#');
- $lochpref = ($dirprefix . $fn);
- if ($fn =~ /.\//) {
- $fromroot = $fn; $fromroot =~ s/\/[^\057]*$/\//;}
- else {
- $fromroot = '';}
- $fromroot=($dirprefix . $fromroot);}
- if ($sugar) {$S = ($fn . ': ' . ($.-$FNRbase) . ': ');}
- if ($loc) {$L = ($fn . ' ' . ($.-$FNRbase));}
- $lastbeg = 0; $currsrch = 1; $txtbeg = 1;
- while ((((substr($_, $currsrch) =~ /[<>]/) eq 1) &&
- ($RSTART = length($`)+1)) != 0) {
- $currsrch = ($currsrch + $RSTART);
- if (substr($_, ($currsrch - 1), 1) eq '<') {
- if ($state) {
- print $A . $S . "Multiple `$lt' without `$gt' ERROR!", &crl() .
- $Z;}
- else {
- if (($currsrch > length($_)) ||
- (substr($_, $currsrch, 1) =~ /^[ \t]$/)) {
- print $A . $S .
- "Whitespace after `$lt': Incorrect SGML syntax ERROR!",
- &crl() . ", Ignoring$Z";}
- else {
- if (($nestvar) && ($currsrch > ($txtbeg + 1))) {
- $line = ($line . substr($_, $txtbeg,
- ($currsrch - ($txtbeg + 1))));}
- $lastbeg = $currsrch; $state = 1;
- $lasttag = ''; $lastopt = '';}}}
- else {
- if (substr($_, ($currsrch - 1), 1) eq '>') {
- if ($state == 0) {
- next;} #`>' without `<'
- else {
- &parsetag($currsrch - 1);
- if (($inquote) || ($inequal)) {
- &malft();}
- if ($optfree) {
- &misstest();}
- if (($lasttag eq 'A') && (!$wasname) && (!$washref)) {
- print $A . $S . $lt .
- "A$gt tag occurred without reference (NAME,HREF,ID) option ERROR!",
- &crl() . $Z;}
- if (($wasname > 1) || ($washref > 1)) {
- print $A . $S .
- 'Multiple reference (NAME,ID;HREF) options ERROR!',
- &crl(), 'on tag', $lasttag . $Z;}
- $txtbeg = $currsrch;
- $state = 0; $continuation = 0;}}
- else {
- print $A . $S . 'Internal error', &crl(), 'ignore' . $Z;}}}
- if (($state == 1) || (($lastbeg == 0) && ($continuation == 1))) {
- &parsetag(length($_) + 1);
- $continuation = 1;}
- else {
- if (($nestvar) && (!$state) && ($txtbeg <= length($_))) {
- $line = ($line . substr($_, $txtbeg) . ' ');}
- else {
- $line = ($line . ' ');}}}
- continue {
- $FNRbase = $. if eof;}
- #
- # End-of-file routine.
- #
- if ($. > 0) {&endit()};
- if ($title) {print '<hr></body></html>';}
- #
- #
- # parsetag() communicates with main() through these global variables:
- # - $lastbeg (zero if no `<' ocurred on line, otherwise points to character
- # immediately after the last `<' encountered).
- # - $state (one if unresolved `<', zero otherwise).
- # - $continuation (one if unresolved `<' from previous line, zero otherwise),
- # - $inquote (one if inside option quotes <tag opt="...">).
- #
- sub parsetag {
- local($inp) = @_;
- if (!$lastbeg) {
- $lastbeg = 1;}
- $numf = (@arr = split(' ', substr($_, $lastbeg, ($inp - $lastbeg))));
- if ($numf == 0) {
- if (!$continuation) {
- print $A . $S . "Blank $lt$gt ERROR!", &crl() . $Z;
- $state = 0;}
- return;}
- else {
- if (!$continuation) {
- $arr[1] =~ tr/a-z/A-Z/;
- $lasttag = $arr[1];
- if (defined $known{$arr[1]}) {
- if ($arr[1] =~ /^\//) {
- # </TAG> found
- $arr[1] =~ s/^\///;
- if (defined $pair{$arr[1]}) {
- if (($nestvar <= 0) || ($lev{$arr[1]} <= 0)) {
- print $A . $S . 'Extraneous /' . $arr[1],
- 'tag without preceding', $arr[1], 'tag ERROR!',
- &crl() . ', Ignoring' . $Z;}
- else {--$nestvar; --$lev{$arr[1]};
- if ($arr[1] eq 'TITLE') {&doout($lochpref);}
- else
- {if ($currf[2]) {&doout($currf[2]);}
- if ($currf[3]) {&doout($currf[3]);}}}}}
- else {
- # <TAG> found
- if ($arr[1] ne 'IMG') {$line = '';}
- ++$lev{$arr[1]};
- if (defined $pair{$arr[1]}) {
- $currf[2] = ''; $currf[3] = '';
- ++$nestvar;
- if (($lev{$arr[1]} > 1) || ($nestvar > 1)) {
- print $A . $S . 'Nesting ERROR!', &crl(),
- "on tag $arr[1]" . $Z;}}}}
- $startf = 2; $inquote = 0; $inequal = 0; $optfree = 0;
- $wasopt = 0; $wasname = 0; $washref = 0;}
- else {
- $startf = 1;}
- # Remainder of stuff in <...> after tag word
- if (defined $known{$lasttag}) {
- for ($i = $startf; $i <= $numf; ++$i) {
- if ((!$inequal) && (!$inquote)) {
- if (($arr[$i] =~
- /^[^=\042]*(=\042[^\042]*\042)?$/) ||
- ($arr[$i] =~ /^[^=\042]*=(\042)?[^\042]*$/)) {
- if (($optfree) &&
- (($arr[$i] =~ /^=[^=\042][^=\042]*$/) ||
- ($arr[$i] =~ /^=\042[^\042]*\042$/))) {
- if (!$malftag) {
- $arr[$i] =~ s/^\075//;
- if ($arr[$i] =~ /\042/) {
- &optvalproc($arr[$i],1);}
- else {&optvalproc($arr[$i],0);}}
- $optfree = 0; ++$tagwarn;}
- else {
- if (($optfree) && (($arr[$i] =~ /^=\042/) ||
- ($arr[$i] eq '='))) {
- $inequal = 1; ++$tagwarn;}
- @arr2 = split(/=/, $arr[$i], 2);
- if ($arr2[1] eq '') {
- if (!$inequal) {
- print $A . $S . 'Null tag option ERROR!',
- &crl(), "on tag $lasttag" . $Z;
- $malftag = 1;}}
- else {
- if ($optfree) {
- &misstest();}
- $arr2[1] =~ tr/a-z/A-Z/;
- $optfree = 1; ++$wasopt;
- $malftag = 0; $optvalstr = '';
- if ($lasttag =~ /^\//) {
- print $A . $S . 'Option on closing tag',
- $lasttag, 'Warning!', &crl() . $Z;}
- else {
- $lastopt = $arr2[1];}}
- if ($arr[$i] =~ /^[^=\042][^=\042]*=$/) {
- $inequal = 1;}
- if ($arr[$i] =~ /[\075]/) {
- $optvalstr = $arr[$i];
- $optvalstr =~ s/^[^=]*=//;}
- $stuperltmp = $arr[$i];
- $Q = ($stuperltmp =~ s/\042//g);
- if ($Q == 1) {
- $inquote = 1;}
- if (($optvalstr)&&(!$inequal)&&(!$inquote)) {
- $optfree = 0;
- if (!$malftag) {
- &optvalproc($optvalstr,$Q);}}}}
- else {
- &malft();}}
- else {
- if (($inequal) && (!$inquote)) {
- ++$tagwarn;
- if ($arr[$i] =~ /\042/) {
- if ($arr[$i] =~ /^\042[^\042]*(\042)?$/) {
- $stuperltmp = $arr[$i];
- if (($stuperltmp =~ s/\042//g) == 2) {
- if (!$malftag) {
- $stuperltmp =~ s/^\075//;
- &optvalproc($stuperltmp,1);}
- $inequal = 0; $optfree = 0;}
- else {
- $optvalstr = $arr[$i];
- $inquote = 1;}}
- else {
- &malft();}}
- else {
- if ($arr[$i] !~ /[\075]/) {
- if (!$malftag) {
- &optvalproc($arr[$i],0);}
- $inequal = 0; $optfree = 0;}
- else {
- &malft();}}}
- else {
- if ($arr[$i] =~ /\042/) {
- $inquote = 0; $inequal = 0; $optfree = 0;
- if ($arr[$i] !~ /^[^\042]*\042$/) {
- &malft();}
- else {
- $optvalstr = ($optvalstr . ' ' . $arr[$i]);
- if (!$malftag) {
- &optvalproc($optvalstr,1);}}}
- else {
- $optvalstr = ($optvalstr . ' ' . $arr[$i]);}}}}}
- return;}}
- #
- #
- # Return as much location information as possible in diagnostics:
- #
- # Current location:
- sub crl {
- if (($fn)&&($fn ne '-')) {
- return ('at line ' . ($.-$FNRbase) . " of file \042" . $fn . "\042");}
- else {
- return ('at line ' . $.);}}
- #
- # End of file location:
- sub ndl {
- if (($fn)&&($fn ne '-')) {
- return ("at END of file \042" . $fn . "\042");}
- else {
- return 'at END';}}
- #
- # Error message returned from numerous places in the program...
- #
- sub malft {
- print $A . $S . 'Malformed tag option ERROR!', &crl(), 'on tag', $lasttag .
- $Z;
- $malftag = 1;}
- #
- #
- #Check for non-kosher null options:
- #
- sub misstest {
- if ((($lasttag eq 'A') && ($lastopt eq 'NAME')) || ($lastopt eq 'HREF') ||
- ($lastopt eq 'ID')) {
- print $A . $S . 'Missing reference option value', &crl(),
- "on tag $lasttag, option $lastopt" . $Z;}}
- #
- #
- sub doout {
- local($href) = @_;
- $line =~ s/[ \t][ \t]+/ /g; $line =~ s/\t/ /g;
- $line =~ s/^ //; $line =~ s/ $//;
- if ($line eq '') {
- $line = '[ EMPTY ANCHOR TEXT ]';}
- print "<A HREF=\042" . $href . "\042>" . $line . '</A>', $AA . $L . $ZZ . $E;}
- #
- # This subroutine receives the raw option value string, for every tag option
- # that does have a value. It does some errorchecking and cleanup, and sets
- # the URL or name of the current anchor.
- #
- sub optvalproc {
- local($val, $quoted) = @_;
- $currfn = 0;
- if ($quoted) {
- $val =~ s/\042//g; $val =~ s/^ //; $val =~ s/ $//;}
- if ($lasttag eq 'IMG') {
- if (($lastopt eq 'ALT') && ($val =~ /[^ \t]/)) {
- $line = ($line . " [ $val ] ");}}
- elsif ($lasttag eq 'BASE') {
- if (($usebase) && ($lastopt eq 'HREF')) {
- if (($quoted) && ($val) && ($val ne '=') && ($val !~ /[^ ] [^ ]/)) {
- $nampref = ($val . '#'); $lochpref = $val;
- if ($val =~ /.\//) {
- $fromroot = $val;
- $fromroot =~ s/\/[^\057]*$/\//;}
- else {
- $fromroot = '';}}
- else {
- print $A . $S . "Bad $lt" . "BASE HREF=\042...\042$gt", &crl() .
- ', Ignoring' . $Z;}}}
- else {
- if ((($lasttag eq 'A') && ($lastopt eq 'NAME')) || ($lastopt eq 'ID')) {
- $currfn = 2; ++$wasname;
- if ($val =~ /^#/) {
- print $A . $S . "Invalid #-initial location \042" .
- $val . "\042 ERROR!", &crl(), 'on tag', $lasttag,
- 'option', $lastopt . $Z;}}
- else {
- if ($lastopt eq 'HREF') {
- $currfn = 3; ++$washref;}}}
- if ($currfn) {
- if (!$quoted) {
- print $A . $S . 'Unquoted reference option value Warning!', &crl(),
- "on tag $lasttag, option $lastopt$Z";}
- if ($val =~ /[^ ] [^ ]/) {
- print $A . $S . 'Whitespace in reference option value Warning!',
- &crl(), "on tag $lasttag, option $lastopt$Z";}
- else {
- if ($val eq '') {
- print $A . $S . 'Null reference option value ERROR!', &crl(),
- "on tag $lasttag, option $lastopt$Z";}
- else {
- # Skip the residue of Malformed Tag Option cases; OK to do
- # this, since "=" is not a valid URL; However, a minor bug
- # is that <A NAME="="> will not be checked, and will not
- # result in any errormessage.
- if ($val ne '=') {
- if ($currfn == 2) {
- $val = ($nampref . $val);}
- else {
- if (($currfn == 3) && ($val =~ /^#/)) {
- $val = ($lochpref . $val);}
- else {
- if ($val =~ /^http:[^\057]*$/) {
- $val =~ s/^http://;}
- if (($val !~ /^[^\057]*:/) && ($val !~ /^\//)) {
- if ($val =~ /^~/) {
- print $A . $S .
- "Relative URL beginning with '~' Warning!",
- &crl(),"on tag $lasttag option $lastopt$Z";}
- else {
- $val = ($fromroot . $val);}}}}
- # This monstrosity supports "../" in URL's:
- while ($val =~ /\057[^\057]*[^\057]\057\.\.\057/) {
- $val =~ s/\057[^\057]*[^\057]\057\.\.\057/\057/;}
- if (($val =~ /[:\057]\.\.\057/) || ($val =~ /^\.\.\057/)) {
- print $A . $S . "Unresolved \042../\042 in URL Warning!",
- &crl(), "on tag $lasttag option $lastopt$Z";}
- $currf[$currfn] = $val;}}}}}
- #
- #
- # Start each file with a clean slate.
- #
- sub initscalrs {
- $state = 0; $continuation = 0; $nestvar = 0; $S = ''; $L = ''; $line = '';}
- #
- #
- #
- sub endit {
- if ($sugar) {$S = ($fn . ': END: ');}
- if ($continuation) {
- print $A . $S . "Was awaiting a `$gt' ERROR!", &ndl() . $Z;}
- foreach $X (sort(keys %pair)) {
- if ($lev{$X} > 0) {
- print $A . $S . "Pending unresolved $lt" .
- "x$gt without $lt/x$gt ERROR!", &ndl(), 'on tag', $X . $Z;}}
- #Reinitialize for next file
- &initscalrs();
- undef %lev;}
- #-=- -=- -=- -=- -=- -=- -=- -=- -=- -=- -=- -=- -=- -=- -=- -=-
- ##EOF
-