home *** CD-ROM | disk | FTP | other *** search
- #!/usr/local/bin/perl
- #htmlsrpl.pl: HTML-aware search-and-replace; acts either only outside HTML/SGML
- # tags, or only within HTML/SGML tags; can also upper-case tag names
- #
- # Typical use:
- #
- # perl htmlsrpl.pl [options] infile.html > outfile.html
- #
- # Where options have the form "option=value"; all options should precede
- # filename arguments on the command line. (See the documentation.)
- #
- # Copyright H. Churchyard 1994, 1995 -- freely redistributable. This code is
- # awk-influenced (so sue me). Tested under Perl 4 (I'm still not sure whether
- # the fact that "s/$x/$y/" is equivalent to "s/$x/$y/e" is a bug or not).
- #
- # Version 1.0 12/21/94 -- Preliminary version.
- # Version 1.01 12/22/94 -- Minor bugfix.
- # Version 1.1 1/7/95 -- Added inside=, inmost=, oustside= , etc. Included in
- # htmlchek 4.0 release.
- # Version 1.11 1/22/95 -- Added "Changed!/Unchanged" final status messages.
- # Included in htmlchek 4.1 release.
- #
- eval "exec /usr/local/bin/perl -S $0 $*"
- if $running_under_some_shell; # this emulates #! processing on NIH machines.
- #process any FOO=bar switches
- $old= ''; $new = ''; $intags = 0; $regexp = 0; $regeval = 0; $upcase = 0;
- $lines = 0; $delete = 0; $case = 0; $slash=0; $inmost=''; $inside = '';
- $outside = '';
- eval '$'.$1.'$2;' while $ARGV[0] =~ /^(old=|new=|intags=|lines=|regexp=|regeval=|upcase=|delete=|case=|slash=|inmost=|inside=|outside=)(.*)/ && shift;
- $[ = 1; # set array base to 1
- $, = ' '; # set output field separator
- $\ = "\n"; # set output record separator
- foreach $X (@ARGV) {
- if ($X =~ /^[^=]+=/) {
- print STDERR "Apparent misspelled or badly-placed command-line option $&";
- print STDERR "Attempting to continue anyway...";}}
- $filstr = join(' ',@ARGV); $changed = 0;
- if ($lines) {$/ = "\0777"; $* = 1;}
- else {$/ = "\n";}
- if (($outside) && (!(($inside) || ($inmost)))) {$applyit = 1;}
- else {$applyit = 0;}
- #
- $unpair{'!--'} = 1; $unpair{'!DOCTYPE'} = 1; $unpair{'BASE'} = 1;
- $unpair{'BR'} = 1; $unpair{'COMMENT'} = 1; $unpair{'HR'} = 1;
- $unpair{'IMG'} = 1; $unpair{'INPUT'} = 1; $unpair{'ISINDEX'} = 1;
- $unpair{'LINK'} = 1; $unpair{'META'} = 1; $unpair{'NEXTID'} = 1;
- $unpair{'ATOP'} = 1; $unpair{'LEFT'} = 1;
- $unpair{'OVER'} = 1; $unpair{'OVERLAY'} = 1; $unpair{'RIGHT'} = 1;
- $unpair{'TAB'} = 1; $unpair{'BASEFONT'} = 1; $unpair{'WBR'} = 1;
- $nestvar = 0; $numins = 0; $numout = 0;
- if ($inmost) {
- $inmost =~ tr/a-z/A-Z/;
- if ($inmost =~ /[^-.a-zA-Z0-9]/) {
- die 'Non-alphanumeric value of inmost= was specified';}
- if (defined $unpair{$inmost}) {
- die "Non-pairing tag $inmost specified as value of inmost=";}}
- if ($inside) {
- $numins = (@inarr = split(/,/, $inside));
- for ($i = 1; $i <= $numins; ++$i) {
- $inarr[$i] =~ tr/a-z/A-Z/;
- if ((!$inarr[$i]) || ($inarr[$i] =~ /[^-.a-zA-Z0-9]/)) {
- die 'Non-alphanumeric value of inside= was specified';}
- if (defined $xxin{$inarr[$i]}) {
- die 'Duplicate values of inside= were specified';}
- if (defined $unpair{$inarr[$i]}) {
- die "Non-pairing tag $inarr[$i] specified as value of inside=";}
- else {
- $xxin{$inarr[$i]} = 1;}}}
- if ($outside) {
- $numout = (@outarr = split(/,/, $outside));
- for ($i = 1; $i <= $numout; ++$i) {
- $outarr[$i] =~ tr/a-z/A-Z/;
- if ((!$outarr[$i]) || ($outarr[$i] =~ /[^-.a-zA-Z0-9]/)) {
- die 'Non-alphanumeric value of outside= was specified';}
- if (defined $xxout{$outarr[$i]}) {
- die 'Duplicate values of outside= were specified';}
- if (defined $xxin{$outarr[$i]}) {
- die "Tagname $outarr[$i] specified as both outside= and inside=";}
- if (defined $unpair{$outarr[$i]}) {
- die "Non-pairing tag $outarr[$i] specified as value of outside=";}
- else {
- $xxout{$outarr[$i]} = 1;}}}
- #
- if ((!$old) && (!$upcase)) {die "No `old=' string was specified";}
- if (($delete) && (($new) || ($regexp) || ($regeval))) {
- die "Incompatible option specified with `delete=1'";}
- if (($regexp) && ($regeval)) {die 'Both regexp=1 and regeval=1 specified';}
- if (($case) && (!$delete) && (!$regexp) && (!$regeval)) {
- die 'Option case=1 specified without any of regexp=1, regeval=1, or delete=1 also being specified';}
- if ($delete) {$slash=1;}
- if (($upcase) || ($delete) || ($slash)) {$intags = 1;}
- #
- # Main
- #
- # Variable ``$state'' is one if there is an unresolved `<', zero otherwise.
- # ``$lastbeg'' is zero if no `<' has ocurred in $_, otherwise it points to the
- # character immediately after the last `<' encountered.
- #
- $xRS = "\n"; $state = 0;
- while (<>) {
- if ($_ =~ /$xRS$/o) { # strip record separator, allow for last line to
- chop;} # be unterminated.
- $lastbeg = 0; $currsrch = 1; $txtbeg = 1;
- while ((((substr($_, $currsrch) =~ /[<>]/) eq 1) &&
- ($RSTART = length($`)+1)) != 0) {
- $currsrch = ($currsrch + $RSTART);
- if (substr($_, ($currsrch - 1), 1) eq '<') {
- if ($state) {
- print "\nERROR!";
- die "Multiple `<' without `>' ERROR!";}
- else {
- if (($currsrch > length($_)) ||
- (substr($_, $currsrch, 1) =~ /^[ \t]$/)) {
- print "\nERROR!";
- die "Whitespace after `<': Bad SGML syntax ERROR!";}
- else {
- if ($currsrch > ($txtbeg + 1)) {
- if ((!$intags) && (($applyit) || (!(($inmost) ||
- ($numins) || ($numout))))) {
- printf "%s", &changeht(substr($_, $txtbeg,
- ($currsrch - ($txtbeg + 1))));}
- else {
- printf "%s", substr($_, $txtbeg,
- ($currsrch - ($txtbeg + 1)));}}
- $deletit = 0;
- $lastbeg = $currsrch; $state = 1;}}}
- else {
- if (substr($_, ($currsrch - 1), 1) eq '>') {
- if ($state == 0) {
- next;} #`>' without `<'
- else {
- &parsetag($currsrch - 1);
- if (!$deletit) {printf "%s", '>';}
- $txtbeg = $currsrch; $state = 0;}}
- else {die 'Internal error, ignore';}}}
- #At EOL:
- if ($state == 1) {
- &parsetag(length($_) + 1);}
- elsif ($txtbeg <= length($_)) {
- if ((!$intags) && (($applyit) || (!(($inmost) || ($numins) ||
- ($numout))))) {
- printf "%s", &changeht(substr($_, $txtbeg));}
- else {printf "%s", substr($_, $txtbeg);}}
- if (!(($state) && ($deletit))) {printf "\n";}}
- #
- #END routine:
- #
- if ($state) {
- die "Was awaiting a `>' ERROR! at END";}
- if ($changed) {
- print STDERR "Changed! on input", $filstr;}
- else {
- print STDERR "Unchanged on input", $filstr;}
- #
- #
- sub parsetag {
- local($inp) = @_;
- $docap = $lastbeg;
- if (!$lastbeg) {
- $strx = '' ; $lastbeg = 1;}
- else {$strx= '<';}
- if ($inp != $lastbeg) {
- $str = &upc(substr($_, $lastbeg, ($inp - $lastbeg)));
- if (($oldapply) || (!(($inmost) || ($numins) ||($numout)))) {
- if (($slash) && ($docap) && ($str =~ /^\//))
- {$strx = ($strx . '/'); $str= substr($str, 2);}
- if ($delete) {
- if ($docap) {&getdel($str);}
- if (!$deletit) {printf "%s%s", $strx, $str;}
- else {$changed=1;}}
- else {
- if (($intags) && ($old))
- {printf "%s%s", $strx, &changeht($str);}
- else {printf "%s%s", $strx, $str;}}}
- else {printf "%s%s", $strx, $str;}}}
- #
- sub upc {
- local($upcx) = @_;
- if ($docap) {
- $upcx =~ /^[^ \t\n]+/;
- ($tagname = $&) =~ tr/a-z/A-Z/;
- if ($upcase) {$upcx = ($tagname . $');}
- $oldapply = $applyit;
- #tag stack accounting
- if ((($inmost) || ($numins)|| ($numout)) &&
- (!(defined $unpair{$tagname}))) {
- $applyit = 1; $clostag = '';
- if ($tagname !~ /^\//) {
- ++$nestvar;
- $nestarr[$nestvar] = $tagname;}
- else {
- $clostag = substr($tagname,2);
- while ($nestarr[$nestvar] ne $clostag) {
- --$nestvar;
- if ($nestvar <= 0) {
- print "\nERROR!";
- die "/$clostag tag encountered when apparently not in $clostag element";}}
- --$nestvar;}
- if (($inmost) && ($nestarr[$nestvar] ne $inmost)) {
- $applyit = 0;}
- if ($numins) {
- if ($nestvar < $numins) {$applyit = 0;}
- else {
- $mask = 1;
- $stackstr = (" " . join(" ",@nestarr[1..$nestvar]) . " ");
- foreach $X (keys %xxin) {
- if (index($stackstr,(" " . $X . " ")) <= 0) {
- $mask = 0;}}
- if (($applyit) && ($mask)) {$applyit = 1;}
- else {$applyit = 0;}}}
- if (($numout) && ($nestvar)) {
- $mask = 1;
- $stackstr = (" " . join(" ",@nestarr[1..$nestvar]) . " ");
- foreach $X (keys %xxout) {
- ##print $stackstr,"XX",(" " . $X . " ");##debugXX
- if (index($stackstr,(" " . $X . " ")) > 0) {
- $mask = 0;}}
- if (($applyit) && ($mask)) {$applyit = 1;}
- else {$applyit = 0;}}
- if ($clostag) {$oldapply = $applyit;}}}
- return $upcx;}
- #
- sub getdel {
- local($inz) = @_;
- $inz =~ /^[^ \t\n]+/;
- $X = $&;
- if ($case) {
- if ($X =~ /$old/io) {
- $deletit = 1;}}
- else {
- if ($X =~ /$old/o) {
- $deletit = 1;}}}
- #
- sub changeht {
- local($field) = @_;
- if ($regeval) {
- if ($case) {
- $X = ($field =~ s/$old/$new/eeigo);}
- else {
- $X = ($field =~ s/$old/$new/eego);}
- if ($X) {$changed = 1;}
- return $field;}
- elsif ($regexp) {
- if ($case) {
- $X = ($field =~ s/$old/$new/igo);}
- else {
- $X = ($field =~ s/$old/$new/go);}
- if ($X) {$changed = 1;}
- return $field;}
- else {
- $startf = 1; $newf = '';
- while (($ndx = index(substr($field,$startf),$old)) > 0) {
- $changed = 1;
- $newf = ($newf . substr($field,$startf,($ndx-1)) . $new);
- $startf = ($startf + ($ndx-1) + length($old));}
- $newf = ($newf . substr($field,$startf));
- return $newf;}}
- ##EOF
-