home *** CD-ROM | disk | FTP | other *** search
- #!/usr/local/bin/perl
- #dehtml.pl: Removes all HTML tags from file, preliminary to spell check; common
- # ampersand "&entities;" are also resolved into single characters.
- #
- # Typical use:
- #
- # perl dehtml.pl infile.html > outfile.txt
- #
- # This program processes all files on the command line to STDOUT; to process a
- # number of files individually, use the iteration mechanism of your shell; for
- # example:
- #
- # for a in *.html ; do perl dehtml.pl $a > otherdir/$a ; done
- #
- # in Unix sh, or:
- #
- # for %a in (*.htm) do call dehtml %a otherdir\%a
- #
- # in MS-DOS, where dehtml.bat is the following one-line batch file:
- #
- # perl dehtml.pl %1 > %2
- #
- # Copyright H. Churchyard 1994 -- freely redistributable.
- #
- # Version 1.0 11/27/94 -- Tested with 4.03[56] on SunOS and DEC Alpha OSF/1,
- # and MacPerl 4.13. Included in htmlchek 3.0 release.
- # Version 1.1 12/6/94 -- Fixed minor bug which could unpredictably cause a
- # string such as "é" to be reduced into a single character;
- # added "". Included in htmlchek 3.01 release.
- # Version 1.2 1/12/95 -- No error on `>' outside tag; minor bugfix. Included
- # in htmlchek 4.0 release.
- #
- # This program is a port to perl of the original dehtml.awk (the port was
- # fairly mechanical, so programming style and efficency may not be high).
- #
- eval "exec /usr/local/bin/perl -S $0 $*"
- if $running_under_some_shell;
- # this emulates #! processing on NIH machines.
- $[ = 1; # set array base to 1
- $, = ' '; # set output field separator
- $\ = "\n"; # set output record separator
- #
- $amp{' '} = "\040"; $amp{' '}="\040";
- $amp{'"'} = "\042"; $amp{'"'}="\042";
- $amp{'<'} = "\074"; $amp{'<'}="\074"; $amp{'>'} = "\076";
- $amp{'>'}="\076"; $amp{'À'}="\300"; $amp{'Á'}="\301";
- $amp{'Â'}="\302"; $amp{'Ã'}="\303"; $amp{'Ä'}="\304";
- $amp{'Å'}="\305"; $amp{'Æ'}="\306"; $amp{'Ç'}="\307";
- $amp{'È'}="\310"; $amp{'É'}="\311"; $amp{'Ê'}="\312";
- $amp{'Ë'}="\313"; $amp{'Ì'}="\314"; $amp{'Í'}="\315";
- $amp{'Î'}="\316"; $amp{'Ï'}="\317"; $amp{'Ð'}="\320";
- $amp{'Ñ'}="\321"; $amp{'Ò'}="\322"; $amp{'Ó'}="\323";
- $amp{'Ô'}="\324"; $amp{'Õ'}="\325"; $amp{'Ö'}="\326";
- $amp{'Ø'}="\330"; $amp{'Ù'}="\331"; $amp{'Ú'}="\332";
- $amp{'Û'}="\333"; $amp{'Ü'}="\334"; $amp{'Ý'}="\335";
- $amp{'Þ'}="\336"; $amp{'ß'}="\337"; $amp{'à'}="\340";
- $amp{'á'}="\341"; $amp{'â'}="\342"; $amp{'ã'}="\343";
- $amp{'ä'}="\344"; $amp{'å'}="\345"; $amp{'æ'}="\346";
- $amp{'ç'}="\347"; $amp{'è'}="\350"; $amp{'é'}="\351";
- $amp{'ê'}="\352"; $amp{'ë'}="\353"; $amp{'ì'}="\354";
- $amp{'í'}="\355"; $amp{'î'}="\356"; $amp{'ï'}="\357";
- $amp{'ð'}="\360"; $amp{'ñ'}="\361"; $amp{'ò'}="\362";
- $amp{'ó'}="\363"; $amp{'ô'}="\364"; $amp{'õ'}="\365";
- $amp{'ö'}="\366"; $amp{'ø'}="\370"; $amp{'ù'}="\371";
- $amp{'ú'}="\372"; $amp{'û'}="\373"; $amp{'ü'}="\374";
- $amp{'ý'}="\375"; $amp{'þ'}="\376"; $amp{'ÿ'}="\377";
- $amp{'®'}="\256"; $amp{'©'}="\251"; $amp{'£'} = "\243";
- $amp{''}="-";
- #
- # Main
- #
- # Variable ``$state'' is one if unresolved `<', zero otherwise.
- #
- $stuperlRS = $/;
- while (<>) {
- if ($_ =~ /$stuperlRS$/o) { # strip record separator, allow for last line to
- chop;} # be unterminated.
- $line = ''; $errstr = ''; $erra = 0; $errb = 0;
- $currsrch = 1; $txtbeg = 1;
- while ((((substr($_, $currsrch) =~ /[<>]/) eq 1) &&
- ($RSTART = length($`)+1)) != 0) {
- $currsrch = ($currsrch + $RSTART);
- if (substr($_, ($currsrch - 1), 1) eq '<') {
- if ($state) {
- if (!$erra) {
- $errstr = ($errstr .
- "&&^Multiple `<' without `>' ERROR!, Ignoring^&&\n");
- $erra = 1;}}
- else {
- if (($currsrch > length($_)) ||
- (substr($_, $currsrch, 1) =~ /^[ \t]$/)) {
- if (!$errb) {
- $errstr = ($errstr .
- "&&^Whitespace after `<': Bad SGML syntax ERROR!, Ignoring^&&\n");
- $errb = 1;}}
- else {
- if ($currsrch > ($txtbeg + 1)) {
- $line = ($line . substr($_, $txtbeg,
- ($currsrch - ($txtbeg + 1))));}
- $state = 1;}}}
- else {
- if (substr($_, ($currsrch - 1), 1) eq '>') {
- if ($state == 0) {
- next;}
- else {$txtbeg = $currsrch; $state = 0;}}
- else {print 'Internal error, ignore';}}}
- #At EOL:
- if ((!$state) && ($txtbeg <= length($_))) {
- $line = ($line . substr($_, $txtbeg));}
- if ($line =~ /?[-0-9a-zA-Z.]*;/) {
- foreach $X (keys %amp) {
- $s_ = $amp{$X}; $line =~ s/$X/$s_/g;
- if ($line !~ /&/) {
- last;}}
- $line =~ s/&(#38|amp);/&/g;}
- if (($line) || ((!$state) && ($_ =~ /^$/))) {
- if ((!$state) || ($errstr) || ($line =~ /[ \t]$/))
- {print $line;}
- else {printf "%s", $line;}}
- if ($errstr) {
- printf '%s', $errstr;}}
- #
- #Minor bug: &g<X>t; will translate to a `>' character!
- #
- #END routine:
- #
- if ($state) {
- print "&&^Was awaiting a `>' ERROR! at END^&&";}
- ##EOF
-