home *** CD-ROM | disk | FTP | other *** search
- #!/usr/local/bin/perl
-
- open (IN, "<$ARGV[0]") || die ("Can't open input file '$ARGV[0]'\n");
- open (OUT, ">$ARGV[1]") || die ("Can't open output file '$ARGV[1]'\n");
-
- # Parse the HTML clipboard format used by IE 4.0
- # Set $url = URL of page (overriden by BASE element if it exists)
- # $fragment = selected part of the HTML
- while (<IN>) {
- /SourceURL:([^\r\n]+)/ && ($source_url = $1);
- /<[Bb][Aa][Ss][Ee]\s+[^>]+[Hh][Rr][Ee][Ff]="?([^">]+)"?>"/ && ($base_url=$1);
-
-
- if (m@<[Tt][Ii][Tt][Ll][Ee]>(.*)</[Tt][Ii][Tt][Ll][Ee]>@) {
- $title = $1;
- } elsif (m@<[Tt][Ii][Tt][Ll][Ee]>@) {
- $title = $';
- $intitle = 1;
- } elsif (m@</[Tt][Ii][Tt][Ll][Ee]>@) {
- $title .= $`;
- $intitle = 0;
- } else {
- $title .= $_ if ($intitle);
- }
-
-
- if (/<!--StartFragment-->(.*)<!--EndFragment-->/) {
- $fragment = $1;
- } elsif (/<!--StartFragment-->/) {
- $fragment = $';
- $infrag = 1;
- } elsif (/<!--EndFragment-->/) {
- $fragment .= $`;
- $infrag = 0;
- } else {
- $fragment .= $_ if ($infrag);
- }
- }
-
- $base_url = $source_url unless (defined ($base_url));
-
- # Print source as title/URL pair
- $title = $source_url unless (defined ($title));
- $title = &cleanup ($title);
- print OUT "From: $title\n$source_url\n";
-
- #print STDERR "url = $url\n";
- #print STDERR "fragment = $fragment\n";
-
- $_ = $fragment;
-
-
- # Loop through all links of the form <A>...</A>
- while (m@<[Aa]\W[^>]*[Hh][Rr][Ee][Ff]="?([^">]+)"?[^>]*>(([^<]|<[^/]|</[^Aa]|</[Aa][^>])*)</[Aa]>@) {
-
- $_ = $'; # Next iteration of loop starts after this match
-
- # Convert link to an absolute URL
- $href = &relative ($base_url, $1);
-
- # Clean up link text (between <A> and </A>)
- $text = &cleanup ($2, $href);
-
- # Print the text/URL pair
- print OUT "$text\n$href\n";
- }
-
-
- # Convert $href (relative to $base) into an absolute URL
- sub relative {
- local ($base, $href) = @_;
-
- if ($href =~ /^http:|^ftp:|^file:|^mailto:|^gopher:|^https:/) {
- # Absolute reference
- return $href;
- } elsif ($href =~ m@^/@) {
- # Global reference to same host
- $base =~ m@^[^/\\]*[/\\]+[^/\\]*/@;
- return $& . substr ($href, 1);
- } else {
- # Local reference to same directory
- $base =~ m@[^/\\]*$@;
- return $` . $href;
- }
- }
-
- # Clean up text
- sub cleanup {
- local ($_, $url) = @_;
- local ($original) = $_;
-
- # Strip out all tags except <A> and </A>
- s@<[Bb][Rr]>@ @g;
- s@</?[^>]*>@@g;
-
- if (!/\w/) {
- # if no text left, it's probably an image. Try to find an ALT tag.
- if ($original =~ m@<[Ii][Mm][Gg][^>]*[Aa][Ll][Tt]="([^"]+)"[^>]*>@ # double-quoted ALT attribute
- || $original =~ m@<[Ii][Mm][Gg][^>]*[Aa][Ll][Tt]='([^']+)'[^>]*>@ # single-quoted ALT attribute
- || $original =~ m@<[Ii][Mm][Gg][^>]*[Aa][Ll][Tt]=([^ >]+)[^>]*>@ # unquoted ALT attribute
- ) {
- $_ = "Image [$1]";
- } else {
- $_ = "Image [$href]";
- }
- }
-
- # Replace common entities
- s@"@\"@g;
- s@&@&@g;
- s@<@<@g;
- s@>@>@g;
- s@ @ @g;
-
- # Delete weird chars inserted by IE4
- tr/\xC2\xA0/ /;
-
- # Compress whitespace
- tr/ \t\r\n/ /s;
-
- return $_;
- }
-