Chip 2001 Mobile

home *** CD-ROM | disk | FTP | other *** search

/ Chip 2001 Mobile / Chip_Mobile_2001.iso / palm / tools / remoteco / remoteco.exe / clipboard.pl < prev next >

Wrap

Perl Script | 1999-03-11 | 3.1 KB | 123 lines

#!/usr/local/bin/perl open (IN, "<$ARGV[0]") || die ("Can't open input file '$ARGV[0]'\n"); open (OUT, ">$ARGV[1]") || die ("Can't open output file '$ARGV[1]'\n"); # Parse the HTML clipboard format used by IE 4.0 # Set $url = URL of page (overriden by BASE element if it exists) # $fragment = selected part of the HTML while (<IN>) { /SourceURL:([^\r\n]+)/ && ($source_url = $1); /<[Bb][Aa][Ss][Ee]\s+[^>]+[Hh][Rr][Ee][Ff]="?([^">]+)"?>"/ && ($base_url=$1); if (m@<[Tt][Ii][Tt][Ll][Ee]>(.*)</[Tt][Ii][Tt][Ll][Ee]>@) { $title = $1; } elsif (m@<[Tt][Ii][Tt][Ll][Ee]>@) { $title = $'; $intitle = 1; } elsif (m@</[Tt][Ii][Tt][Ll][Ee]>@) { $title .= $`; $intitle = 0; } else { $title .= $_ if ($intitle); } if (/(.*)/) { $fragment = $1; } elsif (//) { $fragment = $'; $infrag = 1; } elsif (//) { $fragment .= $`; $infrag = 0; } else { $fragment .= $_ if ($infrag); } } $base_url = $source_url unless (defined ($base_url)); # Print source as title/URL pair $title = $source_url unless (defined ($title)); $title = &cleanup ($title); print OUT "From: $title\n$source_url\n"; #print STDERR "url = $url\n"; #print STDERR "fragment = $fragment\n"; $_ = $fragment; # Loop through all links of the form <A>...</A> while (m@<[Aa]\W[^>]*[Hh][Rr][Ee][Ff]="?([^">]+)"?[^>]*>(([^<]|<[^/]|</[^Aa]|</[Aa][^>])*)</[Aa]>@) { $_ = $'; # Next iteration of loop starts after this match # Convert link to an absolute URL $href = &relative ($base_url, $1); # Clean up link text (between <A> and </A>) $text = &cleanup ($2, $href); # Print the text/URL pair print OUT "$text\n$href\n"; } # Convert $href (relative to $base) into an absolute URL sub relative { local ($base, $href) = @_; if ($href =~ /^http:|^ftp:|^file:|^mailto:|^gopher:|^https:/) { # Absolute reference return $href; } elsif ($href =~ m@^/@) { # Global reference to same host $base =~ m@^[^/\\]*[/\\]+[^/\\]*/@; return $& . substr ($href, 1); } else { # Local reference to same directory $base =~ m@[^/\\]*$@; return $` . $href; } } # Clean up text sub cleanup { local ($_, $url) = @_; local ($original) = $_; # Strip out all tags except <A> and </A> s@<[Bb][Rr]>@ @g; s@</?[^>]*>@@g; if (!/\w/) { # if no text left, it's probably an image. Try to find an ALT tag. if ($original =~ m@<[Ii][Mm][Gg][^>]*[Aa][Ll][Tt]="([^"]+)"[^>]*>@ # double-quoted ALT attribute || $original =~ m@<[Ii][Mm][Gg][^>]*[Aa][Ll][Tt]='([^']+)'[^>]*>@ # single-quoted ALT attribute || $original =~ m@<[Ii][Mm][Gg][^>]*[Aa][Ll][Tt]=([^ >]+)[^>]*>@ # unquoted ALT attribute ) { $_ = "Image [$1]"; } else { $_ = "Image [$href]"; } } # Replace common entities s@"@\"@g; s@&@&@g; s@<@<@g; s@>@>@g; s@ @ @g; # Delete weird chars inserted by IE4 tr/\xC2\xA0/ /; # Compress whitespace tr/ \t\r\n/ /s; return $_; }