# This is a shell archive.  Save it in a file, remove anything before
# this line, and then unpack it by entering "sh file".  Note, it may
# create directories; files and directories will be owned by you and
# have default permissions.
#
# This archive contains:
#
#	w3get
#	hget
#	ftplib.pl
#	url.pl
#
echo x - w3get
sed 's/^X//' >w3get << 'END-of-w3get'
X#!/usr/local/bin/perl
X#!/usr/local/bin/proxyperl
X#
X# w3get - point it at an http: url and it recursively retrieves href's
X# and img src's starting from that page
X#
X# Version 0.1 by Brooks Cutter (bcutter@paradyne.com) 2/5/94
X#
X# Usage: w3get [-d] [-v] <url>
X#
X# where <url> is a fully qualified url like http://host/file.html,
X# for example the Mosaic What's new page:
X# http://www.ncsa.uiuc.edu/SDG/Software/Mosaic/Docs/whats-new.html
X#
X# -d prints debugging information.
X# -v is verbose (prints a message for each url it descends)
X#
X#
X# I wrote this program a month ago in preparation for a presentation
X# on Mosaic and the World Wide Web.  I had a Sun there and wanted to
X# display parts of the web without using a slow PPP connection.
X# I haven't done anything with it since then (except today to document
X# it and clean it up) so I don't intend to develop it further until
X# a need arises.  Feel free to hack this up and pass it around.
X# (and pass me a copy please...)
X#
X#
X# If you are an AT&T site behind the proxy gateway, you will need
X# my version of proxyperl.  Email me for more info, and then set below to 1.
X$att_proxy = 0;
X# Uses Paradyne Automounter setup..
X$pdn = 1;
X
X# This string is prepended to the rewritten url's
X# It could also be a 'file://...' or 'ftp://...', etc...
X$redirect = 'http://wwwhome.paradyne.com/~bcutter/z';
X# directory where I can write my output to
X$outdir = "$ENV{'HOME'}/public_html/w3get";
Xmkdir($outdir, 0755) unless(-d $outdir);
X
Xdie "$0: <url>\n(like http://www.ncsa.uiuc.edu/SDG/Software/Mosaic/Docs/whats-new.html)\n" unless (@ARGV);
X
Xunshift(@INC,"/pdn/appl/perl/lib","/pdn/appl/perl/lib/sys") if ($pdn);
X
Xrequire 'url.pl';
Xrequire 'getopts.pl';
X
X&Getopts('dv');
X$'ipcdebug = 1 if (($opt_d) && ($att_proxy));
X
X#$version = "HTTP 1.0";
X
Xpush(@todo, @ARGV);
XFOREVER: while (1) {
X    #last unless(@todo);
X    print '@todo = ', scalar @todo, " ($todo[0])\n" if ($opt_d);
X    unless (@todo) {
X        last unless (@remote);
X        @todo = @remote; @remote = ();
X    }
X    $node_url = shift(@todo);
X    $seen{$node_url} = 1;   # So I only descend each url once...
X
X    print "Checking url $node_url\n" if ($opt_v);
X    unless ($node_url =~ m!^http://!) {
X        warn "Argument must be a fully qualified url (ie: http://host/file.html):\n$node_url\n";
X        next;
X    }
X    # If it's already pulled the page down, it shouldn't retrieve it
X    # again - but it needs to open it, parse the hyperlinks and then
X    # retrieve those if necessary.  Right now it pulls everything down
X    # whether it has it or not.
X    #
X    # ($url_fn,$url_dir) = &url2fndir($url);
X    # next if (-e "$outdir/$url_fn");
X    #
X    if ($page = &url'get($node_url,$version)) {
X        if ($node_url =~ /html/i) {
X            $page = &url'abs($node_url,$page);
X            # This should be combined into the one above, but it was a
X            # quick kludge (like this program)
X            $page = &url'img_abs($node_url,$page);
X        }
X    } else { warn "$!\n"; next; }
X    $node_host = '';
X    if ($node_url =~ m|^http://([^/]+)/?.*$|) {
X        $node_host = $1;
X    }
X
X    # I should really get the type from the HTTP/1.0 headers...
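X    # A sketch of how that might look (assumes $version is set to
X    # "HTTP 1.0" above, so &url'get returns the MIME header in front
X    # of the body; $header and $is_html are hypothetical names):
X    #
X    #   ($header, $page) = split(/\r?\n\r?\n/, $page, 2);
X    #   $is_html = ($header =~ m!Content-type:\s*text/html!i);
X    #
X    # Until then, the filename suffix test below has to do.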
X    if ($node_url =~ /html$/i) {
X        @links = &parse_html($page);
X        @http = &extract_http(@links);
X        for (@http) {
X            s/#.*//;                # Delete skipto marks
X            next if ($seen{$_});
X            next if (/htbin/);      # skip htbin
X            next if (/cgi.*bin/);   # skip cgi-bin
X            next if (/\?/);         # Skip argument urls...
X            #next unless (/paradyne.com/); # If you don't want to stray from a domain
X            if (($node_host) && ($node_url =~ m!http://$node_host!)) {
X                # Do local ones first
X                push(@todo, $_);
X            } else {
X                push(@remote, $_);
X            }
X            $seen{$_} = 1;
X        }
X        @links2 = &localize(@links);    # Should use pointers
X        &save_url($node_url, @links2);
X    } else {
X        &save_url($node_url, $page);
X    }
X    next;
X}
X
Xexit;
X
X
Xsub save_url {
X    local($url) = shift(@_);
X    local($url_fn, $url_dir) = &url2fndir($url);
X    return unless ($url_fn);
X    if ($url_dir) {
X        if ((-e "$outdir/$url_dir") && (!-d "$outdir/$url_dir")) {
X            # url was previously referenced like:
X            # http://host/directory - and thought it was a file when a
X            # directory index was generated.  So move it to index.html...
X            system("mv $outdir/$url_dir $outdir/$url_dir.index");
X            system("mkdir -p $outdir/$url_dir");
X            system("mv $outdir/$url_dir.index $outdir/$url_dir/index.html");
X        } elsif (!-e "$outdir/$url_dir") { system("mkdir -p $outdir/$url_dir"); }
X    }
X    print STDERR "Writing $url to $url_fn\n";
X    if (-e "$outdir/$url_fn") {
X        print STDERR "--->>> HEY, $url_fn already exists!\n";
X        return;
X    }
X    open(OUT, ">$outdir/$url_fn");
X    print OUT @_;
X    close(OUT);
X}
X
Xsub url2fndir {
X    local($url) = shift(@_);
X    return($cache_url_fn{$url},$cache_url_dir{$url})
X        if (($cache_url_fn{$url}) && ($cache_url_dir{$url}));
X    local($url_fn,$url_dir);
X
X    if ($url =~ m!^http://(.+)$!) {
X        $url_fn = $1;
X        $url_fn =~ tr/~/_/;
X        @url_dir = split(/\//, $url_fn); pop(@url_dir);
X        $url_dir = join('/',@url_dir);
X        $cache_url_fn{$url} = $url_fn;
X        $cache_url_dir{$url} = $url_dir;
X        return($url_fn,$url_dir);
X    }
X    return('');
X}
X
Xsub extract_http {
X    local($url);
X    local($_,@return);
X
X    for (@_) {
X        next unless ((/^<a/i) || (/^<img/i));
X        if ((/^<(a\s+.*href)=([^>]+)>$/i) || (/^<(img\s+.*src)=([^>]+)>$/i)) {
X            $cmd = $1;
X            $url = $2;
X            $url =~ tr/'"//d;   # Delete quotes
X
X            if (($url =~ /^http:/) || ($url =~ m!^[/a-zA-Z0-9]!)) {
X                push(@return, $url);
X            }
X            next;
X        }
X    }
X    return(@return);
X}
X
Xsub localize {
X    local($_,@return);
X    local(@r);
X
X    for (@_) {
X        unless ((/^<a/i) || (/^<img/i)) { push(@r, $_); next; }
X        if ((/^<(a\s+.*href)=([^>]+)>$/i) || (/^<(img\s+.*src)=([^>]+)>$/i)) {
X            $cmd = $1;
X            $url = $2;
X            $url =~ tr/'"//d;   # Delete quotes
X#print "localize found url $url\n";
X
X            if ($url =~ m!^http://(.+)$!) {
X                push(@r, "<$cmd=\"$redirect/$1\">");
X            } else {
X                push(@r, $_);   # keep non-http urls as they are
X            }
X            next;
X        }
X        push(@r, $_);   # tags that didn't parse pass through unchanged
X    }
X    return(@r);
X}
X
Xsub parse_html {
X    local(@data) = ();
X    local($save, $_, $lt, $gt, $data);
X    NEXTLINE: for (split(/\r/,$_[0])) {
X        $save .= $_;
X        if ((($lt = index($save,'<')) == -1) || (index($save,'>',$lt) == -1))
X            { next; }
X        $lt = $gt = 0;
X        while (($lt = index($save, '<', $gt)) >= $[) {
X            # This is the data *BEFORE* the '<'
X            if ($lt) {  # do if $save isn't /^</
X                push(@data, substr($save, 0, $lt));
X            }
X            $gt = index($save, '>', $lt);
X            if ($gt == -1) {
X                $save = substr($save, $lt);
X                next NEXTLINE;
X            }
X            # This is the data *INSIDE* the <>
X            $data = substr($save, $lt, ($gt-$lt+1));
X            push(@data, $data);
X            # trim off what has been consumed and rescan from the start
X            $save = substr($save, ($gt+1));
X            $lt = $gt = 0;
X        }
X    }
X    push(@data, $save);
X    return(@data);
X}
X
X# EOF
END-of-w3get
echo x - hget
sed 's/^X//' >hget << 'END-of-hget'
X#!/usr/local/bin/proxyperl -s
X#
X# hget --- get an html page from an http server
X#
X# Oscar Nierstrasz 26/8/93 oscar@cui.unige.ch
X#
X# An example of using url'get.
X# url'abs converts all relative URLs to absolute ones.
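X#
X# For example, to save the Mosaic What's new page (any http: URL works):
X#
X#   hget http://www.ncsa.uiuc.edu/SDG/Software/Mosaic/Docs/whats-new.html > whats-new.html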
X#
X# Warning: only http, ftp and gopher URLs are currently understood.
X#
X# See urlget (separate script) for correct handling of gopher protocol.
X
Xunshift(@INC,"/appl/perl/lib","/appl/perl/lib/sys");
Xrequire "url.pl";
X
Xdie "Usage: hget [-abs] <url> ...\n" unless $#ARGV >= 0;
X$timeout = 60;
X
Xif ($v) { $version = "HTTP 1.0"; }
X$abs = 1;
X
Xforeach $url (@ARGV) {
X    if ($page = &url'get($url,$version)) {
X        if ($abs) { $page = &url'abs($url,$page); }
X        if ($page) { print $page; }
X        else { print STDERR "hget: couldn't retrieve $url\n"; }
X    }
X    else { warn "$!\n"; }
X}
X
X__END__
X
END-of-hget
echo x - ftplib.pl
sed 's/^X//' >ftplib.pl << 'END-of-ftplib.pl'
X#
X# This is a set of ftp library routines using chat2.pl
X#
X# Return code information taken from RFC 959
X
X# Written by Gene Spafford
X# Last update: 10 April 92, Version 0.9
X#
X
X#
X# Most of these routines communicate over an open ftp channel
X# The channel is opened with the "ftp'open" call.
X#
X
X# MODIFIED by Oscar Nierstrasz
X# Sept. 14, 1993 -- added ftp'grab -- like ftp'get except that it returns
X# the file retrieved instead of writing it to a local file.
X
Xpackage ftp;
Xrequire "chat2.pl";
Xrequire "syscall.ph";
X
X
X###########################################################################
X#
X# The following are the variables local to this package.
X# I declare them all up front so I can remember what I called 'em.  :-)
X#
X###########################################################################
X
XLOCAL_VARS: {
X    $Control;
X    $Data_handle;
X    $Host;
X    $Myhost = "\0" x 65;
X    (syscall(&SYS_gethostname, $Myhost, 65) == 0) ||
X        die "Cannot 'gethostname' of local machine (in ftplib)\n";
X    $Myhost =~ s/\0*$//;
X    $NeedsCleanup;
X    $NeedsClose;
X    $ftp_error;
X    $ftp_matched;
X    $ftp_trans_flag;
X    @ftp_list;
X
X    local(@tmp) = getservbyname("ftp", "tcp");
X    ($FTP = $tmp[2]) ||
X        die "Unable to get service number for 'ftp' (in ftplib)!\n";
X
X    @std_actions = (
X        'TIMEOUT',
X        q($ftp_error = "Connection timed out for $Host!\n"; undef),
X        'EOF',
X        q($ftp_error = "Connection to $Host closed unexpectedly!\n"; undef)
X    );
X
X    @sigs = ('INT', 'HUP', 'TERM', 'QUIT');  # sigs we'll catch & terminate on
X}
X
X
X
X###########################################################################
X#
X# The following are intended to be the user-callable routines.
X# Each of these does one of the ftp keyword functions.
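X# A typical session might look like this (host and paths are
X# illustrative only):
X#
X#   &ftp'open("ftp.host.edu", "anonymous", "-user@somewhere") || die &ftp'error;
X#   &ftp'type("i");
X#   &ftp'cwd("/pub");
X#   &ftp'get("file.tar.Z", "/tmp/file.tar.Z") || warn &ftp'error;
X#   &ftp'close;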
X#
X###########################################################################
X
Xsub error {  ## Public
X    $ftp_error;
X}
X
X#######################################################
X
X# cd up a directory level
X
Xsub cdup {  ## Public
X    &do_ftp_cmd(200, "cdup");
X}
X
X#######################################################
X
X# close an open ftp connection
X
Xsub close {  ## Public
X    return unless $NeedsClose;
X    &do_ftp_cmd(221, "quit");
X    &chat'close($Control);
X    undef $NeedsClose;
X    &do_ftp_signals(0);
X}
X
X#######################################################
X
X# change remote directory
X
Xsub cwd {  ## Public
X    &do_ftp_cmd(250, "cwd", @_);
X}
X
X#######################################################
X
X# delete a remote file
X
Xsub delete {  ## Public
X    &do_ftp_cmd(250, "dele", @_);
X}
X
X#######################################################
X
X# get a directory listing of remote directory ("ls -l")
X
Xsub dir {  ## Public
X    &do_ftp_listing("list", @_);
X}
X
X#######################################################
X
X# get a remote file to a local file
X# get(remote[, local])
X
Xsub get {  ## Public
X    local($remote, $local) = @_;
X    ($local = $remote) unless $local;
X
X    unless (open(DFILE, ">$local")) {
X        $ftp_error = "Open of local file $local failed: $!";
X        return undef;
X    } else {
X        $NeedsCleanup = $local;
X    }
X
X    return undef unless &do_open_dport;     # Open a data channel
X    unless (&do_ftp_cmd(150, "retr $remote")) {
X        $ftp_error .= "\nFile $remote not fetched from $Host\n";
X        close DFILE;
X        unlink $local;
X        undef $NeedsCleanup;
X        return;
X    }
X
X    $ftp_trans_flag = 0;
X
X    do {
X        &chat'expect($Data_handle, 60,
X            '.|\n', q{print DFILE ($chat'thisbuf) ||
X                ($ftp_trans_flag = 3); undef $chat'S},
X            'EOF', '$ftp_trans_flag = 1',
X            'TIMEOUT', '$ftp_trans_flag = 2');
X    } until $ftp_trans_flag;
X
X    close DFILE;
X    &chat'close($Data_handle);  # Close the data channel
X
X    undef $NeedsCleanup;
X    if ($ftp_trans_flag > 1) {
X        unlink $local;
X        $ftp_error = "Unexpected " . ($ftp_trans_flag == 2 ? "timeout" :
X            ($ftp_trans_flag != 3 ? "failure" : "local write failure")) .
X            " getting $remote\n";
X    }
X
X    &do_ftp_cmd(226);
X}
X
X#######################################################
X
X# grab a remote file and return the result
X# grab(remote)
X# [like get but doesn't create a file]
X
Xsub grab {  ## Public
X    local($remote) = @_;
X    local($page);
X
X    return undef unless &do_open_dport;     # Open a data channel
X    unless (&do_ftp_cmd(150, "retr $remote")) {
X        $ftp_error .= "\nFile $remote not fetched from $Host\n";
X        undef $NeedsCleanup;
X        return;
X    }
X
X    $ftp_trans_flag = 0;
X
X    do {
X        &chat'expect($Data_handle, 60,
X            '.|\n', q{ $page .= $chat'thisbuf;
X                undef $chat'S},
X            'EOF', '$ftp_trans_flag = 1',
X            'TIMEOUT', '$ftp_trans_flag = 2');
X    } until $ftp_trans_flag;
X
X    &chat'close($Data_handle);  # Close the data channel (no local file here)
X
X    undef $NeedsCleanup;
X    if ($ftp_trans_flag > 1) {
X        $ftp_error = "Unexpected " .
X            ($ftp_trans_flag == 2 ? "timeout" : "failure" ) .
X " getting $remote\n"; X } X X &do_ftp_cmd(226); X $page; X} X X####################################################### X X# Do a simple name list ("ls") X Xsub list { ## Public X &do_ftp_listing("nlst", @_); X} X X####################################################### X X# Make a remote directory X Xsub mkdir { ## Public X &do_ftp_cmd(257, "mkd", @_); X} X X####################################################### X X# Open an ftp connection to remote host X Xsub open { ## Public X if ($NeedsClose) { X $ftp_error = "Connection still open to $Host!"; X return undef; X } X X $Host = shift(@_); X local($User, $Password, $Acct) = @_; X $User = "anonymous" unless $User; X $Password = "-" . $main'ENV{'USER'} . "@$Myhost" unless $Password; X $ftp_error = ''; X X unless($Control = &chat'open_port($Host, $FTP)) { X $ftp_error = "Unable to connect to $Host ftp port: $!"; X return undef; X } X X unless(&chat'expect($Control, 60, X "^220 .*\n", "1", X "^\d\d\d .*\n", "undef")) { X $ftp_error = "Error establishing control connection to $Host"; X &chat'close($Control); X return undef; X } X &do_ftp_signals($NeedsClose = 1); X X unless (&do_ftp_cmd(331, "user $User")) { X $ftp_error .= "\nUser command failed establishing connection to $Host"; X return undef; X } X X unless (&do_ftp_cmd("(230|332|202)", "pass $Password")) { X $ftp_error .= "\nPassword command failed establishing connection to $Host"; X return undef; X } X X return 1 unless $Acct; X X unless (&do_ftp_cmd("(230|202)", "pass $Password")) { X $ftp_error .= "\nAcct command failed establishing connection to $Host"; X return undef; X } X 1; X} X X####################################################### X X# Get name of current remote directory X Xsub pwd { ## Public X if (&do_ftp_cmd(257, "pwd")) { X $ftp_matched =~ m/^257 (.+)\r?\n/; X $1; X } else { X undef; X } X} X X####################################################### X X# Rename a remote file X Xsub rename { ## Public X local($from, $to) = @_; X X &do_ftp_cmd(350, "rnfr $from") && &do_ftp_cmd(250, "rnto $to"); X} X X####################################################### X X# Set transfer type X Xsub type { ## Public X &do_ftp_cmd(200, "type", @_); X} X X X########################################################################### X# X# The following are intended to be utility routines used only locally. X# Users should not call these directly. X# X########################################################################### X Xsub do_ftp_cmd { ## Private X local($okay, @commands, $val) = @_; X X $commands[0] && X &chat'print($Control, join(" ", @commands), "\r\n"); X X &chat'expect($Control, 60, X "^$okay .*\\n", '$ftp_matched = $&; 1', X '^(\d)\d\d .*\\n', '($String = $&) =~ y/\r\n//d; X $ftp_error = qq{Unexpected reply for ' . X "@commands" . ': $String}; X $1 > 3 ? 
X                undef : 1',
X        @std_actions
X    );
X}
X
X#######################################################
X
Xsub do_ftp_listing {  ## Private
X    local(@lcmd) = @_;
X    @ftp_list = ();
X    $ftp_trans_flag = 0;
X
X    return undef unless &do_open_dport;
X
X    return undef unless &do_ftp_cmd(150, @lcmd);
X    do {        # Following is grotty, but chat2 makes us do it
X        &chat'expect($Data_handle, 30,
X            "(.*)\r?\n", 'push(@ftp_list, $1)',
X            "EOF", '$ftp_trans_flag = 1');
X    } until $ftp_trans_flag;
X
X    &chat'close($Data_handle);
X    return undef unless &do_ftp_cmd(226);
X
X    grep(y/\r\n//d, @ftp_list);
X    @ftp_list;
X}
X
X#######################################################
X
Xsub do_open_dport {  ## Private
X    local(@foo, $port) = &chat'open_listen;
X    ($port, $Data_handle) = splice(@foo, 4, 2);
X
X    unless ($Data_handle) {
X        $ftp_error = "Unable to open data port: $!";
X        return undef;
X    }
X
X    push(@foo, $port >> 8, $port & 0xff);
X    local($myhost) = (join(',', @foo));
X
X    &do_ftp_cmd(200, "port $myhost");
X}
X
X#######################################################
X#
X# To cleanup after a problem
X#
X
Xsub do_ftp_abort {
X    die unless $NeedsClose;
X
X    &chat'print($Control, "abor", "\r\n");
X    &chat'close($Data_handle);
X    &chat'expect($Control, 10, '.', undef);
X    &chat'close($Control);
X
X    close DFILE;
X    unlink($NeedsCleanup) if $NeedsCleanup;
X    die;
X}
X
X#######################################################
X#
X# To set signals to do the abort properly
X#
X
Xsub do_ftp_signals {
X    local($flag, $sig) = @_;
X
X    local($old, $new) = ('DEFAULT', "ftp'do_ftp_abort");
X    $flag || (($old, $new) = ($new, $old));
X    foreach $sig (@sigs) {
X        ($SIG{$sig} eq $old) && ($SIG{$sig} = $new);
X    }
X}
X
X1;
END-of-ftplib.pl
echo x - url.pl
sed 's/^X//' >url.pl << 'END-of-url.pl'
X#! /bin/perl
X#
X# url.pl --- recognize, parse and retrieve URLs
X#
X# This package contains:
X#
X# url'href: identify URLs and turn them into hypertext links
X# url'get: parse an URL and perform an http get
X# url'parse: parse an URL and return ($type,$host,$port,$path,$request)
X# url'abs: convert relative URLs to absolute ones
X# url'img_abs: convert relative inlined image SRCs to absolute ones
X# url'http: perform an http request and return the result
X# url'gopher: perform a gopher request and return the result
X# url'ftp: perform an ftp request and return the result
X#
X# Oscar Nierstrasz 26/8/93 oscar@cui.unige.ch
X#
X# 14/9/93 -- added url'gopher (not 100% stable) and url'ftp
X#
X# BUGS: relative paths work only if directories are always
X# terminated with a "/" -- otherwise the last component is taken
X# for a filename and the parent directory is remembered as the
X# current path.
X#
X# Can't get $! to return error messages properly.
X
Xpackage url;
X
Xrequire "sys/socket.ph" unless ($'att_proxy);
X
X# unshift(@INC, "/homes/spaf/lib/perl");
X#unshift(@INC, "/user/u1/oscar/Cmd/PerlLib");
X
X# Gene Spafford's ftp package (which uses the chat package).
X# Added ftp'grab -- a variant of ftp'get that returns its result
X# rather than writing it to a local file.
Xrequire "ftplib.pl";
X
X$user = getlogin;
X
X# locals:
X$host = undef;
X$port = undef;
X$request = undef;
X
Xunless ($'att_proxy) {
X    $sockaddr = 'S n a4 x8';
X    chop($thishost = `hostname`);
X    ($name, $aliases, $proto) = getprotobyname("tcp");
X    ($name, $aliases, $type, $len, $thisaddr) = gethostbyname($thishost);
X    $thissock = pack($sockaddr, &AF_INET, 0, $thisaddr);
X}
X
X# Try to recognize URLs and ftp file identifiers and convert them into HREFs:
X# This routine is evolving.  The patterns are not perfect.
X# This is really a parsing problem, and not a job for perl ...
X# It is also generally impossible to distinguish ftp site names
X# from newsgroup names if the ":" is missing.
X# An arbitrary file name ("runtime.pl") can also be confused.
Xsub href {
X    # study; # doesn't speed things up ...
X
X    # to avoid special cases for beginning & end of line
X    s|^|#|; s|$|#|;
X
X    # URLS: <serv>:<rest>
X    s|(news:[\w.]+)|<A HREF="$&">$&</A>|g;
X    s|(http:[\w/.:+\-]+)|<A HREF="$&">$&</A>|g;
X    s|(file:[\w/.:+\-]+)|<A HREF="$&">$&</A>|g;
X    s|(ftp:[\w/.:+\-]+)|<A HREF="$&">$&</A>|g;
X    s|(wais:[\w/.:+\-]+)|<A HREF="$&">$&</A>|g;
X    s|(gopher:[\w/.:+\-]+)|<A HREF="$&">$&</A>|g;
X    s|(telnet:[\w/.:+\-]+)|<A HREF="$&">$&</A>|g;
X    # s|(\w+://[\w/.:+\-]+)|<A HREF="$&">$&</A>|g;
X
X    # catch some newsgroups to avoid confusion with sites:
X    s|([^\w\-/.:@>])(alt\.[\w.+\-]+[\w+\-]+)|$1<A HREF="news:$2">$2</A>|g;
X    s|([^\w\-/.:@>])(bionet\.[\w.+\-]+[\w+\-]+)|$1<A HREF="news:$2">$2</A>|g;
X    s|([^\w\-/.:@>])(bit\.[\w.+\-]+[\w+\-]+)|$1<A HREF="news:$2">$2</A>|g;
X    s|([^\w\-/.:@>])(comp\.[\w.+\-]+[\w+\-]+)|$1<A HREF="news:$2">$2</A>|g;
X    s|([^\w\-/.:@>])(gnu\.[\w.+\-]+[\w+\-]+)|$1<A HREF="news:$2">$2</A>|g;
X    s|([^\w\-/.:@>])(misc\.[\w.+\-]+[\w+\-]+)|$1<A HREF="news:$2">$2</A>|g;
X    s|([^\w\-/.:@>])(news\.[\w.+\-]+[\w+\-]+)|$1<A HREF="news:$2">$2</A>|g;
X    s|([^\w\-/.:@>])(rec\.[\w.+\-]+[\w+\-]+)|$1<A HREF="news:$2">$2</A>|g;
X
X    # FTP locations (with directory):
X    # anonymous@<site>:<path>
X    s|(anonymous@)([a-zA-Z][\w.+\-]+\.[a-zA-Z]{2,}):(\s*)([\w\d+\-/.]+)|$1<A HREF="file://$2/$4">$2:$4</A>$3|g;
X    # ftp@<site>:<path>
X    s|(ftp@)([a-zA-Z][\w.+\-]+\.[a-zA-Z]{2,}):(\s*)([\w\d+\-/.]+)|$1<A HREF="file://$2/$4">$2:$4</A>$3|g;
X    # <site>:<path>
X    s|([^\w\-/.:@>])([a-zA-Z][\w.+\-]+\.[a-zA-Z]{2,}):(\s*)([\w\d+\-/.]+)|$1<A HREF="file://$2/$4">$2:$4</A>$3|g;
X    # NB: don't confuse an http server with a port number for
X    # an FTP location!
X    # internet number version: <inet-num>:<path>
X    s|([^\w\-/.:@])(\d{2,}\.\d{2,}\.\d+\.\d+):([\w\d+\-/.]+)|$1<A HREF="file://$2/$3">$2:$3</A>|g;
X
X    # just the site name (assume two dots): <site>
X    s|([^\w\-/.:@>])([a-zA-Z][\w+\-]+\.[\w.+\-]+\.[a-zA-Z]{2,})([^\w\d\-/.:!])|$1<A HREF="file://$2">$2</A>$3|g;
X    # NB: can be confused with newsgroup names!
X    # <site>.com has only one dot:
X    s|([^\w\-/.:@>])([a-zA-Z][\w.+\-]+\.com)([^\w\-/.:])|$1<A HREF="file://$2">$2</A>$3|g;
X
X    # just internet numbers:
X    s|([^\w\-/.:@])(\d+\.\d+\.\d+\.\d+)([^\w\-/.:])|$1<A HREF="file://$2">$2</A>$3|g;
X    # unfortunately inet numbers can easily be confused with
X    # european telephone numbers ...
X
X    s|^#||; s|#$||;
X}
X
X# parse an URL, issue the request and return the result
Xsub get {
X    local($url,$version) = @_;
X    ($type,$host,$port,$path,$request) = &parse($type,$host,$port,$path,$url);
X    if ($host) {
X        if ($type eq "http") { &http($host,$port,$request,$version); }
X        elsif ($type eq "gopher") { &gopher($host,$port,$request); }
X        elsif ($type eq "ftp") { &ftp($host,$request); }
X        else { print STDERR "url'get: $type requests unimplemented\n"; }
X    }
X    else {
X        undef;
X    }
X}
X
X# convert an URL to ($type,$host,$port,$path,$request)
X# given previous type, host, port and path, will handle relative URLs
X# NB: May need special processing for different service types (e.g., news)
Xsub parse {
X    local($type,$host,$port,$path,$url) = @_;
X    if ($url =~ m|^(\w+)://(.*)|) {
X        $type = $1;
X        $host = $2;
X        $port = &defport($type);
X        $request = "/";         # default
X        ($host =~ s|^([^/]+)(/.*)$|$1|) && ($request = $2);
X        ($host =~ s/:(\d+)$//) && ($port = $1);
X        ($path = $request) =~ s|[^/]*$||;
X    }
X    else {
X        # relative URL of form "<type>:<request>"
X        if ($url =~ /^(\w+):(.*)/) {
X            $type = $1;
X            $request = $2;
X        }
X        # relative URL of form "<request>"
X        else { $request = $url; }
X        $request =~ s|^$|/|;
X        $request =~ s|^([^/])|$path$1|;     # relative path
X        $request =~ s|/\./|/|g;
X        while ($request =~ m|/\.\./|) {
X            $request =~ s|[^/]*/\.\./||;
X        }
X        # assume previous host & port:
X        unless ($host) {
X            # $! = "url'parse: no host for $url\n";
= "url'parse: no host for $url\n"; X print STDERR "url'parse: no host for $url\n"; X return (undef,undef,undef,undef,undef); X } X } X ($type,$host,$port,$path,$request); X} X X# convert relative http URLs to absolute ones: X# should be patched to handle HREFs w/o double quotes ... X# also need to handle inlined images! Xsub abs { X local($url,$page) = @_; X ($type,$host,$port,$path,$request) = &parse(undef,undef,undef,undef,$url); X $root = "http://$host:$port"; X @hrefs = split(/<[Aa]/,$page); X $n = $[; X while (++$n <= $#hrefs) { X # absolute URLs ok: X ($hrefs[$n] =~ m|href\s*=\s*"http://|i) && next; X ($hrefs[$n] =~ m|href\s*=\s*"\w+:|i) && next; X # relative URL from root: X ($hrefs[$n] =~ s|href\s*=\s*"/([^"]*)"|HREF="$root/$1"|i) && next; X ($hrefs[$n] =~ s|href\s*=\s*/([^>]*)>|HREF=$root/$1>|i) && next; X # relative from $path: X $hrefs[$n] =~ s|href\s*=\s*"([^/"][^"]*)"|HREF="$root$path$1"|i; X $hrefs[$n] =~ s|href\s*=\s*([^/">][^>]*)>|HREF=$root$path$1>|i; X # collapse relative paths: X $hrefs[$n] =~ s|/\./|/|g; X while ($hrefs[$n] =~ m|/\.\./|) { X $hrefs[$n] =~ s|[^/]*/\.\./||; X } X } X join("]*)>|SRC=$root/$1>|i) && next; X # relative from $path: X $srcs[$n] =~ s|SRC\s*=\s*"([^/"][^"]*)"|SRC="$root$path$1"|i; X $srcs[$n] =~ s|SRC\s*=\s*([^/">][^>]*)>|SRC=$root$path$1>|i; X # collapse relative paths: X $srcs[$n] =~ s|/\./|/|g; X while ($srcs[$n] =~ m|/\.\./|) { X $srcs[$n] =~ s|[^/]*/\.\./||; X } X } X join("; X $SIG{'ALRM'} = "IGNORE"; X !) { X return undef; X } X &'ipcclose($s) if ($'att_proxy); X close(FS); X # With HTTP/1.0 would include MIME header X $page; X} X X# This doesn't always work -- gopher URLs sometimes contain X# a leading file type in the pathname which must be stripped off. X# needs work. URLs may also contain blanks, tabs and other nasties. X# IS THIS THE RIGHT PROTOCOL FOR GOPHER??? Xsub gopher { X local($host,$port,$request) = @_; X if ($'att_proxy) { X ($fqdn, $aliases, $type, $len, $thataddr) = gethostbyname($host); X $that = pack($sockaddr, &AF_INET, $port, $thataddr); X socket(FS, &AF_INET, &SOCK_STREAM, $proto) || return undef; X bind(FS, $thissock) || return undef; X } else { X # Proxy code X local($ipcpath, $s); X #$'ipcdebug = 1; X $ipcpath = &'ipcpath($host, 'tcp', $port); X $s = &'ipcopen($ipcpath, ''); X if ($s == -1) { X $ipcpath = &'ipcpath($host, 'proxy', $port); X $s = &'ipcopen($ipcpath, ''); X die X "Unable to open connection to host $host on port $port via tcp or proxy\n" X if ($s == -1); X } X open(FS, "+<&$s"); X } X X # gopher doesn't need leading "/": X $request =~ s|^/||; X # try to strip off the gopher type ... X ($request =~ s|^([I]?\d+)/||) && ($gtype = $1); X local($/); X unless (eval q! X $SIG{'ALRM'} = "url'timeout"; X alarm(30); X unleess ($'att_proxy) { connect(FS, $that) || return undef; } X select(FS); $| = 1; select(STDOUT); X print FS "$request\r\n"; X $page = ; X $SIG{'ALRM'} = "IGNORE"; X !) { X return undef; X } X &'ipcclose($s) if ($'att_proxy); X close(FS); X # This return value will also contain a leading type field. X # Should be stripped off by the calling routine ... X $page; X} X X# ftp'grab is a version of ftp'get that returns the page X# retrieved rather than writing it to a local file. X# Perhaps not so nice for big files, but what the heck. 
Xsub ftp {
X    local($host,$file) = @_;
X    &ftp'open($host, "ftp", "$user@$thishost") || &fail;
X    &ftp'type("i") || &fail;
X    $page = &ftp'grab($file) || &fail;
X    &ftp'close;
X    $page;
X}
X
Xsub fail {
X    $save = &ftp'error;
X    &ftp'close;
X    die $save;
X}
X
Xsub timeout { die "Timeout\n"; }
X
X# default ports
Xsub defport {
X    local($type) = @_;
X    if ($type eq "http") { 80; }
X    elsif ($type eq "gopher") { 70; }
X    else { undef; }
X}
X
X1;
X
END-of-url.pl
chmod +x w3get hget
exit
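
A quick way to test url.pl on its own, once the archive is unpacked, is a
three-line script (assuming url.pl is in the current directory or on @INC;
it fetches the same example page mentioned in the w3get header, and any
http: URL will do):

	require "url.pl";
	$page = &url'get("http://www.ncsa.uiuc.edu/SDG/Software/Mosaic/Docs/whats-new.html");
	print $page if ($page);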