#! /bin/perl
#
# url.pl --- recognize, parse and retrieve URLs
#
# This package contains:
#
#   url'href:   identify URLs and turn them into hypertext links
#   url'get:    parse an URL and perform an http get
#   url'parse:  parse an URL and return ($type,$host,$port,$path,$request)
#   url'abs:    convert relative URLs to absolute ones
#   url'http:   perform an http request and return the result
#   url'gopher: perform a gopher request and return the result
#   url'ftp:    perform an ftp request and return the result
#
# Oscar Nierstrasz 26/8/93 oscar@cui.unige.ch
#
# 14/9/93 -- added url'gopher (not 100% stable) and url'ftp
#
# BUGS: relative paths work only if directories are always
# terminated with a "/" -- otherwise assumes the directory is
# just a filename and remembers the parent directory as the
# current path.
#
# Can't get $! to return error messages properly.
#
# NOTE(review): this copy of the file appears to have been run through an
# HTML-tag-stripping filter at some point -- every "<...>" sequence is
# missing (e.g. the <A HREF=...> replacement text in &href below).
# Recover a pristine copy before relying on the stripped parts.

package url;

# socket.ph supplies &AF_INET / &SOCK_STREAM; only needed for direct
# (non-proxy) connections.
require "sys/socket.ph" unless($att_proxy);

# unshift(@INC, "/homes/spaf/lib/perl");
#unshift(@INC, "/user/u1/oscar/Cmd/PerlLib");

# Gene Spafford's ftp package (and using the chat package).
# Added ftp'grab -- a variant of ftp'get that returns its result
# rather than writing to a local file.
require "ftplib.pl";

$user = getlogin;

# locals:
$host = undef;
$port = undef;
$request = undef;

# Direct-socket setup: resolve our own host once and pre-pack a local
# sockaddr to bind outgoing connections to.  Skipped entirely when going
# through the AT&T proxy.
unless ($att_proxy) {
    $sockaddr = 'S n a4 x8';
    chop($thishost = `hostname`);
    ($name, $aliases, $proto) = getprotobyname("tcp");
    ($name, $aliases, $type, $len, $thisaddr) = gethostbyname($thishost);
    $thissock = pack($sockaddr, &AF_INET, 0, $thisaddr);
}

# Try to recognize URLs and ftp file identifiers and convert them into HREFs.
# Operates on $_ in place.
# This routine is evolving.  The patterns are not perfect.
# This is really a parsing problem, and not a job for perl ...
# It is also generally impossible to distinguish ftp site names
# from newsgroup names if the ":" is missing.
# An arbitrary file name ("runtime.pl") can also be confused.
#
# NOTE(review): as written, every substitution below replaces the match
# with itself ("|$&|", "|$1$2|", "|$1$2:$4$3|", ...) and is therefore a
# no-op.  The replacement text evidently contained anchor markup of the
# form <A HREF=...>...</A> that the tag-stripping filter removed -- the
# markup must be restored from a pristine copy before this sub is useful.
sub href {
    # study;	# doesn't speed things up ...
    # to avoid special cases for beginning & end of line
    s|^|#|; s|$|#|;
    # URLs (the stripped replacements presumably wrapped each match in an
    # <A HREF="$&">...</A> anchor):
    s|(news:[\w.]+)|$&|g;
    s|(http:[\w/.:+\-]+)|$&|g;
    s|(file:[\w/.:+\-]+)|$&|g;
    s|(ftp:[\w/.:+\-]+)|$&|g;
    s|(wais:[\w/.:+\-]+)|$&|g;
    s|(gopher:[\w/.:+\-]+)|$&|g;
    s|(telnet:[\w/.:+\-]+)|$&|g;
    # s|(\w+://[\w/.:+\-]+)|$&|g;
    # catch some newsgroups to avoid confusion with sites:
    s|([^\w\-/.:@>])(alt\.[\w.+\-]+[\w+\-]+)|$1$2|g;
    s|([^\w\-/.:@>])(bionet\.[\w.+\-]+[\w+\-]+)|$1$2|g;
    s|([^\w\-/.:@>])(bit\.[\w.+\-]+[\w+\-]+)|$1$2|g;
    s|([^\w\-/.:@>])(comp\.[\w.+\-]+[\w+\-]+)|$1$2|g;
    s|([^\w\-/.:@>])(gnu\.[\w.+\-]+[\w+\-]+)|$1$2|g;
    s|([^\w\-/.:@>])(misc\.[\w.+\-]+[\w+\-]+)|$1$2|g;
    s|([^\w\-/.:@>])(news\.[\w.+\-]+[\w+\-]+)|$1$2|g;
    s|([^\w\-/.:@>])(rec\.[\w.+\-]+[\w+\-]+)|$1$2|g;
    # FTP locations (with directory):
    # anonymous@<site>:<path>
    s|(anonymous@)([a-zA-Z][\w.+\-]+\.[a-zA-Z]{2,}):(\s*)([\w\d+\-/.]+)|$1$2:$4$3|g;
    # ftp@<site>:<path>
    s|(ftp@)([a-zA-Z][\w.+\-]+\.[a-zA-Z]{2,}):(\s*)([\w\d+\-/.]+)|$1$2:$4$3|g;
    # bare <site>:<path>
    s|([^\w\-/.:@>])([a-zA-Z][\w.+\-]+\.[a-zA-Z]{2,}):(\s*)([\w\d+\-/.]+)|$1$2:$4$3|g;
    # NB: don't confuse an http server with a port number for
    # an FTP location!
    # internet number version -- <a.b.c.d>:<path>
    s|([^\w\-/.:@])(\d{2,}\.\d{2,}\.\d+\.\d+):([\w\d+\-/.]+)|$1$2:$3|g;
    # just the site name (assume two dots):
    s|([^\w\-/.:@>])([a-zA-Z][\w+\-]+\.[\w.+\-]+\.[a-zA-Z]{2,})([^\w\d\-/.:!])|$1$2$3|g;
    # NB: can be confused with newsgroup names!
    # .com has only one dot:
    s|([^\w\-/.:@>])([a-zA-Z][\w.+\-]+\.com)([^\w\-/.:])|$1$2$3|g;
    # just internet numbers:
    s|([^\w\-/.:@])(\d+\.\d+\.\d+\.\d+)([^\w\-/.:])|$1$2$3|g;
    # unfortunately inet numbers can easily be confused with
    # european telephone numbers ...
    s|^#||; s|#$||;
}

# Parse an URL, issue the request and return the result (the retrieved
# page, or undef).  Dispatches on the scheme to &http, &gopher or &ftp;
# $version is passed through to &http only.
sub get {
    local($url,$version) = @_;
    # The package globals $type/$host/$port/$path hold the context of the
    # previous request so that relative URLs can be resolved against it.
    ($type,$host,$port,$path,$request) = &parse($type,$host,$port,$path,$url);
    if ($host) {
	if ($type eq "http") { &http($host,$port,$request,$version); }
	elsif ($type eq "gopher") { &gopher($host,$port,$request); }
	elsif ($type eq "ftp") { &ftp($host,$request); }
	else { print STDERR "url'get: $type requests unimplemented\n"; }
    }
    else { undef; }
}

# Convert an URL to ($type,$host,$port,$path,$request).
# Given the previous type, host, port and path, will also handle relative
# URLs; returns five undefs when no host can be determined.
# NB: May need special processing for different service types (e.g., news)
sub parse {
    local($type,$host,$port,$path,$url) = @_;
    # absolute URL of the form "type://host..."
    if ($url =~ m|^(\w+)://(.*)|) {
	$type = $1;
	$host = $2;
	$port = &defport($type);
	$request = "/";	# default
	# split a trailing request path off the host part:
	($host =~ s|^([^/]+)(/.*)$|$1|) && ($request = $2);
	# an explicit :port overrides the scheme default:
	($host =~ s/:(\d+)$//) && ($port = $1);
	# $path is $request minus its last (file) component:
	($path = $request) =~ s|[^/]*$||;
    }
    else {
	# relative URL of the form "type:request"
	# (NOTE(review): the original comment text here was eaten by the
	# tag-stripper; wording reconstructed)
	if ($url =~ /^(\w+):(.*)/) { $type = $1; $request = $2; }
	# relative URL with no scheme at all
	else { $request = $url; }
	$request =~ s|^$|/|;
	$request =~ s|^([^/])|$path$1|;	# relative path
	# collapse "." and ".." components:
	$request =~ s|/\./|/|g;
	while ($request =~ m|/\.\./|) {
	    $request =~ s|[^/]*/\.\./||;
	}
	# assume previous host & port:
	unless ($host) {
	    # $! = "url'parse: no host for $url\n";
	    print STDERR "url'parse: no host for $url\n";
	    return (undef,undef,undef,undef,undef);
	}
    }
    ($type,$host,$port,$path,$request);
}

# convert relative http URLs to absolute ones:
# should be patched to handle HREFs w/o double quotes ...
# also need to handle inlined images!
# Convert relative HREFs in an http $page to absolute ones, rooted at the
# given $url.  Splits the page on "<A"/"<a" tags and rewrites each href
# attribute in place, then collapses "." and ".." path components.
#
# NOTE(review): everything from the lone `join("` below to the end of this
# region is corrupted -- the HTML-tag-stripping pass ate each "<...>"
# sequence, destroying the end of this sub (presumably `join("<A",@hrefs);`),
# an entire lost sub that rewrote IMG SRC attributes (its $srcs[$n] fragments
# survive), and all of `sub http` except its tail ($SIG{'ALRM'} reset,
# &'ipcclose, close(FS), returning $page).  The damaged text is preserved
# byte-for-byte on a single line; it cannot be reconstructed from this view
# and must be recovered from a pristine copy of url.pl.
sub abs {
    local($url,$page) = @_;
    ($type,$host,$port,$path,$request) = &parse(undef,undef,undef,undef,$url);
    $root = "http://$host:$port";
    @hrefs = split(/<[Aa]/,$page);
    $n = $[;	# honour a non-zero array base, if one is set
    while (++$n <= $#hrefs) {
	# absolute URLs ok:
	($hrefs[$n] =~ m|href\s*=\s*"http://|i) && next;
	($hrefs[$n] =~ m|href\s*=\s*"\w+:|i) && next;
	# relative URL from root:
	($hrefs[$n] =~ s|href\s*=\s*"/([^"]*)"|HREF="$root/$1"|i) && next;
	($hrefs[$n] =~ s|href\s*=\s*/([^>]*)>|HREF=$root/$1>|i) && next;
	# relative from $path:
	$hrefs[$n] =~ s|href\s*=\s*"([^/"][^"]*)"|HREF="$root$path$1"|i;
	$hrefs[$n] =~ s|href\s*=\s*([^/">][^>]*)>|HREF=$root$path$1>|i;
	# collapse relative paths:
	$hrefs[$n] =~ s|/\./|/|g;
	while ($hrefs[$n] =~ m|/\.\./|) { $hrefs[$n] =~ s|[^/]*/\.\./||; }
    }
    join("]*)>|SRC=$root/$1>|i) && next; # relative from $path: $srcs[$n] =~ s|SRC\s*=\s*"([^/"][^"]*)"|SRC="$root$path$1"|i; $srcs[$n] =~ s|SRC\s*=\s*([^/">][^>]*)>|SRC=$root$path$1>|i; # collapse relative paths: $srcs[$n] =~ s|/\./|/|g; while ($srcs[$n] =~ m|/\.\./|) { $srcs[$n] =~ s|[^/]*/\.\./||; } } join("; $SIG{'ALRM'} = "IGNORE"; !) { return undef; } &'ipcclose($s) if ($'att_proxy); close(FS); # With HTTP/1.0 would include MIME header $page; } # This doesn't always work -- gopher URLs sometimes contain # a leading file type in the pathname which must be stripped off. # needs work. URLs may also contain blanks, tabs and other nasties. # IS THIS THE RIGHT PROTOCOL FOR GOPHER???
# Perform a gopher request on $host:$port and return the raw result, or
# undef on connection failure / 30-second timeout.
# NB: the returned page will still contain a leading gopher type field --
# it is up to the caller to strip it off.
#
# NOTE(review): three repairs relative to the corrupted source text:
#   - the att_proxy test was inverted: the branch commented "Proxy code"
#     (ipcpath/ipcopen) sat under the NON-proxy case, contradicting both
#     `&'ipcclose($s) if ($'att_proxy)` below and the prelude's
#     `unless ($att_proxy)` socket setup;
#   - "unleess" typo corrected to "unless" inside the eval string;
#   - `$page = <FS>;` restored -- the "<FS>" had been eaten by a
#     tag-stripping pass (local($/) makes the read slurp the whole reply).
sub gopher {
    local($host,$port,$request) = @_;
    unless ($'att_proxy) {
	# Direct socket: resolve the target and bind a fresh tcp socket
	# to the local address prepared in the file prelude.
	($fqdn, $aliases, $type, $len, $thataddr) = gethostbyname($host);
	$that = pack($sockaddr, &AF_INET, $port, $thataddr);
	socket(FS, &AF_INET, &SOCK_STREAM, $proto) || return undef;
	bind(FS, $thissock) || return undef;
    }
    else {
	# Proxy code: go through the AT&T ipc library, trying a plain
	# tcp path first and falling back to the proxy service.
	local($ipcpath, $s);
	# $'ipcdebug = 1;
	$ipcpath = &'ipcpath($host, 'tcp', $port);
	$s = &'ipcopen($ipcpath, '');
	if ($s == -1) {
	    $ipcpath = &'ipcpath($host, 'proxy', $port);
	    $s = &'ipcopen($ipcpath, '');
	    die "Unable to open connection to host $host on port $port via tcp or proxy\n"
		if ($s == -1);
	}
	open(FS, "+<&$s");
    }
    # gopher doesn't need leading "/":
    $request =~ s|^/||;
    # try to strip off the gopher type ...
    ($request =~ s|^([I]?\d+)/||) && ($gtype = $1);
    local($/);	# undef $/ so the <FS> read below slurps the whole reply
    unless (eval q!
	$SIG{'ALRM'} = "url'timeout";
	alarm(30);
	unless ($'att_proxy) { connect(FS, $that) || return undef; }
	select(FS); $| = 1; select(STDOUT);
	print FS "$request\r\n";
	$page = <FS>;
	$SIG{'ALRM'} = "IGNORE";
    !) { return undef; }
    &'ipcclose($s) if ($'att_proxy);
    close(FS);
    # This return value will also contain a leading type field.
    # Should be stripped off by the calling routine ...
    $page;
}

# ftp'grab is a version of ftp'get that returns the page
# retrieved rather than writing it to a local file.
# Perhaps not so nice for big files, but what the heck.
sub ftp {
    local($host,$file) = @_;
    # "\@" so that Perl 5 does not interpolate @$thishost as an array and
    # silently drop the hostname; the password is the conventional
    # anonymous-ftp "user@host".
    &ftp'open($host, "ftp", "$user\@$thishost") || &fail;
    &ftp'type("i") || &fail;	# binary ("image") transfer mode
    $page = &ftp'grab($file) || &fail;
    &ftp'close;
    $page;
}

# Save the ftp error message, shut the connection down, then die with it
# (closing first would clobber &ftp'error, hence the temporary).
sub fail {
    $save = &ftp'error;
    &ftp'close;
    die $save;
}

# SIGALRM handler for the http/gopher timeouts: aborts the enclosing eval.
sub timeout { die "Timeout\n"; }

# default ports for the schemes &parse understands
sub defport {
    local($type) = @_;
    if ($type eq "http") { 80; }
    elsif ($type eq "gopher") { 70; }
    else { undef; }
}

1;