home *** CD-ROM | disk | FTP | other *** search
- #! /usr/bin/env python
-
- """A variant on webchecker that creates a mirror copy of a remote site."""
-
- __version__ = "$Revision: 1.9 $"
-
- import os
- import sys
- import string
- import urllib
- import getopt
-
- import webchecker
-
# Reduce an expanded RCS keyword such as "$Revision: 1.9 $" to the bare
# revision number ("1.9").  Anything that does not split into exactly three
# whitespace-separated fields is left untouched.
if __version__[0] == '$':
    _v = __version__.split()
    if len(_v) == 3:
        __version__ = _v[1]
-
def main():
    """Command-line entry point: mirror every root URL named in sys.argv.

    Options:
        -q  quiet; sets the verbosity to 0
        -v  verbose; each occurrence raises the verbosity by one

    Returns 2 after printing the getopt error and a usage line when the
    options cannot be parsed; otherwise returns None once the run is done.
    """
    # Parse options first so a usage error is reported before anything else
    # is touched.
    try:
        opts, args = getopt.getopt(sys.argv[1:], "qv")
    except getopt.error as msg:
        print(msg)
        print("usage: %s [-qv] ... [rooturl] ..." % sys.argv[0])
        return 2

    verbose = webchecker.VERBOSE
    for flag, _ in opts:
        if flag == "-q":
            verbose = 0
        elif flag == "-v":
            verbose = verbose + 1

    c = Sucker()
    c.setflags(verbose=verbose)
    # Identify ourselves to servers with our own name and version.
    c.urlopener.addheaders = [
        ('User-agent', 'websucker/%s' % __version__),
    ]
    for arg in args:
        print("Adding root %s" % (arg,))
        c.addroot(arg)
    print("Run...")
    c.run()
-
class Sucker(webchecker.Checker):
    """Checker variant that keeps a local copy of every page it fetches.

    Pages are stored under a directory named after the (lower-cased) host,
    at the path taken from the URL; see savefilename().  A page that already
    exists on disk is read from there instead of being fetched again.
    """

    # Flags read by the base class.
    # NOTE(review): presumably checkext = 0 disables checking of external
    # links -- confirm against webchecker.Checker.
    checkext = 0
    # Name-anchor checking is suppressed (see the note below).
    nonames = 1

    # SAM 11/13/99: in general, URLs are now URL pairs.
    # Since we've suppressed name anchor checking,
    # we can ignore the second dimension.

    def readhtml(self, url_pair):
        """Return (text, url) for the page named by url_pair[0].

        The page is read from the local mirror when a saved copy can be
        opened; otherwise it is fetched with self.openpage(), saved with
        savefile(), and its final URL (as reported by geturl()) replaces
        the requested one.  text is None when the page could not be opened
        or when self.checkforhtml() rejects it.
        """
        url = url_pair[0]
        text = None
        path = self.savefilename(url)
        try:
            f = open(path, "rb")
        except IOError:
            # No local copy yet: fetch it.
            f = self.openpage(url_pair)
            if f:
                # Always release the connection, even if reading fails.
                try:
                    info = f.info()
                    nurl = f.geturl()
                    text = f.read()
                finally:
                    f.close()
                if nurl != url:
                    # The opener reported a different URL; file the page
                    # under that one.
                    url = nurl
                    path = self.savefilename(url)
                self.savefile(text, path)
                if not self.checkforhtml(info, url):
                    text = None
        else:
            # Local copy found.  No headers are available, so checkforhtml()
            # is given an empty mapping.
            try:
                if self.checkforhtml({}, url):
                    text = f.read()
            finally:
                f.close()
        return text, url

    def savefile(self, text, path):
        """Write text to path, creating parent directories as needed.

        Failures (including failure to create the directories) are reported
        through self.message() rather than raised, so one unsaveable page
        does not abort the whole run.
        """
        dirname = os.path.dirname(path)
        try:
            makedirs(dirname)
            with open(path, "wb") as f:
                f.write(text)
        except (IOError, os.error) as msg:
            self.message("didn't save %s: %s", path, str(msg))
        else:
            self.message("saved %s", path)

    def savefilename(self, url):
        """Map url to the relative local path its contents are stored at.

        The result is "<host>/<path>" using the platform's separator, with
        the host lower-cased and stripped of any user and port parts.  An
        empty path, or one ending in "/", gets "index.html" appended.
        """
        rest = urllib.splittype(url)[1]
        host, path = urllib.splithost(rest)
        # splithost() returns None for the host when there is no "//host"
        # part; treat that as an empty host instead of failing below.
        host = host or ""
        host = urllib.splituser(host)[1]
        host = urllib.splitnport(host)[0]
        host = host.lower()
        # URLs come from remote pages and redirects: drop ".." segments so
        # they cannot climb out of the host's directory.
        # NOTE(review): this does not cover platform-specific separators
        # embedded in a single URL segment (e.g. "\\" on Windows).
        path = "/".join([seg for seg in path.split("/") if seg != ".."])
        path = path.lstrip("/")
        if not path or path[-1] == "/":
            path = path + "index.html"
        if os.sep != "/":
            path = os.sep.join(path.split("/"))
            if os.name == "mac":
                path = os.sep + path
        path = os.path.join(host, path)
        return path
-
def makedirs(dir):
    """Create directory dir, together with any missing parent directories.

    An empty dir is a no-op, as is a dir that already exists as a
    directory.  If dir exists but is a plain file, the file is moved to
    dir/index.html and a directory takes its place.  A trailing path
    separator on dir is tolerated.

    Raises os.error if a missing directory cannot be created.
    """
    if not dir:
        return
    if os.path.exists(dir):
        if not os.path.isdir(dir):
            # A file occupies the name we need as a directory: turn it into
            # that directory's index.html.
            backup = dir + ".bak"
            try:
                os.rename(dir, backup)
                os.mkdir(dir)
                os.rename(backup, os.path.join(dir, "index.html"))
            except os.error:
                # Best effort only; a later attempt to write below dir will
                # report the problem.
                pass
        return
    head, tail = os.path.split(dir)
    if not tail:
        if head and head != dir:
            # dir merely ended in a separator; create the part before it.
            makedirs(head)
        else:
            print("Huh? Don't know how to make dir %s" % (dir,))
        return
    makedirs(head)
    os.mkdir(dir, 0o777)
-
# Run as a script: main()'s return value becomes the exit status, with
# None (a normal finish) mapped to 0.
if __name__ == '__main__':
    status = main()
    sys.exit(status or 0)
-