home *** CD-ROM | disk | FTP | other *** search
/ PC World 2001 April / PCWorld_2001-04_cd.bin / Software / TemaCD / webclean / !!!python!!! / BeOpen-Python-2.0.exe / WEBSUCKER.PY < prev    next >
Encoding:
Python Source  |  2000-09-28  |  3.5 KB  |  127 lines

  1. #! /usr/bin/env python
  2.  
  3. """A variant on webchecker that creates a mirror copy of a remote site."""
  4.  
  5. __version__ = "$Revision: 1.9 $"
  6.  
  7. import os
  8. import sys
  9. import string
  10. import urllib
  11. import getopt
  12.  
  13. import webchecker
  14.  
  15. # Extract real version number if necessary
  16. if __version__[0] == '$':
  17.     _v = string.split(__version__)
  18.     if len(_v) == 3:
  19.         __version__ = _v[1]
  20.  
  21. def main():
  22.     verbose = webchecker.VERBOSE
  23.     try:
  24.         opts, args = getopt.getopt(sys.argv[1:], "qv")
  25.     except getopt.error, msg:
  26.         print msg
  27.         print "usage:", sys.argv[0], "[-qv] ... [rooturl] ..."
  28.         return 2
  29.     for o, a in opts:
  30.         if o == "-q":
  31.             verbose = 0
  32.         if o == "-v":
  33.             verbose = verbose + 1
  34.     c = Sucker()
  35.     c.setflags(verbose=verbose)
  36.     c.urlopener.addheaders = [
  37.             ('User-agent', 'websucker/%s' % __version__),
  38.         ]
  39.     for arg in args:
  40.         print "Adding root", arg
  41.         c.addroot(arg)
  42.     print "Run..."
  43.     c.run()
  44.  
  45. class Sucker(webchecker.Checker):
  46.  
  47.     checkext = 0
  48.     nonames = 1
  49.  
  50.     # SAM 11/13/99: in general, URLs are now URL pairs.
  51.     # Since we've suppressed name anchor checking,
  52.     # we can ignore the second dimension.
  53.  
  54.     def readhtml(self, url_pair):
  55.         url = url_pair[0]
  56.         text = None
  57.         path = self.savefilename(url)
  58.         try:
  59.             f = open(path, "rb")
  60.         except IOError:
  61.             f = self.openpage(url_pair)
  62.             if f:
  63.                 info = f.info()
  64.                 nurl = f.geturl()
  65.                 if nurl != url:
  66.                     url = nurl
  67.                     path = self.savefilename(url)
  68.                 text = f.read()
  69.                 f.close()
  70.                 self.savefile(text, path)
  71.                 if not self.checkforhtml(info, url):
  72.                     text = None
  73.         else:
  74.             if self.checkforhtml({}, url):
  75.                 text = f.read()
  76.             f.close()
  77.         return text, url
  78.  
  79.     def savefile(self, text, path):
  80.         dir, base = os.path.split(path)
  81.         makedirs(dir)
  82.         try:
  83.             f = open(path, "wb")
  84.             f.write(text)
  85.             f.close()
  86.             self.message("saved %s", path)
  87.         except IOError, msg:
  88.             self.message("didn't save %s: %s", path, str(msg))
  89.  
  90.     def savefilename(self, url):
  91.         type, rest = urllib.splittype(url)
  92.         host, path = urllib.splithost(rest)
  93.         while path[:1] == "/": path = path[1:]
  94.         user, host = urllib.splituser(host)
  95.         host, port = urllib.splitnport(host)
  96.         host = string.lower(host)
  97.         if not path or path[-1] == "/":
  98.             path = path + "index.html"
  99.         if os.sep != "/":
  100.             path = string.join(string.split(path, "/"), os.sep)
  101.             if os.name == "mac":
  102.                 path = os.sep + path
  103.         path = os.path.join(host, path)
  104.         return path
  105.  
  106. def makedirs(dir):
  107.     if not dir:
  108.         return
  109.     if os.path.exists(dir):
  110.         if not os.path.isdir(dir):
  111.             try:
  112.                 os.rename(dir, dir + ".bak")
  113.                 os.mkdir(dir)
  114.                 os.rename(dir + ".bak", os.path.join(dir, "index.html"))
  115.             except os.error:
  116.                 pass
  117.         return
  118.     head, tail = os.path.split(dir)
  119.     if not tail:
  120.         print "Huh?  Don't know how to make dir", dir
  121.         return
  122.     makedirs(head)
  123.     os.mkdir(dir, 0777)
  124.  
  125. if __name__ == '__main__':
  126.     sys.exit(main() or 0)
  127.