home *** CD-ROM | disk | FTP | other *** search
/ PC World 2001 April / PCWorld_2001-04_cd.bin / Software / TemaCD / webclean / !!!python!!! / BeOpen-Python-2.0.exe / ROBOTPARSER.PY < prev    next >
Encoding:
Python Source  |  2000-09-28  |  3.3 KB  |  98 lines

  1. """
  2.  
  3. Robots.txt file parser class.  Accepts a list of lines or robots.txt URL as
  4. input, builds a set of rules from that list, then answers questions about
  5. fetchability of other URLs.
  6.  
  7. """
  8.  
  9. class RobotFileParser:
  10.  
  11.     def __init__(self):
  12.         self.rules = {}
  13.         self.debug = 0
  14.         self.url = ''
  15.         self.last_checked = 0
  16.  
  17.     def mtime(self):
  18.         return self.last_checked
  19.  
  20.     def modified(self):
  21.         import time
  22.         self.last_checked = time.time()
  23.  
  24.     def set_url(self, url):
  25.         self.url = url
  26.  
  27.     def read(self):
  28.         import urllib
  29.         self.parse(urllib.urlopen(self.url).readlines())
  30.  
  31.     def parse(self, lines):
  32.         """parse the input lines from a robot.txt file"""
  33.         import string, re
  34.         active = []
  35.         for line in lines:
  36.             if self.debug: print '>', line,
  37.             # blank line terminates current record
  38.             if not line[:-1]:
  39.                 active = []
  40.                 continue
  41.             # remove optional comment and strip line
  42.             line = string.strip(line[:string.find(line, '#')])
  43.             if not line:
  44.                 continue
  45.             line = re.split(' *: *', line)
  46.             if len(line) == 2:
  47.                 line[0] = string.lower(line[0])
  48.                 if line[0] == 'user-agent':
  49.                     # this record applies to this user agent
  50.                     if self.debug: print '>> user-agent:', line[1]
  51.                     active.append(line[1])
  52.                     if not self.rules.has_key(line[1]):
  53.                         self.rules[line[1]] = []
  54.                 elif line[0] == 'disallow':
  55.                     if line[1]:
  56.                         if self.debug: print '>> disallow:', line[1]
  57.                         for agent in active:
  58.                             self.rules[agent].append(re.compile(line[1]))
  59.                     else:
  60.                         pass
  61.                         for agent in active:
  62.                             if self.debug: print '>> allow', agent
  63.                             self.rules[agent] = []
  64.                 else:
  65.                     if self.debug: print '>> unknown:', line
  66.  
  67.         self.modified()
  68.  
  69.     # returns true if agent is allowed to fetch url
  70.     def can_fetch(self, useragent, url):
  71.         """using the parsed robots.txt decide if useragent can fetch url"""
  72.         import urlparse
  73.         ag = useragent
  74.         if not self.rules.has_key(ag): ag = '*'
  75.         if not self.rules.has_key(ag):
  76.             if self.debug: print '>> allowing', url, 'fetch by', useragent
  77.             return 1
  78.         path = urlparse.urlparse(url)[2]
  79.         for rule in self.rules[ag]:
  80.             if rule.match(path) is not None:
  81.                 if self.debug: print '>> disallowing', url, 'fetch by', useragent
  82.                 return 0
  83.         if self.debug: print '>> allowing', url, 'fetch by', useragent
  84.         return 1
  85.  
  86. def _test():
  87.     rp = RobotFileParser()
  88.     rp.debug = 1
  89.     rp.set_url('http://www.musi-cal.com/robots.txt')
  90.     rp.read()
  91.     print rp.rules
  92.     print rp.can_fetch('*', 'http://www.musi-cal.com.com/')
  93.     print rp.can_fetch('Musi-Cal-Robot',
  94.                        'http://www.musi-cal.com/cgi-bin/event-search?city=San+Francisco')
  95.  
  96. if __name__ == "__main__":
  97.     _test()
  98.