Chip 2011 November

home *** CD-ROM | disk | FTP | other *** search

/ Chip 2011 November / CHIP_2011_11.iso / Programy / Narzedzia / Calibre / calibre-0.8.18.msi / file_262 / HTMLParser.pyo (.txt) < prev next >

Wrap

Python Compiled Bytecode | 2011-09-09 | 9.8 KB | 315 lines

# Source Generated with Decompyle++
# File: in.pyo (Python 2.7)
"""Event-driven parser for HTML and XHTML text.

Reconstructed from decompiled bytecode of the Python 2.7 ``HTMLParser``
module.  Subclass :class:`HTMLParser` and override the ``handle_*`` methods
to receive parse events; push text in with ``feed()`` and finish with
``close()``.
"""

try:
    import markupbase
except ImportError:  # Python 3 ships the same base class as ``_markupbase``
    import _markupbase as markupbase
import re

try:
    _unichr = unichr
except NameError:  # Python 3: chr() already returns a text character
    _unichr = chr

# Regular expressions used for parsing.

interesting_normal = re.compile(r'[&<]')
# Inside <script>/<style> only an end tag (or end of buffer) is interesting.
interesting_cdata = re.compile(r'<(/|\Z)')
incomplete = re.compile(r'&[a-zA-Z#]')

entityref = re.compile(r'&([a-zA-Z][-.a-zA-Z0-9]*)[^a-zA-Z0-9]')
charref = re.compile(r'&#(?:[0-9]+|[xX][0-9a-fA-F]+)[^0-9a-fA-F]')

starttagopen = re.compile(r'<[a-zA-Z]')
piclose = re.compile(r'>')
commentclose = re.compile(r'--\s*>')
tagfind = re.compile(r'[a-zA-Z][-.a-zA-Z0-9:_]*')
# Group 1: attribute name, group 2: "= value" (optional), group 3: value.
# The bare-value character class includes "(" and ")".
attrfind = re.compile(
    r'\s*([a-zA-Z_][-.:a-zA-Z_0-9]*)(\s*=\s*'
    r'(\'[^\']*\'|"[^"]*"|[-a-zA-Z0-9./,:;+*%?!&$\(\)_#=~@]*))?')

locatestarttagend = re.compile(r"""
  <[a-zA-Z][-.a-zA-Z0-9:_]*          # tag name
  (?:\s+                             # whitespace before attribute name
    (?:[a-zA-Z_][-.:a-zA-Z0-9_]*     # attribute name
      (?:\s*=\s*                     # value indicator
        (?:'[^']*'                   # LITA-enclosed value
          |\"[^\"]*\"                # LIT-enclosed value
          |[^'\">\s]+                # bare value
         )
       )?
     )
   )*
  \s*                                # trailing whitespace
""", re.VERBOSE)
endendtag = re.compile(r'>')
endtagfind = re.compile(r'</\s*([a-zA-Z][-.a-zA-Z0-9:_]*)\s*>')


class HTMLParseError(Exception):
    """Exception raised by ``HTMLParser.error()`` for parse errors.

    Attributes:
        msg: the error message.
        lineno: first item of *position* (``None`` when unknown).
        offset: second item of *position*, a 0-based column
            (``None`` when unknown).
    """

    def __init__(self, msg, position=(None, None)):
        self.msg = msg
        self.lineno = position[0]
        self.offset = position[1]

    def __str__(self):
        """Return the message, followed by line/column when they are known."""
        result = self.msg
        if self.lineno is not None:
            result = result + ', at line %d' % self.lineno
        if self.offset is not None:
            # The stored offset is 0-based; report a 1-based column.
            result = result + ', column %d' % (self.offset + 1)
        return result


class HTMLParser(markupbase.ParserBase):
    """Find tags and other markup and call handler methods.

    Usage: create an instance, call ``feed(data)`` any number of times, then
    ``close()``.  Start tags are reported through ``handle_starttag(tag,
    attrs)`` (or ``handle_startendtag`` for ``<tag ... />``), end tags
    through ``handle_endtag(tag)``, text through ``handle_data(data)``, and
    entity / character references in text through ``handle_entityref(name)``
    / ``handle_charref(name)``.  All handlers are no-ops by default.
    """

    # Elements whose content is scanned in CDATA mode (only "</" ends it).
    CDATA_CONTENT_ELEMENTS = ('script', 'style')

    def __init__(self):
        """Initialize and reset this instance."""
        self.reset()

    def reset(self):
        """Reset this instance.  Loses all unprocessed data."""
        self.rawdata = ''
        self.lasttag = '???'
        self.interesting = interesting_normal
        markupbase.ParserBase.reset(self)

    def feed(self, data):
        """Append *data* to the buffer and parse as much of it as possible."""
        self.rawdata = self.rawdata + data
        self.goahead(0)

    def close(self):
        """Handle any buffered data, treating the buffer end as end of input."""
        self.goahead(1)

    def error(self, message):
        """Raise HTMLParseError for *message* at the current position."""
        raise HTMLParseError(message, self.getpos())

    __starttag_text = None

    def get_starttag_text(self):
        """Return the full source text of the most recent start tag."""
        return self.__starttag_text

    def set_cdata_mode(self):
        """Switch scanning to CDATA mode (used inside script/style)."""
        self.interesting = interesting_cdata

    def clear_cdata_mode(self):
        """Switch scanning back to normal mode."""
        self.interesting = interesting_normal

    def goahead(self, end):
        """Parse the buffered data and fire handler events.

        Args:
            end: true when called from ``close()``; incomplete constructs are
                then an error and trailing text is flushed as data.

        Whatever could not be parsed yet stays in ``self.rawdata``.
        """
        rawdata = self.rawdata
        startswith = rawdata.startswith
        i = 0
        n = len(rawdata)
        while i < n:
            match = self.interesting.search(rawdata, i)  # "<" or "&"
            if match:
                j = match.start()
            else:
                j = n
            if i < j:
                self.handle_data(rawdata[i:j])
            i = self.updatepos(i, j)
            if i == n:
                break
            if startswith('<', i):
                if starttagopen.match(rawdata, i):  # "<" + letter
                    k = self.parse_starttag(i)
                elif startswith('</', i):
                    k = self.parse_endtag(i)
                elif startswith('<!--', i):
                    k = self.parse_comment(i)
                elif startswith('<?', i):
                    k = self.parse_pi(i)
                elif startswith('<!', i):
                    k = self.parse_declaration(i)
                elif i + 1 < n:
                    # A lone "<" that starts no construct is plain text.
                    self.handle_data('<')
                    k = i + 1
                else:
                    break
                if k < 0:
                    # Construct is incomplete; wait for more data unless
                    # this is the final pass.
                    if end:
                        self.error('EOF in middle of construct')
                    break
                i = self.updatepos(i, k)
            elif startswith('&#', i):
                match = charref.match(rawdata, i)
                if match:
                    name = match.group()[2:-1]
                    self.handle_charref(name)
                    k = match.end()
                    # The pattern consumes one terminating character; give
                    # it back unless it was the ";".
                    if not startswith(';', k - 1):
                        k = k - 1
                    i = self.updatepos(i, k)
                    continue
                if ';' in rawdata[i:]:
                    # Malformed reference: bail out by consuming just the
                    # "&#" at the current position as text.
                    self.handle_data(rawdata[i:i + 2])
                    i = self.updatepos(i, i + 2)
                break
            elif startswith('&', i):
                match = entityref.match(rawdata, i)
                if match:
                    name = match.group(1)
                    self.handle_entityref(name)
                    k = match.end()
                    if not startswith(';', k - 1):
                        k = k - 1
                    i = self.updatepos(i, k)
                    continue
                match = incomplete.match(rawdata, i)
                if match:
                    # match.group() holds at least two characters.
                    if end and match.group() == rawdata[i:]:
                        self.error('EOF in middle of entity or char ref')
                    # Possibly incomplete reference; wait for more data.
                    break
                elif i + 1 < n:
                    # Not at the buffer end and not confusable with another
                    # construct: a literal ampersand.
                    self.handle_data('&')
                    i = self.updatepos(i, i + 1)
                else:
                    break
            else:
                # Only reachable if self.interesting matched something other
                # than "<" or "&"; fail loudly instead of looping forever.
                raise AssertionError('interesting.search() lied')
        if end and i < n:
            self.handle_data(rawdata[i:n])
            i = self.updatepos(i, n)
        self.rawdata = rawdata[i:]

    def parse_pi(self, i):
        """Parse a processing instruction starting at index *i*.

        Returns the index just past the closing ">", or -1 if it is not in
        the buffer yet.
        """
        rawdata = self.rawdata
        match = piclose.search(rawdata, i + 2)  # ">"
        if not match:
            return -1
        j = match.start()
        self.handle_pi(rawdata[i + 2:j])
        return match.end()

    def parse_starttag(self, i):
        """Parse the start tag at index *i* and dispatch the handler.

        Returns the index just past the tag, or a negative value when the
        tag is not complete in the buffer.  Calls ``error()`` when
        unparseable characters precede the closing ">" or "/>".
        """
        self.__starttag_text = None
        endpos = self.check_for_whole_start_tag(i)
        if endpos < 0:
            return endpos
        rawdata = self.rawdata
        self.__starttag_text = rawdata[i:endpos]
        attrs = []
        match = tagfind.match(rawdata, i + 1)
        k = match.end()
        self.lasttag = tag = rawdata[i + 1:k].lower()

        while k < endpos:
            m = attrfind.match(rawdata, k)
            if not m:
                break
            attrname, rest, attrvalue = m.group(1, 2, 3)
            if not rest:
                # Attribute given without "=value".
                attrvalue = None
            elif (attrvalue[:1] == '\'' == attrvalue[-1:] or
                  attrvalue[:1] == '"' == attrvalue[-1:]):
                # Quoted value: strip the quotes and expand references.
                attrvalue = attrvalue[1:-1]
                attrvalue = self.unescape(attrvalue)
            attrs.append((attrname.lower(), attrvalue))
            k = m.end()

        end = rawdata[k:endpos].strip()
        if end not in ('>', '/>'):
            self.error('junk characters in start tag: %r'
                       % (rawdata[k:endpos][:20],))
        if end.endswith('/>'):
            # XHTML-style empty tag: <span attr="value" />
            self.handle_startendtag(tag, attrs)
        else:
            self.handle_starttag(tag, attrs)
            if tag in self.CDATA_CONTENT_ELEMENTS:
                self.set_cdata_mode()
        return endpos

    def check_for_whole_start_tag(self, i):
        """Return the end index of the start tag at *i*, or -1 if incomplete.

        Calls ``error()`` when the text after the attributes cannot begin a
        valid continuation of the tag.
        """
        rawdata = self.rawdata
        m = locatestarttagend.match(rawdata, i)
        if m:
            j = m.end()
            next_char = rawdata[j:j + 1]
            if next_char == '>':
                return j + 1
            if next_char == '/':
                if rawdata.startswith('/>', j):
                    return j + 2
                if rawdata.startswith('/', j):
                    # Buffer boundary: the ">" may still arrive.
                    # NOTE(review): this test is always true here, so the
                    # "malformed empty start tag" path below is unreachable.
                    return -1
                self.updatepos(i, j + 1)
                self.error('malformed empty start tag')
            if next_char == '':
                # End of buffered input.
                return -1
            if next_char in ('abcdefghijklmnopqrstuvwxyz=/'
                             'ABCDEFGHIJKLMNOPQRSTUVWXYZ'):
                # Input ends in or before an attribute value, or this is
                # the "/" of a "/>" ending.
                return -1
            self.updatepos(i, j)
            self.error('malformed start tag')
        raise AssertionError('we should not get here!')

    def parse_endtag(self, i):
        """Parse the end tag at index *i* and call ``handle_endtag``.

        Returns the index just past the ">", or -1 if it is not in the
        buffer yet.  Calls ``error()`` for a malformed end tag.
        """
        rawdata = self.rawdata
        match = endendtag.search(rawdata, i + 1)  # ">"
        if not match:
            return -1
        j = match.end()
        match = endtagfind.match(rawdata, i)  # "</" + tag + ">"
        if not match:
            self.error('bad end tag: %r' % (rawdata[i:j],))
        tag = match.group(1)
        self.handle_endtag(tag.lower())
        self.clear_cdata_mode()
        return j

    def handle_startendtag(self, tag, attrs):
        """Handle ``<tag ... />``; by default a start tag then an end tag."""
        self.handle_starttag(tag, attrs)
        self.handle_endtag(tag)

    def handle_starttag(self, tag, attrs):
        """Overridable: called with the lowercased tag and (name, value) pairs."""
        pass

    def handle_endtag(self, tag):
        """Overridable: called with the lowercased tag name of an end tag."""
        pass

    def handle_charref(self, name):
        """Overridable: called with the text between "&#" and the terminator."""
        pass

    def handle_entityref(self, name):
        """Overridable: called with the entity name following "&"."""
        pass

    def handle_data(self, data):
        """Overridable: called with a run of text."""
        pass

    def handle_comment(self, data):
        """Overridable: comment handler (no-op by default)."""
        pass

    def handle_decl(self, decl):
        """Overridable: declaration handler (no-op by default)."""
        pass

    def handle_pi(self, data):
        """Overridable: called with the text between "<?" and ">"."""
        pass

    def unknown_decl(self, data):
        """Report an unknown declaration as a parse error."""
        self.error('unknown declaration: %r' % (data,))

    # Lazily built name -> character table shared by all instances.
    entitydefs = None

    def unescape(self, s):
        """Return *s* with ``&name;`` and ``&#N;`` references expanded.

        Unknown entity names and malformed numeric references are left in
        the text unchanged.
        """
        if '&' not in s:
            return s

        def replaceEntities(match):
            ref = match.group(1)
            if ref[0] == '#':
                digits = ref[1:]
                try:
                    if digits[0] in ('x', 'X'):
                        code = int(digits[1:], 16)
                    else:
                        code = int(digits)
                    return _unichr(code)
                except (ValueError, OverflowError):
                    # Not a valid number / code point: keep the source text.
                    return '&#' + digits + ';'
            # name2codepoint is not used directly because "apos" is
            # supported here as well.
            if HTMLParser.entitydefs is None:
                try:
                    import htmlentitydefs
                except ImportError:  # Python 3 location
                    from html import entities as htmlentitydefs
                entitydefs = HTMLParser.entitydefs = {'apos': u"'"}
                for k, v in htmlentitydefs.name2codepoint.items():
                    entitydefs[k] = _unichr(v)
            try:
                return self.entitydefs[ref]
            except KeyError:
                return '&' + ref + ';'

        return re.sub(r'&(#?[xX]?(?:[0-9a-fA-F]+|\w{1,8}));',
                      replaceEntities, s)