home *** CD-ROM | disk | FTP | other *** search
/ PC World 2005 June / PCWorld_2005-06_cd.bin / software / vyzkuste / firewally / firewally.exe / framework-2.3.exe / markupbase.py < prev    next >
Text File  |  2003-12-30  |  14KB  |  384 lines

  1. """Shared support for scanning document type declarations in HTML and XHTML."""
  2.  
  3. import re
  4.  
  5. _declname_match = re.compile(r'[a-zA-Z][-_.a-zA-Z0-9]*\s*').match
  6. _declstringlit_match = re.compile(r'(\'[^\']*\'|"[^"]*")\s*').match
  7. _commentclose = re.compile(r'--\s*>')
  8. _markedsectionclose = re.compile(r']\s*]\s*>')
  9.  
  10. # An analysis of the MS-Word extensions is available at
  11. # http://www.planetpublish.com/xmlarena/xap/Thursday/WordtoXML.pdf
  12.  
  13. _msmarkedsectionclose = re.compile(r']\s*>')
  14.  
  15. del re
  16.  
  17.  
  18. class ParserBase:
  19.     """Parser base class which provides some common support methods used
  20.     by the SGML/HTML and XHTML parsers."""
  21.  
  22.     def __init__(self):
  23.         if self.__class__ is ParserBase:
  24.             raise RuntimeError(
  25.                 "markupbase.ParserBase must be subclassed")
  26.  
  27.     def error(self, message):
  28.         raise NotImplementedError(
  29.             "subclasses of ParserBase must override error()")
  30.  
  31.     def reset(self):
  32.         self.lineno = 1
  33.         self.offset = 0
  34.  
  35.     def getpos(self):
  36.         """Return current line number and offset."""
  37.         return self.lineno, self.offset
  38.  
  39.     # Internal -- update line number and offset.  This should be
  40.     # called for each piece of data exactly once, in order -- in other
  41.     # words the concatenation of all the input strings to this
  42.     # function should be exactly the entire input.
  43.     def updatepos(self, i, j):
  44.         if i >= j:
  45.             return j
  46.         rawdata = self.rawdata
  47.         nlines = rawdata.count("\n", i, j)
  48.         if nlines:
  49.             self.lineno = self.lineno + nlines
  50.             pos = rawdata.rindex("\n", i, j) # Should not fail
  51.             self.offset = j-(pos+1)
  52.         else:
  53.             self.offset = self.offset + j-i
  54.         return j
  55.  
  56.     _decl_otherchars = ''
  57.  
  58.     # Internal -- parse declaration (for use by subclasses).
  59.     def parse_declaration(self, i):
  60.         # This is some sort of declaration; in "HTML as
  61.         # deployed," this should only be the document type
  62.         # declaration ("<!DOCTYPE html...>").
  63.         # ISO 8879:1986, however, has more complex
  64.         # declaration syntax for elements in <!...>, including:
  65.         # --comment--
  66.         # [marked section]
  67.         # name in the following list: ENTITY, DOCTYPE, ELEMENT,
  68.         # ATTLIST, NOTATION, SHORTREF, USEMAP,
  69.         # LINKTYPE, LINK, IDLINK, USELINK, SYSTEM
  70.         rawdata = self.rawdata
  71.         j = i + 2
  72.         assert rawdata[i:j] == "<!", "unexpected call to parse_declaration"
  73.         if rawdata[j:j+1] in ("-", ""):
  74.             # Start of comment followed by buffer boundary,
  75.             # or just a buffer boundary.
  76.             return -1
  77.         # A simple, practical version could look like: ((name|stringlit) S*) + '>'
  78.         n = len(rawdata)
  79.         if rawdata[j:j+1] == '--': #comment
  80.             # Locate --.*-- as the body of the comment
  81.             return self.parse_comment(i)
  82.         elif rawdata[j] == '[': #marked section
  83.             # Locate [statusWord [...arbitrary SGML...]] as the body of the marked section
  84.             # Where statusWord is one of TEMP, CDATA, IGNORE, INCLUDE, RCDATA
  85.             # Note that this is extended by Microsoft Office "Save as Web" function
  86.             # to include [if...] and [endif].
  87.             return self.parse_marked_section(i)
  88.         else: #all other declaration elements
  89.             decltype, j = self._scan_name(j, i)
  90.         if j < 0:
  91.             return j
  92.         if decltype == "doctype":
  93.             self._decl_otherchars = ''
  94.         while j < n:
  95.             c = rawdata[j]
  96.             if c == ">":
  97.                 # end of declaration syntax
  98.                 data = rawdata[i+2:j]
  99.                 if decltype == "doctype":
  100.                     self.handle_decl(data)
  101.                 else:
  102.                     self.unknown_decl(data)
  103.                 return j + 1
  104.             if c in "\"'":
  105.                 m = _declstringlit_match(rawdata, j)
  106.                 if not m:
  107.                     return -1 # incomplete
  108.                 j = m.end()
  109.             elif c in "abcdefghijklmnopqrstuvwxyzABCDEFGHIJKLMNOPQRSTUVWXYZ":
  110.                 name, j = self._scan_name(j, i)
  111.             elif c in self._decl_otherchars:
  112.                 j = j + 1
  113.             elif c == "[":
  114.                 # this could be handled in a separate doctype parser
  115.                 if decltype == "doctype":
  116.                     j = self._parse_doctype_subset(j + 1, i)
  117.                 elif decltype in ("attlist", "linktype", "link", "element"):
  118.                     # must tolerate []'d groups in a content model in an element declaration
  119.                     # also in data attribute specifications of attlist declaration
  120.                     # also link type declaration subsets in linktype declarations
  121.                     # also link attribute specification lists in link declarations
  122.                     self.error("unsupported '[' char in %s declaration" % decltype)
  123.                 else:
  124.                     self.error("unexpected '[' char in declaration")
  125.             else:
  126.                 self.error(
  127.                     "unexpected %s char in declaration" % `rawdata[j]`)
  128.             if j < 0:
  129.                 return j
  130.         return -1 # incomplete
  131.  
  132.     # Internal -- parse a marked section
  133.     # Override this to handle MS-word extension syntax <![if word]>content<![endif]>
  134.     def parse_marked_section( self, i, report=1 ):
  135.         rawdata= self.rawdata
  136.         assert rawdata[i:i+3] == '<![', "unexpected call to parse_marked_section()"
  137.         sectName, j = self._scan_name( i+3, i )
  138.         if j < 0:
  139.             return j
  140.         if sectName in ("temp", "cdata", "ignore", "include", "rcdata"):
  141.             # look for standard ]]> ending
  142.             match= _markedsectionclose.search(rawdata, i+3)
  143.         elif sectName in ("if", "else", "endif"):
  144.             # look for MS Office ]> ending
  145.             match= _msmarkedsectionclose.search(rawdata, i+3)
  146.         else:
  147.             self.error('unknown status keyword %s in marked section' % `rawdata[i+3:j]`)
  148.         if not match:
  149.             return -1
  150.         if report:
  151.             j = match.start(0)
  152.             self.unknown_decl(rawdata[i+3: j])
  153.         return match.end(0)
  154.  
  155.     # Internal -- parse comment, return length or -1 if not terminated
  156.     def parse_comment(self, i, report=1):
  157.         rawdata = self.rawdata
  158.         if rawdata[i:i+4] != '<!--':
  159.             self.error('unexpected call to parse_comment()')
  160.         match = _commentclose.search(rawdata, i+4)
  161.         if not match:
  162.             return -1
  163.         if report:
  164.             j = match.start(0)
  165.             self.handle_comment(rawdata[i+4: j])
  166.         return match.end(0)
  167.  
  168.     # Internal -- scan past the internal subset in a <!DOCTYPE declaration,
  169.     # returning the index just past any whitespace following the trailing ']'.
  170.     def _parse_doctype_subset(self, i, declstartpos):
  171.         rawdata = self.rawdata
  172.         n = len(rawdata)
  173.         j = i
  174.         while j < n:
  175.             c = rawdata[j]
  176.             if c == "<":
  177.                 s = rawdata[j:j+2]
  178.                 if s == "<":
  179.                     # end of buffer; incomplete
  180.                     return -1
  181.                 if s != "<!":
  182.                     self.updatepos(declstartpos, j + 1)
  183.                     self.error("unexpected char in internal subset (in %s)"
  184.                                % `s`)
  185.                 if (j + 2) == n:
  186.                     # end of buffer; incomplete
  187.                     return -1
  188.                 if (j + 4) > n:
  189.                     # end of buffer; incomplete
  190.                     return -1
  191.                 if rawdata[j:j+4] == "<!--":
  192.                     j = self.parse_comment(j, report=0)
  193.                     if j < 0:
  194.                         return j
  195.                     continue
  196.                 name, j = self._scan_name(j + 2, declstartpos)
  197.                 if j == -1:
  198.                     return -1
  199.                 if name not in ("attlist", "element", "entity", "notation"):
  200.                     self.updatepos(declstartpos, j + 2)
  201.                     self.error(
  202.                         "unknown declaration %s in internal subset" % `name`)
  203.                 # handle the individual names
  204.                 meth = getattr(self, "_parse_doctype_" + name)
  205.                 j = meth(j, declstartpos)
  206.                 if j < 0:
  207.                     return j
  208.             elif c == "%":
  209.                 # parameter entity reference
  210.                 if (j + 1) == n:
  211.                     # end of buffer; incomplete
  212.                     return -1
  213.                 s, j = self._scan_name(j + 1, declstartpos)
  214.                 if j < 0:
  215.                     return j
  216.                 if rawdata[j] == ";":
  217.                     j = j + 1
  218.             elif c == "]":
  219.                 j = j + 1
  220.                 while j < n and rawdata[j].isspace():
  221.                     j = j + 1
  222.                 if j < n:
  223.                     if rawdata[j] == ">":
  224.                         return j
  225.                     self.updatepos(declstartpos, j)
  226.                     self.error("unexpected char after internal subset")
  227.                 else:
  228.                     return -1
  229.             elif c.isspace():
  230.                 j = j + 1
  231.             else:
  232.                 self.updatepos(declstartpos, j)
  233.                 self.error("unexpected char %s in internal subset" % `c`)
  234.         # end of buffer reached
  235.         return -1
  236.  
  237.     # Internal -- scan past <!ELEMENT declarations
  238.     def _parse_doctype_element(self, i, declstartpos):
  239.         name, j = self._scan_name(i, declstartpos)
  240.         if j == -1:
  241.             return -1
  242.         # style content model; just skip until '>'
  243.         rawdata = self.rawdata
  244.         if '>' in rawdata[j:]:
  245.             return rawdata.find(">", j) + 1
  246.         return -1
  247.  
  248.     # Internal -- scan past <!ATTLIST declarations
  249.     def _parse_doctype_attlist(self, i, declstartpos):
  250.         rawdata = self.rawdata
  251.         name, j = self._scan_name(i, declstartpos)
  252.         c = rawdata[j:j+1]
  253.         if c == "":
  254.             return -1
  255.         if c == ">":
  256.             return j + 1
  257.         while 1:
  258.             # scan a series of attribute descriptions; simplified:
  259.             #   name type [value] [#constraint]
  260.             name, j = self._scan_name(j, declstartpos)
  261.             if j < 0:
  262.                 return j
  263.             c = rawdata[j:j+1]
  264.             if c == "":
  265.                 return -1
  266.             if c == "(":
  267.                 # an enumerated type; look for ')'
  268.                 if ")" in rawdata[j:]:
  269.                     j = rawdata.find(")", j) + 1
  270.                 else:
  271.                     return -1
  272.                 while rawdata[j:j+1].isspace():
  273.                     j = j + 1
  274.                 if not rawdata[j:]:
  275.                     # end of buffer, incomplete
  276.                     return -1
  277.             else:
  278.                 name, j = self._scan_name(j, declstartpos)
  279.             c = rawdata[j:j+1]
  280.             if not c:
  281.                 return -1
  282.             if c in "'\"":
  283.                 m = _declstringlit_match(rawdata, j)
  284.                 if m:
  285.                     j = m.end()
  286.                 else:
  287.                     return -1
  288.                 c = rawdata[j:j+1]
  289.                 if not c:
  290.                     return -1
  291.             if c == "#":
  292.                 if rawdata[j:] == "#":
  293.                     # end of buffer
  294.                     return -1
  295.                 name, j = self._scan_name(j + 1, declstartpos)
  296.                 if j < 0:
  297.                     return j
  298.                 c = rawdata[j:j+1]
  299.                 if not c:
  300.                     return -1
  301.             if c == '>':
  302.                 # all done
  303.                 return j + 1
  304.  
  305.     # Internal -- scan past <!NOTATION declarations
  306.     def _parse_doctype_notation(self, i, declstartpos):
  307.         name, j = self._scan_name(i, declstartpos)
  308.         if j < 0:
  309.             return j
  310.         rawdata = self.rawdata
  311.         while 1:
  312.             c = rawdata[j:j+1]
  313.             if not c:
  314.                 # end of buffer; incomplete
  315.                 return -1
  316.             if c == '>':
  317.                 return j + 1
  318.             if c in "'\"":
  319.                 m = _declstringlit_match(rawdata, j)
  320.                 if not m:
  321.                     return -1
  322.                 j = m.end()
  323.             else:
  324.                 name, j = self._scan_name(j, declstartpos)
  325.                 if j < 0:
  326.                     return j
  327.  
  328.     # Internal -- scan past <!ENTITY declarations
  329.     def _parse_doctype_entity(self, i, declstartpos):
  330.         rawdata = self.rawdata
  331.         if rawdata[i:i+1] == "%":
  332.             j = i + 1
  333.             while 1:
  334.                 c = rawdata[j:j+1]
  335.                 if not c:
  336.                     return -1
  337.                 if c.isspace():
  338.                     j = j + 1
  339.                 else:
  340.                     break
  341.         else:
  342.             j = i
  343.         name, j = self._scan_name(j, declstartpos)
  344.         if j < 0:
  345.             return j
  346.         while 1:
  347.             c = self.rawdata[j:j+1]
  348.             if not c:
  349.                 return -1
  350.             if c in "'\"":
  351.                 m = _declstringlit_match(rawdata, j)
  352.                 if m:
  353.                     j = m.end()
  354.                 else:
  355.                     return -1    # incomplete
  356.             elif c == ">":
  357.                 return j + 1
  358.             else:
  359.                 name, j = self._scan_name(j, declstartpos)
  360.                 if j < 0:
  361.                     return j
  362.  
  363.     # Internal -- scan a name token and the new position and the token, or
  364.     # return -1 if we've reached the end of the buffer.
  365.     def _scan_name(self, i, declstartpos):
  366.         rawdata = self.rawdata
  367.         n = len(rawdata)
  368.         if i == n:
  369.             return None, -1
  370.         m = _declname_match(rawdata, i)
  371.         if m:
  372.             s = m.group()
  373.             name = s.strip()
  374.             if (i + len(s)) == n:
  375.                 return None, -1  # end of buffer
  376.             return name.lower(), m.end()
  377.         else:
  378.             self.updatepos(declstartpos, i)
  379.             self.error("expected name token")
  380.  
  381.     # To be overridden -- handlers for unknown objects
  382.     def unknown_decl(self, data):
  383.         pass
  384.