PC World 2001 April

home *** CD-ROM | disk | FTP | other *** search

/ PC World 2001 April / PCWorld_2001-04_cd.bin / Software / TemaCD / webclean / webparser / sgmlop.c < prev

Wrap

Text File | 2001-01-16 | 20KB | 822 lines

/*
 * SGMLOP
 * $Id: sgmlop.c,v 1.6 2001/01/16 19:22:15 calvin Exp $
 *
 * The sgmlop accelerator module
 *
 * This module provides a FastSGMLParser type, which is designed to
 * speed up the standard sgmllib and xmllib modules.  The parser can
 * be configured to support either basic SGML (enough of it to process
 * HTML documents, at least) or XML.  This module also provides an
 * Element type, useful for fast but simple DOM implementations.
 *
 * History:
 * 1998-04-04 fl  Created (for coreXML)
 * 1998-04-05 fl  Added close method
 * 1998-04-06 fl  Added parse method, revised callback interface
 * 1998-04-14 fl  Fixed parsing of PI tags
 * 1998-05-14 fl  Cleaned up for first public release
 * 1998-05-19 fl  Fixed xmllib compatibility: handle_proc, handle_special
 * 1998-05-22 fl  Added attribute parser
 * 1999-06-20 fl  Added Element data type, various bug fixes.
 * 2000-05-28 fl  Fixed data truncation error (@SGMLOP1)
 * 2000-05-28 fl  Added temporary workaround for unicode problem (@SGMLOP2)
 * 2000-05-28 fl  Removed optional close argument (@SGMLOP3)
 * 2000-05-28 fl  Raise exception on recursive feed (@SGMLOP4)
 * 2000-07-05 fl  Fixed attribute handling in empty tags (@SGMLOP6)
 *
 * Changes from Bastian Kleineidam <calvin@users.sourceforge.net>
 * - new reset function
 * - use METH_VARARGS in method tables
 * - flush unprocessed data on close
 * - removed element and treebuilder, dont need it and its not working
 *   anyway
 * - merged register() function into constructor
 * - give error on missing callbacks
 * - better start tag parsing
 * - direct call of unknown_starttag, unknown_endtag
 * - fixed bug with unquoted attrs ending with a slash:
 *   <a href=http://foo/>bar</a>
 * - deleted xml parser, I only need sgml/html
 *
 * Copyright (c) 1998-2000 by Secret Labs AB
 * Copyright (c) 1998-2000 by Fredrik Lundh
 *
 * fredrik@pythonware.com
 * http://www.pythonware.com
 *
 * By obtaining, using, and/or copying this software and/or its
 * associated documentation, you agree that you have read, understood,
 * and will comply with the following terms and conditions:
 *
 * Permission to use, copy, modify, and distribute this software and its
 * associated documentation for any purpose and without fee is hereby
 * granted, provided that the above copyright notice appears in all
 * copies, and that both that copyright notice and this permission notice
 * appear in supporting documentation, and that the name of Secret Labs
 * AB or the author not be used in advertising or publicity pertaining to
 * distribution of the software without specific, written prior
 * permission.
 *
 * SECRET LABS AB AND THE AUTHOR DISCLAIMS ALL WARRANTIES WITH REGARD TO
 * THIS SOFTWARE, INCLUDING ALL IMPLIED WARRANTIES OF MERCHANTABILITY AND
 * FITNESS.  IN NO EVENT SHALL SECRET LABS AB OR THE AUTHOR BE LIABLE FOR
 * ANY SPECIAL, INDIRECT OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES
 * WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, WHETHER IN AN
 * ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION, ARISING OUT
 * OF OR IN CONNECTION WITH THE USE OR PERFORMANCE OF THIS SOFTWARE.
 */

#include "Python.h"

#include <ctype.h>
#include <limits.h>
#include <stdlib.h>
#include <string.h>

#ifdef SGMLOP_UNICODE_SUPPORT
/* wide character set (experimental) */
/* FIXME: under Python 1.6, the current version converts Unicode
   strings to UTF-8, and parses the result as if it was an ASCII
   string. */
#define CHAR_T Py_UNICODE
#define ISALNUM Py_UNICODE_ISALNUM
#define ISSPACE Py_UNICODE_ISSPACE
#define TOLOWER Py_UNICODE_TOLOWER
#else
/* 8-bit character set.  the <ctype.h> functions are only defined for
   EOF and values representable as unsigned char, so bytes >= 0x80
   must not reach them as negative numbers on signed-char platforms */
#define CHAR_T char
#define ISALNUM(c) isalnum((unsigned char) (c))
#define ISSPACE(c) isspace((unsigned char) (c))
#define TOLOWER(c) tolower((unsigned char) (c))
#endif

/* objects created with PyObject_NEW must be released with PyObject_DEL
   where the headers provide it; fall back to PyMem_DEL on older
   headers that do not define it */
#ifndef PyObject_DEL
#define PyObject_DEL PyMem_DEL
#endif

#if 0
static int memory = 0;
#define ALLOC(size, comment)\
do { memory += size; printf("%8d - %s\n", memory, comment); } while (0)
#define RELEASE(size, comment)\
do { memory -= size; printf("%8d - %s\n", memory, comment); } while (0)
#else
#define ALLOC(size, comment)
#define RELEASE(size, comment)
#endif

/* ==================================================================== */
/* parser data type */

/* state flags */
#define MAYBE 1
#define SURE 2

/* parser type definition */
typedef struct {
    PyObject_HEAD

    /* state attributes */
    int feed;     /* non-zero while fastfeed() is running */
    int shorttag; /* 0=normal 2=parsing shorttag */
    int doctype;  /* 0=normal 1=dtd pending 2=parsing dtd */

    /* buffer (holds incomplete tags) */
    char* buffer;
    int bufferlen;   /* current amount of data */
    int buffertotal; /* actually allocated */

    /* callbacks (owned references, looked up once in _sgmlop_new) */
    PyObject* unknown_starttag;
    PyObject* unknown_endtag;
    PyObject* handle_proc;
    PyObject* handle_special;
    PyObject* handle_charref;
    PyObject* handle_entityref;
    PyObject* handle_data;
    PyObject* handle_cdata;
    PyObject* handle_comment;

} FastSGMLParserObject;

staticforward PyTypeObject FastSGMLParser_Type;

/* forward declarations */
static int fastfeed(FastSGMLParserObject* self);
static PyObject* attrparse(const CHAR_T *p, int len);

/* -------------------------------------------------------------------- */
/* create parser */

/* look up one callback attribute on item and store it in *slot.
   returns 1 on success, 0 (with the lookup's exception set) on failure */
static int
fetch_callback(PyObject* item, char* name, PyObject** slot)
{
    *slot = PyObject_GetAttrString(item, name);
    return *slot != NULL;
}

/* create a parser bound to the callback methods of item.
   returns a new reference, or NULL with an exception set if allocation
   fails or if item lacks any of the nine callback attributes */
static PyObject*
_sgmlop_new(PyObject* item)
{
    FastSGMLParserObject* self;

    self = PyObject_NEW(FastSGMLParserObject, &FastSGMLParser_Type);
    if (self == NULL)
        return NULL;

    self->feed = 0;
    self->shorttag = 0;
    self->doctype = 0;

    self->buffer = NULL;
    self->bufferlen = 0;
    self->buffertotal = 0;

    /* clear every slot first, so that the destructor can run safely
       on a partially initialized object */
    self->unknown_starttag = NULL;
    self->unknown_endtag = NULL;
    self->handle_proc = NULL;
    self->handle_special = NULL;
    self->handle_charref = NULL;
    self->handle_entityref = NULL;
    self->handle_data = NULL;
    self->handle_cdata = NULL;
    self->handle_comment = NULL;

    /* register callbacks.  missing callbacks are not accepted: stop at
       the first failed lookup and report its exception to the caller */
    if (!fetch_callback(item, "unknown_starttag", &self->unknown_starttag) ||
        !fetch_callback(item, "unknown_endtag", &self->unknown_endtag) ||
        !fetch_callback(item, "handle_proc", &self->handle_proc) ||
        !fetch_callback(item, "handle_special", &self->handle_special) ||
        !fetch_callback(item, "handle_charref", &self->handle_charref) ||
        !fetch_callback(item, "handle_entityref", &self->handle_entityref) ||
        !fetch_callback(item, "handle_data", &self->handle_data) ||
        !fetch_callback(item, "handle_cdata", &self->handle_cdata) ||
        !fetch_callback(item, "handle_comment", &self->handle_comment)) {
        Py_DECREF(self);
        return NULL;
    }

    return (PyObject*) self;
}

/* sgmlop.SGMLParser(item): module-level constructor */
static PyObject*
_sgmlop_sgmlparser(PyObject* self, PyObject* args)
{
    PyObject* item;

    if (!PyArg_ParseTuple(args, "O", &item))
        return NULL;

    return _sgmlop_new(item);
}

/* destructor: release the data buffer, the callbacks and the object */
static void
_sgmlop_dealloc(FastSGMLParserObject* self)
{
    free(self->buffer);

    /* slots may be NULL if construction failed half way */
    Py_XDECREF(self->unknown_starttag);
    Py_XDECREF(self->unknown_endtag);
    Py_XDECREF(self->handle_proc);
    Py_XDECREF(self->handle_special);
    Py_XDECREF(self->handle_charref);
    Py_XDECREF(self->handle_entityref);
    Py_XDECREF(self->handle_data);
    Py_XDECREF(self->handle_cdata);
    Py_XDECREF(self->handle_comment);

    PyObject_DEL(self);
}

/* release the internal buffer and reset all values except the function
   callbacks */
static void
reset(FastSGMLParserObject* self)
{
    free(self->buffer);
    self->buffer = NULL;
    self->bufferlen = 0;
    self->buffertotal = 0;
    self->feed = 0;
    self->shorttag = 0;
    self->doctype = 0;
}

/* reset the parser.  raises AssertionError when called from inside a
   callback, because fastfeed() is then still scanning the buffer that
   a reset would free */
static PyObject*
_sgmlop_reset(FastSGMLParserObject* self, PyObject* args)
{
    if (!PyArg_NoArgs(args))
        return NULL;

    if (self->feed) {
        PyErr_SetString(PyExc_AssertionError, "reset during feed");
        return NULL;
    }

    reset(self);

    Py_INCREF(Py_None);
    return Py_None;
}

/* -------------------------------------------------------------------- */
/* feed data to parser.  the parser processes as much of the data as
   possible, and keeps the rest in a local buffer. */

/* common subroutine for SGMLParser.feed, SGMLParser.close and
   SGMLParser.parse.

   string/stringlen  data to append to the local buffer
   last              non-zero for the final call: whatever is left in
                     the buffer afterwards is passed to handle_data and
                     the parser is reset

   returns a Python int holding the number of bytes left in the buffer,
   or NULL with an exception set */
static PyObject*
feed(FastSGMLParserObject* self, char* string, int stringlen, int last)
{
    int length;

    if (self->feed) {
        /* dealing with recursive feeds isn't exactly trivial, so
           let's just bail out before the parser messes things up */
        PyErr_SetString(PyExc_AssertionError, "recursive feed");
        return NULL;
    }

    /* the buffer size is kept in an int; refuse to overflow it */
    if (stringlen < 0 || stringlen > INT_MAX - self->bufferlen) {
        PyErr_NoMemory();
        return NULL;
    }

    /* append new text block to local buffer */
    length = self->bufferlen + stringlen;
    if (length > self->buffertotal) {
        /* realloc(NULL, n) allocates; keep the old block on failure */
        char* buffer = realloc(self->buffer, length);
        if (!buffer) {
            PyErr_NoMemory();
            return NULL;
        }
        self->buffer = buffer;
        self->buffertotal = length;
    }
    if (stringlen > 0)
        memcpy(self->buffer + self->bufferlen, string, stringlen);
    self->bufferlen = length;

    if (self->bufferlen > 0) {
        self->feed = 1;
        length = fastfeed(self);
        self->feed = 0;
    } else
        length = 0; /* nothing to scan (the buffer may even be NULL) */

    if (length < 0)
        return NULL;

    if (length > self->bufferlen) {
        /* ran beyond the end of the buffer (internal error)*/
        PyErr_SetString(PyExc_AssertionError, "buffer overrun");
        return NULL;
    }

    if (length > 0 && length < self->bufferlen)
        /* adjust buffer */
        memmove(self->buffer, self->buffer + length,
                self->bufferlen - length);

    self->bufferlen = self->bufferlen - length;

    /* if data remains in the buffer even though this is the last call,
       do an extra handle_data to get rid of it */
    if (last) {
        if (self->bufferlen > 0) {
            PyObject* res;
            res = PyObject_CallFunction(self->handle_data, "s#",
                                        self->buffer, self->bufferlen);
            if (!res)
                return NULL;
            Py_DECREF(res);
        }
        /* shut the parser down and release the internal buffers */
        reset(self);
    }

    return Py_BuildValue("i", self->bufferlen);
}

static PyObject*
_sgmlop_feed(FastSGMLParserObject* self, PyObject* args)
{
    /* feed a chunk of data to the parser */

    char* string;
    int stringlen;

    if (!PyArg_ParseTuple(args, "t#", &string, &stringlen))
        return NULL;

    return feed(self, string, stringlen, 0);
}

static PyObject*
_sgmlop_close(FastSGMLParserObject* self, PyObject* args)
{
    /* flush parser buffers */

    if (!PyArg_NoArgs(args))
        return NULL;

    return feed(self, "", 0, 1);
}

static PyObject*
_sgmlop_parse(FastSGMLParserObject* self, PyObject* args)
{
    /* feed a single chunk of data to the parser */

    char* string;
    int stringlen;

    if (!PyArg_ParseTuple(args, "t#", &string, &stringlen))
        return NULL;

    return feed(self, string, stringlen, 1);
}

/* -------------------------------------------------------------------- */
/* type interface */

static PyMethodDef _sgmlop_methods[] = {
    /* incremental parsing */
    {"feed", (PyCFunction) _sgmlop_feed, METH_VARARGS},
    /* reset the parser */
    {"reset", (PyCFunction) _sgmlop_reset, 0},
    {"close", (PyCFunction) _sgmlop_close, 0},
    /* one-shot parsing */
    {"parse", (PyCFunction) _sgmlop_parse, METH_VARARGS},
    {NULL, NULL}
};

static PyObject*
_sgmlop_getattr(FastSGMLParserObject* self, char* name)
{
    return Py_FindMethod(_sgmlop_methods, (PyObject*) self, name);
}

statichere PyTypeObject FastSGMLParser_Type = {
    PyObject_HEAD_INIT(NULL)
    0, /* ob_size */
    "FastSGMLParser", /* tp_name */
    sizeof(FastSGMLParserObject), /* tp_size */
    0, /* tp_itemsize */
    /* methods */
    (destructor)_sgmlop_dealloc, /* tp_dealloc */
    0, /* tp_print */
    (getattrfunc)_sgmlop_getattr, /* tp_getattr */
    0 /* tp_setattr */
};

/* ==================================================================== */
/* python module interface */

static PyMethodDef _functions[] = {
    {"SGMLParser", _sgmlop_sgmlparser, METH_VARARGS},
    {NULL, NULL}
};

void
#ifdef WIN32
__declspec(dllexport)
#endif
initsgmlop(void)
{
    /* Patch object type */
    FastSGMLParser_Type.ob_type = &PyType_Type;

    Py_InitModule("sgmlop", _functions);
}

/* -------------------------------------------------------------------- */
/* the parser does it all in a single loop, keeping the necessary
   state in a few flag variables and the data buffer.  if you have
   a good optimizer, this can be incredibly fast. */

#define TAG 0x100
#define TAG_START 0x101
#define TAG_END 0x102
#define TAG_EMPTY 0x103
#define DIRECTIVE 0x104
#define DOCTYPE 0x105
#define PI 0x106
#define DTD_START 0x107
#define DTD_END 0x108
#define DTD_ENTITY 0x109
#define CDATA 0x200
#define ENTITYREF 0x400
#define CHARREF 0x401
#define COMMENT 0x800

/* scan self->buffer and invoke the callbacks for every complete token.
   tag names are lowercased in place.  returns the number of bytes
   consumed (an incomplete token at the end of the buffer is left for
   the next pass), or -1 with an exception set if a callback failed */
static int
fastfeed(FastSGMLParserObject* self)
{
    CHAR_T *end;       /* tail */
    CHAR_T *p, *q, *s; /* scanning pointers */
    CHAR_T *b, *t, *e; /* token start/end */

    int token;

    s = q = p = (CHAR_T*) self->buffer;
    end = (CHAR_T*) (self->buffer + self->bufferlen);

    while (p < end) {

        q = p; /* start of token */

        if (*p == '<') {

            /* <tags> */
            token = TAG_START;
            if (++p >= end)
                goto eol;

            if (*p == '!') {
                /* <! directive */
                if (++p >= end)
                    goto eol;
                token = DIRECTIVE;
                b = t = p;
                if (*p == '-') {
                    int i;
                    /* <!-- comment --> */
                    token = COMMENT;
                    b = p + 2;
                    for (;;) {
                        /* p[0], p[1] and p[2] must be inside the buffer */
                        if (p + 2 >= end)
                            goto eol;
                        if (p[1] != '-')
                            p += 2; /* boyer moore, sort of ;-) */
                        else if (p[0] != '-')
                            p++;
                        else {
                            /* found "--"; spaces may precede the '>' */
                            i = 2;
                            while (ISSPACE(p[i])) {
                                ++i;
                                if (p + i >= end)
                                    goto eol;
                            }
                            if (p[i] == '>')
                                break;
                            /* not a terminator.  if another dash follows,
                               step back one character so that the "--"
                               it may belong to ("--->") is not skipped */
                            p += (p[i] == '-') ? i - 1 : i;
                        }
                    }
                    e = p;
                    p += i + 1;
                    /* "<!-->" and "<!--->" terminate on the opening
                       dashes; report them as empty comments instead of
                       passing a negative length */
                    if (e < b)
                        e = b;
                    goto eot;
                }
            } else if (*p == '?') {
                token = PI;
                if (++p >= end)
                    goto eol;
            } else if (*p == '/') {
                /* </endtag> */
                token = TAG_END;
                if (++p >= end)
                    goto eol;
            }

            /* process tag name */
            b = p;
            while (ISALNUM(*p) || *p == '-' || *p == '.' ||
                   *p == ':' || *p == '?') {
                *p = (CHAR_T) TOLOWER(*p);
                if (++p >= end)
                    goto eol;
            }

            t = p;

            if (*p == '/') {
                /* <tag/data/ or <tag/> */
                token = TAG_START;
                e = p;
                if (++p >= end)
                    goto eol;
                if (*p == '>') {
                    /* <tag/>.  the token is complete, so do not wait
                       for more data when it ends the buffer */
                    token = TAG_EMPTY;
                    p++;
                } else
                    /* <tag/data/ */
                    self->shorttag = SURE;
                    /* we'll generate an end tag when we stumble upon
                       the end slash */

            } else {

                /* skip attributes (they are parsed by attrparse once
                   the whole tag is available) */
                int quote = 0; /* active quote character, or 0 */
                int last = 0;  /* last character before the '>' */
                int error = 0; /* malformed quoting seen */
                int state = 0; /* 1: after '=', 2: in quotes, 3: after
                                  closing quote */
                while (*p != '>' || (quote && !error)) {
                    if (!ISSPACE(*p)) {
                        if (state == 3)
                            error = 1;
                        /* FIXME: note: end tags cannot have
                           attributes! */
                    } else if (state == 3)
                        state = 0;
                    if (quote) {
                        if (*p == quote) {
                            quote = 0;
                            state = 3;
                        }
                    } else {
                        if (*p == '=') {
                            if (state == 1)
                                error = 1;
                            else
                                state = 1;
                        }
                        if (*p == '"' || *p == '\'') {
                            if (state != 1)
                                error = 1;
                            quote = *p;
                            state = 2;
                        }
                    }
                    if (*p == '[' && !quote && self->doctype) {
                        self->doctype = SURE;
                        token = DTD_START;
                        e = p++;
                        goto eot;
                    }
                    last = *p;
                    if (++p >= end)
                        goto eol;
                }

                e = p++;

                /* note: a '/' right before the '>' is deliberately not
                   turned into TAG_EMPTY here; it stays part of the
                   attribute text */
                if (token == PI && last == '?')
                    e--;

                if (self->doctype == MAYBE)
                    self->doctype = 0; /* there was no dtd */
            }

        } else if (*p == '/' && self->shorttag) {

            /* end of shorttag. this generates an empty end tag.  the
               token is complete with this one character, so emit it
               right away instead of deferring after the state change */
            token = TAG_END;
            self->shorttag = 0;
            b = t = e = p;
            p++;

        } else if (*p == ']' && self->doctype) {

            /* end of dtd. this generates an empty end tag */
            token = DTD_END;
            /* FIXME: who handles the ending > !? */
            b = t = e = p;
            if (++p >= end)
                goto eol;
            self->doctype = 0;

        } else if (*p == '%' && self->doctype) {

            /* doctype entities */
            token = DTD_ENTITY;
            if (++p >= end)
                goto eol;
            b = t = p;
            while (ISALNUM(*p) || *p == '.')
                if (++p >= end)
                    goto eol;
            e = p;
            if (*p == ';')
                p++;

        } else if (*p == '&') {

            /* entities */
            token = ENTITYREF;
            if (++p >= end)
                goto eol;
            if (*p == '#') {
                token = CHARREF;
                if (++p >= end)
                    goto eol;
            }
            b = t = p;
            while (ISALNUM(*p) || *p == '.')
                if (++p >= end)
                    goto eol;
            e = p;
            if (*p == ';')
                p++;

        } else {

            /* raw data */
            if (++p >= end) {
                q = p;
                goto eol;
            }
            continue;

        }

      eot: /* end of token */

        /* note: "s#" takes an int length, so pointer differences are
           converted explicitly before they go through varargs */

        if (q != s) {
            /* flush any raw data before this tag */
            PyObject* res;
            res = PyObject_CallFunction(self->handle_data,
                                        "s#", s, (int) (q - s));
            if (!res)
                return -1;
            Py_DECREF(res);
        }

        /* invoke callbacks */
        if (token & TAG) {
            if (token == TAG_END) {
                PyObject* res;
                res = PyObject_CallFunction(self->unknown_endtag,
                                            "s#", b, (int) (t - b));
                if (!res)
                    return -1;
                Py_DECREF(res);
            } else if (token == DIRECTIVE || token == DOCTYPE) {
                PyObject* res;
                res = PyObject_CallFunction(self->handle_special,
                                            "s#", b, (int) (e - b));
                if (!res)
                    return -1;
                Py_DECREF(res);
            } else if (token == PI) {
                PyObject* res;
                int len = (int) (t - b);
                while (ISSPACE(*t))
                    t++;
                res = PyObject_CallFunction(self->handle_proc,
                                            "s#s#", b, len,
                                            t, (int) (e - t));
                if (!res)
                    return -1;
                Py_DECREF(res);
            } else {
                /* start tag: (name, [(key, value), ...]) */
                PyObject* res;
                PyObject* attr;
                int len = (int) (t - b);
                while (ISSPACE(*t))
                    t++;
                attr = attrparse(t, (int) (e - t));
                if (!attr)
                    return -1;
                res = PyObject_CallFunction(self->unknown_starttag,
                                            "s#O", b, len, attr);
                Py_DECREF(attr);
                if (!res)
                    return -1;
                Py_DECREF(res);
                if (token == TAG_EMPTY) {
                    res = PyObject_CallFunction(self->unknown_endtag,
                                                "s#", b, len);
                    if (!res)
                        return -1;
                    Py_DECREF(res);
                }
            }
        } else if (token == ENTITYREF) {
            PyObject* res;
            res = PyObject_CallFunction(self->handle_entityref,
                                        "s#", b, (int) (e - b));
            if (!res)
                return -1;
            Py_DECREF(res);
        } else if (token == CHARREF) {
            PyObject* res;
            res = PyObject_CallFunction(self->handle_charref,
                                        "s#", b, (int) (e - b));
            if (!res)
                return -1;
            Py_DECREF(res);
        } else if (token == CDATA) {
            PyObject* res;
            res = PyObject_CallFunction(self->handle_cdata,
                                        "s#", b, (int) (e - b));
            if (!res)
                return -1;
            Py_DECREF(res);
        } else if (token == COMMENT) {
            PyObject* res;
            res = PyObject_CallFunction(self->handle_comment,
                                        "s#", b, (int) (e - b));
            if (!res)
                return -1;
            Py_DECREF(res);
        }

        q = p; /* start of token */
        s = p; /* start of span */
    }

  eol: /* end of line */
    if (q != s) {
        PyObject* res;
        res = PyObject_CallFunction(self->handle_data,
                                    "s#", s, (int) (q - s));
        if (!res)
            return -1;
        Py_DECREF(res);
    }

    /* returns the number of bytes consumed in this pass */
    return (int) (((char*) q) - self->buffer);
}

/* parse the attribute text of a start tag (len characters at p) into a
   new list of (key, value) string tuples.  an attribute without "=" gets
   its own name as value; quoted values lose their quotes.  returns NULL
   with an exception set on failure */
static PyObject*
attrparse(const CHAR_T* p, int len)
{
    PyObject* attrs;
    PyObject* res;
    PyObject* key = NULL;
    PyObject* value = NULL;
    const CHAR_T* end = p + len;
    const CHAR_T* q;

    attrs = PyList_New(0);
    if (attrs == NULL)
        return NULL;

    while (p < end) {

        /* skip leading space */
        while (p < end && ISSPACE(*p))
            p++;
        if (p >= end)
            break;

        /* get attribute name (key) */
        q = p;
        while (p < end && *p != '=' && !ISSPACE(*p))
            p++;

        key = PyString_FromStringAndSize(q, p-q);
        if (key == NULL)
            goto err;

        value = key; /* in SGML mode, default is same as key */
        Py_INCREF(value);

        while (p < end && ISSPACE(*p))
            p++;

        if (p < end && *p == '=') {

            /* attribute value found */
            Py_DECREF(value);
            value = NULL;

            p++;
            while (p < end && ISSPACE(*p))
                p++;

            q = p;
            if (p < end && (*p == '"' || *p == '\'')) {
                /* quoted value: runs to the matching quote */
                p++;
                while (p < end && *p != *q)
                    p++;
                value = PyString_FromStringAndSize(q+1, p-q-1);
                if (p < end && *p == *q)
                    p++;
            } else {
                /* unquoted value: runs to the next space */
                while (p < end && !ISSPACE(*p))
                    p++;
                value = PyString_FromStringAndSize(q, p-q);
            }

            if (value == NULL)
                goto err;

        }

        /* add to list */
        res = PyTuple_New(2);
        if (!res)
            goto err;
        PyTuple_SET_ITEM(res, 0, key);
        PyTuple_SET_ITEM(res, 1, value);
        /* the tuple owns both references now; forget ours so that the
           error path cannot release them a second time */
        key = NULL;
        value = NULL;
        if (PyList_Append(attrs, res) < 0) {
            Py_DECREF(res);
            goto err;
        }
        Py_DECREF(res);

    }

    return attrs;

  err:
    Py_XDECREF(key);
    Py_XDECREF(value);
    Py_DECREF(attrs);
    return NULL;
}