home
***
CD-ROM
|
disk
|
FTP
|
other
***
search
/
PC World 2001 April
/
PCWorld_2001-04_cd.bin
/
Software
/
TemaCD
/
webclean
/
webparser
/
sgmlop.c
< prev
Wrap
Text File
|
2001-01-16
|
20KB
|
822 lines
/*
* SGMLOP
* $Id: sgmlop.c,v 1.6 2001/01/16 19:22:15 calvin Exp $
*
* The sgmlop accelerator module
*
* This module provides a FastSGMLParser type, which is designed to
* speed up the standard sgmllib and xmllib modules. The parser can
* be configured to support either basic SGML (enough of it to process
* HTML documents, at least) or XML. This module also provides an
* Element type, useful for fast but simple DOM implementations.
*
* History:
* 1998-04-04 fl Created (for coreXML)
* 1998-04-05 fl Added close method
* 1998-04-06 fl Added parse method, revised callback interface
* 1998-04-14 fl Fixed parsing of PI tags
* 1998-05-14 fl Cleaned up for first public release
* 1998-05-19 fl Fixed xmllib compatibility: handle_proc, handle_special
* 1998-05-22 fl Added attribute parser
* 1999-06-20 fl Added Element data type, various bug fixes.
* 2000-05-28 fl Fixed data truncation error (@SGMLOP1)
* 2000-05-28 fl Added temporary workaround for unicode problem (@SGMLOP2)
* 2000-05-28 fl Removed optional close argument (@SGMLOP3)
* 2000-05-28 fl Raise exception on recursive feed (@SGMLOP4)
* 2000-07-05 fl Fixed attribute handling in empty tags (@SGMLOP6)
Changes from Bastian Kleineidam <calvin@users.sourceforge.net>
* new reset function
* use METH_VARARGS in method tables
* flush unprocessed data on close
* removed element and treebuilder; don't need them and they're not working anyway
* merged register() function into constructor
* give error on missing callbacks
* better start tag parsing
* direct call of unknown_starttag, unknown_endtag
* fixed bug with unquoted attrs ending with a slash:
<a href=http://foo/>bar</a>
* deleted xml parser, I only need sgml/html
*
* Copyright (c) 1998-2000 by Secret Labs AB
* Copyright (c) 1998-2000 by Fredrik Lundh
*
* fredrik@pythonware.com
* http://www.pythonware.com
*
* By obtaining, using, and/or copying this software and/or its
* associated documentation, you agree that you have read, understood,
* and will comply with the following terms and conditions:
*
* Permission to use, copy, modify, and distribute this software and its
* associated documentation for any purpose and without fee is hereby
* granted, provided that the above copyright notice appears in all
* copies, and that both that copyright notice and this permission notice
* appear in supporting documentation, and that the name of Secret Labs
* AB or the author not be used in advertising or publicity pertaining to
* distribution of the software without specific, written prior
* permission.
*
* SECRET LABS AB AND THE AUTHOR DISCLAIMS ALL WARRANTIES WITH REGARD TO
* THIS SOFTWARE, INCLUDING ALL IMPLIED WARRANTIES OF MERCHANTABILITY AND
* FITNESS. IN NO EVENT SHALL SECRET LABS AB OR THE AUTHOR BE LIABLE FOR
* ANY SPECIAL, INDIRECT OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES
* WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, WHETHER IN AN
* ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION, ARISING OUT
* OF OR IN CONNECTION WITH THE USE OR PERFORMANCE OF THIS SOFTWARE. */
#include "Python.h"
#include <ctype.h>
/* Character type and classification helpers used by the scanner.
 *
 * CHAR_T is the unit the parser walks over; ISALNUM/ISSPACE/TOLOWER are
 * always invoked call-style, e.g. ISSPACE(*p).
 */
#ifdef SGMLOP_UNICODE_SUPPORT
/* wide character set (experimental) */
/* FIXME: under Python 1.6, the current version converts Unicode
   strings to UTF-8, and parses the result as if it was an ASCII
   string. */
#define CHAR_T Py_UNICODE
#define ISALNUM Py_UNICODE_ISALNUM
#define ISSPACE Py_UNICODE_ISSPACE
#define TOLOWER Py_UNICODE_TOLOWER
#else
/* 8-bit character set */
/* The <ctype.h> functions are only defined for EOF and values that fit in
   an unsigned char.  Plain char may be signed, so bytes >= 0x80 must be
   converted before classification. */
#define CHAR_T char
#define ISALNUM(c) isalnum((unsigned char) (c))
#define ISSPACE(c) isspace((unsigned char) (c))
#define TOLOWER(c) tolower((unsigned char) (c))
#endif
/* Optional allocation tracing.  The "#if 0" branch keeps a running byte
   count in `memory` and prints it together with the given comment; the
   active "#else" branch defines both macros as empty, so any use of them
   expands to nothing. */
#if 0
static int memory = 0;
#define ALLOC(size, comment)\
do { memory += size; printf("%8d - %s\n", memory, comment); } while (0)
#define RELEASE(size, comment)\
do { memory -= size; printf("%8d - %s\n", memory, comment); } while (0)
#else
#define ALLOC(size, comment)
#define RELEASE(size, comment)
#endif
/* ==================================================================== */
/* parser data type */
/* state flags: values stored in the parser's `shorttag` and `doctype`
   fields (0 means "not active").  fastfeed() stores SURE in both fields
   and compares `doctype` against MAYBE. */
#define MAYBE 1
#define SURE 2
/* parser type definition
 *
 * One instance holds the scanner state, the buffer of not-yet-consumed
 * input, and one reference to each callback fetched from the handler
 * object passed to the constructor.
 */
typedef struct {
    PyObject_HEAD
    /* state attributes */
    int feed; /* set to 1 while fastfeed() runs; used to reject recursive feeds */
    int shorttag; /* 0=normal 2=parsing shorttag */
    int doctype; /* 0=normal 1=dtd pending 2=parsing dtd */
    /* buffer (holds incomplete tags) */
    char* buffer; /* malloc'ed storage, NULL until the first feed and after reset */
    int bufferlen; /* current amount of data */
    int buffertotal; /* actually allocated */
    /* callbacks */
    PyObject* unknown_starttag; /* called with (name, attribute list) */
    PyObject* unknown_endtag; /* called with (name) */
    PyObject* handle_proc; /* called with (target, content) for <? ... > */
    PyObject* handle_special; /* called with the text of a <! ... > directive */
    PyObject* handle_charref; /* called with the text following "&#" */
    PyObject* handle_entityref; /* called with the text following "&" */
    PyObject* handle_data; /* called with runs of raw character data */
    PyObject* handle_cdata; /* invoked for CDATA tokens */
    PyObject* handle_comment; /* called with the body of a <!-- ... --> comment */
} FastSGMLParserObject;
/* The type object is defined further down, after the method table. */
staticforward PyTypeObject FastSGMLParser_Type;
/* forward declarations */
/* fastfeed: scan the parser's buffer and fire callbacks; returns the number
   of bytes consumed, or -1 when a callback failed. */
static int fastfeed(FastSGMLParserObject* self);
/* attrparse: turn `len` characters of attribute text at `p` into a list of
   (key, value) tuples; returns NULL on error. */
static PyObject* attrparse(const CHAR_T *p, int len);
/* -------------------------------------------------------------------- */
/* create parser
 *
 * Allocates a FastSGMLParserObject with an empty buffer and binds the nine
 * callback attributes of `item` (unknown_starttag, unknown_endtag,
 * handle_proc, handle_special, handle_charref, handle_entityref,
 * handle_data, handle_cdata, handle_comment).
 *
 * Returns a new reference, or NULL with an exception set when allocation
 * fails or `item` lacks one of the callbacks (missing callbacks are not
 * accepted).
 */
static PyObject* _sgmlop_new(PyObject* item) {
    FastSGMLParserObject* self;

    self = PyObject_NEW(FastSGMLParserObject, &FastSGMLParser_Type);
    if (self == NULL)
        return NULL;

    self->feed = 0;
    self->shorttag = 0;
    self->doctype = 0;
    self->buffer = NULL;
    self->bufferlen = 0;
    self->buffertotal = 0;

    /* clear every slot first so the error path can release exactly the
       callbacks that were bound before the failure */
    self->unknown_starttag = NULL;
    self->unknown_endtag = NULL;
    self->handle_proc = NULL;
    self->handle_special = NULL;
    self->handle_charref = NULL;
    self->handle_entityref = NULL;
    self->handle_data = NULL;
    self->handle_cdata = NULL;
    self->handle_comment = NULL;

    /* register callbacks; the AttributeError raised by a failed lookup is
       left in place and propagated to the caller */
    self->unknown_starttag = PyObject_GetAttrString(item, "unknown_starttag");
    if (self->unknown_starttag == NULL)
        goto err;
    self->unknown_endtag = PyObject_GetAttrString(item, "unknown_endtag");
    if (self->unknown_endtag == NULL)
        goto err;
    self->handle_proc = PyObject_GetAttrString(item, "handle_proc");
    if (self->handle_proc == NULL)
        goto err;
    self->handle_special = PyObject_GetAttrString(item, "handle_special");
    if (self->handle_special == NULL)
        goto err;
    self->handle_charref = PyObject_GetAttrString(item, "handle_charref");
    if (self->handle_charref == NULL)
        goto err;
    self->handle_entityref = PyObject_GetAttrString(item, "handle_entityref");
    if (self->handle_entityref == NULL)
        goto err;
    self->handle_data = PyObject_GetAttrString(item, "handle_data");
    if (self->handle_data == NULL)
        goto err;
    self->handle_cdata = PyObject_GetAttrString(item, "handle_cdata");
    if (self->handle_cdata == NULL)
        goto err;
    self->handle_comment = PyObject_GetAttrString(item, "handle_comment");
    if (self->handle_comment == NULL)
        goto err;

    return (PyObject*) self;

err:
    /* Tear the half-built object down by hand instead of going through the
       type's deallocator, so this path does not depend on the deallocator
       tolerating NULL slots. */
    Py_XDECREF(self->unknown_starttag);
    Py_XDECREF(self->unknown_endtag);
    Py_XDECREF(self->handle_proc);
    Py_XDECREF(self->handle_special);
    Py_XDECREF(self->handle_charref);
    Py_XDECREF(self->handle_entityref);
    Py_XDECREF(self->handle_data);
    Py_XDECREF(self->handle_cdata);
    Py_XDECREF(self->handle_comment);
    /* NOTE(review): PyMem_DEL mirrors what _sgmlop_dealloc uses for this
       type; confirm it matches PyObject_NEW on the targeted Python version. */
    PyMem_DEL(self);
    return NULL;
}
/* Module-level SGMLParser(item): takes exactly one argument, the handler
   object, and returns a new parser bound to its callbacks (NULL with an
   exception set on failure). */
static PyObject* _sgmlop_sgmlparser(PyObject* self, PyObject* args) {
    PyObject* handler;

    return PyArg_ParseTuple(args, "O", &handler) ? _sgmlop_new(handler) : NULL;
}
/* Deallocator: frees the input buffer, drops the reference held on each
   callback and releases the object itself. */
static void
_sgmlop_dealloc(FastSGMLParserObject* self)
{
    /* free(NULL) is a no-op, so no guard is needed */
    free(self->buffer);
    /* Py_XDECREF rather than Py_DECREF: a slot is NULL if the attribute
       lookup for that callback failed when the parser was created */
    Py_XDECREF(self->unknown_starttag);
    Py_XDECREF(self->unknown_endtag);
    Py_XDECREF(self->handle_proc);
    Py_XDECREF(self->handle_special);
    Py_XDECREF(self->handle_charref);
    Py_XDECREF(self->handle_entityref);
    Py_XDECREF(self->handle_data);
    Py_XDECREF(self->handle_cdata);
    Py_XDECREF(self->handle_comment);
    PyMem_DEL(self);
}
/* Return the parser to its freshly-constructed scanning state: the input
   buffer is released and all state flags are cleared.  The callback
   references are left untouched. */
static void reset(FastSGMLParserObject* self) {
    /* scanner flags */
    self->feed = 0;
    self->shorttag = 0;
    self->doctype = 0;

    /* buffer bookkeeping; free() accepts NULL */
    free(self->buffer);
    self->buffer = NULL;
    self->bufferlen = 0;
    self->buffertotal = 0;
}
/* parser.reset(): takes no arguments, discards buffered input and scanner
   state, and returns None. */
static PyObject* _sgmlop_reset(FastSGMLParserObject* self, PyObject* args) {
    PyObject* result = NULL;

    if (PyArg_NoArgs(args)) {
        reset(self);
        result = Py_None;
        Py_INCREF(result);
    }
    return result;
}
/* -------------------------------------------------------------------- */
/* feed data to parser. the parser processes as much of the data as
   possible, and keeps the rest in a local buffer. */
/* Common subroutine for the feed, close and parse methods.
 *
 * string/stringlen  text to append to the internal buffer (may be empty)
 * last              non-zero when this is the final chunk: whatever the
 *                   scanner left unconsumed is handed to handle_data and
 *                   the parser is reset
 *
 * Returns a Python int holding the number of bytes still buffered, or NULL
 * with an exception set (recursive feed, out of memory, internal overrun,
 * or a failing callback).
 */
static PyObject*
feed(FastSGMLParserObject* self, char* string, int stringlen, int last)
{
    int length;

    if (self->feed) {
        /* dealing with recursive feeds isn't exactly trivial, so
           let's just bail out before the parser messes things up */
        PyErr_SetString(PyExc_AssertionError, "recursive feed");
        return NULL;
    }

    /* append new text block to local buffer */
    length = self->bufferlen + stringlen;
    if (!self->buffer || length > self->buffertotal) {
        /* Grow through a temporary: on failure the old block (and the
           bufferlen that describes it) stays valid instead of being leaked.
           Ask for at least one byte, because a zero-sized request may
           legitimately return NULL and would be mistaken for out-of-memory. */
        char* grown = realloc(self->buffer, length > 0 ? length : 1);
        if (!grown) {
            PyErr_NoMemory();
            return NULL;
        }
        self->buffer = grown;
        self->buffertotal = length;
    }
    memcpy(self->buffer + self->bufferlen, string, stringlen);
    self->bufferlen = length;

    /* run the scanner with the recursion guard raised */
    self->feed = 1;
    length = fastfeed(self);
    self->feed = 0;

    if (length < 0)
        return NULL;
    if (length > self->bufferlen) {
        /* ran beyond the end of the buffer (internal error)*/
        PyErr_SetString(PyExc_AssertionError, "buffer overrun");
        return NULL;
    }

    if (length > 0 && length < self->bufferlen)
        /* adjust buffer: slide the unconsumed tail to the front */
        memmove(self->buffer, self->buffer + length,
                self->bufferlen - length);
    self->bufferlen = self->bufferlen - length;

    /* if data remains in the buffer even though this is the
       last call, do an extra handle_data to get rid of it */
    if (last) {
        PyObject* res;
        res = PyObject_CallFunction(self->handle_data,
                                    "s#", self->buffer, self->bufferlen);
        if (!res)
            return NULL;
        /* the callback's return value is not used; drop our reference */
        Py_DECREF(res);
        /* shut the parser down and release the internal buffers */
        reset(self);
    }

    return Py_BuildValue("i", self->bufferlen);
}
/* parser.feed(data): append one chunk of text and process as much of it
   as possible; more chunks may follow. */
static PyObject*
_sgmlop_feed(FastSGMLParserObject* self, PyObject* args)
{
    char* data;
    int datalen;

    if (PyArg_ParseTuple(args, "t#", &data, &datalen))
        return feed(self, data, datalen, 0);
    return NULL;
}
/* parser.close(): takes no arguments; runs a final, empty feed so that
   anything still buffered is flushed and the parser is reset. */
static PyObject*
_sgmlop_close(FastSGMLParserObject* self, PyObject* args)
{
    return PyArg_NoArgs(args) ? feed(self, "", 0, 1) : NULL;
}
/* parser.parse(data): one-shot parsing; feeds the chunk as the final one,
   so leftovers are flushed and the parser is reset afterwards. */
static PyObject*
_sgmlop_parse(FastSGMLParserObject* self, PyObject* args)
{
    char* text;
    int textlen;

    if (PyArg_ParseTuple(args, "t#", &text, &textlen))
        return feed(self, text, textlen, 1);
    return NULL;
}
/* -------------------------------------------------------------------- */
/* type interface */
/* Methods exposed on parser instances.  feed and parse receive an argument
   tuple (METH_VARARGS); reset and close are registered with flags 0 and
   their handlers validate the call with PyArg_NoArgs. */
static PyMethodDef _sgmlop_methods[] = {
    /* incremental parsing */
    {"feed", (PyCFunction) _sgmlop_feed, METH_VARARGS},
    /* reset the parser */
    {"reset", (PyCFunction) _sgmlop_reset, 0},
    /* flush buffered data and reset */
    {"close", (PyCFunction) _sgmlop_close, 0},
    /* one-shot parsing */
    {"parse", (PyCFunction) _sgmlop_parse, METH_VARARGS},
    {NULL, NULL} /* sentinel */
};
/* tp_getattr slot: attribute access on a parser only ever resolves to one
   of the entries in _sgmlop_methods. */
static PyObject*
_sgmlop_getattr(FastSGMLParserObject* self, char* name)
{
    PyObject* owner = (PyObject*) self;

    return Py_FindMethod(_sgmlop_methods, owner, name);
}
/* Type object for parser instances.  Only deallocation and attribute
   lookup are provided.  The header is initialised with a NULL type pointer,
   which the module init function fills in at import time. */
statichere PyTypeObject FastSGMLParser_Type = {
    PyObject_HEAD_INIT(NULL)
    0, /* ob_size */
    "FastSGMLParser", /* tp_name */
    sizeof(FastSGMLParserObject), /* tp_basicsize */
    0, /* tp_itemsize */
    /* methods */
    (destructor)_sgmlop_dealloc, /* tp_dealloc */
    0, /* tp_print */
    (getattrfunc)_sgmlop_getattr, /* tp_getattr */
    0 /* tp_setattr */
};
/* ==================================================================== */
/* python module interface */
/* Module-level functions: SGMLParser(item) is the only entry point and
   constructs a parser bound to the callbacks of `item`. */
static PyMethodDef _functions[] = {
    {"SGMLParser", _sgmlop_sgmlparser, METH_VARARGS},
    {NULL, NULL} /* sentinel */
};
/* Module initialisation entry point for the "sgmlop" module (exported
   from the DLL when WIN32 is defined). */
void
#ifdef WIN32
__declspec(dllexport)
#endif
initsgmlop(void)
{
    /* Patch object type: the static initialiser left ob_type as NULL, so
       point it at PyType_Type before any instance is created */
    FastSGMLParser_Type.ob_type = &PyType_Type;
    /* register the module and its function table */
    Py_InitModule("sgmlop", _functions);
}
/* -------------------------------------------------------------------- */
/* the parser does it all in a single loop, keeping the necessary
state in a few flag variables and the data buffer. if you have
a good optimizer, this can be incredibly fast. */
/* Token codes produced by fastfeed().  Every code in the 0x1xx group has
   the TAG bit set, which is what the dispatcher tests with (token & TAG)
   before looking at the exact value. */
#define TAG 0x100
#define TAG_START 0x101 /* <tag ...> */
#define TAG_END 0x102 /* </tag>, or the closing slash of a shorttag */
#define TAG_EMPTY 0x103 /* <tag/> */
#define DIRECTIVE 0x104 /* <! ... > */
#define DOCTYPE 0x105 /* checked by the dispatcher; not assigned in fastfeed() */
#define PI 0x106 /* <? ... > */
#define DTD_START 0x107 /* '[' inside a directive while doctype is set */
#define DTD_END 0x108 /* ']' while doctype is set */
#define DTD_ENTITY 0x109 /* %name; while doctype is set */
#define CDATA 0x200 /* checked by the dispatcher; not assigned in fastfeed() */
#define ENTITYREF 0x400 /* &name; */
#define CHARREF 0x401 /* &#name; */
#define COMMENT 0x800 /* <!-- ... --> */
/* Scan the parser's buffer once, invoking the registered callbacks for
 * every complete token found.
 *
 * Three cursors drive the loop: `s` marks the start of the pending run of
 * raw data, `q` the start of the token currently being scanned, and `p`
 * the read position.  When the input ends in the middle of a token the
 * scan jumps to `eol`, which flushes the raw data before `q` and leaves
 * the partial token in the buffer for the next call.  Tag names are
 * lower-cased in place.
 *
 * Returns the number of bytes consumed, or -1 if a callback (or the
 * attribute parser) failed; in that case the Python exception is set.
 */
static int
fastfeed(FastSGMLParserObject* self)
{
    CHAR_T *end; /* tail */
    CHAR_T *p, *q, *s; /* scanning pointers */
    CHAR_T *b, *t, *e; /* token start/end */
    int token;

    s = q = p = (CHAR_T*) self->buffer;
    end = (CHAR_T*) (self->buffer + self->bufferlen);

    while (p < end) {
        q = p; /* start of token */
        if (*p == '<') {
            int has_attr;
            /* <tags> */
            token = TAG_START;
            if (++p >= end)
                goto eol;
            if (*p == '!') {
                /* <! directive */
                if (++p >= end)
                    goto eol;
                token = DIRECTIVE;
                b = t = p;
                if (*p == '-') {
                    int i = 2;
                    /* <!-- comment --> */
                    token = COMMENT;
                    b = p + 2;
                    for (;;) {
                        /* p[0]..p[2] are inspected below, so three
                           characters must be available; a comment whose
                           "-->" ends the buffer is still complete */
                        if (p+2 >= end)
                            goto eol;
                        if (p[1] != '-')
                            p += 2; /* boyer moore, sort of ;-) */
                        else if (p[0] != '-')
                            p++;
                        else {
                            i=2;
                            /* skip spaces */
                            while (ISSPACE(p[i])) {
                                ++i;
                                if (p+i >= end)
                                    goto eol;
                            }
                            if (p[i]=='>')
                                break;
                            p+=i;
                        }
                    }
                    /* for "<!-->" the opening dashes double as the
                       terminator and p is still before b; report an
                       empty comment instead of a negative length */
                    e = (p < b) ? b : p;
                    p += i+1;
                    goto eot;
                }
            } else if (*p == '?') {
                token = PI;
                if (++p >= end)
                    goto eol;
            } else if (*p == '/') {
                /* </endtag> */
                token = TAG_END;
                if (++p >= end)
                    goto eol;
            }
            /* process tag name */
            b = p;
            while (ISALNUM(*p) || *p == '-' || *p == '.' ||
                   *p == ':' || *p == '?') {
                *p = (CHAR_T) TOLOWER(*p);
                if (++p >= end)
                    goto eol;
            }
            t = p;
            has_attr = 0;
            if (*p == '/') {
                /* <tag/data/ or <tag/> */
                token = TAG_START;
                e = p;
                /* the next character decides between the two forms */
                if (++p >= end)
                    goto eol;
                if (*p == '>') {
                    /* <tag/> */
                    token = TAG_EMPTY;
                    /* the tag is complete once '>' has been seen, even
                       when it is the last character in the buffer */
                    p++;
                } else
                    /* <tag/data/ */
                    self->shorttag = SURE;
                /* we'll generate an end tag when we stumble upon
                   the end slash */
            } else {
                /* skip attributes */
                int quote = 0; /* active quote character, 0 when outside quotes */
                int last = 0; /* character preceding the closing '>' */
                int error = 0; /* malformed quoting seen; stop honouring quotes */
                int state = 0; /* 1 after '=', 2 inside quotes, 3 just closed a quote */
                while (*p != '>' || (quote && !error)) {
                    if (!ISSPACE(*p)) {
                        if (state==3) error=1;
                        has_attr = 1;
                        /* FIXME: note: end tags cannot have attributes! */
                    }
                    else if (state==3) state=0;
                    if (quote) {
                        if (*p == quote) {
                            quote = 0;
                            state = 3;
                        }
                    } else {
                        if (*p=='=') {
                            if (state==1) error=1;
                            else state=1;
                        }
                        if (*p == '"' || *p == '\'') {
                            if (state!=1) error=1;
                            quote = *p;
                            state=2;
                        }
                    }
                    if (*p == '[' && !quote && self->doctype) {
                        self->doctype = SURE;
                        token = DTD_START;
                        e = p++;
                        goto eot;
                    }
                    last = *p;
                    if (++p >= end)
                        goto eol;
                }
                e = p++;
                /* A '/' directly before '>' is deliberately NOT treated
                   as an empty tag here, so that an unquoted attribute
                   value ending in a slash, as in
                   <a href=http://foo/>bar</a>, keeps its slash. */
                if (token == PI && last == '?')
                    e--;
                if (self->doctype == MAYBE)
                    self->doctype = 0; /* there was no dtd */
                if (has_attr)
                    ; /* FIXME: process attributes */
            }
        } else if (*p == '/' && self->shorttag) {
            /* end of shorttag. this generates an empty end tag */
            token = TAG_END;
            self->shorttag = 0;
            b = t = e = p;
            /* the slash is the whole token; consume it unconditionally so
               the end tag is emitted even at the end of the buffer */
            p++;
        } else if (*p == ']' && self->doctype) {
            /* end of dtd. this generates an empty end tag */
            token = DTD_END;
            /* FIXME: who handles the ending > !? */
            b = t = e = p;
            if (++p >= end)
                goto eol;
            self->doctype = 0;
        } else if (*p == '%' && self->doctype) {
            /* doctype entities */
            token = DTD_ENTITY;
            if (++p >= end)
                goto eol;
            b = t = p;
            while (ISALNUM(*p) || *p == '.')
                if (++p >= end)
                    goto eol;
            e = p;
            if (*p == ';')
                p++;
        } else if (*p == '&') {
            /* entities */
            token = ENTITYREF;
            if (++p >= end)
                goto eol;
            if (*p == '#') {
                token = CHARREF;
                if (++p >= end)
                    goto eol;
            }
            b = t = p;
            while (ISALNUM(*p) || *p == '.')
                if (++p >= end)
                    goto eol;
            e = p;
            if (*p == ';')
                p++;
        } else {
            /* raw data */
            if (++p >= end) {
                q = p;
                goto eol;
            }
            continue;
        }

    eot: /* end of token */
        /* Lengths handed to "s#" are cast to int: the format reads an int
           from the argument list, while a pointer difference may be wider. */
        if (q != s) {
            /* flush any raw data before this tag */
            PyObject* res;
            res = PyObject_CallFunction(self->handle_data,
                                        "s#", s, (int) (q-s));
            if (!res)
                return -1;
            Py_DECREF(res);
        }

        /* invoke callbacks */
        if (token & TAG) {
            if (token == TAG_END) {
                PyObject* res;
                res = PyObject_CallFunction(self->unknown_endtag,
                                            "s#", b, (int) (t-b));
                if (!res)
                    return -1;
                Py_DECREF(res);
            } else if (token == DIRECTIVE || token == DOCTYPE) {
                PyObject* res;
                res = PyObject_CallFunction(self->handle_special,
                                            "s#", b, (int) (e-b));
                if (!res)
                    return -1;
                Py_DECREF(res);
            } else if (token == PI) {
                PyObject* res;
                int len = (int) (t-b);
                while (ISSPACE(*t))
                    t++;
                res = PyObject_CallFunction(self->handle_proc,
                                            "s#s#", b, len, t, (int) (e-t));
                if (!res)
                    return -1;
                Py_DECREF(res);
            } else {
                /* start tags; NOTE(review): the DTD_* tokens also carry
                   the TAG bit and end up here - confirm that is intended */
                PyObject* res;
                PyObject* attr;
                int len = (int) (t-b);
                while (ISSPACE(*t))
                    t++;
                attr = attrparse(t, (int) (e-t));
                if (!attr)
                    return -1;
                res = PyObject_CallFunction(self->unknown_starttag,
                                            "s#O", b, len, attr);
                Py_DECREF(attr);
                if (!res)
                    return -1;
                Py_DECREF(res);
                if (token == TAG_EMPTY) {
                    /* <tag/> also closes itself */
                    res = PyObject_CallFunction(self->unknown_endtag,
                                                "s#", b, len);
                    if (!res)
                        return -1;
                    Py_DECREF(res);
                }
            }
        } else if (token == ENTITYREF) {
            PyObject* res;
            res = PyObject_CallFunction(self->handle_entityref,
                                        "s#", b, (int) (e-b));
            if (!res)
                return -1;
            Py_DECREF(res);
        } else if (token == CHARREF) {
            PyObject* res;
            res = PyObject_CallFunction(self->handle_charref,
                                        "s#", b, (int) (e-b));
            if (!res)
                return -1;
            Py_DECREF(res);
        } else if (token == CDATA) {
            PyObject* res;
            res = PyObject_CallFunction(self->handle_cdata,
                                        "s#", b, (int) (e-b));
            if (!res)
                return -1;
            Py_DECREF(res);
        } else if (token == COMMENT) {
            PyObject* res;
            res = PyObject_CallFunction(self->handle_comment,
                                        "s#", b, (int) (e-b));
            if (!res)
                return -1;
            Py_DECREF(res);
        }

        q = p; /* start of token */
        s = p; /* start of span */
    }

eol: /* end of line */
    if (q != s) {
        PyObject* res;
        res = PyObject_CallFunction(self->handle_data,
                                    "s#", s, (int) (q-s));
        if (!res)
            return -1;
        Py_DECREF(res);
    }

    /* returns the number of bytes consumed in this pass */
    return (int) (((char*) q) - self->buffer);
}
/* Parse the attribute part of a start tag.
 *
 * p, len  the `len` characters following the tag name
 *
 * Returns a new list of (key, value) string tuples in source order, or
 * NULL with an exception set.  An attribute without "=value" gets its own
 * name as value.  Values may be wrapped in single or double quotes (the
 * quotes are stripped); an unquoted value runs up to the next whitespace.
 */
static PyObject*
attrparse(const CHAR_T* p, int len)
{
    PyObject* attrs;
    PyObject* res;
    PyObject* key = NULL;
    PyObject* value = NULL;
    const CHAR_T* end = p + len;
    const CHAR_T* q;

    attrs = PyList_New(0);
    if (attrs == NULL)
        return NULL;

    while (p < end) {

        /* skip leading space */
        while (p < end && ISSPACE(*p))
            p++;
        if (p >= end)
            break;

        /* get attribute name (key) */
        q = p;
        while (p < end && *p != '=' && !ISSPACE(*p))
            p++;

        key = PyString_FromStringAndSize(q, (int) (p-q));
        if (key == NULL)
            goto err;

        value = key; /* in SGML mode, default is same as key */
        Py_INCREF(value);

        while (p < end && ISSPACE(*p))
            p++;

        if (p < end && *p == '=') {

            /* attribute value found: give up the default */
            Py_DECREF(value);
            value = NULL;

            p++; /* step over '=' */
            while (p < end && ISSPACE(*p))
                p++;

            q = p;
            if (p < end && (*p == '"' || *p == '\'')) {
                /* quoted: everything up to the matching quote, or to the
                   end of the text if the quote is never closed */
                p++;
                while (p < end && *p != *q)
                    p++;
                value = PyString_FromStringAndSize(q+1, (int) (p-q-1));
                if (p < end && *p == *q)
                    p++;
            } else {
                /* unquoted: up to the next whitespace */
                while (p < end && !ISSPACE(*p))
                    p++;
                value = PyString_FromStringAndSize(q, (int) (p-q));
            }

            if (value == NULL)
                goto err;

        }

        /* add to list */
        res = PyTuple_New(2);
        if (!res)
            goto err;
        PyTuple_SET_ITEM(res, 0, key);
        PyTuple_SET_ITEM(res, 1, value);
        /* the tuple now owns both references; forget ours so the error
           path below cannot release them a second time */
        key = NULL;
        value = NULL;
        if (PyList_Append(attrs, res) < 0) {
            Py_DECREF(res);
            goto err;
        }
        Py_DECREF(res);

    }

    return attrs;

err:
    Py_XDECREF(key);
    Py_XDECREF(value);
    Py_DECREF(attrs);
    return NULL;
}