home *** CD-ROM | disk | FTP | other *** search
- From: lee@sq.sq.com (Liam R. E. Quin)
- Newsgroups: alt.sources
- Subject: lq-text Full Text Retrieval Database Part 02/13
- Message-ID: <1991Mar4.020107.16142@sq.sq.com>
- Date: 4 Mar 91 02:01:07 GMT
-
-
- : cut here --- cut here --
- : To unbundle, sh this file
- #! /bin/sh
- : part 02
- echo x - lq-text/src/filters/Makefile 1>&2
- sed 's/^X//' >lq-text/src/filters/Makefile <<'@@@End of lq-text/src/filters/Makefile'
- X# filters/Makefile -- Copyright 1990 Liam R. Quin. All Rights Reserved.
- X# This code is NOT in the public domain.
- X# See the file ../COPYRIGHT for full details.
- X
- X# This Makefile belongs in the "src/filters" directory.
- X#
- X# Note that most of the actual configuration is done in ../Makefile and
- X# in ../h/global.h, and not here.
- X
- X# $Id: Makefile,v 1.4 90/10/06 00:57:26 lee Rel $
- X
- X
- X# This is what gets made:
- XTARGETS = MailFilter NewsFilter
- XLIBFILES=$(TARGETS)
- XEXTRA=-DMAILFILTER='$(MAILFILTER)' -DNEWSFILTER='$(NEWSFILTER)' $(EXTRA)
- X
- XSRCS = FilterMain.c FilterType.c MailFilter.c NewsFilter.c
- XOBJS = FilterMain.o FilterType.o MailFilter.o NewsFilter.o
- X
- XPWD=filters
- X
- XDESTDIR=../lib
- XLQ=../lib/liblq.a
- XMODE=755
- X
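- X# for compiling:
- X# NOTE: this assignment replaces the EXTRA defined near the top of this
- X# Makefile (make keeps only the last definition of a variable), so the
- X# -DMAILFILTER/-DNEWSFILTER definitions given there are discarded.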
- XEXTRA=-I../h
- XRANLIB=echo
- X
- Xall: $(TARGETS)
- X
- Xsaber_src:
- X echo $(PWD)
- X #cd $(PWD)
- X #load $(CFLAGS) $(SRCS)
- X #cd ..
- X
- Xsaber_obj:
- X #cd $(PWD)
- X #load $(CFLAGS) $(SRCS)
- X #cd ..
- X
- Xinstall: all
- X for i in $(LIBFILES); do cp "$$i" $(DESTDIR); \
- X strip "$(DESTDIR)/$$i" ; \
- X chmod $(MODE) "$(DESTDIR)/$$i" ; \
- X done
- X
- Xtidy:
- X /bin/rm -f *.o core m.log tags
- X
- Xclean: tidy
- X /bin/rm -f $(TARGETS) $(TEST)
- X
- Xdepend:
- X mkdep $(CFLAGS) *.c
- X
- XCFilter: FilterMain.o CFilter.o
- X $(CC) $(CFLAGS) -o CFilter FilterMain.o CFilter.o $(MALLOC) $(LQ)
- X
- XNewsFilter: FilterMain.o NewsFilter.o
- X $(CC) $(CFLAGS) -o NewsFilter FilterMain.o NewsFilter.o $(MALLOC) $(LQ)
- X
- XMailFilter: FilterMain.o MailFilter.o
- X $(CC) $(CFLAGS) -o MailFilter FilterMain.o MailFilter.o $(MALLOC) $(LQ)
- X
- XCDMSFilter: FilterMain.o CDMSFilter.o
- X $(CC) $(CFLAGS) -o CDMSFilter FilterMain.o CDMSFilter.o $(MALLOC) $(LQ)
- X
- X#
- X# $Log: Makefile,v $
- X# Revision 1.4 90/10/06 00:57:26 lee
- X# Prepared for first beta release.
- X#
- X# Revision 1.3 90/10/03 21:14:45 lee
- X# Added MAILFILTER stuff.
- X#
- X# Revision 1.2 90/09/28 21:54:43 lee
- X# No longer uses OWNER.
- X#
- X# Revision 1.1 90/08/09 19:17:58 lee
- X# Initial revision
- X
- X# DO NOT PUT ANYTHING AFTER THIS LINE
- X# DO NOT DELETE THIS LINE -- mkdep uses it.
- X# DO NOT PUT ANYTHING AFTER THIS LINE, IT WILL GO AWAY.
- X
- XFilterMain.o: FilterMain.c
- XMailFilter.o: MailFilter.c /usr/include/malloc.h
- XMailFilter.o: ../h/wordrules.h ../h/emalloc.h
- XNewsFilter.o: NewsFilter.c
- XNewsFilter.o: ../h/wordrules.h ../h/emalloc.h
- XTroffFilter.o: TroffFilter.c
- XTroffFilter.o: ../h/wordrules.h ../h/emalloc.h
- X
- X# IF YOU PUT ANYTHING HERE IT WILL GO AWAY
- @@@End of lq-text/src/filters/Makefile
- echo x - lq-text/src/filters/NewsFilter.c 1>&2
- sed 's/^X//' >lq-text/src/filters/NewsFilter.c <<'@@@End of lq-text/src/filters/NewsFilter.c'
- X/* NewsFilter.c -- Copyright 1989 Liam R. Quin. All Rights Reserved.
- X * This code is NOT in the public domain.
- X * See the file COPYRIGHT for full details.
- X */
- X
- X/* $Id: NewsFilter.c,v 1.5 90/10/06 00:57:27 lee Rel1-10 $
- X */
- X
- X/* Filter for usenet articles.
- X * Throw away all of the header except
- X * Subject
- X * From
- X * Organi[sz]ation
- X *
- X * Probably ought to keep Message-ID, but I can't store it anyway!
- X *
- X * See FilterMain and wordrules.h for more info.
- X *
- X */
- X
- X#ifdef SYSV
- X extern int _filbuf(), _flsbuf(); /* for lint! */
- X#endif
- X#include <stdio.h>
- X#include <malloc.h>
- X#include <ctype.h>
- X#include "wordrules.h"
- X
- X#include "emalloc.h"
- X
- X#define STREQ(boy, girl) ((*(boy) == *(girl)) && !strcmp(boy, girl))
- X
- X/** C Library functions that need to be declared: **/
- X#ifndef tolower
- X extern int tolower();
- X#endif
- Xextern int strcmp();
- X/** Functions in this file that need to be declared **/
- Xint GetChar();
- Xvoid Header(), Body();
- X/** **/
- X
- Xextern char *progname;
- Xvoid Filter();
- X
- X/* Header names (in lower case) whose lines are kept and indexed.
- X *
- X * IsWanted() stops scanning as soon as it reaches an entry whose first
- X * character is greater than the first character of the word it is looking
- X * up, so the entries MUST be in ascending order of first character.
- X * The two "organi[sz]ation" spellings used to come after "subject"; the
- X * search for them gave up at "summary" and Organization lines were
- X * always thrown away.
- X */
- Xchar *KeepThese[] = {
- X    "from",
- X    "keywords",
- X    "organisation",
- X    "organization",
- X    "subject",
- X    "summary",
- X    0
- X};
- X
- X/* icstreq -- case-insensitive test for string equality.
- X * Returns 1 if s1 and s2 have the same length and differ at most in the
- X * case of their letters, and 0 otherwise.  Note that the sense of the
- X * result is the opposite of strcmp(), which returns 0 for equal strings.
- X */
- Xint icstreq(s1, s2)
- X    char *s1, *s2;
- X{
- X    register char ch1, ch2;
- X
- X    for (; *s1 && *s2; s1++, s2++) {
- X        if (*s1 == *s2) continue;
- X
- X        /* The two characters differ, so they can only match if exactly
- X         * one of them is an upper case letter: fold that one and compare.
- X         */
- X        if (isupper(*s1)) {
- X            ch1 = tolower(*s1);
- X            ch2 = *s2;
- X        } else if (isupper(*s2)) {
- X            ch1 = *s1;
- X            ch2 = tolower(*s2);
- X        } else {
- X            return 0;
- X        }
- X        if (ch1 != ch2) return 0;
- X    }
- X
- X    /* The strings are equal only if both ended at the same time. */
- X    return (!*s1 && !*s2) ? 1 : 0;
- X}
- X
- X/* IsWanted -- is String one of the header names listed in KeepThese[]?
- X * Returns 1 if String matches an entry (ignoring case), otherwise 0.
- X * The scan is abandoned at the first entry whose initial character is
- X * greater than the (lower-cased) first character of String, so the
- X * function relies on KeepThese[] being ordered by first character.
- X */
- Xint
- XIsWanted(String)
- X    char *String;
- X{
- X    int First = String[0];
- X    int i = 0;
- X
- X    if (isupper(First)) {
- X        First = tolower(First);
- X    }
- X
- X    /* The table ends at a null pointer (or at an empty string). */
- X    while (KeepThese[i] != 0 && KeepThese[i][0] != '\0') {
- X        if (First < KeepThese[i][0]) {
- X            break; /* gone too far */
- X        }
- X        if (icstreq(String, KeepThese[i])) {
- X            return 1;
- X        }
- X        i++;
- X    }
- X    return 0;
- X}
- X
- Xvoid
- XFilter(InputFile, Name) /* filter one article: the header first, then the body */
- X FILE *InputFile; /* stream the article is read from */
- X char *Name; /* passed on to Header() and Body(); Header() prints it in its warning */
- X{
- X Header(InputFile, Name); /* returns after the blank line that ends the header, or at EOF */
- X Body(InputFile, Name); /* copies whatever input remains */
- X}
- X
- Xtypedef enum { /* how far Header() has got within the current header line */
- X F_NotSeenAnythingYet, /* no word character seen on this line so far */
- X F_InTheFirstWord, /* collecting the first word of the line into the buffer */
- X F_AfterTheFirstWord /* the first word has been looked up and written out */
- X} t_FirstWord;
- X
- Xint InWord = 0; /* non-zero while the last character read was part of a word; used by Header(), Body() and GetChar() */
- X
- X/* Header -- copy the header of an article to stdout, hiding unwanted lines.
- X *
- X * Characters are read with GetChar() until the blank line that ends the
- X * header has been seen (a newline is written for it and the function
- X * returns).  The first word of every line is collected in a static buffer
- X * that grows as needed, and is looked up with IsWanted():
- X *   - on wanted lines, word characters and white space are copied and
- X *     any other character is written as a space;
- X *   - on all other lines each word is rewritten as a 'q' followed by
- X *     x's, so that it is not indexed but word counts are unaffected.
- X * If end of input is reached before a blank line, a warning that
- X * mentions Name is written to stderr.
- X *
- X * InputFile is the stream to read; Name is used only in the warning.
- X */
- Xvoid
- XHeader(InputFile, Name)
- X    FILE *InputFile;
- X    char *Name;
- X{
- X    int AtStartOfLine = 1;
- X    int IgnoreLine = 1; /* initialised for lint and gcc -W really... */
- X    t_FirstWord FirstWord = F_NotSeenAnythingYet;
- X    int ch;
- X    static int BufLen;
- X    static char *Buffer = 0;
- X    /* Must start out set: the default case below tests this flag for the
- X     * text after the first word of the very first header line, before any
- X     * newline or non-word character has had a chance to assign it.
- X     */
- X    int AtStartOfWord = 1;
- X    register char *q;
- X
- X    if (Buffer == 0) {
- X        BufLen = 24;
- X        Buffer = emalloc(BufLen);
- X    }
- X
- X    q = Buffer;
- X    InWord = 0;
- X
- X    while ((ch = GetChar(InputFile)) != EOF) {
- X        if (ch == '\n') {
- X            if (AtStartOfLine) { /* a blank line */
- X                putchar('\n');
- X                return;
- X            }
- X        }
- X
- X        InWord = InWord ? WithinWord(ch) : StartsWord(ch);
- X
- X        switch (FirstWord) {
- X        case F_NotSeenAnythingYet:
- X            if (InWord) {
- X                FirstWord = F_InTheFirstWord;
- X                /* keep one byte spare for the terminating '\0' */
- X                if (q - Buffer >= BufLen - 1) {
- X                    int where = q - Buffer;
- X
- X                    BufLen += 24;
- X                    Buffer = erealloc(Buffer, BufLen);
- X                    q = &Buffer[where];
- X                }
- X                *q++ = ch;
- X            } else {
- X                putchar(' ');
- X            }
- X            break;
- X        case F_InTheFirstWord:
- X            if (InWord) {
- X                if (q - Buffer >= BufLen - 1) {
- X                    int where = q - Buffer;
- X
- X                    /* grow by one 24-byte step, as in the case above
- X                     * (BufLen used to be incremented twice here)
- X                     */
- X                    BufLen += 24;
- X                    Buffer = erealloc(Buffer, BufLen);
- X                    q = &Buffer[where];
- X                }
- X                *q++ = ch;
- X                break;
- X            } else { /* reached the end of the first word on the line */
- X                *q = '\0';
- X                /* See if it's a keyword */
- X                if ((IgnoreLine = !IsWanted(Buffer)) != 0) {
- X                    /* Turn the word into one that won't get indexed,
- X                     * so that word counts are unaffected:
- X                     * We use qxxxxxxx (any number of x's) for this.
- X                     */
- X                    for (q = Buffer; *q; q++) {
- X                        putchar((q == Buffer) ? 'q' : 'x');
- X                    }
- X                    putchar (ch == '\n' ? '\n' : ' ');
- X                } else {
- X                    printf("%s%c", Buffer, ch == '\n' ? ch : ' ');
- X                }
- X                FirstWord = F_AfterTheFirstWord;
- X            }
- X            break;
- X        default:
- X            if ((AtStartOfLine = (ch == '\n'))) {
- X                IgnoreLine = 0;
- X                q = Buffer;
- X                FirstWord = F_NotSeenAnythingYet;
- X                AtStartOfWord = 1;
- X            }
- X            if (InWord && !IgnoreLine) {
- X                putchar(ch);
- X            } else {
- X                if (AtStartOfWord && InWord) {
- X                    putchar('q');
- X                    AtStartOfWord = 0;
- X                } else if (InWord) {
- X                    putchar('x');
- X                } else if (isspace(ch)) {
- X                    putchar(ch);
- X                } else {
- X                    putchar(' ');
- X                }
- X            }
- X            if (!InWord) AtStartOfWord = 1;
- X        }
- X        /* A newline starts a fresh header line whatever state we were in */
- X        if ((AtStartOfLine = (ch == '\n'))) {
- X            IgnoreLine = 0;
- X            q = Buffer;
- X            FirstWord = F_NotSeenAnythingYet;
- X            AtStartOfWord = 1;
- X        }
- X    }
- X    if (ch == EOF) {
- X        fprintf(stderr, "%s: warning: Mail folder %s has no message body\n",
- X                                                        progname, Name);
- X    }
- X}
- X
- X/* Body -- copy the rest of the article to stdout.
- X * Characters that belong to a word are copied unchanged, newlines are
- X * kept, and every other character is written as a single space.
- X * The global InWord is updated for every character read.
- X * Name is accepted for symmetry with Header() but is not used here.
- X */
- Xvoid
- XBody(InputFile, Name)
- X    FILE *InputFile;
- X    char *Name;
- X{
- X    int ch;
- X
- X    for (;;) {
- X        ch = GetChar(InputFile);
- X        if (ch == EOF) {
- X            break;
- X        }
- X
- X        InWord = InWord ? WithinWord(ch) : StartsWord(ch);
- X
- X        if (InWord) {
- X            putchar(ch);
- X        } else if (ch == '\n') {
- X            putchar('\n');
- X        } else {
- X            putchar(' ');
- X        }
- X    }
- X}
- X
- X#ifdef __GNU__ /* NOTE(review): gcc predefines __GNUC__ (the spelling globals.h tests), not __GNU__ -- confirm which was meant */
- Xinline
- X#endif
- Xint
- XGetChar(fd) /* return the next input character, or EOF; single quotes get special treatment */
- X FILE *fd;
- X{
- X static int LastChar = 0; /* one character of look-ahead; 0 means nothing is pending */
- X
- X if (LastChar) { /* hand back the character read ahead by an earlier call */
- X int ch = LastChar;
- X LastChar = 0;
- X return ch;
- X }
- X
- X /* Only return a single quote if we are inside a word and a letter
- X  * follows it; any other single quote is turned into a space.  The
- X  * character read after the quote stays in LastChar for the next call.
- X  */
- X if ((LastChar = getc(fd)) == '\'') {
- X LastChar = getc(fd);
- X if (InWord && isalpha(LastChar)) return '\'';
- X else return ' ';
- X } else { /* an ordinary character: return it and leave nothing pending */
- X int ch = LastChar;
- X LastChar = 0;
- X return ch;
- X }
- X}
- X
- X/*
- X * $Log: NewsFilter.c,v $
- X * Revision 1.5 90/10/06 00:57:27 lee
- X * Prepared for first beta release.
- X *
- X * Revision 1.4 90/09/20 16:36:59 lee
- X * Fixed icstrcmp() and IsWanted() so that the unwanted parts of headers
- X * get deleted again.... (oops!)
- X *
- X * Revision 1.3 90/09/19 21:19:50 lee
- X * Now supports turning unindexed stuff into qxxxxx-words.
- X *
- X * Revision 1.2 90/08/29 21:56:58 lee
- X * Alpha release.
- X *
- X * Revision 1.1 90/08/09 19:17:57 lee
- X * Initial revision
- X *
- X * Revision 1.2 89/09/16 21:16:01 lee
- X * First demonstratable version.
- X *
- X * Revision 1.1 89/09/07 21:05:48 lee
- X * Initial revision
- X *
- X */
- @@@End of lq-text/src/filters/NewsFilter.c
- echo x - lq-text/src/h/Liamdbm.h 1>&2
- sed 's/^X//' >lq-text/src/h/Liamdbm.h <<'@@@End of lq-text/src/h/Liamdbm.h'
- X/* Liamdbm.h -- Copyright 1989 Liam R. Quin. All Rights Reserved.
- X * This code is NOT in the public domain.
- X * See the file COPYRIGHT for full details.
- X */
- X
- X/* $Id: Liamdbm.h,v 1.2 90/10/06 02:18:14 lee Rel1-10 $
- X *
- X * This is used with gdbm. I have not linked with gdbm, and, if you
- X * wish to do so, you must be careful not to violate any copyright
- X * notices... (sigh)
- X *
- X * The version of gdbm for which I had a manual is rather old and had no
- X * ndbm compatibility.
- X */
- X
- X#include "gdbm.h"
- Xextern datum gdbm_fetch();
- Xextern datum gdbm_firstkey();
- Xextern datum gdbm_nextkey();
- X
- Xtypedef char DBM; /* stand-in so that code written for ndbm can still declare DBM pointers */
- X
- X#define dbm_store(db, key, data, mode) gdbm_store(db, key, data) /* the ndbm mode argument is dropped */
- X/* gdbm_open is stupider than ndbm_open.... */
- X#define dbm_open(FileName, Mode, m) gdbm_open(FileName, 512, Mode, 0) /* the third ndbm argument (m) is dropped */
- X#define dbm_fetch gdbm_fetch /* the remaining calls map one-to-one */
- X#define dbm_close gdbm_close
- X#define dbm_firstkey gdbm_firstkey
- X#define dbm_nextkey gdbm_nextkey
- X
- X/*
- X * $Log: Liamdbm.h,v $
- X * Revision 1.2 90/10/06 02:18:14 lee
- X * Prepared for first beta release.
- X *
- X *
- X */
- @@@End of lq-text/src/h/Liamdbm.h
- echo x - lq-text/src/h/Revision.h 1>&2
- sed 's/^X//' >lq-text/src/h/Revision.h <<'@@@End of lq-text/src/h/Revision.h'
- X/* This header file gets updated with every distributed change to any source
- X * file anywhere in the lq-text package.
- X * A short description of the change is added to the Log here, too.
- X * Lee.
- X */
- X
- X#define LQTEXTREVISION "Release 1.10" /* release string for the lq-text package as a whole */
- X
- X/* $Revision: 1.10 $
- X *
- X * Revision 1.6 90/10/04 17:12:45 lee
- X * lqtext now compiles and mostly works under BSD.
- X * Fixes bug in phrase matching -- PhraseMatchLevel now works on one-word
- X * phrases.
- X *
- X * Revision 1.5 90/09/28 22:19:36 lee
- X * Made GetChar() a macro in lqaddfile -- speed improvement...
- X *
- X * Revision 1.4 90/09/20 16:37:35 lee
- X * Fixed Mail and News filters so that they throw away the unwanted header
- X * parts correctly.
- X *
- X * Revision 1.3 90/09/20 12:51:24 lee
- X * Major sdbm initialisation bug fixed.
- X *
- X * Revision 1.2 90/09/20 11:52:35 lee
- X * Fixed the filters so that lqshow highlights the right word (the qxx fix)
- X *
- X * Revision 1.1 90/09/20 11:52:18 lee
- X * Initial revision
- X *
- X *
- X */
- @@@End of lq-text/src/h/Revision.h
- echo x - lq-text/src/h/blkheader.h 1>&2
- sed 's/^X//' >lq-text/src/h/blkheader.h <<'@@@End of lq-text/src/h/blkheader.h'
- X/* blkheader.h -- Copyright 1989 Liam R. Quin. All Rights Reserved.
- X * This code is NOT in the public domain.
- X * See the file COPYRIGHT for full details.
- X *
- X * (was called blockheader.h, but this was too long on SysV for RCS)
- X */
- X
- X/* describe the physical WordPlace database...
- X *
- X * $Header: /usr/src/cmd/lq-text/src/h/RCS/blkheader.h,v 1.2 90/03/20 20:57:46 lee Rel1-10 $
- X *
- X * $Log: blkheader.h,v $
- X * Revision 1.2 90/03/20 20:57:46 lee
- X * removed WID from the block. This reduces checking, but should also
- X * noticeably reduce the size of the database.
- X *
- X * Revision 1.1 90/03/20 20:54:44 lee
- X * Initial revision
- X *
- X */
- X
- X/* The header of each block -- I can't use sReadNumber, because I don't know
- X * the size of NextOffset until I get to the end, and it's too late by then!
- X *
- X * I should really store the block offset, and not the byte offset. This
- X * would save a whole byte -- I could use 3 bytes for the NextBlock!
- X */
- Xtypedef struct { /* layout of the start of each block, as described in the comment above */
- X unsigned long NextOffset; /* a byte offset */
- X char Data[1]; /* the address of this is where the numbers start... */
- X} t_BlockHeader;
- @@@End of lq-text/src/h/blkheader.h
- echo x - lq-text/src/h/emalloc.h 1>&2
- sed 's/^X//' >lq-text/src/h/emalloc.h <<'@@@End of lq-text/src/h/emalloc.h'
- X/* emalloc.h -- Copyright 1989 Liam R. Quin. All Rights Reserved.
- X * This code is NOT in the public domain.
- X * See the file COPYRIGHT for full details.
- X */
- X
- X/* emalloc.h -- header file for emalloc.c, Liam Quin's malloc() wrapper
- X *
- X * $Id: emalloc.h,v 1.5 91/03/02 19:40:04 lee Rel1-10 $
- X *
- X * $Log: emalloc.h,v $
- X * Revision 1.5 91/03/02 19:40:04 lee
- X * Simpler version of malloc defines if MALLOCTRACE unused...
- X *
- X * Revision 1.4 91/03/02 18:31:21 lee
- X * Simpler call to malloc wrappers if MALLOCTRACE undefined.
- X *
- X * Revision 1.3 90/10/06 02:18:26 lee
- X * Prepared for first beta release.
- X *
- X * Revision 1.2 90/08/29 21:57:44 lee
- X * removed most of the testing code
- X *
- X * Revision 1.1 90/08/09 19:14:48 lee
- X * Initial revision
- X *
- X * Revision 2.2 89/10/08 20:45:20 lee
- X * Working version of nx-text engine. Addfile and wordinfo work OK.
- X *
- X *
- X */
- X
- Xextern int _LiamIsInCurses; /* set by InitScr(), cleared again by EndWin() */
- X
- X#define InitScr() (_LiamIsInCurses = initscr()) /* call initscr() and record its result in the flag */
- X#define EndWin() (_LiamIsInCurses ? (_LiamIsInCurses = 0), endwin() : 0) /* call endwin() only if the flag is set, clearing it first */
- X
- Xextern char *_emalloc(), *_erealloc(), *_ecalloc();
- Xextern void _efree();
- X
- X#ifdef MALLOCTRACE /* tracing build: every call also passes the caller's file name and line number */
- X#define emalloc(u) _emalloc(u, __FILE__, __LINE__)
- X#define erealloc(s, u) _erealloc(s, u, __FILE__, __LINE__)
- X#define ecalloc(n, siz) _ecalloc(n, siz, __FILE__, __LINE__)
- X#define efree(s) _efree(s, __FILE__, __LINE__)
- X#else /* otherwise the public names are plain aliases for the wrappers */
- X#define emalloc _emalloc
- X#define erealloc _erealloc
- X#define ecalloc _ecalloc
- X#define efree _efree
- X#endif
- @@@End of lq-text/src/h/emalloc.h
- echo x - lq-text/src/h/fileinfo.h 1>&2
- sed 's/^X//' >lq-text/src/h/fileinfo.h <<'@@@End of lq-text/src/h/fileinfo.h'
- X/* fileinfo.h -- Copyright 1989 Liam R. Quin. All Rights Reserved.
- X * This code is NOT in the public domain.
- X * See the file COPYRIGHT for full details.
- X */
- X
- X/* Internal structure used by NX-Text to represent a file */
- X
- X/* Needs: sys/types.h */
- X
- X/* $Id: fileinfo.h,v 1.2 90/10/06 02:18:27 lee Rel1-10 $
- X *
- X * $Log: fileinfo.h,v $
- X * Revision 1.2 90/10/06 02:18:27 lee
- X * Prepared for first beta release.
- X *
- X * Revision 1.1 90/08/09 19:14:57 lee
- X * Initial revision
- X *
- X * Revision 2.2 89/10/08 20:45:57 lee
- X * Working version of nx-text engine. Addfile and wordinfo work OK.
- X *
- X * Revision 2.1 89/10/02 01:14:29 lee
- X * New index format, with Block/WordInBlock/Flags/BytesSkipped info.
- X *
- X * Revision 1.2 89/09/16 21:15:19 lee
- X * First demonstratable version.
- X *
- X * Revision 1.1 89/09/07 21:00:34 lee
- X * Initial revision
- X *
- X *
- X */
- X
- Xtypedef unsigned long t_FID; /* numeric identifier for a file */
- X
- Xtypedef struct {
- X char *Name; /* presumably the file's name -- confirm against the code that fills this in */
- X t_FID FID; /* File Identifier */
- X int FilterType; /* command to ASCIIify, 0 unknown, 1 none */
- X time_t Date; /* when the file was last indexed */
- X FILE *Stream; /* NOTE(review): looks like the open stream for the file; FILE needs stdio.h, which is not listed under "Needs" above */
- X} t_FileInfo;
- X
- X#define FindFile(name) ((*(name) == '/') ? (name) : _FindFile(name)) /* names starting with '/' are used as they are; others go through _FindFile() */
- Xextern char *_FindFile();
- @@@End of lq-text/src/h/fileinfo.h
- echo x - lq-text/src/h/filter.h 1>&2
- sed 's/^X//' >lq-text/src/h/filter.h <<'@@@End of lq-text/src/h/filter.h'
- X/* filter.h -- Copyright 1989 Liam R. Quin. All Rights Reserved.
- X * This code is NOT in the public domain.
- X * See the file COPYRIGHT for full details.
- X */
- X
- X/* filter.h -- define filter table for NX-Text, Liam Quin's text retrieval
- X * program.
- X * This table is built from a file like a simplified /etc/magic, normally
- X * stored in /usr/local/lib/nx-text/lib/filtertable
- X * but you can set this either here or in the Makefile.
- X *
- X * NEEDS: stdio.h
- X *
- X * $Id: filter.h,v 1.6 91/03/02 18:45:04 lee Rel1-10 $
- X *
- X * $Log: filter.h,v $
- X * Revision 1.6 91/03/02 18:45:04 lee
- X * Spell MAILFILTER correctly in the ifdef...
- X *
- X * Revision 1.5 90/10/13 03:11:31 lee
- X * Now defines filters for easier stand-alone testing of stuff...
- X *
- X * Revision 1.4 90/10/06 02:18:28 lee
- X * Prepared for first beta release.
- X *
- X * Revision 1.3 90/09/28 23:03:16 lee
- X * Now use MAILFILTER and NEWSFILTER...
- X *
- X * Revision 1.2 90/08/29 21:57:57 lee
- X * removed most of the testing code
- X *
- X * Revision 1.1 90/08/09 19:15:01 lee
- X * Initial revision
- X *
- X * Revision 2.2 89/10/08 20:46:04 lee
- X * Working version of nx-text engine. Addfile and wordinfo work OK.
- X *
- X * Revision 2.1 89/10/02 01:14:33 lee
- X * New index format, with Block/WordInBlock/Flags/BytesSkipped info.
- X *
- X *
- X */
- X
- X#define FTYPE_NEWS 1
- X#define FTYPE_MAIL 2
- X#define FTYPE_CDMS 3
- X#define FTYPE_MOSTLYASCII 4
- X#define FTYPE_C_SOURCE 5
- X
- X/* The Type field in each array entry is so that I can do some very simple
- X * checking...
- X */
- Xextern int fclose(), pclose();
- Xstruct s_FilterTable {
- X int Type; /* one of the FTYPE_ values above, or 0 */
- X int (* close)(); /* how to close the darned stream */
- X char *String; /* filter name for this type; 0 in the fopen() entry and in the terminating entry */
- X};
- X#ifndef FILTERDEF /* only a file that defines FILTERDEF gets the table itself; everyone else gets the extern */
- Xextern struct s_FilterTable FilterTable[];
- X#else
- Xstruct s_FilterTable FilterTable[] = {
- X { 0, fclose, 0 }, /* use fopen() */
- X#ifndef NEWSFILTER
- X# define NEWSFILTER "NewsFilter"
- X#endif
- X { FTYPE_NEWS, pclose, NEWSFILTER },
- X#ifndef MAILFILTER
- X# define MAILFILTER "MailFilter"
- X#endif
- X { FTYPE_MAIL, pclose, MAILFILTER },
- X#ifdef FTYPE_CDMS /* CrystalWriter from Syntactics... */
- X { FTYPE_CDMS, pclose, "CDMSFilter" },
- X#endif
- X#ifdef FTYPE_NTROFF
- X { FTYPE_NTROFF, pclose, "lqderoff" }, /* not yet released, sorry */
- X#endif
- X { FTYPE_MOSTLYASCII, pclose, "AsciiFilter" },
- X#ifdef FTYPE_C_SOURCE
- X { FTYPE_C_SOURCE, pclose, "CFilter" }, /* leave me last! */
- X#endif
- X /* If you add more, you MUST update MaxFilterType */
- X { 0, 0, 0 } /* end-of-table marker */
- X};
- X#endif
- X#define MaxFilterType FTYPE_C_SOURCE /* the largest FTYPE_ value defined above */
- @@@End of lq-text/src/h/filter.h
- echo x - lq-text/src/h/globals.h 1>&2
- sed 's/^X//' >lq-text/src/h/globals.h <<'@@@End of lq-text/src/h/globals.h'
- X/* globals.h -- Copyright 1989 Liam R. Quin. All Rights Reserved.
- X * This code is NOT in the public domain.
- X * See the file COPYRIGHT for full details.
- X *
- X * $Id: globals.h,v 1.6 91/02/20 19:26:53 lee Rel1-10 $
- X *
- X * (see Log at end of this file for change history. Keep this up to date
- X * using rcs if you have it...)
- X */
- X
- X/* globals.h -- declarations of globally accessible variables, and also
- X * of configurable parameters.
- X *
- X * Some of the configuration options might be given in ../Makefile, so
- X * you must check in there too.
- X *
- X * Everything that includes this file must be linked with Defaults.c
- X */
- X
- X/*
- X * DOCPATH gives the list of directories in which to search in order
- X * to find files to retrieve and to index. The default can be wired
- X * in here, or can be simply "." (in which case relative pathnames will
- X * be from wherever one invokes the commands, and absolute pathnames
- X * will be absolute). For example,
- X * #define DFLTDOCPATH "/usr/man:."
- X * In any case, it can be overridden by a DOCPATH line in the configuration
- X * file for a given database (README in the database directory), and also
- X * by an environment variable DOCPATH (the latter taking precedence over
- X * the former).
- X *
- X * Use ((char *) 0) to disable the default -- in this case, you always have
- X * to give one, either with the $DOCPATH variable or in the database file.
- X *
- X */
- X#ifndef DFLTDOCPATH
- X# define DFLTDOCPATH ((char *) 0)
- X#endif
- X
- X/* LQTEXTDIR: if the programs can't find the directory to use -- i.e.,
- X * there was no -d option and $(LQTEXTDIR) is unset, we either
- X * look in UNDERHOME (if that was defined here) or in wherever LQTEXTDIR
- X * is defined to point.
- X */
- X#ifndef LQTEXTDIR
- X# define LQTEXTDIR "/usr/spool/lqtextdir"
- X#endif
- X
- X/* If UNDERHOME is set, look there for a directory -- e.g.
- X * #define UNDERHOME "sockdrawer"
- X * would make lqtext programs look for a directory something like
- X * /users/liam/sockdrawer
- X * (where /users/liam is my login directory)
- X */
- X#ifndef UNDERHOME
- X# define UNDERHOME "LQTEXTDIR"
- X#endif
- X
- X/* The name of a configuration file found in the database directory:
- X */
- X#define CONFIGFILE "README"
- X
- X/* If the config file doesn't give a filename for a list of common
- X * words, we look for one called DFLTCOMMONFILE (and don't mind if we
- X * don't find it). Use "/dev/null" or ((char *) 0) if you want to
- X * disable the default.
- X * It's case sensitive, of course.
- X */
- X#define DFLTCOMMONFILE "CommonWords"
- X
- X#ifndef PAGER
- X/* The default pager to use if the user doesn't set $PAGER. This is only
- X * used in lqshow, the browser. Good things to try are
- X * more, "less -Ce", and (generally only on System V) "pg -ns".
- X * Specify an absolute path if possible. It's often a lot faster, and
- X * it's somewhat safer...
- X */
- X# ifdef BSD
- X# define PAGER "/usr/ucb/more"
- X# else
- X# define PAGER "/usr/bin/pg -ns"
- X# endif
- X#endif
- X
- X#ifndef DBMCREAT
- X/* If you are using dbm or gdbm (?), you will need to create the dbm files
- X * by hand yourself. Defining DBMCREAT as 0 makes the software do this
- X * automatically, with a very slight performance penalty.
- X *
- X * ndbm and sdbm can use O_CREAT, so set it to 1 here for them.
- X * You will also have to look at ../Makefile, ../PORTING, smalldb.h and
- X * ../lqlib/smalldb.h, making whatever changes are needed.
- X */
- X# define DBMCREAT 1 /* 1 for ndbm, 0 for dbm */
- X#endif
- X
- X#ifdef sparc
- X# define NEEDALIGN
- X#endif
- X
- X#ifdef mips /* e.g. SGI machines */
- X# define NEEDALIGN
- X#endif
- X
- X/* NEEDALIGN is for C compilers that require C structures to start at
- X * word boundaries. You need this on sparc and sgi machines...
- X */
- X
- X/***
- X *** If you want to change anything beyond here...
- X ***
- X *** well, you can.
- X *** After all, it's your copy.
- X ***
- X *** But don't come running back to me if it doesn't work!
- X *** At least not until you have tried
- X *** + understanding what the problem is;
- X *** + looking at the source to see why;
- X *** + fixing the problem;
- X *** + taking off your shoes and socks and grinning for a while.
- X ***
- X *** Liam.
- X ***
- X ***/
- X
- X/* The following let you reconfigure the names of the files that form
- X * part of the database, but there is no point in doing so unless you
- X * are porting to some strange system that has absurd filename restrictions!
- X */
- X#ifndef WORDINDEX
- X# define WORDINDEX "wordlist"
- X /* This is a dbm file, so you'll get two files, one with ".pag"
- X * stuck on the end and one with ".dir" on the end.
- X * It contains an entry for every word in the database, enabling
- X * the software to go from a word to an integer (well, a t_WID)
- X * very quickly.
- X * It tends to be a little over one tenth of the size of the DATABASE.
- X */
- X#endif
- X#ifndef WIDINDEXFILE
- X# define WIDINDEXFILE "WIDIndex"
- X /* WIDINDEXFILE contains each word in the database, together with some
- X * information and the first few bytes of data.
- X * It contains WIDBLOCKSIZE bytes for every word, but this has to
- X * be at least MAXWORDLEN + 10 bytes long (see WordInfo.c).
- X */
- X#endif
- X#ifndef DATABASE
- X# define DATABASE "data"
- X /* For those words whose data doesn't fit into the first WIDBLOCKSIZE
- X * bytes, space is allocated in this file in BLOCKSIZE chunks. Make
- X * BLOCKSIZE small, or you will waste a lot of space -- on the other
- X * hand, there's a 4-byte-per-block overhead at the moment.
- X * This file gets very b i g indeed.
- X */
- X#endif
- X#ifndef FILEINDEX
- X# define FILEINDEX "FileList"
- X /* This is a list of every file in the database, again in dbm format,
- X * so there are actually two files (a .pag and a .dir) involved.
- X * If your files are short, it will quickly grow to a tenth of the size
- X * of the database.
- X * It stores the filename, and some other information.
- X */
- X#endif
- X#ifndef FIDFILE
- X# define FIDFILE "FIDFile"
- X /* This contains the largest currently used file number... you can
- X * look at it to see how many files have been indexed.
- X * It is only a few bytes long.
- X */
- X#endif
- X#ifndef WIDFILE
- X# define WIDFILE "WIDFile"
- X /* This contains the largest currently used word number... you can
- X * look at it to see how many unique words have been seen.
- X * It is only a few bytes long.
- X */
- X#endif
- X
- X#ifndef WIDBLOCKSIZE
- X# define WIDBLOCKSIZE 32
- X/* WIDBLOCKSIZE absolutely must be large enough to fit at least one byte
- X * of actual data, or all hell will break loose.
- X * (actually that could be fixed...).
- X * In any case, it has to contain (apart from the >= 1 byte of data):
- X * + the length count (1 byte) and the word itself (no null on the end)
- X * + the block number in the database (1..5 bytes)
- X * + the number of matches (1..5 bytes)
- X *
- X * It helps efficiency very, very slightly if these are a power of two
- X * bytes, as then they never cross Unix block boundaries.
- X *
- X */
- X#endif
- X
- X#ifndef BLOCKSIZE
- X#define BLOCKSIZE 64
- X/* BLOCKSIZE is the size of blocks in the data file. There are several
- X * tradeoffs:
- X * + there is a 4-bytes-per-block overhead for list pointers, so it's
- X * a good idea to make them large
- X * + there's a bit of work involved in fetching the blocks, so things go
- X * faster if they're larger...
- X * + many blocks are not full, so it's a good idea to make them small.
- X * On average, a little over (BLOCKSIZE - 4) / 2 bytes are wasted for
- X * every word chain.
- X * + since many of the blocks are not full, it's a good idea to make them
- X * small, minimising the amount of extra data that gets copied around by
- X * the Unix kernel. If the blocks are smaller it'll go faster...
- X *
- X * It helps efficiency very, very slightly if these are a power of two
- X * bytes, as then they never cross Unix block boundaries.
- X *
- X */
- X#endif
- X
- X/**** Some useful macros */
- X
- X/* STREQ(a,b) is much faster than strcmp() in the (common) case that the
- X * first character of the strings differ.
- X * It is due (as far as I know) to Henry Spencer, at the University of
- X * Toronto Zoology Dept.,
- X * utzoo!henry
- X */
- X#ifndef STREQ
- X# define STREQ(henry,utzoo) (*(henry) == *(utzoo) && !strcmp(henry, utzoo))
- X#endif
- X
- X/* Inline functions are functions that get expanded inline during
- X * compilation -- sort of like macros with real local arguments.
- X * Not all compilers support them.
- X */
- X#ifdef __GNUC__
- X# define INLINE inline
- X#else
- X# define INLINE /* not supported */
- X#endif
- X
- X#ifdef DefineThem /* NOTE(review): presumably set only by Defaults.c, which the comment at the top says must be linked in -- confirm */
- X# define DECL(name, type, value) type name = value
- X# define EXTERN /* just define them please */
- X#else
- X# define EXTERN extern /* declare but do not define */
- X# define DECL(name, type, value) EXTERN type name
- X#endif
- X
- X/****/
- X
- X/* Now declare (or define) things: */
- X
- Xextern char *progname; /* from progname.c, for error messages */
- XDECL(CommonWordFile, char *, DFLTCOMMONFILE); /* the value argument is used only when DefineThem is set */
- XDECL(DatabaseDir, char *, LQTEXTDIR);
- XDECL(FileIndex, char *, FILEINDEX);
- XDECL(WordIndex, char *, WORDINDEX);
- XDECL(DataBase, char *, DATABASE);
- XDECL(FidFile, char *, FIDFILE);
- XDECL(WidFile, char *, WIDFILE);
- XDECL(WidIndexFile, char *, WIDINDEXFILE);
- XDECL(DocPath, char *, DFLTDOCPATH); /* may start out as a null pointer: DFLTDOCPATH defaults to ((char *) 0) */
- X
- X/*
- X * $Log: globals.h,v $
- X * Revision 1.6 91/02/20 19:26:53 lee
- X * Added NEEDALIGN on mips systems
- X * (thanks to Mark Moraes, moraes@cs.toronto.edu)
- X *
- X * Revision 1.5 90/10/07 20:41:20 lee
- X * Added NEEDALIGN for fussy architectures.
- X *
- X * Revision 1.4 90/10/06 02:21:21 lee
- X * Prepared for first beta release.
- X *
- X * Revision 1.3 90/10/03 21:31:54 lee
- X * Added definition of PAGER, which has moved here from lqshow.c
- X *
- X * Revision 1.2 90/08/09 19:15:03 lee
- X * after BSD lint and saber-C
- X *
- X * Revision 1.1 90/03/23 17:32:11 lee
- X * Initial revision
- X *
- X *
- X */
- @@@End of lq-text/src/h/globals.h
- echo x - lq-text/src/h/numbers.h 1>&2
- sed 's/^X//' >lq-text/src/h/numbers.h <<'@@@End of lq-text/src/h/numbers.h'
- X/* numbers.h -- Copyright 1989 Liam R. Quin. All Rights Reserved.
- X * This code is NOT in the public domain.
- X * See the file COPYRIGHT for full details.
- X */
- X
- X/* ReadNumber and WriteNumber take/return a long, using a compression
- X * algorithm to reduce the amount of data taken.
- X *
- X * The variants prefixed with an s use (char *) pointers instead.
- X *
- X * $Id: numbers.h,v 1.3 90/10/06 02:18:30 lee Rel1-10 $
- X *
- X */
- X
- Xextern INLINE unsigned long fReadNumber(); /* NOTE(review): INLINE is not defined in this header; globals.h defines it, so presumably that must be included first */
- Xextern INLINE unsigned long sReadNumber(); /* s prefix: the (char *) pointer variant, per the comment above */
- X
- Xextern INLINE void fWriteNumber();
- Xextern INLINE void sWriteNumber(); /* s prefix: the (char *) pointer variant, per the comment above */
- X
- X/*
- X * $Log: numbers.h,v $
- X * Revision 1.3 90/10/06 02:18:30 lee
- X * Prepared for first beta release.
- X *
- X * Revision 1.2 90/08/09 19:15:42 lee
- X * after BSD lint and saber-C
- X *
- X * Revision 1.1 90/04/19 19:27:04 lee
- X * Initial revision
- X *
- X * Revision 2.2 89/10/08 20:46:43 lee
- X * Working version of nx-text engine. Addfile and wordinfo work OK.
- X *
- X * Revision 1.2 89/09/16 21:15:40 lee
- X * First demonstratable version.
- X *
- X * Revision 1.1 89/09/07 21:06:02 lee
- X * Initial revision
- X *
- X */
- @@@End of lq-text/src/h/numbers.h
- echo x - lq-text/src/h/pblock.h 1>&2
- sed 's/^X//' >lq-text/src/h/pblock.h <<'@@@End of lq-text/src/h/pblock.h'
- X/* pblock.h -- Copyright 1989 Liam R. Quin. All Rights Reserved.
- X * This code is NOT in the public domain.
- X * See the file COPYRIGHT for full details.
- X */
- X
- X#ifndef PBLOCK_H /* the matching endif is at the end of the file... */
- X
- X# define PBLOCK_H
- X/* The physical Word Database...
- X *
- X * First, there is the WID (from 1 to 4 bytes)
- X *
- X * Then, there is a NEXT pointer (or 0).
- X *
- X * Then, there is a list of (FID, OFFSET) pairs.
- X *
- X * $Header: /usr/src/cmd/lq-text/src/h/RCS/pblock.h,v 1.2 90/08/09 19:15:45 lee Rel1-10 $
- X *
- X * $Log: pblock.h,v $
- X * Revision 1.2 90/08/09 19:15:45 lee
- X * after BSD lint and saber-C
- X *
- X * Revision 1.1 90/03/01 23:54:37 lee
- X * Initial revision
- X *
- X * Revision 2.2 89/10/08 20:47:04 lee
- X * Working version of nx-text engine. Addfile and wordinfo work OK.
- X *
- X * Revision 2.1 89/10/02 01:15:36 lee
- X * New index format, with Block/WordInBlock/Flags/BytesSkipped info.
- X *
- X * Revision 1.2 89/09/16 21:15:43 lee
- X * First demonstratable version.
- X *
- X * Revision 1.1 89/09/07 21:06:09 lee
- X * Initial revision
- X *
- X *
- X */
- X
- Xtypedef struct {
- X t_FID FID; /* file the word occurs in (t_FID is declared outside this header) */
- X unsigned long BlockInFile; /* which block of that file -- presumably in FileBlockSize units; confirm */
- X unsigned short WordInBlock; /* which word within that block */
- X unsigned short Flags; /* presumably the WPF_* bits from wordrules.h -- confirm */
- X unsigned char StuffBefore; /* preceding ignored garbage */
- X} t_WordPlace;
- X
- X/* This structure is really only used by addfile; elsewhere arrays of
- X * WordPlace are used.
- X */
- X
- Xtypedef struct s_WordPlaceList {
- X char *Word; /* the word text */
- X t_WordPlace WordPlace; /* one occurrence of Word */
- X struct s_WordPlaceList *Next; /* next list entry */
- X} t_WordPlaceList;
- X
- X/* Warning: One cannot use structure copy for a pblock! */
- X
- X/* This does *NOT* correspond to the physical disk layout -- see pblock.c */
- Xtypedef struct {
- X t_WID WID; /* for checking; */
- X unsigned long ChainStart; /* NOTE(review): presumably where the on-disk block chain begins -- confirm in pblock.c */
- X unsigned long NumberOfWordPlaces; /* presumably the number of entries in WordPlaces -- confirm */
- X t_WordPlace WordPlaces[1]; /* made by joining lots of disk blocks... */
- X} t_pblock;
- X
- X#endif
- @@@End of lq-text/src/h/pblock.h
- echo x - lq-text/src/h/phrase.h 1>&2
- sed 's/^X//' >lq-text/src/h/phrase.h <<'@@@End of lq-text/src/h/phrase.h'
- X/* phrase.h -- Copyright 1989 Liam R. Quin. All Rights Reserved.
- X * This code is NOT in the public domain.
- X * See the file COPYRIGHT for full details.
- X */
- X
- X/* LQ-Text -- Liam's Text Retrieval Package
- X * Liam R. Quin, September 1989, and later...
- X *
- X * phrase.h -- data structures for handling entire phrases
- X *
- X */
- X
- X/* $Id: phrase.h,v 1.2 90/10/06 02:18:33 lee Rel1-10 $
- X *
- X */
- X
- X/* Represent a Phrase as a linked list of WordInfo pointers, plus a list
- X * of matches.
- X */
- X
- Xtypedef struct s_PhraseItem {
- X t_WordInfo *Word; /* index information for this word of the phrase */
- X struct s_PhraseItem *Next; /* next word in the phrase */
- X unsigned long SearchIndex; /* For phrase-matching */
- X char *WordStart; /* pointer into original phrase */
- X} t_PhraseItem;
- X
- Xtypedef enum {
- X PCM_AnyCase, /* Ignore case entirely */
- X PCM_HalfCase, /* Upper only matches upper; lower matches either */
- X PCM_SameCase, /* Exact matching; NOTE(review): the trailing comma is not valid in C89 (allowed from C99) */
- X} t_PhraseCaseMatch;
- X
- Xtypedef struct s_Match {
- X t_WID WID; /* identifier of the matched word */
- X t_WordPlace *Where; /* the occurrence that matched */
- X struct s_Match *Next; /* next entry in this chain -- presumably the next word of the same phrase match; confirm */
- X} t_Match;
- X
- Xtypedef struct s_MatchList {
- X t_Match *Match; /* one chain of t_Match entries */
- X struct s_MatchList *Next; /* next chain in the list */
- X} t_MatchList;
- X
- X
- Xtypedef struct s_Phrase {
- X t_PhraseItem *Words; /* list of words and pblocks */
- X char *OriginalString; /* as supplied by the user */
- X char *ModifiedString; /* after deleting short/unindexed words */
- X unsigned long NumberOfMatches; /* presumably the length of the Matches list -- confirm */
- X t_MatchList *Matches; /* matches recorded for this phrase */
- X struct s_Phrase *Next; /* for use when we're in a list of phrases... */
- X unsigned short HasUnknownWords; /* presumably non-zero if some word is not in the index -- confirm */
- X} t_Phrase;
- X
- X/* This is for FilleList() */
- Xtypedef struct s_Answer {
- X char *Answer; /* one answer string */
- X struct s_Answer *Next; /* next answer in the list */
- X} t_Answer;
- X
- X/*
- X * $Log: phrase.h,v $
- X * Revision 1.2 90/10/06 02:18:33 lee
- X * Prepared for first beta release.
- X *
- X * Revision 1.1 90/08/09 19:15:49 lee
- X * Initial revision
- X *
- X * Revision 1.1 89/09/17 23:03:37 lee
- X * Initial revision
- X *
- X */
- @@@End of lq-text/src/h/phrase.h
- echo x - lq-text/src/h/smalldb.h 1>&2
- sed 's/^X//' >lq-text/src/h/smalldb.h <<'@@@End of lq-text/src/h/smalldb.h'
- X/* smalldb.h -- Copyright 1989 Liam R. Quin. All Rights Reserved.
- X * This code is NOT in the public domain.
- X * See the file COPYRIGHT for full details.
- X */
- X
- X/* $Id: smalldb.h,v 1.3 91/03/03 00:12:56 lee Exp $
- X */
- X
- X/* You must include fcntl.h before this file. */
- X
- X#ifdef ndbm /* build-time choice: the system ndbm.h */
- X# include <ndbm.h>
- X# define FoundDbmOK
- X# define NDBM
- X#endif
- X
- X#ifdef sdbm /* build-time choice: the local sdbm.h header */
- X# include "sdbm.h"
- X# define FoundDbmOK
- X# define NDBM /* it's compatible */
- X#endif
- X
- X#ifdef ozmahash /* build-time choice: the local ozmadbm.h header */
- X# include "ozmadbm.h"
- X# define FoundDbmOK
- X# define NDBM /* it's compatible as well... */
- X#endif
- X
- X#ifndef FoundDbmOK /* none of ndbm, sdbm or ozmahash was selected: fall back to Liamdbm.h */
- X# include "Liamdbm.h"
- X#endif
- X
- X#ifndef O_RDWR /* pull in fcntl.h if O_RDWR is not defined yet */
- X# include <fcntl.h>
- X#endif
- X
- X#define CACHE 2 /* size of DBM cache in startdb() -- I only use two! */
- X/* If you rip out the dbm cache stuff for use elsewhere, increase the 2
- X * to something like 5 or so!!! Each entry uses two file pointers.
- X * Lee
- X */
- X
- X#ifndef CACHE /* not taken while CACHE is defined above: plain uncached open and close */
- X# define startdb(FilePrefix) dbm_open(FilePrefix, O_RDWR|O_CREAT, 0640)
- X# define enddb(db) { if (db) dbm_close(db); }
- X#endif
- X
- X
- X#ifndef startdb /* startdb is a real function unless it was defined as a macro above */
- XDBM *startdb();
- X#endif
- X
- X#ifndef enddb
- X# ifdef CACHE
- X# define enddb(db) /* nothing to do, because of the cache */
- X# else /* NOTE(review): never compiled -- without CACHE, enddb is already a macro by this point */
- X void enddb();
- X# endif /* CACHE */
- X#endif /* !enddb */
- X
- X/*
- X * $Log: smalldb.h,v $
- X * Revision 1.3 91/03/03 00:12:56 lee
- X * Integrated ozmahash.
- X *
- X * Revision 1.2 90/10/06 02:18:36 lee
- X * Prepared for first beta release.
- X *
- X * Revision 1.1 90/08/09 19:16:00 lee
- X * Initial revision
- X *
- X * Revision 2.2 89/10/08 20:47:19 lee
- X * Working version of nx-text engine. Addfile and wordinfo work OK.
- X *
- X * Revision 2.1 89/10/02 01:16:01 lee
- X * New index format, with Block/WordInBlock/Flags/BytesSkipped info.
- X *
- X * Revision 1.2 89/09/16 21:15:45 lee
- X * First demonstratable version.
- X *
- X * Revision 1.1 89/09/07 21:06:12 lee
- X * Initial revision
- X *
- X */
- @@@End of lq-text/src/h/smalldb.h
- echo x - lq-text/src/h/wordindex.h 1>&2
- sed 's/^X//' >lq-text/src/h/wordindex.h <<'@@@End of lq-text/src/h/wordindex.h'
- X/* wordindex.h -- Copyright 1989 Liam R. Quin. All Rights Reserved.
- X * This code is NOT in the public domain.
- X * See the file COPYRIGHT for full details.
- X */
- X
- X/* (this file is currently empty, but might return...) */
- X
- X/*
- X * $Id: wordindex.h,v 1.2 90/10/06 02:18:38 lee Rel1-10 $
- X *
- X * $Log: wordindex.h,v $
- X * Revision 1.2 90/10/06 02:18:38 lee
- X * Prepared for first beta release.
- X *
- X * Revision 1.1 90/08/09 19:16:02 lee
- X * Initial revision
- X *
- X * Revision 2.1 89/10/02 01:16:06 lee
- X * New index format, with Block/WordInBlock/Flags/BytesSkipped info.
- X *
- X * Revision 1.2 89/09/16 21:15:47 lee
- X * First demonstratable version.
- X *
- X * Revision 1.1 89/09/07 21:06:13 lee
- X * Initial revision
- X *
- X *
- X */
- @@@End of lq-text/src/h/wordindex.h
- echo x - lq-text/src/h/wordinfo.h 1>&2
- sed 's/^X//' >lq-text/src/h/wordinfo.h <<'@@@End of lq-text/src/h/wordinfo.h'
- X/* wordinfo.h -- Copyright 1989 Liam R. Quin. All Rights Reserved.
- X * This code is NOT in the public domain.
- X * See the file COPYRIGHT for full details.
- X */
- X
- X/*
- X * $Id: wordinfo.h,v 1.3 90/10/06 02:21:30 lee Rel1-10 $
- X */
- X
- Xtypedef unsigned long t_WID; /* word identifier; declared before pblock.h, which uses it */
- X
- X#ifndef PBLOCK_H
- X# include "pblock.h"
- X#endif
- X
- X#ifndef WIDBLOCKSIZE /* an earlier definition of WIDBLOCKSIZE takes precedence */
- X#define WIDBLOCKSIZE 32
- X#endif
- X
- Xextern char *WidIndexFile; /* Default.c */
- X
- X/* this is a hack for speed: */
- X#define GetNextWID SpoofGetNextWID /* every use of GetNextWID becomes SpoofGetNextWID */
- X
- X/** A t_WordInfo describes a single word, in terms of
- X ** where it came from
- X ** how to find its database entries
- X ** how to find the in-core database entries (a copy of the above)
- X **/
- X
- X/* There would be a performance benefit if this struct was smaller.
- X * It was foolish of me to use WordInfo for so many different things in
- X * addfile, and now I pay the price.
- X * Addfile may end up calling malloc for 10,000 of these things...
- X *
- X * There should be:
- X * t_WordPlace (exists, pblock.h)
- X * for recording a specific occurrence of a given word in a given file
- X * t_WordInfo (definition follows... look down...)
- X * for recording information about a WID's entry in the database
- X * t_WordPlaceList
- X * for addfile to make a list of word places...
- X * t_pblock (exists, see pblock.h)
- X * for containing the list of WordPlaces found in the database for a
- X * given word, or for putting them there. Uses arrays rather than
- X * lists to squeeze a few extra milliseconds. Some hope :-( :-)
- X *
- X * t_WordPlaceList will almost certainly happen in the next major edit phase...
- X * t_WordInfo will then be somewhat smaller.
- X * All of the entries marked with a leading comment (below) should
- X * be elsewhere (and some of them were, in the Grand Design!).
- X *
- X */
- Xtypedef struct s_WordInfo {
- X char *Word; /* the word text */
- X t_WID WID; /* My Word Identifier */
- X unsigned long NumberOfWordPlaces; /* total */
- X t_FID FID; /* where we got it from */
- X unsigned long Offset; /* word entry position in the data base */
- X struct s_WordInfo *Next; /* for making lists of WordInfo structs */
- X char *DataBlock; /* for writing me out to the index */
- X char *WordPlaceStart; /* presumably points into DataBlock where the places begin -- confirm */
- X t_WordPlace *WordPlaces; /* first few pairs */
- X t_WordPlace WordPlace; /* For addfile -- this is due to go!!!! */
- X /* shorts are at the end to obviate alignment padding... */
- X unsigned long WordPlacesInHere; /* presumably how many entries WordPlaces holds -- confirm */
- X unsigned short Length; /* Word length; reduce the need for strlen */
- X#if 0
- X unsigned char Flags;
- X /* Flags serve two purposes:
- X * the LSB says whether the entry is sorted.
- X * the remainder are a logical AND of all entries in a sorted
- X * block. NOTE: if the block is unsorted, the other bits should
- X * still be up to date.
- X */
- X#endif
- X} t_WordInfo;
- X
- X/*
- X * $Log: wordinfo.h,v $
- X * Revision 1.3 90/10/06 02:21:30 lee
- X * Prepared for first beta release.
- X *
- X * Revision 1.2 90/08/09 19:16:04 lee
- X * after BSD lint and saber-C
- X *
- X * Revision 2.2 89/10/08 20:47:27 lee
- X * Working version of nx-text engine. Addfile and wordinfo work OK.
- X *
- X * Revision 2.1 89/10/02 01:16:15 lee
- X * New index format, with Block/WordInBlock/Flags/BytesSkipped info.
- X *
- X * Revision 1.3 89/09/17 23:04:52 lee
- X * Various fixes; NumberInBlock now a short...
- X *
- X * Revision 1.2 89/09/16 21:15:49 lee
- X * First demonstratable version.
- X *
- X * Revision 1.1 89/09/07 21:06:16 lee
- X * Initial revision
- X *
- X */
- @@@End of lq-text/src/h/wordinfo.h
- echo x - lq-text/src/h/wordrules.h 1>&2
- sed 's/^X//' >lq-text/src/h/wordrules.h <<'@@@End of lq-text/src/h/wordrules.h'
- X/* wordrules.h -- Copyright 1989 Liam R. Quin. All Rights Reserved.
- X * This code is NOT in the public domain.
- X * See the file COPYRIGHT for full details.
- X */
- X
- X/* $Id: wordrules.h,v 1.2 90/10/06 02:18:39 lee Rel1-10 $
- X *
- X */
- X
- X/* Rules for determining what an indexable word looks like;
- X * These are implemented by the various filters, as well as by
- X * the indexing software itself. This means that the filters
- X * don't need to keep track of word lengths, as addfile will do this,
- X * but that they should not emit non-word stuff if they can help it,
- X * turning it into the equivalent amount (in bytes) of white-space
- X * instead.
- X * They should also turn words they don't want indexed into "qxxx",
- X * with the right number of x's (e.g. "bare" --> "qxxx").
- X */
- X
- X/* A "word" is a letter followed by any combination of
- X * letters, digits or '_'. An embedded (not trailing) ' is also allowed
- X * (_ is allowed so that one can index programming languages; strictly
- X * speaking, a lot of languages allow _ at the start too, but I don't
- X * want to get confused by nroff output etc., which contains lines of
- X * underscores)
- X *
- X * This scheme currently excludes numbers...
- X * 31, 31.4 and 31.9e4 will all be ignored. So will 1987.
- X */
- X
- X#define StartsWord(ch) isalpha(ch) /* a word must begin with a letter; isalpha comes from ctype.h */
- X#define WithinWord(ch) (isalnum(ch) || (ch == '_') || (ch == '\'')) /* NOTE(review): ch is unparenthesized and evaluated up to three times; pass a plain variable */
- X#define EndsWord(ch) isalnum(ch) /* so a word cannot end in an underscore or an apostrophe */
- X
- X/* Don't index words unless they are at least MinWordLength characters
- X * long!
- X */
- X#define MinWordLength 3 /* shortest word that gets indexed */
- X#define MaxWordLength 18 /* truncate words to this */
- X/* The Following is for *.WordPlace.BlockInFile. If words are constrained
- X * to be 3 or more characters long, there can be at most
- X * (FileBlockSize / 4) of them in a block (since words must be separated
- X * by at least one character).
- X * Hence, 7 bits, which allows 0..127 giving 128 distinct values,
- X * gives us a block that is 128 * (MinWordLength + 1) bytes long.
- X */
- X#define FileBlockSize (128 * (MinWordLength + 1)) /* 128 * 4 = 512 bytes with MinWordLength 3 */
- X
- X/* WordPlace Flags:
- X * When a plural word is found, or a possessive word, it is reduced to
- X * being singular, and flags are set appropriately.
- X * Also, a flag is set to say if the word started with a Capital Letter.
- X * This puts Window, windows, and Window's all together, but enables them
- X * to be differentiated for searching if required.
- X * These flags are implemented by WordInfo and addfile, not by the various
- X * filters, but the filters must preserve capitalisation of the first letter
- X * in each word, and pass through apostrophes within words (like this's).
- X */
- X
- X#define WPF_WASPLURAL 0001 /* (octal, = 0x01) The word... ended in s */
- X#define WPF_UPPERCASE 0002 /* (= 0x02) ...Started with a capital letter */
- X#define WPF_POSSESSIVE 0004 /* (= 0x04) ...ended in 's */
- X#define WPF_ENDEDINING 0010 /* (= 0x08) ...ended in ing */
- X#define WPF_LASTWASCOMMON 0020 /* (= 0x10) the previous word was common */
- X#define WPF_LASTHADLETTERS 0040 /* (= 0x20) we skipped some letters to get here */
- X#define WPF_HASSTUFFBEFORE 0100 /* (= 0x40) Other than 1 byte of garbage before */
- X#define WPF_LASTINBLOCK 0200 /* (= 0x80) I'm the last word in this block */
- X
- X/* new note (jan 90):
- X * You can't currently have both plural and possessive in the most common case
- X * of the boys' muddy feet (for example), as the trailing ' gets deleted.
- X * this doesn't matter, but perhaps that combination should be reserved for
- X * had-another-standard-ending??? e.g. -ed or -ing, that isn't often followed by
- X * -s or -'s...
- X *
- X * Also, ENDEDINING (ended in "ing") is currently unused entirely.
- X * Perhaps if it is set, the plural and possessive bits should index which of
- X * four endings was found, although this would preclude special-casing of the
- X * s's combination. Probably better that way.
- X *
- X * I should very much like to have another flag or two, perhaps embedded in
- X * one of the other fields. This might be feasible if there is a pre-scan
- X * when the index is written to determine the most common (modal) flags and
- X * distance (currently I assume 1) and to omit these whenever they are the default.
- X * In this case, the fact that every occurrence of Jesus starts with a capital
- X * letter (and ends in -s, *blush*), can still lead to most of the flags being
- X * omitted.
- X *
- X * The next revision will separate the list of FIDs from the rest of the information,
- X * in which case the embedding of the flags becomes a little trickier. This
- X * belongs in the TODO file now, sorry.
- X *
- X * Liam Quin, January 22nd 1990, at home in Warrington, England (ugh)
- X *
- X */
- X
- X/*
- X * $Log: wordrules.h,v $
- X * Revision 1.2 90/10/06 02:18:39 lee
- X * Prepared for first beta release.
- X *
- X * Revision 1.1 90/08/09 19:16:05 lee
- X * Initial revision
- X *
- X * Revision 2.2 89/10/08 20:47:35 lee
- X * Working version of nx-text engine. Addfile and wordinfo work OK.
- X *
- X * Revision 2.1 89/10/02 01:16:19 lee
- X * New index format, with Block/WordInBlock/Flags/BytesSkipped info.
- X *
- X * Revision 1.2 89/09/16 21:15:52 lee
- X * First demonstratable version.
- X *
- X * Revision 1.1 89/09/07 21:06:17 lee
- X * Initial revision
- X *
- X */
- @@@End of lq-text/src/h/wordrules.h
- echo end of part 02
- --
- Liam R. E. Quin, lee@sq.com, SoftQuad Inc., Toronto, +1 (416) 963-8337
-