home *** CD-ROM | disk | FTP | other *** search
- From: lee@sq.sq.com (Liam R. E. Quin)
- Newsgroups: alt.sources
- Subject: lq-text Full Text Retrieval Database Part 04/13
- Message-ID: <1991Mar4.020316.16307@sq.sq.com>
- Date: 4 Mar 91 02:03:16 GMT
-
- : cut here --- cut here --
- : To unbundle, sh this file
- #! /bin/sh
- : part 04
- echo x - lq-text/src/liblqtext/Phrase.c 1>&2
- sed 's/^X//' >lq-text/src/liblqtext/Phrase.c <<'@@@End of lq-text/src/liblqtext/Phrase.c'
- X/* Phrase.c -- Copyright 1989 Liam R. Quin. All Rights Reserved.
- X * This code is NOT in the public domain.
- X * See the file COPYRIGHT for full details.
- X */
- X
- X/*
- X * Deal with (WID, FID, Offset) triples
- X * Liam Quin, September 1989
- X *
- X * $Id: Phrase.c,v 1.11 91/02/22 18:22:59 lee Rel1-10 $
- X *
- X * $Log: Phrase.c,v $
- X * Revision 1.11 91/02/22 18:22:59 lee
- X * Improved a trace message.
- X *
- X * Revision 1.10 90/10/06 00:11:57 lee
- X * Prepared for first beta release.
- X *
- X * Revision 1.9 90/10/04 17:10:43 lee
- X * Case matching now works on one-word phrases.
- X *
- X * Revision 1.8 90/10/03 20:47:06 lee
- X * changed an int to an unsigned long
- X *
- X * Revision 1.7 90/08/29 21:46:39 lee
- X * Alpha release.
- X *
- X * Revision 1.6 90/03/29 19:59:05 lee
- X * Now passes gcc -Wall
- X *
- X * Revision 1.5 90/03/19 00:02:23 lee
- X * Simplified phrase matching greatly by adding new routine, and also
- X * improved checking of word flags and generation of ModifiedString,
- X * the canonical phrase, in String2Phrase.
- X *
- X * Revision 1.4 90/03/16 22:43:15 lee
- X * After Richard's consummate help...
- X * fixed two severe bugs in matching, and started to simplify
- X * MakeMatches().
- X *
- X * Revision 1.3 90/03/14 21:02:14 lee
- X * Before Richard...
- X *
- X * Revision 1.2 90/03/12 22:39:30 lee
- X * prepared for general release.
- X *
- X * Revision 1.1 89/09/17 23:01:34 lee
- X * Initial revision
- X *
- X *
- X */
- X
- X/** Unix system calls that need to be declared: **/
- Xextern void exit();
- X/** Unix/C Library Functions: **/
- Xextern unsigned int sleep();
- X#ifndef tolower
- X extern int tolower();
- X#endif
- Xextern int strlen();
- Xextern char *strcpy();
- X/** lqtext functions: **/
- Xextern int TooCommon();
- Xextern char *UnFlag();
- X/** **/
- X
- X#include "globals.h" /* defines and declarations for database filenames */
- X
- X#include <stdio.h> /* stderr, also for fileinfo.h */
- X#include <fcntl.h>
- X#include <malloc.h>
- X#include <sys/types.h>
- X#include <ctype.h>
- X
- X#include "fileinfo.h" /* for wordinfo.h */
- X#include "wordinfo.h"
- X#include "pblock.h"
- X#include "phrase.h"
- X#include "wordrules.h"
- X
- X#include "emalloc.h"
- X
- X#ifndef STREQ
- X# define STREQ(boy,girl) ((*(boy) == *(girl)) && (!strcmp((boy),(girl))))
- X#endif
- X
- X#ifndef new
- X# define new(type) ((type *) emalloc(sizeof (type)))
- X#endif
- X
- X#ifndef MAXPHRASELEN
- X# define MAXPHRASELEN 2000
- X#endif
- X
- Xextern int AsciiTrace;
- X
- X/* String2Phrase -- turn a query string into a t_Phrase.
- X *
- X * Walks String one character at a time, using StartsWord() and
- X * WithinWord() to find word boundaries.  For each word found:
- X *   - a word containing no letters after its first character, or one
- X *     shorter than MinWordLength, is copied to ModifiedString as "[word]";
- X *   - a word that TooCommon() rejects is copied as "[*word]";
- X *   - a word with no WID is copied as "[?word]", and one whose WordInfo
- X *     cannot be fetched as "[@word]"; both bump HasUnknownWords;
- X *   - any other word is copied unchanged and appended to Result->Words.
- X *
- X * String is written to temporarily (each word is NUL-terminated in place
- X * and then restored), so it must be writable.
- X *
- X * Returns a newly emalloc'd phrase, or 0 if String is null or contains
- X * no indexed words.
- X */
- Xt_Phrase *
- XString2Phrase(String)
- X    char *String;
- X{
- X    extern t_WordInfo *WID2WordInfo();
- X    extern t_WID Word2WID();
- X    extern char *WordRoot();
- X
- X    t_Phrase *Result;
- X    t_PhraseItem **ThisWord;
- X    register char *p;
- X    register char *q;
- X    char *LastStart = 0;
- X    char *PrevLastEnd = (char *) 0;
- X    int InWord = 0;
- X    int Flags = 0;
- X    int FoundLetters = 0;
- X
- X    if (!String) {
- X        return (t_Phrase *) 0;
- X    }
- X
- X    if (AsciiTrace > 50) {
- X        fprintf(stderr, "String2Phrase(%s)\n", String);
- X    }
- X    Result = (t_Phrase *) emalloc(sizeof(t_Phrase));
- X    Result->Next = (t_Phrase *) 0;
- X    /* (* 3 because in the worst case, "a a a" expands to "[a] [a] [a]") */
- X    p = Result->ModifiedString = emalloc(strlen(String) * 3 + 1);
- X
- X    Result->HasUnknownWords = 0;
- X
- X    *(ThisWord = &Result->Words) = (t_PhraseItem *) 0;
- X
- X    /* March along the supplied phrase, looking for keywords.
- X     * Surround unindexed or short words with [brackets].
- X     */
- X    for (q = String; /*LOTSOFTIMES*/; q++) {
- X
- X        if (AsciiTrace > 50) {
- X            fputc(*q, stderr);
- X        }
- X
- X        if (!InWord && !StartsWord(*q)) {
- X            if (!*q) {
- X                break;
- X            } else {
- X                if (!LastStart) continue;
- X            }
- X        }
- X
- X        if (!InWord) {
- X            LastStart = q;
- X            if (StartsWord(*q)) {
- X                InWord = 1;
- X            }
- X            continue;
- X        } else if (isalpha((unsigned char) *q)) {
- X            /* in a word and found letters, so remember in case we skip
- X             * this word...
- X             */
- X            FoundLetters = 1;
- X        }
- X
- X        /* ASSERT: InWord == 1 */
- X
- X        /* an apostrophe only stays in the word if a word character follows */
- X        if (*q == '\'') {
- X            if (!WithinWord(q[1])) {
- X                InWord = 0;
- X            }
- X        }
- X
- X        if (!*q || !WithinWord(*q)) {
- X            InWord = 0;
- X        }
- X
- X        if (LastStart && !InWord) {
- X            int Length = q - LastStart;
- X            int UsedABracket = 0;
- X
- X            if (p > Result->ModifiedString) *p++ = ' ';
- X
- X            /* we have reached the end of a word, is it long enough? */
- X            if (!FoundLetters) {
- X                *p++ = '[';
- X                UsedABracket = 1;
- X            } else if (Length < MinWordLength) {
- X                *p++ = '[';
- X                UsedABracket = 1;
- X                if (FoundLetters) {
- X                    Flags |= WPF_LASTHADLETTERS;
- X                    FoundLetters = 0;
- X                }
- X            } else {
- X                t_WID WID;
- X                t_WordInfo *W;
- X                char SaveEnd = (*q);
- X                t_WordInfo TryRoot;
- X                register char *p2;
- X                char RootBuffer[MaxWordLength + 1];
- X                char *R = RootBuffer;
- X
- X                /* Terminate the word in place; restored from SaveEnd below */
- X                *q = '\0';
- X
- X                FoundLetters = 0;
- X                TryRoot.WordPlace.Flags = Flags;
- X                if (isupper((unsigned char) *LastStart)) {
- X                    TryRoot.WordPlace.Flags |= WPF_UPPERCASE;
- X                }
- X                Flags = 0;
- X
- X                /* Lower-case copy of the word.  The copy is cut off at
- X                 * MaxWordLength so that an over-long query word cannot
- X                 * run off the end of RootBuffer.
- X                 * NOTE(review): assumes the indexer also keeps at most
- X                 * MaxWordLength characters of a word -- confirm.
- X                 */
- X                for (p2 = LastStart;
- X                     *p2 && R < &RootBuffer[MaxWordLength];
- X                     p2++) {
- X                    *R++ = isupper((unsigned char) *p2) ?
- X                                tolower((unsigned char) *p2) : *p2;
- X                }
- X                *R = '\0';
- X                /* WordRoot() indexes Word by Length, so the two must agree */
- X                TryRoot.Length = (int) (R - RootBuffer);
- X                TryRoot.Word = RootBuffer;
- X
- X                /* Strips plural/possessive endings from TryRoot in place */
- X                (void) WordRoot(&TryRoot);
- X
- X                if (TooCommon(&TryRoot)) {
- X                    *p++ = '[';
- X                    *p++ = '*';
- X                    UsedABracket = 1;
- X                    Flags |= WPF_LASTWASCOMMON;
- X                    if (AsciiTrace > 10) {
- X                        fprintf(stderr, " Common(%s) ", TryRoot.Word);
- X                    }
- X                } else if (!(WID = Word2WID(TryRoot.Word, TryRoot.Length)) ||
- X                           (W = WID2WordInfo(WID)) == (t_WordInfo *) 0) {
- X                    *p++ = '[';
- X                    *p++ = WID ? '@' : '?';
- X                    UsedABracket = 1;
- X                    if (AsciiTrace > 10) {
- X                        fprintf(stderr, " Unknown(%s) ", TryRoot.Word);
- X                    }
- X                    Result->HasUnknownWords++;
- X                } else {
- X                    if ((*ThisWord = new(t_PhraseItem)) == (t_PhraseItem *) 0) {
- X                        fprintf(stderr,
- X                            "Not enough memory for PHRASE \"%s\"", String);
- X                        *q = SaveEnd; /* leave the caller's string intact */
- X                        return (t_Phrase *) 0;
- X                    }
- X                    W->WordPlace.Flags |= TryRoot.WordPlace.Flags;
- X                    if (PrevLastEnd == (char *) 0) {
- X                        W->WordPlace.StuffBefore = 0;
- X                    } else {
- X                        W->WordPlace.StuffBefore = LastStart - PrevLastEnd;
- X                    }
- X                    PrevLastEnd = &q[1];
- X
- X                    if (AsciiTrace) {
- X                        fprintf(stderr, "Word %s --> %s, %lu matches\n",
- X                                LastStart,
- X                                UnFlag(W, W->WordPlace.Flags),
- X                                W->NumberOfWordPlaces);
- X                    }
- X                    /* point to the new space */
- X                    (*ThisWord)->Word = W;
- X                    (*ThisWord)->WordStart = LastStart;
- X                    (*ThisWord)->Next = (t_PhraseItem *) 0;
- X                    (*ThisWord)->SearchIndex = 0L;
- X                    ThisWord = &(*ThisWord)->Next;
- X
- X                    /* The word as typed (not its root) goes into the
- X                     * canonical string.
- X                     */
- X                    (void) strcpy(p, LastStart);
- X                    p += q - LastStart; /* q points one beyond the end */
- X
- X                    /* already copied, so the loop below copies nothing */
- X                    LastStart = q;
- X                }
- X                *q = SaveEnd;
- X            }
- X            /* Copy the text of a bracketed word after its "[", "[*" etc. */
- X            while (LastStart < q) {
- X                *p++ = *LastStart++;
- X            }
- X            if (UsedABracket) {
- X                *p++ = ']';
- X            }
- X            LastStart = 0;
- X        } /* if */
- X        if (!*q) break;
- X    } /* for */
- X    *p = '\0';
- X
- X    if (ThisWord == &Result->Words) {
- X        /* There were no words in the phrase!
- X         * Nothing else refers to Result yet, so release it rather than
- X         * leaking it.
- X         */
- X        efree(Result->ModifiedString);
- X        efree((char *) Result);
- X        return (t_Phrase *) 0;
- X    }
- X
- X    Result->OriginalString = emalloc(q - String + 2);
- X    (void) strcpy(Result->OriginalString, String);
- X
- X    Result->NumberOfMatches = 0;
- X    Result->Matches = (t_MatchList *) 0;
- X    if (AsciiTrace > 1) {
- X        fprintf(stderr, "phrase \"%s\",\n", Result->OriginalString);
- X        fprintf(stderr, "Canonical form \"%s\"\n", Result->ModifiedString);
- X    }
- X    return Result;
- X}
- X
- X#define MaxDistance 20
- X
- X/* AppendAnswer -- add one (FID, Count) description to an answer list.
- X * TailP is the link to fill in.  Returns the address of the new node's
- X * Next link, or 0 if the node could not be allocated.
- X */
- Xstatic t_Answer **
- XAppendAnswer(TailP, FID, Count)
- X    t_Answer **TailP;
- X    t_FID FID;
- X    unsigned long Count;
- X{
- X    char *MakeOneDescription();
- X    char *Description = MakeOneDescription(FID, Count);
- X
- X    if ((*TailP = new(t_Answer)) == (t_Answer *) 0) {
- X        return (t_Answer **) 0;
- X    }
- X    /* Description may be 0 if MakeOneDescription() could not find the file */
- X    (*TailP)->Answer = Description;
- X    (*TailP)->Next = (t_Answer *) 0;
- X    return &(*TailP)->Next;
- X}
- X
- X/* GetFiles -- summarise a phrase's matches as one t_Answer per file.
- X *
- X * Walks Phrase->Matches, counting runs of consecutive matches that share
- X * a FID, and produces one description (see MakeOneDescription) per run.
- X * List entries whose Match is 0 are skipped: MakeMatches() leaves such
- X * an entry behind for every starting place it rejects.
- X *
- X * Returns the head of the answer list, or 0 if Phrase is null or has no
- X * usable matches.
- X */
- Xt_Answer *
- XGetFiles(Phrase)
- X    t_Phrase *Phrase;
- X{
- X    t_Answer *Result = 0;
- X    t_Answer **RP = &Result;
- X    t_MatchList *MP;
- X    t_FID LastFID = 0;
- X    unsigned long ThisFIDNumberOfMatches = 0L;
- X
- X    if (!Phrase || !Phrase->Matches) {
- X        return Result;
- X    }
- X
- X    for (MP = Phrase->Matches; MP; MP = MP->Next) {
- X        t_FID ThisFID;
- X
- X        if (!MP->Match || !MP->Match->Where) {
- X            continue; /* a rejected starting place, not a match */
- X        }
- X        ThisFID = MP->Match->Where->FID;
- X
- X        if (ThisFIDNumberOfMatches && ThisFID != LastFID) {
- X            /* The run for LastFID has ended: report it */
- X            RP = AppendAnswer(RP, LastFID, ThisFIDNumberOfMatches);
- X            if (!RP) {
- X                return Result;
- X            }
- X            ThisFIDNumberOfMatches = 0L;
- X        }
- X
- X        /* Count this match against its own file.  (Counting it only when
- X         * the FID was unchanged lost the first match of every file after
- X         * the first, and dropped a final file with a single match.)
- X         */
- X        LastFID = ThisFID;
- X        ++ThisFIDNumberOfMatches;
- X    }
- X
- X    if (ThisFIDNumberOfMatches) {
- X        (void) AppendAnswer(RP, LastFID, ThisFIDNumberOfMatches);
- X    }
- X
- X    return Result;
- X}
- X
- X/* MakeOneDescription -- build a one-line summary of a file's matches.
- X *
- X * The result is "<count> <date> <name>", where count is at most the
- X * first five characters of NumberOfMatches printed in decimal, date is
- X * the first ten characters of ctime() applied to the file's Date
- X * (e.g. "Tue Oct  3"), and name is the file's Name.
- X *
- X * Returns an emalloc'd string, or 0 if FID is zero or GetFileInfo()
- X * fails.  The t_FileInfo obtained from GetFileInfo() and its Name are
- X * freed here.
- X */
- Xchar *
- XMakeOneDescription(FID, NumberOfMatches)
- X    t_FID FID;
- X    unsigned long NumberOfMatches;
- X{
- X    extern char *ctime();
- X    extern t_FileInfo *GetFileInfo();
- X
- X    char *Date;
- X    char *p;
- X    t_FileInfo *FileInfo;
- X    char NumBuf[24]; /* a 64-bit unsigned long needs 20 digits + NUL */
- X
- X    if (!FID) return (char *) 0;
- X
- X    if (!(FileInfo = GetFileInfo(FID))) return (char *) 0;
- X
- X    Date = ctime(&(FileInfo->Date));
- X    /** Tue Oct 3 00:57:11 BST 1989 **/
- X    if (Date) {
- X        Date[10] = '\0'; /* keep only the day and month */
- X    } else {
- X        Date = ""; /* ctime() can fail; print an empty date instead */
- X    }
- X
- X    (void) sprintf(NumBuf, "%lu", NumberOfMatches);
- X
- X    /* Room for the three fields, the two separating spaces and the NUL.
- X     * (The old "+ 11" was two bytes short of what the ten-character date
- X     * plus separators and terminator need.)
- X     */
- X    p = emalloc((unsigned) (strlen(FileInfo->Name) + strlen(NumBuf) +
- X                            strlen(Date) + 3));
- X    (void) sprintf(p, "%-.5s %s %s", NumBuf, Date, FileInfo->Name);
- X    efree(FileInfo->Name);
- X    efree(FileInfo);
- X    return p;
- X}
- X
- X/* ResetPhraseMatch -- rewind a phrase ready for a fresh search.
- X * Sets every word's SearchIndex back to zero and clears NumberOfMatches.
- X * Does nothing at all if Phrase is null or has no words.
- X */
- Xvoid
- XResetPhraseMatch(Phrase)
- X    t_Phrase *Phrase;
- X{
- X    t_PhraseItem *Item;
- X
- X    if (Phrase && Phrase->Words) {
- X        Item = Phrase->Words;
- X        do {
- X            Item->SearchIndex = 0;
- X            Item = Item->Next;
- X        } while (Item);
- X
- X        Phrase->NumberOfMatches = 0;
- X    }
- X}
- X
- X/* Default is to check case, etc. only if given in the input phrase.
- X * This is an enum from phrase.h, and only used in MakeMatches().
- X */
- X
- Xextern t_PhraseCaseMatch PhraseMatchLevel;
- X
- Xstatic int ContinuesMatch();
- X
- X/* LoadWordPlaces -- make sure all of a word's WordPlaces are in memory.
- X * If fewer places are held (WordPlacesInHere) than exist
- X * (NumberOfWordPlaces), the array is freed and re-read in full with
- X * GetWordPlaces().  Any pointers into the old array become invalid.
- X */
- Xstatic void
- XLoadWordPlaces(W)
- X    t_WordInfo *W;
- X{
- X    extern t_WordPlace *GetWordPlaces();
- X
- X    if (W->WordPlacesInHere >= W->NumberOfWordPlaces) {
- X        return; /* already complete */
- X    }
- X
- X    if (W->WordPlaces) {
- X        (void) efree(W->WordPlaces);
- X    }
- X
- X    W->WordPlaces = GetWordPlaces(
- X        W->WID,
- X        W->WordPlaceStart,
- X        (unsigned) WIDBLOCKSIZE - (W->WordPlaceStart - W->DataBlock),
- X        W->Offset,
- X        W->NumberOfWordPlaces
- X    );
- X    W->WordPlacesInHere = W->NumberOfWordPlaces;
- X}
- X
- X/* MakeMatches -- find every place where the whole phrase occurs.
- X *
- X * For each occurrence of the first word (PIFB indexes them) we try to
- X * chain on an occurrence of each following word, using CheckFlags() for
- X * the first word and ContinuesMatch() for the rest.  Each word keeps a
- X * SearchIndex recording the last WordPlace examined, so no word's places
- X * are scanned from the start more than once.
- X *
- X * One t_MatchList entry is appended to Phrase->Matches for every
- X * starting place tried; entries for rejected starting places are left
- X * with Match == 0, so callers must skip those.
- X *
- X * Returns the number of complete matches, which is also stored in
- X * Phrase->NumberOfMatches.  Returns 0 if Phrase is null or has no words,
- X * or if it has unknown words and PhraseMatchLevel is not PCM_AnyCase.
- X */
- Xlong
- XMakeMatches(Phrase)
- X    t_Phrase *Phrase;
- X{
- X    int CheckFlags();
- X
- X    unsigned long PIFB; /* PlaceInFirstBlock */
- X    t_MatchList **MLPP;
- X    t_Match **MPP;
- X    t_Match **OldMatch;
- X    t_WordPlace *pp;
- X    t_PhraseItem *Word;
- X    t_PhraseItem *LeastWord;
- X    t_WordInfo *FirstInfo;
- X    t_WordInfo *LeastInfo;
- X    long Result = 0L;
- X    int HowGood;
- X
- X    if (!Phrase || !Phrase->Words) {
- X        return 0L;
- X    }
- X    MLPP = &(Phrase->Matches);
- X
- X    ResetPhraseMatch(Phrase);
- X
- X    if (AsciiTrace > 1) {
- X        fprintf(stderr, "Match(%s)\n", Phrase->ModifiedString);
- X    }
- X
- X    /* A phrase with garbage words can't match anything */
- X    if (Phrase->HasUnknownWords && PhraseMatchLevel != PCM_AnyCase) {
- X        return 0L;
- X    }
- X
- X    /* Ensure that the matches for the first word have been read */
- X    FirstInfo = Phrase->Words->Word;
- X    LoadWordPlaces(FirstInfo);
- X
- X    /* Find the word in the phrase with least matches: */
- X    LeastWord = Phrase->Words;
- X    for (Word = Phrase->Words; Word; Word = Word->Next) {
- X        if (Word->Word->NumberOfWordPlaces <
- X                            LeastWord->Word->NumberOfWordPlaces) {
- X            LeastWord = Word;
- X        }
- X    }
- X    LeastInfo = LeastWord->Word;
- X
- X    /* The skip-ahead loops below index LeastWord's places directly, so
- X     * they must all be in memory before the main loop starts (the
- X     * per-word load further down would be too late).
- X     */
- X    LoadWordPlaces(LeastInfo);
- X
- X    /* Every word must occur for the phrase to match, so a word with no
- X     * places at all means there is nothing to find.
- X     */
- X    if (LeastInfo->NumberOfWordPlaces == 0) {
- X        goto Done;
- X    }
- X
- X    /* For each match in the first word in the phrase: */
- X    for (PIFB = 0; PIFB < FirstInfo->NumberOfWordPlaces; PIFB++) {
- X        t_WordPlace *LastFOP = (t_WordPlace *) 0;
- X
- X        /* The idea is that the next two loops are likely to reduce
- X         * considerably the number of places we have to consider in the
- X         * case that the first word in the phrase has a lot of matches
- X         * and there is a subsequent word with relatively few matches.
- X         * Experiments suggest that this is fairly common.
- X         *
- X         * This is still a nearly (i.e. slightly-better-than) linear
- X         * algorithm w.r.t the total number of matches in all of the
- X         * words added up.  Note that LeastWord->SearchIndex is altered
- X         * here, so when WordPlaces from that word are considered, we
- X         * don't have to look at any twice.
- X         *
- X         * In order to do better, one would have to be able to avoid
- X         * looking at some or (better!) most of the WordPlaces, for
- X         * example by not fetching so many from disk, or by storing the
- X         * list of FIDs separately from the (Block, Word, Flags,
- X         * StuffBefore) details.  That would be a major, major hack...
- X         */
- X
- X        /* Advance LeastWord to the first place not before this file;
- X         * if it has run out of places there can be no more matches.
- X         */
- X        while (LeastWord->SearchIndex < LeastInfo->NumberOfWordPlaces &&
- X               LeastInfo->WordPlaces[LeastWord->SearchIndex].FID <
- X                                    FirstInfo->WordPlaces[PIFB].FID) {
- X            ++(LeastWord->SearchIndex);
- X        }
- X        if (LeastWord->SearchIndex >= LeastInfo->NumberOfWordPlaces) {
- X            goto Done;
- X        }
- X
- X        /* Advance the first word to the file LeastWord has reached */
- X        while (FirstInfo->WordPlaces[PIFB].FID <
- X               LeastInfo->WordPlaces[LeastWord->SearchIndex].FID) {
- X            if (++PIFB >= FirstInfo->NumberOfWordPlaces) {
- X                goto Done;
- X            }
- X        }
- X        /* end of attempted speed improvement */
- X
- X        /* Optimistically allocate a new match list entry.  If the
- X         * phrase turns out not to start here, the entry stays in the
- X         * list with Match == 0.
- X         */
- X        *MLPP = (t_MatchList *) emalloc(sizeof(t_MatchList));
- X        (*MLPP)->Match = (t_Match *) 0;
- X        OldMatch = MPP = &((*MLPP)->Match);
- X        MLPP = &(*MLPP)->Next;
- X        *MLPP = (t_MatchList *) 0;
- X
- X        Phrase->Words->SearchIndex = PIFB;
- X        pp = &FirstInfo->WordPlaces[PIFB];
- X
- X        /* When we have a partially completed match,
- X         * FOP (declared below) will point to the WordPlace currently
- X         * being considered to see if it extends the partial match;
- X         * LastFOP points to the previous WordPlace in the match.
- X         */
- X
- X        /* For each word in the phrase: */
- X        for (Word = Phrase->Words; Word; Word = Word->Next) {
- X            int GotOne = 0;
- X
- X            /* Ensure that the matches for this word have been read */
- X            LoadWordPlaces(Word->Word);
- X
- X            /* For each occurrence of that word.
- X             * (A binary search for the next candidate was tried here
- X             * and actually seemed to make things run slower.)
- X             */
- X            for (; Word->SearchIndex < Word->Word->NumberOfWordPlaces;
- X                                                ++Word->SearchIndex) {
- X                register t_WordPlace *FOP =
- X                            &Word->Word->WordPlaces[Word->SearchIndex];
- X
- X                if (!LastFOP) {
- X                    LastFOP = FOP;
- X                }
- X
- X                /** So:
- X                 ** | PIFB = each match in the first word in the phrase
- X                 ** | pp = the place of the first word for this attempt
- X                 ** | Word = each word in the phrase
- X                 ** | SearchIndex = each match of that word
- X                 ** | FOP = each occurrence of that word
- X                 **
- X                 ** We compare pp and FOP, hoping that each time round
- X                 ** the (Word) loop we will advance FOP.  Once FOP and
- X                 ** pp relate to the same file and FOP is no earlier
- X                 ** than pp, we check that FOP advances the chain by
- X                 ** comparing it to the previous element (LastFOP).
- X                 **
- X                 ** When we break from this inner loop, we have either
- X                 ** eliminated this PIFB as starting a match chain, or
- X                 ** extended the current chain (setting GotOne).
- X                 **/
- X
- X                if (LastFOP == FOP) {
- X                    /* first word of the chain: only the flags matter */
- X                    HowGood = CheckFlags(Word->Word, FOP);
- X                } else {
- X                    HowGood = ContinuesMatch(Word->Word, pp, LastFOP, FOP);
- X                }
- X
- X                switch (HowGood) {
- X                case 0:
- X                    /* G O T C H A !!!! */
- X                    /* extend the HitList, since it's OK so far. */
- X                    *MPP = (t_Match *) emalloc(sizeof(t_Match));
- X                    (*MPP)->WID = Word->Word->WID;
- X                    (*MPP)->Where = FOP;
- X                    (*MPP)->Next = (t_Match *) 0;
- X                    MPP = &(*MPP)->Next;
- X                    GotOne++;
- X                    break;
- X                case 1: /* gone too far */
- X                    if (AsciiTrace > 10) {
- X                        t_WordInfo WW;
- X
- X                        WW = *(Word->Word);
- X
- X                        if (LastFOP == FOP) {
- X                            /* UnFlag() returns a pointer to a static
- X                             * buffer, so two printf() calls are needed.
- X                             */
- X                            fprintf(stderr, "Reject(%s (%d) != ",
- X                                    UnFlag(&WW, WW.WordPlace.Flags),
- X                                    WW.WordPlace.Flags);
- X                            fprintf(stderr, "%s (%d)) [flags]\n",
- X                                    UnFlag(&WW, FOP->Flags), FOP->Flags);
- X                        } else {
- X                            fprintf(stderr, "Reject(%s) -- too far\n",
- X                                    UnFlag(&WW, WW.WordPlace.Flags));
- X                        }
- X                    }
- X                    break;
- X                case -1:
- X                    continue; /* not there yet */
- X                default:
- X                    fprintf(stderr, "\n\rInternal Error %s: %d\n", __FILE__,
- X                            __LINE__ - 1);
- X                    (void) sleep(4); /* for curses stuff... */
- X                    exit(1);
- X                }
- X
- X                /* Remember where we got up to... so that we can extend
- X                 * the list when we start looking at the next word.
- X                 */
- X                LastFOP = FOP;
- X
- X                if (AsciiTrace >= 4) {
- X                    t_WordInfo WW;
- X
- X                    WW = *(Word->Word);
- X                    /* UnFlag() returns a pointer to a static buffer */
- X                    fprintf(stderr, "Partial match %s",
- X                            UnFlag(&WW, Word->Word->WordPlace.Flags));
- X                    fprintf(stderr, "(Word (%s,%lu,%u) in file %lu)\n",
- X                            UnFlag(&WW, FOP->Flags),
- X                            FOP->BlockInFile, FOP->WordInBlock,
- X                            FOP->FID
- X                    );
- X                }
- X                /* Either the chain was extended (GotOne set) or this
- X                 * place was rejected; in both cases stop looking at
- X                 * occurrences of this word.  SearchIndex is deliberately
- X                 * left where it is.
- X                 */
- X                break;
- X            } /* For each occurrence of that word: */
- X
- X            if (!GotOne) {
- X                t_Match *MP;
- X                /* This word isn't here, so neither is the phrase found
- X                 * in this file starting here.  Throw away the partial
- X                 * chain, leaving an empty list entry behind.
- X                 */
- X                for (MP = (*OldMatch); MP != (t_Match *) 0; /*void*/) {
- X                    t_Match *Next = MP->Next;
- X
- X                    efree((char *) MP);
- X                    MP = Next;
- X                }
- X
- X                *OldMatch = (t_Match *) 0;
- X                break;
- X            } else {
- X                /* If we've reached the end of the phrase, i.e. if
- X                 * Word->Next is zero, we have successfully added a new
- X                 * phrase!
- X                 */
- X                if (Word->Next == (t_PhraseItem *) 0) {
- X                    if (AsciiTrace > 10) {
- X                        fprintf(stderr, "Result now %ld\n", Result + 1);
- X                    }
- X                    Result++;
- X                }
- X            }
- X
- X        } /* end for (each word in the phrase) */
- X    } /* end (for each FID/Offset pair in the first word */
- X
- XDone:
- X    return Phrase->NumberOfMatches = Result;
- X}
- X
- X
- X/* ContinuesMatch -- does New follow Prev in a match that began at First?
- X *
- X * Return value (usable in a switch):
- X *   -1  New comes too early to be the next word of the match
- X *    0  New is the next word of the match
- X *   +1  New is already past where the next word would have to be
- X *
- X * Once New is known to be in the right place the answer comes from
- X * CheckFlags(QueryWord, New).
- X */
- Xstatic int
- XContinuesMatch(QueryWord, First, Prev, New)
- X    t_WordInfo *QueryWord;
- X    t_WordPlace *First;
- X    t_WordPlace *Prev;
- X    t_WordPlace *New;
- X{
- X    /* Are we looking at the right file at all? */
- X    if (New->FID < First->FID) {
- X        return -1; /* not far enough */
- X    }
- X    if (New->FID > First->FID) {
- X        return 1; /* too far */
- X    }
- X    if (Prev == New) {
- X        return 0;
- X    }
- X
- X    /* Same file.  A later word of the phrase cannot lie in a block
- X     * before the one where the phrase starts, nor before the block of
- X     * the word just matched.
- X     */
- X    if (New->BlockInFile < First->BlockInFile) {
- X        return -1;
- X    }
- X    if (New->BlockInFile < Prev->BlockInFile) {
- X        return -1;
- X    }
- X
- X    /* More than one block apart is never treated as a phrase */
- X    if (New->BlockInFile > Prev->BlockInFile + 1) {
- X        return 1;
- X    }
- X
- X    if (New->BlockInFile != Prev->BlockInFile) {
- X        /* Adjacent blocks: New must be the first word of its block and
- X         * Prev must be flagged as the last word of its own, otherwise
- X         * some other word lies between them.
- X         */
- X        if (New->WordInBlock > 0 || !(Prev->Flags & WPF_LASTINBLOCK)) {
- X            return 1;
- X        }
- X    } else {
- X        /* Same block.  PCM_AnyCase tolerates a few words of slop;
- X         * every other level wants the very next word.
- X         */
- X        int MaxGap;
- X
- X        if (New->WordInBlock <= Prev->WordInBlock) {
- X            return -1; /* too early in the block */
- X        }
- X
- X        if (PhraseMatchLevel == PCM_AnyCase) {
- X            MaxGap = 4;
- X        } else {
- X            MaxGap = 1;
- X        }
- X
- X        if (New->WordInBlock > Prev->WordInBlock + MaxGap) {
- X            return 1; /* gone too far */
- X        }
- X    }
- X
- X    /* The position is acceptable.  Whether the distances and skipped
- X     * common words are plausible is not checked here (NOTDONE); the
- X     * case/plural/possessive checks are left until last because they
- X     * are expected to be the more expensive ones.  Since the word is in
- X     * the right place, a failure here means there is no point looking
- X     * at the next word in this block.
- X     */
- X    return CheckFlags(QueryWord, New);
- X}
- X
- X/* CheckFlags -- do the flags of an indexed occurrence (New) satisfy the
- X * query word, at the strictness selected by PhraseMatchLevel?
- X *
- X * Returns 0 if the occurrence is acceptable, 1 if this match should be
- X * given up.
- X *
- X *   PCM_AnyCase   everything is accepted.
- X *   PCM_SameCase  capitalisation, possessive and skipped-common-word
- X *                 flags must agree exactly; StuffBefore must be within
- X *                 (-2, +4) of the query's; plural and last-had-letters
- X *                 must be present if the query had them.
- X *   PCM_HalfCase  an attribute is only required of the occurrence if
- X *                 the user typed it in the query.
- X *
- X * An unrecognised level is reported on stderr and then accepted.
- X */
- Xint
- XCheckFlags(QueryWord, New)
- X    t_WordInfo *QueryWord;
- X    t_WordPlace *New;
- X{
- X    switch (PhraseMatchLevel) {
- X
- X    default: /* defensive! */
- X        fprintf(stderr, "\n\rinternal error %d %s\n", __LINE__, __FILE__);
- X        (void) sleep(4);
- X        break;
- X    case PCM_AnyCase:
- X        break; /* clearly OK */
- X    case PCM_SameCase:
- X        if ((QueryWord->WordPlace.Flags & (WPF_UPPERCASE|WPF_POSSESSIVE)) !=
- X                    (New->Flags & (WPF_UPPERCASE|WPF_POSSESSIVE))) {
- X            /* The cases are different, no good */
- X            return 1; /* give up on this match */
- X        }
- X        if (QueryWord->WordPlace.StuffBefore > 0) {
- X            int Difference;
- X
- X            Difference = QueryWord->WordPlace.StuffBefore - New->StuffBefore;
- X            if (Difference < -2 || Difference > 4) {
- X                return 1; /* give up on this match */
- X            }
- X        }
- X        /* Now, what about skipped common words? */
- X        if ((New->Flags & WPF_LASTWASCOMMON) !=
- X                    (QueryWord->WordPlace.Flags & WPF_LASTWASCOMMON)) {
- X            return 1; /* give up on this match */
- X        }
- X
- X        /* plurals: this should be separate */
- X        if ((QueryWord->WordPlace.Flags & WPF_WASPLURAL) &&
- X                    !(New->Flags & WPF_WASPLURAL)) {
- X            return 1; /* give up on this match */
- X        }
- X
- X        /* Only do this test if we are being awfully strict.
- X         * Remember also that the first word in the phrase will
- X         * not usually have this set.
- X         */
- X        if ((QueryWord->WordPlace.Flags & WPF_LASTHADLETTERS) &&
- X                    !(New->Flags & WPF_LASTHADLETTERS)) {
- X            return 1; /* give up on this match */
- X        }
- X        break;
- X    case PCM_HalfCase:
- X        /* In this case, we are lax about things, but if the
- X         * user typed plural/possessive/capital, we only
- X         * match one with the same attribute.
- X         */
- X        if ((QueryWord->WordPlace.Flags & WPF_UPPERCASE) &&
- X                    !(New->Flags & WPF_UPPERCASE)) {
- X            if (AsciiTrace > 4) {
- X                fprintf(stderr, "Reject [uppercase]\n");
- X            }
- X            return 1; /* give up on this match */
- X        }
- X
- X        /* plurals: this should be separate.
- X         * The test runs from the query to the occurrence, like every
- X         * other test in this case: a plural query needs a plural
- X         * occurrence, while a singular query accepts either.  (It used
- X         * to be the other way round, so that a singular query refused
- X         * to match plurals.)
- X         */
- X        if ((QueryWord->WordPlace.Flags & WPF_WASPLURAL) &&
- X                    !(New->Flags & WPF_WASPLURAL)) {
- X            if (AsciiTrace > 4) {
- X                fprintf(stderr, "Reject [plural]\n");
- X            }
- X            return 1; /* give up on this match */
- X        }
- X
- X        /* Now, what about skipped common words? */
- X        if ((QueryWord->WordPlace.Flags & WPF_LASTWASCOMMON) &&
- X                    !(New->Flags & WPF_LASTWASCOMMON)) {
- X            if (AsciiTrace > 4) {
- X                fprintf(stderr, "Reject [last was common]\n");
- X            }
- X            return 1; /* give up on this match */
- X        }
- X        /* Stuff before, if given, must be present: */
- X        if (QueryWord->WordPlace.StuffBefore > 1) {
- X            if (New->StuffBefore < QueryWord->WordPlace.StuffBefore - 1) {
- X                if (AsciiTrace > 4) {
- X                    /* occurrence first, then the query value after "Q" */
- X                    fprintf(stderr, "Reject [Stuff Before %d != Q%d]\n",
- X                            New->StuffBefore,
- X                            QueryWord->WordPlace.StuffBefore);
- X                }
- X                return 1;
- X            } /* don't care if there is too much there, though */
- X        }
- X        if ((QueryWord->WordPlace.Flags & WPF_POSSESSIVE) &&
- X                    !(New->Flags & WPF_POSSESSIVE)) {
- X            if (AsciiTrace > 4) {
- X                fprintf(stderr, "Reject [user flag]\n");
- X            }
- X            return 1; /* give up on this match */
- X        }
- X        break;
- X    }
- X
- X    /* If we got here, nothing objected. */
- X    return 0; /* It's all OK! */
- X}
- @@@End of lq-text/src/liblqtext/Phrase.c
- echo x - lq-text/src/liblqtext/Root.c 1>&2
- sed 's/^X//' >lq-text/src/liblqtext/Root.c <<'@@@End of lq-text/src/liblqtext/Root.c'
- X/* Root.c -- Copyright 1989 Liam R. Quin. All Rights Reserved.
- X * This code is NOT in the public domain.
- X * See the file COPYRIGHT for full details.
- X */
- X
- X/*
- X * $Id: Root.c,v 2.7 91/03/03 00:13:36 lee Rel1-10 $
- X *
- X * $Log: Root.c,v $
- X * Revision 2.7 91/03/03 00:13:36 lee
- X * cosmetic changes.
- X *
- X * Revision 2.6 90/10/06 00:11:59 lee
- X * Prepared for first beta release.
- X *
- X * Revision 2.5 90/08/29 21:46:42 lee
- X * Alpha release.
- X *
- X * Revision 2.4 90/08/09 19:16:29 lee
- X * BSD lint and fixes...
- X *
- X * Revision 2.3 90/03/29 23:00:04 lee
- X * Now passes gcc -Wall
- X *
- X * Revision 2.2 89/10/08 20:44:56 lee
- X * Working version of nx-text engine. Addfile and wordinfo work OK.
- X *
- X * Revision 2.1 89/10/02 01:13:07 lee
- X * New index format, with Block/WordInBlock/Flags/BytesSkipped info.
- X *
- X *
- X */
- X
- X#include "globals.h" /* defines and declarations for database filenames */
- X
- X#include <sys/types.h>
- X#include <fcntl.h> /* for my header files, sorry */
- X#include <stdio.h>
- X#include <malloc.h>
- X#include <ctype.h>
- X
- X#include "fileinfo.h"
- X#include "wordinfo.h"
- X#include "wordrules.h"
- X#include "emalloc.h"
- X
- X/** Unix system calls that need to be declared: **/
- X /* (none) */
- X/** C Library functions that need to be declared: **/
- Xextern void perror();
- Xextern int strcmp();
- Xextern int strlen();
- Xextern char *strcpy();
- Xextern char *strcat();
- X#ifndef tolower
- X extern int toupper();
- X#endif
- X
- X/** lqtext functions that need to be declared: **/
- X/** Functions from this file that need to be declared: **/
- Xvoid InsertCommonWord();
- X/** **/
- X
- X/** Useful macros **/
- X#define new(type) ((type *) emalloc(sizeof(type)))
- X /* so you can say
- X * struct foo *x = new(struct foo)
- X */
- X
- X#define STRCMP(s1,s2) ((*(s1) == *(s2)) ? strcmp(s1, s2) : *(s1) - *(s2))
- X /* faster then strcmp in the (common) case where the
- X * strings differ at the first character.
- X * From an idea by Henry Spencer (utzoo!henry)
- X */
- X
- X/** **/
- X
- Xextern int AsciiTrace;
- X
- X/* WordRoot -- strip possessive and plural endings from a word, in place.
- X *
- X * This routine is only sensible for English (although it could be
- X * modified...), but that does not matter.
- X *
- X * WordInfo->Word is edited and WordInfo->Length updated to match;
- X * WPF_POSSESSIVE and WPF_WASPLURAL are set in WordInfo->WordPlace.Flags
- X * to record what was removed.  Returns WordInfo->Word, or a fixed
- X * string if WordInfo or its Word is null.
- X *
- X * The purpose is not to reduce a word to an etymological root: no
- X * attempt is made to tell plurals from other words that happen to end
- X * in `s'.  The object is to be fairly fast and to keep the
- X * transformation reversible, so that dog and dogs can share an index
- X * entry and asking for `Window' also finds `Windows'.
- X *
- X * Rules applied to a word of more than two characters ending in `s':
- X *   XXXies                  --> XXXy   (but "ies" alone just loses the s)
- X *   XXXses/hes/xes/oes      --> XXXs/h/x/o
- X *   XXXes (otherwise)       --> XXXe
- X *   XXXVys (V a vowel)      --> XXXVy
- X *   XXXCys, XXXss, XXXus, XXXis   not plurals, left alone
- X *   XXas, XXos              --> XXa, XXo  (only if 4 or more characters)
- X *   XXXs (otherwise)        --> XXX
- X * Irregular forms (foot/feet, penny/pence, f --> ves) are not handled;
- X * see Thomson & Martinet, section 8ff.
- X */
- Xchar *
- XWordRoot(WordInfo)
- X    t_WordInfo *WordInfo;
- X{
- X    char *Word;
- X
- X    if (!WordInfo) return "@#!!";
- X
- X    Word = WordInfo->Word;
- X
- X    if (!Word) {
- X        return "oh dear";
- X    }
- X
- X    if (!*Word) {
- X        return Word;
- X    }
- X
- X    /** delete trailing <'s> and mark possessive */
- X    while (WordInfo->Length >= 3 && Word[WordInfo->Length - 1] == 's' &&
- X                                    Word[WordInfo->Length - 2] == '\'') {
- X        WordInfo->Length -= 2;
- X        Word[WordInfo->Length] = '\0';
- X        WordInfo->WordPlace.Flags |= WPF_POSSESSIVE;
- X    }
- X
- X    /** delete trailing plural suffix and mark plural */
- X    if (WordInfo->Length > 2 && Word[WordInfo->Length - 1] == 's') {
- X        int IsPlural = 1; /* until one of the cases below says otherwise */
- X
- X        /* Length is at least 3 here, so Length - 3 is a valid index */
- X        switch (Word[WordInfo->Length - 2]) {
- X        case 'e':
- X            switch (Word[WordInfo->Length - 3]) {
- X            case 'i': /* xxcies --> xxcy */
- X                if (WordInfo->Length > 3) {
- X                    Word[WordInfo->Length - 3] = 'y';
- X                    WordInfo->Length -= 2;
- X                } else { /* ies not a plural, but lies is :-) */
- X                    WordInfo->Length--; /* just the s */
- X                }
- X                break;
- X            case 's':
- X            case 'h':
- X            case 'x':
- X            case 'o': /* xxxoes --> xxxo */
- X                WordInfo->Length -= 2;
- X                break;
- X            default: /* xxxes -> xxxe */
- X                WordInfo->Length -= 1;
- X                break;
- X            }
- X            break;
- X        case 'y': /* xxxvys --> xxxvy */
- X            /* Look at the letter BEFORE the y.  (Testing Length - 2
- X             * here only ever saw the y itself, so the vowel cases could
- X             * never be reached.)  Note that this changes the stored form
- X             * of such words, so an index built before the change may
- X             * need rebuilding.
- X             */
- X            switch (Word[WordInfo->Length - 3]) { /* e.g. holidays */
- X            case 'a': /* flays */
- X            case 'e': /* beys */
- X            case 'i': /* ??iys?? */
- X            case 'o': /* boys */
- X            case 'u': /* guys */
- X                WordInfo->Length--; /* just remove the s */
- X                break;
- X            default: /* not a plural, e.g. Unixsys */
- X                IsPlural = 0;
- X                break;
- X            }
- X            break;
- X        case 's': /* trailing ss doesn't mark a plural! */
- X        case 'u': /* ONE bus, thus, omnibus; gnus and emus lose out */
- X        case 'i': /* not a plural.. this, his, fleur-de-lis */
- X            IsPlural = 0;
- X            break;
- X        case 'a': /* has */
- X        case 'o': /* cos */
- X            if (WordInfo->Length < 4) {
- X                IsPlural = 0;
- X            } else {
- X                WordInfo->Length -= 1; /* just plain s */
- X            }
- X            break;
- X        default: /* just plain s */
- X            WordInfo->Length -= 1;
- X            break;
- X        }
- X
- X        if (IsPlural) {
- X            WordInfo->WordPlace.Flags |= WPF_WASPLURAL; /* WRONG */
- X        } else {
- X            WordInfo->WordPlace.Flags &=
- X                    (unsigned short)~(unsigned short)WPF_WASPLURAL;
- X        }
- X        Word[WordInfo->Length] = '\0';
- X    }
- X    /* Should check for ii --> ius here, but that would increase the length
- X     * of the word and therefore will break lots of things.
- X     */
- X    return WordInfo->Word;
- X}
- X
- X/* UnFlag -- rebuild the printed form of a word from its root and Flags.
- X *
- X * WordInfo->Word is the root; Flags says what to put back:
- X *   WPF_WASPLURAL   append a plural ending (s, es, or y --> ies)
- X *   WPF_POSSESSIVE  append 's
- X *   WPF_UPPERCASE   upper-case the first letter
- X *
- X * Returns a pointer to a static buffer that the next call overwrites, or
- X * a parenthesised diagnostic string if there is no word to work on.
- X * If WordInfo->Length disagrees with the real length of the word, it is
- X * corrected as a side effect.
- X */
- Xchar *
- XUnFlag(WordInfo, Flags)
- X    t_WordInfo *WordInfo;
- X    unsigned int Flags;
- X{
- X    static char Buffer[MaxWordLength + 5]; /* 's + es + \0 */
- X    int Length;
- X
- X    if (!WordInfo) return "(null word info)";
- X    if (!WordInfo->Word) return "(null word)";
- X    if (!WordInfo->Word[0]) return "(empty word)";
- X
- X    /* Copy the root, leaving Length as the index of the trailing \0.
- X     * Stop at MaxWordLength so the suffixes added below always fit.
- X     */
- X    for (Length = 0; Length < MaxWordLength; Length++) {
- X        if ((Buffer[Length] = WordInfo->Word[Length]) == '\0') break;
- X    }
- X    Buffer[Length] = '\0';
- X
- X    if (WordInfo->Word[Length] == '\0' && Length != WordInfo->Length) {
- X        /* Well, maybe the caller can't count */
- X        WordInfo->Length = Length;
- X    }
- X
- X    if (Flags & WPF_WASPLURAL) {
- X        if (Length >= 2) switch (Buffer[Length - 1]) {
- X        case 'y':
- X            if (Length > 2) switch (Buffer[Length - 2]) {
- X            case 'a':
- X            case 'e':
- X            case 'i':
- X            case 'o':
- X            case 'u':
- X                Buffer[Length++] = 's'; /* e.g. days */
- X                break;
- X            default:
- X                strcpy(&Buffer[Length - 1], "ies"); /* ladies */
- X                Length += 2;
- X            }
- X            break;
- X        case 's':
- X            /* A root ending in `s' only gets marked plural when `es'
- X             * was stripped (buses --> bus), so put `es' back.  There
- X             * is no `us --> ii' case: WordRoot never turns ii into us,
- X             * and the old attempt at it produced garbage (bus --> bui).
- X             */
- X        case 'o':
- X        case 'h':
- X        case 'x':
- X            strcat(Buffer, "es");
- X            Length += 2;
- X            break;
- X        default:
- X            Buffer[Length++] = 's';
- X        }
- X        Buffer[Length] = '\0';
- X    }
- X
- X    if (Flags & WPF_POSSESSIVE) {
- X        Buffer[Length++] = '\'';
- X        Buffer[Length++] = 's';
- X        Buffer[Length] = '\0';
- X    }
- X
- X    if (Flags & WPF_UPPERCASE) {
- X        Buffer[0] = toupper(Buffer[0]);
- X    }
- X
- X    return Buffer;
- X}
- X
- X/* One entry in the singly-linked list of common words */
- Xtypedef struct s_WordList {
- X char *Word; /* private copy of the word, made by InsertCommonWord */
- X unsigned short Flags; /* the Flags value given to InsertCommonWord */
- X struct s_WordList *Next; /* next entry, or 0 at the end of the list */
- X} t_WordList;
- X
- X/* Head of the common word list; InsertCommonWord keeps it in STRCMP order */
- Xstatic t_WordList *CommonWords = 0;
- X
- X/* TooCommon -- is WordInfo->Word in the common word list?
- X *
- X * Returns 1 if it is, 0 if not.  The list is kept in ascending STRCMP
- X * order, so the search gives up at the first entry that sorts after
- X * the word being looked for.
- X */
- Xint
- XTooCommon(WordInfo)
- X    t_WordInfo *WordInfo;
- X{
- X    register char *Word = WordInfo->Word;
- X    register t_WordList *Entry;
- X
- X    for (Entry = CommonWords; Entry; Entry = Entry->Next) {
- X        int Order = STRCMP(Entry->Word, Word);
- X
- X        if (Order > 0) break; /* gone past where it would have been */
- X        if (Order == 0) return 1; /* yes, it's common */
- X    }
- X    return 0;
- X}
- X
- X/* Name of the common word file while it is being read.
- X * Should be set before being printed!
- X */
- Xstatic char *FileName = "Internal Error";
- X
- X/* ReadCommonWords -- load the list of words that are too common to index.
- X *
- X * Each line of CommonFile supplies at most one word: text before the
- X * first word-start character is skipped, and a `#' found during that
- X * skip makes the whole line a comment.  The word is reduced with
- X * WordRoot() and handed to InsertCommonWord().
- X *
- X * Returns 0 on success, or -1 (after printing a message) if the file
- X * can't be opened.
- X */
- Xint
- XReadCommonWords(CommonFile)
- X    char *CommonFile;
- X{
- X    extern char *fgets();
- X    extern int AsciiTrace;
- X    extern int errno;
- X
- X    FILE *fp;
- X    char Line[200];
- X    t_WordInfo W;
- X    char *Root;
- X    t_WordList *Entry;
- X
- X    errno = 0;
- X
- X    fp = fopen(CommonFile, "r");
- X    if (fp == (FILE *) 0) {
- X        int e = errno; /* fprintf might change errno */
- X
- X        fprintf(stderr, "Can't open common word list ");
- X        errno = e;
- X        perror(CommonFile);
- X        return -1;
- X    }
- X
- X    FileName = CommonFile;
- X
- X    while (fgets(Line, sizeof(Line), fp) != (char *) 0) {
- X        register char *p = Line;
- X        char *Start;
- X
- X        /* find the start of the word, or a comment, or the end */
- X        while (*p && *p != '#' && !StartsWord(*p)) {
- X            p++;
- X        }
- X
- X        if (*p == '#' || !*p) {
- X            continue;
- X        }
- X
- X        Start = p;
- X
- X        /* find the end; a quote only belongs if more word follows it */
- X        while (*p && WithinWord(*p)) {
- X            if (*p == '\'' && !WithinWord(p[1])) break;
- X            p++;
- X        }
- X
- X        if (p - Start + 1 < MinWordLength) continue;
- X
- X        *p = '\0'; /* delete trailing \n or whatever */
- X        W.WordPlace.Flags = 0;
- X        W.Word = Start;
- X        W.Length = p - Start; /* length excludes the \0 */
- X
- X        /* WordRoot sets the flags, so it must be called first */
- X        Root = WordRoot(&W);
- X        InsertCommonWord(Root, W.WordPlace.Flags);
- X    }
- X    (void) fclose(fp);
- X
- X#if 0
- X    if (!CommonWords) {
- X        fprintf(stderr, "No common words found in file \"%s\"\n", FileName);
- X        exit(1);
- X    }
- X#endif
- X
- X    if (AsciiTrace > 1) {
- X        for (Entry = CommonWords; Entry; Entry = Entry->Next) {
- X            fprintf(stderr, "Ignore: \"%s\"\n", Entry->Word);
- X        }
- X    }
- X    FileName = "Internal Error";
- X    return 0;
- X}
- X
- X/* InsertCommonWord -- add Root to the common word list.
- X *
- X * The list is kept in ascending STRCMP order and holds each word once;
- X * a word that is already there is left alone (its Flags are not
- X * changed).  The list gets its own copy of Root.
- X */
- Xvoid
- XInsertCommonWord(Root, Flags)
- X    char *Root;
- X    unsigned int Flags;
- X{
- X    register t_WordList **Link = &CommonWords;
- X    t_WordList *New;
- X
- X    /* find the link that should point at the new entry */
- X    while (*Link) {
- X        int Order = STRCMP((*Link)->Word, Root);
- X
- X        if (Order == 0) return; /* already there */
- X        if (Order > 0) break; /* insert it before this one! */
- X        Link = &(*Link)->Next;
- X    }
- X
- X    New = (t_WordList *) emalloc(sizeof(t_WordList));
- X    New->Word = emalloc(strlen(Root) + 1);
- X    (void) strcpy(New->Word, Root);
- X    New->Flags = Flags;
- X    New->Next = *Link;
- X    *Link = New;
- X}
- @@@End of lq-text/src/liblqtext/Root.c
- echo x - lq-text/src/liblqtext/WordInfo.c 1>&2
- sed 's/^X//' >lq-text/src/liblqtext/WordInfo.c <<'@@@End of lq-text/src/liblqtext/WordInfo.c'
- X/* WordInfo.c -- Copyright 1989 Liam R. Quin. All Rights Reserved.
- X * This code is NOT in the public domain.
- X * See the file COPYRIGHT for full details.
- X */
- X
- X/* WordInfo.c -- handle the database of words for NX-Text.
- X *
- X * NX-Text keeps a master list of all of the words that have ever been
- X * seen. Currently, this is in dbm format (sdbm or ndbm).
- X * For each word, there's an associated WID (a unique number), an offset
- X * into the master database (see pblock.c), and possibly thesaurus info.
- X *
- X * $Id: WordInfo.c,v 2.11 90/10/13 03:10:07 lee Rel1-10 $
- X *
- X * $Log: WordInfo.c,v $
- X * Revision 2.11 90/10/13 03:10:07 lee
- X * Type error -- efree() needs a char *.
- X *
- X * Revision 2.10 90/10/06 00:12:01 lee
- X * Prepared for first beta release.
- X *
- X * Revision 2.9 90/09/29 23:47:30 lee
- X * Reduced the size of a buffer, and plugged yet another memory leak!
- X *
- X * Revision 2.8 90/09/10 13:38:50 lee
- X * deleted declaration of sleep()
- X *
- X * Revision 2.7 90/08/29 21:46:48 lee
- X * Alpha release.
- X *
- X * Revision 2.6 90/08/12 17:33:38 lee
- X * malloc changes; added SlayWordInfo() and MakeWordInfo().
- X *
- X * Revision 2.5 90/08/09 19:16:35 lee
- X * BSD lint and fixes...
- X *
- X * Revision 2.4 90/03/22 14:23:19 lee
- X * new calls to efree();
- X * Offset now stored as a block number, not a byte offset
- X *
- X * Revision 2.3 90/03/21 14:59:13 lee
- X * Numerous changes. WID2WordInfo() no longer calles GetWordPlaces().
- X *
- X * Revision 2.2 89/10/08 20:45:05 lee
- X * Working version of nx-text engine. Addfile and wordinfo work OK.
- X *
- X * Revision 2.1 89/10/02 01:13:56 lee
- X * New index format, with Block/WordInBlock/Flags/BytesSkipped info.
- X *
- X * Revision 1.4 89/09/17 23:01:53 lee
- X * Various fixes; NumberInBlock now a short...
- X *
- X * Revision 1.3 89/09/16 21:16:07 lee
- X * First demonstratable version.
- X *
- X * Revision 1.2 89/09/11 00:35:03 lee
- X * Some speedups, but WID not working properly...
- X *
- X * Revision 1.1 89/09/07 21:05:51 lee
- X * Initial revision
- X *
- X *
- X */
- X
- X#include "globals.h" /* defines and declarations for database filenames */
- X
- X#include <errno.h>
- X#include <fcntl.h>
- X#include <malloc.h>
- X#include <signal.h>
- X#include <stdio.h>
- X#include <string.h>
- X#include <ctype.h>
- X#include <sys/types.h>
- X#include <sys/stat.h>
- X#include <unistd.h>
- X
- X#include "fileinfo.h"
- X#include "smalldb.h"
- X#include "wordindex.h"
- X#include "wordinfo.h"
- X#include "numbers.h"
- X
- X#include "emalloc.h"
- X
- X#include "wordrules.h" /* max word length */
- X
- X#include "pblock.h"
- X
- X/** declarations: **/
- X/** Unix system calls that need to be declared: **/
- Xextern int open(), close(); /* (these are not the stdio fopen and fclose) */
- Xextern int creat();
- Xextern void exit();
- Xextern long lseek(); /* watch out for this on 16 bit (286, PDP11) systems! */
- Xextern int read(), write();
- Xextern int stat();
- Xextern unsigned alarm(/* unsigned */);
- X
- X/** Unix Library Calls that need to be declared: **/
- X#ifndef tolower /* e.g. on SunOS */
- Xextern int tolower();
- X#endif
- Xextern void perror();
- X/* extern int sleep(); -- this is unsigned on some systems */
- Xextern int lockf();
- X/** lqtext Library calls that need to be declared: **/
- Xextern void Deletepblock();
- X
- X/** Functions within this file that need to be declared: **/
- Xt_WordInfo *MakeWordInfo();
- Xvoid SlayWordInfo();
- X
- X/** **/
- X
- Xextern int AsciiTrace;
- Xextern char *progname;
- X
- X#define new(type) ( ((type) *) emalloc(sizeof(type)) )
- X
- X/* Format when using ndbm: */
- X
- X/* NOTE(review): nothing in the visible part of this file uses this type;
- X * it looks like the older on-disk layout -- confirm before removing.
- X */
- Xtypedef struct {
- X t_WID WID;
- X unsigned long Offset; /* position in the database */
- X unsigned long NumberOfWordPlaces;
- X char Word[1]; /* Cheat here */
- X} t_WordIndexEntry;
- X
- X/* Replacement format, intended to be faster and to use much less disk space.
- X * Still use dbm for Word2WID, but not for the reverse mapping.
- X * Also, cache the most recently-used WordInfo entry...
- X * I have not measured how much of a win this is, but a lot of the code
- X * calls Word2Wid() and then WID2WordInfo().
- X */
- X
- Xstatic int Widfd = (-1); /* descriptor for WidIndexFile; negative until first opened */
- X
- X/* WID2WordInfo -- read the WordInfo block for a given WID.
- X *
- X * Opens WidIndexFile the first time it is needed (exiting if that
- X * fails), seeks to WID * WIDBLOCKSIZE and reads one WIDBLOCKSIZE block.
- X * The block is decoded as: word length, the word itself, a number
- X * giving the offset (see below), and the number of word places; the
- X * rest of the block is handed to GetWordPlaces().
- X *
- X * Returns a new t_WordInfo made by MakeWordInfo(), or 0 if the seek
- X * fails, a whole block can't be read, or the stored length is zero.
- X */
- Xt_WordInfo *
- XWID2WordInfo(WID)
- X t_WID WID;
- X{
- X extern t_WordPlace *GetWordPlaces(); /* pblock.c */
- X
- X char Buffer[WIDBLOCKSIZE + 5]; /* the +5 allows for overrun... */
- X char *q = Buffer;
- X t_WordInfo *WP;
- X
- X /* The above calculation is derived like this:
- X *
- X * The entry contains the total number of pairs (>= 1 byte),
- X * the length of the string (>= 1 byte), and the string (>= 3 bytes)
- X * (actually >= MinWordLength bytes, but setting this less than 3
- X * would be a major disaster!)
- X * Hence, there are WIDBLOCKSIZE - (2 + length) bytes left for pairs.
- X * Now, each pair is at least 2 bytes, so halve the remainder. I add
- X * one coz WIDBLOCKSIZE - 3 is odd, so I would otherwise lose one!
- X *
- X * NOTE(review): the calculation this refers to is no longer in the
- X * function; the comment appears to be left over from an older version.
- X */
- X
- X if (Widfd < 0) {
- X if ((Widfd = open(WidIndexFile, O_RDWR|O_CREAT, 0766)) < 0) {
- X fprintf(stderr, "Can't open WID file \"%s\"\n", WidIndexFile);
- X exit(1);
- X }
- X }
- X
- X if (lseek(Widfd, (long) (WID * WIDBLOCKSIZE), 0) < 0) {
- X return (t_WordInfo *) 0;
- X }
- X
- X /* a short read (e.g. a WID beyond the end of the file) counts as failure */
- X if (read(Widfd, Buffer, WIDBLOCKSIZE) != WIDBLOCKSIZE) {
- X return (t_WordInfo *) 0;
- X }
- X
- X {
- X unsigned short L;
- X
- X if ((L = sReadNumber(&q)) == 0) {
- X (void) fprintf(stderr,
- X "%s: Database corrupt, WID %lu has length zero\n",
- X progname, WID);
- X return (t_WordInfo *) 0;
- X }
- X /* NOTE(review): L is not checked against the space left in Buffer,
- X * so a corrupt block could make MakeWordInfo copy from beyond it.
- X */
- X WP = MakeWordInfo(WID, (int) L, q);
- X q += L;
- X }
- X
- X /* the stored number is a block number shifted left one bit (see MkWIBH) */
- X WP->Offset = (sReadNumber(&q) >> 1) * BLOCKSIZE;
- X WP->NumberOfWordPlaces = sReadNumber(&q);
- X
- X /* Now, maybe read some WordPlace tuplets: */
- X#if 1
- X if (q - Buffer < WIDBLOCKSIZE) {
- X WP->WordPlaces = GetWordPlaces(
- X WP->WID,
- X q,
- X WIDBLOCKSIZE - (q - Buffer),
- X WP->Offset,
- X WP->NumberOfWordPlaces
- X );
- X WP->WordPlacesInHere = WP->NumberOfWordPlaces;
- X } else {
- X fprintf(stderr, "%s: Internal error, block too small for %ld (%s)\n",
- X progname, WP->WID, WP->Word);
- X exit(1);
- X }
- X
- X#else
- X /* disabled alternative: keep a copy of the raw block instead of decoding it */
- X WP->WordPlaces = (t_WordPlace *) 0;
- X if (q - Buffer < WIDBLOCKSIZE) {
- X WP->DataBlock = emalloc(WIDBLOCKSIZE + 5);
- X (void) memcpy(WP->DataBlock, Buffer, WIDBLOCKSIZE);
- X WP->WordPlaceStart = &(WP->DataBlock[q - Buffer]);
- X }
- X#endif
- X
- X /* done! */
- X return WP;
- X}
- X
- Xstatic char PairBuffer[WIDBLOCKSIZE + 5]; /* the +5 allows for overrun... */
- X
- X/* MkWIBH -- Make WordInfo Block Header.
- X *
- X * Writes the header for WordInfo at the start of the static PairBuffer:
- X * the word length, the word (without a \0), the block number of
- X * pblock->ChainStart shifted left one bit (or 0 if pblock is null), and
- X * the number of word places.
- X *
- X * On return WordInfo->DataBlock points at PairBuffer, and
- X * WordInfo->WordPlaceStart at the first byte after the header.  Since
- X * PairBuffer is shared, the next call overwrites this header.
- X *
- X * pblock may be null; the trace message used to dereference it anyway.
- X */
- Xvoid
- XMkWIBH(WordInfo, pblock)
- X    t_WordInfo *WordInfo;
- X    t_pblock *pblock;
- X{
- X    char *q = PairBuffer;
- X
- X#ifdef ASCIITRACE
- X    if (AsciiTrace > 15) {
- X        fprintf(stderr, "\tMake info block header for %s, Offset %lu==%lu\n",
- X            WordInfo->Word,
- X            pblock ? pblock->ChainStart : (unsigned long) 0,
- X            WordInfo->Offset);
- X    }
- X#endif
- X
- X    sWriteNumber(&q, WordInfo->Length);
- X    (void) strncpy(q, WordInfo->Word, WordInfo->Length);
- X    q += WordInfo->Length;
- X    if (pblock) sWriteNumber(&q, (pblock->ChainStart / BLOCKSIZE) << 1);
- X    else sWriteNumber(&q, 0L);
- X    sWriteNumber(&q, WordInfo->NumberOfWordPlaces);
- X
- X    WordInfo->WordPlaceStart = q;
- X    WordInfo->DataBlock = PairBuffer;
- X}
- X
- X/* MkWIB -- Make WordInfo Block.
- X *
- X * Builds the block header with MkWIBH() and then lets PutWordPlaces()
- X * put as many of pblock's word places as it can into what is left of
- X * PairBuffer.
- X *
- X * Returns the value from PutWordPlaces(), which is also stored in
- X * WordInfo->WordPlacesInHere; with a null pblock both are 0.
- X */
- Xint
- XMkWIB(WordInfo, pblock)
- X    t_WordInfo *WordInfo;
- X    t_pblock *pblock;
- X{
- X    extern unsigned short PutWordPlaces();
- X
- X    if (AsciiTrace > 3) {
- X        fprintf(stderr, "Make info block for %s\n", WordInfo->Word);
- X    }
- X
- X    MkWIBH(WordInfo, pblock);
- X
- X    if (pblock != (t_pblock *) 0) {
- X        /* See how many pairs from the given pblock fit, and leave
- X         * them in PairBuffer after the header...
- X         */
- X        WordInfo->WordPlacesInHere = PutWordPlaces(
- X            pblock->WordPlaces,
- X            WordInfo->WID,
- X            WordInfo->WordPlaceStart,
- X            WIDBLOCKSIZE - (WordInfo->WordPlaceStart - PairBuffer),
- X            pblock->ChainStart,
- X            pblock->NumberOfWordPlaces);
- X        return WordInfo->WordPlacesInHere;
- X    }
- X
- X    /* No WordPlaces to put in! */
- X    WordInfo->WordPlacesInHere = 0;
- X    return 0;
- X}
- X
- X/* String2SixBitString -- pack a word into six bits per character.
- X *
- X * String may contain only letters, digits, ' and _.  Upper case letters
- X * are folded to lower case IN PLACE, so the caller's string is changed.
- X *
- X * Returns a pointer to a static buffer (overwritten by the next call),
- X * or 0 if String contains any other character or is too long to fit.
- X *
- X * Each character is stored as
- X *   0-9 --> 0...9
- X *   a-z --> 10...35
- X *   '   --> 36
- X *   _   --> 37
- X * The letters used to start at 9, so that `a' and `9' got the same code.
- X *
- X * BUG: we lose word-processing accents, etc. and 8-bitness if
- X * we do this.  Also, it slows things down very, very slightly.
- X * BUG: the result can contain zero bytes (the code for `0' is 0), so
- X * the terminating 0 byte does not reliably mark the end.
- X */
- Xchar *
- XString2SixBitString(String)
- X    unsigned char *String;
- X{
- X    static unsigned char Buffer[MaxWordLength + 1];
- X    register unsigned char *p;
- X    register unsigned char *Bufp = Buffer;
- X    unsigned short Val;
- X    int BitsLeft = 0; /* unused low-order bits in *Bufp */
- X
- X    /* Some ascii character equivalents:
- X     * '0' 48 060 0x30
- X     * 'A' 65 0101 0x41
- X     * '_' 95 0137 0x5f
- X     * 'a' 97 0141 0x61
- X     */
- X    for (p = String; *p; p++) {
- X        if (!isalnum(*p) && *p != '\'' && *p != '_') {
- X            return (char *) 0;
- X        }
- X        /* One character can touch Bufp[0] and Bufp[1], and the final
- X         * zero byte may go one beyond that; give up if it won't fit.
- X         */
- X        if ((Bufp - Buffer) + 2 > MaxWordLength) {
- X            return (char *) 0;
- X        }
- X        if (isupper(*p)) *p = tolower(*p);
- X        /* I need 6 bits per character.  This also leaves rather
- X         * a lot of bits spare (38..63).  As I fold case, and don't
- X         * have controls, I don't know what to do there.  I could
- X         * store digrams.  There are 38*38 = 1444 of these, but
- X         * some of them don't happen.  Not worth the effort.
- X         */
- X        if (isdigit(*p)) {
- X            Val = (*p) - '0';
- X        } else if (isalpha(*p)) {
- X            Val = (*p) - 'a' + ('9' - '0') + 1;
- X        } else if (*p == '\'') {
- X            Val = ('9' - '0') + ('z' - 'a') + 2;
- X        } else if (*p == '_') {
- X            Val = ('9' - '0') + ('z' - 'a') + 3;
- X        } else {
- X            /* can't happen, because of the check at the top of the loop */
- X#define NEXTISEIGHT (('9' - '0') + ('z' - 'a') + 4)
- X            Val = NEXTISEIGHT;
- X        }
- X        /* Write the first half */
- X        if (!(BitsLeft & 07)) { /* i.e. it's 0 or 8 */
- X            *Bufp = (Val << 2);
- X            BitsLeft = 2;
- X        } else {
- X            /* top BITSLEFT bits */
- X            *Bufp++ |= (Val >> (6 - BitsLeft));
- X            *Bufp = (unsigned) (Val << (2 + BitsLeft)); /* lose some bits */
- X            if ((BitsLeft -= 6) < 0) BitsLeft += 8;
- X        }
- X    }
- X    if (BitsLeft) {
- X        Bufp++; /* step over the partly filled byte */
- X    }
- X    *Bufp = 0;
- X    return (char *) Buffer;
- X}
- X
- X/* SixBitString2String -- the inverse of String2SixBitString.
- X *
- X * NOTDONE FIXME: this has never been written, and always returns the
- X * placeholder string below, whatever it is given.
- X *
- X * The unfinished loop that used to be here never advanced through its
- X * input, so any non-empty argument made it run for ever, writing beyond
- X * the end of its buffer as it went.  It has been taken out until there
- X * is a real decoder, which has to undo the packing done by the encoder:
- X * 6 bits per character, most significant bits first.
- X */
- Xunsigned char *
- XSixBitString2String(SixBitString)
- X    char *SixBitString;
- X{
- X    return (unsigned char *) "notdone'fixme"; /* NOTDONE FIXME */
- X}
- X
- X#ifdef TESTSIX
- Xchar *progname= "testsix";
- X
- X/* Test harness for the six bit routines.
- X *
- X * usage: testsix -e    encode each line of standard input
- X *        testsix -d    decode each line of standard input
- X * The result for each line is printed on standard output; a line that
- X * can't be encoded is printed as "(cannot be encoded)".
- X */
- Xint
- Xmain(argc, argv)
- X    int argc;
- X    char *argv[];
- X{
- X    char Line[4096];
- X    int Encode;
- X
- X    if (argc != 2) {
- X        fprintf(stderr, "bad arg count; usage: %s -[de]\n", progname);
- X        exit(1);
- X    }
- X    if (STREQ(argv[1], "-e")) Encode = 1;
- X    else if (STREQ(argv[1], "-d")) Encode = 0;
- X    else {
- X        fprintf(stderr, "usage: %s -[d|e]\n", progname);
- X        exit(1);
- X    }
- X    while (fgets(Line, sizeof(Line), stdin) != (char *) 0) {
- X        char *Result;
- X        char *Newline = strchr(Line, '\n');
- X
- X        if (Newline) *Newline = '\0'; /* fgets keeps it, we don't want it */
- X
- X        if (Encode) {
- X            Result = String2SixBitString((unsigned char *) Line);
- X        } else {
- X            if (STREQ(Line, "(cannot be encoded)")) {
- X                Result = "(this line was not saved)";
- X            } else {
- X                Result = (char *) SixBitString2String(Line);
- X            }
- X        }
- X        if (Result) {
- X            printf("%s\n", Result);
- X        } else {
- X            printf("(cannot be encoded)\n");
- X        }
- X    }
- X    return 0;
- X}
- X
- X#endif /*TESTSIX*/
- X
- X
- X/* Word2WID -- look up the WID for a word in the dbm word index.
- X *
- X * Word need not be nul-terminated; Length says how many bytes to use,
- X * and anything beyond MaxWordLength is ignored.
- X *
- X * Returns the WID, or 0 if the word is not in the index or the stored
- X * value can't be decoded.  Exits if the index can't be opened.
- X */
- Xt_WID
- XWord2WID(Word, Length)
- X    char *Word;
- X    unsigned int Length;
- X{
- X    DBM *db;
- X    datum key, data;
- X    char *q;
- X    t_WID WID;
- X    char Buffer[8];
- X        /* enough for the binary representation of a number -- see numbers.c;
- X         * this is _not_ sizeof(long).  It's probably 5, in fact, although
- X         * for small numbers it's less.
- X         */
- X
- X    if (Length > MaxWordLength) {
- X        Length = MaxWordLength; /* NOTE: no trailing \0 required. */
- X    }
- X
- X    /* contact database server */
- X    if ((db = startdb(WordIndex)) == (DBM *) 0) {
- X        fprintf(stderr, "dbmopen(%s) failed\n", WordIndex);
- X        exit(2);
- X    }
- X
- X    key.dptr = Word;
- X    key.dsize = Length;
- X
- X    data = dbm_fetch(db, key);
- X
- X    enddb(db);
- X
- X    if (data.dptr == 0 || data.dsize == 0) {
- X        return (t_WID) 0;
- X    }
- X
- X    /* A stored number never needs more room than Buffer has; if this
- X     * one does, the entry is damaged, and copying it would overrun.
- X     */
- X    if ((unsigned int) data.dsize > sizeof(Buffer)) {
- X        fprintf(stderr, "Word2Wid failed... entry is %d bytes long\n",
- X            (int) data.dsize);
- X        return (t_WID) 0;
- X    }
- X
- X    /* do this because ReadNumber will leave q pointing beyond Buffer: */
- X    (void) memcpy(Buffer, data.dptr, data.dsize);
- X    q = Buffer;
- X    WID = sReadNumber(&q);
- X    if (q - Buffer != data.dsize) {
- X        fprintf(stderr, "Word2Wid failed... got %lu\n", (unsigned long) WID);
- X        return (t_WID) 0;
- X    }
- X    return WID;
- X}
- X
- X/* WID2Word -- return the word that has the given WID.
- X *
- X * The result is the string allocated for the WordInfo entry, which is
- X * detached before the rest of the entry is destroyed, so it is up to
- X * the caller to free it.  Returns 0 if WID is 0 or can't be read.
- X */
- Xchar *
- XWID2Word(WID)
- X    t_WID WID;
- X{
- X    t_WordInfo *Info;
- X    char *Result = (char *) 0;
- X
- X    if (WID != (t_WID) 0 &&
- X            (Info = WID2WordInfo(WID)) != (t_WordInfo *) 0) {
- X        Result = Info->Word;
- X        Info->Word = (char *) 0; /* so SlayWordInfo won't free it */
- X        SlayWordInfo(Info);
- X    }
- X    return Result;
- X}
- X
- X/* PutWordInfoIntoIndex -- save a WordInfo entry.
- X *
- X * Two things get written: the Word --> WID mapping goes into the dbm
- X * word index, and WordInfo->DataBlock is written into WidIndexFile at
- X * block number WordInfo->WID.  If there is no DataBlock, one is made
- X * with MkWIB() (and a warning is given if Offset says there should
- X * have been one).
- X *
- X * Returns whatever dbm_store() returned.  Exits if either file can't
- X * be opened, or if the block can't be written.
- X */
- Xint
- XPutWordInfoIntoIndex(WordInfo, Offset)
- X    t_WordInfo *WordInfo;
- X    unsigned long Offset;
- X{
- X    DBM *db;
- X    char NumBuf[sizeof(t_WID) + 1];
- X    char *NumEnd = NumBuf;
- X    datum key, data;
- X    int StoreResult;
- X
- X    /** First, write the WID itself, so we can go from Word to WID */
- X
- X    sWriteNumber(&NumEnd, WordInfo->WID);
- X
- X    key.dptr = WordInfo->Word;
- X    key.dsize = WordInfo->Length;
- X    data.dptr = NumBuf;
- X    data.dsize = NumEnd - NumBuf;
- X
- X    /* contact database server */
- X    db = startdb(WordIndex);
- X    if (db == (DBM *) 0) {
- X        fprintf(stderr, "dbmopen(%s) failed\n", WordIndex);
- X        exit(2);
- X    }
- X    StoreResult = dbm_store(db, key, data, DBM_REPLACE);
- X    enddb(db);
- X
- X    /** Now, ensure that we have a physical block for WordInfo.  If
- X     ** we don't, there is something very wrong in pblock.c, our only
- X     ** possible caller.
- X     **/
- X
- X    if (WordInfo->DataBlock == (char *) 0) {
- X        if (Offset) {
- X            fprintf(stderr, "WARNING: WordInfo corrupt for \"%s\"\n",
- X                WordInfo->Word);
- X        }
- X        (void) MkWIB(WordInfo, (t_pblock *) 0);
- X    }
- X
- X    /** Now write the physical entry... */
- X
- X    if (Widfd < 0) {
- X        Widfd = open(WidIndexFile, O_RDWR|O_CREAT, 0766);
- X        if (Widfd < 0) {
- X            fprintf(stderr, "Can't open WID file \"%s\"\n", WidIndexFile);
- X            exit(1);
- X        }
- X    }
- X
- X    if (lseek(Widfd, (long) (WordInfo->WID * WIDBLOCKSIZE), 0) < 0) {
- X        perror("lseek");
- X        exit(1);
- X    }
- X
- X    if (write(Widfd, WordInfo->DataBlock, WIDBLOCKSIZE) != WIDBLOCKSIZE) {
- X        perror("write");
- X        exit(1);
- X    }
- X
- X    return StoreResult;
- X}
- X
- X/* WID_sig -- do-nothing handler.  The only references to it are the
- X * signal(SIGALRM, WID_sig) calls in the file locking code, which is
- X * currently compiled out with #if 0.
- X */
- Xint
- XWID_sig()
- X{
- X return 0;
- X}
- X
- X/* GetMaxWID -- return the number stored in WidFile.
- X *
- X * WidFile holds a number written as decimal text (see GetNextWID).
- X * Returns that number, or 0 if the file isn't there or can't be
- X * opened.  Exits if the file can be opened but not read.
- X *
- X * No more is read than fits in Buffer, and the text is terminated after
- X * the bytes actually read, so a WidFile that is larger than expected
- X * can no longer overrun the buffer.
- X */
- Xt_WID
- XGetMaxWID()
- X{
- X    extern int errno;
- X    extern long atol();
- X
- X    int fd;
- X    int BytesRead;
- X    char Buffer[20]; /* large enough to sprintf() a WID */
- X    struct stat StatBuf;
- X#if 0
- X    int FileKey; /* what one gets from a lock... */
- X    int NumberOfTriesLeft = 5;
- X#endif
- X
- X    /* ensure that the file is there */
- X    if (stat(WidFile, &StatBuf) == -1) {
- X        return 0;
- X    }
- X
- X    if ((fd = open(WidFile, O_RDWR, 0)) < 0) {
- X        fprintf(stderr, "Warning: Can't open WID file");
- X        return 0;
- X    }
- X
- X#if 0
- X    errno = 0;
- X
- X    /** Lock the file **/
- X
- X    do {
- X        /* Set a timeout of 2 seconds */
- X        signal(SIGALRM, WID_sig);
- X        (void) alarm(3);
- X        if ((FileKey = lockf(fd, F_LOCK, 0L)) < 0) {
- X            switch (errno) {
- X            case EACCES: /*[sic]*/ /* another process has the lock */
- X                fprintf(stderr, "Please wait...\n");
- X                /* shouldn't happen */
- X                break;
- X            case EDEADLK:
- X                fprintf(stderr, "Warning: can't lock \"%s\" -- EDEADLK\n", WidFile);
- X                FileKey = 1;
- X                break;
- X            case EINTR:
- X                fprintf(stderr, "Please Wait... someone has the key...\n");
- X                sleep(1);
- X                break;
- X            }
- X        }
- X        if (--NumberOfTriesLeft <= 0) {
- X            fprintf(stderr, "Warning: can't lock ");
- X            perror(WidFile);
- X            (void) close(fd);
- X            return 0;
- X        }
- X    } while (FileKey < 0);
- X    (void) alarm(0);
- X
- X    if (stat(WidFile, &StatBuf) == -1) {
- X        fprintf(stderr, "It went away!\n");
- X        return 0;
- X    }
- X#endif
- X
- X    /* Read the file, leaving room for the \0 */
- X    BytesRead = read(fd, Buffer, (unsigned int) (sizeof(Buffer) - 1));
- X    if (BytesRead < 0) {
- X        fprintf(stderr, "Can't read from \"%s\"\n", WidFile);
- X        exit(1);
- X    }
- X
- X#if 0
- X    /** Unlock the file **/
- X    if (lockf(fd, F_ULOCK, 0L) < 0 && FileKey == 0) {
- X        fprintf(stderr, "Warning: might not have unlocked \"%s\"\n",
- X            WidFile);
- X    }
- X#endif
- X    (void) close(fd);
- X
- X    Buffer[BytesRead] = '\0';
- X
- X    return atol(Buffer);
- X}
- X
- X
- X/* The in-memory WID counter used by SpoofGetNextWID and GetNextWID;
- X * 0 means it has not been loaded with GetMaxWID() yet.
- X */
- Xt_WID LastNextWIDVal = (t_WID) 0;
- X
- X/* Get rid of any macro called GetNextWID, so that the name can be used
- X * for the function declared and defined below.
- X */
- X#undef GetNextWID
- X
- Xt_WID GetNextWID(), GetMaxWID();
- X
- X/* SpoofGetNextWID -- allocate a new WID without touching WidFile on
- X * every call.
- X *
- X * The counter is kept in LastNextWIDVal, which is loaded with
- X * GetMaxWID() the first time through.  Every call returns a WID one
- X * greater than the last; once in every 500 or so calls the new value
- X * is also written to WidFile with GetNextWID(1), so that the database
- X * does get updated in case of a crash or for other users.
- X *
- X * The counter is now incremented BEFORE the periodic write.  It used
- X * not to be incremented at all on that call, so that the WID given out
- X * by the previous call was given out a second time.
- X */
- XINLINE
- Xt_WID
- XSpoofGetNextWID()
- X{
- X    static int SinceLastUpdate = 0;
- X
- X    if (LastNextWIDVal == (t_WID) 0) {
- X        SinceLastUpdate = 0;
- X        LastNextWIDVal = GetMaxWID();
- X    }
- X
- X    ++LastNextWIDVal; /* this is the WID being allocated */
- X
- X    if (++SinceLastUpdate > 500) {
- X        SinceLastUpdate = 0;
- X        /* GetNextWID(1) writes the larger of our value and the one in
- X         * the file, and returns it; carry on from there.
- X         */
- X        LastNextWIDVal = GetNextWID(1);
- X    }
- X
- X    return LastNextWIDVal;
- X}
- X
- X/* WriteCurrentMaxWID -- call GetNextWID with WriteCurrent set, which
- X * writes the current maximum WID to WidFile without allocating a new one.
- X */
- Xvoid
- XWriteCurrentMaxWID()
- X{
- X (void) GetNextWID(1);
- X}
- X
- X/* GetNextWID -- update the WID counter kept in WidFile.
- X *
- X * WidFile holds the largest WID as decimal text; it is created (empty)
- X * if it isn't there.  If WriteCurrent is zero, both the value read from
- X * the file and LastNextWIDVal are incremented.  Either way, the larger
- X * of the two is then written back to the file and returned.
- X *
- X * Exits if the file can't be created, opened or read.
- X *
- X * No more is read than fits in Buffer, and the text is terminated after
- X * the bytes actually read, so a WidFile that is larger than expected
- X * can no longer overrun the buffer.
- X */
- Xt_WID
- XGetNextWID(WriteCurrent)
- X    int WriteCurrent; /* simply write the current MaxWID if true */
- X{
- X    extern int errno;
- X    extern long atol();
- X
- X    int fd;
- X    int BytesRead;
- X    char Buffer[20];
- X    struct stat StatBuf;
- X#if 0
- X    int FileKey; /* what one gets from a lock... */
- X    int NumberOfTriesLeft = 5;
- X#endif
- X    t_WID Result;
- X
- X    /** Alter the file, so other programs can see the new words...
- X     **/
- X
- X    /* ensure that the file is there */
- X    if (stat(WidFile, &StatBuf) == -1) {
- X        fprintf(stderr, "Creating WID file \"%s\"\n", WidFile);
- X        if ((fd = creat(WidFile, 02666)) < 0) {
- X            fprintf(stderr, "Can't create WID file \"%s\"\n", WidFile);
- X            exit(1);
- X        }
- X        (void) close(fd);
- X        return GetNextWID(WriteCurrent); /* try again now it's there */
- X
- X        /*NOTREACHED*/
- X    }
- X
- X    if ((fd = open(WidFile, O_RDWR, 0)) < 0) {
- X        fprintf(stderr, "Can't open WID file ");
- X        perror(WidFile);
- X        exit(1);
- X    }
- X
- X#if 0
- X    errno = 0;
- X
- X    /** Lock the file **/
- X
- X    do {
- X        /* Set a timeout of 2 seconds */
- X        signal(SIGALRM, WID_sig);
- X        (void) alarm(3);
- X        if ((FileKey = lockf(fd, F_LOCK, 0L)) < 0) {
- X            switch (errno) {
- X            case EACCES: /*[sic]*/ /* another process has the lock */
- X                fprintf(stderr, "Please wait...\n");
- X                /* shouldn't happen */
- X                break;
- X            case EDEADLK:
- X                fprintf(stderr, "Warning: can't lock \"%s\" -- EDEADLK\n", WidFile);
- X                FileKey = 1;
- X                break;
- X            case EINTR:
- X                fprintf(stderr, "Please Wait... someone has the key...\n");
- X                sleep(1);
- X                break;
- X            }
- X        }
- X        if (--NumberOfTriesLeft <= 0) {
- X            fprintf(stderr, "Warning: can't lock the file \"%s\"\n", WidFile);
- X        }
- X    } while (FileKey < 0);
- X    (void) alarm(0);
- X
- X    if (stat(WidFile, &StatBuf) == -1) {
- X        fprintf(stderr, "It went away!\n");
- X        exit(1);
- X    }
- X#endif
- X
- X    /* Read the file, leaving room for the \0 */
- X
- X    BytesRead = read(fd, Buffer, (unsigned int) (sizeof(Buffer) - 1));
- X    if (BytesRead < 0) {
- X        fprintf(stderr, "Can't read from \"%s\"\n", WidFile);
- X        exit(1);
- X    }
- X
- X    Buffer[BytesRead] = '\0';
- X
- X    Result = atol(Buffer);
- X
- X    /* if WriteCurrent is set, we should not increment anything */
- X    if (!WriteCurrent) {
- X        ++LastNextWIDVal;
- X        ++Result;
- X    }
- X
- X    if (Result < LastNextWIDVal) {
- X        Result = LastNextWIDVal;
- X    }
- X
- X    (void) sprintf(Buffer, "%lu\n", Result);
- X
- X    /* Move to the start of the file and write the new value.
- X     * No need to truncate the file, because it didn't shrink!
- X     * (the offset is the long argument to lseek, not the whence)
- X     */
- X    (void) lseek(fd, 0L, 0);
- X    (void) write(fd, Buffer, (unsigned int) strlen(Buffer));
- X
- X#if 0
- X    /** Unlock the file **/
- X
- X    if (lockf(fd, F_ULOCK, 0L) < 0 && FileKey == 0) {
- X        fprintf(stderr, "Warning: might not have unlocked \"%s\"\n",
- X            WidFile);
- X    }
- X#endif
- X    (void) close(fd);
- X
- X    return Result;
- X}
- X
- X/* DeleteWord -- remove all of the occurrences of a word from the database.
- X *
- X * The word keeps its WID, but its pblock (if it has one) is deleted and
- X * its WordInfo entry is rewritten with no offset and no word places.
- X *
- X * Returns 0 if all went well, or -1 if the word isn't in the index,
- X * its WordInfo can't be read, or the index entry can't be rewritten.
- X */
- Xint
- XDeleteWord(Word)
- X    char *Word;
- X{
- X    extern t_pblock *Getpblock();
- X
- X    t_WID WID;
- X    t_WordInfo *WordInfo;
- X    t_pblock *tmp;
- X    int Result = 0;
- X
- X    if ((WID = Word2WID(Word, strlen(Word))) == (t_WID) 0) {
- X        return -1; /* not there */
- X    }
- X
- X    /* get info from the list */
- X    if ((WordInfo = WID2WordInfo(WID)) == (t_WordInfo *) 0) {
- X        return -1;
- X    }
- X
- X    if ((tmp = Getpblock(WordInfo)) != (t_pblock *) NULL) {
- X        Deletepblock(tmp);
- X        (void) efree((char *) tmp); /* efree() needs a char * */
- X    }
- X
- X    /* delete the offset from the database, but retain the WID: */
- X    WordInfo->Offset = 0L;
- X    WordInfo->NumberOfWordPlaces = 0L;
- X    WordInfo->WordPlacesInHere = 0;
- X    if (PutWordInfoIntoIndex(WordInfo, 0L) < 0) {
- X        Result = -1; /* the dbm store failed; don't claim success */
- X    }
- X    SlayWordInfo(WordInfo);
- X
- X    return Result;
- X}
- X
- X/* Routines to create and destroy WordInfo structures */
- X
- X/* MakeWordInfo -- allocate and initialise a WordInfo structure.
- X *
- X * The structure gets its own nul-terminated copy of the first Length
- X * bytes of Word.  WID and Length are stored as given, and every other
- X * field is set to zero.  Free the result with SlayWordInfo().
- X */
- XINLINE t_WordInfo *
- XMakeWordInfo(WID, Length, Word)
- X    t_WID WID;
- X    int Length;
- X    char *Word; /* the word, which might not be nul-terminated */
- X{
- X    register t_WordInfo *New;
- X
- X    New = (t_WordInfo *) emalloc(sizeof(t_WordInfo));
- X
- X    /* the word itself */
- X    New->Word = emalloc(Length + 1);
- X    (void) strncpy(New->Word, Word, Length);
- X    New->Word[Length] = '\0'; /* strncpy does not add a null */
- X    New->Length = Length;
- X
- X    /* where it lives */
- X    New->WID = WID;
- X    New->FID = (t_FID) 0;
- X    New->Offset = 0;
- X    New->Next = (t_WordInfo *) 0;
- X
- X    /* no word places yet */
- X    New->NumberOfWordPlaces = 0;
- X    New->WordPlacesInHere = 0;
- X    New->WordPlaces = (t_WordPlace *) 0;
- X    New->DataBlock = (char *) 0;
- X    New->WordPlaceStart = (char *) 0;
- X    New->WordPlace.FID = 0; /* mark as invalid */
- X    New->WordPlace.Flags = 0; /* this gets used anyway, so set it to zero! */
- X
- X    return New;
- X}
- X
- X/* SlayWordInfo -- free a WordInfo structure, along with its Word and
- X * its WordPlaces array if it has them.  A null pointer is ignored.
- X * Nothing else that the structure points to is freed.
- X */
- Xvoid
- XSlayWordInfo(WP)
- X    t_WordInfo *WP;
- X{
- X    if (WP == (t_WordInfo *) 0) {
- X        return;
- X    }
- X
- X    if (WP->WordPlaces) {
- X        efree((char *) WP->WordPlaces);
- X    }
- X    if (WP->Word) {
- X        efree(WP->Word);
- X    }
- X
- X    /* Clear Next, to force a run-time error in the common
- X     * (but wrong) case
- X     *   for (w = WordList; w; w = w->Next) SlayWordInfo(w);
- X     */
- X    WP->Next = (t_WordInfo *) 0;
- X
- X    efree((char *) WP);
- X}
- @@@End of lq-text/src/liblqtext/WordInfo.c
- echo end of part 04
- --
- Liam R. E. Quin, lee@sq.com, SoftQuad Inc., Toronto, +1 (416) 963-8337
-