Source Code 1992 March

home *** CD-ROM | disk | FTP | other *** search

/ Source Code 1992 March / Source_Code_CD-ROM_Walnut_Creek_March_1992.iso / usenet / altsrcs / 2 / 2952 < prev next >

Wrap

Internet Message Format | 1991-03-04 | 58.2 KB

From: lee@sq.sq.com (Liam R. E. Quin) Newsgroups: alt.sources Subject: lq-text Full Text Retrieval Database Part 04/13 Message-ID: <1991Mar4.020316.16307@sq.sq.com> Date: 4 Mar 91 02:03:16 GMT : cut here --- cut here -- : To unbundle, sh this file #! /bin/sh : part 04 echo x - lq-text/src/liblqtext/Phrase.c 1>&2 sed 's/^X//' >lq-text/src/liblqtext/Phrase.c <<'@@@End of lq-text/src/liblqtext/Phrase.c' X/* Phrase.c -- Copyright 1989 Liam R. Quin. All Rights Reserved. X * This code is NOT in the public domain. X * See the file COPYRIGHT for full details. X */ X X/* X * Deal with (WID, FID, Offfset) triples X * Liam Quin, September 1989 X * X * $Id: Phrase.c,v 1.11 91/02/22 18:22:59 lee Rel1-10 $ X * X * $Log: Phrase.c,v $ X * Revision 1.11 91/02/22 18:22:59 lee X * Improved a trace message. X * X * Revision 1.10 90/10/06 00:11:57 lee X * Prepared for first beta release. X * X * Revision 1.9 90/10/04 17:10:43 lee X * Case matching now works on one-word phrases. X * X * Revision 1.8 90/10/03 20:47:06 lee X * changed an int to an unsigned long X * X * Revision 1.7 90/08/29 21:46:39 lee X * Alpha release. X * X * Revision 1.6 90/03/29 19:59:05 lee X * Now passes gcc -Wall X * X * Revision 1.5 90/03/19 00:02:23 lee X * Simplified phrase matching greatly by adding new routine, and also X * improved checking of word flags and generation of ModifiedString, X * the canonical phrase, in String2Phrase. X * X * Revision 1.4 90/03/16 22:43:15 lee X * After Richard's consummate help... X * fixed two severe bugs in matching, and started to simplify X * MakeMatches(). X * X * Revision 1.3 90/03/14 21:02:14 lee X * Before Richard... X * X * Revision 1.2 90/03/12 22:39:30 lee X * prepared for general release. 
X * X * Revision 1.1 89/09/17 23:01:34 lee X * Initial revision X * X * X */ X X/** Unix system calls that need to be declared: **/ Xextern void exit(); X/** Unix/C Library Functions: **/ Xextern unsigned int sleep(); X#ifndef tolower X extern int tolower(); X#endif Xextern int strlen(); Xextern char *strcpy(); X/** lqtext functions: **/ Xextern int TooCommon(); Xextern char *UnFlag(); X/** **/ X X#include "globals.h" /* defines and declarations for database filenames */ X X#include <stdio.h> /* stderr, also for fileinfo.h */ X#include <fcntl.h> X#include <malloc.h> X#include <sys/types.h> X#include <ctype.h> X X#include "fileinfo.h" /* for wordinfo.h */ X#include "wordinfo.h" X#include "pblock.h" X#include "phrase.h" X#include "wordrules.h" X X#include "emalloc.h" X X#ifndef STREQ X# define STREQ(boy,girl) ((*(boy) == *(girl)) && (!strcmp((boy),(girl)))) X#endif X X#ifndef new X# define new(type) ((type *) emalloc(sizeof (type))) X#endif X X#ifndef MAXPHRASELEN X# define MAXPHRASELEN 2000 X#endif X Xextern int AsciiTrace; X Xt_Phrase * XString2Phrase(String) X char *String; X{ X extern t_WordInfo *WID2WordInfo(); X extern t_WID Word2WID(); X extern char *WordRoot(); X X t_Phrase *Result; X t_PhraseItem **ThisWord; X /* (* 3 because in the worst case, "a a a" expands to "[a] [a] [a]") */ X register char *p; X register char *q; X char *LastStart = 0; X char *PrevLastEnd = (char *) 0; X int InWord = 0; X int Flags = 0; X int FoundLetters = 0; X X if (AsciiTrace > 50) { X fprintf(stderr, "String2Phrase(%s)\n", String); X } X Result = (t_Phrase *) emalloc(sizeof(t_Phrase)); X Result->Next = (t_Phrase *) 0; X p = Result->ModifiedString = emalloc(strlen(String) * 3 + 1); X X Result->HasUnknownWords = 0; X X *(ThisWord = &Result->Words) = (t_PhraseItem *) 0; X X /* March along the supplied phrase, looking for keywords. X * surround unindexed or short words with [brackets]. X * Also converts to lower case and strips plurals. 
X */ X for (q = String; /*LOTSOFTIMES*/; q++) { X X if (AsciiTrace > 50) { X fputc(*q, stderr); X } X X if (!InWord && !StartsWord(*q)) { X if (!*q) { X break; X } else { X if (!LastStart) continue; X } X } X X if (!InWord) { X LastStart = q; X if (StartsWord(*q)) { X InWord = 1; X } X continue; X } else if (isalpha(*q)) { X /* in a word and found letters, so remember in case we skip X * this word... X */ X FoundLetters = 1; X } X X /* ASSERT: inword == 1 */ X X if (*q == '\'') { X if (!WithinWord(q[1])) { X InWord = 0; X } X } X X if (!*q || !WithinWord(*q)) { X InWord = 0; X } X X X if (LastStart && !InWord) { X int Length = q - LastStart; X int UsedABracket = 0; X X if (p > Result->ModifiedString) *p++ = ' '; X X /* we have reached the end of a word, is it long enough? */ X if (!FoundLetters) { X *p++ = '['; X UsedABracket = 1; X } else if (Length < MinWordLength) { X *p++ = '['; X UsedABracket = 1; X if (FoundLetters) { X Flags |= WPF_LASTHADLETTERS; X FoundLetters = 0; X } X } else { X t_WID WID; X t_WordInfo *W; X char SaveEnd = (*q); X t_WordInfo TryRoot; X register char *p2; X char RootBuffer[MaxWordLength + 1]; X char *R = RootBuffer; X X /* Add the word to the chain, too: */ X *q = '\0'; X X FoundLetters = 0; /* unnecessary now (?) */ X TryRoot.Length = Length; X TryRoot.WordPlace.Flags = Flags; X if (isupper(*LastStart)) { X TryRoot.WordPlace.Flags |= WPF_UPPERCASE; X } X Flags = 0; X X for (p2 = LastStart; *p2; p2++) { X *R++ = isupper(*p2) ? tolower(*p2) : *p2; X } X *R = '\0'; X TryRoot.Word = RootBuffer; X X R = WordRoot(&TryRoot); X X if (TooCommon(&TryRoot)) { X *p++ = '['; X *p++ = '*'; X UsedABracket = 1; X Flags |= WPF_LASTWASCOMMON; X if (AsciiTrace > 10) { X fprintf(stderr, " Common(%s) ", TryRoot.Word); X } X } else if (!(WID = Word2WID(TryRoot.Word, TryRoot.Length)) || X (W = WID2WordInfo(WID)) == (t_WordInfo *) 0) { X *p++ = '['; X *p++ = WID ? 
'@' : '?'; X UsedABracket = 1; X if (AsciiTrace > 10) { X fprintf(stderr, " Unknown(%s) ", TryRoot.Word); X } X Result->HasUnknownWords++; X } else { X if ((*ThisWord = new(t_PhraseItem)) == (t_PhraseItem *) 0) { X fprintf(stderr, X "Not enough memory for PHRASE \"%s\"", String); X return (t_Phrase *) 0; X } X W->WordPlace.Flags |= TryRoot.WordPlace.Flags; X if (PrevLastEnd == (char *) 0) { X W->WordPlace.StuffBefore = 0; X } else { X W->WordPlace.StuffBefore = LastStart - PrevLastEnd; X } X PrevLastEnd = &q[1]; X X if (AsciiTrace) { X fprintf(stderr, "Word %s --> %s, %lu matches\n", X LastStart, X UnFlag(W, W->WordPlace.Flags), X W->NumberOfWordPlaces); X } X /* point to the new space */ X (*ThisWord)->Word = W; X (*ThisWord)->WordStart = LastStart; X (*ThisWord)->Next = (t_PhraseItem *) 0; X (*ThisWord)->SearchIndex = 0L; X ThisWord = &(*ThisWord)->Next; X X /** (void) strcpy(p, R); **/ X /** p += TryRoot.Length; **/ X X (void) strcpy(p, LastStart); X p += q - LastStart; /* q points one beyond the end */ X X LastStart = q; X X } X *q = SaveEnd; X } X while (LastStart < q) { X *p++ = *LastStart++; X } X if (UsedABracket) { X *p++ = ']'; X } X LastStart = 0; X } /* if */ X if (!*q) break; X } /* for */ X *p= '\0'; X X if (ThisWord == &Result->Words) { X /* There were no words in the phrase! 
*/ X return (t_Phrase *) 0; X } X X Result->OriginalString = emalloc(q - String + 2); X (void) strcpy(Result->OriginalString, String); X X Result->NumberOfMatches = 0; X Result->Matches = (t_MatchList *) 0; X if (AsciiTrace > 1) { X fprintf(stderr, "phrase \"%s\",\n", Result->OriginalString, String); X fprintf(stderr, "Canonical form \"%s\"\n", Result->ModifiedString); X } X return Result; X} X X#define MaxDistance 20 X Xt_Answer * XGetFiles(Phrase) X t_Phrase *Phrase; X{ X char *MakeOneDescription(); X X t_Answer *Result = 0; X t_Answer **RP = &Result; X t_MatchList *MP; X t_FID LastFID; X unsigned long ThisFIDNumberOfMatches = 0L; X X if (!Phrase || !Phrase->Matches) { X return Result; X } X X LastFID = Phrase->Matches->Match->Where->FID; X X for (MP = Phrase->Matches; MP; MP = MP->Next) { X if (MP->Match->Where->FID != LastFID) { X char *p; X X p = MakeOneDescription(LastFID, ThisFIDNumberOfMatches); X X if ((*RP = new(t_Answer)) == (t_Answer *) 0) { X return Result; X } X (*RP)->Answer = p; X RP = &(*RP)->Next; X *RP = (t_Answer *) 0; X ThisFIDNumberOfMatches = 0L; X } else { X ++ThisFIDNumberOfMatches; X } X X LastFID = MP->Match->Where->FID; X } X X if (ThisFIDNumberOfMatches) { X char *p = MakeOneDescription(LastFID, ThisFIDNumberOfMatches); X X if ((*RP = new(t_Answer)) == (t_Answer *) 0) { X return Result; X } X (*RP)->Answer = p; X RP = &(*RP)->Next; X *RP = (t_Answer *) 0; X ThisFIDNumberOfMatches = 0L; X } X X return Result; X} X Xchar * XMakeOneDescription(FID, NumberOfMatches) X t_FID FID; X unsigned long NumberOfMatches; X{ X extern char *ctime(); X extern t_FileInfo *GetFileInfo(); X X char *Date; X char *p; X t_FileInfo *FileInfo; X char NumBuf[20]; X X if (!FID) return (char *) 0; X X if (!(FileInfo = GetFileInfo(FID))) return (char *) 0; X X Date = ctime(&(FileInfo->Date)); X /** Tue Oct 3 00:57:11 BST 1989 **/ X Date[10] = '\0'; X X (void) sprintf(NumBuf, "%lu", NumberOfMatches); X X p = emalloc((unsigned) (strlen(FileInfo->Name) + 
strlen(NumBuf) + 11)); X (void) sprintf(p, "%-.5s %s %s", NumBuf, Date, FileInfo->Name); X efree(FileInfo->Name); X efree(FileInfo); X return p; X} X Xvoid XResetPhraseMatch(Phrase) X t_Phrase *Phrase; X{ X t_PhraseItem *Word; X X if (!Phrase || !Phrase->Words) return; X X for (Word = Phrase->Words; Word; Word = Word->Next) { X Word->SearchIndex = 0; X } X Phrase->NumberOfMatches = 0; X} X X/* Default is to check case, etc. only if given in the input phrase. X * This is an enum from phrase.h, and only used in MakeMatches(). X */ X Xextern t_PhraseCaseMatch PhraseMatchLevel; X Xlong XMakeMatches(Phrase) X t_Phrase *Phrase; X{ X /* Each word has a pointer (SearchIndex) to the last Word Place X * that was examined. This enables an O(NumberOfWords) search instead X * of O(NumberOfWords * NumberOfWords) search. X */ X static int ContinuesMatch(); X X unsigned long PIFB; /* PlaceInFirstBlock */ X t_MatchList **MLPP = &(Phrase->Matches); X t_Match **MPP; X t_Match **OldMatch; X t_WordPlace *pp; X t_PhraseItem *Word; X long Result = 0L; X long LastResult = (-1L); /* to detect new matches */ X t_PhraseItem *LeastWord; X int HowGood; X X if (!Phrase) { X return 0L; X } X X ResetPhraseMatch(Phrase); X /* Each iteration over this list either produces a match or rejects a X * possible phrase starting place. X */ X X if (AsciiTrace > 1) { X fprintf(stderr, "Match(%s)\n", Phrase->ModifiedString); X } X X /* A phrase with garbage words can't match anything */ X if (Phrase->HasUnknownWords && PhraseMatchLevel != PCM_AnyCase) { X return 0L; X } X X /* Ensure that the matches for the first word have been read */ X if (Phrase->Words->Word->WordPlacesInHere < X Phrase->Words->Word->NumberOfWordPlaces) { X extern t_WordPlace *GetWordPlaces(); X t_WordInfo *W = Phrase->Words->Word; /* less indirection! 
*/ X X if (W->WordPlaces) { X (void) efree(W->WordPlaces); X } X X W->WordPlaces = GetWordPlaces( X W->WID, X W->WordPlaceStart, X (unsigned) WIDBLOCKSIZE - (W->WordPlaceStart - W->DataBlock), X W->Offset, X W->NumberOfWordPlaces X ); X W->WordPlacesInHere = W->NumberOfWordPlaces; X } X X /* Find the word in the phrase with least matches: */ X LeastWord = Phrase->Words; X for (Word = Phrase->Words; Word; Word = Word->Next) { X if (Word->Word->NumberOfWordPlaces < X LeastWord->Word->NumberOfWordPlaces) { X LeastWord = Word; X } X } X X /* For each match in the first word in the phrase: */ X for (PIFB = 0; PIFB < Phrase->Words->Word->NumberOfWordPlaces; PIFB++) { X t_WordPlace *LastFOP = (t_WordPlace *) 0; X X /* The idea is that the next two loops are are likely to reduce X * considerably the number of places we have to consider in the X * case that the first word in the phrase has a lot of matches X * and there is a subsequent word with relatively few matches. X * Experiments suggest that this is fairly common. X * X * This is still a nearly (i.e. slightly-better-than) linear X * algorithm w.r.t the total number of matches in all of the X * words added up. Note that I alter LeastWord->SearchIndex in X * one of the two loops that follow, so when WordPlaces from that X * word are considered, we don't have to look at any twice. X * X * In order to do better, one would have to be able to avoid X * looking at some or (better!) most of the WordPlaces. X * X * For example, not fetching so many from disk: X * if we didn't do the fetches until we needed to, and we gave X * GetWordPlaces a minimum FID to look for, we might be able X * to reduce things by (say) 15%. X * If all of the FIDS were stored separately, we would not X * have to look at the (Block, Word, Flags, StuffBefore) stuff at X * all, and that would be much faster. 
One way to do that might be X * to store the list of FIDs with the word (as now), and perhaps X * some flags and the count of words/fid, and to store the rest X * in a per-file data structure. X * X * That would be a major, major hack... X * ... sigh.o X * X */ X X while (LeastWord->Word->WordPlaces[LeastWord->SearchIndex].FID < X Phrase->Words->Word->WordPlaces[PIFB].FID) { X if (++(LeastWord->SearchIndex) >= X LeastWord->Word->NumberOfWordPlaces) { X goto GiveUp; X } X } X X while (Phrase->Words->Word->WordPlaces[PIFB].FID < X LeastWord->Word->WordPlaces[LeastWord->SearchIndex].FID) { X if (++PIFB >= Phrase->Words->Word->NumberOfWordPlaces) { X goto GiveUp; X } X } X X /* The following comment tells Sabre_C not to moan about "if (0)" */ X /*SUPPRESS558*/ X if (0) { XGiveUp: X break; X } X /* end of attempted speed improvement */ X X /* Optimistically allocate a new match: */ X if (1 || Result != LastResult) { X *MLPP = (t_MatchList *) emalloc(sizeof(t_MatchList)); X (*MLPP)->Match = (t_Match *) 0; X OldMatch = MPP = &((*MLPP)->Match); X MLPP = &(*MLPP)->Next; X *MLPP = (t_MatchList *) 0; X } X LastResult = Result; X X pp = &Phrase->Words->Word->WordPlaces[Phrase->Words->SearchIndex = PIFB]; X /* When we have a partially completed match, X * FOP (declared below) will point to the WordPlace currently X * being considered to see if it extends the partial match; X * LastFOP points to the previous WordPlace in the match. X */ X X /* For each word in the phrase: */ X for (Word = Phrase->Words; Word; Word = Word->Next) { X int GotOne = 0; X X /* Ensure that the matches word have been read */ X if (Word->Word->WordPlacesInHere < X Word->Word->NumberOfWordPlaces) { X extern t_WordPlace *GetWordPlaces(); X t_WordInfo *W = Word->Word; /* less indirection! 
*/ X X if (W->WordPlaces) { X (void) efree(W->WordPlaces); X } X W->WordPlaces = GetWordPlaces( X W->WID, X W->WordPlaceStart, X (unsigned) WIDBLOCKSIZE - (W->WordPlaceStart - W->DataBlock), X W->Offset, X W->NumberOfWordPlaces X ); X W->WordPlacesInHere = W->NumberOfWordPlaces; X } X X /* For each occurrence of that word: */ X for (; Word->SearchIndex < Word->Word->NumberOfWordPlaces; X ++Word->SearchIndex) { X register t_WordPlace *FOP = X &Word->Word->WordPlaces[Word->SearchIndex]; X X#if 0 X /* Speedup -- binary search to find next candidate... X * this is commented out because it actually seems to X * make things run slower! X */ X { X int low = Word->SearchIndex; X int high = Word->Word->NumberOfWordPlaces - 1; X t_WordPlace *Places = Word->Word->WordPlaces; X int guess = (high + low) / 2; X X while (low < high) { X if (Places[guess].FID < pp->FID) { X /* not gone far enough */ X low = guess + 1; X } else { X high = guess; X } X guess = (high + low) / 2; X } X if (guess != Word->SearchIndex) { X Word->SearchIndex = guess; X FOP = &Word->Word->WordPlaces[Word->SearchIndex]; X } X } X#endif X X if (!LastFOP) { X LastFOP = FOP; X } X X /** So: X ** | int PIFB = each match in the first word in the phrase X ** | t_WordPlace *pp = each match in the phrase X ** | t_PhraseItem *Word = each word in the phrase X ** | unsigned SearchIndex = each match of that word X ** | t_WordPlace *FOP = each occurrence of that word X ** X ** Hence, we are comparing pp and FOP, hoping that each time X ** round the (Word) loop we will advance FOP. X ** Once we have decided that FOP and pp relate to the X ** same file and that FOP is no earlier than pp in the X ** file, we must then check that FOP is advancing the X ** chain by comparing it to the previous element in the X ** list (LastFOP). 
X ** X ** When we break from this inner list, we must either have X ** eliminated this particular (PIFB) as starting a match- X ** chain, or have decided that we have extended the X ** current match chain (by setting GotOne). X **/ X X X if (LastFOP == FOP) { X HowGood = CheckFlags(Word->Word, FOP); X } else { X HowGood = ContinuesMatch(Word->Word, pp, LastFOP, FOP); X } X X switch (HowGood) { X case 0: X /* G O T C H A !!!! */ X /* extend the HitList, since it's OK so far. */ X X *MPP = (t_Match *) emalloc(sizeof(t_Match)); X (*MPP)->WID = Word->Word->WID; X (*MPP)->Where = FOP; X (*MPP)->Next = (t_Match *) 0; X MPP = &(*MPP)->Next; X GotOne++; X break; X case 1: /* gone too far */ X if (AsciiTrace > 10) { X t_WordInfo WW; X X WW = *(Word->Word); X X if (LastFOP == FOP) { X /* UnFlag() returns a pointer to a static buffer, X * so I have to use two printf() calls here. X */ X fprintf(stderr, "Reject(%s (%d) != ", X UnFlag(&WW, WW.WordPlace.Flags), X WW.WordPlace.Flags); X fprintf(stderr, "%s (%d)) [flags]\n", X UnFlag(&WW, FOP->Flags), FOP->Flags); X } else { X fprintf(stderr, "Reject(%s) -- too far\n", X UnFlag(&WW, WW.WordPlace.Flags)); X } X } X break; X case -1: X continue; /* not there yet */ X default: X fprintf(stderr, "\n\rInternal Error %s: %d\n", __FILE__, X __LINE__ - 1); X (void) sleep(4); /* for curses stuff... */ X exit(1); X } X X /* Remember where we got up to... so that we can extend X * the list when we start looking at the next word. 
X */ X LastFOP = FOP; X X if (AsciiTrace >= 4) { X t_WordInfo WW; X X WW = *(Word->Word); X /* UnFlag() returns a pointer to a static buffer */ X fprintf(stderr, "Partial match %s", X UnFlag(&WW, Word->Word->WordPlace.Flags)); X fprintf(stderr, "(Word (%s,%lu,%u) in file %lu)\n", X UnFlag(&WW, FOP->Flags), X FOP->BlockInFile, FOP->WordInBlock, X FOP->FID X ); X } X /* If we got to here, we extended the list, which is fine; X * otherwise, if we hit a continue, we try to carry on X * looking at matches of this word, and if we hit a break X * before we set "GotOne", we give up on this match X * altogether. X */ X break; X } /* For each occurrence of that word: */ X X if (!GotOne) { X t_Match *MP; X /* This word isn't here, so neither is the phrase found X * in this file starting here. X */ X X for (MP = (*OldMatch); MP != (t_Match *) 0; /*void*/) { X t_Match *Next = MP->Next; X X efree((char *) MP); X MP = Next; X } X X *OldMatch = (t_Match *) 0; X break; X } else { X /* If we've reached the end of the phrase, i.e. if X * Word->Next is zero, we have successfully added a new X * phrase! X */ X if (Word->Next == (t_PhraseItem *) 0) { X if (AsciiTrace > 10) { X fprintf(stderr, "Result now %d\n", Result + 1); X } X Result++; X } X } X X } /* end for (each word in the phrase) */ X } /* end (for each FID/Offset pair in the first word */ X return Phrase->NumberOfMatches = Result; X} X X Xstatic int XContinuesMatch(QueryWord, First, Prev, New) X t_WordInfo *QueryWord; X t_WordPlace *First; X t_WordPlace *Prev; X t_WordPlace *New; X{ X /* Return Value is X * -1 --- if New occurs before Prev (and thus isn't part of the match) X * 0 --- if it's the next word in the match X * +1 --- if we've gone past it X * Note: you can use these values in a switch() if you want. X */ X X /* First check we are looking at the right file: X * Have we gone far enough? 
X */ X if (New->FID < First->FID) { X return -1; /* not far enough */ X } else if (New->FID > First->FID) { X return 1; /* too far */ X } else if (Prev == New) { X return 0; X } X X /* Hey everybody, they're the same! X * That means that this might be a candidate for a MATCH!!!! X */ X X /* if (SimplyAnywhereWillDo) { OK; break; } */ X X /* Clearly later words in the phrase can't be in earlier X * blocks... X */ X if (New->BlockInFile < First->BlockInFile) { X /* Although we are in the right file, we have not X * yet reached the correct offset. X */ X return -1; X } X X /* If we get to here, X * . we are in the right file X * . we are at least as far into the file as the start X * of the phrase X */ X X /* Now check that we are a reasonable distance past X * the preceding word (checking that we are not on the first X * match in the list, of course): X */ X if (New->BlockInFile < Prev->BlockInFile) { X /* not gone far enough */ X return -1; X } X if (New->BlockInFile > Prev->BlockInFile + 1) { X /* If they are more than one block apart, I X * don't believe them to be part of a phrase! X */ X return 1; X } X if (New->BlockInFile == Prev->BlockInFile) { X /* If they are in the same block, one must be X * exactly one word beyond the other. I don't X * think they can ever be the same, unless there X * is a serious bug somewhere! X */ X if (New->WordInBlock <= Prev->WordInBlock) { X /* too early in the block */ X return -1; X } X switch (PhraseMatchLevel) { X case PCM_AnyCase: X if (New->WordInBlock > Prev->WordInBlock + 4) { X return 1; /* gone too far */ X /* We allow a few words slop in this case, though... */ X } X break; /* clearly OK */ X case PCM_SameCase: X case PCM_HalfCase: X default: X if (New->WordInBlock > Prev->WordInBlock + 1) { X return 1; /* gone too far */ X } X } X } else { X /* they are in adjacent blocks */ X if (New->WordInBlock > 0 || X !(Prev->Flags & WPF_LASTINBLOCK)) { X /* there is another word between them, so X * we have gone too far. 
X * I went to a lot of effort in addfile.c to X * mantain that flag, just for this! X */ X return 1; X } X } X /* So they are adjacent words. X * Now, I wonder if they are plausible distances X * apart, and whether the common words skipped are X * the same? X * Also, what about other flag details? X */ X X /* NOTDONE */ X X /* Now we check that the word matches the given word X * -- in other words, that possessive/plural/case X * is correct if required. Do this later as it is X * relatively expensive I expect, and we will not X * usually care about case. X * X * Since the word is in the right place, if it fails here there X * is no point in looking at the next word in this block! X */ X X return CheckFlags(QueryWord, New); X} X XCheckFlags(QueryWord, New) X t_WordInfo *QueryWord; X t_WordPlace *New; X{ X /* First check case */ X switch (PhraseMatchLevel) { X X default: /* defensive! */ X fprintf(stderr, "\n\rinternal error %d %s\n", __LINE__, __FILE__); X (void) sleep(4); X break; X case PCM_AnyCase: X break; /* clearly OK */ X case PCM_SameCase: X if ((QueryWord->WordPlace.Flags & (WPF_UPPERCASE|WPF_POSSESSIVE)) != X (New->Flags & (WPF_UPPERCASE|WPF_POSSESSIVE))) { X /* The cases are different, no good */ X return 1; /* give up on this match */ X } X if (QueryWord->WordPlace.StuffBefore > 0) { X int Difference; X X Difference = QueryWord->WordPlace.StuffBefore - New->StuffBefore; X if (Difference < -2 || Difference > 4) { X return 1; /* give up on this match */ X } X } X /* Now, what about skipped common words? */ X if ((New->Flags & WPF_LASTWASCOMMON) != X (QueryWord->WordPlace.Flags & WPF_LASTWASCOMMON)) { X return 1; /* give up on this match */ X } X X /* plurals: this should be separate */ X if ((QueryWord->WordPlace.Flags & WPF_WASPLURAL) && X !(New->Flags & WPF_WASPLURAL)) { X return 1; /* give up on this match */ X } X X /* Only do this test if we are being awfully strict. 
X * Remember also that the first word in the phrase will X * not usually have this set. X */ X if ((QueryWord->WordPlace.Flags & WPF_LASTHADLETTERS) && X !(New->Flags & WPF_LASTHADLETTERS)) { X return 1; /* give up on this match */ X } X break; X case PCM_HalfCase: X /* In this case, we are lax about things, but if the X * user typed plural/possessive/capital, we only X * match one with the same attribute. X */ X if ((QueryWord->WordPlace.Flags & WPF_UPPERCASE) && X !(New->Flags & WPF_UPPERCASE)) { X if (AsciiTrace > 4) { X fprintf(stderr, "Reject [uppercase]\n"); X } X return 1; /* give up on this match */ X } X X /* plurals: this should be separate */ X if ((New->Flags & WPF_WASPLURAL) && X !(QueryWord->WordPlace.Flags & WPF_WASPLURAL)) { X if (AsciiTrace > 4) { X fprintf(stderr, "Reject [plural]\n"); X } X return 1; /* give up on this match */ X } X X /* Now, what about skipped common words? */ X if ((QueryWord->WordPlace.Flags & WPF_LASTWASCOMMON) && X !(New->Flags & WPF_LASTWASCOMMON)) { X if (AsciiTrace > 4) { X fprintf(stderr, "Reject [last was common]\n"); X } X return 1; /* give up on this match */ X } X /* Stuff before, if given, must be present: */ X if (QueryWord->WordPlace.StuffBefore > 1) { X if (New->StuffBefore < QueryWord->WordPlace.StuffBefore - 1) { X if (AsciiTrace > 4) { X fprintf(stderr, "Reject [Stuff Before %d != Q%d]\n", X QueryWord->WordPlace.StuffBefore, X New->StuffBefore); X } X return 1; X } /* don't care if there is too much there, though */ X } X if ((QueryWord->WordPlace.Flags & WPF_POSSESSIVE) && X !(New->Flags & WPF_POSSESSIVE)) { X if (AsciiTrace > 4) { X fprintf(stderr, "Reject [user flag]\n"); X } X return 1; /* give up on this match */ X } X break; X } X X /* If we got here... X * X */ X X return 0; /* It's all OK! */ X} @@@End of lq-text/src/liblqtext/Phrase.c echo x - lq-text/src/liblqtext/Root.c 1>&2 sed 's/^X//' >lq-text/src/liblqtext/Root.c <<'@@@End of lq-text/src/liblqtext/Root.c' X/* Root.c -- Copyright 1989 Liam R. 
Quin. All Rights Reserved. X * This code is NOT in the public domain. X * See the file COPYRIGHT for full details. X */ X X/* X * $Id: Root.c,v 2.7 91/03/03 00:13:36 lee Rel1-10 $ X * X * $Log: Root.c,v $ X * Revision 2.7 91/03/03 00:13:36 lee X * cosmetic changes. X * X * Revision 2.6 90/10/06 00:11:59 lee X * Prepared for first beta release. X * X * Revision 2.5 90/08/29 21:46:42 lee X * Alpha release. X * X * Revision 2.4 90/08/09 19:16:29 lee X * BSD lint and fixes... X * X * Revision 2.3 90/03/29 23:00:04 lee X * Now passes gcc -Wall X * X * Revision 2.2 89/10/08 20:44:56 lee X * Working version of nx-text engine. Addfile and wordinfo work OK. X * X * Revision 2.1 89/10/02 01:13:07 lee X * New index format, with Block/WordInBlock/Flags/BytesSkipped info. X * X * X */ X X#include "globals.h" /* defines and declarations for database filenames */ X X#include <sys/types.h> X#include <fcntl.h> /* for my header files, sorry */ X#include <stdio.h> X#include <malloc.h> X#include <ctype.h> X X#include "fileinfo.h" X#include "wordinfo.h" X#include "wordrules.h" X#include "emalloc.h" X X/** Unix system calls that need to be declared: **/ X /* (none) */ X/** C Library functions that nees to be declared: **/ Xextern void perror(); Xextern int strcmp(); Xextern int strlen(); Xextern char *strcpy(); Xextern char *strcat(); X#ifndef tolower X extern int toupper(); X#endif X X/** lqtext functions that need to be declared: **/ X/** Functions from this file that need to be declared: **/ Xvoid InsertCommonWord(); X/** **/ X X/** Useful macros **/ X#define new(type) ((type *) emalloc(sizeof(type))) X /* so you can say X * struct foo *x = enew(struct foo) X */ X X#define STRCMP(s1,s2) ((*(s1) == *(s2)) ? strcmp(s1, s2) : *(s1) - *(s2)) X /* faster then strcmp in the (common) case where the X * strings differ at the first character. 
X * From an idea by Henry Spencer (utzoo!henry) X */ X X/** **/ X Xextern int AsciiTrace; X X/* This routine is only sensible for English (although it could be X * modified...), but that does not matter. X */ Xchar * XWordRoot(WordInfo) X t_WordInfo *WordInfo; X{ X char *Word; X X if (!WordInfo) return "@#!!"; X X Word = WordInfo->Word; X X if (!Word) { X return "oh dear"; X } X X if (!*Word) { X return Word; X } X X /** delete trailing <'s> and mark posessive */ X while (WordInfo->Length >= 3 && Word[WordInfo->Length - 1] == 's' && X Word[WordInfo->Length - 2] == '\'') { X WordInfo->Length -= 2; X Word[WordInfo->Length] = '\0'; X WordInfo->WordPlace.Flags |= WPF_POSSESSIVE; X } X X /** delete trailing plural suffix and mark plural */ X X /* It's important to realise that the purpose of this routine is not X * in any way to reduce a word to an etymological root. In other words, X * no attempt is made to differentiate between plurals and present X * participles, or words that simply happen to end in `s'. X * Hence, elephants, blunderbus, hostess, runs and tomatoes are all X * candidates. Of course, one would like to do as well as one can! X * Again, the object isn't to derive the correct singular, but instead X * to be fairly fast, and, above all, to ensure that any transformations X * are reversible! X * X * The result is that I can store dog and dogs in the same Wordinfo X * chain. In the case that either word is unusual, there is a space X * saving of (on average) 30 or so bytes. More usefully, if you ask X * for `Window', I will automatically find `Windows' as well. X * X * so... X * XXXo, XXXss, XXXsh, XXXch, XXXx --> +es X * except: pianos, dynamos, photos X * XXCy --> XXCies [ C consonant] X * XXVy --> XXVys [ V vowel ] X * f or fe --> ves (12 cases only) X * vowel change: X * foot/feet (why bother with these? -- use a thesaurus!) 
X * need to keep penny/pence separate X * See Thomson & Martinet, section 8ff (I think) X */ X if (WordInfo->Length > 2 && Word[WordInfo->Length - 1] == 's') { X WordInfo->WordPlace.Flags |= WPF_WASPLURAL; /* WRONG */ X switch (Word[WordInfo->Length - 2]) { X case 'e': X if (WordInfo->Length >= 3) switch (Word[WordInfo->Length - 3]) { X case 'i': /* xxcies --> xxxy */ X if (WordInfo->Length > 3) { X Word[WordInfo->Length - 3] = 'y'; X WordInfo->Length -= 2; X } else { /* ies not a plural, but lies is :-) */ X WordInfo->Length--; /* just the s */ X } X break; X case 's': X case 'h': X case 'x': X case 'o': /* xxxoes --> xxx */ X WordInfo->Length -= 2; X break; X default: /* xxxes -> xxxe */ X WordInfo->Length -= 1; X break; X } else { /* too short */ X WordInfo->WordPlace.Flags &= X (unsigned short)~(unsigned short)WPF_WASPLURAL; X } X break; X case 'y': /* xxxvys --> xxxvy */ X switch (Word[WordInfo->Length - 2]) { /* e.g. holidays */ X case 'a': /* flays */ X case 'e': /* beys */ X case 'i': /* ??iys?? */ X case 'o': /* boys */ X case 'u': /* guys */ X WordInfo->Length--; /* just remove the s */ X break; X default: /*not a plural, e.g. Unixsys, happy */ X WordInfo->WordPlace.Flags &= X (unsigned short)~(unsigned short)WPF_WASPLURAL; X break; X } X break; X case 's': /* trailing ss doesn't mark a plural! */ X WordInfo->WordPlace.Flags &= X (unsigned short)~(unsigned short)WPF_WASPLURAL; X break; X case 'u': X /* ONE bus, thus, omnibus; TWO gnus, TWO emus */ X /* So it doesn't work for gnus and emus right now! */ X WordInfo->WordPlace.Flags &= X (unsigned short)~(unsigned short)WPF_WASPLURAL; X break; X case 'i': /* not a plural.. 
this, his, fleur-de-lis */ X WordInfo->WordPlace.Flags &= X (unsigned short)~(unsigned short)WPF_WASPLURAL; X break; X case 'a': /* has */ X case 'o': /* cos */ X if (WordInfo->Length < 4) { X WordInfo->WordPlace.Flags &= X (unsigned short)~(unsigned short)WPF_WASPLURAL; X break; X } X /* else fall through */ X default: /* just plain s */ X WordInfo->Length -= 1; X break; X } X Word[WordInfo->Length] = '\0'; X } X /* Should check for ii --> ius here, but that would increase the length X * of the word and therefore will break lots of things. X */ X return WordInfo->Word; X} X Xchar * XUnFlag(WordInfo, Flags) X t_WordInfo *WordInfo; X unsigned int Flags; X{ X static char Buffer[MaxWordLength + 5]; /* 's + es + \0 */ X register char *p, *q; X int Length; X X if (!WordInfo) return "(null word info)"; X if (!WordInfo->Word) return "(null word)"; X if (!WordInfo->Word[0]) return "(empty word)"; X X p = Buffer; X q = WordInfo->Word; X while (*p++ = *q++) X ; X *p = '\0'; X X if ((Length = p - Buffer) != WordInfo->Length) { X /* Well, maybe I can't count */ X WordInfo->Length = Length = strlen(Buffer); X } X X if (Flags & WPF_WASPLURAL) { X if (Length >= 2) switch (Buffer[Length - 1]) { X case 'y': X if (Length > 2) switch (Buffer[Length - 2]) { X case 'a': X case 'e': X case 'i': X case 'o': X case 'u': X Buffer[Length++] = 's'; /* e.g. days */ X break; X default: X strcpy(&Buffer[Length - 1], "ies"); /* ladies */ X Length += 2; X } X break; X case 's': X if (Length > 2) if (Buffer[Length - 2] == 'u') { X strcpy(&Buffer[Length - 1], "ii"); /* Genii */ X break; X } /* else fall through... 
*/ X case 'o': X case 'h': X case 'x': X strcat(Buffer, "es"); X Length += 2; X break; X default: X Buffer[Length++] = 's'; X } X Buffer[Length] = '\0'; X } X/* put back the possessive 's if the word had one */ X if (Flags & WPF_POSSESSIVE) { X Buffer[Length++] = '\''; X Buffer[Length++] = 's'; X Buffer[Length] = '\0'; X } X/* restore the initial capital */ X if (Flags & WPF_UPPERCASE) { X Buffer[0] = toupper(Buffer[0]); X } X X return Buffer; X} X/* One entry in the list of common words; InsertCommonWord() keeps the list in ascending STRCMP order of Word. */ Xtypedef struct s_WordList { X char *Word; X unsigned short Flags; X struct s_WordList *Next; X} t_WordList; X Xstatic t_WordList *CommonWords = 0; X/* TooCommon -- return 1 if WordInfo->Word is in the CommonWords list, 0 if not. The scan gives up at the first entry that compares greater than the word, which relies on the list being sorted. WordInfo itself is not checked for null. */ Xint XTooCommon(WordInfo) X t_WordInfo *WordInfo; X{ X register char *Word = WordInfo->Word; X register t_WordList **WP; X X for (WP = &CommonWords; *WP; WP = &(*WP)->Next) { X int i = STRCMP((*WP)->Word, Word); X X if (i == 0) return 1; /* yes, it's common */ X else if (i > 0) return 0; X } X return 0; X} X Xstatic char *FileName = "Internal Error"; X/* should be set before being printed! */ X/* ReadCommonWords -- read CommonFile a line at a time and hand the root (as computed by WordRoot()) of the first word on each line to InsertCommonWord(). A line on which a '#' comes before the start of any word, or which contains no word at all, is skipped. Returns -1, after printing a message, if the file cannot be opened. */ Xint XReadCommonWords(CommonFile) X char *CommonFile; X{ X extern char *fgets(); X extern int AsciiTrace; X X FILE *fd; X extern int errno; X char Buffer[200]; X t_WordInfo W; X char *Root; X t_WordList *WP; X X errno = 0; X X if ((fd = fopen(CommonFile, "r")) == (FILE *) 0) { X int e = errno; /* fprintf() may disturb errno, so keep it for perror() */ X X fprintf(stderr, "Can't open common word list "); X errno = e; X perror(CommonFile); X return -1; X } X X FileName = CommonFile; X X while (fgets(Buffer, sizeof(Buffer), fd) != (char *) 0) { X register char *p; X char *Start; X/* find the start of the first word, or a '#' */ X for (p = Buffer; *p; p++) { X if (*p == '#') break; X if (StartsWord(*p)) break; X } X X if (*p == '#' || !*p) { X continue; X } X X Start = p; X/* find the end of the word; an apostrophe only counts as part of the word when a word character follows it */ X for (; *p; p++) { X if (!WithinWord(*p)) break; X if (*p == '\'' && !WithinWord(p[1])) break; X } X/* NOTE(review): p - Start is the word length, so the + 1 lets through words one character shorter than MinWordLength -- confirm that this is intended */ X if (p - Start + 1 < MinWordLength) continue; X X *p = '\0'; /* delete trailing \n or whatever */ X W.WordPlace.Flags = 0; X W.Word = Start; X W.Length = p - Start; /* length excludes the \0 */ X X Root = WordRoot(&W); X InsertCommonWord(Root, W.WordPlace.Flags); X } X (void) fclose(fd); X X#if 0 X if 
(!CommonWords) { X fprintf(stderr, "No common words found in file \"%s\"\n", FileName); X exit(1); X } X#endif X/* when tracing at level 2 or more, list every common word now held */ X if (AsciiTrace > 1) { X for (WP = CommonWords; WP; WP = WP->Next) { X fprintf(stderr, "Ignore: \"%s\"\n", WP->Word); X } X } X FileName = "Internal Error"; X return 0; X} X/* InsertCommonWord -- add a private copy of Root, together with Flags, to the CommonWords list in front of the first entry that compares greater than it (or at the end); a word that is already in the list is left alone. */ Xvoid XInsertCommonWord(Root, Flags) X char *Root; X unsigned int Flags; X{ X register t_WordList **WP; X t_WordList *W; X X for (WP = &CommonWords; *WP; WP = &(*WP)->Next) { X int i = STRCMP((*WP)->Word, Root); X X if (i == 0) return; X else if (i > 0) break; X } X /* insert it before this one! */ X W = (*WP); X (*WP) = (t_WordList *) emalloc(sizeof(t_WordList)); X (*WP)->Word = emalloc(strlen(Root) + 1); X (void) strcpy((*WP)->Word, Root); X (*WP)->Flags = Flags; X (*WP)->Next = W; X return; X} @@@End of lq-text/src/liblqtext/Root.c echo x - lq-text/src/liblqtext/WordInfo.c 1>&2 sed 's/^X//' >lq-text/src/liblqtext/WordInfo.c <<'@@@End of lq-text/src/liblqtext/WordInfo.c' X/* WordInfo.c -- Copyright 1989 Liam R. Quin. All Rights Reserved. X * This code is NOT in the public domain. X * See the file COPYRIGHT for full details. X */ X X/* WordInfo.c -- handle the database of words for NX-Text. X * X * NX-Text keeps a master list of all of the words that have ever been X * seen. Currently, this is in dbm format (sdbm or ndbm). X * For each word, there's an associated WID (a unique number), an offset X * into the master database (see pblock.c), and possibly thesaurus info. X * X * $Id: WordInfo.c,v 2.11 90/10/13 03:10:07 lee Rel1-10 $ X * X * $Log: WordInfo.c,v $ X * Revision 2.11 90/10/13 03:10:07 lee X * Type error -- efree() needs a char *. X * X * Revision 2.10 90/10/06 00:12:01 lee X * Prepared for first beta release. X * X * Revision 2.9 90/09/29 23:47:30 lee X * Reduced the size of a buffer, and plugged yet another memory leak! X * X * Revision 2.8 90/09/10 13:38:50 lee X * deleted declaration of sleep() X * X * Revision 2.7 90/08/29 21:46:48 lee X * Alpha release. 
X * X * Revision 2.6 90/08/12 17:33:38 lee X * malloc changes; added SlayWordInfo() and MakeWordInfo(). X * X * Revision 2.5 90/08/09 19:16:35 lee X * BSD lint and fixes... X * X * Revision 2.4 90/03/22 14:23:19 lee X * new calls to efree(); X * Offset now stored as a block number, not a byte offset X * X * Revision 2.3 90/03/21 14:59:13 lee X * Numerous changes. WID2WordInfo() no longer calles GetWordPlaces(). X * X * Revision 2.2 89/10/08 20:45:05 lee X * Working version of nx-text engine. Addfile and wordinfo work OK. X * X * Revision 2.1 89/10/02 01:13:56 lee X * New index format, with Block/WordInBlock/Flags/BytesSkipped info. X * X * Revision 1.4 89/09/17 23:01:53 lee X * Various fixes; NumberInBlock now a short... X * X * Revision 1.3 89/09/16 21:16:07 lee X * First demonstratable version. X * X * Revision 1.2 89/09/11 00:35:03 lee X * Some speedups, but WID not working properly... X * X * Revision 1.1 89/09/07 21:05:51 lee X * Initial revision X * X * X */ X X#include "globals.h" /* defines and declarations for database filenames */ X X#include <errno.h> X#include <fcntl.h> X#include <malloc.h> X#include <signal.h> X#include <stdio.h> X#include <string.h> X#include <ctype.h> X#include <sys/types.h> X#include <sys/stat.h> X#include <unistd.h> X X#include "fileinfo.h" X#include "smalldb.h" X#include "wordindex.h" X#include "wordinfo.h" X#include "numbers.h" X X#include "emalloc.h" X X#include "wordrules.h" /* max word length */ X X#include "pblock.h" X X/** declarations: **/ X/** Unix system calls that need to be declared: **/ Xextern int open(), close(); /* (these are not the stdio fopen and fclose) */ Xextern int creat(); Xextern void exit(); Xextern long lseek(); /* watch out for this on 16 bit (286, PDP11) systems! */ Xextern int read(), write(); Xextern int stat(); Xextern unsigned alarm(/* unsigned */); X X/** Unix Library Calls that need to be declared: **/ X#ifndef tolower /* e.g. 
on SunOS */ Xextern int tolower(); X#endif Xextern void perror(); X/* extern int sleep(); -- this is unsigned on some systems */ Xextern int lockf(); X/** lqtext Library calls that need to be declared: **/ Xextern void Deletepblock(); X X/** Functions within this file that need to be declared: **/ Xt_WordInfo *MakeWordInfo(); Xvoid SlayWordInfo(); X X/** **/ X Xextern int AsciiTrace; Xextern char *progname; X X/* new(type) -- allocate one object of the given type with emalloc(). X * (The cast used to be written "((type) *)", which is not a valid cast X * and could not compile wherever the macro was used.) X */ X#define new(type) ((type *) emalloc(sizeof(type))) X X/* Format when using ndbm: */ X Xtypedef struct { X t_WID WID; X unsigned long Offset; /* position in the database */ X unsigned long NumberOfWordPlaces; X char Word[1]; /* Cheat here */ X} t_WordIndexEntry; X X/* Replacement format, intended to be faster and to use much less disk space. X * Still use dbm for Word2WID, but not for the reverse mapping. X * Also, cache the most recently-used WordInfo entry... X * I have not measured how much of a win this is, but a lot of the code X * calls Word2Wid() and then WID2WordInfo(). X */ X X/* File descriptor of the WID index file; opened on first use, kept open */ Xstatic int Widfd = (-1); X X/* WID2WordInfo -- read the WIDBLOCKSIZE-byte record for WID from the WID X * index file and build a t_WordInfo (made by MakeWordInfo()) from it, X * including whatever word places are stored in the record. X * Returns 0 if the record cannot be read or has a zero word length; X * exits if the index file cannot be opened. X */ Xt_WordInfo * XWID2WordInfo(WID) X t_WID WID; X{ X extern t_WordPlace *GetWordPlaces(); /* pblock.c */ X X char Buffer[WIDBLOCKSIZE + 5]; /* the +5 allows for overrun... */ X char *q = Buffer; X t_WordInfo *WP; X X /* The above calculation is derived like this: X * X * The entry contains the total number of pairs (>= 1 byte), X * the length of the string (>= 1 byte), and the string (>= 3 bytes) X * (actually >= MinWordLength bytes, but setting this less than 3 X * would be a major disaster!) X * Hence, there are WIDBLOCKSIZE - (2 + length) bytes left for pairs. X * Now, each pair is at least 2 bytes, so halve the remainder. I add X * one coz WIDBLOCKSIZE - 3 is odd, so I would otherwise lose one! 
X */ X X if (Widfd < 0) { X if ((Widfd = open(WidIndexFile, O_RDWR|O_CREAT, 0766)) < 0) { X fprintf(stderr, "Can't open WID file \"%s\"\n", WidIndexFile); X exit(1); X } X } X X if (lseek(Widfd, (long) (WID * WIDBLOCKSIZE), 0) < 0) { X return (t_WordInfo *) 0; X } X X if (read(Widfd, Buffer, WIDBLOCKSIZE) != WIDBLOCKSIZE) { X return (t_WordInfo *) 0; X } X X { X unsigned short L; X X if ((L = sReadNumber(&q)) == 0) { X (void) fprintf(stderr, X "%s: Database corrupt, WID %lu has length zero\n", X progname, WID); X return (t_WordInfo *) 0; X } X WP = MakeWordInfo(WID, (int) L, q); X q += L; X } X X WP->Offset = (sReadNumber(&q) >> 1) * BLOCKSIZE; X WP->NumberOfWordPlaces = sReadNumber(&q); X X /* Now, maybe read some WordPlace tuplets: */ X#if 1 X if (q - Buffer < WIDBLOCKSIZE) { X WP->WordPlaces = GetWordPlaces( X WP->WID, X q, X WIDBLOCKSIZE - (q - Buffer), X WP->Offset, X WP->NumberOfWordPlaces X ); X WP->WordPlacesInHere = WP->NumberOfWordPlaces; X } else { X fprintf(stderr, "%s: Internal error, block too small for %ld (%s)\n", X progname, WP->WID, WP->Word); X exit(1); X } X X#else X WP->WordPlaces = (t_WordPlace *) 0; X if (q - Buffer < WIDBLOCKSIZE) { X WP->DataBlock = emalloc(WIDBLOCKSIZE + 5); X (void) memcpy(WP->DataBlock, Buffer, WIDBLOCKSIZE); X WP->WordPlaceStart = &(WP->DataBlock[q - Buffer]); X } X#endif X X /* done! */ X return WP; X} X Xstatic char PairBuffer[WIDBLOCKSIZE + 5]; /* the +5 allows for overrun... */ X X/* Make WordInfo Block Header... 
*/ Xvoid XMkWIBH(WordInfo, pblock) X t_WordInfo *WordInfo; X t_pblock *pblock; X{ X char *q = PairBuffer; X X#ifdef ASCIITRACE X if (AsciiTrace > 15) { X fprintf(stderr, "\tMake info block header for %s, Offset %lu==%lu\n", X WordInfo->Word, pblock->ChainStart, WordInfo->Offset); X } X#endif X X sWriteNumber(&q, WordInfo->Length); X (void) strncpy(q, WordInfo->Word, WordInfo->Length); X q += WordInfo->Length; X if (pblock) sWriteNumber(&q, (pblock->ChainStart / BLOCKSIZE) << 1); X else sWriteNumber(&q, 0L); X sWriteNumber(&q, WordInfo->NumberOfWordPlaces); X X WordInfo->WordPlaceStart = q; X WordInfo->DataBlock = PairBuffer; X} X X/* Make WordInfo Block ... */ Xint XMkWIB(WordInfo, pblock) X t_WordInfo *WordInfo; X t_pblock *pblock; X{ X extern unsigned short PutWordPlaces(); X X /* See how many pairs from the given pblock fit into WordInfo, X * and leave them in PairBuffer... X */ X X if (AsciiTrace > 3) { X fprintf(stderr, "Make info block for %s\n", WordInfo->Word); X } X X MkWIBH(WordInfo, pblock); X X if (pblock == (t_pblock *) 0) { X /* No WordPlaces to put in! */ X WordInfo->WordPlacesInHere = 0; X return 0; X } X X return WordInfo->WordPlacesInHere = PutWordPlaces( X pblock->WordPlaces, X WordInfo->WID, X WordInfo->WordPlaceStart, X WIDBLOCKSIZE - (WordInfo->WordPlaceStart - PairBuffer), X pblock->ChainStart, X pblock->NumberOfWordPlaces); X} X Xchar * XString2SixBitString(String) X unsigned char *String; X{ X static unsigned char Buffer[MaxWordLength + 1]; X register unsigned char *p; X register unsigned char *Bufp = Buffer; X unsigned short Val; X int BitsLeft = 0; X X /* BUG: we lose word-processing accents, etc. and 8-bitness if X * we do this. Also, it slows things down very, very slightly. 
X */ X X /* Some ascii character equivalents: X * '0' 48 060 0x30 X * 'A' 65 0101 0x41 X * '_' 95 0137 0x5f X * 'a' 97 0141 0x61 X */ X for (p = String; *p; p++) { X if (!isalnum(*p) && *p != '\'' && *p != '_') { X return (char *) 0; X } X if (isupper(*p)) *p = tolower(*p); X /* Store as X * 0-9 --> 0-9 (easy!) X * a-z --> 10...35 X * '/_ --> 36/37 X * hence, I need 6 bits per character. This also leaves rather X * a lot of bits spare (38..64, 27 or so spaces). As I fold case, X * and don't have controls, I don't know what to do there. I X * could store digrams. There are 38*38 = 1444 of these, but X * some of them don't happen. Not worth the effort. X * X * ('9' - '0') is 9, not 10, so each of the non-digit codes below X * needs one more than that; without it 'a' got the same code as X * '9', and every later code was one lower than the table above. X */ X if (isdigit(*p)) { X Val = (*p) - '0'; X } else if (isalpha(*p)) { X Val = (*p) - 'a' + ('9' - '0') + 1; X } else if (*p == '\'') { X Val = ('9' - '0') + ('z' - 'a') + 2; X } else if (*p == '_') { X Val = ('9' - '0') + ('z' - 'a') + 3; X } else { X#define NEXTISEIGHT (('9' - '0') + ('z' - 'a') + 4) X Val = NEXTISEIGHT; X } X /* Write the first half */ X if (!(BitsLeft & 07)) { /* i.e. it's 0 or 8 */ X *Bufp = (Val << 2); X BitsLeft = 2; X } else { X /* top BITSLEFT bits */ X *Bufp++ |= (Val >> (6 - BitsLeft)); X *Bufp = (unsigned) (Val << (2 + BitsLeft)); /* lose some bits */ X if ((BitsLeft -= 6) < 0) BitsLeft += 8; X } X } X if (BitsLeft) { X Bufp++; X } X *Bufp = 0; X return (char *) Buffer; X} X X/* SixBitString2String -- meant to undo String2SixBitString(). It is not X * finished: whatever it is given, it returns a fixed placeholder string. X */ Xunsigned char * XSixBitString2String(SixBitString) X char *SixBitString; X{ X static unsigned char Buffer[MaxWordLength + 2]; X register unsigned char *p = (unsigned char *) SixBitString; X int BitsLeft = 0; X unsigned char *Bufp = Buffer; X X while (*p) { X if (!(BitsLeft & 07)) { /* i.e. 
it's 0 or 8 */ X *Bufp++ = (*p) >> 2; X BitsLeft = 2; X } else { X /* W R O N G */ X /* bottom BITSLEFT bits */ X *Bufp = ((*p) << (6 - BitsLeft)); X /* Rest */ X *Bufp = (unsigned) ((*p) << (2 + BitsLeft)); /* lose some bits */ X if ((BitsLeft -= 6) < 0) BitsLeft += 8; X } X /* move on to the next input byte; without this the loop never ends */ X p++; X } X return (unsigned char *) "notdone'fixme"; /* NOTDONE FIXME */ X} X X#ifdef TESTSIX Xchar *progname= "testsix"; X/* Test driver: "testsix -e" encodes each line of standard input with X * String2SixBitString(), "testsix -d" passes each line to X * SixBitString2String(); the result is printed one per line. X */ Xmain(argc, argv) X int argc; X char *argv[]; X{ X char Line[4096]; X int Encode = 1; X X /* exactly one argument, -e or -d */ X if (argc != 2) { X fprintf(stderr, "bad arg count; usage: %s -[de]\n", progname); X exit(1); X } X if (STREQ(argv[1], "-e")) Encode = 1; X else if (STREQ(argv[1], "-d")) Encode = 0; X else { X fprintf(stderr, "usage: %s -[d|e]\n", progname); X exit(1); X } X /* fgets() rather than gets(), so a long line cannot overrun Line */ X while (fgets(Line, sizeof(Line), stdin) != (char *) 0) { X char *Result; X char *NewLine = strchr(Line, '\n'); X X if (NewLine) *NewLine = '\0'; X X if (Encode) { X Result = String2SixBitString((unsigned char *) Line); X } else { X if (STREQ(Line, "(cannot be encoded)")) { X Result = "(this line was not saved)"; X } else { X Result = (char *) SixBitString2String(Line); X } X } X if (Result) { X printf("%s\n", Result); X } else { X printf("(cannot be encoded)\n"); X } X } X return 0; X} X X#endif /*TESTSIX*/ X X X/* Word2WID -- look the first Length bytes of Word (at most MaxWordLength) X * up in the WordIndex dbm database and return the WID stored for it, or X * 0 if there is no entry or the entry cannot be decoded. X * Exits if the database cannot be opened. X */ Xt_WID XWord2WID(Word, Length) X char *Word; X unsigned int Length; X{ X DBM *db; X datum key, data; X char *q; X t_WID WID; X char Buffer[8]; X /* enough for the binary representation of a number -- see numbers.c; X * this is _not_ sizeof(long). It's probably 5, in fact, although X * for small numbers it's less. X */ X X if (Length > MaxWordLength) { X Length = MaxWordLength; /* NOTE: no trailing \0 required. 
*/ X } X X /* contact database server */ X if ((db = startdb(WordIndex)) == (DBM *) 0) { X fprintf(stderr, "dbmopen(%s) failed\n", WordIndex); X exit(2); X } X X key.dptr = Word; X key.dsize = Length; X X data = dbm_fetch(db, key); X X enddb(db); X X if (data.dsize == 0) { X return (t_WID) 0; X } X X /* do this because ReadNumber will leave q pointing beyond Buffer: */ X (void) memcpy(Buffer, data.dptr, data.dsize); X q = Buffer; X WID = sReadNumber(&q); X if (q - Buffer != data.dsize) { X fprintf(stderr, "Word2Wid failed... got %lu\n"); X return (t_WID) 0; X } X return WID; X} X Xchar * XWID2Word(WID) X t_WID WID; X{ X t_WordInfo *W; X char *Word; X X if (WID == (t_WID) 0) { X return (char *) 0; X } X X if ((W = WID2WordInfo(WID)) == (t_WordInfo *) 0) { X return (char *) 0; X } X Word = W->Word; X W->Word = (char *) 0; X SlayWordInfo(W); X return Word; X} X Xint XPutWordInfoIntoIndex(WordInfo, Offset) X t_WordInfo *WordInfo; X unsigned long Offset; X{ X DBM *db; X char NumBuf[sizeof(t_WID) + 1]; X char *q = NumBuf; X datum key, data; X int RetVal; X X /** First, write the WID itself, so we can go from Word to WID */ X X key.dptr = WordInfo->Word; X key.dsize = WordInfo->Length; X X sWriteNumber(&q, WordInfo->WID); X X data.dptr = NumBuf; X data.dsize = q - NumBuf; X X /* contact database server */ X if ((db = startdb(WordIndex)) == (DBM *) 0) { X fprintf(stderr, "dbmopen(%s) failed\n", WordIndex); X exit(2); X } X X RetVal = dbm_store(db, key, data, DBM_REPLACE); X X enddb(db); X X /** Now, ensure that we have a physical block for WordInfo. If X ** we don't, there is something very wrong in pblock.c, our only X ** possible caller. X **/ X X if (WordInfo->DataBlock == (char *) 0) { X if (Offset) { X fprintf(stderr, "WARNING: WordInfo corrupt for \"%s\"\n", X WordInfo->Word); X } X (void) MkWIB(WordInfo, (t_pblock *) 0); X } X X /** Now write the physical entry... 
*/ X/* open the WID index file if this is the first use */ X if (Widfd < 0) { X if ((Widfd = open(WidIndexFile, O_RDWR|O_CREAT, 0766)) < 0) { X fprintf(stderr, "Can't open WID file \"%s\"\n", WidIndexFile); X exit(1); X } X } X/* records are fixed-size, so the WID is the record number */ X if (lseek(Widfd, (long) (WordInfo->WID * WIDBLOCKSIZE), 0) < 0) { X perror("lseek"); X exit(1); X } X X if (write(Widfd, WordInfo->DataBlock, WIDBLOCKSIZE) != WIDBLOCKSIZE) { X perror("write"); X exit(1); X } X X return RetVal; X} X/* WID_sig -- do-nothing handler; in this file it is only installed, for SIGALRM, by the disabled (#if 0) file-locking code. */ Xint XWID_sig() X{ X return 0; X} X/* GetMaxWID -- return the number held, as text, in WidFile; 0 if the file does not exist or cannot be opened. */ Xt_WID XGetMaxWID() X{ X extern int errno; X extern long atol(); X X int fd; X char Buffer[20]; /* large enough to sprintf() a WID */ X struct stat StatBuf; X#if 0 X int FileKey; /* what one gets from a lock... */ X int NumberOfTriesLeft = 5; X#endif X X /* ensure that the file is there */ X if (stat(WidFile, &StatBuf) == -1) { X return 0; X } X X if ((fd = open(WidFile, O_RDWR, 0)) < 0) { X fprintf(stderr, "Warning: Can't open WID file"); X return 0; X } X X#if 0 X errno = 0; X X /** Lock the file **/ X X do { X /* Set a timeout of 3 seconds (the value given to alarm() below) */ X signal(SIGALRM, WID_sig); X (void) alarm(3); X if ((FileKey = lockf(fd, F_LOCK, 0L)) < 0) { X switch (errno) { X case EACCES: /*[sic]*/ /* another process has the lock */ X fprintf(stderr, "Please wait...\n"); X /* shouldn't happen */ X break; X case EDEADLK: X fprintf(stderr, "Warning: can't lock \"%s\" -- EDEADLK\n", WidFile); X FileKey = 1; X break; X case EINTR: X fprintf(stderr, "Please Wait... 
someone has the key...\n"); X sleep(1); X break; X } X } X if (--NumberOfTriesLeft <= 0) { X fprintf(stderr, "Warning: can't lock "); X perror(WidFile); X (void) close(fd); X return 0; X } X } while (FileKey < 0); X (void) alarm(0); X X if (stat(WidFile, &StatBuf) == -1) { X fprintf(stderr, "It went away!\n"); X return 0; X } X#endif X X /* Read the file. Never ask for more than Buffer can hold (leaving X * room for the \0), and clear it first so that a short read still X * leaves a terminated string for atol(). X */ X if (StatBuf.st_size < 0 || StatBuf.st_size >= (long) sizeof(Buffer)) { X StatBuf.st_size = (long) sizeof(Buffer) - 1; X } X (void) memset(Buffer, '\0', sizeof(Buffer)); X if (read(fd, Buffer, (unsigned int) StatBuf.st_size) < 0) { X fprintf(stderr, "Can't read from \"%s\"\n", WidFile); X exit(1); X } X X#if 0 X /** Unlock the file **/ X if (lockf(fd, F_ULOCK, 0L) < 0 && FileKey == 0) { X fprintf(stderr, "Warning: might not have unlocked \"%s\"\n", X WidFile); X } X#endif X (void) close(fd); X X Buffer[StatBuf.st_size] = '\0'; X X return atol(Buffer); X} X X X/* The most recent WID handed out by this process (0: none yet) */ Xt_WID LastNextWIDVal = (t_WID) 0; X X#undef GetNextWID X Xt_WID GetNextWID(), GetMaxWID(); X X/* SpoofGetNextWID -- hand out the next unused WID from the in-memory X * counter LastNextWIDVal, which is loaded from the WID file on first X * use. The file itself is only brought up to date once every 500 or so X * calls. X */ XINLINE Xt_WID XSpoofGetNextWID() X{ X static int SinceLastUpdate = 0; X X /* Call the real function sometimes, so that the database does X * get updated in case of a crash or for other users. X * GetNextWID(1) only records the current maximum; what it returns is X * a WID that is already in use (it used to be returned from here, X * which gave two words the same WID). So use its result only to X * catch up with the file, and number the new word as usual below. X */ X if (++SinceLastUpdate > 500) { X t_WID OnDisk; X X SinceLastUpdate = 0; X OnDisk = GetNextWID(1); X if (OnDisk > LastNextWIDVal) LastNextWIDVal = OnDisk; X } X X if (LastNextWIDVal == (t_WID) 0) { X SinceLastUpdate = 0; X LastNextWIDVal = GetMaxWID(); X } X return ++LastNextWIDVal; X} X X/* WriteCurrentMaxWID -- bring the WID file up to date without using up X * a WID. X */ Xvoid XWriteCurrentMaxWID() X{ X (void) GetNextWID(1); X} X X/* GetNextWID -- read the number held in WidFile (creating the file if X * need be), add one to it and to LastNextWIDVal unless WriteCurrent is X * set, write the larger of the two back to the file, and return it. X * Exits if the file cannot be created, opened or read. X */ Xt_WID XGetNextWID(WriteCurrent) X int WriteCurrent; /* simply write the current MaxWID if true */ X{ X extern int errno; X extern long atol(); X X int fd; X char Buffer[20]; X struct stat StatBuf; X#if 0 X int FileKey; /* what one gets from a lock... */ X int NumberOfTriesLeft = 5; X#endif X t_WID Result; X X /** Alter the file, so other programs can see the new words... 
X **/ X X /* ensure that the file is there */ X if (stat(WidFile, &StatBuf) == -1) { X fprintf(stderr, "Creating WID file \"%s\"\n", WidFile); X if ((fd = creat(WidFile, 02666)) < 0) { X fprintf(stderr, "Can't create WID file \"%s\"\n", WidFile); X exit(1); X } X (void) close(fd); X /* start again, now that stat() will find the file */ X return GetNextWID(WriteCurrent); X X /*NOTREACHED*/ X } X X if ((fd = open(WidFile, O_RDWR, 0)) < 0) { X fprintf(stderr, "Can't open WID file"); X perror(WidFile); X exit(1); X } X X#if 0 X errno = 0; X X /** Lock the file **/ X X do { X /* Set a timeout of 3 seconds (the value given to alarm() below) */ X signal(SIGALRM, WID_sig); X (void) alarm(3); X if ((FileKey = lockf(fd, F_LOCK, 0L)) < 0) { X switch (errno) { X case EACCES: /*[sic]*/ /* another process has the lock */ X fprintf(stderr, "Please wait...\n"); X /* shouldn't happen */ X break; X case EDEADLK: X fprintf(stderr, "Warning: can't lock \"%s\" -- EDEADLK\n", WidFile); X FileKey = 1; X break; X case EINTR: X fprintf(stderr, "Please Wait... someone has the key...\n"); X sleep(1); X break; X } X } X if (--NumberOfTriesLeft <= 0) { X fprintf(stderr, "Warning: can't lock the file \"%s\"\n", WidFile); X } X } while (FileKey < 0); X (void) alarm(0); X X if (stat(WidFile, &StatBuf) == -1) { X fprintf(stderr, "It went away!\n"); X exit(1); X } X#endif X X /* Read the file. Never ask for more than Buffer can hold (leaving X * room for the \0), and clear it first so that a short read still X * leaves a terminated string for atol(). X */ X if (StatBuf.st_size < 0 || StatBuf.st_size >= (long) sizeof(Buffer)) { X StatBuf.st_size = (long) sizeof(Buffer) - 1; X } X (void) memset(Buffer, '\0', sizeof(Buffer)); X X if (read(fd, Buffer, (unsigned int) StatBuf.st_size) < 0) { X fprintf(stderr, "Can't read from \"%s\"\n", WidFile); X exit(1); X } X X Buffer[StatBuf.st_size] = '\0'; X X Result = atol(Buffer); X X /* if WriteCurrent is set, we should not increment anything */ X if (!WriteCurrent) { X ++LastNextWIDVal; X ++Result; X } X X /* never go backwards: the in-memory counter may be ahead of the file */ X if (Result < LastNextWIDVal) { X Result = LastNextWIDVal; X } X X (void) sprintf(Buffer, "%lu\n", Result); X X /* Move to the start of the file and write the new value. X * No need to truncate the file, because it didn't shrink! 
X */ X /* lseek() takes (fd, long offset, int whence); the last two used to X * be written the wrong way round, as (0, 0L). With no prototype in X * scope that passes the wrong sizes wherever int and long differ. X */ X (void) lseek(fd, 0L, 0); X (void) write(fd, Buffer, (unsigned int) strlen(Buffer)); X X#if 0 X /** Unlock the file **/ X X if (lockf(fd, F_ULOCK, 0L) < 0 && FileKey == 0) { X fprintf(stderr, "Warning: might not have unlocked \"%s\"\n", X WidFile); X } X#endif X (void) close(fd); X X return Result; X} X X/* DeleteWord -- forget every occurrence of Word: delete its pblock, if X * it has one, and rewrite its index entry with no offset and no word X * places. The word keeps its WID. X * Returns -1 if the word is not in the index or its entry cannot be X * read, and 0 otherwise. X */ Xint XDeleteWord(Word) X char *Word; X{ X extern t_pblock *Getpblock(); X X t_WID WID; X t_WordInfo *WordInfo; X t_pblock *tmp; X X if ((WID = Word2WID(Word, strlen(Word))) == (t_WID) 0) { X return -1; /* not there */ X } X X /* get info from the list */ X if ((WordInfo = WID2WordInfo(WID)) == (t_WordInfo *) 0) { X return -1; X } X X if ((tmp = Getpblock(WordInfo)) != (t_pblock *) NULL) { X Deletepblock(tmp); X (void) efree((char *) tmp); /* efree() needs a char * */ X } X X /* delete the offset from the database, but retain the WID: */ X WordInfo->Offset = 0L; X WordInfo->NumberOfWordPlaces = 0L; X WordInfo->WordPlacesInHere = 0; X PutWordInfoIntoIndex(WordInfo, 0L); X SlayWordInfo(WordInfo); X X return 0; X} X X/* Routines to create and destroy WordInfo structures */ X X/* MakeWordInfo -- allocate a t_WordInfo for WID holding a private, X * \0-terminated copy of the first Length bytes of Word; every other X * field is set to zero or null. X */ XINLINE t_WordInfo * XMakeWordInfo(WID, Length, Word) X t_WID WID; X int Length; X char *Word; /* the word, which might not be nul-terminated */ X{ X register t_WordInfo *WP; X WP = (t_WordInfo *) emalloc(sizeof(t_WordInfo)); X X WP->WID = WID; X WP->FID = (t_FID) 0; X WP->Next = (t_WordInfo *) 0; X WP->NumberOfWordPlaces = 0; X WP->DataBlock = (char *) 0; X WP->WordPlaceStart = (char *) 0; X WP->WordPlaces = (t_WordPlace *) 0; X WP->WordPlacesInHere = 0; X WP->WordPlace.FID = 0; /* mark as invalid */ X WP->WordPlace.Flags = 0; /* this gets used anyway, so set it to zero! 
*/ X/* keep a private copy of the word, with room for the \0 */ X WP->Word = emalloc(Length + 1); X X (void) strncpy(WP->Word, Word, Length); X WP->Length = Length; X WP->Word[Length] = '\0'; /* strncpy does not add a null */ X WP->Offset = 0; X X return WP; X} X/* SlayWordInfo -- free WP, along with its Word and its WordPlaces when those are not null. DataBlock and WordPlaceStart are not freed here. A null WP is ignored. */ Xvoid XSlayWordInfo(WP) X t_WordInfo *WP; X{ X if (!WP) return; X if (WP->Word) efree(WP->Word); X if (WP->WordPlaces) efree((char *)WP-> WordPlaces); X X WP->Next = (t_WordInfo *) 0; X /* The above line is to force a run-time error in the common X * (but wrong) case X * for (w = WordList; w; w = w->Next) SlayWordInfo(w); X */ X efree((char *) WP); X} @@@End of lq-text/src/liblqtext/WordInfo.c echo end of part 04 -- Liam R. E. Quin, lee@sq.com, SoftQuad Inc., Toronto, +1 (416) 963-8337