home *** CD-ROM | disk | FTP | other *** search
- (* INVERT.PAS version 0.10 3/23/86 *)
- (* Adds support for file size up to 4 megabytes *)
-
-
- (*******************************************************************)
- (* *)
- (* INVERT *)
- (* *)
- (* Copyright 1986 by Mark J. Welch *)
- (* P.O. Box 2409, SF, CA 94126 *)
- (* *)
- (* Portions copyright 1986 by JimKeo *)
- (* *)
- (* Last revised 3/22/86 by MJW *)
- (* *)
- (* 1) Read text files and "invert" them into a sorted list of *)
- (* words and their locations within the file. *)
- (* 2) Store inversion to disk file *)
- (* 3) Later, modify program to allow unlimited expandability *)
- (* of inversion file and easy search/access to keywords *)
- (* *)
- (* Words are stored in a binary tree, and location pointers *)
- (* are stored in a linked list rooted at each tree entry. *)
- (* *)
- (*******************************************************************)
-
- (**********************************************************************)
- (* *)
- (* WARNING *)
- (* *)
- (* WARNING: To save on execution time, the program does not now check *)
- (* available heap space before calling New(). As a result, it will *)
- (* crash if it runs out of memory, usually after about an hour of *)
- (* execution; if it crashes, all work is lost. It also crashes if too *)
- (* many files are inverted at one time. This error-handling should be *)
- (* improved in any "release" version. -MJW *)
- (* *)
- (**********************************************************************)
-
- Program Invert;
- (* Invert a text file for quick searching, etc. *)
-
- CONST
-
- FileNameLength = 20; (* 14 is probably sufficient, but... *)
- WordLength = 14; (* truncate words at 14 characters *)
- MaxFiles = 256; (* maximum number of files to invert *)
- (* -- needed for FileList array, but *)
- (* could be replaced with a linked *)
- (* list later. *)
- TYPE
-
- S = String[WordLength];
- FileName = String[FileNameLength];
- FileNum = Integer;
- FileLists = Array[1..MaxFiles] of FileName;
-
- LocationPtr = ^LocationType;
- LocationType = RECORD
- F: FileNum; (* index into FileList *)
- FilBlockPtr: Integer;(* 128-byte block offset *)
- FilPosPtr: Byte; (* byte offset in file *)
- Next: LocationPtr; (* next location in linked list *)
- END;
-
- WordPtr = ^WordType;
- WordType = RECORD
- Location: LocationPtr; (* pointer into file *)
- Parent: WordPtr; (* in b-tree *)
- SmallerChild: WordPtr; (* less-than *)
- GreaterChild: WordPtr; (* greater-than *)
- Text: S; (* the word *)
- END;
-
- VAR
-
- FileList: FileLists; (* list of files that are being inverted *)
- NumFiles: Integer; (* number of files inverted so far *)
- WordRoot: WordPtr; (* root of the b-tree *)
- i: integer; (* index for FOR loop *)
-
-
-
-
-
-
- (****************************************************************)
- (* *)
- (* PrintResults *)
- (* *)
- (* Print the entire tree in alphabetical order to a file *)
- (* *)
- (****************************************************************)
-
-
- Procedure PrintResults;
- var outfil: text;
-
- i: integer;
-
- (***********************************************)
- (* *)
- (* PrintTree *)
- (* *)
- (* RECURSIVE: print entire branch of tree by *)
- (* first printing entire Smaller branch, then *)
- (* current node text, and then entire Larger *)
- (* *)
- (***********************************************)
- Procedure PrintTree(X: WordPtr);
- Var L: LocationPtr;
- i: integer;
- r: real;
- Begin
- If X^.SmallerChild <> Nil (* leftmost children first *)
- then PrintTree(X^.SmallerChild);
- L := X^.Location;
- If L <> Nil (* Then current node *)
- then
- begin
- for i := 2 to ord(x^.text[0]) do
- if ((x^.text[i] >= 'A') and (x^.text[i] <= 'Z'))
- then x^.text[i] := chr(ord(x^.text[i])+32);
- Write(outfil,X^.Text,': ');
- Repeat
- r := (L^.FilBlockPtr * 128.0) +L^.FilPosPtr;
- Write(outfil,L^.F:1,'-',r:1:0,' ');
- L := L^.Next;
- Until L = Nil;
- Writeln(outfil);
- end;
- If X^.GreaterChild <> Nil (* and then right children *)
- then PrintTree(X^.GreaterChild);
- End; (*PrintTree *)
-
- begin (* PrintResults *)
- Assign(outfil,'INVERT.INX');
- Rewrite(outfil);
- for i := 1 to NumFiles do
- Writeln(outfil, i:1,' ',FileList[i]);
- writeln(outfil);
- PrintTree(WordRoot);
- writeln(outfil);
- close(outfil);
- end; (* printResults *)
-
-
-
- (*******************************************************************)
- (* *)
- (* Title *)
- (* *)
- (*******************************************************************)
-
- Procedure Title;
- Begin
- Writeln;
- Writeln(' Invert -- Version 0.10 -- 3/23/86');
- Writeln;
- Writeln('Copyright 1986 by Mark J. Welch, Box 2409, SF CA 94126');
- Writeln(' (Portions Copyright 1986 by Jim Keohane)');
- Writeln;
- Writeln;
- End;
-
-
- (*******************************************************************)
- (* *)
- (* MemoryAvail *)
- (* *)
- (*******************************************************************)
- Function MemoryAvail: Real;
- Var M: Real;
- begin
- M := Memavail;
- If M < 0 then M := 65536.0 + M;
- MemoryAvail := M * 16;
- end;
-
-
- (*******************************************************************)
- (* *)
- (* CreateRoot *)
- (* *)
- (* The binary tree has to start somewhere: start it here. *)
- (* *)
- (*******************************************************************)
- Procedure CreateRoot;
- Begin
- New(WordRoot);
- WordRoot^.GreaterChild := Nil;
- WordRoot^.SmallerChild := Nil;
- WordRoot^.Text := 'Mzzzzzzzzz'; (* let's split the alphabet here *)
- (* to improve initial tree balance *)
- WordRoot^.Location := Nil;
- WordRoot^.Parent := Nil; (* no parent for root *)
- End; (* CreateRoot *)
-
-
-
- (****************************************************************)
- (* *)
- (* InvertFil *)
- (* *)
- (* Given a file, add all its words and their locations *)
- (* to the inversion tree. *)
- (* *)
- (****************************************************************)
-
- Procedure InvertFile(FN: FileName);
- var block:array[0..127] of char;
- j,k:integer;
- Fil: file;
- St: S;
- StPtr,CurrentLoc: LocationPtr;
- c: char;
- i: integer;
- matchkey: boolean;
-
-
- (****************************************************************)
- (* *)
- (* AddWord *)
- (* *)
- (* Given a word and a prepared location link/pointer, *)
- (* add it into the inversion structure, either as a new *)
- (* word or onto an existing linked-list. *)
- (* *)
- (****************************************************************)
- Procedure AddWord(St: S; StLoc: LocationPtr);
- Var CurrentWord: WordPtr;
- Match: Boolean;
-
-
- (**********************************************)
- (* *)
- (* NewChild *)
- (* *)
- (* Add St as a new word in tree *)
- (* *)
- (**********************************************)
- Procedure NewChild(var X: WordPtr; var St: s);
- (* match is imported as a "global" variable *)
- begin
- Match := true;
- GetMem(X,sizeof(X^)-WordLength+Length(St));
- X^.text := St;
- X^.Parent := CurrentWord;
- X^.SmallerChild := Nil;
- X^.GreaterChild := Nil;
- X^.Location := StLoc;
- end; (* NewChild *)
-
-
- Begin (* AddWord *)
- (* First search if it exists *)
-
- CurrentWord := WordRoot; (* start at root of tree *)
- Match := False; (* haven't found the right place yet *)
- Repeat
- If (CurrentWord^.Text[1] = St[1])
- then
- if (CurrentWord^.Text = St)
- then begin
- match := true; (* global *)
- CurrentLoc := CurrentWord^.Location;
- While CurrentLoc^.Next <> Nil Do
- CurrentLoc := CurrentLoc^.Next;
- CurrentLoc^.Next := StLoc;
- end
- else if (CurrentWord^.Text > St)
- then if CurrentWord^.SmallerChild <> Nil
- then CurrentWord := CurrentWord^.SmallerChild
- else NewChild(CurrentWord^.SmallerChild,st)
- else if CurrentWord^.GreaterChild <> Nil
- then CurrentWord := CurrentWord^.GreaterChild
- else NewChild(CurrentWord^.GreaterChild,st)
- else
- if (CurrentWord^.Text[1] > St[1])
- then if CurrentWord^.SmallerChild <> Nil
- then CurrentWord := CurrentWord^.SmallerChild
- else NewChild(CurrentWord^.SmallerChild,st)
- else if CurrentWord^.GreaterChild <> Nil
- then CurrentWord := CurrentWord^.GreaterChild
- else NewChild(CurrentWord^.GreaterChild,st);
- Until Match;
- End; (* AddWord *)
-
-
-
-
-
- Begin (* InvertFil *)
- (* JimKeo-modified code included in this procedure *)
-
- Writeln('Invert: ',FN); (* let user know what file we're fiddling with *)
- NumFiles := NumFiles + 1;
- If NumFiles > MaxFiles (* remove this when FileList is a linked list *)
- then
- begin (* crash impolitely, trashing all work done so far *)
- writeln('Too many files inverted: maximum is ',MaxFiles);
- Halt;
- end
- else FileList[NumFiles] := fn;
- Assign(fil,fn);
- Reset(fil);
- St := '';
- k := 0; {block no}
- blockread(fil,block,1,j); {read first block}
- while (j=1) do
- begin
- for i := 0 to 127 do {128 bytes per block}
- begin
- c:=block[i];
- If ((c >= 'A') and (c < 'z'))
- and ((c >= 'a') or (c <= 'Z'))
- then st := st + UpCase(c)
- else
- if (Ord(St[0]) > 0)
- then
- begin
- matchKey := false;
- case st[1] of
- 'A': matchKey := (st = 'A') or (st = 'AND') or (st = 'AN');
- 'I': matchKey := (st = 'IN') or (st = 'IS') or
- (st = 'IT') or (st = 'ITS');
- 'N': matchKey := (st = 'NOT');
- 'O': matchKey := (st = 'OR') or (st = 'ON') or (st = 'OF');
- 'T': matchKey := (st = 'THE') or (st = 'TO');
- 'Y': matchKey := (st = 'YOU');
- end; (* case *)
- if matchKey
- then St := ''
- else
- begin
- New(StPtr);
- StPtr^.F := NumFiles;
- StPtr^.FilBlockPtr := k;
- StPtr^.FilPosPtr :=i-ord(st[0]); {FilePos out}
- StPtr^.Next := Nil;
- AddWord(St,StPtr);
- St := '';
- end;
- end;
- end;
- k:=k+1; {add in 128 bytes per block}
- blockread(fil,block,1,j) {read next and j=1 if more}
- end;
- Writeln(' Done Inverting ',FN);
- End; (* InvertFile *)
-
-
-
- (************************************************************************)
- (* *)
- (* Main Program Body *)
- (* *)
- (************************************************************************)
- Begin (* Main Program *)
- Title;
- NumFiles := 0;
- CreateRoot;
- Writeln('Bytes of Available Memory at start: ',MemoryAvail:6:0);
- For i := 1 to ParamCount do
- begin
- InvertFile(ParamStr(i));
- writeln(' Bytes of Available Memory: ',MemoryAvail:6:0);
- end;
- Writeln('Done inverting, now storing results....');
- PrintResults;
- End.
-
-
- (**********************************************************************)
- (* p.p.s. To reduce mem reqs, I would move "Text:S" to end of record *)
- (* description. Then, instead of New(recptr) I would use *)
- (* GetMem(recptr,sizeof(recptr^)-WordLength+Length(St)); That way, if *)
- (* St='HEAD', you save 10 bytes of memory! -JimKeo[hane] *)
- (**********************************************************************)
-
-