- {$A+,B-,D-,E-,F+,I+,L-,N-,O-,R-,S-,V-}
- Unit Token;
- Interface
- Uses Objects;
-
- Const
- TokenStringSize = 35; { Maximum size of a string to be tokenized }
- TokenEntryListSize = 20; { Number of tokens per hash entry block }
- HashTableSize = 211; { Size of hash table }
-
- Type
- TokenStringPtr = ^TokenString;
- TokenString = String[TokenStringSize];
-
- TokenTextTablePtr = ^TokenTextTableType;
- TokenTextTableType = Record
- TokenTextEntry : Array[0..255] Of TokenStringPtr; { Index by Lo(Token) }
- End;
-
- HashEntryPtr = ^HashEntryType;
- HashEntryType = Record
- { A block of tokens whose strings all hash to the same value }
- EntChain : HashEntryPtr; { Blocks chained onto TokTable }
- EntTokenCount : Word; { Number of tokens in this block }
- EntToken : Array[1..TokenEntryListSize] Of Word;
- End;
-
- PToken = ^TToken;
- TToken = Object(TObject)
- TokMaxToken : Word; { Maximum current token }
-
- Constructor Init;
- { Initialize hash table }
-
- Constructor RestoreHashTable(FileName : String);
- { Restore hash table from named file }
-
- Function TokenText(Token : Word) : TokenString;
- { Return text given token, or null string if token not in table. }
-
- Function TokenInsertText(St : TokenString) : Word;
- { Enter string in hash table if not a duplicate; return its token, or 0 if the table is full }
-
- Procedure TokenUpdateText(Token : Word; St : TokenString);
- { Update the text associated with a token }
-
- Function TextToken(St : TokenString) : Word;
- { Locate text in hash table; return token, or 0 if not found }
-
- Function TokenAddress(Token : Word) : TokenStringPtr;
- { Return address of string represented by Token (no checking) }
-
- Procedure SaveHashTable(FileName : String);
- { Save hash table to named file }
-
- Destructor Done; Virtual;
- { Releases all storage associated with hash table }
-
- Procedure EditMatch(Count : Byte; Var MatchTable;
- St : TokenString; TotalMatch : Boolean);
- { Return the tokens of the strings that most nearly match St, as
- determined by EditDistance. Count specifies the maximum number of
- tokens to be returned. MatchTable is an array of at least Count
- words. Tokens are returned in order of smallest to largest
- EditDistance; unused slots are set to 0. If TotalMatch is TRUE, all
- words are examined; otherwise only words whose first letter matches
- one of the first two letters of St are examined (saves time). }
-
- Function HashListLength(Bucket : Word) : Word;
- { Return the number of entries in the indicated hash bucket entry chain. This
- function is for performance analysis purposes only }
-
- Private
-
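- { Token text is kept in a two-level table: TokTextTable is indexed by
- Hi(Token) and each 256-entry page, allocated on demand, is indexed by
- Lo(Token). TokHashTable maps each hash bucket onto a chain of
- fixed-size blocks of tokens }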
- TokTextTable : Array[0..255] Of TokenTextTablePtr; { Index by Hi(Token) }
- TokHashTable : Array[0..HashTableSize-1] Of HashEntryPtr;
-
- Procedure InsertHashEntry(Token, Hash : Word);
- { Insert Token in hash table chain }
-
- Procedure InsertTextEntry(Token : Word; Var St : TokenString);
- { Insert word string in word table }
-
- Function LocateString(Var St : TokenString; Hash : Word) : Word;
- { Locate text in hash table; return token, or 0 if not found }
- End;
-
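- (* Illustrative use of TToken (a sketch only, not part of the unit; the
- file name TOKENS.SAV is hypothetical):
-
- Var
- Tok : PToken;
- t : Word;
- Begin
- Tok := New(PToken, Init);
- t := Tok^.TokenInsertText('HELLO'); { assign a token to the string }
- WriteLn(Tok^.TokenText(t)); { writes HELLO }
- WriteLn(Tok^.TextToken('HELLO') = t); { writes TRUE }
- Tok^.SaveHashTable('TOKENS.SAV'); { keep the table for a later run }
- Dispose(Tok, Done); { release all storage }
- End;
- *)
-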
- Implementation
- Uses EditDist, PairHeap;
- Const
- TextBufSize = 16384; { Size of text buffer for Save/Restore }
- SaveMagicNumber = $EF120550; { Magic number for save/restore }
-
- Type
- MatchRecordPtr = ^MatchRecord;
- MatchRecord = Object(HeapEntry)
- { Used by EditMatch to sort tokens }
- Token : Word;
- Distance : Word;
- End;
-
- HeapControl = Object(TopSoMany)
- Function Less(Var x, y : HeapEntry) : Boolean; Virtual;
- End;
-
- Function HashPJW(Var s : TokenString) : Word;
- { Hash string to a number between 0 and HashTableSize-1 }
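- { The inline routine below computes a P.J. Weinberger-style hash in
- DX:BX: for each character the 32-bit value is shifted left four bits
- and the character is added; whenever the top four bits become nonzero
- they are XORed back into the low bits and cleared }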
- Function HashPJWPrim(Var s : TokenString) : LongInt;
- Inline(
- $5E/ { pop si}
- $58/ { pop ax}
- $1E/ { push ds}
- $8E/$D8/ { mov ds,ax}
- $31/$DB/ { xor bx,bx}
- $31/$D2/ { xor dx,dx}
- $AC/ { lodsb}
- $30/$E4/ { xor ah,ah}
- $89/$C1/ { mov cx,ax}
- $E3/$2C/ { jcxz done}
- {next:}
- $D1/$E3/ { shl bx,1}
- $D1/$D2/ { rcl dx,1}
- $D1/$E3/ { shl bx,1}
- $D1/$D2/ { rcl dx,1}
- $D1/$E3/ { shl bx,1}
- $D1/$D2/ { rcl dx,1}
- $D1/$E3/ { shl bx,1}
- $D1/$D2/ { rcl dx,1}
- $AC/ { lodsb}
- $01/$C3/ { add bx,ax}
- $83/$D2/$00/ { adc dx,0}
- $F6/$C6/$F0/ { test dh,$F0}
- $74/$0F/ { jz skip}
- $88/$F0/ { mov al,dh}
- $80/$E6/$0F/ { and dh,$0F}
- $D0/$E8/ { shr al,1}
- $D0/$E8/ { shr al,1}
- $D0/$E8/ { shr al,1}
- $D0/$E8/ { shr al,1}
- $30/$C3/ { xor bl,al}
- {skip:}
- $E2/$D4/ { loop next}
- {done:}
- $1F/ { pop ds}
- $89/$D8); { mov ax,bx}
- Begin
- HashPJW := HashPJWPrim(s) Mod HashTableSize;
- End;
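-
- (* For reference, a rough Pascal rendering of the hash computed by the
- inline code above (a sketch only; the unit uses the inline version):
-
- Function HashPJWPascal(Var s : TokenString) : LongInt;
- Var
- h, g : LongInt;
- i : Integer;
- Begin
- h := 0;
- For i := 1 To Length(s) Do Begin
- h := (h Shl 4) + Ord(s[i]); { shift in the next character }
- g := (h Shr 28) And $F; { bits that spilled into the top nibble }
- If g <> 0 Then
- h := (h Xor g) And $0FFFFFFF; { fold them back in and clear them }
- End;
- HashPJWPascal := h;
- End;
- *)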
-
- Constructor TToken.Init;
- { Initialize control pointers }
- Begin
- If Not TObject.Init Then Fail;
- TokMaxToken := 0;
- FillChar(TokTextTable, SizeOf(TokTextTable), 0);
- FillChar(TokHashTable, SizeOf(TokHashTable), 0);
- End;
-
- Procedure TToken.InsertHashEntry(Token, Hash : Word);
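- { Insert Token in hash table chain; a new entry block is pushed onto
- the front of the chain when the head block is full }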
- Var
- Entry : HashEntryPtr;
- Begin
- Entry := TokHashTable[Hash];
- If (Entry = Nil) Or (Entry^.EntTokenCount >= TokenEntryListSize) Then
- Begin
- New(Entry);
- If Entry <> Nil Then With Entry^ Do Begin
- EntChain := TokHashTable[Hash];
- EntTokenCount := 0;
- TokHashTable[Hash] := Entry;
- End;
- End;
- If Entry <> Nil Then With Entry^ Do Begin
- Inc(EntTokenCount);
- EntToken[EntTokenCount] := Token;
- End;
- End;
-
- Procedure TToken.InsertTextEntry(Token : Word; Var St : TokenString);
- { Insert word string in word table }
- Var
- j : Word;
- Begin
- j := Hi(Token);
- If TokTextTable[j] = Nil Then Begin
- New(TokTextTable[j]);
- If TokTextTable[j] <> Nil Then With TokTextTable[j]^ Do Begin
- FillChar(TokenTextEntry, SizeOf(TokenTextEntry), 0);
- End;
- End;
- If TokTextTable[j] <> Nil Then With TokTextTable[j]^ Do Begin
- j := Lo(Token);
- If TokenTextEntry[j] <> Nil Then
- FreeMem(TokenTextEntry[j], Succ(Length(TokenTextEntry[j]^)));
- GetMem(TokenTextEntry[j], Succ(Length(St)));
- If TokenTextEntry[j] <> Nil Then TokenTextEntry[j]^ := St;
- End;
- End;
-
- Function TToken.LocateString(Var St : TokenString; Hash : Word) : Word;
- { Locate text in hash table; return token, or 0 if not found }
- Var
- Entry, Trail : HashEntryPtr;
- i, Token : Word;
- Found : Boolean;
- Begin
- LocateString := 0;
- Entry := TokHashTable[Hash];
- Trail := Nil;
- Found := False;
- While Entry <> Nil Do With Entry^ Do Begin
- i := 1;
- Repeat
- Token := EntToken[i];
- Inc(i);
- Found := TokTextTable[Hi(Token)]^.TokenTextEntry[Lo(Token)]^ = St;
- Until Found Or (i > EntTokenCount);
- If Found Then Begin
- LocateString := Token;
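- { Self-organizing heuristic: move the found token one position toward
- the front of its block, or into the previous block of the chain, so
- frequently looked-up tokens are found sooner next time }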
- Dec(i, 2);
- If i > 0 Then Begin
- EntToken[Succ(i)] := EntToken[i];
- EntToken[i] := Token;
- End Else If Trail <> Nil Then Begin
- i := Trail^.EntTokenCount;
- EntToken[1] := Trail^.EntToken[i];
- Trail^.EntToken[i] := Token;
- End;
- Entry := Nil;
- End Else Begin
- Trail := Entry;
- Entry := EntChain;
- End;
- End;
- End;
-
- Function TToken.TokenText(Token : Word) : TokenString;
- Begin
- TokenText := '';
- If TokTextTable[Hi(Token)] <> Nil Then With TokTextTable[Hi(Token)]^ Do
- If TokenTextEntry[Lo(Token)] <> Nil Then
- TokenText := TokenTextEntry[Lo(Token)]^
- End;
-
- Function TToken.TokenInsertText(St : TokenString) : Word;
- Var
- h, j : Word;
- Begin
- h := HashPJW(St);
- j := LocateString(St, h);
- If j = 0 Then Begin
- If TokMaxToken < $FFFF Then Begin
- Inc(TokMaxToken);
- j := TokMaxToken;
- InsertTextEntry(j, St);
- InsertHashEntry(j, h);
- End;
- End;
- TokenInsertText := j;
- End;
-
- Procedure TToken.TokenUpdateText(Token : Word; St : TokenString);
- Var
- h : Word;
- Begin
- InsertTextEntry(Token, St);
- h := HashPJW(St);
- If LocateString(St, h) = 0 Then
- InsertHashEntry(Token, h);
- If TokMaxToken < Token Then TokMaxToken := Token;
- End;
-
- Function TToken.TextToken(St : TokenString) : Word;
- Begin
- TextToken := LocateString(St, HashPJW(St));
- End;
-
- Function TToken.TokenAddress(Token : Word) : TokenStringPtr;
- Begin
- TokenAddress := TokTextTable[Hi(Token)]^.TokenTextEntry[Lo(Token)];
- End;
-
- Procedure TToken.SaveHashTable(FileName : String);
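- { File format: the first line holds SaveMagicNumber, followed by one
- token string per line; token numbers are implied by line order
- (1..TokMaxToken) }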
- Type
- TextBuffer = Array[1..TextBufSize] Of Char;
- Var
- Buf : ^TextBuffer;
- f : Text;
- i : Word;
- Begin
- Assign(f, FileName);
- New(Buf);
- If Buf <> Nil Then SetTextBuf(f, Buf^, TextBufSize);
- ReWrite(f);
- WriteLn(f, SaveMagicNumber);
- For i := 1 To TokMaxToken Do
- WriteLn(f, TokenAddress(i)^);
- Close(f);
- If Buf <> Nil Then Dispose(Buf); { guard against a failed New }
- End;
-
- Constructor TToken.RestoreHashTable(FileName : String);
- Type
- TextBuffer = Array[1..TextBufSize] Of Char;
- Var
- Buf : ^TextBuffer;
- n : LongInt;
- i : Word;
- f : Text;
- st : TokenString;
- ch : Char;
- Begin
- TokMaxToken := 0;
- FillChar(TokTextTable, SizeOf(TokTextTable), 0);
- FillChar(TokHashTable, SizeOf(TokHashTable), 0);
- {$I-}
- Assign(f, FileName);
- New(Buf);
- If Buf <> Nil Then SetTextBuf(f, Buf^, TextBufSize);
- Reset(f);
- {$I+}
- If IoResult = 0 Then Begin
- ReadLn(f, n);
- If n = SaveMagicNumber Then Begin
- i := 1;
- While Not Eof(f) Do Begin
- ReadLn(f, st);
- TokenUpdateText(i, st);
- Inc(i);
- End;
- End;
- Close(f);
- End;
- If Buf <> Nil Then Dispose(Buf);
- End;
-
- Destructor TToken.Done;
- Var
- i, j : Byte;
- Entry, Temp : HashEntryPtr;
- Begin
- For i := 0 To 255 Do If TokTextTable[i] <> Nil Then
- With TokTextTable[i]^ Do Begin
- For j := 0 To 255 Do If TokenTextEntry[j] <> Nil Then
- FreeMem(TokenTextEntry[j], Succ(Length(TokenTextEntry[j]^)));
- Dispose(TokTextTable[i]);
- End;
- For i := 0 To Pred(HashTableSize) Do Begin
- Entry := TokHashTable[i];
- While Entry <> Nil Do Begin
- Temp := Entry^.EntChain;
- Dispose(Entry);
- Entry := Temp;
- End;
- End;
- End;
-
- Function HeapControl.Less(Var x, y : HeapEntry) : Boolean;
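- { The comparison is inverted (a larger Distance counts as 'less') so
- that the bounded heap discards the worst matches and keeps the Count
- smallest edit distances for EditMatch }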
- Var
- xx : MatchRecord Absolute x;
- yy : MatchRecord Absolute y;
- Begin
- Less := xx.Distance > yy.Distance;
- End;
-
- Procedure TToken.EditMatch(Count : Byte; Var MatchTable;
- St : TokenString; TotalMatch : Boolean);
- Var
- Heap : HeapControl;
- Rec : MatchRecordPtr;
- i, j, Dist : Word;
- Match : Array[1..255] Of Word Absolute MatchTable;
- Begin
- Heap.Init(Count);
- With Heap Do Begin
- For i := 1 To TokMaxToken Do Begin
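- { Unless TotalMatch is requested, consider only words whose first
- letter matches one of the first two letters of St (so, for example,
- transposed leading characters are still caught) }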
- If TotalMatch
- Or (TokTextTable[Hi(i)]^.TokenTextEntry[Lo(i)]^[1] = St[1])
- Or (TokTextTable[Hi(i)]^.TokenTextEntry[Lo(i)]^[1] = St[2])
- Then Begin
- Dist := EditDistance(St,
- TokTextTable[Hi(i)]^.TokenTextEntry[Lo(i)]^);
- Rec := GetDiscard;
- If Rec = Nil Then New(Rec);
- If Rec <> Nil Then Begin
- With Rec^ Do Begin
- Token := i;
- Distance := Dist;
- End;
- Insert(Rec^);
- End;
- End;
- End;
- Repeat
- Rec := GetDiscard;
- If Rec <> Nil Then Dispose(Rec);
- Until Rec = Nil;
- j := EntryCount;
- End;
- For i := Count DownTo 1 Do If i > j Then Match[i] := 0 Else Begin
- Rec := Heap.DeleteLowEntry;
- Match[i] := Rec^.Token;
- Dispose(Rec);
- End;
- End;
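-
- (* Illustrative call of EditMatch (a sketch only; Tok is assumed to
- have been initialized and filled as in the earlier example, and the
- query string is hypothetical):
-
- Var
- Best : Array[1..5] Of Word;
- i : Integer;
- Begin
- Tok^.EditMatch(5, Best, 'RECIEVE', False);
- For i := 1 To 5 Do
- If Best[i] <> 0 Then WriteLn(Tok^.TokenText(Best[i]));
- End;
- *)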
-
- Function TToken.HashListLength(Bucket : Word) : Word;
- Var
- Count : Word;
- Entry : HashEntryPtr;
- Begin
- HashListLength := 0;
- If Bucket < HashTableSize Then Begin
- Count := 0;
- Entry := TokHashTable[Bucket];
- While Entry <> Nil Do With Entry^ Do Begin
- Inc(Count, EntTokenCount);
- Entry := EntChain;
- End;
- HashListLength := Count;
- End;
- End;
-
- End.