Programmer 7500

home *** CD-ROM | disk | FTP | other *** search

/ Programmer 7500 / MAX_PROGRAMMERS.iso / PASCAL / KRUSE_11.ZIP / $NDEXTEX.PAS next >

Wrap

Pascal/Delphi Source File | 1990-12-01 | 30.5 KB | 906 lines

{ 1. Program IndexText(InText, InIndex, NewIndex, OutIndex, HashFile, Input, Output); 2. Function Lt(u, v: word): Boolean; 3. Procedure ReadWord(var F: text; var w: word); 4. Procedure WriteWord(var F: text; w: word); 5. Procedure SplitWords; phase 1 6. Function HashAddress(w: word): hashentry; 7. Procedure Initialize; 8. Procedure GetWord; 9. Procedure GetChar(var ch: char); 10. Procedure AddChar(ch: char); 11. Procedure Conclude; 12. Procedure ClassifyWords; phase 2 13. Procedure BuildTree(var root: pointer; ch: char); 14. Procedure Insert(p: pointer); 15. Function Power2(c: integer): level; 16. Procedure FindRoot; 17. Procedure ConnectSubtrees; 18. Procedure GetNode(var p: pointer; ch: char); 19. Procedure Process(r: reference); 20. Procedure UpdateNode(p: pointer; r: reference); 21. Procedure NewWord(var p: pointer; r: reference); 22. Procedure InsertTree(r, p: pointer); 23. Procedure OutputTree(p: pointer); 24. Procedure PutNode(p: pointer); 25. Procedure UpdateHashFile; phase 3 26. Function HashAddress(w: word): hashentry; 27. Procedure MergeIndices; 28. Procedure CopyLine } Program IndexText(InText, InIndex, NewIndex, OutIndex, HashFile, NewHashFile, Input, Output); {Produces word counts and list of references for the document file InText. Uses the master word list in file InIndex, if provided. Output word list for the new text goes to file NewIndex. The merger of these two files becomes OutIndex. HashFile contains the common words to be ignored. If not specified, it is created on output, containing the words so flagged by the user.} Const maxwd = 20; {More letters in a word will be ignored.} minwd = 3; {Shorter words will be ignored} hashsize = 2003; {should be a prime; size of hash table} linesperpage = 66; {assumes standard spacing and paper} maxheight = 20; {for building binary tree in phase 2} A = 'A'; Z = 'Z'; hyphen = '-'; blank = ' '; apostrophe = ''''; {requires two apostrophes to represent one} underscore = '_'; ordbackspace = 8; {ASCII control character for backspace} ordformfeed = 12; {ASCII control character for new page} changecase = 32; {ASCII difference between upper and lower case} Type word = packed array[1..maxwd] of char; reference = record wd: word; pg: integer; {page number} end; fileref = file of reference; {used for local files} letter = A..Z; hashentry = 1..hashsize; Var InText, {document being processed} InIndex, {master word list} NewIndex, {word list of current document} OutIndex: text; {updated master word list} HashFile, NewHashFile: file of word; {local file, used to update HashFile} RefFile: array[letter] of fileref; {local files used for auxiliary storage of words from phase 1 to phase 2: separate file for each initial letter} blankword: word; {will contain all blanks} outcount: array[letter] of integer; {counters for word files} wordcount: integer; {count of all words in the text} Function Lt(u,v: word): Boolean; {Determine if word u precedes word v lexicographically.} Var i: 1..maxwd; {loop variable} Begin {function Lt} i := 1; While (i < maxwd) and (u[i] = v[i]) do i := i + 1; Lt := (u[i] < v[i]) {Above is version that works with ASCII code. For codes where blank comes after letters, modifications are necessary.} End; {function Lt} Procedure ReadWord( var F: text; var w: word); {reads word w from text file F; assumes not at end of file} Var c: 1..maxwd; Begin {procedure ReadWord} For c := 1 to maxwd do read(F, w[c]) End; {procedure ReadWord} procedure WriteWord(var F: text; w: word); {writes word w to text file F} var c: 1..maxwd; begin for c := 1 to maxwd do write(F, w[c]) end; {Phase 1: Splitting the Text into Words} Procedure SplitWords; {sets up hash table, reads text, and divides into 26 word lists} Var hash: array[hashentry] of word; {hash table} pagecount, {keeps the current page number} addpage, {amount to increase pagecount after word} linecount: integer; {line number on the current page} w: word; {word currently being processed} x: hashentry; {location of w, if in hash table} endinput: Boolean; {true if and only if input has all been read} firstletter: char; {Into which file does word w go?} {The following are kept for use in procedure GetWord, and for efficiency are set up only once in procedure Initialize.} backspace, formfeed: char; {ASCII control characters} contchar, {characters OK in the middle of a word} alphabet: set of char; {letters only --- to start a word} {Implementation dependent: A good Pascal compiler should allow "set of char"; otherwise, a restricted range is required.} function HashAddress(w: word): hashentry; {modified from the textbook} {calculates the location in hash table of word w, or, if not there, returns pointing to the blank word where w should go} var x, {calculated location} inc: integer; {increment for open addressing} begin {function HashAddress} x := abs(ord(w[1])*ord(w[2])+ord(w[4])+ord(w[6])) mod hashsize + 1; {Hash function assumes long word length. For short word machines we must ensure that the result is non-negative, and worry about overflow.} if (hash[x] <> w) and (hash[x] <> blankword) then begin inc := (abs(ord(w[3])-95) mod 29); {A key dependent increment is used to avoid clustering.} repeat inc := inc + 1; if inc > hashsize then writeln(w,' causes hash table to become full, infinite loop.'); x := x + inc; if x > hashsize then x := x - hashsize; until (w = hash[x]) or (blankword = hash[x]) end; HashAddress := x end; {function HashAddress} Procedure Initialize; {sets up constant-valued sets for use in GetWord; opens the text file and initializes various counters; opens file holding hash table (if any), and reads or otherwise initializes the table.} Var ch: char; {used as an index} i: integer; {general--purpose loop control} Begin {procedure Initialize} backspace:= chr(ordbackspace); formfeed := chr(ordformfeed); {Initialize ASCII control characters.} alphabet := ['A'..'Z', 'a'..'z']; {letters only, to start a word} contchar := alphabet + [hyphen, apostrophe, backspace, underscore]; {characters that will not terminate the word} For i := 1 to maxwd do blankword[i] := blank; reset(InText); endinput := eof(InText); Repeat write( 'What is the page number on which the text begins?'); readln(pagecount); if pagecount < 0 then writeln('Must be a nonnegative integer.') until pagecount >= 0; linecount := 0; addpage := 0; wordcount := 0; For ch := A to Z do Begin Rewrite( RefFile[ch] ); Outcount[ch] := 0 End; reset(HashFile); if eof(HashFile) then begin {There is no previous table; initialize the table to all blanks.} writeln('Cannot open file for hash table. Creating a new table.'); for i := 1 to hashsize do hash[i] := blankword end else begin {Retrieve the previous hash table.} i := 0; repeat i := i + 1; hash[i] := HashFile^; get(HashFile) until eof(HashFile) or (i >= hashsize); if (not eof(HashFile)) or (i <> hashsize) then writeln('Error in reading hash table. Incorrect number of entries.') end end; {procedure Initialize} Procedure GetWord( var w: word); {Gets words from input file InText, and returns only words at least minwd characters long. Parameter endinput becomes true if and only if the end of InText is reached with no word to return. This parameter is set by the subsidiary procedure GetChar. The procedure also updates global variables wordcount and linecount, updates the global variable pagecount after each linesperpage cr's, or after each formfeed, whichever comes first, and uses the sets alphabet and contchar and various character constants.} label 1; {used by GetChar to exit procedure on eof(InText)} Var c: 0..maxwd; {count of characters in word} ch: char; {character currently processed} endln: Boolean; {At the end of a line?} Procedure GetChar(var ch: char); {gets a character from input text into ch; checks for eof; updates page count and line count} Begin {procedure GetChar} If eof(InText) then If c >= minwd then ch := '.' {special character to end the current word} Else begin {no word to return; set endinput} endinput := true; goto 1 {Exit from GetWord.} End Else begin {not at end of file: process next character} ch := InText^; endln := eoln(InText); get(InText); If endln then Begin linecount := linecount + 1; If linecount >= linesperpage then Begin addpage := addpage + 1; linecount := 0 End End; If ch = formfeed then Begin addpage := addpage + 1; linecount := 0; endln := true; {Treat formfeed like end of line.} ch := blank End End End; {procedure GetChar} Procedure AddChar(ch: char); {adds given character to word, if possible} Begin {procedure AddChar} If c < maxwd then Begin c := c + 1; w[c] := ch End End; {procedure AddChar} begin {procedure GetWord} repeat {until current word is at least minwd chars long} c := 0; repeat GetChar(ch) {Find a letter that will start the word.} until ch in alphabet; pagecount := pagecount + addpage; addpage := 0; If ch in ['a'..'z'] then {Translate the first letter to uppercase.} ch := chr(ord(ch) - changecase); {system dependent} AddChar(ch); {Put first letter into the word.} GetChar(ch); While ch in contchar do If ch in alphabet then {Add letters directly to word.} Begin {processing letter} AddChar(ch); GetChar(ch) End {processing letter} Else If ch = hyphen then Begin {processing hyphen} GetChar(ch); {Find what comes after hyphen.} If endln then GetChar(ch) {Delete both the hyphen and the end of line.} Else if ch = hyphen then {Two hyphens represent a dash.} ch := blank {Use a blank to terminate the word.} Else If ch in alphabet then AddChar(hyphen) {Include hyphens between letters} Else {nothing} {Delete all other hyphens} End {processing hyphen} Else if ch = apostrophe then Begin {processing apostrophe} GetChar(ch); If ch = 's' then {Delete 's at end of word only.} Begin GetChar(ch); If ch in contchar then Begin AddChar(apostrophe); AddChar('s') End End Else if ch in alphabet then AddChar(apostrophe) {Allow contractions.} End {processing apostrophe} Else {Remaining possibilities are backspace and underscore.} GetChar(ch); {Delete these characters.} {while loop on continuing characters ends here.} wordcount := wordcount + 1 Until c >= minwd; {Skip over short words.} While c < maxwd do {Fill with blanks.} Begin c := c + 1; w[c] := blank End; 1: {When end of file occurs, program will exit to here from GetChar.} End; {procedure GetWord} Procedure Conclude; {Writes out counts of various word lists. For some systems, it is necessary to close files, which should be done in this procedure.} Var ch: char; {loop index} Begin {procedure Conclude} writeln('The total number of words read in is ', wordcount:7); writeln; writeln('The number of words to process further in the next stage,'); writeln('beginning with each letter, is below.'); writeln; for ch := 'A' to 'M' do write(' ', ch:1, ' '); writeln; for ch := 'A' to 'M' do write(outcount[ch]:4, ' '); writeln; writeln; for ch := 'N' to 'Z' do write(' ', ch:1, ' '); writeln; for ch := 'N' to 'Z' do write(outcount[ch]:4, ' '); writeln; writeln End; {procedure Conclude} Begin {procedure SplitWords} Initialize; {sets up files, hash table, constants} GetWord(w); {obtains a single word from InText} While not endinput do Begin x := HashAddress(w); If w <> hash[x] then Begin {Not in hash table; put into RefFile.} firstletter := w[1]; outcount[firstletter] := outcount[firstletter] + 1; With RefFile[firstletter]^ do Begin wd := w; pg := pagecount End; Put(RefFile[firstletter]) End; GetWord(w) End; Conclude {writes word counts to Output} End; {procedure SplitWords} {Phase 2: Classifying the words} Procedure ClassifyWords; {For each letter of the alphabet, the procedure reads in a list of words from InIndex, builds them into a binary tree, supplements it with entries from RefFile, and writes result to NewIndex and NewHashFile.} Type wordtype = (hash, count, index); {three ways to process a word} pointref = ^reflist; reflist = record {list of references} pg: integer; next: pointref end; pointer = ^node; node = record {vertex of the binary tree} wd: word; left, right: pointer; case kind: wordtype of hash: (); {empty} count: (ct: integer); index: (ref: pointref) end; Var root: pointer; {root of the binary tree} ch: char; {Loop on the first letter of word.} procedure BuildTree(var root: pointer; ch: char); {Uses an auxiliary procedure GetNode(p) to obtain a list of items in proper order of keys, and builds them into a binary search tree.} const maxheight = 20; type level = -1 .. maxheight; {number of steps above leaves} var lastnode: array[level] of pointer; {contains a pointer to the last node processed on each level} counter: integer; {number of nodes read in so far} p: pointer; {p^ is the present input node} lev: level; {level of p^} function Power2(c: integer): level; {Finds the highest power of 2 that divides c. Requires c <> 0.} var lev: level; begin {Function Power2} lev := 0; while not odd(c) do begin c := c div 2; lev := lev + 1 end; Power2 := lev end; {Function Power2} procedure Insert(p: pointer); {Inserts p^ as rightmost node of a partial binary search tree.} var lev: level; {level of p^} begin {Procedure Insert} lev := Power2(counter); p^.right := nil; p^.left := lastnode[lev - 1]; lastnode[lev] := p; if lastnode[lev + 1] <> nil then with lastnode[lev + 1]^ do if right = nil then right := p end; {Procedure Insert} procedure FindRoot; var lev: level; begin {Procedure FindRoot} if counter = 0 then root := nil {Tree is empty.} else begin {Non-empty tree} lev := maxheight; {Find the highest occupied level; it gives the root} while lastnode[lev] = nil do lev := lev - 1; root := lastnode[lev] end end; {Procedure FindRoot} procedure ConnectSubtrees; var p: pointer; lev: level; s: level; begin {Procedure ConnectSubtrees} lev := maxheight; while (lastnode[lev] = nil) and (lev > 1) do lev := lev - 1; {Find the highest node: root} while lev > 1 do {Nodes on levels 1 and 0 are already OK} with lastnode[lev]^ do if right <> nil then lev := lev - 1 {Search down for the highest dangling node} else begin {Case: right subtree is undefined.} p := left; {Find the highest entry in lastnode that} s := lev - 1; {is not in the left subtree.} repeat p := p^.right; s := s - 1 until (p = nil) or (p <> lastnode[s]); right := lastnode[s]; lev := s {Nodes on levels between lev and s are on the left.} end {Connecting dangling subtrees} end; {Procedure ConnectSubtrees} Procedure GetNode( var p: pointer; ch: char); {reads a word from file InIndex and sets node correspondingly; returns p = nil at eof or when next word starts later than ch} Var wordcode: char; {letter indicating type of word} Begin {procedure GetNode} While (not eof(InIndex)) and (InIndex^ = blank) do Get(InIndex); {Skip all the leading blanks.} If eof(InIndex) then p := nil Else if InIndex^ > ch then p := nil Else begin new(p); with p^ do begin ReadWord(InIndex, wd); Read(InIndex, wordcode); If wordcode = 'i' then begin kind := index; ref := nil end Else if wordcode = 'c' then begin kind := count; ct := 0 end Else Writeln('Erroneous word code in file InIndex.') End; {with statement setting up node} readln(InIndex) {Advance to the start of the next entry.} End End; {procedure GetNode} begin {Procedure BuildTree} for lev := -1 to maxheight do lastnode[lev] := nil; counter := 0; GetNode(p, ch); while p <> nil do begin counter := counter + 1; Insert(p); GetNode(p, ch) end; {receiving and processing input} FindRoot; ConnectSubtrees end; {Procedure BuildTree} Procedure Process( r: reference); {takes the word and page reference r and updates the binary tree} Var p: pointer; {Trace through the tree.} found: Boolean; {Is the word in the tree?} Procedure UpdateNode( p: pointer; r: reference); {uses reference r to update information in node p^} Var q: pointref; {used to add reference to list} Begin {procedure UpdateNode} With p^ do Case kind of hash:; {no action needed} count: ct := ct + 1; index: If ref = nil then Begin new(ref); ref^.pg := r.pg; ref^.next := nil End Else if ref^.pg <> r.pg then Begin {Add the new reference to the list.} New(q); q^.pg := r.pg; q^.next := ref; ref := q End End {case statement to update tree} End; {procedure UpdateNode} Procedure NewWord(var p: pointer; r: reference); {Creates a node for the first occurrence of a new reference r. A pointer to the new node is returned in p.} Var response: char; {answer received from user} Begin {procedure NewWord} new(p); With p^ do Begin wd := r.wd; left := nil; right := nil; Repeat {Ask user what kind of word.} WriteWord(output, wd); write('is (F, C, I)?'); read(response) Until response in ['F', 'C', 'I' ,'f', 'c', 'i']; Case response of 'F','f': kind := hash; 'C','c': Begin kind := count; ct := 1 End; 'I','i': Begin kind := index; new(ref); ref^.pg := r.pg; ref^.next := nil; End End {case statement} End {with statement} End; {procedure NewWord} Procedure InsertTree(r, p: pointer); {adds a node p^ to the tree with root r^; requires that r <> nil and p^ not be in the tree; proceeds by recursion} Begin {procedure InsertTree} If Lt(p^.wd, r^.wd) then If r^.left = nil then r^.left := p Else InsertTree(r^.left, p) Else If r^.right = nil then r^.right := p Else InsertTree(r^.right, p) End; {procedure InsertTree} Begin {procedure Process} If root = nil then {The tree might be empty.} NewWord(root, r) Else begin {case of nonempty tree} p := root; {Begin a tree search.} found := false; Repeat If r.wd = p^.wd then found := true Else If Lt(r.wd, p^.wd) then p := p^.left Else p := p^.right Until found or (p = nil); If found then UpdateNode(p, r) Else begin {p^ was not found: add it to the tree.} NewWord(p, r); InsertTree(root, p) End End End; {procedure Process} Procedure OutputTree( p: pointer); {traverses the tree for which p^ is the root in inorder} Procedure PutNode(p: pointer); Var q: pointref; {used to traverse list of references} Begin {procedure PutNode} With p^ do Case kind of hash: Begin NewHashFile^ := wd; put( NewHashFile ) End; count: If ct <> 0 then {Otherwise, word is not in the document.} Begin WriteWord(NewIndex, wd); write(NewIndex, 'c'); writeln( NewIndex, ct:5) End; index: If ref <> nil then Begin WriteWord(NewIndex, wd); write(NewIndex, 'i'); q := ref; Repeat write( NewIndex, q^.pg:5); q := q^.next Until q = nil; writeln( NewIndex ) End End {case statement} End; {procedure PutNode} Begin {procedure OutputTree} If p <> nil then With p^ do Begin OutputTree(left); {Traverse the left subtree.} PutNode(p); OutputTree(right); {Traverse the right subtree.} Dispose(p) End End; {procedure OutputTree} Begin {procedure ClassifyWords} writeln('At the appearance of each word, give its disposition:'); writeln(' F --- Forget all occurrences of this word.'); writeln(' C --- Count how many times this word appears.'); writeln(' I --- Index this word: list the pages on which it appears.'); Reset(InIndex); Rewrite(NewIndex); For ch := A to Z do {Start main loop on first letter of word.} Begin BuildTree(root, ch); {Get the part of master wordlist starting with ch from the file InIndex, and build it into a binary tree.} reset(RefFile[ch]); While not eof(RefFile[ch]) do Begin Process(RefFile[ch]^); {Use new words from RefFile[ch] to update the tree.} get( RefFile[ch] ) End; OutputTree(root) {Write the contents of the tree into files NewIndex and NewHashFile.} End {main loop on letters of alphabet} End; {procedure ClassifyWords} {Phase 3: Updating the Permanent Files} Procedure UpdateHashFile; {reads in old hash table, inserts file of new entries; writes out to HashFile} Var hash: array[hashentry] of word; x: hashentry; w: word; Function HashAddress(w: word): hashentry; {calculates the location in hash table of word w, or, if none, returns pointing to the blank word where w should go} Var x, {calculated location} inc: integer; {increment for open addressing} Begin {function HashAddress} x := (ord(w[1]) * ord(w[3]) * ord(w[4]) + ord(w[6])) mod hashsize + 1; {Hash function assumes long word length. For short word machines, we must ensure that the result is nonnegative, and worry about overflow.} If (hash[x] <> w) and (hash[x] <> blankword) then Begin inc := 1; Repeat x := x + inc; If x > hashsize then x := x - hashsize; inc := inc + 2 Until (w = hash[x]) or (blankword = hash[x]) End; HashAddress := x End; {function HashAddress} Begin {procedure UpdateHashFile} reset(HashFile); If eof(HashFile) then {HashFile is empty; create new table.} For x := 1 to hashsize do hash[x] := blankword Else For x := 1 to hashsize do read(HashFile, hash[x]); {Some versions of Pascal do not allow procedures read and write for files other than text. For such systems, expand to use get and put.} reset(NewHashFile); While not eof(NewHashFile) do Begin read(NewHashFile, w); hash[HashAddress(w)] := w {If the table is full, new entries will replace old ones.} End; rewrite(HashFile); For x := 1 to hashsize do Write(HashFile, hash[x]) End; {procedure UpdateHashFile} Procedure MergeIndices; {merges files NewIndex and InIndex into file OutIndex} Var u, v: word; {for new and old indices, respectively} m, n: integer; {counts for above entries} ukind, vkind: char; {Is the word of kind i or c?} Procedure CopyLine( var w: word; var F: text; newline, endline: Boolean); {Copies the remainder of a line from the file F to OutIndex. If newline is true, then the word w is also written, and kind is copied. If endline is true, then the line written to OutIndex is ended. The procedure also reads a new word w from the next line in F.} Var n: integer; {number copied from file to file} kind: char; {word code copied from file to file} Begin {procedure CopyLine} If newline then Begin WriteWord(OutIndex, w); read(F, kind); write(OutIndex, kind) End Else While (not eof(F)) and (not eoln(F)) and (F^ = blank) do get(F); While (not eof(F)) and (not eoln(F)) do Begin read(F, n); write(OutIndex, n:5); While (not eoln(F)) and (F^ = blank) do get(F); {Skip blanks.} End; readln(F); If not eof(F) then ReadWord(F, w); If endline then writeln(OutIndex) End; {procedure CopyLine} Begin {procedure MergeIndices} reset(NewIndex); reset(InIndex); rewrite(OutIndex); If eof(NewIndex) or eof(InIndex) then writeln('One of the indices is empty. No merge will be done.') Else Begin ReadWord(NewIndex, u); ReadWord( InIndex, v); Repeat If Lt(u,v) then CopyLine(u, NewIndex, true, true) {Boolean parameters mean, respectively; start new line; end the line.} Else If Lt(v,u) then CopyLine(v, InIndex, true, true) Else begin {Words are equal. Determine the kind of word.} read(NewIndex, ukind); read( InIndex, vkind); If ukind <> vkind then writeln('Inconsistent word types found in merge.'); WriteWord(OutIndex, u); write(OutIndex, ukind); If ukind = 'c' then Begin readln(NewIndex, m); readln( InIndex, n); m := m + n; writeln(OutIndex, m:5); If not eof(NewIndex) then ReadWord(NewIndex, u); If not eof( InIndex) then ReadWord( InIndex, v) End Else begin {Copy both lists of page numbers.} CopyLine(u, NewIndex, false, false); CopyLine(v, InIndex, false, true) End End {finished processing equal words} Until eof(NewIndex) or eof(InIndex); While not eof(NewIndex) do CopyLine(u, NewIndex, true, true); While not eof(InIndex) do CopyLine(v, InIndex, true, true) {At most one of the two loops above will iterate.} End End; {procedure MergeIndices} Begin {main program} SplitWords; {phase 1} ClassifyWords; {phase 2} UpdateHashFile; {phase 3, first part} MergeIndices; {phase 3, second part} End.