Programmer 7500

home *** CD-ROM | disk | FTP | other *** search

/ Programmer 7500 / MAX_PROGRAMMERS.iso / PASCAL / KRUSE_11.ZIP / INDEXHAS.PAS < prev next >

Wrap

Pascal/Delphi Source File | 1990-12-01 | 31.5 KB | 931 lines

{outline of declaration of subprograms: 1. program IndexText(InText, InIndex, NewIndex, OutIndex, HashFile, NewHashFile, input, output); (main program) 2. function Lt(u, v: word): Boolean; 3. procedure ReadWord(var f: text; var w: word); 4. procedure WriteWord(var f: text; w: word); 4a. built in CPU time function clock; 5. procedure SplitWords; (phase 1) 5a. function FindFile(ch: char): filecode; 6. function HashAddress(w: word): hashentry; 7. procedure Initialize; 8. procedure GetWord; 8a. procedure TellUserPage; 9. procedure GetChar(var ch: char); 10. procedure AddChar(ch: char); 11. procedure Conclude; procedure ClassifyWords; procedure InitializeTable(RefTable: RefHashTable); function HashAddress(x: reference): integer; procedure Insert(x: reference; pos: integer; var RefTable: RefHashTable); procedure Place(var F: fileref; RefTable: RefHashTable); function Empty(L: list): Boolean; procedure LinkEntries(RefTable: RefHashTable; var NewList: list); procedure RemoveFirst(var p: pointer; L: list); procedure SkipBlank(var F: text); procedure ReadReference(var r: pointer; var F: text); procedure WriteReference(p: pointer; var NewIndex, NewHashFile: text); procedure GetWordType(p: pointer); procedure Delete(var p: pointer); procedure CompareAndMerge(NewList: list;var InIndex,NewIndex,NewHashFile: text); procedure Merge(p, q: pointer; var r: pointer); procedure Divide(var p, q: pointer); procedure MergeSort(var p: pointer); procedure MainMergeSort(var L: list); } program IndexText(InText, InIndex, NewIndex, HashFile, NewHashFile, input, output); {Produces word counts and list of references for the document file InText. Uses the master word list in file InIndex, if provided. Output word list for new text goes to file NewIndex. HashFile contains the common words to be ignored. If not specified, it is created on output, containing the words so flagged by the user.} {This implementation uses only phases 1 and 2. A smaller array of text files is also used, as specified in the exercise section.} const maxwd = 20; {More letters in word will be ignored.} minwd = 1; {Shorter words will be ignored.} hashsize = 2003; {should be a prime} linesperpage = 66; {assumes standard spacing and paper} maxheight = 20; {for building binary tree in phase 2} A = 'A'; Z = 'Z'; hyphen = '-'; blank = ' '; apostrophe = ''''; {requires two `'s to represent one} underscore = '_'; ordbackspace = 8; {ASCII control character for backspace} ordformfeed = 12; {ASCII control character for new page} changecase = 32; {ASCII difference between upper and lower case} nfiles = 8; {number of temporary files for unprocessed words} MaxRowLength = 130; {maximum length of output records} type word = packed array[1..maxwd] of char; reference = record wd: word; pg: integer; {count or page number} end; fileref = file of reference; {used for local files} letter = A..Z; hashentry = 1..hashsize; filecode = 1..nfiles; var InText, {document being processed} InIndex, {master word list} NewIndex, {word list of current document} HashFile, NewHashFile: text; RefFile: array[filecode] of fileref; {local files used for auxilary storage of words from phase 1 to phase 2: Normally, a separate file exist for each initial letter, this version uses nfiles files due operating system constraints.} blankword: word; {will contain all blanks} {The next two variables were originally declared in procedure SplitWords, they have been moved to this level in order to access them globally.} outcount: array[filecode] of integer; {counters for word files} wordcount: integer; {count of all words in the text} intextname, inlistname, newlistname, newhashname: word; {used to get filename from user} lastletter: array[filecode] of letter; {last letter in each file} PresentTime, StartTime: integer; {used to track CPU time} RowLength: integer; {ensures records will not exceed MaxRowLength} function Lt( u, v: word): Boolean; {Determains if word u precedes word v lexicographically.} begin Lt := (u < v) end; procedure ReadWord( var F: text; var w: word); {Reads word w from text file F. Assumes not at end of file.} {Uses packed array, replace using a loop if your system does not support packed arrays. } begin {procedure ReadWord} read(F, w) end; {procedure ReadWord} procedure WriteWord( var F: text; w: word); {Writes word w to text file F} {Uses packed array, replace using a loop if your system does not support packed arrays. } begin {procedure WriteWord} write(F, w) end; {procedure WriteWord} procedure SetTimer; {Call once at beginning of program execution.} {Finds the CPU time when called, and keeps in variables for reference.} {System dependent procedure.} begin PresentTime := clock; StartTime := PresentTime; end; function TotalTime: real; {Returns the total CPU time, in seconds, since call to SetTimer.} {System dependent procedure.} begin TotalTime := (clock - StartTime) / 1000.0; end; function ElapsedTime: real; {Returns elapsed CPU time since last call to function ElapsedTime, or call to SetTimer, whichever is more recent.} {System dependent procedure.} var r: integer; begin r := clock; ElapsedTime := (r - PresentTime) / 1000.0; PresentTime := r; end; procedure SplitWords; {sets up hash table, reads text, and divides into nfiles word lists} var hash: array[hashentry] of reference; {hash table} pagecount: integer; {keeps the current page number} addpage: integer; {amount to increase pagecount after word} linecount: integer; {lines on the current page} w: word; {word currently being processed} x: hashentry; {location of w, if in hash table} endinput: Boolean; {true if and only if input has all been read} code: filecode; {into which file does word go?} {The following variables are kept for use in procedure GetWord, and for efficiency are set up only once in procedure Initialize:} backspace, formfeed: char; alphabet, {letters only - to start a word} contchar: set of char; {other characters ok in middle of word} function FindFile( ch: letter): filecode; {Uses binary decision tree to select one of nfiles = 8 files depending on the letter ch. These letters must be the same as those in the global array lastletter .} begin {function FindFile} if ch < 'M' then if ch < 'E' then if ch < 'C' then FindFile := 1 else FindFile := 2 else if ch < 'H' then FindFile := 3 else FindFile := 4 else if ch < 'S' then if ch < 'P' then FindFile := 5 else FindFile := 6 else if ch < 'T' then FindFile := 7 else FindFile := 8 end; {function FindFile} function HashAddress(w: word): hashentry; {calculates the location in hash table of word w, or, if not there, returns pointing to the blank word where w should go} var x, {calculated location} inc: integer; {increment for open addressing} begin {function HashAddress} x := abs(ord(w[1])*ord(w[2])+ord(w[4])+ord(w[6])) mod hashsize + 1; {Hash function assumes long word length. For short word machines we must ensure that the result is non-negative, and worry about overflow.} if (hash[x].wd <> w) and (hash[x].wd <> blankword) then begin inc := (abs(ord(w[3])-95) mod 29); {A key dependent increment is used to avoid clustering.} repeat inc := inc + 1; if inc > hashsize then writeln(w,' causes hash table to become full, infinite loop.'); x := x + inc; if x > hashsize then x := x - hashsize; until (w = hash[x].wd) or (blankword = hash[x].wd) end; HashAddress := x end; {function HashAddress} procedure Initialize; {sets up constant-valued sets for use in GetWord. Opens the text file and initializes various counters. Opens file holding hash table (if any), and reads or otherwise initializes table} var i: integer; {general purpose loop control} begin {procedure Initialize} backspace:= chr(ordbackspace); formfeed := chr(ordformfeed); {initialize ASCII control characters} alphabet := ['A'..'Z', 'a'..'z']; {letters only, to start a word} contchar := [hyphen, apostrophe, backspace, underscore]; {characters which will not terminate word} for i := 1 to maxwd do blankword[i] := blank; write('Name of input text file?'); ReadWord(input, intextname); readln; open(InText, intextname, readonly); reset(InText); endinput := eof(InText); repeat write( 'What is the page number on which the text begins?'); readln(pagecount); if pagecount < 0 then writeln('Must be a non-negative integer.') until pagecount >= 0; linecount := 0; addpage := 0; wordcount := 0; for i := 1 to nfiles do begin rewrite( RefFile[i] ); outcount[i] := 0 end; lastletter[1] := 'B'; lastletter[2] := 'D'; lastletter[3] := 'G'; lastletter[4] := 'L'; lastletter[5] := 'O'; lastletter[6] := 'R'; lastletter[7] := 'S'; lastletter[8] := 'Z'; reset(HashFile); {assumes HASHFILE.DAT is in current directory} for i := 1 to hashsize do with hash[i] do begin read(HashFile, pg); get(HashFile); {skip the blank between number and word} ReadWord(HashFile, wd); readln(HashFile); pg := 0; {initialize all the counts to 0} end; writeln('The hash table has been read.') end; {procedure Initialize} procedure GetWord( var w: word); {Gets words from input file InText, and returns only words at least minwd characters long. Parameter endinput becomes true if and only if the end of InText is reached with no word to return. the procedure also updates global variables wordcount and linecount, updates the global variable pagecount after each linesperpage cr's, or after each formfeed, whichever comes first, and uses the sets alphabet and contchar and various character constants.} label 1; {used by GetChar to exit procedure upon eof(InText)} var c: 0..maxwd; {count of characters in word} ch: char; {character currently processed} endln: Boolean; {at the end of a line?} procedure TellUserPage; {keep the user informed of progress} var i: integer; begin i := pagecount + addpage; writeln('At page', i:4, ' word count is', wordcount:7) end; procedure GetChar(var ch: char); {gets a character from input text into ch; checks for eof; updates page count and line count} begin {procedure GetChar} if eof(InText) then if c >= minwd then ch := '.' {special character to end the current word} else begin {no word to return; set endinput} endinput := true; goto 1 {exit from GetWord.} end else begin {not end of file: process next character} while InText^ in [underscore, backspace] do get( InText); ch := InText^; endln := eoln(InText); get(InText); if endln then begin linecount := linecount + 1; if linecount >= linesperpage then begin addpage := addpage + 1; linecount := 0; TellUserPage end end; if ch = formfeed then begin addpage := addpage + 1; linecount := 0; TellUserPage; endln := true; {Treat formfeed like end of line.} ch := blank end end end; {procedure GetChar} procedure AddChar(ch: char); {adds given character to word, if possible} begin {procedure AddChar} if c < maxwd then begin c := c + 1; w[c] := ch end end; {procedure AddChar} begin {procedure GetWord} repeat {until current word is at least minwd chars long} c := 0; repeat GetChar(ch) {Find a letter which will start the word.} until ch in alphabet; pagecount := pagecount + addpage; addpage := 0; if ch in ['a'..'z'] then {translate first letter to upper case.} ch := chr(ord(ch) - changecase); {assumes ASCII ordering of letters} AddChar(ch); {put first letter into the word} GetChar(ch); while (ch in alphabet) or (ch in contchar) do if ch in alphabet then {add letters directly to word} begin {processing letter} AddChar(ch); GetChar(ch) end {processing letter} else if ch = hyphen then begin {processing hyphen} GetChar(ch); {Find what comes after hyphen.} if endln then while ch = ' ' do GetChar(ch) {Delete both the hyphen and the end of line} else if ch = hyphen then {Two hyphens form a dash; ends word} ch := blank {Use a blank to terminate the word.} else if ch in alphabet then AddChar(hyphen) {Include other hyphens in word} else {nothing} end {processing hyphen} else if ch = apostrophe then begin {processing apostrophe} GetChar(ch); if ch = 's' then {Delete `'s' at end of word only} begin GetChar(ch); if ch in contchar then begin AddChar(apostrophe); AddChar('s') end end else if ch in alphabet then AddChar(apostrophe) {Allow contractions.} end {processing apostrophe} else {Remaining possibilities are backspace and underscore.} GetChar(ch); {Delete these characters.} {While loop on continuing characters ends here.} wordcount := wordcount + 1 until c >= minwd; {Skip over short words.} while c < maxwd do {Fill with blanks.} begin c := c + 1; w[c] := blank end; 1: {When end of file occurs, program will exit to here from GetChar} end; {procedure GetWord} procedure Conclude; {Writes out counts of various word lists. For some systems, it is necessary to close files, which should be done here.} var i,j: integer; {loop index} response: char; {user's answer to question} begin {procedure Conclude} writeln('The total number of words read in is ', wordcount:7); writeln; writeln('The number of words to process further in the next stage,'); writeln('in each temporary file, is below.'); writeln(' a-b c-d e-g h-l m-o p-r s t-z'); for i := 1 to nfiles do write(outcount[i]:8); writeln; writeln; (* not implemented: repeat write('Do you wish the counts from hash table to be kept in a file (y,n)?'); readln(response); if response > 'Z' then response := chr(ord(response)-changecase) until response in ['N', 'Y']; if response = 'Y' then begin write('Name of file ?'); ReadWord(input, newhashname); readln; open(NewHashFile, newhashname); rewrite(NewHashFile); for i := 1 to hashsize do with hash[i] do begin write(NewHashFile, pg:4, ' '); j := 1; repeat write(NewHashFile, wd[j]); j := j + 1; until (wd[j] = ' ') or (j >= maxwd); writeln(NewHashFile) end end *) end; {procedure Conclude} begin {procedure SplitWords} Initialize; {sets up files, hash table, constants} GetWord(w); {obtain a single word from InText} while not endinput do begin x := HashAddress(w); if w = hash[x].wd then hash[x].pg := hash[x].pg + 1 else begin {not in hash table; put into RefFile} code := FindFile( w[1] ); outcount[code] := outcount[code] + 1; with RefFile[code]^ do begin wd := w; pg := pagecount end; Put(RefFile[code]) end; GetWord(w) end; Conclude {writes word counts to output.} end; {procedure SplitWords} {start of phase 2} procedure ClassifyWords; {The references stored in the temporary files are placed in a new hash table, the words from the file InIndex are compared with the words in the new table as they are merged into the file NewIndex.} const RefTableSize = 3023; {Size of the hash table to temporarily store words.} RefTableMax = 3022; type wordtype = (hash, count, index); {ways to process a word} pointref = ^reflist; reflist = record {list of references} pg: integer; next: pointref end; pointer = ^node; node = record {node of list storing wrods.} wd: word; kind: wordtype; ct: integer; ref: pointref; next: pointer end; {Cannot use varying types as @wordtype is not known upon first reading.} list = record head: pointer end; RefHashTable = array[0..RefTableMax] of list; var code: filecode; {loop through temporary files} RefTable: RefHashTable; {stores all references in memory} NewList: list; function Empty(L: list): Boolean; begin Empty := (L.head = nil) end; procedure InitializeTable(var RefTable: RefHashTable); var i: integer; begin {procedure InitializeTable} for i := 0 to RefTableMax do RefTable[i].head := nil; end; {procedure InitializeTable} function RefTableAddress(x: reference): integer; { Returns hashed address of reference. } var i: integer; h: integer; begin {function Hash} h := 0; with x do for i := 1 to maxwd do h := h + ord(wd[i]); RefTableAddress := h mod RefTableSize end; {function Hash} procedure Insert(x: reference; pos: integer; var RefTable: RefHashTable); { Inserts the reference into the hash table of references. } var done: Boolean; p: pointer; q: pointref; begin {procedure Insert} done := false; p := RefTable[pos].head; while (p <> nil) and (not done) do begin {Search for the word, update the reference if it is found.} if p^.wd = x.wd then begin p^.ct := p^.ct + 1; {update count and page reference} new(q); q^.pg := x.pg; q^.next := p^.ref; p^.ref := q; done := true end else p := p^.next end; if not done then begin {Insert a new entry if the word is not already in the table.} p := nil; new(p); p^.wd := x.wd; p^.ct := 1; {Initialize the count and the page refernces.} new(q); q^.pg := x.pg; q^.next := nil; p^.ref := q; p^.next := RefTable[pos].head; RefTable[pos].head := p end end; {procedure Insert} procedure Place(var F: fileref; var RefTable: RefHashTable); { Places the words in file @F into the reference table. } var x: reference; begin {procedure Place} reset(F); while not eof(F) do begin x := F^; get(F); Insert(x, RefTableAddress(x), RefTable) end end; {procedure Place} procedure LinkEntries(var RefTable: RefHashTable; var NewList: list); { The references in the hashed table are combined into the list @NewList. } var i: integer; p: pointer; begin {procedure LinkEntries} i := 0; while (i < RefTableMax) and Empty(RefTable[i]) do {find the first entry} i := i + 1; if i <= RefTableMax then begin NewList.head := RefTable[i].head; {Initialize the list to point to the first entry.} p := RefTable[i].head; if p <> nil then {Find the end of the first chain.} while p^.next <> nil do p := p^.next; while (i < RefTableMax) do {link remaining entries} begin i := i + 1; if not Empty(RefTable[i]) then begin p^.next := RefTable[i].head; while p^.next <> nil do {Move @p to the end of the chain.} p := p^.next end end end else NewList.head := nil end; {procedure LinkEntries} procedure RemoveFirst(var p: pointer; var L: list); { Removes the first node from the list @L. } begin p := L.head; if not Empty(L) then begin L.head := L.head^.next; p^.next := nil end end; procedure ReadReference(var r: pointer; var F: text); { Reads refernce from the file @F. } var k: char; begin {procedure ReadReference} if eof(F) then r := nil else begin ReadWord(F, r^.wd); readln(F, k); case k of 'F', 'f': r^.kind := hash; 'C', 'c': begin r^.kind := count; r^.ct := 0 end; 'I', 'i': begin r^.kind := index; r^.ref := nil end end end end; {procedure ReadReference} procedure WriteReference(p: pointer; var NewIndex, NewHashFile: text); var q: pointref; begin {procedure WriteReference} with p^ do case kind of hash: begin WriteWord(NewHashFile, wd); writeln(NewHashFile) end; count:begin WriteWord(NewIndex, wd); write(NewIndex, 'c'); writeln(NewIndex, ct:5) end; index:begin WriteWord(NewIndex, wd); write(NewIndex, 'i'); q := ref; while q <> nil do begin write(NewIndex, q^.pg:5); q := q^.next end; writeln(NewIndex) end end end; {procedure WriteReference} procedure GetWordType(p: pointer); { Request the user to specify the category of the given word. } var response: char; begin {procedure GetWordType} with p^ do begin repeat WriteWord(output, wd); write(' is (F, C, I)?'); readln(response) until response in ['F', 'f', 'C', 'c', 'I', 'i']; case response of 'F', 'f': kind := hash; 'C', 'c': kind := count; 'I', 'i': kind := index end end end; {procedure GetWordType} procedure Delete(var p: pointer); { Delete the word @p^ as well as all of the page references associated with it. } var q, r: pointref; begin {procedure Delete} if p^.kind = index then begin q := p^.ref; while q <> nil do begin {dispose the reference list} r := q^.next; dispose(q); p^.ref := r; q := r end end; dispose(p) {dispose the node itself} end; {procedure Delete} procedure CompareAndMerge(var L: list; var InIndex, NewIndex, NewHashFile: text); { Compare the list @L with @InIndex, merge if was found. } var p, r: pointer; begin {procedure CompareAndMerge} RemoveFirst(p, L); new(r); ReadReference(r, InIndex); while p <> nil do if r = nil then begin GetWordType(p); WriteReference(p, NewIndex, NewHashFile); Delete(p); {Remove reference list and node from memory.} RemoveFirst(p, L) end else if p^.wd < r^.wd then begin GetWordType(p); WriteReference(p, NewIndex, NewHashFile); Delete(p); {Remove reference list and node from memory.} RemoveFirst(p, L) end else if p^.wd > r^.wd then {do not write word not used to NewIndex} ReadReference(r, InIndex) else begin {p^.wd = r^.wd} p^.kind := r^.kind; WriteReference(p, NewIndex, NewHashFile); Delete(p); RemoveFirst(p, L); ReadReference(r, InIndex) end end; {procedure CompareAndMerge} procedure Merge(p, q: pointer; var r: pointer); {Merges two sorted lists into one, that will begin at r; requires that both lists be non empty. This version is modified slightly from the version listed in the text due to a difference in the data structures used.} var s: pointer; {always points to last node of sorted list} begin {procedure Merge} if (p = nil) or (q = nil) then writeln('Merge called with empty list(s).'); {First find the head, r, of the merged list.} if p^.wd <= q^.wd then {change .info.key to .wd} begin r := p; p := p^.next end else begin r := q; q := q^.next end; s := r; {s always points to the last entry of the merged list.} while (p <> nil) and (q <> nil) do if p^.wd <= q^.wd then {change .info.key to .wd} begin s^.next := p; {Attach the node with the smaller key to the sorted list.} s := p; p := p^.next {Advance to the next unmerged node.} end else begin s^.next := q; s := q; q := q^.next end; {After one list is exhausted, attach the remainder of the other one.} if p = nil then s^.next := q else s^.next := p end; {procedure Merge} (*===========================================================================*) procedure Divide(var p, q: pointer); {takes the list to which p points, divides it in half, and returns with p pointing to head of the first half and q to the head of second half; requires that the original list contain at least two items, or an error occurs} var r: pointer; begin {procedure Divide} q := p; {Start q at position 1, and r at position 3.} r := p^.next; r := r^.next; while r <> nil do {Move r two positions for each move of q.} begin r := r^.next; q := q^.next; if r <> nil then r := r^.next end; {Break the list into halves after q^.} r := q^.next; q^.next := nil; q := r end; {procedure Divide} procedure MainMergeSort(var L: list); {Main procedure to invoke recursive MergeSort} procedure MergeSort(var p: pointer); {divides the list starting at p^ in half, sorts it recursively, and merges the sublists} var q: pointer; {marks the halfway point in the list} begin if p <> nil then if p^.next <> nil then begin {Otherwise, list has 0 or 1 entry, with no need to sort.} Divide(p, q); MergeSort(p); MergeSort(q); Merge(p, q, p) End End; begin MergeSort(L.head) end; (*===========================================================================*) begin {procedure ClassifyWords} write('Name of input word list ?'); ReadWord(input, inlistname); readln; open(InIndex, inlistname, readonly); {may vary on different systems} reset(InIndex); write('Name of output word list ?'); ReadWord(input, newlistname); readln; open(NewIndex, newlistname); {may vary on different systems} rewrite(NewIndex); write('Name of file for new hash words ?'); ReadWord(input, newhashname); readln; open(NewHashFile, newhashname); {may vary on different systems} rewrite(NewHashFile); InitializeTable(RefTable); for code := 1 to nfiles do Place(RefFile[code], RefTable); LinkEntries(RefTable, NewList); MainMergeSort(NewList); if not Empty(NewList) then CompareAndMerge(NewList, InIndex, NewIndex, NewHashFile); close(InIndex); {may vary on different systems} close(NewIndex); close(NewHashFile) end; {procedure ClassifyWords} begin {main program} SetTimer; SplitWords; {Phase 1} writeln('Time in first phase is ', ElapsedTime:7:1, ' seconds.'); writeln; ClassifyWords; {Phase 2} writeln('Time in second phase is', ElapsedTime:7:1, ' seconds.'); writeln; writeln('Processing of input document ', intextname, ' is complete.'); writeln('Total time in program was ', TotalTime:7:1, ' seconds.') end.