Source Code 1992 March

home *** CD-ROM | disk | FTP | other *** search

/ Source Code 1992 March / Source_Code_CD-ROM_Walnut_Creek_March_1992.iso / usenet / altsrcs / 1 / 1515 < prev next >

Wrap

Internet Message Format | 1990-12-28 | 47.5 KB

From: gtoal@tharr.UUCP (Graham Toal) Newsgroups: alt.sources Subject: Spelling utilities in C (Part 3 of 3) Message-ID: <809@tharr.UUCP> Date: 28 Jun 90 07:09:57 GMT #!/bin/sh-----cut here-----cut here-----cut here-----cut here----- # shar: Shell Archiver # Run the following text with /bin/sh to create: # dawg.c # dwgcheck.c # pack.c # pckcheck.c # pdawg.c # ppack.c # proot.c # sample.c # tell.c cat - << \SHAR_EOF > dawg.c /* The first three #define's are to repair mangling by BITNET mailers */ #define EOR ^ /* This should be Caret (hat, up arrow) -- C Ex-or */ #define MOD % /* This should be Percent -- C Modulus */ #define NOT ~ /* This should be Tilde (twiddle) -- C unary Not */ #include "copyr.h" /*****************************************************************************/ /* */ /* FILE : DAWG.C */ /* */ /* Convert an alphabetically-ordered dictionary file in text format */ /* (one word per line) into a Directed Acyclic Word Graph. The command */ /* syntax is */ /* */ /* DAWG <text file (inc .ext)> <output file (no ext)> */ /* */ /* The first 4 bytes of the output file form a 24-bit number containing */ /* the number of edges in the dawg. The rest of the file contains the */ /* dawg itself (see "The World's Fastest Scrabble Program" by Appel */ /* and Jacobson for a description of a dawg). */ /* */ /*****************************************************************************/ #include "grope.h" #ifndef grope_h /* If you don't have grope.h, create an empty one. These will do for a basic system: */ #undef KNOWS_VOID #undef STDLIBS #undef PROTOTYPES #define SYS_ANY 1 #define COMPILER_ANY 1 #define SMALL_MEMORY 1 /* To be defined if you have to generate the data structure in bits. This will certainly be true for any non-trivial dictionary on MSDOS, or most home micros with 1Mb Ram or under. */ #endif /* Portability notes: 0) KISS! Keep It Simple, Stupid! 1) No typedef's 2) No bitfields 3) No structs 4) No #if defined() 5) No complex #if's 6) No procedure variables 7) Don't trust tolower() as some libs don't range check 8) Stay character-set independent (EBCDIC should work?) 9) Assume sizeof(long) >= 32 bits, but sizeof(int) just >= 16 10) Note use of ABS in preference to unsigned longs. 11) Assume 8-bit char set 12) Don't use k&r reserved words for variables, even if not in ANSI. Such as 'entry'. 13) Unix people; no sys/ or machine/ files please unless under a suitable #ifdef, and generic code supplied for everyone else... 14) this is byte-sex independant at the moment. KEEP IT THAT WAY! Don't assume dawgs are stored in a fixed form, such as so-called 'net-order'. 15) Since C doesn't range-check arrays, we'll do it. If possible, leave the running systems with range checking on if you can afford it! 16) No nested include files. 17) Don't pull in any include files twice. (*Kills* the Atari!) 18) Don't use 'register' -- trust the compiler; the compiler is your friend. */ /*#define RCHECK 1*/ /* We want range-checking of array accesses */ #include <stdio.h> #ifdef LIB_STRINGS #include <strings.h> /* Some unixes, Mac? */ #else #include <string.h> #endif #ifdef SYS_MAC /* To compile with THINK C 4.0, place the files dawg.h grope.h, dawg.c and utils.c in a folder. Create a project that contains dawg.c and the libraries unix and ANSI. */ #include <unix.h> #include <stdlib.h> #include <console.h> /* Assume a 1Mb mac for the moment. Someday I'll add dynamic RAM sizing */ #define SMALL_MEMORY 1 #endif #include "dawg.h" /* main system constants */ #include "utils.c" /* utils.c pulls in header file for malloc/free & exit */ #define ABS(x) ((x) < 0 ? -(x) : (x)) /** * The following constant, HASH_TABLE_SIZE, can be changed according to * dictionary size. The two values shown will be fine for both large * and small systems. It MUST be prime. **/ #ifdef SMALL_MEMORY #define HASH_TABLE_SIZE 30011 #ifdef SYS_MAC /* boy, you guys must be *really* tight on space... are you sure you can handle a reasonable size of dictionary with such a small table? Bump this back up as soon as you get everything else working... (I was given this info by a Mac site while they were first porting this stuff; maybe now it works on macs we can put the buffer size back up to something reasonable) */ #undef HASH_TABLE_SIZE #define HASH_TABLE_SIZE 17389 #endif #else /* pick one about 20% larger than needed */ #define HASH_TABLE_SIZE 240007 /* Suitable prime numbers if you want to tune it: 30011 150001 <-- probably a good compromise. OK for dicts to about 1Mb text 200003 220009 240007 If you try any others, for goodness sake CHECK THAT THEY ARE PRIME!!! */ #endif #define MAX_EDGES (HASH_TABLE_SIZE-1) static FILE *fpin, *fpout; /* Input/output files */ static char current_word[(MAX_LINE+1)]; /* The last word read from fpin */ static int first_diff, save_first_diff; /* The position of the first letter at which */ /* current_word differs from the previous */ /* word read */ static NODE PCCRAP *hash_table; static int first_time = TRUE; static int this_char = '?', lastch; static NODE PCCRAP *dawg; static int FILE_ENDED = FALSE; /* I'm having real problems getting merged dawgs to work on the PC; this is yet another filthy hack. Sorry. */ static INDEX nwords, nnodes, total_edges; #ifdef SMALL_MEMORY #define MAX_SUBDAWG 30000 /* Big enough for largest dict, even on small systems */ static NODE PCCRAP *merge; static INDEX dawgsize[256]; static INDEX dawgstart[256]; static INDEX dawgentry[256]; static int nextslot; /* Dawgs are packed in sequentially - not in their 'proper' indexed position */ #endif /* Shorthand macros for array accesses with checking If RCHECK isn't defined, these don't contribute any overhead. I suggest leaving RCHECK on except for production code. */ #define EDGE_LIST(idx) \ edge_list[RANGECHECK(idx, MAX_CHARS)] #define CURRENT_WORD(idx) \ current_word[RANGECHECK(idx, MAX_LINE+1)] #define DAWG(idx) \ dawg[RANGECHECK(idx, MAX_EDGES)] #define HASH_TABLE(idx) \ hash_table[RANGECHECK(idx, HASH_TABLE_SIZE)] /* Forward references */ #ifdef PROTOTYPES static INDEX build_node(int depth); static INDEX add_node(NODE *edge_list, INDEX nedges); static void read_next_word(VOID); static INDEX compute_hashcode(NODE *edge_list, INDEX nedges); static void report_size(VOID); #else static INDEX build_node(); static INDEX add_node(); static void read_next_word(); static INDEX compute_hashcode(); static void report_size(); #endif #ifdef SMALL_MEMORY #ifdef PROTOTYPES static void merge_dawg (char *filename); #else static void merge_dawg (); #endif #endif /** * main * * Program entry point **/ long words; /* dirty communication variable -- the multi-pass stuff was hacked in at the last minute. */ int #ifdef PROTOTYPES main(int argc, char **argv) #else main(argc, argv) int argc; char **argv; #endif { INDEX i; char fname[128]; #ifdef SYS_MAC argc = ccommand(&argv); #endif if (argc != 3) { fprintf(stderr, "usage: dawg dictfile.ext dawgfile\n"); exit(EXIT_ERROR); } /** * Allocate the memory for the dawg **/ if ((dawg = MALLOC(MAX_EDGES, sizeof(NODE PCCRAP *))) == NULL) { fprintf(stderr, "Can\'t allocate dictionary memory\n"); #ifndef SMALL_MEMORY fprintf(stderr, "-- try recompiling with -DSMALL_MEMORY\n"); #endif exit(EXIT_ERROR); } for (i = 0; i < (INDEX)MAX_EDGES; i++) dawg[i] = 0L; /** * Allocate the hash table. * Fill it with zeroes later just before use. Don't trust calloc etc. * - not portable enough. Anyway, in the multi-pass version we don't * want to continually free/claim... **/ if ((hash_table = MALLOC((HASH_TABLE_SIZE+1), sizeof(NODE))) == NULL) { fprintf(stderr, "Can\'t allocate memory for hash table\n"); exit(EXIT_ERROR); } /** * Open the input/output files **/ fpin = fopen(argv[1], READ_TEXT); if (fpin == NULL) { fprintf(stderr, "Can\'t open text file \"%s\"\n", argv[1]); /* Could print out error string but it's not portable... */ exit(EXIT_ERROR); } /** * Read the first word from the dictionary **/ first_time = TRUE; nwords = 0; current_word[0] = 0; read_next_word(); lastch = *current_word; /** * Initialise the counters, taking account of the root node (which is * a special case) **/ nnodes = 1; total_edges = MAX_CHARS; /** * Build the dawg and report the outcome **/ /* Now, in the dim & distant past, this code supported the concept of a restricted character set - ie a..z & A..Z were packed into 6 bits; this caused awful problems in the loop below, where we had to try to keep the loop-control variable and the character code in synch; nowadays chars are 8 bits or else, so I'm starting to tidy up the places where these hacks were necessary... */ #ifdef SMALL_MEMORY for (this_char = 0; this_char < MAX_CHARS; this_char++) { if (FILE_ENDED) break; /* Don't waste time in this loop... */ #endif /* Explicitly initialise hash table to all zeros */ {INDEX a; for (a = 0; a <= HASH_TABLE_SIZE; a++) hash_table[a] = 0;} words = 0; (void) build_node(0); #ifdef SMALL_MEMORY #ifdef DEBUG fprintf(stderr, "Char %d done. %ld Words, %ld Nodes\n", *current_word, words, total_edges); #endif if (total_edges /* words */ == 0) continue; #endif /** * Save the dawg to file **/ #ifdef SMALL_MEMORY sprintf(fname, "%s%c%d", argv[2], EXT_CHAR, lastch); #else sprintf(fname, "%s%cdwg", argv[2], EXT_CHAR); #endif fpout = fopen(fname, WRITE_BIN); if (fpout == NULL) { fprintf(stderr, "Can\'t open output file \"%s\"\n", fname); exit(EXIT_ERROR); } #ifdef DEBUG fprintf(stderr, "Writing to %s\n", fname); #endif *dawg = total_edges; total_edges = sizeof(NODE) * (total_edges + 1); /* Convert to byte count */ { long cnt; if ((cnt = putwords(dawg, (long)total_edges, fpout)) != total_edges) { fprintf(stderr, "%ld bytes written instead of %ld\n.", cnt, total_edges); exit(EXIT_ERROR); } } fclose(fpout); /** * Read the first word from the dictionary **/ first_diff = save_first_diff; first_time = FALSE; nwords = 0; /** * Initialise the counters, taking account of the root node (which is * a special case) **/ nnodes = 1; total_edges = MAX_CHARS; lastch = *current_word; /** * Build the dawg and report the outcome **/ #ifdef SMALL_MEMORY } #endif fclose(fpin); fprintf(stderr, "Dawg generated\n"); #ifdef SMALL_MEMORY merge_dawg(argv[2]); #endif exit(EXIT_OK); } /** * BUILD_NODE * * Recursively build the next node and all its sub-nodes **/ static INDEX #ifdef PROTOTYPES build_node(int depth) #else build_node(depth) int depth; #endif { INDEX nedges = 0; INDEX i; NODE *edge_list; edge_list = NULL; if (CURRENT_WORD(depth) == 0) { /** * End of word reached. If the next word isn't a continuation of the * current one, then we've reached the bottom of the recursion tree. **/ read_next_word(); if (first_diff < depth) return(0); } edge_list = (NODE *)malloc(MAX_CHARS*sizeof(NODE)); /* Note this is a 'near' array */ if (edge_list == NULL) { fprintf(stderr, "Stack full (depth %d)\n", depth); exit(EXIT_ERROR); } for (i = 0; i < MAX_CHARS; i++) EDGE_LIST(i) = 0L; /** * Loop through all the sub-nodes until a word is read which can't * be reached via this node **/ do { /* Construct the edge. Letter.... */ EDGE_LIST(nedges) = (NODE) (((NODE)CURRENT_WORD(depth))) << (NODE)V_LETTER; /* ....end-of-word flag.... */ if (CURRENT_WORD(depth+1L) == 0) EDGE_LIST(nedges) |= M_END_OF_WORD; /* ....and node pointer. */ EDGE_LIST(nedges) |= build_node(depth+1); nedges++; /* (don't ++ in a macro) */ } while (first_diff == depth); if (first_diff > depth) { fprintf(stderr, "Internal error -- first_diff = %d, depth = %d\n", first_diff, depth); exit(EXIT_ERROR); } EDGE_LIST(nedges-1) |= M_END_OF_NODE; /* Flag the last edge in the node */ /** * Add the node to the dawg **/ if (depth) { NODE result = add_node(edge_list, nedges); free(edge_list); return(result); } /** * depth is zero, so the root node (as a special case) goes at the start **/ edge_list[MAX_CHARS-1] |= M_END_OF_NODE; /* For backward searches */ for (i = 0; i < MAX_CHARS; i++) { dawg[i+1] = edge_list[i]; } free(edge_list); return(0); } /** * ADD_NODE * * Add a node to the dawg if it isn't already there. A hash table is * used to speed up the search for an identical node. **/ static INDEX #ifdef PROTOTYPES add_node(NODE *edge_list, INDEX nedges) #else add_node(edge_list, nedges) NODE *edge_list; INDEX nedges; #endif { NODE hash_entry; INDEX inc; INDEX a, first_a; INDEX i; /** * Look for an identical node. A quadratic probing algorithm is used * to traverse the hash table. **/ first_a = compute_hashcode(edge_list, nedges); first_a = ABS(first_a) MOD HASH_TABLE_SIZE; a = first_a; inc = 9; for (;;) { hash_entry = HASH_TABLE(a) & M_NODE_POINTER; if (hash_entry == 0) break; /* Node not found, so add it to the dawg */ for (i = 0; i < nedges; i++) if (DAWG((hash_entry+i) MOD HASH_TABLE_SIZE) != EDGE_LIST(i)) break; /* On the 1.6M dictionary, this gave a rangecheck with < 0. Now fixed I think - it was a problem with MOD giving unexpected results. */ if (i == nedges) { return(hash_entry); /* Node found */ } /** * Node not found here. Look in the next spot **/ a += inc; inc += 8; if (a >= HASH_TABLE_SIZE) a -= HASH_TABLE_SIZE; if (inc >= HASH_TABLE_SIZE) inc -= HASH_TABLE_SIZE; if (a == first_a) { fprintf(stderr, "Hash table full\n"); exit(EXIT_ERROR); } } /** * Add the node to the dawg **/ if (total_edges + nedges >= MAX_EDGES) { fprintf(stderr, "Error -- dictionary full - total edges = %ld\n", total_edges); exit(EXIT_ERROR); } HASH_TABLE(a) |= total_edges + 1; nnodes++; for (i = 0; i < nedges; i++) { DAWG((total_edges + 1 + i) MOD HASH_TABLE_SIZE) = EDGE_LIST(i); } total_edges += nedges; return(total_edges - nedges + 1); } /** * READ_NEXT_WORD * * Read the next word from the input file, setting first_diff accordingly **/ static void #ifdef PROTOTYPES read_next_word(VOID) #else read_next_word() #endif { /* This stuff imposes the limitation of not allowing '\0' in a word; not yet a problem but the dawg structure itself could probably cope if the feature were wanted. (Maybe with a little teweaking) */ char linebuff[(MAX_LINE+1)]; int length; for (;;) { int next = 0, c; for (;;) { c = fgetc(fpin); if (FILE_ENDED || c == EOF || ferror(fpin) || feof(fpin)) { /* for some reason, we always get a blank line at the end of files */ current_word[0] = 0; first_diff = -1; linebuff[next] = '\0'; FILE_ENDED = TRUE; return; } c &= 255; if (next == 0 && c == '\n') continue; /* skip blank lines... */ linebuff[next++] = c; if (c == '\n') break; } linebuff[next] = '\0'; words++; length = strlen(linebuff); if (linebuff[length-1] == '\n') linebuff[length-1] = '\0'; if (linebuff[length] == '\n') linebuff[length] = '\0'; if (length < 2 || length > MAX_LINE-1) { fprintf(stderr, "\n%s - invalid length\n", linebuff); continue; /* Previously exit()ed, but now ignore so that test sites without my pddict can use /usr/dict/words */ } break; } for (length = 0; current_word[length] == linebuff[length]; length++) { /* Get common part of string to check order */ } if (current_word[length] > linebuff[length]) { fprintf(stderr, "Error -- %s (word out of sequence)\n", linebuff); exit(EXIT_ERROR); } first_diff = length; nwords++; strcpy(current_word, linebuff); if ((nwords > 1) && (!(nwords & 0x3FF))) report_size(); #ifdef SMALL_MEMORY if (current_word[0] != lastch) { save_first_diff = first_diff; first_diff = -1; report_size(); } #else this_char = current_word[0]; /* for diagnostics... */ #endif } /** * COMPUTE_HASHCODE * * Compute the hash code for a node **/ static INDEX #ifdef PROTOTYPES compute_hashcode(NODE *edge_list, INDEX nedges) #else compute_hashcode(edge_list, nedges) NODE *edge_list; INDEX nedges; #endif { INDEX i; INDEX res = 0L; for (i = 0; i < nedges; i++) res = ((res << 1) | (res >> 31)) EOR EDGE_LIST(i); return(res); } /** * REPORT_SIZE * * Report the current size etc **/ static void #ifdef PROTOTYPES report_size(VOID) #else report_size() #endif { if (first_time) { fprintf(stderr, " Words Nodes Edges Bytes BPW\n"); fprintf(stderr, " ----- ----- ----- ----- ---\n"); first_time = FALSE; } if (*current_word) fprintf(stderr, "%c:", *current_word); else fprintf(stderr, "Total:"); /* THE FOLLOWING IS RATHER GRATUITOUS USE OF FLOATING POINT - REMOVE IT AND REPLACE WITH INTEGER ARITHMETIC * 100 */ /* (hey - I already did this in the copy I sent to Richard; so how come its missing? Oh no, not again: I've got out of synch and used an old copy, haven't I? :-( ) */ fprintf(stderr, " %7ld %7ld %7ld %7ld %5.2f\n", nwords, nnodes, total_edges, sizeof(NODE PCCRAP *)*(total_edges+1), (float)(sizeof(NODE PCCRAP *)*(total_edges+1))/(float)nwords); } #ifdef SMALL_MEMORY static void #ifdef PROTOTYPES merge_dawg (char *filename) #else merge_dawg (filename) char *filename; #endif { FILE *fp, *outfile; NODE data, edge; INDEX nedges, nextfree, i, dentry; INDEX count, lastnode; int dictch, padding; char fname[128]; nedges = (INDEX)MAX_SUBDAWG; if ((merge = MALLOC((SIZE_T)nedges, sizeof(INDEX))) == 0) { fprintf(stderr, "Memory allocation error -- %ld wanted\n", nedges*sizeof(INDEX)); exit(EXIT_ERROR); } nextfree = MAX_CHARS; /* 0 is special, 1..26 are root nodes for a..z */ nextslot = 0; for (dictch = 0; dictch < MAX_CHARS; dictch++) { /*** * Open the file and find out the size of the dawg ***/ sprintf(fname, "%s%c%d", filename, EXT_CHAR, dictch); if ((fp = fopen(fname, READ_BIN)) == NULL) { continue; } nedges = getword(fp); dawgstart[nextslot] = nextfree; dawgsize[nextslot] = nedges-MAX_CHARS; /* the first entry is (erroneously) the pointer to the chunk */ dentry = getword(fp); dawgentry[nextslot] = dentry - MAX_CHARS + nextfree; nextfree += nedges - MAX_CHARS; nextslot++; fclose(fp); } /* Now output total edges, and starts[a..z] */ /* Then set nextfree to start of each block */ sprintf(fname, "%s%cdwg", filename, EXT_CHAR); outfile = fopen(fname, WRITE_BIN); if (outfile == NULL) { fprintf(stderr, "Cannot open output dawg file %s\n", fname); exit(EXIT_ERROR); } putword(nextfree, outfile); nextfree = 1; nextslot = 0; padding = 0; lastnode = MAX_CHARS-1; for (;;) { if (dawgentry[lastnode] != 0) break; /* Leave with 'lastnode' set */ lastnode -= 1; } for (dictch = 0; dictch < MAX_CHARS; dictch++) { NODE edge = dawgentry[nextslot]; if (edge == 0) { padding++; continue; } if (dictch == lastnode) { edge |= M_END_OF_NODE; } else { edge &= (NOT M_END_OF_NODE); } putword(edge, outfile); nextfree++; nextslot++; } nextfree += padding; while (padding > 0) { putword(0L, outfile); padding -= 1; } /* nextslot = 0; ???? This one not used? */ for (dictch = 0; dictch < MAX_CHARS; dictch++) { sprintf(fname, "%s%c%d", filename, EXT_CHAR, dictch); if ((fp = fopen(fname, READ_BIN)) == NULL) { continue; } nedges = getword(fp); for (i = 0; i < MAX_CHARS; i++) { (void) getword(fp); /* Skip root pointers */ } nedges -= MAX_CHARS; count = getwords(&merge[1], (long)(nedges*sizeof(NODE)), fp); if (count != nedges*sizeof(NODE)) { fprintf(stderr, "Dictionary read error - %ld wanted - %ld got\n", nedges*sizeof(NODE), count); exit(EXIT_ERROR); } fclose(fp); DELETE(fname); /* On floppy systems, we can almost get away with little more space than the final files would take! */ /* relocate all nodes */ for (i = 1; i <= nedges; i++) { data = merge[i]; edge = data & M_NODE_POINTER; if (edge > MAX_CHARS) { data &= (NOT M_NODE_POINTER); data |= edge - MAX_CHARS - 1 + nextfree; merge[i] = data; } putword(merge[i], outfile); } nextfree += nedges; /* nextslot++; this one not used??? */ } fclose(outfile); FREE(merge); } #endif SHAR_EOF cat - << \SHAR_EOF > dwgcheck.c /* File: dwgcheck.c Author: Graham Toal Purpose: check correct spelling using dict.dwg Creation date: 22/06/90 Lastedit: 23/06/90 01:10:12 Description: This can be expanded to be like the unix 'spell' command. This demo file simply checks words passed on the command line. note how it remembers words so that the second time it sees them, it will call them correct. This is how you should implement the 'ignore' feature of an interactive checker. This code uses the dawg data structure, which I recommend you stick with; however for *extremely* fast checking in special circumstances you can use the 'pack' versions of check() et al. */ /* Manadatory header files */ #include <stdio.h> #include "dawg.h" #include "grope.h" /*#define RCHECK*/ /* Enable for internal range checks... */ #include "utils.c" /* Headers here as needed on per-program basis */ #include <ctype.h> /* eg, for isalpha() */ /* Spelling library utilities */ #include "init.c" /* Loading dicts */ #include "dyntrie.c" /* Creating dicts at run-time */ #include "print.c" /* Printing dicts */ #include "check.c" /* Checking words */ #include "locate.c" /* Finding words by their stems */ #include "soundex.c" /* Soundex algorithm for word-likeness comparison */ #include "similcmp.c" /* Closeness-metric for correction */ #include "correct.c" /* Hack code to attempt error correction (uses above) */ /* Let's be friendly to these guys... */ #ifdef SYS_MAC /* To compile with THINK C 4.0, place all the relevant .h and .c files in a folder. Then create a project which contains this main.c and the libraries unix and ANSI. */ #include <unix.h> #include <stdlib.h> #include <console.h> #endif /* Your own header files go here, for instance: #include "readword.h" */ int #ifdef PROTOTYPES main(int argc, char **argv) #else main(argc, argv) int argc; char **argv; #endif { NODE PCCRAP *dawg; /* precompiled dictionary (dict.dwg) */ INDEX edges; /* size of above */ NODE PCCRAP *userdict; /* run-time dictionary, added to dynamically */ char *word; int each; #ifdef SYS_MAC argc = ccommand(&argv); /* Mac users have my sympathy */ #endif /* Your program goes here... */ if (argc == 1) { fprintf(stderr, "usage: %s possibly mispeled wurdz\n", argv[0]); exit(EXIT_ERROR); } if (!dawg_init("", &dawg, &edges)) exit(EXIT_ERROR); if (!trie_new(&userdict)) exit(EXIT_ERROR); for (each = 1; each < argc; each++) { word = argv[each]; fprintf(stderr, "* %s:\n", word); if (dawg_check(dawg, word) || dawg_check(userdict, word)) { fprintf(stderr, "Correct\n"); } else { fprintf(stderr, "Wrong\n"); (void)trie_add(&userdict, word); } if (each+1 != argc) fprintf(stderr, "\n"); } exit(EXIT_OK); } SHAR_EOF cat - << \SHAR_EOF > pack.c /* The line below is to protect against Bitnet mailer damage */ #define MOD % /* This should be a Percent (for C Modulus) */ /* Blech. This file is a mess. Make it the first rewrite... */ #include "copyr.h" /*****************************************************************************/ /* */ /* FILE : PACK.C */ /* */ /* Re-pack a dawg/trie data structure using Franklin Liang's */ /* algorithm for sparse matrix storage. Final structure allows */ /* direct indexing into trie nodes, so is exceedingly fast at checking. */ /* Unfortunately the trade off is that any algorithms which walk */ /* the data structure and generate words will take much longer */ /* */ /* PACK <dawg file (inc .ext)> <pack file (inc .ext)> */ /* */ /*****************************************************************************/ /* Pending: see what closest gap between dawgptr + freenode is, and see whether we can save space by overlapping input & output arrays with a window between them. Should get almost 50% of memory back. Also, because of hacking around a bug some time back, I'm using an extra array (new_loc) for relocation of pointers, when in fact I could (and have in the past) managed to relocate them in situ with not much extra overhead. As I said, it needs a rewrite... */ /* Note: I tried one implementation of this which used bitsets to test whether two nodes were compatible. In fact, it wasn't sufficiently faster to justify the extra space it used for the arrays of flags. Now I check for compatibility on the fly with lots of comparisons. I do however have a seperate byte array to flag whether a trie is based at any address. There's probably a way of removing this. */ #include "grope.h" #ifndef grope_h /* If you don't have grope.h, create an empty one. These will do for a basic system: */ #undef KNOWS_VOID #undef STDLIBS #undef PROTOTYPES #define SYS_ANY 1 #define COMPILER_ANY 1 #define SMALL_MEMORY 1 /* To be defined if you have to generate the data structure in bits. This will certainly be true for any non-trivial dictionary on MSDOS, or most home micros with 1Mb Ram or under. */ #endif #include <stdio.h> /*#define RCHECK*/ /* Turn this back on if you have any problems. */ #include "dawg.h" #include "utils.c" #include "init.c" #include "print.c" #ifdef SYS_MAC /* To compile with THINK C 4.0, place the files dawg.h grope.h, pack.c and utils.c in a folder. Create a project that contains pack.c and the libraries unix and ANSI. */ #include <unix.h> #include <stdlib.h> #include <console.h> /* Assume a 1Mb mac for the moment. Someday I'll add dynamic RAM sizing */ #define SMALL_MEMORY #endif #define TRUE (0==0) #define FALSE (0!=0) #define MAX_WORD_LENGTH 32 /* These two magic numbers alter a very hacky heuristic employed in the packing algorithm. Tweaking them judiciously ought to speed it up significantly (at the expense of a slightly sparser packing */ #define DENSE_LOWER 100 #define DENSE_UPPER 200 /*###########################################################################*/ INDEX ptrie_size = 0; static NODE PCCRAP *ptrie; #ifdef RCHECK /* can't use the standard range_check macro because these are slightly non-standard. */ #define PTRIE(x) ptrie[RANGECHECK((x), ptrie_size)] #define DATA(x) (((x) >> 12) == 0xf2f1 ? toosmall((x), 0, __LINE__) : (x)) #else /* so supply an explicit base case */ #define PTRIE(x) ptrie[x] #define DATA(x) (x) #endif static char PCCRAP *trie_at; /* save some time by caching this info -- previously it was a function called on each test */ static INDEX freenode, lowest_base = 1, highest_base = -1; static int debug = FALSE; int #ifdef PROTOTYPES compatible(NODE *node, INDEX i) /* Can a node be overlaid here? */ #else compatible(node, i) NODE *node; INDEX i; /* Can a node be overlaid here? */ #endif { int c; for (c = 0; c < MAX_CHARS; c++) { if ((node[c]&M_FREE) == 0) { /* Something to slot in... */ if ((PTRIE(i+c) & M_FREE) == 0) return(FALSE); /* but no slot for it */ } } return(TRUE); } INDEX #ifdef PROTOTYPES find_slot(NODE *node) #else find_slot(node) NODE *node; #endif { /* Try each position until we can overlay this node */ INDEX i; for (i = lowest_base; i < freenode; i++) { if ((!trie_at[i]) && (compatible(node, i))) { /* nothing here already */ /* and this one will fit */ return(i); } } fprintf(stderr, "Should never fail to find a slot!\n"); /* because the output array is larger than the input... */ exit(5); /* NOT REACHED */ return(0); } static int changes; INDEX #ifdef PROTOTYPES pack(NODE *node) #else pack(node) NODE *node; #endif { int c; INDEX slot; NODE value; slot = find_slot(node); /* slot is also returned as the result, to be used in relocation */ /* Place node at slot */ changes = 0; for (c = 0; c < MAX_CHARS; c++) { value = node[c]; if ((value&M_FREE) == 0) { /* Something to fit? */ if (((value>>V_LETTER)&M_LETTER) == (INDEX)c+BASE_CHAR) { /* Just a consistency check - could safely be removed */ PTRIE(slot+c) = DATA(value); changes++; } } } /* The hack heuristics below keep a N^2 operation down to linear time! */ if ((slot == lowest_base) || (slot >= lowest_base+DENSE_LOWER)) { /* Heuristic is: we increase the initial search position if the array is packed solid up to this point or we're finding it *very* hard to find suitable slots */ int found = FALSE; for (;;) { INDEX c; while (trie_at[lowest_base]) lowest_base++; for (c = lowest_base; c < lowest_base+MAX_CHARS; c++) { if ((PTRIE(c)&M_FREE) != 0) {found = TRUE; break;} } if (found && (slot < lowest_base+DENSE_UPPER)) break; /* ^ Skip hard slots to fill */ lowest_base++; /* although nothing is based here, the next N slots are filled with data from the last N tries. */ /* Actually I'm not sure if 256 sequential trees each with the same element used would actually block the next 256 slots without a trie_at[] flag being set for them. However it does no harm to believe this... I must formally check this someday once all the other stuff is in order. */ } } if (slot > highest_base) highest_base = slot; /* This is info for when we try to overlap input & output arrays, -- with the output sitting some large number of bytes lower down than the input. */ trie_at[slot] = TRUE; return(slot); } /*###########################################################################*/ static NODE PCCRAP *dawg; static INDEX PCCRAP *new_loc; static INDEX nedges; NODE this_node[MAX_CHARS]; INDEX #ifdef PROTOTYPES take_node(INDEX ptr) #else take_node(ptr) INDEX ptr; #endif { NODE data; INDEX edge; int let; int endsword; int endsnode; char ch; int changes = 0; for (let = 0; let < MAX_CHARS; let++) this_node[let] = M_FREE; for (;;) { data = dawg[ptr++]; if (data == 0) { return(-1); } else { endsword = ((data & M_END_OF_WORD) != 0); endsnode = ((data & M_END_OF_NODE) != 0); edge = data & M_NODE_POINTER; let = (int) ((data >> V_LETTER) & M_LETTER); ch = let + BASE_CHAR; this_node[let] = edge & M_NODE_POINTER; if (endsword) this_node[let] |= M_END_OF_WORD; this_node[let] |= (NODE)ch<<V_LETTER; changes++; if (endsnode) break; } } if (changes != 0) { return(ptr); } else { fprintf(stderr, "Fatal error\n"); return(0); } } NODE #ifdef PROTOTYPES redirect_node(NODE ptr) #else redirect_node(ptr) NODE ptr; #endif { NODE data; INDEX edge; int endsword; char ch; data = DATA(PTRIE(ptr)); endsword = ((data & M_END_OF_WORD) != 0); edge = data & M_NODE_POINTER; ch = (int) ((data >> V_LETTER) & M_LETTER); /*edge = dawg[edge] & M_NODE_POINTER;*/ edge = new_loc[edge]; ptr = edge; ptr |= (NODE)ch<<V_LETTER; if (endsword) ptr |= M_END_OF_WORD; return(ptr); } int #ifdef PROTOTYPES main(int argc, char **argv) #else main(argc, argv) int argc; char **argv; #endif { INDEX dawgptr = 1, trieptr, newdawgptr, i, nodes = 0; FILE *triefile; #ifdef SYS_MAC argc = ccommand(&argv); #endif if (argc != 3) { fprintf(stderr, "pack: wrong parameters - should be infile outfile\n"); exit(EXIT_ERROR); } if (!dawg_init((argc >= 2 ? argv[1]: ""), &dawg, &nedges)) exit(EXIT_ERROR); if (debug) dawg_print(dawg, (INDEX)ROOT_NODE); /* assume small test file! */ freenode = ((nedges*16)/15)+(4*MAX_CHARS); /* Minimal slop for pathological packing? */ ptrie_size = freenode; ptrie = MALLOC((SIZE_T)freenode, sizeof(NODE)); if (ptrie == NULL) { fprintf(stderr, "Cannot claim %ld bytes for ptrie[]\n", sizeof(NODE)*freenode); exit(EXIT_ERROR); } new_loc = MALLOC((SIZE_T)freenode, sizeof(NODE)); if (new_loc == NULL) { fprintf(stderr, "Cannot claim %ld bytes for new_loc[]\n", sizeof(NODE)*freenode); exit(EXIT_ERROR); } trie_at = (char PCCRAP *)MALLOC((SIZE_T)freenode, sizeof(char)); if (trie_at == NULL) { fprintf(stderr, "Cannot claim %ld bytes for trie_at[]\n", freenode); exit(EXIT_ERROR); } for (i = 0; i < freenode; i++) { ptrie[i] = M_FREE; new_loc[i] = 0; trie_at[i] = FALSE; } dawg[0] = 0; /* 1st entry is never looked at, and maps to itself */ dawgptr = 1; /* Relocate initial node at 1 seperately */ newdawgptr = take_node(dawgptr /* 1 */); trieptr = pack(this_node); /*dawg[dawgptr] = trieptr;*/ /* What the hell does this do??? I've forgotten!!! - oh yes, this was relocating in situ, without new_loc... */ new_loc[dawgptr] = trieptr; dawgptr = MAX_CHARS+1; {INDEX maxdiff = 0, diff; for (;;) { if (highest_base > dawgptr) { diff = highest_base - dawgptr; if (diff > maxdiff) maxdiff = diff; } newdawgptr = take_node(dawgptr); if (newdawgptr == -1) { dawgptr++; continue; } trieptr = pack(this_node); /*dawg[dawgptr] = trieptr;*/ new_loc[dawgptr] = trieptr; dawgptr = newdawgptr; if (dawgptr > nedges) { break; /* AHA!!! I was doing this in the wrong order & lost last entry! *AND* had '>=' for '>' */ } nodes++; if ((nodes MOD 1000) == 0) fprintf(stderr, "%ld nodes\n", nodes); } if (debug) fprintf(stderr, "wavefront gets up to %ld\n", maxdiff); } if (debug) fprintf(stderr, "All packed - used %ld nodes\n", highest_base); for (trieptr = 1; trieptr < freenode; trieptr++) { /* extract edge from ptrie[trieptr], look it up via dawg[edge], slot it back in (while preserving other bits) */ PTRIE(trieptr) = redirect_node(trieptr); } /* Finally, remember to bump up highest_base in case last node is only one edge and 25 others are missing! */ if (debug) fprintf(stderr, "Redirected...\n"); triefile = fopen((argc >= 3 ? argv[2] : DEFAULT_PACK), WRITE_BIN); if (triefile == NULL) { fprintf(stderr, "Cannot write to packed trie file\n"); exit(1); } if (debug) fprintf(stderr, "File opened...\n"); ptrie[0] = highest_base+MAX_CHARS-1;/* File size in words (-1 because doesn't include itself) */ if (debug) fprintf(stderr, "Dumping... (0..%ld)\n", highest_base+MAX_CHARS-1); for (i = 0; i < highest_base+MAX_CHARS; i++) {/* dump packed DAG */ NODE n; n = DATA(PTRIE(i)); putword(n, triefile); if (ferror(triefile)) { fprintf(stderr, "*** TROUBLE: Writing DAG -- disk full?\n"); fclose(triefile); exit(1); } } if (debug) fprintf(stderr, "Dumped...\n"); fclose(triefile); if (debug) fprintf(stderr, "Done.\n"); } SHAR_EOF cat - << \SHAR_EOF > pckcheck.c /* File: pckcheck.c Author: Graham Toal Purpose: check correct spelling using dict.pck Creation date: 22/06/90 Lastedit: 23/06/90 01:15:39 Description: This can be expanded to be like the unix 'spell' command. This demo file simply checks words passed on the command line. note how it remembers words so that the second time it sees them, it will call them correct. This is how you should implement the 'ignore' feature of an interactive checker. This version used the fast 'packed' data structure. This is approximately 3 times faster, but not all utilities support the packed versions. Also utilities which walk the trie are considerably slower (say by a factor of 100) -- so chose when to used 'dawg' and when to use 'pack'ed versions carefully! */ /* Manadatory header files */ #include <stdio.h> #include "dawg.h" #include "grope.h" /*#define RCHECK*/ /* Enable for internal range checks... */ #include "utils.c" /* Headers here as needed on per-program basis */ #include <ctype.h> /* eg, for isalpha() */ /* Spelling library utilities */ #include "init.c" /* Loading dicts */ #include "dyntrie.c" /* Creating dicts at run-time */ #include "print.c" /* Printing dicts */ #include "check.c" /* Checking words */ #include "locate.c" /* Finding words by their stems */ #include "soundex.c" /* Soundex algorithm for word-likeness comparison */ #include "similcmp.c" /* Closeness-metric for correction */ #include "correct.c" /* Hack code to attempt error correction (uses above) */ /* Let's be friendly to these guys... */ #ifdef SYS_MAC /* To compile with THINK C 4.0, place all the relevant .h and .c files in a folder. Then create a project which contains this main.c and the libraries unix and ANSI. */ #include <unix.h> #include <stdlib.h> #include <console.h> #endif /* Your own header files go here, for instance: #include "readword.h" */ int #ifdef PROTOTYPES main(int argc, char **argv) #else main(argc, argv) int argc; char **argv; #endif { NODE PCCRAP *ptrie; /* precompiled dictionary (dict.pck) */ INDEX edges; /* size of above */ NODE PCCRAP *userdict; /* run-time dictionary, added to dynamically */ char *word; int each; #ifdef SYS_MAC argc = ccommand(&argv); /* Mac users have my sympathy */ #endif /* Your program goes here... */ if (argc == 1) { fprintf(stderr, "usage: %s possibly mispeled wurdz\n", argv[0]); exit(EXIT_ERROR); } if (!pack_init("", &ptrie, &edges)) exit(EXIT_ERROR); if (!trie_new(&userdict)) exit(EXIT_ERROR); for (each = 1; each < argc; each++) { word = argv[each]; fprintf(stderr, "* %s:\n", word); if (pack_check(ptrie, word) || dawg_check(userdict, word)) { fprintf(stderr, "Correct\n"); } else { fprintf(stderr, "Wrong\n"); (void)trie_add(&userdict, word); } if (each+1 != argc) fprintf(stderr, "\n"); } exit(EXIT_OK); } SHAR_EOF cat - << \SHAR_EOF > pdawg.c /* Manadatory headers */ /*#define RCHECK*/ /*#define DEBUG*/ #include <stdio.h> #include "dawg.h" #include "grope.h" #include "utils.c" /* Headers here as needed on per-program basis */ #include <ctype.h> /* for isalpha() */ /* Spelling library utilities */ #include "init.c" #include "print.c" /* #include "check.c" #include "locate.c" #include "similcmp.c" #include "soundex.c" #include "correct.c" */ /*#include "dyntrie.c"*/ int #ifdef PROTOTYPES main(int argc, char **argv) #else main(argc, argv) int argc; char **argv; #endif { NODE PCCRAP *dawg; INDEX nedges; if (!dawg_init((argc == 1 ? "" : argv[1]), &dawg, &nedges)) exit(EXIT_ERROR); dawg_print(dawg, (INDEX)ROOT_NODE); fprintf(stderr, "Finished printing dawg\n"); exit(0); } SHAR_EOF cat - << \SHAR_EOF > ppack.c /* Manadatory headers */ #include <stdio.h> #include "dawg.h" #include "grope.h" #include "utils.c" /* Headers here as needed on per-program basis */ #include <ctype.h> /* for isalpha() */ /* Spelling library utilities */ #include "init.c" #include "print.c" /* #include "check.c" #include "locate.c" #include "similcmp.c" #include "soundex.c" #include "correct.c" */ /*#include "dyntrie.c"*/ int #ifdef PROTOTYPES main(int argc, char **argv) #else main(argc, argv) int argc; char **argv; #endif { NODE PCCRAP *ptrie; INDEX nedges; if (!pack_init((argc == 1 ? "": argv[1]), &ptrie, &nedges)) exit(EXIT_ERROR); pack_print(ptrie, (INDEX)ROOT_NODE); exit(0); } SHAR_EOF cat - << \SHAR_EOF > proot.c /* File: proot.c Author: Graham Toal Purpose: find all words starting with 'root' Creation date: 22/06/90 Lastedit: 22:24:32 Description: some spelling programs remove characters from the end of wrongly spelt words one by one until the resulting root is found to be a prefix of other words in the dictionary. This works because the assumption is made that the word was in fact correct, but was an inflected form of a word in the dictionary which had not been stored. */ /* Manadatory header files */ #include <stdio.h> #include "dawg.h" #include "grope.h" #include "utils.c" /* Headers here as needed on per-program basis */ #include <ctype.h> /* eg, for isalpha() */ /* Spelling library utilities */ #include "init.c" /* Loading dicts */ /*#include "dyntrie.c"*/ /* Creating dicts at run-time */ #include "print.c" /* Printing dicts */ /*#include "check.c"*/ /* Checking words */ #include "locate.c" /* Finding words by their stems */ /*#include "soundex.c"*/ /* Soundex algorithm for word-likeness comparison */ /*#include "similcmp.c"*//* Closeness-metric for correction */ /*#include "correct.c"*/ /* Code to attempt error correction (uses above) */ /* Let's be friendly to these guys... */ #ifdef SYS_MAC /* To compile with THINK C 4.0, place all the relevant .h and .c files in a folder. Then create a project which contains this main.c and the libraries unix and ANSI. */ #include <unix.h> #include <stdlib.h> #include <console.h> #endif /* Your own header files go here, for instance: #include "readword.h" */ int #ifdef PROTOTYPES main(int argc, char **argv) #else main(argc, argv) int argc; char **argv; #endif { NODE PCCRAP *dawg; INDEX root; INDEX edges; int each; #ifdef SYS_MAC argc = ccommand(&argv); /* Mac users have my sympathy */ #endif /* Your program goes here... */ if (argc == 1) { fprintf(stderr, "usage: %s part\n", argv[0]); exit(EXIT_ERROR); } if (!dawg_init("", &dawg, &edges)) exit(EXIT_ERROR); for (each = 1; each < argc; each++) { fprintf(stderr, "* %s:\n", argv[each]); root = dawg_locate_prefix(dawg, argv[each], ROOT_NODE); if (root != ROOT_NODE) { dawg_print_prefix(dawg, argv[each], root); } else { fprintf(stderr, "(none found)\n"); } if (each+1 != argc) fprintf(stderr, "\n"); } exit(EXIT_OK); } SHAR_EOF cat - << \SHAR_EOF > sample.c /* Dummy program for writing word utilities. If you stick to the style shown here, your programs will remain portable across a vast range of machines. Skeleton file by Graham Toal. Remove this header when you've added your own... (If you want to see some worked examples, try tell.c dwgcheck.c pckcheck.c) */ /* File: Author: Purpose: Creation date: Lastedit: Description: */ /* Manadatory header files */ #include <stdio.h> #include "dawg.h" #include "grope.h" #include "utils.c" /* Headers here as needed on per-program basis */ #include <ctype.h> /* eg, for isalpha() */ /* Spelling library utilities */ #include "init.c" /* Loading dicts */ #include "dyntrie.c" /* Creating dicts at run-time */ #include "print.c" /* Printing dicts */ #include "check.c" /* Checking words */ #include "locate.c" /* Finding words by their stems */ #include "soundex.c" /* Soundex algorithm for word-likeness comparison */ #include "similcmp.c" /* Closeness-metric for correction */ #include "correct.c" /* Hack code to attempt error correction (uses above) */ /* Let's be friendly to these guys... */ #ifdef SYS_MAC /* To compile with THINK C 4.0, place all the relevant .h and .c files in a folder. Then create a project which contains this main.c and the libraries unix and ANSI. */ #include <unix.h> #include <stdlib.h> #include <console.h> #endif /* Your own header files go here, for instance: #include "readword.h" */ int #ifdef PROTOTYPES main(int argc, char **argv) #else main(argc, argv) int argc; char **argv; #endif { #ifdef SYS_MAC argc = ccommand(&argv); /* Mac users have my sympathy */ #endif /* Your program goes here... */ exit(EXIT_OK); } SHAR_EOF cat - << \SHAR_EOF > tell.c /* File: tell.c Author: Graham Toal Purpose: offer correct spelling Creation date: 22/06/90 Lastedit: 22:24:32 Description: Like the unix 'spelltell' command. */ /* Manadatory header files */ #include <stdio.h> #include "dawg.h" #include "grope.h" #include "utils.c" /* Headers here as needed on per-program basis */ #include <ctype.h> /* eg, for isalpha() */ /* Spelling library utilities */ #include "init.c" /* Loading dicts */ #include "dyntrie.c" /* Creating dicts at run-time */ #include "print.c" /* Printing dicts */ #include "check.c" /* Checking words */ #include "locate.c" /* Finding words by their stems */ #include "soundex.c" /* Soundex algorithm for word-likeness comparison */ #include "similcmp.c" /* Closeness-metric for correction */ #include "correct.c" /* Hack code to attempt error correction (uses above) */ /* Let's be friendly to these guys... */ #ifdef SYS_MAC /* To compile with THINK C 4.0, place all the relevant .h and .c files in a folder. Then create a project which contains this main.c and the libraries unix and ANSI. */ #include <unix.h> #include <stdlib.h> #include <console.h> #endif /* Your own header files go here, for instance: #include "readword.h" */ int #ifdef PROTOTYPES main(int argc, char **argv) #else main(argc, argv) int argc; char **argv; #endif { NODE PCCRAP *dawg; INDEX edges; int each; #ifdef SYS_MAC argc = ccommand(&argv); /* Mac users have my sympathy */ #endif /* Your program goes here... */ if (argc == 1) { fprintf(stderr, "usage: %s mispeled wurdz\n", argv[0]); exit(EXIT_ERROR); } if (!dawg_init("", &dawg, &edges)) exit(EXIT_ERROR); for (each = 1; each < argc; each++) { fprintf(stderr, "* %s:\n", argv[each]); if (!dawg_correct(dawg, argv[each])) { fprintf(stderr, "(none found)\n"); } if (each+1 != argc) fprintf(stderr, "\n"); } exit(EXIT_OK); } SHAR_EOF