Source Code 1992 March

home *** CD-ROM | disk | FTP | other *** search

/ Source Code 1992 March / Source_Code_CD-ROM_Walnut_Creek_March_1992.iso / usenet / altsrcs / 1 / 1514 < prev next >

Wrap

Internet Message Format | 1990-12-28 | 40.4 KB

From: gtoal@tharr.UUCP (Graham Toal) Newsgroups: alt.sources Subject: Spelling utilities in C (Part 2 of 3) Message-ID: <808@tharr.UUCP> Date: 28 Jun 90 07:07:54 GMT #!/bin/sh-----cut here-----cut here-----cut here-----cut here----- # shar: Shell Archiver # Run the following text with /bin/sh to create: # check.c # correct.c # dyntrie.c # init.c # locate.c # print.c # similcmp.c # soundex.c # utils.c cat - << \SHAR_EOF > check.c /* File: check.c Author: Graham Toal Purpose: Check a word using DAWG or TRIE. Functions exported: dawg_check, pack_check Internal function: dawg_ck, pack_ck Description: call as: dawg_check(dawg, "word") pack_check(trie, "word") Simple spelling-check. Unfortunately the two interfaces for DAWGs and TRIEs are different, and even DAWG stuff differs from program to program. The problem is primarily in how to spot the end of a search; at the moment it is done when a particular pointer points to the root. Unfortunately we enter the data structure either at root+1 or at some arbitrary point, so this scheme means passing in two pointers. The next idea is to check that the *data* pointed to by the terminating pointer is 0; this would be OK but I was hoping to leave the initial word in the dawg free for expansion... (dawg contains [0, Node1..NodeN]) *best* idea would be to terminate on *INDEX* being 0. This is probably also simplest & I'll code it up properly when I'm awake... 
*/ static int #ifdef PROTOTYPES dawg_ck(NODE PCCRAP *dict, NODE PCCRAP *edge, char *word) #else dawg_ck(dict, edge, word) NODE PCCRAP *dict; NODE PCCRAP *edge; char *word; #endif { if (edge == dict) return(0!=0); for (;;) { if (*word == (((*edge >> V_LETTER) & M_LETTER))) { if (*++word == '\0') { return((*edge & M_END_OF_WORD) != 0); } else { if ((edge = dict+(*edge & M_NODE_POINTER)) == dict) break; continue; } } if (((*edge++) & M_END_OF_NODE) != 0) break; } return(0!=0); } int #ifdef PROTOTYPES dawg_check(NODE PCCRAP *dict, char *word) #else dawg_check(dict, word) NODE PCCRAP *dict; char *word; #endif { return(dawg_ck(dict, dict+ROOT_NODE, word)); } static int #ifdef PROTOTYPES pack_ck(NODE PCCRAP *ptrie, INDEX p, char *s) #else pack_ck(ptrie, p, s) NODE PCCRAP *ptrie; INDEX p; char *s; #endif { /* Note: node is passed in as a parameter so that it is in a register - otherwise it would take an extra instruction every time it was accessed. We trust the compiler to allocate register variables most efficiently -- when people tinker, it might improve one machine but ruin another... */ #define next_letter(s) \ p = ptrie[p + *s]; \ if (((p >> V_LETTER) & M_LETTER) != *s++) return(0!=0); \ if (*s == '\0') return((p >> V_END_OF_NODE) != 0); \ if ((p &= M_NODE_POINTER) == 0) return(0!=0) /* Depending on whether your machine has a cache or not, and whether it optimises pipeline fetches, you should decrease or increase the number of macro instantiations in the loop below appropriately */ for (;;) { next_letter(s); next_letter(s); next_letter(s); next_letter(s); } } int #ifdef PROTOTYPES pack_check(NODE PCCRAP *ptrie, char *s) #else pack_check(ptrie, s) NODE PCCRAP *ptrie; char *s; #endif { return(pack_ck(ptrie, 1L, s)); } SHAR_EOF cat - << \SHAR_EOF > correct.c /* File: correct.c Author: Graham Toal Purpose: Correct a word using Soudex and similcmp on DAWG. 
Functions exported: dawg_correct Internal functions: reccmp, walk_rest_dawg, walk_dawg Description: Reduce word to rough phonetic representation. Reduce all words in dawg to same form. Compare. This results in a large list of plausible (HA!) candidates. Then apply 'similcmp' which returns a distance metric representing closeness of two words. Sort by this metric, and return N best matches. If one stands out above others, return it alone. I'm working on a *much better* algorithm, but this will do for prototyping the interface. Final version of this will have callbacks to handle corrected words as they are produced. */ #include "soundex.c" #include "similcmp.c" typedef struct { char *possible; int score; } rec; /* Numbers of parameters exploded during process of removing global state :( */ static void #ifdef PROTOTYPES walk_rest_dawg(NODE PCCRAP *dawg, INDEX node, int len, char *word, char *original, char *target, rec *ans, int *nextfreep) #else walk_rest_dawg(dawg, node, len, word, original, target, ans, nextfreep) NODE PCCRAP *dawg; INDEX node; int len; char *word; char *original; char *target; rec *ans; int *nextfreep; #endif #define nextfree (*nextfreep) { NODE PCCRAP *edge; long c; for (edge = (NODE PCCRAP *)&dawg[node]; ;edge++) { c = *edge; /* Avoid MS C compiler bug :-( */ c = c >> V_LETTER; c = c & M_LETTER; word[len] = (char)c; if ((*edge & M_END_OF_WORD) != 0) { word[len+1] = 0; if (strcmp(soundex(word), target) == 0) { ans[nextfree].possible = (char *)Malloc(len+1, 1); if (ans[nextfree].possible == NULL) { fprintf(stderr,"I\'ve no idea - it could be anything...\n"); exit(0); } strcpy(ans[nextfree].possible, word); ans[nextfree].score = simil(word, original); if (nextfree < 511) nextfree++; } } c = *edge & M_NODE_POINTER; if (c != 0) { walk_rest_dawg(dawg, c, len+1, word, original, target, ans, nextfreep); } if ((*edge & M_END_OF_NODE) != 0) break; /* End of node */ } #undef nextfree } static void #ifdef PROTOTYPES walk_dawg( NODE PCCRAP *dawg, INDEX node, 
int len, char *original, char *target, char targets, char *word, rec *ans, int *nextfreep) #else walk_dawg(dawg, node, len, original, target, targets, word, ans, nextfreep) NODE PCCRAP *dawg; INDEX node; int len; char *original; char *target; char targets; char *word; rec *ans; int *nextfreep; #endif #define nextfree (*nextfreep) { NODE PCCRAP *edge; long c; edge = (NODE PCCRAP *)&dawg[node]; /* Only search trie starting with initial letter */ for (;;) { c = *edge; /* Avoid MS C compiler bug :-( */ c = c >> V_LETTER; c = c & M_LETTER; word[len] = (char)c; if (c == targets) { c = *edge & M_NODE_POINTER; walk_rest_dawg(dawg, c, len+1, word, original, target, ans, nextfreep); return; } edge++; } #undef nextfree } static int #ifdef PROTOTYPES reccmp(const void *a, const void *b) #else reccmp(a, b) void *a; void *b; #endif { return(((rec *)b)->score - ((rec *)a)->score); } int #ifdef PROTOYPES dawg_correct(NODE PCCRAP *dawg, char *original) #else dawg_correct(dawg, original) NODE PCCRAP *dawg; char *original; #endif { char word[81]; char target[256]; static rec ans[256]; /* static because brain-dead MSC can't hack a big stack :-( */ int targets; int nextfree = 0; int i, prev=0; targets = original[0]; strcpy(target, soundex(original)); walk_dawg( dawg, (INDEX)ROOT_NODE, 0, original, target, targets, word, ans, &nextfree); if (nextfree == 0) return(FALSE); qsort(ans, (size_t)nextfree, sizeof(rec), /*&*/reccmp); for (i = 0; i < nextfree; i++) { if (((prev*100) / (ans[i].score)) > 120) break; fprintf(stdout, "%s (%d)\n", ans[i].possible, ans[i].score); if (ans[i].score >= 100) break; if (i == 5) break; prev = ans[i].score; } return(TRUE); } SHAR_EOF cat - << \SHAR_EOF > dyntrie.c /* The #define is to repair mangling by BITNET mailers */ #define NOT ~ /* This should be Tilde (twiddle) -- C unary Not */ /* File: dyntrie.c Author: Graham Toal Purpose: Check a word using DAWG or TRIE. 
Functions exported: trie_new, trie_add, trie_dump Internal functions: insert_simple recurse_add Description: Builds DAWG-compatible trie in store, incrementally; words do not have to be presented in order. (Note it is NOT a 'packed-trie'; in our scheme it is closer to a dawg - but without the tail-compression precomputed dawgs have). Any tries built using this should be used by check_dawg, print_dawg etc. Has rather nice property of only claiming enough memory for the job; dynamically moves the data structure when it fills! */ /* To avoid global state, I'm putting these useful bits of info in the two words before the dawg itself; I *HOPE* that all systems allow negative indexing off arrays; I'm not at all sure they do though. However since I moved the base of the dict up by adding 2 to it, I *should* be able to get the old address by by subtracting 2 from it, so I'm using the first pair of macros below, not the more intuitive second pair. I should really do this for ordinary dawgs & tries too. Indeed I should *really* implement the dictlib.h interface, but thats a month or two off. Also I'ld like some feedback first. */ #define EXTRAS 2 #define MAX_ENTRIES(dawg) (*(dawg-2)) /* Safe way of aliasing */ #define NEXT_FREE(dawg) (*(dawg-1)) /* #define MAX_ENTRIES(dawg) dawg[-2] */ /* 'Obvious' alias */ /* #define NEXT_FREE(dawg) dawg[-1] */ /* may be buggy */ #define EMPTY 0 #define INIT_MAX_ENTRIES 512 /* Slop is so that we don't need to be continually checking nextfree as being close to max_entries */ #define SLOP 12 /* A debugging macro which rangechecks arrays -- happens when utils.c is included with RCHECK defined. Otherwise no overhead. */ #define DICT(idx) dict[RANGECHECK(idx, MAX_ENTRIES(dict))] /* This quick hack job has special-case code in each function for the root node-set. Its rather tacky; I could clean it up and make the root node like all other nodes, but why bother; this is only test code anyway... 
*/ static INDEX #ifdef PROTOTYPES insert_simple(NODE PCCRAP **dictp, char *word) #else insert_simple(dictp, word) NODE PCCRAP **dictp; char *word; #endif #define dict (*dictp) { NODE bugfix; INDEX i; NODE n; int c; if (NEXT_FREE(dict) >= MAX_ENTRIES(dict)-SLOP) { NODE PCCRAP *moved; moved = MALLOC((SIZE_T)MAX_ENTRIES(dict)*2+EXTRAS, sizeof(NODE)); if (moved != NULL) { moved += EXTRAS; MAX_ENTRIES(moved) = MAX_ENTRIES(dict); NEXT_FREE(moved) = NEXT_FREE(dict); /* Should use realloc but appears to be buggy :-( */ for (i = MAX_ENTRIES(dict); i < MAX_ENTRIES(dict)*2; i++) { moved[i] = EMPTY; } for (i = 0; i < MAX_ENTRIES(dict); i++) moved[i] = DICT(i); dict -= EXTRAS; FREE(dict); dict = moved; MAX_ENTRIES(dict) *= 2; } else { fprintf(stderr, "Can\'t add any more words again\n"); } } c = (*word++)&255; if (c == '\0') return(TERMINAL_NODE); i = NEXT_FREE(dict)++; bugfix = insert_simple(&dict, word); n = ((NODE)c << (NODE)V_LETTER) | M_END_OF_NODE | bugfix; DICT(i) = n; if (*word == '\0') DICT(i) |= M_END_OF_WORD; return(i); #undef dict } static void #ifdef PROTOTYPES recurse_add(NODE PCCRAP **dictp, INDEX base, char *word) #else recurse_add(dictp, base, word) NODE PCCRAP **dictp; INDEX base; char *word; #endif #define dict (*dictp) { NODE bugfix; INDEX i = base, newbase; NODE unpicked[MAX_CHARS], n; int c, ch; int gap, nextslot = 0, j; /* First see if first char is already in trie */ ch = (*word++)&255; for (;;) { c = (int)(((dict[i] >> V_LETTER) & M_LETTER)&255); if (ch == c) { newbase = dict[i]&M_NODE_POINTER; if (newbase == 0) { /* have abc (this is c), adding (abc)defg */ dict[i] &= (NOT M_NODE_POINTER); bugfix = insert_simple(&dict, word); dict[i] |= bugfix; } else { if (*word == '\0') { dict[i] |= M_END_OF_WORD; } else { recurse_add(dictp, newbase, word); } } return; } if ((dict[i]&M_END_OF_NODE) != 0) break; i++; } /* unpick base */ for (nextslot = 0; nextslot < MAX_CHARS; nextslot++) { unpicked[nextslot] = EMPTY; } nextslot = 0; i = base; j = 0; for (;;) { if ((j 
== 0) && (((dict[i] >> V_LETTER) & M_LETTER) > ch)) { j = 1; newbase = nextslot++; } n = dict[i]; dict[i] = EMPTY; i++; unpicked[nextslot++] = n & (NOT M_END_OF_NODE); if ((n & M_END_OF_NODE) != 0) break; } if (j == 0) newbase = nextslot++; /* Was it last alphabetically? */ /* add this char to the node */ if (*word == '\0') { unpicked[newbase] = ((NODE)ch << (NODE)V_LETTER) | TERMINAL_NODE | M_END_OF_WORD; } else { /* and insert the rest of the string with insert_simple */ bugfix = insert_simple(&dict, word); unpicked[newbase] = ((NODE)ch << (NODE)V_LETTER) | bugfix; /* The insert_simple call has to come after above loop, because of freeing slots... */ } unpicked[nextslot-1] |= M_END_OF_NODE; /* scan for suitably-sized gap */ /* MAX_CHARS+1 should be first 'real' node base (0, 1..MAX_CHARS reserved) */ /* The particular application this is wanted for doesn't have a large number of words to be added dynamically, so the BFI code below which finds free holes in the trie space works fine. However if some bright spark cares to keep a freelist *in the holes* then this program effectively implements a linear-time sorting algorithm. (I know it's not *really*, but think about it before jumping down my throat; the N^2 case is very statistically unlikely) */ for (i = (INDEX)MAX_CHARS+1; i < MAX_ENTRIES(dict)-nextslot-SLOP; i++) { /* Even without the freelist, the sneaky hack in pack.c for noting earliest bunch of free holes might well make a significant difference... 
(It was a lowest_search_start variable and used a bit of hysteresis) */ gap = TRUE; for (j = 0; j < nextslot; j++) { if (dict[i+j] != EMPTY) { gap = FALSE; break; } } if (gap) break; } if (!gap) { NODE PCCRAP *moved; moved = MALLOC((SIZE_T)MAX_ENTRIES(dict)*2+EXTRAS, sizeof(NODE)); if (moved != NULL) { moved += EXTRAS; MAX_ENTRIES(moved) = MAX_ENTRIES(dict); NEXT_FREE(moved) = NEXT_FREE(dict); /* Should use realloc but appears to be buggy :-( */ for (i = MAX_ENTRIES(dict); i < MAX_ENTRIES(dict)*2; i++) moved[i] = EMPTY; for (i = 0; i < MAX_ENTRIES(dict); i++) moved[i] = DICT(i); dict -= EXTRAS; FREE(dict); dict = moved; /* If your compiler has aliasing bugs they may hit here. */ MAX_ENTRIES(dict) *= 2; /* scan for suitably-sized gap */ for (i = ((MAX_ENTRIES(dict)/2)-(MAX_CHARS+1)); i < MAX_ENTRIES(dict)-nextslot; i++) { gap = TRUE; for (j = 0; j < nextslot; j++) { if (dict[i+j] != EMPTY) { gap = FALSE; break; } } if (gap) break; } } if (!gap) { fprintf(stderr, "Can\'t add any more words\n"); return; } } newbase = i; /* reinsert */ for (j = 0; j < nextslot; j++) { dict[newbase+j] = unpicked[j]; } if (newbase+nextslot-1 >= NEXT_FREE(dict)) NEXT_FREE(dict) = newbase+nextslot; /* update pointer to the base of this node */ for (i = 0; i < MAX_ENTRIES(dict); i++) { if ((dict[i] & M_NODE_POINTER) == base) { dict[i] &= (NOT M_NODE_POINTER); dict[i] |= newbase; break; /* (should only be one since trie, not dawg) */ } } #undef dict } void #ifdef PROTOTYPES trie_add(NODE PCCRAP **dictp, char *word) #else trie_add(dictp, word) NODE PCCRAP **dictp; char *word; #endif #define dict (*dictp) { INDEX next; INDEX bugfix; int c; /* Root node is pre-reserved MAX_CHARS entries */ c = (*word++)&255; /* bugfix - remember chars can be signed :-( */ if (DICT((INDEX)ROOT_NODE+c) == EMPTY) { /* I'm allowing the root node to be sparse for speed; it *should* be dense like any other node. 
May change this later */ if (*word == '\0') { DICT((INDEX)ROOT_NODE+c) = ((NODE)c << (NODE)V_LETTER) | M_END_OF_WORD; } else { bugfix = insert_simple(&dict, word); DICT((INDEX)ROOT_NODE+c) = ((NODE)c << (NODE)V_LETTER) | bugfix; } } else { if (*word == '\0') { DICT((INDEX)ROOT_NODE+c) |= M_END_OF_WORD; } else { next = DICT((INDEX)ROOT_NODE+c) & M_NODE_POINTER; if (next == 0) { /* have 'a', adding 'abcd' */ /* (Should really check the letter for safety...) */ bugfix = insert_simple(&dict, word); DICT((INDEX)ROOT_NODE+c) = ((NODE)c << (NODE)V_LETTER) | bugfix | M_END_OF_WORD; } else { recurse_add(dictp, next, word); } } } #undef dict } int #ifdef PROTOTYPES trie_new(NODE PCCRAP **dawgp) #else trie_new(dawgp) NODE PCCRAP **dawgp; #endif #define dawg (*dawgp) { INDEX i; dawg = MALLOC(INIT_MAX_ENTRIES+EXTRAS, sizeof(NODE)); dawg += EXTRAS; MAX_ENTRIES(dawg) = INIT_MAX_ENTRIES; NEXT_FREE(dawg) = MAX_CHARS+1; dawg[0] = 0; for (i = 1; i <= MAX_CHARS; i++) dawg[i] = 0; dawg[MAX_CHARS] |= M_END_OF_NODE; /* 0 base, MAX_CHARS chars */ return(TRUE); #undef dawg } int #ifdef PROTOTYPES trie_dump(NODE PCCRAP *dawg, char *filename) #else trie_dump(dawg, filename) NODE PCCRAP *dawg; char *filename; #endif { INDEX cnt, max; FILE *dumper; max = NEXT_FREE(dawg)*sizeof(NODE); /* I *knew* the next_free variable would come in handy :-) */ dumper = fopen(filename, WRITE_BIN); if (dumper == NULL) { fprintf(stderr, "Failed to dump trie on file \'%s\'\n", filename); return(FALSE); } putword(max, dumper); if ((cnt = putwords(dawg, max, dumper)) != max) { fprintf(stderr, "Failed to dump: %ld bytes written instead of %ld\n", cnt, max); return(FALSE); } fclose(dumper); return(TRUE); } #undef MAX_ENTRIES #undef NEXT_FREE #undef DICT #undef SLOP SHAR_EOF cat - << \SHAR_EOF > init.c /* File: init.c Authors: Richard Hooker, Graham Toal Purpose: Loading of dictionary for spelling checker. 
Functions exported: dawg_init, pack_init Internal functions: spell_init Description: The address of a dictionary (either a PACKed dict or a DAWG) is set up by this procedure. This gives us the option of connecting the dict in read-only (or copy-on-write) virtual memory. On non-VM machines, we simply allocate a large buffer into which the relevant part of the dictionary is loaded. The magic number is used to check the dict type, *and* the machine byte-sex. If the sex is reversed, even a VM system has to load the data into writable memory (so that it can reverse it). */ /*######################### INTERNAL FUNCTIONS #########################*/ static int #ifdef PROTOTYPES spell_init(char *filename, NODE PCCRAP **dictp, char *DEFAULT_DICT, long magic_number, INDEX *nedges) #else spell_init(filename, dictp, DEFAULT_DICT, magic_number, nedges) char *filename; NODE PCCRAP **dictp; char *DEFAULT_DICT; long magic_number; INDEX *nedges; #endif #define dict (*dictp) { FILE *fp; INDEX count; /* init_dict("") gets default */ if (*filename == '\0') filename = DEFAULT_DICT; /* Open the file and find out the size of the dict -- which is stored in the first word. Later I'll change the dict format to have a header, and the header will have to be skipped by this module. */ if ((fp = fopen(filename, "rb")) == NULL) { fprintf (stderr, "Can\'t open file \"%s\"\n", filename); return(FALSE); } *nedges = getword(fp); #ifdef DEBUG fprintf(stderr, "dawg contains %8lx edges\n", *nedges); #endif /* Allocate precisely enough memory for all edges + 0 at root node. */ if ((dict = MALLOC((SIZE_T)((*nedges)+1), sizeof(NODE PCCRAP *))) == 0) { fprintf (stderr, "Can\'t allocate space for dictionary\n"); return(FALSE); } dict[0] = 0; /* Root node set to 0; terminal nodes point to 0. */ /* Load in the dictionary. 
Routine 'getwords' should be efficient */ count = getwords(&dict[1], (long)(4*(*nedges)), fp); if (count != 4*(*nedges)) { fprintf(stderr, "Failed to read dictionary %s - wanted %ld bytes, got %ld\n", filename, 4*(*nedges), count); return(FALSE); } fclose(fp); return(TRUE); #undef dict } /*####################### EXPORTED FUNCTIONS #########################*/ int #ifdef PROTOTYPES dawg_init(char *filename, NODE PCCRAP **dawgp, INDEX *nedges) #else dawg_init(filename, dawgp, nedges) char *filename; NODE PCCRAP **dawgp; INDEX *nedges; #endif { return(spell_init(filename, dawgp, DEFAULT_DAWG, DAWG_MAGIC, nedges)); } int #ifdef PROTOTYPES pack_init(char *filename, NODE PCCRAP **packp, INDEX *nedges) #else pack_init(filename, packp, nedges) char *filename; NODE PCCRAP **packp; INDEX *nedges; #endif { return(spell_init(filename, packp, DEFAULT_PACK, PACK_MAGIC, nedges)); } SHAR_EOF cat - << \SHAR_EOF > locate.c /* File: locate.c Author: Graham Toal Purpose: Find all words beginning with particular prefix. Functions exported: locate_prefix Description: Searches as in spell-check for prefix in dict, but doesn't fail if word doesn't terminate at that point. It returns an index into the dict which can be used with print_dawg_prefix to display all the words found. However it is more useful than that; text-analysis programs can check that a word matches "root*", for instance, when doing stylistic analysis etc. */ INDEX #ifdef PROTOTYPES dawg_locate_prefix(NODE PCCRAP *dawg, char *word, INDEX edge) #else dawg_locate_prefix(dawg, word, edge) NODE PCCRAP *dawg; char *word; INDEX edge; #endif { for (;;) { if (*word == (((dawg[edge] >> V_LETTER) & M_LETTER))) { if (*++word == '\0') { return(dawg[edge]&M_NODE_POINTER); } else { if ((edge = (dawg[edge] & M_NODE_POINTER)) == ROOT_NODE) break; continue; } } if (((dawg[edge++]) & M_END_OF_NODE) != 0) break; } /* What to do when none found? -- fail-safe, or some error code...? 
*/ return(ROOT_NODE); } SHAR_EOF cat - << \SHAR_EOF > print.c /* File: print.c Author: Graham Toal Purpose: Print a packed trie to stderr. Functions exported: dawg_print, pack_print, dawg_print_prefix Internal functions: pack_pr dawg_pr dawg_pr_prefix Description: Pre-order traverse of packed TRIE or DAWG. Will be modified soon to take output file as parameter. Then sometime after that to callback with each string at it is generated. Meanwhile, people requiring custom versions of dawg-walking stuff might as well just copy this code and edit it to do what they want. The special version print_dawg_prefix takes a NODE from somewhere in a dict as a parameter, and prints out only those words which contain that node. You have to locate the node seperately with 'locate_prefix', and pass the prefix string in so it can be printed. */ static void #ifdef PROTOTYPES dawg_pr(NODE PCCRAP *dawg, INDEX node, int len) #else dawg_pr(dawg, node, len) NODE PCCRAP *dawg; INDEX node; int len; #endif { static char word[MAX_WORD_LEN]; NODE PCCRAP *edge; for (edge = (NODE PCCRAP *)&dawg[node]; ; edge++) { long c; c = *edge; /* Don't rewrite this - its avoiding a MSC bug */ c = c >> V_LETTER; c = c & M_LETTER; word[len] = (char)c; if ((*edge & M_END_OF_WORD) != 0) { word[len+1] = '\0'; fprintf(stdout, "%s\n", word); } c = *edge & M_NODE_POINTER; if ((*edge & M_NODE_POINTER) != 0) dawg_pr (dawg, c, len + 1); if ((*edge & M_END_OF_NODE) != 0) break; /* End of node */ } } void #ifdef PROTOTYPES dawg_print(NODE PCCRAP *dawg, INDEX node) #else dawg_print(dawg, node) NODE PCCRAP *dawg; INDEX node; #endif { dawg_pr(dawg, node, 0); } static void #ifdef PROTOTYPES pack_pr(NODE PCCRAP *ptrie, INDEX i, int pos) #else pack_pr(ptrie, i, pos) NODE PCCRAP *ptrie; INDEX i; int pos; #endif { static char s[MAX_WORD_LEN+1]; int b; INDEX p; for (b = BASE_CHAR; b < BASE_CHAR+MAX_CHARS; b++) { if (b != 0) { p = ptrie[i+b-BASE_CHAR]; if (((p>>V_LETTER)&M_LETTER) == b) { s[pos] = b; s[pos+1] = '\0'; if ((p & 
M_END_OF_WORD) != 0) fprintf(stderr, "%s\n", s); if ((p &= M_NODE_POINTER) != 0) { pack_pr(ptrie, p, pos+1); } } } } } void #ifdef PROTOTYPES pack_print(NODE PCCRAP *ptrie, INDEX node) #else pack_print(ptrie, node) NODE PCCRAP *ptrie; INDEX node; #endif { pack_pr(ptrie, node, 0); } static void #ifdef PROTOTYPES dawg_pr_prefix(NODE PCCRAP *dawg, char *prefix, INDEX node, int len) #else dawg_pr_prefix(dawg, prefix, node, len) NODE PCCRAP *dawg; char *prefix; INDEX node; int len; #endif { NODE PCCRAP *edge; static char word[MAX_WORD_LEN]; long c; for (edge = (NODE PCCRAP *)&dawg[node]; ; edge++) { /* Don't 'fix' - compiler bugaround for microsoft :-( */ c = *edge; c = c >> V_LETTER; c = c & M_LETTER; word[len] = (char)c; if ((*edge & M_END_OF_WORD) != 0) { word[len+1] = 0; fprintf(stdout, "%s%s\n", prefix, word); } c = *edge & M_NODE_POINTER; if (c != 0) dawg_pr_prefix(dawg, prefix, c, len + 1); /* End of node - check repair later - I accidentally nobbled it */ if ((*edge & M_END_OF_NODE) != 0) break; } } void #ifdef PROTOTYPES dawg_print_prefix(NODE PCCRAP *dawg, char *prefix, INDEX node) #else dawg_print_prefix(dawg, prefix, node) NODE PCCRAP *dawg; char *prefix; INDEX node; #endif { dawg_pr_prefix(dawg, prefix, node, 0); } SHAR_EOF cat - << \SHAR_EOF > similcmp.c #ifndef similcmp_c #define similcmp_c 1 #ifdef LIB_STRINGS #include <strings.h> #else #include <string.h> #endif /* File: similcmp.c Authors: John Ratcliffe, and an anonymous coder. Purpose: Better approximate string equality test. Functions exported: simil Internal functions: GCsubstr Description: See below. I'll replace this one eventually with my own algorithm which does sophisticated compound-grapheme analysis and returns a degree of phonetic similarity and a probability that the transformations made were valid. 
'ghoti==fish' (Match = 90%, Prob = 1%) ;-) lauGH = f match 100% prob 30% wOmen = i match 90% prob 10% staTIon = sh match 100% prob 5% */ /* Ratcliff/Obershelp pattern matching * * Approximate word matching: takes two words and returns a * similarity score based on co-occurrence of subpatterns. * * This code appeared in a letter to the editor in DDJ, 11/88. * The original article on the pattern matching, "Pattern Matching * by Gestalt" by John Ratcliff, appeared in the July 1988 * issue (#181) but the algorithm was presented in assembly. * * The 11/88 issue also contained another verison in C which was * a more faithful translation of the original; it has the * advantage of not being recursive. * * The algorithm seems to work nicely for a variety of test cases. * The main drawback of the algorithm is the cost of the pairwise * comparisons. It is significantly more expensive than stemming, * soundex, and the like. Might consider using this as a second * phase... */ int #ifdef PROTOTYPES GCsubstr(char *st1, char *end1, char *st2, char *end2) #else GCsubstr(st1, end1, st2, end2) char *st1; char *end1; char *st2; char *end2; #endif { register char *a1, *a2; char *b1, *s1, *b2, *s2; short max, i; if (end1 <= st1 || end2 <= st2) return(0); if (end1 == st1 + 1 && end2 == st2 + 1) return(0); max = 0; b1 = end1; b2 = end2; for (a1 = st1; a1 < b1; a1++) { for (a2 = st2; a2 < b2; a2++) { if (*a1 == *a2) { /* determine length of common substring */ for (i = 1; a1[i] && (a1[i] == a2[i]); i++) /* do nothing */; if (i > max) { max = i; s1 = a1; s2 = a2; b1 = end1 - max; b2 = end2 - max; } } } } if (!max) return(0); max += GCsubstr(s1 + max, end1, s2 + max, end2); /* rhs */ max += GCsubstr(st1, s1, st2, s2); /* lhs */ return(max); } int #ifdef PROTOTYPES simil(char *s1, char *s2) #else simil(s1, s2) char *s1; char *s2; #endif { int l1 = strlen(s1), l2 = strlen(s2); if (strcmp(s1, s2) == 0) return(100); /* exact match end-case */ return(200 * GCsubstr(s1, s1 + l1, s2, s2 + l2) / (l1 
+ l2)); } #endif /* similcmp_c */ SHAR_EOF cat - << \SHAR_EOF > soundex.c #ifndef soundex_c #define soundex_c 1 #ifdef LIB_STRINGS #include <strings.h> #else #include <string.h> #endif /* isalpha & toupper; define your own if you don't have ctype.h */ #include <ctype.h> /* File: soundex.c Authors: Jonathan Leffler Purpose: Approximate string equality test. Functions exported: soundex Internal functions: nsoundex Description: There are umpteen soundexes around. At least this one is commented... (Actually I'd prefer one which understood ph/f and Mac/Mc as special cases; short of a proper phonetic alg such as I'm currently developing) See below for description: */ /* ** SOUNDEX CODING ** ** Rules: ** 1. Retain the first letter; ignore non-alphabetic characters. ** 2. Replace second and subsequent characters by a group code. ** Group Letters ** 1 BFPV ** 2 CGJKSXZ ** 3 DT ** 4 L ** 5 MN ** 6 R ** 3. Do not repeat digits ** 4. Truncate or ser-pad to 4-character result. ** ** Originally formatted with tabstops set at 4 spaces -- you were warned! ** ** Code by: Jonathan Leffler (john@sphinx.co.uk) ** This code is shareware -- I wrote it; you can have it for free ** if you supply it to anyone else who wants it for free. 
** ** BUGS: Assumes 7-ASCII; fails on ISO Latin1 */ static char lookup[] = { '0', /* A */ '1', /* B */ '2', /* C */ '3', /* D */ '0', /* E */ '1', /* F */ '2', /* G */ '0', /* H */ '0', /* I */ '2', /* J */ '2', /* K */ '4', /* L */ '5', /* M */ '5', /* N */ '0', /* O */ '1', /* P */ '0', /* Q */ '6', /* R */ '2', /* S */ '3', /* T */ '0', /* U */ '1', /* V */ '0', /* W */ '2', /* X */ '0', /* Y */ '2' /* Z */ }; /* Soundex for arbitrary number of characters of information */ #define MAX_SOUNDEX_LEN 10 static char * #ifdef PROTOTYPES nsoundex(char *str, int n) #else nsoundex(str, n) char *str; /* In: String to be converted */ int n; /* In: Number of characters in result string */ #endif { static char buff[MAX_SOUNDEX_LEN]; register char *s; register char *t; char c; char l; if (n <= 0) n = 4; /* Default */ if (n > sizeof(buff) - 1) n = sizeof(buff) - 1; t = &buff[0]; for (s = str; ((c = *s) != '\0') && t < &buff[n]; s++) { if (!isalpha(c)) continue; c = toupper(c); if (t == &buff[0]) { l = *t++ = c; continue; } c = lookup[c-'A']; /* Fails on any alpha not in 'a'..'z' */ if (c != '0' && c != l) l = *t++ = c; } while (t < &buff[n]) *t++ = '0'; *t = '\0'; return(&buff[0]); } /* Normal external interface */ char * #ifdef PROTOTYPES soundex(char *str) #else soundex(str) char *str; #endif { return(nsoundex(str, 4)); /* vary this? - try 5 or 6? */ } #endif /* soundex_c */ SHAR_EOF cat - << \SHAR_EOF > utils.c /* File: utils.c Author: Graham Toal Purpose: Portability library Description: Most of what differs between operating systems is handled here. This module is *incredibly* hacky -- I've just been interested in finding the problems so far, not in making the solutions neat. The final release of this suite will concentrate on cleaning up this file! */ /* PENDING: Generic load dict; meanwhile should also put efficient msdos file loading into getwords(). See spelldawg for best coding. 
*/ #define SIZE_T size_t /* MSDos redefines this for huge allocation */ #ifdef SYS_RISCOS #define DEFAULT_DAWG "run:dict-dwg" #define DEFAULT_PACK "run:dict-pck" #else #ifdef SYS_EMAS #define DEFAULT_DAWG "dict#dwg" #define DEFAULT_PACK "dict#pck" #else /* Unix, MessDross, etc... */ #define DEFAULT_DAWG "dict.dwg" #define DEFAULT_PACK "dict.pck" #endif #endif /* Undo any #define's here if they are inappropriate for some system */ #define EXT_CHAR '.' #define READ_TEXT "r" #define WRITE_BIN "wb" #define READ_BIN "rb" /* system configuration */ #ifdef KNOWS_VOID #define VOID void #else /* As in... void fred(VOID) */ #define void int #define VOID #endif #ifdef SYS_MSDOS #ifdef COMPILER_ZORTECH int _stack = 0x3000; #define PCCRAP far #else #ifdef COMPILER_TURBOC #define PCCRAP far #else #define PCCRAP huge #endif #endif #else #define PCCRAP #endif /* Hmph... I don't really want to do realloc too. Just as well that one implmentation is buggy; saves me having to work out what to do :-) */ #ifdef SYS_MSDOS /* Normally int... 
*/ #undef SIZE_T #define SIZE_T long /* (Still to test AZTEC & LATTICE) */ /* Mallocs of > 64K (maybe even > 32K) have to come off the far/huge heap */ #ifdef COMPILER_ZORTECH #include <dos.h> #define MALLOC(x,s) (NODE PCCRAP *)farmalloc((x)*(s)) /* Zortech */ #define Malloc(x,s) malloc((x)*(s)) #define FREE(x) farfree(x) #define Free(x) free(x) #else /* else not microsoft */ #ifdef COMPILER_TURBOC #include <alloc.h> #define MALLOC(x,s) (NODE PCCRAP *)farmalloc((x)*(s)) /* Turbo */ #define Malloc(x,s) malloc((x)*(s)) #define FREE(x) farfree(x) #define Free(x) free(x) #else /* else not turbo */ #include <malloc.h> #ifdef NEVER #define MALLOC(x,s) (NODE PCCRAP *)my_halloc((long)(x),(size_t)(s)) /* Microsoft, Aztec */ #define Malloc(x,s) my_malloc((x)*(s)) #define FREE(x) my_hfree((void huge *)(x)) /* For some reason MICROSOFT */ #define Free(x) my_free((void *)x) /* complains of a type mismatch */ /* without these casts */ #endif #define MALLOC(x,s) (NODE PCCRAP *)halloc((long)(x),(size_t)(s)) /* Microsoft, Aztec */ #define Malloc(x,s) malloc((x)*(s)) #define FREE(x) hfree((void huge *)(x)) /* For some reason MICROSOFT */ #define Free(x) free((void *)x) /* complains of a type mismatch */ /* without these casts */ #endif #endif #else /* else not SYS_MSDOS */ #ifdef STDLIBS #include <stdlib.h> /* for malloc, free & exit */ #define MALLOC(x,s) (NODE PCCRAP *)malloc(((x)*(s))) #define Malloc(x,s) malloc((x)*(s)) #define FREE(x) free(x) #define Free(x) free(x) #else #define MALLOC(x,s) (NODE PCCRAP *)malloc(((x)*(s))) #define Malloc(x,s) malloc((x)*(s)) #define FREE(x) free(x) #define Free(x) free(x) #ifndef size_t /* Doesn't work if size_t was typedef'd */ #define size_t int /* And if an int isn't big enough we're in trouble! 
*/
#endif
/* Hand-written declarations of malloc() and exit() for systems built
   without STDLIBS; the prototype form is used when PROTOTYPES is set. */
#ifdef PROTOTYPES
extern void *malloc(size_t bytes);
extern void exit(int rc);
#else
extern void *malloc();
extern void exit();
#endif
#endif /* not stdlibs */
#endif /* Not msdos */

/* Per-system overrides of the generic EXT_CHAR default. */
#ifdef SYS_RISCOS
#undef EXT_CHAR
#define EXT_CHAR '-'
#endif

#ifdef SYS_EMAS
#undef EXT_CHAR
#define EXT_CHAR '#'
#endif

/* THINK C on the Mac gets the plain "w"/"r" strings for binary files. */
#ifdef SYS_MAC
#ifdef COMPILER_THINK
#undef WRITE_BIN
#undef READ_BIN
#define WRITE_BIN "w"
#define READ_BIN "r"
#endif
#endif

/* MEMMODELS switches on SMALL_MEMORY (its consumers are outside this part
   of the file). */
#ifdef MEMMODELS
#define SMALL_MEMORY 1
#endif

/* Error handling
   At the moment we'll just go for OK / NOT OK. Later we should fix all
   exit's to use a specific code symbol (eg EXIT_MALLOCFAIL) but this has
   to be done system by system. Whenever a new one is added, a default
   should be set up (as 0?) */
/*#include <errno.h>*/ /* Only later when we want to be more precise! */
#define EXIT_OK (0) /* Generic success */
#define EXIT_ERROR (1) /* Generic failure */
/* For each system, replace generic errors with system-dependent ones. */
#ifdef vms
/* * VMS-specific error status codes. Approximate Ultrix equivalents. */
#include <ssdef.h>
#include <stsdef.h>
#undef EXIT_OK
#define EXIT_OK SS$_NORMAL /* Generic success */
/* On VMS the failure code is deliberately the same as the success code. */
#undef EXIT_ERROR
#define EXIT_ERROR SS$_NORMAL /* Don't want random error message! */
#endif

/* DELETE(filename) expands to nothing by default, to remove() under
   __STDC__, and to unlink() when only 'unix' is defined. */
#define DELETE(filename)
#ifdef __STDC__
#undef DELETE
#define DELETE(filename) remove(filename)
#else
#ifdef unix
#undef DELETE
#define DELETE(filename) unlink(filename)
#endif
#endif

#ifdef NEVER /* these macros caught the fact that on MICROSOFT, the parameters being passed in were ints -- and I hadn't been given a warning because I had explicitly cast the to size-t. Hence why I've declared SIZE_T as long. This is all a bit messy. 
Curse you IBM PCs */ void PCCRAP *my_halloc(long n,size_t s) { char PCCRAP *p; p = halloc(n, s); fprintf(stderr, "halloc(%8lx*%d) -> %8lx\n", n, s, (long)p); return(p); } void *my_malloc(size_t s) { char *p; p = malloc(s); fprintf(stderr, "malloc(%4x) -> %4x\n", s, (int)p); return(p); } void my_hfree(void PCCRAP *p) { fprintf(stderr, "hfree(%8lx)\n", (long)p); hfree(p); } void my_free(void *p) { fprintf(stderr, "free(%4x)\n", (int)p); free(p); } #endif #ifdef RCHECK #ifndef PROTOTYPES long toosmall(idx, max, line) long idx; long max; int line; #else long toosmall(long idx, long max, int line) #endif { if (line == 0) { fprintf(stderr, "RANGE CHECK: %ld should not be less than 0 (max %ld)\n", idx, max); } else { fprintf(stderr, "RANGE CHECK AT LINE %d: %ld should not be less than 0 (max %ld)\n", line, idx, max); } return(0L); } #ifndef PROTOTYPES long toobig(idx, max, line) long idx; long max; int line; #else long toobig(long idx, long max, int line) #endif { if (line == 0) { fprintf(stderr, "RANGE CHECK: %ld should not be larger than %ld\n", idx, max); } else { fprintf(stderr, "RANGE CHECK AT LINE %d: %ld should not be larger than %ld\n", line, idx, max); } return(max); } #ifdef KNOWS_LINE #define TOOSMALL(idx, max) toosmall((long)idx, (long)max, __LINE__) #define TOOBIG(idx, max) toobig((long)idx, (long)max, __LINE__) #else #define TOOSMALL(idx, max) toosmall((long)idx, (long)max, 0) #define TOOBIG(idx, max) toobig((long)idx, (long)max, 0) #endif #define RANGECHECK(idx,max) \ (idx < 0 ? (TOOSMALL(idx,max)) : (idx >= max ? 
(TOOBIG(idx,max)) : idx)) #else #define RANGECHECK(idx,max) (idx) #endif #ifdef PROTOTYPES long getwords(long PCCRAP *data, long count, FILE *fp) #else long getwords(data, count, fp) long PCCRAP *data; long count; FILE *fp; #endif { #ifdef SYS_MSDOS char PCCRAP *p; int c; long byte_count; p = (char PCCRAP *)(&data[0]); /* Somewhere I have a version which fread()s into a 16K near buffer, then copies it out; use that technique here - *MUCH* faster */ for (byte_count = 0; byte_count < count; byte_count++) { c = fgetc(fp); if (c == -1 || ferror(fp)) { printf ("Dictionary read error - %ld wanted - %ld got\n", count, byte_count)/*, exit(0)*/; break; } *p++ = (c & 255); } return(count); #else return((long)fread(&data[0], (size_t)1, (size_t)count, fp)); #endif } #ifdef PROTOTYPES long putwords(long PCCRAP *data, long count, FILE *fp) #else long putwords(data, count, fp) long PCCRAP *data; long count; FILE *fp; #endif { #ifdef SYS_MSDOS extern int _NEAR _CDECL errno; long i; char PCCRAP *p; p = (char PCCRAP *)&data[0]; if (fp == NULL) { fprintf(stderr, "putwords: no file?\n"); exit(0); } for (i = 0L; i < count; i++) { /* Somewhere I have a version which copies to a 16K near buffer, then frwrite()s it out; use that technique here - *MUCH* faster */ int rc = fputc(*p++, fp); if (ferror(fp)) { fprintf(stderr, "rc = %d\n", rc); perror("dawg"); fprintf (stderr, "Error writing to output file\n"); exit(0); } } return(count); #else return(fwrite(&data[0], (size_t)1, (size_t)count, fp)); #endif } static long PCCRAP *WTMP = NULL; /* A bit hacky having this single 4 bytes in heap space, but it makes things a little more consistent. it'll all go away eventually anyway... 
*/ #ifdef PROTOTYPES void putword(long w, FILE *fp) #else void putword(w, fp) long w; FILE *fp; #endif { if (WTMP == NULL) { WTMP = (long PCCRAP *)MALLOC(1,sizeof(long)); } *WTMP = w; if (putwords(WTMP, (long)sizeof(long), fp) != sizeof(long)) { /* (using putwords avoids confusion over bytesex) */ fprintf(stderr, "Data write error - putword\n"); } } #ifdef PROTYPES long getword(FILE *fp) #else long getword(fp) FILE *fp; #endif { if (WTMP == NULL) { WTMP = (long PCCRAP *)MALLOC(1,sizeof(long)); } if (getwords(WTMP, (long)sizeof(long), fp) != sizeof(long)) { fprintf(stderr, "Data read error - getword\n"); } return(*WTMP); } SHAR_EOF