home *** CD-ROM | disk | FTP | other *** search
- /* BIC: Bickel's Algorithm test program
- Spell Checker and Corrector
-
- (C)opyright May 1990, 1991 by Daniel Lawrence
- All Rights Reserved
-
- Revision History:
- 29-mar-91 Trying some filters to improve this
- */
-
- #define maindef 1
-
- #include <stdio.h>
- #include <ctype.h>
- #include "dopt.h"
- #include "dstruct.h"
- #include "ddef.h"
-
-
- #define MWSIZE 40 /* maximum # chars in a word */
-
- /* letter weighing table */
-
- char lweight[26] = {
- /* a */ 3,
- /* b */ 6,
- /* c */ 5,
- /* d */ 4,
- /* e */ 3,
- /* f */ 5,
- /* g */ 5,
- /* h */ 4,
- /* i */ 3,
- /* j */ 8,
- /* k */ 7,
- /* l */ 4,
- /* m */ 5,
- /* n */ 3,
- /* o */ 3,
- /* p */ 5,
- /* q */ 7,
- /* r */ 4,
- /* s */ 3,
- /* t */ 3,
- /* u */ 4,
- /* v */ 6,
- /* w */ 5,
- /* x */ 8,
- /* y */ 8,
- /* z */ 9
- };
-
- /* some limits */
- #define MAXMATCHES 50
-
- /* local prototypes */
-
- #if TURBO
- char *cleanword(char *word);
- char *sortword(char *word);
- char *intersect(char *word1, char *word2);
- int weight(char *cletters);
- int mismatch(char *, char *);
- #else
- char *cleanword();
- char *sortword();
- char *intersect();
- int weight();
- int mismatch();
- #endif
-
- /* GLOBALS for BIC */
-
- char testwrd[MWSIZE]; /* word to test against dictionary */
- char sortwrd[MWSIZE]; /* sorted letters of testwrd */
- char dictwrd[MWSIZE]; /* dictionary word to compare */
- int likeness; /* likeness score of current word */
- int threshold; /* threshold to display likeness */
- int max_matches; /* maximum number of matches */
- char matches[MAXMATCHES][128]; /* list of current matches */
- int match_scores[MAXMATCHES]; /* difference scores for these */
- int swterse = FALSE; /* called from EMACS? */
- int swverbose = FALSE; /* display extra info? */
- int exact_match = FALSE; /* is this word in the dictionary? */
-
- main(argc, argv)
-
- int argc; /* command line argument count */
- char **argv; /* argument vector */
-
- {
- register int dif_value; /* difference value of current word */
- register char *sortptr; /* ptr to the sorted cleaned dictionary word */
- register char first_char; /* first character in word to test */
-
- max_matches = 12; /* default list length! */
-
- if (argc < 2) {
- usage();
- exit(EXBADOPT);
- }
-
- /* deal with the command options */
- while (argc > 1) {
-
- /* grab the next option */
- --argc;
- argv++;
- strcpy(testwrd, argv[0]);
-
- if (*testwrd == '-') {
-
- switch (testwrd[1]) {
-
- case 'm':
- --argc;
- argv++;
- max_matches = atoi(argv[0]);
- break;
-
- case 't':
- swterse = TRUE;
- break;
-
- case 'v':
- swverbose = TRUE;
- break;
-
- default:
- usage();
- exit(EXBADOPT);
- }
- }
-
- }
-
- /* announce us */
- if (swterse == FALSE)
- printf("BIC %s Bickel's Test Algorithm\n", VERSION);
-
- /* initialize the lcase array */
- init_lcase();
-
- /* grab the test word */
- strcpy(sortwrd, sortword(cleanword(testwrd)));
- threshold = weight(intersect(sortwrd, sortwrd)) - 4;
- if (swverbose)
- printf("threshold = %u\n", threshold);
-
- /* initailize the list of matches */
- init_match();
-
- /* open the main dictionary */
- if (mopen() == FALSE)
- exit(EXMDICT);
-
- first_char = lcase[*testwrd];
- skipdict(first_char);
- strcpy(dictwrd, nxtmword());
-
- /* scan the dictionary */
- while (lcase[*dictwrd] == first_char) {
-
- /* dump them if off by more than 3 chars! */
- if (abs(strlen(dictwrd) - strlen(testwrd)) < 4) {
-
- /* clean and sort the word */
- sortptr = sortword(cleanword(dictwrd));
-
- /* calculate the likeness score */
- likeness = weight(intersect(sortwrd, sortptr));
-
- /* tell us about it */
- if (likeness > threshold) {
-
- /* are these nearly alike? */
- if ((dif_value = mismatch(sortwrd, sortptr)) < 20) {
- add_match(dictwrd, dif_value);
-
- /* is this the word? */
- if (strcmp(testwrd, dictwrd) == 0) {
- exact_match = TRUE;
- if (swterse) {
- printf("EXACT MATCH\n");
- mclose();
- exit(EXGOOD);
- }
- }
-
- }
- }
- }
-
- /* get a dictionary word */
- nextword: strcpy(dictwrd, nxtmword());
-
- /* at the end... skip out */
- if (strcmp(dictwrd, hivalue) == 0)
- break;
- }
-
- /* close the main dictionary */
- mclose();
-
- /* show us the results */
- show_match();
-
- /* error code of EXMISS if word is not in dictionary */
- if (exact_match) {
- exit(EXGOOD);
- } else {
- exit(EXMISS);
- }
- }
-
- usage() /* print the command line usage */
-
- {
- printf("BIC: Bickel's Algorithm Test Program\n");
- printf(" for MicroSPELL %s\n",VERSION);
- puts("\nUsage\n");
- puts(" BIC {-t|-v} {-m <n>} <test word>\n");
- puts(" -m <n> return top <n> suggestions / default = 12");
- puts(" -t terse mode");
- puts(" -v verbose mode");
- }
-
-
- #if RAMSIZE & LATTICE & MSDOS
- /* These routines will allow me to track memory usage by placing
- a layer on top of the standard system malloc() and free() calls.
- with this code defined, the number of allocated bytes is displayed
- in the upper right corner of the screen
- */
-
- #undef malloc
- #undef free
-
- char *allocate(nbytes) /* allocate nbytes and track */
-
- unsigned nbytes; /* # of bytes to allocate */
-
- {
- char *mp; /* ptr returned from malloc */
- char *malloc();
-
- mp = malloc(nbytes);
- if (mp) {
- envram += nbytes;
- #if RAMSHOW
- dspram();
- #endif
- }
-
- return(mp);
- }
-
- release(mp) /* release malloced memory and track */
-
- char *mp; /* chunk of RAM to release */
-
- {
- unsigned *lp; /* ptr to the long containing the block size */
-
- if (mp) {
- lp = ((unsigned *)mp) - 1;
-
- /* update amount of ram currently malloced */
- envram -= (long)*lp - 2;
- free(mp);
- #if RAMSHOW
- dspram();
- #endif
- }
- }
-
- #if RAMSHOW
- dspram() /* display the amount of RAM currently malloced */
-
- {
- char mbuf[20];
- char *sp;
-
- /* TTmove(term.t_nrow - 1, 70);*/
- sprintf(mbuf, "[%lu]", envram);
- sp = &mbuf[0];
- puts(sp);
- }
- #endif
- #endif
-
- #if AZTEC & MSDOS
- #undef fgetc
- /* a1getc: Get an ascii char from the file input stream
- but DO NOT strip the high bit
- */
-
- int a1getc(fp)
-
- FILE *fp;
-
- {
- int c; /* translated character */
-
- c = getc(fp); /* get the character */
-
- /* if its a <LF> char, throw it out */
- while (c == 10)
- c = getc(fp);
-
- /* if its a <RETURN> char, change it to a LF */
- if (c == '\r')
- c = '\n';
-
- /* if its a ^Z, its an EOF */
- if (c == 26)
- c = EOF;
-
- return(c);
- }
- #endif
-
- #if CMS
- #undef fopen
- /* The IBM 30xx likes to tell us when file opens
- fail...it's too chatty....I like to handle these myself */
-
- FILE *cmsopen(file, mode)
-
- char *file; /* name of file to open */
- char *mode; /* mode to open it in */
-
- {
- quiet(1);
- return(fopen(file,mode));
- }
- #endif
-
- /**********************************************************************/
-
- /* clean word: dump all the non alpha chars, and make them
- lowercase only
- */
-
- char *cleanword(word)
-
- char *word; /* word to clean */
-
- {
- register char *sp; /* ptr into return buffer */
- static char rbuf[MWSIZE]; /* return buffer */
-
- /* scan through the word */
- sp = rbuf;
- while (*word) {
- if (isalpha(*word)) {
- if (islower(*word))
- *sp++ = *word;
- else
- *sp++ = *word + 32;
- }
-
- ++word;
- }
-
- *sp = 0;
- return(rbuf);
- }
-
- /* Sort word: bubble sort the letters in a word....
- average word size is 4.5 chars. Bubble sort
- is most efficient at that size
- */
-
- char *sortword(word)
-
- char *word; /* word to sort letters in */
-
- {
- register int i, j; /* bubble sort indexes */
- register int temp; /* temp for swap */
- register int swapflag; /* did a swap occur? */
- static char sword[MWSIZE]; /* return buffer */
-
- /* copy the word to the return buffer */
- strcpy(sword, word);
-
- /* bubble sort it */
- for (i = strlen(sword) - 1; i > 0; i--) {
-
- /* scan through the list once */
- swapflag = FALSE;
- for (j = 0; j < i; j++) {
-
- /* compare */
- if (sword[j] > sword[j + 1]) {
-
- temp = sword[j];
- sword[j] = sword[j + 1];
- sword[j + 1] = temp;
- swapflag = TRUE;
- }
- }
-
- /* if no movement occurred, we are done early */
- if (swapflag == FALSE)
- break;
- }
-
- return(sword);
- }
-
- /* intersect: find all letters occuring in 2 words */
-
- char *intersect(word1, word2)
-
- char *word1; /* first word to intersect with */
- char *word2; /* second word */
-
- {
- register char *iptr; /* ptr into return buffer */
- static char rbuf[MWSIZE]; /* return buffer */
-
- /* scan through both words */
- iptr = rbuf;
- while (*word1 && *word2) {
-
- /* if they are the same, record it */
- if (*word1 == *word2) {
- *iptr = *word1++;
- ++word2;
-
- /* skip duplicates */
- while (*word1 == *iptr)
- ++word1;
- while (*word2 == *iptr)
- ++word2;
-
- ++iptr;
-
- continue;
- }
-
- /* advance the appropriate word */
- if (*word1 < *word2)
- ++word1;
- else
- ++word2;
- }
-
- /* terminate and return the intersection */
- *iptr = 0;
- return(rbuf);
- }
-
- /* weight: using the common letters between two words, apply
- bickels weighing function to produce a sameness score
- */
-
- int weight(cletters)
-
- char *cletters; /* common letters between two words */
-
- {
- int result; /* resulting weighing factor */
-
- /* scan the letters, add up the result */
- result = 0;
- while (*cletters)
- result += lweight[*cletters++ - 'a'];
-
- return(result);
- }
-
- /* How many characters are different between these words? */
-
- int mismatch(word1, word2)
-
- char *word1; /* first word to compare */
- char *word2; /* second word to compare */
-
- {
- int count; /* count of mismatched characters */
-
- count = 0;
- while (*word1) {
-
- /* at the end of word2? */
- if (*word2 == 0) {
- count += lweight[*word1-'a'];
- ++word1;
- } else {
-
- /* a match, advance */
- if (*word1 == *word2) {
- ++word1;
- ++word2;
- } else {
- if (*word1 < *word2) {
- count += lweight[*word1-'a'];
- ++word1;
- } else {
- count += lweight[*word2-'a'];
- ++word2;
- }
- }
- }
- }
-
- /* past the rest */
- while (*word2) {
- ++count;
- count += lweight[*word2-'a'];
- ++word2;
- }
-
- return(count);
- }
-
- init_match()
-
- {
- register int index;
-
- for (index = 0; index < max_matches; index++) {
- matches[index][0] = 0;
- match_scores[index] = 1000;
- }
- }
-
- add_match(word, score)
-
- char *word; /* word to add to match list */
- int score; /* score of that word */
-
- {
- register int index; /* ptr into match list */
- register int rest; /* ptr into rest of list to move */
-
- for (index = 0; index < max_matches; index++) {
- if (match_scores[index] > score) {
- for (rest = max_matches; rest > index; --rest) {
- match_scores[rest] = match_scores[rest-1];
- strcpy(matches[rest], matches[rest-1]);
- }
- match_scores[index] = score;
- strcpy(matches[index], word);
- return;
- }
- }
- }
-
- show_match()
-
- {
- register int index;
- register int split;
-
- if (swterse == FALSE) {
- for (index = 0; index < max_matches; index++) {
- if (match_scores[index] > 999)
- return;
- if (swverbose)
- printf("[%u] %s\n", match_scores[index], matches[index]);
- else
- printf("%s\n", matches[index]);
- }
- } else {
- split = (max_matches + 2) / 3;
- for (index = 0; index < split; index++) {
- printf("%20s %20s %20s \n",
- matches[index],
- matches[index + split],
- matches[index + split * 2]);
- }
- }
- }
-