home *** CD-ROM | disk | FTP | other *** search
- /****************************************************************************
- *****************************************************************************
- FILE : soundex.c
- FUNCTION : This module contains the two algorithms SOUNDEX and PHONIX as
- they were defined by Gadd.
- NOTES : This is an ANSI-C version of the original C++ file.
- LITERATURE: T.N. Gadd: 'Fishing fore Werds': Phonetic Retrieval of written
- text in Information Retrieval Systems, Program 22/3, 1988,
- p.222-237.
- T.N. Gadd: PHONIX --- The Algorithm, Program 24/4, 1990,
- p.363-366.
- *****************************************************************************
- ****************************************************************************/
-
- /* #define TEST */ /* activates procedures main() and PrintCode() */
- /* #define DEBUG */ /* activates some debug information */
-
- #include <ctype.h>
- #include <string.h>
-
- #ifdef TEST
- #include <stdio.h>
- #endif /* TEST */
-
- #ifdef WIN32
- #include <stdio.h>
- #endif /* WIN32 */
-
- /****************************************************************************
- NAME : StrDel
- INPUT : char *DelPos --- pointer to first char to be deleted
- int DelSize --- number of chars to be deleted
- OUTPUT : char *DelPos
- FUNCTION: This procedure deletes DelSize chars at position DelPos and moves
- the remaining chars left to DelPos.
- EXAMPLE : If Del is pointing at the L of the string "DELETE" the call
- StrDel(Del, 2) will return Del pointing at "TE".
- ****************************************************************************/
- void StrDel (DelPos, DelSize)
- char *DelPos;
- int DelSize;
- {
- /* move chars left */
- char *Help = DelPos + DelSize;
- while (*Help)
- *DelPos++ = *Help++;
-
- /* move trailing \0 */
- *DelPos = *Help;
- }
-
-
- /****************************************************************************
- NAME : StrIns
- INPUT : char *InsPos --- pointer to insert position
- OUTPUT : char *InStr --- new string to be inserted
- FUNCTION: StrIns moves the chars at position InsPos right and copies the
- string InsStr into this free space.
- EXAMPLE : If Ins is pointing at the S of the string "INSERT" the call
- StrIns(Ins, "NEW") will return Ins pointing at "NEWSERT".
- ****************************************************************************/
- void StrIns (InsPos, InsStr)
- char *InsPos;
- char *InsStr;
- {
- int i;
- int MoveSize = strlen(InsStr);
-
- /* move chars right */
- for (i = strlen(InsPos)+1; i >= 0; i--)
- InsPos[i+MoveSize] = InsPos[i];
-
- /* copy InsStr to InsPos */
- while (*InsStr)
- *InsPos++ = *InsStr++;
- }
-
-
- /****************************************************************************
- NAME : IsVowel
- INPUT : char c --- char to be examined
- OUTPUT : int --- 1 or 0
- FUNCTION: IsVowel checks if c is an uppercase vowel or an uppercase Y. If c
- is one of those chars IsVowel will return a 1, else it will return
- a 0.
- ****************************************************************************/
- int IsVowel (c)
- unsigned char c;
- {
- return (c == 'A') || (c == 'E') || (c == 'I') ||
- (c == 'O') || (c == 'U') || (c == 'Y') ||
- (c == 0304) || (c == 0344) || (c == 0334) ||
- (c == 0366) || (c == 0326) || (c == 0374);
- }
-
-
- /****************************************************************************
- NAME : SoundexCode
- INPUT : char *Name --- string to be calculated
- OUTPUT : char *Key --- soundex code for Name
- FUNCTION: This procedure calculates a four-letter soundex code for the string
- Name and returns this code in the string Key.
- ****************************************************************************/
- #define SoundexLen 4 /* length of a soundex code */
- #define SoundexKey "Z000" /* default key for soundex code */
-
- char SCode(c)
- unsigned char c;
- {
- /* set letter values */
- static int Code[] = {0, 1, 2, 3, 0, 1, 2, 0, 0, 2, 2, 4, 5, 5, 0,
- 1, 2, 6, 2, 3, 0, 1, 0, 2, 0, 2};
-
- if (c == 95) return(2); /* german sz */
- return(Code[toupper(c)-'A']);
- }
-
- void SoundexCode (Name, Key)
- char *Name;
- char *Key;
- {
- char LastLetter;
- int Index;
-
- /* set default key */
- strcpy(Key, SoundexKey);
-
- /* keep first letter */
- Key[0] = *Name;
- LastLetter = *Name;
- Name++;
-
- /* scan rest of string */
- for (Index = 1; (Index < SoundexLen) && *Name; Name++)
- {
- /* use only letters */
- if (isalpha(*Name))
- {
- /* ignore duplicate successive chars */
- if (LastLetter != *Name)
- {
- /* new LastLetter */
- LastLetter = *Name;
-
- /* ignore letters with code 0 */
- if (SCode(*Name) != 0)
- {
- Key[Index] = '0' + SCode(*Name);
- Index++;
- }
- }
- }
- }
- }
-
-
- /****************************************************************************
- NAME : PhonixCode
- INPUT : char *Name --- string to be calculated
- OUTPUT : char *Key --- phonix code for Name
- FUNCTION: This procedure calculates a eight-letter phonix code for the string
- Name and returns this code in the string Key.
- ****************************************************************************/
- #define PhonixLen 8 /* length of a phonix code */
- #define PhonixKey "Z0000000" /* default key for phonix code */
-
- void PhonixCode (Name, Key)
- char *Name;
- char *Key;
- {
- int Code[25];
- char LastLetter;
- int Index;
-
- /* set letter values */
- Code[0] = 0;
- Code[1] = 1;
- Code[2] = 2;
- Code[3] = 3;
- Code[4] = 0;
- Code[5] = 7;
- Code[6] = 2;
- Code[7] = 0;
- Code[8] = 0;
- Code[9] = 2;
- Code[10] = 2;
- Code[11] = 4;
- Code[12] = 5;
- Code[13] = 5;
- Code[14] = 0;
- Code[15] = 1;
- Code[16] = 2;
- Code[17] = 6;
- Code[18] = 8;
- Code[19] = 3;
- Code[20] = 0;
- Code[21] = 7;
- Code[22] = 0;
- Code[23] = 8;
- Code[24] = 0;
- Code[25] = 8;
-
- /* set default key */
- strcpy(Key, PhonixKey);
-
- /* keep first letter or replace it with '$' */
- Key[0] = IsVowel(*Name) ? '$' : *Name;
- LastLetter = *Name;
- Name++;
-
- /* NOTE: Gadd replaces vowels being the first letter of the */
- /* word with a 'v'. Due to the implementation of WAIS all */
- /* letters will be lowercased. Therefore '$' is used instead */
- /* of 'v'. */
-
- /* scan rest of string */
- for (Index = 1; (Index < PhonixLen) && *Name; Name++)
- {
- /* use only letters */
- if (isalpha(*Name))
- {
- /* ignore duplicate successive chars */
- if (LastLetter != *Name)
- {
- LastLetter = *Name;
-
- /* ignore letters with code 0 except as separators */
- if (Code[toupper(*Name)-'A'] != 0)
- {
- Key[Index] = '0' + Code[toupper(*Name)-'A'];
- Index++;
- }
- }
- }
- }
- }
-
-
- /****************************************************************************
- NAME : PhonixReplace1
- INPUT : int where --- replace OldStr only if it occurs at this position
- char *Name --- string to work
- char *OldStr --- old letter group to delete
- char *NewStr --- new letter group to insert
- int CondPre --- condition referring to letter before OldStr
- int CondPost --- condition referring to letter after OldStr
- OUTPUT : char *Name --- Name with replaced letter group
- FUNCTION: This procedure replaces the letter group OldStr with the letter
- group NewStr in the string Name, regarding the position of OldStr
- where (START, MIDDLE, END, ALL) and the conditions CondPre and
- CondPost (NON, VOC, CON).
- EXAMPLE : PhonixReplace1(START, "WAWA", "W", "V", NON, NON) replaces only the
- first W with a V because of the condition START.
- EXAMPLE : PhonixReplace1(START, "WAWA", "W", "V", NON, CON) replaces neither
- the first W with a V (because of the condition CON, i.e. a consonant
- must follow the W) nor the second W (because of the condition START).
- ****************************************************************************/
- #define NON 1 /* no condition */
- #define VOC 2 /* vowel needed */
- #define CON 3 /* consonant needed */
-
- #define START 1 /* condition refers to beginning of Name */
- #define MIDDLE 2 /* condition refers to middle of Name */
- #define END 3 /* condition refers to EndPos of Name */
- #define ALL 4 /* condition refers to whole Name */
-
- void PhonixReplace1 (Where, Name, OldStr, NewStr, CondPre, CondPost)
- int Where;
- char *Name;
- char *OldStr;
- char *NewStr;
- int CondPre;
- int CondPost;
- {
- char *OldStrPos;
- char *EndPos;
- char *NamePtr = Name;
-
- /* vowels before or after OldStr */
- char LetterPre; /* letter before OldStr */
- char LetterPost; /* letter after OldStr */
- #ifndef WIN32
- int VowelPre; /* LetterPre is vowel? */
- int VowelPost; /* LetterPost is vowel? */
- #endif
- int OkayPre; /* pre-condition okay? */
- int OkayPost; /* post-condition okay? */
-
- do
- {
- /* find OldStr in NamePtr */
- OldStrPos = strstr(NamePtr, OldStr);
-
- /* find EndPos of Name */
- EndPos = &Name[strlen(Name)-strlen(OldStr)];
-
- /* check conditions if OldStrPos != NULL */
- if (OldStrPos)
- {
- /* vowel before OldStrPos */
- LetterPre = *(OldStrPos-1);
- /* vowel after OldStrPos+strlen(OldStr) */
- LetterPost = *(OldStrPos+strlen(OldStr));
-
- /* check conditions */
- switch (CondPre)
- {
- case NON: OkayPre = 1;
- break;
- case VOC: OkayPre = LetterPre ? IsVowel(LetterPre) : 0;
- break;
- case CON: OkayPre = LetterPre ? !IsVowel(LetterPre) : 0;
- break;
- default : OkayPre = 0;
- break;
- }
- switch (CondPost)
- {
- case NON: OkayPost = 1;
- break;
- case VOC: OkayPost = LetterPost ? IsVowel(LetterPost) : 0;
- break;
- case CON: OkayPost = LetterPost ? !IsVowel(LetterPost) : 0;
- break;
- default : OkayPost = 0;
- break;
- }
- }
-
- /* replace OldStr with NewStr */
- if (OldStrPos && OkayPre && OkayPost &&
- ((Where == START) && (OldStrPos == Name) ||
- (Where == MIDDLE) && (OldStrPos != Name) && (OldStrPos != EndPos) ||
- (Where == END) && (OldStrPos == EndPos) ||
- (Where == ALL)))
- {
- /* replace old letter group with new letter group */
- StrDel(OldStrPos, strlen(OldStr));
- StrIns(OldStrPos, NewStr);
-
- /* advance NamePtr to the position of OldStr */
- NamePtr = OldStrPos;
-
- #ifdef DEBUG
- printf("Replace = %s-->%s\n", OldStr, NewStr);
- #endif /* DEBUG */
- }
- else
- /* advance NamePtr one char */
- NamePtr++;
- }
- while (OldStrPos);
- }
-
-
- /****************************************************************************
- NAME : PhonixReplace2
- INPUT : int where --- replace OldStr only if it occurs at this position
- char *Name --- string to work
- char *OldStr --- old letter group to delete
- char *NewStr --- new letter group to insert
- OUTPUT : char *Name --- Name with replaced letter group
- FUNCTION: This procedure replaces the letter group OldStr with the letter
- group NewStr in the string Name, regarding the position of OldStr
- where (START, MIDDLE, END, ALL).
- EXAMPLE : PhonixReplace2(START, "WAWA", "W", "V") replaces only the first W
- with a V because of the condition START.
- ****************************************************************************/
- void PhonixReplace2 (Where, Name, OldStr, NewStr)
- int Where;
- char *Name;
- char *OldStr;
- char *NewStr;
- {
- char *OldStrPos;
- char *EndPos;
- char *NamePtr = Name;
-
- do
- {
- /* find OldStr in NamePtr */
- OldStrPos = strstr(NamePtr, OldStr);
-
- /* find EndPos of Name */
- EndPos = &Name[strlen(Name)-strlen(OldStr)];
-
- /* replace OldStr with NewStr */
- if (OldStrPos &&
- ((Where == START) && (OldStrPos == Name) ||
- (Where == MIDDLE) && (OldStrPos != Name) && (OldStrPos != EndPos) ||
- (Where == END) && (OldStrPos == EndPos) ||
- (Where == ALL)))
- {
- /* replace old letter group with new letter group */
- StrDel(OldStrPos, strlen(OldStr));
- StrIns(OldStrPos, NewStr);
-
- /* advance NamePtr to the position of OldStr */
- NamePtr = OldStrPos;
-
- #ifdef DEBUG
- printf("Replace = %s-->%s\n", OldStr, NewStr);
- #endif /* DEBUG */
- }
- else
- /* advance NamePtr one char */
- NamePtr++;
- }
- while (OldStrPos);
- }
-
-
- /****************************************************************************
- NAME : Phonix
- INPUT : char *Name --- string to calculate phonix code for
- OUTPUT : char *Key --- phonix code of Name
- FUNCTION: Phonix calculates the phonix code for the string Name.
- ****************************************************************************/
- void Phonix (Name, Key)
- char *Name;
- char *Key;
- {
- /* use new variable NewName to remain Name unchanged */
- char NewName[50];
- int i;
-
- strcpy(NewName, Name);
-
- /* uppercase NewName */
- #ifdef WIN32
- for (i=0; i < (int)strlen(NewName); i++)
- #else
- for (i=0; i < strlen(NewName); i++)
- #endif
- if (islower(NewName[i]))
- NewName[i] = toupper(NewName[i]);
-
- #ifdef DEBUG
- printf("Name = %s\n", NewName);
- #endif /* DEBUG */
-
- /* replace letter groups according to Gadd's definition */
- PhonixReplace2(ALL , NewName, "DG" , "G" );
- PhonixReplace2(ALL , NewName, "CO" , "KO" );
- PhonixReplace2(ALL , NewName, "CA" , "KA" );
- PhonixReplace2(ALL , NewName, "CU" , "KU" );
- PhonixReplace2(ALL , NewName, "CY" , "SI" );
- PhonixReplace2(ALL , NewName, "CI" , "SI" );
- PhonixReplace2(ALL , NewName, "CE" , "SE" );
- PhonixReplace1(START , NewName, "CL" , "KL" , NON, VOC);
- PhonixReplace2(ALL , NewName, "CK" , "K" );
- PhonixReplace2(END , NewName, "GC" , "K" );
- PhonixReplace2(END , NewName, "JC" , "K" );
- PhonixReplace1(START , NewName, "CHR" , "KR" , NON, VOC);
- PhonixReplace1(START , NewName, "CR" , "KR" , NON, VOC);
- PhonixReplace2(START , NewName, "WR" , "R" );
- PhonixReplace2(ALL , NewName, "NC" , "NK" );
- PhonixReplace2(ALL , NewName, "CT" , "KT" );
- PhonixReplace2(ALL , NewName, "PH" , "F" );
- PhonixReplace2(ALL , NewName, "AA" , "AR" );
- PhonixReplace2(ALL , NewName, "SCH" , "SH" );
- PhonixReplace2(ALL , NewName, "BTL" , "TL" );
- PhonixReplace2(ALL , NewName, "GHT" , "T" );
- PhonixReplace2(ALL , NewName, "AUGH" , "ARF" );
- PhonixReplace1(MIDDLE, NewName, "LJ" , "LD" , VOC, VOC);
- PhonixReplace2(ALL , NewName, "LOUGH", "LOW" );
- PhonixReplace2(START , NewName, "Q" , "KW" );
- PhonixReplace2(START , NewName, "KN" , "N" );
- PhonixReplace2(END , NewName, "GN" , "N" );
- PhonixReplace2(ALL , NewName, "GHN" , "N" );
- PhonixReplace2(END , NewName, "GNE" , "N" );
- PhonixReplace2(ALL , NewName, "GHNE" , "NE" );
- PhonixReplace2(END , NewName, "GNES" , "NS" );
- PhonixReplace2(START , NewName, "GN" , "N" );
- PhonixReplace1(MIDDLE, NewName, "GN" , "N" , NON, CON);
- PhonixReplace1(END , NewName, "GN" , "N" , NON, NON); /* NON,CON */
- PhonixReplace2(START , NewName, "PS" , "S" );
- PhonixReplace2(START , NewName, "PT" , "T" );
- PhonixReplace2(START , NewName, "CZ" , "C" );
- PhonixReplace1(MIDDLE, NewName, "WZ" , "Z" , VOC, NON);
- PhonixReplace2(MIDDLE, NewName, "CZ" , "CH" );
- PhonixReplace2(ALL , NewName, "LZ" , "LSH" );
- PhonixReplace2(ALL , NewName, "RZ" , "RSH" );
- PhonixReplace1(MIDDLE, NewName, "Z" , "S" , NON, VOC);
- PhonixReplace2(ALL , NewName, "ZZ" , "TS" );
- PhonixReplace1(MIDDLE, NewName, "Z" , "TS" , CON, NON);
- PhonixReplace2(ALL , NewName, "HROUG", "REW" );
- PhonixReplace2(ALL , NewName, "OUGH" , "OF" );
- PhonixReplace1(MIDDLE, NewName, "Q" , "KW" , VOC, VOC);
- PhonixReplace1(MIDDLE, NewName, "J" , "Y" , VOC, VOC);
- PhonixReplace1(START , NewName, "YJ" , "Y" , NON, VOC);
- PhonixReplace2(START , NewName, "GH" , "G" );
- PhonixReplace1(END , NewName, "E" , "GH" , VOC, NON);
- PhonixReplace2(START , NewName, "CY" , "S" );
- PhonixReplace2(ALL , NewName, "NX" , "NKS" );
- PhonixReplace2(START , NewName, "PF" , "F" );
- PhonixReplace2(END , NewName, "DT" , "T" );
- PhonixReplace2(END , NewName, "TL" , "TIL" );
- PhonixReplace2(END , NewName, "DL" , "DIL" );
- PhonixReplace2(ALL , NewName, "YTH" , "ITH" );
- PhonixReplace1(START , NewName, "TJ" , "CH" , NON, VOC);
- PhonixReplace1(START , NewName, "TSJ" , "CH" , NON, VOC);
- PhonixReplace1(START , NewName, "TS" , "T" , NON, VOC);
- PhonixReplace1(ALL , NewName, "TCH" , "CH" );
- PhonixReplace1(MIDDLE, NewName, "WSK" , "VSKIE", VOC, NON);
- PhonixReplace1(END , NewName, "WSK" , "VSKIE", VOC, NON);
- PhonixReplace1(START , NewName, "MN" , "N" , NON, VOC);
- PhonixReplace1(START , NewName, "PN" , "N" , NON, VOC);
- PhonixReplace1(MIDDLE, NewName, "STL" , "SL" , VOC, NON);
- PhonixReplace1(END , NewName, "STL" , "SL" , VOC, NON);
- PhonixReplace2(END , NewName, "TNT" , "ENT" );
- PhonixReplace2(END , NewName, "EAUX" , "OH" );
- PhonixReplace2(ALL , NewName, "EXCI" , "ECS" );
- PhonixReplace2(ALL , NewName, "X" , "ECS" );
- PhonixReplace2(END , NewName, "NED" , "ND" );
- PhonixReplace2(ALL , NewName, "JR" , "DR" );
- PhonixReplace2(END , NewName, "EE" , "EA" );
- PhonixReplace2(ALL , NewName, "ZS" , "S" );
- PhonixReplace1(MIDDLE, NewName, "R" , "AH" , VOC, CON);
- PhonixReplace1(END , NewName, "R" , "AH" , VOC, NON); /* VOC,CON */
- PhonixReplace1(MIDDLE, NewName, "HR" , "AH" , VOC, CON);
- PhonixReplace1(END , NewName, "HR" , "AH" , VOC, NON); /* VOC,CON */
- PhonixReplace1(END , NewName, "HR" , "AH" , VOC, NON);
- PhonixReplace2(END , NewName, "RE" , "AR" );
- PhonixReplace1(END , NewName, "R" , "AH" , VOC, NON);
- PhonixReplace2(ALL , NewName, "LLE" , "LE" );
- PhonixReplace1(END , NewName, "LE" , "ILE" , CON, NON);
- PhonixReplace1(END , NewName, "LES" , "ILES" , CON, NON);
- PhonixReplace2(END , NewName, "E" , "" );
- PhonixReplace2(END , NewName, "ES" , "S" );
- PhonixReplace1(END , NewName, "SS" , "AS" , VOC, NON);
- PhonixReplace1(END , NewName, "MB" , "M" , VOC, NON);
- PhonixReplace2(ALL , NewName, "MPTS" , "MPS" );
- PhonixReplace2(ALL , NewName, "MPS" , "MS" );
- PhonixReplace2(ALL , NewName, "MPT" , "MT" );
-
- /* calculate Key for NewName */
- PhonixCode(NewName, Key);
-
- #ifdef DEBUG
- printf("NewName = %s\n", NewName);
- printf("Code = %s\n\n", Key);
- #endif /* DEBUG */
- }
-
-
- /****************************************************************************
- NAME : Soundex
- INPUT : char *Name --- string to calculate soundex code for
- OUTPUT : char *Key --- soundex code of Name
- FUNCTION: Soundex calculates the soundex code for the string Name.
- ****************************************************************************/
-
- void Soundex (Name, Key)
- char *Name;
- char *Key;
- {
- /* use new variable NewName to remain Name unchanged */
- char NewName[50];
- int i;
-
- strcpy(NewName, Name);
-
- /* uppercase NewName */
- #ifdef WIN32
- for (i=0; i < (int)strlen(NewName); i++)
- #else
- for (i=0; i < strlen(NewName); i++)
- #endif
- if (islower(NewName[i]))
- NewName[i] = toupper(NewName[i]);
-
- /* calculate Key for Name */
- SoundexCode(NewName, Key);
- /* fprintf(stderr, "Soundex: %s -> %s\n", Name, Key); */
- }
-
-
- /****************************************************************************
- Now the two procedures PrintCode() and main() follow which will only be
- included if TEST is defined.
- ****************************************************************************/
-
- #ifdef TEST
-
- void PrintCode (Name)
- unsigned char *Name;
- {
- unsigned char SoundexName[SoundexLen+1];
- unsigned char PhonixName[PhonixLen+1];
-
- Soundex(Name, SoundexName);
- Phonix(Name, PhonixName);
- printf("%20s --> %s %s\n", Name, SoundexName, PhonixName);
- }
-
-
- void main ()
- {
- unsigned char s[256];
- PrintCode("CLASSEN");
- PrintCode("WRITE");
- PrintCode("WRIGHT");
- PrintCode("RITE");
- PrintCode("WHITE");
- PrintCode("WAIT");
- PrintCode("WEIGHT");
- PrintCode("KNIGHT");
- PrintCode("NIGHT");
- PrintCode("NITE");
- PrintCode("GNOME");
- PrintCode("NOAM");
- PrintCode("SMIDT");
- PrintCode("SMITH");
- PrintCode("SCHMIT");
- PrintCode("CRAFT");
- PrintCode("KRAFT");
- PrintCode("REES");
- PrintCode("REECE");
- PrintCode("YAEGER");
- PrintCode("YOGA");
- PrintCode("EAGER");
- PrintCode("AUGER");
- PrintCode("Krueger");
- PrintCode("Kruger");
- PrintCode("Krⁿger");
- while (1) {
- PrintCode(gets(s));
- }
- }
-
- #endif /* TEST */
-