home *** CD-ROM | disk | FTP | other *** search
- /* DictToTables.c - convert an ASCII dictionary file into a file of two tables
- containing the frequency of occurrence of each trigram in the dictionary.
-
- "Words" in the dictionary are groups of 3 to MAXLEN letters. All other
- characters are ignored. Thus you may use any text file that contains
- a variety (the bigger the better) of words. You could even use a
- Latin text to generate frequency tables for Latin.
-
- The trigram frequencies are first stored in an array ( nn[][][] ),
- then put in the two tables, then the two tables are written to a file.
- The file, after compilation, may be linked with Unjumble.o or PhoneWords.o
-
- The two tables are a bit table and a nybble table, wherein the frequencies
- have been scaled from 'short int' to nybble (4 bits) size.
- The bit and nybble tables together allow the trigram frequencies to be
- reconstructed. If the frequency for a trigram is non-zero then the
- corresponding bit in the bit table is set. The frequency for that trigram
- is stored as the next nybble in the nybble table. This strategy requires only
- the non-zero frequencies be stored, with the bit table showing the
- position of the non-zero trigrams. With about 4000 non-zero trigrams only
- about 4800 bytes are required to store the trigram data.
-
- dictionary -> words -> nn[][][] -> bit_table/nybble_table -> file
-
- Ron Charlton
- 9002 Balcor Circle
- Knoxville, TN 37923
-
- Phone: (615)694-0800
-
- PLINK: R*CHARLTON
- BINTNET: charltr@utkvx1
-
- 05-Jul-90
-
- This program is in the public domain and may be used for any purpose.
-
- Usage: 1> DictToTables
-
- */
-
- #include <ctype.h>
- #include <stdio.h>
- #include <assert.h>
-
- #define MAXLEN 15
- #define MAXBUCKETS 2000
-
- #define TRUE 1
- #define FALSE 0
-
- #define BIT_TABLE_SIZE (26 * 26 * 26 / 32 + 1)
-
- unsigned long bit_table [ BIT_TABLE_SIZE ]; /* the bit table */
-
- unsigned long *nybble_table; /* the nybble table (allocated later) */
- int nybble_table_size;
-
- unsigned short nn[27][27][27]; /* array of trigram frequencies */
- short bucket [ MAXBUCKETS ];
- int nonzero; /* how many trigrams exist */
-
- char infile [ 81 ] = "dict.txt";
- char outfile [ 81 ] = "tables.c";
- char name [ 81 ];
- char wordbuf [ 256 ];
- FILE *in, *out, *fopen();
-
- main()
- {
- int len;
- unsigned long *calloc();
-
- printf ("Convert an ASCII dictionary file to one containing two tables\n"
- "encoding trigram weights (frequencies). The input file should\n"
- "many different words in the desired language.\n\n" );
-
- /* get file names and open them */
- do_infile();
- do_outfile();
-
- printf ( "analyzing dictionary...\n" );
- /* count trigrams in "words" from infile */
- while ( len = getword () )
- if ( len >= 3 && len <= MAXLEN )
- count_trigrams_in_word ( wordbuf );
-
- close ( in );
-
- printf ( "counting trigrams...\n" );
- do_statistics();
-
- nybble_table_size = nonzero / 8 + 1; /* how many longs in table */
-
- nybble_table = ( unsigned long *)
- calloc ( (size_t) nybble_table_size,
- (size_t) sizeof ( long )); /* allocate */
-
- if ( nybble_table == NULL )
- {
- printf ( "Can't allocate memory for nybble_table.\n" );
- exit ( 20 );
- }
-
- printf ( "Building tables...\n" );
- build_tables(); /* make bit and nybble tables */
-
- printf ( "writing tables to file...\n" );
- nybble_table_to_file(); /* store the nybble table */
-
- bit_table_to_file(); /* store the bit table */
-
- close ( out );
-
- print_results();
-
- }
-
-
- /*--------------------------------------------------------------------*/
-
- count_trigrams_in_word ( w )
- /* store count of each trigram in nn[][][] */
- char *w;
- {
- register int len, pos;
- if ( ( len = strlen ( w ) ) >= 3 )
- for ( pos = 0; pos <= len - 3; pos++ )
- ++ nn [ w[pos] - '@' ] [ w[pos+1] - '@' ] [ w[pos+2] - '@' ];
- }
-
-
-
- /*--------------------------------------------------------------------*/
-
- int getword ()
- /* get a word (contiguous group of letters) from infile & make uppercase.
- RETURNS - length of word in bytes */
- {
- register int inword = FALSE, pos = 0, c;
-
- while ( ( c = getc ( in ) ) != EOF )
- {
- c = toupper ( c );
- if ( isalpha ( c ) )
- {
- inword = TRUE;
- wordbuf [ pos++ ] = (char) c;
- }
- else if ( inword )
- {
- wordbuf [ pos ] = '\0';
- return ( strlen ( wordbuf ) );
- }
- }
- wordbuf [ pos ] = '\0';
- return ( strlen ( wordbuf ) );
- }
-
- /*--------------------------------------------------------------------*/
-
- do_statistics()
- /* count nonzero trigram values, etc. */
- {
- register int i, j, k;
-
- for ( i = 1; i <= 26; i++ )
- for ( j = 1; j <= 26; j++ )
- for ( k = 1; k <= 26; k++ )
- if ( nn[i][j][k] ) /* filter out zeroes */
- {
- ++nonzero;
- ++bucket [ nn[i][j][k] ];
- }
-
- }
-
-
- /*------------------------------------------------------------------------*/
-
- print_results()
- /* print results */
- {
- register int i;
- for ( i = 0; i < MAXBUCKETS; i++ )
- if ( bucket [ i ] )
- printf ( "%4d trigrams occurred %4d times\n", bucket [i], i );
-
- printf ( "non-zero trigrams = %d\n", nonzero );
- }
-
- /*-----------------------------------------------------------------------*/
-
- do_infile()
- {
- printf ( "Enter input file name [%s]: ", infile );
- gets ( name );
- if ( strlen ( name ) > 0 )
- strcpy ( infile, name );
-
- if ( ( in = fopen ( infile, "r" ) ) == NULL )
- {
- printf ( "Can't open %s for input\n", infile );
- exit ( 19 );
- }
- }
-
- /*-----------------------------------------------------------------------*/
-
- do_outfile()
- {
- printf ( "Enter output file name [%s]: ", outfile );
- gets ( name );
- if ( strlen ( name ) > 0 )
- strcpy ( outfile, name );
-
- if ( ( out = fopen ( outfile, "w" ) ) == NULL )
- {
- printf ( "Can't open %s for output\n", outfile );
- exit ( 20 );
- }
- }
-
- /*------------------------------------------------------------------------*/
-
- short log2 ( n )
- /* convert short to fit in a nybble by taking log base 2 (truncated):
- n nonzero - return log base 2 (truncated) of n.
- n zero - return 0 */
- unsigned short n;
- {
- register int log = 0;
- while ( n )
- {
- ++log;
- n >>= 1;
- }
- assert ( log < 16 ); /* so as to fit in a nybble */
- return ( log );
- }
-
- /*------------------------------------------------------------------------*/
-
- build_tables()
- /* use nn[][][] to build the bit and nybble tables */
- {
- register int ptr = 0, i, j, k;
- for ( i = 1; i <= 26; i++ )
- for ( j = 1; j <= 26; j++ )
- for ( k = 1; k <= 26; k++ )
- if ( nn [ i ] [ j ] [ k ] )
- {
- store_nybble ( ptr++, log2 ( nn [ i ] [ j ] [ k ] ) );
- set_bit ( ( i - 1 ) * 26*26 + ( j - 1 ) * 26 + ( k - 1 ) );
- }
- }
-
- /*------------------------------------------------------------------------*/
-
- store_nybble ( nyb_number, value )
- /* put value in nybble pointed to by nyb_number */
- int nyb_number, value;
- {
- int longword, nybble_in_longword;
-
- nybble_in_longword = nyb_number & 7; /* 3 LSBits pick nybble position */
- longword = nyb_number >> 3; /* which longword in table */
- assert ( longword < nybble_table_size );
- nybble_table [ longword ] |=
- (long)value << ( nybble_in_longword * 4 ); /* store it */
- }
-
- /*------------------------------------------------------------------------*/
-
- set_bit ( n )
- /* set bit n in the bit table of 32 bit ints */
- int n;
- {
- int bit, longword;
- longword = n >> 5; /* which longword in table */
- bit = n & 0x1F; /* bit 0 - 31 in longword */
- assert ( longword < BIT_TABLE_SIZE );
- assert ( bit < 32 );
- bit_table [ longword ] |= ( 1L << bit ); /* set the bit */
- }
-
- /*------------------------------------------------------------------------*/
-
- nybble_table_to_file()
- /* store the nybble table in the file */
- {
- int longword;
-
- fprintf ( out, "/* nybble table of trigram frequencies */\n\n" );
- fprintf ( out, "unsigned long nybble_table [] = { 0x%lx\n",
- nybble_table [ 0 ] );
- for ( longword = 1; longword < nybble_table_size; longword++ )
- {
- fprintf ( out, ",0x%lx", nybble_table [ longword ] );
- if ( ( longword % 6 ) == 0 )
- fprintf ( out, "\n" );
- }
- fprintf ( out, "\n\t};\n\n" );
- fprintf ( out, "int nybble_table_size = %d;\n", nybble_table_size );
- }
-
- /*--------------------------------------------------------------------*/
-
- bit_table_to_file()
- /* store the bit table in the file */
- {
- int longword;
-
- fprintf ( out, "\n\n /* = = = = = = = = = = = = = = = = = = */ \n\n" );
- fprintf ( out, "/* bit table of trigram frequencies */\n\n" );
- fprintf ( out, "unsigned long bit_table [] = { 0x%lx\n", bit_table [ 0 ] );
- for ( longword = 1; longword < BIT_TABLE_SIZE; longword++ )
- {
- fprintf ( out, ",0x%lx", bit_table [ longword ] );
- if ( ( longword % 6 ) == 0 )
- fprintf ( out, "\n" );
- }
- fprintf ( out, "\n\t};\n\n" );
- fprintf ( out, "int bit_table_size = %d;\n", BIT_TABLE_SIZE );
- }
-
- /*--------------------------- END OF FILE --------------------------------*/
-