home *** CD-ROM | disk | FTP | other *** search
- /* Copyright (c) 1994 Burra Gopal, Udi Manber. All Rights Reserved. */
-
- /*
- * hash.c: Hash table manipulation routines. Can be used to compute
- * the dictionary as well as compress files.
- */
-
- #include "defs.h"
-
- int next_free_hash = 0;
- hash_entry *free_hash = NULL; /*[DEF_MAX_WORDS]; */
-
- int next_free_str = 0;
- char *free_str = NULL; /*[DEF_MAX_WORDS * AVG_WORD_LEN]; */
-
- extern int usemalloc;
-
- /* -----------------------------------------------------------------
- input: a word (a string of ascii character terminated by NULL)
- output: a hash_value of the input word.
- hash function: if the word has length <= 4
- the hash value is just a concatenation of the last four bits
- of the characters.
- if the word has length > 4, then after the above operation,
- the hash value is updated by adding each remaining character.
- (and AND with the 16-bits mask).
- ---------------------------------------------------------------- */
- int
- thash64k(word, len)
- unsigned char *word;
- int len;
- {
- unsigned int hash_value=0;
- unsigned int mask_4=017;
- unsigned int mask_16=0177777;
- int i;
-
- if(len<=4) {
- for(i=0; i<len; i++) {
- hash_value = (hash_value << 4) | (word[i]&mask_4);
- /* hash_value = hash_value & mask_16; */
- }
- }
- else {
- for(i=0; i<4; i++) {
- hash_value = (hash_value << 4) | (word[i]&mask_4);
- /* hash_value = hash_value & mask_16; */
- }
- for(i=4; i<len; i++)
- hash_value = mask_16 & (hash_value + word[i]);
- }
- return(hash_value & mask_16);
- }
-
- hash_entry *
- get_hash(hash_table, word, len, i)
- hash_entry *hash_table[HASH_TABLE_SIZE];
- unsigned char *word;
- int len;
- int *i;
- {
- hash_entry *e;
-
- *i = thash64k(word, len);
- e = hash_table[*i];
- while(e != NULL) {
- if (!strcmp(e->word, (char *)word)) break;
- else e = e->next;
- }
- return e;
- }
-
- /*
- * Assigns either the freq or the offset to the hash-entry. The kind of
- * information in the entry depends on the caller. Advice: different
- * hash-tables must be used to store information gathered during
- * the build operation and the compress operation by the appropriate
- * module. This can be specified by passing -1's for offset/freq resply.
- */
- hash_entry *
- insert_hash(hash_table, word, len, freq, offset)
- hash_entry *hash_table[HASH_TABLE_SIZE];
- unsigned char *word;
- int len, freq, offset;
- {
- int i;
- hash_entry *e;
-
- e = get_hash(hash_table, word, len, &i);
-
- if (e == NULL) {
- hashalloc(e);
- stralloc(e->word, len + 2);
- strcpy(e->word, (char *)word);
- e->val.offset = 0;
- e->next = hash_table[i];
- hash_table[i] = e;
- }
-
- if ((offset == -1) && (freq != -1)) {
- e->val.attribute.freq += freq;
- /* e->val.attribute.index has to be accessed from outside this function */
- }
- else if ((offset != -1) && (freq == -1)) {
- e->val.offset = offset;
- /* used in building the string table from the dictionary */
- }
- else {
- fprintf(stderr, "error in accessing hash-table [frequencies/offsets]. skipping...\n");
- return (NULL);
- }
-
- #if 0
- printf("%d %x\n", i, e);
- #endif /*0*/
- return e;
- }
-
- /*
- * HASHFILE format: the hash-file is a sequence of "'\0' hash-index word-index word-name"
- * The '\0' is there to indicate that this is not a padded line. Padded lines simply have
- * a '\n' as the first character (words don't have '\0' or '\n'). The hash and word indices
- * are 2 unsigned short integers in binary, MSB first. The word name therefore starts from the
- * 5th character and continues until a '\0' or '\n' is encountered. The total size of the
- * hash-table is therefore (|avgwordlen|+5)*numwords = appx 12 * 50000 = .6MB.
- * Note that there can be multiple lines with the same hash-index.
- */
-
- /* used when computing compress's dictionary */
- int
- dump_hash(hash_table, HASHFILE)
- hash_entry *hash_table[HASH_TABLE_SIZE];
- unsigned char *HASHFILE;
- {
- int i;
- FILE *hashfp;
- int wordindex;
- hash_entry *e, *t;
-
- if ((hashfp = fopen(HASHFILE, "w")) == NULL) {
- fprintf(stderr, "cannot open for writing: %s\n", HASHFILE);
- return 0;
- }
-
- /* We have a guarantee that the wordindex + 1 cannot exceed MAX_WORDS */
- wordindex = 0;
- for(i=0; i<HASH_TABLE_SIZE; i++) {
- e = hash_table[i];
- while (e != NULL) {
- fprintf(hashfp, "%d %d %s\n", i, wordindex, e->word);
- t = e->next;
- strfree(e->word);
- hashfree(e);
- e = t;
- wordindex ++;
- }
- }
-
- fclose(hashfp);
- return wordindex;
- }
-
- /*
- * These are routines that operate on hash-tables of 4K size (used in tbuild.c)
- */
-
- /* crazy hash function that operates on 4K hashtables */
- thash4k(word, len)
- char *word;
- int len;
- {
- unsigned int hash_value=0;
- unsigned int mask_3=07;
- unsigned int mask_12=07777;
- int i;
-
- #if 0
- /* discard prefix = the directory name */
- if (len<=1) return 0;
- i = len-1;
- while(word[i] != '/') i--;
- if ((i > 0) && (word[i] == '/')) {
- word = &word[i+1];
- len = strlen(word);
- }
- #endif /*0*/
-
- if(len<=4) {
- for(i=0; i<len; i++) {
- hash_value = (hash_value << 3) | (word[i]&mask_3);
- }
- }
- else {
- for(i=0; i<4; i++) {
- hash_value = (hash_value << 3) | (word[i]&mask_3);
- }
- for(i=4; i<len; i++)
- hash_value = mask_12 & (hash_value + word[i]);
- }
- return(hash_value & mask_12);
- }
-
- hash_entry *
- get_small_hash(hash_table, word, len, i)
- hash_entry *hash_table[SMALL_HASH_TABLE_SIZE];
- unsigned char *word;
- int len;
- int *i;
- {
- hash_entry *e;
-
- *i = thash4k(word, len);
- e = hash_table[*i];
- while(e != NULL) {
- if (!strcmp(e->word, (char *)word)) break;
- else e = e->next;
- }
- return e;
- }
-
- hash_entry *
- insert_small_hash(hash_table, word, len, freq, offset)
- hash_entry *hash_table[SMALL_HASH_TABLE_SIZE];
- unsigned char *word;
- int len, freq, offset;
- {
- int i;
- hash_entry *e;
-
- e = get_small_hash(hash_table, word, len, &i);
-
- if (e == NULL) {
- hashalloc(e);
- stralloc(e->word, len + 2);
- strcpy(e->word, (char *)word);
- e->val.offset = 0;
- e->next = hash_table[i];
- hash_table[i] = e;
- }
-
- if ((offset == -1) && (freq != -1)) {
- e->val.attribute.freq += freq;
- /* e->val.attribute.index has to be accessed from outside this function */
- }
- else if ((offset != -1) && (freq == -1)) {
- e->val.offset = offset;
- /* used in building the string table from the dictionary */
- }
- else {
- fprintf(stderr, "error in accessing hash-table [frequencies/offsets]. skipping...\n");
- return (NULL);
- }
-
- #if 0
- printf("%d %x\n", i, e);
- #endif /*0*/
- return e;
- }
-
- int
- dump_small_hash(hash_table, HASHFILE)
- hash_entry *hash_table[SMALL_HASH_TABLE_SIZE];
- unsigned char *HASHFILE;
- {
- int i;
- FILE *hashfp;
- int wordindex;
- hash_entry *e, *t;
-
- if ((hashfp = fopen(HASHFILE, "w")) == NULL) {
- fprintf(stderr, "cannot open for writing: %s\n", HASHFILE);
- return 0;
- }
-
- /* We have a guarantee that the wordindex + 1 cannot exceed MAX_WORDS */
- wordindex = 0;
- for(i=0; i<SMALL_HASH_TABLE_SIZE; i++) {
- e = hash_table[i];
- while (e != NULL) {
- fprintf(hashfp, "%d %d %s\n", thash64k(e->word, strlen(e->word)), wordindex, e->word); /* must look like I used 64K table */
- t = e->next;
- strfree(e->word);
- hashfree(e);
- e = t;
- wordindex ++;
- }
- }
-
- fclose(hashfp);
- return wordindex;
- }
-
- /*
- * These are again routines that operate on big (64k) hash-tables
- */
-
- /* used only during debugging to see if output = input */
- int
- dump_hash_debug(hash_table, HASHFILE)
- hash_entry *hash_table[HASH_TABLE_SIZE];
- unsigned char *HASHFILE;
- {
- int i;
- FILE *hashfp;
- hash_entry *e;
-
- if ((hashfp = fopen(HASHFILE, "w")) == NULL) {
- fprintf(stderr, "cannot open for writing: %s\n", HASHFILE);
- return 0;
- }
-
- /* We have a guarantee that the wordindex + 1 cannot exceed MAX_WORDS */
- for(i=0; i<HASH_TABLE_SIZE; i++) {
- e = hash_table[i];
- while (e != NULL) {
- fprintf(hashfp, "%d %d %d %s\n", i, e->val.attribute.freq, e->val.attribute.index, e->word);
- e = e->next;
- }
- }
-
- fclose(hashfp);
- return 1;
- }
-
- /*
- * VERY particular to the format of the hash-table file:
- * -- does an fscanf+2atoi's+strlen all in one scan.
- * Returns 0 if you are in padded are, -1 on EOF, else ~.
- */
- int
- myhashread(fp, pint1, pint2, str, plen)
- FILE *fp;
- int *pint1;
- int *pint2;
- char *str;
- int *plen;
- {
- int numread;
- int int1, int2;
- int c;
-
- if((int1 = getc(fp)) == '\n') return 0; /* padded area */
- if(int1 != 0) return -1; /* formatting error! */
- if ((int1 = getc(fp)) == EOF) return -1;
- if ((int2 = getc(fp)) == EOF) return -1;
- *pint1 = (int1 << 8) | int2; /* hashindex */
- if ((int1 = getc(fp)) == EOF) return -1;
- if ((int2 = getc(fp)) == EOF) return -1;
- *pint2 = (int1 << 8) | int2; /* wordindex */
-
- numread = 5;
- *plen = 0; /* wordname */
- while((c = getc(fp)) != EOF) {
- if ( (c == '\0') || (c == '\n') ){
- ungetc(c, fp);
- str[*plen] = '\0';
- return numread;
- }
- str[(*plen)++] = c;
- numread ++;
- if (numread >= MAX_NAME_LEN) {
- str[*plen - 1] = '\0';
- return numread;
- }
- }
- return -1;
- }
-
- int
- tbuild_hash(hash_table, hashfp, bytestoread)
- hash_entry *hash_table[HASH_TABLE_SIZE];
- FILE *hashfp;
- int bytestoread;
- {
- int hashindex;
- int wordindex;
- int numread = 0;
- int ret;
- int len;
- char *word;
- char dummybuf[MAX_WORD_BUF];
- hash_entry *e;
-
- if (bytestoread == -1) { /* read until end of file */
- while (1)
- {
- if (usemalloc) word = dummybuf;
- else {
- if (free_str == NULL) free_str = (char *)malloc(AVG_WORD_LEN * DEF_MAX_WORDS);
- if (free_str == NULL) break;
- word = &free_str[next_free_str];
- }
- if ((ret = myhashread(hashfp, &hashindex, &wordindex, word, &len)) == 0) continue;
- if (ret == -1) break;
- if ((hashindex >= HASH_TABLE_SIZE) || (hashindex < 0)) continue; /* ignore */
- hashalloc(e);
- if (usemalloc) {
- if ((word = (char *)malloc(len + 2)) == NULL) break;
- strcpy(word, dummybuf);
- }
- else next_free_str += len + 2;
- e->word = word;
- e->val.attribute.freq = 0; /* just exists in compress's dict: not found in text-file yet! */
- e->val.attribute.index = wordindex;
- e->next = hash_table[hashindex];
- hash_table[hashindex] = e;
- #if 0
- printf("word=%s index=%d\n", word, wordindex);
- #endif /*0*/
- }
- }
- else { /* read only a specified number of bytes */
- while (bytestoread > numread)
- {
- if (usemalloc) word = dummybuf;
- else {
- if (free_str == NULL) free_str = (char *)malloc(AVG_WORD_LEN * DEF_MAX_WORDS);
- if (free_str == NULL) break;
- word = &free_str[next_free_str];
- }
- if ((ret = myhashread(hashfp, &hashindex, &wordindex, word, &len)) <= 0) break;
- if ((hashindex >= HASH_TABLE_SIZE) || (hashindex < 0)) continue; /* ignore */
- hashalloc(e);
- if (usemalloc) {
- if ((word = (char *)malloc(len + 2)) == NULL) break;
- strcpy(word, dummybuf);
- }
- else next_free_str += len + 2;
- e->word = word;
- e->val.attribute.freq = 0; /* just exists in compress's dict: not found in text-file yet! */
- e->val.attribute.index = wordindex;
- e->next = hash_table[hashindex];
- hash_table[hashindex] = e;
- wordindex ++;
- numread += ret;
- #if 0
- printf("%d %d %s\n", hashindex, wordindex, word);
- #endif /*0*/
- }
- }
-
- return (wordindex + 1); /* the highest indexed word + 1 */
- }
-
- /*
- * Interprets srcbuf as a series of words separated by newlines and looks
- * for a complete occurrence of words in patbuf in it. If there IS an occurrence,
- * it builds the hash-table for THAT page. The hashfp must start at the
- * beginning on each call.
- */
- int
- build_partial_hash(hash_table, hashfp, srcbuf, srclen, patbuf, patlen, blocksize, loaded_hash_table)
- hash_entry *hash_table[HASH_TABLE_SIZE];
- FILE *hashfp;
- unsigned char *srcbuf;
- int srclen;
- unsigned char *patbuf;
- int patlen;
- int blocksize;
- char loaded_hash_table[HASH_FILE_BLOCKS];
- {
- unsigned char *srcpos;
- unsigned char *srcinit, *srcend, dest[MAX_NAME_LEN];
- int blockindex = 0;
- int i, initlen, endlen;
- unsigned char *strings[MAX_NAME_LEN]; /* maximum pattern length */
- int numstrings = 0;
- int inword = 0;
-
- /*
- * Find all the relevant strings in the pattern.
- */
- i = 0;
- while(i<patlen) {
- if (isalnum(patbuf[i])) {
- if (!inword) {
- strings[numstrings++] = &dest[i];
- inword = 1;
- }
- if (isupper(patbuf[i])) dest[i] = tolower(patbuf[i]);
- else dest[i] = patbuf[i];
- }
- else {
- dest[i] = '\0'; /* ignore that character */
- inword = 0;
- }
- i++;
- }
- #if 0
- for (i=0; i<numstrings; i++) printf("word%d=%s\n", i, strings[i]);
- getchar();
- #endif /*0*/
-
- srcpos = srcbuf;
- while (srcpos < (srcbuf + srclen)) {
- srcinit = srcpos;
- initlen = strlen((char *)srcinit);
- srcend = srcinit + initlen + 1;
- endlen = strlen((char *)srcend);
- #if 0
- printf("%s -- %s\n", srcinit, srcend);
- #endif /*0*/
- for (i=0; i<numstrings; i++)
- if ((strcmp((char *)strings[i], (char *)srcinit) >= 0) && (strcmp((char *)strings[i], (char *)srcend) <= 0)) goto include_page;
- blockindex++;
- srcpos += (initlen + endlen + 2);
- continue;
-
- include_page: /* Include it if any of the patterns fit within this range */
- if (loaded_hash_table[blockindex++]) continue;
- #if 0
- printf("build_partial_hash: hashing words in page# %d\n", blockindex);
- #endif /*0*/
- loaded_hash_table[blockindex - 1] = 1;
- fseek(hashfp, (blockindex-1)*blocksize, 0);
- tbuild_hash(hash_table, hashfp, blocksize);
- srcpos += (initlen + endlen + 2);
- }
- return 0;
- }
-
- pad_hash_file(filename, FILEBLOCKSIZE)
- unsigned char *filename;
- int FILEBLOCKSIZE;
- {
- FILE *outfp, *infp, *indexfp;
- int offset = 0, len;
- unsigned char buf[MAX_NAME_LEN];
- int pid = getpid();
- int i;
- unsigned char word[MAX_NAME_LEN];
- unsigned char prev_word[MAX_NAME_LEN];
- unsigned int hashindex, wordindex;
-
- if ((infp = fopen(filename, "r")) == NULL) {
- fprintf(stderr, "cannot open for reading: %s\n", filename);
- exit(2);
- }
- sprintf(buf, "%s.index", filename);
- if ((indexfp = fopen(buf, "w")) == NULL) {
- fprintf(stderr, "cannot open for writing: %s\n", buf);
- fclose(infp);
- exit(2);
- }
- sprintf(buf, "%s.%d", filename, pid);
- if ((outfp = fopen(buf, "w")) == NULL) {
- fprintf(stderr, "cannot open for writing: %s\n", buf);
- fclose(infp);
- fclose(indexfp);
- exit(2);
- }
- if ((FILEBLOCKSIZE % MIN_BLOCKSIZE) != 0) {
- fprintf(stderr, "invalid block size %d: changing to %d\n", FILEBLOCKSIZE, MIN_BLOCKSIZE);
- FILEBLOCKSIZE = MIN_BLOCKSIZE;
- }
- fprintf(indexfp, "%d\n", FILEBLOCKSIZE);
-
- if ((char*)buf != fgets(buf, MAX_NAME_LEN, infp)) goto end_of_input;
- len = strlen((char *)buf);
-
- sscanf(buf, "%d %d %s\n", &hashindex, &wordindex, word);
- putc(0, outfp);
- putc((hashindex & 0xff00)>>8, outfp);
- putc((hashindex & 0x00ff), outfp);
- putc((wordindex & 0xff00)>>8, outfp);
- putc((wordindex & 0x00ff), outfp);
- fprintf(outfp, "%s", word);
-
- buf[len-1] = '\0'; /* fgets gives you the newline too */
- for (i=0; i< len; i++) if (isupper(buf[i])) buf[i] = tolower(buf[i]);
- for (i=len-2; i>=0; i--) if (buf[i] == ' ') { i++; break; }
- if (i < 0) i = 0;
- strcpy((char *)prev_word, (char *)&buf[i]);
-
- fprintf(indexfp, "%s", &buf[i]); /* the first word */
- putc(0, indexfp); /* null terminated */
- offset += strlen((char *)word)+5;
-
- while(fgets(buf, MAX_NAME_LEN, infp) == (char *)buf) {
- len = strlen((char *)buf);
- if (offset + len > FILEBLOCKSIZE) {
- /* Put the last char of the prev. page */
- fprintf(indexfp, "%s", prev_word);
- putc(0, indexfp); /* null terminated */
-
- for (i=0; i<FILEBLOCKSIZE-offset; i++) /* fill up with so many newlines until the next block size */
- putc('\n', outfp);
-
-
- sscanf(buf, "%d %d %s\n", &hashindex, &wordindex, word);
- putc(0, outfp);
- putc((hashindex & 0xff00)>>8, outfp);
- putc((hashindex & 0x00ff), outfp);
- putc((wordindex & 0xff00)>>8, outfp);
- putc((wordindex & 0x00ff), outfp);
- fprintf(outfp, "%s", word);
-
- buf[len-1] = '\0'; /* fgets gives you the newline too */
- for (i=0; i< len; i++) if (isupper(buf[i])) buf[i] = tolower(buf[i]);
- for (i=len-2; i>=0; i--) if (buf[i] == ' ') { i++; break; }
- if (i < 0) i = 0;
- strcpy((char *)prev_word, (char *)&buf[i]);
-
- fprintf(indexfp, "%s", &buf[i]); /* store the first word at each page */
- putc(0, indexfp); /* null terminated */
- offset = 0;
- }
- else {
- sscanf(buf, "%d %d %s\n", &hashindex, &wordindex, word);
- putc(0, outfp);
- putc((hashindex & 0xff00)>>8, outfp);
- putc((hashindex & 0x00ff), outfp);
- putc((wordindex & 0xff00)>>8, outfp);
- putc((wordindex & 0x00ff), outfp);
- fprintf(outfp, "%s", word);
-
- buf[len-1] = '\0'; /* fgets gives you the newline too */
- for (i=0; i<len; i++) if (isupper(buf[i])) buf[i] = tolower(buf[i]);
- for (i=len-2; i>=0; i--) if (buf[i] == ' ') { i++; break; }
- if (i < 0) i = 0;
- strcpy((char *)prev_word, (char *)&buf[i]);
- }
- offset += strlen((char *)word)+5;
- }
- fprintf(indexfp, "%s", prev_word);
- putc(0, indexfp); /* null terminated */
-
- end_of_input:
- fclose(infp);
- fflush(outfp);
- fclose(outfp);
- fflush(indexfp);
- fclose(indexfp);
- sprintf(buf, "mv %s.%d %s\n", filename, pid, filename);
- system(buf);
- }
-
-