Skunkware 5

home *** CD-ROM | disk | FTP | other *** search

/ Skunkware 5 / Skunkware 5.iso / src / Tools / glimpse-2.1 / compress / hash.c < prev next >

Wrap

C/C++ Source or Header | 1995-05-16 | 16.7 KB | 638 lines

/* Copyright (c) 1994 Burra Gopal, Udi Manber. All Rights Reserved. */ /* * hash.c: Hash table manipulation routines. Can be used to compute * the dictionary as well as compress files. */ #include "defs.h" int next_free_hash = 0; hash_entry *free_hash = NULL; /*[DEF_MAX_WORDS]; */ int next_free_str = 0; char *free_str = NULL; /*[DEF_MAX_WORDS * AVG_WORD_LEN]; */ extern int usemalloc; /* ----------------------------------------------------------------- input: a word (a string of ascii character terminated by NULL) output: a hash_value of the input word. hash function: if the word has length <= 4 the hash value is just a concatenation of the last four bits of the characters. if the word has length > 4, then after the above operation, the hash value is updated by adding each remaining character. (and AND with the 16-bits mask). ---------------------------------------------------------------- */ int thash64k(word, len) unsigned char *word; int len; { unsigned int hash_value=0; unsigned int mask_4=017; unsigned int mask_16=0177777; int i; if(len<=4) { for(i=0; i<len; i++) { hash_value = (hash_value << 4) | (word[i]&mask_4); /* hash_value = hash_value & mask_16; */ } } else { for(i=0; i<4; i++) { hash_value = (hash_value << 4) | (word[i]&mask_4); /* hash_value = hash_value & mask_16; */ } for(i=4; i<len; i++) hash_value = mask_16 & (hash_value + word[i]); } return(hash_value & mask_16); } hash_entry * get_hash(hash_table, word, len, i) hash_entry *hash_table[HASH_TABLE_SIZE]; unsigned char *word; int len; int *i; { hash_entry *e; *i = thash64k(word, len); e = hash_table[*i]; while(e != NULL) { if (!strcmp(e->word, (char *)word)) break; else e = e->next; } return e; } /* * Assigns either the freq or the offset to the hash-entry. The kind of * information in the entry depends on the caller. Advice: different * hash-tables must be used to store information gathered during * the build operation and the compress operation by the appropriate * module. This can be specified by passing -1's for offset/freq resply. */ hash_entry * insert_hash(hash_table, word, len, freq, offset) hash_entry *hash_table[HASH_TABLE_SIZE]; unsigned char *word; int len, freq, offset; { int i; hash_entry *e; e = get_hash(hash_table, word, len, &i); if (e == NULL) { hashalloc(e); stralloc(e->word, len + 2); strcpy(e->word, (char *)word); e->val.offset = 0; e->next = hash_table[i]; hash_table[i] = e; } if ((offset == -1) && (freq != -1)) { e->val.attribute.freq += freq; /* e->val.attribute.index has to be accessed from outside this function */ } else if ((offset != -1) && (freq == -1)) { e->val.offset = offset; /* used in building the string table from the dictionary */ } else { fprintf(stderr, "error in accessing hash-table [frequencies/offsets]. skipping...\n"); return (NULL); } #if 0 printf("%d %x\n", i, e); #endif /*0*/ return e; } /* * HASHFILE format: the hash-file is a sequence of "'\0' hash-index word-index word-name" * The '\0' is there to indicate that this is not a padded line. Padded lines simply have * a '\n' as the first character (words don't have '\0' or '\n'). The hash and word indices * are 2 unsigned short integers in binary, MSB first. The word name therefore starts from the * 5th character and continues until a '\0' or '\n' is encountered. The total size of the * hash-table is therefore (|avgwordlen|+5)*numwords = appx 12 * 50000 = .6MB. * Note that there can be multiple lines with the same hash-index. */ /* used when computing compress's dictionary */ int dump_hash(hash_table, HASHFILE) hash_entry *hash_table[HASH_TABLE_SIZE]; unsigned char *HASHFILE; { int i; FILE *hashfp; int wordindex; hash_entry *e, *t; if ((hashfp = fopen(HASHFILE, "w")) == NULL) { fprintf(stderr, "cannot open for writing: %s\n", HASHFILE); return 0; } /* We have a guarantee that the wordindex + 1 cannot exceed MAX_WORDS */ wordindex = 0; for(i=0; i<HASH_TABLE_SIZE; i++) { e = hash_table[i]; while (e != NULL) { fprintf(hashfp, "%d %d %s\n", i, wordindex, e->word); t = e->next; strfree(e->word); hashfree(e); e = t; wordindex ++; } } fclose(hashfp); return wordindex; } /* * These are routines that operate on hash-tables of 4K size (used in tbuild.c) */ /* crazy hash function that operates on 4K hashtables */ thash4k(word, len) char *word; int len; { unsigned int hash_value=0; unsigned int mask_3=07; unsigned int mask_12=07777; int i; #if 0 /* discard prefix = the directory name */ if (len<=1) return 0; i = len-1; while(word[i] != '/') i--; if ((i > 0) && (word[i] == '/')) { word = &word[i+1]; len = strlen(word); } #endif /*0*/ if(len<=4) { for(i=0; i<len; i++) { hash_value = (hash_value << 3) | (word[i]&mask_3); } } else { for(i=0; i<4; i++) { hash_value = (hash_value << 3) | (word[i]&mask_3); } for(i=4; i<len; i++) hash_value = mask_12 & (hash_value + word[i]); } return(hash_value & mask_12); } hash_entry * get_small_hash(hash_table, word, len, i) hash_entry *hash_table[SMALL_HASH_TABLE_SIZE]; unsigned char *word; int len; int *i; { hash_entry *e; *i = thash4k(word, len); e = hash_table[*i]; while(e != NULL) { if (!strcmp(e->word, (char *)word)) break; else e = e->next; } return e; } hash_entry * insert_small_hash(hash_table, word, len, freq, offset) hash_entry *hash_table[SMALL_HASH_TABLE_SIZE]; unsigned char *word; int len, freq, offset; { int i; hash_entry *e; e = get_small_hash(hash_table, word, len, &i); if (e == NULL) { hashalloc(e); stralloc(e->word, len + 2); strcpy(e->word, (char *)word); e->val.offset = 0; e->next = hash_table[i]; hash_table[i] = e; } if ((offset == -1) && (freq != -1)) { e->val.attribute.freq += freq; /* e->val.attribute.index has to be accessed from outside this function */ } else if ((offset != -1) && (freq == -1)) { e->val.offset = offset; /* used in building the string table from the dictionary */ } else { fprintf(stderr, "error in accessing hash-table [frequencies/offsets]. skipping...\n"); return (NULL); } #if 0 printf("%d %x\n", i, e); #endif /*0*/ return e; } int dump_small_hash(hash_table, HASHFILE) hash_entry *hash_table[SMALL_HASH_TABLE_SIZE]; unsigned char *HASHFILE; { int i; FILE *hashfp; int wordindex; hash_entry *e, *t; if ((hashfp = fopen(HASHFILE, "w")) == NULL) { fprintf(stderr, "cannot open for writing: %s\n", HASHFILE); return 0; } /* We have a guarantee that the wordindex + 1 cannot exceed MAX_WORDS */ wordindex = 0; for(i=0; i<SMALL_HASH_TABLE_SIZE; i++) { e = hash_table[i]; while (e != NULL) { fprintf(hashfp, "%d %d %s\n", thash64k(e->word, strlen(e->word)), wordindex, e->word); /* must look like I used 64K table */ t = e->next; strfree(e->word); hashfree(e); e = t; wordindex ++; } } fclose(hashfp); return wordindex; } /* * These are again routines that operate on big (64k) hash-tables */ /* used only during debugging to see if output = input */ int dump_hash_debug(hash_table, HASHFILE) hash_entry *hash_table[HASH_TABLE_SIZE]; unsigned char *HASHFILE; { int i; FILE *hashfp; hash_entry *e; if ((hashfp = fopen(HASHFILE, "w")) == NULL) { fprintf(stderr, "cannot open for writing: %s\n", HASHFILE); return 0; } /* We have a guarantee that the wordindex + 1 cannot exceed MAX_WORDS */ for(i=0; i<HASH_TABLE_SIZE; i++) { e = hash_table[i]; while (e != NULL) { fprintf(hashfp, "%d %d %d %s\n", i, e->val.attribute.freq, e->val.attribute.index, e->word); e = e->next; } } fclose(hashfp); return 1; } /* * VERY particular to the format of the hash-table file: * -- does an fscanf+2atoi's+strlen all in one scan. * Returns 0 if you are in padded are, -1 on EOF, else ~. */ int myhashread(fp, pint1, pint2, str, plen) FILE *fp; int *pint1; int *pint2; char *str; int *plen; { int numread; int int1, int2; int c; if((int1 = getc(fp)) == '\n') return 0; /* padded area */ if(int1 != 0) return -1; /* formatting error! */ if ((int1 = getc(fp)) == EOF) return -1; if ((int2 = getc(fp)) == EOF) return -1; *pint1 = (int1 << 8) | int2; /* hashindex */ if ((int1 = getc(fp)) == EOF) return -1; if ((int2 = getc(fp)) == EOF) return -1; *pint2 = (int1 << 8) | int2; /* wordindex */ numread = 5; *plen = 0; /* wordname */ while((c = getc(fp)) != EOF) { if ( (c == '\0') || (c == '\n') ){ ungetc(c, fp); str[*plen] = '\0'; return numread; } str[(*plen)++] = c; numread ++; if (numread >= MAX_NAME_LEN) { str[*plen - 1] = '\0'; return numread; } } return -1; } int tbuild_hash(hash_table, hashfp, bytestoread) hash_entry *hash_table[HASH_TABLE_SIZE]; FILE *hashfp; int bytestoread; { int hashindex; int wordindex; int numread = 0; int ret; int len; char *word; char dummybuf[MAX_WORD_BUF]; hash_entry *e; if (bytestoread == -1) { /* read until end of file */ while (1) { if (usemalloc) word = dummybuf; else { if (free_str == NULL) free_str = (char *)malloc(AVG_WORD_LEN * DEF_MAX_WORDS); if (free_str == NULL) break; word = &free_str[next_free_str]; } if ((ret = myhashread(hashfp, &hashindex, &wordindex, word, &len)) == 0) continue; if (ret == -1) break; if ((hashindex >= HASH_TABLE_SIZE) || (hashindex < 0)) continue; /* ignore */ hashalloc(e); if (usemalloc) { if ((word = (char *)malloc(len + 2)) == NULL) break; strcpy(word, dummybuf); } else next_free_str += len + 2; e->word = word; e->val.attribute.freq = 0; /* just exists in compress's dict: not found in text-file yet! */ e->val.attribute.index = wordindex; e->next = hash_table[hashindex]; hash_table[hashindex] = e; #if 0 printf("word=%s index=%d\n", word, wordindex); #endif /*0*/ } } else { /* read only a specified number of bytes */ while (bytestoread > numread) { if (usemalloc) word = dummybuf; else { if (free_str == NULL) free_str = (char *)malloc(AVG_WORD_LEN * DEF_MAX_WORDS); if (free_str == NULL) break; word = &free_str[next_free_str]; } if ((ret = myhashread(hashfp, &hashindex, &wordindex, word, &len)) <= 0) break; if ((hashindex >= HASH_TABLE_SIZE) || (hashindex < 0)) continue; /* ignore */ hashalloc(e); if (usemalloc) { if ((word = (char *)malloc(len + 2)) == NULL) break; strcpy(word, dummybuf); } else next_free_str += len + 2; e->word = word; e->val.attribute.freq = 0; /* just exists in compress's dict: not found in text-file yet! */ e->val.attribute.index = wordindex; e->next = hash_table[hashindex]; hash_table[hashindex] = e; wordindex ++; numread += ret; #if 0 printf("%d %d %s\n", hashindex, wordindex, word); #endif /*0*/ } } return (wordindex + 1); /* the highest indexed word + 1 */ } /* * Interprets srcbuf as a series of words separated by newlines and looks * for a complete occurrence of words in patbuf in it. If there IS an occurrence, * it builds the hash-table for THAT page. The hashfp must start at the * beginning on each call. */ int build_partial_hash(hash_table, hashfp, srcbuf, srclen, patbuf, patlen, blocksize, loaded_hash_table) hash_entry *hash_table[HASH_TABLE_SIZE]; FILE *hashfp; unsigned char *srcbuf; int srclen; unsigned char *patbuf; int patlen; int blocksize; char loaded_hash_table[HASH_FILE_BLOCKS]; { unsigned char *srcpos; unsigned char *srcinit, *srcend, dest[MAX_NAME_LEN]; int blockindex = 0; int i, initlen, endlen; unsigned char *strings[MAX_NAME_LEN]; /* maximum pattern length */ int numstrings = 0; int inword = 0; /* * Find all the relevant strings in the pattern. */ i = 0; while(i<patlen) { if (isalnum(patbuf[i])) { if (!inword) { strings[numstrings++] = &dest[i]; inword = 1; } if (isupper(patbuf[i])) dest[i] = tolower(patbuf[i]); else dest[i] = patbuf[i]; } else { dest[i] = '\0'; /* ignore that character */ inword = 0; } i++; } #if 0 for (i=0; i<numstrings; i++) printf("word%d=%s\n", i, strings[i]); getchar(); #endif /*0*/ srcpos = srcbuf; while (srcpos < (srcbuf + srclen)) { srcinit = srcpos; initlen = strlen((char *)srcinit); srcend = srcinit + initlen + 1; endlen = strlen((char *)srcend); #if 0 printf("%s -- %s\n", srcinit, srcend); #endif /*0*/ for (i=0; i<numstrings; i++) if ((strcmp((char *)strings[i], (char *)srcinit) >= 0) && (strcmp((char *)strings[i], (char *)srcend) <= 0)) goto include_page; blockindex++; srcpos += (initlen + endlen + 2); continue; include_page: /* Include it if any of the patterns fit within this range */ if (loaded_hash_table[blockindex++]) continue; #if 0 printf("build_partial_hash: hashing words in page# %d\n", blockindex); #endif /*0*/ loaded_hash_table[blockindex - 1] = 1; fseek(hashfp, (blockindex-1)*blocksize, 0); tbuild_hash(hash_table, hashfp, blocksize); srcpos += (initlen + endlen + 2); } return 0; } pad_hash_file(filename, FILEBLOCKSIZE) unsigned char *filename; int FILEBLOCKSIZE; { FILE *outfp, *infp, *indexfp; int offset = 0, len; unsigned char buf[MAX_NAME_LEN]; int pid = getpid(); int i; unsigned char word[MAX_NAME_LEN]; unsigned char prev_word[MAX_NAME_LEN]; unsigned int hashindex, wordindex; if ((infp = fopen(filename, "r")) == NULL) { fprintf(stderr, "cannot open for reading: %s\n", filename); exit(2); } sprintf(buf, "%s.index", filename); if ((indexfp = fopen(buf, "w")) == NULL) { fprintf(stderr, "cannot open for writing: %s\n", buf); fclose(infp); exit(2); } sprintf(buf, "%s.%d", filename, pid); if ((outfp = fopen(buf, "w")) == NULL) { fprintf(stderr, "cannot open for writing: %s\n", buf); fclose(infp); fclose(indexfp); exit(2); } if ((FILEBLOCKSIZE % MIN_BLOCKSIZE) != 0) { fprintf(stderr, "invalid block size %d: changing to %d\n", FILEBLOCKSIZE, MIN_BLOCKSIZE); FILEBLOCKSIZE = MIN_BLOCKSIZE; } fprintf(indexfp, "%d\n", FILEBLOCKSIZE); if ((char*)buf != fgets(buf, MAX_NAME_LEN, infp)) goto end_of_input; len = strlen((char *)buf); sscanf(buf, "%d %d %s\n", &hashindex, &wordindex, word); putc(0, outfp); putc((hashindex & 0xff00)>>8, outfp); putc((hashindex & 0x00ff), outfp); putc((wordindex & 0xff00)>>8, outfp); putc((wordindex & 0x00ff), outfp); fprintf(outfp, "%s", word); buf[len-1] = '\0'; /* fgets gives you the newline too */ for (i=0; i< len; i++) if (isupper(buf[i])) buf[i] = tolower(buf[i]); for (i=len-2; i>=0; i--) if (buf[i] == ' ') { i++; break; } if (i < 0) i = 0; strcpy((char *)prev_word, (char *)&buf[i]); fprintf(indexfp, "%s", &buf[i]); /* the first word */ putc(0, indexfp); /* null terminated */ offset += strlen((char *)word)+5; while(fgets(buf, MAX_NAME_LEN, infp) == (char *)buf) { len = strlen((char *)buf); if (offset + len > FILEBLOCKSIZE) { /* Put the last char of the prev. page */ fprintf(indexfp, "%s", prev_word); putc(0, indexfp); /* null terminated */ for (i=0; i<FILEBLOCKSIZE-offset; i++) /* fill up with so many newlines until the next block size */ putc('\n', outfp); sscanf(buf, "%d %d %s\n", &hashindex, &wordindex, word); putc(0, outfp); putc((hashindex & 0xff00)>>8, outfp); putc((hashindex & 0x00ff), outfp); putc((wordindex & 0xff00)>>8, outfp); putc((wordindex & 0x00ff), outfp); fprintf(outfp, "%s", word); buf[len-1] = '\0'; /* fgets gives you the newline too */ for (i=0; i< len; i++) if (isupper(buf[i])) buf[i] = tolower(buf[i]); for (i=len-2; i>=0; i--) if (buf[i] == ' ') { i++; break; } if (i < 0) i = 0; strcpy((char *)prev_word, (char *)&buf[i]); fprintf(indexfp, "%s", &buf[i]); /* store the first word at each page */ putc(0, indexfp); /* null terminated */ offset = 0; } else { sscanf(buf, "%d %d %s\n", &hashindex, &wordindex, word); putc(0, outfp); putc((hashindex & 0xff00)>>8, outfp); putc((hashindex & 0x00ff), outfp); putc((wordindex & 0xff00)>>8, outfp); putc((wordindex & 0x00ff), outfp); fprintf(outfp, "%s", word); buf[len-1] = '\0'; /* fgets gives you the newline too */ for (i=0; i<len; i++) if (isupper(buf[i])) buf[i] = tolower(buf[i]); for (i=len-2; i>=0; i--) if (buf[i] == ' ') { i++; break; } if (i < 0) i = 0; strcpy((char *)prev_word, (char *)&buf[i]); } offset += strlen((char *)word)+5; } fprintf(indexfp, "%s", prev_word); putc(0, indexfp); /* null terminated */ end_of_input: fclose(infp); fflush(outfp); fclose(outfp); fflush(indexfp); fclose(indexfp); sprintf(buf, "mv %s.%d %s\n", filename, pid, filename); system(buf); }