home *** CD-ROM | disk | FTP | other *** search
- /* WIDE AREA INFORMATION SERVER SOFTWARE:
- No guarantees or restrictions. See the readme file for the full standard
- disclaimer.
- */
-
- /* Copyright (c) CNIDR (see ../COPYRIGHT) */
-
-
- /* Hash Table routines
- * for strings based on Common Lisp interface functions.
- * -brewster
- */
-
- /* This hashtable will never grow, rather it just adds entries to the buckets.
- *
- * The contents are all in one contiguous block so that the contents can be
- * sorted. This will cause problems because of the reallocs needed.
- *
- */
-
- /* Change log:
- * $Log: hash.c,v $
- * Revision 1.1 1993/02/16 15:05:35 freewais
- * Initial revision
- *
- * Revision 1.9 92/05/06 17:27:38 jonathan
- * Added cast for htable->contents malloc.
- *
- * Revision 1.8 92/02/21 11:06:53 jonathan
- * Added header, RCSIdent.
- *
- * Revision 1.7 92/02/16 12:37:57 jonathan
- * Changed bzero's to memset's.
- *
- * Revision 1.6 92/02/12 13:20:08 jonathan
- * Added "$Log" so RCS will put the log message in the header
- *
- */
-
- /* Cool things we could do from here:
- A) Use a signature to see if the word is in the hashtable, so that
- get_hash will return NULL very quickly for the new words. This will reduce
- the number of times a whole bucket must be searched. This might not help
- much since the file that I was indexing had 8M words, and only 350k
- different words, so there are 1 NULL in 30 requests expected. Therefore,
- unless searching down a list of buckets takes a long time, this is not a
- big problem. The only case I can think of is if it induces paging.
- B) Since the most recently accessed entry of the hashtable is at the
- front, we can sort the hashtable entries so that the heads of all the
- buckets are clumped, so that hopefully these will stay in memory, and the
- infrequently used ones would move to the paged out space.
- */
-
- #ifndef lint
- static char *RCSid = "$Header: /usr/users/freewais/FreeWAIS-0.1/ir/hash.c,v 1.1 1993/02/16 15:05:35 freewais Exp $";
- #endif
-
- #include "cdialect.h"
- #include "cutil.h"
- #include "hash.h"
- #include <string.h>
- #include "panic.h"
-
- #define ALIGNMENT_SIZE 4 /* the word size to align elements of the contents on */
-
-
- hashtable*
- make_hashtable (number_of_buckets, number_of_elements, element_size)
- long number_of_buckets;
- long number_of_elements;
- long element_size;
- {
- hashtable *htable = (hashtable *)s_malloc(sizeof(hashtable));
-
- if(number_of_buckets > 0)
- htable->number_of_buckets = number_of_buckets;
- else
- htable->number_of_buckets = DEFAULT_NUMBER_OF_BUCKETS;
-
- htable->buckets_mask = htable->number_of_buckets - 1;
-
- if(element_size > 0){
- htable->element_size = element_size;
- }
- else
- return(NULL);
-
- htable->number_of_elements = 0;
-
- if(number_of_elements > 0)
- htable->max_number_of_elements = number_of_elements;
- else
- htable->max_number_of_elements = DEFAULT_NUMBER_OF_ELEMENTS;
-
- htable->buckets
- = (long*)s_malloc(sizeof(long) * htable->number_of_buckets);
- /* this depends on -1L to be made up of -1 characters */
- memset(htable->buckets, -1, sizeof(long) * htable->number_of_buckets);
-
- htable->contents
- = (hash_entry*)s_malloc(htable->max_number_of_elements *
- htable->element_size);
- return(htable);
- }
-
-
- /*----------------------------------------------------------------- */
-
- /* returns a hashcode for a string.
- * key is a string that is assumed to be downcased (if appropriate),
- * function is a number from 0 to 20 that are fairly independent hash
- * functions.
- */
-
-
- /*===========================*
- *=== Hashing Functions ===*
- *===========================*/
-
-
- /*___________________________________________________________________________*/
-
- /* Date: Fri, 13 Dec 91 16:28:53 EST
- From: Henry Massalin <henry@cs.columbia.edu> */
-
- #define reg register
-
- hash(s)
- reg unsigned char *s;
- {
- reg int c;
- reg unsigned long h = 0;
- extern unsigned long RandHash[];
-
- if((h = (unsigned long)*s++) == 0)
- return(h);
- while((c = *s++) != 0)
- h += RandHash[ 0xFF & ((h >> 24) ^ c )];
- return(h);
- }
-
- /*___________________________________________________________________________*/
-
- /*
- * Almost any random permutation will work. Of course, now that I've
- * picked this one, everyone else must use it also, to be consistent.
- *
- * This array is constructed from four independent permutations of the
- * integers 0..255. A different permutation in each of the bytes.
- */
- unsigned long
- RandHash[256] = {
- 0x4BEF74B7,0x7FE1F5A6,0xB4C08BF6,0xC2CF0CD0,
- 0x788E2A29,0xB043B6B9,0x5B56C90C,0x324D8FE7,
- 0x8D9C854A,0xCAE06AC8,0x902076A1,0x949EAA31,
- 0xDC070A08,0xF7499B6D,0x17883EAA,0x22A65A35,
- 0x34FC6407,0x8322F367,0xD105BB0F,0xD8A29526,
- 0x8A1B708A,0xBE85B1E1,0xC66AC334,0xC1B13A1C,
- 0xEDF644E4,0x3EA3806C,0x2732D31D,0xEBF504A9,
- 0x08BB2B52,0x4775992C,0x3C0B8CF7,0x5FA0AEA2,
- 0x7616D761,0x66AA86DB,0xC73305C0,0x09E6EA99,
- 0xC9789DD6,0xF47340F1,0x3941303E,0x7542233A,
- 0xE0812ED1,0xFF2E54FE,0x07B887EB,0x68CD0E93,
- 0xB124C17F,0xD01A9C5B,0x96B490ED,0xA7CAE5FC,
- 0x18D41F37,0x6E8DA1BC,0x9B631940,0x35ABCFD4,
- 0xACA1FB0A,0x2F71B921,0x6F10A747,0xA6680796,
- 0x01F12271,0xFDBE0378,0x02F0FF79,0xEC3FF2D2,
- 0x84485C9C,0x60BF9113,0x98CC35E3,0xAA2A8D4D,
- 0xA5CBA2E9,0xF0AE1EF2,0x990118E2,0x64A966F5,
- 0x8595490D,0x7E9B4391,0xE8311B7D,0x2474BCBF,
- 0x19D6E286,0xDFD32D5C,0xAB5116CA,0x7D72D6D5,
- 0x0BF44F5E,0x6170B3D3,0x2C471CC2,0x2EFD1118,
- 0x8C878836,0x62FE52E0,0x2B8C1277,0x2ACEBDEC,
- 0xB2E8A8A3,0x30B757AE,0x708BBA8D,0x501EF654,
- 0x49A7EF1E,0xFE7636AD,0xB54BBE4F,0x1ADBDB8C,
- 0x72E24DD9,0xF68A2517,0xF56FEEDF,0x5A7EB75D,
- 0xD606F9E6,0xC32526B8,0x5EB613F3,0x0C08CBDE,
- 0x8B965E59,0xB361EB75,0x3FADE465,0xB8FFD16F,
- 0x26442988,0x00D2D830,0x3D58FE41,0x3B386E44,
- 0x1297E914,0xAFC11083,0xC5EBE8B6,0x15A49ECD,
- 0xF950C263,0xA0DA0982,0x4CBDF101,0xE227F77E,
- 0x556DB8DA,0xA1BA2125,0x1E3BD409,0xFCF25155,
- 0x110A983B,0x7B7F1A4C,0xE7FB4C69,0xC8994E1F,
- 0x586C4A42,0x5C1FAF2F,0xAD777D48,0x9186696B,
- 0xD52181B4,0xE94C39EF,0x9EDEB2FF,0x802FDA20,
- 0xA228D902,0x52BC7CFD,0x53DD47AC,0x6D12D25F,
- 0xCE1D6D8F,0xF12C8E2B,0xDD5700C1,0x9A402851,
- 0x386E53D7,0x0DF88281,0x4A55CDC4,0x653692B0,
- 0x203CAC8E,0x747C6164,0xC47A02B2,0x1B4FC82D,
- 0xF8934B7A,0x36A82CB5,0x690CD080,0x2800A524,
- 0x1C045D0E,0xBF09CAF0,0x8814ABB3,0xE3D0776A,
- 0x219AA6D8,0xCC375653,0xFA2327A8,0x57F75F27,
- 0x732DA903,0x8F696738,0x3A453223,0xDA1C9F32,
- 0xCF90152A,0x0F64BFF4,0xD7533FA5,0x8E7D55DC,
- 0xBC598998,0x45110DBA,0x4D92E6FB,0x7746C01A,
- 0x951348A0,0xBBFA7E9B,0x055EAD7B,0x06B36BBB,
- 0x0EE31D97,0x37297F11,0xEF529450,0x51C6E168,
- 0x1F67C63F,0x6B023443,0x449F5BFA,0x48C39A4B,
- 0xD30E426E,0x0ADFE31B,0x467B6085,0x7CC49676,
- 0x140DFA60,0xE16B7274,0x235AEC05,0xD2D7DCC6,
- 0x13946212,0x339D2462,0xFB82F8EE,0xB6197970,
- 0xBD0F3C9D,0xE64E3B28,0xE4B07A87,0x54C92F95,
- 0x6CC583AB,0xD4182015,0x86DC755A,0xA930A333,
- 0x82B273EA,0x923EED7C,0xF354A02E,0xEEEEA439,
- 0x1D89089A,0xD9EDDEE8,0x1017B504,0xA45CB0C7,
- 0x93C7E0A4,0x43D95858,0xDB654594,0x31D578CF,
- 0xCDF98489,0xB915C7F9,0x67846FE5,0x04D150AF,
- 0x5DF314DD,0xCB911772,0xA8627156,0x258FC573,
- 0xEAA5E79F,0x633D4119,0x793501C9,0x7AD80F0B,
- 0x4E5DF04E,0x97AF6884,0xAE3A9745,0xA3ACF48B,
- 0x16398A10,0xDE8346C3,0x42C20606,0x03EAB4B1,
- 0x8798FDCC,0x9CE4FC22,0x81B5DD3D,0x563431BE,
- 0x2D799349,0xF2036C92,0xB7E97B3C,0x71E559C5,
- 0x5960C4BD,0x295B63F8,0xBA5F0B57,0xE5B93DCE,
- 0x6A4ACC90,0x89EC6516,0x4180D566,0x40263300,
- 0x9DE738CB,0x4F2B3746,0x9F66DF9E,0xC0C8CEA7
- };
-
- /*___________________________________________________________________________*/
-
- /*
- * Create permutations
- */
- #if 0
- #define N 256
- #define ulong unsigned long
-
- main()
- {
- int i, j;
- ulong p[N], q[N];
-
- permute(p);
- for(i=3; --i >= 0; ) {
- permute(q);
- for(j=N; --j >= 0; )
- p[j] = (p[j] << 8) + q[j];
- }
-
- printf("ulong RandHash[%d] = {", N);
- for(i=0; i<N; i++)
- printf("%s0x%08X", ((i&3) ? "," : "\n\t"), p[i]);
- printf("\n};\n\n");
- }
-
- permute(p)
- ulong *p;
- {
- int i, r, t;
-
- /* initialize array */
- for(i=N; --i >= 0; )
- p[i] = i;
-
- /* generate random permutation */
- for(i=N; --i >= 0; ) {
- r = (i * (random() & 0xFFFF)) >> 16;
- t = p[r]; p[r] = p[i]; p[i] = t;
- }
- #if 0
- /* make sure there are no cycles of len < N */
- for(i=N; --i >= 0; ) {
- }
- #endif
- }
-
- #endif
-
- /* --------------------------- */
-
- #ifdef UNUSED
-
- /* courtesy ses@ccgr.technion.ac.il, but it turns out in
- informal timings that it increases the index time. sigh. */
-
- static char coeff[] = {
- 61,59,53,47,43,41,37,31,29,23,17,13,11,7,3,1,
- 61,59,53,47,43,41,37,31,29,23,17,13,11,7,3,1,
- 61,59,53,47,43,41,37,31,29,23,17,13,11,7,3,1,
- 61,59,53,47,43,41,37,31,29,23,17,13,11,7,3,1,
- 61,59,53,47,43,41,37,31,29,23,17,13,11,7,3,1,
- 61,59,53,47,43,41,37,31,29,23,17,13,11,7,3,1,
- 61,59,53,47,43,41,37,31,29,23,17,13,11,7,3,1,
- 61,59,53,47,43,41,37,31,29,23,17,13,11,7,3,1};
-
-
- long hash_code(key, function)
- char *key;
- long function;
- {
- register char *foo;
- register long hash = 0;
- register int l;
-
- for(l=function,foo=key;l<sizeof(coeff) && *foo ;l++)
- hash = hash + (*(foo++) * coeff[l]);
-
- return(hash);
- }
-
- #endif /* def UNUSED */
-
-
- /*----------------------------------------------------------------- */
-
- #define HASHTABLE_GROW_FACTOR 4
-
- hash_entry *put_hash(key, htable, element)
- char *key;
- hashtable *htable;
- hash_entry *element;
- {
- /* allocate the entry out of the contents area, then add the entry to
- the head of the bucket list. */
-
- /* old long bucket_number = hash_code(key, 0) & htable->buckets_mask; */
- long bucket_number = hash((unsigned char*)key) & htable->buckets_mask;
- long old_bucket_value = (htable->buckets)[bucket_number];
- long new_bucket_value = htable->number_of_elements++;
- hash_entry *entry_header;
-
- if(new_bucket_value >= htable->max_number_of_elements){
- long new_size =
- HASHTABLE_GROW_FACTOR * htable->max_number_of_elements *
- htable->element_size;
-
- waislog(WLOG_LOW, WLOG_INFO,
- "Expanding hashtable from %ld bytes to %ld bytes address %ld...",
- htable->max_number_of_elements * htable->element_size ,
- new_size, htable->contents);
- htable->max_number_of_elements *= HASHTABLE_GROW_FACTOR;
- htable->contents = (hash_entry*)s_realloc(htable->contents, new_size);
- if(NULL == htable->contents)
- panic("Out of virtual memory.");
- }
- entry_header = &(htable->contents)[new_bucket_value];
- memcpy((char *)entry_header, (char *)element, htable->element_size);
- /* printf("Writing key: '%s' into index %ld\n", key, new_bucket_value); */
- strncpy(entry_header->key, key, MAX_KEY_SIZE + 1);
- entry_header->next = old_bucket_value;
-
- (htable->buckets)[bucket_number] = new_bucket_value;
- return(entry_header);
- }
-
- /*----------------------------------------------------------------- */
-
- /* returns a pointer to the element stored or NULL if none. */
- hash_entry *get_hash (key, htable)
- char *key;
- hashtable *htable;
- {
- /* this looks up the value in the bucket and puts it at the head of the
- bucket. */
-
- /* old long bucket_number = hash_code(key, 0) & htable->buckets_mask; */
- long bucket_number = hash((unsigned char*)key) & htable->buckets_mask;
- long old_bucket_value = (htable->buckets)[bucket_number];
-
- long next_contents_index = old_bucket_value;
-
- while(next_contents_index != -1){
- /* keep looking down the list for the right value */
- if(0 == strcmp((htable->contents)[next_contents_index].key,
- key))
- /* found it */
- return(&(htable->contents)[next_contents_index]);
- else
- next_contents_index = (htable->contents)[next_contents_index].next;
- }
- return(NULL);
- }
-
-
- /*----------------------------------------------------------------- */
-
- boolean clr_hashtable (htable)
- hashtable *htable;
- {
- htable->number_of_elements = 0;
- memset(htable->buckets, -1, sizeof(long) * htable->number_of_buckets);
- return(true);
- }
-
- /*----------------------------------------------------------------- */
-
- /* removes contents and free's memory for the hastable.
- returns true if successful, false otherwise */
- boolean free_hashtable(htable)
- hashtable *htable;
- {
- s_free(htable->contents);
- s_free(htable->buckets);
- s_free(htable);
- return(true);
- }
-
- /*----------------------------------------------------------------- */
-
- /* returns the number of elements in the hashtable */
- long hashtable_count(htable)
- hashtable *htable;
- {
- return(htable->number_of_elements);
- }
-
- /*----------------------------------------------------------------- */
-
- #ifdef WIN32
- static int hash_entry_compare _AP((const void *i,const void *j));
-
- static int hash_entry_compare(const void *I,const void *J)
- {
- hash_entry *i, *j;
- i = (hash_entry *) I;
- j = (hash_entry *) J;
- #else
- static int hash_entry_compare _AP((hash_entry *i,hash_entry *j));
-
- static int hash_entry_compare(i,j)
- hash_entry *i;
- hash_entry *j;
- {
- #endif
- return(strcmp(i->key, j->key));
- }
-
-
- boolean sort_hashtable(htable)
- hashtable *htable;
- {
- #ifdef WIN32
- qsort((void *)htable->contents,
- #else
- qsort(htable->contents,
- #endif
- htable->number_of_elements,
- (size_t)sizeof(hash_entry),
- hash_entry_compare);
- return(true);
- }
-
- /*----------------------------------------------------------------- */
- /* print routines */
-
-
- static void print_content_element(index, htable)
- long index;
- hashtable *htable;
- {
- printf(" Index: %ld, key: '%s', Next index %ld\n",
- index, (htable->contents[index]).key,
- (htable->contents[index]).next);
- }
-
- void print_bucket(bucket, htable)
- long bucket;
- hashtable *htable;
- {
- long next_contents_index = (htable->buckets)[bucket];
-
- printf(" Bucket: %ld\n", bucket);
- while(next_contents_index != -1){
- hash_entry * entry_header = &(htable->contents)[next_contents_index];
- print_content_element(next_contents_index, htable);
- next_contents_index = entry_header->next;
- }
- return;
- }
-
- long bucket_length(bucket, htable)
- long bucket;
- hashtable *htable;
- {
- long next_contents_index = (htable->buckets)[bucket];
- long answer = 0;
- while(next_contents_index != -1){
- answer++;
- next_contents_index = ((htable->contents)[next_contents_index]).next;
- }
- return(answer);
- }
-
- static int bucket_length_compare _AP((long*i,long* j));
-
- static int bucket_length_compare(i,j)
- long *i;
- long *j;
- {
- return((*j) - (*i));
- }
-
- void analyze_hashtable_distribution(htable)
- hashtable * htable;
- {
- long count = 0;
- long max_length = 128;
- long *bucket_length_array = (long *)malloc(sizeof(long) * max_length);
- memset((char*)bucket_length_array, 0, (sizeof(long) * max_length));
-
- waislog(WLOG_LOW, WLOG_INFO, "--Hashtable Analysis--");
- waislog(WLOG_LOW, WLOG_INFO, "Number of buckets: %ld",
- htable->number_of_buckets);
- waislog(WLOG_LOW, WLOG_INFO, "Number of allocated elements %ld",
- htable->number_of_elements);
- waislog(WLOG_LOW, WLOG_INFO, "Max number of elements %ld",
- htable->max_number_of_elements);
-
- while(count < htable->number_of_buckets){
- long length = bucket_length(count, htable);
- if(length > max_length){ /* too small, start again */
- max_length *= 2;
- free((char*)bucket_length_array);
- bucket_length_array = (long *)malloc(sizeof(long) * max_length);
- memset((char*)bucket_length_array, 0, (sizeof(long) * max_length));
- count = 0;
- }
- else{
- bucket_length_array[length]++;
- count++;
- }
- }
- /* print the buckets */
- for(count = 0; count < max_length; count++){
- if(bucket_length_array[count] > 0){
- waislog(WLOG_LOW, WLOG_INFO, "Length %4ld, number of buckets: %5ld.",
- count, bucket_length_array[count]);
- }
- }
- free((char*)bucket_length_array);
- }
-
- void print_hashtable(htable)
- hashtable * htable;
- {
- long count;
- printf("Number of buckets: %ld\n", htable->number_of_buckets);
- printf("Bucket mask %ld\n", htable->buckets_mask);
- printf("Element Size %ld\n", htable->element_size);
- printf("Number of allocated elements %ld\n", htable->number_of_elements);
- printf("Max number of elements %ld\n", htable->max_number_of_elements);
-
- printf("\nContents:\n");
- for(count = 0; count < htable->number_of_elements; count++){
- print_content_element(count, htable);
- }
- printf("\nBuckets:\n");
- for(count = 0; count < htable->number_of_buckets; count++){
- if((htable->buckets)[count] != -1)
- print_bucket(count, htable);
- }
- }
-
- /* for saber trials:
-
- foo(){
- hashtable *htable;
- hash_entry entry;
- htable = make_hashtable(0, 0, sizeof(hash_entry));
- print_hashtable(htable);
- put_hash("food", htable, &entry);
- print_hashtable(htable);
- *get_hash("food", htable);
- put_hash("food", htable, &entry);
- print_hashtable(htable);
- }
- */
-