home *** CD-ROM | disk | FTP | other *** search
- /* Copyright (c) 1994 Sun Wu, Udi Manber, Burra Gopal. All Rights Reserved. */
- /* From mail received from Bill Camargo and Darren Hardy in June 1994 */
- #include <stdio.h>
- #include "region.h"
-
- /*
- * Exports the following routines. Any filtering/attr-val parsing mechanism
- * can be integrated into glimpse and glimpseindex with this interface.
- */
-
- char * /* attrname = */ attr_id_to_name(/* int attrid */);
- int /* attrid = */ attr_name_to_id(/* char *attrname */);
- int attr_dump_names(/* char *filename */);
- int attr_load_names(/* char *filename */);
- int attr_free_table(/* void */);
- int region_initialize(/* void */);
- int region_destroy(/* void */);
- int region_create(/* char *filename */);
- int /* attrid = */ region_identify(/* int offset_in_file, int len_of_region */);
-
- #if STRUCTURED_QUERIES
-
- printsp()
- {
- int x;
-
- printf("stack at %x\n", &x);
- }
-
- /*****************************************************************************/
-
- #define ATTR_HASH_TABLE_SIZE 256 /* must be a power of 16=multiple of 4 bits */
- #define ATTR_HASH_TABLE_MASK 0xff /* bits that mask off the bits in TABLE_SIZE */
- #define ATTR_HASH_STEP_SIZE 2 /* #of nibbles that make up TABLE_SIZE */
-
- attr_element_t *attr_hash_table[ATTR_HASH_TABLE_SIZE];
- char **attr_name_table = NULL;
- int attr_num = 0;
- int attr_maxid = 0;
-
- /* English language characters have all info in lowest 4 bits */
- int
- attr_hash_index(word, len)
- char *word;
- int len;
- {
- int i=0, j, index = 0, temp;
-
- for (i=0; i+ATTR_HASH_STEP_SIZE<=len; i+=ATTR_HASH_STEP_SIZE) {
- temp = 0;
- for (j=0; j<ATTR_HASH_STEP_SIZE; j++)
- temp = (temp << 4) | word[i+j] & 0x0f;
- index = (index + temp) & ATTR_HASH_TABLE_MASK;
- }
- temp = 0;
- for (j=0; i+j<len; j++)
- temp = (temp << 4) | word[i+j] & 0x0f;
- index = (index + temp) & ATTR_HASH_TABLE_MASK;
- return index;
- }
-
- char *
- attr_id_to_name(id)
- int id;
- {
- #if 0
- printf("id = %d\n", id);
- #endif /*0*/
- if ((attr_name_table == NULL) || (id > attr_maxid)) return NULL;
- else return attr_name_table[id];
- }
-
- /*
- * returns the attribute number associated with name, 0 for no attribute --
- * NOTE: name may not be null terminated and you are not allowed to alter it.
- * called during indexing and search.
- */
- int
- attr_name_to_id(name, len)
- char *name;
- int len;
- {
- int index = attr_hash_index(name, len);
- attr_element_t *e = attr_hash_table[index];
- #if 0
- char c = name[len];
- name[len] = '\0';
- fprintf(stderr, "attr=%s @ %d?\n", name, index);
- fflush(stderr);
- name[len] = c;
- #endif /*0*/
-
- while(e != NULL) {
- if (!strncmp(e->attribute, name, len)) break;
- else e = e->next;
- }
- if (e!=NULL) {
- #if 0
- fprintf(stderr, "foundid=%d\n", e->attributeid);
- #endif /*0*/
- return e->attributeid;
- }
- return 0;
- }
-
- /*
- * returns the attribute number (> 0) for the attribute "name". It adds the
- * name as a newly seen attribute if it doesn't exist already (using #tables).
- * called in region_create, which is called during indexing.
- */
- attr_insert_name(name, len)
- char *name;
- int len;
- {
- int index = attr_hash_index(name, len);
- attr_element_t **pe = &attr_hash_table[index], *e;
-
- while(*pe != NULL) {
- if (!strcmp((*pe)->attribute, name)) break;
- else pe = &(*pe)->next;
- }
- if (*pe!=NULL) return (*pe)->attributeid;
-
- e = (attr_element_t *)my_malloc(sizeof(attr_element_t));
- e->attribute = (char *)my_malloc(len + 2);
- strncpy(e->attribute, name, len + 1);
- e->attributeid = (++attr_num);
- e->next = NULL;
- *pe = e;
- #if 0
- fprintf(stderr, "inserting %s %d\n", name, attr_num);
- #endif /*0*/
- return e->attributeid;
- }
-
- /*
- * frees current hash table of attr-value pairs.
- * called after dump in indexing, and at end of search (after previous load).
- */
- int
- attr_free_table()
- {
- int i;
- attr_element_t *e, *temp;
-
- for (i=0; i<ATTR_HASH_TABLE_SIZE; i++) {
- e = attr_hash_table[i];
- while (e != NULL) {
- temp = e->next;
- #if BG_DEBUG
- memory_usage -= strlen(e->attribute) + 2;
- #endif /*BG_DEBUG*/
- my_free(e->attribute, 0);
- my_free(e, sizeof(attr_element_t));
- e = temp;
- }
- attr_hash_table[i] = NULL;
- }
- if (attr_name_table != NULL) {
- my_free(attr_name_table, sizeof(attr_element_t *) * ATTR_HASH_TABLE_SIZE);
- attr_name_table = NULL;
- }
- return 0;
- }
-
- /* Looks for embedded attributes and copies the real attribute into dest */
- attr_extract(dest, src)
- char *dest, *src;
- {
- char *oldsrc = src;
-
- check_again:
- if (!strncmp("embed<", src, 6) || !strncmp("Embed<", src, 6) || !strncmp("EMBED<", src, 6)) {
- src += 6;
- while ((*src != '>') && (*src != '\0')) src++;
- if (*src == '\0') {
- strcpy(dest, oldsrc);
- return;
- }
- while (!isalnum(*src)) src ++; /* assuming type names are .. */
- oldsrc = src;
- goto check_again;
- }
- strcpy(dest, src);
- return;
- }
-
- /*
- * dumps the attribute-list into a file name (id, name, \n)
- * into the file specified and then destroys the hash table.
- * Returns #of attributes dumped into the file, -1 if error.
- * called at the end of indexing.
- */
- int
- attr_dump_names(filename)
- char *filename;
- {
- int i=0;
- int ret = -1;
- FILE *fp = fopen(filename, "w");
- attr_element_t *e;
-
- #if 0
- printf("in dump attr\n");
- #endif /*0*/
- if (fp == NULL) return -1;
- ret = 0;
- for (i=0; i<ATTR_HASH_TABLE_SIZE; i++) {
- e = attr_hash_table[i];
- while (e != NULL) {
- fprintf(fp, "%d,%s ", e->attributeid, e->attribute);
- e = e->next;
- ret ++;
- }
- fputc('\n', fp);
- }
- fflush(fp);
- fclose(fp);
- return ret;
- }
-
- /*
- * constructs a hash-table of attributes by reading them from the file.
- * Returns #of attributes read from the file, -1 if error.
- * Does not recompute hash-indices of attributes.
- * called before searching for attr=val pairs.
- */
- int
- attr_load_names(filename)
- char *filename;
- {
- int index = 0, ret = 0;
- FILE *fp = fopen(filename, "r");
- attr_element_t *e;
- int c = 0;
- char temp[1024]; /* max attr name */
- char buffer[1024+32];/* max attr id pair */
- int i;
- int id;
-
- attr_maxid = 0;
- memset(attr_hash_table, '\0', sizeof(attr_element_t *) * ATTR_HASH_TABLE_SIZE);
- if (fp == NULL) return -1;
- while ((c = getc(fp)) != EOF) {
- if (c == '\n') {
- index ++;
- continue;
- }
- ungetc(c, fp);
- /* fscanf screws up fp and skips over trailing space characters (\t,\n, ) */
- i=0;
- while ((c=getc(fp)) != ' ') buffer[i++] = c;
- buffer[i] = '\0';
- #if 0
- printf("buffer=%s\n", buffer);
- #endif /*0*/
- sscanf(buffer, "%d,%1023s", &id, temp);
- temp[1023] = '\0';
- #if 0
- printf("read attr=%s,%d @ %d\n", temp, id, index);
- #endif /*0*/
- if (id <= 0) continue;
- e = (attr_element_t *)my_malloc(sizeof(attr_element_t));
- e->attributeid = id;
- if (id > attr_maxid) attr_maxid = id;
- e->attribute = (char *)my_malloc(strlen(temp) + 2);
- strcpy(e->attribute, temp);
- e->next = attr_hash_table[index];
- attr_hash_table[index] = e;
- ret ++;
- if (index >= ATTR_HASH_TABLE_SIZE - 1) break;
- }
- fclose(fp);
-
- attr_name_table = (char **)my_malloc(sizeof(char *) * (ret=(ret >= (attr_maxid + 1) ? ret : (attr_maxid + 1))));
- memset(attr_name_table, '\0', sizeof(char *) * ret);
- for (i=0; i<ATTR_HASH_TABLE_SIZE; i++) {
- e = attr_hash_table[i];
- while (e!=NULL) {
- attr_name_table[e->attributeid] = e->attribute;
- e = e->next;
- }
- }
- return ret;
- }
-
- /***************************************************************************/
-
- region_t *current_regions, *nextpos; /* nextpos is hint into list */
-
- /*
- * Called during indexing before region_create.
- * returns 0.
- */
- int
- region_initialize()
- {
- attr_num = 0;
- attr_name_table = NULL;
- memset(attr_hash_table, '\0', sizeof(attr_element_t *) * ATTR_HASH_TABLE_SIZE);
- current_regions = nextpos = NULL;
- return 0;
- }
-
- /*
- * creates a data structure containing the list of attributes
- * which occur at increasing offsets in the given file -- future
- * region_identify() calls use the "current" data structure.
- * returns 0 if success, -1 if it cannot open the file.
- */
- int
- region_create(name)
- char *name;
- {
- FILE *fp;
- AVList *al;
- region_t *prl, *rl, *lastrl;
- Template *t;
- char temp[1024];
-
- current_regions = nextpos = NULL;
- if ((fp = fopen(name, "r")) == NULL) return -1;
- init_parse_template_file(fp);
-
- lastrl = NULL;
- while ((t = parse_template()) != NULL) {
- /* do insertion sort of list returned by parse_template using offsets */
- if ((t->url != NULL) && (strlen(t->url) > 0)) {
- rl = (region_t *)my_malloc(sizeof(region_t));
- /* Darren Hardy's Voodo :-) */
- /* The SOIF looks like this: @TTYPE { URL\n */
- /* t->offset points to the @ */
- /* rl->offset points to the space before URL */
- /* rl->length includes the entire URL */
- rl->offset = t->offset + strlen(t->template_type) + 3;
- rl->length = strlen(t->url) + 1;
- rl->attributeid = attr_insert_name("url", 3);
-
- if ((lastrl != NULL) && (lastrl->offset <= rl->offset)) { /* go forward */
- prl = lastrl;
- while (prl->next != NULL) {
- if (prl->next->offset > rl->offset) {
- rl->prev = prl;
- rl->next = prl->next;
- prl->next->prev = rl;
- prl->next = rl;
- lastrl = rl;
- break;
- }
- else prl = prl->next;
- }
- if (prl->next == NULL) {
- rl->next = NULL;
- rl->prev = prl;
- prl->next = rl;
- lastrl = rl;
- }
- }
- else { /* must go backwards and find the right place to insert */
- prl = lastrl;
- while (prl != NULL) {
- if (prl->offset < rl->offset) {
- rl->prev = prl;
- rl->next = prl->next;
- if (prl->next != NULL)
- prl->next->prev = rl;
- prl->next = rl;
- lastrl = rl;
- break;
- }
- else prl = prl->prev;
- }
- if (prl == NULL) {
- rl->next = current_regions;
- if (current_regions != NULL) current_regions->prev = rl;
- rl->prev = NULL;
- current_regions = rl;
- lastrl = rl;
- }
- }
-
- #if 0
- printf("region url=[%d,%d]\n", rl->offset, rl->offset+rl->length);
- #endif /*0*/
- }
-
- al = t->list;
- while(al != NULL) {
- rl = (region_t *)my_malloc(sizeof(region_t));
- rl->offset = al->data->offset;
- rl->length = al->data->vsize;
- attr_extract(temp, al->data->attribute);
- rl->attributeid = attr_insert_name(temp, strlen(temp));
-
- if ((lastrl != NULL) && (lastrl->offset <= rl->offset)) { /* go forward */
- prl = lastrl;
- while (prl->next != NULL) {
- if (prl->next->offset > rl->offset) {
- rl->prev = prl;
- rl->next = prl->next;
- prl->next->prev = rl;
- prl->next = rl;
- lastrl = rl;
- break;
- }
- else prl = prl->next;
- }
- if (prl->next == NULL) {
- rl->next = NULL;
- rl->prev = prl;
- prl->next = rl;
- lastrl = rl;
- }
- }
- else { /* must go backwards and find the right place to insert */
- prl = lastrl;
- while (prl != NULL) {
- if (prl->offset < rl->offset) {
- rl->prev = prl;
- rl->next = prl->next;
- if (prl->next != NULL)
- prl->next->prev = rl;
- prl->next = rl;
- lastrl = rl;
- break;
- }
- else prl = prl->prev;
- }
- if (prl == NULL) {
- rl->next = current_regions;
- if (current_regions != NULL) current_regions->prev = rl;
- rl->prev = NULL;
- current_regions = rl;
- lastrl = rl;
- }
- }
-
- #if 0
- printf("region %s=[%d,%d]\n", al->data->attribute, rl->offset, rl->offset+rl->length);
- #endif /*0*/
- al = al->next;
- }
- free_template(t);
- }
- finish_parse_template();
- nextpos = current_regions;
- fclose(fp);
- return 0;
- }
-
- /*
- * frees the data structure created for the current file above.
- * returns 0.
- */
- int
- region_destroy()
- {
- region_t *rl = current_regions, *trl;
-
- while (rl != NULL) {
- trl = rl;
- rl = rl->next;
- free(trl, sizeof(region_t));
- }
- current_regions = nextpos = NULL;
- return 0;
- }
-
- /*
- * returns attribute number [1..num_attr] which covers (inclusive)
- * the region * [offset, offset+len] in the "current" file, 0 if none.
- * called during indexing after region_create, and search after
- * attr_load_names. Do not need sophisticated interval trees here!
- */
- int
- region_identify(offset, len)
- int offset, len;
- {
- region_t *rl;
-
- if (nextpos == NULL) nextpos = current_regions;
- rl = nextpos;
- while (rl!=NULL) {
- if (rl->offset > offset + len)
- goto backwards; /* definitely before: can be earlier region OR hole */
- else if ((rl->offset <= offset) && (rl->offset + rl->length >= offset + len))
- return rl->attributeid; /* definitely within */
- else if (rl->offset + rl->length < offset)
- nextpos = rl = rl->next; /* definitely after: later region */
- else return 0; /* overlapping: error */
- }
- return 0; /* reached end of file */
-
- backwards:
- while (rl!=NULL) {
- if (rl->offset > offset + len)
- nextpos = rl = rl->prev; /* definitely before: earlier region */
- else if ((rl->offset <= offset) && (rl->length + rl->length >= offset + len))
- return rl->attributeid; /* definitely within */
- else if (rl->offset + rl->length < offset)
- return 0; /* hole */
- else return 0; /* overlapping: error */
- }
- return 0; /* reached end of file */
- }
-
- #else /*STRUCTURED_QUERIES*/
-
- int attr_num = 0;
-
- char *attr_id_to_name(id)
- int id;
- {
- return NULL;
- }
-
- int attr_name_to_id(name)
- char *name;
- {
- return 0;
- }
-
- int attr_dump_names(name)
- char *name;
- {
- return 0;
- }
-
- int attr_load_names(name)
- char *name;
- {
- return 0;
- }
-
- int attr_free_table()
- {
- return 0;
- }
-
- int region_initialize()
- {
- return 0;
- }
-
- int region_desrtroy()
- {
- return 0;
- }
-
- int region_create(name)
- char *name;
- {
- return 0;
- }
-
- int region_destroy()
- {
- return 0;
- }
-
- int region_identify(offset, len)
- int offset, len;
- {
- return 0;
- }
- #endif /*STRUCTURED_QUERIES*/
-