home *** CD-ROM | disk | FTP | other *** search
- /* WIDE AREA INFORMATION SERVER SOFTWARE
- No guarantees or restrictions. See the readme file for the full standard
- disclaimer.
- Brewster@think.com
- */
-
- /* Copyright (c) CNIDR (see ../COPYRIGHT) */
-
- /* implements the search part of irext.h
- (search_word and finished_search_word)
- -brewster
-
- Split from irsearch.c
-
- 5/31/91 Added scale_scores. Fixed document_score_array to long.
- 7/8/91 Removed scale_scores, handled in search_word with doc_id > 0.
- 2/4/92 Made document_score_array a double.
-
- - Jonny G
- * $Log: sersrch.c,v $
- * Revision 1.33 93/07/21 18:47:38 warnock
- * Added STELAR-specific patches
- *
- * Revision 1.10 1993/10/13 14:14:20 huynh1
- * new code added for encapsulated boolean queries and
- * modified literal search
- *
- * Revision 1.32 93/07/19 17:09:39 warnock
- * fixed problem with multiple documents in single file, from isaacs@hpcc05.corp.hp.com
- *
- * Revision 1.3 1993/07/13 08:19:56 pfeifer
- * Sicherung vor Aenderungen Tung
- *
- * Revision 1.31 93/07/02 18:31:06 warnock
- * included ctype.h
- *
- * Revision 1.3a 93/07/02 17:59:25 warnock
- * change to search_word to handle compressed files from francois
- *
- * Revision 1.1 1993/02/16 15:05:35 freewais
- * Initial revision
- *
- * Revision 1.24 92/04/28 16:56:54 morris
- * added boolean to serial engine
- *
- * Revision 1.23 92/03/15 10:15:18 jonathan
- * Added Simon Spero's ASSIGN replacement for read_bytes.
- *
- * Revision 1.22 92/03/05 07:09:54 shen
- * add two more dummy arguments to call to init_search_engine
- *
- * Revision 1.21 92/02/12 17:29:52 jonathan
- * Conditionalized inclusion of object code.
- *
- * Revision 1.20 92/02/12 13:40:06 jonathan
- * Added "$Log" so RCS will put the log message in the header
- *
- */
-
- #include <string.h>
- #include "cdialect.h"
- #include "irfiles.h"
- #ifdef BIO
- #include "irtfiles.h" /* dgg, for wordDelimiter */
- #endif /* BIO */
- #include "irsearch.h"
- #include "irext.h"
- #include "byte_order.h"
- #include <ctype.h>
-
- #include <math.h>
-
- /* tung, 10/93 */
- #ifdef BOOLEANS
- #include "boolean_op.h"
- #endif /* BOOLEANS */
- /* tung, 10/93 */
-
- /* #define MAXINT (unsigned long)2^(sizeof(long)*8-1) */
-
- #define VALUE 1000000L
-
- #include "stemmer.h"
-
- /* tung, 10/93 */
- #ifdef BOOLEANS
- #include "boolean_op.h"
- #endif /* BOOLEANS */
- /* tung, 10/93 */
-
- #ifdef BOOL
- #include "obj.h"
- #include "irparse.h"
- object* currentQuery = NULL; /* kludge until irext goes away */
- #endif /* def BOOL */
-
- #ifdef WIN32
- #include <io.h>
- #endif
-
- /* weighting for relevant document terms -
- this may become a parameter to the query.
- */
-
- #define RF_WEIGHTING 0.1
-
- /* ==================================
- * === Initialization Functions ===
- * ==================================*/
-
-
- long init_search_engine(file, initialize, for_search, cm_mem_percent,
- text_size, grow_percent)
- char* file;
- boolean initialize;
- boolean for_search;
- long cm_mem_percent; /* unused */
- long text_size; /* unused */
- long grow_percent; /* unused */
- {
- static boolean inited = false;
-
- if (inited == false)
- {
- #ifdef BOOL
- initObj();
- initBool();
- #endif /* BOOL */
- inited = true;
- }
-
- return(0);
- }
-
/* finished_search_engine: shut-down hook for the serial engine.
 * There is nothing to release here, so it just reports success (0).
 */
long finished_search_engine()
{
  long status = 0;

  return(status);
}
-
-
- /*
- * ext_open_database: see irext.h
- */
-
- long ext_open_database (db, initialize, for_search)
- database *db;
- boolean initialize;
- boolean for_search;
- { /* this has to deal with the .inv file */
- char file[MAX_FILE_NAME_LEN];
-
- if(initialize) /* make a new one */
- db->index_stream = s_fopen(index_filename(file, db), "w+b");
- else if(for_search) /* just search */
- db->index_stream = s_fopen(index_filename(file, db), "rb");
- else /* write to an existing db */
- db->index_stream = s_fopen(index_filename(file, db), "r+b");
-
- if (db->index_stream == NULL) {
- waislog(WLOG_HIGH, WLOG_ERROR,"2can't open the inverted index file %s\n",
- file);
- disposeDatabase(db);
- return(1);
- }
- return(0);
- }
-
-
-
- /*
- * ext_close_database: see irext.h
- */
-
- long ext_close_database (db)
- database *db;
- {
- return(0);
- }
-
/* database_file: maps a database name to the file name used for it.
 * In this engine the mapping is the identity: the very same pointer
 * that was passed in is handed back (no copy is made).
 */
char *database_file(database_name)
     char *database_name;
{
  char *file_name = database_name;

  return(file_name);
}
-
- /*===========================*
- *=== Setting Parameters ===*
- *===========================*/
-
- long max_hit_retrieved = 0;
- char **srcs = NULL;
-
- long set_query_parameter (mask, parameters)
- long mask;
- query_parameter_type * parameters;
- {
- switch (mask)
- {
- case SET_MAX_RETRIEVED_MASK:
- max_hit_retrieved = parameters->max_hit_retrieved;
- return(0);
- break;
- case SET_SELECT_SOURCE:
- if(NULL != srcs){
- if(NULL != srcs[0])
- s_free(srcs[0]);
- s_free(srcs);
- }
- srcs = parameters->srcs;
- break;
- default:
- return(-1);
- break;
- }
- return(0);
- }
-
- /*==============================*
- *=== Document Score Array ===*
- *==============================*/
-
- double *document_score_array = NULL;
- long document_score_array_len = 0;
- #ifdef BOOLEANS
- double *prev_score_array = NULL; /* 12/91 GS TLG */
-
- /* tung, 10/93 */
- search_result_struct *search_result_array = NULL;
- /* tung, 10/93 */
-
- #endif /* BOOLEANS */
-
- /* make_document_score_array insures that the document_score_array
- array is long enough, if not it makes it long enough */
- static void make_document_score_array _AP((long length ));
- static void make_document_score_array(length)
- long length;
- {
- if(length <= document_score_array_len)
- return;
- /* we have to make a new one. free the old one first (if any) */
- if(document_score_array != 0){
- s_free(document_score_array);
- #ifdef BOOLEANS
- s_free(prev_score_array); /* 12/91 GS TLG */
- #endif /* BOOLEANS */
- }
- document_score_array = (double*)s_malloc(
- (size_t)(length * sizeof(double)));
- #ifdef BOOLEANS
- prev_score_array = (double*)s_malloc((size_t)(length * sizeof(double))); /* 12/91 GS TLG */
- #endif /* BOOLEANS */
- document_score_array_len = length;
- }
-
- static void destroy_document_score_array _AP((void));
- static void destroy_document_score_array()
- {
- s_free(document_score_array);
- #ifdef BOOLEANS
- s_free(prev_score_array); /* 12/91 GS TLG */
- #endif /* BOOLEANS */
- document_score_array_len = 0;
- }
-
- void clear_document_score_array()
- /* side effects the document_score_array. */
- {
- memset(document_score_array, 0,
- document_score_array_len * sizeof(double));
- #ifdef BOOLEANS
- memset(prev_score_array, 0, /* 12/91 GS TLG */
- document_score_array_len * sizeof(double)); /* 12/91 GS TLG */
- #endif /* BOOLEANS */
- }
-
- /* for debugging purposes */
- void print_document_score_array(start,stop)
- unsigned long start;
- unsigned long stop;
- /* assumes start >= 0, stop < db->doc_table_allocated_entries */
- {
- long i;
- #ifdef WIN32
- for(i = start; i <= (long)stop; i++){
- #else
- for(i = start; i <= stop; i++){
- #endif
- printf("entry number %d: %f \n",
- i, document_score_array[i]);
- }
- }
-
-
-
- /*=========================*
- *=== Best Hits Array ===*
- *=========================*/
-
- hit *best_hits_array = NULL;
- long best_hits_array_len = 0;
- long current_best_hit = 0;
-
- /* see irext.h for doc */
- long init_best_hit (db)
- database *db;
- {
-
- #ifdef BOOL
- if (currentQuery != NULL)
- send(currentQuery,InitBestHit,db);
- #endif /* def BOOL */
-
- return(0);
- }
-
- /* make_best_hits_array insures that the best_hits_array
- array is long enough, if not it makes it long enough */
- static void make_best_hits_array _AP((long length));
- static void make_best_hits_array(length)
- long length;
- {
- if(length <= best_hits_array_len)
- return;
- /* we have to make a new one. free the old one first (if any) */
- if(best_hits_array != 0){
- s_free(best_hits_array);
- }
- best_hits_array = (hit*)s_malloc((size_t)(length * sizeof(hit)));
- best_hits_array_len = length;
- }
-
- static void destroy_best_hits_array _AP((void));
- static void destroy_best_hits_array()
- {
- s_free(best_hits_array);
- best_hits_array_len = 0;
- }
-
- void clear_best_hits_array()
- /* side effects the best_hits_array. XXX could use memset */
- {
- memset((char*)best_hits_array, 0, best_hits_array_len * sizeof(hit));
- }
-
- /* for debugging purposes */
- void print_best_hits()
- {
- long i;
- for( i = 0; i < best_hits_array_len; i++){
- if (best_hits_array[i].weight != 0)
- { printf("Best hit %ld: weight %lf, doc_id %ld, headline %s, filename %s, lines %ld\n",
- i, best_hits_array[i].weight,
- best_hits_array[i].document_id,
- best_hits_array[i].headline,
- best_hits_array[i].filename,
- best_hits_array[i].number_of_lines);
- }
- }
- }
-
- void sort_best_hits(db)
- database * db;
- {
- /* returns nothing.
- * side effects best_hits and document_score_array
- */
-
- long i, doc;
- double worst_weight_to_make_it = 0.0;
- document_table_entry doc_entry;
- long best_hit_number = 0;
-
- /* snuff the scores */
- for(i = 0; i < max_hit_retrieved; i++){
- best_hits_array[i].weight = 0.0;
-
- }
-
- /* loop over the doc, and keep the doc_id and weight in best hit table */
- for(doc = 1; doc < db->doc_table_allocated_entries; doc++){
- double weight = document_score_array[doc];
- /* jmf */
- read_document_table_entry(&doc_entry, doc, db); /* if this could be
- removed, we'd gain speed */
- /* if (doc_entry.document_length) */
- #ifdef WIN32
- /* Windows NT doesn't like dividing by 0. This will cause the first
- entry in the document table (which is a null entry) to be skipped.
- [I don't understand why UNIX or the Macintosh don't have a problem here.]
- */
- if (doc_entry.document_length == 0) continue;
- #endif /* WIN32 */
- weight/=doc_entry.document_length;
- if(worst_weight_to_make_it < weight){
- /* merge it into the best_hits array. start at the bottom */
- for(i = (max_hit_retrieved - 1); i >= 0; i--){
- if(weight > best_hits_array[i].weight
- /* && (check_document_id(doc, db) == true) too slow.*/
- ){
- /* move this entry down */
- if((i + 1) < max_hit_retrieved){
- best_hits_array[i+1].weight = best_hits_array[i].weight;
- best_hits_array[i+1].document_id = best_hits_array[i].document_id;
- }
- best_hits_array[i].document_id = doc;
- best_hits_array[i].weight = weight;
- }
- else
- break;
- }
- }
- }
-
- for(i = 0; i < max_hit_retrieved; i++){
- if(best_hits_array[i].weight <= 0.0)
- return;
- if (read_document_table_entry(&doc_entry,
- best_hits_array[i].document_id,
- db)
- == true){
- best_hits_array[best_hit_number].weight = best_hits_array[i].weight;
-
- best_hits_array[best_hit_number].document_id = best_hits_array[i].document_id;
- best_hits_array[best_hit_number].start_character = doc_entry.start_character;
- best_hits_array[best_hit_number].end_character = doc_entry.end_character;
- best_hits_array[best_hit_number].document_length = doc_entry.document_length;
- best_hits_array[best_hit_number].number_of_lines = doc_entry.number_of_lines;
- sprintf(best_hits_array[best_hit_number].date, "%d", doc_entry.date);
- read_filename_table_entry(doc_entry.filename_id,
- best_hits_array[best_hit_number].filename,
- best_hits_array[best_hit_number].type,
- NULL,
- db),
- strncpy(best_hits_array[best_hit_number].headline,
- read_headline_table_entry(doc_entry.headline_id,db),
- MAX_FILE_NAME_LEN);
- best_hit_number++;
- }
- beFriendly();
- }
- for(i = best_hit_number; i < max_hit_retrieved; i++){
- best_hits_array[best_hit_number].weight = 0.0;
- }
- /* print_best_hits(s); for debugging */
- }
-
-
- /* returns the next best hit */
- long best_hit(db, doc_id, best_character, best_line, score,start,end,date,
- length,nlines,headline,filename,type)
- database *db;
- long *doc_id;
- long *best_character;
- long *best_line;
- double *score;
- long *start,*end,*date,*length,*nlines;
- char *headline,*filename,*type;
- {
- double tmp;
-
- *best_character = 0;
- *best_line = 0;
-
- #ifdef BOOL
- if (currentQuery != NULL) /* for boolean */
- {
- send(currentQuery,GetBestHit,db,doc_id,best_character,best_line,score);
- if (*doc_id > 0)
- return(0); /* ok */
- else
- return(-1); /* no more docs */
- }
- #endif /* BOOL */
-
- if(current_best_hit > best_hits_array_len)
- return(1);
- if(best_hits_array[current_best_hit].weight == 0.0)
- return(1);
- *doc_id = best_hits_array[current_best_hit].document_id;
- tmp = ((double)(best_hits_array[current_best_hit].weight*VALUE));
- *score=tmp;
- *start=best_hits_array[current_best_hit].start_character;
- *end=best_hits_array[current_best_hit].end_character;
- *date=atoi(best_hits_array[current_best_hit].date);
- *length=best_hits_array[current_best_hit].document_length;
- *nlines=best_hits_array[current_best_hit].number_of_lines;
- strcpy(headline,best_hits_array[current_best_hit].headline);
- strcpy(filename,best_hits_array[current_best_hit].filename);
- strcpy(type,best_hits_array[current_best_hit].type);
- current_best_hit++;
- return(0);
- }
-
- long finished_best_hit(db)
- database *db;
- {
-
- #ifdef BOOL
- if (currentQuery != NULL) /* for boolean */
- { send(currentQuery,Delete);
- currentQuery = NULL;
- return(0);
- }
- #endif /* BOOL */
-
- /* if we are on a small machine, we might want to
- destroy_document_score_array */
- clear_document_score_array();
- clear_best_hits_array();
- current_best_hit = 0;
- return(0);
- }
-
- /*=============================*
- *=== Searching for words ===*
- *=============================*/
-
- /* see irext.h for doc */
- long init_search_word (db)
- database* db;
- {
- char fn[256];
- strcpy( fn,db->database_file );
- strcat( fn,synonym_ext );
- syn_ReadFile( fn,&db->syn_Table,&db->syn_Table_Size );
-
- return(0);
- }
-
-
- #ifdef BOOLEANS
- /* tung, 10/93 */
- long word_id = 0;
- extern long number_of_qwords ;
- /* tung, 10/93 */
-
- static boolean gLastAnd= false;
- static boolean gLastNot= false;
- #endif /* BOOLEANS */
-
- /* see irext.h for doc */
- /* returns -1 if error, 1 if word exists, 0 if not */
-
- long search_word(word,char_pos, line_pos, weight, doc_id,
- word_pair, db)
- char *word; /* the word to be searched for */
- long char_pos; /* the position of the start of the word */
- long line_pos; /* is this needed? not for signature system */
- long weight; /* how important the word looks syntactically,
- such as is it bold */
- long doc_id; /* current document, seed words is 0,
- then it increments into the relevant
- document */
- long word_pair;
- database *db;
- {
- /* this side effects the document_score_array,
- * and downcases the word.
- * Returns 0 if successful or word not present,
- * returns non-0 if an error.
- *
- */
-
- long not_full_flag = INDEX_BLOCK_FULL_FLAG; /*start out full so it will go on looking */
- long count, index_block_size;
- long internal_document_id, number_of_valid_entries;
- double internal_weight;
- long index_file_block_number;
- long number_of_occurances;
-
- FOUR_BYTE index_buffer_data[INDEX_ELEMENT_SIZE*(1024/4)];
- char *index_buffer;
- char *i;
- FILE *stream = NULL;
-
-
- #ifdef LITERAL
- #ifdef WIN32
- long txt_pos; /* 2/92 GS TLG */
- document_table_entry doc_entry; /* 2/92 GS TLG */
- static FILE *txt_stream = NULL; /* 2/92 GS TLG */
- #else
- long txt_pos, icnt, wcnt, pcnt; /* 2/92 GS TLG */
- document_table_entry doc_entry; /* 2/92 GS TLG */
- static FILE *txt_stream = NULL; /* 2/92 GS TLG */
- char cmpr_word[MAX_PHRASE_LENGTH + 1]; /* 2/92 GS TLG */
- char phrase[MAX_PHRASE_LENGTH + 1]; /* 2/92 GS TLG */
- #endif
- char txt_filename[MAX_FILENAME_LEN + 1]; /* 2/92 GS TLG */
- char *temp_txt_filename = NULL;
- char prev_txt_filename[MAX_FILENAME_LEN + 1]; /* 2/92 GS TLG */
- char txt_type[MAX_TYPE_LEN + 1]; /* 2/92 GS TLG */
- long phraselen= 0, txt_pos_fix= 0;
- char *document_section = NULL; /* tung , 10/93 */
- long document_section_len = 0; /* tung , 10/93 */
- long phrase_readed = 0; /* tung , 10/93 */
- long phrase_count = 0; /* tung , 10/93 */
- boolean phrase_found = false; /* tung , 10/93 */
- #endif /* LITERAL */
-
- /* do synonym conversion */
-
- /* in theory, one can replace a word with a boolean phrase */
- char *newword;
- double idf;
-
- newword = lookup_Synonym( word,db->syn_Table,db->syn_Table_Size );
- waislog(WLOG_HIGH,WLOG_INFO,"Word %s Syn %s",word,newword);
- strncpy(word,newword,MAX_WORD_LENGTH);
-
- /* call the stemmer */
- #ifdef LITERAL
- if (weight!=LITERAL_FLAG) {
- stemmer(word);
- }
- #else
- stemmer(word);
- #endif /* LITERAL */
-
- /* tung, 10/93 */
- #ifdef BOOLEANS
- if(number_of_qwords > 0) {
- if((weight!=LITERAL_FLAG) && IsOperator(word)) {
- boolean_operations(word);
- return(0);
- }
- if(search_result_array == NULL) {
- search_result_array =
- (search_result_struct *)
- s_malloc((size_t)(number_of_qwords * sizeof(search_result_struct)));
- word_id = 0;
- }
- if(strlen(word) == 1) {
- search_result_array[word_id].number_of_hits = 0;
- search_result_array[word_id].word_id = word_id;
- save_word_id(word_id);
- ++word_id;
- return(0);
- }
- }
- #endif /* BOOLEANS */
- /* tung, 10/93 */
-
-
- #ifdef LITERAL
- if (weight==LITERAL_FLAG) {
- /* goto after_booleans */
- /* printf("search_word: literal word is [%s]\n", word); */
- }
- else
- #endif /* LITERAL */
- #ifdef BOOLEANS
- if (strcmp(word,BOOLEAN_AND)==0) { /* should be all lowercase cmp here */
- gLastAnd= true;
- return(0);
- }
- else if (strcmp(word,BOOLEAN_NOT)==0) {
- /* ^^ this is bad if we intersperse "not"s in a query --
- docs found after not word may include notted word --
- need to go back to doing not words after others --
- but need now to check for literal string first
- */
- gLastNot= true;
- return(0);
- }
- if (weight == BOOLEAN_NOT_FLAG) gLastNot= true;
- #else
- ; /* if not LITERAL_FLAG */
- #endif /* BOOLEANS */
-
- index_buffer = (char*)index_buffer_data;
-
- #ifdef LITERAL
- if (weight==LITERAL_FLAG) {
- /* note: we found the first word of phrase once in map_over_words, but i'm too lazy
- to put another parameter in that cascade of function calls it takes
- to get here.
- */
- char word1[MAX_WORD_LENGTH + 1];
- register int i, len;
- register boolean more;
- phraselen= MIN( MAX_PHRASE_LENGTH, strlen(word));
- len = MIN( MAX_WORD_LENGTH, phraselen);
- for (i=0, more=true; i < len && more; ) {
- word1[i] = word[i++];
- #ifdef BIO
- more= (wordDelimiter(word[i]) == NOT_DELIMITER);
- #else
- more= (isalnum(word[i]));
- #endif /* BIO */
- }
- word1[i]= '\0';
- txt_pos_fix= strlen(word1) + 1;
- /* printf("search_word: literal word1 is [%s]\n", word1); */
- index_file_block_number =
- look_up_word_in_dictionary(word1, &number_of_occurances, db);
- }
- else
- #endif /* LITERAL */
-
- #ifdef PARTIALWORD
- index_file_block_number =
- look_up_partialword_in_dictionary(word, &number_of_occurances, db);
- #else
- index_file_block_number =
- look_up_word_in_dictionary(word, &number_of_occurances, db);
- #endif /* PARTIALWORD */
-
- current_best_hit = 0; /* so that the best hits willstart from 0 */
-
- /* check the document_score_array */
- if(document_score_array_len < db->doc_table_allocated_entries)
- make_document_score_array(db->doc_table_allocated_entries);
-
- if(index_file_block_number >= 0){
- #ifdef PARTIALWORD
- while(index_file_block_number > 0){ /* dgg, need 2nd loop here for multiple partwords */
- #endif /* PARTIALWORD */
- stream = db->index_stream;
-
- while((not_full_flag != INDEX_BLOCK_NOT_FULL_FLAG) &&
- (index_file_block_number != 0)){
- /* read the index block */
- if (0 != fseek(stream, (long)index_file_block_number,
- SEEK_SET))
- {
- waislog(WLOG_HIGH, WLOG_ERROR,
- "fseek failed into the inverted file to position %ld",
- (long)index_file_block_number);
- #ifdef BOOLEANS
- gLastNot= gLastAnd= false;
- #endif /* BOOLEANS */
- return(-1);
- }
- #ifdef WIN32
- _read(fileno(stream),index_buffer,INDEX_BLOCK_HEADER_SIZE);
- #else
-
- fread(index_buffer, INDEX_BLOCK_HEADER_SIZE, 1, stream);
- #endif
-
- ASSIGN(not_full_flag,
- INDEX_BLOCK_FLAG_SIZE,
- index_buffer,
- INDEX_BLOCK_HEADER_SIZE,
- 0 );
- ASSIGN(index_file_block_number,NEXT_INDEX_BLOCK_SIZE,
- index_buffer+INDEX_BLOCK_FLAG_SIZE,
- INDEX_BLOCK_HEADER_SIZE,
- INDEX_BLOCK_FLAG_SIZE);
- ASSIGN(index_block_size,INDEX_BLOCK_SIZE_SIZE,
- index_buffer+INDEX_BLOCK_FLAG_SIZE+NEXT_INDEX_BLOCK_SIZE,
- INDEX_BLOCK_HEADER_SIZE,
- INDEX_BLOCK_FLAG_SIZE+NEXT_INDEX_BLOCK_SIZE);
-
- /*
- this is equivalent, but slower:
-
- not_full_flag = read_bytes(INDEX_BLOCK_FLAG_SIZE, stream);
- index_file_block_number = read_bytes(NEXT_INDEX_BLOCK_SIZE, stream);
- index_block_size = read_bytes(INDEX_BLOCK_SIZE_SIZE, stream);
- */
-
- /* Jim's debug code commented out
- printf("flag = %d, block_num = %d, block_size = %d\n",
- not_full_flag,
- index_file_block_number,
- index_block_size);
- */
- fflush(stdout);
-
- if(EOF == index_block_size)
- {
- waislog(WLOG_HIGH, WLOG_ERROR,
- "reading from the index file failed");
- #ifdef BOOLEANS
- gLastNot= gLastAnd= false;
- #endif /* BOOLEANS */
- return(-1);
- }
-
- if(not_full_flag == INDEX_BLOCK_NOT_FULL_FLAG){
- /* not full */
- number_of_valid_entries = index_file_block_number;
- }
- else if(not_full_flag == INDEX_BLOCK_FULL_FLAG){
- /* full */
- number_of_valid_entries = index_block_size - INDEX_BLOCK_HEADER_SIZE;
- }
- else{ /* bad news, file is corrupted. */
- waislog(WLOG_HIGH, WLOG_ERROR,
- "Expected the flag in the inverted file to be valid. it is %ld",
- not_full_flag);
- #ifdef BOOLEANS
- gLastNot= gLastAnd= false;
- #endif /* BOOLEANS */
- return(-1);
- }
- /* printf("number of valid bytes: %ld\n", number_of_valid_entries); */
-
- /* add the array to the document_score_array */
- number_of_valid_entries /= INDEX_ELEMENT_SIZE;
-
-
- /* tung, 10/93 */
- #ifdef BOOLEANS
- if((number_of_qwords > 0) && (search_result_array != NULL)) {
- if(search_result_array[word_id].doc_ids_array == NULL)
- search_result_array[word_id].doc_ids_array =
- (doc_descr_struct *)
- s_malloc((size_t)(sizeof(doc_descr_struct) * number_of_valid_entries));
- search_result_array[word_id].number_of_hits = number_of_valid_entries;
- }
- #endif /* BOOLEANS */
- /* tung, 10/93 */
-
-
- /* ses - idf is a fist approximation to the inverse document freq. */
- /* what it actually is is the inverse occurance frequency which says
- that the significance of a word is inversly proportional to the number
- of times it occurs in the database */
-
- idf=1.0/number_of_occurances;
- for(count=0;count < number_of_valid_entries;count++) {
- int wgt;
- int did;
- /*
- if(count%1024 == 0) {
- read(fileno(stream),index_buffer,INDEX_ELEMENT_SIZE*
- MIN(1024,number_of_valid_entries-count));
- i=index_buffer;
- }
- */
- did = read_bytes(DOCUMENT_ID_SIZE, stream);
- (void)read_bytes(WORD_POSITION_SIZE, stream);
- txt_pos=read_bytes(CHARACTER_POSITION_SIZE, stream);
- wgt = read_bytes(WEIGHT_SIZE,stream);
- /*
-
- ASSIGN(wgt,WEIGHT_SIZE,
- i+DOCUMENT_ID_SIZE+WORD_POSITION_SIZE+CHARACTER_POSITION_SIZE,
- INDEX_ELEMENT_SIZE,
- DOCUMENT_ID_SIZE+WORD_POSITION_SIZE+CHARACTER_POSITION_SIZE);
- ASSIGN(did,DOCUMENT_ID_SIZE,i,INDEX_ELEMENT_SIZE,0);
- */
- #ifdef LITERAL
- /* dgg -- is this proper update of read form to ASSIGN form ??*/
- /* txt_pos = read_bytes(CHARACTER_POSITION_SIZE, stream);*/ /* 2/92 GS TLG */
- if ((weight == LITERAL_FLAG) && (0 == doc_id)) {
- /*
- ASSIGN(txt_pos,CHARACTER_POSITION_SIZE,i+DOCUMENT_ID_SIZE+WORD_POSITION_SIZE,
- INDEX_ELEMENT_SIZE,DOCUMENT_ID_SIZE+WORD_POSITION_SIZE);
- */
- /* printf("search_word: txtpos=%d, wgt=%d, did=%d\n", txt_pos, wgt, did); */
- }
- #endif /* LITERAL */
-
- /* Commented out as suggested by Stan Isaacs at hp.com to come up with correct
- * weights when there are multiple documents in a file
- *
- * if(wgt>5L)
- * wgt-=5L;
- */
- internal_weight = log((double)wgt);
- internal_weight+=10.0;
- internal_document_id = did;
-
- /*
- printf("entry %ld, Doc_id: %ld, weight %lf \n",
- count, internal_document_id, internal_weight);
- fflush(stdout);
- */
- if(EOF == wgt)
- {
- waislog(WLOG_HIGH, WLOG_ERROR,
- "reading from the doc-id table failed");
- #ifdef BOOLEANS
- gLastNot= gLastAnd= false;
- #endif /* BOOLEANS */
- return(-1);
- }
-
- #ifdef LITERAL
- if ((weight == LITERAL_FLAG) && (0 == doc_id)) { /* 2/92 GS TLG */
- if (true == read_document_table_entry(&doc_entry, /* 2/92 GS TLG */
- internal_document_id, db)) /* 2/92 GS TLG */
- { /* 2/92 GS TLG */
- read_filename_table_entry(doc_entry.filename_id, /* 2/92 GS TLG */
- txt_filename, txt_type, NULL, db); /* 2/92 GS TLG */
- /* printf("search_word: document is [%s]\n", txt_filename); */
- if (NULL == txt_stream) {
- if (probe_file(txt_filename)) {
- txt_stream = s_fopen(txt_filename, "rb");
- }
- else if (probe_file_possibly_compressed(txt_filename)) {
- temp_txt_filename = s_fzcat(txt_filename);
- if (temp_txt_filename) {
- txt_stream = s_fopen(temp_txt_filename, "rb");
- }
- }
-
- strcpy(prev_txt_filename, txt_filename);
- }
- else if (0 != strcmp(txt_filename, prev_txt_filename)) {
- s_fclose(txt_stream);
- if ( temp_txt_filename != NULL ) {
- unlink(temp_txt_filename);
- s_free(temp_txt_filename);
- }
- if (probe_file(txt_filename)) {
- txt_stream = s_fopen(txt_filename, "rb");
- }
- else if (probe_file_possibly_compressed(txt_filename)) {
- temp_txt_filename = s_fzcat(txt_filename);
- if (temp_txt_filename) {
- txt_stream = s_fopen(temp_txt_filename, "rb");
- }
- }
- strcpy(prev_txt_filename, txt_filename); /* 2/92 GS TLG */
- }
-
- txt_pos += doc_entry.start_character - txt_pos_fix; /* dgg */
-
- document_section_len = doc_entry.end_character - txt_pos; /* tung, 10/93 */
- s_fseek(txt_stream, txt_pos, SEEK_SET); /* 2/92 GS TLG */
- document_section =
- (char*) s_malloc((size_t)((document_section_len+1)*sizeof(char))); /* tung, 10/93 */
- fgets(document_section, document_section_len, txt_stream); /* tung, 10/93 */
- phrase_readed = 0; /* tung, 10/93 */
- phrase_readed += strlen(document_section); /* tung, 10/93 */
- document_section = string_downcase(document_section); /* tung, 10/93 */
- #if 0
-
- fread(phrase, 1L, phraselen, txt_stream); /* 2/92 GS TLG */
- /* { phrase[phraselen]= '\0';
- printf("search_word: file phrase is [%s]\n", phrase);
- } */
- #ifdef WIN32
- if (0 != _strnicmp(word, phrase, phraselen)) /* 2/92 GS TLG */
- #else
- if (0 != strncasecmp(word, phrase, phraselen)) /* 2/92 GS TLG */
- #endif
- internal_weight = 0.0; /* 2/92 GS TLG */
- #endif /* 0 */
-
- if (NULL == strstr(document_section, word)) { /* tung, 10/93 */
- while(phrase_readed < document_section_len) { /* tung, 10/93 */
- fgets(document_section, document_section_len, txt_stream); /* tung, 10/93 */
- phrase_readed += strlen(document_section); /* tung, 10/93 */
- document_section = string_downcase(document_section); /* tung, 10/93 */
- if(strstr(document_section, word) != NULL) { /* tung, 10/93 */
- phrase_found = true; /* tung, 10/93 */
- break; /* tung, 10/93 */
- } /* tung, 10/93 */
- } /* tung, 10/93 */
- if(phrase_found == false) /* tung, 10/93 */
- internal_weight = 0.0; /* tung, 10/93 */
- phrase_found = false; /* tung, 10/93 */
- }
- s_free(document_section); /* tung, 10/93 */
- }
- }
- #endif /* LITERAL */
-
- #ifdef BOOLEANS
- if (gLastNot) {
- document_score_array[internal_document_id] = 0;
- /* printf("search_word: boolean 'not' scored\n"); */
- }
- else
- #endif /* BOOLEANS */
- {
- /* if(doc_id > 0) we are doing a relevant document */
- /*
- printf("wgt: %ld, internal weight: %lf, idf: %lf occurances: %ld\n",
- wgt,internal_weight, idf,number_of_occurances);
- fflush(stdout);
- */
- internal_weight*=idf; /* ses - for inverse doc. freq. */
- #ifndef BOOLEANS
- document_score_array[internal_document_id] +=
- (doc_id) ? (internal_weight * RF_WEIGHTING) : internal_weight;
- #else
- /* tung, 10/93 */
- if(number_of_qwords == 0) {
- document_score_array[internal_document_id] +=
- (doc_id) ? (internal_weight * RF_WEIGHTING) : internal_weight;
- }
- else {
- if((number_of_qwords > 0) && (search_result_array != NULL)) {
- if(weight == LITERAL_FLAG) {
- if(document_score_array[internal_document_id] > 0) {
- ((search_result_array[word_id]).doc_ids_array[phrase_count]).doc_id = internal_document_id;
- ((search_result_array[word_id]).doc_ids_array[phrase_count]).score +=
- (doc_id) ? (internal_weight * RF_WEIGHTING) : internal_weight;
- phrase_count++;
- search_result_array[word_id].number_of_hits = phrase_count;
- }
- }
- else {
- ((search_result_array[word_id]).doc_ids_array[count]).doc_id = internal_document_id;
- ((search_result_array[word_id]).doc_ids_array[count]).score +=
- (doc_id) ? (internal_weight * RF_WEIGHTING) : internal_weight;
- }
- }
- }
- #endif /* BOOLEANS */
- /* tung, 10/93 */
-
- }
- /*
- printf("Score array: %lf\n",document_score_array[internal_document_id]);
- fflush(stdout);
- */
-
- i+=INDEX_ELEMENT_SIZE;
- }
- }
-
- #ifdef PARTIALWORD
- index_file_block_number =
- look_up_partialword_in_dictionary(NULL, &number_of_occurances, db);
- }
- #endif /* PARTIALWORD */
-
- /* tung, 10/93 */
- #ifdef BOOLEANS
- if((number_of_qwords > 0) && (search_result_array != NULL)) {
- save_word_id(word_id);
- search_result_array[word_id].word_id = word_id;
- ++word_id;
- }
- #endif /* BOOLEANS */
- /* tung, 10/93 */
-
- #ifdef BOOLEANS
- if(number_of_qwords == 0) {
- for (count=0; count < db->doc_table_allocated_entries; count++) {
- if (!gLastAnd) {
- prev_score_array[count] = document_score_array[count];
- }
- else {
- if ((document_score_array[count] == prev_score_array[count])
- || (prev_score_array[count] == 0)) {
- document_score_array[count] = 0;
- prev_score_array[count] = 0;
- }
- else {
- prev_score_array[count] = document_score_array[count];
- }
- }
- }
- /* if (gLastAnd) printf("search_word: boolean `and' scored\n"); */
- }
- #endif /* BOOLEANS */
-
- #ifdef BOOLEANS
- gLastNot= gLastAnd= false;
- #endif /* BOOLEANS */
- return(1); /* word present */
- }
-
- else if(0 == index_file_block_number){
- /* an error occurred on looking up the word */
- #ifdef BOOLEANS
- gLastNot= gLastAnd= false;
- #endif /* BOOLEANS */
- return(-1);
- }
-
- else { /* index_file_block_number is negative */
- #ifdef BOOLEANS
- /* tung, 10/93 */
- if((number_of_qwords > 0) && (search_result_array != NULL)) {
- save_word_id(word_id);
- search_result_array[word_id].word_id = word_id;
- search_result_array[word_id].number_of_hits = 0;
- ++word_id;
- }
- /* tung, 10/93 */
- else {
- if (gLastAnd)
- for (count=0; count < db->doc_table_allocated_entries; count++) {
- document_score_array[count] = 0;
- prev_score_array[count] = 0;
- }
- }
- gLastNot= gLastAnd= false;
- #endif /* BOOLEANS */
- return(0); /* word not present */
- }
- }
-
-
- /* now collect the best hits */
- long finished_search_word(db)
- database *db;
- {
- #ifdef BOOLEANS
- long number_of_hits; /* tung, 10/93 */
- #endif /* BOOLEANS */
-
- #ifdef BOOL
- if (currentQuery != NULL)
- return; /* do nothing for boolean */
- #endif /* def BOOL */
-
- /* tung, 10/93 */
- #ifdef BOOLEANS
- if((number_of_qwords > 0) && (search_result_array != NULL))
- number_of_hits = retriev_result(db->doc_table_allocated_entries);
- #endif /* BOOLEANS */
- /* tung, 10/93 */
-
- /* check the document_score_array */
- if(document_score_array_len < db->doc_table_allocated_entries)
- make_document_score_array(db->doc_table_allocated_entries);
-
- make_best_hits_array(max_hit_retrieved);
- sort_best_hits(db);
- syn_Free( db->syn_Table,&db->syn_Table_Size );
-
- return(0);
- }
-
-
-