Internet Publisher's Toolbox 2.0

home *** CD-ROM | disk | FTP | other *** search

/ Internet Publisher's Toolbox 2.0 / Internet Publisher's Toolbox.iso / internet / ntserver / wtsource / sersrch.c < prev next >

Wrap

C/C++ Source or Header | 1994-11-23 | 35.1 KB | 1,128 lines

/* WIDE AREA INFORMATION SERVER SOFTWARE No guarantees or restrictions. See the readme file for the full standard disclaimer. Brewster@think.com */ /* Copyright (c) CNIDR (see ../COPYRIGHT) */ /* implements the search part of irext.h (search_word and finished_search_word) -brewster Split from irsearch.c 5/31/91 Added scale_scores. Fixed document_score_array to long. 7/8/91 Removed scale_scores, handled in search_word with doc_id > 0. 2/4/92 Made document_score_array a double. - Jonny G * $Log: sersrch.c,v $ * Revision 1.33 93/07/21 18:47:38 warnock * Added STELAR-specific patches * * Revision 1.10 1993/10/13 14:14:20 huynh1 * new code added for encapsulated boolean queries and * modified literal search * * Revision 1.32 93/07/19 17:09:39 warnock * fixed problem with multiple documents in single file, from isaacs@hpcc05.corp.hp.com * * Revision 1.3 1993/07/13 08:19:56 pfeifer * Sicherung vor Aenderungen Tung * * Revision 1.31 93/07/02 18:31:06 warnock * included ctype.h * * Revision 1.3a 93/07/02 17:59:25 warnock * change to search_word to handle compressed files from francois * * Revision 1.1 1993/02/16 15:05:35 freewais * Initial revision * * Revision 1.24 92/04/28 16:56:54 morris * added boolean to serial engine * * Revision 1.23 92/03/15 10:15:18 jonathan * Added Simon Spero's ASSIGN replacement for read_bytes. * * Revision 1.22 92/03/05 07:09:54 shen * add two more dummy arguments to call to init_search_engine * * Revision 1.21 92/02/12 17:29:52 jonathan * Conditionalized inclusion of object code. * * Revision 1.20 92/02/12 13:40:06 jonathan * Added "$Log" so RCS will put the log message in the header * */ #include <string.h> #include "cdialect.h" #include "irfiles.h" #ifdef BIO #include "irtfiles.h" /* dgg, for wordDelimiter */ #endif /* BIO */ #include "irsearch.h" #include "irext.h" #include "byte_order.h" #include <ctype.h> #include <math.h> /* tung, 10/93 */ #ifdef BOOLEANS #include "boolean_op.h" #endif /* BOOLEANS */ /* tung, 10/93 */ /* #define MAXINT (unsigned long)2^(sizeof(long)*8-1) */ #define VALUE 1000000L #include "stemmer.h" /* tung, 10/93 */ #ifdef BOOLEANS #include "boolean_op.h" #endif /* BOOLEANS */ /* tung, 10/93 */ #ifdef BOOL #include "obj.h" #include "irparse.h" object* currentQuery = NULL; /* kludge until irext goes away */ #endif /* def BOOL */ #ifdef WIN32 #include <io.h> #endif /* weighting for relevant document terms - this may become a parameter to the query. */ #define RF_WEIGHTING 0.1 /* ================================== * === Initialization Functions === * ==================================*/ long init_search_engine(file, initialize, for_search, cm_mem_percent, text_size, grow_percent) char* file; boolean initialize; boolean for_search; long cm_mem_percent; /* unused */ long text_size; /* unused */ long grow_percent; /* unused */ { static boolean inited = false; if (inited == false) { #ifdef BOOL initObj(); initBool(); #endif /* BOOL */ inited = true; } return(0); } long finished_search_engine() { return(0); } /* * ext_open_database: see irext.h */ long ext_open_database (db, initialize, for_search) database *db; boolean initialize; boolean for_search; { /* this has to deal with the .inv file */ char file[MAX_FILE_NAME_LEN]; if(initialize) /* make a new one */ db->index_stream = s_fopen(index_filename(file, db), "w+b"); else if(for_search) /* just search */ db->index_stream = s_fopen(index_filename(file, db), "rb"); else /* write to an existing db */ db->index_stream = s_fopen(index_filename(file, db), "r+b"); if (db->index_stream == NULL) { waislog(WLOG_HIGH, WLOG_ERROR,"2can't open the inverted index file %s\n", file); disposeDatabase(db); return(1); } return(0); } /* * ext_close_database: see irext.h */ long ext_close_database (db) database *db; { return(0); } char *database_file(database_name) char *database_name; { return(database_name); } /*===========================* *=== Setting Paramters ===* *===========================*/ long max_hit_retrieved = 0; char **srcs = NULL; long set_query_parameter (mask, parameters) long mask; query_parameter_type * parameters; { switch (mask) { case SET_MAX_RETRIEVED_MASK: max_hit_retrieved = parameters->max_hit_retrieved; return(0); break; case SET_SELECT_SOURCE: if(NULL != srcs){ if(NULL != srcs[0]) s_free(srcs[0]); s_free(srcs); } srcs = parameters->srcs; break; default: return(-1); break; } return(0); } /*==============================* *=== Document Score Array ===* *==============================*/ double *document_score_array = NULL; long document_score_array_len = 0; #ifdef BOOLEANS double *prev_score_array = NULL; /* 12/91 GS TLG */ /* tung, 10/93 */ search_result_struct *search_result_array = NULL; /* tung, 10/93 */ #endif /* BOOLEANS */ /* make_document_score_array insures that the document_score_array array is long enough, if not it makes it long enough */ static void make_document_score_array _AP((long length )); static void make_document_score_array(length) long length; { if(length <= document_score_array_len) return; /* we have to make a new one. free the old one first (if any) */ if(document_score_array != 0){ s_free(document_score_array); #ifdef BOOLEANS s_free(prev_score_array); /* 12/91 GS TLG */ #endif /* BOOLEANS */ } document_score_array = (double*)s_malloc( (size_t)(length * sizeof(double))); #ifdef BOOLEANS prev_score_array = (double*)s_malloc((size_t)(length * sizeof(double))); /* 12/91 GS TLG */ #endif /* BOOLEANS */ document_score_array_len = length; } static void destroy_document_score_array _AP((void)); static void destroy_document_score_array() { s_free(document_score_array); #ifdef BOOLEANS s_free(prev_score_array); /* 12/91 GS TLG */ #endif /* BOOLEANS */ document_score_array_len = 0; } void clear_document_score_array() /* side effects the document_score_array. */ { memset(document_score_array, 0, document_score_array_len * sizeof(double)); #ifdef BOOLEANS memset(prev_score_array, 0, /* 12/91 GS TLG */ document_score_array_len * sizeof(double)); /* 12/91 GS TLG */ #endif /* BOOLEANS */ } /* for debugging purposes */ void print_document_score_array(start,stop) unsigned long start; unsigned long stop; /* assumes start >= 0, stop < db->doc_table_allocated_entries */ { long i; #ifdef WIN32 for(i = start; i <= (long)stop; i++){ #else for(i = start; i <= stop; i++){ #endif printf("entry number %d: %f \n", i, document_score_array[i]); } } /*=========================* *=== Best Hits Array ===* *=========================*/ hit *best_hits_array = NULL; long best_hits_array_len = 0; long current_best_hit = 0; /* see irext.h for doc */ long init_best_hit (db) database *db; { #ifdef BOOL if (currentQuery != NULL) send(currentQuery,InitBestHit,db); #endif /* def BOOL */ return(0); } /* make_best_hits_array insures that the best_hits_array array is long enough, if not it makes it long enough */ static void make_best_hits_array _AP((long length)); static void make_best_hits_array(length) long length; { if(length <= best_hits_array_len) return; /* we have to make a new one. free the old one first (if any) */ if(best_hits_array != 0){ s_free(best_hits_array); } best_hits_array = (hit*)s_malloc((size_t)(length * sizeof(hit))); best_hits_array_len = length; } static void destroy_best_hits_array _AP((void)); static void destroy_best_hits_array() { s_free(best_hits_array); best_hits_array_len = 0; } void clear_best_hits_array() /* side effects the best_hits_array. XXX could use memset */ { memset((char*)best_hits_array, 0, best_hits_array_len * sizeof(hit)); } /* for debugging purposes */ void print_best_hits() { long i; for( i = 0; i < best_hits_array_len; i++){ if (best_hits_array[i].weight != 0) { printf("Best hit %ld: weight %lf, doc_id %ld, headline %s, filename %s, lines %ld\n", i, best_hits_array[i].weight, best_hits_array[i].document_id, best_hits_array[i].headline, best_hits_array[i].filename, best_hits_array[i].number_of_lines); } } } void sort_best_hits(db) database * db; { /* returns nothing. * side effects best_hits and document_score_array */ long i, doc; double worst_weight_to_make_it = 0.0; document_table_entry doc_entry; long best_hit_number = 0; /* snuff the scores */ for(i = 0; i < max_hit_retrieved; i++){ best_hits_array[i].weight = 0.0; } /* loop over the doc, and keep the doc_id and weight in best hit table */ for(doc = 1; doc < db->doc_table_allocated_entries; doc++){ double weight = document_score_array[doc]; /* jmf */ read_document_table_entry(&doc_entry, doc, db); /* if this could be removed, we'd gain speed */ /* if (doc_entry.document_length) */ #ifdef WIN32 /* Windows NT doesn't like dividing by 0. This will cause the first entry in the document table (which is a null entry) to be skipped. [I don't understand why UNIX or the Macintosh don't have a problem here.] */ if (doc_entry.document_length == 0) continue; #endif /* WIN32 */ weight/=doc_entry.document_length; if(worst_weight_to_make_it < weight){ /* merge it into the best_hits array. start at the bottom */ for(i = (max_hit_retrieved - 1); i >= 0; i--){ if(weight > best_hits_array[i].weight /* && (check_document_id(doc, db) == true) too slow.*/ ){ /* move this entry down */ if((i + 1) < max_hit_retrieved){ best_hits_array[i+1].weight = best_hits_array[i].weight; best_hits_array[i+1].document_id = best_hits_array[i].document_id; } best_hits_array[i].document_id = doc; best_hits_array[i].weight = weight; } else break; } } } for(i = 0; i < max_hit_retrieved; i++){ if(best_hits_array[i].weight <= 0.0) return; if (read_document_table_entry(&doc_entry, best_hits_array[i].document_id, db) == true){ best_hits_array[best_hit_number].weight = best_hits_array[i].weight; best_hits_array[best_hit_number].document_id = best_hits_array[i].document_id; best_hits_array[best_hit_number].start_character = doc_entry.start_character; best_hits_array[best_hit_number].end_character = doc_entry.end_character; best_hits_array[best_hit_number].document_length = doc_entry.document_length; best_hits_array[best_hit_number].number_of_lines = doc_entry.number_of_lines; sprintf(best_hits_array[best_hit_number].date, "%d", doc_entry.date); read_filename_table_entry(doc_entry.filename_id, best_hits_array[best_hit_number].filename, best_hits_array[best_hit_number].type, NULL, db), strncpy(best_hits_array[best_hit_number].headline, read_headline_table_entry(doc_entry.headline_id,db), MAX_FILE_NAME_LEN); best_hit_number++; } beFriendly(); } for(i = best_hit_number; i < max_hit_retrieved; i++){ best_hits_array[best_hit_number].weight = 0.0; } /* print_best_hits(s); for debugging */ } /* returns the next best hit */ long best_hit(db, doc_id, best_character, best_line, score,start,end,date, length,nlines,headline,filename,type) database *db; long *doc_id; long *best_character; long *best_line; double *score; long *start,*end,*date,*length,*nlines; char *headline,*filename,*type; { double tmp; *best_character = 0; *best_line = 0; #ifdef BOOL if (currentQuery != NULL) /* for boolean */ { send(currentQuery,GetBestHit,db,doc_id,best_character,best_line,score); if (*doc_id > 0) return(0); /* ok */ else return(-1); /* no more docs */ } #endif /* BOOL */ if(current_best_hit > best_hits_array_len) return(1); if(best_hits_array[current_best_hit].weight == 0.0) return(1); *doc_id = best_hits_array[current_best_hit].document_id; tmp = ((double)(best_hits_array[current_best_hit].weight*VALUE)); *score=tmp; *start=best_hits_array[current_best_hit].start_character; *end=best_hits_array[current_best_hit].end_character; *date=atoi(best_hits_array[current_best_hit].date); *length=best_hits_array[current_best_hit].document_length; *nlines=best_hits_array[current_best_hit].number_of_lines; strcpy(headline,best_hits_array[current_best_hit].headline); strcpy(filename,best_hits_array[current_best_hit].filename); strcpy(type,best_hits_array[current_best_hit].type); current_best_hit++; return(0); } long finished_best_hit(db) database *db; { #ifdef BOOL if (currentQuery != NULL) /* for boolean */ { send(currentQuery,Delete); currentQuery = NULL; return(0); } #endif /* BOOL */ /* if we are on a small machine, we might want to destroy_document_score_array */ clear_document_score_array(); clear_best_hits_array(); current_best_hit = 0; return(0); } /*=============================* *=== Searching for words ===* *=============================*/ /* see irext.h for doc */ long init_search_word (db) database* db; { char fn[256]; strcpy( fn,db->database_file ); strcat( fn,synonym_ext ); syn_ReadFile( fn,&db->syn_Table,&db->syn_Table_Size ); return(0); } #ifdef BOOLEANS /* tung, 10/93 */ long word_id = 0; extern long number_of_qwords ; /* tung, 10/93 */ static boolean gLastAnd= false; static boolean gLastNot= false; #endif /* BOOLEANS */ /* see irext.h for doc */ /* returns -1 if error, 1 if word exists, 0 if not */ long search_word(word,char_pos, line_pos, weight, doc_id, word_pair, db) char *word; /* the word to be searched for */ long char_pos; /* the position of the start of the word */ long line_pos; /* is this needed? not for signature system */ long weight; /* how important the word looks syntactically, such as is it bold */ long doc_id; /* current document, seed words is 0, then it increments into the relevant document */ long word_pair; database *db; { /* this side effects the document_score_array, * and downcases the word. * Returns 0 if successful or word not present, * returns non-0 if an error. * */ long not_full_flag = INDEX_BLOCK_FULL_FLAG; /*start out full so it will go on looking */ long count, index_block_size; long internal_document_id, number_of_valid_entries; double internal_weight; long index_file_block_number; long number_of_occurances; FOUR_BYTE index_buffer_data[INDEX_ELEMENT_SIZE*(1024/4)]; char *index_buffer; char *i; FILE *stream = NULL; #ifdef LITERAL #ifdef WIN32 long txt_pos; /* 2/92 GS TLG */ document_table_entry doc_entry; /* 2/92 GS TLG */ static FILE *txt_stream = NULL; /* 2/92 GS TLG */ #else long txt_pos, icnt, wcnt, pcnt; /* 2/92 GS TLG */ document_table_entry doc_entry; /* 2/92 GS TLG */ static FILE *txt_stream = NULL; /* 2/92 GS TLG */ char cmpr_word[MAX_PHRASE_LENGTH + 1]; /* 2/92 GS TLG */ char phrase[MAX_PHRASE_LENGTH + 1]; /* 2/92 GS TLG */ #endif char txt_filename[MAX_FILENAME_LEN + 1]; /* 2/92 GS TLG */ char *temp_txt_filename = NULL; char prev_txt_filename[MAX_FILENAME_LEN + 1]; /* 2/92 GS TLG */ char txt_type[MAX_TYPE_LEN + 1]; /* 2/92 GS TLG */ long phraselen= 0, txt_pos_fix= 0; char *document_section = NULL; /* tung , 10/93 */ long document_section_len = 0; /* tung , 10/93 */ long phrase_readed = 0; /* tung , 10/93 */ long phrase_count = 0; /* tung , 10/93 */ boolean phrase_found = false; /* tung , 10/93 */ #endif /* LITERAL */ /* do synonym conversion */ /* in theory, one can replace a word with a boolean phrase */ char *newword; double idf; newword = lookup_Synonym( word,db->syn_Table,db->syn_Table_Size ); waislog(WLOG_HIGH,WLOG_INFO,"Word %s Syn %s",word,newword); strncpy(word,newword,MAX_WORD_LENGTH); /* call the stemmer */ #ifdef LITERAL if (weight!=LITERAL_FLAG) { stemmer(word); } #else stemmer(word); #endif /* LITERAL */ /* tung, 10/93 */ #ifdef BOOLEANS if(number_of_qwords > 0) { if((weight!=LITERAL_FLAG) && IsOperator(word)) { boolean_operations(word); return(0); } if(search_result_array == NULL) { search_result_array = (search_result_struct *) s_malloc((size_t)(number_of_qwords * sizeof(search_result_struct))); word_id = 0; } if(strlen(word) == 1) { search_result_array[word_id].number_of_hits = 0; search_result_array[word_id].word_id = word_id; save_word_id(word_id); ++word_id; return(0); } } #endif /* BOOLEANS */ /* tung, 10/93 */ #ifdef LITERAL if (weight==LITERAL_FLAG) { /* goto after_booleans */ /* printf("search_word: literal word is [%s]\n", word); */ } else #endif /* LITERAL */ #ifdef BOOLEANS if (strcmp(word,BOOLEAN_AND)==0) { /* should be all lowercase cmp here */ gLastAnd= true; return(0); } else if (strcmp(word,BOOLEAN_NOT)==0) { /* ^^ this is bad if we intersperse "not"s in a query -- docs found after not word may include notted word -- need to go back to doing not words after others -- but need now to check for literal string first */ gLastNot= true; return(0); } if (weight == BOOLEAN_NOT_FLAG) gLastNot= true; #else ; /* if not LITERAL_FLAG */ #endif /* BOOLEANS */ index_buffer = (char*)index_buffer_data; #ifdef LITERAL if (weight==LITERAL_FLAG) { /* note: we found the first word of phrase once in map_over_words, but i'm too lazy to put another parameter in that cascade of function calls it takes to get here. */ char word1[MAX_WORD_LENGTH + 1]; register int i, len; register boolean more; phraselen= MIN( MAX_PHRASE_LENGTH, strlen(word)); len = MIN( MAX_WORD_LENGTH, phraselen); for (i=0, more=true; i < len && more; ) { word1[i] = word[i++]; #ifdef BIO more= (wordDelimiter(word[i]) == NOT_DELIMITER); #else more= (isalnum(word[i])); #endif /* BIO */ } word1[i]= '\0'; txt_pos_fix= strlen(word1) + 1; /* printf("search_word: literal word1 is [%s]\n", word1); */ index_file_block_number = look_up_word_in_dictionary(word1, &number_of_occurances, db); } else #endif /* LITERAL */ #ifdef PARTIALWORD index_file_block_number = look_up_partialword_in_dictionary(word, &number_of_occurances, db); #else index_file_block_number = look_up_word_in_dictionary(word, &number_of_occurances, db); #endif /* PARTIALWORD */ current_best_hit = 0; /* so that the best hits willstart from 0 */ /* check the document_score_array */ if(document_score_array_len < db->doc_table_allocated_entries) make_document_score_array(db->doc_table_allocated_entries); if(index_file_block_number >= 0){ #ifdef PARTIALWORD while(index_file_block_number > 0){ /* dgg, need 2nd loop here for multiple partwords */ #endif /* PARTIALWORD */ stream = db->index_stream; while((not_full_flag != INDEX_BLOCK_NOT_FULL_FLAG) && (index_file_block_number != 0)){ /* read the index block */ if (0 != fseek(stream, (long)index_file_block_number, SEEK_SET)) { waislog(WLOG_HIGH, WLOG_ERROR, "fseek failed into the inverted file to position %ld", (long)index_file_block_number); #ifdef BOOLEANS gLastNot= gLastAnd= false; #endif /* BOOLEANS */ return(-1); } #ifdef WIN32 _read(fileno(stream),index_buffer,INDEX_BLOCK_HEADER_SIZE); #else fread(index_buffer, INDEX_BLOCK_HEADER_SIZE, 1, stream); #endif ASSIGN(not_full_flag, INDEX_BLOCK_FLAG_SIZE, index_buffer, INDEX_BLOCK_HEADER_SIZE, 0 ); ASSIGN(index_file_block_number,NEXT_INDEX_BLOCK_SIZE, index_buffer+INDEX_BLOCK_FLAG_SIZE, INDEX_BLOCK_HEADER_SIZE, INDEX_BLOCK_FLAG_SIZE); ASSIGN(index_block_size,INDEX_BLOCK_SIZE_SIZE, index_buffer+INDEX_BLOCK_FLAG_SIZE+NEXT_INDEX_BLOCK_SIZE, INDEX_BLOCK_HEADER_SIZE, INDEX_BLOCK_FLAG_SIZE+NEXT_INDEX_BLOCK_SIZE); /* this is equivalent, but slower: not_full_flag = read_bytes(INDEX_BLOCK_FLAG_SIZE, stream); index_file_block_number = read_bytes(NEXT_INDEX_BLOCK_SIZE, stream); index_block_size = read_bytes(INDEX_BLOCK_SIZE_SIZE, stream); */ /* Jim's debug code commented out printf("flag = %d, block_num = %d, block_size = %d\n", not_full_flag, index_file_block_number, index_block_size); */ fflush(stdout); if(EOF == index_block_size) { waislog(WLOG_HIGH, WLOG_ERROR, "reading from the index file failed"); #ifdef BOOLEANS gLastNot= gLastAnd= false; #endif /* BOOLEANS */ return(-1); } if(not_full_flag == INDEX_BLOCK_NOT_FULL_FLAG){ /* not full */ number_of_valid_entries = index_file_block_number; } else if(not_full_flag == INDEX_BLOCK_FULL_FLAG){ /* full */ number_of_valid_entries = index_block_size - INDEX_BLOCK_HEADER_SIZE; } else{ /* bad news, file is corrupted. */ waislog(WLOG_HIGH, WLOG_ERROR, "Expected the flag in the inverted file to be valid. it is %ld", not_full_flag); #ifdef BOOLEANS gLastNot= gLastAnd= false; #endif /* BOOLEANS */ return(-1); } /* printf("number of valid bytes: %ld\n", number_of_valid_entries); */ /* add the array to the document_score_array */ number_of_valid_entries /= INDEX_ELEMENT_SIZE; /* tung, 10/93 */ #ifdef BOOLEANS if((number_of_qwords > 0) && (search_result_array != NULL)) { if(search_result_array[word_id].doc_ids_array == NULL) search_result_array[word_id].doc_ids_array = (doc_descr_struct *) s_malloc((size_t)(sizeof(doc_descr_struct) * number_of_valid_entries)); search_result_array[word_id].number_of_hits = number_of_valid_entries; } #endif /* BOOLEANS */ /* tung, 10/93 */ /* ses - idf is a fist approximation to the inverse document freq. */ /* what it actually is is the inverse occurance frequency which says that the significance of a word is inversly proportional to the number of times it occurs in the database */ idf=1.0/number_of_occurances; for(count=0;count < number_of_valid_entries;count++) { int wgt; int did; /* if(count%1024 == 0) { read(fileno(stream),index_buffer,INDEX_ELEMENT_SIZE* MIN(1024,number_of_valid_entries-count)); i=index_buffer; } */ did = read_bytes(DOCUMENT_ID_SIZE, stream); (void)read_bytes(WORD_POSITION_SIZE, stream); txt_pos=read_bytes(CHARACTER_POSITION_SIZE, stream); wgt = read_bytes(WEIGHT_SIZE,stream); /* ASSIGN(wgt,WEIGHT_SIZE, i+DOCUMENT_ID_SIZE+WORD_POSITION_SIZE+CHARACTER_POSITION_SIZE, INDEX_ELEMENT_SIZE, DOCUMENT_ID_SIZE+WORD_POSITION_SIZE+CHARACTER_POSITION_SIZE); ASSIGN(did,DOCUMENT_ID_SIZE,i,INDEX_ELEMENT_SIZE,0); */ #ifdef LITERAL /* dgg -- is this proper update of read form to ASSIGN form ??*/ /* txt_pos = read_bytes(CHARACTER_POSITION_SIZE, stream);*/ /* 2/92 GS TLG */ if ((weight == LITERAL_FLAG) && (0 == doc_id)) { /* ASSIGN(txt_pos,CHARACTER_POSITION_SIZE,i+DOCUMENT_ID_SIZE+WORD_POSITION_SIZE, INDEX_ELEMENT_SIZE,DOCUMENT_ID_SIZE+WORD_POSITION_SIZE); */ /* printf("search_word: txtpos=%d, wgt=%d, did=%d\n", txt_pos, wgt, did); */ } #endif /* LITERAL */ /* Commented out as suggested by Stan Isaacs at hp.com to come up with correct * weights when there are multiple documents in a file * * if(wgt>5L) * wgt-=5L; */ internal_weight = log((double)wgt); internal_weight+=10.0; internal_document_id = did; /* printf("entry %ld, Doc_id: %ld, weight %lf \n", count, internal_document_id, internal_weight); fflush(stdout); */ if(EOF == wgt) { waislog(WLOG_HIGH, WLOG_ERROR, "reading from the doc-id table failed"); #ifdef BOOLEANS gLastNot= gLastAnd= false; #endif /* BOOLEANS */ return(-1); } #ifdef LITERAL if ((weight == LITERAL_FLAG) && (0 == doc_id)) { /* 2/92 GS TLG */ if (true == read_document_table_entry(&doc_entry, /* 2/92 GS TLG */ internal_document_id, db)) /* 2/92 GS TLG */ { /* 2/92 GS TLG */ read_filename_table_entry(doc_entry.filename_id, /* 2/92 GS TLG */ txt_filename, txt_type, NULL, db); /* 2/92 GS TLG */ /* printf("search_word: document is [%s]\n", txt_filename); */ if (NULL == txt_stream) { if (probe_file(txt_filename)) { txt_stream = s_fopen(txt_filename, "rb"); } else if (probe_file_possibly_compressed(txt_filename)) { temp_txt_filename = s_fzcat(txt_filename); if (temp_txt_filename) { txt_stream = s_fopen(temp_txt_filename, "rb"); } } strcpy(prev_txt_filename, txt_filename); } else if (0 != strcmp(txt_filename, prev_txt_filename)) { s_fclose(txt_stream); if ( temp_txt_filename != NULL ) { unlink(temp_txt_filename); s_free(temp_txt_filename); } if (probe_file(txt_filename)) { txt_stream = s_fopen(txt_filename, "rb"); } else if (probe_file_possibly_compressed(txt_filename)) { temp_txt_filename = s_fzcat(txt_filename); if (temp_txt_filename) { txt_stream = s_fopen(temp_txt_filename, "rb"); } } strcpy(prev_txt_filename, txt_filename); /* 2/92 GS TLG */ } txt_pos += doc_entry.start_character - txt_pos_fix; /* dgg */ document_section_len = doc_entry.end_character - txt_pos; /* tung, 10/93 */ s_fseek(txt_stream, txt_pos, SEEK_SET); /* 2/92 GS TLG */ document_section = (char*) s_malloc((size_t)((document_section_len+1)*sizeof(char))); /* tung, 10/93 */ fgets(document_section, document_section_len, txt_stream); /* tung, 10/93 */ phrase_readed = 0; /* tung, 10/93 */ phrase_readed += strlen(document_section); /* tung, 10/93 */ document_section = string_downcase(document_section); /* tung, 10/93 */ #if 0 fread(phrase, 1L, phraselen, txt_stream); /* 2/92 GS TLG */ /* { phrase[phraselen]= '\0'; printf("search_word: file phrase is [%s]\n", phrase); } */ #ifdef WIN32 if (0 != _strnicmp(word, phrase, phraselen)) /* 2/92 GS TLG */ #else if (0 != strncasecmp(word, phrase, phraselen)) /* 2/92 GS TLG */ #endif internal_weight = 0.0; /* 2/92 GS TLG */ #endif /* 0 */ if (NULL == strstr(document_section, word)) { /* tung, 10/93 */ while(phrase_readed < document_section_len) { /* tung, 10/93 */ fgets(document_section, document_section_len, txt_stream); /* tung, 10/93 */ phrase_readed += strlen(document_section); /* tung, 10/93 */ document_section = string_downcase(document_section); /* tung, 10/93 */ if(strstr(document_section, word) != NULL) { /* tung, 10/93 */ phrase_found = true; /* tung, 10/93 */ break; /* tung, 10/93 */ } /* tung, 10/93 */ } /* tung, 10/93 */ if(phrase_found == false) /* tung, 10/93 */ internal_weight = 0.0; /* tung, 10/93 */ phrase_found = false; /* tung, 10/93 */ } s_free(document_section); /* tung, 10/93 */ } } #endif /* LITERAL */ #ifdef BOOLEANS if (gLastNot) { document_score_array[internal_document_id] = 0; /* printf("search_word: boolean 'not' scored\n"); */ } else #endif /* BOOLEANS */ { /* if(doc_id > 0) we are doing a relevant document */ /* printf("wgt: %ld, internal weight: %lf, idf: %lf occurances: %ld\n", wgt,internal_weight, idf,number_of_occurances); fflush(stdout); */ internal_weight*=idf; /* ses - for inverse doc. freq. */ #ifndef BOOLEANS document_score_array[internal_document_id] += (doc_id) ? (internal_weight * RF_WEIGHTING) : internal_weight; #else /* tung, 10/93 */ if(number_of_qwords == 0) { document_score_array[internal_document_id] += (doc_id) ? (internal_weight * RF_WEIGHTING) : internal_weight; } else { if((number_of_qwords > 0) && (search_result_array != NULL)) { if(weight == LITERAL_FLAG) { if(document_score_array[internal_document_id] > 0) { ((search_result_array[word_id]).doc_ids_array[phrase_count]).doc_id = internal_document_id; ((search_result_array[word_id]).doc_ids_array[phrase_count]).score += (doc_id) ? (internal_weight * RF_WEIGHTING) : internal_weight; phrase_count++; search_result_array[word_id].number_of_hits = phrase_count; } } else { ((search_result_array[word_id]).doc_ids_array[count]).doc_id = internal_document_id; ((search_result_array[word_id]).doc_ids_array[count]).score += (doc_id) ? (internal_weight * RF_WEIGHTING) : internal_weight; } } } #endif /* BOOLEANS */ /* tung, 10/93 */ } /* printf("Score array: %lf\n",document_score_array[internal_document_id]); fflush(stdout); */ i+=INDEX_ELEMENT_SIZE; } } #ifdef PARTIALWORD index_file_block_number = look_up_partialword_in_dictionary(NULL, &number_of_occurances, db); } #endif /* PARTIALWORD */ /* tung, 10/93 */ #ifdef BOOLEANS if((number_of_qwords > 0) && (search_result_array != NULL)) { save_word_id(word_id); search_result_array[word_id].word_id = word_id; ++word_id; } #endif /* BOOLEANS */ /* tung, 10/93 */ #ifdef BOOLEANS if(number_of_qwords == 0) { for (count=0; count < db->doc_table_allocated_entries; count++) { if (!gLastAnd) { prev_score_array[count] = document_score_array[count]; } else { if ((document_score_array[count] == prev_score_array[count]) || (prev_score_array[count] == 0)) { document_score_array[count] = 0; prev_score_array[count] = 0; } else { prev_score_array[count] = document_score_array[count]; } } } /* if (gLastAnd) printf("search_word: boolean `and' scored\n"); */ } #endif /* BOOLEANS */ #ifdef BOOLEANS gLastNot= gLastAnd= false; #endif /* BOOLEANS */ return(1); /* word present */ } else if(0 == index_file_block_number){ /* an error occurred on looking up the word */ #ifdef BOOLEANS gLastNot= gLastAnd= false; #endif /* BOOLEANS */ return(-1); } else { /* index_file_block_number is negative */ #ifdef BOOLEANS /* tung, 10/93 */ if((number_of_qwords > 0) && (search_result_array != NULL)) { save_word_id(word_id); search_result_array[word_id].word_id = word_id; search_result_array[word_id].number_of_hits = 0; ++word_id; } /* tung, 10/93 */ else { if (gLastAnd) for (count=0; count < db->doc_table_allocated_entries; count++) { document_score_array[count] = 0; prev_score_array[count] = 0; } } gLastNot= gLastAnd= false; #endif /* BOOLEANS */ return(0); /* word not present */ } } /* now collect the best hits */ long finished_search_word(db) database *db; { #ifdef BOOLEANS long number_of_hits; /* tung, 10/93 */ #endif /* BOOLEANS */ #ifdef BOOL if (currentQuery != NULL) return; /* do nothing for boolean */ #endif /* def BOOL */ /* tung, 10/93 */ #ifdef BOOLEANS if((number_of_qwords > 0) && (search_result_array != NULL)) number_of_hits = retriev_result(db->doc_table_allocated_entries); #endif /* BOOLEANS */ /* tung, 10/93 */ /* check the document_score_array */ if(document_score_array_len < db->doc_table_allocated_entries) make_document_score_array(db->doc_table_allocated_entries); make_best_hits_array(max_hit_retrieved); sort_best_hits(db); syn_Free( db->syn_Table,&db->syn_Table_Size ); return(0); }