home *** CD-ROM | disk | FTP | other *** search
- /* WIDE AREA INFORMATION SERVER SOFTWARE:
- No guarantees or restrictions. See the readme file for the full standard
- disclaimer.
-
- Brewster@think.com
- */
-
- /* Copyright (c) CNIDR (see ../COPYRIGHT) */
-
-
- /* Change log:
- * $Log: irtfiles.c,v $
- *
- * Revision 1.4a 93/07/23 02:19:27 warnock
- * corrected arguments in call to map_over_words in finish_document
- *
- * Revision 1.4 1993/09/22 16:07:52 pfeifer
- * Fixed word breaking for german ISO Umlaute and sz
- *
- * Revision 1.3a 93/07/19 17:06:03 warnock
- * fixed problem with multiple documents in single file, from isaacs@hpcc05.corp.hp.com
- *
- * Revision 1.3 1993/06/04 10:23:15 pfeifer
- * Patchlevel BIBDB
- *
- * Revision 1.2a 93/07/19 16:31:27 warnock
- * Added document type URL from Nathan.Torkington@vuw.ac.nz
- *
- * Revision 1.2 1993/06/01 14:05:54 pfeifer
- * Added code for soundex/phonix indexing and retrieval
- *
- * Revision 1.1 1993/02/16 15:05:35 freewais
- * Initial revision
- *
- * Revision 1.32 92/05/06 17:32:14 jonathan
- * Added new global for current_filename and current_filecount (from
- * riddle@rice.edu).
- *
- * Revision 1.31 92/04/30 12:25:09 jonathan
- * changed a couple of s_free's to free's for ULTRIX CC.
- *
- * Revision 1.30 92/04/29 08:09:55 shen
- * add global variable "_indexable_section", default is true
- *
- * Revision 1.29 92/04/28 17:53:24 jonathan
- * Replaced directory routines with scandir.
- *
- * Revision 1.28 92/03/20 11:02:55 jonathan
- * Added code to handle switches for word_pairs and word_position info.
- *
- * Revision 1.27 92/02/13 11:23:21 jonathan
- * Removed printable_time() from index logging, since it's done by waislog.
- *
- * Revision 1.26 92/02/12 13:31:29 jonathan
- * Added "$Log" so RCS will put the log message in the header
- *
- */
-
- /*
- * Indexes the words in a text file.
- *
- * Port of irtfiles.lisp.
- *
- * -brewster 6/90
- */
-
- /* the main functions are:
- * index_text_file
- * index_directory
- *
- * Some of the policy issues coded in this file are
- * What extra weight should the headline get?
- *
- */
-
- #include <ctype.h>
- #include <string.h>
- #include "panic.h"
- #include "irdirent.h"
- #include "irhash.h"
- #include "cutil.h"
- #include "futil.h"
- #include "irfiles.h"
- #include "irtfiles.h"
-
- #include "ircfiles.h" /* dgg, need for genbank_header_function test */
- #include "stemmer.h"
-
- #ifdef SOUND
- #include "soundex.h"
- #endif
-
- #ifndef THINK_C
- #include <sys/types.h>
- #include <sys/stat.h>
- #endif /* ndef THINK_C */
-
- #ifdef WIN32
- #include <windows.h>
- long add_word(char*,long,long,long,long,time_t,long,database*,boolean);
- boolean wordbreak_isiso(long);
- #endif
-
/* Size, in characters, of the fixed buffers (and of the fgets() limit)
   used when a file is read chunk by chunk. */
#define MAX_LINE_LENGTH 1000

/* Weight handed to map_over_words() when finish_document re-indexes the
   headline of a document. */
#define extra_weight_for_header 10

/* First argument of the cprintf() progress output in finish_document.
   Also defined in irfiles.c. */
#if defined(UNIX)
#define PRINT_AS_INDEXING true
#else
#define PRINT_AS_INDEXING false
#endif
-
- char* header_flag_1;
- char* header_flag_2;
- long len_of_files_since_last_delete = 0;
- long len_of_files_since_last_flush = 0;
- long total_indexed_file_length = 0;
-
- boolean indexingForBeta = false;
-
- long _indexable_section = 1;
-
- char *current_filename = NULL;
- int current_filecount = 0;
-
- boolean index_contents = true;
-
-
- #define keyword_weight 1
-
- /* keywords from command line (set in waisindex.c), used in finish_document */
-
- char* keywords = NULL;
-
- /* name of keyword file from command line, used in finish_document */
- char* keyword_filename = NULL;
-
- #ifdef WIN32
- /* excluded filename(s) from command line, used in index_text_file */
- /* File names are separated by \0 and terminated by \0\0 */
- char ExcludeFiles[EXCLUDEFILENAMESLEN] = "";
- #endif
-
-
- #ifdef WIN32
-
/*
 * Compare Trial against Mask one character at a time.  A '?' in Mask
 * matches any single character of Trial; every other Mask character must
 * equal the Trial character at the same position.  Mask may be longer
 * than Trial as long as the surplus consists solely of '?' characters.
 * Returns 1 on a match and 0 otherwise.
 */
static int MatchFileNameComponent(char *Mask,char *Trial) {
  char *m = Mask;
  char *t = Trial;

  /* Walk both strings in step until one of them is exhausted. */
  for (; *t != '\0' && *m != '\0'; m++, t++) {
    if (*m != '?' && *m != *t)
      return 0;
  }

  /* Trial still has characters but Mask has nothing left to match them. */
  if (*t != '\0')
    return 0;

  /* Trial is used up: whatever remains of Mask must be nothing but '?'. */
  for (; *m != '\0'; m++) {
    if (*m != '?')
      return 0;
  }
  return 1;
}
-
- /*
- * Return non-zero if the supplied file name matches the possibly
- * wildcarded file mask.
- */
- static int WildcardMatch(char *FileMask,char *FileName) {
- char ExpandedMaskName[MAX_FILENAME_LEN];
- char ExpandedMaskExtn[MAX_FILENAME_LEN];
- char TempFileName[MAX_FILENAME_LEN];
- char *p;
- int i;
- int j;
-
- i = 0;
- /* Expand the Name portion of the mask */
- j = 0;
- while (FileMask[i]) {
- if (FileMask[i]=='.') {
- /* We've reached the extension portion. */
- break;
- }
- if (FileMask[i]=='*') {
- /* Expand * to multiple ??? to the end of the string then break. */
- while (j<MAX_FILENAME_LEN-1) ExpandedMaskName[j++] = '?';
- break;
- }
- /* Just copy in the other characters */
- ExpandedMaskName[j++] = FileMask[i++];
- }
- ExpandedMaskName[j] = '\0';
- /* Find the extension */
- while (FileMask[i]) {
- if (FileMask[i]=='.') {
- i++;
- break;
- }
- i++;
- }
- /* Expand the Extension portion of the mask */
- j = 0;
- while (FileMask[i]) {
- if (FileMask[i]=='*') {
- /* Expand * to multiple ??? to the end of the string then break. */
- while (j<MAX_FILENAME_LEN) ExpandedMaskExtn[j++] = '?';
- ExpandedMaskExtn[MAX_FILENAME_LEN-1] = '\0';
- break;
- }
- /* Just copy in the other characters */
- ExpandedMaskExtn[j++] = FileMask[i++];
- }
- ExpandedMaskExtn[j] = '\0';
- /* Change masks to upper case */
- strupr(ExpandedMaskName);
- strupr(ExpandedMaskExtn);
- /* Duplicate the filename string cos we'll change it */
- strncpy(TempFileName,FileName,MAX_FILENAME_LEN-2);
- strupr(TempFileName);
- TempFileName[strlen(TempFileName)+2] = '\0'; /* Ensure its doubly-null terminated */
- /* Now match the file name portion against the supplied name */
- p = strtok(TempFileName,".");
- if (MatchFileNameComponent(ExpandedMaskName,p)) {
- p += strlen(p);
- p++;
- if (MatchFileNameComponent(ExpandedMaskExtn,p)) {
- return 1;
- }
- }
- return 0;
- }
- #endif
-
- /* Handling Word Pairs */
-
- /* makes a word_pair out of a two words:
- make_joint_word("abcdefghijklmnopqrstuvwxyz", "123456789012345678901");
- "abcdefghij1234567890"
- make_joint_word("abcdefghijkl", "123");
- "abcdefghij123"
- make_joint_word("abc", "123");
- "abc123" */
-
- char *make_joint_word(word1, word2)
- char* word1;
- char* word2;
- {
- static char new_word[MAX_WORD_LENGTH + 1];
- strncpy(new_word, word1, MAX_WORD_LENGTH / 2);
- strncpy(new_word + MIN(MAX_WORD_LENGTH / 2, strlen(word1)),
- word2, MAX_WORD_LENGTH - (MAX_WORD_LENGTH / 2));
- return(new_word);
- }
-
- /* returns 0 is successful, non-0 if error */
- static long add_word_before_pairs _AP((char *word, long char_pos,
- long line_pos, long weight,
- long doc_id, time_t date,
- boolean capitalized, database* db,
- boolean word_position, boolean word_pairs));
-
- static long
- add_word_before_pairs(word, char_pos, line_pos,
- weight, doc_id, date, capitalized, db,
- word_position, word_pairs)
- char *word; /* the word to be indexed, this could be a
- word pair. If NULL there are no more words
- to be indexed */
- long char_pos; /* the position of the start of the
- word */
- long line_pos; /* this is passed for the best
- section calculation */
- long weight; /* how important the word looks
- syntactically (such as is it bold)
- NOT used by signature system */
- long doc_id; /* current document, this will never be 0 */
- time_t date; /* display day of this document, 0 if not known */
- boolean capitalized; /* if the word started with a cap */
- database* db; /* database to insert the document */
- boolean word_position; /* if true, include word position in index. */
- boolean word_pairs; /* if true, add pairs of capitalized words */
- {
- static char last_word[MAX_WORD_LENGTH + 1];
- static long last_doc_id = -1;
- /* The way it works is it remembers if the last word if it was
- capitalized (if not it clears the saved word).
- If another capitalized word comes along next
- (and it is in the same document), then it makes a joint word and calls
- add_word with it.
-
- This does not throw away stopwords before forming pairs, so it will
- not be quite what CMDRS does. This should only be used in seeker
- and serial searching before proximity is used.
-
- */
- if(capitalized && word_pairs){
- if(last_word[0] != '\0' && last_doc_id == doc_id){
- #ifdef WIN32
- add_word(make_joint_word(last_word, word),
- char_pos, line_pos, weight, doc_id, date, 1L, db, word_position);
- #else
- add_word(make_joint_word(last_word, word),
- char_pos, line_pos, weight, doc_id, date, 1L, db);
- #endif
- }
- else{
- last_word[0] = '\0';
- }
- strncpy(last_word, word, MAX_WORD_LENGTH);
- last_doc_id = doc_id;
- }
- else{ /* not capitalized or word_pairs is false */
- last_word[0] = '\0';
- }
- return(add_word(word, char_pos, line_pos, weight, doc_id, date, 0L, db, word_position));
- }
-
-
- #ifdef NOTUSED
- #define WORD_LETTERS "abcdefghijklmnopqrstuvwxyzABCDEFGHIJKLMNOPQRSTUVWXYZ1234567890"
-
-
- static char *new_word _AP((char* line,char* word));
-
- static char *new_word(line,word)
- char *line;
- char *word;
- {
- /* This copies the first word from line into word while downcasing it.
- It returns a pointer into line that is after the word,
- which can be used to call this function again.
- If there are no words left, then NULL is returned,
- and word is length 0.
- There has got to be a better way.
- */
- long i = 0;
- char *beginning_ptr = strpbrk(line, WORD_LETTERS);
- char *next_word;
- long length;
- if(NULL == beginning_ptr){
- word[0] = '\0';
- return(NULL);
- }
- length = strspn(beginning_ptr, WORD_LETTERS);
- next_word = length + beginning_ptr;
-
- length = MIN(MAX_WORD_LENGTH,length);
- for(i=0; i<length; i++){
- word[i] = char_downcase((unsigned long)*beginning_ptr++);
- }
- word[i] = '\0';
- return(next_word);
- }
-
- static boolean reasonable_word _AP((char* word));
-
- static boolean reasonable_word(word)
- char* word;
- /* this should be more sophisticated */
- {
- if(strlen(word) > 1){
- return(TRUE);
- }
- else{
- return(FALSE);
- }
- }
-
- #endif /* def NOTUSED */
-
-
-
- /* MAPPING A FUNCTION OVER WORDS (QUICKLY) */
-
-
- /* map_over_words("foo bar baz", 0L, 1L, 0L, &integer, false, db, dummy_wordfunction) */
- static long dummy_wordfunction(word, char_pos, line_pos,
- weight, doc_id, date, capitalized, db)
- char *word; /* the word to be indexed, this could be a
- word pair. If NULL there are no more words
- to be indexed */
- long char_pos; /* the position of the start of the
- word */
- long line_pos; /* this is passed for the best
- section calculation */
- long weight; /* how important the word looks
- syntactically (such as is it bold)
- NOT used by signature system */
- long doc_id; /* current document, this will never be 0 */
- time_t date; /* display day of this document, 0 if not known */
- boolean capitalized; /* if the word started with a cap */
- database* db; /* database to insert the document */
- {
- if(word != NULL)
- printf("word: %s, char_pos: %ld\n", word, char_pos);
- return(0);
- }
-
-
-
-
- /* returns the number of words added, or -1 if an error occurred */
- long map_over_words(line,
- document_id,
- weight,
- file_position_before_line,
- line_length,
- newline_terminated,
- db,
- wordfunction,
- word_position, word_pairs,
- #ifdef SOUND
- minwordlen, type)
- #else
- minwordlen) /* dgg */
- #endif
-
- char* line;
- long document_id;
- long weight;
- long file_position_before_line;
- long *line_length;
- boolean *newline_terminated;
- database* db;
- wordfunc *wordfunction;
- boolean word_position, word_pairs;
- int minwordlen;
- #ifdef SOUND
- char* type;
- #endif
-
- {
- /* Add words to the index if it should be done.
- * Returns the number of words added.
- * Should it return the amount of weight added?
- * The line length is side effected with the length of the line.
- * Newline_terminated is set based on whether the last character
- * in the string was a newline. If it was not, then it fgets probably
- * did not retrieve the whole line.
- */
-
- long position_in_word = 0;
- long word_count = 0;
- unsigned long ch;
- long char_count = 0;
- boolean capitalized = false; /* if the word starts with a cap */
- char word[MAX_WORD_LENGTH + 1];
-
-
- for(ch = (unsigned char)line[char_count++];
- ch != '\0'; ch = (unsigned char)line[char_count++]){
- #ifdef BIO
- boolean alnum = (wordDelimiter(ch) == NOT_DELIMITER);
- #else
- boolean alnum = isalnum(ch) || wordbreak_isiso(ch);
- #endif
-
- if(alnum){
- /* put the character in the word if not too long */
- if(position_in_word == 0)
- capitalized = isupper((unsigned long)ch)?true:false;
- if(position_in_word < MAX_WORD_LENGTH){
- word[position_in_word++] = char_downcase((unsigned long)ch);
- }
- }
- else{ /* not an in a word */
- if(position_in_word != 0){
- /* then we have collected a word */
- if(position_in_word >= minwordlen){ /* is it reasonable ? */
- word[position_in_word] = '\0';
-
- /* call the stemmer */
- stemmer(word);
-
- if(0 !=
- (*wordfunction)(word,
- file_position_before_line + char_count,
- /*^^ dgg, this param is supposed to be start-of-word, but char_count is now at end-of-word !*/
- 0L, /* line_pos */
- weight,
- document_id,
- (time_t)0L,
- capitalized,
- db,
- word_position,
- word_pairs))
- return(-1); /* error */
- word_count++;
- }
-
- #ifdef SOUND
- /*=========================== SOUNDEX / PHONIX ========================================*/
- if ((word_count == 1) && (type != NULL)) /* use only the first word (i.e. the surname) for SOUNDEX/PHONIX! */
- if ((!strcmp(type, "SOUNDEX")) || (!strcmp(type, "PHONIX")))
- {
- char code[20];
- #ifndef WIN32
- int i;
- #endif
-
- if (!strcmp(type, "SOUNDEX"))
- Soundex(word, code);
- else if (!strcmp(type, "PHONIX"))
- Phonix(word, code);
-
- code[0] = tolower(code[0]);
- #ifdef DEBUG
- fprintf(stderr, "%5d, %4s, %s\n", word_count, code, word);
- #endif
- if (0 != (*wordfunction) (code,
- file_position_before_line + char_count,
- 0L, /* line_pos */
- weight,
- document_id,
- (time_t) 0L,
- capitalized,
- db,
- word_position,
- word_pairs))
- return(-1); /* error */
-
- word_count++;
- }
- /*=====================================================================================*/
- #endif
-
- position_in_word = 0;
- }
- }
- }
- /* finish last word */
- if(position_in_word >= minwordlen){ /* is it reasonable ? */
- word[position_in_word] = '\0';
-
- /* call the stemmer */
- stemmer(word);
-
- if(0 != (*wordfunction)(word,
- file_position_before_line + char_count,
- 0L, /* line_pos */
- weight,
- document_id,
- (time_t)0L,
- capitalized,
- db,
- word_position, word_pairs))
- return(-1);
- word_count++;
- }
-
- /* for debugging
- if(char_count - 1 != strlen(line)) {
- waislog(WLOG_HIGH, WLOG_ERROR,
- "char_count: %ld, strlen: %ld", char_count, strlen(line));
- }
- */
- if(newline_terminated != NULL){
- if('\n' != line[char_count-2])
- *newline_terminated = false;
- else
- *newline_terminated = true;
- }
- if(line_length != NULL)
- *line_length = char_count - 1;
- return(word_count);
- }
-
-
- static long add_words_if_appropriate
- _AP((char* line,long document_id,long weight,long file_position_before_line,
- long* line_length,boolean* newline_terminated,database* db,
- boolean word_position, boolean word_pairs,
- int minwordlen));
-
- static long
- add_words_if_appropriate(line,
- document_id,
- weight,
- file_position_before_line,
- line_length,
- newline_terminated,
- db,
- word_position, word_pairs,
- minwordlen) /* dgg */
- char* line;
- long document_id;
- long weight;
- long file_position_before_line;
- long *line_length;
- boolean *newline_terminated;
- database* db;
- boolean word_position, word_pairs;
- int minwordlen;
- {
- /* Add words to the index if it should be done.
- * Returns the number of words added.
- * Should it return the amount of weight added?
- * The line length is side effected with the length of the line.
- * Newline_terminated is set based on whether the last character
- * in the string was a newline. If it was not, then it fgets probably
- * did not retrieve the whole line.
- */
-
- long position_in_word = 0;
- long word_count = 0;
- char word[MAX_WORD_LENGTH + 1];
- unsigned long ch;
- long char_count = 0;
- boolean capitalized = false; /* if the word starts with a cap */
-
- for(ch = (unsigned char)line[char_count++];
- ch != '\0'; ch = (unsigned char)line[char_count++]){
- #ifdef BIO
- boolean alnum = (wordDelimiter(ch) == NOT_DELIMITER);
- #else
- boolean alnum = isalnum(ch);
- #endif
- if(alnum){
- /* put the character in the word if not too long */
- if(position_in_word == 0)
- capitalized = isupper((unsigned long)ch)?true:false;
- if(position_in_word < MAX_WORD_LENGTH){
- word[position_in_word++] = char_downcase((unsigned long)ch);
- }
- }
- else{ /* not an in a word */
-
- /* call the stemmer */
- stemmer(word);
-
- if(position_in_word != 0){
- /* then we have collected a word */
- if(position_in_word >= minwordlen){ /* is it reasonable ? */
- word[position_in_word] = '\0';
- add_word_before_pairs(word,
- file_position_before_line + char_count,
- 0L, /* line_pos */
- weight,
- document_id,
- (time_t)0L,
- capitalized,
- db,
- word_position, word_pairs);
- word_count++;
- }
- position_in_word = 0;
- }
- }
- }
- /* finish last word */
- if(position_in_word >= minwordlen){ /* is it reasonable ? */
- word[position_in_word] = '\0';
-
- /* call the stemmer */
- stemmer(word);
-
- add_word(word,
- file_position_before_line + char_count,
- 0L, /* line_pos */
- weight,
- document_id,
- (time_t)0L,
- 0L,
- #ifdef WIN32
- db,
- word_position);
- #else
- db);
- #endif
- word_count++;
- }
-
- /* for debugging
- if(char_count - 1 != strlen(line)) {
- waislog(WLOG_HIGH, WLOG_ERROR,
- "char_count: %ld, strlen: %ld", char_count, strlen(line));
- }
- */
- if('\n' != line[char_count-2])
- *newline_terminated = false;
- else
- *newline_terminated = true;
-
- *line_length = char_count - 1;
- return(word_count);
- }
-
- #ifdef WIN32
- static int nodecompare _AP((const void* i,const void* j));
- #else
- static int nodecompare _AP((unsigned long* i,unsigned long* j));
- #endif
-
/* Three-way comparison of two node records by their first element only.
   Returns -1, 0 or 1 as the first element of i is smaller than, equal to
   or larger than the first element of j. */
static int
#ifdef WIN32
nodecompare(const void *I,const void *J)
{
  unsigned long *i = (unsigned long *) I;
  unsigned long *j = (unsigned long *) J;
#else
nodecompare(i,j)
unsigned long *i, *j;
{
#endif
  /* each comparison yields 0 or 1, so the difference is -1, 0 or 1 */
  return((i[0] > j[0]) - (i[0] < j[0]));
}
-
- #define nodeRange 256 /* 2048 sprint nodes on a full sized machine - should
- be passed in */
- #define iterations_to_reorder 50 /* 1 is best but slow */
-
- static void finish_document
- #ifndef SOUND
- _AP((boolean recountHeader, char* header,char* line,long document_id,
- document_table_entry* the_document_table_entry,
- long file_position_before_line,
- long file_position_before_document,database* db,
- boolean word_position, boolean word_pairs,
- int minwordlen));
- #else
- _AP((boolean recountHeader, char* header,char* line,long document_id,
- document_table_entry* the_document_table_entry,
- long file_position_before_line,
- long file_position_before_document,database* db,
- boolean word_position, boolean word_pairs,
- int minwordlen,
- char* type));
- #endif
-
- static void
- finish_document(recountHeader, header,line,document_id,the_document_table_entry,
- file_position_before_line, file_position_before_document,
- db, word_position, word_pairs,
- #ifndef SOUND
- minwordlen)
- #else
- minwordlen, type)
- #endif
- boolean recountHeader;
- char* header;
- char* line;
- long document_id;
- document_table_entry* the_document_table_entry;
- long file_position_before_line;
- long file_position_before_document;
- database* db;
- boolean word_position, word_pairs;
- int minwordlen;
- #ifdef SOUND
- char* type;
- #endif
- { long line_length;
- boolean newline_terminated;
- long number_of_words;
-
-
- if(0 != strlen(header) && recountHeader){
- /* add weights for the header (if there was one) */
- long number_of_words =
- map_over_words(header, document_id,
- extra_weight_for_header,
- file_position_before_line-
- file_position_before_document,
- &line_length,
- &newline_terminated,
- db,
- add_word_before_pairs,
- word_position, word_pairs,
- #ifdef SOUND
- minwordlen, type);
- #else
- minwordlen);
- #endif
-
- if(number_of_words == -1)
- waislog(WLOG_HIGH, WLOG_ERROR, "map_over_words failed");
- db->total_word_count += number_of_words;
- the_document_table_entry->document_length += number_of_words;
- }
-
- if(keyword_filename != NULL){
- /* add keywords from keyword file (if specified on command line) */
-
- char *tmpFileName = NULL;
- FILE* keyword_stream = NULL;
- char line[MAX_LINE_LENGTH];
-
- if(keyword_filename != NULL &&
- strlen(keyword_filename) > 1 &&
- !strcmp(keyword_filename+(strlen(keyword_filename)-2), ".Z"))
- /* it's a .Z file. First, remove the suffix or many things get confused. */
- keyword_filename[(strlen(keyword_filename)-2)] = 0;
-
- if(probe_file(keyword_filename)) {
- keyword_stream = s_fopen(keyword_filename, "r");
- }
- else if(probe_file_possibly_compressed(keyword_filename)) {
- tmpFileName = s_fzcat(keyword_filename);
- if (tmpFileName) {
- keyword_stream = s_fopen(keyword_filename, "r");
- unlink(tmpFileName);
- free(tmpFileName);
- }
- }
-
- if(NULL == keyword_stream)
- waislog(WLOG_HIGH, WLOG_ERROR,
- "Unable to open keyword file %s", keyword_filename);
- else { /* read keyword_file, index its contents */
- waislog(WLOG_HIGH, WLOG_INDEX,
- "Indexing keyword file %s", keyword_filename);
- while(TRUE){
- /* read a line */
- if( !fgets(line, MAX_LINE_LENGTH, keyword_stream) )
- break; /* eof */
- number_of_words =
- map_over_words(line, document_id,
- keyword_weight,
- 0,
- &line_length,
- &newline_terminated,
- db,
- add_word_before_pairs,
- #ifdef SOUND
- word_position, word_pairs, minwordlen, type);
- #else
- word_position, word_pairs,minwordlen);
- #endif /* SOUND */
- if(number_of_words == -1)
- waislog(WLOG_HIGH, WLOG_ERROR, "map_over_words failed");
- db->total_word_count += number_of_words;
- the_document_table_entry->document_length += number_of_words;
- }
- s_fclose(keyword_stream);
- }
- }
-
-
- /* store out the document header here */
- the_document_table_entry->headline_id =
- write_headline_table_entry(header, db);
- if(NULL == line)
- { /* EOF */
- /* if it goes to the end of the file, then
- * set the end_character to 0 so that it is clear that
- * it goes to the end of the file.
- */
- the_document_table_entry->end_character = 0;
- }
- else /* set the end_character */
- the_document_table_entry->end_character = file_position_before_line;
-
-
- /*
- waislog("start char: %ld, end char: %ld",
- the_document_table_entry->start_character,
- the_document_table_entry->end_character);
- */
-
- if (indexingForBeta)
- { /* we need to decide which sprint node this doc will go in.
- for now we will store the sn in the date field, but that
- is temporary
- NOTE that we must subract 1 from document_id, since we want
- a 0 based number
- */
- static unsigned long* nodes = NULL; /* size/node# inited to 0 to 2047 */
- static long minPos;
- unsigned long size;
-
- if (nodes == NULL)
- { long i;
- long startPos;
- time_t temp_time;
-
- nodes = (unsigned long*)s_malloc(sizeof(unsigned long)*nodeRange*2);
- srand((int)time(&temp_time)); /* try to distribute the entries */
- startPos = rand() % nodeRange; /* for indexes with < nodeRng docs */
- for (i = 0; i < nodeRange; i++)
- { nodes[(i * 2) + 1] = (i + startPos) % nodeRange;
- nodes[i * 2] = 0;
- }
- minPos = 0;
- /*printf("init: ");
- for (i = 0; i < nodeRange; i++)
- printf("<%lu,%lu> ",nodes[i*2],nodes[(i*2)+1]);
- NL();*/
- }
-
- /* place the document in the emptiest node (at minPos) */
- the_document_table_entry->date = (time_t)nodes[(minPos * 2) + 1];
-
- /* increment the size to account for document */
- size = nodes[minPos * 2];
- size += (the_document_table_entry->end_character -
- the_document_table_entry->start_character);
- nodes[minPos * 2] = size;
-
- if ((the_document_table_entry->end_character -
- the_document_table_entry->start_character) > 100000)
- printf("big doc %lu %s\n",the_document_table_entry->end_character - the_document_table_entry->start_character,header);
-
- minPos++;
-
- /* possibly reorder it */
- if (minPos > iterations_to_reorder)
- {
- long i;
- minPos = 0;
- /*printf("before: ");
- for (i = 0; i < nodeRange; i++)
- printf("<%lu,%lu> ",nodes[i*2],nodes[(i*2)+1]);
- NL();*/
- #ifdef WIN32
- qsort((void *)nodes,(size_t)nodeRange,(size_t)(sizeof(unsigned long) * 2),nodecompare);
- #else
- qsort((char*)nodes,nodeRange,sizeof(unsigned long) * 2,nodecompare);
- #endif
- /*printf("after: ");
- for (i = 0; i < nodeRange; i++)
- printf("<%lu,%lu> ",nodes[i*2],nodes[(i*2)+1]);
- NL();*/
- printf("just sorted nodes, min: ");
- for (i = 0; i < 10; i++)
- printf("%lu ",nodes[i * 2]);
- printf(", max: %lu/%lu\n",nodes[(nodeRange * 2)-2],nodes[(nodeRange * 2)-1]);
- }
-
-
-
- #ifdef old
- sn = (document_id - 1) % 2048; /* 2048 = sn's in a full machine */
-
- /* should also take into account the "fullness" of any particular
- node */
- the_document_table_entry->date = (time_t)sn;
- /* waislog(WLOG_LOW, WLOG_INFO,
- "put %s in sprint node %ld",header,sn);*/
- #endif /* def old */
- }
-
- write_document_table_entry(the_document_table_entry, db);
- cprintf(PRINT_AS_INDEXING, ".");
- total_indexed_file_length = /* set this so the speed looks right */
- total_indexed_file_length + file_position_before_line;
- total_indexed_file_length = /* set it back */
- total_indexed_file_length - file_position_before_line;
- }
-
- #define LENGTH_OF_NEWLINE 1 /* this will be 2 on a PC, I think */
-
- /* void index_text_file(filename,
- separator_function,
- header_function,
- date_function,
- finish_header_function,
- type,
- db,
- check_for_text_file,
- check_for_file_already_indexed,
- word_position, word_pairs, minwordlen) */
-
- void index_text_file(filename, dataops, db,
- check_for_text_file,
- check_for_file_already_indexed,
- word_position, word_pairs,
- filter_process_in, filter_process_out)
- char* filename;
- dataopsrec* dataops;
- /*
- boolfunc *separator_function;
- voidfunc *header_function;
- longfunc *date_function;
- voidfunc *finish_header_function;
- char *type;
- */
- database* db;
- boolean check_for_text_file;
- boolean check_for_file_already_indexed;
- boolean word_position, word_pairs;
- FILE *filter_process_in, *filter_process_out;
- {
- /* Adds words to the index for a given file.
- * "words" are extracted as strings of alphanumeric chars, whose
- * length is >=3 but <toobig. Long lines are handled in chunks
- * so that binary files with embedded text are correctly processed.
- * The function arguments can be NULL which means it would
- * always answer NULL.
- * separator_function is called on every line to see if it
- * separates documents.
- * header_function is called on every line so that a headline
- * can be accumulated. This assumes that it will side effect global
- * variables.
- * finish_header_function is called when the document is finished
- * (by separator function responding TRUE or EOF) this will return
- * the headline string or NULL.
- * Presumably finish_header_function will use the
- * effects of header_function. finish_header_function
- * will only be called once, so it should clear whatever state
- * header_function has set.
- * if check_for_text_file then it looks to see if first character
- * in the file is a printable character.
- * if check_for_file_already_indexed then it looks through the filename
- * file to see if the file has not been indexed. If it has,
- * then it is checked to see if it is up-to-date. (it does not
- * kill the old entry (maybe it should)).
- */
-
- long filename_id;
- document_table_entry the_document_table_entry;
- long document_id = next_document_id(db);
-
- /* FILE* input_stream = s_fopen(filename, "r"); */
- FILE* input_stream;
- char *tmpFileName = NULL;
-
- long file_position_before_line = 0;
- long file_position_before_document = 0;
- long date;
- int charsleftover;
- char leftovers[MAX_LINE_LENGTH];
- char *p;
-
- #ifdef WIN32
- /* check the filename is in excluded file name list */
- if (filename != NULL) {
- /* Get the filename component from the path */
- tmpFileName = strrchr(filename, '\\');
- if (tmpFileName != NULL)
- tmpFileName++;
- else
- tmpFileName = filename;
- p = ExcludeFiles;
- while (*p) {
- if (WildcardMatch(p,tmpFileName)) {
- /* The filename is matched in the exclude list */
- waislog(WLOG_MEDIUM, WLOG_INDEX,
- "File: %s excluded from index", filename);
- return;
- }
- p += strlen(p); /* Point to end of string */
- p++; /* Point to next filename */
- }
- }
- #endif
-
- if(filename != NULL &&
- strlen(filename) > 1 &&
- !strcmp(filename+(strlen(filename)-2), ".Z"))
- /* it's a .Z file. First, remove the suffix or many things get confused. */
- filename[(strlen(filename)-2)] = 0;
-
- /* multitype extensions */
- /*
- If dataops->multitype (primary and secondary type) is defined, then
- we need to index filenames that have an extension with the
- dataops->type (primary type) and skip all other files.
-
- The only problem with this approach is that we may loose files which
- dont have an instance of the primary file type
- */
- if ( (dataops->multitype != NULL) &&
- strcmp(filename+(strlen(filename)-strlen(dataops->type)), dataops->type)) {
- waislog(WLOG_HIGH, WLOG_INFO, "Skipping file: %s",
- filename);
- return;
- }
-
-
-
- if(probe_file(filename)) {
- input_stream = s_fopen(filename, "r");
- }
- else if(probe_file_possibly_compressed(filename)) {
- tmpFileName = s_fzcat(filename);
- if (tmpFileName) {
- input_stream = s_fopen(tmpFileName, "r");
- unlink(tmpFileName);
- free(tmpFileName);
- }
- }
-
- /* end of this long one */
-
-
-
- if(NULL == input_stream){
- waislog(WLOG_HIGH, WLOG_ERROR,
- "File %s does not exist", filename);
- /* then the is not a valid file to be indexed */
- return;
- }
- if(check_for_file_already_indexed){
- time_t time;
- char full_path[MAX_FILENAME_LEN];
- truename(filename, full_path);
- if(true == filename_in_database(full_path, dataops->type, &time, db)){
- /* check that it is the same time as this file */
- if(time == file_write_date(filename)){
- waislog(WLOG_HIGH, WLOG_INDEX,
- "File %s already indexed", filename);
- s_fclose(input_stream);
- return;
- }
- }
- }
-
- /* Make the current filename accessible via global variables.
- * Increment current_filecount so routines can efficiently detect
- * changes in the current file.
- * -- Prentiss Riddle, Rice ONCS, riddle@rice.edu, 5/6/92
- */
-
- if(current_filename == NULL) current_filename = s_malloc(MAX_FILENAME_LEN+1);
-
- if (URL_prefix && !strncmp(filename, URL_trim, MIN(strlen(URL_trim), strlen(filename)))) {
- /* trim capable */
- strcpy(current_filename, URL_prefix);
- strcat(current_filename, filename+strlen(URL_trim));
- } else
- strncpy(current_filename, filename, MAX_FILENAME_LEN);
- current_filecount++;
-
- if(check_for_text_file){
- /* if we need this to be a text file, check the first character
- for a printable character */
- long ch = fgetc(input_stream);
- /* printf("First character is '%c'\n", ch); */
- if(EOF == ch || (!isprint(ch) && !isspace(ch))){
- s_fclose(input_stream);
- return;
- }
- ungetc(ch, input_stream);
- }
-
-
- /* multitype extensions */
-
- /* write out the filename */
- if ( dataops->multitype != NULL ) {
- filename_id = write_filename_table_entry(filename, dataops->multitype, db);
- }
- else {
- filename_id = write_filename_table_entry(filename, dataops->type, db);
- }
-
-
- /* (if (not *drop_table*) (make_drop_table)) maybe put in later */
-
- header_flag_1 = NULL;
- the_document_table_entry.filename_id = filename_id;
- the_document_table_entry.start_character = 0;
- the_document_table_entry.document_length = 0;
- the_document_table_entry.number_of_lines = 0;
- the_document_table_entry.date = 0;
- charsleftover = 0;
-
- while(TRUE){
- long line_length;
- boolean newline_terminated;
- char line[MAX_LINE_LENGTH];
- char header[MAX_LINE_LENGTH];
- char* read_line_result;
- boolean eof;
-
- int i, cut, charsread;
-
- /*
- * Read a block of up to MAX_LINE_LENGTH chars from a single "line"
- * and extract a whole number of words from it. Save partial words
- * lying at edge of block in the leftovers[] array for the next loop.
- * This accomodates binary files formats with embedded text, which
- * often have very long "lines". [burchard@geom.umn.edu 4/16/93]
- */
-
- /* prefill line with leftovers before reading in new data */
- beFriendly();
-
- for(i=0; i<charsleftover; i++) line[i] = leftovers[i];
- line[charsleftover] = '\0';
- read_line_result = fgets(line + charsleftover,
- MAX_LINE_LENGTH - charsleftover, input_stream);
-
- /* increment line count if we have finished reading a "line" */
- charsread = strlen(line);
- if(charsread>charsleftover && line[charsread-1]=='\n')
-
- the_document_table_entry.number_of_lines++;
-
- /* save word frag at end as leftovers (unless it fills whole block) */
- for(cut=charsread-1; cut>=0; cut--)
- if(!isascii(line[cut]) || !isalnum(line[cut])) break;
- if(++cut <= 0) charsleftover = 0;
- else {
- for(charsleftover=0, i=cut; i<charsread; charsleftover++, i++)
-
- leftovers[charsleftover] = line[i];
- charsread -= charsleftover;
- }
- line[charsread] = '\0';
-
- /* don't say EOF yet if we still have leftovers to process */
- eof = (!read_line_result && !charsread && !charsleftover);
-
-
- header[0] = '\0'; /* set it to the empty string */
-
- if(eof ||
- ((NULL != dataops->separator_function) && dataops->separator_function(line)) || (keyword_filename != NULL) ){
-
- /* tell this function that there is not more to process */
- if (keyword_filename != NULL) {
- eof = true;
- }
-
-
- /* we are processing a separator, therefore we should
- * finish off the last document, and start a new one
- */
- if(NULL != dataops->finish_header_function){
- dataops->finish_header_function(header);
-
- /* call Victor Nettoyage :-( */
- (void)cleanHeadline(header);
-
- }
- if(0 == strlen(header)){
- char full_path[1000];
- char directory[1000];
- if (!URL_prefix) {
- truename(filename, full_path);
- sprintf(header, "%s %s", pathname_name(full_path),
- pathname_directory(full_path, directory));
- } else
- strncpy(header, current_filename, MAX_FILENAME_LEN);
- }
- if(the_document_table_entry.number_of_lines > 0)
- the_document_table_entry.number_of_lines--; /* dont count separator */
- /* finish off the last */
- finish_document( dataops->extraheaderweight,
- header, line, document_id,
- &the_document_table_entry,
- eof? /* if EOF, use file length */
- file_length(input_stream):file_position_before_line,
- file_position_before_document,
- db, word_position, word_pairs,
- #ifndef SOUND
- dataops->minwordlen);
- #else
- dataops->minwordlen, dataops->indextype);
- #endif
- /* initialize the next one */
- the_document_table_entry.filename_id = filename_id;
- the_document_table_entry.start_character = file_position_before_line;
- the_document_table_entry.number_of_lines = 1; /* count separator */
- the_document_table_entry.date = 0;
- the_document_table_entry.document_length = 0;
- file_position_before_document = file_position_before_line;
-
- document_id = next_document_id(db);
-
- if(!eof)
- { /* not EOF */
- if(NULL != dataops->header_function){
- dataops->header_function(line);
- }
- if (dataops->date_function != NULL &&
- (date = dataops->date_function(line)) > 0)
- the_document_table_entry.date = date;
- /* dgg -- don't know where this goes. */
-
- if (dataops->addseparatorwords) { /* dgg */
- long number_of_words;
- number_of_words = map_over_words(line, document_id, dataops->repeat_weight,
- file_position_before_line -
- file_position_before_document,
- &line_length,
- &newline_terminated,
- db,
- add_word_before_pairs,
- word_position, word_pairs,
- #ifdef SOUND
- dataops->minwordlen,
- dataops->indextype);
- #else
- dataops->minwordlen);
- #endif /* SOUND */
- the_document_table_entry.document_length += number_of_words;
- len_of_files_since_last_delete += number_of_words;
- len_of_files_since_last_flush += number_of_words;
- }
- else {
- line_length = strlen(line);
- newline_terminated = true;
- }
- }
- else{ /* EOF */
- /* printf("closing the file\n"); */
- s_fclose(input_stream);
- return;
- }
- }
-
- else{
- /* not a separator or EOF so process the line */
- long number_of_words;
- if (dataops->date_function != NULL &&
- the_document_table_entry.date == 0 &&
- (date = dataops->date_function(line)) > 0)
- the_document_table_entry.date = date;
-
- if(NULL != dataops->header_function) dataops->header_function(line);
-
- if(index_contents ) {
- if( _indexable_section) {
- number_of_words = map_over_words(line, document_id, dataops->repeat_weight,
- file_position_before_line -
- file_position_before_document,
- &line_length,
- &newline_terminated,
- db,
- add_word_before_pairs,
- word_position, word_pairs,
- #ifdef SOUND
- dataops->minwordlen,
- dataops->indextype);
- #else
- dataops->minwordlen);
- #endif
- if(number_of_words == -1)
- waislog(WLOG_HIGH, WLOG_ERROR, "map_over_words failed");
- the_document_table_entry.document_length += number_of_words;
- len_of_files_since_last_delete += number_of_words;
- len_of_files_since_last_flush += number_of_words;
- db->total_word_count += number_of_words;
- }
- else
- newline_terminated = 0;
- }
- }
- if(newline_terminated)
- file_position_before_line += (line_length +
- LENGTH_OF_NEWLINE /* in case of crlf */
- - 1 /* fgets gets one newline */
- );
- else
- file_position_before_line = ftell(input_stream);
-
-
- /* for debugging
- if(file_position_before_line != ftell(input_stream)) {
- waislog(WLOG_LOW, WLOG_INFO, "ftell: %ld, computed ftell: %ld",
- ftell(input_stream),
- file_position_before_line);
- }
- */
-
- }
- }
-
-
-
-
- /* return TRUE if it is a directory, FALSE otherwise */
- boolean directoryp(file)
- char *file;
-
- {
- #ifdef THINK_C
- return(false);
- #else
- struct stat stbuf;
- if(stat(file, &stbuf) == -1)
- return(FALSE);
- if((stbuf.st_mode & S_IFMT) == S_IFDIR)
- return(true);
- return(FALSE);
- #endif
- }
-
- /* return true if it is a file, FALSE otherwise */
- boolean filep(file)
- char *file;
- {
- #ifdef THINK_C
- return(probe_file(file));
- #else
- struct stat stbuf;
- if(stat(file, &stbuf) == -1)
- return(FALSE);
- if(!((stbuf.st_mode & S_IFMT) == S_IFDIR))
- return(true);
- return(FALSE);
- #endif
- }
-
-
- /* recursively indexes the directory specified.
- * If it is a file, then index it.
- */
- void index_directory(file, dataops, db,
- check_for_text_file,
- check_for_file_already_indexed,
- word_position, word_pairs,
- #ifndef WIN32
- filter_process_in, filter_process_out)
- #else
- filemask, filter_process_in, filter_process_out)
- #endif
- char *file;
- dataopsrec* dataops;
- database* db;
- boolean check_for_text_file;
- boolean check_for_file_already_indexed;
- boolean word_position, word_pairs;
- #ifdef WIN32
- char *filemask;
- #endif
- FILE *filter_process_in, *filter_process_out;
- {
- #ifndef THINK_C
- #ifndef WIN32
- long i, j;
- #endif
-
- if(filep(file)){
- waislog(WLOG_MEDIUM, WLOG_INDEX,
- "Indexing file: %s", file);
- index_text_file(file, dataops, db,
- check_for_text_file,
- check_for_file_already_indexed,
- word_position, word_pairs,NULL,NULL);
- }
- else if(directoryp(file)){
- /* for each name in the directory, call ourselves back */
- #ifdef WIN32
- HANDLE hSearch;
- WIN32_FIND_DATA FindData;
- char Name[MAX_PATH+1];
-
- /* Index the files which match the mask */
- strncpy(Name,file,MAX_PATH);
- Name[MAX_PATH] = '\0';
- strncat(Name,"\\",MAX_PATH);
- Name[MAX_PATH] = '\0';
- strncat(Name,filemask,MAX_PATH);
- Name[MAX_PATH] = '\0';
- hSearch = FindFirstFile(Name,&FindData);
- if (hSearch!=INVALID_HANDLE_VALUE) {
- while (TRUE) {
- if ((FindData.dwFileAttributes & FILE_ATTRIBUTE_DIRECTORY)==0) {
- /* Index the file */
- strncpy(Name,file,MAX_PATH);
- Name[MAX_PATH] = '\0';
- strncat(Name,"\\",MAX_PATH);
- Name[MAX_PATH] = '\0';
- strncat(Name,FindData.cFileName,MAX_PATH);
- Name[MAX_PATH] = '\0';
- waislog(WLOG_MEDIUM, WLOG_INDEX,
- "Indexing file: %s", Name);
- index_text_file(Name, dataops, db,
- check_for_text_file,
- check_for_file_already_indexed,
- word_position, word_pairs,
- filter_process_in, filter_process_out);
- }
- if (!FindNextFile(hSearch,&FindData)) break;
- }
- FindClose(hSearch);
- }
- /* Now do all the subdirectories */
- strncpy(Name,file,MAX_PATH);
- Name[MAX_PATH] = '\0';
- strncat(Name,"\\*",MAX_PATH);
- Name[MAX_PATH] = '\0';
- hSearch = FindFirstFile(Name,&FindData);
- if (hSearch!=INVALID_HANDLE_VALUE) {
- while (TRUE) {
- if (strcmp(FindData.cFileName,".")!=0
- && strcmp(FindData.cFileName,"..")!=0
- && (FindData.dwFileAttributes & FILE_ATTRIBUTE_DIRECTORY)) {
- /* Index the directory */
- strncpy(Name,file,MAX_PATH);
- Name[MAX_PATH] = '\0';
- strncat(Name,"\\",MAX_PATH);
- Name[MAX_PATH] = '\0';
- strncat(Name,FindData.cFileName,MAX_PATH);
- Name[MAX_PATH] = '\0';
- index_directory(Name, dataops, db,
- check_for_text_file,
- check_for_file_already_indexed,
- word_position, word_pairs, filemask,NULL,NULL);
- }
- if (!FindNextFile(hSearch,&FindData)) break;
- }
- FindClose(hSearch);
- }
- #else
- struct dirent **list;
-
- if ((i = scandir(file, &list, NULL, NULL)) < 0) {
- return;
- }
- for(j = 0; j < i; j++) {
- char name[1000]; /* max filename size */
-
- if(strcmp(list[j]->d_name, ".") == 0
- || strcmp(list[j]->d_name, "..") == 0
- )
- continue;
-
- strcpy(name, file); /* copy the filename into the name variable */
- strcat(name, "/");
- strcat(name, list[j]->d_name);
- index_directory(name, dataops, db,
- check_for_text_file,
- check_for_file_already_indexed,
- word_position, word_pairs,NULL,NULL);
- }
- if(list != NULL) {
- for (j = 0; j < i; j++)
- if(list[j] != NULL) free((char *)list[j]);
- free((char *)list);
- }
- #endif /* def WIN32 */
- }
- #endif /* ndef THINK_C */
- }
-
-
/* cleanHeadline: tidies a headline in place and returns it.
 *
 * headline must be a writable, NUL-terminated string.  The function
 *   1. removes leading characters for which isgraph() is false,
 *   2. replaces every run of '\r' / '\n' characters with one space,
 *   3. removes trailing characters for which isgraph() is false.
 * The cleaned text always starts at the address passed in, and that same
 * address is returned, so callers may ignore the return value and keep
 * using their own pointer.
 */
char *cleanHeadline (headline)
char *headline;
{
  size_t length = strlen(headline);   /* excludes the trailing null */
  size_t lead = 0;
  size_t in, out;
  int last_was_newline = 0;

  /* delete leading spaces: find the first visible character.
   * The cast keeps isgraph() defined for bytes above 0x7f. */
  while(lead < length && !isgraph((unsigned char)headline[lead]))
    lead++;

  /* and move it.  Source and destination overlap, so memmove is required;
   * copy only up to and including the terminating null. */
  if(lead > 0){
    memmove(headline, headline + lead, length - lead + 1);
    length -= lead;
  }

  /*
  ** - replace all the \n and \r with a space, avoid putting
  **   two spaces one after the other
  */
  for(in = 0, out = 0; in < length; in++){
    if((headline[in] != '\r') && (headline[in] != '\n')){
      headline[out++] = headline[in];
      last_was_newline = 0;
    }
    else{
      /* only the first character of a CR/LF run produces a space */
      if(!last_was_newline)
        headline[out++] = ' ';
      last_was_newline = 1;
    }
  }
  headline[out] = '\0';
  length = out;

  /* delete trailing stuff */
  while(length > 0 && !isgraph((unsigned char)headline[length - 1]))
    headline[--length] = '\0';

  return(headline);
}
-
-
-