Internet Publisher's Toolbox 1.0

home *** CD-ROM | disk | FTP | other *** search

/ Internet Publisher's Toolbox 1.0 / Image.iso / toolbox / ntserver / wtsource / waisinde.c < prev next >

Wrap

C/C++ Source or Header | 1995-03-22 | 63.3 KB | 1,681 lines

/* WIDE AREA INFORMATION SERVER SOFTWARE: No guarantees or restrictions. See the readme file for the full standard disclaimer. Brewster@think.com */ /* Copyright (c) CNIDR (see ../COPYRIGHT) */ #ifndef lint static char *RCSid = "$Header: /archives/stelar/src/freeWAIS/freeWAIS-0.2/ir/RCS/waisindex.c,v 1.5 93/07/21 18:53:04 warnock Exp $"; #endif /* * Building an index with a Unix shell interface. * * -brewster 6/90 */ /* Change log: * added -stdio option from jik@athena.mit.edu * $Log: waisindex.c,v $ * Revision 1.5 93/07/21 18:53:04 warnock * Renamed from irbuild.c * Added STELAR-specific patches * * $Log: irbuild.c,v $ * Revision 1.8 1993/10/12 11:18:25 pfeifer * Added stopword file for document style bibdb * * Revision 1.1 93/07/19 16:30:22 warnock * Initial revision * * Revision 1.7 1993/09/22 16:09:13 pfeifer * What have i done ? * * Revision 1.4 93/07/01 19:40:31 warnock * Added prototype for function double * * Revision 1.6 1993/06/04 10:23:15 pfeifer * Pachtlevel BIBDB * * Revision 1.3 93/02/16 17:07:49 freewais * * Revision 1.5 1993/06/02 18:29:00 pfeifer * Added code for local formats * * Revision 1.4 1993/06/01 14:05:54 pfeifer * Added code for soundex/phonix indexing and retrieval * * Revision 1.3 1993/02/16 17:07:49 freewais * added AT&T patches for keyword list * * Revision 1.2 1993/02/16 15:32:21 freewais * added AT&T patch to write first 50 dictionary entries to * src file * * Revision 1.1 1993/02/16 15:05:35 freewais * Initial revision * * Revision 1.47 92/05/10 14:48:17 jonathan * Updated for release. * * Revision 1.46 92/05/08 10:03:17 jonathan * Adjusted memory paramters. It's closer... * * Revision 1.45 92/05/06 17:26:46 jonathan * Added switch for indexing contents, new user-specified type name, new type: * filename, which only puts the name of the file in the header. * * Revision 1.44 92/04/25 21:14:35 brewster * added ziff * * Revision 1.43 92/04/22 15:29:13 jonathan * Added jargon to usage message. 
* * Revision 1.42 92/04/01 17:08:50 jonathan * Added FTP type. * * Revision 1.41 92/03/25 18:49:39 jonathan * Added log_level and log_file arguments. * * Revision 1.40 92/03/22 18:38:14 brewster * added objective C filter * * Revision 1.39 92/03/20 11:02:44 jonathan * Added code to handle switches for word_pairs and word_position info. * * Revision 1.38 92/03/17 07:34:32 jonathan * Fixed spacing in usage message. * * Revision 1.37 92/03/10 10:42:51 morris * fixed small bug in command line argument handling. doesn't die if there * are no args. * * Revision 1.36 92/03/05 07:05:32 shen * add cm grow percent and textsize to command line and init search engine * * Revision 1.35 92/03/04 16:34:09 jonathan * Set wais_pid from getpid(). * * Revision 1.34 92/02/20 09:49:37 jonathan * Added bibtex and nhyp filters from S.P.vandeBurgt@research.ptt.nl. * * Revision 1.33 92/02/17 14:21:08 jonathan * Added switch to disable creation of catalog (-nocat). * * Revision 1.32 92/02/17 12:41:55 jonathan * Added RCSid. * * Revision 1.31 92/02/17 12:41:01 jonathan * Build catalog after completion of indexing. * * Revision 1.30 92/02/12 13:22:53 jonathan * Added "$Log" so RCS will put the log message in the header * */ /* to do: * done: make incremental indexing not index things that are already indexed * add extra arg -register that will send in description of the server to * the directory of servers. * done: create a source struct in the .src file * make it continuously index to keep itself up to date. 
* */ #include <string.h> #include <sys/types.h> #ifndef WIN32 #include <sys/param.h> #endif #include "irdirent.h" #include "cutil.h" #include "futil.h" #include "irfiles.h" #include "irtfiles.h" #include "panic.h" #include "ircfiles.h" #include "version.h" #include "irext.h" #include "stoplist.h" /* dgg */ #ifdef WIN32 #include <windows.h> #include <io.h> #include <fcntl.h> #define MAXPATHLEN 260 int read_src_structure(char*,char**); int retreive_keywords(database*); #endif #ifdef BIO #define INDEXER_DATE "2/16/93" #else #define INDEXER_DATE "2/16/93" #endif #define MAX_LINE_LENGTH 1000 extern char *keyword[50], *descript[1000]; extern short nKeys, nDesLines; extern double compare(); /* for reporting errors, in WAIStation it is defined in CRetrievalApp.c */ extern boolean indexingForBeta; void usage(command) char *command; { /* no args */ fprintf(stderr,"Usage: %s [-d index_filename]\n", command); fprintf(stderr," [-a] /* adding to an existing index, otherwise it erases the index */\n"); fprintf(stderr," [-r] /* recursively index subdirectories */\n"); fprintf(stderr," [-mem mbytes] /* number of megabytes to run this in */\n"); fprintf(stderr," [-register] /* registers the database with the directory of servers.\n"); fprintf(stderr," This should be done with care. */\n"); fprintf(stderr," [-export] /* uses short dbname and port 210 */\n"); fprintf(stderr," [-e [file]] /* set log output to file, or /dev/null if not specified */\n"); fprintf(stderr," [-l log_level] /* set log level. 
0 means log nothing,\n"); fprintf(stderr," 10 [the default] means log everything */\n"); fprintf(stderr," [-v] /* print the version of the software */\n"); fprintf(stderr," [-filter process] /* use an external document parser */\n"); fprintf(stderr," [-stdin] /* read file names from stdin */\n"); fprintf(stderr," [-pos | -nopos] /* include (don't include - default) word position information /*\n"); fprintf(stderr," [-nopairs | -pairs] /* don't include (or include - default) word pairs /*\n"); fprintf(stderr," [-nocat] /* inhibit creation of catalog /*\n"); fprintf(stderr," [-contents] /* Index the contents: this is good for types that\n"); fprintf(stderr," inhibit the indexing of the contents (like gif). /*\n"); fprintf(stderr," [-nocontents] /* Index only the filename, not the contents /*\n"); #ifdef BIO fprintf(stderr," [-stop stoplist_filename] /* file of common words to ignore */\n"); fprintf(stderr," [-delim delimiters] /* list of word delimiter symbols */\n"); #endif fprintf(stderr," [-keywords \"<string>\"] /* Keywords to index for each document. */\n"); fprintf(stderr," [-keyword_file <filename>] /* File of keywords to index. */\n"); fprintf(stderr," [-cmmem mem%] /* percent of CM memory (CM code only) */\n"); fprintf(stderr," [-T type] /* type becomes the \"TYPE\" of the document. */\n"); /* multitype extensions */ fprintf(stderr," [-M type,type] /* for multi-type documents. */\n"); #ifdef WIN32 fprintf(stderr," [-x filename,filename] /* ignore the filename(s). */\n"); #endif fprintf(stderr," [-t /* format of the file. 
if none then each file is a document */\n"); fprintf(stderr," text /* simple text files, this is the default */\n"); fprintf(stderr," | bibtex /* BibTeX / LaTeX format */\n"); fprintf(stderr," | bio /* biology abstract format */\n"); fprintf(stderr," | cmapp /* CM applications from Hypercard */\n"); fprintf(stderr," | dash /* entries separated by a row of dashes */\n"); fprintf(stderr," | dvi /* dvi format */\n"); fprintf(stderr," | emacsinfo /* the GNU documentation system */\n"); fprintf(stderr," | first_line /* first line of file is headline */\n"); fprintf(stderr," | filename /* uses only the filename part of the pathname for the title */\n"); fprintf(stderr," | ftp /* special type for FTP files. First line of file is headline */\n"); fprintf(stderr," | gif /* gif files, only indexes the filename */\n"); fprintf(stderr," | irg /* internet resource guide */\n"); fprintf(stderr," | jargon /* Jargon File 2.9.8 format*/\n"); fprintf(stderr," | listserv_digest /* standard internet mail digest format */\n"); fprintf(stderr," | mail_digest /* standard internet mail digest format */\n"); fprintf(stderr," | mail_or_rmail /* mail or rmail or both */\n"); fprintf(stderr," | medline /* medline format */\n"); fprintf(stderr," | mh_bboard /* MH bulletin board format */\n"); #ifdef WIN32 fprintf(stderr," | ms_kbase /* MS Knowledge Base format */\n"); #endif fprintf(stderr," | netnews /* netnews format */\n"); fprintf(stderr," | nhyp /* ?:? 
hyper text format, Polytechnic of Central London */\n"); fprintf(stderr," | one_line /* each line is a document */\n"); fprintf(stderr," | para /* paragraphs separated by blank lines */\n"); fprintf(stderr," | pict /* pict files, only indexes the filename */\n"); fprintf(stderr," | ps /* postscript format */\n"); fprintf(stderr," | refer /* refer format */\n"); #ifdef BIBDB fprintf(stderr," | irlist /* irlist mail or rmail or both */\n"); fprintf(stderr," | formfeed /* entries separated by a formfeed */\n"); fprintf(stderr," | bibdb /* steve file entries separated by a formfeed */\n"); fprintf(stderr," | bibinf /* bibinf entries separated by an empty line */\n"); #endif fprintf(stderr," | rn /* netnews saved by the [rt]?rn newsreader */\n"); fprintf(stderr," | server /* server structures for the dir of servers */\n"); #ifdef NeXT fprintf(stderr," | objc /* objective-C .h and .m files */\n"); #endif /* def NeXT */ fprintf(stderr," | tiff /* tiff files, only indexes the filename */\n"); fprintf(stderr," | URL what-to-trim what-to-add /* URL */\n"); fprintf(stderr," | object /* a structured object*/\n"); fprintf(stderr," | inriadoc /* INRIA library catalog */\n"); fprintf(stderr," | paradoc /* INRIA library catalog para-mode */\n "); fprintf(stderr," | fortran /* Fortran files,needs also -filter */\n"); fprintf(stderr," | mime /* Like mail */\n"); #ifdef BIO fprintf(stderr," | genbank /* GenBank flatfile format */\n"); fprintf(stderr," | embl /* EMBL flatfile format */\n"); fprintf(stderr," | pir /* PIR flatfile format */\n"); fprintf(stderr," | prositedoc /* Prosite protein doc format */\n"); fprintf(stderr," | prositedat /* Prosite protein dat format */\n"); fprintf(stderr," | biojournal /* Bio journal TOC on bionet.journals */\n"); fprintf(stderr," | redbook /* Drosophila redbook text */\n"); fprintf(stderr," | flybase /* Drosophila Ashburner data files */\n"); fprintf(stderr," | flystock /* Drosophila stock lists */\n"); fprintf(stderr," | din /* Drosophila Info. 
Newsletter */\n"); #endif #ifdef SOUND fprintf(stderr," | oneline_phonix /* Phonebooks PHONIX */\n"); fprintf(stderr," | oneline_soundex /* Phonebooks SOUNDEX */\n"); #endif #ifdef AAS fprintf(stderr," | AAS_abstract /* AAS meeting abstracts using AAS LaTeX macros */\n"); #endif /* AAS */ #ifdef STELAR fprintf(stderr," | stelar /* stelar abstracts - third line is hl */\n"); #endif /* STELAR */ #ifdef HTML fprintf(stderr," | html /* Hypertext Markup Language from WWW */\n"); #endif /* HTML */ fprintf(stderr," ] filename filename ...\n"); } /* char *log_file_name = NULL; */ FILE *logfile; extern char* keywords; /* used in irtfiles.c */ extern char* keyword_filename; /* used in irtfiles.c */ #ifdef WIN32 extern char ExcludeFiles[]; /* used in irtfiles.c */ #endif extern boolean index_contents; /* This is the MAIN for building an index. */ void main(argc, argv) int argc; char *argv[]; { database* db = NULL; long argc_copy = argc; char **argv_copy = argv; char *next_argument; char index_filename[1000]; boolean adding_to_existing_index = false; boolean traverse_directory = false; boolean word_positions = false; boolean word_pairs = true; long memory_to_use = -1; long cm_mem_percent = 0; /* default */ long grow_percent = 0; /* default */ long text_size = 0; /* default */ boolean check_for_text_file = false; boolean register_database = false; boolean export_database = false; boolean read_files_from_stdin = false; boolean make_catalog = true; char data_filename[MAXPATHLEN]; char *typename = NULL; /* this is what the user said */ long start_of_filenames; long hashtable_size = 1L<<16; long flush_after_n_words = 300000; char *command_name; char *filter_name = NULL; FILE *filter_process_in = NULL; FILE *filter_process_out = NULL; #ifdef WIN32 PROCESS_INFORMATION piProcInfo; #endif dataopsrec dataops; /*------------- these go into dataops boolean (*separator_function)(); void (*header_function)(); void (*finish_header_function)(); long (*date_function)(); char *type = NULL; int 
minwordlen= 2; ---------------*/ /* dgg -- put all of these separate, datatype-specific functions & params into a record! */ gDelimiters[0]= '\0'; /* <-- bombs ?? */ dataops.separator_function= NULL; dataops.header_function= NULL; dataops.date_function= NULL; dataops.finish_header_function= NULL; dataops.type= "TEXT"; dataops.indextype= NULL; dataops.multitype=NULL; dataops.addseparatorwords= false; dataops.extraheaderweight= true; dataops.repeat_weight= 1; dataops.minwordlen= 2; dataops.wordDelimiter= wordbreak_notalnum; dataops.delimiters= gDelimiters; wordDelimiter= wordbreak_notalnum; /*------ separator_function = NULL; header_function = NULL; date_function = NULL; finish_header_function = NULL; type = "TEXT"; -------*/ typename = "Text"; next_argument = next_arg(&argc, &argv); command_name = next_argument; logfile = stderr; #ifdef WIN32 wais_pid = GetCurrentProcessId(); #else wais_pid = getpid(); #endif if(0 == argc) { usage(command_name); exit(0); } #ifdef THINK_C strcpy(index_filename, "wais:System Folder:wais-index:index"); #else strcpy(index_filename, "index"); /* in the current directory */ #endif /* THINK_C */ stop_list_file("\0"); /* dgg */ if(NULL == (next_argument = next_arg(&argc, &argv))){ fprintf(stderr,"No arguments specified\n"); exit(0); } while((next_argument != NULL) && '-' == next_argument[0]){ /* then we have an argument to process */ if((0 == strcmp("-i", next_argument)) || /* -i is for backcompatibility */ (0 == strcmp("-d", next_argument))){ if(NULL == (next_argument = next_arg(&argc, &argv))){ fprintf(stderr,"Expected filename for the index\n"); exit(0); } strcpy(index_filename, next_argument); } #ifdef BIO else if (0 == strcmp("-stop", next_argument)){ /* dgg, stoplist file */ if (NULL == (next_argument = next_arg(&argc, &argv))){ fprintf(stderr,"Expected filename for the stoplist\n"); exit(0); } stop_list_file(next_argument); } else if (0 == strcmp("-delim", next_argument)){ /* dgg, delimiters */ if (NULL == (next_argument = 
next_arg(&argc, &argv))){ fprintf(stderr,"Expected the delimiters argument\n"); exit(0); } strcpy(gDelimiters, next_argument); dataops.wordDelimiter = wordbreak_user; wordDelimiter = wordbreak_user; printf("Delimiters used in index: %s\n\n",gDelimiters); } #endif else if(0 == strcmp("-a", next_argument)){ adding_to_existing_index = true; } else if(0 == strcmp("-r", next_argument)){ traverse_directory = true; } else if(0 == strcmp("-register", next_argument)){ register_database = true; } else if(0 == strcmp("-export", next_argument)){ export_database = true; } else if(0 == strcmp("-v", next_argument)){ fprintf(stderr,"%s: %s %s\n", command_name, VERSION, INDEXER_DATE); #ifdef WIN32 fprintf(stderr,"%s\n",VERWIN32); if (argc_copy == 2) exit(0); #endif } else if (0 == strcmp("-stdin", next_argument)) { read_files_from_stdin = true; } else if (0 == strcmp("-nopos", next_argument)) { word_positions = false; } else if (0 == strcmp("-pos", next_argument)) { word_positions = true; } else if (0 == strcmp("-nopairs", next_argument)) { word_pairs = false; } else if (0 == strcmp("-pairs", next_argument)) { word_pairs = true; } else if (0 == strcmp("-nocat", next_argument)) { make_catalog = false; } else if(0 == strcmp("-mem", next_argument)){ if(NULL == (next_argument = next_arg(&argc, &argv))) panic("Expected a number for the amount of memory to use"); memory_to_use = atol(next_argument); if(memory_to_use < 1) panic("The -mem argument should not be less than 1"); if(memory_to_use > 200) fprintf(stderr,"Warning: The -mem parameter was %ld Mbytes. 
That is a large number of mega bytes in current machines\n", memory_to_use); } else if(0 == strcmp("-cmmem", next_argument)){ if(NULL == (next_argument = next_arg(&argc, &argv))) panic("Expected a number (1-100) for percentage of memory to use"); cm_mem_percent = atol(next_argument); if(cm_mem_percent < 1) panic("The -cmmem argument should not be less than 1 and less than 100"); if(cm_mem_percent > 100) panic("Warning: The -cmmem parameter was %ld%%. It should be between 1-100.", cm_mem_percent); } else if(0 == strcmp("-filter", next_argument)){ if(NULL == (next_argument = next_arg(&argc, &argv))) panic("Expected the name of a program to use to find keywords"); filter_name=next_argument; } else if(0 == strcmp("-grow", next_argument)){ if(NULL == (next_argument = next_arg(&argc, &argv))) panic("Expected a number (1-100) for database growing percentage"); grow_percent = atol(next_argument); if(grow_percent < 1) panic("The -grow argument should not be less than 1"); } else if(0 == strcmp("-textsize", next_argument)){ if(NULL == (next_argument = next_arg(&argc, &argv))) panic("Expected a number for text size in megabytes"); text_size = atol(next_argument); if(text_size < 1) panic("The -textsize argument should not be less than 1"); } else if (0 == strcmp("-e", next_argument)) { char *peek_argument = peek_arg(&argc, &argv); #ifdef WIN32 log_file_name = "NUL:"; /* default to NUL: */ #else log_file_name = "/dev/null"; /* default to /dev/null */ #endif if ((peek_argument != NULL) && ('-' != peek_argument[0])) { log_file_name = next_arg(&argc, &argv); } /* end if (explicit log file) */ } /* end if (-e) */ else if (0 == strcmp("-l", next_argument)) { #ifdef WIN32 char *pNextArg = next_arg(&argc, &argv); if (pNextArg!=NULL) { wais_log_level = atol(pNextArg); } else { panic("Expected a log-level argument"); } #else wais_log_level = atol(next_arg(&argc, &argv)); #endif } /* end if (-l) */ else if(0 == strcmp("-cm", next_argument)){ /* this is an undocumented argument to help 
use this to front end the CM application */ indexingForBeta = true; } else if(0 == strcmp("-T", next_argument)){ /* This is a specification for a "Special" type. The next argument is the type name. This will not index the body of the file. */ if(NULL == (next_argument = next_arg(&argc, &argv))) panic("Expected a file type"); typename = next_argument; dataops.type = next_argument; fprintf(stderr,"waisindex: setting type to %s\n", next_argument); dataops.finish_header_function = filename_finish_header_function; } /* multitype extensions */ /* This is a specification for a multi-type document, the types should be entered as a comma delimited list. Note that this only defines all the types available in the database, you also need to specify a -t option so that the indexer knows how to parse the files. One of the limitations here is that each document must be a file with the extension of the file being the document type, so the document #### has a text file ####.TEXT and a jfif file ####.JFIF, not real nice but needed. Note that this contains both the primary and secondary document types, whereas dataops.type contains the primary type. 
*/ else if(0 == strcmp("-M", next_argument)){ if(NULL == (next_argument = next_arg(&argc, &argv))) panic("Expected a multitype list"); dataops.multitype = next_argument; } else if(0 == strcmp("-x", next_argument)){ if(NULL == (next_argument = next_arg(&argc, &argv))) panic("Expected an excluded filename list"); strncpy(ExcludeFiles,next_argument,EXCLUDEFILENAMESLEN-2); ExcludeFiles[strlen(ExcludeFiles)+2] = '\0'; /* Ensure double-null terminated */ strtok(ExcludeFiles,","); while (strtok(NULL,",")!=NULL) /* Cycle until all tokens found */ ; } else if(0 == strcmp("-keywords", next_argument)){ if(NULL == (next_argument = next_arg(&argc, &argv))) panic("Expected -keywords argument string"); keywords = next_argument; } else if(0 == strcmp("-keyword_file", next_argument)){ if(NULL == (next_argument = next_arg(&argc, &argv))) panic("Expected -keyword_file filename"); keyword_filename = next_argument; } else if(0 == strcmp("-contents", next_argument)){ index_contents = true; } else if(0 == strcmp("-nocontents", next_argument)){ index_contents = false; } else if(0 == strcmp("-t", next_argument)){ /* then we have a specialized file */ index_contents = true; if(NULL == (next_argument = next_arg(&argc, &argv))) panic("Expected a file type"); if(0 == strcmp("groliers", next_argument)){ typename = next_argument; dataops.type ="TEXT"; dataops.separator_function = groliers_separator_function; dataops.header_function = groliers_header_function; dataops.finish_header_function = groliers_finish_header_function; } #ifdef BIO else if(0 == strcmp("genbank", next_argument)){/* dgg */ typename = next_argument; dataops.type ="TEXT"; dataops.separator_function = genbank_separator_function; dataops.header_function = genbank_header_function; dataops.finish_header_function = genbank_finish_header_function; dataops.date_function = genbank_date_function; dataops.repeat_weight= 0; dataops.addseparatorwords= true; dataops.extraheaderweight= false; dataops.minwordlen= 2; } else if(0 == 
strcmp("embl", next_argument)){/* dgg */ typename = next_argument; dataops.type ="TEXT"; dataops.separator_function = embl_separator_function; dataops.header_function = embl_header_function; dataops.finish_header_function = embl_finish_header_function; dataops.date_function = embl_date_function; dataops.repeat_weight= 0; dataops.addseparatorwords= true; dataops.extraheaderweight= false; } else if(0 == strcmp("pir", next_argument)){/* dgg */ typename = next_argument; dataops.type = "TEXT"; dataops.separator_function = pir_separator_function; dataops.header_function = pir_header_function; dataops.finish_header_function = pir_finish_header_function; dataops.date_function = pir_date_function; dataops.repeat_weight= 0; dataops.addseparatorwords= true; dataops.extraheaderweight= false; } else if(0 == strcmp("prositedoc", next_argument)){ /* dgg */ typename = next_argument; dataops.type = "TEXT"; dataops.separator_function = prositedoc_separator_function; dataops.header_function = prositedoc_header_function; dataops.finish_header_function = prositedoc_finish_header_function; dataops.repeat_weight= 0; dataops.addseparatorwords= true; dataops.extraheaderweight= false; } else if(0 == strcmp("prositedat", next_argument)){ /* dgg */ typename = next_argument; dataops.type = "TEXT"; dataops.separator_function = prositedat_separator_function; dataops.header_function = prositedat_header_function; dataops.finish_header_function = prositedat_finish_header_function; dataops.repeat_weight= 0; dataops.addseparatorwords= true; dataops.extraheaderweight= false; } else if(0 == strcmp("biojournal", next_argument)){ /* dgg */ typename = next_argument; dataops.type = "TEXT"; dataops.separator_function = biojournal_separator_function; dataops.header_function = biojournal_header_function; dataops.finish_header_function = biojournal_finish_header_function; dataops.repeat_weight= 0; dataops.addseparatorwords= true; dataops.extraheaderweight= false; } else if(0 == strcmp("redbook", 
next_argument)){ /* dgg */ typename = next_argument; dataops.type = "TEXT"; dataops.separator_function = redbook_separator_function; dataops.header_function = redbook_header_function; dataops.finish_header_function = redbook_finish_header_function; dataops.repeat_weight= 0; dataops.addseparatorwords= true; dataops.extraheaderweight= false; dataops.wordDelimiter= wordbreak_user; /* redbook_delimiter; */ wordDelimiter= wordbreak_user; /* wordbreak_notgraph; */ dataops.minwordlen= 1; if (gDelimiters[0] == '\0') strcpy( gDelimiters, "/{}()[]%-:#.~*\";,|"); } else if(0 == strcmp("flybase", next_argument)){ /* dgg */ typename = next_argument; dataops.type = "TEXT"; dataops.separator_function = flybase_separator_function; dataops.header_function = flybase_header_function; dataops.finish_header_function = flybase_finish_header_function; dataops.repeat_weight= 0; dataops.addseparatorwords= true; dataops.extraheaderweight= false; dataops.wordDelimiter= wordbreak_user; /* flybase_delimiter; */ wordDelimiter= wordbreak_user; /* wordbreak_notgraph; */ dataops.minwordlen= 1; if (gDelimiters[0] == '\0') strcpy( gDelimiters, "-/{}:.~*\";,|"); /* flybase symbols valid data ()$+-?;.\' possible data and delimiter |;[]-?.~ delimiters solution to confusion: set possible delimiters as delimiters, and permit literal searches with "..." or '...' enclosed strings. 
*/ } else if(0 == strcmp("flystock", next_argument)){ /* dgg */ typename = next_argument; dataops.type = "TEXT"; dataops.separator_function = bio_separator_function; dataops.header_function = bio_header_function; dataops.finish_header_function = bio_finish_header_function; dataops.repeat_weight= 0; dataops.addseparatorwords= true; dataops.extraheaderweight= false; dataops.wordDelimiter= wordbreak_user; /* flybase_delimiter; */ wordDelimiter= wordbreak_user; /* wordbreak_notgraph; */ dataops.minwordlen= 1; if (gDelimiters[0] == '\0') strcpy( gDelimiters, "-/{}:.~*\";,|"); /* flystock symbols valid data []()/-;?+.{} possible data and delimiter =;. ;. in text field is del, in data field is data delimiters *";, more delimiters (from matthewk) - / {} : solution to confusion: set possible delimiters as delimiters, and permit literal searches with "..." or '...' enclosed strings. ! want some way to provide field names (report "stylesheet") with searched/fetched records for flybase, flystock, other data files ! want "keyword [field]" limited searches for some of this to make sense ! 
*/ } else if(0 == strcmp("din", next_argument)){ typename = next_argument; dataops.type = "TEXT"; dataops.separator_function = din_separator_function; dataops.header_function = din_header_function; dataops.finish_header_function = din_finish_header_function; } #endif #ifdef NeXT else if(0 == strcmp("objc", next_argument)){ typename = next_argument; dataops.type = "TEXT"; dataops.separator_function = wobjc_separator_function; dataops.header_function = wobjc_header_function; dataops.finish_header_function = wobjc_finish_header_function; } #endif /* def NeXT */ else if(0 == strcmp("listserv_digest", next_argument)){ typename = next_argument; dataops.type = "TEXT"; dataops.separator_function = listserv_digest_separator_function; dataops.header_function = listserv_header_function; dataops.date_function = listserv_date_function; dataops.finish_header_function = listserv_finish_header_function; } #ifdef AAS else if(0 == strcmp("AAS_abstract", next_argument)){ typename = next_argument; dataops.separator_function = aasab_separator_function; dataops.header_function = aasab_header_function; dataops.finish_header_function = aasab_finish_header_function; } #endif /* AAS */ #ifdef STELAR else if(0==strcmp("stelar",next_argument)){ dataops.type="TEXT"; typename=next_argument; dataops.separator_function=stelar_separator_function; dataops.header_function=stelar_header_function; dataops.finish_header_function=stelar_finish_header_function; } #endif /* STELAR */ else if(0 == strcmp("mail", next_argument)){ typename = next_argument; dataops.type = "TEXT"; dataops.separator_function = mail_separator_function; dataops.header_function = mail_header_function; dataops.date_function = mail_date_function; dataops.finish_header_function = mail_finish_header_function; } else if(0 == strcmp("mail_or_rmail", next_argument)){ typename = next_argument; dataops.type = "TEXT"; dataops.separator_function = mail_or_rmail_separator; dataops.header_function = mail_header_function; dataops.date_function 
= mail_date_function; dataops.finish_header_function = mail_finish_header_function; } else if(0 == strcmp("mail_digest", next_argument)){ typename = next_argument; dataops.type = "TEXT"; dataops.separator_function = mail_digest_separator_function; dataops.header_function = mail_header_function; dataops.date_function = mail_date_function; dataops.finish_header_function = mail_finish_header_function; } else if(0 == strcmp("mh_bboard", next_argument)){ typename = next_argument; dataops.type = "TEXT"; dataops.separator_function = mh_bboard_separator_function; dataops.header_function = mail_header_function; dataops.date_function = mail_date_function; dataops.finish_header_function = mail_finish_header_function; } #ifdef WIN32 else if(0 == strcmp("ms_kbase", next_argument)){ typename = next_argument; dataops.type = "TEXT"; dataops.separator_function = NULL; dataops.header_function = mskbase_header_function; dataops.date_function = mskbase_date_function; dataops.finish_header_function = mskbase_finish_header_function; } #endif else if(0 == strcmp("rmail", next_argument)){ typename = next_argument; dataops.type = "TEXT"; dataops.separator_function = rmail_separator_function; dataops.header_function = mail_header_function; dataops.date_function = mail_date_function; dataops.finish_header_function = mail_finish_header_function; } else if(0 == strcmp("netnews", next_argument)){ typename = next_argument; dataops.type = "TEXT"; dataops.separator_function = NULL; dataops.header_function = mail_header_function; dataops.date_function = mail_date_function; dataops.finish_header_function = mail_finish_header_function; } else if(0 == strcmp("rn", next_argument)){ typename = next_argument; dataops.type = "TEXT"; dataops.separator_function = rn_separator_function; dataops.header_function = mail_header_function; dataops.date_function = mail_date_function; dataops.finish_header_function = mail_finish_header_function; } #ifdef BIBDB else if(0 == strcmp("irlist", next_argument)){ typename 
= next_argument; dataops.type = "TEXT"; dataops.separator_function = irlist_separator_function; dataops.header_function = irlist_header_function; dataops.date_function = irlist_date_function; dataops.finish_header_function = mail_finish_header_function; } /* formfeed-separated items , Intro to Algorithms buglist, etc */ else if(0 == strcmp("formfeed", next_argument)){ typename = next_argument; if (!dataops.type || (strlen(dataops.type)==0)) { if (dataops.type) fprintf(stderr, "irbuild: overwriting type %s\n", dataops.type); dataops.type = "TEXT"; } else { fprintf(stderr, "irbuild: using type %s\n", dataops.type); } dataops.separator_function = formfeed_separator_function; dataops.header_function = dash_header_function; dataops.finish_header_function = dash_finish_header_function; } /* formfeed-separated items , steve files */ else if(0 == strcmp("bibdb", next_argument)){ typename = next_argument; if (!dataops.type || (strlen(dataops.type)==0)) { if (dataops.type) fprintf(stderr, "irbuild: overwriting type %s\n", dataops.type); dataops.type = "TEXT"; } else { fprintf(stderr, "irbuild: using type %s\n", dataops.type); stop_list_file("bibdb.stop"); } dataops.separator_function = bibdb_separator_function; dataops.header_function = bibdb_header_function; dataops.date_function = bibdb_date_function; dataops.finish_header_function = bibdb_finish_header_function; } /* formfeed-separated items, bibinbf */ else if(0 == strcmp("bibinf", next_argument)){ typename = next_argument; dataops.type = "TEXT"; dataops.separator_function = bibinf_separator_function; dataops.header_function = bibinf_header_function; #ifdef SIMPLE_BIBINF dataops.date_function = bibinf_date_function; #endif dataops.finish_header_function = bibinf_finish_header_function; } #endif else if(0 == strcmp("emacsinfo", next_argument)){ typename = next_argument; dataops.type = "TEXT"; dataops.separator_function = emacs_info_separator_function; dataops.header_function = emacs_info_header_function; 
dataops.finish_header_function = emacs_info_finish_header_function; } else if(0 == strcmp("catalog", next_argument)){ typename = next_argument; dataops.type = "TEXT"; dataops.separator_function = catalog_separator_function; dataops.header_function = catalog_header_function; dataops.finish_header_function = catalog_finish_header_function; } else if(0 == strcmp("bio", next_argument)){ typename = next_argument; dataops.type = "TEXT"; dataops.separator_function = bio_separator_function; dataops.header_function = bio_header_function; dataops.finish_header_function = bio_finish_header_function; } else if(0 == strcmp("cmapp", next_argument)){ typename = next_argument; dataops.type = "TEXT"; dataops.separator_function = cmapp_separator_function; dataops.header_function = cmapp_header_function; dataops.finish_header_function = cmapp_finish_header_function; } else if(0 == strcmp("ftp", next_argument)){ dataops.type = "TEXT-FTP"; typename = next_argument; dataops.separator_function = first_line_separator_function; dataops.header_function = first_line_header_function; dataops.finish_header_function = first_line_finish_header_function; } else if(0 == strcmp("jargon", next_argument)){ typename = next_argument; dataops.type = "TEXT"; dataops.separator_function = jargon_separator_function; dataops.header_function = jargon_header_function; dataops.finish_header_function = jargon_finish_header_function; } else if(0 == strcmp("server", next_argument)){ typename = next_argument; dataops.type = "WSRC"; dataops.finish_header_function = filename_finish_header_function; } else if(0 == strcmp("text", next_argument)){ dataops.type = "TEXT"; typename = next_argument; check_for_text_file = true; } else if(0 == strcmp("filename", next_argument)){ dataops.type = "TEXT"; typename = next_argument; dataops.finish_header_function = filename_finish_header_function; } #if 0 /* html format */ else if(0 == strcmp("html", next_argument)){ dataops.type = "HTML"; typename = next_argument; 
dataops.separator_function = html_separator_function; dataops.header_function = html_header_function; dataops.finish_header_function = html_finish_header_function; } #endif else if(0 == strcmp("irg", next_argument)){ typename = next_argument; dataops.type = "TEXT"; dataops.separator_function = irg_separator_function; dataops.header_function = irg_header_function; dataops.finish_header_function = irg_finish_header_function; } /* dash-separated items , Intro to Algorithms buglist, etc */ else if(0 == strcmp("dash", next_argument)){ dataops.type = "TEXT"; typename = next_argument; dataops.separator_function = dash_separator_function; dataops.header_function = dash_header_function; dataops.finish_header_function = dash_finish_header_function; } /* one_line-separated items */ else if(0 == strcmp("one_line", next_argument)){ dataops.type = "TEXT"; typename = next_argument; dataops.separator_function = one_line_separator_function; dataops.header_function = one_line_header_function; dataops.finish_header_function = one_line_finish_header_function; } /* blank line-separated items (paragraphs) */ else if(0 == strcmp("para", next_argument)){ dataops.type = "TEXT"; typename = next_argument; dataops.separator_function = para_separator_function; dataops.header_function = para_header_function; dataops.finish_header_function = para_finish_header_function; } /* seeker items */ else if(0 == strcmp("seeker", next_argument)){ dataops.type = "TEXT"; typename = next_argument; dataops.separator_function = seeker_separator_function; dataops.header_function = seeker_header_function; dataops.finish_header_function = seeker_finish_header_function; } /* medline format */ else if(0 == strcmp("medline", next_argument)){ dataops.type = "TEXT"; typename = next_argument; dataops.separator_function = medline_separator_function; dataops.header_function = medline_header_function; dataops.finish_header_function = medline_finish_header_function; } /* refer format */ else if(0 == strcmp("refer", 
next_argument)){ dataops.type = "TEXT"; typename = next_argument; dataops.separator_function = refer_separator_function; dataops.header_function = refer_header_function; dataops.finish_header_function = refer_finish_header_function; } /* first_line format */ else if(0 == strcmp("first_line", next_argument)){ dataops.type = "TEXT"; typename = next_argument; dataops.separator_function = first_line_separator_function; dataops.header_function = first_line_header_function; dataops.finish_header_function = first_line_finish_header_function; } /* rlin items */ else if(0 == strcmp("rlin", next_argument)){ dataops.type = "TEXT"; typename = next_argument; dataops.separator_function = rlin_separator_function; dataops.header_function = rlin_header_function; dataops.finish_header_function = rlin_finish_header_function; } else if(0 == strcmp("dvi", next_argument)){ typename = next_argument; dataops.type = "DVI"; dataops.finish_header_function = filename_finish_header_function; } else if(0 == strcmp("ps", next_argument)){ typename = next_argument; dataops.type = "PS"; dataops.finish_header_function = filename_finish_header_function; } else if(0 == strcmp("pict", next_argument)){ typename = next_argument; dataops.type = "PICT"; dataops.finish_header_function = filename_finish_header_function; index_contents = false; } else if(0 == strcmp("gif", next_argument)){ typename = next_argument; dataops.type = "GIF"; dataops.finish_header_function = filename_finish_header_function; index_contents = false; } else if(0 == strcmp("tiff", next_argument)){ typename = next_argument; dataops.type = "TIFF"; dataops.finish_header_function = filename_finish_header_function; index_contents = false; } else if(0== strcmp("object", next_argument)) { dataops.type = "OBJECT"; typename = next_argument; } else if(0 == strcmp("inriadoc", next_argument)){ typename = next_argument; dataops.type = "TEXT"; dataops.separator_function = NULL; dataops.header_function = inriadoc_header_function; 
dataops.date_function = NULL; dataops.finish_header_function = inriadoc_finish_header_function; } else if(0 == strcmp("fortran", next_argument)){ typename = next_argument; dataops.type = "FORTRAN"; } else if(0 == strcmp("paradoc", next_argument)){ typename = next_argument; dataops.type = "TEXT"; dataops.separator_function = para_separator_function; dataops.header_function = inriadoc_header_function; dataops.date_function = NULL; dataops.finish_header_function = inriadoc_finish_header_function; } else if(0 == strcmp("mime", next_argument)){ typename = next_argument; dataops.type = "MIME"; dataops.separator_function = mail_separator_function; dataops.header_function = mail_header_function; dataops.date_function = mail_date_function; dataops.finish_header_function = mail_finish_header_function; } /* BibTeX items */ else if(0 == strcmp("bibtex", next_argument)){ dataops.type = "TEXT"; typename = next_argument; dataops.separator_function = bibtex_separator_function; dataops.header_function = bibtex_header_function; dataops.finish_header_function = bibtex_finish_header_function; } /* ?:? 
seperated hypertext items */ else if(0 == strcmp("nhyp", next_argument)){ dataops.type = "TEXT"; typename = next_argument; dataops.separator_function = nhyp_separator_function; dataops.header_function = nhyp_header_function; dataops.finish_header_function = nhyp_finish_header_function; } /* Uniform Resource Locators - from Nat Torkington */ else if(0 == strcmp("URL", next_argument)) { dataops.type = "URL"; typename = next_argument; URL_trim = s_strdup(next_arg(&argc, &argv)); URL_prefix = s_strdup(next_arg(&argc, &argv)); } else if(0 == strcmp("ziff", next_argument)){ dataops.type = "TEXT"; typename = next_argument; dataops.separator_function = ziff_separator_function; dataops.header_function = ziff_header_function; dataops.finish_header_function = ziff_finish_header_function; } #ifdef SOUND else if(0 == strcmp("oneline_soundex", next_argument)){ dataops.indextype = "SOUNDEX"; dataops.type = "TEXT"; typename = next_argument; dataops.separator_function = one_line_separator_function; dataops.header_function = one_line_header_function; dataops.finish_header_function = one_line_finish_header_function; } else if(0 == strcmp("oneline_phonix", next_argument)){ dataops.indextype = "PHONIX"; dataops.type = "TEXT"; typename = next_argument; dataops.separator_function = one_line_separator_function; dataops.header_function = one_line_header_function; dataops.finish_header_function = one_line_finish_header_function; } #endif #ifdef HTML /* HyperText Markup Lanugage (from World Wide Web) */ else if(0 == strcmp("html", next_argument)){ dataops.type = "HTML"; typename = next_argument; dataops.separator_function = NULL; dataops.header_function = html_header_function; dataops.finish_header_function = html_finish_header_function; } #endif /* HTML */ else{ panic("Don't recognize the '%s' type", next_argument); } } else{ panic("Don't recognize the '%s' option", next_argument); } next_argument = next_arg(&argc, &argv); if (! 
(read_files_from_stdin || next_argument)) { fprintf(stderr,"No files specified\n"); exit(0); } } start_of_filenames = argc_copy - argc - 1; /* check index */ if(0 == strlen(pathname_name(index_filename))){ waislog(WLOG_HIGH, WLOG_ERROR, "The pathname specified for the destination of the index files ('%s') should have a leaf filename without an extention rather than just a directory.", index_filename); exit(0); } #ifdef WIN32 /* Check that we're on a partition supporting long file names */ if (!CanCreateLongFileNames(index_filename)) { waislog(WLOG_HIGH, WLOG_ERROR, "The pathname specified for the destination of the index files ('%s') is inaccessible or does not support long filenames.", index_filename); exit(0); } #endif waislog(WLOG_MEDIUM, WLOG_INDEX, "Starting to build database %s", index_filename); if(0 != init_search_engine(index_filename, false, false, cm_mem_percent, text_size, grow_percent)) panic("unable to initialize search engine"); if(true == adding_to_existing_index){ db = openDatabase(index_filename, false, false); if (db == NULL){ /* does not exist, create one */ db = openDatabase(index_filename, true, false); if (db == NULL) panic("unable to open the database"); } } else{ db = openDatabase(index_filename, true, false); if (db == NULL) panic("unable to open the database"); } #ifdef BIO write_delimiters(gDelimiters, db); #endif { /* set up the memory hashtable */ if(memory_to_use < 0){ /* default */ /* do nothing */ } else if(memory_to_use <= 2){ hashtable_size = 1L<<16; flush_after_n_words = 50000; } else if(memory_to_use <= 3){ hashtable_size = 1L<<16; flush_after_n_words =850000; } else if(memory_to_use <= 4){ hashtable_size = 1L<<16; flush_after_n_words = 110000; } else if(memory_to_use <= 5){ hashtable_size = 1L<<16; flush_after_n_words = 150000; } else if(memory_to_use <= 10){ /* shown to take about 6MB on a sun4, when it is dict limited */ hashtable_size = 1L<<16; flush_after_n_words = 300000; } else if(memory_to_use <= 20){ hashtable_size = 
1L<<17; flush_after_n_words = 600000; } else{ /* over 20 Mbytes */ hashtable_size = 1L<<18; flush_after_n_words = 1000000; } /* Set up the filter process, if needed */ /* We do this before initing the hash table to stop the fork copying a load of rubbish*/ #ifdef WIN32 /* WIN32 Process is used here, how about thread? */ if(filter_name) { HANDLE hChildStdinRd, hChildStdinWr, hChildStdinWrDup, hChildStdoutRd, hChildStdoutWr, hSaveStdin, hSaveStdout; SECURITY_ATTRIBUTES saAttr; BOOL fSuccess; STARTUPINFO siStartInfo; int fd; /* Set the bInheritHandle flag so pipe handles are inherited. */ saAttr.nLength = sizeof(SECURITY_ATTRIBUTES); saAttr.bInheritHandle = TRUE; saAttr.lpSecurityDescriptor = NULL; /* * The steps for redirecting child's STDOUT: * 1. Save current STDOUT, to be restored later. * 2. Create anonymous pipe to be STDOUT for child. * 3. Set STDOUT of parent to be write handle of pipe, so * it is inherited by child. */ /* Save the handle to the current STDOUT. */ hSaveStdout = GetStdHandle(STD_OUTPUT_HANDLE); /* Create a pipe for the child's STDOUT. */ if (! CreatePipe(&hChildStdoutRd, &hChildStdoutWr, &saAttr, 0)) panic("Stdout pipe creation failed\n"); /* Set a write handle to the pipe to be STDOUT. */ if (! SetStdHandle(STD_OUTPUT_HANDLE, hChildStdoutWr)) panic("Redirecting STDOUT failed"); /* * The steps for redirecting child's STDIN: * 1. Save current STDIN, to be restored later. * 2. Create anonymous pipe to be STDIN for child. * 3. Set STDIN of parent to be read handle of pipe, so * it is inherited by child. * 4. Create a noninheritable duplicate of write handle, * and close the inheritable write handle. */ /* Save the handle to the current STDIN. */ hSaveStdin = GetStdHandle(STD_INPUT_HANDLE); /* Create a pipe for the child's STDIN. */ if (! CreatePipe(&hChildStdinRd, &hChildStdinWr, &saAttr, 0)) panic("Stdin pipe creation failed\n"); /* Set a read handle to the pipe to be STDIN. */ if (! 
SetStdHandle(STD_INPUT_HANDLE, hChildStdinRd)) panic("Redirecting Stdin failed"); /* * Duplicate the write handle to the pipe, so it is not * inherited. */ fSuccess = DuplicateHandle(GetCurrentProcess(), hChildStdinWr, GetCurrentProcess(), &hChildStdinWrDup, 0, FALSE, /* not inherited */ DUPLICATE_SAME_ACCESS); if (!fSuccess) panic("DuplicateHandle failed"); CloseHandle(hChildStdinWr); /* Set up members of STARTUPINFO structure. */ siStartInfo.cb = sizeof(STARTUPINFO); siStartInfo.lpReserved = NULL; siStartInfo.lpReserved2 = NULL; siStartInfo.cbReserved2 = 0; siStartInfo.lpDesktop = NULL; siStartInfo.dwFlags = 0; /* Create the child process. */ fSuccess = CreateProcess(NULL, filter_name, /* command line */ NULL, /* process security attributes */ NULL, /* primary thread security attributes */ TRUE, /* handles are inherited */ 0, /* creation flags */ NULL, /* use parent's environment */ NULL, /* use parent's current directory */ &siStartInfo, /* STARTUPINFO pointer */ &piProcInfo); /* receives PROCESS_INFORMATION */ if (!fSuccess) panic("Create process failed"); /* After process creation, restore the saved STDIN and STDOUT. */ if (! SetStdHandle(STD_INPUT_HANDLE, hSaveStdin)) panic("Re-redirecting Stdin failed\n"); if (! 
SetStdHandle(STD_OUTPUT_HANDLE, hSaveStdout)) panic("Re-redirecting Stdout failed\n"); fd = _open_osfhandle((long)hChildStdinWrDup, _O_APPEND); if (fd < 0) panic("Convert Win32 Handle to filter_process_in failed\n"); filter_process_in=_fdopen(fd,"wb"); fd = _open_osfhandle((long)hChildStdoutRd, _O_RDONLY); if (fd < 0) panic("Convert Win32 Handle to filter_process_out failed\n"); filter_process_out=_fdopen(fd,"rb"); waislog(WLOG_LOW, WLOG_INDEX, "Filter %s started (%d)",filter_name, piProcInfo.dwProcessId); } #else if(filter_name) { int to_handles[2]; int from_handles[2]; int pid; extern int errno; if (pipe(to_handles) <0) { panic("can't open to pipe"); } if (pipe(from_handles) <0) { panic("can't open from pipe"); } if((pid = fork()) ==0) { /* child */ close(0); close(1); close(2); dup(to_handles[0]); dup(from_handles[1]); /* Set up standard input/output/error */ dup(from_handles[1]); close(to_handles[0]); close(to_handles[1]); close(from_handles[0]); close(from_handles[1]); if(execl(filter_name,filter_name,NULL) == -1) { exit(errno); } /*NOTREACHED*/ } /* parent */ if (pid <0) { panic("Couldn't fork"); } close(to_handles[0]); close(from_handles[1]); filter_process_in=fdopen(to_handles[1],"w"); filter_process_out=fdopen(from_handles[0],"r"); waislog(WLOG_LOW, WLOG_INDEX, "Filter %s started (%d)",filter_name,pid); } #endif /* WIN32 */ init_add_word(db, hashtable_size, flush_after_n_words); } if (read_files_from_stdin) { if (0 != (next_argument = fgets(data_filename, MAXPATHLEN, stdin))) { int len = strlen(next_argument); if (next_argument[len-1] == '\n') { next_argument[len-1] = '\0'; } } } while(NULL != next_argument){ /* the first filename is in next_argument already */ if(directoryp(next_argument)){ if(traverse_directory){ index_directory(next_argument, &dataops, db, check_for_text_file, adding_to_existing_index, word_positions, word_pairs, #ifndef WIN32 filter_process_in,filter_process_out); #else "*.*", filter_process_in,filter_process_out); #endif /* 
index_directory(next_argument, separator_function, header_function, date_function, finish_header_function, type, db, check_for_text_file, adding_to_existing_index, word_positions, word_pairs, minwordlen); */ } } #ifdef WIN32 else if ((strchr(next_argument,'*')!=NULL)||(strchr(next_argument,'?')!=NULL)) { /* Contains a wildcard */ if (traverse_directory) { /* We're being asked to recursively index a directory tree looking for filenames which match the wildcarded pattern. */ char *cp; char *cPath = next_argument; /* Split the argument into the directory and the wildcard mask */ cp = strrchr(cPath,'\\'); if (cp==NULL) { /* No path */ cPath = "."; cp = next_argument; } else { /* Tie off path */ *cp++ = '\0'; /* File mask */ if (*cp=='\0') cp = "*.*"; } /* Index the directory */ index_directory(cPath, &dataops, db, check_for_text_file, adding_to_existing_index, word_positions, word_pairs, cp, filter_process_in,filter_process_out); } else { /* Not recursive */ HANDLE hSearch; WIN32_FIND_DATA ffd; char FileName[MAX_FILENAME_LEN+1]; char *cp; hSearch = FindFirstFile(next_argument,&ffd); if (hSearch!=INVALID_HANDLE_VALUE) { while (TRUE) { if ((ffd.dwFileAttributes & FILE_ATTRIBUTE_DIRECTORY)==0) { /* Index the file */ strncpy(FileName,next_argument,MAX_FILENAME_LEN); cp = strrchr(FileName,'\\'); if (cp!=NULL) { *(cp+1) = '\0'; strncat(FileName,ffd.cFileName,MAX_FILENAME_LEN); } else { strncpy(FileName,ffd.cFileName,MAX_FILENAME_LEN); } waislog(WLOG_MEDIUM, WLOG_INDEX, "Indexing file: %s", FileName); index_text_file(FileName, &dataops, db, check_for_text_file, adding_to_existing_index, word_positions, word_pairs, filter_process_in,filter_process_out); } if (!FindNextFile(hSearch,&ffd)) break; } FindClose(hSearch); } } } #endif else{ /* not a directory */ waislog(WLOG_MEDIUM, WLOG_INDEX, "Indexing file: %s", next_argument); index_text_file(next_argument, &dataops, db, check_for_text_file, adding_to_existing_index, word_positions, word_pairs, 
filter_process_in,filter_process_out); /* index_text_file(next_argument, separator_function, header_function, date_function, finish_header_function, type, db, check_for_text_file, adding_to_existing_index, word_positions, word_pairs, minwordlen); */ } if (read_files_from_stdin) { if (0 != (next_argument = fgets(data_filename, MAXPATHLEN, stdin))) { int len = strlen(next_argument); if (next_argument[len-1] == '\n') { next_argument[len-1] = '\0'; } } } else { next_argument = next_arg(&argc, &argv); } } finished_add_word(db); retreive_keywords(db); { char filename[MAX_FILENAME_LEN + 1]; if(!probe_file(source_filename(filename, db))){ char database_name[MAX_FILENAME_LEN]; write_src_structure(source_filename(filename, db), export_database?pathname_name(index_filename): truename(index_filename, database_name), typename, &argv_copy[start_of_filenames], argc_copy - start_of_filenames, export_database, 210L); }else{ char *oldkeys[50]; short oldKeys; if ((oldKeys = read_src_structure(source_filename(filename, db), oldkeys))) { if (compare(keyword, nKeys, oldkeys, oldKeys) > 0.1) { char database_name[MAX_FILENAME_LEN]; waislog(WLOG_MEDIUM,WLOG_INDEX, "Keyword comparison indicates significant change."); waislog(WLOG_MEDIUM,WLOG_INDEX, "Rewriting source description."); waislog(WLOG_MEDIUM,WLOG_INDEX, "New source description should be exported."); write_src_structure(source_filename(filename, db), export_database?pathname_name(index_filename): truename(index_filename, database_name), typename, &argv_copy[start_of_filenames], argc_copy - start_of_filenames, export_database, 210L); } } else { char database_name[MAX_FILENAME_LEN]; waislog(WLOG_MEDIUM,WLOG_INDEX, "No keyword list found."); waislog(WLOG_MEDIUM,WLOG_INDEX, "Rewriting source description."); waislog(WLOG_MEDIUM,WLOG_INDEX, "New source description should be export ed."); write_src_structure(source_filename(filename, db), export_database?pathname_name(index_filename): truename(index_filename, database_name), typename, 
&argv_copy[start_of_filenames], argc_copy - start_of_filenames, export_database, 210L); } } /* write out a description of the server if appropriate */ if(register_database){ register_src_structure(source_filename(filename, db)); } } if(make_catalog) build_catalog(db); closeDatabase(db); /* wait for filter process to die, if there was one*/ if(filter_process_in) { fprintf(filter_process_in,"Q\n"); fflush(filter_process_in); fclose(filter_process_out); fclose(filter_process_in); #ifdef WIN32 waislog(WLOG_LOW, WLOG_INDEX, "Filter %s Exited (%ld)",filter_name, WaitForSingleObject(piProcInfo.hProcess, INFINITE)); #else waislog(WLOG_LOW, WLOG_INDEX, "Filter %s Exited (%ld)",filter_name,wait(0L)); #endif } waislog(WLOG_MEDIUM, WLOG_INDEX, "Finished build"); exit(0); } #ifdef WIN32 int #endif read_src_structure(filename, output) char *filename; char *output[50]; { FILE *source_stream = s_fopen(filename, "r"); char line[MAX_LINE_LENGTH], *ptr; #ifdef WIN32 int keyflag = 0, linelen, index; #else int keyflag = 0, linelen, i, index; #endif int desflag = 0; int tmp; index = 0; while (fgets(line, MAX_LINE_LENGTH, source_stream)) { linelen = strlen(line); if (keyflag) { if (!strncmp(" )", line, (linelen > 19) ? 19 : linelen)) keyflag = 0; else { line[strlen(line)-1] = '\0'; /* get rid of trailing return */ ptr = line; /* parse keyword */ while (*ptr == ' ') ptr++; output[index] = malloc(strlen(ptr)+1); strcpy(output[index], ptr); index++; } } if (!strncmp(" :keyword-list (", line, (linelen > 18) ? 18 : linelen)) keyflag = 1; if (!strncmp(" :description", line, (linelen > 15) ? 
15 : linelen)) desflag = 1; if (desflag) { tmp=strlen(line)+1; descript[nDesLines] = malloc(tmp); strcpy(descript[nDesLines], line); nDesLines++; if (*line == '\"') desflag = 0; } } fclose(source_stream); return(index); } double compare(a, alen, b, blen) char *a[50], *b[50]; short alen, blen; { int changes = 0; int i, j; for (i=0; i<alen; i++) { for (j=0; j<blen; j++) if (!strcmp(a[i], b[j])) break; if (j == blen) changes++; } #ifdef WIN32 if (alen==0) return 0.0; #else printf("%d out of %d\n", changes, alen); /* info stuff */ #endif return((double) changes/alen); }