home *** CD-ROM | disk | FTP | other *** search
- /* WIDE AREA INFORMATION SERVER SOFTWARE:
- No guarantees or restrictions. See the readme file for the full standard
- disclaimer.
-
- Brewster@think.com
- */
-
- /* Copyright (c) CNIDR (see ../COPYRIGHT) */
-
-
- #ifndef lint
- static char *RCSid = "$Header: /archives/stelar/src/freeWAIS/freeWAIS-0.2/ir/RCS/waisindex.c,v 1.5 93/07/21 18:53:04 warnock Exp $";
- #endif
-
- /*
- * Building an index with a Unix shell interface.
- *
- * -brewster 6/90
- */
-
- /* Change log:
- * added -stdio option from jik@athena.mit.edu
- * $Log: waisindex.c,v $
- * Revision 1.5 93/07/21 18:53:04 warnock
- * Renamed from irbuild.c
- * Added STELAR-specific patches
- *
- * $Log: irbuild.c,v $
- * Revision 1.8 1993/10/12 11:18:25 pfeifer
- * Added stopword file for document style bibdb
- *
- * Revision 1.1 93/07/19 16:30:22 warnock
- * Initial revision
- *
- * Revision 1.7 1993/09/22 16:09:13 pfeifer
- * What have i done ?
- *
- * Revision 1.4 93/07/01 19:40:31 warnock
- * Added prototype for function double
- *
- * Revision 1.6 1993/06/04 10:23:15 pfeifer
- * Pachtlevel BIBDB
- *
- * Revision 1.3 93/02/16 17:07:49 freewais
- *
- * Revision 1.5 1993/06/02 18:29:00 pfeifer
- * Added code for local formats
- *
- * Revision 1.4 1993/06/01 14:05:54 pfeifer
- * Added code for soundex/phonix indexing and retrieval
- *
- * Revision 1.3 1993/02/16 17:07:49 freewais
- * added AT&T patches for keyword list
- *
- * Revision 1.2 1993/02/16 15:32:21 freewais
- * added AT&T patch to write first 50 dictionary entries to
- * src file
- *
- * Revision 1.1 1993/02/16 15:05:35 freewais
- * Initial revision
- *
- * Revision 1.47 92/05/10 14:48:17 jonathan
- * Updated for release.
- *
- * Revision 1.46 92/05/08 10:03:17 jonathan
- * Adjusted memory paramters. It's closer...
- *
- * Revision 1.45 92/05/06 17:26:46 jonathan
- * Added switch for indexing contents, new user-specified type name, new type:
- * filename, which only puts the name of the file in the header.
- *
- * Revision 1.44 92/04/25 21:14:35 brewster
- * added ziff
- *
- * Revision 1.43 92/04/22 15:29:13 jonathan
- * Added jargon to usage message.
- *
- * Revision 1.42 92/04/01 17:08:50 jonathan
- * Added FTP type.
- *
- * Revision 1.41 92/03/25 18:49:39 jonathan
- * Added log_level and log_file arguments.
- *
- * Revision 1.40 92/03/22 18:38:14 brewster
- * added objective C filter
- *
- * Revision 1.39 92/03/20 11:02:44 jonathan
- * Added code to handle switches for word_pairs and word_postition info.
- *
- * Revision 1.38 92/03/17 07:34:32 jonathan
- * Fixed spacing in usage message.
- *
- * Revision 1.37 92/03/10 10:42:51 morris
- * fixed small bug in command line argument handleing. doesn't die if there
- * are no args.
- *
- * Revision 1.36 92/03/05 07:05:32 shen
- * add cm grow percent and textsize to command line and init search engine
- *
- * Revision 1.35 92/03/04 16:34:09 jonathan
- * Set wais_pid from getpid().
- *
- * Revision 1.34 92/02/20 09:49:37 jonathan
- * Added bibtex and nhyp filters from S.P.vandeBurgt@research.ptt.nl.
- *
- * Revision 1.33 92/02/17 14:21:08 jonathan
- * Added switch to disable creation of catalog (-nocat).
- *
- * Revision 1.32 92/02/17 12:41:55 jonathan
- * Added RCSid.
- *
- * Revision 1.31 92/02/17 12:41:01 jonathan
- * Build catalog after completion of indexing.
- *
- * Revision 1.30 92/02/12 13:22:53 jonathan
- * Added "$Log" so RCS will put the log message in the header
- *
- */
-
- /* to do:
- * done: make incremental indexing not index things that are already indexed
- * add extra arg -register that will send in description of the server to
- * the directory of servers.
- * done: create a source struct in the .src file
- * make it continuously index to keep itself uptodate.
- *
- */
-
- #include <string.h>
- #include <sys/types.h>
- #ifndef WIN32
- #include <sys/param.h>
- #endif
- #include "irdirent.h"
- #include "cutil.h"
- #include "futil.h"
- #include "irfiles.h"
- #include "irtfiles.h"
- #include "panic.h"
- #include "ircfiles.h"
- #include "version.h"
- #include "irext.h"
- #include "stoplist.h" /* dgg */
-
- #ifdef WIN32
- #include <windows.h>
- #include <io.h>
- #include <fcntl.h>
- #define MAXPATHLEN 260
- int read_src_structure(char*,char**);
- int retreive_keywords(database*);
- #endif
-
- #ifdef BIO
- #define INDEXER_DATE "2/16/93"
- #else
- #define INDEXER_DATE "2/16/93"
- #endif
- #define MAX_LINE_LENGTH 1000
-
- extern char *keyword[50], *descript[1000];
- extern short nKeys, nDesLines;
- extern double compare();
-
- /* for reporting errors, in WAIStation it is defined in CRetrievalApp.c */
-
- extern boolean indexingForBeta;
-
/*
 * usage -- print the command-line synopsis of the indexer on stderr.
 *
 * command: the program name, substituted into the first "Usage:" line.
 *
 * The function only writes text; it does not exit.  Which optional lines
 * appear depends on the build-time symbols BIO, WIN32, BIBDB, NeXT, SOUND,
 * AAS, STELAR and HTML.
 *
 * Only the first line needs formatting, so it is the only one sent through
 * fprintf.  All constant text is written with fputs so that a literal '%'
 * in the help text (the "-cmmem mem%" entry) is never interpreted as the
 * start of a conversion specification.
 */
void usage(command)
char *command;
{
  fprintf(stderr, "Usage: %s [-d index_filename]\n", command);

  /* general switches */
  fputs(" [-a] /* adding to an existing index, otherwise it erases the index */\n", stderr);
  fputs(" [-r] /* recursively index subdirectories */\n", stderr);
  fputs(" [-mem mbytes] /* number of megabytes to run this in */\n", stderr);
  fputs(" [-register] /* registers the database with the directory of servers.\n", stderr);
  fputs(" This should be done with care. */\n", stderr);
  fputs(" [-export] /* uses short dbname and port 210 */\n", stderr);
  fputs(" [-e [file]] /* set log output to file, or /dev/null if not specified */\n", stderr);
  fputs(" [-l log_level] /* set log level. 0 means log nothing,\n", stderr);
  fputs(" 10 [the default] means log everything */\n", stderr);
  fputs(" [-v] /* print the version of the software */\n", stderr);
  fputs(" [-filter process] /* use an external document parser */\n", stderr);
  fputs(" [-stdin] /* read file names from stdin */\n", stderr);
  /* the five entries below used to close their comment with "/" "*" reversed */
  fputs(" [-pos | -nopos] /* include (don't include - default) word position information */\n", stderr);
  fputs(" [-nopairs | -pairs] /* don't include (or include - default) word pairs */\n", stderr);
  fputs(" [-nocat] /* inhibit creation of catalog */\n", stderr);
  fputs(" [-contents] /* Index the contents: this is good for types that\n", stderr);
  fputs(" inhibit the indexing of the contents (like gif). */\n", stderr);
  fputs(" [-nocontents] /* Index only the filename, not the contents */\n", stderr);
#ifdef BIO
  fputs(" [-stop stoplist_filename] /* file of common words to ignore */\n", stderr);
  fputs(" [-delim delimiters] /* list of word delimiter symbols */\n", stderr);
#endif
  fputs(" [-keywords \"<string>\"] /* Keywords to index for each document. */\n", stderr);
  fputs(" [-keyword_file <filename>] /* File of keywords to index. */\n", stderr);

  /* literal '%' below: safe because fputs does no format processing */
  fputs(" [-cmmem mem%] /* percent of CM memory (CM code only) */\n", stderr);
  fputs(" [-T type] /* type becomes the \"TYPE\" of the document. */\n", stderr);
  /* multitype extensions */
  fputs(" [-M type,type] /* for multi-type documents. */\n", stderr);
#ifdef WIN32
  fputs(" [-x filename,filename] /* ignore the filename(s). */\n", stderr);
#endif

  /* document formats accepted by -t */
  fputs(" [-t /* format of the file. if none then each file is a document */\n", stderr);
  fputs(" text /* simple text files, this is the default */\n", stderr);
  fputs(" | bibtex /* BibTeX / LaTeX format */\n", stderr);
  fputs(" | bio /* biology abstract format */\n", stderr);
  fputs(" | cmapp /* CM applications from Hypercard */\n", stderr);
  fputs(" | dash /* entries separated by a row of dashes */\n", stderr);
  fputs(" | dvi /* dvi format */\n", stderr);
  fputs(" | emacsinfo /* the GNU documentation system */\n", stderr);
  fputs(" | first_line /* first line of file is headline */\n", stderr);
  fputs(" | filename /* uses only the filename part of the pathname for the title */\n", stderr);
  fputs(" | ftp /* special type for FTP files. First line of file is headline */\n", stderr);
  fputs(" | gif /* gif files, only indexes the filename */\n", stderr);
  fputs(" | irg /* internet resource guide */\n", stderr);
  fputs(" | jargon /* Jargon File 2.9.8 format*/\n", stderr);
  fputs(" | listserv_digest /* standard internet mail digest format */\n", stderr);
  fputs(" | mail_digest /* standard internet mail digest format */\n", stderr);
  fputs(" | mail_or_rmail /* mail or rmail or both */\n", stderr);
  fputs(" | medline /* medline format */\n", stderr);
  fputs(" | mh_bboard /* MH bulletin board format */\n", stderr);
#ifdef WIN32
  fputs(" | ms_kbase /* MS Knowledge Base format */\n", stderr);
#endif
  fputs(" | netnews /* netnews format */\n", stderr);
  fputs(" | nhyp /* ?:? hyper text format, Polytechnic of Central London */\n", stderr);
  fputs(" | one_line /* each line is a document */\n", stderr);
  fputs(" | para /* paragraphs separated by blank lines */\n", stderr);
  fputs(" | pict /* pict files, only indexes the filename */\n", stderr);
  fputs(" | ps /* postscript format */\n", stderr);
  fputs(" | refer /* refer format */\n", stderr);
#ifdef BIBDB
  fputs(" | irlist /* irlist mail or rmail or both */\n", stderr);
  fputs(" | formfeed /* entries separated by a formfeed */\n", stderr);
  fputs(" | bibdb /* steve file entries separated by a formfeed */\n", stderr);
  fputs(" | bibinf /* bibinf entries separated by an empty line */\n", stderr);
#endif
  fputs(" | rn /* netnews saved by the [rt]?rn newsreader */\n", stderr);
  fputs(" | server /* server structures for the dir of servers */\n", stderr);
#ifdef NeXT
  fputs(" | objc /* objective-C .h and .m files */\n", stderr);
#endif /* def NeXT */
  fputs(" | tiff /* tiff files, only indexes the filename */\n", stderr);
  fputs(" | URL what-to-trim what-to-add /* URL */\n", stderr);
  fputs(" | object /* a structured object*/\n", stderr);
  fputs(" | inriadoc /* INRIA library catalog */\n", stderr);
  /* no trailing blank after the newline, so the next entry stays aligned */
  fputs(" | paradoc /* INRIA library catalog para-mode */\n", stderr);
  fputs(" | fortran /* Fortran files,needs also -filter */\n", stderr);
  fputs(" | mime /* Like mail */\n", stderr);

#ifdef BIO
  fputs(" | genbank /* GenBank flatfile format */\n", stderr);
  fputs(" | embl /* EMBL flatfile format */\n", stderr);
  fputs(" | pir /* PIR flatfile format */\n", stderr);
  fputs(" | prositedoc /* Prosite protein doc format */\n", stderr);
  fputs(" | prositedat /* Prosite protein dat format */\n", stderr);
  fputs(" | biojournal /* Bio journal TOC on bionet.journals */\n", stderr);
  fputs(" | redbook /* Drosophila redbook text */\n", stderr);
  fputs(" | flybase /* Drosophila Ashburner data files */\n", stderr);
  fputs(" | flystock /* Drosophila stock lists */\n", stderr);
  fputs(" | din /* Drosophila Info. Newsletter */\n", stderr);
#endif
#ifdef SOUND
  fputs(" | oneline_phonix /* Phonebooks PHONIX */\n", stderr);
  fputs(" | oneline_soundex /* Phonebooks SOUNDEX */\n", stderr);
#endif
#ifdef AAS
  fputs(" | AAS_abstract /* AAS meeting abstracts using AAS LaTeX macros */\n", stderr);
#endif /* AAS */
#ifdef STELAR
  fputs(" | stelar /* stelar abstracts - third line is hl */\n", stderr);
#endif /* STELAR */
#ifdef HTML
  fputs(" | html /* Hypertext Markup Language from WWW */\n", stderr);
#endif /* HTML */
  fputs(" ] filename filename ...\n", stderr);
}
-
- /* char *log_file_name = NULL; */
- FILE *logfile;
-
- extern char* keywords; /* used in irtfiles.c */
- extern char* keyword_filename; /* used in irtfiles.c */
- #ifdef WIN32
- extern char ExcludeFiles[]; /* used in irtfiles.c */
- #endif
-
- extern boolean index_contents;
-
-
- /* This is the MAIN for building an index.
- */
- void
- main(argc, argv)
- int argc;
- char *argv[];
- {
- database* db = NULL;
- long argc_copy = argc;
- char **argv_copy = argv;
- char *next_argument;
- char index_filename[1000];
- boolean adding_to_existing_index = false;
- boolean traverse_directory = false;
- boolean word_positions = false;
- boolean word_pairs = true;
- long memory_to_use = -1;
- long cm_mem_percent = 0; /* default */
- long grow_percent = 0; /* default */
- long text_size = 0; /* default */
- boolean check_for_text_file = false;
- boolean register_database = false;
- boolean export_database = false;
- boolean read_files_from_stdin = false;
- boolean make_catalog = true;
- char data_filename[MAXPATHLEN];
- char *typename = NULL; /* this is what the user said */
- long start_of_filenames;
- long hashtable_size = 1L<<16;
- long flush_after_n_words = 300000;
- char *command_name;
- char *filter_name = NULL;
- FILE *filter_process_in = NULL;
- FILE *filter_process_out = NULL;
- #ifdef WIN32
- PROCESS_INFORMATION piProcInfo;
- #endif
-
-
- dataopsrec dataops;
- /*------------- these go into dataops
- boolean (*separator_function)();
- void (*header_function)();
- void (*finish_header_function)();
- long (*date_function)();
- char *type = NULL;
- int minwordlen= 2;
- ---------------*/
-
- /* dgg -- put all of these separate, datatype-specific functions & params into a record! */
- gDelimiters[0]= '\0'; /* <-- bombs ?? */
- dataops.separator_function= NULL;
- dataops.header_function= NULL;
- dataops.date_function= NULL;
- dataops.finish_header_function= NULL;
- dataops.type= "TEXT";
- dataops.indextype= NULL;
- dataops.multitype=NULL;
- dataops.addseparatorwords= false;
- dataops.extraheaderweight= true;
- dataops.repeat_weight= 1;
- dataops.minwordlen= 2;
- dataops.wordDelimiter= wordbreak_notalnum;
- dataops.delimiters= gDelimiters;
- wordDelimiter= wordbreak_notalnum;
-
- /*------
- separator_function = NULL;
- header_function = NULL;
- date_function = NULL;
- finish_header_function = NULL;
- type = "TEXT";
- -------*/
- typename = "Text";
-
-
- next_argument = next_arg(&argc, &argv);
- command_name = next_argument;
-
- logfile = stderr;
-
- #ifdef WIN32
- wais_pid = GetCurrentProcessId();
- #else
- wais_pid = getpid();
- #endif
-
- if(0 == argc) {
- usage(command_name);
- exit(0);
- }
-
- #ifdef THINK_C
- strcpy(index_filename, "wais:System Folder:wais-index:index");
- #else
- strcpy(index_filename, "index"); /* in the current directory */
- #endif /* THINK_C */
- stop_list_file("\0"); /* dgg */
-
- if(NULL == (next_argument = next_arg(&argc, &argv))){
- fprintf(stderr,"No arguments specified\n");
- exit(0);
- }
- while((next_argument != NULL) && '-' == next_argument[0]){
- /* then we have an argument to process */
- if((0 == strcmp("-i", next_argument)) || /* -i is for backcompatibility */
- (0 == strcmp("-d", next_argument))){
- if(NULL == (next_argument = next_arg(&argc, &argv))){
- fprintf(stderr,"Expected filename for the index\n");
- exit(0);
- }
- strcpy(index_filename, next_argument);
- }
- #ifdef BIO
- else if (0 == strcmp("-stop", next_argument)){ /* dgg, stoplist file */
- if (NULL == (next_argument = next_arg(&argc, &argv))){
- fprintf(stderr,"Expected filename for the stoplist\n");
- exit(0);
- }
- stop_list_file(next_argument);
- }
- else if (0 == strcmp("-delim", next_argument)){ /* dgg, delimiters */
- if (NULL == (next_argument = next_arg(&argc, &argv))){
- fprintf(stderr,"Expected the delimiters argument\n");
- exit(0);
- }
- strcpy(gDelimiters, next_argument);
- dataops.wordDelimiter = wordbreak_user;
- wordDelimiter = wordbreak_user;
- printf("Delimiters used in index: %s\n\n",gDelimiters);
- }
- #endif
-
- else if(0 == strcmp("-a", next_argument)){
- adding_to_existing_index = true;
- }
- else if(0 == strcmp("-r", next_argument)){
- traverse_directory = true;
- }
- else if(0 == strcmp("-register", next_argument)){
- register_database = true;
- }
- else if(0 == strcmp("-export", next_argument)){
- export_database = true;
- }
- else if(0 == strcmp("-v", next_argument)){
- fprintf(stderr,"%s: %s %s\n", command_name, VERSION, INDEXER_DATE);
- #ifdef WIN32
- fprintf(stderr,"%s\n",VERWIN32);
- if (argc_copy == 2)
- exit(0);
- #endif
- }
- else if (0 == strcmp("-stdin", next_argument)) {
- read_files_from_stdin = true;
- }
- else if (0 == strcmp("-nopos", next_argument)) {
- word_positions = false;
- }
- else if (0 == strcmp("-pos", next_argument)) {
- word_positions = true;
- }
- else if (0 == strcmp("-nopairs", next_argument)) {
- word_pairs = false;
- }
- else if (0 == strcmp("-pairs", next_argument)) {
- word_pairs = true;
- }
- else if (0 == strcmp("-nocat", next_argument)) {
- make_catalog = false;
- }
- else if(0 == strcmp("-mem", next_argument)){
- if(NULL == (next_argument = next_arg(&argc, &argv)))
- panic("Expected a number for the amount of memory to use");
- memory_to_use = atol(next_argument);
- if(memory_to_use < 1)
- panic("The -mem argument should not be less than 1");
- if(memory_to_use > 200)
- fprintf(stderr,"Warning: The -mem parameter was %ld Mbytes. That is a large number of mega bytes in current machines\n", memory_to_use);
- }
- else if(0 == strcmp("-cmmem", next_argument)){
- if(NULL == (next_argument = next_arg(&argc, &argv)))
- panic("Expected a number (1-100) for percentage of memory to use");
- cm_mem_percent = atol(next_argument);
- if(cm_mem_percent < 1)
- panic("The -cmmem argument should not be less than 1 and less than 100");
- if(cm_mem_percent > 100)
- panic("Warning: The -cmmem parameter was %ld%%. It should be between 1-100.", cm_mem_percent);
- }
- else if(0 == strcmp("-filter", next_argument)){
- if(NULL == (next_argument = next_arg(&argc, &argv)))
- panic("Expected the name of a program to use to find keywords");
- filter_name=next_argument;
- }
- else if(0 == strcmp("-grow", next_argument)){
- if(NULL == (next_argument = next_arg(&argc, &argv)))
- panic("Expected a number (1-100) for database growing percentage");
- grow_percent = atol(next_argument);
- if(grow_percent < 1)
- panic("The -grow argument should not be less than 1");
- }
- else if(0 == strcmp("-textsize", next_argument)){
- if(NULL == (next_argument = next_arg(&argc, &argv)))
- panic("Expected a number for text size in megabytes");
- text_size = atol(next_argument);
- if(text_size < 1)
- panic("The -textsize argument should not be less than 1");
- }
- else if (0 == strcmp("-e", next_argument)) {
- char *peek_argument = peek_arg(&argc, &argv);
- #ifdef WIN32
- log_file_name = "NUL:"; /* default to NUL: */
- #else
- log_file_name = "/dev/null"; /* default to /dev/null */
- #endif
- if ((peek_argument != NULL) &&
- ('-' != peek_argument[0])) {
- log_file_name = next_arg(&argc, &argv);
- } /* end if (explicit log file) */
- } /* end if (-e) */
- else if (0 == strcmp("-l", next_argument)) {
- #ifdef WIN32
- char *pNextArg = next_arg(&argc, &argv);
- if (pNextArg!=NULL) {
- wais_log_level = atol(pNextArg);
- } else {
- panic("Expected a log-level argument");
- }
- #else
- wais_log_level = atol(next_arg(&argc, &argv));
- #endif
- } /* end if (-l) */
- else if(0 == strcmp("-cm", next_argument)){
- /* this is an undocumented argument to help use this to
- front end the CM application */
- indexingForBeta = true;
- }
- else if(0 == strcmp("-T", next_argument)){
- /* This is a specification for a "Special" type. The next argument
- is the type name. This will not index the body of the file. */
- if(NULL == (next_argument = next_arg(&argc, &argv)))
- panic("Expected a file type");
- typename = next_argument;
- dataops.type = next_argument;
- fprintf(stderr,"waisindex: setting type to %s\n", next_argument);
- dataops.finish_header_function = filename_finish_header_function;
- }
-
- /* multitype extensions */
- /*
- This is a specification for a multi-type document, the types should
- be entered as a comma delimited list. Note that this only defines
- all the types available in the database, you also need to specify a
- -t option so that the indexer knows how to parse the files.
- One of the limitations here is that each document must
- be a file with the extension of the file being the document type, so
- the document #### has a text file ####.TEXT and a jfif file
- ####.JFIF, not real nice but needed.
-
- Note that this contains both the primary and secondary document
- types, whereas dataops.type contains the primary type.
- */
-
- else if(0 == strcmp("-M", next_argument)){
- if(NULL == (next_argument = next_arg(&argc, &argv)))
- panic("Expected a multitype list");
- dataops.multitype = next_argument;
- }
-
- else if(0 == strcmp("-x", next_argument)){
- if(NULL == (next_argument = next_arg(&argc, &argv)))
- panic("Expected an excluded filename list");
- strncpy(ExcludeFiles,next_argument,EXCLUDEFILENAMESLEN-2);
- ExcludeFiles[strlen(ExcludeFiles)+2] = '\0'; /* Ensure double-null terminated */
- strtok(ExcludeFiles,",");
- while (strtok(NULL,",")!=NULL) /* Cycle until all tokens found */ ;
- }
- else if(0 == strcmp("-keywords", next_argument)){
- if(NULL == (next_argument = next_arg(&argc, &argv)))
- panic("Expected -keywords argument string");
- keywords = next_argument;
- }
- else if(0 == strcmp("-keyword_file", next_argument)){
- if(NULL == (next_argument = next_arg(&argc, &argv)))
- panic("Expected -keyword_file filename");
- keyword_filename = next_argument;
- }
-
- else if(0 == strcmp("-contents", next_argument)){
- index_contents = true;
- }
- else if(0 == strcmp("-nocontents", next_argument)){
- index_contents = false;
- }
- else if(0 == strcmp("-t", next_argument)){
- /* then we have a specialized file */
- index_contents = true;
- if(NULL == (next_argument = next_arg(&argc, &argv)))
- panic("Expected a file type");
- if(0 == strcmp("groliers", next_argument)){
- typename = next_argument;
- dataops.type ="TEXT";
- dataops.separator_function = groliers_separator_function;
- dataops.header_function = groliers_header_function;
- dataops.finish_header_function = groliers_finish_header_function;
- }
-
- #ifdef BIO
- else if(0 == strcmp("genbank", next_argument)){/* dgg */
- typename = next_argument;
- dataops.type ="TEXT";
- dataops.separator_function = genbank_separator_function;
- dataops.header_function = genbank_header_function;
- dataops.finish_header_function = genbank_finish_header_function;
- dataops.date_function = genbank_date_function;
- dataops.repeat_weight= 0;
- dataops.addseparatorwords= true;
- dataops.extraheaderweight= false;
- dataops.minwordlen= 2;
- }
- else if(0 == strcmp("embl", next_argument)){/* dgg */
- typename = next_argument;
- dataops.type ="TEXT";
- dataops.separator_function = embl_separator_function;
- dataops.header_function = embl_header_function;
- dataops.finish_header_function = embl_finish_header_function;
- dataops.date_function = embl_date_function;
- dataops.repeat_weight= 0;
- dataops.addseparatorwords= true;
- dataops.extraheaderweight= false;
- }
- else if(0 == strcmp("pir", next_argument)){/* dgg */
- typename = next_argument;
- dataops.type = "TEXT";
- dataops.separator_function = pir_separator_function;
- dataops.header_function = pir_header_function;
- dataops.finish_header_function = pir_finish_header_function;
- dataops.date_function = pir_date_function;
- dataops.repeat_weight= 0;
- dataops.addseparatorwords= true;
- dataops.extraheaderweight= false;
- }
- else if(0 == strcmp("prositedoc", next_argument)){ /* dgg */
- typename = next_argument;
- dataops.type = "TEXT";
- dataops.separator_function = prositedoc_separator_function;
- dataops.header_function = prositedoc_header_function;
- dataops.finish_header_function = prositedoc_finish_header_function;
- dataops.repeat_weight= 0;
- dataops.addseparatorwords= true;
- dataops.extraheaderweight= false;
- }
- else if(0 == strcmp("prositedat", next_argument)){ /* dgg */
- typename = next_argument;
- dataops.type = "TEXT";
- dataops.separator_function = prositedat_separator_function;
- dataops.header_function = prositedat_header_function;
- dataops.finish_header_function = prositedat_finish_header_function;
- dataops.repeat_weight= 0;
- dataops.addseparatorwords= true;
- dataops.extraheaderweight= false;
- }
- else if(0 == strcmp("biojournal", next_argument)){ /* dgg */
- typename = next_argument;
- dataops.type = "TEXT";
- dataops.separator_function = biojournal_separator_function;
- dataops.header_function = biojournal_header_function;
- dataops.finish_header_function = biojournal_finish_header_function;
- dataops.repeat_weight= 0;
- dataops.addseparatorwords= true;
- dataops.extraheaderweight= false;
- }
-
- else if(0 == strcmp("redbook", next_argument)){ /* dgg */
- typename = next_argument;
- dataops.type = "TEXT";
- dataops.separator_function = redbook_separator_function;
- dataops.header_function = redbook_header_function;
- dataops.finish_header_function = redbook_finish_header_function;
- dataops.repeat_weight= 0;
- dataops.addseparatorwords= true;
- dataops.extraheaderweight= false;
- dataops.wordDelimiter= wordbreak_user; /* redbook_delimiter; */
- wordDelimiter= wordbreak_user; /* wordbreak_notgraph; */
- dataops.minwordlen= 1;
- if (gDelimiters[0] == '\0') strcpy( gDelimiters, "/{}()[]%-:#.~*\";,|");
- }
- else if(0 == strcmp("flybase", next_argument)){ /* dgg */
- typename = next_argument;
- dataops.type = "TEXT";
- dataops.separator_function = flybase_separator_function;
- dataops.header_function = flybase_header_function;
- dataops.finish_header_function = flybase_finish_header_function;
- dataops.repeat_weight= 0;
- dataops.addseparatorwords= true;
- dataops.extraheaderweight= false;
- dataops.wordDelimiter= wordbreak_user; /* flybase_delimiter; */
- wordDelimiter= wordbreak_user; /* wordbreak_notgraph; */
- dataops.minwordlen= 1;
- if (gDelimiters[0] == '\0') strcpy( gDelimiters, "-/{}:.~*\";,|");
-
- /* flybase symbols
- valid data ()$+-?;.\'
- possible data and delimiter |;[]-?.~
- delimiters
- solution to confusion: set possible delimiters as delimiters, and
- permit literal searches with "..." or '...' enclosed strings.
- */
-
- }
- else if(0 == strcmp("flystock", next_argument)){ /* dgg */
- typename = next_argument;
- dataops.type = "TEXT";
- dataops.separator_function = bio_separator_function;
- dataops.header_function = bio_header_function;
- dataops.finish_header_function = bio_finish_header_function;
- dataops.repeat_weight= 0;
- dataops.addseparatorwords= true;
- dataops.extraheaderweight= false;
- dataops.wordDelimiter= wordbreak_user; /* flybase_delimiter; */
- wordDelimiter= wordbreak_user; /* wordbreak_notgraph; */
- dataops.minwordlen= 1;
- if (gDelimiters[0] == '\0') strcpy( gDelimiters, "-/{}:.~*\";,|");
-
- /* flystock symbols
- valid data []()/-;?+.{}
- possible data and delimiter =;.
- ;. in text field is del, in data field is data
- delimiters *";,
- more delimiters (from matthewk) - / {} :
-
- solution to confusion: set possible delimiters as delimiters, and
- permit literal searches with "..." or '...' enclosed strings.
- ! want some way to provide field names (report "stylesheet") with
- searched/fetched records for flybase, flystock, other data files
- ! want "keyword [field]" limited searches for some of this to make sense !
- */
- }
-
- else if(0 == strcmp("din", next_argument)){
- typename = next_argument;
- dataops.type = "TEXT";
- dataops.separator_function = din_separator_function;
- dataops.header_function = din_header_function;
- dataops.finish_header_function = din_finish_header_function;
- }
-
- #endif
-
- #ifdef NeXT
- else if(0 == strcmp("objc", next_argument)){
- typename = next_argument;
- dataops.type = "TEXT";
- dataops.separator_function = wobjc_separator_function;
- dataops.header_function = wobjc_header_function;
- dataops.finish_header_function = wobjc_finish_header_function;
- }
- #endif /* def NeXT */
- else if(0 == strcmp("listserv_digest", next_argument)){
- typename = next_argument;
- dataops.type = "TEXT";
- dataops.separator_function = listserv_digest_separator_function;
- dataops.header_function = listserv_header_function;
- dataops.date_function = listserv_date_function;
- dataops.finish_header_function = listserv_finish_header_function;
- }
- #ifdef AAS
- else if(0 == strcmp("AAS_abstract", next_argument)){
- typename = next_argument;
- dataops.separator_function = aasab_separator_function;
- dataops.header_function = aasab_header_function;
- dataops.finish_header_function = aasab_finish_header_function;
- }
- #endif /* AAS */
- #ifdef STELAR
- else if(0==strcmp("stelar",next_argument)){
- dataops.type="TEXT";
- typename=next_argument;
- dataops.separator_function=stelar_separator_function;
- dataops.header_function=stelar_header_function;
- dataops.finish_header_function=stelar_finish_header_function;
- }
- #endif /* STELAR */
- else if(0 == strcmp("mail", next_argument)){
- typename = next_argument;
- dataops.type = "TEXT";
- dataops.separator_function = mail_separator_function;
- dataops.header_function = mail_header_function;
- dataops.date_function = mail_date_function;
- dataops.finish_header_function = mail_finish_header_function;
- }
- else if(0 == strcmp("mail_or_rmail", next_argument)){
- typename = next_argument;
- dataops.type = "TEXT";
- dataops.separator_function = mail_or_rmail_separator;
- dataops.header_function = mail_header_function;
- dataops.date_function = mail_date_function;
- dataops.finish_header_function = mail_finish_header_function;
- }
- else if(0 == strcmp("mail_digest", next_argument)){
- typename = next_argument;
- dataops.type = "TEXT";
- dataops.separator_function = mail_digest_separator_function;
- dataops.header_function = mail_header_function;
- dataops.date_function = mail_date_function;
- dataops.finish_header_function = mail_finish_header_function;
- }
- else if(0 == strcmp("mh_bboard", next_argument)){
- typename = next_argument;
- dataops.type = "TEXT";
- dataops.separator_function = mh_bboard_separator_function;
- dataops.header_function = mail_header_function;
- dataops.date_function = mail_date_function;
- dataops.finish_header_function = mail_finish_header_function;
- }
- #ifdef WIN32
- else if(0 == strcmp("ms_kbase", next_argument)){
- typename = next_argument;
- dataops.type = "TEXT";
- dataops.separator_function = NULL;
- dataops.header_function = mskbase_header_function;
- dataops.date_function = mskbase_date_function;
- dataops.finish_header_function = mskbase_finish_header_function;
- }
- #endif
- else if(0 == strcmp("rmail", next_argument)){
- typename = next_argument;
- dataops.type = "TEXT";
- dataops.separator_function = rmail_separator_function;
- dataops.header_function = mail_header_function;
- dataops.date_function = mail_date_function;
- dataops.finish_header_function = mail_finish_header_function;
- }
- else if(0 == strcmp("netnews", next_argument)){
- typename = next_argument;
- dataops.type = "TEXT";
- dataops.separator_function = NULL;
- dataops.header_function = mail_header_function;
- dataops.date_function = mail_date_function;
- dataops.finish_header_function = mail_finish_header_function;
- }
- else if(0 == strcmp("rn", next_argument)){
- typename = next_argument;
- dataops.type = "TEXT";
- dataops.separator_function = rn_separator_function;
- dataops.header_function = mail_header_function;
- dataops.date_function = mail_date_function;
- dataops.finish_header_function = mail_finish_header_function;
- }
- #ifdef BIBDB
- else if(0 == strcmp("irlist", next_argument)){
- typename = next_argument;
- dataops.type = "TEXT";
- dataops.separator_function = irlist_separator_function;
- dataops.header_function = irlist_header_function;
- dataops.date_function = irlist_date_function;
- dataops.finish_header_function = mail_finish_header_function;
- }
- /* formfeed-separated items , Intro to Algorithms buglist, etc */
- else if(0 == strcmp("formfeed", next_argument)){
- typename = next_argument;
- if (!dataops.type || (strlen(dataops.type)==0)) {
- if (dataops.type)
- fprintf(stderr, "irbuild: overwriting type %s\n", dataops.type);
- dataops.type = "TEXT";
- } else {
- fprintf(stderr, "irbuild: using type %s\n", dataops.type);
- }
- dataops.separator_function = formfeed_separator_function;
- dataops.header_function = dash_header_function;
- dataops.finish_header_function = dash_finish_header_function;
- }
- /* formfeed-separated items , steve files */
- else if(0 == strcmp("bibdb", next_argument)){
- typename = next_argument;
- if (!dataops.type || (strlen(dataops.type)==0)) {
- if (dataops.type)
- fprintf(stderr, "irbuild: overwriting type %s\n", dataops.type);
- dataops.type = "TEXT";
- } else {
- fprintf(stderr, "irbuild: using type %s\n", dataops.type);
- stop_list_file("bibdb.stop");
- }
- dataops.separator_function = bibdb_separator_function;
- dataops.header_function = bibdb_header_function;
- dataops.date_function = bibdb_date_function;
- dataops.finish_header_function = bibdb_finish_header_function;
- }
- /* formfeed-separated items, bibinf */
- else if(0 == strcmp("bibinf", next_argument)){
- typename = next_argument;
- dataops.type = "TEXT";
- dataops.separator_function = bibinf_separator_function;
- dataops.header_function = bibinf_header_function;
- #ifdef SIMPLE_BIBINF
- dataops.date_function = bibinf_date_function;
- #endif
- dataops.finish_header_function = bibinf_finish_header_function;
- }
- #endif
- else if(0 == strcmp("emacsinfo", next_argument)){
- typename = next_argument;
- dataops.type = "TEXT";
- dataops.separator_function = emacs_info_separator_function;
- dataops.header_function = emacs_info_header_function;
- dataops.finish_header_function = emacs_info_finish_header_function;
- }
- else if(0 == strcmp("catalog", next_argument)){
- typename = next_argument;
- dataops.type = "TEXT";
- dataops.separator_function = catalog_separator_function;
- dataops.header_function = catalog_header_function;
- dataops.finish_header_function = catalog_finish_header_function;
- }
- else if(0 == strcmp("bio", next_argument)){
- typename = next_argument;
- dataops.type = "TEXT";
- dataops.separator_function = bio_separator_function;
- dataops.header_function = bio_header_function;
- dataops.finish_header_function = bio_finish_header_function;
- }
- else if(0 == strcmp("cmapp", next_argument)){
- typename = next_argument;
- dataops.type = "TEXT";
- dataops.separator_function = cmapp_separator_function;
- dataops.header_function = cmapp_header_function;
- dataops.finish_header_function = cmapp_finish_header_function;
- }
- else if(0 == strcmp("ftp", next_argument)){
- dataops.type = "TEXT-FTP";
- typename = next_argument;
- dataops.separator_function = first_line_separator_function;
- dataops.header_function = first_line_header_function;
- dataops.finish_header_function = first_line_finish_header_function;
- }
- else if(0 == strcmp("jargon", next_argument)){
- typename = next_argument;
- dataops.type = "TEXT";
- dataops.separator_function = jargon_separator_function;
- dataops.header_function = jargon_header_function;
- dataops.finish_header_function = jargon_finish_header_function;
- }
- else if(0 == strcmp("server", next_argument)){
- typename = next_argument;
- dataops.type = "WSRC";
- dataops.finish_header_function = filename_finish_header_function;
- }
- else if(0 == strcmp("text", next_argument)){
- dataops.type = "TEXT";
- typename = next_argument;
- check_for_text_file = true;
- }
- else if(0 == strcmp("filename", next_argument)){
- dataops.type = "TEXT";
- typename = next_argument;
- dataops.finish_header_function = filename_finish_header_function;
- }
- #if 0 /* html format */
- else if(0 == strcmp("html", next_argument)){
- dataops.type = "HTML";
- typename = next_argument;
- dataops.separator_function = html_separator_function;
- dataops.header_function = html_header_function;
- dataops.finish_header_function = html_finish_header_function;
- }
- #endif
- else if(0 == strcmp("irg", next_argument)){
- typename = next_argument;
- dataops.type = "TEXT";
- dataops.separator_function = irg_separator_function;
- dataops.header_function = irg_header_function;
- dataops.finish_header_function = irg_finish_header_function;
- }
- /* dash-separated items , Intro to Algorithms buglist, etc */
- else if(0 == strcmp("dash", next_argument)){
- dataops.type = "TEXT";
- typename = next_argument;
- dataops.separator_function = dash_separator_function;
- dataops.header_function = dash_header_function;
- dataops.finish_header_function = dash_finish_header_function;
- }
- /* one_line-separated items */
- else if(0 == strcmp("one_line", next_argument)){
- dataops.type = "TEXT";
- typename = next_argument;
- dataops.separator_function = one_line_separator_function;
- dataops.header_function = one_line_header_function;
- dataops.finish_header_function = one_line_finish_header_function;
- }
- /* blank line-separated items (paragraphs) */
- else if(0 == strcmp("para", next_argument)){
- dataops.type = "TEXT";
- typename = next_argument;
- dataops.separator_function = para_separator_function;
- dataops.header_function = para_header_function;
- dataops.finish_header_function = para_finish_header_function;
- }
- /* seeker items */
- else if(0 == strcmp("seeker", next_argument)){
- dataops.type = "TEXT";
- typename = next_argument;
- dataops.separator_function = seeker_separator_function;
- dataops.header_function = seeker_header_function;
- dataops.finish_header_function = seeker_finish_header_function;
- }
- /* medline format */
- else if(0 == strcmp("medline", next_argument)){
- dataops.type = "TEXT";
- typename = next_argument;
- dataops.separator_function = medline_separator_function;
- dataops.header_function = medline_header_function;
- dataops.finish_header_function = medline_finish_header_function;
- }
- /* refer format */
- else if(0 == strcmp("refer", next_argument)){
- dataops.type = "TEXT";
- typename = next_argument;
- dataops.separator_function = refer_separator_function;
- dataops.header_function = refer_header_function;
- dataops.finish_header_function = refer_finish_header_function;
- }
- /* first_line format */
- else if(0 == strcmp("first_line", next_argument)){
- dataops.type = "TEXT";
- typename = next_argument;
- dataops.separator_function = first_line_separator_function;
- dataops.header_function = first_line_header_function;
- dataops.finish_header_function = first_line_finish_header_function;
- }
- /* rlin items */
- else if(0 == strcmp("rlin", next_argument)){
- dataops.type = "TEXT";
- typename = next_argument;
- dataops.separator_function = rlin_separator_function;
- dataops.header_function = rlin_header_function;
- dataops.finish_header_function = rlin_finish_header_function;
- }
- else if(0 == strcmp("dvi", next_argument)){
- typename = next_argument;
- dataops.type = "DVI";
- dataops.finish_header_function = filename_finish_header_function;
- }
- else if(0 == strcmp("ps", next_argument)){
- typename = next_argument;
- dataops.type = "PS";
- dataops.finish_header_function = filename_finish_header_function;
- }
- else if(0 == strcmp("pict", next_argument)){
- typename = next_argument;
- dataops.type = "PICT";
- dataops.finish_header_function = filename_finish_header_function;
- index_contents = false;
- }
- else if(0 == strcmp("gif", next_argument)){
- typename = next_argument;
- dataops.type = "GIF";
- dataops.finish_header_function = filename_finish_header_function;
- index_contents = false;
- }
- else if(0 == strcmp("tiff", next_argument)){
- typename = next_argument;
- dataops.type = "TIFF";
- dataops.finish_header_function = filename_finish_header_function;
- index_contents = false;
- }
- else if(0== strcmp("object", next_argument)) {
- dataops.type = "OBJECT";
- typename = next_argument;
- }
- else if(0 == strcmp("inriadoc", next_argument)){
- typename = next_argument;
- dataops.type = "TEXT";
- dataops.separator_function = NULL;
- dataops.header_function = inriadoc_header_function;
- dataops.date_function = NULL;
- dataops.finish_header_function = inriadoc_finish_header_function;
- }
- else if(0 == strcmp("fortran", next_argument)){
- typename = next_argument;
- dataops.type = "FORTRAN";
- }
- else if(0 == strcmp("paradoc", next_argument)){
- typename = next_argument;
- dataops.type = "TEXT";
- dataops.separator_function = para_separator_function;
- dataops.header_function = inriadoc_header_function;
- dataops.date_function = NULL;
- dataops.finish_header_function = inriadoc_finish_header_function;
- }
- else if(0 == strcmp("mime", next_argument)){
- typename = next_argument;
- dataops.type = "MIME";
- dataops.separator_function = mail_separator_function;
- dataops.header_function = mail_header_function;
- dataops.date_function = mail_date_function;
- dataops.finish_header_function = mail_finish_header_function;
- }
- /* BibTeX items */
- else if(0 == strcmp("bibtex", next_argument)){
- dataops.type = "TEXT";
- typename = next_argument;
- dataops.separator_function = bibtex_separator_function;
- dataops.header_function = bibtex_header_function;
- dataops.finish_header_function = bibtex_finish_header_function;
- }
- /* ?:? separated hypertext items */
- else if(0 == strcmp("nhyp", next_argument)){
- dataops.type = "TEXT";
- typename = next_argument;
- dataops.separator_function = nhyp_separator_function;
- dataops.header_function = nhyp_header_function;
- dataops.finish_header_function = nhyp_finish_header_function;
- }
- /* Uniform Resource Locators - from Nat Torkington */
- else if(0 == strcmp("URL", next_argument)) {
- dataops.type = "URL";
- typename = next_argument;
- URL_trim = s_strdup(next_arg(&argc, &argv));
- URL_prefix = s_strdup(next_arg(&argc, &argv));
- }
- else if(0 == strcmp("ziff", next_argument)){
- dataops.type = "TEXT";
- typename = next_argument;
- dataops.separator_function = ziff_separator_function;
- dataops.header_function = ziff_header_function;
- dataops.finish_header_function = ziff_finish_header_function;
- }
- #ifdef SOUND
- else if(0 == strcmp("oneline_soundex", next_argument)){
- dataops.indextype = "SOUNDEX";
- dataops.type = "TEXT";
- typename = next_argument;
- dataops.separator_function = one_line_separator_function;
- dataops.header_function = one_line_header_function;
- dataops.finish_header_function = one_line_finish_header_function;
- }
- else if(0 == strcmp("oneline_phonix", next_argument)){
- dataops.indextype = "PHONIX";
- dataops.type = "TEXT";
- typename = next_argument;
- dataops.separator_function = one_line_separator_function;
- dataops.header_function = one_line_header_function;
- dataops.finish_header_function = one_line_finish_header_function;
- }
- #endif
- #ifdef HTML
- /* HyperText Markup Language (from World Wide Web) */
- else if(0 == strcmp("html", next_argument)){
- dataops.type = "HTML";
- typename = next_argument;
- dataops.separator_function = NULL;
- dataops.header_function = html_header_function;
- dataops.finish_header_function = html_finish_header_function;
- }
- #endif /* HTML */
- else{
- panic("Don't recognize the '%s' type", next_argument);
- }
- }
- else{
- panic("Don't recognize the '%s' option", next_argument);
- }
- next_argument = next_arg(&argc, &argv);
- if (! (read_files_from_stdin || next_argument)) {
- fprintf(stderr,"No files specified\n");
- exit(0);
- }
- }
- start_of_filenames = argc_copy - argc - 1;
-
- /* check index */
- if(0 == strlen(pathname_name(index_filename))){
- waislog(WLOG_HIGH, WLOG_ERROR,
- "The pathname specified for the destination of the index files ('%s') should have a leaf filename without an extention rather than just a directory.",
- index_filename);
- exit(0);
- }
-
- #ifdef WIN32
- /* Check that we're on a partition supporting long file names */
- if (!CanCreateLongFileNames(index_filename)) {
- waislog(WLOG_HIGH, WLOG_ERROR,
- "The pathname specified for the destination of the index files ('%s') is inaccessible or does not support long filenames.",
- index_filename);
- exit(0);
- }
- #endif
-
- waislog(WLOG_MEDIUM, WLOG_INDEX, "Starting to build database %s",
- index_filename);
-
- if(0 != init_search_engine(index_filename, false, false, cm_mem_percent,
- text_size, grow_percent))
- panic("unable to initialize search engine");
-
- if(true == adding_to_existing_index){
- db = openDatabase(index_filename, false, false);
- if (db == NULL){ /* does not exist, create one */
- db = openDatabase(index_filename, true, false);
- if (db == NULL)
- panic("unable to open the database");
- }
- }
- else{
- db = openDatabase(index_filename, true, false);
- if (db == NULL)
- panic("unable to open the database");
- }
-
- #ifdef BIO
- write_delimiters(gDelimiters, db);
- #endif
-
- { /* set up the memory hashtable */
-
- if(memory_to_use < 0){ /* default */
- /* do nothing */
- }
- else if(memory_to_use <= 2){
- hashtable_size = 1L<<16;
- flush_after_n_words = 50000;
- }
-
- else if(memory_to_use <= 3){
- hashtable_size = 1L<<16;
- flush_after_n_words =850000;
- }
- else if(memory_to_use <= 4){
- hashtable_size = 1L<<16;
- flush_after_n_words = 110000;
- }
- else if(memory_to_use <= 5){
- hashtable_size = 1L<<16;
- flush_after_n_words = 150000;
- }
-
- else if(memory_to_use <= 10){
- /* shown to take about 6MB on a sun4, when it is dict limited */
- hashtable_size = 1L<<16;
- flush_after_n_words = 300000;
- }
- else if(memory_to_use <= 20){
- hashtable_size = 1L<<17;
- flush_after_n_words = 600000;
- }
- else{ /* over 20 Mbytes */
- hashtable_size = 1L<<18;
- flush_after_n_words = 1000000;
- }
- /* Set up the filter process, if needed */
- /* We do this before initing the hash table to stop the fork copying a
- load of rubbish*/
- #ifdef WIN32
- /* WIN32 Process is used here, how about thread? */
- if(filter_name) {
- HANDLE hChildStdinRd, hChildStdinWr, hChildStdinWrDup,
- hChildStdoutRd, hChildStdoutWr, hSaveStdin, hSaveStdout;
- SECURITY_ATTRIBUTES saAttr;
- BOOL fSuccess;
- STARTUPINFO siStartInfo;
- int fd;
-
- /* Set the bInheritHandle flag so pipe handles are inherited. */
-
- saAttr.nLength = sizeof(SECURITY_ATTRIBUTES);
- saAttr.bInheritHandle = TRUE;
- saAttr.lpSecurityDescriptor = NULL;
-
- /*
- * The steps for redirecting child's STDOUT:
- * 1. Save current STDOUT, to be restored later.
- * 2. Create anonymous pipe to be STDOUT for child.
- * 3. Set STDOUT of parent to be write handle of pipe, so
- * it is inherited by child.
- */
-
- /* Save the handle to the current STDOUT. */
-
- hSaveStdout = GetStdHandle(STD_OUTPUT_HANDLE);
-
- /* Create a pipe for the child's STDOUT. */
-
- if (! CreatePipe(&hChildStdoutRd, &hChildStdoutWr, &saAttr, 0))
- panic("Stdout pipe creation failed\n");
-
- /* Set a write handle to the pipe to be STDOUT. */
-
- if (! SetStdHandle(STD_OUTPUT_HANDLE, hChildStdoutWr))
- panic("Redirecting STDOUT failed");
- /*
- * The steps for redirecting child's STDIN:
- * 1. Save current STDIN, to be restored later.
- * 2. Create anonymous pipe to be STDIN for child.
- * 3. Set STDIN of parent to be read handle of pipe, so
- * it is inherited by child.
- * 4. Create a noninheritable duplicate of write handle,
- * and close the inheritable write handle.
- */
-
- /* Save the handle to the current STDIN. */
-
- hSaveStdin = GetStdHandle(STD_INPUT_HANDLE);
- /* Create a pipe for the child's STDIN. */
-
- if (! CreatePipe(&hChildStdinRd, &hChildStdinWr, &saAttr, 0))
- panic("Stdin pipe creation failed\n");
-
- /* Set a read handle to the pipe to be STDIN. */
-
- if (! SetStdHandle(STD_INPUT_HANDLE, hChildStdinRd))
- panic("Redirecting Stdin failed");
-
- /*
- * Duplicate the write handle to the pipe, so it is not
- * inherited.
- */
-
- fSuccess = DuplicateHandle(GetCurrentProcess(), hChildStdinWr,
- GetCurrentProcess(), &hChildStdinWrDup, 0,
- FALSE, /* not inherited */
- DUPLICATE_SAME_ACCESS);
- if (!fSuccess)
- panic("DuplicateHandle failed");
-
- CloseHandle(hChildStdinWr);
-
- /* Set up members of STARTUPINFO structure. */
-
- siStartInfo.cb = sizeof(STARTUPINFO);
- siStartInfo.lpReserved = NULL;
- siStartInfo.lpReserved2 = NULL;
- siStartInfo.cbReserved2 = 0;
- siStartInfo.lpDesktop = NULL;
- siStartInfo.dwFlags = 0;
-
- /* Create the child process. */
-
- fSuccess = CreateProcess(NULL,
- filter_name, /* command line */
- NULL, /* process security attributes */
- NULL, /* primary thread security attributes */
- TRUE, /* handles are inherited */
- 0, /* creation flags */
- NULL, /* use parent's environment */
- NULL, /* use parent's current directory */
- &siStartInfo, /* STARTUPINFO pointer */
- &piProcInfo); /* receives PROCESS_INFORMATION */
-
- if (!fSuccess)
- panic("Create process failed");
-
- /* After process creation, restore the saved STDIN and STDOUT. */
-
- if (! SetStdHandle(STD_INPUT_HANDLE, hSaveStdin))
- panic("Re-redirecting Stdin failed\n");
-
- if (! SetStdHandle(STD_OUTPUT_HANDLE, hSaveStdout))
- panic("Re-redirecting Stdout failed\n");
-
- fd = _open_osfhandle((long)hChildStdinWrDup, _O_APPEND);
- if (fd < 0)
- panic("Convert Win32 Handle to filter_process_in failed\n");
- filter_process_in=_fdopen(fd,"wb");
- fd = _open_osfhandle((long)hChildStdoutRd, _O_RDONLY);
- if (fd < 0)
- panic("Convert Win32 Handle to filter_process_out failed\n");
- filter_process_out=_fdopen(fd,"rb");
- waislog(WLOG_LOW, WLOG_INDEX, "Filter %s started (%d)",filter_name,
- piProcInfo.dwProcessId);
- }
-
- #else
- if(filter_name) {
- int to_handles[2];
- int from_handles[2];
- int pid;
- extern int errno;
- if (pipe(to_handles) <0) {
- panic("can't open to pipe");
- }
- if (pipe(from_handles) <0) {
- panic("can't open from pipe");
- }
-
- if((pid = fork()) ==0) {
-
- /* child */
-
- close(0);
- close(1);
- close(2);
- dup(to_handles[0]);
- dup(from_handles[1]); /* Set up standard input/output/error */
- dup(from_handles[1]);
- close(to_handles[0]);
- close(to_handles[1]);
- close(from_handles[0]);
- close(from_handles[1]);
-
- if(execl(filter_name,filter_name,NULL) == -1) {
- exit(errno);
- }
- /*NOTREACHED*/
- }
- /* parent */
-
- if (pid <0) {
- panic("Couldn't fork");
- }
- close(to_handles[0]);
- close(from_handles[1]);
-
- filter_process_in=fdopen(to_handles[1],"w");
- filter_process_out=fdopen(from_handles[0],"r");
- waislog(WLOG_LOW, WLOG_INDEX, "Filter %s started (%d)",filter_name,pid);
- }
- #endif /* WIN32 */
-
- init_add_word(db, hashtable_size, flush_after_n_words);
- }
-
- if (read_files_from_stdin) {
- if (0 != (next_argument = fgets(data_filename, MAXPATHLEN, stdin))) {
- int len = strlen(next_argument);
- if (next_argument[len-1] == '\n') {
- next_argument[len-1] = '\0';
- }
- }
- }
-
- while(NULL != next_argument){ /* the first filename is in next_argument already */
- if(directoryp(next_argument)){
- if(traverse_directory){
- index_directory(next_argument, &dataops, db,
- check_for_text_file,
- adding_to_existing_index,
- word_positions, word_pairs,
- #ifndef WIN32
- filter_process_in,filter_process_out);
- #else
- "*.*", filter_process_in,filter_process_out);
- #endif
- /* index_directory(next_argument,
- separator_function,
- header_function,
- date_function,
- finish_header_function,
- type, db,
- check_for_text_file,
- adding_to_existing_index,
- word_positions, word_pairs, minwordlen); */
- }
- }
- #ifdef WIN32
- else if ((strchr(next_argument,'*')!=NULL)||(strchr(next_argument,'?')!=NULL)) {
- /* Contains a wildcard */
- if (traverse_directory) {
- /* We're being asked to recursively index a directory tree looking for
- filenames which match the wildcarded pattern. */
- char *cp;
- char *cPath = next_argument;
-
- /* Split the argument into the directory and the wildcard mask */
- cp = strrchr(cPath,'\\');
- if (cp==NULL) {
- /* No path */
- cPath = ".";
- cp = next_argument;
- } else {
- /* Tie off path */
- *cp++ = '\0';
- /* File mask */
- if (*cp=='\0') cp = "*.*";
- }
- /* Index the directory */
- index_directory(cPath, &dataops, db,
- check_for_text_file,
- adding_to_existing_index,
- word_positions, word_pairs, cp,
- filter_process_in,filter_process_out);
-
- } else {
- /* Not recursive */
- HANDLE hSearch;
- WIN32_FIND_DATA ffd;
- char FileName[MAX_FILENAME_LEN+1];
- char *cp;
- hSearch = FindFirstFile(next_argument,&ffd);
- if (hSearch!=INVALID_HANDLE_VALUE) {
- while (TRUE) {
- if ((ffd.dwFileAttributes & FILE_ATTRIBUTE_DIRECTORY)==0) {
- /* Index the file */
- strncpy(FileName,next_argument,MAX_FILENAME_LEN);
- cp = strrchr(FileName,'\\');
- if (cp!=NULL) {
- *(cp+1) = '\0';
- strncat(FileName,ffd.cFileName,MAX_FILENAME_LEN);
- } else {
- strncpy(FileName,ffd.cFileName,MAX_FILENAME_LEN);
- }
- waislog(WLOG_MEDIUM, WLOG_INDEX, "Indexing file: %s", FileName);
- index_text_file(FileName, &dataops, db,
- check_for_text_file, adding_to_existing_index,
- word_positions, word_pairs,
- filter_process_in,filter_process_out);
- }
- if (!FindNextFile(hSearch,&ffd)) break;
- }
- FindClose(hSearch);
- }
- }
- }
- #endif
- else{ /* not a directory */
- waislog(WLOG_MEDIUM, WLOG_INDEX,
- "Indexing file: %s", next_argument);
- index_text_file(next_argument, &dataops, db,
- check_for_text_file, adding_to_existing_index,
- word_positions, word_pairs,
- filter_process_in,filter_process_out);
- /* index_text_file(next_argument,
- separator_function,
- header_function,
- date_function,
- finish_header_function,
- type, db,
- check_for_text_file, adding_to_existing_index,
- word_positions, word_pairs, minwordlen); */
- }
- if (read_files_from_stdin) {
- if (0 != (next_argument = fgets(data_filename, MAXPATHLEN, stdin))) {
- int len = strlen(next_argument);
- if (next_argument[len-1] == '\n') {
- next_argument[len-1] = '\0';
- }
- }
- }
- else {
- next_argument = next_arg(&argc, &argv);
- }
- }
- finished_add_word(db);
- retreive_keywords(db);
- {
- char filename[MAX_FILENAME_LEN + 1];
- if(!probe_file(source_filename(filename, db))){
- char database_name[MAX_FILENAME_LEN];
- write_src_structure(source_filename(filename, db),
- export_database?pathname_name(index_filename):
- truename(index_filename, database_name),
- typename,
- &argv_copy[start_of_filenames],
- argc_copy - start_of_filenames,
- export_database,
- 210L);
- }else{
- char *oldkeys[50];
- short oldKeys;
- if ((oldKeys = read_src_structure(source_filename(filename, db),
- oldkeys))) {
-
- if (compare(keyword, nKeys, oldkeys, oldKeys) > 0.1) {
- char database_name[MAX_FILENAME_LEN];
- waislog(WLOG_MEDIUM,WLOG_INDEX,
- "Keyword comparison indicates significant change.");
- waislog(WLOG_MEDIUM,WLOG_INDEX, "Rewriting source description.");
- waislog(WLOG_MEDIUM,WLOG_INDEX,
- "New source description should be exported.");
- write_src_structure(source_filename(filename, db),
- export_database?pathname_name(index_filename):
- truename(index_filename, database_name),
- typename,
- &argv_copy[start_of_filenames],
- argc_copy - start_of_filenames,
- export_database, 210L);
- }
- } else {
- char database_name[MAX_FILENAME_LEN];
- waislog(WLOG_MEDIUM,WLOG_INDEX, "No keyword list found.");
- waislog(WLOG_MEDIUM,WLOG_INDEX, "Rewriting source description.");
- waislog(WLOG_MEDIUM,WLOG_INDEX,
- "New source description should be export ed.");
- write_src_structure(source_filename(filename, db),
- export_database?pathname_name(index_filename):
- truename(index_filename, database_name),
- typename,
- &argv_copy[start_of_filenames],
- argc_copy - start_of_filenames,
- export_database,
- 210L);
- }
- }
- /* write out a description of the server if appropriate */
- if(register_database){
- register_src_structure(source_filename(filename, db));
- }
- }
- if(make_catalog) build_catalog(db);
- closeDatabase(db);
- /* wait for filter process to die, if there was one*/
-
- if(filter_process_in) {
- fprintf(filter_process_in,"Q\n");
- fflush(filter_process_in);
- fclose(filter_process_out);
- fclose(filter_process_in);
- #ifdef WIN32
- waislog(WLOG_LOW, WLOG_INDEX, "Filter %s Exited (%ld)",filter_name,
- WaitForSingleObject(piProcInfo.hProcess, INFINITE));
- #else
- waislog(WLOG_LOW, WLOG_INDEX, "Filter %s Exited (%ld)",filter_name,wait(0L));
- #endif
- }
-
- waislog(WLOG_MEDIUM, WLOG_INDEX, "Finished build");
- exit(0);
- }
-
- #ifdef WIN32
- int
- #endif
- read_src_structure(filename, output)
- char *filename;
- char *output[50];
- {
- FILE *source_stream = s_fopen(filename, "r");
- char line[MAX_LINE_LENGTH], *ptr;
- #ifdef WIN32
- int keyflag = 0, linelen, index;
- #else
- int keyflag = 0, linelen, i, index;
- #endif
- int desflag = 0;
- int tmp;
-
- index = 0;
- while (fgets(line, MAX_LINE_LENGTH, source_stream)) {
- linelen = strlen(line);
- if (keyflag) {
- if (!strncmp(" )", line, (linelen > 19) ? 19 : linelen))
- keyflag = 0;
- else {
- line[strlen(line)-1] = '\0'; /* get rid of trailing return */
- ptr = line; /* parse keyword */
- while (*ptr == ' ')
- ptr++;
- output[index] = malloc(strlen(ptr)+1);
- strcpy(output[index], ptr);
- index++;
- }
- }
- if (!strncmp(" :keyword-list (", line, (linelen > 18) ? 18 : linelen))
- keyflag = 1;
- if (!strncmp(" :description", line, (linelen > 15) ? 15 : linelen))
- desflag = 1;
- if (desflag) {
- tmp=strlen(line)+1;
- descript[nDesLines] = malloc(tmp);
- strcpy(descript[nDesLines], line);
- nDesLines++;
- if (*line == '\"')
- desflag = 0;
- }
- }
- fclose(source_stream);
- return(index);
- }
-
/*
 * compare -- measure how much keyword list a differs from keyword list b.
 *
 * a, alen: the first keyword list and its number of entries.
 * b, blen: the second keyword list and its number of entries.
 *
 * Returns the fraction (0.0 .. 1.0) of entries of a that have no exact
 * strcmp() match anywhere in b.  An empty a yields 0.0 on every platform;
 * previously only the WIN32 build guarded against dividing by zero.
 *
 * On non-WIN32 builds the raw counts are also printed to stdout.
 */
double compare(a, alen, b, blen)
char *a[50], *b[50];
short alen, blen;
{
  int changes = 0;		/* entries of a missing from b */
  int i, j;

  for (i=0; i<alen; i++) {
    for (j=0; j<blen; j++)
      if (!strcmp(a[i], b[j]))
	break;
    if (j == blen)		/* inner loop ran to the end: no match */
      changes++;
  }
#ifndef WIN32
  printf("%d out of %d\n", changes, alen);	/* info stuff */
#endif
  if (alen <= 0)		/* nothing to compare: avoid 0/0 */
    return(0.0);
  return((double) changes/alen);
}
-