home *** CD-ROM | disk | FTP | other *** search
- #!/usr/local/bin/perl
- #
- # Acknowledgements
- #
- # Thanks to Guy Brooker (guy@jw.estec.esa.nl) for his AA interface,
- # which was the starting point for this program.
- #
- # Paul Clark
- # paul@cs.arizona.edu
- #
- # Modifications
- #
- # 2/22/94 Version 1.0, shell script version Paul Clark
- # 4/21/94 Version 1.1, multiple archives support Paul Clark
- # 4/22/94 Version 1.2, perl script Paul Clark
- # 8/05/94 Version 1.3, verbosity&security Paul Clark
- #10/05/94 Version 1.4, more security, improved
- # output Paul Clark
-
# ==== Site configuration: adjust the values below for this installation ====
$HTTPD_HOME     = '/usr1/paul/httpd';
$HTTPD_NEWSHOME = '/usr1/paul/news';
$GLIMPSE_LOC    = '/usr/paul/bin/glimpse';

$CONVERT = $HTTPD_HOME . '/wwwlib/cvtwww';
$FSSERV  = '/cgi-bin/mfs';

# Regular expression tested against each result file name; lines shown
# from matching files have their HTML tags removed before display.
# Comment the assignment out to switch tag removal off.
# Present value: file names that end in ".html".
$SUPPRESS_HTML_TAGS = "\\.html\$";

# ==== Nothing below this line should need to be configured ====
-
# PATH_INFO has the form "/<archive number><rest>".  The number selects the
# archive ($script); the rest ($path) is used later as a file name filter.
# Anything else is reported through err_noscript, which does not return.
( $ENV{'PATH_INFO'} =~ m|^/([0-9][0-9]*)(.*)$| ) || err_noscript();
( $script, $path ) = ( $1, $2 );
$path =~ s|"||g;    # double quotes are dropped from the filter
-
# Look up archive number $script in the tab-separated configuration file
# $HTTPD_HOME/wwwlib/amgr.cfg.  For the first line whose 4th field equals
# $script, the 1st field becomes $indexdir and the 2nd becomes $urlpath.
# A missing file is reported by err_conf, an unknown archive by
# err_noscript; neither returns.
open(AMGRCONF, "$HTTPD_HOME/wwwlib/amgr.cfg") || err_conf();
undef $indexdir;
while (<AMGRCONF>) {
    # Remove the line terminator first: when the archive number is the last
    # field on the line it would otherwise carry "\n" and never compare equal.
    chomp;
    my @fields = split(/\t/);
    next unless defined $fields[3] && $fields[3] eq $script;
    $indexdir = $fields[0];
    $urlpath  = $fields[1];
    last;
}
close(AMGRCONF);
err_noscript() unless $indexdir;
$ENV{'HOME'} = $indexdir;    # some versions of Glimpse need it
-
# The Glimpse binary has to be executable on this machine.
err_noglimpse() unless -x $GLIMPSE_LOC;

# The archive must contain a readable Glimpse index.
err_noindex($indexdir) unless -r "$indexdir/.glimpse_index";
-
# To support an ISINDEX type search, set query string prefix if given
# an argument on the command line
$prefix = "whole=on&case=on&query=" if ( $#ARGV >= 0 );

# Check that a query has been made (err_noquery does not return)
($query = $ENV{'QUERY_STRING'}) || err_noquery();

# Split the query string into name=value pairs and store each value in the
# package variable $QS_<name>.  Only names that look like identifiers are
# accepted, so nothing outside the QS_ namespace can be set.
#
# The values are assigned through a symbolic reference instead of building
# Perl source and passing it to string eval: request data is never compiled
# as code, and single quotes in a value no longer have to be thrown away.
# Each pair is split explicitly into lexicals; the old void-context split
# into @_ is not supported by Perl 5.12 and later.  The limit of 2 keeps
# any further '=' characters as part of the value.
@qvars = split( /\&/, $prefix . $query );
foreach my $pair (@qvars) {
    my ($fname, $fvalue) = split( /=/, $pair, 2 );
    next unless defined $fname && $fname =~ /^[a-z_A-Z]\w*$/;
    $fvalue = '' unless defined $fvalue;
    no strict 'refs';
    ${"QS_$fname"} = $fvalue;
}

# URL-decode the search pattern: '+' is a space, %XX is a byte given by two
# hexadecimal digits (anything else after '%' is left alone).
$QS_query =~ s|\+| |g;
$QS_query =~ s|%([0-9a-fA-F]{2})|sprintf("%c", hex($1))|ge;

# $pquery keeps the decoded pattern as typed, for display purposes.
$pquery = $QS_query;

# $QS_query is later placed between single quotes on a shell command line;
# rewrite every ' as '"'"' so that it cannot terminate the quoting.
$QS_query =~ s|\'|\'\"\'\"\'|g;
-
# Translate the form fields into Glimpse command line options.
# Each field is matched against a fixed pattern, so only known-good text
# ends up in the option variables.
$OPT_errors = "-$QS_errors" if $QS_errors =~ /^[0-8]$/;
$OPT_errors = "-B"          if $QS_errors =~ /^Best\+match$/;
$OPT_case   = "-i"          if $QS_case   =~ /^on$/;
$OPT_whole  = "-w"      unless $QS_whole  =~ /^on$/;

# The remainder of PATH_INFO becomes the -F file name filter.
# Escape EVERY dot (the original substitution had no /g and only escaped
# the first one), and drop single quotes because the value is wrapped in
# single quotes on the shell command line.
$path =~ s/\./\\./g;
$path =~ s/\'//g;
$OPT_filter = "-F '$path'" if $path;

# Per-file line limit and overall file limit: the first run of digits in
# the corresponding field, or the defaults 20 and 100.
$maxlines = ( $QS_maxlines =~ /(\d+)/ ) ? $1 : 20;
$maxfiles = ( $QS_maxfiles =~ /(\d+)/ ) ? $1 : 100;

# Build the alternation used to highlight matches in the output: the words
# of the query joined with '|'.
$highlight = $QS_query;
$highlight =~ s/^\W+//;
$highlight = join( "|", split( /\W+/, $highlight ) );
# check if the query contains any words (err_badquery does not return)
err_badquery() if !$highlight;
$highlight = '\b(' . $highlight . ')\b' if $OPT_whole;
-
# ----------------------------------------------------------------------
# Run Glimpse in the index directory and render its output as HTML.
#
# Reads:  $indexdir, $urlpath, $script, $pquery, $QS_query, $highlight,
#         $maxlines, $maxfiles, the $OPT_* option strings, $GLIMPSE_LOC,
#         $FSSERV and $SUPPRESS_HTML_TAGS.
# Writes: the result page to STDOUT; $lcount / $fcount hold the totals.
#
# Fixes relative to the earlier version of this section:
#   * text taken from the request or from matched files is HTML-escaped
#     (the escaping substitutions had degenerated into no-ops, and the
#     query itself was never escaped);
#   * the summary shows the query as typed, not its shell-quoted form;
#   * $indexdir is matched literally, not as a regular expression;
#   * at most $maxfiles files and $maxlines lines per file are shown
#     (previously $maxfiles + 1 and $maxlines - 1);
#   * the open <UL> is always closed, also when the file limit is hit;
#   * chdir and the pipe open are checked;
#   * highlighting is marked before escaping so it cannot split an entity.
# ----------------------------------------------------------------------

# Escape the characters that are special in HTML text and attribute values.
sub _html_escape {
    my ($text) = @_;
    $text = '' unless defined $text;
    $text =~ s/&/&amp;/g;
    $text =~ s/</&lt;/g;
    $text =~ s/>/&gt;/g;
    $text =~ s/"/&quot;/g;
    return $text;
}

# Glimpse is started with "-H ." below, so the working directory has to be
# the index directory.  Check this before any output has been sent so the
# error page still gets its own header.
chdir($indexdir) || err_noindex($indexdir);

my $safe_query = _html_escape($pquery);

print "Content-type: text/html\n\n";
print "<HEAD><TITLE>Result for query \"$safe_query\"\n";
print "</TITLE></HEAD><BODY>\n";
print "<H1>Result for query \"$safe_query\"</H1><HR>\n";

# $QS_query and the -F filter were quoted for the shell when they were built.
$cmd = "exec $GLIMPSE_LOC -y -n $OPT_case $OPT_whole $OPT_errors -H . " .
       "$OPT_filter '$QS_query' 2>&1 |";
$gpid = open(GOUT, $cmd);
unless (defined $gpid) {
    print "<H3>Unable to run Glimpse</H3>\n</BODY>\n";
    diag_exit();
}

# Highlight pattern; case-insensitive when the search itself was run with -i.
my $mark_re = $OPT_case ? qr/($highlight)/i : qr/($highlight)/;

$prevfile = "";
$lcount   = 0;
$fcount   = 0;
my $list_open = 0;    # true while a per-file <UL> is open

MATCH: while (<GOUT>) {
    # Expected line shape: "<file>: <line number>:<matched text>"
    next MATCH unless /^([^:]*):\s*(\d+):(.*)/;
    ( $file, $line, $string ) = ( $1, $2, $3 );

    # Only files below the index directory are reported; strip that prefix.
    next MATCH unless $file =~ s|^\Q$indexdir\E||;
    my $safe_file = _html_escape($file);

    if ($file ne $prevfile) {
        if ($list_open) {
            print "</UL>\n";
            $list_open = 0;
        }
        if ($fcount >= $maxfiles) {
            print "<H3>Limit of $maxfiles files exceeded...</H3>\n";
            $fcount = "at least $fcount";
            $lcount = "at least $lcount";
            last MATCH;
        }
        $linecount = 0;
        $prevfile  = $file;
        print "<H3>File <A HREF=\"", $FSSERV, "/", $script, $safe_file,
              "\">/", $urlpath, $safe_file, "</A></H3><UL>\n";
        $list_open = 1;
        $fcount++;
    }

    $lcount++;
    $linecount++;
    if ($linecount > $maxlines) {
        # Announce the cut-off once, on the first suppressed line.
        print "<LI>Limit of $maxlines matched " .
              "lines per file exceeded...\n"
            if $linecount == $maxlines + 1;
        next MATCH;
    }

    if ($SUPPRESS_HTML_TAGS && $file =~ /$SUPPRESS_HTML_TAGS/o) {
        $string =~ s#\</?[a-zA-Z][^>\n]*\>?##g;
    }

    # Bracket the matches with control characters, escape the whole line,
    # then turn the brackets into <B>...</B>.
    $string =~ tr/\001\002//d;
    $string =~ s/$mark_re/\001$1\002/g;
    $string = _html_escape($string);
    $string =~ s/\001/<B>/g;
    $string =~ s/\002/<\/B>/g;

    print "<LI><A HREF=\"", $FSSERV, "/", $script, $safe_file,
          "?", $line, "#mfs\">\n";
    print "line ", $line, ":", $string, "</A>\n";
}
print "</UL>\n" if $list_open;
print "<HR>";
print "<H2>Summary for query <code>\"", $safe_query, "\":</code></H2>\n";
print "found ", $lcount, " matches in ", $fcount, " files\n";
print "</BODY>\n";
close(GOUT);

# Remove the file /tmp/.glimpse_tmp.<pid of the child started above>
# (presumably a scratch file left behind by Glimpse -- confirm).
unlink "/tmp/.glimpse_tmp.$gpid";
-
# Common exit point of all err_* handlers: terminate the script with
# exit status 1.
sub diag_exit {
    exit(1);
}
# Called when the request carries no query string.
# Prints a complete HTML page (including the Content-type header) that
# introduces Glimpse and contains an <ISINDEX> element, so that browsers
# without form support can still submit a search pattern, then exits
# through diag_exit.  Does not return.
sub err_noquery {
    # The heredoc is single-quoted: nothing in the page is interpolated.
    # (Text fix: "slower than systems tems using" -> "slower than systems
    # using"; the stray "tems" was left over from a hyphenated line break.)
    print <<'EOM' ;
Content-type: text/html

<HEAD><TITLE>Glimpse Gateway</TITLE></HEAD>
<BODY><H1>Glimpse Gateway</H1>
This is a gateway to Glimpse.
Type a pattern to search in your browser's search dialog.<P>

<ISINDEX>

<H2>What is Glimpse ?</H2>
<QUOTE>
<P>
Glimpse (which stands for GLobal IMPicit SEarch) is an
indexing and query system that allows you to search through
all your files very quickly. For example, a search for
Schwarzkopf allowing two misspelling errors in 5600 files
occupying 77MB took 7 seconds on a SUN IPC. Glimpse supports
most of agrep's options (agrep is our powerful version
of grep) including approximate matching (e.g., finding
misspelled words), Boolean queries, and even some limited
forms of regular expressions.<BR>
Glimpse's running time is typically slower than systems
using inverted indexes, but its index is an order of
magnitude smaller (typically 2-5% of the size of the files).
<H2>Authors of Glimpse</H2>
Udi Manber, Sun Wu, and Burra Gopal<BR>
<ADDRESS>
Department of Computer
Science, University of Arizona, Tucson, AZ 85721.<BR>
glimpse@cs.arizona.edu
</ADDRESS>
</QUOTE>

<HR>
<ADDRESS>
Paul Clark<BR>
paul@cs.arizona.edu<BR>
</ADDRESS>

</BODY>
EOM
    diag_exit();
}
-
# Called when the Glimpse executable cannot be run.
# Sends a complete HTML page explaining where to get Glimpse or how to
# point the script at an existing installation, then exits via diag_exit.
sub err_noglimpse {
    # Single-quoted heredoc: the page is emitted exactly as written.
    my $page = <<'EOM';
Content-type: text/html

<HEAD>
<TITLE>Glimpse not found</TITLE>
</HEAD>
<BODY>
<H1>Glimpse not found</H1>

This gateway relies on <CODE>Glimpse</CODE> search tool.
If it is installed, please set the correct path in the script file.
Otherwise obtain the latest version from
<A HREF="file://ftp.cs.arizona.edu/glimpse">ftp.cs.arizona.edu</A>
</BODY>
EOM
    print $page;
    diag_exit();
}
-
# Called when no readable Glimpse index exists for an archive.
# Argument: the directory that was expected to hold the index.
# Sends a complete HTML page recommending that the directory be indexed
# with glimpseindex, then exits via diag_exit.
sub err_noindex {
    my ($dir) = @_;
    # Double-quoted heredoc: only $dir is interpolated.
    print <<"EOM";
Content-type: text/html

<HEAD>
<TITLE>Glimpse Index not found</TITLE>
</HEAD>
<BODY>
<H1>Glimpse Index in directory '$dir' not found</H1>
Glimpse cannot proceed without index.
Please check if the directory being searched is indexed
by <code>glimpseindex</code>.
</BODY>
EOM
    diag_exit();
}
-
# Called when the archive number from PATH_INFO is missing or is not
# listed in the configuration file.
# Sends a complete HTML page naming the archive number ($script) and the
# configuration file that was consulted, then exits via diag_exit.
sub err_noscript {
    # Double-quoted heredoc: $script and $HTTPD_HOME are interpolated.
    print <<"EOM";
Content-type: text/html

<HEAD>
<TITLE>Glimpse Archive not found</TITLE>
</HEAD>
<BODY>
<H1>Glimpse Archive not found</H1>
Cannot find script "$script" in config file $HTTPD_HOME/wwwlib/amgr.cfg
</BODY>
EOM
    diag_exit();
}
-
# Called when the archive configuration file cannot be opened.
# Sends a complete HTML page naming the file, then exits via diag_exit.
sub err_conf {
    my $cfg_file = "$HTTPD_HOME/wwwlib/amgr.cfg";
    # One output line per list element; the trailing empty element makes
    # join() terminate the last line with a newline as well.
    my @page = (
        "Content-type: text/html",
        "",
        "<HEAD>",
        "<TITLE>Glimpse Archive Configuration File not found</TITLE>",
        "</HEAD>",
        "<BODY>",
        "<H1>Glimpse Archive Configuration File not found</H1>",
        "Cannot open configuration file $cfg_file",
        "</BODY>",
        "",
    );
    print join( "\n", @page );
    diag_exit();
}
-
# Called when the query contains no words to search for.
# Sends a complete HTML page that echoes the query as typed ($pquery) and
# asks the user to refine it, then exits via diag_exit.
sub err_badquery {
    # $pquery comes straight from the request, so it must be HTML-escaped
    # before being echoed back; otherwise markup in the query would be
    # interpreted by the browser.
    my $shown = defined $pquery ? $pquery : '';
    $shown =~ s/&/&amp;/g;
    $shown =~ s/</&lt;/g;
    $shown =~ s/>/&gt;/g;
    $shown =~ s/"/&quot;/g;

    print "Content-type: text/html\n\n";
    print "<HEAD>\n";
    print "<TITLE>Query is too broad</TITLE>\n";
    print "</HEAD>\n";
    print "<BODY>\n";
    print "<H1>Query is too broad</H1>\n";
    print "The query \"$shown\" doesn't contain any words and ".
          "thus will take too much time. Please refine your query.\n";
    print "</BODY>\n";
    diag_exit();
}
-