home *** CD-ROM | disk | FTP | other *** search
- #!/cc/products/prod/bin/perl
- #
- # Acknowledgements
- #
- # Thanks to Guy Brooker (guy@jw.estec.esa.nl) for his AA interface,
- # which was the starting point for this program.
- #
- # Paul Clark
- # paul@cs.arizona.edu
- #
- # Modifications
- #
- # 2/22/94 Version 1.0, shell script version Paul Clark
- # 4/21/94 Version 1.1, multiple archives support Paul Clark
- # 4/22/94 Version 1.2, perl script Paul Clark
- # 8/05/94 Version 1.3, verbosity&security Paul Clark
- #10/05/94 Version 1.4, more security, improved
- # output Paul Clark
- #12/15/94 -----------, support for individual
- # user archives Nathan Neulinger
- #1/19/95 -----------, more security, output Nathan Neulinger
- #2/2/95 -----------, improved output Nathan Neulinger
- #2/3/95 -----------, changed to POST request Nathan Neulinger
- #2/3/95 -----------, dumped amgr Nathan Neulinger
-
-
- # **** **** **** **** CONFIGURABLE VARIABLES **** **** **** ****
- $GLIMPSE_LOC="/afs/umr.edu/software/glimpse/hpux/bin/glimpse" ;
-
- # Set to use individual amgr.cfg files, instead of main dir amgr.cfg
- @USER_PWENT = getpwuid($>);
- $CONFIG_DIR = $USER_PWENT[7] . "/public_html";
- $USERID = $USER_PWENT[0];
-
- $SERVER_URL = "http://www.umr.edu/~";
-
- $BASE_URL = $SERVER_URL . $USERID;
- $PUBHTML_SEARCH = "/.*/public_html";
-
-
- # **** **** **** **** NO CONFIGURATION NEEDED BELOW **** **** **** ****
-
- # Get the CGI request data
- &get_request;
-
- # Get value and check to make sure archive was given
- $archive = $rqpairs{'archive'};
-
- if ( $archive eq "" ) { &err_noscript; }
- $indexdir = $CONFIG_DIR . "/" . $archive;
-
- # ($ENV{'HOME'} = $indexdir) || &err_noscript; # some versions of Glimpse need it
-
- # Ensure that Glimpse is available on this machine
- -x $GLIMPSE_LOC || &err_noglimpse ;
-
- # Ensure that index is available
- -r "$indexdir/.glimpse_index" || &err_noindex($indexdir) ;
-
- #
- # Fetch values of fields and stuff from form
- #
- $QS_query = $rqpairs{'query'};
- $QS_caseins = $rqpairs{'case'};
- $QS_whole = $rqpairs{'whole'};
- $QS_errors = $rqpairs{'errors'};
- $QS_maxfiles = $rqpairs{'maxfiles'};
- $QS_maxlines = $rqpairs{'maxlines'};
- $QS_showlines = $rqpairs{'showlines'};
- $QS_pathfilter = $rqpairs{'pathfilter'};
- $QS_striptags = $rqpairs{'striptags'};
- $QS_findtitle = $rqpairs{'findtitle'};
-
- #
- # Set defaults if necessary
- #
- if ( $QS_caseins eq "" ) { $QS_caseins = "on"; }
- if ( $QS_whole eq "" ) { $QS_whole = "off"; }
- if ( $QS_errors eq "" ) { $QS_errors = "0"; }
- if ( $QS_showlines eq "" ) { $QS_showlines = "off"; }
- if ( $QS_striptags eq "" ) { $QS_striptags = "off"; }
- if ( $QS_findtitle eq "" ) { $QS_findtitle = "off"; }
-
- if ($QS_maxlines =~ /\d+/) {
- $maxlines = $&;
- } else {
- $maxlines = 20;
- }
- if ($QS_maxfiles =~ /\d+/) {
- $maxfiles = $&;
- } else {
- $maxfiles = 100;
- }
-
- $QS_query =~ s|\+| |g;
- $QS_query =~ s|%(\w\w)|sprintf("%c", hex($1))|ge;
- $pquery = $QS_query;
- $QS_query =~ s|\'|\'\"\'\"\'|g;
-
- $OPT_errors="-$QS_errors" if $QS_errors =~ /^[0-8]$/;
- $OPT_errors="-B" if $QS_errors =~ /^Best\+match$/;
- $OPT_case="-i" if $QS_caseins eq "on";
- $OPT_whole="-w" if $QS_whole eq "on";
- $path =~ s/\./\\./;
- $path =~ s/\'//g;
- $OPT_filter="-F '$QS_pathfilter'" if $QS_pathfilter ne "";
-
-
- $highlight = $QS_query;
- $highlight =~ s/^\W+//;
- $highlight = join("|",split(/\W+/,$highlight));
-
- # check if the query contains any words
- &err_badquery if !$highlight;
- $highlight = '\b('.$highlight.')\b' if $OPT_whole;
-
-
- print "Content-type: text/html\n\n" ;
-
- $TITLE = "Results of your search";
- $SUBTITLE = "Query: \"$pquery\"";
-
- print "<HEAD><TITLE>$TITLE</TITLE></HEAD><BODY>\n";
- print "<H1>$TITLE</H1><HR>\n";
- print "<H2>$SUBTITLE</H2>\n";
- print "<UL>\n";
-
- chdir $indexdir;
- $cmd = "exec $GLIMPSE_LOC -y -n $OPT_case $OPT_whole $OPT_errors -H . " .
- "$OPT_filter '$QS_query' 2>&1 |";
- $gpid = open(GOUT, $cmd );
-
- $prevfile = "";
- $lcount = 0;
- $fcount = 0;
- line: while (<GOUT>) {
-
- ( /^([^ :]*):\s*(\d+):(.*)/ ) || next;
- $file = $1;
- $line = $2;
- $string = $3;
-
- if ( $QS_striptags eq "on" )
- {
- $string =~ s|<[^>]*>||g;
- }
- else
- {
- $string =~ s/&/\&/g;
- $string =~ s/</\</g;
- $string =~ s/>/\>/g;
- }
-
- $file_url = $file;
- $file_path = $file;
- $file_url =~ s:$PUBHTML_SEARCH:$BASE_URL:;
-
- next unless $file =~ s|$PUBHTML_SEARCH||o;
- if ($file ne $prevfile) {
- $linecount = 0;
- if ($fcount>$maxfiles) {
- print "<P></UL><LI><B>More than $maxfiles files were found.</B>\n";
- $file = "";
- $fcount = "at least $fcount";
- $lcount = "at least $lcount";
- last line;
- }
- print "</UL>" if ( $prevfile ne "" );
- $prevfile = $file ;
-
- print "<BR>" if ($QS_showlines eq "on");
- print "<LI><CODE>";
- $file_title = &ExtractTitle($file_path);
- if ($file_title eq "")
- {
- print "File Name: ";
- $file_title = $file;
- }
- else
- {
- print "File Title: ";
- }
- print "</CODE><A HREF=\"$file_url\">$file_title</A>\n";
-
-
- print "<UL>\n"; # Beginning of line listing
- $fcount++ ;
- }
- $lcount++ ;
- $linecount++;
-
- if ($QS_showlines eq "on")
- {
-
- if ($linecount>=$maxlines) {
- print "<LI><B>More than $maxlines line matched.\n</B>" if
- $linecount==$maxlines;
- next line;
- }
- if ($OPT_case) {
- $string =~ s#$highlight#<B>$&</B>#gio;
- } else {
- $string =~ s#$highlight#<B>$&</B>#go;
- }
-
- print "<LI><CODE>Line #", $line, ": </CODE>", $string, "\n";
- }
-
- }
-
- print "</UL>\n";
- print "</UL>\n" if $file ;
- print "<HR>" ;
-
- print "<H2>Query \"<CODE>$QS_query</CODE>\" found ";
- print "$lcount matches in $fcount files</H2>\n";
-
- print "</BODY>\n" ;
- close(GOUT);
- unlink "/tmp/.glimpse_tmp.$gpid";
-
-
- sub ExtractTitle
- {
- local ($FILE) = @_;
- local ($TITLE);
-
- if ( $QS_findtitle eq "on" )
- {
- $TEXT = `head $FILE`;
-
- $TEXT =~ s|<TITLE>(.*)</TITLE>||i;
- $TITLE = $1;
- $TITLE =~ s|<[^>]*>||g;
- }
-
- return ($TITLE);
- }
-
-
-
- sub diag_exit {
- # exit on error
- exit 1;
- }
- sub err_noquery {
- # The script was called without a query.
- # Provide an ISINDEX type response for browsers
- # without form support.
- print <<'EOM' ;
- Content-type: text/html
-
- <HEAD><TITLE>Glimpse Gateway</TITLE></HEAD>
- <BODY><H1>Glimpse Gateway</H1>
-
- <H2>What is Glimpse ?</H2>
- <QUOTE>
- <P>
- Glimpse (which stands for GLobal IMPicit SEarch) is an
- indexing and query system that allows you to search through
- all your files very quickly. For example, a search for
- Schwarzkopf allowing two misspelling errors in 5600 files
- occupying 77MB took 7 seconds on a SUN IPC. Glimpse supports
- most of agrep's options (agrep is our powerful version
- of grep) including approximate matching (e.g., finding
- misspelled words), Boolean queries, and even some limited
- forms of regular expressions.<BR>
- Glimpse's running time is typically slower than systems
- tems using inverted indexes, but its index is an order of
- magnitude smaller (typically 2-5% of the size of the files).
- <H2>Authors of Glimpse</H2>
- Udi Manber, Sun Wu, and Burra Gopal<BR>
- <ADDRESS>
- Department of Computer
- Science, University of Arizona, Tucson, AZ 85721.<BR>
- glimpse@cs.arizona.edu
- </ADDRESS>
- </QUOTE>
-
- <HR>
- <ADDRESS>
- Paul Clark<BR>
- paul@cs.arizona.edu<BR>
- </ADDRESS>
-
- </BODY>
- EOM
- &diag_exit;
- }
-
- sub err_noglimpse {
- #
- # Glimpse was not found
- # Report a useful message
- #
- print <<'EOM' ;
- Content-type: text/html
-
- <HEAD>
- <TITLE>Glimpse not found</TITLE>
- </HEAD>
- <BODY>
- <H1>Glimpse not found</H1>
-
- This gateway relies on <CODE>Glimpse</CODE> search tool.
- If it is installed, please set the correct path in the script file.
- Otherwise obtain the latest version from
- <A HREF="file://ftp.cs.arizona.edu/glimpse">ftp.cs.arizona.edu</A>
- </BODY>
- EOM
- &diag_exit;
- }
-
- sub err_noindex {
- local ($indexdir) = @_;
- # Glimpse index was not found
- # Give recommendations for indexing
- print "Content-type: text/html\n\n";
- print "<HEAD>\n";
- print "<TITLE>Glimpse Index not found</TITLE>\n";
- print "</HEAD>\n";
- print "<BODY>\n";
- print "<H1>Glimpse Index in directory '$indexdir' not found</H1>\n";
- print "Glimpse cannot proceed without index.\n";
- print "Please check if the directory being searched is indexed\n";
- print "by <code>glimpseindex</code>.\n";
- print "</BODY>\n";
- &diag_exit;
- }
-
- sub err_noscript {
- # Glimpse archive was not found
- print "Content-type: text/html\n\n";
- print "<HEAD>\n";
- print "<TITLE>Glimpse Archive not found</TITLE>\n";
- print "</HEAD>\n";
- print "<BODY>\n";
- print "<H1>Glimpse Archive not found</H1>\n";
- print "Cannot find script \"$script\" in config file ".
- "$AMGR_CFG_LOC\n";
- print "</BODY>\n";
- &diag_exit;
- }
-
- #sub err_conf {
- ## Glimpse archive Configuration File was not found
- # print "Content-type: text/html\n\n";
- # #print "<HEAD>\n";
- # print "<TITLE>Glimpse Archive Configuration File not found</TITLE>\n";
- # print "</HEAD>\n";
- # print "<BODY>\n";
- # print "<H1>Glimpse Archive Configuration File not found</H1>\n";
- # print "Cannot open configuration file $AMGR_CFG_LOC\n";
- # print "</BODY>\n";
- # &diag_exit;
- #}
-
- sub err_badquery {
- print "Content-type: text/html\n\n";
- print "<HEAD>\n";
- print "<TITLE>Query is too broad</TITLE>\n";
- print "</HEAD>\n";
- print "<BODY>\n";
- print "<H1>Query is too broad</H1>\n";
- print "The query \"$pquery\" doesn't contain any words and ".
- "thus will take too much time. Please refine your query.\n";
- print "</BODY>\n";
- &diag_exit;
- }
-
-
-
-
-
-
- #############################################################################
- #
- # The following routines are for handling cgi requests
- #
-
-
- # Author:
- # James Tappin: sjt@xun8.sr.bham.ac.uk
- # School of Physics & Space Research University of Birmingham
- # Feb 1993.
-
- # Copyright & Disclaimer.
- # This set of routines may be freely distributed, modified and
- # used, provided this copyright & disclaimer remains intact.
- # This package is used at your own risk, if it does what you
- # want, good; if it doesn't, modify it or use something else--but
- # don't blame me. Support level = negligable (i.e. mail bugs but
- # not requests for extensions)
-
- sub get_request {
-
- # Subroutine get_request reads the POST or GET form request from STDIN
- # into the variable $request, and then splits it into its
- # name=value pairs in the associative array %rqpairs.
- # The number of bytes is given in the environment variable
- # CONTENT_LENGTH which is automatically set by the request generator.
-
- # Encoded HEX values and spaces are decoded in the values at this
- # stage.
-
- # $request will contain the RAW request. N.B. spaces and other
- # special characters are not handler in the name field.
-
- if ($ENV{'REQUEST_METHOD'} eq "POST") {
- read(STDIN, $request, $ENV{'CONTENT_LENGTH'});
- } elsif ($ENV{'REQUEST_METHOD'} eq "GET" ) {
- $request = $ENV{'QUERY_STRING'};
- }
-
- %rqpairs = &url_decode(split(/[&=]/, $request));
- }
-
- sub url_decode {
-
- # Decode a URL encoded string or array of strings
- # + -> space
- # %xx -> character xx
-
- foreach (@_) {
- tr/+/ /;
- s/%(..)/pack("c",hex($1))/ge;
- }
- @_;
- }
-
-
-
-
-