Skunkware 5

home *** CD-ROM | disk | FTP | other *** search

/ Skunkware 5 / Skunkware 5.iso / src / Tools / glimpsehttp / wwwlib / contrib / aglimpse.umr next >

Wrap

Text File | 1995-05-16 | 10.7 KB | 434 lines

#!/cc/products/prod/bin/perl # # Acknowledgements # # Thanks to Guy Brooker (guy@jw.estec.esa.nl) for his AA interface, # which was the starting point for this program. # # Paul Clark # paul@cs.arizona.edu # # Modifications # # 2/22/94 Version 1.0, shell script version Paul Clark # 4/21/94 Version 1.1, multiple archives support Paul Clark # 4/22/94 Version 1.2, perl script Paul Clark # 8/05/94 Version 1.3, verbosity&security Paul Clark #10/05/94 Version 1.4, more security, improved # output Paul Clark #12/15/94 -----------, support for individual # user archives Nathan Neulinger #1/19/95 -----------, more security, output Nathan Neulinger #2/2/95 -----------, improved output Nathan Neulinger #2/3/95 -----------, changed to POST request Nathan Neulinger #2/3/95 -----------, dumped amgr Nathan Neulinger # **** **** **** **** CONFIGURABLE VARIABLES **** **** **** **** $GLIMPSE_LOC="/afs/umr.edu/software/glimpse/hpux/bin/glimpse" ; # Set to use individual amgr.cfg files, instead of main dir amgr.cfg @USER_PWENT = getpwuid($>); $CONFIG_DIR = $USER_PWENT[7] . "/public_html"; $USERID = $USER_PWENT[0]; $SERVER_URL = "http://www.umr.edu/~"; $BASE_URL = $SERVER_URL . $USERID; $PUBHTML_SEARCH = "/.*/public_html"; # **** **** **** **** NO CONFIGURATION NEEDED BELOW **** **** **** **** # Get the CGI request data &get_request; # Get value and check to make sure archive was given $archive = $rqpairs{'archive'}; if ( $archive eq "" ) { &err_noscript; } $indexdir = $CONFIG_DIR . "/" . $archive; # ($ENV{'HOME'} = $indexdir) || &err_noscript; # some versions of Glimpse need it # Ensure that Glimpse is available on this machine -x $GLIMPSE_LOC || &err_noglimpse ; # Ensure that index is available -r "$indexdir/.glimpse_index" || &err_noindex($indexdir) ; # # Fetch values of fields and stuff from form # $QS_query = $rqpairs{'query'}; $QS_caseins = $rqpairs{'case'}; $QS_whole = $rqpairs{'whole'}; $QS_errors = $rqpairs{'errors'}; $QS_maxfiles = $rqpairs{'maxfiles'}; $QS_maxlines = $rqpairs{'maxlines'}; $QS_showlines = $rqpairs{'showlines'}; $QS_pathfilter = $rqpairs{'pathfilter'}; $QS_striptags = $rqpairs{'striptags'}; $QS_findtitle = $rqpairs{'findtitle'}; # # Set defaults if necessary # if ( $QS_caseins eq "" ) { $QS_caseins = "on"; } if ( $QS_whole eq "" ) { $QS_whole = "off"; } if ( $QS_errors eq "" ) { $QS_errors = "0"; } if ( $QS_showlines eq "" ) { $QS_showlines = "off"; } if ( $QS_striptags eq "" ) { $QS_striptags = "off"; } if ( $QS_findtitle eq "" ) { $QS_findtitle = "off"; } if ($QS_maxlines =~ /\d+/) { $maxlines = $&; } else { $maxlines = 20; } if ($QS_maxfiles =~ /\d+/) { $maxfiles = $&; } else { $maxfiles = 100; } $QS_query =~ s|\+| |g; $QS_query =~ s|%(\w\w)|sprintf("%c", hex($1))|ge; $pquery = $QS_query; $QS_query =~ s|\'|\'\"\'\"\'|g; $OPT_errors="-$QS_errors" if $QS_errors =~ /^[0-8]$/; $OPT_errors="-B" if $QS_errors =~ /^Best\+match$/; $OPT_case="-i" if $QS_caseins eq "on"; $OPT_whole="-w" if $QS_whole eq "on"; $path =~ s/\./\\./; $path =~ s/\'//g; $OPT_filter="-F '$QS_pathfilter'" if $QS_pathfilter ne ""; $highlight = $QS_query; $highlight =~ s/^\W+//; $highlight = join("|",split(/\W+/,$highlight)); # check if the query contains any words &err_badquery if !$highlight; $highlight = '\b('.$highlight.')\b' if $OPT_whole; print "Content-type: text/html\n\n" ; $TITLE = "Results of your search"; $SUBTITLE = "Query: \"$pquery\""; print "<HEAD><TITLE>$TITLE</TITLE></HEAD><BODY>\n"; print "<H1>$TITLE</H1><HR>\n"; print "<H2>$SUBTITLE</H2>\n"; print "<UL>\n"; chdir $indexdir; $cmd = "exec $GLIMPSE_LOC -y -n $OPT_case $OPT_whole $OPT_errors -H . " . "$OPT_filter '$QS_query' 2>&1 |"; $gpid = open(GOUT, $cmd ); $prevfile = ""; $lcount = 0; $fcount = 0; line: while (<GOUT>) { ( /^([^ :]*):\s*(\d+):(.*)/ ) || next; $file = $1; $line = $2; $string = $3; if ( $QS_striptags eq "on" ) { $string =~ s|<[^>]*>||g; } else { $string =~ s/&/\&/g; $string =~ s/</\</g; $string =~ s/>/\>/g; } $file_url = $file; $file_path = $file; $file_url =~ s:$PUBHTML_SEARCH:$BASE_URL:; next unless $file =~ s|$PUBHTML_SEARCH||o; if ($file ne $prevfile) { $linecount = 0; if ($fcount>$maxfiles) { print "<P></UL><LI><B>More than $maxfiles files were found.</B>\n"; $file = ""; $fcount = "at least $fcount"; $lcount = "at least $lcount"; last line; } print "</UL>" if ( $prevfile ne "" ); $prevfile = $file ; print "<BR>" if ($QS_showlines eq "on"); print "<LI><CODE>"; $file_title = &ExtractTitle($file_path); if ($file_title eq "") { print "File Name: "; $file_title = $file; } else { print "File Title: "; } print "</CODE><A HREF=\"$file_url\">$file_title</A>\n"; print "<UL>\n"; # Beginning of line listing $fcount++ ; } $lcount++ ; $linecount++; if ($QS_showlines eq "on") { if ($linecount>=$maxlines) { print "<LI><B>More than $maxlines line matched.\n</B>" if $linecount==$maxlines; next line; } if ($OPT_case) { $string =~ s#$highlight#<B>$&</B>#gio; } else { $string =~ s#$highlight#<B>$&</B>#go; } print "<LI><CODE>Line #", $line, ": </CODE>", $string, "\n"; } } print "</UL>\n"; print "</UL>\n" if $file ; print "<HR>" ; print "<H2>Query \"<CODE>$QS_query</CODE>\" found "; print "$lcount matches in $fcount files</H2>\n"; print "</BODY>\n" ; close(GOUT); unlink "/tmp/.glimpse_tmp.$gpid"; sub ExtractTitle { local ($FILE) = @_; local ($TITLE); if ( $QS_findtitle eq "on" ) { $TEXT = `head $FILE`; $TEXT =~ s|<TITLE>(.*)</TITLE>||i; $TITLE = $1; $TITLE =~ s|<[^>]*>||g; } return ($TITLE); } sub diag_exit { # exit on error exit 1; } sub err_noquery { # The script was called without a query. # Provide an ISINDEX type response for browsers # without form support. print <<'EOM' ; Content-type: text/html <HEAD><TITLE>Glimpse Gateway</TITLE></HEAD> <BODY><H1>Glimpse Gateway</H1> <H2>What is Glimpse ?</H2> <QUOTE> <P> Glimpse (which stands for GLobal IMPicit SEarch) is an indexing and query system that allows you to search through all your files very quickly. For example, a search for Schwarzkopf allowing two misspelling errors in 5600 files occupying 77MB took 7 seconds on a SUN IPC. Glimpse supports most of agrep's options (agrep is our powerful version of grep) including approximate matching (e.g., finding misspelled words), Boolean queries, and even some limited forms of regular expressions.<BR> Glimpse's running time is typically slower than systems tems using inverted indexes, but its index is an order of magnitude smaller (typically 2-5% of the size of the files). <H2>Authors of Glimpse</H2> Udi Manber, Sun Wu, and Burra Gopal<BR> <ADDRESS> Department of Computer Science, University of Arizona, Tucson, AZ 85721.<BR> glimpse@cs.arizona.edu </ADDRESS> </QUOTE> <HR> <ADDRESS> Paul Clark<BR> paul@cs.arizona.edu<BR> </ADDRESS> </BODY> EOM &diag_exit; } sub err_noglimpse { # # Glimpse was not found # Report a useful message # print <<'EOM' ; Content-type: text/html <HEAD> <TITLE>Glimpse not found</TITLE> </HEAD> <BODY> <H1>Glimpse not found</H1> This gateway relies on <CODE>Glimpse</CODE> search tool. If it is installed, please set the correct path in the script file. Otherwise obtain the latest version from <A HREF="file://ftp.cs.arizona.edu/glimpse">ftp.cs.arizona.edu</A> </BODY> EOM &diag_exit; } sub err_noindex { local ($indexdir) = @_; # Glimpse index was not found # Give recommendations for indexing print "Content-type: text/html\n\n"; print "<HEAD>\n"; print "<TITLE>Glimpse Index not found</TITLE>\n"; print "</HEAD>\n"; print "<BODY>\n"; print "<H1>Glimpse Index in directory '$indexdir' not found</H1>\n"; print "Glimpse cannot proceed without index.\n"; print "Please check if the directory being searched is indexed\n"; print "by <code>glimpseindex</code>.\n"; print "</BODY>\n"; &diag_exit; } sub err_noscript { # Glimpse archive was not found print "Content-type: text/html\n\n"; print "<HEAD>\n"; print "<TITLE>Glimpse Archive not found</TITLE>\n"; print "</HEAD>\n"; print "<BODY>\n"; print "<H1>Glimpse Archive not found</H1>\n"; print "Cannot find script \"$script\" in config file ". "$AMGR_CFG_LOC\n"; print "</BODY>\n"; &diag_exit; } #sub err_conf { ## Glimpse archive Configuration File was not found # print "Content-type: text/html\n\n"; # #print "<HEAD>\n"; # print "<TITLE>Glimpse Archive Configuration File not found</TITLE>\n"; # print "</HEAD>\n"; # print "<BODY>\n"; # print "<H1>Glimpse Archive Configuration File not found</H1>\n"; # print "Cannot open configuration file $AMGR_CFG_LOC\n"; # print "</BODY>\n"; # &diag_exit; #} sub err_badquery { print "Content-type: text/html\n\n"; print "<HEAD>\n"; print "<TITLE>Query is too broad</TITLE>\n"; print "</HEAD>\n"; print "<BODY>\n"; print "<H1>Query is too broad</H1>\n"; print "The query \"$pquery\" doesn't contain any words and ". "thus will take too much time. Please refine your query.\n"; print "</BODY>\n"; &diag_exit; } ############################################################################# # # The following routines are for handling cgi requests # # Author: # James Tappin: sjt@xun8.sr.bham.ac.uk # School of Physics & Space Research University of Birmingham # Feb 1993. # Copyright & Disclaimer. # This set of routines may be freely distributed, modified and # used, provided this copyright & disclaimer remains intact. # This package is used at your own risk, if it does what you # want, good; if it doesn't, modify it or use something else--but # don't blame me. Support level = negligable (i.e. mail bugs but # not requests for extensions) sub get_request { # Subroutine get_request reads the POST or GET form request from STDIN # into the variable $request, and then splits it into its # name=value pairs in the associative array %rqpairs. # The number of bytes is given in the environment variable # CONTENT_LENGTH which is automatically set by the request generator. # Encoded HEX values and spaces are decoded in the values at this # stage. # $request will contain the RAW request. N.B. spaces and other # special characters are not handler in the name field. if ($ENV{'REQUEST_METHOD'} eq "POST") { read(STDIN, $request, $ENV{'CONTENT_LENGTH'}); } elsif ($ENV{'REQUEST_METHOD'} eq "GET" ) { $request = $ENV{'QUERY_STRING'}; } %rqpairs = &url_decode(split(/[&=]/, $request)); } sub url_decode { # Decode a URL encoded string or array of strings # + -> space # %xx -> character xx foreach (@_) { tr/+/ /; s/%(..)/pack("c",hex($1))/ge; } @_; }