home *** CD-ROM | disk | FTP | other *** search
- From: scott@cs.odu.edu (Scott Yelich)
- Newsgroups: comp.sources.wanted,comp.lang.c,alt.sources
- Subject: Re: key word searches in text files
- Message-ID: <SCOTT.90Apr18133215@croaker.cs.odu.edu>
- Date: 18 Apr 90 17:32:15 GMT
-
- >
- > We maintain a large mass of text files on the board and I would like
- > for a caller to be able to look for key words in those files WITHOUT having
- > to use [e]grep to bang away at the files... This is the method we're
- > currently using. It works, but is slow not to mention the wear and
- > tear on the HD.
- >
- > Are there any programs (or suggestions) that will take a text file and
- > create a key-word list, ignore the usual non-essential words (and,
- > the, at , etc)?
-
- I call this program ``lookfor'' and I use it on my text file databases...
- I was going to submit it later this year... perhaps you could modify
- it to your liking...
-
- Currently this program is used to keep an ALTERNATE database of
- ``help'' files for consultants. It works off a file such as this:
-
- ------------------------------------------------------------------------------
- SUBJECT : Scott D. Yelich
- TITLE : Scott D. Yelich
- INFO : Wednesday- September 6, 1989 | Wednesday- September 6, 1989 [scott]
- KEY WORDS: scott d. yelich information mail call lives house home where why
- when etc
-
- Permanent mailing address:
-
- Scott D. Yelich
- 4 Kensington Ct.
- Williamsburg, Va. 23185
- (804) 565-1811
- ------------------------------------------------------------------------------
-
- Of course, it is currently configured for OUR local system... but it
- should work without too many problems on almost any unix system.
-
- I am currently working on re-writing this to make the code faster.
-
- Have fun!
-
-
- -----------------------------------------------------------------------------
- Scott D. Yelich scott@cs.odu.edu [128.82.8.1]
- After he pushed me off the cliff, he asked me, as I fell, ``Why'd you jump?''
- Administrator of: Game design requests to <game-design-request@cs.odu.edu>
- -----------------------------------------------------------------------------
-
-
- #!/bin/sh
- # This is a shell archive.
- # Run the file through sh to extract its contents.
- # shar: Shell Archiver
- # Run the following text with /bin/sh to create:
- # ./lookfor
- # This archive created: Wed Apr 18 13:26:37 1990
- cat << \SHAR_EOF > ./lookfor
- #!/bin/sh
-
- # +--------------------------------------------------------------------------+
- # | Scott D. Yelich xanth.uucp |
- # | C.S. Department scott@cs.odu.edu |
- # | Old Dominion University scott@xanth.cs.odu.edu |
- # | Norfolk, VA 23529-0612 [ 128.82.8.1 ] |
- # +--------------------------------------------------------------------------+
-
- # +--------------------------------------------------------------------------+
- # | Copyright 1989 Scott D. Yelich. All rights reserved. Last Mod: 8/8/89 |
- # +--------------------------------------------------------------------------+
-
INITIALIZE_VARIABLES ()
{
  #
  # Function: INITIALIZE_VARIABLES
  #
  # Sets every global the other functions read: absolute paths of the
  # external tools, option flags, database locations and temp-file names,
  # then parses the word list in $VARS.
  #
  # Reads:  VARS      - command-line words followed by the $UPROGNAME sentinel
  #         UPROGNAME - upper-cased program name (set by the start-up code)
  #         PAGER, EDITOR - honoured when already set in the environment
  #
  # NOTE(review): the tool paths, server names and library directories are
  # site-specific values; adjust them before running this anywhere else.
  #

  #
  # Initialize external executable paths (Who needs a path variable anyway!).
  #
  PATH=
  LS=/bin/ls
  TR=/bin/tr
  CP=/bin/cp
  MV=/bin/mv
  RM=/bin/rm
  AWK=/bin/awk
  # Deliberately NOT named PWD: that is the shell's own working-directory
  # variable and must not be overwritten with a program path.
  PWD_CMD=/bin/pwd
  CAT=/bin/cat
  SED=/bin/sed
  EXPR=/bin/expr
  SORT=/bin/sort
  ECHO=/bin/echo
  DIFF=/bin/diff
  TAIL=/usr/ucb/tail
  MAIL=/usr/ucb/mail
  EGREP=/bin/egrep
  COLRM=/usr/ucb/colrm
  HELPMAN=/usr/ucb/man
  YPWHICH=/bin/ypwhich
  BASENAME=/bin/basename

  #
  # Initialize internal variables and flags...
  #
  SHOW=
  SYNC=
  TSKGI=
  NAIVE=
  OUTPUT=
  LOCATE=
  HEADER=
  MANUAL=
  VERBOSE=
  OPTIONS=
  ADD_FILE=
  CONFIGURE=
  INITIALIZE=
  UPDATE_FILE=
  MATCH_WORDS=
  FIND_KEYWORD=
  LIST_KEYWORDS=

  #
  # Initialize others and miscellaneous...
  #
  # CR is only used as replacement text of the sed "s" commands in VIEW,
  # where a newline has to be preceded by a backslash; hence backslash
  # followed by a literal newline.
  CR='\
'
  UNIQ=$$
  TMP=/tmp
  NULL=/dev/null
  umask 117
  TABS=" "
  INFO=":INFO :"
  TITLE=":TITLE :"
  SUBJECT=":SUBJECT :"
  KEYWORD=":KEY WORDS:"
  WHOIAM=`/usr/ucb/whoami`
  SERVERS="helios yucca tuna nansen granite hengest"
  MASTER_LIB=/usr/amon-re/local/lib/sysman
  MASTER_KEYWORDS=$MASTER_LIB/KEYWORDS
  MASTER_FILENAMES=$MASTER_LIB/FILENAMES
  NFSERVER=`$YPWHICH | $TR "." " " | $AWK '{print $1}'`
  LIB=/usr/$NFSERVER/local/lib/sysman
  PWD_DIRECTORY=`$PWD_CMD`
  TEMP_1=$TMP/"$UPROGNAME"_"$UNIQ"_TEMP_1
  TEMP_2=$TMP/"$UPROGNAME"_"$UNIQ"_TEMP_2
  TEMP_3=$TMP/"$UPROGNAME"_"$UNIQ"_TEMP_3

  #
  # Option parsing.  Each pass looks at the PREVIOUS word ($OPTION) with the
  # current word ($VAR) available as its argument:
  #   OPTIONS=Yes  the option consumed $VAR as its argument
  #   OPTIONS=     the option was a plain flag
  #   OPTIONS=No   $OPTION was not an option, i.e. it is a search word
  # The trailing $UPROGNAME sentinel in $VARS gives the last real word a
  # "next word" so that it is processed too.
  # $VARS is expanded unquoted on purpose: it is a blank-separated list.
  #
  OPTION=
  for VAR in $VARS
  do
    OPTIONS=No
    case "$OPTION" in
      -add|-a)              ADD_FILE=$VAR                ; OPTIONS=Yes ;;
      -list|-l)             SHOW=Yes                     ; OPTIONS=    ;;
      -help|-h)             MANUAL=Yes                   ; OPTIONS=    ;;
      -find|-f)             FIND_KEYWORD=Yes             ; OPTIONS=    ;;
      # -guru selects key G; LIST and VIEW both have a dedicated G arm.
      -guru|-g)             TSKGI="G $TSKGI"             ; OPTIONS=    ;;
      -info|-i)             TSKGI="I $TSKGI"             ; OPTIONS=    ;;
      -match|-m)            MATCH_WORDS=Yes              ; OPTIONS=    ;;
      -title|-t)            TSKGI="T $TSKGI"             ; OPTIONS=    ;;
      -update|-up|-u)       UPDATE_FILE=$MASTER_LIB/$VAR ; OPTIONS=Yes ;;
      -output|-out|-o)      OUTPUT=$VAR                  ; OPTIONS=Yes ;;
      -subject|-sub|-s)     TSKGI="S $TSKGI"             ; OPTIONS=    ;;
      -keyword|-key|-k)     TSKGI="K $TSKGI"             ; OPTIONS=    ;;
      -verbose|-verb|-v)    VERBOSE=Yes                  ; OPTIONS=    ;;
      -database|-data|-d)   LIB=$VAR                     ; OPTIONS=Yes ;;
      -configure|-conf|-c)  CONFIGURE=Yes                ; OPTIONS=    ;;
      -elaborate|-elab|-e)  LOCATE="$LOCATE$VAR "        ; OPTIONS=Yes ;;
      --conf|--c)           CONFIGURE=No                 ; OPTIONS=    ;;
      -sync|-synk)          SYNC=Yes                     ; OPTIONS=    ;;
      -naive|-n)            NAIVE=Yes                    ; OPTIONS=    ;;
    esac
    if [ "$OPTIONS" = "No" ]; then
      # Not an option: keep it as a search word unless it is empty or the
      # end-of-list sentinel.
      if [ -n "$OPTION" ] && [ "$OPTION" != "$UPROGNAME" ]; then
        LOCATE="$LOCATE$OPTION "
      fi
    fi
    if [ "$OPTIONS" != "Yes" ]; then
      OPTION=$VAR
    else
      # $VAR was used up as an argument; do not examine it as an option.
      OPTION=
    fi
  done

  # Derived only now so that -database also redirects the index files.
  KEYWORDS=$LIB/KEYWORDS
  FILENAMES=$LIB/FILENAMES

  if [ -z "$PAGER" ]; then
    PAGER=/usr/new/less
  fi
  if [ -z "$EDITOR" ]; then
    EDITOR="/usr/new/emacs -nw"
  fi
  # No search words at all: fall back to showing the manual page.
  if [ ! "$LOCATE" ]; then
    MANUAL=Yes
  fi
  # Default search field is the KEY WORDS line.
  if [ ! "$TSKGI" ]; then
    TSKGI="K"
  fi
  if [ "$VERBOSE" ]; then
    $ECHO "$UPROGNAME: Executing."
    $ECHO "$UPROGNAME: Initializing."
  fi
  :
}
-
MANPAGE ()
{
  #
  # Function: MANPAGE
  #
  # When $MANUAL is set, shows the manual page for $LPROGNAME through
  # $HELPMAN and terminates the script with status 0.  Otherwise returns
  # without doing anything.
  #
  if [ "$VERBOSE" ]; then
    $ECHO "$UPROGNAME: Checking variables for displaying manual page."
  fi
  if [ "$MANUAL" ]; then
    if [ "$VERBOSE" ]; then
      $ECHO "$UPROGNAME: Displaying the manual page for $UPROGNAME."
    fi
    # This function exits the script, so CLEAN never runs: drop our own
    # scratch files here instead of leaving them behind.
    $RM -f "$TEMP_1" "$TEMP_2" "$TEMP_3" 2>"$NULL"
    $HELPMAN "$LPROGNAME"
    exit 0
  fi
  :
}
-
NOTIFY ()
{
  #
  # Function: NOTIFY
  #
  # Mails the recipient "system" a notice that $FILENAME was updated.
  # The message body ends with a "~r $UPDATE_FILE" line.
  # NOTE(review): "~r" is presumably meant as the mailer's read-file tilde
  # escape, so that the file itself is included - confirm that $MAIL
  # honours tilde escapes when its input is not a terminal.
  #
  if [ "$VERBOSE" ]; then
    $ECHO "$UPROGNAME: Notify systems."
  fi
  $MAIL -s "Sysman updated: $FILENAME" system <<EOF_NOTIFY >"$NULL"

I updated sysman...

Here is the new "$FILENAME", please take a look at it.
If you want to change this file, use "sysman -v -u $FILENAME"

Thank you.

---8<--[ CUT HERE ]----------------------------------------------------------
~r $UPDATE_FILE

EOF_NOTIFY
  :
}
-
INSTALL ()
{
  #
  # Function: INSTALL
  #
  # Copies $UPDATE_FILE to /usr/<server>/local/lib/sysman/$FILENAME for
  # every server named in $SERVERS.  When $FILENAME is empty it is derived
  # from the base name of $UPDATE_FILE.
  #
  if [ ! "$FILENAME" ]; then
    FILENAME=`$BASENAME "$UPDATE_FILE"`
  fi
  if [ "$VERBOSE" ]; then
    $ECHO "$UPROGNAME: Installing- $FILENAME"
  fi
  # $SERVERS is a blank-separated list, hence unquoted; the paths are
  # quoted so that unusual file names reach cp as a single argument.
  for MACHINE in $SERVERS
  do
    $CP "$UPDATE_FILE" "/usr/$MACHINE/local/lib/sysman/$FILENAME"
  done
  :
}
-
ADD ()
{
  #
  # Function: ADD
  #
  # When $ADD_FILE is set, creates the next numbered entry
  # $MASTER_LIB/<basename>.<n> consisting of a template header followed by
  # the contents of $ADD_FILE, and leaves FILENAME / UPDATE_FILE pointing
  # at it.  CONFIGURE becomes "Yes" unless it was explicitly "No".
  #
  if [ "$VERBOSE" ]; then
    $ECHO "$UPROGNAME: Add $ADD_FILE"
  fi
  if [ "$ADD_FILE" ]; then
    FILE=`$BASENAME "$ADD_FILE"`
    # Find the highest existing numeric suffix.  Only the text after the
    # LAST dot is examined, so dots in the directory or in the base name
    # itself cannot be mistaken for the version number.  An unmatched glob
    # stays literal and simply yields no number.
    DUMMY=0
    for EXISTING in "$MASTER_LIB/$FILE".*
    do
      SUFFIX=`$EXPR "$EXISTING" : '.*\.\([0-9][0-9]*\)$'`
      if [ "$SUFFIX" ] && [ "$SUFFIX" -gt "$DUMMY" ]; then
        DUMMY=$SUFFIX
      fi
    done
    APPENDAGE=`$EXPR "$DUMMY" + 1`
    FILENAME=$FILE.$APPENDAGE
    UPDATE_FILE=$MASTER_LIB/$FILENAME
    $CAT >"$UPDATE_FILE" <<EOF
SUBJECT : Subject
TITLE : Title
INFO : WeekDay- Month Day, Year | WeekDay- Month Day, Year [Guru]
KEY WORDS: KeyWords
EOF
    # Errors are discarded: a missing $ADD_FILE just leaves the bare template.
    $CAT "$ADD_FILE" >>"$UPDATE_FILE" 2>"$NULL"
    if [ "$CONFIGURE" != "No" ]; then
      CONFIGURE=Yes
    fi
  fi
  :
}
-
UPDATE ()
{
  #
  # Function: UPDATE
  #
  # When an update or an add was requested, opens $UPDATE_FILE in $EDITOR,
  # removes editor backup files (*~) from $MASTER_LIB, then runs INSTALL
  # and NOTIFY.  CONFIGURE becomes "Yes" unless it was explicitly "No".
  # Exits with status 1 when $UPDATE_FILE fails the -w (writable) test.
  #
  if [ "$VERBOSE" ]; then
    $ECHO "$UPROGNAME: Update $UPDATE_FILE"
  fi
  if [ -n "$UPDATE_FILE" ] || [ -n "$ADD_FILE" ]; then
    if [ -w "$UPDATE_FILE" ]; then
      # $EDITOR may carry arguments (e.g. "emacs -nw"), so it stays unquoted.
      $EDITOR "$UPDATE_FILE"
      # Guard: with an empty MASTER_LIB the glob would be "/*~".
      if [ "$MASTER_LIB" ]; then
        $RM -rf "$MASTER_LIB"/*~
      fi
      INSTALL
      NOTIFY
      if [ "$CONFIGURE" != "No" ]; then
        CONFIGURE=Yes
      fi
    else
      $ECHO "$UPROGNAME*ERROR: File \"$UPDATE_FILE\" does not exist!"
      exit 1
    fi
  fi
  :
}
-
SYNCRONIZE ()
{
  #
  # Function: SYNCRONIZE
  #
  # When $SYNC is set: compares the file listing of $MASTER_LIB with the
  # listing of every server's sysman directory, runs INSTALL for each
  # name that the master has and some server lacks, then exits with
  # status 0.  Without $SYNC it returns immediately.
  #
  if [ "$VERBOSE" ]; then
    $ECHO "$UPROGNAME: Syncronizing databases..."
  fi
  if [ "$SYNC" ]; then
    $RM -rf "$TEMP_3"
    # Create the collection file up front so the sort below always has an
    # input, even when $SERVERS is empty.
    : >"$TEMP_3"
    $LS "$MASTER_LIB" >"$TEMP_1"
    for MACHINE in $SERVERS
    do
      $LS "/usr/$MACHINE/local/lib/sysman" >"$TEMP_2"
      # diff marks lines present only in the master listing with "< ".
      $DIFF "$TEMP_1" "$TEMP_2" | $EGREP '^<' | $AWK '{print $2}' >>"$TEMP_3"
    done
    $SORT -u "$TEMP_3" >"$TEMP_1"
    for FILENAME in `$CAT "$TEMP_1"`
    do
      UPDATE_FILE=$MASTER_LIB/$FILENAME
      INSTALL
    done
    # We exit here, so CLEAN never runs: remove our scratch files now.
    $RM -f "$TEMP_1" "$TEMP_2" "$TEMP_3"
    exit 0
  fi
  :
}
-
MAKE_DATA ()
{
  #
  # Function: MAKE_DATA
  #
  # CONFIGURE=Yes : rebuilds $MASTER_FILENAMES (every SUBJECT / TITLE /
  #                 INFO / KEY WORD header line of every file in
  #                 $MASTER_LIB) and $MASTER_KEYWORDS (the sorted, unique
  #                 alphabetic words of all KEY WORD lines), INSTALLs both
  #                 and exits 0.  Exits 1 when $MASTER_LIB cannot be
  #                 entered.
  # CONFIGURE=No  : exits 0 without doing anything.
  # otherwise     : returns.
  #
  if [ "$VERBOSE" ]; then
    $ECHO "$UPROGNAME: Checking master database for files."
  fi
  #
  # CONFIGURE the database?
  #
  if [ "$CONFIGURE" = "Yes" ]; then
    if [ "$VERBOSE" ]; then
      $ECHO "$UPROGNAME: Creating new database information-- PLEASE WAIT..."
    fi
    # The greps below use "*": never run them from the wrong directory.
    if [ "$MASTER_LIB" ] && cd "$MASTER_LIB"; then
      :
    else
      $ECHO "$UPROGNAME*ERROR: Cannot change directory to \"$MASTER_LIB\"!"
      exit 1
    fi
    $RM -rf "$MASTER_KEYWORDS"
    $RM -rf "$MASTER_FILENAMES"
    $EGREP "^KEY WORD|^SUBJECT|^TITLE|^INFO" * >"$MASTER_FILENAMES"
    #
    # I would think it would be possible to speed up this next loop somehow.
    # Old runtime: 15 minutes...
    # New runtime: 32 seconds!!!
    #
    $EGREP -h "^KEY WORD" * >"$TEMP_1"
    # Every run of non-letters becomes one newline: one word per line.
    $TR -cs A-Za-z '\012' <"$TEMP_1" >"$TEMP_2"
    $SORT -u "$TEMP_2" >"$MASTER_KEYWORDS"
    UPDATE_FILE=$MASTER_KEYWORDS
    FILENAME=`$BASENAME "$UPDATE_FILE"`
    INSTALL
    UPDATE_FILE=$MASTER_FILENAMES
    FILENAME=`$BASENAME "$UPDATE_FILE"`
    INSTALL
    # We exit here, so CLEAN never runs: remove our scratch files now.
    $RM -f "$TEMP_1" "$TEMP_2" "$TEMP_3"
    exit 0
  else
    if [ "$CONFIGURE" = "No" ]; then
      if [ "$VERBOSE" ]; then
        $ECHO "$UPROGNAME: --conf. Exiting."
      fi
      exit 0
    fi
  fi
  :
}
-
LIST ()
{
  #
  # Function: LIST
  #
  # Collects into $TEMP_1 the lines of $FILENAMES that belong to the
  # fields selected in $TSKGI (T=title, S=subject, K=key words, G and
  # I=info), then writes to $TEMP_2 the lines that match ANY word of
  # $LOCATE, case-insensitively.  With an empty $LOCATE every collected
  # line is kept.  Also sets MATCH_KEY to the "|"-joined word list.
  #
  if [ "$VERBOSE" ]; then
    $ECHO "$UPROGNAME: List."
  fi
  # Remove only THIS process's scratch files.  A "$TMP/$UPROGNAME*" glob
  # would also hit other running instances, and would expand to "/*" if
  # both variables were ever empty.
  $RM -rf "$TEMP_1" "$TEMP_2" "$TEMP_3"
  for KEY in $TSKGI
  do
    case "$KEY" in
      T) $EGREP "$TITLE" "$FILENAMES" >>"$TEMP_1" ;;
      S) $EGREP "$SUBJECT" "$FILENAMES" >>"$TEMP_1" ;;
      K) $EGREP "$KEYWORD" "$FILENAMES" >>"$TEMP_1" ;;
      G) $EGREP "$INFO" "$FILENAMES" >>"$TEMP_1" ;;
      I) $EGREP "$INFO" "$FILENAMES" >>"$TEMP_1" ;;
    esac
  done
  # Unquoted $LOCATE on purpose: echo re-joins the words with single
  # blanks and drops the trailing one before they become alternatives.
  MATCH_KEY=`$ECHO $LOCATE | $TR " " "|"`
  if [ "$MATCH_KEY" ]; then
    $EGREP -i -e "$MATCH_KEY" "$TEMP_1" >"$TEMP_2"
  else
    $MV "$TEMP_1" "$TEMP_2"
  fi
  :
}
-
CHECK ()
{
  #
  # Function: CHECK
  #
  # Splits the words of $LOCATE into VALID (found somewhere in
  # $FILENAMES, case-insensitively) and NOT_VALID (found nowhere),
  # printing a warning for each word of the second kind.  Both results
  # are blank-terminated word lists.
  #
  if [ "$VERBOSE" ]; then
    $ECHO "$UPROGNAME: Matching: "$LOCATE
  fi
  VALID=
  NOT_VALID=
  for WORD in $LOCATE
  do
    # Only the exit status matters, so the matching lines are discarded
    # instead of being captured into a string.
    if $EGREP -i -e "$WORD" "$FILENAMES" >"$NULL"; then
      VALID="$VALID$WORD "
    else
      $ECHO "$UPROGNAME*WARNING: Word \""$WORD"\" is not recognized [IGNORED]."
      NOT_VALID="$NOT_VALID$WORD "
    fi
  done
  # If the following code is included.... people just run sysman without the
  # offending keyword..... lets just make sysman do that automatically.
  # if [ "$NOT_VALID" ]; then
  # exit 1
  # fi
  :
}
-
MATCH ()
{
  #
  # Function: MATCH
  #
  # When $MATCH_WORDS is set: prints every line of $KEYWORDS that matches
  # a word of $LOCATE (case-insensitive, one grep per word) and exits
  # with status 0.  Otherwise returns.
  #
  if [ "$VERBOSE" ]; then
    $ECHO "$UPROGNAME: Checking variables to match a sub string."
  fi
  if [ "$MATCH_WORDS" ]; then
    if [ "$VERBOSE" ]; then
      $ECHO "$UPROGNAME: Matching strings : "$LOCATE
    fi
    for WORD in $LOCATE
    do
      $EGREP -i -e "$WORD" "$KEYWORDS"
    done
    # We exit here, so CLEAN never runs: remove our scratch files now.
    $RM -f "$TEMP_1" "$TEMP_2" "$TEMP_3"
    exit 0
  fi
  :
}
-
SEARCH ()
{
  #
  # Function: SEARCH
  #
  # Narrows the candidate lines in $TEMP_2 one word at a time, using the
  # words of $VALID followed by those of $NOT_VALID.
  #
  # Without ``NAIVE'' ALL keywords must be matched...
  # With ``NAIVE'' the keywords are significant from the left...
  # (a word that would leave nothing is skipped with a warning).
  #
  if [ "$VERBOSE" ]; then
    $ECHO "$UPROGNAME: Searching the database for matching files."
  fi
  LOCATE="$VALID $NOT_VALID"
  for WORD in $LOCATE
  do
    $EGREP -i -e "$WORD" "$TEMP_2" >"$TEMP_1"
    if [ ! "$NAIVE" ]; then
      $MV "$TEMP_1" "$TEMP_2"
    else
      # Naive mode: accept the narrowed set only if something survived.
      if [ -s "$TEMP_1" ]; then
        $MV "$TEMP_1" "$TEMP_2"
      else
        $ECHO "$UPROGNAME*WARNING: Duh, ignoring \`\`$WORD'' "
      fi
    fi
  done
  :
}
-
VIEW ()
{
  #
  # Function: VIEW
  #
  # Presents the search result held in $TEMP_2 ("file:FIELD : text" lines).
  #
  #   $SHOW set   : prints the matching header lines of every field
  #                 selected in $TSKGI, with the field marker replaced by
  #                 $CR$TABS, then exits 0.
  #   no matches  : prints an error naming the searched fields and
  #                 $LOCATE, then exits 1.
  #   $OUTPUT set : appends every matching file from $LIB, each followed
  #                 by a separator line, to $OUTPUT (a relative name is
  #                 taken relative to $PWD_DIRECTORY).
  #   otherwise   : opens the matching files in $PAGER from inside $LIB;
  #                 exits 1 if $LIB cannot be entered.
  #
  # NOTE(review): $CR is used as sed replacement text, so it is assumed to
  # hold a backslash-escaped newline - confirm against its definition.
  #
  if [ "$VERBOSE" ]; then
    $ECHO "$UPROGNAME: View."
  fi
  # Truncate every line to 78 columns.
  $COLRM 79 9999 <"$TEMP_2" >"$TEMP_1"
  if [ "$SHOW" ]; then
    for KEY in $TSKGI
    do
      case "$KEY" in
        T) $EGREP "$TITLE" "$TEMP_1" | $SED "s/$TITLE/$CR$TABS/g" ;;
        S) $EGREP "$SUBJECT" "$TEMP_1" | $SED "s/$SUBJECT/$CR$TABS/g" ;;
        K) $EGREP "$KEYWORD" "$TEMP_1" | $SED "s/$KEYWORD/$CR$TABS/g" ;;
        G) $EGREP "$INFO" "$TEMP_1" | $SED "s/$INFO/$CR$TABS/g" ;;
        I) $EGREP "$INFO" "$TEMP_1" | $SED "s/$INFO/$CR$TABS/g" ;;
      esac
    done
    # Exiting bypasses CLEAN: remove our scratch files first.
    $RM -f "$TEMP_1" "$TEMP_2" "$TEMP_3"
    exit 0
  fi
  if [ "$VERBOSE" ]; then
    $ECHO "$UPROGNAME: Displaying the list of answer files."
  fi
  if [ -s "$TEMP_1" ]; then
    # Keep only the file name (the text before the first ":"), once each.
    $TR ":" " " <"$TEMP_1" | $AWK '{print $1}' | $SORT -u >"$TEMP_2"
  else
    $ECHO -n "$UPROGNAME*ERROR: No matches for "
    for KEY in $TSKGI
    do
      case "$KEY" in
        T) $ECHO -n "[title] " ;;
        S) $ECHO -n "[subject] " ;;
        K) $ECHO -n "[keyword] " ;;
        G) $ECHO -n "[guru] " ;;
        I) $ECHO -n "[info] " ;;
      esac
    done
    $ECHO ": $LOCATE"
    $RM -f "$TEMP_1" "$TEMP_2" "$TEMP_3"
    exit 1
  fi
  if [ "$VERBOSE" -a "$OUTPUT" ]; then
    $ECHO "$UPROGNAME: Saving all answers to: $OUTPUT"
  fi
  if [ "$OUTPUT" ]; then
    cd "$PWD_DIRECTORY"
    while read COPY
    do
      $CAT "$LIB/$COPY" >>"$OUTPUT"
      $ECHO "_______________________________________________________________________________" >>"$OUTPUT"
    done <"$TEMP_2"
  else
    # The pager is given bare file names, so it must run inside $LIB.
    if cd "$LIB"; then
      $PAGER `$CAT "$TEMP_2"`
    else
      $ECHO "$UPROGNAME*ERROR: Cannot change directory to \"$LIB\"!"
      $RM -f "$TEMP_1" "$TEMP_2" "$TEMP_3"
      exit 1
    fi
  fi
  :
}
-
CLEAN ()
{
  #
  # Function: CLEAN
  #
  # Removes this run's scratch files and returns to the directory the
  # script was started from.  Anything written to stderr is discarded.
  #
  if [ "$VERBOSE" ]; then
    $ECHO "$UPROGNAME: Cleaning up after $UPROGNAME."
  fi
  # Only our own files: a "$TMP/$UPROGNAME*" glob would also delete the
  # scratch files of other running instances, and would expand to "/*"
  # if both variables were ever empty.
  $RM -rf "$TEMP_1" "$TEMP_2" "$TEMP_3"
  cd "$PWD_DIRECTORY"
  :
} 2>"${NULL:-/dev/null}"
-
MAIN ()
{
  #
  # Function: MAIN
  #
  # Runs the processing stages in order.  Each stage inspects the flags
  # set by INITIALIZE_VARIABLES and either does nothing, does its work and
  # returns, or does its work and exits the script.
  #
  INITIALIZE_VARIABLES
  ADD
  UPDATE
  MAKE_DATA
  SYNCRONIZE
  # MANPAGE comes before LIST: it does not use anything LIST produces,
  # and a help-only run should not grep the index or create scratch
  # files first.
  MANPAGE
  LIST
  CHECK
  MATCH
  SEARCH
  VIEW
  CLEAN
  :
}
-
#
# This next part of the code sets up the initial hook.
#
# WPROGNAME - the script as invoked
# LPROGNAME - its base name (this is the name passed to the manual viewer)
# UPROGNAME - the base name in upper case; used as message prefix, as part
#             of the temp-file names and as end-of-arguments sentinel
# VARS      - all command-line words joined by blanks, plus the sentinel.
#             Arguments are flattened here, so an argument that itself
#             contains blanks is seen as several words by the parser.
#
WPROGNAME=$0
LPROGNAME=`/bin/basename "$0"`
UPROGNAME=`/bin/echo "$LPROGNAME" | /bin/tr "[a-z]" "[A-Z]"`
VARS="$* $UPROGNAME"
MAIN
- SHAR_EOF
- # End of shell archive
- exit 0
- --
-
- -----------------------------------------------------------------------------
- Scott D. Yelich scott@cs.odu.edu [128.82.8.1]
- After he pushed me off the cliff, he asked me, as I fell, ``Why'd you jump?''
- Administrator of: Game design requests to <game-design-request@cs.odu.edu>
- -----------------------------------------------------------------------------
-