PC World Komputer 1995 November

home *** CD-ROM | disk | FTP | other *** search

/ PC World Komputer 1995 November / PCWK1195.iso / inne / win95 / sieciowe / hotja32.lzh / hotjava / classsrc / browser / tools / javasearch / searcher.java < prev next >

Wrap

Text File | 1995-08-11 | 13KB | 391 lines

/* * @(#)Searcher.java 1.19 95/03/20 David A. Brown * * Copyright (c) 1994 Sun Microsystems, Inc. All Rights Reserved. * * Permission to use, copy, modify, and distribute this software * and its documentation for NON-COMMERCIAL purposes and without * fee is hereby granted provided that this copyright notice * appears in all copies. Please refer to the file "copyright.html" * for further important copyright and licensing information. * * SUN MAKES NO REPRESENTATIONS OR WARRANTIES ABOUT THE SUITABILITY OF * THE SOFTWARE, EITHER EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED * TO THE IMPLIED WARRANTIES OF MERCHANTABILITY, FITNESS FOR A * PARTICULAR PURPOSE, OR NON-INFRINGEMENT. SUN SHALL NOT BE LIABLE FOR * ANY DAMAGES SUFFERED BY LICENSEE AS A RESULT OF USING, MODIFYING OR * DISTRIBUTING THIS SOFTWARE OR ITS DERIVATIVES. */ package browser.tools.JavaSearch; import java.util.Vector; import java.util.Enumeration; import java.io.*; /** * Searcher: Interface for *searching* a JavaSearch database. * */ class Searcher { /** The database we're searching */ Database db; /** The window driving us, if any (for messages; can be null). */ SearchWindow searchWindow; /** The stop list, which we use to reject unimportant words. **/ StopList stopList; /** How many words we've read out of the index so far * doing the current search. */ int indexReadCounter; // Modes used internally by doSearch() static final int OR_MODE = 0; static final int AND_MODE = 1; static final int NOT_MODE = 2; /** Searcher constructor. Specify a Database object and window. */ public Searcher(Database theDB, SearchWindow sWin) { //System.out.println("Searcher [Database "+theDB.dbName+"]..."); db = theDB; searchWindow = sWin; stopList = new StopList(System.getenv("HOTJAVA_HOME")+File.separator+ "doc"+File.separator+ StopList.defaultFile); } /** Searcher constructor. Specify a Database object. */ public Searcher(Database theDB) { this(theDB, null); } /** * Main "Search Engine" function, specifying a search string. * * Specify a search string, consisting of keywords to search for, * along with optional boolean keywords "and", "or" and "not". * Words must be separated by spaces!! * This method just breaks the search string into a Vector * of search words, and calls doSearch(Vector searchWords) * to do the actual searching. * * See below for details. */ public DocList doSearch(String searchString) { //System.out.println("doSearch: searchString '"+searchString+"'..."); if ((searchString == null) || (searchString.length()==0)) { return null; } Vector wordVector = new Vector(); int wordStart = 0; // Break apart searchString, adding keywords to wordVector. while (wordStart<searchString.length()) { // Skip over any spaces in searchString while (searchString.charAt(wordStart) == ' ') { wordStart++; if (wordStart >= searchString.length()) break; } if (wordStart >= searchString.length()) break; // Ok, wordStart really points to the start of a word. int wordEnd = searchString.indexOf(' ',wordStart); if (wordEnd == -1) wordEnd = searchString.length(); String keyword = searchString.substring(wordStart,wordEnd); wordVector.addElement(keyword); wordStart = wordEnd+1; } return doSearch(wordVector); } /** * Main "Search Engine" function, specifying a Vector of * search words. * * The searchWords parameter is a Vector of words representing * a "search string" (ie. keywords to search for, mixed in with * the optional boolean keywords "and", "or" and "not".) * * The search words are processed left to right * (i.e. in the order they appear in the Vector), with any * boolean keyword affecting how the *subsequent* word's * hits are merged into the final result. Note that * "not" means "and not". * * Example: the Vector for "foo and bar or baz not mumble" is * processed as follows: * - Find all documents containing "foo" * - INTERSECT this result with all documents containing "bar" * - Now ADD all documents containing "baz" to the result * - INTERSECT this result with all documents NOT * containing "mumble" * * Again, remember "not" means "and not". This means * that the results you get from searching for simply "not foo" * might not be what you expect! * * This method returns a DocList containing all documents * which match the search string, in Doc ID order, * or null if no documents match the search string. */ public DocList doSearch(Vector searchWords) { //System.out.println("doSearch: Number of searchWords: "+ // searchWords.size()+"..."); //System.out.println("doSearch: searchWords "+searchWords); // Start of with an empty set of Doc IDs IDVector resultVector = new IDVector(); // The query words we've dropped due to presence on stop list Vector stoppedWords = new Vector(2); // The "boolean mode" to use when combining // our next set of Doc IDs with the // result set we're accumulating. int booleanMode = OR_MODE; // OR is the default boolean onlyStopWords = true; // so far we've only seen stop words Enumeration e = searchWords.elements(); while (e.hasMoreElements()) { String keyword = (String)e.nextElement(); // Make sure keyword is all lower-case keyword = IndexingInputStream.downcase(keyword); //System.out.println("doSearch: processing keyword '"+keyword+"'."); // If the word is a boolean control keyword, // update booleanMode and continue // on to the next word if (keyword.equals("or")) { booleanMode = OR_MODE; continue; } else if (keyword.equals("and")) { booleanMode = AND_MODE; continue; } else if (keyword.equals("not")) { booleanMode = NOT_MODE; continue; } // Is it on our stop list? If so, skip it. if (stopList.isStopWord(keyword)) { stoppedWords.addElement(keyword); continue; } // Ok, keyword is a real word. Get an IDVector for it. Word w = getWordFromIndex(keyword); //System.out.println("doSearch: getWordFromIndex returned: "+w); // Now we have a bunch of Doc IDs in w.idvector // (or, w may be null!) // Merge that vector into resultVector, using // the current booleanMode: switch (booleanMode) { case AND_MODE: if (w == null) { // ANDing with a null set gives you a null set! System.out.println("ANDing with a null set: clearing resultVector!"); resultVector.clear(); } else { // If we've only seen stopwords so far, don't bother to AND // the null result with anything. if (onlyStopWords) { resultVector = w.idvector; } else { resultVector.intersectWith(w.idvector); } } break; case NOT_MODE: if (w == null) { // "AND NOT <null set>" doesn't change // what you started with. //System.out.println("NOT with a null set; resultVector unaffected."); } else { resultVector.intersectWithNot(w.idvector); } break; default: // OR mode if (w == null) { // "OR <null set>" doesn't change what you started with. //System.out.println("OR with a null set; resultVector unaffected."); } else { resultVector.unionWith(w.idvector); } break; } // Reset booleanMode for the next word (we imply OR // if there's no boolean control keyword) booleanMode = OR_MODE; // If we've gotten this far, we're seeing real words. onlyStopWords = false; } // Done with the search! Results are in resultVector. // Display stopped words, if any. if (stoppedWords.size() > 0) { if (searchWindow == null) { System.out.print("Searcher: Discarded "); for (int i = 0; i < stoppedWords.size(); i++) { System.out.print(stoppedWords.elementAt(i)+" "); } System.out.print("\n"); } else { searchWindow.displayStoppedWords(stoppedWords); } } if (resultVector.count == 0) { //System.out.println("doSearch: resultVector.count is 0!"); return null; } //System.out.println("doSearch: resultVector.count is "+ // resultVector.count); // REMIND: Would be cool to have a MAX_SEARCH_HITS paramater // here, so we don't create a DocList if we have an absurdly // large number of hits. // // If (resultVector.count > MAX_SEARCH_HITS), we could either // truncate resultVector to an acceptable length, // or (better) return a special value indicating // we had too many hits (so the user should refine his // search, and try again...) // // Actually, it would be cool for this method to // somehow return a number_of_hits *and* a DocList! // So if the DocList comes back null, the // number_of_hits tells you whether your got NO hits, // or too many hits. // Generate and return a new DocList based on resultVector -- // this involves opening the .docs and .docindex files, // and reading a Doc object for each hit. DocList dl = new DocList(db, resultVector); return dl; } /** Look up the specified word in the index, and return a * Word object. Return null if the word wasn't found. */ public synchronized Word getWordFromIndex(String aWord) { //System.out.println("getWordFromIndex: finding '"+aWord+"'..."); indexReadCounter = 0; // Open the index and qindex files RandomAccessFile indexFile = new RandomAccessFile(db.indexFilename,"r"); RandomAccessFile qindexFile = new RandomAccessFile(db.qindexFilename,"r"); int numWords = qindexFile.length()/4; //System.out.println("getWordFromIndex: qindexFile has "+ // numWords+" entries."); // Binary search the qindex file (each qindex // entry points at a word in the index file) Word resultWord = bsearchWord(aWord, 0, numWords-1, qindexFile, indexFile); //System.out.println("getWordFromIndex: Got resultWord: "+resultWord); //System.out.println(" Did "+indexReadCounter+" read"+ // ((indexReadCounter==1)?"":"s")+ // " from the index/qindex files."); return resultWord; } // // These methods are the guts of the getWordFromIndex() functionality // /** * Binary-search index file for the specified word. * Returns a Word object if the word is found; * Returns null if the word is not in the index. * Calls itself recursively. */ private Word bsearchWord(String word, int loPos, int hiPos, RandomAccessFile qindexFile, RandomAccessFile indexFile) { //System.out.println(" bsearchWord ("+word+"), range ["+ // loPos+","+hiPos+"]..."); if (loPos > hiPos) { //System.out.println(" loPos > hiPos. ["+loPos+","+hiPos+"]. Returning null."); return null; } else if (loPos == hiPos) { if (word.equals(getWordAtPos(loPos, qindexFile, indexFile))) { return new Word(word,indexFile); } else { return null; } } else { // loPos and hiPos describe a range. Check the // *middle* of the range, and recurse. int midPos = (loPos+hiPos)/2; String midWord = getWordAtPos(midPos, qindexFile, indexFile); //System.out.println(" midPos "+midPos+ // ", midWord '"+midWord+"'."); int compareResult = word.compareTo(midWord); if (compareResult == 0) { // Direct hit! return new Word(word,indexFile); } else if (compareResult < 0) { // word < midWord: Search the lower half return bsearchWord(word, loPos, midPos-1, qindexFile, indexFile); } else { // word > midWord: Search the upper half return bsearchWord(word, midPos+1, hiPos, qindexFile, indexFile); } } } /** Return the word found at the Nth position in the index file. * Leaves indexFile pointing at the word's doc entries. */ private String getWordAtPos(int n, RandomAccessFile qindexFile, RandomAccessFile indexFile) { //System.out.print(" getWordAtPos "+n+"... "); // Read an index file position from the qindex file. // qindex entries are ints, which are 4 bytes long. qindexFile.seek(n*4); int indexPos = qindexFile.readInt(); // Read just the word (a String) from the index file indexFile.seek(indexPos); String word = indexFile.readLine(); //System.out.println("found word '"+word+"'."); indexReadCounter++; // One indexReadCounter increment // means one int from the qindexFile // and one String from the indexFile. return word; } }