home
***
CD-ROM
|
disk
|
FTP
|
other
***
search
/
PC World Komputer 1995 November
/
PCWK1195.iso
/
inne
/
win95
/
sieciowe
/
hotja32.lzh
/
hotjava
/
classsrc
/
browser
/
tools
/
javasearch
/
indexinginputstream.java
< prev
next >
Wrap
Text File
|
1995-08-11
|
5KB
|
171 lines
/*
* @(#)IndexingInputStream.java 1.10 95/03/14 David A. Brown
*
* Copyright (c) 1994 Sun Microsystems, Inc. All Rights Reserved.
*
* Permission to use, copy, modify, and distribute this software
* and its documentation for NON-COMMERCIAL purposes and without
* fee is hereby granted provided that this copyright notice
* appears in all copies. Please refer to the file "copyright.html"
* for further important copyright and licensing information.
*
* SUN MAKES NO REPRESENTATIONS OR WARRANTIES ABOUT THE SUITABILITY OF
* THE SOFTWARE, EITHER EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED
* TO THE IMPLIED WARRANTIES OF MERCHANTABILITY, FITNESS FOR A
* PARTICULAR PURPOSE, OR NON-INFRINGEMENT. SUN SHALL NOT BE LIABLE FOR
* ANY DAMAGES SUFFERED BY LICENSEE AS A RESULT OF USING, MODIFYING OR
* DISTRIBUTING THIS SOFTWARE OR ITS DERIVATIVES.
*/
package browser.tools.JavaSearch;
import java.io.BufferedInputStream;
import java.io.FileInputStream;
import java.io.File;
/** Input stream to parse documents for indexing */
class IndexingInputStream extends BufferedInputStream {
/** "Document type" of the file we're parsing.
* See "Document type" in Doc.java.
*/
private int docType = Doc.INVALID_TYPE;
/** Basic constructors; see FileInputStream for details */
public IndexingInputStream(String name) {
super(new FileInputStream(name), 2048);
}
/*
public IndexingInputStream(File file) {
super(file, 1024);
}
*/
/** Set the Document Type */
public void setDocType(int type) {
docType = type;
}
/**
* Return the next "word" in the input stream, null on EOF.
*/
public String getWord() {
int c;
// First, skip over any non-alphanumeric characters,
// and check for EOF
while (true) {
c = read();
if (c == -1) return null;
if (isAlphanumeric((char)c)) break;
// Doc Type-specific features:
if ((docType == Doc.HTML_TYPE) && (c == '<')) {
// If we detect a '<'...
while ((c = read()) != '>') {
// skip till the next '>', but bail on EOF
if (c == -1) return null;
}
}
// Does NEWS need anything special here? Probably not...
}
// Start the input buffer
StringBuffer input = new StringBuffer();
input.appendChar((char)c);
// Process more chars:
//
// We're guaranteed to have at least a one-char word,
// so we'll return a string either when we hit a
// delimiter, OR if we hit EOF. (Then, the next call to
// this method will immediately return null, signaling
// the EOF.)
//
while (true) {
c = read();
if (isAlphanumeric((char)c)) {
input.appendChar((char)c);
}
else { // c is non-alphanumeric, or could be -1 if EOF
// If by chance c was '<'...
if ((docType == Doc.HTML_TYPE) && (c == '<')) {
while ((c = read()) != '>') {
// skip till the next '>', but bail on EOF
if (c == -1) break;
}
}
// Ok, process and return the string we've built up
downcase(input);
return input.toString();
}
}
}
/**
* Return true if the given character is alphanumeric,
* i.e. NOT a word delimiter
*/
private static boolean isAlphanumeric(char c) {
if ((c >= 'a') && (c <= 'z')) return true;
if ((c >= 'A') && (c <= 'Z')) return true;
if ((c >= '0') && (c <= '9')) return true;
// REMIND: Is it always the right thing to have "'"
// count as alphanumeric?
if (c == '\'') return true;
// FUTURE: maybe other chars might count as 'alphanumeric',
// for example "_" if it's part of any method names...
return false;
}
/**
* Convert a StringBuffer to lowercase, in place.
* Utility function used by a few JavaSearch classes.
*/
public static void downcase(StringBuffer buf) {
for (int i=0; i<buf.length(); i++) {
char c = buf.charAt(i);
if (c>='A' && c<='Z') {
c -= 'A'-'a';
buf.setCharAt(i,c);
}
}
}
/**
* Return a downcased version of the specified String.
* Based on downcase(StringBuffer buf).
* Utility function used by a few JavaSearch classes.
*/
public static String downcase(String s) {
StringBuffer sb = new StringBuffer().append(s);
downcase(sb);
return sb.toString();
}
// FUTURE stuff:
//
// Method to return the current position in the input file,
// or better, the starting position of the last word returned
// by getWord(). This will be needed if the indexer ever wants
// to keep positional info in the index.
//
}