home
***
CD-ROM
|
disk
|
FTP
|
other
***
search
/
PC World Komputer 1995 November
/
PCWK1195.iso
/
inne
/
win95
/
sieciowe
/
hotja32.lzh
/
hotjava
/
classsrc
/
browser
/
tools
/
javasearch
/
doc.java
< prev
next >
Wrap
Text File
|
1995-08-11
|
13KB
|
419 lines
/*
* @(#)Doc.java 1.11 95/03/14 David A. Brown
*
* Copyright (c) 1994 Sun Microsystems, Inc. All Rights Reserved.
*
* Permission to use, copy, modify, and distribute this software
* and its documentation for NON-COMMERCIAL purposes and without
* fee is hereby granted provided that this copyright notice
* appears in all copies. Please refer to the file "copyright.html"
* for further important copyright and licensing information.
*
* SUN MAKES NO REPRESENTATIONS OR WARRANTIES ABOUT THE SUITABILITY OF
* THE SOFTWARE, EITHER EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED
* TO THE IMPLIED WARRANTIES OF MERCHANTABILITY, FITNESS FOR A
* PARTICULAR PURPOSE, OR NON-INFRINGEMENT. SUN SHALL NOT BE LIABLE FOR
* ANY DAMAGES SUFFERED BY LICENSEE AS A RESULT OF USING, MODIFYING OR
* DISTRIBUTING THIS SOFTWARE OR ITS DERIVATIVES.
*/
package browser.tools.JavaSearch;
import java.io.*;
/**
 * One JavaSearch Document.
 *
 * A Doc is either created fresh from a filename while an index is being
 * built (which assigns it the next sequential doc ID), or read back from
 * the docs file while searching.
 *
 * On disk (see writeToStream()) a Doc record is: the doc ID as a single
 * 2-byte char, then the filename and the headline, each written as one
 * byte per character and terminated by '\n'.
 */
public class Doc {

    //
    // Persistent info about this Document (this is all
    // saved in the docs file)
    //

    /** Document ID: NOTE maximum of 65535 docs in one index!! */
    public char docID;

    /**
     * "Filename" of this document.  This is NOT a full
     * pathname -- it should be combined with either our Database's
     * docPathPrefix or docURLPrefix, in order to get a
     * fully-qualified filename or URL respectively.
     */
    public String filename;

    /**
     * Document's "headline" or "title".  Should be human-readable
     * and useful in the context of a list of search hits.
     */
    public String headline;

    // FUTURE stuff (not stored yet):
    //   int startChar, endChar;  // Character range of this doc in 'filename'
    //   int numLines;            // # lines in the doc
    //   int length;              // # bytes in the doc
    //   String date;             // Human-readable date string, based on last-mod-date

    //
    // Other (transient) info about this Document
    //

    /** Document type: not (yet) determined. */
    public static final int INVALID_TYPE = 0;
    /** Document type: plain text; the headline is simply the filename. */
    public static final int TEXT_TYPE = 1;
    /** Document type: HTML; the headline is taken from the title element. */
    public static final int HTML_TYPE = 2;
    //public static final int NEWS_TYPE = 3;

    /**
     * "Document type": slightly changes the behavior of
     * getWord() for different input file types.
     * Only Doc(String) sets this; a Doc read back from the docs file
     * keeps INVALID_TYPE, because the type is not part of the record.
     */
    public int type = INVALID_TYPE;

    /**
     * Counter used to generate consecutive doc IDs.  Starts at 1
     * since doc ID 0 is never used! (it means the end of an
     * index entry).
     * Be sure to call resetDocIDCounter() if a single Java program
     * is ever used to create more than one JavaSearch index.
     */
    private static int doc_ID_counter = 1;

    /** Largest doc ID that fits in the char-sized docID field. */
    private static final int MAX_DOC_ID = Character.MAX_VALUE;

    /** Maximum length (in chars) of a Doc headline */
    private static final int HEADLINE_MAX_LENGTH = 80;

    /** Delimiters of an HTML title (lowercase versions) */
    private static final String TITLE_START = "<title>";
    private static final String TITLE_END = "</title>";

    /**
     * Doc constructor to use when we already know all the Doc's info,
     * eg. when we're reading it from the doc file.
     * Used by the Doc.readFromStream() method.
     */
    private Doc(char id, String fn, String hl) {
        docID = id;
        filename = fn;
        headline = hl;
    }

    /**
     * Generate a new Doc object based on a filename.
     * Doc type is derived from the filename.
     * Headline is computed based on the doc type, and may involve
     * opening and reading the file.
     *
     * Automatically generates a new sequential docID.
     *
     * This Doc constructor is used when CREATING an index.
     *
     * @param aFilename name of the file this Doc describes
     * @throws IOException if the file has to be read for its headline
     *         and cannot be opened or read
     * @throws IllegalStateException if all 65535 doc IDs are already in use
     */
    public Doc(String aFilename) throws IOException {
        filename = aFilename;
        type = docTypeForFilename(filename);
        headline = generateHeadline(filename, type);

        // Generate a unique doc ID.  The range check has to happen on the
        // int counter, BEFORE the narrowing cast: a char can never compare
        // greater than 65535, and a wrapped counter would hand out ID 0,
        // which is reserved as the end-of-index-entry marker.
        if (doc_ID_counter > MAX_DOC_ID) {
            throw new IllegalStateException("Doc ID overflow: too many documents for one index!");
        }
        docID = (char) doc_ID_counter++;
    }

    /**
     * Create a new Doc object by looking it up, using the
     * specified Doc ID, in the specified .docs and .docindex files.
     *
     * The .docindex files gives us a pointer into the .docs
     * file, from where we read the actual document info.
     *
     * This Doc constructor is used when SEARCHING.
     *
     * @param id           ID of the document to load
     * @param docsFile     the .docs file; repositioned by this call
     * @param docindexFile the .docindex file; repositioned by this call
     * @throws IOException if either file cannot be read, or if the record
     *         found in the .docs file carries a different ID than 'id'
     */
    public Doc(char id, RandomAccessFile docsFile,
               RandomAccessFile docindexFile) throws IOException {
        // Read a docs file position from the docindex file.
        // docindex entries are ints, which are 4 bytes long.
        docindexFile.seek(4L * id);
        int docsPos = docindexFile.readInt();

        // Seek the docsfile to the right place, and read the
        // Doc info.
        docsFile.seek(docsPos);
        readFromFile(docsFile);

        // Now sanity-check the Doc we read from docsFile:
        // make sure its ID is what we started off looking for!
        // (The IDs are cast to int so the message shows numbers rather
        // than the raw characters with those code points.)
        if (id != docID) {
            throw new IOException("Doc ID mismatch in .docs/.docindex files!"
                    + " (tried for ID " + (int) id
                    + ", but readFromFile() found " + (int) docID + ")");
        }
    }

    /**
     * Reset the counter used to generate consecutive doc IDs.
     * This must be called if a single Java program is ever
     * used to create more than one JavaSearch index.
     */
    public static void resetDocIDCounter() {
        doc_ID_counter = 1;
    }

    /**
     * Generate a simple printed representation of this Doc.
     * The doc ID is printed as a number.
     */
    public String toString() {
        return "[DocID " + (int) docID + "]\t" + filename +
                "\t'" + headline + "'";
    }

    /**
     * Trim a prefix off this Doc's filename.
     * Used by the indexer.  If our filename begins with
     * "prefix", then we replace our filename with everything
     * that *follows* prefix.  A null prefix, or one the filename does
     * not start with, leaves the filename untouched.
     *
     * @param prefix the leading part to remove; may be null
     */
    public void trimFilename(String prefix) {
        if (prefix == null) {
            return;
        }
        if (filename.startsWith(prefix)) {
            filename = filename.substring(prefix.length());
        }
    }

    /**
     * Write this Doc to the specified Output stream.
     *
     * Note that writeBytes() emits only the low-order byte of each
     * character, so characters above 0xFF in the filename or headline
     * are not preserved.
     *
     * @param out stream positioned where the record should go
     * @throws IOException if the stream cannot be written
     */
    void writeToStream(DataOutputStream out) throws IOException {
        // Our doc ID: one char
        // Note the ID is redundant when writing the docs file
        // (since Docs are in sequential ID order!) but we still
        // save it and do a sanity check when retrieving.
        out.writeChar(docID);

        // Filename and headline, terminated by '\n's:
        out.writeBytes(filename);
        out.writeByte('\n');
        out.writeBytes(headline);
        out.writeByte('\n');
    }

    /**
     * Read a single Document entry from the specified stream.
     * Returns a Doc object, or null if stream hits EOF.
     * The stream must be pointing at the start of a
     * valid Doc entry!!
     *
     * @param in stream positioned at the start of a Doc record
     * @return the Doc that was read, or null at end of stream
     * @throws IOException if the stream cannot be read
     */
    static Doc readFromStream(DataInputStream in) throws IOException {
        char id;
        try {
            id = in.readChar();
        } catch (EOFException e) {
            // readChar() signals end of stream by throwing rather than by
            // returning a marker value; map that to the documented null.
            return null;
        }

        // readLine() turns each byte into one char, which is the exact
        // inverse of the writeBytes() calls in writeToStream().
        String fn = in.readLine();
        String hl = in.readLine();
        if (fn == null || hl == null) {
            return null;
        }
        return new Doc(id, fn, hl);
    }

    /**
     * Read a single Document entry from the specified file.  NOTE that
     * docsFile must be pointing at the start of a valid Doc entry!!
     *
     * This duplicates the functionality of
     * readFromStream(), but unfortunately
     * RandomAccessFiles are not InputStreams...
     *
     * This Doc's fields are only modified once a complete record has
     * been read.
     *
     * @param docsFile the .docs file, positioned at a Doc record
     * @throws EOFException if the file ends before the record is complete
     * @throws IOException if the file cannot be read
     */
    public void readFromFile(RandomAccessFile docsFile) throws IOException {
        char id = docsFile.readChar();
        String fn = docsFile.readLine();
        String hl = docsFile.readLine();
        if (fn == null || hl == null) {
            throw new EOFException("Doc.readFromFile hit EOF: must have been a bad pointer into docsFile!");
        }
        docID = id;
        filename = fn;
        headline = hl;
    }

    /**
     * Return a Document Type given the specified filename.
     * Specifically, look for any special file extensions we recognize.
     *
     * @return HTML_TYPE for names ending in ".html" (case-sensitive),
     *         TEXT_TYPE for everything else
     */
    private static int docTypeForFilename(String filename) {
        // Currently the only special filename we recognize
        // is "*.html":
        // REMIND: the 'endsWith(".html")' part might be Unix-specific!
        if (filename.endsWith(".html")) {
            return HTML_TYPE;
        }

        // REMIND: Here's where we would detect a NEWS type file.
        // But there's no way to do that from a filename, unless
        // we assume a "numeric" filename (like '469') is a news
        // article!
        // Do we have to also open the file here to figure out what
        // it is?  Maybe the functionality of docTypeForFilename()
        // should be merged with generateHeadline(), which already
        // has to read the file anyway?

        // default to TEXT...
        return TEXT_TYPE;
    }

    /**
     * Compute a headline for this Doc, given the specified "filename" and
     * Doc type.  This may involve opening and reading some of the file
     * for certain doc types!
     *
     * The result is never null or empty, contains no line breaks (they
     * would corrupt the newline-terminated record in the docs file), and
     * is at most HEADLINE_MAX_LENGTH chars long.
     *
     * @throws IOException if the file has to be read and cannot be
     * @throws IllegalArgumentException if docType is not a known type
     */
    private static String generateHeadline(String filename, int docType)
            throws IOException {
        String headline;

        // Handle all known types
        switch (docType) {
        case TEXT_TYPE:
            // Headline is simply the filename
            headline = filename;
            break;
        case HTML_TYPE:
            headline = getHTMLHeadline(filename);
            break;
        //case NEWS_TYPE:
        //    headline = getNewsHeadline(filename);
        //    break;
        default:
            throw new IllegalArgumentException("Doc.generateHeadline: invalid docType ("+
                    docType+")!");
        }

        // Some final processing:
        if ((headline == null) || (headline.length() == 0)) {
            headline = "[No Headline]";
        }

        // Line breaks would split the record when it is read back with
        // readLine(), so turn them into spaces.
        headline = headline.replace('\n', ' ').replace('\r', ' ');

        // Enforce HEADLINE_MAX_LENGTH:
        if (headline.length() > HEADLINE_MAX_LENGTH) {
            headline = headline.substring(0, HEADLINE_MAX_LENGTH);
        }
        return headline;
    }

    /**
     * Compute the headline of an HTML file, by reading the file
     * and finding everything between the <title> and </title>.
     * Lines of a multi-line title are joined with single spaces.
     *
     * If the closing tag has not been seen by the time
     * HEADLINE_MAX_LENGTH chars have been collected, reading stops and
     * what was collected so far is returned.  Text is only ever appended,
     * so the first HEADLINE_MAX_LENGTH chars -- all that
     * generateHeadline() keeps -- are the same as if the whole file had
     * been read.
     *
     * The file is always closed, also when reading fails.
     *
     * @return the title text, or null if no HTML title was found
     * @throws IOException if the file cannot be opened or read
     */
    private static String getHTMLHeadline(String filename) throws IOException {
        FileInputStream filein = new FileInputStream(filename);
        try {
            // DataInputStream.readLine() maps each byte to one char, which
            // matches how writeToStream() later writes the headline back
            // out.  The buffer avoids one read call per byte.
            DataInputStream in =
                    new DataInputStream(new BufferedInputStream(filein));
            String line;
            String tmpline;
            String result = null;
            int pos;

            // Look for the line that opens the title.
            // NOTE(review): IndexingInputStream.downcase() is defined
            // elsewhere; positions found in its result are applied to the
            // original line, which assumes it preserves string length --
            // confirm.
            while ((line = in.readLine()) != null) {
                tmpline = IndexingInputStream.downcase(line);
                if ((pos = tmpline.indexOf(TITLE_START)) != -1) {
                    // Got the title!  Save the rest of this line
                    result = line.substring(pos + TITLE_START.length());
                    break;
                }
            }
            if (result == null) {
                return null;
            }

            // If we already have TITLE_END, we're done.
            tmpline = IndexingInputStream.downcase(result);
            if ((pos = tmpline.indexOf(TITLE_END)) != -1) {
                return result.substring(0, pos);
            }

            // Ok, keep reading more lines, looking for TITLE_END
            while (result.length() < HEADLINE_MAX_LENGTH
                    && (line = in.readLine()) != null) {
                tmpline = IndexingInputStream.downcase(line);
                if ((pos = tmpline.indexOf(TITLE_END)) != -1) {
                    // This line contained TITLE_END.
                    // Add a space to result unless we're at
                    // the very beginning or end of the title
                    if ((result.length() != 0) && (pos != 0)) {
                        result += " ";
                    }
                    // And append anything *before* TITLE_END
                    result += line.substring(0, pos);
                    break;
                }
                // Add a space to result unless we're at
                // the very beginning of the title,
                // or if this line is empty
                if ((result.length() != 0) && (line.length() != 0)) {
                    result += " ";
                }
                // Append this whole line to result
                result += line;
            }
            return result;
        } finally {
            filein.close();
        }
    }

    // FUTURE: getNewsHeadline(String filename) for a NEWS_TYPE document.
    // Plan: read through the lines of the file looking for the "from:"
    // and "subject:" header fields, then construct a nice-looking
    // headline from them, maybe something like
    // "Subject of this Article [fromaddress@machine.sun.com]".
    // Should return null if no title could be constructed.
}