Java Programmer's Toolkit

home *** CD-ROM | disk | FTP | other *** search

/ Java Programmer's Toolkit / Java Programmer's Toolkit.iso / hotjava / classsrc / net / www / html / parser~2.jav < prev next >

Wrap

Text File | 1995-08-11 | 15.7 KB | 631 lines

/* * @(#)Parser.java 1.53 95/05/10 Jonathan Payne * * Copyright (c) 1994 Sun Microsystems, Inc. All Rights Reserved. * * Permission to use, copy, modify, and distribute this software * and its documentation for NON-COMMERCIAL purposes and without * fee is hereby granted provided that this copyright notice * appears in all copies. Please refer to the file "copyright.html" * for further important copyright and licensing information. * * SUN MAKES NO REPRESENTATIONS OR WARRANTIES ABOUT THE SUITABILITY OF * THE SOFTWARE, EITHER EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED * TO THE IMPLIED WARRANTIES OF MERCHANTABILITY, FITNESS FOR A * PARTICULAR PURPOSE, OR NON-INFRINGEMENT. SUN SHALL NOT BE LIABLE FOR * ANY DAMAGES SUFFERED BY LICENSEE AS A RESULT OF USING, MODIFYING OR * DISTRIBUTING THIS SOFTWARE OR ITS DERIVATIVES. */ package net.www.html; import java.util.*; import java.io.*; /** Net.www.html.Parser takes an input stream and parses it for html tags. It produces an instance of net.www.html.Document as a result. */ public class Parser { final boolean debug = false; static Hashtable ampChars = new Hashtable(); static { ampChars.put("lt", new Character('<')); ampChars.put("gt", new Character('>')); ampChars.put("amp", new Character('&')); ampChars.put("quot", new Character('"')); ampChars.put("nbsp", new Character(' ')); /* remind - incorrect */ ampChars.put("shy", new Character('-')); /* remind - incorrect */ /* NOTE: These are case SENSITIVE, e.g., AElig and aelig. */ ampChars.put("AElig", new Character(198)); ampChars.put("Aacute", new Character(193)); ampChars.put("Acirc", new Character(194)); ampChars.put("Agrave", new Character(192)); ampChars.put("Aring", new Character(197)); ampChars.put("Atilde", new Character(195)); ampChars.put("Auml", new Character(196)); ampChars.put("Ccedil", new Character(199)); ampChars.put("ETH", new Character(208)); ampChars.put("Eacute", new Character(201)); ampChars.put("Ecirc", new Character(202)); ampChars.put("Egrave", new Character(200)); ampChars.put("Euml", new Character(203)); ampChars.put("Iacute", new Character(205)); ampChars.put("Icirc", new Character(206)); ampChars.put("Igrave", new Character(204)); ampChars.put("Iuml", new Character(207)); ampChars.put("Ntilde", new Character(209)); ampChars.put("Oacute", new Character(211)); ampChars.put("Ocirc", new Character(212)); ampChars.put("Ograve", new Character(210)); ampChars.put("Oslash", new Character(216)); ampChars.put("Otilde", new Character(213)); ampChars.put("Ouml", new Character(214)); ampChars.put("THORN", new Character(222)); ampChars.put("Uacute", new Character(218)); ampChars.put("Ucirc", new Character(219)); ampChars.put("Ugrave", new Character(217)); ampChars.put("Uuml", new Character(220)); ampChars.put("Yacute", new Character(221)); ampChars.put("aacute", new Character(225)); ampChars.put("acirc", new Character(226)); ampChars.put("aelig", new Character(230)); ampChars.put("agrave", new Character(224)); ampChars.put("aring", new Character(229)); ampChars.put("atilde", new Character(227)); ampChars.put("auml", new Character(228)); ampChars.put("ccedil", new Character(231)); ampChars.put("eacute", new Character(233)); ampChars.put("ecirc", new Character(234)); ampChars.put("egrave", new Character(232)); ampChars.put("eth", new Character(240)); ampChars.put("euml", new Character(235)); ampChars.put("iacute", new Character(237)); ampChars.put("icirc", new Character(238)); ampChars.put("igrave", new Character(236)); ampChars.put("iuml", new Character(239)); ampChars.put("ntilde", new Character(241)); ampChars.put("oacute", new Character(243)); ampChars.put("ocirc", new Character(244)); ampChars.put("ograve", new Character(242)); ampChars.put("oslash", new Character(248)); ampChars.put("otilde", new Character(245)); ampChars.put("ouml", new Character(246)); ampChars.put("szlig", new Character(223)); ampChars.put("thorn", new Character(254)); ampChars.put("uacute", new Character(250)); ampChars.put("ucirc", new Character(251)); ampChars.put("ugrave", new Character(249)); ampChars.put("uuml", new Character(252)); ampChars.put("yacute", new Character(253)); ampChars.put("yuml", new Character(255)); ampChars.put("copy", new Character(169)); ampChars.put("reg", new Character(174)); } /** usually the same as input except for ISINDEX (see below) */ byte output[]; /** input string we're parsing */ byte input[]; /** input length */ int inputLength; /** current position in input */ int inputSeek; /** current line number */ int lineCount = 1; /** Document we're building. */ Document html; /** Read a stream of bytes into a String object as quickly as possible. */ private void readInput(InputStream in) { input = new byte[1024*8]; output = input; inputLength = 0; int n; while ((n = in.read(input, inputLength, input.length - inputLength)) >= 0) { inputLength += n; if (inputLength == input.length) { byte newinput[] = new byte[inputLength * 2]; System.arraycopy(input, 0, newinput, 0, inputLength); input = newinput; } } } final int nextChar() { if (inputSeek >= inputLength) { return -1; } int c = input[inputSeek++]; switch (c) { case '\r': if (inputSeek >= inputLength) { lineCount++; return '\n'; } c = input[inputSeek++]; if (c != '\n') { inputSeek--; c = '\n'; } if (c == '\n') { lineCount++; } break; case '\n': lineCount++; break; } return c; } final int peekChar() { return (inputSeek >= inputLength) ? -1 : input[inputSeek]; } final void pushBack() { if (input[--inputSeek] == '\n') { lineCount--; } } final void skipWhiteSpace() { int c; while ((c = nextChar()) == ' ' || c == '\n' || c == '\t') ; pushBack(); } final boolean isWhiteSpace(int c) { return (c == ' ' || c == '\t' || c == '\n'); } void skipUntil(int what) { int c; while ((c = nextChar()) != what) { if (c == -1) break; } pushBack(); } final boolean isLetter(int c) { return (c >= 'A' && c <= 'Z' || c >= 'a' && c <= 'z'); } final boolean isDigit(int c) { return (c >= '0' && c <= '9'); } final boolean isTagChar(int c) { return (isLetter(c) || isDigit(c) || c == '.' || c == '-'); } int parseCharacter() { int val = 0; int c; insistThat(nextChar() == '#'); while (isDigit(c = nextChar())) { val = val * 10 + c - '0'; } if (c != ';') { pushBack(); } return val; } int parseEntity() { Character ch; String name; int start = inputSeek; int c; if (!isLetter(peekChar())) { return '&'; } while ((c = nextChar()) != -1 && isLetter(c)) ; int lastc = c; name = new String(input, 0, start, inputSeek - start - 1); if ((ch = (Character) ampChars.get(name)) != null) { c = ch.charValue(); } else { name = name.toLowerCase(); if ((ch = (Character) ampChars.get(name)) != null) { c = ch.charValue(); } else { warning("Warning: failed to find: &" + name); c = -1; } } if (lastc != ';') { pushBack(); } return c; } String makeLowerCaseString(byte str[], int start, int len) { return new String(str, 0, start, len).toLowerCase(); } String parseTagName() { int c; int start = inputSeek; if ((c = nextChar()) == '!') { /* This is a comment, as far as mosaic is concerned, so we just eat up all the characters until the '>', and return 0 (which means ignore). */ skipUntil('>'); return "<comment>"; } else if (!isTagChar(c)) { pushBack(); /* c */ return null; } /* Read tag name until end. Don't complain about illegal tag names, because mosaic doesn't. */ while ((c = nextChar()) != -1 && isTagChar(c)) ; pushBack(); /* push back the delimitor */ if (inputSeek - start == 0) { return null; } return makeLowerCaseString(input, start, inputSeek - start); } final void warning(String msg) { if (debug) { System.out.print("Warning (line " + lineCount + "): "); System.out.println(msg); } } private String delim = "> ="; void parseAttributes(TagRef ref) { int start; int c; String name; String value; while (true) { skipWhiteSpace(); start = inputSeek; while ((c = nextChar()) != -1 && c != '>' && c != ' ' && c != '=' && c != '\n' && c != '\t') ; pushBack(); if (start == inputSeek) { name = null; } else { name = makeLowerCaseString(input, start, inputSeek - start); } skipWhiteSpace(); if (name == null) { return; } if (peekChar() == '=') { nextChar(); skipWhiteSpace(); c = nextChar(); int match, cnt; if (c != '\'' && c != '"') { pushBack(); start = inputSeek; cnt = 0; while ((c = nextChar()) != -1 && c != ' ' && c != '\t' && c != '\n' && c != '>') ; } else { match = c; start = inputSeek; cnt = 0; while ((c = nextChar()) != -1 && c != match && c != '>') ; } if (c == -1) { warning("unexpected EOF"); return; } value = new String(input, 0, start, inputSeek - start - 1); if (c == '>') { pushBack(); } } else { value = new String("true"); } // System.out.println(name + ":" + value + ", line = " + lineCount); if (ref != null) { ref.addAttribute(name, value); } } } void insistThat(boolean expr) { if (!expr) { throw new Exception("assertion failed: " + lineCount); } } Stack tagStack = new Stack(); boolean handleTag(Tag tag, boolean isEnd, int offset) { if (isEnd) { Tag tos; try { tos = (Tag) tagStack.peek(); if (tos != tag) { if (tagStack.search(tag) == -1) { warning("Ignoring tag: </" + tag.name + ">"); return false; /* ignore this tag completely */ } else { while (true) { Tag t = (Tag) tagStack.pop(); if (t != tag) { warning("Missing </" + t.name + "> just noticed by </" + tag.name + ">"); html.endTag(t, offset); } else { break; } } } } else { if (tag.id == Tag.PRE) { preFormatted = false; } tagStack.pop(); } } catch (EmptyStackException e) { warning("Ignoring tag: </" + tag.name + ">"); return false; } } else if (tag.hasEndTag) { if (tag.id == Tag.PRE) { preFormatted = true; } tagStack.push(tag); } return true; } boolean preFormatted = false; static private Tag FORMtag = Tag.lookup("form"); static private Tag INPUTtag = Tag.lookup("input"); static private Tag HRtag = Tag.lookup("hr"); void parse() { int textIndex = 0; int c; Tag lastTag = null; boolean wasWhite = false; boolean textSinceLastBreak = false; mainloop: while ((c = nextChar()) != -1) { /* In this switch statement, a break means we've read a character suitable for inserting into the document. Continue means we've read some sort of markup. */ switch (c) { case '<': { boolean isEnd; String tagName; TagRef ref; Tag newTag; if ((c = peekChar()) == '/') { nextChar(); /* read it */ isEnd = true; } else { isEnd = false; } tagName = parseTagName(); if (tagName == null) { /* wasn't a real tag */ c = '<'; break; } newTag = Tag.lookup(tagName); ref = null; if (handleTag(newTag, isEnd, textIndex)) { /* Html spec says newline right before and just after a tag is markup and should be deleted. */ if (isEnd && newTag.hasEndTag && lastTag == null /* so we only eat one newline */ /* && newTag.breaks */) { if ((textIndex > 0) && (input[textIndex-1] == '\n')) { textIndex--; } } if (!isEnd) { ref = html.startTag(newTag, textIndex); } else { ref = html.endTag(newTag, textIndex); } lastTag = newTag; if (newTag.breaks) { textSinceLastBreak = false; } } if (!isEnd) { parseAttributes(ref); } else { skipUntil('>'); } if (nextChar() != '>') { warning("Malformed tag: " + lastTag); } if ((ref != null) && (ref.tag.id == Tag.ISINDEX)) { String prompt = ref.getAttribute("prompt"); int oldTextIndex = textIndex; TagRef fref = html.startTag(FORMtag, textIndex); String action = ref.getAttribute("action"); if (action != null) { fref.addAttribute("action", action); } html.startTag(HRtag, textIndex); if (prompt == null) { // At this point since the prompt wasn't in // the input buffer we need to handle the case // where we don't have enough space in the // buffer so we need to make a larger buffer // copy the first part of the input buffer, // then the prompt and the rest of the input // buffer into it. prompt = "This is a searchable index. Enter search keywords: "; output = new byte[input.length + prompt.length() + 1]; inputLength += prompt.length() + 1; System.arraycopy(input, 0, output, 0, textIndex); for (int i = 0 ; i < prompt.length() ; i++) { output[textIndex++] = (byte)prompt.charAt(i); } System.arraycopy(input, oldTextIndex - 1, output, textIndex, input.length - oldTextIndex); inputSeek += prompt.length() + 1; input = output; } else { for (int i = 0 ; i < prompt.length() ; i++) { input[textIndex++] = (byte)prompt.charAt(i); } } fref = html.startTag(INPUTtag, textIndex); fref.addAttribute("name", "isindex"); html.endTag(HRtag, textIndex); html.endTag(FORMtag, textIndex); } if (debug) { if (ref != null) { System.out.println("Line " + lineCount + ": " + ref.toExternalForm()); } } /* Html spec says newline right before and just after a tag is markup and should be deleted. */ if (!isEnd && newTag.hasEndTag && newTag.breaks && peekChar() == '\n') { nextChar(); } if (lastTag != null && lastTag.id == Tag.PLAINTEXT) { textIndex = inputLength - inputSeek; System.arraycopy(input, inputSeek, input, 0, textIndex); break mainloop; } wasWhite = false; continue; } case '&': if (peekChar() == '#') { c = parseCharacter(); } else { if ((c = parseEntity()) == -1) { continue; } } wasWhite = false; break; case '\n': case '\t': if (!preFormatted) { c = ' '; } /* falls into ... */ case ' ': if (!preFormatted) { if (!textSinceLastBreak /* (lastTag != null && lastTag.breaks) */ || wasWhite) { continue; } } wasWhite = true; break; default: wasWhite = false; break; } lastTag = null; textSinceLastBreak = true; //html.addCharacter(c); input[textIndex++] = (byte)c; } if (tagStack.size() != 0) { String error = "Missing "; int i = tagStack.size(); Tag tag; while (--i > 1) { tag = (Tag) tagStack.pop(); html.endTag(tag, textIndex); error = error + "</" + tag.name + ">, "; } tag = (Tag) tagStack.pop(); html.endTag(tag, textIndex); error = error + "</" + tag.name + ">"; warning(error + " at end of document.\n"); } tagStack = null; if (input.length != textIndex) { byte newinput[] = new byte[textIndex]; System.arraycopy(input, 0, newinput, 0, textIndex); input = newinput; } html.setText(input); } public Parser(InputStream is, Document html) { readInput(is); this.html = html; try { parse(); } catch (Exception e) { warning("Caught exception while parsing\n"); e.printStackTrace(); } } static public void main(String args[]) { URL url = new URL(null, args[0]); Parser p = new Parser(url.openStream(), new Document()); } }