Chip 2002 March

home *** CD-ROM | disk | FTP | other *** search

/ Chip 2002 March / Chip_2002-03_cd1.bin / internet / kosek / xml / xt-czech / Encoding.java next >

Wrap

Text File | 2000-10-30 | 91KB | 2,989 lines

package com.jclark.xml.tok;

/**
 * An <code>Encoding</code> object corresponds to a possible
 * encoding (a mapping from characters to sequences of bytes).
 * It provides operations on byte arrays
 * that represent all or part of a parsed XML entity in that encoding.
 * <p>
 * The set of ASCII characters excluding <code>$@\^`{}~</code>
 * have a special status; these are called <i>XML significant</i>
 * characters.
 * <p>
 * This class imposes certain restrictions on an encoding:
 * <ul>
 * <li>the encoding must be stateless;
 * <li>a single byte must not encode more than one character;
 * <li>all XML significant characters must be encoded by the same number
 * of bytes, and no character may be encoded by fewer bytes.
 * </ul>
 * <p>
 * Several methods operate on byte subarrays.  The subarray is specified
 * by a byte array <code>buf</code> and two integers,
 * <code>off</code> and <code>end</code>; <code>off</code>
 * gives the index in <code>buf</code> of the first byte of the subarray
 * and <code>end</code> gives the
 * index in <code>buf</code> of the byte immediately after the last byte.
 * <p>
 * Use the <code>getInitialEncoding</code> method to get an
 * <code>Encoding</code> object to use to start parsing an entity.
 * <p>
 * The main operations provided by <code>Encoding</code> are
 * <code>tokenizeProlog</code>, <code>tokenizeContent</code> and
 * <code>tokenizeCdataSection</code>;
 * these are used to divide up an XML entity into tokens.
 * <code>tokenizeProlog</code> is used for the prolog of an XML document
 * as well as for the external subset and parameter entities (except
 * when referenced in an <code>EntityValue</code>);
 * it can also be used for parsing the <code>Misc</code>* that follows
 * the document element.
 * <code>tokenizeContent</code> is used for the document element and for
 * parsed general entities that are referenced in <code>content</code>
 * except for CDATA sections.
 * <code>tokenizeCdataSection</code> is used for CDATA sections, following
 * the <code>&lt;![CDATA[</code> up to and including the <code>]]&gt;</code>.
 * <p>
 * <code>tokenizeAttributeValue</code> and <code>tokenizeEntityValue</code>
 * are used to further divide up tokens returned by <code>tokenizeProlog</code>
 * and <code>tokenizeContent</code>; they are also used to divide up entities
 * referenced in attribute values or entity values.
 * <p>
 * The <code>TOK_*</code> token codes below are consecutive integers:
 * each one is defined as the previous constant plus one, starting from
 * <code>TOK_DATA_CHARS</code> which is 0.
 *
 * @version $Revision: 1.1 $ $Date: 2000/01/18 13:06:33 $
 */
public abstract class Encoding {
  /**
   * Represents one or more characters of data.
   */
  public static final int TOK_DATA_CHARS = 0;

  /**
   * Represents a newline (CR, LF or CR followed by LF) in data.
   */
  public static final int TOK_DATA_NEWLINE = TOK_DATA_CHARS + 1;

  /**
   * Represents a complete start-tag <code>&lt;name&gt;</code>,
   * that doesn't have any attribute specifications.
   */
  public static final int TOK_START_TAG_NO_ATTS = TOK_DATA_NEWLINE + 1;

  /**
   * Represents a complete start-tag <code>&lt;name att="val"&gt;</code>,
   * that contains one or more attribute specifications.
   */
  public static final int TOK_START_TAG_WITH_ATTS = TOK_START_TAG_NO_ATTS + 1;

  /**
   * Represents an empty element tag <code>&lt;name/&gt;</code>,
   * that doesn't have any attribute specifications.
   */
  public static final int TOK_EMPTY_ELEMENT_NO_ATTS = TOK_START_TAG_WITH_ATTS + 1;

  /**
   * Represents an empty element tag <code>&lt;name att="val"/&gt;</code>,
   * that contains one or more attribute specifications.
   */
  public static final int TOK_EMPTY_ELEMENT_WITH_ATTS = TOK_EMPTY_ELEMENT_NO_ATTS + 1;

  /**
   * Represents a complete end-tag <code>&lt;/name&gt;</code>.
   */
  public static final int TOK_END_TAG = TOK_EMPTY_ELEMENT_WITH_ATTS + 1;

  /**
   * Represents the start of a CDATA section <code>&lt;![CDATA[</code>.
   */
  public static final int TOK_CDATA_SECT_OPEN = TOK_END_TAG + 1;

  /**
   * Represents the end of a CDATA section <code>]]&gt;</code>.
   */
  public static final int TOK_CDATA_SECT_CLOSE = TOK_CDATA_SECT_OPEN + 1;

  /**
   * Represents a general entity reference.
   */
  public static final int TOK_ENTITY_REF = TOK_CDATA_SECT_CLOSE + 1;

  /**
   * Represents a general entity reference to one of the 5 predefined
   * entities <code>amp</code>, <code>lt</code>, <code>gt</code>,
   * <code>quot</code>, <code>apos</code>.
   */
  public static final int TOK_MAGIC_ENTITY_REF = TOK_ENTITY_REF + 1;

  /**
   * Represents a numeric character reference (decimal or hexadecimal),
   * when the referenced character is less than or equal to 0xFFFF
   * and so is represented by a single char.
   */
  public static final int TOK_CHAR_REF = TOK_MAGIC_ENTITY_REF + 1;

  /**
   * Represents a numeric character reference (decimal or hexadecimal),
   * when the referenced character is greater than 0xFFFF and so is
   * represented by a pair of chars.
   */
  public static final int TOK_CHAR_PAIR_REF = TOK_CHAR_REF + 1;

  /**
   * Represents a processing instruction.
   */
  public static final int TOK_PI = TOK_CHAR_PAIR_REF + 1;

  /**
   * Represents an XML declaration or text declaration (a processing
   * instruction whose target is <code>xml</code>).
   */
  public static final int TOK_XML_DECL = TOK_PI + 1;

  /**
   * Represents a comment <code>&lt;!-- ... --&gt;</code>.
   * This can occur both in the prolog and in content.
   */
  public static final int TOK_COMMENT = TOK_XML_DECL + 1;

  /**
   * Represents a white space character in an attribute value,
   * excluding white space characters that are part of line boundaries.
   */
  public static final int TOK_ATTRIBUTE_VALUE_S = TOK_COMMENT + 1;

  /**
   * Represents a parameter entity reference in the prolog.
   */
  public static final int TOK_PARAM_ENTITY_REF = TOK_ATTRIBUTE_VALUE_S + 1;

  /**
   * Represents whitespace in the prolog.
   * The token contains one or more whitespace characters.
   */
  public static final int TOK_PROLOG_S = TOK_PARAM_ENTITY_REF + 1;

  /**
   * Represents <code>&lt;!NAME</code> in the prolog.
   */
  public static final int TOK_DECL_OPEN = TOK_PROLOG_S + 1;

  /**
   * Represents <code>&gt;</code> in the prolog.
   */
  public static final int TOK_DECL_CLOSE = TOK_DECL_OPEN + 1;

  /**
   * Represents a name in the prolog.
   */
  public static final int TOK_NAME = TOK_DECL_CLOSE + 1;

  /**
   * Represents a name token in the prolog that is not a name.
   */
  public static final int TOK_NMTOKEN = TOK_NAME + 1;

  /**
   * Represents <code>#NAME</code> in the prolog.
   */
  public static final int TOK_POUND_NAME = TOK_NMTOKEN + 1;

  /**
   * Represents <code>|</code> in the prolog.
   */
  public static final int TOK_OR = TOK_POUND_NAME + 1;

  /**
   * Represents a <code>%</code> in the prolog that does not start
   * a parameter entity reference.
   * This can occur in an entity declaration.
   */
  public static final int TOK_PERCENT = TOK_OR + 1;

  /**
   * Represents a <code>(</code> in the prolog.
   */
  public static final int TOK_OPEN_PAREN = TOK_PERCENT + 1;

  /**
   * Represents a <code>)</code> in the prolog that is not
   * followed immediately by any of
   * <code>*</code>, <code>+</code> or <code>?</code>.
   */
  public static final int TOK_CLOSE_PAREN = TOK_OPEN_PAREN + 1;

  /**
   * Represents <code>[</code> in the prolog.
   */
  public static final int TOK_OPEN_BRACKET = TOK_CLOSE_PAREN + 1;

  /**
   * Represents <code>]</code> in the prolog.
   */
  public static final int TOK_CLOSE_BRACKET = TOK_OPEN_BRACKET + 1;

  /**
   * Represents a literal (EntityValue, AttValue, SystemLiteral or
   * PubidLiteral).
   */
  public static final int TOK_LITERAL = TOK_CLOSE_BRACKET + 1;

  /**
   * Represents a name followed immediately by <code>?</code>.
   */
  public static final int TOK_NAME_QUESTION = TOK_LITERAL + 1;

  /**
   * Represents a name followed immediately by <code>*</code>.
   */
  public static final int TOK_NAME_ASTERISK = TOK_NAME_QUESTION + 1;

  /**
   * Represents a name followed immediately by <code>+</code>.
   */
  public static final int TOK_NAME_PLUS = TOK_NAME_ASTERISK + 1;

  /**
   * Represents <code>&lt;![</code> in the prolog.
   */
  public static final int TOK_COND_SECT_OPEN = TOK_NAME_PLUS + 1;

  /**
   * Represents <code>]]&gt;</code> in the prolog.
   */
  public static final int TOK_COND_SECT_CLOSE = TOK_COND_SECT_OPEN + 1;

  /**
   * Represents <code>)?</code> in the prolog.
   */
  public static final int TOK_CLOSE_PAREN_QUESTION = TOK_COND_SECT_CLOSE + 1;

  /**
   * Represents <code>)*</code> in the prolog.
   */
  public static final int TOK_CLOSE_PAREN_ASTERISK = TOK_CLOSE_PAREN_QUESTION + 1;

  /**
   * Represents <code>)+</code> in the prolog.
   */
  public static final int TOK_CLOSE_PAREN_PLUS = TOK_CLOSE_PAREN_ASTERISK + 1;

  /**
   * Represents <code>,</code> in the prolog.
   */
  public static final int TOK_COMMA = TOK_CLOSE_PAREN_PLUS + 1;

  /**
   * Convert bytes to characters.
   * The bytes in <code>sourceBuf</code> between <code>sourceStart</code>
   * and <code>sourceEnd</code> are converted to characters and stored
   * in <code>targetBuf</code> starting at <code>targetStart</code>.
   * <code>(targetBuf.length - targetStart) * getMinBytesPerChar()</code>
   * must be greater than or equal to
   * <code>sourceEnd - sourceStart</code>.
   * If <code>getFixedBytesPerChar</code> returns a value greater than 0,
   * then the return value will be equal to
   * <code>(sourceEnd - sourceStart)/getFixedBytesPerChar()</code>.
   * @return the number of characters stored into <code>targetBuf</code>
   * @see #getFixedBytesPerChar
   */
  public abstract int convert(byte[] sourceBuf,
                              int sourceStart, int sourceEnd,
                              char[] targetBuf, int targetStart);

  /**
   * Returns the number of bytes required to represent each <code>char</code>,
   * or zero if different <code>char</code>s are represented by different
   * numbers of bytes. The value returned will 0, 1, 2, or 4.
*/
  public abstract int getFixedBytesPerChar();

  // Lazily created shared instances, one per supported encoding.
  // They are only assigned inside the synchronized getEncoding method.
  private static Encoding utf8Encoding;
  private static Encoding utf16LittleEndianEncoding;
  private static Encoding utf16BigEndianEncoding;
  private static Encoding internalEncoding;
  private static Encoding iso8859_1Encoding;
  private static Encoding asciiEncoding;
  private static Encoding windows1250Encoding;
  private static Encoding iso8859_2Encoding;

  // Codes accepted by getEncoding to select one of the instances above.
  private static final byte UTF8_ENCODING = 0;
  private static final byte UTF16_LITTLE_ENDIAN_ENCODING = 1;
  private static final byte UTF16_BIG_ENDIAN_ENCODING = 2;
  private static final byte INTERNAL_ENCODING = 3;
  private static final byte ISO8859_1_ENCODING = 4;
  private static final byte ASCII_ENCODING = 5;
  private static final byte WINDOWS1250_ENCODING = 6;
  private static final byte ISO8859_2_ENCODING = 7;

  // Encoding vector for windows-1250 encoding
  // This encoding is used by Windows for Central European languages
  //
  // The string has 256 characters; the character at index b is the Unicode
  // character for byte value b (16 entries per source row).
  // Bytes 0x81, 0x83, 0x88, 0x90 and 0x98 are unassigned in windows-1250 and
  // are mapped to U+FFFD (presumably rejected by SingleByteEncoding, which is
  // not visible here -- confirm).
  // Byte 0xAC is NOT SIGN (U+00AC); it must not be mapped to U+FFFD.
  private static final String windows1250MapString =
    "\0\u0001\u0002\u0003\u0004\u0005\u0006\u0007\b\t\n\u000b\f\r\u000e\u000f" +
    "\u0010\u0011\u0012\u0013\u0014\u0015\u0016\u0017\u0018\u0019\u001a\u001b\u001c\u001d\u001e\u001f" +
    "\u0020\u0021\"\u0023\u0024\u0025\u0026\'\u0028\u0029\u002a\u002b\u002c\u002d\u002e\u002f" +
    "\u0030\u0031\u0032\u0033\u0034\u0035\u0036\u0037\u0038\u0039\u003a\u003b\u003c\u003d\u003e\u003f" +
    "\u0040\u0041\u0042\u0043\u0044\u0045\u0046\u0047\u0048\u0049\u004a\u004b\u004c\u004d\u004e\u004f" +
    "\u0050\u0051\u0052\u0053\u0054\u0055\u0056\u0057\u0058\u0059\u005a\u005b\\\u005d\u005e\u005f" +
    "\u0060\u0061\u0062\u0063\u0064\u0065\u0066\u0067\u0068\u0069\u006a\u006b\u006c\u006d\u006e\u006f" +
    "\u0070\u0071\u0072\u0073\u0074\u0075\u0076\u0077\u0078\u0079\u007a\u007b\u007c\u007d\u007e\u007f" +
    "\u20ac\ufffd\u201a\ufffd\u201e\u2026\u2020\u2021\ufffd\u2030\u0160\u2039\u015a\u0164\u017d\u0179" +
    "\ufffd\u2018\u2019\u201c\u201d\u2022\u2013\u2014\ufffd\u2122\u0161\u203a\u015b\u0165\u017e\u017a" +
    "\u00a0\u02c7\u02d8\u0141\u00a4\u0104\u00a6\u00a7\u00a8\u00a9\u015e\u00ab\u00ac\u00ad\u00ae\u017b" +
    "\u00b0\u00b1\u02db\u0142\u00b4\u00b5\u00b6\u00b7\u00b8\u0105\u015f\u00bb\u013d\u02dd\u013e\u017c" +
    "\u0154\u00c1\u00c2\u0102\u00c4\u0139\u0106\u00c7\u010c\u00c9\u0118\u00cb\u011a\u00cd\u00ce\u010e" +
    "\u0110\u0143\u0147\u00d3\u00d4\u0150\u00d6\u00d7\u0158\u016e\u00da\u0170\u00dc\u00dd\u0162\u00df" +
    "\u0155\u00e1\u00e2\u0103\u00e4\u013a\u0107\u00e7\u010d\u00e9\u0119\u00eb\u011b\u00ed\u00ee\u010f" +
    "\u0111\u0144\u0148\u00f3\u00f4\u0151\u00f6\u00f7\u0159\u016f\u00fa\u0171\u00fc\u00fd\u0163\u02d9";

  // Encoding vector for ISO 8859-2 encoding
  // This encoding is ISO standard for Central European languages
  //
  // Same layout as the windows-1250 vector: 256 characters, indexed by
  // byte value.  Bytes 0x80-0x9F map to the C1 control characters.
  private static final String iso8859_2MapString =
    "\0\u0001\u0002\u0003\u0004\u0005\u0006\u0007\b\t\n\u000b\f\r\u000e\u000f" +
    "\u0010\u0011\u0012\u0013\u0014\u0015\u0016\u0017\u0018\u0019\u001a\u001b\u001c\u001d\u001e\u001f" +
    "\u0020\u0021\"\u0023\u0024\u0025\u0026\'\u0028\u0029\u002a\u002b\u002c\u002d\u002e\u002f" +
    "\u0030\u0031\u0032\u0033\u0034\u0035\u0036\u0037\u0038\u0039\u003a\u003b\u003c\u003d\u003e\u003f" +
    "\u0040\u0041\u0042\u0043\u0044\u0045\u0046\u0047\u0048\u0049\u004a\u004b\u004c\u004d\u004e\u004f" +
    "\u0050\u0051\u0052\u0053\u0054\u0055\u0056\u0057\u0058\u0059\u005a\u005b\\\u005d\u005e\u005f" +
    "\u0060\u0061\u0062\u0063\u0064\u0065\u0066\u0067\u0068\u0069\u006a\u006b\u006c\u006d\u006e\u006f" +
    "\u0070\u0071\u0072\u0073\u0074\u0075\u0076\u0077\u0078\u0079\u007a\u007b\u007c\u007d\u007e\u007f" +
    "\u0080\u0081\u0082\u0083\u0084\u0085\u0086\u0087\u0088\u0089\u008a\u008b\u008c\u008d\u008e\u008f" +
    "\u0090\u0091\u0092\u0093\u0094\u0095\u0096\u0097\u0098\u0099\u009a\u009b\u009c\u009d\u009e\u009f" +
    "\u00a0\u0104\u02d8\u0141\u00a4\u013d\u015a\u00a7\u00a8\u0160\u015e\u0164\u0179\u00ad\u017d\u017b" +
    "\u00b0\u0105\u02db\u0142\u00b4\u013e\u015b\u02c7\u00b8\u0161\u015f\u0165\u017a\u02dd\u017e\u017c" +
    "\u0154\u00c1\u00c2\u0102\u00c4\u0139\u0106\u00c7\u010c\u00c9\u0118\u00cb\u011a\u00cd\u00ce\u010e" +
    "\u0110\u0143\u0147\u00d3\u00d4\u0150\u00d6\u00d7\u0158\u016e\u00da\u0170\u00dc\u00dd\u0162\u00df" +
    "\u0155\u00e1\u00e2\u0103\u00e4\u013a\u0107\u00e7\u010d\u00e9\u0119\u00eb\u011b\u00ed\u00ee\u010f" +
    "\u0111\u0144\u0148\u00f3\u00f4\u0151\u00f6\u00f7\u0159\u016f\u00fa\u0171\u00fc\u00fd\u0163\u02d9";

  /**
   * Returns the shared <code>Encoding</code> instance for an encoding code,
   * creating it on first use.  The method is synchronized, so each
   * instance is created at most once.
   *
   * @param enc one of the <code>*_ENCODING</code> byte codes of this class
   * @return the cached instance for <code>enc</code>, or <code>null</code>
   * if <code>enc</code> is not one of the known codes
   */
  private static synchronized Encoding getEncoding(byte enc) {
    switch (enc) {
    case UTF8_ENCODING:
      if (utf8Encoding == null)
        utf8Encoding = new UTF8Encoding();
      return utf8Encoding;
    case UTF16_LITTLE_ENDIAN_ENCODING:
      if (utf16LittleEndianEncoding == null)
        utf16LittleEndianEncoding = new UTF16LittleEndianEncoding();
      return utf16LittleEndianEncoding;
    case UTF16_BIG_ENDIAN_ENCODING:
      if (utf16BigEndianEncoding == null)
        utf16BigEndianEncoding = new UTF16BigEndianEncoding();
      return utf16BigEndianEncoding;
    case INTERNAL_ENCODING:
      if (internalEncoding == null)
        internalEncoding = new InternalEncoding();
      return internalEncoding;
    case ISO8859_1_ENCODING:
      if (iso8859_1Encoding == null)
        iso8859_1Encoding = new ISO8859_1Encoding();
      return iso8859_1Encoding;
    case ASCII_ENCODING:
      if (asciiEncoding == null)
        asciiEncoding = new ASCIIEncoding();
      return asciiEncoding;
    case WINDOWS1250_ENCODING:
      // Table-driven: the map string supplies the byte-to-char mapping.
      if (windows1250Encoding == null)
        windows1250Encoding = new SingleByteEncoding(windows1250MapString);
      return windows1250Encoding;
    case ISO8859_2_ENCODING:
      if (iso8859_2Encoding == null)
        iso8859_2Encoding = new SingleByteEncoding(iso8859_2MapString);
      return iso8859_2Encoding;
    }
    return null;
  }

  /**
   * Returns the shared big-endian UTF-16 encoding instance.
   */
  Encoding getUTF16Encoding() {
    return getEncoding(UTF16_BIG_ENDIAN_ENCODING);
  }

  // Bytes with type < 0 may not be data in content.
  // The negation of the lead byte type gives the total number of bytes.
static final int BT_LEAD2 = -2;
  static final int BT_LEAD3 = BT_LEAD2 - 1;
  static final int BT_LEAD4 = BT_LEAD3 - 1;
  static final int BT_NONXML = BT_LEAD4 - 1;
  static final int BT_MALFORM = BT_NONXML - 1;
  static final int BT_LT = BT_MALFORM - 1;
  static final int BT_AMP = BT_LT - 1;
  static final int BT_RSQB = BT_AMP - 1;
  static final int BT_CR = BT_RSQB - 1;
  static final int BT_LF = BT_CR - 1;

  // Bytes with type >= 0 are treated as data in content.
  static final int BT_GT = 0;
  static final int BT_QUOT = BT_GT + 1;
  static final int BT_APOS = BT_QUOT + 1;
  static final int BT_EQUALS = BT_APOS + 1;
  static final int BT_QUEST = BT_EQUALS + 1;
  static final int BT_EXCL = BT_QUEST + 1;
  static final int BT_SOL = BT_EXCL + 1;
  static final int BT_SEMI = BT_SOL + 1;
  static final int BT_NUM = BT_SEMI + 1;
  static final int BT_LSQB = BT_NUM + 1;
  static final int BT_S = BT_LSQB + 1;
  static final int BT_NMSTRT = BT_S + 1;
  static final int BT_NAME = BT_NMSTRT + 1;
  static final int BT_MINUS = BT_NAME + 1;
  static final int BT_OTHER = BT_MINUS + 1;
  static final int BT_PERCNT = BT_OTHER + 1;
  static final int BT_LPAR = BT_PERCNT + 1;
  static final int BT_RPAR = BT_LPAR + 1;
  static final int BT_AST = BT_RPAR + 1;
  static final int BT_PLUS = BT_AST + 1;
  static final int BT_COMMA = BT_PLUS + 1;
  static final int BT_VERBAR = BT_COMMA + 1;

  // Byte type (one of the BT_ constants) for each of the 128 ASCII codes,
  // indexed by character code; four entries per row, the comment at the
  // start of each row gives the code of its first entry.
  final static byte[] asciiTypeTable = {
    /* 0x00 */ BT_NONXML, BT_NONXML, BT_NONXML, BT_NONXML,
    /* 0x04 */ BT_NONXML, BT_NONXML, BT_NONXML, BT_NONXML,
    /* 0x08 */ BT_NONXML, BT_S, BT_LF, BT_NONXML,
    /* 0x0C */ BT_NONXML, BT_CR, BT_NONXML, BT_NONXML,
    /* 0x10 */ BT_NONXML, BT_NONXML, BT_NONXML, BT_NONXML,
    /* 0x14 */ BT_NONXML, BT_NONXML, BT_NONXML, BT_NONXML,
    /* 0x18 */ BT_NONXML, BT_NONXML, BT_NONXML, BT_NONXML,
    /* 0x1C */ BT_NONXML, BT_NONXML, BT_NONXML, BT_NONXML,
    /* 0x20 */ BT_S, BT_EXCL, BT_QUOT, BT_NUM,
    /* 0x24 */ BT_OTHER, BT_PERCNT, BT_AMP, BT_APOS,
    /* 0x28 */ BT_LPAR, BT_RPAR, BT_AST, BT_PLUS,
    /* 0x2C */ BT_COMMA, BT_MINUS, BT_NAME, BT_SOL,
    /* 0x30 */ BT_NAME, BT_NAME, BT_NAME, BT_NAME,
    /* 0x34 */ BT_NAME, BT_NAME, BT_NAME, BT_NAME,
    /* 0x38 */ BT_NAME, BT_NAME, BT_NMSTRT, BT_SEMI,
    /* 0x3C */ BT_LT, BT_EQUALS, BT_GT, BT_QUEST,
    /* 0x40 */ BT_OTHER, BT_NMSTRT, BT_NMSTRT, BT_NMSTRT,
    /* 0x44 */ BT_NMSTRT, BT_NMSTRT, BT_NMSTRT, BT_NMSTRT,
    /* 0x48 */ BT_NMSTRT, BT_NMSTRT, BT_NMSTRT, BT_NMSTRT,
    /* 0x4C */ BT_NMSTRT, BT_NMSTRT, BT_NMSTRT, BT_NMSTRT,
    /* 0x50 */ BT_NMSTRT, BT_NMSTRT, BT_NMSTRT, BT_NMSTRT,
    /* 0x54 */ BT_NMSTRT, BT_NMSTRT, BT_NMSTRT, BT_NMSTRT,
    /* 0x58 */ BT_NMSTRT, BT_NMSTRT, BT_NMSTRT, BT_LSQB,
    /* 0x5C */ BT_OTHER, BT_RSQB, BT_OTHER, BT_NMSTRT,
    /* 0x60 */ BT_OTHER, BT_NMSTRT, BT_NMSTRT, BT_NMSTRT,
    /* 0x64 */ BT_NMSTRT, BT_NMSTRT, BT_NMSTRT, BT_NMSTRT,
    /* 0x68 */ BT_NMSTRT, BT_NMSTRT, BT_NMSTRT, BT_NMSTRT,
    /* 0x6C */ BT_NMSTRT, BT_NMSTRT, BT_NMSTRT, BT_NMSTRT,
    /* 0x70 */ BT_NMSTRT, BT_NMSTRT, BT_NMSTRT, BT_NMSTRT,
    /* 0x74 */ BT_NMSTRT, BT_NMSTRT, BT_NMSTRT, BT_NMSTRT,
    /* 0x78 */ BT_NMSTRT, BT_NMSTRT, BT_NMSTRT, BT_OTHER,
    /* 0x7C */ BT_VERBAR, BT_OTHER, BT_OTHER, BT_OTHER
  };

  // The minimum number of bytes per character.
  private /* final */ int minBPC;

  // Records the minimum number of bytes per character for this encoding;
  // the scanning methods of this class advance by minBPC for every
  // character that is not introduced by a BT_LEAD2/3/4 byte.
  Encoding(int minBPC) {
    this.minBPC = minBPC;
  }

  // There are guaranteed to be minBPC available bytes starting at off.
  // Returns one of the BT_ constants for the character starting at off.
  abstract int byteType(byte[] buf, int off);

  // Returns a value that the scanners compare against ASCII character
  // literals ('x', '0', ';', ...) for the character starting at off.
  abstract int byteToAscii(byte[] buf, int off);

  // This must only be called when c is an (XML significant) ASCII character.
abstract boolean charMatches(byte[] buf, int off, char c); // Called only when byteType(buf, off) == BT_LEAD2 int byteType2(byte[] buf, int off) { return BT_OTHER; } // Called only when byteType(buf, off) == BT_LEAD3 int byteType3(byte[] buf, int off) { return BT_OTHER; } // Called only when byteType(buf, off) == BT_LEAD4 int byteType4(byte[] buf, int off) { return BT_OTHER; } void check2(byte[] buf, int off) throws InvalidTokenException { } void check3(byte[] buf, int off) throws InvalidTokenException { } void check4(byte[] buf, int off) throws InvalidTokenException { } /** * Moves a position forward. * On entry, <code>pos</code> gives the position of the byte at index * <code>off</code> in <code>buf</code>. * On exit, it <code>pos</code> will give the position of the byte at index * <code>end</code>, which must be greater than or equal to <code>off</code>. * The bytes between <code>off</code> and <code>end</code> must encode * one or more complete characters. * A carriage return followed by a line feed will be treated as a single * line delimiter provided that they are given to <code>movePosition</code> * together. 
*/ public abstract void movePosition(byte[] buf, int off, int end, Position pos); // end encoding specific part private final void checkCharMatches(byte[] buf, int off, char c) throws InvalidTokenException { if (!charMatches(buf, off, c)) throw new InvalidTokenException(off); } /* off points to character following "<!-" */ private final int scanComment(byte[] buf, int off, int end, Token token) throws InvalidTokenException, PartialTokenException { if (off != end) { checkCharMatches(buf, off, '-'); off += minBPC; while (off != end) { switch (byteType(buf, off)) { case BT_LEAD2: if (end - off < 2) throw new PartialCharException(off); check2(buf, off); off += 2; break; case BT_LEAD3: if (end - off < 3) throw new PartialCharException(off); check3(buf, off); off += 3; break; case BT_LEAD4: if (end - off < 4) throw new PartialCharException(off); check4(buf, off); off += 4; break; case BT_NONXML: case BT_MALFORM: throw new InvalidTokenException(off); case BT_MINUS: if ((off += minBPC) == end) throw new PartialTokenException(); if (charMatches(buf, off, '-')) { if ((off += minBPC) == end) throw new PartialTokenException(); checkCharMatches(buf, off, '>'); token.tokenEnd = off + minBPC; return TOK_COMMENT; } break; default: off += minBPC; break; } } } throw new PartialTokenException(); } /* off points to character following "<!" 
*/
  /*
   * Scans the opening of a markup declaration.  Dispatches to scanComment
   * when the next character is '-', returns TOK_COND_SECT_OPEN for '[',
   * and otherwise scans a run of name-start characters that must be ended
   * by whitespace or by a '%' (which itself must not be followed by
   * whitespace or another '%'); in that case token.tokenEnd is set to the
   * index of the terminating character and TOK_DECL_OPEN is returned.
   */
  private final
  int scanDecl(byte[] buf, int off, int end, Token token)
       throws InvalidTokenException, PartialTokenException {
    if (off == end)
      throw new PartialTokenException();

    final int firstType = byteType(buf, off);
    if (firstType == BT_MINUS)
      return scanComment(buf, off + minBPC, end, token);
    if (firstType == BT_LSQB) {
      token.tokenEnd = off + minBPC;
      return TOK_COND_SECT_OPEN;
    }
    if (firstType != BT_NMSTRT)
      throw new InvalidTokenException(off);

    for (int pos = off + minBPC; pos != end; pos += minBPC) {
      final int type = byteType(buf, pos);
      if (type == BT_NMSTRT)
        continue;
      if (type == BT_PERCNT) {
        final int next = pos + minBPC;
        if (next == end)
          throw new PartialTokenException();
        /* don't allow <!ENTITY% foo "whatever"> */
        final int nextType = byteType(buf, next);
        if (nextType == BT_S || nextType == BT_CR
            || nextType == BT_LF || nextType == BT_PERCNT)
          throw new InvalidTokenException(pos);
      }
      else if (type != BT_S && type != BT_CR && type != BT_LF)
        throw new InvalidTokenException(pos);
      /* whitespace, or an acceptable '%', ends the declaration keyword */
      token.tokenEnd = pos;
      return TOK_DECL_OPEN;
    }
    throw new PartialTokenException();
  }

  /*
   * Tells whether the characters between off and end are exactly "xml".
   * Returns false for any other length or spelling.  A spelling that
   * differs from "xml" only by case is rejected with an
   * InvalidTokenException (code XML_TARGET) reported at the third
   * character.
   */
  private final
  boolean targetIsXml(byte[] buf, int off, int end) throws InvalidTokenException {
    if (end - off != minBPC * 3)
      return false;

    final String lower = "xml";
    final String upper = "XML";
    boolean sawUpper = false;
    int pos = off;
    for (int i = 0; i < 3; i++) {
      pos = off + i * minBPC;
      final int c = byteToAscii(buf, pos);
      if (c == upper.charAt(i))
        sawUpper = true;
      else if (c != lower.charAt(i))
        return false;
    }
    if (sawUpper)
      throw new InvalidTokenException(pos, InvalidTokenException.XML_TARGET);
    return true;
  }

  /* off points to character following "<?"
*/
  /*
   * Scans a processing instruction or XML/text declaration.
   * Reads the target name, then either an immediate "?>" or whitespace
   * followed by arbitrary content up to the first "?>".
   * Sets token.nameEnd to the index just past the target and
   * token.tokenEnd to the index just past the closing '>'.
   * Returns TOK_XML_DECL when targetIsXml accepts the target, otherwise
   * TOK_PI.  Throws PartialTokenException when end is reached first and
   * InvalidTokenException for an illegal character.
   * NOTE(review): PartialCharException is thrown although it is not listed
   * in the throws clause -- presumably a subclass of PartialTokenException;
   * confirm against its declaration.
   */
  private final
  int scanPi(byte[] buf, int off, int end, Token token)
       throws PartialTokenException, InvalidTokenException {
    // Remember where the target starts so it can be compared with "xml".
    int target = off;
    if (off == end)
      throw new PartialTokenException();
    // First character of the target: must be a name-start character.
    switch (byteType(buf, off)) {
    case BT_NMSTRT:
      off += minBPC;
      break;
    case BT_LEAD2:
      if (end - off < 2)
        throw new PartialCharException(off);
      if (byteType2(buf, off) != BT_NMSTRT)
        throw new InvalidTokenException(off);
      off += 2;
      break;
    case BT_LEAD3:
      if (end - off < 3)
        throw new PartialCharException(off);
      if (byteType3(buf, off) != BT_NMSTRT)
        throw new InvalidTokenException(off);
      off += 3;
      break;
    case BT_LEAD4:
      if (end - off < 4)
        throw new PartialCharException(off);
      if (byteType4(buf, off) != BT_NMSTRT)
        throw new InvalidTokenException(off);
      off += 4;
      break;
    default:
      throw new InvalidTokenException(off);
    }
    // Rest of the target, up to whitespace or '?'.
    while (off != end) {
      switch (byteType(buf, off)) {
      case BT_NMSTRT:
      case BT_NAME:
      case BT_MINUS:
        off += minBPC;
        break;
      case BT_LEAD2:
        if (end - off < 2)
          throw new PartialCharException(off);
        if (!isNameChar2(buf, off))
          throw new InvalidTokenException(off);
        off += 2;
        break;
      case BT_LEAD3:
        if (end - off < 3)
          throw new PartialCharException(off);
        if (!isNameChar3(buf, off))
          throw new InvalidTokenException(off);
        off += 3;
        break;
      case BT_LEAD4:
        if (end - off < 4)
          throw new PartialCharException(off);
        if (!isNameChar4(buf, off))
          throw new InvalidTokenException(off);
        off += 4;
        break;
      case BT_S:
      case BT_CR:
      case BT_LF:
        // Target is complete; classify it before scanning the content
        // (targetIsXml throws for a wrongly-cased "xml").
        boolean isXml = targetIsXml(buf, target, off);
        token.nameEnd = off;
        off += minBPC;
        // Content: anything valid up to the first "?>".
        while (off != end) {
          switch (byteType(buf, off)) {
          case BT_LEAD2:
            if (end - off < 2)
              throw new PartialCharException(off);
            check2(buf, off);
            off += 2;
            break;
          case BT_LEAD3:
            if (end - off < 3)
              throw new PartialCharException(off);
            check3(buf, off);
            off += 3;
            break;
          case BT_LEAD4:
            if (end - off < 4)
              throw new PartialCharException(off);
            check4(buf, off);
            off += 4;
            break;
          case BT_NONXML:
          case BT_MALFORM:
            throw new InvalidTokenException(off);
          case BT_QUEST:
            off += minBPC;
            if (off == end)
              throw new PartialTokenException();
            if (charMatches(buf, off, '>')) {
              token.tokenEnd = off + minBPC;
              if (isXml)
                return TOK_XML_DECL;
              else
                return TOK_PI;
            }
            // '?' not followed by '>': the following character is left
            // for the next iteration (so "??>" still terminates).
            break;
          default:
            off += minBPC;
            break;
          }
        }
        throw new PartialTokenException();
      case BT_QUEST:
        // Target immediately followed by '?': only "?>" is acceptable.
        token.nameEnd = off;
        off += minBPC;
        if (off == end)
          throw new PartialTokenException();
        checkCharMatches(buf, off, '>');
        token.tokenEnd = off + minBPC;
        return (targetIsXml(buf, target, token.nameEnd)
                ? TOK_XML_DECL
                : TOK_PI);
      default:
        throw new InvalidTokenException(off);
      }
    }
    throw new PartialTokenException();
  }

  /* off points to character following "<![" */

  // The keyword (including the opening bracket) that scanCdataSection
  // expects right after "<![".
  private static final String CDATA = "CDATA[";

  /*
   * Checks that the next six characters spell "CDATA[" and, if so, stores
   * the index just past them in token.tokenEnd and returns
   * TOK_CDATA_SECT_OPEN.  Throws PartialTokenException when fewer than six
   * characters are available and InvalidTokenException at the first
   * character that does not match.
   */
  private final
  int scanCdataSection(byte[] buf, int off, int end, Token token)
       throws PartialTokenException, InvalidTokenException {
    /* "CDATA[".length() == 6 */
    if (end - off < 6 * minBPC)
      throw new PartialTokenException();
    for (int i = 0; i < CDATA.length(); i++, off += minBPC)
      checkCharMatches(buf, off, CDATA.charAt(i));
    token.tokenEnd = off;
    return TOK_CDATA_SECT_OPEN;
  }

  /**
   * Scans the first token of a byte subarrary that starts with the
   * content of a CDATA section.
   * Returns one of the following integers according to the type of token
   * that the subarray starts with:
   * <ul>
   * <li><code>TOK_DATA_CHARS</code>
   * <li><code>TOK_DATA_NEWLINE</code>
   * <li><code>TOK_CDATA_SECT_CLOSE</code>
   * </ul>
   *
   * Information about the token is stored in <code>token</code>.
   *
   * After <code>TOK_CDATA_SECT_CLOSE</code> is returned, the application
   * should use <code>tokenizeContent</code>.
* * @exception EmptyTokenException if the subarray is empty * @exception PartialTokenException if the subarray contains only part of * a legal token * @exception InvalidTokenException if the subarrary does not start * with a legal token or part of one * @exception ExtensibleTokenException if the subarray encodes just a carriage * return ('\r') * * @see #TOK_DATA_CHARS * @see #TOK_DATA_NEWLINE * @see #TOK_CDATA_SECT_CLOSE * @see Token * @see EmptyTokenException * @see PartialTokenException * @see InvalidTokenException * @see ExtensibleTokenException * @see #tokenizeContent */ public final int tokenizeCdataSection(byte[] buf, int off, int end, Token token) throws EmptyTokenException, PartialTokenException, InvalidTokenException, ExtensibleTokenException { if (minBPC > 1) end = adjustEnd(off, end); if (off == end) throw new EmptyTokenException(); switch (byteType(buf, off)) { case BT_RSQB: off += minBPC; if (off == end) throw new PartialTokenException(); if (!charMatches(buf, off, ']')) break; off += minBPC; if (off == end) throw new PartialTokenException(); if (!charMatches(buf, off, '>')) { off -= minBPC; break; } token.tokenEnd = off + minBPC; return TOK_CDATA_SECT_CLOSE; case BT_CR: off += minBPC; if (off == end) throw new ExtensibleTokenException(TOK_DATA_NEWLINE); if (byteType(buf, off) == BT_LF) off += minBPC; token.tokenEnd = off; return TOK_DATA_NEWLINE; case BT_LF: token.tokenEnd = off + minBPC; return TOK_DATA_NEWLINE; case BT_NONXML: case BT_MALFORM: throw new InvalidTokenException(off); case BT_LEAD2: if (end - off < 2) throw new PartialCharException(off); check2(buf, off); off += 2; break; case BT_LEAD3: if (end - off < 3) throw new PartialCharException(off); check3(buf, off); off += 3; break; case BT_LEAD4: if (end - off < 4) throw new PartialCharException(off); check4(buf, off); off += 4; break; default: off += minBPC; break; } token.tokenEnd = extendCdata(buf, off, end); return TOK_DATA_CHARS; } int extendCdata(final byte[] buf, int off, final int 
end) throws InvalidTokenException { while (off != end) { switch (byteType(buf, off)) { case BT_LEAD2: if (end - off < 2) return off; check2(buf, off); off += 2; break; case BT_LEAD3: if (end - off < 3) return off; check3(buf, off); off += 3; break; case BT_LEAD4: if (end - off < 4) return off; check4(buf, off); off += 4; break; case BT_RSQB: case BT_NONXML: case BT_MALFORM: case BT_CR: case BT_LF: return off; default: off += minBPC; break; } } return off; } /* off points to character following "</" */ private final int scanEndTag(byte[] buf, int off, int end, Token token) throws PartialTokenException, InvalidTokenException { if (off == end) throw new PartialTokenException(); switch (byteType(buf, off)) { case BT_NMSTRT: off += minBPC; break; case BT_LEAD2: if (end - off < 2) throw new PartialCharException(off); if (byteType2(buf, off) != BT_NMSTRT) throw new InvalidTokenException(off); off += 2; break; case BT_LEAD3: if (end - off < 3) throw new PartialCharException(off); if (byteType3(buf, off) != BT_NMSTRT) throw new InvalidTokenException(off); off += 3; break; case BT_LEAD4: if (end - off < 4) throw new PartialCharException(off); if (byteType4(buf, off) != BT_NMSTRT) throw new InvalidTokenException(off); off += 4; break; default: throw new InvalidTokenException(off); } while (off != end) { switch (byteType(buf, off)) { case BT_NMSTRT: case BT_NAME: case BT_MINUS: off += minBPC; break; case BT_LEAD2: if (end - off < 2) throw new PartialCharException(off); if (!isNameChar2(buf, off)) throw new InvalidTokenException(off); off += 2; break; case BT_LEAD3: if (end - off < 3) throw new PartialCharException(off); if (!isNameChar3(buf, off)) throw new InvalidTokenException(off); off += 3; break; case BT_LEAD4: if (end - off < 4) throw new PartialCharException(off); if (!isNameChar4(buf, off)) throw new InvalidTokenException(off); off += 4; break; case BT_S: case BT_CR: case BT_LF: token.nameEnd = off; for (off += minBPC; off != end; off += minBPC) { switch (byteType(buf, 
off)) { case BT_S: case BT_CR: case BT_LF: break; case BT_GT: token.tokenEnd = off + minBPC; return TOK_END_TAG; default: throw new InvalidTokenException(off); } } throw new PartialTokenException(); case BT_GT: token.nameEnd = off; token.tokenEnd = off + minBPC; return TOK_END_TAG; default: throw new InvalidTokenException(off); } } throw new PartialTokenException(); } /* off points to character following "&#X" */ private final int scanHexCharRef(byte[] buf, int off, int end, Token token) throws PartialTokenException, InvalidTokenException { if (off != end) { int c = byteToAscii(buf, off); int num; switch (c) { case '0': case '1': case '2': case '3': case '4': case '5': case '6': case '7': case '8': case '9': num = c - '0'; break; case 'A': case 'B': case 'C': case 'D': case 'E': case 'F': num = c - ('A' - 10); break; case 'a': case 'b': case 'c': case 'd': case 'e': case 'f': num = c - ('a' - 10); break; default: throw new InvalidTokenException(off); } for (off += minBPC; off != end; off += minBPC) { c = byteToAscii(buf, off); switch (c) { case '0': case '1': case '2': case '3': case '4': case '5': case '6': case '7': case '8': case '9': num = (num << 4) + c - '0'; break; case 'A': case 'B': case 'C': case 'D': case 'E': case 'F': num = (num << 4) + c - ('A' - 10); break; case 'a': case 'b': case 'c': case 'd': case 'e': case 'f': num = (num << 4) + c - ('a' - 10); break; case ';': token.tokenEnd = off + minBPC; return setRefChar(num, token); default: throw new InvalidTokenException(off); } if (num >= 0x110000) throw new InvalidTokenException(off); } } throw new PartialTokenException(); } /* off points to character following "&#" */ private final int scanCharRef(byte[] buf, int off, int end, Token token) throws PartialTokenException, InvalidTokenException { if (off != end) { int c = byteToAscii(buf, off); switch (c) { case 'x': return scanHexCharRef(buf, off + minBPC, end, token); case '0': case '1': case '2': case '3': case '4': case '5': case '6': case '7': case 
'8': case '9': break; default: throw new InvalidTokenException(off); } int num = c - '0'; for (off += minBPC; off != end; off += minBPC) { c = byteToAscii(buf, off); switch (c) { case '0': case '1': case '2': case '3': case '4': case '5': case '6': case '7': case '8': case '9': num = num * 10 + (c - '0'); if (num < 0x110000) break; /* fall through */ default: throw new InvalidTokenException(off); case ';': token.tokenEnd = off + minBPC; return setRefChar(num, token); } } } throw new PartialTokenException(); } /* num is known to be < 0x110000; return the token code */ private final int setRefChar(int num, Token token) throws InvalidTokenException { if (num < 0x10000) { switch (charTypeTable[num >> 8][num & 0xFF]) { case BT_NONXML: case BT_LEAD4: case BT_MALFORM: throw new InvalidTokenException(token.tokenEnd - minBPC); } token.refChar1 = (char)num; return TOK_CHAR_REF; } else { num -= 0x10000; token.refChar1 = (char)((num >> 10) + 0xD800); token.refChar2 = (char)((num & ((1 << 10) - 1)) + 0xDC00); return TOK_CHAR_PAIR_REF; } } private final boolean isMagicEntityRef(byte[] buf, int off, int end, Token token) { switch (byteToAscii(buf, off)) { case 'a': if (end - off < minBPC*4) break; switch (byteToAscii(buf, off + minBPC)) { case 'm': if (charMatches(buf, off + minBPC*2, 'p') && charMatches(buf, off + minBPC*3, ';')) { token.tokenEnd = off + minBPC*4; token.refChar1 = '&'; return true; } break; case 'p': if (end - off >= minBPC*5 && charMatches(buf, off + minBPC*2, 'o') && charMatches(buf, off + minBPC*3, 's') && charMatches(buf, off + minBPC*4, ';')) { token.tokenEnd = off + minBPC*5; token.refChar1 = '\''; return true; } break; } break; case 'l': if (end - off >= minBPC*3 && charMatches(buf, off + minBPC, 't') && charMatches(buf, off + minBPC*2, ';')) { token.tokenEnd = off + minBPC*3; token.refChar1 = '<'; return true; } break; case 'g': if (end - off >= minBPC*3 && charMatches(buf, off + minBPC, 't') && charMatches(buf, off + minBPC*2, ';')) { token.tokenEnd = 
off + minBPC*3; token.refChar1 = '>'; return true; } break; case 'q': if (end - off >= minBPC*5 && charMatches(buf, off + minBPC, 'u') && charMatches(buf, off + minBPC*2, 'o') && charMatches(buf, off + minBPC*3, 't') && charMatches(buf, off + minBPC*4, ';')) { token.tokenEnd = off + minBPC*5; token.refChar1 = '"'; return true; } break; } return false; } /* off points to character following "&" */ private final int scanRef(byte[] buf, int off, int end, Token token) throws PartialTokenException, InvalidTokenException { if (off == end) throw new PartialTokenException(); if (isMagicEntityRef(buf, off, end, token)) return TOK_MAGIC_ENTITY_REF; switch (byteType(buf, off)) { case BT_NMSTRT: off += minBPC; break; case BT_LEAD2: if (end - off < 2) throw new PartialCharException(off); if (byteType2(buf, off) != BT_NMSTRT) throw new InvalidTokenException(off); off += 2; break; case BT_LEAD3: if (end - off < 3) throw new PartialCharException(off); if (byteType3(buf, off) != BT_NMSTRT) throw new InvalidTokenException(off); off += 3; break; case BT_LEAD4: if (end - off < 4) throw new PartialCharException(off); if (byteType4(buf, off) != BT_NMSTRT) throw new InvalidTokenException(off); off += 4; break; case BT_NUM: return scanCharRef(buf, off + minBPC, end, token); default: throw new InvalidTokenException(off); } while (off != end) { switch (byteType(buf, off)) { case BT_NMSTRT: case BT_NAME: case BT_MINUS: off += minBPC; break; case BT_LEAD2: if (end - off < 2) throw new PartialCharException(off); if (!isNameChar2(buf, off)) throw new InvalidTokenException(off); off += 2; break; case BT_LEAD3: if (end - off < 3) throw new PartialCharException(off); if (!isNameChar3(buf, off)) throw new InvalidTokenException(off); off += 3; break; case BT_LEAD4: if (end - off < 4) throw new PartialCharException(off); if (!isNameChar4(buf, off)) throw new InvalidTokenException(off); off += 4; break; case BT_SEMI: token.nameEnd = off; token.tokenEnd = off + minBPC; return TOK_ENTITY_REF; default: 
throw new InvalidTokenException(off); } } throw new PartialTokenException(); } /* off points to character following first character of attribute name */ private final int scanAtts(int nameStart, byte[] buf, int off, int end, ContentToken token) throws PartialTokenException, InvalidTokenException { int nameEnd = -1; while (off != end) { switch (byteType(buf, off)) { case BT_NMSTRT: case BT_NAME: case BT_MINUS: off += minBPC; break; case BT_LEAD2: if (end - off < 2) throw new PartialCharException(off); if (!isNameChar2(buf, off)) throw new InvalidTokenException(off); off += 2; break; case BT_LEAD3: if (end - off < 3) throw new PartialCharException(off); if (!isNameChar3(buf, off)) throw new InvalidTokenException(off); off += 3; break; case BT_LEAD4: if (end - off < 4) throw new PartialCharException(off); if (!isNameChar4(buf, off)) throw new InvalidTokenException(off); off += 4; break; case BT_S: case BT_CR: case BT_LF: nameEnd = off; loop: for (;;) { off += minBPC; if (off == end) throw new PartialTokenException(); switch (byteType(buf, off)) { case BT_EQUALS: break loop; case BT_S: case BT_LF: case BT_CR: break; default: throw new InvalidTokenException(off); } } /* fall through */ case BT_EQUALS: { if (nameEnd < 0) nameEnd = off; int open; for (;;) { off += minBPC; if (off == end) throw new PartialTokenException(); open = byteType(buf, off); if (open == BT_QUOT || open == BT_APOS) break; switch (open) { case BT_S: case BT_LF: case BT_CR: break; default: throw new InvalidTokenException(off); } } off += minBPC; int valueStart = off; boolean normalized = true; /* in attribute value */ for (;;) { int t; if (off == end) throw new PartialTokenException(); t = byteType(buf, off); if (t == open) break; switch (t) { case BT_NONXML: case BT_MALFORM: throw new InvalidTokenException(off); case BT_LEAD2: if (end - off < 2) throw new PartialCharException(off); check2(buf, off); off += 2; break; case BT_LEAD3: if (end - off < 3) throw new PartialCharException(off); check3(buf, 
off); off += 3; break; case BT_LEAD4: if (end - off < 4) throw new PartialCharException(off); check4(buf, off); off += 4; break; case BT_AMP: { normalized = false; int saveNameEnd = token.nameEnd; scanRef(buf, off + minBPC, end, token); token.nameEnd = saveNameEnd; off = token.tokenEnd; break; } case BT_S: if (normalized && (off == valueStart || byteToAscii(buf, off) != ' ' || (off + minBPC != end && (byteToAscii(buf, off + minBPC) == ' ' || byteType(buf, off + minBPC) == open)))) normalized = false; off += minBPC; break; case BT_LT: throw new InvalidTokenException(off); case BT_LF: case BT_CR: normalized = false; /* fall through */ default: off += minBPC; break; } } token.appendAttribute(nameStart, nameEnd, valueStart, off, normalized); off += minBPC; if (off == end) throw new PartialTokenException(); int t = byteType(buf, off); switch (t) { case BT_S: case BT_CR: case BT_LF: off += minBPC; if (off == end) throw new PartialTokenException(); t = byteType(buf, off); break; case BT_GT: case BT_SOL: break; default: throw new InvalidTokenException(off); } /* off points to closing quote */ skipToName: for (;;) { switch (t) { case BT_NMSTRT: nameStart = off; off += minBPC; break skipToName; case BT_LEAD2: if (end - off < 2) throw new PartialCharException(off); if (byteType2(buf, off) != BT_NMSTRT) throw new InvalidTokenException(off); nameStart = off; off += 2; break skipToName; case BT_LEAD3: if (end - off < 3) throw new PartialCharException(off); if (byteType3(buf, off) != BT_NMSTRT) throw new InvalidTokenException(off); nameStart = off; off += 3; break skipToName; case BT_LEAD4: if (end - off < 4) throw new PartialCharException(off); if (byteType4(buf, off) != BT_NMSTRT) throw new InvalidTokenException(off); nameStart = off; off += 4; break skipToName; case BT_S: case BT_CR: case BT_LF: break; case BT_GT: token.checkAttributeUniqueness(buf); token.tokenEnd = off + minBPC; return TOK_START_TAG_WITH_ATTS; case BT_SOL: off += minBPC; if (off == end) throw new 
PartialTokenException(); checkCharMatches(buf, off, '>'); token.checkAttributeUniqueness(buf); token.tokenEnd = off + minBPC; return TOK_EMPTY_ELEMENT_WITH_ATTS; default: throw new InvalidTokenException(off); } off += minBPC; if (off == end) throw new PartialTokenException(); t = byteType(buf, off); } nameEnd = -1; break; } default: throw new InvalidTokenException(off); } } throw new PartialTokenException(); } /* off points to character following "<" */ private final int scanLt(byte[] buf, int off, int end, ContentToken token) throws PartialTokenException, InvalidTokenException { if (off == end) throw new PartialTokenException(); switch (byteType(buf, off)) { case BT_NMSTRT: off += minBPC; break; case BT_LEAD2: if (end - off < 2) throw new PartialCharException(off); if (byteType2(buf, off) != BT_NMSTRT) throw new InvalidTokenException(off); off += 2; break; case BT_LEAD3: if (end - off < 3) throw new PartialCharException(off); if (byteType3(buf, off) != BT_NMSTRT) throw new InvalidTokenException(off); off += 3; break; case BT_LEAD4: if (end - off < 4) throw new PartialCharException(off); if (byteType4(buf, off) != BT_NMSTRT) throw new InvalidTokenException(off); off += 4; break; case BT_EXCL: if ((off += minBPC) == end) throw new PartialTokenException(); switch (byteType(buf, off)) { case BT_MINUS: return scanComment(buf, off + minBPC, end, token); case BT_LSQB: return scanCdataSection(buf, off + minBPC, end, token); } throw new InvalidTokenException(off); case BT_QUEST: return scanPi(buf, off + minBPC, end, token); case BT_SOL: return scanEndTag(buf, off + minBPC, end, token); default: throw new InvalidTokenException(off); } /* we have a start-tag */ token.nameEnd = -1; token.clearAttributes(); while (off != end) { switch (byteType(buf, off)) { case BT_NMSTRT: case BT_NAME: case BT_MINUS: off += minBPC; break; case BT_LEAD2: if (end - off < 2) throw new PartialCharException(off); if (!isNameChar2(buf, off)) throw new InvalidTokenException(off); off += 2; break; 
case BT_LEAD3: if (end - off < 3) throw new PartialCharException(off); if (!isNameChar3(buf, off)) throw new InvalidTokenException(off); off += 3; break; case BT_LEAD4: if (end - off < 4) throw new PartialCharException(off); if (!isNameChar4(buf, off)) throw new InvalidTokenException(off); off += 4; break; case BT_S: case BT_CR: case BT_LF: token.nameEnd = off; off += minBPC; loop: for (;;) { if (off == end) throw new PartialTokenException(); switch (byteType(buf, off)) { case BT_NMSTRT: return scanAtts(off, buf, off + minBPC, end, token); case BT_LEAD2: if (end - off < 2) throw new PartialCharException(off); if (byteType2(buf, off) != BT_NMSTRT) throw new InvalidTokenException(off); return scanAtts(off, buf, off + 2, end, token); case BT_LEAD3: if (end - off < 3) throw new PartialCharException(off); if (byteType3(buf, off) != BT_NMSTRT) throw new InvalidTokenException(off); return scanAtts(off, buf, off + 3, end, token); case BT_LEAD4: if (end - off < 4) throw new PartialCharException(off); if (byteType4(buf, off) != BT_NMSTRT) throw new InvalidTokenException(off); return scanAtts(off, buf, off + 4, end, token); case BT_GT: case BT_SOL: break loop; case BT_S: case BT_CR: case BT_LF: off += minBPC; break; default: throw new InvalidTokenException(off); } } break; case BT_GT: if (token.nameEnd < 0) token.nameEnd = off; token.tokenEnd = off + minBPC; return TOK_START_TAG_NO_ATTS; case BT_SOL: if (token.nameEnd < 0) token.nameEnd = off; off += minBPC; if (off == end) throw new PartialTokenException(); checkCharMatches(buf, off, '>'); token.tokenEnd = off + minBPC; return TOK_EMPTY_ELEMENT_NO_ATTS; default: throw new InvalidTokenException(off); } } throw new PartialTokenException(); } // Ensure that we always scan a multiple of minBPC bytes. 
private final int adjustEnd(int off, int end) throws PartialCharException {
  /* The mask arithmetic relies on minBPC being a power of two. */
  final int mask = minBPC - 1;
  final int len = end - off;
  if ((len & mask) == 0)
    return end;
  /* Drop the trailing bytes that do not make up a whole code unit. */
  final int whole = len & ~mask;
  if (whole == 0)
    throw new PartialCharException(off);
  return off + whole;
}

/**
 * Scans the first token of a byte subarray that contains content.
 * Returns one of the following integers according to the type of token
 * that the subarray starts with:
 * <ul>
 * <li><code>TOK_START_TAG_NO_ATTS</code>
 * <li><code>TOK_START_TAG_WITH_ATTS</code>
 * <li><code>TOK_EMPTY_ELEMENT_NO_ATTS</code>
 * <li><code>TOK_EMPTY_ELEMENT_WITH_ATTS</code>
 * <li><code>TOK_END_TAG</code>
 * <li><code>TOK_DATA_CHARS</code>
 * <li><code>TOK_DATA_NEWLINE</code>
 * <li><code>TOK_CDATA_SECT_OPEN</code>
 * <li><code>TOK_ENTITY_REF</code>
 * <li><code>TOK_MAGIC_ENTITY_REF</code>
 * <li><code>TOK_CHAR_REF</code>
 * <li><code>TOK_CHAR_PAIR_REF</code>
 * <li><code>TOK_PI</code>
 * <li><code>TOK_XML_DECL</code>
 * <li><code>TOK_COMMENT</code>
 * </ul>
 *
 * Information about the token is stored in <code>token</code>.
 *
 * When <code>TOK_CDATA_SECT_OPEN</code> is returned,
 * <code>tokenizeCdataSection</code> should be called until
 * it returns <code>TOK_CDATA_SECT</code>.
 *
 * @exception EmptyTokenException if the subarray is empty
 * @exception PartialTokenException if the subarray contains only part of
 * a legal token
 * @exception InvalidTokenException if the subarray does not start
 * with a legal token or part of one
 * @exception ExtensibleTokenException if the subarray encodes just a carriage
 * return ('\r'), or just "]" or "]]"
 *
 * @see #TOK_START_TAG_NO_ATTS
 * @see #TOK_START_TAG_WITH_ATTS
 * @see #TOK_EMPTY_ELEMENT_NO_ATTS
 * @see #TOK_EMPTY_ELEMENT_WITH_ATTS
 * @see #TOK_END_TAG
 * @see #TOK_DATA_CHARS
 * @see #TOK_DATA_NEWLINE
 * @see #TOK_CDATA_SECT_OPEN
 * @see #TOK_ENTITY_REF
 * @see #TOK_MAGIC_ENTITY_REF
 * @see #TOK_CHAR_REF
 * @see #TOK_CHAR_PAIR_REF
 * @see #TOK_PI
 * @see #TOK_XML_DECL
 * @see #TOK_COMMENT
 * @see ContentToken
 * @see EmptyTokenException
 * @see PartialTokenException
 * @see InvalidTokenException
 * @see ExtensibleTokenException
 * @see #tokenizeCdataSection
 */
public final int tokenizeContent(byte[] buf, int off, int end,
                                 ContentToken token)
    throws PartialTokenException, InvalidTokenException,
           EmptyTokenException, ExtensibleTokenException {
  if (minBPC > 1)
    end = adjustEnd(off, end);
  if (off == end)
    throw new EmptyTokenException();
  switch (byteType(buf, off)) {
  case BT_LT:
    return scanLt(buf, off + minBPC, end, token);
  case BT_AMP:
    return scanRef(buf, off + minBPC, end, token);
  case BT_CR:
    off += minBPC;
    /* A line feed might follow in the next chunk. */
    if (off == end)
      throw new ExtensibleTokenException(TOK_DATA_NEWLINE);
    if (byteType(buf, off) == BT_LF)
      off += minBPC;
    token.tokenEnd = off;
    return TOK_DATA_NEWLINE;
  case BT_LF:
    token.tokenEnd = off + minBPC;
    return TOK_DATA_NEWLINE;
  case BT_RSQB:
    /* "]]>" is rejected; "]" or "]]" followed by anything else is data. */
    off += minBPC;
    if (off == end)
      throw new ExtensibleTokenException(TOK_DATA_CHARS);
    if (!charMatches(buf, off, ']'))
      break;
    off += minBPC;
    if (off == end)
      throw new ExtensibleTokenException(TOK_DATA_CHARS);
    if (!charMatches(buf, off, '>')) {
      /* Back up so that the second ']' is examined again by extendData. */
      off -= minBPC;
      break;
    }
    throw new InvalidTokenException(off);
  case BT_NONXML:
  case BT_MALFORM:
    throw new InvalidTokenException(off);
  case BT_LEAD2:
    if (end - off < 2)
      throw new PartialCharException(off);
    check2(buf, off);
    off += 2;
    break;
  case BT_LEAD3:
    if (end - off < 3)
      throw new PartialCharException(off);
    check3(buf, off);
    off += 3;
    break;
  case BT_LEAD4:
    if (end - off < 4)
      throw new PartialCharException(off);
    check4(buf, off);
    off += 4;
    break;
  default:
    off += minBPC;
    break;
  }
  token.tokenEnd = extendData(buf, off, end);
  return TOK_DATA_CHARS;
}

/**
 * Extends a run of data characters.  Starting at <code>off</code>, skips
 * characters until one classified as BT_RSQB, BT_AMP, BT_LT, BT_NONXML,
 * BT_MALFORM, BT_CR or BT_LF is found, an incomplete multi-byte character
 * is found, or <code>end</code> is reached.
 *
 * @return the offset at which scanning stopped
 */
int extendData(final byte[] buf, int off, final int end)
    throws InvalidTokenException {
  while (off != end) {
    switch (byteType(buf, off)) {
    case BT_LEAD2:
      if (end - off < 2)
        return off;
      check2(buf, off);
      off += 2;
      break;
    case BT_LEAD3:
      if (end - off < 3)
        return off;
      check3(buf, off);
      off += 3;
      break;
    case BT_LEAD4:
      if (end - off < 4)
        return off;
      check4(buf, off);
      off += 4;
      break;
    case BT_RSQB:
    case BT_AMP:
    case BT_LT:
    case BT_NONXML:
    case BT_MALFORM:
    case BT_CR:
    case BT_LF:
      return off;
    default:
      off += minBPC;
      break;
    }
  }
  return off;
}

/**
 * Scans a token that starts with "%".  On entry <code>off</code> points to
 * the character following the "%".
 *
 * @return <code>TOK_PERCENT</code> if the "%" is followed by white space
 * or another "%" (<code>token.tokenEnd</code> is then <code>off</code>),
 * or <code>TOK_PARAM_ENTITY_REF</code> for a name terminated by ';'
 * @exception PartialTokenException if the subarray ends before the token
 * is complete
 * @exception InvalidTokenException if the token is malformed
 */
private final int scanPercent(byte[] buf, int off, int end, Token token)
    throws PartialTokenException, InvalidTokenException {
  if (off == end)
    throw new PartialTokenException();
  switch (byteType(buf, off)) {
  case BT_NMSTRT:
    off += minBPC;
    break;
  case BT_LEAD2:
    if (end - off < 2)
      throw new PartialCharException(off);
    if (byteType2(buf, off) != BT_NMSTRT)
      throw new InvalidTokenException(off);
    off += 2;
    break;
  case BT_LEAD3:
    if (end - off < 3)
      throw new PartialCharException(off);
    if (byteType3(buf, off) != BT_NMSTRT)
      throw new InvalidTokenException(off);
    off += 3;
    break;
  case BT_LEAD4:
    if (end - off < 4)
      throw new PartialCharException(off);
    if (byteType4(buf, off) != BT_NMSTRT)
      throw new InvalidTokenException(off);
    off += 4;
    break;
  case BT_S:
  case BT_LF:
  case BT_CR:
  case BT_PERCNT:
    token.tokenEnd = off;
    return TOK_PERCENT;
  default:
    throw new InvalidTokenException(off);
  }
  while (off != end) {
    switch (byteType(buf, off)) {
    case BT_NMSTRT:
    case BT_NAME:
    case BT_MINUS:
      off += minBPC;
      break;
    case BT_LEAD2:
      if (end - off < 2)
        throw new PartialCharException(off);
      if (!isNameChar2(buf, off))
        throw new InvalidTokenException(off);
      off += 2;
      break;
    case BT_LEAD3:
      if (end - off < 3)
        throw new PartialCharException(off);
      if (!isNameChar3(buf, off))
        throw new InvalidTokenException(off);
      off += 3;
      break;
    case BT_LEAD4:
      if (end - off < 4)
        throw new PartialCharException(off);
      if (!isNameChar4(buf, off))
        throw new InvalidTokenException(off);
      off += 4;
      break;
    case BT_SEMI:
      token.nameEnd = off;
      token.tokenEnd = off + minBPC;
      return TOK_PARAM_ENTITY_REF;
    default:
      throw new InvalidTokenException(off);
    }
  }
  throw new PartialTokenException();
}

/**
 * Scans a name that follows "#".  On entry <code>off</code> points to the
 * character following the "#".  The name ends, without consuming the
 * delimiter, at white space, ')', '>', '%' or '|'.
 *
 * @return <code>TOK_POUND_NAME</code>
 * @exception PartialTokenException if nothing follows the "#"
 * @exception InvalidTokenException if the name is malformed
 * @exception ExtensibleTokenException if the subarray ends while still
 * inside the name
 */
private final int scanPoundName(byte[] buf, int off, int end, Token token)
    throws PartialTokenException, InvalidTokenException,
           ExtensibleTokenException {
  if (off == end)
    throw new PartialTokenException();
  switch (byteType(buf, off)) {
  case BT_NMSTRT:
    off += minBPC;
    break;
  case BT_LEAD2:
    if (end - off < 2)
      throw new PartialCharException(off);
    if (byteType2(buf, off) != BT_NMSTRT)
      throw new InvalidTokenException(off);
    off += 2;
    break;
  case BT_LEAD3:
    if (end - off < 3)
      throw new PartialCharException(off);
    if (byteType3(buf, off) != BT_NMSTRT)
      throw new InvalidTokenException(off);
    off += 3;
    break;
  case BT_LEAD4:
    if (end - off < 4)
      throw new PartialCharException(off);
    if (byteType4(buf, off) != BT_NMSTRT)
      throw new InvalidTokenException(off);
    off += 4;
    break;
  default:
    throw new InvalidTokenException(off);
  }
  while (off != end) {
    switch (byteType(buf, off)) {
    case BT_NMSTRT:
    case BT_NAME:
    case BT_MINUS:
      off += minBPC;
      break;
    case BT_LEAD2:
      if (end - off < 2)
        throw new PartialCharException(off);
      if (!isNameChar2(buf, off))
        throw new InvalidTokenException(off);
      off += 2;
      break;
    case BT_LEAD3:
      if (end - off < 3)
        throw new PartialCharException(off);
      if (!isNameChar3(buf, off))
        throw new InvalidTokenException(off);
      off += 3;
      break;
    case BT_LEAD4:
      if (end - off < 4)
        throw new PartialCharException(off);
      if (!isNameChar4(buf, off))
        throw new InvalidTokenException(off);
      off += 4;
      break;
    case BT_CR:
    case BT_LF:
    case BT_S:
    case BT_RPAR:
    case BT_GT:
    case BT_PERCNT:
    case BT_VERBAR:
      token.tokenEnd = off;
      return TOK_POUND_NAME;
    default:
      throw new InvalidTokenException(off);
    }
  }
  throw new ExtensibleTokenException(TOK_POUND_NAME);
}

/**
 * Scans a quoted literal.  On entry <code>off</code> points to the
 * character following the opening quote, and <code>open</code> is the byte
 * type of that quote (BT_QUOT or BT_APOS).  The closing quote must be
 * followed by white space, '>', '%' or '['.
 *
 * @return <code>TOK_LITERAL</code>; <code>token.tokenEnd</code> is the
 * offset following the closing quote
 * @exception PartialTokenException if the subarray ends before the closing
 * quote
 * @exception InvalidTokenException if the literal contains an illegal
 * character or is followed by an unexpected character
 * @exception ExtensibleTokenException if the subarray ends immediately
 * after the closing quote
 */
private final int scanLit(int open, byte[] buf, int off, int end, Token token)
    throws PartialTokenException, InvalidTokenException,
           ExtensibleTokenException {
  while (off != end) {
    int t = byteType(buf, off);
    switch (t) {
    case BT_LEAD2:
      if (end - off < 2)
        throw new PartialTokenException();
      check2(buf, off);
      off += 2;
      break;
    case BT_LEAD3:
      if (end - off < 3)
        throw new PartialTokenException();
      check3(buf, off);
      off += 3;
      break;
    case BT_LEAD4:
      if (end - off < 4)
        throw new PartialTokenException();
      check4(buf, off);
      off += 4;
      break;
    case BT_NONXML:
    case BT_MALFORM:
      throw new InvalidTokenException(off);
    case BT_QUOT:
    case BT_APOS:
      off += minBPC;
      /* The other kind of quote is ordinary data inside the literal. */
      if (t != open)
        break;
      if (off == end)
        throw new ExtensibleTokenException(TOK_LITERAL);
      switch (byteType(buf, off)) {
      case BT_S:
      case BT_CR:
      case BT_LF:
      case BT_GT:
      case BT_PERCNT:
      case BT_LSQB:
        token.tokenEnd = off;
        return TOK_LITERAL;
      default:
        throw new InvalidTokenException(off);
      }
    default:
      off += minBPC;
      break;
    }
  }
  throw new PartialTokenException();
}

/**
 * Returns an encoding object to be used to start parsing an external entity.
 * The encoding is chosen based on the initial bytes of the entity: a
 * UTF-16 encoding is returned when the first two bytes are a byte order
 * mark or a '&lt;' encoded in two bytes; otherwise a UTF-8 encoding is
 * returned.  Returns null if exactly one byte is available and that byte
 * has its high bit set.
 *
 * @param buf the byte array containing the initial bytes of the entity
 * @param off the index in <code>buf</code> of the first byte of the entity
 * @param end the index in <code>buf</code> following the last available
 * byte of the entity; <code>end - off</code> must be greater than or equal
 * to 4 unless the entity has fewer than 4 bytes, in which case it must
 * be equal to the length of the entity
 * @param token receives information about the presence of a byte order
 * mark; if the entity starts with a byte order mark
 * then <code>token.getTokenEnd()</code>
 * will return <code>off + 2</code>, otherwise it will return
 * <code>off</code>
 *
 * @see TextDecl
 * @see XmlDecl
 * @see #TOK_XML_DECL
 * @see #getEncoding
 * @see #getInternalEncoding
 */
public static final Encoding getInitialEncoding(byte[] buf, int off, int end,
                                                Token token) {
  token.tokenEnd = off;
  final int avail = end - off;
  if (avail == 1) {
    if (buf[off] < 0)
      return null;
  }
  else if (avail != 0) {
    final int lead = ((buf[off] & 0xFF) << 8) | (buf[off + 1] & 0xFF);
    switch (lead) {
    case 0xFEFF:
      token.tokenEnd = off + 2;
      /* fall through */
    case '<': /* not legal; but not a fatal error */
      return getEncoding(UTF16_BIG_ENDIAN_ENCODING);
    case 0xFFFE:
      token.tokenEnd = off + 2;
      /* fall through */
    case '<' << 8: /* not legal; but not a fatal error */
      return getEncoding(UTF16_LITTLE_ENDIAN_ENCODING);
    }
  }
  return getEncoding(UTF8_ENCODING);
}

/**
 * Returns an <code>Encoding</code> corresponding to
 * the specified IANA character set name.
 * Returns this <code>Encoding</code> if the name is null.
 * Returns null if the specified encoding is not supported.
 * Note that there are two distinct <code>Encoding</code> objects
 * associated with the name <code>UTF-16</code>, one for
 * each possible byte order; if this <code>Encoding</code>
 * is UTF-16 with little-endian byte ordering, then
 * <code>getEncoding("UTF-16")</code> will return this,
 * otherwise it will return an <code>Encoding</code> for
 * UTF-16 with big-endian byte ordering.
 * @param name a string specifying the IANA name of the encoding; this is
 * case insensitive
 */
public final Encoding getEncoding(String name) {
  if (name == null) {
    return this;
  }
  if (name.equalsIgnoreCase("UTF-8")) {
    return getEncoding(UTF8_ENCODING);
  }
  if (name.equalsIgnoreCase("UTF-16")) {
    /* The byte order is resolved by getUTF16Encoding. */
    return getUTF16Encoding();
  }
  if (name.equalsIgnoreCase("ISO-8859-1")) {
    return getEncoding(ISO8859_1_ENCODING);
  }
  if (name.equalsIgnoreCase("US-ASCII")) {
    return getEncoding(ASCII_ENCODING);
  }
  if (name.equalsIgnoreCase("windows-1250")) {
    return getEncoding(WINDOWS1250_ENCODING);
  }
  if (name.equalsIgnoreCase("ISO-8859-2")) {
    return getEncoding(ISO8859_2_ENCODING);
  }
  return null;
}

/**
 * Returns an <code>Encoding</code> for entities encoded with
 * a single-byte encoding (an encoding in which each byte represents
 * exactly one character).  A new object is created on every call.
 * @param map a string specifying the character represented by each byte;
 * the string must have a length of 256; <code>map.charAt(b)</code>
 * specifies the character encoded by byte <code>b</code>; bytes that do
 * not represent any character should be mapped to <code>\uFFFD</code>
 */
public final Encoding getSingleByteEncoding(String map) {
  Encoding enc = new SingleByteEncoding(map);
  return enc;
}

/**
 * Returns an <code>Encoding</code> object for use with internal entities.
 * This is a UTF-16 big endian encoding, except that newlines
 * are assumed to have been normalized into line feed,
 * so carriage return is treated like a space.
 */
public final static Encoding getInternalEncoding() {
  Encoding internal = getEncoding(INTERNAL_ENCODING);
  return internal;
}

/**
 * Scans the first token of a byte subarray that contains part of a
 * prolog.
 * Returns one of the following integers according to the type of token * that the subarray starts with: * <ul> * <li><code>TOK_PI</code> * <li><code>TOK_XML_DECL</code> * <li><code>TOK_COMMENT</code> * <li><code>TOK_PARAM_ENTITY_REF</code> * <li><code>TOK_PROLOG_S</code> * <li><code>TOK_DECL_OPEN</code> * <li><code>TOK_DECL_CLOSE</code> * <li><code>TOK_NAME</code> * <li><code>TOK_NMTOKEN</code> * <li><code>TOK_POUND_NAME</code> * <li><code>TOK_OR</code> * <li><code>TOK_PERCENT</code> * <li><code>TOK_OPEN_PAREN</code> * <li><code>TOK_CLOSE_PAREN</code> * <li><code>TOK_OPEN_BRACKET</code> * <li><code>TOK_CLOSE_BRACKET</code> * <li><code>TOK_LITERAL</code> * <li><code>TOK_NAME_QUESTION</code> * <li><code>TOK_NAME_ASTERISK</code> * <li><code>TOK_NAME_PLUS</code> * <li><code>TOK_COND_SECT_OPEN</code> * <li><code>TOK_COND_SECT_CLOSE</code> * <li><code>TOK_CLOSE_PAREN_QUESTION</code> * <li><code>TOK_CLOSE_PAREN_ASTERISK</code> * <li><code>TOK_CLOSE_PAREN_PLUS</code> * <li><code>TOK_COMMA</code> * </ul> * @exception EmptyTokenException if the subarray is empty * @exception PartialTokenException if the subarray contains only part of * a legal token * @exception InvalidTokenException if the subarray does not start * with a legal token or part of one * @exception EndOfPrologException if the subarray starts with the document * element; <code>tokenizeContent</code> should be used on the remainder * of the entity * @exception ExtensibleTokenException if the subarray is a legal token * but subsequent bytes in the same entity could be part of the token * @see #TOK_PI * @see #TOK_XML_DECL * @see #TOK_COMMENT * @see #TOK_PARAM_ENTITY_REF * @see #TOK_PROLOG_S * @see #TOK_DECL_OPEN * @see #TOK_DECL_CLOSE * @see #TOK_NAME * @see #TOK_NMTOKEN * @see #TOK_POUND_NAME * @see #TOK_OR * @see #TOK_PERCENT * @see #TOK_OPEN_PAREN * @see #TOK_CLOSE_PAREN * @see #TOK_OPEN_BRACKET * @see #TOK_CLOSE_BRACKET * @see #TOK_LITERAL * @see #TOK_NAME_QUESTION * @see #TOK_NAME_ASTERISK * @see 
#TOK_NAME_PLUS * @see #TOK_COND_SECT_OPEN * @see #TOK_COND_SECT_CLOSE * @see #TOK_CLOSE_PAREN_QUESTION * @see #TOK_CLOSE_PAREN_ASTERISK * @see #TOK_CLOSE_PAREN_PLUS * @see #TOK_COMMA * @see ContentToken * @see EmptyTokenException * @see PartialTokenException * @see InvalidTokenException * @see ExtensibleTokenException * @see EndOfPrologException */ public final int tokenizeProlog(byte[] buf, int off, int end, Token token) throws PartialTokenException, InvalidTokenException, EmptyTokenException, ExtensibleTokenException, EndOfPrologException { int tok; if (minBPC > 1) end = adjustEnd(off, end); if (off == end) throw new EmptyTokenException(); switch (byteType(buf, off)) { case BT_QUOT: return scanLit(BT_QUOT, buf, off + minBPC, end, token); case BT_APOS: return scanLit(BT_APOS, buf, off + minBPC, end, token); case BT_LT: { off += minBPC; if (off == end) throw new PartialTokenException(); switch (byteType(buf, off)) { case BT_EXCL: return scanDecl(buf, off + minBPC, end, token); case BT_QUEST: return scanPi(buf, off + minBPC, end, token); case BT_NMSTRT: case BT_LEAD2: case BT_LEAD3: case BT_LEAD4: token.tokenEnd = off - minBPC; throw new EndOfPrologException(); } throw new InvalidTokenException(off); } case BT_CR: if (off + minBPC == end) throw new ExtensibleTokenException(TOK_PROLOG_S); /* fall through */ case BT_S: case BT_LF: for (;;) { off += minBPC; if (off == end) break; switch (byteType(buf, off)) { case BT_S: case BT_LF: break; case BT_CR: /* don't split CR/LF pair */ if (off + minBPC != end) break; /* fall through */ default: token.tokenEnd = off; return TOK_PROLOG_S; } } token.tokenEnd = off; return TOK_PROLOG_S; case BT_PERCNT: return scanPercent(buf, off + minBPC, end, token); case BT_COMMA: token.tokenEnd = off + minBPC; return TOK_COMMA; case BT_LSQB: token.tokenEnd = off + minBPC; return TOK_OPEN_BRACKET; case BT_RSQB: off += minBPC; if (off == end) throw new ExtensibleTokenException(TOK_CLOSE_BRACKET); if (charMatches(buf, off, ']')) { if (off + 
minBPC == end) throw new PartialTokenException(); if (charMatches(buf, off + minBPC, '>')) { token.tokenEnd = off + 2*minBPC; return TOK_COND_SECT_CLOSE; } } token.tokenEnd = off; return TOK_CLOSE_BRACKET; case BT_LPAR: token.tokenEnd = off + minBPC; return TOK_OPEN_PAREN; case BT_RPAR: off += minBPC; if (off == end) throw new ExtensibleTokenException(TOK_CLOSE_PAREN); switch (byteType(buf, off)) { case BT_AST: token.tokenEnd = off + minBPC; return TOK_CLOSE_PAREN_ASTERISK; case BT_QUEST: token.tokenEnd = off + minBPC; return TOK_CLOSE_PAREN_QUESTION; case BT_PLUS: token.tokenEnd = off + minBPC; return TOK_CLOSE_PAREN_PLUS; case BT_CR: case BT_LF: case BT_S: case BT_GT: case BT_COMMA: case BT_VERBAR: case BT_RPAR: token.tokenEnd = off; return TOK_CLOSE_PAREN; } throw new InvalidTokenException(off); case BT_VERBAR: token.tokenEnd = off + minBPC; return TOK_OR; case BT_GT: token.tokenEnd = off + minBPC; return TOK_DECL_CLOSE; case BT_NUM: return scanPoundName(buf, off + minBPC, end, token); case BT_LEAD2: if (end - off < 2) throw new PartialCharException(off); switch (byteType2(buf, off)) { case BT_NMSTRT: off += 2; tok = TOK_NAME; break; case BT_NAME: off += 2; tok = TOK_NMTOKEN; break; default: throw new InvalidTokenException(off); } break; case BT_LEAD3: if (end - off < 3) throw new PartialCharException(off); switch (byteType3(buf, off)) { case BT_NMSTRT: off += 3; tok = TOK_NAME; break; case BT_NAME: off += 3; tok = TOK_NMTOKEN; break; default: throw new InvalidTokenException(off); } break; case BT_LEAD4: if (end - off < 4) throw new PartialCharException(off); switch (byteType4(buf, off)) { case BT_NMSTRT: off += 4; tok = TOK_NAME; break; case BT_NAME: off += 4; tok = TOK_NMTOKEN; break; default: throw new InvalidTokenException(off); } break; case BT_NMSTRT: tok = TOK_NAME; off += minBPC; break; case BT_NAME: case BT_MINUS: tok = TOK_NMTOKEN; off += minBPC; break; default: throw new InvalidTokenException(off); } while (off != end) { switch (byteType(buf, off)) { 
case BT_NMSTRT: case BT_NAME: case BT_MINUS: off += minBPC; break; case BT_LEAD2: if (end - off < 2) throw new PartialCharException(off); if (!isNameChar2(buf, off)) throw new InvalidTokenException(off); off += 2; break; case BT_LEAD3: if (end - off < 3) throw new PartialCharException(off); if (!isNameChar3(buf, off)) throw new InvalidTokenException(off); off += 3; break; case BT_LEAD4: if (end - off < 4) throw new PartialCharException(off); if (!isNameChar4(buf, off)) throw new InvalidTokenException(off); off += 4; break; case BT_GT: case BT_RPAR: case BT_COMMA: case BT_VERBAR: case BT_LSQB: case BT_PERCNT: case BT_S: case BT_CR: case BT_LF: token.tokenEnd = off; return tok; case BT_PLUS: if (tok != TOK_NAME) throw new InvalidTokenException(off); token.tokenEnd = off + minBPC; return TOK_NAME_PLUS; case BT_AST: if (tok != TOK_NAME) throw new InvalidTokenException(off); token.tokenEnd = off + minBPC; return TOK_NAME_ASTERISK; case BT_QUEST: if (tok != TOK_NAME) throw new InvalidTokenException(off); token.tokenEnd = off + minBPC; return TOK_NAME_QUESTION; default: throw new InvalidTokenException(off); } } throw new ExtensibleTokenException(tok); } /** * Scans the first token of a byte subarrary that contains part of * literal attribute value. The opening and closing delimiters * are not included in the subarrary. 
* Returns one of the following integers according to the type of * token that the subarray starts with: * <ul> * <li><code>TOK_DATA_CHARS</code> * <li><code>TOK_DATA_NEWLINE</code> * <li><code>TOK_ATTRIBUTE_VALUE_S</code> * <li><code>TOK_MAGIC_ENTITY_REF</code> * <li><code>TOK_ENTITY_REF</code> * <li><code>TOK_CHAR_REF</code> * <li><code>TOK_CHAR_PAIR_REF</code> * </ul> * @exception EmptyTokenException if the subarray is empty * @exception PartialTokenException if the subarray contains only part of * a legal token * @exception InvalidTokenException if the subarrary does not start * with a legal token or part of one * @exception ExtensibleTokenException if the subarray encodes just a carriage * return ('\r') * @see #TOK_DATA_CHARS * @see #TOK_DATA_NEWLINE * @see #TOK_ATTRIBUTE_VALUE_S * @see #TOK_MAGIC_ENTITY_REF * @see #TOK_ENTITY_REF * @see #TOK_CHAR_REF * @see #TOK_CHAR_PAIR_REF * @see Token * @see EmptyTokenException * @see PartialTokenException * @see InvalidTokenException * @see ExtensibleTokenException */ public final int tokenizeAttributeValue(byte[] buf, int off, int end, Token token) throws PartialTokenException, InvalidTokenException, EmptyTokenException, ExtensibleTokenException { if (minBPC > 1) end = adjustEnd(off, end); if (off == end) throw new EmptyTokenException(); int start = off; while (off != end) { switch (byteType(buf, off)) { case BT_LEAD2: if (end - off < 2) throw new PartialCharException(off); off += 2; break; case BT_LEAD3: if (end - off < 3) throw new PartialCharException(off); off += 3; break; case BT_LEAD4: if (end - off < 4) throw new PartialCharException(off); off += 4; break; case BT_AMP: if (off == start) return scanRef(buf, off + minBPC, end, token); token.tokenEnd = off; return TOK_DATA_CHARS; case BT_LT: /* this is for inside entity references */ throw new InvalidTokenException(off); case BT_S: if (off == start) { token.tokenEnd = off + minBPC; return TOK_ATTRIBUTE_VALUE_S; } token.tokenEnd = off; return TOK_DATA_CHARS; case 
BT_LF: if (off == start) { token.tokenEnd = off + minBPC; return TOK_DATA_NEWLINE; } token.tokenEnd = off; return TOK_DATA_CHARS; case BT_CR: if (off == start) { off += minBPC; if (off == end) throw new ExtensibleTokenException(TOK_DATA_NEWLINE); if (byteType(buf, off) == BT_LF) off += minBPC; token.tokenEnd = off; return TOK_DATA_NEWLINE; } token.tokenEnd = off; return TOK_DATA_CHARS; default: off += minBPC; break; } } token.tokenEnd = off; return TOK_DATA_CHARS; } /** * Scans the first token of a byte subarrary that contains part of * literal entity value. The opening and closing delimiters * are not included in the subarrary. * Returns one of the following integers according to the type of * token that the subarray starts with: * <ul> * <li><code>TOK_DATA_CHARS</code> * <li><code>TOK_DATA_NEWLINE</code> * <li><code>TOK_PARAM_ENTITY_REF</code> * <li><code>TOK_MAGIC_ENTITY_REF</code> * <li><code>TOK_ENTITY_REF</code> * <li><code>TOK_CHAR_REF</code> * <li><code>TOK_CHAR_PAIR_REF</code> * </ul> * @exception EmptyTokenException if the subarray is empty * @exception PartialTokenException if the subarray contains only part of * a legal token * @exception InvalidTokenException if the subarrary does not start * with a legal token or part of one * @exception ExtensibleTokenException if the subarray encodes just a carriage * return ('\r') * @see #TOK_DATA_CHARS * @see #TOK_DATA_NEWLINE * @see #TOK_MAGIC_ENTITY_REF * @see #TOK_ENTITY_REF * @see #TOK_PARAM_ENTITY_REF * @see #TOK_CHAR_REF * @see #TOK_CHAR_PAIR_REF * @see Token * @see EmptyTokenException * @see PartialTokenException * @see InvalidTokenException * @see ExtensibleTokenException */ public final int tokenizeEntityValue(byte[] buf, int off, int end, Token token) throws PartialTokenException, InvalidTokenException, EmptyTokenException, ExtensibleTokenException { if (minBPC > 1) end = adjustEnd(off, end); if (off == end) throw new EmptyTokenException(); int start = off; while (off != end) { switch (byteType(buf, 
off)) { case BT_LEAD2: if (end - off < 2) throw new PartialCharException(off); off += 2; break; case BT_LEAD3: if (end - off < 3) throw new PartialCharException(off); off += 3; break; case BT_LEAD4: if (end - off < 4) throw new PartialCharException(off); off += 4; break; case BT_AMP: if (off == start) return scanRef(buf, off + minBPC, end, token); token.tokenEnd = off; return TOK_DATA_CHARS; case BT_PERCNT: if (off == start) return scanPercent(buf, off + minBPC, end, token); token.tokenEnd = off; return TOK_DATA_CHARS; case BT_LF: if (off == start) { token.tokenEnd = off + minBPC; return TOK_DATA_NEWLINE; } token.tokenEnd = off; return TOK_DATA_CHARS; case BT_CR: if (off == start) { off += minBPC; if (off == end) throw new ExtensibleTokenException(TOK_DATA_NEWLINE); if (byteType(buf, off) == BT_LF) off += minBPC; token.tokenEnd = off; return TOK_DATA_NEWLINE; } token.tokenEnd = off; return TOK_DATA_CHARS; default: off += minBPC; break; } } token.tokenEnd = off; return TOK_DATA_CHARS; } /** * Skips over an ignored conditional section. * The subarray starts following the <code><![ IGNORE [</code>. 
* * @return the index of the character following the closing * <code>]]></code> * * @exception PartialTokenException if the subarray does not contain the * complete ignored conditional section * @exception InvalidTokenException if the ignored conditional section * contains illegal characters */ public final int skipIgnoreSect(byte[] buf, int off, int end) throws PartialTokenException, InvalidTokenException { if (minBPC > 1) end = adjustEnd(off, end); int level = 0; loop: while (off != end) { switch (byteType(buf, off)) { case BT_LEAD2: if (end - off < 2) throw new PartialCharException(off); check2(buf, off); off += 2; break; case BT_LEAD3: if (end - off < 3) throw new PartialCharException(off); check3(buf, off); off += 3; break; case BT_LEAD4: if (end - off < 4) throw new PartialCharException(off); check4(buf, off); off += 4; break; case BT_NONXML: case BT_MALFORM: throw new InvalidTokenException(off); case BT_LT: off += minBPC; if (off == end) break loop; if (!charMatches(buf, off, '!')) break; off += minBPC; if (off == end) break loop; if (!charMatches(buf, off, '[')) break; level++; off += minBPC; break; case BT_RSQB: off += minBPC; if (off == end) break loop; if (!charMatches(buf, off, ']')) break; off += minBPC; if (off == end) break loop; if (charMatches(buf, off, '>')) { if (level == 0) return off + minBPC; level--; } else if (charMatches(buf, off, ']')) break; off += minBPC; break; default: off += minBPC; break; } } throw new PartialTokenException(); } /** * Checks that a literal contained in the specified byte subarray * is a legal public identifier and returns a string with * the normalized content of the public id. * The subarray includes the opening and closing quotes. 
* @exception InvalidTokenException if it is not a legal public identifier */ public final String getPublicId(byte[] buf, int off, int end) throws InvalidTokenException { StringBuffer sbuf = new StringBuffer(); off += minBPC; end -= minBPC; for (; off != end; off += minBPC) { char c = (char)byteToAscii(buf, off); switch (byteType(buf, off)) { case BT_MINUS: case BT_APOS: case BT_LPAR: case BT_RPAR: case BT_PLUS: case BT_COMMA: case BT_SOL: case BT_EQUALS: case BT_QUEST: case BT_SEMI: case BT_EXCL: case BT_AST: case BT_PERCNT: case BT_NUM: sbuf.append(c); break; case BT_S: if (charMatches(buf, off, '\t')) throw new InvalidTokenException(off); /* fall through */ case BT_CR: case BT_LF: if (sbuf.length() > 0 && sbuf.charAt(sbuf.length() - 1) != ' ') sbuf.append(' '); break; case BT_NAME: case BT_NMSTRT: if ((c & ~0x7f) == 0) { sbuf.append(c); break; } // fall through default: switch (c) { case '$': case '@': break; default: throw new InvalidTokenException(off); } break; } } if (sbuf.length() > 0 && sbuf.charAt(sbuf.length() - 1) == ' ') sbuf.setLength(sbuf.length() - 1); return sbuf.toString(); } /** * Returns true if the specified byte subarray is equal to the string. * The string must contain only XML significant characters. */ public final boolean matchesXMLString(byte[] buf, int off, int end, String str) { int len = str.length(); if (len*minBPC != end - off) return false; for (int i = 0; i < len; off += minBPC, i++) { if (!charMatches(buf, off, str.charAt(i))) return false; } return true; } /** * Skips over XML whitespace characters at the start of the specified * subarray. 
*
   * @return the index of the first non-whitespace character, or
   * <code>end</code> if the subarray is all whitespace
   */
  public final int skipS(byte[] buf, int off, int end) {
    int pos = off;
    while (pos < end) {
      final int type = byteType(buf, pos);
      if (type != BT_S && type != BT_CR && type != BT_LF)
        return pos;
      pos += minBPC;
    }
    return pos;
  }

  /* Tells whether the 2-byte character at off may appear in a name. */
  private final boolean isNameChar2(byte[] buf, int off) {
    switch (byteType2(buf, off)) {
    case BT_NAME:
    case BT_NMSTRT:
      return true;
    default:
      return false;
    }
  }

  /* Tells whether the 3-byte character at off may appear in a name. */
  private final boolean isNameChar3(byte[] buf, int off) {
    switch (byteType3(buf, off)) {
    case BT_NAME:
    case BT_NMSTRT:
      return true;
    default:
      return false;
    }
  }

  /* Tells whether the 4-byte character at off may appear in a name. */
  private final boolean isNameChar4(byte[] buf, int off) {
    switch (byteType4(buf, off)) {
    case BT_NAME:
    case BT_NMSTRT:
      return true;
    default:
      return false;
    }
  }

  /* Individual characters that are registered as BT_NMSTRT. */
  private static final String nameStartSingles =
  "\u003a\u005f\u0386\u038c\u03da\u03dc\u03de\u03e0\u0559\u06d5\u093d\u09b2" +
  "\u0a5e\u0a8d\u0abd\u0ae0\u0b3d\u0b9c\u0cde\u0e30\u0e84\u0e8a\u0e8d\u0ea5" +
  "\u0ea7\u0eb0\u0ebd\u1100\u1109\u113c\u113e\u1140\u114c\u114e\u1150\u1159" +
  "\u1163\u1165\u1167\u1169\u1175\u119e\u11a8\u11ab\u11ba\u11eb\u11f0\u11f9" +
  "\u1f59\u1f5b\u1f5d\u1fbe\u2126\u212e\u3007";

  /* Pairs (first, last) of inclusive ranges registered as BT_NMSTRT. */
  private static final String nameStartRanges =
  "\u0041\u005a\u0061\u007a\u00c0\u00d6\u00d8\u00f6\u00f8\u00ff\u0100\u0131" +
  "\u0134\u013e\u0141\u0148\u014a\u017e\u0180\u01c3\u01cd\u01f0\u01f4\u01f5" +
  "\u01fa\u0217\u0250\u02a8\u02bb\u02c1\u0388\u038a\u038e\u03a1\u03a3\u03ce" +
  "\u03d0\u03d6\u03e2\u03f3\u0401\u040c\u040e\u044f\u0451\u045c\u045e\u0481" +
  "\u0490\u04c4\u04c7\u04c8\u04cb\u04cc\u04d0\u04eb\u04ee\u04f5\u04f8\u04f9" +
  "\u0531\u0556\u0561\u0586\u05d0\u05ea\u05f0\u05f2\u0621\u063a\u0641\u064a" +
  "\u0671\u06b7\u06ba\u06be\u06c0\u06ce\u06d0\u06d3\u06e5\u06e6\u0905\u0939" +
  "\u0958\u0961\u0985\u098c\u098f\u0990\u0993\u09a8\u09aa\u09b0\u09b6\u09b9" +
  "\u09dc\u09dd\u09df\u09e1\u09f0\u09f1\u0a05\u0a0a\u0a0f\u0a10\u0a13\u0a28" +
  "\u0a2a\u0a30\u0a32\u0a33\u0a35\u0a36\u0a38\u0a39\u0a59\u0a5c\u0a72\u0a74" +
  "\u0a85\u0a8b\u0a8f\u0a91\u0a93\u0aa8\u0aaa\u0ab0\u0ab2\u0ab3\u0ab5\u0ab9" +
  "\u0b05\u0b0c\u0b0f\u0b10\u0b13\u0b28\u0b2a\u0b30\u0b32\u0b33\u0b36\u0b39" +
  "\u0b5c\u0b5d\u0b5f\u0b61\u0b85\u0b8a\u0b8e\u0b90\u0b92\u0b95\u0b99\u0b9a" +
  "\u0b9e\u0b9f\u0ba3\u0ba4\u0ba8\u0baa\u0bae\u0bb5\u0bb7\u0bb9\u0c05\u0c0c" +
  "\u0c0e\u0c10\u0c12\u0c28\u0c2a\u0c33\u0c35\u0c39\u0c60\u0c61\u0c85\u0c8c" +
  "\u0c8e\u0c90\u0c92\u0ca8\u0caa\u0cb3\u0cb5\u0cb9\u0ce0\u0ce1\u0d05\u0d0c" +
  "\u0d0e\u0d10\u0d12\u0d28\u0d2a\u0d39\u0d60\u0d61\u0e01\u0e2e\u0e32\u0e33" +
  "\u0e40\u0e45\u0e81\u0e82\u0e87\u0e88\u0e94\u0e97\u0e99\u0e9f\u0ea1\u0ea3" +
  "\u0eaa\u0eab\u0ead\u0eae\u0eb2\u0eb3\u0ec0\u0ec4\u0f40\u0f47\u0f49\u0f69" +
  "\u10a0\u10c5\u10d0\u10f6\u1102\u1103\u1105\u1107\u110b\u110c\u110e\u1112" +
  "\u1154\u1155\u115f\u1161\u116d\u116e\u1172\u1173\u11ae\u11af\u11b7\u11b8" +
  "\u11bc\u11c2\u1e00\u1e9b\u1ea0\u1ef9\u1f00\u1f15\u1f18\u1f1d\u1f20\u1f45" +
  "\u1f48\u1f4d\u1f50\u1f57\u1f5f\u1f7d\u1f80\u1fb4\u1fb6\u1fbc\u1fc2\u1fc4" +
  "\u1fc6\u1fcc\u1fd0\u1fd3\u1fd6\u1fdb\u1fe0\u1fec\u1ff2\u1ff4\u1ff6\u1ffc" +
  "\u212a\u212b\u2180\u2182\u3041\u3094\u30a1\u30fa\u3105\u312c\uac00\ud7a3" +
  "\u4e00\u9fa5\u3021\u3029";

  /* Individual characters that are registered as BT_NAME. */
  private static final String nameSingles =
  "\u002d\u002e\u05bf\u05c4\u0670\u093c\u094d\u09bc\u09be\u09bf\u09d7\u0a02" +
  "\u0a3c\u0a3e\u0a3f\u0abc\u0b3c\u0bd7\u0d57\u0e31\u0eb1\u0f35\u0f37\u0f39" +
  "\u0f3e\u0f3f\u0f97\u0fb9\u20e1\u3099\u309a\u00b7\u02d0\u02d1\u0387\u0640" +
  "\u0e46\u0ec6\u3005";

  /* Pairs (first, last) of inclusive ranges registered as BT_NAME. */
  private static final String nameRanges =
  "\u0300\u0345\u0360\u0361\u0483\u0486\u0591\u05a1\u05a3\u05b9\u05bb\u05bd" +
  "\u05c1\u05c2\u064b\u0652\u06d6\u06dc\u06dd\u06df\u06e0\u06e4\u06e7\u06e8" +
  "\u06ea\u06ed\u0901\u0903\u093e\u094c\u0951\u0954\u0962\u0963\u0981\u0983" +
  "\u09c0\u09c4\u09c7\u09c8\u09cb\u09cd\u09e2\u09e3\u0a40\u0a42\u0a47\u0a48" +
  "\u0a4b\u0a4d\u0a70\u0a71\u0a81\u0a83\u0abe\u0ac5\u0ac7\u0ac9\u0acb\u0acd" +
  "\u0b01\u0b03\u0b3e\u0b43\u0b47\u0b48\u0b4b\u0b4d\u0b56\u0b57\u0b82\u0b83" +
  "\u0bbe\u0bc2\u0bc6\u0bc8\u0bca\u0bcd\u0c01\u0c03\u0c3e\u0c44\u0c46\u0c48" +
  "\u0c4a\u0c4d\u0c55\u0c56\u0c82\u0c83\u0cbe\u0cc4\u0cc6\u0cc8\u0cca\u0ccd" +
  "\u0cd5\u0cd6\u0d02\u0d03\u0d3e\u0d43\u0d46\u0d48\u0d4a\u0d4d\u0e34\u0e3a" +
  "\u0e47\u0e4e\u0eb4\u0eb9\u0ebb\u0ebc\u0ec8\u0ecd\u0f18\u0f19\u0f71\u0f84" +
  "\u0f86\u0f8b\u0f90\u0f95\u0f99\u0fad\u0fb1\u0fb7\u20d0\u20dc\u302a\u302f" +
  "\u0030\u0039\u0660\u0669\u06f0\u06f9\u0966\u096f\u09e6\u09ef\u0a66\u0a6f" +
  "\u0ae6\u0aef\u0b66\u0b6f\u0be7\u0bef\u0c66\u0c6f\u0ce6\u0cef\u0d66\u0d6f" +
  "\u0e50\u0e59\u0ed0\u0ed9\u0f20\u0f29\u3031\u3035\u309d\u309e\u30fc\u30fe";

  /*
   * Character type lookup, indexed first by the high byte and then by the
   * low byte of a char.  Filled in by the static initializer below.
   */
  /* final */ static byte[][] charTypeTable;

  /*
   * Records the type of a single character.  Characters below 0x80 are
   * ignored here.  A page that does not exist yet is created filled with
   * BT_OTHER.
   */
  private static void setCharType(char c, int type) {
    if (c >= 0x80) {
      final int page = c >> 8;
      byte[] row = charTypeTable[page];
      if (row == null) {
        row = new byte[256];
        charTypeTable[page] = row;
        for (int i = 0; i < 256; i++)
          row[i] = BT_OTHER;
      }
      row[c & 0xFF] = (byte)type;
    }
  }

  /*
   * Records the type of every character in the inclusive range min..max.
   * Pages that lie completely inside the range all point at one array
   * created for this call; the remaining characters are set one by one.
   */
  private static void setCharType(char min, char max, int type) {
    byte[] fullPage = null;
    char c = min;
    for (;;) {
      if ((c & 0xFF) == 0) {
        // c starts a page: take whole pages while they fit in the range.
        while (c + 0xFF <= max) {
          if (fullPage == null) {
            fullPage = new byte[256];
            for (int i = 0; i < 256; i++)
              fullPage[i] = (byte)type;
          }
          charTypeTable[c >> 8] = fullPage;
          if (c + 0xFF == max)
            return;
          c += 0x100;
        }
      }
      setCharType(c, type);
      if (c == max)
        break;
      c++;
    }
  }

  static {
    charTypeTable = new byte[256][];

    // Registration order matters where entries overlap: a later call
    // overwrites the type stored by an earlier one.
    final int nameSingleCount = nameSingles.length();
    for (int i = 0; i < nameSingleCount; i++)
      setCharType(nameSingles.charAt(i), BT_NAME);

    final int nameRangeLength = nameRanges.length();
    for (int i = 0; i < nameRangeLength; i += 2)
      setCharType(nameRanges.charAt(i), nameRanges.charAt(i + 1), BT_NAME);

    final int startSingleCount = nameStartSingles.length();
    for (int i = 0; i < startSingleCount; i++)
      setCharType(nameStartSingles.charAt(i), BT_NMSTRT);

    final int startRangeLength = nameStartRanges.length();
    for (int i = 0; i < startRangeLength; i += 2)
      setCharType(nameStartRanges.charAt(i), nameStartRanges.charAt(i + 1),
                  BT_NMSTRT);

    setCharType('\uD800', '\uDBFF', BT_LEAD4);
    setCharType('\uDC00', '\uDFFF', BT_MALFORM);
    setCharType('\uFFFE', '\uFFFF', BT_NONXML);

    // Every page still missing shares one array filled with BT_OTHER.
    final byte[] other = new byte[256];
    for (int i = 0; i < 256; i++)
      other[i] = BT_OTHER;
    for (int page = 0; page < 256; page++) {
      if (charTypeTable[page] == null)
        charTypeTable[page] = other;
    }

    // The first 128 entries of page 0 come from the ASCII type table.
    System.arraycopy(asciiTypeTable, 0, charTypeTable[0], 0, 128);
  }

  /**
   * Returns the minimum number of bytes required to represent a single
   * character in this encoding.  The value will be 1, 2 or 4.
   */
  public final int getMinBytesPerChar() {
    return minBPC;
  }
}