266 lines
9.3 KiB
Java
266 lines
9.3 KiB
Java
/*
|
|
* Copyright (c) 2007 Henri Sivonen
|
|
* Copyright (c) 2007-2008 Mozilla Foundation
|
|
*
|
|
* Permission is hereby granted, free of charge, to any person obtaining a
|
|
* copy of this software and associated documentation files (the "Software"),
|
|
* to deal in the Software without restriction, including without limitation
|
|
* the rights to use, copy, modify, merge, publish, distribute, sublicense,
|
|
* and/or sell copies of the Software, and to permit persons to whom the
|
|
* Software is furnished to do so, subject to the following conditions:
|
|
*
|
|
* The above copyright notice and this permission notice shall be included in
|
|
* all copies or substantial portions of the Software.
|
|
*
|
|
* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
|
|
* IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
|
|
* FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
|
|
* THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
|
|
* LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
|
|
* FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER
|
|
* DEALINGS IN THE SOFTWARE.
|
|
*/
|
|
|
|
package nu.validator.htmlparser.gwt;
|
|
|
|
import java.util.LinkedList;
|
|
|
|
import nu.validator.htmlparser.common.XmlViolationPolicy;
|
|
import nu.validator.htmlparser.impl.ErrorReportingTokenizer;
|
|
import nu.validator.htmlparser.impl.Tokenizer;
|
|
import nu.validator.htmlparser.impl.UTF16Buffer;
|
|
|
|
import org.xml.sax.ErrorHandler;
|
|
import org.xml.sax.SAXException;
|
|
import org.xml.sax.SAXParseException;
|
|
|
|
import com.google.gwt.core.client.JavaScriptObject;
|
|
import com.google.gwt.user.client.Timer;
|
|
|
|
/**
|
|
* This class implements an HTML5 parser that exposes data through the DOM
|
|
* interface.
|
|
*
|
|
* <p>By default, when using the constructor without arguments,
|
|
* this parser treats XML 1.0-incompatible infosets as fatal errors.
|
|
* This corresponds to
|
|
* <code>FATAL</code> as the general XML violation policy. To make the parser
|
|
* support non-conforming HTML fully per the HTML 5 spec while on the other
|
|
* hand potentially violating the DOM API contract, set the general XML
|
|
* violation policy to <code>ALLOW</code>. This does not work with a standard
|
|
* DOM implementation. Handling all input without fatal errors and without
|
|
* violating the DOM API contract is possible by setting
|
|
* the general XML violation policy to <code>ALTER_INFOSET</code>. <em>This
|
|
* makes the parser non-conforming</em> but is probably the most useful
|
|
* setting for most applications.
|
|
*
|
|
* <p>The doctype is not represented in the tree.
|
|
*
|
|
* <p>The document mode is represented as user data <code>DocumentMode</code>
|
|
* object with the key <code>nu.validator.document-mode</code> on the document
|
|
* node.
|
|
*
|
|
* <p>The form pointer is also stored as user data with the key
|
|
* <code>nu.validator.form-pointer</code>.
|
|
*
|
|
* @version $Id: HtmlDocumentBuilder.java 255 2008-05-29 08:57:38Z hsivonen $
|
|
* @author hsivonen
|
|
*/
|
|
public class HtmlParser {
|
|
|
|
private static final int CHUNK_SIZE = 512;
|
|
|
|
private final Tokenizer tokenizer;
|
|
|
|
private final BrowserTreeBuilder domTreeBuilder;
|
|
|
|
private final StringBuilder documentWriteBuffer = new StringBuilder();
|
|
|
|
private ErrorHandler errorHandler;
|
|
|
|
private UTF16Buffer stream;
|
|
|
|
private int streamLength;
|
|
|
|
private boolean lastWasCR;
|
|
|
|
private boolean ending;
|
|
|
|
private ParseEndListener parseEndListener;
|
|
|
|
private final LinkedList<UTF16Buffer> bufferStack = new LinkedList<UTF16Buffer>();
|
|
|
|
/**
|
|
* Instantiates the parser
|
|
*
|
|
* @param implementation
|
|
* the DOM implementation
|
|
* @param xmlPolicy the policy
|
|
*/
|
|
public HtmlParser(JavaScriptObject document) {
|
|
this.domTreeBuilder = new BrowserTreeBuilder(document);
|
|
this.tokenizer = new ErrorReportingTokenizer(domTreeBuilder);
|
|
this.domTreeBuilder.setNamePolicy(XmlViolationPolicy.ALTER_INFOSET);
|
|
this.tokenizer.setCommentPolicy(XmlViolationPolicy.ALTER_INFOSET);
|
|
this.tokenizer.setContentNonXmlCharPolicy(XmlViolationPolicy.ALTER_INFOSET);
|
|
this.tokenizer.setContentSpacePolicy(XmlViolationPolicy.ALTER_INFOSET);
|
|
this.tokenizer.setNamePolicy(XmlViolationPolicy.ALTER_INFOSET);
|
|
this.tokenizer.setXmlnsPolicy(XmlViolationPolicy.ALTER_INFOSET);
|
|
}
|
|
|
|
/**
|
|
* Parses a document from a SAX <code>InputSource</code>.
|
|
* @param is the source
|
|
* @return the doc
|
|
* @see javax.xml.parsers.DocumentBuilder#parse(org.xml.sax.InputSource)
|
|
*/
|
|
public void parse(String source, ParseEndListener callback) throws SAXException {
|
|
parseEndListener = callback;
|
|
domTreeBuilder.setFragmentContext(null);
|
|
tokenize(source, null);
|
|
}
|
|
|
|
/**
|
|
* @param is
|
|
* @throws SAXException
|
|
* @throws IOException
|
|
* @throws MalformedURLException
|
|
*/
|
|
private void tokenize(String source, String context) throws SAXException {
|
|
lastWasCR = false;
|
|
ending = false;
|
|
documentWriteBuffer.setLength(0);
|
|
streamLength = source.length();
|
|
stream = new UTF16Buffer(source.toCharArray(), 0,
|
|
(streamLength < CHUNK_SIZE ? streamLength : CHUNK_SIZE));
|
|
bufferStack.clear();
|
|
push(stream);
|
|
domTreeBuilder.setFragmentContext(context == null ? null : context.intern());
|
|
tokenizer.start();
|
|
pump();
|
|
}
|
|
|
|
private void pump() throws SAXException {
|
|
if (ending) {
|
|
tokenizer.end();
|
|
domTreeBuilder.getDocument(); // drops the internal reference
|
|
parseEndListener.parseComplete();
|
|
// Don't schedule timeout
|
|
return;
|
|
}
|
|
|
|
int docWriteLen = documentWriteBuffer.length();
|
|
if (docWriteLen > 0) {
|
|
char[] newBuf = new char[docWriteLen];
|
|
documentWriteBuffer.getChars(0, docWriteLen, newBuf, 0);
|
|
push(new UTF16Buffer(newBuf, 0, docWriteLen));
|
|
documentWriteBuffer.setLength(0);
|
|
}
|
|
|
|
for (;;) {
|
|
UTF16Buffer buffer = peek();
|
|
if (!buffer.hasMore()) {
|
|
if (buffer == stream) {
|
|
if (buffer.getEnd() == streamLength) {
|
|
// Stop parsing
|
|
tokenizer.eof();
|
|
ending = true;
|
|
break;
|
|
} else {
|
|
int newEnd = buffer.getStart() + CHUNK_SIZE;
|
|
buffer.setEnd(newEnd < streamLength ? newEnd
|
|
: streamLength);
|
|
continue;
|
|
}
|
|
} else {
|
|
pop();
|
|
continue;
|
|
}
|
|
}
|
|
// now we have a non-empty buffer
|
|
buffer.adjust(lastWasCR);
|
|
lastWasCR = false;
|
|
if (buffer.hasMore()) {
|
|
lastWasCR = tokenizer.tokenizeBuffer(buffer);
|
|
domTreeBuilder.maybeRunScript();
|
|
break;
|
|
} else {
|
|
continue;
|
|
}
|
|
}
|
|
|
|
// schedule
|
|
Timer timer = new Timer() {
|
|
|
|
@Override public void run() {
|
|
try {
|
|
pump();
|
|
} catch (SAXException e) {
|
|
ending = true;
|
|
if (errorHandler != null) {
|
|
try {
|
|
errorHandler.fatalError(new SAXParseException(
|
|
e.getMessage(), null, null, -1, -1, e));
|
|
} catch (SAXException e1) {
|
|
}
|
|
}
|
|
}
|
|
}
|
|
|
|
};
|
|
timer.schedule(1);
|
|
}
|
|
|
|
private void push(UTF16Buffer buffer) {
|
|
bufferStack.addLast(buffer);
|
|
}
|
|
|
|
private UTF16Buffer peek() {
|
|
return bufferStack.getLast();
|
|
}
|
|
|
|
private void pop() {
|
|
bufferStack.removeLast();
|
|
}
|
|
|
|
public void documentWrite(String text) throws SAXException {
|
|
UTF16Buffer buffer = new UTF16Buffer(text.toCharArray(), 0, text.length());
|
|
while (buffer.hasMore()) {
|
|
buffer.adjust(lastWasCR);
|
|
lastWasCR = false;
|
|
if (buffer.hasMore()) {
|
|
lastWasCR = tokenizer.tokenizeBuffer(buffer);
|
|
domTreeBuilder.maybeRunScript();
|
|
}
|
|
}
|
|
}
|
|
|
|
/**
|
|
* @see javax.xml.parsers.DocumentBuilder#setErrorHandler(org.xml.sax.ErrorHandler)
|
|
*/
|
|
public void setErrorHandler(ErrorHandler errorHandler) {
|
|
this.errorHandler = errorHandler;
|
|
domTreeBuilder.setErrorHandler(errorHandler);
|
|
tokenizer.setErrorHandler(errorHandler);
|
|
}
|
|
|
|
/**
|
|
* Sets whether comment nodes appear in the tree.
|
|
* @param ignoreComments <code>true</code> to ignore comments
|
|
* @see nu.validator.htmlparser.impl.TreeBuilder#setIgnoringComments(boolean)
|
|
*/
|
|
public void setIgnoringComments(boolean ignoreComments) {
|
|
domTreeBuilder.setIgnoringComments(ignoreComments);
|
|
}
|
|
|
|
/**
|
|
* Sets whether the parser considers scripting to be enabled for noscript treatment.
|
|
* @param scriptingEnabled <code>true</code> to enable
|
|
* @see nu.validator.htmlparser.impl.TreeBuilder#setScriptingEnabled(boolean)
|
|
*/
|
|
public void setScriptingEnabled(boolean scriptingEnabled) {
|
|
domTreeBuilder.setScriptingEnabled(scriptingEnabled);
|
|
}
|
|
|
|
}
|