/* * Copyright (c) 2007 Henri Sivonen * Copyright (c) 2007-2008 Mozilla Foundation * * Permission is hereby granted, free of charge, to any person obtaining a * copy of this software and associated documentation files (the "Software"), * to deal in the Software without restriction, including without limitation * the rights to use, copy, modify, merge, publish, distribute, sublicense, * and/or sell copies of the Software, and to permit persons to whom the * Software is furnished to do so, subject to the following conditions: * * The above copyright notice and this permission notice shall be included in * all copies or substantial portions of the Software. * * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER * DEALINGS IN THE SOFTWARE. */ package nu.validator.htmlparser.gwt; import java.util.LinkedList; import nu.validator.htmlparser.common.XmlViolationPolicy; import nu.validator.htmlparser.impl.ErrorReportingTokenizer; import nu.validator.htmlparser.impl.Tokenizer; import nu.validator.htmlparser.impl.UTF16Buffer; import org.xml.sax.ErrorHandler; import org.xml.sax.SAXException; import org.xml.sax.SAXParseException; import com.google.gwt.core.client.JavaScriptObject; import com.google.gwt.user.client.Timer; /** * This class implements an HTML5 parser that exposes data through the DOM * interface. * *
By default, when using the constructor without arguments,
* this parser treats XML 1.0-incompatible infosets as fatal errors.
* This corresponds to
* FATAL
as the general XML violation policy. To make the parser
* support non-conforming HTML fully per the HTML 5 spec while on the other
* hand potentially violating the DOM API contract, set the general XML
* violation policy to ALLOW
. This does not work with a standard
* DOM implementation. Handling all input without fatal errors and without
* violating the DOM API contract is possible by setting
* the general XML violation policy to ALTER_INFOSET
. This
* makes the parser non-conforming but is probably the most useful
* setting for most applications.
*
*
The doctype is not represented in the tree. * *
The document mode is represented as user data DocumentMode
* object with the key nu.validator.document-mode
on the document
* node.
*
*
The form pointer is also stored as user data with the key
* nu.validator.form-pointer
.
*
* @version $Id: HtmlDocumentBuilder.java 255 2008-05-29 08:57:38Z hsivonen $
* @author hsivonen
*/
public class HtmlParser {
// Number of chars by which the readable window of the source buffer is
// opened initially and extended on each later step (see tokenize() and pump()).
private static final int CHUNK_SIZE = 512;
// Tokenizer that this class starts, feeds with buffers, and ends.
private final Tokenizer tokenizer;
// Tree builder that receives the fragment context, the error handler and
// the configuration flags set through this class.
private final BrowserTreeBuilder domTreeBuilder;
// Pending text that pump() turns into a new buffer on top of the stack.
// NOTE(review): nothing in the visible part of this class appends to it.
private final StringBuilder documentWriteBuffer = new StringBuilder();
// Handler set via setErrorHandler(); notified with fatalError when a
// timer-driven pump() throws a SAXException. May be null.
private ErrorHandler errorHandler;
// Buffer over the whole source char array; only its end index advances.
private UTF16Buffer stream;
// Length of the source string; the final value for the end of "stream".
private int streamLength;
// Result of the most recent tokenizeBuffer() call, handed to the next
// adjust() call and then cleared.
private boolean lastWasCR;
// Set once eof has been signalled to the tokenizer (or a timer-driven step
// failed); when true, the next pump() finalizes instead of tokenizing.
private boolean ending;
// Callback whose parseComplete() is invoked when the parse is finalized.
private ParseEndListener parseEndListener;
private final LinkedListInputSource
.
* @param source the document source text
* @param callback the listener notified when parsing is complete
* @see javax.xml.parsers.DocumentBuilder#parse(org.xml.sax.InputSource)
*/
public void parse(String source, ParseEndListener callback) throws SAXException {
    // A whole-document parse has no fragment context.
    domTreeBuilder.setFragmentContext(null);
    // Kept so that the final pump() step can report completion.
    this.parseEndListener = callback;
    tokenize(source, null);
}
/**
 * Resets the per-parse state, wraps the source in a buffer whose readable
 * window initially covers at most {@code CHUNK_SIZE} chars, starts the
 * tokenizer and runs the first parsing step.
 *
 * @param source the text to tokenize
 * @param context the fragment context handed (interned) to the tree
 *        builder, or {@code null} for none
 * @throws SAXException if starting the tokenizer or the first parsing
 *         step throws one
 */
private void tokenize(String source, String context) throws SAXException {
    // Forget whatever an earlier parse left behind.
    ending = false;
    lastWasCR = false;
    documentWriteBuffer.setLength(0);

    // The buffer spans the whole source, but only the first chunk is
    // readable at first; pump() widens the window later.
    streamLength = source.length();
    final int initialEnd = Math.min(streamLength, CHUNK_SIZE);
    stream = new UTF16Buffer(source.toCharArray(), 0, initialEnd);

    // The source buffer becomes the sole (bottom) entry of the stack.
    bufferStack.clear();
    push(stream);

    final String internedContext;
    if (context == null) {
        internedContext = null;
    } else {
        internedContext = context.intern();
    }
    domTreeBuilder.setFragmentContext(internedContext);

    tokenizer.start();
    pump();
}
/**
 * Runs one step of the incremental parse and, unless the parse was just
 * finalized, schedules the next step on a GWT timer.
 *
 * <p>If {@code ending} is set, the tokenizer is ended, the tree builder's
 * document is fetched, the parse-end listener (if any) is notified, and
 * nothing further is scheduled. Otherwise any text pending in
 * {@code documentWriteBuffer} is pushed as a new buffer, and the buffer on
 * top of the stack is tokenized once. Exhausted buffers other than the
 * source buffer are popped; the source buffer's window is widened by up to
 * {@code CHUNK_SIZE} chars until it reaches {@code streamLength}, at which
 * point eof is signalled and {@code ending} is set.
 *
 * <p>A {@code SAXException} thrown by a timer-driven step is reported to
 * the error handler via {@code fatalError} and stops further scheduling.
 *
 * @throws SAXException if a tokenizer or tree builder call made directly
 *         by this invocation throws one
 */
private void pump() throws SAXException {
    if (ending) {
        tokenizer.end();
        domTreeBuilder.getDocument(); // drops the internal reference
        // Tolerate a missing listener so that parse(source, null) does not
        // end in a NullPointerException (typically inside the timer
        // callback, where only SAXException is caught).
        if (parseEndListener != null) {
            parseEndListener.parseComplete();
        }
        // Don't schedule timeout
        return;
    }

    // Text queued in documentWriteBuffer goes on top of the stack so that
    // it is tokenized before the rest of the source.
    int docWriteLen = documentWriteBuffer.length();
    if (docWriteLen > 0) {
        char[] newBuf = new char[docWriteLen];
        documentWriteBuffer.getChars(0, docWriteLen, newBuf, 0);
        push(new UTF16Buffer(newBuf, 0, docWriteLen));
        documentWriteBuffer.setLength(0);
    }

    for (;;) {
        UTF16Buffer buffer = peek();
        if (!buffer.hasMore()) {
            if (buffer != stream) {
                // An exhausted pushed buffer: discard it and look below.
                pop();
                continue;
            }
            if (buffer.getEnd() == streamLength) {
                // Stop parsing; the next pump() call finalizes.
                tokenizer.eof();
                ending = true;
                break;
            }
            // Widen the readable window of the source by one more chunk,
            // never past the end of the source.
            int newEnd = buffer.getStart() + CHUNK_SIZE;
            buffer.setEnd(newEnd < streamLength ? newEnd : streamLength);
            continue;
        }
        // now we have a non-empty buffer
        // NOTE(review): adjust() presumably skips a leading LF after a CR
        // that ended the previous buffer - confirm in UTF16Buffer.
        buffer.adjust(lastWasCR);
        lastWasCR = false;
        if (buffer.hasMore()) {
            lastWasCR = tokenizer.tokenizeBuffer(buffer);
            domTreeBuilder.maybeRunScript();
            break;
        }
        // adjust() consumed what was left; pick the next buffer.
    }

    // schedule the next step
    Timer timer = new Timer() {
        @Override public void run() {
            try {
                pump();
            } catch (SAXException e) {
                // No further step is scheduled after a failure.
                ending = true;
                if (errorHandler != null) {
                    try {
                        errorHandler.fatalError(new SAXParseException(
                                e.getMessage(), null, null, -1, -1, e));
                    } catch (SAXException ignored) {
                        // Deliberately ignored: the handler rethrowing from
                        // fatalError leaves nobody else to report to here.
                    }
                }
            }
        }
    };
    timer.schedule(1);
}
/**
 * Puts a buffer on top of the buffer stack (the end of {@code bufferStack}).
 *
 * @param buffer the buffer to tokenize next
 */
private void push(UTF16Buffer buffer) {
    // For a LinkedList, add() appends at the end, exactly like addLast().
    bufferStack.add(buffer);
}
/**
 * Returns the buffer on top of the buffer stack without removing it.
 *
 * @return the last element of {@code bufferStack}
 */
private UTF16Buffer peek() {
    final UTF16Buffer top = bufferStack.getLast();
    return top;
}
/**
 * Removes the buffer on top of the buffer stack (the last element of
 * {@code bufferStack}); the removed buffer is discarded.
 */
private void pop() {
bufferStack.removeLast();
}
/**
 * Tokenizes the given text immediately, in the calling context, using a
 * buffer of its own rather than the buffer stack. After every tokenizer
 * pass the tree builder gets a chance to run a script.
 * NOTE(review): presumably the entry point for script-generated markup
 * (document.write) - confirm against the callers.
 *
 * @param text the text to tokenize
 * @throws SAXException if the tokenizer or the tree builder throws one
 */
public void documentWrite(String text) throws SAXException {
    final char[] chars = text.toCharArray();
    final UTF16Buffer written = new UTF16Buffer(chars, 0, text.length());
    while (written.hasMore()) {
        // Hand over the carriage-return state left by the previous pass.
        written.adjust(lastWasCR);
        lastWasCR = false;
        if (!written.hasMore()) {
            // adjust() used up the remainder; the loop test ends the loop.
            continue;
        }
        lastWasCR = tokenizer.tokenizeBuffer(written);
        domTreeBuilder.maybeRunScript();
    }
}
/**
 * Stores the error handler on this parser and forwards the same handler to
 * the tree builder and to the tokenizer. The stored handler is the one
 * notified with {@code fatalError} when a timer-driven parsing step throws
 * a {@code SAXException}.
 *
 * @param errorHandler the handler to install
 * @see javax.xml.parsers.DocumentBuilder#setErrorHandler(org.xml.sax.ErrorHandler)
 */
public void setErrorHandler(ErrorHandler errorHandler) {
this.errorHandler = errorHandler;
domTreeBuilder.setErrorHandler(errorHandler);
tokenizer.setErrorHandler(errorHandler);
}
/**
 * Sets whether comment nodes appear in the tree. The flag is passed
 * straight to the tree builder.
 *
 * @param ignoreComments {@code true} to ignore comments
 * @see nu.validator.htmlparser.impl.TreeBuilder#setIgnoringComments(boolean)
 */
public void setIgnoringComments(boolean ignoreComments) {
domTreeBuilder.setIgnoringComments(ignoreComments);
}
/**
 * Sets whether the parser considers scripting to be enabled for noscript
 * treatment. The flag is passed straight to the tree builder.
 *
 * @param scriptingEnabled {@code true} to enable
 * @see nu.validator.htmlparser.impl.TreeBuilder#setScriptingEnabled(boolean)
 */
public void setScriptingEnabled(boolean scriptingEnabled) {
domTreeBuilder.setScriptingEnabled(scriptingEnabled);
}
}