summary | refs | log | tree | commit | diff
path: root/render/libxml_binding.c
diff options
context:
space:
mode:
Diffstat (limited to 'render/libxml_binding.c')
-rw-r--r-- render/libxml_binding.c | 308
1 files changed, 308 insertions, 0 deletions
diff --git a/render/libxml_binding.c b/render/libxml_binding.c
new file mode 100644
index 000000000..51cf0a6be
--- /dev/null
+++ b/render/libxml_binding.c
@@ -0,0 +1,308 @@
+/*
+ * Copyright 2007 James Bursa <bursa@users.sourceforge.net>
+ * Copyright 2008 John-Mark Bell <jmb@netsurf-browser.org>
+ *
+ * This file is part of NetSurf, http://www.netsurf-browser.org/
+ *
+ * NetSurf is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License as published by
+ * the Free Software Foundation; version 2 of the License.
+ *
+ * NetSurf is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program. If not, see <http://www.gnu.org/licenses/>.
+ */
+
+#ifndef WITH_HUBBUB
+
+#include <stdbool.h>
+#include <string.h>
+
+#include <libxml/HTMLparser.h>
+#include <libxml/HTMLtree.h>
+#include <libxml/parser.h>
+#include <libxml/parserInternals.h>
+
+#include "render/parser_binding.h"
+
+#include "utils/log.h"
+#include "utils/talloc.h"
+
+typedef struct libxml_ctx { /* Per-parse state of the libxml2 HTML binding; handed to callers as an opaque void pointer. */
+ htmlParserCtxt *parser; /**< libxml2 HTML push parser; its myDoc is the tree being built. */
+
+ /** Encoding handler last looked up by set_parser_encoding(); NULL if none was found or requested. */
+ xmlCharEncodingHandler *encoding_handler;
+
+ const char *encoding; /**< Name of the encoding in use, or NULL if not yet known. Borrowed pointer: never freed by this file. */
+ binding_encoding_source encoding_source; /**< Where encoding came from: header, detection, or a meta element. */
+
+ bool getenc; /**< true while binding_parse_chunk() should still try to detect the encoding from the data. */
+} libxml_ctx;
+
+static bool set_parser_encoding(libxml_ctx *c, const char *encoding); /* switch the parser to a named encoding */
+static const char *detect_encoding(const char **data, size_t *size); /* sniff a BOM or UTF-16/UCS-4 byte pattern at the start of the data */
+
+/**
+ * Create a binding context backed by a libxml2 HTML push parser.
+ *
+ * \param arena    Not used by this implementation
+ * \param charset  Name of the encoding to start with, or NULL if none is
+ *                 known. The pointer is stored in the context, not copied.
+ * \return opaque context handle, or NULL on failure
+ */
+void *binding_create_tree(void *arena, const char *charset)
+{
+	libxml_ctx *c = malloc(sizeof(libxml_ctx));
+
+	if (c == NULL)
+		return NULL;
+
+	c->encoding_handler = NULL;
+	c->encoding = charset;
+	c->encoding_source = ENCODING_SOURCE_HEADER;
+	c->getenc = true;
+
+	c->parser = htmlCreatePushParserCtxt(0, 0, "", 0, 0,
+			XML_CHAR_ENCODING_NONE);
+	if (c->parser == NULL) {
+		free(c);
+		return NULL;
+	}
+
+	/* Without a charset there is nothing to switch to; otherwise the
+	 * context is only usable if the switch succeeds. */
+	if (charset == NULL || set_parser_encoding(c, charset))
+		return (void *) c;
+
+	/* Switching encoding failed: unwind everything built so far */
+	if (c->parser->myDoc != NULL)
+		xmlFreeDoc(c->parser->myDoc);
+	htmlFreeParserCtxt(c->parser);
+	free(c);
+
+	return NULL;
+}
+
+/**
+ * Destroy a binding context and everything it still owns.
+ *
+ * The document tree is freed only if it is still attached to the parser;
+ * once binding_get_document() has detached it, it is left untouched.
+ *
+ * \param ctx  context from binding_create_tree(), may be NULL (no-op)
+ */
+void binding_destroy_tree(void *ctx)
+{
+	libxml_ctx *c = (libxml_ctx *) ctx;
+
+	if (c == NULL)
+		return;
+
+	/* Check the parser before looking inside it: the document pointer
+	 * lives in the parser context. */
+	if (c->parser != NULL) {
+		if (c->parser->myDoc != NULL)
+			xmlFreeDoc(c->parser->myDoc);
+
+		htmlFreeParserCtxt(c->parser);
+	}
+
+	c->parser = NULL;
+	c->encoding = NULL;
+
+	free(c);
+}
+
+/**
+ * Feed a chunk of document data to the parser.
+ *
+ * While encoding detection is still pending, the start of the chunk is
+ * first examined by detect_encoding(); a detected encoding is applied to
+ * the parser and any BOM is skipped.
+ *
+ * \param ctx   context from binding_create_tree()
+ * \param data  bytes to parse
+ * \param len   number of bytes at \a data
+ * \return BINDING_OK if the data was passed to the parser (or consisted
+ *         solely of a BOM),
+ *         BINDING_NOMEM if switching to a detected encoding failed,
+ *         BINDING_ENCODINGCHANGE if no encoding was known beforehand and
+ *         the parser has now picked one up from the document
+ */
+binding_error binding_parse_chunk(void *ctx, const uint8_t *data, size_t len)
+{
+	libxml_ctx *c = (libxml_ctx *) ctx;
+
+	if (c->getenc) {
+		/* No encoding was specified in the Content-Type header.
+		 * Attempt to detect if the encoding is not 8-bit. If the
+		 * encoding is 8-bit, leave the parser unchanged, so that it
+		 * searches for a <meta http-equiv="content-type"
+		 * content="text/html; charset=...">. */
+		const char *encoding;
+
+		encoding = detect_encoding((const char **) (void *) &data,
+				&len);
+		if (encoding) {
+			if (!set_parser_encoding(c, encoding))
+				return BINDING_NOMEM;
+			c->encoding = encoding;
+			c->encoding_source = ENCODING_SOURCE_DETECTED;
+		}
+		c->getenc = false;
+
+		/* The data we received may have solely consisted of a BOM.
+		 * If so, it will have been stripped by detect_encoding.
+		 * Therefore, we'll have nothing to do in that case. */
+		if (len == 0)
+			return BINDING_OK;
+	}
+
+	htmlParseChunk(c->parser, (const char *) data, len, 0);
+	/** \todo error handling */
+
+	if (!c->encoding && c->parser->input->encoding) {
+		/* The encoding was not in headers or detected,
+		 * and the parser found a <meta http-equiv="content-type"
+		 * content="text/html; charset=...">. */
+		const char *found = (const char *) c->parser->input->encoding;
+
+		/* However, if that encoding is non-ASCII-compatible,
+		 * ignore it, as it can't possibly be correct */
+		if (strncasecmp(found, "UTF-16", 6) == 0 || /* UTF-16(LE|BE)? */
+				strncasecmp(found, "UTF-32", 6) == 0) { /* UTF-32(LE|BE)? */
+			c->encoding = "ISO-8859-1";
+			c->encoding_source = ENCODING_SOURCE_DETECTED;
+		} else {
+			/* Borrowed from the parser's input stream, not copied */
+			c->encoding = found;
+			c->encoding_source = ENCODING_SOURCE_META;
+		}
+
+		/* Both arms above leave c->encoding non-NULL, so there is no
+		 * failure case to report here. */
+
+		/* have the encoding; don't attempt to detect it */
+		c->getenc = false;
+
+		return BINDING_ENCODINGCHANGE;
+	}
+
+	return BINDING_OK;
+}
+
+/**
+ * Tell the parser that no more data will follow.
+ *
+ * \param ctx  context from binding_create_tree()
+ * \return BINDING_OK always; parser errors are not yet reported
+ */
+binding_error binding_parse_completed(void *ctx)
+{
+	libxml_ctx *binding = ctx;
+
+	/* An empty final chunk with the last argument set to 1 */
+	htmlParseChunk(binding->parser, "", 0, 1);
+	/** \todo error handling */
+
+	return BINDING_OK;
+}
+
+/**
+ * Report the encoding currently recorded in the context.
+ *
+ * \param ctx     context from binding_create_tree()
+ * \param source  receives the origin of the encoding
+ * \return encoding name, or NULL if none has been recorded
+ */
+const char *binding_get_encoding(void *ctx, binding_encoding_source *source)
+{
+	const libxml_ctx *state = ctx;
+
+	*source = state->encoding_source;
+	return state->encoding;
+}
+
+/**
+ * Detach the document tree from the parser and return it.
+ *
+ * The parser's document pointer is cleared, so binding_destroy_tree() will
+ * not free the returned tree.
+ *
+ * \param ctx  context from binding_create_tree()
+ * \return the document built so far (may be NULL)
+ */
+xmlDocPtr binding_get_document(void *ctx)
+{
+	htmlParserCtxt *parser = ((libxml_ctx *) ctx)->parser;
+	xmlDocPtr taken = parser->myDoc;
+
+	parser->myDoc = NULL;
+
+	return taken;
+}
+
+/******************************************************************************/
+
+/**
+ * Set the HTML parser character encoding.
+ *
+ * \param c         context
+ * \param encoding  name of encoding
+ * \return true on success (this includes the case where no handler exists
+ *         for the encoding, which is logged and otherwise ignored),
+ *         false on error and error reported
+ */
+bool set_parser_encoding(libxml_ctx *c, const char *encoding)
+{
+	xmlError *error;
+
+	c->encoding_handler = xmlFindCharEncodingHandler(encoding);
+	if (!c->encoding_handler) {
+		/* either out of memory, or no handler available */
+		/* assume no handler available, which is not a fatal error */
+		LOG(("no encoding handler for \"%s\"", encoding));
+		/* \todo warn user and ask them to install iconv? */
+		return true;
+	}
+
+	xmlCtxtResetLastError(c->parser);
+	if (xmlSwitchToEncoding(c->parser, c->encoding_handler)) {
+		error = xmlCtxtGetLastError(c->parser);
+		LOG(("xmlSwitchToEncoding(): %s",
+				error ? error->message : "failed"));
+		return false;
+	}
+
+	/* Dirty hack to get around libxml oddness:
+	 * 1) When creating a push parser context, the input flow's encoding
+	 *    string is not set (whether an encoding is specified or not)
+	 * 2) When switching encoding (as above), the input flow's encoding
+	 *    string is never changed
+	 * 3) When handling a meta charset, the input flow's encoding string
+	 *    is checked to determine if an encoding has already been set.
+	 *    If it has been set, then the meta charset is ignored.
+	 *
+	 * The upshot of this is that, if we don't explicitly set the input
+	 * flow's encoding string here, any meta charset in the document
+	 * will override our setting, which is incorrect behaviour.
+	 *
+	 * Ideally, this would be fixed in libxml, but that requires rather
+	 * more knowledge than I currently have of what libxml is doing.
+	 */
+	if (!c->parser->input->encoding) {
+		xmlChar *name = xmlStrdup((const xmlChar *) encoding);
+
+		/* A failed copy would silently leave the encoding string
+		 * unset, which is exactly the state the hack must avoid. */
+		if (name == NULL) {
+			LOG(("xmlStrdup(): out of memory"));
+			return false;
+		}
+
+		c->parser->input->encoding = name;
+	}
+
+	/* Ensure no one else attempts to reset the encoding */
+	c->getenc = false;
+
+	return true;
+}
+
+/**
+ * Attempt to detect the encoding of some HTML data.
+ *
+ * \param data  Pointer to HTML source data
+ * \param size  Pointer to length of data
+ * \return a constant string giving the encoding, or NULL if the encoding
+ *         appears to be some 8-bit encoding
+ *
+ * If a BOM is encountered, *data and *size will be modified to skip over it.
+ *
+ * The UTF-32 BOMs and the BOM-less UTF-16 / UCS-4 patterns need four bytes.
+ * A UTF-16BE BOM is recognised from two bytes and a UTF-8 BOM from three.
+ * A UTF-16LE BOM (ff fe) is only accepted when four bytes are available,
+ * because with fewer it cannot be told apart from the start of a UTF-32LE
+ * BOM (ff fe 00 00).
+ */
+
+const char *detect_encoding(const char **data, size_t *size)
+{
+	const unsigned char *d = (const unsigned char *) *data;
+	const char *name = NULL;
+	size_t bom = 0;	/* number of BOM bytes to skip */
+
+	/* this detection assumes that the first two characters are <= 0xff */
+	if (*size >= 4) {
+		if (d[0] == 0x00 && d[1] == 0x00 &&
+				d[2] == 0xfe && d[3] == 0xff) {
+			/* BOM 00 00 fe ff */
+			name = "UTF-32BE";
+			bom = 4;
+		} else if (d[0] == 0xff && d[1] == 0xfe &&
+				d[2] == 0x00 && d[3] == 0x00) {
+			/* BOM ff fe 00 00 */
+			name = "UTF-32LE";
+			bom = 4;
+		} else if (d[0] == 0x00 && d[1] != 0x00 &&
+				d[2] == 0x00 && d[3] != 0x00) {
+			/* 00 xx 00 xx */
+			name = "UTF-16BE";
+		} else if (d[0] != 0x00 && d[1] == 0x00 &&
+				d[2] != 0x00 && d[3] == 0x00) {
+			/* xx 00 xx 00 */
+			name = "UTF-16LE";
+		} else if (d[0] == 0x00 && d[1] == 0x00 &&
+				d[2] == 0x00 && d[3] != 0x00) {
+			/* 00 00 00 xx */
+			name = "ISO-10646-UCS-4";
+		} else if (d[0] != 0x00 && d[1] == 0x00 &&
+				d[2] == 0x00 && d[3] == 0x00) {
+			/* xx 00 00 00 */
+			name = "ISO-10646-UCS-4";
+		} else if (d[0] == 0xff && d[1] == 0xfe) {
+			/* BOM ff fe, known not to be followed by 00 00 */
+			name = "UTF-16LE";
+			bom = 2;
+		}
+	}
+
+	if (name == NULL && *size >= 2 &&
+			d[0] == 0xfe && d[1] == 0xff) {
+		/* BOM fe ff */
+		name = "UTF-16BE";
+		bom = 2;
+	}
+
+	if (name == NULL && *size >= 3 &&
+			d[0] == 0xef && d[1] == 0xbb && d[2] == 0xbf) {
+		/* BOM ef bb bf */
+		name = "UTF-8";
+		bom = 3;
+	}
+
+	*data += bom;
+	*size -= bom;
+
+	return name;
+}
+
+#endif
+