summary refs log tree commit diff
path: root/render/libxml_binding.c
diff options
context:
space:
mode:
Diffstat (limited to 'render/libxml_binding.c')
-rw-r--r-- render/libxml_binding.c 312
1 files changed, 0 insertions, 312 deletions
diff --git a/render/libxml_binding.c b/render/libxml_binding.c
index fdff19b70..e69de29bb 100644
--- a/render/libxml_binding.c
+++ b/render/libxml_binding.c
@@ -1,312 +0,0 @@
-/*
- * Copyright 2007 James Bursa <bursa@users.sourceforge.net>
- * Copyright 2008 John-Mark Bell <jmb@netsurf-browser.org>
- *
- * This file is part of NetSurf, http://www.netsurf-browser.org/
- *
- * NetSurf is free software; you can redistribute it and/or modify
- * it under the terms of the GNU General Public License as published by
- * the Free Software Foundation; version 2 of the License.
- *
- * NetSurf is distributed in the hope that it will be useful,
- * but WITHOUT ANY WARRANTY; without even the implied warranty of
- * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
- * GNU General Public License for more details.
- *
- * You should have received a copy of the GNU General Public License
- * along with this program. If not, see <http://www.gnu.org/licenses/>.
- */
-
-#ifndef WITH_HUBBUB
-
-#include <stdbool.h>
-#include <string.h>
-
-#include <libxml/HTMLparser.h>
-#include <libxml/HTMLtree.h>
-#include <libxml/parser.h>
-#include <libxml/parserInternals.h>
-
-#include "render/parser_binding.h"
-
-#include "utils/log.h"
-#include "utils/talloc.h"
-
-/**
- * Per-document state for the libxml2 HTML push-parser binding.
- */
-struct libxml_ctx {
-	/** libxml2 HTML push-parser context. */
-	htmlParserCtxt *parser;
-
-	/** HTML parser encoding handler. */
-	xmlCharEncodingHandler *encoding_handler;
-
-	/** Name of the encoding currently in effect, or NULL if none is
-	 * known yet. The pointer is stored as given, never copied. */
-	const char *encoding;
-	/** Where the value of \ref encoding came from. */
-	binding_encoding_source encoding_source;
-
-	/** True while encoding detection should still be attempted on the
-	 * next chunk of data. */
-	bool getenc;
-};
-
-typedef struct libxml_ctx libxml_ctx;
-
-/* File-local helpers, defined after the public binding functions. */
-static const char *detect_encoding(const char **data, size_t *size);
-static bool set_parser_encoding(libxml_ctx *c, const char *encoding);
-
-/**
- * Create a parser context for building a document tree.
- *
- * \param arena    Not used by this binding
- * \param charset  Name of the encoding to start with, or NULL if none is
- *                 known. The pointer is stored in the context, not copied.
- * \param ctx      Receives the new context on success
- * \return BINDING_OK on success,
- *         BINDING_NOMEM if the context or the parser could not be created,
- *         BINDING_BADENCODING if the parser could not be switched to
- *         \a charset
- */
-binding_error binding_create_tree(void *arena, const char *charset, void **ctx)
-{
-	binding_error status;
-	libxml_ctx *tree = malloc(sizeof(*tree));
-
-	if (tree == NULL)
-		return BINDING_NOMEM;
-
-	tree->encoding_handler = NULL;
-	tree->encoding = charset;
-	tree->encoding_source = ENCODING_SOURCE_HEADER;
-	/* May be cleared again by set_parser_encoding() below */
-	tree->getenc = true;
-
-	tree->parser = htmlCreatePushParserCtxt(0, 0, "", 0, 0,
-			XML_CHAR_ENCODING_NONE);
-	if (tree->parser == NULL) {
-		status = BINDING_NOMEM;
-		goto fail;
-	}
-
-	if (charset != NULL && !set_parser_encoding(tree, charset)) {
-		/* Release any document first, then the parser itself */
-		if (tree->parser->myDoc != NULL)
-			xmlFreeDoc(tree->parser->myDoc);
-		htmlFreeParserCtxt(tree->parser);
-		status = BINDING_BADENCODING;
-		goto fail;
-	}
-
-	*ctx = (void *) tree;
-
-	return BINDING_OK;
-
-fail:
-	free(tree);
-	return status;
-}
-
-/**
- * Destroy a parser context created by binding_create_tree().
- *
- * Any document still attached to the parser is freed along with it; a
- * document previously taken with binding_get_document() is not affected,
- * because that call clears the parser's reference to it.
- *
- * \param ctx  Context to destroy, may be NULL
- * \return BINDING_OK always
- */
-binding_error binding_destroy_tree(void *ctx)
-{
-	libxml_ctx *c = (libxml_ctx *) ctx;
-
-	if (c == NULL)
-		return BINDING_OK;
-
-	/* Check the parser before looking inside it: the document pointer
-	 * lives in the parser context. */
-	if (c->parser != NULL) {
-		if (c->parser->myDoc != NULL)
-			xmlFreeDoc(c->parser->myDoc);
-
-		htmlFreeParserCtxt(c->parser);
-		c->parser = NULL;
-	}
-
-	c->encoding = NULL;
-
-	free(c);
-
-	return BINDING_OK;
-}
-
-/**
- * Feed a chunk of document data to the parser.
- *
- * On the first chunk for which detection is still pending, the data is
- * inspected for a BOM or a recognisable multi-byte pattern and the parser
- * is switched to the detected encoding.
- *
- * \param ctx   Parser context
- * \param data  Data to parse
- * \param len   Length of data, in bytes
- * \return BINDING_OK on success,
- *         BINDING_NOMEM if switching to a detected encoding failed,
- *         BINDING_ENCODINGCHANGE if no encoding was known beforehand and
- *         the parser picked one up from the document while parsing
- */
-binding_error binding_parse_chunk(void *ctx, const uint8_t *data, size_t len)
-{
-	libxml_ctx *c = (libxml_ctx *) ctx;
-
-	if (c->getenc) {
-		/* No encoding was specified in the Content-Type header.
-		 * Attempt to detect if the encoding is not 8-bit. If the
-		 * encoding is 8-bit, leave the parser unchanged, so that it
-		 * searches for a <meta http-equiv="content-type"
-		 * content="text/html; charset=...">. */
-		const char *encoding;
-		/* detect_encoding() advances a char pointer past any BOM;
-		 * use a correctly typed local rather than aliasing data. */
-		const char *cursor = (const char *) data;
-
-		encoding = detect_encoding(&cursor, &len);
-		data = (const uint8_t *) cursor;
-
-		if (encoding != NULL) {
-			if (!set_parser_encoding(c, encoding))
-				return BINDING_NOMEM;
-			c->encoding = encoding;
-			c->encoding_source = ENCODING_SOURCE_DETECTED;
-		}
-		c->getenc = false;
-
-		/* The data we received may have solely consisted of a BOM.
-		 * If so, it will have been stripped by detect_encoding.
-		 * Therefore, we'll have nothing to do in that case. */
-		if (len == 0)
-			return BINDING_OK;
-	}
-
-	htmlParseChunk(c->parser, (const char *) data, len, 0);
-	/** \todo error handling */
-
-	if (!c->encoding && c->parser->input->encoding) {
-		/* The encoding was not in headers or detected,
-		 * and the parser found a <meta http-equiv="content-type"
-		 * content="text/html; charset=...">. */
-
-		/* However, if that encoding is non-ASCII-compatible,
-		 * ignore it, as it can't possibly be correct */
-		if (strncasecmp((const char *) c->parser->input->encoding,
-				"UTF-16", 6) == 0 || /* UTF-16(LE|BE)? */
-		    strncasecmp((const char *) c->parser->input->encoding,
-				"UTF-32", 6) == 0) { /* UTF-32(LE|BE)? */
-			c->encoding = "ISO-8859-1";
-			c->encoding_source = ENCODING_SOURCE_DETECTED;
-		} else {
-			/* Points at the parser's own string, not a copy */
-			c->encoding = (const char *) c->parser->input->encoding;
-			c->encoding_source = ENCODING_SOURCE_META;
-		}
-
-		/* have the encoding; don't attempt to detect it */
-		c->getenc = false;
-
-		return BINDING_ENCODINGCHANGE;
-	}
-
-	return BINDING_OK;
-}
-
-/**
- * Tell the parser that no more document data will arrive.
- *
- * \param ctx  Parser context
- * \return BINDING_OK always
- */
-binding_error binding_parse_completed(void *ctx)
-{
-	libxml_ctx *tree = ctx;
-
-	/* An empty chunk with the terminate flag set ends the parse */
-	htmlParseChunk(tree->parser, "", 0, 1);
-	/** \todo error handling */
-
-	return BINDING_OK;
-}
-
-/**
- * Report the encoding recorded in a parser context.
- *
- * \param ctx     Parser context
- * \param source  Receives the origin of the encoding
- * \return the stored encoding name, which may be NULL
- */
-const char *binding_get_encoding(void *ctx, binding_encoding_source *source)
-{
-	const libxml_ctx *tree = ctx;
-
-	*source = tree->encoding_source;
-	return tree->encoding;
-}
-
-/**
- * Take the document out of a parser context.
- *
- * The parser's reference to the document is cleared, so a later
- * binding_destroy_tree() will not free the returned document.
- *
- * \param ctx  Parser context
- * \return the document held by the parser, which may be NULL
- */
-xmlDocPtr binding_get_document(void *ctx)
-{
-	htmlParserCtxt *parser = ((libxml_ctx *) ctx)->parser;
-	xmlDocPtr document = parser->myDoc;
-
-	parser->myDoc = NULL;
-
-	return document;
-}
-
-/******************************************************************************/
-
-/**
- * Set the HTML parser character encoding.
- *
- * \param c         context
- * \param encoding  name of encoding
- * \return true on success, and also when no handler is available for the
- *         encoding (that case is logged and the parser left unchanged);
- *         false if the parser could not be switched to the encoding or
- *         the encoding name could not be recorded on the input stream
- */
-static bool set_parser_encoding(libxml_ctx *c, const char *encoding)
-{
-	xmlError *error;
-
-	c->encoding_handler = xmlFindCharEncodingHandler(encoding);
-	if (!c->encoding_handler) {
-		/* either out of memory, or no handler available */
-		/* assume no handler available, which is not a fatal error */
-		LOG(("no encoding handler for \"%s\"", encoding));
-		/* \todo warn user and ask them to install iconv? */
-		return true;
-	}
-
-	xmlCtxtResetLastError(c->parser);
-	if (xmlSwitchToEncoding(c->parser, c->encoding_handler)) {
-		error = xmlCtxtGetLastError(c->parser);
-		LOG(("xmlSwitchToEncoding(): %s",
-				error ? error->message : "failed"));
-		return false;
-	}
-
-	/* Dirty hack to get around libxml oddness:
-	 * 1) When creating a push parser context, the input flow's encoding
-	 *    string is not set (whether an encoding is specified or not)
-	 * 2) When switching encoding (as above), the input flow's encoding
-	 *    string is never changed
-	 * 3) When handling a meta charset, the input flow's encoding string
-	 *    is checked to determine if an encoding has already been set.
-	 *    If it has been set, then the meta charset is ignored.
-	 *
-	 * The upshot of this is that, if we don't explicitly set the input
-	 * flow's encoding string here, any meta charset in the document
-	 * will override our setting, which is incorrect behaviour.
-	 *
-	 * Ideally, this would be fixed in libxml, but that requires rather
-	 * more knowledge than I currently have of what libxml is doing.
-	 */
-	if (!c->parser->input->encoding) {
-		c->parser->input->encoding =
-				xmlStrdup((const xmlChar *) encoding);
-		if (!c->parser->input->encoding) {
-			/* Without the string, a meta charset would still
-			 * override the encoding chosen here, so treat the
-			 * failed copy as an error rather than carrying on. */
-			LOG(("xmlStrdup(): failed to record encoding \"%s\"",
-					encoding));
-			return false;
-		}
-	}
-
-	/* Ensure no one else attempts to reset the encoding */
-	c->getenc = false;
-
-	return true;
-}
-
-/**
- * Attempt to detect the encoding of some HTML data.
- *
- * \param data  Pointer to HTML source data
- * \param size  Pointer to length of data
- * \return a constant string giving the encoding, or NULL if the encoding
- *         appears to be some 8-bit encoding or cannot be determined
- *
- * If a BOM is encountered, *data and *size will be modified to skip over it.
- *
- * The BOM-less patterns and the UTF-32 BOMs need four bytes of data. The
- * UTF-8 and UTF-16 BOMs are recognised in shorter data too, with one
- * exception: "ff fe" in fewer than four bytes may be the start of a
- * UTF-32LE BOM, so it is only reported as UTF-16LE when a third, non-zero
- * byte rules that out.
- */
-
-static const char *detect_encoding(const char **data, size_t *size)
-{
-	const unsigned char *d = (const unsigned char *) *data;
-	const size_t len = *size;
-	const char *name = NULL;
-	size_t bom_len = 0;
-
-	/* The four-byte tests assume the first two characters are <= 0xff */
-	if (len >= 4) {
-		if (d[0] == 0x00 && d[1] == 0x00 &&
-				d[2] == 0xfe && d[3] == 0xff) {
-			/* BOM 00 00 fe ff */
-			name = "UTF-32BE";
-			bom_len = 4;
-		} else if (d[0] == 0xff && d[1] == 0xfe &&
-				d[2] == 0x00 && d[3] == 0x00) {
-			/* BOM ff fe 00 00 */
-			name = "UTF-32LE";
-			bom_len = 4;
-		} else if (d[0] == 0x00 && d[1] != 0x00 &&
-				d[2] == 0x00 && d[3] != 0x00) {
-			/* 00 xx 00 xx */
-			return "UTF-16BE";
-		} else if (d[0] != 0x00 && d[1] == 0x00 &&
-				d[2] != 0x00 && d[3] == 0x00) {
-			/* xx 00 xx 00 */
-			return "UTF-16LE";
-		} else if (d[0] == 0x00 && d[1] == 0x00 &&
-				d[2] == 0x00 && d[3] != 0x00) {
-			/* 00 00 00 xx */
-			return "ISO-10646-UCS-4";
-		} else if (d[0] != 0x00 && d[1] == 0x00 &&
-				d[2] == 0x00 && d[3] == 0x00) {
-			/* xx 00 00 00 */
-			return "ISO-10646-UCS-4";
-		}
-	}
-
-	if (name == NULL && len >= 2) {
-		if (d[0] == 0xfe && d[1] == 0xff) {
-			/* BOM fe ff */
-			name = "UTF-16BE";
-			bom_len = 2;
-		} else if (d[0] == 0xff && d[1] == 0xfe &&
-				(len >= 4 || (len == 3 && d[2] != 0x00))) {
-			/* BOM ff fe, known not to be a UTF-32LE BOM */
-			name = "UTF-16LE";
-			bom_len = 2;
-		} else if (len >= 3 && d[0] == 0xef && d[1] == 0xbb &&
-				d[2] == 0xbf) {
-			/* BOM ef bb bf */
-			name = "UTF-8";
-			bom_len = 3;
-		}
-	}
-
-	if (name != NULL) {
-		*data += bom_len;
-		*size -= bom_len;
-	}
-
-	return name;
-}
-
-#endif
-