diff options
Diffstat (limited to 'render/libxml_binding.c')
-rw-r--r-- | render/libxml_binding.c | 312 |
1 files changed, 0 insertions, 312 deletions
diff --git a/render/libxml_binding.c b/render/libxml_binding.c index fdff19b70..e69de29bb 100644 --- a/render/libxml_binding.c +++ b/render/libxml_binding.c @@ -1,312 +0,0 @@ -/* - * Copyright 2007 James Bursa <bursa@users.sourceforge.net> - * Copyright 2008 John-Mark Bell <jmb@netsurf-browser.org> - * - * This file is part of NetSurf, http://www.netsurf-browser.org/ - * - * NetSurf is free software; you can redistribute it and/or modify - * it under the terms of the GNU General Public License as published by - * the Free Software Foundation; version 2 of the License. - * - * NetSurf is distributed in the hope that it will be useful, - * but WITHOUT ANY WARRANTY; without even the implied warranty of - * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the - * GNU General Public License for more details. - * - * You should have received a copy of the GNU General Public License - * along with this program. If not, see <http://www.gnu.org/licenses/>. - */ - -#ifndef WITH_HUBBUB - -#include <stdbool.h> -#include <string.h> - -#include <libxml/HTMLparser.h> -#include <libxml/HTMLtree.h> -#include <libxml/parser.h> -#include <libxml/parserInternals.h> - -#include "render/parser_binding.h" - -#include "utils/log.h" -#include "utils/talloc.h" - -typedef struct libxml_ctx { - htmlParserCtxt *parser; - - /** HTML parser encoding handler. */ - xmlCharEncodingHandler *encoding_handler; - - const char *encoding; - binding_encoding_source encoding_source; - - bool getenc; -} libxml_ctx; - -static bool set_parser_encoding(libxml_ctx *c, const char *encoding); -static const char *detect_encoding(const char **data, size_t *size); - -binding_error binding_create_tree(void *arena, const char *charset, void **ctx) -{ - libxml_ctx *c; - - c = malloc(sizeof(libxml_ctx)); - if (c == NULL) - return BINDING_NOMEM; - - c->parser = NULL; - c->encoding_handler = NULL; - c->encoding = charset; - c->encoding_source = ENCODING_SOURCE_HEADER; - c->getenc = true; - - c->parser = htmlCreatePushParserCtxt(0, 0, "", 0, 0, - XML_CHAR_ENCODING_NONE); - if (c->parser == NULL) { - free(c); - return BINDING_NOMEM; - } - - if (c->encoding != NULL && !set_parser_encoding(c, charset)) { - if (c->parser->myDoc != NULL) - xmlFreeDoc(c->parser->myDoc); - htmlFreeParserCtxt(c->parser); - free(c); - return BINDING_BADENCODING; - } - - *ctx = (void *) c; - - return BINDING_OK; -} - -binding_error binding_destroy_tree(void *ctx) -{ - libxml_ctx *c = (libxml_ctx *) ctx; - - if (ctx == NULL) - return BINDING_OK; - - if (c->parser->myDoc != NULL) - xmlFreeDoc(c->parser->myDoc); - - if (c->parser != NULL) - htmlFreeParserCtxt(c->parser); - - c->parser = NULL; - c->encoding = NULL; - - free(c); - - return BINDING_OK; -} - -binding_error binding_parse_chunk(void *ctx, const uint8_t *data, size_t len) -{ - libxml_ctx *c = (libxml_ctx *) ctx; - - if (c->getenc) { - /* No encoding was specified in the Content-Type header. - * Attempt to detect if the encoding is not 8-bit. If the - * encoding is 8-bit, leave the parser unchanged, so that it - * searches for a <meta http-equiv="content-type" - * content="text/html; charset=...">. */ - const char *encoding; - encoding = detect_encoding((const char **) (void *) &data, - &len); - if (encoding) { - if (!set_parser_encoding(c, encoding)) - return BINDING_NOMEM; - c->encoding = encoding; - c->encoding_source = ENCODING_SOURCE_DETECTED; - } - c->getenc = false; - - /* The data we received may have solely consisted of a BOM. - * If so, it will have been stripped by html_detect_encoding. - * Therefore, we'll have nothing to do in that case. */ - if (len == 0) - return BINDING_OK; - } - - htmlParseChunk(c->parser, (const char *) data, len, 0); - /** \todo error handling */ - - if (!c->encoding && c->parser->input->encoding) { - /* The encoding was not in headers or detected, - * and the parser found a <meta http-equiv="content-type" - * content="text/html; charset=...">. */ - - /* However, if that encoding is non-ASCII-compatible, - * ignore it, as it can't possibly be correct */ - if (strncasecmp((const char *) c->parser->input->encoding, - "UTF-16", 6) == 0 || /* UTF-16(LE|BE)? */ - strncasecmp((const char *) c->parser->input->encoding, - "UTF-32", 6) == 0) { /* UTF-32(LE|BE)? */ - c->encoding = "ISO-8859-1"; - c->encoding_source = ENCODING_SOURCE_DETECTED; - } else { - c->encoding = (const char *) c->parser->input->encoding; - c->encoding_source = ENCODING_SOURCE_META; - } - - if (!c->encoding) - return BINDING_NOMEM; - - /* have the encoding; don't attempt to detect it */ - c->getenc = false; - - return BINDING_ENCODINGCHANGE; - } - - return BINDING_OK; -} - -binding_error binding_parse_completed(void *ctx) -{ - libxml_ctx *c = (libxml_ctx *) ctx; - - htmlParseChunk(c->parser, "", 0, 1); - /** \todo error handling */ - - return BINDING_OK; -} - -const char *binding_get_encoding(void *ctx, binding_encoding_source *source) -{ - libxml_ctx *c = (libxml_ctx *) ctx; - - *source = c->encoding_source; - - return c->encoding; -} - -xmlDocPtr binding_get_document(void *ctx) -{ - libxml_ctx *c = (libxml_ctx *) ctx; - xmlDocPtr doc = c->parser->myDoc; - - c->parser->myDoc = NULL; - - return doc; -} - -/******************************************************************************/ - -/** - * Set the HTML parser character encoding. - * - * \param c context - * \param encoding name of encoding - * \return true on success, false on error and error reported - */ -bool set_parser_encoding(libxml_ctx *c, const char *encoding) -{ - xmlError *error; - - c->encoding_handler = xmlFindCharEncodingHandler(encoding); - if (!c->encoding_handler) { - /* either out of memory, or no handler available */ - /* assume no handler available, which is not a fatal error */ - LOG(("no encoding handler for \"%s\"", encoding)); - /* \todo warn user and ask them to install iconv? */ - return true; - } - - xmlCtxtResetLastError(c->parser); - if (xmlSwitchToEncoding(c->parser, c->encoding_handler)) { - error = xmlCtxtGetLastError(c->parser); - LOG(("xmlSwitchToEncoding(): %s", - error ? error->message : "failed")); - return false; - } - - /* Dirty hack to get around libxml oddness: - * 1) When creating a push parser context, the input flow's encoding - * string is not set (whether an encoding is specified or not) - * 2) When switching encoding (as above), the input flow's encoding - * string is never changed - * 3) When handling a meta charset, the input flow's encoding string - * is checked to determine if an encoding has already been set. - * If it has been set, then the meta charset is ignored. - * - * The upshot of this is that, if we don't explicitly set the input - * flow's encoding string here, any meta charset in the document - * will override our setting, which is incorrect behaviour. - * - * Ideally, this would be fixed in libxml, but that requires rather - * more knowledge than I currently have of what libxml is doing. - */ - if (!c->parser->input->encoding) - c->parser->input->encoding = - xmlStrdup((const xmlChar *) encoding); - - /* Ensure noone else attempts to reset the encoding */ - c->getenc = false; - - return true; -} - -/** - * Attempt to detect the encoding of some HTML data. - * - * \param data Pointer to HTML source data - * \param size Pointer to length of data - * \return a constant string giving the encoding, or 0 if the encoding - * appears to be some 8-bit encoding - * - * If a BOM is encountered, *data and *size will be modified to skip over it - */ - -const char *detect_encoding(const char **data, size_t *size) -{ - const unsigned char *d = (const unsigned char *) *data; - - /* this detection assumes that the first two characters are <= 0xff */ - if (*size < 4) - return 0; - - if (d[0] == 0x00 && d[1] == 0x00 && - d[2] == 0xfe && d[3] == 0xff) { /* BOM 00 00 fe ff */ - *data += 4; - *size -= 4; - return "UTF-32BE"; - } else if (d[0] == 0xff && d[1] == 0xfe && - d[2] == 0x00 && d[3] == 0x00) { /* BOM ff fe 00 00 */ - *data += 4; - *size -= 4; - return "UTF-32LE"; - } - else if (d[0] == 0x00 && d[1] != 0x00 && - d[2] == 0x00 && d[3] != 0x00) /* 00 xx 00 xx */ - return "UTF-16BE"; - else if (d[0] != 0x00 && d[1] == 0x00 && - d[2] != 0x00 && d[3] == 0x00) /* xx 00 xx 00 */ - return "UTF-16LE"; - else if (d[0] == 0x00 && d[1] == 0x00 && - d[2] == 0x00 && d[3] != 0x00) /* 00 00 00 xx */ - return "ISO-10646-UCS-4"; - else if (d[0] != 0x00 && d[1] == 0x00 && - d[2] == 0x00 && d[3] == 0x00) /* xx 00 00 00 */ - return "ISO-10646-UCS-4"; - else if (d[0] == 0xfe && d[1] == 0xff) { /* BOM fe ff */ - *data += 2; - *size -= 2; - return "UTF-16BE"; - } else if (d[0] == 0xff && d[1] == 0xfe) { /* BOM ff fe */ - *data += 2; - *size -= 2; - return "UTF-16LE"; - } else if (d[0] == 0xef && d[1] == 0xbb && - d[2] == 0xbf) { /* BOM ef bb bf */ - *data += 3; - *size -= 3; - return "UTF-8"; - } - - return 0; -} - -#endif - |