summaryrefslogtreecommitdiff
diff options
context:
space:
mode:
-rw-r--r-- include/hubbub/parser.h 3
-rw-r--r-- src/charset/detect.c 1
-rw-r--r-- src/charset/detect.h 3
-rw-r--r-- src/parser.c 21
-rw-r--r-- src/treebuilder/in_head.c 4
-rw-r--r-- test/parser.c 2
-rw-r--r-- test/tree-buf.c 2
-rw-r--r-- test/tree.c 2
-rw-r--r-- test/tree2.c 2
9 files changed, 32 insertions, 8 deletions
diff --git a/include/hubbub/parser.h b/include/hubbub/parser.h
index 68a9d27..2e34b5e 100644
--- a/include/hubbub/parser.h
+++ b/include/hubbub/parser.h
@@ -8,6 +8,7 @@
#ifndef hubbub_parser_h_
#define hubbub_parser_h_
+#include <stdbool.h>
#include <inttypes.h>
#include <hubbub/errors.h>
@@ -55,7 +56,7 @@ typedef union hubbub_parser_optparams {
} hubbub_parser_optparams;
/* Create a hubbub parser */
-hubbub_parser *hubbub_parser_create(const char *enc,
+hubbub_parser *hubbub_parser_create(const char *enc, bool fix_enc,
hubbub_alloc alloc, void *pw);
/* Destroy a hubbub parser */
void hubbub_parser_destroy(hubbub_parser *parser);
diff --git a/src/charset/detect.c b/src/charset/detect.c
index 755d9fd..562c12d 100644
--- a/src/charset/detect.c
+++ b/src/charset/detect.c
@@ -25,7 +25,6 @@ static bool hubbub_charset_get_attribute(const uint8_t **data,
const uint8_t *end,
const uint8_t **name, uint32_t *namelen,
const uint8_t **value, uint32_t *valuelen);
-static void hubbub_charset_fix_charset(uint16_t *charset);
/**
* Extract a charset from a chunk of data
diff --git a/src/charset/detect.h b/src/charset/detect.h
index cb837d0..ec97267 100644
--- a/src/charset/detect.h
+++ b/src/charset/detect.h
@@ -20,5 +20,8 @@ parserutils_error hubbub_charset_extract(const uint8_t *data, size_t len,
uint16_t hubbub_charset_parse_content(const uint8_t *value,
uint32_t valuelen);
+/* Fix up frequently misused character sets */
+void hubbub_charset_fix_charset(uint16_t *charset);
+
#endif
diff --git a/src/parser.c b/src/parser.c
index e43a309..075a0e2 100644
--- a/src/parser.c
+++ b/src/parser.c
@@ -5,6 +5,9 @@
* Copyright 2007-8 John-Mark Bell <jmb@netsurf-browser.org>
*/
+#include <string.h>
+
+#include <parserutils/charset/mibenum.h>
#include <parserutils/input/inputstream.h>
#include <hubbub/parser.h>
@@ -29,11 +32,12 @@ struct hubbub_parser {
* Create a hubbub parser
*
* \param enc Source document encoding, or NULL to autodetect
+ * \param fix_enc Permit fixing up of encoding if it's frequently misused
* \param alloc Memory (de)allocation function
* \param pw Pointer to client-specific private data (may be NULL)
* \return Pointer to parser instance, or NULL on error
*/
-hubbub_parser *hubbub_parser_create(const char *enc,
+hubbub_parser *hubbub_parser_create(const char *enc, bool fix_enc,
hubbub_alloc alloc, void *pw)
{
hubbub_parser *parser;
@@ -45,6 +49,19 @@ hubbub_parser *hubbub_parser_create(const char *enc,
if (parser == NULL)
return NULL;
+ /* If we have an encoding and we're permitted to fix up likely broken
+ * ones, then attempt to do so. */
+ if (enc != NULL && fix_enc == true) {
+ uint16_t mibenum = parserutils_charset_mibenum_from_name(enc,
+ strlen(enc));
+
+ if (mibenum != 0) {
+ hubbub_charset_fix_charset(&mibenum);
+
+ enc = parserutils_charset_mibenum_to_name(mibenum);
+ }
+ }
+
parser->stream = parserutils_inputstream_create(enc,
enc != NULL ? HUBBUB_CHARSET_CONFIDENT : HUBBUB_CHARSET_UNKNOWN,
hubbub_charset_extract, alloc, pw);
@@ -201,7 +218,7 @@ hubbub_error hubbub_parser_parse_chunk(hubbub_parser *parser,
* Pass a chunk of extraneous data to a hubbub parser for parsing
*
* \param parser Parser instance to use
- * \param data Data to parse (encoded in internal charset)
+ * \param data Data to parse (encoded in UTF-8)
* \param len Length, in byte, of data
* \return HUBBUB_OK on success, appropriate error otherwise
*/
diff --git a/src/treebuilder/in_head.c b/src/treebuilder/in_head.c
index e6cba81..ba4daf5 100644
--- a/src/treebuilder/in_head.c
+++ b/src/treebuilder/in_head.c
@@ -61,6 +61,8 @@ static hubbub_error process_meta_in_head(hubbub_treebuilder *treebuilder,
}
if (charset_enc != 0) {
+ hubbub_charset_fix_charset(&charset_enc);
+
if (treebuilder->tree_handler->encoding_change) {
const char *name = parserutils_charset_mibenum_to_name(
charset_enc);
@@ -73,6 +75,8 @@ static hubbub_error process_meta_in_head(hubbub_treebuilder *treebuilder,
}
}
} else if (content_type_enc != 0) {
+ hubbub_charset_fix_charset(&content_type_enc);
+
if (treebuilder->tree_handler->encoding_change) {
const char *name = parserutils_charset_mibenum_to_name(
content_type_enc);
diff --git a/test/parser.c b/test/parser.c
index 31cf541..6585bc7 100644
--- a/test/parser.c
+++ b/test/parser.c
@@ -32,7 +32,7 @@ static int run_test(int argc, char **argv, unsigned int CHUNK_SIZE)
/* Initialise library */
assert(hubbub_initialise(argv[1], myrealloc, NULL) == HUBBUB_OK);
- parser = hubbub_parser_create("UTF-8", myrealloc, NULL);
+ parser = hubbub_parser_create("UTF-8", false, myrealloc, NULL);
assert(parser != NULL);
params.token_handler.handler = token_handler;
diff --git a/test/tree-buf.c b/test/tree-buf.c
index cc7e345..362f80a 100644
--- a/test/tree-buf.c
+++ b/test/tree-buf.c
@@ -145,7 +145,7 @@ static hubbub_parser *setup_parser(void)
hubbub_parser *parser;
hubbub_parser_optparams params;
- parser = hubbub_parser_create("UTF-8", myrealloc, NULL);
+ parser = hubbub_parser_create("UTF-8", false, myrealloc, NULL);
assert(parser != NULL);
params.tree_handler = &tree_handler;
diff --git a/test/tree.c b/test/tree.c
index 96762c9..74da1b0 100644
--- a/test/tree.c
+++ b/test/tree.c
@@ -95,7 +95,7 @@ static int run_test(int argc, char **argv, unsigned int CHUNK_SIZE)
}
node_ref_alloc = NODE_REF_CHUNK;
- parser = hubbub_parser_create("UTF-8", myrealloc, NULL);
+ parser = hubbub_parser_create("UTF-8", false, myrealloc, NULL);
assert(parser != NULL);
params.tree_handler = &tree_handler;
diff --git a/test/tree2.c b/test/tree2.c
index 3e1e79e..3a9b5ad 100644
--- a/test/tree2.c
+++ b/test/tree2.c
@@ -145,7 +145,7 @@ static hubbub_parser *setup_parser(void)
hubbub_parser *parser;
hubbub_parser_optparams params;
- parser = hubbub_parser_create("UTF-8", myrealloc, NULL);
+ parser = hubbub_parser_create("UTF-8", false, myrealloc, NULL);
assert(parser != NULL);
params.tree_handler = &tree_handler;