From 6df8f99a707326655b4f285920f19fef6d9eb90a Mon Sep 17 00:00:00 2001 From: John Mark Bell Date: Tue, 14 Oct 2008 15:44:05 +0000 Subject: Fixup dubious charsets svn path=/trunk/hubbub/; revision=5575 --- include/hubbub/parser.h | 3 ++- src/charset/detect.c | 1 - src/charset/detect.h | 3 +++ src/parser.c | 21 +++++++++++++++++++-- src/treebuilder/in_head.c | 4 ++++ test/parser.c | 2 +- test/tree-buf.c | 2 +- test/tree.c | 2 +- test/tree2.c | 2 +- 9 files changed, 32 insertions(+), 8 deletions(-) diff --git a/include/hubbub/parser.h b/include/hubbub/parser.h index 68a9d27..2e34b5e 100644 --- a/include/hubbub/parser.h +++ b/include/hubbub/parser.h @@ -8,6 +8,7 @@ #ifndef hubbub_parser_h_ #define hubbub_parser_h_ +#include #include #include @@ -55,7 +56,7 @@ typedef union hubbub_parser_optparams { } hubbub_parser_optparams; /* Create a hubbub parser */ -hubbub_parser *hubbub_parser_create(const char *enc, +hubbub_parser *hubbub_parser_create(const char *enc, bool fix_enc, hubbub_alloc alloc, void *pw); /* Destroy a hubbub parser */ void hubbub_parser_destroy(hubbub_parser *parser); diff --git a/src/charset/detect.c b/src/charset/detect.c index 755d9fd..562c12d 100644 --- a/src/charset/detect.c +++ b/src/charset/detect.c @@ -25,7 +25,6 @@ static bool hubbub_charset_get_attribute(const uint8_t **data, const uint8_t *end, const uint8_t **name, uint32_t *namelen, const uint8_t **value, uint32_t *valuelen); -static void hubbub_charset_fix_charset(uint16_t *charset); /** * Extract a charset from a chunk of data diff --git a/src/charset/detect.h b/src/charset/detect.h index cb837d0..ec97267 100644 --- a/src/charset/detect.h +++ b/src/charset/detect.h @@ -20,5 +20,8 @@ parserutils_error hubbub_charset_extract(const uint8_t *data, size_t len, uint16_t hubbub_charset_parse_content(const uint8_t *value, uint32_t valuelen); +/* Fix up frequently misused character sets */ +void hubbub_charset_fix_charset(uint16_t *charset); + #endif diff --git a/src/parser.c b/src/parser.c index 
e43a309..075a0e2 100644 --- a/src/parser.c +++ b/src/parser.c @@ -5,6 +5,9 @@ * Copyright 2007-8 John-Mark Bell */ +#include + +#include #include #include @@ -29,11 +32,12 @@ struct hubbub_parser { * Create a hubbub parser * * \param enc Source document encoding, or NULL to autodetect + * \param fix_enc Permit fixing up of encoding if it's frequently misused * \param alloc Memory (de)allocation function * \param pw Pointer to client-specific private data (may be NULL) * \return Pointer to parser instance, or NULL on error */ -hubbub_parser *hubbub_parser_create(const char *enc, +hubbub_parser *hubbub_parser_create(const char *enc, bool fix_enc, hubbub_alloc alloc, void *pw) { hubbub_parser *parser; @@ -45,6 +49,19 @@ hubbub_parser *hubbub_parser_create(const char *enc, if (parser == NULL) return NULL; + /* If we have an encoding and we're permitted to fix up likely broken + * ones, then attempt to do so. */ + if (enc != NULL && fix_enc == true) { + uint16_t mibenum = parserutils_charset_mibenum_from_name(enc, + strlen(enc)); + + if (mibenum != 0) { + hubbub_charset_fix_charset(&mibenum); + + enc = parserutils_charset_mibenum_to_name(mibenum); + } + } + parser->stream = parserutils_inputstream_create(enc, enc != NULL ? 
HUBBUB_CHARSET_CONFIDENT : HUBBUB_CHARSET_UNKNOWN, hubbub_charset_extract, alloc, pw); @@ -201,7 +218,7 @@ hubbub_error hubbub_parser_parse_chunk(hubbub_parser *parser, * Pass a chunk of extraneous data to a hubbub parser for parsing * * \param parser Parser instance to use - * \param data Data to parse (encoded in internal charset) + * \param data Data to parse (encoded in UTF-8) * \param len Length, in byte, of data * \return HUBBUB_OK on success, appropriate error otherwise */ diff --git a/src/treebuilder/in_head.c b/src/treebuilder/in_head.c index e6cba81..ba4daf5 100644 --- a/src/treebuilder/in_head.c +++ b/src/treebuilder/in_head.c @@ -61,6 +61,8 @@ static hubbub_error process_meta_in_head(hubbub_treebuilder *treebuilder, } if (charset_enc != 0) { + hubbub_charset_fix_charset(&charset_enc); + if (treebuilder->tree_handler->encoding_change) { const char *name = parserutils_charset_mibenum_to_name( charset_enc); @@ -73,6 +75,8 @@ static hubbub_error process_meta_in_head(hubbub_treebuilder *treebuilder, } } } else if (content_type_enc != 0) { + hubbub_charset_fix_charset(&content_type_enc); + if (treebuilder->tree_handler->encoding_change) { const char *name = parserutils_charset_mibenum_to_name( content_type_enc); diff --git a/test/parser.c b/test/parser.c index 31cf541..6585bc7 100644 --- a/test/parser.c +++ b/test/parser.c @@ -32,7 +32,7 @@ static int run_test(int argc, char **argv, unsigned int CHUNK_SIZE) /* Initialise library */ assert(hubbub_initialise(argv[1], myrealloc, NULL) == HUBBUB_OK); - parser = hubbub_parser_create("UTF-8", myrealloc, NULL); + parser = hubbub_parser_create("UTF-8", false, myrealloc, NULL); assert(parser != NULL); params.token_handler.handler = token_handler; diff --git a/test/tree-buf.c b/test/tree-buf.c index cc7e345..362f80a 100644 --- a/test/tree-buf.c +++ b/test/tree-buf.c @@ -145,7 +145,7 @@ static hubbub_parser *setup_parser(void) hubbub_parser *parser; hubbub_parser_optparams params; - parser = 
hubbub_parser_create("UTF-8", myrealloc, NULL); + parser = hubbub_parser_create("UTF-8", false, myrealloc, NULL); assert(parser != NULL); params.tree_handler = &tree_handler; diff --git a/test/tree.c b/test/tree.c index 96762c9..74da1b0 100644 --- a/test/tree.c +++ b/test/tree.c @@ -95,7 +95,7 @@ static int run_test(int argc, char **argv, unsigned int CHUNK_SIZE) } node_ref_alloc = NODE_REF_CHUNK; - parser = hubbub_parser_create("UTF-8", myrealloc, NULL); + parser = hubbub_parser_create("UTF-8", false, myrealloc, NULL); assert(parser != NULL); params.tree_handler = &tree_handler; diff --git a/test/tree2.c b/test/tree2.c index 3e1e79e..3a9b5ad 100644 --- a/test/tree2.c +++ b/test/tree2.c @@ -145,7 +145,7 @@ static hubbub_parser *setup_parser(void) hubbub_parser *parser; hubbub_parser_optparams params; - parser = hubbub_parser_create("UTF-8", myrealloc, NULL); + parser = hubbub_parser_create("UTF-8", false, myrealloc, NULL); assert(parser != NULL); params.tree_handler = &tree_handler; -- cgit v1.2.3