diff options
author | John Mark Bell <jmb@netsurf-browser.org> | 2008-10-14 15:44:05 +0000 |
---|---|---|
committer | John Mark Bell <jmb@netsurf-browser.org> | 2008-10-14 15:44:05 +0000 |
commit | 6df8f99a707326655b4f285920f19fef6d9eb90a (patch) | |
tree | 893e8d1ff525027eb482957c259d8885c3436ae2 /src | |
parent | 58837fe7fb2196d39f09425329087b6b48aace46 (diff) | |
download | libhubbub-6df8f99a707326655b4f285920f19fef6d9eb90a.tar.gz libhubbub-6df8f99a707326655b4f285920f19fef6d9eb90a.tar.bz2 |
Fixup dubious charsets
svn path=/trunk/hubbub/; revision=5575
Diffstat (limited to 'src')
-rw-r--r-- | src/charset/detect.c | 1 | ||||
-rw-r--r-- | src/charset/detect.h | 3 | ||||
-rw-r--r-- | src/parser.c | 21 | ||||
-rw-r--r-- | src/treebuilder/in_head.c | 4 |
4 files changed, 26 insertions, 3 deletions
diff --git a/src/charset/detect.c b/src/charset/detect.c index 755d9fd..562c12d 100644 --- a/src/charset/detect.c +++ b/src/charset/detect.c @@ -25,7 +25,6 @@ static bool hubbub_charset_get_attribute(const uint8_t **data, const uint8_t *end, const uint8_t **name, uint32_t *namelen, const uint8_t **value, uint32_t *valuelen); -static void hubbub_charset_fix_charset(uint16_t *charset); /** * Extract a charset from a chunk of data diff --git a/src/charset/detect.h b/src/charset/detect.h index cb837d0..ec97267 100644 --- a/src/charset/detect.h +++ b/src/charset/detect.h @@ -20,5 +20,8 @@ parserutils_error hubbub_charset_extract(const uint8_t *data, size_t len, uint16_t hubbub_charset_parse_content(const uint8_t *value, uint32_t valuelen); +/* Fix up frequently misused character sets */ +void hubbub_charset_fix_charset(uint16_t *charset); + #endif diff --git a/src/parser.c b/src/parser.c index e43a309..075a0e2 100644 --- a/src/parser.c +++ b/src/parser.c @@ -5,6 +5,9 @@ * Copyright 2007-8 John-Mark Bell <jmb@netsurf-browser.org> */ +#include <string.h> + +#include <parserutils/charset/mibenum.h> #include <parserutils/input/inputstream.h> #include <hubbub/parser.h> @@ -29,11 +32,12 @@ struct hubbub_parser { * Create a hubbub parser * * \param enc Source document encoding, or NULL to autodetect + * \param fix_enc Permit fixing up of encoding if it's frequently misused * \param alloc Memory (de)allocation function * \param pw Pointer to client-specific private data (may be NULL) * \return Pointer to parser instance, or NULL on error */ -hubbub_parser *hubbub_parser_create(const char *enc, +hubbub_parser *hubbub_parser_create(const char *enc, bool fix_enc, hubbub_alloc alloc, void *pw) { hubbub_parser *parser; @@ -45,6 +49,19 @@ hubbub_parser *hubbub_parser_create(const char *enc, if (parser == NULL) return NULL; + /* If we have an encoding and we're permitted to fix up likely broken + * ones, then attempt to do so. 
*/ + if (enc != NULL && fix_enc == true) { + uint16_t mibenum = parserutils_charset_mibenum_from_name(enc, + strlen(enc)); + + if (mibenum != 0) { + hubbub_charset_fix_charset(&mibenum); + + enc = parserutils_charset_mibenum_to_name(mibenum); + } + } + parser->stream = parserutils_inputstream_create(enc, enc != NULL ? HUBBUB_CHARSET_CONFIDENT : HUBBUB_CHARSET_UNKNOWN, hubbub_charset_extract, alloc, pw); @@ -201,7 +218,7 @@ hubbub_error hubbub_parser_parse_chunk(hubbub_parser *parser, * Pass a chunk of extraneous data to a hubbub parser for parsing * * \param parser Parser instance to use - * \param data Data to parse (encoded in internal charset) + * \param data Data to parse (encoded in UTF-8) * \param len Length, in byte, of data * \return HUBBUB_OK on success, appropriate error otherwise */ diff --git a/src/treebuilder/in_head.c b/src/treebuilder/in_head.c index e6cba81..ba4daf5 100644 --- a/src/treebuilder/in_head.c +++ b/src/treebuilder/in_head.c @@ -61,6 +61,8 @@ static hubbub_error process_meta_in_head(hubbub_treebuilder *treebuilder, } if (charset_enc != 0) { + hubbub_charset_fix_charset(&charset_enc); + if (treebuilder->tree_handler->encoding_change) { const char *name = parserutils_charset_mibenum_to_name( charset_enc); @@ -73,6 +75,8 @@ static hubbub_error process_meta_in_head(hubbub_treebuilder *treebuilder, } } } else if (content_type_enc != 0) { + hubbub_charset_fix_charset(&content_type_enc); + if (treebuilder->tree_handler->encoding_change) { const char *name = parserutils_charset_mibenum_to_name( content_type_enc); |