From 472d9ed91f2e43642edd7ae92269b76a293e004e Mon Sep 17 00:00:00 2001 From: John Mark Bell Date: Fri, 23 Jan 2009 11:12:56 +0000 Subject: Convert UTF-16 meta charsets to UTF-8, given we'll have auto-detected them as UTF-8 in the chardet stage. (Credit: Andi Sidwell) svn path=/trunk/hubbub/; revision=6180 --- src/treebuilder/in_head.c | 57 ++++++++++++++++++++++++++--------------------- 1 file changed, 31 insertions(+), 26 deletions(-) diff --git a/src/treebuilder/in_head.c b/src/treebuilder/in_head.c index 765b7d0..b042e52 100644 --- a/src/treebuilder/in_head.c +++ b/src/treebuilder/in_head.c @@ -29,6 +29,7 @@ static hubbub_error process_meta_in_head(hubbub_treebuilder *treebuilder, const hubbub_token *token) { + static uint16_t utf16, utf16be, utf16le; uint16_t charset_enc = 0; uint16_t content_type_enc = 0; size_t i; @@ -37,10 +38,19 @@ static hubbub_error process_meta_in_head(hubbub_treebuilder *treebuilder, /** \todo ack sc flag */ -#if 0 - if (confidence == certain) + if (treebuilder->tree_handler->encoding_change == NULL) return HUBBUB_OK; -#endif + + /* Grab UTF-16 MIBenums */ + if (utf16 == 0) { + utf16 = parserutils_charset_mibenum_from_name( + "utf-16", SLEN("utf-16")); + utf16be = parserutils_charset_mibenum_from_name( + "utf-16be", SLEN("utf-16be")); + utf16le = parserutils_charset_mibenum_from_name( + "utf-16le", SLEN("utf-16le")); + assert(utf16 != 0 && utf16be != 0 && utf16le != 0); + } for (i = 0; i < token->data.tag.n_attributes; i++) { hubbub_attribute *attr = &token->data.tag.attributes[i]; @@ -61,33 +71,28 @@ static hubbub_error process_meta_in_head(hubbub_treebuilder *treebuilder, } } + /* Fall back, if necessary */ + if (charset_enc == 0 && content_type_enc != 0) + charset_enc = content_type_enc; + if (charset_enc != 0) { - hubbub_charset_fix_charset(&charset_enc); + const char *name; - if (treebuilder->tree_handler->encoding_change) { - const char *name = parserutils_charset_mibenum_to_name( - charset_enc); + hubbub_charset_fix_charset(&charset_enc); - /* 1 indicates the encoding should actually change */ - if (treebuilder->tree_handler->encoding_change( - treebuilder->tree_handler->ctx, - name) == 1) { - return HUBBUB_ENCODINGCHANGE; - } + /* Change UTF-16 to UTF-8 */ + if (charset_enc == utf16le || charset_enc == utf16be || + charset_enc == utf16) { + charset_enc = parserutils_charset_mibenum_from_name( + "UTF-8", SLEN("UTF-8")); } - } else if (content_type_enc != 0) { - hubbub_charset_fix_charset(&content_type_enc); - - if (treebuilder->tree_handler->encoding_change) { - const char *name = parserutils_charset_mibenum_to_name( - content_type_enc); - - /* 1 indicates the encoding should actually change */ - if (treebuilder->tree_handler->encoding_change( - treebuilder->tree_handler->ctx, - name) == 1) { - return HUBBUB_ENCODINGCHANGE; - } + + name = parserutils_charset_mibenum_to_name(charset_enc); + + /* 1 indicates the encoding should actually change */ + if (treebuilder->tree_handler->encoding_change( + treebuilder->tree_handler->ctx, name) == 1) { + return HUBBUB_ENCODINGCHANGE; } } -- cgit v1.2.1