summaryrefslogtreecommitdiff
diff options
context:
space:
mode:
authorJohn Mark Bell <jmb@netsurf-browser.org>2007-05-29 18:03:07 +0000
committerJohn Mark Bell <jmb@netsurf-browser.org>2007-05-29 18:03:07 +0000
commit01316168fc813f22ea15eda250f4d925c1fa9061 (patch)
treee7edea85f9cf75a50c453691e46c3bc0d7d3b879
parent15be8d1ade8769e9670009701dd1ce5c8b36c54b (diff)
downloadnetsurf-01316168fc813f22ea15eda250f4d925c1fa9061.tar.gz
netsurf-01316168fc813f22ea15eda250f4d925c1fa9061.tar.bz2
Fix bugs in charset detection.
Strip BOM from parser input, as it confuses libxml. Ignore non-ASCII-compatible charsets declared in meta tag (the parser defaults to 8 bit, so if it's managed to extract a meta charset, then it must be ASCII-compatible, so a non-ASCII-compatible meta charset is lies). Fixes WightLink timetable and 1726341. svn path=/trunk/netsurf/; revision=3304
-rw-r--r--render/html.c86
1 files changed, 66 insertions, 20 deletions
diff --git a/render/html.c b/render/html.c
index 0cf080381..9fe76455a 100644
--- a/render/html.c
+++ b/render/html.c
@@ -38,7 +38,7 @@
static bool html_set_parser_encoding(struct content *c, const char *encoding);
-static const char *html_detect_encoding(const char *data, unsigned int size);
+static const char *html_detect_encoding(const char **data, unsigned int *size);
static void html_convert_css_callback(content_msg msg, struct content *css,
intptr_t p1, intptr_t p2, union content_msg_data data);
static bool html_meta_refresh(struct content *c, xmlNode *head);
@@ -157,7 +157,7 @@ bool html_process_data(struct content *c, char *data, unsigned int size)
* searches for a <meta http-equiv="content-type"
* content="text/html; charset=...">. */
const char *encoding;
- encoding = html_detect_encoding(data, size);
+ encoding = html_detect_encoding((const char **) &data, &size);
if (encoding) {
if (!html_set_parser_encoding(c, encoding))
return false;
@@ -168,6 +168,12 @@ bool html_process_data(struct content *c, char *data, unsigned int size)
ENCODING_SOURCE_DETECTED;
}
c->data.html.getenc = false;
+
+ /* The data we received may have solely consisted of a BOM.
+ * If so, it will have been stripped by html_detect_encoding.
+ * Therefore, we'll have nothing to do in that case. */
+ if (size == 0)
+ return true;
}
for (x = 0; x + CHUNK <= size; x += CHUNK) {
@@ -180,8 +186,22 @@ bool html_process_data(struct content *c, char *data, unsigned int size)
/* The encoding was not in headers or detected,
* and the parser found a <meta http-equiv="content-type"
* content="text/html; charset=...">. */
- c->data.html.encoding = talloc_strdup(c,
+
+ /* However, if that encoding is non-ASCII-compatible,
+ * ignore it, as it can't possibly be correct */
+ if (strncasecmp(c->data.html.parser->input->encoding,
+ "UTF-16", 6) == 0 || /* UTF-16(LE|BE)? */
+ strncasecmp(c->data.html.parser->input->encoding,
+ "UTF-32", 6) == 0) { /* UTF-32(LE|BE)? */
+ c->data.html.encoding = talloc_strdup(c, "ISO-8859-1");
+ c->data.html.encoding_source =
+ ENCODING_SOURCE_DETECTED;
+ } else {
+ c->data.html.encoding = talloc_strdup(c,
c->data.html.parser->input->encoding);
+ c->data.html.encoding_source = ENCODING_SOURCE_META;
+ }
+
if (!c->data.html.encoding) {
union content_msg_data msg_data;
@@ -189,7 +209,6 @@ bool html_process_data(struct content *c, char *data, unsigned int size)
content_broadcast(c, CONTENT_MSG_ERROR, msg_data);
return false;
}
- c->data.html.encoding_source = ENCODING_SOURCE_META;
/* have the encoding; don't attempt to detect it */
c->data.html.getenc = false;
@@ -293,33 +312,60 @@ bool html_set_parser_encoding(struct content *c, const char *encoding)
/**
* Attempt to detect the encoding of some HTML data.
*
- * \param data HTML source data
- * \param size length of data
+ * \param data Pointer to HTML source data
+ * \param size Pointer to length of data
* \return a constant string giving the encoding, or 0 if the encoding
* appears to be some 8-bit encoding
+ *
+ * If a BOM is encountered, *data and *size will be modified to skip over it
*/
-const char *html_detect_encoding(const char *data, unsigned int size)
+const char *html_detect_encoding(const char **data, unsigned int *size)
{
+ const unsigned char *d = (const unsigned char *) *data;
+
/* this detection assumes that the first two characters are <= 0xff */
- if (size < 4)
+ if (*size < 4)
return 0;
- if (data[0] == 0xfe && data[1] == 0xff) /* BOM fe ff */
+
+ if (d[0] == 0x00 && d[1] == 0x00 &&
+ d[2] == 0xfe && d[3] == 0xff) { /* BOM 00 00 fe ff */
+ *data += 4;
+ *size -= 4;
+ return "UTF-32BE";
+ } else if (d[0] == 0xff && d[1] == 0xfe &&
+ d[2] == 0x00 && d[3] == 0x00) { /* BOM ff fe 00 00 */
+ *data += 4;
+ *size -= 4;
+ return "UTF-32LE";
+ }
+ else if (d[0] == 0x00 && d[1] != 0x00 &&
+ d[2] == 0x00 && d[3] != 0x00) /* 00 xx 00 xx */
return "UTF-16BE";
- else if (data[0] == 0xfe && data[1] == 0xff) /* BOM ff fe */
+ else if (d[0] != 0x00 && d[1] == 0x00 &&
+ d[2] != 0x00 && d[3] == 0x00) /* xx 00 xx 00 */
return "UTF-16LE";
- else if (data[0] == 0x00 && data[1] != 0x00 &&
- data[2] == 0x00 && data[3] != 0x00) /* 00 xx 00 xx */
- return "UTF-16BE";
- else if (data[0] != 0x00 && data[1] == 0x00 &&
- data[2] != 0x00 && data[3] == 0x00) /* xx 00 xx 00 */
- return "UTF-16BE";
- else if (data[0] == 0x00 && data[1] == 0x00 &&
- data[2] == 0x00 && data[3] != 0x00) /* 00 00 00 xx */
+ else if (d[0] == 0x00 && d[1] == 0x00 &&
+ d[2] == 0x00 && d[3] != 0x00) /* 00 00 00 xx */
return "ISO-10646-UCS-4";
- else if (data[0] != 0x00 && data[1] == 0x00 &&
- data[2] == 0x00 && data[3] == 0x00) /* xx 00 00 00 */
+ else if (d[0] != 0x00 && d[1] == 0x00 &&
+ d[2] == 0x00 && d[3] == 0x00) /* xx 00 00 00 */
return "ISO-10646-UCS-4";
+ else if (d[0] == 0xfe && d[1] == 0xff) { /* BOM fe ff */
+ *data += 2;
+ *size -= 2;
+ return "UTF-16BE";
+ } else if (d[0] == 0xfe && d[1] == 0xff) { /* BOM ff fe */
+ *data += 2;
+ *size -= 2;
+ return "UTF-16LE";
+ } else if (d[0] == 0xef && d[1] == 0xbb &&
+ d[2] == 0xbf) { /* BOM ef bb bf */
+ *data += 3;
+ *size -= 3;
+ return "UTF-8";
+ }
+
return 0;
}