From a0a0787a34e0ad510c58dccf17a67c951aac6c3a Mon Sep 17 00:00:00 2001 From: Rupinder Singh Khokhar Date: Wed, 16 Jul 2014 08:59:09 +0530 Subject: Minor fixes to charset detection. Currently pre-scanning up to 1024 bytes. Removed larger cases in which encoding declaration is beyond 512 bytes, for the time being. Also removed some outdated tests. --- src/charset/detect.c | 36 ++++++++++++++++++++++-------------- 1 file changed, 22 insertions(+), 14 deletions(-) (limited to 'src/charset') diff --git a/src/charset/detect.c b/src/charset/detect.c index fd3de13..ebd6b32 100644 --- a/src/charset/detect.c +++ b/src/charset/detect.c @@ -215,13 +215,13 @@ uint16_t hubbub_charset_scan_meta(const uint8_t *data, size_t len) if (data == NULL) return 0; - end = pos + min(512, len); + end = pos + min(1024, len); /* 1. */ while (pos < end) { /* a */ if (PEEK(""); /* b */ } else if (PEEK(" 0 && strncasecmp((const char *) name, "content", - SLEN("content")) == 0) { + SLEN("content")) == 0 && + mibenum == 0) { mibenum = hubbub_charset_parse_content(value, valuelen); + need_pragma = true; + } else if (namelen == SLEN("http-equiv") && valuelen == + SLEN("content-type") && strncasecmp((const char *) + value, "content-type", + SLEN("content-type")) == 0 && + strncasecmp((const char *) name, "http-equiv", + SLEN("http-equiv")) == 0) { + got_pragma = true; } /* 6 */ @@ -349,12 +362,14 @@ uint16_t hubbub_charset_parse_attributes(const uint8_t **pos, } /* 7 */ - if (mibenum != 0) { + + } + if (mibenum != 0) { + if(got_pragma != false || need_pragma != true) { /* confidence = tentative; */ return mibenum; } } - return 0; } @@ -505,13 +520,6 @@ bool hubbub_charset_get_attribute(const uint8_t **data, const uint8_t *end, return false; } - /* 2. Invalid element open character */ - if (*pos == '<') { - pos--; - *data = pos; - return false; - } - /* 3. 
End of element */ if (*pos == '>') { *data = pos; @@ -537,7 +545,7 @@ bool hubbub_charset_get_attribute(const uint8_t **data, const uint8_t *end, } /* c */ - if (*pos == '/' || *pos == '<' || *pos == '>') { + if (*pos == '/' || *pos == '>') { *data = pos; return true; } @@ -631,7 +639,7 @@ bool hubbub_charset_get_attribute(const uint8_t **data, const uint8_t *end, while (pos < end) { /* 12. Extract unquoted value */ /* a */ - if (ISSPACE(*pos) || *pos == '<' || *pos == '>') { + if (ISSPACE(*pos) || *pos == '>') { *data = pos; return true; } -- cgit v1.2.3