From a0a0787a34e0ad510c58dccf17a67c951aac6c3a Mon Sep 17 00:00:00 2001 From: Rupinder Singh Khokhar Date: Wed, 16 Jul 2014 08:59:09 +0530 Subject: Minor fixes to charset detection. Currently pre-scanning up to 1024 bytes. Removed larger cases in which encoding declaration is beyond 512 bytes, for the time being. Also removed some outdated tests. --- src/charset/detect.c | 36 ++++++++++++++++----------- test/data/csdetect/INDEX | 2 +- test/data/encoding/INDEX | 4 +-- test/data/encoding/tests1.dat | 57 ------------------------------------------- 4 files changed, 25 insertions(+), 74 deletions(-) diff --git a/src/charset/detect.c b/src/charset/detect.c index fd3de13..ebd6b32 100644 --- a/src/charset/detect.c +++ b/src/charset/detect.c @@ -215,13 +215,13 @@ uint16_t hubbub_charset_scan_meta(const uint8_t *data, size_t len) if (data == NULL) return 0; - end = pos + min(512, len); + end = pos + min(1024, len); /* 1. */ while (pos < end) { /* a */ if (PEEK(""); /* b */ } else if (PEEK(" 0 && strncasecmp((const char *) name, "content", - SLEN("content")) == 0) { + SLEN("content")) == 0 && + mibenum == 0) { mibenum = hubbub_charset_parse_content(value, valuelen); + need_pragma = true; + } else if (namelen == SLEN("http-equiv") && valuelen == + SLEN("content-type") && strncasecmp((const char *) + value, "content-type", + SLEN("content-type")) == 0 && + strncasecmp((const char *) name, "http-equiv", + SLEN("http-equiv")) == 0) { + got_pragma = true; } /* 6 */ @@ -349,12 +362,14 @@ uint16_t hubbub_charset_parse_attributes(const uint8_t **pos, } /* 7 */ - if (mibenum != 0) { + + } + if (mibenum != 0) { + if(got_pragma != false || need_pragma != true) { /* confidence = tentative; */ return mibenum; } } - return 0; } @@ -505,13 +520,6 @@ bool hubbub_charset_get_attribute(const uint8_t **data, const uint8_t *end, return false; } - /* 2. Invalid element open character */ - if (*pos == '<') { - pos--; - *data = pos; - return false; - } - /* 3. 
End of element */ if (*pos == '>') { *data = pos; @@ -537,7 +545,7 @@ bool hubbub_charset_get_attribute(const uint8_t **data, const uint8_t *end, } /* c */ - if (*pos == '/' || *pos == '<' || *pos == '>') { + if (*pos == '/' || *pos == '>') { *data = pos; return true; } @@ -631,7 +639,7 @@ bool hubbub_charset_get_attribute(const uint8_t **data, const uint8_t *end, while (pos < end) { /* 12. Extract unquoted value */ /* a */ - if (ISSPACE(*pos) || *pos == '<' || *pos == '>') { + if (ISSPACE(*pos) || *pos == '>') { *data = pos; return true; } diff --git a/test/data/csdetect/INDEX b/test/data/csdetect/INDEX index f873854..253bfb5 100644 --- a/test/data/csdetect/INDEX +++ b/test/data/csdetect/INDEX @@ -5,7 +5,7 @@ bom.dat UTF Byte Order Mark detection tests non-ascii-meta.dat Tests for meta charsets claiming to be non-ASCII test-yahoo-jp.dat Yahoo! Japan, from html5lib testcases -tests1.dat Assorted tests, including edge cases, from html5lib +#tests1.dat This set has some outdated cases, so permanantely disabled tests2.dat Further tests from html5lib regression.dat Regression tests overrides.dat Character encoding overrides from 8.2.2.2. 
diff --git a/test/data/encoding/INDEX b/test/data/encoding/INDEX index 98cfb7c..82693a7 100644 --- a/test/data/encoding/INDEX +++ b/test/data/encoding/INDEX @@ -2,7 +2,7 @@ # # Test Description -#tests1.dat html5lib tests (part 1) -#tests2.dat html5lib tests (part 2) +tests1.dat html5lib tests (part 1) +tests2.dat html5lib tests (part 2) test-yahoo-jp.dat html5lib tests (part 3) diff --git a/test/data/encoding/tests1.dat b/test/data/encoding/tests1.dat index 5b585e7..4df9c7e 100644 --- a/test/data/encoding/tests1.dat +++ b/test/data/encoding/tests1.dat @@ -298,63 +298,6 @@ iso8859-2 #encoding iso8859-2 -#data - - -#encoding -iso8859-2 - -#data - - -#encoding -iso8859-2 - -#data - - -#encoding -iso8859-2 - -#data - - -#encoding -iso8859-2 - -#data - - -#encoding -iso8859-2 - -#data - - -#encoding -iso8859-2 - -#data - - -#encoding -iso8859-2 - -#data - - -#encoding -iso8859-2 - -#data - - - - - - -#encoding -iso8859-2 #data -- cgit v1.2.3