summaryrefslogtreecommitdiff
diff options
context:
space:
mode:
authorDaniel Silverstone <dsilvers@digital-scurf.org>2017-09-09 08:59:58 (GMT)
committer Daniel Silverstone <dsilvers@digital-scurf.org>2017-09-09 08:59:58 (GMT)
commit0eb6188c3a931063f78b017c621b79709746706e (patch)
tree9947fbb140fbc0e7013b2b8246dcc66a21f9202c
parent73071c0dea1e4bcfd094810d051aebc74e6c648c (diff)
downloadlibhubbub-0eb6188c3a931063f78b017c621b79709746706e.tar.gz
libhubbub-0eb6188c3a931063f78b017c621b79709746706e.tar.bz2
Support falling back to space-separated charset
In some cases, for example, Apple Mail, programs generate HTML with appallingly bad meta tags such as: <meta content="text/html charset=utf-8"> This is bad because (a) there is no http-equiv="Content-Type" and (b) the content type and charset do not have a separating semicolon. Sadly, Chrome et al. support this, so we need to in Hubbub. This change adjusts the content="" parser to retry if it cannot find a semicolon, and work forwards to the first whitespace instead. Fixes: #2549
-rw-r--r--src/charset/detect.c19
1 files changed, 17 insertions, 2 deletions
diff --git a/src/charset/detect.c b/src/charset/detect.c
index 93cbe63..d2d6816 100644
--- a/src/charset/detect.c
+++ b/src/charset/detect.c
@@ -369,6 +369,7 @@ uint16_t hubbub_charset_parse_attributes(const uint8_t **pos,
uint16_t hubbub_charset_parse_content(const uint8_t *value,
uint32_t valuelen)
{
+ const uint8_t *restart = value;
const uint8_t *end;
const uint8_t *tentative = NULL;
uint32_t tentative_len = 0;
@@ -388,8 +389,22 @@ uint16_t hubbub_charset_parse_content(const uint8_t *value,
value++;
}
- if (value >= end)
- return 0;
+ if (value >= end) {
+ /* Fallback, no semicolon, try for first whitespace */
+ value = restart;
+ while (value < end) {
+ /* This condition is odd, because ISSPACE() includes
+ * forward slash, which we need to skip so that content
+ * types work properly.
+ */
+ if (ISSPACE(*value) && (*value != '/')) {
+ value++;
+ break;
+ }
+
+ value++;
+ }
+ }
/* 2 */
while (value < end && ISSPACE(*value)) {