From 93e2b4855ba5b16a507e75dc533500efcc91e065 Mon Sep 17 00:00:00 2001 From: Vincent Sanders Date: Sun, 19 Mar 2017 15:00:21 +0000 Subject: make mimesniffing use core strings --- content/mimesniff.c | 342 +++++++++++++++++++--------------------------------- content/mimesniff.h | 14 +-- desktop/netsurf.c | 6 - 3 files changed, 134 insertions(+), 228 deletions(-) diff --git a/content/mimesniff.c b/content/mimesniff.c index b24448861..832320086 100644 --- a/content/mimesniff.c +++ b/content/mimesniff.c @@ -16,8 +16,9 @@ * along with this program. If not, see <http://www.gnu.org/licenses/>. */ -/** \file - * MIME type sniffer (implementation) +/** + * \file + * MIME type sniffer implementation * * Spec version: 2011-11-27 */ @@ -25,11 +26,13 @@ #include #include +#include "utils/http.h" +#include "utils/utils.h" +#include "utils/corestrings.h" + #include "content/content_factory.h" #include "content/llcache.h" #include "content/mimesniff.h" -#include "utils/http.h" -#include "utils/utils.h" struct map_s { const uint8_t *sig; @@ -38,103 +41,6 @@ struct map_s { lwc_string **type; }; -static lwc_string *unknown_unknown; -static lwc_string *application_unknown; -static lwc_string *any; -static lwc_string *text_xml; -static lwc_string *application_xml; -static lwc_string *text_html; -static lwc_string *text_plain; -static lwc_string *application_octet_stream; -static lwc_string *image_gif; -static lwc_string *image_png; -static lwc_string *image_jpeg; -static lwc_string *image_bmp; -static lwc_string *image_vnd_microsoft_icon; -static lwc_string *image_webp; -static lwc_string *application_rss_xml; -static lwc_string *application_atom_xml; -static lwc_string *audio_wave; -static lwc_string *application_ogg; -static lwc_string *video_webm; -static lwc_string *application_x_rar_compressed; -static lwc_string *application_zip; -static lwc_string *application_x_gzip; -static lwc_string *application_postscript; -static lwc_string *application_pdf; -static lwc_string *video_mp4; -static lwc_string *image_svg; 
- -nserror mimesniff_init(void) -{ - lwc_error lerror; - -#define SINIT(v, s) \ - lerror = lwc_intern_string(s, SLEN(s), &v); \ - if (lerror != lwc_error_ok) \ - return NSERROR_NOMEM - - SINIT(unknown_unknown, "unknown/unknown"); - SINIT(application_unknown, "application/unknown"); - SINIT(any, "*/*"); - SINIT(text_xml, "text/xml"); - SINIT(application_xml, "application/xml"); - SINIT(text_html, "text/html"); - SINIT(text_plain, "text/plain"); - SINIT(application_octet_stream, "application/octet-stream"); - SINIT(image_gif, "image/gif"); - SINIT(image_png, "image/png"); - SINIT(image_jpeg, "image/jpeg"); - SINIT(image_bmp, "image/bmp"); - SINIT(image_vnd_microsoft_icon, "image/vnd.microsoft.icon"); - SINIT(image_webp, "image/webp"); - SINIT(application_rss_xml, "application/rss+xml"); - SINIT(application_atom_xml, "application/atom+xml"); - SINIT(audio_wave, "audio/wave"); - SINIT(application_ogg, "application/ogg"); - SINIT(video_webm, "video/webm"); - SINIT(application_x_rar_compressed, "application/x-rar-compressed"); - SINIT(application_zip, "application/zip"); - SINIT(application_x_gzip, "application/x-gzip"); - SINIT(application_postscript, "application/postscript"); - SINIT(application_pdf, "application/pdf"); - SINIT(video_mp4, "video/mp4"); - SINIT(image_svg, "image/svg+xml"); -#undef SINIT - - return NSERROR_OK; -} - -void mimesniff_fini(void) -{ - lwc_string_unref(image_svg); - lwc_string_unref(video_mp4); - lwc_string_unref(application_pdf); - lwc_string_unref(application_postscript); - lwc_string_unref(application_x_gzip); - lwc_string_unref(application_zip); - lwc_string_unref(application_x_rar_compressed); - lwc_string_unref(video_webm); - lwc_string_unref(application_ogg); - lwc_string_unref(audio_wave); - lwc_string_unref(application_atom_xml); - lwc_string_unref(application_rss_xml); - lwc_string_unref(image_webp); - lwc_string_unref(image_vnd_microsoft_icon); - lwc_string_unref(image_bmp); - lwc_string_unref(image_jpeg); - 
lwc_string_unref(image_png); - lwc_string_unref(image_gif); - lwc_string_unref(application_octet_stream); - lwc_string_unref(text_plain); - lwc_string_unref(text_html); - lwc_string_unref(application_xml); - lwc_string_unref(text_xml); - lwc_string_unref(any); - lwc_string_unref(application_unknown); - lwc_string_unref(unknown_unknown); -} - static bool mimesniff__has_binary_octets(const uint8_t *data, size_t len) { const uint8_t *end = data + len; @@ -143,7 +49,7 @@ static bool mimesniff__has_binary_octets(const uint8_t *data, size_t len) const uint8_t c = *data; /* Binary iff in C0 and not ESC, CR, FF, LF, HT */ - if (c <= 0x1f && c != 0x1b && c != '\r' && c != '\f' && + if (c <= 0x1f && c != 0x1b && c != '\r' && c != '\f' && c != '\n' && c != '\t') break; @@ -168,19 +74,19 @@ static nserror mimesniff__match_mp4(const uint8_t *data, size_t len, * uint32_t compatible_brands[]; * } * - * Note 1: A size of 0 implies that the length of the box is designated - * by the remaining input data (and thus may only occur in the last - * box in the input). We'll reject this below, as it's pointless + * Note 1: A size of 0 implies that the length of the box is designated + * by the remaining input data (and thus may only occur in the last + * box in the input). We'll reject this below, as it's pointless * sniffing input that contains no boxes other than 'ftyp'. * - * Note 2: A size of 1 implies an additional uint64_t field after - * the type which contains the extended box size. We'll reject this, - * too, as it implies a minimum of (2^32 - 24) / 4 compatible brands, + * Note 2: A size of 1 implies an additional uint64_t field after + * the type which contains the extended box size. We'll reject this, + * too, as it implies a minimum of (2^32 - 24) / 4 compatible brands, * which is decidely unlikely. */ - /* 12 reflects the minimum number of octets needed to sniff useful - * information out of an 'ftyp' box (i.e. 
the size, type, + /* 12 reflects the minimum number of octets needed to sniff useful + * information out of an 'ftyp' box (i.e. the size, type, * and major_brand words). */ if (len < 12) return NSERROR_NOT_FOUND; @@ -193,22 +99,22 @@ static nserror mimesniff__match_mp4(const uint8_t *data, size_t len, return NSERROR_NOT_FOUND; /* Ensure this is an 'ftyp' box */ - if (data[4] != 'f' || data[5] != 't' || + if (data[4] != 'f' || data[5] != 't' || data[6] != 'y' || data[7] != 'p') return NSERROR_NOT_FOUND; /* Check if major brand begins with 'mp4' */ if (data[8] == 'm' && data[9] == 'p' && data[10] == '4') { - *effective_type = lwc_string_ref(video_mp4); + *effective_type = lwc_string_ref(corestring_lwc_video_mp4); return NSERROR_OK; } /* Search each compatible brand in the box for "mp4" */ for (i = 16; i <= box_size - 4; i += 4) { - if (data[i] == 'm' && - data[i+1] == 'p' && + if (data[i] == 'm' && + data[i+1] == 'p' && data[i+2] == '4') { - *effective_type = lwc_string_ref(video_mp4); + *effective_type = lwc_string_ref(corestring_lwc_video_mp4); return NSERROR_OK; } } @@ -221,28 +127,28 @@ static nserror mimesniff__match_unknown_ws(const uint8_t *data, size_t len, { #define SIG(t, s, x) { (const uint8_t *) s, SLEN(s), x, t } static const struct map_s ws_exact_match_types[] = { - SIG(&text_xml, "len + 1) continue; - if (strncasecmp((const char *) data, - (const char *) it->sig, it->len) == 0 && - (data[it->len] == ' ' || + if (strncasecmp((const char *) data, + (const char *) it->sig, it->len) == 0 && + (data[it->len] == ' ' || data[it->len] == '>')) { *effective_type = lwc_string_ref(*it->type); return NSERROR_OK; @@ -294,9 +200,9 @@ static nserror mimesniff__match_unknown_bom(const uint8_t *data, size_t len, { #define SIG(t, s, x) { (const uint8_t *) s, SLEN(s), x, t } static const struct map_s bom_match_types[] = { - SIG(&text_plain, "\xfe\xff", false), - SIG(&text_plain, "\xff\xfe", false), - SIG(&text_plain, "\xef\xbb\xbf", false), + 
SIG(&corestring_lwc_text_plain, "\xfe\xff", false), + SIG(&corestring_lwc_text_plain, "\xff\xfe", false), + SIG(&corestring_lwc_text_plain, "\xef\xbb\xbf", false), { NULL, 0, false, NULL } }; #undef SIG @@ -317,17 +223,17 @@ static nserror mimesniff__match_unknown_riff(const uint8_t *data, size_t len, { #define SIG(t, s, x) { (const uint8_t *) s, SLEN(s), x, t } static const struct map_s riff_match_types[] = { - SIG(&image_webp, "WEBPVP", true), - SIG(&audio_wave, "WAVE", true), + SIG(&corestring_lwc_image_webp, "WEBPVP", true), + SIG(&corestring_lwc_audio_wave, "WAVE", true), { NULL, 0, false, NULL } }; #undef SIG const struct map_s *it; for (it = riff_match_types; it->sig != NULL; it++) { - if (it->len + SLEN("RIFF????") <= len && + if (it->len + SLEN("RIFF????") <= len && memcmp(data, "RIFF", SLEN("RIFF")) == 0 && - memcmp(data + SLEN("RIFF????"), + memcmp(data + SLEN("RIFF????"), it->sig, it->len) == 0) { *effective_type = lwc_string_ref(*it->type); return NSERROR_OK; @@ -342,19 +248,19 @@ static nserror mimesniff__match_unknown_exact(const uint8_t *data, size_t len, { #define SIG(t, s, x) { (const uint8_t *) s, SLEN(s), x, t } static const struct map_s exact_match_types[] = { - SIG(&image_gif, "GIF87a", true), - SIG(&image_gif, "GIF89a", true), - SIG(&image_png, "\x89PNG\r\n\x1a\n", true), - SIG(&image_jpeg, "\xff\xd8\xff", true), - SIG(&image_bmp, "BM", true), - SIG(&image_vnd_microsoft_icon, "\x00\x00\x01\x00", true), - SIG(&application_ogg, "OggS\x00", true), - SIG(&video_webm, "\x1a\x45\xdf\xa3", true), - SIG(&application_x_rar_compressed, "Rar \x1a\x07\x00", true), - SIG(&application_zip, "PK\x03\x04", true), - SIG(&application_x_gzip, "\x1f\x8b\x08", true), - SIG(&application_postscript, "%!PS-Adobe-", true), - SIG(&application_pdf, "%PDF-", false), + SIG(&corestring_lwc_image_gif, "GIF87a", true), + SIG(&corestring_lwc_image_gif, "GIF89a", true), + SIG(&corestring_lwc_image_png, "\x89PNG\r\n\x1a\n", true), + SIG(&corestring_lwc_image_jpeg, 
"\xff\xd8\xff", true), + SIG(&corestring_lwc_image_bmp, "BM", true), + SIG(&corestring_lwc_image_vnd_microsoft_icon, "\x00\x00\x01\x00", true), + SIG(&corestring_lwc_application_ogg, "OggS\x00", true), + SIG(&corestring_lwc_video_webm, "\x1a\x45\xdf\xa3", true), + SIG(&corestring_lwc_application_x_rar_compressed, "Rar \x1a\x07\x00", true), + SIG(&corestring_lwc_application_zip, "PK\x03\x04", true), + SIG(&corestring_lwc_application_x_gzip, "\x1f\x8b\x08", true), + SIG(&corestring_lwc_application_postscript, "%!PS-Adobe-",true), + SIG(&corestring_lwc_application_pdf, "%PDF-", false), { NULL, 0, false, NULL } }; #undef SIG @@ -374,11 +280,11 @@ static nserror mimesniff__match_unknown_exact(const uint8_t *data, size_t len, static nserror mimesniff__match_unknown(const uint8_t *data, size_t len, bool allow_unsafe, lwc_string **effective_type) { - if (mimesniff__match_unknown_exact(data, len, allow_unsafe, + if (mimesniff__match_unknown_exact(data, len, allow_unsafe, effective_type) == NSERROR_OK) return NSERROR_OK; - if (mimesniff__match_unknown_riff(data, len, + if (mimesniff__match_unknown_riff(data, len, effective_type) == NSERROR_OK) return NSERROR_OK; @@ -407,49 +313,51 @@ static nserror mimesniff__compute_unknown(const uint8_t *data, size_t len, len = min(len, 512); - if (mimesniff__match_unknown(data, len, true, - effective_type) == NSERROR_OK) + if (mimesniff__match_unknown(data, len, true, + effective_type) == NSERROR_OK) { return NSERROR_OK; + } if (mimesniff__has_binary_octets(data, len) == false) { /* No binary octets => text/plain */ - *effective_type = lwc_string_ref(text_plain); + *effective_type = lwc_string_ref(corestring_lwc_text_plain); return NSERROR_OK; } - *effective_type = lwc_string_ref(application_octet_stream); + *effective_type = lwc_string_ref(corestring_lwc_application_octet_stream); return NSERROR_OK; } -static nserror mimesniff__compute_text_or_binary(const uint8_t *data, +static nserror mimesniff__compute_text_or_binary(const uint8_t 
*data, size_t len, lwc_string **effective_type) { - if (data == NULL) + if (data == NULL) { return NSERROR_NEED_DATA; + } len = min(len, 512); if (len >= 3 && ((data[0] == 0xfe && data[1] == 0xff) || (data[0] == 0xff && data[1] == 0xfe) || - (data[0] == 0xef && data[1] == 0xbb && + (data[0] == 0xef && data[1] == 0xbb && data[2] == 0xbf))) { /* Found a BOM => text/plain */ - *effective_type = lwc_string_ref(text_plain); + *effective_type = lwc_string_ref(corestring_lwc_text_plain); return NSERROR_OK; } if (mimesniff__has_binary_octets(data, len) == false) { /* No binary octets => text/plain */ - *effective_type = lwc_string_ref(text_plain); + *effective_type = lwc_string_ref(corestring_lwc_text_plain); return NSERROR_OK; } - if (mimesniff__match_unknown(data, len, false, + if (mimesniff__match_unknown(data, len, false, effective_type) == NSERROR_OK) return NSERROR_OK; - *effective_type = lwc_string_ref(application_octet_stream); + *effective_type = lwc_string_ref(corestring_lwc_application_octet_stream); return NSERROR_OK; } @@ -463,12 +371,12 @@ static nserror mimesniff__compute_image(lwc_string *official_type, size_t len; lwc_string **type; } image_types[] = { - SIG(&image_gif, "GIF87a"), - SIG(&image_gif, "GIF89a"), - SIG(&image_png, "\x89PNG\r\n\x1a\n"), - SIG(&image_jpeg, "\xff\xd8\xff"), - SIG(&image_bmp, "BM"), - SIG(&image_vnd_microsoft_icon, "\x00\x00\x01\x00"), + SIG(&corestring_lwc_image_gif, "GIF87a"), + SIG(&corestring_lwc_image_gif, "GIF89a"), + SIG(&corestring_lwc_image_png, "\x89PNG\r\n\x1a\n"), + SIG(&corestring_lwc_image_jpeg, "\xff\xd8\xff"), + SIG(&corestring_lwc_image_bmp, "BM"), + SIG(&corestring_lwc_image_vnd_microsoft_icon, "\x00\x00\x01\x00"), { NULL, 0, NULL } }; #undef SIG @@ -489,12 +397,12 @@ static nserror mimesniff__compute_image(lwc_string *official_type, } /* WebP has a signature that doesn't fit into the above table */ - if (SLEN("RIFF????WEBPVP") <= len && - memcmp(data, "RIFF", SLEN("RIFF")) == 0 && - memcmp(data + 
SLEN("RIFF????"), + if (SLEN("RIFF????WEBPVP") <= len && + memcmp(data, "RIFF", SLEN("RIFF")) == 0 && + memcmp(data + SLEN("RIFF????"), "WEBPVP", SLEN("WEBPVP")) == 0 ) { lwc_string_unref(official_type); - *effective_type = lwc_string_ref(image_webp); + *effective_type = lwc_string_ref(corestring_lwc_image_webp); return NSERROR_OK; } @@ -537,7 +445,7 @@ static nserror mimesniff__compute_feed_or_html(const uint8_t *data, switch (state) { case BEFORE_BOM: - if (3 <= end - data && c == 0xef && data[1] == 0xbb && + if (3 <= end - data && c == 0xef && data[1] == 0xbb && data[2] == 0xbf) { data += 3; } @@ -597,12 +505,12 @@ static nserror mimesniff__compute_feed_or_html(const uint8_t *data, break; case IN_TAG: if (MATCH("rss")) { - *effective_type = - lwc_string_ref(application_rss_xml); + *effective_type = + lwc_string_ref(corestring_lwc_application_rss_xml); return NSERROR_OK; } else if (MATCH("feed")) { - *effective_type = - lwc_string_ref(application_atom_xml); + *effective_type = + lwc_string_ref(corestring_lwc_application_atom_xml); return NSERROR_OK; } else if (MATCH("rdf:RDF")) { state = IN_RDF; @@ -621,8 +529,7 @@ static nserror mimesniff__compute_feed_or_html(const uint8_t *data, data++; if (rdf && rss) { - *effective_type = - lwc_string_ref(application_rss_xml); + *effective_type = lwc_string_ref(corestring_lwc_application_rss_xml); return NSERROR_OK; } @@ -631,7 +538,7 @@ static nserror mimesniff__compute_feed_or_html(const uint8_t *data, #undef MATCH } - *effective_type = lwc_string_ref(text_html); + *effective_type = lwc_string_ref(corestring_lwc_text_html); return NSERROR_OK; @@ -664,7 +571,7 @@ nserror mimesniff_compute_effective_type(llcache_handle *handle, bool match; nserror error; - content_type_header = + content_type_header = llcache_handle_get_header(handle, "Content-Type"); if (content_type_header == NULL) { if (sniff_allowed == false) @@ -692,9 +599,10 @@ nserror mimesniff_compute_effective_type(llcache_handle *handle, if (image_only) { 
lwc_string *official_type; - if (lwc_string_caseless_isequal(ct->media_type, image_svg, - &match) == lwc_error_ok && match) { - *effective_type = lwc_string_ref(image_svg); + if (lwc_string_caseless_isequal(ct->media_type, + corestring_lwc_image_svg, + &match) == lwc_error_ok && match) { + *effective_type = lwc_string_ref(corestring_lwc_image_svg); http_content_type_destroy(ct); return NSERROR_OK; } @@ -710,8 +618,9 @@ nserror mimesniff_compute_effective_type(llcache_handle *handle, /* Look for text types */ for (tt = text_types; tt->data != NULL; tt++) { if (tt->len == content_type_header_len && - memcmp(tt->data, content_type_header, - content_type_header_len) == 0) { + memcmp(tt->data, + content_type_header, + content_type_header_len) == 0) { http_content_type_destroy(ct); return mimesniff__compute_text_or_binary(data, len, effective_type); @@ -719,22 +628,24 @@ nserror mimesniff_compute_effective_type(llcache_handle *handle, } /* unknown/unknown, application/unknown, * / * */ - if ((lwc_string_caseless_isequal(ct->media_type, unknown_unknown, - &match) == lwc_error_ok && match) || - (lwc_string_caseless_isequal(ct->media_type, - application_unknown, &match) == lwc_error_ok && - match) || - (lwc_string_caseless_isequal(ct->media_type, any, - &match) == lwc_error_ok && match)) { + if ((lwc_string_caseless_isequal(ct->media_type, + corestring_lwc_unknown_unknown, + &match) == lwc_error_ok && match) || + (lwc_string_caseless_isequal(ct->media_type, + corestring_lwc_application_unknown, + &match) == lwc_error_ok && match) || + (lwc_string_caseless_isequal(ct->media_type, + corestring_lwc_any, + &match) == lwc_error_ok && match)) { http_content_type_destroy(ct); return mimesniff__compute_unknown(data, len, effective_type); } /* +xml */ if (lwc_string_length(ct->media_type) > SLEN("+xml") && - strncasecmp(lwc_string_data(ct->media_type) + - lwc_string_length(ct->media_type) - - SLEN("+xml"), + strncasecmp(lwc_string_data(ct->media_type) + + 
lwc_string_length(ct->media_type) - + SLEN("+xml"), "+xml", SLEN("+xml")) == 0) { /* Use official type */ *effective_type = lwc_string_ref(ct->media_type); @@ -743,19 +654,20 @@ nserror mimesniff_compute_effective_type(llcache_handle *handle, } /* text/xml, application/xml */ - if ((lwc_string_caseless_isequal(ct->media_type, text_xml, - &match) == lwc_error_ok && match) || - (lwc_string_caseless_isequal(ct->media_type, - application_xml, &match) == lwc_error_ok && - match)) { + if ((lwc_string_caseless_isequal(ct->media_type, + corestring_lwc_text_xml, + &match) == lwc_error_ok && match) || + (lwc_string_caseless_isequal(ct->media_type, + corestring_lwc_application_xml, + &match) == lwc_error_ok && match)) { /* Use official type */ *effective_type = lwc_string_ref(ct->media_type); http_content_type_destroy(ct); return NSERROR_OK; } - + /* Image types */ - if (content_factory_type_from_mime_type(ct->media_type) == + if (content_factory_type_from_mime_type(ct->media_type) == CONTENT_IMAGE) { lwc_string *official_type = lwc_string_ref(ct->media_type); http_content_type_destroy(ct); @@ -764,11 +676,12 @@ nserror mimesniff_compute_effective_type(llcache_handle *handle, } /* text/html */ - if ((lwc_string_caseless_isequal(ct->media_type, text_html, - &match) == lwc_error_ok && match)) { + if ((lwc_string_caseless_isequal(ct->media_type, + corestring_lwc_text_html, + &match) == lwc_error_ok && match)) { http_content_type_destroy(ct); return mimesniff__compute_feed_or_html(data, len, - effective_type); + effective_type); } /* Use official type */ @@ -778,4 +691,3 @@ nserror mimesniff_compute_effective_type(llcache_handle *handle, return NSERROR_OK; } - diff --git a/content/mimesniff.h b/content/mimesniff.h index bf3e493f3..474f7b87f 100644 --- a/content/mimesniff.h +++ b/content/mimesniff.h @@ -16,8 +16,9 @@ * along with this program. If not, see <http://www.gnu.org/licenses/>. 
*/ -/** \file - * MIME type sniffer (interface) +/** + * \file + * MIME type sniffer interface */ #ifndef NETSURF_CONTENT_MIMESNIFF_H_ @@ -31,8 +32,10 @@ struct llcache_handle; /** - * Compute the effective MIME type for an object using the sniffing - * algorithm described in http://mimesniff.spec.whatwg.org/ + * Compute the effective MIME type for an object + * + * The implementation uses the sniffing algorithm described in + * http://mimesniff.spec.whatwg.org/ * * \param handle Source data handle to sniff * \param data First data chunk, or NULL @@ -49,7 +52,4 @@ nserror mimesniff_compute_effective_type(struct llcache_handle *handle, const uint8_t *data, size_t len, bool sniff_allowed, bool image_only, lwc_string **effective_type); -nserror mimesniff_init(void); -void mimesniff_fini(void); - #endif diff --git a/desktop/netsurf.c b/desktop/netsurf.c index 0f597aa9d..3baa936f3 100644 --- a/desktop/netsurf.c +++ b/desktop/netsurf.c @@ -212,10 +212,6 @@ nserror netsurf_init(const char *store_path) if (ret != NSERROR_OK) return ret; - ret = mimesniff_init(); - if (ret != NSERROR_OK) - return ret; - setlocale(LC_ALL, ""); /* initialise the fetchers */ @@ -262,8 +258,6 @@ void netsurf_exit(void) LOG("Closing fetches"); fetcher_quit(); - mimesniff_fini(); - /* dump any remaining cache entries */ image_cache_fini(); -- cgit v1.2.3