From babbb971172d6cbe71126ca1f0069236ea1bf144 Mon Sep 17 00:00:00 2001 From: Andrew Sidwell Date: Tue, 24 Jun 2008 00:00:27 +0000 Subject: Rough and unoptimised quirks-mode detector in the "initial" tree construction phase. svn path=/trunk/hubbub/; revision=4430 --- src/treebuilder/initial.c | 235 +++++++++++++++++++++++++++++++++++++++++++++- 1 file changed, 234 insertions(+), 1 deletion(-) (limited to 'src/treebuilder') diff --git a/src/treebuilder/initial.c b/src/treebuilder/initial.c index 30a380b..910cf33 100644 --- a/src/treebuilder/initial.c +++ b/src/treebuilder/initial.c @@ -3,6 +3,8 @@ * Licensed under the MIT License, * http://www.opensource.org/licenses/mit-license.php * Copyright 2008 John-Mark Bell + * + * Up-to-date with 19 June 2008 spec. */ #include @@ -14,6 +16,225 @@ #include "utils/utils.h" +#define S(s) { s, sizeof s } + +struct { + const char *name; + size_t len; +} public_doctypes[] = { + S("+//Silmaril//dtd html Pro v0r11 19970101//"), + S("-//AdvaSoft Ltd//DTD HTML 3.0 asWedit + extensions//"), + S("-//AS//DTD HTML 3.0 asWedit + extensions//"), + S("-//IETF//DTD HTML 2.0 Level 1//"), + S("-//IETF//DTD HTML 2.0 Level 2//"), + S("-//IETF//DTD HTML 2.0 Strict Level 1//"), + S("-//IETF//DTD HTML 2.0 Strict Level 2//"), + S("-//IETF//DTD HTML 2.0 Strict//"), + S("-//IETF//DTD HTML 2.0//"), + S("-//IETF//DTD HTML 2.1E//"), + S("-//IETF//DTD HTML 3.0//"), + S("-//IETF//DTD HTML 3.2 Final//"), + S("-//IETF//DTD HTML 3.2//"), + S("-//IETF//DTD HTML 3//"), + S("-//IETF//DTD HTML Level 0//"), + S("-//IETF//DTD HTML Level 1//"), + S("-//IETF//DTD HTML Level 2//"), + S("-//IETF//DTD HTML Level 3//"), + S("-//IETF//DTD HTML Strict Level 0//"), + S("-//IETF//DTD HTML Strict Level 1//"), + S("-//IETF//DTD HTML Strict Level 2//"), + S("-//IETF//DTD HTML Strict Level 3//"), + S("-//IETF//DTD HTML Strict//"), + S("-//IETF//DTD HTML//"), + S("-//Metrius//DTD Metrius Presentational//"), + S("-//Microsoft//DTD Internet Explorer 2.0 HTML Strict//"), + S("-//Microsoft//DTD Internet Explorer 2.0 HTML//"), + S("-//Microsoft//DTD Internet Explorer 2.0 Tables//"), + S("-//Microsoft//DTD Internet Explorer 3.0 HTML Strict//"), + S("-//Microsoft//DTD Internet Explorer 3.0 HTML//"), + S("-//Microsoft//DTD Internet Explorer 3.0 Tables//"), + S("-//Netscape Comm. Corp.//DTD HTML//"), + S("-//Netscape Comm. Corp.//DTD Strict HTML//"), + S("-//O'Reilly and Associates//DTD HTML 2.0//"), + S("-//O'Reilly and Associates//DTD HTML Extended 1.0//"), + S("-//O'Reilly and Associates//DTD HTML Extended Relaxed 1.0//"), + S("-//SoftQuad Software//DTD HoTMetaL PRO 6.0::19990601::extensions to HTML 4.0//"), + S("-//SoftQuad//DTD HoTMetaL PRO 4.0::19971010::extensions to HTML 4.0//"), + S("-//Spyglass//DTD HTML 2.0 Extended//"), + S("-//SQ//DTD HTML 2.0 HoTMetaL + extensions//"), + S("-//Sun Microsystems Corp.//DTD HotJava HTML//"), + S("-//Sun Microsystems Corp.//DTD HotJava Strict HTML//"), + S("-//W3C//DTD HTML 3 1995-03-24//"), + S("-//W3C//DTD HTML 3.2 Draft//"), + S("-//W3C//DTD HTML 3.2 Final//"), + S("-//W3C//DTD HTML 3.2//"), + S("-//W3C//DTD HTML 3.2S Draft//"), + S("-//W3C//DTD HTML 4.0 Frameset//"), + S("-//W3C//DTD HTML 4.0 Transitional//"), + S("-//W3C//DTD HTML Experimental 19960712//"), + S("-//W3C//DTD HTML Experimental 970421//"), + S("-//W3C//DTD W3 HTML//"), + S("-//W3O//DTD W3 HTML 3.0//"), +}; + +#undef S + + +/** + * Check if one string starts with another. + * + * \param a String to compare + * \param a_len Length of first string + * \param b String to compare + * \param b_len Length of second string + */ +static bool starts_with(const uint8_t *a, size_t a_len, const uint8_t *b, + size_t b_len) +{ + uint8_t z1, z2; + + if (a_len < b_len) + return false; + + for (const uint8_t *s1 = a, *s2 = b; b_len > 0; s1++, s2++, b_len--) + { + z1 = (*s1 & ~0x20); + z2 = (*s2 & ~0x20); + if (z1 != z2) return false; + if (!z1) return true; + } + + return true; +} + +/** + * Check that one string is exactly equal to another + * + * \param a String to compare + * \param a_len Length of first string + * \param b String to compare + * \param b_len Length of second string + */ +static bool match(const uint8_t *a, size_t a_len, const uint8_t *b, + size_t b_len) +{ + uint8_t z1, z2; + + if (a_len != b_len) + return false; + + for (const uint8_t *s1 = a, *s2 = b; b_len > 0; s1++, s2++, b_len--) + { + z1 = (*s1 & ~0x20); + z2 = (*s2 & ~0x20); + if (z1 != z2) return false; + } + + return true; +} + + +/** + * Determine whether this doctype triggers full quirks mode + * + * \param cdoc The doctype to examine + * \return True to trigger quirks, false otherwise + */ +static bool lookup_full_quirks(hubbub_treebuilder *treebuilder, + const hubbub_doctype *cdoc) +{ + size_t i; + + const uint8_t *name = treebuilder->input_buffer + cdoc->name.data.off; + size_t name_len = cdoc->name.len; + + const uint8_t *public_id = treebuilder->input_buffer + + cdoc->public_id.data.off; + size_t public_id_len = cdoc->public_id.len; + + const uint8_t *system_id = treebuilder->input_buffer + + cdoc->system_id.data.off; + size_t system_id_len = cdoc->system_id.len; + +#define S(s) (uint8_t *) s, sizeof s + + /* Check the name is "HTML" (case-insensitively) */ + if (!match(name, name_len, S("HTML"))) + return true; + + /* No public id means not-quirks */ + if (cdoc->public_missing) return false; + + for (i = 0; i < sizeof public_doctypes / sizeof public_doctypes[0]; i++) + { + if (starts_with(public_id, public_id_len, + (uint8_t *) public_doctypes[i].name, + public_doctypes[i].len)) { + return true; + } + } + + if (match(public_id, public_id_len, + S("-//W3O//DTD W3 HTML Strict 3.0//EN//")) || + match(public_id, public_id_len, + S("-/W3C/DTD HTML 4.0 Transitional/EN")) || + match(public_id, public_id_len, + S("HTML")) || + match(system_id, system_id_len, + S("http://www.ibm.com/data/dtd/v11/ibmxhtml1-transitional.dtd"))) { + return true; + } + + if (cdoc->system_missing == true && + (starts_with(public_id, public_id_len, + S("-//W3C//DTD HTML 4.01 Frameset//")) || + starts_with(public_id, public_id_len, + S("-//W3C//DTD HTML 4.01 Transitional//")))) { + return true; + } + +#undef S + + return false; +} + + +/** + * Determine whether this doctype triggers limited quirks mode + * + * \param cdoc The doctype to examine + * \return True to trigger quirks, false otherwise + */ +static bool lookup_limited_quirks(hubbub_treebuilder *treebuilder, + const hubbub_doctype *cdoc) +{ + const uint8_t *public_id = treebuilder->input_buffer + + cdoc->public_id.data.off; + size_t public_id_len = cdoc->public_id.len; + +#define S(s) (uint8_t *) s, sizeof s + + if (starts_with(public_id, public_id_len, + S("-//W3C//DTD XHTML 1.0 Frameset//")) || + starts_with(public_id, public_id_len, + S("-//W3C//DTD XHTML 1.0 Transitional//"))) { + return true; + } + + if (cdoc->system_missing == false && + (starts_with(public_id, public_id_len, + S("-//W3C//DTD HTML 4.01 Frameset//")) || + starts_with(public_id, public_id_len, + S("-//W3C//DTD HTML 4.01 Transitional//")))) { + return true; + } + +#undef S + + return false; +} + + /** * Handle token in initial insertion mode * @@ -71,7 +292,19 @@ bool handle_initial(hubbub_treebuilder *treebuilder, const hubbub_token *token) doctype); } - /* \todo look up the doctype in a catalog */ + const hubbub_doctype *cdoc = &token->data.doctype; + + /* Work out whether we need quirks mode or not */ + if (cdoc->force_quirks == true || + lookup_full_quirks(treebuilder, cdoc)) { + treebuilder->tree_handler->set_quirks_mode( + treebuilder->tree_handler->ctx, + HUBBUB_QUIRKS_MODE_FULL); + } else if (lookup_limited_quirks(treebuilder, cdoc)) { + treebuilder->tree_handler->set_quirks_mode( + treebuilder->tree_handler->ctx, + HUBBUB_QUIRKS_MODE_LIMITED); + } treebuilder->tree_handler->unref_node( treebuilder->tree_handler->ctx, appended); -- cgit v1.2.3