From 427ce60a0cf055347b2fd7ac4a37bec59d65c3ac Mon Sep 17 00:00:00 2001
From: John Mark Bell
Date: Mon, 7 Apr 2008 02:04:05 +0000
Subject: Implement "in body" insertion mode. Modify treebuilder test driver to
bring it in line with API changes. A few minimal bits of testdata for various
bits of in body. Proper testing will come once we're actually building a
tree.
svn path=/trunk/hubbub/; revision=4076
---
include/hubbub/functypes.h | 35 +-
include/hubbub/tree.h | 6 +-
src/treebuilder/Makefile | 2 +-
src/treebuilder/in_body.c | 1898 +++++++++++++++++++++++++++++++++++++++++
src/treebuilder/in_body.h | 18 +
src/treebuilder/internal.h | 190 +++++
src/treebuilder/treebuilder.c | 801 ++++++++---------
test/data/html/INDEX | 2 +
test/data/html/isindex.html | 8 +
test/data/html/misnested.html | 11 +
test/tree.c | 84 +-
11 files changed, 2597 insertions(+), 458 deletions(-)
create mode 100644 src/treebuilder/in_body.c
create mode 100644 src/treebuilder/in_body.h
create mode 100644 src/treebuilder/internal.h
create mode 100644 test/data/html/isindex.html
create mode 100644 test/data/html/misnested.html
diff --git a/include/hubbub/functypes.h b/include/hubbub/functypes.h
index ddc307a..80c8388 100644
--- a/include/hubbub/functypes.h
+++ b/include/hubbub/functypes.h
@@ -9,6 +9,7 @@
#define hubbub_functypes_h_
#include
+#include
#include
#include
@@ -52,12 +53,6 @@ typedef int (*hubbub_tree_create_doctype)(void *ctx, const hubbub_string *qname,
typedef int (*hubbub_tree_create_element)(void *ctx, const hubbub_tag *tag,
void **result);
-/**
- * Type of tree element node creation function (verbatim name)
- */
-typedef int (*hubbub_tree_create_element_verbatim)(void *ctx,
- const uint8_t *name, size_t name_len, void **result);
-
/**
* Type of tree text node creation function
*/
@@ -98,6 +93,34 @@ typedef int (*hubbub_tree_remove_child)(void *ctx, void *parent, void *child,
typedef int (*hubbub_tree_clone_node)(void *ctx, void *node, bool deep,
void **result);
+/**
+ * Type of child reparenting function
+ */
+typedef int (*hubbub_tree_reparent_children)(void *ctx, void *node,
+ void *new_parent);
+
+/**
+ * Type of parent node acquisition function
+ */
+typedef int (*hubbub_tree_get_parent)(void *ctx, void *node, bool element_only,
+ void **result);
+
+/**
+ * Type of child presence query function
+ */
+typedef int (*hubbub_tree_has_children)(void *ctx, void *node, bool *result);
+
+/**
+ * Type of form association function
+ */
+typedef int (*hubbub_tree_form_associate)(void *ctx, void *form, void *node);
+
+/**
+ * Type of attribute addition function
+ */
+typedef int (*hubbub_tree_add_attributes)(void *ctx, void *node,
+ const hubbub_attribute *attributes, uint32_t n_attributes);
+
/**
* Type of tree quirks mode notification function
*/
diff --git a/include/hubbub/tree.h b/include/hubbub/tree.h
index cc66acf..7e2e11f 100644
--- a/include/hubbub/tree.h
+++ b/include/hubbub/tree.h
@@ -17,7 +17,6 @@ typedef struct hubbub_tree_handler {
hubbub_tree_create_comment create_comment;
hubbub_tree_create_doctype create_doctype;
hubbub_tree_create_element create_element;
- hubbub_tree_create_element_verbatim create_element_verbatim;
hubbub_tree_create_text create_text;
hubbub_tree_ref_node ref_node;
hubbub_tree_unref_node unref_node;
@@ -25,6 +24,11 @@ typedef struct hubbub_tree_handler {
hubbub_tree_insert_before insert_before;
hubbub_tree_remove_child remove_child;
hubbub_tree_clone_node clone_node;
+ hubbub_tree_reparent_children reparent_children;
+ hubbub_tree_get_parent get_parent;
+ hubbub_tree_has_children has_children;
+ hubbub_tree_form_associate form_associate;
+ hubbub_tree_add_attributes add_attributes;
hubbub_tree_set_quirks_mode set_quirks_mode;
void *ctx;
} hubbub_tree_handler;
diff --git a/src/treebuilder/Makefile b/src/treebuilder/Makefile
index d63a7a3..3353a26 100644
--- a/src/treebuilder/Makefile
+++ b/src/treebuilder/Makefile
@@ -22,7 +22,7 @@
CFLAGS += -I$(CURDIR)
# Objects
-OBJS = treebuilder
+OBJS = in_body treebuilder
.PHONY: clean debug distclean export release setup test
diff --git a/src/treebuilder/in_body.c b/src/treebuilder/in_body.c
new file mode 100644
index 0000000..7fefdfd
--- /dev/null
+++ b/src/treebuilder/in_body.c
@@ -0,0 +1,1898 @@
+/*
+ * This file is part of Hubbub.
+ * Licensed under the MIT License,
+ * http://www.opensource.org/licenses/mit-license.php
+ * Copyright 2008 John-Mark Bell
+ */
+
+#include
+#include
+
+#include "treebuilder/in_body.h"
+#include "utils/utils.h"
+
+#undef DEBUG_IN_BODY
+
+/**
+ * Pair of formatting list entry pointers.
+ *
+ * NOTE(review): presumably records a position in the list of active
+ * formatting elements as the entries on either side of it, for use by
+ * aa_find_bookmark_location_reparenting_misnested() -- confirm against
+ * that function's definition, which is not visible here.
+ */
+typedef struct bookmark {
+ formatting_list_entry *prev; /* entry on the "previous" side */
+ formatting_list_entry *next; /* entry on the "next" side */
+} bookmark;
+
+/* Per-token-type handlers, dispatched from handle_in_body() */
+static void process_character(hubbub_treebuilder *treebuilder,
+ const hubbub_token *token);
+static bool process_start_tag(hubbub_treebuilder *treebuilder,
+ const hubbub_token *token);
+static bool process_end_tag(hubbub_treebuilder *treebuilder,
+ const hubbub_token *token);
+
+/* Start tag handlers, selected by element type in process_start_tag() */
+static void process_html_in_body(hubbub_treebuilder *treebuilder,
+ const hubbub_token *token);
+static void process_body_in_body(hubbub_treebuilder *treebuilder,
+ const hubbub_token *token);
+static void process_container_in_body(hubbub_treebuilder *treebuilder,
+ const hubbub_token *token);
+static void process_form_in_body(hubbub_treebuilder *treebuilder,
+ const hubbub_token *token);
+static void process_dd_dt_li_in_body(hubbub_treebuilder *treebuilder,
+ const hubbub_token *token, element_type type);
+static void process_plaintext_in_body(hubbub_treebuilder *treebuilder,
+ const hubbub_token *token);
+static void process_a_in_body(hubbub_treebuilder *treebuilder,
+ const hubbub_token *token);
+static void process_presentational_in_body(hubbub_treebuilder *treebuilder,
+ const hubbub_token *token, element_type type);
+static void process_nobr_in_body(hubbub_treebuilder *treebuilder,
+ const hubbub_token *token);
+static void process_button_in_body(hubbub_treebuilder *treebuilder,
+ const hubbub_token *token);
+static void process_applet_marquee_object_in_body(
+ hubbub_treebuilder *treebuilder, const hubbub_token *token,
+ element_type type);
+static void process_hr_in_body(hubbub_treebuilder *treebuilder,
+ const hubbub_token *token);
+static void process_image_in_body(hubbub_treebuilder *treebuilder,
+ const hubbub_token *token);
+static void process_input_in_body(hubbub_treebuilder *treebuilder,
+ const hubbub_token *token);
+static void process_isindex_in_body(hubbub_treebuilder *treebuilder,
+ const hubbub_token *token);
+static void process_textarea_in_body(hubbub_treebuilder *treebuilder,
+ const hubbub_token *token);
+static void process_select_in_body(hubbub_treebuilder *treebuilder,
+ const hubbub_token *token);
+static void process_phrasing_in_body(hubbub_treebuilder *treebuilder,
+ const hubbub_token *token);
+
+/* process_0* handlers: process_0body_in_body() is called from
+ * process_end_tag() for a BODY end tag, and process_0p_in_body() from
+ * process_container_in_body().
+ * NOTE(review): the remaining ones are presumably end tag handlers as
+ * well -- their callers are outside this view, confirm. */
+static bool process_0body_in_body(hubbub_treebuilder *treebuilder);
+static void process_0container_in_body(hubbub_treebuilder *treebuilder,
+ element_type type);
+static void process_0p_in_body(hubbub_treebuilder *treebuilder);
+static void process_0dd_dt_li_in_body(hubbub_treebuilder *treebuilder,
+ element_type type);
+static void process_0h_in_body(hubbub_treebuilder *treebuilder,
+ element_type type);
+static void process_0presentational_in_body(hubbub_treebuilder *treebuilder,
+ element_type type);
+static void process_0applet_button_marquee_object_in_body(
+ hubbub_treebuilder *treebuilder, element_type type);
+static void process_0br_in_body(hubbub_treebuilder *treebuilder);
+static void process_0generic_in_body(hubbub_treebuilder *treebuilder,
+ element_type type);
+
+/* aa_* helpers.
+ * NOTE(review): presumably the individual steps of the HTML5 "adoption
+ * agency algorithm" for misnested formatting elements; their definitions
+ * and callers are outside this view -- confirm. */
+static bool aa_find_and_validate_formatting_element(
+ hubbub_treebuilder *treebuilder, element_type type,
+ formatting_list_entry **element);
+static formatting_list_entry *aa_find_formatting_element(
+ hubbub_treebuilder *treebuilder, element_type type);
+static bool aa_find_furthest_block(hubbub_treebuilder *treebuilder,
+ formatting_list_entry *formatting_element,
+ uint32_t *furthest_block);
+static void aa_remove_from_parent(hubbub_treebuilder *treebuilder, void *node);
+static void aa_reparent_node(hubbub_treebuilder *treebuilder, void *node,
+ void *new_parent);
+static void aa_find_bookmark_location_reparenting_misnested(
+ hubbub_treebuilder *treebuilder,
+ uint32_t formatting_element, uint32_t furthest_block,
+ bookmark *bookmark, uint32_t *last_node);
+static void aa_remove_element_stack_item(hubbub_treebuilder *treebuilder,
+ uint32_t index, uint32_t limit);
+static void aa_clone_and_replace_entries(hubbub_treebuilder *treebuilder,
+ formatting_list_entry *element);
+static void aa_insert_into_foster_parent(hubbub_treebuilder *treebuilder,
+ void *node);
+
+
+/**
+ * Entry point for the "in body" insertion mode
+ *
+ * Hands the token to the handler matching its type. Any token other than
+ * a character token also cancels a pending leading-LF strip.
+ *
+ * \param treebuilder  Treebuilder instance whose context is updated
+ * \param token        Token to handle
+ * \return True if the token should be processed again, false otherwise
+ */
+bool handle_in_body(hubbub_treebuilder *treebuilder,
+		const hubbub_token *token)
+{
+	bool again = false;
+
+#if !defined(NDEBUG) && defined(DEBUG_IN_BODY)
+	fprintf(stdout, "Processing token %d\n", token->type);
+	element_stack_dump(treebuilder, stdout);
+	formatting_list_dump(treebuilder, stdout);
+#endif
+
+	/* The strip flag only survives until the next token, and only a
+	 * character token gets to act on it (see process_character) */
+	if (token->type != HUBBUB_TOKEN_CHARACTER) {
+		treebuilder->context.strip_leading_lr = false;
+	}
+
+	switch (token->type) {
+	case HUBBUB_TOKEN_START_TAG:
+		again = process_start_tag(treebuilder, token);
+		break;
+	case HUBBUB_TOKEN_END_TAG:
+		again = process_end_tag(treebuilder, token);
+		break;
+	case HUBBUB_TOKEN_CHARACTER:
+		process_character(treebuilder, token);
+		break;
+	case HUBBUB_TOKEN_COMMENT:
+		/* The comment is handed over together with the current node */
+		process_comment_append(treebuilder, token,
+				treebuilder->context.element_stack[
+				treebuilder->context.current_node].node);
+		break;
+	case HUBBUB_TOKEN_DOCTYPE:
+		/** \todo parse error */
+		break;
+	case HUBBUB_TOKEN_EOF:
+	{
+		uint32_t depth = treebuilder->context.current_node;
+
+		/* Walk down the element stack from the current node (the
+		 * entry at index 0 is never examined) and stop at the first
+		 * element that is not in the set below */
+		while (depth > 0) {
+			const element_type open =
+				treebuilder->context.element_stack[depth].type;
+
+			if (open != DD && open != DT && open != LI &&
+					open != P && open != TBODY &&
+					open != TD && open != TFOOT &&
+					open != TH && open != THEAD &&
+					open != TR && open != BODY) {
+				/** \todo parse error */
+				break;
+			}
+
+			depth--;
+		}
+	}
+		break;
+	}
+
+#if !defined(NDEBUG) && defined(DEBUG_IN_BODY)
+	fprintf(stdout, "Processed\n");
+	element_stack_dump(treebuilder, stdout);
+	formatting_list_dump(treebuilder, stdout);
+#endif
+
+	return again;
+}
+
+/**
+ * Process a character token
+ *
+ * If a leading-LF strip is pending (strip_leading_lr, set by
+ * process_start_tag() after a PRE or LISTING start tag), one leading
+ * '\n' is dropped from the text and the flag is cleared.
+ *
+ * A token with no text left -- either because it was empty or because
+ * the stripped LF was all it contained -- is ignored completely: the
+ * active formatting list is not reconstructed and no text is appended.
+ *
+ * \param treebuilder  The treebuilder instance
+ * \param token        The token to process
+ */
+void process_character(hubbub_treebuilder *treebuilder,
+		const hubbub_token *token)
+{
+	hubbub_string dummy = token->data.character;
+
+	if (treebuilder->context.strip_leading_lr) {
+		/* Only look at the first byte if there is one; this also
+		 * keeps len from wrapping around below */
+		if (dummy.len > 0) {
+			const uint8_t *str =
+				treebuilder->input_buffer + dummy.data.off;
+
+			/** \todo UTF-16 */
+			if (*str == '\n') {
+				dummy.data.off++;
+				dummy.len--;
+			}
+		}
+
+		treebuilder->context.strip_leading_lr = false;
+	}
+
+	/* Nothing left to insert: the token is ignored, so it must not
+	 * trigger formatting element reconstruction or create an empty
+	 * text node */
+	if (dummy.len == 0) {
+		return;
+	}
+
+	reconstruct_active_formatting_list(treebuilder);
+
+	append_text(treebuilder, &dummy);
+}
+
+/**
+ * Handle a start or end tag using the "in body" rules
+ *
+ * Only tag tokens are valid here; any other known token type trips an
+ * assertion.
+ *
+ * \param treebuilder  The treebuilder instance
+ * \param token        The tag token to process
+ * \return True if the token should be reprocessed, false otherwise
+ */
+bool process_tag_in_body(hubbub_treebuilder *treebuilder,
+		const hubbub_token *token)
+{
+	switch (token->type) {
+	case HUBBUB_TOKEN_START_TAG:
+		return process_start_tag(treebuilder, token);
+	case HUBBUB_TOKEN_END_TAG:
+		return process_end_tag(treebuilder, token);
+	case HUBBUB_TOKEN_CHARACTER:
+	case HUBBUB_TOKEN_COMMENT:
+	case HUBBUB_TOKEN_DOCTYPE:
+	case HUBBUB_TOKEN_EOF:
+		/* Callers must only pass tag tokens */
+		assert(0);
+		break;
+	}
+
+	/* Reached for non-tag tokens when assertions are compiled out */
+	return false;
+}
+
+/**
+ * Dispatch a start tag token to the handler for its element type
+ *
+ * The element type is looked up from the tag name. Types without a
+ * dedicated case are handled by process_phrasing_in_body().
+ *
+ * \param treebuilder  The treebuilder instance
+ * \param token        The start tag token to process
+ * \return True to reprocess the token (no path here currently asks for it)
+ */
+bool process_start_tag(hubbub_treebuilder *treebuilder,
+		const hubbub_token *token)
+{
+	const element_type type = element_type_from_name(treebuilder,
+			&token->data.tag.name);
+
+	switch (type) {
+	case HTML:
+		process_html_in_body(treebuilder, token);
+		break;
+
+	/* Elements shared with the "in head" handlers */
+	case BASE:
+	case LINK:
+	case META:
+		process_base_link_meta_in_head(treebuilder, token, type);
+		break;
+	case SCRIPT:
+		process_script_in_head(treebuilder, token);
+		break;
+	case STYLE:
+		parse_generic_rcdata(treebuilder, token, false);
+		break;
+	case TITLE:
+		parse_generic_rcdata(treebuilder, token, true);
+		break;
+
+	case BODY:
+		process_body_in_body(treebuilder, token);
+		break;
+
+	/* Generic containers */
+	case ADDRESS:
+	case BLOCKQUOTE:
+	case CENTER:
+	case DIR:
+	case DIV:
+	case DL:
+	case FIELDSET:
+	case H1:
+	case H2:
+	case H3:
+	case H4:
+	case H5:
+	case H6:
+	case MENU:
+	case OL:
+	case P:
+	case UL:
+		process_container_in_body(treebuilder, token);
+		break;
+	case PRE:
+	case LISTING:
+		process_container_in_body(treebuilder, token);
+
+		/* Ask for a leading LF in the following text to be dropped */
+		treebuilder->context.strip_leading_lr = true;
+		break;
+
+	case FORM:
+		process_form_in_body(treebuilder, token);
+		break;
+	case DD:
+	case DT:
+	case LI:
+		process_dd_dt_li_in_body(treebuilder, token, type);
+		break;
+	case PLAINTEXT:
+		process_plaintext_in_body(treebuilder, token);
+		break;
+	case A:
+		process_a_in_body(treebuilder, token);
+		break;
+	case B:
+	case BIG:
+	case EM:
+	case FONT:
+	case I:
+	case S:
+	case SMALL:
+	case STRIKE:
+	case STRONG:
+	case TT:
+	case U:
+		process_presentational_in_body(treebuilder, token, type);
+		break;
+	case NOBR:
+		process_nobr_in_body(treebuilder, token);
+		break;
+	case BUTTON:
+		process_button_in_body(treebuilder, token);
+		break;
+	case APPLET:
+	case MARQUEE:
+	case OBJECT:
+		process_applet_marquee_object_in_body(treebuilder,
+				token, type);
+		break;
+	case XMP:
+		reconstruct_active_formatting_list(treebuilder);
+		parse_generic_rcdata(treebuilder, token, false);
+		break;
+	case TABLE:
+		process_container_in_body(treebuilder, token);
+
+		/* The mode only changes when really in "in body", not when
+		 * these rules are borrowed by another mode */
+		if (treebuilder->context.mode == IN_BODY) {
+			treebuilder->context.mode = IN_TABLE;
+		}
+		break;
+
+	/* Elements inserted without being pushed */
+	case AREA:
+	case BASEFONT:
+	case BGSOUND:
+	case BR:
+	case EMBED:
+	case IMG:
+	case PARAM:
+	case SPACER:
+	case WBR:
+		reconstruct_active_formatting_list(treebuilder);
+		insert_element_no_push(treebuilder, &token->data.tag);
+		break;
+
+	case HR:
+		process_hr_in_body(treebuilder, token);
+		break;
+	case IMAGE:
+		process_image_in_body(treebuilder, token);
+		break;
+	case INPUT:
+		process_input_in_body(treebuilder, token);
+		break;
+	case ISINDEX:
+		process_isindex_in_body(treebuilder, token);
+		break;
+	case TEXTAREA:
+		process_textarea_in_body(treebuilder, token);
+		break;
+
+	/* NOSCRIPT would belong here if scripting were enabled; as it is
+	 * not, NOSCRIPT is left to the default case below */
+	case IFRAME:
+	case NOEMBED:
+	case NOFRAMES:
+		parse_generic_rcdata(treebuilder, token, false);
+		break;
+
+	case SELECT:
+		process_select_in_body(treebuilder, token);
+
+		/* Pick the select mode matching the mode we came from;
+		 * any other mode is left untouched */
+		switch (treebuilder->context.mode) {
+		case IN_BODY:
+			treebuilder->context.mode = IN_SELECT;
+			break;
+		case IN_TABLE:
+		case IN_CAPTION:
+		case IN_COLUMN_GROUP:
+		case IN_TABLE_BODY:
+		case IN_ROW:
+		case IN_CELL:
+			treebuilder->context.mode = IN_SELECT_IN_TABLE;
+			break;
+		default:
+			break;
+		}
+		break;
+
+	/* Start tags that are ignored in this mode */
+	case CAPTION:
+	case COL:
+	case COLGROUP:
+	case FRAME:
+	case FRAMESET:
+	case HEAD:
+	case OPTION:
+	case OPTGROUP:
+	case TBODY:
+	case TD:
+	case TFOOT:
+	case TH:
+	case THEAD:
+	case TR:
+		/** \todo parse error */
+		break;
+
+	/* No special handling yet for EVENT_SOURCE, SECTION, NAV, ARTICLE,
+	 * ASIDE, HEADER, FOOTER, DATAGRID or COMMAND: they are treated
+	 * like any other element */
+	default:
+		process_phrasing_in_body(treebuilder, token);
+		break;
+	}
+
+	/* None of the start tag handlers requests reprocessing */
+	return false;
+}
+
+/**
+ * Process an end tag
+ *
+ * \param treebuilder The treebuilder instance
+ * \param token The token to process
+ * \return True to reprocess the token
+ */
+bool process_end_tag(hubbub_treebuilder *treebuilder,
+ const hubbub_token *token)
+{
+ bool reprocess = false;
+ element_type type = element_type_from_name(treebuilder,
+ &token->data.tag.name);
+
+ if (type == BODY) {
+ if (process_0body_in_body(treebuilder) &&
+ treebuilder->context.mode == IN_BODY) {
+ treebuilder->context.mode = AFTER_BODY;
+ }
+ } else if (type == HTML) {
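+ /* Act as if </body>
[... text lost in extraction: everything between the "</body>" above and the "<body>" below was stripped from this copy of the patch, including the rest of process_end_tag() and the definitions that followed it ...]
+/**
+ * Process a <body> start tag as if in "in body"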
+ *
+ * \param treebuilder The treebuilder instance
+ * \param token The token to process
+ */
+void process_body_in_body(hubbub_treebuilder *treebuilder,
+		const hubbub_token *token)
+{
+	/** \todo parse error */
+
+	/* The tag's attributes are added to the element at stack index 1,
+	 * but only when that entry exists and is a BODY element; otherwise
+	 * the tag is dropped */
+	if (treebuilder->context.current_node >= 1 &&
+			treebuilder->context.element_stack[1].type == BODY) {
+		treebuilder->tree_handler->add_attributes(
+				treebuilder->tree_handler->ctx,
+				treebuilder->context.element_stack[1].node,
+				token->data.tag.attributes,
+				token->data.tag.n_attributes);
+	}
+}
+
+/**
+ * Handle the start tag of a generic container element as if in "in body"
+ *
+ * If a P element is in scope, process_0p_in_body() is run first; the
+ * new element is then inserted.
+ *
+ * \param treebuilder  The treebuilder instance
+ * \param token        The start tag token to process
+ */
+void process_container_in_body(hubbub_treebuilder *treebuilder,
+		const hubbub_token *token)
+{
+	const bool p_in_scope = element_in_scope(treebuilder, P, false);
+
+	if (p_in_scope) {
+		process_0p_in_body(treebuilder);
+	}
+
+	insert_element(treebuilder, &token->data.tag);
+}
+
+/**
+ * Process a