From 2a8f109871049a5b2376c95e91eeaed4172f89c2 Mon Sep 17 00:00:00 2001 From: John Mark Bell Date: Thu, 5 Mar 2009 11:00:00 +0000 Subject: Make hubbub parser binding build trees correctly (credit: Bo Yang) svn path=/trunk/dom/; revision=6711 --- bindings/hubbub/parser.c | 723 ++++++++++++++++++++++++++++++++++++++++------- bindings/hubbub/parser.h | 33 ++- 2 files changed, 649 insertions(+), 107 deletions(-) (limited to 'bindings') diff --git a/bindings/hubbub/parser.c b/bindings/hubbub/parser.c index 7b5e6ab..0ad4176 100644 --- a/bindings/hubbub/parser.c +++ b/bindings/hubbub/parser.c @@ -3,6 +3,7 @@ * Licensed under the MIT License, * http://www.opensource.org/licenses/mit-license.php * Copyright 2007 John-Mark Bell + * Copyright 2009 Bo Yang */ #include @@ -16,13 +17,19 @@ #include "utils.h" /** - * libdom Hubbub parser object + * libdom Hubbub parser context */ struct dom_hubbub_parser { hubbub_parser *parser; /**< Hubbub parser instance */ + hubbub_tree_handler tree_handler; + /**< Hubbub parser tree handler */ struct dom_document *doc; /**< DOM Document we're building */ + dom_hubbub_encoding_source encoding_source; + /**< The document's encoding source */ + const char *encoding; /**< The document's encoding */ + bool complete; /**< Indicate stream completion */ struct dom_implementation *impl;/**< DOM implementation */ @@ -34,39 +41,83 @@ struct dom_hubbub_parser { void *mctx; /**< Pointer to client data */ }; -static hubbub_error __dom_hubbub_token_handler(const hubbub_token *token, - void *pw); -static bool __initialised; +/* The callbacks declarations */ +static int create_comment(void *parser, const hubbub_string *data, + void **result); +static int create_doctype(void *parser, const hubbub_doctype *doctype, + void **result); +static int create_element(void *parser, const hubbub_tag *tag, void **result); +static int create_text(void *parser, const hubbub_string *data, + void **result); +static int ref_node(void *parser, void *node); +static int 
unref_node(void *parser, void *node); +static int append_child(void *parser, void *parent, void *child, + void **result); +static int insert_before(void *parser, void *parent, void *child, + void *ref_child, void **result); +static int remove_child(void *parser, void *parent, void *child, + void **result); +static int clone_node(void *parser, void *node, bool deep, void **result); +static int reparent_children(void *parser, void *node, void *new_parent); +static int get_parent(void *parser, void *node, bool element_only, + void **result); +static int has_children(void *parser, void *node, bool *result); +static int form_associate(void *parser, void *form, void *node); +static int add_attributes(void *parser, void *node, + const hubbub_attribute *attributes, uint32_t n_attributes); +static int set_quirks_mode(void *parser, hubbub_quirks_mode mode); +static int change_encoding(void *parser, const char *charset); + +static hubbub_tree_handler tree_handler = { + create_comment, + create_doctype, + create_element, + create_text, + ref_node, + unref_node, + append_child, + insert_before, + remove_child, + clone_node, + reparent_children, + get_parent, + has_children, + form_associate, + add_attributes, + set_quirks_mode, + change_encoding, + NULL +}; + +static bool __initialised = false; /** * Create a Hubbub parser instance * * \param aliases Path to encoding alias mapping file * \param enc Source charset, or NULL - * \param int_enc Desired charset of document buffer (UTF-8 or UTF-16) + * \param fix_enc Whether fix the encoding * \param alloc Memory (de)allocation function * \param pw Pointer to client-specific private data * \param msg Informational message function * \param mctx Pointer to client-specific private data * \return Pointer to instance, or NULL on memory exhaustion */ -dom_hubbub_parser *dom_hubbub_parser_create(const char *aliases, - const char *enc, const char *int_enc, +dom_hubbub_parser *dom_hubbub_parser_create(const char *aliases, + const char *enc, 
bool fix_enc, dom_alloc alloc, void *pw, dom_msg msg, void *mctx) { dom_hubbub_parser *parser; hubbub_parser_optparams params; - struct dom_string *features; + hubbub_error error; dom_exception err; - hubbub_error e; - - UNUSED(int_enc); + struct dom_string *features; if (__initialised == false) { - e = hubbub_initialise(aliases, (hubbub_alloc) alloc, pw); - if (e != HUBBUB_OK) { - msg(DOM_MSG_ERROR, mctx, + error = hubbub_initialise(aliases, (hubbub_alloc) alloc, pw); + if (error != HUBBUB_OK) { + msg(DOM_MSG_ERROR, mctx, "Failed initialising hubbub"); return NULL; } @@ -74,36 +125,33 @@ dom_hubbub_parser *dom_hubbub_parser_create(const char *aliases, __initialised = true; } + parser = alloc(NULL, sizeof(dom_hubbub_parser), pw); if (parser == NULL) { - msg(DOM_MSG_CRITICAL, mctx, "No memory for parser"); + msg(DOM_MSG_CRITICAL, mctx, "No memory for parsing context"); return NULL; } - e = hubbub_parser_create(enc, true, (hubbub_alloc) alloc, pw, - &parser->parser); - if (e != HUBBUB_OK) { - alloc(parser, 0, pw); - msg(DOM_MSG_CRITICAL, mctx, "Failed to create hubbub parser"); - return NULL; - } + parser->parser = NULL; + parser->doc = NULL; + parser->encoding = enc; + parser->encoding_source = enc != NULL ? 
ENCODING_SOURCE_HEADER + : ENCODING_SOURCE_DETECTED; + parser->complete = false; + parser->impl = NULL; - params.token_handler.handler = __dom_hubbub_token_handler; - params.token_handler.pw = parser; - e = hubbub_parser_setopt(parser->parser, HUBBUB_PARSER_TOKEN_HANDLER, - &params); - if (e != HUBBUB_OK) { - hubbub_parser_destroy(parser->parser); - alloc(parser, 0, pw); - msg(DOM_MSG_CRITICAL, mctx, - "Failed registering hubbub token handler"); + parser->alloc = alloc; + parser->pw = pw; + parser->msg = msg; + parser->mctx = mctx; + + error = hubbub_parser_create(enc, fix_enc, alloc, pw, &parser->parser); + if (error != HUBBUB_OK) { + parser->alloc(parser, 0, parser->pw); + msg(DOM_MSG_CRITICAL, mctx, "Can't create parser"); return NULL; } - parser->doc = NULL; - - parser->complete = false; - /* Get DOM implementation */ /* Create string representation of the features we want */ err = dom_string_create(alloc, pw, @@ -115,7 +163,7 @@ dom_hubbub_parser *dom_hubbub_parser_create(const char *aliases, return NULL; } - /* Now, try to get an appropriate implementation from the registry */ + /* Now, try to get an appropriate implementation from the registry */ err = dom_implregistry_get_dom_implementation(features, &parser->impl, alloc, pw); if (err != DOM_NO_ERR) { @@ -129,11 +177,26 @@ dom_hubbub_parser *dom_hubbub_parser_create(const char *aliases, /* no longer need the features string */ dom_string_unref(features); - parser->alloc = alloc; - parser->pw = pw; + err = dom_implementation_create_document(parser->impl, NULL, NULL, NULL, + &parser->doc, alloc, pw); + if (err != DOM_NO_ERR) { + hubbub_parser_destroy(parser->parser); + alloc(parser, 0, pw); + msg(DOM_MSG_ERROR, mctx, "Can't create DOM document"); + return NULL; + } - parser->msg = msg; - parser->mctx = mctx; + parser->tree_handler = tree_handler; + parser->tree_handler.ctx = (void *) parser; + + params.tree_handler = &parser->tree_handler; + hubbub_parser_setopt(parser->parser, HUBBUB_PARSER_TREE_HANDLER, + &params); 
+ + dom_node_ref((struct dom_node *) parser->doc); + params.document_node = parser->doc; + hubbub_parser_setopt(parser->parser, HUBBUB_PARSER_DOCUMENT_NODE, + &params); return parser; } @@ -142,31 +205,29 @@ dom_hubbub_parser *dom_hubbub_parser_create(const char *aliases, void dom_hubbub_parser_destroy(dom_hubbub_parser *parser) { dom_implementation_unref(parser->impl); - hubbub_parser_destroy(parser->parser); + parser->parser = NULL; - /** \todo do we want to clean up the document here too? */ + if (parser->doc != NULL) { + dom_node_unref((struct dom_node *) parser->doc); + parser->doc = NULL; + } parser->alloc(parser, 0, parser->pw); } -/* Parse a chunk of data */ dom_hubbub_error dom_hubbub_parser_parse_chunk(dom_hubbub_parser *parser, uint8_t *data, size_t len) { hubbub_error err; err = hubbub_parser_parse_chunk(parser->parser, data, len); - if (err != HUBBUB_OK) { - parser->msg(DOM_MSG_ERROR, parser->mctx, - "hubbub_parser_parse_chunk failed: %d", err); + if (err != HUBBUB_OK) return DOM_HUBBUB_HUBBUB_ERR | err; - } return DOM_HUBBUB_OK; } -/* Notify parser that datastream is empty */ dom_hubbub_error dom_hubbub_parser_completed(dom_hubbub_parser *parser) { hubbub_error err; @@ -183,73 +244,523 @@ dom_hubbub_error dom_hubbub_parser_completed(dom_hubbub_parser *parser) return DOM_HUBBUB_OK; } -/* Retrieve the created DOM Document */ struct dom_document *dom_hubbub_parser_get_document(dom_hubbub_parser *parser) { - return (parser->complete ? 
parser->doc : NULL); + struct dom_document *doc = NULL; + + if (parser->complete) { + doc = parser->doc; + parser->doc = NULL; + } + + return doc; } -hubbub_error __dom_hubbub_token_handler(const hubbub_token *token, void *pw) +const char *dom_hubbub_parser_get_encoding(dom_hubbub_parser *parser, + dom_hubbub_encoding_source *source) { - dom_hubbub_parser *parser = (dom_hubbub_parser *) pw; - static const char *token_names[] = { - "DOCTYPE", "START TAG", "END TAG", - "COMMENT", "CHARACTERS", "EOF" - }; - size_t i; + *source = parser->encoding_source; + + return parser->encoding != NULL ? parser->encoding + : "Windows-1252"; +} + +/* The callbacks definitions */ +static int create_comment(void *parser, const hubbub_string *data, + void **result) +{ + dom_hubbub_parser *dom_parser = (dom_hubbub_parser *) parser; + dom_exception err; + struct dom_string *str; + struct dom_comment *comment; + + *result = NULL; + + err = dom_string_create(dom_parser->alloc, dom_parser->pw, data->ptr, + data->len, &str); + if (err != DOM_NO_ERR) { + dom_parser->msg(DOM_MSG_CRITICAL, dom_parser->mctx, + "Can't create comment node text"); + return 1; + } + + err = dom_document_create_comment(dom_parser->doc, str, &comment); + if (err != DOM_NO_ERR) { + dom_string_unref(str); + dom_parser->msg(DOM_MSG_CRITICAL, dom_parser->mctx, + "Can't create comment node with text '%.*s'", + data->len, data->ptr); + return 1; + } + + *result = comment; + + dom_string_unref(str); + + return 0; +} + +static int create_doctype(void *parser, const hubbub_doctype *doctype, + void **result) +{ + dom_hubbub_parser *dom_parser = (dom_hubbub_parser *) parser; + dom_exception err; + struct dom_string *qname, *public_id = NULL, *system_id = NULL; + struct dom_document_type *dtype; + + *result = NULL; + + err = dom_string_create(dom_parser->alloc, dom_parser->pw, + doctype->name.ptr, doctype->name.len, &qname); + if (err != DOM_NO_ERR) { + dom_parser->msg(DOM_MSG_CRITICAL, dom_parser->mctx, + "Can't create doctype 
name"); + goto fail; + } + + if (doctype->public_missing == false) { + err = dom_string_create(dom_parser->alloc, dom_parser->pw, + doctype->public_id.ptr, + doctype->public_id.len, &public_id); + } else { + err = dom_string_create(dom_parser->alloc, dom_parser->pw, + NULL, 0, &public_id); + } + if (err != DOM_NO_ERR) { + dom_parser->msg(DOM_MSG_CRITICAL, dom_parser->mctx, + "Can't create doctype public id"); + goto clean1; + } + + if (doctype->system_missing == false) { + err = dom_string_create(dom_parser->alloc, dom_parser->pw, + doctype->system_id.ptr, + doctype->system_id.len, &system_id); + } else { + err = dom_string_create(dom_parser->alloc, dom_parser->pw, + NULL, 0, &system_id); + } + if (err != DOM_NO_ERR) { + dom_parser->msg(DOM_MSG_CRITICAL, dom_parser->mctx, + "Can't create doctype system id"); + goto clean2; + } + + err = dom_implementation_create_document_type(dom_parser->impl, qname, + public_id, system_id, &dtype, dom_parser->alloc, + dom_parser->pw); + if (err != DOM_NO_ERR) { + dom_parser->msg(DOM_MSG_CRITICAL, dom_parser->mctx, + "Can't create the document type"); + goto clean3; + } + + *result = dtype; + +clean3: + dom_string_unref(system_id); + +clean2: + dom_string_unref(public_id); + +clean1: + dom_string_unref(qname); + +fail: + if (*result == NULL) + return 1; + else + return 0; +} + +static int create_element(void *parser, const hubbub_tag *tag, void **result) +{ + dom_hubbub_parser *dom_parser = (dom_hubbub_parser *) parser; + dom_exception err; + struct dom_string *name; + struct dom_element *element = NULL; + + *result = NULL; + + err = dom_string_create(dom_parser->alloc, dom_parser->pw, + tag->name.ptr, tag->name.len, &name); + if (err != DOM_NO_ERR) { + dom_parser->msg(DOM_MSG_CRITICAL, dom_parser->mctx, + "Can't create element name"); + goto fail; + } + + if (tag->ns == HUBBUB_NS_NULL) { + err = dom_document_create_element(dom_parser->doc, name, + &element); + if (err != DOM_NO_ERR) { + dom_parser->msg(DOM_MSG_CRITICAL, 
dom_parser->mctx, + "Can't create the DOM element"); + goto clean1; + } + } else { + err = dom_document_create_element_ns(dom_parser->doc, + dom_namespaces[tag->ns], name, &element); + if (err != DOM_NO_ERR) { + dom_parser->msg(DOM_MSG_CRITICAL, dom_parser->mctx, + "Can't create the DOM element"); + goto clean1; + } + } + + *result = element; +clean1: + dom_string_unref(name); + +fail: + if (*result == NULL) + return 1; + else + return 0; +} + +static int create_text(void *parser, const hubbub_string *data, void **result) +{ + dom_hubbub_parser *dom_parser = (dom_hubbub_parser *) parser; + dom_exception err; + struct dom_string *str; + struct dom_text *text = NULL; + + *result = NULL; + + err = dom_string_create(dom_parser->alloc, dom_parser->pw, data->ptr, + data->len, &str); + if (err != DOM_NO_ERR) { + dom_parser->msg(DOM_MSG_CRITICAL, dom_parser->mctx, + "Can't create text '%.*s'", data->len, + data->ptr); + goto fail; + } + + err = dom_document_create_text_node(dom_parser->doc, str, &text); + if (err != DOM_NO_ERR) { + dom_parser->msg(DOM_MSG_CRITICAL, dom_parser->mctx, + "Can't create the DOM text node"); + goto clean1; + } + + *result = text; +clean1: + dom_string_unref(str); +fail: + if (*result == NULL) + return 1; + else + return 0; + +} + +static int ref_node(void *parser, void *node) +{ + struct dom_node *dnode = (struct dom_node *) node; + + UNUSED(parser); + + dom_node_ref(dnode); + + return 0; +} + +static int unref_node(void *parser, void *node) +{ + struct dom_node *dnode = (struct dom_node *) node; + + UNUSED(parser); + + dom_node_unref(dnode); + + return 0; +} + +static int append_child(void *parser, void *parent, void *child, void **result) +{ + dom_hubbub_parser *dom_parser = (dom_hubbub_parser *) parser; + dom_exception err; + + err = dom_node_append_child((struct dom_node *) parent, + (struct dom_node *) child, + (struct dom_node **) result); + if (err != DOM_NO_ERR) { + dom_parser->msg(DOM_MSG_CRITICAL, dom_parser->mctx, + "Can't append 
child '%p' for parent '%p'", + child, parent); + return 1; + } + + return 0; +} + +static int insert_before(void *parser, void *parent, void *child, + void *ref_child, void **result) +{ + dom_hubbub_parser *dom_parser = (dom_hubbub_parser *) parser; + dom_exception err; + + err = dom_node_insert_before((struct dom_node *) parent, + (struct dom_node *) child, + (struct dom_node *) ref_child, + (struct dom_node **) result); + if (err != DOM_NO_ERR) { + dom_parser->msg(DOM_MSG_CRITICAL, dom_parser->mctx, + "Can't insert node '%p' before node '%p'", + child, ref_child); + return 1; + } + + return 0; +} + +static int remove_child(void *parser, void *parent, void *child, void **result) +{ + dom_hubbub_parser *dom_parser = (dom_hubbub_parser *) parser; + dom_exception err; + + err = dom_node_remove_child((struct dom_node *) parent, + (struct dom_node *) child, + (struct dom_node **) result); + if (err != DOM_NO_ERR) { + dom_parser->msg(DOM_MSG_CRITICAL, dom_parser->mctx, + "Can't remove child '%p'", child); + return 1; + } + + return 0; +} + +static int clone_node(void *parser, void *node, bool deep, void **result) +{ + dom_hubbub_parser *dom_parser = (dom_hubbub_parser *) parser; + dom_exception err; + + err = dom_node_clone_node((struct dom_node *) node, deep, + (struct dom_node **) result); + if (err != DOM_NO_ERR) { + dom_parser->msg(DOM_MSG_CRITICAL, dom_parser->mctx, + "Can't clone node '%p'", node); + return 1; + } + + return 0; +} + +static int reparent_children(void *parser, void *node, void *new_parent) +{ + dom_hubbub_parser *dom_parser = (dom_hubbub_parser *) parser; + dom_exception err; + struct dom_node *child, *result; + + while(true) { + err = dom_node_get_first_child((struct dom_node *) node, + &child); + if (err != DOM_NO_ERR) { + dom_parser->msg(DOM_MSG_CRITICAL, dom_parser->mctx, + "Error in dom_note_get_first_child"); + return 1; + } + if (child == NULL) + break; + + err = dom_node_remove_child(node, (struct dom_node *) child, + &result); + if (err != 
DOM_NO_ERR) { + dom_parser->msg(DOM_MSG_CRITICAL, dom_parser->mctx, + "Error in dom_node_remove_child"); + goto fail; + } + dom_node_unref(result); + + err = dom_node_append_child((struct dom_node *) new_parent, + (struct dom_node *) child, &result); + if (err != DOM_NO_ERR) { + dom_parser->msg(DOM_MSG_CRITICAL, dom_parser->mctx, + "Error in dom_node_append_child"); + goto fail; + } + dom_node_unref(result); + dom_node_unref(child); + } + return 0; + +fail: + dom_node_unref(child); + return 1; +} + +static int get_parent(void *parser, void *node, bool element_only, + void **result) +{ + dom_hubbub_parser *dom_parser = (dom_hubbub_parser *) parser; + dom_exception err; + struct dom_node *parent; + dom_node_type type = DOM_NODE_TYPE_COUNT; + + err = dom_node_get_parent_node((struct dom_node *) node, + &parent); + if (err != DOM_NO_ERR) { + dom_parser->msg(DOM_MSG_CRITICAL, dom_parser->mctx, + "Error in dom_node_get_parent"); + return 1; + } + if (element_only == false) { + *result = parent; + return 0; + } + + err = dom_node_get_node_type(parent, &type); + if (err != DOM_NO_ERR) { + dom_parser->msg(DOM_MSG_CRITICAL, dom_parser->mctx, + "Error in dom_node_get_type"); + goto fail; + } + if (type == DOM_ELEMENT_NODE) { + *result = parent; + return 0; + } else { + *result = NULL; + dom_node_unref(parent); + return 0; + } + +fail: + dom_node_unref(parent); + return 1; +} + +static int has_children(void *parser, void *node, bool *result) +{ + dom_hubbub_parser *dom_parser = (dom_hubbub_parser *) parser; + dom_exception err; + + UNUSED(parser); + + err = dom_node_has_child_nodes((struct dom_node *) node, result); + if (err != DOM_NO_ERR) { + dom_parser->msg(DOM_MSG_CRITICAL, dom_parser->mctx, + "Error in dom_node_has_child_nodes"); + return 1; + } + return 0; +} + +static int form_associate(void *parser, void *form, void *node) +{ UNUSED(parser); + UNUSED(form); + UNUSED(node); - printf("%s: ", token_names[token->type]); - - switch (token->type) { - case 
HUBBUB_TOKEN_DOCTYPE: - printf("'%.*s' (%svalid)\n", - (int) token->data.doctype.name.len, - token->data.doctype.name.ptr, - token->data.doctype.force_quirks ? "in" : ""); - break; - case HUBBUB_TOKEN_START_TAG: - printf("'%.*s' %s\n", - (int) token->data.tag.name.len, - token->data.tag.name.ptr, - (token->data.tag.n_attributes > 0) ? - "attributes:" : ""); - for (i = 0; i < token->data.tag.n_attributes; i++) { - printf("\t'%.*s' = '%.*s'\n", - (int) token->data.tag.attributes[i].name.len, - token->data.tag.attributes[i].name.ptr, - (int) token->data.tag.attributes[i].value.len, - token->data.tag.attributes[i].value.ptr); + return 0; +} + +static int add_attributes(void *parser, void *node, + const hubbub_attribute *attributes, uint32_t n_attributes) +{ + dom_hubbub_parser *dom_parser = (dom_hubbub_parser *) parser; + dom_exception err; + uint32_t i; + + for (i = 0; i < n_attributes; i++) { + struct dom_string *name, *value; + err = dom_string_create(dom_parser->alloc, dom_parser->pw, + attributes[i].name.ptr, + attributes[i].name.len, &name); + if (err != DOM_NO_ERR) { + dom_parser->msg(DOM_MSG_CRITICAL, dom_parser->mctx, + "Can't create attribute name"); + goto fail; } - break; - case HUBBUB_TOKEN_END_TAG: - printf("'%.*s' %s\n", - (int) token->data.tag.name.len, - token->data.tag.name.ptr, - (token->data.tag.n_attributes > 0) ? 
- "attributes:" : ""); - for (i = 0; i < token->data.tag.n_attributes; i++) { - printf("\t'%.*s' = '%.*s'\n", - (int) token->data.tag.attributes[i].name.len, - token->data.tag.attributes[i].name.ptr, - (int) token->data.tag.attributes[i].value.len, - token->data.tag.attributes[i].value.ptr); + + err = dom_string_create(dom_parser->alloc, dom_parser->pw, + attributes[i].value.ptr, + attributes[i].value.len, &value); + if (err != DOM_NO_ERR) { + dom_parser->msg(DOM_MSG_CRITICAL, dom_parser->mctx, + "Can't create attribute value"); + dom_string_unref(name); + goto fail; + } + + if (attributes[i].ns == HUBBUB_NS_NULL) { + err = dom_element_set_attribute( + (struct dom_element *) node, name, + value); + dom_string_unref(name); + dom_string_unref(value); + if (err != DOM_NO_ERR) { + dom_parser->msg(DOM_MSG_CRITICAL, + dom_parser->mctx, + "Can't add attribute"); + goto fail; + } + } else { + err = dom_element_set_attribute_ns( + (struct dom_element *) node, + dom_namespaces[attributes[i].ns], name, + value); + dom_string_unref(name); + dom_string_unref(value); + if (err != DOM_NO_ERR) { + dom_parser->msg(DOM_MSG_CRITICAL, + dom_parser->mctx, + "Can't add attribute ns"); + goto fail; + } } - break; - case HUBBUB_TOKEN_COMMENT: - printf("'%.*s'\n", (int) token->data.comment.len, - token->data.comment.ptr); - break; - case HUBBUB_TOKEN_CHARACTER: - printf("'%.*s'\n", (int) token->data.character.len, - token->data.character.ptr); - break; - case HUBBUB_TOKEN_EOF: - printf("\n"); - break; - } - - return HUBBUB_OK; + } + + return 0; + +fail: + return 1; +} + +static int set_quirks_mode(void *parser, hubbub_quirks_mode mode) +{ + UNUSED(parser); + UNUSED(mode); + + return 0; +} + +static int change_encoding(void *parser, const char *charset) +{ + dom_hubbub_parser *dom_parser = (dom_hubbub_parser *) parser; + uint32_t source; + const char *name; + + /* If we have an encoding here, it means we are *certain* */ + if (dom_parser->encoding != NULL) { + return 0; + } + + /* Find the 
confidence otherwise (can only be from a BOM) */ + name = hubbub_parser_read_charset(dom_parser->parser, &source); + + if (source == HUBBUB_CHARSET_CONFIDENT) { + dom_parser->encoding_source = ENCODING_SOURCE_DETECTED; + dom_parser->encoding = (char *) charset; + return 0; + } + + /* So here we have something of confidence tentative... */ + /* http://www.whatwg.org/specs/web-apps/current-work/#change */ + + /* 2. "If the new encoding is identical or equivalent to the encoding + * that is already being used to interpret the input stream, then set + * the confidence to confident and abort these steps." */ + + /* Whatever happens, the encoding should be set here; either for + * reprocessing with a different charset, or for confirming that the + * charset is in fact correct */ + dom_parser->encoding = charset; + dom_parser->encoding_source = ENCODING_SOURCE_META; + + /* Equal encodings will have the same string pointers */ + return (charset == name) ? 0 : 1; } diff --git a/bindings/hubbub/parser.h b/bindings/hubbub/parser.h index f4c2ac4..7e28916 100644 --- a/bindings/hubbub/parser.h +++ b/bindings/hubbub/parser.h @@ -19,9 +19,37 @@ struct dom_document; typedef struct dom_hubbub_parser dom_hubbub_parser; +/* The encoding source of the document */ +typedef enum dom_hubub_encoding_source { + ENCODING_SOURCE_HEADER, + ENCODING_SOURCE_DETECTED, + ENCODING_SOURCE_META +} dom_hubbub_encoding_source; + +/* The recommended way to use the parser is: + * + * dom_hubbub_parser_create(...); + * dom_hubbub_parser_parse_chunk(...); + * call _parse_chunk for all chunks of data + * + * After you have parsed the data, + * + * dom_hubbub_parser_completed(...); + * dom_hubbub_parser_get_document(...); + * dom_hubbub_parser_destroy(...); + * + * Clients must ensure that the last 3 function calls above are called in + * the order shown. dom_hubbub_parser_get_document() will pass the ownership + * of the document to the client. After that, the parser should be destroyed. 
+ * The client must not call any method of this parser after destruction. + * + * The client must call dom_hubbub_parser_completed() before calling + * dom_hubbub_parser_get_document(). + */ + /* Create a Hubbub parser instance */ dom_hubbub_parser *dom_hubbub_parser_create(const char *aliases, - const char *enc, const char *int_enc, + const char *enc, bool fix_enc, dom_alloc alloc, void *pw, dom_msg msg, void *mctx); /* Destroy a Hubbub parser instance */ @@ -37,4 +65,7 @@ dom_hubbub_error dom_hubbub_parser_completed(dom_hubbub_parser *parser); /* Retrieve the created DOM Document */ struct dom_document *dom_hubbub_parser_get_document(dom_hubbub_parser *parser); +/* Retrieve the document's encoding */ +const char *dom_hubbub_parser_get_encoding(dom_hubbub_parser *parser, + dom_hubbub_encoding_source *source); #endif -- cgit v1.2.3