diff options
-rw-r--r-- | include/hubbub/types.h | 3 | ||||
-rw-r--r-- | src/tokeniser/tokeniser.c | 99 | ||||
-rw-r--r-- | src/treebuilder/in_body.c | 6 | ||||
-rw-r--r-- | src/treebuilder/in_foreign_content.c | 5 | ||||
-rw-r--r-- | src/treebuilder/in_head.c | 8 | ||||
-rw-r--r-- | src/treebuilder/internal.h | 2 | ||||
-rw-r--r-- | src/treebuilder/treebuilder.c | 18 | ||||
-rw-r--r-- | test/data/tree-construction/INDEX | 2 |
8 files changed, 89 insertions, 54 deletions
diff --git a/include/hubbub/types.h b/include/hubbub/types.h index 6e2b1a9..a812e9f 100644 --- a/include/hubbub/types.h +++ b/include/hubbub/types.h @@ -34,7 +34,8 @@ typedef enum hubbub_content_model { HUBBUB_CONTENT_MODEL_RCDATA, HUBBUB_CONTENT_MODEL_CDATA, HUBBUB_CONTENT_MODEL_PLAINTEXT, - HUBBUB_CONTENT_MODEL_RAWTEXT + HUBBUB_CONTENT_MODEL_RAWTEXT, + HUBBUB_CONTENT_MODEL_SCRIPTDATA } hubbub_content_model; /** diff --git a/src/tokeniser/tokeniser.c b/src/tokeniser/tokeniser.c index 1d16ba4..500a88d 100644 --- a/src/tokeniser/tokeniser.c +++ b/src/tokeniser/tokeniser.c @@ -426,7 +426,17 @@ hubbub_error hubbub_tokeniser_setopt(hubbub_tokeniser *tokeniser, tokeniser->error_pw = params->error_handler.pw; break; case HUBBUB_TOKENISER_CONTENT_MODEL: - tokeniser->content_model = params->content_model.model; + if(params->content_model.model == HUBBUB_CONTENT_MODEL_SCRIPTDATA) { + /*There is no other way to achieve this switch, + since we have multiple states coming + into some handlers. The only way to + avoid this is to not club the states at all, + and have standalone handlers for each state in + script domain.*/ + tokeniser->state = STATE_SCRIPT_DATA; + } else { + tokeniser->content_model = params->content_model.model; + } break; case HUBBUB_TOKENISER_PROCESS_CDATA: tokeniser->process_cdata_section = params->process_cdata; @@ -774,6 +784,7 @@ hubbub_error hubbub_tokeniser_handle_data(hubbub_tokeniser *tokeniser) /* Buffer '<' */ tokeniser->context.pending = len; + if(tokeniser->state == STATE_SCRIPT_DATA) { tokeniser->state = STATE_SCRIPT_DATA_LESS_THAN; } else if(tokeniser->state == STATE_SCRIPT_DATA_ESCAPED) { @@ -813,15 +824,15 @@ hubbub_error hubbub_tokeniser_handle_data(hubbub_tokeniser *tokeniser) } /* Emit a null or a replacement character */ - if(tokeniser->content_model != HUBBUB_CONTENT_MODEL_PCDATA) { + if(tokeniser->content_model != HUBBUB_CONTENT_MODEL_PCDATA || + tokeniser->state == STATE_SCRIPT_DATA || + tokeniser->state == STATE_SCRIPT_DATA_ESCAPED || 
+ tokeniser->state == STATE_SCRIPT_DATA_DOUBLE_ESCAPED) { emit_character_token(tokeniser, &u_fffd_str); } else { emit_character_token(tokeniser, &u_null_str); } - if(tokeniser->state == STATE_SCRIPT_DATA_DOUBLE_ESCAPED || - tokeniser->state == STATE_SCRIPT_DATA_ESCAPED) { - tokeniser->state = STATE_DATA; - } + /* Advance past NUL */ parserutils_inputstream_advance(tokeniser->input, 1); } else if (c == '\r' && tokeniser->content_model != HUBBUB_CONTENT_MODEL_PLAINTEXT) { @@ -855,14 +866,22 @@ hubbub_error hubbub_tokeniser_handle_data(hubbub_tokeniser *tokeniser) tokeniser->state = STATE_SCRIPT_DATA_ESCAPED_DASH; } tokeniser->context.pending += len; + break; } else { /* Just collect into buffer */ tokeniser->context.pending += len; } } - if (tokeniser->state != STATE_TAG_OPEN && - (tokeniser->state != STATE_DATA || error == PARSERUTILS_EOF) && + if (tokeniser->state != STATE_SCRIPT_DATA_LESS_THAN && + tokeniser->state != STATE_SCRIPT_DATA_ESCAPED_LESS_THAN && + tokeniser->state != STATE_SCRIPT_DATA_DOUBLE_ESCAPED_LESS_THAN && + tokeniser->state != STATE_TAG_OPEN && + ((tokeniser->state != STATE_DATA && + tokeniser->state != STATE_SCRIPT_DATA && + tokeniser->state != STATE_SCRIPT_DATA_ESCAPED && + tokeniser->state != STATE_SCRIPT_DATA_DOUBLE_ESCAPED) || + error == PARSERUTILS_EOF) && tokeniser->context.pending > 0) { /* Emit any pending characters */ emit_current_chars(tokeniser); @@ -872,9 +891,7 @@ hubbub_error hubbub_tokeniser_handle_data(hubbub_tokeniser *tokeniser) /* there is no need to switch from SCRIPT_DATA or the SCRIPT_DATA_ESCAPED states to the DATA state because all the neccessary emitting is being done here itself. 
- Still, this is being done here to mantain the conformance - to specs */ - tokeniser->state = STATE_DATA; + */ token.type = HUBBUB_TOKEN_EOF; hubbub_tokeniser_emit_token(tokeniser, &token); } @@ -952,7 +969,6 @@ hubbub_error hubbub_tokeniser_handle_tag_open(hubbub_tokeniser *tokeniser) const uint8_t *cptr; parserutils_error error; uint8_t c; - assert(tokeniser->context.pending == 1); /* assert(tokeniser->context.chars.ptr[0] == '<'); */ @@ -996,7 +1012,6 @@ hubbub_error hubbub_tokeniser_handle_tag_open(hubbub_tokeniser *tokeniser) if(tokeniser->state == STATE_SCRIPT_DATA_LESS_THAN) { tokeniser->state = STATE_SCRIPT_DATA_ESCAPE_START; tokeniser->context.pending += len; - return emit_current_chars(tokeniser); } else { parserutils_inputstream_advance(tokeniser->input, SLEN("<!")); @@ -1069,7 +1084,9 @@ hubbub_error hubbub_tokeniser_handle_close_tag_open(hubbub_tokeniser *tokeniser) if (tokeniser->content_model == HUBBUB_CONTENT_MODEL_RCDATA || tokeniser->content_model == HUBBUB_CONTENT_MODEL_RAWTEXT || tokeniser->content_model == - HUBBUB_CONTENT_MODEL_CDATA) { + HUBBUB_CONTENT_MODEL_CDATA || + tokeniser->state == STATE_SCRIPT_DATA_END_TAG_OPEN || + tokeniser->state == STATE_SCRIPT_DATA_ESCAPED_END_TAG_OPEN) { uint8_t *start_tag_name = tokeniser->context.last_start_tag_name; size_t start_tag_len = @@ -1233,21 +1250,20 @@ hubbub_error hubbub_tokeniser_handle_tag_name(hubbub_tokeniser *tokeniser) if (error != PARSERUTILS_OK) { if (error == PARSERUTILS_EOF) { - tokeniser->state = STATE_DATA; if(tokeniser->content_model == HUBBUB_CONTENT_MODEL_RCDATA || tokeniser->content_model == HUBBUB_CONTENT_MODEL_RAWTEXT) { - return emit_current_chars(tokeniser); + tokeniser->state = STATE_DATA; } else if(tokeniser->state == STATE_SCRIPT_DATA_END_TAG_NAME) { tokeniser->state = STATE_SCRIPT_DATA; - return emit_current_chars(tokeniser); } else if(tokeniser->state == STATE_SCRIPT_DATA_ESCAPED_END_TAG_NAME) { tokeniser->state = STATE_SCRIPT_DATA_ESCAPED; - return 
emit_current_chars(tokeniser); } else { parserutils_inputstream_advance(tokeniser->input, tokeniser->context.pending); + tokeniser->state = STATE_DATA; return HUBBUB_OK; } + return emit_current_chars(tokeniser); } else { return hubbub_error_from_parserutils_error(error); } @@ -1262,8 +1278,10 @@ hubbub_error hubbub_tokeniser_handle_tag_name(hubbub_tokeniser *tokeniser) COLLECT(ctag->name, &c, len); tokeniser->context.pending += len; } else if (tokeniser->context.close_tag_match.match == false && - tokeniser->content_model != - HUBBUB_CONTENT_MODEL_PCDATA) { + (tokeniser->content_model != + HUBBUB_CONTENT_MODEL_PCDATA || + tokeniser->state == STATE_SCRIPT_DATA_END_TAG_NAME || + tokeniser->state == STATE_SCRIPT_DATA_ESCAPED_END_TAG_NAME)) { /* We should emit "</" here, but instead we leave it in the * buffer so the data state emits it with any characters * following it */ @@ -1274,8 +1292,6 @@ hubbub_error hubbub_tokeniser_handle_tag_name(hubbub_tokeniser *tokeniser) } else { tokeniser->state = STATE_DATA; } - - return emit_current_chars(tokeniser); } else { if (c == '\t' || c == '\n' || c == '\f' || c == ' ' || c == '\r') { tokeniser->context.pending += len; @@ -1296,10 +1312,10 @@ hubbub_error hubbub_tokeniser_handle_tag_name(hubbub_tokeniser *tokeniser) return emit_current_chars(tokeniser); } else if(tokeniser->state == STATE_SCRIPT_DATA_END_TAG_NAME) { tokeniser->state = STATE_SCRIPT_DATA; - tokeniser->context.pending += len; + return emit_current_chars(tokeniser); } else if(tokeniser->state == STATE_SCRIPT_DATA_ESCAPED_END_TAG_NAME) { tokeniser->state = STATE_SCRIPT_DATA_ESCAPED; - tokeniser->context.pending += len; + return emit_current_chars(tokeniser); } else { COLLECT(ctag->name, cptr, len); tokeniser->context.pending += len; @@ -1322,7 +1338,7 @@ hubbub_error hubbub_tokeniser_handle_script_data_escaped_dash(hubbub_tokeniser * if (error != PARSERUTILS_OK) { if (error == PARSERUTILS_EOF) { tokeniser->state = STATE_DATA; - return emit_current_chars(tokeniser); 
+ return HUBBUB_OK; } else { return hubbub_error_from_parserutils_error(error); } @@ -1339,8 +1355,10 @@ hubbub_error hubbub_tokeniser_handle_script_data_escaped_dash(hubbub_tokeniser * tokeniser->context.pending += len; } else if(c == '<') { - /*emit any pending characters*/ - emit_current_chars(tokeniser); + if(tokeniser->context.pending > 0) { + /*emit any pending characters*/ + emit_current_chars(tokeniser); + } if(tokeniser->state == STATE_SCRIPT_DATA_ESCAPED_DASH || tokeniser->state == STATE_SCRIPT_DATA_ESCAPED_DASH_DASH) { tokeniser->state = STATE_SCRIPT_DATA_ESCAPED_LESS_THAN; @@ -1362,8 +1380,6 @@ hubbub_error hubbub_tokeniser_handle_script_data_escaped_dash(hubbub_tokeniser * if(err != HUBBUB_OK) return err; - tokeniser->context.pending += len; - parserutils_inputstream_advance(tokeniser->input, len); if(tokeniser->state == STATE_SCRIPT_DATA_ESCAPED_DASH || @@ -1399,21 +1415,27 @@ hubbub_error hubbub_tokeniser_handle_script_data_escaped_less_than(hubbub_tokeni if(error != PARSERUTILS_OK) { if (error == PARSERUTILS_EOF) { if(tokeniser->state == STATE_SCRIPT_DATA_DOUBLE_ESCAPED_LESS_THAN) { - tokeniser->state = STATE_SCRIPT_DATA_ESCAPED; - } else { tokeniser->state = STATE_SCRIPT_DATA_DOUBLE_ESCAPED; + } else { + tokeniser->state = STATE_SCRIPT_DATA_ESCAPED; } - return emit_current_chars(tokeniser); + return HUBBUB_OK; } else { return hubbub_error_from_parserutils_error(error); } } c = *cptr; - if (c == '/') { tokeniser->context.pending += len; + tokeniser->context.close_tag_match.match = false; + tokeniser->context.close_tag_match.count = 0; + if(tokeniser->state == STATE_SCRIPT_DATA_DOUBLE_ESCAPED_LESS_THAN) { + START_BUF(ctag->name, "", 0); + ctag->n_attributes = 0; + tokeniser->context.current_tag_type = + HUBBUB_TOKEN_END_TAG; tokeniser->state = STATE_SCRIPT_DATA_DOUBLE_ESCAPE_END; } else { tokeniser->state = STATE_SCRIPT_DATA_ESCAPED_END_TAG_OPEN; @@ -1443,6 +1465,7 @@ hubbub_error hubbub_tokeniser_handle_script_data_escaped_less_than(hubbub_tokeni 
tokeniser->state = STATE_SCRIPT_DATA_DOUBLE_ESCAPE_START; } else { tokeniser->state = STATE_SCRIPT_DATA_ESCAPED; + return emit_current_chars(tokeniser); } return HUBBUB_OK; } @@ -1499,7 +1522,7 @@ hubbub_error hubbub_tokeniser_handle_script_data_double_escape_start(hubbub_toke } else { tokeniser->state = STATE_SCRIPT_DATA_ESCAPED; } - return emit_current_chars(tokeniser); + return HUBBUB_OK; } else { return hubbub_error_from_parserutils_error(error); } @@ -1509,7 +1532,7 @@ hubbub_error hubbub_tokeniser_handle_script_data_double_escape_start(hubbub_toke if (c == '\t' || c == '\f' || c == '\r' || c == ' ' || c == '/' || c == '>') { - if(hubbub_string_match_ci(ctag->name.ptr, + if(hubbub_string_match_ci(tokeniser->buffer->data, ctag->name.len, S("script"))) { if(end) { @@ -1526,12 +1549,13 @@ hubbub_error hubbub_tokeniser_handle_script_data_double_escape_start(hubbub_toke } tokeniser->context.pending += len; + return emit_current_chars(tokeniser); } else if ('A' <= c && c <= 'Z') { uint8_t lc = (c + 0x20); - COLLECT(ctag->name, &lc, len); + COLLECT_MS(ctag->name, &lc, len); tokeniser->context.pending += len; } else if('a' <=c && c <= 'z') { - COLLECT(ctag->name, &c, len); + COLLECT_MS(ctag->name, &c, len); tokeniser->context.pending += len; } else { if(end) { @@ -3246,6 +3270,7 @@ hubbub_error hubbub_tokeniser_handle_cdata_block(hubbub_tokeniser *tokeniser) } else if (c == '>' && tokeniser->context.match_cdata.end == 2) { /* Remove the previous two "]]" */ tokeniser->context.pending -= 2; + tokeniser->context.match_cdata.end = 0; /* Emit any pending characters */ emit_current_chars(tokeniser); diff --git a/src/treebuilder/in_body.c b/src/treebuilder/in_body.c index d684702..dcccdd0 100644 --- a/src/treebuilder/in_body.c +++ b/src/treebuilder/in_body.c @@ -325,7 +325,7 @@ hubbub_error process_start_tag(hubbub_treebuilder *treebuilder, treebuilder->context.frameset_ok = false; - err = parse_generic_rcdata(treebuilder, token, false); + err = 
parse_generic_rcdata(treebuilder, token, HUBBUB_CONTENT_MODEL_RAWTEXT); } else if (type == TABLE) { if(treebuilder->quirks_mode != HUBBUB_QUIRKS_MODE_FULL && element_in_scope(treebuilder, P, @@ -384,7 +384,7 @@ hubbub_error process_start_tag(hubbub_treebuilder *treebuilder, type == NOSCRIPT)) { if (type == IFRAME) treebuilder->context.frameset_ok = false; - err = parse_generic_rcdata(treebuilder, token, false); + err = parse_generic_rcdata(treebuilder, token, HUBBUB_CONTENT_MODEL_RAWTEXT); } else if (type == SELECT) { err = process_select_in_body(treebuilder, token); if (err != HUBBUB_OK) @@ -1403,7 +1403,7 @@ hubbub_error process_textarea_in_body(hubbub_treebuilder *treebuilder, { treebuilder->context.strip_leading_lr = true; treebuilder->context.frameset_ok = false; - return parse_generic_rcdata(treebuilder, token, true); + return parse_generic_rcdata(treebuilder, token, HUBBUB_CONTENT_MODEL_RCDATA); } /** diff --git a/src/treebuilder/in_foreign_content.c b/src/treebuilder/in_foreign_content.c index 73a061d..97fb1d0 100644 --- a/src/treebuilder/in_foreign_content.c +++ b/src/treebuilder/in_foreign_content.c @@ -551,12 +551,13 @@ hubbub_error handle_in_foreign_content(hubbub_treebuilder *treebuilder, hubbub_ns ns; element_type type; void *node_iterator; - while(node_iterator !=stack[node].node) { + void *vnode = stack[node].node; + do{ element_stack_pop(treebuilder, &ns, &type, &node_iterator); treebuilder->tree_handler->unref_node( treebuilder->tree_handler->ctx, node_iterator); - } + } while(node_iterator != vnode); return HUBBUB_OK; } if(stack[node].ns == HUBBUB_NS_HTML) { diff --git a/src/treebuilder/in_head.c b/src/treebuilder/in_head.c index 9568789..52a2e3f 100644 --- a/src/treebuilder/in_head.c +++ b/src/treebuilder/in_head.c @@ -142,13 +142,13 @@ hubbub_error handle_in_head(hubbub_treebuilder *treebuilder, } else if (type == META) { err = process_meta_in_head(treebuilder, token); } else if (type == TITLE) { - err = parse_generic_rcdata(treebuilder, token, 
true); + err = parse_generic_rcdata(treebuilder, token, HUBBUB_CONTENT_MODEL_RCDATA); } else if (type == NOFRAMES || type == STYLE) { - err = parse_generic_rcdata(treebuilder, token, false); + err = parse_generic_rcdata(treebuilder, token, HUBBUB_CONTENT_MODEL_RAWTEXT); } else if (type == NOSCRIPT) { if (treebuilder->context.enable_scripting) { err = parse_generic_rcdata(treebuilder, token, - false); + HUBBUB_CONTENT_MODEL_RAWTEXT); } else { err = insert_element(treebuilder, &token->data.tag, true); @@ -161,7 +161,7 @@ hubbub_error handle_in_head(hubbub_treebuilder *treebuilder, /** \todo need to ensure that the client callback * sets the parser-inserted/already-executed script * flags. */ - err = parse_generic_rcdata(treebuilder, token, false); + err = parse_generic_rcdata(treebuilder, token, HUBBUB_CONTENT_MODEL_SCRIPTDATA); } else if (type == HEAD) { /** \todo parse error */ } else { diff --git a/src/treebuilder/internal.h b/src/treebuilder/internal.h index a5f0b22..5d3c75f 100644 --- a/src/treebuilder/internal.h +++ b/src/treebuilder/internal.h @@ -146,7 +146,7 @@ hubbub_error process_in_table_text( hubbub_error process_comment_append(hubbub_treebuilder *treebuilder, const hubbub_token *token, void *parent); hubbub_error parse_generic_rcdata(hubbub_treebuilder *treebuilder, - const hubbub_token *token, bool rcdata); + const hubbub_token *token, hubbub_content_model content_model); uint32_t element_in_scope(hubbub_treebuilder *treebuilder, element_type type, element_scope scope); diff --git a/src/treebuilder/treebuilder.c b/src/treebuilder/treebuilder.c index 568a589..eb6bb51 100644 --- a/src/treebuilder/treebuilder.c +++ b/src/treebuilder/treebuilder.c @@ -379,6 +379,14 @@ hubbub_error hubbub_treebuilder_token_handler(const hubbub_token *token, } } + hubbub_tokeniser_optparams params; + hubbub_error e; + params.process_cdata = (treebuilder->context.element_stack[ + treebuilder->context.current_node].ns != HUBBUB_NS_HTML); + e = 
hubbub_tokeniser_setopt(treebuilder->tokeniser, HUBBUB_TOKENISER_PROCESS_CDATA, &params); + UNUSED(e); + assert(e == HUBBUB_OK); + return err; } @@ -523,11 +531,11 @@ hubbub_error process_comment_append(hubbub_treebuilder *treebuilder, * * \param treebuilder The treebuilder instance * \param token The current token - * \param rcdata True for RCDATA, false for CDATA + * \param content_model The content-model in which the algorithm is being evoked * \return HUBBUB_OK on success, appropriate error otherwise */ hubbub_error parse_generic_rcdata(hubbub_treebuilder *treebuilder, - const hubbub_token *token, bool rcdata) + const hubbub_token *token, hubbub_content_model content_model) { hubbub_error error; element_type type; @@ -539,10 +547,10 @@ hubbub_error parse_generic_rcdata(hubbub_treebuilder *treebuilder, if (error != HUBBUB_OK) return error; - params.content_model.model = rcdata ? HUBBUB_CONTENT_MODEL_RCDATA - : HUBBUB_CONTENT_MODEL_RAWTEXT; + params.content_model.model = content_model; error = hubbub_tokeniser_setopt(treebuilder->tokeniser, - HUBBUB_TOKENISER_CONTENT_MODEL, &params); + HUBBUB_TOKENISER_CONTENT_MODEL, &params); + /* There is no way that setopt can fail. Ensure this. */ assert(error == HUBBUB_OK); diff --git a/test/data/tree-construction/INDEX b/test/data/tree-construction/INDEX index 45126b7..c249b98 100644 --- a/test/data/tree-construction/INDEX +++ b/test/data/tree-construction/INDEX @@ -21,7 +21,7 @@ pending-spec-changes.dat html5lib treebuilder tests #plain-text-unsafe.dat NA README.md html5lib treebuilder tests regression.dat html5lib treebuilder tests -#scriptdata01.dat NA +scriptdata01.dat NA #scripted/adoption01.dat NA #scripted/ark.dat NA #scripted/webkit01.dat NA |