summary | refs | log | tree | commit | diff
diff options
context:
space:
mode:
author Rupinder Singh Khokhar <rsk1coder99@gmail.com> 2014-07-13 03:16:10 (GMT)
committer Rupinder Singh Khokhar <rsk1coder99@gmail.com> 2014-08-01 16:14:33 (GMT)
commit 5d07e38342f6a65cec3e661447d539b71401b49b (patch)
tree abb4df0110dd8b013a41c479cc46dd4cff076952
parent cc0119cafe9c29bfb42573d65a2012dce8628c76 (diff)
download libhubbub-5d07e38342f6a65cec3e661447d539b71401b49b.tar.gz
libhubbub-5d07e38342f6a65cec3e661447d539b71401b49b.tar.bz2
This is perhaps the best way to treat an incoming script content_model_flag. Black-boxing is maintained, and a switch is allowed only to the script data state. The script content model can't be incorporated in the same style as RCDATA and RAWTEXT, where it was easy to make a one-to-one mapping between handlers and states. Also fixed the tokeniser to handle script tags properly. The tokeniser was earlier modified in commit 7b6b8eb6fcbdd175540902ca699e7e704b90f9e0; it has now been tested and the bugs removed. Additionally, on every iteration of the dispatcher loop, the treebuilder checks whether it is safe for the tokeniser to process CDATA sections and sets the corresponding option on the tokeniser. This may slow the library down because the check is repeated on every iteration. The tokeniser code has become unbearably messy due to the script tags, so a little tidying up and optimisation will be done later ;)
-rw-r--r-- include/hubbub/types.h 3
-rw-r--r-- src/tokeniser/tokeniser.c 99
-rw-r--r-- src/treebuilder/in_body.c 6
-rw-r--r-- src/treebuilder/in_foreign_content.c 5
-rw-r--r-- src/treebuilder/in_head.c 8
-rw-r--r-- src/treebuilder/internal.h 2
-rw-r--r-- src/treebuilder/treebuilder.c 18
-rw-r--r-- test/data/tree-construction/INDEX 2
8 files changed, 89 insertions, 54 deletions
diff --git a/include/hubbub/types.h b/include/hubbub/types.h
index 6e2b1a9..a812e9f 100644
--- a/include/hubbub/types.h
+++ b/include/hubbub/types.h
@@ -34,7 +34,8 @@ typedef enum hubbub_content_model {
HUBBUB_CONTENT_MODEL_RCDATA,
HUBBUB_CONTENT_MODEL_CDATA,
HUBBUB_CONTENT_MODEL_PLAINTEXT,
- HUBBUB_CONTENT_MODEL_RAWTEXT
+ HUBBUB_CONTENT_MODEL_RAWTEXT,
+ HUBBUB_CONTENT_MODEL_SCRIPTDATA
} hubbub_content_model;
/**
diff --git a/src/tokeniser/tokeniser.c b/src/tokeniser/tokeniser.c
index 1d16ba4..500a88d 100644
--- a/src/tokeniser/tokeniser.c
+++ b/src/tokeniser/tokeniser.c
@@ -426,7 +426,17 @@ hubbub_error hubbub_tokeniser_setopt(hubbub_tokeniser *tokeniser,
tokeniser->error_pw = params->error_handler.pw;
break;
case HUBBUB_TOKENISER_CONTENT_MODEL:
- tokeniser->content_model = params->content_model.model;
+ if(params->content_model.model == HUBBUB_CONTENT_MODEL_SCRIPTDATA) {
+ /*There is no other way to achieve this switch,
+ since we have multiple states coming
+ into some handlers. The only way to
+ avoid this is to not club the states at all,
+ and have standalone handlers for each state in
+ script domain.*/
+ tokeniser->state = STATE_SCRIPT_DATA;
+ } else {
+ tokeniser->content_model = params->content_model.model;
+ }
break;
case HUBBUB_TOKENISER_PROCESS_CDATA:
tokeniser->process_cdata_section = params->process_cdata;
@@ -774,6 +784,7 @@ hubbub_error hubbub_tokeniser_handle_data(hubbub_tokeniser *tokeniser)
/* Buffer '<' */
tokeniser->context.pending = len;
+
if(tokeniser->state == STATE_SCRIPT_DATA) {
tokeniser->state = STATE_SCRIPT_DATA_LESS_THAN;
} else if(tokeniser->state == STATE_SCRIPT_DATA_ESCAPED) {
@@ -813,15 +824,15 @@ hubbub_error hubbub_tokeniser_handle_data(hubbub_tokeniser *tokeniser)
}
/* Emit a null or a replacement character */
- if(tokeniser->content_model != HUBBUB_CONTENT_MODEL_PCDATA) {
+ if(tokeniser->content_model != HUBBUB_CONTENT_MODEL_PCDATA ||
+ tokeniser->state == STATE_SCRIPT_DATA ||
+ tokeniser->state == STATE_SCRIPT_DATA_ESCAPED ||
+ tokeniser->state == STATE_SCRIPT_DATA_DOUBLE_ESCAPED) {
emit_character_token(tokeniser, &u_fffd_str);
} else {
emit_character_token(tokeniser, &u_null_str);
}
- if(tokeniser->state == STATE_SCRIPT_DATA_DOUBLE_ESCAPED ||
- tokeniser->state == STATE_SCRIPT_DATA_ESCAPED) {
- tokeniser->state = STATE_DATA;
- }
+
/* Advance past NUL */
parserutils_inputstream_advance(tokeniser->input, 1);
} else if (c == '\r' && tokeniser->content_model != HUBBUB_CONTENT_MODEL_PLAINTEXT) {
@@ -855,14 +866,22 @@ hubbub_error hubbub_tokeniser_handle_data(hubbub_tokeniser *tokeniser)
tokeniser->state = STATE_SCRIPT_DATA_ESCAPED_DASH;
}
tokeniser->context.pending += len;
+ break;
} else {
/* Just collect into buffer */
tokeniser->context.pending += len;
}
}
- if (tokeniser->state != STATE_TAG_OPEN &&
- (tokeniser->state != STATE_DATA || error == PARSERUTILS_EOF) &&
+ if (tokeniser->state != STATE_SCRIPT_DATA_LESS_THAN &&
+ tokeniser->state != STATE_SCRIPT_DATA_ESCAPED_LESS_THAN &&
+ tokeniser->state != STATE_SCRIPT_DATA_DOUBLE_ESCAPED_LESS_THAN &&
+ tokeniser->state != STATE_TAG_OPEN &&
+ ((tokeniser->state != STATE_DATA &&
+ tokeniser->state != STATE_SCRIPT_DATA &&
+ tokeniser->state != STATE_SCRIPT_DATA_ESCAPED &&
+ tokeniser->state != STATE_SCRIPT_DATA_DOUBLE_ESCAPED) ||
+ error == PARSERUTILS_EOF) &&
tokeniser->context.pending > 0) {
/* Emit any pending characters */
emit_current_chars(tokeniser);
@@ -872,9 +891,7 @@ hubbub_error hubbub_tokeniser_handle_data(hubbub_tokeniser *tokeniser)
/* there is no need to switch from SCRIPT_DATA or
the SCRIPT_DATA_ESCAPED states to the DATA state because
all the neccessary emitting is being done here itself.
- Still, this is being done here to mantain the conformance
- to specs */
- tokeniser->state = STATE_DATA;
+ */
token.type = HUBBUB_TOKEN_EOF;
hubbub_tokeniser_emit_token(tokeniser, &token);
}
@@ -952,7 +969,6 @@ hubbub_error hubbub_tokeniser_handle_tag_open(hubbub_tokeniser *tokeniser)
const uint8_t *cptr;
parserutils_error error;
uint8_t c;
-
assert(tokeniser->context.pending == 1);
/* assert(tokeniser->context.chars.ptr[0] == '<'); */
@@ -996,7 +1012,6 @@ hubbub_error hubbub_tokeniser_handle_tag_open(hubbub_tokeniser *tokeniser)
if(tokeniser->state == STATE_SCRIPT_DATA_LESS_THAN) {
tokeniser->state = STATE_SCRIPT_DATA_ESCAPE_START;
tokeniser->context.pending += len;
- return emit_current_chars(tokeniser);
} else {
parserutils_inputstream_advance(tokeniser->input,
SLEN("<!"));
@@ -1069,7 +1084,9 @@ hubbub_error hubbub_tokeniser_handle_close_tag_open(hubbub_tokeniser *tokeniser)
if (tokeniser->content_model == HUBBUB_CONTENT_MODEL_RCDATA ||
tokeniser->content_model == HUBBUB_CONTENT_MODEL_RAWTEXT ||
tokeniser->content_model ==
- HUBBUB_CONTENT_MODEL_CDATA) {
+ HUBBUB_CONTENT_MODEL_CDATA ||
+ tokeniser->state == STATE_SCRIPT_DATA_END_TAG_OPEN ||
+ tokeniser->state == STATE_SCRIPT_DATA_ESCAPED_END_TAG_OPEN) {
uint8_t *start_tag_name =
tokeniser->context.last_start_tag_name;
size_t start_tag_len =
@@ -1233,21 +1250,20 @@ hubbub_error hubbub_tokeniser_handle_tag_name(hubbub_tokeniser *tokeniser)
if (error != PARSERUTILS_OK) {
if (error == PARSERUTILS_EOF) {
- tokeniser->state = STATE_DATA;
if(tokeniser->content_model == HUBBUB_CONTENT_MODEL_RCDATA ||
tokeniser->content_model == HUBBUB_CONTENT_MODEL_RAWTEXT) {
- return emit_current_chars(tokeniser);
+ tokeniser->state = STATE_DATA;
} else if(tokeniser->state == STATE_SCRIPT_DATA_END_TAG_NAME) {
tokeniser->state = STATE_SCRIPT_DATA;
- return emit_current_chars(tokeniser);
} else if(tokeniser->state == STATE_SCRIPT_DATA_ESCAPED_END_TAG_NAME) {
tokeniser->state = STATE_SCRIPT_DATA_ESCAPED;
- return emit_current_chars(tokeniser);
} else {
parserutils_inputstream_advance(tokeniser->input,
tokeniser->context.pending);
+ tokeniser->state = STATE_DATA;
return HUBBUB_OK;
}
+ return emit_current_chars(tokeniser);
} else {
return hubbub_error_from_parserutils_error(error);
}
@@ -1262,8 +1278,10 @@ hubbub_error hubbub_tokeniser_handle_tag_name(hubbub_tokeniser *tokeniser)
COLLECT(ctag->name, &c, len);
tokeniser->context.pending += len;
} else if (tokeniser->context.close_tag_match.match == false &&
- tokeniser->content_model !=
- HUBBUB_CONTENT_MODEL_PCDATA) {
+ (tokeniser->content_model !=
+ HUBBUB_CONTENT_MODEL_PCDATA ||
+ tokeniser->state == STATE_SCRIPT_DATA_END_TAG_NAME ||
+ tokeniser->state == STATE_SCRIPT_DATA_ESCAPED_END_TAG_NAME)) {
/* We should emit "</" here, but instead we leave it in the
* buffer so the data state emits it with any characters
* following it */
@@ -1274,8 +1292,6 @@ hubbub_error hubbub_tokeniser_handle_tag_name(hubbub_tokeniser *tokeniser)
} else {
tokeniser->state = STATE_DATA;
}
-
- return emit_current_chars(tokeniser);
} else {
if (c == '\t' || c == '\n' || c == '\f' || c == ' ' || c == '\r') {
tokeniser->context.pending += len;
@@ -1296,10 +1312,10 @@ hubbub_error hubbub_tokeniser_handle_tag_name(hubbub_tokeniser *tokeniser)
return emit_current_chars(tokeniser);
} else if(tokeniser->state == STATE_SCRIPT_DATA_END_TAG_NAME) {
tokeniser->state = STATE_SCRIPT_DATA;
- tokeniser->context.pending += len;
+ return emit_current_chars(tokeniser);
} else if(tokeniser->state == STATE_SCRIPT_DATA_ESCAPED_END_TAG_NAME) {
tokeniser->state = STATE_SCRIPT_DATA_ESCAPED;
- tokeniser->context.pending += len;
+ return emit_current_chars(tokeniser);
} else {
COLLECT(ctag->name, cptr, len);
tokeniser->context.pending += len;
@@ -1322,7 +1338,7 @@ hubbub_error hubbub_tokeniser_handle_script_data_escaped_dash(hubbub_tokeniser *
if (error != PARSERUTILS_OK) {
if (error == PARSERUTILS_EOF) {
tokeniser->state = STATE_DATA;
- return emit_current_chars(tokeniser);
+ return HUBBUB_OK;
} else {
return hubbub_error_from_parserutils_error(error);
}
@@ -1339,8 +1355,10 @@ hubbub_error hubbub_tokeniser_handle_script_data_escaped_dash(hubbub_tokeniser *
tokeniser->context.pending += len;
} else if(c == '<') {
- /*emit any pending characters*/
- emit_current_chars(tokeniser);
+ if(tokeniser->context.pending > 0) {
+ /*emit any pending characters*/
+ emit_current_chars(tokeniser);
+ }
if(tokeniser->state == STATE_SCRIPT_DATA_ESCAPED_DASH ||
tokeniser->state == STATE_SCRIPT_DATA_ESCAPED_DASH_DASH) {
tokeniser->state = STATE_SCRIPT_DATA_ESCAPED_LESS_THAN;
@@ -1362,8 +1380,6 @@ hubbub_error hubbub_tokeniser_handle_script_data_escaped_dash(hubbub_tokeniser *
if(err != HUBBUB_OK)
return err;
- tokeniser->context.pending += len;
-
parserutils_inputstream_advance(tokeniser->input, len);
if(tokeniser->state == STATE_SCRIPT_DATA_ESCAPED_DASH ||
@@ -1399,21 +1415,27 @@ hubbub_error hubbub_tokeniser_handle_script_data_escaped_less_than(hubbub_tokeni
if(error != PARSERUTILS_OK) {
if (error == PARSERUTILS_EOF) {
if(tokeniser->state == STATE_SCRIPT_DATA_DOUBLE_ESCAPED_LESS_THAN) {
- tokeniser->state = STATE_SCRIPT_DATA_ESCAPED;
- } else {
tokeniser->state = STATE_SCRIPT_DATA_DOUBLE_ESCAPED;
+ } else {
+ tokeniser->state = STATE_SCRIPT_DATA_ESCAPED;
}
- return emit_current_chars(tokeniser);
+ return HUBBUB_OK;
} else {
return hubbub_error_from_parserutils_error(error);
}
}
c = *cptr;
-
if (c == '/') {
tokeniser->context.pending += len;
+ tokeniser->context.close_tag_match.match = false;
+ tokeniser->context.close_tag_match.count = 0;
+
if(tokeniser->state == STATE_SCRIPT_DATA_DOUBLE_ESCAPED_LESS_THAN) {
+ START_BUF(ctag->name, "", 0);
+ ctag->n_attributes = 0;
+ tokeniser->context.current_tag_type =
+ HUBBUB_TOKEN_END_TAG;
tokeniser->state = STATE_SCRIPT_DATA_DOUBLE_ESCAPE_END;
} else {
tokeniser->state = STATE_SCRIPT_DATA_ESCAPED_END_TAG_OPEN;
@@ -1443,6 +1465,7 @@ hubbub_error hubbub_tokeniser_handle_script_data_escaped_less_than(hubbub_tokeni
tokeniser->state = STATE_SCRIPT_DATA_DOUBLE_ESCAPE_START;
} else {
tokeniser->state = STATE_SCRIPT_DATA_ESCAPED;
+ return emit_current_chars(tokeniser);
}
return HUBBUB_OK;
}
@@ -1499,7 +1522,7 @@ hubbub_error hubbub_tokeniser_handle_script_data_double_escape_start(hubbub_toke
} else {
tokeniser->state = STATE_SCRIPT_DATA_ESCAPED;
}
- return emit_current_chars(tokeniser);
+ return HUBBUB_OK;
} else {
return hubbub_error_from_parserutils_error(error);
}
@@ -1509,7 +1532,7 @@ hubbub_error hubbub_tokeniser_handle_script_data_double_escape_start(hubbub_toke
if (c == '\t' || c == '\f' || c == '\r' || c == ' ' || c == '/' ||
c == '>') {
- if(hubbub_string_match_ci(ctag->name.ptr,
+ if(hubbub_string_match_ci(tokeniser->buffer->data,
ctag->name.len,
S("script"))) {
if(end) {
@@ -1526,12 +1549,13 @@ hubbub_error hubbub_tokeniser_handle_script_data_double_escape_start(hubbub_toke
}
tokeniser->context.pending += len;
+ return emit_current_chars(tokeniser);
} else if ('A' <= c && c <= 'Z') {
uint8_t lc = (c + 0x20);
- COLLECT(ctag->name, &lc, len);
+ COLLECT_MS(ctag->name, &lc, len);
tokeniser->context.pending += len;
} else if('a' <=c && c <= 'z') {
- COLLECT(ctag->name, &c, len);
+ COLLECT_MS(ctag->name, &c, len);
tokeniser->context.pending += len;
} else {
if(end) {
@@ -3246,6 +3270,7 @@ hubbub_error hubbub_tokeniser_handle_cdata_block(hubbub_tokeniser *tokeniser)
} else if (c == '>' && tokeniser->context.match_cdata.end == 2) {
/* Remove the previous two "]]" */
tokeniser->context.pending -= 2;
+ tokeniser->context.match_cdata.end = 0;
/* Emit any pending characters */
emit_current_chars(tokeniser);
diff --git a/src/treebuilder/in_body.c b/src/treebuilder/in_body.c
index d684702..dcccdd0 100644
--- a/src/treebuilder/in_body.c
+++ b/src/treebuilder/in_body.c
@@ -325,7 +325,7 @@ hubbub_error process_start_tag(hubbub_treebuilder *treebuilder,
treebuilder->context.frameset_ok = false;
- err = parse_generic_rcdata(treebuilder, token, false);
+ err = parse_generic_rcdata(treebuilder, token, HUBBUB_CONTENT_MODEL_RAWTEXT);
} else if (type == TABLE) {
if(treebuilder->quirks_mode != HUBBUB_QUIRKS_MODE_FULL &&
element_in_scope(treebuilder, P,
@@ -384,7 +384,7 @@ hubbub_error process_start_tag(hubbub_treebuilder *treebuilder,
type == NOSCRIPT)) {
if (type == IFRAME)
treebuilder->context.frameset_ok = false;
- err = parse_generic_rcdata(treebuilder, token, false);
+ err = parse_generic_rcdata(treebuilder, token, HUBBUB_CONTENT_MODEL_RAWTEXT);
} else if (type == SELECT) {
err = process_select_in_body(treebuilder, token);
if (err != HUBBUB_OK)
@@ -1403,7 +1403,7 @@ hubbub_error process_textarea_in_body(hubbub_treebuilder *treebuilder,
{
treebuilder->context.strip_leading_lr = true;
treebuilder->context.frameset_ok = false;
- return parse_generic_rcdata(treebuilder, token, true);
+ return parse_generic_rcdata(treebuilder, token, HUBBUB_CONTENT_MODEL_RCDATA);
}
/**
diff --git a/src/treebuilder/in_foreign_content.c b/src/treebuilder/in_foreign_content.c
index 73a061d..97fb1d0 100644
--- a/src/treebuilder/in_foreign_content.c
+++ b/src/treebuilder/in_foreign_content.c
@@ -551,12 +551,13 @@ hubbub_error handle_in_foreign_content(hubbub_treebuilder *treebuilder,
hubbub_ns ns;
element_type type;
void *node_iterator;
- while(node_iterator !=stack[node].node) {
+ void *vnode = stack[node].node;
+ do{
element_stack_pop(treebuilder, &ns, &type, &node_iterator);
treebuilder->tree_handler->unref_node(
treebuilder->tree_handler->ctx,
node_iterator);
- }
+ } while(node_iterator != vnode);
return HUBBUB_OK;
}
if(stack[node].ns == HUBBUB_NS_HTML) {
diff --git a/src/treebuilder/in_head.c b/src/treebuilder/in_head.c
index 9568789..52a2e3f 100644
--- a/src/treebuilder/in_head.c
+++ b/src/treebuilder/in_head.c
@@ -142,13 +142,13 @@ hubbub_error handle_in_head(hubbub_treebuilder *treebuilder,
} else if (type == META) {
err = process_meta_in_head(treebuilder, token);
} else if (type == TITLE) {
- err = parse_generic_rcdata(treebuilder, token, true);
+ err = parse_generic_rcdata(treebuilder, token, HUBBUB_CONTENT_MODEL_RCDATA);
} else if (type == NOFRAMES || type == STYLE) {
- err = parse_generic_rcdata(treebuilder, token, false);
+ err = parse_generic_rcdata(treebuilder, token, HUBBUB_CONTENT_MODEL_RAWTEXT);
} else if (type == NOSCRIPT) {
if (treebuilder->context.enable_scripting) {
err = parse_generic_rcdata(treebuilder, token,
- false);
+ HUBBUB_CONTENT_MODEL_RAWTEXT);
} else {
err = insert_element(treebuilder,
&token->data.tag, true);
@@ -161,7 +161,7 @@ hubbub_error handle_in_head(hubbub_treebuilder *treebuilder,
/** \todo need to ensure that the client callback
* sets the parser-inserted/already-executed script
* flags. */
- err = parse_generic_rcdata(treebuilder, token, false);
+ err = parse_generic_rcdata(treebuilder, token, HUBBUB_CONTENT_MODEL_SCRIPTDATA);
} else if (type == HEAD) {
/** \todo parse error */
} else {
diff --git a/src/treebuilder/internal.h b/src/treebuilder/internal.h
index a5f0b22..5d3c75f 100644
--- a/src/treebuilder/internal.h
+++ b/src/treebuilder/internal.h
@@ -146,7 +146,7 @@ hubbub_error process_in_table_text(
hubbub_error process_comment_append(hubbub_treebuilder *treebuilder,
const hubbub_token *token, void *parent);
hubbub_error parse_generic_rcdata(hubbub_treebuilder *treebuilder,
- const hubbub_token *token, bool rcdata);
+ const hubbub_token *token, hubbub_content_model content_model);
uint32_t element_in_scope(hubbub_treebuilder *treebuilder,
element_type type, element_scope scope);
diff --git a/src/treebuilder/treebuilder.c b/src/treebuilder/treebuilder.c
index 568a589..eb6bb51 100644
--- a/src/treebuilder/treebuilder.c
+++ b/src/treebuilder/treebuilder.c
@@ -379,6 +379,14 @@ hubbub_error hubbub_treebuilder_token_handler(const hubbub_token *token,
}
}
+ hubbub_tokeniser_optparams params;
+ hubbub_error e;
+ params.process_cdata = (treebuilder->context.element_stack[
+ treebuilder->context.current_node].ns != HUBBUB_NS_HTML);
+ e = hubbub_tokeniser_setopt(treebuilder->tokeniser, HUBBUB_TOKENISER_PROCESS_CDATA, &params);
+ UNUSED(e);
+ assert(e == HUBBUB_OK);
+
return err;
}
@@ -523,11 +531,11 @@ hubbub_error process_comment_append(hubbub_treebuilder *treebuilder,
*
* \param treebuilder The treebuilder instance
* \param token The current token
- * \param rcdata True for RCDATA, false for CDATA
+ * \param content_model The content-model in which the algorithm is being evoked
* \return HUBBUB_OK on success, appropriate error otherwise
*/
hubbub_error parse_generic_rcdata(hubbub_treebuilder *treebuilder,
- const hubbub_token *token, bool rcdata)
+ const hubbub_token *token, hubbub_content_model content_model)
{
hubbub_error error;
element_type type;
@@ -539,10 +547,10 @@ hubbub_error parse_generic_rcdata(hubbub_treebuilder *treebuilder,
if (error != HUBBUB_OK)
return error;
- params.content_model.model = rcdata ? HUBBUB_CONTENT_MODEL_RCDATA
- : HUBBUB_CONTENT_MODEL_RAWTEXT;
+ params.content_model.model = content_model;
error = hubbub_tokeniser_setopt(treebuilder->tokeniser,
- HUBBUB_TOKENISER_CONTENT_MODEL, &params);
+ HUBBUB_TOKENISER_CONTENT_MODEL, &params);
+
/* There is no way that setopt can fail. Ensure this. */
assert(error == HUBBUB_OK);
diff --git a/test/data/tree-construction/INDEX b/test/data/tree-construction/INDEX
index 45126b7..c249b98 100644
--- a/test/data/tree-construction/INDEX
+++ b/test/data/tree-construction/INDEX
@@ -21,7 +21,7 @@ pending-spec-changes.dat html5lib treebuilder tests
#plain-text-unsafe.dat NA
README.md html5lib treebuilder tests
regression.dat html5lib treebuilder tests
-#scriptdata01.dat NA
+scriptdata01.dat NA
#scripted/adoption01.dat NA
#scripted/ark.dat NA
#scripted/webkit01.dat NA