From 02f5504de388eb69ded0bc9e05361d8db82c2137 Mon Sep 17 00:00:00 2001 From: Rupinder Singh Khokhar Date: Fri, 13 Jun 2014 05:00:11 +0530 Subject: [Fix] tokeniser wrongly emitted a replacement character instead of utf8 NULL. Also, the tester used strlen to calculate string lengths--this seg faults if a null is passed-- this is also fixed. --- src/tokeniser/tokeniser.c | 10 ++++++++-- test/data/tokeniser2/INDEX | 2 +- test/testutils.h | 12 ++++++++++++ test/tokeniser2.c | 32 ++++++++++++++++---------------- test/tokeniser3.c | 32 ++++++++++++++++---------------- 5 files changed, 53 insertions(+), 35 deletions(-) diff --git a/src/tokeniser/tokeniser.c b/src/tokeniser/tokeniser.c index 7355f80..d108490 100644 --- a/src/tokeniser/tokeniser.c +++ b/src/tokeniser/tokeniser.c @@ -44,6 +44,12 @@ static const hubbub_string u_fffd_str = { u_fffd, sizeof(u_fffd) }; static const uint8_t lf = '\n'; static const hubbub_string lf_str = { &lf, 1 }; +/** + * UTF-8 encoding of U+0000 NULL CHARACTER + */ +static const uint8_t u_null[1] = { '\x00'}; +static const hubbub_string u_null_str = { u_null, sizeof(u_null) }; + /** * Tokeniser states @@ -751,8 +757,8 @@ hubbub_error hubbub_tokeniser_handle_data(hubbub_tokeniser *tokeniser) emit_current_chars(tokeniser); } - /* Emit a replacement character */ - emit_character_token(tokeniser, &u_fffd_str); + /* Emit a null character */ + emit_character_token(tokeniser, &u_null_str); /* Advance past NUL */ parserutils_inputstream_advance(tokeniser->input, 1); diff --git a/test/data/tokeniser2/INDEX b/test/data/tokeniser2/INDEX index 57c0cad..1f6ea93 100644 --- a/test/data/tokeniser2/INDEX +++ b/test/data/tokeniser2/INDEX @@ -3,7 +3,7 @@ # Test Description test1.test html5lib tests (part 1) -#test2.test html5lib tests (part 2) +test2.test html5lib tests (part 2) #test3.test html5lib tests (part 3) #test4.test html5lib tests (part 4) #contentModelFlags.test html5lib content model tests diff --git a/test/testutils.h b/test/testutils.h index 45870f9..7a8eda5 100644 --- a/test/testutils.h +++ b/test/testutils.h @@ -10,6 +10,18 @@ #define UNUSED(x) ((x) = (x)) #endif +#ifndef LEN +uint32_t string_length(const char *str); +uint32_t string_length(const char *str) +{ + if(str == NULL) + return 0; + return strlen(str); +} +#define LEN(x) string_length(x) +#endif + + /* Redefine assert, so we can simply use the standard assert mechanism * within testcases and exit with the right output for the testrunner * to do the right thing. */ diff --git a/test/tokeniser2.c b/test/tokeniser2.c index c8ab9c0..07f355a 100644 --- a/test/tokeniser2.c +++ b/test/tokeniser2.c @@ -134,7 +134,7 @@ void run_test(context *ctx) if (ctx->last_start_tag != NULL) { /* Fake up a start tag, in PCDATA state */ - size_t len = strlen(ctx->last_start_tag) + 3; + size_t len = LEN(ctx->last_start_tag) + 3; uint8_t *buf = malloc(len); snprintf((char *) buf, len, "<%s>", @@ -302,21 +302,21 @@ hubbub_error token_handler(const hubbub_token *token, void *pw) (int) token->data.doctype.system_id.len); } - assert(token->data.doctype.name.len == strlen(expname)); - assert(strncmp(gotname, expname, strlen(expname)) == 0); + assert(token->data.doctype.name.len == LEN(expname)); + assert(strncmp(gotname, expname, LEN(expname)) == 0); assert((exppub == NULL) == (token->data.doctype.public_missing == true)); if (exppub) { - assert(token->data.doctype.public_id.len == strlen(exppub)); - assert(strncmp(gotpub, exppub, strlen(exppub)) == 0); + assert(token->data.doctype.public_id.len == LEN(exppub)); + assert(strncmp(gotpub, exppub, LEN(exppub)) == 0); } assert((expsys == NULL) == (token->data.doctype.system_missing == true)); if (gotsys) { - assert(token->data.doctype.system_id.len == strlen(expsys)); - assert(strncmp(gotsys, expsys, strlen(expsys)) == 0); + assert(token->data.doctype.system_id.len == LEN(expsys)); + assert(strncmp(gotsys, expsys, LEN(expsys)) == 0); } assert(expquirks == token->data.doctype.force_quirks); @@ -348,8 +348,8 @@ hubbub_error token_handler(const hubbub_token *token, void *pw) printf("attributes:\n"); } - assert(token->data.tag.name.len == strlen(expname)); - assert(strncmp(tagname, expname, strlen(expname)) == 0); + assert(token->data.tag.name.len == LEN(expname)); + assert(strncmp(tagname, expname, LEN(expname)) == 0); assert((token->data.tag.n_attributes == 0) == (expattrs == NULL)); @@ -373,11 +373,11 @@ hubbub_error token_handler(const hubbub_token *token, void *pw) (int) namelen, gotname, (int) vallen, gotval); - assert(namelen == strlen(expname)); + assert(namelen == LEN(expname)); assert(strncmp(gotname, expname, strlen(expname)) == 0); - assert(vallen == strlen(expval)); - assert(strncmp(gotval, expval, strlen(expval)) == 0); + assert(vallen == LEN(expval)); + assert(strncmp(gotval, expval, LEN(expval)) == 0); expattrs = expattrs->next; } @@ -398,8 +398,8 @@ hubbub_error token_handler(const hubbub_token *token, void *pw) (token->data.tag.n_attributes > 0) ? "attributes:" : ""); - assert(token->data.tag.name.len == strlen(expname)); - assert(strncmp(tagname, expname, strlen(expname)) == 0); + assert(token->data.tag.name.len == LEN(expname)); + assert(strncmp(tagname, expname, LEN(expname)) == 0); } break; case HUBBUB_TOKEN_COMMENT: @@ -413,8 +413,8 @@ hubbub_error token_handler(const hubbub_token *token, void *pw) printf(" got: '%.*s'\n", (int) token->data.comment.len, gotstr); - assert(token->data.comment.len == strlen(expstr)); - assert(strncmp(gotstr, expstr, strlen(expstr)) == 0); + assert(token->data.comment.len == LEN(expstr)); + assert(strncmp(gotstr, expstr, LEN(expstr)) == 0); } break; case HUBBUB_TOKEN_CHARACTER: diff --git a/test/tokeniser3.c b/test/tokeniser3.c index 949ddd0..e68a230 100644 --- a/test/tokeniser3.c +++ b/test/tokeniser3.c @@ -132,7 +132,7 @@ void run_test(context *ctx) if (ctx->last_start_tag != NULL) { /* Fake up a start tag, in PCDATA state */ - size_t len = strlen(ctx->last_start_tag) + 3; + size_t len = LEN(ctx->last_start_tag) + 3; uint8_t *buf = malloc(len); snprintf((char *) buf, len, "<%s>", @@ -308,21 +308,21 @@ hubbub_error token_handler(const hubbub_token *token, void *pw) (int) token->data.doctype.system_id.len); } - assert(token->data.doctype.name.len == strlen(expname)); - assert(strncmp(gotname, expname, strlen(expname)) == 0); + assert(token->data.doctype.name.len == LEN(expname)); + assert(strncmp(gotname, expname, LEN(expname)) == 0); assert((exppub == NULL) == (token->data.doctype.public_missing == true)); if (exppub) { - assert(token->data.doctype.public_id.len == strlen(exppub)); - assert(strncmp(gotpub, exppub, strlen(exppub)) == 0); + assert(token->data.doctype.public_id.len == LEN(exppub)); + assert(strncmp(gotpub, exppub, LEN(exppub)) == 0); } assert((expsys == NULL) == (token->data.doctype.system_missing == true)); if (gotsys) { - assert(token->data.doctype.system_id.len == strlen(expsys)); - assert(strncmp(gotsys, expsys, strlen(expsys)) == 0); + assert(token->data.doctype.system_id.len == LEN(expsys)); + assert(strncmp(gotsys, expsys, LEN(expsys)) == 0); } assert(expquirks == token->data.doctype.force_quirks); @@ -354,8 +354,8 @@ hubbub_error token_handler(const hubbub_token *token, void *pw) printf("attributes:\n"); } - assert(token->data.tag.name.len == strlen(expname)); - assert(strncmp(tagname, expname, strlen(expname)) == 0); + assert(token->data.tag.name.len == LEN(expname)); + assert(strncmp(tagname, expname, LEN(expname)) == 0); assert((token->data.tag.n_attributes == 0) == (expattrs == NULL)); @@ -379,11 +379,11 @@ hubbub_error token_handler(const hubbub_token *token, void *pw) (int) namelen, gotname, (int) vallen, gotval); - assert(namelen == strlen(expname)); + assert(namelen == LEN(expname)); assert(strncmp(gotname, expname, strlen(expname)) == 0); - assert(vallen == strlen(expval)); - assert(strncmp(gotval, expval, strlen(expval)) == 0); + assert(vallen == LEN(expval)); + assert(strncmp(gotval, expval, LEN(expval)) == 0); expattrs = expattrs->next; } @@ -404,8 +404,8 @@ hubbub_error token_handler(const hubbub_token *token, void *pw) (token->data.tag.n_attributes > 0) ? "attributes:" : ""); - assert(token->data.tag.name.len == strlen(expname)); - assert(strncmp(tagname, expname, strlen(expname)) == 0); + assert(token->data.tag.name.len == LEN(expname)); + assert(strncmp(tagname, expname, LEN(expname)) == 0); } break; case HUBBUB_TOKEN_COMMENT: @@ -419,8 +419,8 @@ hubbub_error token_handler(const hubbub_token *token, void *pw) printf(" got: '%.*s'\n", (int) token->data.comment.len, gotstr); - assert(token->data.comment.len == strlen(expstr)); - assert(strncmp(gotstr, expstr, strlen(expstr)) == 0); + assert(token->data.comment.len == LEN(expstr)); + assert(strncmp(gotstr, expstr, LEN(expstr)) == 0); } break; case HUBBUB_TOKEN_CHARACTER: -- cgit v1.2.3