From 02f5504de388eb69ded0bc9e05361d8db82c2137 Mon Sep 17 00:00:00 2001
From: Rupinder Singh Khokhar
Date: Fri, 13 Jun 2014 05:00:11 +0530
Subject: [Fix] tokeniser wrongly emitted a replacement character instead of utf8 NULL. Also, the tester used strlen to calculate string lengths--this seg faults if a null is passed-- this is also fixed.

---
 src/tokeniser/tokeniser.c | 10 ++++++++--
 1 file changed, 8 insertions(+), 2 deletions(-)

(limited to 'src/tokeniser')

diff --git a/src/tokeniser/tokeniser.c b/src/tokeniser/tokeniser.c
index 7355f80..d108490 100644
--- a/src/tokeniser/tokeniser.c
+++ b/src/tokeniser/tokeniser.c
@@ -44,6 +44,12 @@ static const hubbub_string u_fffd_str = { u_fffd, sizeof(u_fffd) };
 static const uint8_t lf = '\n';
 static const hubbub_string lf_str = { &lf, 1 };
 
+/**
+ * UTF-8 encoding of U+0000 NULL CHARACTER
+ */
+static const uint8_t u_null[1] = { '\x00'};
+static const hubbub_string u_null_str = { u_null, sizeof(u_null) };
+
 
 /**
  * Tokeniser states
@@ -751,8 +757,8 @@ hubbub_error hubbub_tokeniser_handle_data(hubbub_tokeniser *tokeniser)
 				emit_current_chars(tokeniser);
 			}
 
-			/* Emit a replacement character */
-			emit_character_token(tokeniser, &u_fffd_str);
+			/* Emit a null character */
+			emit_character_token(tokeniser, &u_null_str);
 
 			/* Advance past NUL */
 			parserutils_inputstream_advance(tokeniser->input, 1);
-- 
cgit v1.2.3