From a501b83d9be45e80b59fc8eca8e1816f467b4662 Mon Sep 17 00:00:00 2001 From: Rupinder Singh Khokhar Date: Fri, 13 Jun 2014 00:51:59 +0530 Subject: Updating Named Entities API in tokeniser --- src/tokeniser/entities.c | 19 +++++---- src/tokeniser/entities.h | 2 +- src/tokeniser/tokeniser.c | 99 +++++++++++++++++++++++++++-------------------- 3 files changed, 70 insertions(+), 50 deletions(-) (limited to 'src') diff --git a/src/tokeniser/entities.c b/src/tokeniser/entities.c index ac47d80..298b2cf 100644 --- a/src/tokeniser/entities.c +++ b/src/tokeniser/entities.c @@ -7,15 +7,20 @@ #include "utils/utils.h" #include "tokeniser/entities.h" +/** + * UTF-8 encoding of U+FFFD REPLACEMENT CHARACTER + */ +static const uint8_t u_fffd[3] = { '\xEF', '\xBF', '\xBD' }; +static const hubbub_string u_fffd_str = { u_fffd, sizeof(u_fffd) }; /** Node in our entity tree */ typedef struct hubbub_entity_node { - /* Do not reorder this without fixing make-entities.pl */ + /* Do not reorder this without fixing make-entities.pl */ uint8_t split; /**< Data to split on */ int32_t lt; /**< Subtree for data less than split */ int32_t eq; /**< Subtree for data equal to split */ int32_t gt; /**< Subtree for data greater than split */ - uint32_t value; /**< Data for this node */ + hubbub_string value; /**< Data for this node */ } hubbub_entity_node; #include "entities.inc" @@ -38,7 +43,7 @@ typedef struct hubbub_entity_node { * is found. */ static hubbub_error hubbub_entity_tree_search_step(uint8_t c, - uint32_t *result, int32_t *context) + hubbub_string *result, int32_t *context) { bool match = false; int32_t p; @@ -63,7 +68,7 @@ static hubbub_error hubbub_entity_tree_search_step(uint8_t c, match = true; *result = dict[dict[p].eq].value; p = dict[p].eq; - } else if (dict[p].value != 0) { + } else if (dict[p].value.ptr != NULL) { match = true; *result = dict[p].value; p = dict[p].eq; @@ -100,13 +105,13 @@ static hubbub_error hubbub_entity_tree_search_step(uint8_t c, * The location pointed to by ::result will be set to U+FFFD unless a match * is found. */ -hubbub_error hubbub_entities_search_step(uint8_t c, uint32_t *result, +hubbub_error hubbub_entities_search_step(uint8_t c, hubbub_string *result, int32_t *context) { if (result == NULL) return HUBBUB_BADPARM; - *result = 0xFFFD; - + *result = u_fffd_str; + return hubbub_entity_tree_search_step(c, result, context); } diff --git a/src/tokeniser/entities.h b/src/tokeniser/entities.h index 0703b37..a8b9bbf 100644 --- a/src/tokeniser/entities.h +++ b/src/tokeniser/entities.h @@ -14,7 +14,7 @@ #include /* Step-wise search for an entity in the dictionary */ -hubbub_error hubbub_entities_search_step(uint8_t c, uint32_t *result, +hubbub_error hubbub_entities_search_step(uint8_t c, hubbub_string *result, int32_t *context); #endif diff --git a/src/tokeniser/tokeniser.c b/src/tokeniser/tokeniser.c index a7e67a1..7355f80 100644 --- a/src/tokeniser/tokeniser.c +++ b/src/tokeniser/tokeniser.c @@ -4,6 +4,7 @@ * http://www.opensource.org/licenses/mit-license.php * Copyright 2007 John-Mark Bell * Copyright 2008 Andrew Sidwell + * Copyright 2014 Rupinder Singh Khokhar */ #include #include @@ -128,7 +129,7 @@ typedef struct hubbub_tokeniser_context { struct { size_t offset; /**< Offset in buffer */ uint32_t length; /**< Length of entity */ - uint32_t codepoint; /**< UCS4 codepoint */ + hubbub_string codepoint; /**< UTF-8 codepoint */ bool complete; /**< True if match complete */ uint32_t poss_length; /**< Optimistic length @@ -147,6 +148,12 @@ typedef struct hubbub_tokeniser_context { * numeric entity value */ hubbub_tokeniser_state return_state; /**< State we were * called from */ + union { + uint32_t ucs4; /**context.match_entity.codepoint) { - parserutils_charset_utf8_from_ucs4( - tokeniser->context.match_entity.codepoint, - &utf8ptr, &len); - - token.data.character.ptr = utf8; - token.data.character.len = sizeof(utf8) - len; + if (tokeniser->context.match_entity.codepoint.ptr != NULL) { + token.data.character.ptr = + tokeniser->context.match_entity.codepoint.ptr; + token.data.character.len = + tokeniser->context.match_entity.codepoint.len; hubbub_tokeniser_emit_token(tokeniser, &token); @@ -833,6 +833,7 @@ hubbub_error hubbub_tokeniser_handle_character_reference_data( } else { parserutils_error error; const uint8_t *cptr = NULL; + size_t len; error = parserutils_inputstream_peek( tokeniser->input, @@ -1607,23 +1608,17 @@ hubbub_error hubbub_tokeniser_handle_character_reference_in_attribute_value( hubbub_attribute *attr = &ctag->attributes[ ctag->n_attributes - 1]; - uint8_t utf8[6]; - uint8_t *utf8ptr = utf8; - size_t len = sizeof(utf8); - - if (tokeniser->context.match_entity.codepoint) { - parserutils_charset_utf8_from_ucs4( - tokeniser->context.match_entity.codepoint, - &utf8ptr, &len); - - COLLECT_MS(attr->value, utf8, sizeof(utf8) - len); + if (tokeniser->context.match_entity.codepoint.ptr != NULL) { + COLLECT_MS(attr->value, + tokeniser->context.match_entity.codepoint.ptr, + tokeniser->context.match_entity.codepoint.len); /* +1 for the ampersand */ tokeniser->context.pending += tokeniser->context.match_entity.length + 1; } else { - size_t len = 0; + size_t len; const uint8_t *cptr = NULL; parserutils_error error; @@ -2899,7 +2894,9 @@ hubbub_error hubbub_tokeniser_consume_character_reference( if (error != PARSERUTILS_OK) { if (error == PARSERUTILS_EOF) { tokeniser->context.match_entity.complete = true; - tokeniser->context.match_entity.codepoint = 0; + tokeniser->context.match_entity.codepoint.len = 0; + tokeniser->context.match_entity.codepoint.ptr = NULL; + tokeniser->context.match_entity.numeric_state.ucs4 = 0; return HUBBUB_OK; } else { return hubbub_error_from_parserutils_error(error); @@ -2913,13 +2910,15 @@ hubbub_error hubbub_tokeniser_consume_character_reference( tokeniser->context.match_entity.poss_length = 0; tokeniser->context.match_entity.length = 0; tokeniser->context.match_entity.base = 0; - tokeniser->context.match_entity.codepoint = 0; tokeniser->context.match_entity.had_data = false; tokeniser->context.match_entity.return_state = tokeniser->state; tokeniser->context.match_entity.complete = false; tokeniser->context.match_entity.overflow = false; tokeniser->context.match_entity.context = -1; tokeniser->context.match_entity.prev_len = len; + tokeniser->context.match_entity.numeric_state.ucs4 = 0; + tokeniser->context.match_entity.codepoint.ptr = NULL; + tokeniser->context.match_entity.codepoint.len = 0; /* Reset allowed character for future calls */ tokeniser->context.allowed_char = '\0'; @@ -2928,7 +2927,6 @@ hubbub_error hubbub_tokeniser_consume_character_reference( c == '<' || c == '&' || (allowed_char && c == allowed_char)) { tokeniser->context.match_entity.complete = true; - tokeniser->context.match_entity.codepoint = 0; } else if (c == '#') { tokeniser->context.match_entity.length += len; tokeniser->state = STATE_NUMBERED_ENTITY; @@ -2975,22 +2973,22 @@ hubbub_error hubbub_tokeniser_handle_numbered_entity( if (ctx->match_entity.base == 10 && ('0' <= c && c <= '9')) { ctx->match_entity.had_data = true; - ctx->match_entity.codepoint = - ctx->match_entity.codepoint * 10 + (c - '0'); - + ctx->match_entity.numeric_state.ucs4 = + ctx->match_entity.numeric_state.ucs4 * 10 + (c - '0'); ctx->match_entity.length += len; } else if (ctx->match_entity.base == 16 && (('0' <= c && c <= '9') || ('A' <= (c & ~0x20) && (c & ~0x20) <= 'F'))) { ctx->match_entity.had_data = true; - ctx->match_entity.codepoint *= 16; + ctx->match_entity.numeric_state.ucs4 *= 16; if ('0' <= c && c <= '9') { - ctx->match_entity.codepoint += (c - '0'); + ctx->match_entity.numeric_state.ucs4 += + (c-'0'); } else { - ctx->match_entity.codepoint += - ((c & ~0x20) - 'A' + 10); + ctx->match_entity.numeric_state.ucs4 += + ((c & ~0x20) - 'A' + 10); } ctx->match_entity.length += len; @@ -2998,7 +2996,7 @@ hubbub_error hubbub_tokeniser_handle_numbered_entity( break; } - if (ctx->match_entity.codepoint >= 0x10FFFF) { + if (ctx->match_entity.numeric_state.ucs4 >= 0x10FFFF) { ctx->match_entity.overflow = true; } } @@ -3014,7 +3012,8 @@ hubbub_error hubbub_tokeniser_handle_numbered_entity( /* Had data, so calculate final codepoint */ if (ctx->match_entity.had_data) { - uint32_t cp = ctx->match_entity.codepoint; + uint32_t cp = + ctx->match_entity.numeric_state.ucs4; if (0x80 <= cp && cp <= 0x9F) { cp = cp1252Table[cp - 0x80]; @@ -3031,8 +3030,22 @@ hubbub_error hubbub_tokeniser_handle_numbered_entity( * in the loop above to avoid overflow */ cp = 0xFFFD; } + ctx->match_entity.numeric_state.ucs4 = cp; + + /*Convert UCS-4 to UTF-8*/ + uint8_t *utf8_ptr= + (ctx->match_entity.numeric_state.numeric_buf); + size_t buf_len= + sizeof(ctx->match_entity.numeric_state.numeric_buf); + parserutils_charset_utf8_from_ucs4( + ctx->match_entity.numeric_state.ucs4, + &utf8_ptr, + &buf_len); + ctx->match_entity.codepoint.ptr= + (ctx->match_entity.numeric_state.numeric_buf); + ctx->match_entity.codepoint.len= + sizeof(ctx->match_entity.numeric_state.numeric_buf)-buf_len; - ctx->match_entity.codepoint = cp; } /* Flag completion */ @@ -3056,11 +3069,10 @@ hubbub_error hubbub_tokeniser_handle_named_entity(hubbub_tokeniser *tokeniser) ctx->match_entity.offset + ctx->match_entity.poss_length, &cptr, &len)) == PARSERUTILS_OK) { - uint32_t cp; + hubbub_string cp; uint8_t c = *cptr; hubbub_error error; - if (c > 0x7F) { /* Entity names are ASCII only */ break; @@ -3119,14 +3131,17 @@ hubbub_error hubbub_tokeniser_handle_named_entity(hubbub_tokeniser *tokeniser) error == PARSERUTILS_EOF); if (error == PARSERUTILS_EOF) { - ctx->match_entity.codepoint = 0; + ctx->match_entity.codepoint.len = 0; + ctx->match_entity.codepoint.ptr = NULL; } c = *cptr; if ((0x0030 <= c && c <= 0x0039) || (0x0041 <= c && c <= 0x005A) || - (0x0061 <= c && c <= 0x007A)) { - ctx->match_entity.codepoint = 0; + (0x0061 <= c && c <= 0x007A) || + c == '=') { + ctx->match_entity.codepoint.len = 0; + ctx->match_entity.codepoint.ptr = NULL; } } } -- cgit v1.2.3