author	Rupinder Singh Khokhar <rsk1coder99@gmail.com>	2014-06-13 00:51:59 +0530
committer	Rupinder Singh Khokhar <rsk1coder99@gmail.com>	2014-07-09 10:04:05 +0530
commit	a501b83d9be45e80b59fc8eca8e1816f467b4662 (patch)
tree	949a6ffdc0df6bfe925cc622aeadcea56778273f /src
parent	34f1fae56766782ad5142db64879b348c01060bb (diff)
Updating Named Entities API in tokeniser
Diffstat (limited to 'src')
-rw-r--r--	src/tokeniser/entities.c	19
-rw-r--r--	src/tokeniser/entities.h	 2
-rw-r--r--	src/tokeniser/tokeniser.c	99
3 files changed, 70 insertions(+), 50 deletions(-)
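The change replaces the UCS-4 codepoint previously written back by hubbub_entities_search_step() with a hubbub_string holding the entity's UTF-8 expansion, so callers no longer convert the result themselves. A minimal caller sketch against the new signature follows; the function name and loop are illustrative only, and the return-code handling (HUBBUB_OK for a match, HUBBUB_NEEDDATA for "feed me more input") mirrors how the tokeniser below uses the API rather than being part of this commit.

#include <stdint.h>
#include <stddef.h>

#include <hubbub/types.h>
#include <hubbub/errors.h>

#include "tokeniser/entities.h"

/* Illustrative only: step through a candidate entity name byte by byte.
 * On each call *result is preset to U+FFFD (as UTF-8) and overwritten
 * with the expansion of the longest entity matched so far. */
static void match_named_entity(const uint8_t *name, size_t name_len)
{
	hubbub_string result;
	int32_t context = -1;	/* -1 starts at the root of the entity tree,
				 * as the tokeniser does below */
	size_t i;

	for (i = 0; i < name_len; i++) {
		hubbub_error err = hubbub_entities_search_step(name[i],
				&result, &context);
		if (err == HUBBUB_OK) {
			/* result.ptr / result.len now hold the UTF-8
			 * expansion of the entity matched so far */
		} else if (err != HUBBUB_NEEDDATA) {
			break;	/* no further match is possible */
		}
	}
}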
diff --git a/src/tokeniser/entities.c b/src/tokeniser/entities.c
index ac47d80..298b2cf 100644
--- a/src/tokeniser/entities.c
+++ b/src/tokeniser/entities.c
@@ -7,15 +7,20 @@
#include "utils/utils.h"
#include "tokeniser/entities.h"
+/**
+ * UTF-8 encoding of U+FFFD REPLACEMENT CHARACTER
+ */
+static const uint8_t u_fffd[3] = { '\xEF', '\xBF', '\xBD' };
+static const hubbub_string u_fffd_str = { u_fffd, sizeof(u_fffd) };
/** Node in our entity tree */
typedef struct hubbub_entity_node {
- /* Do not reorder this without fixing make-entities.pl */
+ /* Do not reorder this without fixing make-entities.pl */
uint8_t split; /**< Data to split on */
int32_t lt; /**< Subtree for data less than split */
int32_t eq; /**< Subtree for data equal to split */
int32_t gt; /**< Subtree for data greater than split */
- uint32_t value; /**< Data for this node */
+ hubbub_string value; /**< Data for this node */
} hubbub_entity_node;
#include "entities.inc"
@@ -38,7 +43,7 @@ typedef struct hubbub_entity_node {
* is found.
*/
static hubbub_error hubbub_entity_tree_search_step(uint8_t c,
- uint32_t *result, int32_t *context)
+ hubbub_string *result, int32_t *context)
{
bool match = false;
int32_t p;
@@ -63,7 +68,7 @@ static hubbub_error hubbub_entity_tree_search_step(uint8_t c,
match = true;
*result = dict[dict[p].eq].value;
p = dict[p].eq;
- } else if (dict[p].value != 0) {
+ } else if (dict[p].value.ptr != NULL) {
match = true;
*result = dict[p].value;
p = dict[p].eq;
@@ -100,13 +105,13 @@ static hubbub_error hubbub_entity_tree_search_step(uint8_t c,
* The location pointed to by ::result will be set to U+FFFD unless a match
* is found.
*/
-hubbub_error hubbub_entities_search_step(uint8_t c, uint32_t *result,
+hubbub_error hubbub_entities_search_step(uint8_t c, hubbub_string *result,
int32_t *context)
{
if (result == NULL)
return HUBBUB_BADPARM;
- *result = 0xFFFD;
-
+ *result = u_fffd_str;
+
return hubbub_entity_tree_search_step(c, result, context);
}
diff --git a/src/tokeniser/entities.h b/src/tokeniser/entities.h
index 0703b37..a8b9bbf 100644
--- a/src/tokeniser/entities.h
+++ b/src/tokeniser/entities.h
@@ -14,7 +14,7 @@
#include <hubbub/functypes.h>
/* Step-wise search for an entity in the dictionary */
-hubbub_error hubbub_entities_search_step(uint8_t c, uint32_t *result,
+hubbub_error hubbub_entities_search_step(uint8_t c, hubbub_string *result,
int32_t *context);
#endif
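For reference, the result now carries the entity's UTF-8 form in libhubbub's string type, declared in <hubbub/types.h> roughly as:

typedef struct hubbub_string {
	const uint8_t *ptr;	/**< Pointer to data */
	size_t len;		/**< Byte length of string */
} hubbub_string;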
diff --git a/src/tokeniser/tokeniser.c b/src/tokeniser/tokeniser.c
index a7e67a1..7355f80 100644
--- a/src/tokeniser/tokeniser.c
+++ b/src/tokeniser/tokeniser.c
@@ -4,6 +4,7 @@
* http://www.opensource.org/licenses/mit-license.php
* Copyright 2007 John-Mark Bell <jmb@netsurf-browser.org>
* Copyright 2008 Andrew Sidwell <takkaria@netsurf-browser.org>
+ * Copyright 2014 Rupinder Singh Khokhar <rsk1coder99@gmail.com>
*/
#include <assert.h>
#include <stdbool.h>
@@ -128,7 +129,7 @@ typedef struct hubbub_tokeniser_context {
struct {
size_t offset; /**< Offset in buffer */
uint32_t length; /**< Length of entity */
- uint32_t codepoint; /**< UCS4 codepoint */
+ hubbub_string codepoint; /**< UTF-8 codepoint */
bool complete; /**< True if match complete */
uint32_t poss_length; /**< Optimistic length
@@ -147,6 +148,12 @@ typedef struct hubbub_tokeniser_context {
* numeric entity value */
hubbub_tokeniser_state return_state; /**< State we were
* called from */
+ union {
+ uint32_t ucs4; /**<UCS-4 value for numeric
+ * entity*/
+ uint8_t numeric_buf[6]; /**<UTF-8 value for numeric
+ * entity */
+ } numeric_state;
} match_entity; /**< Entity matching state */
struct {
@@ -810,19 +817,12 @@ hubbub_error hubbub_tokeniser_handle_character_reference_data(
} else {
hubbub_token token;
- uint8_t utf8[6];
- uint8_t *utf8ptr = utf8;
- size_t len = sizeof(utf8);
-
token.type = HUBBUB_TOKEN_CHARACTER;
-
- if (tokeniser->context.match_entity.codepoint) {
- parserutils_charset_utf8_from_ucs4(
- tokeniser->context.match_entity.codepoint,
- &utf8ptr, &len);
-
- token.data.character.ptr = utf8;
- token.data.character.len = sizeof(utf8) - len;
+ if (tokeniser->context.match_entity.codepoint.ptr != NULL) {
+ token.data.character.ptr =
+ tokeniser->context.match_entity.codepoint.ptr;
+ token.data.character.len =
+ tokeniser->context.match_entity.codepoint.len;
hubbub_tokeniser_emit_token(tokeniser, &token);
@@ -833,6 +833,7 @@ hubbub_error hubbub_tokeniser_handle_character_reference_data(
} else {
parserutils_error error;
const uint8_t *cptr = NULL;
+ size_t len;
error = parserutils_inputstream_peek(
tokeniser->input,
@@ -1607,23 +1608,17 @@ hubbub_error hubbub_tokeniser_handle_character_reference_in_attribute_value(
hubbub_attribute *attr = &ctag->attributes[
ctag->n_attributes - 1];
- uint8_t utf8[6];
- uint8_t *utf8ptr = utf8;
- size_t len = sizeof(utf8);
-
- if (tokeniser->context.match_entity.codepoint) {
- parserutils_charset_utf8_from_ucs4(
- tokeniser->context.match_entity.codepoint,
- &utf8ptr, &len);
-
- COLLECT_MS(attr->value, utf8, sizeof(utf8) - len);
+ if (tokeniser->context.match_entity.codepoint.ptr != NULL) {
+ COLLECT_MS(attr->value,
+ tokeniser->context.match_entity.codepoint.ptr,
+ tokeniser->context.match_entity.codepoint.len);
/* +1 for the ampersand */
tokeniser->context.pending +=
tokeniser->context.match_entity.length
+ 1;
} else {
- size_t len = 0;
+ size_t len;
const uint8_t *cptr = NULL;
parserutils_error error;
@@ -2899,7 +2894,9 @@ hubbub_error hubbub_tokeniser_consume_character_reference(
if (error != PARSERUTILS_OK) {
if (error == PARSERUTILS_EOF) {
tokeniser->context.match_entity.complete = true;
- tokeniser->context.match_entity.codepoint = 0;
+ tokeniser->context.match_entity.codepoint.len = 0;
+ tokeniser->context.match_entity.codepoint.ptr = NULL;
+ tokeniser->context.match_entity.numeric_state.ucs4 = 0;
return HUBBUB_OK;
} else {
return hubbub_error_from_parserutils_error(error);
@@ -2913,13 +2910,15 @@ hubbub_error hubbub_tokeniser_consume_character_reference(
tokeniser->context.match_entity.poss_length = 0;
tokeniser->context.match_entity.length = 0;
tokeniser->context.match_entity.base = 0;
- tokeniser->context.match_entity.codepoint = 0;
tokeniser->context.match_entity.had_data = false;
tokeniser->context.match_entity.return_state = tokeniser->state;
tokeniser->context.match_entity.complete = false;
tokeniser->context.match_entity.overflow = false;
tokeniser->context.match_entity.context = -1;
tokeniser->context.match_entity.prev_len = len;
+ tokeniser->context.match_entity.numeric_state.ucs4 = 0;
+ tokeniser->context.match_entity.codepoint.ptr = NULL;
+ tokeniser->context.match_entity.codepoint.len = 0;
/* Reset allowed character for future calls */
tokeniser->context.allowed_char = '\0';
@@ -2928,7 +2927,6 @@ hubbub_error hubbub_tokeniser_consume_character_reference(
c == '<' || c == '&' ||
(allowed_char && c == allowed_char)) {
tokeniser->context.match_entity.complete = true;
- tokeniser->context.match_entity.codepoint = 0;
} else if (c == '#') {
tokeniser->context.match_entity.length += len;
tokeniser->state = STATE_NUMBERED_ENTITY;
@@ -2975,22 +2973,22 @@ hubbub_error hubbub_tokeniser_handle_numbered_entity(
if (ctx->match_entity.base == 10 &&
('0' <= c && c <= '9')) {
ctx->match_entity.had_data = true;
- ctx->match_entity.codepoint =
- ctx->match_entity.codepoint * 10 + (c - '0');
-
+ ctx->match_entity.numeric_state.ucs4 =
+ ctx->match_entity.numeric_state.ucs4 * 10 + (c - '0');
ctx->match_entity.length += len;
} else if (ctx->match_entity.base == 16 &&
(('0' <= c && c <= '9') ||
('A' <= (c & ~0x20) &&
(c & ~0x20) <= 'F'))) {
ctx->match_entity.had_data = true;
- ctx->match_entity.codepoint *= 16;
+ ctx->match_entity.numeric_state.ucs4 *= 16;
if ('0' <= c && c <= '9') {
- ctx->match_entity.codepoint += (c - '0');
+ ctx->match_entity.numeric_state.ucs4 +=
+ (c-'0');
} else {
- ctx->match_entity.codepoint +=
- ((c & ~0x20) - 'A' + 10);
+ ctx->match_entity.numeric_state.ucs4 +=
+ ((c & ~0x20) - 'A' + 10);
}
ctx->match_entity.length += len;
@@ -2998,7 +2996,7 @@ hubbub_error hubbub_tokeniser_handle_numbered_entity(
break;
}
- if (ctx->match_entity.codepoint >= 0x10FFFF) {
+ if (ctx->match_entity.numeric_state.ucs4 >= 0x10FFFF) {
ctx->match_entity.overflow = true;
}
}
@@ -3014,7 +3012,8 @@ hubbub_error hubbub_tokeniser_handle_numbered_entity(
/* Had data, so calculate final codepoint */
if (ctx->match_entity.had_data) {
- uint32_t cp = ctx->match_entity.codepoint;
+ uint32_t cp =
+ ctx->match_entity.numeric_state.ucs4;
if (0x80 <= cp && cp <= 0x9F) {
cp = cp1252Table[cp - 0x80];
@@ -3031,8 +3030,22 @@ hubbub_error hubbub_tokeniser_handle_numbered_entity(
* in the loop above to avoid overflow */
cp = 0xFFFD;
}
+ ctx->match_entity.numeric_state.ucs4 = cp;
+
+ /*Convert UCS-4 to UTF-8*/
+ uint8_t *utf8_ptr=
+ (ctx->match_entity.numeric_state.numeric_buf);
+ size_t buf_len=
+ sizeof(ctx->match_entity.numeric_state.numeric_buf);
+ parserutils_charset_utf8_from_ucs4(
+ ctx->match_entity.numeric_state.ucs4,
+ &utf8_ptr,
+ &buf_len);
+ ctx->match_entity.codepoint.ptr=
+ (ctx->match_entity.numeric_state.numeric_buf);
+ ctx->match_entity.codepoint.len=
+ sizeof(ctx->match_entity.numeric_state.numeric_buf)-buf_len;
- ctx->match_entity.codepoint = cp;
}
/* Flag completion */
@@ -3056,11 +3069,10 @@ hubbub_error hubbub_tokeniser_handle_named_entity(hubbub_tokeniser *tokeniser)
ctx->match_entity.offset +
ctx->match_entity.poss_length,
&cptr, &len)) == PARSERUTILS_OK) {
- uint32_t cp;
+ hubbub_string cp;
uint8_t c = *cptr;
hubbub_error error;
-
if (c > 0x7F) {
/* Entity names are ASCII only */
break;
@@ -3119,14 +3131,17 @@ hubbub_error hubbub_tokeniser_handle_named_entity(hubbub_tokeniser *tokeniser)
error == PARSERUTILS_EOF);
if (error == PARSERUTILS_EOF) {
- ctx->match_entity.codepoint = 0;
+ ctx->match_entity.codepoint.len = 0;
+ ctx->match_entity.codepoint.ptr = NULL;
}
c = *cptr;
if ((0x0030 <= c && c <= 0x0039) ||
(0x0041 <= c && c <= 0x005A) ||
- (0x0061 <= c && c <= 0x007A)) {
- ctx->match_entity.codepoint = 0;
+ (0x0061 <= c && c <= 0x007A) ||
+ c == '=') {
+ ctx->match_entity.codepoint.len = 0;
+ ctx->match_entity.codepoint.ptr = NULL;
}
}
}
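The numbered-entity path above now accumulates the value in numeric_state.ucs4 and, once parsing finishes, encodes it into numeric_state.numeric_buf, exposing the bytes through match_entity.codepoint. A standalone sketch of that conversion (function and variable names are illustrative; parserutils_charset_utf8_from_ucs4() advances the output pointer and shrinks the remaining length by the number of bytes it writes):

#include <stdint.h>
#include <stddef.h>

#include <hubbub/types.h>
#include <parserutils/charset/utf8.h>

/* Illustrative only: encode one UCS-4 value into a caller-supplied
 * 6-byte buffer and describe the result as a hubbub_string, mirroring
 * what hubbub_tokeniser_handle_numbered_entity() now does. */
static hubbub_string encode_numeric_entity(uint32_t ucs4, uint8_t buf[6])
{
	hubbub_string s;
	uint8_t *out = buf;
	size_t space = 6;

	parserutils_charset_utf8_from_ucs4(ucs4, &out, &space);

	s.ptr = buf;
	s.len = 6 - space;	/* bytes actually written */

	return s;
}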