summary refs log tree commit diff
diff options
context:
space:
mode:
author John Mark Bell <jmb@netsurf-browser.org> 2008-06-18 10:59:30 (GMT)
committer John Mark Bell <jmb@netsurf-browser.org> 2008-06-18 10:59:30 (GMT)
commit f5b08ded5898003ba8d81fc50f2231f4a49bffda (patch)
tree a17402a183424a0cb596813661e8278d2fb428ff
parent 7b529dbf9f5fbc39edc072bc790f33ebb39d1d0e (diff)
download json-c-f5b08ded5898003ba8d81fc50f2231f4a49bffda.tar.gz
json-c-f5b08ded5898003ba8d81fc50f2231f4a49bffda.tar.bz2
Correctly decode UTF-16 surrogates
svn path=/trunk/json-c/; revision=4384
-rw-r--r-- json-c/json_tokener.c 78
-rw-r--r-- json-c/json_tokener.h 3
-rw-r--r-- json-c/test1.c 12
3 files changed, 79 insertions, 14 deletions
diff --git a/json-c/json_tokener.c b/json-c/json_tokener.c
index c904f48..d594569 100644
--- a/json-c/json_tokener.c
+++ b/json-c/json_tokener.c
@@ -124,6 +124,39 @@ char* strndup(const char* str, size_t n)
}
#endif
+static void json_tokener_output_ucs(struct printbuf *pb, unsigned int ucs)
+{ /* Encode the code point `ucs` as a UTF-8 byte sequence and pass it to printbuf_memappend() on `pb`. */
+ unsigned char utf_out[4]; /* scratch for one encoded sequence; 4 bytes is the longest emitted below */
+
+ /* Don't permit surrogates or undefined characters: 0xd800-0xdfff, and any value whose low 16 bits are 0xfffe or 0xffff, are replaced by 0xfffd */
+ if ((0xd800 <= ucs && ucs <= 0xdfff) || (ucs & 0xfffe) == 0xfffe)
+ ucs = 0xfffd;
+
+ if (ucs < 0x80) { /* 1 byte: the value is stored unchanged */
+ utf_out[0] = ucs;
+ printbuf_memappend(pb, (char*)utf_out, 1);
+ } else if (ucs < 0x800) { /* 2 bytes: 0xc0 lead carries bits 6 and up, one 0x80 trail carries the low 6 bits */
+ utf_out[0] = 0xc0 | (ucs >> 6);
+ utf_out[1] = 0x80 | (ucs & 0x3f);
+ printbuf_memappend(pb, (char*)utf_out, 2);
+ } else if (ucs < 0x10000) { /* 3 bytes: 0xe0 lead carries bits 12 and up, two 0x80 trails carry 6 bits each */
+ utf_out[0] = 0xe0 | (ucs >> 12);
+ utf_out[1] = 0x80 | ((ucs >> 6) & 0x3f);
+ utf_out[2] = 0x80 | (ucs & 0x3f);
+ printbuf_memappend(pb, (char*)utf_out, 3);
+ } else if (ucs < 0x110000) { /* 4 bytes: 0xf0 lead carries bits 18 and up, three 0x80 trails carry 6 bits each */
+ utf_out[0] = 0xf0 | (ucs >> 18);
+ utf_out[1] = 0x80 | ((ucs >> 12) & 0x3f);
+ utf_out[2] = 0x80 | ((ucs >> 6) & 0x3f);
+ utf_out[3] = 0x80 | (ucs & 0x3f);
+ printbuf_memappend(pb, (char*)utf_out, 4);
+ } else { /* ucs >= 0x110000: emit ef bf bd, the same bytes the 3-byte branch produces for 0xfffd */
+ utf_out[0] = 0xef;
+ utf_out[1] = 0xbf;
+ utf_out[2] = 0xbd;
+ printbuf_memappend(pb, (char*)utf_out, 3);
+ }
+}
#define state tok->stack[tok->depth].state
#define saved_state tok->stack[tok->depth].saved_state
@@ -316,6 +349,7 @@ struct json_object* json_tokener_parse_ex(struct json_tokener *tok,
break;
case 'u':
tok->ucs_char = 0;
+ tok->ucs_surrogate = 0;
tok->st_pos = 0;
state = json_tokener_state_escape_unicode;
break;
@@ -329,21 +363,15 @@ struct json_object* json_tokener_parse_ex(struct json_tokener *tok,
if(strchr(json_hex_chars, c)) {
tok->ucs_char += ((unsigned int)hexdigit(c) << ((3-tok->st_pos++)*4));
if(tok->st_pos == 4) {
- unsigned char utf_out[3];
- if (tok->ucs_char < 0x80) {
- utf_out[0] = tok->ucs_char;
- printbuf_memappend(tok->pb, (char*)utf_out, 1);
- } else if (tok->ucs_char < 0x800) {
- utf_out[0] = 0xc0 | (tok->ucs_char >> 6);
- utf_out[1] = 0x80 | (tok->ucs_char & 0x3f);
- printbuf_memappend(tok->pb, (char*)utf_out, 2);
+ if (0xD800 <= tok->ucs_char && tok->ucs_char <= 0xDBFF) {
+ tok->ucs_surrogate = tok->ucs_char;
+ tok->ucs_char = 0;
+ tok->st_pos = 0;
+ state = json_tokener_state_escape_unicode_surrogate;
} else {
- utf_out[0] = 0xe0 | (tok->ucs_char >> 12);
- utf_out[1] = 0x80 | ((tok->ucs_char >> 6) & 0x3f);
- utf_out[2] = 0x80 | (tok->ucs_char & 0x3f);
- printbuf_memappend(tok->pb, (char*)utf_out, 3);
+ json_tokener_output_ucs(tok->pb, tok->ucs_char);
+ state = saved_state;
}
- state = saved_state;
}
} else {
tok->err = json_tokener_error_parse_string;
@@ -351,6 +379,30 @@ struct json_object* json_tokener_parse_ex(struct json_tokener *tok,
}
break;
+ case json_tokener_state_escape_unicode_surrogate:
+ if (tok->st_pos == 0 && c == '\\') {
+ tok->st_pos++;
+ } else if (tok->st_pos == 1 && c == 'u') {
+ tok->st_pos++;
+ } else if (tok->st_pos > 1 && strchr(json_hex_chars, c)) {
+ tok->ucs_char += ((unsigned int)hexdigit(c) << ((5-tok->st_pos++)*4));
+ if (tok->st_pos == 6) {
+ if (0xDC00 <= tok->ucs_char && tok->ucs_char <= 0xDFFF) {
+ tok->ucs_char = (tok->ucs_surrogate << 10) + tok->ucs_char +
+ (0x10000 - (0xd800 << 10) - 0xdc00);
+ json_tokener_output_ucs(tok->pb, tok->ucs_char);
+ state = saved_state;
+ } else {
+ tok->err = json_tokener_error_parse_string;
+ goto out;
+ }
+ }
+ } else {
+ tok->err = json_tokener_error_parse_string;
+ goto out;
+ }
+ break;
+
case json_tokener_state_boolean:
printbuf_memappend(tok->pb, &c, 1);
if(strncasecmp(json_true_str, tok->pb->buf,
diff --git a/json-c/json_tokener.h b/json-c/json_tokener.h
index 117d6ef..00f3349 100644
--- a/json-c/json_tokener.h
+++ b/json-c/json_tokener.h
@@ -44,6 +44,7 @@ enum json_tokener_state {
json_tokener_state_string,
json_tokener_state_string_escape,
json_tokener_state_escape_unicode,
+ json_tokener_state_escape_unicode_surrogate,
json_tokener_state_boolean,
json_tokener_state_number,
json_tokener_state_array,
@@ -73,7 +74,7 @@ struct json_tokener
struct printbuf *pb;
int depth, is_double, st_pos, char_offset;
ptrdiff_t err;
- unsigned int ucs_char;
+ unsigned int ucs_char, ucs_surrogate;
char quote_char;
struct json_tokener_srec stack[JSON_TOKENER_MAX_DEPTH];
};
diff --git a/json-c/test1.c b/json-c/test1.c
index a64a255..2f4e8dc 100644
--- a/json-c/test1.c
+++ b/json-c/test1.c
@@ -144,6 +144,18 @@ int main(int argc, char **argv)
new_obj = json_tokener_parse("{ \"foo");
if(is_error(new_obj)) printf("got error as expected\n");
+ new_obj = json_tokener_parse("\"\\ud800\\udc00\"");
+ printf("new_obj.to_string()=%s\n", json_object_to_json_string(new_obj));
+
+ new_obj = json_tokener_parse("\"\\udbff\\udfff\"");
+ printf("new_obj.to_string()=%s\n", json_object_to_json_string(new_obj));
+
+ new_obj = json_tokener_parse("\"\\ud800foo\"");
+ if(is_error(new_obj)) printf("got error as expected\n");
+
+ new_obj = json_tokener_parse("\"\\ud800\\ufffd\"");
+ if(is_error(new_obj)) printf("got error as expected\n");
+
/* test incremental parsing */
tok = json_tokener_new();
new_obj = json_tokener_parse_ex(tok, "{ \"foo", 6);