summaryrefslogtreecommitdiff
path: root/riscos/ucstables.c
diff options
context:
space:
mode:
author: John Mark Bell <jmb@netsurf-browser.org> 2007-02-11 11:19:42 +0000
committer: John Mark Bell <jmb@netsurf-browser.org> 2007-02-11 11:19:42 +0000
commit: b54332fd586d9ba9d3373e5fa37a80aa9ce265e3 (patch)
tree: 96e4b300f0f4f3d2b9dd2e0b59de95d04547c1bb /riscos/ucstables.c
parent: 3b40e0f5fc824bdcea46ee4fce609c2511c6b1fd (diff)
download: netsurf-b54332fd586d9ba9d3373e5fa37a80aa9ce265e3.tar.gz
netsurf-b54332fd586d9ba9d3373e5fa37a80aa9ce265e3.tar.bz2
Rewrite utf8_[to,from]_local_encoding so that they no longer impose an arbitrary limit on the
number of special characters permitted in the input (fixes 1651343, 1649247, 1644745, 1607934). The rewrite should also be rather more efficient, as it now makes only a single pass over the input data. svn path=/trunk/netsurf/; revision=3177
Diffstat (limited to 'riscos/ucstables.c')
-rw-r--r-- riscos/ucstables.c 207
1 files changed, 96 insertions, 111 deletions
diff --git a/riscos/ucstables.c b/riscos/ucstables.c
index 5a8eb8aec..6f8e44d11 100644
--- a/riscos/ucstables.c
+++ b/riscos/ucstables.c
@@ -441,13 +441,9 @@ utf8_convert_ret utf8_to_local_encoding(const char *string, size_t len,
char **result)
{
os_error *error;
- int alphabet, i, offset_count = 0;
- struct {
- const struct special *local; /* local character */
- size_t offset; /* byte offset into string */
- } offsets[CHAR_MAX];
- size_t off;
- char *temp;
+ int alphabet, i;
+ size_t off, prev_off;
+ char *temp, *cur_pos;
const char *enc;
utf8_convert_ret err;
@@ -475,8 +471,18 @@ utf8_convert_ret utf8_to_local_encoding(const char *string, size_t len,
localencodings[CONT_ENC_END + 1]
: localencodings[0]));
- /* populate offsets array with details of characters that
- * will be stripped by iconv */
+ /* create output buffer */
+ *(result) = malloc(len + 1);
+ if (!(*result))
+ return UTF8_CONVERT_NOMEM;
+ *(*result) = '\0';
+
+ prev_off = 0;
+ cur_pos = (*result);
+
+ /* Iterate over string, converting input between unconvertable
+ * characters and inserting appropriate output for characters
+ * that iconv can't handle. */
for (off = 0; off < len; off = utf8_next(string, len, off)) {
if (string[off] != 0xE2 &&
string[off] != 0xC5 && string[off] != 0xEF)
@@ -484,69 +490,45 @@ utf8_convert_ret utf8_to_local_encoding(const char *string, size_t len,
for (i = 0; i != NOF_ELEMENTS(special_chars); i++) {
if (strncmp(string + off, special_chars[i].utf,
- special_chars[i].len) == 0) {
- /* ensure we don't overflow our buffer */
- assert(offset_count < CHAR_MAX - 1);
- offsets[offset_count].local =
- &special_chars[i];
- offsets[offset_count].offset = off;
- offset_count++;
- break;
- }
- }
- }
+ special_chars[i].len) != 0)
+ continue;
- if (offset_count == 0) {
- /* No substitutions are required, so exit here */
- return utf8_to_enc(string, enc, len, result);
- }
+ /* 0 length has a special meaning to utf8_to_enc */
+ if (off - prev_off > 0) {
+ err = utf8_to_enc(string + prev_off, enc,
+ off - prev_off, &temp);
+ if (err != UTF8_CONVERT_OK) {
+ assert(err != UTF8_CONVERT_BADENC);
+ free(*result);
+ return UTF8_CONVERT_NOMEM;
+ }
- /* create output buffer */
- *(result) = malloc(len + 1);
- if (!(*result))
- return UTF8_CONVERT_NOMEM;
- *(*result) = '\0';
+ strcat(cur_pos, temp);
- /* convert the chunks between offsets, then copy stripped
- * character into output string */
- for (i = 0; i != offset_count; i++) {
- off = (i > 0 ? offsets[i-1].offset + offsets[i-1].local->len
- : 0);
-
- /* 0 length has a special meaning to utf8_to_enc */
- if (offsets[i].offset > off) {
- err = utf8_to_enc(string + off, enc,
- offsets[i].offset - off, &temp);
- if (err != UTF8_CONVERT_OK) {
- assert(err != UTF8_CONVERT_BADENC);
- free(*result);
- return UTF8_CONVERT_NOMEM;
- }
+ cur_pos += strlen(temp);
- strcat((*result), temp);
+ free(temp);
+ }
- free(temp);
+ *cur_pos = special_chars[i].local;
+ *(++cur_pos) = '\0';
+ prev_off = off + special_chars[i].len;
}
-
- off = strlen(*result);
- (*result)[off] = offsets[i].local->local;
- (*result)[off+1] = '\0';
}
/* handle last chunk
* NB. 0 length has a special meaning to utf8_to_enc */
- off = offsets[offset_count - 1].offset +
- offsets[offset_count - 1].local->len;
- if (off < len) {
- err = utf8_to_enc(string + off, enc, len - off, &temp);
+ if (prev_off < len) {
+ err = utf8_to_enc(string + prev_off, enc, len - prev_off,
+ &temp);
if (err != UTF8_CONVERT_OK) {
assert(err != UTF8_CONVERT_BADENC);
free(*result);
return UTF8_CONVERT_NOMEM;
}
- strcat((*result), temp);
+ strcat(cur_pos, temp);
free(temp);
}
@@ -566,12 +548,9 @@ utf8_convert_ret utf8_from_local_encoding(const char *string, size_t len,
char **result)
{
os_error *error;
- int alphabet, i, offset_count = 0;
- struct {
- const struct special *local; /* utf character */
- size_t offset; /* byte offset into string */
- } offsets[CHAR_MAX];
- size_t off;
+ int alphabet, i, num_specials = 0, result_alloc;
+#define SPECIAL_CHUNK_SIZE 255
+ size_t off, prev_off, cur_off;
char *temp;
const char *enc;
utf8_convert_ret err;
@@ -603,64 +582,74 @@ utf8_convert_ret utf8_from_local_encoding(const char *string, size_t len,
localencodings[CONT_ENC_END + 1]
: localencodings[0]));
- /* populate offsets array with details of characters that
- * will be stripped by iconv */
+ /* create output buffer (oversized) */
+ result_alloc = (len * 4) + (3 * SPECIAL_CHUNK_SIZE) + 1;
+
+ *(result) = malloc(result_alloc);
+ if (!(*result))
+ return UTF8_CONVERT_NOMEM;
+ *(*result) = '\0';
+
+ prev_off = 0;
+ cur_off = 0;
+
+ /* Iterate over string, converting input between unconvertable
+ * characters and inserting appropriate output for characters
+ * that iconv can't handle. */
for (off = 0; off < len; off++) {
if (string[off] < 0x80 || string[off] > 0x9f)
continue;
for (i = 0; i != NOF_ELEMENTS(special_chars); i++) {
- if (string[off] == special_chars[i].local) {
- /* ensure we don't overflow our buffer */
- assert(offset_count < CHAR_MAX - 1);
- offsets[offset_count].local =
- &special_chars[i];
- offsets[offset_count].offset = off;
- offset_count++;
- break;
- }
- }
- }
+ if (string[off] != special_chars[i].local)
+ continue;
- if (offset_count == 0) {
- /* No substitutions are required, so exit here */
- return utf8_from_enc(string, enc, len, result);
- }
+ /* 0 length has a special meaning to utf8_from_enc */
+ if (off - prev_off > 0) {
+ err = utf8_from_enc(string + prev_off, enc,
+ off - prev_off, &temp);
+ if (err != UTF8_CONVERT_OK) {
+ assert(err != UTF8_CONVERT_BADENC);
+ LOG(("utf8_from_enc failed"));
+ free(*result);
+ return UTF8_CONVERT_NOMEM;
+ }
- /* create output buffer (oversized) */
- *(result) = malloc((len * 4) + (3 * offset_count) + 1);
- if (!(*result))
- return UTF8_CONVERT_NOMEM;
- *(*result) = '\0';
+ strcat((*result) + cur_off, temp);
+
+ cur_off += strlen(temp);
- /* convert the chunks between offsets, then copy stripped
- * UTF-8 character into output string */
- for (i = 0; i != offset_count; i++) {
- off = (i > 0 ? offsets[i-1].offset + 1 : 0);
-
- /* 0 length has a special meaning to utf8_from_enc */
- if (offsets[i].offset > off) {
- err = utf8_from_enc(string + off, enc,
- offsets[i].offset - off, &temp);
- if (err != UTF8_CONVERT_OK) {
- assert(err != UTF8_CONVERT_BADENC);
- LOG(("utf8_from_enc failed"));
- free(*result);
- return UTF8_CONVERT_NOMEM;
+ free(temp);
}
- strcat((*result), temp);
- free(temp);
- }
+ strcat((*result) + cur_off, special_chars[i].utf);
+
+ cur_off += special_chars[i].len;
- strcat((*result), offsets[i].local->utf);
+ prev_off = off + 1;
+
+ num_specials++;
+ if (num_specials % SPECIAL_CHUNK_SIZE ==
+ SPECIAL_CHUNK_SIZE - 1) {
+ char *temp = realloc((*result),
+ result_alloc +
+ (3 * SPECIAL_CHUNK_SIZE));
+ if (!temp) {
+ free(*result);
+ return UTF8_CONVERT_NOMEM;
+ }
+
+ *result = temp;
+ result_alloc += (3 * SPECIAL_CHUNK_SIZE);
+ }
+ }
}
/* handle last chunk
* NB. 0 length has a special meaning to utf8_from_enc */
- off = offsets[offset_count - 1].offset + 1;
- if (off < len) {
- err = utf8_from_enc(string + off, enc, len - off, &temp);
+ if (prev_off < len) {
+ err = utf8_from_enc(string + prev_off, enc, len - prev_off,
+ &temp);
if (err != UTF8_CONVERT_OK) {
assert(err != UTF8_CONVERT_BADENC);
LOG(("utf8_from_enc failed"));
@@ -668,22 +657,18 @@ utf8_convert_ret utf8_from_local_encoding(const char *string, size_t len,
return UTF8_CONVERT_NOMEM;
}
- strcat((*result), temp);
+ strcat((*result) + cur_off, temp);
free(temp);
}
/* and copy into more reasonably-sized buffer */
- temp = malloc(strlen((*result)) + 1);
+ temp = realloc((*result), cur_off + 1);
if (!temp) {
- LOG(("malloc failed"));
+ LOG(("realloc failed"));
free(*result);
return UTF8_CONVERT_NOMEM;
}
- *temp = '\0';
-
- strcpy(temp, (*result));
- free(*result);
*result = temp;
return UTF8_CONVERT_OK;