diff options
Diffstat (limited to 'utils/utf8.c')
-rw-r--r-- | utils/utf8.c | 206 |
1 files changed, 105 insertions, 101 deletions
diff --git a/utils/utf8.c b/utils/utf8.c index f0ac0c9b2..3eedd0810 100644 --- a/utils/utf8.c +++ b/utils/utf8.c @@ -32,6 +32,7 @@ #include "utils/log.h" #include "utils/utf8.h" +#include "netsurf/inttypes.h" #include "netsurf/utf8.h" #include "desktop/gui_internal.h" @@ -44,7 +45,7 @@ uint32_t utf8_to_ucs4(const char *s_in, size_t l) parserutils_error perror; perror = parserutils_charset_utf8_to_ucs4((const uint8_t *) s_in, l, - &ucs4, &len); + &ucs4, &len); if (perror != PARSERUTILS_OK) ucs4 = 0xfffd; @@ -106,7 +107,7 @@ size_t utf8_char_byte_length(const char *s) parserutils_error perror; perror = parserutils_charset_utf8_char_byte_length((const uint8_t *) s, - &len); + &len); assert(perror == PARSERUTILS_OK); return len; @@ -131,7 +132,7 @@ size_t utf8_next(const char *s, size_t l, size_t o) parserutils_error perror; perror = parserutils_charset_utf8_next((const uint8_t *) s, l, o, - &next); + &next); assert(perror == PARSERUTILS_OK); return next; @@ -151,6 +152,47 @@ static inline void utf8_clear_cd_cache(void) last_cd.cd = 0; } +/** + * obtain a cached conversion descriptor + * + * either return the cached conversion descriptor or create one if required + */ +static nserror +get_cached_cd(const char *enc_from, const char *enc_to, iconv_t *cd_out) +{ + iconv_t cd; + /* we cache the last used conversion descriptor, + * so check if we're trying to use it here */ + if (strncasecmp(last_cd.from, enc_from, sizeof(last_cd.from)) == 0 && + strncasecmp(last_cd.to, enc_to, sizeof(last_cd.to)) == 0 && + last_cd.cd != 0) { + *cd_out = last_cd.cd; + return NSERROR_OK; + } + + /* no match, so create a new cd */ + cd = iconv_open(enc_to, enc_from); + if (cd == (iconv_t) -1) { + if (errno == EINVAL) { + return NSERROR_BAD_ENCODING; + } + /* default to no memory */ + return NSERROR_NOMEM; + } + + /* close the last cd - we don't care if this fails */ + if (last_cd.cd) { + iconv_close(last_cd.cd); + } + + /* and safely copy the to/from/cd data into last_cd */ + snprintf(last_cd.from, sizeof(last_cd.from), "%s", enc_from); + snprintf(last_cd.to, sizeof(last_cd.to), "%s", enc_to); + *cd_out = last_cd.cd = cd; + + return NSERROR_OK; +} + /* exported interface documented in utils/utf8.h */ nserror utf8_finalise(void) { @@ -168,95 +210,72 @@ nserror utf8_finalise(void) * Convert a string from one encoding to another * * \param string The NULL-terminated string to convert - * \param len Length of input string to consider (in bytes), or 0 + * \param slen Length of input string to consider (in bytes), or 0 * \param from The encoding name to convert from * \param to The encoding name to convert to - * \param result Pointer to location in which to store result. - * \param result_len Pointer to location in which to store result length. + * \param result_out Pointer to location in which to store result. + * \param result_len_out Pointer to location in which to store result length. * \return NSERROR_OK for no error, NSERROR_NOMEM on allocation error, * NSERROR_BAD_ENCODING for a bad character encoding */ static nserror utf8_convert(const char *string, - size_t len, + size_t slen, const char *from, const char *to, - char **result, - size_t *result_len) + char **result_out, + size_t *result_len_out) { iconv_t cd; - char *temp, *out, *in; - size_t slen, rlen; - - assert(string && from && to && result); - - if (string[0] == '\0') { - /* On AmigaOS, iconv() returns an error if we pass an - * empty string. This prevents iconv() being called as - * there is no conversion necessary anyway. */ - *result = strdup(""); - if (!(*result)) { - *result = NULL; - return NSERROR_NOMEM; - } + char *temp, *out, *in, *result; + size_t result_len; + nserror res; - return NSERROR_OK; + assert(string && from && to && result_out); + + /* calculate the source length if not given */ + if (slen==0) { + slen = strlen(string); } - if (strcasecmp(from, to) == 0) { - /* conversion from an encoding to itself == strdup */ - slen = len ? len : strlen(string); - *(result) = strndup(string, slen); - if (!(*result)) { - *(result) = NULL; + /* process the empty string separately avoiding any conversion + * check for the source and destination encoding being the same + * + * This optimisation is necessary on AmigaOS as iconv() + * returns an error if an empty string is passed. + */ + if ((slen == 0) || (strcasecmp(from, to) == 0)) { + *result_out = strndup(string, slen); + if (*result_out == NULL) { return NSERROR_NOMEM; } + if (result_len_out != NULL) { + *result_len_out = slen; + } return NSERROR_OK; } in = (char *)string; - /* we cache the last used conversion descriptor, - * so check if we're trying to use it here */ - if (strncasecmp(last_cd.from, from, sizeof(last_cd.from)) == 0 && - strncasecmp(last_cd.to, to, sizeof(last_cd.to)) == 0) { - cd = last_cd.cd; - } - else { - /* no match, so create a new cd */ - cd = iconv_open(to, from); - if (cd == (iconv_t)-1) { - if (errno == EINVAL) - return NSERROR_BAD_ENCODING; - /* default to no memory */ - return NSERROR_NOMEM; - } - - /* close the last cd - we don't care if this fails */ - if (last_cd.cd) - iconv_close(last_cd.cd); - - /* and copy the to/from/cd data into last_cd */ - snprintf(last_cd.from, sizeof(last_cd.from), "%s", from); - snprintf(last_cd.to, sizeof(last_cd.to), "%s", to); - last_cd.cd = cd; + res = get_cached_cd(from, to, &cd); + if (res != NSERROR_OK) { + return res; } - slen = len ? len : strlen(string); /* Worst case = ASCII -> UCS4, so allocate an output buffer * 4 times larger than the input buffer, and add 4 bytes at * the end for the NULL terminator */ - rlen = slen * 4 + 4; + result_len = slen * 4 + 4; - temp = out = malloc(rlen); + temp = out = malloc(result_len); if (!out) { return NSERROR_NOMEM; } /* perform conversion */ - if (iconv(cd, (void *) &in, &slen, &out, &rlen) == (size_t)-1) { + if (iconv(cd, (void *) &in, &slen, &out, &result_len) == (size_t)-1) { free(temp); /* clear the cached conversion descriptor as it's invalid */ if (last_cd.cd) @@ -270,19 +289,22 @@ utf8_convert(const char *string, return NSERROR_NOMEM; } - *(result) = realloc(temp, out - temp + 4); - if (!(*result)) { + result_len = out - temp; + + /* resize buffer allowing for null termination */ + result = realloc(temp, result_len + 4); + if (result == NULL) { free(temp); - *(result) = NULL; /* for sanity's sake */ return NSERROR_NOMEM; } /* NULL terminate - needs 4 characters as we may have * converted to UTF-32 */ - memset((*result) + (out - temp), 0, 4); + memset(result + result_len, 0, 4); - if (result_len != NULL) { - *result_len = (out - temp); + *result_out = result; + if (result_len_out != NULL) { + *result_len_out = result_len; } return NSERROR_OK; @@ -290,14 +312,14 @@ utf8_convert(const char *string, /* exported interface documented in utils/utf8.h */ nserror utf8_to_enc(const char *string, const char *encname, - size_t len, char **result) + size_t len, char **result) { return utf8_convert(string, len, "UTF-8", encname, result, NULL); } /* exported interface documented in utils/utf8.h */ nserror utf8_from_enc(const char *string, const char *encname, - size_t len, char **result, size_t *result_len) + size_t len, char **result, size_t *result_len) { return utf8_convert(string, len, encname, "UTF-8", result, result_len); } @@ -325,10 +347,10 @@ utf8_convert_html_chunk(iconv_t cd, return NSERROR_NOMEM; ucs4 = utf8_to_ucs4(chunk, inlen); - esclen = snprintf(escape, sizeof(escape), "&#x%06x;", ucs4); + esclen = snprintf(escape, sizeof(escape), "&#x%06"PRIx32";", ucs4); pescape = escape; ret = iconv(cd, (void *) &pescape, &esclen, - (void *) out, outlen); + (void *) out, outlen); if (ret == (size_t) -1) return NSERROR_NOMEM; @@ -340,45 +362,26 @@ utf8_convert_html_chunk(iconv_t cd, return NSERROR_OK; } + + /* exported interface documented in utils/utf8.h */ nserror -utf8_to_html(const char *string, const char *encname, size_t len, char **result) +utf8_to_html(const char *string, const char *encname, size_t len, char **result_out) { iconv_t cd; const char *in; - char *out, *origout; + char *out, *origout, *result; size_t off, prev_off, inlen, outlen, origoutlen, esclen; nserror ret; char *pescape, escape[11]; + nserror res; if (len == 0) len = strlen(string); - /* we cache the last used conversion descriptor, - * so check if we're trying to use it here */ - if (strncasecmp(last_cd.from, "UTF-8", sizeof(last_cd.from)) == 0 && - strncasecmp(last_cd.to, encname, - sizeof(last_cd.to)) == 0 && - last_cd.cd != 0) { - cd = last_cd.cd; - } else { - /* no match, so create a new cd */ - cd = iconv_open(encname, "UTF-8"); - if (cd == (iconv_t) -1) { - if (errno == EINVAL) - return NSERROR_BAD_ENCODING; - /* default to no memory */ - return NSERROR_NOMEM; - } - - /* close the last cd - we don't care if this fails */ - if (last_cd.cd) - iconv_close(last_cd.cd); - - /* and safely copy the to/from/cd data into last_cd */ - snprintf(last_cd.from, sizeof(last_cd.from), "UTF-8"); - snprintf(last_cd.to, sizeof(last_cd.to), "%s", encname); - last_cd.cd = cd; + res = get_cached_cd("UTF-8", encname, &cd); + if (res != NSERROR_OK) { + return res; } /* Worst case is ASCII -> UCS4, with all characters escaped: @@ -398,13 +401,13 @@ utf8_to_html(const char *string, const char *encname, size_t len, char **result) while (off < len) { /* Must escape '&', '<', and '>' */ if (string[off] == '&' || string[off] == '<' || - string[off] == '>') { + string[off] == '>') { if (off - prev_off > 0) { /* Emit chunk */ in = string + prev_off; inlen = off - prev_off; ret = utf8_convert_html_chunk(cd, in, inlen, - &out, &outlen); + &out, &outlen); if (ret != NSERROR_OK) { free(origout); iconv_close(cd); @@ -415,10 +418,10 @@ utf8_to_html(const char *string, const char *encname, size_t len, char **result) /* Emit mandatory escape */ esclen = snprintf(escape, sizeof(escape), - "&#x%06x;", string[off]); + "&#x%06x;", string[off]); pescape = escape; ret = utf8_convert_html_chunk(cd, pescape, esclen, - &out, &outlen); + &out, &outlen); if (ret != NSERROR_OK) { free(origout); iconv_close(cd); @@ -450,11 +453,12 @@ utf8_to_html(const char *string, const char *encname, size_t len, char **result) outlen -= 4; /* Shrink-wrap */ - *result = realloc(origout, origoutlen - outlen); - if (*result == NULL) { + result = realloc(origout, origoutlen - outlen); + if (result == NULL) { free(origout); return NSERROR_NOMEM; } + *result_out = result; return NSERROR_OK; } |