summary refs log tree commit diff
path: root/utils/utf8.c
diff options
context:
space:
mode:
Diffstat (limited to 'utils/utf8.c')
-rw-r--r-- utils/utf8.c 206
1 files changed, 105 insertions, 101 deletions
diff --git a/utils/utf8.c b/utils/utf8.c
index f0ac0c9b2..3eedd0810 100644
--- a/utils/utf8.c
+++ b/utils/utf8.c
@@ -32,6 +32,7 @@
#include "utils/log.h"
#include "utils/utf8.h"
+#include "netsurf/inttypes.h"
#include "netsurf/utf8.h"
#include "desktop/gui_internal.h"
@@ -44,7 +45,7 @@ uint32_t utf8_to_ucs4(const char *s_in, size_t l)
parserutils_error perror;
perror = parserutils_charset_utf8_to_ucs4((const uint8_t *) s_in, l,
- &ucs4, &len);
+ &ucs4, &len);
if (perror != PARSERUTILS_OK)
ucs4 = 0xfffd;
@@ -106,7 +107,7 @@ size_t utf8_char_byte_length(const char *s)
parserutils_error perror;
perror = parserutils_charset_utf8_char_byte_length((const uint8_t *) s,
- &len);
+ &len);
assert(perror == PARSERUTILS_OK);
return len;
@@ -131,7 +132,7 @@ size_t utf8_next(const char *s, size_t l, size_t o)
parserutils_error perror;
perror = parserutils_charset_utf8_next((const uint8_t *) s, l, o,
- &next);
+ &next);
assert(perror == PARSERUTILS_OK);
return next;
@@ -151,6 +152,47 @@ static inline void utf8_clear_cd_cache(void)
last_cd.cd = 0;
}
+/**
+ * Obtain a conversion descriptor, reusing the cached one where possible.
+ *
+ * A single descriptor is remembered in last_cd together with the encoding
+ * names it was opened for. If the requested pair matches those names
+ * (compared case-insensitively) and a descriptor is cached, it is returned.
+ * Otherwise a new descriptor is opened with iconv_open(), the previously
+ * cached descriptor (if any) is closed, and the new one replaces it in
+ * last_cd.
+ *
+ * The descriptor handed back is the same one stored in last_cd, and it is
+ * closed by this function on a later cache miss.
+ *
+ * \param enc_from The encoding name to convert from
+ * \param enc_to The encoding name to convert to
+ * \param cd_out Pointer to location in which to store the descriptor;
+ * only written when NSERROR_OK is returned.
+ * \return NSERROR_OK on success, NSERROR_BAD_ENCODING if iconv_open()
+ * fails with errno set to EINVAL, NSERROR_NOMEM for any other
+ * iconv_open() failure. On failure last_cd is left unchanged.
+ */
+static nserror
+get_cached_cd(const char *enc_from, const char *enc_to, iconv_t *cd_out)
+{
+ iconv_t cd;
+ /* we cache the last used conversion descriptor,
+ * so check if we're trying to use it here.
+ * The name comparisons are case-insensitive and bounded by the
+ * size of the cache's name buffers; a zero descriptor means
+ * nothing is cached, so it never counts as a hit. */
+ if (strncasecmp(last_cd.from, enc_from, sizeof(last_cd.from)) == 0 &&
+ strncasecmp(last_cd.to, enc_to, sizeof(last_cd.to)) == 0 &&
+ last_cd.cd != 0) {
+ *cd_out = last_cd.cd;
+ return NSERROR_OK;
+ }
+
+ /* no match, so create a new cd
+ * (iconv_open() takes the destination encoding first) */
+ cd = iconv_open(enc_to, enc_from);
+ if (cd == (iconv_t) -1) {
+ if (errno == EINVAL) {
+ return NSERROR_BAD_ENCODING;
+ }
+ /* default to no memory: every other errno value is
+ * reported to the caller as NSERROR_NOMEM */
+ return NSERROR_NOMEM;
+ }
+
+ /* close the last cd - we don't care if this fails */
+ if (last_cd.cd) {
+ iconv_close(last_cd.cd);
+ }
+
+ /* and safely copy the to/from/cd data into last_cd;
+ * snprintf() bounds each name to the size of its buffer */
+ snprintf(last_cd.from, sizeof(last_cd.from), "%s", enc_from);
+ snprintf(last_cd.to, sizeof(last_cd.to), "%s", enc_to);
+ *cd_out = last_cd.cd = cd;
+
+ return NSERROR_OK;
+}
+
/* exported interface documented in utils/utf8.h */
nserror utf8_finalise(void)
{
@@ -168,95 +210,72 @@ nserror utf8_finalise(void)
* Convert a string from one encoding to another
*
* \param string The NULL-terminated string to convert
- * \param len Length of input string to consider (in bytes), or 0
+ * \param slen Length of input string to consider (in bytes), or 0 to use strlen(string)
* \param from The encoding name to convert from
* \param to The encoding name to convert to
- * \param result Pointer to location in which to store result.
- * \param result_len Pointer to location in which to store result length.
+ * \param result_out Pointer to location in which to store result.
+ * \param result_len_out Pointer to location in which to store result length.
* \return NSERROR_OK for no error, NSERROR_NOMEM on allocation error,
* NSERROR_BAD_ENCODING for a bad character encoding
*/
static nserror
utf8_convert(const char *string,
- size_t len,
+ size_t slen,
const char *from,
const char *to,
- char **result,
- size_t *result_len)
+ char **result_out,
+ size_t *result_len_out)
{
iconv_t cd;
- char *temp, *out, *in;
- size_t slen, rlen;
-
- assert(string && from && to && result);
-
- if (string[0] == '\0') {
- /* On AmigaOS, iconv() returns an error if we pass an
- * empty string. This prevents iconv() being called as
- * there is no conversion necessary anyway. */
- *result = strdup("");
- if (!(*result)) {
- *result = NULL;
- return NSERROR_NOMEM;
- }
+ char *temp, *out, *in, *result;
+ size_t result_len;
+ nserror res;
- return NSERROR_OK;
+ assert(string && from && to && result_out);
+
+ /* calculate the source length if not given */
+ if (slen==0) {
+ slen = strlen(string);
}
- if (strcasecmp(from, to) == 0) {
- /* conversion from an encoding to itself == strdup */
- slen = len ? len : strlen(string);
- *(result) = strndup(string, slen);
- if (!(*result)) {
- *(result) = NULL;
+ /* Short-circuit when no conversion is needed: either the input
+ * is empty, or the source and destination encodings are the same.
+ * In both cases the result is simply a copy of the input.
+ *
+ * Handling the empty string here is necessary on AmigaOS, as
+ * iconv() returns an error if an empty string is passed.
+ */
+ if ((slen == 0) || (strcasecmp(from, to) == 0)) {
+ *result_out = strndup(string, slen);
+ if (*result_out == NULL) {
return NSERROR_NOMEM;
}
+ if (result_len_out != NULL) {
+ *result_len_out = slen;
+ }
return NSERROR_OK;
}
in = (char *)string;
- /* we cache the last used conversion descriptor,
- * so check if we're trying to use it here */
- if (strncasecmp(last_cd.from, from, sizeof(last_cd.from)) == 0 &&
- strncasecmp(last_cd.to, to, sizeof(last_cd.to)) == 0) {
- cd = last_cd.cd;
- }
- else {
- /* no match, so create a new cd */
- cd = iconv_open(to, from);
- if (cd == (iconv_t)-1) {
- if (errno == EINVAL)
- return NSERROR_BAD_ENCODING;
- /* default to no memory */
- return NSERROR_NOMEM;
- }
-
- /* close the last cd - we don't care if this fails */
- if (last_cd.cd)
- iconv_close(last_cd.cd);
-
- /* and copy the to/from/cd data into last_cd */
- snprintf(last_cd.from, sizeof(last_cd.from), "%s", from);
- snprintf(last_cd.to, sizeof(last_cd.to), "%s", to);
- last_cd.cd = cd;
+ res = get_cached_cd(from, to, &cd);
+ if (res != NSERROR_OK) {
+ return res;
}
- slen = len ? len : strlen(string);
/* Worst case = ASCII -> UCS4, so allocate an output buffer
* 4 times larger than the input buffer, and add 4 bytes at
* the end for the NULL terminator
*/
- rlen = slen * 4 + 4;
+ result_len = slen * 4 + 4;
- temp = out = malloc(rlen);
+ temp = out = malloc(result_len);
if (!out) {
return NSERROR_NOMEM;
}
/* perform conversion */
- if (iconv(cd, (void *) &in, &slen, &out, &rlen) == (size_t)-1) {
+ if (iconv(cd, (void *) &in, &slen, &out, &result_len) == (size_t)-1) {
free(temp);
/* clear the cached conversion descriptor as it's invalid */
if (last_cd.cd)
@@ -270,19 +289,22 @@ utf8_convert(const char *string,
return NSERROR_NOMEM;
}
- *(result) = realloc(temp, out - temp + 4);
- if (!(*result)) {
+ result_len = out - temp;
+
+ /* resize buffer allowing for null termination */
+ result = realloc(temp, result_len + 4);
+ if (result == NULL) {
free(temp);
- *(result) = NULL; /* for sanity's sake */
return NSERROR_NOMEM;
}
/* NULL terminate - needs 4 characters as we may have
* converted to UTF-32 */
- memset((*result) + (out - temp), 0, 4);
+ memset(result + result_len, 0, 4);
- if (result_len != NULL) {
- *result_len = (out - temp);
+ *result_out = result;
+ if (result_len_out != NULL) {
+ *result_len_out = result_len;
}
return NSERROR_OK;
@@ -290,14 +312,14 @@ utf8_convert(const char *string,
/* exported interface documented in utils/utf8.h
 *
 * Converts string from UTF-8 to encname by delegating to utf8_convert().
 * NULL is passed for the result length, so no length is reported back.
 */
nserror utf8_to_enc(const char *string, const char *encname,
- size_t len, char **result)
+ size_t len, char **result)
{
return utf8_convert(string, len, "UTF-8", encname, result, NULL);
}
/* exported interface documented in utils/utf8.h
 *
 * Converts string from encname to UTF-8 by delegating to utf8_convert().
 * result_len is forwarded unchanged as the location for the result length.
 */
nserror utf8_from_enc(const char *string, const char *encname,
- size_t len, char **result, size_t *result_len)
+ size_t len, char **result, size_t *result_len)
{
return utf8_convert(string, len, encname, "UTF-8", result, result_len);
}
@@ -325,10 +347,10 @@ utf8_convert_html_chunk(iconv_t cd,
return NSERROR_NOMEM;
ucs4 = utf8_to_ucs4(chunk, inlen);
- esclen = snprintf(escape, sizeof(escape), "&#x%06x;", ucs4);
+ esclen = snprintf(escape, sizeof(escape), "&#x%06"PRIx32";", ucs4);
pescape = escape;
ret = iconv(cd, (void *) &pescape, &esclen,
- (void *) out, outlen);
+ (void *) out, outlen);
if (ret == (size_t) -1)
return NSERROR_NOMEM;
@@ -340,45 +362,26 @@ utf8_convert_html_chunk(iconv_t cd,
return NSERROR_OK;
}
+
+
/* exported interface documented in utils/utf8.h */
nserror
-utf8_to_html(const char *string, const char *encname, size_t len, char **result)
+utf8_to_html(const char *string, const char *encname, size_t len, char **result_out)
{
iconv_t cd;
const char *in;
- char *out, *origout;
+ char *out, *origout, *result;
size_t off, prev_off, inlen, outlen, origoutlen, esclen;
nserror ret;
char *pescape, escape[11];
+ nserror res;
if (len == 0)
len = strlen(string);
- /* we cache the last used conversion descriptor,
- * so check if we're trying to use it here */
- if (strncasecmp(last_cd.from, "UTF-8", sizeof(last_cd.from)) == 0 &&
- strncasecmp(last_cd.to, encname,
- sizeof(last_cd.to)) == 0 &&
- last_cd.cd != 0) {
- cd = last_cd.cd;
- } else {
- /* no match, so create a new cd */
- cd = iconv_open(encname, "UTF-8");
- if (cd == (iconv_t) -1) {
- if (errno == EINVAL)
- return NSERROR_BAD_ENCODING;
- /* default to no memory */
- return NSERROR_NOMEM;
- }
-
- /* close the last cd - we don't care if this fails */
- if (last_cd.cd)
- iconv_close(last_cd.cd);
-
- /* and safely copy the to/from/cd data into last_cd */
- snprintf(last_cd.from, sizeof(last_cd.from), "UTF-8");
- snprintf(last_cd.to, sizeof(last_cd.to), "%s", encname);
- last_cd.cd = cd;
+ res = get_cached_cd("UTF-8", encname, &cd);
+ if (res != NSERROR_OK) {
+ return res;
}
/* Worst case is ASCII -> UCS4, with all characters escaped:
@@ -398,13 +401,13 @@ utf8_to_html(const char *string, const char *encname, size_t len, char **result)
while (off < len) {
/* Must escape '&', '<', and '>' */
if (string[off] == '&' || string[off] == '<' ||
- string[off] == '>') {
+ string[off] == '>') {
if (off - prev_off > 0) {
/* Emit chunk */
in = string + prev_off;
inlen = off - prev_off;
ret = utf8_convert_html_chunk(cd, in, inlen,
- &out, &outlen);
+ &out, &outlen);
if (ret != NSERROR_OK) {
free(origout);
iconv_close(cd);
@@ -415,10 +418,10 @@ utf8_to_html(const char *string, const char *encname, size_t len, char **result)
/* Emit mandatory escape */
esclen = snprintf(escape, sizeof(escape),
- "&#x%06x;", string[off]);
+ "&#x%06x;", string[off]);
pescape = escape;
ret = utf8_convert_html_chunk(cd, pescape, esclen,
- &out, &outlen);
+ &out, &outlen);
if (ret != NSERROR_OK) {
free(origout);
iconv_close(cd);
@@ -450,11 +453,12 @@ utf8_to_html(const char *string, const char *encname, size_t len, char **result)
outlen -= 4;
/* Shrink-wrap */
- *result = realloc(origout, origoutlen - outlen);
- if (*result == NULL) {
+ result = realloc(origout, origoutlen - outlen);
+ if (result == NULL) {
free(origout);
return NSERROR_NOMEM;
}
+ *result_out = result;
return NSERROR_OK;
}