From 0bcfdbeb50b2435b848ea1fd50ccc79ba64fd129 Mon Sep 17 00:00:00 2001
From: John Mark Bell <jmb@netsurf-browser.org>
Date: Tue, 7 Jun 2005 21:29:26 +0000
Subject: [project @ 2005-06-07 21:29:26 by jmb] Lose cnv_str_local_enc and
 friends. UTF-8 conversion functions now return an enumerated type allowing
 for fallbacks, if appropriate.

svn path=/import/netsurf/; revision=1744
---
 utils/utf8.c | 67 +++++++++++++++++++++++++++++++++++++-----------------------
 1 file changed, 42 insertions(+), 25 deletions(-)

(limited to 'utils/utf8.c')

diff --git a/utils/utf8.c b/utils/utf8.c
index b2a219ced..3763b3af0 100644
--- a/utils/utf8.c
+++ b/utils/utf8.c
@@ -10,23 +10,23 @@
  */
 
 #include <assert.h>
+#include <errno.h>
 #include <stdlib.h>
 #include <string.h>
 
 #include <iconv.h>
 
+#include "netsurf/utils/log.h"
 #include "netsurf/utils/utf8.h"
 
-static char *utf8_convert(const char *string, size_t len, const char *from,
-		const char *to);
+static utf8_convert_ret utf8_convert(const char *string, size_t len,
+		const char *from, const char *to, char **result);
 
 /**
  * Convert a UTF-8 multibyte sequence into a single UCS4 character
  *
  * Encoding of UCS values outside the UTF-16 plane has been removed from
- * RFC3629. This function conforms to RFC2279, however, as it is possible
- * that the platform specific keyboard input handler will generate a UCS4
- * value outside the UTF-16 plane.
+ * RFC3629. This function conforms to RFC2279, however.
  *
  * \param s  The sequence to process
  * \param l  Length of sequence
@@ -72,9 +72,7 @@ size_t utf8_to_ucs4(const char *s, size_t l)
  * Convert a single UCS4 character into a UTF-8 multibyte sequence
  *
  * Encoding of UCS values outside the UTF-16 plane has been removed from
- * RFC3629. This function conforms to RFC2279, however, as it is possible
- * that the platform specific keyboard input handler will generate a UCS4
- * value outside the UTF-16 plane.
+ * RFC3629. This function conforms to RFC2279, however.
  *
  * \param c  The character to process (0 <= c <= 0x7FFFFFFF)
  * \param s  Pointer to 6 byte long output buffer
@@ -207,24 +205,28 @@ size_t utf8_next(const char *s, size_t l, size_t o)
  * \param string  The NULL-terminated string to convert
  * \param encname The encoding name (suitable for passing to iconv)
  * \param len     Length of input string to consider (in bytes), or 0
- * \return Pointer to converted string (on heap) or NULL on error
+ * \param result  Pointer to location to store result (allocated on heap)
+ * \return Appropriate utf8_convert_ret value
  */
-char *utf8_to_enc(const char *string, const char *encname, size_t len)
+utf8_convert_ret utf8_to_enc(const char *string, const char *encname,
+		size_t len, char **result)
 {
-	return utf8_convert(string, len, "UTF-8", encname);
+	return utf8_convert(string, len, "UTF-8", encname, result);
 }
 
 /**
- * Convert a UTF8 string into the named encoding
+ * Convert a string in the named encoding into a UTF-8 string
  *
  * \param string  The NULL-terminated string to convert
  * \param encname The encoding name (suitable for passing to iconv)
  * \param len     Length of input string to consider (in bytes), or 0
- * \return Pointer to converted string (on heap) or NULL on error
+ * \param result  Pointer to location to store result (allocated on heap)
+ * \return Appropriate utf8_convert_ret value
  */
-char *utf8_from_enc(const char *string, const char *encname, size_t len)
+utf8_convert_ret utf8_from_enc(const char *string, const char *encname,
+		size_t len, char **result)
 {
-	return utf8_convert(string, len, encname, "UTF-8");
+	return utf8_convert(string, len, encname, "UTF-8", result);
 }
 
 /**
@@ -234,23 +236,27 @@ char *utf8_from_enc(const char *string, const char *encname, size_t len)
  * \param len     Length of input string to consider (in bytes)
  * \param from    The encoding name to convert from
  * \param to      The encoding name to convert to
- * \return Pointer to converted string (on heap) or NULL on error
+ * \param result  Pointer to location to store result (allocated on heap)
+ * \return Appropriate utf8_convert_ret value
  */
-char *utf8_convert(const char *string, size_t len, const char *from,
-		const char *to)
+utf8_convert_ret utf8_convert(const char *string, size_t len,
+		const char *from, const char *to, char **result)
 {
 	iconv_t cd;
 	char *ret, *temp, *out, *in;
 	size_t slen, rlen;
 
-	if (!string || !from || !to)
-		return NULL;
+	assert(string && from && to && result);
 
 	in = (char *)string;
 
 	cd = iconv_open(to, from);
-	if (cd == (iconv_t)-1)
-		return NULL;
+	if (cd == (iconv_t)-1) {
+		if (errno == EINVAL)
+			return UTF8_CONVERT_BADENC;
+		/* any other failure is reported as no memory */
+		return UTF8_CONVERT_NOMEM;
+	}
 
 	slen = len ? len : strlen(string);
 	/* Worst case = ACSII -> UCS4, so allocate an output buffer
@@ -262,14 +268,19 @@ char *utf8_convert(const char *string, size_t len, const char *from,
 	temp = out = calloc(rlen, sizeof(char));
 	if (!out) {
 		iconv_close(cd);
-		return NULL;
+		return UTF8_CONVERT_NOMEM;
 	}
 
 	/* perform conversion */
 	if (iconv(cd, &in, &slen, &out, &rlen) == (size_t)-1) {
 		free(temp);
 		iconv_close(cd);
-		return NULL;
+		/** \todo Distinguish the failure cases instead of reporting
+		 * them all as no memory. iconv() has 3 possible error cases:
+		 * a) Insufficiently large output buffer (errno == E2BIG)
+		 * b) Invalid input byte sequence (errno == EILSEQ)
+		 * c) Incomplete input sequence (errno == EINVAL) */
+		return UTF8_CONVERT_NOMEM;
 	}
 
 	iconv_close(cd);
@@ -277,12 +288,18 @@ char *utf8_convert(const char *string, size_t len, const char *from,
 	if (rlen > 64 /* allow 64bytes wasted space */) {
 		/* and allocate a more sensibly sized output buffer */
 		ret = calloc(out - temp + 4, sizeof(char));
+		if (!ret) {
+			free(temp);
+			return UTF8_CONVERT_NOMEM;
+		}
 		memcpy(ret, temp, out - temp);
 		free(temp);
 	}
 	else
 		ret = temp;
 
-	return ret;
+	*result = ret;
+
+	return UTF8_CONVERT_OK;
 }
 
-- 
cgit v1.2.3