Rather less hideous approach to error detection and input pointer maintenance.

We now simply decode one character at a time and check for errors afterwards. This has the benefit of being less code, clearer, and less likely to crash if encoding state changes involve memory (de)allocation, and it removes the reliance on UnicodeLib internals. It's probably slower, but correctness is more important here. Fix ISO-2022-JP-2 test data to not include characters from the JIS X 0201-1976 Kana set -- this set is not used in ISO-2022-JP-2. Implement verbose flag in *Iconv. svn path=/trunk/iconv/; revision=5712
author: John Mark Bell <jmb@netsurf-browser.org> 2008-11-18 13:54:08 +0000
committer: John Mark Bell <jmb@netsurf-browser.org> 2008-11-18 13:54:08 +0000
commit: e205a6b1ba20fd24f9cb2b461095d45fc30e9009 (patch)
tree: 9ecb266c8289b9045f85daa854bfa7b2bf3eb710
parent: 159fa67170c0edd5178c8c615f1b239728f02056 (diff)
download: iconv-e205a6b1ba20fd24f9cb2b461095d45fc30e9009.tar.gz
iconv-e205a6b1ba20fd24f9cb2b461095d45fc30e9009.tar.bz2
5 files changed, 63 insertions, 159 deletions
diff --git a/module/module.c b/module/module.c
index 2ef2326..0631551 100644
--- a/module/module.c
+++ b/module/module.c
@@ -201,7 +201,7 @@ _kernel_oserror *do_iconv(int argc, const char *args)
 {
 	char from[64] = "", to[64] = "";
 	char *f, *t;
-	bool list = false;
+	bool list = false, verbose = false;
 	char out[4096] = "";
 	char *o;
 	const char *p = args;
@@ -273,9 +273,13 @@ _kernel_oserror *do_iconv(int argc, const char *args)
 				p++;
 			argc--;
 			break;
+		case 'v':
+			verbose = true;
+			p += 2;
+			argc--;
+			break;
 		case 'c':
 		case 's':
-		case 'v':
 		default:
 			snprintf(ErrorGeneric.errmess, 
 				sizeof(ErrorGeneric.errmess),
@@ -358,7 +362,11 @@ _kernel_oserror *do_iconv(int argc, const char *args)
 		fclose(inf);
 
 		/* Convert text */
-		iconv(cd, &in, &inlen, &out, &outlen);
+		size_t read = iconv(cd, &in, &inlen, &out, &outlen);
+		if (verbose && read == (size_t) -1) {
+			fprintf(stderr, "Conversion failed: %s\n",
+					strerror(errno));
+		}
 
 		fwrite(output, 1, input_length * 4 - outlen, ofp);
 
diff --git a/src/iconv.c b/src/iconv.c
index 6cdfbb8..817822c 100644
--- a/src/iconv.c
+++ b/src/iconv.c
@@ -10,10 +10,6 @@
 
 #include <unicode/charsets.h>
 #include <unicode/encoding.h>
-/* Hacktastic */
-#define DEBUG 0
-#include <unicode/encpriv.h>
-#undef DEBUG
 
 #include <iconv/iconv.h>
 
@@ -244,34 +240,6 @@ iconv_t iconv_open(const char *tocode, const char *fromcode)
 		return (iconv_t)(-1);
 	}
 
-	if (e->in) {
-		e->in_save = calloc(1, sizeof(EncodingPriv) + 
-				((EncodingPriv *) e->in)->ws_size);
-		if (!e->in_save) {
-			if (e->out)
-				encoding_delete(e->out);
-			encoding_delete(e->in);
-			iconv_eightbit_delete(e);
-			free(e);
-			errno = ENOMEM;
-			return (iconv_t)(-1);
-		}
-	}
-
-	if (e->out) {
-		e->out_save = calloc(1, sizeof(EncodingPriv) + 
-				((EncodingPriv *) e->out)->ws_size);
-		if (!e->out_save) {
-			encoding_delete(e->out);
-			if (e->in)
-				encoding_delete(e->in);
-			iconv_eightbit_delete(e);
-			free(e);
-			errno = ENOMEM;
-			return (iconv_t)(-1);
-		}
-	}
-
 	/* add to list */
 	e->prev = 0;
 	e->next = context_list;
@@ -286,10 +254,7 @@ size_t iconv(iconv_t cd, char **inbuf, size_t *inbytesleft, char **outbuf,
 		size_t *outbytesleft)
 {
 	struct encoding_context *e;
-	unsigned int read, read2;
-	char *orig_outbuf;
-	size_t orig_outbytesleft;
-	int write_state;
+	unsigned int read;
 
 	/* search for cd in list */
 	for (e = context_list; e; e = e->next)
@@ -347,117 +312,59 @@ size_t iconv(iconv_t cd, char **inbuf, size_t *inbytesleft, char **outbuf,
 		return (size_t)(-1);
 	}
 
-	/* This is plain ugly. To be able to detect when each type of 
-	 * conversion error has occurred and maintain the correct pointer
-	 * into the input on error, we have to attempt to perform the
-	 * conversion then try it again and play spot the difference in
-	 * return values. As some encodings are stateful, we also need to
-	 * be able to preserve the current state of encoding contexts. This
-	 * requires knowledge of UnicodeLib's internal data structures. To
-	 * save pain later, I'm assuming that UnicodeLib's encpriv.h is
-	 * available at compile time. The cleaner approach of adding API to 
-	 * UnicodeLib seems pointless, as I can envisage no other use case 
-	 * than API munging for wanting to save/restore the state of codec 
-	 * instances.
-	 */
-
-	orig_outbuf = *outbuf;
-	orig_outbytesleft = *outbytesleft;
-
 	e->outbuf = outbuf;
 	e->outbytesleft = outbytesleft;
 
-	/* Try to convert all the input */
-	e->req_chars = INT_MAX;
-	e->chars_processed = 0;
-	e->write_state = WRITE_SUCCESS;
-
-	/* Save codec states */
-	if (e->in) {
-		memcpy(e->in_save, e->in, sizeof(EncodingPriv) + 
-				((EncodingPriv *) e->in)->ws_size);
-	}
-	if (e->out) {
-		memcpy(e->out_save, e->out, sizeof(EncodingPriv) +
-				((EncodingPriv *) e->out)->ws_size);
-	}
-
 	LOG(("reading"));
 
-	if (e->in)
-		read = encoding_read(e->in, character_callback, *inbuf,
-				*inbytesleft, e);
-	else
-		read = iconv_eightbit_read(e, character_callback, *inbuf,
-				*inbytesleft, e);
-
-	/* Record write state of first attempt (determines most errors) */
-	write_state = e->write_state;
-
-	/* Reset the output buffer pointer/length */
-	*outbuf = orig_outbuf;
-	*outbytesleft = orig_outbytesleft;
-
-	/* Shortcut failure to process first character of input */
-	if (e->chars_processed == 0) {
-		errno = write_state == WRITE_SUCCESS 
-			? EINVAL 
-			: write_state == WRITE_FAILED ? EILSEQ : E2BIG;
-		return (size_t) -1;
-	}
+	/* Perform the conversion.
+	 *
+	 * To ensure that we detect the correct error conditions
+	 * and point to the _start_ of erroneous input on error, we
+	 * have to convert each character independently. Then we
+	 * inspect for errors and only continue if there were none.
+	 */
+	while (*inbytesleft > 0) {
+		/* Clear current write state */
+		e->write_state = WRITE_NONE;
 
-	/* Now require the number of chars processed */
-	e->req_chars = e->chars_processed;
-	e->chars_processed = 0;
-	e->write_state = WRITE_SUCCESS;
+		if (e->in)
+			read = encoding_read(e->in, character_callback, *inbuf,
+					*inbytesleft, e);
+		else
+			read = iconv_eightbit_read(e, character_callback, 
+					*inbuf, *inbytesleft, e);
+
+		/* Stop on error */
+		if (e->write_state != WRITE_SUCCESS)
+			break;
 
-	/* Restore codec states */
-	if (e->in) {
-		memcpy(e->in, e->in_save, sizeof(EncodingPriv) + 
-				((EncodingPriv *) e->in)->ws_size);
-	}
-	if (e->out) {
-		memcpy(e->out, e->out_save, sizeof(EncodingPriv) +
-				((EncodingPriv *) e->out)->ws_size);
+		/* Advance input */
+		*inbuf += read;
+		*inbytesleft -= read;
 	}
 
-	/* And try again */
-	if (e->in)
-		read2 = encoding_read(e->in, character_callback, *inbuf,
-				*inbytesleft, e);
-	else
-		read2 = iconv_eightbit_read(e, character_callback, *inbuf,
-				*inbytesleft, e);
-
 	LOG(("done"));
 
 	LOG(("read: %d, ibl: %zd, obl: %zd", 
-			read2, *inbytesleft, *outbytesleft));
-
-	/* 2 or 3 */
-	if (write_state == WRITE_SUCCESS) {
-		*inbuf += read2;
-		*inbytesleft -= read2;
-
-		if (*inbytesleft > 0) {
-			errno = EINVAL;
-		} else {
-			return 0;
-		}
-	}
-	/* 4 */
-	else if (write_state == WRITE_NOMEM) {
-		LOG(("e2big"));
-		*inbuf += read2;
-		*inbytesleft -= read2;
+			read, *inbytesleft, *outbytesleft));
+
+	/* Determine correct return value/error code */
+	switch (e->write_state) {
+	case WRITE_SUCCESS: /* 2 */
+		/** \todo We really should calculate the correct number of 
+		 * irreversible conversions that have been performed. For now, 
+		 * assume everything's reversible. */
+		return 0;
+	case WRITE_NONE:    /* 3 */
+		errno = EINVAL;
+		break;
+	case WRITE_NOMEM:   /* 4 */
 		errno = E2BIG;
-	}
-	/* 1 */
-	else if (write_state == WRITE_FAILED) {
-		*inbuf += read2;
-		*inbytesleft -= read2;
-		LOG(("eilseq"));
+		break;
+	case WRITE_FAILED:  /* 1 */
 		errno = EILSEQ;
+		break;
 	}
 
 	LOG(("errno: %d", errno));
@@ -478,14 +385,10 @@ int iconv_close(iconv_t cd)
 	if (!e)
 		return 0;
 
-	if (e->in) {
+	if (e->in)
 		encoding_delete(e->in);
-		free(e->in_save);
-	}
-	if (e->out) {
+	if (e->out)
 		encoding_delete(e->out);
-		free(e->out_save);
-	}
 	iconv_eightbit_delete(e);
 
 	/* remove from list */
@@ -581,27 +484,19 @@ int character_callback(void *handle, UCS4 c)
 					--*e->outbytesleft;
 
 					e->write_state = WRITE_SUCCESS;
-
-					ret = 1;
 				} else {
 					e->write_state = WRITE_NOMEM;
-					ret = 0;
 				}
 			} else {
 				e->write_state = WRITE_NOMEM;
-				ret = 0;
 			}
 		} else {
 			e->write_state = WRITE_FAILED;
-			ret = 0;
 		}
 	}
 
-	if (e->write_state == WRITE_SUCCESS &&
-			++e->chars_processed == e->req_chars)
-		ret = 0;
-
-	return (!ret);
+	/* Always stop after processing each character */
+	return 1;
 }
 
 void parse_parameters(struct encoding_context *e, const char *params,
diff --git a/src/internal.h b/src/internal.h
index ce415ca..9150efc 100644
--- a/src/internal.h
+++ b/src/internal.h
@@ -16,18 +16,19 @@
 
 struct encoding_context {
 	Encoding *in;
-	void *in_save;
 	unsigned int inflags;
 	Encoding *out;
-	void *out_save;
 	unsigned int outflags;
 	unsigned short *intab, *outtab;
 	char **outbuf;
 	size_t *outbytesleft;
 	char transliterate;
-	enum { WRITE_SUCCESS, WRITE_FAILED, WRITE_NOMEM } write_state;
-	int chars_processed;
-	int req_chars;
+	enum {
+		WRITE_SUCCESS, 
+		WRITE_FAILED, 
+		WRITE_NOMEM, 
+		WRITE_NONE
+	} write_state;
 	struct encoding_context *prev, *next;
 };
 
diff --git a/test/GNU/ISO-2022-JP-2-snippet b/test/GNU/ISO-2022-JP-2-snippet
index 3e297b8..40fae83 100644
--- a/test/GNU/ISO-2022-JP-2-snippet
+++ b/test/GNU/ISO-2022-JP-2-snippet
@@ -1,4 +1,4 @@
-Japanese ($BF|K\8l(B)		$B$3$s$K$A$O(B, (I:]FAJ(B
+Japanese ($BF|K\8l(B)		$B$3$s$K$A$O(B
 	JIS  -- $B855$(B  $B3+H/(B
 Just for a test of JISX0212: $BqV$(DiQ(B (the second character is of JISX0212)
 Chinese ($BCfJ8(B,$BIaDL$A;0(B,$A::So(B)	$(D0_$B9%(B
diff --git a/test/GNU/ISO-2022-JP-2-snippet.UTF-8 b/test/GNU/ISO-2022-JP-2-snippet.UTF-8
index 6c63925..99d453b 100644
--- a/test/GNU/ISO-2022-JP-2-snippet.UTF-8
+++ b/test/GNU/ISO-2022-JP-2-snippet.UTF-8
@@ -1,4 +1,4 @@
-Japanese (日本語)		こんにちは, ｺﾝﾆﾁﾊ
+Japanese (日本語)		こんにちは
 	JIS  -- 元気  開発
 Just for a test of JISX0212: 騏驎 (the second character is of JISX0212)
 Chinese (中文,普通话,汉语)	你好
author	John Mark Bell <jmb@netsurf-browser.org>	2008-11-18 13:54:08 +0000
committer	John Mark Bell <jmb@netsurf-browser.org>	2008-11-18 13:54:08 +0000
commit	e205a6b1ba20fd24f9cb2b461095d45fc30e9009 (patch)
tree	9ecb266c8289b9045f85daa854bfa7b2bf3eb710
parent	159fa67170c0edd5178c8c615f1b239728f02056 (diff)
download	iconv-e205a6b1ba20fd24f9cb2b461095d45fc30e9009.tar.gz iconv-e205a6b1ba20fd24f9cb2b461095d45fc30e9009.tar.bz2