summaryrefslogtreecommitdiff
diff options
context:
space:
mode:
-rw-r--r--src/eightbit.c19
-rw-r--r--src/iconv.c195
-rw-r--r--src/internal.h5
3 files changed, 180 insertions, 39 deletions
diff --git a/src/eightbit.c b/src/eightbit.c
index c8ebcba..ae81735 100644
--- a/src/eightbit.c
+++ b/src/eightbit.c
@@ -133,23 +133,34 @@ unsigned iconv_eightbit_read(struct encoding_context *e,
if (c < 0x80) {
/* ASCII */
- if (callback(handle, c))
+ if (callback(handle, c)) {
+ /* Used character, so update pos */
+ pos++;
break;
+ }
}
else if (c < 0x100 && e->intab) {
LOG(("maps to: %x", e->intab[c - 0x80]));
/* Look up in mapping table */
if (e->intab[c - 0x80] != 0xffff) {
- if (callback(handle, e->intab[c - 0x80]))
+ if (callback(handle, e->intab[c - 0x80])) {
+ pos++;
break;
+ }
}
else {
/* character not defined in this encoding */
- return pos;
+ if (callback(handle, 0xfffd)) {
+ pos++;
+ break;
+ }
}
} else {
/* character not defined in this encoding */
- break;
+ if (callback(handle, 0xfffd)) {
+ pos++;
+ break;
+ }
}
}
diff --git a/src/iconv.c b/src/iconv.c
index 7ea9f4c..6cdfbb8 100644
--- a/src/iconv.c
+++ b/src/iconv.c
@@ -1,6 +1,7 @@
/* iconv implementation - see iconv.h for docs */
#include <errno.h>
+#include <limits.h>
#include <stdbool.h>
#include <stdint.h>
#include <stdio.h>
@@ -9,6 +10,10 @@
#include <unicode/charsets.h>
#include <unicode/encoding.h>
+/* Hacktastic */
+#define DEBUG 0
+#include <unicode/encpriv.h>
+#undef DEBUG
#include <iconv/iconv.h>
@@ -239,6 +244,34 @@ iconv_t iconv_open(const char *tocode, const char *fromcode)
return (iconv_t)(-1);
}
+ if (e->in) {
+ e->in_save = calloc(1, sizeof(EncodingPriv) +
+ ((EncodingPriv *) e->in)->ws_size);
+ if (!e->in_save) {
+ if (e->out)
+ encoding_delete(e->out);
+ encoding_delete(e->in);
+ iconv_eightbit_delete(e);
+ free(e);
+ errno = ENOMEM;
+ return (iconv_t)(-1);
+ }
+ }
+
+ if (e->out) {
+ e->out_save = calloc(1, sizeof(EncodingPriv) +
+ ((EncodingPriv *) e->out)->ws_size);
+ if (!e->out_save) {
+ encoding_delete(e->out);
+ if (e->in)
+ encoding_delete(e->in);
+ iconv_eightbit_delete(e);
+ free(e);
+ errno = ENOMEM;
+ return (iconv_t)(-1);
+ }
+ }
+
/* add to list */
e->prev = 0;
e->next = context_list;
@@ -253,7 +286,10 @@ size_t iconv(iconv_t cd, char **inbuf, size_t *inbytesleft, char **outbuf,
size_t *outbytesleft)
{
struct encoding_context *e;
- unsigned read;
+ unsigned int read, read2;
+ char *orig_outbuf;
+ size_t orig_outbytesleft;
+ int write_state;
/* search for cd in list */
for (e = context_list; e; e = e->next)
@@ -311,9 +347,41 @@ size_t iconv(iconv_t cd, char **inbuf, size_t *inbytesleft, char **outbuf,
return (size_t)(-1);
}
+ /* This is plain ugly. To be able to detect when each type of
+ * conversion error has occurred and maintain the correct pointer
+ * into the input on error, we have to attempt to perform the
+ * conversion then try it again and play spot the difference in
+ * return values. As some encodings are stateful, we also need to
+ * be able to preserve the current state of encoding contexts. This
+ * requires knowledge of UnicodeLib's internal data structures. To
+ * save pain later, I'm assuming that UnicodeLib's encpriv.h is
+ * available at compile time. The cleaner approach of adding API to
+ * UnicodeLib seems pointless, as I can envisage no other use case
+ * than API munging for wanting to save/restore the state of codec
+ * instances.
+ */
+
+ orig_outbuf = *outbuf;
+ orig_outbytesleft = *outbytesleft;
+
e->outbuf = outbuf;
e->outbytesleft = outbytesleft;
+ /* Try to convert all the input */
+ e->req_chars = INT_MAX;
+ e->chars_processed = 0;
+ e->write_state = WRITE_SUCCESS;
+
+ /* Save codec states */
+ if (e->in) {
+ memcpy(e->in_save, e->in, sizeof(EncodingPriv) +
+ ((EncodingPriv *) e->in)->ws_size);
+ }
+ if (e->out) {
+ memcpy(e->out_save, e->out, sizeof(EncodingPriv) +
+ ((EncodingPriv *) e->out)->ws_size);
+ }
+
LOG(("reading"));
if (e->in)
@@ -323,40 +391,74 @@ size_t iconv(iconv_t cd, char **inbuf, size_t *inbytesleft, char **outbuf,
read = iconv_eightbit_read(e, character_callback, *inbuf,
*inbytesleft, e);
+ /* Record write state of first attempt (determines most errors) */
+ write_state = e->write_state;
+
+ /* Reset the output buffer pointer/length */
+ *outbuf = orig_outbuf;
+ *outbytesleft = orig_outbytesleft;
+
+ /* Shortcut failure to process first character of input */
+ if (e->chars_processed == 0) {
+ errno = write_state == WRITE_SUCCESS
+ ? EINVAL
+ : write_state == WRITE_FAILED ? EILSEQ : E2BIG;
+ return (size_t) -1;
+ }
+
+ /* Now require the number of chars processed */
+ e->req_chars = e->chars_processed;
+ e->chars_processed = 0;
+ e->write_state = WRITE_SUCCESS;
+
+ /* Restore codec states */
+ if (e->in) {
+ memcpy(e->in, e->in_save, sizeof(EncodingPriv) +
+ ((EncodingPriv *) e->in)->ws_size);
+ }
+ if (e->out) {
+ memcpy(e->out, e->out_save, sizeof(EncodingPriv) +
+ ((EncodingPriv *) e->out)->ws_size);
+ }
+
+ /* And try again */
+ if (e->in)
+ read2 = encoding_read(e->in, character_callback, *inbuf,
+ *inbytesleft, e);
+ else
+ read2 = iconv_eightbit_read(e, character_callback, *inbuf,
+ *inbytesleft, e);
+
LOG(("done"));
LOG(("read: %d, ibl: %zd, obl: %zd",
- read, *inbytesleft, *outbytesleft));
+ read2, *inbytesleft, *outbytesleft));
- /* 2 */
- if (read == *inbytesleft) {
- *inbuf += read;
- *inbytesleft = 0;
- return 0;
+ /* 2 or 3 */
+ if (write_state == WRITE_SUCCESS) {
+ *inbuf += read2;
+ *inbytesleft -= read2;
+
+ if (*inbytesleft > 0) {
+ errno = EINVAL;
+ } else {
+ return 0;
+ }
}
/* 4 */
- else if ((int)*outbytesleft < 0) {
+ else if (write_state == WRITE_NOMEM) {
LOG(("e2big"));
- *outbytesleft = 0;
- *inbuf += read - 1;
- *inbytesleft -= read - 1;
+ *inbuf += read2;
+ *inbytesleft -= read2;
errno = E2BIG;
}
- /** \todo find a mechanism for distinguishing between 1 & 3 */
/* 1 */
- else if (read != *inbytesleft) {
- *inbuf += read;
- *inbytesleft -= read;
+ else if (write_state == WRITE_FAILED) {
+ *inbuf += read2;
+ *inbytesleft -= read2;
LOG(("eilseq"));
errno = EILSEQ;
}
- /* 3 */
- else if ((int)*outbytesleft >= 0) {
- *inbuf += read;
- *inbytesleft -= read;
- LOG(("einval"));
- errno = EINVAL;
- }
LOG(("errno: %d", errno));
@@ -376,10 +478,14 @@ int iconv_close(iconv_t cd)
if (!e)
return 0;
- if (e->in)
+ if (e->in) {
encoding_delete(e->in);
- if (e->out)
+ free(e->in_save);
+ }
+ if (e->out) {
encoding_delete(e->out);
+ free(e->out_save);
+ }
iconv_eightbit_delete(e);
/* remove from list */
@@ -409,8 +515,10 @@ int character_callback(void *handle, UCS4 c)
/* Stop on invalid characters if we're not transliterating */
/** \todo is this sane? -- we can't distinguish between illegal input
* or valid input which just happens to correspond with U+fffd. */
- if (c == 0xFFFD && !e->transliterate)
+ if (c == 0xFFFD && !e->transliterate) {
+ e->write_state = WRITE_FAILED;
return 1;
+ }
LOG(("outbuf: %p, free: %zd", *e->outbuf, *e->outbytesleft));
LOG(("writing: %d", c));
@@ -440,6 +548,9 @@ int character_callback(void *handle, UCS4 c)
(int*)e->outbytesleft);
}
+ e->write_state = ret == -1 ? WRITE_FAILED
+ : ret == 0 ? WRITE_NOMEM : WRITE_SUCCESS;
+
if (ret == -1) {
/* Transliterate, if we've been asked to.
* Assumes that output is 8bit/8bit multibyte with ASCII G0.
@@ -448,9 +559,10 @@ int character_callback(void *handle, UCS4 c)
* Also, afaiaa, all supported multibyte encodings are ASCII
* compatible. */
/** \todo Actually perform some kind of transliteration */
- if (e->transliterate && (int)*e->outbytesleft > 0) {
- if (e->out) {
- /* Reset encoding write state */
+ if (e->transliterate) {
+ if ((int)*e->outbytesleft > 0) {
+ if (e->out) {
+ /* Flush through any pending shift sequences */
/** \todo this is a bit dodgy, as we only
* really need to ensure that the ASCII set
* is mapped into G0 in ISO2022 encodings.
@@ -459,23 +571,36 @@ int character_callback(void *handle, UCS4 c)
* perform some dirty hackery which relies
* upon knowledge of UnicodeLib's internals
*/
- encoding_write(e->out, NULL_UCS4, e->outbuf,
+ encoding_write(e->out, NULL_UCS4,
+ e->outbuf,
(int*)e->outbytesleft);
- }
+ }
- if ((int)*e->outbytesleft > 0) {
- *(*e->outbuf)++ = '?';
- --*e->outbytesleft;
+ if ((int)*e->outbytesleft > 0) {
+ *(*e->outbuf)++ = '?';
+ --*e->outbytesleft;
+
+ e->write_state = WRITE_SUCCESS;
- ret = 1;
+ ret = 1;
+ } else {
+ e->write_state = WRITE_NOMEM;
+ ret = 0;
+ }
} else {
+ e->write_state = WRITE_NOMEM;
ret = 0;
}
} else {
- ret = 1;
+ e->write_state = WRITE_FAILED;
+ ret = 0;
}
}
+ if (e->write_state == WRITE_SUCCESS &&
+ ++e->chars_processed == e->req_chars)
+ ret = 0;
+
return (!ret);
}
diff --git a/src/internal.h b/src/internal.h
index 929d906..ce415ca 100644
--- a/src/internal.h
+++ b/src/internal.h
@@ -16,13 +16,18 @@
struct encoding_context {
Encoding *in;
+ void *in_save;
unsigned int inflags;
Encoding *out;
+ void *out_save;
unsigned int outflags;
unsigned short *intab, *outtab;
char **outbuf;
size_t *outbytesleft;
char transliterate;
+ enum { WRITE_SUCCESS, WRITE_FAILED, WRITE_NOMEM } write_state;
+ int chars_processed;
+ int req_chars;
struct encoding_context *prev, *next;
};