summaryrefslogtreecommitdiff
diff options
context:
space:
mode:
author John-Mark Bell <jmb@netsurf-browser.org> 2013-01-13 02:05:46 (GMT)
committer John-Mark Bell <jmb@netsurf-browser.org> 2013-01-13 02:05:46 (GMT)
commit 9c8a4ff7e117ba052b2957c7e3f2e8751e8f8970 (patch)
tree ab5e21285a92db1f77e57741c1be20b1acdbd958
parent 23deb46db03c3e7a2884a49edcf882d933315e70 (diff)
download iconv-9c8a4ff7e117ba052b2957c7e3f2e8751e8f8970.tar.gz
iconv-9c8a4ff7e117ba052b2957c7e3f2e8751e8f8970.tar.bz2
Transliteration fixes:
* Clear any substitution if codec reset has been requested.
* Don't report memory exhaustion when failing to allocate space for the test conversion in translit_try_sequence: there's nothing the caller can do, so treat it as if the substitution cannot be converted to the target character set.
* Correctly report success if we run out of input immediately following a flush of a substitution.

Additional tests for transliteration.
-rw-r--r-- build/tools/gentranstab.pl 16
-rw-r--r-- src/iconv.c 27
-rw-r--r-- test/translit.c 53
3 files changed, 87 insertions, 9 deletions
diff --git a/build/tools/gentranstab.pl b/build/tools/gentranstab.pl
index 0e9205a..1b1ccad 100644
--- a/build/tools/gentranstab.pl
+++ b/build/tools/gentranstab.pl
@@ -48,12 +48,18 @@ static int translit_try_sequence(struct encoding_context *e,
size_t orig_tmplen, tmplen, index;
int ret = 1;
- /* First, determine if sequence can be written to target encoding */
+ /* Determine if sequence can be written to target encoding */
/* Worst case: conversion to UTF-8 (needing 6 bytes per character) */
orig_tmplen = tmplen = (seqlen + 1) * 6;
ptmpbuf = tmpbuf = malloc(tmplen);
- if (tmpbuf == NULL)
- return 0;
+ if (tmpbuf == NULL) {
+ /* Consider lack of memory an inability to write the output.
+ * We cannot report memory exhaustion from here, as it will
+ * result in the caller thinking that the output buffer is
+ * too small, which isn't actually the case. As
+ * transliteration is best-effort anyway, this should be ok. */
+ return -1;
+ }
/* Reset the transout codec */
if (e->transout != NULL) {
@@ -102,6 +108,8 @@ int translit_flush_replacement(struct encoding_context *e)
size_t substlen = e->substlen;
int ret = 1;
+ LOG(("Flushing %zd characters", substlen));
+
while (substlen > 0) {
UCS4 c = substitution[0];
@@ -118,6 +126,8 @@ int translit_flush_replacement(struct encoding_context *e)
e->substitution = substitution;
e->substlen = substlen;
+ LOG(("%zd characters remaining", substlen));
+
return ret;
}
diff --git a/src/iconv.c b/src/iconv.c
index c81a0b2..21bf665 100644
--- a/src/iconv.c
+++ b/src/iconv.c
@@ -292,6 +292,10 @@ size_t iconv(iconv_t cd, char **inbuf, size_t *inbytesleft, char **outbuf,
/* Clear skip */
e->skip = 0;
+ /* Reset transliteration state */
+ e->substitution = NULL;
+ e->substlen = 0;
+
/* Reset read codec */
if (e->in) {
encoding_reset(e->in);
@@ -342,13 +346,20 @@ size_t iconv(iconv_t cd, char **inbuf, size_t *inbytesleft, char **outbuf,
e->outbytesleft = outbytesleft;
/* Flush through any remaining transliteration */
- ret = translit_flush_replacement(e);
- if (ret <= 0) {
- errno = E2BIG;
- return (size_t)-1;
- }
+ if (e->substlen > 0) {
+ ret = translit_flush_replacement(e);
+ if (ret <= 0) {
+ errno = E2BIG;
+ return (size_t)-1;
+ }
- LOG(("reading"));
+ /* Force write state to success, so if there's no more input
+ * (i.e. we were transliterating the last character of input)
+ * we'll report success, rather than whatever caused us to
+ * stop writing the transliterated sequence last time round.
+ */
+ e->write_state = WRITE_SUCCESS;
+ }
/* If, on the previous attempt to convert data, we reached the end
* of the input buffer mid-sequence, then we retain the number of
@@ -359,12 +370,16 @@ size_t iconv(iconv_t cd, char **inbuf, size_t *inbytesleft, char **outbuf,
* start.
*/
if (e->skip != 0) {
+ LOG(("Skipping %d bytes of input", e->skip));
+
*inbuf += e->skip;
*inbytesleft -= e->skip;
e->skip = 0;
}
+ LOG(("Reading %zd bytes of input", *inbytesleft));
+
/* Perform the conversion.
*
* To ensure that we detect the correct error conditions
diff --git a/test/translit.c b/test/translit.c
index 8f17889..240f5e2 100644
--- a/test/translit.c
+++ b/test/translit.c
@@ -20,7 +20,15 @@ typedef struct translit_testcase {
} translit_testcase;
static const translit_testcase tests[] = {
+ /* Trivial */
{ "iso-8859-1//TRANSLIT", "\xe2\x80\x93", "-" },
+ /* Multi-character replacements */
+ { "iso-8859-2//TRANSLIT", "\xc2\xa9", "(c)" },
+ { "iso-8859-3//TRANSLIT", "\xc2\xab", "<<" },
+ /* Multiple choices */
+ { "iso-8859-4//TRANSLIT", "\xef\xac\x85", "st" },
+ /* Default fallback */
+ { "iso-8859-1//TRANSLIT", "\xef\xac\x87", "?" },
{ NULL, NULL, NULL }
};
@@ -53,6 +61,50 @@ static void run_tests(void)
}
}
+static void test_translit_buffer_boundary(void)
+{
+ iconv_t cd;
+ char out[128];
+ char *inp = (char *) "\xc2\xa9", *outp = out;
+ size_t inlen = strlen(inp), outlen;
+ size_t read;
+
+ cd = iconv_open("iso-8859-2//TRANSLIT", "utf-8");
+ assert(cd != (iconv_t) -1);
+
+ outlen = 1;
+ read = iconv(cd, &inp, &inlen, &outp, &outlen);
+ assert(read == (size_t) -1);
+ assert(errno == E2BIG);
+
+ /* Expect ( to appear in output */
+ assert(outlen == 0);
+ assert(out[0] == '(');
+
+ /* Try to write next output character */
+ outlen = 1;
+ read = iconv(cd, &inp, &inlen, &outp, &outlen);
+ assert(read == (size_t) -1);
+ assert(errno == E2BIG);
+
+ /* Expect "(c" in output */
+ assert(outlen == 0);
+ assert(out[0] == '(');
+ assert(out[1] == 'c');
+
+ /* Flush through last character */
+ outlen = 1;
+ read = iconv(cd, &inp, &inlen, &outp, &outlen);
+ assert(read == 0);
+
+ /* Expect "(c)" in output, and all input read */
+ assert(outlen == 0);
+ assert(memcmp(out, "(c)", 3) == 0);
+ assert(inlen == 0);
+
+ iconv_close(cd);
+}
+
int main(int argc, char **argv)
{
const char *ucpath;
@@ -84,6 +136,7 @@ int main(int argc, char **argv)
assert(iconv_initialise(aliases) == 1);
run_tests();
+ test_translit_buffer_boundary();
iconv_finalise();