/* iconv implementation - see iconv.h for docs */ #include #include #include #include #include #include #include #include #include #include #include "internal.h" static struct encoding_context *context_list; static int character_callback(void *handle, UCS4 c); static void parse_parameters(struct encoding_context *e, const char *params, bool destination); static void parse_parameter(struct encoding_context *e, const char *param, int length, bool destination); int iconv_initialise(const char *aliases_file) { if (aliases_file == NULL) return false; if (create_alias_data(aliases_file) == false) return false; encoding_initialise(); return true; } void iconv_finalise(void) { struct encoding_context *a, *b; /* clients may quit / die without cleaning up. */ for (a = context_list; a; a = b) { b = a->next; if (a->in) encoding_delete(a->in); if (a->out) encoding_delete(a->out); iconv_eightbit_delete(a); free(a); } free_alias_data(); /* finalise the unicode library */ encoding_tidyup(); } iconv_t iconv_open(const char *tocode, const char *fromcode) { int to = 0, from = 0; struct encoding_context *e; struct canon *c; bool to_force_le = false, from_force_le = false; bool to_no_bom = false, from_no_bom = false; char totemp[128], fromtemp[128]; const char *slash; unsigned int len; eightbit_ret error; /* can't do anything without these */ if (!tocode || !fromcode) { errno = EINVAL; return (iconv_t)(-1); } e = calloc(1, sizeof(*e)); if (!e) { LOG(("malloc failed")); errno = ENOMEM; return (iconv_t)(-1); } /* strip any parameters off the end of the tocode string */ slash = strchr(tocode, '/'); len = slash ? (unsigned) (slash - tocode) : strlen(tocode); snprintf(totemp, sizeof totemp, "%.*s", len, tocode); /* parse parameters */ if (slash && *(slash + 1) == '/' && *(slash + 2) != '\0') parse_parameters(e, slash + 2, true); /* strip any parameters off the end of the fromcode string */ slash = strchr(fromcode, '/'); len = slash ? (unsigned) (slash - fromcode) : strlen(fromcode); snprintf(fromtemp, sizeof fromtemp, "%.*s", len, fromcode); /* parse parameters */ if (slash && *(slash + 1) == '/' && *(slash + 2) != '\0') parse_parameters(e, slash + 2, false); /* try our own 8bit charset code first */ to = iconv_eightbit_number_from_name(totemp); from = iconv_eightbit_number_from_name(fromtemp); /* if that failed, try the UnicodeLib functionality */ if (!to) to = iconv_encoding_number_from_name(totemp); if (!from) from = iconv_encoding_number_from_name(fromtemp); /* if that failed, perhaps it was an endian-specific variant of * something UnicodeLib can handle? */ if (!to) { c = alias_canonicalise(totemp); if (c) { switch(c->mib_enum) { case 1013: /* UTF-16BE */ to = csUnicode11; to_no_bom = true; break; case 1014: /* UTF-16LE */ to = csUnicode11; to_force_le = true; to_no_bom = true; break; case 1018: /* UTF-32BE */ to = csUCS4; to_no_bom = true; break; case 1019: /* UTF-32LE */ to = csUCS4; to_force_le = true; to_no_bom = true; break; } } else if (strcasecmp(totemp, "UCS-4-INTERNAL") == 0) { uint32_t test = 0x55aa55aa; to = csUCS4; to_force_le = ((const uint8_t *) &test)[0] == 0xaa; to_no_bom = true; } } if (!from) { c = alias_canonicalise(fromtemp); if (c) { switch(c->mib_enum) { case 1013: /* UTF-16BE */ from = csUnicode11; from_no_bom = true; break; case 1014: /* UTF-16LE */ from = csUnicode11; from_force_le = true; from_no_bom = true; break; case 1018: /* UTF-32BE */ from = csUCS4; from_no_bom = true; break; case 1019: /* UTF-32LE */ from = csUCS4; from_force_le = true; from_no_bom = true; break; } } else if (strcasecmp(fromtemp, "UCS-4-INTERNAL") == 0) { uint32_t test = 0x55aa55aa; from = csUCS4; from_force_le = ((const uint8_t *) &test)[0] == 0xaa; from_no_bom = true; } } LOG(("to: %d(%s) from: %d(%s)", to, totemp, from, fromtemp)); /* ensure both encodings are recognised */ if (to == 0 || from == 0) { free(e); errno = EINVAL; return (iconv_t)(-1); } /* bit 30 set indicates that this is an 8bit encoding */ if (from & (1<<30)) { error = iconv_eightbit_new(from & ~(1<<30), &e->intab); if (error != EIGHTBIT_OK) { free(e); errno = (error == EIGHTBIT_NOMEM) ? ENOMEM : EINVAL; return (iconv_t)(-1); } } else { e->in = encoding_new(from, encoding_READ); if (e->in == NULL) { free(e); errno = ENOMEM; /* Assume memory exhaustion */ return (iconv_t)(-1); } /* Set encoding flags */ unsigned int flags = 0; if (from_force_le) flags |= encoding_FLAG_LITTLE_ENDIAN; if (from == csUnicode || from_no_bom) flags |= encoding_FLAG_NO_HEADER; encoding_set_flags(e->in, flags, flags); e->inflags = flags; } if (to & (1<<30)) { error = iconv_eightbit_new(to & ~(1<<30), &e->outtab); if (error != EIGHTBIT_OK) { if (e->in) encoding_delete(e->in); iconv_eightbit_delete(e); free(e); errno = (error == EIGHTBIT_NOMEM) ? ENOMEM : EINVAL; return (iconv_t)(-1); } } else { e->out = encoding_new(to, encoding_WRITE_STRICT); if (e->out == NULL) { if (e->in) encoding_delete(e->in); iconv_eightbit_delete(e); free(e); errno = ENOMEM; /* Assume memory exhaustion */ return (iconv_t)(-1); } e->transout = encoding_new(to, encoding_WRITE_STRICT); if (e->transout == NULL) { if (e->out) encoding_delete(e->out); if (e->in) encoding_delete(e->in); iconv_eightbit_delete(e); free(e); errno = ENOMEM; /* Assume memory exhaustion */ return (iconv_t)(-1); } /* Set encoding flags */ unsigned int flags = 0; if (to_force_le) flags |= encoding_FLAG_LITTLE_ENDIAN; if (to == csUnicode || to_no_bom) flags |= encoding_FLAG_NO_HEADER; encoding_set_flags(e->out, flags, flags); encoding_set_flags(e->transout, flags, flags); e->outflags = flags; } /* add to list */ e->prev = 0; e->next = context_list; if (context_list) context_list->prev = e; context_list = e; return (iconv_t)e; } size_t iconv(iconv_t cd, char **inbuf, size_t *inbytesleft, char **outbuf, size_t *outbytesleft) { struct encoding_context *e; unsigned int read = 0; int ret; /* search for cd in list */ for (e = context_list; e; e = e->next) if (e == (struct encoding_context *)cd) break; /* not found => invalid */ if (!e) { errno = EINVAL; return (size_t)(-1); } if (inbuf == NULL || *inbuf == NULL) { /* Clear skip */ e->skip = 0; /* Reset transliteration state */ e->substitution = NULL; e->substlen = 0; /* Reset read codec */ if (e->in) { encoding_reset(e->in); encoding_set_flags(e->in, e->inflags, e->inflags); } /* Reset write codec, flushing shift sequences, if asked */ if (e->out) { if (outbuf != NULL) { char *prev_outbuf = *outbuf; size_t prev_outbytesleft = *outbytesleft; ret = encoding_write(e->out, NULL_UCS4, outbuf, (int*) outbytesleft); LOG(("ret: %d", ret)); /* Why the need for this nonsense? UnicodeLib * appears to decrease the count of free space * in the buffer even if it doesn't write into * it. This is a bug, as the documentation says * that the buffer pointer AND free space count * are left unmodified if nothing is written. * Therefore, we have this hack until * UnicodeLib gets fixed. */ if (ret == -1) { *outbytesleft = prev_outbytesleft - (*outbuf - prev_outbuf); errno = EINVAL; return (size_t)-1; } } encoding_reset(e->out); encoding_set_flags(e->out, e->outflags, e->outflags); } return 0; } /* Is there any point doing anything? */ if (!outbuf || !(*outbuf) || !outbytesleft) { errno = EINVAL; return (size_t)(-1); } e->outbuf = outbuf; e->outbytesleft = outbytesleft; /* Flush through any remaining transliteration */ if (e->substlen > 0) { ret = translit_flush_replacement(e); if (ret <= 0) { errno = E2BIG; return (size_t)-1; } /* Force write state to success, so if there's no more input * (i.e. we were transliterating the last character of input) * we'll report success, rather than whatever caused us to * stop writing the transliterated sequence last time round. */ e->write_state = WRITE_SUCCESS; } /* If, on the previous attempt to convert data, we reached the end * of the input buffer mid-sequence, then we retain the number of * bytes into the sequence we have read so far. We need to skip over * these bytes in the input now because UnicodeLib expects the next * byte to be the next in the sequence rather than the iconv() * semantics of replaying the entire incomplete sequence from the * start. */ if (e->skip != 0) { LOG(("Skipping %d bytes of input", e->skip)); *inbuf += e->skip; *inbytesleft -= e->skip; e->skip = 0; } LOG(("Reading %zd bytes of input", *inbytesleft)); /* Perform the conversion. * * To ensure that we detect the correct error conditions * and point to the _start_ of erroneous input on error, we * have to convert each character independently. Then we * inspect for errors and only continue if there were none. */ while (*inbytesleft > 0) { /* Clear current write state */ e->write_state = WRITE_NONE; if (e->in) { read = encoding_read(e->in, character_callback, *inbuf, *inbytesleft, e); /* A trailing complete nullable sequence should * result in success */ if (e->write_state == WRITE_NONE && !encoding_read_in_multibyte_sequence(e->in)) e->write_state = WRITE_SUCCESS; } else { read = iconv_eightbit_read(e, character_callback, *inbuf, *inbytesleft, e); } /* Stop on error */ if (e->write_state != WRITE_SUCCESS) break; /* Advance input */ *inbuf += read; *inbytesleft -= read; } LOG(("done")); LOG(("read: %d, ibl: %zd, obl: %zd", read, *inbytesleft, *outbytesleft)); /* Determine correct return value/error code */ switch (e->write_state) { case WRITE_SUCCESS: /* 2 */ /** \todo We really should calculate the correct number of * irreversible conversions that have been performed. For now, * assume everything's reversible. */ return 0; case WRITE_NONE: /* 3 */ /* Mark where we need to start in the next call */ e->skip = read; errno = EINVAL; break; case WRITE_NOMEM: /* 4 */ if (e->substlen > 0) { /* Buffer full while transliterating: skip input */ e->skip = read; } errno = E2BIG; break; case WRITE_FAILED: /* 1 */ errno = EILSEQ; break; } LOG(("errno: %d", errno)); return (size_t)(-1); } int iconv_close(iconv_t cd) { struct encoding_context *e; /* search for cd in list */ for (e = context_list; e; e = e->next) if (e == (struct encoding_context *)cd) break; /* not found => invalid */ if (!e) return 0; if (e->in) encoding_delete(e->in); if (e->out) encoding_delete(e->out); if (e->transout) encoding_delete(e->transout); iconv_eightbit_delete(e); /* remove from list */ if (e->next) e->next->prev = e->prev; if (e->prev) e->prev->next = e->next; else context_list = e->next; free(e); /* reduce our memory usage somewhat */ encoding_table_remove_unused(8 /* recommended value */); return 0; } /* this is called for each converted character */ int character_callback(void *handle, UCS4 c) { struct encoding_context *e; int ret; e = (struct encoding_context*)handle; if (c == 0xFFFE) c = 0xFFFD; /* Stop on invalid characters if we're not transliterating */ /** \todo is this sane? -- we can't distinguish between illegal input * or valid input which just happens to correspond with U+fffd. */ if ((c == 0xFFFD && !e->transliterate) || c == 0xFFFF) { e->write_state = WRITE_FAILED; return 1; } LOG(("outbuf: %p, free: %zd", *e->outbuf, *e->outbytesleft)); LOG(("writing: %d", c)); if (e->out) { char *prev_outbuf = *e->outbuf; size_t prev_outbytesleft = *e->outbytesleft; ret = encoding_write(e->out, c, e->outbuf, (int*)e->outbytesleft); LOG(("ret: %d", ret)); /* Why the need for this nonsense? UnicodeLib appears to * decrease the count of free space in the buffer even * if it doesn't write into it. This is a bug, as the * documentation says that the buffer pointer AND free * space count are left unmodified if nothing is written. * Therefore, we have this hack until UnicodeLib gets fixed. * * We need to restore the space remaining in the output buffer * on memory exhaustion, too. Otherwise, it will be negative, * which will break the client. */ if (ret <= 0) { *e->outbytesleft = prev_outbytesleft - (*e->outbuf - prev_outbuf); } } else { ret = iconv_eightbit_write(e, c, e->outbuf, (int*)e->outbytesleft); } if (ret == -1 && e->transliterate) { /* Transliterate, if we've been asked to. */ ret = translit_substitute(e, c); } if (ret == -1) { e->write_state = WRITE_FAILED; } else if (ret == 0) { e->write_state = WRITE_NOMEM; } else { e->write_state = WRITE_SUCCESS; } /* Always stop after processing each character */ return 1; } void parse_parameters(struct encoding_context *e, const char *params, bool destination) { char *slash = NULL, *prev = NULL; int len; len = strlen(params); while (slash - params < len && (slash = strchr(params, '/')) != NULL) { parse_parameter(e, prev == NULL ? params : prev, slash - (prev == NULL ? params : prev), destination); prev = slash + 2; slash += 2; } if (slash == NULL) parse_parameter(e, prev == NULL ? params : prev, (params + len) - (prev == NULL ? params : prev), destination); } void parse_parameter(struct encoding_context *e, const char *param, int length, bool destination) { if (length == 8 && strncasecmp(param, "TRANSLIT", 8) == 0) { if (destination) e->transliterate = 1; } }