From c17dc661eae89cea66959fad4fe48d77f1faf174 Mon Sep 17 00:00:00 2001 From: John Mark Bell Date: Sat, 16 Apr 2005 05:09:33 +0000 Subject: [project @ 2005-04-16 05:09:32 by jmb] Split out UTF-8 handling functions. Submit URL-encoded forms in sensible encoding: * First entry in accept-charset list, if present * Document encoding, otherwise We may want to explicitly look for UTF-8, to save converting. Convert cnv_str_local_enc/cnv_local_enc_str to use iconv (they're now veneers for utf8_[to/from]_enc). Provide mechanism for looking up local system charset (derived from system alphabet, under RISC OS) svn path=/import/netsurf/; revision=1647 --- utils/utils.c | 104 +++------------------------------------------------------- 1 file changed, 5 insertions(+), 99 deletions(-) (limited to 'utils/utils.c') diff --git a/utils/utils.c b/utils/utils.c index 35e3b1c7e..04f24efa5 100644 --- a/utils/utils.c +++ b/utils/utils.c @@ -22,6 +22,7 @@ #define NDEBUG #include "netsurf/utils/log.h" #include "netsurf/utils/messages.h" +#include "netsurf/utils/utf8.h" #include "netsurf/utils/utils.h" @@ -110,56 +111,10 @@ char *cnv_space2nbsp(const char *s) * \param s string in local machine encoding. NUL or length terminated (which comes first). * \param length maximum number of bytes to consider at s. * \return malloc()'ed NUL termined string in UTF-8 encoding. - * - * Based on RISCOS-LATIN1 code from libiconv. - * \todo: we should use libiconv to support more local encodings instead - * of only RISCOS-LATIN1. */ char *cnv_local_enc_str(const char *s, size_t length) { - size_t l_out, l_in; - const char *s_in; - char *d, *d_out; - static const unsigned int riscos1_2uni[32] = { - /* 0x80 */ - 0x221a, 0x0174, 0x0175, 0x0083, 0x2573, 0x0176, 0x0177, 0x0087, - 0x21e6, 0x21e8, 0x21e9, 0x21e7, 0x2026, 0x2122, 0x2030, 0x2022, - /* 0x90 */ - 0x2018, 0x2019, 0x2039, 0x203a, 0x201c, 0x201d, 0x201e, 0x2013, - 0x2014, 0x2212, 0x0152, 0x0153, 0x2020, 0x2021, 0xfb01, 0xfb02, - }; - - /* We're counting on the fact that all riscos1_2uni[] values are - * between 0x80 (incl) and 0x1000 (excl). - */ - for (s_in = s, l_in = length, l_out = 1; - *s_in != '\0' && l_in != 0; - ++s_in, --l_in) - l_out += (*s_in >= 0x80 && *s_in < 0xA0) ? ((riscos1_2uni[*s_in - 0x80] < 0x800) ? 2 : 3) : 1; - if ((d_out = (char *)malloc(l_out)) == NULL) - return NULL; - for (s_in = s, l_in = length, d = d_out; - *s_in != '\0' && l_in != 0; - ++s_in, --l_in) { - unsigned int uc = (*s_in >= 0x80 && *s_in < 0xA0) ? riscos1_2uni[*s_in - 0x80] : *s_in; - const int cnt = (uc < 0x80) ? 1 : (uc < 0x800) ? 2 : 3; - switch (cnt) { - case 3: - d[2] = 0x80 | (uc & 0x3F); - uc = (uc >> 6) | 0x800; - /* fall through */ - case 2: - d[1] = 0x80 | (uc & 0x3F); - uc = (uc >> 6) | 0xC0; - /* fall through */ - case 1: - d[0] = uc; - } - d += cnt; - } - *d = '\0'; - - return d_out; + return utf8_from_enc(s, local_encoding_name(), length); } @@ -169,7 +124,7 @@ char *cnv_local_enc_str(const char *s, size_t length) */ char *cnv_str_local_enc(const char *s) { -return cnv_strn_local_enc(s, strlen(s), NULL); + return cnv_strn_local_enc(s, 0); } @@ -177,61 +132,12 @@ return cnv_strn_local_enc(s, strlen(s), NULL); * Converts UTF-8 string of bytes to the machine local encoding. * Caller needs to free return value. * - * When back_map is non-NULL, a ptr to a ptrdiff_t array is filled in which - * needs to be free'd by the caller. The array contains per character - * in the return string, a ptrdiff in the UTF-8 encoded string. - * * \todo: we should use libiconv to support more local encodings instead * of only ISOLATIN1. */ -char *cnv_strn_local_enc(const char *s, int length, const ptrdiff_t **back_mapPP) +char *cnv_strn_local_enc(const char *s, int length) { - /* Buffer at d & back_mapP can be overdimentioned but is certainly - * big enough to carry the end result. - */ - char *d, *d0; - const char * const s0 = s; - ptrdiff_t *back_mapP = NULL; - - if (back_mapPP != NULL) { - back_mapP = calloc(length + 1, sizeof(ptrdiff_t)); - if (!back_mapP) - return NULL; - *back_mapPP = back_mapP; - } - - d = calloc(length + 1, sizeof(char)); - if (!d) - return NULL; - - d0 = d; - - while (length != 0) { - int u, chars; - - chars = length; - u = xmlGetUTF8Char(s, &chars); - if (chars <= 0) { - s += 1; - length -= 1; - continue; - } - if (back_mapP != NULL) - *back_mapP++ = s - s0; - s += chars; - length -= chars; - if (u == 0x09 || u == 0x0a || u == 0x0d || - (0x20 <= u && u <= 0x7f) || - (0xa0 <= u && u <= 0xff)) - *d++ = u; - else - *d++ = '?'; - } - if (back_mapP != NULL) - *back_mapP = s - s0; - *d = 0; - - return d0; + return utf8_to_enc(s, local_encoding_name(), length); } -- cgit v1.2.3