summaryrefslogtreecommitdiff
path: root/riscos/ucstables.c
diff options
context:
space:
mode:
authorJohn Mark Bell <jmb@netsurf-browser.org>2005-07-16 14:35:25 +0000
committerJohn Mark Bell <jmb@netsurf-browser.org>2005-07-16 14:35:25 +0000
commitf4ecaaed31db0aa5d71c05dd3f04dc2833ad29fe (patch)
tree362b09da27833d63b3ae7b8d9fd14a4b56e92243 /riscos/ucstables.c
parent81a39c30755e9bb61a10a0edb16fec8996100024 (diff)
downloadnetsurf-f4ecaaed31db0aa5d71c05dd3f04dc2833ad29fe.tar.gz
netsurf-f4ecaaed31db0aa5d71c05dd3f04dc2833ad29fe.tar.bz2
[project @ 2005-07-16 14:35:20 by jmb]
- Convert Messages files to UTF-8 encoding. - Replace local_encoding_name() with platform specific utf8_[to,from]_local_encoding() functions - this allows mapping of 8bit characters 0x80->0x9f (inclusive). - All text that is rendered by the RISC OS Wimp is now converted to the system local encoding prior to display. - Lose the horrendous hack that was messages_get_key() - Menu text is now translated to system local encoding on the fly (if necessary) rather than at menu creation time. This allows the system alphabet to change under us and our menus remain usable. - The Languages menu now lists all languages that are present in the LangNames file. In the case of selecting the UI language, those languages which are not available are shaded. svn path=/import/netsurf/; revision=1796
Diffstat (limited to 'riscos/ucstables.c')
-rw-r--r--riscos/ucstables.c281
1 files changed, 269 insertions, 12 deletions
diff --git a/riscos/ucstables.c b/riscos/ucstables.c
index b744e9c6a..ef103f367 100644
--- a/riscos/ucstables.c
+++ b/riscos/ucstables.c
@@ -6,12 +6,17 @@
*/
/** \file
- * UCS conversion tables
+ * UCS conversion tables and RISC OS-specific UTF-8 text handling
*/
+#include <assert.h>
+#include <limits.h>
+#include <string.h>
#include "oslib/osbyte.h"
#include "oslib/territory.h"
+
#include "netsurf/riscos/ucstables.h"
+#include "netsurf/utils/utf8.h"
#include "netsurf/utils/utils.h"
/* Common values (ASCII) */
@@ -334,15 +339,16 @@ int *ucstable_from_alphabet(int alphabet)
return ucstable;
}
+
static const char *localencodings[] = {
"ISO-8859-1", /* BFont - 100 - just use Latin1, instead */
- "ISO-8859-1", /* do we want to use Acorn Latin1, instead? */
+ "ISO-8859-1",
"ISO-8859-2",
"ISO-8859-3",
"ISO-8859-4",
"ISO-8859-5",
"ISO-8859-6",
- "ISO-8869-7",
+ "ISO-8859-7",
"ISO-8859-8",
"ISO-8859-9",
"ISO-IR-182",
@@ -354,21 +360,272 @@ static const char *localencodings[] = {
"CP866" /* Cyrillic2 - 120 */
};
+static const struct special {
+ char local; /**< Local 8bit representation */
+ char len; /**< Length (in bytes) of UTF-8 character */
+ const char *utf; /**< UTF-8 representation */
+} special_chars[] = {
+ { 0x80, 3, "\xE2\x82\xAC" }, /* EURO SIGN */
+ { 0x81, 2, "\xC5\xB4" }, /* LATIN CAPITAL LETTER W WITH CIRCUMFLEX */
+ { 0x82, 2, "\xC5\xB5" }, /* LATIN SMALL LETTER W WITH CIRCUMFLEX */
+ { 0x84, 3, "\xE2\x9C\x98" }, /* HEAVY BALLOT X */
+ { 0x85, 2, "\xC5\xB6" }, /* LATIN CAPITAL LETTER Y WITH CIRCUMFLEX */
+ { 0x86, 2, "\xC5\xB7" }, /* LATIN SMALL LETTER Y WITH CIRCUMFLEX */
+ { 0x88, 3, "\xE2\x87\x90" }, /* LEFTWARDS DOUBLE ARROW */
+ { 0x89, 3, "\xE2\x87\x92" }, /* RIGHTWARDS DOUBLE ARROW */
+ { 0x8a, 3, "\xE2\x87\x93" }, /* DOWNWARDS DOUBLE ARROW */
+ { 0x8b, 3, "\xE2\x87\x91" }, /* UPWARDS DOUBLE ARROW */
+ { 0x8c, 3, "\xE2\x80\xA6" }, /* HORIZONTAL ELLIPSIS */
+ { 0x8d, 3, "\xE2\x84\xA2" }, /* TRADE MARK SIGN */
+ { 0x8e, 3, "\xE2\x80\xB0" }, /* PER MILLE SIGN */
+ { 0x8f, 3, "\xE2\x80\xA2" }, /* BULLET */
+ { 0x90, 3, "\xE2\x80\x98" }, /* LEFT SINGLE QUOTATION MARK */
+ { 0x91, 3, "\xE2\x80\x99" }, /* RIGHT SINGLE QUOTATION MARK */
+ { 0x92, 3, "\xE2\x80\xB9" }, /* SINGLE LEFT-POINTING ANGLE QUOTATION MARK */
+ { 0x93, 3, "\xE2\x80\xBA" }, /* SINGLE RIGHT-POINTING ANGLE QUOTATION MARK */
+ { 0x94, 3, "\xE2\x80\x9C" }, /* LEFT DOUBLE QUOTATION MARK */
+ { 0x95, 3, "\xE2\x80\x9D" }, /* RIGHT DOUBLE QUOTATION MARK */
+ { 0x96, 3, "\xE2\x80\x9E" }, /* DOUBLE LOW-9 QUOTATION MARK */
+ { 0x97, 3, "\xE2\x80\x93" }, /* EN DASH */
+ { 0x98, 3, "\xE2\x80\x94" }, /* EM DASH */
+ { 0x99, 3, "\xE2\x88\x92" }, /* MINUS SIGN */
+ { 0x9a, 2, "\xC5\x92" }, /* LATIN CAPITAL LIGATURE OE */
+ { 0x9b, 2, "\xC5\x93" }, /* LATIN SMALL LIGATURE OE */
+ { 0x9c, 3, "\xE2\x80\xA0" }, /* DAGGER */
+ { 0x9d, 3, "\xE2\x80\xA1" }, /* DOUBLE DAGGER */
+ { 0x9e, 3, "\xEF\xAC\x81" }, /* LATIN SMALL LIGATURE FI */
+ { 0x9f, 3, "\xEF\xAC\x82" } /* LATIN SMALL LIGATURE FL */
+};
+
+
/**
- * Retrieve local encoding name, suitable for passing to iconv
+ * Convert a UTF-8 encoded string into the system local encoding
+ *
+ * \param string The string to convert
+ * \param len The length (in bytes) of the string, or 0
+ * \param result Pointer to location in which to store result
+ * \return The appropriate utf8_convert_ret value
*/
-const char *local_encoding_name(void)
+utf8_convert_ret utf8_to_local_encoding(const char *string, size_t len,
+ char **result)
{
os_error *error;
- int alphabet;
+ int alphabet, i, offset_count = 0;
+ struct {
+ const struct special *local; /* local character */
+ size_t offset; /* byte offset into string */
+ } offsets[CHAR_MAX];
+ size_t off;
+ char *temp;
+ const char *enc;
+ utf8_convert_ret err;
+
+ assert(string && result);
+
+ /* get length, if necessary */
+ if (len == 0)
+ len = strlen(string);
+ /* read system alphabet */
error = xosbyte1(osbyte_ALPHABET_NUMBER, 127, 0, &alphabet);
- if (!error) {
- if (alphabet < 116)
- return localencodings[alphabet - 100];
- else if (alphabet == 120)
- return localencodings[16];
+ if (error)
+ alphabet = territory_ALPHABET_LATIN1;
+
+ /* UTF-8 -> simply copy string */
+ if (alphabet == 111 /* UTF-8 */) {
+ *result = strndup(string, len);
+ return UTF8_CONVERT_OK;
+ }
+
+ /* get encoding name */
+ enc = (alphabet < 116 ? localencodings[alphabet - 100]
+ : (alphabet == 120 ? localencodings[16]
+ : localencodings[0]));
+
+ /* populate offsets array with details of characters that
+ * will be stripped by iconv */
+ for (off = 0; off < len; off = utf8_next(string, len, off)) {
+ if (string[off] != 0xE2 &&
+ string[off] != 0xC5 && string[off] != 0xEF)
+ continue;
+
+ for (i = 0; i != NOF_ELEMENTS(special_chars); i++) {
+ if (strncmp(string + off, special_chars[i].utf,
+ special_chars[i].len) == 0) {
+ /* ensure we don't overflow our buffer */
+ assert(offset_count < CHAR_MAX - 1);
+ offsets[offset_count].local =
+ &special_chars[i];
+ offsets[offset_count].offset = off;
+ offset_count++;
+ break;
+ }
+ }
+ }
+
+ if (offset_count == 0) {
+ /* No substitutions are required, so exit here */
+ return utf8_to_enc(string, enc, len, result);
+ }
+
+ /* create output buffer */
+ *(result) = malloc(len + 1);
+ if (!(*result))
+ return UTF8_CONVERT_NOMEM;
+ *(*result) = '\0';
+
+ /* convert the chunks between offsets, then copy stripped
+ * character into output string */
+ for (i = 0; i != offset_count; i++) {
+ off = (i > 0 ? offsets[i-1].offset + offsets[i-1].local->len
+ : 0);
+
+ err = utf8_to_enc(string + off, enc,
+ offsets[i].offset - off, &temp);
+ if (err != UTF8_CONVERT_OK) {
+ assert(err != UTF8_CONVERT_BADENC);
+ free(*result);
+ return UTF8_CONVERT_NOMEM;
+ }
+
+ strcat((*result), temp);
+ off = strlen(*result);
+ (*result)[off] = offsets[i].local->local;
+ (*result)[off+1] = '\0';
+
+ free(temp);
+ }
+
+ /* handle last chunk */
+ if (offsets[offset_count - 1].offset < len) {
+ off = offsets[offset_count - 1].offset +
+ offsets[offset_count - 1].local->len;
+
+ err = utf8_to_enc(string + off, enc, len - off, &temp);
+ if (err != UTF8_CONVERT_OK) {
+ assert(err != UTF8_CONVERT_BADENC);
+ free(*result);
+ return UTF8_CONVERT_NOMEM;
+ }
+
+ strcat((*result), temp);
+
+ free(temp);
+ }
+
+ return UTF8_CONVERT_OK;
+}
+
+/**
+ * Convert a string encoded in the system local encoding to UTF-8
+ *
+ * \param string The string to convert
+ * \param len The length (in bytes) of the string, or 0
+ * \param result Pointer to location in which to store result
+ * \return The appropriate utf8_convert_ret value
+ */
+utf8_convert_ret utf8_from_local_encoding(const char *string, size_t len,
+ char **result)
+{
+ os_error *error;
+ int alphabet, i, offset_count = 0;
+ struct {
+ const struct special *local; /* utf character */
+ size_t offset; /* byte offset into string */
+ } offsets[CHAR_MAX];
+ size_t off;
+ char *temp;
+ const char *enc;
+ utf8_convert_ret err;
+
+ assert(string && result);
+
+ /* get length, if necessary */
+ if (len == 0)
+ len = strlen(string);
+
+ /* read system alphabet */
+ error = xosbyte1(osbyte_ALPHABET_NUMBER, 127, 0, &alphabet);
+ if (error)
+ alphabet = territory_ALPHABET_LATIN1;
+
+ /* UTF-8 -> simply copy string */
+ if (alphabet == 111 /* UTF-8 */) {
+ *result = strndup(string, len);
+ return UTF8_CONVERT_OK;
+ }
+
+ /* get encoding name */
+ enc = (alphabet < 116 ? localencodings[alphabet - 100]
+ : (alphabet == 120 ? localencodings[16]
+ : localencodings[0]));
+
+ /* populate offsets array with details of characters that
+ * will be stripped by iconv */
+ for (off = 0; off < len; off++) {
+ if (string[off] < 0x80 || string[off] > 0x9f)
+ continue;
+
+ for (i = 0; i != NOF_ELEMENTS(special_chars); i++) {
+ if (string[off] == special_chars[i].local) {
+ /* ensure we don't overflow our buffer */
+ assert(offset_count < CHAR_MAX - 1);
+ offsets[offset_count].local =
+ &special_chars[i];
+ offsets[offset_count].offset = off;
+ offset_count++;
+ break;
+ }
+ }
+ }
+
+ if (offset_count == 0) {
+ /* No substitutions are required, so exit here */
+ return utf8_from_enc(string, enc, len, result);
+ }
+
+ /* create output buffer (oversized, but not by much) */
+ *(result) = malloc(len + (3 * offset_count) + 1);
+ if (!(*result))
+ return UTF8_CONVERT_NOMEM;
+ *(*result) = '\0';
+
+ /* convert the chunks between offsets, then copy stripped
+ * UTF-8 character into output string */
+ for (i = 0; i != offset_count; i++) {
+ off = (i > 0 ? offsets[i-1].offset + offsets[i-1].local->len
+ : 0);
+
+ err = utf8_from_enc(string + off, enc,
+ offsets[i].offset - off, &temp);
+ if (err != UTF8_CONVERT_OK) {
+ assert(err != UTF8_CONVERT_BADENC);
+ free(*result);
+ return UTF8_CONVERT_NOMEM;
+ }
+
+ strcat((*result), temp);
+ strcat((*result), offsets[i].local->utf);
+
+ free(temp);
+ }
+
+ /* handle last chunk */
+ if (offsets[offset_count - 1].offset < len) {
+ off = offsets[offset_count - 1].offset +
+ offsets[offset_count - 1].local->len;
+
+ err = utf8_from_enc(string + off, enc, len - off, &temp);
+ if (err != UTF8_CONVERT_OK) {
+ assert(err != UTF8_CONVERT_BADENC);
+ free(*result);
+ return UTF8_CONVERT_NOMEM;
+ }
+
+ strcat((*result), temp);
+
+ free(temp);
}
- return localencodings[0];
+ return UTF8_CONVERT_OK;
}