summaryrefslogtreecommitdiff
diff options
context:
space:
mode:
authorJohn-Mark Bell <jmb@netsurf-browser.org>2013-01-11 03:33:47 (GMT)
committer John-Mark Bell <jmb@netsurf-browser.org>2013-01-11 11:20:19 (GMT)
commit23deb46db03c3e7a2884a49edcf882d933315e70 (patch)
treec8d1c3bb87a616b4ce33a5a66ce26e84816f55f3
parentefe52d57b2e0d1cb15ce3ccea5dd7d5e0359dae4 (diff)
downloadiconv-23deb46db03c3e7a2884a49edcf882d933315e70.tar.gz
iconv-23deb46db03c3e7a2884a49edcf882d933315e70.tar.bz2
Add proper transliteration support.
-rw-r--r--build/tools/gentranstab.pl344
-rw-r--r--doc/ChangeLog1
-rw-r--r--src/Makefile6
-rw-r--r--src/iconv.c77
-rw-r--r--src/internal.h7
-rw-r--r--src/transtab1689
-rw-r--r--test/INDEX1
-rw-r--r--test/Makefile2
-rw-r--r--test/translit.c94
9 files changed, 2179 insertions, 42 deletions
diff --git a/build/tools/gentranstab.pl b/build/tools/gentranstab.pl
new file mode 100644
index 0000000..0e9205a
--- a/dev/null
+++ b/build/tools/gentranstab.pl
@@ -0,0 +1,344 @@
+#!/usr/bin/perl
+
+use warnings;
+use strict;
+
+# Usage: gentranstab.pl <path to transtab>
+
+# Exactly one argument is required: the path to the transtab file.
+usage() if (@ARGV != 1);
+
+my $transtab = shift @ARGV;
+
+# Use three-argument open so the path is always treated as a plain file
+# name and never as part of the open mode (e.g. "-", pipes, or
+# leading/trailing whitespace in the name).
+open(TRANSTAB, '<', $transtab) or die "Failed opening $transtab: $!\n";
+
+print <<EOF;
+/* This file is autogenerated. Manual changes will be lost */
+
+#include <assert.h>
+#include <inttypes.h>
+#include <stdbool.h>
+
+#include "internal.h"
+
+/* Write a single character to an output buffer.
+ *
+ * The character is encoded with e->transout when use_transout is true,
+ * and with e->out otherwise. If the selected codec is NULL the write is
+ * delegated to iconv_eightbit_write() instead.
+ *
+ * buffer and buflen are handed straight to the underlying writer, whose
+ * return value is passed back to the caller unchanged.
+ */
+static int translit_write_character(struct encoding_context *e,
+		UCS4 c, char **buffer, size_t *buflen, bool use_transout)
+{
+	Encoding *codec = e->out;
+	char *start;
+	size_t avail;
+	int result;
+
+	if (use_transout)
+		codec = e->transout;
+
+	if (codec == NULL)
+		return iconv_eightbit_write(e, c, buffer, (int *) buflen);
+
+	start = *buffer;
+	avail = *buflen;
+
+	result = encoding_write(codec, c, buffer, (int *) buflen);
+	if (result <= 0) {
+		/* Recompute the space left from how far the buffer
+		 * pointer actually moved */
+		*buflen = avail - (*buffer - start);
+	}
+
+	return result;
+}
+
+/* Attempt to emit a replacement sequence for an unwritable character.
+ *
+ * The sequence is first written, via e->transout, into a scratch
+ * buffer to find out whether every character in it can be written.
+ * Only if that succeeds is the sequence recorded in the context
+ * (e->substitution / e->substlen) and flushed to the real output.
+ *
+ * Returns 0 if the scratch buffer could not be allocated, -1 if some
+ * character of the sequence could not be written (or the scratch
+ * buffer could not be grown), and otherwise the result of
+ * translit_flush_replacement().
+ */
+static int translit_try_sequence(struct encoding_context *e,
+		const size_t seqlen, const UCS2 *replacement)
+{
+	char *tmpbuf, *ptmpbuf;
+	size_t orig_tmplen, tmplen, index;
+	int ret = 1;
+
+	/* First, determine if sequence can be written to target encoding */
+	/* Worst case: conversion to UTF-8 (needing 6 bytes per character) */
+	orig_tmplen = tmplen = (seqlen + 1) * 6;
+	ptmpbuf = tmpbuf = malloc(tmplen);
+	if (tmpbuf == NULL)
+		return 0;
+
+	/* Reset the transout codec */
+	if (e->transout != NULL) {
+		encoding_reset(e->transout);
+		encoding_set_flags(e->transout, e->outflags, e->outflags);
+	}
+
+	for (index = 0; index < seqlen; index++) {
+		UCS4 c = replacement[index];
+		do {
+			ret = translit_write_character(e, c, &ptmpbuf,
+					&tmplen, true);
+			if (ret == 0) {
+				/* Scratch buffer is full: double it. The
+				 * write offset must be captured before
+				 * calling realloc(), as the old pointer
+				 * value may not be used once the block
+				 * has been moved. */
+				size_t used = ptmpbuf - tmpbuf;
+				char *tmp = realloc(tmpbuf, orig_tmplen * 2);
+				if (tmp == NULL)
+					break;
+
+				tmpbuf = tmp;
+				ptmpbuf = tmpbuf + used;
+				tmplen += orig_tmplen;
+				orig_tmplen *= 2;
+			}
+		} while (ret == 0);
+
+		if (ret <= 0)
+			break;
+	}
+
+	free(tmpbuf);
+
+	if (ret <= 0) {
+		/* Consider lack of memory an inability to write the output */
+		return -1;
+	}
+
+	e->substitution = replacement;
+	e->substlen = seqlen;
+
+	/* Emit replacement for real */
+	return translit_flush_replacement(e);
+}
+
+/* Write out as much of the pending replacement sequence as possible.
+ *
+ * Characters are taken from e->substitution and written through the
+ * regular output path (e->outbuf / e->outbytesleft). The context is
+ * updated to describe whatever part of the sequence is still pending.
+ *
+ * Returns 1 if nothing was pending or everything was written, or the
+ * non-positive result of the write that stopped the flush.
+ */
+int translit_flush_replacement(struct encoding_context *e)
+{
+	const UCS2 *cursor = e->substitution;
+	size_t remaining;
+	int status = 1;
+
+	for (remaining = e->substlen; remaining > 0; remaining--, cursor++) {
+		UCS4 c = cursor[0];
+
+		status = translit_write_character(e, c,
+				e->outbuf, e->outbytesleft, false);
+		assert(status != -1);
+		if (status <= 0) {
+			/* Leave this character pending for the next flush */
+			break;
+		}
+	}
+
+	e->substitution = cursor;
+	e->substlen = remaining;
+
+	return status;
+}
+
+EOF
+
+# Map from codepoint -> ttvals ref
+# ttvals is a list of chars ref
+my %transmap = ();
+# Length, in characters, of longest substitution string seen so far
+my $maxsubst = 0;
+# Total number of substitution strings encountered
+my $numsubsts = 0;
+# Map from substitution string -> start index in charbin
+my %substs = ();
+# Accumulated list of substitution character sequences
+my @charbin = ();
+
+# Read in transtab data
+while (my $line = <TRANSTAB>) {
+	# Skip comments and blank lines
+	next if ($line =~ /^%/);
+	next if ($line =~ /^\s*$/);
+
+	# Format: <codepoint> <data>
+	my ($codepoint, $data) = split(' ', $line);
+
+	# Strip '<U' from start, and '>' from end of input codepoint
+	$codepoint =~ s/^<U([^>]+)>/$1/;
+
+	# Data is a list of semi-colon-separated substitutions
+	my @substitutions = split(';', $data);
+
+	my @ttvals = ();
+
+	foreach my $sub (@substitutions) {
+		# Strip quotes around substitution sequence
+		$sub =~ s/"([^"]*)"/$1/;
+
+		$numsubsts++;
+
+		if ($sub eq "") {
+			# Special-case empty substitutions
+			my @empty = ();
+			push(@ttvals, \@empty);
+			next;
+		}
+
+		# Split characters in sequence
+		my @chars = split('<', $sub);
+		shift @chars;
+		my $num_chars = scalar(@chars);
+
+		# Strip leading 'U' and trailing '>'
+		s/U([^>]+)>/$1/ for @chars;
+
+		$maxsubst = $num_chars if ($num_chars > $maxsubst);
+
+		# Stringify chars to produce hash key
+		my $hkey = "@chars";
+
+		# Find/insert in bin, if new substitution
+		if (!defined($substs{$hkey})) {
+			my $pos = find_in_bin(\@chars, $num_chars);
+
+			$substs{$hkey} = $pos;
+		}
+
+		# Append to list of substitutions for codepoint
+		push(@ttvals, \@chars);
+	}
+
+	# A repeated codepoint replaces its earlier definition. The
+	# superseded substitutions are never emitted, so they must not be
+	# counted either: $numsubsts sizes the generated table and is the
+	# element count given to bsearch().
+	if (exists($transmap{$codepoint})) {
+		$numsubsts -= scalar(@{$transmap{$codepoint}});
+	}
+
+	# Insert into transmap
+	$transmap{$codepoint} = \@ttvals;
+}
+
+close TRANSTAB;
+
+# Ensure transtab is representable in struct translit_entry:
+# offsets into the charbin are stored in 13 bits, and substitution
+# lengths in 3 bits (so the longest permitted substitution is 7).
+die "Charbin length exceeds 2^13!" if $#charbin >= 2**13;
+die "Maxsubst exceeds 7!" if $maxsubst >= 2**3;
+
+print <<EOF;
+struct translit_entry {
+ uint32_t codepoint : 16,
+ offset : 13,
+ length : 3;
+};
+
+EOF
+
+# Emit substitution data
+my $cblen = @charbin;
+print "static const UCS2 substdata[$cblen] = {\n";
+foreach my $c (@charbin) {
+	print "\t0x$c,\n";
+}
+print "};\n\n";
+
+# Emit transliteration LUT
+my $ttlen = $numsubsts + 1; # + 1 for sentinel
+print "static const struct translit_entry transtab[$ttlen] = {\n";
+# The generated table is searched with bsearch() using a numeric
+# comparison of codepoints, so it must be emitted in numeric order.
+# A plain string sort only agrees with that when every key has the
+# same length and letter case.
+foreach my $codepoint (sort { hex($a) <=> hex($b) } keys %transmap) {
+	my $ttvals = $transmap{$codepoint};
+
+	for my $subst (@$ttvals) {
+		my $hkey = "@$subst";
+
+		if ($hkey ne "") {
+			my $slen = @$subst;
+			print "\t{ 0x$codepoint, $substs{$hkey}, $slen },\n";
+		} else {
+			print "\t{ 0x$codepoint, 0, 0 },\n";
+		}
+	}
+}
+# Place sentinel at the end
+print "\t{ 0, 0, 0 }\n";
+print "};\n\n";
+
+print <<EOF;
+/* bsearch() comparator: orders translit_entry records by codepoint. */
+static int translit_tab_cmp(const void *a, const void *b)
+{
+	const struct translit_entry *lhs = a;
+	const struct translit_entry *rhs = b;
+	int left = (int) lhs->codepoint;
+	int right = (int) rhs->codepoint;
+
+	return left - right;
+}
+
+/* Emit a transliteration of c.
+ *
+ * Each substitution listed for c in the table is tried in turn; the
+ * result of the first attempt that does not return -1 is returned.
+ * If there is no table entry for c, or every attempt returned -1, a
+ * single question mark is tried instead and its result is returned.
+ */
+int translit_substitute(struct encoding_context *e, UCS4 c)
+{
+	static const UCS2 default_subst[1] = { '?' };
+	const struct translit_entry *entry = NULL;
+
+	/* Table codepoints are 16 bits wide: nothing larger is looked up */
+	if (c <= 0xFFFF) {
+		struct translit_entry key = { c, 0, 0 };
+
+		entry = bsearch(&key, transtab, $numsubsts,
+				sizeof(struct translit_entry),
+				translit_tab_cmp);
+	}
+
+	if (entry != NULL) {
+		/* The search may land on any entry for c: rewind to the
+		 * first one */
+		while (entry > transtab && entry[-1].codepoint == c)
+			entry--;
+
+		/* Try substitutions in turn, until we run out */
+		for (; entry->codepoint == c; entry++) {
+			int ret = translit_try_sequence(e, entry->length,
+					substdata + entry->offset);
+			if (ret >= 0)
+				return ret;
+		}
+	}
+
+	/* Last-ditch replacement: must succeed */
+	return translit_try_sequence(e, 1, default_subst);
+}
+EOF
+
+# Search bin for existing sequence, or append if not found.
+#
+# The intent here is to minimise duplication of substitution
+# sequences. This implementation is decidedly trivial, and
+# makes no attempt to discover the optimal insertion order.
+#
+# Inspection of the output indicates that we use approximately
+# 5.5 bytes of storage for each substitution sequence
+# encountered (4 of these are the translit_entry, so there
+# doesn't seem much point in trying to optimise the layout of
+# the charbin any further.)
+sub find_in_bin
+{
+	# Arguments: reference to the character sequence, and its length
+	my ($needle, $needle_len) = @_;
+	my $bin_len = scalar(@charbin);
+	# Last offset at which the sequence could still fit in the bin
+	my $last_start = $bin_len - $needle_len;
+
+	# Slide a window over the bin looking for an existing copy
+	for (my $start = 0; $start <= $last_start; $start++) {
+		my @window = @charbin[$start .. $start + $needle_len - 1];
+
+		return $start if aeq(\@window, $needle);
+	}
+
+	# Not present: append, and report where the new copy begins
+	push(@charbin, @$needle);
+	return $bin_len;
+}
+
+# Compare two arrays for equality
+sub aeq
+{
+	# Arguments: references to the two arrays being compared
+	my ($left, $right) = @_;
+
+	# Arrays of differing length can never match
+	return 0 if scalar(@$left) != scalar(@$right);
+
+	# Compare element-wise, as strings
+	foreach my $i (0 .. $#{$left}) {
+		return 0 if $left->[$i] ne $right->[$i];
+	}
+
+	return 1;
+}
+
+sub usage
+{
+	# Report the expected invocation, then exit with a failure status
+	print STDERR "Usage: gentranstab.pl <path to transtab>\n";
+	exit 1;
+}
diff --git a/doc/ChangeLog b/doc/ChangeLog
index b055aa9..26d30a7 100644
--- a/doc/ChangeLog
+++ b/doc/ChangeLog
@@ -110,3 +110,4 @@ Iconv Changelog
- Correct handling of trailing valid shift sequences. Previously would
erroneously report EINVAL, instead of silently accepting them.
+ - Add proper transliteration behaviour when requested using //TRANSLIT.
diff --git a/src/Makefile b/src/Makefile
index 2a7d350..e520c6f 100644
--- a/src/Makefile
+++ b/src/Makefile
@@ -1,4 +1,10 @@
# Sources
DIR_SOURCES := alias.c aliases.c eightbit.c iconv.c utils.c
+SOURCES := $(SOURCES) $(BUILDDIR)/src_translit.c
+
+$(BUILDDIR)/src_translit.c: src/transtab build/tools/gentranstab.pl
+ $(VQ)$(ECHO) "TRANSTAB: $<"
+ $(Q)$(PERL) build/tools/gentranstab.pl $< >$@
+
include $(NSBUILD)/Makefile.subdir
diff --git a/src/iconv.c b/src/iconv.c
index db47cbc..c81a0b2 100644
--- a/src/iconv.c
+++ b/src/iconv.c
@@ -234,6 +234,18 @@ iconv_t iconv_open(const char *tocode, const char *fromcode)
return (iconv_t)(-1);
}
+ e->transout = encoding_new(to, encoding_WRITE_STRICT);
+ if (e->transout == NULL) {
+ if (e->out)
+ encoding_delete(e->out);
+ if (e->in)
+ encoding_delete(e->in);
+ iconv_eightbit_delete(e);
+ free(e);
+ errno = ENOMEM; /* Assume memory exhaustion */
+ return (iconv_t)(-1);
+ }
+
/* Set encoding flags */
unsigned int flags = 0;
if (to_force_le)
@@ -243,6 +255,7 @@ iconv_t iconv_open(const char *tocode, const char *fromcode)
flags |= encoding_FLAG_NO_HEADER;
encoding_set_flags(e->out, flags, flags);
+ encoding_set_flags(e->transout, flags, flags);
e->outflags = flags;
}
@@ -262,6 +275,7 @@ size_t iconv(iconv_t cd, char **inbuf, size_t *inbytesleft, char **outbuf,
{
struct encoding_context *e;
unsigned int read = 0;
+ int ret;
/* search for cd in list */
for (e = context_list; e; e = e->next)
@@ -289,7 +303,6 @@ size_t iconv(iconv_t cd, char **inbuf, size_t *inbytesleft, char **outbuf,
if (outbuf != NULL) {
char *prev_outbuf = *outbuf;
size_t prev_outbytesleft = *outbytesleft;
- int ret;
ret = encoding_write(e->out, NULL_UCS4,
outbuf, (int*) outbytesleft);
@@ -328,6 +341,13 @@ size_t iconv(iconv_t cd, char **inbuf, size_t *inbytesleft, char **outbuf,
e->outbuf = outbuf;
e->outbytesleft = outbytesleft;
+ /* Flush through any remaining transliteration */
+ ret = translit_flush_replacement(e);
+ if (ret <= 0) {
+ errno = E2BIG;
+ return (size_t)-1;
+ }
+
LOG(("reading"));
/* If, on the previous attempt to convert data, we reached the end
@@ -397,6 +417,10 @@ size_t iconv(iconv_t cd, char **inbuf, size_t *inbytesleft, char **outbuf,
errno = EINVAL;
break;
case WRITE_NOMEM: /* 4 */
+ if (e->substlen > 0) {
+ /* Buffer full while transliterating: skip input */
+ e->skip = read;
+ }
errno = E2BIG;
break;
case WRITE_FAILED: /* 1 */
@@ -426,6 +450,8 @@ int iconv_close(iconv_t cd)
encoding_delete(e->in);
if (e->out)
encoding_delete(e->out);
+ if (e->transout)
+ encoding_delete(e->transout);
iconv_eightbit_delete(e);
/* remove from list */
@@ -495,48 +521,17 @@ int character_callback(void *handle, UCS4 c)
(int*)e->outbytesleft);
}
- e->write_state = ret == -1 ? WRITE_FAILED
- : ret == 0 ? WRITE_NOMEM : WRITE_SUCCESS;
+ if (ret == -1 && e->transliterate) {
+ /* Transliterate, if we've been asked to. */
+ ret = translit_substitute(e, c);
+ }
if (ret == -1) {
- /* Transliterate, if we've been asked to.
- * Assumes that output is 8bit/8bit multibyte with ASCII G0.
- * This should be fine as the only <>8bit encodings are
- * UCS{2,4}, UTF-{16,32}, neither of which return -1.
- * Also, afaiaa, all supported multibyte encodings are ASCII
- * compatible. */
- /** \todo Actually perform some kind of transliteration */
- if (e->transliterate) {
- if ((int)*e->outbytesleft > 0) {
- if (e->out) {
- /* Flush through any pending shift sequences */
- /** \todo this is a bit dodgy, as we only
- * really need to ensure that the ASCII set
- * is mapped into G0 in ISO2022 encodings.
- * This will reset G1->G3, too, which may
- * break things. If so, we may have to
- * perform some dirty hackery which relies
- * upon knowledge of UnicodeLib's internals
- */
- encoding_write(e->out, NULL_UCS4,
- e->outbuf,
- (int*)e->outbytesleft);
- }
-
- if ((int)*e->outbytesleft > 0) {
- *(*e->outbuf)++ = '?';
- --*e->outbytesleft;
-
- e->write_state = WRITE_SUCCESS;
- } else {
- e->write_state = WRITE_NOMEM;
- }
- } else {
- e->write_state = WRITE_NOMEM;
- }
- } else {
- e->write_state = WRITE_FAILED;
- }
+ e->write_state = WRITE_FAILED;
+ } else if (ret == 0) {
+ e->write_state = WRITE_NOMEM;
+ } else {
+ e->write_state = WRITE_SUCCESS;
}
/* Always stop after processing each character */
diff --git a/src/internal.h b/src/internal.h
index 42efefe..827dccb 100644
--- a/src/internal.h
+++ b/src/internal.h
@@ -18,11 +18,14 @@ struct encoding_context {
Encoding *in;
unsigned int inflags;
Encoding *out;
+ Encoding *transout;
unsigned int outflags;
unsigned short *intab, *outtab;
char **outbuf;
size_t *outbytesleft;
char transliterate;
+ const UCS2 *substitution;
+ size_t substlen;
enum {
WRITE_SUCCESS,
WRITE_FAILED,
@@ -67,6 +70,10 @@ struct canon *alias_canonicalise(const char *alias);
short mibenum_from_name(const char *alias);
const char *mibenum_to_name(short mibenum);
+/* in translit.c */
+int translit_flush_replacement(struct encoding_context *e);
+int translit_substitute(struct encoding_context *e, UCS4 c);
+
/* in utils.c */
int strcasecmp(const char *s1, const char *s2);
int strncasecmp(const char *s1, const char *s2, size_t len);
diff --git a/src/transtab b/src/transtab
new file mode 100644
index 0000000..e51465e
--- a/dev/null
+++ b/src/transtab
@@ -0,0 +1,1689 @@
+% Source: http://www.cl.cam.ac.uk/~mgk25/unicode.html#libs
+%
+% "This package contains a table for transliterating ISO 10646 texts into
+% best-effort representations using smaller coded character sets (ASCII,
+% ISO 8859, etc.). It is primarily intended for inclusion into the GNU C
+% library, but might be of use for other applications as well. The table
+% is freely available to anyone."
+
+
+% APOSTROPHE
+<U0027> <U2019>
+% GRAVE ACCENT
+<U0060> <U201B>;<U2018>
+% NO-BREAK SPACE
+<U00A0> <U0020>
+% INVERTED EXCLAMATION MARK
+<U00A1> <U0021>
+% CENT SIGN
+<U00A2> <U0063>
+% POUND SIGN
+<U00A3> "<U0047><U0042><U0050>"
+% YEN SIGN
+<U00A5> <U0059>
+% BROKEN BAR
+<U00A6> <U007C>
+% SECTION SIGN
+<U00A7> <U0053>
+% DIAERESIS
+<U00A8> <U0022>
+% COPYRIGHT SIGN
+<U00A9> "<U0028><U0063><U0029>";<U0063>
+% FEMININE ORDINAL INDICATOR
+<U00AA> <U0061>
+% LEFT-POINTING DOUBLE ANGLE QUOTATION MARK
+<U00AB> "<U003C><U003C>"
+% NOT SIGN
+<U00AC> <U002D>
+% SOFT HYPHEN
+<U00AD> <U002D>
+% REGISTERED SIGN
+<U00AE> "<U0028><U0052><U0029>"
+% MACRON
+<U00AF> <U002D>
+% DEGREE SIGN
+<U00B0> <U0020>
+% PLUS-MINUS SIGN
+<U00B1> "<U002B><U002F><U002D>"
+% SUPERSCRIPT TWO
+<U00B2> "<U005E><U0032>";<U0032>
+% SUPERSCRIPT THREE
+<U00B3> "<U005E><U0033>";<U0033>
+% ACUTE ACCENT
+<U00B4> <U0027>
+% MICRO SIGN
+<U00B5> <U03BC>;<U0075>
+% PILCROW SIGN
+<U00B6> <U0050>
+% MIDDLE DOT
+<U00B7> <U002E>
+% CEDILLA
+<U00B8> <U002C>
+% SUPERSCRIPT ONE
+<U00B9> "<U005E><U0031>";<U0031>
+% MASCULINE ORDINAL INDICATOR
+<U00BA> <U006F>
+% RIGHT-POINTING DOUBLE ANGLE QUOTATION MARK
+<U00BB> "<U003E><U003E>"
+% VULGAR FRACTION ONE QUARTER
+<U00BC> "<U0020><U0031><U002F><U0034>"
+% VULGAR FRACTION ONE HALF
+<U00BD> "<U0020><U0031><U002F><U0032>"
+% VULGAR FRACTION THREE QUARTERS
+<U00BE> "<U0020><U0033><U002F><U0034>"
+% INVERTED QUESTION MARK
+<U00BF> <U003F>
+% LATIN CAPITAL LETTER A WITH GRAVE
+<U00C0> <U0041>
+% LATIN CAPITAL LETTER A WITH ACUTE
+<U00C1> <U0041>
+% LATIN CAPITAL LETTER A WITH CIRCUMFLEX
+<U00C2> <U0041>
+% LATIN CAPITAL LETTER A WITH TILDE
+<U00C3> <U0041>
+% LATIN CAPITAL LETTER A WITH DIAERESIS
+<U00C4> "<U0041><U0065>";<U0041>
+% LATIN CAPITAL LETTER A WITH RING ABOVE
+<U00C5> "<U0041><U0061>";<U0041>
+% LATIN CAPITAL LETTER AE
+<U00C6> "<U0041><U0045>";<U0041>
+% LATIN CAPITAL LETTER C WITH CEDILLA
+<U00C7> <U0043>
+% LATIN CAPITAL LETTER E WITH GRAVE
+<U00C8> <U0045>
+% LATIN CAPITAL LETTER E WITH ACUTE
+<U00C9> <U0045>
+% LATIN CAPITAL LETTER E WITH CIRCUMFLEX
+<U00CA> <U0045>
+% LATIN CAPITAL LETTER E WITH DIAERESIS
+<U00CB> <U0045>
+% LATIN CAPITAL LETTER I WITH GRAVE
+<U00CC> <U0049>
+% LATIN CAPITAL LETTER I WITH ACUTE
+<U00CD> <U0049>
+% LATIN CAPITAL LETTER I WITH CIRCUMFLEX
+<U00CE> <U0049>
+% LATIN CAPITAL LETTER I WITH DIAERESIS
+<U00CF> <U0049>
+% LATIN CAPITAL LETTER ETH
+<U00D0> <U0044>
+% LATIN CAPITAL LETTER N WITH TILDE
+<U00D1> <U004E>
+% LATIN CAPITAL LETTER O WITH GRAVE
+<U00D2> <U004F>
+% LATIN CAPITAL LETTER O WITH ACUTE
+<U00D3> <U004F>
+% LATIN CAPITAL LETTER O WITH CIRCUMFLEX
+<U00D4> <U004F>
+% LATIN CAPITAL LETTER O WITH TILDE
+<U00D5> <U004F>
+% LATIN CAPITAL LETTER O WITH DIAERESIS
+<U00D6> "<U004F><U0065>";<U004F>
+% MULTIPLICATION SIGN
+<U00D7> <U0078>
+% LATIN CAPITAL LETTER O WITH STROKE
+<U00D8> <U004F>
+% LATIN CAPITAL LETTER U WITH GRAVE
+<U00D9> <U0055>
+% LATIN CAPITAL LETTER U WITH ACUTE
+<U00DA> <U0055>
+% LATIN CAPITAL LETTER U WITH CIRCUMFLEX
+<U00DB> <U0055>
+% LATIN CAPITAL LETTER U WITH DIAERESIS
+<U00DC> "<U0055><U0065>";<U0055>
+% LATIN CAPITAL LETTER Y WITH ACUTE
+<U00DD> <U0059>
+% LATIN CAPITAL LETTER THORN
+<U00DE> "<U0054><U0068>"
+% LATIN SMALL LETTER SHARP S
+<U00DF> "<U0073><U0073>";<U03B2>
+% LATIN SMALL LETTER A WITH GRAVE
+<U00E0> <U0061>
+% LATIN SMALL LETTER A WITH ACUTE
+<U00E1> <U0061>
+% LATIN SMALL LETTER A WITH CIRCUMFLEX
+<U00E2> <U0061>
+% LATIN SMALL LETTER A WITH TILDE
+<U00E3> <U0061>
+% LATIN SMALL LETTER A WITH DIAERESIS
+<U00E4> "<U0061><U0065>";<U0061>
+% LATIN SMALL LETTER A WITH RING ABOVE
+<U00E5> "<U0061><U0061>";<U0061>
+% LATIN SMALL LETTER AE
+<U00E6> "<U0061><U0065>";<U0061>
+% LATIN SMALL LETTER C WITH CEDILLA
+<U00E7> <U0063>
+% LATIN SMALL LETTER E WITH GRAVE
+<U00E8> <U0065>
+% LATIN SMALL LETTER E WITH ACUTE
+<U00E9> <U0065>
+% LATIN SMALL LETTER E WITH CIRCUMFLEX
+<U00EA> <U0065>
+% LATIN SMALL LETTER E WITH DIAERESIS
+<U00EB> <U0065>
+% LATIN SMALL LETTER I WITH GRAVE
+<U00EC> <U0069>
+% LATIN SMALL LETTER I WITH ACUTE
+<U00ED> <U0069>
+% LATIN SMALL LETTER I WITH CIRCUMFLEX
+<U00EE> <U0069>
+% LATIN SMALL LETTER I WITH DIAERESIS
+<U00EF> <U0069>
+% LATIN SMALL LETTER ETH
+<U00F0> <U0064>
+% LATIN SMALL LETTER N WITH TILDE
+<U00F1> <U006E>
+% LATIN SMALL LETTER O WITH GRAVE
+<U00F2> <U006F>
+% LATIN SMALL LETTER O WITH ACUTE
+<U00F3> <U006F>
+% LATIN SMALL LETTER O WITH CIRCUMFLEX
+<U00F4> <U006F>
+% LATIN SMALL LETTER O WITH TILDE
+<U00F5> <U006F>
+% LATIN SMALL LETTER O WITH DIAERESIS
+<U00F6> "<U006F><U0065>";<U006F>
+% DIVISION SIGN
+<U00F7> <U003A>
+% LATIN SMALL LETTER O WITH STROKE
+<U00F8> <U006F>
+% LATIN SMALL LETTER U WITH GRAVE
+<U00F9> <U0075>
+% LATIN SMALL LETTER U WITH ACUTE
+<U00FA> <U0075>
+% LATIN SMALL LETTER U WITH CIRCUMFLEX
+<U00FB> <U0075>
+% LATIN SMALL LETTER U WITH DIAERESIS
+<U00FC> "<U0075><U0065>";<U0075>
+% LATIN SMALL LETTER Y WITH ACUTE
+<U00FD> <U0079>
+% LATIN SMALL LETTER THORN
+<U00FE> "<U0074><U0068>"
+% LATIN SMALL LETTER Y WITH DIAERESIS
+<U00FF> <U0079>
+% LATIN CAPITAL LETTER A WITH MACRON
+<U0100> <U0041>
+% LATIN SMALL LETTER A WITH MACRON
+<U0101> <U0061>
+% LATIN CAPITAL LETTER A WITH BREVE
+<U0102> <U0041>
+% LATIN SMALL LETTER A WITH BREVE
+<U0103> <U0061>
+% LATIN CAPITAL LETTER A WITH OGONEK
+<U0104> <U0041>
+% LATIN SMALL LETTER A WITH OGONEK
+<U0105> <U0061>
+% LATIN CAPITAL LETTER C WITH ACUTE
+<U0106> <U0043>
+% LATIN SMALL LETTER C WITH ACUTE
+<U0107> <U0063>
+% LATIN CAPITAL LETTER C WITH CIRCUMFLEX
+<U0108> "<U0043><U0068>";<U0043>
+% LATIN SMALL LETTER C WITH CIRCUMFLEX
+<U0109> "<U0063><U0068>";<U0063>
+% LATIN CAPITAL LETTER C WITH DOT ABOVE
+<U010A> <U0043>
+% LATIN SMALL LETTER C WITH DOT ABOVE
+<U010B> <U0063>
+% LATIN CAPITAL LETTER C WITH CARON
+<U010C> <U0043>
+% LATIN SMALL LETTER C WITH CARON
+<U010D> <U0063>
+% LATIN CAPITAL LETTER D WITH CARON
+<U010E> <U0044>
+% LATIN SMALL LETTER D WITH CARON
+<U010F> <U0064>
+% LATIN CAPITAL LETTER D WITH STROKE
+<U0110> <U0044>
+% LATIN SMALL LETTER D WITH STROKE
+<U0111> <U0064>
+% LATIN CAPITAL LETTER E WITH MACRON
+<U0112> <U0045>
+% LATIN SMALL LETTER E WITH MACRON
+<U0113> <U0065>
+% LATIN CAPITAL LETTER E WITH BREVE
+<U0114> <U0045>
+% LATIN SMALL LETTER E WITH BREVE
+<U0115> <U0065>
+% LATIN CAPITAL LETTER E WITH DOT ABOVE
+<U0116> <U0045>
+% LATIN SMALL LETTER E WITH DOT ABOVE
+<U0117> <U0065>
+% LATIN CAPITAL LETTER E WITH OGONEK
+<U0118> <U0045>
+% LATIN SMALL LETTER E WITH OGONEK
+<U0119> <U0065>
+% LATIN CAPITAL LETTER E WITH CARON
+<U011A> <U0045>
+% LATIN SMALL LETTER E WITH CARON
+<U011B> <U0065>
+% LATIN CAPITAL LETTER G WITH CIRCUMFLEX
+<U011C> "<U0047><U0068>";<U0047>
+% LATIN SMALL LETTER G WITH CIRCUMFLEX
+<U011D> "<U0067><U0068>";<U0067>
+% LATIN CAPITAL LETTER G WITH BREVE
+<U011E> <U0047>
+% LATIN SMALL LETTER G WITH BREVE
+<U011F> <U0067>
+% LATIN CAPITAL LETTER G WITH DOT ABOVE
+<U0120> <U0047>
+% LATIN SMALL LETTER G WITH DOT ABOVE
+<U0121> <U0067>
+% LATIN CAPITAL LETTER G WITH CEDILLA
+<U0122> <U0047>
+% LATIN SMALL LETTER G WITH CEDILLA
+<U0123> <U0067>
+% LATIN CAPITAL LETTER H WITH CIRCUMFLEX
+<U0124> "<U0048><U0068>";<U0048>
+% LATIN SMALL LETTER H WITH CIRCUMFLEX
+<U0125> "<U0068><U0068>";<U0068>
+% LATIN CAPITAL LETTER H WITH STROKE
+<U0126> <U0048>
+% LATIN SMALL LETTER H WITH STROKE
+<U0127> <U0068>
+% LATIN CAPITAL LETTER I WITH TILDE
+<U0128> <U0049>
+% LATIN SMALL LETTER I WITH TILDE
+<U0129> <U0069>
+% LATIN CAPITAL LETTER I WITH MACRON
+<U012A> <U0049>
+% LATIN SMALL LETTER I WITH MACRON
+<U012B> <U0069>
+% LATIN CAPITAL LETTER I WITH BREVE
+<U012C> <U0049>
+% LATIN SMALL LETTER I WITH BREVE
+<U012D> <U0069>
+% LATIN CAPITAL LETTER I WITH OGONEK
+<U012E> <U0049>
+% LATIN SMALL LETTER I WITH OGONEK
+<U012F> <U0069>
+% LATIN CAPITAL LETTER I WITH DOT ABOVE
+<U0130> <U0049>
+% LATIN SMALL LETTER DOTLESS I
+<U0131> <U0069>
+% LATIN CAPITAL LIGATURE IJ
+<U0132> "<U0049><U004A>"
+% LATIN SMALL LIGATURE IJ
+<U0133> "<U0069><U006A>"
+% LATIN CAPITAL LETTER J WITH CIRCUMFLEX
+<U0134> "<U004A><U0068>";<U004A>
+% LATIN SMALL LETTER J WITH CIRCUMFLEX
+<U0135> "<U006A><U0068>";<U006A>
+% LATIN CAPITAL LETTER K WITH CEDILLA
+<U0136> <U004B>
+% LATIN SMALL LETTER K WITH CEDILLA
+<U0137> <U006B>
+% LATIN SMALL LETTER KRA
+<U0138> <U006B>
+% LATIN CAPITAL LETTER L WITH ACUTE
+<U0139> <U004C>
+% LATIN SMALL LETTER L WITH ACUTE
+<U013A> <U006C>
+% LATIN CAPITAL LETTER L WITH CEDILLA
+<U013B> <U004C>
+% LATIN SMALL LETTER L WITH CEDILLA
+<U013C> <U006C>
+% LATIN CAPITAL LETTER L WITH CARON
+<U013D> <U004C>
+% LATIN SMALL LETTER L WITH CARON
+<U013E> <U006C>
+% LATIN CAPITAL LETTER L WITH MIDDLE DOT
+<U013F> "<U004C><U00B7>";"<U004C><U002E>";<U004C>
+% LATIN SMALL LETTER L WITH MIDDLE DOT
+<U0140> "<U006C><U00B7>";"<U006C><U002E>";<U006C>
+% LATIN CAPITAL LETTER L WITH STROKE
+<U0141> <U004C>
+% LATIN SMALL LETTER L WITH STROKE
+<U0142> <U006C>
+% LATIN CAPITAL LETTER N WITH ACUTE
+<U0143> <U004E>
+% LATIN SMALL LETTER N WITH ACUTE
+<U0144> <U006E>
+% LATIN CAPITAL LETTER N WITH CEDILLA
+<U0145> <U004E>
+% LATIN SMALL LETTER N WITH CEDILLA
+<U0146> <U006E>
+% LATIN CAPITAL LETTER N WITH CARON
+<U0147> <U004E>
+% LATIN SMALL LETTER N WITH CARON
+<U0148> <U006E>
+% LATIN SMALL LETTER N PRECEDED BY APOSTROPHE
+<U0149> "<U0027><U006E>"
+% LATIN CAPITAL LETTER ENG
+<U014A> "<U004E><U0047>";<U004E>
+% LATIN SMALL LETTER ENG
+<U014B> "<U006E><U0067>";<U006E>
+% LATIN CAPITAL LETTER O WITH MACRON
+<U014C> <U004F>
+% LATIN SMALL LETTER O WITH MACRON
+<U014D> <U006F>
+% LATIN CAPITAL LETTER O WITH BREVE
+<U014E> <U004F>
+% LATIN SMALL LETTER O WITH BREVE
+<U014F> <U006F>
+% LATIN CAPITAL LETTER O WITH DOUBLE ACUTE
+<U0150> <U004F>
+% LATIN SMALL LETTER O WITH DOUBLE ACUTE
+<U0151> <U006F>
+% LATIN CAPITAL LIGATURE OE
+<U0152> "<U004F><U0045>"
+% LATIN SMALL LIGATURE OE
+<U0153> "<U006F><U0065>"
+% LATIN CAPITAL LETTER R WITH ACUTE
+<U0154> <U0052>
+% LATIN SMALL LETTER R WITH ACUTE
+<U0155> <U0072>
+% LATIN CAPITAL LETTER R WITH CEDILLA
+<U0156> <U0052>
+% LATIN SMALL LETTER R WITH CEDILLA
+<U0157> <U0072>
+% LATIN CAPITAL LETTER R WITH CARON
+<U0158> <U0052>
+% LATIN SMALL LETTER R WITH CARON
+<U0159> <U0072>
+% LATIN CAPITAL LETTER S WITH ACUTE
+<U015A> <U0053>
+% LATIN SMALL LETTER S WITH ACUTE
+<U015B> <U0073>
+% LATIN CAPITAL LETTER S WITH CIRCUMFLEX
+<U015C> "<U0053><U0068>";<U0053>
+% LATIN SMALL LETTER S WITH CIRCUMFLEX
+<U015D> "<U0073><U0068>";<U0073>
+% LATIN CAPITAL LETTER S WITH CEDILLA
+<U015E> <U0053>
+% LATIN SMALL LETTER S WITH CEDILLA
+<U015F> <U0073>
+% LATIN CAPITAL LETTER S WITH CARON
+<U0160> <U0053>
+% LATIN SMALL LETTER S WITH CARON
+<U0161> <U0073>
+% LATIN CAPITAL LETTER T WITH CEDILLA
+<U0162> <U0054>
+% LATIN SMALL LETTER T WITH CEDILLA
+<U0163> <U0074>
+% LATIN CAPITAL LETTER T WITH CARON
+<U0164> <U0054>
+% LATIN SMALL LETTER T WITH CARON
+<U0165> <U0074>
+% LATIN CAPITAL LETTER T WITH STROKE
+<U0166> <U0054>
+% LATIN SMALL LETTER T WITH STROKE
+<U0167> <U0074>
+% LATIN CAPITAL LETTER U WITH TILDE
+<U0168> <U0055>
+% LATIN SMALL LETTER U WITH TILDE
+<U0169> <U0075>
+% LATIN CAPITAL LETTER U WITH MACRON
+<U016A> <U0055>
+% LATIN SMALL LETTER U WITH MACRON
+<U016B> <U0075>
+% LATIN CAPITAL LETTER U WITH BREVE
+<U016C> <U0055>
+% LATIN SMALL LETTER U WITH BREVE
+<U016D> <U0075>
+% LATIN CAPITAL LETTER U WITH RING ABOVE
+<U016E> <U0055>
+% LATIN SMALL LETTER U WITH RING ABOVE
+<U016F> <U0075>
+% LATIN CAPITAL LETTER U WITH DOUBLE ACUTE
+<U0170> <U0055>
+% LATIN SMALL LETTER U WITH DOUBLE ACUTE
+<U0171> <U0075>
+% LATIN CAPITAL LETTER U WITH OGONEK
+<U0172> <U0055>
+% LATIN SMALL LETTER U WITH OGONEK
+<U0173> <U0075>
+% LATIN CAPITAL LETTER W WITH CIRCUMFLEX
+<U0174> <U0057>
+% LATIN SMALL LETTER W WITH CIRCUMFLEX
+<U0175> <U0077>
+% LATIN CAPITAL LETTER Y WITH CIRCUMFLEX
+<U0176> <U0059>
+% LATIN SMALL LETTER Y WITH CIRCUMFLEX
+<U0177> <U0079>
+% LATIN CAPITAL LETTER Y WITH DIAERESIS
+<U0178> <U0059>
+% LATIN CAPITAL LETTER Z WITH ACUTE
+<U0179> <U005A>
+% LATIN SMALL LETTER Z WITH ACUTE
+<U017A> <U007A>
+% LATIN CAPITAL LETTER Z WITH DOT ABOVE
+<U017B> <U005A>
+% LATIN SMALL LETTER Z WITH DOT ABOVE
+<U017C> <U007A>
+% LATIN CAPITAL LETTER Z WITH CARON
+<U017D> <U005A>
+% LATIN SMALL LETTER Z WITH CARON
+<U017E> <U007A>
+% LATIN SMALL LETTER LONG S
+<U017F> <U0073>
+% LATIN SMALL LETTER F WITH HOOK
+<U0192> <U0066>
+% LATIN CAPITAL LETTER S WITH COMMA BELOW
+<U0218> <U015E>;<U0053>
+% LATIN SMALL LETTER S WITH COMMA BELOW
+<U0219> <U015F>;<U0073>
+% LATIN CAPITAL LETTER T WITH COMMA BELOW
+<U021A> <U0162>;<U0054>
+% LATIN SMALL LETTER T WITH COMMA BELOW
+<U021B> <U0163>;<U0074>
+% MODIFIER LETTER PRIME
+<U02B9> <U2032>;<U0027>
+% MODIFIER LETTER TURNED COMMA
+<U02BB> <U2018>
+% MODIFIER LETTER APOSTROPHE
+<U02BC> <U2019>;<U0027>
+% MODIFIER LETTER REVERSED COMMA
+<U02BD> <U201B>
+% MODIFIER LETTER CIRCUMFLEX ACCENT
+<U02C6> <U005E>
+% MODIFIER LETTER VERTICAL LINE
+<U02C8> <U0027>
+% MODIFIER LETTER MACRON
+<U02C9> <U00AF>
+% MODIFIER LETTER LOW VERTICAL LINE
+<U02CC> <U002C>
+% MODIFIER LETTER TRIANGULAR COLON
+<U02D0> <U003A>
+% RING ABOVE
+<U02DA> <U00B0>
+% SMALL TILDE
+<U02DC> <U007E>
+% DOUBLE ACUTE ACCENT
+<U02DD> <U0022>
+% GREEK NUMERAL SIGN
+<U0374> <U0027>
+% GREEK LOWER NUMERAL SIGN
+<U0375> <U002C>
+% GREEK QUESTION MARK
+<U037E> <U003B>
+% LATIN CAPITAL LETTER B WITH DOT ABOVE
+<U1E02> <U0042>
+% LATIN SMALL LETTER B WITH DOT ABOVE
+<U1E03> <U0062>
+% LATIN CAPITAL LETTER D WITH DOT ABOVE
+<U1E0A> <U0044>
+% LATIN SMALL LETTER D WITH DOT ABOVE
+<U1E0B> <U0064>
+% LATIN CAPITAL LETTER F WITH DOT ABOVE
+<U1E1E> <U0046>
+% LATIN SMALL LETTER F WITH DOT ABOVE
+<U1E1F> <U0066>
+% LATIN CAPITAL LETTER M WITH DOT ABOVE
+<U1E40> <U004D>
+% LATIN SMALL LETTER M WITH DOT ABOVE
+<U1E41> <U006D>
+% LATIN CAPITAL LETTER P WITH DOT ABOVE
+<U1E56> <U0050>
+% LATIN SMALL LETTER P WITH DOT ABOVE
+<U1E57> <U0070>
+% LATIN CAPITAL LETTER S WITH DOT ABOVE
+<U1E60> <U0053>
+% LATIN SMALL LETTER S WITH DOT ABOVE
+<U1E61> <U0073>
+% LATIN CAPITAL LETTER T WITH DOT ABOVE
+<U1E6A> <U0054>
+% LATIN SMALL LETTER T WITH DOT ABOVE
+<U1E6B> <U0074>
+% LATIN CAPITAL LETTER W WITH GRAVE
+<U1E80> <U0057>
+% LATIN SMALL LETTER W WITH GRAVE
+<U1E81> <U0077>
+% LATIN CAPITAL LETTER W WITH ACUTE
+<U1E82> <U0057>
+% LATIN SMALL LETTER W WITH ACUTE
+<U1E83> <U0077>
+% LATIN CAPITAL LETTER W WITH DIAERESIS
+<U1E84> <U0057>
+% LATIN SMALL LETTER W WITH DIAERESIS
+<U1E85> <U0077>
+% LATIN CAPITAL LETTER Y WITH GRAVE
+<U1EF2> <U0059>
+% LATIN SMALL LETTER Y WITH GRAVE
+<U1EF3> <U0079>
+% EN QUAD
+<U2000> <U0020>
+% EM QUAD
+<U2001> "<U0020><U0020>"
+% EN SPACE
+<U2002> <U0020>
+% EM SPACE
+<U2003> "<U0020><U0020>"
+% THREE-PER-EM SPACE
+<U2004> <U0020>
+% FOUR-PER-EM SPACE
+<U2005> <U0020>
+% SIX-PER-EM SPACE
+<U2006> <U0020>
+% FIGURE SPACE
+<U2007> <U0020>
+% PUNCTUATION SPACE
+<U2008> <U0020>
+% THIN SPACE
+<U2009> <U0020>
+% HAIR SPACE
+<U200A> ""
+% ZERO WIDTH SPACE
+<U200B> ""
+% ZERO WIDTH NON-JOINER
+<U200C> ""
+% ZERO WIDTH JOINER
+<U200D> ""
+% LEFT-TO-RIGHT MARK
+<U200E> ""
+% RIGHT-TO-LEFT MARK
+<U200F> ""
+% HYPHEN
+<U2010> <U002D>
+% NON-BREAKING HYPHEN
+<U2011> <U002D>
+% FIGURE DASH
+<U2012> <U002D>
+% EN DASH
+<U2013> <U002D>
+% EM DASH
+<U2014> "<U002D><U002D>"
+% HORIZONTAL BAR
+<U2015> "<U002D><U002D>"
+% DOUBLE VERTICAL LINE
+<U2016> "<U007C><U007C>"
+% DOUBLE LOW LINE
+<U2017> <U005F>
+% LEFT SINGLE QUOTATION MARK
+<U2018> <U0027>
+% RIGHT SINGLE QUOTATION MARK
+<U2019> <U0027>
+% SINGLE LOW-9 QUOTATION MARK
+<U201A> <U0027>
+% SINGLE HIGH-REVERSED-9 QUOTATION MARK
+<U201B> <U0027>
+% LEFT DOUBLE QUOTATION MARK
+<U201C> <U0022>
+% RIGHT DOUBLE QUOTATION MARK
+<U201D> <U0022>
+% DOUBLE LOW-9 QUOTATION MARK
+<U201E> <U0022>
+% DOUBLE HIGH-REVERSED-9 QUOTATION MARK
+<U201F> <U0022>
+% DAGGER
+<U2020> <U002B>
+% DOUBLE DAGGER
+<U2021> "<U002B><U002B>"
+% BULLET
+<U2022> <U006F>
+% TRIANGULAR BULLET
+<U2023> <U003E>
+% ONE DOT LEADER
+<U2024> <U002E>
+% TWO DOT LEADER
+<U2025> "<U002E><U002E>"
+% HORIZONTAL ELLIPSIS
+<U2026> "<U002E><U002E><U002E>"
+% HYPHENATION POINT
+<U2027> <U002D>
+% LEFT-TO-RIGHT EMBEDDING
+<U202A> ""
+% RIGHT-TO-LEFT EMBEDDING
+<U202B> ""
+% POP DIRECTIONAL FORMATTING
+<U202C> ""
+% LEFT-TO-RIGHT OVERRIDE
+<U202D> ""
+% RIGHT-TO-LEFT OVERRIDE
+<U202E> ""
+% NARROW NO-BREAK SPACE
+<U202F> <U0020>
+% PER MILLE SIGN
+<U2030> "<U0020><U0030><U002F><U0030><U0030>"
+% PRIME
+<U2032> <U0027>
+% DOUBLE PRIME
+<U2033> <U0022>
+% TRIPLE PRIME
+<U2034> "<U0027><U0027><U0027>"
+% REVERSED PRIME
+<U2035> <U0060>
+% REVERSED DOUBLE PRIME
+<U2036> "<U0060><U0060>"
+% REVERSED TRIPLE PRIME
+<U2037> "<U0060><U0060><U0060>"
+% SINGLE LEFT-POINTING ANGLE QUOTATION MARK
+<U2039> <U003C>
+% SINGLE RIGHT-POINTING ANGLE QUOTATION MARK
+<U203A> <U003E>
+% DOUBLE EXCLAMATION MARK
+<U203C> "<U0021><U0021>"
+% OVERLINE
+<U203E> <U002D>
+% HYPHEN BULLET
+<U2043> <U002D>
+% FRACTION SLASH
+<U2044> <U002F>
+% QUESTION EXCLAMATION MARK
+<U2048> "<U003F><U0021>"
+% EXCLAMATION QUESTION MARK
+<U2049> "<U0021><U003F>"
+% TIRONIAN SIGN ET
+<U204A> <U0037>
+% SUPERSCRIPT ZERO
+<U2070> "<U005E><U0030>";<U0030>
+% SUPERSCRIPT FOUR
+<U2074> "<U005E><U0034>";<U0034>
+% SUPERSCRIPT FIVE
+<U2075> "<U005E><U0035>";<U0035>
+% SUPERSCRIPT SIX
+<U2076> "<U005E><U0036>";<U0036>
+% SUPERSCRIPT SEVEN
+<U2077> "<U005E><U0037>";<U0037>
+% SUPERSCRIPT EIGHT
+<U2078> "<U005E><U0038>";<U0038>
+% SUPERSCRIPT NINE
+<U2079> "<U005E><U0039>";<U0039>
+% SUPERSCRIPT PLUS SIGN
+<U207A> "<U005E><U002B>";<U002B>
+% SUPERSCRIPT MINUS
+<U207B> "<U005E><U002D>";<U002D>
+% SUPERSCRIPT EQUALS SIGN
+<U207C> "<U005E><U003D>";<U003D>
+% SUPERSCRIPT LEFT PARENTHESIS
+<U207D> "<U005E><U0028>";<U0028>
+% SUPERSCRIPT RIGHT PARENTHESIS
+<U207E> "<U005E><U0029>";<U0029>
+% SUPERSCRIPT LATIN SMALL LETTER N
+<U207F> "<U005E><U006E>";<U006E>
+% SUBSCRIPT ZERO
+<U2080> "<U005F><U0030>";<U0030>
+% SUBSCRIPT ONE
+<U2081> "<U005F><U0031>";<U0031>
+% SUBSCRIPT TWO
+<U2082> "<U005F><U0032>";<U0032>
+% SUBSCRIPT THREE
+<U2083> "<U005F><U0033>";<U0033>
+% SUBSCRIPT FOUR
+<U2084> "<U005F><U0034>";<U0034>
+% SUBSCRIPT FIVE
+<U2085> "<U005F><U0035>";<U0035>
+% SUBSCRIPT SIX
+<U2086> "<U005F><U0036>";<U0036>
+% SUBSCRIPT SEVEN
+<U2087> "<U005F><U0037>";<U0037>
+% SUBSCRIPT EIGHT
+<U2088> "<U005F><U0038>";<U0038>
+% SUBSCRIPT NINE
+<U2089> "<U005F><U0039>";<U0039>
+% SUBSCRIPT PLUS SIGN
+<U208A> "<U005F><U002B>";<U002B>
+% SUBSCRIPT MINUS
+<U208B> "<U005F><U002D>";<U002D>
+% SUBSCRIPT EQUALS SIGN
+<U208C> "<U005F><U003D>";<U003D>
+% SUBSCRIPT LEFT PARENTHESIS
+<U208D> "<U005F><U0028>";<U0028>
+% SUBSCRIPT RIGHT PARENTHESIS
+<U208E> "<U005F><U0029>";<U0029>
+% EURO SIGN
+<U20AC> "<U0045><U0055><U0052>";<U0045>
+% ACCOUNT OF
+<U2100> "<U0061><U002F><U0063>"
+% ADDRESSED TO THE SUBJECT
+<U2101> "<U0061><U002F><U0073>"
+% DEGREE CELSIUS
+<U2103> "<U00B0><U0043>";<U0043>
+% CARE OF
+<U2105> "<U0063><U002F><U006F>"
+% CADA UNA
+<U2106> "<U0063><U002F><U0075>"
+% DEGREE FAHRENHEIT
+<U2109> "<U00B0><U0046>";<U0046>
+% SCRIPT SMALL L
+<U2113> <U006C>
+% NUMERO SIGN
+<U2116> "<U004E><U00BA>";"<U004E><U006F>"
+% SOUND RECORDING COPYRIGHT
+<U2117> "<U0028><U0050><U0029>"
+% SERVICE MARK
+<U2120> "<U005B><U0053><U004D><U005D>"
+% TELEPHONE SIGN
+<U2121> "<U0054><U0045><U004C>"
+% TRADE MARK SIGN
+<U2122> "<U005B><U0054><U004D><U005D>"
+% OHM SIGN
+<U2126> <U03A9>;"<U006F><U0068><U006D>";<U004F>
+% KELVIN SIGN
+<U212A> <U004B>
+% ANGSTROM SIGN
+<U212B> <U00C5>
+% ESTIMATED SYMBOL
+<U212E> <U0065>
+% VULGAR FRACTION ONE THIRD
+<U2153> "<U0020><U0031><U002F><U0033>"
+% VULGAR FRACTION TWO THIRDS
+<U2154> "<U0020><U0032><U002F><U0033>"
+% VULGAR FRACTION ONE FIFTH
+<U2155> "<U0020><U0031><U002F><U0035>"
+% VULGAR FRACTION TWO FIFTHS
+<U2156> "<U0020><U0032><U002F><U0035>"
+% VULGAR FRACTION THREE FIFTHS
+<U2157> "<U0020><U0033><U002F><U0035>"
+% VULGAR FRACTION FOUR FIFTHS
+<U2158> "<U0020><U0034><U002F><U0035>"
+% VULGAR FRACTION ONE SIXTH
+<U2159> "<U0020><U0031><U002F><U0036>"
+% VULGAR FRACTION FIVE SIXTHS
+<U215A> "<U0020><U0035><U002F><U0036>"
+% VULGAR FRACTION ONE EIGHTH
+<U215B> "<U0020><U0031><U002F><U0038>"
+% VULGAR FRACTION THREE EIGHTHS
+<U215C> "<U0020><U0033><U002F><U0038>"
+% VULGAR FRACTION FIVE EIGHTHS
+<U215D> "<U0020><U0035><U002F><U0038>"
+% VULGAR FRACTION SEVEN EIGHTHS
+<U215E> "<U0020><U0037><U002F><U0038>"
+% FRACTION NUMERATOR ONE
+<U215F> "<U0020><U0031><U002F>"
+% ROMAN NUMERAL ONE
+<U2160> <U0049>
+% ROMAN NUMERAL TWO
+<U2161> "<U0049><U0049>"
+% ROMAN NUMERAL THREE
+<U2162> "<U0049><U0049><U0049>"
+% ROMAN NUMERAL FOUR
+<U2163> "<U0049><U0056>"
+% ROMAN NUMERAL FIVE
+<U2164> <U0056>
+% ROMAN NUMERAL SIX
+<U2165> "<U0056><U0049>"
+% ROMAN NUMERAL SEVEN
+<U2166> "<U0056><U0049><U0049>"
+% ROMAN NUMERAL EIGHT
+<U2167> "<U0056><U0049><U0049><U0049>"
+% ROMAN NUMERAL NINE
+<U2168> "<U0049><U0058>"
+% ROMAN NUMERAL TEN
+<U2169> <U0058>
+% ROMAN NUMERAL ELEVEN
+<U216A> "<U0058><U0049>"
+% ROMAN NUMERAL TWELVE
+<U216B> "<U0058><U0049><U0049>"
+% ROMAN NUMERAL FIFTY
+<U216C> <U004C>
+% ROMAN NUMERAL ONE HUNDRED
+<U216D> <U0043>
+% ROMAN NUMERAL FIVE HUNDRED
+<U216E> <U0044>
+% ROMAN NUMERAL ONE THOUSAND
+<U216F> <U004D>
+% SMALL ROMAN NUMERAL ONE
+<U2170> <U0069>
+% SMALL ROMAN NUMERAL TWO
+<U2171> "<U0069><U0069>"
+% SMALL ROMAN NUMERAL THREE
+<U2172> "<U0069><U0069><U0069>"
+% SMALL ROMAN NUMERAL FOUR
+<U2173> "<U0069><U0076>"
+% SMALL ROMAN NUMERAL FIVE
+<U2174> <U0076>
+% SMALL ROMAN NUMERAL SIX
+<U2175> "<U0076><U0069>"
+% SMALL ROMAN NUMERAL SEVEN
+<U2176> "<U0076><U0069><U0069>"
+% SMALL ROMAN NUMERAL EIGHT
+<U2177> "<U0076><U0069><U0069><U0069>"
+% SMALL ROMAN NUMERAL NINE
+<U2178> "<U0069><U0078>"
+% SMALL ROMAN NUMERAL TEN
+<U2179> <U0078>
+% SMALL ROMAN NUMERAL ELEVEN
+<U217A> "<U0078><U0069>"
+% SMALL ROMAN NUMERAL TWELVE
+<U217B> "<U0078><U0069><U0069>"
+% SMALL ROMAN NUMERAL FIFTY
+<U217C> <U006C>
+% SMALL ROMAN NUMERAL ONE HUNDRED
+<U217D> <U0063>
+% SMALL ROMAN NUMERAL FIVE HUNDRED
+<U217E> <U0064>
+% SMALL ROMAN NUMERAL ONE THOUSAND
+<U217F> <U006D>
+% LEFTWARDS ARROW
+<U2190> "<U003C><U002D>"
+% UPWARDS ARROW
+<U2191> <U005E>
+% RIGHTWARDS ARROW
+<U2192> "<U002D><U003E>"
+% DOWNWARDS ARROW
+<U2193> <U0076>
+% LEFT RIGHT ARROW
+<U2194> "<U003C><U002D><U003E>"
+% LEFTWARDS DOUBLE ARROW
+<U21D0> "<U003C><U003D>"
+% RIGHTWARDS DOUBLE ARROW
+<U21D2> "<U003D><U003E>"
+% LEFT RIGHT DOUBLE ARROW
+<U21D4> "<U003C><U003D><U003E>"
+% MINUS SIGN
+<U2212> <U2013>;<U002D>
+% DIVISION SLASH
+<U2215> <U002F>
+% SET MINUS
+<U2216> <U005C>
+% ASTERISK OPERATOR
+<U2217> <U002A>
+% RING OPERATOR
+<U2218> <U006F>
+% BULLET OPERATOR
+<U2219> <U00B7>
+% INFINITY
+<U221E> "<U0069><U006E><U0066>"
+% DIVIDES
+<U2223> <U007C>
+% PARALLEL TO
+<U2225> "<U007C><U007C>"
+% RATIO
+<U2236> <U003A>
+% TILDE OPERATOR
+<U223C> <U007E>
+% NOT EQUAL TO
+<U2260> "<U002F><U003D>"
+% IDENTICAL TO
+<U2261> <U003D>
+% LESS-THAN OR EQUAL TO
+<U2264> "<U003C><U003D>"
+% GREATER-THAN OR EQUAL TO
+<U2265> "<U003E><U003D>"
+% MUCH LESS-THAN
+<U226A> "<U003C><U003C>"
+% MUCH GREATER-THAN
+<U226B> "<U003E><U003E>"
+% CIRCLED PLUS
+<U2295> "<U0028><U002B><U0029>"
+% CIRCLED MINUS
+<U2296> "<U0028><U002D><U0029>"
+% CIRCLED TIMES
+<U2297> "<U0028><U0078><U0029>"
+% CIRCLED DIVISION SLASH
+<U2298> "<U0028><U002F><U0029>"
+% RIGHT TACK
+<U22A2> "<U007C><U002D>"
+% LEFT TACK
+<U22A3> "<U002D><U007C>"
+% ASSERTION
+<U22A6> "<U007C><U002D>"
+% MODELS
+<U22A7> "<U007C><U003D>"
+% TRUE
+<U22A8> "<U007C><U003D>"
+% FORCES
+<U22A9> "<U007C><U007C><U002D>"
+% DOT OPERATOR
+<U22C5> <U00B7>
+% STAR OPERATOR
+<U22C6> <U002A>
+% EQUAL AND PARALLEL TO
+<U22D5> <U0023>
+% VERY MUCH LESS-THAN
+<U22D8> "<U003C><U003C><U003C>"
+% VERY MUCH GREATER-THAN
+<U22D9> "<U003E><U003E><U003E>"
+% MIDLINE HORIZONTAL ELLIPSIS
+<U22EF> "<U002E><U002E><U002E>"
+% LEFT-POINTING ANGLE BRACKET
+<U2329> <U003C>
+% RIGHT-POINTING ANGLE BRACKET
+<U232A> <U003E>
+% SYMBOL FOR NULL
+<U2400> "<U004E><U0055><U004C>"
+% SYMBOL FOR START OF HEADING
+<U2401> "<U0053><U004F><U0048>"
+% SYMBOL FOR START OF TEXT
+<U2402> "<U0053><U0054><U0058>"
+% SYMBOL FOR END OF TEXT
+<U2403> "<U0045><U0054><U0058>"
+% SYMBOL FOR END OF TRANSMISSION
+<U2404> "<U0045><U004F><U0054>"
+% SYMBOL FOR ENQUIRY
+<U2405> "<U0045><U004E><U0051>"
+% SYMBOL FOR ACKNOWLEDGE
+<U2406> "<U0041><U0043><U004B>"
+% SYMBOL FOR BELL
+<U2407> "<U0042><U0045><U004C>"
+% SYMBOL FOR BACKSPACE
+<U2408> "<U0042><U0053>"
+% SYMBOL FOR HORIZONTAL TABULATION
+<U2409> "<U0048><U0054>"
+% SYMBOL FOR LINE FEED
+<U240A> "<U004C><U0046>"
+% SYMBOL FOR VERTICAL TABULATION
+<U240B> "<U0056><U0054>"
+% SYMBOL FOR FORM FEED
+<U240C> "<U0046><U0046>"
+% SYMBOL FOR CARRIAGE RETURN
+<U240D> "<U0043><U0052>"
+% SYMBOL FOR SHIFT OUT
+<U240E> "<U0053><U004F>"
+% SYMBOL FOR SHIFT IN
+<U240F> "<U0053><U0049>"
+% SYMBOL FOR DATA LINK ESCAPE
+<U2410> "<U0044><U004C><U0045>"
+% SYMBOL FOR DEVICE CONTROL ONE
+<U2411> "<U0044><U0043><U0031>"
+% SYMBOL FOR DEVICE CONTROL TWO
+<U2412> "<U0044><U0043><U0032>"
+% SYMBOL FOR DEVICE CONTROL THREE
+<U2413> "<U0044><U0043><U0033>"
+% SYMBOL FOR DEVICE CONTROL FOUR
+<U2414> "<U0044><U0043><U0034>"
+% SYMBOL FOR NEGATIVE ACKNOWLEDGE
+<U2415> "<U004E><U0041><U004B>"
+% SYMBOL FOR SYNCHRONOUS IDLE
+<U2416> "<U0053><U0059><U004E>"
+% SYMBOL FOR END OF TRANSMISSION BLOCK
+<U2417> "<U0045><U0054><U0042>"
+% SYMBOL FOR CANCEL
+<U2418> "<U0043><U0041><U004E>"
+% SYMBOL FOR END OF MEDIUM
+<U2419> "<U0045><U004D>"
+% SYMBOL FOR SUBSTITUTE
+<U241A> "<U0053><U0055><U0042>"
+% SYMBOL FOR ESCAPE
+<U241B> "<U0045><U0053><U0043>"
+% SYMBOL FOR FILE SEPARATOR
+<U241C> "<U0046><U0053>"
+% SYMBOL FOR GROUP SEPARATOR
+<U241D> "<U0047><U0053>"
+% SYMBOL FOR RECORD SEPARATOR
+<U241E> "<U0052><U0053>"
+% SYMBOL FOR UNIT SEPARATOR
+<U241F> "<U0055><U0053>"
+% SYMBOL FOR SPACE
+<U2420> "<U0053><U0050>"
+% SYMBOL FOR DELETE
+<U2421> "<U0044><U0045><U004C>"
+% OPEN BOX
+<U2423> <U005F>
+% SYMBOL FOR NEWLINE
+<U2424> "<U004E><U004C>"
+% SYMBOL FOR DELETE FORM TWO
+<U2425> "<U002F><U002F><U002F>"
+% SYMBOL FOR SUBSTITUTE FORM TWO
+<U2426> <U003F>
+% CIRCLED DIGIT ONE
+<U2460> "<U0028><U0031><U0029>";<U0031>
+% CIRCLED DIGIT TWO
+<U2461> "<U0028><U0032><U0029>";<U0032>
+% CIRCLED DIGIT THREE
+<U2462> "<U0028><U0033><U0029>";<U0033>
+% CIRCLED DIGIT FOUR
+<U2463> "<U0028><U0034><U0029>";<U0034>
+% CIRCLED DIGIT FIVE
+<U2464> "<U0028><U0035><U0029>";<U0035>
+% CIRCLED DIGIT SIX
+<U2465> "<U0028><U0036><U0029>";<U0036>
+% CIRCLED DIGIT SEVEN
+<U2466> "<U0028><U0037><U0029>";<U0037>
+% CIRCLED DIGIT EIGHT
+<U2467> "<U0028><U0038><U0029>";<U0038>
+% CIRCLED DIGIT NINE
+<U2468> "<U0028><U0039><U0029>";<U0039>
+% CIRCLED NUMBER TEN
+<U2469> "<U0028><U0031><U0030><U0029>"
+% CIRCLED NUMBER ELEVEN
+<U246A> "<U0028><U0031><U0031><U0029>"
+% CIRCLED NUMBER TWELVE
+<U246B> "<U0028><U0031><U0032><U0029>"
+% CIRCLED NUMBER THIRTEEN
+<U246C> "<U0028><U0031><U0033><U0029>"
+% CIRCLED NUMBER FOURTEEN
+<U246D> "<U0028><U0031><U0034><U0029>"
+% CIRCLED NUMBER FIFTEEN
+<U246E> "<U0028><U0031><U0035><U0029>"
+% CIRCLED NUMBER SIXTEEN
+<U246F> "<U0028><U0031><U0036><U0029>"
+% CIRCLED NUMBER SEVENTEEN
+<U2470> "<U0028><U0031><U0037><U0029>"
+% CIRCLED NUMBER EIGHTEEN
+<U2471> "<U0028><U0031><U0038><U0029>"
+% CIRCLED NUMBER NINETEEN
+<U2472> "<U0028><U0031><U0039><U0029>"
+% CIRCLED NUMBER TWENTY
+<U2473> "<U0028><U0032><U0030><U0029>"
+% PARENTHESIZED DIGIT ONE
+<U2474> "<U0028><U0031><U0029>";<U0031>
+% PARENTHESIZED DIGIT TWO
+<U2475> "<U0028><U0032><U0029>";<U0032>
+% PARENTHESIZED DIGIT THREE
+<U2476> "<U0028><U0033><U0029>";<U0033>
+% PARENTHESIZED DIGIT FOUR
+<U2477> "<U0028><U0034><U0029>";<U0034>
+% PARENTHESIZED DIGIT FIVE
+<U2478> "<U0028><U0035><U0029>";<U0035>
+% PARENTHESIZED DIGIT SIX
+<U2479> "<U0028><U0036><U0029>";<U0036>
+% PARENTHESIZED DIGIT SEVEN
+<U247A> "<U0028><U0037><U0029>";<U0037>
+% PARENTHESIZED DIGIT EIGHT
+<U247B> "<U0028><U0038><U0029>";<U0038>
+% PARENTHESIZED DIGIT NINE
+<U247C> "<U0028><U0039><U0029>";<U0039>
+% PARENTHESIZED NUMBER TEN
+<U247D> "<U0028><U0031><U0030><U0029>"
+% PARENTHESIZED NUMBER ELEVEN
+<U247E> "<U0028><U0031><U0031><U0029>"
+% PARENTHESIZED NUMBER TWELVE
+<U247F> "<U0028><U0031><U0032><U0029>"
+% PARENTHESIZED NUMBER THIRTEEN
+<U2480> "<U0028><U0031><U0033><U0029>"
+% PARENTHESIZED NUMBER FOURTEEN
+<U2481> "<U0028><U0031><U0034><U0029>"
+% PARENTHESIZED NUMBER FIFTEEN
+<U2482> "<U0028><U0031><U0035><U0029>"
+% PARENTHESIZED NUMBER SIXTEEN
+<U2483> "<U0028><U0031><U0036><U0029>"
+% PARENTHESIZED NUMBER SEVENTEEN
+<U2484> "<U0028><U0031><U0037><U0029>"
+% PARENTHESIZED NUMBER EIGHTEEN
+<U2485> "<U0028><U0031><U0038><U0029>"
+% PARENTHESIZED NUMBER NINETEEN
+<U2486> "<U0028><U0031><U0039><U0029>"
+% PARENTHESIZED NUMBER TWENTY
+<U2487> "<U0028><U0032><U0030><U0029>"
+% DIGIT ONE FULL STOP
+<U2488> "<U0031><U002E>";<U0031>
+% DIGIT TWO FULL STOP
+<U2489> "<U0032><U002E>";<U0032>
+% DIGIT THREE FULL STOP
+<U248A> "<U0033><U002E>";<U0033>
+% DIGIT FOUR FULL STOP
+<U248B> "<U0034><U002E>";<U0034>
+% DIGIT FIVE FULL STOP
+<U248C> "<U0035><U002E>";<U0035>
+% DIGIT SIX FULL STOP
+<U248D> "<U0036><U002E>";<U0036>
+% DIGIT SEVEN FULL STOP
+<U248E> "<U0037><U002E>";<U0037>
+% DIGIT EIGHT FULL STOP
+<U248F> "<U0038><U002E>";<U0038>
+% DIGIT NINE FULL STOP
+<U2490> "<U0039><U002E>";<U0039>
+% NUMBER TEN FULL STOP
+<U2491> "<U0031><U0030><U002E>"
+% NUMBER ELEVEN FULL STOP
+<U2492> "<U0031><U0031><U002E>"
+% NUMBER TWELVE FULL STOP
+<U2493> "<U0031><U0032><U002E>"
+% NUMBER THIRTEEN FULL STOP
+<U2494> "<U0031><U0033><U002E>"
+% NUMBER FOURTEEN FULL STOP
+<U2495> "<U0031><U0034><U002E>"
+% NUMBER FIFTEEN FULL STOP
+<U2496> "<U0031><U0035><U002E>"
+% NUMBER SIXTEEN FULL STOP
+<U2497> "<U0031><U0036><U002E>"
+% NUMBER SEVENTEEN FULL STOP
+<U2498> "<U0031><U0037><U002E>"
+% NUMBER EIGHTEEN FULL STOP
+<U2499> "<U0031><U0038><U002E>"
+% NUMBER NINETEEN FULL STOP
+<U249A> "<U0031><U0039><U002E>"
+% NUMBER TWENTY FULL STOP
+<U249B> "<U0032><U0030><U002E>"
+% PARENTHESIZED LATIN SMALL LETTER A
+<U249C> "<U0028><U0061><U0029>";<U0061>
+% PARENTHESIZED LATIN SMALL LETTER B
+<U249D> "<U0028><U0062><U0029>";<U0062>
+% PARENTHESIZED LATIN SMALL LETTER C
+<U249E> "<U0028><U0063><U0029>";<U0063>
+% PARENTHESIZED LATIN SMALL LETTER D
+<U249F> "<U0028><U0064><U0029>";<U0064>
+% PARENTHESIZED LATIN SMALL LETTER E
+<U24A0> "<U0028><U0065><U0029>";<U0065>
+% PARENTHESIZED LATIN SMALL LETTER F
+<U24A1> "<U0028><U0066><U0029>";<U0066>
+% PARENTHESIZED LATIN SMALL LETTER G
+<U24A2> "<U0028><U0067><U0029>";<U0067>
+% PARENTHESIZED LATIN SMALL LETTER H
+<U24A3> "<U0028><U0068><U0029>";<U0068>
+% PARENTHESIZED LATIN SMALL LETTER I
+<U24A4> "<U0028><U0069><U0029>";<U0069>
+% PARENTHESIZED LATIN SMALL LETTER J
+<U24A5> "<U0028><U006A><U0029>";<U006A>
+% PARENTHESIZED LATIN SMALL LETTER K
+<U24A6> "<U0028><U006B><U0029>";<U006B>
+% PARENTHESIZED LATIN SMALL LETTER L
+<U24A7> "<U0028><U006C><U0029>";<U006C>
+% PARENTHESIZED LATIN SMALL LETTER M
+<U24A8> "<U0028><U006D><U0029>";<U006D>
+% PARENTHESIZED LATIN SMALL LETTER N
+<U24A9> "<U0028><U006E><U0029>";<U006E>
+% PARENTHESIZED LATIN SMALL LETTER O
+<U24AA> "<U0028><U006F><U0029>";<U006F>
+% PARENTHESIZED LATIN SMALL LETTER P
+<U24AB> "<U0028><U0070><U0029>";<U0070>
+% PARENTHESIZED LATIN SMALL LETTER Q
+<U24AC> "<U0028><U0071><U0029>";<U0071>
+% PARENTHESIZED LATIN SMALL LETTER R
+<U24AD> "<U0028><U0072><U0029>";<U0072>
+% PARENTHESIZED LATIN SMALL LETTER S
+<U24AE> "<U0028><U0073><U0029>";<U0073>
+% PARENTHESIZED LATIN SMALL LETTER T
+<U24AF> "<U0028><U0074><U0029>";<U0074>
+% PARENTHESIZED LATIN SMALL LETTER U
+<U24B0> "<U0028><U0075><U0029>";<U0075>
+% PARENTHESIZED LATIN SMALL LETTER V
+<U24B1> "<U0028><U0076><U0029>";<U0076>
+% PARENTHESIZED LATIN SMALL LETTER W
+<U24B2> "<U0028><U0077><U0029>";<U0077>
+% PARENTHESIZED LATIN SMALL LETTER X
+<U24B3> "<U0028><U0078><U0029>";<U0078>
+% PARENTHESIZED LATIN SMALL LETTER Y
+<U24B4> "<U0028><U0079><U0029>";<U0079>
+% PARENTHESIZED LATIN SMALL LETTER Z
+<U24B5> "<U0028><U007A><U0029>";<U007A>
+% CIRCLED LATIN CAPITAL LETTER A
+<U24B6> "<U0028><U0041><U0029>";<U0041>
+% CIRCLED LATIN CAPITAL LETTER B
+<U24B7> "<U0028><U0042><U0029>";<U0042>
+% CIRCLED LATIN CAPITAL LETTER C
+<U24B8> "<U0028><U0043><U0029>";<U0043>
+% CIRCLED LATIN CAPITAL LETTER D
+<U24B9> "<U0028><U0044><U0029>";<U0044>
+% CIRCLED LATIN CAPITAL LETTER E
+<U24BA> "<U0028><U0045><U0029>";<U0045>
+% CIRCLED LATIN CAPITAL LETTER F
+<U24BB> "<U0028><U0046><U0029>";<U0046>
+% CIRCLED LATIN CAPITAL LETTER G
+<U24BC> "<U0028><U0047><U0029>";<U0047>
+% CIRCLED LATIN CAPITAL LETTER H
+<U24BD> "<U0028><U0048><U0029>";<U0048>
+% CIRCLED LATIN CAPITAL LETTER I
+<U24BE> "<U0028><U0049><U0029>";<U0049>
+% CIRCLED LATIN CAPITAL LETTER J
+<U24BF> "<U0028><U004A><U0029>";<U004A>
+% CIRCLED LATIN CAPITAL LETTER K
+<U24C0> "<U0028><U004B><U0029>";<U004B>
+% CIRCLED LATIN CAPITAL LETTER L
+<U24C1> "<U0028><U004C><U0029>";<U004C>
+% CIRCLED LATIN CAPITAL LETTER M
+<U24C2> "<U0028><U004D><U0029>";<U004D>
+% CIRCLED LATIN CAPITAL LETTER N
+<U24C3> "<U0028><U004E><U0029>";<U004E>
+% CIRCLED LATIN CAPITAL LETTER O
+<U24C4> "<U0028><U004F><U0029>";<U004F>
+% CIRCLED LATIN CAPITAL LETTER P
+<U24C5> "<U0028><U0050><U0029>";<U0050>
+% CIRCLED LATIN CAPITAL LETTER Q
+<U24C6> "<U0028><U0051><U0029>";<U0051>
+% CIRCLED LATIN CAPITAL LETTER R
+<U24C7> "<U0028><U0052><U0029>";<U0052>
+% CIRCLED LATIN CAPITAL LETTER S
+<U24C8> "<U0028><U0053><U0029>";<U0053>
+% CIRCLED LATIN CAPITAL LETTER T
+<U24C9> "<U0028><U0054><U0029>";<U0054>
+% CIRCLED LATIN CAPITAL LETTER U
+<U24CA> "<U0028><U0055><U0029>";<U0055>
+% CIRCLED LATIN CAPITAL LETTER V
+<U24CB> "<U0028><U0056><U0029>";<U0056>
+% CIRCLED LATIN CAPITAL LETTER W
+<U24CC> "<U0028><U0057><U0029>";<U0057>
+% CIRCLED LATIN CAPITAL LETTER X
+<U24CD> "<U0028><U0058><U0029>";<U0058>
+% CIRCLED LATIN CAPITAL LETTER Y
+<U24CE> "<U0028><U0059><U0029>";<U0059>
+% CIRCLED LATIN CAPITAL LETTER Z
+<U24CF> "<U0028><U005A><U0029>";<U005A>
+% CIRCLED LATIN SMALL LETTER A
+<U24D0> "<U0028><U0061><U0029>";<U0061>
+% CIRCLED LATIN SMALL LETTER B
+<U24D1> "<U0028><U0062><U0029>";<U0062>
+% CIRCLED LATIN SMALL LETTER C
+<U24D2> "<U0028><U0063><U0029>";<U0063>
+% CIRCLED LATIN SMALL LETTER D
+<U24D3> "<U0028><U0064><U0029>";<U0064>
+% CIRCLED LATIN SMALL LETTER E
+<U24D4> "<U0028><U0065><U0029>";<U0065>
+% CIRCLED LATIN SMALL LETTER F
+<U24D5> "<U0028><U0066><U0029>";<U0066>
+% CIRCLED LATIN SMALL LETTER G
+<U24D6> "<U0028><U0067><U0029>";<U0067>
+% CIRCLED LATIN SMALL LETTER H
+<U24D7> "<U0028><U0068><U0029>";<U0068>
+% CIRCLED LATIN SMALL LETTER I
+<U24D8> "<U0028><U0069><U0029>";<U0069>
+% CIRCLED LATIN SMALL LETTER J
+<U24D9> "<U0028><U006A><U0029>";<U006A>
+% CIRCLED LATIN SMALL LETTER K
+<U24DA> "<U0028><U006B><U0029>";<U006B>
+% CIRCLED LATIN SMALL LETTER L
+<U24DB> "<U0028><U006C><U0029>";<U006C>
+% CIRCLED LATIN SMALL LETTER M
+<U24DC> "<U0028><U006D><U0029>";<U006D>
+% CIRCLED LATIN SMALL LETTER N
+<U24DD> "<U0028><U006E><U0029>";<U006E>
+% CIRCLED LATIN SMALL LETTER O
+<U24DE> "<U0028><U006F><U0029>";<U006F>
+% CIRCLED LATIN SMALL LETTER P
+<U24DF> "<U0028><U0070><U0029>";<U0070>
+% CIRCLED LATIN SMALL LETTER Q
+<U24E0> "<U0028><U0071><U0029>";<U0071>
+% CIRCLED LATIN SMALL LETTER R
+<U24E1> "<U0028><U0072><U0029>";<U0072>
+% CIRCLED LATIN SMALL LETTER S
+<U24E2> "<U0028><U0073><U0029>";<U0073>
+% CIRCLED LATIN SMALL LETTER T
+<U24E3> "<U0028><U0074><U0029>";<U0074>
+% CIRCLED LATIN SMALL LETTER U
+<U24E4> "<U0028><U0075><U0029>";<U0075>
+% CIRCLED LATIN SMALL LETTER V
+<U24E5> "<U0028><U0076><U0029>";<U0076>
+% CIRCLED LATIN SMALL LETTER W
+<U24E6> "<U0028><U0077><U0029>";<U0077>
+% CIRCLED LATIN SMALL LETTER X
+<U24E7> "<U0028><U0078><U0029>";<U0078>
+% CIRCLED LATIN SMALL LETTER Y
+<U24E8> "<U0028><U0079><U0029>";<U0079>
+% CIRCLED LATIN SMALL LETTER Z
+<U24E9> "<U0028><U007A><U0029>";<U007A>
+% CIRCLED DIGIT ZERO
+<U24EA> "<U0028><U0030><U0029>";<U0030>
+% BOX DRAWINGS LIGHT HORIZONTAL
+<U2500> <U002D>
+% BOX DRAWINGS HEAVY HORIZONTAL
+<U2501> <U003D>
+% BOX DRAWINGS LIGHT VERTICAL
+<U2502> <U007C>
+% BOX DRAWINGS HEAVY VERTICAL
+<U2503> <U007C>
+% BOX DRAWINGS LIGHT TRIPLE DASH HORIZONTAL
+<U2504> <U002D>
+% BOX DRAWINGS HEAVY TRIPLE DASH HORIZONTAL
+<U2505> <U003D>
+% BOX DRAWINGS LIGHT TRIPLE DASH VERTICAL
+<U2506> <U007C>
+% BOX DRAWINGS HEAVY TRIPLE DASH VERTICAL
+<U2507> <U007C>
+% BOX DRAWINGS LIGHT QUADRUPLE DASH HORIZONTAL
+<U2508> <U002D>
+% BOX DRAWINGS HEAVY QUADRUPLE DASH HORIZONTAL
+<U2509> <U003D>
+% BOX DRAWINGS LIGHT QUADRUPLE DASH VERTICAL
+<U250A> <U007C>
+% BOX DRAWINGS HEAVY QUADRUPLE DASH VERTICAL
+<U250B> <U007C>
+% BOX DRAWINGS LIGHT DOWN AND RIGHT
+<U250C> <U002B>
+% BOX DRAWINGS DOWN LIGHT AND RIGHT HEAVY
+<U250D> <U002B>
+% BOX DRAWINGS DOWN HEAVY AND RIGHT LIGHT
+<U250E> <U002B>
+% BOX DRAWINGS HEAVY DOWN AND RIGHT
+<U250F> <U002B>
+% BOX DRAWINGS LIGHT DOWN AND LEFT
+<U2510> <U002B>
+% BOX DRAWINGS DOWN LIGHT AND LEFT HEAVY
+<U2511> <U002B>
+% BOX DRAWINGS DOWN HEAVY AND LEFT LIGHT
+<U2512> <U002B>
+% BOX DRAWINGS HEAVY DOWN AND LEFT
+<U2513> <U002B>
+% BOX DRAWINGS LIGHT UP AND RIGHT
+<U2514> <U002B>
+% BOX DRAWINGS UP LIGHT AND RIGHT HEAVY
+<U2515> <U002B>
+% BOX DRAWINGS UP HEAVY AND RIGHT LIGHT
+<U2516> <U002B>
+% BOX DRAWINGS HEAVY UP AND RIGHT
+<U2517> <U002B>
+% BOX DRAWINGS LIGHT UP AND LEFT
+<U2518> <U002B>
+% BOX DRAWINGS UP LIGHT AND LEFT HEAVY
+<U2519> <U002B>
+% BOX DRAWINGS UP HEAVY AND LEFT LIGHT
+<U251A> <U002B>
+% BOX DRAWINGS HEAVY UP AND LEFT
+<U251B> <U002B>
+% BOX DRAWINGS LIGHT VERTICAL AND RIGHT
+<U251C> <U002B>
+% BOX DRAWINGS VERTICAL LIGHT AND RIGHT HEAVY
+<U251D> <U002B>
+% BOX DRAWINGS UP HEAVY AND RIGHT DOWN LIGHT
+<U251E> <U002B>
+% BOX DRAWINGS DOWN HEAVY AND RIGHT UP LIGHT
+<U251F> <U002B>
+% BOX DRAWINGS VERTICAL HEAVY AND RIGHT LIGHT
+<U2520> <U002B>
+% BOX DRAWINGS DOWN LIGHT AND RIGHT UP HEAVY
+<U2521> <U002B>
+% BOX DRAWINGS UP LIGHT AND RIGHT DOWN HEAVY
+<U2522> <U002B>
+% BOX DRAWINGS HEAVY VERTICAL AND RIGHT
+<U2523> <U002B>
+% BOX DRAWINGS LIGHT VERTICAL AND LEFT
+<U2524> <U002B>
+% BOX DRAWINGS VERTICAL LIGHT AND LEFT HEAVY
+<U2525> <U002B>
+% BOX DRAWINGS UP HEAVY AND LEFT DOWN LIGHT
+<U2526> <U002B>
+% BOX DRAWINGS DOWN HEAVY AND LEFT UP LIGHT
+<U2527> <U002B>
+% BOX DRAWINGS VERTICAL HEAVY AND LEFT LIGHT
+<U2528> <U002B>
+% BOX DRAWINGS DOWN LIGHT AND LEFT UP HEAVY
+<U2529> <U002B>
+% BOX DRAWINGS UP LIGHT AND LEFT DOWN HEAVY
+<U252A> <U002B>
+% BOX DRAWINGS HEAVY VERTICAL AND LEFT
+<U252B> <U002B>
+% BOX DRAWINGS LIGHT DOWN AND HORIZONTAL
+<U252C> <U002B>
+% BOX DRAWINGS LEFT HEAVY AND RIGHT DOWN LIGHT
+<U252D> <U002B>
+% BOX DRAWINGS RIGHT HEAVY AND LEFT DOWN LIGHT
+<U252E> <U002B>
+% BOX DRAWINGS DOWN LIGHT AND HORIZONTAL HEAVY
+<U252F> <U002B>
+% BOX DRAWINGS DOWN HEAVY AND HORIZONTAL LIGHT
+<U2530> <U002B>
+% BOX DRAWINGS RIGHT LIGHT AND LEFT DOWN HEAVY
+<U2531> <U002B>
+% BOX DRAWINGS LEFT LIGHT AND RIGHT DOWN HEAVY
+<U2532> <U002B>
+% BOX DRAWINGS HEAVY DOWN AND HORIZONTAL
+<U2533> <U002B>
+% BOX DRAWINGS LIGHT UP AND HORIZONTAL
+<U2534> <U002B>
+% BOX DRAWINGS LEFT HEAVY AND RIGHT UP LIGHT
+<U2535> <U002B>
+% BOX DRAWINGS RIGHT HEAVY AND LEFT UP LIGHT
+<U2536> <U002B>
+% BOX DRAWINGS UP LIGHT AND HORIZONTAL HEAVY
+<U2537> <U002B>
+% BOX DRAWINGS UP HEAVY AND HORIZONTAL LIGHT
+<U2538> <U002B>
+% BOX DRAWINGS RIGHT LIGHT AND LEFT UP HEAVY
+<U2539> <U002B>
+% BOX DRAWINGS LEFT LIGHT AND RIGHT UP HEAVY
+<U253A> <U002B>
+% BOX DRAWINGS HEAVY UP AND HORIZONTAL
+<U253B> <U002B>
+% BOX DRAWINGS LIGHT VERTICAL AND HORIZONTAL
+<U253C> <U002B>
+% BOX DRAWINGS LEFT HEAVY AND RIGHT VERTICAL LIGHT
+<U253D> <U002B>
+% BOX DRAWINGS RIGHT HEAVY AND LEFT VERTICAL LIGHT
+<U253E> <U002B>
+% BOX DRAWINGS VERTICAL LIGHT AND HORIZONTAL HEAVY
+<U253F> <U002B>
+% BOX DRAWINGS UP HEAVY AND DOWN HORIZONTAL LIGHT
+<U2540> <U002B>
+% BOX DRAWINGS DOWN HEAVY AND UP HORIZONTAL LIGHT
+<U2541> <U002B>
+% BOX DRAWINGS VERTICAL HEAVY AND HORIZONTAL LIGHT
+<U2542> <U002B>
+% BOX DRAWINGS LEFT UP HEAVY AND RIGHT DOWN LIGHT
+<U2543> <U002B>
+% BOX DRAWINGS RIGHT UP HEAVY AND LEFT DOWN LIGHT
+<U2544> <U002B>
+% BOX DRAWINGS LEFT DOWN HEAVY AND RIGHT UP LIGHT
+<U2545> <U002B>
+% BOX DRAWINGS RIGHT DOWN HEAVY AND LEFT UP LIGHT
+<U2546> <U002B>
+% BOX DRAWINGS DOWN LIGHT AND UP HORIZONTAL HEAVY
+<U2547> <U002B>
+% BOX DRAWINGS UP LIGHT AND DOWN HORIZONTAL HEAVY
+<U2548> <U002B>
+% BOX DRAWINGS RIGHT LIGHT AND LEFT VERTICAL HEAVY
+<U2549> <U002B>
+% BOX DRAWINGS LEFT LIGHT AND RIGHT VERTICAL HEAVY
+<U254A> <U002B>
+% BOX DRAWINGS HEAVY VERTICAL AND HORIZONTAL
+<U254B> <U002B>
+% BOX DRAWINGS LIGHT DOUBLE DASH HORIZONTAL
+<U254C> <U002D>
+% BOX DRAWINGS HEAVY DOUBLE DASH HORIZONTAL
+<U254D> <U003D>
+% BOX DRAWINGS LIGHT DOUBLE DASH VERTICAL
+<U254E> <U007C>
+% BOX DRAWINGS HEAVY DOUBLE DASH VERTICAL
+<U254F> <U007C>
+% BOX DRAWINGS DOUBLE HORIZONTAL
+<U2550> <U003D>
+% BOX DRAWINGS DOUBLE VERTICAL
+<U2551> <U007C>
+% BOX DRAWINGS DOWN SINGLE AND RIGHT DOUBLE
+<U2552> <U002B>
+% BOX DRAWINGS DOWN DOUBLE AND RIGHT SINGLE
+<U2553> <U002B>
+% BOX DRAWINGS DOUBLE DOWN AND RIGHT
+<U2554> <U002B>
+% BOX DRAWINGS DOWN SINGLE AND LEFT DOUBLE
+<U2555> <U002B>
+% BOX DRAWINGS DOWN DOUBLE AND LEFT SINGLE
+<U2556> <U002B>
+% BOX DRAWINGS DOUBLE DOWN AND LEFT
+<U2557> <U002B>
+% BOX DRAWINGS UP SINGLE AND RIGHT DOUBLE
+<U2558> <U002B>
+% BOX DRAWINGS UP DOUBLE AND RIGHT SINGLE
+<U2559> <U002B>
+% BOX DRAWINGS DOUBLE UP AND RIGHT
+<U255A> <U002B>
+% BOX DRAWINGS UP SINGLE AND LEFT DOUBLE
+<U255B> <U002B>
+% BOX DRAWINGS UP DOUBLE AND LEFT SINGLE
+<U255C> <U002B>
+% BOX DRAWINGS DOUBLE UP AND LEFT
+<U255D> <U002B>
+% BOX DRAWINGS VERTICAL SINGLE AND RIGHT DOUBLE
+<U255E> <U002B>
+% BOX DRAWINGS VERTICAL DOUBLE AND RIGHT SINGLE
+<U255F> <U002B>
+% BOX DRAWINGS DOUBLE VERTICAL AND RIGHT
+<U2560> <U002B>
+% BOX DRAWINGS VERTICAL SINGLE AND LEFT DOUBLE
+<U2561> <U002B>
+% BOX DRAWINGS VERTICAL DOUBLE AND LEFT SINGLE
+<U2562> <U002B>
+% BOX DRAWINGS DOUBLE VERTICAL AND LEFT
+<U2563> <U002B>
+% BOX DRAWINGS DOWN SINGLE AND HORIZONTAL DOUBLE
+<U2564> <U002B>
+% BOX DRAWINGS DOWN DOUBLE AND HORIZONTAL SINGLE
+<U2565> <U002B>
+% BOX DRAWINGS DOUBLE DOWN AND HORIZONTAL
+<U2566> <U002B>
+% BOX DRAWINGS UP SINGLE AND HORIZONTAL DOUBLE
+<U2567> <U002B>
+% BOX DRAWINGS UP DOUBLE AND HORIZONTAL SINGLE
+<U2568> <U002B>
+% BOX DRAWINGS DOUBLE UP AND HORIZONTAL
+<U2569> <U002B>
+% BOX DRAWINGS VERTICAL SINGLE AND HORIZONTAL DOUBLE
+<U256A> <U002B>
+% BOX DRAWINGS VERTICAL DOUBLE AND HORIZONTAL SINGLE
+<U256B> <U002B>
+% BOX DRAWINGS DOUBLE VERTICAL AND HORIZONTAL
+<U256C> <U002B>
+% BOX DRAWINGS LIGHT ARC DOWN AND RIGHT
+<U256D> <U002B>
+% BOX DRAWINGS LIGHT ARC DOWN AND LEFT
+<U256E> <U002B>
+% BOX DRAWINGS LIGHT ARC UP AND LEFT
+<U256F> <U002B>
+% BOX DRAWINGS LIGHT ARC UP AND RIGHT
+<U2570> <U002B>
+% BOX DRAWINGS LIGHT DIAGONAL UPPER RIGHT TO LOWER LEFT
+<U2571> <U002F>
+% BOX DRAWINGS LIGHT DIAGONAL UPPER LEFT TO LOWER RIGHT
+<U2572> <U005C>
+% BOX DRAWINGS LIGHT DIAGONAL CROSS
+<U2573> <U0058>
+% BOX DRAWINGS LIGHT LEFT AND HEAVY RIGHT
+<U257C> <U002D>
+% BOX DRAWINGS LIGHT UP AND HEAVY DOWN
+<U257D> <U007C>
+% BOX DRAWINGS HEAVY LEFT AND LIGHT RIGHT
+<U257E> <U002D>
+% BOX DRAWINGS HEAVY UP AND LIGHT DOWN
+<U257F> <U007C>
+% WHITE CIRCLE
+<U25CB> <U006F>
+% WHITE BULLET
+<U25E6> <U006F>
+% BLACK STAR
+<U2605> <U002A>
+% WHITE STAR
+<U2606> <U002A>
+% BALLOT BOX WITH X
+<U2612> <U0058>
+% SALTIRE
+<U2613> <U0058>
+% WHITE FROWNING FACE
+<U2639> "<U003A><U002D><U0028>"
+% WHITE SMILING FACE
+<U263A> "<U003A><U002D><U0029>"
+% BLACK SMILING FACE
+<U263B> "<U0028><U002D><U003A>"
+% MUSIC FLAT SIGN
+<U266D> <U0062>
+% MUSIC SHARP SIGN
+<U266F> <U0023>
+% UPPER BLADE SCISSORS
+<U2701> "<U0025><U003C>"
+% BLACK SCISSORS
+<U2702> "<U0025><U003C>"
+% LOWER BLADE SCISSORS
+<U2703> "<U0025><U003C>"
+% WHITE SCISSORS
+<U2704> "<U0025><U003C>"
+% VICTORY HAND
+<U270C> <U0056>
+% CHECK MARK
+<U2713> <U221A>
+% HEAVY CHECK MARK
+<U2714> <U221A>
+% MULTIPLICATION X
+<U2715> <U0078>
+% HEAVY MULTIPLICATION X
+<U2716> <U0078>
+% BALLOT X
+<U2717> <U0058>
+% HEAVY BALLOT X
+<U2718> <U0058>
+% OUTLINED GREEK CROSS
+<U2719> <U002B>
+% HEAVY GREEK CROSS
+<U271A> <U002B>
+% OPEN CENTRE CROSS
+<U271B> <U002B>
+% HEAVY OPEN CENTRE CROSS
+<U271C> <U002B>
+% LATIN CROSS
+<U271D> <U002B>
+% SHADOWED WHITE LATIN CROSS
+<U271E> <U002B>
+% OUTLINED LATIN CROSS
+<U271F> <U002B>
+% MALTESE CROSS
+<U2720> <U002B>
+% STAR OF DAVID
+<U2721> <U002A>
+% FOUR TEARDROP-SPOKED ASTERISK
+<U2722> <U002B>
+% FOUR BALLOON-SPOKED ASTERISK
+<U2723> <U002B>
+% HEAVY FOUR BALLOON-SPOKED ASTERISK
+<U2724> <U002B>
+% FOUR CLUB-SPOKED ASTERISK
+<U2725> <U002B>
+% BLACK FOUR POINTED STAR
+<U2726> <U002B>
+% WHITE FOUR POINTED STAR
+<U2727> <U002B>
+% STRESS OUTLINED WHITE STAR
+<U2729> <U002A>
+% CIRCLED WHITE STAR
+<U272A> <U002A>
+% OPEN CENTRE BLACK STAR
+<U272B> <U002A>
+% BLACK CENTRE WHITE STAR
+<U272C> <U002A>
+% OUTLINED BLACK STAR
+<U272D> <U002A>
+% HEAVY OUTLINED BLACK STAR
+<U272E> <U002A>
+% PINWHEEL STAR
+<U272F> <U002A>
+% SHADOWED WHITE STAR
+<U2730> <U002A>
+% HEAVY ASTERISK
+<U2731> <U002A>
+% OPEN CENTRE ASTERISK
+<U2732> <U002A>
+% EIGHT SPOKED ASTERISK
+<U2733> <U002A>
+% EIGHT POINTED BLACK STAR
+<U2734> <U002A>
+% EIGHT POINTED PINWHEEL STAR
+<U2735> <U002A>
+% SIX POINTED BLACK STAR
+<U2736> <U002A>
+% EIGHT POINTED RECTILINEAR BLACK STAR
+<U2737> <U002A>
+% HEAVY EIGHT POINTED RECTILINEAR BLACK STAR
+<U2738> <U002A>
+% TWELVE POINTED BLACK STAR
+<U2739> <U002A>
+% SIXTEEN POINTED ASTERISK
+<U273A> <U002A>
+% TEARDROP-SPOKED ASTERISK
+<U273B> <U002A>
+% OPEN CENTRE TEARDROP-SPOKED ASTERISK
+<U273C> <U002A>
+% HEAVY TEARDROP-SPOKED ASTERISK
+<U273D> <U002A>
+% SIX PETALLED BLACK AND WHITE FLORETTE
+<U273E> <U002A>
+% BLACK FLORETTE
+<U273F> <U002A>
+% WHITE FLORETTE
+<U2740> <U002A>
+% EIGHT PETALLED OUTLINED BLACK FLORETTE
+<U2741> <U002A>
+% CIRCLED OPEN CENTRE EIGHT POINTED STAR
+<U2742> <U002A>
+% HEAVY TEARDROP-SPOKED PINWHEEL ASTERISK
+<U2743> <U002A>
+% SNOWFLAKE
+<U2744> <U002A>
+% TIGHT TRIFOLIATE SNOWFLAKE
+<U2745> <U002A>
+% HEAVY CHEVRON SNOWFLAKE
+<U2746> <U002A>
+% SPARKLE
+<U2747> <U002A>
+% HEAVY SPARKLE
+<U2748> <U002A>
+% BALLOON-SPOKED ASTERISK
+<U2749> <U002A>
+% EIGHT TEARDROP-SPOKED PROPELLER ASTERISK
+<U274A> <U002A>
+% HEAVY EIGHT TEARDROP-SPOKED PROPELLER ASTERISK
+<U274B> <U002A>
+% LATIN SMALL LIGATURE FF
+<UFB00> "<U0066><U0066>"
+% LATIN SMALL LIGATURE FI
+<UFB01> "<U0066><U0069>"
+% LATIN SMALL LIGATURE FL
+<UFB02> "<U0066><U006C>"
+% LATIN SMALL LIGATURE FFI
+<UFB03> "<U0066><U0066><U0069>"
+% LATIN SMALL LIGATURE FFL
+<UFB04> "<U0066><U0066><U006C>"
+% LATIN SMALL LIGATURE LONG S T
+<UFB05> "<U017F><U0074>";"<U0073><U0074>"
+% LATIN SMALL LIGATURE ST
+<UFB06> "<U0073><U0074>"
+% ZERO WIDTH NO-BREAK SPACE
+<UFEFF> ""
+% REPLACEMENT CHARACTER
+<UFFFD> <U003F>
diff --git a/test/INDEX b/test/INDEX
index 1ab567a..9c3c013 100644
--- a/test/INDEX
+++ b/test/INDEX
@@ -3,5 +3,6 @@
# Test Description DataDir
iconv Iconv initialisation/finalisation
nullable Handling of nullable input sequences
+translit Handling of transliteration
# Regression tests
diff --git a/test/Makefile b/test/Makefile
index 89b8093..c5d298f 100644
--- a/test/Makefile
+++ b/test/Makefile
@@ -1,4 +1,4 @@
# Tests
-DIR_TEST_ITEMS := iconv:iconv.c nullable:nullable.c
+DIR_TEST_ITEMS := iconv:iconv.c nullable:nullable.c translit:translit.c
include $(NSBUILD)/Makefile.subdir
diff --git a/test/translit.c b/test/translit.c
new file mode 100644
index 0000000..8f17889
--- a/dev/null
+++ b/test/translit.c
@@ -0,0 +1,94 @@
+#include <errno.h>
+#include <stdio.h>
+#include <string.h>
+
+#include <iconv/iconv.h>
+#include <iconv-internal/iconv.h>
+
+#include "testutils.h"
+
+#ifdef __riscos__
+#define ALIASES_FILE "Files.Aliases"
+#else
+#define ALIASES_FILE "Files/Aliases"
+#endif
+
+typedef struct translit_testcase { /* one transliteration test case */
+ const char *to_charset; /* target charset name given to iconv_open() */
+ const char *source; /* input string; run_test() converts it from "utf-8" */
+ const char *expected; /* exact bytes the conversion must produce */
+} translit_testcase;
+
+static const translit_testcase tests[] = { /* cases walked by run_tests() */
+ { "iso-8859-1//TRANSLIT", "\xe2\x80\x93", "-" }, /* UTF-8 for U+2013 */
+ { NULL, NULL, NULL } /* sentinel: the walk stops at a NULL to_charset */
+};
+
+/* Run one case: UTF-8 source -> to_charset, compare against expected. */
+static void run_test(const translit_testcase *test)
+{
+	char result[128];
+	char *src = (char *) test->source; /* iconv() is handed &src */
+	char *dst = result;
+	size_t src_left = strlen(src);
+	size_t dst_left = sizeof(result);
+	size_t rc, written;
+	iconv_t cd = iconv_open(test->to_charset, "utf-8");
+
+	assert(cd != (iconv_t) -1);
+	rc = iconv(cd, &src, &src_left, &dst, &dst_left);
+	assert(rc == 0);
+	written = sizeof(result) - dst_left;
+	assert(written == strlen(test->expected));
+	assert(memcmp(result, test->expected, written) == 0);
+	iconv_close(cd);
+}
+
+/* Run every entry of tests[] up to the NULL to_charset sentinel. */
+static void run_tests(void)
+{
+	const translit_testcase *tc;
+
+	for (tc = tests; tc->to_charset != NULL; tc++)
+		run_test(tc);
+}
+
+/* Locate the Aliases file, initialise iconv from it and run the tests. */
+int main(int argc, char **argv)
+{
+	const char *ucpath, *sep = "";
+	char aliases[4096];
+	int n;
+
+	UNUSED(argc);
+	UNUSED(argv);
+
+#ifdef __riscos__
+	ucpath = "Unicode:";
+#else
+	ucpath = getenv("UNICODE_DIR");
+#endif
+	assert(ucpath != NULL);
+
+#ifndef __riscos__
+	/* Test for emptiness first: "" has no last character to inspect. */
+	if (ucpath[0] == '\0' || ucpath[strlen(ucpath) - 1] != '/')
+		sep = "/";
+#endif
+	/* snprintf() always NUL-terminates; assert on truncation instead of
+	 * handing iconv_initialise() a clipped path. */
+	n = snprintf(aliases, sizeof(aliases), "%s%s%s",
+			ucpath, sep, ALIASES_FILE);
+	assert(n >= 0 && (size_t) n < sizeof(aliases));
+
+	assert(iconv_initialise(aliases) == 1);
+
+	run_tests();
+
+	iconv_finalise();
+
+	printf("PASS\n");
+
+	return 0;
+}
+