summaryrefslogtreecommitdiff
path: root/build/tools
diff options
context:
space:
mode:
Diffstat (limited to 'build/tools')
-rw-r--r--build/tools/gentranstab.pl344
1 files changed, 344 insertions, 0 deletions
diff --git a/build/tools/gentranstab.pl b/build/tools/gentranstab.pl
new file mode 100644
index 0000000..0e9205a
--- /dev/null
+++ b/build/tools/gentranstab.pl
@@ -0,0 +1,344 @@
+#!/usr/bin/perl
+
+use warnings;
+use strict;
+
+# Usage: gentranstab.pl <path to transtab>
+
+usage() if (@ARGV != 1);
+
+my $transtab = shift @ARGV;
+
+open TRANSTAB,"<$transtab" or die "Failed opening $transtab: $!\n";
+
+print <<EOF;
+/* This file is autogenerated. Manual changes will be lost */
+
+#include <assert.h>
+#include <inttypes.h>
+#include <stdbool.h>
+
+#include "internal.h"
+
+static int translit_write_character(struct encoding_context *e,
+ UCS4 c, char **buffer, size_t *buflen, bool use_transout)
+{
+ Encoding *out = use_transout ? e->transout : e->out;
+ int ret;
+
+ if (out != NULL) {
+ char *prev_buf = *buffer;
+ size_t prev_len = *buflen;
+
+ ret = encoding_write(out, c, buffer, (int *) buflen);
+
+ if (ret <= 0)
+ *buflen = prev_len - (*buffer - prev_buf);
+ } else {
+ ret = iconv_eightbit_write(e, c, buffer, (int *) buflen);
+ }
+
+ return ret;
+}
+
+static int translit_try_sequence(struct encoding_context *e,
+ const size_t seqlen, const UCS2 *replacement)
+{
+ char *tmpbuf, *ptmpbuf;
+ size_t orig_tmplen, tmplen, index;
+ int ret = 1;
+
+ /* First, determine if sequence can be written to target encoding */
+ /* Worst case: conversion to UTF-8 (needing 6 bytes per character) */
+ orig_tmplen = tmplen = (seqlen + 1) * 6;
+ ptmpbuf = tmpbuf = malloc(tmplen);
+ if (tmpbuf == NULL)
+ return 0;
+
+ /* Reset the transout codec */
+ if (e->transout != NULL) {
+ encoding_reset(e->transout);
+ encoding_set_flags(e->transout, e->outflags, e->outflags);
+ }
+
+ for (index = 0; index < seqlen; index++) {
+ UCS4 c = replacement[index];
+ do {
+ ret = translit_write_character(e, c, &ptmpbuf,
+ &tmplen, true);
+ if (ret == 0) {
+ char *tmp = realloc(tmpbuf, orig_tmplen * 2);
+ if (tmp == NULL)
+ break;
+
+ ptmpbuf = tmp + (ptmpbuf - tmpbuf);
+ tmpbuf = tmp;
+ tmplen += orig_tmplen;
+ orig_tmplen *= 2;
+ }
+ } while (ret == 0);
+
+ if (ret <= 0)
+ break;
+ }
+
+ free(tmpbuf);
+
+ if (ret <= 0) {
+ /* Consider lack of memory an inability to write the output */
+ return -1;
+ }
+
+ e->substitution = replacement;
+ e->substlen = seqlen;
+
+ /* Emit replacement for real */
+ return translit_flush_replacement(e);
+}
+
+int translit_flush_replacement(struct encoding_context *e)
+{
+ const UCS2 *substitution = e->substitution;
+ size_t substlen = e->substlen;
+ int ret = 1;
+
+ while (substlen > 0) {
+ UCS4 c = substitution[0];
+
+ ret = translit_write_character(e, c,
+ e->outbuf, e->outbytesleft, false);
+ assert(ret != -1);
+ if (ret <= 0)
+ break;
+
+ substitution++;
+ substlen--;
+ }
+
+ e->substitution = substitution;
+ e->substlen = substlen;
+
+ return ret;
+}
+
+EOF
+
+# Map from codepoint -> ttvals ref
+# ttvals is a list of chars ref
+my %transmap = ();
+# Length, in characters, of longest substitution string seen so far
+my $maxsubst = 0;
+# Total number of substitution strings encountered
+my $numsubsts = 0;
+# Map from substitution string -> start index in charbin
+my %substs = ();
+# Accumulated list of substitution character sequences
+my @charbin = ();
+
+# Read in transtab data
+while (my $line = <TRANSTAB>) {
+ # Skip comments and blank lines
+ next if ($line =~ /^%/);
+ next if ($line =~ /^\s*$/);
+
+ # Format: <codepoint> <data>
+ my ($codepoint, $data) = split(' ', $line);
+
+ # Strip '<U' from start, and '>' from end of input codepoint
+ $codepoint =~ s/^<U([^>]+)>/$1/;
+
+ # Data is a list of semi-colon-separated substitutions
+ my @substitutions = split(';', $data);
+
+ my @ttvals = ();
+
+ foreach my $sub (@substitutions) {
+ # Strip quotes around substitution sequence
+ $sub =~ s/"([^"]*)"/$1/;
+
+ $numsubsts++;
+
+ if ($sub eq "") {
+ # Special-case empty substitutions
+ my @empty = ();
+ push(@ttvals, \@empty);
+ next;
+ }
+
+ # Split characters in sequence
+ my @chars = split('<', $sub);
+ shift @chars;
+ my $num_chars = scalar(@chars);
+
+ # Strip leading 'U' and trailing '>'
+ map { $_ =~ s/U([^>]+)>/$1/; } @chars;
+
+ $maxsubst = $num_chars if ($num_chars > $maxsubst);
+
+ # Stringify chars to produce hash key
+ my $hkey = "@chars";
+
+ # Find/insert in bin, if new substitution
+ if (!defined($substs{$hkey})) {
+ my $pos = find_in_bin(\@chars, $num_chars);
+
+ $substs{$hkey} = $pos;
+ }
+
+ # Append to list of substitutions for codepoint
+ push(@ttvals, \@chars);
+ }
+
+ # Insert into transmap
+ $transmap{$codepoint} = \@ttvals;
+}
+
+close TRANSTAB;
+
+# Ensure transtab is representable
+die "Charbin length exceeds 2^13!" if $#charbin >= 2**13;
+die "Maxsubst exceeds 8!" if $maxsubst >= 2**3;
+
+print <<EOF;
+struct translit_entry {
+ uint32_t codepoint : 16,
+ offset : 13,
+ length : 3;
+};
+
+EOF
+
+# Emit substitution data
+my $cblen = @charbin;
+print "static const UCS2 substdata[$cblen] = {\n";
+foreach my $c (@charbin) {
+ print "\t0x$c,\n";
+}
+print "};\n\n";
+
+# Emit transliteration LUT
+my $ttlen = $numsubsts + 1; # + 1 for sentinel
+print "static const struct translit_entry transtab[$ttlen] = {\n";
+foreach my $codepoint (sort(keys %transmap)) {
+ my $ttvals = $transmap{$codepoint};
+
+ for my $subst (@$ttvals) {
+ my $hkey = "@$subst";
+
+ if ($hkey ne "") {
+ my $slen = @$subst;
+ print "\t{ 0x$codepoint, $substs{$hkey}, $slen },\n";
+ } else {
+ print "\t{ 0x$codepoint, 0, 0 },\n";
+ }
+ }
+}
+# Place sentinel at the end
+print "\t{ 0, 0, 0 }\n";
+print "};\n\n";
+
+print <<EOF;
+static int translit_tab_cmp(const void *a, const void *b)
+{
+ const struct translit_entry *aa = (const struct translit_entry *) a;
+ const struct translit_entry *bb = (const struct translit_entry *) b;
+
+ return (int) aa->codepoint - (int) bb->codepoint;
+}
+
+int translit_substitute(struct encoding_context *e, UCS4 c)
+{
+ static const UCS2 default_subst[1] = { '?' };
+ int ret = 1;
+
+ if (c <= 0xFFFF) {
+ struct translit_entry key = { c, 0, 0 };
+ const struct translit_entry *res;
+
+ res = bsearch(&key, transtab, $numsubsts,
+ sizeof(struct translit_entry),
+ translit_tab_cmp);
+ if (res != NULL) {
+ /* Reverse until we find the first entry for c */
+ while (res > transtab) {
+ if (res[-1].codepoint != c)
+ break;
+ res--;
+ }
+
+ /* Try substitutions in turn, until we run out */
+ while (res->codepoint == c) {
+ ret = translit_try_sequence(e, res->length,
+ substdata + res->offset);
+ if (ret >= 0)
+ return ret;
+
+ res++;
+ }
+ }
+ }
+
+ /* Last-ditch replacement: must succeed */
+ return translit_try_sequence(e, 1, default_subst);
+}
+EOF
+
+# Search bin for existing sequence, or append if not found.
+#
+# The intent here is to minimise duplication of substitution
+# sequences. This implementation is decidedly trivial, and
+# makes no attempt to discover the optimal insertion order.
+#
+# Inspection of the output indicates that we use approximately
+# 5.5 bytes of storage for each substitution sequence
+# encountered (4 of these are the translit_entry, so there
+# doesn't seem much point in trying to optimise the layout of
+# the charbin any further.)
+sub find_in_bin
+{
+ my $pchars = shift;
+ my $pcharslen = shift;
+ my $binlen = scalar(@charbin);
+ my $offset = 0;
+
+ # Search bin for pchars
+ while ($offset <= $binlen - $pcharslen) {
+ my @slice = @charbin[$offset .. $offset + $pcharslen - 1];
+
+ last if aeq(\@slice, $pchars);
+
+ $offset++;
+ }
+
+ if ($offset <= $binlen - $pcharslen) {
+ # Found in bin
+ return $offset;
+ } else {
+ # Not found, so append
+ push(@charbin, @$pchars);
+ return $binlen;
+ }
+}
+
+# Compare two arrays for equality
+sub aeq
+{
+ my ($aref, $bref) = @_;
+ return 0 unless @$aref == @$bref;
+
+ my $idx = 0;
+ for my $item (@$aref) {
+ return 0 unless $item eq $bref->[$idx++];
+ }
+
+ return 1;
+}
+
+sub usage
+{
+ print STDERR <<EOF;
+Usage: gentranstab.pl <path to transtab>
+EOF
+
+ exit 1;
+}