#!/usr/bin/perl use warnings; use strict; # Usage: gentranstab.pl usage() if (@ARGV != 1); my $transtab = shift @ARGV; open TRANSTAB,"<$transtab" or die "Failed opening $transtab: $!\n"; print < #include #include #include "internal.h" static int translit_write_character(struct encoding_context *e, UCS4 c, char **buffer, size_t *buflen, bool use_transout) { Encoding *out = use_transout ? e->transout : e->out; int ret; if (out != NULL) { char *prev_buf = *buffer; size_t prev_len = *buflen; ret = encoding_write(out, c, buffer, (int *) buflen); if (ret <= 0) *buflen = prev_len - (*buffer - prev_buf); } else { ret = iconv_eightbit_write(e, c, buffer, (int *) buflen); } return ret; } static int translit_try_sequence(struct encoding_context *e, const size_t seqlen, const UCS2 *replacement) { char *tmpbuf, *ptmpbuf; size_t orig_tmplen, tmplen, index; int ret = 1; /* First, determine if sequence can be written to target encoding */ /* Worst case: conversion to UTF-8 (needing 6 bytes per character) */ orig_tmplen = tmplen = (seqlen + 1) * 6; ptmpbuf = tmpbuf = malloc(tmplen); if (tmpbuf == NULL) return 0; /* Reset the transout codec */ if (e->transout != NULL) { encoding_reset(e->transout); encoding_set_flags(e->transout, e->outflags, e->outflags); } for (index = 0; index < seqlen; index++) { UCS4 c = replacement[index]; do { ret = translit_write_character(e, c, &ptmpbuf, &tmplen, true); if (ret == 0) { char *tmp = realloc(tmpbuf, orig_tmplen * 2); if (tmp == NULL) break; ptmpbuf = tmp + (ptmpbuf - tmpbuf); tmpbuf = tmp; tmplen += orig_tmplen; orig_tmplen *= 2; } } while (ret == 0); if (ret <= 0) break; } free(tmpbuf); if (ret <= 0) { /* Consider lack of memory an inability to write the output */ return -1; } e->substitution = replacement; e->substlen = seqlen; /* Emit replacement for real */ return translit_flush_replacement(e); } int translit_flush_replacement(struct encoding_context *e) { const UCS2 *substitution = e->substitution; size_t substlen = e->substlen; int ret = 1; while (substlen > 0) { UCS4 c = substitution[0]; ret = translit_write_character(e, c, e->outbuf, e->outbytesleft, false); assert(ret != -1); if (ret <= 0) break; substitution++; substlen--; } e->substitution = substitution; e->substlen = substlen; return ret; } EOF # Map from codepoint -> ttvals ref # ttvals is a list of chars ref my %transmap = (); # Length, in characters, of longest substitution string seen so far my $maxsubst = 0; # Total number of substitution strings encountered my $numsubsts = 0; # Map from substitution string -> start index in charbin my %substs = (); # Accumulated list of substitution character sequences my @charbin = (); # Read in transtab data while (my $line = ) { # Skip comments and blank lines next if ($line =~ /^%/); next if ($line =~ /^\s*$/); # Format: my ($codepoint, $data) = split(' ', $line); # Strip '' from end of input codepoint $codepoint =~ s/^]+)>/$1/; # Data is a list of semi-colon-separated substitutions my @substitutions = split(';', $data); my @ttvals = (); foreach my $sub (@substitutions) { # Strip quotes around substitution sequence $sub =~ s/"([^"]*)"/$1/; $numsubsts++; if ($sub eq "") { # Special-case empty substitutions my @empty = (); push(@ttvals, \@empty); next; } # Split characters in sequence my @chars = split('<', $sub); shift @chars; my $num_chars = scalar(@chars); # Strip leading 'U' and trailing '>' map { $_ =~ s/U([^>]+)>/$1/; } @chars; $maxsubst = $num_chars if ($num_chars > $maxsubst); # Stringify chars to produce hash key my $hkey = "@chars"; # Find/insert in bin, if new substitution if (!defined($substs{$hkey})) { my $pos = find_in_bin(\@chars, $num_chars); $substs{$hkey} = $pos; } # Append to list of substitutions for codepoint push(@ttvals, \@chars); } # Insert into transmap $transmap{$codepoint} = \@ttvals; } close TRANSTAB; # Ensure transtab is representable die "Charbin length exceeds 2^13!" if $#charbin >= 2**13; die "Maxsubst exceeds 8!" if $maxsubst >= 2**3; print <codepoint - (int) bb->codepoint; } int translit_substitute(struct encoding_context *e, UCS4 c) { static const UCS2 default_subst[1] = { '?' }; int ret = 1; if (c <= 0xFFFF) { struct translit_entry key = { c, 0, 0 }; const struct translit_entry *res; res = bsearch(&key, transtab, $numsubsts, sizeof(struct translit_entry), translit_tab_cmp); if (res != NULL) { /* Reverse until we find the first entry for c */ while (res > transtab) { if (res[-1].codepoint != c) break; res--; } /* Try substitutions in turn, until we run out */ while (res->codepoint == c) { ret = translit_try_sequence(e, res->length, substdata + res->offset); if (ret >= 0) return ret; res++; } } } /* Last-ditch replacement: must succeed */ return translit_try_sequence(e, 1, default_subst); } EOF # Search bin for existing sequence, or append if not found. # # The intent here is to minimise duplication of substitution # sequences. This implementation is decidedly trivial, and # makes no attempt to discover the optimal insertion order. # # Inspection of the output indicates that we use approximately # 5.5 bytes of storage for each substitution sequence # encountered (4 of these are the translit_entry, so there # doesn't seem much point in trying to optimise the layout of # the charbin any further.) sub find_in_bin { my $pchars = shift; my $pcharslen = shift; my $binlen = scalar(@charbin); my $offset = 0; # Search bin for pchars while ($offset <= $binlen - $pcharslen) { my @slice = @charbin[$offset .. $offset + $pcharslen - 1]; last if aeq(\@slice, $pchars); $offset++; } if ($offset <= $binlen - $pcharslen) { # Found in bin return $offset; } else { # Not found, so append push(@charbin, @$pchars); return $binlen; } } # Compare two arrays for equality sub aeq { my ($aref, $bref) = @_; return 0 unless @$aref == @$bref; my $idx = 0; for my $item (@$aref) { return 0 unless $item eq $bref->[$idx++]; } return 1; } sub usage { print STDERR < EOF exit 1; }