diff options
Diffstat (limited to 'aliases/makealiases.c')
-rw-r--r-- | aliases/makealiases.c | 243 |
1 files changed, 243 insertions, 0 deletions
/**
 * IANA charset data to Iconv Aliases file convertor
 *
 * Version history:
 *
 * 0.01 - Initial version
 * 0.02 - Added "utf8" alias seen in the wild
 */

#include <ctype.h>
#include <stdio.h>
#include <stdlib.h>
#include <string.h>

/*
 * Additional aliases which are not present in the IANA data.  When a
 * record's canonical name equals `canon`, the space-separated `aliases`
 * string is appended to that record's alias list.
 */
struct extra {
	const char *canon;
	const char *aliases;
} extras[] = {
	{ "ISO-8859-1", "8859_1 ISO8859-1" },
	{ "ISO-8859-2", "8859_2 ISO8859-2" },
	{ "ISO-8859-3", "8859_3 ISO8859-3" },
	{ "ISO-8859-4", "8859_4 ISO8859-4" },
	{ "ISO-8859-5", "8859_5 ISO8859-5" },
	{ "ISO-8859-7", "8859_7 ISO8859-7" },
	{ "ISO-8859-8", "8859_8 ISO8859-8" },
	{ "ISO-8859-9", "8859_9 ISO8859-9" },
	{ "ISO-8859-10", "8859_10 ISO8859-10" },
	{ "ISO-8859-13", "8859_13 ISO8859-13" },
	{ "ISO-8859-14", "8859_14 ISO8859-14" },
	{ "ISO-8859-15", "8859_15 ISO8859-15" },
	{ "Shift_JIS", "X-SJIS Shift-JIS" },
	{ "EUC-JP", "EUCJP" },
	{ "EUC-KR", "EUCKR" },
	{ "UTF-8", "UNICODE-1-1-UTF-8 UNICODE-2-0-UTF-8 utf8" },
	{ "ISO-10646-UCS-4", "UCS-4 UCS4" },
	{ "ISO-10646-UCS-2", "UCS-2 UCS2" },
	{ "GB2312", "EUC-CN EUCCN CN-GB" },
	{ "Big5", "BIG-FIVE BIG-5 CN-BIG5 BIG_FIVE" },
	{ "macintosh", "MACROMAN MAC-ROMAN X-MAC-ROMAN" },
	{ "windows-1250", "CP1250 MS-EE" },
	{ "windows-1251", "CP1251 MS-CYRL" },
	{ "windows-1252", "CP1252 MS-ANSI" },
	{ "windows-1253", "CP1253 MS-GREEK" },
	{ "windows-1254", "CP1254 MS-TURK" },
	{ "windows-1256", "CP1256 MS-ARAB" },
	{ "windows-1257", "CP1257 WINBALTRIM" },
};
#define EXTRAS_SIZE (sizeof(extras) / sizeof(extras[0]))

/*
 * Make aliases file from IANA charset data.
 * The canonical name of an encoding is that which follows the "Name:" tag
 * in the input file. There is an exception, however, for those encodings
 * which have an alias which is denoted as the "preferred MIME name". For
 * these encodings, the preferred MIME name is taken as the canonical form.
 */

#define TOP argv[1]
#define SETS argv[2]
#define BOTTOM argv[3]
#define ALIASES argv[4]

/* Output layout: the MIBenum column starts MIBENUM_TAB_STOPS tab stops in. */
enum { TAB_WIDTH = 8, MIBENUM_TAB_STOPS = 3 };

/* Return non-zero if `line` begins with `prefix`. */
static int has_prefix(const char *line, const char *prefix)
{
	return strncmp(line, prefix, strlen(prefix)) == 0;
}

/*
 * Read one line from `in` into `buf` and strip its trailing newline, if
 * any.  Returns 1 if a line was read, 0 on end of file or read error.
 *
 * Only an actual '\n' is removed, so a final line without a terminating
 * newline (or a line longer than the buffer) keeps its last character.
 */
static int read_line(char *buf, size_t size, FILE *in)
{
	size_t len;

	if (fgets(buf, (int) size, in) == NULL)
		return 0;

	len = strlen(buf);
	if (len > 0 && buf[len - 1] == '\n')
		buf[len - 1] = '\0';

	return 1;
}

/* Return a pointer to the first non-whitespace character of `s`. */
static const char *skip_space(const char *s)
{
	/* <ctype.h> functions require a value representable as unsigned char */
	while (isspace((unsigned char) *s))
		s++;

	return s;
}

/* Return the length of the leading run of non-whitespace characters. */
static size_t token_length(const char *s)
{
	size_t len = 0;

	while (s[len] != '\0' && !isspace((unsigned char) s[len]))
		len++;

	return len;
}

/*
 * Copy the first `len` bytes of `src` into `dst` as a NUL-terminated
 * string.  The copy is truncated to fit `dst_size` rather than overflowing.
 */
static void copy_token(char *dst, size_t dst_size, const char *src, size_t len)
{
	if (len >= dst_size)
		len = dst_size - 1;

	memcpy(dst, src, len);
	dst[len] = '\0';
}

/*
 * Append `len` bytes of `token` to the heap-allocated, space-separated
 * list `*aliases`, growing it as required.  Empty tokens are ignored.
 *
 * Returns 0 on success, -1 on allocation failure; on failure `*aliases`
 * is left untouched and remains owned by the caller.
 */
static int append_alias(char **aliases, const char *token, size_t len)
{
	size_t used, sep;
	char *grown;

	if (len == 0)
		return 0;

	used = strlen(*aliases);
	sep = used > 0 ? 1 : 0;

	grown = realloc(*aliases, used + sep + len + 1);
	if (!grown)
		return -1;

	if (sep)
		grown[used] = ' ';
	memcpy(grown + used + sep, token, len);
	grown[used + sep + len] = '\0';

	*aliases = grown;
	return 0;
}

/* Copy everything remaining on `in` to `out`. */
static void copy_stream(FILE *in, FILE *out)
{
	char buf[200];

	while (fgets(buf, sizeof buf, in))
		fputs(buf, out);
}

/*
 * Emit one line of the aliases file:
 *   <name> TAB(s) <mibenum> [TAB TAB <aliases> [<extra aliases>]]
 */
static void write_record(FILE *out, const char *name, int mibenum,
		const char *aliases)
{
	size_t column, i;

	fprintf(out, "%s\t", name);

	/* Rounded up to tab stop */
	column = (strlen(name) + TAB_WIDTH) & ~(size_t) (TAB_WIDTH - 1);
	while (column < (size_t) (MIBENUM_TAB_STOPS * TAB_WIDTH)) {
		fputc('\t', out);
		column += TAB_WIDTH;
	}

	fprintf(out, "%d", mibenum);

	if (aliases[0] != '\0')
		fprintf(out, "\t\t%s", aliases);

	for (i = 0; i != EXTRAS_SIZE; i++) {
		if (strcmp(name, extras[i].canon) == 0) {
			fprintf(out, "%s%s",
					aliases[0] == '\0' ? "\t\t" : " ",
					extras[i].aliases);
			break;
		}
	}

	fputc('\n', out);
}

/*
 * Parse the charset records on `in` and write one aliases line per
 * record to `out`.
 *
 * A record starts at a "Name:" line, must be followed by a "MIBenum:"
 * line before the next "Name:" (otherwise it is skipped), and may be
 * followed by any number of "Alias:" lines.
 *
 * Returns 0 on success, -1 on allocation failure.
 */
static int convert_sets(FILE *in, FILE *out)
{
	char buf[200], name[64];
	char *aliases;
	const char *s;
	size_t len;
	int mibenum;
	int have_line;

	/* have_line tracks whether buf holds a freshly read line, so that
	 * stale buffer contents are never reinterpreted after end of file. */
	have_line = read_line(buf, sizeof buf, in);

	while (have_line) {
		/* find start of record */
		while (have_line && !has_prefix(buf, "Name:"))
			have_line = read_line(buf, sizeof buf, in);
		if (!have_line)
			break;

		/* copy name to buffer */
		s = skip_space(buf + 5);
		copy_token(name, sizeof name, s, token_length(s));

		/* get mibenum */
		do {
			have_line = read_line(buf, sizeof buf, in);
		} while (have_line && !has_prefix(buf, "Name:") &&
				!has_prefix(buf, "MIBenum:"));
		if (!have_line)
			break;
		if (!has_prefix(buf, "MIBenum:"))
			/* no MIBenum: buf holds the next record's name */
			continue;

		mibenum = (int) strtol(skip_space(buf + 8), NULL, 10);

		/* start with an empty, heap-allocated alias list */
		aliases = calloc(1, 1);
		if (!aliases)
			return -1;

		/* parse aliases */
		while ((have_line = read_line(buf, sizeof buf, in)) != 0) {
			if (has_prefix(buf, "Name:"))
				break;
			if (!has_prefix(buf, "Alias:"))
				continue;

			s = skip_space(buf + 6);

			if (strncmp(s, "None", 4) == 0)
				/* ignore this */
				continue;

			len = token_length(s);

			if (strstr(s, "preferred MIME name") != NULL) {
				/* The old canonical name is demoted to an
				 * alias and the preferred MIME name takes
				 * its place. */
				if (append_alias(&aliases, name,
						strlen(name)) != 0) {
					free(aliases);
					return -1;
				}
				copy_token(name, sizeof name, s, len);
			} else {
				if (append_alias(&aliases, s, len) != 0) {
					free(aliases);
					return -1;
				}
			}
		}

		write_record(out, name, mibenum, aliases);

		free(aliases);
	}

	return 0;
}

/*
 * Usage: makealiases <top> <sets> <bottom> <aliases>
 *
 * Writes <aliases> as the concatenation of the <top> file, one line per
 * charset record found in <sets>, and the <bottom> file.
 *
 * Returns 0 on success, 1 on any failure (bad usage, a file that cannot
 * be opened, allocation failure, or an error closing the output).
 */
int main(int argc, const char **argv)
{
	FILE *in, *out;
	int status = 1;

	if (argc < 5) {
		fprintf(stderr,
			"Usage: makealiases <top> <sets> <bottom> <aliases>\n");
		return 1;
	}

	/* Open the first input before creating the output, so that a
	 * missing input does not leave an empty output file behind. */
	in = fopen(TOP, "r");
	if (!in)
		return 1;

	out = fopen(ALIASES, "w");
	if (!out) {
		fclose(in);
		return 1;
	}

	copy_stream(in, out);
	fclose(in);

	in = fopen(SETS, "r");
	if (!in)
		goto close_out;

	if (convert_sets(in, out) != 0) {
		fclose(in);
		goto close_out;
	}
	fclose(in);

	in = fopen(BOTTOM, "r");
	if (!in)
		goto close_out;

	copy_stream(in, out);
	fclose(in);

	status = 0;

close_out:
	/* fclose flushes; a failure here means the output is incomplete */
	if (fclose(out) != 0)
		status = 1;

	return status;
}