From 90bab1fefd7d631f63ea0e60b5ad496ff7c100d0 Mon Sep 17 00:00:00 2001 From: John Mark Bell Date: Thu, 11 Jun 2009 22:51:12 +0000 Subject: Windows sucks. svn path=/trunk/iconv/; revision=7770 --- doc/Uni-iconv | 204 ++++++++++++++++++++++++++++++++++++++++++++++++++++++++++ 1 file changed, 204 insertions(+) create mode 100644 doc/Uni-iconv (limited to 'doc/Uni-iconv') diff --git a/doc/Uni-iconv b/doc/Uni-iconv new file mode 100644 index 0000000..caea2d0 --- /dev/null +++ b/doc/Uni-iconv @@ -0,0 +1,204 @@ +Introduction: +============= + +This file documents an approximate correlation between the data files +provided in the !Unicode distribution and the encoding headers in GNU +libiconv 1.9.1. + +Those with '?' in the iconv column either are not represented in iconv +or I've missed the relevant header file ;) + +A number of encodings are present in the iconv distribution but not +in !Unicode. These are documented at the end of this file. + +Changelog: +========== + +v 0.01 (09-Sep-2004) +~~~~~~~~~~~~~~~~~~~~ +Initial Incarnation + +v 0.02 (11-Sep-2004) +~~~~~~~~~~~~~~~~~~~~ +Documented additional encodings supported by the Iconv module. +Corrected list of !Unicode deficiencies. + + +!Unicode->iconv: +================ + +Unicode: iconv: notes: + +Acorn.Latin1 riscos1.h + +Apple.CentEuro mac_centraleurope.h +Apple.Cyrillic mac_cyrillic.h +Apple.Roman mac_roman.h +Apple.Ukrainian mac_ukraine.h + +BigFive big5.h + +ISO2022.C0.40[ISO646] ? + +ISO2022.C1.43[IS6429] ? + +ISO2022.G94.40[646old] iso646_cn.h +ISO2022.G94.41[646-GB] ? +ISO2022.G94.42[646IRV] ? +ISO2022.G94.43[FinSwe] ? +ISO2022.G94.47[646-SE] ? +ISO2022.G94.48[646-SE] ? +ISO2022.G94.49[JS201K] jisx0201.h top of JIS range +ISO2022.G94.4A[JS201R] jisx0201.h iso646_jp.h bottom of JIS range +ISO2022.G94.4B[646-DE] ? +ISO2022.G94.4C[646-PT] ? +ISO2022.G94.54[GB1988] ? +ISO2022.G94.56[Teltxt] ? +ISO2022.G94.59[646-IT] ? +ISO2022.G94.5A[646-ES] ? +ISO2022.G94.60[646-NO] ? +ISO2022.G94.66[646-FR] ? +ISO2022.G94.69[646-HU] ? +ISO2022.G94.6B[Arabic] ? +ISO2022.G94.6C[IS6397] ? +ISO2022.G94.7A[SerbCr] ? + +ISO2022.G94x94.40[JS6226] ? +ISO2022.G94x94.41[GB2312] gb2312.h +ISO2022.G94x94.42[JIS208] jis0x208.h +ISO2022.G94x94.43[KS1001] ksc5601.h +ISO2022.G94x94.44[JIS212] jis0x212.h +ISO2022.G94x94.47[CNS1] cns11643_1.h the tables differ +ISO2022.G94x94.48[CNS2] cns11643_2.h +ISO2022.G94x94.49[CNS3] cns11643_3.h +ISO2022.G94x94.4A[CNS4] cns11643_4.h +ISO2022.G94x94.4B[CNS5] cns11643_5.h +ISO2022.G94x94.4C[CNS6] cns11643_6.h +ISO2022.G94x94.4D[CNS7] cns11643_7.h + +ISO2022.G96.41[Lat1] iso8859_1.h +ISO2022.G96.42[Lat2] iso8859_2.h +ISO2022.G96.43[Lat3] iso8859_3.h +ISO2022.G96.44[Lat4] iso8859_4.h +ISO2022.G96.46[Greek] ? +ISO2022.G96.47[Arabic] iso8859_6.h ISO-8859-6 ignored +ISO2022.G96.48[Hebrew] ? +ISO2022.G96.4C[Cyrill] ? +ISO2022.G96.4D[Lat5] iso8859_5.h +ISO2022.G96.50[LatSup] ? +ISO2022.G96.52[IS6397] ? +ISO2022.G96.54[Thai] tis620.h +ISO2022.G96.56[Lat6] iso8859_6.h +ISO2022.G96.58[L6Sami] ? +ISO2022.G96.59[Lat7] iso8859_7.h +ISO2022.G96.5C[Welsh] ? +ISO2022.G96.5D[Sami] ? +ISO2022.G96.5E[Hebrew] ? +ISO2022.G96.5F[Lat8] iso8859_8.h +ISO2022.G96.62[Lat9] iso8859_9.h + +KOI8-R koi8_r.h + +Microsoft.CP1250 cp1250.h +Microsoft.CP1251 cp1251.h +Microsoft.CP1252 cp1252.h +Microsoft.CP1254 cp1254.h +Microsoft.CP866 cp866.h +Microsoft.CP932 cp932.h cp932ext.h + +iconv->!Unicode: +================ + +Iconv has the following encodings, which are not present in !Unicode. +Providing a suitable data file for !Unicode is trivial. Whether UnicodeLib +will then act upon the addition of these is unknown. +This list is ordered as per libiconv's NOTES file. + +European & Semitic languages: + + ISO-8859-16 (iso8859_16.h) + KOI8-{U,RU,T} (koi8_xx.h) + CP125{3,5,6,7} (cp125n.h) + CP850 (cp850.h) + CP862 (cp862.h) + Mac{Croatian,Romania,Greek,Turkish,Hebrew,Arabic} (mac_foo.h) + +Japanese: + + None afaikt. + +Simplified Chinese: + + GB18030 (gb18030.h, gb18030ext.h) + HZ-GB-2312 (hz.h) + +Traditional Chinese: + + CP950 (cp950.h) + BIG5-HKSCS (big5hkscs.h) + +Korean: + + CP949 (cp949.h) + +Armenian: + + ARMSCII-8 (armscii_8.h) + +Georgian: + + Georgian-Academy, Georgian-PS (georgian_academy.h, georgian_ps.h) + +Thai: + + CP874 (cp874.h) + MacThai (mac_thai.h) + +Laotian: + + MuleLao-1, CP1133 (mulelao.h, cp1133.h) + +Vietnamese: + + VISCII, TCVN (viscii.h, tcvn.h) + CP1258 (cp1258.h) + +Unicode: + + BE/LE variants of normal encodings. I assume UnicodeLib handles + these, but can't be sure. + C99 / JAVA - well, yes. + + +Iconv Module: +============= + +The iconv module is effectively a thin veneer around UnicodeLib. However, +8bit encodings are implemented within the module rather than using the +support in UnicodeLib. The rationale for this is simply that, although +UnicodeLib will understand (and act upon - reportedly...) additions to +the ISO2022 Unicode resource, other encodings are ignored. As the vast +majority of outstanding encodings fall into this category, and the code +is fairly simple, it made sense to implement it within the module. + +With use of the iconv module, the list of outstanding encodings is +reduced to: + + CP1255 (requires state-based transcoding) + + GB18030 (not 8bit - reportedly a requirement of PRC) + HZ-GB-2312 (not 8bit - supported by IE4) + + CP950 (not 8bit - a (MS) variant of Big5) + BIG5-HKSCS (not 8bit - again, a Big5 variant) + + CP949 (not 8bit) + + ARMSCII-8 (easily implemented, if required) + + VISCII (easily implemented, if required) + CP1258, TCVN (requires state-based transcoding) + +Additionally, the rest of the CodePage encodings implemented in iconv +but not listed above (due to omissions from the iconv documentation) +are implemented by the iconv module. -- cgit v1.2.3