From c34dc685f70193728ead525f59d56cdbf116f574 Mon Sep 17 00:00:00 2001 From: John Mark Bell Date: Sun, 4 Nov 2007 03:40:09 +0000 Subject: Make TestObject support both HTML and XML documents and auto-detect which parser to use. Make binding testcase attempt to load an HTML document. Hubbub parser binding constructor takes Aliases file path as a parameter. Hubbub parser binding's token handler now spews debug at stdout. svn path=/trunk/dom/; revision=3648 --- bindings/hubbub/parser.c | 78 +++++++++-- bindings/hubbub/parser.h | 6 +- test/Makefile | 12 +- test/binding.c | 4 + test/data/binding/Aliases | 302 ++++++++++++++++++++++++++++++++++++++++++ test/data/binding/sample.html | 11 ++ test/lib/testobject.c | 145 +++++++++++++++++--- 7 files changed, 520 insertions(+), 38 deletions(-) create mode 100644 test/data/binding/Aliases create mode 100644 test/data/binding/sample.html diff --git a/bindings/hubbub/parser.c b/bindings/hubbub/parser.c index 8931fdf..9473438 100644 --- a/bindings/hubbub/parser.c +++ b/bindings/hubbub/parser.c @@ -5,6 +5,8 @@ * Copyright 2007 John-Mark Bell */ +#include + #include #include @@ -18,6 +20,7 @@ */ struct dom_hubbub_parser { hubbub_parser *parser; /**< Hubbub parser instance */ + const uint8_t *buffer; /**< Parser buffer pointer */ struct dom_document *doc; /**< DOM Document we're building */ @@ -41,6 +44,7 @@ static bool __initialised; /** * Create a Hubbub parser instance * + * \param aliases Path to encoding alias mapping file * \param enc Source charset, or NULL * \param int_enc Desired charset of document buffer (UTF-8 or UTF-16) * \param alloc Memory (de)allocation function @@ -49,9 +53,9 @@ static bool __initialised; * \param mctx Pointer to client-specific private data * \return Pointer to instance, or NULL on memory exhaustion */ -dom_hubbub_parser *dom_hubbub_parser_create(const char *enc, - const char *int_enc, dom_alloc alloc, void *pw, - dom_msg msg, void *mctx) +dom_hubbub_parser *dom_hubbub_parser_create(const char *aliases, + const char *enc, const char *int_enc, + dom_alloc alloc, void *pw, dom_msg msg, void *mctx) { dom_hubbub_parser *parser; hubbub_parser_optparams params; @@ -60,8 +64,7 @@ dom_hubbub_parser *dom_hubbub_parser_create(const char *enc, hubbub_error e; if (__initialised == false) { - /** \todo Need path of encoding aliases file */ - e = hubbub_initialise("", (hubbub_alloc) alloc, pw); + e = hubbub_initialise(aliases, (hubbub_alloc) alloc, pw); if (e != HUBBUB_OK) { msg(DOM_MSG_ERROR, mctx, "Failed initialising hubbub"); @@ -202,14 +205,71 @@ struct dom_document *dom_hubbub_parser_get_document(dom_hubbub_parser *parser) void __dom_hubbub_buffer_handler(const uint8_t *buffer, size_t len, void *pw) { - UNUSED(buffer); + dom_hubbub_parser *parser = (dom_hubbub_parser *) pw; + UNUSED(len); - UNUSED(pw); + + parser->buffer = buffer; } void __dom_hubbub_token_handler(const hubbub_token *token, void *pw) { - UNUSED(token); - UNUSED(pw); + dom_hubbub_parser *parser = (dom_hubbub_parser *) pw; + static const char *token_names[] = { + "DOCTYPE", "START TAG", "END TAG", + "COMMENT", "CHARACTERS", "EOF" + }; + size_t i; + + printf("%s: ", token_names[token->type]); + + switch (token->type) { + case HUBBUB_TOKEN_DOCTYPE: + printf("'%.*s' (%svalid)\n", + (int) token->data.doctype.name.len, + parser->buffer + + token->data.doctype.name.data_off, + token->data.doctype.correct ? "" : "in"); + break; + case HUBBUB_TOKEN_START_TAG: + printf("'%.*s' %s\n", + (int) token->data.tag.name.len, + parser->buffer + token->data.tag.name.data_off, + (token->data.tag.n_attributes > 0) ? + "attributes:" : ""); + for (i = 0; i < token->data.tag.n_attributes; i++) { + printf("\t'%.*s' = '%.*s'\n", + (int) token->data.tag.attributes[i].name.len, + parser->buffer + token->data.tag.attributes[i].name.data_off, + (int) token->data.tag.attributes[i].value.len, + parser->buffer + token->data.tag.attributes[i].value.data_off); + } + break; + case HUBBUB_TOKEN_END_TAG: + printf("'%.*s' %s\n", + (int) token->data.tag.name.len, + parser->buffer + token->data.tag.name.data_off, + (token->data.tag.n_attributes > 0) ? + "attributes:" : ""); + for (i = 0; i < token->data.tag.n_attributes; i++) { + printf("\t'%.*s' = '%.*s'\n", + (int) token->data.tag.attributes[i].name.len, + parser->buffer + token->data.tag.attributes[i].name.data_off, + (int) token->data.tag.attributes[i].value.len, + parser->buffer + token->data.tag.attributes[i].value.data_off); + } + break; + case HUBBUB_TOKEN_COMMENT: + printf("'%.*s'\n", (int) token->data.comment.len, + parser->buffer + token->data.comment.data_off); + break; + case HUBBUB_TOKEN_CHARACTER: + printf("'%.*s'\n", (int) token->data.character.len, + parser->buffer + token->data.character.data_off); + break; + case HUBBUB_TOKEN_EOF: + printf("\n"); + break; + } } diff --git a/bindings/hubbub/parser.h b/bindings/hubbub/parser.h index 65da1ea..f4c2ac4 100644 --- a/bindings/hubbub/parser.h +++ b/bindings/hubbub/parser.h @@ -20,9 +20,9 @@ struct dom_document; typedef struct dom_hubbub_parser dom_hubbub_parser; /* Create a Hubbub parser instance */ -dom_hubbub_parser *dom_hubbub_parser_create(const char *enc, - const char *int_enc, dom_alloc alloc, void *pw, - dom_msg msg, void *mctx); +dom_hubbub_parser *dom_hubbub_parser_create(const char *aliases, + const char *enc, const char *int_enc, + dom_alloc alloc, void *pw, dom_msg msg, void *mctx); /* Destroy a Hubbub parser instance */ void dom_hubbub_parser_destroy(dom_hubbub_parser *parser); diff --git a/test/Makefile b/test/Makefile index 2a0c63a..f5f7233 100644 --- a/test/Makefile +++ b/test/Makefile @@ -19,11 +19,15 @@ # test Execute any test cases # Extend toolchain settings -CFLAGS += -I${TOP}/src/ -I${TOP}/bindings/xml/ -I$(CURDIR) -LDFLAGS += `${PKGCONFIG} ${PKGCONFIGFLAGS} --libs libxml-2.0` +CFLAGS += -I${TOP}/src/ -I${TOP}/bindings/xml/ -I${TOP}/bindings/hubbub/ \ + -I$(CURDIR) +# TODO: fix hubbub library usage -- needs hubbub to use pkgconfig +LDFLAGS += `${PKGCONFIG} ${PKGCONFIGFLAGS} --libs libxml-2.0` \ + -L${TOP}/../hubbub/ -lhubbub # Libraries we link against -LIBS = -L./lib -ldomtest-debug -ldom-libxml-debug -ldom-debug +LIBS = -L./lib -ldomtest-debug -ldom-libxml-debug \ + -ldom-libhubbub-debug -ldom-debug # Release output RELEASE = @@ -89,6 +93,6 @@ xml/c/%.c: xml/tests/%.xml %: %.c @${ECHO} ${ECHOFLAGS} "==> $<" @${CC} -c -g ${CFLAGS} -o $@.o $< - @${LD} -g -o $@ $@.o ${LDFLAGS} $(LIBS) + @${LD} -g -o $@ $@.o $(LIBS) ${LDFLAGS} @${RM} ${RMFLAGS} $@.o diff --git a/test/binding.c b/test/binding.c index ac76733..e7d824f 100644 --- a/test/binding.c +++ b/test/binding.c @@ -10,10 +10,14 @@ int main(int argc, char **argv) struct dom_string *elementName; dom_exception err; TestObject *staff; + TestObject *html; staff = test_object_create(argc, argv, "staff.xml", false); assert(staff != NULL); + html = test_object_create(argc, argv, "sample.html", false); + assert(html != NULL); + doc = test_object_get_doc(staff); assert(doc != NULL); diff --git a/test/data/binding/Aliases b/test/data/binding/Aliases new file mode 100644 index 0000000..db61ff1 --- /dev/null +++ b/test/data/binding/Aliases @@ -0,0 +1,302 @@ +# > Unicode:Files.Aliases +# Mapping of character set encoding names to their canonical form +# +# Lines starting with a '#' are comments, blank lines are ignored. +# +# Based on http://www.iana.org/assignments/character-sets and +# http://www.iana.org/assignments/ianacharset-mib +# +# Canonical Form MIBenum Aliases... +# +US-ASCII 3 iso-ir-6 ANSI_X3.4-1986 ISO_646.irv:1991 ASCII ISO646-US ANSI_X3.4-1968 us IBM367 cp367 csASCII +ISO-10646-UTF-1 27 csISO10646UTF1 +ISO_646.basic:1983 28 ref csISO646basic1983 +INVARIANT 29 csINVARIANT +ISO_646.irv:1983 30 iso-ir-2 irv csISO2IntlRefVersion +BS_4730 20 iso-ir-4 ISO646-GB gb uk csISO4UnitedKingdom +NATS-SEFI 31 iso-ir-8-1 csNATSSEFI +NATS-SEFI-ADD 32 iso-ir-8-2 csNATSSEFIADD +NATS-DANO 33 iso-ir-9-1 csNATSDANO +NATS-DANO-ADD 34 iso-ir-9-2 csNATSDANOADD +SEN_850200_B 35 iso-ir-10 FI ISO646-FI ISO646-SE se csISO10Swedish +SEN_850200_C 21 iso-ir-11 ISO646-SE2 se2 csISO11SwedishForNames +KS_C_5601-1987 36 iso-ir-149 KS_C_5601-1989 KSC_5601 korean csKSC56011987 +ISO-2022-KR 37 csISO2022KR +EUC-KR 38 csEUCKR EUCKR +ISO-2022-JP 39 csISO2022JP +ISO-2022-JP-2 40 csISO2022JP2 +ISO-2022-CN 104 +ISO-2022-CN-EXT 105 +JIS_C6220-1969-jp 41 JIS_C6220-1969 iso-ir-13 katakana x0201-7 csISO13JISC6220jp +JIS_C6220-1969-ro 42 iso-ir-14 jp ISO646-JP csISO14JISC6220ro +IT 22 iso-ir-15 ISO646-IT csISO15Italian +PT 43 iso-ir-16 ISO646-PT csISO16Portuguese +ES 23 iso-ir-17 ISO646-ES csISO17Spanish +greek7-old 44 iso-ir-18 csISO18Greek7Old +latin-greek 45 iso-ir-19 csISO19LatinGreek +DIN_66003 24 iso-ir-21 de ISO646-DE csISO21German +NF_Z_62-010_(1973) 46 iso-ir-25 ISO646-FR1 csISO25French +Latin-greek-1 47 iso-ir-27 csISO27LatinGreek1 +ISO_5427 48 iso-ir-37 csISO5427Cyrillic +JIS_C6226-1978 49 iso-ir-42 csISO42JISC62261978 +BS_viewdata 50 iso-ir-47 csISO47BSViewdata +INIS 51 iso-ir-49 csISO49INIS +INIS-8 52 iso-ir-50 csISO50INIS8 +INIS-cyrillic 53 iso-ir-51 csISO51INISCyrillic +ISO_5427:1981 54 iso-ir-54 ISO5427Cyrillic1981 +ISO_5428:1980 55 iso-ir-55 csISO5428Greek +GB_1988-80 56 iso-ir-57 cn ISO646-CN csISO57GB1988 +GB_2312-80 57 iso-ir-58 chinese csISO58GB231280 +NS_4551-1 25 iso-ir-60 ISO646-NO no csISO60DanishNorwegian csISO60Norwegian1 +NS_4551-2 58 ISO646-NO2 iso-ir-61 no2 csISO61Norwegian2 +NF_Z_62-010 26 iso-ir-69 ISO646-FR fr csISO69French +videotex-suppl 59 iso-ir-70 csISO70VideotexSupp1 +PT2 60 iso-ir-84 ISO646-PT2 csISO84Portuguese2 +ES2 61 iso-ir-85 ISO646-ES2 csISO85Spanish2 +MSZ_7795.3 62 iso-ir-86 ISO646-HU hu csISO86Hungarian +JIS_C6226-1983 63 iso-ir-87 x0208 JIS_X0208-1983 csISO87JISX0208 +greek7 64 iso-ir-88 csISO88Greek7 +ASMO_449 65 ISO_9036 arabic7 iso-ir-89 csISO89ASMO449 +iso-ir-90 66 csISO90 +JIS_C6229-1984-a 67 iso-ir-91 jp-ocr-a csISO91JISC62291984a +JIS_C6229-1984-b 68 iso-ir-92 ISO646-JP-OCR-B jp-ocr-b csISO92JISC62991984b +JIS_C6229-1984-b-add 69 iso-ir-93 jp-ocr-b-add csISO93JIS62291984badd +JIS_C6229-1984-hand 70 iso-ir-94 jp-ocr-hand csISO94JIS62291984hand +JIS_C6229-1984-hand-add 71 iso-ir-95 jp-ocr-hand-add csISO95JIS62291984handadd +JIS_C6229-1984-kana 72 iso-ir-96 csISO96JISC62291984kana +ISO_2033-1983 73 iso-ir-98 e13b csISO2033 +ANSI_X3.110-1983 74 iso-ir-99 CSA_T500-1983 NAPLPS csISO99NAPLPS +ISO-8859-1 4 iso-ir-100 ISO_8859-1 ISO_8859-1:1987 latin1 l1 IBM819 CP819 csISOLatin1 8859_1 ISO8859-1 +ISO-8859-2 5 iso-ir-101 ISO_8859-2 ISO_8859-2:1987 latin2 l2 csISOLatin2 8859_2 ISO8859-2 +T.61-7bit 75 iso-ir-102 csISO102T617bit +T.61-8bit 76 T.61 iso-ir-103 csISO103T618bit +ISO-8859-3 6 iso-ir-109 ISO_8859-3 ISO_8859-3:1988 latin3 l3 csISOLatin3 8859_3 ISO8859-3 +ISO-8859-4 7 iso-ir-110 ISO_8859-4 ISO_8859-4:1988 latin4 l4 csISOLatin4 8859_4 ISO8859-4 +ECMA-cyrillic 77 iso-ir-111 KOI8-E csISO111ECMACyrillic +CSA_Z243.4-1985-1 78 iso-ir-121 ISO646-CA csa7-1 ca csISO121Canadian1 +CSA_Z243.4-1985-2 79 iso-ir-122 ISO646-CA2 csa7-2 csISO122Canadian2 +CSA_Z243.4-1985-gr 80 iso-ir-123 csISO123CSAZ24341985gr +ISO-8859-6 9 iso-ir-127 ISO_8859-6 ISO_8859-6:1987 ECMA-114 ASMO-708 arabic csISOLatinArabic +ISO-8859-6-E 81 csISO88596E ISO_8859-6-E +ISO-8859-6-I 82 csISO88596I ISO_8859-6-I +ISO-8859-7 10 iso-ir-126 ISO_8859-7 ISO_8859-7:1987 ELOT_928 ECMA-118 greek greek8 csISOLatinGreek 8859_7 ISO8859-7 +T.101-G2 83 iso-ir-128 csISO128T101G2 +ISO-8859-8 11 iso-ir-138 ISO_8859-8 ISO_8859-8:1988 hebrew csISOLatinHebrew 8859_8 ISO8859-8 +ISO-8859-8-E 84 csISO88598E ISO_8859-8-E +ISO-8859-8-I 85 csISO88598I ISO_8859-8-I +CSN_369103 86 iso-ir-139 csISO139CSN369103 +JUS_I.B1.002 87 iso-ir-141 ISO646-YU js yu csISO141JUSIB1002 +ISO_6937-2-add 14 iso-ir-142 csISOTextComm +IEC_P27-1 88 iso-ir-143 csISO143IECP271 +ISO-8859-5 8 iso-ir-144 ISO_8859-5 ISO_8859-5:1988 cyrillic csISOLatinCyrillic 8859_5 ISO8859-5 +JUS_I.B1.003-serb 89 iso-ir-146 serbian csISO146Serbian +JUS_I.B1.003-mac 90 macedonian iso-ir-147 csISO147Macedonian +ISO-8859-9 12 iso-ir-148 ISO_8859-9 ISO_8859-9:1989 latin5 l5 csISOLatin5 8859_9 ISO8859-9 +greek-ccitt 91 iso-ir-150 csISO150 csISO150GreekCCITT +NC_NC00-10:81 92 cuba iso-ir-151 ISO646-CU csISO151Cuba +ISO_6937-2-25 93 iso-ir-152 csISO6937Add +GOST_19768-74 94 ST_SEV_358-88 iso-ir-153 csISO153GOST1976874 +ISO_8859-supp 95 iso-ir-154 latin1-2-5 csISO8859Supp +ISO_10367-box 96 iso-ir-155 csISO10367Box +ISO-8859-10 13 iso-ir-157 l6 ISO_8859-10:1992 csISOLatin6 latin6 8859_10 ISO8859-10 +latin-lap 97 lap iso-ir-158 csISO158Lap +JIS_X0212-1990 98 x0212 iso-ir-159 csISO159JISX02121990 +DS_2089 99 DS2089 ISO646-DK dk csISO646Danish +us-dk 100 csUSDK +dk-us 101 csDKUS +JIS_X0201 15 X0201 csHalfWidthKatakana +KSC5636 102 ISO646-KR csKSC5636 +ISO-10646-UCS-2 1000 csUnicode UCS-2 UCS2 +ISO-10646-UCS-4 1001 csUCS4 UCS-4 UCS4 +DEC-MCS 2008 dec csDECMCS +hp-roman8 2004 roman8 r8 csHPRoman8 +macintosh 2027 mac csMacintosh MACROMAN MAC-ROMAN X-MAC-ROMAN +IBM037 2028 cp037 ebcdic-cp-us ebcdic-cp-ca ebcdic-cp-wt ebcdic-cp-nl csIBM037 +IBM038 2029 EBCDIC-INT cp038 csIBM038 +IBM273 2030 CP273 csIBM273 +IBM274 2031 EBCDIC-BE CP274 csIBM274 +IBM275 2032 EBCDIC-BR cp275 csIBM275 +IBM277 2033 EBCDIC-CP-DK EBCDIC-CP-NO csIBM277 +IBM278 2034 CP278 ebcdic-cp-fi ebcdic-cp-se csIBM278 +IBM280 2035 CP280 ebcdic-cp-it csIBM280 +IBM281 2036 EBCDIC-JP-E cp281 csIBM281 +IBM284 2037 CP284 ebcdic-cp-es csIBM284 +IBM285 2038 CP285 ebcdic-cp-gb csIBM285 +IBM290 2039 cp290 EBCDIC-JP-kana csIBM290 +IBM297 2040 cp297 ebcdic-cp-fr csIBM297 +IBM420 2041 cp420 ebcdic-cp-ar1 csIBM420 +IBM423 2042 cp423 ebcdic-cp-gr csIBM423 +IBM424 2043 cp424 ebcdic-cp-he csIBM424 +IBM437 2011 cp437 437 csPC8CodePage437 +IBM500 2044 CP500 ebcdic-cp-be ebcdic-cp-ch csIBM500 +IBM775 2087 cp775 csPC775Baltic +IBM850 2009 cp850 850 csPC850Multilingual +IBM851 2045 cp851 851 csIBM851 +IBM852 2010 cp852 852 csPCp852 +IBM855 2046 cp855 855 csIBM855 +IBM857 2047 cp857 857 csIBM857 +IBM860 2048 cp860 860 csIBM860 +IBM861 2049 cp861 861 cp-is csIBM861 +IBM862 2013 cp862 862 csPC862LatinHebrew +IBM863 2050 cp863 863 csIBM863 +IBM864 2051 cp864 csIBM864 +IBM865 2052 cp865 865 csIBM865 +IBM866 2086 cp866 866 csIBM866 +IBM868 2053 CP868 cp-ar csIBM868 +IBM869 2054 cp869 869 cp-gr csIBM869 +IBM870 2055 CP870 ebcdic-cp-roece ebcdic-cp-yu csIBM870 +IBM871 2056 CP871 ebcdic-cp-is csIBM871 +IBM880 2057 cp880 EBCDIC-Cyrillic csIBM880 +IBM891 2058 cp891 csIBM891 +IBM903 2059 cp903 csIBM903 +IBM904 2060 cp904 904 csIBBM904 +IBM905 2061 CP905 ebcdic-cp-tr csIBM905 +IBM918 2062 CP918 ebcdic-cp-ar2 csIBM918 +IBM1026 2063 CP1026 csIBM1026 +EBCDIC-AT-DE 2064 csIBMEBCDICATDE +EBCDIC-AT-DE-A 2065 csEBCDICATDEA +EBCDIC-CA-FR 2066 csEBCDICCAFR +EBCDIC-DK-NO 2067 csEBCDICDKNO +EBCDIC-DK-NO-A 2068 csEBCDICDKNOA +EBCDIC-FI-SE 2069 csEBCDICFISE +EBCDIC-FI-SE-A 2070 csEBCDICFISEA +EBCDIC-FR 2071 csEBCDICFR +EBCDIC-IT 2072 csEBCDICIT +EBCDIC-PT 2073 csEBCDICPT +EBCDIC-ES 2074 csEBCDICES +EBCDIC-ES-A 2075 csEBCDICESA +EBCDIC-ES-S 2076 csEBCDICESS +EBCDIC-UK 2077 csEBCDICUK +EBCDIC-US 2078 csEBCDICUS +UNKNOWN-8BIT 2079 csUnknown8BiT +MNEMONIC 2080 csMnemonic +MNEM 2081 csMnem +VISCII 2082 csVISCII +VIQR 2083 csVIQR +KOI8-R 2084 csKOI8R +KOI8-U 2088 +IBM00858 2089 CCSID00858 CP00858 PC-Multilingual-850+euro +IBM00924 2090 CCSID00924 CP00924 ebcdic-Latin9--euro +IBM01140 2091 CCSID01140 CP01140 ebcdic-us-37+euro +IBM01141 2092 CCSID01141 CP01141 ebcdic-de-273+euro +IBM01142 2093 CCSID01142 CP01142 ebcdic-dk-277+euro ebcdic-no-277+euro +IBM01143 2094 CCSID01143 CP01143 ebcdic-fi-278+euro ebcdic-se-278+euro +IBM01144 2095 CCSID01144 CP01144 ebcdic-it-280+euro +IBM01145 2096 CCSID01145 CP01145 ebcdic-es-284+euro +IBM01146 2097 CCSID01146 CP01146 ebcdic-gb-285+euro +IBM01147 2098 CCSID01147 CP01147 ebcdic-fr-297+euro +IBM01148 2099 CCSID01148 CP01148 ebcdic-international-500+euro +IBM01149 2100 CCSID01149 CP01149 ebcdic-is-871+euro +Big5-HKSCS 2101 +IBM1047 2102 IBM-1047 +PTCP154 2103 csPTCP154 PT154 CP154 Cyrillic-Asian +Amiga-1251 2104 Ami1251 Amiga1251 Ami-1251 +KOI7-switched 2105 +UNICODE-1-1 1010 csUnicode11 +SCSU 1011 +UTF-7 1012 +UTF-16BE 1013 +UTF-16LE 1014 +UTF-16 1015 +CESU-8 1016 csCESU-8 +UTF-32 1017 +UTF-32BE 1018 +UTF-32LE 1019 +BOCU-1 1020 csBOCU-1 +UNICODE-1-1-UTF-7 103 csUnicode11UTF7 +UTF-8 106 UNICODE-1-1-UTF-8 UNICODE-2-0-UTF-8 utf8 +ISO-8859-13 109 8859_13 ISO8859-13 +ISO-8859-14 110 iso-ir-199 ISO_8859-14:1998 ISO_8859-14 latin8 iso-celtic l8 8859_14 ISO8859-14 +ISO-8859-15 111 ISO_8859-15 Latin-9 8859_15 ISO8859-15 +ISO-8859-16 112 iso-ir-226 ISO_8859-16:2001 ISO_8859-16 latin10 l10 +GBK 113 CP936 MS936 windows-936 +GB18030 114 +OSD_EBCDIC_DF04_15 115 +OSD_EBCDIC_DF03_IRV 116 +OSD_EBCDIC_DF04_1 117 +JIS_Encoding 16 csJISEncoding +Shift_JIS 17 MS_Kanji csShiftJIS X-SJIS Shift-JIS +EUC-JP 18 csEUCPkdFmtJapanese Extended_UNIX_Code_Packed_Format_for_Japanese EUCJP +Extended_UNIX_Code_Fixed_Width_for_Japanese 19 csEUCFixWidJapanese +ISO-10646-UCS-Basic 1002 csUnicodeASCII +ISO-10646-Unicode-Latin1 1003 csUnicodeLatin1 ISO-10646 +ISO-Unicode-IBM-1261 1005 csUnicodeIBM1261 +ISO-Unicode-IBM-1268 1006 csUnicodeIBM1268 +ISO-Unicode-IBM-1276 1007 csUnicodeIBM1276 +ISO-Unicode-IBM-1264 1008 csUnicodeIBM1264 +ISO-Unicode-IBM-1265 1009 csUnicodeIBM1265 +ISO-8859-1-Windows-3.0-Latin-1 2000 csWindows30Latin1 +ISO-8859-1-Windows-3.1-Latin-1 2001 csWindows31Latin1 +ISO-8859-2-Windows-Latin-2 2002 csWindows31Latin2 +ISO-8859-9-Windows-Latin-5 2003 csWindows31Latin5 +Adobe-Standard-Encoding 2005 csAdobeStandardEncoding +Ventura-US 2006 csVenturaUS +Ventura-International 2007 csVenturaInternational +PC8-Danish-Norwegian 2012 csPC8DanishNorwegian +PC8-Turkish 2014 csPC8Turkish +IBM-Symbols 2015 csIBMSymbols +IBM-Thai 2016 csIBMThai +HP-Legal 2017 csHPLegal +HP-Pi-font 2018 csHPPiFont +HP-Math8 2019 csHPMath8 +Adobe-Symbol-Encoding 2020 csHPPSMath +HP-DeskTop 2021 csHPDesktop +Ventura-Math 2022 csVenturaMath +Microsoft-Publishing 2023 csMicrosoftPublishing +Windows-31J 2024 csWindows31J +GB2312 2025 csGB2312 EUC-CN EUCCN CN-GB +Big5 2026 csBig5 BIG-FIVE BIG-5 CN-BIG5 BIG_FIVE +windows-1250 2250 CP1250 MS-EE +windows-1251 2251 CP1251 MS-CYRL +windows-1252 2252 CP1252 MS-ANSI +windows-1253 2253 CP1253 MS-GREEK +windows-1254 2254 CP1254 MS-TURK +windows-1255 2255 +windows-1256 2256 CP1256 MS-ARAB +windows-1257 2257 CP1257 WINBALTRIM +windows-1258 2258 +TIS-620 2259 +HZ-GB-2312 2085 + +# Additional encodings not defined by IANA + +# Arbitrary allocations +#CP737 3001 +#CP853 3002 +#CP856 3003 +CP874 3004 WINDOWS-874 +#CP922 3005 +#CP1046 3006 +#CP1124 3007 +#CP1125 3008 WINDOWS-1125 +#CP1129 3009 +#CP1133 3010 IBM-CP1133 +#CP1161 3011 IBM-1161 IBM1161 CSIBM1161 +#CP1162 3012 IBM-1162 IBM1162 CSIBM1162 +#CP1163 3013 IBM-1163 IBM1163 CSIBM1163 +#GEORGIAN-ACADEMY 3014 +#GEORGIAN-PS 3015 +#KOI8-RU 3016 +#KOI8-T 3017 +#MACARABIC 3018 X-MAC-ARABIC MAC-ARABIC +#MACCROATIAN 3019 X-MAC-CROATIAN MAC-CROATIAN +#MACGREEK 3020 X-MAC-GREEK MAC-GREEK +#MACHEBREW 3021 X-MAC-HEBREW MAC-HEBREW +#MACICELAND 3022 X-MAC-ICELAND MAC-ICELAND +#MACROMANIA 3023 X-MAC-ROMANIA MAC-ROMANIA +#MACTHAI 3024 X-MAC-THAI MAC-THAI +#MACTURKISH 3025 X-MAC-TURKISH MAC-TURKISH +#MULELAO-1 3026 + +# From Unicode Lib +ISO-IR-182 4000 +ISO-IR-197 4002 +ISO-2022-JP-1 4008 +MACCYRILLIC 4009 X-MAC-CYRILLIC MAC-CYRILLIC +MACUKRAINE 4010 X-MAC-UKRAINIAN MAC-UKRAINIAN +MACCENTRALEUROPE 4011 X-MAC-CENTRALEURROMAN MAC-CENTRALEURROMAN +JOHAB 4012 +ISO-8859-11 4014 iso-ir-166 ISO_8859-11 ISO8859-11 8859_11 +X-CURRENT 4999 X-SYSTEM +X-ACORN-LATIN1 5001 +X-ACORN-FUZZY 5002 diff --git a/test/data/binding/sample.html b/test/data/binding/sample.html new file mode 100644 index 0000000..a090978 --- /dev/null +++ b/test/data/binding/sample.html @@ -0,0 +1,11 @@ + + + + +This is a title + + +

Here is some text with mismatched bold and italic tags. Here's some more text.

+ + + diff --git a/test/lib/testobject.c b/test/lib/testobject.c index 5a20770..a0ab835 100644 --- a/test/lib/testobject.c +++ b/test/lib/testobject.c @@ -5,11 +5,15 @@ * Copyright 2007 John-Mark Bell */ +#include #include #include +#include #include +#include "bindings/hubbub/parser.h" + #include "bindings/xml/xmlbinding.h" #include "bindings/xml/xmlparser.h" @@ -17,10 +21,14 @@ #include "testobject.h" #include "utils.h" -static bool xml_parser_initialised; +static bool parser_initialised; struct TestObject { - dom_xml_parser *parser; + enum { OBJECT_XML, OBJECT_HTML } type; + union { + dom_xml_parser *xml; + dom_hubbub_parser *html; + } parser; struct dom_document *doc; }; @@ -33,6 +41,7 @@ TestObject *test_object_create(int argc, char **argv, #define CHUNK_SIZE 4096 uint8_t buf[CHUNK_SIZE]; FILE *fp; + char *dot; size_t len; TestObject *ret; @@ -43,15 +52,18 @@ TestObject *test_object_create(int argc, char **argv, exit(EXIT_FAILURE); } - if (xml_parser_initialised == false) { + if (parser_initialised == false) { assert(dom_initialise(myrealloc, NULL) == DOM_NO_ERR); assert(dom_xml_binding_initialise(myrealloc, NULL) == DOM_XML_OK); +// assert(dom_hubbub_binding_initialise(myrealloc, NULL) == +// DOM_HUBBUB_OK); + atexit(test_object_cleanup); - xml_parser_initialised = true; + parser_initialised = true; } snprintf(fnbuf, sizeof fnbuf, "%s/%s", argv[1], uri); @@ -60,16 +72,67 @@ TestObject *test_object_create(int argc, char **argv, if (ret == NULL) return NULL; - ret->parser = dom_xml_parser_create(NULL, "UTF-8", myrealloc, NULL, - mymsg, NULL); - if (ret->parser == NULL) { - free(ret); - return NULL; + /* Detect the parser type (this is mildly hacky) */ + dot = strrchr(uri, '.'); + len = strlen(uri); + + if (dot == NULL) { + printf("No file extension, assuming XML\n"); + + ret->type = OBJECT_XML; + } else if (len - ((dot + 1) - uri) == 3) { + if (tolower(dot[1]) == 'x' && tolower(dot[2]) == 'm' + && tolower(dot[3]) == 'l') { + ret->type = OBJECT_XML; + } else if (tolower(dot[1]) == 'h' && tolower(dot[2]) == 't' && + tolower(dot[3]) == 'm') { + ret->type = OBJECT_HTML; + } + } else if (len - ((dot + 1) - uri) == 4) { + if (tolower(dot[1]) == 'h' && tolower(dot[2]) == 't' && + tolower(dot[3]) == 'm' && + tolower(dot[4]) == 'l') { + ret->type = OBJECT_HTML; + } + } else { + /* Assume XML */ + ret->type = OBJECT_XML; + } + + switch (ret->type) { + case OBJECT_XML: + ret->parser.xml = dom_xml_parser_create(NULL, "UTF-8", + myrealloc, NULL, mymsg, NULL); + if (ret->parser.xml == NULL) { + free(ret); + return NULL; + } + break; + case OBJECT_HTML: + { + char abuf[1024]; + snprintf(abuf, sizeof abuf, "%s/Aliases", argv[1]); + + ret->parser.html = dom_hubbub_parser_create(abuf, + NULL, "UTF-8", myrealloc, NULL, mymsg, NULL); + if (ret->parser.html == NULL) { + free(ret); + return NULL; + } + break; + } } fp = fopen(fnbuf, "r"); if (fp == NULL) { - dom_xml_parser_destroy(ret->parser); + switch (ret->type) { + case OBJECT_XML: + dom_xml_parser_destroy(ret->parser.xml); + break; + case OBJECT_HTML: + dom_hubbub_parser_destroy(ret->parser.html); + break; + } free(ret); return NULL; } @@ -81,8 +144,16 @@ TestObject *test_object_create(int argc, char **argv, while (len > CHUNK_SIZE) { fread(buf, 1, CHUNK_SIZE, fp); - assert(dom_xml_parser_parse_chunk(ret->parser, buf, - CHUNK_SIZE) == DOM_XML_OK); + switch (ret->type) { + case OBJECT_XML: + assert(dom_xml_parser_parse_chunk(ret->parser.xml, + buf, CHUNK_SIZE) == DOM_XML_OK); + break; + case OBJECT_HTML: + assert(dom_hubbub_parser_parse_chunk(ret->parser.html, + buf, CHUNK_SIZE) == DOM_HUBBUB_OK); + break; + } len -= CHUNK_SIZE; } @@ -90,20 +161,51 @@ TestObject *test_object_create(int argc, char **argv, if (len > 0) { fread(buf, 1, len, fp); - assert(dom_xml_parser_parse_chunk(ret->parser, buf, - len) == DOM_XML_OK); + switch (ret->type) { + case OBJECT_XML: + assert(dom_xml_parser_parse_chunk(ret->parser.xml, + buf, len) == DOM_XML_OK); + break; + case OBJECT_HTML: + assert(dom_hubbub_parser_parse_chunk(ret->parser.html, + buf, len) == DOM_HUBBUB_OK); + break; + } len = 0; } - assert(dom_xml_parser_completed(ret->parser) == DOM_XML_OK); + switch (ret->type) { + case OBJECT_XML: + assert(dom_xml_parser_completed(ret->parser.xml) == DOM_XML_OK); + break; + case OBJECT_HTML: + assert(dom_hubbub_parser_completed(ret->parser.html) == + DOM_HUBBUB_OK); + break; + } fclose(fp); - ret->doc = dom_xml_parser_get_document(ret->parser); + switch (ret->type) { + case OBJECT_XML: + ret->doc = dom_xml_parser_get_document(ret->parser.xml); + break; + case OBJECT_HTML: + ret->doc = dom_hubbub_parser_get_document(ret->parser.html); + break; + } - dom_xml_parser_destroy(ret->parser); - ret->parser = NULL; + switch (ret->type) { + case OBJECT_XML: + dom_xml_parser_destroy(ret->parser.xml); + ret->parser.xml = NULL; + break; + case OBJECT_HTML: + dom_hubbub_parser_destroy(ret->parser.html); + ret->parser.html = NULL; + break; + } return ret; @@ -117,14 +219,13 @@ struct dom_document *test_object_get_doc(TestObject *obj) const char *test_object_get_mimetype(TestObject *obj) { - UNUSED(obj); - - return "text/xml"; + return (obj->type == OBJECT_XML ? "text/xml" : "text/html"); } void test_object_cleanup(void) { - if (xml_parser_initialised) { + if (parser_initialised) { +// dom_hubbub_binding_finalise(); dom_xml_binding_finalise(); dom_finalise(); } -- cgit v1.2.3