From 7b30a5520cfb56e651f0eb4da85a3e07747da7dc Mon Sep 17 00:00:00 2001 From: John Mark Bell Date: Sat, 23 Jun 2007 22:40:25 +0000 Subject: Import hubbub -- an HTML parsing library. Plenty of work still to do (like tree generation ;) svn path=/trunk/hubbub/; revision=3359 --- src/Makefile | 79 ++ src/charset/Makefile | 53 ++ src/charset/aliases.c | 361 +++++++ src/charset/aliases.h | 42 + src/charset/codec.c | 186 ++++ src/charset/codec.h | 153 +++ src/charset/codec_iconv.c | 837 +++++++++++++++++ src/charset/codec_impl.h | 51 + src/charset/codec_utf8.c | 620 ++++++++++++ src/charset/detect.c | 673 +++++++++++++ src/charset/detect.h | 22 + src/hubbub.c | 63 ++ src/input/Makefile | 53 ++ src/input/filter.c | 380 ++++++++ src/input/filter.h | 57 ++ src/input/inputstream.c | 479 ++++++++++ src/input/inputstream.h | 98 ++ src/input/streamimpl.h | 77 ++ src/input/utf8_stream.c | 567 +++++++++++ src/parser.c | 237 +++++ src/tokeniser/Makefile | 53 ++ src/tokeniser/entities.c | 363 +++++++ src/tokeniser/entities.h | 25 + src/tokeniser/tokeniser.c | 2282 +++++++++++++++++++++++++++++++++++++++++++++ src/tokeniser/tokeniser.h | 71 ++ src/utils/Makefile | 53 ++ src/utils/dict.c | 219 +++++ src/utils/dict.h | 31 + src/utils/errors.c | 70 ++ src/utils/utf8.c | 368 ++++++++ src/utils/utf8.h | 38 + src/utils/utils.h | 28 + 32 files changed, 8689 insertions(+) create mode 100644 src/Makefile create mode 100644 src/charset/Makefile create mode 100644 src/charset/aliases.c create mode 100644 src/charset/aliases.h create mode 100644 src/charset/codec.c create mode 100644 src/charset/codec.h create mode 100644 src/charset/codec_iconv.c create mode 100644 src/charset/codec_impl.h create mode 100644 src/charset/codec_utf8.c create mode 100644 src/charset/detect.c create mode 100644 src/charset/detect.h create mode 100644 src/hubbub.c create mode 100644 src/input/Makefile create mode 100644 src/input/filter.c create mode 100644 src/input/filter.h create mode 100644 
src/input/inputstream.c create mode 100644 src/input/inputstream.h create mode 100644 src/input/streamimpl.h create mode 100644 src/input/utf8_stream.c create mode 100644 src/parser.c create mode 100644 src/tokeniser/Makefile create mode 100644 src/tokeniser/entities.c create mode 100644 src/tokeniser/entities.h create mode 100644 src/tokeniser/tokeniser.c create mode 100644 src/tokeniser/tokeniser.h create mode 100644 src/utils/Makefile create mode 100644 src/utils/dict.c create mode 100644 src/utils/dict.h create mode 100644 src/utils/errors.c create mode 100644 src/utils/utf8.c create mode 100644 src/utils/utf8.h create mode 100644 src/utils/utils.h (limited to 'src') diff --git a/src/Makefile b/src/Makefile new file mode 100644 index 0000000..b72a9e0 --- /dev/null +++ b/src/Makefile @@ -0,0 +1,79 @@ +# Makefile for libhubbub +# +# Toolchain is exported by top-level makefile +# +# Top-level makefile also exports the following variables: +# +# COMPONENT Name of component +# EXPORT Absolute path of export directory +# TOP Absolute path of source tree root +# +# The top-level makefile requires the following targets to exist: +# +# clean Clean source tree +# debug Create a debug binary +# distclean Fully clean source tree, back to pristine condition +# export Export distributable components to ${EXPORT} +# release Create a release binary +# setup Perform any setup required prior to compilation +# test Execute any test cases + +# Manipulate include paths +CFLAGS += -I$(CURDIR) + +# Release output +RELEASE = ${TOP}/${COMPONENT}.a + +# Debug output +DEBUG = ${TOP}/${COMPONENT}-debug.a + +# Objects +OBJS = hubbub parser + +.PHONY: clean debug distclean export release setup test + +# Targets +release: $(addprefix Release/, $(addsuffix .o, $(OBJS))) + @${MAKE} -C charset release + @${MAKE} -C input release + @${MAKE} -C tokeniser release + @${MAKE} -C utils release + @${AR} ${ARFLAGS} $(RELEASE) Release/* + +debug: $(addprefix Debug/, $(addsuffix .o, $(OBJS))) + @${MAKE} 
-C charset debug + @${MAKE} -C input debug + @${MAKE} -C tokeniser debug + @${MAKE} -C utils debug + @${AR} ${ARFLAGS} $(DEBUG) Debug/* + +clean: + @${MAKE} -C charset clean + @${MAKE} -C input clean + @${MAKE} -C tokeniser clean + @${MAKE} -C utils clean + -@${RM} ${RMFLAGS} $(addprefix Release/, $(addsuffix .o, ${OBJS})) + -@${RM} ${RMFLAGS} $(addprefix Debug/, $(addsuffix .o, ${OBJS})) + -@${RM} ${RMFLAGS} $(RELEASE) $(DEBUG) + +distclean: + -@${RM} ${RMFLAGS} -r Release + -@${RM} ${RMFLAGS} -r Debug + +setup: + @${MKDIR} ${MKDIRFLAGS} Release + @${MKDIR} ${MKDIRFLAGS} Debug + +export: + @${CP} ${CPFLAGS} $(RELEASE) ${EXPORT}/lib/ + +test: + +# Pattern rules +Release/%.o: %.c + @${ECHO} ${ECHOFLAGS} "==> $<" + @${CC} -c ${CFLAGS} -DNDEBUG -o $@ $< + +Debug/%.o: %.c + @${ECHO} ${ECHOFLAGS} "==> $<" + @${CC} -c -g ${CFLAGS} -o $@ $< diff --git a/src/charset/Makefile b/src/charset/Makefile new file mode 100644 index 0000000..62817b3 --- /dev/null +++ b/src/charset/Makefile @@ -0,0 +1,53 @@ +# Makefile for libhubbub +# +# Toolchain is exported by top-level makefile +# +# Top-level makefile also exports the following variables: +# +# COMPONENT Name of component +# EXPORT Absolute path of export directory +# TOP Absolute path of source tree root +# +# The top-level makefile requires the following targets to exist: +# +# clean Clean source tree +# debug Create a debug binary +# distclean Fully clean source tree, back to pristine condition +# export Export distributable components to ${EXPORT} +# release Create a release binary +# setup Perform any setup required prior to compilation +# test Execute any test cases + +# Manipulate include paths +CFLAGS += -I$(CURDIR) + +# Objects +OBJS = aliases codec codec_iconv codec_utf8 detect + +.PHONY: clean debug distclean export release setup test + +# Targets +release: $(addprefix ../Release/, $(addsuffix .o, $(OBJS))) + +debug: $(addprefix ../Debug/, $(addsuffix .o, $(OBJS))) + +clean: + -@${RM} ${RMFLAGS} $(addprefix 
../Release/, $(addsuffix .o, ${OBJS})) + -@${RM} ${RMFLAGS} $(addprefix ../Debug/, $(addsuffix .o, ${OBJS})) + +distclean: + +setup: + +export: + +test: + +# Pattern rules +../Release/%.o: %.c + @${ECHO} ${ECHOFLAGS} "==> $<" + @${CC} -c ${CFLAGS} -DNDEBUG -o $@ $< + +../Debug/%.o: %.c + @${ECHO} ${ECHOFLAGS} "==> $<" + @${CC} -c -g ${CFLAGS} -o $@ $< diff --git a/src/charset/aliases.c b/src/charset/aliases.c new file mode 100644 index 0000000..dcf6de2 --- /dev/null +++ b/src/charset/aliases.c @@ -0,0 +1,361 @@ +/* + * This file is part of Hubbub. + * Licensed under the MIT License, + * http://www.opensource.org/licenses/mit-license.php + * Copyright 2007 John-Mark Bell + */ + +#include +#include +#include +#include +#include +#include + +#include "charset/aliases.h" + +struct alias { + struct alias *next; + hubbub_aliases_canon *canon; + uint16_t name_len; + char name[1]; +}; + +#define HASH_SIZE (43) +static hubbub_aliases_canon *canon_tab[HASH_SIZE]; +static struct alias *alias_tab[HASH_SIZE]; + +static hubbub_error hubbub_create_alias(const char *alias, + hubbub_aliases_canon *c, hubbub_alloc alloc, void *pw); +static hubbub_aliases_canon *hubbub_create_canon(const char *canon, + uint16_t mibenum, hubbub_alloc alloc, void *pw); +static uint32_t hubbub_hash_val(const char *alias, size_t len); + +/** + * Create alias data from Aliases file + * + * \param filename The path to the Aliases file + * \param alloc Memory (de)allocation function + * \param pw Pointer to client-specific private data (may be NULL) + * \return HUBBUB_OK on success, appropriate error otherwise. 
+ */ +hubbub_error hubbub_aliases_create(const char *filename, + hubbub_alloc alloc, void *pw) +{ + char buf[300]; + FILE *fp; + + if (filename == NULL || alloc == NULL) + return HUBBUB_BADPARM; + + fp = fopen(filename, "r"); + if (fp == NULL) + return HUBBUB_FILENOTFOUND; + + while (fgets(buf, sizeof buf, fp)) { + char *p, *aliases = 0, *mib, *end; + hubbub_aliases_canon *cf; + + if (buf[0] == 0 || buf[0] == '#') + /* skip blank lines or comments */ + continue; + + buf[strlen(buf) - 1] = 0; /* lose terminating newline */ + end = buf + strlen(buf); + + /* find end of canonical form */ + for (p = buf; *p && !isspace(*p) && !iscntrl(*p); p++) + ; /* do nothing */ + if (p >= end) + continue; + *p++ = '\0'; /* terminate canonical form */ + + /* skip whitespace */ + for (; *p && isspace(*p); p++) + ; /* do nothing */ + if (p >= end) + continue; + mib = p; + + /* find end of mibenum */ + for (; *p && !isspace(*p) && !iscntrl(*p); p++) + ; /* do nothing */ + if (p < end) + *p++ = '\0'; /* terminate mibenum */ + + cf = hubbub_create_canon(buf, atoi(mib), alloc, pw); + if (cf == NULL) + continue; + + /* skip whitespace */ + for (; p < end && *p && isspace(*p); p++) + ; /* do nothing */ + if (p >= end) + continue; + aliases = p; + + while (p < end) { + /* find end of alias */ + for (; *p && !isspace(*p) && !iscntrl(*p); p++) + ; /* do nothing */ + if (p > end) + /* stop if we've gone past the end */ + break; + /* terminate current alias */ + *p++ = '\0'; + + if (hubbub_create_alias(aliases, cf, + alloc, pw) != HUBBUB_OK) + break; + + /* in terminating, we may have advanced + * past the end - check this here */ + if (p >= end) + break; + + /* skip whitespace */ + for (; *p && isspace(*p); p++) + ; /* do nothing */ + + if (p >= end) + /* gone past end => stop */ + break; + + /* update pointer to current alias */ + aliases = p; + } + } + + fclose(fp); + + return HUBBUB_OK; +} + +/** + * Free all alias data + * + * \param alloc Memory (de)allocation function + * \param pw 
Pointer to client-specific private data + */ +void hubbub_aliases_destroy(hubbub_alloc alloc, void *pw) +{ + hubbub_aliases_canon *c, *d; + struct alias *a, *b; + int i; + + for (i = 0; i != HASH_SIZE; i++) { + for (c = canon_tab[i]; c; c = d) { + d = c->next; + alloc(c, 0, pw); + } + canon_tab[i] = NULL; + + for (a = alias_tab[i]; a; a = b) { + b = a->next; + alloc(a, 0, pw); + } + alias_tab[i] = NULL; + } +} + +/** + * Retrieve the MIB enum value assigned to an encoding name + * + * \param alias The alias to lookup + * \param len The length of the alias string + * \return The MIB enum value, or 0 if not found + */ +uint16_t hubbub_mibenum_from_name(const char *alias, size_t len) +{ + hubbub_aliases_canon *c; + + if (alias == NULL) + return 0; + + c = hubbub_alias_canonicalise(alias, len); + if (c == NULL) + return 0; + + return c->mib_enum; +} + +/** + * Retrieve the canonical name of an encoding from the MIB enum + * + * \param mibenum The MIB enum value + * \return Pointer to canonical name, or NULL if not found + */ +const char *hubbub_mibenum_to_name(uint16_t mibenum) +{ + int i; + hubbub_aliases_canon *c; + + for (i = 0; i != HASH_SIZE; i++) + for (c = canon_tab[i]; c; c = c->next) + if (c->mib_enum == mibenum) + return c->name; + + return NULL; +} + + +/** + * Retrieve the canonical form of an alias name + * + * \param alias The alias name + * \param len The length of the alias name + * \return Pointer to canonical form or NULL if not found + */ +hubbub_aliases_canon *hubbub_alias_canonicalise(const char *alias, + size_t len) +{ + uint32_t hash; + hubbub_aliases_canon *c; + struct alias *a; + + if (alias == NULL) + return NULL; + + hash = hubbub_hash_val(alias, len); + + for (c = canon_tab[hash]; c; c = c->next) + if (c->name_len == len && + strncasecmp(c->name, alias, len) == 0) + break; + if (c) + return c; + + for (a = alias_tab[hash]; a; a = a->next) + if (a->name_len == len && + strncasecmp(a->name, alias, len) == 0) + break; + if (a) + return 
a->canon; + + return NULL; +} + + +/** + * Create an alias + * + * \param alias The alias name + * \param c The canonical form + * \param alloc Memory (de)allocation function + * \param pw Pointer to client-specific private data (may be NULL) + * \return HUBBUB_OK on success, appropriate error otherwise + */ +hubbub_error hubbub_create_alias(const char *alias, hubbub_aliases_canon *c, + hubbub_alloc alloc, void *pw) +{ + struct alias *a; + uint32_t hash; + + if (alias == NULL || c == NULL || alloc == NULL) + return HUBBUB_BADPARM; + + a = alloc(NULL, sizeof(struct alias) + strlen(alias) + 1, pw); + if (a == NULL) + return HUBBUB_NOMEM; + + a->canon = c; + a->name_len = strlen(alias); + strcpy(a->name, alias); + a->name[a->name_len] = '\0'; + + hash = hubbub_hash_val(alias, a->name_len); + + a->next = alias_tab[hash]; + alias_tab[hash] = a; + + return HUBBUB_OK; +} + +/** + * Create a canonical form + * + * \param canon The canonical name + * \param mibenum The MIB enum value + * \param alloc Memory (de)allocation function + * \param pw Pointer to client-specific private data (may be NULL) + * \return Pointer to canonical form or NULL on error + */ +hubbub_aliases_canon *hubbub_create_canon(const char *canon, + uint16_t mibenum, hubbub_alloc alloc, void *pw) +{ + hubbub_aliases_canon *c; + uint32_t hash, len; + + if (canon == NULL || alloc == NULL) + return NULL; + + len = strlen(canon); + + c = alloc(NULL, sizeof(hubbub_aliases_canon) + len + 1, pw); + if (c == NULL) + return NULL; + + c->mib_enum = mibenum; + c->name_len = len; + strcpy(c->name, canon); + c->name[len] = '\0'; + + hash = hubbub_hash_val(canon, len); + + c->next = canon_tab[hash]; + canon_tab[hash] = c; + + return c; +} + +/** + * Hash function + * + * \param alias String to hash + * \return The hashed value + */ +uint32_t hubbub_hash_val(const char *alias, size_t len) +{ + const char *s = alias; + uint32_t h = 5381; + + if (alias == NULL) + return 0; + + while (len--) + h = (h * 33) ^ (*s++ & 
~0x20); /* case insensitive */ + + return h % HASH_SIZE; +} + + +#ifndef NDEBUG +/** + * Dump all alias data to stdout + */ +void hubbub_aliases_dump(void) +{ + hubbub_aliases_canon *c; + struct alias *a; + int i; + size_t size = 0; + + for (i = 0; i != HASH_SIZE; i++) { + for (c = canon_tab[i]; c; c = c->next) { + printf("%d %s\n", i, c->name); + size += offsetof(hubbub_aliases_canon, name) + + c->name_len; + } + + for (a = alias_tab[i]; a; a = a->next) { + printf("%d %s\n", i, a->name); + size += offsetof(struct alias, name) + a->name_len; + } + } + + size += (sizeof(canon_tab) / sizeof(canon_tab[0])); + size += (sizeof(alias_tab) / sizeof(alias_tab[0])); + + printf("%u\n", (unsigned int) size); +} +#endif diff --git a/src/charset/aliases.h b/src/charset/aliases.h new file mode 100644 index 0000000..e0505d0 --- /dev/null +++ b/src/charset/aliases.h @@ -0,0 +1,42 @@ +/* + * This file is part of Hubbub. + * Licensed under the MIT License, + * http://www.opensource.org/licenses/mit-license.php + * Copyright 2007 John-Mark Bell + */ + +#ifndef hubbub_charset_aliases_h_ +#define hubbub_charset_aliases_h_ + +#include + +#include +#include + +typedef struct hubbub_aliases_canon { + struct hubbub_aliases_canon *next; + uint16_t mib_enum; + uint16_t name_len; + char name[1]; +} hubbub_aliases_canon; + +/* Load encoding aliases from file */ +hubbub_error hubbub_aliases_create(const char *filename, + hubbub_alloc alloc, void *pw); +/* Destroy encoding aliases */ +void hubbub_aliases_destroy(hubbub_alloc alloc, void *pw); + +/* Convert an encoding alias to a MIB enum value */ +uint16_t hubbub_mibenum_from_name(const char *alias, size_t len); +/* Convert a MIB enum value into an encoding alias */ +const char *hubbub_mibenum_to_name(uint16_t mibenum); + +/* Canonicalise an alias name */ +hubbub_aliases_canon *hubbub_alias_canonicalise(const char *alias, + size_t len); + +#ifndef NDEBUG +void hubbub_aliases_dump(void); +#endif + +#endif diff --git a/src/charset/codec.c 
b/src/charset/codec.c new file mode 100644 index 0000000..12a1bdc --- /dev/null +++ b/src/charset/codec.c @@ -0,0 +1,186 @@ +/* + * This file is part of Hubbub. + * Licensed under the MIT License, + * http://www.opensource.org/licenses/mit-license.php + * Copyright 2007 John-Mark Bell + */ + +#include + +#include "charset/aliases.h" + +#include "codec_impl.h" + +extern hubbub_charsethandler hubbub_iconv_codec_handler; +extern hubbub_charsethandler hubbub_utf8_codec_handler; + +static hubbub_charsethandler *handler_table[] = { + &hubbub_utf8_codec_handler, + &hubbub_iconv_codec_handler, + NULL, +}; + +/** + * Create a charset codec + * + * \param charset Target charset + * \param alloc Memory (de)allocation function + * \param pw Pointer to client-specific private data (may be NULL) + * \return Pointer to codec instance, or NULL on failure + */ +hubbub_charsetcodec *hubbub_charsetcodec_create(const char *charset, + hubbub_alloc alloc, void *pw) +{ + hubbub_charsetcodec *codec; + hubbub_charsethandler **handler; + const hubbub_aliases_canon * canon; + + if (charset == NULL || alloc == NULL) + return NULL; + + /* Canonicalise charset name. 
*/ + canon = hubbub_alias_canonicalise(charset, strlen(charset)); + if (canon == NULL) + return NULL; + + /* Search for handler class */ + for (handler = handler_table; *handler != NULL; handler++) { + if ((*handler)->handles_charset(canon->name)) + break; + } + + /* None found */ + if ((*handler) == NULL) + return NULL; + + /* Instantiate class */ + codec = (*handler)->create(canon->name, alloc, pw); + if (codec == NULL) + return NULL; + + /* and initialise it */ + codec->mibenum = canon->mib_enum; + + codec->filter = NULL; + codec->filter_pw = NULL; + + codec->errormode = HUBBUB_CHARSETCODEC_ERROR_LOOSE; + + codec->alloc = alloc; + codec->alloc_pw = pw; + + return codec; +} + +/** + * Destroy a charset codec + * + * \param codec The codec to destroy + */ +void hubbub_charsetcodec_destroy(hubbub_charsetcodec *codec) +{ + if (codec == NULL) + return; + + codec->handler.destroy(codec); + + codec->alloc(codec, 0, codec->alloc_pw); +} + +/** + * Configure a charset codec + * + * \param codec The codec to configure + * \param type The codec option type to configure + * \param params Option-specific parameters + * \return HUBBUB_OK on success, appropriate error otherwise + */ +hubbub_error hubbub_charsetcodec_setopt(hubbub_charsetcodec *codec, + hubbub_charsetcodec_opttype type, + hubbub_charsetcodec_optparams *params) +{ + if (codec == NULL || params == NULL) + return HUBBUB_BADPARM; + + switch (type) { + case HUBBUB_CHARSETCODEC_FILTER_FUNC: + codec->filter = params->filter_func.filter; + codec->filter_pw = params->filter_func.pw; + break; + + case HUBBUB_CHARSETCODEC_ERROR_MODE: + codec->errormode = params->error_mode.mode; + break; + } + + return HUBBUB_OK; +} + +/** + * Encode a chunk of UCS4 data into a codec's charset + * + * \param codec The codec to use + * \param source Pointer to pointer to source data + * \param sourcelen Pointer to length (in bytes) of source data + * \param dest Pointer to pointer to output buffer + * \param destlen Pointer to length (in 
bytes) of output buffer + * \return HUBBUB_OK on success, appropriate error otherwise. + * + * source, sourcelen, dest and destlen will be updated appropriately on exit + */ +hubbub_error hubbub_charsetcodec_encode(hubbub_charsetcodec *codec, + const uint8_t **source, size_t *sourcelen, + uint8_t **dest, size_t *destlen) +{ + if (codec == NULL || source == NULL || *source == NULL || + sourcelen == NULL || dest == NULL || *dest == NULL || + destlen == NULL) + return HUBBUB_BADPARM; + + return codec->handler.encode(codec, source, sourcelen, dest, destlen); +} + +/** + * Decode a chunk of data in a codec's charset into UCS4 + * + * \param codec The codec to use + * \param source Pointer to pointer to source data + * \param sourcelen Pointer to length (in bytes) of source data + * \param dest Pointer to pointer to output buffer + * \param destlen Pointer to length (in bytes) of output buffer + * \return HUBBUB_OK on success, appropriate error otherwise. + * + * source, sourcelen, dest and destlen will be updated appropriately on exit + * + * Call this with a source length of 0 to flush any buffers. 
+ */ +hubbub_error hubbub_charsetcodec_decode(hubbub_charsetcodec *codec, + const uint8_t **source, size_t *sourcelen, + uint8_t **dest, size_t *destlen) +{ + if (codec == NULL || source == NULL || *source == NULL || + sourcelen == NULL || dest == NULL || *dest == NULL || + destlen == NULL) + return HUBBUB_BADPARM; + + return codec->handler.decode(codec, source, sourcelen, dest, destlen); +} + +/** + * Clear a charset codec's encoding state + * + * \param codec The codec to reset + * \return HUBBUB_OK on success, appropriate error otherwise + */ +hubbub_error hubbub_charsetcodec_reset(hubbub_charsetcodec *codec) +{ + if (codec == NULL) + return HUBBUB_BADPARM; + + /* Reset filter */ + if (codec->filter) + codec->filter(HUBBUB_CHARSETCODEC_NULL, NULL, NULL, NULL); + + return codec->handler.reset(codec); +} + diff --git a/src/charset/codec.h b/src/charset/codec.h new file mode 100644 index 0000000..4cd94d8 --- /dev/null +++ b/src/charset/codec.h @@ -0,0 +1,153 @@ +/* + * This file is part of Hubbub. + * Licensed under the MIT License, + * http://www.opensource.org/licenses/mit-license.php + * Copyright 2007 John-Mark Bell + */ + +#ifndef hubbub_charset_codec_h_ +#define hubbub_charset_codec_h_ + +#include + +#include +#include + +typedef struct hubbub_charsetcodec hubbub_charsetcodec; + +#define HUBBUB_CHARSETCODEC_NULL (0xffffffffU) + +/** + * Type of charset codec filter function + * + * \param c UCS4 character (in host byte order) or + * HUBBUB_CHARSETCODEC_NULL to reset + * \param output Pointer to location to store output buffer location + * \param outputlen Pointer to location to store output buffer length + * \param pw Pointer to client-specific private data + * \return HUBBUB_OK on success, or appropriate error otherwise. + * + * The output buffer is owned by the filter code and will not be freed by + * any charset codec. It should contain the replacement UCS4 character(s) + * for the input. The replacement characters should be in host byte order. 
+ * The contents of *output and *outputlen on entry are ignored and these + * will be filled in by the filter code. + * + * Filters may elect to replace the input character with no output. In this + * case, *output should be set to NULL and *outputlen should be set to 0 and + * HUBBUB_OK should be returned. + * + * The output length is in terms of the number of UCS4 characters in the + * output buffer. i.e.: + * + * for (size_t i = 0; i < outputlen; i++) { + * dest[curchar++] = output[i]; + * } + * + * would copy the contents of the filter output buffer to the codec's output + * buffer. + */ +typedef hubbub_error (*hubbub_charsetcodec_filter)(uint32_t c, + uint32_t **output, size_t *outputlen, void *pw); + +/** + * Charset codec error mode + * + * A codec's error mode determines its behaviour in the face of: + * + * + characters which are unrepresentable in the destination charset (if + * encoding data) or which cannot be converted to UCS4 (if decoding data). + * + invalid byte sequences (both encoding and decoding) + * + * The options provide a choice between the following approaches: + * + * + draconian, "stop processing" ("strict") + * + "replace the unrepresentable character with something else" ("loose") + * + "attempt to transliterate, or replace if unable" ("translit") + * + * The default error mode is "loose". + * + * + * In the "loose" case, the replacement character will depend upon: + * + * + Whether the operation was encoding or decoding + * + If encoding, what the destination charset is. + * + * If decoding, the replacement character will be: + * + * U+FFFD (REPLACEMENT CHARACTER) + * + * If encoding, the replacement character will be: + * + * U+003F (QUESTION MARK) if the destination charset is not UTF-(8|16|32) + * U+FFFD (REPLACEMENT CHARACTER) otherwise. + * + * + * In the "translit" case, the codec will attempt to transliterate into + * the destination charset, if encoding. 
If decoding, or if transliteration + * fails, this option is identical to "loose". + */ +typedef enum hubbub_charsetcodec_errormode { + /** Abort processing if unrepresentable character encountered */ + HUBBUB_CHARSETCODEC_ERROR_STRICT = 0, + /** Replace unrepresentable characters with single alternate */ + HUBBUB_CHARSETCODEC_ERROR_LOOSE = 1, + /** Transliterate unrepresentable characters, if possible */ + HUBBUB_CHARSETCODEC_ERROR_TRANSLIT = 2, +} hubbub_charsetcodec_errormode; + +/** + * Charset codec option types + */ +typedef enum hubbub_charsetcodec_opttype { + /** Register codec filter function */ + HUBBUB_CHARSETCODEC_FILTER_FUNC = 0, + /** Set codec error mode */ + HUBBUB_CHARSETCODEC_ERROR_MODE = 1, +} hubbub_charsetcodec_opttype; + +/** + * Charset codec option parameters + */ +typedef union hubbub_charsetcodec_optparams { + /** Parameters for filter function setting */ + struct { + /** Filter function */ + hubbub_charsetcodec_filter filter; + /** Client-specific private data */ + void *pw; + } filter_func; + + /** Parameters for error mode setting */ + struct { + /** The desired error handling mode */ + hubbub_charsetcodec_errormode mode; + } error_mode; +} hubbub_charsetcodec_optparams; + + +/* Create a charset codec */ +hubbub_charsetcodec *hubbub_charsetcodec_create(const char *charset, + hubbub_alloc alloc, void *pw); +/* Destroy a charset codec */ +void hubbub_charsetcodec_destroy(hubbub_charsetcodec *codec); + +/* Configure a charset codec */ +hubbub_error hubbub_charsetcodec_setopt(hubbub_charsetcodec *codec, + hubbub_charsetcodec_opttype type, + hubbub_charsetcodec_optparams *params); + +/* Encode a chunk of UCS4 data into a codec's charset */ +hubbub_error hubbub_charsetcodec_encode(hubbub_charsetcodec *codec, + const uint8_t **source, size_t *sourcelen, + uint8_t **dest, size_t *destlen); + +/* Decode a chunk of data in a codec's charset into UCS4 */ +hubbub_error hubbub_charsetcodec_decode(hubbub_charsetcodec *codec, + const uint8_t **source, 
size_t *sourcelen, + uint8_t **dest, size_t *destlen); + +/* Reset a charset codec */ +hubbub_error hubbub_charsetcodec_reset(hubbub_charsetcodec *codec); + +#endif diff --git a/src/charset/codec_iconv.c b/src/charset/codec_iconv.c new file mode 100644 index 0000000..097e82a --- /dev/null +++ b/src/charset/codec_iconv.c @@ -0,0 +1,837 @@ +/* + * This file is part of Hubbub. + * Licensed under the MIT License, + * http://www.opensource.org/licenses/mit-license.php + * Copyright 2007 John-Mark Bell + */ + +/* This codec is hideously slow. Only use it as a last resort */ + +#include +#include +#include + +#include + +/* These two are for htonl / ntohl */ +#include +#include + +#include "charset/aliases.h" +#include "utils/utils.h" + +#include "codec_impl.h" + +/** + * A note on endianness: + * + * UCS4 is big-endian by default. Therefore, this codec reads and writes + * big-endian values. This is fine, and causes no problems. However, to + * make life easier for client-supplied filter code, character values passed + * to a filter and those read back from a filter are in host-endian. + * Therefore, we need to convert from big-endian to host-endian when passing + * characters to a filter and perform the reverse translation when reading + * characters back. 
+ */ + +/** + * Iconv-based charset codec + */ +typedef struct hubbub_iconv_codec { + hubbub_charsetcodec base; /**< Base class */ + + iconv_t read_cd; /**< Iconv handle for reading */ +#define INVAL_BUFSIZE (32) + uint8_t inval_buf[INVAL_BUFSIZE]; /**< Buffer for fixing up + * incomplete input + * sequences */ + size_t inval_len; /**< Number of bytes in inval_buf */ + +#define READ_BUFSIZE (8) + uint32_t read_buf[READ_BUFSIZE]; /**< Buffer for partial + * output sequences (decode) + */ + size_t read_len; /**< Number of characters in + * read_buf */ + + iconv_t write_cd; /**< Iconv handle for writing */ +#define WRITE_BUFSIZE (8) + uint32_t write_buf[WRITE_BUFSIZE]; /**< Buffer for partial + * output sequences (encode) + */ + size_t write_len; /**< Number of characters in + * write_buf */ +} hubbub_iconv_codec; + + +static bool hubbub_iconv_codec_handles_charset(const char *charset); +static hubbub_charsetcodec *hubbub_iconv_codec_create(const char *charset, + hubbub_alloc alloc, void *pw); +static void hubbub_iconv_codec_destroy (hubbub_charsetcodec *codec); +static hubbub_error hubbub_iconv_codec_encode(hubbub_charsetcodec *codec, + const uint8_t **source, size_t *sourcelen, + uint8_t **dest, size_t *destlen); +static hubbub_error hubbub_iconv_codec_decode(hubbub_charsetcodec *codec, + const uint8_t **source, size_t *sourcelen, + uint8_t **dest, size_t *destlen); +static hubbub_error hubbub_iconv_codec_reset(hubbub_charsetcodec *codec); +static hubbub_error hubbub_iconv_codec_filter_decoded_char( + hubbub_iconv_codec *c, uint32_t ucs4, uint8_t **dest, + size_t *destlen); +static bool hubbub_iconv_codec_is_unicode(hubbub_iconv_codec *c); +static hubbub_error hubbub_iconv_codec_read_char(hubbub_iconv_codec *c, + const uint8_t **source, size_t *sourcelen, + uint8_t **dest, size_t *destlen); +static hubbub_error hubbub_iconv_codec_write_char(hubbub_iconv_codec *c, + uint32_t ucs4, uint8_t **dest, size_t *destlen); + +/** + * Determine whether this codec handles a 
specific charset + * + * \param charset Charset to test + * \return true if handleable, false otherwise + */ +bool hubbub_iconv_codec_handles_charset(const char *charset) +{ + iconv_t cd; + bool ret; + + cd = iconv_open("UCS-4", charset); + + ret = (cd != (iconv_t) -1); + + if (ret) + iconv_close(cd); + + return ret; +} + +/** + * Create an iconv-based codec + * + * \param charset The charset to read from / write to + * \param alloc Memory (de)allocation function + * \param pw Pointer to client-specific private data (may be NULL) + * \return Pointer to codec, or NULL on failure + */ +hubbub_charsetcodec *hubbub_iconv_codec_create(const char *charset, + hubbub_alloc alloc, void *pw) +{ + hubbub_iconv_codec *codec; + + codec = alloc(NULL, sizeof(hubbub_iconv_codec), pw); + if (codec == NULL) + return NULL; + + codec->read_cd = iconv_open("UCS-4", charset); + if (codec->read_cd == (iconv_t) -1) { + alloc(codec, 0, pw); + return NULL; + } + + codec->write_cd = iconv_open(charset, "UCS-4"); + if (codec->write_cd == (iconv_t) -1) { + iconv_close(codec->read_cd); + alloc(codec, 0, pw); + return NULL; + } + + codec->inval_buf[0] = '\0'; + codec->inval_len = 0; + + codec->read_buf[0] = 0; + codec->read_len = 0; + + codec->write_buf[0] = 0; + codec->write_len = 0; + + /* Finally, populate vtable */ + codec->base.handler.destroy = hubbub_iconv_codec_destroy; + codec->base.handler.encode = hubbub_iconv_codec_encode; + codec->base.handler.decode = hubbub_iconv_codec_decode; + codec->base.handler.reset = hubbub_iconv_codec_reset; + + return (hubbub_charsetcodec *) codec; +} + +/** + * Destroy an iconv-based codec + * + * \param codec The codec to destroy + */ +void hubbub_iconv_codec_destroy (hubbub_charsetcodec *codec) +{ + hubbub_iconv_codec *c = (hubbub_iconv_codec *) codec; + + iconv_close(c->read_cd); + iconv_close(c->write_cd); + + return; +} + +/** + * Encode a chunk of UCS4 data into an iconv-based codec's charset + * + * \param codec The codec to use + * \param source 
Pointer to pointer to source data + * \param sourcelen Pointer to length (in bytes) of source data + * \param dest Pointer to pointer to output buffer + * \param destlen Pointer to length (in bytes) of output buffer + * \return HUBBUB_OK on success, + * HUBBUB_NOMEM if output buffer is too small, + * HUBBUB_INVALID if a character cannot be represented and the + * codec's error handling mode is set to STRICT, + * <any_other_error> as a result of the failure of the + * client-provided filter function. + * + * On exit, ::source will point immediately _after_ the last input character + * read. Any remaining output for the character will be buffered by the + * codec for writing on the next call. This buffered data is post-filtering, + * so will not be refiltered on the next call. + * + * In the case of the filter function failing, ::source will point _at_ the + * last input character read; nothing will be written or buffered for the + * failed character. It is up to the client to fix the cause of the failure + * and retry the encoding process. + * + * Note that, if failure occurs whilst attempting to write any output + * buffered by the last call, then ::source and ::sourcelen will remain + * unchanged (as nothing more has been read). + * + * There is no way to determine the output character which caused a + * failure (as it may be one in a filter-injected replacement sequence). + * It is, however, possible to determine which source character caused it + * (this being the character immediately before the location pointed to by + * ::source on exit). + * + * [I.e. the process of filtering results in a potential one-to-many mapping + * between source characters and output characters, and identification of + * individual output characters is impossible.] + * + * ::sourcelen will be reduced appropriately on exit. + * + * ::dest will point immediately _after_ the last character written. + * + * ::destlen will be reduced appropriately on exit. 
+ */ +hubbub_error hubbub_iconv_codec_encode(hubbub_charsetcodec *codec, + const uint8_t **source, size_t *sourcelen, + uint8_t **dest, size_t *destlen) +{ + hubbub_iconv_codec *c = (hubbub_iconv_codec *) codec; + uint32_t ucs4; + const uint32_t *towrite; + size_t towritelen; + hubbub_error error; + + /* Process any outstanding characters from the previous call */ + if (c->write_len > 0) { + uint32_t *pwrite = c->write_buf; + + while (c->write_len > 0) { + error = hubbub_iconv_codec_write_char(c, pwrite[0], + dest, destlen); + if (error != HUBBUB_OK) { + /* Copy outstanding chars down, skipping + * invalid one, if present, so as to avoid + * reprocessing the invalid character */ + if (error == HUBBUB_INVALID) { + for (ucs4 = 1; ucs4 < c->write_len; + ucs4++) { + c->write_buf[ucs4] = + pwrite[ucs4]; + } + } + + return error; + } + + pwrite++; + c->write_len--; + } + } + + /* Now process the characters for this call */ + while (*sourcelen > 0) { + towrite = (const uint32_t *) (const void *) *source; + towritelen = 1; + ucs4 = *towrite; + + /* Run character we're about to output through the + * registered filter, so it can replace it, if it sees + * fit to do so */ + if (c->base.filter != NULL) { + uint32_t *replacement; + + error = c->base.filter(ntohl(ucs4), + &replacement, &towritelen, + c->base.filter_pw); + if (error != HUBBUB_OK) { + /* Don't eat character -- filter failed, + * so nothing gets written or buffered. + * It's up to the client to ensure that + * the filter works in the case where it + * reprocesses this character after the + * fault is fixed up. 
*/ + + return error; + } + + /* Convert filter output to big endian UCS4 */ + for (ucs4 = 0; ucs4 < towritelen; ucs4++) { + replacement[ucs4] = htonl(replacement[ucs4]); + } + + towrite = (const uint32_t *) replacement; + } + + /* Output current character(s) */ + while (towritelen > 0) { + error = hubbub_iconv_codec_write_char(c, towrite[0], + dest, destlen); + + if (error != HUBBUB_OK) { + ucs4 = (error == HUBBUB_INVALID) ? 1 : 0; + + if (towritelen - ucs4 >= WRITE_BUFSIZE) + abort(); + + c->write_len = towritelen - ucs4; + + /* Copy pending chars to save area, for + * processing next call; skipping invalid + * character, if present, so it's not + * reprocessed. */ + for (; ucs4 < towritelen; ucs4++) { + c->write_buf[ucs4] = towrite[ucs4]; + } + + /* Claim character we've just buffered, + * so it's not repreocessed */ + *source += 4; + *sourcelen -= 4; + + return error; + } + + towrite++; + towritelen--; + } + + *source += 4; + *sourcelen -= 4; + } + + return HUBBUB_OK; +} + +/** + * Decode a chunk of data in an iconv-based codec's charset into UCS4 + * + * \param codec The codec to use + * \param source Pointer to pointer to source data + * \param sourcelen Pointer to length (in bytes) of source data + * \param dest Pointer to pointer to output buffer + * \param destlen Pointer to length (in bytes) of output buffer + * \return HUBBUB_OK on success, + * HUBBUB_NOMEM if output buffer is too small, + * HUBBUB_INVALID if a character cannot be represented and the + * codec's error handling mode is set to STRICT, + * as a result of the failure of the + * client-provided filter function. + * + * On exit, ::source will point immediately _after_ the last input character + * read, if the result is _OK or _NOMEM. Any remaining output for the + * character will be buffered by the codec for writing on the next call. + * This buffered data is post-filtering, so will not be refiltered on the + * next call. 
+ * + * In the case of the result being _INVALID or the filter function failing, + * ::source will point _at_ the last input character read; nothing will be + * written or buffered for the failed character. It is up to the client to + * fix the cause of the failure and retry the decoding process. + * + * Note that, if failure occurs whilst attempting to write any output + * buffered by the last call, then ::source and ::sourcelen will remain + * unchanged (as nothing more has been read). + * + * There is no way to determine the output character which caused a + * failure (as it may be one in a filter-injected replacement sequence). + * It is, however, possible to determine which source character caused it + * (this being the character immediately at or before the location pointed + * to by ::source on exit). + * + * [I.e. the process of filtering results in a potential one-to-many mapping + * between source characters and output characters, and identification of + * individual output characters is impossible.] + * + * If STRICT error handling is configured and an illegal sequence is split + * over two calls, then _INVALID will be returned from the second call, + * but ::source will point mid-way through the invalid sequence (i.e. it + * will be unmodified over the second call). In addition, the internal + * incomplete-sequence buffer will be emptied, such that subsequent calls + * will progress, rather than re-evaluating the same invalid sequence. + * + * ::sourcelen will be reduced appropriately on exit. + * + * ::dest will point immediately _after_ the last character written. + * + * ::destlen will be reduced appropriately on exit. + * + * Call this with a source length of 0 to flush the output buffer. 
+ */ +hubbub_error hubbub_iconv_codec_decode(hubbub_charsetcodec *codec, + const uint8_t **source, size_t *sourcelen, + uint8_t **dest, size_t *destlen) +{ + hubbub_iconv_codec *c = (hubbub_iconv_codec *) codec; + hubbub_error error; + + if (c->read_len > 0) { + /* Output left over from last decode + * Attempt to finish this here */ + uint32_t *pread = c->read_buf; + + while (c->read_len > 0 && *destlen >= c->read_len * 4) { + *((uint32_t *) (void *) *dest) = pread[0]; + + *dest += 4; + *destlen -= 4; + + pread++; + c->read_len--; + } + + if (*destlen < c->read_len * 4) { + /* Run out of output buffer */ + size_t i; + + /* Shuffle remaining output down */ + for (i = 0; i < c->read_len; i++) { + c->read_buf[i] = pread[i]; + } + + return HUBBUB_NOMEM; + } + } + + if (c->inval_len > 0) { + /* The last decode ended in an incomplete sequence. + * Fill up inval_buf with data from the start of the + * new chunk and process it. */ + uint8_t *in = c->inval_buf; + size_t ol = c->inval_len; + size_t l = min(INVAL_BUFSIZE - ol - 1, *sourcelen); + size_t orig_l = l; + + memcpy(c->inval_buf + ol, *source, l); + + l += c->inval_len; + + error = hubbub_iconv_codec_read_char(c, + (const uint8_t **) &in, &l, dest, destlen); + if (error != HUBBUB_OK && error != HUBBUB_NOMEM) { + return error; + } + + + /* And now, fix everything up so the normal processing + * does the right thing. */ + *source += max((signed) (orig_l - l), 0); + *sourcelen -= max((signed) (orig_l - l), 0); + + /* Failed to resolve an incomplete character and + * ran out of buffer space. No recovery strategy + * possible, so explode everywhere. 
*/ + if ((orig_l + ol) - l == 0) + abort(); + + /* Handle memry exhaustion case from above */ + if (error != HUBBUB_OK) + return error; + } + + while (*sourcelen > 0) { + error = hubbub_iconv_codec_read_char(c, + source, sourcelen, dest, destlen); + if (error != HUBBUB_OK) { + return error; + } + } + + return HUBBUB_OK; +} + +/** + * Clear an iconv-based codec's encoding state + * + * \param codec The codec to reset + * \return HUBBUB_OK on success, appropriate error otherwise + */ +hubbub_error hubbub_iconv_codec_reset(hubbub_charsetcodec *codec) +{ + hubbub_iconv_codec *c = (hubbub_iconv_codec *) codec; + + iconv(c->read_cd, NULL, NULL, NULL, NULL); + iconv(c->write_cd, NULL, NULL, NULL, NULL); + + c->inval_buf[0] = '\0'; + c->inval_len = 0; + + c->read_buf[0] = 0; + c->read_len = 0; + + c->write_buf[0] = 0; + c->write_len = 0; + + return HUBBUB_OK; +} + +/** + * Feed a UCS4 character through the registered filter and output the result + * + * \param c Codec to use + * \param ucs4 UCS4 character (big endian) + * \param dest Pointer to pointer to output buffer + * \param destlen Pointer to output buffer length + * \return HUBBUB_OK on success, + * HUBBUB_NOMEM if output buffer is too small, + * as a result of the failure of the + * client-provided filter function. 
+ */ +hubbub_error hubbub_iconv_codec_filter_decoded_char(hubbub_iconv_codec *c, + uint32_t ucs4, uint8_t **dest, size_t *destlen) +{ + if (c->base.filter != NULL) { + uint32_t *rep; + size_t replen; + hubbub_error error; + + error = c->base.filter(ntohl(ucs4), &rep, &replen, + c->base.filter_pw); + if (error != HUBBUB_OK) { + return error; + } + + while (replen > 0 && *destlen >= replen * 4) { + *((uint32_t *) (void *) *dest) = htonl(*rep); + + *dest += 4; + *destlen -= 4; + + rep++; + replen--; + } + + if (*destlen < replen * 4) { + /* Run out of output buffer */ + size_t i; + + /* Buffer remaining output */ + c->read_len = replen; + + for (i = 0; i < replen; i++) { + c->read_buf[i] = htonl(rep[i]); + } + + return HUBBUB_NOMEM; + } + + } else { + if (*destlen < 4) { + /* Run out of output buffer */ + + c->read_len = 1; + c->read_buf[0] = ucs4; + + return HUBBUB_NOMEM; + } + + *((uint32_t *) (void *) *dest) = ucs4; + *dest += 4; + *destlen -= 4; + } + + return HUBBUB_OK; +} + +/** + * Detect if a codec's charset is Unicode capable + * + * \param c Codec to consider + * \return true if a Unicode variant, false otherwise + */ +bool hubbub_iconv_codec_is_unicode(hubbub_iconv_codec *c) +{ + static uint16_t ucs4; + static uint16_t ucs2; + static uint16_t utf8; + static uint16_t utf16; + static uint16_t utf16be; + static uint16_t utf16le; + static uint16_t utf32; + static uint16_t utf32be; + static uint16_t utf32le; + + if (ucs4 == 0) { + ucs4 = hubbub_mibenum_from_name("UCS-4", SLEN("UCS-4")); + ucs2 = hubbub_mibenum_from_name("UCS-2", SLEN("UCS-2")); + utf8 = hubbub_mibenum_from_name("UTF-8", SLEN("UTF-8")); + utf16 = hubbub_mibenum_from_name("UTF-16", SLEN("UTF-16")); + utf16be = hubbub_mibenum_from_name("UTF-16BE", + SLEN("UTF-16BE")); + utf16le = hubbub_mibenum_from_name("UTF-16LE", + SLEN("UTF-16LE")); + utf32 = hubbub_mibenum_from_name("UTF-32", SLEN("UTF-32")); + utf32be = hubbub_mibenum_from_name("UTF-32BE", + SLEN("UTF-32BE")); + utf32le = 
hubbub_mibenum_from_name("UTF-32LE", + SLEN("UTF-32LE")); + } + + return (c->base.mibenum == ucs4 || + c->base.mibenum == ucs2 || + c->base.mibenum == utf8 || + c->base.mibenum == utf16 || + c->base.mibenum == utf16be || + c->base.mibenum == utf16le || + c->base.mibenum == utf32 || + c->base.mibenum == utf32be || + c->base.mibenum == utf32le); +} + +/** + * Read a character from the codec's native charset to UCS4 (big endian) + * + * \param c The codec + * \param source Pointer to pointer to source buffer (updated on exit) + * \param sourcelen Pointer to length of source buffer (updated on exit) + * \param dest Pointer to pointer to output buffer (updated on exit) + * \param destlen Pointer to length of output buffer (updated on exit) + * \return HUBBUB_OK on success, + * HUBBUB_NOMEM if output buffer is too small, + * HUBBUB_INVALID if a character cannot be represented and the + * codec's error handling mode is set to STRICT, + * as a result of the failure of the + * client-provided filter function. + * + * On exit, ::source will point immediately _after_ the last input character + * read, if the result is _OK or _NOMEM. Any remaining output for the + * character will be buffered by the codec for writing on the next call. + * This buffered data is post-filtering, so will not be refiltered on the + * next call. + * + * In the case of the result being _INVALID or the filter function failing, + * ::source will point _at_ the last input character read; nothing will be + * written or buffered for the failed character. It is up to the client to + * fix the cause of the failure and retry the decoding process. + * + * ::sourcelen will be reduced appropriately on exit. + * + * ::dest will point immediately _after_ the last character written. + * + * ::destlen will be reduced appropriately on exit. 
+ */ +hubbub_error hubbub_iconv_codec_read_char(hubbub_iconv_codec *c, + const uint8_t **source, size_t *sourcelen, + uint8_t **dest, size_t *destlen) +{ + size_t iconv_ret; + const uint8_t *origsrc = *source; + size_t origsrclen = *sourcelen; + uint32_t ucs4; + uint8_t *pucs4 = (uint8_t *) &ucs4; + size_t sucs4 = 4; + hubbub_error error; + + /* Use iconv to convert a single character + * Side effect: Updates *source to point at next input + * character and *sourcelen to reflect reduced input length + */ + iconv_ret = iconv(c->read_cd, (char **) source, sourcelen, + (char **) (void *) &pucs4, &sucs4); + + if (iconv_ret != (size_t) -1 || + (*source != origsrc && sucs4 == 0)) { + /* Read a character */ + error = hubbub_iconv_codec_filter_decoded_char(c, + ucs4, dest, destlen); + if (error != HUBBUB_OK && error != HUBBUB_NOMEM) { + /* filter function failed; restore source pointers */ + *source = origsrc; + *sourcelen = origsrclen; + } + + /* Clear inval buffer */ + c->inval_buf[0] = '\0'; + c->inval_len = 0; + + return error; + } else if (errno == E2BIG) { + /* Should never happen */ + abort(); + } else if (errno == EINVAL) { + /* Incomplete input sequence */ + if (*sourcelen > INVAL_BUFSIZE) + abort(); + + memmove(c->inval_buf, (const char *) *source, *sourcelen); + c->inval_buf[*sourcelen] = '\0'; + c->inval_len = *sourcelen; + + *source += *sourcelen; + *sourcelen = 0; + + return HUBBUB_OK; + } else if (errno == EILSEQ) { + /* Illegal input sequence */ + bool found = false; + const uint8_t *oldsrc; + size_t oldsrclen; + + /* Clear inval buffer */ + c->inval_buf[0] = '\0'; + c->inval_len = 0; + + /* Strict errormode; simply flag invalid character */ + if (c->base.errormode == HUBBUB_CHARSETCODEC_ERROR_STRICT) { + /* restore source pointers */ + *source = origsrc; + *sourcelen = origsrclen; + + return HUBBUB_INVALID; + } + + /* Ok, this becomes problematic. 
The iconv API here + * is particularly unhelpful; *source will point at + * the _start_ of the illegal sequence. This means + * that we must find the end of the sequence */ + + /* Search for the start of the next valid input + * sequence (or the end of the input stream) */ + while (*sourcelen > 1) { + pucs4 = (uint8_t *) &ucs4; + sucs4 = 4; + + (*source)++; + (*sourcelen)--; + + oldsrc = *source; + oldsrclen = *sourcelen; + + iconv_ret = iconv(c->read_cd, + (char **) source, sourcelen, + (char **) (void *) &pucs4, &sucs4); + if (iconv_ret != (size_t) -1 || errno != EILSEQ) { + found = true; + break; + } + } + + if (found) { + /* Found start of next valid sequence */ + *source = oldsrc; + *sourcelen = oldsrclen; + } else { + /* Not found - skip last byte in buffer */ + (*source)++; + (*sourcelen)--; + + if (*sourcelen != 0) + abort(); + } + + /* output U+FFFD and continue processing. */ + error = hubbub_iconv_codec_filter_decoded_char(c, + htonl(0xFFFD), dest, destlen); + if (error != HUBBUB_OK && error != HUBBUB_NOMEM) { + /* filter function failed; restore source pointers */ + *source = origsrc; + *sourcelen = origsrclen; + } + + return error; + } + + return HUBBUB_OK; +} + +/** + * Write a UCS4 character in a codec's native charset + * + * \param c The codec + * \param ucs4 The UCS4 character to write (big endian) + * \param dest Pointer to pointer to output buffer (updated on exit) + * \param destlen Pointer to length of output buffer (updated on exit) + * \return HUBBUB_OK on success, + * HUBBUB_NOMEM if output buffer is too small, + * HUBBUB_INVALID if character cannot be represented and the + * codec's error handling mode is set to STRICT. 
+ */ +hubbub_error hubbub_iconv_codec_write_char(hubbub_iconv_codec *c, + uint32_t ucs4, uint8_t **dest, size_t *destlen) +{ + size_t iconv_ret; + uint8_t *pucs4 = (uint8_t *) &ucs4; + size_t sucs4 = 4; + uint8_t *origdest = *dest; + + iconv_ret = iconv(c->write_cd, (char **) (void *) &pucs4, + &sucs4, (char **) dest, destlen); + + if (iconv_ret == (size_t) -1 && errno == E2BIG) { + /* Output buffer is too small */ + return HUBBUB_NOMEM; + } else if (iconv_ret == (size_t) -1 && errno == EILSEQ) { + /* Illegal multibyte sequence */ + /* This should never happen */ + abort(); + } else if (iconv_ret == (size_t) -1 && errno == EINVAL) { + /* Incomplete input character */ + /* This should never happen */ + abort(); + } else if (*dest == origdest) { + /* Nothing was output */ + switch (c->base.errormode) { + case HUBBUB_CHARSETCODEC_ERROR_STRICT: + return HUBBUB_INVALID; + + case HUBBUB_CHARSETCODEC_ERROR_TRANSLIT: + /** \todo transliteration */ + case HUBBUB_CHARSETCODEC_ERROR_LOOSE: + { + pucs4 = (uint8_t *) &ucs4; + sucs4 = 4; + + ucs4 = hubbub_iconv_codec_is_unicode(c) + ? htonl(0xFFFD) : htonl(0x3F); + + iconv_ret = iconv(c->write_cd, + (char **) (void *) &pucs4, &sucs4, + (char **) dest, destlen); + + if (iconv_ret == (size_t) -1 && errno == E2BIG) { + return HUBBUB_NOMEM; + } else if (iconv_ret == (size_t) -1 && + errno == EILSEQ) { + /* Illegal multibyte sequence */ + /* This should never happen */ + abort(); + } else if (iconv_ret == (size_t) -1 && + errno == EINVAL) { + /* Incomplete input character */ + /* This should never happen */ + abort(); + } + } + break; + } + } + + return HUBBUB_OK; +} + +const hubbub_charsethandler hubbub_iconv_codec_handler = { + hubbub_iconv_codec_handles_charset, + hubbub_iconv_codec_create +}; diff --git a/src/charset/codec_impl.h b/src/charset/codec_impl.h new file mode 100644 index 0000000..eb5116b --- /dev/null +++ b/src/charset/codec_impl.h @@ -0,0 +1,51 @@ +/* + * This file is part of Hubbub. 
+ * Licensed under the MIT License, + * http://www.opensource.org/licenses/mit-license.php + * Copyright 2007 John-Mark Bell + */ + +#ifndef hubbub_charset_codecimpl_h_ +#define hubbub_charset_codecimpl_h_ + +#include +#include + +#include "codec.h" + +/** + * Core charset codec definition; implementations extend this + */ +struct hubbub_charsetcodec { + uint16_t mibenum; /**< MIB enum for charset */ + + hubbub_charsetcodec_filter filter; /**< filter function */ + void *filter_pw; /**< filter private word */ + + hubbub_charsetcodec_errormode errormode; /**< error mode */ + + hubbub_alloc alloc; /**< allocation function */ + void *alloc_pw; /**< private word */ + + struct { + void (*destroy)(hubbub_charsetcodec *codec); + hubbub_error (*encode)(hubbub_charsetcodec *codec, + const uint8_t **source, size_t *sourcelen, + uint8_t **dest, size_t *destlen); + hubbub_error (*decode)(hubbub_charsetcodec *codec, + const uint8_t **source, size_t *sourcelen, + uint8_t **dest, size_t *destlen); + hubbub_error (*reset)(hubbub_charsetcodec *codec); + } handler; /**< Vtable for handler code */ +}; + +/** + * Codec factory component definition + */ +typedef struct hubbub_charsethandler { + bool (*handles_charset)(const char *charset); + hubbub_charsetcodec *(*create)(const char *charset, + hubbub_alloc alloc, void *pw); +} hubbub_charsethandler; + +#endif diff --git a/src/charset/codec_utf8.c b/src/charset/codec_utf8.c new file mode 100644 index 0000000..86d667f --- /dev/null +++ b/src/charset/codec_utf8.c @@ -0,0 +1,620 @@ +/* + * This file is part of Hubbub. 
+ * Licensed under the MIT License, + * http://www.opensource.org/licenses/mit-license.php + * Copyright 2007 John-Mark Bell + */ + +#include +#include + +/* These two are for htonl / ntohl */ +#include +#include + +#include "charset/aliases.h" +#include "utils/utf8.h" +#include "utils/utils.h" + +#include "codec_impl.h" + +/** + * UTF-8 charset codec + */ +typedef struct hubbub_utf8_codec { + hubbub_charsetcodec base; /**< Base class */ + +#define INVAL_BUFSIZE (32) + uint8_t inval_buf[INVAL_BUFSIZE]; /**< Buffer for fixing up + * incomplete input + * sequences */ + size_t inval_len; /*< Byte length of inval_buf **/ + +#define READ_BUFSIZE (8) + uint32_t read_buf[READ_BUFSIZE]; /**< Buffer for partial + * output sequences (decode) + * (host-endian) */ + size_t read_len; /**< Character length of read_buf */ + +#define WRITE_BUFSIZE (8) + uint32_t write_buf[WRITE_BUFSIZE]; /**< Buffer for partial + * output sequences (encode) + * (host-endian) */ + size_t write_len; /**< Character length of write_buf */ + +} hubbub_utf8_codec; + +static bool hubbub_utf8_codec_handles_charset(const char *charset); +static hubbub_charsetcodec *hubbub_utf8_codec_create(const char *charset, + hubbub_alloc alloc, void *pw); +static void hubbub_utf8_codec_destroy (hubbub_charsetcodec *codec); +static hubbub_error hubbub_utf8_codec_encode(hubbub_charsetcodec *codec, + const uint8_t **source, size_t *sourcelen, + uint8_t **dest, size_t *destlen); +static hubbub_error hubbub_utf8_codec_decode(hubbub_charsetcodec *codec, + const uint8_t **source, size_t *sourcelen, + uint8_t **dest, size_t *destlen); +static hubbub_error hubbub_utf8_codec_reset(hubbub_charsetcodec *codec); +static hubbub_error hubbub_utf8_codec_read_char(hubbub_utf8_codec *c, + const uint8_t **source, size_t *sourcelen, + uint8_t **dest, size_t *destlen); +static hubbub_error hubbub_utf8_codec_filter_decoded_char( + hubbub_utf8_codec *c, + uint32_t ucs4, uint8_t **dest, size_t *destlen); + +/** + * Determine whether this 
codec handles a specific charset + * + * \param charset Charset to test + * \return true if handleable, false otherwise + */ +bool hubbub_utf8_codec_handles_charset(const char *charset) +{ + return hubbub_mibenum_from_name(charset, strlen(charset)) == + hubbub_mibenum_from_name("UTF-8", SLEN("UTF-8")); +} + +/** + * Create a utf8 codec + * + * \param charset The charset to read from / write to + * \param alloc Memory (de)allocation function + * \param pw Pointer to client-specific private data (may be NULL) + * \return Pointer to codec, or NULL on failure + */ +hubbub_charsetcodec *hubbub_utf8_codec_create(const char *charset, + hubbub_alloc alloc, void *pw) +{ + hubbub_utf8_codec *codec; + + UNUSED(charset); + + codec = alloc(NULL, sizeof(hubbub_utf8_codec), pw); + if (codec == NULL) + return NULL; + + codec->inval_buf[0] = '\0'; + codec->inval_len = 0; + + codec->read_buf[0] = 0; + codec->read_len = 0; + + codec->write_buf[0] = 0; + codec->write_len = 0; + + /* Finally, populate vtable */ + codec->base.handler.destroy = hubbub_utf8_codec_destroy; + codec->base.handler.encode = hubbub_utf8_codec_encode; + codec->base.handler.decode = hubbub_utf8_codec_decode; + codec->base.handler.reset = hubbub_utf8_codec_reset; + + return (hubbub_charsetcodec *) codec; +} + +/** + * Destroy a utf8 codec + * + * \param codec The codec to destroy + */ +void hubbub_utf8_codec_destroy (hubbub_charsetcodec *codec) +{ + UNUSED(codec); +} + +/** + * Encode a chunk of UCS4 data into utf8 + * + * \param codec The codec to use + * \param source Pointer to pointer to source data + * \param sourcelen Pointer to length (in bytes) of source data + * \param dest Pointer to pointer to output buffer + * \param destlen Pointer to length (in bytes) of output buffer + * \return HUBBUB_OK on success, + * HUBBUB_NOMEM if output buffer is too small, + * HUBBUB_INVALID if a character cannot be represented and the + * codec's error handling mode is set to STRICT, + * as a result of the failure of the + 
* client-provided filter function. + * + * On exit, ::source will point immediately _after_ the last input character + * read. Any remaining output for the character will be buffered by the + * codec for writing on the next call. This buffered data is post-filtering, + * so will not be refiltered on the next call. + * + * In the case of the filter function failing, ::source will point _at_ the + * last input character read; nothing will be written or buffered for the + * failed character. It is up to the client to fix the cause of the failure + * and retry the encoding process. + * + * Note that, if failure occurs whilst attempting to write any output + * buffered by the last call, then ::source and ::sourcelen will remain + * unchanged (as nothing more has been read). + * + * There is no way to determine the output character which caused a + * failure (as it may be one in a filter-injected replacement sequence). + * It is, however, possible to determine which source character caused it + * (this being the character immediately before the location pointed to by + * ::source on exit). + * + * [I.e. the process of filtering results in a potential one-to-many mapping + * between source characters and output characters, and identification of + * individual output characters is impossible.] + * + * ::sourcelen will be reduced appropriately on exit. + * + * ::dest will point immediately _after_ the last character written. + * + * ::destlen will be reduced appropriately on exit. 
+ */ +hubbub_error hubbub_utf8_codec_encode(hubbub_charsetcodec *codec, + const uint8_t **source, size_t *sourcelen, + uint8_t **dest, size_t *destlen) +{ + hubbub_utf8_codec *c = (hubbub_utf8_codec *) codec; + uint32_t ucs4; + uint32_t *towrite; + size_t towritelen; + hubbub_error error; + + /* Process any outstanding characters from the previous call */ + if (c->write_len > 0) { + uint32_t *pwrite = c->write_buf; + uint8_t buf[6]; + size_t len; + + while (c->write_len > 0) { + error = hubbub_utf8_from_ucs4(pwrite[0], buf, &len); + if (error != HUBBUB_OK) + abort(); + + if (*destlen < len) { + /* Insufficient output buffer space */ + for (len = 0; len < c->write_len; len++) + c->write_buf[len] = pwrite[len]; + + return HUBBUB_NOMEM; + } + + memcpy(*dest, buf, len); + + *dest += len; + *destlen -= len; + + pwrite++; + c->write_len--; + } + } + + /* Now process the characters for this call */ + while (*sourcelen > 0) { + ucs4 = ntohl(*((uint32_t *) (void *) *source)); + towrite = &ucs4; + towritelen = 1; + + /* Run character we're about to output through the + * registered filter, so it can replace it. */ + if (c->base.filter != NULL) { + error = c->base.filter(ucs4, + &towrite, &towritelen, + c->base.filter_pw); + if (error != HUBBUB_OK) + return error; + } + + /* Output current characters */ + while (towritelen > 0) { + uint8_t buf[6]; + size_t len; + + error = hubbub_utf8_from_ucs4(towrite[0], buf, &len); + if (error != HUBBUB_OK) + abort(); + + if (*destlen < len) { + /* Insufficient output space */ + if (towritelen >= WRITE_BUFSIZE) + abort(); + + c->write_len = towritelen; + + /* Copy pending chars to save area, for + * processing next call. 
*/ + for (len = 0; len < towritelen; len++) + c->write_buf[len] = towrite[len]; + + /* Claim character we've just buffered, + * so it's not reprocessed */ + *source += 4; + *sourcelen -= 4; + + return HUBBUB_NOMEM; + } + + memcpy(*dest, buf, len); + + *dest += len; + *destlen -= len; + + towrite++; + towritelen--; + } + + *source += 4; + *sourcelen -= 4; + } + + return HUBBUB_OK; +} + +/** + * Decode a chunk of utf8 data into UCS4 + * + * \param codec The codec to use + * \param source Pointer to pointer to source data + * \param sourcelen Pointer to length (in bytes) of source data + * \param dest Pointer to pointer to output buffer + * \param destlen Pointer to length (in bytes) of output buffer + * \return HUBBUB_OK on success, + * HUBBUB_NOMEM if output buffer is too small, + * HUBBUB_INVALID if a character cannot be represented and the + * codec's error handling mode is set to STRICT, + * as a result of the failure of the + * client-provided filter function. + * + * On exit, ::source will point immediately _after_ the last input character + * read, if the result is _OK or _NOMEM. Any remaining output for the + * character will be buffered by the codec for writing on the next call. + * This buffered data is post-filtering, so will not be refiltered on the + * next call. + * + * In the case of the result being _INVALID or the filter function failing, + * ::source will point _at_ the last input character read; nothing will be + * written or buffered for the failed character. It is up to the client to + * fix the cause of the failure and retry the decoding process. + * + * Note that, if failure occurs whilst attempting to write any output + * buffered by the last call, then ::source and ::sourcelen will remain + * unchanged (as nothing more has been read). + * + * There is no way to determine the output character which caused a + * failure (as it may be one in a filter-injected replacement sequence). 
+ * It is, however, possible to determine which source character caused it + * (this being the character immediately at or before the location pointed + * to by ::source on exit). + * + * [I.e. the process of filtering results in a potential one-to-many mapping + * between source characters and output characters, and identification of + * individual output characters is impossible.] + * + * If STRICT error handling is configured and an illegal sequence is split + * over two calls, then _INVALID will be returned from the second call, + * but ::source will point mid-way through the invalid sequence (i.e. it + * will be unmodified over the second call). In addition, the internal + * incomplete-sequence buffer will be emptied, such that subsequent calls + * will progress, rather than re-evaluating the same invalid sequence. + * + * ::sourcelen will be reduced appropriately on exit. + * + * ::dest will point immediately _after_ the last character written. + * + * ::destlen will be reduced appropriately on exit. + * + * Call this with a source length of 0 to flush the output buffer. + */ +hubbub_error hubbub_utf8_codec_decode(hubbub_charsetcodec *codec, + const uint8_t **source, size_t *sourcelen, + uint8_t **dest, size_t *destlen) +{ + hubbub_utf8_codec *c = (hubbub_utf8_codec *) codec; + hubbub_error error; + + if (c->read_len > 0) { + /* Output left over from last decode */ + uint32_t *pread = c->read_buf; + + while (c->read_len > 0 && *destlen >= c->read_len * 4) { + *((uint32_t *) (void *) *dest) = htonl(pread[0]); + + *dest += 4; + *destlen -= 4; + + pread++; + c->read_len--; + } + + if (*destlen < c->read_len * 4) { + /* Ran out of output buffer */ + size_t i; + + /* Shuffle remaining output down */ + for (i = 0; i < c->read_len; i++) + c->read_buf[i] = pread[i]; + + return HUBBUB_NOMEM; + } + } + + if (c->inval_len > 0) { + /* The last decode ended in an incomplete sequence. + * Fill up inval_buf with data from the start of the + * new chunk and process it. 
*/ + uint8_t *in = c->inval_buf; + size_t ol = c->inval_len; + size_t l = min(INVAL_BUFSIZE - ol - 1, *sourcelen); + size_t orig_l = l; + + memcpy(c->inval_buf + ol, *source, l); + + l += c->inval_len; + + error = hubbub_utf8_codec_read_char(c, + (const uint8_t **) &in, &l, dest, destlen); + if (error != HUBBUB_OK && error != HUBBUB_NOMEM) { + return error; + } + + /* And now, fix up source pointers */ + *source += max((signed) (orig_l - l), 0); + *sourcelen -= max((signed) (orig_l - l), 0); + + /* Failed to resolve an incomplete character and + * ran out of buffer space. No recovery strategy + * possible, so explode everywhere. */ + if ((orig_l + ol) - l == 0) + abort(); + + /* Report memory exhaustion case from above */ + if (error != HUBBUB_OK) + return error; + } + + /* Finally, the "normal" case; process all outstanding characters */ + while (*sourcelen > 0) { + error = hubbub_utf8_codec_read_char(c, + source, sourcelen, dest, destlen); + if (error != HUBBUB_OK) { + return error; + } + } + + return HUBBUB_OK; +} + +/** + * Clear a utf8 codec's encoding state + * + * \param codec The codec to reset + * \return HUBBUB_OK on success, appropriate error otherwise + */ +hubbub_error hubbub_utf8_codec_reset(hubbub_charsetcodec *codec) +{ + hubbub_utf8_codec *c = (hubbub_utf8_codec *) codec; + + c->inval_buf[0] = '\0'; + c->inval_len = 0; + + c->read_buf[0] = 0; + c->read_len = 0; + + c->write_buf[0] = 0; + c->write_len = 0; + + return HUBBUB_OK; +} + + +/** + * Read a character from the UTF-8 to UCS4 (big endian) + * + * \param c The codec + * \param source Pointer to pointer to source buffer (updated on exit) + * \param sourcelen Pointer to length of source buffer (updated on exit) + * \param dest Pointer to pointer to output buffer (updated on exit) + * \param destlen Pointer to length of output buffer (updated on exit) + * \return HUBBUB_OK on success, + * HUBBUB_NOMEM if output buffer is too small, + * HUBBUB_INVALID if a character cannot be represented and the + 
* codec's error handling mode is set to STRICT, + * any other error as a result of the failure of the + * client-provided filter function. + * + * On exit, ::source will point immediately _after_ the last input character + * read, if the result is _OK or _NOMEM. Any remaining output for the + * character will be buffered by the codec for writing on the next call. + * This buffered data is post-filtering, so will not be refiltered on the + * next call. + * + * In the case of the result being _INVALID or the filter function failing, + * ::source will point _at_ the last input character read; nothing will be + * written or buffered for the failed character. It is up to the client to + * fix the cause of the failure and retry the decoding process. + * + * ::sourcelen will be reduced appropriately on exit. + * + * ::dest will point immediately _after_ the last character written. + * + * ::destlen will be reduced appropriately on exit. + */ +hubbub_error hubbub_utf8_codec_read_char(hubbub_utf8_codec *c, + const uint8_t **source, size_t *sourcelen, + uint8_t **dest, size_t *destlen) +{ + uint32_t ucs4; + size_t sucs4; + hubbub_error error; + + /* Convert a single character */ + error = hubbub_utf8_to_ucs4(*source, *sourcelen, &ucs4, &sucs4); + if (error == HUBBUB_OK) { + /* Read a character */ + error = hubbub_utf8_codec_filter_decoded_char(c, + ucs4, dest, destlen); + if (error == HUBBUB_OK || error == HUBBUB_NOMEM) { + /* filter function succeeded; update source pointers */ + *source += sucs4; + *sourcelen -= sucs4; + } + + /* Clear inval buffer */ + c->inval_buf[0] = '\0'; + c->inval_len = 0; + + return error; + } else if (error == HUBBUB_NEEDDATA) { + /* Incomplete input sequence */ + if (*sourcelen > INVAL_BUFSIZE) + abort(); + + memmove(c->inval_buf, (char *) *source, *sourcelen); + c->inval_buf[*sourcelen] = '\0'; + c->inval_len = *sourcelen; + + *source += *sourcelen; + *sourcelen = 0; + + return HUBBUB_OK; + } else if (error == HUBBUB_INVALID) { + /* Illegal input sequence */ +
uint32_t nextchar; + + /* Clear inval buffer */ + c->inval_buf[0] = '\0'; + c->inval_len = 0; + + /* Strict errormode; simply flag invalid character */ + if (c->base.errormode == HUBBUB_CHARSETCODEC_ERROR_STRICT) { + return HUBBUB_INVALID; + } + + /* Find next valid UTF-8 sequence. + * We're processing client-provided data, so let's + * be paranoid about its validity. */ + error = hubbub_utf8_next_paranoid(*source, *sourcelen, + 0, &nextchar); + if (error != HUBBUB_OK) { + if (error == HUBBUB_NEEDDATA) { + /* Need more data to be sure */ + if (*sourcelen > INVAL_BUFSIZE) + abort(); + + memmove(c->inval_buf, (char *) *source, + *sourcelen); + c->inval_buf[*sourcelen] = '\0'; + c->inval_len = *sourcelen; + + *source += *sourcelen; + *sourcelen = 0; + + nextchar = 0; + } else { + return error; + } + } + + /* output U+FFFD and continue processing. */ + error = hubbub_utf8_codec_filter_decoded_char(c, + 0xFFFD, dest, destlen); + if (error == HUBBUB_OK || error == HUBBUB_NOMEM) { + /* filter function succeeded; update source pointers */ + *source += nextchar; + *sourcelen -= nextchar; + } + + return error; + } + + return HUBBUB_OK; +} + +/** + * Feed a UCS4 character through the registered filter and output the result + * + * \param c Codec to use + * \param ucs4 UCS4 character (host endian) + * \param dest Pointer to pointer to output buffer + * \param destlen Pointer to output buffer length + * \return HUBBUB_OK on success, + * HUBBUB_NOMEM if output buffer is too small, + * any other error as a result of the failure of the + * client-provided filter function.
+ */ +hubbub_error hubbub_utf8_codec_filter_decoded_char(hubbub_utf8_codec *c, + uint32_t ucs4, uint8_t **dest, size_t *destlen) +{ + if (c->base.filter != NULL) { + uint32_t *rep; + size_t replen; + hubbub_error error; + + error = c->base.filter(ucs4, &rep, &replen, + c->base.filter_pw); + if (error != HUBBUB_OK) { + return error; + } + + while (replen > 0 && *destlen >= replen * 4) { + *((uint32_t *) (void *) *dest) = htonl(*rep); + + *dest += 4; + *destlen -= 4; + + rep++; + replen--; + } + + if (*destlen < replen * 4) { + /* Run out of output buffer */ + size_t i; + + /* Buffer remaining output */ + c->read_len = replen; + + for (i = 0; i < replen; i++) { + c->read_buf[i] = rep[i]; + } + + return HUBBUB_NOMEM; + } + + } else { + if (*destlen < 4) { + /* Run out of output buffer */ + c->read_len = 1; + c->read_buf[0] = ucs4; + + return HUBBUB_NOMEM; + } + + *((uint32_t *) (void *) *dest) = htonl(ucs4); + *dest += 4; + *destlen -= 4; + } + + return HUBBUB_OK; +} + + +const hubbub_charsethandler hubbub_utf8_codec_handler = { + hubbub_utf8_codec_handles_charset, + hubbub_utf8_codec_create +}; diff --git a/src/charset/detect.c b/src/charset/detect.c new file mode 100644 index 0000000..8ff3b87 --- /dev/null +++ b/src/charset/detect.c @@ -0,0 +1,673 @@ +/* + * This file is part of Hubbub. 
+ * Licensed under the MIT License, + * http://www.opensource.org/licenses/mit-license.php + * Copyright 2007 John-Mark Bell + */ + +#include +#include + +#include "charset/aliases.h" +#include "utils/utils.h" + +#include "detect.h" + +static uint16_t hubbub_charset_read_bom(const uint8_t **data, size_t *len); +static uint16_t hubbub_charset_scan_meta(const uint8_t *data, size_t len); +static uint16_t hubbub_charset_parse_attributes(const uint8_t **pos, + const uint8_t *end); +static uint16_t hubbub_charset_parse_content(const uint8_t *value, + uint32_t valuelen); +static bool hubbub_charset_get_attribute(const uint8_t **data, + const uint8_t *end, + const uint8_t **name, uint32_t *namelen, + const uint8_t **value, uint32_t *valuelen); + +/** + * Extract a charset from a chunk of data + * + * \param data Pointer to pointer to buffer containing data + * \param len Pointer to buffer length + * \param mibenum Pointer to location to store MIB enum representing charset + * \param source Pointer to location to receive charset source + * \return HUBBUB_OK on success, appropriate error otherwise + * + * The data pointer and length will be modified by this function if + * a byte order mark is encountered at the start of the buffer. The updated + * data pointer will point to the first byte in the buffer after the BOM. + * The length will be modified appropriately. + * + * The larger a chunk of data fed to this routine, the better, as it allows + * charset autodetection access to a larger dataset for analysis. 
+ */ +hubbub_error hubbub_charset_extract(const uint8_t **data, size_t *len, + uint16_t *mibenum, hubbub_charset_source *source) +{ + uint16_t charset = 0; + + if (data == NULL || *data == NULL || len == NULL || + mibenum == NULL || source == NULL) + return HUBBUB_BADPARM; + + /* We need at least 4 bytes of data */ + if (*len < 4) + goto default_encoding; + + /* First, look for a BOM */ + charset = hubbub_charset_read_bom(data, len); + if (charset != 0) { + *mibenum = charset; + *source = HUBBUB_CHARSET_DOCUMENT; + + return HUBBUB_OK; + } + + /* No BOM was found, so we must look for a meta charset within + * the document itself. */ + charset = hubbub_charset_scan_meta(*data, *len); + if (charset != 0) { + /* ISO-8859-1 becomes Windows-1252 */ + if (charset == hubbub_mibenum_from_name("ISO-8859-1", + SLEN("ISO-8859-1"))) { + charset = hubbub_mibenum_from_name("Windows-1252", + SLEN("Windows-1252")); + /* Fallback to 8859-1 if that failed */ + if (charset == 0) + charset = hubbub_mibenum_from_name( + "ISO-8859-1", SLEN("ISO-8859-1")); + } + + /* If we've encountered a meta charset for a non-ASCII- + * compatible encoding, don't trust it. + * + * Firstly, it should have been sent with a BOM (and thus + * detected above). + * + * Secondly, we've just used an ASCII-only parser to + * extract the encoding from the document. Therefore, + * the document plainly isn't what the meta charset + * claims it is. + * + * What we do in this case is to ignore the meta charset's + * claims and leave the charset determination to the + * autodetection routines (or the fallback case if they + * fail). 
+ */ + if (charset != hubbub_mibenum_from_name("UTF-16", + SLEN("UTF-16")) && + charset != hubbub_mibenum_from_name("UTF-16LE", + SLEN("UTF-16LE")) && + charset != hubbub_mibenum_from_name("UTF-16BE", + SLEN("UTF-16BE")) && + charset != hubbub_mibenum_from_name("UTF-32", + SLEN("UTF-32")) && + charset != hubbub_mibenum_from_name("UTF-32LE", + SLEN("UTF-32LE")) && + charset != hubbub_mibenum_from_name("UTF-32BE", + SLEN("UTF-32BE"))) { + + *mibenum = charset; + *source = HUBBUB_CHARSET_DOCUMENT; + + return HUBBUB_OK; + } + } + + /* No charset was specified within the document, attempt to + * autodetect the encoding from the data that we have available. */ + + /** \todo Charset autodetection */ + + /* We failed to autodetect a charset, so use the default fallback */ +default_encoding: + + charset = hubbub_mibenum_from_name("Windows-1252", + SLEN("Windows-1252")); + if (charset == 0) + charset = hubbub_mibenum_from_name("ISO-8859-1", + SLEN("ISO-8859-1")); + + *mibenum = charset; + *source = HUBBUB_CHARSET_DEFAULT; + + return HUBBUB_OK; +} + + +/** + * Inspect the beginning of a buffer of data for the presence of a + * UTF Byte Order Mark. + * + * \param data Pointer to pointer to buffer containing data + * \param len Pointer to buffer length + * \return MIB enum representing encoding described by BOM, or 0 if not found + * + * If a BOM is found, the data pointer will be modified to point to the first + * byte in the buffer after the BOM. The length will also be modified + * appropriately. 
+ */ +uint16_t hubbub_charset_read_bom(const uint8_t **data, size_t *len) +{ + if (data == NULL || *data == NULL || len == NULL) + return 0; + + /* We require at least 4 bytes of data */ + if (*len < 4) + return 0; + +#define UTF32BOM_LEN (4) +#define UTF16BOM_LEN (2) +#define UTF8BOM_LEN (3) + + if ((*data)[0] == 0x00 && (*data)[1] == 0x00 && + (*data)[2] == 0xFE && (*data)[3] == 0xFF) { + *data += UTF32BOM_LEN; + *len -= UTF32BOM_LEN; + + return hubbub_mibenum_from_name("UTF-32BE", + SLEN("UTF-32BE")); + } else if ((*data)[0] == 0xFF && (*data)[1] == 0xFE && + (*data)[2] == 0x00 && (*data)[3] == 0x00) { + *data += UTF32BOM_LEN; + *len -= UTF32BOM_LEN; + + return hubbub_mibenum_from_name("UTF-32LE", + SLEN("UTF-32LE")); + } else if ((*data)[0] == 0xFE && (*data)[1] == 0xFF) { + *data += UTF16BOM_LEN; + *len -= UTF16BOM_LEN; + + return hubbub_mibenum_from_name("UTF-16BE", + SLEN("UTF-16BE")); + } else if ((*data)[0] == 0xFF && (*data)[1] == 0xFE) { + *data += UTF16BOM_LEN; + *len -= UTF16BOM_LEN; + + return hubbub_mibenum_from_name("UTF-16LE", + SLEN("UTF-16LE")); + } else if ((*data)[0] == 0xEF && (*data)[1] == 0xBB && + (*data)[2] == 0xBF) { + *data += UTF8BOM_LEN; + *len -= UTF8BOM_LEN; + + return hubbub_mibenum_from_name("UTF-8", SLEN("UTF-8")); + } + +#undef UTF32BOM_LEN +#undef UTF16BOM_LEN +#undef UTF8BOM_LEN + + return 0; +} + +#define PEEK(a) \ + (pos < end - SLEN(a) && \ + strncasecmp((const char *) pos, a, SLEN(a)) == 0) + +#define ADVANCE(a) \ + while (pos < end - SLEN(a)) { \ + if (PEEK(a)) \ + break; \ + pos++; \ + } \ + \ + if (pos == end - SLEN(a)) \ + return 0; + +#define ISSPACE(a) \ + (a == 0x09 || a == 0x0a || a == 0x0b || \ + a == 0x0c || a == 0x0d || a == 0x20) + +/** + * Search for a meta charset within a buffer of data + * + * \param data Pointer to buffer containing data + * \param len Length of buffer in data + * \return MIB enum representing encoding, or 0 if none found + */ +uint16_t hubbub_charset_scan_meta(const uint8_t *data, size_t 
len) +{ + const uint8_t *pos = data; + const uint8_t *end; + uint16_t mibenum; + + if (data == NULL) + return 0; + + end = pos + min(512, len); + + /* 1. */ + while (pos < end) { + /* a */ + if (PEEK(""); + /* b */ + } else if (PEEK("= end - 1) + return 0; + + if (ISSPACE(*(pos + SLEN("= end) + return 0; + } + /* c */ + } else if ((PEEK("' || *pos == '<') + break; + pos++; + } + + if (pos >= end) + return 0; + + /* 3 */ + if (*pos != '<') { + const uint8_t *n; + const uint8_t *v; + uint32_t nl, vl; + + while (hubbub_charset_get_attribute(&pos, end, + &n, &nl, &v, &vl)) + ; /* do nothing */ + /* 2 */ + } else + continue; + /* d */ + } else if (PEEK(""); + } + + /* e - do nothing */ + + /* 2 */ + pos++; + } + + return 0; +} + +/** + * Parse attributes on a meta tag + * + * \param pos Pointer to pointer to current location (updated on exit) + * \param end Pointer to end of data stream + * \return MIB enum of detected encoding, or 0 if none found + */ +uint16_t hubbub_charset_parse_attributes(const uint8_t **pos, + const uint8_t *end) +{ + const uint8_t *name; + const uint8_t *value; + uint32_t namelen, valuelen; + uint16_t mibenum; + + if (pos == NULL || *pos == NULL || end == NULL) + return 0; + + /* 2 */ + while (hubbub_charset_get_attribute(pos, end, + &name, &namelen, &value, &valuelen)) { + /* 3 */ + /* a */ + if (namelen == SLEN("charset") && valuelen > 0 && + strncasecmp((const char *) name, "charset", + SLEN("charset")) == 0) { + /* strip value */ + while (ISSPACE(*value)) { + value++; + valuelen--; + } + + while (valuelen > 0 && ISSPACE(value[valuelen - 1])) + valuelen--; + + mibenum = hubbub_mibenum_from_name( + (const char *) value, valuelen); + if (mibenum != 0) + return mibenum; + /* b */ + } else if (namelen == SLEN("content") && valuelen > 0 && + strncasecmp((const char *) name, "content", + SLEN("content")) == 0) { + mibenum = hubbub_charset_parse_content(value, + valuelen); + if (mibenum != 0) + return mibenum; + } + + /* c - do nothing */ + + /* 1 */ 
+ while (*pos < end) { + if (ISSPACE(**pos)) + break; + (*pos)++; + } + + if (*pos >= end) { + return 0; + } + } + + return 0; +} + +/** + * Parse a content= attribute's value + * + * \param value Attribute's value + * \param valuelen Length of value + * \return MIB enum of detected encoding, or 0 if none found + */ +uint16_t hubbub_charset_parse_content(const uint8_t *value, + uint32_t valuelen) +{ + const uint8_t *end; + const uint8_t *tentative = NULL; + uint32_t tentative_len = 0; + + if (value == NULL) + return 0; + + end = value + valuelen; + + /* 1 */ + while (value < end) { + if (*value == ';') { + value++; + break; + } + + value++; + } + + if (value >= end) + return 0; + + /* 2 */ + while (value < end && ISSPACE(*value)) { + value++; + } + + if (value >= end) + return 0; + + /* 3 */ + if (value < end - SLEN("charset") && + strncasecmp((const char *) value, + "charset", SLEN("charset")) != 0) + return 0; + + value += SLEN("charset"); + + /* 4 */ + while (value < end && ISSPACE(*value)) { + value++; + } + + if (value >= end) + return 0; + + /* 5 */ + if (*value != '=') + return 0; + /* skip '=' */ + value++; + + /* 6 */ + while (value < end && ISSPACE(*value)) { + value++; + } + + if (value >= end) + return 0; + + /* 7 */ + tentative = value; + + /* a */ + if (*value == '"') { + while (++value < end && *value != '"') { + tentative_len++; + } + + if (value < end) + tentative++; + else + tentative = NULL; + /* b */ + } else if (*value == '\'') { + while (++value < end && *value != '\'') { + tentative_len++; + } + + if (value < end) + tentative++; + else + tentative = NULL; + /* c */ + } else { + while (value < end && !ISSPACE(*value)) { + value++; + tentative_len++; + } + } + + /* 8 */ + if (tentative != NULL) { + return hubbub_mibenum_from_name((const char *) tentative, + tentative_len); + } + + /* 9 */ + return 0; +} + +/** + * Extract an attribute from the data stream + * + * \param data Pointer to pointer to current location (updated on exit) + * \param 
end Pointer to end of data stream + * \param name Pointer to location to receive attribute name + * \param namelen Pointer to location to receive attribute name length + * \param value Pointer to location to receive attribute value + * \param valuelen Pointer to location to receive attribute value length + * \return true if attribute extracted, false otherwise. + * + * Note: The caller should heed the returned lengths; these are the only + * indicator that useful content resides in name or value. + */ +bool hubbub_charset_get_attribute(const uint8_t **data, const uint8_t *end, + const uint8_t **name, uint32_t *namelen, + const uint8_t **value, uint32_t *valuelen) +{ + const uint8_t *pos; + + if (data == NULL || *data == NULL || end == NULL || name == NULL || + namelen == NULL || value == NULL || valuelen == NULL) + return false; + + pos = *data; + + /* 1. Skip leading spaces or '/' characters */ + while (pos < end && (ISSPACE(*pos) || *pos == '/')) { + pos++; + } + + if (pos >= end) { + *data = pos; + return false; + } + + /* 2. Invalid element open character */ + if (*pos == '<') { + pos--; + *data = pos; + return false; + } + + /* 3. End of element */ + if (*pos == '>') { + *data = pos; + return false; + } + + /* 4. Initialise name & value to empty string */ + *name = pos; + *namelen = 0; + *value = (const uint8_t *) ""; + *valuelen = 0; + + /* 5. Extract name */ + while (pos < end) { + /* a */ + if (*pos == '=') { + break; + } + + /* b */ + if (ISSPACE(*pos)) { + break; + } + + /* c */ + if (*pos == '/' || *pos == '<' || *pos == '>') { + return true; + } + + /* d is handled by strncasecmp in _parse_attributes */ + + /* e */ + (*namelen)++; + + /* 6 */ + pos++; + } + + if (pos >= end) { + *data = pos; + return false; + } + + if (ISSPACE(*pos)) { + /* 7. Skip trailing spaces */ + while (pos < end && ISSPACE(*pos)) { + pos++; + } + + if (pos >= end) { + *data = pos; + return false; + } + + /* 8.
Must be '=' */ + if (*pos != '=') { + pos--; + *data = pos; + return true; + } + } + + /* 9. Skip '=' */ + pos++; + + /* 10. Skip any spaces after '=' */ + while (pos < end && ISSPACE(*pos)) { + pos++; + } + + if (pos >= end) { + *data = pos; + return false; + } + + /* 11. Extract value, if quoted */ + /* a */ + if (*pos == '\'' || *pos == '"') { + /* 1 */ + const uint8_t *quote = pos; + + /* 2 */ + while (++pos < end) { + /* 3 */ + if (*pos == *quote) { + *value = (quote + 1); + *data = ++pos; + return true; + } + + /* 4 is handled by strncasecmp */ + + /* 5 */ + (*valuelen)++; + + /* 6 */ + } + + if (pos >= end) { + *data = pos; + return false; + } + } + + /* b */ + if (*pos == '<' || *pos == '>') { + *data = pos; + return true; + } + + /* c is handled by strncasecmp */ + + /* d */ + *value = pos; + + while (pos < end) { + /* 12. Extract unquoted value */ + /* a */ + if (ISSPACE(*pos) || *pos == '<' || *pos == '>') { + *data = pos; + return true; + } + + /* b is handled by strncasecmp */ + + /* c */ + (*valuelen)++; + + /* 13. Advance */ + pos++; + } + + if (pos >= end) { + *data = pos; + return false; + } + + /* should never be reached */ + abort(); + + return false; +} diff --git a/src/charset/detect.h b/src/charset/detect.h new file mode 100644 index 0000000..854a8d6 --- /dev/null +++ b/src/charset/detect.h @@ -0,0 +1,22 @@ +/* + * This file is part of Hubbub. + * Licensed under the MIT License, + * http://www.opensource.org/licenses/mit-license.php + * Copyright 2007 John-Mark Bell + */ + +#ifndef hubbub_charset_detect_h_ +#define hubbub_charset_detect_h_ + +#include + +#include +#include +#include + +/* Extract a charset from a chunk of data */ +hubbub_error hubbub_charset_extract(const uint8_t **data, size_t *len, + uint16_t *mibenum, hubbub_charset_source *source); + +#endif + diff --git a/src/hubbub.c b/src/hubbub.c new file mode 100644 index 0000000..32e0a1f --- /dev/null +++ b/src/hubbub.c @@ -0,0 +1,63 @@ +/* + * This file is part of Hubbub. 
+ * Licensed under the MIT License, + * http://www.opensource.org/licenses/mit-license.php + * Copyright 2007 John-Mark Bell + */ + +#include + +#include "charset/aliases.h" +#include "tokeniser/entities.h" + +/** + * Initialise the Hubbub library for use. + * + * This _must_ be called before using any hubbub functions + * + * \param aliases_file Pointer to name of file containing encoding alias data + * \param alloc Pointer to (de)allocation function + * \param pw Pointer to client-specific private data (may be NULL) + * \return HUBBUB_OK on success, applicable error otherwise. + */ +hubbub_error hubbub_initialise(const char *aliases_file, + hubbub_alloc alloc, void *pw) +{ + hubbub_error error; + + if (aliases_file == NULL || alloc == NULL) + return HUBBUB_BADPARM; + + error = hubbub_aliases_create(aliases_file, alloc, pw); + if (error != HUBBUB_OK) + return error; + + error = hubbub_entities_create(alloc, pw); + if (error != HUBBUB_OK) { + hubbub_aliases_destroy(alloc, pw); + return error; + } + + return HUBBUB_OK; +} + +/** + * Clean up after Hubbub + * + * \param alloc Pointer to (de)allocation function + * \param pw Pointer to client-specific private data (may be NULL) + * \return HUBBUB_OK on success, applicable error otherwise. 
+ */ +hubbub_error hubbub_finalise(hubbub_alloc alloc, void *pw) +{ + if (alloc == NULL) + return HUBBUB_BADPARM; + + hubbub_entities_destroy(alloc, pw); + + hubbub_aliases_destroy(alloc, pw); + + return HUBBUB_OK; +} + + diff --git a/src/input/Makefile b/src/input/Makefile new file mode 100644 index 0000000..8b06c63 --- /dev/null +++ b/src/input/Makefile @@ -0,0 +1,53 @@ +# Makefile for libhubbub +# +# Toolchain is exported by top-level makefile +# +# Top-level makefile also exports the following variables: +# +# COMPONENT Name of component +# EXPORT Absolute path of export directory +# TOP Absolute path of source tree root +# +# The top-level makefile requires the following targets to exist: +# +# clean Clean source tree +# debug Create a debug binary +# distclean Fully clean source tree, back to pristine condition +# export Export distributable components to ${EXPORT} +# release Create a release binary +# setup Perform any setup required prior to compilation +# test Execute any test cases + +# Manipulate include paths +CFLAGS += -I$(CURDIR) + +# Objects +OBJS = filter inputstream utf8_stream + +.PHONY: clean debug distclean export release setup test + +# Targets +release: $(addprefix ../Release/, $(addsuffix .o, $(OBJS))) + +debug: $(addprefix ../Debug/, $(addsuffix .o, $(OBJS))) + +clean: + -@${RM} ${RMFLAGS} $(addprefix ../Release/, $(addsuffix .o, ${OBJS})) + -@${RM} ${RMFLAGS} $(addprefix ../Debug/, $(addsuffix .o, ${OBJS})) + +distclean: + +setup: + +export: + +test: + +# Pattern rules +../Release/%.o: %.c + @${ECHO} ${ECHOFLAGS} "==> $<" + @${CC} -c ${CFLAGS} -DNDEBUG -o $@ $< + +../Debug/%.o: %.c + @${ECHO} ${ECHOFLAGS} "==> $<" + @${CC} -c -g ${CFLAGS} -o $@ $< diff --git a/src/input/filter.c b/src/input/filter.c new file mode 100644 index 0000000..5ac5391 --- /dev/null +++ b/src/input/filter.c @@ -0,0 +1,380 @@ +/* + * This file is part of Hubbub. 
+ * Licensed under the MIT License, + * http://www.opensource.org/licenses/mit-license.php + * Copyright 2007 John-Mark Bell + */ + +#include +#include +#include +#include + +#include "charset/aliases.h" +#include "charset/codec.h" +#include "utils/utils.h" + +#include "input/filter.h" + + +/** Input filter */ +struct hubbub_filter { + hubbub_charsetcodec *read_codec; /**< Read codec */ + hubbub_charsetcodec *write_codec; /**< Write codec */ + + uint32_t filter_output[2]; /**< Filter output buffer */ + uint32_t last_filter_char; /**< Last filtered character */ + + uint32_t pivot_buf[64]; /**< Conversion pivot buffer */ + + bool leftover; /**< Data remains from last call */ + uint8_t *pivot_left; /**< Remaining pivot to write */ + size_t pivot_len; /**< Length of pivot remaining */ + + struct { + uint16_t encoding; /**< Input encoding */ + } settings; /**< Filter settings */ + + hubbub_alloc alloc; /**< Memory (de)allocation function */ + void *pw; /**< Client private data */ +}; + +static hubbub_error hubbub_filter_set_defaults(hubbub_filter *input); +static hubbub_error hubbub_filter_set_encoding(hubbub_filter *input, + const char *enc); +static hubbub_error read_character_filter(uint32_t c, + uint32_t **output, size_t *outputlen, void *pw); + +/** + * Create an input filter + * + * \param int_enc Desired encoding of document + * \param alloc Function used to (de)allocate data + * \param pw Pointer to client-specific private data (may be NULL) + * \return Pointer to filter instance, or NULL on failure + */ +hubbub_filter *hubbub_filter_create(const char *int_enc, + hubbub_alloc alloc, void *pw) +{ + hubbub_filter *filter; + + if (alloc == NULL) + return NULL; + + filter = alloc(NULL, sizeof(*filter), pw); + if (!filter) + return NULL; + + filter->last_filter_char = 0; + + filter->leftover = false; + filter->pivot_left = NULL; + filter->pivot_len = 0; + + filter->alloc = alloc; + filter->pw = pw; + + if (hubbub_filter_set_defaults(filter) != HUBBUB_OK) { + 
filter->alloc(filter, 0, pw); + return NULL; + } + + filter->write_codec = hubbub_charsetcodec_create(int_enc, alloc, pw); + if (filter->write_codec == NULL) { + if (filter->read_codec != NULL) + hubbub_charsetcodec_destroy(filter->read_codec); + filter->alloc(filter, 0, pw); + return NULL; + } + + return filter; +} + +/** + * Destroy an input filter + * + * \param input Pointer to filter instance + */ +void hubbub_filter_destroy(hubbub_filter *input) +{ + if (input == NULL) + return; + + if (input->read_codec != NULL) + hubbub_charsetcodec_destroy(input->read_codec); + + if (input->write_codec != NULL) + hubbub_charsetcodec_destroy(input->write_codec); + + input->alloc(input, 0, input->pw); + + return; +} + +/** + * Configure an input filter + * + * \param input Pointer to filter instance + * \param type Input option type to configure + * \param params Option-specific parameters + * \return HUBBUB_OK on success, appropriate error otherwise + */ +hubbub_error hubbub_filter_setopt(hubbub_filter *input, + hubbub_filter_opttype type, + hubbub_filter_optparams *params) +{ + hubbub_error error = HUBBUB_OK; + + if (input == NULL || params == NULL) + return HUBBUB_BADPARM; + + switch (type) { + case HUBBUB_FILTER_SET_ENCODING: + error = hubbub_filter_set_encoding(input, + params->encoding.name); + break; + } + + return error; +} + +/** + * Process a chunk of data + * + * \param input Pointer to filter instance + * \param data Pointer to pointer to input buffer + * \param len Pointer to length of input buffer + * \param output Pointer to pointer to output buffer + * \param outlen Pointer to length of output buffer + * \return HUBBUB_OK on success, appropriate error otherwise + * + * Call this with an input buffer length of 0 to flush any buffers. 
+ */ +hubbub_error hubbub_filter_process_chunk(hubbub_filter *input, + const uint8_t **data, size_t *len, + uint8_t **output, size_t *outlen) +{ + hubbub_error read_error, write_error; + + if (input == NULL || data == NULL || *data == NULL || len == NULL || + output == NULL || *output == NULL || outlen == NULL) + return HUBBUB_BADPARM; + + if (input->leftover) { + /* Some data left to be written from last call */ + + /* Attempt to flush the remaining data. */ + write_error = hubbub_charsetcodec_encode(input->write_codec, + (const uint8_t **) &input->pivot_left, + &input->pivot_len, + output, outlen); + + if (write_error != HUBBUB_OK) { + return write_error; + } + + /* And clear leftover */ + input->pivot_left = NULL; + input->pivot_len = 0; + input->leftover = false; + } + + while (*len > 0) { + size_t pivot_len = sizeof(input->pivot_buf); + uint8_t *pivot = (uint8_t *) input->pivot_buf; + + read_error = hubbub_charsetcodec_decode(input->read_codec, + data, len, + (uint8_t **) &pivot, &pivot_len); + + pivot = (uint8_t *) input->pivot_buf; + pivot_len = sizeof(input->pivot_buf) - pivot_len; + + if (pivot_len > 0) { + write_error = hubbub_charsetcodec_encode( + input->write_codec, + (const uint8_t **) &pivot, + &pivot_len, + output, outlen); + + if (write_error != HUBBUB_OK) { + input->leftover = true; + input->pivot_left = pivot; + input->pivot_len = pivot_len; + + return write_error; + } + } + + if (read_error != HUBBUB_OK && read_error != HUBBUB_NOMEM) + return read_error; + } + + return HUBBUB_OK; +} + +/** + * Reset an input filter's state + * + * \param input The input filter to reset + * \return HUBBUB_OK on success, appropriate error otherwise + */ +hubbub_error hubbub_filter_reset(hubbub_filter *input) +{ + hubbub_error error; + + if (input == NULL) + return HUBBUB_BADPARM; + + /* Clear pivot buffer leftovers */ + input->pivot_left = NULL; + input->pivot_len = 0; + input->leftover = false; + + /* Reset read codec */ + error =
hubbub_charsetcodec_reset(input->read_codec); + if (error != HUBBUB_OK) + return error; + + /* Reset write codec */ + error = hubbub_charsetcodec_reset(input->write_codec); + if (error != HUBBUB_OK) + return error; + + return HUBBUB_OK; +} + +/** + * Set an input filter's default settings + * + * \param input Input filter to configure + * \return HUBBUB_OK on success, appropriate error otherwise + */ +hubbub_error hubbub_filter_set_defaults(hubbub_filter *input) +{ + hubbub_error error; + + if (input == NULL) + return HUBBUB_BADPARM; + + input->read_codec = NULL; + input->write_codec = NULL; + input->settings.encoding = 0; + error = hubbub_filter_set_encoding(input, "ISO-8859-1"); + if (error != HUBBUB_OK) + return error; + + return HUBBUB_OK; +} + +/** + * Set an input filter's encoding + * + * \param input Input filter to configure + * \param enc Encoding name + * \return HUBBUB_OK on success, appropriate error otherwise + */ +hubbub_error hubbub_filter_set_encoding(hubbub_filter *input, + const char *enc) +{ + const char *old_enc; + uint16_t mibenum; + hubbub_error error; + hubbub_charsetcodec_optparams params; + + if (input == NULL || enc == NULL) + return HUBBUB_BADPARM; + + mibenum = hubbub_mibenum_from_name(enc, strlen(enc)); + if (mibenum == 0) + return HUBBUB_INVALID; + + /* Exit early if we're already using this encoding */ + if (input->settings.encoding == mibenum) + return HUBBUB_OK; + + old_enc = hubbub_mibenum_to_name(input->settings.encoding); + if (old_enc == NULL) + old_enc = "ISO-8859-1"; + + if (input->read_codec != NULL) + hubbub_charsetcodec_destroy(input->read_codec); + + input->read_codec = hubbub_charsetcodec_create(enc, input->alloc, + input->pw); + if (input->read_codec == NULL) + return HUBBUB_NOMEM; + + /* Register filter function */ + params.filter_func.filter = read_character_filter; + params.filter_func.pw = (void *) input; + error = hubbub_charsetcodec_setopt(input->read_codec, + HUBBUB_CHARSETCODEC_FILTER_FUNC, + 
(hubbub_charsetcodec_optparams *) &params); + if (error != HUBBUB_OK) + return error; + + input->settings.encoding = mibenum; + + return HUBBUB_OK; +} + +/** + * Character filter function for read characters + * + * \param c The read character (UCS4 - host byte order) + * \param output Pointer to pointer to output buffer (filled on exit) + * \param outputlen Pointer to output buffer length (filled on exit) + * \param pw Pointer to client-specific private data. + * \return HUBBUB_OK on success, appropriate error otherwise + */ +hubbub_error read_character_filter(uint32_t c, uint32_t **output, + size_t *outputlen, void *pw) +{ + hubbub_filter *input = (hubbub_filter *) pw; + size_t len; + + if (output == NULL || outputlen == NULL || pw == NULL) + return HUBBUB_BADPARM; + + /* Line ending normalisation: + * CRLF -> LF (trap CR and let LF through unmodified) + * CR -> LF (trap CR and convert to LF if not CRLF) + * LF -> LF (leave LF alone) + */ + +#define NUL (0x00000000) +#define CR (0x0000000D) +#define LF (0x0000000A) +#define REP (0x0000FFFD) + + if (c == NUL) { + /* Replace NUL (U+0000) characters in input with U+FFFD */ + input->filter_output[0] = REP; + len = 1; + } else if (c == CR) { + /* Trap CR characters */ + len = 0; + } else if (input->last_filter_char == CR && c != LF) { + /* Last char was CR and this isn't LF => CR -> LF */ + input->filter_output[0] = LF; + input->filter_output[1] = c; + len = 2; + } else { + /* Let character through unchanged */ + input->filter_output[0] = c; + len = 1; + } + +#undef NUL +#undef CR +#undef LF +#undef REP + + input->last_filter_char = c; + + *output = input->filter_output; + *outputlen = len; + + return HUBBUB_OK; +} diff --git a/src/input/filter.h b/src/input/filter.h new file mode 100644 index 0000000..6650e09 --- /dev/null +++ b/src/input/filter.h @@ -0,0 +1,57 @@ +/* + * This file is part of Hubbub.
+ * Licensed under the MIT License, + * http://www.opensource.org/licenses/mit-license.php + * Copyright 2007 John-Mark Bell + */ + +#ifndef hubbub_input_filter_h_ +#define hubbub_input_filter_h_ + +#include + +#include +#include + +typedef struct hubbub_filter hubbub_filter; + +/** + * Input filter option types + */ +typedef enum hubbub_filter_opttype { + HUBBUB_FILTER_SET_ENCODING = 0, +} hubbub_filter_opttype; + +/** + * Input filter option parameters + */ +typedef union hubbub_filter_optparams { + /** Parameters for encoding setting */ + struct { + /** Encoding name */ + const char *name; + } encoding; +} hubbub_filter_optparams; + + +/* Create an input filter */ +hubbub_filter *hubbub_filter_create(const char *int_enc, + hubbub_alloc alloc, void *pw); +/* Destroy an input filter */ +void hubbub_filter_destroy(hubbub_filter *input); + +/* Configure an input filter */ +hubbub_error hubbub_filter_setopt(hubbub_filter *input, + hubbub_filter_opttype type, + hubbub_filter_optparams *params); + +/* Process a chunk of data */ +hubbub_error hubbub_filter_process_chunk(hubbub_filter *input, + const uint8_t **data, size_t *len, + uint8_t **output, size_t *outlen); + +/* Reset an input filter's state */ +hubbub_error hubbub_filter_reset(hubbub_filter *input); + +#endif + diff --git a/src/input/inputstream.c b/src/input/inputstream.c new file mode 100644 index 0000000..f82d279 --- /dev/null +++ b/src/input/inputstream.c @@ -0,0 +1,479 @@ +/* + * This file is part of Hubbub. 
+ * Licensed under the MIT License, + * http://www.opensource.org/licenses/mit-license.php + * Copyright 2007 John-Mark Bell + */ + +#include + +#include "charset/aliases.h" +#include "input/streamimpl.h" + +/** + * Buffer moving claimant context + */ +struct hubbub_inputstream_bm_handler { + hubbub_inputstream_buffermoved handler; /**< Handler function */ + void *pw; /**< Client private data */ + + struct hubbub_inputstream_bm_handler *next; + struct hubbub_inputstream_bm_handler *prev; +}; + +extern hubbub_streamhandler utf8stream; + +static hubbub_streamhandler *handler_table[] = { + &utf8stream, + NULL +}; + +/** + * Create an input stream + * + * \param enc Document charset, or NULL to autodetect + * \param int_enc Desired encoding of document + * \param alloc Memory (de)allocation function + * \param pw Pointer to client-specific private data (may be NULL) + * \return Pointer to stream instance, or NULL on failure + */ +hubbub_inputstream *hubbub_inputstream_create(const char *enc, + const char *int_enc, hubbub_alloc alloc, void *pw) +{ + hubbub_inputstream *stream; + hubbub_streamhandler **handler; + + if (int_enc == NULL || alloc == NULL) + return NULL; + + /* Search for handler class */ + for (handler = handler_table; *handler != NULL; handler++) { + if ((*handler)->uses_encoding(int_enc)) + break; + } + + /* None found */ + if ((*handler) == NULL) + return NULL; + + stream = (*handler)->create(enc, int_enc, alloc, pw); + if (stream == NULL) + return NULL; + + stream->handlers = NULL; + + stream->alloc = alloc; + stream->pw = pw; + + return stream; +} + +/** + * Destroy an input stream + * + * \param stream Input stream to destroy + */ +void hubbub_inputstream_destroy(hubbub_inputstream *stream) +{ + hubbub_inputstream_bm_handler *h, *i; + + if (stream == NULL) + return; + + for (h = stream->handlers; h; h = i) { + i = h->next; + + stream->alloc(h, 0, stream->pw); + } + + stream->destroy(stream); +} + +/** + * Append data to an input stream + * + * \param 
stream Input stream to append data to + * \param data Data to append (in document charset), or NULL to flag EOF + * \param len Length, in bytes, of data + * \return HUBBUB_OK on success, appropriate error otherwise + */ +hubbub_error hubbub_inputstream_append(hubbub_inputstream *stream, + const uint8_t *data, size_t len) +{ + if (stream == NULL) + return HUBBUB_BADPARM; + + /* Calling this if we've disowned the buffer is foolish */ + if (stream->buffer == NULL) + return HUBBUB_INVALID; + + return stream->append(stream, data, len); +} + +/** + * Insert data into stream at current location + * + * \param stream Input stream to insert into + * \param data Data to insert (UTF-8 encoded) + * \param len Length, in bytes, of data + * \return HUBBUB_OK on success, appropriate error otherwise + */ +hubbub_error hubbub_inputstream_insert(hubbub_inputstream *stream, + const uint8_t *data, size_t len) +{ + if (stream == NULL || data == NULL) + return HUBBUB_BADPARM; + + /* Calling this if we've disowned the buffer is foolish */ + if (stream->buffer == NULL) + return HUBBUB_INVALID; + + return stream->insert(stream, data, len); +} + +/** + * Look at the next character in the stream + * + * \param stream Stream to look in + * \return UCS4 (host-endian) character code, or EOF or OOD. 
+ */ +uint32_t hubbub_inputstream_peek(hubbub_inputstream *stream) +{ + /* It is illegal to call this after the buffer has been disowned */ + if (stream == NULL || stream->buffer == NULL) + return HUBBUB_INPUTSTREAM_OOD; + + return stream->peek(stream);; +} + +/** + * Retrieve the byte index and length of the current character in the stream + * + * \param stream Stream to look in + * \param len Pointer to location to receive byte length of character + * \return Byte index of current character from start of stream, + * or (uint32_t) -1 on error + */ +uint32_t hubbub_inputstream_cur_pos(hubbub_inputstream *stream, + size_t *len) +{ + /* It is illegal to call this after the buffer has been disowned */ + if (stream == NULL || len == NULL || stream->buffer == NULL) + return (uint32_t) -1; + + return stream->cur_pos(stream, len); +} + +/** + * Convert the current character to lower case + * + * \param stream Stream to look in + */ +void hubbub_inputstream_lowercase(hubbub_inputstream *stream) +{ + if (stream == NULL || stream->buffer == NULL) + return; + + stream->lowercase(stream); +} + +/** + * Convert the current character to upper case + * + * \param stream Stream to look in + */ +void hubbub_inputstream_uppercase(hubbub_inputstream *stream) +{ + if (stream == NULL || stream->buffer == NULL) + return; + + stream->uppercase(stream); +} + +/** + * Advance the stream's current position + * + * \param stream The stream whose position to advance + */ +void hubbub_inputstream_advance(hubbub_inputstream *stream) +{ + /* It is illegal to call this after the buffer has been disowned */ + if (stream == NULL || stream->buffer == NULL) + return; + + if (stream->cursor == stream->buffer_len) + return; + + stream->advance(stream); +} + +/** + * Push a character back onto the stream + * + * \param stream Stream to push back to + * \param character UCS4 (host-endian) codepoint to push back + * \return HUBBUB_OK on success, appropriate error otherwise + * + * Note that this doesn't 
actually modify the data in the stream. + * It works by ensuring that the character located just before the + * current stream location is the same as ::character. If it is, + * then the stream pointer is moved back. If it is not, then an + * error is returned and the stream pointer remains unmodified. + */ +hubbub_error hubbub_inputstream_push_back(hubbub_inputstream *stream, + uint32_t character) +{ + /* It is illegal to call this after the buffer has been disowned */ + if (stream == NULL || stream->buffer == NULL) + return HUBBUB_BADPARM; + + if (stream->cursor == 0) + return HUBBUB_INVALID; + + return stream->push_back(stream, character); +} + +/** + * Rewind the input stream by a number of bytes + * + * \param stream Stream to rewind + * \param n Number of bytes to go back + * \return HUBBUB_OK on success, appropriate error otherwise + */ +hubbub_error hubbub_inputstream_rewind(hubbub_inputstream *stream, size_t n) +{ + if (stream == NULL || stream->buffer == NULL) + return HUBBUB_BADPARM; + + if (stream->cursor < n) + return HUBBUB_INVALID; + + stream->cursor -= n; + + return HUBBUB_OK; +} + +/** + * Claim ownership of an input stream's buffer + * + * \param stream Input stream whose buffer to claim + * \param buffer Pointer to location to receive buffer pointer + * \param len Pointer to location to receive byte length of buffer + * \return HUBBUB_OK on success, appropriate error otherwise. + * + * Once the buffer has been claimed by a client, the input stream disclaims + * all ownership rights (and invalidates any internal references it may have + * to the buffer). Therefore, the only input stream call which may be made + * after calling this function is to destroy the input stream. Therefore, + * unless the stream pointer is located at EOF, this call will return an + * error. 
+ */ +hubbub_error hubbub_inputstream_claim_buffer(hubbub_inputstream *stream, + uint8_t **buffer, size_t *len) +{ + if (stream == NULL || buffer == NULL || len == NULL) + return HUBBUB_BADPARM; + + if (stream->had_eof == false || + stream->cursor != stream->buffer_len) + return HUBBUB_INVALID; + + *buffer = stream->buffer; + *len = stream->buffer_len; + + stream->buffer = NULL; + + return HUBBUB_OK; +} + +/** + * Register interest in buffer moved events + * + * \param stream Input stream to register interest with + * \param handler Pointer to handler function + * \param pw Pointer to client-specific private data (may be NULL) + * \return HUBBUB_OK on success, appropriate error otherwise + */ +hubbub_error hubbub_inputstream_register_movehandler( + hubbub_inputstream *stream, + hubbub_inputstream_buffermoved handler, void *pw) +{ + hubbub_inputstream_bm_handler *h; + + if (stream == NULL || handler == NULL) + return HUBBUB_BADPARM; + + h = stream->alloc(NULL, sizeof(hubbub_inputstream_bm_handler), + stream->pw); + if (h == NULL) + return HUBBUB_NOMEM; + + h->handler = handler; + h->pw = pw; + + h->prev = NULL; + h->next = stream->handlers; + + if (stream->handlers) + stream->handlers->prev = h; + stream->handlers = h; + + /* And notify claimant of current buffer location */ + handler(stream->buffer, stream->buffer_len, pw); + + return HUBBUB_OK; +} + +/** + * Deregister interest in buffer moved events + * + * \param stream Input stream to deregister from + * \param handler Pointer to handler function + * \param pw Pointer to client-specific private data (may be NULL) + * \return HUBBUB_OK on success, appropriate error otherwise + */ +hubbub_error hubbub_inputstream_deregister_movehandler( + hubbub_inputstream *stream, + hubbub_inputstream_buffermoved handler, void *pw) +{ + hubbub_inputstream_bm_handler *h; + + if (stream == NULL || handler == NULL) + return HUBBUB_BADPARM; + + for (h = stream->handlers; h; h = h->next) { + if (h->handler == handler && h->pw == pw) 
+ break; + } + + if (h == NULL) + return HUBBUB_INVALID; + + if (h->next) + h->next->prev = h->prev; + if (h->prev) + h->prev->next = h->next; + else + stream->handlers = h->next; + + stream->alloc(h, 0, stream->pw); + + return HUBBUB_OK; +} + +/** + * Case insensitively compare a pair of ranges in the input stream + * + * \param stream Input stream to look in + * \param r1 Offset of start of first range + * \param r2 Offset of start of second range + * \param len Byte length of ranges + * \return 0 if ranges match, non-zero otherwise + */ +int hubbub_inputstream_compare_range_ci(hubbub_inputstream *stream, + uint32_t r1, uint32_t r2, size_t len) +{ + if (stream == NULL || stream->buffer == NULL) + return 1; /* arbitrary */ + + return stream->cmp_range_ci(stream, r1, r2, len); +} + +/** + * Case sensitively compare a pair of ranges in the input stream + * + * \param stream Input stream to look in + * \param r1 Offset of start of first range + * \param r2 Offset of start of second range + * \param len Byte length of ranges + * \return 0 if ranges match, non-zero otherwise + */ +int hubbub_inputstream_compare_range_cs(hubbub_inputstream *stream, + uint32_t r1, uint32_t r2, size_t len) +{ + if (stream == NULL || stream->buffer == NULL) + return 1; /* arbitrary */ + + return stream->cmp_range_cs(stream, r1, r2, len); +} + +/** + * Case sensitively compare a range of input stream against an ASCII string + * + * \param stream Input stream to look in + * \param off Offset of range start + * \param len Byte length of range + * \param data Comparison string + * \param dlen Byte length of comparison string + * \return 0 if match, non-zero otherwise + */ +int hubbub_inputstream_compare_range_ascii(hubbub_inputstream *stream, + uint32_t off, size_t len, const char *data, size_t dlen) +{ + if (stream == NULL || stream->buffer == NULL) + return 1; /* arbitrary */ + + return stream->cmp_range_ascii(stream, off, len, data, dlen); +} + +/** + * Replace a range of bytes in the input 
stream with a single character + * + * \param stream Input stream containing data + * \param start Offset of start of range to replace + * \param len Length (in bytes) of range to replace + * \param ucs4 UCS4 (host endian) encoded replacement character + * \return HUBBUB_OK on success, appropriate error otherwise + */ +hubbub_error hubbub_inputstream_replace_range(hubbub_inputstream *stream, + uint32_t start, size_t len, uint32_t ucs4) +{ + if (stream == NULL || stream->buffer == NULL) + return HUBBUB_BADPARM; + + if (start >= stream->buffer_len) + return HUBBUB_INVALID; + + if (start < stream->cursor) + return HUBBUB_INVALID; + + return stream->replace_range(stream, start, len, ucs4); +} + +/** + * Read the document charset + * + * \param stream Input stream to query + * \param source Pointer to location to receive charset source + * \return Pointer to charset name (constant; do not free), or NULL if unknown + */ +const char *hubbub_inputstream_read_charset(hubbub_inputstream *stream, + hubbub_charset_source *source) +{ + if (stream == NULL || source == NULL) + return NULL; + + *source = stream->encsrc; + + if (stream->encsrc == HUBBUB_CHARSET_UNKNOWN) + return NULL; + + return hubbub_mibenum_to_name(stream->mibenum); +} + +/** + * Inform interested parties that the buffer has moved + * + * \param stream Input stream + */ +void hubbub_inputstream_buffer_moved(hubbub_inputstream *stream) +{ + hubbub_inputstream_bm_handler *h; + + if (stream == NULL) + return; + + for (h = stream->handlers; h; h = h->next) + h->handler(stream->buffer, stream->buffer_len, h->pw); +} + diff --git a/src/input/inputstream.h b/src/input/inputstream.h new file mode 100644 index 0000000..5325d14 --- /dev/null +++ b/src/input/inputstream.h @@ -0,0 +1,98 @@ +/* + * This file is part of Hubbub. 
+ * Licensed under the MIT License, + * http://www.opensource.org/licenses/mit-license.php + * Copyright 2007 John-Mark Bell + */ + +#ifndef hubbub_input_inputstream_h_ +#define hubbub_input_inputstream_h_ + +#include + +#include +#include +#include + +typedef struct hubbub_inputstream hubbub_inputstream; + +/* EOF pseudo-character */ +#define HUBBUB_INPUTSTREAM_EOF (0xFFFFFFFFU) +/* Out-of-data indicator */ +#define HUBBUB_INPUTSTREAM_OOD (0xFFFFFFFEU) + +/* Type of input stream buffer moved handler function */ +typedef void (*hubbub_inputstream_buffermoved)(const uint8_t *buffer, + size_t len, void *pw); + +/* Create an input stream */ +hubbub_inputstream *hubbub_inputstream_create(const char *enc, + const char *int_enc, hubbub_alloc alloc, void *pw); +/* Destroy an input stream */ +void hubbub_inputstream_destroy(hubbub_inputstream *stream); + +/* Append data to an input stream */ +hubbub_error hubbub_inputstream_append(hubbub_inputstream *stream, + const uint8_t *data, size_t len); +/* Insert data into stream at current location */ +hubbub_error hubbub_inputstream_insert(hubbub_inputstream *stream, + const uint8_t *data, size_t len); + +/* Look at the next character in the stream */ +uint32_t hubbub_inputstream_peek(hubbub_inputstream *stream); + +/* Retrieve the byte index and length of the current character in the stream */ +uint32_t hubbub_inputstream_cur_pos(hubbub_inputstream *stream, size_t *len); + +/* Convert the current character to lowercase */ +void hubbub_inputstream_lowercase(hubbub_inputstream *stream); + +/* Convert the current character to uppercase */ +void hubbub_inputstream_uppercase(hubbub_inputstream *stream); + +/* Advance the stream's current position */ +void hubbub_inputstream_advance(hubbub_inputstream *stream); + +/* Push a character back onto the stream */ +hubbub_error hubbub_inputstream_push_back(hubbub_inputstream *stream, + uint32_t character); + +/* Rewind the input stream by a number of bytes */ +hubbub_error 
hubbub_inputstream_rewind(hubbub_inputstream *stream, size_t n); + +/* Claim ownership of an input stream's buffer */ +hubbub_error hubbub_inputstream_claim_buffer(hubbub_inputstream *stream, + uint8_t **buffer, size_t *len); + +/* Register interest in buffer moved events */ +hubbub_error hubbub_inputstream_register_movehandler( + hubbub_inputstream *stream, + hubbub_inputstream_buffermoved handler, void *pw); + +/* Deregister interest in buffer moved events */ +hubbub_error hubbub_inputstream_deregister_movehandler( + hubbub_inputstream *stream, + hubbub_inputstream_buffermoved handler, void *pw); + +/* Case insensitively compare a pair of ranges in the input stream */ +int hubbub_inputstream_compare_range_ci(hubbub_inputstream *stream, + uint32_t r1, uint32_t r2, size_t len); + +/* Case sensitively compare a pair of ranges in the input stream */ +int hubbub_inputstream_compare_range_cs(hubbub_inputstream *stream, + uint32_t r1, uint32_t r2, size_t len); + +/* Case sensitively compare a range of input stream against an ASCII string */ +int hubbub_inputstream_compare_range_ascii(hubbub_inputstream *stream, + uint32_t off, size_t len, const char *data, size_t dlen); + +/* Replace a range of bytes in the input stream with a single character */ +hubbub_error hubbub_inputstream_replace_range(hubbub_inputstream *stream, + uint32_t start, size_t len, uint32_t ucs4); + +/* Read the document charset */ +const char *hubbub_inputstream_read_charset(hubbub_inputstream *stream, + hubbub_charset_source *source); + +#endif + diff --git a/src/input/streamimpl.h b/src/input/streamimpl.h new file mode 100644 index 0000000..f44f6da --- /dev/null +++ b/src/input/streamimpl.h @@ -0,0 +1,77 @@ +/* + * This file is part of Hubbub. 
+ * Licensed under the MIT License, + * http://www.opensource.org/licenses/mit-license.php + * Copyright 2007 John-Mark Bell + */ + +#ifndef hubbub_input_streamimpl_h_ +#define hubbub_input_streamimpl_h_ + +#include + +#include + +#include "input/filter.h" +#include "input/inputstream.h" + +typedef struct hubbub_inputstream_bm_handler hubbub_inputstream_bm_handler; + +/** + * Input stream definition: implementations extend this + */ +struct hubbub_inputstream { + uint8_t *buffer; /**< Document buffer */ + size_t buffer_len; /**< Amount of data in buffer */ + size_t buffer_alloc; /**< Allocated size of buffer */ + + uint32_t cursor; /**< Byte offset of current position */ + + bool had_eof; /**< Whether EOF has been reached */ + + uint16_t mibenum; /**< MIB enum for charset, or 0 */ + hubbub_charset_source encsrc; /**< Charset source */ + + hubbub_filter *input; /**< Charset conversion filter */ + + hubbub_inputstream_bm_handler *handlers; /**< List of buffer + * moved handlers */ + hubbub_alloc alloc; /**< Memory (de)allocation function */ + void *pw; /**< Client private data */ + + void (*destroy)(hubbub_inputstream *stream); + hubbub_error (*append)(hubbub_inputstream *stream, + const uint8_t *data, size_t len); + hubbub_error (*insert)(hubbub_inputstream *stream, + const uint8_t *data, size_t len); + uint32_t (*peek)(hubbub_inputstream *stream); + uint32_t (*cur_pos)(hubbub_inputstream *stream, size_t *len); + void (*lowercase)(hubbub_inputstream *stream); + void (*uppercase)(hubbub_inputstream *stream); + void (*advance)(hubbub_inputstream *stream); + hubbub_error (*push_back)(hubbub_inputstream *stream, + uint32_t character); + int (*cmp_range_ci)(hubbub_inputstream *stream, uint32_t r1, + uint32_t r2, size_t len); + int (*cmp_range_cs)(hubbub_inputstream *stream, uint32_t r1, + uint32_t r2, size_t len); + int (*cmp_range_ascii)(hubbub_inputstream *stream, + uint32_t off, size_t len, + const char *data, size_t dlen); + hubbub_error 
(*replace_range)(hubbub_inputstream *stream, + uint32_t start, size_t len, uint32_t ucs4); +}; + +/** + * Input stream factory component definition + */ +typedef struct hubbub_streamhandler { + bool (*uses_encoding)(const char *int_enc); + hubbub_inputstream *(*create)(const char *enc, const char *int_enc, + hubbub_alloc alloc, void *pw); +} hubbub_streamhandler; + +/* Notification of stream buffer moving */ +void hubbub_inputstream_buffer_moved(hubbub_inputstream *stream); + +#endif diff --git a/src/input/utf8_stream.c b/src/input/utf8_stream.c new file mode 100644 index 0000000..5d08993 --- /dev/null +++ b/src/input/utf8_stream.c @@ -0,0 +1,567 @@ +/* + * This file is part of Hubbub. + * Licensed under the MIT License, + * http://www.opensource.org/licenses/mit-license.php + * Copyright 2007 John-Mark Bell + */ + +#include +#include + +#include "charset/aliases.h" +#include "charset/detect.h" +#include "input/streamimpl.h" +#include "utils/utf8.h" +#include "utils/utils.h" + +#define BUFFER_CHUNK (4096) + +static bool hubbub_utf8stream_uses_encoding(const char *int_enc); +static hubbub_inputstream *hubbub_utf8stream_create(const char *enc, + const char *int_enc, hubbub_alloc alloc, void *pw); +static void hubbub_utf8stream_destroy(hubbub_inputstream *stream); +static hubbub_error hubbub_utf8stream_append(hubbub_inputstream *stream, + const uint8_t *data, size_t len); +static hubbub_error hubbub_utf8stream_insert(hubbub_inputstream *stream, + const uint8_t *data, size_t len); +static uint32_t hubbub_utf8stream_peek(hubbub_inputstream *stream); +static uint32_t hubbub_utf8stream_cur_pos(hubbub_inputstream *stream, + size_t *len); +static void hubbub_utf8stream_lowercase(hubbub_inputstream *stream); +static void hubbub_utf8stream_uppercase(hubbub_inputstream *stream); +static void hubbub_utf8stream_advance(hubbub_inputstream *stream); +static hubbub_error hubbub_utf8stream_push_back(hubbub_inputstream *stream, + uint32_t character); +static int 
hubbub_utf8stream_compare_range_ci(hubbub_inputstream *stream,
+		uint32_t r1, uint32_t r2, size_t len);
+static int hubbub_utf8stream_compare_range_cs(hubbub_inputstream *stream,
+		uint32_t r1, uint32_t r2, size_t len);
+static int hubbub_utf8stream_compare_range_ascii(hubbub_inputstream *stream,
+		uint32_t off, size_t len, const char *data, size_t dlen);
+static hubbub_error hubbub_utf8stream_replace_range(
+		hubbub_inputstream *stream,
+		uint32_t start, size_t len, uint32_t ucs4);
+
+/**
+ * Determine whether a stream implementation uses an internal encoding
+ *
+ * \param int_enc  The desired encoding
+ * \return true if handled, false otherwise
+ */
+bool hubbub_utf8stream_uses_encoding(const char *int_enc)
+{
+	return (hubbub_mibenum_from_name(int_enc, strlen(int_enc)) ==
+			hubbub_mibenum_from_name("UTF-8", SLEN("UTF-8")));
+}
+
+/**
+ * Create an input stream
+ *
+ * Allocates the stream, an initial BUFFER_CHUNK-sized document buffer and
+ * the charset conversion filter, then installs the UTF-8 implementations
+ * of the stream operations. If \a enc names a recognised charset the
+ * filter is switched to it and the charset source is recorded as
+ * dictated; otherwise the charset is recorded as unknown (MIB enum 0).
+ *
+ * \param enc      Document charset, or NULL if unknown
+ * \param int_enc  Desired encoding of document
+ * \param alloc    Memory (de)allocation function
+ * \param pw       Pointer to client-specific private data (may be NULL)
+ * \return Pointer to stream instance, or NULL on failure
+ */
+hubbub_inputstream *hubbub_utf8stream_create(const char *enc,
+		const char *int_enc, hubbub_alloc alloc, void *pw)
+{
+	hubbub_inputstream *stream;
+
+	if (hubbub_mibenum_from_name(int_enc, strlen(int_enc)) !=
+			hubbub_mibenum_from_name("UTF-8", SLEN("UTF-8")))
+		return NULL;
+
+	stream = alloc(NULL, sizeof(hubbub_inputstream), pw);
+	if (stream == NULL)
+		return NULL;
+
+	stream->buffer = alloc(NULL, BUFFER_CHUNK, pw);
+	if (stream->buffer == NULL) {
+		alloc(stream, 0, pw);
+		return NULL;
+	}
+
+	stream->buffer_len = 0;
+	stream->buffer_alloc = BUFFER_CHUNK;
+
+	stream->cursor = 0;
+
+	stream->had_eof = false;
+
+	stream->input = hubbub_filter_create(int_enc, alloc, pw);
+	if (stream->input == NULL) {
+		alloc(stream->buffer, 0, pw);
+		alloc(stream, 0, pw);
+		return NULL;
+	}
+
+	/* Start from "charset unknown" so that encsrc is always initialised,
+	 * including when enc is given but is not a recognised charset name */
+	stream->mibenum = 0;
+	stream->encsrc = HUBBUB_CHARSET_UNKNOWN;
+
+	if (enc != NULL) {
+		stream->mibenum = hubbub_mibenum_from_name(enc, strlen(enc));
+
+		if (stream->mibenum != 0) {
+			hubbub_error error;
+			hubbub_filter_optparams params;
+
+			params.encoding.name = enc;
+
+			error = hubbub_filter_setopt(stream->input,
+					HUBBUB_FILTER_SET_ENCODING, &params);
+			if (error != HUBBUB_OK && error != HUBBUB_INVALID) {
+				hubbub_filter_destroy(stream->input);
+				alloc(stream->buffer, 0, pw);
+				alloc(stream, 0, pw);
+				return NULL;
+			}
+
+			stream->encsrc = HUBBUB_CHARSET_DICTATED;
+		}
+	}
+
+	stream->destroy = hubbub_utf8stream_destroy;
+	stream->append = hubbub_utf8stream_append;
+	stream->insert = hubbub_utf8stream_insert;
+	stream->peek = hubbub_utf8stream_peek;
+	stream->cur_pos = hubbub_utf8stream_cur_pos;
+	stream->lowercase = hubbub_utf8stream_lowercase;
+	stream->uppercase = hubbub_utf8stream_uppercase;
+	stream->advance = hubbub_utf8stream_advance;
+	stream->push_back = hubbub_utf8stream_push_back;
+	stream->cmp_range_ci = hubbub_utf8stream_compare_range_ci;
+	stream->cmp_range_cs = hubbub_utf8stream_compare_range_cs;
+	stream->cmp_range_ascii = hubbub_utf8stream_compare_range_ascii;
+	stream->replace_range = hubbub_utf8stream_replace_range;
+
+	return stream;
+}
+
+/**
+ * Destroy an input stream
+ *
+ * \param stream  Input stream to destroy
+ */
+void hubbub_utf8stream_destroy(hubbub_inputstream *stream)
+{
+	if (stream->input != NULL) {
+		hubbub_filter_destroy(stream->input);
+	}
+
+	if (stream->buffer != NULL) {
+		stream->alloc(stream->buffer, 0, stream->pw);
+	}
+
+	stream->alloc(stream, 0, stream->pw);
+}
+
+/**
+ * Append data to an input stream
+ *
+ * \param stream  Input stream to append data to
+ * \param data    Data to append (in document charset), or NULL to flag EOF
+ * \param len     Length, in bytes, of data
+ * \return HUBBUB_OK on success, appropriate error otherwise
+ */
+hubbub_error hubbub_utf8stream_append(hubbub_inputstream *stream,
+		const uint8_t *data, size_t len)
+{
+	hubbub_error
error; + uint8_t *base; + size_t space; + + if (data == NULL) { + /* EOF indicated */ + size_t dummy_len = 0; + uint8_t *dummy_data = (uint8_t *) &dummy_len; + + base = stream->buffer + stream->buffer_len; + space = stream->buffer_alloc - stream->buffer_len; + + /* Forcibly flush through any remaining buffered data */ + while ((error = hubbub_filter_process_chunk(stream->input, + (const uint8_t **) &dummy_data, &dummy_len, + &base, &space)) == HUBBUB_NOMEM) { + bool moved = false; + uint8_t *temp = stream->alloc(stream->buffer, + stream->buffer_alloc + BUFFER_CHUNK, + stream->pw); + + if (temp == NULL) { + return HUBBUB_NOMEM; + } + + moved = (temp != stream->buffer); + + stream->buffer = temp; + stream->buffer_len += stream->buffer_alloc - + stream->buffer_len - space; + stream->buffer_alloc += BUFFER_CHUNK; + + base = stream->buffer + stream->buffer_len; + space = stream->buffer_alloc - stream->buffer_len; + + if (moved) + hubbub_inputstream_buffer_moved(stream); + } + + /* And fix up buffer length */ + stream->buffer_len += stream->buffer_alloc - + stream->buffer_len - space; + + stream->had_eof = true; + } else { + /* Normal data chunk */ + + if (stream->mibenum == 0) { + /* Haven't found charset yet; detect it */ + error = hubbub_charset_extract(&data, &len, + &stream->mibenum, &stream->encsrc); + if (error) { + return error; + } + + /* We should always have a charset by now */ + if (stream->mibenum == 0) + abort(); + } + + base = stream->buffer + stream->buffer_len; + space = stream->buffer_alloc - stream->buffer_len; + + /* Convert chunk to UTF-8 */ + while ((error = hubbub_filter_process_chunk(stream->input, + &data, &len, + &base, &space)) == HUBBUB_NOMEM) { + bool moved = false; + uint8_t *temp = stream->alloc(stream->buffer, + stream->buffer_alloc + BUFFER_CHUNK, + stream->pw); + + if (temp == NULL) { + return HUBBUB_NOMEM; + } + + moved = (temp != stream->buffer); + + stream->buffer = temp; + stream->buffer_len += stream->buffer_alloc - + 
stream->buffer_len - space; + stream->buffer_alloc += BUFFER_CHUNK; + + base = stream->buffer + stream->buffer_len; + space = stream->buffer_alloc - stream->buffer_len - + space; + + if (moved) + hubbub_inputstream_buffer_moved(stream); + } + + /* And fix up buffer length */ + stream->buffer_len += stream->buffer_alloc - + stream->buffer_len - space; + } + + return HUBBUB_OK; +} + +/** + * Insert data into stream at current location + * + * \param stream Input stream to insert into + * \param data Data to insert (UTF-8 encoded) + * \param len Length, in bytes, of data + * \return HUBBUB_OK on success, appropriate error otherwise + */ +hubbub_error hubbub_utf8stream_insert(hubbub_inputstream *stream, + const uint8_t *data, size_t len) +{ + size_t space; + uint8_t *curpos; + + space = stream->buffer_alloc - stream->buffer_len; + + /* Need to grow buffer, if there's insufficient space */ + if (space <= len) { + bool moved = false; + uint8_t *temp = stream->alloc(stream->buffer, + stream->buffer_alloc + + ((len + BUFFER_CHUNK - 1) & ~BUFFER_CHUNK) + + BUFFER_CHUNK, + stream->pw); + + if (temp == NULL) + return HUBBUB_NOMEM; + + moved = (temp != stream->buffer); + + stream->buffer = temp; + stream->buffer_alloc += + ((len + BUFFER_CHUNK - 1) & ~BUFFER_CHUNK); + + if (moved) + hubbub_inputstream_buffer_moved(stream); + } + + /* Find the insertion point + * (just before the next character to be read) */ + curpos = stream->buffer + stream->cursor; + + /* Move data above this point up */ + memmove(curpos + len, curpos, stream->buffer_len - stream->cursor); + + /* Copy new data into gap created by memmove */ + memcpy(curpos, data, len); + + /* Fix up buffer length */ + stream->buffer_len += len; + + return HUBBUB_OK; +} + +/** + * Look at the next character in the stream + * + * \param stream Stream to look in + * \return UCS4 (host-endian) character code, or EOF or OOD. 
+ */ +uint32_t hubbub_utf8stream_peek(hubbub_inputstream *stream) +{ + hubbub_error error; + size_t len; + uint32_t ret; + + if (stream->cursor == stream->buffer_len) { + return stream->had_eof ? HUBBUB_INPUTSTREAM_EOF + : HUBBUB_INPUTSTREAM_OOD; + } + + error = hubbub_utf8_to_ucs4(stream->buffer + stream->cursor, + stream->buffer_len - stream->cursor, + &ret, &len); + if (error != HUBBUB_OK && error != HUBBUB_NEEDDATA) + return HUBBUB_INPUTSTREAM_OOD; + + if (error == HUBBUB_NEEDDATA) { + if (stream->had_eof) + return HUBBUB_INPUTSTREAM_EOF; + else + return HUBBUB_INPUTSTREAM_OOD; + } + + return ret; +} + +/** + * Retrieve the byte index and length of the current character in the stream + * + * \param stream Stream to look in + * \param len Pointer to location to receive byte length of character + * \return Byte index of current character from start of stream, + * or (uint32_t) -1 on error + */ +uint32_t hubbub_utf8stream_cur_pos(hubbub_inputstream *stream, + size_t *len) +{ + hubbub_utf8_char_byte_length(stream->buffer + stream->cursor, len); + + return stream->cursor; +} + +/** + * Convert the current character to lower case + * + * \param stream Stream to look in + */ +void hubbub_utf8stream_lowercase(hubbub_inputstream *stream) +{ + if ('A' <= stream->buffer[stream->cursor] && + stream->buffer[stream->cursor] <= 'Z') + stream->buffer[stream->cursor] += 0x0020; +} + +/** + * Convert the current character to upper case + * + * \param stream Stream to look in + */ +void hubbub_utf8stream_uppercase(hubbub_inputstream *stream) +{ + if ('a' <= stream->buffer[stream->cursor] && + stream->buffer[stream->cursor] <= 'z') + stream->buffer[stream->cursor] -= 0x0020; +} + +/** + * Advance the stream's current position + * + * \param stream The stream whose position to advance + */ +void hubbub_utf8stream_advance(hubbub_inputstream *stream) +{ + hubbub_error error; + uint32_t next; + + error = hubbub_utf8_next(stream->buffer, stream->buffer_len, + stream->cursor, &next); + 
+ if (error == HUBBUB_OK) + stream->cursor = next; +} + +/** + * Push a character back onto the stream + * + * \param stream Stream to push back to + * \param character UCS4 (host-endian) codepoint to push back + * \return HUBBUB_OK on success, appropriate error otherwise + * + * Note that this doesn't actually modify the data in the stream. + * It works by ensuring that the character located just before the + * current stream location is the same as ::character. If it is, + * then the stream pointer is moved back. If it is not, then an + * error is returned and the stream pointer remains unmodified. + */ +hubbub_error hubbub_utf8stream_push_back(hubbub_inputstream *stream, + uint32_t character) +{ + hubbub_error error; + uint32_t prev; + uint8_t buf[6]; + size_t len; + + error = hubbub_utf8_prev(stream->buffer, stream->cursor, &prev); + if (error != HUBBUB_OK) + return error; + + error = hubbub_utf8_from_ucs4(character, buf, &len); + if (error != HUBBUB_OK) + return error; + + if ((stream->cursor - prev) != len || + memcmp(stream->buffer + prev, buf, len) != 0) + return HUBBUB_INVALID; + + stream->cursor = prev; + + return HUBBUB_OK; +} + +/** + * Case insensitively compare a pair of ranges in the input stream + * + * \param stream Input stream to look in + * \param r1 Offset of start of first range + * \param r2 Offset of start of second range + * \param len Byte length of ranges + * \return 0 if ranges match, non-zero otherwise + */ +int hubbub_utf8stream_compare_range_ci(hubbub_inputstream *stream, + uint32_t r1, uint32_t r2, size_t len) +{ + return strncasecmp((const char *) (stream->buffer + r1), + (const char *) (stream->buffer + r2), len); +} + +/** + * Case sensitively compare a pair of ranges in the input stream + * + * \param stream Input stream to look in + * \param r1 Offset of start of first range + * \param r2 Offset of start of second range + * \param len Byte length of ranges + * \return 0 if ranges match, non-zero otherwise + */ +int 
hubbub_utf8stream_compare_range_cs(hubbub_inputstream *stream, + uint32_t r1, uint32_t r2, size_t len) +{ + return strncmp((const char *) (stream->buffer + r1), + (const char *) (stream->buffer + r2), len); +} + +/** + * Case sensitively compare a range of input stream against an ASCII string + * + * \param stream Input stream to look in + * \param off Offset of range start + * \param len Byte length of range + * \param data Comparison string + * \param dlen Byte length of comparison string + * \return 0 if match, non-zero otherwise + */ +int hubbub_utf8stream_compare_range_ascii(hubbub_inputstream *stream, + uint32_t off, size_t len, const char *data, size_t dlen) +{ + /* Lengths don't match, so strings don't */ + if (len != dlen) + return 1; /* arbitrary */ + + return strncmp((const char *) (stream->buffer + off), + data, len); +} + +/** + * Replace a range of bytes in the input stream with a single character + * + * \param stream Input stream containing data + * \param start Offset of start of range to replace + * \param len Length (in bytes) of range to replace + * \param ucs4 UCS4 (host endian) encoded replacement character + * \return HUBBUB_OK on success, appropriate error otherwise + */ +hubbub_error hubbub_utf8stream_replace_range(hubbub_inputstream *stream, + uint32_t start, size_t len, uint32_t ucs4) +{ + uint8_t buf[6]; + size_t replen; + int32_t diff; + hubbub_error error; + + /* Get UTF8 version of replacement character */ + error = hubbub_utf8_from_ucs4(ucs4, buf, &replen); + if (error) + return error; + + diff = replen - len; + + if (stream->buffer_len + diff >= stream->buffer_alloc) { + /* Need more buffer space */ + bool moved = false; + uint8_t *temp = stream->alloc(stream->buffer, + stream->buffer_alloc + + ((diff + BUFFER_CHUNK - 1) & ~BUFFER_CHUNK) + + BUFFER_CHUNK, + stream->pw); + + if (temp == NULL) + return HUBBUB_NOMEM; + + moved = (temp != stream->buffer); + + stream->buffer = temp; + stream->buffer_alloc += + ((diff + BUFFER_CHUNK - 1) 
& ~BUFFER_CHUNK); + + if (moved) + hubbub_inputstream_buffer_moved(stream); + } + + /* Move subsequent input to correct location */ + memmove(stream->buffer + start + len + diff, + stream->buffer + start + len, + stream->buffer_len - (start + len)); + + /* And fill the gap with the replacement character */ + memcpy(stream->buffer + start, buf, replen); + + /* Finally, update length */ + stream->buffer_len += diff; + + return HUBBUB_OK; +} + +hubbub_streamhandler utf8stream = { + hubbub_utf8stream_uses_encoding, + hubbub_utf8stream_create +}; diff --git a/src/parser.c b/src/parser.c new file mode 100644 index 0000000..e7a4fe8 --- /dev/null +++ b/src/parser.c @@ -0,0 +1,237 @@ +/* + * This file is part of Hubbub. + * Licensed under the MIT License, + * http://www.opensource.org/licenses/mit-license.php + * Copyright 2007 John-Mark Bell + */ + +#include + +#include "input/inputstream.h" +#include "tokeniser/tokeniser.h" + +/** + * Hubbub parser object + */ +struct hubbub_parser { + hubbub_inputstream *stream; /**< Input stream instance */ + hubbub_tokeniser *tok; /**< Tokeniser instance */ + + hubbub_alloc alloc; /**< Memory (de)allocation function */ + void *pw; /**< Client data */ +}; + +/** + * Create a hubbub parser + * + * \param enc Source document encoding, or NULL to autodetect + * \param int_enc Desired encoding of document + * \param alloc Memory (de)allocation function + * \param pw Pointer to client-specific private data (may be NULL) + * \return Pointer to parser instance, or NULL on error + */ +hubbub_parser *hubbub_parser_create(const char *enc, const char *int_enc, + hubbub_alloc alloc, void *pw) +{ + hubbub_parser *parser; + + if (alloc == NULL) + return NULL; + + parser = alloc(NULL, sizeof(hubbub_parser), pw); + if (parser == NULL) + return NULL; + + parser->stream = hubbub_inputstream_create(enc, int_enc, alloc, pw); + if (parser->stream == NULL) { + alloc(parser, 0, pw); + return NULL; + } + + parser->tok = hubbub_tokeniser_create(parser->stream, 
alloc, pw); + if (parser->tok == NULL) { + hubbub_inputstream_destroy(parser->stream); + alloc(parser, 0, pw); + return NULL; + } + + parser->alloc = alloc; + parser->pw = pw; + + return parser; +} + +/** + * Destroy a hubbub parser + * + * \param parser Parser instance to destroy + */ +void hubbub_parser_destroy(hubbub_parser *parser) +{ + if (parser == NULL) + return; + + hubbub_tokeniser_destroy(parser->tok); + + hubbub_inputstream_destroy(parser->stream); + + parser->alloc(parser, 0, parser->pw); +} + +/** + * Configure a hubbub parser + * + * \param parser Parser instance to configure + * \param type Option to set + * \param params Option-specific parameters + * \return HUBBUB_OK on success, appropriate error otherwise + */ +hubbub_error hubbub_parser_setopt(hubbub_parser *parser, + hubbub_parser_opttype type, + hubbub_parser_optparams *params) +{ + hubbub_tokeniser_opttype toktype; + + if (parser == NULL || params == NULL) + return HUBBUB_BADPARM; + + switch (type) { + case HUBBUB_PARSER_TOKEN_HANDLER: + toktype = HUBBUB_TOKENISER_TOKEN_HANDLER; + break; + case HUBBUB_PARSER_BUFFER_HANDLER: + toktype = HUBBUB_TOKENISER_BUFFER_HANDLER; + break; + case HUBBUB_PARSER_ERROR_HANDLER: + toktype = HUBBUB_TOKENISER_BUFFER_HANDLER; + break; + case HUBBUB_PARSER_CONTENT_MODEL: + toktype = HUBBUB_TOKENISER_CONTENT_MODEL; + break; + } + + return hubbub_tokeniser_setopt(parser->tok, toktype, + (hubbub_tokeniser_optparams *) params); +} + +/** + * Pass a chunk of data to a hubbub parser for parsing + * + * \param parser Parser instance to use + * \param data Data to parse (encoded in the input charset) + * \param len Length, in bytes, of data + * \return HUBBUB_OK on success, appropriate error otherwise + */ +hubbub_error hubbub_parser_parse_chunk(hubbub_parser *parser, + uint8_t *data, size_t len) +{ + hubbub_error error; + + if (parser == NULL || data == NULL) + return HUBBUB_BADPARM; + + error = hubbub_inputstream_append(parser->stream, data, len); + if (error != 
HUBBUB_OK) + return error; + + error = hubbub_tokeniser_run(parser->tok); + if (error != HUBBUB_OK) + return error; + + return HUBBUB_OK; +} + +/** + * Pass a chunk of extraneous data to a hubbub parser for parsing + * + * \param parser Parser instance to use + * \param data Data to parse (encoded in internal charset) + * \param len Length, in byte, of data + * \return HUBBUB_OK on success, appropriate error otherwise + */ +hubbub_error hubbub_parser_parse_extraneous_chunk(hubbub_parser *parser, + uint8_t *data, size_t len) +{ + hubbub_error error; + + /** \todo In some cases, we don't actually want script-inserted + * data to be parsed until later. We'll need some way of flagging + * this through the public API, and the inputstream API will need + * some way of marking the insertion point so that, when the + * tokeniser is run, only the inserted chunk is parsed. */ + + if (parser == NULL || data == NULL) + return HUBBUB_BADPARM; + + error = hubbub_inputstream_insert(parser->stream, data, len); + if (error != HUBBUB_OK) + return error; + + error = hubbub_tokeniser_run(parser->tok); + if (error != HUBBUB_OK) + return error; + + return HUBBUB_OK; +} + +/** + * Inform the parser that the last chunk of data has been parsed + * + * \param parser Parser to inform + * \return HUBBUB_OK on success, appropriate error otherwise + */ +hubbub_error hubbub_parser_completed(hubbub_parser *parser) +{ + hubbub_error error; + + if (parser == NULL) + return HUBBUB_BADPARM; + + error = hubbub_inputstream_append(parser->stream, NULL, 0); + if (error != HUBBUB_OK) + return error; + + error = hubbub_tokeniser_run(parser->tok); + if (error != HUBBUB_OK) + return error; + + return HUBBUB_OK; +} + +/** + * Read the document charset + * + * \param parser Parser instance to query + * \param source Pointer to location to receive charset source + * \return Pointer to charset name (constant; do not free), or NULL if unknown + */ +const char *hubbub_parser_read_charset(hubbub_parser *parser, + 
hubbub_charset_source *source) +{ + if (parser == NULL || source == NULL) + return NULL; + + return hubbub_inputstream_read_charset(parser->stream, source); +} + +/** + * Claim ownership of the document buffer + * + * \param parser Parser whose buffer to claim + * \param buffer Pointer to location to receive buffer pointer + * \param len Pointer to location to receive byte length of buffer + * \return HUBBUB_OK on success, appropriate error otherwise. + * + * Once the buffer has been claimed by a client, the parser disclaims + * all ownership rights (and invalidates any internal references it may have + * to the buffer). Therefore, the only parser call which may be made + * after calling this function is to destroy the parser. + */ +hubbub_error hubbub_parser_claim_buffer(hubbub_parser *parser, + uint8_t **buffer, size_t *len) +{ + if (parser == NULL || buffer == NULL || len == NULL) + return HUBBUB_BADPARM; + + return hubbub_inputstream_claim_buffer(parser->stream, buffer, len); +} diff --git a/src/tokeniser/Makefile b/src/tokeniser/Makefile new file mode 100644 index 0000000..539625f --- /dev/null +++ b/src/tokeniser/Makefile @@ -0,0 +1,53 @@ +# Makefile for libhubbub +# +# Toolchain is exported by top-level makefile +# +# Top-level makefile also exports the following variables: +# +# COMPONENT Name of component +# EXPORT Absolute path of export directory +# TOP Absolute path of source tree root +# +# The top-level makefile requires the following targets to exist: +# +# clean Clean source tree +# debug Create a debug binary +# distclean Fully clean source tree, back to pristine condition +# export Export distributable components to ${EXPORT} +# release Create a release binary +# setup Perform any setup required prior to compilation +# test Execute any test cases + +# Manipulate include paths +CFLAGS += -I$(CURDIR) + +# Objects +OBJS = entities tokeniser + +.PHONY: clean debug distclean export release setup test + +# Targets +release: $(addprefix ../Release/, 
$(addsuffix .o, $(OBJS))) + +debug: $(addprefix ../Debug/, $(addsuffix .o, $(OBJS))) + +clean: + -@${RM} ${RMFLAGS} $(addprefix ../Release/, $(addsuffix .o, ${OBJS})) + -@${RM} ${RMFLAGS} $(addprefix ../Debug/, $(addsuffix .o, ${OBJS})) + +distclean: + +setup: + +export: + +test: + +# Pattern rules +../Release/%.o: %.c + @${ECHO} ${ECHOFLAGS} "==> $<" + @${CC} -c ${CFLAGS} -DNDEBUG -o $@ $< + +../Debug/%.o: %.c + @${ECHO} ${ECHOFLAGS} "==> $<" + @${CC} -c -g ${CFLAGS} -o $@ $< diff --git a/src/tokeniser/entities.c b/src/tokeniser/entities.c new file mode 100644 index 0000000..8a9acf5 --- /dev/null +++ b/src/tokeniser/entities.c @@ -0,0 +1,363 @@ +/* + * This file is part of Hubbub. + * Licensed under the MIT License, + * http://www.opensource.org/licenses/mit-license.php + * Copyright 2007 John-Mark Bell + */ + +#include "utils/dict.h" +#include "utils/utils.h" +#include "tokeniser/entities.h" + +typedef struct hubbub_entity hubbub_entity; + +static const struct hubbub_entity { + const char *name; + uint32_t ucs4; +} entities[] = { + { "AElig", 0x00C6 }, + { "Aacute", 0x00C1 }, + { "Acirc", 0x00C2 }, + { "Agrave", 0x00C0 }, + { "Alpha", 0x0391 }, + { "Aring", 0x00C5 }, + { "Atilde", 0x00C3 }, + { "Auml", 0x00C4 }, + { "Beta", 0x0392 }, + { "Ccedil", 0x00C7 }, + { "Chi", 0x03A7 }, + { "Dagger", 0x2021 }, + { "Delta", 0x0394 }, + { "ETH", 0x00D0 }, + { "Eacute", 0x00C9 }, + { "Ecirc", 0x00CA }, + { "Egrave", 0x00C8 }, + { "Epsilon", 0x0395 }, + { "Eta", 0x0397 }, + { "Euml", 0x00CB }, + { "Gamma", 0x0393 }, + { "Iacute", 0x00CD }, + { "Icirc", 0x00CE }, + { "Igrave", 0x00CC }, + { "Iota", 0x0399 }, + { "Iuml", 0x00CF }, + { "Kappa", 0x039A }, + { "Lambda", 0x039B }, + { "Mu", 0x039C }, + { "Ntilde", 0x00D1 }, + { "Nu", 0x039D }, + { "OElig", 0x0152 }, + { "Oacute", 0x00D3 }, + { "Ocirc", 0x00D4 }, + { "Ograve", 0x00D2 }, + { "Omega", 0x03A9 }, + { "Omicron", 0x039F }, + { "Oslash", 0x00D8 }, + { "Otilde", 0x00D5 }, + { "Ouml", 0x00D6 }, + { "Phi", 0x03A6 }, + { "Pi", 
0x03A0 }, + { "Prime", 0x2033 }, + { "Psi", 0x03A8 }, + { "Rho", 0x03A1 }, + { "Scaron", 0x0160 }, + { "Sigma", 0x03A3 }, + { "THORN", 0x00DE }, + { "Tau", 0x03A4 }, + { "Theta", 0x0398 }, + { "Uacute", 0x00DA }, + { "Ucirc", 0x00DB }, + { "Ugrave", 0x00D9 }, + { "Upsilon", 0x03A5 }, + { "Uuml", 0x00DC }, + { "Xi", 0x039E }, + { "Yacute", 0x00DD }, + { "Yuml", 0x0178 }, + { "Zeta", 0x0396 }, + { "aacute", 0x00E1 }, + { "acirc", 0x00E2 }, + { "acute", 0x00B4 }, + { "aelig", 0x00E6 }, + { "agrave", 0x00E0 }, + { "alefsym", 0x2135 }, + { "alpha", 0x03B1 }, + { "amp", 0x0026 }, + { "AMP", 0x0026 }, + { "and", 0x2227 }, + { "ang", 0x2220 }, + { "apos", 0x0027 }, + { "aring", 0x00E5 }, + { "asymp", 0x2248 }, + { "atilde", 0x00E3 }, + { "auml", 0x00E4 }, + { "bdquo", 0x201E }, + { "beta", 0x03B2 }, + { "brvbar", 0x00A6 }, + { "bull", 0x2022 }, + { "cap", 0x2229 }, + { "ccedil", 0x00E7 }, + { "cedil", 0x00B8 }, + { "cent", 0x00A2 }, + { "chi", 0x03C7 }, + { "circ", 0x02C6 }, + { "clubs", 0x2663 }, + { "cong", 0x2245 }, + { "copy", 0x00A9 }, + { "COPY", 0x00A9 }, + { "crarr", 0x21B5 }, + { "cup", 0x222A }, + { "curren", 0x00A4 }, + { "dArr", 0x21D3 }, + { "dagger", 0x2020 }, + { "darr", 0x2193 }, + { "deg", 0x00B0 }, + { "delta", 0x03B4 }, + { "diams", 0x2666 }, + { "divide", 0x00F7 }, + { "eacute", 0x00E9 }, + { "ecirc", 0x00EA }, + { "egrave", 0x00E8 }, + { "empty", 0x2205 }, + { "emsp", 0x2003 }, + { "ensp", 0x2002 }, + { "epsilon", 0x03B5 }, + { "equiv", 0x2261 }, + { "eta", 0x03B7 }, + { "eth", 0x00F0 }, + { "euml", 0x00EB }, + { "euro", 0x20AC }, + { "exist", 0x2203 }, + { "fnof", 0x0192 }, + { "forall", 0x2200 }, + { "frac12", 0x00BD }, + { "frac14", 0x00BC }, + { "frac34", 0x00BE }, + { "frasl", 0x2044 }, + { "gamma", 0x03B3 }, + { "ge", 0x2265 }, + { "gt", 0x003E }, + { "GT", 0x003E }, + { "hArr", 0x21D4 }, + { "harr", 0x2194 }, + { "hearts", 0x2665 }, + { "hellip", 0x2026 }, + { "iacute", 0x00ED }, + { "icirc", 0x00EE }, + { "iexcl", 0x00A1 }, + { "igrave", 0x00EC 
}, + { "image", 0x2111 }, + { "infin", 0x221E }, + { "int", 0x222B }, + { "iota", 0x03B9 }, + { "iquest", 0x00BF }, + { "isin", 0x2208 }, + { "iuml", 0x00EF }, + { "kappa", 0x03BA }, + { "lArr", 0x21D0 }, + { "lambda", 0x03BB }, + { "lang", 0x2329 }, + { "laquo", 0x00AB }, + { "larr", 0x2190 }, + { "lceil", 0x2308 }, + { "ldquo", 0x201C }, + { "le", 0x2264 }, + { "lfloor", 0x230A }, + { "lowast", 0x2217 }, + { "loz", 0x25CA }, + { "lrm", 0x200E }, + { "lsaquo", 0x2039 }, + { "lsquo", 0x2018 }, + { "lt", 0x003C }, + { "LT", 0x003C }, + { "macr", 0x00AF }, + { "mdash", 0x2014 }, + { "micro", 0x00B5 }, + { "middot", 0x00B7 }, + { "minus", 0x2212 }, + { "mu", 0x03BC }, + { "nabla", 0x2207 }, + { "nbsp", 0x00A0 }, + { "ndash", 0x2013 }, + { "ne", 0x2260 }, + { "ni", 0x220B }, + { "not", 0x00AC }, + { "notin", 0x2209 }, + { "nsub", 0x2284 }, + { "ntilde", 0x00F1 }, + { "nu", 0x03BD }, + { "oacute", 0x00F3 }, + { "ocirc", 0x00F4 }, + { "oelig", 0x0153 }, + { "ograve", 0x00F2 }, + { "oline", 0x203E }, + { "omega", 0x03C9 }, + { "omicron", 0x03BF }, + { "oplus", 0x2295 }, + { "or", 0x2228 }, + { "ordf", 0x00AA }, + { "ordm", 0x00BA }, + { "oslash", 0x00F8 }, + { "otilde", 0x00F5 }, + { "otimes", 0x2297 }, + { "ouml", 0x00F6 }, + { "para", 0x00B6 }, + { "part", 0x2202 }, + { "permil", 0x2030 }, + { "perp", 0x22A5 }, + { "phi", 0x03C6 }, + { "pi", 0x03C0 }, + { "piv", 0x03D6 }, + { "plusmn", 0x00B1 }, + { "pound", 0x00A3 }, + { "prime", 0x2032 }, + { "prod", 0x220F }, + { "prop", 0x221D }, + { "psi", 0x03C8 }, + { "quot", 0x0022 }, + { "QUOT", 0x0022 }, + { "rArr", 0x21D2 }, + { "radic", 0x221A }, + { "rang", 0x232A }, + { "raquo", 0x00BB }, + { "rarr", 0x2192 }, + { "rceil", 0x2309 }, + { "rdquo", 0x201D }, + { "real", 0x211C }, + { "reg", 0x00AE }, + { "REG", 0x00AE }, + { "rfloor", 0x230B }, + { "rho", 0x03C1 }, + { "rlm", 0x200F }, + { "rsaquo", 0x203A }, + { "rsquo", 0x2019 }, + { "sbquo", 0x201A }, + { "scaron", 0x0161 }, + { "sdot", 0x22C5 }, + { "sect", 0x00A7 }, + { 
"shy", 0x00AD }, + { "sigma", 0x03C3 }, + { "sigmaf", 0x03C2 }, + { "sim", 0x223C }, + { "spades", 0x2660 }, + { "sub", 0x2282 }, + { "sube", 0x2286 }, + { "sum", 0x2211 }, + { "sup", 0x2283 }, + { "sup1", 0x00B9 }, + { "sup2", 0x00B2 }, + { "sup3", 0x00B3 }, + { "supe", 0x2287 }, + { "szlig", 0x00DF }, + { "tau", 0x03C4 }, + { "there4", 0x2234 }, + { "theta", 0x03B8 }, + { "thetasym", 0x03D1 }, + { "thinsp", 0x2009 }, + { "thorn", 0x00FE }, + { "tilde", 0x02DC }, + { "times", 0x00D7 }, + { "trade", 0x2122 }, + { "uArr", 0x21D1 }, + { "uacute", 0x00FA }, + { "uarr", 0x2191 }, + { "ucirc", 0x00FB }, + { "ugrave", 0x00F9 }, + { "uml", 0x00A8 }, + { "upsih", 0x03D2 }, + { "upsilon", 0x03C5 }, + { "uuml", 0x00FC }, + { "weierp", 0x2118 }, + { "xi", 0x03BE }, + { "yacute", 0x00FD }, + { "yen", 0x00A5 }, + { "yuml", 0x00FF }, + { "zeta", 0x03B6 }, + { "zwj", 0x200D }, + { "zwnj", 0x200C }, +}; + +static hubbub_dict *dict; + +/** + * Create the entities dictionary + * + * \param alloc Memory (de)allocation function + * \param pw Pointer to client-specific private data (may be NULL) + * \return HUBBUB_OK on success, appropriate error otherwise + */ +hubbub_error hubbub_entities_create(hubbub_alloc alloc, void *pw) +{ + hubbub_error error; + size_t i; + + if (alloc == NULL) + return HUBBUB_BADPARM; + + dict = hubbub_dict_create(alloc, pw); + if (dict == NULL) + return HUBBUB_NOMEM; + + for (i = 0; i < sizeof(entities) / sizeof(entities[0]); i++) { + error = hubbub_dict_insert(dict, entities[i].name, + &entities[i]); + if (error != HUBBUB_OK) { + hubbub_dict_destroy(dict); + return error; + } + } + + return HUBBUB_OK; +} + +/** + * Destroy the entities dictionary + * + * \param alloc Memory (de)allocation function + * \param pw Pointer to client-specific private data (may be NULL) + */ +void hubbub_entities_destroy(hubbub_alloc alloc, void *pw) +{ + UNUSED(alloc); + UNUSED(pw); + + hubbub_dict_destroy(dict); +} + +/** + * Step-wise search for an entity in the dictionary + * 
+ * \param c Character to look for + * \param result Pointer to location for result + * \param context Pointer to location for search context + * \return HUBBUB_OK if key found, + * HUBBUB_NEEDDATA if more steps are required + * HUBBUB_INVALID if nothing matches + * + * The value pointed to by ::context should be NULL for the first call. + * Thereafter, pass in the same value as returned by the previous call. + * The context is opaque to the caller and should not be inspected. + * + * The location pointed to by ::result will be set to U+FFFD unless a match + * is found. + */ +hubbub_error hubbub_entities_search_step(uint8_t c, uint32_t *result, + void **context) +{ + const hubbub_entity *e; + hubbub_error error; + + if (result == NULL || context == NULL) + return HUBBUB_BADPARM; + + error = hubbub_dict_search_step(dict, c, + (const void **) (const void *) &e, + context); + if (error != HUBBUB_OK) { + *result = 0xFFFD; + return error; + } + + *result = e->ucs4; + + return HUBBUB_OK; +} diff --git a/src/tokeniser/entities.h b/src/tokeniser/entities.h new file mode 100644 index 0000000..efd1987 --- /dev/null +++ b/src/tokeniser/entities.h @@ -0,0 +1,25 @@ +/* + * This file is part of Hubbub. 
+ * Licensed under the MIT License, + * http://www.opensource.org/licenses/mit-license.php + * Copyright 2007 John-Mark Bell + */ + +#ifndef hubbub_tokeniser_entities_h_ +#define hubbub_tokeniser_entities_h_ + +#include + +#include +#include + +/* Create the entities dictionary */ +hubbub_error hubbub_entities_create(hubbub_alloc alloc, void *pw); +/* Destroy the entities dictionary */ +void hubbub_entities_destroy(hubbub_alloc alloc, void *pw); + +/* Step-wise search for an entity in the dictionary */ +hubbub_error hubbub_entities_search_step(uint8_t c, uint32_t *result, + void **context); + +#endif diff --git a/src/tokeniser/tokeniser.c b/src/tokeniser/tokeniser.c new file mode 100644 index 0000000..f8b6bb3 --- /dev/null +++ b/src/tokeniser/tokeniser.c @@ -0,0 +1,2282 @@ +/* + * This file is part of Hubbub. + * Licensed under the MIT License, + * http://www.opensource.org/licenses/mit-license.php + * Copyright 2007 John-Mark Bell + */ + +#include +#include + +#include "utils/utils.h" + +#include "tokeniser/entities.h" +#include "tokeniser/tokeniser.h" + +/** + * Table of mappings between Windows-1252 codepoints 128-159 and UCS4 + */ +static const uint32_t cp1252Table[32] = { + 0x20AC, 0xFFFD, 0x201A, 0x0192, 0x201E, 0x2026, 0x2020, 0x2021, + 0x02C6, 0x2030, 0x0160, 0x2039, 0x0152, 0xFFFD, 0x017D, 0xFFFD, + 0xFFFD, 0x2018, 0x2019, 0x201C, 0x201D, 0x2022, 0x2013, 0x2014, + 0x02DC, 0x2122, 0x0161, 0x203A, 0x0153, 0xFFFD, 0x017E, 0x0178 +}; + +/** + * Tokeniser states + */ +typedef enum hubbub_tokeniser_state { + HUBBUB_TOKENISER_STATE_DATA, + HUBBUB_TOKENISER_STATE_ENTITY_DATA, + HUBBUB_TOKENISER_STATE_TAG_OPEN, + HUBBUB_TOKENISER_STATE_CLOSE_TAG_OPEN, + HUBBUB_TOKENISER_STATE_CLOSE_TAG_MATCH, + HUBBUB_TOKENISER_STATE_TAG_NAME, + HUBBUB_TOKENISER_STATE_BEFORE_ATTRIBUTE_NAME, + HUBBUB_TOKENISER_STATE_ATTRIBUTE_NAME, + HUBBUB_TOKENISER_STATE_AFTER_ATTRIBUTE_NAME, + HUBBUB_TOKENISER_STATE_BEFORE_ATTRIBUTE_VALUE, + HUBBUB_TOKENISER_STATE_ATTRIBUTE_VALUE_DQ, + 
HUBBUB_TOKENISER_STATE_ATTRIBUTE_VALUE_SQ, + HUBBUB_TOKENISER_STATE_ATTRIBUTE_VALUE_UQ, + HUBBUB_TOKENISER_STATE_ENTITY_IN_ATTRIBUTE_VALUE, + HUBBUB_TOKENISER_STATE_BOGUS_COMMENT, + HUBBUB_TOKENISER_STATE_MARKUP_DECLARATION_OPEN, + HUBBUB_TOKENISER_STATE_COMMENT_START, + HUBBUB_TOKENISER_STATE_COMMENT, + HUBBUB_TOKENISER_STATE_COMMENT_DASH, + HUBBUB_TOKENISER_STATE_COMMENT_END, + HUBBUB_TOKENISER_STATE_MATCH_DOCTYPE, + HUBBUB_TOKENISER_STATE_DOCTYPE, + HUBBUB_TOKENISER_STATE_BEFORE_DOCTYPE_NAME, + HUBBUB_TOKENISER_STATE_DOCTYPE_NAME, + HUBBUB_TOKENISER_STATE_AFTER_DOCTYPE_NAME, + HUBBUB_TOKENISER_STATE_BOGUS_DOCTYPE, + HUBBUB_TOKENISER_STATE_NUMBERED_ENTITY, + HUBBUB_TOKENISER_STATE_NAMED_ENTITY +} hubbub_tokeniser_state; + +/** + * Context for tokeniser + */ +typedef struct hubbub_tokeniser_context { + hubbub_token_type current_tag_type; /**< Type of current_tag */ + hubbub_tag current_tag; /**< Current tag */ + + hubbub_string current_comment; /**< Current comment */ + + hubbub_doctype current_doctype; /**< Current doctype */ + + hubbub_string current_chars; /**< Pending characters */ + + hubbub_tokeniser_state prev_state; /**< Previous state */ + + struct { + hubbub_string tag; /**< Pending close tag */ + } close_tag_match; + + struct { + uint32_t count; /**< Index into "DOCTYPE" */ + } match_doctype; + + struct { + hubbub_string str; /**< Pending string */ + uint8_t base; /**< Base for numeric + * entities */ + uint32_t codepoint; /**< UCS4 codepoint */ + bool had_data; /**< Whether we read + * anything after &#(x)? 
*/ + hubbub_tokeniser_state return_state; /**< State we were + * called from */ + bool complete; /**< Flag that entity + * matching completed */ + bool done_setup; /**< Flag that match setup + * has completed */ + void *context; /**< Context for named + * entity search */ + size_t prev_len; /**< Previous byte length + * of str */ + } match_entity; + + struct { + uint32_t line; /**< Current line of input */ + uint32_t col; /**< Current character in + * line */ + } position; +} hubbub_tokeniser_context; + +/** + * Tokeniser data structure + */ +struct hubbub_tokeniser { + hubbub_tokeniser_state state; /**< Current tokeniser state */ + hubbub_content_model content_model; /**< Current content + * model flag */ + + hubbub_inputstream *input; /**< Input stream */ + + const uint8_t *input_buffer; /**< Start of input stream's buffer */ + size_t input_buffer_len; /**< Length of input buffer */ + + hubbub_tokeniser_context context; /**< Tokeniser context */ + + hubbub_token_handler token_handler; + void *token_pw; + + hubbub_buffer_handler buffer_handler; + void *buffer_pw; + + hubbub_error_handler error_handler; + void *error_pw; + + hubbub_alloc alloc; /**< Memory (de)allocation function */ + void *alloc_pw; /**< Client private data */ +}; + +static bool hubbub_tokeniser_handle_data(hubbub_tokeniser *tokeniser); +static bool hubbub_tokeniser_handle_entity_data(hubbub_tokeniser *tokeniser); +static bool hubbub_tokeniser_handle_tag_open(hubbub_tokeniser *tokeniser); +static bool hubbub_tokeniser_handle_close_tag_open( + hubbub_tokeniser *tokeniser); +static bool hubbub_tokeniser_handle_close_tag_match( + hubbub_tokeniser *tokeniser); +static bool hubbub_tokeniser_handle_tag_name(hubbub_tokeniser *tokeniser); +static bool hubbub_tokeniser_handle_before_attribute_name( + hubbub_tokeniser *tokeniser); +static bool hubbub_tokeniser_handle_attribute_name( + hubbub_tokeniser *tokeniser); +static bool hubbub_tokeniser_handle_after_attribute_name( + hubbub_tokeniser *tokeniser); 
+static bool hubbub_tokeniser_handle_before_attribute_value( + hubbub_tokeniser *tokeniser); +static bool hubbub_tokeniser_handle_attribute_value_dq( + hubbub_tokeniser *tokeniser); +static bool hubbub_tokeniser_handle_attribute_value_sq( + hubbub_tokeniser *tokeniser); +static bool hubbub_tokeniser_handle_attribute_value_uq( + hubbub_tokeniser *tokeniser); +static bool hubbub_tokeniser_handle_entity_in_attribute_value( + hubbub_tokeniser *tokeniser); +static bool hubbub_tokeniser_handle_bogus_comment( + hubbub_tokeniser *tokeniser); +static bool hubbub_tokeniser_handle_markup_declaration_open( + hubbub_tokeniser *tokeniser); +static bool hubbub_tokeniser_handle_comment_start( + hubbub_tokeniser *tokeniser); +static bool hubbub_tokeniser_handle_comment(hubbub_tokeniser *tokeniser); +static bool hubbub_tokeniser_handle_comment_dash( + hubbub_tokeniser *tokeniser); +static bool hubbub_tokeniser_handle_comment_end(hubbub_tokeniser *tokeniser); +static bool hubbub_tokeniser_handle_match_doctype( + hubbub_tokeniser *tokeniser); +static bool hubbub_tokeniser_handle_doctype(hubbub_tokeniser *tokeniser); +static bool hubbub_tokeniser_handle_before_doctype_name( + hubbub_tokeniser *tokeniser); +static bool hubbub_tokeniser_handle_doctype_name( + hubbub_tokeniser *tokeniser); +static bool hubbub_tokeniser_handle_after_doctype_name( + hubbub_tokeniser *tokeniser); +static bool hubbub_tokeniser_handle_bogus_doctype( + hubbub_tokeniser *tokeniser); +static bool hubbub_tokeniser_consume_entity(hubbub_tokeniser *tokeniser); +static bool hubbub_tokeniser_handle_numbered_entity( + hubbub_tokeniser *tokeniser); +static bool hubbub_tokeniser_handle_named_entity( + hubbub_tokeniser *tokeniser); +static void hubbub_tokeniser_buffer_moved_handler(const uint8_t *buffer, + size_t len, void *pw); +static void hubbub_tokeniser_emit_token(hubbub_tokeniser *tokeniser, + hubbub_token *token); + +/** + * Create a hubbub tokeniser + * + * \param input Input stream instance + * \param alloc 
Memory (de)allocation function + * \param pw Pointer to client-specific private data (may be NULL) + * \return Pointer to tokeniser instance, or NULL on failure + */ +hubbub_tokeniser *hubbub_tokeniser_create(hubbub_inputstream *input, + hubbub_alloc alloc, void *pw) +{ + hubbub_tokeniser *tok; + + if (input == NULL || alloc == NULL) + return NULL; + + tok = alloc(NULL, sizeof(hubbub_tokeniser), pw); + if (tok == NULL) + return NULL; + + tok->state = HUBBUB_TOKENISER_STATE_DATA; + tok->content_model = HUBBUB_CONTENT_MODEL_PCDATA; + + tok->input = input; + tok->input_buffer = NULL; + tok->input_buffer_len = 0; + + tok->token_handler = NULL; + tok->token_pw = NULL; + + tok->buffer_handler = NULL; + tok->buffer_pw = NULL; + + tok->error_handler = NULL; + tok->error_pw = NULL; + + tok->alloc = alloc; + tok->alloc_pw = pw; + + if (hubbub_inputstream_register_movehandler(input, + hubbub_tokeniser_buffer_moved_handler, tok) != + HUBBUB_OK) { + alloc(tok, 0, pw); + return NULL; + } + + memset(&tok->context, 0, sizeof(hubbub_tokeniser_context)); + + return tok; +} + +/** + * Destroy a hubbub tokeniser + * + * \param tokeniser The tokeniser instance to destroy + */ +void hubbub_tokeniser_destroy(hubbub_tokeniser *tokeniser) +{ + if (tokeniser == NULL) + return; + + hubbub_inputstream_deregister_movehandler(tokeniser->input, + hubbub_tokeniser_buffer_moved_handler, tokeniser); + + if (tokeniser->context.current_tag.attributes != NULL) { + tokeniser->alloc(tokeniser->context.current_tag.attributes, + 0, tokeniser->alloc_pw); + } + + tokeniser->alloc(tokeniser, 0, tokeniser->alloc_pw); +} + +/** + * Configure a hubbub tokeniser + * + * \param tokeniser The tokeniser instance to configure + * \param type The option type to set + * \param params Option-specific parameters + * \return HUBBUB_OK on success, appropriate error otherwise + */ +hubbub_error hubbub_tokeniser_setopt(hubbub_tokeniser *tokeniser, + hubbub_tokeniser_opttype type, + hubbub_tokeniser_optparams *params) +{ + 
if (tokeniser == NULL || params == NULL) + return HUBBUB_BADPARM; + + switch (type) { + case HUBBUB_TOKENISER_TOKEN_HANDLER: + tokeniser->token_handler = params->token_handler.handler; + tokeniser->token_pw = params->token_handler.pw; + break; + case HUBBUB_TOKENISER_BUFFER_HANDLER: + tokeniser->buffer_handler = params->buffer_handler.handler; + tokeniser->buffer_pw = params->buffer_handler.pw; + tokeniser->buffer_handler(tokeniser->input_buffer, + tokeniser->input_buffer_len, + tokeniser->buffer_pw); + break; + case HUBBUB_TOKENISER_ERROR_HANDLER: + tokeniser->error_handler = params->error_handler.handler; + tokeniser->error_pw = params->error_handler.pw; + break; + case HUBBUB_TOKENISER_CONTENT_MODEL: + tokeniser->content_model = params->content_model.model; + break; + } + + return HUBBUB_OK; +} + +/** + * Process remaining data in the input stream + * + * \param tokeniser The tokeniser instance to invoke + * \return HUBBUB_OK on success, appropriate error otherwise + */ +hubbub_error hubbub_tokeniser_run(hubbub_tokeniser *tokeniser) +{ + bool cont = true; + + if (tokeniser == NULL) + return HUBBUB_BADPARM; + + while (cont) { + switch (tokeniser->state) { + case HUBBUB_TOKENISER_STATE_DATA: + cont = hubbub_tokeniser_handle_data(tokeniser); + break; + case HUBBUB_TOKENISER_STATE_ENTITY_DATA: + cont = hubbub_tokeniser_handle_entity_data( + tokeniser); + break; + case HUBBUB_TOKENISER_STATE_TAG_OPEN: + cont = hubbub_tokeniser_handle_tag_open(tokeniser); + break; + case HUBBUB_TOKENISER_STATE_CLOSE_TAG_OPEN: + cont = hubbub_tokeniser_handle_close_tag_open( + tokeniser); + break; + case HUBBUB_TOKENISER_STATE_CLOSE_TAG_MATCH: + cont = hubbub_tokeniser_handle_close_tag_match( + tokeniser); + break; + case HUBBUB_TOKENISER_STATE_TAG_NAME: + cont = hubbub_tokeniser_handle_tag_name(tokeniser); + break; + case HUBBUB_TOKENISER_STATE_BEFORE_ATTRIBUTE_NAME: + cont = hubbub_tokeniser_handle_before_attribute_name( + tokeniser); + break; + case 
HUBBUB_TOKENISER_STATE_ATTRIBUTE_NAME: + cont = hubbub_tokeniser_handle_attribute_name( + tokeniser); + break; + case HUBBUB_TOKENISER_STATE_AFTER_ATTRIBUTE_NAME: + cont = hubbub_tokeniser_handle_after_attribute_name( + tokeniser); + break; + case HUBBUB_TOKENISER_STATE_BEFORE_ATTRIBUTE_VALUE: + cont = hubbub_tokeniser_handle_before_attribute_value( + tokeniser); + break; + case HUBBUB_TOKENISER_STATE_ATTRIBUTE_VALUE_DQ: + cont = hubbub_tokeniser_handle_attribute_value_dq( + tokeniser); + break; + case HUBBUB_TOKENISER_STATE_ATTRIBUTE_VALUE_SQ: + cont = hubbub_tokeniser_handle_attribute_value_sq( + tokeniser); + break; + case HUBBUB_TOKENISER_STATE_ATTRIBUTE_VALUE_UQ: + cont = hubbub_tokeniser_handle_attribute_value_uq( + tokeniser); + break; + case HUBBUB_TOKENISER_STATE_ENTITY_IN_ATTRIBUTE_VALUE: + cont = hubbub_tokeniser_handle_entity_in_attribute_value( + tokeniser); + break; + case HUBBUB_TOKENISER_STATE_BOGUS_COMMENT: + cont = hubbub_tokeniser_handle_bogus_comment( + tokeniser); + break; + case HUBBUB_TOKENISER_STATE_MARKUP_DECLARATION_OPEN: + cont = hubbub_tokeniser_handle_markup_declaration_open( + tokeniser); + break; + case HUBBUB_TOKENISER_STATE_COMMENT_START: + cont = hubbub_tokeniser_handle_comment_start( + tokeniser); + break; + case HUBBUB_TOKENISER_STATE_COMMENT: + cont = hubbub_tokeniser_handle_comment(tokeniser); + break; + case HUBBUB_TOKENISER_STATE_COMMENT_DASH: + cont = hubbub_tokeniser_handle_comment_dash( + tokeniser); + break; + case HUBBUB_TOKENISER_STATE_COMMENT_END: + cont = hubbub_tokeniser_handle_comment_end( + tokeniser); + break; + case HUBBUB_TOKENISER_STATE_MATCH_DOCTYPE: + cont = hubbub_tokeniser_handle_match_doctype( + tokeniser); + break; + case HUBBUB_TOKENISER_STATE_DOCTYPE: + cont = hubbub_tokeniser_handle_doctype(tokeniser); + break; + case HUBBUB_TOKENISER_STATE_BEFORE_DOCTYPE_NAME: + cont = hubbub_tokeniser_handle_before_doctype_name( + tokeniser); + break; + case HUBBUB_TOKENISER_STATE_DOCTYPE_NAME: + cont = 
hubbub_tokeniser_handle_doctype_name( + tokeniser); + break; + case HUBBUB_TOKENISER_STATE_AFTER_DOCTYPE_NAME: + cont = hubbub_tokeniser_handle_after_doctype_name( + tokeniser); + break; + case HUBBUB_TOKENISER_STATE_BOGUS_DOCTYPE: + cont = hubbub_tokeniser_handle_bogus_doctype( + tokeniser); + break; + case HUBBUB_TOKENISER_STATE_NUMBERED_ENTITY: + cont = hubbub_tokeniser_handle_numbered_entity( + tokeniser); + break; + case HUBBUB_TOKENISER_STATE_NAMED_ENTITY: + cont = hubbub_tokeniser_handle_named_entity( + tokeniser); + break; + } + } + + return HUBBUB_OK; +} + +bool hubbub_tokeniser_handle_data(hubbub_tokeniser *tokeniser) +{ + hubbub_token token; + uint32_t c; + + /* Clear current characters */ + tokeniser->context.current_chars.data_off = 0; + tokeniser->context.current_chars.len = 0; + + while ((c = hubbub_inputstream_peek(tokeniser->input)) != + HUBBUB_INPUTSTREAM_EOF && + c != HUBBUB_INPUTSTREAM_OOD) { + if (c == '&' && (tokeniser->content_model == + HUBBUB_CONTENT_MODEL_PCDATA || + tokeniser->content_model == + HUBBUB_CONTENT_MODEL_RCDATA)) { + tokeniser->state = + HUBBUB_TOKENISER_STATE_ENTITY_DATA; + /* Don't eat the '&'; it'll be handled by + * entity consumption */ + break; + } else if (c == '<' && tokeniser->content_model != + HUBBUB_CONTENT_MODEL_PLAINTEXT) { + if (tokeniser->context.current_chars.len > 0) { + /* Emit any pending characters */ + token.type = HUBBUB_TOKEN_CHARACTER; + token.data.character = + tokeniser->context.current_chars; + + hubbub_tokeniser_emit_token(tokeniser, + &token); + } + + /* Buffer '<' */ + tokeniser->context.current_chars.data_off = + hubbub_inputstream_cur_pos(tokeniser->input, + &tokeniser->context.current_chars.len); + + tokeniser->state = HUBBUB_TOKENISER_STATE_TAG_OPEN; + hubbub_inputstream_advance(tokeniser->input); + break; + } else { + uint32_t pos; + size_t len; + + /* Accumulate characters into buffer */ + pos = hubbub_inputstream_cur_pos(tokeniser->input, + &len); + + if 
(tokeniser->context.current_chars.len == 0) { + tokeniser->context.current_chars.data_off = + pos; + } + tokeniser->context.current_chars.len++; + + hubbub_inputstream_advance(tokeniser->input); + } + } + + if (tokeniser->state != HUBBUB_TOKENISER_STATE_TAG_OPEN && + tokeniser->context.current_chars.len > 0) { + /* Emit any pending characters */ + token.type = HUBBUB_TOKEN_CHARACTER; + token.data.character = tokeniser->context.current_chars; + + hubbub_tokeniser_emit_token(tokeniser, &token); + + tokeniser->context.current_chars.data_off = 0; + tokeniser->context.current_chars.len = 0; + } + + if (c == HUBBUB_INPUTSTREAM_EOF) { + token.type = HUBBUB_TOKEN_EOF; + + hubbub_tokeniser_emit_token(tokeniser, &token); + } + + return (c != HUBBUB_INPUTSTREAM_EOF && c != HUBBUB_INPUTSTREAM_OOD); +} + +bool hubbub_tokeniser_handle_entity_data(hubbub_tokeniser *tokeniser) +{ + if (tokeniser->context.match_entity.complete == false) { + return hubbub_tokeniser_consume_entity(tokeniser); + } else { + hubbub_token token; + uint32_t c = hubbub_inputstream_peek(tokeniser->input); + + if (c == HUBBUB_INPUTSTREAM_OOD || + c == HUBBUB_INPUTSTREAM_EOF) { + /* Should never happen */ + abort(); + } + + /* Emit character */ + token.type = HUBBUB_TOKEN_CHARACTER; + token.data.character.data_off = + hubbub_inputstream_cur_pos(tokeniser->input, + &token.data.character.len); + + hubbub_tokeniser_emit_token(tokeniser, &token); + + /* Reset for next time */ + tokeniser->context.match_entity.complete = false; + + tokeniser->state = HUBBUB_TOKENISER_STATE_DATA; + + hubbub_inputstream_advance(tokeniser->input); + } + + return true; +} + +bool hubbub_tokeniser_handle_tag_open(hubbub_tokeniser *tokeniser) +{ + uint32_t c = hubbub_inputstream_peek(tokeniser->input); + hubbub_tag *ctag = &tokeniser->context.current_tag; + uint32_t pos; + size_t len; + + if (c == HUBBUB_INPUTSTREAM_OOD) + return false; + + if (tokeniser->content_model == HUBBUB_CONTENT_MODEL_RCDATA || + tokeniser->content_model == + 
HUBBUB_CONTENT_MODEL_CDATA) { + if (c == '/') { + pos = hubbub_inputstream_cur_pos(tokeniser->input, + &len); + tokeniser->context.current_chars.len += len; + + tokeniser->state = + HUBBUB_TOKENISER_STATE_CLOSE_TAG_OPEN; + + hubbub_inputstream_advance(tokeniser->input); + } else { + hubbub_token token; + + /* Emit '<' */ + token.type = HUBBUB_TOKEN_CHARACTER; + token.data.character = + tokeniser->context.current_chars; + + hubbub_tokeniser_emit_token(tokeniser, &token); + + tokeniser->state = + HUBBUB_TOKENISER_STATE_DATA; + } + } else if (tokeniser->content_model == HUBBUB_CONTENT_MODEL_PCDATA) { + if (c == '!') { + pos = hubbub_inputstream_cur_pos(tokeniser->input, + &len); + + tokeniser->context.current_chars.len += len; + + tokeniser->state = + HUBBUB_TOKENISER_STATE_MARKUP_DECLARATION_OPEN; + hubbub_inputstream_advance(tokeniser->input); + } else if (c == '/') { + pos = hubbub_inputstream_cur_pos(tokeniser->input, + &len); + + tokeniser->context.current_chars.len += len; + + tokeniser->state = + HUBBUB_TOKENISER_STATE_CLOSE_TAG_OPEN; + hubbub_inputstream_advance(tokeniser->input); + } else if ('A' <= c && c <= 'Z') { + hubbub_inputstream_lowercase(tokeniser->input); + + tokeniser->context.current_tag_type = + HUBBUB_TOKEN_START_TAG; + + ctag->name.data_off = + hubbub_inputstream_cur_pos(tokeniser->input, + &ctag->name.len); + ctag->n_attributes = 0; + + tokeniser->state = + HUBBUB_TOKENISER_STATE_TAG_NAME; + hubbub_inputstream_advance(tokeniser->input); + } else if ('a' <= c && c <= 'z') { + tokeniser->context.current_tag_type = + HUBBUB_TOKEN_START_TAG; + + ctag->name.data_off = + hubbub_inputstream_cur_pos(tokeniser->input, + &ctag->name.len); + ctag->n_attributes = 0; + + tokeniser->state = + HUBBUB_TOKENISER_STATE_TAG_NAME; + hubbub_inputstream_advance(tokeniser->input); + } else if (c == '>') { + hubbub_token token; + + pos = hubbub_inputstream_cur_pos(tokeniser->input, + &len); + tokeniser->context.current_chars.len += len; + + /* Emit "<>" */ + 
token.type = HUBBUB_TOKEN_CHARACTER; + token.data.character = + tokeniser->context.current_chars; + + hubbub_tokeniser_emit_token(tokeniser, &token); + + tokeniser->state = + HUBBUB_TOKENISER_STATE_DATA; + + hubbub_inputstream_advance(tokeniser->input); + } else if (c == '?') { + pos = hubbub_inputstream_cur_pos(tokeniser->input, + &len); + tokeniser->context.current_chars.len += len; + + tokeniser->context.current_comment.data_off = pos; + tokeniser->context.current_comment.len = len; + tokeniser->state = + HUBBUB_TOKENISER_STATE_BOGUS_COMMENT; + hubbub_inputstream_advance(tokeniser->input); + } else { + hubbub_token token; + + /* Emit '<' */ + token.type = HUBBUB_TOKEN_CHARACTER; + token.data.character = + tokeniser->context.current_chars; + + hubbub_tokeniser_emit_token(tokeniser, &token); + + tokeniser->state = + HUBBUB_TOKENISER_STATE_DATA; + } + } + + return true; +} + +bool hubbub_tokeniser_handle_close_tag_open(hubbub_tokeniser *tokeniser) +{ + if (tokeniser->content_model == HUBBUB_CONTENT_MODEL_RCDATA || + tokeniser->content_model == + HUBBUB_CONTENT_MODEL_CDATA) { + tokeniser->context.close_tag_match.tag.len = 0; + tokeniser->state = HUBBUB_TOKENISER_STATE_CLOSE_TAG_MATCH; + } else if (tokeniser->content_model == HUBBUB_CONTENT_MODEL_PCDATA) { + hubbub_tag *ctag = &tokeniser->context.current_tag; + uint32_t c = hubbub_inputstream_peek(tokeniser->input); + uint32_t pos; + size_t len; + + if ('A' <= c && c <= 'Z') { + hubbub_inputstream_lowercase(tokeniser->input); + + pos = hubbub_inputstream_cur_pos(tokeniser->input, + &len); + + tokeniser->context.current_tag_type = + HUBBUB_TOKEN_END_TAG; + ctag->name.data_off = pos; + ctag->name.len = len; + ctag->n_attributes = 0; + + tokeniser->state = HUBBUB_TOKENISER_STATE_TAG_NAME; + hubbub_inputstream_advance(tokeniser->input); + } else if ('a' <= c && c <= 'z') { + pos = hubbub_inputstream_cur_pos(tokeniser->input, + &len); + + tokeniser->context.current_tag_type = + HUBBUB_TOKEN_END_TAG; + ctag->name.data_off 
= pos; + ctag->name.len = len; + ctag->n_attributes = 0; + + tokeniser->state = HUBBUB_TOKENISER_STATE_TAG_NAME; + hubbub_inputstream_advance(tokeniser->input); + } else if (c == '>') { + tokeniser->state = HUBBUB_TOKENISER_STATE_DATA; + hubbub_inputstream_advance(tokeniser->input); + } else if (c == HUBBUB_INPUTSTREAM_EOF) { + hubbub_token token; + + /* Emit "context.current_chars; + + hubbub_tokeniser_emit_token(tokeniser, &token); + + tokeniser->state = HUBBUB_TOKENISER_STATE_DATA; + } else if (c != HUBBUB_INPUTSTREAM_OOD) { + pos = hubbub_inputstream_cur_pos(tokeniser->input, + &len); + + tokeniser->context.current_comment.data_off = pos; + tokeniser->context.current_comment.len = len; + + tokeniser->state = + HUBBUB_TOKENISER_STATE_BOGUS_COMMENT; + hubbub_inputstream_advance(tokeniser->input); + } else { + /* Out of data */ + return false; + } + } + + return true; +} + +bool hubbub_tokeniser_handle_close_tag_match(hubbub_tokeniser *tokeniser) +{ + hubbub_tokeniser_context *ctx = &tokeniser->context; + hubbub_tag *ctag = &tokeniser->context.current_tag; + uint32_t c = 0; + + while (ctx->close_tag_match.tag.len < ctag->name.len && + (c = hubbub_inputstream_peek(tokeniser->input)) != + HUBBUB_INPUTSTREAM_EOF && + c != HUBBUB_INPUTSTREAM_OOD) { + /* Match last open tag */ + uint32_t off; + size_t len; + + off = hubbub_inputstream_cur_pos(tokeniser->input, &len); + + if (ctx->close_tag_match.tag.len == 0) { + ctx->close_tag_match.tag.data_off = off; + ctx->close_tag_match.tag.len = len; + } else { + ctx->close_tag_match.tag.len += len; + } + + hubbub_inputstream_advance(tokeniser->input); + + if (ctx->close_tag_match.tag.len > ctag->name.len || + (ctx->close_tag_match.tag.len == ctag->name.len && + hubbub_inputstream_compare_range_ci( + tokeniser->input, + ctag->name.data_off, + ctx->close_tag_match.tag.data_off, + ctag->name.len) != 0)) { + hubbub_token token; + + /* Rewind input stream to start of tag name */ + if (hubbub_inputstream_rewind(tokeniser->input, + 
ctx->close_tag_match.tag.len) != + HUBBUB_OK) + abort(); + + /* Emit "context.current_chars; + + hubbub_tokeniser_emit_token(tokeniser, &token); + + tokeniser->state = HUBBUB_TOKENISER_STATE_DATA; + + return true; + } else if (ctx->close_tag_match.tag.len == ctag->name.len && + hubbub_inputstream_compare_range_ci( + tokeniser->input, + ctag->name.data_off, + ctx->close_tag_match.tag.data_off, + ctag->name.len) == 0) { + /* Matched => stop searching */ + break; + } + } + + if (c == HUBBUB_INPUTSTREAM_OOD) { + /* Need more data */ + return false; + } + + if (c == HUBBUB_INPUTSTREAM_EOF) { + /* Ran out of data - parse error */ + hubbub_token token; + + /* Rewind input stream to start of tag name */ + if (hubbub_inputstream_rewind(tokeniser->input, + ctx->close_tag_match.tag.len) != HUBBUB_OK) + abort(); + + /* Emit "context.current_chars; + + hubbub_tokeniser_emit_token(tokeniser, &token); + + tokeniser->state = HUBBUB_TOKENISER_STATE_DATA; + + return true; + } + + /* Match following char */ + c = hubbub_inputstream_peek(tokeniser->input); + + if (c == HUBBUB_INPUTSTREAM_OOD) { + /* Need more data */ + return false; + } + + /* Rewind input stream to start of tag name */ + if (hubbub_inputstream_rewind(tokeniser->input, + ctx->close_tag_match.tag.len) != HUBBUB_OK) + abort(); + + /* Check that following char was valid */ + if (c != '\t' && c != '\n' && c != '\v' && c != '\f' && + c != ' ' && c != '>' && c != '/' && c != '<' && + c != HUBBUB_INPUTSTREAM_EOF) { + hubbub_token token; + + /* Emit "context.current_chars; + + hubbub_tokeniser_emit_token(tokeniser, &token); + + tokeniser->state = HUBBUB_TOKENISER_STATE_DATA; + + return true; + } + + /* Switch the content model back to PCDATA */ + tokeniser->content_model = HUBBUB_CONTENT_MODEL_PCDATA; + + /* Finally, transition back to close tag open state */ + tokeniser->state = HUBBUB_TOKENISER_STATE_CLOSE_TAG_OPEN; + + return true; +} + +bool hubbub_tokeniser_handle_tag_name(hubbub_tokeniser *tokeniser) +{ + hubbub_tag 
*ctag = &tokeniser->context.current_tag; + uint32_t c = hubbub_inputstream_peek(tokeniser->input); + + if (c == HUBBUB_INPUTSTREAM_OOD) + return false; + + if (c == '\t' || c == '\n' || c == '\v' || c == '\f' || c == ' ') { + tokeniser->state = + HUBBUB_TOKENISER_STATE_BEFORE_ATTRIBUTE_NAME; + hubbub_inputstream_advance(tokeniser->input); + } else if (c == '>') { + hubbub_token token; + + /* Emit current tag */ + token.type = tokeniser->context.current_tag_type; + token.data.tag = tokeniser->context.current_tag; + + hubbub_tokeniser_emit_token(tokeniser, &token); + + tokeniser->state = HUBBUB_TOKENISER_STATE_DATA; + hubbub_inputstream_advance(tokeniser->input); + } else if ('A' <= c && c <= 'Z') { + uint32_t pos; + size_t len; + + hubbub_inputstream_lowercase(tokeniser->input); + + pos = hubbub_inputstream_cur_pos(tokeniser->input, &len); + + ctag->name.len += len; + + hubbub_inputstream_advance(tokeniser->input); + } else if (c == '<' || c == HUBBUB_INPUTSTREAM_EOF) { + hubbub_token token; + + /* Emit current tag */ + token.type = tokeniser->context.current_tag_type; + token.data.tag = tokeniser->context.current_tag; + + hubbub_tokeniser_emit_token(tokeniser, &token); + + tokeniser->state = HUBBUB_TOKENISER_STATE_DATA; + } else if (c == '/') { + /** \todo permitted slash */ + tokeniser->state = + HUBBUB_TOKENISER_STATE_BEFORE_ATTRIBUTE_NAME; + hubbub_inputstream_advance(tokeniser->input); + } else { + uint32_t pos; + size_t len; + + pos = hubbub_inputstream_cur_pos(tokeniser->input, &len); + + ctag->name.len += len; + + hubbub_inputstream_advance(tokeniser->input); + } + + return true; +} + +bool hubbub_tokeniser_handle_before_attribute_name( + hubbub_tokeniser *tokeniser) +{ + hubbub_tag *ctag = &tokeniser->context.current_tag; + uint32_t c = hubbub_inputstream_peek(tokeniser->input); + + if (c == HUBBUB_INPUTSTREAM_OOD) + return false; + + if (c == '\t' || c == '\n' || c == '\v' || c == '\f' || c == ' ') { + hubbub_inputstream_advance(tokeniser->input); + } else 
if (c == '>') { + hubbub_token token; + + /* Emit current tag */ + token.type = tokeniser->context.current_tag_type; + token.data.tag = tokeniser->context.current_tag; + + hubbub_tokeniser_emit_token(tokeniser, &token); + + tokeniser->state = HUBBUB_TOKENISER_STATE_DATA; + hubbub_inputstream_advance(tokeniser->input); + } else if ('A' <= c && c <= 'Z') { + uint32_t pos; + size_t len; + hubbub_attribute *attr; + + hubbub_inputstream_lowercase(tokeniser->input); + + pos = hubbub_inputstream_cur_pos(tokeniser->input, &len); + + attr = tokeniser->alloc(ctag->attributes, + (ctag->n_attributes + 1) * + sizeof(hubbub_attribute), + tokeniser->alloc_pw); + if (attr == NULL) { + /** \todo handle memory exhaustion */ + } + + ctag->attributes = attr; + + attr[ctag->n_attributes].name.data_off = pos; + attr[ctag->n_attributes].name.len = len; + attr[ctag->n_attributes].value.data_off = 0; + attr[ctag->n_attributes].value.len = 0; + + ctag->n_attributes++; + + tokeniser->state = HUBBUB_TOKENISER_STATE_ATTRIBUTE_NAME; + + hubbub_inputstream_advance(tokeniser->input); + } else if (c == '/') { + /** \todo permitted slash */ + hubbub_inputstream_advance(tokeniser->input); + } else if (c == '<' || c == HUBBUB_INPUTSTREAM_EOF) { + hubbub_token token; + + /* Emit current tag */ + token.type = tokeniser->context.current_tag_type; + token.data.tag = tokeniser->context.current_tag; + + hubbub_tokeniser_emit_token(tokeniser, &token); + + tokeniser->state = HUBBUB_TOKENISER_STATE_DATA; + } else { + uint32_t pos; + size_t len; + hubbub_attribute *attr; + + pos = hubbub_inputstream_cur_pos(tokeniser->input, &len); + + attr = tokeniser->alloc(ctag->attributes, + (ctag->n_attributes + 1) * + sizeof(hubbub_attribute), + tokeniser->alloc_pw); + if (attr == NULL) { + /** \todo handle memory exhaustion */ + } + + ctag->attributes = attr; + + attr[ctag->n_attributes].name.data_off = pos; + attr[ctag->n_attributes].name.len = len; + attr[ctag->n_attributes].value.data_off = 0; + 
attr[ctag->n_attributes].value.len = 0; + + ctag->n_attributes++; + + tokeniser->state = HUBBUB_TOKENISER_STATE_ATTRIBUTE_NAME; + + hubbub_inputstream_advance(tokeniser->input); + } + + return true; +} + +bool hubbub_tokeniser_handle_attribute_name(hubbub_tokeniser *tokeniser) +{ + hubbub_tag *ctag = &tokeniser->context.current_tag; + uint32_t c = hubbub_inputstream_peek(tokeniser->input); + + if (c == HUBBUB_INPUTSTREAM_OOD) + return false; + + if (c == '\t' || c == '\n' || c == '\v' || c == '\f' || c == ' ') { + tokeniser->state = + HUBBUB_TOKENISER_STATE_AFTER_ATTRIBUTE_NAME; + hubbub_inputstream_advance(tokeniser->input); + } else if (c == '=') { + tokeniser->state = + HUBBUB_TOKENISER_STATE_BEFORE_ATTRIBUTE_VALUE; + hubbub_inputstream_advance(tokeniser->input); + } else if (c == '>') { + hubbub_token token; + + /* Emit current tag */ + token.type = tokeniser->context.current_tag_type; + token.data.tag = tokeniser->context.current_tag; + + hubbub_tokeniser_emit_token(tokeniser, &token); + + tokeniser->state = HUBBUB_TOKENISER_STATE_DATA; + hubbub_inputstream_advance(tokeniser->input); + } else if ('A' <= c && c <= 'Z') { + uint32_t pos; + size_t len; + + hubbub_inputstream_lowercase(tokeniser->input); + + pos = hubbub_inputstream_cur_pos(tokeniser->input, &len); + + ctag->attributes[ctag->n_attributes - 1].name.len += len; + + hubbub_inputstream_advance(tokeniser->input); + } else if (c == '/') { + /** \todo permitted slash */ + tokeniser->state = + HUBBUB_TOKENISER_STATE_BEFORE_ATTRIBUTE_NAME; + hubbub_inputstream_advance(tokeniser->input); + } else if (c == '<' || c == HUBBUB_INPUTSTREAM_EOF) { + hubbub_token token; + + /* Emit current tag */ + token.type = tokeniser->context.current_tag_type; + token.data.tag = tokeniser->context.current_tag; + + hubbub_tokeniser_emit_token(tokeniser, &token); + + tokeniser->state = HUBBUB_TOKENISER_STATE_DATA; + } else { + uint32_t pos; + size_t len; + + pos = hubbub_inputstream_cur_pos(tokeniser->input, &len); + + 
ctag->attributes[ctag->n_attributes - 1].name.len += len; + + hubbub_inputstream_advance(tokeniser->input); + } + + return true; +} + +bool hubbub_tokeniser_handle_after_attribute_name( + hubbub_tokeniser *tokeniser) +{ + hubbub_tag *ctag = &tokeniser->context.current_tag; + uint32_t c = hubbub_inputstream_peek(tokeniser->input); + + if (c == HUBBUB_INPUTSTREAM_OOD) + return false; + + if (c == '\t' || c == '\n' || c == '\v' || c == '\f' || c == ' ') { + hubbub_inputstream_advance(tokeniser->input); + } else if (c == '=') { + tokeniser->state = + HUBBUB_TOKENISER_STATE_BEFORE_ATTRIBUTE_VALUE; + hubbub_inputstream_advance(tokeniser->input); + } else if (c == '>') { + hubbub_token token; + + /* Emit current tag */ + token.type = tokeniser->context.current_tag_type; + token.data.tag = tokeniser->context.current_tag; + + hubbub_tokeniser_emit_token(tokeniser, &token); + + tokeniser->state = HUBBUB_TOKENISER_STATE_DATA; + hubbub_inputstream_advance(tokeniser->input); + } else if ('A' <= c && c <= 'Z') { + uint32_t pos; + size_t len; + hubbub_attribute *attr; + + hubbub_inputstream_lowercase(tokeniser->input); + + pos = hubbub_inputstream_cur_pos(tokeniser->input, &len); + + attr = tokeniser->alloc(ctag->attributes, + (ctag->n_attributes + 1) * + sizeof(hubbub_attribute), + tokeniser->alloc_pw); + if (attr == NULL) { + /** \todo handle memory exhaustion */ + } + + ctag->attributes = attr; + + attr[ctag->n_attributes].name.data_off = pos; + attr[ctag->n_attributes].name.len = len; + attr[ctag->n_attributes].value.data_off = 0; + attr[ctag->n_attributes].value.len = 0; + + ctag->n_attributes++; + + tokeniser->state = HUBBUB_TOKENISER_STATE_ATTRIBUTE_NAME; + + hubbub_inputstream_advance(tokeniser->input); + } else if (c == '/') { + /** \todo permitted slash */ + tokeniser->state = + HUBBUB_TOKENISER_STATE_BEFORE_ATTRIBUTE_NAME; + hubbub_inputstream_advance(tokeniser->input); + } else if (c == '<' || c == HUBBUB_INPUTSTREAM_EOF) { + hubbub_token token; + + /* Emit current 
tag */ + token.type = tokeniser->context.current_tag_type; + token.data.tag = tokeniser->context.current_tag; + + hubbub_tokeniser_emit_token(tokeniser, &token); + + tokeniser->state = HUBBUB_TOKENISER_STATE_DATA; + } else { + uint32_t pos; + size_t len; + hubbub_attribute *attr; + + hubbub_inputstream_lowercase(tokeniser->input); + + pos = hubbub_inputstream_cur_pos(tokeniser->input, &len); + + attr = tokeniser->alloc(ctag->attributes, + (ctag->n_attributes + 1) * + sizeof(hubbub_attribute), + tokeniser->alloc_pw); + if (attr == NULL) { + /** \todo handle memory exhaustion */ + } + + ctag->attributes = attr; + + attr[ctag->n_attributes].name.data_off = pos; + attr[ctag->n_attributes].name.len = len; + attr[ctag->n_attributes].value.data_off = 0; + attr[ctag->n_attributes].value.len = 0; + + ctag->n_attributes++; + + tokeniser->state = HUBBUB_TOKENISER_STATE_ATTRIBUTE_NAME; + + hubbub_inputstream_advance(tokeniser->input); + } + + return true; +} + +bool hubbub_tokeniser_handle_before_attribute_value( + hubbub_tokeniser *tokeniser) +{ + hubbub_tag *ctag = &tokeniser->context.current_tag; + uint32_t c = hubbub_inputstream_peek(tokeniser->input); + + if (c == HUBBUB_INPUTSTREAM_OOD) + return false; + + if (c == '\t' || c == '\n' || c == '\v' || c == '\f' || c == ' ') { + hubbub_inputstream_advance(tokeniser->input); + } else if (c == '"') { + tokeniser->state = HUBBUB_TOKENISER_STATE_ATTRIBUTE_VALUE_DQ; + hubbub_inputstream_advance(tokeniser->input); + } else if (c == '&') { + tokeniser->state = HUBBUB_TOKENISER_STATE_ATTRIBUTE_VALUE_UQ; + } else if (c == '\'') { + tokeniser->state = HUBBUB_TOKENISER_STATE_ATTRIBUTE_VALUE_SQ; + hubbub_inputstream_advance(tokeniser->input); + } else if (c == '>') { + hubbub_token token; + + /* Emit current tag */ + token.type = tokeniser->context.current_tag_type; + token.data.tag = tokeniser->context.current_tag; + + hubbub_tokeniser_emit_token(tokeniser, &token); + + tokeniser->state = HUBBUB_TOKENISER_STATE_DATA; + 
hubbub_inputstream_advance(tokeniser->input); + } else if (c == '<' || c == HUBBUB_INPUTSTREAM_EOF) { + hubbub_token token; + + /* Emit current tag */ + token.type = tokeniser->context.current_tag_type; + token.data.tag = tokeniser->context.current_tag; + + hubbub_tokeniser_emit_token(tokeniser, &token); + + tokeniser->state = HUBBUB_TOKENISER_STATE_DATA; + } else { + uint32_t pos; + size_t len; + + pos = hubbub_inputstream_cur_pos(tokeniser->input, &len); + + ctag->attributes[ctag->n_attributes - 1].value.data_off = pos; + ctag->attributes[ctag->n_attributes - 1].value.len = len; + + tokeniser->state = HUBBUB_TOKENISER_STATE_ATTRIBUTE_VALUE_UQ; + + hubbub_inputstream_advance(tokeniser->input); + } + + return true; +} + +bool hubbub_tokeniser_handle_attribute_value_dq(hubbub_tokeniser *tokeniser) +{ + hubbub_tag *ctag = &tokeniser->context.current_tag; + uint32_t c = hubbub_inputstream_peek(tokeniser->input); + + if (c == HUBBUB_INPUTSTREAM_OOD) + return false; + + if (c == '"') { + tokeniser->state = + HUBBUB_TOKENISER_STATE_BEFORE_ATTRIBUTE_NAME; + hubbub_inputstream_advance(tokeniser->input); + } else if (c == '&') { + tokeniser->context.prev_state = tokeniser->state; + tokeniser->state = + HUBBUB_TOKENISER_STATE_ENTITY_IN_ATTRIBUTE_VALUE; + /* Don't eat the '&'; entity consumption handles this */ + } else if (c == HUBBUB_INPUTSTREAM_EOF) { + hubbub_token token; + + /* Emit current tag */ + token.type = tokeniser->context.current_tag_type; + token.data.tag = tokeniser->context.current_tag; + + hubbub_tokeniser_emit_token(tokeniser, &token); + + tokeniser->state = HUBBUB_TOKENISER_STATE_DATA; + } else { + uint32_t pos; + size_t len; + + pos = hubbub_inputstream_cur_pos(tokeniser->input, &len); + + if (ctag->attributes[ctag->n_attributes - 1].value.len == 0) { + ctag->attributes[ctag->n_attributes - 1].value.data_off = + pos; + } + + ctag->attributes[ctag->n_attributes - 1].value.len += len; + + hubbub_inputstream_advance(tokeniser->input); + } + + return true; +} 
+ +bool hubbub_tokeniser_handle_attribute_value_sq(hubbub_tokeniser *tokeniser) +{ + hubbub_tag *ctag = &tokeniser->context.current_tag; + uint32_t c = hubbub_inputstream_peek(tokeniser->input); + + if (c == HUBBUB_INPUTSTREAM_OOD) + return false; + + if (c == '\'') { + tokeniser->state = + HUBBUB_TOKENISER_STATE_BEFORE_ATTRIBUTE_NAME; + hubbub_inputstream_advance(tokeniser->input); + } else if (c == '&') { + tokeniser->context.prev_state = tokeniser->state; + tokeniser->state = + HUBBUB_TOKENISER_STATE_ENTITY_IN_ATTRIBUTE_VALUE; + /* Don't eat the '&'; entity consumption handles this */ + } else if (c == HUBBUB_INPUTSTREAM_EOF) { + hubbub_token token; + + /* Emit current tag */ + token.type = tokeniser->context.current_tag_type; + token.data.tag = tokeniser->context.current_tag; + + hubbub_tokeniser_emit_token(tokeniser, &token); + + tokeniser->state = HUBBUB_TOKENISER_STATE_DATA; + } else { + uint32_t pos; + size_t len; + + pos = hubbub_inputstream_cur_pos(tokeniser->input, &len); + + if (ctag->attributes[ctag->n_attributes - 1].value.len == 0) { + ctag->attributes[ctag->n_attributes - 1].value.data_off = + pos; + } + + ctag->attributes[ctag->n_attributes - 1].value.len += len; + + hubbub_inputstream_advance(tokeniser->input); + } + + return true; +} + +bool hubbub_tokeniser_handle_attribute_value_uq(hubbub_tokeniser *tokeniser) +{ + hubbub_tag *ctag = &tokeniser->context.current_tag; + uint32_t c = hubbub_inputstream_peek(tokeniser->input); + + if (c == HUBBUB_INPUTSTREAM_OOD) + return false; + + if (c == '\t' || c == '\n' || c == '\v' || c == '\f' || c == ' ') { + tokeniser->state = + HUBBUB_TOKENISER_STATE_BEFORE_ATTRIBUTE_NAME; + hubbub_inputstream_advance(tokeniser->input); + } else if (c == '&') { + tokeniser->context.prev_state = tokeniser->state; + tokeniser->state = + HUBBUB_TOKENISER_STATE_ENTITY_IN_ATTRIBUTE_VALUE; + /* Don't eat the '&'; entity consumption handles this */ + } else if (c == '>') { + hubbub_token token; + + /* Emit current tag */ + 
token.type = tokeniser->context.current_tag_type; + token.data.tag = tokeniser->context.current_tag; + + hubbub_tokeniser_emit_token(tokeniser, &token); + + tokeniser->state = HUBBUB_TOKENISER_STATE_DATA; + hubbub_inputstream_advance(tokeniser->input); + } else if (c == '<' || c == HUBBUB_INPUTSTREAM_EOF) { + hubbub_token token; + + /* Emit current tag */ + token.type = tokeniser->context.current_tag_type; + token.data.tag = tokeniser->context.current_tag; + + hubbub_tokeniser_emit_token(tokeniser, &token); + + tokeniser->state = HUBBUB_TOKENISER_STATE_DATA; + } else { + uint32_t pos; + size_t len; + + pos = hubbub_inputstream_cur_pos(tokeniser->input, &len); + + if (ctag->attributes[ctag->n_attributes - 1].value.len == 0) { + ctag->attributes[ctag->n_attributes - 1].value.data_off = + pos; + } + + ctag->attributes[ctag->n_attributes - 1].value.len += len; + + hubbub_inputstream_advance(tokeniser->input); + } + + return true; +} + +bool hubbub_tokeniser_handle_entity_in_attribute_value( + hubbub_tokeniser *tokeniser) +{ + hubbub_tag *ctag = &tokeniser->context.current_tag; + uint32_t pos; + size_t len; + + if (tokeniser->context.match_entity.complete == false) { + return hubbub_tokeniser_consume_entity(tokeniser); + } else { + uint32_t c = hubbub_inputstream_peek(tokeniser->input); + + if (c == HUBBUB_INPUTSTREAM_OOD || + c == HUBBUB_INPUTSTREAM_EOF) { + /* Should never happen */ + abort(); + } + + pos = hubbub_inputstream_cur_pos(tokeniser->input, &len); + + if (ctag->attributes[ctag->n_attributes - 1].value.len == 0) { + ctag->attributes[ctag->n_attributes - 1].value.data_off = + pos; + } + + ctag->attributes[ctag->n_attributes - 1].value.len += len; + + /* Reset for next time */ + tokeniser->context.match_entity.complete = false; + + /* And back to the previous state */ + tokeniser->state = tokeniser->context.prev_state; + + hubbub_inputstream_advance(tokeniser->input); + } + + return true; +} + +bool hubbub_tokeniser_handle_bogus_comment(hubbub_tokeniser 
*tokeniser) +{ + hubbub_token token; + uint32_t c; + + while ((c = hubbub_inputstream_peek(tokeniser->input)) != + HUBBUB_INPUTSTREAM_EOF && + c != HUBBUB_INPUTSTREAM_OOD) { + uint32_t pos; + size_t len; + + if (c == '>') { + hubbub_inputstream_advance(tokeniser->input); + break; + } + + pos = hubbub_inputstream_cur_pos(tokeniser->input, &len); + + if (tokeniser->context.current_comment.len == 0) + tokeniser->context.current_comment.data_off = pos; + tokeniser->context.current_comment.len += len; + + hubbub_inputstream_advance(tokeniser->input); + } + + if (c == HUBBUB_INPUTSTREAM_OOD) + return false; + + /* Emit comment */ + token.type = HUBBUB_TOKEN_COMMENT; + token.data.comment = tokeniser->context.current_comment; + + hubbub_tokeniser_emit_token(tokeniser, &token); + + tokeniser->state = HUBBUB_TOKENISER_STATE_DATA; + + return true; +} + +bool hubbub_tokeniser_handle_markup_declaration_open( + hubbub_tokeniser *tokeniser) +{ + uint32_t c = hubbub_inputstream_peek(tokeniser->input); + + if (c == HUBBUB_INPUTSTREAM_OOD) + return false; + + if (c == '-') { + tokeniser->state = HUBBUB_TOKENISER_STATE_COMMENT_START; + hubbub_inputstream_advance(tokeniser->input); + } else if ((c & ~0x20) == 'D') { + hubbub_inputstream_uppercase(tokeniser->input); + tokeniser->context.match_doctype.count = 1; + tokeniser->state = HUBBUB_TOKENISER_STATE_MATCH_DOCTYPE; + hubbub_inputstream_advance(tokeniser->input); + } else { + tokeniser->context.current_comment.data_off = 0; + tokeniser->context.current_comment.len = 0; + + tokeniser->state = HUBBUB_TOKENISER_STATE_BOGUS_COMMENT; + } + + return true; +} + +bool hubbub_tokeniser_handle_comment_start(hubbub_tokeniser *tokeniser) +{ + uint32_t c = hubbub_inputstream_peek(tokeniser->input); + + if (c == HUBBUB_INPUTSTREAM_OOD) + return false; + + tokeniser->context.current_comment.data_off = 0; + tokeniser->context.current_comment.len = 0; + + + if (c == '-') { + tokeniser->state = HUBBUB_TOKENISER_STATE_COMMENT; + 
hubbub_inputstream_advance(tokeniser->input); + } else { + hubbub_inputstream_push_back(tokeniser->input, '-'); + tokeniser->state = HUBBUB_TOKENISER_STATE_BOGUS_COMMENT; + } + + return true; +} + +bool hubbub_tokeniser_handle_comment(hubbub_tokeniser *tokeniser) +{ + uint32_t c = hubbub_inputstream_peek(tokeniser->input); + + if (c == HUBBUB_INPUTSTREAM_OOD) + return false; + + if (c == '-') { + tokeniser->state = HUBBUB_TOKENISER_STATE_COMMENT_DASH; + hubbub_inputstream_advance(tokeniser->input); + } else if (c == HUBBUB_INPUTSTREAM_EOF) { + hubbub_token token; + + /* Emit comment */ + token.type = HUBBUB_TOKEN_COMMENT; + token.data.comment = tokeniser->context.current_comment; + + hubbub_tokeniser_emit_token(tokeniser, &token); + + tokeniser->state = HUBBUB_TOKENISER_STATE_DATA; + } else { + uint32_t pos; + size_t len; + + pos = hubbub_inputstream_cur_pos(tokeniser->input, &len); + + if (tokeniser->context.current_comment.len == 0) + tokeniser->context.current_comment.data_off = pos; + tokeniser->context.current_comment.len += len; + + hubbub_inputstream_advance(tokeniser->input); + } + + return true; +} + +bool hubbub_tokeniser_handle_comment_dash(hubbub_tokeniser *tokeniser) +{ + uint32_t c = hubbub_inputstream_peek(tokeniser->input); + + if (c == HUBBUB_INPUTSTREAM_OOD) + return false; + + if (c == '-') { + tokeniser->state = HUBBUB_TOKENISER_STATE_COMMENT_END; + hubbub_inputstream_advance(tokeniser->input); + } else if (c == HUBBUB_INPUTSTREAM_EOF) { + hubbub_token token; + + /* Emit comment */ + token.type = HUBBUB_TOKEN_COMMENT; + token.data.comment = tokeniser->context.current_comment; + + hubbub_tokeniser_emit_token(tokeniser, &token); + + tokeniser->state = HUBBUB_TOKENISER_STATE_DATA; + } else { + uint32_t pos; + size_t len; + + pos = hubbub_inputstream_cur_pos(tokeniser->input, &len); + + if (tokeniser->context.current_comment.len == 0) { + tokeniser->context.current_comment.data_off = pos; + } else { + /* Need to do this to get length of '-' */ + len 
+= pos - + tokeniser->context.current_comment.data_off; + } + + tokeniser->context.current_comment.len = len; + + tokeniser->state = HUBBUB_TOKENISER_STATE_COMMENT; + + hubbub_inputstream_advance(tokeniser->input); + } + + return true; +} + +bool hubbub_tokeniser_handle_comment_end(hubbub_tokeniser *tokeniser) +{ + uint32_t c = hubbub_inputstream_peek(tokeniser->input); + + if (c == HUBBUB_INPUTSTREAM_OOD) + return false; + + if (c == '>') { + hubbub_token token; + + /* Emit comment */ + token.type = HUBBUB_TOKEN_COMMENT; + token.data.comment = tokeniser->context.current_comment; + + hubbub_tokeniser_emit_token(tokeniser, &token); + + tokeniser->state = HUBBUB_TOKENISER_STATE_DATA; + hubbub_inputstream_advance(tokeniser->input); + } else if (c == '-') { + uint32_t pos; + size_t len; + + pos = hubbub_inputstream_cur_pos(tokeniser->input, &len); + + if (tokeniser->context.current_comment.len == 0) { + tokeniser->context.current_comment.data_off = pos; + tokeniser->context.current_comment.len = len; + } else { + /* Need to do this to get length of '-' */ + len = pos - + tokeniser->context.current_comment.data_off; + } + + tokeniser->context.current_comment.len = len; + + tokeniser->state = HUBBUB_TOKENISER_STATE_COMMENT_END; + hubbub_inputstream_advance(tokeniser->input); + } else if (c == HUBBUB_INPUTSTREAM_EOF) { + hubbub_token token; + + /* Emit comment */ + token.type = HUBBUB_TOKEN_COMMENT; + token.data.comment = tokeniser->context.current_comment; + + hubbub_tokeniser_emit_token(tokeniser, &token); + + tokeniser->state = HUBBUB_TOKENISER_STATE_DATA; + } else { + uint32_t pos; + size_t len; + + pos = hubbub_inputstream_cur_pos(tokeniser->input, &len); + + if (tokeniser->context.current_comment.len == 0) { + tokeniser->context.current_comment.data_off = pos; + } else { + /* Need to do this to get length of '--' */ + len += pos - + tokeniser->context.current_comment.data_off; + } + + tokeniser->context.current_comment.len = len; + + tokeniser->state = 
HUBBUB_TOKENISER_STATE_COMMENT; + + hubbub_inputstream_advance(tokeniser->input); + } + + return true; +} + +bool hubbub_tokeniser_handle_match_doctype(hubbub_tokeniser *tokeniser) +{ + uint32_t c = hubbub_inputstream_peek(tokeniser->input); + + if (c == HUBBUB_INPUTSTREAM_OOD) + return false; + + if (tokeniser->context.match_doctype.count == 1 && + (c & ~0x20) == 'O') { + hubbub_inputstream_uppercase(tokeniser->input); + tokeniser->context.match_doctype.count++; + hubbub_inputstream_advance(tokeniser->input); + } else if (tokeniser->context.match_doctype.count == 2 && + (c & ~0x20) == 'C') { + hubbub_inputstream_uppercase(tokeniser->input); + tokeniser->context.match_doctype.count++; + hubbub_inputstream_advance(tokeniser->input); + } else if (tokeniser->context.match_doctype.count == 3 && + (c & ~0x20) == 'T') { + hubbub_inputstream_uppercase(tokeniser->input); + tokeniser->context.match_doctype.count++; + hubbub_inputstream_advance(tokeniser->input); + } else if (tokeniser->context.match_doctype.count == 4 && + (c & ~0x20) == 'Y') { + hubbub_inputstream_uppercase(tokeniser->input); + tokeniser->context.match_doctype.count++; + hubbub_inputstream_advance(tokeniser->input); + } else if (tokeniser->context.match_doctype.count == 5 && + (c & ~0x20) == 'P') { + hubbub_inputstream_uppercase(tokeniser->input); + tokeniser->context.match_doctype.count++; + hubbub_inputstream_advance(tokeniser->input); + } else if (tokeniser->context.match_doctype.count == 6 && + (c & ~0x20) == 'E') { + hubbub_inputstream_uppercase(tokeniser->input); + tokeniser->state = HUBBUB_TOKENISER_STATE_DOCTYPE; + hubbub_inputstream_advance(tokeniser->input); + } else { + switch (tokeniser->context.match_doctype.count) { + case 6: hubbub_inputstream_push_back(tokeniser->input, 'P'); + case 5: hubbub_inputstream_push_back(tokeniser->input, 'Y'); + case 4: hubbub_inputstream_push_back(tokeniser->input, 'T'); + case 3: hubbub_inputstream_push_back(tokeniser->input, 'C'); + case 2: 
hubbub_inputstream_push_back(tokeniser->input, 'O'); + case 1: hubbub_inputstream_push_back(tokeniser->input, 'D'); + } + + tokeniser->context.current_comment.data_off = 0; + tokeniser->context.current_comment.len = 0; + + tokeniser->state = HUBBUB_TOKENISER_STATE_BOGUS_COMMENT; + } + + return true; +} + +bool hubbub_tokeniser_handle_doctype(hubbub_tokeniser *tokeniser) +{ + uint32_t c = hubbub_inputstream_peek(tokeniser->input); + + if (c == HUBBUB_INPUTSTREAM_OOD) + return false; + + if (c == '\t' || c == '\n' || c == '\v' || c == '\f' || c == ' ') { + hubbub_inputstream_advance(tokeniser->input); + } + + tokeniser->state = HUBBUB_TOKENISER_STATE_BEFORE_DOCTYPE_NAME; + + return true; +} + +bool hubbub_tokeniser_handle_before_doctype_name( + hubbub_tokeniser *tokeniser) +{ + hubbub_doctype *cdoc = &tokeniser->context.current_doctype; + uint32_t c = hubbub_inputstream_peek(tokeniser->input); + + if (c == HUBBUB_INPUTSTREAM_OOD) + return false; + + if (c == '\t' || c == '\n' || c == '\v' || c == '\f' || c == ' ') { + hubbub_inputstream_advance(tokeniser->input); + } else if ('a' <= c && c <= 'z') { + uint32_t pos; + size_t len; + + hubbub_inputstream_uppercase(tokeniser->input); + + pos = hubbub_inputstream_cur_pos(tokeniser->input, &len); + + cdoc->name.data_off = pos; + cdoc->name.len = len; + cdoc->correct = false; + + tokeniser->state = HUBBUB_TOKENISER_STATE_DOCTYPE_NAME; + + hubbub_inputstream_advance(tokeniser->input); + } else if (c == '>') { + hubbub_token token; + + /* Emit doctype */ + token.type = HUBBUB_TOKEN_DOCTYPE; + token.data.doctype = tokeniser->context.current_doctype; + + hubbub_tokeniser_emit_token(tokeniser, &token); + + tokeniser->state = HUBBUB_TOKENISER_STATE_DATA; + hubbub_inputstream_advance(tokeniser->input); + } else if (c == HUBBUB_INPUTSTREAM_EOF) { + hubbub_token token; + + /* Emit doctype */ + token.type = HUBBUB_TOKEN_DOCTYPE; + token.data.doctype = tokeniser->context.current_doctype; + + hubbub_tokeniser_emit_token(tokeniser, 
&token); + + tokeniser->state = HUBBUB_TOKENISER_STATE_DATA; + } else { + uint32_t pos; + size_t len; + + pos = hubbub_inputstream_cur_pos(tokeniser->input, &len); + + cdoc->name.data_off = pos; + cdoc->name.len = len; + cdoc->correct = false; + + tokeniser->state = HUBBUB_TOKENISER_STATE_DOCTYPE_NAME; + + hubbub_inputstream_advance(tokeniser->input); + } + + return true; +} + +bool hubbub_tokeniser_handle_doctype_name(hubbub_tokeniser *tokeniser) +{ + hubbub_doctype *cdoc = &tokeniser->context.current_doctype; + uint32_t c = hubbub_inputstream_peek(tokeniser->input); + + if (c == HUBBUB_INPUTSTREAM_OOD) + return false; + + if (c == '\t' || c == '\n' || c == '\v' || c == '\f' || c == ' ') { + tokeniser->state = HUBBUB_TOKENISER_STATE_AFTER_DOCTYPE_NAME; + hubbub_inputstream_advance(tokeniser->input); + } else if (c == '>') { + hubbub_token token; + + /* Emit doctype */ + token.type = HUBBUB_TOKEN_DOCTYPE; + token.data.doctype = tokeniser->context.current_doctype; + token.data.doctype.correct = + (hubbub_inputstream_compare_range_ascii( + tokeniser->input, + token.data.doctype.name.data_off, + token.data.doctype.name.len, + "HTML", SLEN("HTML")) == 0); + + hubbub_tokeniser_emit_token(tokeniser, &token); + + tokeniser->state = HUBBUB_TOKENISER_STATE_DATA; + hubbub_inputstream_advance(tokeniser->input); + } else if ('a' <= c && c <= 'z') { + uint32_t pos; + size_t len; + + hubbub_inputstream_uppercase(tokeniser->input); + + pos = hubbub_inputstream_cur_pos(tokeniser->input, &len); + + cdoc->name.len += len; + + hubbub_inputstream_advance(tokeniser->input); + } else if (c == HUBBUB_INPUTSTREAM_EOF) { + hubbub_token token; + + /* Emit doctype */ + token.type = HUBBUB_TOKEN_DOCTYPE; + token.data.doctype = tokeniser->context.current_doctype; + + hubbub_tokeniser_emit_token(tokeniser, &token); + + tokeniser->state = HUBBUB_TOKENISER_STATE_DATA; + } else { + uint32_t pos; + size_t len; + + pos = hubbub_inputstream_cur_pos(tokeniser->input, &len); + + cdoc->name.len += len; 
+ + hubbub_inputstream_advance(tokeniser->input); + } + + return true; +} + +bool hubbub_tokeniser_handle_after_doctype_name(hubbub_tokeniser *tokeniser) +{ + hubbub_doctype *cdoc = &tokeniser->context.current_doctype; + uint32_t c = hubbub_inputstream_peek(tokeniser->input); + + if (c == HUBBUB_INPUTSTREAM_OOD) + return false; + + if (c == '\t' || c == '\n' || c == '\v' || c == '\f' || c == ' ') { + hubbub_inputstream_advance(tokeniser->input); + } else if (c == '>') { + hubbub_token token; + + /* Emit doctype */ + token.type = HUBBUB_TOKEN_DOCTYPE; + token.data.doctype = tokeniser->context.current_doctype; + token.data.doctype.correct = + (hubbub_inputstream_compare_range_ascii( + tokeniser->input, + token.data.doctype.name.data_off, + token.data.doctype.name.len, + "HTML", SLEN("HTML")) == 0); + + hubbub_tokeniser_emit_token(tokeniser, &token); + + tokeniser->state = HUBBUB_TOKENISER_STATE_DATA; + hubbub_inputstream_advance(tokeniser->input); + } else if (c == HUBBUB_INPUTSTREAM_EOF) { + hubbub_token token; + + /* Emit doctype */ + token.type = HUBBUB_TOKEN_DOCTYPE; + token.data.doctype = tokeniser->context.current_doctype; + + hubbub_tokeniser_emit_token(tokeniser, &token); + + tokeniser->state = HUBBUB_TOKENISER_STATE_DATA; + } else { + cdoc->correct = false; + + tokeniser->state = HUBBUB_TOKENISER_STATE_BOGUS_DOCTYPE; + + hubbub_inputstream_advance(tokeniser->input); + } + + return true; +} + +bool hubbub_tokeniser_handle_bogus_doctype(hubbub_tokeniser *tokeniser) +{ + uint32_t c = hubbub_inputstream_peek(tokeniser->input); + + if (c == HUBBUB_INPUTSTREAM_OOD) + return false; + + if (c == '>') { + hubbub_token token; + + /* Emit doctype */ + token.type = HUBBUB_TOKEN_DOCTYPE; + token.data.doctype = tokeniser->context.current_doctype; + + hubbub_tokeniser_emit_token(tokeniser, &token); + + tokeniser->state = HUBBUB_TOKENISER_STATE_DATA; + hubbub_inputstream_advance(tokeniser->input); + } else if (c == HUBBUB_INPUTSTREAM_EOF) { + hubbub_token token; + + /* Emit 
doctype */ + token.type = HUBBUB_TOKEN_DOCTYPE; + token.data.doctype = tokeniser->context.current_doctype; + + hubbub_tokeniser_emit_token(tokeniser, &token); + + tokeniser->state = HUBBUB_TOKENISER_STATE_DATA; + } else { + hubbub_inputstream_advance(tokeniser->input); + } + + return true; +} + +bool hubbub_tokeniser_consume_entity(hubbub_tokeniser *tokeniser) +{ + uint32_t c; + uint32_t pos; + size_t len; + + if (tokeniser->context.match_entity.done_setup == false) { + pos = hubbub_inputstream_cur_pos(tokeniser->input, &len); + + tokeniser->context.match_entity.str.data_off = pos; + tokeniser->context.match_entity.str.len = len; + tokeniser->context.match_entity.base = 0; + tokeniser->context.match_entity.codepoint = 0; + tokeniser->context.match_entity.had_data = false; + tokeniser->context.match_entity.return_state = + tokeniser->state; + tokeniser->context.match_entity.complete = false; + tokeniser->context.match_entity.done_setup = true; + tokeniser->context.match_entity.context = NULL; + tokeniser->context.match_entity.prev_len = len; + + hubbub_inputstream_advance(tokeniser->input); + } + + c = hubbub_inputstream_peek(tokeniser->input); + + if (c == HUBBUB_INPUTSTREAM_OOD) + return false; + + if (c == '#') { + pos = hubbub_inputstream_cur_pos(tokeniser->input, &len); + + tokeniser->context.match_entity.str.len += len; + + tokeniser->state = HUBBUB_TOKENISER_STATE_NUMBERED_ENTITY; + hubbub_inputstream_advance(tokeniser->input); + } else { + tokeniser->state = HUBBUB_TOKENISER_STATE_NAMED_ENTITY; + } + + return true; +} + +bool hubbub_tokeniser_handle_numbered_entity(hubbub_tokeniser *tokeniser) +{ + hubbub_tokeniser_context *ctx = &tokeniser->context; + uint32_t c = hubbub_inputstream_peek(tokeniser->input); + uint32_t pos; + size_t len; + hubbub_error error; + + if (c == HUBBUB_INPUTSTREAM_OOD) + return false; + + if (ctx->match_entity.base == 0) { + if ((c & ~0x20) == 'X') { + ctx->match_entity.base = 16; + + pos = 
hubbub_inputstream_cur_pos(tokeniser->input, + &len); + ctx->match_entity.str.len += len; + + hubbub_inputstream_advance(tokeniser->input); + } else { + ctx->match_entity.base = 10; + } + } + + while ((c = hubbub_inputstream_peek(tokeniser->input)) != + HUBBUB_INPUTSTREAM_EOF && + c != HUBBUB_INPUTSTREAM_OOD) { + if (ctx->match_entity.base == 10 && + ('0' <= c && c <= '9')) { + ctx->match_entity.had_data = true; + + ctx->match_entity.codepoint = + ctx->match_entity.codepoint * 10 + (c - '0'); + + pos = hubbub_inputstream_cur_pos(tokeniser->input, + &len); + ctx->match_entity.str.len += len; + } else if (ctx->match_entity.base == 16 && + (('0' <= c && c <= '9') || + ('A' <= (c & ~0x20) && + (c & ~0x20) <= 'F'))) { + ctx->match_entity.had_data = true; + + ctx->match_entity.codepoint *= 16; + + if ('0' <= c && c <= '9') { + ctx->match_entity.codepoint += (c - '0'); + } else { + ctx->match_entity.codepoint += + ((c & ~0x20) - 'A' + 10); + } + + pos = hubbub_inputstream_cur_pos(tokeniser->input, + &len); + ctx->match_entity.str.len += len; + } else { + break; + } + + hubbub_inputstream_advance(tokeniser->input); + } + + if (c == HUBBUB_INPUTSTREAM_OOD) + return false; + + /* Eat trailing semicolon, if any */ + if (c == ';') { + pos = hubbub_inputstream_cur_pos(tokeniser->input, &len); + ctx->match_entity.str.len += len; + + hubbub_inputstream_advance(tokeniser->input); + } + + /* Rewind the inputstream to start of matched sequence */ + hubbub_inputstream_rewind(tokeniser->input, + ctx->match_entity.str.len); + + if (ctx->match_entity.had_data) { + /* Had data, so calculate final codepoint */ + if (0x80 <= ctx->match_entity.codepoint && + ctx->match_entity.codepoint <= 0x9F) { + ctx->match_entity.codepoint = + cp1252Table[ctx->match_entity.codepoint - + 0x80]; + } else if (ctx->match_entity.codepoint == 0 || + ctx->match_entity.codepoint > 0x10FFFF) { + ctx->match_entity.codepoint = 0xFFFD; + } + + /* And replace the matched range with it */ + error = 
hubbub_inputstream_replace_range(tokeniser->input, + ctx->match_entity.str.data_off, + ctx->match_entity.str.len, + ctx->match_entity.codepoint); + if (error != HUBBUB_OK) { + /** \todo handle memory exhaustion */ + } + } + + /* Reset for next time */ + ctx->match_entity.done_setup = false; + + /* Flag completion */ + ctx->match_entity.complete = true; + + /* And back to the state we were entered in */ + tokeniser->state = ctx->match_entity.return_state; + + return true; +} + +bool hubbub_tokeniser_handle_named_entity(hubbub_tokeniser *tokeniser) +{ + hubbub_tokeniser_context *ctx = &tokeniser->context; + uint32_t c; + uint32_t pos; + size_t len; + hubbub_error error; + + while ((c = hubbub_inputstream_peek(tokeniser->input)) != + HUBBUB_INPUTSTREAM_EOF && + c != HUBBUB_INPUTSTREAM_OOD) { + uint32_t cp; + + if (c > 0x7F) { + /* Entity names are ASCII only */ + break; + } + + error = hubbub_entities_search_step((uint8_t) c, + &cp, + &ctx->match_entity.context); + if (error == HUBBUB_OK) { + /* Had a match - store it for later */ + ctx->match_entity.codepoint = cp; + + pos = hubbub_inputstream_cur_pos(tokeniser->input, + &len); + ctx->match_entity.str.len += len; + + /* And cache length, for replacement */ + ctx->match_entity.prev_len = + ctx->match_entity.str.len; + } else if (error == HUBBUB_INVALID) { + /* No further matches - use last found */ + break; + } else { + pos = hubbub_inputstream_cur_pos(tokeniser->input, + &len); + ctx->match_entity.str.len += len; + } + + hubbub_inputstream_advance(tokeniser->input); + } + + if (c == HUBBUB_INPUTSTREAM_OOD) + return false; + + /* Eat trailing semicolon, if any */ + if (ctx->match_entity.codepoint != 0 && c == ';' && + ctx->match_entity.prev_len == + ctx->match_entity.str.len) { + pos = hubbub_inputstream_cur_pos(tokeniser->input, &len); + ctx->match_entity.prev_len += len; + } + + /* Rewind the inputstream to start of processed sequence */ + hubbub_inputstream_rewind(tokeniser->input, + ctx->match_entity.str.len); + + 
/* Now, replace range, if we found a named entity */ + if (ctx->match_entity.codepoint != 0) { + error = hubbub_inputstream_replace_range(tokeniser->input, + ctx->match_entity.str.data_off, + ctx->match_entity.prev_len, + ctx->match_entity.codepoint); + if (error != HUBBUB_OK) { + /** \todo handle memory exhaustion */ + } + } + + /* Reset for next time */ + ctx->match_entity.done_setup = false; + + /* Flag completion */ + ctx->match_entity.complete = true; + + /* And back to the state from whence we came */ + tokeniser->state = ctx->match_entity.return_state; + + return true; +} + +/** + * Handle input stream buffer moving + * + * \param buffer Pointer to buffer + * \param len Length of data in buffer (bytes) + * \param pw Pointer to our context + */ +void hubbub_tokeniser_buffer_moved_handler(const uint8_t *buffer, + size_t len, void *pw) +{ + hubbub_tokeniser *tok = (hubbub_tokeniser *) pw; + + tok->input_buffer = buffer; + tok->input_buffer_len = len; + + if (tok->buffer_handler != NULL) + tok->buffer_handler(buffer, len, tok->buffer_pw); +} + +/** + * Emit a token, performing sanity checks if necessary + * + * \param tokeniser Tokeniser instance + * \param token Token to emit + */ +void hubbub_tokeniser_emit_token(hubbub_tokeniser *tokeniser, + hubbub_token *token) +{ + if (tokeniser == NULL || token == NULL) + return; + + /* Nothing to do if there's no registered handler */ + if (tokeniser->token_handler == NULL) + return; + + if (token->type == HUBBUB_TOKEN_START_TAG || + token->type == HUBBUB_TOKEN_END_TAG) { + uint32_t i, j; + uint32_t n_attributes = token->data.tag.n_attributes; + hubbub_attribute *attrs = + token->data.tag.attributes; + + /* Discard duplicate attributes */ + for (i = 0; i < n_attributes; i++) { + for (j = 0; j < n_attributes; j++) { + uint32_t move; + + if (j == i || + attrs[i].name.len != + attrs[j].name.len || + hubbub_inputstream_compare_range_cs( + tokeniser->input, + attrs[i].name.data_off, + attrs[j].name.data_off, + 
attrs[i].name.len) != 0) { + /* Attributes don't match */ + continue; + } + + /* Calculate amount to move */ + move = (n_attributes - 1 - + ((i < j) ? j : i)) * + sizeof(hubbub_attribute); + + if (move > 0) { + memmove((i < j) ? &attrs[j] + : &attrs[i], + (i < j) ? &attrs[j+1] + : &attrs[i+1], + move); + } + + /* And reduce the number of attributes */ + n_attributes--; + } + } + + token->data.tag.n_attributes = n_attributes; + } + + /* Finally, emit token */ + tokeniser->token_handler(token, tokeniser->token_pw); +} diff --git a/src/tokeniser/tokeniser.h b/src/tokeniser/tokeniser.h new file mode 100644 index 0000000..20bbe20 --- /dev/null +++ b/src/tokeniser/tokeniser.h @@ -0,0 +1,71 @@ +/* + * This file is part of Hubbub. + * Licensed under the MIT License, + * http://www.opensource.org/licenses/mit-license.php + * Copyright 2007 John-Mark Bell + */ + +#ifndef hubbub_tokeniser_tokeniser_h_ +#define hubbub_tokeniser_tokeniser_h_ + +#include +#include + +#include +#include +#include + +#include "input/inputstream.h" + +typedef struct hubbub_tokeniser hubbub_tokeniser; + +/** + * Hubbub tokeniser option types + */ +typedef enum hubbub_tokeniser_opttype { + HUBBUB_TOKENISER_TOKEN_HANDLER, + HUBBUB_TOKENISER_BUFFER_HANDLER, + HUBBUB_TOKENISER_ERROR_HANDLER, + HUBBUB_TOKENISER_CONTENT_MODEL, +} hubbub_tokeniser_opttype; + +/** + * Hubbub tokeniser option parameters + */ +typedef union hubbub_tokeniser_optparams { + struct { + hubbub_token_handler handler; + void *pw; + } token_handler; + + struct { + hubbub_buffer_handler handler; + void *pw; + } buffer_handler; + + struct { + hubbub_error_handler handler; + void *pw; + } error_handler; + + struct { + hubbub_content_model model; + } content_model; +} hubbub_tokeniser_optparams; + +/* Create a hubbub tokeniser */ +hubbub_tokeniser *hubbub_tokeniser_create(hubbub_inputstream *input, + hubbub_alloc alloc, void *pw); +/* Destroy a hubbub tokeniser */ +void hubbub_tokeniser_destroy(hubbub_tokeniser *tokeniser); + +/* 
Configure a hubbub tokeniser */ +hubbub_error hubbub_tokeniser_setopt(hubbub_tokeniser *tokeniser, + hubbub_tokeniser_opttype type, + hubbub_tokeniser_optparams *params); + +/* Process remaining data in the input stream */ +hubbub_error hubbub_tokeniser_run(hubbub_tokeniser *tokeniser); + +#endif + diff --git a/src/utils/Makefile b/src/utils/Makefile new file mode 100644 index 0000000..59b5512 --- /dev/null +++ b/src/utils/Makefile @@ -0,0 +1,53 @@ +# Makefile for libhubbub +# +# Toolchain is exported by top-level makefile +# +# Top-level makefile also exports the following variables: +# +# COMPONENT Name of component +# EXPORT Absolute path of export directory +# TOP Absolute path of source tree root +# +# The top-level makefile requires the following targets to exist: +# +# clean Clean source tree +# debug Create a debug binary +# distclean Fully clean source tree, back to pristine condition +# export Export distributable components to ${EXPORT} +# release Create a release binary +# setup Perform any setup required prior to compilation +# test Execute any test cases + +# Manipulate include paths +CFLAGS += -I$(CURDIR) + +# Objects +OBJS = dict errors utf8 + +.PHONY: clean debug distclean export release setup test + +# Targets +release: $(addprefix ../Release/, $(addsuffix .o, $(OBJS))) + +debug: $(addprefix ../Debug/, $(addsuffix .o, $(OBJS))) + +clean: + -@${RM} ${RMFLAGS} $(addprefix ../Release/, $(addsuffix .o, ${OBJS})) + -@${RM} ${RMFLAGS} $(addprefix ../Debug/, $(addsuffix .o, ${OBJS})) + +distclean: + +setup: + +export: + +test: + +# Pattern rules +../Release/%.o: %.c + @${ECHO} ${ECHOFLAGS} "==> $<" + @${CC} -c ${CFLAGS} -DNDEBUG -o $@ $< + +../Debug/%.o: %.c + @${ECHO} ${ECHOFLAGS} "==> $<" + @${CC} -c -g ${CFLAGS} -o $@ $< diff --git a/src/utils/dict.c b/src/utils/dict.c new file mode 100644 index 0000000..f50ffab --- /dev/null +++ b/src/utils/dict.c @@ -0,0 +1,219 @@ +/* + * This file is part of Hubbub. 
+ * Licensed under the MIT License, + * http://www.opensource.org/licenses/mit-license.php + * Copyright 2007 John-Mark Bell + */ + +#include + +#include "utils/dict.h" + +/** Node in a dictionary tree */ +typedef struct hubbub_dict_node { + uint8_t split; /**< Data to split on */ + struct hubbub_dict_node *lt; /**< Subtree for data less than + * split */ + struct hubbub_dict_node *eq; /**< Subtree for data equal to split + * If split == '\0', this stores the + * pointer to the actual data, not a + * subtree */ + struct hubbub_dict_node *gt; /**< Subtree for data greater than + * split */ +} hubbub_dict_node; + +/** Dictionary object */ +struct hubbub_dict { + hubbub_dict_node *dict; /**< Root of tree */ + + hubbub_alloc alloc; /**< Memory (de)allocation function */ + void *pw; /**< Pointer to client data */ +}; + +static void hubbub_dict_destroy_internal(hubbub_dict *dict, + hubbub_dict_node *root); +static hubbub_dict_node *hubbub_dict_insert_internal(hubbub_dict *dict, + hubbub_dict_node *parent, const char *key, + const void *value); + + +/** + * Create a dictionary + * + * \param alloc Memory (de)allocation function + * \param pw Pointer to client-specific private data (may be NULL) + * \return Pointer to dictionary instance, or NULL on error + */ +hubbub_dict *hubbub_dict_create(hubbub_alloc alloc, void *pw) +{ + hubbub_dict *dict; + + if (alloc == NULL) + return NULL; + + dict = alloc(NULL, sizeof(hubbub_dict), pw); + if (dict == NULL) + return NULL; + + dict->dict = NULL; + + dict->alloc = alloc; + dict->pw = pw; + + return dict; +} + +/** + * Destroy a dictionary + * + * \param dict Dictionary to destroy + */ +void hubbub_dict_destroy(hubbub_dict *dict) +{ + if (dict == NULL) + return; + + hubbub_dict_destroy_internal(dict, dict->dict); + + dict->alloc(dict, 0, dict->pw); +} + +/** + * Helper routine for dictionary destruction + * + * \param dict Dictionary being destroyed + * \param root Root node of dictionary (sub)tree to destroy + */ +void 
hubbub_dict_destroy_internal(hubbub_dict *dict, hubbub_dict_node *root) +{ + if (root == NULL) + return; + + hubbub_dict_destroy_internal(dict, root->lt); + if (root->split != '\0') + hubbub_dict_destroy_internal(dict, root->eq); + hubbub_dict_destroy_internal(dict, root->gt); + + dict->alloc(root, 0, dict->pw); +} + +/** + * Insert a key-value pair into a dictionary + * + * \param dict Dictionary to insert into + * \param key Key string + * \param value Value to associate with key (may be NULL) + * \return HUBBUB_OK on success, appropriate error otherwise + */ +hubbub_error hubbub_dict_insert(hubbub_dict *dict, const char *key, + const void *value) +{ + if (dict == NULL || key == NULL) + return HUBBUB_BADPARM; + + dict->dict = hubbub_dict_insert_internal(dict, dict->dict, + key, value); + + return HUBBUB_OK; +} + +/** + * Helper routine for insertion into dictionary + * + * \param dict Dictionary being inserted into + * \param parent Parent node of subtree to insert into + * \param key Key string + * \param value Value to associate with key + * \return Pointer to root of tree created + */ +hubbub_dict_node *hubbub_dict_insert_internal(hubbub_dict *dict, + hubbub_dict_node *parent, const char *key, const void *value) +{ + if (parent == NULL) { + parent = dict->alloc(NULL, + sizeof(hubbub_dict_node), dict->pw); + if (parent == NULL) + return NULL; + parent->split = (uint8_t) key[0]; + parent->lt = parent->eq = parent->gt = NULL; + } + + if ((uint8_t) key[0] < parent->split) { + parent->lt = hubbub_dict_insert_internal(dict, + parent->lt, key, value); + } else if ((uint8_t) key[0] == parent->split) { + if (key[0] == '\0') { + parent->eq = (hubbub_dict_node *) value; + } else { + parent->eq = hubbub_dict_insert_internal(dict, + parent->eq, ++key, value); + } + } else { + parent->gt = hubbub_dict_insert_internal(dict, + parent->gt, key, value); + } + + return parent; +} + +/** + * Step-wise search for a key in a dictionary + * + * \param dict Dictionary to search + * 
\param c Character to look for + * \param result Pointer to location for result + * \param context Pointer to location for search context + * \return HUBBUB_OK if key found, + * HUBBUB_NEEDDATA if more steps are required + * HUBBUB_INVALID if nothing matches + * + * The value pointed to by ::context must be NULL for the first call. + * Thereafter, pass in the same value as returned by the previous call. + * The context is opaque to the caller and should not be inspected. + * + * The location pointed to by ::result will be set to NULL unless a match + * is found. + */ +hubbub_error hubbub_dict_search_step(hubbub_dict *dict, uint8_t c, + const void **result, void **context) +{ + bool match = false; + hubbub_dict_node *p; + + if (dict == NULL || result == NULL || context == NULL) + return HUBBUB_BADPARM; + + *result = NULL; + + if (*context == NULL) { + p = dict->dict; + } else { + p = (hubbub_dict_node *) *context; + } + + while (p != NULL) { + if (c < p->split) { + p = p->lt; + } else if (c == p->split) { + if (p->split == '\0') { + match = true; + p = NULL; + } else if (p->eq != NULL && p->eq->split == '\0') { + match = true; + *result = (const void *) p->eq->eq; + p = p->eq; + } else { + p = p->eq; + } + + break; + } else { + p = p->gt; + } + } + + *context = (void *) p; + + return (match) ? HUBBUB_OK : + (p == NULL) ? HUBBUB_INVALID : HUBBUB_NEEDDATA; +} diff --git a/src/utils/dict.h b/src/utils/dict.h new file mode 100644 index 0000000..2cde01d --- /dev/null +++ b/src/utils/dict.h @@ -0,0 +1,31 @@ +/* + * This file is part of Hubbub. 
+ * Licensed under the MIT License, + * http://www.opensource.org/licenses/mit-license.php + * Copyright 2007 John-Mark Bell + */ + +#ifndef hubbub_utils_dict_h_ +#define hubbub_utils_dict_h_ + +#include + +#include +#include + +typedef struct hubbub_dict hubbub_dict; + +/* Create a dictionary */ +hubbub_dict *hubbub_dict_create(hubbub_alloc alloc, void *pw); +/* Destroy a dictionary */ +void hubbub_dict_destroy(hubbub_dict *dict); + +/* Insert a key-value pair into a dictionary */ +hubbub_error hubbub_dict_insert(hubbub_dict *dict, const char *key, + const void *value); + +/* Step-wise search for a key in a dictionary */ +hubbub_error hubbub_dict_search_step(hubbub_dict *dict, uint8_t c, + const void **result, void **context); + +#endif diff --git a/src/utils/errors.c b/src/utils/errors.c new file mode 100644 index 0000000..e57ba6a --- /dev/null +++ b/src/utils/errors.c @@ -0,0 +1,70 @@ +/* + * This file is part of Hubbub. + * Licensed under the MIT License, + * http://www.opensource.org/licenses/mit-license.php + * Copyright 2007 John-Mark Bell + */ + +#include + +#include + +/** + * Convert a hubbub error code to a string + * + * \param error The error code to convert + * \return Pointer to string representation of error, or NULL if unknown. 
/**
 * Convert a hubbub error code to a string
 *
 * \param error  The error code to convert
 * \return Pointer to string representation of error, or NULL if unknown.
 */
const char *hubbub_error_to_string(hubbub_error error)
{
	switch (error) {
	case HUBBUB_OK:
		return "No error";
	case HUBBUB_NOMEM:
		return "Insufficient memory";
	case HUBBUB_BADPARM:
		return "Bad parameter";
	case HUBBUB_INVALID:
		return "Invalid input";
	case HUBBUB_FILENOTFOUND:
		return "File not found";
	case HUBBUB_NEEDDATA:
		return "Insufficient data";
	}

	return NULL;
}

/**
 * Convert a string representation of an error name to a hubbub error code
 *
 * The name must match in full: a prefix of an error name (for example
 * "HUBBUB_N") is not accepted. If a NUL byte occurs within the first
 * ::len bytes, the string is taken to end there.
 *
 * \param str  String containing error name (NULL is treated as unknown)
 * \param len  Length of string (bytes)
 * \return Hubbub error code, or HUBBUB_OK if unknown
 */
hubbub_error hubbub_error_from_string(const char *str, size_t len)
{
	static const struct {
		const char *name;	/**< Error name */
		size_t len;		/**< Length of name (bytes) */
		hubbub_error code;	/**< Corresponding error code */
	} names[] = {
		{ "HUBBUB_OK", sizeof("HUBBUB_OK") - 1,
				HUBBUB_OK },
		{ "HUBBUB_NOMEM", sizeof("HUBBUB_NOMEM") - 1,
				HUBBUB_NOMEM },
		{ "HUBBUB_BADPARM", sizeof("HUBBUB_BADPARM") - 1,
				HUBBUB_BADPARM },
		{ "HUBBUB_INVALID", sizeof("HUBBUB_INVALID") - 1,
				HUBBUB_INVALID },
		{ "HUBBUB_FILENOTFOUND", sizeof("HUBBUB_FILENOTFOUND") - 1,
				HUBBUB_FILENOTFOUND },
		{ "HUBBUB_NEEDDATA", sizeof("HUBBUB_NEEDDATA") - 1,
				HUBBUB_NEEDDATA },
	};
	const char *nul;
	size_t i;

	if (str == NULL)
		return HUBBUB_OK;

	/* Callers may pass a NUL-terminated string with a generous len */
	nul = memchr(str, '\0', len);
	if (nul != NULL)
		len = (size_t) (nul - str);

	for (i = 0; i < sizeof(names) / sizeof(names[0]); i++) {
		if (len == names[i].len &&
				memcmp(str, names[i].name, len) == 0)
			return names[i].code;
	}

	return HUBBUB_OK;
}
+ */ + +#include +#include +#include + +#include "utils/utf8.h" + +/** Number of continuation bytes for a given start byte */ +static const uint8_t numContinuations[256] = { + 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, + 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, + 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, + 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, + 3, 3, 3, 3, 3, 3, 3, 3, 4, 4, 4, 4, 5, 5, 5, 5, +}; + +/** + * Convert a UTF-8 multibyte sequence into a single UCS4 character + * + * Encoding of UCS values outside the UTF-16 plane has been removed from + * RFC3629. This function conforms to RFC2279, however. 
+ *
+ * \param s     The sequence to process
+ * \param len   Length of sequence
+ * \param ucs4  Pointer to location to receive UCS4 character (host endian)
+ * \param clen  Pointer to location to receive byte length of UTF-8 sequence
+ * \return HUBBUB_OK on success,
+ *         HUBBUB_BADPARM if any pointer argument is NULL,
+ *         HUBBUB_NEEDDATA if the sequence is truncated by \a len,
+ *         HUBBUB_INVALID on a bad start or continuation byte
+ */
+hubbub_error hubbub_utf8_to_ucs4(const uint8_t *s, size_t len,
+		uint32_t *ucs4, size_t *clen)
+{
+	if (s == NULL || ucs4 == NULL || clen == NULL)
+		return HUBBUB_BADPARM;
+
+	if (len == 0)
+		return HUBBUB_NEEDDATA;
+
+	if (*s < 0x80) {
+		/* 0xxxxxxx: plain ASCII */
+		*ucs4 = *s;
+		*clen = 1;
+	} else if ((*s & 0xE0) == 0xC0) {
+		if (len < 2)
+			return HUBBUB_NEEDDATA;
+		else if ((s[1] & 0xC0) != 0x80)
+			return HUBBUB_INVALID;
+		else {
+			/* 110xxxxx: 5 payload bits in the lead byte */
+			*ucs4 = ((uint32_t) (*s & 0x1F) << 6) |
+					(uint32_t) (s[1] & 0x3F);
+			*clen = 2;
+		}
+	} else if ((*s & 0xF0) == 0xE0) {
+		if (len < 3)
+			return HUBBUB_NEEDDATA;
+		else if ((s[1] & 0xC0) != 0x80 ||
+				(s[2] & 0xC0) != 0x80)
+			return HUBBUB_INVALID;
+		else {
+			/* 1110xxxx: 4 payload bits in the lead byte */
+			*ucs4 = ((uint32_t) (*s & 0x0F) << 12) |
+					((uint32_t) (s[1] & 0x3F) << 6) |
+					(uint32_t) (s[2] & 0x3F);
+			*clen = 3;
+		}
+	} else if ((*s & 0xF8) == 0xF0) {
+		if (len < 4)
+			return HUBBUB_NEEDDATA;
+		else if ((s[1] & 0xC0) != 0x80 ||
+				(s[2] & 0xC0) != 0x80 ||
+				(s[3] & 0xC0) != 0x80)
+			return HUBBUB_INVALID;
+		else {
+			/* 11110xxx: 3 payload bits in the lead byte */
+			*ucs4 = ((uint32_t) (*s & 0x07) << 18) |
+					((uint32_t) (s[1] & 0x3F) << 12) |
+					((uint32_t) (s[2] & 0x3F) << 6) |
+					(uint32_t) (s[3] & 0x3F);
+			*clen = 4;
+		}
+	} else if ((*s & 0xFC) == 0xF8) {
+		if (len < 5)
+			return HUBBUB_NEEDDATA;
+		else if ((s[1] & 0xC0) != 0x80 ||
+				(s[2] & 0xC0) != 0x80 ||
+				(s[3] & 0xC0) != 0x80 ||
+				(s[4] & 0xC0) != 0x80)
+			return HUBBUB_INVALID;
+		else {
+			/* 111110xx: only 2 payload bits; a wider mask would
+			 * leak the marker bits into the decoded value */
+			*ucs4 = ((uint32_t) (*s & 0x03) << 24) |
+					((uint32_t) (s[1] & 0x3F) << 18) |
+					((uint32_t) (s[2] & 0x3F) << 12) |
+					((uint32_t) (s[3] & 0x3F) << 6) |
+					(uint32_t) (s[4] & 0x3F);
+			*clen = 5;
+		}
+	} else if ((*s & 0xFE) == 0xFC) {
+		if (len < 6)
+			return HUBBUB_NEEDDATA;
+		else if ((s[1] & 0xC0) != 0x80 ||
+				(s[2] & 0xC0) != 0x80 ||
+				(s[3] & 0xC0) != 0x80 ||
+				(s[4] & 0xC0) != 0x80 ||
+				(s[5] & 0xC0) != 0x80)
+			return HUBBUB_INVALID;
+		else {
+			/* 1111110x: 1 payload bit, sitting above the 30 bits
+			 * carried by the five continuation bytes */
+			*ucs4 = ((uint32_t) (*s & 0x01) << 30) |
+					((uint32_t) (s[1] & 0x3F) << 24) |
+					((uint32_t) (s[2] & 0x3F) << 18) |
+					((uint32_t) (s[3] & 0x3F) << 12) |
+					((uint32_t) (s[4] & 0x3F) << 6) |
+					(uint32_t) (s[5] & 0x3F);
+			*clen = 6;
+		}
+	} else {
+		/* Stray continuation byte, or 0xFE / 0xFF */
+		return HUBBUB_INVALID;
+	}
+
+	return HUBBUB_OK;
+}
+
+/**
+ * Convert a single UCS4 character into a UTF-8 multibyte sequence
+ *
+ * Encoding of UCS values outside the UTF-16 plane has been removed from
+ * RFC3629. This function conforms to RFC2279, however.
+ *
+ * \param ucs4  The character to process (0 <= c <= 0x7FFFFFFF) (host endian)
+ * \param s     Pointer to 6 byte long output buffer
+ * \param len   Pointer to location to receive length of multibyte sequence
+ * \return HUBBUB_OK on success,
+ *         HUBBUB_BADPARM if \a s or \a len is NULL,
+ *         HUBBUB_INVALID if \a ucs4 is greater than 0x7FFFFFFF
+ */
+hubbub_error hubbub_utf8_from_ucs4(uint32_t ucs4, uint8_t *s,
+		size_t *len)
+{
+	uint32_t l = 0;
+
+	if (s == NULL || len == NULL)
+		return HUBBUB_BADPARM;
+	else if (ucs4 < 0x80) {
+		*s = (uint8_t) ucs4;
+		l = 1;
+	} else if (ucs4 < 0x800) {
+		*s = 0xC0 | ((ucs4 >> 6) & 0x1F);
+		*(s+1) = 0x80 | (ucs4 & 0x3F);
+		l = 2;
+	} else if (ucs4 < 0x10000) {
+		*s = 0xE0 | ((ucs4 >> 12) & 0xF);
+		*(s+1) = 0x80 | ((ucs4 >> 6) & 0x3F);
+		*(s+2) = 0x80 | (ucs4 & 0x3F);
+		l = 3;
+	} else if (ucs4 < 0x200000) {
+		*s = 0xF0 | ((ucs4 >> 18) & 0x7);
+		*(s+1) = 0x80 | ((ucs4 >> 12) & 0x3F);
+		*(s+2) = 0x80 | ((ucs4 >> 6) & 0x3F);
+		*(s+3) = 0x80 | (ucs4 & 0x3F);
+		l = 4;
+	} else if (ucs4 < 0x4000000) {
+		*s = 0xF8 | ((ucs4 >> 24) & 0x3);
+		*(s+1) = 0x80 | ((ucs4 >> 18) & 0x3F);
+		*(s+2) = 0x80 | ((ucs4 >> 12) & 0x3F);
+		*(s+3) = 0x80 | ((ucs4 >> 6) & 0x3F);
+		*(s+4) = 0x80 | (ucs4 & 0x3F);
+		l = 5;
+	} else if (ucs4 <= 0x7FFFFFFF) {
+		*s = 0xFC | ((ucs4 >> 30) & 0x1);
+		*(s+1) = 0x80 | ((ucs4 >> 24) & 0x3F);
+		*(s+2) = 0x80 | ((ucs4 >> 18) & 0x3F);
+		*(s+3) = 0x80 | ((ucs4 >> 12) & 0x3F);
+		*(s+4) = 0x80 | ((ucs4 >> 6) & 0x3F);
+		*(s+5) = 0x80 | (ucs4 & 0x3F);
+		l = 6;
+	} else {
+		return HUBBUB_INVALID;
+	}
+
+	*len = l;
+
+	return HUBBUB_OK;
+}
+
+/**
+ * Calculate the length (in characters) of a bounded UTF-8 string
+ *
+ * Only start bytes are inspected; continuation bytes are not validated.
+ * A final sequence that is cut short by \a max still counts as one
+ * character.
+ *
+ * \param s    The string
+ * \param max  Maximum length
+ * \param len  Pointer to location to receive length of string
+ * \return HUBBUB_OK on success,
+ *         HUBBUB_BADPARM if \a s or \a len is NULL,
+ *         HUBBUB_INVALID if a byte in start position is not a start byte
+ */
+hubbub_error hubbub_utf8_length(const uint8_t *s, size_t max,
+		size_t *len)
+{
+	const uint8_t *end;
+	size_t l = 0;
+
+	if (s == NULL || len == NULL)
+		return HUBBUB_BADPARM;
+
+	/* Only form the end pointer once s is known to be non-NULL */
+	end = s + max;
+
+	while (s < end) {
+		size_t seqlen;
+
+		if ((*s & 0x80) == 0x00)
+			seqlen = 1;
+		else if ((*s & 0xE0) == 0xC0)
+			seqlen = 2;
+		else if ((*s & 0xF0) == 0xE0)
+			seqlen = 3;
+		else if ((*s & 0xF8) == 0xF0)
+			seqlen = 4;
+		else if ((*s & 0xFC) == 0xF8)
+			seqlen = 5;
+		else if ((*s & 0xFE) == 0xFC)
+			seqlen = 6;
+		else
+			return HUBBUB_INVALID;
+
+		/* Clamp rather than step beyond the end of the buffer when
+		 * the last sequence is truncated */
+		if (seqlen > (size_t) (end - s))
+			s = end;
+		else
+			s += seqlen;
+
+		l++;
+	}
+
+	*len = l;
+
+	return HUBBUB_OK;
+}
+
+/**
+ * Calculate the length (in bytes) of a UTF-8 character
+ *
+ * The result is derived from the start byte alone, via the
+ * numContinuations table; the following bytes are not examined.
+ *
+ * \param s    Pointer to start of character
+ * \param len  Pointer to location to receive length
+ * \return HUBBUB_OK on success, HUBBUB_BADPARM if \a s or \a len is NULL
+ */
+hubbub_error hubbub_utf8_char_byte_length(const uint8_t *s,
+		size_t *len)
+{
+	if (s == NULL || len == NULL)
+		return HUBBUB_BADPARM;
+
+	*len = numContinuations[s[0]] + 1 /* Start byte */;
+
+	return HUBBUB_OK;
+}
+
+/**
+ * Find previous legal UTF-8 char in string
+ *
+ * Steps backwards from \a off over continuation bytes (10xxxxxx) until a
+ * non-continuation byte or the start of the string is reached.
+ *
+ * \param s        The string
+ * \param off      Offset in the string to start at
+ * \param prevoff  Pointer to location to receive offset of first byte of
+ *                 previous legal character
+ * \return HUBBUB_OK on success, HUBBUB_BADPARM if \a s or \a prevoff is NULL
+ */
+hubbub_error hubbub_utf8_prev(const uint8_t *s, uint32_t off,
+		uint32_t *prevoff)
+{
+	if (s == NULL || prevoff == NULL)
+		return HUBBUB_BADPARM;
+
+	while (off != 0 && (s[--off] & 0xC0) == 0x80)
+		/* do nothing */;
+
+	*prevoff = off;
+
+	return HUBBUB_OK;
+}
+
+/**
+ * Find next legal UTF-8 char in string
+ *
+ * \param s        The string (assumed valid)
+ * \param len      Maximum offset in string
+ * \param off      Offset in the string to start at
+ * \param nextoff  Pointer to location to receive offset of first byte of
+ *                 next legal character
+ * \return HUBBUB_OK on success,
+ *         HUBBUB_BADPARM if a pointer is NULL or \a off is not below \a len
+ */
+hubbub_error hubbub_utf8_next(const uint8_t *s, uint32_t len,
+		uint32_t off, uint32_t *nextoff)
+{
+	if (s == NULL || off >= len || nextoff == NULL)
+		return HUBBUB_BADPARM;
+
+	/* Skip current start byte (if present - may be mid-sequence) */
+	if (s[off] < 0x80 || (s[off] & 0xC0) == 0xC0)
+		off++;
+
+	/* Skip any continuation bytes; may stop at len if none follow */
+	while (off < len && (s[off] & 0xC0) == 0x80)
+		off++;
+
+	*nextoff = off;
+
+	return HUBBUB_OK;
+}
+
+/**
+ * Find next legal UTF-8 char in string
+ *
+ * \param s        The string (assumed to be of dubious validity)
+ * \param len      Maximum offset in string
+ * \param off      Offset in the string to start at
+ * \param nextoff  Pointer to location to receive offset of first byte of
+ *                 next legal character
+ * \return HUBBUB_OK on success,
+ *         HUBBUB_BADPARM if a pointer is NULL or \a off is not below \a len,
+ *         HUBBUB_NEEDDATA if the data ends before a complete character
+ */
+hubbub_error hubbub_utf8_next_paranoid(const uint8_t *s, uint32_t len,
+		uint32_t off, uint32_t *nextoff)
+{
+	uint32_t n, i;
+	bool valid;
+
+	if (s == NULL || off >= len || nextoff == NULL)
+		return HUBBUB_BADPARM;
+
+	/* Skip current start byte (if present - may be mid-sequence) */
+	if (s[off] < 0x80 || (s[off] & 0xC0) == 0xC0)
+		off++;
+
+	while (1) {
+		/* Find next possible start byte */
+		while (off < len && (s[off] & 0xC0) == 0x80)
+			off++;
+
+		/* Ran off end of data */
+		if (off == len || off + numContinuations[s[off]] >= len)
+			return HUBBUB_NEEDDATA;
+
+		n = numContinuations[s[off]];
+
+		if (n == 0) {
+			/* No continuations: only ASCII is acceptable */
+			valid = (s[off] < 0x80);
+		} else {
+			/* Multibyte: the lead byte must be a real start byte
+			 * (0xFE and 0xFF never are) and each of the next n
+			 * bytes must be a continuation. The ASCII test must
+			 * not be applied here, or no multibyte character
+			 * could ever be accepted. */
+			valid = (s[off] >= 0xC0 && s[off] <= 0xFD);
+
+			for (i = 1; i <= n; i++)
+				valid = valid &&
+						((s[off + i] & 0xC0) == 0x80);
+		}
+
+		if (valid)
+			break;
+
+		/* Otherwise, skip this (invalid) start byte and try again */
+		off++;
+	}
+
+	*nextoff = off;
+
+	return HUBBUB_OK;
+}
+
diff --git a/src/utils/utf8.h b/src/utils/utf8.h
new file mode 100644
index 0000000..8836338
--- /dev/null
+++ b/src/utils/utf8.h
@@ -0,0 +1,38 @@
+/*
+ * This file is part of Hubbub.
+ * Licensed under the MIT License,
+ *                http://www.opensource.org/licenses/mit-license.php
+ * Copyright 2007 John-Mark Bell
+ */
+
+/** \file
+ * UTF-8 manipulation functions (interface).
+ */
+
+#ifndef hubbub_utils_utf8_h_
+#define hubbub_utils_utf8_h_
+
+#include <inttypes.h>
+
+#include <hubbub/errors.h>
+
+hubbub_error hubbub_utf8_to_ucs4(const uint8_t *s, size_t len,
+		uint32_t *ucs4, size_t *clen);
+hubbub_error hubbub_utf8_from_ucs4(uint32_t ucs4, uint8_t *s,
+		size_t *len);
+
+hubbub_error hubbub_utf8_length(const uint8_t *s, size_t max,
+		size_t *len);
+hubbub_error hubbub_utf8_char_byte_length(const uint8_t *s,
+		size_t *len);
+
+hubbub_error hubbub_utf8_prev(const uint8_t *s, uint32_t off,
+		uint32_t *prevoff);
+hubbub_error hubbub_utf8_next(const uint8_t *s, uint32_t len,
+		uint32_t off, uint32_t *nextoff);
+
+hubbub_error hubbub_utf8_next_paranoid(const uint8_t *s, uint32_t len,
+		uint32_t off, uint32_t *nextoff);
+
+#endif
+
diff --git a/src/utils/utils.h b/src/utils/utils.h
new file mode 100644
index 0000000..a1e0230
--- /dev/null
+++ b/src/utils/utils.h
@@ -0,0 +1,28 @@
+/*
+ * This file is part of Hubbub.
+ * Licensed under the MIT License,
+ *                http://www.opensource.org/licenses/mit-license.php
+ * Copyright 2007 John-Mark Bell
+ */
+
+#ifndef hubbub_utils_h_
+#define hubbub_utils_h_
+
+#ifndef max
+#define max(a,b) ((a)>(b)?(a):(b))
+#endif
+
+#ifndef min
+#define min(a,b) ((a)<(b)?(a):(b))
+#endif
+
+#ifndef SLEN
+/* Calculate length of a string constant */
+#define SLEN(s) (sizeof((s)) - 1) /* -1 for '\0' */
+#endif
+
+#ifndef UNUSED
+#define UNUSED(x) ((void) (x))
+#endif
+
+#endif
-- 
cgit v1.2.3