From 7b30a5520cfb56e651f0eb4da85a3e07747da7dc Mon Sep 17 00:00:00 2001 From: John Mark Bell Date: Sat, 23 Jun 2007 22:40:25 +0000 Subject: Import hubbub -- an HTML parsing library. Plenty of work still to do (like tree generation ;) svn path=/trunk/hubbub/; revision=3359 --- src/charset/Makefile | 53 +++ src/charset/aliases.c | 361 ++++++++++++++++++++ src/charset/aliases.h | 42 +++ src/charset/codec.c | 186 +++++++++++ src/charset/codec.h | 153 +++++++++ src/charset/codec_iconv.c | 837 ++++++++++++++++++++++++++++++++++++++++++++++ src/charset/codec_impl.h | 51 +++ src/charset/codec_utf8.c | 620 ++++++++++++++++++++++++++++++++++ src/charset/detect.c | 673 +++++++++++++++++++++++++++++++++++++ src/charset/detect.h | 22 ++ 10 files changed, 2998 insertions(+) create mode 100644 src/charset/Makefile create mode 100644 src/charset/aliases.c create mode 100644 src/charset/aliases.h create mode 100644 src/charset/codec.c create mode 100644 src/charset/codec.h create mode 100644 src/charset/codec_iconv.c create mode 100644 src/charset/codec_impl.h create mode 100644 src/charset/codec_utf8.c create mode 100644 src/charset/detect.c create mode 100644 src/charset/detect.h (limited to 'src/charset') diff --git a/src/charset/Makefile b/src/charset/Makefile new file mode 100644 index 0000000..62817b3 --- /dev/null +++ b/src/charset/Makefile @@ -0,0 +1,53 @@ +# Makefile for libhubbub +# +# Toolchain is exported by top-level makefile +# +# Top-level makefile also exports the following variables: +# +# COMPONENT Name of component +# EXPORT Absolute path of export directory +# TOP Absolute path of source tree root +# +# The top-level makefile requires the following targets to exist: +# +# clean Clean source tree +# debug Create a debug binary +# distclean Fully clean source tree, back to pristine condition +# export Export distributable components to ${EXPORT} +# release Create a release binary +# setup Perform any setup required prior to compilation +# test Execute any test 
cases + +# Manipulate include paths +CFLAGS += -I$(CURDIR) + +# Objects +OBJS = aliases codec codec_iconv codec_utf8 detect + +.PHONY: clean debug distclean export release setup test + +# Targets +release: $(addprefix ../Release/, $(addsuffix .o, $(OBJS))) + +debug: $(addprefix ../Debug/, $(addsuffix .o, $(OBJS))) + +clean: + -@${RM} ${RMFLAGS} $(addprefix ../Release/, $(addsuffix .o, ${OBJS})) + -@${RM} ${RMFLAGS} $(addprefix ../Debug/, $(addsuffix .o, ${OBJS})) + +distclean: + +setup: + +export: + +test: + +# Pattern rules +../Release/%.o: %.c + @${ECHO} ${ECHOFLAGS} "==> $<" + @${CC} -c ${CFLAGS} -DNDEBUG -o $@ $< + +../Debug/%.o: %.c + @${ECHO} ${ECHOFLAGS} "==> $<" + @${CC} -c -g ${CFLAGS} -o $@ $< diff --git a/src/charset/aliases.c b/src/charset/aliases.c new file mode 100644 index 0000000..dcf6de2 --- /dev/null +++ b/src/charset/aliases.c @@ -0,0 +1,361 @@ +/* + * This file is part of Hubbub. + * Licensed under the MIT License, + * http://www.opensource.org/licenses/mit-license.php + * Copyright 2007 John-Mark Bell + */ + +#include +#include +#include +#include +#include +#include + +#include "charset/aliases.h" + +struct alias { + struct alias *next; + hubbub_aliases_canon *canon; + uint16_t name_len; + char name[1]; +}; + +#define HASH_SIZE (43) +static hubbub_aliases_canon *canon_tab[HASH_SIZE]; +static struct alias *alias_tab[HASH_SIZE]; + +static hubbub_error hubbub_create_alias(const char *alias, + hubbub_aliases_canon *c, hubbub_alloc alloc, void *pw); +static hubbub_aliases_canon *hubbub_create_canon(const char *canon, + uint16_t mibenum, hubbub_alloc alloc, void *pw); +static uint32_t hubbub_hash_val(const char *alias, size_t len); + +/** + * Create alias data from Aliases file + * + * \param filename The path to the Aliases file + * \param alloc Memory (de)allocation function + * \param pw Pointer to client-specific private data (may be NULL) + * \return HUBBUB_OK on success, appropriate error otherwise. 
+ */
+hubbub_error hubbub_aliases_create(const char *filename,
+		hubbub_alloc alloc, void *pw)
+{
+	/* Each record occupies one line of the form:
+	 *
+	 *	canonical-name  mib-enum  [alias ...]
+	 *
+	 * Fields are separated by whitespace. Lines whose first
+	 * character is '#' are comments. */
+	char buf[300];
+	FILE *fp;
+
+	if (filename == NULL || alloc == NULL)
+		return HUBBUB_BADPARM;
+
+	fp = fopen(filename, "r");
+	if (fp == NULL)
+		return HUBBUB_FILENOTFOUND;
+
+	while (fgets(buf, sizeof buf, fp)) {
+		char *p, *aliases, *mib, *end;
+		hubbub_aliases_canon *cf;
+		size_t len = strlen(buf);
+
+		if (len > 0 && buf[len - 1] == '\n') {
+			/* lose terminating newline */
+			buf[--len] = '\0';
+		} else if (len == sizeof buf - 1) {
+			/* The line did not fit in buf. Discard the unread
+			 * remainder so that it is not mistaken for a
+			 * record of its own on the next iteration. */
+			int ch;
+
+			while ((ch = fgetc(fp)) != EOF && ch != '\n')
+				; /* do nothing */
+		}
+		/* Otherwise this is the final line of a file lacking a
+		 * trailing newline: every byte read belongs to the record,
+		 * so nothing must be stripped. */
+
+		if (len == 0 || buf[0] == '#')
+			/* skip blank lines or comments */
+			continue;
+
+		end = buf + len;
+
+		/* The <ctype.h> classifiers require a value representable
+		 * as unsigned char, hence the casts below. */
+
+		/* find end of canonical form */
+		for (p = buf; *p && !isspace((unsigned char) *p) &&
+				!iscntrl((unsigned char) *p); p++)
+			; /* do nothing */
+		if (p >= end)
+			/* no MIB enum follows the name => ignore line */
+			continue;
+		*p++ = '\0'; /* terminate canonical form */
+
+		/* skip whitespace */
+		for (; *p && isspace((unsigned char) *p); p++)
+			; /* do nothing */
+		if (p >= end)
+			continue;
+		mib = p;
+
+		/* find end of mibenum */
+		for (; *p && !isspace((unsigned char) *p) &&
+				!iscntrl((unsigned char) *p); p++)
+			; /* do nothing */
+		if (p < end)
+			*p++ = '\0'; /* terminate mibenum */
+
+		cf = hubbub_create_canon(buf, atoi(mib), alloc, pw);
+		if (cf == NULL)
+			continue;
+
+		/* skip whitespace */
+		for (; p < end && *p && isspace((unsigned char) *p); p++)
+			; /* do nothing */
+		if (p >= end)
+			/* canonical form has no aliases */
+			continue;
+		aliases = p;
+
+		while (p < end) {
+			/* find end of alias */
+			for (; *p && !isspace((unsigned char) *p) &&
+					!iscntrl((unsigned char) *p); p++)
+				; /* do nothing */
+			if (p > end)
+				/* stop if we've gone past the end */
+				break;
+			/* terminate current alias */
+			*p++ = '\0';
+
+			if (hubbub_create_alias(aliases, cf,
+					alloc, pw) != HUBBUB_OK)
+				/* give up on the rest of this line */
+				break;
+
+			/* in terminating, we may have advanced
+			 * past the end - check this here */
+			if (p >= end)
+				break;
+
+			/* skip whitespace */
+			for (; *p && isspace((unsigned char) *p); p++)
+				; /* do nothing */
+
+			if (p >= end)
+				/* gone past end => stop */
+				break;
+
+			/* update pointer to current alias */
+			aliases = p;
+		}
+	}
+
+	fclose(fp);
+
+	return HUBBUB_OK;
+}
+
+/**
+ * Free all alias data
+ *
+ * \param alloc Memory (de)allocation function
+ * \param pw
Pointer to client-specific private data
+ */
+void hubbub_aliases_destroy(hubbub_alloc alloc, void *pw)
+{
+	hubbub_aliases_canon *c, *d;
+	struct alias *a, *b;
+	int i;
+
+	/* Nothing can be released without the (de)allocation function */
+	if (alloc == NULL)
+		return;
+
+	for (i = 0; i != HASH_SIZE; i++) {
+		/* Fetch the successor before releasing each entry */
+		for (c = canon_tab[i]; c; c = d) {
+			d = c->next;
+			alloc(c, 0, pw);
+		}
+		canon_tab[i] = NULL;
+
+		for (a = alias_tab[i]; a; a = b) {
+			b = a->next;
+			alloc(a, 0, pw);
+		}
+		alias_tab[i] = NULL;
+	}
+}
+
+/**
+ * Retrieve the MIB enum value assigned to an encoding name
+ *
+ * \param alias  The alias to lookup
+ * \param len    The length of the alias string
+ * \return The MIB enum value, or 0 if not found
+ */
+uint16_t hubbub_mibenum_from_name(const char *alias, size_t len)
+{
+	hubbub_aliases_canon *c;
+
+	if (alias == NULL)
+		return 0;
+
+	c = hubbub_alias_canonicalise(alias, len);
+	if (c == NULL)
+		return 0;
+
+	return c->mib_enum;
+}
+
+/**
+ * Retrieve the canonical name of an encoding from the MIB enum
+ *
+ * \param mibenum  The MIB enum value
+ * \return Pointer to canonical name, or NULL if not found
+ */
+const char *hubbub_mibenum_to_name(uint16_t mibenum)
+{
+	int i;
+	hubbub_aliases_canon *c;
+
+	/* Canonical forms are hashed by name, so every chain is walked */
+	for (i = 0; i != HASH_SIZE; i++)
+		for (c = canon_tab[i]; c; c = c->next)
+			if (c->mib_enum == mibenum)
+				return c->name;
+
+	return NULL;
+}
+
+
+/**
+ * Retrieve the canonical form of an alias name
+ *
+ * \param alias  The alias name
+ * \param len    The length of the alias name
+ * \return Pointer to canonical form or NULL if not found
+ */
+hubbub_aliases_canon *hubbub_alias_canonicalise(const char *alias,
+		size_t len)
+{
+	uint32_t hash;
+	hubbub_aliases_canon *c;
+	struct alias *a;
+
+	if (alias == NULL)
+		return NULL;
+
+	hash = hubbub_hash_val(alias, len);
+
+	/* The name may itself be a canonical form */
+	for (c = canon_tab[hash]; c; c = c->next)
+		if (c->name_len == len &&
+				strncasecmp(c->name, alias, len) == 0)
+			break;
+	if (c)
+		return c;
+
+	/* Otherwise, look for it amongst the aliases */
+	for (a = alias_tab[hash]; a; a = a->next)
+		if (a->name_len == len &&
+				strncasecmp(a->name, alias, len) == 0)
+			break;
+	if (a)
+		return a->canon;
+
+	return NULL;
+}
+
+
+/**
+ * Create an alias
+ *
+ * \param alias  The alias name
+ * \param c      The canonical form
+ * \param alloc  Memory (de)allocation function
+ * \param pw     Pointer to client-specific private data (may be NULL)
+ * \return HUBBUB_OK on success,
+ *         HUBBUB_BADPARM if a parameter is NULL or the name is too long
+ *                        for its length to be stored,
+ *         HUBBUB_NOMEM on memory exhaustion
+ */
+hubbub_error hubbub_create_alias(const char *alias, hubbub_aliases_canon *c,
+		hubbub_alloc alloc, void *pw)
+{
+	struct alias *a;
+	uint32_t hash;
+	size_t len;
+
+	if (alias == NULL || c == NULL || alloc == NULL)
+		return HUBBUB_BADPARM;
+
+	/* name_len is only 16 bits wide; a longer name would be stored
+	 * with a wrapped length and could never be matched on lookup */
+	len = strlen(alias);
+	if (len > UINT16_MAX)
+		return HUBBUB_BADPARM;
+
+	a = alloc(NULL, sizeof(struct alias) + len + 1, pw);
+	if (a == NULL)
+		return HUBBUB_NOMEM;
+
+	a->canon = c;
+	a->name_len = (uint16_t) len;
+	memcpy(a->name, alias, len + 1); /* includes the terminator */
+
+	hash = hubbub_hash_val(alias, len);
+
+	/* Push onto the front of the bucket's chain */
+	a->next = alias_tab[hash];
+	alias_tab[hash] = a;
+
+	return HUBBUB_OK;
+}
+
+/**
+ * Create a canonical form
+ *
+ * \param canon    The canonical name
+ * \param mibenum  The MIB enum value
+ * \param alloc    Memory (de)allocation function
+ * \param pw       Pointer to client-specific private data (may be NULL)
+ * \return Pointer to canonical form or NULL on error (including a name
+ *         too long for its length to be stored)
+ */
+hubbub_aliases_canon *hubbub_create_canon(const char *canon,
+		uint16_t mibenum, hubbub_alloc alloc, void *pw)
+{
+	hubbub_aliases_canon *c;
+	uint32_t hash;
+	size_t len;
+
+	if (canon == NULL || alloc == NULL)
+		return NULL;
+
+	/* name_len is only 16 bits wide */
+	len = strlen(canon);
+	if (len > UINT16_MAX)
+		return NULL;
+
+	c = alloc(NULL, sizeof(hubbub_aliases_canon) + len + 1, pw);
+	if (c == NULL)
+		return NULL;
+
+	c->mib_enum = mibenum;
+	c->name_len = (uint16_t) len;
+	memcpy(c->name, canon, len + 1); /* includes the terminator */
+
+	hash = hubbub_hash_val(canon, len);
+
+	/* Push onto the front of the bucket's chain */
+	c->next = canon_tab[hash];
+	canon_tab[hash] = c;
+
+	return c;
+}
+
+/**
+ * Hash function
+ *
+ * \param alias  String to hash
+ * \param len    Number of bytes of ::alias to hash
+ * \return The hashed value, in the range [0, HASH_SIZE)
+ */
+uint32_t hubbub_hash_val(const char *alias, size_t len)
+{
+	/* Read bytes as unsigned so that the result does not depend
+	 * upon the signedness of plain char */
+	const unsigned char *s = (const unsigned char *) alias;
+	uint32_t h = 5381;
+
+	if (alias == NULL)
+		return 0;
+
+	while (len--)
+		h = (h * 33) ^ (*s++ & ~0x20); /* case insensitive */
+
+	return h % HASH_SIZE;
+}
+
+
+#ifndef NDEBUG
+/**
+ * Dump all alias data to stdout
+ */
+void hubbub_aliases_dump(void)
+{
+	hubbub_aliases_canon *c;
+	struct alias *a;
+	int i;
+	size_t size = 0;
+
+	/* One line per entry: bucket index, then name */
+	for (i = 0; i != HASH_SIZE; i++) {
+		for (c = canon_tab[i]; c; c = c->next) {
+			printf("%d %s\n", i, c->name);
+			size += offsetof(hubbub_aliases_canon, name) +
+					c->name_len;
+		}
+
+		for (a = alias_tab[i]; a; a = a->next) {
+			printf("%d %s\n", i, a->name);
+			size += offsetof(struct alias, name) + a->name_len;
+		}
+	}
+
+	size += (sizeof(canon_tab) / sizeof(canon_tab[0]));
+	size += (sizeof(alias_tab) / sizeof(alias_tab[0]));
+
+	/* Final line: the accumulated size figure */
+	printf("%u\n", (unsigned int) size);
+}
+#endif
diff --git a/src/charset/aliases.h b/src/charset/aliases.h
new file mode 100644
index 0000000..e0505d0
--- /dev/null
+++ b/src/charset/aliases.h
@@ -0,0 +1,42 @@
+/*
+ * This file is part of Hubbub.
+ * Licensed under the MIT License,
+ *                http://www.opensource.org/licenses/mit-license.php
+ * Copyright 2007 John-Mark Bell
+ */
+
+#ifndef hubbub_charset_aliases_h_
+#define hubbub_charset_aliases_h_
+
+#include
+
+#include
+#include
+
+typedef struct hubbub_aliases_canon {
+	struct hubbub_aliases_canon *next;
+	uint16_t mib_enum;
+	uint16_t name_len;
+	char name[1];
+} hubbub_aliases_canon;
+
+/* Load encoding aliases from file */
+hubbub_error hubbub_aliases_create(const char *filename,
+		hubbub_alloc alloc, void *pw);
+/* Destroy encoding aliases */
+void hubbub_aliases_destroy(hubbub_alloc alloc, void *pw);
+
+/* Convert an encoding alias to a MIB enum value */
+uint16_t hubbub_mibenum_from_name(const char *alias, size_t len);
+/* Convert a MIB enum value into an encoding alias */
+const char *hubbub_mibenum_to_name(uint16_t mibenum);
+
+/* Canonicalise an alias name */
+hubbub_aliases_canon *hubbub_alias_canonicalise(const char *alias,
+		size_t len);
+
+#ifndef NDEBUG
+void hubbub_aliases_dump(void);
+#endif
+
+#endif
diff --git a/src/charset/codec.c
b/src/charset/codec.c
new file mode 100644
index 0000000..12a1bdc
--- /dev/null
+++ b/src/charset/codec.c
@@ -0,0 +1,186 @@
+/*
+ * This file is part of Hubbub.
+ * Licensed under the MIT License,
+ *                http://www.opensource.org/licenses/mit-license.php
+ * Copyright 2007 John-Mark Bell
+ */
+
+#include
+
+#include "charset/aliases.h"
+
+#include "codec_impl.h"
+
+extern hubbub_charsethandler hubbub_iconv_codec_handler;
+extern hubbub_charsethandler hubbub_utf8_codec_handler;
+
+/* Handlers are tried in table order; the first to claim a charset wins */
+static hubbub_charsethandler *handler_table[] = {
+	&hubbub_utf8_codec_handler,
+	&hubbub_iconv_codec_handler,
+	NULL,
+};
+
+/**
+ * Create a charset codec
+ *
+ * \param charset  Target charset
+ * \param alloc    Memory (de)allocation function
+ * \param pw       Pointer to client-specific private data (may be NULL)
+ * \return Pointer to codec instance, or NULL on failure
+ */
+hubbub_charsetcodec *hubbub_charsetcodec_create(const char *charset,
+		hubbub_alloc alloc, void *pw)
+{
+	hubbub_charsetcodec *codec;
+	hubbub_charsethandler **handler;
+	const hubbub_aliases_canon * canon;
+
+	if (charset == NULL || alloc == NULL)
+		return NULL;
+
+	/* Canonicalise charset name. */
+	canon = hubbub_alias_canonicalise(charset, strlen(charset));
+	if (canon == NULL)
+		return NULL;
+
+	/* Search for handler class */
+	for (handler = handler_table; *handler != NULL; handler++) {
+		if ((*handler)->handles_charset(canon->name))
+			break;
+	}
+
+	/* None found */
+	if ((*handler) == NULL)
+		return NULL;
+
+	/* Instantiate class */
+	codec = (*handler)->create(canon->name, alloc, pw);
+	if (codec == NULL)
+		return NULL;
+
+	/* and initialise it */
+	codec->mibenum = canon->mib_enum;
+
+	codec->filter = NULL;
+	codec->filter_pw = NULL;
+
+	codec->errormode = HUBBUB_CHARSETCODEC_ERROR_LOOSE;
+
+	codec->alloc = alloc;
+	codec->alloc_pw = pw;
+
+	return codec;
+}
+
+/**
+ * Destroy a charset codec
+ *
+ * \param codec  The codec to destroy
+ */
+void hubbub_charsetcodec_destroy(hubbub_charsetcodec *codec)
+{
+	if (codec == NULL)
+		return;
+
+	/* Let the implementation release its own resources first ... */
+	codec->handler.destroy(codec);
+
+	/* ... then release the codec object itself */
+	codec->alloc(codec, 0, codec->alloc_pw);
+}
+
+/**
+ * Configure a charset codec
+ *
+ * \param codec   The codec to configure
+ * \param type    The codec option type to configure
+ * \param params  Option-specific parameters
+ * \return HUBBUB_OK on success,
+ *         HUBBUB_BADPARM if a parameter is NULL or ::type is not a
+ *                        recognised option type
+ */
+hubbub_error hubbub_charsetcodec_setopt(hubbub_charsetcodec *codec,
+		hubbub_charsetcodec_opttype type,
+		hubbub_charsetcodec_optparams *params)
+{
+	if (codec == NULL || params == NULL)
+		return HUBBUB_BADPARM;
+
+	switch (type) {
+	case HUBBUB_CHARSETCODEC_FILTER_FUNC:
+		codec->filter = params->filter_func.filter;
+		codec->filter_pw = params->filter_func.pw;
+		break;
+
+	case HUBBUB_CHARSETCODEC_ERROR_MODE:
+		codec->errormode = params->error_mode.mode;
+		break;
+
+	default:
+		/* Unknown option: report it rather than claim success
+		 * without having configured anything */
+		return HUBBUB_BADPARM;
+	}
+
+	return HUBBUB_OK;
+}
+
+/**
+ * Encode a chunk of UCS4 data into a codec's charset
+ *
+ * \param codec      The codec to use
+ * \param source     Pointer to pointer to source data
+ * \param sourcelen  Pointer to length (in bytes) of source data
+ * \param dest       Pointer to pointer to output buffer
+ * \param destlen    Pointer to length (in bytes) of output buffer
+ * \return HUBBUB_OK on success, appropriate error otherwise.
+ *
+ * source, sourcelen, dest and destlen will be updated appropriately on exit
+ */
+hubbub_error hubbub_charsetcodec_encode(hubbub_charsetcodec *codec,
+		const uint8_t **source, size_t *sourcelen,
+		uint8_t **dest, size_t *destlen)
+{
+	if (codec == NULL || source == NULL || *source == NULL ||
+			sourcelen == NULL || dest == NULL || *dest == NULL ||
+			destlen == NULL)
+		return HUBBUB_BADPARM;
+
+	return codec->handler.encode(codec, source, sourcelen, dest, destlen);
+}
+
+/**
+ * Decode a chunk of data in a codec's charset into UCS4
+ *
+ * \param codec      The codec to use
+ * \param source     Pointer to pointer to source data
+ * \param sourcelen  Pointer to length (in bytes) of source data
+ * \param dest       Pointer to pointer to output buffer
+ * \param destlen    Pointer to length (in bytes) of output buffer
+ * \return HUBBUB_OK on success, appropriate error otherwise.
+ *
+ * source, sourcelen, dest and destlen will be updated appropriately on exit
+ *
+ * Call this with a source length of 0 to flush any buffers.
+ */
+hubbub_error hubbub_charsetcodec_decode(hubbub_charsetcodec *codec,
+		const uint8_t **source, size_t *sourcelen,
+		uint8_t **dest, size_t *destlen)
+{
+	if (codec == NULL || source == NULL || *source == NULL ||
+			sourcelen == NULL || dest == NULL || *dest == NULL ||
+			destlen == NULL)
+		return HUBBUB_BADPARM;
+
+	return codec->handler.decode(codec, source, sourcelen, dest, destlen);
+}
+
+/**
+ * Clear a charset codec's encoding state
+ *
+ * \param codec  The codec to reset
+ * \return HUBBUB_OK on success, appropriate error otherwise
+ */
+hubbub_error hubbub_charsetcodec_reset(hubbub_charsetcodec *codec)
+{
+	if (codec == NULL)
+		return HUBBUB_BADPARM;
+
+	/* Reset filter. The client's private data is handed over as on
+	 * any other call, so the filter can find the state to clear. */
+	if (codec->filter)
+		codec->filter(HUBBUB_CHARSETCODEC_NULL, NULL, NULL,
+				codec->filter_pw);
+
+	return codec->handler.reset(codec);
+}
+
diff --git a/src/charset/codec.h b/src/charset/codec.h
new file mode 100644
index 0000000..4cd94d8
--- /dev/null
+++ b/src/charset/codec.h
@@ -0,0 +1,153 @@
+/*
+ * This file is part of Hubbub.
+ * Licensed under the MIT License,
+ *                http://www.opensource.org/licenses/mit-license.php
+ * Copyright 2007 John-Mark Bell
+ */
+
+#ifndef hubbub_charset_codec_h_
+#define hubbub_charset_codec_h_
+
+#include
+
+#include
+#include
+
+typedef struct hubbub_charsetcodec hubbub_charsetcodec;
+
+#define HUBBUB_CHARSETCODEC_NULL (0xffffffffU)
+
+/**
+ * Type of charset codec filter function
+ *
+ * \param c          UCS4 character (in host byte order) or
+ *                   HUBBUB_CHARSETCODEC_NULL to reset
+ * \param output     Pointer to location to store output buffer location
+ * \param outputlen  Pointer to location to store output buffer length
+ * \param pw         Pointer to client-specific private data
+ * \return HUBBUB_OK on success, or appropriate error otherwise.
+ *
+ * The output buffer is owned by the filter code and will not be freed by
+ * any charset codec. It should contain the replacement UCS4 character(s)
+ * for the input. The replacement characters should be in host byte order.
+ * The contents of *output and *outputlen on entry are ignored and these
+ * will be filled in by the filter code.
+ *
+ * Filters may elect to replace the input character with no output. In this
+ * case, *output should be set to NULL and *outputlen should be set to 0 and
+ * HUBBUB_OK should be returned.
+ *
+ * The output length is in terms of the number of UCS4 characters in the
+ * output buffer. i.e.:
+ *
+ * for (size_t i = 0; i < outputlen; i++) {
+ *	dest[curchar++] = output[i];
+ * }
+ *
+ * would copy the contents of the filter output buffer to the codec's output
+ * buffer.
+ */
+typedef hubbub_error (*hubbub_charsetcodec_filter)(uint32_t c,
+		uint32_t **output, size_t *outputlen, void *pw);
+
+/**
+ * Charset codec error mode
+ *
+ * A codec's error mode determines its behaviour in the face of:
+ *
+ * + characters which are unrepresentable in the destination charset (if
+ *   encoding data) or which cannot be converted to UCS4 (if decoding data).
+ * + invalid byte sequences (both encoding and decoding)
+ *
+ * The options provide a choice between the following approaches:
+ *
+ * + draconian, "stop processing" ("strict")
+ * + "replace the unrepresentable character with something else" ("loose")
+ * + "attempt to transliterate, or replace if unable" ("translit")
+ *
+ * The default error mode is "loose"; hubbub_charsetcodec_create() assigns
+ * HUBBUB_CHARSETCODEC_ERROR_LOOSE to every codec it returns.
+ *
+ *
+ * In the "loose" case, the replacement character will depend upon:
+ *
+ * + Whether the operation was encoding or decoding
+ * + If encoding, what the destination charset is.
+ *
+ * If decoding, the replacement character will be:
+ *
+ *	U+FFFD (REPLACEMENT CHARACTER)
+ *
+ * If encoding, the replacement character will be:
+ *
+ *	U+003F (QUESTION MARK) if the destination charset is not UTF-(8|16|32)
+ *	U+FFFD (REPLACEMENT CHARACTER) otherwise.
+ *
+ *
+ * In the "translit" case, the codec will attempt to transliterate into
+ * the destination charset, if encoding. If decoding, or if transliteration
+ * fails, this option is identical to "loose".
+ */
+typedef enum hubbub_charsetcodec_errormode {
+	/** Abort processing if unrepresentable character encountered */
+	HUBBUB_CHARSETCODEC_ERROR_STRICT = 0,
+	/** Replace unrepresentable characters with single alternate */
+	HUBBUB_CHARSETCODEC_ERROR_LOOSE = 1,
+	/** Transliterate unrepresentable characters, if possible */
+	HUBBUB_CHARSETCODEC_ERROR_TRANSLIT = 2,
+} hubbub_charsetcodec_errormode;
+
+/**
+ * Charset codec option types
+ *
+ * Selects which member of hubbub_charsetcodec_optparams is consulted by
+ * hubbub_charsetcodec_setopt().
+ */
+typedef enum hubbub_charsetcodec_opttype {
+	/** Register codec filter function (reads the filter_func member) */
+	HUBBUB_CHARSETCODEC_FILTER_FUNC = 0,
+	/** Set codec error mode (reads the error_mode member) */
+	HUBBUB_CHARSETCODEC_ERROR_MODE = 1,
+} hubbub_charsetcodec_opttype;
+
+/**
+ * Charset codec option parameters
+ *
+ * This is a union: only the member matching the option type passed to
+ * hubbub_charsetcodec_setopt() needs to be filled in.
+ */
+typedef union hubbub_charsetcodec_optparams {
+	/** Parameters for filter function setting */
+	struct {
+		/** Filter function */
+		hubbub_charsetcodec_filter filter;
+		/** Client-specific private data */
+		void *pw;
+	} filter_func;
+
+	/** Parameters for error mode setting */
+	struct {
+		/** The desired error handling mode */
+		hubbub_charsetcodec_errormode mode;
+	} error_mode;
+} hubbub_charsetcodec_optparams;
+
+
+/* Create a charset codec; returns NULL on failure */
+hubbub_charsetcodec *hubbub_charsetcodec_create(const char *charset,
+		hubbub_alloc alloc, void *pw);
+/* Destroy a charset codec */
+void hubbub_charsetcodec_destroy(hubbub_charsetcodec *codec);
+
+/* Configure a charset codec */
+hubbub_error hubbub_charsetcodec_setopt(hubbub_charsetcodec *codec,
+		hubbub_charsetcodec_opttype type,
+		hubbub_charsetcodec_optparams *params);
+
+/* Encode a chunk of UCS4 data into a codec's charset */
+hubbub_error hubbub_charsetcodec_encode(hubbub_charsetcodec *codec,
+		const uint8_t **source, size_t *sourcelen,
+		uint8_t **dest, size_t *destlen);
+
+/* Decode a chunk of data in a codec's charset into UCS4 */
+hubbub_error hubbub_charsetcodec_decode(hubbub_charsetcodec *codec,
+		const uint8_t **source, size_t *sourcelen,
+		uint8_t **dest, size_t *destlen);
+
+/* Reset a charset codec */
+hubbub_error hubbub_charsetcodec_reset(hubbub_charsetcodec *codec);
+
+#endif
diff --git a/src/charset/codec_iconv.c b/src/charset/codec_iconv.c
new file mode 100644
index 0000000..097e82a
--- /dev/null
+++ b/src/charset/codec_iconv.c
@@ -0,0 +1,837 @@
+/*
+ * This file is part of Hubbub.
+ * Licensed under the MIT License,
+ *                http://www.opensource.org/licenses/mit-license.php
+ * Copyright 2007 John-Mark Bell
+ */
+
+/* This codec is hideously slow. Only use it as a last resort */
+
+#include
+#include
+#include
+
+#include
+
+/* These two are for htonl / ntohl */
+#include
+#include
+
+#include "charset/aliases.h"
+#include "utils/utils.h"
+
+#include "codec_impl.h"
+
+/**
+ * A note on endianness:
+ *
+ * UCS4 is big-endian by default. Therefore, this codec reads and writes
+ * big-endian values. This is fine, and causes no problems. However, to
+ * make life easier for client-supplied filter code, character values passed
+ * to a filter and those read back from a filter are in host-endian.
+ * Therefore, we need to convert from big-endian to host-endian when passing
+ * characters to a filter and perform the reverse translation when reading
+ * characters back.
+ */
+
+/**
+ * Iconv-based charset codec
+ */
+typedef struct hubbub_iconv_codec {
+	hubbub_charsetcodec base;	/**< Base class */
+
+	iconv_t read_cd;		/**< Iconv handle for reading */
+#define INVAL_BUFSIZE (32)
+	uint8_t inval_buf[INVAL_BUFSIZE];	/**< Buffer for fixing up
+						 * incomplete input
+						 * sequences */
+	size_t inval_len;		/**< Number of bytes in inval_buf */
+
+#define READ_BUFSIZE (8)
+	uint32_t read_buf[READ_BUFSIZE];	/**< Buffer for partial
+						 * output sequences (decode)
+						 */
+	size_t read_len;		/**< Number of characters in
+					 * read_buf */
+
+	iconv_t write_cd;		/**< Iconv handle for writing */
+#define WRITE_BUFSIZE (8)
+	uint32_t write_buf[WRITE_BUFSIZE];	/**< Buffer for partial
+						 * output sequences (encode)
+						 */
+	size_t write_len;		/**< Number of characters in
+					 * write_buf */
+} hubbub_iconv_codec;
+
+
+static bool hubbub_iconv_codec_handles_charset(const char *charset);
+static hubbub_charsetcodec *hubbub_iconv_codec_create(const char *charset,
+		hubbub_alloc alloc, void *pw);
+static void hubbub_iconv_codec_destroy (hubbub_charsetcodec *codec);
+static hubbub_error hubbub_iconv_codec_encode(hubbub_charsetcodec *codec,
+		const uint8_t **source, size_t *sourcelen,
+		uint8_t **dest, size_t *destlen);
+static hubbub_error hubbub_iconv_codec_decode(hubbub_charsetcodec *codec,
+		const uint8_t **source, size_t *sourcelen,
+		uint8_t **dest, size_t *destlen);
+static hubbub_error hubbub_iconv_codec_reset(hubbub_charsetcodec *codec);
+static hubbub_error hubbub_iconv_codec_filter_decoded_char(
+		hubbub_iconv_codec *c, uint32_t ucs4, uint8_t **dest,
+		size_t *destlen);
+static bool hubbub_iconv_codec_is_unicode(hubbub_iconv_codec *c);
+static hubbub_error hubbub_iconv_codec_read_char(hubbub_iconv_codec *c,
+		const uint8_t **source, size_t *sourcelen,
+		uint8_t **dest, size_t *destlen);
+static hubbub_error hubbub_iconv_codec_write_char(hubbub_iconv_codec *c,
+		uint32_t ucs4, uint8_t **dest, size_t *destlen);
+
+/**
+ * Report whether iconv can convert a given charset to UCS-4
+ *
+ * \param charset  Name of the charset to probe
+ * \return true if a conversion descriptor could be opened, false otherwise
+ */
+bool hubbub_iconv_codec_handles_charset(const char *charset)
+{
+	iconv_t probe = iconv_open("UCS-4", charset);
+
+	if (probe == (iconv_t) -1)
+		return false;
+
+	/* The descriptor was only needed to prove the conversion exists */
+	iconv_close(probe);
+
+	return true;
+}
+
+/**
+ * Construct an iconv-based codec for a charset
+ *
+ * \param charset  The charset to read from / write to
+ * \param alloc    Memory (de)allocation function
+ * \param pw       Pointer to client-specific private data (may be NULL)
+ * \return Pointer to codec, or NULL if allocation fails or either
+ *         conversion descriptor cannot be opened
+ */
+hubbub_charsetcodec *hubbub_iconv_codec_create(const char *charset,
+		hubbub_alloc alloc, void *pw)
+{
+	hubbub_iconv_codec *self;
+
+	self = alloc(NULL, sizeof(hubbub_iconv_codec), pw);
+	if (self == NULL)
+		return NULL;
+
+	/* Decoding direction: charset -> UCS-4 */
+	self->read_cd = iconv_open("UCS-4", charset);
+	if (self->read_cd == (iconv_t) -1)
+		goto free_codec;
+
+	/* Encoding direction: UCS-4 -> charset */
+	self->write_cd = iconv_open(charset, "UCS-4");
+	if (self->write_cd == (iconv_t) -1)
+		goto close_read;
+
+	/* All three holding buffers start out empty */
+	self->inval_len = 0;
+	self->inval_buf[0] = '\0';
+
+	self->read_len = 0;
+	self->read_buf[0] = 0;
+
+	self->write_len = 0;
+	self->write_buf[0] = 0;
+
+	/* Finally, populate vtable */
+	self->base.handler.destroy = hubbub_iconv_codec_destroy;
+	self->base.handler.encode = hubbub_iconv_codec_encode;
+	self->base.handler.decode = hubbub_iconv_codec_decode;
+	self->base.handler.reset = hubbub_iconv_codec_reset;
+
+	return (hubbub_charsetcodec *) self;
+
+close_read:
+	iconv_close(self->read_cd);
+free_codec:
+	alloc(self, 0, pw);
+	return NULL;
+}
+
+/**
+ * Release the iconv descriptors held by a codec
+ *
+ * \param codec  The codec to destroy
+ *
+ * The codec object itself is not freed here.
+ */
+void hubbub_iconv_codec_destroy (hubbub_charsetcodec *codec)
+{
+	hubbub_iconv_codec *self = (hubbub_iconv_codec *) codec;
+
+	iconv_close(self->read_cd);
+	iconv_close(self->write_cd);
+}
+
+/**
+ * Encode a chunk of UCS4 data into an iconv-based codec's charset
+ *
+ * \param codec      The codec to use
+ * \param source     Pointer to pointer to source data
+ * \param sourcelen  Pointer to length (in bytes) of source data
+ * \param dest       Pointer to pointer to output buffer
+ * \param destlen    Pointer to length (in bytes) of output buffer
+ * \return HUBBUB_OK on success,
+ *         HUBBUB_NOMEM if output buffer is too small,
+ *         HUBBUB_INVALID if a character cannot be represented and the
+ *                        codec's error handling mode is set to STRICT,
+ *         any other error as a result of the failure of the
+ *                        client-provided filter function.
+ *
+ * On exit, ::source will point immediately _after_ the last input character
+ * read. Any remaining output for the character will be buffered by the
+ * codec for writing on the next call. This buffered data is post-filtering,
+ * so will not be refiltered on the next call.
+ *
+ * In the case of the filter function failing, ::source will point _at_ the
+ * last input character read; nothing will be written or buffered for the
+ * failed character. It is up to the client to fix the cause of the failure
+ * and retry the encoding process.
+ *
+ * Note that, if failure occurs whilst attempting to write any output
+ * buffered by the last call, then ::source and ::sourcelen will remain
+ * unchanged (as nothing more has been read).
+ *
+ * There is no way to determine the output character which caused a
+ * failure (as it may be one in a filter-injected replacement sequence).
+ * It is, however, possible to determine which source character caused it
+ * (this being the character immediately before the location pointed to by
+ * ::source on exit).
+ *
+ * [I.e. the process of filtering results in a potential one-to-many mapping
+ * between source characters and output characters, and identification of
+ * individual output characters is impossible.]
+ *
+ * ::sourcelen will be reduced appropriately on exit.
+ *
+ * ::dest will point immediately _after_ the last character written.
+ *
+ * ::destlen will be reduced appropriately on exit.
+ */
+hubbub_error hubbub_iconv_codec_encode(hubbub_charsetcodec *codec,
+		const uint8_t **source, size_t *sourcelen,
+		uint8_t **dest, size_t *destlen)
+{
+	hubbub_iconv_codec *c = (hubbub_iconv_codec *) codec;
+	hubbub_error error;
+
+	/* Process any outstanding characters from the previous call */
+	if (c->write_len > 0) {
+		uint32_t *pwrite = c->write_buf;
+
+		while (c->write_len > 0) {
+			error = hubbub_iconv_codec_write_char(c, pwrite[0],
+					dest, destlen);
+			if (error != HUBBUB_OK) {
+				/* Skip the invalid character, if present,
+				 * so as to avoid reprocessing it */
+				if (error == HUBBUB_INVALID) {
+					pwrite++;
+					c->write_len--;
+				}
+
+				/* Move whatever is still pending to the
+				 * front of the buffer: the next call
+				 * resumes from write_buf[0]. The regions
+				 * may overlap, hence memmove. */
+				memmove(c->write_buf, pwrite,
+					c->write_len * sizeof(c->write_buf[0]));
+
+				return error;
+			}
+
+			pwrite++;
+			c->write_len--;
+		}
+	}
+
+	/* Now process the characters for this call. Only whole UCS4
+	 * characters are consumed; fewer than 4 trailing bytes are left
+	 * in ::source for the caller. */
+	while (*sourcelen >= 4) {
+		uint32_t ucs4;
+		const uint32_t *towrite = &ucs4;
+		size_t towritelen = 1;
+
+		/* ::source carries no alignment guarantee, so copy the
+		 * character out rather than reading through a cast */
+		memcpy(&ucs4, *source, sizeof ucs4);
+
+		/* Run character we're about to output through the
+		 * registered filter, so it can replace it, if it sees
+		 * fit to do so */
+		if (c->base.filter != NULL) {
+			uint32_t *replacement;
+			size_t i;
+
+			error = c->base.filter(ntohl(ucs4),
+					&replacement, &towritelen,
+					c->base.filter_pw);
+			if (error != HUBBUB_OK) {
+				/* Don't eat character -- filter failed,
+				 * so nothing gets written or buffered.
+				 * It's up to the client to ensure that
+				 * the filter works in the case where it
+				 * reprocesses this character after the
+				 * fault is fixed up. */
+
+				return error;
+			}
+
+			/* Convert filter output to big endian UCS4 */
+			for (i = 0; i < towritelen; i++) {
+				replacement[i] = htonl(replacement[i]);
+			}
+
+			towrite = (const uint32_t *) replacement;
+		}
+
+		/* Output current character(s) */
+		while (towritelen > 0) {
+			error = hubbub_iconv_codec_write_char(c, towrite[0],
+					dest, destlen);
+
+			if (error != HUBBUB_OK) {
+				/* Skip the invalid character, if present,
+				 * so it's not reprocessed */
+				size_t skip = (error == HUBBUB_INVALID) ? 1 : 0;
+				size_t pending = towritelen - skip;
+
+				if (pending > WRITE_BUFSIZE)
+					abort();
+
+				/* Copy pending chars to the start of the
+				 * save area, for processing next call */
+				memcpy(c->write_buf, towrite + skip,
+					pending * sizeof(c->write_buf[0]));
+				c->write_len = pending;
+
+				/* Claim character we've just buffered,
+				 * so it's not reprocessed */
+				*source += 4;
+				*sourcelen -= 4;
+
+				return error;
+			}
+
+			towrite++;
+			towritelen--;
+		}
+
+		*source += 4;
+		*sourcelen -= 4;
+	}
+
+	return HUBBUB_OK;
+}
+
+/**
+ * Decode a chunk of data in an iconv-based codec's charset into UCS4
+ *
+ * \param codec      The codec to use
+ * \param source     Pointer to pointer to source data
+ * \param sourcelen  Pointer to length (in bytes) of source data
+ * \param dest       Pointer to pointer to output buffer
+ * \param destlen    Pointer to length (in bytes) of output buffer
+ * \return HUBBUB_OK on success,
+ *         HUBBUB_NOMEM if output buffer is too small,
+ *         HUBBUB_INVALID if a character cannot be represented and the
+ *                        codec's error handling mode is set to STRICT,
+ *         another error as a result of the failure of the
+ *                        client-provided filter function.
+ *
+ * On exit, ::source will point immediately _after_ the last input character
+ * read, if the result is _OK or _NOMEM. Any remaining output for the
+ * character will be buffered by the codec for writing on the next call.
+ * This buffered data is post-filtering, so will not be refiltered on the
+ * next call.
+ *
+ * In the case of the result being _INVALID or the filter function failing,
+ * ::source will point _at_ the last input character read; nothing will be
+ * written or buffered for the failed character. It is up to the client to
+ * fix the cause of the failure and retry the decoding process.
+ *
+ * Note that, if failure occurs whilst attempting to write any output
+ * buffered by the last call, then ::source and ::sourcelen will remain
+ * unchanged (as nothing more has been read).
+ *
+ * There is no way to determine the output character which caused a
+ * failure (as it may be one in a filter-injected replacement sequence).
+ * It is, however, possible to determine which source character caused it
+ * (this being the character immediately at or before the location pointed
+ * to by ::source on exit).
+ *
+ * [I.e. the process of filtering results in a potential one-to-many mapping
+ * between source characters and output characters, and identification of
+ * individual output characters is impossible.]
+ *
+ * If STRICT error handling is configured and an illegal sequence is split
+ * over two calls, then _INVALID will be returned from the second call,
+ * but ::source will point mid-way through the invalid sequence (i.e. it
+ * will be unmodified over the second call). In addition, the internal
+ * incomplete-sequence buffer will be emptied, such that subsequent calls
+ * will progress, rather than re-evaluating the same invalid sequence.
+ *
+ * ::sourcelen will be reduced appropriately on exit.
+ *
+ * ::dest will point immediately _after_ the last character written.
+ *
+ * ::destlen will be reduced appropriately on exit.
+ *
+ * Call this with a source length of 0 to flush the output buffer.
+ */ +hubbub_error hubbub_iconv_codec_decode(hubbub_charsetcodec *codec, + const uint8_t **source, size_t *sourcelen, + uint8_t **dest, size_t *destlen) +{ + hubbub_iconv_codec *c = (hubbub_iconv_codec *) codec; + hubbub_error error; + + if (c->read_len > 0) { + /* Output left over from last decode + * Attempt to finish this here */ + uint32_t *pread = c->read_buf; + + while (c->read_len > 0 && *destlen >= c->read_len * 4) { + *((uint32_t *) (void *) *dest) = pread[0]; + + *dest += 4; + *destlen -= 4; + + pread++; + c->read_len--; + } + + if (*destlen < c->read_len * 4) { + /* Run out of output buffer */ + size_t i; + + /* Shuffle remaining output down */ + for (i = 0; i < c->read_len; i++) { + c->read_buf[i] = pread[i]; + } + + return HUBBUB_NOMEM; + } + } + + if (c->inval_len > 0) { + /* The last decode ended in an incomplete sequence. + * Fill up inval_buf with data from the start of the + * new chunk and process it. */ + uint8_t *in = c->inval_buf; + size_t ol = c->inval_len; + size_t l = min(INVAL_BUFSIZE - ol - 1, *sourcelen); + size_t orig_l = l; + + memcpy(c->inval_buf + ol, *source, l); + + l += c->inval_len; + + error = hubbub_iconv_codec_read_char(c, + (const uint8_t **) &in, &l, dest, destlen); + if (error != HUBBUB_OK && error != HUBBUB_NOMEM) { + return error; + } + + + /* And now, fix everything up so the normal processing + * does the right thing. */ + *source += max((signed) (orig_l - l), 0); + *sourcelen -= max((signed) (orig_l - l), 0); + + /* Failed to resolve an incomplete character and + * ran out of buffer space. No recovery strategy + * possible, so explode everywhere. 
*/ + if ((orig_l + ol) - l == 0) + abort(); + + /* Handle memry exhaustion case from above */ + if (error != HUBBUB_OK) + return error; + } + + while (*sourcelen > 0) { + error = hubbub_iconv_codec_read_char(c, + source, sourcelen, dest, destlen); + if (error != HUBBUB_OK) { + return error; + } + } + + return HUBBUB_OK; +} + +/** + * Clear an iconv-based codec's encoding state + * + * \param codec The codec to reset + * \return HUBBUB_OK on success, appropriate error otherwise + */ +hubbub_error hubbub_iconv_codec_reset(hubbub_charsetcodec *codec) +{ + hubbub_iconv_codec *c = (hubbub_iconv_codec *) codec; + + iconv(c->read_cd, NULL, NULL, NULL, NULL); + iconv(c->write_cd, NULL, NULL, NULL, NULL); + + c->inval_buf[0] = '\0'; + c->inval_len = 0; + + c->read_buf[0] = 0; + c->read_len = 0; + + c->write_buf[0] = 0; + c->write_len = 0; + + return HUBBUB_OK; +} + +/** + * Feed a UCS4 character through the registered filter and output the result + * + * \param c Codec to use + * \param ucs4 UCS4 character (big endian) + * \param dest Pointer to pointer to output buffer + * \param destlen Pointer to output buffer length + * \return HUBBUB_OK on success, + * HUBBUB_NOMEM if output buffer is too small, + * as a result of the failure of the + * client-provided filter function. 
+ */ +hubbub_error hubbub_iconv_codec_filter_decoded_char(hubbub_iconv_codec *c, + uint32_t ucs4, uint8_t **dest, size_t *destlen) +{ + if (c->base.filter != NULL) { + uint32_t *rep; + size_t replen; + hubbub_error error; + + error = c->base.filter(ntohl(ucs4), &rep, &replen, + c->base.filter_pw); + if (error != HUBBUB_OK) { + return error; + } + + while (replen > 0 && *destlen >= replen * 4) { + *((uint32_t *) (void *) *dest) = htonl(*rep); + + *dest += 4; + *destlen -= 4; + + rep++; + replen--; + } + + if (*destlen < replen * 4) { + /* Run out of output buffer */ + size_t i; + + /* Buffer remaining output */ + c->read_len = replen; + + for (i = 0; i < replen; i++) { + c->read_buf[i] = htonl(rep[i]); + } + + return HUBBUB_NOMEM; + } + + } else { + if (*destlen < 4) { + /* Run out of output buffer */ + + c->read_len = 1; + c->read_buf[0] = ucs4; + + return HUBBUB_NOMEM; + } + + *((uint32_t *) (void *) *dest) = ucs4; + *dest += 4; + *destlen -= 4; + } + + return HUBBUB_OK; +} + +/** + * Detect if a codec's charset is Unicode capable + * + * \param c Codec to consider + * \return true if a Unicode variant, false otherwise + */ +bool hubbub_iconv_codec_is_unicode(hubbub_iconv_codec *c) +{ + static uint16_t ucs4; + static uint16_t ucs2; + static uint16_t utf8; + static uint16_t utf16; + static uint16_t utf16be; + static uint16_t utf16le; + static uint16_t utf32; + static uint16_t utf32be; + static uint16_t utf32le; + + if (ucs4 == 0) { + ucs4 = hubbub_mibenum_from_name("UCS-4", SLEN("UCS-4")); + ucs2 = hubbub_mibenum_from_name("UCS-2", SLEN("UCS-2")); + utf8 = hubbub_mibenum_from_name("UTF-8", SLEN("UTF-8")); + utf16 = hubbub_mibenum_from_name("UTF-16", SLEN("UTF-16")); + utf16be = hubbub_mibenum_from_name("UTF-16BE", + SLEN("UTF-16BE")); + utf16le = hubbub_mibenum_from_name("UTF-16LE", + SLEN("UTF-16LE")); + utf32 = hubbub_mibenum_from_name("UTF-32", SLEN("UTF-32")); + utf32be = hubbub_mibenum_from_name("UTF-32BE", + SLEN("UTF-32BE")); + utf32le = 
hubbub_mibenum_from_name("UTF-32LE", + SLEN("UTF-32LE")); + } + + return (c->base.mibenum == ucs4 || + c->base.mibenum == ucs2 || + c->base.mibenum == utf8 || + c->base.mibenum == utf16 || + c->base.mibenum == utf16be || + c->base.mibenum == utf16le || + c->base.mibenum == utf32 || + c->base.mibenum == utf32be || + c->base.mibenum == utf32le); +} + +/** + * Read a character from the codec's native charset to UCS4 (big endian) + * + * \param c The codec + * \param source Pointer to pointer to source buffer (updated on exit) + * \param sourcelen Pointer to length of source buffer (updated on exit) + * \param dest Pointer to pointer to output buffer (updated on exit) + * \param destlen Pointer to length of output buffer (updated on exit) + * \return HUBBUB_OK on success, + * HUBBUB_NOMEM if output buffer is too small, + * HUBBUB_INVALID if a character cannot be represented and the + * codec's error handling mode is set to STRICT, + * as a result of the failure of the + * client-provided filter function. + * + * On exit, ::source will point immediately _after_ the last input character + * read, if the result is _OK or _NOMEM. Any remaining output for the + * character will be buffered by the codec for writing on the next call. + * This buffered data is post-filtering, so will not be refiltered on the + * next call. + * + * In the case of the result being _INVALID or the filter function failing, + * ::source will point _at_ the last input character read; nothing will be + * written or buffered for the failed character. It is up to the client to + * fix the cause of the failure and retry the decoding process. + * + * ::sourcelen will be reduced appropriately on exit. + * + * ::dest will point immediately _after_ the last character written. + * + * ::destlen will be reduced appropriately on exit. 
+ */ +hubbub_error hubbub_iconv_codec_read_char(hubbub_iconv_codec *c, + const uint8_t **source, size_t *sourcelen, + uint8_t **dest, size_t *destlen) +{ + size_t iconv_ret; + const uint8_t *origsrc = *source; + size_t origsrclen = *sourcelen; + uint32_t ucs4; + uint8_t *pucs4 = (uint8_t *) &ucs4; + size_t sucs4 = 4; + hubbub_error error; + + /* Use iconv to convert a single character + * Side effect: Updates *source to point at next input + * character and *sourcelen to reflect reduced input length + */ + iconv_ret = iconv(c->read_cd, (char **) source, sourcelen, + (char **) (void *) &pucs4, &sucs4); + + if (iconv_ret != (size_t) -1 || + (*source != origsrc && sucs4 == 0)) { + /* Read a character */ + error = hubbub_iconv_codec_filter_decoded_char(c, + ucs4, dest, destlen); + if (error != HUBBUB_OK && error != HUBBUB_NOMEM) { + /* filter function failed; restore source pointers */ + *source = origsrc; + *sourcelen = origsrclen; + } + + /* Clear inval buffer */ + c->inval_buf[0] = '\0'; + c->inval_len = 0; + + return error; + } else if (errno == E2BIG) { + /* Should never happen */ + abort(); + } else if (errno == EINVAL) { + /* Incomplete input sequence */ + if (*sourcelen > INVAL_BUFSIZE) + abort(); + + memmove(c->inval_buf, (const char *) *source, *sourcelen); + c->inval_buf[*sourcelen] = '\0'; + c->inval_len = *sourcelen; + + *source += *sourcelen; + *sourcelen = 0; + + return HUBBUB_OK; + } else if (errno == EILSEQ) { + /* Illegal input sequence */ + bool found = false; + const uint8_t *oldsrc; + size_t oldsrclen; + + /* Clear inval buffer */ + c->inval_buf[0] = '\0'; + c->inval_len = 0; + + /* Strict errormode; simply flag invalid character */ + if (c->base.errormode == HUBBUB_CHARSETCODEC_ERROR_STRICT) { + /* restore source pointers */ + *source = origsrc; + *sourcelen = origsrclen; + + return HUBBUB_INVALID; + } + + /* Ok, this becomes problematic. 
The iconv API here + * is particularly unhelpful; *source will point at + * the _start_ of the illegal sequence. This means + * that we must find the end of the sequence */ + + /* Search for the start of the next valid input + * sequence (or the end of the input stream) */ + while (*sourcelen > 1) { + pucs4 = (uint8_t *) &ucs4; + sucs4 = 4; + + (*source)++; + (*sourcelen)--; + + oldsrc = *source; + oldsrclen = *sourcelen; + + iconv_ret = iconv(c->read_cd, + (char **) source, sourcelen, + (char **) (void *) &pucs4, &sucs4); + if (iconv_ret != (size_t) -1 || errno != EILSEQ) { + found = true; + break; + } + } + + if (found) { + /* Found start of next valid sequence */ + *source = oldsrc; + *sourcelen = oldsrclen; + } else { + /* Not found - skip last byte in buffer */ + (*source)++; + (*sourcelen)--; + + if (*sourcelen != 0) + abort(); + } + + /* output U+FFFD and continue processing. */ + error = hubbub_iconv_codec_filter_decoded_char(c, + htonl(0xFFFD), dest, destlen); + if (error != HUBBUB_OK && error != HUBBUB_NOMEM) { + /* filter function failed; restore source pointers */ + *source = origsrc; + *sourcelen = origsrclen; + } + + return error; + } + + return HUBBUB_OK; +} + +/** + * Write a UCS4 character in a codec's native charset + * + * \param c The codec + * \param ucs4 The UCS4 character to write (big endian) + * \param dest Pointer to pointer to output buffer (updated on exit) + * \param destlen Pointer to length of output buffer (updated on exit) + * \return HUBBUB_OK on success, + * HUBBUB_NOMEM if output buffer is too small, + * HUBBUB_INVALID if character cannot be represented and the + * codec's error handling mode is set to STRICT. 
+ */ +hubbub_error hubbub_iconv_codec_write_char(hubbub_iconv_codec *c, + uint32_t ucs4, uint8_t **dest, size_t *destlen) +{ + size_t iconv_ret; + uint8_t *pucs4 = (uint8_t *) &ucs4; + size_t sucs4 = 4; + uint8_t *origdest = *dest; + + iconv_ret = iconv(c->write_cd, (char **) (void *) &pucs4, + &sucs4, (char **) dest, destlen); + + if (iconv_ret == (size_t) -1 && errno == E2BIG) { + /* Output buffer is too small */ + return HUBBUB_NOMEM; + } else if (iconv_ret == (size_t) -1 && errno == EILSEQ) { + /* Illegal multibyte sequence */ + /* This should never happen */ + abort(); + } else if (iconv_ret == (size_t) -1 && errno == EINVAL) { + /* Incomplete input character */ + /* This should never happen */ + abort(); + } else if (*dest == origdest) { + /* Nothing was output */ + switch (c->base.errormode) { + case HUBBUB_CHARSETCODEC_ERROR_STRICT: + return HUBBUB_INVALID; + + case HUBBUB_CHARSETCODEC_ERROR_TRANSLIT: + /** \todo transliteration */ + case HUBBUB_CHARSETCODEC_ERROR_LOOSE: + { + pucs4 = (uint8_t *) &ucs4; + sucs4 = 4; + + ucs4 = hubbub_iconv_codec_is_unicode(c) + ? htonl(0xFFFD) : htonl(0x3F); + + iconv_ret = iconv(c->write_cd, + (char **) (void *) &pucs4, &sucs4, + (char **) dest, destlen); + + if (iconv_ret == (size_t) -1 && errno == E2BIG) { + return HUBBUB_NOMEM; + } else if (iconv_ret == (size_t) -1 && + errno == EILSEQ) { + /* Illegal multibyte sequence */ + /* This should never happen */ + abort(); + } else if (iconv_ret == (size_t) -1 && + errno == EINVAL) { + /* Incomplete input character */ + /* This should never happen */ + abort(); + } + } + break; + } + } + + return HUBBUB_OK; +} + +const hubbub_charsethandler hubbub_iconv_codec_handler = { + hubbub_iconv_codec_handles_charset, + hubbub_iconv_codec_create +}; diff --git a/src/charset/codec_impl.h b/src/charset/codec_impl.h new file mode 100644 index 0000000..eb5116b --- /dev/null +++ b/src/charset/codec_impl.h @@ -0,0 +1,51 @@ +/* + * This file is part of Hubbub. 
+ * Licensed under the MIT License, + * http://www.opensource.org/licenses/mit-license.php + * Copyright 2007 John-Mark Bell + */ + +#ifndef hubbub_charset_codecimpl_h_ +#define hubbub_charset_codecimpl_h_ + +#include +#include + +#include "codec.h" + +/** + * Core charset codec definition; implementations extend this + */ +struct hubbub_charsetcodec { + uint16_t mibenum; /**< MIB enum for charset */ + + hubbub_charsetcodec_filter filter; /**< filter function */ + void *filter_pw; /**< filter private word */ + + hubbub_charsetcodec_errormode errormode; /**< error mode */ + + hubbub_alloc alloc; /**< allocation function */ + void *alloc_pw; /**< private word */ + + struct { + void (*destroy)(hubbub_charsetcodec *codec); + hubbub_error (*encode)(hubbub_charsetcodec *codec, + const uint8_t **source, size_t *sourcelen, + uint8_t **dest, size_t *destlen); + hubbub_error (*decode)(hubbub_charsetcodec *codec, + const uint8_t **source, size_t *sourcelen, + uint8_t **dest, size_t *destlen); + hubbub_error (*reset)(hubbub_charsetcodec *codec); + } handler; /**< Vtable for handler code */ +}; + +/** + * Codec factory component definition + */ +typedef struct hubbub_charsethandler { + bool (*handles_charset)(const char *charset); + hubbub_charsetcodec *(*create)(const char *charset, + hubbub_alloc alloc, void *pw); +} hubbub_charsethandler; + +#endif diff --git a/src/charset/codec_utf8.c b/src/charset/codec_utf8.c new file mode 100644 index 0000000..86d667f --- /dev/null +++ b/src/charset/codec_utf8.c @@ -0,0 +1,620 @@ +/* + * This file is part of Hubbub. 
+ * Licensed under the MIT License, + * http://www.opensource.org/licenses/mit-license.php + * Copyright 2007 John-Mark Bell + */ + +#include +#include + +/* These two are for htonl / ntohl */ +#include +#include + +#include "charset/aliases.h" +#include "utils/utf8.h" +#include "utils/utils.h" + +#include "codec_impl.h" + +/** + * UTF-8 charset codec + */ +typedef struct hubbub_utf8_codec { + hubbub_charsetcodec base; /**< Base class */ + +#define INVAL_BUFSIZE (32) + uint8_t inval_buf[INVAL_BUFSIZE]; /**< Buffer for fixing up + * incomplete input + * sequences */ + size_t inval_len; /*< Byte length of inval_buf **/ + +#define READ_BUFSIZE (8) + uint32_t read_buf[READ_BUFSIZE]; /**< Buffer for partial + * output sequences (decode) + * (host-endian) */ + size_t read_len; /**< Character length of read_buf */ + +#define WRITE_BUFSIZE (8) + uint32_t write_buf[WRITE_BUFSIZE]; /**< Buffer for partial + * output sequences (encode) + * (host-endian) */ + size_t write_len; /**< Character length of write_buf */ + +} hubbub_utf8_codec; + +static bool hubbub_utf8_codec_handles_charset(const char *charset); +static hubbub_charsetcodec *hubbub_utf8_codec_create(const char *charset, + hubbub_alloc alloc, void *pw); +static void hubbub_utf8_codec_destroy (hubbub_charsetcodec *codec); +static hubbub_error hubbub_utf8_codec_encode(hubbub_charsetcodec *codec, + const uint8_t **source, size_t *sourcelen, + uint8_t **dest, size_t *destlen); +static hubbub_error hubbub_utf8_codec_decode(hubbub_charsetcodec *codec, + const uint8_t **source, size_t *sourcelen, + uint8_t **dest, size_t *destlen); +static hubbub_error hubbub_utf8_codec_reset(hubbub_charsetcodec *codec); +static hubbub_error hubbub_utf8_codec_read_char(hubbub_utf8_codec *c, + const uint8_t **source, size_t *sourcelen, + uint8_t **dest, size_t *destlen); +static hubbub_error hubbub_utf8_codec_filter_decoded_char( + hubbub_utf8_codec *c, + uint32_t ucs4, uint8_t **dest, size_t *destlen); + +/** + * Determine whether this 
codec handles a specific charset + * + * \param charset Charset to test + * \return true if handleable, false otherwise + */ +bool hubbub_utf8_codec_handles_charset(const char *charset) +{ + return hubbub_mibenum_from_name(charset, strlen(charset)) == + hubbub_mibenum_from_name("UTF-8", SLEN("UTF-8")); +} + +/** + * Create a utf8 codec + * + * \param charset The charset to read from / write to + * \param alloc Memory (de)allocation function + * \param pw Pointer to client-specific private data (may be NULL) + * \return Pointer to codec, or NULL on failure + */ +hubbub_charsetcodec *hubbub_utf8_codec_create(const char *charset, + hubbub_alloc alloc, void *pw) +{ + hubbub_utf8_codec *codec; + + UNUSED(charset); + + codec = alloc(NULL, sizeof(hubbub_utf8_codec), pw); + if (codec == NULL) + return NULL; + + codec->inval_buf[0] = '\0'; + codec->inval_len = 0; + + codec->read_buf[0] = 0; + codec->read_len = 0; + + codec->write_buf[0] = 0; + codec->write_len = 0; + + /* Finally, populate vtable */ + codec->base.handler.destroy = hubbub_utf8_codec_destroy; + codec->base.handler.encode = hubbub_utf8_codec_encode; + codec->base.handler.decode = hubbub_utf8_codec_decode; + codec->base.handler.reset = hubbub_utf8_codec_reset; + + return (hubbub_charsetcodec *) codec; +} + +/** + * Destroy a utf8 codec + * + * \param codec The codec to destroy + */ +void hubbub_utf8_codec_destroy (hubbub_charsetcodec *codec) +{ + UNUSED(codec); +} + +/** + * Encode a chunk of UCS4 data into utf8 + * + * \param codec The codec to use + * \param source Pointer to pointer to source data + * \param sourcelen Pointer to length (in bytes) of source data + * \param dest Pointer to pointer to output buffer + * \param destlen Pointer to length (in bytes) of output buffer + * \return HUBBUB_OK on success, + * HUBBUB_NOMEM if output buffer is too small, + * HUBBUB_INVALID if a character cannot be represented and the + * codec's error handling mode is set to STRICT, + * as a result of the failure of the + 
* client-provided filter function. + * + * On exit, ::source will point immediately _after_ the last input character + * read. Any remaining output for the character will be buffered by the + * codec for writing on the next call. This buffered data is post-filtering, + * so will not be refiltered on the next call. + * + * In the case of the filter function failing, ::source will point _at_ the + * last input character read; nothing will be written or buffered for the + * failed character. It is up to the client to fix the cause of the failure + * and retry the encoding process. + * + * Note that, if failure occurs whilst attempting to write any output + * buffered by the last call, then ::source and ::sourcelen will remain + * unchanged (as nothing more has been read). + * + * There is no way to determine the output character which caused a + * failure (as it may be one in a filter-injected replacement sequence). + * It is, however, possible to determine which source character caused it + * (this being the character immediately before the location pointed to by + * ::source on exit). + * + * [I.e. the process of filtering results in a potential one-to-many mapping + * between source characters and output characters, and identification of + * individual output characters is impossible.] + * + * ::sourcelen will be reduced appropriately on exit. + * + * ::dest will point immediately _after_ the last character written. + * + * ::destlen will be reduced appropriately on exit. 
+ */ +hubbub_error hubbub_utf8_codec_encode(hubbub_charsetcodec *codec, + const uint8_t **source, size_t *sourcelen, + uint8_t **dest, size_t *destlen) +{ + hubbub_utf8_codec *c = (hubbub_utf8_codec *) codec; + uint32_t ucs4; + uint32_t *towrite; + size_t towritelen; + hubbub_error error; + + /* Process any outstanding characters from the previous call */ + if (c->write_len > 0) { + uint32_t *pwrite = c->write_buf; + uint8_t buf[6]; + size_t len; + + while (c->write_len > 0) { + error = hubbub_utf8_from_ucs4(pwrite[0], buf, &len); + if (error != HUBBUB_OK) + abort(); + + if (*destlen < len) { + /* Insufficient output buffer space */ + for (len = 0; len < c->write_len; len++) + c->write_buf[len] = pwrite[len]; + + return HUBBUB_NOMEM; + } + + memcpy(*dest, buf, len); + + *dest += len; + *destlen -= len; + + pwrite++; + c->write_len--; + } + } + + /* Now process the characters for this call */ + while (*sourcelen > 0) { + ucs4 = ntohl(*((uint32_t *) (void *) *source)); + towrite = &ucs4; + towritelen = 1; + + /* Run character we're about to output through the + * registered filter, so it can replace it. */ + if (c->base.filter != NULL) { + error = c->base.filter(ucs4, + &towrite, &towritelen, + c->base.filter_pw); + if (error != HUBBUB_OK) + return error; + } + + /* Output current characters */ + while (towritelen > 0) { + uint8_t buf[6]; + size_t len; + + error = hubbub_utf8_from_ucs4(towrite[0], buf, &len); + if (error != HUBBUB_OK) + abort(); + + if (*destlen < len) { + /* Insufficient output space */ + if (towritelen >= WRITE_BUFSIZE) + abort(); + + c->write_len = towritelen; + + /* Copy pending chars to save area, for + * processing next call. 
*/ + for (len = 0; len < towritelen; len++) + c->write_buf[len] = towrite[len]; + + /* Claim character we've just buffered, + * so it's not reprocessed */ + *source += 4; + *sourcelen -= 4; + + return HUBBUB_NOMEM; + } + + memcpy(*dest, buf, len); + + *dest += len; + *destlen -= len; + + towrite++; + towritelen--; + } + + *source += 4; + *sourcelen -= 4; + } + + return HUBBUB_OK; +} + +/** + * Decode a chunk of utf8 data into UCS4 + * + * \param codec The codec to use + * \param source Pointer to pointer to source data + * \param sourcelen Pointer to length (in bytes) of source data + * \param dest Pointer to pointer to output buffer + * \param destlen Pointer to length (in bytes) of output buffer + * \return HUBBUB_OK on success, + * HUBBUB_NOMEM if output buffer is too small, + * HUBBUB_INVALID if a character cannot be represented and the + * codec's error handling mode is set to STRICT, + * as a result of the failure of the + * client-provided filter function. + * + * On exit, ::source will point immediately _after_ the last input character + * read, if the result is _OK or _NOMEM. Any remaining output for the + * character will be buffered by the codec for writing on the next call. + * This buffered data is post-filtering, so will not be refiltered on the + * next call. + * + * In the case of the result being _INVALID or the filter function failing, + * ::source will point _at_ the last input character read; nothing will be + * written or buffered for the failed character. It is up to the client to + * fix the cause of the failure and retry the decoding process. + * + * Note that, if failure occurs whilst attempting to write any output + * buffered by the last call, then ::source and ::sourcelen will remain + * unchanged (as nothing more has been read). + * + * There is no way to determine the output character which caused a + * failure (as it may be one in a filter-injected replacement sequence). 
+ * It is, however, possible to determine which source character caused it + * (this being the character immediately at or before the location pointed + * to by ::source on exit). + * + * [I.e. the process of filtering results in a potential one-to-many mapping + * between source characters and output characters, and identification of + * individual output characters is impossible.] + * + * If STRICT error handling is configured and an illegal sequence is split + * over two calls, then _INVALID will be returned from the second call, + * but ::source will point mid-way through the invalid sequence (i.e. it + * will be unmodified over the second call). In addition, the internal + * incomplete-sequence buffer will be emptied, such that subsequent calls + * will progress, rather than re-evaluating the same invalid sequence. + * + * ::sourcelen will be reduced appropriately on exit. + * + * ::dest will point immediately _after_ the last character written. + * + * ::destlen will be reduced appropriately on exit. + * + * Call this with a source length of 0 to flush the output buffer. + */ +hubbub_error hubbub_utf8_codec_decode(hubbub_charsetcodec *codec, + const uint8_t **source, size_t *sourcelen, + uint8_t **dest, size_t *destlen) +{ + hubbub_utf8_codec *c = (hubbub_utf8_codec *) codec; + hubbub_error error; + + if (c->read_len > 0) { + /* Output left over from last decode */ + uint32_t *pread = c->read_buf; + + while (c->read_len > 0 && *destlen >= c->read_len * 4) { + *((uint32_t *) (void *) *dest) = htonl(pread[0]); + + *dest += 4; + *destlen -= 4; + + pread++; + c->read_len--; + } + + if (*destlen < c->read_len * 4) { + /* Ran out of output buffer */ + size_t i; + + /* Shuffle remaining output down */ + for (i = 0; i < c->read_len; i++) + c->read_buf[i] = pread[i]; + + return HUBBUB_NOMEM; + } + } + + if (c->inval_len > 0) { + /* The last decode ended in an incomplete sequence. + * Fill up inval_buf with data from the start of the + * new chunk and process it. 
*/ + uint8_t *in = c->inval_buf; + size_t ol = c->inval_len; + size_t l = min(INVAL_BUFSIZE - ol - 1, *sourcelen); + size_t orig_l = l; + + memcpy(c->inval_buf + ol, *source, l); + + l += c->inval_len; + + error = hubbub_utf8_codec_read_char(c, + (const uint8_t **) &in, &l, dest, destlen); + if (error != HUBBUB_OK && error != HUBBUB_NOMEM) { + return error; + } + + /* And now, fix up source pointers */ + *source += max((signed) (orig_l - l), 0); + *sourcelen -= max((signed) (orig_l - l), 0); + + /* Failed to resolve an incomplete character and + * ran out of buffer space. No recovery strategy + * possible, so explode everywhere. */ + if ((orig_l + ol) - l == 0) + abort(); + + /* Report memory exhaustion case from above */ + if (error != HUBBUB_OK) + return error; + } + + /* Finally, the "normal" case; process all outstanding characters */ + while (*sourcelen > 0) { + error = hubbub_utf8_codec_read_char(c, + source, sourcelen, dest, destlen); + if (error != HUBBUB_OK) { + return error; + } + } + + return HUBBUB_OK; +} + +/** + * Clear a utf8 codec's encoding state + * + * \param codec The codec to reset + * \return HUBBUB_OK on success, appropriate error otherwise + */ +hubbub_error hubbub_utf8_codec_reset(hubbub_charsetcodec *codec) +{ + hubbub_utf8_codec *c = (hubbub_utf8_codec *) codec; + + c->inval_buf[0] = '\0'; + c->inval_len = 0; + + c->read_buf[0] = 0; + c->read_len = 0; + + c->write_buf[0] = 0; + c->write_len = 0; + + return HUBBUB_OK; +} + + +/** + * Read a character from the UTF-8 to UCS4 (big endian) + * + * \param c The codec + * \param source Pointer to pointer to source buffer (updated on exit) + * \param sourcelen Pointer to length of source buffer (updated on exit) + * \param dest Pointer to pointer to output buffer (updated on exit) + * \param destlen Pointer to length of output buffer (updated on exit) + * \return HUBBUB_OK on success, + * HUBBUB_NOMEM if output buffer is too small, + * HUBBUB_INVALID if a character cannot be represented and the + 
* codec's error handling mode is set to STRICT, + * as a result of the failure of the + * client-provided filter function. + * + * On exit, ::source will point immediately _after_ the last input character + * read, if the result is _OK or _NOMEM. Any remaining output for the + * character will be buffered by the codec for writing on the next call. + * This buffered data is post-filtering, so will not be refiltered on the + * next call. + * + * In the case of the result being _INVALID or the filter function failing, + * ::source will point _at_ the last input character read; nothing will be + * written or buffered for the failed character. It is up to the client to + * fix the cause of the failure and retry the decoding process. + * + * ::sourcelen will be reduced appropriately on exit. + * + * ::dest will point immediately _after_ the last character written. + * + * ::destlen will be reduced appropriately on exit. + */ +hubbub_error hubbub_utf8_codec_read_char(hubbub_utf8_codec *c, + const uint8_t **source, size_t *sourcelen, + uint8_t **dest, size_t *destlen) +{ + uint32_t ucs4; + size_t sucs4; + hubbub_error error; + + /* Convert a single character */ + error = hubbub_utf8_to_ucs4(*source, *sourcelen, &ucs4, &sucs4); + if (error == HUBBUB_OK) { + /* Read a character */ + error = hubbub_utf8_codec_filter_decoded_char(c, + ucs4, dest, destlen); + if (error == HUBBUB_OK || error == HUBBUB_NOMEM) { + /* filter function succeeded; update source pointers */ + *source += sucs4; + *sourcelen -= sucs4; + } + + /* Clear inval buffer */ + c->inval_buf[0] = '\0'; + c->inval_len = 0; + + return error; + } else if (error == HUBBUB_NEEDDATA) { + /* Incomplete input sequence */ + if (*sourcelen > INVAL_BUFSIZE) + abort(); + + memmove(c->inval_buf, (char *) *source, *sourcelen); + c->inval_buf[*sourcelen] = '\0'; + c->inval_len = *sourcelen; + + *source += *sourcelen; + *sourcelen = 0; + + return HUBBUB_OK; + } else if (error == HUBBUB_INVALID) { + /* Illegal input sequence */ + 
uint32_t nextchar; + + /* Clear inval buffer */ + c->inval_buf[0] = '\0'; + c->inval_len = 0; + + /* Strict errormode; simply flag invalid character */ + if (c->base.errormode == HUBBUB_CHARSETCODEC_ERROR_STRICT) { + return HUBBUB_INVALID; + } + + /* Find next valid UTF-8 sequence. + * We're processing client-provided data, so let's + * be paranoid about its validity. */ + error = hubbub_utf8_next_paranoid(*source, *sourcelen, + 0, &nextchar); + if (error != HUBBUB_OK) { + if (error == HUBBUB_NEEDDATA) { + /* Need more data to be sure */ + if (*sourcelen > INVAL_BUFSIZE) + abort(); + + memmove(c->inval_buf, (char *) *source, + *sourcelen); + c->inval_buf[*sourcelen] = '\0'; + c->inval_len = *sourcelen; + + *source += *sourcelen; + *sourcelen = 0; + + nextchar = 0; + } else { + return error; + } + } + + /* output U+FFFD and continue processing. */ + error = hubbub_utf8_codec_filter_decoded_char(c, + 0xFFFD, dest, destlen); + if (error == HUBBUB_OK || error == HUBBUB_NOMEM) { + /* filter function succeeded; update source pointers */ + *source += nextchar; + *sourcelen -= nextchar; + } + + return error; + } + + return HUBBUB_OK; +} + +/** + * Feed a UCS4 character through the registered filter and output the result + * + * \param c Codec to use + * \param ucs4 UCS4 character (host endian) + * \param dest Pointer to pointer to output buffer + * \param destlen Pointer to output buffer length + * \return HUBBUB_OK on success, + * HUBBUB_NOMEM if output buffer is too small, + * as a result of the failure of the + * client-provided filter function. 
+ */ +hubbub_error hubbub_utf8_codec_filter_decoded_char(hubbub_utf8_codec *c, + uint32_t ucs4, uint8_t **dest, size_t *destlen) +{ + if (c->base.filter != NULL) { + uint32_t *rep; + size_t replen; + hubbub_error error; + + error = c->base.filter(ucs4, &rep, &replen, + c->base.filter_pw); + if (error != HUBBUB_OK) { + return error; + } + + while (replen > 0 && *destlen >= replen * 4) { + *((uint32_t *) (void *) *dest) = htonl(*rep); + + *dest += 4; + *destlen -= 4; + + rep++; + replen--; + } + + if (*destlen < replen * 4) { + /* Run out of output buffer */ + size_t i; + + /* Buffer remaining output */ + c->read_len = replen; + + for (i = 0; i < replen; i++) { + c->read_buf[i] = rep[i]; + } + + return HUBBUB_NOMEM; + } + + } else { + if (*destlen < 4) { + /* Run out of output buffer */ + c->read_len = 1; + c->read_buf[0] = ucs4; + + return HUBBUB_NOMEM; + } + + *((uint32_t *) (void *) *dest) = htonl(ucs4); + *dest += 4; + *destlen -= 4; + } + + return HUBBUB_OK; +} + + +const hubbub_charsethandler hubbub_utf8_codec_handler = { + hubbub_utf8_codec_handles_charset, + hubbub_utf8_codec_create +}; diff --git a/src/charset/detect.c b/src/charset/detect.c new file mode 100644 index 0000000..8ff3b87 --- /dev/null +++ b/src/charset/detect.c @@ -0,0 +1,673 @@ +/* + * This file is part of Hubbub. 
+ * Licensed under the MIT License, + * http://www.opensource.org/licenses/mit-license.php + * Copyright 2007 John-Mark Bell + */ + +#include +#include + +#include "charset/aliases.h" +#include "utils/utils.h" + +#include "detect.h" + +static uint16_t hubbub_charset_read_bom(const uint8_t **data, size_t *len); +static uint16_t hubbub_charset_scan_meta(const uint8_t *data, size_t len); +static uint16_t hubbub_charset_parse_attributes(const uint8_t **pos, + const uint8_t *end); +static uint16_t hubbub_charset_parse_content(const uint8_t *value, + uint32_t valuelen); +static bool hubbub_charset_get_attribute(const uint8_t **data, + const uint8_t *end, + const uint8_t **name, uint32_t *namelen, + const uint8_t **value, uint32_t *valuelen); + +/** + * Extract a charset from a chunk of data + * + * \param data Pointer to pointer to buffer containing data + * \param len Pointer to buffer length + * \param mibenum Pointer to location to store MIB enum representing charset + * \param source Pointer to location to receive charset source + * \return HUBBUB_OK on success, appropriate error otherwise + * + * The data pointer and length will be modified by this function if + * a byte order mark is encountered at the start of the buffer. The updated + * data pointer will point to the first byte in the buffer after the BOM. + * The length will be modified appropriately. + * + * The larger a chunk of data fed to this routine, the better, as it allows + * charset autodetection access to a larger dataset for analysis. 
+ */ +hubbub_error hubbub_charset_extract(const uint8_t **data, size_t *len, + uint16_t *mibenum, hubbub_charset_source *source) +{ + uint16_t charset = 0; + + if (data == NULL || *data == NULL || len == NULL || + mibenum == NULL || source == NULL) + return HUBBUB_BADPARM; + + /* We need at least 4 bytes of data */ + if (*len < 4) + goto default_encoding; + + /* First, look for a BOM */ + charset = hubbub_charset_read_bom(data, len); + if (charset != 0) { + *mibenum = charset; + *source = HUBBUB_CHARSET_DOCUMENT; + + return HUBBUB_OK; + } + + /* No BOM was found, so we must look for a meta charset within + * the document itself. */ + charset = hubbub_charset_scan_meta(*data, *len); + if (charset != 0) { + /* ISO-8859-1 becomes Windows-1252 */ + if (charset == hubbub_mibenum_from_name("ISO-8859-1", + SLEN("ISO-8859-1"))) { + charset = hubbub_mibenum_from_name("Windows-1252", + SLEN("Windows-1252")); + /* Fallback to 8859-1 if that failed */ + if (charset == 0) + charset = hubbub_mibenum_from_name( + "ISO-8859-1", SLEN("ISO-8859-1")); + } + + /* If we've encountered a meta charset for a non-ASCII- + * compatible encoding, don't trust it. + * + * Firstly, it should have been sent with a BOM (and thus + * detected above). + * + * Secondly, we've just used an ASCII-only parser to + * extract the encoding from the document. Therefore, + * the document plainly isn't what the meta charset + * claims it is. + * + * What we do in this case is to ignore the meta charset's + * claims and leave the charset determination to the + * autodetection routines (or the fallback case if they + * fail). 
+ */ + if (charset != hubbub_mibenum_from_name("UTF-16", + SLEN("UTF-16")) && + charset != hubbub_mibenum_from_name("UTF-16LE", + SLEN("UTF-16LE")) && + charset != hubbub_mibenum_from_name("UTF-16BE", + SLEN("UTF-16BE")) && + charset != hubbub_mibenum_from_name("UTF-32", + SLEN("UTF-32")) && + charset != hubbub_mibenum_from_name("UTF-32LE", + SLEN("UTF-32LE")) && + charset != hubbub_mibenum_from_name("UTF-32BE", + SLEN("UTF-32BE"))) { + + *mibenum = charset; + *source = HUBBUB_CHARSET_DOCUMENT; + + return HUBBUB_OK; + } + } + + /* No charset was specified within the document, attempt to + * autodetect the encoding from the data that we have available. */ + + /** \todo Charset autodetection */ + + /* We failed to autodetect a charset, so use the default fallback */ +default_encoding: + + charset = hubbub_mibenum_from_name("Windows-1252", + SLEN("Windows-1252")); + if (charset == 0) + charset = hubbub_mibenum_from_name("ISO-8859-1", + SLEN("ISO-8859-1")); + + *mibenum = charset; + *source = HUBBUB_CHARSET_DEFAULT; + + return HUBBUB_OK; +} + + +/** + * Inspect the beginning of a buffer of data for the presence of a + * UTF Byte Order Mark. + * + * \param data Pointer to pointer to buffer containing data + * \param len Pointer to buffer length + * \return MIB enum representing encoding described by BOM, or 0 if not found + * + * If a BOM is found, the data pointer will be modified to point to the first + * byte in the buffer after the BOM. The length will also be modified + * appropriately. 
+ */ +uint16_t hubbub_charset_read_bom(const uint8_t **data, size_t *len) +{ + if (data == NULL || *data == NULL || len == NULL) + return 0; + + /* We require at least 4 bytes of data */ + if (*len < 4) + return 0; + +#define UTF32BOM_LEN (4) +#define UTF16BOM_LEN (2) +#define UTF8BOM_LEN (3) + + if ((*data)[0] == 0x00 && (*data)[1] == 0x00 && + (*data)[2] == 0xFE && (*data)[3] == 0xFF) { + *data += UTF32BOM_LEN; + *len -= UTF32BOM_LEN; + + return hubbub_mibenum_from_name("UTF-32BE", + SLEN("UTF-32BE")); + } else if ((*data)[0] == 0xFF && (*data)[1] == 0xFE && + (*data)[2] == 0x00 && (*data)[3] == 0x00) { + *data += UTF32BOM_LEN; + *len -= UTF32BOM_LEN; + + return hubbub_mibenum_from_name("UTF-32LE", + SLEN("UTF-32LE")); + } else if ((*data)[0] == 0xFE && (*data)[1] == 0xFF) { + *data += UTF16BOM_LEN; + *len -= UTF16BOM_LEN; + + return hubbub_mibenum_from_name("UTF-16BE", + SLEN("UTF-16BE")); + } else if ((*data)[0] == 0xFF && (*data)[1] == 0xFE) { + *data += UTF16BOM_LEN; + *len -= UTF16BOM_LEN; + + return hubbub_mibenum_from_name("UTF-16LE", + SLEN("UTF-16LE")); + } else if ((*data)[0] == 0xEF && (*data)[1] == 0xBB && + (*data)[2] == 0xBF) { + *data += UTF8BOM_LEN; + *len -= UTF8BOM_LEN; + + return hubbub_mibenum_from_name("UTF-8", SLEN("UTF-8")); + } + +#undef UTF32BOM_LEN +#undef UTF16BOM_LEN +#undef UTF8BOM_LEN + + return 0; +} + +#define PEEK(a) \ + (pos < end - SLEN(a) && \ + strncasecmp((const char *) pos, a, SLEN(a)) == 0) + +#define ADVANCE(a) \ + while (pos < end - SLEN(a)) { \ + if (PEEK(a)) \ + break; \ + pos++; \ + } \ + \ + if (pos == end - SLEN(a)) \ + return 0; + +#define ISSPACE(a) \ + (a == 0x09 || a == 0x0a || a == 0x0b || \ + a == 0x0c || a == 0x0d || a == 0x20) + +/** + * Search for a meta charset within a buffer of data + * + * \param data Pointer to buffer containing data + * \param len Length of buffer in data + * \return MIB enum representing encoding, or 0 if none found + */ +uint16_t hubbub_charset_scan_meta(const uint8_t *data, size_t 
len) +{ + const uint8_t *pos = data; + const uint8_t *end; + uint16_t mibenum; + + if (data == NULL) + return 0; + + end = pos + min(512, len); + + /* 1. */ + while (pos < end) { + /* a */ + if (PEEK(""); + /* b */ + } else if (PEEK("= end - 1) + return 0; + + if (ISSPACE(*(pos + SLEN("= end) + return 0; + } + /* c */ + } else if ((PEEK("' || *pos == '<') + break; + pos++; + } + + if (pos >= end) + return 0; + + /* 3 */ + if (*pos != '<') { + const uint8_t *n; + const uint8_t *v; + uint32_t nl, vl; + + while (hubbub_charset_get_attribute(&pos, end, + &n, &nl, &v, &vl)) + ; /* do nothing */ + /* 2 */ + } else + continue; + /* d */ + } else if (PEEK(""); + } + + /* e - do nothing */ + + /* 2 */ + pos++; + } + + return 0; +} + +/** + * Parse attributes on a meta tag + * + * \param pos Pointer to pointer to current location (updated on exit) + * \param end Pointer to end of data stream + * \return MIB enum of detected encoding, or 0 if none found + */ +uint16_t hubbub_charset_parse_attributes(const uint8_t **pos, + const uint8_t *end) +{ + const uint8_t *name; + const uint8_t *value; + uint32_t namelen, valuelen; + uint16_t mibenum; + + if (pos == NULL || *pos == NULL || end == NULL) + return 0; + + /* 2 */ + while (hubbub_charset_get_attribute(pos, end, + &name, &namelen, &value, &valuelen)) { + /* 3 */ + /* a */ + if (namelen == SLEN("charset") && valuelen > 0 && + strncasecmp((const char *) name, "charset", + SLEN("charset")) == 0) { + /* strip value */ + while (ISSPACE(*value)) { + value++; + valuelen--; + } + + while (valuelen > 0 && ISSPACE(value[valuelen - 1])) + valuelen--; + + mibenum = hubbub_mibenum_from_name( + (const char *) value, valuelen); + if (mibenum != 0) + return mibenum; + /* b */ + } else if (namelen == SLEN("content") && valuelen > 0 && + strncasecmp((const char *) name, "content", + SLEN("content")) == 0) { + mibenum = hubbub_charset_parse_content(value, + valuelen); + if (mibenum != 0) + return mibenum; + } + + /* c - do nothing */ + + /* 1 */ 
+ while (*pos < end) { + if (ISSPACE(**pos)) + break; + (*pos)++; + } + + if (*pos >= end) { + return 0; + } + } + + return 0; +} + +/** + * Parse a content= attribute's value + * + * \param value Attribute's value + * \param valuelen Length of value + * \return MIB enum of detected encoding, or 0 if none found + */ +uint16_t hubbub_charset_parse_content(const uint8_t *value, + uint32_t valuelen) +{ + const uint8_t *end; + const uint8_t *tentative = NULL; + uint32_t tentative_len = 0; + + if (value == NULL) + return 0; + + end = value + valuelen; + + /* 1 */ + while (value < end) { + if (*value == ';') { + value++; + break; + } + + value++; + } + + if (value >= end) + return 0; + + /* 2 */ + while (value < end && ISSPACE(*value)) { + value++; + } + + if (value >= end) + return 0; + + /* 3 */ + if (value < end - SLEN("charset") && + strncasecmp((const char *) value, + "charset", SLEN("charset")) != 0) + return 0; + + value += SLEN("charset"); + + /* 4 */ + while (value < end && ISSPACE(*value)) { + value++; + } + + if (value >= end) + return 0; + + /* 5 */ + if (*value != '=') + return 0; + /* skip '=' */ + value++; + + /* 6 */ + while (value < end && ISSPACE(*value)) { + value++; + } + + if (value >= end) + return 0; + + /* 7 */ + tentative = value; + + /* a */ + if (*value == '"') { + while (++value < end && *value != '"') { + tentative_len++; + } + + if (value < end) + tentative++; + else + tentative = NULL; + /* b */ + } else if (*value == '\'') { + while (++value < end && *value != '\'') { + tentative_len++; + } + + if (value < end) + tentative++; + else + tentative = NULL; + /* c */ + } else { + while (value < end && !ISSPACE(*value)) { + value++; + tentative_len++; + } + } + + /* 8 */ + if (tentative != NULL) { + return hubbub_mibenum_from_name((const char *) tentative, + tentative_len); + } + + /* 9 */ + return 0; +} + +/** + * Extract an attribute from the data stream + * + * \param data Pointer to pointer to current location (updated on exit) + * \param 
end Pointer to end of data stream + * \param name Pointer to location to receive attribute name + * \param namelen Pointer to location to receive attribute name length + * \param value Pointer to location to receive attribute value + * \param valuelen Pointer to location to receive attribute value langth + * \return true if attribute extracted, false otherwise. + * + * Note: The caller should heed the returned lengths; these are the only + * indicator that useful content resides in name or value. + */ +bool hubbub_charset_get_attribute(const uint8_t **data, const uint8_t *end, + const uint8_t **name, uint32_t *namelen, + const uint8_t **value, uint32_t *valuelen) +{ + const uint8_t *pos; + + if (data == NULL || *data == NULL || end == NULL || name == NULL || + namelen == NULL || value == NULL || valuelen == NULL) + return false; + + pos = *data; + + /* 1. Skip leading spaces or '/' characters */ + while (pos < end && (ISSPACE(*pos) || *pos == '/')) { + pos++; + } + + if (pos >= end) { + *data = pos; + return false; + } + + /* 2. Invalid element open character */ + if (*pos == '<') { + pos--; + *data = pos; + return false; + } + + /* 3. End of element */ + if (*pos == '>') { + *data = pos; + return false; + } + + /* 4. Initialise name & value to empty string */ + *name = pos; + *namelen = 0; + *value = (const uint8_t *) ""; + *valuelen = 0; + + /* 5. Extract name */ + while (pos < end) { + /* a */ + if (*pos == '=') { + break; + } + + /* b */ + if (ISSPACE(*pos)) { + break; + } + + /* c */ + if (*pos == '/' || *pos == '<' || *pos == '>') { + return true; + } + + /* d is handled by strncasecmp in _parse_attributes */ + + /* e */ + (*namelen)++; + + /* 6 */ + pos++; + } + + if (pos >= end) { + *data = pos; + return false; + } + + if (ISSPACE(*pos)) { + /* 7. Skip trailing spaces */ + while (pos < end && ISSPACE(*pos)) { + pos++; + } + + if (pos >= end) { + *data = pos; + return false; + } + + /* 8. 
Must be '=' */ + if (*pos != '=') { + pos--; + *data = pos; + return true; + } + } + + /* 9. Skip '=' */ + pos++; + + /* 10. Skip any spaces after '=' */ + while (pos < end && ISSPACE(*pos)) { + pos++; + } + + if (pos >= end) { + *data = pos; + return false; + } + + /* 11. Extract value, if quoted */ + /* a */ + if (*pos == '\'' || *pos == '"') { + /* 1 */ + const uint8_t *quote = pos; + + /* 2 */ + while (++pos < end) { + /* 3 */ + if (*pos == *quote) { + *value = (quote + 1); + *data = ++pos; + return true; + } + + /* 4 is handled by strncasecmp */ + + /* 5 */ + (*valuelen)++; + + /* 6 */ + } + + if (pos >= end) { + *data = pos; + return false; + } + } + + /* b */ + if (*pos == '<' || *pos == '>') { + *data = pos; + return true; + } + + /* c is handled by strncasecmp */ + + /* d */ + *value = pos; + + while (pos < end) { + /* 12. Extract unquoted value */ + /* a */ + if (ISSPACE(*pos) || *pos == '<' || *pos == '>') { + *data = pos; + return true; + } + + /* b is handled by strncasecmp */ + + /* c */ + (*valuelen)++; + + /* 13. Advance */ + pos++; + } + + if (pos >= end) { + *data = pos; + return false; + } + + /* should never be reached */ + abort(); + + return false; +} diff --git a/src/charset/detect.h b/src/charset/detect.h new file mode 100644 index 0000000..854a8d6 --- /dev/null +++ b/src/charset/detect.h @@ -0,0 +1,22 @@ +/* + * This file is part of Hubbub. + * Licensed under the MIT License, + * http://www.opensource.org/licenses/mit-license.php + * Copyright 2007 John-Mark Bell + */ + +#ifndef hubbub_charset_detect_h_ +#define hubbub_charset_detect_h_ + +#include + +#include +#include +#include + +/* Extract a charset from a chunk of data */ +hubbub_error hubbub_charset_extract(const uint8_t **data, size_t *len, + uint16_t *mibenum, hubbub_charset_source *source); + +#endif + -- cgit v1.2.3