From 2777a04ed2ba4fd36138b991d66a32a283361f7e Mon Sep 17 00:00:00 2001 From: John Mark Bell Date: Thu, 1 May 2008 16:34:46 +0000 Subject: Import parser construction utility library svn path=/trunk/libparserutils/; revision=4111 --- COPYING | 19 + Makefile | 43 ++ Makefile-riscos | 46 +++ README | 44 ++ build/Makefile.common | 129 ++++++ build/Makefile.config | 8 + include/parserutils/charset/codec.h | 114 ++++++ include/parserutils/charset/mibenum.h | 24 ++ include/parserutils/charset/utf16.h | 38 ++ include/parserutils/charset/utf8.h | 38 ++ include/parserutils/errors.h | 29 ++ include/parserutils/functypes.h | 21 + include/parserutils/input/inputstream.h | 143 +++++++ include/parserutils/parserutils.h | 23 ++ include/parserutils/types.h | 15 + include/parserutils/utils/buffer.h | 39 ++ libparserutils.pc.in | 10 + src/Makefile | 49 +++ src/charset/Makefile | 49 +++ src/charset/aliases.c | 410 +++++++++++++++++++ src/charset/aliases.h | 36 ++ src/charset/charset.c | 54 +++ src/charset/charset.h | 24 ++ src/charset/codec.c | 185 +++++++++ src/charset/codecs/Makefile | 46 +++ src/charset/codecs/codec_iconv.c | 683 ++++++++++++++++++++++++++++++++ src/charset/codecs/codec_impl.h | 48 +++ src/charset/codecs/codec_utf16.c | 544 +++++++++++++++++++++++++ src/charset/codecs/codec_utf8.c | 546 +++++++++++++++++++++++++ src/charset/encodings/Makefile | 46 +++ src/charset/encodings/utf16.c | 239 +++++++++++ src/charset/encodings/utf8.c | 175 ++++++++ src/charset/encodings/utf8impl.h | 339 ++++++++++++++++ src/input/Makefile | 46 +++ src/input/filter.c | 384 ++++++++++++++++++ src/input/filter.h | 57 +++ src/input/inputstream.c | 477 ++++++++++++++++++++++ src/parserutils.c | 54 +++ src/utils/Makefile | 49 +++ src/utils/buffer.c | 156 ++++++++ src/utils/errors.c | 70 ++++ src/utils/utils.h | 28 ++ test/INDEX | 15 + test/Makefile | 80 ++++ test/README | 84 ++++ test/aliases.c | 62 +++ test/charset.c | 31 ++ test/cscodec.c | 232 +++++++++++ test/data/Aliases | 302 
++++++++++++++ test/data/cscodec/INDEX | 6 + test/data/cscodec/UTF-8-test.txt | Bin 0 -> 41013 bytes test/data/cscodec/simple.dat | Bin 0 -> 1109 bytes test/data/input/INDEX | 5 + test/data/input/UTF-8-test.txt | Bin 0 -> 20334 bytes test/filter.c | 357 +++++++++++++++++ test/inputstream.c | 97 +++++ test/parserutils.c | 30 ++ test/regression/cscodec-segv.c | 38 ++ test/regression/filter-segv.c | 39 ++ test/regression/stream-nomem.c | 94 +++++ test/testrunner.pl | 167 ++++++++ test/testutils.h | 123 ++++++ 62 files changed, 7339 insertions(+) create mode 100644 COPYING create mode 100644 Makefile create mode 100644 Makefile-riscos create mode 100644 README create mode 100644 build/Makefile.common create mode 100644 build/Makefile.config create mode 100644 include/parserutils/charset/codec.h create mode 100644 include/parserutils/charset/mibenum.h create mode 100644 include/parserutils/charset/utf16.h create mode 100644 include/parserutils/charset/utf8.h create mode 100644 include/parserutils/errors.h create mode 100644 include/parserutils/functypes.h create mode 100644 include/parserutils/input/inputstream.h create mode 100644 include/parserutils/parserutils.h create mode 100644 include/parserutils/types.h create mode 100644 include/parserutils/utils/buffer.h create mode 100644 libparserutils.pc.in create mode 100644 src/Makefile create mode 100644 src/charset/Makefile create mode 100644 src/charset/aliases.c create mode 100644 src/charset/aliases.h create mode 100644 src/charset/charset.c create mode 100644 src/charset/charset.h create mode 100644 src/charset/codec.c create mode 100644 src/charset/codecs/Makefile create mode 100644 src/charset/codecs/codec_iconv.c create mode 100644 src/charset/codecs/codec_impl.h create mode 100644 src/charset/codecs/codec_utf16.c create mode 100644 src/charset/codecs/codec_utf8.c create mode 100644 src/charset/encodings/Makefile create mode 100644 src/charset/encodings/utf16.c create mode 100644 src/charset/encodings/utf8.c 
create mode 100644 src/charset/encodings/utf8impl.h create mode 100644 src/input/Makefile create mode 100644 src/input/filter.c create mode 100644 src/input/filter.h create mode 100644 src/input/inputstream.c create mode 100644 src/parserutils.c create mode 100644 src/utils/Makefile create mode 100644 src/utils/buffer.c create mode 100644 src/utils/errors.c create mode 100644 src/utils/utils.h create mode 100644 test/INDEX create mode 100644 test/Makefile create mode 100644 test/README create mode 100644 test/aliases.c create mode 100644 test/charset.c create mode 100644 test/cscodec.c create mode 100644 test/data/Aliases create mode 100644 test/data/cscodec/INDEX create mode 100644 test/data/cscodec/UTF-8-test.txt create mode 100644 test/data/cscodec/simple.dat create mode 100644 test/data/input/INDEX create mode 100644 test/data/input/UTF-8-test.txt create mode 100644 test/filter.c create mode 100644 test/inputstream.c create mode 100644 test/parserutils.c create mode 100644 test/regression/cscodec-segv.c create mode 100644 test/regression/filter-segv.c create mode 100644 test/regression/stream-nomem.c create mode 100644 test/testrunner.pl create mode 100644 test/testutils.h diff --git a/COPYING b/COPYING new file mode 100644 index 0000000..0f8d92b --- /dev/null +++ b/COPYING @@ -0,0 +1,19 @@ +Copyright (C) 2007-8 J-M Bell + +Permission is hereby granted, free of charge, to any person obtaining a copy +of this software and associated documentation files (the "Software"), to deal +in the Software without restriction, including without limitation the rights +to use, copy, modify, merge, publish, distribute, sublicense, and/or sell +copies of the Software, and to permit persons to whom the Software is +furnished to do so, subject to the following conditions: + + * The above copyright notice and this permission notice shall be included in + all copies or substantial portions of the Software. 
+ +THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR +IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, +FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE +AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER +LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, +OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN +THE SOFTWARE. diff --git a/Makefile b/Makefile new file mode 100644 index 0000000..e4de9b9 --- /dev/null +++ b/Makefile @@ -0,0 +1,43 @@ +# Toolchain definitions for building on the destination platform +CC := gcc +AR := ar +LD := gcc + +CP := cp +RM := rm +MKDIR := mkdir +MV := mv +ECHO := echo +MAKE := make +PERL := perl +PKGCONFIG := pkg-config +INSTALL := install +SED := sed +LCOV := lcov +GENHTML := genhtml + +# Toolchain flags +WARNFLAGS := -Wall -Wextra -Wundef -Wpointer-arith -Wcast-align \ + -Wwrite-strings -Wstrict-prototypes -Wmissing-prototypes \ + -Wmissing-declarations -Wnested-externs -Werror -pedantic +override CFLAGS += -std=c99 -D_BSD_SOURCE -I$(TOP)/include/ $(WARNFLAGS) +RELEASECFLAGS = $(CFLAGS) -DNDEBUG -O2 +DEBUGCFLAGS = $(CFLAGS) -O0 -g +ARFLAGS := -cru +override LDFLAGS += -L$(TOP)/ + +CPFLAGS := +RMFLAGS := -f +MKDIRFLAGS := -p +MVFLAGS := +ECHOFLAGS := +MAKEFLAGS := +PKGCONFIGFLAGS := + +EXEEXT := + +# Default installation prefix +PREFIX ?= /usr/local + + +include build/Makefile.common diff --git a/Makefile-riscos b/Makefile-riscos new file mode 100644 index 0000000..c9fef3c --- /dev/null +++ b/Makefile-riscos @@ -0,0 +1,46 @@ +# Toolchain definitions for building for RISC OS using the GCCSDK cross-compiler +GCCSDK_INSTALL_CROSSBIN ?= /home/riscos/cross/bin +GCCSDK_INSTALL_ENV ?= /home/riscos/env + +CC := $(GCCSDK_INSTALL_CROSSBIN)/gcc +AR := $(GCCSDK_INSTALL_CROSSBIN)/ar +LD := $(GCCSDK_INSTALL_CROSSBIN)/gcc + +CP := cp +RM := rm +MKDIR := mkdir +MV := mv +ECHO := echo +MAKE := make +PERL 
:= perl +PKGCONFIG := pkg-config +INSTALL := install +SED := sed +LCOV := echo +GENHTML := echo + +# Toolchain flags +WARNFLAGS := -Wall -Wextra -Wundef -Wpointer-arith -Wcast-align \ + -Wwrite-strings -Wstrict-prototypes -Wmissing-prototypes \ + -Wmissing-declarations -Wnested-externs -Werror -pedantic +CFLAGS += -std=c99 -D_BSD_SOURCE -I$(TOP)/include/ $(WARNFLAGS) \ + -mpoke-function-name +RELEASECFLAGS = $(CFLAGS) -DNDEBUG -O2 +DEBUGCFLAGS = $(CFLAGS) -O0 -g +ARFLAGS := -cru +LDFLAGS = -L$(TOP)/ + +CPFLAGS := +RMFLAGS := -f +MKDIRFLAGS := -p +MVFLAGS := +ECHOFLAGS := +MAKEFLAGS := +PKGCONFIGFLAGS := + +EXEEXT := ,ff8 + +# Default installation prefix +PREFIX ?= $(GCCSDK_INSTALL_ENV) + +include build/Makefile.common diff --git a/README b/README new file mode 100644 index 0000000..72041c0 --- /dev/null +++ b/README @@ -0,0 +1,44 @@ +LibParserUtils -- a utility library for parser building +======================================================= + +Overview +-------- + + LibParserUtils provides various pieces of functionality that are useful + when writing parsers. + +Requirements +------------ + + LibParserUtils requires the following tools: + + + A C99 capable C compiler + + GNU make or compatible + + Perl (for the testcases) + + Pkg-config (for the testcases) + + For enhanced charset support, LibParserUtils may also be configured to use + an iconv() implementation. + +Compilation +----------- + + If necessary, modify the toolchain settings in the Makefile. + Invoke make: + $ make + +Verification +------------ + + To verify that the parser is working, it is necessary to specify a + different makefile target than that used for normal compilation, thus: + + $ make test + +API documentation +----------------- + + Currently, there is none. However, the code is well commented and the + public API may be found in the "include" directory. The testcase sources + may also be of use in working out how to use it. 
+ diff --git a/build/Makefile.common b/build/Makefile.common new file mode 100644 index 0000000..418a5a8 --- /dev/null +++ b/build/Makefile.common @@ -0,0 +1,129 @@ +# Top-level Makefile fragment + +# Default target +all: release + +# Name of component +COMPONENT := libparserutils + +# Environment +EXPORT := $(CURDIR)/dist +TOP := $(CURDIR) +RELEASEDIR := build/Release +DEBUGDIR := build/Debug +COVERAGEDIR := build/coverage + +# List of items to delete on clean +ITEMS_CLEAN := +# List of items to delete on distclean +ITEMS_DISTCLEAN := + +# List of targets to run for testing +TARGET_TESTS := + +# Source files +SOURCES := + +# Include configuration Makefile fragment +include build/Makefile.config + +# Include Makefile fragments in subdirectories + +define do_include +DIR := $$(dir $(1)) +include $(1) + +endef + +MAKE_INCLUDES := $(wildcard */Makefile) +$(eval $(foreach INC, $(MAKE_INCLUDES), $(call do_include,$(INC)))) + +# Calculate objects to build +OBJECTS := $(subst /,_,$(subst .c,.o,$(SOURCES))) + +.PHONY: release debug test coverage profile \ + clean distclean setup export install uninstall + +# Rules +release: setup $(addprefix $(RELEASEDIR)/,$(OBJECTS)) + @$(AR) $(ARFLAGS) $(COMPONENT).a $(RELEASEDIR)/* + +debug: setup $(addprefix $(DEBUGDIR)/,$(OBJECTS)) + @$(AR) $(ARFLAGS) $(COMPONENT)-debug.a $(DEBUGDIR)/* + +test: debug $(TARGET_TESTS) + +coverage: clean + @$(LCOV) --directory . 
--zerocounters + @$(MAKE) test CFLAGS="$(CFLAGS) -fprofile-arcs -ftest-coverage" \ + LDFLAGS="$(LDFLAGS) -lgcov" + @$(LCOV) --directory $(DEBUGDIR) --base-directory $(TOP) \ + --capture --output-file $(COVERAGEDIR)/$(COMPONENT)_tmp.info + @$(LCOV) --extract $(COVERAGEDIR)/$(COMPONENT)_tmp.info "$(TOP)/src*" \ + -o $(COVERAGEDIR)/$(COMPONENT).info + @$(RM) $(RMFLAGS) $(COVERAGEDIR)/$(COMPONENT)_tmp.info + @$(GENHTML) -o $(COVERAGEDIR) --num-spaces 2 \ + $(COVERAGEDIR)/$(COMPONENT).info + +profile: clean + @$(MAKE) test CFLAGS="$(CFLAGS) -pg" LDFLAGS="-pg $(LDFLAGS)" + +clean: + -@$(RM) $(RMFLAGS) $(ITEMS_CLEAN) + -@$(RM) $(RMFLAGS) gmon.out + -@$(RM) $(RMFLAGS) -r $(COVERAGEDIR) + -@$(RM) $(RMFLAGS) -r $(RELEASEDIR) + -@$(RM) $(RMFLAGS) -r $(DEBUGDIR) + -@$(RM) $(RMFLAGS) $(COMPONENT).a + -@$(RM) $(RMFLAGS) $(COMPONENT)-debug.a + -@$(RM) $(RMFLAGS) $(COMPONENT).pc + +distclean: clean + -@$(RM) $(RMFLAGS) $(ITEMS_DISTCLEAN) + -@$(RM) $(RMFLAGS) -r $(TOP)/dist + +setup: + @$(MKDIR) $(MKDIRFLAGS) $(RELEASEDIR) + @$(MKDIR) $(MKDIRFLAGS) $(DEBUGDIR) + @$(MKDIR) $(MKDIRFLAGS) $(COVERAGEDIR) + +export: release + @$(MKDIR) $(MKDIRFLAGS) $(TOP)/dist/lib + @$(CP) $(CPFLAGS) -r include $(EXPORT)/ + @${CP} ${CPFLAGS} $(COMPONENT).a ${EXPORT}/lib/ + +install: release + @$(MKDIR) $(MKDIRFLAGS) -p $(PREFIX)/lib/pkgconfig + @$(MKDIR) $(MKDIRFLAGS) -p $(PREFIX)/include/parserutils + @$(MKDIR) $(MKDIRFLAGS) -p $(PREFIX)/include/parserutils/charset + @$(MKDIR) $(MKDIRFLAGS) -p $(PREFIX)/include/parserutils/input + @$(MKDIR) $(MKDIRFLAGS) -p $(PREFIX)/include/parserutils/utils + @$(SED) -e 's#PREFIX#$(PREFIX)#' $(COMPONENT).pc.in >$(COMPONENT).pc + @$(INSTALL) --mode=644 -t $(PREFIX)/lib $(COMPONENT).a + @$(INSTALL) --mode=644 -t $(PREFIX)/lib/pkgconfig $(COMPONENT).pc + @$(INSTALL) --mode=644 -t $(PREFIX)/include/parserutils $(filter %.h, $(wildcard include/parserutils/*)) + @$(INSTALL) --mode=644 -t $(PREFIX)/include/parserutils/charset $(filter %.h, $(wildcard 
include/parserutils/charset/*)) + @$(INSTALL) --mode=644 -t $(PREFIX)/include/parserutils/input $(filter %.h, $(wildcard include/parserutils/input/*)) + @$(INSTALL) --mode=644 -t $(PREFIX)/include/parserutils/utils $(filter %.h, $(wildcard include/parserutils/utils/*)) + + +uninstall: + @$(RM) $(RMFLAGS) $(PREFIX)/lib/$(COMPONENT).a + @$(RM) $(RMFLAGS) $(PREFIX)/lib/pkgconfig/$(COMPONENT).pc + @$(RM) $(RMFLAGS) -r $(PREFIX)/include/parserutils + +# Finally, build rules for compilation +define do_compile +$$(RELEASEDIR)/$(2): $(1) + @$$(ECHO) $$(ECHOFLAGS) "==> $(1)" + @$$(CC) -c $$(RELEASECFLAGS) -o $$@ $(1) + +$$(DEBUGDIR)/$(2): $(1) + @$$(ECHO) $$(ECHOFLAGS) "==> $(1)" + @$$(CC) -c $$(DEBUGCFLAGS) -o $$@ $(1) + +endef + +$(eval $(foreach SOURCE,$(filter %.c,$(SOURCES)), \ + $(call do_compile,$(SOURCE),$(subst /,_,$(SOURCE:.c=.o))))) + diff --git a/build/Makefile.config b/build/Makefile.config new file mode 100644 index 0000000..b6560c1 --- /dev/null +++ b/build/Makefile.config @@ -0,0 +1,8 @@ +# Configuration Makefile fragment + +# Build the iconv codec +# override CFLAGS += -DWITH_ICONV_CODEC + +# Use iconv directly in the input filter +# override CFLAGS += -DWITH_ICONV_FILTER + diff --git a/include/parserutils/charset/codec.h b/include/parserutils/charset/codec.h new file mode 100644 index 0000000..ca98db5 --- /dev/null +++ b/include/parserutils/charset/codec.h @@ -0,0 +1,114 @@ +/* + * This file is part of LibParserUtils. 
+ * Licensed under the MIT License, + * http://www.opensource.org/licenses/mit-license.php + * Copyright 2007 John-Mark Bell + */ + +#ifndef parserutils_charset_codec_h_ +#define parserutils_charset_codec_h_ + +#include + +#include +#include + +typedef struct parserutils_charset_codec parserutils_charset_codec; + +#define PARSERUTILS_CHARSET_CODEC_NULL (0xffffffffU) + +/** + * Charset codec error mode + * + * A codec's error mode determines its behaviour in the face of: + * + * + characters which are unrepresentable in the destination charset (if + * encoding data) or which cannot be converted to UCS4 (if decoding data). + * + invalid byte sequences (both encoding and decoding) + * + * The options provide a choice between the following approaches: + * + * + draconian, "stop processing" ("strict") + * + "replace the unrepresentable character with something else" ("loose") + * + "attempt to transliterate, or replace if unable" ("translit") + * + * The default error mode is "loose". + * + * + * In the "loose" case, the replacement character will depend upon: + * + * + Whether the operation was encoding or decoding + * + If encoding, what the destination charset is. + * + * If decoding, the replacement character will be: + * + * U+FFFD (REPLACEMENT CHARACTER) + * + * If encoding, the replacement character will be: + * + * U+003F (QUESTION MARK) if the destination charset is not UTF-(8|16|32) + * U+FFFD (REPLACEMENT CHARACTER) otherwise. + * + * + * In the "translit" case, the codec will attempt to transliterate into + * the destination charset, if encoding. If decoding, or if transliteration + * fails, this option is identical to "loose". 
+ */ +typedef enum parserutils_charset_codec_errormode { + /** Abort processing if unrepresentable character encountered */ + PARSERUTILS_CHARSET_CODEC_ERROR_STRICT = 0, + /** Replace unrepresentable characters with single alternate */ + PARSERUTILS_CHARSET_CODEC_ERROR_LOOSE = 1, + /** Transliterate unrepresentable characters, if possible */ + PARSERUTILS_CHARSET_CODEC_ERROR_TRANSLIT = 2, +} parserutils_charset_codec_errormode; + +/** + * Charset codec option types + */ +typedef enum parserutils_charset_codec_opttype { + /** Set codec error mode */ + PARSERUTILS_CHARSET_CODEC_ERROR_MODE = 1, +} parserutils_charset_codec_opttype; + +/** + * Charset codec option parameters + */ +typedef union parserutils_charset_codec_optparams { + /** Parameters for error mode setting */ + struct { + /** The desired error handling mode */ + parserutils_charset_codec_errormode mode; + } error_mode; +} parserutils_charset_codec_optparams; + + +/* Create a charset codec */ +parserutils_charset_codec *parserutils_charset_codec_create(const char *charset, + parserutils_alloc alloc, void *pw); +/* Destroy a charset codec */ +void parserutils_charset_codec_destroy(parserutils_charset_codec *codec); + +/* Configure a charset codec */ +parserutils_error parserutils_charset_codec_setopt( + parserutils_charset_codec *codec, + parserutils_charset_codec_opttype type, + parserutils_charset_codec_optparams *params); + +/* Encode a chunk of UCS4 data into a codec's charset */ +parserutils_error parserutils_charset_codec_encode( + parserutils_charset_codec *codec, + const uint8_t **source, size_t *sourcelen, + uint8_t **dest, size_t *destlen); + +/* Decode a chunk of data in a codec's charset into UCS4 */ +parserutils_error parserutils_charset_codec_decode( + parserutils_charset_codec *codec, + const uint8_t **source, size_t *sourcelen, + uint8_t **dest, size_t *destlen); + +/* Reset a charset codec */ +parserutils_error parserutils_charset_codec_reset( + parserutils_charset_codec *codec); + +#endif 
diff --git a/include/parserutils/charset/mibenum.h b/include/parserutils/charset/mibenum.h new file mode 100644 index 0000000..8b3ac9d --- /dev/null +++ b/include/parserutils/charset/mibenum.h @@ -0,0 +1,24 @@ +/* + * This file is part of LibParserUtils. + * Licensed under the MIT License, + * http://www.opensource.org/licenses/mit-license.php + * Copyright 2007 John-Mark Bell + */ + +#ifndef parserutils_charset_mibenum_h_ +#define parserutils_charset_mibenum_h_ + +#include +#include + +#include +#include + +/* Convert an encoding alias to a MIB enum value */ +uint16_t parserutils_charset_mibenum_from_name(const char *alias, size_t len); +/* Convert a MIB enum value into an encoding alias */ +const char *parserutils_charset_mibenum_to_name(uint16_t mibenum); +/* Determine if a MIB enum value represents a Unicode variant */ +bool parserutils_charset_mibenum_is_unicode(uint16_t mibenum); + +#endif diff --git a/include/parserutils/charset/utf16.h b/include/parserutils/charset/utf16.h new file mode 100644 index 0000000..6569d6e --- /dev/null +++ b/include/parserutils/charset/utf16.h @@ -0,0 +1,38 @@ +/* + * This file is part of LibParserUtils. + * Licensed under the MIT License, + * http://www.opensource.org/licenses/mit-license.php + * Copyright 2007 John-Mark Bell + */ + +/** \file + * UTF-16 manipulation functions (interface). 
+ */ + +#ifndef parserutils_charset_utf16_h_ +#define parserutils_charset_utf16_h_ + +#include + +#include + +parserutils_error parserutils_charset_utf16_to_ucs4(const uint8_t *s, + size_t len, uint32_t *ucs4, size_t *clen); +parserutils_error parserutils_charset_utf16_from_ucs4(uint32_t ucs4, + uint8_t *s, size_t *len); + +parserutils_error parserutils_charset_utf16_length(const uint8_t *s, + size_t max, size_t *len); +parserutils_error parserutils_charset_utf16_char_byte_length(const uint8_t *s, + size_t *len); + +parserutils_error parserutils_charset_utf16_prev(const uint8_t *s, + uint32_t off, uint32_t *prevoff); +parserutils_error parserutils_charset_utf16_next(const uint8_t *s, + uint32_t len, uint32_t off, uint32_t *nextoff); + +parserutils_error parserutils_charset_utf16_next_paranoid(const uint8_t *s, + uint32_t len, uint32_t off, uint32_t *nextoff); + +#endif + diff --git a/include/parserutils/charset/utf8.h b/include/parserutils/charset/utf8.h new file mode 100644 index 0000000..16e012e --- /dev/null +++ b/include/parserutils/charset/utf8.h @@ -0,0 +1,38 @@ +/* + * This file is part of LibParserUtils. + * Licensed under the MIT License, + * http://www.opensource.org/licenses/mit-license.php + * Copyright 2007 John-Mark Bell + */ + +/** \file + * UTF-8 manipulation functions (interface). 
+ */ + +#ifndef parserutils_charset_utf8_h_ +#define parserutils_charset_utf8_h_ + +#include + +#include + +parserutils_error parserutils_charset_utf8_to_ucs4(const uint8_t *s, size_t len, + uint32_t *ucs4, size_t *clen); +parserutils_error parserutils_charset_utf8_from_ucs4(uint32_t ucs4, uint8_t **s, + size_t *len); + +parserutils_error parserutils_charset_utf8_length(const uint8_t *s, size_t max, + size_t *len); +parserutils_error parserutils_charset_utf8_char_byte_length(const uint8_t *s, + size_t *len); + +parserutils_error parserutils_charset_utf8_prev(const uint8_t *s, uint32_t off, + uint32_t *prevoff); +parserutils_error parserutils_charset_utf8_next(const uint8_t *s, uint32_t len, + uint32_t off, uint32_t *nextoff); + +parserutils_error parserutils_charset_utf8_next_paranoid(const uint8_t *s, + uint32_t len, uint32_t off, uint32_t *nextoff); + +#endif + diff --git a/include/parserutils/errors.h b/include/parserutils/errors.h new file mode 100644 index 0000000..09c715c --- /dev/null +++ b/include/parserutils/errors.h @@ -0,0 +1,29 @@ +/* + * This file is part of LibParserUtils. 
+ * Licensed under the MIT License, + * http://www.opensource.org/licenses/mit-license.php + * Copyright 2007 John-Mark Bell + */ + +#ifndef parserutils_errors_h_ +#define parserutils_errors_h_ + +#include + +typedef enum parserutils_error { + PARSERUTILS_OK = 0, + + PARSERUTILS_NOMEM = 1, + PARSERUTILS_BADPARM = 2, + PARSERUTILS_INVALID = 3, + PARSERUTILS_FILENOTFOUND = 4, + PARSERUTILS_NEEDDATA = 5, +} parserutils_error; + +/* Convert a parserutils error value to a string */ +const char *parserutils_error_to_string(parserutils_error error); +/* Convert a string to a parserutils error value */ +parserutils_error parserutils_error_from_string(const char *str, size_t len); + +#endif + diff --git a/include/parserutils/functypes.h b/include/parserutils/functypes.h new file mode 100644 index 0000000..703a329 --- /dev/null +++ b/include/parserutils/functypes.h @@ -0,0 +1,21 @@ +/* + * This file is part of LibParserUtils. + * Licensed under the MIT License, + * http://www.opensource.org/licenses/mit-license.php + * Copyright 2007-8 John-Mark Bell + */ + +#ifndef parserutils_functypes_h_ +#define parserutils_functypes_h_ + +#include +#include +#include + +#include + +/* Type of allocation function for parserutils */ +typedef void *(*parserutils_alloc)(void *ptr, size_t size, void *pw); + +#endif + diff --git a/include/parserutils/input/inputstream.h b/include/parserutils/input/inputstream.h new file mode 100644 index 0000000..2b0c407 --- /dev/null +++ b/include/parserutils/input/inputstream.h @@ -0,0 +1,143 @@ +/* + * This file is part of LibParserUtils. 
+ * Licensed under the MIT License, + * http://www.opensource.org/licenses/mit-license.php + * Copyright 2007 John-Mark Bell + */ + +#ifndef parserutils_input_inputstream_h_ +#define parserutils_input_inputstream_h_ + +#include +#include +#include + +#include +#include +#include +#include +#include + +/** + * Type of charset detection function + */ +typedef parserutils_error (*parserutils_charset_detect_func)( + const uint8_t *data, size_t len, + uint16_t *mibenum, uint32_t *source); + +/** + * Input stream object + */ +typedef struct parserutils_inputstream +{ + parserutils_buffer *utf8; /**< Buffer containing utf8 data */ + + uint32_t cursor; /**< Byte offset of current position */ + + bool had_eof; /**< Whether EOF has been reached */ +} parserutils_inputstream; + +/* EOF pseudo-character */ +#define PARSERUTILS_INPUTSTREAM_EOF (0xFFFFFFFFU) +/* Out-of-data indicator */ +#define PARSERUTILS_INPUTSTREAM_OOD (0xFFFFFFFEU) + +/* Create an input stream */ +parserutils_inputstream *parserutils_inputstream_create(const char *enc, + uint32_t encsrc, parserutils_charset_detect_func csdetect, + parserutils_alloc alloc, void *pw); +/* Destroy an input stream */ +void parserutils_inputstream_destroy(parserutils_inputstream *stream); + +/* Append data to an input stream */ +parserutils_error parserutils_inputstream_append( + parserutils_inputstream *stream, + const uint8_t *data, size_t len); +/* Insert data into stream at current location */ +parserutils_error parserutils_inputstream_insert( + parserutils_inputstream *stream, + const uint8_t *data, size_t len); + +/* Slow form of parserutils_inputstream_peek.
*/ +uintptr_t parserutils_inputstream_peek_slow(parserutils_inputstream *stream, + size_t offset, size_t *length); + +/* Look at the character in the stream that starts at + * offset bytes from the cursor + * + * \param stream Stream to look in + * \param offset Byte offset of start of character + * \param length Pointer to location to receive character length (in bytes) + * \return Pointer to character data, or EOF or OOD. + * + * Once the character pointed to by the result of this call has been advanced + * past (i.e. parserutils_inputstream_advance has caused the stream cursor to + * pass over the character), then no guarantee is made as to the validity of + * the data pointed to. Thus, any attempt to dereference the pointer after + * advancing past the data it points to is a bug. + */ +static inline uintptr_t parserutils_inputstream_peek( + parserutils_inputstream *stream, size_t offset, size_t *length) +{ + parserutils_error error = PARSERUTILS_OK; + size_t len; + + if (stream == NULL) + return PARSERUTILS_INPUTSTREAM_OOD; + +#define IS_ASCII(x) (((x) & 0x80) == 0) + + if (stream->cursor + offset < stream->utf8->length) { + if (IS_ASCII(stream->utf8->data[stream->cursor + offset])) { + len = 1; + } else { + error = parserutils_charset_utf8_char_byte_length( + stream->utf8->data + stream->cursor + offset, + &len); + + if (error != PARSERUTILS_OK && + error != PARSERUTILS_NEEDDATA) + return PARSERUTILS_INPUTSTREAM_OOD; + } + } + +#undef IS_ASCII + + if (stream->cursor + offset == stream->utf8->length || + error == PARSERUTILS_NEEDDATA) { + return parserutils_inputstream_peek_slow(stream, + offset, length); + } + + *length = len; + + return (uintptr_t) (stream->utf8->data + stream->cursor + offset); +} + +/** + * Advance the stream's current position + * + * \param stream The stream whose position to advance + * \param bytes The number of bytes to advance + */ +static inline void parserutils_inputstream_advance( + parserutils_inputstream *stream, size_t bytes) +{ 
+ if (stream == NULL) + return; + + if (bytes > stream->utf8->length - stream->cursor) + abort(); + + if (stream->cursor == stream->utf8->length) + return; + + stream->cursor += bytes; +} + +/* Read the document charset */ +const char *parserutils_inputstream_read_charset( + parserutils_inputstream *stream, uint32_t *source); + +#endif + diff --git a/include/parserutils/parserutils.h b/include/parserutils/parserutils.h new file mode 100644 index 0000000..460e80c --- /dev/null +++ b/include/parserutils/parserutils.h @@ -0,0 +1,23 @@ +/* + * This file is part of LibParserUtils. + * Licensed under the MIT License, + * http://www.opensource.org/licenses/mit-license.php + * Copyright 2007 John-Mark Bell + */ + +#ifndef parserutils_parserutils_h_ +#define parserutils_parserutils_h_ + +#include +#include +#include + +/* Initialise the ParserUtils library for use */ +parserutils_error parserutils_initialise(const char *aliases_file, + parserutils_alloc alloc, void *pw); + +/* Clean up after ParserUtils */ +parserutils_error parserutils_finalise(parserutils_alloc alloc, void *pw); + +#endif + diff --git a/include/parserutils/types.h b/include/parserutils/types.h new file mode 100644 index 0000000..b36e4aa --- /dev/null +++ b/include/parserutils/types.h @@ -0,0 +1,15 @@ +/* + * This file is part of LibParserUtils. + * Licensed under the MIT License, + * http://www.opensource.org/licenses/mit-license.php + * Copyright 2007 John-Mark Bell + */ + +#ifndef parserutils_types_h_ +#define parserutils_types_h_ + +#include +#include + +#endif + diff --git a/include/parserutils/utils/buffer.h b/include/parserutils/utils/buffer.h new file mode 100644 index 0000000..f3a1883 --- /dev/null +++ b/include/parserutils/utils/buffer.h @@ -0,0 +1,39 @@ +/* + * This file is part of LibParserUtils. 
+ * Licensed under the MIT License, + * http://www.opensource.org/licenses/mit-license.php + * Copyright 2008 John-Mark Bell + */ + +#ifndef parserutils_utils_buffer_h_ +#define parserutils_utils_buffer_h_ + +#include +#include + +struct parserutils_buffer +{ + uint8_t *data; + size_t length; + size_t allocated; + + parserutils_alloc alloc; + void *pw; +}; +typedef struct parserutils_buffer parserutils_buffer; + +parserutils_buffer *parserutils_buffer_create(parserutils_alloc alloc, + void *pw); +void parserutils_buffer_destroy(parserutils_buffer *buffer); + +parserutils_error parserutils_buffer_append(parserutils_buffer *buffer, + const uint8_t *data, size_t len); +parserutils_error parserutils_buffer_insert(parserutils_buffer *buffer, + size_t offset, const uint8_t *data, size_t len); +parserutils_error parserutils_buffer_discard(parserutils_buffer *buffer, + size_t offset, size_t len); + +parserutils_error parserutils_buffer_grow(parserutils_buffer *buffer); + +#endif + diff --git a/libparserutils.pc.in b/libparserutils.pc.in new file mode 100644 index 0000000..400ce78 --- /dev/null +++ b/libparserutils.pc.in @@ -0,0 +1,10 @@ +prefix=PREFIX +exec_prefix=${prefix} +libdir=${exec_prefix}/lib +includedir=${prefix}/include + +Name: libparserutils +Description: Utility library for facilitating parser development +Version: 0.0.1 +Libs: -L${libdir} -lparserutils +Cflags: -I${includedir} diff --git a/src/Makefile b/src/Makefile new file mode 100644 index 0000000..bb6c585 --- /dev/null +++ b/src/Makefile @@ -0,0 +1,49 @@ +# Child makefile fragment +# +# Toolchain is provided by top-level makefile +# +# Variables provided by top-level makefile +# +# COMPONENT The name of the component +# EXPORT The location of the export directory +# TOP The location of the source tree root +# RELEASEDIR The place to put release objects +# DEBUGDIR The place to put debug objects +# +# do_include Canned command sequence to include a child makefile +# +# Variables provided by parent 
makefile: +# +# DIR The name of the directory we're in, relative to $(TOP) +# +# Variables we can manipulate: +# +# ITEMS_CLEAN The list of items to remove for "make clean" +# ITEMS_DISTCLEAN The list of items to remove for "make distclean" +# TARGET_TESTS The list of target names to run for "make test" +# +# SOURCES The list of sources to build for $(COMPONENT) +# +# Plus anything from the toolchain + +# Push parent directory onto the directory stack +sp := $(sp).x +dirstack_$(sp) := $(d) +d := $(DIR) + +# Manipulate include paths +override CFLAGS := $(CFLAGS) -I$(d) + +# Sources +SRCS_$(d) := parserutils.c + +# Append to sources for component +SOURCES += $(addprefix $(d), $(SRCS_$(d))) + +# Now include any children we may have +MAKE_INCLUDES := $(wildcard $(d)*/Makefile) +$(eval $(foreach INC, $(MAKE_INCLUDES), $(call do_include,$(INC)))) + +# Finally, pop off the directory stack +d := $(dirstack_$(sp)) +sp := $(basename $(sp)) diff --git a/src/charset/Makefile b/src/charset/Makefile new file mode 100644 index 0000000..fc34d7c --- /dev/null +++ b/src/charset/Makefile @@ -0,0 +1,49 @@ +# Child makefile fragment +# +# Toolchain is provided by top-level makefile +# +# Variables provided by top-level makefile +# +# COMPONENT The name of the component +# EXPORT The location of the export directory +# TOP The location of the source tree root +# RELEASEDIR The place to put release objects +# DEBUGDIR The place to put debug objects +# +# do_include Canned command sequence to include a child makefile +# +# Variables provided by parent makefile: +# +# DIR The name of the directory we're in, relative to $(TOP) +# +# Variables we can manipulate: +# +# ITEMS_CLEAN The list of items to remove for "make clean" +# ITEMS_DISTCLEAN The list of items to remove for "make distclean" +# TARGET_TESTS The list of target names to run for "make test" +# +# SOURCES The list of sources to build for $(COMPONENT) +# +# Plus anything from the toolchain + +# Push parent directory onto the 
directory stack +sp := $(sp).x +dirstack_$(sp) := $(d) +d := $(DIR) + +# Manipulate include paths +override CFLAGS := $(CFLAGS) -I$(d) + +# Sources +SRCS_$(d) := aliases.c charset.c codec.c + +# Append to sources for component +SOURCES += $(addprefix $(d), $(SRCS_$(d))) + +# Now include any children we may have +MAKE_INCLUDES := $(wildcard $(d)*/Makefile) +$(eval $(foreach INC, $(MAKE_INCLUDES), $(call do_include,$(INC)))) + +# Finally, pop off the directory stack +d := $(dirstack_$(sp)) +sp := $(basename $(sp)) diff --git a/src/charset/aliases.c b/src/charset/aliases.c new file mode 100644 index 0000000..1e7e6ea --- /dev/null +++ b/src/charset/aliases.c @@ -0,0 +1,410 @@ +/* + * This file is part of LibParserUtils. + * Licensed under the MIT License, + * http://www.opensource.org/licenses/mit-license.php + * Copyright 2007 John-Mark Bell + */ + +#include +#include +#include +#include +#include +#include + +#include "charset/aliases.h" +#include "utils/utils.h" + +struct alias { + struct alias *next; + parserutils_charset_aliases_canon *canon; + uint16_t name_len; + char name[1]; +}; + +#define HASH_SIZE (43) +static parserutils_charset_aliases_canon *canon_tab[HASH_SIZE]; +static struct alias *alias_tab[HASH_SIZE]; + +static parserutils_error parserutils_charset_create_alias(const char *alias, + parserutils_charset_aliases_canon *c, + parserutils_alloc alloc, void *pw); +static parserutils_charset_aliases_canon *parserutils_charset_create_canon( + const char *canon, uint16_t mibenum, + parserutils_alloc alloc, void *pw); +static uint32_t parserutils_charset_hash_val(const char *alias, size_t len); + +/** + * Create alias data from Aliases file + * + * \param filename The path to the Aliases file + * \param alloc Memory (de)allocation function + * \param pw Pointer to client-specific private data (may be NULL) + * \return PARSERUTILS_OK on success, appropriate error otherwise. 
+ */
+parserutils_error parserutils_charset_aliases_create(const char *filename,
+		parserutils_alloc alloc, void *pw)
+{
+	char buf[300];
+	FILE *fp;
+
+	if (filename == NULL || alloc == NULL)
+		return PARSERUTILS_BADPARM;
+
+	fp = fopen(filename, "r");
+	if (fp == NULL)
+		return PARSERUTILS_FILENOTFOUND;
+
+	/* Each data line is parsed as whitespace-separated fields:
+	 *   <canonical name> <mibenum> [<alias> ...]
+	 *
+	 * Note: a line whose canonical entry cannot be created is skipped,
+	 * and a failure to create an alias abandons the rest of that line;
+	 * neither failure is reported to the caller. */
+	while (fgets(buf, sizeof buf, fp)) {
+		char *p, *aliases = 0, *mib, *end;
+		parserutils_charset_aliases_canon *cf;
+
+		if (buf[0] == 0 || buf[0] == '#')
+			/* skip blank lines or comments */
+			continue;
+
+		/* Lose the terminating newline, but only if there is one:
+		 * fgets() returns no newline for an unterminated final
+		 * line (or a line that overflows buf), and such a line
+		 * must not lose its last character. buf[0] != 0 here, so
+		 * end[-1] is always within buf. */
+		end = buf + strlen(buf);
+		if (end[-1] == '\n')
+			*--end = '\0';
+
+		/* find end of canonical form
+		 * (ctype functions require unsigned char values) */
+		for (p = buf; *p && !isspace((unsigned char) *p) &&
+				!iscntrl((unsigned char) *p); p++)
+			; /* do nothing */
+		if (p >= end)
+			continue;
+		*p++ = '\0'; /* terminate canonical form */
+
+		/* skip whitespace */
+		for (; *p && isspace((unsigned char) *p); p++)
+			; /* do nothing */
+		if (p >= end)
+			continue;
+		mib = p;
+
+		/* find end of mibenum */
+		for (; *p && !isspace((unsigned char) *p) &&
+				!iscntrl((unsigned char) *p); p++)
+			; /* do nothing */
+		if (p < end)
+			*p++ = '\0'; /* terminate mibenum */
+
+		cf = parserutils_charset_create_canon(buf, atoi(mib), alloc, pw);
+		if (cf == NULL)
+			continue;
+
+		/* skip whitespace */
+		for (; p < end && *p && isspace((unsigned char) *p); p++)
+			; /* do nothing */
+		if (p >= end)
+			continue;
+		aliases = p;
+
+		while (p < end) {
+			/* find end of alias */
+			for (; *p && !isspace((unsigned char) *p) &&
+					!iscntrl((unsigned char) *p); p++)
+				; /* do nothing */
+			if (p > end)
+				/* stop if we've gone past the end */
+				break;
+			/* terminate current alias */
+			*p++ = '\0';
+
+			if (parserutils_charset_create_alias(aliases, cf,
+					alloc, pw) != PARSERUTILS_OK)
+				break;
+
+			/* in terminating, we may have advanced
+			 * past the end - check this here */
+			if (p >= end)
+				break;
+
+			/* skip whitespace */
+			for (; *p && isspace((unsigned char) *p); p++)
+				; /* do nothing */
+
+			if (p >= end)
+				/* gone past end => stop */
+				break;
+
+			/* update pointer to current alias */
+			aliases = p;
+		}
+	}
+
+	fclose(fp);
+
+	return PARSERUTILS_OK;
+}
+
+/**
+ * Free all
alias data + * + * \param alloc Memory (de)allocation function + * \param pw Pointer to client-specific private data + */ +void parserutils_charset_aliases_destroy(parserutils_alloc alloc, void *pw) +{ + parserutils_charset_aliases_canon *c, *d; + struct alias *a, *b; + int i; + + for (i = 0; i != HASH_SIZE; i++) { + for (c = canon_tab[i]; c; c = d) { + d = c->next; + alloc(c, 0, pw); + } + canon_tab[i] = NULL; + + for (a = alias_tab[i]; a; a = b) { + b = a->next; + alloc(a, 0, pw); + } + alias_tab[i] = NULL; + } +} + +/** + * Retrieve the MIB enum value assigned to an encoding name + * + * \param alias The alias to lookup + * \param len The length of the alias string + * \return The MIB enum value, or 0 if not found + */ +uint16_t parserutils_charset_mibenum_from_name(const char *alias, size_t len) +{ + parserutils_charset_aliases_canon *c; + + if (alias == NULL) + return 0; + + c = parserutils_charset_alias_canonicalise(alias, len); + if (c == NULL) + return 0; + + return c->mib_enum; +} + +/** + * Retrieve the canonical name of an encoding from the MIB enum + * + * \param mibenum The MIB enum value + * \return Pointer to canonical name, or NULL if not found + */ +const char *parserutils_charset_mibenum_to_name(uint16_t mibenum) +{ + int i; + parserutils_charset_aliases_canon *c; + + for (i = 0; i != HASH_SIZE; i++) + for (c = canon_tab[i]; c; c = c->next) + if (c->mib_enum == mibenum) + return c->name; + + return NULL; +} + +/** + * Detect if a parserutils_charset is Unicode + * + * \param mibenum The MIB enum to consider + * \return true if a Unicode variant, false otherwise + */ +bool parserutils_charset_mibenum_is_unicode(uint16_t mibenum) +{ + static uint16_t ucs4; + static uint16_t ucs2; + static uint16_t utf8; + static uint16_t utf16; + static uint16_t utf16be; + static uint16_t utf16le; + static uint16_t utf32; + static uint16_t utf32be; + static uint16_t utf32le; + + if (ucs4 == 0) { + ucs4 = parserutils_charset_mibenum_from_name("UCS-4", + 
SLEN("UCS-4")); + ucs2 = parserutils_charset_mibenum_from_name("UCS-2", + SLEN("UCS-2")); + utf8 = parserutils_charset_mibenum_from_name("UTF-8", + SLEN("UTF-8")); + utf16 = parserutils_charset_mibenum_from_name("UTF-16", + SLEN("UTF-16")); + utf16be = parserutils_charset_mibenum_from_name("UTF-16BE", + SLEN("UTF-16BE")); + utf16le = parserutils_charset_mibenum_from_name("UTF-16LE", + SLEN("UTF-16LE")); + utf32 = parserutils_charset_mibenum_from_name("UTF-32", + SLEN("UTF-32")); + utf32be = parserutils_charset_mibenum_from_name("UTF-32BE", + SLEN("UTF-32BE")); + utf32le = parserutils_charset_mibenum_from_name("UTF-32LE", + SLEN("UTF-32LE")); + } + + return (mibenum == ucs4 || mibenum == ucs2 || mibenum == utf8 || + mibenum == utf16 || mibenum == utf16be || + mibenum == utf16le || mibenum == utf32 || + mibenum == utf32be || mibenum == utf32le); +} + +/** + * Retrieve the canonical form of an alias name + * + * \param alias The alias name + * \param len The length of the alias name + * \return Pointer to canonical form or NULL if not found + */ +parserutils_charset_aliases_canon *parserutils_charset_alias_canonicalise( + const char *alias, size_t len) +{ + uint32_t hash; + parserutils_charset_aliases_canon *c; + struct alias *a; + + if (alias == NULL) + return NULL; + + hash = parserutils_charset_hash_val(alias, len); + + for (c = canon_tab[hash]; c; c = c->next) + if (c->name_len == len && + strncasecmp(c->name, alias, len) == 0) + break; + if (c) + return c; + + for (a = alias_tab[hash]; a; a = a->next) + if (a->name_len == len && + strncasecmp(a->name, alias, len) == 0) + break; + if (a) + return a->canon; + + return NULL; +} + + +/** + * Create an alias + * + * \param alias The alias name + * \param c The canonical form + * \param alloc Memory (de)allocation function + * \param pw Pointer to client-specific private data (may be NULL) + * \return PARSERUTILS_OK on success, appropriate error otherwise + */ +parserutils_error parserutils_charset_create_alias(const 
char *alias, + parserutils_charset_aliases_canon *c, + parserutils_alloc alloc, void *pw) +{ + struct alias *a; + uint32_t hash; + + if (alias == NULL || c == NULL || alloc == NULL) + return PARSERUTILS_BADPARM; + + a = alloc(NULL, sizeof(struct alias) + strlen(alias) + 1, pw); + if (a == NULL) + return PARSERUTILS_NOMEM; + + a->canon = c; + a->name_len = strlen(alias); + strcpy(a->name, alias); + a->name[a->name_len] = '\0'; + + hash = parserutils_charset_hash_val(alias, a->name_len); + + a->next = alias_tab[hash]; + alias_tab[hash] = a; + + return PARSERUTILS_OK; +} + +/** + * Create a canonical form + * + * \param canon The canonical name + * \param mibenum The MIB enum value + * \param alloc Memory (de)allocation function + * \param pw Pointer to client-specific private data (may be NULL) + * \return Pointer to canonical form or NULL on error + */ +parserutils_charset_aliases_canon *parserutils_charset_create_canon( + const char *canon, uint16_t mibenum, + parserutils_alloc alloc, void *pw) +{ + parserutils_charset_aliases_canon *c; + uint32_t hash, len; + + if (canon == NULL || alloc == NULL) + return NULL; + + len = strlen(canon); + + c = alloc(NULL, sizeof(parserutils_charset_aliases_canon) + len + 1, pw); + if (c == NULL) + return NULL; + + c->mib_enum = mibenum; + c->name_len = len; + strcpy(c->name, canon); + c->name[len] = '\0'; + + hash = parserutils_charset_hash_val(canon, len); + + c->next = canon_tab[hash]; + canon_tab[hash] = c; + + return c; +} + +/** + * Hash function + * + * \param alias String to hash + * \return The hashed value + */ +uint32_t parserutils_charset_hash_val(const char *alias, size_t len) +{ + const char *s = alias; + uint32_t h = 5381; + + if (alias == NULL) + return 0; + + while (len--) + h = (h * 33) ^ (*s++ & ~0x20); /* case insensitive */ + + return h % HASH_SIZE; +} + + +#ifndef NDEBUG +/** + * Dump all alias data to stdout + */ +void parserutils_charset_aliases_dump(void) +{ + parserutils_charset_aliases_canon *c; + struct 
alias *a; + int i; + size_t size = 0; + + for (i = 0; i != HASH_SIZE; i++) { + for (c = canon_tab[i]; c; c = c->next) { + printf("%d %s\n", i, c->name); + size += offsetof(parserutils_charset_aliases_canon, + name) + c->name_len; + } + + for (a = alias_tab[i]; a; a = a->next) { + printf("%d %s\n", i, a->name); + size += offsetof(struct alias, name) + a->name_len; + } + } + + size += (sizeof(canon_tab) / sizeof(canon_tab[0])); + size += (sizeof(alias_tab) / sizeof(alias_tab[0])); + + printf("%u\n", (unsigned int) size); +} +#endif diff --git a/src/charset/aliases.h b/src/charset/aliases.h new file mode 100644 index 0000000..9abd2c8 --- /dev/null +++ b/src/charset/aliases.h @@ -0,0 +1,36 @@ +/* + * This file is part of LibParserUtils. + * Licensed under the MIT License, + * http://www.opensource.org/licenses/mit-license.php + * Copyright 2007 John-Mark Bell + */ + +#ifndef parserutils_charset_aliases_h_ +#define parserutils_charset_aliases_h_ + +#include + +#include + +typedef struct parserutils_charset_aliases_canon { + struct parserutils_charset_aliases_canon *next; + uint16_t mib_enum; + uint16_t name_len; + char name[1]; +} parserutils_charset_aliases_canon; + +/* Load encoding aliases from file */ +parserutils_error parserutils_charset_aliases_create(const char *filename, + parserutils_alloc alloc, void *pw); +/* Destroy encoding aliases */ +void parserutils_charset_aliases_destroy(parserutils_alloc alloc, void *pw); + +/* Canonicalise an alias name */ +parserutils_charset_aliases_canon *parserutils_charset_alias_canonicalise( + const char *alias, size_t len); + +#ifndef NDEBUG +void parserutils_charset_aliases_dump(void); +#endif + +#endif diff --git a/src/charset/charset.c b/src/charset/charset.c new file mode 100644 index 0000000..3ef1a71 --- /dev/null +++ b/src/charset/charset.c @@ -0,0 +1,54 @@ +/* + * This file is part of LibParserUtils. 
+ * Licensed under the MIT License, + * http://www.opensource.org/licenses/mit-license.php + * Copyright 2007 John-Mark Bell + */ + +#include "charset/aliases.h" +#include "charset/charset.h" + +/** + * Initialise the Charset library for use. + * + * This _must_ be called before using any libparserutils charset functions + * + * \param aliases_file Pointer to name of file containing encoding alias data + * \param alloc Pointer to (de)allocation function + * \param pw Pointer to client-specific private data (may be NULL) + * \return PARSERUTILS_OK on success, applicable error otherwise. + */ +parserutils_error parserutils_charset_initialise(const char *aliases_file, + parserutils_alloc alloc, void *pw) +{ + parserutils_error error; + + if (aliases_file == NULL || alloc == NULL) + return PARSERUTILS_BADPARM; + + error = parserutils_charset_aliases_create(aliases_file, alloc, pw); + if (error != PARSERUTILS_OK) + return error; + + return PARSERUTILS_OK; +} + +/** + * Clean up after Libparserutils + * + * \param alloc Pointer to (de)allocation function + * \param pw Pointer to client-specific private data (may be NULL) + * \return PARSERUTILS_OK on success, applicable error otherwise. + */ +parserutils_error parserutils_charset_finalise(parserutils_alloc alloc, + void *pw) +{ + if (alloc == NULL) + return PARSERUTILS_BADPARM; + + parserutils_charset_aliases_destroy(alloc, pw); + + return PARSERUTILS_OK; +} + + diff --git a/src/charset/charset.h b/src/charset/charset.h new file mode 100644 index 0000000..4b07577 --- /dev/null +++ b/src/charset/charset.h @@ -0,0 +1,24 @@ +/* + * This file is part of LibParserUtils. 
+ * Licensed under the MIT License, + * http://www.opensource.org/licenses/mit-license.php + * Copyright 2007 John-Mark Bell + */ + +#ifndef parserutils_charset_charset_h_ +#define parserutils_charset_charset_h_ + +#include +#include +#include + +/* Initialise the Charset library for use */ +parserutils_error parserutils_charset_initialise(const char *aliases_file, + parserutils_alloc alloc, void *pw); + +/* Clean up after Charset */ +parserutils_error parserutils_charset_finalise(parserutils_alloc alloc, + void *pw); + +#endif + diff --git a/src/charset/codec.c b/src/charset/codec.c new file mode 100644 index 0000000..5c3fb3a --- /dev/null +++ b/src/charset/codec.c @@ -0,0 +1,185 @@ +/* + * This file is part of LibParserUtils. + * Licensed under the MIT License, + * http://www.opensource.org/licenses/mit-license.php + * Copyright 2007 John-Mark Bell + */ + +#include + +#include "charset/aliases.h" +#include "charset/codecs/codec_impl.h" + +#ifdef WITH_ICONV_CODEC +extern parserutils_charset_handler iconv_codec_handler; +#endif + +extern parserutils_charset_handler charset_utf8_codec_handler; +extern parserutils_charset_handler charset_utf16_codec_handler; + +static parserutils_charset_handler *handler_table[] = { + &charset_utf8_codec_handler, + &charset_utf16_codec_handler, +#ifdef WITH_ICONV_CODEC + &iconv_codec_handler, +#endif + NULL, +}; + +/** + * Create a charset codec + * + * \param charset Target charset + * \param alloc Memory (de)allocation function + * \param pw Pointer to client-specific private data (may be NULL) + * \return Pointer to codec instance, or NULL on failure + */ +parserutils_charset_codec *parserutils_charset_codec_create(const char *charset, + parserutils_alloc alloc, void *pw) +{ + parserutils_charset_codec *codec; + parserutils_charset_handler **handler; + const parserutils_charset_aliases_canon * canon; + + if (charset == NULL || alloc == NULL) + return NULL; + + /* Canonicalise parserutils_charset name. 
*/ + canon = parserutils_charset_alias_canonicalise(charset, + strlen(charset)); + if (canon == NULL) + return NULL; + + /* Search for handler class */ + for (handler = handler_table; *handler != NULL; handler++) { + if ((*handler)->handles_charset(canon->name)) + break; + } + + /* None found */ + if ((*handler) == NULL) + return NULL; + + /* Instantiate class */ + codec = (*handler)->create(canon->name, alloc, pw); + if (codec == NULL) + return NULL; + + /* and initialise it */ + codec->mibenum = canon->mib_enum; + + codec->errormode = PARSERUTILS_CHARSET_CODEC_ERROR_LOOSE; + + codec->alloc = alloc; + codec->alloc_pw = pw; + + return codec; +} + +/** + * Destroy a charset codec + * + * \param codec The codec to destroy + */ +void parserutils_charset_codec_destroy(parserutils_charset_codec *codec) +{ + if (codec == NULL) + return; + + codec->handler.destroy(codec); + + codec->alloc(codec, 0, codec->alloc_pw); +} + +/** + * Configure a charset codec + * + * \param codec The codec to configure + * \param type The codec option type to configure + * \param params Option-specific parameters + * \return PARSERUTILS_OK on success, appropriate error otherwise + */ +parserutils_error parserutils_charset_codec_setopt( + parserutils_charset_codec *codec, + parserutils_charset_codec_opttype type, + parserutils_charset_codec_optparams *params) +{ + if (codec == NULL || params == NULL) + return PARSERUTILS_BADPARM; + + switch (type) { + case PARSERUTILS_CHARSET_CODEC_ERROR_MODE: + codec->errormode = params->error_mode.mode; + break; + } + + return PARSERUTILS_OK; +} + +/** + * Encode a chunk of UCS4 data into a codec's charset + * + * \param codec The codec to use + * \param source Pointer to pointer to source data + * \param sourcelen Pointer to length (in bytes) of source data + * \param dest Pointer to pointer to output buffer + * \param destlen Pointer to length (in bytes) of output buffer + * \return PARSERUTILS_OK on success, appropriate error otherwise.
+ * + * source, sourcelen, dest and destlen will be updated appropriately on exit + */ +parserutils_error parserutils_charset_codec_encode( + parserutils_charset_codec *codec, + const uint8_t **source, size_t *sourcelen, + uint8_t **dest, size_t *destlen) +{ + if (codec == NULL || source == NULL || *source == NULL || + sourcelen == NULL || dest == NULL || *dest == NULL || + destlen == NULL) + return PARSERUTILS_BADPARM; + + return codec->handler.encode(codec, source, sourcelen, dest, destlen); +} + +/** + * Decode a chunk of data in a codec's charset into UCS4 + * + * \param codec The codec to use + * \param source Pointer to pointer to source data + * \param sourcelen Pointer to length (in bytes) of source data + * \param dest Pointer to pointer to output buffer + * \param destlen Pointer to length (in bytes) of output buffer + * \return PARSERUTILS_OK on success, appropriate error otherwise. + * + * source, sourcelen, dest and destlen will be updated appropriately on exit + * + * Call this with a source length of 0 to flush any buffers. 
+ */ +parserutils_error parserutils_charset_codec_decode( + parserutils_charset_codec *codec, + const uint8_t **source, size_t *sourcelen, + uint8_t **dest, size_t *destlen) +{ + if (codec == NULL || source == NULL || *source == NULL || + sourcelen == NULL || dest == NULL || *dest == NULL || + destlen == NULL) + return PARSERUTILS_BADPARM; + + return codec->handler.decode(codec, source, sourcelen, dest, destlen); +} + +/** + * Clear a charset codec's encoding state + * + * \param codec The codec to reset + * \return PARSERUTILS_OK on success, appropriate error otherwise + */ +parserutils_error parserutils_charset_codec_reset( + parserutils_charset_codec *codec) +{ + if (codec == NULL) + return PARSERUTILS_BADPARM; + + return codec->handler.reset(codec); +} + diff --git a/src/charset/codecs/Makefile b/src/charset/codecs/Makefile new file mode 100644 index 0000000..6d3b78e --- /dev/null +++ b/src/charset/codecs/Makefile @@ -0,0 +1,46 @@ +# Child makefile fragment +# +# Toolchain is provided by top-level makefile +# +# Variables provided by top-level makefile +# +# COMPONENT The name of the component +# EXPORT The location of the export directory +# TOP The location of the source tree root +# RELEASEDIR The place to put release objects +# DEBUGDIR The place to put debug objects +# +# do_include Canned command sequence to include a child makefile +# +# Variables provided by parent makefile: +# +# DIR The name of the directory we're in, relative to $(TOP) +# +# Variables we can manipulate: +# +# ITEMS_CLEAN The list of items to remove for "make clean" +# ITEMS_DISTCLEAN The list of items to remove for "make distclean" +# TARGET_TESTS The list of target names to run for "make test" +# +# SOURCES The list of sources to build for $(COMPONENT) +# +# Plus anything from the toolchain + +# Push parent directory onto the directory stack +sp := $(sp).x +dirstack_$(sp) := $(d) +d := $(DIR) + +# Sources +SRCS_$(d) := codec_iconv.c codec_utf8.c codec_utf16.c + +# Append to sources 
for component +SOURCES += $(addprefix $(d), $(SRCS_$(d))) + +# Now include any children we may have +MAKE_INCLUDES := $(wildcard $(d)*/Makefile) +$(eval $(foreach INC, $(MAKE_INCLUDES), $(call do_include,$(INC)))) + +# Finally, pop off the directory stack +d := $(dirstack_$(sp)) +sp := $(basename $(sp)) diff --git a/src/charset/codecs/codec_iconv.c b/src/charset/codecs/codec_iconv.c new file mode 100644 index 0000000..bbe8bc4 --- /dev/null +++ b/src/charset/codecs/codec_iconv.c @@ -0,0 +1,683 @@ +/* + * This file is part of LibParserUtils. + * Licensed under the MIT License, + * http://www.opensource.org/licenses/mit-license.php + * Copyright 2007 John-Mark Bell + */ + +/* This codec is hideously slow. Only use it as a last resort */ + +#include +#include +#include + +/* We put this here rather than at the top as GCC complains + * about the source file being empty otherwise. */ +#ifdef WITH_ICONV_CODEC + +#include + +/* These two are for htonl / ntohl */ +#include +#include + +#include + +#include "charset/codecs/codec_impl.h" +#include "utils/utils.h" + +/** + * Iconv-based charset codec + */ +typedef struct iconv_codec { + parserutils_charset_codec base; /**< Base class */ + + iconv_t read_cd; /**< Iconv handle for reading */ +#define INVAL_BUFSIZE (32) + uint8_t inval_buf[INVAL_BUFSIZE]; /**< Buffer for fixing up + * incomplete input + * sequences */ + size_t inval_len; /**< Number of bytes in inval_buf */ + +#define READ_BUFSIZE (8) + uint32_t read_buf[READ_BUFSIZE]; /**< Buffer for partial + * output sequences (decode) + */ + size_t read_len; /**< Number of characters in + * read_buf */ + + iconv_t write_cd; /**< Iconv handle for writing */ +#define WRITE_BUFSIZE (8) + uint32_t write_buf[WRITE_BUFSIZE]; /**< Buffer for partial + * output sequences (encode) + */ + size_t write_len; /**< Number of characters in + * write_buf */ +} iconv_codec; + + +static bool iconv_codec_handles_charset(const char *charset); +static parserutils_charset_codec 
*iconv_codec_create(const char *charset, + parserutils_alloc alloc, void *pw); +static void iconv_codec_destroy (parserutils_charset_codec *codec); +static parserutils_error iconv_codec_encode(parserutils_charset_codec *codec, + const uint8_t **source, size_t *sourcelen, + uint8_t **dest, size_t *destlen); +static parserutils_error iconv_codec_decode(parserutils_charset_codec *codec, + const uint8_t **source, size_t *sourcelen, + uint8_t **dest, size_t *destlen); +static parserutils_error iconv_codec_reset(parserutils_charset_codec *codec); +static parserutils_error iconv_codec_output_decoded_char( + iconv_codec *c, uint32_t ucs4, uint8_t **dest, + size_t *destlen); +static parserutils_error iconv_codec_read_char(iconv_codec *c, + const uint8_t **source, size_t *sourcelen, + uint8_t **dest, size_t *destlen); +static parserutils_error iconv_codec_write_char(iconv_codec *c, + uint32_t ucs4, uint8_t **dest, size_t *destlen); + +/** + * Determine whether this codec handles a specific charset + * + * \param charset Charset to test + * \return true if handleable, false otherwise + */ +bool iconv_codec_handles_charset(const char *charset) +{ + iconv_t cd; + bool ret; + + cd = iconv_open("UCS-4", charset); + + ret = (cd != (iconv_t) -1); + + if (ret) + iconv_close(cd); + + return ret; +} + +/** + * Create an iconv-based codec + * + * \param charset The charset to read from / write to + * \param alloc Memory (de)allocation function + * \param pw Pointer to client-specific private data (may be NULL) + * \return Pointer to codec, or NULL on failure + */ +parserutils_charset_codec *iconv_codec_create(const char *charset, + parserutils_alloc alloc, void *pw) +{ + iconv_codec *codec; + + codec = alloc(NULL, sizeof(iconv_codec), pw); + if (codec == NULL) + return NULL; + + codec->read_cd = iconv_open("UCS-4", charset); + if (codec->read_cd == (iconv_t) -1) { + alloc(codec, 0, pw); + return NULL; + } + + codec->write_cd = iconv_open(charset, "UCS-4"); + if (codec->write_cd == 
(iconv_t) -1) { + iconv_close(codec->read_cd); + alloc(codec, 0, pw); + return NULL; + } + + codec->inval_buf[0] = '\0'; + codec->inval_len = 0; + + codec->read_buf[0] = 0; + codec->read_len = 0; + + codec->write_buf[0] = 0; + codec->write_len = 0; + + /* Finally, populate vtable */ + codec->base.handler.destroy = iconv_codec_destroy; + codec->base.handler.encode = iconv_codec_encode; + codec->base.handler.decode = iconv_codec_decode; + codec->base.handler.reset = iconv_codec_reset; + + return (parserutils_charset_codec *) codec; +} + +/** + * Destroy an iconv-based codec + * + * \param codec The codec to destroy + */ +void iconv_codec_destroy (parserutils_charset_codec *codec) +{ + iconv_codec *c = (iconv_codec *) codec; + + iconv_close(c->read_cd); + iconv_close(c->write_cd); + + return; +} + +/** + * Encode a chunk of UCS4 data into an iconv-based codec's charset + * + * \param codec The codec to use + * \param source Pointer to pointer to source data + * \param sourcelen Pointer to length (in bytes) of source data + * \param dest Pointer to pointer to output buffer + * \param destlen Pointer to length (in bytes) of output buffer + * \return PARSERUTILS_OK on success, + * PARSERUTILS_NOMEM if output buffer is too small, + * PARSERUTILS_INVALID if a character cannot be represented and the + * codec's error handling mode is set to STRICT, + * + * On exit, ::source will point immediately _after_ the last input character + * read. Any remaining output for the character will be buffered by the + * codec for writing on the next call. + * + * Note that, if failure occurs whilst attempting to write any output + * buffered by the last call, then ::source and ::sourcelen will remain + * unchanged (as nothing more has been read). + * + * ::sourcelen will be reduced appropriately on exit. + * + * ::dest will point immediately _after_ the last character written. + * + * ::destlen will be reduced appropriately on exit. 
+ */
+parserutils_error iconv_codec_encode(parserutils_charset_codec *codec,
+		const uint8_t **source, size_t *sourcelen,
+		uint8_t **dest, size_t *destlen)
+{
+	iconv_codec *c = (iconv_codec *) codec;
+	uint32_t ucs4;
+	size_t i;
+	parserutils_error error;
+
+	/* Process any outstanding characters from the previous call */
+	if (c->write_len > 0) {
+		uint32_t *pwrite = c->write_buf;
+
+		while (c->write_len > 0) {
+			error = iconv_codec_write_char(c, pwrite[0],
+					dest, destlen);
+			if (error != PARSERUTILS_OK) {
+				/* Drop the failed character if it was
+				 * rejected as invalid, so that it is not
+				 * reprocessed on the next call */
+				if (error == PARSERUTILS_INVALID) {
+					pwrite++;
+					c->write_len--;
+				}
+
+				/* Move whatever is still pending down to
+				 * the front of the buffer, which is where
+				 * the next call starts reading from.
+				 * (Forward copy: destination never passes
+				 * the source, so the overlap is safe.) */
+				for (i = 0; i < c->write_len; i++)
+					c->write_buf[i] = pwrite[i];
+
+				return error;
+			}
+
+			pwrite++;
+			c->write_len--;
+		}
+	}
+
+	/* Now process the characters for this call.
+	 * NOTE(review): 4 bytes are consumed per iteration, so this assumes
+	 * *sourcelen is a multiple of 4 -- confirm callers guarantee it. */
+	while (*sourcelen > 0) {
+		/* Fetch one UCS4 character; memcpy avoids a misaligned
+		 * 32bit load should *source not be suitably aligned */
+		memcpy(&ucs4, *source, sizeof(ucs4));
+
+		error = iconv_codec_write_char(c, ucs4, dest, destlen);
+
+		/* Claim the character whatever the outcome, so that
+		 * it's not reprocessed */
+		*source += 4;
+		*sourcelen -= 4;
+
+		if (error != PARSERUTILS_OK) {
+			/* Save the character for output on the next call,
+			 * unless it was rejected as invalid. Nothing else
+			 * is pending here: the loop above only falls
+			 * through once write_len has reached zero. */
+			if (error != PARSERUTILS_INVALID) {
+				c->write_buf[0] = ucs4;
+				c->write_len = 1;
+			}
+
+			return error;
+		}
+	}
+
+	return PARSERUTILS_OK;
+}
+
+/**
+ * Decode a chunk of data in an iconv-based codec's charset into UCS4
+ *
+ * \param codec      The codec to use
+ * \param source     Pointer to pointer to source data
+ * \param sourcelen  Pointer to length (in bytes) of source data
+ * \param dest       Pointer to pointer to output buffer
+ * \param destlen    Pointer to length (in bytes) of output buffer
+ * \return PARSERUTILS_OK on success,
+ *         PARSERUTILS_NOMEM if output buffer is too small,
+ *         PARSERUTILS_INVALID if a character cannot be represented and the
+ *         codec's error handling mode is set to STRICT,
+ *
+ * On exit, ::source will point immediately _after_ the last input character
+ * read, if the result is _OK or _NOMEM. Any remaining output for the
+ * character will be buffered by the codec for writing on the next call.
+ *
+ * In the case of the result being _INVALID, ::source will point _at_ the
+ * last input character read; nothing will be written or buffered for the
+ * failed character. It is up to the client to fix the cause of the failure
+ * and retry the decoding process.
+ *
+ * Note that, if failure occurs whilst attempting to write any output
+ * buffered by the last call, then ::source and ::sourcelen will remain
+ * unchanged (as nothing more has been read).
+ *
+ * If STRICT error handling is configured and an illegal sequence is split
+ * over two calls, then _INVALID will be returned from the second call,
+ * but ::source will point mid-way through the invalid sequence (i.e. it
+ * will be unmodified over the second call).
In addition, the internal + * incomplete-sequence buffer will be emptied, such that subsequent calls + * will progress, rather than re-evaluating the same invalid sequence. + * + * ::sourcelen will be reduced appropriately on exit. + * + * ::dest will point immediately _after_ the last character written. + * + * ::destlen will be reduced appropriately on exit. + * + * Call this with a source length of 0 to flush the output buffer. + */ +parserutils_error iconv_codec_decode(parserutils_charset_codec *codec, + const uint8_t **source, size_t *sourcelen, + uint8_t **dest, size_t *destlen) +{ + iconv_codec *c = (iconv_codec *) codec; + parserutils_error error; + + if (c->read_len > 0) { + /* Output left over from last decode + * Attempt to finish this here */ + uint32_t *pread = c->read_buf; + + while (c->read_len > 0 && *destlen >= c->read_len * 4) { + *((uint32_t *) (void *) *dest) = pread[0]; + + *dest += 4; + *destlen -= 4; + + pread++; + c->read_len--; + } + + if (*destlen < c->read_len * 4) { + /* Run out of output buffer */ + size_t i; + + /* Shuffle remaining output down */ + for (i = 0; i < c->read_len; i++) { + c->read_buf[i] = pread[i]; + } + + return PARSERUTILS_NOMEM; + } + } + + if (c->inval_len > 0) { + /* The last decode ended in an incomplete sequence. + * Fill up inval_buf with data from the start of the + * new chunk and process it. */ + uint8_t *in = c->inval_buf; + size_t ol = c->inval_len; + size_t l = min(INVAL_BUFSIZE - ol - 1, *sourcelen); + size_t orig_l = l; + + memcpy(c->inval_buf + ol, *source, l); + + l += c->inval_len; + + error = iconv_codec_read_char(c, + (const uint8_t **) &in, &l, dest, destlen); + if (error != PARSERUTILS_OK && error != PARSERUTILS_NOMEM) { + return error; + } + + + /* And now, fix everything up so the normal processing + * does the right thing. */ + *source += max((signed) (orig_l - l), 0); + *sourcelen -= max((signed) (orig_l - l), 0); + + /* Failed to resolve an incomplete character and + * ran out of buffer space. 
No recovery strategy + * possible, so explode everywhere. */ + if ((orig_l + ol) - l == 0) + abort(); + + /* Handle memory exhaustion case from above */ + if (error != PARSERUTILS_OK) + return error; + } + + while (*sourcelen > 0) { + error = iconv_codec_read_char(c, + source, sourcelen, dest, destlen); + if (error != PARSERUTILS_OK) { + return error; + } + } + + return PARSERUTILS_OK; +} + +/** + * Clear an iconv-based codec's encoding state + * + * \param codec The codec to reset + * \return PARSERUTILS_OK on success, appropriate error otherwise + */ +parserutils_error iconv_codec_reset(parserutils_charset_codec *codec) +{ + iconv_codec *c = (iconv_codec *) codec; + + iconv(c->read_cd, NULL, NULL, NULL, NULL); + iconv(c->write_cd, NULL, NULL, NULL, NULL); + + c->inval_buf[0] = '\0'; + c->inval_len = 0; + + c->read_buf[0] = 0; + c->read_len = 0; + + c->write_buf[0] = 0; + c->write_len = 0; + + return PARSERUTILS_OK; +} + +/** + * Output a UCS4 character + * + * \param c Codec to use + * \param ucs4 UCS4 character (big endian) + * \param dest Pointer to pointer to output buffer + * \param destlen Pointer to output buffer length + * \return PARSERUTILS_OK on success, + * PARSERUTILS_NOMEM if output buffer is too small, + */ +parserutils_error iconv_codec_output_decoded_char(iconv_codec *c, + uint32_t ucs4, uint8_t **dest, size_t *destlen) +{ + if (*destlen < 4) { + /* Run out of output buffer */ + + c->read_len = 1; + c->read_buf[0] = ucs4; + + return PARSERUTILS_NOMEM; + } + + *((uint32_t *) (void *) *dest) = ucs4; + *dest += 4; + *destlen -= 4; + + return PARSERUTILS_OK; +} + +/** + * Read a character from the codec's native charset to UCS4 (big endian) + * + * \param c The codec + * \param source Pointer to pointer to source buffer (updated on exit) + * \param sourcelen Pointer to length of source buffer (updated on exit) + * \param dest Pointer to pointer to output buffer (updated on exit) + * \param destlen Pointer to length of output buffer (updated on exit) +
* \return PARSERUTILS_OK on success, + * PARSERUTILS_NOMEM if output buffer is too small, + * PARSERUTILS_INVALID if a character cannot be represented and the + * codec's error handling mode is set to STRICT, + * + * On exit, ::source will point immediately _after_ the last input character + * read, if the result is _OK or _NOMEM. Any remaining output for the + * character will be buffered by the codec for writing on the next call. + * + * In the case of the result being _INVALID, ::source will point _at_ the + * last input character read; nothing will be written or buffered for the + * failed character. It is up to the client to fix the cause of the failure + * and retry the decoding process. + * + * ::sourcelen will be reduced appropriately on exit. + * + * ::dest will point immediately _after_ the last character written. + * + * ::destlen will be reduced appropriately on exit. + */ +parserutils_error iconv_codec_read_char(iconv_codec *c, + const uint8_t **source, size_t *sourcelen, + uint8_t **dest, size_t *destlen) +{ + size_t iconv_ret; + const uint8_t *origsrc = *source; + size_t origsrclen = *sourcelen; + uint32_t ucs4; + uint8_t *pucs4 = (uint8_t *) &ucs4; + size_t sucs4 = 4; + parserutils_error error; + + /* Use iconv to convert a single character + * Side effect: Updates *source to point at next input + * character and *sourcelen to reflect reduced input length + */ + iconv_ret = iconv(c->read_cd, (char **) source, sourcelen, + (char **) (void *) &pucs4, &sucs4); + + if (iconv_ret != (size_t) -1 || + (*source != origsrc && sucs4 == 0)) { + /* Read a character */ + error = iconv_codec_output_decoded_char(c, ucs4, dest, destlen); + if (error != PARSERUTILS_OK && error != PARSERUTILS_NOMEM) { + /* output failed; restore source pointers */ + *source = origsrc; + *sourcelen = origsrclen; + } + + /* Clear inval buffer */ + c->inval_buf[0] = '\0'; + c->inval_len = 0; + + return error; + } else if (errno == E2BIG) { + /* Should never happen */ + abort(); + } else 
if (errno == EINVAL) { + /* Incomplete input sequence; stash it, leaving room for the NUL terminator */ + if (*sourcelen >= INVAL_BUFSIZE) + abort(); + + memmove(c->inval_buf, (const char *) *source, *sourcelen); + c->inval_buf[*sourcelen] = '\0'; + c->inval_len = *sourcelen; + + *source += *sourcelen; + *sourcelen = 0; + + return PARSERUTILS_OK; + } else if (errno == EILSEQ) { + /* Illegal input sequence */ + bool found = false; + const uint8_t *oldsrc; + size_t oldsrclen; + + /* Clear inval buffer */ + c->inval_buf[0] = '\0'; + c->inval_len = 0; + + /* Strict errormode; simply flag invalid character */ + if (c->base.errormode == + PARSERUTILS_CHARSET_CODEC_ERROR_STRICT) { + /* restore source pointers */ + *source = origsrc; + *sourcelen = origsrclen; + + return PARSERUTILS_INVALID; + } + + /* Ok, this becomes problematic. The iconv API here + * is particularly unhelpful; *source will point at + * the _start_ of the illegal sequence. This means + * that we must find the end of the sequence */ + + /* Search for the start of the next valid input + * sequence (or the end of the input stream) */ + while (*sourcelen > 1) { + pucs4 = (uint8_t *) &ucs4; + sucs4 = 4; + + (*source)++; + (*sourcelen)--; + + oldsrc = *source; + oldsrclen = *sourcelen; + + iconv_ret = iconv(c->read_cd, + (char **) source, sourcelen, + (char **) (void *) &pucs4, &sucs4); + if (iconv_ret != (size_t) -1 || errno != EILSEQ) { + found = true; + break; + } + } + + if (found) { + /* Found start of next valid sequence */ + *source = oldsrc; + *sourcelen = oldsrclen; + } else { + /* Not found - skip last byte in buffer */ + (*source)++; + (*sourcelen)--; + + if (*sourcelen != 0) + abort(); + } + + /* output U+FFFD and continue processing. 
*/ + error = iconv_codec_output_decoded_char(c, + htonl(0xFFFD), dest, destlen); + if (error != PARSERUTILS_OK && error != PARSERUTILS_NOMEM) { + /* output failed; restore source pointers */ + *source = origsrc; + *sourcelen = origsrclen; + } + + return error; + } + + return PARSERUTILS_OK; +} + +/** + * Write a UCS4 character in a codec's native charset + * + * \param c The codec + * \param ucs4 The UCS4 character to write (big endian) + * \param dest Pointer to pointer to output buffer (updated on exit) + * \param destlen Pointer to length of output buffer (updated on exit) + * \return PARSERUTILS_OK on success, + * PARSERUTILS_NOMEM if output buffer is too small, + * PARSERUTILS_INVALID if character cannot be represented and the + * codec's error handling mode is set to STRICT. + */ +parserutils_error iconv_codec_write_char(iconv_codec *c, + uint32_t ucs4, uint8_t **dest, size_t *destlen) +{ + size_t iconv_ret; + uint8_t *pucs4 = (uint8_t *) &ucs4; + size_t sucs4 = 4; + uint8_t *origdest = *dest; + + iconv_ret = iconv(c->write_cd, (char **) (void *) &pucs4, + &sucs4, (char **) dest, destlen); + + if (iconv_ret == (size_t) -1 && errno == E2BIG) { + /* Output buffer is too small */ + return PARSERUTILS_NOMEM; + } else if (iconv_ret == (size_t) -1 && errno == EILSEQ) { + /* Illegal multibyte sequence */ + /* This should never happen */ + abort(); + } else if (iconv_ret == (size_t) -1 && errno == EINVAL) { + /* Incomplete input character */ + /* This should never happen */ + abort(); + } else if (*dest == origdest) { + /* Nothing was output */ + switch (c->base.errormode) { + case PARSERUTILS_CHARSET_CODEC_ERROR_STRICT: + return PARSERUTILS_INVALID; + + case PARSERUTILS_CHARSET_CODEC_ERROR_TRANSLIT: + /** \todo transliteration */ + case PARSERUTILS_CHARSET_CODEC_ERROR_LOOSE: + { + pucs4 = (uint8_t *) &ucs4; + sucs4 = 4; + + ucs4 = parserutils_charset_mibenum_is_unicode( + c->base.mibenum) + ? 
htonl(0xFFFD) : htonl(0x3F); + + iconv_ret = iconv(c->write_cd, + (char **) (void *) &pucs4, &sucs4, + (char **) dest, destlen); + + if (iconv_ret == (size_t) -1 && errno == E2BIG) { + return PARSERUTILS_NOMEM; + } else if (iconv_ret == (size_t) -1 && + errno == EILSEQ) { + /* Illegal multibyte sequence */ + /* This should never happen */ + abort(); + } else if (iconv_ret == (size_t) -1 && + errno == EINVAL) { + /* Incomplete input character */ + /* This should never happen */ + abort(); + } + } + break; + } + } + + return PARSERUTILS_OK; +} + +const parserutils_charset_handler iconv_codec_handler = { + iconv_codec_handles_charset, + iconv_codec_create +}; + +#endif diff --git a/src/charset/codecs/codec_impl.h b/src/charset/codecs/codec_impl.h new file mode 100644 index 0000000..9183594 --- /dev/null +++ b/src/charset/codecs/codec_impl.h @@ -0,0 +1,48 @@ +/* + * This file is part of LibParserUtils. + * Licensed under the MIT License, + * http://www.opensource.org/licenses/mit-license.php + * Copyright 2007 John-Mark Bell + */ + +#ifndef parserutils_charset_codecs_codecimpl_h_ +#define parserutils_charset_codecs_codecimpl_h_ + +#include +#include + +#include + +/** + * Core charset codec definition; implementations extend this + */ +struct parserutils_charset_codec { + uint16_t mibenum; /**< MIB enum for charset */ + + parserutils_charset_codec_errormode errormode; /**< error mode */ + + parserutils_alloc alloc; /**< allocation function */ + void *alloc_pw; /**< private word */ + + struct { + void (*destroy)(parserutils_charset_codec *codec); + parserutils_error (*encode)(parserutils_charset_codec *codec, + const uint8_t **source, size_t *sourcelen, + uint8_t **dest, size_t *destlen); + parserutils_error (*decode)(parserutils_charset_codec *codec, + const uint8_t **source, size_t *sourcelen, + uint8_t **dest, size_t *destlen); + parserutils_error (*reset)(parserutils_charset_codec *codec); + } handler; /**< Vtable for handler code */ +}; + +/** + * Codec factory 
component definition + */ +typedef struct parserutils_charset_handler { + bool (*handles_charset)(const char *charset); + parserutils_charset_codec *(*create)(const char *charset, + parserutils_alloc alloc, void *pw); +} parserutils_charset_handler; + +#endif diff --git a/src/charset/codecs/codec_utf16.c b/src/charset/codecs/codec_utf16.c new file mode 100644 index 0000000..0dd7a07 --- /dev/null +++ b/src/charset/codecs/codec_utf16.c @@ -0,0 +1,544 @@ +/* + * This file is part of LibParserUtils. + * Licensed under the MIT License, + * http://www.opensource.org/licenses/mit-license.php + * Copyright 2007 John-Mark Bell + */ + +#include +#include + +/* These two are for htonl / ntohl */ +#include +#include + +#include +#include + +#include "charset/codecs/codec_impl.h" +#include "utils/utils.h" + +/** + * UTF-16 charset codec + */ +typedef struct charset_utf16_codec { + parserutils_charset_codec base; /**< Base class */ + +#define INVAL_BUFSIZE (32) + uint8_t inval_buf[INVAL_BUFSIZE]; /**< Buffer for fixing up + * incomplete input + * sequences */ + size_t inval_len; /**< Byte length of inval_buf */ + +#define READ_BUFSIZE (8) + uint32_t read_buf[READ_BUFSIZE]; /**< Buffer for partial + * output sequences (decode) + * (host-endian) */ + size_t read_len; /**< Character length of read_buf */ + +#define WRITE_BUFSIZE (8) + uint32_t write_buf[WRITE_BUFSIZE]; /**< Buffer for partial + * output sequences (encode) + * (host-endian) */ + size_t write_len; /**< Character length of write_buf */ + +} charset_utf16_codec; + +static bool charset_utf16_codec_handles_charset(const char *charset); +static parserutils_charset_codec *charset_utf16_codec_create( + const char *charset, parserutils_alloc alloc, void *pw); +static void charset_utf16_codec_destroy (parserutils_charset_codec *codec); +static parserutils_error charset_utf16_codec_encode( + parserutils_charset_codec *codec, + const uint8_t **source, size_t *sourcelen, + uint8_t **dest, size_t *destlen); +static 
parserutils_error charset_utf16_codec_decode( + parserutils_charset_codec *codec, + const uint8_t **source, size_t *sourcelen, + uint8_t **dest, size_t *destlen); +static parserutils_error charset_utf16_codec_reset( + parserutils_charset_codec *codec); +static inline parserutils_error charset_utf16_codec_read_char( + charset_utf16_codec *c, + const uint8_t **source, size_t *sourcelen, + uint8_t **dest, size_t *destlen); +static inline parserutils_error charset_utf16_codec_output_decoded_char( + charset_utf16_codec *c, + uint32_t ucs4, uint8_t **dest, size_t *destlen); + +/** + * Determine whether this codec handles a specific charset + * + * \param charset Charset to test + * \return true if handleable, false otherwise + */ +bool charset_utf16_codec_handles_charset(const char *charset) +{ + return parserutils_charset_mibenum_from_name(charset, strlen(charset)) + == + parserutils_charset_mibenum_from_name("UTF-16", SLEN("UTF-16")); +} + +/** + * Create a utf16 codec + * + * \param charset The charset to read from / write to + * \param alloc Memory (de)allocation function + * \param pw Pointer to client-specific private data (may be NULL) + * \return Pointer to codec, or NULL on failure + */ +parserutils_charset_codec *charset_utf16_codec_create(const char *charset, + parserutils_alloc alloc, void *pw) +{ + charset_utf16_codec *codec; + + UNUSED(charset); + + codec = alloc(NULL, sizeof(charset_utf16_codec), pw); + if (codec == NULL) + return NULL; + + codec->inval_buf[0] = '\0'; + codec->inval_len = 0; + + codec->read_buf[0] = 0; + codec->read_len = 0; + + codec->write_buf[0] = 0; + codec->write_len = 0; + + /* Finally, populate vtable */ + codec->base.handler.destroy = charset_utf16_codec_destroy; + codec->base.handler.encode = charset_utf16_codec_encode; + codec->base.handler.decode = charset_utf16_codec_decode; + codec->base.handler.reset = charset_utf16_codec_reset; + + return (parserutils_charset_codec *) codec; +} + +/** + * Destroy a utf16 codec + * + * \param 
codec The codec to destroy + */ +void charset_utf16_codec_destroy (parserutils_charset_codec *codec) +{ + UNUSED(codec); +} + +/** + * Encode a chunk of UCS4 data into utf16 + * + * \param codec The codec to use + * \param source Pointer to pointer to source data + * \param sourcelen Pointer to length (in bytes) of source data + * \param dest Pointer to pointer to output buffer + * \param destlen Pointer to length (in bytes) of output buffer + * \return PARSERUTILS_OK on success, + * PARSERUTILS_NOMEM if output buffer is too small, + * PARSERUTILS_INVALID if a character cannot be represented and the + * codec's error handling mode is set to STRICT, + * + * On exit, ::source will point immediately _after_ the last input character + * read. Any remaining output for the character will be buffered by the + * codec for writing on the next call. + * + * Note that, if failure occurs whilst attempting to write any output + * buffered by the last call, then ::source and ::sourcelen will remain + * unchanged (as nothing more has been read). + * + * ::sourcelen will be reduced appropriately on exit. + * + * ::dest will point immediately _after_ the last character written. + * + * ::destlen will be reduced appropriately on exit. 
+ */ +parserutils_error charset_utf16_codec_encode(parserutils_charset_codec *codec, + const uint8_t **source, size_t *sourcelen, + uint8_t **dest, size_t *destlen) +{ + charset_utf16_codec *c = (charset_utf16_codec *) codec; + uint32_t ucs4; + uint32_t *towrite; + size_t towritelen; + parserutils_error error; + + /* Process any outstanding characters from the previous call */ + if (c->write_len > 0) { + uint32_t *pwrite = c->write_buf; + uint8_t buf[4]; + size_t len; + + while (c->write_len > 0) { + error = parserutils_charset_utf16_from_ucs4( + pwrite[0], buf, &len); + if (error != PARSERUTILS_OK) + abort(); + + if (*destlen < len) { + /* Insufficient output buffer space */ + for (len = 0; len < c->write_len; len++) + c->write_buf[len] = pwrite[len]; + + return PARSERUTILS_NOMEM; + } + + memcpy(*dest, buf, len); + + *dest += len; + *destlen -= len; + + pwrite++; + c->write_len--; + } + } + + /* Now process the characters for this call */ + while (*sourcelen > 0) { + ucs4 = ntohl(*((uint32_t *) (void *) *source)); + towrite = &ucs4; + towritelen = 1; + + /* Output current characters */ + while (towritelen > 0) { + uint8_t buf[4]; + size_t len; + + error = parserutils_charset_utf16_from_ucs4( + towrite[0], buf, &len); + if (error != PARSERUTILS_OK) + abort(); + + if (*destlen < len) { + /* Insufficient output space */ + if (towritelen >= WRITE_BUFSIZE) + abort(); + + c->write_len = towritelen; + + /* Copy pending chars to save area, for + * processing next call. 
*/ + for (len = 0; len < towritelen; len++) + c->write_buf[len] = towrite[len]; + + /* Claim character we've just buffered, + * so it's not reprocessed */ + *source += 4; + *sourcelen -= 4; + + return PARSERUTILS_NOMEM; + } + + memcpy(*dest, buf, len); + + *dest += len; + *destlen -= len; + + towrite++; + towritelen--; + } + + *source += 4; + *sourcelen -= 4; + } + + return PARSERUTILS_OK; +} + +/** + * Decode a chunk of utf16 data into UCS4 + * + * \param codec The codec to use + * \param source Pointer to pointer to source data + * \param sourcelen Pointer to length (in bytes) of source data + * \param dest Pointer to pointer to output buffer + * \param destlen Pointer to length (in bytes) of output buffer + * \return PARSERUTILS_OK on success, + * PARSERUTILS_NOMEM if output buffer is too small, + * PARSERUTILS_INVALID if a character cannot be represented and the + * codec's error handling mode is set to STRICT, + * + * On exit, ::source will point immediately _after_ the last input character + * read, if the result is _OK or _NOMEM. Any remaining output for the + * character will be buffered by the codec for writing on the next call. + * + * In the case of the result being _INVALID, ::source will point _at_ the + * last input character read; nothing will be written or buffered for the + * failed character. It is up to the client to fix the cause of the failure + * and retry the decoding process. + * + * Note that, if failure occurs whilst attempting to write any output + * buffered by the last call, then ::source and ::sourcelen will remain + * unchanged (as nothing more has been read). + * + * If STRICT error handling is configured and an illegal sequence is split + * over two calls, then _INVALID will be returned from the second call, + * but ::source will point mid-way through the invalid sequence (i.e. it + * will be unmodified over the second call). 
In addition, the internal + * incomplete-sequence buffer will be emptied, such that subsequent calls + * will progress, rather than re-evaluating the same invalid sequence. + * + * ::sourcelen will be reduced appropriately on exit. + * + * ::dest will point immediately _after_ the last character written. + * + * ::destlen will be reduced appropriately on exit. + * + * Call this with a source length of 0 to flush the output buffer. + */ +parserutils_error charset_utf16_codec_decode(parserutils_charset_codec *codec, + const uint8_t **source, size_t *sourcelen, + uint8_t **dest, size_t *destlen) +{ + charset_utf16_codec *c = (charset_utf16_codec *) codec; + parserutils_error error; + + if (c->read_len > 0) { + /* Output left over from last decode */ + uint32_t *pread = c->read_buf; + + while (c->read_len > 0 && *destlen >= c->read_len * 4) { + *((uint32_t *) (void *) *dest) = htonl(pread[0]); + + *dest += 4; + *destlen -= 4; + + pread++; + c->read_len--; + } + + if (*destlen < c->read_len * 4) { + /* Ran out of output buffer */ + size_t i; + + /* Shuffle remaining output down */ + for (i = 0; i < c->read_len; i++) + c->read_buf[i] = pread[i]; + + return PARSERUTILS_NOMEM; + } + } + + if (c->inval_len > 0) { + /* The last decode ended in an incomplete sequence. + * Fill up inval_buf with data from the start of the + * new chunk and process it. */ + uint8_t *in = c->inval_buf; + size_t ol = c->inval_len; + size_t l = min(INVAL_BUFSIZE - ol - 1, *sourcelen); + size_t orig_l = l; + + memcpy(c->inval_buf + ol, *source, l); + + l += c->inval_len; + + error = charset_utf16_codec_read_char(c, + (const uint8_t **) &in, &l, dest, destlen); + if (error != PARSERUTILS_OK && error != PARSERUTILS_NOMEM) { + return error; + } + + /* And now, fix up source pointers */ + *source += max((signed) (orig_l - l), 0); + *sourcelen -= max((signed) (orig_l - l), 0); + + /* Failed to resolve an incomplete character and + * ran out of buffer space. 
No recovery strategy + * possible, so explode everywhere. */ + if ((orig_l + ol) - l == 0) + abort(); + + /* Report memory exhaustion case from above */ + if (error != PARSERUTILS_OK) + return error; + } + + /* Finally, the "normal" case; process all outstanding characters */ + while (*sourcelen > 0) { + error = charset_utf16_codec_read_char(c, + source, sourcelen, dest, destlen); + if (error != PARSERUTILS_OK) { + return error; + } + } + + return PARSERUTILS_OK; +} + +/** + * Clear a utf16 codec's encoding state + * + * \param codec The codec to reset + * \return PARSERUTILS_OK on success, appropriate error otherwise + */ +parserutils_error charset_utf16_codec_reset(parserutils_charset_codec *codec) +{ + charset_utf16_codec *c = (charset_utf16_codec *) codec; + + c->inval_buf[0] = '\0'; + c->inval_len = 0; + + c->read_buf[0] = 0; + c->read_len = 0; + + c->write_buf[0] = 0; + c->write_len = 0; + + return PARSERUTILS_OK; +} + + +/** + * Read a character from the UTF-16 to UCS4 (big endian) + * + * \param c The codec + * \param source Pointer to pointer to source buffer (updated on exit) + * \param sourcelen Pointer to length of source buffer (updated on exit) + * \param dest Pointer to pointer to output buffer (updated on exit) + * \param destlen Pointer to length of output buffer (updated on exit) + * \return PARSERUTILS_OK on success, + * PARSERUTILS_NOMEM if output buffer is too small, + * PARSERUTILS_INVALID if a character cannot be represented and the + * codec's error handling mode is set to STRICT, + * + * On exit, ::source will point immediately _after_ the last input character + * read, if the result is _OK or _NOMEM. Any remaining output for the + * character will be buffered by the codec for writing on the next call. + * + * In the case of the result being _INVALID, ::source will point _at_ the + * last input character read; nothing will be written or buffered for the + * failed character. 
It is up to the client to fix the cause of the failure + * and retry the decoding process. + * + * ::sourcelen will be reduced appropriately on exit. + * + * ::dest will point immediately _after_ the last character written. + * + * ::destlen will be reduced appropriately on exit. + */ +parserutils_error charset_utf16_codec_read_char(charset_utf16_codec *c, + const uint8_t **source, size_t *sourcelen, + uint8_t **dest, size_t *destlen) +{ + uint32_t ucs4; + size_t sucs4; + parserutils_error error; + + /* Convert a single character */ + error = parserutils_charset_utf16_to_ucs4(*source, *sourcelen, + &ucs4, &sucs4); + if (error == PARSERUTILS_OK) { + /* Read a character */ + error = charset_utf16_codec_output_decoded_char(c, + ucs4, dest, destlen); + if (error == PARSERUTILS_OK || error == PARSERUTILS_NOMEM) { + /* output succeeded; update source pointers */ + *source += sucs4; + *sourcelen -= sucs4; + } + + /* Clear inval buffer */ + c->inval_buf[0] = '\0'; + c->inval_len = 0; + + return error; + } else if (error == PARSERUTILS_NEEDDATA) { + /* Incomplete input sequence */ + if (*sourcelen > INVAL_BUFSIZE) + abort(); + + memmove(c->inval_buf, (char *) *source, *sourcelen); + c->inval_buf[*sourcelen] = '\0'; + c->inval_len = *sourcelen; + + *source += *sourcelen; + *sourcelen = 0; + + return PARSERUTILS_OK; + } else if (error == PARSERUTILS_INVALID) { + /* Illegal input sequence */ + uint32_t nextchar; + + /* Clear inval buffer */ + c->inval_buf[0] = '\0'; + c->inval_len = 0; + + /* Strict errormode; simply flag invalid character */ + if (c->base.errormode == + PARSERUTILS_CHARSET_CODEC_ERROR_STRICT) { + return PARSERUTILS_INVALID; + } + + /* Find next valid UTF-16 sequence. + * We're processing client-provided data, so let's + * be paranoid about its validity. 
*/ + error = parserutils_charset_utf16_next_paranoid( + *source, *sourcelen, 0, &nextchar); + if (error != PARSERUTILS_OK) { + if (error == PARSERUTILS_NEEDDATA) { + /* Need more data to be sure */ + if (*sourcelen > INVAL_BUFSIZE) + abort(); + + memmove(c->inval_buf, (char *) *source, + *sourcelen); + c->inval_buf[*sourcelen] = '\0'; + c->inval_len = *sourcelen; + + *source += *sourcelen; + *sourcelen = 0; + + nextchar = 0; + } else { + return error; + } + } + + /* output U+FFFD and continue processing. */ + error = charset_utf16_codec_output_decoded_char(c, + 0xFFFD, dest, destlen); + if (error == PARSERUTILS_OK || error == PARSERUTILS_NOMEM) { + /* output succeeded; update source pointers */ + *source += nextchar; + *sourcelen -= nextchar; + } + + return error; + } + + return PARSERUTILS_OK; +} + +/** + * Output a UCS4 character + * + * \param c Codec to use + * \param ucs4 UCS4 character (host endian) + * \param dest Pointer to pointer to output buffer + * \param destlen Pointer to output buffer length + * \return PARSERUTILS_OK on success, + * PARSERUTILS_NOMEM if output buffer is too small, + */ +parserutils_error charset_utf16_codec_output_decoded_char(charset_utf16_codec *c, + uint32_t ucs4, uint8_t **dest, size_t *destlen) +{ + if (*destlen < 4) { + /* Run out of output buffer */ + c->read_len = 1; + c->read_buf[0] = ucs4; + + return PARSERUTILS_NOMEM; + } + + *((uint32_t *) (void *) *dest) = htonl(ucs4); + *dest += 4; + *destlen -= 4; + + return PARSERUTILS_OK; +} + + +const parserutils_charset_handler charset_utf16_codec_handler = { + charset_utf16_codec_handles_charset, + charset_utf16_codec_create +}; diff --git a/src/charset/codecs/codec_utf8.c b/src/charset/codecs/codec_utf8.c new file mode 100644 index 0000000..838d051 --- /dev/null +++ b/src/charset/codecs/codec_utf8.c @@ -0,0 +1,546 @@ +/* + * This file is part of LibParserUtils. 
+ * Licensed under the MIT License, + * http://www.opensource.org/licenses/mit-license.php + * Copyright 2007 John-Mark Bell + */ + +#include +#include + +/* These two are for htonl / ntohl */ +#include +#include + +#include + +#include "charset/codecs/codec_impl.h" +#include "charset/encodings/utf8impl.h" +#include "utils/utils.h" + +/** + * UTF-8 charset codec + */ +typedef struct charset_utf8_codec { + parserutils_charset_codec base; /**< Base class */ + +#define INVAL_BUFSIZE (32) + uint8_t inval_buf[INVAL_BUFSIZE]; /**< Buffer for fixing up + * incomplete input + * sequences */ + size_t inval_len; /**< Byte length of inval_buf */ + +#define READ_BUFSIZE (8) + uint32_t read_buf[READ_BUFSIZE]; /**< Buffer for partial + * output sequences (decode) + * (host-endian) */ + size_t read_len; /**< Character length of read_buf */ + +#define WRITE_BUFSIZE (8) + uint32_t write_buf[WRITE_BUFSIZE]; /**< Buffer for partial + * output sequences (encode) + * (host-endian) */ + size_t write_len; /**< Character length of write_buf */ + +} charset_utf8_codec; + +static bool charset_utf8_codec_handles_charset(const char *charset); +static parserutils_charset_codec *charset_utf8_codec_create(const char *charset, + parserutils_alloc alloc, void *pw); +static void charset_utf8_codec_destroy (parserutils_charset_codec *codec); +static parserutils_error charset_utf8_codec_encode( + parserutils_charset_codec *codec, + const uint8_t **source, size_t *sourcelen, + uint8_t **dest, size_t *destlen); +static parserutils_error charset_utf8_codec_decode( + parserutils_charset_codec *codec, + const uint8_t **source, size_t *sourcelen, + uint8_t **dest, size_t *destlen); +static parserutils_error charset_utf8_codec_reset( + parserutils_charset_codec *codec); +static inline parserutils_error charset_utf8_codec_read_char( + charset_utf8_codec *c, + const uint8_t **source, size_t *sourcelen, + uint8_t **dest, size_t *destlen); +static inline parserutils_error charset_utf8_codec_output_decoded_char( 
+ charset_utf8_codec *c, + uint32_t ucs4, uint8_t **dest, size_t *destlen); + +/** + * Determine whether this codec handles a specific charset + * + * \param charset Charset to test + * \return true if handleable, false otherwise + */ +bool charset_utf8_codec_handles_charset(const char *charset) +{ + return parserutils_charset_mibenum_from_name(charset, + strlen(charset)) == + parserutils_charset_mibenum_from_name("UTF-8", + SLEN("UTF-8")); +} + +/** + * Create a utf8 codec + * + * \param charset The charset to read from / write to + * \param alloc Memory (de)allocation function + * \param pw Pointer to client-specific private data (may be NULL) + * \return Pointer to codec, or NULL on failure + */ +parserutils_charset_codec *charset_utf8_codec_create(const char *charset, + parserutils_alloc alloc, void *pw) +{ + charset_utf8_codec *codec; + + UNUSED(charset); + + codec = alloc(NULL, sizeof(charset_utf8_codec), pw); + if (codec == NULL) + return NULL; + + codec->inval_buf[0] = '\0'; + codec->inval_len = 0; + + codec->read_buf[0] = 0; + codec->read_len = 0; + + codec->write_buf[0] = 0; + codec->write_len = 0; + + /* Finally, populate vtable */ + codec->base.handler.destroy = charset_utf8_codec_destroy; + codec->base.handler.encode = charset_utf8_codec_encode; + codec->base.handler.decode = charset_utf8_codec_decode; + codec->base.handler.reset = charset_utf8_codec_reset; + + return (parserutils_charset_codec *) codec; +} + +/** + * Destroy a utf8 codec + * + * \param codec The codec to destroy + */ +void charset_utf8_codec_destroy (parserutils_charset_codec *codec) +{ + UNUSED(codec); +} + +/** + * Encode a chunk of UCS4 data into utf8 + * + * \param codec The codec to use + * \param source Pointer to pointer to source data + * \param sourcelen Pointer to length (in bytes) of source data + * \param dest Pointer to pointer to output buffer + * \param destlen Pointer to length (in bytes) of output buffer + * \return PARSERUTILS_OK on success, + * PARSERUTILS_NOMEM if 
output buffer is too small, + * PARSERUTILS_INVALID if a character cannot be represented and the + * codec's error handling mode is set to STRICT, + * + * On exit, ::source will point immediately _after_ the last input character + * read. Any remaining output for the character will be buffered by the + * codec for writing on the next call. + * + * Note that, if failure occurs whilst attempting to write any output + * buffered by the last call, then ::source and ::sourcelen will remain + * unchanged (as nothing more has been read). + * + * ::sourcelen will be reduced appropriately on exit. + * + * ::dest will point immediately _after_ the last character written. + * + * ::destlen will be reduced appropriately on exit. + */ +parserutils_error charset_utf8_codec_encode(parserutils_charset_codec *codec, + const uint8_t **source, size_t *sourcelen, + uint8_t **dest, size_t *destlen) +{ + charset_utf8_codec *c = (charset_utf8_codec *) codec; + uint32_t ucs4; + uint32_t *towrite; + size_t towritelen; + parserutils_error error; + + /* Process any outstanding characters from the previous call */ + if (c->write_len > 0) { + uint32_t *pwrite = c->write_buf; + + while (c->write_len > 0) { + UTF8_FROM_UCS4(pwrite[0], dest, destlen, error); + if (error != PARSERUTILS_OK) { + if (error != PARSERUTILS_NOMEM) + abort(); + + /* Insufficient output buffer space */ + for (uint32_t len = 0; + len < c->write_len; len++) { + c->write_buf[len] = pwrite[len]; + } + + return PARSERUTILS_NOMEM; + } + + pwrite++; + c->write_len--; + } + } + + /* Now process the characters for this call */ + while (*sourcelen > 0) { + ucs4 = ntohl(*((uint32_t *) (void *) *source)); + towrite = &ucs4; + towritelen = 1; + + /* Output current characters */ + while (towritelen > 0) { + UTF8_FROM_UCS4(towrite[0], dest, destlen, error); + if (error != PARSERUTILS_OK) { + if (error != PARSERUTILS_NOMEM) + abort(); + + /* Insufficient output space */ + if (towritelen >= WRITE_BUFSIZE) + abort(); + + c->write_len = 
towritelen; + + /* Copy pending chars to save area, for + * processing next call. */ + for (uint32_t len = 0; len < towritelen; len++) + c->write_buf[len] = towrite[len]; + + /* Claim character we've just buffered, + * so it's not reprocessed */ + *source += 4; + *sourcelen -= 4; + + return PARSERUTILS_NOMEM; + } + + towrite++; + towritelen--; + } + + *source += 4; + *sourcelen -= 4; + } + + return PARSERUTILS_OK; +} + +/** + * Decode a chunk of utf8 data into UCS4 + * + * \param codec The codec to use + * \param source Pointer to pointer to source data + * \param sourcelen Pointer to length (in bytes) of source data + * \param dest Pointer to pointer to output buffer + * \param destlen Pointer to length (in bytes) of output buffer + * \return PARSERUTILS_OK on success, + * PARSERUTILS_NOMEM if output buffer is too small, + * PARSERUTILS_INVALID if a character cannot be represented and the + * codec's error handling mode is set to STRICT, + * + * On exit, ::source will point immediately _after_ the last input character + * read, if the result is _OK or _NOMEM. Any remaining output for the + * character will be buffered by the codec for writing on the next call. + * + * In the case of the result being _INVALID, ::source will point _at_ the + * last input character read; nothing will be written or buffered for the + * failed character. It is up to the client to fix the cause of the failure + * and retry the decoding process. + * + * Note that, if failure occurs whilst attempting to write any output + * buffered by the last call, then ::source and ::sourcelen will remain + * unchanged (as nothing more has been read). + * + * If STRICT error handling is configured and an illegal sequence is split + * over two calls, then _INVALID will be returned from the second call, + * but ::source will point mid-way through the invalid sequence (i.e. it + * will be unmodified over the second call). 
In addition, the internal + * incomplete-sequence buffer will be emptied, such that subsequent calls + * will progress, rather than re-evaluating the same invalid sequence. + * + * ::sourcelen will be reduced appropriately on exit. + * + * ::dest will point immediately _after_ the last character written. + * + * ::destlen will be reduced appropriately on exit. + * + * Call this with a source length of 0 to flush the output buffer. + */ +parserutils_error charset_utf8_codec_decode(parserutils_charset_codec *codec, + const uint8_t **source, size_t *sourcelen, + uint8_t **dest, size_t *destlen) +{ + charset_utf8_codec *c = (charset_utf8_codec *) codec; + parserutils_error error; + + if (c->read_len > 0) { + /* Output left over from last decode */ + uint32_t *pread = c->read_buf; + + while (c->read_len > 0 && *destlen >= c->read_len * 4) { + *((uint32_t *) (void *) *dest) = htonl(pread[0]); + + *dest += 4; + *destlen -= 4; + + pread++; + c->read_len--; + } + + if (*destlen < c->read_len * 4) { + /* Ran out of output buffer */ + size_t i; + + /* Shuffle remaining output down */ + for (i = 0; i < c->read_len; i++) + c->read_buf[i] = pread[i]; + + return PARSERUTILS_NOMEM; + } + } + + if (c->inval_len > 0) { + /* The last decode ended in an incomplete sequence. + * Fill up inval_buf with data from the start of the + * new chunk and process it. */ + uint8_t *in = c->inval_buf; + size_t ol = c->inval_len; + size_t l = min(INVAL_BUFSIZE - ol - 1, *sourcelen); + size_t orig_l = l; + + memcpy(c->inval_buf + ol, *source, l); + + l += c->inval_len; + + error = charset_utf8_codec_read_char(c, + (const uint8_t **) &in, &l, dest, destlen); + if (error != PARSERUTILS_OK && error != PARSERUTILS_NOMEM) { + return error; + } + + /* And now, fix up source pointers */ + *source += max((signed) (orig_l - l), 0); + *sourcelen -= max((signed) (orig_l - l), 0); + + /* Failed to resolve an incomplete character and + * ran out of buffer space. 
No recovery strategy + * possible, so explode everywhere. */ + if ((orig_l + ol) - l == 0) + abort(); + + /* Report memory exhaustion case from above */ + if (error != PARSERUTILS_OK) + return error; + } + + /* Finally, the "normal" case; process all outstanding characters */ + while (*sourcelen > 0) { + error = charset_utf8_codec_read_char(c, + source, sourcelen, dest, destlen); + if (error != PARSERUTILS_OK) { + return error; + } + } + + return PARSERUTILS_OK; +} + +/** + * Clear a utf8 codec's encoding state + * + * \param codec The codec to reset + * \return PARSERUTILS_OK on success, appropriate error otherwise + */ +parserutils_error charset_utf8_codec_reset(parserutils_charset_codec *codec) +{ + charset_utf8_codec *c = (charset_utf8_codec *) codec; + + c->inval_buf[0] = '\0'; + c->inval_len = 0; + + c->read_buf[0] = 0; + c->read_len = 0; + + c->write_buf[0] = 0; + c->write_len = 0; + + return PARSERUTILS_OK; +} + + +/** + * Read a character from the UTF-8 to UCS4 (big endian) + * + * \param c The codec + * \param source Pointer to pointer to source buffer (updated on exit) + * \param sourcelen Pointer to length of source buffer (updated on exit) + * \param dest Pointer to pointer to output buffer (updated on exit) + * \param destlen Pointer to length of output buffer (updated on exit) + * \return PARSERUTILS_OK on success, + * PARSERUTILS_NOMEM if output buffer is too small, + * PARSERUTILS_INVALID if a character cannot be represented and the + * codec's error handling mode is set to STRICT, + * + * On exit, ::source will point immediately _after_ the last input character + * read, if the result is _OK or _NOMEM. Any remaining output for the + * character will be buffered by the codec for writing on the next call. + * + * In the case of the result being _INVALID, ::source will point _at_ the + * last input character read; nothing will be written or buffered for the + * failed character. 
It is up to the client to fix the cause of the failure + * and retry the decoding process. + * + * ::sourcelen will be reduced appropriately on exit. + * + * ::dest will point immediately _after_ the last character written. + * + * ::destlen will be reduced appropriately on exit. + */ +parserutils_error charset_utf8_codec_read_char(charset_utf8_codec *c, + const uint8_t **source, size_t *sourcelen, + uint8_t **dest, size_t *destlen) +{ + uint32_t ucs4; + size_t sucs4; + parserutils_error error; + + /* Convert a single character */ + { + const uint8_t *src = *source; + size_t srclen = *sourcelen; + uint32_t *uptr = &ucs4; + size_t *usptr = &sucs4; + UTF8_TO_UCS4(src, srclen, uptr, usptr, error); + } + if (error == PARSERUTILS_OK) { + /* Read a character */ + error = charset_utf8_codec_output_decoded_char(c, + ucs4, dest, destlen); + if (error == PARSERUTILS_OK || error == PARSERUTILS_NOMEM) { + /* output succeeded; update source pointers */ + *source += sucs4; + *sourcelen -= sucs4; + } + + /* Clear inval buffer */ + c->inval_buf[0] = '\0'; + c->inval_len = 0; + + return error; + } else if (error == PARSERUTILS_NEEDDATA) { + /* Incomplete input sequence */ + if (*sourcelen > INVAL_BUFSIZE) + abort(); + + memmove(c->inval_buf, (char *) *source, *sourcelen); + c->inval_buf[*sourcelen] = '\0'; + c->inval_len = *sourcelen; + + *source += *sourcelen; + *sourcelen = 0; + + return PARSERUTILS_OK; + } else if (error == PARSERUTILS_INVALID) { + /* Illegal input sequence */ + uint32_t nextchar; + + /* Strict errormode; simply flag invalid character */ + if (c->base.errormode == + PARSERUTILS_CHARSET_CODEC_ERROR_STRICT) { + /* Clear inval buffer */ + c->inval_buf[0] = '\0'; + c->inval_len = 0; + + return PARSERUTILS_INVALID; + } + + /* Find next valid UTF-8 sequence. + * We're processing client-provided data, so let's + * be paranoid about its validity. 
*/ + { + const uint8_t *src = *source; + size_t srclen = *sourcelen; + uint32_t off = 0; + uint32_t *ncptr = &nextchar; + + UTF8_NEXT_PARANOID(src, srclen, off, ncptr, error); + } + if (error != PARSERUTILS_OK) { + if (error == PARSERUTILS_NEEDDATA) { + /* Need more data to be sure */ + if (*sourcelen > INVAL_BUFSIZE) + abort(); + + memmove(c->inval_buf, (char *) *source, + *sourcelen); + c->inval_buf[*sourcelen] = '\0'; + c->inval_len = *sourcelen; + + *source += *sourcelen; + *sourcelen = 0; + + nextchar = 0; + } else { + return error; + } + } + + /* Clear inval buffer */ + c->inval_buf[0] = '\0'; + c->inval_len = 0; + + /* output U+FFFD and continue processing. */ + error = charset_utf8_codec_output_decoded_char(c, + 0xFFFD, dest, destlen); + if (error == PARSERUTILS_OK || error == PARSERUTILS_NOMEM) { + /* output succeeded; update source pointers */ + *source += nextchar; + *sourcelen -= nextchar; + } + + return error; + } + + return PARSERUTILS_OK; +} + +/** + * Output a UCS4 character + * + * \param c Codec to use + * \param ucs4 UCS4 character (host endian) + * \param dest Pointer to pointer to output buffer + * \param destlen Pointer to output buffer length + * \return PARSERUTILS_OK on success, + * PARSERUTILS_NOMEM if output buffer is too small, + */ +parserutils_error charset_utf8_codec_output_decoded_char(charset_utf8_codec *c, + uint32_t ucs4, uint8_t **dest, size_t *destlen) +{ + if (*destlen < 4) { + /* Run out of output buffer */ + c->read_len = 1; + c->read_buf[0] = ucs4; + + return PARSERUTILS_NOMEM; + } + + *((uint32_t *) (void *) *dest) = htonl(ucs4); + *dest += 4; + *destlen -= 4; + + return PARSERUTILS_OK; +} + + +const parserutils_charset_handler charset_utf8_codec_handler = { + charset_utf8_codec_handles_charset, + charset_utf8_codec_create +}; + diff --git a/src/charset/encodings/Makefile b/src/charset/encodings/Makefile new file mode 100644 index 0000000..47d9210 --- /dev/null +++ b/src/charset/encodings/Makefile @@ -0,0 +1,46 @@ +# Child 
makefile fragment +# +# Toolchain is provided by top-level makefile +# +# Variables provided by top-level makefile +# +# COMPONENT The name of the component +# EXPORT The location of the export directory +# TOP The location of the source tree root +# RELEASEDIR The place to put release objects +# DEBUGDIR The place to put debug objects +# +# do_include Canned command sequence to include a child makefile +# +# Variables provided by parent makefile: +# +# DIR The name of the directory we're in, relative to $(TOP) +# +# Variables we can manipulate: +# +# ITEMS_CLEAN The list of items to remove for "make clean" +# ITEMS_DISTCLEAN The list of items to remove for "make distclean" +# TARGET_TESTS The list of target names to run for "make test" +# +# SOURCES The list of sources to build for $(COMPONENT) +# +# Plus anything from the toolchain + +# Push parent directory onto the directory stack +sp := $(sp).x +dirstack_$(sp) := $(d) +d := $(DIR) + +# Sources +SRCS_$(d) := utf8.c utf16.c + +# Append to sources for component +SOURCES += $(addprefix $(d), $(SRCS_$(d))) + +# Now include any children we may have +MAKE_INCLUDES := $(wildcard $(d)*/Makefile) +$(eval $(foreach INC, $(MAKE_INCLUDES), $(call do_include,$(INC)))) + +# Finally, pop off the directory stack +d := $(dirstack_$(sp)) +sp := $(basename $(sp)) diff --git a/src/charset/encodings/utf16.c b/src/charset/encodings/utf16.c new file mode 100644 index 0000000..95dc64f --- /dev/null +++ b/src/charset/encodings/utf16.c @@ -0,0 +1,239 @@ +/* + * This file is part of LibParserUtils. + * Licensed under the MIT License, + * http://www.opensource.org/licenses/mit-license.php + * Copyright 2007 John-Mark Bell + */ + +/** \file + * UTF-16 manipulation functions (implementation). 
+ */ + +#include +#include +#include + +#include + +/** + * Convert a UTF-16 sequence into a single UCS4 character + * + * Code units are read from s as uint16_t values. A unit outside the + * surrogate range is returned unchanged. A high surrogate (0xD800-0xDBFF) + * must be followed by a low surrogate (0xDC00-0xDFFF); the pair is combined + * into a code point in the range 0x10000-0x10FFFF. A low surrogate that is + * not preceded by a high surrogate is rejected. + * + * \param s The sequence to process + * \param len Length of sequence + * \param ucs4 Pointer to location to receive UCS4 character (host endian) + * \param clen Pointer to location to receive byte length of UTF-16 sequence + * \return PARSERUTILS_OK on success, + * PARSERUTILS_BADPARM if any pointer parameter is NULL, + * PARSERUTILS_NEEDDATA if the sequence is truncated, + * PARSERUTILS_INVALID if the sequence is not well-formed + */ +parserutils_error parserutils_charset_utf16_to_ucs4(const uint8_t *s, + size_t len, uint32_t *ucs4, size_t *clen) +{ + const uint16_t *ss = (const uint16_t *) (const void *) s; + + if (s == NULL || ucs4 == NULL || clen == NULL) + return PARSERUTILS_BADPARM; + + if (len < 2) + return PARSERUTILS_NEEDDATA; + + if (*ss < 0xD800 || *ss > 0xDFFF) { + /* Not a surrogate: the code unit is the code point */ + *ucs4 = *ss; + *clen = 2; + } else if (0xD800 <= *ss && *ss <= 0xDBFF) { + /* High surrogate: a low surrogate must follow */ + if (len < 4) + return PARSERUTILS_NEEDDATA; + + if (0xDC00 <= ss[1] && ss[1] <= 0xDFFF) { + /* Each surrogate carries 10 bits of the offset from 0x10000 */ + *ucs4 = 0x10000 + ((((uint32_t) ss[0] & 0x3ff) << 10) | + ((uint32_t) ss[1] & 0x3ff)); + *clen = 4; + } else { + return PARSERUTILS_INVALID; + } + } else { + /* Low surrogate with no preceding high surrogate */ + return PARSERUTILS_INVALID; + } + + return PARSERUTILS_OK; +} + +/** + * Convert a single UCS4 character into a UTF-16 sequence + * + * Values below 0x10000 are written as a single code unit; values in the + * range 0x10000-0x10FFFF are written as a high/low surrogate pair. Code + * units are stored as uint16_t values. + * + * \param ucs4 The character to process (0 <= c <= 0x10FFFF) (host endian) + * \param s Pointer to 4 byte long output buffer + * \param len Pointer to location to receive length of multibyte sequence + * \return PARSERUTILS_OK on success, + * PARSERUTILS_BADPARM if s or len is NULL, + * PARSERUTILS_INVALID if ucs4 is 0x110000 or greater + */ +parserutils_error parserutils_charset_utf16_from_ucs4(uint32_t ucs4, uint8_t *s, + size_t *len) +{ + uint16_t *ss = (uint16_t *) (void *) s; + uint32_t l = 0; + + if (s == NULL || len == NULL) + return PARSERUTILS_BADPARM; + else if (ucs4 < 0x10000) { + *ss = (uint16_t) ucs4; + l = 2; + } else if (ucs4 < 0x110000) { + /* Split the 20 bit offset from 0x10000 across the two surrogates */ + ucs4 -= 0x10000; + ss[0] = (uint16_t) (0xD800 | (ucs4 >> 10)); + ss[1] = (uint16_t) (0xDC00 | (ucs4 & 0x3ff)); + l = 4; + } else { + return PARSERUTILS_INVALID; + } + + *len = l; + + return PARSERUTILS_OK; +} + +/** + * Calculate the length (in
characters) of a bounded UTF-16 string + * + * \param s The string + * \param max Maximum length + * \param len Pointer to location to receive length of string + * \return PARSERUTILS_OK on success, appropriate error otherwise + */ +parserutils_error parserutils_charset_utf16_length(const uint8_t *s, size_t max, + size_t *len) +{ + const uint16_t *ss = (const uint16_t *) (const void *) s; + const uint16_t *end = (const uint16_t *) (const void *) (s + max); + int l = 0; + + if (s == NULL || len == NULL) + return PARSERUTILS_BADPARM; + + while (ss < end) { + /* Any surrogate code unit is treated as the start of a two-unit pair */ if (*ss < 0xD800 || 0xDFFF < *ss) + ss++; + else + ss += 2; + + l++; + } + + *len = l; + + return PARSERUTILS_OK; +} + +/** + * Calculate the length (in bytes) of a UTF-16 character + * + * The length is 2 for a non-surrogate code unit and 4 for any code unit in + * the surrogate range (0xD800-0xDFFF); the following unit is not inspected. + * + * \param s Pointer to start of character + * \param len Pointer to location to receive length + * \return PARSERUTILS_OK on success, + * PARSERUTILS_BADPARM if s or len is NULL + */ +parserutils_error parserutils_charset_utf16_char_byte_length(const uint8_t *s, + size_t *len) +{ + const uint16_t *ss = (const uint16_t *) (const void *) s; + + if (s == NULL || len == NULL) + return PARSERUTILS_BADPARM; + + if (*ss < 0xD800 || 0xDFFF < *ss) + *len = 2; + else + *len = 4; + + return PARSERUTILS_OK; +} + +/** + * Find previous legal UTF-16 char in string + * + * If off is less than 2 the result is 0. Otherwise the code unit immediately + * before s is inspected: if it is a low surrogate (0xDC00-0xDFFF) the result + * is off - 4 (clamped to 0), else off - 2. + * + * NOTE(review): the unit inspected is ss[-1], i.e. relative to s rather than + * to s + off -- presumably callers pass s already positioned at off; confirm + * against callers, since otherwise this reads before the start of the string. + * + * \param s The string + * \param off Offset in the string to start at + * \param prevoff Pointer to location to receive offset of first byte of + * previous legal character + * \return PARSERUTILS_OK on success, + * PARSERUTILS_BADPARM if s or prevoff is NULL + */ +parserutils_error parserutils_charset_utf16_prev(const uint8_t *s, uint32_t off, + uint32_t *prevoff) +{ + const uint16_t *ss = (const uint16_t *) (const void *) s; + + if (s == NULL || prevoff == NULL) + return PARSERUTILS_BADPARM; + + if (off < 2) + *prevoff = 0; + else if (ss[-1] < 0xDC00 || ss[-1] > 0xDFFF) + *prevoff = off - 2; + else + *prevoff = (off < 4) ?
0 : off - 4; + + return PARSERUTILS_OK; +} + +/** + * Find next legal UTF-16 char in string + * + * NOTE(review): the code unit inspected is ss[1], i.e. relative to s rather + * than to s + off -- presumably callers pass s already positioned at off; + * confirm against callers. + * + * \param s The string (assumed valid) + * \param len Maximum offset in string + * \param off Offset in the string to start at + * \param nextoff Pointer to location to receive offset of first byte of + * next legal character + * \return PARSERUTILS_OK on success, + * PARSERUTILS_BADPARM if s or nextoff is NULL, or off >= len + */ +parserutils_error parserutils_charset_utf16_next(const uint8_t *s, uint32_t len, + uint32_t off, uint32_t *nextoff) +{ + const uint16_t *ss = (const uint16_t *) (const void *) s; + + if (s == NULL || off >= len || nextoff == NULL) + return PARSERUTILS_BADPARM; + + if (len - off < 4) + *nextoff = len; + else if (ss[1] < 0xD800 || ss[1] > 0xDBFF) + *nextoff = off + 2; + else + *nextoff = (len - off < 6) ? len : off + 4; + + return PARSERUTILS_OK; +} + +/** + * Find next legal UTF-16 char in string + * + * Scans forward one code unit at a time until the unit after the current + * position is either a non-surrogate or a high surrogate followed by a low + * surrogate. Unpaired surrogates are skipped. + * + * NOTE(review): as with parserutils_charset_utf16_next, units are indexed + * relative to s rather than to s + off -- confirm against callers. + * + * \param s The string (assumed to be of dubious validity) + * \param len Maximum offset in string + * \param off Offset in the string to start at + * \param nextoff Pointer to location to receive offset of first byte of + * next legal character + * \return PARSERUTILS_OK on success, + * PARSERUTILS_BADPARM if s or nextoff is NULL, or off >= len, + * PARSERUTILS_NEEDDATA if the input ends before a decision + * can be made + */ +parserutils_error parserutils_charset_utf16_next_paranoid(const uint8_t *s, + uint32_t len, uint32_t off, uint32_t *nextoff) +{ + const uint16_t *ss = (const uint16_t *) (const void *) s; + + if (s == NULL || off >= len || nextoff == NULL) + return PARSERUTILS_BADPARM; + + while (1) { + if (len - off < 4) { + return PARSERUTILS_NEEDDATA; + } else if (ss[1] < 0xD800 || ss[1] > 0xDFFF) { + *nextoff = off + 2; + break; + } else if (ss[1] >= 0xD800 && ss[1] <= 0xDBFF) { + if (len - off < 6) + return PARSERUTILS_NEEDDATA; + + if (ss[2] >= 0xDC00 && ss[2] <= 0xDFFF) { + *nextoff = off + 4; + break; + } else { + /* High surrogate without a low surrogate: skip it */ + ss++; + off += 2; + } + } else { + /* Low surrogate: step over it. Without this branch no + * progress is made and the loop never terminates. */ + ss++; + off += 2; + } + } + + return PARSERUTILS_OK; +} + +diff --git a/src/charset/encodings/utf8.c b/src/charset/encodings/utf8.c new file mode 100644 index
0000000..5b4ba95 --- /dev/null +++ b/src/charset/encodings/utf8.c @@ -0,0 +1,175 @@ +/* + * This file is part of LibParserUtils. + * Licensed under the MIT License, + * http://www.opensource.org/licenses/mit-license.php + * Copyright 2007 John-Mark Bell + */ + +/** \file + * UTF-8 manipulation functions (implementation). + */ + +#include +#include +#include + +#include +#include "charset/encodings/utf8impl.h" + +/** Number of continuation bytes for a given start byte */ +const uint8_t numContinuations[256] = { + 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, + 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, + 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, + 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, + 3, 3, 3, 3, 3, 3, 3, 3, 4, 4, 4, 4, 5, 5, 5, 5, +}; + +/** + * Convert a UTF-8 multibyte sequence into a single UCS4 character + * + * Encoding of UCS values outside the UTF-16 plane has been removed from + * RFC3629. This function conforms to RFC2279, however. 
+ * + * \param s The sequence to process + * \param len Length of sequence + * \param ucs4 Pointer to location to receive UCS4 character (host endian) + * \param clen Pointer to location to receive byte length of UTF-8 sequence + * \return PARSERUTILS_OK on success, appropriate error otherwise + */ +parserutils_error parserutils_charset_utf8_to_ucs4(const uint8_t *s, size_t len, + uint32_t *ucs4, size_t *clen) +{ + parserutils_error error; + + UTF8_TO_UCS4(s, len, ucs4, clen, error); + + return error; +} + +/** + * Convert a single UCS4 character into a UTF-8 multibyte sequence + * + * Encoding of UCS values outside the UTF-16 plane has been removed from + * RFC3629. This function conforms to RFC2279, however. + * + * \param ucs4 The character to process (0 <= c <= 0x7FFFFFFF) (host endian) + * \param s Pointer to pointer to output buffer, updated on exit + * \param len Pointer to length, in bytes, of output buffer, updated on exit + * \return PARSERUTILS_OK on success, appropriate error otherwise + */ +parserutils_error parserutils_charset_utf8_from_ucs4(uint32_t ucs4, + uint8_t **s, size_t *len) +{ + parserutils_error error; + + UTF8_FROM_UCS4(ucs4, s, len, error); + + return error; +} + +/** + * Calculate the length (in characters) of a bounded UTF-8 string + * + * \param s The string + * \param max Maximum length + * \param len Pointer to location to receive length of string + * \return PARSERUTILS_OK on success, appropriate error otherwise + */ +parserutils_error parserutils_charset_utf8_length(const uint8_t *s, size_t max, + size_t *len) +{ + parserutils_error error; + + UTF8_LENGTH(s, max, len, error); + + return error; +} + +/** + * Calculate the length (in bytes) of a UTF-8 character + * + * \param s Pointer to start of character + * \param len Pointer to location to receive length + * \return PARSERUTILS_OK on success, appropriate error otherwise + */ +parserutils_error parserutils_charset_utf8_char_byte_length(const uint8_t *s, + size_t *len) +{ + 
parserutils_error error; + + UTF8_CHAR_BYTE_LENGTH(s, len, error); + + return error; +} + +/** + * Find previous legal UTF-8 char in string + * + * \param s The string + * \param off Offset in the string to start at + * \param prevoff Pointer to location to receive offset of first byte of + * previous legal character + * \return PARSERUTILS_OK on success, appropriate error otherwise + */ +parserutils_error parserutils_charset_utf8_prev(const uint8_t *s, uint32_t off, + uint32_t *prevoff) +{ + parserutils_error error; + + UTF8_PREV(s, off, prevoff, error); + + return error; +} + +/** + * Find next legal UTF-8 char in string + * + * \param s The string (assumed valid) + * \param len Maximum offset in string + * \param off Offset in the string to start at + * \param nextoff Pointer to location to receive offset of first byte of + * next legal character + * \return PARSERUTILS_OK on success, appropriate error otherwise + */ +parserutils_error parserutils_charset_utf8_next(const uint8_t *s, uint32_t len, + uint32_t off, uint32_t *nextoff) +{ + parserutils_error error; + + UTF8_NEXT(s, len, off, nextoff, error); + + return error; +} + +/** + * Find next legal UTF-8 char in string + * + * \param s The string (assumed to be of dubious validity) + * \param len Maximum offset in string + * \param off Offset in the string to start at + * \param nextoff Pointer to location to receive offset of first byte of + * next legal character + * \return PARSERUTILS_OK on success, appropriate error otherwise + */ +parserutils_error parserutils_charset_utf8_next_paranoid(const uint8_t *s, + uint32_t len, uint32_t off, uint32_t *nextoff) +{ + parserutils_error error; + + UTF8_NEXT_PARANOID(s, len, off, nextoff, error); + + return error; +} + diff --git a/src/charset/encodings/utf8impl.h b/src/charset/encodings/utf8impl.h new file mode 100644 index 0000000..1ca9de7 --- /dev/null +++ b/src/charset/encodings/utf8impl.h @@ -0,0 +1,339 @@ +/* + * This file is part of LibParserUtils. 
+ * Licensed under the MIT License, + * http://www.opensource.org/licenses/mit-license.php + * Copyright 2007 John-Mark Bell + */ + +#ifndef parserutils_charset_encodings_utf8impl_h_ +#define parserutils_charset_encodings_utf8impl_h_ + +/** \file + * UTF-8 manipulation macros (implementation). + */ + +#include +#include +#include + +/** Number of continuation bytes for a given start byte */ +extern const uint8_t numContinuations[256]; + +/** + * Convert a UTF-8 multibyte sequence into a single UCS4 character + * + * Encoding of UCS values outside the UTF-16 plane has been removed from + * RFC3629. This macro conforms to RFC2279, however. + * + * \param s The sequence to process + * \param len Length of sequence + * \param ucs4 Pointer to location to receive UCS4 character (host endian) + * \param clen Pointer to location to receive byte length of UTF-8 sequence + * \param error Location to receive error code + */ +#define UTF8_TO_UCS4(s, len, ucs4, clen, error) \ +do { \ + uint32_t c, min; \ + uint8_t n; \ + \ + error = PARSERUTILS_OK; \ + \ + if (s == NULL || ucs4 == NULL || clen == NULL) { \ + error = PARSERUTILS_BADPARM; \ + break; \ + } \ + \ + if (len == 0) { \ + error = PARSERUTILS_NEEDDATA; \ + break; \ + } \ + \ + c = s[0]; \ + \ + if (c < 0x80) { \ + n = 1; \ + min = 0; \ + } else if ((c & 0xE0) == 0xC0) { \ + c &= 0x1F; \ + n = 2; \ + min = 0x80; \ + } else if ((c & 0xF0) == 0xE0) { \ + c &= 0x0F; \ + n = 3; \ + min = 0x800; \ + } else if ((c & 0xF8) == 0xF0) { \ + c &= 0x07; \ + n = 4; \ + min = 0x10000; \ + } else if ((c & 0xFC) == 0xF8) { \ + c &= 0x03; \ + n = 5; \ + min = 0x200000; \ + } else if ((c & 0xFE) == 0xFC) { \ + c &= 0x01; \ + n = 6; \ + min = 0x4000000; \ + } else { \ + error = PARSERUTILS_INVALID; \ + break; \ + } \ + \ + if (len < n) { \ + error = PARSERUTILS_NEEDDATA; \ + break; \ + } \ + \ + for (uint8_t i = 1; i < n; i++) { \ + uint32_t t = s[i]; \ + \ + if ((t & 0xC0) != 0x80) { \ + error = PARSERUTILS_INVALID; \ + break; \ + } \ + \ 
+ c <<= 6; \ + c |= t & 0x3F; \ + } \ + \ + if (error == PARSERUTILS_OK) { \ + /* Detect overlong sequences, surrogates and fffe/ffff */ \ + if (c < min || (c >= 0xD800 && c <= 0xDFFF) || \ + c == 0xFFFE || c == 0xFFFF) { \ + error = PARSERUTILS_INVALID; \ + break; \ + } \ + \ + *ucs4 = c; \ + *clen = n; \ + } \ +} while(0) + +/** + * Convert a single UCS4 character into a UTF-8 multibyte sequence + * + * Encoding of UCS values outside the UTF-16 plane has been removed from + * RFC3629. This macro conforms to RFC2279, however. + * + * \param ucs4 The character to process (0 <= c <= 0x7FFFFFFF) (host endian) + * \param s Pointer to pointer to output buffer, updated on exit + * \param len Pointer to length, in bytes, of output buffer, updated on exit + * \param error Location to receive error code + */ +#define UTF8_FROM_UCS4(ucs4, s, len, error) \ +do { \ + uint8_t *buf; \ + uint8_t l = 0; \ + \ + error = PARSERUTILS_OK; \ + \ + if (s == NULL || *s == NULL || len == NULL) { \ + error = PARSERUTILS_BADPARM; \ + break; \ + } \ + \ + if (ucs4 < 0x80) { \ + l = 1; \ + } else if (ucs4 < 0x800) { \ + l = 2; \ + } else if (ucs4 < 0x10000) { \ + l = 3; \ + } else if (ucs4 < 0x200000) { \ + l = 4; \ + } else if (ucs4 < 0x4000000) { \ + l = 5; \ + } else if (ucs4 <= 0x7FFFFFFF) { \ + l = 6; \ + } else { \ + error = PARSERUTILS_INVALID; \ + break; \ + } \ + \ + if (l > *len) { \ + error = PARSERUTILS_NOMEM; \ + break; \ + } \ + \ + buf = *s; \ + \ + if (l == 1) { \ + buf[0] = (uint8_t) ucs4; \ + } else { \ + for (uint8_t i = l; i > 1; i--) { \ + buf[i - 1] = 0x80 | (ucs4 & 0x3F); \ + ucs4 >>= 6; \ + } \ + buf[0] = ~((1 << (8 - l)) - 1) | ucs4; \ + } \ + \ + *s += l; \ + *len -= l; \ +} while(0) + +/** + * Calculate the length (in characters) of a bounded UTF-8 string + * + * \param s The string + * \param max Maximum length + * \param len Pointer to location to receive length of string + * \param error Location to receive error code + */ +#define UTF8_LENGTH(s, max, len, 
error) \ +do { \ + const uint8_t *end = s + max; \ + int l = 0; \ + \ + error = PARSERUTILS_OK; \ + \ + if (s == NULL || len == NULL) { \ + error = PARSERUTILS_BADPARM; \ + break; \ + } \ + \ + while (s < end) { \ + uint32_t c = s[0]; \ + \ + if ((c & 0x80) == 0x00) \ + s += 1; \ + else if ((c & 0xE0) == 0xC0) \ + s += 2; \ + else if ((c & 0xF0) == 0xE0) \ + s += 3; \ + else if ((c & 0xF8) == 0xF0) \ + s += 4; \ + else if ((c & 0xFC) == 0xF8) \ + s += 5; \ + else if ((c & 0xFE) == 0xFC) \ + s += 6; \ + else { \ + error = PARSERUTILS_INVALID; \ + break; \ + } \ + \ + l++; \ + } \ + \ + if (error == PARSERUTILS_OK) \ + *len = l; \ +} while(0) + +/** + * Calculate the length (in bytes) of a UTF-8 character + * + * \param s Pointer to start of character + * \param len Pointer to location to receive length + * \param error Location to receive error code + */ +#define UTF8_CHAR_BYTE_LENGTH(s, len, error) \ +do { \ + if (s == NULL || len == NULL) { \ + error = PARSERUTILS_BADPARM; \ + break; \ + } \ + \ + *len = numContinuations[s[0]] + 1 /* Start byte */; \ + \ + error = PARSERUTILS_OK; \ +} while(0) + +/** + * Find previous legal UTF-8 char in string + * + * \param s The string + * \param off Offset in the string to start at + * \param prevoff Pointer to location to receive offset of first byte of + * previous legal character + * \param error Location to receive error code + */ +#define UTF8_PREV(s, off, prevoff, error) \ +do { \ + if (s == NULL || prevoff == NULL) { \ + error = PARSERUTILS_BADPARM; \ + break; \ + } \ + \ + while (off != 0 && (s[--off] & 0xC0) == 0x80) \ + /* do nothing */; \ + \ + *prevoff = off; \ + \ + error = PARSERUTILS_OK; \ +} while(0) + +/** + * Find next legal UTF-8 char in string + * + * \param s The string (assumed valid) + * \param len Maximum offset in string + * \param off Offset in the string to start at + * \param nextoff Pointer to location to receive offset of first byte of + * next legal character + * \param error Location to receive 
error code + */ +#define UTF8_NEXT(s, len, off, nextoff, error) \ +do { \ + if (s == NULL || off >= len || nextoff == NULL) { \ + error = PARSERUTILS_BADPARM; \ + break; \ + } \ + \ + /* Skip current start byte (if present - may be mid-sequence) */\ + if (s[off] < 0x80 || (s[off] & 0xC0) == 0xC0) \ + off++; \ + \ + while (off < len && (s[off] & 0xC0) == 0x80) \ + off++; \ + \ + *nextoff = off; \ + \ + error = PARSERUTILS_OK; \ +} while(0) + +/** + * Skip to start of next sequence in UTF-8 input + * + * \param s The string (assumed to be of dubious validity) + * \param len Maximum offset in string + * \param off Offset in the string to start at + * \param nextoff Pointer to location to receive offset of first byte of + * next legal character + * \param error Location to receive error code + */ +#define UTF8_NEXT_PARANOID(s, len, off, nextoff, error) \ +do { \ + uint8_t c; \ + \ + error = PARSERUTILS_OK; \ + \ + if (s == NULL || off >= len || nextoff == NULL) { \ + error = PARSERUTILS_BADPARM; \ + break; \ + } \ + \ + c = s[off]; \ + \ + /* If we're mid-sequence, simply advance to next byte */ \ + if (!(c < 0x80 || (c & 0xC0) == 0xC0)) { \ + off++; \ + } else { \ + uint32_t nCont = numContinuations[c]; \ + uint32_t nToSkip; \ + \ + if (off + nCont + 1 >= len) { \ + error = PARSERUTILS_NEEDDATA; \ + break; \ + } \ + \ + /* Verify continuation bytes */ \ + for (nToSkip = 1; nToSkip <= nCont; nToSkip++) { \ + if ((s[off + nToSkip] & 0xC0) != 0x80) \ + break; \ + } \ + \ + /* Skip over the valid bytes */ \ + off += nToSkip; \ + } \ + \ + *nextoff = off; \ +} while(0) + +#endif diff --git a/src/input/Makefile b/src/input/Makefile new file mode 100644 index 0000000..d62740e --- /dev/null +++ b/src/input/Makefile @@ -0,0 +1,46 @@ +# Child makefile fragment +# +# Toolchain is provided by top-level makefile +# +# Variables provided by top-level makefile +# +# COMPONENT The name of the component +# EXPORT The location of the export directory +# TOP The location of the source 
tree root +# RELEASEDIR The place to put release objects +# DEBUGDIR The place to put debug objects +# +# do_include Canned command sequence to include a child makefile +# +# Variables provided by parent makefile: +# +# DIR The name of the directory we're in, relative to $(TOP) +# +# Variables we can manipulate: +# +# ITEMS_CLEAN The list of items to remove for "make clean" +# ITEMS_DISTCLEAN The list of items to remove for "make distclean" +# TARGET_TESTS The list of target names to run for "make test" +# +# SOURCES The list of sources to build for $(COMPONENT) +# +# Plus anything from the toolchain + +# Push parent directory onto the directory stack +sp := $(sp).x +dirstack_$(sp) := $(d) +d := $(DIR) + +# Sources +SRCS_$(d) := filter.c inputstream.c + +# Append to sources for component +SOURCES += $(addprefix $(d), $(SRCS_$(d))) + +# Now include any children we may have +MAKE_INCLUDES := $(wildcard $(d)*/Makefile) +$(eval $(foreach INC, $(MAKE_INCLUDES), $(call do_include,$(INC)))) + +# Finally, pop off the directory stack +d := $(dirstack_$(sp)) +sp := $(basename $(sp)) diff --git a/src/input/filter.c b/src/input/filter.c new file mode 100644 index 0000000..f40c98f --- /dev/null +++ b/src/input/filter.c @@ -0,0 +1,384 @@ +/* + * This file is part of LibParserUtils. 
+ * Licensed under the MIT License, + * http://www.opensource.org/licenses/mit-license.php + * Copyright 2007 John-Mark Bell + */ + +#include +#include +#include +#include + +#ifdef WITH_ICONV_FILTER +#include +#endif + +#include +#include + +#include "input/filter.h" +#include "utils/utils.h" + +/** Input filter */ +struct parserutils_filter { +#ifdef WITH_ICONV_FILTER + iconv_t cd; /**< Iconv conversion descriptor */ + uint16_t int_enc; /**< The internal encoding */ +#else + parserutils_charset_codec *read_codec; /**< Read codec */ + parserutils_charset_codec *write_codec; /**< Write codec */ + + uint32_t pivot_buf[64]; /**< Conversion pivot buffer */ + + bool leftover; /**< Data remains from last call */ + uint8_t *pivot_left; /**< Remaining pivot to write */ + size_t pivot_len; /**< Length of pivot remaining */ +#endif + + struct { + uint16_t encoding; /**< Input encoding */ + } settings; /**< Filter settings */ + + parserutils_alloc alloc; /**< Memory (de)allocation function */ + void *pw; /**< Client private data */ +}; + +static parserutils_error filter_set_defaults(parserutils_filter *input); +static parserutils_error filter_set_encoding(parserutils_filter *input, + const char *enc); + +/** + * Create an input filter + * + * \param int_enc Desired encoding of document + * \param alloc Function used to (de)allocate data + * \param pw Pointer to client-specific private data (may be NULL) + * \return Pointer to filter instance, or NULL on failure + */ +parserutils_filter *parserutils_filter_create(const char *int_enc, + parserutils_alloc alloc, void *pw) +{ + parserutils_filter *filter; + + if (int_enc == NULL || alloc == NULL) + return NULL; + + filter = alloc(NULL, sizeof(*filter), pw); + if (!filter) + return NULL; + +#ifdef WITH_ICONV_FILTER + filter->cd = (iconv_t) -1; + filter->int_enc = parserutils_charset_mibenum_from_name( + int_enc, strlen(int_enc)); + if (filter->int_enc == 0) { + alloc(filter, 0, pw); + return NULL; + } +#else + filter->leftover = 
false; + filter->pivot_left = NULL; + filter->pivot_len = 0; +#endif + + filter->alloc = alloc; + filter->pw = pw; + + if (filter_set_defaults(filter) != PARSERUTILS_OK) { + filter->alloc(filter, 0, pw); + return NULL; + } + +#ifndef WITH_ICONV_FILTER + filter->write_codec = + parserutils_charset_codec_create(int_enc, alloc, pw); + if (filter->write_codec == NULL) { + if (filter->read_codec != NULL) + parserutils_charset_codec_destroy(filter->read_codec); + filter->alloc(filter, 0, pw); + return NULL; + } +#endif + + return filter; +} + +/** + * Destroy an input filter + * + * \param input Pointer to filter instance + */ +void parserutils_filter_destroy(parserutils_filter *input) +{ + if (input == NULL) + return; + +#ifdef WITH_ICONV_FILTER + if (input->cd != (iconv_t) -1) + iconv_close(input->cd); +#else + if (input->read_codec != NULL) + parserutils_charset_codec_destroy(input->read_codec); + + if (input->write_codec != NULL) + parserutils_charset_codec_destroy(input->write_codec); +#endif + + input->alloc(input, 0, input->pw); + + return; +} + +/** + * Configure an input filter + * + * \param input Pointer to filter instance + * \param type Input option type to configure + * \param params Option-specific parameters + * \return PARSERUTILS_OK on success, appropriate error otherwise + */ +parserutils_error parserutils_filter_setopt(parserutils_filter *input, + parserutils_filter_opttype type, + parserutils_filter_optparams *params) +{ + parserutils_error error = PARSERUTILS_OK; + + if (input == NULL || params == NULL) + return PARSERUTILS_BADPARM; + + switch (type) { + case PARSERUTILS_FILTER_SET_ENCODING: + error = filter_set_encoding(input, params->encoding.name); + break; + } + + return error; +} + +/** + * Process a chunk of data + * + * \param input Pointer to filter instance + * \param data Pointer to pointer to input buffer + * \param len Pointer to length of input buffer + * \param output Pointer to pointer to output buffer + * \param outlen Pointer to 
length of output buffer + * \return PARSERUTILS_OK on success, appropriate error otherwise + * + * Call this with an input buffer length of 0 to flush any buffers. + */ +parserutils_error parserutils_filter_process_chunk(parserutils_filter *input, + const uint8_t **data, size_t *len, + uint8_t **output, size_t *outlen) +{ + if (input == NULL || data == NULL || *data == NULL || len == NULL || + output == NULL || *output == NULL || outlen == NULL) + return PARSERUTILS_BADPARM; + +#ifdef WITH_ICONV_FILTER + if (iconv(input->cd, (char **) data, len, + (char **) output, outlen) == (size_t) -1) { + switch (errno) { + case E2BIG: + return PARSERUTILS_NOMEM; + case EILSEQ: + if (*outlen < 3) + return PARSERUTILS_NOMEM; + + (*output)[0] = 0xef; + (*output)[1] = 0xbf; + (*output)[2] = 0xbd; + + *output += 3; + *outlen -= 3; + + (*data)++; + (*len)--; + + while (*len > 0) { + size_t ret; + + ret = iconv(input->cd, (char **) data, len, + (char **) output, outlen); + if (ret != (size_t) -1 || errno != EILSEQ) + break; + + (*data)++; + (*len)--; + } + + return errno == E2BIG ? PARSERUTILS_NOMEM + : PARSERUTILS_OK; + } + } + + return PARSERUTILS_OK; +#else + parserutils_error read_error, write_error; + + if (input->leftover) { + /* Some data left to be written from last call */ + + /* Attempt to flush the remaining data. 
*/ + write_error = parserutils_charset_codec_encode( + input->write_codec, + (const uint8_t **) &input->pivot_left, + &input->pivot_len, + output, outlen); + + if (write_error != PARSERUTILS_OK) + return write_error; + + + /* And clear leftover */ + input->pivot_left = NULL; + input->pivot_len = 0; + input->leftover = false; + } + + while (*len > 0) { + size_t pivot_len = sizeof(input->pivot_buf); + uint8_t *pivot = (uint8_t *) input->pivot_buf; + + read_error = parserutils_charset_codec_decode(input->read_codec, + data, len, + (uint8_t **) &pivot, &pivot_len); + + pivot = (uint8_t *) input->pivot_buf; + pivot_len = sizeof(input->pivot_buf) - pivot_len; + + if (pivot_len > 0) { + write_error = parserutils_charset_codec_encode( + input->write_codec, + (const uint8_t **) &pivot, + &pivot_len, + output, outlen); + + if (write_error != PARSERUTILS_OK) { + input->leftover = true; + input->pivot_left = pivot; + input->pivot_len = pivot_len; + + return write_error; + } + } + + if (read_error != PARSERUTILS_OK && + read_error != PARSERUTILS_NOMEM) + return read_error; + } + + return PARSERUTILS_OK; +#endif +} + +/** + * Reset an input filter's state + * + * \param input The input filter to reset + * \return PARSERUTILS_OK on success, PARSERUTILS_BADPARM if input is NULL, + * appropriate error otherwise + */ +parserutils_error parserutils_filter_reset(parserutils_filter *input) +{ + if (input == NULL) + return PARSERUTILS_BADPARM; + +#ifdef WITH_ICONV_FILTER + iconv(input->cd, NULL, 0, NULL, 0); +#else + parserutils_error error; + + /* Clear pivot buffer leftovers */ + input->pivot_left = NULL; + input->pivot_len = 0; + input->leftover = false; + + /* Reset read codec */ + error = parserutils_charset_codec_reset(input->read_codec); + if (error != PARSERUTILS_OK) + return error; + + /* Reset write codec */ + error = parserutils_charset_codec_reset(input->write_codec); + if (error != PARSERUTILS_OK) + return error; +#endif + + return PARSERUTILS_OK; +} + +/** + * Set an input filter's default settings + * + * \param 
input Input filter to configure + * \return PARSERUTILS_OK on success, appropriate error otherwise + */ +parserutils_error filter_set_defaults(parserutils_filter *input) +{ + parserutils_error error; + + if (input == NULL) + return PARSERUTILS_BADPARM; + +#ifndef WITH_ICONV_FILTER + input->read_codec = NULL; + input->write_codec = NULL; +#endif + + input->settings.encoding = 0; + error = filter_set_encoding(input, "UTF-8"); + if (error != PARSERUTILS_OK) + return error; + + return PARSERUTILS_OK; +} + +/** + * Set an input filter's encoding + * + * \param input Input filter to configure + * \param enc Encoding name + * \return PARSERUTILS_OK on success, + * PARSERUTILS_BADPARM if a parameter is NULL, + * PARSERUTILS_INVALID if the encoding is unknown or unsupported, + * PARSERUTILS_NOMEM if the converter could not be created + * + * On failure the filter's existing converter and encoding are left untouched. + */ +parserutils_error filter_set_encoding(parserutils_filter *input, + const char *enc) +{ +#ifdef WITH_ICONV_FILTER + iconv_t new_cd; +#else + parserutils_charset_codec *new_codec; +#endif + uint16_t mibenum; + + if (input == NULL || enc == NULL) + return PARSERUTILS_BADPARM; + + mibenum = parserutils_charset_mibenum_from_name(enc, strlen(enc)); + if (mibenum == 0) + return PARSERUTILS_INVALID; + + /* Exit early if we're already using this encoding */ + if (input->settings.encoding == mibenum) + return PARSERUTILS_OK; + +#ifdef WITH_ICONV_FILTER + /* Open the new descriptor before closing the old one, so that a + * failure cannot leave the filter holding an invalid descriptor */ + new_cd = iconv_open( + parserutils_charset_mibenum_to_name(input->int_enc), enc); + if (new_cd == (iconv_t) -1) + return (errno == EINVAL) ? PARSERUTILS_INVALID + : PARSERUTILS_NOMEM; + + if (input->cd != (iconv_t) -1) + iconv_close(input->cd); + + input->cd = new_cd; +#else + /* Create the new codec before destroying the old one, so that a + * failure cannot leave the filter without a read codec */ + new_codec = parserutils_charset_codec_create(enc, input->alloc, + input->pw); + if (new_codec == NULL) + return PARSERUTILS_NOMEM; + + if (input->read_codec != NULL) + parserutils_charset_codec_destroy(input->read_codec); + + input->read_codec = new_codec; +#endif + + input->settings.encoding = mibenum; + + return PARSERUTILS_OK; +} diff --git a/src/input/filter.h b/src/input/filter.h new file mode 100644 index 0000000..96941a6 --- /dev/null +++ b/src/input/filter.h @@ -0,0 +1,57 @@ +/* + * This file is part of LibParserUtils. 
+ * Licensed under the MIT License, + * http://www.opensource.org/licenses/mit-license.php + * Copyright 2007 John-Mark Bell + */ + +#ifndef parserutils_input_filter_h_ +#define parserutils_input_filter_h_ + +#include + +#include +#include + +typedef struct parserutils_filter parserutils_filter; + +/** + * Input filter option types + */ +typedef enum parserutils_filter_opttype { + PARSERUTILS_FILTER_SET_ENCODING = 0, +} parserutils_filter_opttype; + +/** + * Input filter option parameters + */ +typedef union parserutils_filter_optparams { + /** Parameters for encoding setting */ + struct { + /** Encoding name */ + const char *name; + } encoding; +} parserutils_filter_optparams; + + +/* Create an input filter */ +parserutils_filter *parserutils_filter_create(const char *int_enc, + parserutils_alloc alloc, void *pw); +/* Destroy an input filter */ +void parserutils_filter_destroy(parserutils_filter *input); + +/* Configure an input filter */ +parserutils_error parserutils_filter_setopt(parserutils_filter *input, + parserutils_filter_opttype type, + parserutils_filter_optparams *params); + +/* Process a chunk of data */ +parserutils_error parserutils_filter_process_chunk(parserutils_filter *input, + const uint8_t **data, size_t *len, + uint8_t **output, size_t *outlen); + +/* Reset an input filter's state */ +parserutils_error parserutils_filter_reset(parserutils_filter *input); + +#endif + diff --git a/src/input/inputstream.c b/src/input/inputstream.c new file mode 100644 index 0000000..fd44995 --- /dev/null +++ b/src/input/inputstream.c @@ -0,0 +1,477 @@ +/* + * This file is part of LibParserUtils. 
+ * Licensed under the MIT License, + * http://www.opensource.org/licenses/mit-license.php + * Copyright 2007 John-Mark Bell + */ + +#include +#include +#include + +#include +#include +#include + +#include "input/filter.h" +#include "utils/utils.h" + +/** + * Private input stream definition + */ +typedef struct parserutils_inputstream_private { + parserutils_inputstream public; /**< Public part. Must be first */ + + parserutils_buffer *raw; /**< Buffer containing raw data */ + + bool done_first_chunk; /**< Whether the first chunk has + * been processed */ + + uint16_t mibenum; /**< MIB enum for charset, or 0 */ + uint32_t encsrc; /**< Charset source */ + + parserutils_filter *input; /**< Charset conversion filter */ + + parserutils_charset_detect_func csdetect; /**< Charset detection func.*/ + + parserutils_alloc alloc; /**< Memory (de)allocation function */ + void *pw; /**< Client private data */ +} parserutils_inputstream_private; + +static inline parserutils_error parserutils_inputstream_refill_buffer( + parserutils_inputstream_private *stream); +static inline parserutils_error parserutils_inputstream_strip_bom( + uint16_t mibenum, parserutils_buffer *buffer); + +/** + * Create an input stream + * + * \param enc Document charset, or NULL to autodetect + * \param encsrc Value for encoding source, if specified, or 0 + * \param csdetect Charset detection function, or NULL + * \param alloc Memory (de)allocation function + * \param pw Pointer to client-specific private data (may be NULL) + * \return Pointer to stream instance, or NULL on failure + * + * The value 0 is defined as being the lowest priority encoding source + * (i.e. the default fallback encoding). Beyond this, no further + * interpretation is made upon the encoding source. 
+ */ +parserutils_inputstream *parserutils_inputstream_create(const char *enc, + uint32_t encsrc, parserutils_charset_detect_func csdetect, + parserutils_alloc alloc, void *pw) +{ + parserutils_inputstream_private *stream; + + if (alloc == NULL) + return NULL; + + stream = alloc(NULL, sizeof(parserutils_inputstream_private), pw); + if (stream == NULL) + return NULL; + + stream->raw = parserutils_buffer_create(alloc, pw); + if (stream->raw == NULL) { + alloc(stream, 0, pw); + return NULL; + } + + stream->public.utf8 = parserutils_buffer_create(alloc, pw); + if (stream->public.utf8 == NULL) { + parserutils_buffer_destroy(stream->raw); + alloc(stream, 0, pw); + return NULL; + } + + stream->public.cursor = 0; + stream->public.had_eof = false; + stream->done_first_chunk = false; + + stream->input = parserutils_filter_create("UTF-8", alloc, pw); + if (stream->input == NULL) { + parserutils_buffer_destroy(stream->public.utf8); + parserutils_buffer_destroy(stream->raw); + alloc(stream, 0, pw); + return NULL; + } + + if (enc != NULL) { + parserutils_error error; + parserutils_filter_optparams params; + + stream->mibenum = + parserutils_charset_mibenum_from_name(enc, strlen(enc)); + + if (stream->mibenum != 0) { + params.encoding.name = enc; + + error = parserutils_filter_setopt(stream->input, + PARSERUTILS_FILTER_SET_ENCODING, + &params); + /* PARSERUTILS_INVALID is tolerated: the filter then + * stays on the encoding it was created with */ + if (error != PARSERUTILS_OK && + error != PARSERUTILS_INVALID) { + parserutils_filter_destroy(stream->input); + parserutils_buffer_destroy(stream->public.utf8); + parserutils_buffer_destroy(stream->raw); + alloc(stream, 0, pw); + return NULL; + } + + stream->encsrc = encsrc; + } else { + /* Unrecognised charset name: use the lowest priority + * source so encsrc is never read uninitialised */ + stream->encsrc = 0; + } + } else { + stream->mibenum = 0; + stream->encsrc = 0; + } + + stream->csdetect = csdetect; + + stream->alloc = alloc; + stream->pw = pw; + + return (parserutils_inputstream *) stream; +} + +/** + * Destroy an input stream + * + * \param stream Input stream to destroy + */ +void parserutils_inputstream_destroy(parserutils_inputstream *stream) +{ + 
parserutils_inputstream_private *s = + (parserutils_inputstream_private *) stream; + + if (stream == NULL) + return; + + parserutils_filter_destroy(s->input); + parserutils_buffer_destroy(s->public.utf8); + parserutils_buffer_destroy(s->raw); + s->alloc(s, 0, s->pw); +} + +/** + * Append data to an input stream + * + * \param stream Input stream to append data to + * \param data Data to append (in document charset), or NULL to flag EOF + * \param len Length, in bytes, of data + * \return PARSERUTILS_OK on success, appropriate error otherwise + */ +parserutils_error parserutils_inputstream_append( + parserutils_inputstream *stream, + const uint8_t *data, size_t len) +{ + parserutils_inputstream_private *s = + (parserutils_inputstream_private *) stream; + + if (stream == NULL) + return PARSERUTILS_BADPARM; + + if (data == NULL) { + s->public.had_eof = true; + return PARSERUTILS_OK; + } + + return parserutils_buffer_append(s->raw, data, len); +} + +/** + * Insert data into stream at current location + * + * \param stream Input stream to insert into + * \param data Data to insert (UTF-8 encoded) + * \param len Length, in bytes, of data + * \return PARSERUTILS_OK on success, appropriate error otherwise + */ +parserutils_error parserutils_inputstream_insert( + parserutils_inputstream *stream, + const uint8_t *data, size_t len) +{ + parserutils_inputstream_private *s = + (parserutils_inputstream_private *) stream; + + if (stream == NULL || data == NULL) + return PARSERUTILS_BADPARM; + + return parserutils_buffer_insert(s->public.utf8, s->public.cursor, + data, len); +} + +#define IS_ASCII(x) (((x) & 0x80) == 0) + +/* Look at the character in the stream that starts at + * offset bytes from the cursor (slow version) + * + * \param stream Stream to look in + * \param offset Byte offset of start of character + * \param length Pointer to location to receive character length (in bytes) + * \return Pointer to character data, or EOF or OOD. 
+ * + * Once the character pointed to by the result of this call has been advanced + * past (i.e. parserutils_inputstream_advance has caused the stream cursor to + * pass over the character), then no guarantee is made as to the validity of + * the data pointed to. Thus, any attempt to dereference the pointer after + * advancing past the data it points to is a bug. + */ +uintptr_t parserutils_inputstream_peek_slow(parserutils_inputstream *stream, + size_t offset, size_t *length) +{ + parserutils_inputstream_private *s = + (parserutils_inputstream_private *) stream; + parserutils_error error = PARSERUTILS_OK; + size_t len; + + /* length is written unconditionally on success, so it must be valid */ + if (stream == NULL || length == NULL) + return PARSERUTILS_INPUTSTREAM_OOD; + + /* There's insufficient data in the buffer, so read some more */ + if (s->raw->length == 0) { + /* No more data to be had */ + return s->public.had_eof ? PARSERUTILS_INPUTSTREAM_EOF + : PARSERUTILS_INPUTSTREAM_OOD; + } + + /* Refill utf8 buffer from raw buffer */ + error = parserutils_inputstream_refill_buffer(s); + if (error != PARSERUTILS_OK) + return PARSERUTILS_INPUTSTREAM_OOD; + + /* The refill may not have produced enough data to reach the + * requested offset; never read beyond the end of the buffer */ + if (s->public.cursor + offset >= s->public.utf8->length) { + return s->public.had_eof ? PARSERUTILS_INPUTSTREAM_EOF + : PARSERUTILS_INPUTSTREAM_OOD; + } + + /* Now try the read */ + if (IS_ASCII(s->public.utf8->data[s->public.cursor + offset])) { + len = 1; + } else { + error = parserutils_charset_utf8_char_byte_length( + s->public.utf8->data + s->public.cursor + offset, + &len); + + if (error != PARSERUTILS_OK && error != PARSERUTILS_NEEDDATA) + return PARSERUTILS_INPUTSTREAM_OOD; + + if (error == PARSERUTILS_NEEDDATA) { + return s->public.had_eof ? 
PARSERUTILS_INPUTSTREAM_EOF + : PARSERUTILS_INPUTSTREAM_OOD; + } + } + + *length = len; + + return (uintptr_t) (s->public.utf8->data + s->public.cursor + offset); +} + +#undef IS_ASCII + +/** + * Read the source charset of the input stream + * + * \param stream Input stream to query + * \param source Pointer to location to receive charset source identifier + * \return Pointer to charset name (constant; do not free) + */ +const char *parserutils_inputstream_read_charset( + parserutils_inputstream *stream, uint32_t *source) +{ + parserutils_inputstream_private *s = + (parserutils_inputstream_private *) stream; + + if (stream == NULL || source == NULL) + return NULL; + + *source = s->encsrc; + + if (s->encsrc == 0) + return "UTF-8"; + + return parserutils_charset_mibenum_to_name(s->mibenum); +} + +/****************************************************************************** + ******************************************************************************/ + +/** + * Refill the UTF-8 buffer from the raw buffer + * + * \param stream The inputstream to operate on + * \return PARSERUTILS_OK on success + */ +parserutils_error parserutils_inputstream_refill_buffer( + parserutils_inputstream_private *stream) +{ + const uint8_t *raw; + uint8_t *utf8; + size_t raw_length, utf8_space; + parserutils_error error; + + /* If this is the first chunk of data, we must detect the charset and + * strip the BOM, if one exists */ + if (!stream->done_first_chunk) { + if (stream->csdetect != NULL) { + error = stream->csdetect(stream->raw->data, + stream->raw->length, + &stream->mibenum, &stream->encsrc); + if (error != PARSERUTILS_OK) + return error; + } else { + /* Default to UTF-8 */ + stream->mibenum = + parserutils_charset_mibenum_from_name("UTF-8", + SLEN("UTF-8")); + stream->encsrc = 0; + } + + if (stream->mibenum == 0) + abort(); + + error = parserutils_inputstream_strip_bom(stream->mibenum, + stream->raw); + if (error != PARSERUTILS_OK) + return error; + + stream->done_first_chunk 
= true; + } + + /* Work out how to perform the buffer fill */ + if (stream->public.cursor == stream->public.utf8->length) { + /* Cursor's at the end, so simply reuse the entire buffer */ + utf8 = stream->public.utf8->data; + utf8_space = stream->public.utf8->allocated; + } else { + /* Cursor's not at the end, so shift data after cursor to the + * bottom of the buffer. If the buffer's still over half full, + * extend it. */ + memmove(stream->public.utf8->data, + stream->public.utf8->data + stream->public.cursor, + stream->public.utf8->length - stream->public.cursor); + + stream->public.utf8->length -= stream->public.cursor; + + if (stream->public.utf8->length > + stream->public.utf8->allocated / 2) { + error = parserutils_buffer_grow(stream->public.utf8); + if (error != PARSERUTILS_OK) + return error; + } + + utf8 = stream->public.utf8->data + stream->public.utf8->length; + utf8_space = stream->public.utf8->allocated - + stream->public.utf8->length; + } + + raw = stream->raw->data; + raw_length = stream->raw->length; + + /* Try to fill utf8 buffer from the raw data */ + error = parserutils_filter_process_chunk(stream->input, + &raw, &raw_length, &utf8, &utf8_space); + /* _NOMEM implies that there's more input to read than available space + * in the utf8 buffer. That's fine, so we'll ignore that error. 
*/ + if (error != PARSERUTILS_OK && error != PARSERUTILS_NOMEM) + return error; + + /* Remove the raw data we've processed from the raw buffer */ + error = parserutils_buffer_discard(stream->raw, 0, + stream->raw->length - raw_length); + if (error != PARSERUTILS_OK) + return error; + + /* Fix up the utf8 buffer information */ + stream->public.utf8->length = + stream->public.utf8->allocated - utf8_space; + + /* Finally, fix up the cursor */ + stream->public.cursor = 0; + + return PARSERUTILS_OK; +} + +/** + * Strip a BOM from a buffer in the given encoding + * + * \param mibenum The character set of the buffer + * \param buffer The buffer to process + */ +parserutils_error parserutils_inputstream_strip_bom(uint16_t mibenum, + parserutils_buffer *buffer) +{ + static uint16_t utf8; + static uint16_t utf16; + static uint16_t utf16be; + static uint16_t utf16le; + static uint16_t utf32; + static uint16_t utf32be; + static uint16_t utf32le; + + if (utf8 == 0) { + utf8 = parserutils_charset_mibenum_from_name("UTF-8", + SLEN("UTF-8")); + utf16 = parserutils_charset_mibenum_from_name("UTF-16", + SLEN("UTF-16")); + utf16be = parserutils_charset_mibenum_from_name("UTF-16BE", + SLEN("UTF-16BE")); + utf16le = parserutils_charset_mibenum_from_name("UTF-16LE", + SLEN("UTF-16LE")); + utf32 = parserutils_charset_mibenum_from_name("UTF-32", + SLEN("UTF-32")); + utf32be = parserutils_charset_mibenum_from_name("UTF-32BE", + SLEN("UTF-32BE")); + utf32le = parserutils_charset_mibenum_from_name("UTF-32LE", + SLEN("UTF-32LE")); + } + + /** \todo Handle unmarked UTF-16 and UTF-32. Endianness is specified + * by the BOM, if present, or is assumed to be big endian. 
*/ + +#define UTF32_BOM_LEN (4) +#define UTF16_BOM_LEN (2) +#define UTF8_BOM_LEN (3) + + if (mibenum == utf8) { + if (buffer->length >= UTF8_BOM_LEN && + buffer->data[0] == 0xEF && + buffer->data[1] == 0xBB && + buffer->data[2] == 0xBF) { + return parserutils_buffer_discard( + buffer, 0, UTF8_BOM_LEN); + } + } else if (mibenum == utf16be) { + if (buffer->length >= UTF16_BOM_LEN && + buffer->data[0] == 0xFE && + buffer->data[1] == 0xFF) { + return parserutils_buffer_discard( + buffer, 0, UTF16_BOM_LEN); + } + } else if (mibenum == utf16le) { + if (buffer->length >= UTF16_BOM_LEN && + buffer->data[0] == 0xFF && + buffer->data[1] == 0xFE) { + return parserutils_buffer_discard( + buffer, 0, UTF16_BOM_LEN); + } + } else if (mibenum == utf32be) { + if (buffer->length >= UTF32_BOM_LEN && + buffer->data[0] == 0x00 && + buffer->data[1] == 0x00 && + buffer->data[2] == 0xFE && + buffer->data[3] == 0xFF) { + return parserutils_buffer_discard( + buffer, 0, UTF32_BOM_LEN); + } + } else if (mibenum == utf32le) { + if (buffer->length >= UTF32_BOM_LEN && + buffer->data[0] == 0xFF && + buffer->data[1] == 0xFE && + buffer->data[2] == 0x00 && + buffer->data[3] == 0x00) { + return parserutils_buffer_discard( + buffer, 0, UTF32_BOM_LEN); + } + } + +#undef UTF8_BOM_LEN +#undef UTF16_BOM_LEN +#undef UTF32_BOM_LEN + + return PARSERUTILS_OK; +} + diff --git a/src/parserutils.c b/src/parserutils.c new file mode 100644 index 0000000..ed9b21f --- /dev/null +++ b/src/parserutils.c @@ -0,0 +1,54 @@ +/* + * This file is part of LibParserUtils. + * Licensed under the MIT License, + * http://www.opensource.org/licenses/mit-license.php + * Copyright 2007 John-Mark Bell + */ + +#include + +#include "charset/charset.h" + +/** + * Initialise the ParserUtils library for use. 
+ * + * This _must_ be called before using any libparserutils functions + * + * \param aliases_file Pointer to name of file containing encoding alias data + * \param alloc Pointer to (de)allocation function + * \param pw Pointer to client-specific private data (may be NULL) + * \return PARSERUTILS_OK on success, applicable error otherwise. + */ +parserutils_error parserutils_initialise(const char *aliases_file, + parserutils_alloc alloc, void *pw) +{ + parserutils_error error; + + if (aliases_file == NULL || alloc == NULL) + return PARSERUTILS_BADPARM; + + error = parserutils_charset_initialise(aliases_file, alloc, pw); + if (error != PARSERUTILS_OK) + return error; + + return PARSERUTILS_OK; +} + +/** + * Clean up after Libparserutils + * + * \param alloc Pointer to (de)allocation function + * \param pw Pointer to client-specific private data (may be NULL) + * \return PARSERUTILS_OK on success, applicable error otherwise. + */ +parserutils_error parserutils_finalise(parserutils_alloc alloc, void *pw) +{ + if (alloc == NULL) + return PARSERUTILS_BADPARM; + + parserutils_charset_finalise(alloc, pw); + + return PARSERUTILS_OK; +} + + diff --git a/src/utils/Makefile b/src/utils/Makefile new file mode 100644 index 0000000..e053673 --- /dev/null +++ b/src/utils/Makefile @@ -0,0 +1,49 @@ +# Child makefile fragment +# +# Toolchain is provided by top-level makefile +# +# Variables provided by top-level makefile +# +# COMPONENT The name of the component +# EXPORT The location of the export directory +# TOP The location of the source tree root +# RELEASEDIR The place to put release objects +# DEBUGDIR The place to put debug objects +# +# do_include Canned command sequence to include a child makefile +# +# Variables provided by parent makefile: +# +# DIR The name of the directory we're in, relative to $(TOP) +# +# Variables we can manipulate: +# +# ITEMS_CLEAN The list of items to remove for "make clean" +# ITEMS_DISTCLEAN The list of items to remove for "make distclean" +# 
TARGET_TESTS The list of target names to run for "make test" +# +# SOURCES The list of sources to build for $(COMPONENT) +# +# Plus anything from the toolchain + +# Push parent directory onto the directory stack +sp := $(sp).x +dirstack_$(sp) := $(d) +d := $(DIR) + +# Manipulate include paths +override CFLAGS := $(CFLAGS) -I$(d) + +# Sources +SRCS_$(d) := buffer.c errors.c + +# Append to sources for component +SOURCES += $(addprefix $(d), $(SRCS_$(d))) + +# Now include any children we may have +MAKE_INCLUDES := $(wildcard $(d)*/Makefile) +$(eval $(foreach INC, $(MAKE_INCLUDES), $(call do_include,$(INC)))) + +# Finally, pop off the directory stack +d := $(dirstack_$(sp)) +sp := $(basename $(sp)) diff --git a/src/utils/buffer.c b/src/utils/buffer.c new file mode 100644 index 0000000..21c47fc --- /dev/null +++ b/src/utils/buffer.c @@ -0,0 +1,156 @@ +/* + * This file is part of LibParserUtils. + * Licensed under the MIT License, + * http://www.opensource.org/licenses/mit-license.php + * Copyright 2008 John-Mark Bell + */ + +#include + +#include + +#define DEFAULT_SIZE (4096) + +/** + * Create a memory buffer + * + * \param alloc Memory (de)allocation function + * \param pw Pointer to client-specific private data + * \return Pointer to memory buffer, or NULL on memory exhaustion + */ +parserutils_buffer *parserutils_buffer_create(parserutils_alloc alloc, void *pw) +{ + parserutils_buffer *buffer = + alloc(NULL, sizeof(parserutils_buffer), pw); + + if (buffer == NULL) + return NULL; + + buffer->data = alloc(NULL, DEFAULT_SIZE, pw); + if (buffer->data == NULL) { + alloc(buffer, 0, pw); + return NULL; + } + + buffer->length = 0; + buffer->allocated = DEFAULT_SIZE; + + buffer->alloc = alloc; + buffer->pw = pw; + + return buffer; +} + +/** + * Destroy a memory buffer + * + * \param buffer The buffer to destroy + */ +void parserutils_buffer_destroy(parserutils_buffer *buffer) +{ + if (buffer == NULL) + return; + + buffer->alloc(buffer->data, 0, buffer->pw); + 
buffer->alloc(buffer, 0, buffer->pw); +} + +/** + * Append data to a memory buffer + * + * \param buffer The buffer to append to + * \param data The data to append + * \param len The length, in bytes, of the data to append + * \return PARSERUTILS_OK on success, appropriate error otherwise. + */ +parserutils_error parserutils_buffer_append(parserutils_buffer *buffer, + const uint8_t *data, size_t len) +{ + while (len >= buffer->allocated - buffer->length) { + parserutils_error error = parserutils_buffer_grow(buffer); + if (error != PARSERUTILS_OK) + return error; + } + + memcpy(buffer->data + buffer->length, data, len); + + buffer->length += len; + + return PARSERUTILS_OK; +} + +/** + * Insert data into a memory buffer + * + * \param buffer The buffer to insert into + * \param offset The offset into the buffer to insert at + * \param data The data to insert + * \param len The length, in bytes, of the data to insert + * \return PARSERUTILS_OK on success, PARSERUTILS_BADPARM if offset lies + * beyond the end of the buffer, appropriate error otherwise + */ +parserutils_error parserutils_buffer_insert(parserutils_buffer *buffer, + size_t offset, const uint8_t *data, size_t len) +{ + if (offset > buffer->length) + return PARSERUTILS_BADPARM; + + if (offset == buffer->length) + return parserutils_buffer_append(buffer, data, len); + + while (len >= buffer->allocated - buffer->length) { + parserutils_error error = parserutils_buffer_grow(buffer); + if (error != PARSERUTILS_OK) + return error; + } + + /* Open a gap of len bytes at offset: the existing tail must land + * immediately after the inserted data, i.e. at offset + len */ + memmove(buffer->data + offset + len, + buffer->data + offset, buffer->length - offset); + + memcpy(buffer->data + offset, data, len); + + buffer->length += len; + + return PARSERUTILS_OK; +} + +/** + * Discard a section of a memory buffer + * + * \param buffer The buffer to discard data from + * \param offset The offset into the buffer of the start of the section + * \param len The number of bytes to discard + * \return PARSERUTILS_OK on success, appropriate error otherwise. 
+ */ +parserutils_error parserutils_buffer_discard(parserutils_buffer *buffer, + size_t offset, size_t len) +{ + /* Written as a subtraction so that offset + len cannot wrap around */ + if (offset >= buffer->length || len > buffer->length - offset) + return PARSERUTILS_BADPARM; + + /* Close the gap: only the bytes that follow the discarded section + * are moved, so nothing beyond the end of the buffer is read */ + memmove(buffer->data + offset, buffer->data + offset + len, + buffer->length - (offset + len)); + + buffer->length -= len; + + return PARSERUTILS_OK; +} + +/** + * Extend the amount of space allocated for a memory buffer + * + * \param buffer The buffer to extend + * \return PARSERUTILS_OK on success, appropriate error otherwise. + */ +parserutils_error parserutils_buffer_grow(parserutils_buffer *buffer) +{ + uint8_t *temp = buffer->alloc(buffer->data, + buffer->allocated * 2, buffer->pw); + if (temp == NULL) + return PARSERUTILS_NOMEM; + + buffer->data = temp; + buffer->allocated *= 2; + + return PARSERUTILS_OK; +} + diff --git a/src/utils/errors.c b/src/utils/errors.c new file mode 100644 index 0000000..353cda1 --- /dev/null +++ b/src/utils/errors.c @@ -0,0 +1,70 @@ +/* + * This file is part of LibParserUtils. + * Licensed under the MIT License, + * http://www.opensource.org/licenses/mit-license.php + * Copyright 2007 John-Mark Bell + */ + +#include + +#include + +/** + * Convert a parserutils error code to a string + * + * \param error The error code to convert + * \return Pointer to string representation of error, or NULL if unknown. 
+ */ +const char *parserutils_error_to_string(parserutils_error error) +{ + const char *result = NULL; + + switch (error) { + case PARSERUTILS_OK: + result = "No error"; + break; + case PARSERUTILS_NOMEM: + result = "Insufficient memory"; + break; + case PARSERUTILS_BADPARM: + result = "Bad parameter"; + break; + case PARSERUTILS_INVALID: + result = "Invalid input"; + break; + case PARSERUTILS_FILENOTFOUND: + result = "File not found"; + break; + case PARSERUTILS_NEEDDATA: + result = "Insufficient data"; + break; + } + + return result; +} + +/** + * Convert a string representation of an error name to a parserutils error code + * + * \param str String containing error name + * \param len Length of string (bytes) + * \return Error code, or PARSERUTILS_OK if unknown + */ +parserutils_error parserutils_error_from_string(const char *str, size_t len) +{ + if (strncmp(str, "PARSERUTILS_OK", len) == 0) { + return PARSERUTILS_OK; + } else if (strncmp(str, "PARSERUTILS_NOMEM", len) == 0) { + return PARSERUTILS_NOMEM; + } else if (strncmp(str, "PARSERUTILS_BADPARM", len) == 0) { + return PARSERUTILS_BADPARM; + } else if (strncmp(str, "PARSERUTILS_INVALID", len) == 0) { + return PARSERUTILS_INVALID; + } else if (strncmp(str, "PARSERUTILS_FILENOTFOUND", len) == 0) { + return PARSERUTILS_FILENOTFOUND; + } else if (strncmp(str, "PARSERUTILS_NEEDDATA", len) == 0) { + return PARSERUTILS_NEEDDATA; + } + + return PARSERUTILS_OK; +} diff --git a/src/utils/utils.h b/src/utils/utils.h new file mode 100644 index 0000000..5162945 --- /dev/null +++ b/src/utils/utils.h @@ -0,0 +1,28 @@ +/* + * This file is part of LibParserUtils. 
+ * Licensed under the MIT License, + * http://www.opensource.org/licenses/mit-license.php + * Copyright 2007 John-Mark Bell + */ + +#ifndef parserutils_utils_h_ +#define parserutils_utils_h_ + +#ifndef max +#define max(a,b) ((a)>(b)?(a):(b)) +#endif + +#ifndef min +#define min(a,b) ((a)<(b)?(a):(b)) +#endif + +#ifndef SLEN +/* Calculate length of a string constant */ +#define SLEN(s) (sizeof((s)) - 1) /* -1 for '\0' */ +#endif + +#ifndef UNUSED +#define UNUSED(x) ((x)=(x)) +#endif + +#endif diff --git a/test/INDEX b/test/INDEX new file mode 100644 index 0000000..772c82f --- /dev/null +++ b/test/INDEX @@ -0,0 +1,15 @@ +# Index for testcases +# +# Test Description DataDir + +charset Charset initialisation/finalisation +parserutils Library initialisation/finalisation +aliases Encoding alias handling +cscodec Charset codec implementation cscodec +filter Input stream filtering +inputstream Inputstream handling input + +# Regression tests +regression/cscodec-segv Segfault in charset codecs +regression/filter-segv Segfault in input filtering +regression/stream-nomem Inputstream buffer expansion diff --git a/test/Makefile b/test/Makefile new file mode 100644 index 0000000..2ed0b44 --- /dev/null +++ b/test/Makefile @@ -0,0 +1,80 @@ +# Child makefile fragment +# +# Toolchain is provided by top-level makefile +# +# Variables provided by top-level makefile +# +# COMPONENT The name of the component +# EXPORT The location of the export directory +# TOP The location of the source tree root +# RELEASEDIR The place to put release objects +# DEBUGDIR The place to put debug objects +# +# do_include Canned command sequence to include a child makefile +# +# Variables provided by parent makefile: +# +# DIR The name of the directory we're in, relative to $(TOP) +# +# Variables we can manipulate: +# +# ITEMS_CLEAN The list of items to remove for "make clean" +# ITEMS_DISTCLEAN The list of items to remove for "make distclean" +# TARGET_TESTS The list of target names to run for "make test" 
+# +# SOURCES The list of sources to build for $(COMPONENT) +# +# Plus anything from the toolchain + +# Push parent directory onto the directory stack +sp := $(sp).x +dirstack_$(sp) := $(d) +d := $(DIR) + +# Extend toolchain settings +override CFLAGS := $(CFLAGS) -I$(TOP)/src/ -I$(d) + +# Tests +TESTS_$(d) := aliases cscodec charset filter inputstream parserutils +TESTS_$(d) := $(TESTS_$(d)) regression/cscodec-segv regression/filter-segv \ + regression/stream-nomem + +# Items for top-level makefile to use +ITEMS_CLEAN := $(ITEMS_CLEAN) \ + $(addprefix $(d), $(addsuffix $(EXEEXT), $(TESTS_$(d)))) \ + $(addprefix $(d), $(addsuffix .gcda, $(TESTS_$(d)))) \ + $(addprefix $(d), $(addsuffix .gcno, $(TESTS_$(d)))) +ITEMS_DISTCLEAN := $(ITEMS_DISTCLEAN) $(d)log + +# Targets for top-level makefile to run +TARGET_TESTS := $(TARGET_TESTS) test_$(d) + +# Now we get to hack around so that we know what directory we're in. +# $(d) no longer exists when running the commands for a target, so we can't +# simply use it verbatim. Assigning to a variable doesn't really help, as +# there's no guarantee that someone else hasn't overridden that variable. +# So, what we do is make the target depend on $(d), then pick it out of the +# dependency list when running commands. This isn't pretty, but is effective. 
+test_$(d): $(d) $(addprefix $(d), $(TESTS_$(d))) + @$(PERL) $(TOP)/$ $(1)" + @$$(CC) -c -g $$(DEBUGCFLAGS) -o $$@.o $(1) + @$$(LD) -g -o $$@ $$@.o $$(LDFLAGS) -lparserutils-debug + @$$(RM) $$(RMFLAGS) $$@.o + +endef + +$(eval $(foreach TEST,$(addprefix $(d), $(TESTS_$(d))), \ + $(call compile_test,$(addsuffix .c, $(TEST)),$(TEST)))) + +# Now include any children we may have +MAKE_INCLUDES := $(wildcard $(d)*/Makefile) +$(eval $(foreach INC, $(MAKE_INCLUDES), $(call do_include,$(INC)))) + +# Finally, pop off the directory stack +d := $(dirstack_$(sp)) +sp := $(basename $(sp)) diff --git a/test/README b/test/README new file mode 100644 index 0000000..7e41abf --- /dev/null +++ b/test/README @@ -0,0 +1,84 @@ +Libparserutils testcases +======================== + +Testcases for Libparserutils are self-contained binaries which test various parts +of the library. These may make use of external data files to drive +the testing. + +Testcase command lines +---------------------- + +Testcase command lines are in a unified format, thus: + + <aliases file> [ <data file> ] + +The aliases file parameter will always be specified (as it is required for +the library to work at all). + +The data file parameter is optional and may be provided on a test-by-test +basis. + +Testcase output +--------------- + +Testcases may output anything at all to stdout. The final line of the +output must begin with either PASS or FAIL (case sensitive), indicating +the success status of the test. + +Test Index +---------- + +In the test sources directory is a file named INDEX, which provides an +index of all available test binaries. Any new test applications should be +added to this index as they are created. 
+ +The test index file format is as follows: + + file = *line + + line = ( entry / comment / blank ) LF + + entry = testname 1*HTAB description [ 1*HTAB datadir ] + comment = "#" *non-newline + blank = 0<OCTET> + + testname = 1*non-reserved + description = 1*non-reserved + datadir = 1*non-reserved + + non-newline = VCHAR / WSP + non-reserved = VCHAR / SP + +Each entry contains a mandatory binary name and description followed by +an optional data directory specifier. The data directory specifier is +used to state the name of the directory containing data files for the +test name. This directory will be searched for within the "data" +directory in the source tree. + +If a data directory is specified, the test binary will be invoked for +each data file listed within the data directory INDEX, passing the +filename as the second parameter (<data file>, above). + +Data Index +---------- + +Each test data directory contains a file, named INDEX, which provides an +index of all available test data files. + +The data index file format is as follows: + + file = *line + + line = ( entry / comment / blank ) LF + + entry = dataname 1*HTAB description + comment = "#" *non-newline + blank = 0<OCTET> + + dataname = 1*non-reserved + description = 1*non-reserved + + non-newline = VCHAR / WSP + non-reserved = VCHAR / SP + +Each entry contains a mandatory data file name and description. 
diff --git a/test/aliases.c b/test/aliases.c new file mode 100644 index 0000000..dff31c6 --- /dev/null +++ b/test/aliases.c @@ -0,0 +1,62 @@ +#include +#include + +#include "charset/aliases.h" + +#include "testutils.h" + +extern void parserutils_charset_aliases_dump(void); /* must match the call in main() */ + +static void *myrealloc(void *ptr, size_t len, void *pw) +{ + UNUSED(pw); + + return realloc(ptr, len); +} + +int main (int argc, char **argv) +{ + parserutils_charset_aliases_canon *c; + + if (argc != 2) { + printf("Usage: %s \n", argv[0]); + return 1; + } + + parserutils_charset_aliases_create(argv[1], myrealloc, NULL); + + parserutils_charset_aliases_dump(); + + c = parserutils_charset_alias_canonicalise("moose", 5); + if (c) { + printf("FAIL - found invalid encoding 'moose'\n"); + return 1; + } + + c = parserutils_charset_alias_canonicalise("csinvariant", 11); + if (c) { + printf("%s %d\n", c->name, c->mib_enum); + } else { + printf("FAIL - failed finding encoding 'csinvariant'\n"); + return 1; + } + + c = parserutils_charset_alias_canonicalise("nats-sefi-add", 13); + if (c) { + printf("%s %d\n", c->name, c->mib_enum); + } else { + printf("FAIL - failed finding encoding 'nats-sefi-add'\n"); + return 1; + } + + printf("%d\n", parserutils_charset_mibenum_from_name(c->name, + strlen(c->name))); + + printf("%s\n", parserutils_charset_mibenum_to_name(c->mib_enum)); + + parserutils_charset_aliases_destroy(myrealloc, NULL); + + printf("PASS\n"); + + return 0; +} diff --git a/test/charset.c b/test/charset.c new file mode 100644 index 0000000..a793e7e --- /dev/null +++ b/test/charset.c @@ -0,0 +1,31 @@ +#include +#include + +#include "charset/charset.h" + +#include "testutils.h" + +static void *myrealloc(void *ptr, size_t len, void *pw) +{ + UNUSED(pw); + + return realloc(ptr, len); +} + +int main(int argc, char **argv) +{ + if (argc != 2) { + printf("Usage: %s \n", argv[0]); + return 1; + } + + assert(parserutils_charset_initialise(argv[1], myrealloc, NULL) == + PARSERUTILS_OK); + + assert
(parserutils_charset_finalise(myrealloc, NULL) == + PARSERUTILS_OK); + + printf("PASS\n"); + + return 0; +} diff --git a/test/cscodec.c b/test/cscodec.c new file mode 100644 index 0000000..d3b1b76 --- /dev/null +++ b/test/cscodec.c @@ -0,0 +1,232 @@ +#include +#include + +#include "charset/charset.h" +#include + +#include "utils/utils.h" + +#include "testutils.h" + +typedef struct line_ctx { + parserutils_charset_codec *codec; + + size_t buflen; + size_t bufused; + uint8_t *buf; + size_t explen; + size_t expused; + uint8_t *exp; + + bool indata; + bool inexp; + + parserutils_error exp_ret; + + enum { ENCODE, DECODE, BOTH } dir; +} line_ctx; + +static bool handle_line(const char *data, size_t datalen, void *pw); +static void run_test(line_ctx *ctx); + +static void *myrealloc(void *ptr, size_t len, void *pw) +{ + UNUSED(pw); + + return realloc(ptr, len); +} + +int main(int argc, char **argv) +{ + line_ctx ctx; + + if (argc != 3) { + printf("Usage: %s \n", argv[0]); + return 1; + } + + assert(parserutils_charset_initialise(argv[1], myrealloc, NULL) == + PARSERUTILS_OK); + + assert(parserutils_charset_codec_create("NATS-SEFI-ADD", + myrealloc, NULL) == NULL); + + ctx.codec = parserutils_charset_codec_create("UTF-8", myrealloc, NULL); + assert(ctx.codec != NULL); + + ctx.buflen = parse_filesize(argv[2]); + if (ctx.buflen == 0) + return 1; + + ctx.buf = malloc(2 * ctx.buflen); + if (ctx.buf == NULL) { + printf("Failed allocating %u bytes\n", + (unsigned int) ctx.buflen); + return 1; + } + + ctx.exp = ctx.buf + ctx.buflen; + ctx.explen = ctx.buflen; + + ctx.buf[0] = '\0'; + ctx.exp[0] = '\0'; + ctx.bufused = 0; + ctx.expused = 0; + ctx.indata = false; + ctx.inexp = false; + ctx.exp_ret = PARSERUTILS_OK; + + assert(parse_testfile(argv[2], handle_line, &ctx) == true); + + /* and run final test */ + if (ctx.bufused > 0 && ctx.buf[ctx.bufused - 1] == '\n') + ctx.bufused -= 1; + + if (ctx.expused > 0 && ctx.exp[ctx.expused - 1] == '\n') + ctx.expused -= 1; + + run_test(&ctx); 
+ + free(ctx.buf); + + parserutils_charset_codec_destroy(ctx.codec); + + assert(parserutils_charset_finalise(myrealloc, NULL) == + PARSERUTILS_OK); + + printf("PASS\n"); + + return 0; +} + +bool handle_line(const char *data, size_t datalen, void *pw) +{ + line_ctx *ctx = (line_ctx *) pw; + + if (data[0] == '#') { + if (ctx->inexp) { + /* End of testcase, so run it; either section may be empty */ + + if (ctx->bufused > 0 && ctx->buf[ctx->bufused - 1] == '\n') + ctx->bufused -= 1; + + if (ctx->expused > 0 && ctx->exp[ctx->expused - 1] == '\n') + ctx->expused -= 1; + + run_test(ctx); + + ctx->buf[0] = '\0'; + ctx->exp[0] = '\0'; + ctx->bufused = 0; + ctx->expused = 0; + ctx->exp_ret = PARSERUTILS_OK; + } + + if (strncasecmp(data+1, "data", 4) == 0) { + parserutils_charset_codec_optparams params; + const char *ptr = data + 6; + + ctx->indata = true; + ctx->inexp = false; + + if (strncasecmp(ptr, "decode", 6) == 0) + ctx->dir = DECODE; + else if (strncasecmp(ptr, "encode", 6) == 0) + ctx->dir = ENCODE; + else + ctx->dir = BOTH; + + ptr += 7; + + if (strncasecmp(ptr, "LOOSE", 5) == 0) { + params.error_mode.mode = + PARSERUTILS_CHARSET_CODEC_ERROR_LOOSE; + ptr += 6; + } else if (strncasecmp(ptr, "STRICT", 6) == 0) { + params.error_mode.mode = + PARSERUTILS_CHARSET_CODEC_ERROR_STRICT; + ptr += 7; + } else { + params.error_mode.mode = + PARSERUTILS_CHARSET_CODEC_ERROR_TRANSLIT; + ptr += 9; + } + + assert(parserutils_charset_codec_setopt(ctx->codec, + PARSERUTILS_CHARSET_CODEC_ERROR_MODE, + (parserutils_charset_codec_optparams *) &params) + == PARSERUTILS_OK); + } else if (strncasecmp(data+1, "expected", 8) == 0) { + ctx->indata = false; + ctx->inexp = true; + + ctx->exp_ret = parserutils_error_from_string(data + 10, + datalen - 10 - 1 /* \n */); + } else if (strncasecmp(data+1, "reset", 5) == 0) { + ctx->indata = false; + ctx->inexp = false; + + parserutils_charset_codec_reset(ctx->codec); + } + } else { + if (ctx->indata) { + memcpy(ctx->buf + ctx->bufused, data, datalen); + ctx->bufused += datalen; + } + if (ctx->inexp) { +
memcpy(ctx->exp + ctx->expused, data, datalen); + ctx->expused += datalen; + } + } + + return true; +} + +void run_test(line_ctx *ctx) +{ + static int testnum; + size_t destlen = ctx->bufused * 4; + uint8_t dest[destlen]; + uint8_t *pdest = dest; + const uint8_t *psrc = ctx->buf; + size_t srclen = ctx->bufused; + size_t i; + + if (ctx->dir == DECODE) { + assert(parserutils_charset_codec_decode(ctx->codec, + &psrc, &srclen, + &pdest, &destlen) == ctx->exp_ret); + } else if (ctx->dir == ENCODE) { + assert(parserutils_charset_codec_encode(ctx->codec, + &psrc, &srclen, + &pdest, &destlen) == ctx->exp_ret); + } else { + size_t templen = ctx->bufused * 4; + uint8_t temp[templen]; + uint8_t *ptemp = temp; + + assert(parserutils_charset_codec_decode(ctx->codec, + &psrc, &srclen, + &ptemp, &templen) == ctx->exp_ret); + ptemp = temp; + templen = ctx->bufused * 4 - templen; + assert(parserutils_charset_codec_encode(ctx->codec, + (const uint8_t **) &ptemp, &templen, + &pdest, &destlen) == ctx->exp_ret); + } + + printf("%d: Read '", ++testnum); + for (i = 0; i < ctx->expused; i++) { + printf("%c%c ", "0123456789abcdef"[(dest[i] >> 4) & 0xf], + "0123456789abcdef"[dest[i] & 0xf]); + } + printf("' Expected '"); + for (i = 0; i < ctx->expused; i++) { + printf("%c%c ", "0123456789abcdef"[(ctx->exp[i] >> 4) & 0xf], + "0123456789abcdef"[ctx->exp[i] & 0xf]); + } + printf("'\n"); + + assert(memcmp(dest, ctx->exp, ctx->expused) == 0); +} + diff --git a/test/data/Aliases b/test/data/Aliases new file mode 100644 index 0000000..db61ff1 --- /dev/null +++ b/test/data/Aliases @@ -0,0 +1,302 @@ +# > Unicode:Files.Aliases +# Mapping of character set encoding names to their canonical form +# +# Lines starting with a '#' are comments, blank lines are ignored. +# +# Based on http://www.iana.org/assignments/character-sets and +# http://www.iana.org/assignments/ianacharset-mib +# +# Canonical Form MIBenum Aliases... 
+# +US-ASCII 3 iso-ir-6 ANSI_X3.4-1986 ISO_646.irv:1991 ASCII ISO646-US ANSI_X3.4-1968 us IBM367 cp367 csASCII +ISO-10646-UTF-1 27 csISO10646UTF1 +ISO_646.basic:1983 28 ref csISO646basic1983 +INVARIANT 29 csINVARIANT +ISO_646.irv:1983 30 iso-ir-2 irv csISO2IntlRefVersion +BS_4730 20 iso-ir-4 ISO646-GB gb uk csISO4UnitedKingdom +NATS-SEFI 31 iso-ir-8-1 csNATSSEFI +NATS-SEFI-ADD 32 iso-ir-8-2 csNATSSEFIADD +NATS-DANO 33 iso-ir-9-1 csNATSDANO +NATS-DANO-ADD 34 iso-ir-9-2 csNATSDANOADD +SEN_850200_B 35 iso-ir-10 FI ISO646-FI ISO646-SE se csISO10Swedish +SEN_850200_C 21 iso-ir-11 ISO646-SE2 se2 csISO11SwedishForNames +KS_C_5601-1987 36 iso-ir-149 KS_C_5601-1989 KSC_5601 korean csKSC56011987 +ISO-2022-KR 37 csISO2022KR +EUC-KR 38 csEUCKR EUCKR +ISO-2022-JP 39 csISO2022JP +ISO-2022-JP-2 40 csISO2022JP2 +ISO-2022-CN 104 +ISO-2022-CN-EXT 105 +JIS_C6220-1969-jp 41 JIS_C6220-1969 iso-ir-13 katakana x0201-7 csISO13JISC6220jp +JIS_C6220-1969-ro 42 iso-ir-14 jp ISO646-JP csISO14JISC6220ro +IT 22 iso-ir-15 ISO646-IT csISO15Italian +PT 43 iso-ir-16 ISO646-PT csISO16Portuguese +ES 23 iso-ir-17 ISO646-ES csISO17Spanish +greek7-old 44 iso-ir-18 csISO18Greek7Old +latin-greek 45 iso-ir-19 csISO19LatinGreek +DIN_66003 24 iso-ir-21 de ISO646-DE csISO21German +NF_Z_62-010_(1973) 46 iso-ir-25 ISO646-FR1 csISO25French +Latin-greek-1 47 iso-ir-27 csISO27LatinGreek1 +ISO_5427 48 iso-ir-37 csISO5427Cyrillic +JIS_C6226-1978 49 iso-ir-42 csISO42JISC62261978 +BS_viewdata 50 iso-ir-47 csISO47BSViewdata +INIS 51 iso-ir-49 csISO49INIS +INIS-8 52 iso-ir-50 csISO50INIS8 +INIS-cyrillic 53 iso-ir-51 csISO51INISCyrillic +ISO_5427:1981 54 iso-ir-54 ISO5427Cyrillic1981 +ISO_5428:1980 55 iso-ir-55 csISO5428Greek +GB_1988-80 56 iso-ir-57 cn ISO646-CN csISO57GB1988 +GB_2312-80 57 iso-ir-58 chinese csISO58GB231280 +NS_4551-1 25 iso-ir-60 ISO646-NO no csISO60DanishNorwegian csISO60Norwegian1 +NS_4551-2 58 ISO646-NO2 iso-ir-61 no2 csISO61Norwegian2 +NF_Z_62-010 26 iso-ir-69 ISO646-FR fr csISO69French 
+videotex-suppl 59 iso-ir-70 csISO70VideotexSupp1 +PT2 60 iso-ir-84 ISO646-PT2 csISO84Portuguese2 +ES2 61 iso-ir-85 ISO646-ES2 csISO85Spanish2 +MSZ_7795.3 62 iso-ir-86 ISO646-HU hu csISO86Hungarian +JIS_C6226-1983 63 iso-ir-87 x0208 JIS_X0208-1983 csISO87JISX0208 +greek7 64 iso-ir-88 csISO88Greek7 +ASMO_449 65 ISO_9036 arabic7 iso-ir-89 csISO89ASMO449 +iso-ir-90 66 csISO90 +JIS_C6229-1984-a 67 iso-ir-91 jp-ocr-a csISO91JISC62291984a +JIS_C6229-1984-b 68 iso-ir-92 ISO646-JP-OCR-B jp-ocr-b csISO92JISC62991984b +JIS_C6229-1984-b-add 69 iso-ir-93 jp-ocr-b-add csISO93JIS62291984badd +JIS_C6229-1984-hand 70 iso-ir-94 jp-ocr-hand csISO94JIS62291984hand +JIS_C6229-1984-hand-add 71 iso-ir-95 jp-ocr-hand-add csISO95JIS62291984handadd +JIS_C6229-1984-kana 72 iso-ir-96 csISO96JISC62291984kana +ISO_2033-1983 73 iso-ir-98 e13b csISO2033 +ANSI_X3.110-1983 74 iso-ir-99 CSA_T500-1983 NAPLPS csISO99NAPLPS +ISO-8859-1 4 iso-ir-100 ISO_8859-1 ISO_8859-1:1987 latin1 l1 IBM819 CP819 csISOLatin1 8859_1 ISO8859-1 +ISO-8859-2 5 iso-ir-101 ISO_8859-2 ISO_8859-2:1987 latin2 l2 csISOLatin2 8859_2 ISO8859-2 +T.61-7bit 75 iso-ir-102 csISO102T617bit +T.61-8bit 76 T.61 iso-ir-103 csISO103T618bit +ISO-8859-3 6 iso-ir-109 ISO_8859-3 ISO_8859-3:1988 latin3 l3 csISOLatin3 8859_3 ISO8859-3 +ISO-8859-4 7 iso-ir-110 ISO_8859-4 ISO_8859-4:1988 latin4 l4 csISOLatin4 8859_4 ISO8859-4 +ECMA-cyrillic 77 iso-ir-111 KOI8-E csISO111ECMACyrillic +CSA_Z243.4-1985-1 78 iso-ir-121 ISO646-CA csa7-1 ca csISO121Canadian1 +CSA_Z243.4-1985-2 79 iso-ir-122 ISO646-CA2 csa7-2 csISO122Canadian2 +CSA_Z243.4-1985-gr 80 iso-ir-123 csISO123CSAZ24341985gr +ISO-8859-6 9 iso-ir-127 ISO_8859-6 ISO_8859-6:1987 ECMA-114 ASMO-708 arabic csISOLatinArabic +ISO-8859-6-E 81 csISO88596E ISO_8859-6-E +ISO-8859-6-I 82 csISO88596I ISO_8859-6-I +ISO-8859-7 10 iso-ir-126 ISO_8859-7 ISO_8859-7:1987 ELOT_928 ECMA-118 greek greek8 csISOLatinGreek 8859_7 ISO8859-7 +T.101-G2 83 iso-ir-128 csISO128T101G2 +ISO-8859-8 11 iso-ir-138 ISO_8859-8 
ISO_8859-8:1988 hebrew csISOLatinHebrew 8859_8 ISO8859-8 +ISO-8859-8-E 84 csISO88598E ISO_8859-8-E +ISO-8859-8-I 85 csISO88598I ISO_8859-8-I +CSN_369103 86 iso-ir-139 csISO139CSN369103 +JUS_I.B1.002 87 iso-ir-141 ISO646-YU js yu csISO141JUSIB1002 +ISO_6937-2-add 14 iso-ir-142 csISOTextComm +IEC_P27-1 88 iso-ir-143 csISO143IECP271 +ISO-8859-5 8 iso-ir-144 ISO_8859-5 ISO_8859-5:1988 cyrillic csISOLatinCyrillic 8859_5 ISO8859-5 +JUS_I.B1.003-serb 89 iso-ir-146 serbian csISO146Serbian +JUS_I.B1.003-mac 90 macedonian iso-ir-147 csISO147Macedonian +ISO-8859-9 12 iso-ir-148 ISO_8859-9 ISO_8859-9:1989 latin5 l5 csISOLatin5 8859_9 ISO8859-9 +greek-ccitt 91 iso-ir-150 csISO150 csISO150GreekCCITT +NC_NC00-10:81 92 cuba iso-ir-151 ISO646-CU csISO151Cuba +ISO_6937-2-25 93 iso-ir-152 csISO6937Add +GOST_19768-74 94 ST_SEV_358-88 iso-ir-153 csISO153GOST1976874 +ISO_8859-supp 95 iso-ir-154 latin1-2-5 csISO8859Supp +ISO_10367-box 96 iso-ir-155 csISO10367Box +ISO-8859-10 13 iso-ir-157 l6 ISO_8859-10:1992 csISOLatin6 latin6 8859_10 ISO8859-10 +latin-lap 97 lap iso-ir-158 csISO158Lap +JIS_X0212-1990 98 x0212 iso-ir-159 csISO159JISX02121990 +DS_2089 99 DS2089 ISO646-DK dk csISO646Danish +us-dk 100 csUSDK +dk-us 101 csDKUS +JIS_X0201 15 X0201 csHalfWidthKatakana +KSC5636 102 ISO646-KR csKSC5636 +ISO-10646-UCS-2 1000 csUnicode UCS-2 UCS2 +ISO-10646-UCS-4 1001 csUCS4 UCS-4 UCS4 +DEC-MCS 2008 dec csDECMCS +hp-roman8 2004 roman8 r8 csHPRoman8 +macintosh 2027 mac csMacintosh MACROMAN MAC-ROMAN X-MAC-ROMAN +IBM037 2028 cp037 ebcdic-cp-us ebcdic-cp-ca ebcdic-cp-wt ebcdic-cp-nl csIBM037 +IBM038 2029 EBCDIC-INT cp038 csIBM038 +IBM273 2030 CP273 csIBM273 +IBM274 2031 EBCDIC-BE CP274 csIBM274 +IBM275 2032 EBCDIC-BR cp275 csIBM275 +IBM277 2033 EBCDIC-CP-DK EBCDIC-CP-NO csIBM277 +IBM278 2034 CP278 ebcdic-cp-fi ebcdic-cp-se csIBM278 +IBM280 2035 CP280 ebcdic-cp-it csIBM280 +IBM281 2036 EBCDIC-JP-E cp281 csIBM281 +IBM284 2037 CP284 ebcdic-cp-es csIBM284 +IBM285 2038 CP285 ebcdic-cp-gb csIBM285 +IBM290 
2039 cp290 EBCDIC-JP-kana csIBM290 +IBM297 2040 cp297 ebcdic-cp-fr csIBM297 +IBM420 2041 cp420 ebcdic-cp-ar1 csIBM420 +IBM423 2042 cp423 ebcdic-cp-gr csIBM423 +IBM424 2043 cp424 ebcdic-cp-he csIBM424 +IBM437 2011 cp437 437 csPC8CodePage437 +IBM500 2044 CP500 ebcdic-cp-be ebcdic-cp-ch csIBM500 +IBM775 2087 cp775 csPC775Baltic +IBM850 2009 cp850 850 csPC850Multilingual +IBM851 2045 cp851 851 csIBM851 +IBM852 2010 cp852 852 csPCp852 +IBM855 2046 cp855 855 csIBM855 +IBM857 2047 cp857 857 csIBM857 +IBM860 2048 cp860 860 csIBM860 +IBM861 2049 cp861 861 cp-is csIBM861 +IBM862 2013 cp862 862 csPC862LatinHebrew +IBM863 2050 cp863 863 csIBM863 +IBM864 2051 cp864 csIBM864 +IBM865 2052 cp865 865 csIBM865 +IBM866 2086 cp866 866 csIBM866 +IBM868 2053 CP868 cp-ar csIBM868 +IBM869 2054 cp869 869 cp-gr csIBM869 +IBM870 2055 CP870 ebcdic-cp-roece ebcdic-cp-yu csIBM870 +IBM871 2056 CP871 ebcdic-cp-is csIBM871 +IBM880 2057 cp880 EBCDIC-Cyrillic csIBM880 +IBM891 2058 cp891 csIBM891 +IBM903 2059 cp903 csIBM903 +IBM904 2060 cp904 904 csIBBM904 +IBM905 2061 CP905 ebcdic-cp-tr csIBM905 +IBM918 2062 CP918 ebcdic-cp-ar2 csIBM918 +IBM1026 2063 CP1026 csIBM1026 +EBCDIC-AT-DE 2064 csIBMEBCDICATDE +EBCDIC-AT-DE-A 2065 csEBCDICATDEA +EBCDIC-CA-FR 2066 csEBCDICCAFR +EBCDIC-DK-NO 2067 csEBCDICDKNO +EBCDIC-DK-NO-A 2068 csEBCDICDKNOA +EBCDIC-FI-SE 2069 csEBCDICFISE +EBCDIC-FI-SE-A 2070 csEBCDICFISEA +EBCDIC-FR 2071 csEBCDICFR +EBCDIC-IT 2072 csEBCDICIT +EBCDIC-PT 2073 csEBCDICPT +EBCDIC-ES 2074 csEBCDICES +EBCDIC-ES-A 2075 csEBCDICESA +EBCDIC-ES-S 2076 csEBCDICESS +EBCDIC-UK 2077 csEBCDICUK +EBCDIC-US 2078 csEBCDICUS +UNKNOWN-8BIT 2079 csUnknown8BiT +MNEMONIC 2080 csMnemonic +MNEM 2081 csMnem +VISCII 2082 csVISCII +VIQR 2083 csVIQR +KOI8-R 2084 csKOI8R +KOI8-U 2088 +IBM00858 2089 CCSID00858 CP00858 PC-Multilingual-850+euro +IBM00924 2090 CCSID00924 CP00924 ebcdic-Latin9--euro +IBM01140 2091 CCSID01140 CP01140 ebcdic-us-37+euro +IBM01141 2092 CCSID01141 CP01141 ebcdic-de-273+euro +IBM01142 2093 
CCSID01142 CP01142 ebcdic-dk-277+euro ebcdic-no-277+euro +IBM01143 2094 CCSID01143 CP01143 ebcdic-fi-278+euro ebcdic-se-278+euro +IBM01144 2095 CCSID01144 CP01144 ebcdic-it-280+euro +IBM01145 2096 CCSID01145 CP01145 ebcdic-es-284+euro +IBM01146 2097 CCSID01146 CP01146 ebcdic-gb-285+euro +IBM01147 2098 CCSID01147 CP01147 ebcdic-fr-297+euro +IBM01148 2099 CCSID01148 CP01148 ebcdic-international-500+euro +IBM01149 2100 CCSID01149 CP01149 ebcdic-is-871+euro +Big5-HKSCS 2101 +IBM1047 2102 IBM-1047 +PTCP154 2103 csPTCP154 PT154 CP154 Cyrillic-Asian +Amiga-1251 2104 Ami1251 Amiga1251 Ami-1251 +KOI7-switched 2105 +UNICODE-1-1 1010 csUnicode11 +SCSU 1011 +UTF-7 1012 +UTF-16BE 1013 +UTF-16LE 1014 +UTF-16 1015 +CESU-8 1016 csCESU-8 +UTF-32 1017 +UTF-32BE 1018 +UTF-32LE 1019 +BOCU-1 1020 csBOCU-1 +UNICODE-1-1-UTF-7 103 csUnicode11UTF7 +UTF-8 106 UNICODE-1-1-UTF-8 UNICODE-2-0-UTF-8 utf8 +ISO-8859-13 109 8859_13 ISO8859-13 +ISO-8859-14 110 iso-ir-199 ISO_8859-14:1998 ISO_8859-14 latin8 iso-celtic l8 8859_14 ISO8859-14 +ISO-8859-15 111 ISO_8859-15 Latin-9 8859_15 ISO8859-15 +ISO-8859-16 112 iso-ir-226 ISO_8859-16:2001 ISO_8859-16 latin10 l10 +GBK 113 CP936 MS936 windows-936 +GB18030 114 +OSD_EBCDIC_DF04_15 115 +OSD_EBCDIC_DF03_IRV 116 +OSD_EBCDIC_DF04_1 117 +JIS_Encoding 16 csJISEncoding +Shift_JIS 17 MS_Kanji csShiftJIS X-SJIS Shift-JIS +EUC-JP 18 csEUCPkdFmtJapanese Extended_UNIX_Code_Packed_Format_for_Japanese EUCJP +Extended_UNIX_Code_Fixed_Width_for_Japanese 19 csEUCFixWidJapanese +ISO-10646-UCS-Basic 1002 csUnicodeASCII +ISO-10646-Unicode-Latin1 1003 csUnicodeLatin1 ISO-10646 +ISO-Unicode-IBM-1261 1005 csUnicodeIBM1261 +ISO-Unicode-IBM-1268 1006 csUnicodeIBM1268 +ISO-Unicode-IBM-1276 1007 csUnicodeIBM1276 +ISO-Unicode-IBM-1264 1008 csUnicodeIBM1264 +ISO-Unicode-IBM-1265 1009 csUnicodeIBM1265 +ISO-8859-1-Windows-3.0-Latin-1 2000 csWindows30Latin1 +ISO-8859-1-Windows-3.1-Latin-1 2001 csWindows31Latin1 +ISO-8859-2-Windows-Latin-2 2002 csWindows31Latin2 
+ISO-8859-9-Windows-Latin-5 2003 csWindows31Latin5 +Adobe-Standard-Encoding 2005 csAdobeStandardEncoding +Ventura-US 2006 csVenturaUS +Ventura-International 2007 csVenturaInternational +PC8-Danish-Norwegian 2012 csPC8DanishNorwegian +PC8-Turkish 2014 csPC8Turkish +IBM-Symbols 2015 csIBMSymbols +IBM-Thai 2016 csIBMThai +HP-Legal 2017 csHPLegal +HP-Pi-font 2018 csHPPiFont +HP-Math8 2019 csHPMath8 +Adobe-Symbol-Encoding 2020 csHPPSMath +HP-DeskTop 2021 csHPDesktop +Ventura-Math 2022 csVenturaMath +Microsoft-Publishing 2023 csMicrosoftPublishing +Windows-31J 2024 csWindows31J +GB2312 2025 csGB2312 EUC-CN EUCCN CN-GB +Big5 2026 csBig5 BIG-FIVE BIG-5 CN-BIG5 BIG_FIVE +windows-1250 2250 CP1250 MS-EE +windows-1251 2251 CP1251 MS-CYRL +windows-1252 2252 CP1252 MS-ANSI +windows-1253 2253 CP1253 MS-GREEK +windows-1254 2254 CP1254 MS-TURK +windows-1255 2255 +windows-1256 2256 CP1256 MS-ARAB +windows-1257 2257 CP1257 WINBALTRIM +windows-1258 2258 +TIS-620 2259 +HZ-GB-2312 2085 + +# Additional encodings not defined by IANA + +# Arbitrary allocations +#CP737 3001 +#CP853 3002 +#CP856 3003 +CP874 3004 WINDOWS-874 +#CP922 3005 +#CP1046 3006 +#CP1124 3007 +#CP1125 3008 WINDOWS-1125 +#CP1129 3009 +#CP1133 3010 IBM-CP1133 +#CP1161 3011 IBM-1161 IBM1161 CSIBM1161 +#CP1162 3012 IBM-1162 IBM1162 CSIBM1162 +#CP1163 3013 IBM-1163 IBM1163 CSIBM1163 +#GEORGIAN-ACADEMY 3014 +#GEORGIAN-PS 3015 +#KOI8-RU 3016 +#KOI8-T 3017 +#MACARABIC 3018 X-MAC-ARABIC MAC-ARABIC +#MACCROATIAN 3019 X-MAC-CROATIAN MAC-CROATIAN +#MACGREEK 3020 X-MAC-GREEK MAC-GREEK +#MACHEBREW 3021 X-MAC-HEBREW MAC-HEBREW +#MACICELAND 3022 X-MAC-ICELAND MAC-ICELAND +#MACROMANIA 3023 X-MAC-ROMANIA MAC-ROMANIA +#MACTHAI 3024 X-MAC-THAI MAC-THAI +#MACTURKISH 3025 X-MAC-TURKISH MAC-TURKISH +#MULELAO-1 3026 + +# From Unicode Lib +ISO-IR-182 4000 +ISO-IR-197 4002 +ISO-2022-JP-1 4008 +MACCYRILLIC 4009 X-MAC-CYRILLIC MAC-CYRILLIC +MACUKRAINE 4010 X-MAC-UKRAINIAN MAC-UKRAINIAN +MACCENTRALEUROPE 4011 X-MAC-CENTRALEURROMAN 
MAC-CENTRALEURROMAN +JOHAB 4012 +ISO-8859-11 4014 iso-ir-166 ISO_8859-11 ISO8859-11 8859_11 +X-CURRENT 4999 X-SYSTEM +X-ACORN-LATIN1 5001 +X-ACORN-FUZZY 5002 diff --git a/test/data/cscodec/INDEX b/test/data/cscodec/INDEX new file mode 100644 index 0000000..d6d338a --- /dev/null +++ b/test/data/cscodec/INDEX @@ -0,0 +1,6 @@ +# Index file for charset codec tests +# +# Test Description + +simple.dat Simple tests, designed to validate testdriver +UTF-8-test.txt Markus Kuhn's UTF-8 decoding test file diff --git a/test/data/cscodec/UTF-8-test.txt b/test/data/cscodec/UTF-8-test.txt new file mode 100644 index 0000000..920e54e Binary files /dev/null and b/test/data/cscodec/UTF-8-test.txt differ diff --git a/test/data/cscodec/simple.dat b/test/data/cscodec/simple.dat new file mode 100644 index 0000000..3e2c7ae Binary files /dev/null and b/test/data/cscodec/simple.dat differ diff --git a/test/data/input/INDEX b/test/data/input/INDEX new file mode 100644 index 0000000..c2c97ea --- /dev/null +++ b/test/data/input/INDEX @@ -0,0 +1,5 @@ +# Index file for inputstream tests +# +# Test Description + +UTF-8-test.txt Markus Kuhn's UTF-8 decoding test file diff --git a/test/data/input/UTF-8-test.txt b/test/data/input/UTF-8-test.txt new file mode 100644 index 0000000..abd16f7 Binary files /dev/null and b/test/data/input/UTF-8-test.txt differ diff --git a/test/filter.c b/test/filter.c new file mode 100644 index 0000000..ff4d1e7 --- /dev/null +++ b/test/filter.c @@ -0,0 +1,357 @@ +#include +#include +#include +#include + +#include + +#include "utils/utils.h" + +#include "input/filter.h" + +#include "testutils.h" + +static void *myrealloc(void *ptr, size_t len, void *pw) +{ + UNUSED(pw); + + return realloc(ptr, len); +} + +int main(int argc, char **argv) +{ + parserutils_filter_optparams params; + parserutils_filter *input; + uint8_t inbuf[64], outbuf[64]; + size_t inlen, outlen; + const uint8_t *in = inbuf; + uint8_t *out = outbuf; + + if (argc != 2) { + printf("Usage: %s \n", argv[0]); + 
return 1; + } + + /* Initialise library */ + assert(parserutils_initialise(argv[1], myrealloc, NULL) == + PARSERUTILS_OK); + + /* Create input filter */ + input = parserutils_filter_create("UTF-8", myrealloc, NULL); + assert(input); + + /* Convert filter to UTF-8 encoding */ + params.encoding.name = "UTF-8"; + assert(parserutils_filter_setopt(input, PARSERUTILS_FILTER_SET_ENCODING, + (parserutils_filter_optparams *) &params) == + PARSERUTILS_OK); + + + /* Simple case - valid input & output buffer large enough */ + in = inbuf; + out = outbuf; + strcpy((char *) inbuf, "hell\xc2\xa0o!"); + inlen = strlen((const char *) inbuf); + outbuf[0] = '\0'; + outlen = 64; + + assert(parserutils_filter_process_chunk(input, &in, &inlen, + &out, &outlen) == PARSERUTILS_OK); + + printf("'%.*s' %d '%.*s' %d\n", (int) inlen, in, (int) inlen, + (int) (out - ((uint8_t *) outbuf)), + outbuf, (int) outlen); + + assert(parserutils_filter_reset(input) == PARSERUTILS_OK); + + assert(memcmp(outbuf, "hell\xc2\xa0o!", + SLEN("hell\xc2\xa0o!")) == 0); + + + /* Too small an output buffer; no encoding edge cases */ + in = inbuf; + out = outbuf; + strcpy((char *) inbuf, "hello!"); + inlen = strlen((const char *) inbuf); + outbuf[0] = '\0'; + outlen = 5; + + assert(parserutils_filter_process_chunk(input, &in, &inlen, + &out, &outlen) == PARSERUTILS_NOMEM); + + printf("'%.*s' %d '%.*s' %d\n", (int) inlen, in, (int) inlen, + (int) (out - ((uint8_t *) outbuf)), + outbuf, (int) outlen); + + outlen = 64 - 5 + outlen; + + assert(parserutils_filter_process_chunk(input, &in, &inlen, + &out, &outlen) == PARSERUTILS_OK); + + printf("'%.*s' %d '%.*s' %d\n", (int) inlen, in, (int) inlen, + (int) (out - ((uint8_t *) outbuf)), + outbuf, (int) outlen); + + assert(parserutils_filter_reset(input) == PARSERUTILS_OK); + + assert(memcmp(outbuf, "hello!", + SLEN("hello!")) == 0); + + + /* Illegal input sequence; output buffer large enough */ + in = inbuf; + out = outbuf; + strcpy((char *) inbuf, "hell\x96o!"); + inlen =
strlen((const char *) inbuf); + outbuf[0] = '\0'; + outlen = 64; + + /* Input does loose decoding, converting to U+FFFD if illegal + * input is encountered */ + assert(parserutils_filter_process_chunk(input, &in, &inlen, + &out, &outlen) == PARSERUTILS_OK); + + printf("'%.*s' %d '%.*s' %d\n", (int) inlen, in, (int) inlen, + (int) (out - ((uint8_t *) outbuf)), + outbuf, (int) outlen); + + assert(parserutils_filter_reset(input) == PARSERUTILS_OK); + + assert(memcmp(outbuf, "hell\xef\xbf\xbdo!", + SLEN("hell\xef\xbf\xbdo!")) == 0); + + + /* Input ends mid-sequence */ + in = inbuf; + out = outbuf; + strcpy((char *) inbuf, "hell\xc2\xa0o!"); + inlen = strlen((const char *) inbuf) - 3; + outbuf[0] = '\0'; + outlen = 64; + + assert(parserutils_filter_process_chunk(input, &in, &inlen, + &out, &outlen) == PARSERUTILS_OK); + + printf("'%.*s' %d '%.*s' %d\n", (int) inlen, in, (int) inlen, + (int) (out - ((uint8_t *) outbuf)), + outbuf, (int) outlen); + + inlen += 3; + + assert(parserutils_filter_process_chunk(input, &in, &inlen, + &out, &outlen) == PARSERUTILS_OK); + + printf("'%.*s' %d '%.*s' %d\n", (int) inlen, in, (int) inlen, + (int) (out - ((uint8_t *) outbuf)), + outbuf, (int) outlen); + + assert(parserutils_filter_reset(input) == PARSERUTILS_OK); + + assert(memcmp(outbuf, "hell\xc2\xa0o!", + SLEN("hell\xc2\xa0o!")) == 0); + + + /* Input ends mid-sequence, but second attempt has too small a + * buffer, but large enough to write out the incomplete character. 
*/ + in = inbuf; + out = outbuf; + strcpy((char *) inbuf, "hell\xc2\xa0o!"); + inlen = strlen((const char *) inbuf) - 3; + outbuf[0] = '\0'; + outlen = 64; + + assert(parserutils_filter_process_chunk(input, &in, &inlen, + &out, &outlen) == PARSERUTILS_OK); + + printf("'%.*s' %d '%.*s' %d\n", (int) inlen, in, (int) inlen, + (int) (out - ((uint8_t *) outbuf)), + outbuf, (int) outlen); + + inlen += 3; + outlen = 3; + + assert(parserutils_filter_process_chunk(input, &in, &inlen, + &out, &outlen) == PARSERUTILS_NOMEM); + + printf("'%.*s' %d '%.*s' %d\n", (int) inlen, in, (int) inlen, + (int) (out - ((uint8_t *) outbuf)), + outbuf, (int) outlen); + + outlen = 64 - 7; + + assert(parserutils_filter_process_chunk(input, &in, &inlen, + &out, &outlen) == PARSERUTILS_OK); + + printf("'%.*s' %d '%.*s' %d\n", (int) inlen, in, (int) inlen, + (int) (out - ((uint8_t *) outbuf)), + outbuf, (int) outlen); + + assert(parserutils_filter_reset(input) == PARSERUTILS_OK); + + assert(memcmp(outbuf, "hell\xc2\xa0o!", + SLEN("hell\xc2\xa0o!")) == 0); + + + /* Input ends mid-sequence, but second attempt has too small a + * buffer, not large enough to write out the incomplete character. 
*/ + in = inbuf; + out = outbuf; + strcpy((char *) inbuf, "hell\xc2\xa0o!"); + inlen = strlen((const char *) inbuf) - 3; + outbuf[0] = '\0'; + outlen = 64; + + assert(parserutils_filter_process_chunk(input, &in, &inlen, + &out, &outlen) == PARSERUTILS_OK); + + printf("'%.*s' %d '%.*s' %d\n", (int) inlen, in, (int) inlen, + (int) (out - ((uint8_t *) outbuf)), + outbuf, (int) outlen); + + inlen += 3; + outlen = 1; + + assert(parserutils_filter_process_chunk(input, &in, &inlen, + &out, &outlen) == PARSERUTILS_NOMEM); + + printf("'%.*s' %d '%.*s' %d\n", (int) inlen, in, (int) inlen, + (int) (out - ((uint8_t *) outbuf)), + outbuf, (int) outlen); + + outlen = 60; + + assert(parserutils_filter_process_chunk(input, &in, &inlen, + &out, &outlen) == PARSERUTILS_OK); + + printf("'%.*s' %d '%.*s' %d\n", (int) inlen, in, (int) inlen, + (int) (out - ((uint8_t *) outbuf)), + outbuf, (int) outlen); + + assert(parserutils_filter_reset(input) == PARSERUTILS_OK); + + assert(memcmp(outbuf, "hell\xc2\xa0o!", + SLEN("hell\xc2\xa0o!")) == 0); + + + /* Input ends mid-sequence, but second attempt contains + * invalid character */ + in = inbuf; + out = outbuf; + strcpy((char *) inbuf, "hell\xc2\xc2o!"); + inlen = strlen((const char *) inbuf) - 3; + outbuf[0] = '\0'; + outlen = 64; + + assert(parserutils_filter_process_chunk(input, &in, &inlen, + &out, &outlen) == PARSERUTILS_OK); + + printf("'%.*s' %d '%.*s' %d\n", (int) inlen, in, (int) inlen, + (int) (out - ((uint8_t *) outbuf)), + outbuf, (int) outlen); + + inlen += 3; + + /* Input does loose decoding, converting to U+FFFD if illegal + * input is encountered */ + assert(parserutils_filter_process_chunk(input, &in, &inlen, + &out, &outlen) == PARSERUTILS_OK); + + printf("'%.*s' %d '%.*s' %d\n", (int) inlen, in, (int) inlen, + (int) (out - ((uint8_t *) outbuf)), + outbuf, (int) outlen); + + assert(parserutils_filter_reset(input) == PARSERUTILS_OK); + + assert(memcmp(outbuf, "hell\xef\xbf\xbd\xef\xbf\xbdo!", + 
SLEN("hell\xef\xbf\xbd\xef\xbf\xbdo!")) == 0); + + + /* Input ends mid-sequence, but second attempt contains another + * incomplete character */ + in = inbuf; + out = outbuf; + strcpy((char *) inbuf, "hell\xc2\xa0\xc2\xa1o!"); + inlen = strlen((const char *) inbuf) - 5; + outbuf[0] = '\0'; + outlen = 64; + + assert(parserutils_filter_process_chunk(input, &in, &inlen, + &out, &outlen) == PARSERUTILS_OK); + + printf("'%.*s' %d '%.*s' %d\n", (int) inlen, in, (int) inlen, + (int) (out - ((uint8_t *) outbuf)), + outbuf, (int) outlen); + + inlen += 2; + + assert(parserutils_filter_process_chunk(input, &in, &inlen, + &out, &outlen) == PARSERUTILS_OK); + + printf("'%.*s' %d '%.*s' %d\n", (int) inlen, in, (int) inlen, + (int) (out - ((uint8_t *) outbuf)), + outbuf, (int) outlen); + + inlen += 3; + + assert(parserutils_filter_process_chunk(input, &in, &inlen, + &out, &outlen) == PARSERUTILS_OK); + + printf("'%.*s' %d '%.*s' %d\n", (int) inlen, in, (int) inlen, + (int) (out - ((uint8_t *) outbuf)), + outbuf, (int) outlen); + + assert(parserutils_filter_reset(input) == PARSERUTILS_OK); + + assert(memcmp(outbuf, "hell\xc2\xa0\xc2\xa1o!", + SLEN("hell\xc2\xa0\xc2\xa1o!")) == 0); + + + /* Input ends mid-sequence, but second attempt contains insufficient + * data to complete the incomplete character */ + in = inbuf; + out = outbuf; + strcpy((char *) inbuf, "hell\xe2\x80\xa2o!"); + inlen = strlen((const char *) inbuf) - 4; + outbuf[0] = '\0'; + outlen = 64; + + assert(parserutils_filter_process_chunk(input, &in, &inlen, + &out, &outlen) == PARSERUTILS_OK); + + printf("'%.*s' %d '%.*s' %d\n", (int) inlen, in, (int) inlen, + (int) (out - ((uint8_t *) outbuf)), + outbuf, (int) outlen); + + inlen += 1; + + assert(parserutils_filter_process_chunk(input, &in, &inlen, + &out, &outlen) == PARSERUTILS_OK); + + printf("'%.*s' %d '%.*s' %d\n", (int) inlen, in, (int) inlen, + (int) (out - ((uint8_t *) outbuf)), + outbuf, (int) outlen); + + inlen += 3; + + 
assert(parserutils_filter_process_chunk(input, &in, &inlen,
+			&out, &outlen) == PARSERUTILS_OK);
+
+	printf("'%.*s' %d '%.*s' %d\n", (int) inlen, in, (int) inlen,
+			(int) (out - ((uint8_t *) outbuf)),
+			outbuf, (int) outlen);
+
+	assert(parserutils_filter_reset(input) == PARSERUTILS_OK);
+
+	assert(memcmp(outbuf, "hell\xe2\x80\xa2o!",
+			SLEN("hell\xe2\x80\xa2o!")) == 0);
+
+
+	/* Clean up */
+	parserutils_filter_destroy(input);
+
+	assert(parserutils_finalise(myrealloc, NULL) == PARSERUTILS_OK);
+
+	printf("PASS\n");
+
+	return 0;
+}
diff --git a/test/inputstream.c b/test/inputstream.c
new file mode 100644
index 0000000..bad3127
--- /dev/null
+++ b/test/inputstream.c
@@ -0,0 +1,97 @@
+/* Inputstream test: feeds a data file to a UTF-8 inputstream in
+ * CHUNK_SIZE pieces, draining the stream after every chunk, then inserts
+ * a literal string and drains to end of file.
+ *
+ * argv[1] is handed to parserutils_initialise(); argv[2] is the data file.
+ */
+#include <stdio.h>
+#include <stdlib.h>
+
+#include <parserutils/parserutils.h>
+#include <parserutils/errors.h>
+#include <parserutils/input/inputstream.h>
+
+#include "utils/utils.h"
+
+#include "testutils.h"
+
+/* Allocator callback for the library: a thin wrapper around realloc() */
+static void *myrealloc(void *ptr, size_t len, void *pw)
+{
+	UNUSED(pw);
+
+	return realloc(ptr, len);
+}
+
+int main(int argc, char **argv)
+{
+	parserutils_inputstream *stream;
+	FILE *fp;
+	long fsize;
+	size_t len;
+#define CHUNK_SIZE (4096)
+	uint8_t buf[CHUNK_SIZE];
+	uintptr_t c;
+	size_t clen;
+
+	if (argc != 3) {
+		printf("Usage: %s <aliases_file> <filename>\n", argv[0]);
+		return 1;
+	}
+
+	/* Initialise library */
+	assert(parserutils_initialise(argv[1], myrealloc, NULL) ==
+			PARSERUTILS_OK);
+
+	stream = parserutils_inputstream_create("UTF-8", 1, NULL,
+			myrealloc, NULL);
+	assert(stream != NULL);
+
+	fp = fopen(argv[2], "rb");
+	if (fp == NULL) {
+		printf("Failed opening %s\n", argv[2]);
+		return 1;
+	}
+
+	/* Measure the file; ftell() reports failure as -1, which must not
+	 * be allowed to turn into an enormous size_t */
+	assert(fseek(fp, 0, SEEK_END) == 0);
+	fsize = ftell(fp);
+	assert(fsize >= 0);
+	assert(fseek(fp, 0, SEEK_SET) == 0);
+	len = (size_t) fsize;
+
+	/* Whole chunks: append, then consume everything buffered so far */
+	while (len >= CHUNK_SIZE) {
+		/* A short read would hand stale buffer contents to the stream */
+		assert(fread(buf, 1, CHUNK_SIZE, fp) == CHUNK_SIZE);
+
+		assert(parserutils_inputstream_append(stream,
+				buf, CHUNK_SIZE) == PARSERUTILS_OK);
+
+		len -= CHUNK_SIZE;
+
+		while ((c = parserutils_inputstream_peek(stream, 0, &clen)) !=
+				PARSERUTILS_INPUTSTREAM_OOD) {
+			parserutils_inputstream_advance(stream, clen);
+		}
+	}
+
+	/* Trailing partial chunk, if any */
+	if (len > 0) {
+		assert(fread(buf, 1, len, fp) == len);
+
+		assert(parserutils_inputstream_append(stream,
+				buf, len) == PARSERUTILS_OK);
+
+		len = 0;
+	}
+
+	fclose(fp);
+
+	assert(parserutils_inputstream_insert(stream,
+			(const uint8_t *) "hello!!!",
+			SLEN("hello!!!")) == PARSERUTILS_OK);
+
+	/* Zero-length append; presumably flags end of input, since the loop
+	 * below waits for EOF rather than OOD */
+	assert(parserutils_inputstream_append(stream, NULL, 0) ==
+			PARSERUTILS_OK);
+
+	while ((c = parserutils_inputstream_peek(stream, 0, &clen)) !=
+			PARSERUTILS_INPUTSTREAM_EOF) {
+		parserutils_inputstream_advance(stream, clen);
+	}
+
+	parserutils_inputstream_destroy(stream);
+
+	assert(parserutils_finalise(myrealloc, NULL) == PARSERUTILS_OK);
+
+	printf("PASS\n");
+
+	return 0;
+}
+
diff --git a/test/parserutils.c b/test/parserutils.c
new file mode 100644
index 0000000..c6d671a
--- /dev/null
+++ b/test/parserutils.c
@@ -0,0 +1,30 @@
+/* Library lifecycle test: initialise, then finalise, and nothing else.
+ *
+ * argv[1] is handed to parserutils_initialise().
+ */
+#include <stdio.h>
+#include <stdlib.h>
+
+#include <parserutils/parserutils.h>
+
+#include "testutils.h"
+
+/* Allocator callback for the library: a thin wrapper around realloc() */
+static void *myrealloc(void *ptr, size_t len, void *pw)
+{
+	UNUSED(pw);
+
+	return realloc(ptr, len);
+}
+
+int main(int argc, char **argv)
+{
+	if (argc != 2) {
+		printf("Usage: %s <aliases_file>\n", argv[0]);
+		return 1;
+	}
+
+	assert(parserutils_initialise(argv[1], myrealloc, NULL) ==
+			PARSERUTILS_OK);
+
+	assert (parserutils_finalise(myrealloc, NULL) == PARSERUTILS_OK);
+
+	printf("PASS\n");
+
+	return 0;
+}
diff --git a/test/regression/cscodec-segv.c b/test/regression/cscodec-segv.c
new file mode 100644
index 0000000..5802fdf
--- /dev/null
+++ b/test/regression/cscodec-segv.c
@@ -0,0 +1,38 @@
+#include <stdio.h>
+
+#include "charset/charset.h"
+#include <parserutils/charset/codec.h>
+
+#include "testutils.h"
+
+static void *myrealloc(void *ptr, size_t len, void *pw)
+{
+	UNUSED(pw);
+
+	return realloc(ptr, len);
+}
+
+int main(int argc, char **argv)
+{
+	parserutils_charset_codec *codec;
+
+	if (argc != 2) {
+		printf("Usage: %s <aliases_file>\n", argv[0]);
+		return 1;
+	}
+
+	assert(parserutils_charset_initialise(argv[1], myrealloc, NULL) ==
+			PARSERUTILS_OK);
+
+	codec = parserutils_charset_codec_create("UTF-8", myrealloc, NULL);
+	assert(codec != NULL);
+
+	parserutils_charset_codec_destroy(codec);
+
assert(parserutils_charset_finalise(myrealloc, NULL) ==
+			PARSERUTILS_OK);
+
+	printf("PASS\n");
+
+	return 0;
+}
diff --git a/test/regression/filter-segv.c b/test/regression/filter-segv.c
new file mode 100644
index 0000000..761caab
--- /dev/null
+++ b/test/regression/filter-segv.c
@@ -0,0 +1,39 @@
+/* Regression test: create a UTF-8 input filter and destroy it again
+ * without ever processing data.
+ *
+ * argv[1] is handed to parserutils_initialise().
+ */
+#include <stdio.h>
+#include <stdlib.h>
+
+#include <parserutils/parserutils.h>
+
+#include "input/filter.h"
+
+#include "testutils.h"
+
+/* Allocator callback for the library: a thin wrapper around realloc() */
+static void *myrealloc(void *ptr, size_t len, void *pw)
+{
+	UNUSED(pw);
+
+	return realloc(ptr, len);
+}
+
+int main(int argc, char **argv)
+{
+	parserutils_filter *input;
+
+	if (argc != 2) {
+		printf("Usage: %s <aliases_file>\n", argv[0]);
+		return 1;
+	}
+
+	assert(parserutils_initialise(argv[1], myrealloc, NULL) ==
+			PARSERUTILS_OK);
+
+	input = parserutils_filter_create("UTF-8", myrealloc, NULL);
+	assert(input);
+
+	parserutils_filter_destroy(input);
+
+	assert(parserutils_finalise(myrealloc, NULL) == PARSERUTILS_OK);
+
+	printf("PASS\n");
+
+	return 0;
+}
diff --git a/test/regression/stream-nomem.c b/test/regression/stream-nomem.c
new file mode 100644
index 0000000..f62b392
--- /dev/null
+++ b/test/regression/stream-nomem.c
@@ -0,0 +1,94 @@
+/* Regression test: append a buffer slightly larger than 4k whose U+FFFD
+ * sequence (EF BF BD) starts at offset 4095, then drain the stream to EOF.
+ *
+ * argv[1] is handed to parserutils_initialise().
+ */
+#include <stdio.h>
+#include <string.h>
+
+#include <parserutils/parserutils.h>
+#include <parserutils/input/inputstream.h>
+
+#include "utils/utils.h"
+
+#include "testutils.h"
+
+/* Allocator callback for the library: a thin wrapper around realloc() */
+static void *myrealloc(void *ptr, size_t len, void *pw)
+{
+	UNUSED(pw);
+
+	return realloc(ptr, len);
+}
+
+int main(int argc, char **argv)
+{
+	parserutils_inputstream *stream;
+
+	/* This is specially calculated so that the inputstream is forced to
+	 * reallocate (it assumes that the inputstream's buffer chunk size
+	 * is 4k) */
+#define BUFFER_SIZE (4096 + 4)
+	uint8_t input_buffer[BUFFER_SIZE];
+//	uint8_t *buffer;
+//	size_t buflen;
+	uintptr_t c;
+	size_t clen;
+
+	if (argc != 2) {
+		printf("Usage: %s <aliases_file>\n", argv[0]);
+		return 1;
+	}
+
+	/* Populate the buffer with something sane */
+	memset(input_buffer, 'a', BUFFER_SIZE);
+	/* Now, set up our test data: the buffer ends "123" EF BF BD "45" */
+	input_buffer[BUFFER_SIZE - 1] = '5';
+	input_buffer[BUFFER_SIZE - 2] = '4';
+	input_buffer[BUFFER_SIZE - 3] = '\xbd';
+	input_buffer[BUFFER_SIZE - 4] = '\xbf';
+	/* This byte sits at offset 4095, the last byte of the first 4k,
+	 * and thus causes the entirety of U+FFFD to be buffered until
+	 * after the buffer has been enlarged */
+	input_buffer[BUFFER_SIZE - 5] = '\xef';
+	input_buffer[BUFFER_SIZE - 6] = '3';
+	input_buffer[BUFFER_SIZE - 7] = '2';
+	input_buffer[BUFFER_SIZE - 8] = '1';
+
+	assert(parserutils_initialise(argv[1], myrealloc, NULL) ==
+			PARSERUTILS_OK);
+
+	stream = parserutils_inputstream_create("UTF-8", 0,
+			NULL, myrealloc, NULL);
+	assert(stream != NULL);
+
+	assert(parserutils_inputstream_append(stream,
+			input_buffer, BUFFER_SIZE) == PARSERUTILS_OK);
+
+	assert(parserutils_inputstream_append(stream, NULL, 0) ==
+			PARSERUTILS_OK);
+
+	while ((c = parserutils_inputstream_peek(stream, 0, &clen)) !=
+			PARSERUTILS_INPUTSTREAM_EOF)
+		parserutils_inputstream_advance(stream, clen);
+
+/* NOTE(review): disabled check, written against css_inputstream_* names;
+ * kept for reference only.
+ *
+	assert(css_inputstream_claim_buffer(stream, &buffer, &buflen) ==
+			CSS_OK);
+
+	assert(buflen == BUFFER_SIZE);
+
+	printf("Buffer: '%.*s'\n", 8, buffer + (BUFFER_SIZE - 8));
+
+	assert( buffer[BUFFER_SIZE - 6] == '3' &&
+			buffer[BUFFER_SIZE - 5] == (uint8_t) '\xef' &&
+			buffer[BUFFER_SIZE - 4] == (uint8_t) '\xbf' &&
+			buffer[BUFFER_SIZE - 3] == (uint8_t) '\xbd' &&
+			buffer[BUFFER_SIZE - 2] == '4');
+
+	free(buffer);
+*/
+
+	parserutils_inputstream_destroy(stream);
+
+	assert(parserutils_finalise(myrealloc, NULL) == PARSERUTILS_OK);
+
+	printf("PASS\n");
+
+	return 0;
+}
+
diff --git a/test/testrunner.pl b/test/testrunner.pl
new file mode 100644
index 0000000..1c6c66d
--- /dev/null
+++ b/test/testrunner.pl
@@ -0,0 +1,167 @@
+#!/bin/perl
+#
+# Testcase runner
+#
+# Usage: testrunner <directory> [<exeext>]
+#
+# Operates upon INDEX files described in the README.
+# Locates and executes testcases, feeding data files to programs
+# as appropriate.
+# Logs testcase output to file.
+# Aborts test sequence on detection of error.
+#
+
+use warnings;
+use strict;
+use File::Spec;
+use IPC::Open3;
+
+if (@ARGV < 1) {
+	print "Usage: testrunner.pl <directory> [<exeext>]\n";
+	# Being invoked wrongly is an error; do not report success
+	exit(1);
+}
+
+# Get directory
+my $directory = shift @ARGV;
+
+# Get EXE extension (if any)
+my $exeext = "";
+$exeext = shift @ARGV if (@ARGV > 0);
+
+# Open log file and /dev/null
+open(LOG, ">$directory/log") or die "Failed opening test log";
+open(NULL, "+<", File::Spec->devnull) or die "Failed opening /dev/null";
+
+# Open testcase index
+open(TINDEX, "<$directory/INDEX") or die "Failed opening test INDEX";
+
+# Parse testcase index, looking for testcases
+# Each entry is tab-separated: binary, description, optional data directory
+while (my $line = <TINDEX>) {
+	# Skip blank lines and comments
+	next if ($line =~ /^(#.*)?$/);
+
+	# Found one; decompose
+	(my $test, my $desc, my $data) = split /\t+/, $line;
+
+	# Strip whitespace
+	$test =~ s/^\s+|\s+$//g;
+	$desc =~ s/^\s+|\s+$//g;
+	$data =~ s/^\s+|\s+$//g if ($data);
+
+	# Append EXE extension to binary name
+	$test = $test . $exeext;
+
+	print "Test: $desc\n";
+
+	my $pid;
+
+	if ($data) {
+		# Testcase has external data files
+
+		# Open datafile index
+		open(DINDEX, "<$directory/data/$data/INDEX") or
+			die "Failed opening $directory/data/$data/INDEX";
+
+		# Parse datafile index, looking for datafiles
+		while (my $dentry = <DINDEX>) {
+			next if ($dentry =~ /^(#.*)?$/);
+
+			# Found one; decompose
+			(my $dtest, my $ddesc) = split /\t+/, $dentry;
+
+			# Strip whitespace
+			$dtest =~ s/^\s+|\s+$//g;
+			$ddesc =~ s/^\s+|\s+$//g;
+
+			print LOG "Running $directory/$test " .
+				"$directory/data/Aliases " .
+				"$directory/data/$data/$dtest\n";
+
+			# Make message fit on an 80 column terminal
+			my $msg = " ==> $test [$data/$dtest]";
+			$msg = $msg . "." x (80 - length($msg) - 8);
+
+			print $msg;
+
+			# Run testcase
+			# NOTE(review): "&<NULL" is kept as found. The documented
+			# "<&NULL" form makes open3 close NULL in the parent, which
+			# would break every testcase after the first.
+			$pid = open3("&<NULL", \*OUT, \*ERR,
+				"$directory/$test",
+				"$directory/data/Aliases",
+				"$directory/data/$data/$dtest");
+
+			# Stays empty if the testcase prints nothing at all
+			my $last = "";
+
+			# Marshal testcase output to log file
+			while (my $output = <OUT>) {
+				print LOG " $output";
+				$last = $output;
+			}
+
+			# Wait for child to finish
+			waitpid($pid, 0);
+
+			# The verdict is the first four characters of the last
+			# line printed (testcases print "PASS" or "FAIL - ...")
+			print substr($last, 0, 4) . "\n";
+
+			# Bail, noisily, on failure
+			if (substr($last, 0, 4) eq "FAIL") {
+				# Write any stderr output to the log
+				while (my $errors = <ERR>) {
+					print LOG " $errors";
+				}
+
+				print "\n\nFailure detected: " .
+					"consult log file\n\n\n";
+
+				exit(1);
+			}
+		}
+
+		close(DINDEX);
+	} else {
+		# Testcase has no external data files
+		print LOG "Running $directory/$test $directory/data/Aliases\n";
+
+		# Make message fit on an 80 column terminal
+		my $msg = " ==> $test";
+		$msg = $msg . "." x (80 - length($msg) - 8);
+
+		print $msg;
+
+		# Run testcase (see the note on "&<NULL" in the branch above)
+		$pid = open3("&<NULL", \*OUT, \*ERR,
+			"$directory/$test",
+			"$directory/data/Aliases");
+
+		# Stays empty if the testcase prints nothing at all
+		my $last = "";
+
+		# Marshal testcase output to log file
+		while (my $output = <OUT>) {
+			print LOG " $output";
+			$last = $output;
+		}
+
+		# Wait for child to finish
+		waitpid($pid, 0);
+
+		print substr($last, 0, 4) . "\n";
+
+		# Bail, noisily, on failure
+		if (substr($last, 0, 4) eq "FAIL") {
+			# Write any stderr output to the log
+			while (my $errors = <ERR>) {
+				print LOG " $errors";
+			}
+
+			print "\n\nFailure detected: " .
+				"consult log file\n\n\n";
+
+			exit(1);
+		}
+	}
+
+	print "\n";
+}
+
+# Clean up
+close(TINDEX);
+
+close(NULL);
+close(LOG);
diff --git a/test/testutils.h b/test/testutils.h
new file mode 100644
index 0000000..c91c5b8
--- /dev/null
+++ b/test/testutils.h
@@ -0,0 +1,123 @@
+#ifndef test_testutils_h_
+#define test_testutils_h_
+
+#include <stdbool.h>
+#include <stdio.h>
+#include <stdlib.h>
+
+#ifndef UNUSED
+#define UNUSED(x) ((x) = (x))
+#endif
+
+/* Redefine assert, so we can simply use the standard assert mechanism
+ * within testcases and exit with the right output for the testrunner
+ * to do the right thing.
*/
+void __assert2(const char *expr, const char *function,
+		const char *file, int line);
+
+/**
+ * Report a failed assertion in the form the testrunner looks for
+ * (a line starting "FAIL") and terminate the testcase.
+ *
+ * \param expr      Text of the expression that evaluated false
+ * \param function  Name of the enclosing function (unused)
+ * \param file      Source file name (unused)
+ * \param line      Source line number
+ */
+void __assert2(const char *expr, const char *function,
+		const char *file, int line)
+{
+	UNUSED(function);
+	UNUSED(file);
+
+	printf("FAIL - %s at line %d\n", expr, line);
+
+	exit(EXIT_FAILURE);
+}
+
+/* Unlike the standard macro, this one has no NDEBUG variant: the
+ * expression is always evaluated, so testcases may wrap calls with side
+ * effects in it. */
+#define assert(expr) \
+  ((void) ((expr) || (__assert2 (#expr, __func__, __FILE__, __LINE__), 0)))
+
+
+typedef bool (*line_func)(const char *data, size_t datalen, void *pw);
+
+static size_t parse_strlen(const char *str, size_t limit);
+bool parse_testfile(const char *filename, line_func callback, void *pw);
+size_t parse_filesize(const char *filename);
+
+/**
+ * Testcase datafile parser driver
+ *
+ * Reads the file line by line (in pieces of at most 299 bytes), skips
+ * lines that consist of a newline only, and passes each remaining piece
+ * to the callback. Stops at the first callback that returns false.
+ *
+ * \param filename  Name of file to parse
+ * \param callback  Pointer to function to handle each line of input data
+ * \param pw        Pointer to client-specific private data
+ * \return true on success, false otherwise.
+ */
+bool parse_testfile(const char *filename, line_func callback, void *pw)
+{
+	FILE *fp;
+	char buf[300];
+
+	fp = fopen(filename, "rb");
+	if (fp == NULL) {
+		printf("Failed opening %s\n", filename);
+		return false;
+	}
+
+	while (fgets(buf, sizeof buf, fp)) {
+		if (buf[0] == '\n')
+			continue;
+
+		if (!callback(buf, parse_strlen(buf, sizeof buf), pw)) {
+			fclose(fp);
+			return false;
+		}
+	}
+
+	fclose(fp);
+
+	return true;
+}
+
+/**
+ * Utility string length measurer for lines read with fgets()
+ *
+ * \param str    String to measure length of
+ * \param limit  Size of the buffer holding the string
+ * \return Length of the line including its terminating '\n'; if the line
+ *         has no '\n', the number of bytes before the terminating NUL
+ */
+size_t parse_strlen(const char *str, size_t limit)
+{
+	size_t len = 0;
+
+	/* limit - 1 below would wrap around for an empty buffer */
+	if (str == NULL || limit == 0)
+		return 0;
+
+	/* Never look beyond the terminating NUL: a final line without a
+	 * trailing newline must not be measured into stale buffer data */
+	while (len < limit - 1 && str[len] != '\n' && str[len] != '\0')
+		len++;
+
+	/* Count the newline itself when the line has one */
+	if (str[len] == '\n')
+		len++;
+
+	return len;
+}
+
+/**
+ * Read the size of a file
+ *
+ * \param filename  Name of file to read size of
+ * \return File size (in bytes), or 0 on error
+ */
+size_t parse_filesize(const char *filename)
+{
+	FILE *fp;
+	long pos = -1;
+
+	fp = fopen(filename, "rb");
+	if (fp == NULL) {
+		printf("Failed opening %s\n", filename);
+		return 0;
+	}
+
+	/* ftell() returns -1 on failure; keep that out of the size_t result */
+	if (fseek(fp, 0, SEEK_END) == 0)
+		pos = ftell(fp);
+
+	fclose(fp);
+
+	return pos < 0 ? 0 : (size_t) pos;
+}
+
+
+#endif
-- cgit v1.2.3