From 2777a04ed2ba4fd36138b991d66a32a283361f7e Mon Sep 17 00:00:00 2001 From: John Mark Bell Date: Thu, 1 May 2008 16:34:46 +0000 Subject: Import parser construction utility library svn path=/trunk/libparserutils/; revision=4111 --- include/parserutils/charset/codec.h | 114 +++++++++++++++++++++++++ include/parserutils/charset/mibenum.h | 24 ++++++ include/parserutils/charset/utf16.h | 38 +++++++++ include/parserutils/charset/utf8.h | 38 +++++++++ include/parserutils/errors.h | 29 +++++++ include/parserutils/functypes.h | 21 +++++ include/parserutils/input/inputstream.h | 143 ++++++++++++++++++++++++++++++++ include/parserutils/parserutils.h | 23 +++++ include/parserutils/types.h | 15 ++++ include/parserutils/utils/buffer.h | 39 +++++++++ 10 files changed, 484 insertions(+) create mode 100644 include/parserutils/charset/codec.h create mode 100644 include/parserutils/charset/mibenum.h create mode 100644 include/parserutils/charset/utf16.h create mode 100644 include/parserutils/charset/utf8.h create mode 100644 include/parserutils/errors.h create mode 100644 include/parserutils/functypes.h create mode 100644 include/parserutils/input/inputstream.h create mode 100644 include/parserutils/parserutils.h create mode 100644 include/parserutils/types.h create mode 100644 include/parserutils/utils/buffer.h (limited to 'include/parserutils') diff --git a/include/parserutils/charset/codec.h b/include/parserutils/charset/codec.h new file mode 100644 index 0000000..ca98db5 --- /dev/null +++ b/include/parserutils/charset/codec.h @@ -0,0 +1,114 @@ +/* + * This file is part of LibParserUtils. 
+ * Licensed under the MIT License, + * http://www.opensource.org/licenses/mit-license.php + * Copyright 2007 John-Mark Bell + */ + +#ifndef parserutils_charset_codec_h_ +#define parserutils_charset_codec_h_ + +#include + +#include +#include + +/** Charset codec object; it is only forward-declared in this header, so its layout is opaque to users */ +typedef struct parserutils_charset_codec parserutils_charset_codec; + +/* NOTE(review): all-ones 32-bit sentinel; nothing in this header uses it. Presumably marks an invalid or absent value - confirm against the implementation */ +#define PARSERUTILS_CHARSET_CODEC_NULL (0xffffffffU) + +/** + * Charset codec error mode + * + * A codec's error mode determines its behaviour in the face of: + * + * + characters which are unrepresentable in the destination charset (if + * encoding data) or which cannot be converted to UCS4 (if decoding data). + * + invalid byte sequences (both encoding and decoding) + * + * The options provide a choice between the following approaches: + * + * + draconian, "stop processing" ("strict") + * + "replace the unrepresentable character with something else" ("loose") + * + "attempt to transliterate, or replace if unable" ("translit") + * + * The default error mode is "loose". + * + * + * In the "loose" case, the replacement character will depend upon: + * + * + Whether the operation was encoding or decoding + * + If encoding, what the destination charset is. + * + * If decoding, the replacement character will be: + * + * U+FFFD (REPLACEMENT CHARACTER) + * + * If encoding, the replacement character will be: + * + * U+003F (QUESTION MARK) if the destination charset is not UTF-(8|16|32) + * U+FFFD (REPLACEMENT CHARACTER) otherwise. + * + * + * In the "translit" case, the codec will attempt to transliterate into + * the destination charset, if encoding. If decoding, or if transliteration + * fails, this option is identical to "loose". 
+ */ +typedef enum parserutils_charset_codec_errormode { + /** Abort processing if unrepresentable character encountered */ + PARSERUTILS_CHARSET_CODEC_ERROR_STRICT = 0, + /** Replace unrepresentable characters with single alternate */ + PARSERUTILS_CHARSET_CODEC_ERROR_LOOSE = 1, + /** Transliterate unrepresentable characters, if possible */ + PARSERUTILS_CHARSET_CODEC_ERROR_TRANSLIT = 2, +} parserutils_charset_codec_errormode; + +/** + * Charset codec option types + * + * Passed to parserutils_charset_codec_setopt alongside a + * parserutils_charset_codec_optparams. + */ +typedef enum parserutils_charset_codec_opttype { + /** Set codec error mode */ + PARSERUTILS_CHARSET_CODEC_ERROR_MODE = 1, +} parserutils_charset_codec_opttype; + +/** + * Charset codec option parameters + * + * A union: currently its only member is error_mode. + */ +typedef union parserutils_charset_codec_optparams { + /** Parameters for error mode setting */ + struct { + /** The desired error handling mode */ + parserutils_charset_codec_errormode mode; + } error_mode; +} parserutils_charset_codec_optparams; + + +/* Create a charset codec + * + * \param charset Name of the charset + * \param alloc Allocation function (see parserutils_alloc) + * \param pw Pointer argument accompanying alloc + * \return The new codec. NOTE(review): the failure result is not stated in + * this header (presumably NULL) - confirm. + */ +parserutils_charset_codec *parserutils_charset_codec_create(const char *charset, + parserutils_alloc alloc, void *pw); +/* Destroy a charset codec */ +void parserutils_charset_codec_destroy(parserutils_charset_codec *codec); + +/* Configure a charset codec + * + * \param codec Codec to configure + * \param type Option to set + * \param params Parameters for that option (presumably the error_mode + * member for PARSERUTILS_CHARSET_CODEC_ERROR_MODE - confirm) + */ +parserutils_error parserutils_charset_codec_setopt( + parserutils_charset_codec *codec, + parserutils_charset_codec_opttype type, + parserutils_charset_codec_optparams *params); + +/* Encode a chunk of UCS4 data into a codec's charset + * + * NOTE(review): source/sourcelen and dest/destlen are passed by pointer, + * presumably so the call can move them past the data consumed and produced - + * confirm against the implementation. + */ +parserutils_error parserutils_charset_codec_encode( + parserutils_charset_codec *codec, + const uint8_t **source, size_t *sourcelen, + uint8_t **dest, size_t *destlen); + +/* Decode a chunk of data in a codec's charset into UCS4 + * + * NOTE(review): takes the same pointer-to-pointer arguments as + * parserutils_charset_codec_encode; whether they are updated in place is not + * stated here - confirm against the implementation. + */ +parserutils_error parserutils_charset_codec_decode( + parserutils_charset_codec *codec, + const uint8_t **source, size_t *sourcelen, + uint8_t **dest, size_t *destlen); + +/* Reset a charset codec */ +parserutils_error parserutils_charset_codec_reset( + parserutils_charset_codec *codec); + +#endif 
diff --git a/include/parserutils/charset/mibenum.h b/include/parserutils/charset/mibenum.h new file mode 100644 index 0000000..8b3ac9d --- /dev/null +++ b/include/parserutils/charset/mibenum.h @@ -0,0 +1,24 @@ +/* + * This file is part of LibParserUtils. + * Licensed under the MIT License, + * http://www.opensource.org/licenses/mit-license.php + * Copyright 2007 John-Mark Bell + */ + +#ifndef parserutils_charset_mibenum_h_ +#define parserutils_charset_mibenum_h_ + +#include +#include + +#include +#include + +/* Convert an encoding alias to a MIB enum value + * + * \param alias Alias to look up + * \param len NOTE(review): presumably the length of alias in bytes - confirm + * + * NOTE(review): the value returned for an unknown alias is not stated in this + * header - confirm against the implementation. + */ +uint16_t parserutils_charset_mibenum_from_name(const char *alias, size_t len); +/* Convert a MIB enum value into an encoding alias */ +const char *parserutils_charset_mibenum_to_name(uint16_t mibenum); +/* Determine if a MIB enum value represents a Unicode variant */ +bool parserutils_charset_mibenum_is_unicode(uint16_t mibenum); + +#endif diff --git a/include/parserutils/charset/utf16.h b/include/parserutils/charset/utf16.h new file mode 100644 index 0000000..6569d6e --- /dev/null +++ b/include/parserutils/charset/utf16.h @@ -0,0 +1,38 @@ +/* + * This file is part of LibParserUtils. + * Licensed under the MIT License, + * http://www.opensource.org/licenses/mit-license.php + * Copyright 2007 John-Mark Bell + */ + +/** \file + * UTF-16 manipulation functions (interface). 
+ */ + +#ifndef parserutils_charset_utf16_h_ +#define parserutils_charset_utf16_h_ + +#include + +#include + +/* Every function in this header returns a parserutils_error status. + * NOTE(review): the one-line purposes given below are inferred from the + * function names only - confirm against the implementation. + */ + +/* Conversion between a UTF-16 sequence and a single UCS4 value. + * NOTE(review): from_ucs4 takes uint8_t *s here, whereas + * parserutils_charset_utf8_from_ucs4 takes uint8_t **s - confirm that the + * difference is intended. + */ +parserutils_error parserutils_charset_utf16_to_ucs4(const uint8_t *s, + size_t len, uint32_t *ucs4, size_t *clen); +parserutils_error parserutils_charset_utf16_from_ucs4(uint32_t ucs4, + uint8_t *s, size_t *len); + +/* Length queries; the result is delivered through len */ +parserutils_error parserutils_charset_utf16_length(const uint8_t *s, + size_t max, size_t *len); +parserutils_error parserutils_charset_utf16_char_byte_length(const uint8_t *s, + size_t *len); + +/* Stepping to the previous / next character offset within s */ +parserutils_error parserutils_charset_utf16_prev(const uint8_t *s, + uint32_t off, uint32_t *prevoff); +parserutils_error parserutils_charset_utf16_next(const uint8_t *s, + uint32_t len, uint32_t off, uint32_t *nextoff); + +/* NOTE(review): same signature as parserutils_charset_utf16_next; how the + * "paranoid" variant differs is not stated here - confirm. + */ +parserutils_error parserutils_charset_utf16_next_paranoid(const uint8_t *s, + uint32_t len, uint32_t off, uint32_t *nextoff); + +#endif + diff --git a/include/parserutils/charset/utf8.h b/include/parserutils/charset/utf8.h new file mode 100644 index 0000000..16e012e --- /dev/null +++ b/include/parserutils/charset/utf8.h @@ -0,0 +1,38 @@ +/* + * This file is part of LibParserUtils. + * Licensed under the MIT License, + * http://www.opensource.org/licenses/mit-license.php + * Copyright 2007 John-Mark Bell + */ + +/** \file + * UTF-8 manipulation functions (interface). 
+ */ + +#ifndef parserutils_charset_utf8_h_ +#define parserutils_charset_utf8_h_ + +#include + +#include + +/* Every function in this header returns a parserutils_error status. + * NOTE(review): the one-line purposes given below are inferred from the + * function names only - confirm against the implementation. + */ + +/* Conversion between a UTF-8 sequence and a single UCS4 value */ +parserutils_error parserutils_charset_utf8_to_ucs4(const uint8_t *s, size_t len, + uint32_t *ucs4, size_t *clen); +parserutils_error parserutils_charset_utf8_from_ucs4(uint32_t ucs4, uint8_t **s, + size_t *len); + +/* Length queries; the result is delivered through len. + * parserutils_inputstream_peek (input/inputstream.h) calls char_byte_length + * on the first byte of a non-ASCII character, uses the value stored in *len + * as that character's length in bytes, and treats a PARSERUTILS_NEEDDATA + * result as a request to fall back to its slow path. + */ +parserutils_error parserutils_charset_utf8_length(const uint8_t *s, size_t max, + size_t *len); +parserutils_error parserutils_charset_utf8_char_byte_length(const uint8_t *s, + size_t *len); + +/* Stepping to the previous / next character offset within s */ +parserutils_error parserutils_charset_utf8_prev(const uint8_t *s, uint32_t off, + uint32_t *prevoff); +parserutils_error parserutils_charset_utf8_next(const uint8_t *s, uint32_t len, + uint32_t off, uint32_t *nextoff); + +/* NOTE(review): same signature as parserutils_charset_utf8_next; how the + * "paranoid" variant differs is not stated here - confirm. + */ +parserutils_error parserutils_charset_utf8_next_paranoid(const uint8_t *s, + uint32_t len, uint32_t off, uint32_t *nextoff); + +#endif + diff --git a/include/parserutils/errors.h b/include/parserutils/errors.h new file mode 100644 index 0000000..09c715c --- /dev/null +++ b/include/parserutils/errors.h @@ -0,0 +1,29 @@ +/* + * This file is part of LibParserUtils. 
+ * Licensed under the MIT License, + * http://www.opensource.org/licenses/mit-license.php + * Copyright 2007 John-Mark Bell + */ + +#ifndef parserutils_errors_h_ +#define parserutils_errors_h_ + +#include + +/** + * Status codes returned by parserutils functions. + * + * PARSERUTILS_OK is what the inline input stream code treats as success, and + * PARSERUTILS_NEEDDATA makes parserutils_inputstream_peek fall back to its + * slow path (see input/inputstream.h). + * NOTE(review): the meaning of the remaining values is implied only by their + * names - confirm against the implementation. + */ +typedef enum parserutils_error { + PARSERUTILS_OK = 0, + + PARSERUTILS_NOMEM = 1, + PARSERUTILS_BADPARM = 2, + PARSERUTILS_INVALID = 3, + PARSERUTILS_FILENOTFOUND = 4, + PARSERUTILS_NEEDDATA = 5, +} parserutils_error; + +/* Convert a parserutils error value to a string */ +const char *parserutils_error_to_string(parserutils_error error); +/* Convert a string to a parserutils error value + * (NOTE(review): len is presumably the length of str in bytes - confirm) */ +parserutils_error parserutils_error_from_string(const char *str, size_t len); + +#endif + diff --git a/include/parserutils/functypes.h b/include/parserutils/functypes.h new file mode 100644 index 0000000..703a329 --- /dev/null +++ b/include/parserutils/functypes.h @@ -0,0 +1,21 @@ +/* + * This file is part of LibParserUtils. + * Licensed under the MIT License, + * http://www.opensource.org/licenses/mit-license.php + * Copyright 2007-8 John-Mark Bell + */ + +#ifndef parserutils_functypes_h_ +#define parserutils_functypes_h_ + +#include +#include +#include + +#include + +/* Type of allocation function for parserutils + * + * NOTE(review): the (ptr, size, pw) shape suggests realloc-like behaviour + * with pw as caller-supplied context, but this header does not say so - + * confirm against the implementation. + */ +typedef void *(*parserutils_alloc)(void *ptr, size_t size, void *pw); + +#endif + diff --git a/include/parserutils/input/inputstream.h b/include/parserutils/input/inputstream.h new file mode 100644 index 0000000..2b0c407 --- /dev/null +++ b/include/parserutils/input/inputstream.h @@ -0,0 +1,143 @@ +/* + * This file is part of LibParserUtils. 
+ * Licensed under the MIT License, + * http://www.opensource.org/licenses/mit-license.php + * Copyright 2007 John-Mark Bell + */ + +#ifndef parserutils_input_inputstream_h_ +#define parserutils_input_inputstream_h_ + +#include +#include +#include + +#include +#include +#include +#include +#include + +/** + * Type of charset detection function + * + * NOTE(review): mibenum and source are non-const pointers, presumably outputs + * for the detected charset and where it came from - confirm. + */ +typedef parserutils_error (*parserutils_charset_detect_func)( + const uint8_t *data, size_t len, + uint16_t *mibenum, uint32_t *source); + +/** + * Input stream object + * + * The inline peek/advance functions in this header read utf8->data and + * utf8->length directly and move cursor. + */ +typedef struct parserutils_inputstream +{ + parserutils_buffer *utf8; /**< Buffer containing utf8 data */ + + uint32_t cursor; /**< Byte offset of current position (into utf8->data) */ + + bool had_eof; /**< Whether EOF has been reached */ +} parserutils_inputstream; + +/* EOF pseudo-character (documented as a possible result of parserutils_inputstream_peek) */ +#define PARSERUTILS_INPUTSTREAM_EOF (0xFFFFFFFFU) +/* Out-of-data indicator (returned by parserutils_inputstream_peek in place of a pointer) */ +#define PARSERUTILS_INPUTSTREAM_OOD (0xFFFFFFFEU) +/* NOTE(review): both values are 32-bit constants handed back through a uintptr_t alongside real pointers; where uintptr_t is wider than 32 bits they are ordinary in-range values, so telling them apart from data pointers relies on no buffer ever living at those addresses. */ + +/* Create an input stream */ +parserutils_inputstream *parserutils_inputstream_create(const char *enc, + uint32_t encsrc, parserutils_charset_detect_func csdetect, + parserutils_alloc alloc, void *pw); +/* Destroy an input stream */ +void parserutils_inputstream_destroy(parserutils_inputstream *stream); + +/* Append data to an input stream */ +parserutils_error parserutils_inputstream_append( + parserutils_inputstream *stream, + const uint8_t *data, size_t len); +/* Insert data into stream at current location */ +parserutils_error parserutils_inputstream_insert( + parserutils_inputstream *stream, + const uint8_t *data, size_t len); + +/* Slow form of parserutils_inputstream_peek. 
*/ +uintptr_t parserutils_inputstream_peek_slow(parserutils_inputstream *stream, + size_t offset, size_t *length); + +/* Look at the character in the stream that starts at + * offset bytes from the cursor + * + * \param stream Stream to look in + * \param offset Byte offset of start of character + * \param length Pointer to location to receive character length (in bytes) + * \return Pointer to character data, or EOF or OOD. + * + * Once the character pointed to by the result of this call has been advanced + * past (i.e. parserutils_inputstream_advance has caused the stream cursor to + * pass over the character), then no guarantee is made as to the validity of + * the data pointed to. Thus, any attempt to dereference the pointer after + * advancing past the data it points to is a bug. + */ +static inline uintptr_t parserutils_inputstream_peek( + parserutils_inputstream *stream, size_t offset, size_t *length) +{ + parserutils_error error = PARSERUTILS_OK; + size_t len; + + if (stream == NULL) + return PARSERUTILS_INPUTSTREAM_OOD; + +#define IS_ASCII(x) (((x) & 0x80) == 0) + + if (stream->cursor + offset < stream->utf8->length) { + if (IS_ASCII(stream->utf8->data[stream->cursor + offset])) { + len = 1; + } else { + error = parserutils_charset_utf8_char_byte_length( + stream->utf8->data + stream->cursor + offset, + &len); + + if (error != PARSERUTILS_OK && + error != PARSERUTILS_NEEDDATA) + return PARSERUTILS_INPUTSTREAM_OOD; + } + } + +#undef IS_ASCII + + if (stream->cursor + offset == stream->utf8->length || + error == PARSERUTILS_NEEDDATA) { + return parserutils_inputstream_peek_slow(stream, + offset, length); + } + + *length = len; + + return (uintptr_t) (stream->utf8->data + stream->cursor + offset); +} + +/** + * Advance the stream's current position + * + * \param stream The stream whose position to advance + * \param bytes The number of bytes to advance + */ +static inline void parserutils_inputstream_advance( + parserutils_inputstream *stream, size_t bytes) +{ 
+ if (stream == NULL) + return; + + if (bytes > stream->utf8->length - stream->cursor) + abort(); + + if (stream->cursor == stream->utf8->length) + return; + + stream->cursor += bytes; +} + +/* Read the document charset */ +const char *parserutils_inputstream_read_charset( + parserutils_inputstream *stream, uint32_t *source); + +#endif + diff --git a/include/parserutils/parserutils.h b/include/parserutils/parserutils.h new file mode 100644 index 0000000..460e80c --- /dev/null +++ b/include/parserutils/parserutils.h @@ -0,0 +1,23 @@ +/* + * This file is part of LibParserUtils. + * Licensed under the MIT License, + * http://www.opensource.org/licenses/mit-license.php + * Copyright 2007 John-Mark Bell + */ + +#ifndef parserutils_parserutils_h_ +#define parserutils_parserutils_h_ + +#include +#include +#include + +/* Initialise the ParserUtils library for use */ +parserutils_error parserutils_initialise(const char *aliases_file, + parserutils_alloc alloc, void *pw); + +/* Clean up after ParserUtils */ +parserutils_error parserutils_finalise(parserutils_alloc alloc, void *pw); + +#endif + diff --git a/include/parserutils/types.h b/include/parserutils/types.h new file mode 100644 index 0000000..b36e4aa --- /dev/null +++ b/include/parserutils/types.h @@ -0,0 +1,15 @@ +/* + * This file is part of LibParserUtils. + * Licensed under the MIT License, + * http://www.opensource.org/licenses/mit-license.php + * Copyright 2007 John-Mark Bell + */ + +#ifndef parserutils_types_h_ +#define parserutils_types_h_ + +#include +#include + +#endif + diff --git a/include/parserutils/utils/buffer.h b/include/parserutils/utils/buffer.h new file mode 100644 index 0000000..f3a1883 --- /dev/null +++ b/include/parserutils/utils/buffer.h @@ -0,0 +1,39 @@ +/* + * This file is part of LibParserUtils. 
+ * Licensed under the MIT License, + * http://www.opensource.org/licenses/mit-license.php + * Copyright 2008 John-Mark Bell + */ + +#ifndef parserutils_utils_buffer_h_ +#define parserutils_utils_buffer_h_ + +#include +#include + +/** + * Byte buffer object + * + * The layout is public: the inline functions in input/inputstream.h read + * data and length directly. + */ +struct parserutils_buffer +{ + uint8_t *data; /**< Buffered bytes */ + size_t length; /**< Number of bytes of data in use (the bound the input stream checks before indexing data) */ + size_t allocated; /**< NOTE(review): presumably the capacity of data in bytes - confirm */ + + parserutils_alloc alloc; /**< Allocation function (same type as taken by parserutils_buffer_create) */ + void *pw; /**< Pointer argument accompanying alloc */ +}; +typedef struct parserutils_buffer parserutils_buffer; + +/* Create / destroy a buffer. + * NOTE(review): the result of create on failure is not stated here + * (presumably NULL) - confirm. + */ +parserutils_buffer *parserutils_buffer_create(parserutils_alloc alloc, + void *pw); +void parserutils_buffer_destroy(parserutils_buffer *buffer); + +/* Add len bytes at the end (append) or at offset (insert), or remove len + * bytes starting at offset (discard). + * NOTE(review): summaries inferred from the names and signatures - confirm + * against the implementation. + */ +parserutils_error parserutils_buffer_append(parserutils_buffer *buffer, + const uint8_t *data, size_t len); +parserutils_error parserutils_buffer_insert(parserutils_buffer *buffer, + size_t offset, const uint8_t *data, size_t len); +parserutils_error parserutils_buffer_discard(parserutils_buffer *buffer, + size_t offset, size_t len); + +/* NOTE(review): presumably enlarges the buffer's storage; by how much is not stated here - confirm */ +parserutils_error parserutils_buffer_grow(parserutils_buffer *buffer); + +#endif + -- cgit v1.2.3