From 2777a04ed2ba4fd36138b991d66a32a283361f7e Mon Sep 17 00:00:00 2001 From: John Mark Bell Date: Thu, 1 May 2008 16:34:46 +0000 Subject: Import parser construction utility library svn path=/trunk/libparserutils/; revision=4111 --- src/input/Makefile | 46 +++++ src/input/filter.c | 384 ++++++++++++++++++++++++++++++++++++++ src/input/filter.h | 57 ++++++ src/input/inputstream.c | 477 ++++++++++++++++++++++++++++++++++++++++++++++++ 4 files changed, 964 insertions(+) create mode 100644 src/input/Makefile create mode 100644 src/input/filter.c create mode 100644 src/input/filter.h create mode 100644 src/input/inputstream.c (limited to 'src/input') diff --git a/src/input/Makefile b/src/input/Makefile new file mode 100644 index 0000000..d62740e --- /dev/null +++ b/src/input/Makefile @@ -0,0 +1,46 @@ +# Child makefile fragment +# +# Toolchain is provided by top-level makefile +# +# Variables provided by top-level makefile +# +# COMPONENT The name of the component +# EXPORT The location of the export directory +# TOP The location of the source tree root +# RELEASEDIR The place to put release objects +# DEBUGDIR The place to put debug objects +# +# do_include Canned command sequence to include a child makefile +# +# Variables provided by parent makefile: +# +# DIR The name of the directory we're in, relative to $(TOP) +# +# Variables we can manipulate: +# +# ITEMS_CLEAN The list of items to remove for "make clean" +# ITEMS_DISTCLEAN The list of items to remove for "make distclean" +# TARGET_TESTS The list of target names to run for "make test" +# +# SOURCES The list of sources to build for $(COMPONENT) +# +# Plus anything from the toolchain + +# Push parent directory onto the directory stack +sp := $(sp).x +dirstack_$(sp) := $(d) +d := $(DIR) + +# Sources +SRCS_$(d) := filter.c inputstream.c + +# Append to sources for component +SOURCES += $(addprefix $(d), $(SRCS_$(d))) + +# Now include any children we may have +MAKE_INCLUDES := $(wildcard $(d)*/Makefile) +$(eval 
$(foreach INC, $(MAKE_INCLUDES), $(call do_include,$(INC)))) + +# Finally, pop off the directory stack +d := $(dirstack_$(sp)) +sp := $(basename $(sp)) diff --git a/src/input/filter.c b/src/input/filter.c new file mode 100644 index 0000000..f40c98f --- /dev/null +++ b/src/input/filter.c @@ -0,0 +1,384 @@ +/* + * This file is part of LibParserUtils. + * Licensed under the MIT License, + * http://www.opensource.org/licenses/mit-license.php + * Copyright 2007 John-Mark Bell + */ + +#include +#include +#include +#include + +#ifdef WITH_ICONV_FILTER +#include +#endif + +#include +#include + +#include "input/filter.h" +#include "utils/utils.h" + +/** Input filter */ +struct parserutils_filter { +#ifdef WITH_ICONV_FILTER + iconv_t cd; /**< Iconv conversion descriptor */ + uint16_t int_enc; /**< The internal encoding */ +#else + parserutils_charset_codec *read_codec; /**< Read codec */ + parserutils_charset_codec *write_codec; /**< Write codec */ + + uint32_t pivot_buf[64]; /**< Conversion pivot buffer */ + + bool leftover; /**< Data remains from last call */ + uint8_t *pivot_left; /**< Remaining pivot to write */ + size_t pivot_len; /**< Length of pivot remaining */ +#endif + + struct { + uint16_t encoding; /**< Input encoding */ + } settings; /**< Filter settings */ + + parserutils_alloc alloc; /**< Memory (de)allocation function */ + void *pw; /**< Client private data */ +}; + +static parserutils_error filter_set_defaults(parserutils_filter *input); +static parserutils_error filter_set_encoding(parserutils_filter *input, + const char *enc); + +/** + * Create an input filter + * + * \param int_enc Desired encoding of document + * \param alloc Function used to (de)allocate data + * \param pw Pointer to client-specific private data (may be NULL) + * \return Pointer to filter instance, or NULL on failure + */ +parserutils_filter *parserutils_filter_create(const char *int_enc, + parserutils_alloc alloc, void *pw) +{ + parserutils_filter *filter; + + if (int_enc == NULL || 
alloc == NULL) + return NULL; + + filter = alloc(NULL, sizeof(*filter), pw); + if (!filter) + return NULL; + +#ifdef WITH_ICONV_FILTER + filter->cd = (iconv_t) -1; + filter->int_enc = parserutils_charset_mibenum_from_name( + int_enc, strlen(int_enc)); + if (filter->int_enc == 0) { + alloc(filter, 0, pw); + return NULL; + } +#else + filter->leftover = false; + filter->pivot_left = NULL; + filter->pivot_len = 0; +#endif + + filter->alloc = alloc; + filter->pw = pw; + + if (filter_set_defaults(filter) != PARSERUTILS_OK) { + filter->alloc(filter, 0, pw); + return NULL; + } + +#ifndef WITH_ICONV_FILTER + filter->write_codec = + parserutils_charset_codec_create(int_enc, alloc, pw); + if (filter->write_codec == NULL) { + if (filter->read_codec != NULL) + parserutils_charset_codec_destroy(filter->read_codec); + filter->alloc(filter, 0, pw); + return NULL; + } +#endif + + return filter; +} + +/** + * Destroy an input filter + * + * \param input Pointer to filter instance + */ +void parserutils_filter_destroy(parserutils_filter *input) +{ + if (input == NULL) + return; + +#ifdef WITH_ICONV_FILTER + if (input->cd != (iconv_t) -1) + iconv_close(input->cd); +#else + if (input->read_codec != NULL) + parserutils_charset_codec_destroy(input->read_codec); + + if (input->write_codec != NULL) + parserutils_charset_codec_destroy(input->write_codec); +#endif + + input->alloc(input, 0, input->pw); + + return; +} + +/** + * Configure an input filter + * + * \param input Pointer to filter instance + * \param type Input option type to configure + * \param params Option-specific parameters + * \return PARSERUTILS_OK on success, appropriate error otherwise + */ +parserutils_error parserutils_filter_setopt(parserutils_filter *input, + parserutils_filter_opttype type, + parserutils_filter_optparams *params) +{ + parserutils_error error = PARSERUTILS_OK; + + if (input == NULL || params == NULL) + return PARSERUTILS_BADPARM; + + switch (type) { + case PARSERUTILS_FILTER_SET_ENCODING: + error = 
filter_set_encoding(input, params->encoding.name); + break; + } + + return error; +} + +/** + * Process a chunk of data + * + * \param input Pointer to filter instance + * \param data Pointer to pointer to input buffer + * \param len Pointer to length of input buffer + * \param output Pointer to pointer to output buffer + * \param outlen Pointer to length of output buffer + * \return PARSERUTILS_OK on success, appropriate error otherwise + * + * Call this with an input buffer length of 0 to flush any buffers. + */ +parserutils_error parserutils_filter_process_chunk(parserutils_filter *input, + const uint8_t **data, size_t *len, + uint8_t **output, size_t *outlen) +{ + if (input == NULL || data == NULL || *data == NULL || len == NULL || + output == NULL || *output == NULL || outlen == NULL) + return PARSERUTILS_BADPARM; + +#ifdef WITH_ICONV_FILTER + if (iconv(input->cd, (char **) data, len, + (char **) output, outlen) == (size_t) -1) { + switch (errno) { + case E2BIG: + return PARSERUTILS_NOMEM; + case EILSEQ: + if (*outlen < 3) + return PARSERUTILS_NOMEM; + + (*output)[0] = 0xef; + (*output)[1] = 0xbf; + (*output)[2] = 0xbd; + + *output += 3; + *outlen -= 3; + + (*data)++; + (*len)--; + + while (*len > 0) { + size_t ret; + + ret = iconv(input->cd, (char **) data, len, + (char **) output, outlen); + if (ret != (size_t) -1 || errno != EILSEQ) + break; + + (*data)++; + (*len)--; + } + + return errno == E2BIG ? PARSERUTILS_NOMEM + : PARSERUTILS_OK; + } + } + + return PARSERUTILS_OK; +#else + parserutils_error read_error, write_error; + + if (input->leftover) { + /* Some data left to be written from last call */ + + /* Attempt to flush the remaining data. 
*/ + write_error = parserutils_charset_codec_encode( + input->write_codec, + (const uint8_t **) &input->pivot_left, + &input->pivot_len, + output, outlen); + + if (write_error != PARSERUTILS_OK) + return write_error; + + + /* And clear leftover */ + input->pivot_left = NULL; + input->pivot_len = 0; + input->leftover = false; + } + + while (*len > 0) { + size_t pivot_len = sizeof(input->pivot_buf); + uint8_t *pivot = (uint8_t *) input->pivot_buf; + + read_error = parserutils_charset_codec_decode(input->read_codec, + data, len, + (uint8_t **) &pivot, &pivot_len); + + pivot = (uint8_t *) input->pivot_buf; + pivot_len = sizeof(input->pivot_buf) - pivot_len; + + if (pivot_len > 0) { + write_error = parserutils_charset_codec_encode( + input->write_codec, + (const uint8_t **) &pivot, + &pivot_len, + output, outlen); + + if (write_error != PARSERUTILS_OK) { + input->leftover = true; + input->pivot_left = pivot; + input->pivot_len = pivot_len; + + return write_error; + } + } + + if (read_error != PARSERUTILS_OK && + read_error != PARSERUTILS_NOMEM) + return read_error; + } + + return PARSERUTILS_OK; +#endif +} + +/** + * Reset an input filter's state + * + * \param input The input filter to reset + * \return PARSERUTILS_OK on success, appropriate error otherwise + */ +parserutils_error parserutils_filter_reset(parserutils_filter *input) +{ + if (input == NULL) + return PARSERUTILS_BADPARM; + +#ifdef WITH_ICONV_FILTER + iconv(input->cd, NULL, 0, NULL, 0); +#else + parserutils_error error; + + /* Clear pivot buffer leftovers */ + input->pivot_left = NULL; + input->pivot_len = 0; + input->leftover = false; + + /* Reset read codec */ + error = parserutils_charset_codec_reset(input->read_codec); + if (error != PARSERUTILS_OK) + return error; + + /* Reset write codec */ + error = parserutils_charset_codec_reset(input->write_codec); + if (error != PARSERUTILS_OK) + return error; +#endif + + return PARSERUTILS_OK; +} + +/** + * Set an input filter's default settings + * + * \param 
input Input filter to configure + * \return PARSERUTILS_OK on success, appropriate error otherwise + */ +parserutils_error filter_set_defaults(parserutils_filter *input) +{ + parserutils_error error; + + if (input == NULL) + return PARSERUTILS_BADPARM; + +#ifndef WITH_ICONV_FILTER + input->read_codec = NULL; + input->write_codec = NULL; +#endif + + input->settings.encoding = 0; + error = filter_set_encoding(input, "UTF-8"); + if (error != PARSERUTILS_OK) + return error; + + return PARSERUTILS_OK; +} + +/** + * Set an input filter's encoding + * + * \param input Input filter to configure + * \param enc Encoding name + * \return PARSERUTILS_OK on success, appropriate error otherwise + */ +parserutils_error filter_set_encoding(parserutils_filter *input, + const char *enc) +{ + const char *old_enc; + uint16_t mibenum; + + if (input == NULL || enc == NULL) + return PARSERUTILS_BADPARM; + + mibenum = parserutils_charset_mibenum_from_name(enc, strlen(enc)); + if (mibenum == 0) + return PARSERUTILS_INVALID; + + /* Exit early if we're already using this encoding */ + if (input->settings.encoding == mibenum) + return PARSERUTILS_OK; + + old_enc = parserutils_charset_mibenum_to_name(input->settings.encoding); + if (old_enc == NULL) + old_enc = "UTF-8"; + +#ifdef WITH_ICONV_FILTER + if (input->cd != (iconv_t) -1) + iconv_close(input->cd); + + input->cd = iconv_open( + parserutils_charset_mibenum_to_name(input->int_enc), enc); +#else + if (input->read_codec != NULL) + parserutils_charset_codec_destroy(input->read_codec); + + input->read_codec = parserutils_charset_codec_create(enc, input->alloc, + input->pw); + if (input->read_codec == NULL) + return PARSERUTILS_NOMEM; +#endif + + input->settings.encoding = mibenum; + + return PARSERUTILS_OK; +} diff --git a/src/input/filter.h b/src/input/filter.h new file mode 100644 index 0000000..96941a6 --- /dev/null +++ b/src/input/filter.h @@ -0,0 +1,57 @@ +/* + * This file is part of LibParserUtils. 
+ * Licensed under the MIT License, + * http://www.opensource.org/licenses/mit-license.php + * Copyright 2007 John-Mark Bell + */ + +#ifndef parserutils_input_filter_h_ +#define parserutils_input_filter_h_ + +#include + +#include +#include + +typedef struct parserutils_filter parserutils_filter; + +/** + * Input filter option types + */ +typedef enum parserutils_filter_opttype { + PARSERUTILS_FILTER_SET_ENCODING = 0, +} parserutils_filter_opttype; + +/** + * Input filter option parameters + */ +typedef union parserutils_filter_optparams { + /** Parameters for encoding setting */ + struct { + /** Encoding name */ + const char *name; + } encoding; +} parserutils_filter_optparams; + + +/* Create an input filter */ +parserutils_filter *parserutils_filter_create(const char *int_enc, + parserutils_alloc alloc, void *pw); +/* Destroy an input filter */ +void parserutils_filter_destroy(parserutils_filter *input); + +/* Configure an input filter */ +parserutils_error parserutils_filter_setopt(parserutils_filter *input, + parserutils_filter_opttype type, + parserutils_filter_optparams *params); + +/* Process a chunk of data */ +parserutils_error parserutils_filter_process_chunk(parserutils_filter *input, + const uint8_t **data, size_t *len, + uint8_t **output, size_t *outlen); + +/* Reset an input filter's state */ +parserutils_error parserutils_filter_reset(parserutils_filter *input); + +#endif + diff --git a/src/input/inputstream.c b/src/input/inputstream.c new file mode 100644 index 0000000..fd44995 --- /dev/null +++ b/src/input/inputstream.c @@ -0,0 +1,477 @@ +/* + * This file is part of LibParserUtils. 
+ * Licensed under the MIT License, + * http://www.opensource.org/licenses/mit-license.php + * Copyright 2007 John-Mark Bell + */ + +#include +#include +#include + +#include +#include +#include + +#include "input/filter.h" +#include "utils/utils.h" + +/** + * Private input stream definition + */ +typedef struct parserutils_inputstream_private { + parserutils_inputstream public; /**< Public part. Must be first */ + + parserutils_buffer *raw; /**< Buffer containing raw data */ + + bool done_first_chunk; /**< Whether the first chunk has + * been processed */ + + uint16_t mibenum; /**< MIB enum for charset, or 0 */ + uint32_t encsrc; /**< Charset source */ + + parserutils_filter *input; /**< Charset conversion filter */ + + parserutils_charset_detect_func csdetect; /**< Charset detection func.*/ + + parserutils_alloc alloc; /**< Memory (de)allocation function */ + void *pw; /**< Client private data */ +} parserutils_inputstream_private; + +static inline parserutils_error parserutils_inputstream_refill_buffer( + parserutils_inputstream_private *stream); +static inline parserutils_error parserutils_inputstream_strip_bom( + uint16_t mibenum, parserutils_buffer *buffer); + +/** + * Create an input stream + * + * \param enc Document charset, or NULL to autodetect + * \param encsrc Value for encoding source, if specified, or 0 + * \param csdetect Charset detection function, or NULL + * \param alloc Memory (de)allocation function + * \param pw Pointer to client-specific private data (may be NULL) + * \return Pointer to stream instance, or NULL on failure + * + * The value 0 is defined as being the lowest priority encoding source + * (i.e. the default fallback encoding). Beyond this, no further + * interpretation is made upon the encoding source. 
+ */ +parserutils_inputstream *parserutils_inputstream_create(const char *enc, + uint32_t encsrc, parserutils_charset_detect_func csdetect, + parserutils_alloc alloc, void *pw) +{ + parserutils_inputstream_private *stream; + + if (alloc == NULL) + return NULL; + + stream = alloc(NULL, sizeof(parserutils_inputstream_private), pw); + if (stream == NULL) + return NULL; + + stream->raw = parserutils_buffer_create(alloc, pw); + if (stream->raw == NULL) { + alloc(stream, 0, pw); + return NULL; + } + + stream->public.utf8 = parserutils_buffer_create(alloc, pw); + if (stream->public.utf8 == NULL) { + parserutils_buffer_destroy(stream->raw); + alloc(stream, 0, pw); + return NULL; + } + + stream->public.cursor = 0; + stream->public.had_eof = false; + stream->done_first_chunk = false; + + stream->input = parserutils_filter_create("UTF-8", alloc, pw); + if (stream->input == NULL) { + parserutils_buffer_destroy(stream->public.utf8); + parserutils_buffer_destroy(stream->raw); + alloc(stream, 0, pw); + return NULL; + } + + if (enc != NULL) { + parserutils_error error; + parserutils_filter_optparams params; + + stream->mibenum = + parserutils_charset_mibenum_from_name(enc, strlen(enc)); + + if (stream->mibenum != 0) { + params.encoding.name = enc; + + error = parserutils_filter_setopt(stream->input, + PARSERUTILS_FILTER_SET_ENCODING, + &params); + if (error != PARSERUTILS_OK && + error != PARSERUTILS_INVALID) { + parserutils_filter_destroy(stream->input); + parserutils_buffer_destroy(stream->public.utf8); + parserutils_buffer_destroy(stream->raw); + alloc(stream, 0, pw); + return NULL; + } + + stream->encsrc = encsrc; + } + } else { + stream->mibenum = 0; + stream->encsrc = 0; + } + + stream->csdetect = csdetect; + + stream->alloc = alloc; + stream->pw = pw; + + return (parserutils_inputstream *) stream; +} + +/** + * Destroy an input stream + * + * \param stream Input stream to destroy + */ +void parserutils_inputstream_destroy(parserutils_inputstream *stream) +{ + 
parserutils_inputstream_private *s = + (parserutils_inputstream_private *) stream; + + if (stream == NULL) + return; + + parserutils_filter_destroy(s->input); + parserutils_buffer_destroy(s->public.utf8); + parserutils_buffer_destroy(s->raw); + s->alloc(s, 0, s->pw); +} + +/** + * Append data to an input stream + * + * \param stream Input stream to append data to + * \param data Data to append (in document charset), or NULL to flag EOF + * \param len Length, in bytes, of data + * \return PARSERUTILS_OK on success, appropriate error otherwise + */ +parserutils_error parserutils_inputstream_append( + parserutils_inputstream *stream, + const uint8_t *data, size_t len) +{ + parserutils_inputstream_private *s = + (parserutils_inputstream_private *) stream; + + if (stream == NULL) + return PARSERUTILS_BADPARM; + + if (data == NULL) { + s->public.had_eof = true; + return PARSERUTILS_OK; + } + + return parserutils_buffer_append(s->raw, data, len); +} + +/** + * Insert data into stream at current location + * + * \param stream Input stream to insert into + * \param data Data to insert (UTF-8 encoded) + * \param len Length, in bytes, of data + * \return PARSERUTILS_OK on success, appropriate error otherwise + */ +parserutils_error parserutils_inputstream_insert( + parserutils_inputstream *stream, + const uint8_t *data, size_t len) +{ + parserutils_inputstream_private *s = + (parserutils_inputstream_private *) stream; + + if (stream == NULL || data == NULL) + return PARSERUTILS_BADPARM; + + return parserutils_buffer_insert(s->public.utf8, s->public.cursor, + data, len); +} + +#define IS_ASCII(x) (((x) & 0x80) == 0) + +/* Look at the character in the stream that starts at + * offset bytes from the cursor (slow version) + * + * \param stream Stream to look in + * \param offset Byte offset of start of character + * \param length Pointer to location to receive character length (in bytes) + * \return Pointer to character data, or EOF or OOD. 
+ * + * Once the character pointed to by the result of this call has been advanced + * past (i.e. parserutils_inputstream_advance has caused the stream cursor to + * pass over the character), then no guarantee is made as to the validity of + * the data pointed to. Thus, any attempt to dereference the pointer after + * advancing past the data it points to is a bug. + */ +uintptr_t parserutils_inputstream_peek_slow(parserutils_inputstream *stream, + size_t offset, size_t *length) +{ + parserutils_inputstream_private *s = + (parserutils_inputstream_private *) stream; + parserutils_error error = PARSERUTILS_OK; + size_t len; + + if (stream == NULL) + return PARSERUTILS_INPUTSTREAM_OOD; + + /* There's insufficient data in the buffer, so read some more */ + if (s->raw->length == 0) { + /* No more data to be had */ + return s->public.had_eof ? PARSERUTILS_INPUTSTREAM_EOF + : PARSERUTILS_INPUTSTREAM_OOD; + } + + /* Refill utf8 buffer from raw buffer */ + error = parserutils_inputstream_refill_buffer(s); + if (error != PARSERUTILS_OK) + return PARSERUTILS_INPUTSTREAM_OOD; + + /* Now try the read */ + if (IS_ASCII(s->public.utf8->data[s->public.cursor + offset])) { + len = 1; + } else { + error = parserutils_charset_utf8_char_byte_length( + s->public.utf8->data + s->public.cursor + offset, + &len); + + if (error != PARSERUTILS_OK && error != PARSERUTILS_NEEDDATA) + return PARSERUTILS_INPUTSTREAM_OOD; + + if (error == PARSERUTILS_NEEDDATA) { + return s->public.had_eof ? 
PARSERUTILS_INPUTSTREAM_EOF + : PARSERUTILS_INPUTSTREAM_OOD; + } + } + + *length = len; + + return (uintptr_t) (s->public.utf8->data + s->public.cursor + offset); +} + +#undef IS_ASCII + +/** + * Read the source charset of the input stream + * + * \param stream Input stream to query + * \param source Pointer to location to receive charset source identifier + * \return Pointer to charset name (constant; do not free) + */ +const char *parserutils_inputstream_read_charset( + parserutils_inputstream *stream, uint32_t *source) +{ + parserutils_inputstream_private *s = + (parserutils_inputstream_private *) stream; + + if (stream == NULL || source == NULL) + return NULL; + + *source = s->encsrc; + + if (s->encsrc == 0) + return "UTF-8"; + + return parserutils_charset_mibenum_to_name(s->mibenum); +} + +/****************************************************************************** + ******************************************************************************/ + +/** + * Refill the UTF-8 buffer from the raw buffer + * + * \param stream The inputstream to operate on + * \return PARSERUTILS_OK on success + */ +parserutils_error parserutils_inputstream_refill_buffer( + parserutils_inputstream_private *stream) +{ + const uint8_t *raw; + uint8_t *utf8; + size_t raw_length, utf8_space; + parserutils_error error; + + /* If this is the first chunk of data, we must detect the charset and + * strip the BOM, if one exists */ + if (!stream->done_first_chunk) { + if (stream->csdetect != NULL) { + error = stream->csdetect(stream->raw->data, + stream->raw->length, + &stream->mibenum, &stream->encsrc); + if (error != PARSERUTILS_OK) + return error; + } else { + /* Default to UTF-8 */ + stream->mibenum = + parserutils_charset_mibenum_from_name("UTF-8", + SLEN("UTF-8")); + stream->encsrc = 0; + } + + if (stream->mibenum == 0) + abort(); + + error = parserutils_inputstream_strip_bom(stream->mibenum, + stream->raw); + if (error != PARSERUTILS_OK) + return error; + + stream->done_first_chunk 
= true; + } + + /* Work out how to perform the buffer fill */ + if (stream->public.cursor == stream->public.utf8->length) { + /* Cursor's at the end, so simply reuse the entire buffer */ + utf8 = stream->public.utf8->data; + utf8_space = stream->public.utf8->allocated; + } else { + /* Cursor's not at the end, so shift data after cursor to the + * bottom of the buffer. If the buffer's still over half full, + * extend it. */ + memmove(stream->public.utf8->data, + stream->public.utf8->data + stream->public.cursor, + stream->public.utf8->length - stream->public.cursor); + + stream->public.utf8->length -= stream->public.cursor; + + if (stream->public.utf8->length > + stream->public.utf8->allocated / 2) { + error = parserutils_buffer_grow(stream->public.utf8); + if (error != PARSERUTILS_OK) + return error; + } + + utf8 = stream->public.utf8->data + stream->public.utf8->length; + utf8_space = stream->public.utf8->allocated - + stream->public.utf8->length; + } + + raw = stream->raw->data; + raw_length = stream->raw->length; + + /* Try to fill utf8 buffer from the raw data */ + error = parserutils_filter_process_chunk(stream->input, + &raw, &raw_length, &utf8, &utf8_space); + /* _NOMEM implies that there's more input to read than available space + * in the utf8 buffer. That's fine, so we'll ignore that error. 
*/ + if (error != PARSERUTILS_OK && error != PARSERUTILS_NOMEM) + return error; + + /* Remove the raw data we've processed from the raw buffer */ + error = parserutils_buffer_discard(stream->raw, 0, + stream->raw->length - raw_length); + if (error != PARSERUTILS_OK) + return error; + + /* Fix up the utf8 buffer information */ + stream->public.utf8->length = + stream->public.utf8->allocated - utf8_space; + + /* Finally, fix up the cursor */ + stream->public.cursor = 0; + + return PARSERUTILS_OK; +} + +/** + * Strip a BOM from a buffer in the given encoding + * + * \param mibenum The character set of the buffer + * \param buffer The buffer to process + */ +parserutils_error parserutils_inputstream_strip_bom(uint16_t mibenum, + parserutils_buffer *buffer) +{ + static uint16_t utf8; + static uint16_t utf16; + static uint16_t utf16be; + static uint16_t utf16le; + static uint16_t utf32; + static uint16_t utf32be; + static uint16_t utf32le; + + if (utf8 == 0) { + utf8 = parserutils_charset_mibenum_from_name("UTF-8", + SLEN("UTF-8")); + utf16 = parserutils_charset_mibenum_from_name("UTF-16", + SLEN("UTF-16")); + utf16be = parserutils_charset_mibenum_from_name("UTF-16BE", + SLEN("UTF-16BE")); + utf16le = parserutils_charset_mibenum_from_name("UTF-16LE", + SLEN("UTF-16LE")); + utf32 = parserutils_charset_mibenum_from_name("UTF-32", + SLEN("UTF-32")); + utf32be = parserutils_charset_mibenum_from_name("UTF-32BE", + SLEN("UTF-32BE")); + utf32le = parserutils_charset_mibenum_from_name("UTF-32LE", + SLEN("UTF-32LE")); + } + + /** \todo Handle unmarked UTF-16 and UTF-32. Endianness is specified + * by the BOM, if present, or is assumed to be big endian. 
*/ + +#define UTF32_BOM_LEN (4) +#define UTF16_BOM_LEN (2) +#define UTF8_BOM_LEN (3) + + if (mibenum == utf8) { + if (buffer->length >= UTF8_BOM_LEN && + buffer->data[0] == 0xEF && + buffer->data[1] == 0xBB && + buffer->data[2] == 0xBF) { + return parserutils_buffer_discard( + buffer, 0, UTF8_BOM_LEN); + } + } else if (mibenum == utf16be) { + if (buffer->length >= UTF16_BOM_LEN && + buffer->data[0] == 0xFE && + buffer->data[1] == 0xFF) { + return parserutils_buffer_discard( + buffer, 0, UTF16_BOM_LEN); + } + } else if (mibenum == utf16le) { + if (buffer->length >= UTF16_BOM_LEN && + buffer->data[0] == 0xFF && + buffer->data[1] == 0xFE) { + return parserutils_buffer_discard( + buffer, 0, UTF16_BOM_LEN); + } + } else if (mibenum == utf32be) { + if (buffer->length >= UTF32_BOM_LEN && + buffer->data[0] == 0x00 && + buffer->data[1] == 0x00 && + buffer->data[2] == 0xFE && + buffer->data[3] == 0xFF) { + return parserutils_buffer_discard( + buffer, 0, UTF32_BOM_LEN); + } + } else if (mibenum == utf32le) { + if (buffer->length >= UTF32_BOM_LEN && + buffer->data[0] == 0xFF && + buffer->data[1] == 0xFE && + buffer->data[2] == 0x00 && + buffer->data[3] == 0x00) { + return parserutils_buffer_discard( + buffer, 0, UTF32_BOM_LEN); + } + } + +#undef UTF8_BOM_LEN +#undef UTF16_BOM_LEN +#undef UTF32_BOM_LEN + + return PARSERUTILS_OK; +} + -- cgit v1.2.3