From 2777a04ed2ba4fd36138b991d66a32a283361f7e Mon Sep 17 00:00:00 2001
From: John Mark Bell <jmb@netsurf-browser.org>
Date: Thu, 1 May 2008 16:34:46 +0000
Subject: Import parser construction utility library

svn path=/trunk/libparserutils/; revision=4111
---
 src/input/Makefile      |  46 +++++
 src/input/filter.c      | 384 ++++++++++++++++++++++++++++++++++++++
 src/input/filter.h      |  57 ++++++
 src/input/inputstream.c | 477 ++++++++++++++++++++++++++++++++++++++++++++++++
 4 files changed, 964 insertions(+)
 create mode 100644 src/input/Makefile
 create mode 100644 src/input/filter.c
 create mode 100644 src/input/filter.h
 create mode 100644 src/input/inputstream.c

(limited to 'src/input')

diff --git a/src/input/Makefile b/src/input/Makefile
new file mode 100644
index 0000000..d62740e
--- /dev/null
+++ b/src/input/Makefile
@@ -0,0 +1,46 @@
+# Child makefile fragment
+#
+# Toolchain is provided by top-level makefile
+#
+# Variables provided by top-level makefile
+#
+# COMPONENT		The name of the component
+# EXPORT		The location of the export directory
+# TOP			The location of the source tree root
+# RELEASEDIR		The place to put release objects
+# DEBUGDIR		The place to put debug objects
+#
+# do_include		Canned command sequence to include a child makefile
+#
+# Variables provided by parent makefile:
+#
+# DIR			The name of the directory we're in, relative to $(TOP)
+#
+# Variables we can manipulate:
+#
+# ITEMS_CLEAN		The list of items to remove for "make clean"
+# ITEMS_DISTCLEAN	The list of items to remove for "make distclean"
+# TARGET_TESTS		The list of target names to run for "make test"
+#
+# SOURCES		The list of sources to build for $(COMPONENT)
+#
+# Plus anything from the toolchain
+
+# Push the parent's directory onto the stack: grow $(sp) by ".x", save $(d)
+sp             := $(sp).x
+dirstack_$(sp) := $(d)
+d              := $(DIR)
+
+# Source files that live in this directory
+SRCS_$(d) := filter.c inputstream.c
+
+# Append them, prefixed with this directory, to the component's sources
+SOURCES += $(addprefix $(d), $(SRCS_$(d)))
+
+# Include the Makefile of every immediate subdirectory, if there are any
+MAKE_INCLUDES := $(wildcard $(d)*/Makefile)
+$(eval $(foreach INC, $(MAKE_INCLUDES), $(call do_include,$(INC))))
+
+# Pop the stack: restore the parent's $(d), then strip the ".x" from $(sp)
+d  := $(dirstack_$(sp))
+sp := $(basename $(sp))
diff --git a/src/input/filter.c b/src/input/filter.c
new file mode 100644
index 0000000..f40c98f
--- /dev/null
+++ b/src/input/filter.c
@@ -0,0 +1,384 @@
+/*
+ * This file is part of LibParserUtils.
+ * Licensed under the MIT License,
+ *                http://www.opensource.org/licenses/mit-license.php
+ * Copyright 2007 John-Mark Bell <jmb@netsurf-browser.org>
+ */
+
+#include <errno.h>
+#include <stdbool.h>
+#include <stdlib.h>
+#include <string.h>
+
+#ifdef WITH_ICONV_FILTER
+#include <iconv.h>
+#endif
+
+#include <parserutils/charset/mibenum.h>
+#include <parserutils/charset/codec.h>
+
+#include "input/filter.h"
+#include "utils/utils.h"
+
+/** Input filter: converts data from an input encoding to an internal one */
+struct parserutils_filter {
+#ifdef WITH_ICONV_FILTER
+	iconv_t cd;			/**< Iconv descriptor, or (iconv_t) -1 */
+	uint16_t int_enc;		/**< MIB enum of internal encoding */
+#else
+	parserutils_charset_codec *read_codec;	/**< Decodes input to pivot */
+	parserutils_charset_codec *write_codec;	/**< Encodes pivot to output */
+
+	uint32_t pivot_buf[64];		/**< Conversion pivot buffer */
+
+	bool leftover;			/**< Pivot data remains from last call */
+	uint8_t *pivot_left;		/**< Unwritten data within pivot_buf */
+	size_t pivot_len;		/**< Length of pivot remaining */
+#endif
+
+	struct {
+		uint16_t encoding;	/**< MIB enum of input encoding */
+	} settings;			/**< Filter settings */
+
+	parserutils_alloc alloc;	/**< Memory (de)allocation function */
+	void *pw;			/**< Client private data */
+};
+
+static parserutils_error filter_set_defaults(parserutils_filter *input);
+static parserutils_error filter_set_encoding(parserutils_filter *input,
+		const char *enc);
+
+/**
+ * Construct a new input filter
+ *
+ * \param int_enc  Name of the encoding the filter should produce
+ * \param alloc    Allocator callback used for all memory management
+ * \param pw       Opaque client data handed to \a alloc (may be NULL)
+ * \return The new filter, or NULL if creation failed
+ */
+parserutils_filter *parserutils_filter_create(const char *int_enc,
+		parserutils_alloc alloc, void *pw)
+{
+	parserutils_filter *f;
+
+	if (alloc == NULL || int_enc == NULL)
+		return NULL;
+
+	f = alloc(NULL, sizeof(*f), pw);
+	if (f == NULL)
+		return NULL;
+
+#ifdef WITH_ICONV_FILTER
+	/* No conversion descriptor exists until an encoding is set */
+	f->cd = (iconv_t) -1;
+	f->int_enc = parserutils_charset_mibenum_from_name(
+			int_enc, strlen(int_enc));
+	if (f->int_enc == 0)
+		goto fail;
+#else
+	/* Nothing is pending in the pivot buffer yet */
+	f->pivot_len = 0;
+	f->pivot_left = NULL;
+	f->leftover = false;
+#endif
+
+	f->pw = pw;
+	f->alloc = alloc;
+
+	if (filter_set_defaults(f) != PARSERUTILS_OK)
+		goto fail;
+
+#ifndef WITH_ICONV_FILTER
+	f->write_codec = parserutils_charset_codec_create(int_enc, alloc, pw);
+	if (f->write_codec == NULL) {
+		if (f->read_codec != NULL)
+			parserutils_charset_codec_destroy(f->read_codec);
+		goto fail;
+	}
+#endif
+
+	return f;
+
+fail:
+	alloc(f, 0, pw);
+	return NULL;
+}
+
+/**
+ * Release an input filter and everything it owns
+ *
+ * \param input  Filter to destroy (NULL is accepted and ignored)
+ */
+void parserutils_filter_destroy(parserutils_filter *input)
+{
+	if (input != NULL) {
+#ifdef WITH_ICONV_FILTER
+		/* Close the conversion descriptor, if one was opened */
+		if (input->cd != (iconv_t) -1)
+			iconv_close(input->cd);
+#else
+		parserutils_charset_codec *rd = input->read_codec;
+		parserutils_charset_codec *wr = input->write_codec;
+
+		if (rd != NULL)
+			parserutils_charset_codec_destroy(rd);
+		if (wr != NULL)
+			parserutils_charset_codec_destroy(wr);
+#endif
+		/* Finally, hand the filter itself back to the allocator */
+		input->alloc(input, 0, input->pw);
+	}
+}
+
+/**
+ * Change one of an input filter's options
+ *
+ * \param input   Filter to configure
+ * \param type    Which option to set
+ * \param params  Parameters for that option
+ * \return PARSERUTILS_OK on success, appropriate error otherwise
+ */
+parserutils_error parserutils_filter_setopt(parserutils_filter *input,
+		parserutils_filter_opttype type,
+		parserutils_filter_optparams *params)
+{
+	if (params == NULL || input == NULL)
+		return PARSERUTILS_BADPARM;
+
+	/* Encoding is the only option currently understood */
+	if (type == PARSERUTILS_FILTER_SET_ENCODING) {
+		const char *name = params->encoding.name;
+
+		return filter_set_encoding(input, name);
+	}
+
+	/* Any other option type is accepted without effect */
+	return PARSERUTILS_OK;
+}
+
+/**
+ * Process a chunk of data
+ *
+ * \param input   Pointer to filter instance
+ * \param data    Pointer to pointer to input buffer (advanced past input used)
+ * \param len     Pointer to length of input buffer (reduced by input used)
+ * \param output  Pointer to pointer to output buffer (advanced past output)
+ * \param outlen  Pointer to length of output buffer (reduced by output made)
+ * \return PARSERUTILS_OK on success, appropriate error otherwise
+ *
+ * Call this with an input buffer length of 0 to flush any buffers.
+ */
+parserutils_error parserutils_filter_process_chunk(parserutils_filter *input,
+		const uint8_t **data, size_t *len,
+		uint8_t **output, size_t *outlen)
+{
+	if (input == NULL || data == NULL || *data == NULL || len == NULL ||
+			output == NULL || *output == NULL || outlen == NULL)
+		return PARSERUTILS_BADPARM;
+
+#ifdef WITH_ICONV_FILTER
+	if (iconv(input->cd, (char **) data, len, 
+			(char **) output, outlen) == (size_t) -1) {
+		switch (errno) {
+		case E2BIG:
+			return PARSERUTILS_NOMEM;
+		case EILSEQ:
+			if (*outlen < 3)
+				return PARSERUTILS_NOMEM;
+			/* Emit EF BF BD: the UTF-8 encoding of U+FFFD */
+			(*output)[0] = 0xef;
+			(*output)[1] = 0xbf;
+			(*output)[2] = 0xbd;
+
+			*output += 3;
+			*outlen -= 3;
+			/* Skip the input byte that could not be converted */
+			(*data)++;
+			(*len)--;
+			/* Keep converting, dropping bytes while input is invalid */
+			while (*len > 0) {
+				size_t ret;
+				
+				ret = iconv(input->cd, (char **) data, len, 
+						(char **) output, outlen);
+				if (ret != (size_t) -1 || errno != EILSEQ)
+					break;
+
+				(*data)++;
+				(*len)--;
+			}
+			/* NOTE(review): errno is whatever the last call left */
+			return errno == E2BIG ? PARSERUTILS_NOMEM 
+					      : PARSERUTILS_OK;
+		}
+	}
+
+	return PARSERUTILS_OK;
+#else
+	parserutils_error read_error, write_error;
+
+	if (input->leftover) {
+		/* Some data left to be written from last call */
+
+		/* Attempt to flush the remaining data. */
+		write_error = parserutils_charset_codec_encode(
+				input->write_codec,
+				(const uint8_t **) &input->pivot_left,
+				&input->pivot_len,
+				output, outlen);
+
+		if (write_error != PARSERUTILS_OK)
+			return write_error;
+
+
+		/* And clear leftover */
+		input->pivot_left = NULL;
+		input->pivot_len = 0;
+		input->leftover = false;
+	}
+
+	while (*len > 0) {
+		size_t pivot_len = sizeof(input->pivot_buf);
+		uint8_t *pivot = (uint8_t *) input->pivot_buf;
+
+		read_error = parserutils_charset_codec_decode(input->read_codec,
+				data, len,
+				(uint8_t **) &pivot, &pivot_len);
+		/* Rewind to the pivot's start; work out how much was filled */
+		pivot = (uint8_t *) input->pivot_buf;
+		pivot_len = sizeof(input->pivot_buf) - pivot_len;
+
+		if (pivot_len > 0) {
+			write_error = parserutils_charset_codec_encode(
+					input->write_codec,
+					(const uint8_t **) &pivot,
+					&pivot_len,
+					output, outlen);
+			/* On failure, keep the unwritten pivot for next call */
+			if (write_error != PARSERUTILS_OK) {
+				input->leftover = true;
+				input->pivot_left = pivot;
+				input->pivot_len = pivot_len;
+
+				return write_error;
+			}
+		}
+		/* Decoder NOMEM is not fatal (presumably a full pivot) */
+		if (read_error != PARSERUTILS_OK && 
+				read_error != PARSERUTILS_NOMEM)
+			return read_error;
+	}
+
+	return PARSERUTILS_OK;
+#endif
+}
+
+/**
+ * Return an input filter to its initial conversion state
+ *
+ * \param input  Filter whose state should be discarded
+ * \return PARSERUTILS_OK on success, appropriate error otherwise
+ */
+parserutils_error parserutils_filter_reset(parserutils_filter *input)
+{
+#ifndef WITH_ICONV_FILTER
+	parserutils_error status;
+#endif
+
+	if (input == NULL)
+		return PARSERUTILS_BADPARM;
+
+#ifdef WITH_ICONV_FILTER
+	/* NULL buffers ask iconv to return to its initial shift state */
+	iconv(input->cd, NULL, 0, NULL, 0);
+
+	return PARSERUTILS_OK;
+#else
+	/* Forget any pivot data still waiting to be written */
+	input->leftover = false;
+	input->pivot_len = 0;
+	input->pivot_left = NULL;
+
+	/* Reset the read codec first; the write codec only if that worked */
+	status = parserutils_charset_codec_reset(input->read_codec);
+	if (status == PARSERUTILS_OK)
+		status = parserutils_charset_codec_reset(input->write_codec);
+
+	return status;
+#endif
+}
+
+/**
+ * Put an input filter into its default configuration
+ *
+ * \param input  Filter to initialise
+ * \return PARSERUTILS_OK on success, appropriate error otherwise
+ *
+ * The default input encoding is UTF-8.
+ */
+parserutils_error filter_set_defaults(parserutils_filter *input)
+{
+	if (input == NULL)
+		return PARSERUTILS_BADPARM;
+
+#ifndef WITH_ICONV_FILTER
+	/* No codecs exist yet */
+	input->write_codec = NULL;
+	input->read_codec = NULL;
+#endif
+
+	/* A setting of zero matches no real encoding, so the call below
+	 * always goes on to set up the UTF-8 converter */
+	input->settings.encoding = 0;
+
+	return filter_set_encoding(input, "UTF-8");
+}
+
+/**
+ * Set an input filter's encoding
+ *
+ * \param input  Input filter to configure
+ * \param enc    Encoding name
+ * \return PARSERUTILS_OK on success, appropriate error otherwise. On
+ *         failure the filter keeps using its previous encoding.
+ */
+parserutils_error filter_set_encoding(parserutils_filter *input,
+		const char *enc)
+{
+	uint16_t mibenum;
+
+	if (input == NULL || enc == NULL)
+		return PARSERUTILS_BADPARM;
+
+	mibenum = parserutils_charset_mibenum_from_name(enc, strlen(enc));
+	if (mibenum == 0)
+		return PARSERUTILS_INVALID;
+
+	/* Exit early if we're already using this encoding */
+	if (input->settings.encoding == mibenum)
+		return PARSERUTILS_OK;
+
+	/* Build the replacement first; only then release the old one */
+#ifdef WITH_ICONV_FILTER
+	iconv_t cd = iconv_open(
+		parserutils_charset_mibenum_to_name(input->int_enc), enc);
+	if (cd == (iconv_t) -1)
+		return errno == EINVAL ? PARSERUTILS_INVALID
+				       : PARSERUTILS_NOMEM;
+	if (input->cd != (iconv_t) -1)
+		iconv_close(input->cd);
+	input->cd = cd;
+#else
+	parserutils_charset_codec *codec = parserutils_charset_codec_create(
+			enc, input->alloc, input->pw);
+	if (codec == NULL)
+		return PARSERUTILS_NOMEM;
+	if (input->read_codec != NULL)
+		parserutils_charset_codec_destroy(input->read_codec);
+	input->read_codec = codec;
+#endif
+
+	input->settings.encoding = mibenum;
+
+	return PARSERUTILS_OK;
+}
diff --git a/src/input/filter.h b/src/input/filter.h
new file mode 100644
index 0000000..96941a6
--- /dev/null
+++ b/src/input/filter.h
@@ -0,0 +1,57 @@
+/*
+ * This file is part of LibParserUtils.
+ * Licensed under the MIT License,
+ *                http://www.opensource.org/licenses/mit-license.php
+ * Copyright 2007 John-Mark Bell <jmb@netsurf-browser.org>
+ */
+
+#ifndef parserutils_input_filter_h_
+#define parserutils_input_filter_h_
+
+#include <inttypes.h>
+
+#include <parserutils/errors.h>
+#include <parserutils/functypes.h>
+
+typedef struct parserutils_filter parserutils_filter;
+
+/**
+ * Input filter option types, as accepted by parserutils_filter_setopt()
+ */
+typedef enum parserutils_filter_opttype {
+	PARSERUTILS_FILTER_SET_ENCODING       = 0,	/**< Set input encoding */
+} parserutils_filter_opttype;
+
+/**
+ * Input filter option parameters; the member used depends on the option type
+ */
+typedef union parserutils_filter_optparams {
+	/** Parameters for PARSERUTILS_FILTER_SET_ENCODING */
+	struct {
+		/** Name of the encoding the input data is in */
+		const char *name;
+	} encoding;
+} parserutils_filter_optparams;
+
+
+/* Create an input filter */
+parserutils_filter *parserutils_filter_create(const char *int_enc,
+		parserutils_alloc alloc, void *pw);
+/* Destroy an input filter */
+void parserutils_filter_destroy(parserutils_filter *input);
+
+/* Configure an input filter */
+parserutils_error parserutils_filter_setopt(parserutils_filter *input,
+		parserutils_filter_opttype type,
+		parserutils_filter_optparams *params);
+
+/* Process a chunk of data */
+parserutils_error parserutils_filter_process_chunk(parserutils_filter *input,
+		const uint8_t **data, size_t *len,
+		uint8_t **output, size_t *outlen);
+
+/* Reset an input filter's state */
+parserutils_error parserutils_filter_reset(parserutils_filter *input);
+
+#endif
+
diff --git a/src/input/inputstream.c b/src/input/inputstream.c
new file mode 100644
index 0000000..fd44995
--- /dev/null
+++ b/src/input/inputstream.c
@@ -0,0 +1,477 @@
+/*
+ * This file is part of LibParserUtils.
+ * Licensed under the MIT License,
+ *                http://www.opensource.org/licenses/mit-license.php
+ * Copyright 2007 John-Mark Bell <jmb@netsurf-browser.org>
+ */
+
+#include <assert.h>
+#include <stdlib.h>
+#include <string.h>
+
+#include <parserutils/charset/mibenum.h>
+#include <parserutils/charset/utf8.h>
+#include <parserutils/input/inputstream.h>
+
+#include "input/filter.h"
+#include "utils/utils.h"
+
+/**
+ * Private input stream definition; public stream pointers are cast to this
+ */
+typedef struct parserutils_inputstream_private {
+	parserutils_inputstream public;	/**< Public part. Must be first */
+
+	parserutils_buffer *raw;	/**< Raw data, not yet converted */
+
+	bool done_first_chunk;		/**< Whether the first chunk has 
+					 * been processed */
+
+	uint16_t mibenum;		/**< MIB enum for charset, or 0 */
+	uint32_t encsrc;		/**< Charset source (0 is lowest) */
+
+	parserutils_filter *input;	/**< Charset conversion filter */
+
+	parserutils_charset_detect_func csdetect; /**< Detector, or NULL */
+
+	parserutils_alloc alloc;	/**< Memory (de)allocation function */
+	void *pw;			/**< Client private data */
+} parserutils_inputstream_private;
+
+static inline parserutils_error parserutils_inputstream_refill_buffer(
+		parserutils_inputstream_private *stream);
+static inline parserutils_error parserutils_inputstream_strip_bom(
+		uint16_t mibenum, parserutils_buffer *buffer);
+
+/**
+ * Create an input stream
+ *
+ * \param enc       Document charset, or NULL (or unknown) to autodetect
+ * \param encsrc    Value for encoding source, if specified, or 0
+ * \param csdetect  Charset detection function, or NULL
+ * \param alloc     Memory (de)allocation function
+ * \param pw        Pointer to client-specific private data (may be NULL)
+ * \return Pointer to stream instance, or NULL on failure
+ *
+ * The value 0 is defined as being the lowest priority encoding source 
+ * (i.e. the default fallback encoding). Beyond this, no further 
+ * interpretation is made upon the encoding source.
+ */
+parserutils_inputstream *parserutils_inputstream_create(const char *enc,
+		uint32_t encsrc, parserutils_charset_detect_func csdetect,
+		parserutils_alloc alloc, void *pw)
+{
+	parserutils_inputstream_private *stream;
+
+	if (alloc == NULL)
+		return NULL;
+
+	stream = alloc(NULL, sizeof(parserutils_inputstream_private), pw);
+	if (stream == NULL)
+		return NULL;
+
+	stream->raw = parserutils_buffer_create(alloc, pw);
+	if (stream->raw == NULL) {
+		alloc(stream, 0, pw);
+		return NULL;
+	}
+
+	stream->public.utf8 = parserutils_buffer_create(alloc, pw);
+	if (stream->public.utf8 == NULL) {
+		parserutils_buffer_destroy(stream->raw);
+		alloc(stream, 0, pw);
+		return NULL;
+	}
+
+	stream->public.cursor = 0;
+	stream->public.had_eof = false;
+	stream->done_first_chunk = false;
+	/* No charset yet: an unrecognised \a enc must not leave these unset */
+	stream->mibenum = 0;
+	stream->encsrc = 0;
+
+	stream->input = parserutils_filter_create("UTF-8", alloc, pw);
+	if (stream->input == NULL) {
+		parserutils_buffer_destroy(stream->public.utf8);
+		parserutils_buffer_destroy(stream->raw);
+		alloc(stream, 0, pw);
+		return NULL;
+	}
+
+	if (enc != NULL) {
+		parserutils_error error;
+		parserutils_filter_optparams params;
+
+		stream->mibenum = 
+			parserutils_charset_mibenum_from_name(enc, strlen(enc));
+
+		if (stream->mibenum != 0) {
+			params.encoding.name = enc;
+
+			error = parserutils_filter_setopt(stream->input,
+					PARSERUTILS_FILTER_SET_ENCODING, 
+					&params);
+			if (error != PARSERUTILS_OK && 
+					error != PARSERUTILS_INVALID) {
+				parserutils_filter_destroy(stream->input);
+				parserutils_buffer_destroy(stream->public.utf8);
+				parserutils_buffer_destroy(stream->raw);
+				alloc(stream, 0, pw);
+				return NULL;
+			}
+
+			stream->encsrc = encsrc;
+		}
+	}
+
+	stream->csdetect = csdetect;
+
+	stream->alloc = alloc;
+	stream->pw = pw;
+
+	return (parserutils_inputstream *) stream;
+}
+
+/**
+ * Tear down an input stream, releasing its filter and both buffers
+ *
+ * \param stream  Stream to destroy (NULL is accepted and ignored)
+ */
+void parserutils_inputstream_destroy(parserutils_inputstream *stream)
+{
+	parserutils_inputstream_private *priv;
+
+	if (stream != NULL) {
+		/* The public part is the first member of the private one */
+		priv = (parserutils_inputstream_private *) stream;
+		parserutils_filter_destroy(priv->input);
+		parserutils_buffer_destroy(priv->public.utf8);
+		parserutils_buffer_destroy(priv->raw);
+		priv->alloc(priv, 0, priv->pw);
+	}
+}
+
+/**
+ * Queue more raw data on an input stream, or mark its end
+ *
+ * \param stream  Stream to feed
+ * \param data    Bytes to add (in document charset); NULL signals EOF
+ * \param len     Number of bytes at \a data
+ * \return PARSERUTILS_OK on success, appropriate error otherwise
+ */
+parserutils_error parserutils_inputstream_append(
+		parserutils_inputstream *stream, 
+		const uint8_t *data, size_t len)
+{
+	parserutils_inputstream_private *priv;
+
+	if (stream == NULL)
+		return PARSERUTILS_BADPARM;
+
+	priv = (parserutils_inputstream_private *) stream;
+	/* Real data goes onto the end of the raw (unconverted) buffer */
+	if (data != NULL)
+		return parserutils_buffer_append(priv->raw, data, len);
+
+	priv->public.had_eof = true;
+	return PARSERUTILS_OK;
+}
+
+/**
+ * Splice data into the stream at the cursor position
+ *
+ * \param stream  Stream to insert into
+ * \param data    UTF-8 encoded bytes to insert
+ * \param len     Number of bytes at \a data
+ * \return PARSERUTILS_OK on success, appropriate error otherwise
+ */
+parserutils_error parserutils_inputstream_insert(
+		parserutils_inputstream *stream,
+		const uint8_t *data, size_t len)
+{
+	parserutils_error result = PARSERUTILS_BADPARM;
+
+	/* The data is already UTF-8, so it bypasses the raw buffer */
+	if (data != NULL && stream != NULL)
+		result = parserutils_buffer_insert(stream->utf8, 
+				stream->cursor, data, len);
+
+	return result;
+}
+
+#define IS_ASCII(x) (((x) & 0x80) == 0)
+
+/**
+ * Look at the character in the stream that starts at 
+ * offset bytes from the cursor (slow version)
+ *
+ * \param stream  Stream to look in
+ * \param offset  Byte offset of start of character
+ * \param length  Pointer to location to receive character length (in bytes)
+ * \return Pointer to character data, or EOF or OOD.
+ *
+ * The returned pointer is only valid until the stream cursor is advanced
+ * past the character it refers to (parserutils_inputstream_advance);
+ * dereferencing it after that point is a bug. OOD is also returned if
+ * \a stream or \a length is NULL, or if refilling the buffer fails.
+ */
+uintptr_t parserutils_inputstream_peek_slow(parserutils_inputstream *stream, 
+		size_t offset, size_t *length)
+{
+	parserutils_inputstream_private *s = 
+			(parserutils_inputstream_private *) stream;
+	parserutils_error error = PARSERUTILS_OK;
+	size_t len;
+
+	if (stream == NULL || length == NULL)
+		return PARSERUTILS_INPUTSTREAM_OOD;
+
+	/* With no raw data left to convert, there is nothing more to read */
+	if (s->raw->length == 0) {
+		return s->public.had_eof ? PARSERUTILS_INPUTSTREAM_EOF
+					 : PARSERUTILS_INPUTSTREAM_OOD;
+	}
+
+	/* Refill utf8 buffer from raw buffer */
+	error = parserutils_inputstream_refill_buffer(s);
+	if (error != PARSERUTILS_OK)
+		return PARSERUTILS_INPUTSTREAM_OOD;
+
+	/* The refill may still not have produced a byte at this offset */
+	if (offset >= s->public.utf8->length - s->public.cursor)
+		return s->public.had_eof ? PARSERUTILS_INPUTSTREAM_EOF
+					 : PARSERUTILS_INPUTSTREAM_OOD;
+
+	/* Now try the read */
+	if (IS_ASCII(s->public.utf8->data[s->public.cursor + offset])) {
+		len = 1;
+	} else {
+		error = parserutils_charset_utf8_char_byte_length(
+			s->public.utf8->data + s->public.cursor + offset,
+			&len);
+		if (error == PARSERUTILS_NEEDDATA && s->public.had_eof)
+			return PARSERUTILS_INPUTSTREAM_EOF;
+		if (error != PARSERUTILS_OK)
+			return PARSERUTILS_INPUTSTREAM_OOD;
+	}
+
+	*length = len;
+
+	return (uintptr_t) (s->public.utf8->data + s->public.cursor + offset);
+}
+
+#undef IS_ASCII
+
+/**
+ * Report which charset the stream is using, and where it came from
+ *
+ * \param stream  Stream to query
+ * \param source  Receives the charset source identifier
+ * \return Charset name (constant; do not free), NULL on bad parameters
+ */
+const char *parserutils_inputstream_read_charset(
+		parserutils_inputstream *stream, uint32_t *source)
+{
+	const parserutils_inputstream_private *priv;
+
+	if (source == NULL || stream == NULL)
+		return NULL;
+
+	priv = (const parserutils_inputstream_private *) stream;
+
+	*source = priv->encsrc;
+
+	/* Source 0 is the default fallback, which is always UTF-8 */
+	return (priv->encsrc == 0) ? "UTF-8" :
+			parserutils_charset_mibenum_to_name(priv->mibenum);
+}
+
+/******************************************************************************
+ ******************************************************************************/
+
+/**
+ * Refill the UTF-8 buffer from the raw buffer
+ *
+ * \param stream  The inputstream to operate on
+ * \return PARSERUTILS_OK on success, appropriate error otherwise
+ */
+parserutils_error parserutils_inputstream_refill_buffer(
+		parserutils_inputstream_private *stream)
+{
+	parserutils_buffer *out = stream->public.utf8;
+	const uint8_t *raw;
+	uint8_t *utf8;
+	size_t raw_length, utf8_space;
+	parserutils_error error;
+
+	/* If this is the first chunk of data, we must settle the charset,
+	 * strip the BOM, if one exists, and configure the filter to match */
+	if (!stream->done_first_chunk) {
+		parserutils_filter_optparams params;
+
+		/* A detector may override any charset given at creation */
+		if (stream->csdetect != NULL) {
+			error = stream->csdetect(stream->raw->data, 
+				stream->raw->length,
+				&stream->mibenum, &stream->encsrc);
+			if (error != PARSERUTILS_OK)
+				return error;
+		}
+
+		/* Default to UTF-8 only when no charset is known at all */
+		if (stream->mibenum == 0) {
+			stream->mibenum = 
+				parserutils_charset_mibenum_from_name("UTF-8", 
+					SLEN("UTF-8"));
+			stream->encsrc = 0;
+		}
+		if (stream->mibenum == 0)
+			abort();
+
+		error = parserutils_inputstream_strip_bom(stream->mibenum, 
+				stream->raw);
+		if (error != PARSERUTILS_OK)
+			return error;
+
+		/* Decode with the charset chosen above; as at creation, an
+		 * encoding the filter rejects as invalid is not fatal */
+		params.encoding.name = 
+			parserutils_charset_mibenum_to_name(stream->mibenum);
+		error = parserutils_filter_setopt(stream->input,
+				PARSERUTILS_FILTER_SET_ENCODING, &params);
+		if (error != PARSERUTILS_OK && error != PARSERUTILS_INVALID)
+			return error;
+
+		stream->done_first_chunk = true;
+	}
+
+	/* Shift any unread data to the bottom of the buffer and reset the
+	 * cursor at once, so the stream stays consistent on early return */
+	if (stream->public.cursor != out->length) {
+		memmove(out->data, out->data + stream->public.cursor,
+				out->length - stream->public.cursor);
+	}
+	out->length -= stream->public.cursor;
+	stream->public.cursor = 0;
+
+	/* If the buffer's still over half full, extend it */
+	if (out->length > out->allocated / 2) {
+		error = parserutils_buffer_grow(out);
+		if (error != PARSERUTILS_OK)
+			return error;
+	}
+
+	utf8 = out->data + out->length;
+	utf8_space = out->allocated - out->length;
+	raw = stream->raw->data;
+	raw_length = stream->raw->length;
+
+	/* Try to fill utf8 buffer from the raw data. _NOMEM implies that
+	 * there's more input than space in the utf8 buffer. That's fine. */
+	error = parserutils_filter_process_chunk(stream->input, 
+			&raw, &raw_length, &utf8, &utf8_space);
+	if (error != PARSERUTILS_OK && error != PARSERUTILS_NOMEM)
+		return error;
+
+	/* Remove the raw data we've processed from the raw buffer */
+	error = parserutils_buffer_discard(stream->raw, 0, 
+			stream->raw->length - raw_length);
+	if (error != PARSERUTILS_OK)
+		return error;
+
+	/* Fix up the utf8 buffer information */
+	out->length = out->allocated - utf8_space;
+
+	return PARSERUTILS_OK;
+}
+
+/**
+ * Remove a leading byte order mark from a buffer, if it has one
+ *
+ * \param mibenum  MIB enum of the character set the buffer holds
+ * \param buffer   The buffer to process
+ * \return PARSERUTILS_OK if there was no BOM to remove, otherwise the
+ *         result of discarding the BOM's bytes from the buffer
+ *
+ * The byte order marks recognised are:
+ *   UTF-8     EF BB BF
+ *   UTF-16BE  FE FF          UTF-16LE  FF FE
+ *   UTF-32BE  00 00 FE FF    UTF-32LE  FF FE 00 00
+ * Plain "UTF-16" and "UTF-32" are not handled (see the \todo below).
+ */
+parserutils_error parserutils_inputstream_strip_bom(uint16_t mibenum, 
+		parserutils_buffer *buffer)
+{
+	/* Byte order marks, as they appear in each encoding */
+	static const uint8_t bom_utf8[] = { 0xEF, 0xBB, 0xBF };
+	static const uint8_t bom_utf16be[] = { 0xFE, 0xFF };
+	static const uint8_t bom_utf16le[] = { 0xFF, 0xFE };
+	static const uint8_t bom_utf32be[] = { 0x00, 0x00, 0xFE, 0xFF };
+	static const uint8_t bom_utf32le[] = { 0xFF, 0xFE, 0x00, 0x00 };
+
+	/* MIB enums for the charsets of interest, looked up on first use
+	 * (utf16 and utf32 are looked up but not yet compared against) */
+	static uint16_t utf8;
+	static uint16_t utf16;
+	static uint16_t utf16be;
+	static uint16_t utf16le;
+	static uint16_t utf32;
+	static uint16_t utf32be;
+	static uint16_t utf32le;
+
+	const uint8_t *bom;
+	size_t bom_len;
+
+	if (utf8 == 0) {
+		utf8 = parserutils_charset_mibenum_from_name("UTF-8", 
+				SLEN("UTF-8"));
+		utf16 = parserutils_charset_mibenum_from_name("UTF-16", 
+				SLEN("UTF-16"));
+		utf16be = parserutils_charset_mibenum_from_name("UTF-16BE",
+				SLEN("UTF-16BE"));
+		utf16le = parserutils_charset_mibenum_from_name("UTF-16LE",
+				SLEN("UTF-16LE"));
+		utf32 = parserutils_charset_mibenum_from_name("UTF-32", 
+				SLEN("UTF-32"));
+		utf32be = parserutils_charset_mibenum_from_name("UTF-32BE",
+				SLEN("UTF-32BE"));
+		utf32le = parserutils_charset_mibenum_from_name("UTF-32LE",
+				SLEN("UTF-32LE"));
+	}
+
+	/** \todo Handle unmarked UTF-16 and UTF-32. Endianness is specified 
+	 * by the BOM, if present, or is assumed to be big endian. */
+
+	/* Pick the BOM that belongs to the buffer's character set */
+	if (mibenum == utf8) {
+		bom = bom_utf8;
+		bom_len = sizeof(bom_utf8);
+	} else if (mibenum == utf16be) {
+		bom = bom_utf16be;
+		bom_len = sizeof(bom_utf16be);
+	} else if (mibenum == utf16le) {
+		bom = bom_utf16le;
+		bom_len = sizeof(bom_utf16le);
+	} else if (mibenum == utf32be) {
+		bom = bom_utf32be;
+		bom_len = sizeof(bom_utf32be);
+	} else if (mibenum == utf32le) {
+		bom = bom_utf32le;
+		bom_len = sizeof(bom_utf32le);
+	} else {
+		/* No BOM is recognised for any other character set */
+		return PARSERUTILS_OK;
+	}
+
+	/* Too little data to hold the BOM at all */
+	if (buffer->length < bom_len)
+		return PARSERUTILS_OK;
+
+	/* Leave the buffer alone unless it starts with exactly that BOM */
+	if (memcmp(buffer->data, bom, bom_len) != 0)
+		return PARSERUTILS_OK;
+
+	/* Drop the BOM's bytes from the front of the buffer */
+	return parserutils_buffer_discard(buffer, 0, bom_len);
+}
+
-- 
cgit v1.2.3