summaryrefslogtreecommitdiff
path: root/src/charset
diff options
context:
space:
mode:
Diffstat (limited to 'src/charset')
-rw-r--r--src/charset/Makefile46
-rw-r--r--src/charset/detect.c161
-rw-r--r--src/charset/detect.h24
3 files changed, 231 insertions, 0 deletions
diff --git a/src/charset/Makefile b/src/charset/Makefile
new file mode 100644
index 0000000..dda58c1
--- /dev/null
+++ b/src/charset/Makefile
@@ -0,0 +1,46 @@
+# Child makefile fragment
+#
+# Toolchain is provided by top-level makefile
+#
+# Variables provided by top-level makefile
+#
+# COMPONENT The name of the component
+# EXPORT The location of the export directory
+# TOP The location of the source tree root
+# RELEASEDIR The place to put release objects
+# DEBUGDIR The place to put debug objects
+#
+# do_include Canned command sequence to include a child makefile
+#
+# Variables provided by parent makefile:
+#
+# DIR The name of the directory we're in, relative to $(TOP)
+#
+# Variables we can manipulate:
+#
+# ITEMS_CLEAN The list of items to remove for "make clean"
+# ITEMS_DISTCLEAN The list of items to remove for "make distclean"
+# TARGET_TESTS The list of target names to run for "make test"
+#
+# SOURCES The list of sources to build for $(COMPONENT)
+#
+# Plus anything from the toolchain
+
+# Push parent directory onto the directory stack
+sp := $(sp).x
+dirstack_$(sp) := $(d)
+d := $(DIR)
+
+# Sources
+SRCS_$(d) := detect.c
+
+# Append to sources for component
+SOURCES += $(addprefix $(d), $(SRCS_$(d)))
+
+# Now include any children we may have
+MAKE_INCLUDES := $(wildcard $(d)*/Makefile)
+$(eval $(foreach INC, $(MAKE_INCLUDES), $(call do_include,$(INC))))
+
+# Finally, pop off the directory stack
+d := $(dirstack_$(sp))
+sp := $(basename $(sp))
diff --git a/src/charset/detect.c b/src/charset/detect.c
new file mode 100644
index 0000000..11ee699
--- /dev/null
+++ b/src/charset/detect.c
@@ -0,0 +1,161 @@
+/*
+ * This file is part of LibCSS.
+ * Licensed under the MIT License,
+ * http://www.opensource.org/licenses/mit-license.php
+ * Copyright 2008 John-Mark Bell <jmb@netsurf-browser.org>
+ */
+
+#include <stdbool.h>
+#include <string.h>
+
+#include <parserutils/charset/mibenum.h>
+
+#include "charset/detect.h"
+#include "utils/utils.h"
+
+static parserutils_error css_charset_read_bom_or_charset(const uint8_t *data,
+ size_t len, uint16_t *mibenum);
+
+/**
+ * Extract a charset from a chunk of data
+ *
+ * \param data Pointer to buffer containing data
+ * \param len Buffer length
+ * \param mibenum Pointer to location containing current MIB enum
+ * \param source Pointer to location containing current charset source
+ * \return PARSERUTILS_OK on success, appropriate error otherwise
+ *
+ * ::mibenum and ::source will be updated on exit
+ *
+ * CSS 2.1 $4.4
+ */
+parserutils_error css_charset_extract(const uint8_t *data, size_t len,
+ uint16_t *mibenum, uint32_t *source)
+{
+ css_error error;
+ uint16_t charset = 0;
+
+ if (data == NULL || mibenum == NULL || source == NULL)
+ return PARSERUTILS_BADPARM;
+
+ /* If the charset was dictated by the client, we've nothing to detect */
+ if (*source == CSS_CHARSET_DICTATED)
+ return PARSERUTILS_OK;
+
+ /* We need at least 4 bytes of data */
+ if (len < 4)
+ goto default_encoding;
+
+ /* Look for a BOM and/or @charset */
+ error = css_charset_read_bom_or_charset(data, len, &charset);
+ if (error != PARSERUTILS_OK)
+ return error;
+
+ if (charset != 0) {
+ *mibenum = charset;
+ *source = CSS_CHARSET_DOCUMENT;
+
+ return PARSERUTILS_OK;
+ }
+
+ /* If we've already got a charset from the linking mechanism or
+ * referring document, then we've nothing further to do */
+ if (*source != CSS_CHARSET_DEFAULT)
+ return PARSERUTILS_OK;
+
+ /* We've not yet found a charset, so use the default fallback */
+default_encoding:
+
+ charset = parserutils_charset_mibenum_from_name("UTF-8", SLEN("UTF-8"));
+
+ *mibenum = charset;
+ *source = CSS_CHARSET_DEFAULT;
+
+ return PARSERUTILS_OK;
+}
+
+
+/**
+ * Inspect the beginning of a buffer of data for the presence of a
+ * UTF Byte Order Mark and/or an @charset rule
+ *
+ * \param data Pointer to buffer containing data
+ * \param len Buffer length
+ * \param mibenum Pointer to location to receive MIB enum
+ * \return PARSERUTILS_OK on success, appropriate error otherwise
+ */
+parserutils_error css_charset_read_bom_or_charset(const uint8_t *data,
+ size_t len, uint16_t *mibenum)
+{
+ uint16_t charset = 0;
+
+ if (data == NULL)
+ return PARSERUTILS_BADPARM;
+
+ /* We require at least 4 bytes of data */
+ if (len < 4)
+ return PARSERUTILS_NEEDDATA;
+
+
+ /* Look for BOM */
+ if (data[0] == 0x00 && data[1] == 0x00 &&
+ data[2] == 0xFE && data[3] == 0xFF) {
+ charset = parserutils_charset_mibenum_from_name("UTF-32BE",
+ SLEN("UTF-32BE"));
+ } else if (data[0] == 0xFF && data[1] == 0xFE &&
+ data[2] == 0x00 && data[3] == 0x00) {
+ charset = parserutils_charset_mibenum_from_name("UTF-32LE",
+ SLEN("UTF-32LE"));
+ } else if (data[0] == 0xFE && data[1] == 0xFF) {
+ charset = parserutils_charset_mibenum_from_name("UTF-16BE",
+ SLEN("UTF-16BE"));
+ } else if (data[0] == 0xFF && data[1] == 0xFE) {
+ charset = parserutils_charset_mibenum_from_name("UTF-16LE",
+ SLEN("UTF-16LE"));
+ } else if (data[0] == 0xEF && data[1] == 0xBB && data[2] == 0xBF) {
+ charset = parserutils_charset_mibenum_from_name("UTF-8",
+ SLEN("UTF-8"));
+ }
+
+ /* BOM beats @charset.
+ * UAs differ here, but none appear to match the spec.
+ * The spec indicates that any @charset present in conjunction with a
+ * BOM, should match the BOM. In reality, it appears UAs just take the
+ * BOM as gospel and ignore any @charset rule. The w3c CSS validator
+ * appears to do the same (at the least, it doesn't complain about a
+ * mismatch).
+ */
+ if (charset != 0) {
+ *mibenum = charset;
+ return PARSERUTILS_OK;
+ }
+
+ /** \todo UTF-32 and UTF-16 @charset support */
+
+ /* Look for @charset, assuming ASCII-compatible source data */
+ if (len > 10 && strncmp((const char *) data, "@charset \"",
+ SLEN("@charset \"")) == 0) {
+ const uint8_t *end;
+
+ /* Look for "; at end of charset declaration */
+ for (end = data + 10; end < data + len; end++) {
+ if (*end == '"' && end < data + len - 1 &&
+ *(end + 1) == ';')
+ break;
+ }
+
+ if (end == data + len) {
+ /* Ran out of input */
+ return PARSERUTILS_NEEDDATA;
+ }
+
+ /* Convert to MIB enum */
+ charset = parserutils_charset_mibenum_from_name(
+ (char *) data + 10, end - data - 10);
+ }
+
+ *mibenum = charset;
+
+ return PARSERUTILS_OK;
+}
+
diff --git a/src/charset/detect.h b/src/charset/detect.h
new file mode 100644
index 0000000..d907921
--- /dev/null
+++ b/src/charset/detect.h
@@ -0,0 +1,24 @@
+/*
+ * This file is part of LibCSS.
+ * Licensed under the MIT License,
+ * http://www.opensource.org/licenses/mit-license.php
+ * Copyright 2007 John-Mark Bell <jmb@netsurf-browser.org>
+ */
+
+#ifndef css_charset_detect_h_
+#define css_charset_detect_h_
+
+#include <inttypes.h>
+
+#include <libcss/errors.h>
+#include <libcss/functypes.h>
+#include <libcss/types.h>
+
+#include <parserutils/errors.h>
+
+/* Extract a charset from a chunk of data */
+parserutils_error css_charset_extract(const uint8_t *data, size_t len,
+ uint16_t *mibenum, uint32_t *source);
+
+#endif
+