/*
 * This file is part of LibCSS.
 * Licensed under the MIT License,
 *                http://www.opensource.org/licenses/mit-license.php
 * Copyright 2008 John-Mark Bell <jmb@netsurf-browser.org>
 */

#include <stdbool.h>
#include <string.h>

#include <parserutils/charset/mibenum.h>

#include "charset/detect.h"
#include "utils/utils.h"

static parserutils_error css_charset_read_bom_or_charset(const uint8_t *data, 
		size_t len, uint16_t *mibenum);
static parserutils_error try_utf32_charset(const uint8_t *data, 
		size_t len, uint16_t *result);
static parserutils_error try_utf16_charset(const uint8_t *data, 
		size_t len, uint16_t *result);
static parserutils_error try_ascii_compatible_charset(const uint8_t *data, 
		size_t len, uint16_t *result);

/**
 * Extract a charset from a chunk of data
 *
 * \param data     Pointer to buffer containing data
 * \param len      Buffer length
 * \param mibenum  Pointer to location containing current MIB enum
 * \param source   Pointer to location containing current charset source
 * \return PARSERUTILS_OK on success, appropriate error otherwise
 *
 * ::mibenum and ::source will be updated on exit
 *
 * CSS 2.1 $4.4
 */
parserutils_error css_charset_extract(const uint8_t *data, size_t len,
		uint16_t *mibenum, uint32_t *source)
{
	parserutils_error error;
	uint16_t charset = 0;

	if (data == NULL || mibenum == NULL || source == NULL)
		return PARSERUTILS_BADPARM;

	/* If the charset was dictated by the client, we've nothing to detect */
	if (*source == CSS_CHARSET_DICTATED)
		return PARSERUTILS_OK;

	/* Look for a BOM and/or @charset */
	error = css_charset_read_bom_or_charset(data, len, &charset);
	if (error != PARSERUTILS_OK)
		return error;

	if (charset != 0) {
		*mibenum = charset;
		*source = CSS_CHARSET_DOCUMENT;

		return PARSERUTILS_OK;
	}

	/* If we've already got a charset from the linking mechanism or 
	 * referring document, then we've nothing further to do */
	if (*source != CSS_CHARSET_DEFAULT)
		return PARSERUTILS_OK;

	/* We've not yet found a charset, so use the default fallback */
	charset = parserutils_charset_mibenum_from_name("UTF-8", SLEN("UTF-8"));

	*mibenum = charset;
	*source = CSS_CHARSET_DEFAULT;

	return PARSERUTILS_OK;
}


/**
 * Inspect the beginning of a buffer of data for the presence of a
 * UTF Byte Order Mark and/or an @charset rule
 *
 * \param data     Pointer to buffer containing data
 * \param len      Buffer length
 * \param mibenum  Pointer to location to receive MIB enum
 * \return PARSERUTILS_OK on success, appropriate error otherwise
 */
parserutils_error css_charset_read_bom_or_charset(const uint8_t *data, 
		size_t len, uint16_t *mibenum)
{
	parserutils_error error;
	uint16_t charset = 0;

	if (data == NULL)
		return PARSERUTILS_BADPARM;

	/* We require at least 4 bytes of data */
	if (len < 4)
		return PARSERUTILS_NEEDDATA;


	/* Look for BOM */
	if (data[0] == 0x00 && data[1] == 0x00 && 
			data[2] == 0xFE && data[3] == 0xFF) {
		charset = parserutils_charset_mibenum_from_name("UTF-32BE", 
				SLEN("UTF-32BE"));
	} else if (data[0] == 0xFF && data[1] == 0xFE &&
			data[2] == 0x00 && data[3] == 0x00) {
		charset = parserutils_charset_mibenum_from_name("UTF-32LE", 
				SLEN("UTF-32LE"));
	} else if (data[0] == 0xFE && data[1] == 0xFF) {
		charset = parserutils_charset_mibenum_from_name("UTF-16BE", 
				SLEN("UTF-16BE"));
	} else if (data[0] == 0xFF && data[1] == 0xFE) {
		charset = parserutils_charset_mibenum_from_name("UTF-16LE", 
				SLEN("UTF-16LE"));
	} else if (data[0] == 0xEF && data[1] == 0xBB && data[2] == 0xBF) {
		charset = parserutils_charset_mibenum_from_name("UTF-8", 
				SLEN("UTF-8"));
	}

	/* BOM beats @charset.
	 * UAs differ here, but none appear to match the spec. 
	 * The spec indicates that any @charset present in conjunction with a 
	 * BOM, should match the BOM. In reality, it appears UAs just take the 
	 * BOM as gospel and ignore any @charset rule. The w3c CSS validator 
	 * appears to do the same (at the least, it doesn't complain about a
	 * mismatch).
	 */
	if (charset != 0) {
		*mibenum = charset;
		return PARSERUTILS_OK;
	}

	error = try_utf32_charset(data, len, &charset);
	if (error == PARSERUTILS_OK && charset != 0) {
		*mibenum = charset;
		return PARSERUTILS_OK;
	}

	error = try_utf16_charset(data, len, &charset);
	if (error == PARSERUTILS_OK && charset != 0) {
		*mibenum = charset;
		return PARSERUTILS_OK;
	}

	error = try_ascii_compatible_charset(data, len, &charset);
	if (error == PARSERUTILS_OK)
		*mibenum = charset;

	return PARSERUTILS_OK;
}

static parserutils_error try_utf32_charset(const uint8_t *data, 
		size_t len, uint16_t *result)
{
	uint16_t charset = 0;

#define CHARSET_BE "\0\0\0@\0\0\0c\0\0\0h\0\0\0a\0\0\0r\0\0\0s\0\0\0e\0\0\0t\0\0\0 \0\0\0\""
#define CHARSET_LE "@\0\0\0c\0\0\0h\0\0\0a\0\0\0r\0\0\0s\0\0\0e\0\0\0t\0\0\0 \0\0\0\"\0\0\0"

	if (len <= SLEN(CHARSET_LE))
		return PARSERUTILS_NEEDDATA;

	/* Look for @charset, assuming UTF-32 source data */
	if (memcmp(data, CHARSET_LE, SLEN(CHARSET_LE)) == 0) {
		const uint8_t *start = data + SLEN(CHARSET_LE);
		const uint8_t *end;
		char buf[8];
		char *ptr = buf;

		/* Look for "; at end of charset declaration */
		for (end = start; end < data + len - 4; end += 4) {
			uint32_t c = end[0] | (end[1] << 8) | 
				     (end[2] << 16) | (end[3] << 24);

			/* Bail if non-ASCII */
			if (c > 0x007f)
				break;

			/* Reached the end? */
			if (c == '"' && end < data + len - 8) {
				uint32_t d = end[4] | (end[5] << 8) |
				    (end[6] << 16) | (end[7] << 24);

				if (d == ';')
					break;
			}

			/* Append to buf, if there's space */
			if ((size_t) (ptr - buf) < sizeof(buf)) {
				/* Uppercase */
				if ('a' <= c && c <= 'z')
					*ptr++ = c & ~0x20;
				else
					*ptr++ = c;
			}
		}

		if (end == data + len - 4) {
			/* Ran out of input */
			return PARSERUTILS_NEEDDATA;
		}

		/* Ensure we have something that looks like UTF-32(LE)? */
		if ((ptr - buf == SLEN("UTF-32LE") && 
				memcmp(buf, "UTF-32LE", ptr - buf) == 0) ||
				(ptr - buf == SLEN("UTF-32") &&
				memcmp(buf, "UTF-32", ptr - buf) == 0)) {
			/* Convert to MIB enum */
			charset = parserutils_charset_mibenum_from_name(
					"UTF-32LE", SLEN("UTF-32LE"));
		}
	} else if (memcmp(data, CHARSET_BE, SLEN(CHARSET_BE)) == 0) {
		const uint8_t *start = data + SLEN(CHARSET_BE);
		const uint8_t *end;
		char buf[8];
		char *ptr = buf;

		/* Look for "; at end of charset declaration */
		for (end = start; end < data + len - 4; end += 4) {
			uint32_t c = end[3] | (end[2] << 8) | 
				     (end[1] << 16) | (end[0] << 24);

			/* Bail if non-ASCII */
			if (c > 0x007f)
				break;

			/* Reached the end? */
			if (c == '"' && end < data + len - 8) {
				uint32_t d = end[7] | (end[6] << 8) |
				    (end[5] << 16) | (end[4] << 24);

				if (d == ';')
					break;
			}

			/* Append to buf, if there's space */
			if ((size_t) (ptr - buf) < sizeof(buf)) {
				/* Uppercase */
				if ('a' <= c && c <= 'z')
					*ptr++ = c & ~0x20;
				else
					*ptr++ = c;
			}
		}

		if (end == data + len - 4) {
			/* Ran out of input */
			return PARSERUTILS_NEEDDATA;
		}

		/* Ensure we have something that looks like UTF-32(BE)? */
		if ((ptr - buf == SLEN("UTF-32BE") && 
				memcmp(buf, "UTF-32BE", ptr - buf) == 0) ||
				(ptr - buf == SLEN("UTF-32") &&
				memcmp(buf, "UTF-32", ptr - buf) == 0)) {
			/* Convert to MIB enum */
			charset = parserutils_charset_mibenum_from_name(
					"UTF-32BE", SLEN("UTF-32BE"));
		}
	}

#undef CHARSET_LE
#undef CHARSET_BE

	*result = charset;

	return PARSERUTILS_OK;
}

static parserutils_error try_utf16_charset(const uint8_t *data, 
		size_t len, uint16_t *result)
{
	uint16_t charset = 0;

#define CHARSET_BE "\0@\0c\0h\0a\0r\0s\0e\0t\0 \0\""
#define CHARSET_LE "@\0c\0h\0a\0r\0s\0e\0t\0 \0\"\0"

	if (len <= SLEN(CHARSET_LE))
		return PARSERUTILS_NEEDDATA;

	/* Look for @charset, assuming UTF-16 source data */
	if (memcmp(data, CHARSET_LE, SLEN(CHARSET_LE)) == 0) {
		const uint8_t *start = data + SLEN(CHARSET_LE);
		const uint8_t *end;
		char buf[8];
		char *ptr = buf;

		/* Look for "; at end of charset declaration */
		for (end = start; end < data + len - 2; end += 2) {
			uint32_t c = end[0] | (end[1] << 8);

			/* Bail if non-ASCII */
			if (c > 0x007f)
				break;

			/* Reached the end? */
			if (c == '"' && end < data + len - 4) {
				uint32_t d = end[2] | (end[3] << 8);

				if (d == ';')
					break;
			}

			/* Append to buf, if there's space */
			if ((size_t) (ptr - buf) < sizeof(buf)) {
				/* Uppercase */
				if ('a' <= c && c <= 'z')
					*ptr++ = c & ~0x20;
				else
					*ptr++ = c;
			}
		}

		if (end == data + len - 2) {
			/* Ran out of input */
			return PARSERUTILS_NEEDDATA;
		}

		/* Ensure we have something that looks like UTF-16(LE)? */
		if ((ptr - buf == SLEN("UTF-16LE") && 
				memcmp(buf, "UTF-16LE", ptr - buf) == 0) ||
				(ptr - buf == SLEN("UTF-16") &&
				memcmp(buf, "UTF-16", ptr - buf) == 0)) {
			/* Convert to MIB enum */
			charset = parserutils_charset_mibenum_from_name(
					"UTF-16LE", SLEN("UTF-16LE"));
		}
	} else if (memcmp(data, CHARSET_BE, SLEN(CHARSET_BE)) == 0) {
		const uint8_t *start = data + SLEN(CHARSET_BE);
		const uint8_t *end;
		char buf[8];
		char *ptr = buf;

		/* Look for "; at end of charset declaration */
		for (end = start; end < data + len - 2; end += 2) {
			uint32_t c = end[1] | (end[0] << 8);

			/* Bail if non-ASCII */
			if (c > 0x007f)
				break;

			/* Reached the end? */
			if (c == '"' && end < data + len - 4) {
				uint32_t d = end[3] | (end[2] << 8);

				if (d == ';')
					break;
			}

			/* Append to buf, if there's space */
			if ((size_t) (ptr - buf) < sizeof(buf)) {
				/* Uppercase */
				if ('a' <= c && c <= 'z')
					*ptr++ = c & ~0x20;
				else
					*ptr++ = c;
			}
		}

		if (end == data + len - 2) {
			/* Ran out of input */
			return PARSERUTILS_NEEDDATA;
		}

		/* Ensure we have something that looks like UTF-16(BE)? */
		if ((ptr - buf == SLEN("UTF-16BE") && 
				memcmp(buf, "UTF-16BE", ptr - buf) == 0) ||
				(ptr - buf == SLEN("UTF-16") &&
				memcmp(buf, "UTF-16", ptr - buf) == 0)) {
			/* Convert to MIB enum */
			charset = parserutils_charset_mibenum_from_name(
					"UTF-16BE", SLEN("UTF-16BE"));
		}
	}

#undef CHARSET_LE
#undef CHARSET_BE

	*result = charset;

	return PARSERUTILS_OK;
}

parserutils_error try_ascii_compatible_charset(const uint8_t *data, size_t len,
		uint16_t *result)
{
	uint16_t charset = 0;

#define CHARSET "@charset \""

	if (len <= SLEN(CHARSET))
		return PARSERUTILS_NEEDDATA;

	/* Look for @charset, assuming ASCII-compatible source data */
	if (memcmp(data, CHARSET, SLEN(CHARSET)) == 0) {
		const uint8_t *start = data + SLEN(CHARSET);
		const uint8_t *end;

		/* Look for "; at end of charset declaration */
		for (end = start; end < data + len; end++) {
			if (*end == '"' && end < data + len - 1 && 
					*(end + 1) == ';')
				break;
		}

		if (end == data + len) {
			/* Ran out of input */
			return PARSERUTILS_NEEDDATA;
		}

		/* Convert to MIB enum */
		charset = parserutils_charset_mibenum_from_name(
				(const char *) start,  end - start);

		/* Any non-ASCII compatible charset must be ignored, as
		 * we've just used an ASCII parser to read it. */
		if (charset == parserutils_charset_mibenum_from_name(
					"UTF-32", SLEN("UTF-32")) ||
			charset == parserutils_charset_mibenum_from_name(
					"UTF-32LE", SLEN("UTF-32LE")) ||
			charset == parserutils_charset_mibenum_from_name(
					"UTF-32BE", SLEN("UTF-32BE")) ||
			charset == parserutils_charset_mibenum_from_name(
					"UTF-16", SLEN("UTF-16")) ||
			charset == parserutils_charset_mibenum_from_name(
					"UTF-16LE", SLEN("UTF-16LE")) ||
			charset == parserutils_charset_mibenum_from_name(
					"UTF-16BE", SLEN("UTF-16BE"))) {

			charset = 0;
		}
	}

#undef CHARSET

	*result = charset;

	return PARSERUTILS_OK;
}