-rw-r--r--  docs/Macros | 60
-rw-r--r--  include/hubbub/functypes.h | 6
-rw-r--r--  include/hubbub/parser.h | 6
-rw-r--r--  include/hubbub/types.h | 11
-rw-r--r--  src/Makefile | 3
-rw-r--r--  src/charset/Makefile | 2
-rw-r--r--  src/charset/aliases.c | 361
-rw-r--r--  src/charset/aliases.h | 42
-rw-r--r--  src/charset/codec.c | 188
-rw-r--r--  src/charset/codec.h | 153
-rw-r--r--  src/charset/codec_iconv.c | 837
-rw-r--r--  src/charset/codec_impl.h | 51
-rw-r--r--  src/charset/codec_utf16.c | 620
-rw-r--r--  src/charset/codec_utf8.c | 620
-rw-r--r--  src/charset/detect.c | 153
-rw-r--r--  src/charset/detect.h | 8
-rw-r--r--  src/hubbub.c | 12
-rw-r--r--  src/input/Makefile | 46
-rw-r--r--  src/input/filter.c | 380
-rw-r--r--  src/input/filter.h | 57
-rw-r--r--  src/input/inputstream.c | 481
-rw-r--r--  src/input/inputstream.h | 98
-rw-r--r--  src/input/streamimpl.h | 77
-rw-r--r--  src/input/utf16_stream.c | 605
-rw-r--r--  src/input/utf8_stream.c | 562
-rw-r--r--  src/parser.c | 70
-rw-r--r--  src/tokeniser/tokeniser.c | 3565
-rw-r--r--  src/tokeniser/tokeniser.h | 10
-rw-r--r--  src/treebuilder/after_body.c | 6
-rw-r--r--  src/treebuilder/after_head.c | 3
-rw-r--r--  src/treebuilder/before_head.c | 3
-rw-r--r--  src/treebuilder/before_html.c | 3
-rw-r--r--  src/treebuilder/generic_rcdata.c | 17
-rw-r--r--  src/treebuilder/in_body.c | 76
-rw-r--r--  src/treebuilder/in_foreign_content.c | 21
-rw-r--r--  src/treebuilder/in_head.c | 2
-rw-r--r--  src/treebuilder/in_row.c | 3
-rw-r--r--  src/treebuilder/in_table.c | 16
-rw-r--r--  src/treebuilder/in_table_body.c | 3
-rw-r--r--  src/treebuilder/initial.c | 11
-rw-r--r--  src/treebuilder/internal.h | 21
-rw-r--r--  src/treebuilder/script_collect.c | 6
-rw-r--r--  src/treebuilder/treebuilder.c | 115
-rw-r--r--  src/treebuilder/treebuilder.h | 6
-rw-r--r--  src/utils/Makefile | 2
-rw-r--r--  src/utils/utf16.c | 239
-rw-r--r--  src/utils/utf16.h | 38
-rw-r--r--  src/utils/utf8.c | 368
-rw-r--r--  src/utils/utf8.h | 38
-rw-r--r--  test/INDEX | 12
-rw-r--r--  test/Makefile | 9
-rw-r--r--  test/aliases.c | 61
-rw-r--r--  test/cscodec.c | 247
-rw-r--r--  test/csdetect.c | 18
-rw-r--r--  test/data/cscodec/INDEX | 5
-rw-r--r--  test/data/cscodec/simple.dat | bin 1193 -> 0 bytes
-rw-r--r--  test/data/csdetect/INDEX | 1
-rw-r--r--  test/data/csdetect/regression.dat | 5
-rw-r--r--  test/data/tree-construction/INDEX | 1
-rw-r--r--  test/data/tree-construction/regression.dat | 31
-rw-r--r--  test/filter.c | 355
-rw-r--r--  test/inputstream.c | 126
-rw-r--r--  test/parser-utf16.c | 195
-rw-r--r--  test/parser.c | 44
-rw-r--r--  test/regression/cscodec-segv.c | 37
-rw-r--r--  test/regression/filter-segv.c | 38
-rw-r--r--  test/regression/stream-nomem.c | 88
-rw-r--r--  test/tokeniser.c | 54
-rw-r--r--  test/tokeniser2.c | 107
-rw-r--r--  test/tokeniser3.c | 108
-rw-r--r--  test/tree.c | 49
-rw-r--r--  test/tree2.c | 77
72 files changed, 2140 insertions, 9609 deletions
diff --git a/docs/Macros b/docs/Macros
new file mode 100644
index 0000000..f301a98
--- /dev/null
+++ b/docs/Macros
@@ -0,0 +1,60 @@
+The data which Hubbub is fed (the input stream) gets buffered into a UTF-8
+buffer. This buffer only holds a subset of the input stream at any given time.
+To avoid unnecessary copying (which costs both speed and memory), Hubbub
+tries to make all emitted strings point into this buffer, which is then
+advanced after tokens have been emitted. This is not always possible,
+however, because HTML5 specifies behaviour which requires certain characters
+to be replaced with others, and the replacement may not have the same length
+as the original. These cases are:
+
+ - CR handling -- CRLFs and CRs are converted to LFs
+ - tag and attribute names are lowercased
+ - entities are allowed in attribute values
+ - NUL bytes must be turned into U+FFFD REPLACEMENT CHARACTER
+
+When collecting the strings it will emit, Hubbub starts by assuming that no
+transformations on the input stream will be required. However, if it hits one
+of the above cases, it copies all of the characters collected so far into a
+buffer and switches to using that instead. As a result, every time a
+character is collected into a string that might be buffered, the code must
+check whether the character should be appended to the buffer instead. To
+allow this check, and others, to happen when necessary and never otherwise,
+Hubbub uses a set of macros to collect characters, detailed below (a short
+illustrative sketch follows the macro list).
+
+Hubbub strings are (beginning,length) pairs. This means that once the
+beginning is set to a position in the input stream, the string can collect
+further character runs in the stream simply by adding to the length part. This
+makes extending strings very efficient.
+
+ | COLLECT(hubbub_string str, uintptr_t cptr, size_t length)
+
+ This collects the character pointed to "cptr" (of size "length") into "str",
+  This collects the character pointed to by "cptr" (of size "length") into "str",
+ points to collected characters.
+
+ | COLLECT_NOBUF(hubbub_string str, size_t length)
+
+ This collects "length" bytes into "str", but only if "str" already points to
+ collected characters. (There is no need to pass the character, since this
+ just increases the length of the string.)
+
+ | COLLECT_MS(hubbub_string str, uintptr_t cptr, size_t length)
+
+ If "str" is currently zero-length, this acts like START(str, cptr, length).
+ Otherwise, it just acts like COLLECT(str, cptr, length).
+
+ | START(hubbub_string str, uintptr_t cptr, size_t length)
+
+ This sets the string "str"'s start to "cptr" and its length to "length".
+
+ | START_BUF(hubbub_string str, uintptr_t cptr, size_t length)
+
+  This buffers the character of length "length" pointed to by "cptr" and then
+ sets "str" to point to it.
+
+ | SWITCH(hubbub_string str)
+
+  This switches the string "str" from unbuffered to buffered; it copies all
+  characters currently collected in "str" into the buffer and then updates
+  "str" to point there.
+
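+As an illustration of the unbuffered-to-buffered switch described above, here
+is a minimal, self-contained sketch. It is not the real macro implementation:
+the struct layout, the "buffered" flag, the fixed-size buffer and the helper
+names collect() and switch_to_buf() are invented for the example; they only
+mirror the behaviour the macros above describe.
+
+    #include <ctype.h>
+    #include <stdio.h>
+    #include <string.h>
+
+    /* Simplified stand-in for a (pointer, length) string */
+    struct str {
+        const unsigned char *ptr;
+        size_t len;
+        int buffered;              /* set once the string points into buf */
+    };
+
+    static unsigned char buf[64];  /* stand-in for the tokeniser's buffer */
+    static size_t buf_len;
+
+    /* Roughly the START / COLLECT behaviour: point at the input, or just
+     * extend the length; once the string has been switched, bytes are
+     * appended to the buffer instead. */
+    static void collect(struct str *s, const unsigned char *c, size_t len)
+    {
+        if (s->buffered) {
+            memcpy(buf + buf_len, c, len);
+            buf_len += len;
+        } else if (s->len == 0) {
+            s->ptr = c;
+        }
+        s->len += len;
+    }
+
+    /* Roughly the SWITCH behaviour: copy everything collected so far into
+     * the buffer and repoint the string at the copy. */
+    static void switch_to_buf(struct str *s)
+    {
+        memcpy(buf + buf_len, s->ptr, s->len);
+        s->ptr = buf + buf_len;
+        buf_len += s->len;
+        s->buffered = 1;
+    }
+
+    int main(void)
+    {
+        const unsigned char *input = (const unsigned char *) "diV class";
+        struct str name = { NULL, 0, 0 };
+        unsigned char lower;
+
+        collect(&name, input, 1);      /* 'd': string points into input */
+        collect(&name, input + 1, 1);  /* 'i': only the length grows */
+
+        /* 'V' must be lowercased, so the string can no longer alias the
+         * input; switch it, then collect the transformed byte. */
+        switch_to_buf(&name);
+        lower = (unsigned char) tolower(input[2]);
+        collect(&name, &lower, 1);
+
+        printf("%.*s\n", (int) name.len, (const char *) name.ptr);
+        return 0;
+    }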
diff --git a/include/hubbub/functypes.h b/include/hubbub/functypes.h
index 0d45e6a..c6dbee2 100644
--- a/include/hubbub/functypes.h
+++ b/include/hubbub/functypes.h
@@ -23,12 +23,6 @@ typedef void *(*hubbub_alloc)(void *ptr, size_t size, void *pw);
typedef void (*hubbub_token_handler)(const hubbub_token *token, void *pw);
/**
- * Type of document buffer handling function
- */
-typedef void (*hubbub_buffer_handler)(const uint8_t *data,
- size_t len, void *pw);
-
-/**
* Type of parse error handling function
*/
typedef void (*hubbub_error_handler)(uint32_t line, uint32_t col,
diff --git a/include/hubbub/parser.h b/include/hubbub/parser.h
index f7d8e1e..4249426 100644
--- a/include/hubbub/parser.h
+++ b/include/hubbub/parser.h
@@ -22,7 +22,6 @@ typedef struct hubbub_parser hubbub_parser;
*/
typedef enum hubbub_parser_opttype {
HUBBUB_PARSER_TOKEN_HANDLER,
- HUBBUB_PARSER_BUFFER_HANDLER,
HUBBUB_PARSER_ERROR_HANDLER,
HUBBUB_PARSER_CONTENT_MODEL,
HUBBUB_PARSER_TREE_HANDLER,
@@ -39,11 +38,6 @@ typedef union hubbub_parser_optparams {
} token_handler;
struct {
- hubbub_buffer_handler handler;
- void *pw;
- } buffer_handler;
-
- struct {
hubbub_error_handler handler;
void *pw;
} error_handler;
diff --git a/include/hubbub/types.h b/include/hubbub/types.h
index 5b49786..42d1460 100644
--- a/include/hubbub/types.h
+++ b/include/hubbub/types.h
@@ -70,16 +70,7 @@ typedef enum hubbub_ns {
* Tokeniser string type
*/
typedef struct hubbub_string {
- enum {
- HUBBUB_STRING_OFF,
- HUBBUB_STRING_PTR
- } type;
-
- union {
- const uint8_t *ptr; /**< Pointer to data */
- uint32_t off; /**< Byte offset of string start */
- } data;
-
+ const uint8_t *ptr; /**< Pointer to data */
size_t len; /**< Byte length of string */
} hubbub_string;
diff --git a/src/Makefile b/src/Makefile
index 1c733f7..c4b53ff 100644
--- a/src/Makefile
+++ b/src/Makefile
@@ -32,7 +32,8 @@ dirstack_$(sp) := $(d)
d := $(DIR)
# Manipulate include paths
-CFLAGS := $(CFLAGS) -I$(d)
+CFLAGS := $(CFLAGS) -I$(d) `pkg-config libparserutils --cflags`
+LDFLAGS := $(LDFLAGS) `pkg-config libparserutils --libs`
# Sources
SRCS_$(d) := hubbub.c parser.c
diff --git a/src/charset/Makefile b/src/charset/Makefile
index 2e76730..3bdb77f 100644
--- a/src/charset/Makefile
+++ b/src/charset/Makefile
@@ -32,7 +32,7 @@ dirstack_$(sp) := $(d)
d := $(DIR)
# Sources
-SRCS_$(d) := aliases.c codec.c codec_iconv.c codec_utf8.c codec_utf16.c detect.c
+SRCS_$(d) := detect.c
# Append to sources for component
SOURCES += $(addprefix $(d), $(SRCS_$(d)))
diff --git a/src/charset/aliases.c b/src/charset/aliases.c
deleted file mode 100644
index dcf6de2..0000000
--- a/src/charset/aliases.c
+++ /dev/null
@@ -1,361 +0,0 @@
-/*
- * This file is part of Hubbub.
- * Licensed under the MIT License,
- * http://www.opensource.org/licenses/mit-license.php
- * Copyright 2007 John-Mark Bell <jmb@netsurf-browser.org>
- */
-
-#include <ctype.h>
-#include <stdbool.h>
-#include <stddef.h>
-#include <stdio.h>
-#include <stdlib.h>
-#include <string.h>
-
-#include "charset/aliases.h"
-
-struct alias {
- struct alias *next;
- hubbub_aliases_canon *canon;
- uint16_t name_len;
- char name[1];
-};
-
-#define HASH_SIZE (43)
-static hubbub_aliases_canon *canon_tab[HASH_SIZE];
-static struct alias *alias_tab[HASH_SIZE];
-
-static hubbub_error hubbub_create_alias(const char *alias,
- hubbub_aliases_canon *c, hubbub_alloc alloc, void *pw);
-static hubbub_aliases_canon *hubbub_create_canon(const char *canon,
- uint16_t mibenum, hubbub_alloc alloc, void *pw);
-static uint32_t hubbub_hash_val(const char *alias, size_t len);
-
-/**
- * Create alias data from Aliases file
- *
- * \param filename The path to the Aliases file
- * \param alloc Memory (de)allocation function
- * \param pw Pointer to client-specific private data (may be NULL)
- * \return HUBBUB_OK on success, appropriate error otherwise.
- */
-hubbub_error hubbub_aliases_create(const char *filename,
- hubbub_alloc alloc, void *pw)
-{
- char buf[300];
- FILE *fp;
-
- if (filename == NULL || alloc == NULL)
- return HUBBUB_BADPARM;
-
- fp = fopen(filename, "r");
- if (fp == NULL)
- return HUBBUB_FILENOTFOUND;
-
- while (fgets(buf, sizeof buf, fp)) {
- char *p, *aliases = 0, *mib, *end;
- hubbub_aliases_canon *cf;
-
- if (buf[0] == 0 || buf[0] == '#')
- /* skip blank lines or comments */
- continue;
-
- buf[strlen(buf) - 1] = 0; /* lose terminating newline */
- end = buf + strlen(buf);
-
- /* find end of canonical form */
- for (p = buf; *p && !isspace(*p) && !iscntrl(*p); p++)
- ; /* do nothing */
- if (p >= end)
- continue;
- *p++ = '\0'; /* terminate canonical form */
-
- /* skip whitespace */
- for (; *p && isspace(*p); p++)
- ; /* do nothing */
- if (p >= end)
- continue;
- mib = p;
-
- /* find end of mibenum */
- for (; *p && !isspace(*p) && !iscntrl(*p); p++)
- ; /* do nothing */
- if (p < end)
- *p++ = '\0'; /* terminate mibenum */
-
- cf = hubbub_create_canon(buf, atoi(mib), alloc, pw);
- if (cf == NULL)
- continue;
-
- /* skip whitespace */
- for (; p < end && *p && isspace(*p); p++)
- ; /* do nothing */
- if (p >= end)
- continue;
- aliases = p;
-
- while (p < end) {
- /* find end of alias */
- for (; *p && !isspace(*p) && !iscntrl(*p); p++)
- ; /* do nothing */
- if (p > end)
- /* stop if we've gone past the end */
- break;
- /* terminate current alias */
- *p++ = '\0';
-
- if (hubbub_create_alias(aliases, cf,
- alloc, pw) != HUBBUB_OK)
- break;
-
- /* in terminating, we may have advanced
- * past the end - check this here */
- if (p >= end)
- break;
-
- /* skip whitespace */
- for (; *p && isspace(*p); p++)
- ; /* do nothing */
-
- if (p >= end)
- /* gone past end => stop */
- break;
-
- /* update pointer to current alias */
- aliases = p;
- }
- }
-
- fclose(fp);
-
- return HUBBUB_OK;
-}
-
-/**
- * Free all alias data
- *
- * \param alloc Memory (de)allocation function
- * \param pw Pointer to client-specific private data
- */
-void hubbub_aliases_destroy(hubbub_alloc alloc, void *pw)
-{
- hubbub_aliases_canon *c, *d;
- struct alias *a, *b;
- int i;
-
- for (i = 0; i != HASH_SIZE; i++) {
- for (c = canon_tab[i]; c; c = d) {
- d = c->next;
- alloc(c, 0, pw);
- }
- canon_tab[i] = NULL;
-
- for (a = alias_tab[i]; a; a = b) {
- b = a->next;
- alloc(a, 0, pw);
- }
- alias_tab[i] = NULL;
- }
-}
-
-/**
- * Retrieve the MIB enum value assigned to an encoding name
- *
- * \param alias The alias to lookup
- * \param len The length of the alias string
- * \return The MIB enum value, or 0 if not found
- */
-uint16_t hubbub_mibenum_from_name(const char *alias, size_t len)
-{
- hubbub_aliases_canon *c;
-
- if (alias == NULL)
- return 0;
-
- c = hubbub_alias_canonicalise(alias, len);
- if (c == NULL)
- return 0;
-
- return c->mib_enum;
-}
-
-/**
- * Retrieve the canonical name of an encoding from the MIB enum
- *
- * \param mibenum The MIB enum value
- * \return Pointer to canonical name, or NULL if not found
- */
-const char *hubbub_mibenum_to_name(uint16_t mibenum)
-{
- int i;
- hubbub_aliases_canon *c;
-
- for (i = 0; i != HASH_SIZE; i++)
- for (c = canon_tab[i]; c; c = c->next)
- if (c->mib_enum == mibenum)
- return c->name;
-
- return NULL;
-}
-
-
-/**
- * Retrieve the canonical form of an alias name
- *
- * \param alias The alias name
- * \param len The length of the alias name
- * \return Pointer to canonical form or NULL if not found
- */
-hubbub_aliases_canon *hubbub_alias_canonicalise(const char *alias,
- size_t len)
-{
- uint32_t hash;
- hubbub_aliases_canon *c;
- struct alias *a;
-
- if (alias == NULL)
- return NULL;
-
- hash = hubbub_hash_val(alias, len);
-
- for (c = canon_tab[hash]; c; c = c->next)
- if (c->name_len == len &&
- strncasecmp(c->name, alias, len) == 0)
- break;
- if (c)
- return c;
-
- for (a = alias_tab[hash]; a; a = a->next)
- if (a->name_len == len &&
- strncasecmp(a->name, alias, len) == 0)
- break;
- if (a)
- return a->canon;
-
- return NULL;
-}
-
-
-/**
- * Create an alias
- *
- * \param alias The alias name
- * \param c The canonical form
- * \param alloc Memory (de)allocation function
- * \param pw Pointer to client-specific private data (may be NULL)
- * \return HUBBUB_OK on success, appropriate error otherwise
- */
-hubbub_error hubbub_create_alias(const char *alias, hubbub_aliases_canon *c,
- hubbub_alloc alloc, void *pw)
-{
- struct alias *a;
- uint32_t hash;
-
- if (alias == NULL || c == NULL || alloc == NULL)
- return HUBBUB_BADPARM;
-
- a = alloc(NULL, sizeof(struct alias) + strlen(alias) + 1, pw);
- if (a == NULL)
- return HUBBUB_NOMEM;
-
- a->canon = c;
- a->name_len = strlen(alias);
- strcpy(a->name, alias);
- a->name[a->name_len] = '\0';
-
- hash = hubbub_hash_val(alias, a->name_len);
-
- a->next = alias_tab[hash];
- alias_tab[hash] = a;
-
- return HUBBUB_OK;
-}
-
-/**
- * Create a canonical form
- *
- * \param canon The canonical name
- * \param mibenum The MIB enum value
- * \param alloc Memory (de)allocation function
- * \param pw Pointer to client-specific private data (may be NULL)
- * \return Pointer to canonical form or NULL on error
- */
-hubbub_aliases_canon *hubbub_create_canon(const char *canon,
- uint16_t mibenum, hubbub_alloc alloc, void *pw)
-{
- hubbub_aliases_canon *c;
- uint32_t hash, len;
-
- if (canon == NULL || alloc == NULL)
- return NULL;
-
- len = strlen(canon);
-
- c = alloc(NULL, sizeof(hubbub_aliases_canon) + len + 1, pw);
- if (c == NULL)
- return NULL;
-
- c->mib_enum = mibenum;
- c->name_len = len;
- strcpy(c->name, canon);
- c->name[len] = '\0';
-
- hash = hubbub_hash_val(canon, len);
-
- c->next = canon_tab[hash];
- canon_tab[hash] = c;
-
- return c;
-}
-
-/**
- * Hash function
- *
- * \param alias String to hash
- * \return The hashed value
- */
-uint32_t hubbub_hash_val(const char *alias, size_t len)
-{
- const char *s = alias;
- uint32_t h = 5381;
-
- if (alias == NULL)
- return 0;
-
- while (len--)
- h = (h * 33) ^ (*s++ & ~0x20); /* case insensitive */
-
- return h % HASH_SIZE;
-}
-
-
-#ifndef NDEBUG
-/**
- * Dump all alias data to stdout
- */
-void hubbub_aliases_dump(void)
-{
- hubbub_aliases_canon *c;
- struct alias *a;
- int i;
- size_t size = 0;
-
- for (i = 0; i != HASH_SIZE; i++) {
- for (c = canon_tab[i]; c; c = c->next) {
- printf("%d %s\n", i, c->name);
- size += offsetof(hubbub_aliases_canon, name) +
- c->name_len;
- }
-
- for (a = alias_tab[i]; a; a = a->next) {
- printf("%d %s\n", i, a->name);
- size += offsetof(struct alias, name) + a->name_len;
- }
- }
-
- size += (sizeof(canon_tab) / sizeof(canon_tab[0]));
- size += (sizeof(alias_tab) / sizeof(alias_tab[0]));
-
- printf("%u\n", (unsigned int) size);
-}
-#endif
diff --git a/src/charset/aliases.h b/src/charset/aliases.h
deleted file mode 100644
index e0505d0..0000000
--- a/src/charset/aliases.h
+++ /dev/null
@@ -1,42 +0,0 @@
-/*
- * This file is part of Hubbub.
- * Licensed under the MIT License,
- * http://www.opensource.org/licenses/mit-license.php
- * Copyright 2007 John-Mark Bell <jmb@netsurf-browser.org>
- */
-
-#ifndef hubbub_charset_aliases_h_
-#define hubbub_charset_aliases_h_
-
-#include <inttypes.h>
-
-#include <hubbub/errors.h>
-#include <hubbub/functypes.h>
-
-typedef struct hubbub_aliases_canon {
- struct hubbub_aliases_canon *next;
- uint16_t mib_enum;
- uint16_t name_len;
- char name[1];
-} hubbub_aliases_canon;
-
-/* Load encoding aliases from file */
-hubbub_error hubbub_aliases_create(const char *filename,
- hubbub_alloc alloc, void *pw);
-/* Destroy encoding aliases */
-void hubbub_aliases_destroy(hubbub_alloc alloc, void *pw);
-
-/* Convert an encoding alias to a MIB enum value */
-uint16_t hubbub_mibenum_from_name(const char *alias, size_t len);
-/* Convert a MIB enum value into an encoding alias */
-const char *hubbub_mibenum_to_name(uint16_t mibenum);
-
-/* Canonicalise an alias name */
-hubbub_aliases_canon *hubbub_alias_canonicalise(const char *alias,
- size_t len);
-
-#ifndef NDEBUG
-void hubbub_aliases_dump(void);
-#endif
-
-#endif
diff --git a/src/charset/codec.c b/src/charset/codec.c
deleted file mode 100644
index 727d600..0000000
--- a/src/charset/codec.c
+++ /dev/null
@@ -1,188 +0,0 @@
-/*
- * This file is part of Hubbub.
- * Licensed under the MIT License,
- * http://www.opensource.org/licenses/mit-license.php
- * Copyright 2007 John-Mark Bell <jmb@netsurf-browser.org>
- */
-
-#include <string.h>
-
-#include "charset/aliases.h"
-
-#include "codec_impl.h"
-
-extern hubbub_charsethandler hubbub_iconv_codec_handler;
-extern hubbub_charsethandler hubbub_utf8_codec_handler;
-extern hubbub_charsethandler hubbub_utf16_codec_handler;
-
-static hubbub_charsethandler *handler_table[] = {
- &hubbub_utf8_codec_handler,
- &hubbub_utf16_codec_handler,
- &hubbub_iconv_codec_handler,
- NULL,
-};
-
-/**
- * Create a charset codec
- *
- * \param charset Target charset
- * \param alloc Memory (de)allocation function
- * \param pw Pointer to client-specific private data (may be NULL)
- * \return Pointer to codec instance, or NULL on failure
- */
-hubbub_charsetcodec *hubbub_charsetcodec_create(const char *charset,
- hubbub_alloc alloc, void *pw)
-{
- hubbub_charsetcodec *codec;
- hubbub_charsethandler **handler;
- const hubbub_aliases_canon * canon;
-
- if (charset == NULL || alloc == NULL)
- return NULL;
-
- /* Canonicalise charset name. */
- canon = hubbub_alias_canonicalise(charset, strlen(charset));
- if (canon == NULL)
- return NULL;
-
- /* Search for handler class */
- for (handler = handler_table; *handler != NULL; handler++) {
- if ((*handler)->handles_charset(canon->name))
- break;
- }
-
- /* None found */
- if ((*handler) == NULL)
- return NULL;
-
- /* Instantiate class */
- codec = (*handler)->create(canon->name, alloc, pw);
- if (codec == NULL)
- return NULL;
-
- /* and initialise it */
- codec->mibenum = canon->mib_enum;
-
- codec->filter = NULL;
- codec->filter_pw = NULL;
-
- codec->errormode = HUBBUB_CHARSETCODEC_ERROR_LOOSE;
-
- codec->alloc = alloc;
- codec->alloc_pw = pw;
-
- return codec;
-}
-
-/**
- * Destroy a charset codec
- *
- * \param codec The codec to destroy
- */
-void hubbub_charsetcodec_destroy(hubbub_charsetcodec *codec)
-{
- if (codec == NULL)
- return;
-
- codec->handler.destroy(codec);
-
- codec->alloc(codec, 0, codec->alloc_pw);
-}
-
-/**
- * Configure a charset codec
- *
- * \param codec The codec to configure
- * \parem type The codec option type to configure
- * \param params Option-specific parameters
- * \return HUBBUB_OK on success, appropriate error otherwise
- */
-hubbub_error hubbub_charsetcodec_setopt(hubbub_charsetcodec *codec,
- hubbub_charsetcodec_opttype type,
- hubbub_charsetcodec_optparams *params)
-{
- if (codec == NULL || params == NULL)
- return HUBBUB_BADPARM;
-
- switch (type) {
- case HUBBUB_CHARSETCODEC_FILTER_FUNC:
- codec->filter = params->filter_func.filter;
- codec->filter_pw = params->filter_func.pw;
- break;
-
- case HUBBUB_CHARSETCODEC_ERROR_MODE:
- codec->errormode = params->error_mode.mode;
- break;
- }
-
- return HUBBUB_OK;
-}
-
-/**
- * Encode a chunk of UCS4 data into a codec's charset
- *
- * \param codec The codec to use
- * \param source Pointer to pointer to source data
- * \param sourcelen Pointer to length (in bytes) of source data
- * \param dest Pointer to pointer to output buffer
- * \param destlen Pointer to length (in bytes) of output buffer
- * \return HUBBUB_OK on success, appropriate error otherwise.
- *
- * source, sourcelen, dest and destlen will be updated appropriately on exit
- */
-hubbub_error hubbub_charsetcodec_encode(hubbub_charsetcodec *codec,
- const uint8_t **source, size_t *sourcelen,
- uint8_t **dest, size_t *destlen)
-{
- if (codec == NULL || source == NULL || *source == NULL ||
- sourcelen == NULL || dest == NULL || *dest == NULL ||
- destlen == NULL)
- return HUBBUB_BADPARM;
-
- return codec->handler.encode(codec, source, sourcelen, dest, destlen);
-}
-
-/**
- * Decode a chunk of data in a codec's charset into UCS4
- *
- * \param codec The codec to use
- * \param source Pointer to pointer to source data
- * \param sourcelen Pointer to length (in bytes) of source data
- * \param dest Pointer to pointer to output buffer
- * \param destlen Pointer to length (in bytes) of output buffer
- * \return HUBBUB_OK on success, appropriate error otherwise.
- *
- * source, sourcelen, dest and destlen will be updated appropriately on exit
- *
- * Call this with a source length of 0 to flush any buffers.
- */
-hubbub_error hubbub_charsetcodec_decode(hubbub_charsetcodec *codec,
- const uint8_t **source, size_t *sourcelen,
- uint8_t **dest, size_t *destlen)
-{
- if (codec == NULL || source == NULL || *source == NULL ||
- sourcelen == NULL || dest == NULL || *dest == NULL ||
- destlen == NULL)
- return HUBBUB_BADPARM;
-
- return codec->handler.decode(codec, source, sourcelen, dest, destlen);
-}
-
-/**
- * Clear a charset codec's encoding state
- *
- * \param codec The codec to reset
- * \return HUBBUB_OK on success, appropriate error otherwise
- */
-hubbub_error hubbub_charsetcodec_reset(hubbub_charsetcodec *codec)
-{
- if (codec == NULL)
- return HUBBUB_BADPARM;
-
- /* Reset filter */
- if (codec->filter)
- codec->filter(HUBBUB_CHARSETCODEC_NULL, NULL, NULL, NULL);
-
- return codec->handler.reset(codec);
-}
-
diff --git a/src/charset/codec.h b/src/charset/codec.h
deleted file mode 100644
index 4cd94d8..0000000
--- a/src/charset/codec.h
+++ /dev/null
@@ -1,153 +0,0 @@
-/*
- * This file is part of Hubbub.
- * Licensed under the MIT License,
- * http://www.opensource.org/licenses/mit-license.php
- * Copyright 2007 John-Mark Bell <jmb@netsurf-browser.org>
- */
-
-#ifndef hubbub_charset_codec_h_
-#define hubbub_charset_codec_h_
-
-#include <inttypes.h>
-
-#include <hubbub/errors.h>
-#include <hubbub/functypes.h>
-
-typedef struct hubbub_charsetcodec hubbub_charsetcodec;
-
-#define HUBBUB_CHARSETCODEC_NULL (0xffffffffU)
-
-/**
- * Type of charset codec filter function
- *
- * \param c UCS4 character (in host byte order) or
- * HUBBUB_CHARSETCODEC_NULL to reset
- * \param output Pointer to location to store output buffer location
- * \param outputlen Pointer to location to store output buffer length
- * \param pw Pointer to client-specific private data
- * \return HUBBUB_OK on success, or appropriate error otherwise.
- *
- * The output buffer is owned by the filter code and will not be freed by
- * any charset codec. It should contain the replacement UCS4 character(s)
- * for the input. The replacement characters should be in host byte order.
- * The contents of *output and *outputlen on entry are ignored and these
- * will be filled in by the filter code.
- *
- * Filters may elect to replace the input character with no output. In this
- * case, *output should be set to NULL and *outputlen should be set to 0 and
- * HUBBUB_OK should be returned.
- *
- * The output length is in terms of the number of UCS4 characters in the
- * output buffer. i.e.:
- *
- * for (size_t i = 0; i < outputlen; i++) {
- * dest[curchar++] = output[i];
- * }
- *
- * would copy the contents of the filter output buffer to the codec's output
- * buffer.
- */
-typedef hubbub_error (*hubbub_charsetcodec_filter)(uint32_t c,
- uint32_t **output, size_t *outputlen, void *pw);
-
-/**
- * Charset codec error mode
- *
- * A codec's error mode determines its behaviour in the face of:
- *
- * + characters which are unrepresentable in the destination charset (if
- * encoding data) or which cannot be converted to UCS4 (if decoding data).
- * + invalid byte sequences (both encoding and decoding)
- *
- * The options provide a choice between the following approaches:
- *
- * + draconian, "stop processing" ("strict")
- * + "replace the unrepresentable character with something else" ("loose")
- * + "attempt to transliterate, or replace if unable" ("translit")
- *
- * The default error mode is "loose".
- *
- *
- * In the "loose" case, the replacement character will depend upon:
- *
- * + Whether the operation was encoding or decoding
- * + If encoding, what the destination charset is.
- *
- * If decoding, the replacement character will be:
- *
- * U+FFFD (REPLACEMENT CHARACTER)
- *
- * If encoding, the replacement character will be:
- *
- * U+003F (QUESTION MARK) if the destination charset is not UTF-(8|16|32)
- * U+FFFD (REPLACEMENT CHARACTER) otherwise.
- *
- *
- * In the "translit" case, the codec will attempt to transliterate into
- * the destination charset, if encoding. If decoding, or if transliteration
- * fails, this option is identical to "loose".
- */
-typedef enum hubbub_charsetcodec_errormode {
- /** Abort processing if unrepresentable character encountered */
- HUBBUB_CHARSETCODEC_ERROR_STRICT = 0,
- /** Replace unrepresentable characters with single alternate */
- HUBBUB_CHARSETCODEC_ERROR_LOOSE = 1,
- /** Transliterate unrepresentable characters, if possible */
- HUBBUB_CHARSETCODEC_ERROR_TRANSLIT = 2,
-} hubbub_charsetcodec_errormode;
-
-/**
- * Charset codec option types
- */
-typedef enum hubbub_charsetcodec_opttype {
- /** Register codec filter function */
- HUBBUB_CHARSETCODEC_FILTER_FUNC = 0,
- /** Set codec error mode */
- HUBBUB_CHARSETCODEC_ERROR_MODE = 1,
-} hubbub_charsetcodec_opttype;
-
-/**
- * Charset codec option parameters
- */
-typedef union hubbub_charsetcodec_optparams {
- /** Parameters for filter function setting */
- struct {
- /** Filter function */
- hubbub_charsetcodec_filter filter;
- /** Client-specific private data */
- void *pw;
- } filter_func;
-
- /** Parameters for error mode setting */
- struct {
- /** The desired error handling mode */
- hubbub_charsetcodec_errormode mode;
- } error_mode;
-} hubbub_charsetcodec_optparams;
-
-
-/* Create a charset codec */
-hubbub_charsetcodec *hubbub_charsetcodec_create(const char *charset,
- hubbub_alloc alloc, void *pw);
-/* Destroy a charset codec */
-void hubbub_charsetcodec_destroy(hubbub_charsetcodec *codec);
-
-/* Configure a charset codec */
-hubbub_error hubbub_charsetcodec_setopt(hubbub_charsetcodec *codec,
- hubbub_charsetcodec_opttype type,
- hubbub_charsetcodec_optparams *params);
-
-/* Encode a chunk of UCS4 data into a codec's charset */
-hubbub_error hubbub_charsetcodec_encode(hubbub_charsetcodec *codec,
- const uint8_t **source, size_t *sourcelen,
- uint8_t **dest, size_t *destlen);
-
-/* Decode a chunk of data in a codec's charset into UCS4 */
-hubbub_error hubbub_charsetcodec_decode(hubbub_charsetcodec *codec,
- const uint8_t **source, size_t *sourcelen,
- uint8_t **dest, size_t *destlen);
-
-/* Reset a charset codec */
-hubbub_error hubbub_charsetcodec_reset(hubbub_charsetcodec *codec);
-
-#endif
diff --git a/src/charset/codec_iconv.c b/src/charset/codec_iconv.c
deleted file mode 100644
index 097e82a..0000000
--- a/src/charset/codec_iconv.c
+++ /dev/null
@@ -1,837 +0,0 @@
-/*
- * This file is part of Hubbub.
- * Licensed under the MIT License,
- * http://www.opensource.org/licenses/mit-license.php
- * Copyright 2007 John-Mark Bell <jmb@netsurf-browser.org>
- */
-
-/* This codec is hideously slow. Only use it as a last resort */
-
-#include <errno.h>
-#include <stdlib.h>
-#include <string.h>
-
-#include <iconv.h>
-
-/* These two are for htonl / ntohl */
-#include <arpa/inet.h>
-#include <netinet/in.h>
-
-#include "charset/aliases.h"
-#include "utils/utils.h"
-
-#include "codec_impl.h"
-
-/**
- * A note on endianness:
- *
- * UCS4 is big-endian by default. Therefore, this codec reads and writes
- * big-endian values. This is fine, and causes no problems. However, to
- * make life easier for client-supplied filter code, character values passed
- * to a filter and those read back from a filter are in host-endian.
- * Therefore, we need to convert from big-endian to host-endian when passing
- * characters to a filter and perform the reverse translation when reading
- * characters back.
- */
-
-/**
- * Iconv-based charset codec
- */
-typedef struct hubbub_iconv_codec {
- hubbub_charsetcodec base; /**< Base class */
-
- iconv_t read_cd; /**< Iconv handle for reading */
-#define INVAL_BUFSIZE (32)
- uint8_t inval_buf[INVAL_BUFSIZE]; /**< Buffer for fixing up
- * incomplete input
- * sequences */
- size_t inval_len; /**< Number of bytes in inval_buf */
-
-#define READ_BUFSIZE (8)
- uint32_t read_buf[READ_BUFSIZE]; /**< Buffer for partial
- * output sequences (decode)
- */
- size_t read_len; /**< Number of characters in
- * read_buf */
-
- iconv_t write_cd; /**< Iconv handle for writing */
-#define WRITE_BUFSIZE (8)
- uint32_t write_buf[WRITE_BUFSIZE]; /**< Buffer for partial
- * output sequences (encode)
- */
- size_t write_len; /**< Number of characters in
- * write_buf */
-} hubbub_iconv_codec;
-
-
-static bool hubbub_iconv_codec_handles_charset(const char *charset);
-static hubbub_charsetcodec *hubbub_iconv_codec_create(const char *charset,
- hubbub_alloc alloc, void *pw);
-static void hubbub_iconv_codec_destroy (hubbub_charsetcodec *codec);
-static hubbub_error hubbub_iconv_codec_encode(hubbub_charsetcodec *codec,
- const uint8_t **source, size_t *sourcelen,
- uint8_t **dest, size_t *destlen);
-static hubbub_error hubbub_iconv_codec_decode(hubbub_charsetcodec *codec,
- const uint8_t **source, size_t *sourcelen,
- uint8_t **dest, size_t *destlen);
-static hubbub_error hubbub_iconv_codec_reset(hubbub_charsetcodec *codec);
-static hubbub_error hubbub_iconv_codec_filter_decoded_char(
- hubbub_iconv_codec *c, uint32_t ucs4, uint8_t **dest,
- size_t *destlen);
-static bool hubbub_iconv_codec_is_unicode(hubbub_iconv_codec *c);
-static hubbub_error hubbub_iconv_codec_read_char(hubbub_iconv_codec *c,
- const uint8_t **source, size_t *sourcelen,
- uint8_t **dest, size_t *destlen);
-static hubbub_error hubbub_iconv_codec_write_char(hubbub_iconv_codec *c,
- uint32_t ucs4, uint8_t **dest, size_t *destlen);
-
-/**
- * Determine whether this codec handles a specific charset
- *
- * \param charset Charset to test
- * \return true if handleable, false otherwise
- */
-bool hubbub_iconv_codec_handles_charset(const char *charset)
-{
- iconv_t cd;
- bool ret;
-
- cd = iconv_open("UCS-4", charset);
-
- ret = (cd != (iconv_t) -1);
-
- if (ret)
- iconv_close(cd);
-
- return ret;
-}
-
-/**
- * Create an iconv-based codec
- *
- * \param charset The charset to read from / write to
- * \param alloc Memory (de)allocation function
- * \param pw Pointer to client-specific private data (may be NULL)
- * \return Pointer to codec, or NULL on failure
- */
-hubbub_charsetcodec *hubbub_iconv_codec_create(const char *charset,
- hubbub_alloc alloc, void *pw)
-{
- hubbub_iconv_codec *codec;
-
- codec = alloc(NULL, sizeof(hubbub_iconv_codec), pw);
- if (codec == NULL)
- return NULL;
-
- codec->read_cd = iconv_open("UCS-4", charset);
- if (codec->read_cd == (iconv_t) -1) {
- alloc(codec, 0, pw);
- return NULL;
- }
-
- codec->write_cd = iconv_open(charset, "UCS-4");
- if (codec->write_cd == (iconv_t) -1) {
- iconv_close(codec->read_cd);
- alloc(codec, 0, pw);
- return NULL;
- }
-
- codec->inval_buf[0] = '\0';
- codec->inval_len = 0;
-
- codec->read_buf[0] = 0;
- codec->read_len = 0;
-
- codec->write_buf[0] = 0;
- codec->write_len = 0;
-
- /* Finally, populate vtable */
- codec->base.handler.destroy = hubbub_iconv_codec_destroy;
- codec->base.handler.encode = hubbub_iconv_codec_encode;
- codec->base.handler.decode = hubbub_iconv_codec_decode;
- codec->base.handler.reset = hubbub_iconv_codec_reset;
-
- return (hubbub_charsetcodec *) codec;
-}
-
-/**
- * Destroy an iconv-based codec
- *
- * \param codec The codec to destroy
- */
-void hubbub_iconv_codec_destroy (hubbub_charsetcodec *codec)
-{
- hubbub_iconv_codec *c = (hubbub_iconv_codec *) codec;
-
- iconv_close(c->read_cd);
- iconv_close(c->write_cd);
-
- return;
-}
-
-/**
- * Encode a chunk of UCS4 data into an iconv-based codec's charset
- *
- * \param codec The codec to use
- * \param source Pointer to pointer to source data
- * \param sourcelen Pointer to length (in bytes) of source data
- * \param dest Pointer to pointer to output buffer
- * \param destlen Pointer to length (in bytes) of output buffer
- * \return HUBBUB_OK on success,
- * HUBBUB_NOMEM if output buffer is too small,
- * HUBBUB_INVALID if a character cannot be represented and the
- * codec's error handling mode is set to STRICT,
- * <any_other_error> as a result of the failure of the
- * client-provided filter function.
- *
- * On exit, ::source will point immediately _after_ the last input character
- * read. Any remaining output for the character will be buffered by the
- * codec for writing on the next call. This buffered data is post-filtering,
- * so will not be refiltered on the next call.
- *
- * In the case of the filter function failing, ::source will point _at_ the
- * last input character read; nothing will be written or buffered for the
- * failed character. It is up to the client to fix the cause of the failure
- * and retry the encoding process.
- *
- * Note that, if failure occurs whilst attempting to write any output
- * buffered by the last call, then ::source and ::sourcelen will remain
- * unchanged (as nothing more has been read).
- *
- * There is no way to determine the output character which caused a
- * failure (as it may be one in a filter-injected replacement sequence).
- * It is, however, possible to determine which source character caused it
- * (this being the character immediately before the location pointed to by
- * ::source on exit).
- *
- * [I.e. the process of filtering results in a potential one-to-many mapping
- * between source characters and output characters, and identification of
- * individual output characters is impossible.]
- *
- * ::sourcelen will be reduced appropriately on exit.
- *
- * ::dest will point immediately _after_ the last character written.
- *
- * ::destlen will be reduced appropriately on exit.
- */
-hubbub_error hubbub_iconv_codec_encode(hubbub_charsetcodec *codec,
- const uint8_t **source, size_t *sourcelen,
- uint8_t **dest, size_t *destlen)
-{
- hubbub_iconv_codec *c = (hubbub_iconv_codec *) codec;
- uint32_t ucs4;
- const uint32_t *towrite;
- size_t towritelen;
- hubbub_error error;
-
- /* Process any outstanding characters from the previous call */
- if (c->write_len > 0) {
- uint32_t *pwrite = c->write_buf;
-
- while (c->write_len > 0) {
- error = hubbub_iconv_codec_write_char(c, pwrite[0],
- dest, destlen);
- if (error != HUBBUB_OK) {
- /* Copy outstanding chars down, skipping
- * invalid one, if present, so as to avoid
- * reprocessing the invalid character */
- if (error == HUBBUB_INVALID) {
- for (ucs4 = 1; ucs4 < c->write_len;
- ucs4++) {
- c->write_buf[ucs4] =
- pwrite[ucs4];
- }
- }
-
- return error;
- }
-
- pwrite++;
- c->write_len--;
- }
- }
-
- /* Now process the characters for this call */
- while (*sourcelen > 0) {
- towrite = (const uint32_t *) (const void *) *source;
- towritelen = 1;
- ucs4 = *towrite;
-
- /* Run character we're about to output through the
- * registered filter, so it can replace it, if it sees
- * fit to do so */
- if (c->base.filter != NULL) {
- uint32_t *replacement;
-
- error = c->base.filter(ntohl(ucs4),
- &replacement, &towritelen,
- c->base.filter_pw);
- if (error != HUBBUB_OK) {
- /* Don't eat character -- filter failed,
- * so nothing gets written or buffered.
- * It's up to the client to ensure that
- * the filter works in the case where it
- * reprocesses this character after the
- * fault is fixed up. */
-
- return error;
- }
-
- /* Convert filter output to big endian UCS4 */
- for (ucs4 = 0; ucs4 < towritelen; ucs4++) {
- replacement[ucs4] = htonl(replacement[ucs4]);
- }
-
- towrite = (const uint32_t *) replacement;
- }
-
- /* Output current character(s) */
- while (towritelen > 0) {
- error = hubbub_iconv_codec_write_char(c, towrite[0],
- dest, destlen);
-
- if (error != HUBBUB_OK) {
- ucs4 = (error == HUBBUB_INVALID) ? 1 : 0;
-
- if (towritelen - ucs4 >= WRITE_BUFSIZE)
- abort();
-
- c->write_len = towritelen - ucs4;
-
- /* Copy pending chars to save area, for
- * processing next call; skipping invalid
- * character, if present, so it's not
- * reprocessed. */
- for (; ucs4 < towritelen; ucs4++) {
- c->write_buf[ucs4] = towrite[ucs4];
- }
-
- /* Claim character we've just buffered,
- * so it's not repreocessed */
- *source += 4;
- *sourcelen -= 4;
-
- return error;
- }
-
- towrite++;
- towritelen--;
- }
-
- *source += 4;
- *sourcelen -= 4;
- }
-
- return HUBBUB_OK;
-}
-
-/**
- * Decode a chunk of data in an iconv-based codec's charset into UCS4
- *
- * \param codec The codec to use
- * \param source Pointer to pointer to source data
- * \param sourcelen Pointer to length (in bytes) of source data
- * \param dest Pointer to pointer to output buffer
- * \param destlen Pointer to length (in bytes) of output buffer
- * \return HUBBUB_OK on success,
- * HUBBUB_NOMEM if output buffer is too small,
- * HUBBUB_INVALID if a character cannot be represented and the
- * codec's error handling mode is set to STRICT,
- * <any_other_error> as a result of the failure of the
- * client-provided filter function.
- *
- * On exit, ::source will point immediately _after_ the last input character
- * read, if the result is _OK or _NOMEM. Any remaining output for the
- * character will be buffered by the codec for writing on the next call.
- * This buffered data is post-filtering, so will not be refiltered on the
- * next call.
- *
- * In the case of the result being _INVALID or the filter function failing,
- * ::source will point _at_ the last input character read; nothing will be
- * written or buffered for the failed character. It is up to the client to
- * fix the cause of the failure and retry the decoding process.
- *
- * Note that, if failure occurs whilst attempting to write any output
- * buffered by the last call, then ::source and ::sourcelen will remain
- * unchanged (as nothing more has been read).
- *
- * There is no way to determine the output character which caused a
- * failure (as it may be one in a filter-injected replacement sequence).
- * It is, however, possible to determine which source character caused it
- * (this being the character immediately at or before the location pointed
- * to by ::source on exit).
- *
- * [I.e. the process of filtering results in a potential one-to-many mapping
- * between source characters and output characters, and identification of
- * individual output characters is impossible.]
- *
- * If STRICT error handling is configured and an illegal sequence is split
- * over two calls, then _INVALID will be returned from the second call,
- * but ::source will point mid-way through the invalid sequence (i.e. it
- * will be unmodified over the second call). In addition, the internal
- * incomplete-sequence buffer will be emptied, such that subsequent calls
- * will progress, rather than re-evaluating the same invalid sequence.
- *
- * ::sourcelen will be reduced appropriately on exit.
- *
- * ::dest will point immediately _after_ the last character written.
- *
- * ::destlen will be reduced appropriately on exit.
- *
- * Call this with a source length of 0 to flush the output buffer.
- */
-hubbub_error hubbub_iconv_codec_decode(hubbub_charsetcodec *codec,
- const uint8_t **source, size_t *sourcelen,
- uint8_t **dest, size_t *destlen)
-{
- hubbub_iconv_codec *c = (hubbub_iconv_codec *) codec;
- hubbub_error error;
-
- if (c->read_len > 0) {
- /* Output left over from last decode
- * Attempt to finish this here */
- uint32_t *pread = c->read_buf;
-
- while (c->read_len > 0 && *destlen >= c->read_len * 4) {
- *((uint32_t *) (void *) *dest) = pread[0];
-
- *dest += 4;
- *destlen -= 4;
-
- pread++;
- c->read_len--;
- }
-
- if (*destlen < c->read_len * 4) {
- /* Run out of output buffer */
- size_t i;
-
- /* Shuffle remaining output down */
- for (i = 0; i < c->read_len; i++) {
- c->read_buf[i] = pread[i];
- }
-
- return HUBBUB_NOMEM;
- }
- }
-
- if (c->inval_len > 0) {
- /* The last decode ended in an incomplete sequence.
- * Fill up inval_buf with data from the start of the
- * new chunk and process it. */
- uint8_t *in = c->inval_buf;
- size_t ol = c->inval_len;
- size_t l = min(INVAL_BUFSIZE - ol - 1, *sourcelen);
- size_t orig_l = l;
-
- memcpy(c->inval_buf + ol, *source, l);
-
- l += c->inval_len;
-
- error = hubbub_iconv_codec_read_char(c,
- (const uint8_t **) &in, &l, dest, destlen);
- if (error != HUBBUB_OK && error != HUBBUB_NOMEM) {
- return error;
- }
-
-
- /* And now, fix everything up so the normal processing
- * does the right thing. */
- *source += max((signed) (orig_l - l), 0);
- *sourcelen -= max((signed) (orig_l - l), 0);
-
- /* Failed to resolve an incomplete character and
- * ran out of buffer space. No recovery strategy
- * possible, so explode everywhere. */
- if ((orig_l + ol) - l == 0)
- abort();
-
- /* Handle memry exhaustion case from above */
- if (error != HUBBUB_OK)
- return error;
- }
-
- while (*sourcelen > 0) {
- error = hubbub_iconv_codec_read_char(c,
- source, sourcelen, dest, destlen);
- if (error != HUBBUB_OK) {
- return error;
- }
- }
-
- return HUBBUB_OK;
-}
-
-/**
- * Clear an iconv-based codec's encoding state
- *
- * \param codec The codec to reset
- * \return HUBBUB_OK on success, appropriate error otherwise
- */
-hubbub_error hubbub_iconv_codec_reset(hubbub_charsetcodec *codec)
-{
- hubbub_iconv_codec *c = (hubbub_iconv_codec *) codec;
-
- iconv(c->read_cd, NULL, NULL, NULL, NULL);
- iconv(c->write_cd, NULL, NULL, NULL, NULL);
-
- c->inval_buf[0] = '\0';
- c->inval_len = 0;
-
- c->read_buf[0] = 0;
- c->read_len = 0;
-
- c->write_buf[0] = 0;
- c->write_len = 0;
-
- return HUBBUB_OK;
-}
-
-/**
- * Feed a UCS4 character through the registered filter and output the result
- *
- * \param c Codec to use
- * \param ucs4 UCS4 character (big endian)
- * \param dest Pointer to pointer to output buffer
- * \param destlen Pointer to output buffer length
- * \return HUBBUB_OK on success,
- * HUBBUB_NOMEM if output buffer is too small,
- * <any_other_error> as a result of the failure of the
- * client-provided filter function.
- */
-hubbub_error hubbub_iconv_codec_filter_decoded_char(hubbub_iconv_codec *c,
- uint32_t ucs4, uint8_t **dest, size_t *destlen)
-{
- if (c->base.filter != NULL) {
- uint32_t *rep;
- size_t replen;
- hubbub_error error;
-
- error = c->base.filter(ntohl(ucs4), &rep, &replen,
- c->base.filter_pw);
- if (error != HUBBUB_OK) {
- return error;
- }
-
- while (replen > 0 && *destlen >= replen * 4) {
- *((uint32_t *) (void *) *dest) = htonl(*rep);
-
- *dest += 4;
- *destlen -= 4;
-
- rep++;
- replen--;
- }
-
- if (*destlen < replen * 4) {
- /* Run out of output buffer */
- size_t i;
-
- /* Buffer remaining output */
- c->read_len = replen;
-
- for (i = 0; i < replen; i++) {
- c->read_buf[i] = htonl(rep[i]);
- }
-
- return HUBBUB_NOMEM;
- }
-
- } else {
- if (*destlen < 4) {
- /* Run out of output buffer */
-
- c->read_len = 1;
- c->read_buf[0] = ucs4;
-
- return HUBBUB_NOMEM;
- }
-
- *((uint32_t *) (void *) *dest) = ucs4;
- *dest += 4;
- *destlen -= 4;
- }
-
- return HUBBUB_OK;
-}
-
-/**
- * Detect if a codec's charset is Unicode capable
- *
- * \param c Codec to consider
- * \return true if a Unicode variant, false otherwise
- */
-bool hubbub_iconv_codec_is_unicode(hubbub_iconv_codec *c)
-{
- static uint16_t ucs4;
- static uint16_t ucs2;
- static uint16_t utf8;
- static uint16_t utf16;
- static uint16_t utf16be;
- static uint16_t utf16le;
- static uint16_t utf32;
- static uint16_t utf32be;
- static uint16_t utf32le;
-
- if (ucs4 == 0) {
- ucs4 = hubbub_mibenum_from_name("UCS-4", SLEN("UCS-4"));
- ucs2 = hubbub_mibenum_from_name("UCS-2", SLEN("UCS-2"));
- utf8 = hubbub_mibenum_from_name("UTF-8", SLEN("UTF-8"));
- utf16 = hubbub_mibenum_from_name("UTF-16", SLEN("UTF-16"));
- utf16be = hubbub_mibenum_from_name("UTF-16BE",
- SLEN("UTF-16BE"));
- utf16le = hubbub_mibenum_from_name("UTF-16LE",
- SLEN("UTF-16LE"));
- utf32 = hubbub_mibenum_from_name("UTF-32", SLEN("UTF-32"));
- utf32be = hubbub_mibenum_from_name("UTF-32BE",
- SLEN("UTF-32BE"));
- utf32le = hubbub_mibenum_from_name("UTF-32LE",
- SLEN("UTF-32LE"));
- }
-
- return (c->base.mibenum == ucs4 ||
- c->base.mibenum == ucs2 ||
- c->base.mibenum == utf8 ||
- c->base.mibenum == utf16 ||
- c->base.mibenum == utf16be ||
- c->base.mibenum == utf16le ||
- c->base.mibenum == utf32 ||
- c->base.mibenum == utf32be ||
- c->base.mibenum == utf32le);
-}
-
-/**
- * Read a character from the codec's native charset to UCS4 (big endian)
- *
- * \param c The codec
- * \param source Pointer to pointer to source buffer (updated on exit)
- * \param sourcelen Pointer to length of source buffer (updated on exit)
- * \param dest Pointer to pointer to output buffer (updated on exit)
- * \param destlen Pointer to length of output buffer (updated on exit)
- * \return HUBBUB_OK on success,
- * HUBBUB_NOMEM if output buffer is too small,
- * HUBBUB_INVALID if a character cannot be represented and the
- * codec's error handling mode is set to STRICT,
- * <any_other_error> as a result of the failure of the
- * client-provided filter function.
- *
- * On exit, ::source will point immediately _after_ the last input character
- * read, if the result is _OK or _NOMEM. Any remaining output for the
- * character will be buffered by the codec for writing on the next call.
- * This buffered data is post-filtering, so will not be refiltered on the
- * next call.
- *
- * In the case of the result being _INVALID or the filter function failing,
- * ::source will point _at_ the last input character read; nothing will be
- * written or buffered for the failed character. It is up to the client to
- * fix the cause of the failure and retry the decoding process.
- *
- * ::sourcelen will be reduced appropriately on exit.
- *
- * ::dest will point immediately _after_ the last character written.
- *
- * ::destlen will be reduced appropriately on exit.
- */
-hubbub_error hubbub_iconv_codec_read_char(hubbub_iconv_codec *c,
- const uint8_t **source, size_t *sourcelen,
- uint8_t **dest, size_t *destlen)
-{
- size_t iconv_ret;
- const uint8_t *origsrc = *source;
- size_t origsrclen = *sourcelen;
- uint32_t ucs4;
- uint8_t *pucs4 = (uint8_t *) &ucs4;
- size_t sucs4 = 4;
- hubbub_error error;
-
- /* Use iconv to convert a single character
- * Side effect: Updates *source to point at next input
- * character and *sourcelen to reflect reduced input length
- */
- iconv_ret = iconv(c->read_cd, (char **) source, sourcelen,
- (char **) (void *) &pucs4, &sucs4);
-
- if (iconv_ret != (size_t) -1 ||
- (*source != origsrc && sucs4 == 0)) {
- /* Read a character */
- error = hubbub_iconv_codec_filter_decoded_char(c,
- ucs4, dest, destlen);
- if (error != HUBBUB_OK && error != HUBBUB_NOMEM) {
- /* filter function failed; restore source pointers */
- *source = origsrc;
- *sourcelen = origsrclen;
- }
-
- /* Clear inval buffer */
- c->inval_buf[0] = '\0';
- c->inval_len = 0;
-
- return error;
- } else if (errno == E2BIG) {
- /* Should never happen */
- abort();
- } else if (errno == EINVAL) {
- /* Incomplete input sequence */
- if (*sourcelen > INVAL_BUFSIZE)
- abort();
-
- memmove(c->inval_buf, (const char *) *source, *sourcelen);
- c->inval_buf[*sourcelen] = '\0';
- c->inval_len = *sourcelen;
-
- *source += *sourcelen;
- *sourcelen = 0;
-
- return HUBBUB_OK;
- } else if (errno == EILSEQ) {
- /* Illegal input sequence */
- bool found = false;
- const uint8_t *oldsrc;
- size_t oldsrclen;
-
- /* Clear inval buffer */
- c->inval_buf[0] = '\0';
- c->inval_len = 0;
-
- /* Strict errormode; simply flag invalid character */
- if (c->base.errormode == HUBBUB_CHARSETCODEC_ERROR_STRICT) {
- /* restore source pointers */
- *source = origsrc;
- *sourcelen = origsrclen;
-
- return HUBBUB_INVALID;
- }
-
- /* Ok, this becomes problematic. The iconv API here
- * is particularly unhelpful; *source will point at
- * the _start_ of the illegal sequence. This means
- * that we must find the end of the sequence */
-
- /* Search for the start of the next valid input
- * sequence (or the end of the input stream) */
- while (*sourcelen > 1) {
- pucs4 = (uint8_t *) &ucs4;
- sucs4 = 4;
-
- (*source)++;
- (*sourcelen)--;
-
- oldsrc = *source;
- oldsrclen = *sourcelen;
-
- iconv_ret = iconv(c->read_cd,
- (char **) source, sourcelen,
- (char **) (void *) &pucs4, &sucs4);
- if (iconv_ret != (size_t) -1 || errno != EILSEQ) {
- found = true;
- break;
- }
- }
-
- if (found) {
- /* Found start of next valid sequence */
- *source = oldsrc;
- *sourcelen = oldsrclen;
- } else {
- /* Not found - skip last byte in buffer */
- (*source)++;
- (*sourcelen)--;
-
- if (*sourcelen != 0)
- abort();
- }
-
- /* output U+FFFD and continue processing. */
- error = hubbub_iconv_codec_filter_decoded_char(c,
- htonl(0xFFFD), dest, destlen);
- if (error != HUBBUB_OK && error != HUBBUB_NOMEM) {
- /* filter function failed; restore source pointers */
- *source = origsrc;
- *sourcelen = origsrclen;
- }
-
- return error;
- }
-
- return HUBBUB_OK;
-}
-
-/**
- * Write a UCS4 character in a codec's native charset
- *
- * \param c The codec
- * \param ucs4 The UCS4 character to write (big endian)
- * \param dest Pointer to pointer to output buffer (updated on exit)
- * \param destlen Pointer to length of output buffer (updated on exit)
- * \return HUBBUB_OK on success,
- * HUBBUB_NOMEM if output buffer is too small,
- * HUBBUB_INVALID if character cannot be represented and the
- * codec's error handling mode is set to STRICT.
- */
-hubbub_error hubbub_iconv_codec_write_char(hubbub_iconv_codec *c,
- uint32_t ucs4, uint8_t **dest, size_t *destlen)
-{
- size_t iconv_ret;
- uint8_t *pucs4 = (uint8_t *) &ucs4;
- size_t sucs4 = 4;
- uint8_t *origdest = *dest;
-
- iconv_ret = iconv(c->write_cd, (char **) (void *) &pucs4,
- &sucs4, (char **) dest, destlen);
-
- if (iconv_ret == (size_t) -1 && errno == E2BIG) {
- /* Output buffer is too small */
- return HUBBUB_NOMEM;
- } else if (iconv_ret == (size_t) -1 && errno == EILSEQ) {
- /* Illegal multibyte sequence */
- /* This should never happen */
- abort();
- } else if (iconv_ret == (size_t) -1 && errno == EINVAL) {
- /* Incomplete input character */
- /* This should never happen */
- abort();
- } else if (*dest == origdest) {
- /* Nothing was output */
- switch (c->base.errormode) {
- case HUBBUB_CHARSETCODEC_ERROR_STRICT:
- return HUBBUB_INVALID;
-
- case HUBBUB_CHARSETCODEC_ERROR_TRANSLIT:
- /** \todo transliteration */
- case HUBBUB_CHARSETCODEC_ERROR_LOOSE:
- {
- pucs4 = (uint8_t *) &ucs4;
- sucs4 = 4;
-
- ucs4 = hubbub_iconv_codec_is_unicode(c)
- ? htonl(0xFFFD) : htonl(0x3F);
-
- iconv_ret = iconv(c->write_cd,
- (char **) (void *) &pucs4, &sucs4,
- (char **) dest, destlen);
-
- if (iconv_ret == (size_t) -1 && errno == E2BIG) {
- return HUBBUB_NOMEM;
- } else if (iconv_ret == (size_t) -1 &&
- errno == EILSEQ) {
- /* Illegal multibyte sequence */
- /* This should never happen */
- abort();
- } else if (iconv_ret == (size_t) -1 &&
- errno == EINVAL) {
- /* Incomplete input character */
- /* This should never happen */
- abort();
- }
- }
- break;
- }
- }
-
- return HUBBUB_OK;
-}
-
-const hubbub_charsethandler hubbub_iconv_codec_handler = {
- hubbub_iconv_codec_handles_charset,
- hubbub_iconv_codec_create
-};
diff --git a/src/charset/codec_impl.h b/src/charset/codec_impl.h
deleted file mode 100644
index eb5116b..0000000
--- a/src/charset/codec_impl.h
+++ /dev/null
@@ -1,51 +0,0 @@
-/*
- * This file is part of Hubbub.
- * Licensed under the MIT License,
- * http://www.opensource.org/licenses/mit-license.php
- * Copyright 2007 John-Mark Bell <jmb@netsurf-browser.org>
- */
-
-#ifndef hubbub_charset_codecimpl_h_
-#define hubbub_charset_codecimpl_h_
-
-#include <stdbool.h>
-#include <inttypes.h>
-
-#include "codec.h"
-
-/**
- * Core charset codec definition; implementations extend this
- */
-struct hubbub_charsetcodec {
- uint16_t mibenum; /**< MIB enum for charset */
-
- hubbub_charsetcodec_filter filter; /**< filter function */
- void *filter_pw; /**< filter private word */
-
- hubbub_charsetcodec_errormode errormode; /**< error mode */
-
- hubbub_alloc alloc; /**< allocation function */
- void *alloc_pw; /**< private word */
-
- struct {
- void (*destroy)(hubbub_charsetcodec *codec);
- hubbub_error (*encode)(hubbub_charsetcodec *codec,
- const uint8_t **source, size_t *sourcelen,
- uint8_t **dest, size_t *destlen);
- hubbub_error (*decode)(hubbub_charsetcodec *codec,
- const uint8_t **source, size_t *sourcelen,
- uint8_t **dest, size_t *destlen);
- hubbub_error (*reset)(hubbub_charsetcodec *codec);
- } handler; /**< Vtable for handler code */
-};
-
-/**
- * Codec factory component definition
- */
-typedef struct hubbub_charsethandler {
- bool (*handles_charset)(const char *charset);
- hubbub_charsetcodec *(*create)(const char *charset,
- hubbub_alloc alloc, void *pw);
-} hubbub_charsethandler;
-
-#endif
diff --git a/src/charset/codec_utf16.c b/src/charset/codec_utf16.c
deleted file mode 100644
index 9a94d29..0000000
--- a/src/charset/codec_utf16.c
+++ /dev/null
@@ -1,620 +0,0 @@
-/*
- * This file is part of Hubbub.
- * Licensed under the MIT License,
- * http://www.opensource.org/licenses/mit-license.php
- * Copyright 2007 John-Mark Bell <jmb@netsurf-browser.org>
- */
-
-#include <stdlib.h>
-#include <string.h>
-
-/* These two are for htonl / ntohl */
-#include <arpa/inet.h>
-#include <netinet/in.h>
-
-#include "charset/aliases.h"
-#include "utils/utf16.h"
-#include "utils/utils.h"
-
-#include "codec_impl.h"
-
-/**
- * UTF-16 charset codec
- */
-typedef struct hubbub_utf16_codec {
- hubbub_charsetcodec base; /**< Base class */
-
-#define INVAL_BUFSIZE (32)
- uint8_t inval_buf[INVAL_BUFSIZE]; /**< Buffer for fixing up
- * incomplete input
- * sequences */
- size_t inval_len; /*< Byte length of inval_buf **/
-
-#define READ_BUFSIZE (8)
- uint32_t read_buf[READ_BUFSIZE]; /**< Buffer for partial
- * output sequences (decode)
- * (host-endian) */
- size_t read_len; /**< Character length of read_buf */
-
-#define WRITE_BUFSIZE (8)
- uint32_t write_buf[WRITE_BUFSIZE]; /**< Buffer for partial
- * output sequences (encode)
- * (host-endian) */
- size_t write_len; /**< Character length of write_buf */
-
-} hubbub_utf16_codec;
-
-static bool hubbub_utf16_codec_handles_charset(const char *charset);
-static hubbub_charsetcodec *hubbub_utf16_codec_create(const char *charset,
- hubbub_alloc alloc, void *pw);
-static void hubbub_utf16_codec_destroy (hubbub_charsetcodec *codec);
-static hubbub_error hubbub_utf16_codec_encode(hubbub_charsetcodec *codec,
- const uint8_t **source, size_t *sourcelen,
- uint8_t **dest, size_t *destlen);
-static hubbub_error hubbub_utf16_codec_decode(hubbub_charsetcodec *codec,
- const uint8_t **source, size_t *sourcelen,
- uint8_t **dest, size_t *destlen);
-static hubbub_error hubbub_utf16_codec_reset(hubbub_charsetcodec *codec);
-static hubbub_error hubbub_utf16_codec_read_char(hubbub_utf16_codec *c,
- const uint8_t **source, size_t *sourcelen,
- uint8_t **dest, size_t *destlen);
-static hubbub_error hubbub_utf16_codec_filter_decoded_char(
- hubbub_utf16_codec *c,
- uint32_t ucs4, uint8_t **dest, size_t *destlen);
-
-/**
- * Determine whether this codec handles a specific charset
- *
- * \param charset Charset to test
- * \return true if handleable, false otherwise
- */
-bool hubbub_utf16_codec_handles_charset(const char *charset)
-{
- return hubbub_mibenum_from_name(charset, strlen(charset)) ==
- hubbub_mibenum_from_name("UTF-16", SLEN("UTF-16"));
-}
-
-/**
- * Create a utf16 codec
- *
- * \param charset The charset to read from / write to
- * \param alloc Memory (de)allocation function
- * \param pw Pointer to client-specific private data (may be NULL)
- * \return Pointer to codec, or NULL on failure
- */
-hubbub_charsetcodec *hubbub_utf16_codec_create(const char *charset,
- hubbub_alloc alloc, void *pw)
-{
- hubbub_utf16_codec *codec;
-
- UNUSED(charset);
-
- codec = alloc(NULL, sizeof(hubbub_utf16_codec), pw);
- if (codec == NULL)
- return NULL;
-
- codec->inval_buf[0] = '\0';
- codec->inval_len = 0;
-
- codec->read_buf[0] = 0;
- codec->read_len = 0;
-
- codec->write_buf[0] = 0;
- codec->write_len = 0;
-
- /* Finally, populate vtable */
- codec->base.handler.destroy = hubbub_utf16_codec_destroy;
- codec->base.handler.encode = hubbub_utf16_codec_encode;
- codec->base.handler.decode = hubbub_utf16_codec_decode;
- codec->base.handler.reset = hubbub_utf16_codec_reset;
-
- return (hubbub_charsetcodec *) codec;
-}
-
-/**
- * Destroy a utf16 codec
- *
- * \param codec The codec to destroy
- */
-void hubbub_utf16_codec_destroy (hubbub_charsetcodec *codec)
-{
- UNUSED(codec);
-}
-
-/**
- * Encode a chunk of UCS4 data into utf16
- *
- * \param codec The codec to use
- * \param source Pointer to pointer to source data
- * \param sourcelen Pointer to length (in bytes) of source data
- * \param dest Pointer to pointer to output buffer
- * \param destlen Pointer to length (in bytes) of output buffer
- * \return HUBBUB_OK on success,
- * HUBBUB_NOMEM if output buffer is too small,
- * HUBBUB_INVALID if a character cannot be represented and the
- * codec's error handling mode is set to STRICT,
- * <any_other_error> as a result of the failure of the
- * client-provided filter function.
- *
- * On exit, ::source will point immediately _after_ the last input character
- * read. Any remaining output for the character will be buffered by the
- * codec for writing on the next call. This buffered data is post-filtering,
- * so will not be refiltered on the next call.
- *
- * In the case of the filter function failing, ::source will point _at_ the
- * last input character read; nothing will be written or buffered for the
- * failed character. It is up to the client to fix the cause of the failure
- * and retry the encoding process.
- *
- * Note that, if failure occurs whilst attempting to write any output
- * buffered by the last call, then ::source and ::sourcelen will remain
- * unchanged (as nothing more has been read).
- *
- * There is no way to determine the output character which caused a
- * failure (as it may be one in a filter-injected replacement sequence).
- * It is, however, possible to determine which source character caused it
- * (this being the character immediately before the location pointed to by
- * ::source on exit).
- *
- * [I.e. the process of filtering results in a potential one-to-many mapping
- * between source characters and output characters, and identification of
- * individual output characters is impossible.]
- *
- * ::sourcelen will be reduced appropriately on exit.
- *
- * ::dest will point immediately _after_ the last character written.
- *
- * ::destlen will be reduced appropriately on exit.
- */
-hubbub_error hubbub_utf16_codec_encode(hubbub_charsetcodec *codec,
- const uint8_t **source, size_t *sourcelen,
- uint8_t **dest, size_t *destlen)
-{
- hubbub_utf16_codec *c = (hubbub_utf16_codec *) codec;
- uint32_t ucs4;
- uint32_t *towrite;
- size_t towritelen;
- hubbub_error error;
-
- /* Process any outstanding characters from the previous call */
- if (c->write_len > 0) {
- uint32_t *pwrite = c->write_buf;
- uint8_t buf[4];
- size_t len;
-
- while (c->write_len > 0) {
- error = hubbub_utf16_from_ucs4(pwrite[0], buf, &len);
- if (error != HUBBUB_OK)
- abort();
-
- if (*destlen < len) {
- /* Insufficient output buffer space */
- for (len = 0; len < c->write_len; len++)
- c->write_buf[len] = pwrite[len];
-
- return HUBBUB_NOMEM;
- }
-
- memcpy(*dest, buf, len);
-
- *dest += len;
- *destlen -= len;
-
- pwrite++;
- c->write_len--;
- }
- }
-
- /* Now process the characters for this call */
- while (*sourcelen > 0) {
- ucs4 = ntohl(*((uint32_t *) (void *) *source));
- towrite = &ucs4;
- towritelen = 1;
-
- /* Run character we're about to output through the
- * registered filter, so it can replace it. */
- if (c->base.filter != NULL) {
- error = c->base.filter(ucs4,
- &towrite, &towritelen,
- c->base.filter_pw);
- if (error != HUBBUB_OK)
- return error;
- }
-
- /* Output current characters */
- while (towritelen > 0) {
- uint8_t buf[4];
- size_t len;
-
- error = hubbub_utf16_from_ucs4(towrite[0], buf, &len);
- if (error != HUBBUB_OK)
- abort();
-
- if (*destlen < len) {
- /* Insufficient output space */
- if (towritelen >= WRITE_BUFSIZE)
- abort();
-
- c->write_len = towritelen;
-
- /* Copy pending chars to save area, for
- * processing next call. */
- for (len = 0; len < towritelen; len++)
- c->write_buf[len] = towrite[len];
-
- /* Claim character we've just buffered,
- * so it's not reprocessed */
- *source += 4;
- *sourcelen -= 4;
-
- return HUBBUB_NOMEM;
- }
-
- memcpy(*dest, buf, len);
-
- *dest += len;
- *destlen -= len;
-
- towrite++;
- towritelen--;
- }
-
- *source += 4;
- *sourcelen -= 4;
- }
-
- return HUBBUB_OK;
-}
-
-/**
- * Decode a chunk of utf16 data into UCS4
- *
- * \param codec The codec to use
- * \param source Pointer to pointer to source data
- * \param sourcelen Pointer to length (in bytes) of source data
- * \param dest Pointer to pointer to output buffer
- * \param destlen Pointer to length (in bytes) of output buffer
- * \return HUBBUB_OK on success,
- * HUBBUB_NOMEM if output buffer is too small,
- * HUBBUB_INVALID if a character cannot be represented and the
- * codec's error handling mode is set to STRICT,
- * <any_other_error> as a result of the failure of the
- * client-provided filter function.
- *
- * On exit, ::source will point immediately _after_ the last input character
- * read, if the result is _OK or _NOMEM. Any remaining output for the
- * character will be buffered by the codec for writing on the next call.
- * This buffered data is post-filtering, so will not be refiltered on the
- * next call.
- *
- * In the case of the result being _INVALID or the filter function failing,
- * ::source will point _at_ the last input character read; nothing will be
- * written or buffered for the failed character. It is up to the client to
- * fix the cause of the failure and retry the decoding process.
- *
- * Note that, if failure occurs whilst attempting to write any output
- * buffered by the last call, then ::source and ::sourcelen will remain
- * unchanged (as nothing more has been read).
- *
- * There is no way to determine the output character which caused a
- * failure (as it may be one in a filter-injected replacement sequence).
- * It is, however, possible to determine which source character caused it
- * (this being the character immediately at or before the location pointed
- * to by ::source on exit).
- *
- * [I.e. the process of filtering results in a potential one-to-many mapping
- * between source characters and output characters, and identification of
- * individual output characters is impossible.]
- *
- * If STRICT error handling is configured and an illegal sequence is split
- * over two calls, then _INVALID will be returned from the second call,
- * but ::source will point mid-way through the invalid sequence (i.e. it
- * will be unmodified over the second call). In addition, the internal
- * incomplete-sequence buffer will be emptied, such that subsequent calls
- * will progress, rather than re-evaluating the same invalid sequence.
- *
- * ::sourcelen will be reduced appropriately on exit.
- *
- * ::dest will point immediately _after_ the last character written.
- *
- * ::destlen will be reduced appropriately on exit.
- *
- * Call this with a source length of 0 to flush the output buffer.
- */
-hubbub_error hubbub_utf16_codec_decode(hubbub_charsetcodec *codec,
- const uint8_t **source, size_t *sourcelen,
- uint8_t **dest, size_t *destlen)
-{
- hubbub_utf16_codec *c = (hubbub_utf16_codec *) codec;
- hubbub_error error;
-
- if (c->read_len > 0) {
- /* Output left over from last decode */
- uint32_t *pread = c->read_buf;
-
- while (c->read_len > 0 && *destlen >= c->read_len * 4) {
- *((uint32_t *) (void *) *dest) = htonl(pread[0]);
-
- *dest += 4;
- *destlen -= 4;
-
- pread++;
- c->read_len--;
- }
-
- if (*destlen < c->read_len * 4) {
- /* Ran out of output buffer */
- size_t i;
-
- /* Shuffle remaining output down */
- for (i = 0; i < c->read_len; i++)
- c->read_buf[i] = pread[i];
-
- return HUBBUB_NOMEM;
- }
- }
-
- if (c->inval_len > 0) {
- /* The last decode ended in an incomplete sequence.
- * Fill up inval_buf with data from the start of the
- * new chunk and process it. */
- uint8_t *in = c->inval_buf;
- size_t ol = c->inval_len;
- size_t l = min(INVAL_BUFSIZE - ol - 1, *sourcelen);
- size_t orig_l = l;
-
- memcpy(c->inval_buf + ol, *source, l);
-
- l += c->inval_len;
-
- error = hubbub_utf16_codec_read_char(c,
- (const uint8_t **) &in, &l, dest, destlen);
- if (error != HUBBUB_OK && error != HUBBUB_NOMEM) {
- return error;
- }
-
- /* And now, fix up source pointers */
- *source += max((signed) (orig_l - l), 0);
- *sourcelen -= max((signed) (orig_l - l), 0);
-
- /* Failed to resolve an incomplete character and
- * ran out of buffer space. No recovery strategy
- * possible, so explode everywhere. */
- if ((orig_l + ol) - l == 0)
- abort();
-
- /* Report memory exhaustion case from above */
- if (error != HUBBUB_OK)
- return error;
- }
-
- /* Finally, the "normal" case; process all outstanding characters */
- while (*sourcelen > 0) {
- error = hubbub_utf16_codec_read_char(c,
- source, sourcelen, dest, destlen);
- if (error != HUBBUB_OK) {
- return error;
- }
- }
-
- return HUBBUB_OK;
-}
-
-/**
- * Clear a utf16 codec's encoding state
- *
- * \param codec The codec to reset
- * \return HUBBUB_OK on success, appropriate error otherwise
- */
-hubbub_error hubbub_utf16_codec_reset(hubbub_charsetcodec *codec)
-{
- hubbub_utf16_codec *c = (hubbub_utf16_codec *) codec;
-
- c->inval_buf[0] = '\0';
- c->inval_len = 0;
-
- c->read_buf[0] = 0;
- c->read_len = 0;
-
- c->write_buf[0] = 0;
- c->write_len = 0;
-
- return HUBBUB_OK;
-}
-
-
-/**
- * Read a character from the UTF-16 input and convert it to UCS4 (big endian)
- *
- * \param c The codec
- * \param source Pointer to pointer to source buffer (updated on exit)
- * \param sourcelen Pointer to length of source buffer (updated on exit)
- * \param dest Pointer to pointer to output buffer (updated on exit)
- * \param destlen Pointer to length of output buffer (updated on exit)
- * \return HUBBUB_OK on success,
- * HUBBUB_NOMEM if output buffer is too small,
- * HUBBUB_INVALID if a character cannot be represented and the
- * codec's error handling mode is set to STRICT,
- * <any_other_error> as a result of the failure of the
- * client-provided filter function.
- *
- * On exit, ::source will point immediately _after_ the last input character
- * read, if the result is _OK or _NOMEM. Any remaining output for the
- * character will be buffered by the codec for writing on the next call.
- * This buffered data is post-filtering, so will not be refiltered on the
- * next call.
- *
- * In the case of the result being _INVALID or the filter function failing,
- * ::source will point _at_ the last input character read; nothing will be
- * written or buffered for the failed character. It is up to the client to
- * fix the cause of the failure and retry the decoding process.
- *
- * ::sourcelen will be reduced appropriately on exit.
- *
- * ::dest will point immediately _after_ the last character written.
- *
- * ::destlen will be reduced appropriately on exit.
- */
-hubbub_error hubbub_utf16_codec_read_char(hubbub_utf16_codec *c,
- const uint8_t **source, size_t *sourcelen,
- uint8_t **dest, size_t *destlen)
-{
- uint32_t ucs4;
- size_t sucs4;
- hubbub_error error;
-
- /* Convert a single character */
- error = hubbub_utf16_to_ucs4(*source, *sourcelen, &ucs4, &sucs4);
- if (error == HUBBUB_OK) {
- /* Read a character */
- error = hubbub_utf16_codec_filter_decoded_char(c,
- ucs4, dest, destlen);
- if (error == HUBBUB_OK || error == HUBBUB_NOMEM) {
- /* filter function succeeded; update source pointers */
- *source += sucs4;
- *sourcelen -= sucs4;
- }
-
- /* Clear inval buffer */
- c->inval_buf[0] = '\0';
- c->inval_len = 0;
-
- return error;
- } else if (error == HUBBUB_NEEDDATA) {
- /* Incomplete input sequence */
- if (*sourcelen > INVAL_BUFSIZE)
- abort();
-
- memmove(c->inval_buf, (char *) *source, *sourcelen);
- c->inval_buf[*sourcelen] = '\0';
- c->inval_len = *sourcelen;
-
- *source += *sourcelen;
- *sourcelen = 0;
-
- return HUBBUB_OK;
- } else if (error == HUBBUB_INVALID) {
- /* Illegal input sequence */
- uint32_t nextchar;
-
- /* Clear inval buffer */
- c->inval_buf[0] = '\0';
- c->inval_len = 0;
-
- /* Strict errormode; simply flag invalid character */
- if (c->base.errormode == HUBBUB_CHARSETCODEC_ERROR_STRICT) {
- return HUBBUB_INVALID;
- }
-
- /* Find next valid UTF-16 sequence.
- * We're processing client-provided data, so let's
- * be paranoid about its validity. */
- error = hubbub_utf16_next_paranoid(*source, *sourcelen,
- 0, &nextchar);
- if (error != HUBBUB_OK) {
- if (error == HUBBUB_NEEDDATA) {
- /* Need more data to be sure */
- if (*sourcelen > INVAL_BUFSIZE)
- abort();
-
- memmove(c->inval_buf, (char *) *source,
- *sourcelen);
- c->inval_buf[*sourcelen] = '\0';
- c->inval_len = *sourcelen;
-
- *source += *sourcelen;
- *sourcelen = 0;
-
- nextchar = 0;
- } else {
- return error;
- }
- }
-
- /* output U+FFFD and continue processing. */
- error = hubbub_utf16_codec_filter_decoded_char(c,
- 0xFFFD, dest, destlen);
- if (error == HUBBUB_OK || error == HUBBUB_NOMEM) {
- /* filter function succeeded; update source pointers */
- *source += nextchar;
- *sourcelen -= nextchar;
- }
-
- return error;
- }
-
- return HUBBUB_OK;
-}
-
-/**
- * Feed a UCS4 character through the registered filter and output the result
- *
- * \param c Codec to use
- * \param ucs4 UCS4 character (host endian)
- * \param dest Pointer to pointer to output buffer
- * \param destlen Pointer to output buffer length
- * \return HUBBUB_OK on success,
- * HUBBUB_NOMEM if output buffer is too small,
- * <any_other_error> as a result of the failure of the
- * client-provided filter function.
- */
-hubbub_error hubbub_utf16_codec_filter_decoded_char(hubbub_utf16_codec *c,
- uint32_t ucs4, uint8_t **dest, size_t *destlen)
-{
- if (c->base.filter != NULL) {
- uint32_t *rep;
- size_t replen;
- hubbub_error error;
-
- error = c->base.filter(ucs4, &rep, &replen,
- c->base.filter_pw);
- if (error != HUBBUB_OK) {
- return error;
- }
-
- while (replen > 0 && *destlen >= replen * 4) {
- *((uint32_t *) (void *) *dest) = htonl(*rep);
-
- *dest += 4;
- *destlen -= 4;
-
- rep++;
- replen--;
- }
-
- if (*destlen < replen * 4) {
- /* Run out of output buffer */
- size_t i;
-
- /* Buffer remaining output */
- c->read_len = replen;
-
- for (i = 0; i < replen; i++) {
- c->read_buf[i] = rep[i];
- }
-
- return HUBBUB_NOMEM;
- }
-
- } else {
- if (*destlen < 4) {
- /* Run out of output buffer */
- c->read_len = 1;
- c->read_buf[0] = ucs4;
-
- return HUBBUB_NOMEM;
- }
-
- *((uint32_t *) (void *) *dest) = htonl(ucs4);
- *dest += 4;
- *destlen -= 4;
- }
-
- return HUBBUB_OK;
-}
-
-
-const hubbub_charsethandler hubbub_utf16_codec_handler = {
- hubbub_utf16_codec_handles_charset,
- hubbub_utf16_codec_create
-};
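
[Editorial note, not part of the patch: the deleted UTF-16 codec ultimately rests on surrogate-pair decoding to turn UTF-16 code units into UCS4 values. The self-contained sketch below shows that conversion in isolation; it illustrates the underlying algorithm only and is not the removed hubbub_utf16_to_ucs4 routine itself.]

#include <stdint.h>
#include <stddef.h>
#include <stdio.h>

/* Decode one big-endian UTF-16 sequence to UCS4.
 * Returns the number of bytes consumed, 0 if more input is needed,
 * or -1 on an illegal sequence (e.g. an unpaired surrogate). */
static int utf16be_to_ucs4(const uint8_t *s, size_t len, uint32_t *ucs4)
{
	uint32_t hi, lo;

	if (len < 2)
		return 0;

	hi = ((uint32_t) s[0] << 8) | s[1];

	if (hi < 0xD800 || hi > 0xDFFF) {
		*ucs4 = hi;
		return 2;
	}

	if (hi > 0xDBFF)		/* lone low surrogate */
		return -1;

	if (len < 4)
		return 0;		/* high surrogate, but pair incomplete */

	lo = ((uint32_t) s[2] << 8) | s[3];
	if (lo < 0xDC00 || lo > 0xDFFF)	/* missing low surrogate */
		return -1;

	*ucs4 = 0x10000 + ((hi - 0xD800) << 10) + (lo - 0xDC00);
	return 4;
}

int main(void)
{
	const uint8_t in[] = { 0xD8, 0x3D, 0xDE, 0x00 };	/* U+1F600 */
	uint32_t c;

	if (utf16be_to_ucs4(in, sizeof in, &c) > 0)
		printf("U+%04X\n", (unsigned) c);

	return 0;
}
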
diff --git a/src/charset/codec_utf8.c b/src/charset/codec_utf8.c
deleted file mode 100644
index 86d667f..0000000
--- a/src/charset/codec_utf8.c
+++ /dev/null
@@ -1,620 +0,0 @@
-/*
- * This file is part of Hubbub.
- * Licensed under the MIT License,
- * http://www.opensource.org/licenses/mit-license.php
- * Copyright 2007 John-Mark Bell <jmb@netsurf-browser.org>
- */
-
-#include <stdlib.h>
-#include <string.h>
-
-/* These two are for htonl / ntohl */
-#include <arpa/inet.h>
-#include <netinet/in.h>
-
-#include "charset/aliases.h"
-#include "utils/utf8.h"
-#include "utils/utils.h"
-
-#include "codec_impl.h"
-
-/**
- * UTF-8 charset codec
- */
-typedef struct hubbub_utf8_codec {
- hubbub_charsetcodec base; /**< Base class */
-
-#define INVAL_BUFSIZE (32)
- uint8_t inval_buf[INVAL_BUFSIZE]; /**< Buffer for fixing up
- * incomplete input
- * sequences */
-	size_t inval_len;		/**< Byte length of inval_buf */
-
-#define READ_BUFSIZE (8)
- uint32_t read_buf[READ_BUFSIZE]; /**< Buffer for partial
- * output sequences (decode)
- * (host-endian) */
- size_t read_len; /**< Character length of read_buf */
-
-#define WRITE_BUFSIZE (8)
- uint32_t write_buf[WRITE_BUFSIZE]; /**< Buffer for partial
- * output sequences (encode)
- * (host-endian) */
- size_t write_len; /**< Character length of write_buf */
-
-} hubbub_utf8_codec;
-
-static bool hubbub_utf8_codec_handles_charset(const char *charset);
-static hubbub_charsetcodec *hubbub_utf8_codec_create(const char *charset,
- hubbub_alloc alloc, void *pw);
-static void hubbub_utf8_codec_destroy (hubbub_charsetcodec *codec);
-static hubbub_error hubbub_utf8_codec_encode(hubbub_charsetcodec *codec,
- const uint8_t **source, size_t *sourcelen,
- uint8_t **dest, size_t *destlen);
-static hubbub_error hubbub_utf8_codec_decode(hubbub_charsetcodec *codec,
- const uint8_t **source, size_t *sourcelen,
- uint8_t **dest, size_t *destlen);
-static hubbub_error hubbub_utf8_codec_reset(hubbub_charsetcodec *codec);
-static hubbub_error hubbub_utf8_codec_read_char(hubbub_utf8_codec *c,
- const uint8_t **source, size_t *sourcelen,
- uint8_t **dest, size_t *destlen);
-static hubbub_error hubbub_utf8_codec_filter_decoded_char(
- hubbub_utf8_codec *c,
- uint32_t ucs4, uint8_t **dest, size_t *destlen);
-
-/**
- * Determine whether this codec handles a specific charset
- *
- * \param charset Charset to test
- * \return true if handleable, false otherwise
- */
-bool hubbub_utf8_codec_handles_charset(const char *charset)
-{
- return hubbub_mibenum_from_name(charset, strlen(charset)) ==
- hubbub_mibenum_from_name("UTF-8", SLEN("UTF-8"));
-}
-
-/**
- * Create a utf8 codec
- *
- * \param charset The charset to read from / write to
- * \param alloc Memory (de)allocation function
- * \param pw Pointer to client-specific private data (may be NULL)
- * \return Pointer to codec, or NULL on failure
- */
-hubbub_charsetcodec *hubbub_utf8_codec_create(const char *charset,
- hubbub_alloc alloc, void *pw)
-{
- hubbub_utf8_codec *codec;
-
- UNUSED(charset);
-
- codec = alloc(NULL, sizeof(hubbub_utf8_codec), pw);
- if (codec == NULL)
- return NULL;
-
- codec->inval_buf[0] = '\0';
- codec->inval_len = 0;
-
- codec->read_buf[0] = 0;
- codec->read_len = 0;
-
- codec->write_buf[0] = 0;
- codec->write_len = 0;
-
- /* Finally, populate vtable */
- codec->base.handler.destroy = hubbub_utf8_codec_destroy;
- codec->base.handler.encode = hubbub_utf8_codec_encode;
- codec->base.handler.decode = hubbub_utf8_codec_decode;
- codec->base.handler.reset = hubbub_utf8_codec_reset;
-
- return (hubbub_charsetcodec *) codec;
-}
-
-/**
- * Destroy a utf8 codec
- *
- * \param codec The codec to destroy
- */
-void hubbub_utf8_codec_destroy (hubbub_charsetcodec *codec)
-{
- UNUSED(codec);
-}
-
-/**
- * Encode a chunk of UCS4 data into utf8
- *
- * \param codec The codec to use
- * \param source Pointer to pointer to source data
- * \param sourcelen Pointer to length (in bytes) of source data
- * \param dest Pointer to pointer to output buffer
- * \param destlen Pointer to length (in bytes) of output buffer
- * \return HUBBUB_OK on success,
- * HUBBUB_NOMEM if output buffer is too small,
- * HUBBUB_INVALID if a character cannot be represented and the
- * codec's error handling mode is set to STRICT,
- * <any_other_error> as a result of the failure of the
- * client-provided filter function.
- *
- * On exit, ::source will point immediately _after_ the last input character
- * read. Any remaining output for the character will be buffered by the
- * codec for writing on the next call. This buffered data is post-filtering,
- * so will not be refiltered on the next call.
- *
- * In the case of the filter function failing, ::source will point _at_ the
- * last input character read; nothing will be written or buffered for the
- * failed character. It is up to the client to fix the cause of the failure
- * and retry the encoding process.
- *
- * Note that, if failure occurs whilst attempting to write any output
- * buffered by the last call, then ::source and ::sourcelen will remain
- * unchanged (as nothing more has been read).
- *
- * There is no way to determine the output character which caused a
- * failure (as it may be one in a filter-injected replacement sequence).
- * It is, however, possible to determine which source character caused it
- * (this being the character immediately before the location pointed to by
- * ::source on exit).
- *
- * [I.e. the process of filtering results in a potential one-to-many mapping
- * between source characters and output characters, and identification of
- * individual output characters is impossible.]
- *
- * ::sourcelen will be reduced appropriately on exit.
- *
- * ::dest will point immediately _after_ the last character written.
- *
- * ::destlen will be reduced appropriately on exit.
- */
-hubbub_error hubbub_utf8_codec_encode(hubbub_charsetcodec *codec,
- const uint8_t **source, size_t *sourcelen,
- uint8_t **dest, size_t *destlen)
-{
- hubbub_utf8_codec *c = (hubbub_utf8_codec *) codec;
- uint32_t ucs4;
- uint32_t *towrite;
- size_t towritelen;
- hubbub_error error;
-
- /* Process any outstanding characters from the previous call */
- if (c->write_len > 0) {
- uint32_t *pwrite = c->write_buf;
- uint8_t buf[6];
- size_t len;
-
- while (c->write_len > 0) {
- error = hubbub_utf8_from_ucs4(pwrite[0], buf, &len);
- if (error != HUBBUB_OK)
- abort();
-
- if (*destlen < len) {
- /* Insufficient output buffer space */
- for (len = 0; len < c->write_len; len++)
- c->write_buf[len] = pwrite[len];
-
- return HUBBUB_NOMEM;
- }
-
- memcpy(*dest, buf, len);
-
- *dest += len;
- *destlen -= len;
-
- pwrite++;
- c->write_len--;
- }
- }
-
- /* Now process the characters for this call */
- while (*sourcelen > 0) {
- ucs4 = ntohl(*((uint32_t *) (void *) *source));
- towrite = &ucs4;
- towritelen = 1;
-
- /* Run character we're about to output through the
- * registered filter, so it can replace it. */
- if (c->base.filter != NULL) {
- error = c->base.filter(ucs4,
- &towrite, &towritelen,
- c->base.filter_pw);
- if (error != HUBBUB_OK)
- return error;
- }
-
- /* Output current characters */
- while (towritelen > 0) {
- uint8_t buf[6];
- size_t len;
-
- error = hubbub_utf8_from_ucs4(towrite[0], buf, &len);
- if (error != HUBBUB_OK)
- abort();
-
- if (*destlen < len) {
- /* Insufficient output space */
- if (towritelen >= WRITE_BUFSIZE)
- abort();
-
- c->write_len = towritelen;
-
- /* Copy pending chars to save area, for
- * processing next call. */
- for (len = 0; len < towritelen; len++)
- c->write_buf[len] = towrite[len];
-
- /* Claim character we've just buffered,
- * so it's not reprocessed */
- *source += 4;
- *sourcelen -= 4;
-
- return HUBBUB_NOMEM;
- }
-
- memcpy(*dest, buf, len);
-
- *dest += len;
- *destlen -= len;
-
- towrite++;
- towritelen--;
- }
-
- *source += 4;
- *sourcelen -= 4;
- }
-
- return HUBBUB_OK;
-}
-
-/**
- * Decode a chunk of utf8 data into UCS4
- *
- * \param codec The codec to use
- * \param source Pointer to pointer to source data
- * \param sourcelen Pointer to length (in bytes) of source data
- * \param dest Pointer to pointer to output buffer
- * \param destlen Pointer to length (in bytes) of output buffer
- * \return HUBBUB_OK on success,
- * HUBBUB_NOMEM if output buffer is too small,
- * HUBBUB_INVALID if a character cannot be represented and the
- * codec's error handling mode is set to STRICT,
- * <any_other_error> as a result of the failure of the
- * client-provided filter function.
- *
- * On exit, ::source will point immediately _after_ the last input character
- * read, if the result is _OK or _NOMEM. Any remaining output for the
- * character will be buffered by the codec for writing on the next call.
- * This buffered data is post-filtering, so will not be refiltered on the
- * next call.
- *
- * In the case of the result being _INVALID or the filter function failing,
- * ::source will point _at_ the last input character read; nothing will be
- * written or buffered for the failed character. It is up to the client to
- * fix the cause of the failure and retry the decoding process.
- *
- * Note that, if failure occurs whilst attempting to write any output
- * buffered by the last call, then ::source and ::sourcelen will remain
- * unchanged (as nothing more has been read).
- *
- * There is no way to determine the output character which caused a
- * failure (as it may be one in a filter-injected replacement sequence).
- * It is, however, possible to determine which source character caused it
- * (this being the character immediately at or before the location pointed
- * to by ::source on exit).
- *
- * [I.e. the process of filtering results in a potential one-to-many mapping
- * between source characters and output characters, and identification of
- * individual output characters is impossible.]
- *
- * If STRICT error handling is configured and an illegal sequence is split
- * over two calls, then _INVALID will be returned from the second call,
- * but ::source will point mid-way through the invalid sequence (i.e. it
- * will be unmodified over the second call). In addition, the internal
- * incomplete-sequence buffer will be emptied, such that subsequent calls
- * will progress, rather than re-evaluating the same invalid sequence.
- *
- * ::sourcelen will be reduced appropriately on exit.
- *
- * ::dest will point immediately _after_ the last character written.
- *
- * ::destlen will be reduced appropriately on exit.
- *
- * Call this with a source length of 0 to flush the output buffer.
- */
-hubbub_error hubbub_utf8_codec_decode(hubbub_charsetcodec *codec,
- const uint8_t **source, size_t *sourcelen,
- uint8_t **dest, size_t *destlen)
-{
- hubbub_utf8_codec *c = (hubbub_utf8_codec *) codec;
- hubbub_error error;
-
- if (c->read_len > 0) {
- /* Output left over from last decode */
- uint32_t *pread = c->read_buf;
-
- while (c->read_len > 0 && *destlen >= c->read_len * 4) {
- *((uint32_t *) (void *) *dest) = htonl(pread[0]);
-
- *dest += 4;
- *destlen -= 4;
-
- pread++;
- c->read_len--;
- }
-
- if (*destlen < c->read_len * 4) {
- /* Ran out of output buffer */
- size_t i;
-
- /* Shuffle remaining output down */
- for (i = 0; i < c->read_len; i++)
- c->read_buf[i] = pread[i];
-
- return HUBBUB_NOMEM;
- }
- }
-
- if (c->inval_len > 0) {
- /* The last decode ended in an incomplete sequence.
- * Fill up inval_buf with data from the start of the
- * new chunk and process it. */
- uint8_t *in = c->inval_buf;
- size_t ol = c->inval_len;
- size_t l = min(INVAL_BUFSIZE - ol - 1, *sourcelen);
- size_t orig_l = l;
-
- memcpy(c->inval_buf + ol, *source, l);
-
- l += c->inval_len;
-
- error = hubbub_utf8_codec_read_char(c,
- (const uint8_t **) &in, &l, dest, destlen);
- if (error != HUBBUB_OK && error != HUBBUB_NOMEM) {
- return error;
- }
-
- /* And now, fix up source pointers */
- *source += max((signed) (orig_l - l), 0);
- *sourcelen -= max((signed) (orig_l - l), 0);
-
- /* Failed to resolve an incomplete character and
- * ran out of buffer space. No recovery strategy
- * possible, so explode everywhere. */
- if ((orig_l + ol) - l == 0)
- abort();
-
- /* Report memory exhaustion case from above */
- if (error != HUBBUB_OK)
- return error;
- }
-
- /* Finally, the "normal" case; process all outstanding characters */
- while (*sourcelen > 0) {
- error = hubbub_utf8_codec_read_char(c,
- source, sourcelen, dest, destlen);
- if (error != HUBBUB_OK) {
- return error;
- }
- }
-
- return HUBBUB_OK;
-}
-
-/**
- * Clear a utf8 codec's encoding state
- *
- * \param codec The codec to reset
- * \return HUBBUB_OK on success, appropriate error otherwise
- */
-hubbub_error hubbub_utf8_codec_reset(hubbub_charsetcodec *codec)
-{
- hubbub_utf8_codec *c = (hubbub_utf8_codec *) codec;
-
- c->inval_buf[0] = '\0';
- c->inval_len = 0;
-
- c->read_buf[0] = 0;
- c->read_len = 0;
-
- c->write_buf[0] = 0;
- c->write_len = 0;
-
- return HUBBUB_OK;
-}
-
-
-/**
- * Read a character from the UTF-8 input and convert it to UCS4 (big endian)
- *
- * \param c The codec
- * \param source Pointer to pointer to source buffer (updated on exit)
- * \param sourcelen Pointer to length of source buffer (updated on exit)
- * \param dest Pointer to pointer to output buffer (updated on exit)
- * \param destlen Pointer to length of output buffer (updated on exit)
- * \return HUBBUB_OK on success,
- * HUBBUB_NOMEM if output buffer is too small,
- * HUBBUB_INVALID if a character cannot be represented and the
- * codec's error handling mode is set to STRICT,
- * <any_other_error> as a result of the failure of the
- * client-provided filter function.
- *
- * On exit, ::source will point immediately _after_ the last input character
- * read, if the result is _OK or _NOMEM. Any remaining output for the
- * character will be buffered by the codec for writing on the next call.
- * This buffered data is post-filtering, so will not be refiltered on the
- * next call.
- *
- * In the case of the result being _INVALID or the filter function failing,
- * ::source will point _at_ the last input character read; nothing will be
- * written or buffered for the failed character. It is up to the client to
- * fix the cause of the failure and retry the decoding process.
- *
- * ::sourcelen will be reduced appropriately on exit.
- *
- * ::dest will point immediately _after_ the last character written.
- *
- * ::destlen will be reduced appropriately on exit.
- */
-hubbub_error hubbub_utf8_codec_read_char(hubbub_utf8_codec *c,
- const uint8_t **source, size_t *sourcelen,
- uint8_t **dest, size_t *destlen)
-{
- uint32_t ucs4;
- size_t sucs4;
- hubbub_error error;
-
- /* Convert a single character */
- error = hubbub_utf8_to_ucs4(*source, *sourcelen, &ucs4, &sucs4);
- if (error == HUBBUB_OK) {
- /* Read a character */
- error = hubbub_utf8_codec_filter_decoded_char(c,
- ucs4, dest, destlen);
- if (error == HUBBUB_OK || error == HUBBUB_NOMEM) {
- /* filter function succeeded; update source pointers */
- *source += sucs4;
- *sourcelen -= sucs4;
- }
-
- /* Clear inval buffer */
- c->inval_buf[0] = '\0';
- c->inval_len = 0;
-
- return error;
- } else if (error == HUBBUB_NEEDDATA) {
- /* Incomplete input sequence */
- if (*sourcelen > INVAL_BUFSIZE)
- abort();
-
- memmove(c->inval_buf, (char *) *source, *sourcelen);
- c->inval_buf[*sourcelen] = '\0';
- c->inval_len = *sourcelen;
-
- *source += *sourcelen;
- *sourcelen = 0;
-
- return HUBBUB_OK;
- } else if (error == HUBBUB_INVALID) {
- /* Illegal input sequence */
- uint32_t nextchar;
-
- /* Clear inval buffer */
- c->inval_buf[0] = '\0';
- c->inval_len = 0;
-
- /* Strict errormode; simply flag invalid character */
- if (c->base.errormode == HUBBUB_CHARSETCODEC_ERROR_STRICT) {
- return HUBBUB_INVALID;
- }
-
- /* Find next valid UTF-8 sequence.
- * We're processing client-provided data, so let's
- * be paranoid about its validity. */
- error = hubbub_utf8_next_paranoid(*source, *sourcelen,
- 0, &nextchar);
- if (error != HUBBUB_OK) {
- if (error == HUBBUB_NEEDDATA) {
- /* Need more data to be sure */
- if (*sourcelen > INVAL_BUFSIZE)
- abort();
-
- memmove(c->inval_buf, (char *) *source,
- *sourcelen);
- c->inval_buf[*sourcelen] = '\0';
- c->inval_len = *sourcelen;
-
- *source += *sourcelen;
- *sourcelen = 0;
-
- nextchar = 0;
- } else {
- return error;
- }
- }
-
- /* output U+FFFD and continue processing. */
- error = hubbub_utf8_codec_filter_decoded_char(c,
- 0xFFFD, dest, destlen);
- if (error == HUBBUB_OK || error == HUBBUB_NOMEM) {
- /* filter function succeeded; update source pointers */
- *source += nextchar;
- *sourcelen -= nextchar;
- }
-
- return error;
- }
-
- return HUBBUB_OK;
-}
-
-/**
- * Feed a UCS4 character through the registered filter and output the result
- *
- * \param c Codec to use
- * \param ucs4 UCS4 character (host endian)
- * \param dest Pointer to pointer to output buffer
- * \param destlen Pointer to output buffer length
- * \return HUBBUB_OK on success,
- * HUBBUB_NOMEM if output buffer is too small,
- * <any_other_error> as a result of the failure of the
- * client-provided filter function.
- */
-hubbub_error hubbub_utf8_codec_filter_decoded_char(hubbub_utf8_codec *c,
- uint32_t ucs4, uint8_t **dest, size_t *destlen)
-{
- if (c->base.filter != NULL) {
- uint32_t *rep;
- size_t replen;
- hubbub_error error;
-
- error = c->base.filter(ucs4, &rep, &replen,
- c->base.filter_pw);
- if (error != HUBBUB_OK) {
- return error;
- }
-
- while (replen > 0 && *destlen >= replen * 4) {
- *((uint32_t *) (void *) *dest) = htonl(*rep);
-
- *dest += 4;
- *destlen -= 4;
-
- rep++;
- replen--;
- }
-
- if (*destlen < replen * 4) {
- /* Run out of output buffer */
- size_t i;
-
- /* Buffer remaining output */
- c->read_len = replen;
-
- for (i = 0; i < replen; i++) {
- c->read_buf[i] = rep[i];
- }
-
- return HUBBUB_NOMEM;
- }
-
- } else {
- if (*destlen < 4) {
- /* Run out of output buffer */
- c->read_len = 1;
- c->read_buf[0] = ucs4;
-
- return HUBBUB_NOMEM;
- }
-
- *((uint32_t *) (void *) *dest) = htonl(ucs4);
- *dest += 4;
- *destlen -= 4;
- }
-
- return HUBBUB_OK;
-}
-
-
-const hubbub_charsethandler hubbub_utf8_codec_handler = {
- hubbub_utf8_codec_handles_charset,
- hubbub_utf8_codec_create
-};
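
[Editorial note, not part of the patch: the deleted UTF-8 codec's encode path boils down to serialising a UCS4 value as a UTF-8 byte sequence. A minimal sketch of that serialisation follows; it caps output at 4 bytes (U+10FFFF), whereas the removed code sized its scratch buffers at 6 bytes, the pre-RFC 3629 maximum.]

#include <stdint.h>
#include <stddef.h>
#include <stdio.h>

/* Encode a UCS4 code point as UTF-8.
 * Returns the number of bytes written to buf (at most 4),
 * or 0 if the value is not a valid Unicode scalar. */
static size_t ucs4_to_utf8(uint32_t c, uint8_t buf[4])
{
	if (c < 0x80) {
		buf[0] = (uint8_t) c;
		return 1;
	} else if (c < 0x800) {
		buf[0] = (uint8_t) (0xC0 | (c >> 6));
		buf[1] = (uint8_t) (0x80 | (c & 0x3F));
		return 2;
	} else if (c < 0x10000) {
		if (c >= 0xD800 && c <= 0xDFFF)
			return 0;	/* surrogates are not scalars */
		buf[0] = (uint8_t) (0xE0 | (c >> 12));
		buf[1] = (uint8_t) (0x80 | ((c >> 6) & 0x3F));
		buf[2] = (uint8_t) (0x80 | (c & 0x3F));
		return 3;
	} else if (c < 0x110000) {
		buf[0] = (uint8_t) (0xF0 | (c >> 18));
		buf[1] = (uint8_t) (0x80 | ((c >> 12) & 0x3F));
		buf[2] = (uint8_t) (0x80 | ((c >> 6) & 0x3F));
		buf[3] = (uint8_t) (0x80 | (c & 0x3F));
		return 4;
	}

	return 0;
}

int main(void)
{
	uint8_t buf[4];
	size_t i, len = ucs4_to_utf8(0xFFFD, buf);	/* prints EF BF BD */

	for (i = 0; i < len; i++)
		printf("%02X ", buf[i]);
	putchar('\n');

	return 0;
}
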
diff --git a/src/charset/detect.c b/src/charset/detect.c
index 8ff3b87..768eb9a 100644
--- a/src/charset/detect.c
+++ b/src/charset/detect.c
@@ -8,12 +8,15 @@
#include <stdbool.h>
#include <string.h>
-#include "charset/aliases.h"
+#include <parserutils/charset/mibenum.h>
+
+#include <hubbub/types.h>
+
#include "utils/utils.h"
#include "detect.h"
-static uint16_t hubbub_charset_read_bom(const uint8_t **data, size_t *len);
+static uint16_t hubbub_charset_read_bom(const uint8_t *data, size_t len);
static uint16_t hubbub_charset_scan_meta(const uint8_t *data, size_t len);
static uint16_t hubbub_charset_parse_attributes(const uint8_t **pos,
const uint8_t *end);
@@ -27,31 +30,31 @@ static bool hubbub_charset_get_attribute(const uint8_t **data,
/**
* Extract a charset from a chunk of data
*
- * \param data Pointer to pointer to buffer containing data
- * \param len Pointer to buffer length
- * \param mibenum Pointer to location to store MIB enum representing charset
- * \param source Pointer to location to receive charset source
- * \return HUBBUB_OK on success, appropriate error otherwise
+ * \param data Pointer to buffer containing data
+ * \param len Buffer length
+ * \param mibenum Pointer to location containing current MIB enum
+ * \param source	Pointer to location containing current charset source
+ * \return PARSERUTILS_OK on success, appropriate error otherwise
*
- * The data pointer and length will be modified by this function if
- * a byte order mark is encountered at the start of the buffer. The updated
- * data pointer will point to the first byte in the buffer after the BOM.
- * The length will be modified appropriately.
+ * ::mibenum and ::source will be updated on exit
*
* The larger a chunk of data fed to this routine, the better, as it allows
* charset autodetection access to a larger dataset for analysis.
*/
-hubbub_error hubbub_charset_extract(const uint8_t **data, size_t *len,
- uint16_t *mibenum, hubbub_charset_source *source)
+parserutils_error hubbub_charset_extract(const uint8_t *data, size_t len,
+ uint16_t *mibenum, uint32_t *source)
{
uint16_t charset = 0;
- if (data == NULL || *data == NULL || len == NULL ||
- mibenum == NULL || source == NULL)
- return HUBBUB_BADPARM;
+ if (data == NULL || mibenum == NULL || source == NULL)
+ return PARSERUTILS_BADPARM;
+
+ /* If the source is dictated, there's nothing for us to do */
+ if (*source == HUBBUB_CHARSET_DICTATED)
+ return PARSERUTILS_OK;
/* We need at least 4 bytes of data */
- if (*len < 4)
+ if (len < 4)
goto default_encoding;
/* First, look for a BOM */
@@ -60,21 +63,21 @@ hubbub_error hubbub_charset_extract(const uint8_t **data, size_t *len,
*mibenum = charset;
*source = HUBBUB_CHARSET_DOCUMENT;
- return HUBBUB_OK;
+ return PARSERUTILS_OK;
}
/* No BOM was found, so we must look for a meta charset within
* the document itself. */
- charset = hubbub_charset_scan_meta(*data, *len);
+ charset = hubbub_charset_scan_meta(data, len);
if (charset != 0) {
/* ISO-8859-1 becomes Windows-1252 */
- if (charset == hubbub_mibenum_from_name("ISO-8859-1",
- SLEN("ISO-8859-1"))) {
- charset = hubbub_mibenum_from_name("Windows-1252",
- SLEN("Windows-1252"));
+ if (charset == parserutils_charset_mibenum_from_name(
+ "ISO-8859-1", SLEN("ISO-8859-1"))) {
+ charset = parserutils_charset_mibenum_from_name(
+ "Windows-1252", SLEN("Windows-1252"));
/* Fallback to 8859-1 if that failed */
if (charset == 0)
- charset = hubbub_mibenum_from_name(
+ charset = parserutils_charset_mibenum_from_name(
"ISO-8859-1", SLEN("ISO-8859-1"));
}
@@ -94,23 +97,23 @@ hubbub_error hubbub_charset_extract(const uint8_t **data, size_t *len,
* autodetection routines (or the fallback case if they
* fail).
*/
- if (charset != hubbub_mibenum_from_name("UTF-16",
+ if (charset != parserutils_charset_mibenum_from_name("UTF-16",
SLEN("UTF-16")) &&
- charset != hubbub_mibenum_from_name("UTF-16LE",
- SLEN("UTF-16LE")) &&
- charset != hubbub_mibenum_from_name("UTF-16BE",
- SLEN("UTF-16BE")) &&
- charset != hubbub_mibenum_from_name("UTF-32",
- SLEN("UTF-32")) &&
- charset != hubbub_mibenum_from_name("UTF-32LE",
- SLEN("UTF-32LE")) &&
- charset != hubbub_mibenum_from_name("UTF-32BE",
- SLEN("UTF-32BE"))) {
+ charset != parserutils_charset_mibenum_from_name(
+ "UTF-16LE", SLEN("UTF-16LE")) &&
+ charset != parserutils_charset_mibenum_from_name(
+ "UTF-16BE", SLEN("UTF-16BE")) &&
+ charset != parserutils_charset_mibenum_from_name(
+ "UTF-32", SLEN("UTF-32")) &&
+ charset != parserutils_charset_mibenum_from_name(
+ "UTF-32LE", SLEN("UTF-32LE")) &&
+ charset != parserutils_charset_mibenum_from_name(
+ "UTF-32BE", SLEN("UTF-32BE"))) {
*mibenum = charset;
*source = HUBBUB_CHARSET_DOCUMENT;
- return HUBBUB_OK;
+ return PARSERUTILS_OK;
}
}
@@ -122,16 +125,16 @@ hubbub_error hubbub_charset_extract(const uint8_t **data, size_t *len,
/* We failed to autodetect a charset, so use the default fallback */
default_encoding:
- charset = hubbub_mibenum_from_name("Windows-1252",
+ charset = parserutils_charset_mibenum_from_name("Windows-1252",
SLEN("Windows-1252"));
if (charset == 0)
- charset = hubbub_mibenum_from_name("ISO-8859-1",
+ charset = parserutils_charset_mibenum_from_name("ISO-8859-1",
SLEN("ISO-8859-1"));
*mibenum = charset;
*source = HUBBUB_CHARSET_DEFAULT;
- return HUBBUB_OK;
+ return PARSERUTILS_OK;
}
@@ -139,65 +142,38 @@ default_encoding:
* Inspect the beginning of a buffer of data for the presence of a
* UTF Byte Order Mark.
*
- * \param data Pointer to pointer to buffer containing data
- * \param len Pointer to buffer length
+ * \param data Pointer to buffer containing data
+ * \param len Buffer length
* \return MIB enum representing encoding described by BOM, or 0 if not found
- *
- * If a BOM is found, the data pointer will be modified to point to the first
- * byte in the buffer after the BOM. The length will also be modified
- * appropriately.
*/
-uint16_t hubbub_charset_read_bom(const uint8_t **data, size_t *len)
+uint16_t hubbub_charset_read_bom(const uint8_t *data, size_t len)
{
- if (data == NULL || *data == NULL || len == NULL)
+ if (data == NULL)
return 0;
/* We require at least 4 bytes of data */
- if (*len < 4)
+ if (len < 4)
return 0;
-#define UTF32BOM_LEN (4)
-#define UTF16BOM_LEN (2)
-#define UTF8BOM_LEN (3)
-
- if ((*data)[0] == 0x00 && (*data)[1] == 0x00 &&
- (*data)[2] == 0xFE && (*data)[3] == 0xFF) {
- *data += UTF32BOM_LEN;
- *len -= UTF32BOM_LEN;
-
- return hubbub_mibenum_from_name("UTF-32BE",
+ if (data[0] == 0x00 && data[1] == 0x00 &&
+ data[2] == 0xFE && data[3] == 0xFF) {
+ return parserutils_charset_mibenum_from_name("UTF-32BE",
SLEN("UTF-32BE"));
- } else if ((*data)[0] == 0xFF && (*data)[1] == 0xFE &&
- (*data)[2] == 0x00 && (*data)[3] == 0x00) {
- *data += UTF32BOM_LEN;
- *len -= UTF32BOM_LEN;
-
- return hubbub_mibenum_from_name("UTF-32LE",
+ } else if (data[0] == 0xFF && data[1] == 0xFE &&
+ data[2] == 0x00 && data[3] == 0x00) {
+ return parserutils_charset_mibenum_from_name("UTF-32LE",
SLEN("UTF-32LE"));
- } else if ((*data)[0] == 0xFE && (*data)[1] == 0xFF) {
- *data += UTF16BOM_LEN;
- *len -= UTF16BOM_LEN;
-
- return hubbub_mibenum_from_name("UTF-16BE",
+ } else if (data[0] == 0xFE && data[1] == 0xFF) {
+ return parserutils_charset_mibenum_from_name("UTF-16BE",
SLEN("UTF-16BE"));
- } else if ((*data)[0] == 0xFF && (*data)[1] == 0xFE) {
- *data += UTF16BOM_LEN;
- *len -= UTF16BOM_LEN;
-
- return hubbub_mibenum_from_name("UTF-16LE",
+ } else if (data[0] == 0xFF && data[1] == 0xFE) {
+ return parserutils_charset_mibenum_from_name("UTF-16LE",
SLEN("UTF-16LE"));
- } else if ((*data)[0] == 0xEF && (*data)[1] == 0xBB &&
- (*data)[2] == 0xBF) {
- *data += UTF8BOM_LEN;
- *len -= UTF8BOM_LEN;
-
- return hubbub_mibenum_from_name("UTF-8", SLEN("UTF-8"));
+ } else if (data[0] == 0xEF && data[1] == 0xBB && data[2] == 0xBF) {
+ return parserutils_charset_mibenum_from_name("UTF-8",
+ SLEN("UTF-8"));
}
-#undef UTF32BOM_LEN
-#undef UTF16BOM_LEN
-#undef UTF8BOM_LEN
-
return 0;
}
@@ -223,7 +199,7 @@ uint16_t hubbub_charset_read_bom(const uint8_t **data, size_t *len)
* Search for a meta charset within a buffer of data
*
* \param data Pointer to buffer containing data
- * \param len Length of buffer in data
+ * \param len Length of buffer
* \return MIB enum representing encoding, or 0 if none found
*/
uint16_t hubbub_charset_scan_meta(const uint8_t *data, size_t len)
@@ -344,7 +320,7 @@ uint16_t hubbub_charset_parse_attributes(const uint8_t **pos,
while (valuelen > 0 && ISSPACE(value[valuelen - 1]))
valuelen--;
- mibenum = hubbub_mibenum_from_name(
+ mibenum = parserutils_charset_mibenum_from_name(
(const char *) value, valuelen);
if (mibenum != 0)
return mibenum;
@@ -478,8 +454,8 @@ uint16_t hubbub_charset_parse_content(const uint8_t *value,
/* 8 */
if (tentative != NULL) {
- return hubbub_mibenum_from_name((const char *) tentative,
- tentative_len);
+ return parserutils_charset_mibenum_from_name(
+ (const char *) tentative, tentative_len);
}
/* 9 */
@@ -555,6 +531,7 @@ bool hubbub_charset_get_attribute(const uint8_t **data, const uint8_t *end,
/* c */
if (*pos == '/' || *pos == '<' || *pos == '>') {
+ *data = pos;
return true;
}
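
[Editorial note, not part of the patch: the reworked hubbub_charset_read_bom no longer consumes the BOM; it only inspects the first bytes of the buffer and reports the encoding. The standalone sketch below mirrors that logic, returning charset names instead of parserutils MIB enums purely for illustration.]

#include <stdint.h>
#include <stddef.h>
#include <stdio.h>

/* Identify a Unicode BOM at the start of a buffer without consuming it.
 * Returns a charset name, or NULL if no BOM is present.  As in the
 * patched routine, at least 4 bytes of data are required. */
static const char *sniff_bom(const uint8_t *data, size_t len)
{
	if (data == NULL || len < 4)
		return NULL;

	if (data[0] == 0x00 && data[1] == 0x00 &&
			data[2] == 0xFE && data[3] == 0xFF)
		return "UTF-32BE";
	if (data[0] == 0xFF && data[1] == 0xFE &&
			data[2] == 0x00 && data[3] == 0x00)
		return "UTF-32LE";
	if (data[0] == 0xFE && data[1] == 0xFF)
		return "UTF-16BE";
	if (data[0] == 0xFF && data[1] == 0xFE)
		return "UTF-16LE";
	if (data[0] == 0xEF && data[1] == 0xBB && data[2] == 0xBF)
		return "UTF-8";

	return NULL;
}

int main(void)
{
	const uint8_t doc[] = { 0xEF, 0xBB, 0xBF, '<', 'p', '>' };
	const char *cs = sniff_bom(doc, sizeof doc);

	puts(cs != NULL ? cs : "no BOM");
	return 0;
}
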
diff --git a/src/charset/detect.h b/src/charset/detect.h
index 854a8d6..807f374 100644
--- a/src/charset/detect.h
+++ b/src/charset/detect.h
@@ -10,13 +10,11 @@
#include <inttypes.h>
-#include <hubbub/errors.h>
-#include <hubbub/functypes.h>
-#include <hubbub/types.h>
+#include <parserutils/errors.h>
/* Extract a charset from a chunk of data */
-hubbub_error hubbub_charset_extract(const uint8_t **data, size_t *len,
- uint16_t *mibenum, hubbub_charset_source *source);
+parserutils_error hubbub_charset_extract(const uint8_t *data, size_t len,
+ uint16_t *mibenum, uint32_t *source);
#endif
diff --git a/src/hubbub.c b/src/hubbub.c
index 32e0a1f..a1bd783 100644
--- a/src/hubbub.c
+++ b/src/hubbub.c
@@ -5,9 +5,10 @@
* Copyright 2007 John-Mark Bell <jmb@netsurf-browser.org>
*/
+#include <parserutils/parserutils.h>
+
#include <hubbub/hubbub.h>
-#include "charset/aliases.h"
#include "tokeniser/entities.h"
/**
@@ -28,13 +29,12 @@ hubbub_error hubbub_initialise(const char *aliases_file,
if (aliases_file == NULL || alloc == NULL)
return HUBBUB_BADPARM;
- error = hubbub_aliases_create(aliases_file, alloc, pw);
- if (error != HUBBUB_OK)
- return error;
+ if (parserutils_initialise(aliases_file, alloc, pw) != PARSERUTILS_OK)
+ return !HUBBUB_OK;
error = hubbub_entities_create(alloc, pw);
if (error != HUBBUB_OK) {
- hubbub_aliases_destroy(alloc, pw);
+ parserutils_finalise(alloc, pw);
return error;
}
@@ -55,7 +55,7 @@ hubbub_error hubbub_finalise(hubbub_alloc alloc, void *pw)
hubbub_entities_destroy(alloc, pw);
- hubbub_aliases_destroy(alloc, pw);
+ parserutils_finalise(alloc, pw);
return HUBBUB_OK;
}
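
[Editorial note, not part of the patch: after this change hubbub_initialise delegates alias handling to parserutils_initialise, but clients still call the same initialise/finalise pair. A minimal usage sketch follows; the allocator follows the realloc-style convention visible in the code above (size 0 frees), and the aliases file path is only a placeholder.]

#include <stdio.h>
#include <stdlib.h>

#include <hubbub/hubbub.h>

/* realloc-style allocator matching the hubbub_alloc convention used
 * throughout the library: grow/shrink with a size, free with size 0. */
static void *myrealloc(void *ptr, size_t len, void *pw)
{
	(void) pw;

	if (len == 0) {
		free(ptr);
		return NULL;
	}

	return realloc(ptr, len);
}

int main(void)
{
	/* Placeholder path; the aliases file location is
	 * installation-specific. */
	const char *aliases = "build/Aliases";

	if (hubbub_initialise(aliases, myrealloc, NULL) != HUBBUB_OK) {
		fprintf(stderr, "hubbub_initialise failed\n");
		return 1;
	}

	/* ... create parsers, feed data ... */

	hubbub_finalise(myrealloc, NULL);
	return 0;
}
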
diff --git a/src/input/Makefile b/src/input/Makefile
deleted file mode 100644
index 3b9206f..0000000
--- a/src/input/Makefile
+++ /dev/null
@@ -1,46 +0,0 @@
-# Child makefile fragment for libhubbub
-#
-# Toolchain is provided by top-level makefile
-#
-# Variables provided by top-level makefile
-#
-# COMPONENT The name of the component
-# EXPORT The location of the export directory
-# TOP The location of the source tree root
-# RELEASEDIR The place to put release objects
-# DEBUGDIR The place to put debug objects
-#
-# do_include Canned command sequence to include a child makefile
-#
-# Variables provided by parent makefile:
-#
-# DIR The name of the directory we're in, relative to $(TOP)
-#
-# Variables we can manipulate:
-#
-# ITEMS_CLEAN The list of items to remove for "make clean"
-# ITEMS_DISTCLEAN The list of items to remove for "make distclean"
-# TARGET_TESTS The list of target names to run for "make test"
-#
-# SOURCES The list of sources to build for $(COMPONENT)
-#
-# Plus anything from the toolchain
-
-# Push parent directory onto the directory stack
-sp := $(sp).x
-dirstack_$(sp) := $(d)
-d := $(DIR)
-
-# Sources
-SRCS_$(d) := filter.c inputstream.c utf8_stream.c utf16_stream.c
-
-# Append to sources for component
-SOURCES += $(addprefix $(d), $(SRCS_$(d)))
-
-# Now include any children we may have
-MAKE_INCLUDES := $(wildcard $(d)*/Makefile)
-$(eval $(foreach INC, $(MAKE_INCLUDES), $(call do_include,$(INC))))
-
-# Finally, pop off the directory stack
-d := $(dirstack_$(sp))
-sp := $(basename $(sp))
diff --git a/src/input/filter.c b/src/input/filter.c
deleted file mode 100644
index 7a97840..0000000
--- a/src/input/filter.c
+++ /dev/null
@@ -1,380 +0,0 @@
-/*
- * This file is part of Hubbub.
- * Licensed under the MIT License,
- * http://www.opensource.org/licenses/mit-license.php
- * Copyright 2007 John-Mark Bell <jmb@netsurf-browser.org>
- */
-
-#include <errno.h>
-#include <stdbool.h>
-#include <stdlib.h>
-#include <string.h>
-
-#include "charset/aliases.h"
-#include "charset/codec.h"
-#include "utils/utils.h"
-
-#include "input/filter.h"
-
-
-/** Input filter */
-struct hubbub_filter {
- hubbub_charsetcodec *read_codec; /**< Read codec */
- hubbub_charsetcodec *write_codec; /**< Write codec */
-
- uint32_t filter_output[2]; /**< Filter output buffer */
- uint32_t last_filter_char; /**< Last filtered character */
-
- uint32_t pivot_buf[64]; /**< Conversion pivot buffer */
-
- bool leftover; /**< Data remains from last call */
- uint8_t *pivot_left; /**< Remaining pivot to write */
- size_t pivot_len; /**< Length of pivot remaining */
-
- struct {
- uint16_t encoding; /**< Input encoding */
- } settings; /**< Filter settings */
-
- hubbub_alloc alloc; /**< Memory (de)allocation function */
- void *pw; /**< Client private data */
-};
-
-static hubbub_error hubbub_filter_set_defaults(hubbub_filter *input);
-static hubbub_error hubbub_filter_set_encoding(hubbub_filter *input,
- const char *enc);
-static hubbub_error read_character_filter(uint32_t c,
- uint32_t **output, size_t *outputlen, void *pw);
-
-/**
- * Create an input filter
- *
- * \param int_enc Desired encoding of document
- * \param alloc Function used to (de)allocate data
- * \param pw Pointer to client-specific private data (may be NULL)
- * \return Pointer to filter instance, or NULL on failure
- */
-hubbub_filter *hubbub_filter_create(const char *int_enc,
- hubbub_alloc alloc, void *pw)
-{
- hubbub_filter *filter;
-
- if (alloc == NULL)
- return NULL;
-
- filter = alloc(NULL, sizeof(*filter), pw);
- if (!filter)
- return NULL;
-
- filter->last_filter_char = 0;
-
- filter->leftover = false;
- filter->pivot_left = NULL;
- filter->pivot_len = 0;
-
- filter->alloc = alloc;
- filter->pw = pw;
-
- if (hubbub_filter_set_defaults(filter) != HUBBUB_OK) {
- filter->alloc(filter, 0, pw);
- return NULL;
- }
-
- filter->write_codec = hubbub_charsetcodec_create(int_enc, alloc, pw);
- if (filter->write_codec == NULL) {
- if (filter->read_codec != NULL)
- hubbub_charsetcodec_destroy(filter->read_codec);
- filter->alloc(filter, 0, pw);
- return NULL;
- }
-
- return filter;
-}
-
-/**
- * Destroy an input filter
- *
- * \param input Pointer to filter instance
- */
-void hubbub_filter_destroy(hubbub_filter *input)
-{
- if (input == NULL)
- return;
-
- if (input->read_codec != NULL)
- hubbub_charsetcodec_destroy(input->read_codec);
-
- if (input->write_codec != NULL)
- hubbub_charsetcodec_destroy(input->write_codec);
-
- input->alloc(input, 0, input->pw);
-
- return;
-}
-
-/**
- * Configure an input filter
- *
- * \param input Pointer to filter instance
- * \param type Input option type to configure
- * \param params Option-specific parameters
- * \return HUBBUB_OK on success, appropriate error otherwise
- */
-hubbub_error hubbub_filter_setopt(hubbub_filter *input,
- hubbub_filter_opttype type,
- hubbub_filter_optparams *params)
-{
- hubbub_error error = HUBBUB_OK;
-
- if (input == NULL || params == NULL)
- return HUBBUB_BADPARM;
-
- switch (type) {
- case HUBBUB_FILTER_SET_ENCODING:
- error = hubbub_filter_set_encoding(input,
- params->encoding.name);
- break;
- }
-
- return error;
-}
-
-/**
- * Process a chunk of data
- *
- * \param input Pointer to filter instance
- * \param data Pointer to pointer to input buffer
- * \param len Pointer to length of input buffer
- * \param output Pointer to pointer to output buffer
- * \param outlen Pointer to length of output buffer
- * \return HUBBUB_OK on success, appropriate error otherwise
- *
- * Call this with an input buffer length of 0 to flush any buffers.
- */
-hubbub_error hubbub_filter_process_chunk(hubbub_filter *input,
- const uint8_t **data, size_t *len,
- uint8_t **output, size_t *outlen)
-{
- hubbub_error read_error, write_error;
-
- if (input == NULL || data == NULL || *data == NULL || len == NULL ||
- output == NULL || *output == NULL || outlen == NULL)
- return HUBBUB_BADPARM;
-
- if (input->leftover) {
- /* Some data left to be written from last call */
-
- /* Attempt to flush the remaining data. */
- write_error = hubbub_charsetcodec_encode(input->write_codec,
- (const uint8_t **) &input->pivot_left,
- &input->pivot_len,
- output, outlen);
-
- if (write_error != HUBBUB_OK) {
- return write_error;
- }
-
- /* And clear leftover */
- input->pivot_left = NULL;
- input->pivot_len = 0;
- input->leftover = false;
- }
-
- while (*len > 0) {
- size_t pivot_len = sizeof(input->pivot_buf);
- uint8_t *pivot = (uint8_t *) input->pivot_buf;
-
- read_error = hubbub_charsetcodec_decode(input->read_codec,
- data, len,
- (uint8_t **) &pivot, &pivot_len);
-
- pivot = (uint8_t *) input->pivot_buf;
- pivot_len = sizeof(input->pivot_buf) - pivot_len;
-
- if (pivot_len > 0) {
- write_error = hubbub_charsetcodec_encode(
- input->write_codec,
- (const uint8_t **) &pivot,
- &pivot_len,
- output, outlen);
-
- if (write_error != HUBBUB_OK) {
- input->leftover = true;
- input->pivot_left = pivot;
- input->pivot_len = pivot_len;
-
- return write_error;
- }
- }
-
- if (read_error != HUBBUB_OK && read_error != HUBBUB_NOMEM)
- return read_error;
- }
-
- return HUBBUB_OK;
-}
-
-/**
- * Reset an input filter's state
- *
- * \param input The input filter to reset
- * \return HUBBUB_OK on success, appropriate error otherwise
- */
-hubbub_error hubbub_filter_reset(hubbub_filter *input)
-{
- hubbub_error error;
-
- if (input == NULL)
- return HUBBUB_BADPARM;
-
- /* Clear pivot buffer leftovers */
- input->pivot_left = NULL;
- input->pivot_len = 0;
- input->leftover = false;
-
- /* Reset read codec */
- error = hubbub_charsetcodec_reset(input->read_codec);
- if (error != HUBBUB_OK)
- return error;
-
- /* Reset write codec */
- error = hubbub_charsetcodec_reset(input->write_codec);
- if (error != HUBBUB_OK)
- return error;
-
- return HUBBUB_OK;
-}
-
-/**
- * Set an input filter's default settings
- *
- * \param input Input filter to configure
- * \return HUBBUB_OK on success, appropriate error otherwise
- */
-hubbub_error hubbub_filter_set_defaults(hubbub_filter *input)
-{
- hubbub_error error;
-
- if (input == NULL)
- return HUBBUB_BADPARM;
-
- input->read_codec = NULL;
- input->write_codec = NULL;
- input->settings.encoding = 0;
- error = hubbub_filter_set_encoding(input, "ISO-8859-1");
- if (error != HUBBUB_OK)
- return error;
-
- return HUBBUB_OK;
-}
-
-/**
- * Set an input filter's encoding
- *
- * \param input Input filter to configure
- * \param enc Encoding name
- * \return HUBBUB_OK on success, appropriate error otherwise
- */
-hubbub_error hubbub_filter_set_encoding(hubbub_filter *input,
- const char *enc)
-{
- const char *old_enc;
- uint16_t mibenum;
- hubbub_error error;
- hubbub_charsetcodec_optparams params;
-
- if (input == NULL || enc == NULL)
- return HUBBUB_BADPARM;
-
- mibenum = hubbub_mibenum_from_name(enc, strlen(enc));
- if (mibenum == 0)
- return HUBBUB_INVALID;
-
- /* Exit early if we're already using this encoding */
- if (input->settings.encoding == mibenum)
- return HUBBUB_OK;
-
- old_enc = hubbub_mibenum_to_name(input->settings.encoding);
- if (old_enc == NULL)
- old_enc = "ISO-8859-1";
-
- if (input->read_codec != NULL)
- hubbub_charsetcodec_destroy(input->read_codec);
-
- input->read_codec = hubbub_charsetcodec_create(enc, input->alloc,
- input->pw);
- if (input->read_codec == NULL)
- return HUBBUB_NOMEM;
-
- /* Register filter function */
- params.filter_func.filter = read_character_filter;
- params.filter_func.pw = (void *) input;
- error = hubbub_charsetcodec_setopt(input->read_codec,
- HUBBUB_CHARSETCODEC_FILTER_FUNC,
- (hubbub_charsetcodec_optparams *) &params);
- if (error != HUBBUB_OK)
- return error;
-
- input->settings.encoding = mibenum;
-
- return HUBBUB_OK;
-}
-
-/**
- * Character filter function for read characters
- *
- * \param c The read character (UCS4 - host byte order)
- * \param output Pointer to pointer to output buffer (filled on exit)
- * \param outputlen Pointer to output buffer length (filled on exit)
- * \param pw Pointer to client-specific private data.
- * \return HUBBUB_OK on success, appropriate error otherwise
- */
-hubbub_error read_character_filter(uint32_t c, uint32_t **output,
- size_t *outputlen, void *pw)
-{
- hubbub_filter *input = (hubbub_filter *) pw;
- size_t len;
-
- if (output == NULL || outputlen == NULL || pw == NULL)
- return HUBBUB_BADPARM;
-
- /* Line ending normalisation:
- * CRLF -> LF (trap CR and let LF through unmodified)
- * CR -> LF (trap CR and convert to LF if not CRLF)
- * LF -> LF (leave LF alone)
- */
-
-#define NUL (0x00000000)
-#define CR (0x0000000D)
-#define LF (0x0000000A)
-#define REP (0x0000FFFD)
-
- /* Replace NUL (U+0000) characters in input with U+FFFD */
- if (c == NUL)
- c = REP;
-
- if (c == CR) {
- /* Convert CRs to LFs straight away */
- input->filter_output[0] = LF;
- len = 1;
- } else if (input->last_filter_char == CR && c == LF) {
- /* Trap this LF */
- len = 0;
- } else {
- /* Let character through unchanged */
- input->filter_output[0] = c;
- len = 1;
- }
-
-
-#undef NUL
-#undef CR
-#undef LF
-#undef REP
-
- input->last_filter_char = c;
-
- *output = input->filter_output;
- *outputlen = len;
-
- return HUBBUB_OK;
-}
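
[Editorial note, not part of the patch: the removed read_character_filter normalised line endings (CR and CRLF become LF) and replaced NUL with U+FFFD. The standalone sketch below reproduces that per-character logic outside the codec framework, as an illustration of the algorithm being dropped here.]

#include <stdint.h>
#include <stddef.h>
#include <stdio.h>

/* Normalise one code point given the previously seen one: CR is emitted
 * as LF immediately, an LF that follows a CR is suppressed, and NUL
 * becomes U+FFFD.  Returns the number of code points written to out
 * (0 or 1). */
static size_t normalise(uint32_t c, uint32_t last, uint32_t *out)
{
	if (c == 0x0000)
		c = 0xFFFD;		/* NUL -> REPLACEMENT CHARACTER */

	if (c == 0x000D) {
		*out = 0x000A;		/* CR -> LF straight away */
		return 1;
	}

	if (last == 0x000D && c == 0x000A)
		return 0;		/* LF after CR: already emitted */

	*out = c;
	return 1;
}

int main(void)
{
	const uint32_t in[] = { 'a', 0x0D, 0x0A, 'b', 0x0D, 'c', 0x00 };
	uint32_t last = 0, out;
	size_t i;

	/* Emits: a, LF, b, LF, c, U+FFFD */
	for (i = 0; i < sizeof in / sizeof in[0]; i++) {
		if (normalise(in[i], last, &out) == 1)
			printf("U+%04X\n", (unsigned) out);
		last = in[i];
	}

	return 0;
}
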
diff --git a/src/input/filter.h b/src/input/filter.h
deleted file mode 100644
index 6650e09..0000000
--- a/src/input/filter.h
+++ /dev/null
@@ -1,57 +0,0 @@
-/*
- * This file is part of Hubbub.
- * Licensed under the MIT License,
- * http://www.opensource.org/licenses/mit-license.php
- * Copyright 2007 John-Mark Bell <jmb@netsurf-browser.org>
- */
-
-#ifndef hubbub_input_filter_h_
-#define hubbub_input_filter_h_
-
-#include <inttypes.h>
-
-#include <hubbub/errors.h>
-#include <hubbub/functypes.h>
-
-typedef struct hubbub_filter hubbub_filter;
-
-/**
- * Input filter option types
- */
-typedef enum hubbub_filter_opttype {
- HUBBUB_FILTER_SET_ENCODING = 0,
-} hubbub_filter_opttype;
-
-/**
- * Input filter option parameters
- */
-typedef union hubbub_filter_optparams {
- /** Parameters for encoding setting */
- struct {
- /** Encoding name */
- const char *name;
- } encoding;
-} hubbub_filter_optparams;
-
-
-/* Create an input filter */
-hubbub_filter *hubbub_filter_create(const char *int_enc,
- hubbub_alloc alloc, void *pw);
-/* Destroy an input filter */
-void hubbub_filter_destroy(hubbub_filter *input);
-
-/* Configure an input filter */
-hubbub_error hubbub_filter_setopt(hubbub_filter *input,
- hubbub_filter_opttype type,
- hubbub_filter_optparams *params);
-
-/* Process a chunk of data */
-hubbub_error hubbub_filter_process_chunk(hubbub_filter *input,
- const uint8_t **data, size_t *len,
- uint8_t **output, size_t *outlen);
-
-/* Reset an input filter's state */
-hubbub_error hubbub_filter_reset(hubbub_filter *input);
-
-#endif
-
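For reference, this is roughly how the removed filter API fitted together from a caller's point of view. The sketch below is hypothetical illustration, not code from the library or its tests: it assumes the deleted header, a realloc-style hubbub_alloc, Windows-1252 input, and that the charset alias table has already been loaded.

#include <stdlib.h>

#include "input/filter.h"

static void *myrealloc(void *ptr, size_t size, void *pw)
{
	(void) pw;
	return realloc(ptr, size);
}

static void convert(const uint8_t *data, size_t len)
{
	uint8_t outbuf[256], *out = outbuf;
	size_t outlen = sizeof(outbuf);
	hubbub_filter_optparams params;
	hubbub_filter *filter;

	filter = hubbub_filter_create("UTF-8", myrealloc, NULL);
	if (filter == NULL)
		return;

	/* Tell the filter what the source encoding is */
	params.encoding.name = "Windows-1252";
	hubbub_filter_setopt(filter, HUBBUB_FILTER_SET_ENCODING, &params);

	/* Convert a chunk; HUBBUB_NOMEM would mean the output buffer is full */
	hubbub_filter_process_chunk(filter, &data, &len, &out, &outlen);

	/* Flush any remaining state at end of input */
	len = 0;
	hubbub_filter_process_chunk(filter, &data, &len, &out, &outlen);

	/* outbuf now holds sizeof(outbuf) - outlen bytes of UTF-8 */

	hubbub_filter_destroy(filter);
}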
diff --git a/src/input/inputstream.c b/src/input/inputstream.c
deleted file mode 100644
index 744aa23..0000000
--- a/src/input/inputstream.c
+++ /dev/null
@@ -1,481 +0,0 @@
-/*
- * This file is part of Hubbub.
- * Licensed under the MIT License,
- * http://www.opensource.org/licenses/mit-license.php
- * Copyright 2007 John-Mark Bell <jmb@netsurf-browser.org>
- */
-
-#include <stdlib.h>
-
-#include "charset/aliases.h"
-#include "input/streamimpl.h"
-
-/**
- * Buffer moving claimant context
- */
-struct hubbub_inputstream_bm_handler {
- hubbub_inputstream_buffermoved handler; /**< Handler function */
- void *pw; /**< Client private data */
-
- struct hubbub_inputstream_bm_handler *next;
- struct hubbub_inputstream_bm_handler *prev;
-};
-
-extern hubbub_streamhandler utf8stream;
-extern hubbub_streamhandler utf16stream;
-
-static hubbub_streamhandler *handler_table[] = {
- &utf8stream,
- &utf16stream,
- NULL
-};
-
-/**
- * Create an input stream
- *
- * \param enc Document charset, or NULL to autodetect
- * \param int_enc Desired encoding of document
- * \param alloc Memory (de)allocation function
- * \param pw Pointer to client-specific private data (may be NULL)
- * \return Pointer to stream instance, or NULL on failure
- */
-hubbub_inputstream *hubbub_inputstream_create(const char *enc,
- const char *int_enc, hubbub_alloc alloc, void *pw)
-{
- hubbub_inputstream *stream;
- hubbub_streamhandler **handler;
-
- if (int_enc == NULL || alloc == NULL)
- return NULL;
-
- /* Search for handler class */
- for (handler = handler_table; *handler != NULL; handler++) {
- if ((*handler)->uses_encoding(int_enc))
- break;
- }
-
- /* None found */
- if ((*handler) == NULL)
- return NULL;
-
- stream = (*handler)->create(enc, int_enc, alloc, pw);
- if (stream == NULL)
- return NULL;
-
- stream->handlers = NULL;
-
- stream->alloc = alloc;
- stream->pw = pw;
-
- return stream;
-}
-
-/**
- * Destroy an input stream
- *
- * \param stream Input stream to destroy
- */
-void hubbub_inputstream_destroy(hubbub_inputstream *stream)
-{
- hubbub_inputstream_bm_handler *h, *i;
-
- if (stream == NULL)
- return;
-
- for (h = stream->handlers; h; h = i) {
- i = h->next;
-
- stream->alloc(h, 0, stream->pw);
- }
-
- stream->destroy(stream);
-}
-
-/**
- * Append data to an input stream
- *
- * \param stream Input stream to append data to
- * \param data Data to append (in document charset), or NULL to flag EOF
- * \param len Length, in bytes, of data
- * \return HUBBUB_OK on success, appropriate error otherwise
- */
-hubbub_error hubbub_inputstream_append(hubbub_inputstream *stream,
- const uint8_t *data, size_t len)
-{
- if (stream == NULL)
- return HUBBUB_BADPARM;
-
- /* Calling this if we've disowned the buffer is foolish */
- if (stream->buffer == NULL)
- return HUBBUB_INVALID;
-
- return stream->append(stream, data, len);
-}
-
-/**
- * Insert data into stream at current location
- *
- * \param stream Input stream to insert into
- * \param data Data to insert (UTF-8 encoded)
- * \param len Length, in bytes, of data
- * \return HUBBUB_OK on success, appropriate error otherwise
- */
-hubbub_error hubbub_inputstream_insert(hubbub_inputstream *stream,
- const uint8_t *data, size_t len)
-{
- if (stream == NULL || data == NULL)
- return HUBBUB_BADPARM;
-
- /* Calling this if we've disowned the buffer is foolish */
- if (stream->buffer == NULL)
- return HUBBUB_INVALID;
-
- return stream->insert(stream, data, len);
-}
-
-/**
- * Look at the next character in the stream
- *
- * \param stream Stream to look in
- * \return UCS4 (host-endian) character code, or EOF or OOD.
- */
-uint32_t hubbub_inputstream_peek(hubbub_inputstream *stream)
-{
- /* It is illegal to call this after the buffer has been disowned */
- if (stream == NULL || stream->buffer == NULL)
- return HUBBUB_INPUTSTREAM_OOD;
-
-	return stream->peek(stream);
-}
-
-/**
- * Retrieve the byte index and length of the current character in the stream
- *
- * \param stream Stream to look in
- * \param len Pointer to location to receive byte length of character
- * \return Byte index of current character from start of stream,
- * or (uint32_t) -1 on error
- */
-uint32_t hubbub_inputstream_cur_pos(hubbub_inputstream *stream,
- size_t *len)
-{
- /* It is illegal to call this after the buffer has been disowned */
- if (stream == NULL || len == NULL || stream->buffer == NULL)
- return (uint32_t) -1;
-
- return stream->cur_pos(stream, len);
-}
-
-/**
- * Convert the current character to lower case
- *
- * \param stream Stream to look in
- */
-void hubbub_inputstream_lowercase(hubbub_inputstream *stream)
-{
- if (stream == NULL || stream->buffer == NULL)
- return;
-
- stream->lowercase(stream);
-}
-
-/**
- * Convert the current character to upper case
- *
- * \param stream Stream to look in
- */
-void hubbub_inputstream_uppercase(hubbub_inputstream *stream)
-{
- if (stream == NULL || stream->buffer == NULL)
- return;
-
- stream->uppercase(stream);
-}
-
-/**
- * Advance the stream's current position
- *
- * \param stream The stream whose position to advance
- */
-void hubbub_inputstream_advance(hubbub_inputstream *stream)
-{
- /* It is illegal to call this after the buffer has been disowned */
- if (stream == NULL || stream->buffer == NULL)
- return;
-
- if (stream->cursor == stream->buffer_len)
- return;
-
- stream->advance(stream);
-}
-
-/**
- * Push a character back onto the stream
- *
- * \param stream Stream to push back to
- * \param character UCS4 (host-endian) codepoint to push back
- * \return HUBBUB_OK on success, appropriate error otherwise
- *
- * Note that this doesn't actually modify the data in the stream.
- * It works by ensuring that the character located just before the
- * current stream location is the same as ::character. If it is,
- * then the stream pointer is moved back. If it is not, then an
- * error is returned and the stream pointer remains unmodified.
- */
-hubbub_error hubbub_inputstream_push_back(hubbub_inputstream *stream,
- uint32_t character)
-{
- /* It is illegal to call this after the buffer has been disowned */
- if (stream == NULL || stream->buffer == NULL)
- return HUBBUB_BADPARM;
-
- if (stream->cursor == 0)
- return HUBBUB_INVALID;
-
- return stream->push_back(stream, character);
-}
-
-/**
- * Rewind the input stream by a number of bytes
- *
- * \param stream Stream to rewind
- * \param n Number of bytes to go back
- * \return HUBBUB_OK on success, appropriate error otherwise
- */
-hubbub_error hubbub_inputstream_rewind(hubbub_inputstream *stream, size_t n)
-{
- if (stream == NULL || stream->buffer == NULL)
- return HUBBUB_BADPARM;
-
- if (stream->cursor < n)
- return HUBBUB_INVALID;
-
- stream->cursor -= n;
-
- return HUBBUB_OK;
-}
-
-/**
- * Claim ownership of an input stream's buffer
- *
- * \param stream Input stream whose buffer to claim
- * \param buffer Pointer to location to receive buffer pointer
- * \param len Pointer to location to receive byte length of buffer
- * \return HUBBUB_OK on success, appropriate error otherwise.
- *
- * Once the buffer has been claimed by a client, the input stream disclaims
- * all ownership rights (and invalidates any internal references it may have
- * to the buffer). Therefore, the only input stream call which may be made
- * after calling this function is to destroy the input stream. Note that
- * unless the stream pointer is located at EOF, this call will return an
- * error.
- */
-hubbub_error hubbub_inputstream_claim_buffer(hubbub_inputstream *stream,
- uint8_t **buffer, size_t *len)
-{
- if (stream == NULL || buffer == NULL || len == NULL)
- return HUBBUB_BADPARM;
-
- if (stream->had_eof == false ||
- stream->cursor != stream->buffer_len)
- return HUBBUB_INVALID;
-
- *buffer = stream->buffer;
- *len = stream->buffer_len;
-
- stream->buffer = NULL;
-
- return HUBBUB_OK;
-}
-
-/**
- * Register interest in buffer moved events
- *
- * \param stream Input stream to register interest with
- * \param handler Pointer to handler function
- * \param pw Pointer to client-specific private data (may be NULL)
- * \return HUBBUB_OK on success, appropriate error otherwise
- */
-hubbub_error hubbub_inputstream_register_movehandler(
- hubbub_inputstream *stream,
- hubbub_inputstream_buffermoved handler, void *pw)
-{
- hubbub_inputstream_bm_handler *h;
-
- if (stream == NULL || handler == NULL)
- return HUBBUB_BADPARM;
-
- h = stream->alloc(NULL, sizeof(hubbub_inputstream_bm_handler),
- stream->pw);
- if (h == NULL)
- return HUBBUB_NOMEM;
-
- h->handler = handler;
- h->pw = pw;
-
- h->prev = NULL;
- h->next = stream->handlers;
-
- if (stream->handlers)
- stream->handlers->prev = h;
- stream->handlers = h;
-
- /* And notify claimant of current buffer location */
- handler(stream->buffer, stream->buffer_len, pw);
-
- return HUBBUB_OK;
-}
-
-/**
- * Deregister interest in buffer moved events
- *
- * \param stream Input stream to deregister from
- * \param handler Pointer to handler function
- * \param pw Pointer to client-specific private data (may be NULL)
- * \return HUBBUB_OK on success, appropriate error otherwise
- */
-hubbub_error hubbub_inputstream_deregister_movehandler(
- hubbub_inputstream *stream,
- hubbub_inputstream_buffermoved handler, void *pw)
-{
- hubbub_inputstream_bm_handler *h;
-
- if (stream == NULL || handler == NULL)
- return HUBBUB_BADPARM;
-
- for (h = stream->handlers; h; h = h->next) {
- if (h->handler == handler && h->pw == pw)
- break;
- }
-
- if (h == NULL)
- return HUBBUB_INVALID;
-
- if (h->next)
- h->next->prev = h->prev;
- if (h->prev)
- h->prev->next = h->next;
- else
- stream->handlers = h->next;
-
- stream->alloc(h, 0, stream->pw);
-
- return HUBBUB_OK;
-}
-
-/**
- * Case insensitively compare a pair of ranges in the input stream
- *
- * \param stream Input stream to look in
- * \param r1 Offset of start of first range
- * \param r2 Offset of start of second range
- * \param len Byte length of ranges
- * \return 0 if ranges match, non-zero otherwise
- */
-int hubbub_inputstream_compare_range_ci(hubbub_inputstream *stream,
- uint32_t r1, uint32_t r2, size_t len)
-{
- if (stream == NULL || stream->buffer == NULL)
- return 1; /* arbitrary */
-
- return stream->cmp_range_ci(stream, r1, r2, len);
-}
-
-/**
- * Case sensitively compare a pair of ranges in the input stream
- *
- * \param stream Input stream to look in
- * \param r1 Offset of start of first range
- * \param r2 Offset of start of second range
- * \param len Byte length of ranges
- * \return 0 if ranges match, non-zero otherwise
- */
-int hubbub_inputstream_compare_range_cs(hubbub_inputstream *stream,
- uint32_t r1, uint32_t r2, size_t len)
-{
- if (stream == NULL || stream->buffer == NULL)
- return 1; /* arbitrary */
-
- return stream->cmp_range_cs(stream, r1, r2, len);
-}
-
-/**
- * Case sensitively compare a range of input stream against an ASCII string
- *
- * \param stream Input stream to look in
- * \param off Offset of range start
- * \param len Byte length of range
- * \param data Comparison string
- * \param dlen Byte length of comparison string
- * \return 0 if match, non-zero otherwise
- */
-int hubbub_inputstream_compare_range_ascii(hubbub_inputstream *stream,
- uint32_t off, size_t len, const char *data, size_t dlen)
-{
- if (stream == NULL || stream->buffer == NULL)
- return 1; /* arbitrary */
-
- return stream->cmp_range_ascii(stream, off, len, data, dlen);
-}
-
-/**
- * Replace a range of bytes in the input stream with a single character
- *
- * \param stream Input stream containing data
- * \param start Offset of start of range to replace
- * \param len Length (in bytes) of range to replace
- * \param ucs4 UCS4 (host endian) encoded replacement character
- * \return HUBBUB_OK on success, appropriate error otherwise
- */
-hubbub_error hubbub_inputstream_replace_range(hubbub_inputstream *stream,
- uint32_t start, size_t len, uint32_t ucs4)
-{
- if (stream == NULL || stream->buffer == NULL)
- return HUBBUB_BADPARM;
-
- if (start >= stream->buffer_len)
- return HUBBUB_INVALID;
-
- if (start < stream->cursor)
- return HUBBUB_INVALID;
-
- return stream->replace_range(stream, start, len, ucs4);
-}
-
-/**
- * Read the document charset
- *
- * \param stream Input stream to query
- * \param source Pointer to location to receive charset source
- * \return Pointer to charset name (constant; do not free), or NULL if unknown
- */
-const char *hubbub_inputstream_read_charset(hubbub_inputstream *stream,
- hubbub_charset_source *source)
-{
- if (stream == NULL || source == NULL)
- return NULL;
-
- *source = stream->encsrc;
-
- if (stream->encsrc == HUBBUB_CHARSET_UNKNOWN)
- return NULL;
-
- return hubbub_mibenum_to_name(stream->mibenum);
-}
-
-/**
- * Inform interested parties that the buffer has moved
- *
- * \param stream Input stream
- */
-void hubbub_inputstream_buffer_moved(hubbub_inputstream *stream)
-{
- hubbub_inputstream_bm_handler *h;
-
- if (stream == NULL)
- return;
-
- for (h = stream->handlers; h; h = h->next)
- h->handler(stream->buffer, stream->buffer_len, h->pw);
-}
-
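The buffer moved handler machinery above is how clients kept pointers into the document buffer valid across reallocations: byte offsets stay stable, only the base address changes. A hypothetical client sketch (the struct client type and watch() helper are invented for illustration):

#include <stddef.h>
#include <stdint.h>

#include "input/inputstream.h"

struct client {
	const uint8_t *base;	/* current document buffer base */
	size_t base_len;	/* current buffer length */
};

static void buffer_moved(const uint8_t *buffer, size_t len, void *pw)
{
	struct client *c = pw;

	/* Offsets the client holds remain valid; only the base pointer moves */
	c->base = buffer;
	c->base_len = len;
}

static hubbub_error watch(hubbub_inputstream *stream, struct client *c)
{
	/* The handler is invoked immediately with the current buffer, then
	 * again whenever append/insert/replace_range reallocates it. */
	return hubbub_inputstream_register_movehandler(stream,
			buffer_moved, c);
}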
diff --git a/src/input/inputstream.h b/src/input/inputstream.h
deleted file mode 100644
index 5325d14..0000000
--- a/src/input/inputstream.h
+++ /dev/null
@@ -1,98 +0,0 @@
-/*
- * This file is part of Hubbub.
- * Licensed under the MIT License,
- * http://www.opensource.org/licenses/mit-license.php
- * Copyright 2007 John-Mark Bell <jmb@netsurf-browser.org>
- */
-
-#ifndef hubbub_input_inputstream_h_
-#define hubbub_input_inputstream_h_
-
-#include <inttypes.h>
-
-#include <hubbub/errors.h>
-#include <hubbub/functypes.h>
-#include <hubbub/types.h>
-
-typedef struct hubbub_inputstream hubbub_inputstream;
-
-/* EOF pseudo-character */
-#define HUBBUB_INPUTSTREAM_EOF (0xFFFFFFFFU)
-/* Out-of-data indicator */
-#define HUBBUB_INPUTSTREAM_OOD (0xFFFFFFFEU)
-
-/* Type of input stream buffer moved handler function */
-typedef void (*hubbub_inputstream_buffermoved)(const uint8_t *buffer,
- size_t len, void *pw);
-
-/* Create an input stream */
-hubbub_inputstream *hubbub_inputstream_create(const char *enc,
- const char *int_enc, hubbub_alloc alloc, void *pw);
-/* Destroy an input stream */
-void hubbub_inputstream_destroy(hubbub_inputstream *stream);
-
-/* Append data to an input stream */
-hubbub_error hubbub_inputstream_append(hubbub_inputstream *stream,
- const uint8_t *data, size_t len);
-/* Insert data into stream at current location */
-hubbub_error hubbub_inputstream_insert(hubbub_inputstream *stream,
- const uint8_t *data, size_t len);
-
-/* Look at the next character in the stream */
-uint32_t hubbub_inputstream_peek(hubbub_inputstream *stream);
-
-/* Retrieve the byte index and length of the current character in the stream */
-uint32_t hubbub_inputstream_cur_pos(hubbub_inputstream *stream, size_t *len);
-
-/* Convert the current character to lowercase */
-void hubbub_inputstream_lowercase(hubbub_inputstream *stream);
-
-/* Convert the current character to uppercase */
-void hubbub_inputstream_uppercase(hubbub_inputstream *stream);
-
-/* Advance the stream's current position */
-void hubbub_inputstream_advance(hubbub_inputstream *stream);
-
-/* Push a character back onto the stream */
-hubbub_error hubbub_inputstream_push_back(hubbub_inputstream *stream,
- uint32_t character);
-
-/* Rewind the input stream by a number of bytes */
-hubbub_error hubbub_inputstream_rewind(hubbub_inputstream *stream, size_t n);
-
-/* Claim ownership of an input stream's buffer */
-hubbub_error hubbub_inputstream_claim_buffer(hubbub_inputstream *stream,
- uint8_t **buffer, size_t *len);
-
-/* Register interest in buffer moved events */
-hubbub_error hubbub_inputstream_register_movehandler(
- hubbub_inputstream *stream,
- hubbub_inputstream_buffermoved handler, void *pw);
-
-/* Deregister interest in buffer moved events */
-hubbub_error hubbub_inputstream_deregister_movehandler(
- hubbub_inputstream *stream,
- hubbub_inputstream_buffermoved handler, void *pw);
-
-/* Case insensitively compare a pair of ranges in the input stream */
-int hubbub_inputstream_compare_range_ci(hubbub_inputstream *stream,
- uint32_t r1, uint32_t r2, size_t len);
-
-/* Case sensitively compare a pair of ranges in the input stream */
-int hubbub_inputstream_compare_range_cs(hubbub_inputstream *stream,
- uint32_t r1, uint32_t r2, size_t len);
-
-/* Case sensitively compare a range of input stream against an ASCII string */
-int hubbub_inputstream_compare_range_ascii(hubbub_inputstream *stream,
- uint32_t off, size_t len, const char *data, size_t dlen);
-
-/* Replace a range of bytes in the input stream with a single character */
-hubbub_error hubbub_inputstream_replace_range(hubbub_inputstream *stream,
- uint32_t start, size_t len, uint32_t ucs4);
-
-/* Read the document charset */
-const char *hubbub_inputstream_read_charset(hubbub_inputstream *stream,
- hubbub_charset_source *source);
-
-#endif
-
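The peek/advance/append trio above is the heart of this API: peek reports OOD when the buffered data runs out mid-parse and EOF once the end of input has been signalled with a NULL append. A minimal sketch of the read loop a caller would run, assuming the header above:

#include <stdio.h>

#include "input/inputstream.h"

static void drain(hubbub_inputstream *stream)
{
	uint32_t c;

	while ((c = hubbub_inputstream_peek(stream)) !=
			HUBBUB_INPUTSTREAM_EOF) {
		if (c == HUBBUB_INPUTSTREAM_OOD)
			break;		/* wait for more data via append() */

		printf("U+%04X\n", (unsigned) c);

		hubbub_inputstream_advance(stream);
	}
}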
diff --git a/src/input/streamimpl.h b/src/input/streamimpl.h
deleted file mode 100644
index f44f6da..0000000
--- a/src/input/streamimpl.h
+++ /dev/null
@@ -1,77 +0,0 @@
-/*
- * This file is part of Hubbub.
- * Licensed under the MIT License,
- * http://www.opensource.org/licenses/mit-license.php
- * Copyright 2007 John-Mark Bell <jmb@netsurf-browser.org>
- */
-
-#ifndef hubbub_input_streamimpl_h_
-#define hubbub_input_streamimpl_h_
-
-#include <stdbool.h>
-
-#include <hubbub/types.h>
-
-#include "input/filter.h"
-#include "input/inputstream.h"
-
-typedef struct hubbub_inputstream_bm_handler hubbub_inputstream_bm_handler;
-
-/**
- * Input stream definition: implementations extend this
- */
-struct hubbub_inputstream {
- uint8_t *buffer; /**< Document buffer */
- size_t buffer_len; /**< Amount of data in buffer */
- size_t buffer_alloc; /**< Allocated size of buffer */
-
- uint32_t cursor; /**< Byte offset of current position */
-
- bool had_eof; /**< Whether EOF has been reached */
-
- uint16_t mibenum; /**< MIB enum for charset, or 0 */
- hubbub_charset_source encsrc; /**< Charset source */
-
- hubbub_filter *input; /**< Charset conversion filter */
-
- hubbub_inputstream_bm_handler *handlers; /**< List of buffer
- * moved handlers */
- hubbub_alloc alloc; /**< Memory (de)allocation function */
- void *pw; /**< Client private data */
-
- void (*destroy)(hubbub_inputstream *stream);
- hubbub_error (*append)(hubbub_inputstream *stream,
- const uint8_t *data, size_t len);
- hubbub_error (*insert)(hubbub_inputstream *stream,
- const uint8_t *data, size_t len);
- uint32_t (*peek)(hubbub_inputstream *stream);
- uint32_t (*cur_pos)(hubbub_inputstream *stream, size_t *len);
- void (*lowercase)(hubbub_inputstream *stream);
- void (*uppercase)(hubbub_inputstream *stream);
- void (*advance)(hubbub_inputstream *stream);
- hubbub_error (*push_back)(hubbub_inputstream *stream,
- uint32_t character);
- int (*cmp_range_ci)(hubbub_inputstream *stream, uint32_t r1,
- uint32_t r2, size_t len);
- int (*cmp_range_cs)(hubbub_inputstream *stream, uint32_t r1,
- uint32_t r2, size_t len);
- int (*cmp_range_ascii)(hubbub_inputstream *stream,
- uint32_t off, size_t len,
- const char *data, size_t dlen);
- hubbub_error (*replace_range)(hubbub_inputstream *stream,
- uint32_t start, size_t len, uint32_t ucs4);
-};
-
-/**
- * Input stream factory component definition
- */
-typedef struct hubbub_streamhandler {
- bool (*uses_encoding)(const char *int_enc);
- hubbub_inputstream *(*create)(const char *enc, const char *int_enc,
- hubbub_alloc alloc, void *pw);
-} hubbub_streamhandler;
-
-/* Notification of stream buffer moving */
-void hubbub_inputstream_buffer_moved(hubbub_inputstream *stream);
-
-#endif
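New backends plug into the factory in inputstream.c by exposing a hubbub_streamhandler and being added to handler_table[]. A hypothetical UTF-32 backend skeleton is sketched below (the my_* names and utf32stream are invented, and the create function body is omitted):

#include <stdbool.h>
#include <string.h>

#include "charset/aliases.h"
#include "input/streamimpl.h"

static bool my_uses_encoding(const char *int_enc)
{
	/* Claim the encoding if its MIB enum matches the one we store */
	return hubbub_mibenum_from_name(int_enc, strlen(int_enc)) ==
			hubbub_mibenum_from_name("UTF-32", strlen("UTF-32"));
}

static hubbub_inputstream *my_create(const char *enc, const char *int_enc,
		hubbub_alloc alloc, void *pw)
{
	/* Allocate a hubbub_inputstream (or a struct extending it), fill in
	 * the function pointers, and return it; omitted in this sketch. */
	(void) enc; (void) int_enc; (void) alloc; (void) pw;
	return NULL;
}

hubbub_streamhandler utf32stream = {
	my_uses_encoding,
	my_create
};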
diff --git a/src/input/utf16_stream.c b/src/input/utf16_stream.c
deleted file mode 100644
index e69f124..0000000
--- a/src/input/utf16_stream.c
+++ /dev/null
@@ -1,605 +0,0 @@
-/*
- * This file is part of Hubbub.
- * Licensed under the MIT License,
- * http://www.opensource.org/licenses/mit-license.php
- * Copyright 2007 John-Mark Bell <jmb@netsurf-browser.org>
- */
-
-#include <stdbool.h>
-#include <string.h>
-
-#include "charset/aliases.h"
-#include "charset/detect.h"
-#include "input/streamimpl.h"
-#include "utils/utf16.h"
-#include "utils/utils.h"
-
-#define BUFFER_CHUNK (4096)
-
-static bool hubbub_utf16stream_uses_encoding(const char *int_enc);
-static hubbub_inputstream *hubbub_utf16stream_create(const char *enc,
- const char *int_enc, hubbub_alloc alloc, void *pw);
-static void hubbub_utf16stream_destroy(hubbub_inputstream *stream);
-static hubbub_error hubbub_utf16stream_append(hubbub_inputstream *stream,
- const uint8_t *data, size_t len);
-static hubbub_error hubbub_utf16stream_insert(hubbub_inputstream *stream,
- const uint8_t *data, size_t len);
-static uint32_t hubbub_utf16stream_peek(hubbub_inputstream *stream);
-static uint32_t hubbub_utf16stream_cur_pos(hubbub_inputstream *stream,
- size_t *len);
-static void hubbub_utf16stream_lowercase(hubbub_inputstream *stream);
-static void hubbub_utf16stream_uppercase(hubbub_inputstream *stream);
-static void hubbub_utf16stream_advance(hubbub_inputstream *stream);
-static hubbub_error hubbub_utf16stream_push_back(hubbub_inputstream *stream,
- uint32_t character);
-static int hubbub_utf16stream_compare_range_ci(hubbub_inputstream *stream,
- uint32_t r1, uint32_t r2, size_t len);
-static int hubbub_utf16stream_compare_range_cs(hubbub_inputstream *stream,
- uint32_t r1, uint32_t r2, size_t len);
-static int hubbub_utf16stream_compare_range_ascii(hubbub_inputstream *stream,
- uint32_t off, size_t len, const char *data, size_t dlen);
-static hubbub_error hubbub_utf16stream_replace_range(
- hubbub_inputstream *stream,
- uint32_t start, size_t len, uint32_t ucs4);
-
-/**
- * Determine whether a stream implementation uses an internal encoding
- *
- * \param int_enc The desired encoding
- * \return true if handled, false otherwise
- */
-bool hubbub_utf16stream_uses_encoding(const char *int_enc)
-{
- return (hubbub_mibenum_from_name(int_enc, strlen(int_enc)) ==
- hubbub_mibenum_from_name("UTF-16", SLEN("UTF-16")));
-}
-
-/**
- * Create an input stream
- *
- * \param enc Document charset, or NULL if unknown
- * \param int_enc Desired encoding of document
- * \param alloc Memory (de)allocation function
- * \param pw Pointer to client-specific private data (may be NULL)
- * \return Pointer to stream instance, or NULL on failure
- */
-hubbub_inputstream *hubbub_utf16stream_create(const char *enc,
- const char *int_enc, hubbub_alloc alloc, void *pw)
-{
- hubbub_inputstream *stream;
-
- if (hubbub_mibenum_from_name(int_enc, strlen(int_enc)) !=
- hubbub_mibenum_from_name("UTF-16", SLEN("UTF-16")))
- return NULL;
-
- stream = alloc(NULL, sizeof(hubbub_inputstream), pw);
- if (stream == NULL)
- return NULL;
-
- stream->buffer = alloc(NULL, BUFFER_CHUNK, pw);
- if (stream->buffer == NULL) {
- alloc(stream, 0, pw);
- return NULL;
- }
-
- stream->buffer_len = 0;
- stream->buffer_alloc = BUFFER_CHUNK;
-
- stream->cursor = 0;
-
- stream->had_eof = false;
-
- stream->input = hubbub_filter_create(int_enc, alloc, pw);
- if (stream->input == NULL) {
- alloc(stream->buffer, 0, pw);
- alloc(stream, 0, pw);
- return NULL;
- }
-
- if (enc != NULL) {
- hubbub_error error;
- hubbub_filter_optparams params;
-
- stream->mibenum = hubbub_mibenum_from_name(enc, strlen(enc));
-
- if (stream->mibenum != 0) {
- params.encoding.name = enc;
-
- error = hubbub_filter_setopt(stream->input,
- HUBBUB_FILTER_SET_ENCODING, &params);
- if (error != HUBBUB_OK && error != HUBBUB_INVALID) {
- hubbub_filter_destroy(stream->input);
- alloc(stream->buffer, 0, pw);
- alloc(stream, 0, pw);
- return NULL;
- }
-
- stream->encsrc = HUBBUB_CHARSET_DICTATED;
- }
- } else {
- stream->mibenum = 0;
- stream->encsrc = HUBBUB_CHARSET_UNKNOWN;
- }
-
- stream->destroy = hubbub_utf16stream_destroy;
- stream->append = hubbub_utf16stream_append;
- stream->insert = hubbub_utf16stream_insert;
- stream->peek = hubbub_utf16stream_peek;
- stream->cur_pos = hubbub_utf16stream_cur_pos;
- stream->lowercase = hubbub_utf16stream_lowercase;
- stream->uppercase = hubbub_utf16stream_uppercase;
- stream->advance = hubbub_utf16stream_advance;
- stream->push_back = hubbub_utf16stream_push_back;
- stream->cmp_range_ci = hubbub_utf16stream_compare_range_ci;
- stream->cmp_range_cs = hubbub_utf16stream_compare_range_cs;
- stream->cmp_range_ascii = hubbub_utf16stream_compare_range_ascii;
- stream->replace_range = hubbub_utf16stream_replace_range;
-
- return stream;
-}
-
-/**
- * Destroy an input stream
- *
- * \param stream Input stream to destroy
- */
-void hubbub_utf16stream_destroy(hubbub_inputstream *stream)
-{
- if (stream->input != NULL) {
- hubbub_filter_destroy(stream->input);
- }
-
- if (stream->buffer != NULL) {
- stream->alloc(stream->buffer, 0, stream->pw);
- }
-
- stream->alloc(stream, 0, stream->pw);
-}
-
-/**
- * Append data to an input stream
- *
- * \param stream Input stream to append data to
- * \param data Data to append (in document charset), or NULL to flag EOF
- * \param len Length, in bytes, of data
- * \return HUBBUB_OK on success, appropriate error otherwise
- */
-hubbub_error hubbub_utf16stream_append(hubbub_inputstream *stream,
- const uint8_t *data, size_t len)
-{
- hubbub_error error;
- uint8_t *base;
- size_t space;
-
- if (data == NULL) {
- /* EOF indicated */
- size_t dummy_len = 0;
- uint8_t *dummy_data = (uint8_t *) &dummy_len;
-
- base = stream->buffer + stream->buffer_len;
- space = stream->buffer_alloc - stream->buffer_len;
-
- /* Forcibly flush through any remaining buffered data */
- while ((error = hubbub_filter_process_chunk(stream->input,
- (const uint8_t **) &dummy_data, &dummy_len,
- &base, &space)) == HUBBUB_NOMEM) {
- bool moved = false;
- uint8_t *temp = stream->alloc(stream->buffer,
- stream->buffer_alloc + BUFFER_CHUNK,
- stream->pw);
-
- if (temp == NULL) {
- return HUBBUB_NOMEM;
- }
-
- moved = (temp != stream->buffer);
-
- stream->buffer = temp;
- stream->buffer_len = stream->buffer_alloc - space;
- stream->buffer_alloc += BUFFER_CHUNK;
-
- base = stream->buffer + stream->buffer_len;
- space += BUFFER_CHUNK;
-
- if (moved)
- hubbub_inputstream_buffer_moved(stream);
- }
-
- /* And fix up buffer length */
- stream->buffer_len = stream->buffer_alloc - space;
-
- stream->had_eof = true;
- } else {
- /* Normal data chunk */
-
- if (stream->mibenum == 0) {
- /* Haven't found charset yet; detect it */
- error = hubbub_charset_extract(&data, &len,
- &stream->mibenum, &stream->encsrc);
- if (error) {
- return error;
- }
-
- /* We should always have a charset by now */
- if (stream->mibenum == 0)
- abort();
- }
-
- base = stream->buffer + stream->buffer_len;
- space = stream->buffer_alloc - stream->buffer_len;
-
- /* Convert chunk to UTF-16 */
- while ((error = hubbub_filter_process_chunk(stream->input,
- &data, &len,
- &base, &space)) == HUBBUB_NOMEM) {
- bool moved = false;
- uint8_t *temp = stream->alloc(stream->buffer,
- stream->buffer_alloc + BUFFER_CHUNK,
- stream->pw);
-
- if (temp == NULL) {
- return HUBBUB_NOMEM;
- }
-
- moved = (temp != stream->buffer);
-
- stream->buffer = temp;
- stream->buffer_len = stream->buffer_alloc - space;
- stream->buffer_alloc += BUFFER_CHUNK;
-
- base = stream->buffer + stream->buffer_len;
- space += BUFFER_CHUNK;
-
- if (moved)
- hubbub_inputstream_buffer_moved(stream);
- }
-
- /* And fix up buffer length */
- stream->buffer_len = stream->buffer_alloc - space;
- }
-
- return HUBBUB_OK;
-}
-
-/**
- * Insert data into stream at current location
- *
- * \param stream Input stream to insert into
- * \param data Data to insert (UTF-16 encoded)
- * \param len Length, in bytes, of data
- * \return HUBBUB_OK on success, appropriate error otherwise
- */
-hubbub_error hubbub_utf16stream_insert(hubbub_inputstream *stream,
- const uint8_t *data, size_t len)
-{
- size_t space;
- uint8_t *curpos;
-
- space = stream->buffer_alloc - stream->buffer_len;
-
- /* Need to grow buffer, if there's insufficient space */
- if (space <= len) {
- bool moved = false;
- uint8_t *temp = stream->alloc(stream->buffer,
- stream->buffer_alloc +
- ((len + BUFFER_CHUNK - 1) & ~BUFFER_CHUNK) +
- BUFFER_CHUNK,
- stream->pw);
-
- if (temp == NULL)
- return HUBBUB_NOMEM;
-
- moved = (temp != stream->buffer);
-
- stream->buffer = temp;
- stream->buffer_alloc +=
- ((len + BUFFER_CHUNK - 1) & ~BUFFER_CHUNK);
-
- if (moved)
- hubbub_inputstream_buffer_moved(stream);
- }
-
- /* Find the insertion point
- * (just before the next character to be read) */
- curpos = stream->buffer + stream->cursor;
-
- /* Move data above this point up */
- memmove(curpos + len, curpos, stream->buffer_len - stream->cursor);
-
- /* Copy new data into gap created by memmove */
- memcpy(curpos, data, len);
-
- /* Fix up buffer length */
- stream->buffer_len += len;
-
- return HUBBUB_OK;
-}
-
-/**
- * Look at the next character in the stream
- *
- * \param stream Stream to look in
- * \return UCS4 (host-endian) character code, or EOF or OOD.
- */
-uint32_t hubbub_utf16stream_peek(hubbub_inputstream *stream)
-{
- hubbub_error error;
- size_t len;
- uint32_t ret;
-
- if (stream->cursor == stream->buffer_len) {
- return stream->had_eof ? HUBBUB_INPUTSTREAM_EOF
- : HUBBUB_INPUTSTREAM_OOD;
- }
-
- error = hubbub_utf16_to_ucs4(stream->buffer + stream->cursor,
- stream->buffer_len - stream->cursor,
- &ret, &len);
- if (error != HUBBUB_OK && error != HUBBUB_NEEDDATA)
- return HUBBUB_INPUTSTREAM_OOD;
-
- if (error == HUBBUB_NEEDDATA) {
- if (stream->had_eof)
- return HUBBUB_INPUTSTREAM_EOF;
- else
- return HUBBUB_INPUTSTREAM_OOD;
- }
-
- return ret;
-}
-
-/**
- * Retrieve the byte index and length of the current character in the stream
- *
- * \param stream Stream to look in
- * \param len Pointer to location to receive byte length of character
- * \return Byte index of current character from start of stream,
- * or (uint32_t) -1 on error
- */
-uint32_t hubbub_utf16stream_cur_pos(hubbub_inputstream *stream,
- size_t *len)
-{
- hubbub_utf16_char_byte_length(stream->buffer + stream->cursor, len);
-
- return stream->cursor;
-}
-
-/**
- * Convert the current character to lower case
- *
- * \param stream Stream to look in
- */
-void hubbub_utf16stream_lowercase(hubbub_inputstream *stream)
-{
- uint16_t *buf = (uint16_t *)
- ((void *) (stream->buffer + stream->cursor));
-
-	if (0x0041 <= buf[0] && buf[0] <= 0x005A)
- buf[0] += 0x0020;
-}
-
-/**
- * Convert the current character to upper case
- *
- * \param stream Stream to look in
- */
-void hubbub_utf16stream_uppercase(hubbub_inputstream *stream)
-{
- uint16_t *buf = (uint16_t *)
- ((void *) (stream->buffer + stream->cursor));
-
-	if (0x0061 <= buf[0] && buf[0] <= 0x007A)
- buf[0] -= 0x0020;
-}
-
-/**
- * Advance the stream's current position
- *
- * \param stream The stream whose position to advance
- */
-void hubbub_utf16stream_advance(hubbub_inputstream *stream)
-{
- hubbub_error error;
- uint32_t next;
-
- error = hubbub_utf16_next(stream->buffer, stream->buffer_len,
- stream->cursor, &next);
-
- if (error == HUBBUB_OK)
- stream->cursor = next;
-}
-
-/**
- * Push a character back onto the stream
- *
- * \param stream Stream to push back to
- * \param character UCS4 (host-endian) codepoint to push back
- * \return HUBBUB_OK on success, appropriate error otherwise
- *
- * Note that this doesn't actually modify the data in the stream.
- * It works by ensuring that the character located just before the
- * current stream location is the same as ::character. If it is,
- * then the stream pointer is moved back. If it is not, then an
- * error is returned and the stream pointer remains unmodified.
- */
-hubbub_error hubbub_utf16stream_push_back(hubbub_inputstream *stream,
- uint32_t character)
-{
- hubbub_error error;
- uint32_t prev;
- uint8_t buf[4];
- size_t len;
-
- error = hubbub_utf16_prev(stream->buffer, stream->cursor, &prev);
- if (error != HUBBUB_OK)
- return error;
-
- error = hubbub_utf16_from_ucs4(character, buf, &len);
- if (error != HUBBUB_OK)
- return error;
-
- if ((stream->cursor - prev) != len ||
- memcmp(stream->buffer + prev, buf, len) != 0)
- return HUBBUB_INVALID;
-
- stream->cursor = prev;
-
- return HUBBUB_OK;
-}
-
-/**
- * Case insensitively compare a pair of ranges in the input stream
- *
- * \param stream Input stream to look in
- * \param r1 Offset of start of first range
- * \param r2 Offset of start of second range
- * \param len Byte length of ranges
- * \return 0 if ranges match, non-zero otherwise
- */
-int hubbub_utf16stream_compare_range_ci(hubbub_inputstream *stream,
- uint32_t r1, uint32_t r2, size_t len)
-{
- uint8_t *range1 = (stream->buffer + r1);
- uint8_t *range2 = (stream->buffer + r2);
- int c1, c2;
- uint32_t r1next, r2next;
- hubbub_error error;
-
- if (len == 0)
- return 0;
-
- do {
- c1 = *((uint16_t *) (void *) range1);
- c2 = *((uint16_t *) (void *) range2);
-
-		if ((0x0041 <= c1 && c1 <= 0x005A))
- c1 |= 0x0020;
-
-		if ((0x0041 <= c2 && c2 <= 0x005A))
- c2 |= 0x0020;
-
- error = hubbub_utf16_next(range1, len, 0, &r1next);
- error = hubbub_utf16_next(range2, len, 0, &r2next);
-
- range1 += r1next;
- range2 += r2next;
-
- len -= r1next;
- } while(c1 != 0 && (c1 == c2) && len > 0);
-
- return (c1 - c2);
-}
-
-/**
- * Case sensitively compare a pair of ranges in the input stream
- *
- * \param stream Input stream to look in
- * \param r1 Offset of start of first range
- * \param r2 Offset of start of second range
- * \param len Byte length of ranges
- * \return 0 if ranges match, non-zero otherwise
- */
-int hubbub_utf16stream_compare_range_cs(hubbub_inputstream *stream,
- uint32_t r1, uint32_t r2, size_t len)
-{
- return memcmp((const char *) (stream->buffer + r1),
- (const char *) (stream->buffer + r2), len);
-}
-
-/**
- * Case sensitively compare a range of input stream against an ASCII string
- *
- * \param stream Input stream to look in
- * \param off Offset of range start
- * \param len Byte length of range
- * \param data Comparison string
- * \param dlen Byte length of comparison string
- * \return 0 if match, non-zero otherwise
- */
-int hubbub_utf16stream_compare_range_ascii(hubbub_inputstream *stream,
- uint32_t off, size_t len, const char *data, size_t dlen)
-{
- uint8_t *range = (stream->buffer + off);
- int c1, c2;
-
- /* Lengths don't match, so strings don't */
- if (len != dlen * 2)
- return 1; /* arbitrary */
-
- do {
- c1 = *((uint16_t *) (void *) range);
- c2 = *data;
-
- range += 2;
- data++;
-
- len -= 2;
- } while (c1 != 0 && (c1 == c2) && len > 0);
-
- return (c1 - c2);
-}
-
-/**
- * Replace a range of bytes in the input stream with a single character
- *
- * \param stream Input stream containing data
- * \param start Offset of start of range to replace
- * \param len Length (in bytes) of range to replace
- * \param ucs4 UCS4 (host endian) encoded replacement character
- * \return HUBBUB_OK on success, appropriate error otherwise
- */
-hubbub_error hubbub_utf16stream_replace_range(hubbub_inputstream *stream,
- uint32_t start, size_t len, uint32_t ucs4)
-{
- uint8_t buf[4];
- size_t replen;
- int32_t diff;
- hubbub_error error;
-
- /* Get utf16 version of replacement character */
- error = hubbub_utf16_from_ucs4(ucs4, buf, &replen);
- if (error)
- return error;
-
- diff = replen - len;
-
- if (stream->buffer_len + diff >= stream->buffer_alloc) {
- /* Need more buffer space */
- bool moved = false;
- uint8_t *temp = stream->alloc(stream->buffer,
- stream->buffer_alloc +
- ((diff + BUFFER_CHUNK - 1) & ~BUFFER_CHUNK) +
- BUFFER_CHUNK,
- stream->pw);
-
- if (temp == NULL)
- return HUBBUB_NOMEM;
-
- moved = (temp != stream->buffer);
-
- stream->buffer = temp;
- stream->buffer_alloc +=
- ((diff + BUFFER_CHUNK - 1) & ~BUFFER_CHUNK);
-
- if (moved)
- hubbub_inputstream_buffer_moved(stream);
- }
-
- /* Move subsequent input to correct location */
- memmove(stream->buffer + start + len + diff,
- stream->buffer + start + len,
- stream->buffer_len - (start + len));
-
- /* And fill the gap with the replacement character */
- memcpy(stream->buffer + start, buf, replen);
-
- /* Finally, update length */
- stream->buffer_len += diff;
-
- return HUBBUB_OK;
-}
-
-hubbub_streamhandler utf16stream = {
- hubbub_utf16stream_uses_encoding,
- hubbub_utf16stream_create
-};
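The buffer-growth expressions above intend to round the extra space requirement up to a whole number of BUFFER_CHUNK-sized chunks before adding one chunk of headroom. The conventional round-up for a power-of-two chunk size masks with ~(BUFFER_CHUNK - 1) rather than ~BUFFER_CHUNK; a self-contained sketch of that idiom, with a few worked values:

#include <assert.h>
#include <stddef.h>

#define BUFFER_CHUNK 4096

/* Round len up to the next multiple of BUFFER_CHUNK (power of two assumed) */
static size_t round_up_to_chunk(size_t len)
{
	return (len + BUFFER_CHUNK - 1) & ~((size_t) BUFFER_CHUNK - 1);
}

int main(void)
{
	assert(round_up_to_chunk(0) == 0);
	assert(round_up_to_chunk(1) == BUFFER_CHUNK);
	assert(round_up_to_chunk(4096) == BUFFER_CHUNK);
	assert(round_up_to_chunk(4097) == 2 * BUFFER_CHUNK);
	return 0;
}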
diff --git a/src/input/utf8_stream.c b/src/input/utf8_stream.c
deleted file mode 100644
index 3de142b..0000000
--- a/src/input/utf8_stream.c
+++ /dev/null
@@ -1,562 +0,0 @@
-/*
- * This file is part of Hubbub.
- * Licensed under the MIT License,
- * http://www.opensource.org/licenses/mit-license.php
- * Copyright 2007 John-Mark Bell <jmb@netsurf-browser.org>
- */
-
-#include <stdbool.h>
-#include <string.h>
-
-#include "charset/aliases.h"
-#include "charset/detect.h"
-#include "input/streamimpl.h"
-#include "utils/utf8.h"
-#include "utils/utils.h"
-
-#define BUFFER_CHUNK (4096)
-
-static bool hubbub_utf8stream_uses_encoding(const char *int_enc);
-static hubbub_inputstream *hubbub_utf8stream_create(const char *enc,
- const char *int_enc, hubbub_alloc alloc, void *pw);
-static void hubbub_utf8stream_destroy(hubbub_inputstream *stream);
-static hubbub_error hubbub_utf8stream_append(hubbub_inputstream *stream,
- const uint8_t *data, size_t len);
-static hubbub_error hubbub_utf8stream_insert(hubbub_inputstream *stream,
- const uint8_t *data, size_t len);
-static uint32_t hubbub_utf8stream_peek(hubbub_inputstream *stream);
-static uint32_t hubbub_utf8stream_cur_pos(hubbub_inputstream *stream,
- size_t *len);
-static void hubbub_utf8stream_lowercase(hubbub_inputstream *stream);
-static void hubbub_utf8stream_uppercase(hubbub_inputstream *stream);
-static void hubbub_utf8stream_advance(hubbub_inputstream *stream);
-static hubbub_error hubbub_utf8stream_push_back(hubbub_inputstream *stream,
- uint32_t character);
-static int hubbub_utf8stream_compare_range_ci(hubbub_inputstream *stream,
- uint32_t r1, uint32_t r2, size_t len);
-static int hubbub_utf8stream_compare_range_cs(hubbub_inputstream *stream,
- uint32_t r1, uint32_t r2, size_t len);
-static int hubbub_utf8stream_compare_range_ascii(hubbub_inputstream *stream,
- uint32_t off, size_t len, const char *data, size_t dlen);
-static hubbub_error hubbub_utf8stream_replace_range(
- hubbub_inputstream *stream,
- uint32_t start, size_t len, uint32_t ucs4);
-
-/**
- * Determine whether a stream implementation uses an internal encoding
- *
- * \param int_enc The desired encoding
- * \return true if handled, false otherwise
- */
-bool hubbub_utf8stream_uses_encoding(const char *int_enc)
-{
- return (hubbub_mibenum_from_name(int_enc, strlen(int_enc)) ==
- hubbub_mibenum_from_name("UTF-8", SLEN("UTF-8")));
-}
-
-/**
- * Create an input stream
- *
- * \param enc Document charset, or NULL if unknown
- * \param int_enc Desired encoding of document
- * \param alloc Memory (de)allocation function
- * \param pw Pointer to client-specific private data (may be NULL)
- * \return Pointer to stream instance, or NULL on failure
- */
-hubbub_inputstream *hubbub_utf8stream_create(const char *enc,
- const char *int_enc, hubbub_alloc alloc, void *pw)
-{
- hubbub_inputstream *stream;
-
- if (hubbub_mibenum_from_name(int_enc, strlen(int_enc)) !=
- hubbub_mibenum_from_name("UTF-8", SLEN("UTF-8")))
- return NULL;
-
- stream = alloc(NULL, sizeof(hubbub_inputstream), pw);
- if (stream == NULL)
- return NULL;
-
- stream->buffer = alloc(NULL, BUFFER_CHUNK, pw);
- if (stream->buffer == NULL) {
- alloc(stream, 0, pw);
- return NULL;
- }
-
- stream->buffer_len = 0;
- stream->buffer_alloc = BUFFER_CHUNK;
-
- stream->cursor = 0;
-
- stream->had_eof = false;
-
- stream->input = hubbub_filter_create(int_enc, alloc, pw);
- if (stream->input == NULL) {
- alloc(stream->buffer, 0, pw);
- alloc(stream, 0, pw);
- return NULL;
- }
-
- if (enc != NULL) {
- hubbub_error error;
- hubbub_filter_optparams params;
-
- stream->mibenum = hubbub_mibenum_from_name(enc, strlen(enc));
-
- if (stream->mibenum != 0) {
- params.encoding.name = enc;
-
- error = hubbub_filter_setopt(stream->input,
- HUBBUB_FILTER_SET_ENCODING, &params);
- if (error != HUBBUB_OK && error != HUBBUB_INVALID) {
- hubbub_filter_destroy(stream->input);
- alloc(stream->buffer, 0, pw);
- alloc(stream, 0, pw);
- return NULL;
- }
-
- stream->encsrc = HUBBUB_CHARSET_DICTATED;
- }
- } else {
- stream->mibenum = 0;
- stream->encsrc = HUBBUB_CHARSET_UNKNOWN;
- }
-
- stream->destroy = hubbub_utf8stream_destroy;
- stream->append = hubbub_utf8stream_append;
- stream->insert = hubbub_utf8stream_insert;
- stream->peek = hubbub_utf8stream_peek;
- stream->cur_pos = hubbub_utf8stream_cur_pos;
- stream->lowercase = hubbub_utf8stream_lowercase;
- stream->uppercase = hubbub_utf8stream_uppercase;
- stream->advance = hubbub_utf8stream_advance;
- stream->push_back = hubbub_utf8stream_push_back;
- stream->cmp_range_ci = hubbub_utf8stream_compare_range_ci;
- stream->cmp_range_cs = hubbub_utf8stream_compare_range_cs;
- stream->cmp_range_ascii = hubbub_utf8stream_compare_range_ascii;
- stream->replace_range = hubbub_utf8stream_replace_range;
-
- return stream;
-}
-
-/**
- * Destroy an input stream
- *
- * \param stream Input stream to destroy
- */
-void hubbub_utf8stream_destroy(hubbub_inputstream *stream)
-{
- if (stream->input != NULL) {
- hubbub_filter_destroy(stream->input);
- }
-
- if (stream->buffer != NULL) {
- stream->alloc(stream->buffer, 0, stream->pw);
- }
-
- stream->alloc(stream, 0, stream->pw);
-}
-
-/**
- * Append data to an input stream
- *
- * \param stream Input stream to append data to
- * \param data Data to append (in document charset), or NULL to flag EOF
- * \param len Length, in bytes, of data
- * \return HUBBUB_OK on success, appropriate error otherwise
- */
-hubbub_error hubbub_utf8stream_append(hubbub_inputstream *stream,
- const uint8_t *data, size_t len)
-{
- hubbub_error error;
- uint8_t *base;
- size_t space;
-
- if (data == NULL) {
- /* EOF indicated */
- size_t dummy_len = 0;
- uint8_t *dummy_data = (uint8_t *) &dummy_len;
-
- base = stream->buffer + stream->buffer_len;
- space = stream->buffer_alloc - stream->buffer_len;
-
- /* Forcibly flush through any remaining buffered data */
- while ((error = hubbub_filter_process_chunk(stream->input,
- (const uint8_t **) &dummy_data, &dummy_len,
- &base, &space)) == HUBBUB_NOMEM) {
- bool moved = false;
- uint8_t *temp = stream->alloc(stream->buffer,
- stream->buffer_alloc + BUFFER_CHUNK,
- stream->pw);
-
- if (temp == NULL) {
- return HUBBUB_NOMEM;
- }
-
- moved = (temp != stream->buffer);
-
- stream->buffer = temp;
- stream->buffer_len = stream->buffer_alloc - space;
- stream->buffer_alloc += BUFFER_CHUNK;
-
- base = stream->buffer + stream->buffer_len;
- space += BUFFER_CHUNK;
-
- if (moved)
- hubbub_inputstream_buffer_moved(stream);
- }
-
- /* And fix up buffer length */
- stream->buffer_len = stream->buffer_alloc - space;
-
- stream->had_eof = true;
- } else {
- /* Normal data chunk */
-
- if (stream->mibenum == 0) {
- /* Haven't found charset yet; detect it */
- error = hubbub_charset_extract(&data, &len,
- &stream->mibenum, &stream->encsrc);
- if (error) {
- return error;
- }
-
- /* We should always have a charset by now */
- if (stream->mibenum == 0)
- abort();
- }
-
- base = stream->buffer + stream->buffer_len;
- space = stream->buffer_alloc - stream->buffer_len;
-
- /* Convert chunk to UTF-8 */
- while ((error = hubbub_filter_process_chunk(stream->input,
- &data, &len,
- &base, &space)) == HUBBUB_NOMEM) {
- bool moved = false;
- uint8_t *temp = stream->alloc(stream->buffer,
- stream->buffer_alloc + BUFFER_CHUNK,
- stream->pw);
-
- if (temp == NULL) {
- return HUBBUB_NOMEM;
- }
-
- moved = (temp != stream->buffer);
-
- stream->buffer = temp;
- stream->buffer_len = stream->buffer_alloc - space;
- stream->buffer_alloc += BUFFER_CHUNK;
-
- base = stream->buffer + stream->buffer_len;
- space += BUFFER_CHUNK;
-
- if (moved)
- hubbub_inputstream_buffer_moved(stream);
- }
-
- /* And fix up buffer length */
- stream->buffer_len = stream->buffer_alloc - space;
- }
-
- return HUBBUB_OK;
-}
-
-/**
- * Insert data into stream at current location
- *
- * \param stream Input stream to insert into
- * \param data Data to insert (UTF-8 encoded)
- * \param len Length, in bytes, of data
- * \return HUBBUB_OK on success, appropriate error otherwise
- */
-hubbub_error hubbub_utf8stream_insert(hubbub_inputstream *stream,
- const uint8_t *data, size_t len)
-{
- size_t space;
- uint8_t *curpos;
-
- space = stream->buffer_alloc - stream->buffer_len;
-
- /* Need to grow buffer, if there's insufficient space */
- if (space <= len) {
- bool moved = false;
- uint8_t *temp = stream->alloc(stream->buffer,
- stream->buffer_alloc +
- ((len + BUFFER_CHUNK - 1) & ~BUFFER_CHUNK) +
- BUFFER_CHUNK,
- stream->pw);
-
- if (temp == NULL)
- return HUBBUB_NOMEM;
-
- moved = (temp != stream->buffer);
-
- stream->buffer = temp;
- stream->buffer_alloc +=
- ((len + BUFFER_CHUNK - 1) & ~BUFFER_CHUNK);
-
- if (moved)
- hubbub_inputstream_buffer_moved(stream);
- }
-
- /* Find the insertion point
- * (just before the next character to be read) */
- curpos = stream->buffer + stream->cursor;
-
- /* Move data above this point up */
- memmove(curpos + len, curpos, stream->buffer_len - stream->cursor);
-
- /* Copy new data into gap created by memmove */
- memcpy(curpos, data, len);
-
- /* Fix up buffer length */
- stream->buffer_len += len;
-
- return HUBBUB_OK;
-}
-
-/**
- * Look at the next character in the stream
- *
- * \param stream Stream to look in
- * \return UCS4 (host-endian) character code, or EOF or OOD.
- */
-uint32_t hubbub_utf8stream_peek(hubbub_inputstream *stream)
-{
- hubbub_error error;
- size_t len;
- uint32_t ret;
-
- if (stream->cursor == stream->buffer_len) {
- return stream->had_eof ? HUBBUB_INPUTSTREAM_EOF
- : HUBBUB_INPUTSTREAM_OOD;
- }
-
- error = hubbub_utf8_to_ucs4(stream->buffer + stream->cursor,
- stream->buffer_len - stream->cursor,
- &ret, &len);
- if (error != HUBBUB_OK && error != HUBBUB_NEEDDATA)
- return HUBBUB_INPUTSTREAM_OOD;
-
- if (error == HUBBUB_NEEDDATA) {
- if (stream->had_eof)
- return HUBBUB_INPUTSTREAM_EOF;
- else
- return HUBBUB_INPUTSTREAM_OOD;
- }
-
- return ret;
-}
-
-/**
- * Retrieve the byte index and length of the current character in the stream
- *
- * \param stream Stream to look in
- * \param len Pointer to location to receive byte length of character
- * \return Byte index of current character from start of stream,
- * or (uint32_t) -1 on error
- */
-uint32_t hubbub_utf8stream_cur_pos(hubbub_inputstream *stream,
- size_t *len)
-{
- hubbub_utf8_char_byte_length(stream->buffer + stream->cursor, len);
-
- return stream->cursor;
-}
-
-/**
- * Convert the current character to lower case
- *
- * \param stream Stream to look in
- */
-void hubbub_utf8stream_lowercase(hubbub_inputstream *stream)
-{
- if ('A' <= stream->buffer[stream->cursor] &&
- stream->buffer[stream->cursor] <= 'Z')
- stream->buffer[stream->cursor] += 0x0020;
-}
-
-/**
- * Convert the current character to upper case
- *
- * \param stream Stream to look in
- */
-void hubbub_utf8stream_uppercase(hubbub_inputstream *stream)
-{
- if ('a' <= stream->buffer[stream->cursor] &&
- stream->buffer[stream->cursor] <= 'z')
- stream->buffer[stream->cursor] -= 0x0020;
-}
-
-/**
- * Advance the stream's current position
- *
- * \param stream The stream whose position to advance
- */
-void hubbub_utf8stream_advance(hubbub_inputstream *stream)
-{
- hubbub_error error;
- uint32_t next;
-
- error = hubbub_utf8_next(stream->buffer, stream->buffer_len,
- stream->cursor, &next);
-
- if (error == HUBBUB_OK)
- stream->cursor = next;
-}
-
-/**
- * Push a character back onto the stream
- *
- * \param stream Stream to push back to
- * \param character UCS4 (host-endian) codepoint to push back
- * \return HUBBUB_OK on success, appropriate error otherwise
- *
- * Note that this doesn't actually modify the data in the stream.
- * It works by ensuring that the character located just before the
- * current stream location is the same as ::character. If it is,
- * then the stream pointer is moved back. If it is not, then an
- * error is returned and the stream pointer remains unmodified.
- */
-hubbub_error hubbub_utf8stream_push_back(hubbub_inputstream *stream,
- uint32_t character)
-{
- hubbub_error error;
- uint32_t prev;
- uint8_t buf[6];
- size_t len;
-
- error = hubbub_utf8_prev(stream->buffer, stream->cursor, &prev);
- if (error != HUBBUB_OK)
- return error;
-
- error = hubbub_utf8_from_ucs4(character, buf, &len);
- if (error != HUBBUB_OK)
- return error;
-
- if ((stream->cursor - prev) != len ||
- memcmp(stream->buffer + prev, buf, len) != 0)
- return HUBBUB_INVALID;
-
- stream->cursor = prev;
-
- return HUBBUB_OK;
-}
-
-/**
- * Case insensitively compare a pair of ranges in the input stream
- *
- * \param stream Input stream to look in
- * \param r1 Offset of start of first range
- * \param r2 Offset of start of second range
- * \param len Byte length of ranges
- * \return 0 if ranges match, non-zero otherwise
- */
-int hubbub_utf8stream_compare_range_ci(hubbub_inputstream *stream,
- uint32_t r1, uint32_t r2, size_t len)
-{
- return strncasecmp((const char *) (stream->buffer + r1),
- (const char *) (stream->buffer + r2), len);
-}
-
-/**
- * Case sensitively compare a pair of ranges in the input stream
- *
- * \param stream Input stream to look in
- * \param r1 Offset of start of first range
- * \param r2 Offset of start of second range
- * \param len Byte length of ranges
- * \return 0 if ranges match, non-zero otherwise
- */
-int hubbub_utf8stream_compare_range_cs(hubbub_inputstream *stream,
- uint32_t r1, uint32_t r2, size_t len)
-{
- return strncmp((const char *) (stream->buffer + r1),
- (const char *) (stream->buffer + r2), len);
-}
-
-/**
- * Case sensitively compare a range of input stream against an ASCII string
- *
- * \param stream Input stream to look in
- * \param off Offset of range start
- * \param len Byte length of range
- * \param data Comparison string
- * \param dlen Byte length of comparison string
- * \return 0 if match, non-zero otherwise
- */
-int hubbub_utf8stream_compare_range_ascii(hubbub_inputstream *stream,
- uint32_t off, size_t len, const char *data, size_t dlen)
-{
- /* Lengths don't match, so strings don't */
- if (len != dlen)
- return 1; /* arbitrary */
-
- return strncmp((const char *) (stream->buffer + off),
- data, len);
-}
-
-/**
- * Replace a range of bytes in the input stream with a single character
- *
- * \param stream Input stream containing data
- * \param start Offset of start of range to replace
- * \param len Length (in bytes) of range to replace
- * \param ucs4 UCS4 (host endian) encoded replacement character
- * \return HUBBUB_OK on success, appropriate error otherwise
- */
-hubbub_error hubbub_utf8stream_replace_range(hubbub_inputstream *stream,
- uint32_t start, size_t len, uint32_t ucs4)
-{
- uint8_t buf[6];
- size_t replen;
- int32_t diff;
- hubbub_error error;
-
- /* Get UTF8 version of replacement character */
- error = hubbub_utf8_from_ucs4(ucs4, buf, &replen);
- if (error)
- return error;
-
- diff = replen - len;
-
- if (stream->buffer_len + diff >= stream->buffer_alloc) {
- /* Need more buffer space */
- bool moved = false;
- uint8_t *temp = stream->alloc(stream->buffer,
- stream->buffer_alloc +
- ((diff + BUFFER_CHUNK - 1) & ~BUFFER_CHUNK) +
- BUFFER_CHUNK,
- stream->pw);
-
- if (temp == NULL)
- return HUBBUB_NOMEM;
-
- moved = (temp != stream->buffer);
-
- stream->buffer = temp;
- stream->buffer_alloc +=
- ((diff + BUFFER_CHUNK - 1) & ~BUFFER_CHUNK);
-
- if (moved)
- hubbub_inputstream_buffer_moved(stream);
- }
-
- /* Move subsequent input to correct location */
- memmove(stream->buffer + start + len + diff,
- stream->buffer + start + len,
- stream->buffer_len - (start + len));
-
- /* And fill the gap with the replacement character */
- memcpy(stream->buffer + start, buf, replen);
-
- /* Finally, update length */
- stream->buffer_len += diff;
-
- return HUBBUB_OK;
-}
-
-hubbub_streamhandler utf8stream = {
- hubbub_utf8stream_uses_encoding,
- hubbub_utf8stream_create
-};
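Both stream backends implement replace_range with the same shuffle: grow the buffer if necessary, memmove the tail to open or close a gap, then copy the replacement bytes in. A standalone sketch of that shuffle on a plain byte array, replacing a NUL byte with the UTF-8 encoding of U+FFFD (no Hubbub types involved):

#include <assert.h>
#include <stddef.h>
#include <string.h>

/* Replace buf[start .. start+len) with rep[0 .. replen). The caller must
 * ensure the array has room for *buflen + replen - len bytes. */
static void replace_range(unsigned char *buf, size_t *buflen,
		size_t start, size_t len,
		const unsigned char *rep, size_t replen)
{
	/* Shift the tail so the gap is exactly replen bytes wide */
	memmove(buf + start + replen, buf + start + len,
			*buflen - (start + len));

	/* Fill the gap with the replacement bytes */
	memcpy(buf + start, rep, replen);

	*buflen += replen;
	*buflen -= len;
}

int main(void)
{
	unsigned char buf[32] = "ab\0cd";	/* NUL at offset 2 */
	size_t buflen = 5;
	const unsigned char rep[] = { 0xEF, 0xBF, 0xBD };	/* U+FFFD */

	replace_range(buf, &buflen, 2, 1, rep, sizeof(rep));

	assert(buflen == 7);
	assert(memcmp(buf, "ab\xEF\xBF\xBD" "cd", 7) == 0);
	return 0;
}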
diff --git a/src/parser.c b/src/parser.c
index 1ae498a..26b2b1f 100644
--- a/src/parser.c
+++ b/src/parser.c
@@ -5,9 +5,11 @@
* Copyright 2007-8 John-Mark Bell <jmb@netsurf-browser.org>
*/
+#include <parserutils/input/inputstream.h>
+
#include <hubbub/parser.h>
-#include "input/inputstream.h"
+#include "charset/detect.h"
#include "tokeniser/tokeniser.h"
#include "treebuilder/treebuilder.h"
@@ -15,7 +17,7 @@
* Hubbub parser object
*/
struct hubbub_parser {
- hubbub_inputstream *stream; /**< Input stream instance */
+ parserutils_inputstream *stream; /**< Input stream instance */
hubbub_tokeniser *tok; /**< Tokeniser instance */
hubbub_treebuilder *tb; /**< Treebuilder instance */
@@ -44,7 +46,9 @@ hubbub_parser *hubbub_parser_create(const char *enc, const char *int_enc,
if (parser == NULL)
return NULL;
- parser->stream = hubbub_inputstream_create(enc, int_enc, alloc, pw);
+ parser->stream = parserutils_inputstream_create(enc,
+ enc != NULL ? HUBBUB_CHARSET_DICTATED : HUBBUB_CHARSET_UNKNOWN,
+ hubbub_charset_extract, alloc, pw);
if (parser->stream == NULL) {
alloc(parser, 0, pw);
return NULL;
@@ -52,7 +56,7 @@ hubbub_parser *hubbub_parser_create(const char *enc, const char *int_enc,
parser->tok = hubbub_tokeniser_create(parser->stream, alloc, pw);
if (parser->tok == NULL) {
- hubbub_inputstream_destroy(parser->stream);
+ parserutils_inputstream_destroy(parser->stream);
alloc(parser, 0, pw);
return NULL;
}
@@ -60,7 +64,7 @@ hubbub_parser *hubbub_parser_create(const char *enc, const char *int_enc,
parser->tb = hubbub_treebuilder_create(parser->tok, alloc, pw);
if (parser->tb == NULL) {
hubbub_tokeniser_destroy(parser->tok);
- hubbub_inputstream_destroy(parser->stream);
+ parserutils_inputstream_destroy(parser->stream);
alloc(parser, 0, pw);
return NULL;
}
@@ -85,7 +89,7 @@ void hubbub_parser_destroy(hubbub_parser *parser)
hubbub_tokeniser_destroy(parser->tok);
- hubbub_inputstream_destroy(parser->stream);
+ parserutils_inputstream_destroy(parser->stream);
parser->alloc(parser, 0, parser->pw);
}
@@ -119,19 +123,6 @@ hubbub_error hubbub_parser_setopt(hubbub_parser *parser,
HUBBUB_TOKENISER_TOKEN_HANDLER,
(hubbub_tokeniser_optparams *) params);
break;
- case HUBBUB_PARSER_BUFFER_HANDLER:
- /* The buffer handler cascades, so if there's a treebuilder,
- * simply inform that. Otherwise, tell the tokeniser. */
- if (parser->tb != NULL) {
- result = hubbub_treebuilder_setopt(parser->tb,
- HUBBUB_TREEBUILDER_BUFFER_HANDLER,
- (hubbub_treebuilder_optparams *) params);
- } else {
- result = hubbub_tokeniser_setopt(parser->tok,
- HUBBUB_TOKENISER_BUFFER_HANDLER,
- (hubbub_tokeniser_optparams *) params);
- }
- break;
case HUBBUB_PARSER_ERROR_HANDLER:
/* The error handler does not cascade, so tell both the
* treebuilder (if extant) and the tokeniser. */
@@ -183,14 +174,15 @@ hubbub_error hubbub_parser_setopt(hubbub_parser *parser,
hubbub_error hubbub_parser_parse_chunk(hubbub_parser *parser,
uint8_t *data, size_t len)
{
+ parserutils_error perror;
hubbub_error error;
if (parser == NULL || data == NULL)
return HUBBUB_BADPARM;
- error = hubbub_inputstream_append(parser->stream, data, len);
- if (error != HUBBUB_OK)
- return error;
+ perror = parserutils_inputstream_append(parser->stream, data, len);
+ if (perror != PARSERUTILS_OK)
+ return !HUBBUB_OK;
error = hubbub_tokeniser_run(parser->tok);
if (error != HUBBUB_OK)
@@ -221,7 +213,7 @@ hubbub_error hubbub_parser_parse_extraneous_chunk(hubbub_parser *parser,
if (parser == NULL || data == NULL)
return HUBBUB_BADPARM;
- error = hubbub_inputstream_insert(parser->stream, data, len);
+ error = parserutils_inputstream_insert(parser->stream, data, len);
if (error != HUBBUB_OK)
return error;
@@ -240,14 +232,15 @@ hubbub_error hubbub_parser_parse_extraneous_chunk(hubbub_parser *parser,
*/
hubbub_error hubbub_parser_completed(hubbub_parser *parser)
{
+ parserutils_error perror;
hubbub_error error;
if (parser == NULL)
return HUBBUB_BADPARM;
- error = hubbub_inputstream_append(parser->stream, NULL, 0);
- if (error != HUBBUB_OK)
- return error;
+ perror = parserutils_inputstream_append(parser->stream, NULL, 0);
+	if (perror != PARSERUTILS_OK)
+ return !HUBBUB_OK;
error = hubbub_tokeniser_run(parser->tok);
if (error != HUBBUB_OK)
@@ -264,32 +257,11 @@ hubbub_error hubbub_parser_completed(hubbub_parser *parser)
* \return Pointer to charset name (constant; do not free), or NULL if unknown
*/
const char *hubbub_parser_read_charset(hubbub_parser *parser,
- hubbub_charset_source *source)
+ uint32_t *source)
{
if (parser == NULL || source == NULL)
return NULL;
- return hubbub_inputstream_read_charset(parser->stream, source);
+ return parserutils_inputstream_read_charset(parser->stream, source);
}
-/**
- * Claim ownership of the document buffer
- *
- * \param parser Parser whose buffer to claim
- * \param buffer Pointer to location to receive buffer pointer
- * \param len Pointer to location to receive byte length of buffer
- * \return HUBBUB_OK on success, appropriate error otherwise.
- *
- * Once the buffer has been claimed by a client, the parser disclaims
- * all ownership rights (and invalidates any internal references it may have
- * to the buffer). Therefore, the only parser call which may be made
- * after calling this function is to destroy the parser.
- */
-hubbub_error hubbub_parser_claim_buffer(hubbub_parser *parser,
- uint8_t **buffer, size_t *len)
-{
- if (parser == NULL || buffer == NULL || len == NULL)
- return HUBBUB_BADPARM;
-
- return hubbub_inputstream_claim_buffer(parser->stream, buffer, len);
-}
diff --git a/src/tokeniser/tokeniser.c b/src/tokeniser/tokeniser.c
index df8946b..dee0a76 100644
--- a/src/tokeniser/tokeniser.c
+++ b/src/tokeniser/tokeniser.c
@@ -3,11 +3,16 @@
* Licensed under the MIT License,
* http://www.opensource.org/licenses/mit-license.php
* Copyright 2007 John-Mark Bell <jmb@netsurf-browser.org>
+ * Copyright 2008 Andrew Sidwell <takkaria@netsurf-browser.org>
*/
#include <assert.h>
#include <stdbool.h>
#include <string.h>
+#include <stdio.h>
+
+#include <parserutils/charset/utf8.h>
+
#include "utils/utils.h"
#include "tokeniser/entities.h"
@@ -24,72 +29,87 @@ static const uint32_t cp1252Table[32] = {
};
/**
+ * UTF-8 encoding of U+FFFD REPLACEMENT CHARACTER
+ */
+static const uint8_t u_fffd[3] = { '\xEF', '\xBF', '\xBD' };
+static const hubbub_string u_fffd_str = { u_fffd, sizeof(u_fffd) };
+
+
+/**
+ * String for when we want to emit newlines
+ */
+static const uint8_t lf = '\n';
+static const hubbub_string lf_str = { &lf, 1 };
+
+
+/**
* Tokeniser states
*/
typedef enum hubbub_tokeniser_state {
- HUBBUB_TOKENISER_STATE_DATA,
- HUBBUB_TOKENISER_STATE_CHARACTER_REFERENCE_DATA,
- HUBBUB_TOKENISER_STATE_TAG_OPEN,
- HUBBUB_TOKENISER_STATE_CLOSE_TAG_OPEN,
- HUBBUB_TOKENISER_STATE_TAG_NAME,
- HUBBUB_TOKENISER_STATE_BEFORE_ATTRIBUTE_NAME,
- HUBBUB_TOKENISER_STATE_ATTRIBUTE_NAME,
- HUBBUB_TOKENISER_STATE_AFTER_ATTRIBUTE_NAME,
- HUBBUB_TOKENISER_STATE_BEFORE_ATTRIBUTE_VALUE,
- HUBBUB_TOKENISER_STATE_ATTRIBUTE_VALUE_DQ,
- HUBBUB_TOKENISER_STATE_ATTRIBUTE_VALUE_SQ,
- HUBBUB_TOKENISER_STATE_ATTRIBUTE_VALUE_UQ,
- HUBBUB_TOKENISER_STATE_CHARACTER_REFERENCE_IN_ATTRIBUTE_VALUE,
- HUBBUB_TOKENISER_STATE_AFTER_ATTRIBUTE_VALUE_Q,
- HUBBUB_TOKENISER_STATE_SELF_CLOSING_START_TAG,
- HUBBUB_TOKENISER_STATE_BOGUS_COMMENT,
- HUBBUB_TOKENISER_STATE_MARKUP_DECLARATION_OPEN,
- HUBBUB_TOKENISER_STATE_MATCH_COMMENT,
- HUBBUB_TOKENISER_STATE_COMMENT_START,
- HUBBUB_TOKENISER_STATE_COMMENT_START_DASH,
- HUBBUB_TOKENISER_STATE_COMMENT,
- HUBBUB_TOKENISER_STATE_COMMENT_END_DASH,
- HUBBUB_TOKENISER_STATE_COMMENT_END,
- HUBBUB_TOKENISER_STATE_MATCH_DOCTYPE,
- HUBBUB_TOKENISER_STATE_DOCTYPE,
- HUBBUB_TOKENISER_STATE_BEFORE_DOCTYPE_NAME,
- HUBBUB_TOKENISER_STATE_DOCTYPE_NAME,
- HUBBUB_TOKENISER_STATE_AFTER_DOCTYPE_NAME,
- HUBBUB_TOKENISER_STATE_MATCH_PUBLIC,
- HUBBUB_TOKENISER_STATE_BEFORE_DOCTYPE_PUBLIC,
- HUBBUB_TOKENISER_STATE_DOCTYPE_PUBLIC_DQ,
- HUBBUB_TOKENISER_STATE_DOCTYPE_PUBLIC_SQ,
- HUBBUB_TOKENISER_STATE_AFTER_DOCTYPE_PUBLIC,
- HUBBUB_TOKENISER_STATE_MATCH_SYSTEM,
- HUBBUB_TOKENISER_STATE_BEFORE_DOCTYPE_SYSTEM,
- HUBBUB_TOKENISER_STATE_DOCTYPE_SYSTEM_DQ,
- HUBBUB_TOKENISER_STATE_DOCTYPE_SYSTEM_SQ,
- HUBBUB_TOKENISER_STATE_AFTER_DOCTYPE_SYSTEM,
- HUBBUB_TOKENISER_STATE_BOGUS_DOCTYPE,
- HUBBUB_TOKENISER_STATE_MATCH_CDATA,
- HUBBUB_TOKENISER_STATE_CDATA_BLOCK,
- HUBBUB_TOKENISER_STATE_NUMBERED_ENTITY,
- HUBBUB_TOKENISER_STATE_NAMED_ENTITY
+ STATE_DATA,
+ STATE_CHARACTER_REFERENCE_DATA,
+ STATE_TAG_OPEN,
+ STATE_CLOSE_TAG_OPEN,
+ STATE_TAG_NAME,
+ STATE_BEFORE_ATTRIBUTE_NAME,
+ STATE_ATTRIBUTE_NAME,
+ STATE_AFTER_ATTRIBUTE_NAME,
+ STATE_BEFORE_ATTRIBUTE_VALUE,
+ STATE_ATTRIBUTE_VALUE_DQ,
+ STATE_ATTRIBUTE_VALUE_SQ,
+ STATE_ATTRIBUTE_VALUE_UQ,
+ STATE_CHARACTER_REFERENCE_IN_ATTRIBUTE_VALUE,
+ STATE_AFTER_ATTRIBUTE_VALUE_Q,
+ STATE_SELF_CLOSING_START_TAG,
+ STATE_BOGUS_COMMENT,
+ STATE_MARKUP_DECLARATION_OPEN,
+ STATE_MATCH_COMMENT,
+ STATE_COMMENT_START,
+ STATE_COMMENT_START_DASH,
+ STATE_COMMENT,
+ STATE_COMMENT_END_DASH,
+ STATE_COMMENT_END,
+ STATE_MATCH_DOCTYPE,
+ STATE_DOCTYPE,
+ STATE_BEFORE_DOCTYPE_NAME,
+ STATE_DOCTYPE_NAME,
+ STATE_AFTER_DOCTYPE_NAME,
+ STATE_MATCH_PUBLIC,
+ STATE_BEFORE_DOCTYPE_PUBLIC,
+ STATE_DOCTYPE_PUBLIC_DQ,
+ STATE_DOCTYPE_PUBLIC_SQ,
+ STATE_AFTER_DOCTYPE_PUBLIC,
+ STATE_MATCH_SYSTEM,
+ STATE_BEFORE_DOCTYPE_SYSTEM,
+ STATE_DOCTYPE_SYSTEM_DQ,
+ STATE_DOCTYPE_SYSTEM_SQ,
+ STATE_AFTER_DOCTYPE_SYSTEM,
+ STATE_BOGUS_DOCTYPE,
+ STATE_MATCH_CDATA,
+ STATE_CDATA_BLOCK,
+ STATE_NUMBERED_ENTITY,
+ STATE_NAMED_ENTITY
} hubbub_tokeniser_state;
/**
* Context for tokeniser
*/
typedef struct hubbub_tokeniser_context {
- hubbub_token_type current_tag_type; /**< Type of current_tag */
- hubbub_tag current_tag; /**< Current tag */
+ hubbub_string chars; /**< Pending characters */
- hubbub_string current_comment; /**< Current comment */
+ hubbub_string current_comment; /**< Current comment text */
+ hubbub_token_type current_tag_type; /**< Type of current_tag */
+ hubbub_tag current_tag; /**< Current tag */
hubbub_doctype current_doctype; /**< Current doctype */
-
- hubbub_string current_chars; /**< Pending characters */
-
hubbub_tokeniser_state prev_state; /**< Previous state */
-
- hubbub_string last_start_tag_name; /**< Name of the last start tag
+ uint8_t last_start_tag_name[10]; /**< Name of the last start tag
* emitted */
+	size_t last_start_tag_len;	/**< Byte length of last_start_tag_name */
+
+	bool to_buf;			/**< Whether the current string is being
+					 * copied into tokeniser->buffer */
+
struct {
uint32_t count;
bool match;
@@ -105,26 +125,27 @@ typedef struct hubbub_tokeniser_context {
} match_cdata;
struct {
- hubbub_string str; /**< Pending string */
- uint32_t poss_len;
+ size_t offset; /**< Offset in buffer */
+ uint32_t length; /**< Length of entity */
+ uint32_t codepoint; /**< UCS4 codepoint */
+ bool complete; /**< True if match complete */
+
+ uint32_t poss_length; /**< Optimistic length
+ * when matching named
+ * character references */
uint8_t base; /**< Base for numeric
* entities */
- uint32_t codepoint; /**< UCS4 codepoint */
+ void *context; /**< Context for named
+ * entity search */
+ size_t prev_len; /**< Previous byte length
+ * of str */
bool had_data; /**< Whether we read
* anything after &#(x)? */
- hubbub_tokeniser_state return_state; /**< State we were
- * called from */
- bool complete; /**< Flag that entity
- * matching completed */
- bool done_setup; /**< Flag that match setup
- * has completed */
bool overflow; /**< Whether this entity has
* has overflowed the maximum
* numeric entity value */
- void *context; /**< Context for named
- * entity search */
- size_t prev_len; /**< Previous byte length
- * of str */
+ hubbub_tokeniser_state return_state; /**< State we were
+ * called from */
} match_entity;
struct {
@@ -146,19 +167,14 @@ struct hubbub_tokeniser {
bool escape_flag; /**< Escape flag **/
bool process_cdata_section;
- hubbub_inputstream *input; /**< Input stream */
-
- const uint8_t *input_buffer; /**< Start of input stream's buffer */
- size_t input_buffer_len; /**< Length of input buffer */
+ parserutils_inputstream *input; /**< Input stream */
+ parserutils_buffer *buffer; /**< Input buffer */
hubbub_tokeniser_context context; /**< Tokeniser context */
hubbub_token_handler token_handler;
void *token_pw;
- hubbub_buffer_handler buffer_handler;
- void *buffer_pw;
-
hubbub_error_handler error_handler;
void *error_pw;
@@ -198,14 +214,7 @@ static bool hubbub_tokeniser_handle_bogus_comment(
static bool hubbub_tokeniser_handle_markup_declaration_open(
hubbub_tokeniser *tokeniser);
static bool hubbub_tokeniser_handle_match_comment(hubbub_tokeniser *tokeniser);
-static bool hubbub_tokeniser_handle_comment_start(
- hubbub_tokeniser *tokeniser);
-static bool hubbub_tokeniser_handle_comment_start_dash(
- hubbub_tokeniser *tokeniser);
static bool hubbub_tokeniser_handle_comment(hubbub_tokeniser *tokeniser);
-static bool hubbub_tokeniser_handle_comment_end_dash(
- hubbub_tokeniser *tokeniser);
-static bool hubbub_tokeniser_handle_comment_end(hubbub_tokeniser *tokeniser);
static bool hubbub_tokeniser_handle_match_doctype(
hubbub_tokeniser *tokeniser);
static bool hubbub_tokeniser_handle_doctype(hubbub_tokeniser *tokeniser);
@@ -238,13 +247,12 @@ static bool hubbub_tokeniser_handle_bogus_doctype(
static bool hubbub_tokeniser_handle_match_cdata(hubbub_tokeniser *tokeniser);
static bool hubbub_tokeniser_handle_cdata_block(hubbub_tokeniser *tokeniser);
static bool hubbub_tokeniser_consume_character_reference(
- hubbub_tokeniser *tokeniser);
+ hubbub_tokeniser *tokeniser, size_t off);
static bool hubbub_tokeniser_handle_numbered_entity(
hubbub_tokeniser *tokeniser);
static bool hubbub_tokeniser_handle_named_entity(
hubbub_tokeniser *tokeniser);
-static void hubbub_tokeniser_buffer_moved_handler(const uint8_t *buffer,
- size_t len, void *pw);
+
static void hubbub_tokeniser_emit_token(hubbub_tokeniser *tokeniser,
hubbub_token *token);
@@ -256,7 +264,7 @@ static void hubbub_tokeniser_emit_token(hubbub_tokeniser *tokeniser,
* \param pw Pointer to client-specific private data (may be NULL)
* \return Pointer to tokeniser instance, or NULL on failure
*/
-hubbub_tokeniser *hubbub_tokeniser_create(hubbub_inputstream *input,
+hubbub_tokeniser *hubbub_tokeniser_create(parserutils_inputstream *input,
hubbub_alloc alloc, void *pw)
{
hubbub_tokeniser *tok;
@@ -268,40 +276,30 @@ hubbub_tokeniser *hubbub_tokeniser_create(hubbub_inputstream *input,
if (tok == NULL)
return NULL;
- tok->state = HUBBUB_TOKENISER_STATE_DATA;
+ tok->buffer = parserutils_buffer_create(alloc, pw);
+ if (tok->buffer == NULL) {
+ alloc(tok, 0, pw);
+ return NULL;
+ }
+
+ tok->state = STATE_DATA;
tok->content_model = HUBBUB_CONTENT_MODEL_PCDATA;
tok->escape_flag = false;
tok->process_cdata_section = false;
tok->input = input;
- tok->input_buffer = NULL;
- tok->input_buffer_len = 0;
tok->token_handler = NULL;
tok->token_pw = NULL;
- tok->buffer_handler = NULL;
- tok->buffer_pw = NULL;
-
tok->error_handler = NULL;
tok->error_pw = NULL;
tok->alloc = alloc;
tok->alloc_pw = pw;
- if (hubbub_inputstream_register_movehandler(input,
- hubbub_tokeniser_buffer_moved_handler, tok) !=
- HUBBUB_OK) {
- alloc(tok, 0, pw);
- return NULL;
- }
-
memset(&tok->context, 0, sizeof(hubbub_tokeniser_context));
- tok->context.current_tag.name.type = HUBBUB_STRING_OFF;
- tok->context.current_comment.type = HUBBUB_STRING_OFF;
- tok->context.current_chars.type = HUBBUB_STRING_OFF;
- tok->context.match_entity.str.type = HUBBUB_STRING_OFF;
return tok;
}
@@ -316,9 +314,6 @@ void hubbub_tokeniser_destroy(hubbub_tokeniser *tokeniser)
if (tokeniser == NULL)
return;
- hubbub_inputstream_deregister_movehandler(tokeniser->input,
- hubbub_tokeniser_buffer_moved_handler, tokeniser);
-
if (tokeniser->context.current_tag.attributes != NULL) {
tokeniser->alloc(tokeniser->context.current_tag.attributes,
0, tokeniser->alloc_pw);
@@ -347,13 +342,6 @@ hubbub_error hubbub_tokeniser_setopt(hubbub_tokeniser *tokeniser,
tokeniser->token_handler = params->token_handler.handler;
tokeniser->token_pw = params->token_handler.pw;
break;
- case HUBBUB_TOKENISER_BUFFER_HANDLER:
- tokeniser->buffer_handler = params->buffer_handler.handler;
- tokeniser->buffer_pw = params->buffer_handler.pw;
- tokeniser->buffer_handler(tokeniser->input_buffer,
- tokeniser->input_buffer_len,
- tokeniser->buffer_pw);
- break;
case HUBBUB_TOKENISER_ERROR_HANDLER:
tokeniser->error_handler = params->error_handler.handler;
tokeniser->error_pw = params->error_handler.pw;
@@ -382,183 +370,174 @@ hubbub_error hubbub_tokeniser_run(hubbub_tokeniser *tokeniser)
if (tokeniser == NULL)
return HUBBUB_BADPARM;
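+/* state(x) expands to a case label; in debug builds it also prints the
+ * state name, so the tokeniser's progress can be traced. */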
+#ifdef NDEBUG
+#define state(x) \
+ case x:
+#else
+#define state(x) \
+ case x: \
+ printf( #x "\n");
+#endif
+
while (cont) {
switch (tokeniser->state) {
- case HUBBUB_TOKENISER_STATE_DATA:
+ state(STATE_DATA)
cont = hubbub_tokeniser_handle_data(tokeniser);
break;
- case HUBBUB_TOKENISER_STATE_CHARACTER_REFERENCE_DATA:
+ state(STATE_CHARACTER_REFERENCE_DATA)
cont = hubbub_tokeniser_handle_character_reference_data(
tokeniser);
break;
- case HUBBUB_TOKENISER_STATE_TAG_OPEN:
+ state(STATE_TAG_OPEN)
cont = hubbub_tokeniser_handle_tag_open(tokeniser);
break;
- case HUBBUB_TOKENISER_STATE_CLOSE_TAG_OPEN:
+ state(STATE_CLOSE_TAG_OPEN)
cont = hubbub_tokeniser_handle_close_tag_open(
tokeniser);
break;
- case HUBBUB_TOKENISER_STATE_TAG_NAME:
+ state(STATE_TAG_NAME)
cont = hubbub_tokeniser_handle_tag_name(tokeniser);
break;
- case HUBBUB_TOKENISER_STATE_BEFORE_ATTRIBUTE_NAME:
+ state(STATE_BEFORE_ATTRIBUTE_NAME)
cont = hubbub_tokeniser_handle_before_attribute_name(
tokeniser);
break;
- case HUBBUB_TOKENISER_STATE_ATTRIBUTE_NAME:
+ state(STATE_ATTRIBUTE_NAME)
cont = hubbub_tokeniser_handle_attribute_name(
tokeniser);
break;
- case HUBBUB_TOKENISER_STATE_AFTER_ATTRIBUTE_NAME:
+ state(STATE_AFTER_ATTRIBUTE_NAME)
cont = hubbub_tokeniser_handle_after_attribute_name(
tokeniser);
break;
- case HUBBUB_TOKENISER_STATE_BEFORE_ATTRIBUTE_VALUE:
+ state(STATE_BEFORE_ATTRIBUTE_VALUE)
cont = hubbub_tokeniser_handle_before_attribute_value(
tokeniser);
break;
- case HUBBUB_TOKENISER_STATE_ATTRIBUTE_VALUE_DQ:
+ state(STATE_ATTRIBUTE_VALUE_DQ)
cont = hubbub_tokeniser_handle_attribute_value_dq(
tokeniser);
break;
- case HUBBUB_TOKENISER_STATE_ATTRIBUTE_VALUE_SQ:
+ state(STATE_ATTRIBUTE_VALUE_SQ)
cont = hubbub_tokeniser_handle_attribute_value_sq(
tokeniser);
break;
- case HUBBUB_TOKENISER_STATE_ATTRIBUTE_VALUE_UQ:
+ state(STATE_ATTRIBUTE_VALUE_UQ)
cont = hubbub_tokeniser_handle_attribute_value_uq(
tokeniser);
break;
- case HUBBUB_TOKENISER_STATE_CHARACTER_REFERENCE_IN_ATTRIBUTE_VALUE:
+ state(STATE_CHARACTER_REFERENCE_IN_ATTRIBUTE_VALUE)
cont = hubbub_tokeniser_handle_character_reference_in_attribute_value(
tokeniser);
break;
- case HUBBUB_TOKENISER_STATE_AFTER_ATTRIBUTE_VALUE_Q:
+ state(STATE_AFTER_ATTRIBUTE_VALUE_Q)
cont = hubbub_tokeniser_handle_after_attribute_value_q(
tokeniser);
break;
- case HUBBUB_TOKENISER_STATE_SELF_CLOSING_START_TAG:
+ state(STATE_SELF_CLOSING_START_TAG)
cont = hubbub_tokeniser_handle_self_closing_start_tag(
tokeniser);
break;
- case HUBBUB_TOKENISER_STATE_BOGUS_COMMENT:
+ state(STATE_BOGUS_COMMENT)
cont = hubbub_tokeniser_handle_bogus_comment(
tokeniser);
break;
- case HUBBUB_TOKENISER_STATE_MARKUP_DECLARATION_OPEN:
+ state(STATE_MARKUP_DECLARATION_OPEN)
cont = hubbub_tokeniser_handle_markup_declaration_open(
tokeniser);
break;
- case HUBBUB_TOKENISER_STATE_MATCH_COMMENT:
+ state(STATE_MATCH_COMMENT)
cont = hubbub_tokeniser_handle_match_comment(
tokeniser);
break;
- case HUBBUB_TOKENISER_STATE_COMMENT_START:
- cont = hubbub_tokeniser_handle_comment_start(
- tokeniser);
- break;
- case HUBBUB_TOKENISER_STATE_COMMENT_START_DASH:
- cont = hubbub_tokeniser_handle_comment_start_dash(
- tokeniser);
- break;
- case HUBBUB_TOKENISER_STATE_COMMENT:
+ case STATE_COMMENT_START:
+ case STATE_COMMENT_START_DASH:
+ case STATE_COMMENT:
+ case STATE_COMMENT_END_DASH:
+ case STATE_COMMENT_END:
+#ifndef NDEBUG
+ printf("COMMENT %d\n",
+ tokeniser->state - STATE_COMMENT_START + 1);
+#endif
cont = hubbub_tokeniser_handle_comment(tokeniser);
break;
- case HUBBUB_TOKENISER_STATE_COMMENT_END_DASH:
- cont = hubbub_tokeniser_handle_comment_end_dash(
- tokeniser);
- break;
- case HUBBUB_TOKENISER_STATE_COMMENT_END:
- cont = hubbub_tokeniser_handle_comment_end(
- tokeniser);
- break;
- case HUBBUB_TOKENISER_STATE_MATCH_DOCTYPE:
+ state(STATE_MATCH_DOCTYPE)
cont = hubbub_tokeniser_handle_match_doctype(
tokeniser);
break;
- case HUBBUB_TOKENISER_STATE_DOCTYPE:
+ state(STATE_DOCTYPE)
cont = hubbub_tokeniser_handle_doctype(tokeniser);
break;
- case HUBBUB_TOKENISER_STATE_BEFORE_DOCTYPE_NAME:
+ state(STATE_BEFORE_DOCTYPE_NAME)
cont = hubbub_tokeniser_handle_before_doctype_name(
tokeniser);
break;
- case HUBBUB_TOKENISER_STATE_DOCTYPE_NAME:
+ state(STATE_DOCTYPE_NAME)
cont = hubbub_tokeniser_handle_doctype_name(
tokeniser);
break;
- case HUBBUB_TOKENISER_STATE_AFTER_DOCTYPE_NAME:
+ state(STATE_AFTER_DOCTYPE_NAME)
cont = hubbub_tokeniser_handle_after_doctype_name(
tokeniser);
break;
- case HUBBUB_TOKENISER_STATE_MATCH_PUBLIC:
+ state(STATE_MATCH_PUBLIC)
cont = hubbub_tokeniser_handle_match_public(
tokeniser);
break;
-
- case HUBBUB_TOKENISER_STATE_BEFORE_DOCTYPE_PUBLIC:
+ state(STATE_BEFORE_DOCTYPE_PUBLIC)
cont = hubbub_tokeniser_handle_before_doctype_public(
tokeniser);
break;
-
- case HUBBUB_TOKENISER_STATE_DOCTYPE_PUBLIC_DQ:
+ state(STATE_DOCTYPE_PUBLIC_DQ)
cont = hubbub_tokeniser_handle_doctype_public_dq(
tokeniser);
break;
-
- case HUBBUB_TOKENISER_STATE_DOCTYPE_PUBLIC_SQ:
+ state(STATE_DOCTYPE_PUBLIC_SQ)
cont = hubbub_tokeniser_handle_doctype_public_sq(
tokeniser);
break;
-
- case HUBBUB_TOKENISER_STATE_AFTER_DOCTYPE_PUBLIC:
+ state(STATE_AFTER_DOCTYPE_PUBLIC)
cont = hubbub_tokeniser_handle_after_doctype_public(
tokeniser);
break;
-
- case HUBBUB_TOKENISER_STATE_MATCH_SYSTEM:
+ state(STATE_MATCH_SYSTEM)
cont = hubbub_tokeniser_handle_match_system(
tokeniser);
break;
-
- case HUBBUB_TOKENISER_STATE_BEFORE_DOCTYPE_SYSTEM:
+ state(STATE_BEFORE_DOCTYPE_SYSTEM)
cont = hubbub_tokeniser_handle_before_doctype_system(
tokeniser);
break;
-
- case HUBBUB_TOKENISER_STATE_DOCTYPE_SYSTEM_DQ:
+ state(STATE_DOCTYPE_SYSTEM_DQ)
cont = hubbub_tokeniser_handle_doctype_system_dq(
tokeniser);
break;
-
- case HUBBUB_TOKENISER_STATE_DOCTYPE_SYSTEM_SQ:
+ state(STATE_DOCTYPE_SYSTEM_SQ)
cont = hubbub_tokeniser_handle_doctype_system_sq(
tokeniser);
break;
-
- case HUBBUB_TOKENISER_STATE_AFTER_DOCTYPE_SYSTEM:
+ state(STATE_AFTER_DOCTYPE_SYSTEM)
cont = hubbub_tokeniser_handle_after_doctype_system(
tokeniser);
break;
-
- case HUBBUB_TOKENISER_STATE_BOGUS_DOCTYPE:
+ state(STATE_BOGUS_DOCTYPE)
cont = hubbub_tokeniser_handle_bogus_doctype(
tokeniser);
break;
- case HUBBUB_TOKENISER_STATE_MATCH_CDATA:
+ state(STATE_MATCH_CDATA)
cont = hubbub_tokeniser_handle_match_cdata(
tokeniser);
break;
- case HUBBUB_TOKENISER_STATE_CDATA_BLOCK:
+ state(STATE_CDATA_BLOCK)
cont = hubbub_tokeniser_handle_cdata_block(
tokeniser);
break;
- case HUBBUB_TOKENISER_STATE_NUMBERED_ENTITY:
+ state(STATE_NUMBERED_ENTITY)
cont = hubbub_tokeniser_handle_numbered_entity(
tokeniser);
break;
- case HUBBUB_TOKENISER_STATE_NAMED_ENTITY:
+ state(STATE_NAMED_ENTITY)
cont = hubbub_tokeniser_handle_named_entity(
tokeniser);
break;
@@ -568,514 +547,691 @@ hubbub_error hubbub_tokeniser_run(hubbub_tokeniser *tokeniser)
return HUBBUB_OK;
}
+
+/**
+ * Macro to obtain the current character from the pointer "cptr".
+ *
+ * To be eliminated as soon as checks for EOF always happen before we want
+ * the current character.
+ */
+#define CHAR(cptr) \
+ (((cptr) == PARSERUTILS_INPUTSTREAM_EOF) ? 0 : (*((uint8_t *) cptr)))
+
+
+/**
+ * Various macros for manipulating buffers.
+ *
+ * \todo make some of these inline functions (type-safety)
+ * \todo document them properly here
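+ *
+ * Rough usage (sketch): a hubbub_string is begun with START()/START_BUF(),
+ * extended with COLLECT() while bytes can be referenced verbatim, switched
+ * into the side buffer with COLLECT_CHAR() when a substituted byte (a
+ * lowercased letter or U+FFFD) must be stored, and closed with FINISH(),
+ * which stops further buffering. For example, a tag name is accumulated
+ * along the lines of:
+ *
+ *   START_BUF(ctag->name, &lc, len); ... COLLECT(ctag->name, cptr, len);
+ *   ... FINISH(ctag->name);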
+ */
+
+#define START_BUF(str, cptr, length) \
+ do { \
+ uint8_t *data = tokeniser->buffer->data + \
+ tokeniser->buffer->length; \
+ parserutils_buffer_append( \
+ tokeniser->buffer, \
+ cptr, (length)); \
+ (str).ptr = data; \
+ (str).len = (length); \
+ tokeniser->context.to_buf = true; \
+ } while (0)
+
+#define START(str, cptr, length) \
+ do { \
+ if (tokeniser->context.to_buf) { \
+ START_BUF(str, (uint8_t *)(cptr), length); \
+ } else { \
+ (str).ptr = (uint8_t *)(cptr); \
+ (str).len = (length); \
+ } \
+ } while (0)
+
+#define COLLECT(str, cptr, length) \
+ do { \
+ assert(str.len != 0); \
+ if (tokeniser->context.to_buf == true) { \
+ parserutils_buffer_append(tokeniser->buffer, \
+ (uint8_t *) cptr, (length)); \
+ } \
+ (str).len += (length); \
+ } while (0)
+
+#define COLLECT_NOBUF(str, length) \
+ do { \
+ assert(str.len != 0); \
+ (str).len += (length); \
+ } while (0)
+
+#define COLLECT_MS(str, cptr, length) \
+ do { \
+ if ((str).len == 0) { \
+ START(str, cptr, length); \
+ } else { \
+ COLLECT(str, cptr, length); \
+ } \
+ } while (0)
+
+#define COLLECT_MS_NOBUF(str, cptr, length) \
+ do { \
+ if ((str).len == 0) { \
+ (str).ptr = (uint8_t *) cptr; \
+ } \
+ (str).len += (length); \
+ } while (0)
+
+#define FINISH(str) \
+ tokeniser->context.to_buf = false
+
+#define SWITCH(str) \
+ do { \
+ uint8_t *data = tokeniser->buffer->data + \
+ tokeniser->buffer->length; \
+ parserutils_buffer_append( \
+ tokeniser->buffer, \
+ (str).ptr, (str).len); \
+ (str).ptr = data; \
+ tokeniser->context.to_buf = true; \
+ } while (0)
+
+#define COLLECT_CHAR(str, cptr, length) \
+ do { \
+ assert(str.len != 0); \
+ if (tokeniser->context.to_buf == false) { \
+ SWITCH(str); \
+ } \
+ parserutils_buffer_append(tokeniser->buffer, cptr, (length)); \
+ str.len += (length); \
+ } while (0)
+
+
+
+/**
+ * Emit a character token.
+ *
+ * \param tokeniser Tokeniser instance
+ * \param chars Pointer to hubbub_string to emit
+ * \return true
+ */
+static inline bool emit_character_token(hubbub_tokeniser *tokeniser,
+ const hubbub_string *chars)
+{
+ hubbub_token token;
+
+ token.type = HUBBUB_TOKEN_CHARACTER;
+ token.data.character = *chars;
+
+ hubbub_tokeniser_emit_token(tokeniser, &token);
+
+ return true;
+}
+
+/**
+ * Emit the current pending characters being stored in the tokeniser context.
+ *
+ * \param tokeniser Tokeniser instance
+ * \return true
+ */
+static inline bool emit_current_chars(hubbub_tokeniser *tokeniser)
+{
+ hubbub_token token;
+
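+	/* The pending characters start at the current stream position, so
+	 * peeking at offset 0 yields a pointer to the start of the run */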
+ size_t len;
+ uintptr_t cptr = parserutils_inputstream_peek(
+ tokeniser->input, 0, &len);
+
+ token.type = HUBBUB_TOKEN_CHARACTER;
+ token.data.character.ptr = (uint8_t *) cptr;
+ token.data.character.len = tokeniser->context.chars.len;
+
+ hubbub_tokeniser_emit_token(tokeniser, &token);
+
+ return true;
+}
+
+/**
+ * Emit the current tag token being stored in the tokeniser context.
+ *
+ * \param tokeniser Tokeniser instance
+ * \return true
+ */
+static inline bool emit_current_tag(hubbub_tokeniser *tokeniser)
+{
+ hubbub_token token;
+
+ /* Emit current tag */
+ token.type = tokeniser->context.current_tag_type;
+ token.data.tag = tokeniser->context.current_tag;
+
+ hubbub_tokeniser_emit_token(tokeniser, &token);
+
+ return true;
+}
+
+/**
+ * Emit the current comment token being stored in the tokeniser context.
+ *
+ * \param tokeniser Tokeniser instance
+ * \return true
+ */
+static inline bool emit_current_comment(hubbub_tokeniser *tokeniser)
+{
+ hubbub_token token;
+
+ token.type = HUBBUB_TOKEN_COMMENT;
+ token.data.comment = tokeniser->context.current_comment;
+
+ hubbub_tokeniser_emit_token(tokeniser, &token);
+
+ return true;
+}
+
+/**
+ * Emit the current doctype token being stored in the tokeniser context.
+ *
+ * \param tokeniser Tokeniser instance
+ * \param force_quirks Force quirks mode on this document
+ * \return true
+ */
+static inline bool emit_current_doctype(hubbub_tokeniser *tokeniser,
+ bool force_quirks)
+{
+ hubbub_token token;
+
+ /* Emit doctype */
+ token.type = HUBBUB_TOKEN_DOCTYPE;
+ token.data.doctype = tokeniser->context.current_doctype;
+ if (force_quirks == true)
+ token.data.doctype.force_quirks = true;
+
+ hubbub_tokeniser_emit_token(tokeniser, &token);
+
+ return true;
+}
+
+
+
+
+
+/* this should always be called with an empty "chars" buffer */
bool hubbub_tokeniser_handle_data(hubbub_tokeniser *tokeniser)
{
hubbub_token token;
- uint32_t c;
+ uintptr_t cptr;
+ size_t len;
- /* Clear current characters */
- tokeniser->context.current_chars.data.off = 0;
- tokeniser->context.current_chars.len = 0;
+ while ((cptr = parserutils_inputstream_peek(tokeniser->input,
+ tokeniser->context.chars.len, &len)) !=
+ PARSERUTILS_INPUTSTREAM_EOF &&
+ cptr != PARSERUTILS_INPUTSTREAM_OOD) {
+ uint8_t c = CHAR(cptr);
- while ((c = hubbub_inputstream_peek(tokeniser->input)) !=
- HUBBUB_INPUTSTREAM_EOF &&
- c != HUBBUB_INPUTSTREAM_OOD) {
if (c == '&' &&
(tokeniser->content_model == HUBBUB_CONTENT_MODEL_PCDATA ||
tokeniser->content_model == HUBBUB_CONTENT_MODEL_RCDATA) &&
tokeniser->escape_flag == false) {
tokeniser->state =
- HUBBUB_TOKENISER_STATE_CHARACTER_REFERENCE_DATA;
+ STATE_CHARACTER_REFERENCE_DATA;
/* Don't eat the '&'; it'll be handled by entity
* consumption */
break;
- } else if (c == '-') {
- size_t len;
- uint32_t pos;
- pos = hubbub_inputstream_cur_pos(tokeniser->input,
- &len);
+ } else if (c == '-') {
+ hubbub_string *chars = &tokeniser->context.chars;
if (tokeniser->escape_flag == false &&
- (tokeniser->content_model ==
- HUBBUB_CONTENT_MODEL_RCDATA ||
- tokeniser->content_model ==
- HUBBUB_CONTENT_MODEL_CDATA) &&
- pos >= 3 &&
- hubbub_inputstream_compare_range_ascii(
- tokeniser->input, pos - 3, 4,
- "<!--", SLEN("<!--")) == 0)
- {
- tokeniser->escape_flag = true;
- }
+ (tokeniser->content_model ==
+ HUBBUB_CONTENT_MODEL_RCDATA ||
+ tokeniser->content_model ==
+ HUBBUB_CONTENT_MODEL_CDATA) &&
+ chars->len >= 3) {
+
+ cptr = parserutils_inputstream_peek(
+ tokeniser->input,
+ chars->len - 3, &len);
- if (tokeniser->context.current_chars.len == 0) {
- tokeniser->context.current_chars.data.off =
- pos;
+ if (strncmp((char *)cptr,
+ "<!--", SLEN("<!--")) == 0)
+ tokeniser->escape_flag = true;
}
- tokeniser->context.current_chars.len += len;
- hubbub_inputstream_advance(tokeniser->input);
+ COLLECT_MS(tokeniser->context.chars, cptr, len);
} else if (c == '<' && (tokeniser->content_model ==
HUBBUB_CONTENT_MODEL_PCDATA ||
- ((tokeniser->content_model ==
+ ((tokeniser->content_model ==
HUBBUB_CONTENT_MODEL_RCDATA ||
- tokeniser->content_model ==
+ tokeniser->content_model ==
HUBBUB_CONTENT_MODEL_CDATA) &&
tokeniser->escape_flag == false))) {
- if (tokeniser->context.current_chars.len > 0) {
+ if (tokeniser->context.chars.len > 0) {
/* Emit any pending characters */
- token.type = HUBBUB_TOKEN_CHARACTER;
- token.data.character =
- tokeniser->context.current_chars;
-
- hubbub_tokeniser_emit_token(tokeniser,
- &token);
+ emit_current_chars(tokeniser);
}
/* Buffer '<' */
- tokeniser->context.current_chars.data.off =
- hubbub_inputstream_cur_pos(tokeniser->input,
- &tokeniser->context.current_chars.len);
-
- tokeniser->state = HUBBUB_TOKENISER_STATE_TAG_OPEN;
- hubbub_inputstream_advance(tokeniser->input);
+ START(tokeniser->context.chars, cptr, len);
+ tokeniser->state = STATE_TAG_OPEN;
break;
} else if (c == '>') {
- size_t len;
- uint32_t pos;
-
- pos = hubbub_inputstream_cur_pos(tokeniser->input,
- &len);
+ hubbub_string *chars = &tokeniser->context.chars;
/* no need to check that there are enough characters,
* since you can only run into this if the flag is
* true in the first place, which requires four
* characters. */
if (tokeniser->escape_flag == true &&
- (tokeniser->content_model ==
- HUBBUB_CONTENT_MODEL_RCDATA ||
- tokeniser->content_model ==
- HUBBUB_CONTENT_MODEL_CDATA) &&
- hubbub_inputstream_compare_range_ascii(
- tokeniser->input, pos - 2, 3,
- "-->", SLEN("-->")) == 0)
- {
- tokeniser->escape_flag = false;
+ (tokeniser->content_model ==
+ HUBBUB_CONTENT_MODEL_RCDATA ||
+ tokeniser->content_model ==
+ HUBBUB_CONTENT_MODEL_CDATA)) {
+
+ cptr = parserutils_inputstream_peek(
+ tokeniser->input,
+ chars->len - 2, &len);
+
+ if (strncmp((char *)cptr,
+ "-->", SLEN("-->")) == 0) {
+ tokeniser->escape_flag = false;
+ }
}
- if (tokeniser->context.current_chars.len == 0) {
- tokeniser->context.current_chars.data.off =
- pos;
+ COLLECT_MS(tokeniser->context.chars, cptr, len);
+ } else if (c == '\0') {
+ if (tokeniser->context.chars.len > 0) {
+ /* Emit any pending characters */
+ emit_current_chars(tokeniser);
}
- tokeniser->context.current_chars.len += len;
- hubbub_inputstream_advance(tokeniser->input);
- } else {
- uint32_t pos;
- size_t len;
+ /* Emit a replacement character */
+ emit_character_token(tokeniser, &u_fffd_str);
- /* Accumulate characters into buffer */
- pos = hubbub_inputstream_cur_pos(tokeniser->input,
+ /* Advance past NUL */
+ parserutils_inputstream_advance(tokeniser->input, 1);
+ } else if (c == '\r') {
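+			/* Peek past the CR so both CRLF and a lone CR end
+			 * up as a single LF */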
+ cptr = parserutils_inputstream_peek(
+ tokeniser->input,
+ tokeniser->context.chars.len + len,
&len);
- if (tokeniser->context.current_chars.len == 0) {
- tokeniser->context.current_chars.data.off =
- pos;
+ if (cptr == PARSERUTILS_INPUTSTREAM_OOD) {
+ break;
+ }
+
+ if (tokeniser->context.chars.len > 0) {
+ /* Emit any pending characters */
+ emit_current_chars(tokeniser);
}
- tokeniser->context.current_chars.len += len;
- hubbub_inputstream_advance(tokeniser->input);
+ c = CHAR(cptr);
+ if (c != '\n') {
+ /* Emit newline */
+ emit_character_token(tokeniser, &lf_str);
+ }
+
+ /* Advance over */
+ parserutils_inputstream_advance(tokeniser->input, 1);
+ } else {
+ /* Just collect into buffer */
+ COLLECT_MS(tokeniser->context.chars, cptr, len);
}
}
- if (tokeniser->state != HUBBUB_TOKENISER_STATE_TAG_OPEN &&
- tokeniser->context.current_chars.len > 0) {
+ if (tokeniser->state != STATE_TAG_OPEN &&
+ (tokeniser->state != STATE_DATA ||
+ cptr == PARSERUTILS_INPUTSTREAM_EOF) &&
+ tokeniser->context.chars.len > 0) {
/* Emit any pending characters */
- token.type = HUBBUB_TOKEN_CHARACTER;
- token.data.character = tokeniser->context.current_chars;
-
- hubbub_tokeniser_emit_token(tokeniser, &token);
-
- tokeniser->context.current_chars.data.off = 0;
- tokeniser->context.current_chars.len = 0;
+ emit_current_chars(tokeniser);
}
- if (c == HUBBUB_INPUTSTREAM_EOF) {
+ if (cptr == PARSERUTILS_INPUTSTREAM_EOF) {
token.type = HUBBUB_TOKEN_EOF;
-
hubbub_tokeniser_emit_token(tokeniser, &token);
}
- return (c != HUBBUB_INPUTSTREAM_EOF && c != HUBBUB_INPUTSTREAM_OOD);
+ return (cptr != PARSERUTILS_INPUTSTREAM_EOF && cptr != PARSERUTILS_INPUTSTREAM_OOD);
}
+/* emit any pending tokens before calling */
bool hubbub_tokeniser_handle_character_reference_data(hubbub_tokeniser *tokeniser)
{
+ assert(tokeniser->context.chars.len == 0);
+
if (tokeniser->context.match_entity.complete == false) {
- return hubbub_tokeniser_consume_character_reference(tokeniser);
+ return hubbub_tokeniser_consume_character_reference(tokeniser,
+ tokeniser->context.chars.len);
} else {
hubbub_token token;
-#ifndef NDEBUG
- uint32_t c = hubbub_inputstream_peek(tokeniser->input);
- assert(c != HUBBUB_INPUTSTREAM_OOD &&
- c != HUBBUB_INPUTSTREAM_EOF);
-#endif
+ uint8_t utf8[6];
+ uint8_t *utf8ptr = utf8;
+ size_t len = sizeof(utf8);
- /* Emit character */
token.type = HUBBUB_TOKEN_CHARACTER;
- token.data.character.type = HUBBUB_STRING_OFF;
- token.data.character.data.off =
- hubbub_inputstream_cur_pos(tokeniser->input,
- &token.data.character.len);
- hubbub_tokeniser_emit_token(tokeniser, &token);
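+		/* Nonzero codepoint: a character reference matched, so
+		 * encode it as UTF-8 and emit that instead of the source
+		 * text */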
+ if (tokeniser->context.match_entity.codepoint) {
+ parserutils_charset_utf8_from_ucs4(
+ tokeniser->context.match_entity.codepoint,
+ &utf8ptr, &len);
+
+ token.data.character.ptr = utf8;
+ token.data.character.len = sizeof(utf8) - len;
+
+ hubbub_tokeniser_emit_token(tokeniser, &token);
+
+ /* +1 for ampersand */
+ parserutils_inputstream_advance(tokeniser->input,
+ tokeniser->context.match_entity.length
+ + 1);
+ } else {
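+			/* No reference matched: emit the '&' from the
+			 * input as-is */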
+ uintptr_t cptr = parserutils_inputstream_peek(
+ tokeniser->input,
+ tokeniser->context.chars.len,
+ &len);
+
+ token.data.character.ptr = (uint8_t *)cptr;
+ token.data.character.len = len;
+
+ hubbub_tokeniser_emit_token(tokeniser, &token);
+ parserutils_inputstream_advance(tokeniser->input, len);
+ }
/* Reset for next time */
tokeniser->context.match_entity.complete = false;
- tokeniser->state = HUBBUB_TOKENISER_STATE_DATA;
-
- hubbub_inputstream_advance(tokeniser->input);
+ tokeniser->state = STATE_DATA;
}
return true;
}
+/* this state always switches to another state straight away */
+/* this state expects the current character to be '<' */
bool hubbub_tokeniser_handle_tag_open(hubbub_tokeniser *tokeniser)
{
- uint32_t c = hubbub_inputstream_peek(tokeniser->input);
hubbub_tag *ctag = &tokeniser->context.current_tag;
- uint32_t pos;
+
size_t len;
+ uintptr_t cptr = parserutils_inputstream_peek(
+ tokeniser->input, tokeniser->context.chars.len, &len);
- if (c == HUBBUB_INPUTSTREAM_OOD)
- return false;
+ assert(tokeniser->context.chars.len == 1);
+/* assert(tokeniser->context.chars.ptr[0] == '<'); */
- if (tokeniser->content_model == HUBBUB_CONTENT_MODEL_RCDATA ||
- tokeniser->content_model ==
- HUBBUB_CONTENT_MODEL_CDATA) {
- if (c == '/') {
- pos = hubbub_inputstream_cur_pos(tokeniser->input,
- &len);
+ if (cptr == PARSERUTILS_INPUTSTREAM_OOD) {
+ return false;
+ } else if (cptr == PARSERUTILS_INPUTSTREAM_EOF) {
+ /* Emit '<' */
+ emit_character_token(tokeniser,
+ &tokeniser->context.chars);
- tokeniser->context.current_chars.len += len;
+ tokeniser->state = STATE_DATA;
+ return true;
+ }
- tokeniser->context.close_tag_match.match = false;
- tokeniser->context.close_tag_match.count = 0;
+ uint8_t c = CHAR(cptr);
- tokeniser->state =
- HUBBUB_TOKENISER_STATE_CLOSE_TAG_OPEN;
- hubbub_inputstream_advance(tokeniser->input);
- } else {
- hubbub_token token;
+ if (c == '/') {
+ COLLECT_NOBUF(tokeniser->context.chars, len);
- /* Emit '<' */
- token.type = HUBBUB_TOKEN_CHARACTER;
- token.data.character =
- tokeniser->context.current_chars;
-
- hubbub_tokeniser_emit_token(tokeniser, &token);
+ tokeniser->context.close_tag_match.match = false;
+ tokeniser->context.close_tag_match.count = 0;
- tokeniser->state =
- HUBBUB_TOKENISER_STATE_DATA;
- }
+ tokeniser->state = STATE_CLOSE_TAG_OPEN;
+ } else if (tokeniser->content_model == HUBBUB_CONTENT_MODEL_RCDATA ||
+ tokeniser->content_model ==
+ HUBBUB_CONTENT_MODEL_CDATA) {
+ /* Return to data state with '<' still in "chars" */
+ tokeniser->state = STATE_DATA;
} else if (tokeniser->content_model == HUBBUB_CONTENT_MODEL_PCDATA) {
if (c == '!') {
- pos = hubbub_inputstream_cur_pos(tokeniser->input,
- &len);
-
- tokeniser->context.current_chars.len += len;
-
- tokeniser->state =
- HUBBUB_TOKENISER_STATE_MARKUP_DECLARATION_OPEN;
- hubbub_inputstream_advance(tokeniser->input);
- } else if (c == '/') {
- pos = hubbub_inputstream_cur_pos(tokeniser->input,
- &len);
-
- tokeniser->context.current_chars.len += len;
+ parserutils_inputstream_advance(tokeniser->input,
+ SLEN("<!"));
- tokeniser->context.close_tag_match.match = false;
- tokeniser->context.close_tag_match.count = 0;
-
- tokeniser->state =
- HUBBUB_TOKENISER_STATE_CLOSE_TAG_OPEN;
- hubbub_inputstream_advance(tokeniser->input);
+ tokeniser->context.chars.len = 0;
+ tokeniser->state = STATE_MARKUP_DECLARATION_OPEN;
} else if ('A' <= c && c <= 'Z') {
- hubbub_inputstream_lowercase(tokeniser->input);
-
+ COLLECT_NOBUF(tokeniser->context.chars, len);
tokeniser->context.current_tag_type =
HUBBUB_TOKEN_START_TAG;
- ctag->name.data.off =
- hubbub_inputstream_cur_pos(tokeniser->input,
- &ctag->name.len);
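+			/* c is A-Z here, so adding 0x20 yields the
+			 * lowercase form */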
+ uint8_t lc = (c + 0x20);
+ START_BUF(ctag->name, &lc, len);
ctag->n_attributes = 0;
- tokeniser->state =
- HUBBUB_TOKENISER_STATE_TAG_NAME;
- hubbub_inputstream_advance(tokeniser->input);
+ tokeniser->state = STATE_TAG_NAME;
} else if ('a' <= c && c <= 'z') {
+ COLLECT_NOBUF(tokeniser->context.chars, len);
tokeniser->context.current_tag_type =
HUBBUB_TOKEN_START_TAG;
- ctag->name.data.off =
- hubbub_inputstream_cur_pos(tokeniser->input,
- &ctag->name.len);
+ START_BUF(ctag->name, (uint8_t *)cptr, len);
ctag->n_attributes = 0;
- tokeniser->state =
- HUBBUB_TOKENISER_STATE_TAG_NAME;
- hubbub_inputstream_advance(tokeniser->input);
+ tokeniser->state = STATE_TAG_NAME;
} else if (c == '>') {
- hubbub_token token;
-
- pos = hubbub_inputstream_cur_pos(tokeniser->input,
- &len);
- tokeniser->context.current_chars.len += len;
+ /** \todo parse error */
- /* Emit "<>" */
- token.type = HUBBUB_TOKEN_CHARACTER;
- token.data.character =
- tokeniser->context.current_chars;
-
- hubbub_tokeniser_emit_token(tokeniser, &token);
-
- tokeniser->state =
- HUBBUB_TOKENISER_STATE_DATA;
-
- hubbub_inputstream_advance(tokeniser->input);
+ COLLECT_NOBUF(tokeniser->context.chars, len);
+ tokeniser->state = STATE_DATA;
} else if (c == '?') {
- pos = hubbub_inputstream_cur_pos(tokeniser->input,
- &len);
- tokeniser->context.current_chars.len += len;
+ /** \todo parse error */
- tokeniser->context.current_comment.data.off = pos;
- tokeniser->context.current_comment.len = len;
- tokeniser->state =
- HUBBUB_TOKENISER_STATE_BOGUS_COMMENT;
- hubbub_inputstream_advance(tokeniser->input);
- } else {
- hubbub_token token;
+ /* Cursor still at "<", need to advance past it */
+ parserutils_inputstream_advance(
+ tokeniser->input, SLEN("<"));
+ tokeniser->context.chars.len = 0;
+ tokeniser->state = STATE_BOGUS_COMMENT;
+ } else {
/* Emit '<' */
- token.type = HUBBUB_TOKEN_CHARACTER;
- token.data.character =
- tokeniser->context.current_chars;
-
- hubbub_tokeniser_emit_token(tokeniser, &token);
+ emit_character_token(tokeniser,
+ &tokeniser->context.chars);
- tokeniser->state =
- HUBBUB_TOKENISER_STATE_DATA;
+ tokeniser->state = STATE_DATA;
}
}
return true;
}
+/* this state expects tokeniser->context.chars to be "</" */
+/* this state never stays in this state for more than one character */
bool hubbub_tokeniser_handle_close_tag_open(hubbub_tokeniser *tokeniser)
{
hubbub_tokeniser_context *ctx = &tokeniser->context;
- /**\todo Handle the fragment case here */
+ size_t len;
+ uintptr_t cptr = parserutils_inputstream_peek(
+ tokeniser->input, tokeniser->context.chars.len, &len);
+
+ assert(tokeniser->context.chars.len == 2);
+/* assert(tokeniser->context.chars.ptr[0] == '<'); */
+/* assert(tokeniser->context.chars.ptr[1] == '/'); */
+
+ if (cptr == PARSERUTILS_INPUTSTREAM_OOD) {
+ return false;
+ } else if (cptr == PARSERUTILS_INPUTSTREAM_EOF) {
+ emit_current_chars(tokeniser);
+ tokeniser->state = STATE_DATA;
+ return true;
+ }
+
+ uint8_t c = CHAR(cptr);
+
+ /**\todo fragment case */
if (tokeniser->content_model == HUBBUB_CONTENT_MODEL_RCDATA ||
tokeniser->content_model ==
HUBBUB_CONTENT_MODEL_CDATA) {
- uint32_t c;
- hubbub_string start_tag =
+ uint8_t *start_tag_name =
tokeniser->context.last_start_tag_name;
+ size_t start_tag_len =
+ tokeniser->context.last_start_tag_len;
- while ((c = hubbub_inputstream_peek(tokeniser->input)) !=
- HUBBUB_INPUTSTREAM_EOF &&
- c != HUBBUB_INPUTSTREAM_OOD &&
- ctx->close_tag_match.match != true) {
- uint32_t pos;
- size_t len;
-
- pos = hubbub_inputstream_cur_pos(tokeniser->input,
- &len);
-
- if (ctx->close_tag_match.count+1 ==
- start_tag.len) {
- ctx->close_tag_match.match = true;
- } else if (hubbub_inputstream_compare_range_ci(
- tokeniser->input, pos,
- start_tag.data.off +
+ while ((cptr = parserutils_inputstream_peek(tokeniser->input,
+ ctx->chars.len +
ctx->close_tag_match.count,
- len) != 0) {
+ &len)) !=
+ PARSERUTILS_INPUTSTREAM_EOF &&
+ cptr != PARSERUTILS_INPUTSTREAM_OOD) {
+ c = CHAR(cptr);
+
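+			/* Case-insensitive ASCII comparison: masking out
+			 * 0x20 folds letters to a single case */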
+ if ((start_tag_name[ctx->close_tag_match.count] & ~0x20)
+ != (c & ~0x20)) {
break;
}
- hubbub_inputstream_advance(tokeniser->input);
ctx->close_tag_match.count += len;
+
+ if (ctx->close_tag_match.count == start_tag_len) {
+ ctx->close_tag_match.match = true;
+ break;
+ }
}
- if (c == HUBBUB_INPUTSTREAM_OOD)
+ if (cptr == PARSERUTILS_INPUTSTREAM_OOD) {
return false;
+ }
- if (ctx->close_tag_match.match) {
- c = hubbub_inputstream_peek(tokeniser->input);
- if (c != '\t' && c != '\n' && c != '\f' &&
- c != ' ' && c != '>' && c != '/' &&
- c != HUBBUB_INPUTSTREAM_EOF) {
- ctx->close_tag_match.match = false;
+ if (ctx->close_tag_match.match == true) {
+ cptr = parserutils_inputstream_peek(
+ tokeniser->input,
+ ctx->chars.len +
+ ctx->close_tag_match.count,
+ &len);
+
+ if (cptr == PARSERUTILS_INPUTSTREAM_OOD) {
+ return false;
+ } else if (cptr != PARSERUTILS_INPUTSTREAM_EOF) {
+ c = CHAR(cptr);
+
+ if (c != '\t' && c != '\n' && c != '\f' &&
+ c != ' ' && c != '>' &&
+ c != '/') {
+ ctx->close_tag_match.match = false;
+ }
}
}
-
- /* After a match (or not), rewind */
- hubbub_inputstream_rewind(tokeniser->input,
- tokeniser->context.close_tag_match.count);
- tokeniser->context.close_tag_match.count = 0;
}
- if (ctx->close_tag_match.match == false && tokeniser->content_model !=
- HUBBUB_CONTENT_MODEL_PCDATA) {
- hubbub_token token;
-
- uint32_t pos;
- size_t len;
-
- /* emit a '</' character token -- by rewinding */
- hubbub_inputstream_rewind(tokeniser->input, 2);
-
- pos = hubbub_inputstream_cur_pos(tokeniser->input, &len);
+ if (ctx->close_tag_match.match == false &&
+ tokeniser->content_model !=
+ HUBBUB_CONTENT_MODEL_PCDATA) {
+ /* We should emit "</" here, but instead we leave it in the
+ * buffer so the data state emits it with any characters
+ * following it */
+ tokeniser->state = STATE_DATA;
+ } else {
+ cptr = parserutils_inputstream_peek(tokeniser->input,
+ tokeniser->context.chars.len, &len);
- token.type = HUBBUB_TOKEN_CHARACTER;
- token.data.character.type = HUBBUB_STRING_OFF;
- token.data.character.data.off = pos;
- token.data.character.len = 2;
+ if (cptr == PARSERUTILS_INPUTSTREAM_OOD) {
+ return false;
+ } else if (cptr == PARSERUTILS_INPUTSTREAM_EOF) {
+ /** \todo parse error */
- hubbub_tokeniser_emit_token(tokeniser, &token);
+ /* Emit "</" */
+ emit_current_chars(tokeniser);
+ tokeniser->state = STATE_DATA;
+ return true;
+ }
- hubbub_inputstream_advance(tokeniser->input);
- hubbub_inputstream_advance(tokeniser->input);
-
- tokeniser->state = HUBBUB_TOKENISER_STATE_DATA;
- } else {
- hubbub_tag *ctag = &tokeniser->context.current_tag;
- uint32_t c = hubbub_inputstream_peek(tokeniser->input);
- uint32_t pos;
- size_t len;
+ c = CHAR(cptr);
if ('A' <= c && c <= 'Z') {
- hubbub_inputstream_lowercase(tokeniser->input);
-
- pos = hubbub_inputstream_cur_pos(tokeniser->input,
- &len);
+ COLLECT_NOBUF(tokeniser->context.chars, len);
tokeniser->context.current_tag_type =
HUBBUB_TOKEN_END_TAG;
- ctag->name.data.off = pos;
- ctag->name.len = len;
- ctag->n_attributes = 0;
- tokeniser->state = HUBBUB_TOKENISER_STATE_TAG_NAME;
- hubbub_inputstream_advance(tokeniser->input);
+ uint8_t lc = (c + 0x20);
+ START_BUF(tokeniser->context.current_tag.name,
+ &lc, len);
+ tokeniser->context.current_tag.n_attributes = 0;
+
+ tokeniser->state = STATE_TAG_NAME;
} else if ('a' <= c && c <= 'z') {
- pos = hubbub_inputstream_cur_pos(tokeniser->input,
- &len);
+ COLLECT_NOBUF(tokeniser->context.chars, len);
tokeniser->context.current_tag_type =
HUBBUB_TOKEN_END_TAG;
- ctag->name.data.off = pos;
- ctag->name.len = len;
- ctag->n_attributes = 0;
+ START_BUF(tokeniser->context.current_tag.name,
+ (uint8_t *) cptr, len);
+ tokeniser->context.current_tag.n_attributes = 0;
- tokeniser->state = HUBBUB_TOKENISER_STATE_TAG_NAME;
- hubbub_inputstream_advance(tokeniser->input);
+ tokeniser->state = STATE_TAG_NAME;
} else if (c == '>') {
- tokeniser->state = HUBBUB_TOKENISER_STATE_DATA;
- hubbub_inputstream_advance(tokeniser->input);
- } else if (c == HUBBUB_INPUTSTREAM_EOF) {
- hubbub_token token;
+ /* Cursor still at "</", need to collect ">" */
+ COLLECT_NOBUF(tokeniser->context.chars, len);
- /* Emit "</" */
- token.type = HUBBUB_TOKEN_CHARACTER;
- token.data.character =
- tokeniser->context.current_chars;
-
- hubbub_tokeniser_emit_token(tokeniser, &token);
+ /* Now need to advance past "</>" */
+ parserutils_inputstream_advance(tokeniser->input,
+ tokeniser->context.chars.len);
+ tokeniser->context.chars.len = 0;
- tokeniser->state = HUBBUB_TOKENISER_STATE_DATA;
- } else if (c != HUBBUB_INPUTSTREAM_OOD) {
- pos = hubbub_inputstream_cur_pos(tokeniser->input,
- &len);
+ /** \todo parse error */
+ tokeniser->state = STATE_DATA;
+ } else {
+ /** \todo parse error */
- tokeniser->context.current_comment.data.off = pos;
- tokeniser->context.current_comment.len = len;
+ /* Cursor still at "</", need to advance past it */
+ parserutils_inputstream_advance(tokeniser->input,
+ tokeniser->context.chars.len);
+ tokeniser->context.chars.len = 0;
- tokeniser->state =
- HUBBUB_TOKENISER_STATE_BOGUS_COMMENT;
- hubbub_inputstream_advance(tokeniser->input);
- } else {
- /* Out of data */
- return false;
+ tokeniser->state = STATE_BOGUS_COMMENT;
}
}
return true;
}
+/* this state expects tokeniser->context.current_tag to already have its
+ first character set */
bool hubbub_tokeniser_handle_tag_name(hubbub_tokeniser *tokeniser)
{
hubbub_tag *ctag = &tokeniser->context.current_tag;
- uint32_t c = hubbub_inputstream_peek(tokeniser->input);
- if (c == HUBBUB_INPUTSTREAM_OOD)
- return false;
-
- if (c == '\t' || c == '\n' || c == '\f' || c == ' ') {
- tokeniser->state =
- HUBBUB_TOKENISER_STATE_BEFORE_ATTRIBUTE_NAME;
- hubbub_inputstream_advance(tokeniser->input);
- } else if (c == '>') {
- hubbub_token token;
-
- /* Emit current tag */
- token.type = tokeniser->context.current_tag_type;
- token.data.tag = tokeniser->context.current_tag;
-
- hubbub_tokeniser_emit_token(tokeniser, &token);
-
- tokeniser->state = HUBBUB_TOKENISER_STATE_DATA;
- hubbub_inputstream_advance(tokeniser->input);
- } else if ('A' <= c && c <= 'Z') {
- uint32_t pos;
- size_t len;
+ size_t len;
+ uintptr_t cptr = parserutils_inputstream_peek(
+ tokeniser->input, tokeniser->context.chars.len, &len);
- hubbub_inputstream_lowercase(tokeniser->input);
+ assert(tokeniser->context.chars.len > 0);
+/* assert(tokeniser->context.chars.ptr[0] == '<'); */
+ assert(ctag->name.len > 0);
+ assert(ctag->name.ptr);
- pos = hubbub_inputstream_cur_pos(tokeniser->input, &len);
+ if (cptr == PARSERUTILS_INPUTSTREAM_OOD) {
+ return false;
+ } else if (cptr == PARSERUTILS_INPUTSTREAM_EOF) {
+ tokeniser->state = STATE_DATA;
+ return emit_current_tag(tokeniser);
+ }
- ctag->name.len += len;
+ uint8_t c = CHAR(cptr);
- hubbub_inputstream_advance(tokeniser->input);
- } else if (c == HUBBUB_INPUTSTREAM_EOF) {
- hubbub_token token;
+ COLLECT_NOBUF(tokeniser->context.chars, len);
- /* Emit current tag */
- token.type = tokeniser->context.current_tag_type;
- token.data.tag = tokeniser->context.current_tag;
+ if (c == '\t' || c == '\n' || c == '\f' || c == ' ' || c == '\r') {
+ FINISH(ctag->name);
- hubbub_tokeniser_emit_token(tokeniser, &token);
+ tokeniser->state = STATE_BEFORE_ATTRIBUTE_NAME;
+ } else if (c == '>') {
+ FINISH(ctag->name);
- tokeniser->state = HUBBUB_TOKENISER_STATE_DATA;
+ emit_current_tag(tokeniser);
+ tokeniser->state = STATE_DATA;
+ } else if (c == '\0') {
+ COLLECT_CHAR(ctag->name, u_fffd, sizeof(u_fffd));
} else if (c == '/') {
- tokeniser->state =
- HUBBUB_TOKENISER_STATE_SELF_CLOSING_START_TAG;
- hubbub_inputstream_advance(tokeniser->input);
+ FINISH(ctag->name);
+ tokeniser->state = STATE_SELF_CLOSING_START_TAG;
+ } else if ('A' <= c && c <= 'Z') {
+ uint8_t lc = (c + 0x20);
+ COLLECT_CHAR(ctag->name, &lc, len);
} else {
- uint32_t pos;
- size_t len;
-
- pos = hubbub_inputstream_cur_pos(tokeniser->input, &len);
-
- ctag->name.len += len;
-
- hubbub_inputstream_advance(tokeniser->input);
+ COLLECT(ctag->name, cptr, len);
}
return true;
@@ -1085,76 +1241,35 @@ bool hubbub_tokeniser_handle_before_attribute_name(
hubbub_tokeniser *tokeniser)
{
hubbub_tag *ctag = &tokeniser->context.current_tag;
- uint32_t c = hubbub_inputstream_peek(tokeniser->input);
-
- if (c == HUBBUB_INPUTSTREAM_OOD)
- return false;
-
- if (c == '\t' || c == '\n' || c == '\f' || c == ' ') {
- hubbub_inputstream_advance(tokeniser->input);
- } else if (c == '>') {
- hubbub_token token;
-
- /* Emit current tag */
- token.type = tokeniser->context.current_tag_type;
- token.data.tag = tokeniser->context.current_tag;
-
- hubbub_tokeniser_emit_token(tokeniser, &token);
-
- tokeniser->state = HUBBUB_TOKENISER_STATE_DATA;
- hubbub_inputstream_advance(tokeniser->input);
- } else if ('A' <= c && c <= 'Z') {
- uint32_t pos;
- size_t len;
- hubbub_attribute *attr;
-
- hubbub_inputstream_lowercase(tokeniser->input);
- pos = hubbub_inputstream_cur_pos(tokeniser->input, &len);
-
- attr = tokeniser->alloc(ctag->attributes,
- (ctag->n_attributes + 1) *
- sizeof(hubbub_attribute),
- tokeniser->alloc_pw);
- if (attr == NULL) {
- /** \todo handle memory exhaustion */
- }
-
- ctag->attributes = attr;
+ size_t len;
+ uintptr_t cptr = parserutils_inputstream_peek(
+ tokeniser->input, tokeniser->context.chars.len, &len);
- attr[ctag->n_attributes].ns = HUBBUB_NS_NULL;
- attr[ctag->n_attributes].name.type = HUBBUB_STRING_OFF;
- attr[ctag->n_attributes].name.data.off = pos;
- attr[ctag->n_attributes].name.len = len;
- attr[ctag->n_attributes].value.type = HUBBUB_STRING_OFF;
- attr[ctag->n_attributes].value.data.off = 0;
- attr[ctag->n_attributes].value.len = 0;
+ if (cptr == PARSERUTILS_INPUTSTREAM_OOD) {
+ return false;
+ } else if (cptr == PARSERUTILS_INPUTSTREAM_EOF) {
+ tokeniser->state = STATE_DATA;
+ return emit_current_tag(tokeniser);
+ }
- ctag->n_attributes++;
+ uint8_t c = CHAR(cptr);
- tokeniser->state = HUBBUB_TOKENISER_STATE_ATTRIBUTE_NAME;
+ COLLECT_NOBUF(tokeniser->context.chars, len);
- hubbub_inputstream_advance(tokeniser->input);
+ if (c == '\t' || c == '\n' || c == '\f' || c == ' ' || c == '\r') {
+ /* pass over in silence */
+ } else if (c == '>') {
+ emit_current_tag(tokeniser);
+ tokeniser->state = STATE_DATA;
} else if (c == '/') {
- tokeniser->state =
- HUBBUB_TOKENISER_STATE_SELF_CLOSING_START_TAG;
- hubbub_inputstream_advance(tokeniser->input);
- } else if (c == HUBBUB_INPUTSTREAM_EOF) {
- hubbub_token token;
-
- /* Emit current tag */
- token.type = tokeniser->context.current_tag_type;
- token.data.tag = tokeniser->context.current_tag;
-
- hubbub_tokeniser_emit_token(tokeniser, &token);
-
- tokeniser->state = HUBBUB_TOKENISER_STATE_DATA;
+ tokeniser->state = STATE_SELF_CLOSING_START_TAG;
} else {
- uint32_t pos;
- size_t len;
hubbub_attribute *attr;
- pos = hubbub_inputstream_cur_pos(tokeniser->input, &len);
+ if (c == '"' || c == '\'' || c == '=') {
+ /** \todo parse error */
+ }
attr = tokeniser->alloc(ctag->attributes,
(ctag->n_attributes + 1) *
@@ -1166,19 +1281,24 @@ bool hubbub_tokeniser_handle_before_attribute_name(
ctag->attributes = attr;
+ if ('A' <= c && c <= 'Z') {
+ uint8_t lc = (c + 0x20);
+ START_BUF(attr[ctag->n_attributes].name, &lc, len);
+ } else if (c == '\0') {
+ START_BUF(attr[ctag->n_attributes].name,
+ u_fffd, sizeof(u_fffd));
+ } else {
+ START_BUF(attr[ctag->n_attributes].name,
+ (uint8_t *) cptr, len);
+ }
+
attr[ctag->n_attributes].ns = HUBBUB_NS_NULL;
- attr[ctag->n_attributes].name.type = HUBBUB_STRING_OFF;
- attr[ctag->n_attributes].name.data.off = pos;
- attr[ctag->n_attributes].name.len = len;
- attr[ctag->n_attributes].value.type = HUBBUB_STRING_OFF;
- attr[ctag->n_attributes].value.data.off = 0;
+ attr[ctag->n_attributes].value.ptr = NULL;
attr[ctag->n_attributes].value.len = 0;
ctag->n_attributes++;
- tokeniser->state = HUBBUB_TOKENISER_STATE_ATTRIBUTE_NAME;
-
- hubbub_inputstream_advance(tokeniser->input);
+ tokeniser->state = STATE_ATTRIBUTE_NAME;
}
return true;
@@ -1187,150 +1307,89 @@ bool hubbub_tokeniser_handle_before_attribute_name(
bool hubbub_tokeniser_handle_attribute_name(hubbub_tokeniser *tokeniser)
{
hubbub_tag *ctag = &tokeniser->context.current_tag;
- uint32_t c = hubbub_inputstream_peek(tokeniser->input);
-
- if (c == HUBBUB_INPUTSTREAM_OOD)
- return false;
-
- if (c == '\t' || c == '\n' || c == '\f' || c == ' ') {
- tokeniser->state =
- HUBBUB_TOKENISER_STATE_AFTER_ATTRIBUTE_NAME;
- hubbub_inputstream_advance(tokeniser->input);
- } else if (c == '=') {
- tokeniser->state =
- HUBBUB_TOKENISER_STATE_BEFORE_ATTRIBUTE_VALUE;
- hubbub_inputstream_advance(tokeniser->input);
- } else if (c == '>') {
- hubbub_token token;
- /* Emit current tag */
- token.type = tokeniser->context.current_tag_type;
- token.data.tag = tokeniser->context.current_tag;
+ size_t len;
+ uintptr_t cptr = parserutils_inputstream_peek(
+ tokeniser->input, tokeniser->context.chars.len, &len);
- hubbub_tokeniser_emit_token(tokeniser, &token);
+ assert(ctag->attributes[ctag->n_attributes - 1].name.len > 0);
- tokeniser->state = HUBBUB_TOKENISER_STATE_DATA;
- hubbub_inputstream_advance(tokeniser->input);
- } else if ('A' <= c && c <= 'Z') {
- uint32_t pos;
- size_t len;
+ if (cptr == PARSERUTILS_INPUTSTREAM_OOD) {
+ return false;
+ } else if (cptr == PARSERUTILS_INPUTSTREAM_EOF) {
+ tokeniser->state = STATE_DATA;
+ return emit_current_tag(tokeniser);
+ }
- hubbub_inputstream_lowercase(tokeniser->input);
+ uint8_t c = CHAR(cptr);
- pos = hubbub_inputstream_cur_pos(tokeniser->input, &len);
+ COLLECT_NOBUF(tokeniser->context.chars, len);
- ctag->attributes[ctag->n_attributes - 1].name.len += len;
+ if (c == '\t' || c == '\n' || c == '\f' || c == ' ' || c == '\r') {
+ FINISH(ctag->attributes[ctag->n_attributes - 1].name);
+ tokeniser->state = STATE_AFTER_ATTRIBUTE_NAME;
+ } else if (c == '=') {
+ FINISH(ctag->attributes[ctag->n_attributes - 1].name);
+ tokeniser->state = STATE_BEFORE_ATTRIBUTE_VALUE;
+ } else if (c == '>') {
+ FINISH(ctag->attributes[ctag->n_attributes - 1].name);
- hubbub_inputstream_advance(tokeniser->input);
+ emit_current_tag(tokeniser);
+ tokeniser->state = STATE_DATA;
} else if (c == '/') {
- tokeniser->state =
- HUBBUB_TOKENISER_STATE_SELF_CLOSING_START_TAG;
- hubbub_inputstream_advance(tokeniser->input);
- } else if (c == HUBBUB_INPUTSTREAM_EOF) {
- hubbub_token token;
-
- /* Emit current tag */
- token.type = tokeniser->context.current_tag_type;
- token.data.tag = tokeniser->context.current_tag;
-
- hubbub_tokeniser_emit_token(tokeniser, &token);
-
- tokeniser->state = HUBBUB_TOKENISER_STATE_DATA;
+ FINISH(ctag->attributes[ctag->n_attributes - 1].name);
+ tokeniser->state = STATE_SELF_CLOSING_START_TAG;
+ } else if (c == '\0') {
+ COLLECT_CHAR(ctag->attributes[ctag->n_attributes - 1].name,
+ u_fffd, sizeof(u_fffd));
+ } else if ('A' <= c && c <= 'Z') {
+ uint8_t lc = (c + 0x20);
+ COLLECT_CHAR(ctag->attributes[ctag->n_attributes - 1].name,
+ &lc, len);
} else {
- uint32_t pos;
- size_t len;
-
- pos = hubbub_inputstream_cur_pos(tokeniser->input, &len);
-
- ctag->attributes[ctag->n_attributes - 1].name.len += len;
-
- hubbub_inputstream_advance(tokeniser->input);
+ COLLECT(ctag->attributes[ctag->n_attributes - 1].name,
+ cptr, len);
}
return true;
}
-bool hubbub_tokeniser_handle_after_attribute_name(
- hubbub_tokeniser *tokeniser)
+bool hubbub_tokeniser_handle_after_attribute_name(hubbub_tokeniser *tokeniser)
{
hubbub_tag *ctag = &tokeniser->context.current_tag;
- uint32_t c = hubbub_inputstream_peek(tokeniser->input);
- if (c == HUBBUB_INPUTSTREAM_OOD)
+ size_t len;
+ uintptr_t cptr = parserutils_inputstream_peek(
+ tokeniser->input, tokeniser->context.chars.len, &len);
+
+ if (cptr == PARSERUTILS_INPUTSTREAM_OOD) {
return false;
+ } else if (cptr == PARSERUTILS_INPUTSTREAM_EOF) {
+ tokeniser->state = STATE_DATA;
+ return emit_current_tag(tokeniser);
+ }
- if (c == '\t' || c == '\n' || c == '\f' || c == ' ') {
- hubbub_inputstream_advance(tokeniser->input);
+ uint8_t c = CHAR(cptr);
+
+ if (c == '\t' || c == '\n' || c == '\f' || c == ' ' || c == '\r') {
+ COLLECT_NOBUF(tokeniser->context.chars, len);
} else if (c == '=') {
- tokeniser->state =
- HUBBUB_TOKENISER_STATE_BEFORE_ATTRIBUTE_VALUE;
- hubbub_inputstream_advance(tokeniser->input);
+ COLLECT_NOBUF(tokeniser->context.chars, len);
+ tokeniser->state = STATE_BEFORE_ATTRIBUTE_VALUE;
} else if (c == '>') {
- hubbub_token token;
-
- /* Emit current tag */
- token.type = tokeniser->context.current_tag_type;
- token.data.tag = tokeniser->context.current_tag;
+ COLLECT_NOBUF(tokeniser->context.chars, len);
- hubbub_tokeniser_emit_token(tokeniser, &token);
-
- tokeniser->state = HUBBUB_TOKENISER_STATE_DATA;
- hubbub_inputstream_advance(tokeniser->input);
- } else if ('A' <= c && c <= 'Z') {
- uint32_t pos;
- size_t len;
- hubbub_attribute *attr;
-
- hubbub_inputstream_lowercase(tokeniser->input);
-
- pos = hubbub_inputstream_cur_pos(tokeniser->input, &len);
-
- attr = tokeniser->alloc(ctag->attributes,
- (ctag->n_attributes + 1) *
- sizeof(hubbub_attribute),
- tokeniser->alloc_pw);
- if (attr == NULL) {
- /** \todo handle memory exhaustion */
- }
-
- ctag->attributes = attr;
-
- attr[ctag->n_attributes].ns = HUBBUB_NS_NULL;
- attr[ctag->n_attributes].name.type = HUBBUB_STRING_OFF;
- attr[ctag->n_attributes].name.data.off = pos;
- attr[ctag->n_attributes].name.len = len;
- attr[ctag->n_attributes].value.type = HUBBUB_STRING_OFF;
- attr[ctag->n_attributes].value.data.off = 0;
- attr[ctag->n_attributes].value.len = 0;
-
- ctag->n_attributes++;
-
- tokeniser->state = HUBBUB_TOKENISER_STATE_ATTRIBUTE_NAME;
-
- hubbub_inputstream_advance(tokeniser->input);
+ emit_current_tag(tokeniser);
+ tokeniser->state = STATE_DATA;
} else if (c == '/') {
- /** \todo permitted slash */
- tokeniser->state =
- HUBBUB_TOKENISER_STATE_BEFORE_ATTRIBUTE_NAME;
- hubbub_inputstream_advance(tokeniser->input);
- } else if (c == HUBBUB_INPUTSTREAM_EOF) {
- hubbub_token token;
-
- /* Emit current tag */
- token.type = tokeniser->context.current_tag_type;
- token.data.tag = tokeniser->context.current_tag;
-
- hubbub_tokeniser_emit_token(tokeniser, &token);
-
- tokeniser->state = HUBBUB_TOKENISER_STATE_DATA;
+ COLLECT_NOBUF(tokeniser->context.chars, len);
+ tokeniser->state = STATE_SELF_CLOSING_START_TAG;
} else {
- uint32_t pos;
- size_t len;
hubbub_attribute *attr;
- hubbub_inputstream_lowercase(tokeniser->input);
-
- pos = hubbub_inputstream_cur_pos(tokeniser->input, &len);
+ if (c == '"' || c == '\'' || c == '=') {
+ /** \todo parse error */
+ }
attr = tokeniser->alloc(ctag->attributes,
(ctag->n_attributes + 1) *
@@ -1342,76 +1401,74 @@ bool hubbub_tokeniser_handle_after_attribute_name(
ctag->attributes = attr;
+ if ('A' <= c && c <= 'Z') {
+ uint8_t lc = (c + 0x20);
+ START_BUF(attr[ctag->n_attributes].name, &lc, len);
+ } else if (c == '\0') {
+ START_BUF(attr[ctag->n_attributes].name,
+ u_fffd, sizeof(u_fffd));
+ } else {
+ START_BUF(attr[ctag->n_attributes].name,
+ (uint8_t *)cptr, len);
+ }
+
attr[ctag->n_attributes].ns = HUBBUB_NS_NULL;
- attr[ctag->n_attributes].name.type = HUBBUB_STRING_OFF;
- attr[ctag->n_attributes].name.data.off = pos;
- attr[ctag->n_attributes].name.len = len;
- attr[ctag->n_attributes].value.type = HUBBUB_STRING_OFF;
- attr[ctag->n_attributes].value.data.off = 0;
+ attr[ctag->n_attributes].value.ptr = NULL;
attr[ctag->n_attributes].value.len = 0;
ctag->n_attributes++;
- tokeniser->state = HUBBUB_TOKENISER_STATE_ATTRIBUTE_NAME;
-
- hubbub_inputstream_advance(tokeniser->input);
+ COLLECT_NOBUF(tokeniser->context.chars, len);
+ tokeniser->state = STATE_ATTRIBUTE_NAME;
}
return true;
}
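
The attribute list in the state above is grown one slot at a time through the tokeniser's realloc-style allocator callback (the (ptr, size, pw) call shape visible in tokeniser->alloc(...)). A minimal standalone sketch of that pattern follows; attr_t and the helper names are simplified assumptions for illustration, not hubbub's real definitions, and it resolves the \todo above by keeping the old array when allocation fails.

#include <stdio.h>
#include <stdlib.h>
#include <string.h>

/* Simplified stand-in for hubbub_attribute; illustration only. */
typedef struct {
	const char *name;
	const char *value;
} attr_t;

/* Same (ptr, size, pw) shape as the tokeniser's allocator callback.
 * Treating size == 0 as "free" is this sketch's convention. */
typedef void *(*alloc_fn)(void *ptr, size_t size, void *pw);

static void *my_alloc(void *ptr, size_t size, void *pw)
{
	(void) pw;
	if (size == 0) {
		free(ptr);
		return NULL;
	}
	return realloc(ptr, size);
}

/* Grow the array by exactly one entry, as the tokeniser does per
 * attribute. On failure the old array remains valid and is kept. */
static int append_attr(attr_t **attrs, size_t *n_attrs,
		alloc_fn alloc, void *pw,
		const char *name, const char *value)
{
	attr_t *tmp = alloc(*attrs, (*n_attrs + 1) * sizeof(attr_t), pw);
	if (tmp == NULL)
		return -1;

	tmp[*n_attrs].name = name;
	tmp[*n_attrs].value = value;

	*attrs = tmp;
	(*n_attrs)++;

	return 0;
}

int main(void)
{
	attr_t *attrs = NULL;
	size_t n_attrs = 0;

	append_attr(&attrs, &n_attrs, my_alloc, NULL, "href", "#");
	append_attr(&attrs, &n_attrs, my_alloc, NULL, "class", "link");

	for (size_t i = 0; i < n_attrs; i++)
		printf("%s=\"%s\"\n", attrs[i].name, attrs[i].value);

	my_alloc(attrs, 0, NULL);
	return 0;
}
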
+/* this state is only ever triggered by an '=' */
bool hubbub_tokeniser_handle_before_attribute_value(
hubbub_tokeniser *tokeniser)
{
hubbub_tag *ctag = &tokeniser->context.current_tag;
- uint32_t c = hubbub_inputstream_peek(tokeniser->input);
- if (c == HUBBUB_INPUTSTREAM_OOD)
+ size_t len;
+ uintptr_t cptr = parserutils_inputstream_peek(
+ tokeniser->input, tokeniser->context.chars.len, &len);
+
+ if (cptr == PARSERUTILS_INPUTSTREAM_OOD) {
return false;
+ } else if (cptr == PARSERUTILS_INPUTSTREAM_EOF) {
+ tokeniser->state = STATE_DATA;
+ return emit_current_tag(tokeniser);
+ }
+
+ uint8_t c = CHAR(cptr);
- if (c == '\t' || c == '\n' || c == '\f' || c == ' ') {
- hubbub_inputstream_advance(tokeniser->input);
+ if (c == '\t' || c == '\n' || c == '\f' || c == ' ' || c == '\r') {
+ COLLECT_NOBUF(tokeniser->context.chars, len);
} else if (c == '"') {
- tokeniser->state = HUBBUB_TOKENISER_STATE_ATTRIBUTE_VALUE_DQ;
- hubbub_inputstream_advance(tokeniser->input);
+ COLLECT_NOBUF(tokeniser->context.chars, len);
+ tokeniser->state = STATE_ATTRIBUTE_VALUE_DQ;
} else if (c == '&') {
- tokeniser->state = HUBBUB_TOKENISER_STATE_ATTRIBUTE_VALUE_UQ;
+ tokeniser->state = STATE_ATTRIBUTE_VALUE_UQ;
} else if (c == '\'') {
- tokeniser->state = HUBBUB_TOKENISER_STATE_ATTRIBUTE_VALUE_SQ;
- hubbub_inputstream_advance(tokeniser->input);
+ COLLECT_NOBUF(tokeniser->context.chars, len);
+ tokeniser->state = STATE_ATTRIBUTE_VALUE_SQ;
} else if (c == '>') {
- hubbub_token token;
-
- /* Emit current tag */
- token.type = tokeniser->context.current_tag_type;
- token.data.tag = tokeniser->context.current_tag;
-
- hubbub_tokeniser_emit_token(tokeniser, &token);
-
- tokeniser->state = HUBBUB_TOKENISER_STATE_DATA;
- hubbub_inputstream_advance(tokeniser->input);
- } else if (c == HUBBUB_INPUTSTREAM_EOF) {
- hubbub_token token;
-
- /* Emit current tag */
- token.type = tokeniser->context.current_tag_type;
- token.data.tag = tokeniser->context.current_tag;
-
- hubbub_tokeniser_emit_token(tokeniser, &token);
-
- tokeniser->state = HUBBUB_TOKENISER_STATE_DATA;
+ COLLECT_NOBUF(tokeniser->context.chars, len);
+
+ emit_current_tag(tokeniser);
+ tokeniser->state = STATE_DATA;
+ } else if (c == '\0') {
+ COLLECT_NOBUF(tokeniser->context.chars, len);
+ START_BUF(ctag->attributes[ctag->n_attributes - 1].value,
+ u_fffd, sizeof(u_fffd));
+ tokeniser->state = STATE_ATTRIBUTE_VALUE_UQ;
} else {
- uint32_t pos;
- size_t len;
-
- pos = hubbub_inputstream_cur_pos(tokeniser->input, &len);
-
- ctag->attributes[ctag->n_attributes - 1].value.data.off = pos;
- ctag->attributes[ctag->n_attributes - 1].value.len = len;
-
- tokeniser->state = HUBBUB_TOKENISER_STATE_ATTRIBUTE_VALUE_UQ;
-
- hubbub_inputstream_advance(tokeniser->input);
+ COLLECT_NOBUF(tokeniser->context.chars, len);
+ START_BUF(ctag->attributes[ctag->n_attributes - 1].value,
+ (uint8_t *)cptr, len);
+ tokeniser->state = STATE_ATTRIBUTE_VALUE_UQ;
}
return true;
@@ -1420,45 +1477,39 @@ bool hubbub_tokeniser_handle_before_attribute_value(
bool hubbub_tokeniser_handle_attribute_value_dq(hubbub_tokeniser *tokeniser)
{
hubbub_tag *ctag = &tokeniser->context.current_tag;
- uint32_t c = hubbub_inputstream_peek(tokeniser->input);
- if (c == HUBBUB_INPUTSTREAM_OOD)
+ size_t len;
+ uintptr_t cptr = parserutils_inputstream_peek(
+ tokeniser->input, tokeniser->context.chars.len, &len);
+
+ if (cptr == PARSERUTILS_INPUTSTREAM_OOD) {
return false;
+ } else if (cptr == PARSERUTILS_INPUTSTREAM_EOF) {
+ FINISH(ctag->attributes[ctag->n_attributes - 1].value);
+
+ tokeniser->state = STATE_DATA;
+ return emit_current_tag(tokeniser);
+ }
+
+ uint8_t c = CHAR(cptr);
if (c == '"') {
- tokeniser->state =
- HUBBUB_TOKENISER_STATE_AFTER_ATTRIBUTE_VALUE_Q;
- hubbub_inputstream_advance(tokeniser->input);
+ COLLECT_NOBUF(tokeniser->context.chars, len);
+ FINISH(ctag->attributes[ctag->n_attributes - 1].value);
+ tokeniser->state = STATE_AFTER_ATTRIBUTE_VALUE_Q;
} else if (c == '&') {
tokeniser->context.prev_state = tokeniser->state;
- tokeniser->state =
- HUBBUB_TOKENISER_STATE_CHARACTER_REFERENCE_IN_ATTRIBUTE_VALUE;
+ tokeniser->state = STATE_CHARACTER_REFERENCE_IN_ATTRIBUTE_VALUE;
tokeniser->context.allowed_char = '"';
/* Don't eat the '&'; it'll be handled by entity consumption */
- } else if (c == HUBBUB_INPUTSTREAM_EOF) {
- hubbub_token token;
-
- /* Emit current tag */
- token.type = tokeniser->context.current_tag_type;
- token.data.tag = tokeniser->context.current_tag;
-
- hubbub_tokeniser_emit_token(tokeniser, &token);
-
- tokeniser->state = HUBBUB_TOKENISER_STATE_DATA;
+ } else if (c == '\0') {
+ COLLECT_NOBUF(tokeniser->context.chars, len);
+ COLLECT_CHAR(ctag->attributes[ctag->n_attributes - 1].value,
+ u_fffd, sizeof(u_fffd));
} else {
- uint32_t pos;
- size_t len;
-
- pos = hubbub_inputstream_cur_pos(tokeniser->input, &len);
-
- if (ctag->attributes[ctag->n_attributes - 1].value.len == 0) {
- ctag->attributes[ctag->n_attributes - 1].value.data.off =
- pos;
- }
-
- ctag->attributes[ctag->n_attributes - 1].value.len += len;
-
- hubbub_inputstream_advance(tokeniser->input);
+ COLLECT_NOBUF(tokeniser->context.chars, len);
+ COLLECT_MS(ctag->attributes[ctag->n_attributes - 1].value,
+ cptr, len);
}
return true;
@@ -1467,45 +1518,40 @@ bool hubbub_tokeniser_handle_attribute_value_dq(hubbub_tokeniser *tokeniser)
bool hubbub_tokeniser_handle_attribute_value_sq(hubbub_tokeniser *tokeniser)
{
hubbub_tag *ctag = &tokeniser->context.current_tag;
- uint32_t c = hubbub_inputstream_peek(tokeniser->input);
- if (c == HUBBUB_INPUTSTREAM_OOD)
+ size_t len;
+ uintptr_t cptr = parserutils_inputstream_peek(
+ tokeniser->input, tokeniser->context.chars.len, &len);
+
+ if (cptr == PARSERUTILS_INPUTSTREAM_OOD) {
return false;
+ } else if (cptr == PARSERUTILS_INPUTSTREAM_EOF) {
+ FINISH(ctag->attributes[ctag->n_attributes - 1].value);
+
+ tokeniser->state = STATE_DATA;
+ return emit_current_tag(tokeniser);
+ }
+
+ uint8_t c = CHAR(cptr);
if (c == '\'') {
+ COLLECT_NOBUF(tokeniser->context.chars, len);
+ FINISH(ctag->attributes[ctag->n_attributes - 1].value);
tokeniser->state =
- HUBBUB_TOKENISER_STATE_AFTER_ATTRIBUTE_VALUE_Q;
- hubbub_inputstream_advance(tokeniser->input);
+ STATE_AFTER_ATTRIBUTE_VALUE_Q;
} else if (c == '&') {
tokeniser->context.prev_state = tokeniser->state;
- tokeniser->state =
- HUBBUB_TOKENISER_STATE_CHARACTER_REFERENCE_IN_ATTRIBUTE_VALUE;
+ tokeniser->state = STATE_CHARACTER_REFERENCE_IN_ATTRIBUTE_VALUE;
tokeniser->context.allowed_char = '\'';
/* Don't eat the '&'; it'll be handled by entity consumption */
- } else if (c == HUBBUB_INPUTSTREAM_EOF) {
- hubbub_token token;
-
- /* Emit current tag */
- token.type = tokeniser->context.current_tag_type;
- token.data.tag = tokeniser->context.current_tag;
-
- hubbub_tokeniser_emit_token(tokeniser, &token);
-
- tokeniser->state = HUBBUB_TOKENISER_STATE_DATA;
+ } else if (c == '\0') {
+ COLLECT_NOBUF(tokeniser->context.chars, len);
+ COLLECT_CHAR(ctag->attributes[ctag->n_attributes - 1].value,
+ u_fffd, sizeof(u_fffd));
} else {
- uint32_t pos;
- size_t len;
-
- pos = hubbub_inputstream_cur_pos(tokeniser->input, &len);
-
- if (ctag->attributes[ctag->n_attributes - 1].value.len == 0) {
- ctag->attributes[ctag->n_attributes - 1].value.data.off =
- pos;
- }
-
- ctag->attributes[ctag->n_attributes - 1].value.len += len;
-
- hubbub_inputstream_advance(tokeniser->input);
+ COLLECT_NOBUF(tokeniser->context.chars, len);
+ COLLECT_MS(ctag->attributes[ctag->n_attributes - 1].value,
+ cptr, len);
}
return true;
@@ -1514,51 +1560,49 @@ bool hubbub_tokeniser_handle_attribute_value_sq(hubbub_tokeniser *tokeniser)
bool hubbub_tokeniser_handle_attribute_value_uq(hubbub_tokeniser *tokeniser)
{
hubbub_tag *ctag = &tokeniser->context.current_tag;
- uint32_t c = hubbub_inputstream_peek(tokeniser->input);
+ uint8_t c;
- if (c == HUBBUB_INPUTSTREAM_OOD)
+ size_t len;
+ uintptr_t cptr = parserutils_inputstream_peek(
+ tokeniser->input, tokeniser->context.chars.len, &len);
+
+ if (cptr == PARSERUTILS_INPUTSTREAM_OOD) {
return false;
+ } else if (cptr == PARSERUTILS_INPUTSTREAM_EOF) {
+ FINISH(ctag->attributes[ctag->n_attributes - 1].value);
- if (c == '\t' || c == '\n' || c == '\f' || c == ' ') {
- tokeniser->state =
- HUBBUB_TOKENISER_STATE_BEFORE_ATTRIBUTE_NAME;
- hubbub_inputstream_advance(tokeniser->input);
+ tokeniser->state = STATE_DATA;
+ return emit_current_tag(tokeniser);
+ }
+
+ c = CHAR(cptr);
+
+ if (c == '\t' || c == '\n' || c == '\f' || c == ' ' || c == '\r') {
+ COLLECT_NOBUF(tokeniser->context.chars, len);
+ FINISH(ctag->attributes[ctag->n_attributes - 1].value);
+ tokeniser->state = STATE_BEFORE_ATTRIBUTE_NAME;
} else if (c == '&') {
tokeniser->context.prev_state = tokeniser->state;
- tokeniser->state =
- HUBBUB_TOKENISER_STATE_CHARACTER_REFERENCE_IN_ATTRIBUTE_VALUE;
+ tokeniser->state = STATE_CHARACTER_REFERENCE_IN_ATTRIBUTE_VALUE;
/* Don't eat the '&'; it'll be handled by entity consumption */
} else if (c == '>') {
- hubbub_token token;
-
- /* Emit current tag */
- token.type = tokeniser->context.current_tag_type;
- token.data.tag = tokeniser->context.current_tag;
-
- hubbub_tokeniser_emit_token(tokeniser, &token);
-
- tokeniser->state = HUBBUB_TOKENISER_STATE_DATA;
- hubbub_inputstream_advance(tokeniser->input);
- } else if (c == HUBBUB_INPUTSTREAM_EOF) {
- hubbub_token token;
-
- /* Emit current tag */
- token.type = tokeniser->context.current_tag_type;
- token.data.tag = tokeniser->context.current_tag;
-
- hubbub_tokeniser_emit_token(tokeniser, &token);
-
- tokeniser->state = HUBBUB_TOKENISER_STATE_DATA;
+ COLLECT_NOBUF(tokeniser->context.chars, len);
+ FINISH(ctag->attributes[ctag->n_attributes - 1].value);
+
+ emit_current_tag(tokeniser);
+ tokeniser->state = STATE_DATA;
+ } else if (c == '\0') {
+ COLLECT_NOBUF(tokeniser->context.chars, len);
+ COLLECT_CHAR(ctag->attributes[ctag->n_attributes - 1].value,
+ u_fffd, sizeof(u_fffd));
} else {
- uint32_t pos;
- size_t len;
-
- pos = hubbub_inputstream_cur_pos(tokeniser->input, &len);
- /* don't worry about setting the offset -- this is
- * always done before this state is reached */
- ctag->attributes[ctag->n_attributes - 1].value.len += len;
+ if (c == '"' || c == '\'' || c == '=') {
+ /** \todo parse error */
+ }
- hubbub_inputstream_advance(tokeniser->input);
+ COLLECT_NOBUF(tokeniser->context.chars, len);
+ COLLECT(ctag->attributes[ctag->n_attributes - 1].value,
+ cptr, len);
}
return true;
@@ -1567,80 +1611,87 @@ bool hubbub_tokeniser_handle_attribute_value_uq(hubbub_tokeniser *tokeniser)
bool hubbub_tokeniser_handle_character_reference_in_attribute_value(
hubbub_tokeniser *tokeniser)
{
- hubbub_tag *ctag = &tokeniser->context.current_tag;
- uint32_t pos;
- size_t len;
-
if (tokeniser->context.match_entity.complete == false) {
- return hubbub_tokeniser_consume_character_reference(tokeniser);
+ return hubbub_tokeniser_consume_character_reference(tokeniser,
+ tokeniser->context.chars.len);
} else {
-#ifndef NDEBUG
- uint32_t c = hubbub_inputstream_peek(tokeniser->input);
-
- assert(c != HUBBUB_INPUTSTREAM_OOD &&
- c != HUBBUB_INPUTSTREAM_EOF);
-#endif
-
- pos = hubbub_inputstream_cur_pos(tokeniser->input, &len);
+ hubbub_tag *ctag = &tokeniser->context.current_tag;
+ hubbub_attribute *attr = &ctag->attributes[
+ ctag->n_attributes - 1];
+
+ uint8_t utf8[6];
+ uint8_t *utf8ptr = utf8;
+ size_t len = sizeof(utf8);
+
+ if (tokeniser->context.match_entity.codepoint) {
+ parserutils_charset_utf8_from_ucs4(
+ tokeniser->context.match_entity.codepoint,
+ &utf8ptr, &len);
+
+ /* +1 for the ampersand */
+ COLLECT_NOBUF(tokeniser->context.chars,
+ tokeniser->context.match_entity.length
+ + 1);
+
+ if (attr->value.len == 0) {
+ START_BUF(attr->value,
+ utf8, sizeof(utf8) - len);
+ } else {
+ SWITCH(attr->value);
+ COLLECT(attr->value, utf8, sizeof(utf8) - len);
+ }
+ } else {
+ size_t len;
+ uintptr_t cptr = parserutils_inputstream_peek(
+ tokeniser->input,
+ tokeniser->context.chars.len, &len);
- if (ctag->attributes[ctag->n_attributes - 1].value.len == 0) {
- ctag->attributes[ctag->n_attributes - 1].value.data.off =
- pos;
+ /* Insert the ampersand */
+ COLLECT(tokeniser->context.chars, cptr, len);
+ COLLECT_MS(attr->value, cptr, len);
}
- ctag->attributes[ctag->n_attributes - 1].value.len += len;
-
/* Reset for next time */
tokeniser->context.match_entity.complete = false;
/* And back to the previous state */
tokeniser->state = tokeniser->context.prev_state;
-
- hubbub_inputstream_advance(tokeniser->input);
}
return true;
}
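
When a character reference matches, the branch above encodes the resulting codepoint as UTF-8 into a 6-byte scratch buffer with parserutils_charset_utf8_from_ucs4 and treats sizeof(utf8) - len as the number of bytes produced. For comparison, a minimal codepoint-to-UTF-8 encoder; this is not the parserutils implementation and it assumes the modern 4-byte maximum.

#include <stdint.h>
#include <stddef.h>
#include <stdio.h>

/* Encode one Unicode scalar value as UTF-8 into buf (>= 4 bytes).
 * Returns the number of bytes written, or 0 for invalid input.
 * Simplified sketch, not parserutils_charset_utf8_from_ucs4. */
static size_t utf8_encode(uint32_t cp, uint8_t *buf)
{
	if (cp < 0x80) {
		buf[0] = (uint8_t) cp;
		return 1;
	} else if (cp < 0x800) {
		buf[0] = (uint8_t) (0xC0 | (cp >> 6));
		buf[1] = (uint8_t) (0x80 | (cp & 0x3F));
		return 2;
	} else if (cp < 0x10000) {
		if (cp >= 0xD800 && cp <= 0xDFFF)
			return 0;	/* surrogates are not scalar values */
		buf[0] = (uint8_t) (0xE0 | (cp >> 12));
		buf[1] = (uint8_t) (0x80 | ((cp >> 6) & 0x3F));
		buf[2] = (uint8_t) (0x80 | (cp & 0x3F));
		return 3;
	} else if (cp <= 0x10FFFF) {
		buf[0] = (uint8_t) (0xF0 | (cp >> 18));
		buf[1] = (uint8_t) (0x80 | ((cp >> 12) & 0x3F));
		buf[2] = (uint8_t) (0x80 | ((cp >> 6) & 0x3F));
		buf[3] = (uint8_t) (0x80 | (cp & 0x3F));
		return 4;
	}
	return 0;
}

int main(void)
{
	uint8_t buf[4];
	size_t len = utf8_encode(0xFFFD, buf);	/* U+FFFD */

	printf("U+FFFD ->");
	for (size_t i = 0; i < len; i++)
		printf(" %02X", buf[i]);
	printf("\n");	/* expected: EF BF BD */
	return 0;
}
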
+/* always switches state */
bool hubbub_tokeniser_handle_after_attribute_value_q(
hubbub_tokeniser *tokeniser)
{
- uint32_t c = hubbub_inputstream_peek(tokeniser->input);
+ size_t len;
+ uintptr_t cptr = parserutils_inputstream_peek(
+ tokeniser->input, tokeniser->context.chars.len, &len);
- if (c == HUBBUB_INPUTSTREAM_OOD)
+ if (cptr == PARSERUTILS_INPUTSTREAM_OOD) {
return false;
+ } else if (cptr == PARSERUTILS_INPUTSTREAM_EOF) {
+ tokeniser->state = STATE_DATA;
+ return emit_current_tag(tokeniser);
+ }
- if (c == '\t' || c == '\n' || c == '\f' || c == ' ') {
- tokeniser->state =
- HUBBUB_TOKENISER_STATE_BEFORE_ATTRIBUTE_NAME;
- hubbub_inputstream_advance(tokeniser->input);
- } else if (c == '>') {
- hubbub_token token;
-
- /* Emit current tag */
- token.type = tokeniser->context.current_tag_type;
- token.data.tag = tokeniser->context.current_tag;
+ uint8_t c = CHAR(cptr);
- hubbub_tokeniser_emit_token(tokeniser, &token);
+ if (c == '\t' || c == '\n' || c == '\f' || c == ' ' || c == '\r') {
+ COLLECT_NOBUF(tokeniser->context.chars, len);
+ tokeniser->state = STATE_BEFORE_ATTRIBUTE_NAME;
+ } else if (c == '>') {
+ COLLECT_NOBUF(tokeniser->context.chars, len);
- tokeniser->state = HUBBUB_TOKENISER_STATE_DATA;
- hubbub_inputstream_advance(tokeniser->input);
+ emit_current_tag(tokeniser);
+ tokeniser->state = STATE_DATA;
} else if (c == '/') {
- tokeniser->state =
- HUBBUB_TOKENISER_STATE_SELF_CLOSING_START_TAG;
- hubbub_inputstream_advance(tokeniser->input);
- } else if (c == HUBBUB_INPUTSTREAM_EOF) {
- hubbub_token token;
-
- /* Emit current tag */
- token.type = tokeniser->context.current_tag_type;
- token.data.tag = tokeniser->context.current_tag;
-
- hubbub_tokeniser_emit_token(tokeniser, &token);
-
- tokeniser->state = HUBBUB_TOKENISER_STATE_DATA;
+ COLLECT_NOBUF(tokeniser->context.chars, len);
+ tokeniser->state = STATE_SELF_CLOSING_START_TAG;
} else {
- tokeniser->state = HUBBUB_TOKENISER_STATE_BEFORE_ATTRIBUTE_NAME;
+ /** \todo parse error */
+ tokeniser->state = STATE_BEFORE_ATTRIBUTE_NAME;
}
return true;
@@ -1649,106 +1700,106 @@ bool hubbub_tokeniser_handle_after_attribute_value_q(
bool hubbub_tokeniser_handle_self_closing_start_tag(
hubbub_tokeniser *tokeniser)
{
- hubbub_tag *ctag = &tokeniser->context.current_tag;
- uint32_t c = hubbub_inputstream_peek(tokeniser->input);
+ size_t len;
+ uintptr_t cptr = parserutils_inputstream_peek(tokeniser->input,
+ tokeniser->context.chars.len, &len);
- if (c == HUBBUB_INPUTSTREAM_OOD)
+ if (cptr == PARSERUTILS_INPUTSTREAM_OOD) {
return false;
+ } else if (cptr == PARSERUTILS_INPUTSTREAM_EOF) {
+ tokeniser->state = STATE_DATA;
+ return emit_current_tag(tokeniser);
+ }
- if (c == '>') {
- hubbub_token token;
-
- ctag->self_closing = true;
-
- /* Emit current tag */
- token.type = tokeniser->context.current_tag_type;
- token.data.tag = tokeniser->context.current_tag;
-
- hubbub_tokeniser_emit_token(tokeniser, &token);
+ uint8_t c = CHAR(cptr);
- tokeniser->state = HUBBUB_TOKENISER_STATE_DATA;
- hubbub_inputstream_advance(tokeniser->input);
- } else if (c == HUBBUB_INPUTSTREAM_EOF) {
- hubbub_token token;
-
- /* Emit current tag */
- token.type = tokeniser->context.current_tag_type;
- token.data.tag = tokeniser->context.current_tag;
+ if (c == '>') {
+ COLLECT_NOBUF(tokeniser->context.chars, len);
- hubbub_tokeniser_emit_token(tokeniser, &token);
+ tokeniser->context.current_tag.self_closing = true;
+ emit_current_tag(tokeniser);
- tokeniser->state = HUBBUB_TOKENISER_STATE_DATA;
+ tokeniser->state = STATE_DATA;
} else {
- tokeniser->state = HUBBUB_TOKENISER_STATE_BEFORE_ATTRIBUTE_NAME;
+ tokeniser->state = STATE_BEFORE_ATTRIBUTE_NAME;
}
return true;
}
-
+/* this state expects tokeniser->context.chars to be empty on first entry */
bool hubbub_tokeniser_handle_bogus_comment(hubbub_tokeniser *tokeniser)
{
- hubbub_token token;
- uint32_t c;
-
- while ((c = hubbub_inputstream_peek(tokeniser->input)) !=
- HUBBUB_INPUTSTREAM_EOF &&
- c != HUBBUB_INPUTSTREAM_OOD) {
- uint32_t pos;
- size_t len;
+ hubbub_string *comment = &tokeniser->context.current_comment;
- if (c == '>') {
- hubbub_inputstream_advance(tokeniser->input);
- break;
- }
-
- pos = hubbub_inputstream_cur_pos(tokeniser->input, &len);
-
- if (tokeniser->context.current_comment.len == 0)
- tokeniser->context.current_comment.data.off = pos;
- tokeniser->context.current_comment.len += len;
-
- hubbub_inputstream_advance(tokeniser->input);
- }
+ size_t len;
+ uintptr_t cptr = parserutils_inputstream_peek(tokeniser->input,
+ tokeniser->context.chars.len, &len);
- if (c == HUBBUB_INPUTSTREAM_OOD)
+ if (cptr == PARSERUTILS_INPUTSTREAM_OOD) {
return false;
+ } else if (cptr == PARSERUTILS_INPUTSTREAM_EOF) {
+ tokeniser->state = STATE_DATA;
+ tokeniser->context.current_comment.ptr =
+ tokeniser->buffer->data;
+ return emit_current_comment(tokeniser);
+ }
- /* Emit comment */
- token.type = HUBBUB_TOKEN_COMMENT;
- token.data.comment = tokeniser->context.current_comment;
+ uint8_t c = CHAR(cptr);
- hubbub_tokeniser_emit_token(tokeniser, &token);
+ COLLECT_MS_NOBUF(tokeniser->context.chars, cptr, len);
- tokeniser->state = HUBBUB_TOKENISER_STATE_DATA;
+ if (c == '>') {
+ tokeniser->context.current_comment.ptr =
+ tokeniser->buffer->data;
+ emit_current_comment(tokeniser);
+ tokeniser->state = STATE_DATA;
+ } else if (c == '\0') {
+ parserutils_buffer_append(tokeniser->buffer,
+ u_fffd, sizeof(u_fffd));
+ comment->len += sizeof(u_fffd);
+ } else {
+ parserutils_buffer_append(tokeniser->buffer,
+ (uint8_t *)cptr, len);
+ comment->len += len;
+ }
return true;
}
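
Comment data cannot simply point into the input, since bytes such as NUL are rewritten, so the handler above copies it into tokeniser->buffer with parserutils_buffer_append and points the emitted string at buffer->data. Below is a sketch of a growable append buffer in that spirit; the struct layout and function name here are assumptions, not the parserutils API.

#include <stdint.h>
#include <stdio.h>
#include <stdlib.h>
#include <string.h>

/* Growable byte buffer; illustration only. */
typedef struct {
	uint8_t *data;
	size_t length;
	size_t allocated;
} buffer_t;

static int buffer_append(buffer_t *b, const uint8_t *data, size_t len)
{
	if (b->length + len > b->allocated) {
		size_t want = b->allocated ? b->allocated * 2 : 64;
		while (want < b->length + len)
			want *= 2;

		uint8_t *tmp = realloc(b->data, want);
		if (tmp == NULL)
			return -1;

		b->data = tmp;	/* note: the data may have moved */
		b->allocated = want;
	}

	memcpy(b->data + b->length, data, len);
	b->length += len;
	return 0;
}

int main(void)
{
	buffer_t b = { NULL, 0, 0 };

	buffer_append(&b, (const uint8_t *) "bogus ", 6);
	buffer_append(&b, (const uint8_t *) "comment", 7);

	fwrite(b.data, 1, b.length, stdout);
	putchar('\n');

	free(b.data);
	return 0;
}

Because an append may reallocate and move the storage, the handler above refreshes current_comment.ptr from tokeniser->buffer->data only at the point where the token is emitted.
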
+/* this state always switches to another state straight away */
bool hubbub_tokeniser_handle_markup_declaration_open(
hubbub_tokeniser *tokeniser)
{
- uint32_t c = hubbub_inputstream_peek(tokeniser->input);
+ size_t len;
+ uintptr_t cptr = parserutils_inputstream_peek(tokeniser->input,
+ 0, &len);
- if (c == HUBBUB_INPUTSTREAM_OOD)
+ if (cptr == PARSERUTILS_INPUTSTREAM_OOD) {
return false;
+ } else if (cptr == PARSERUTILS_INPUTSTREAM_EOF) {
+ tokeniser->state = STATE_BOGUS_COMMENT;
+ return true;
+ }
+
+ uint8_t c = CHAR(cptr);
if (c == '-') {
- tokeniser->state = HUBBUB_TOKENISER_STATE_MATCH_COMMENT;
- hubbub_inputstream_advance(tokeniser->input);
+ tokeniser->context.chars.ptr = (uint8_t *) cptr;
+ tokeniser->context.chars.len = len;
+ tokeniser->state = STATE_MATCH_COMMENT;
} else if ((c & ~0x20) == 'D') {
- tokeniser->context.match_doctype.count = 1;
- tokeniser->state = HUBBUB_TOKENISER_STATE_MATCH_DOCTYPE;
- hubbub_inputstream_advance(tokeniser->input);
+ tokeniser->context.chars.ptr = (uint8_t *) cptr;
+ tokeniser->context.chars.len = len;
+ tokeniser->context.match_doctype.count = len;
+ tokeniser->state = STATE_MATCH_DOCTYPE;
} else if (tokeniser->process_cdata_section == true && c == '[') {
- tokeniser->context.match_cdata.count = 1;
- tokeniser->state = HUBBUB_TOKENISER_STATE_MATCH_CDATA;
- hubbub_inputstream_advance(tokeniser->input);
+ tokeniser->context.chars.ptr = (uint8_t *) cptr;
+ tokeniser->context.chars.len = len;
+ tokeniser->context.match_cdata.count = len;
+ tokeniser->state = STATE_MATCH_CDATA;
} else {
- tokeniser->context.current_comment.data.off = 0;
- tokeniser->context.current_comment.len = 0;
-
- tokeniser->state = HUBBUB_TOKENISER_STATE_BOGUS_COMMENT;
+ tokeniser->state = STATE_BOGUS_COMMENT;
}
return true;
@@ -1757,400 +1808,238 @@ bool hubbub_tokeniser_handle_markup_declaration_open(
bool hubbub_tokeniser_handle_match_comment(hubbub_tokeniser *tokeniser)
{
- uint32_t c = hubbub_inputstream_peek(tokeniser->input);
+ size_t len;
+ uintptr_t cptr = parserutils_inputstream_peek(
+ tokeniser->input, tokeniser->context.chars.len, &len);
- if (c == HUBBUB_INPUTSTREAM_OOD)
+ if (cptr == PARSERUTILS_INPUTSTREAM_OOD) {
return false;
+ } else if (cptr == PARSERUTILS_INPUTSTREAM_EOF) {
+ tokeniser->context.chars.len =
+ tokeniser->context.current_comment.len =
+ 0;
+ tokeniser->state = STATE_BOGUS_COMMENT;
+ return true;
+ }
- tokeniser->context.current_comment.data.off = 0;
- tokeniser->context.current_comment.len = 0;
+ tokeniser->context.chars.len =
+ tokeniser->context.current_comment.len =
+ 0;
- if (c == '-') {
- tokeniser->state = HUBBUB_TOKENISER_STATE_COMMENT_START;
- hubbub_inputstream_advance(tokeniser->input);
+ if (CHAR(cptr) == '-') {
+ parserutils_inputstream_advance(tokeniser->input, SLEN("--"));
+ tokeniser->state = STATE_COMMENT_START;
} else {
- /* Rewind to the first '-' */
- hubbub_inputstream_rewind(tokeniser->input, 1);
-
- tokeniser->state = HUBBUB_TOKENISER_STATE_BOGUS_COMMENT;
+ tokeniser->state = STATE_BOGUS_COMMENT;
}
return true;
}
-bool hubbub_tokeniser_handle_comment_start(hubbub_tokeniser *tokeniser)
+bool hubbub_tokeniser_handle_comment(hubbub_tokeniser *tokeniser)
{
- uint32_t c = hubbub_inputstream_peek(tokeniser->input);
-
- if (c == HUBBUB_INPUTSTREAM_OOD)
- return false;
-
- if (c == '-') {
- tokeniser->state = HUBBUB_TOKENISER_STATE_COMMENT_START_DASH;
- hubbub_inputstream_advance(tokeniser->input);
- } else if (c == '>') {
- hubbub_token token;
-
- /* Emit comment */
- token.type = HUBBUB_TOKEN_COMMENT;
- token.data.comment = tokeniser->context.current_comment;
-
- hubbub_tokeniser_emit_token(tokeniser, &token);
+ hubbub_string *comment = &tokeniser->context.current_comment;
- tokeniser->state = HUBBUB_TOKENISER_STATE_DATA;
- hubbub_inputstream_advance(tokeniser->input);
- } else if (c == HUBBUB_INPUTSTREAM_EOF) {
- hubbub_token token;
-
- /* Emit comment */
- token.type = HUBBUB_TOKEN_COMMENT;
- token.data.comment = tokeniser->context.current_comment;
-
- hubbub_tokeniser_emit_token(tokeniser, &token);
-
- tokeniser->state = HUBBUB_TOKENISER_STATE_DATA;
- } else {
- uint32_t pos;
- size_t len;
-
- pos = hubbub_inputstream_cur_pos(tokeniser->input, &len);
-
- if (tokeniser->context.current_comment.len == 0)
- tokeniser->context.current_comment.data.off = pos;
- tokeniser->context.current_comment.len += len;
-
- tokeniser->state = HUBBUB_TOKENISER_STATE_COMMENT;
- hubbub_inputstream_advance(tokeniser->input);
- }
-
- return true;
-}
-
-bool hubbub_tokeniser_handle_comment_start_dash(hubbub_tokeniser *tokeniser)
-{
- uint32_t c = hubbub_inputstream_peek(tokeniser->input);
+ size_t len;
+ uintptr_t cptr = parserutils_inputstream_peek(
+ tokeniser->input, tokeniser->context.chars.len, &len);
- if (c == HUBBUB_INPUTSTREAM_OOD)
+ if (cptr == PARSERUTILS_INPUTSTREAM_OOD) {
return false;
-
- if (c == '-') {
- tokeniser->state = HUBBUB_TOKENISER_STATE_COMMENT_END;
- hubbub_inputstream_advance(tokeniser->input);
- } else if (c == '>') {
- hubbub_token token;
-
- /* Emit comment */
- token.type = HUBBUB_TOKEN_COMMENT;
- token.data.comment = tokeniser->context.current_comment;
-
- hubbub_tokeniser_emit_token(tokeniser, &token);
-
- tokeniser->state = HUBBUB_TOKENISER_STATE_DATA;
- hubbub_inputstream_advance(tokeniser->input);
- } else if (c == HUBBUB_INPUTSTREAM_EOF) {
- hubbub_token token;
-
- /* Emit comment */
- token.type = HUBBUB_TOKEN_COMMENT;
- token.data.comment = tokeniser->context.current_comment;
-
- hubbub_tokeniser_emit_token(tokeniser, &token);
-
- tokeniser->state = HUBBUB_TOKENISER_STATE_DATA;
- } else {
- uint32_t pos;
- size_t len;
-
- /* In order to get to this state, the previous character must
- * be '-'. This means we can safely rewind and add to the
- * comment buffer. */
-
- hubbub_inputstream_rewind(tokeniser->input, 1);
-
- pos = hubbub_inputstream_cur_pos(tokeniser->input, &len);
-
- if (tokeniser->context.current_comment.len == 0)
- tokeniser->context.current_comment.data.off = pos;
-
- tokeniser->context.current_comment.len += len;
- hubbub_inputstream_advance(tokeniser->input);
-
- pos = hubbub_inputstream_cur_pos(tokeniser->input, &len);
- tokeniser->context.current_comment.len += len;
- hubbub_inputstream_advance(tokeniser->input);
-
- tokeniser->state = HUBBUB_TOKENISER_STATE_COMMENT;
+ } else if (cptr == PARSERUTILS_INPUTSTREAM_EOF) {
+ tokeniser->context.current_comment.ptr =
+ tokeniser->buffer->data;
+ emit_current_comment(tokeniser);
+ tokeniser->state = STATE_DATA;
+ return true;
}
- return true;
-}
-
-bool hubbub_tokeniser_handle_comment(hubbub_tokeniser *tokeniser)
-{
- uint32_t c = hubbub_inputstream_peek(tokeniser->input);
-
- if (c == HUBBUB_INPUTSTREAM_OOD)
- return false;
+ uint8_t c = CHAR(cptr);
- if (c == '-') {
- tokeniser->state = HUBBUB_TOKENISER_STATE_COMMENT_END_DASH;
- hubbub_inputstream_advance(tokeniser->input);
- } else if (c == HUBBUB_INPUTSTREAM_EOF) {
- hubbub_token token;
+ if (c == '>' && (tokeniser->state == STATE_COMMENT_START_DASH ||
+ tokeniser->state == STATE_COMMENT_START ||
+ tokeniser->state == STATE_COMMENT_END)) {
+ COLLECT_MS_NOBUF(tokeniser->context.chars, cptr, len);
- /* Emit comment */
- token.type = HUBBUB_TOKEN_COMMENT;
- token.data.comment = tokeniser->context.current_comment;
+ /** \todo parse error if state != COMMENT_END */
+ tokeniser->context.current_comment.ptr =
+ tokeniser->buffer->data;
+ emit_current_comment(tokeniser);
- hubbub_tokeniser_emit_token(tokeniser, &token);
+ tokeniser->state = STATE_DATA;
+ } else if (c == '-') {
+ if (tokeniser->state == STATE_COMMENT_START) {
+ tokeniser->state = STATE_COMMENT_START_DASH;
+ } else if (tokeniser->state == STATE_COMMENT_START_DASH) {
+ tokeniser->state = STATE_COMMENT_END;
+ } else if (tokeniser->state == STATE_COMMENT) {
+ tokeniser->state = STATE_COMMENT_END_DASH;
+ } else if (tokeniser->state == STATE_COMMENT_END_DASH) {
+ tokeniser->state = STATE_COMMENT_END;
+ } else if (tokeniser->state == STATE_COMMENT_END) {
+ parserutils_buffer_append(tokeniser->buffer,
+ (uint8_t *) "-", SLEN("-"));
+ comment->len += SLEN("-");
+ }
- tokeniser->state = HUBBUB_TOKENISER_STATE_DATA;
+ COLLECT_MS_NOBUF(tokeniser->context.chars, cptr, len);
} else {
- uint32_t pos;
- size_t len;
+ if (tokeniser->state == STATE_COMMENT_START_DASH ||
+ tokeniser->state == STATE_COMMENT_END_DASH) {
+ parserutils_buffer_append(tokeniser->buffer,
+ (uint8_t *) "-", SLEN("-"));
+ comment->len += SLEN("-");
+ } else if (tokeniser->state == STATE_COMMENT_END) {
+ parserutils_buffer_append(tokeniser->buffer,
+ (uint8_t *) "--", SLEN("--"));
+ comment->len += SLEN("--");
+ }
- pos = hubbub_inputstream_cur_pos(tokeniser->input, &len);
- tokeniser->context.current_comment.len += len;
+ if (c == '\0') {
+ parserutils_buffer_append(tokeniser->buffer,
+ u_fffd, sizeof(u_fffd));
+ comment->len += sizeof(u_fffd);
+ } else if (c == '\r') {
+ cptr = parserutils_inputstream_peek(
+ tokeniser->input,
+ tokeniser->context.chars.len + 1,
+ &len);
+ if (cptr == PARSERUTILS_INPUTSTREAM_OOD) {
+ return false;
+ } else if (cptr != PARSERUTILS_INPUTSTREAM_EOF &&
+ CHAR(cptr) != '\n') {
+ parserutils_buffer_append(tokeniser->buffer,
+ &lf, sizeof(lf));
+ comment->len += sizeof(lf);
+ }
+ } else {
+ parserutils_buffer_append(tokeniser->buffer,
+ (uint8_t *)cptr, len);
+ comment->len += len;
+ }
- hubbub_inputstream_advance(tokeniser->input);
+ COLLECT_MS_NOBUF(tokeniser->context.chars, cptr, len);
+ tokeniser->state = STATE_COMMENT;
}
return true;
}
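
Within comments the code above folds line endings by peeking one character ahead: a CR followed by LF contributes nothing now (the LF is appended on its own turn), while a CR followed by anything else is appended as a single LF. A standalone sketch of the same CR/CRLF-to-LF normalisation over a complete buffer, leaving aside the incremental EOF edge case:

#include <stdio.h>
#include <string.h>

/* Normalise CR and CRLF to LF in place; returns the new length.
 * Whole-buffer sketch of what the tokeniser does incrementally. */
static size_t normalise_newlines(char *buf, size_t len)
{
	size_t in, out = 0;

	for (in = 0; in < len; in++) {
		if (buf[in] == '\r') {
			buf[out++] = '\n';
			if (in + 1 < len && buf[in + 1] == '\n')
				in++;	/* swallow the LF of a CRLF pair */
		} else {
			buf[out++] = buf[in];
		}
	}

	return out;
}

int main(void)
{
	char text[] = "a\r\nb\rc\nd";
	size_t len = normalise_newlines(text, strlen(text));

	fwrite(text, 1, len, stdout);	/* a, b, c, d on separate lines */
	putchar('\n');
	return 0;
}
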
-bool hubbub_tokeniser_handle_comment_end_dash(hubbub_tokeniser *tokeniser)
-{
- uint32_t c = hubbub_inputstream_peek(tokeniser->input);
-
- if (c == HUBBUB_INPUTSTREAM_OOD)
- return false;
-
- if (c == '-') {
- tokeniser->state = HUBBUB_TOKENISER_STATE_COMMENT_END;
- hubbub_inputstream_advance(tokeniser->input);
- } else if (c == HUBBUB_INPUTSTREAM_EOF) {
- hubbub_token token;
-
- /* Emit comment */
- token.type = HUBBUB_TOKEN_COMMENT;
- token.data.comment = tokeniser->context.current_comment;
-
- hubbub_tokeniser_emit_token(tokeniser, &token);
-
- tokeniser->state = HUBBUB_TOKENISER_STATE_DATA;
- } else {
- uint32_t pos;
- size_t len;
- /* In order to get to this state, the previous character must
- * be '-'. This means we can safely rewind and add 1 to the
- * comment buffer. */
- hubbub_inputstream_rewind(tokeniser->input, 1);
- tokeniser->context.current_comment.len += 1;
- /* Now add the input char */
- hubbub_inputstream_advance(tokeniser->input);
- pos = hubbub_inputstream_cur_pos(tokeniser->input, &len);
- tokeniser->context.current_comment.len += len;
- hubbub_inputstream_advance(tokeniser->input);
+#define DOCTYPE "DOCTYPE"
+#define DOCTYPE_LEN (SLEN(DOCTYPE) - 1)
- tokeniser->state = HUBBUB_TOKENISER_STATE_COMMENT;
- }
-
- return true;
-}
-
-bool hubbub_tokeniser_handle_comment_end(hubbub_tokeniser *tokeniser)
+bool hubbub_tokeniser_handle_match_doctype(hubbub_tokeniser *tokeniser)
{
- uint32_t c = hubbub_inputstream_peek(tokeniser->input);
+ size_t len;
+ uintptr_t cptr = parserutils_inputstream_peek(tokeniser->input,
+ tokeniser->context.match_doctype.count, &len);
- if (c == HUBBUB_INPUTSTREAM_OOD)
+ if (cptr == PARSERUTILS_INPUTSTREAM_OOD) {
return false;
+ } else if (cptr == PARSERUTILS_INPUTSTREAM_EOF) {
+ tokeniser->context.current_comment.len =
+ tokeniser->context.chars.len =
+ 0;
+ tokeniser->state = STATE_BOGUS_COMMENT;
+ return true;
+ }
- if (c == '>') {
- hubbub_token token;
-
- /* Emit comment */
- token.type = HUBBUB_TOKEN_COMMENT;
- token.data.comment = tokeniser->context.current_comment;
-
- hubbub_tokeniser_emit_token(tokeniser, &token);
-
- tokeniser->state = HUBBUB_TOKENISER_STATE_DATA;
- hubbub_inputstream_advance(tokeniser->input);
- } else if (c == '-') {
- uint32_t pos;
- size_t len;
-
- pos = hubbub_inputstream_cur_pos(tokeniser->input, &len);
-
- if (tokeniser->context.current_comment.len == 0) {
- tokeniser->context.current_comment.data.off = pos;
- }
-
- tokeniser->context.current_comment.len = len;
-
- tokeniser->state = HUBBUB_TOKENISER_STATE_COMMENT_END;
- hubbub_inputstream_advance(tokeniser->input);
- } else if (c == HUBBUB_INPUTSTREAM_EOF) {
- hubbub_token token;
-
- /* Emit comment */
- token.type = HUBBUB_TOKEN_COMMENT;
- token.data.comment = tokeniser->context.current_comment;
-
- hubbub_tokeniser_emit_token(tokeniser, &token);
-
- tokeniser->state = HUBBUB_TOKENISER_STATE_DATA;
- } else {
- uint32_t pos;
- size_t len;
-
- /* In order to have got here, the previous two characters
- * must be '--', so rewind two characters */
- hubbub_inputstream_rewind(tokeniser->input, 2);
-
- /* Add first '-' */
- pos = hubbub_inputstream_cur_pos(tokeniser->input, &len);
-
- if (tokeniser->context.current_comment.len == 0)
- tokeniser->context.current_comment.data.off = pos;
- tokeniser->context.current_comment.len += len;
- hubbub_inputstream_advance(tokeniser->input);
-
- /* Add second '-' */
- pos = hubbub_inputstream_cur_pos(tokeniser->input, &len);
- tokeniser->context.current_comment.len += len;
- hubbub_inputstream_advance(tokeniser->input);
+ uint8_t c = CHAR(cptr);
- /* Add input character */
- pos = hubbub_inputstream_cur_pos(tokeniser->input, &len);
- tokeniser->context.current_comment.len += len;
- hubbub_inputstream_advance(tokeniser->input);
+ assert(tokeniser->context.match_doctype.count <= DOCTYPE_LEN);
- tokeniser->state = HUBBUB_TOKENISER_STATE_COMMENT;
+ if (DOCTYPE[tokeniser->context.match_doctype.count] != (c & ~0x20)) {
+ tokeniser->context.current_comment.len =
+ tokeniser->context.chars.len =
+ 0;
+ tokeniser->state = STATE_BOGUS_COMMENT;
+ return true;
}
- return true;
-}
-
-bool hubbub_tokeniser_handle_match_doctype(hubbub_tokeniser *tokeniser)
-{
- uint32_t c = hubbub_inputstream_peek(tokeniser->input);
+ COLLECT_NOBUF(tokeniser->context.chars, len);
- if (c == HUBBUB_INPUTSTREAM_OOD)
- return false;
-
- if (tokeniser->context.match_doctype.count == 1 &&
- (c & ~0x20) == 'O') {
- tokeniser->context.match_doctype.count++;
- hubbub_inputstream_advance(tokeniser->input);
- } else if (tokeniser->context.match_doctype.count == 2 &&
- (c & ~0x20) == 'C') {
- tokeniser->context.match_doctype.count++;
- hubbub_inputstream_advance(tokeniser->input);
- } else if (tokeniser->context.match_doctype.count == 3 &&
- (c & ~0x20) == 'T') {
- tokeniser->context.match_doctype.count++;
- hubbub_inputstream_advance(tokeniser->input);
- } else if (tokeniser->context.match_doctype.count == 4 &&
- (c & ~0x20) == 'Y') {
- tokeniser->context.match_doctype.count++;
- hubbub_inputstream_advance(tokeniser->input);
- } else if (tokeniser->context.match_doctype.count == 5 &&
- (c & ~0x20) == 'P') {
- tokeniser->context.match_doctype.count++;
- hubbub_inputstream_advance(tokeniser->input);
- } else if (tokeniser->context.match_doctype.count == 6 &&
- (c & ~0x20) == 'E') {
- tokeniser->state = HUBBUB_TOKENISER_STATE_DOCTYPE;
- hubbub_inputstream_advance(tokeniser->input);
- } else {
- /* Rewind as many characters as have been matched */
- hubbub_inputstream_rewind(tokeniser->input,
- tokeniser->context.match_doctype.count);
+ if (tokeniser->context.match_doctype.count == DOCTYPE_LEN) {
+ /* Skip over the DOCTYPE bit */
+ parserutils_inputstream_advance(tokeniser->input,
+ tokeniser->context.chars.len);
- tokeniser->context.current_comment.data.off = 0;
- tokeniser->context.current_comment.len = 0;
+ memset(&tokeniser->context.current_doctype, 0,
+ sizeof tokeniser->context.current_doctype);
+ tokeniser->context.current_doctype.public_missing = true;
+ tokeniser->context.current_doctype.system_missing = true;
+ tokeniser->context.chars.len = 0;
- tokeniser->state = HUBBUB_TOKENISER_STATE_BOGUS_COMMENT;
+ tokeniser->state = STATE_DOCTYPE;
}
+ tokeniser->context.match_doctype.count++;
+
return true;
}
+#undef DOCTYPE
+#undef DOCTYPE_LEN
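
This matcher, like the PUBLIC and SYSTEM matchers further down, compares one character per call against an uppercase keyword, folding ASCII case with (c & ~0x20) and keeping a running count so it can resume after an out-of-data return. A standalone sketch of the same comparison over an in-memory string, without the incremental bookkeeping:

#include <stdio.h>

/* Case-insensitive ASCII prefix match using the (c & ~0x20) fold.
 * The keyword must be uppercase ASCII letters; clearing bit 0x20 maps
 * 'a'..'z' onto 'A'..'Z', and any non-letter simply fails to match. */
static int matches_keyword(const char *input, const char *keyword)
{
	size_t i;

	for (i = 0; keyword[i] != '\0'; i++) {
		if (input[i] == '\0' ||
				(unsigned char) (input[i] & ~0x20) !=
				(unsigned char) keyword[i])
			return 0;
	}

	return 1;
}

int main(void)
{
	printf("%d\n", matches_keyword("doctype html", "DOCTYPE"));	/* 1 */
	printf("%d\n", matches_keyword("DoCtYpE html", "DOCTYPE"));	/* 1 */
	printf("%d\n", matches_keyword("doctor html", "DOCTYPE"));	/* 0 */
	return 0;
}
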
+
bool hubbub_tokeniser_handle_doctype(hubbub_tokeniser *tokeniser)
{
- hubbub_doctype *cdoc = &tokeniser->context.current_doctype;
- uint32_t c = hubbub_inputstream_peek(tokeniser->input);
+ size_t len;
+ uintptr_t cptr = parserutils_inputstream_peek(tokeniser->input,
+ tokeniser->context.chars.len, &len);
- if (c == HUBBUB_INPUTSTREAM_OOD)
+ if (cptr == PARSERUTILS_INPUTSTREAM_OOD) {
return false;
+ } else if (cptr == PARSERUTILS_INPUTSTREAM_EOF) {
+ tokeniser->state = STATE_BEFORE_DOCTYPE_NAME;
+ return true;
+ }
- memset(cdoc, 0, sizeof *cdoc);
- cdoc->name.type = HUBBUB_STRING_OFF;
- cdoc->public_missing = true;
- cdoc->public_id.type = HUBBUB_STRING_OFF;
- cdoc->system_missing = true;
- cdoc->system_id.type = HUBBUB_STRING_OFF;
+ uint8_t c = CHAR(cptr);
- if (c == '\t' || c == '\n' || c == '\f' || c == ' ') {
- hubbub_inputstream_advance(tokeniser->input);
+ if (c == '\t' || c == '\n' || c == '\f' || c == ' ' || c == '\r') {
+ COLLECT_MS_NOBUF(tokeniser->context.chars, cptr, len);
}
- tokeniser->state = HUBBUB_TOKENISER_STATE_BEFORE_DOCTYPE_NAME;
+ tokeniser->state = STATE_BEFORE_DOCTYPE_NAME;
return true;
}
-bool hubbub_tokeniser_handle_before_doctype_name(
- hubbub_tokeniser *tokeniser)
+bool hubbub_tokeniser_handle_before_doctype_name(hubbub_tokeniser *tokeniser)
{
hubbub_doctype *cdoc = &tokeniser->context.current_doctype;
- uint32_t c = hubbub_inputstream_peek(tokeniser->input);
+ size_t len;
+ uintptr_t cptr = parserutils_inputstream_peek(tokeniser->input,
+ tokeniser->context.chars.len, &len);
- if (c == HUBBUB_INPUTSTREAM_OOD)
+ if (cptr == PARSERUTILS_INPUTSTREAM_OOD) {
return false;
+ } else if (cptr == PARSERUTILS_INPUTSTREAM_EOF) {
+ /* Emit current doctype, force-quirks on */
+ emit_current_doctype(tokeniser, true);
+ tokeniser->state = STATE_DATA;
+ return true;
+ }
- if (c == '\t' || c == '\n' || c == '\f' || c == ' ') {
- hubbub_inputstream_advance(tokeniser->input);
- } else if (c == '>') {
- hubbub_token token;
-
- /* Emit doctype */
- token.type = HUBBUB_TOKEN_DOCTYPE;
- token.data.doctype = tokeniser->context.current_doctype;
- token.data.doctype.force_quirks = true;
-
- hubbub_tokeniser_emit_token(tokeniser, &token);
-
- tokeniser->state = HUBBUB_TOKENISER_STATE_DATA;
- hubbub_inputstream_advance(tokeniser->input);
- } else if (c == HUBBUB_INPUTSTREAM_EOF) {
- hubbub_token token;
-
- /* Emit doctype */
- token.type = HUBBUB_TOKEN_DOCTYPE;
- token.data.doctype = tokeniser->context.current_doctype;
- token.data.doctype.force_quirks = true;
-
- hubbub_tokeniser_emit_token(tokeniser, &token);
+ uint8_t c = CHAR(cptr);
+ COLLECT_MS_NOBUF(tokeniser->context.chars, cptr, len);
- tokeniser->state = HUBBUB_TOKENISER_STATE_DATA;
+ if (c == '\t' || c == '\n' || c == '\f' || c == ' ' || c == '\r') {
+ /* pass over in silence */
+ } else if (c == '>') {
+ emit_current_doctype(tokeniser, true);
+ tokeniser->state = STATE_DATA;
} else {
- uint32_t pos;
- size_t len;
-
- pos = hubbub_inputstream_cur_pos(tokeniser->input, &len);
-
- cdoc->name.data.off = pos;
- cdoc->name.len = len;
-
- tokeniser->state = HUBBUB_TOKENISER_STATE_DOCTYPE_NAME;
+ if (c == '\0') {
+ START_BUF(cdoc->name, u_fffd, sizeof(u_fffd));
+ } else {
+ START_BUF(cdoc->name, (uint8_t *) cptr, len);
+ }
- hubbub_inputstream_advance(tokeniser->input);
+ tokeniser->state = STATE_DOCTYPE_NAME;
}
return true;
@@ -2159,45 +2048,36 @@ bool hubbub_tokeniser_handle_before_doctype_name(
bool hubbub_tokeniser_handle_doctype_name(hubbub_tokeniser *tokeniser)
{
hubbub_doctype *cdoc = &tokeniser->context.current_doctype;
- uint32_t c = hubbub_inputstream_peek(tokeniser->input);
+ size_t len;
+ uintptr_t cptr = parserutils_inputstream_peek(tokeniser->input,
+ tokeniser->context.chars.len, &len);
- if (c == HUBBUB_INPUTSTREAM_OOD)
+ if (cptr == PARSERUTILS_INPUTSTREAM_OOD) {
return false;
+ } else if (cptr == PARSERUTILS_INPUTSTREAM_EOF) {
+ FINISH(cdoc->name);
- if (c == '\t' || c == '\n' || c == '\f' || c == ' ') {
- tokeniser->state = HUBBUB_TOKENISER_STATE_AFTER_DOCTYPE_NAME;
- hubbub_inputstream_advance(tokeniser->input);
- } else if (c == '>') {
- hubbub_token token;
-
- /* Emit doctype */
- token.type = HUBBUB_TOKEN_DOCTYPE;
- token.data.doctype = tokeniser->context.current_doctype;
-
- hubbub_tokeniser_emit_token(tokeniser, &token);
-
- tokeniser->state = HUBBUB_TOKENISER_STATE_DATA;
- hubbub_inputstream_advance(tokeniser->input);
- } else if (c == HUBBUB_INPUTSTREAM_EOF) {
- hubbub_token token;
-
- /* Emit doctype */
- token.type = HUBBUB_TOKEN_DOCTYPE;
- token.data.doctype = tokeniser->context.current_doctype;
- token.data.doctype.force_quirks = true;
+ emit_current_doctype(tokeniser, true);
+ tokeniser->state = STATE_DATA;
+ return true;
+ }
- hubbub_tokeniser_emit_token(tokeniser, &token);
+ uint8_t c = CHAR(cptr);
+ COLLECT_NOBUF(tokeniser->context.chars, len);
- tokeniser->state = HUBBUB_TOKENISER_STATE_DATA;
+ if (c == '\t' || c == '\n' || c == '\f' || c == ' ' || c == '\r') {
+ FINISH(cdoc->name);
+ tokeniser->state = STATE_AFTER_DOCTYPE_NAME;
+ } else if (c == '>') {
+ FINISH(cdoc->name);
+ emit_current_doctype(tokeniser, false);
+ tokeniser->state = STATE_DATA;
} else {
- uint32_t pos;
- size_t len;
-
- pos = hubbub_inputstream_cur_pos(tokeniser->input, &len);
-
- cdoc->name.len += len;
-
- hubbub_inputstream_advance(tokeniser->input);
+ if (c == '\0') {
+ COLLECT_CHAR(cdoc->name, u_fffd, sizeof(u_fffd));
+ } else {
+ COLLECT(cdoc->name, cptr, len);
+ }
}
return true;
@@ -2205,141 +2085,115 @@ bool hubbub_tokeniser_handle_doctype_name(hubbub_tokeniser *tokeniser)
bool hubbub_tokeniser_handle_after_doctype_name(hubbub_tokeniser *tokeniser)
{
- uint32_t c = hubbub_inputstream_peek(tokeniser->input);
+ size_t len;
+ uintptr_t cptr = parserutils_inputstream_peek(tokeniser->input,
+ tokeniser->context.chars.len, &len);
- if (c == HUBBUB_INPUTSTREAM_OOD)
+ if (cptr == PARSERUTILS_INPUTSTREAM_OOD) {
return false;
+ } else if (cptr == PARSERUTILS_INPUTSTREAM_EOF) {
+ emit_current_doctype(tokeniser, true);
+ tokeniser->state = STATE_DATA;
+ return true;
+ }
- if (c == '\t' || c == '\n' || c == '\f' || c == ' ') {
- hubbub_inputstream_advance(tokeniser->input);
- } else if (c == '>') {
- hubbub_token token;
-
- /* Emit doctype */
- token.type = HUBBUB_TOKEN_DOCTYPE;
- token.data.doctype = tokeniser->context.current_doctype;
-
- hubbub_tokeniser_emit_token(tokeniser, &token);
-
- tokeniser->state = HUBBUB_TOKENISER_STATE_DATA;
- hubbub_inputstream_advance(tokeniser->input);
- } else if (c == HUBBUB_INPUTSTREAM_EOF) {
- hubbub_token token;
-
- /* Emit doctype */
- token.type = HUBBUB_TOKEN_DOCTYPE;
- token.data.doctype = tokeniser->context.current_doctype;
- token.data.doctype.force_quirks = true;
-
- hubbub_tokeniser_emit_token(tokeniser, &token);
+ uint8_t c = CHAR(cptr);
+ COLLECT_NOBUF(tokeniser->context.chars, len);
- tokeniser->state = HUBBUB_TOKENISER_STATE_DATA;
+ if (c == '\t' || c == '\n' || c == '\f' || c == ' ' || c == '\r') {
+ /* pass over in silence */
+ } else if (c == '>') {
+ emit_current_doctype(tokeniser, false);
+ tokeniser->state = STATE_DATA;
} else if ((c & ~0x20) == 'P') {
- tokeniser->state = HUBBUB_TOKENISER_STATE_MATCH_PUBLIC;
tokeniser->context.match_doctype.count = 1;
- hubbub_inputstream_advance(tokeniser->input);
+ tokeniser->state = STATE_MATCH_PUBLIC;
} else if ((c & ~0x20) == 'S') {
- tokeniser->state = HUBBUB_TOKENISER_STATE_MATCH_SYSTEM;
tokeniser->context.match_doctype.count = 1;
- hubbub_inputstream_advance(tokeniser->input);
+ tokeniser->state = STATE_MATCH_SYSTEM;
} else {
- tokeniser->state = HUBBUB_TOKENISER_STATE_BOGUS_DOCTYPE;
+ tokeniser->state = STATE_BOGUS_DOCTYPE;
tokeniser->context.current_doctype.force_quirks = true;
-
- hubbub_inputstream_advance(tokeniser->input);
}
return true;
}
+#define PUBLIC "PUBLIC"
+#define PUBLIC_LEN (SLEN(PUBLIC) - 1)
bool hubbub_tokeniser_handle_match_public(hubbub_tokeniser *tokeniser)
{
- uint32_t c = hubbub_inputstream_peek(tokeniser->input);
+ size_t len;
+ uintptr_t cptr = parserutils_inputstream_peek(tokeniser->input,
+ tokeniser->context.chars.len, &len);
- if (c == HUBBUB_INPUTSTREAM_OOD)
+ if (cptr == PARSERUTILS_INPUTSTREAM_OOD) {
return false;
+ } else if (cptr == PARSERUTILS_INPUTSTREAM_EOF) {
+ tokeniser->context.current_doctype.force_quirks = true;
+ tokeniser->state = STATE_BOGUS_DOCTYPE;
+ return true;
+ }
- if (tokeniser->context.match_doctype.count == 1 &&
- (c & ~0x20) == 'U') {
- tokeniser->context.match_doctype.count++;
- hubbub_inputstream_advance(tokeniser->input);
- } else if (tokeniser->context.match_doctype.count == 2 &&
- (c & ~0x20) == 'B') {
- tokeniser->context.match_doctype.count++;
- hubbub_inputstream_advance(tokeniser->input);
- } else if (tokeniser->context.match_doctype.count == 3 &&
- (c & ~0x20) == 'L') {
- tokeniser->context.match_doctype.count++;
- hubbub_inputstream_advance(tokeniser->input);
- } else if (tokeniser->context.match_doctype.count == 4 &&
- (c & ~0x20) == 'I') {
- tokeniser->context.match_doctype.count++;
- hubbub_inputstream_advance(tokeniser->input);
- } else if (tokeniser->context.match_doctype.count == 5 &&
- (c & ~0x20) == 'C') {
- tokeniser->state = HUBBUB_TOKENISER_STATE_BEFORE_DOCTYPE_PUBLIC;
- hubbub_inputstream_advance(tokeniser->input);
- } else {
- /* Rewind as many characters as have been matched */
- hubbub_inputstream_rewind(tokeniser->input,
- tokeniser->context.match_doctype.count);
+ uint8_t c = CHAR(cptr);
+
+ assert(tokeniser->context.match_doctype.count <= PUBLIC_LEN);
- tokeniser->state = HUBBUB_TOKENISER_STATE_BOGUS_DOCTYPE;
+ if (PUBLIC[tokeniser->context.match_doctype.count] != (c & ~0x20)) {
tokeniser->context.current_doctype.force_quirks = true;
+ tokeniser->state = STATE_BOGUS_DOCTYPE;
+ return true;
}
- return true;
-}
+ COLLECT_NOBUF(tokeniser->context.chars, len);
+
+ if (tokeniser->context.match_doctype.count == PUBLIC_LEN) {
+ tokeniser->state = STATE_BEFORE_DOCTYPE_PUBLIC;
+ }
+ tokeniser->context.match_doctype.count++;
+ return true;
+}
+#undef PUBLIC
+#undef PUBLIC_LEN
bool hubbub_tokeniser_handle_before_doctype_public(hubbub_tokeniser *tokeniser)
{
hubbub_doctype *cdoc = &tokeniser->context.current_doctype;
- uint32_t c = hubbub_inputstream_peek(tokeniser->input);
+ size_t len;
+ uintptr_t cptr = parserutils_inputstream_peek(tokeniser->input,
+ tokeniser->context.chars.len, &len);
- if (c == HUBBUB_INPUTSTREAM_OOD)
+ if (cptr == PARSERUTILS_INPUTSTREAM_OOD) {
return false;
+ } else if (cptr == PARSERUTILS_INPUTSTREAM_EOF) {
+ emit_current_doctype(tokeniser, true);
+ tokeniser->state = STATE_DATA;
+ return true;
+ }
+
+ uint8_t c = CHAR(cptr);
+ COLLECT_NOBUF(tokeniser->context.chars, len);
- if (c == '\t' || c == '\n' || c == '\f' || c == ' ') {
- hubbub_inputstream_advance(tokeniser->input);
+ if (c == '\t' || c == '\n' || c == '\f' || c == ' ' || c == '\r') {
+ /* pass over in silence */
} else if (c == '"') {
cdoc->public_missing = false;
- tokeniser->state = HUBBUB_TOKENISER_STATE_DOCTYPE_PUBLIC_DQ;
- hubbub_inputstream_advance(tokeniser->input);
+ cdoc->public_id.len = 0;
+ tokeniser->state = STATE_DOCTYPE_PUBLIC_DQ;
} else if (c == '\'') {
cdoc->public_missing = false;
- tokeniser->state = HUBBUB_TOKENISER_STATE_DOCTYPE_PUBLIC_SQ;
- hubbub_inputstream_advance(tokeniser->input);
+ cdoc->public_id.len = 0;
+ tokeniser->state = STATE_DOCTYPE_PUBLIC_SQ;
} else if (c == '>') {
- hubbub_token token;
-
- /* Emit doctype */
- token.type = HUBBUB_TOKEN_DOCTYPE;
- token.data.doctype = tokeniser->context.current_doctype;
- token.data.doctype.force_quirks = true;
-
- hubbub_tokeniser_emit_token(tokeniser, &token);
-
- tokeniser->state = HUBBUB_TOKENISER_STATE_DATA;
- hubbub_inputstream_advance(tokeniser->input);
- } else if (c == HUBBUB_INPUTSTREAM_EOF) {
- hubbub_token token;
-
- /* Emit doctype */
- token.type = HUBBUB_TOKEN_DOCTYPE;
- token.data.doctype = tokeniser->context.current_doctype;
- token.data.doctype.force_quirks = true;
-
- hubbub_tokeniser_emit_token(tokeniser, &token);
-
- tokeniser->state = HUBBUB_TOKENISER_STATE_DATA;
+ emit_current_doctype(tokeniser, true);
+ tokeniser->state = STATE_DATA;
} else {
cdoc->force_quirks = true;
- tokeniser->state = HUBBUB_TOKENISER_STATE_BOGUS_DOCTYPE;
- hubbub_inputstream_advance(tokeniser->input);
+ tokeniser->state = STATE_BOGUS_DOCTYPE;
}
return true;
@@ -2348,49 +2202,37 @@ bool hubbub_tokeniser_handle_before_doctype_public(hubbub_tokeniser *tokeniser)
bool hubbub_tokeniser_handle_doctype_public_dq(hubbub_tokeniser *tokeniser)
{
hubbub_doctype *cdoc = &tokeniser->context.current_doctype;
- uint32_t c = hubbub_inputstream_peek(tokeniser->input);
+ size_t len;
+ uintptr_t cptr = parserutils_inputstream_peek(tokeniser->input,
+ tokeniser->context.chars.len, &len);
- if (c == HUBBUB_INPUTSTREAM_OOD)
+ if (cptr == PARSERUTILS_INPUTSTREAM_OOD) {
return false;
+ } else if (cptr == PARSERUTILS_INPUTSTREAM_EOF) {
+ FINISH(cdoc->public_id);
+ emit_current_doctype(tokeniser, true);
+ tokeniser->state = STATE_DATA;
+ return true;
+ }
+
+ uint8_t c = CHAR(cptr);
+ COLLECT_NOBUF(tokeniser->context.chars, len);
if (c == '"') {
- tokeniser->state = HUBBUB_TOKENISER_STATE_AFTER_DOCTYPE_PUBLIC;
- hubbub_inputstream_advance(tokeniser->input);
+ FINISH(cdoc->public_id);
+ tokeniser->state = STATE_AFTER_DOCTYPE_PUBLIC;
} else if (c == '>') {
- hubbub_token token;
-
- /* Emit doctype */
- token.type = HUBBUB_TOKEN_DOCTYPE;
- token.data.doctype = tokeniser->context.current_doctype;
- token.data.doctype.force_quirks = true;
-
- hubbub_tokeniser_emit_token(tokeniser, &token);
-
- tokeniser->state = HUBBUB_TOKENISER_STATE_DATA;
- hubbub_inputstream_advance(tokeniser->input);
- } else if (c == HUBBUB_INPUTSTREAM_EOF) {
- hubbub_token token;
-
- /* Emit doctype */
- token.type = HUBBUB_TOKEN_DOCTYPE;
- token.data.doctype = tokeniser->context.current_doctype;
- token.data.doctype.force_quirks = true;
-
- hubbub_tokeniser_emit_token(tokeniser, &token);
-
- tokeniser->state = HUBBUB_TOKENISER_STATE_DATA;
+ FINISH(cdoc->public_id);
+ emit_current_doctype(tokeniser, true);
+ tokeniser->state = STATE_DATA;
+ } else if (c == '\0') {
+ if (cdoc->public_id.len == 0) {
+			START_BUF(cdoc->public_id, u_fffd, sizeof(u_fffd));
+		} else {
+			COLLECT_CHAR(cdoc->public_id, u_fffd, sizeof(u_fffd));
+ }
} else {
- uint32_t pos;
- size_t len;
-
- pos = hubbub_inputstream_cur_pos(tokeniser->input, &len);
-
- if (cdoc->public_id.len == 0)
- cdoc->public_id.data.off = pos;
-
- cdoc->public_id.len += len;
-
- hubbub_inputstream_advance(tokeniser->input);
+ COLLECT_MS(cdoc->public_id, cptr, len);
}
return true;
@@ -2399,49 +2241,39 @@ bool hubbub_tokeniser_handle_doctype_public_dq(hubbub_tokeniser *tokeniser)
bool hubbub_tokeniser_handle_doctype_public_sq(hubbub_tokeniser *tokeniser)
{
hubbub_doctype *cdoc = &tokeniser->context.current_doctype;
- uint32_t c = hubbub_inputstream_peek(tokeniser->input);
+ size_t len;
+ uintptr_t cptr = parserutils_inputstream_peek(tokeniser->input,
+ tokeniser->context.chars.len, &len);
- if (c == HUBBUB_INPUTSTREAM_OOD)
+ if (cptr == PARSERUTILS_INPUTSTREAM_OOD) {
return false;
+ } else if (cptr == PARSERUTILS_INPUTSTREAM_EOF) {
+ FINISH(cdoc->public_id);
+ emit_current_doctype(tokeniser, true);
+ tokeniser->state = STATE_DATA;
+ return true;
+ }
+
+ uint8_t c = CHAR(cptr);
+ COLLECT_NOBUF(tokeniser->context.chars, len);
if (c == '\'') {
- tokeniser->state = HUBBUB_TOKENISER_STATE_AFTER_DOCTYPE_PUBLIC;
- hubbub_inputstream_advance(tokeniser->input);
+ FINISH(cdoc->public_id);
+ tokeniser->state = STATE_AFTER_DOCTYPE_PUBLIC;
} else if (c == '>') {
- hubbub_token token;
-
- /* Emit doctype */
- token.type = HUBBUB_TOKEN_DOCTYPE;
- token.data.doctype = tokeniser->context.current_doctype;
- token.data.doctype.force_quirks = true;
-
- hubbub_tokeniser_emit_token(tokeniser, &token);
-
- tokeniser->state = HUBBUB_TOKENISER_STATE_DATA;
- hubbub_inputstream_advance(tokeniser->input);
- } else if (c == HUBBUB_INPUTSTREAM_EOF) {
- hubbub_token token;
-
- /* Emit doctype */
- token.type = HUBBUB_TOKEN_DOCTYPE;
- token.data.doctype = tokeniser->context.current_doctype;
- token.data.doctype.force_quirks = true;
-
- hubbub_tokeniser_emit_token(tokeniser, &token);
-
- tokeniser->state = HUBBUB_TOKENISER_STATE_DATA;
+ FINISH(cdoc->public_id);
+ emit_current_doctype(tokeniser, true);
+ tokeniser->state = STATE_DATA;
+ } else if (c == '\0') {
+ if (cdoc->public_id.len == 0) {
+ START_BUF(cdoc->public_id,
+ u_fffd, sizeof(u_fffd));
+ } else {
+ COLLECT_CHAR(cdoc->public_id,
+ u_fffd, sizeof(u_fffd));
+ }
} else {
- uint32_t pos;
- size_t len;
-
- pos = hubbub_inputstream_cur_pos(tokeniser->input, &len);
-
- if (cdoc->public_id.len == 0)
- cdoc->public_id.data.off = pos;
-
- cdoc->public_id.len += len;
-
- hubbub_inputstream_advance(tokeniser->input);
+ COLLECT_MS(cdoc->public_id, cptr, len);
}
return true;
@@ -2451,191 +2283,162 @@ bool hubbub_tokeniser_handle_doctype_public_sq(hubbub_tokeniser *tokeniser)
bool hubbub_tokeniser_handle_after_doctype_public(hubbub_tokeniser *tokeniser)
{
hubbub_doctype *cdoc = &tokeniser->context.current_doctype;
- uint32_t c = hubbub_inputstream_peek(tokeniser->input);
+ size_t len;
+ uintptr_t cptr = parserutils_inputstream_peek(tokeniser->input,
+ tokeniser->context.chars.len, &len);
- if (c == HUBBUB_INPUTSTREAM_OOD)
+ if (cptr == PARSERUTILS_INPUTSTREAM_OOD) {
return false;
+ } else if (cptr == PARSERUTILS_INPUTSTREAM_EOF) {
+ emit_current_doctype(tokeniser, true);
+ tokeniser->state = STATE_DATA;
+ return true;
+ }
- if (c == '\t' || c == '\n' || c == '\f' || c == ' ') {
- hubbub_inputstream_advance(tokeniser->input);
+ uint8_t c = CHAR(cptr);
+ COLLECT_NOBUF(tokeniser->context.chars, len);
+
+ if (c == '\t' || c == '\n' || c == '\f' || c == ' ' || c == '\r') {
+ /* pass over in silence */
} else if (c == '"') {
cdoc->system_missing = false;
- tokeniser->state = HUBBUB_TOKENISER_STATE_DOCTYPE_SYSTEM_DQ;
- hubbub_inputstream_advance(tokeniser->input);
+ cdoc->system_id.len = 0;
+
+ tokeniser->state = STATE_DOCTYPE_SYSTEM_DQ;
} else if (c == '\'') {
cdoc->system_missing = false;
- tokeniser->state = HUBBUB_TOKENISER_STATE_DOCTYPE_SYSTEM_SQ;
- hubbub_inputstream_advance(tokeniser->input);
- } else if (c == '>') {
- hubbub_token token;
-
- /* Emit doctype */
- token.type = HUBBUB_TOKEN_DOCTYPE;
- token.data.doctype = tokeniser->context.current_doctype;
-
- hubbub_tokeniser_emit_token(tokeniser, &token);
-
- tokeniser->state = HUBBUB_TOKENISER_STATE_DATA;
- hubbub_inputstream_advance(tokeniser->input);
- } else if (c == HUBBUB_INPUTSTREAM_EOF) {
- hubbub_token token;
-
- /* Emit doctype */
- token.type = HUBBUB_TOKEN_DOCTYPE;
- token.data.doctype = tokeniser->context.current_doctype;
- token.data.doctype.force_quirks = true;
+ cdoc->system_id.len = 0;
- hubbub_tokeniser_emit_token(tokeniser, &token);
-
- tokeniser->state = HUBBUB_TOKENISER_STATE_DATA;
+ tokeniser->state = STATE_DOCTYPE_SYSTEM_SQ;
+ } else if (c == '>') {
+ emit_current_doctype(tokeniser, false);
+ tokeniser->state = STATE_DATA;
} else {
cdoc->force_quirks = true;
- tokeniser->state = HUBBUB_TOKENISER_STATE_BOGUS_DOCTYPE;
- hubbub_inputstream_advance(tokeniser->input);
+ tokeniser->state = STATE_BOGUS_DOCTYPE;
}
return true;
}
+
+#define SYSTEM "SYSTEM"
+#define SYSTEM_LEN (SLEN(SYSTEM) - 1)
+
bool hubbub_tokeniser_handle_match_system(hubbub_tokeniser *tokeniser)
{
- uint32_t c = hubbub_inputstream_peek(tokeniser->input);
+ size_t len;
+ uintptr_t cptr = parserutils_inputstream_peek(tokeniser->input,
+ tokeniser->context.chars.len, &len);
- if (c == HUBBUB_INPUTSTREAM_OOD)
+ if (cptr == PARSERUTILS_INPUTSTREAM_OOD) {
return false;
+ } else if (cptr == PARSERUTILS_INPUTSTREAM_EOF) {
+ tokeniser->context.current_doctype.force_quirks = true;
+ tokeniser->state = STATE_BOGUS_DOCTYPE;
+ return true;
+ }
- if (tokeniser->context.match_doctype.count == 1 &&
- (c & ~0x20) == 'Y') {
- tokeniser->context.match_doctype.count++;
- hubbub_inputstream_advance(tokeniser->input);
- } else if (tokeniser->context.match_doctype.count == 2 &&
- (c & ~0x20) == 'S') {
- tokeniser->context.match_doctype.count++;
- hubbub_inputstream_advance(tokeniser->input);
- } else if (tokeniser->context.match_doctype.count == 3 &&
- (c & ~0x20) == 'T') {
- tokeniser->context.match_doctype.count++;
- hubbub_inputstream_advance(tokeniser->input);
- } else if (tokeniser->context.match_doctype.count == 4 &&
- (c & ~0x20) == 'E') {
- tokeniser->context.match_doctype.count++;
- hubbub_inputstream_advance(tokeniser->input);
- } else if (tokeniser->context.match_doctype.count == 5 &&
- (c & ~0x20) == 'M') {
- tokeniser->state = HUBBUB_TOKENISER_STATE_BEFORE_DOCTYPE_SYSTEM;
- hubbub_inputstream_advance(tokeniser->input);
- } else {
- /* Rewind as many characters as have been matched */
- hubbub_inputstream_rewind(tokeniser->input,
- tokeniser->context.match_doctype.count);
+ uint8_t c = CHAR(cptr);
- tokeniser->state = HUBBUB_TOKENISER_STATE_BOGUS_DOCTYPE;
+ assert(tokeniser->context.match_doctype.count <= SYSTEM_LEN);
+
+ if (SYSTEM[tokeniser->context.match_doctype.count] != (c & ~0x20)) {
tokeniser->context.current_doctype.force_quirks = true;
+ tokeniser->state = STATE_BOGUS_DOCTYPE;
+ return true;
}
+ COLLECT_NOBUF(tokeniser->context.chars, len);
+
+ if (tokeniser->context.match_doctype.count == SYSTEM_LEN) {
+ tokeniser->state = STATE_BEFORE_DOCTYPE_SYSTEM;
+ }
+
+ tokeniser->context.match_doctype.count++;
+
return true;
}
+#undef SYSTEM
+#undef SYSTEM_LEN
bool hubbub_tokeniser_handle_before_doctype_system(hubbub_tokeniser *tokeniser)
{
hubbub_doctype *cdoc = &tokeniser->context.current_doctype;
- uint32_t c = hubbub_inputstream_peek(tokeniser->input);
+ size_t len;
+ uintptr_t cptr = parserutils_inputstream_peek(tokeniser->input,
+ tokeniser->context.chars.len, &len);
- if (c == HUBBUB_INPUTSTREAM_OOD)
+ if (cptr == PARSERUTILS_INPUTSTREAM_OOD) {
return false;
+ } else if (cptr == PARSERUTILS_INPUTSTREAM_EOF) {
+ emit_current_doctype(tokeniser, true);
+ tokeniser->state = STATE_DATA;
+ return true;
+ }
+
+ uint8_t c = CHAR(cptr);
+ COLLECT_NOBUF(tokeniser->context.chars, len);
- if (c == '\t' || c == '\n' || c == '\f' || c == ' ') {
- hubbub_inputstream_advance(tokeniser->input);
+ if (c == '\t' || c == '\n' || c == '\f' || c == ' ' || c == '\r') {
} else if (c == '"') {
cdoc->system_missing = false;
- tokeniser->state = HUBBUB_TOKENISER_STATE_DOCTYPE_SYSTEM_DQ;
- hubbub_inputstream_advance(tokeniser->input);
+ cdoc->system_id.len = 0;
+
+ tokeniser->state = STATE_DOCTYPE_SYSTEM_DQ;
} else if (c == '\'') {
cdoc->system_missing = false;
- tokeniser->state = HUBBUB_TOKENISER_STATE_DOCTYPE_SYSTEM_SQ;
- hubbub_inputstream_advance(tokeniser->input);
- } else if (c == '>') {
- hubbub_token token;
-
- /* Emit doctype */
- token.type = HUBBUB_TOKEN_DOCTYPE;
- token.data.doctype = tokeniser->context.current_doctype;
- token.data.doctype.force_quirks = true;
-
- hubbub_tokeniser_emit_token(tokeniser, &token);
-
- tokeniser->state = HUBBUB_TOKENISER_STATE_DATA;
- hubbub_inputstream_advance(tokeniser->input);
- } else if (c == HUBBUB_INPUTSTREAM_EOF) {
- hubbub_token token;
-
- /* Emit doctype */
- token.type = HUBBUB_TOKEN_DOCTYPE;
- token.data.doctype = tokeniser->context.current_doctype;
- token.data.doctype.force_quirks = true;
-
- hubbub_tokeniser_emit_token(tokeniser, &token);
+ cdoc->system_id.len = 0;
- tokeniser->state = HUBBUB_TOKENISER_STATE_DATA;
+ tokeniser->state = STATE_DOCTYPE_SYSTEM_SQ;
+ } else if (c == '>') {
+ emit_current_doctype(tokeniser, true);
+ tokeniser->state = STATE_DATA;
} else {
cdoc->force_quirks = true;
- tokeniser->state = HUBBUB_TOKENISER_STATE_BOGUS_DOCTYPE;
- hubbub_inputstream_advance(tokeniser->input);
+ tokeniser->state = STATE_BOGUS_DOCTYPE;
}
return true;
}
-
-
bool hubbub_tokeniser_handle_doctype_system_dq(hubbub_tokeniser *tokeniser)
{
hubbub_doctype *cdoc = &tokeniser->context.current_doctype;
- uint32_t c = hubbub_inputstream_peek(tokeniser->input);
+ size_t len;
+ uintptr_t cptr = parserutils_inputstream_peek(tokeniser->input,
+ tokeniser->context.chars.len, &len);
- if (c == HUBBUB_INPUTSTREAM_OOD)
+ if (cptr == PARSERUTILS_INPUTSTREAM_OOD) {
return false;
+ } else if (cptr == PARSERUTILS_INPUTSTREAM_EOF) {
+ FINISH(cdoc->system_id);
+ emit_current_doctype(tokeniser, true);
+ tokeniser->state = STATE_DATA;
+ return true;
+ }
+
+ uint8_t c = CHAR(cptr);
+ COLLECT_NOBUF(tokeniser->context.chars, len);
if (c == '"') {
- tokeniser->state = HUBBUB_TOKENISER_STATE_AFTER_DOCTYPE_SYSTEM;
- hubbub_inputstream_advance(tokeniser->input);
+ FINISH(cdoc->system_id);
+ tokeniser->state = STATE_AFTER_DOCTYPE_SYSTEM;
} else if (c == '>') {
- hubbub_token token;
-
- /* Emit doctype */
- token.type = HUBBUB_TOKEN_DOCTYPE;
- token.data.doctype = tokeniser->context.current_doctype;
- token.data.doctype.force_quirks = true;
-
- hubbub_tokeniser_emit_token(tokeniser, &token);
-
- tokeniser->state = HUBBUB_TOKENISER_STATE_DATA;
- hubbub_inputstream_advance(tokeniser->input);
- } else if (c == HUBBUB_INPUTSTREAM_EOF) {
- hubbub_token token;
-
- /* Emit doctype */
- token.type = HUBBUB_TOKEN_DOCTYPE;
- token.data.doctype = tokeniser->context.current_doctype;
- token.data.doctype.force_quirks = true;
-
- hubbub_tokeniser_emit_token(tokeniser, &token);
-
- tokeniser->state = HUBBUB_TOKENISER_STATE_DATA;
+ FINISH(cdoc->system_id);
+ emit_current_doctype(tokeniser, true);
+ tokeniser->state = STATE_DATA;
+ } else if (c == '\0') {
+ if (cdoc->public_id.len == 0) {
+ START_BUF(cdoc->system_id, u_fffd, sizeof(u_fffd));
+ } else {
+ COLLECT_CHAR(cdoc->system_id,
+ u_fffd, sizeof(u_fffd));
+ }
} else {
- uint32_t pos;
- size_t len;
-
- pos = hubbub_inputstream_cur_pos(tokeniser->input, &len);
-
- if (cdoc->system_id.len == 0)
- cdoc->system_id.data.off = pos;
-
- cdoc->system_id.len += len;
-
- hubbub_inputstream_advance(tokeniser->input);
+ COLLECT_MS(cdoc->system_id, cptr, len);
}
return true;
@@ -2644,89 +2447,67 @@ bool hubbub_tokeniser_handle_doctype_system_dq(hubbub_tokeniser *tokeniser)
bool hubbub_tokeniser_handle_doctype_system_sq(hubbub_tokeniser *tokeniser)
{
hubbub_doctype *cdoc = &tokeniser->context.current_doctype;
- uint32_t c = hubbub_inputstream_peek(tokeniser->input);
+ size_t len;
+ uintptr_t cptr = parserutils_inputstream_peek(tokeniser->input,
+ tokeniser->context.chars.len, &len);
- if (c == HUBBUB_INPUTSTREAM_OOD)
+ if (cptr == PARSERUTILS_INPUTSTREAM_OOD) {
return false;
+ } else if (cptr == PARSERUTILS_INPUTSTREAM_EOF) {
+ FINISH(cdoc->system_id);
+ emit_current_doctype(tokeniser, true);
+ tokeniser->state = STATE_DATA;
+ return true;
+ }
+
+ uint8_t c = CHAR(cptr);
+ COLLECT_NOBUF(tokeniser->context.chars, len);
if (c == '\'') {
- tokeniser->state = HUBBUB_TOKENISER_STATE_AFTER_DOCTYPE_SYSTEM;
- hubbub_inputstream_advance(tokeniser->input);
+ FINISH(cdoc->system_id);
+ tokeniser->state = STATE_AFTER_DOCTYPE_SYSTEM;
} else if (c == '>') {
- hubbub_token token;
-
- /* Emit doctype */
- token.type = HUBBUB_TOKEN_DOCTYPE;
- token.data.doctype = tokeniser->context.current_doctype;
- token.data.doctype.force_quirks = true;
-
- hubbub_tokeniser_emit_token(tokeniser, &token);
-
- tokeniser->state = HUBBUB_TOKENISER_STATE_DATA;
- hubbub_inputstream_advance(tokeniser->input);
- } else if (c == HUBBUB_INPUTSTREAM_EOF) {
- hubbub_token token;
-
- /* Emit doctype */
- token.type = HUBBUB_TOKEN_DOCTYPE;
- token.data.doctype = tokeniser->context.current_doctype;
- token.data.doctype.force_quirks = true;
-
- hubbub_tokeniser_emit_token(tokeniser, &token);
-
- tokeniser->state = HUBBUB_TOKENISER_STATE_DATA;
+ FINISH(cdoc->system_id);
+ emit_current_doctype(tokeniser, true);
+ tokeniser->state = STATE_DATA;
+ } else if (c == '\0') {
+ if (cdoc->public_id.len == 0) {
+ START_BUF(cdoc->system_id, u_fffd, sizeof(u_fffd));
+ } else {
+ COLLECT_CHAR(cdoc->system_id,
+ u_fffd, sizeof(u_fffd));
+ }
} else {
- uint32_t pos;
- size_t len;
-
- pos = hubbub_inputstream_cur_pos(tokeniser->input, &len);
-
- if (cdoc->system_id.len == 0)
- cdoc->system_id.data.off = pos;
-
- cdoc->system_id.len += len;
-
- hubbub_inputstream_advance(tokeniser->input);
+ COLLECT_MS(cdoc->system_id, cptr, len);
}
return true;
}
-
bool hubbub_tokeniser_handle_after_doctype_system(hubbub_tokeniser *tokeniser)
{
- uint32_t c = hubbub_inputstream_peek(tokeniser->input);
+ size_t len;
+ uintptr_t cptr = parserutils_inputstream_peek(tokeniser->input,
+ tokeniser->context.chars.len, &len);
- if (c == HUBBUB_INPUTSTREAM_OOD)
+ if (cptr == PARSERUTILS_INPUTSTREAM_OOD) {
return false;
+ } else if (cptr == PARSERUTILS_INPUTSTREAM_EOF) {
+ emit_current_doctype(tokeniser, true);
+ tokeniser->state = STATE_DATA;
+ return true;
+ }
- if (c == '\t' || c == '\n' || c == '\f' || c == ' ') {
- hubbub_inputstream_advance(tokeniser->input);
- } else if (c == '>') {
- hubbub_token token;
-
- /* Emit doctype */
- token.type = HUBBUB_TOKEN_DOCTYPE;
- token.data.doctype = tokeniser->context.current_doctype;
-
- hubbub_tokeniser_emit_token(tokeniser, &token);
-
- tokeniser->state = HUBBUB_TOKENISER_STATE_DATA;
- hubbub_inputstream_advance(tokeniser->input);
- } else if (c == HUBBUB_INPUTSTREAM_EOF) {
- hubbub_token token;
-
- /* Emit doctype */
- token.type = HUBBUB_TOKEN_DOCTYPE;
- token.data.doctype = tokeniser->context.current_doctype;
- token.data.doctype.force_quirks = true;
-
- hubbub_tokeniser_emit_token(tokeniser, &token);
+ uint8_t c = CHAR(cptr);
+ COLLECT_NOBUF(tokeniser->context.chars, len);
- tokeniser->state = HUBBUB_TOKENISER_STATE_DATA;
+ if (c == '\t' || c == '\n' || c == '\f' || c == ' ' || c == '\r') {
+ /* pass over in silence */
+ } else if (c == '>') {
+ emit_current_doctype(tokeniser, false);
+ tokeniser->state = STATE_DATA;
} else {
- tokeniser->state = HUBBUB_TOKENISER_STATE_BOGUS_DOCTYPE;
- hubbub_inputstream_advance(tokeniser->input);
+ tokeniser->state = STATE_BOGUS_DOCTYPE;
}
return true;
@@ -2735,192 +2516,181 @@ bool hubbub_tokeniser_handle_after_doctype_system(hubbub_tokeniser *tokeniser)
bool hubbub_tokeniser_handle_bogus_doctype(hubbub_tokeniser *tokeniser)
{
- uint32_t c = hubbub_inputstream_peek(tokeniser->input);
+ size_t len;
+ uintptr_t cptr = parserutils_inputstream_peek(tokeniser->input,
+ tokeniser->context.chars.len, &len);
- if (c == HUBBUB_INPUTSTREAM_OOD)
+ if (cptr == PARSERUTILS_INPUTSTREAM_OOD) {
return false;
+ } else if (cptr == PARSERUTILS_INPUTSTREAM_EOF) {
+ emit_current_doctype(tokeniser, false);
+ tokeniser->state = STATE_DATA;
+ return true;
+ }
- if (c == '>') {
- hubbub_token token;
-
- /* Emit doctype */
- token.type = HUBBUB_TOKEN_DOCTYPE;
- token.data.doctype = tokeniser->context.current_doctype;
-
- hubbub_tokeniser_emit_token(tokeniser, &token);
-
- tokeniser->state = HUBBUB_TOKENISER_STATE_DATA;
- hubbub_inputstream_advance(tokeniser->input);
- } else if (c == HUBBUB_INPUTSTREAM_EOF) {
- hubbub_token token;
-
- /* Emit doctype */
- token.type = HUBBUB_TOKEN_DOCTYPE;
- token.data.doctype = tokeniser->context.current_doctype;
-
- hubbub_tokeniser_emit_token(tokeniser, &token);
+ uint8_t c = CHAR(cptr);
+ COLLECT_NOBUF(tokeniser->context.chars, len);
- tokeniser->state = HUBBUB_TOKENISER_STATE_DATA;
- } else {
- hubbub_inputstream_advance(tokeniser->input);
+ if (c == '>') {
+ emit_current_doctype(tokeniser, false);
+ tokeniser->state = STATE_DATA;
}
return true;
}
+
+#define CDATA "[CDATA["
+#define CDATA_LEN (SLEN(CDATA) - 1)
+
bool hubbub_tokeniser_handle_match_cdata(hubbub_tokeniser *tokeniser)
{
- uint32_t c = hubbub_inputstream_peek(tokeniser->input);
+ size_t len;
+ uintptr_t cptr = parserutils_inputstream_peek(tokeniser->input,
+ tokeniser->context.chars.len, &len);
- if (c == HUBBUB_INPUTSTREAM_OOD)
+ if (cptr == PARSERUTILS_INPUTSTREAM_OOD) {
return false;
+ } else if (cptr == PARSERUTILS_INPUTSTREAM_EOF) {
+ tokeniser->context.current_comment.len =
+ tokeniser->context.chars.len =
+ 0;
+ tokeniser->state = STATE_BOGUS_COMMENT;
+ return true;
+ }
- if (tokeniser->context.match_cdata.count == 1 && c == 'C') {
- tokeniser->context.match_cdata.count++;
- hubbub_inputstream_advance(tokeniser->input);
- } else if (tokeniser->context.match_cdata.count == 2 && c == 'D') {
- tokeniser->context.match_cdata.count++;
- hubbub_inputstream_advance(tokeniser->input);
- } else if (tokeniser->context.match_cdata.count == 3 && c == 'A') {
- tokeniser->context.match_cdata.count++;
- hubbub_inputstream_advance(tokeniser->input);
- } else if (tokeniser->context.match_cdata.count == 4 && c == 'T') {
- tokeniser->context.match_cdata.count++;
- hubbub_inputstream_advance(tokeniser->input);
- } else if (tokeniser->context.match_cdata.count == 5 && c == 'A') {
- tokeniser->context.match_cdata.count++;
- hubbub_inputstream_advance(tokeniser->input);
- } else if (tokeniser->context.match_cdata.count == 6 && c == '[') {
- tokeniser->context.current_chars.data.off = 0;
- tokeniser->context.current_chars.len = 0;
-
- tokeniser->state = HUBBUB_TOKENISER_STATE_CDATA_BLOCK;
- hubbub_inputstream_advance(tokeniser->input);
- } else {
- /* Rewind as many characters as we matched */
- hubbub_inputstream_rewind(tokeniser->input,
- tokeniser->context.match_cdata.count);
+ uint8_t c = CHAR(cptr);
+
+ assert(tokeniser->context.match_cdata.count <= CDATA_LEN);
+
+ if (CDATA[tokeniser->context.match_cdata.count] != (c & ~0x20)) {
+ tokeniser->context.current_comment.len =
+ tokeniser->context.chars.len =
+ 0;
+ tokeniser->state = STATE_BOGUS_COMMENT;
+ return true;
+ }
- tokeniser->context.current_comment.data.off = 0;
- tokeniser->context.current_comment.len = 0;
+ COLLECT_NOBUF(tokeniser->context.chars, len);
- tokeniser->state = HUBBUB_TOKENISER_STATE_BOGUS_COMMENT;
+ if (tokeniser->context.match_cdata.count == CDATA_LEN) {
+ parserutils_inputstream_advance(tokeniser->input,
+ tokeniser->context.match_cdata.count + len);
+ tokeniser->context.chars.len = 0;
+ tokeniser->context.match_cdata.end = 0;
+ tokeniser->state = STATE_CDATA_BLOCK;
}
+ tokeniser->context.match_cdata.count += len;
+
return true;
}
+#undef CDATA
+#undef CDATA_LEN
+
+
bool hubbub_tokeniser_handle_cdata_block(hubbub_tokeniser *tokeniser)
{
- uint32_t c = hubbub_inputstream_peek(tokeniser->input);
+ size_t len;
+ uintptr_t cptr = parserutils_inputstream_peek(tokeniser->input,
+ tokeniser->context.chars.len, &len);
- if (c == HUBBUB_INPUTSTREAM_OOD)
+ if (cptr == PARSERUTILS_INPUTSTREAM_OOD) {
return false;
+ } else if (cptr == PARSERUTILS_INPUTSTREAM_EOF) {
+ emit_character_token(tokeniser, &tokeniser->context.chars);
+ tokeniser->state = STATE_DATA;
+ return true;
+ }
+
+ uint8_t c = CHAR(cptr);
if (c == ']' && (tokeniser->context.match_cdata.end == 0 ||
tokeniser->context.match_cdata.end == 1)) {
- tokeniser->context.match_cdata.end++;
- hubbub_inputstream_advance(tokeniser->input);
+ COLLECT(tokeniser->context.chars, cptr, len);
+ tokeniser->context.match_cdata.end += len;
} else if (c == '>' && tokeniser->context.match_cdata.end == 2) {
- hubbub_token token;
+	/* Remove the two preceding "]" characters */
+ tokeniser->context.chars.len -= 2;
/* Emit any pending characters */
- token.type = HUBBUB_TOKEN_CHARACTER;
- token.data.character = tokeniser->context.current_chars;
-
- hubbub_tokeniser_emit_token(tokeniser, &token);
+ emit_character_token(tokeniser, &tokeniser->context.chars);
- tokeniser->state = HUBBUB_TOKENISER_STATE_DATA;
- hubbub_inputstream_advance(tokeniser->input);
- } else if (c == HUBBUB_INPUTSTREAM_EOF) {
- hubbub_token token;
-
- /* Emit any pending characters */
- token.type = HUBBUB_TOKEN_CHARACTER;
- token.data.character = tokeniser->context.current_chars;
-
- hubbub_tokeniser_emit_token(tokeniser, &token);
-
- tokeniser->state = HUBBUB_TOKENISER_STATE_DATA;
- } else {
- uint32_t pos;
- size_t len;
+ /* Now move past the "]]>" bit */
+ parserutils_inputstream_advance(tokeniser->input, SLEN("]]>"));
- if (tokeniser->context.match_cdata.end) {
- hubbub_inputstream_rewind(tokeniser->input,
- tokeniser->context.match_cdata.end);
- tokeniser->context.match_cdata.end = 0;
+ tokeniser->state = STATE_DATA;
+ } else if (c == '\0') {
+ if (tokeniser->context.chars.len > 0) {
+ /* Emit any pending characters */
+ emit_character_token(tokeniser,
+ &tokeniser->context.chars);
}
- /* Accumulate characters into buffer */
- pos = hubbub_inputstream_cur_pos(tokeniser->input, &len);
+ /* Perform NUL-byte replacement */
+ emit_character_token(tokeniser, &u_fffd_str);
- if (tokeniser->context.current_chars.len == 0)
- tokeniser->context.current_chars.data.off = pos;
-
- tokeniser->context.current_chars.len += len;
-
- hubbub_inputstream_advance(tokeniser->input);
+ parserutils_inputstream_advance(tokeniser->input, len);
+ tokeniser->context.match_cdata.end = 0;
+ } else {
+ COLLECT_MS_NOBUF(tokeniser->context.chars, cptr, len);
+ tokeniser->context.match_cdata.end = 0;
}
return true;
}
-bool hubbub_tokeniser_consume_character_reference(hubbub_tokeniser *tokeniser)
+bool hubbub_tokeniser_consume_character_reference(hubbub_tokeniser *tokeniser, size_t pos)
{
uint32_t allowed_char = tokeniser->context.allowed_char;
- uint32_t c;
- uint32_t pos;
size_t len;
+ uintptr_t cptr = parserutils_inputstream_peek(tokeniser->input,
+ pos, &len);
+
+	/* We should always have started on a non-OOD character */
+ assert(cptr != PARSERUTILS_INPUTSTREAM_OOD);
+
+ size_t off = pos + len;
+
+ /* Look at the character after the ampersand */
+ cptr = parserutils_inputstream_peek(tokeniser->input, off, &len);
+
+ if (cptr == PARSERUTILS_INPUTSTREAM_OOD)
+ return false;
- pos = hubbub_inputstream_cur_pos(tokeniser->input, &len);
+ uint8_t c = CHAR(cptr);
/* Set things up */
- tokeniser->context.match_entity.str.data.off = pos;
- tokeniser->context.match_entity.str.len = len;
- tokeniser->context.match_entity.poss_len = 0;
+ tokeniser->context.match_entity.offset = off;
+ tokeniser->context.match_entity.poss_length = 0;
+ tokeniser->context.match_entity.length = 0;
tokeniser->context.match_entity.base = 0;
tokeniser->context.match_entity.codepoint = 0;
tokeniser->context.match_entity.had_data = false;
tokeniser->context.match_entity.return_state = tokeniser->state;
tokeniser->context.match_entity.complete = false;
- tokeniser->context.match_entity.done_setup = true;
tokeniser->context.match_entity.overflow = false;
tokeniser->context.match_entity.context = NULL;
tokeniser->context.match_entity.prev_len = len;
- hubbub_inputstream_advance(tokeniser->input);
-
- c = hubbub_inputstream_peek(tokeniser->input);
-
- if (c == HUBBUB_INPUTSTREAM_OOD) {
- /* rewind because we need more data */
- hubbub_inputstream_rewind(tokeniser->input, 1);
- return false;
- }
-
/* Reset allowed character for future calls */
tokeniser->context.allowed_char = '\0';
if (c == '\t' || c == '\n' || c == '\f' || c == ' ' ||
- c == '<' || c == '&' || c == HUBBUB_INPUTSTREAM_EOF ||
- (allowed_char && c == allowed_char)) {
+ c == '<' || c == '&' ||
+ cptr == PARSERUTILS_INPUTSTREAM_EOF ||
+ (allowed_char && c == allowed_char)) {
tokeniser->context.match_entity.complete = true;
- /* rewind to the '&' (de-consume) */
- hubbub_inputstream_rewind(tokeniser->input, 1);
- return true;
+ tokeniser->context.match_entity.codepoint = 0;
} else if (c == '#') {
- pos = hubbub_inputstream_cur_pos(tokeniser->input, &len);
-
- tokeniser->context.match_entity.str.len += len;
-
- tokeniser->state = HUBBUB_TOKENISER_STATE_NUMBERED_ENTITY;
- hubbub_inputstream_advance(tokeniser->input);
+ tokeniser->context.match_entity.length += len;
+ tokeniser->state = STATE_NUMBERED_ENTITY;
} else {
- tokeniser->state = HUBBUB_TOKENISER_STATE_NAMED_ENTITY;
+ tokeniser->state = STATE_NAMED_ENTITY;
}
return true;
@@ -2930,47 +2700,44 @@ bool hubbub_tokeniser_consume_character_reference(hubbub_tokeniser *tokeniser)
bool hubbub_tokeniser_handle_numbered_entity(hubbub_tokeniser *tokeniser)
{
hubbub_tokeniser_context *ctx = &tokeniser->context;
- uint32_t c = hubbub_inputstream_peek(tokeniser->input);
- uint32_t pos;
+
size_t len;
- hubbub_error error;
+ uintptr_t cptr = parserutils_inputstream_peek(tokeniser->input,
+ ctx->match_entity.offset + ctx->match_entity.length,
+ &len);
- if (c == HUBBUB_INPUTSTREAM_OOD)
+ if (cptr == PARSERUTILS_INPUTSTREAM_OOD)
return false;
+ uint8_t c = CHAR(cptr);
+
if (ctx->match_entity.base == 0) {
if ((c & ~0x20) == 'X') {
ctx->match_entity.base = 16;
-
- pos = hubbub_inputstream_cur_pos(tokeniser->input,
- &len);
- ctx->match_entity.str.len += len;
-
- hubbub_inputstream_advance(tokeniser->input);
+ ctx->match_entity.length += len;
} else {
ctx->match_entity.base = 10;
}
}
- while ((c = hubbub_inputstream_peek(tokeniser->input)) !=
- HUBBUB_INPUTSTREAM_EOF &&
- c != HUBBUB_INPUTSTREAM_OOD) {
+ while ((cptr = parserutils_inputstream_peek(tokeniser->input,
+ ctx->match_entity.offset + ctx->match_entity.length,
+ &len)) != PARSERUTILS_INPUTSTREAM_EOF &&
+ cptr != PARSERUTILS_INPUTSTREAM_OOD) {
+ c = CHAR(cptr);
+
if (ctx->match_entity.base == 10 &&
('0' <= c && c <= '9')) {
ctx->match_entity.had_data = true;
-
ctx->match_entity.codepoint =
ctx->match_entity.codepoint * 10 + (c - '0');
- pos = hubbub_inputstream_cur_pos(tokeniser->input,
- &len);
- ctx->match_entity.str.len += len;
+ ctx->match_entity.length += len;
} else if (ctx->match_entity.base == 16 &&
(('0' <= c && c <= '9') ||
('A' <= (c & ~0x20) &&
(c & ~0x20) <= 'F'))) {
ctx->match_entity.had_data = true;
-
ctx->match_entity.codepoint *= 16;
if ('0' <= c && c <= '9') {
@@ -2980,9 +2747,7 @@ bool hubbub_tokeniser_handle_numbered_entity(hubbub_tokeniser *tokeniser)
((c & ~0x20) - 'A' + 10);
}
- pos = hubbub_inputstream_cur_pos(tokeniser->input,
- &len);
- ctx->match_entity.str.len += len;
+ ctx->match_entity.length += len;
} else {
break;
}
@@ -2990,25 +2755,18 @@ bool hubbub_tokeniser_handle_numbered_entity(hubbub_tokeniser *tokeniser)
if (ctx->match_entity.codepoint >= 0x10FFFF) {
ctx->match_entity.overflow = true;
}
-
- hubbub_inputstream_advance(tokeniser->input);
}
- if (c == HUBBUB_INPUTSTREAM_OOD)
+ if (cptr == PARSERUTILS_INPUTSTREAM_OOD)
return false;
+ c = CHAR(cptr);
+
/* Eat trailing semicolon, if any */
if (c == ';') {
- pos = hubbub_inputstream_cur_pos(tokeniser->input, &len);
- ctx->match_entity.str.len += len;
-
- hubbub_inputstream_advance(tokeniser->input);
+ ctx->match_entity.length += len;
}
- /* Rewind the inputstream to start of matched sequence */
- hubbub_inputstream_rewind(tokeniser->input,
- ctx->match_entity.str.len);
-
/* Had data, so calculate final codepoint */
if (ctx->match_entity.had_data) {
uint32_t cp = ctx->match_entity.codepoint;
@@ -3028,19 +2786,9 @@ bool hubbub_tokeniser_handle_numbered_entity(hubbub_tokeniser *tokeniser)
cp = 0xFFFD;
}
- /* And replace the matched range with it */
- error = hubbub_inputstream_replace_range(tokeniser->input,
- ctx->match_entity.str.data.off,
- ctx->match_entity.str.len,
- cp);
- if (error != HUBBUB_OK) {
- /** \todo handle memory exhaustion */
- }
+ ctx->match_entity.codepoint = cp;
}
- /* Reset for next time */
- ctx->match_entity.done_setup = false;
-
/* Flag completion */
ctx->match_entity.complete = true;
@@ -3053,61 +2801,60 @@ bool hubbub_tokeniser_handle_numbered_entity(hubbub_tokeniser *tokeniser)
bool hubbub_tokeniser_handle_named_entity(hubbub_tokeniser *tokeniser)
{
hubbub_tokeniser_context *ctx = &tokeniser->context;
- uint32_t c;
- uint32_t pos;
+
size_t len;
- hubbub_error error;
+ uintptr_t cptr = parserutils_inputstream_peek(tokeniser->input,
+ ctx->match_entity.offset, &len);
+
+ if (cptr == PARSERUTILS_INPUTSTREAM_OOD)
+ return false;
+
+ uint8_t c = CHAR(cptr);
- while ((c = hubbub_inputstream_peek(tokeniser->input)) !=
- HUBBUB_INPUTSTREAM_EOF &&
- c != HUBBUB_INPUTSTREAM_OOD) {
+ while ((cptr = parserutils_inputstream_peek(tokeniser->input,
+ ctx->match_entity.offset +
+ ctx->match_entity.poss_length,
+ &len)) !=
+ PARSERUTILS_INPUTSTREAM_EOF &&
+ cptr != PARSERUTILS_INPUTSTREAM_OOD) {
uint32_t cp;
+ c = CHAR(cptr);
+
if (c > 0x7F) {
/* Entity names are ASCII only */
break;
}
- error = hubbub_entities_search_step((uint8_t) c,
- &cp,
+ hubbub_error error = hubbub_entities_search_step(c, &cp,
&ctx->match_entity.context);
if (error == HUBBUB_OK) {
/* Had a match - store it for later */
ctx->match_entity.codepoint = cp;
- pos = hubbub_inputstream_cur_pos(tokeniser->input,
- &len);
- ctx->match_entity.str.len += len;
- ctx->match_entity.str.len += ctx->match_entity.poss_len;
- ctx->match_entity.poss_len = 0;
-
- /* And cache length, for replacement */
- ctx->match_entity.prev_len =
- ctx->match_entity.str.len;
+ ctx->match_entity.length =
+ ctx->match_entity.poss_length + len;
+ ctx->match_entity.poss_length =
+ ctx->match_entity.length;
} else if (error == HUBBUB_INVALID) {
/* No further matches - use last found */
break;
} else {
- pos = hubbub_inputstream_cur_pos(tokeniser->input,
- &len);
- ctx->match_entity.poss_len += len;
+ /* Need more data */
+ ctx->match_entity.poss_length += len;
}
-
- hubbub_inputstream_advance(tokeniser->input);
}
- if (c == HUBBUB_INPUTSTREAM_OOD) {
+ if (cptr == PARSERUTILS_INPUTSTREAM_OOD)
return false;
- }
- /* Rewind back possible matches, if any */
- hubbub_inputstream_rewind(tokeniser->input,
- ctx->match_entity.poss_len);
-
- c = hubbub_inputstream_peek(tokeniser->input);
+ cptr = parserutils_inputstream_peek(tokeniser->input,
+ ctx->match_entity.offset + ctx->match_entity.length,
+ &len);
+ c = CHAR(cptr);
if ((tokeniser->context.match_entity.return_state ==
- HUBBUB_TOKENISER_STATE_CHARACTER_REFERENCE_IN_ATTRIBUTE_VALUE) &&
+ STATE_CHARACTER_REFERENCE_IN_ATTRIBUTE_VALUE) &&
(c != ';') &&
((0x0030 <= c && c <= 0x0039) ||
(0x0041 <= c && c <= 0x005A) ||
@@ -3115,26 +2862,6 @@ bool hubbub_tokeniser_handle_named_entity(hubbub_tokeniser *tokeniser)
ctx->match_entity.codepoint = 0;
}
- /* Rewind the inputstream to start of processed sequence */
- hubbub_inputstream_rewind(tokeniser->input,
- ctx->match_entity.str.len);
-
- pos = hubbub_inputstream_cur_pos(tokeniser->input, &len);
-
- /* Now, replace range, if we found a named entity */
- if (ctx->match_entity.codepoint != 0) {
- error = hubbub_inputstream_replace_range(tokeniser->input,
- ctx->match_entity.str.data.off,
- ctx->match_entity.prev_len,
- ctx->match_entity.codepoint);
- if (error != HUBBUB_OK) {
- /** \todo handle memory exhaustion */
- }
- }
-
- /* Reset for next time */
- ctx->match_entity.done_setup = false;
-
/* Flag completion */
ctx->match_entity.complete = true;
@@ -3144,24 +2871,6 @@ bool hubbub_tokeniser_handle_named_entity(hubbub_tokeniser *tokeniser)
return true;
}
-/**
- * Handle input stream buffer moving
- *
- * \param buffer Pointer to buffer
- * \param len Length of data in buffer (bytes)
- * \param pw Pointer to our context
- */
-void hubbub_tokeniser_buffer_moved_handler(const uint8_t *buffer,
- size_t len, void *pw)
-{
- hubbub_tokeniser *tok = (hubbub_tokeniser *) pw;
-
- tok->input_buffer = buffer;
- tok->input_buffer_len = len;
-
- if (tok->buffer_handler != NULL)
- tok->buffer_handler(buffer, len, tok->buffer_pw);
-}
/**
* Emit a token, performing sanity checks if necessary
@@ -3175,18 +2884,6 @@ void hubbub_tokeniser_emit_token(hubbub_tokeniser *tokeniser,
assert(tokeniser != NULL);
assert(token != NULL);
- if (token->type == HUBBUB_TOKEN_START_TAG) {
- tokeniser->context.last_start_tag_name = token->data.tag.name;
- token->data.tag.ns = HUBBUB_NS_HTML;
- } else if (token->type == HUBBUB_TOKEN_END_TAG) {
- tokeniser->content_model = HUBBUB_CONTENT_MODEL_PCDATA;
- }
-
-
- /* Nothing to do if there's no registered handler */
- if (tokeniser->token_handler == NULL)
- return;
-
if (token->type == HUBBUB_TOKEN_START_TAG ||
token->type == HUBBUB_TOKEN_END_TAG) {
uint32_t i, j;
@@ -3194,6 +2891,8 @@ void hubbub_tokeniser_emit_token(hubbub_tokeniser *tokeniser,
hubbub_attribute *attrs =
token->data.tag.attributes;
+ token->data.tag.ns = HUBBUB_NS_HTML;
+
/* Discard duplicate attributes */
for (i = 0; i < n_attributes; i++) {
for (j = 0; j < n_attributes; j++) {
@@ -3202,10 +2901,8 @@ void hubbub_tokeniser_emit_token(hubbub_tokeniser *tokeniser,
if (j == i ||
attrs[i].name.len !=
attrs[j].name.len ||
- hubbub_inputstream_compare_range_cs(
- tokeniser->input,
- attrs[i].name.data.off,
- attrs[j].name.data.off,
+ strncmp((char *)attrs[i].name.ptr,
+ (char *)attrs[j].name.ptr,
attrs[i].name.len) != 0) {
/* Attributes don't match */
continue;
@@ -3233,5 +2930,35 @@ void hubbub_tokeniser_emit_token(hubbub_tokeniser *tokeniser,
}
/* Finally, emit token */
- tokeniser->token_handler(token, tokeniser->token_pw);
+ if (tokeniser->token_handler)
+ tokeniser->token_handler(token, tokeniser->token_pw);
+
+ if (token->type == HUBBUB_TOKEN_START_TAG) {
+ if (token->data.tag.name.len <
+ sizeof(tokeniser->context.last_start_tag_name)) {
+ strncpy((char *)tokeniser->context.last_start_tag_name,
+ (const char *)token->data.tag.name.ptr,
+ token->data.tag.name.len);
+ tokeniser->context.last_start_tag_len =
+ token->data.tag.name.len;
+ } else {
+ tokeniser->context.last_start_tag_name[0] = '\0';
+ tokeniser->context.last_start_tag_len = 0;
+ }
+ } else if (token->type == HUBBUB_TOKEN_END_TAG) {
+ tokeniser->content_model = HUBBUB_CONTENT_MODEL_PCDATA;
+ }
+
+ if (tokeniser->buffer->length) {
+ /* Discard current buffer */
+ parserutils_buffer_discard(tokeniser->buffer, 0,
+ tokeniser->buffer->length);
+ }
+
+ /* Advance the pointer */
+ if (tokeniser->context.chars.len) {
+ parserutils_inputstream_advance(tokeniser->input,
+ tokeniser->context.chars.len);
+ tokeniser->context.chars.len = 0;
+ }
}
diff --git a/src/tokeniser/tokeniser.h b/src/tokeniser/tokeniser.h
index 2292cd7..ffc10fe 100644
--- a/src/tokeniser/tokeniser.h
+++ b/src/tokeniser/tokeniser.h
@@ -15,7 +15,7 @@
#include <hubbub/functypes.h>
#include <hubbub/types.h>
-#include "input/inputstream.h"
+#include <parserutils/input/inputstream.h>
typedef struct hubbub_tokeniser hubbub_tokeniser;
@@ -24,7 +24,6 @@ typedef struct hubbub_tokeniser hubbub_tokeniser;
*/
typedef enum hubbub_tokeniser_opttype {
HUBBUB_TOKENISER_TOKEN_HANDLER,
- HUBBUB_TOKENISER_BUFFER_HANDLER,
HUBBUB_TOKENISER_ERROR_HANDLER,
HUBBUB_TOKENISER_CONTENT_MODEL,
HUBBUB_TOKENISER_PROCESS_CDATA
@@ -40,11 +39,6 @@ typedef union hubbub_tokeniser_optparams {
} token_handler;
struct {
- hubbub_buffer_handler handler;
- void *pw;
- } buffer_handler;
-
- struct {
hubbub_error_handler handler;
void *pw;
} error_handler;
@@ -57,7 +51,7 @@ typedef union hubbub_tokeniser_optparams {
} hubbub_tokeniser_optparams;
/* Create a hubbub tokeniser */
-hubbub_tokeniser *hubbub_tokeniser_create(hubbub_inputstream *input,
+hubbub_tokeniser *hubbub_tokeniser_create(parserutils_inputstream *input,
hubbub_alloc alloc, void *pw);
/* Destroy a hubbub tokeniser */
void hubbub_tokeniser_destroy(hubbub_tokeniser *tokeniser);
diff --git a/src/treebuilder/after_body.c b/src/treebuilder/after_body.c
index 65f4e53..98191c0 100644
--- a/src/treebuilder/after_body.c
+++ b/src/treebuilder/after_body.c
@@ -31,9 +31,7 @@ bool handle_after_body(hubbub_treebuilder *treebuilder,
{
/* mostly cribbed from process_characters_expect_whitespace */
- const uint8_t *data = treebuilder->input_buffer +
- token->data.character.data.off;
-
+ const uint8_t *data = token->data.character.ptr;
size_t len = token->data.character.len;
size_t c;
@@ -57,7 +55,7 @@ bool handle_after_body(hubbub_treebuilder *treebuilder,
/* Anything else, switch to in body */
if (c != len) {
/* Update token data to strip leading whitespace */
- ((hubbub_token *) token)->data.character.data.off += c;
+ ((hubbub_token *) token)->data.character.ptr += c;
((hubbub_token *) token)->data.character.len -= c;
treebuilder->context.mode = IN_BODY;
diff --git a/src/treebuilder/after_head.c b/src/treebuilder/after_head.c
index 0243727..5bd4b40 100644
--- a/src/treebuilder/after_head.c
+++ b/src/treebuilder/after_head.c
@@ -100,8 +100,7 @@ bool handle_after_head(hubbub_treebuilder *treebuilder,
if (reprocess) {
/* Manufacture body */
tag.ns = HUBBUB_NS_HTML;
- tag.name.type = HUBBUB_STRING_PTR;
- tag.name.data.ptr = (const uint8_t *) "body";
+ tag.name.ptr = (const uint8_t *) "body";
tag.name.len = SLEN("body");
tag.n_attributes = 0;
diff --git a/src/treebuilder/before_head.c b/src/treebuilder/before_head.c
index 459a411..eb9c917 100644
--- a/src/treebuilder/before_head.c
+++ b/src/treebuilder/before_head.c
@@ -78,8 +78,7 @@ bool handle_before_head(hubbub_treebuilder *treebuilder,
if (reprocess) {
/* Manufacture head tag */
tag.ns = HUBBUB_NS_HTML;
- tag.name.type = HUBBUB_STRING_PTR;
- tag.name.data.ptr = (const uint8_t *) "head";
+ tag.name.ptr = (const uint8_t *) "head";
tag.name.len = SLEN("head");
tag.n_attributes = 0;
diff --git a/src/treebuilder/before_html.c b/src/treebuilder/before_html.c
index 8401087..70eebde 100644
--- a/src/treebuilder/before_html.c
+++ b/src/treebuilder/before_html.c
@@ -73,8 +73,7 @@ bool handle_before_html(hubbub_treebuilder *treebuilder,
/** \todo UTF-16 */
tag.ns = HUBBUB_NS_HTML;
- tag.name.type = HUBBUB_STRING_PTR;
- tag.name.data.ptr = (const uint8_t *) "html";
+ tag.name.ptr = (const uint8_t *) "html";
tag.name.len = SLEN("html");
tag.n_attributes = 0;
diff --git a/src/treebuilder/generic_rcdata.c b/src/treebuilder/generic_rcdata.c
index 0ab3b8c..e15cf97 100644
--- a/src/treebuilder/generic_rcdata.c
+++ b/src/treebuilder/generic_rcdata.c
@@ -35,20 +35,16 @@ bool handle_generic_rcdata(hubbub_treebuilder *treebuilder,
switch (token->type) {
case HUBBUB_TOKEN_CHARACTER:
- if (treebuilder->context.collect.string.len == 0) {
- treebuilder->context.collect.string.data.off =
- token->data.character.data.off;
- }
- treebuilder->context.collect.string.len +=
- token->data.character.len;
+ treebuilder->context.collect.string =
+ token->data.character;
if (treebuilder->context.strip_leading_lr) {
- const uint8_t *str = treebuilder->input_buffer +
- treebuilder->context.collect.string.data.off;
+ const uint8_t *str =
+ treebuilder->context.collect.string.ptr;
/** \todo UTF-16 */
if (*str == '\n') {
- treebuilder->context.collect.string.data.off++;
+ treebuilder->context.collect.string.ptr++;
treebuilder->context.collect.string.len--;
}
@@ -79,7 +75,7 @@ bool handle_generic_rcdata(hubbub_treebuilder *treebuilder,
break;
}
- if (done && treebuilder->context.collect.string.len) {
+ if (treebuilder->context.collect.string.len) {
int success;
void *text, *appended;
@@ -107,6 +103,7 @@ bool handle_generic_rcdata(hubbub_treebuilder *treebuilder,
treebuilder->tree_handler->unref_node(
treebuilder->tree_handler->ctx, text);
+ treebuilder->context.collect.string.len = 0;
}
if (done) {
diff --git a/src/treebuilder/in_body.c b/src/treebuilder/in_body.c
index 57b346d..699152e 100644
--- a/src/treebuilder/in_body.c
+++ b/src/treebuilder/in_body.c
@@ -185,12 +185,11 @@ void process_character(hubbub_treebuilder *treebuilder,
reconstruct_active_formatting_list(treebuilder);
if (treebuilder->context.strip_leading_lr) {
- const uint8_t *str =
- treebuilder->input_buffer + dummy.data.off;
+ const uint8_t *str = dummy.ptr;
/** \todo UTF-16 */
if (*str == '\n') {
- dummy.data.off++;
+ dummy.ptr++;
dummy.len--;
}
@@ -627,12 +626,13 @@ void process_a_in_body(hubbub_treebuilder *treebuilder,
/* Remove from formatting list, if it's still there */
if (entry2 == entry && entry2->details.node == node) {
+ hubbub_ns ons;
element_type otype;
void *onode;
uint32_t oindex;
formatting_list_remove(treebuilder, entry,
- &otype, &onode, &oindex);
+ &ons, &otype, &onode, &oindex);
treebuilder->tree_handler->unref_node(
treebuilder->tree_handler->ctx, onode);
@@ -657,7 +657,7 @@ void process_a_in_body(hubbub_treebuilder *treebuilder,
treebuilder->context.element_stack[
treebuilder->context.current_node].node);
- formatting_list_append(treebuilder, A,
+ formatting_list_append(treebuilder, token->data.tag.ns, A,
treebuilder->context.element_stack[
treebuilder->context.current_node].node,
treebuilder->context.current_node);
@@ -682,7 +682,7 @@ void process_presentational_in_body(hubbub_treebuilder *treebuilder,
treebuilder->context.element_stack[
treebuilder->context.current_node].node);
- formatting_list_append(treebuilder, type,
+ formatting_list_append(treebuilder, token->data.tag.ns, type,
treebuilder->context.element_stack[
treebuilder->context.current_node].node,
treebuilder->context.current_node);
@@ -716,7 +716,7 @@ void process_nobr_in_body(hubbub_treebuilder *treebuilder,
treebuilder->context.element_stack[
treebuilder->context.current_node].node);
- formatting_list_append(treebuilder, NOBR,
+ formatting_list_append(treebuilder, token->data.tag.ns, NOBR,
treebuilder->context.element_stack[
treebuilder->context.current_node].node,
treebuilder->context.current_node);
@@ -756,7 +756,7 @@ void process_button_in_body(hubbub_treebuilder *treebuilder,
treebuilder->context.element_stack[
treebuilder->context.current_node].node);
- formatting_list_append(treebuilder, BUTTON,
+ formatting_list_append(treebuilder, token->data.tag.ns, BUTTON,
treebuilder->context.element_stack[
treebuilder->context.current_node].node,
treebuilder->context.current_node);
@@ -781,7 +781,7 @@ void process_applet_marquee_object_in_body(hubbub_treebuilder *treebuilder,
treebuilder->context.element_stack[
treebuilder->context.current_node].node);
- formatting_list_append(treebuilder, type,
+ formatting_list_append(treebuilder, token->data.tag.ns, type,
treebuilder->context.element_stack[
treebuilder->context.current_node].node,
treebuilder->context.current_node);
@@ -816,8 +816,7 @@ void process_image_in_body(hubbub_treebuilder *treebuilder,
/** \todo UTF-16 */
tag.ns = HUBBUB_NS_HTML;
- tag.name.type = HUBBUB_STRING_PTR;
- tag.name.data.ptr = (const uint8_t *) "img";
+ tag.name.ptr = (const uint8_t *) "img";
tag.name.len = SLEN("img");
tag.n_attributes = token->data.tag.n_attributes;
@@ -883,7 +882,7 @@ void process_isindex_in_body(hubbub_treebuilder *treebuilder,
/* First up, clone the token's attributes */
if (token->data.tag.n_attributes > 0) {
- attrs = treebuilder->alloc(NULL,
+ attrs = treebuilder->alloc(NULL,
(token->data.tag.n_attributes + 1) *
sizeof(hubbub_attribute),
treebuilder->alloc_pw);
@@ -894,12 +893,11 @@ void process_isindex_in_body(hubbub_treebuilder *treebuilder,
for (uint32_t i = 0; i < token->data.tag.n_attributes; i++) {
hubbub_attribute *attr = &token->data.tag.attributes[i];
- const uint8_t *name = treebuilder->input_buffer +
- attr->name.data.off;
+ const uint8_t *name = attr->name.ptr;
- if (strncmp((const char *) name, "action",
+ if (strncmp((const char *) name, "action",
attr->name.len) == 0) {
- action = attr;
+ action = attr;
} else if (strncmp((const char *) name, "prompt",
attr->name.len) == 0) {
prompt = attr;
@@ -911,11 +909,9 @@ void process_isindex_in_body(hubbub_treebuilder *treebuilder,
}
attrs[n_attrs].ns = HUBBUB_NS_HTML;
- attrs[n_attrs].name.type = HUBBUB_STRING_PTR;
- attrs[n_attrs].name.data.ptr = (const uint8_t *) "name";
+ attrs[n_attrs].name.ptr = (const uint8_t *) "name";
attrs[n_attrs].name.len = SLEN("name");
- attrs[n_attrs].value.type = HUBBUB_STRING_PTR;
- attrs[n_attrs].value.data.ptr = (const uint8_t *) "isindex";
+ attrs[n_attrs].value.ptr = (const uint8_t *) "isindex";
attrs[n_attrs].value.len = SLEN("isindex");
n_attrs++;
}
@@ -925,10 +921,9 @@ void process_isindex_in_body(hubbub_treebuilder *treebuilder,
/* Set up dummy as a start tag token */
dummy.type = HUBBUB_TOKEN_START_TAG;
dummy.data.tag.ns = HUBBUB_NS_HTML;
- dummy.data.tag.name.type = HUBBUB_STRING_PTR;
/* Act as if <form> were seen */
- dummy.data.tag.name.data.ptr = (const uint8_t *) "form";
+ dummy.data.tag.name.ptr = (const uint8_t *) "form";
dummy.data.tag.name.len = SLEN("form");
dummy.data.tag.n_attributes = action != NULL ? 1 : 0;
@@ -937,7 +932,7 @@ void process_isindex_in_body(hubbub_treebuilder *treebuilder,
process_form_in_body(treebuilder, &dummy);
/* Act as if <hr> were seen */
- dummy.data.tag.name.data.ptr = (const uint8_t *) "hr";
+ dummy.data.tag.name.ptr = (const uint8_t *) "hr";
dummy.data.tag.name.len = SLEN("hr");
dummy.data.tag.n_attributes = 0;
dummy.data.tag.attributes = NULL;
@@ -945,7 +940,7 @@ void process_isindex_in_body(hubbub_treebuilder *treebuilder,
process_hr_in_body(treebuilder, &dummy);
/* Act as if <p> were seen */
- dummy.data.tag.name.data.ptr = (const uint8_t *) "p";
+ dummy.data.tag.name.ptr = (const uint8_t *) "p";
dummy.data.tag.name.len = SLEN("p");
dummy.data.tag.n_attributes = 0;
dummy.data.tag.attributes = NULL;
@@ -953,7 +948,7 @@ void process_isindex_in_body(hubbub_treebuilder *treebuilder,
process_container_in_body(treebuilder, &dummy);
/* Act as if <label> were seen */
- dummy.data.tag.name.data.ptr = (const uint8_t *) "label";
+ dummy.data.tag.name.ptr = (const uint8_t *) "label";
dummy.data.tag.name.len = SLEN("label");
dummy.data.tag.n_attributes = 0;
dummy.data.tag.attributes = NULL;
@@ -967,8 +962,7 @@ void process_isindex_in_body(hubbub_treebuilder *treebuilder,
} else {
/** \todo Localisation */
#define PROMPT "This is a searchable index. Insert your search keywords here: "
- dummy.data.character.type = HUBBUB_STRING_PTR;
- dummy.data.character.data.ptr = (const uint8_t *) PROMPT;
+ dummy.data.character.ptr = (const uint8_t *) PROMPT;
dummy.data.character.len = SLEN(PROMPT);
#undef PROMPT
}
@@ -977,8 +971,8 @@ void process_isindex_in_body(hubbub_treebuilder *treebuilder,
/* Act as if <input> was seen */
dummy.type = HUBBUB_TOKEN_START_TAG;
- dummy.data.tag.name.type = HUBBUB_STRING_PTR;
- dummy.data.tag.name.data.ptr = (const uint8_t *) "input";
+ dummy.data.tag.ns = HUBBUB_NS_HTML;
+ dummy.data.tag.name.ptr = (const uint8_t *) "input";
dummy.data.tag.name.len = SLEN("input");
dummy.data.tag.n_attributes = n_attrs;
@@ -993,7 +987,7 @@ void process_isindex_in_body(hubbub_treebuilder *treebuilder,
process_0p_in_body(treebuilder);
/* Act as if <hr> was seen */
- dummy.data.tag.name.data.ptr = (const uint8_t *) "hr";
+ dummy.data.tag.name.ptr = (const uint8_t *) "hr";
dummy.data.tag.name.len = SLEN("hr");
dummy.data.tag.n_attributes = 0;
dummy.data.tag.attributes = NULL;
@@ -1171,9 +1165,8 @@ void process_0p_in_body(hubbub_treebuilder *treebuilder)
dummy.type = HUBBUB_TOKEN_START_TAG;
dummy.data.tag.ns = HUBBUB_NS_HTML;
- dummy.data.tag.name.type = HUBBUB_STRING_PTR;
/** \todo UTF-16 */
- dummy.data.tag.name.data.ptr = (const uint8_t *) "p";
+ dummy.data.tag.name.ptr = (const uint8_t *) "p";
dummy.data.tag.name.len = SLEN("p");
dummy.data.tag.n_attributes = 0;
dummy.data.tag.attributes = NULL;
@@ -1415,19 +1408,20 @@ void process_0presentational_in_body(hubbub_treebuilder *treebuilder,
stack[furthest_block + 1].node = clone_appended;
/* 12 */
+ hubbub_ns ons;
element_type otype;
void *onode;
uint32_t oindex;
formatting_list_remove(treebuilder, entry,
- &otype, &onode, &oindex);
+ &ons, &otype, &onode, &oindex);
treebuilder->tree_handler->unref_node(
treebuilder->tree_handler->ctx, onode);
formatting_list_insert(treebuilder,
bookmark.prev, bookmark.next,
- otype, clone_appended, furthest_block + 1);
+ ons, otype, clone_appended, furthest_block + 1);
/* 14 */
}
@@ -1457,6 +1451,7 @@ bool aa_find_and_validate_formatting_element(hubbub_treebuilder *treebuilder,
if (entry->stack_index == 0) {
/* Not in element stack => remove from formatting list */
+ hubbub_ns ns;
element_type type;
void *node;
uint32_t index;
@@ -1464,7 +1459,7 @@ bool aa_find_and_validate_formatting_element(hubbub_treebuilder *treebuilder,
/** \todo parse error */
if (!formatting_list_remove(treebuilder, entry,
- &type, &node, &index)) {
+ &ns, &type, &node, &index)) {
/** \todo errors */
}
@@ -1553,7 +1548,7 @@ bool aa_find_furthest_block(hubbub_treebuilder *treebuilder,
/* Remove the formatting element from the list */
if (!formatting_list_remove(treebuilder, formatting_element,
- &type, &node, &index)) {
+ &ns, &type, &node, &index)) {
/* \todo errors */
}
@@ -1786,6 +1781,7 @@ void aa_remove_element_stack_item(hubbub_treebuilder *treebuilder,
void aa_clone_and_replace_entries(hubbub_treebuilder *treebuilder,
formatting_list_entry *element)
{
+ hubbub_ns ons;
element_type otype;
uint32_t oindex;
void *clone, *onode;
@@ -1796,8 +1792,9 @@ void aa_clone_and_replace_entries(hubbub_treebuilder *treebuilder,
/* Replace formatting list entry for node with clone */
formatting_list_replace(treebuilder, element,
- element->details.type, clone, element->stack_index,
- &otype, &onode, &oindex);
+ element->details.ns, element->details.type,
+ clone, element->stack_index,
+ &ons, &otype, &onode, &oindex);
treebuilder->tree_handler->unref_node(treebuilder->tree_handler->ctx,
onode);
@@ -1935,8 +1932,7 @@ void process_0br_in_body(hubbub_treebuilder *treebuilder)
/** \todo UTF-16 */
tag.ns = HUBBUB_NS_HTML;
- tag.name.type = HUBBUB_STRING_PTR;
- tag.name.data.ptr = (const uint8_t *) "br";
+ tag.name.ptr = (const uint8_t *) "br";
tag.name.len = SLEN("br");
tag.n_attributes = 0;
diff --git a/src/treebuilder/in_foreign_content.c b/src/treebuilder/in_foreign_content.c
index 5b38839..a5dda6b 100644
--- a/src/treebuilder/in_foreign_content.c
+++ b/src/treebuilder/in_foreign_content.c
@@ -147,16 +147,14 @@ void adjust_svg_attributes(hubbub_treebuilder *treebuilder,
for (size_t i = 0; i < tag->n_attributes; i++) {
hubbub_attribute *attr = &tag->attributes[i];
- const uint8_t *name = treebuilder->input_buffer +
- attr->name.data.off;
+ const uint8_t *name = attr->name.ptr;
size_t len = attr->name.len;
for (size_t j = 0; j < N_ELEMENTS(svg_attributes); j++) {
if (hubbub_string_match(name, len,
(uint8_t *)svg_attributes[j].attr,
svg_attributes[j].len)) {
- attr->name.type = HUBBUB_STRING_PTR;
- attr->name.data.ptr =
+ attr->name.ptr =
(uint8_t *)svg_attributes[j].proper;
}
}
@@ -172,16 +170,14 @@ void adjust_svg_attributes(hubbub_treebuilder *treebuilder,
void adjust_svg_tagname(hubbub_treebuilder *treebuilder,
hubbub_tag *tag)
{
- uint8_t *name = (uint8_t *) treebuilder->input_buffer +
- tag->name.data.off;
+ const uint8_t *name = tag->name.ptr;
size_t len = tag->name.len;
for (size_t i = 0; i < N_ELEMENTS(svg_tagnames); i++) {
if (hubbub_string_match(name, len,
(uint8_t *)svg_tagnames[i].attr,
svg_tagnames[i].len)) {
- tag->name.type = HUBBUB_STRING_PTR;
- tag->name.data.ptr =
+ tag->name.ptr =
(uint8_t *)svg_tagnames[i].proper;
}
}
@@ -202,8 +198,7 @@ void adjust_foreign_attributes(hubbub_treebuilder *treebuilder,
{
for (size_t i = 0; i < tag->n_attributes; i++) {
hubbub_attribute *attr = &tag->attributes[i];
- const uint8_t *name = treebuilder->input_buffer +
- attr->name.data.off;
+ const uint8_t *name = attr->name.ptr;
/* 10 == strlen("xlink:href") */
if (attr->name.len >= 10 &&
@@ -226,7 +221,7 @@ void adjust_foreign_attributes(hubbub_treebuilder *treebuilder,
hubbub_string_match(name, len,
S("type"))) {
attr->ns = HUBBUB_NS_XLINK;
- attr->name.data.off += 6;
+ attr->name.ptr += 6;
attr->name.len -= 6;
}
/* 8 == strlen("xml:base") */
@@ -241,7 +236,7 @@ void adjust_foreign_attributes(hubbub_treebuilder *treebuilder,
hubbub_string_match(name, len,
S("space"))) {
attr->ns = HUBBUB_NS_XML;
- attr->name.data.off += 4;
+ attr->name.ptr += 4;
attr->name.len -= 4;
}
} else if (hubbub_string_match(name, attr->name.len,
@@ -249,7 +244,7 @@ void adjust_foreign_attributes(hubbub_treebuilder *treebuilder,
hubbub_string_match(name, attr->name.len,
S("xmlns:xlink"))) {
attr->ns = HUBBUB_NS_XMLNS;
- attr->name.data.off += 6;
+ attr->name.ptr += 6;
attr->name.len -= 6;
}
diff --git a/src/treebuilder/in_head.c b/src/treebuilder/in_head.c
index 7b78973..79cfbc8 100644
--- a/src/treebuilder/in_head.c
+++ b/src/treebuilder/in_head.c
@@ -68,7 +68,7 @@ static void process_script_in_head(hubbub_treebuilder *treebuilder,
treebuilder->context.collect.mode = treebuilder->context.mode;
treebuilder->context.collect.node = script;
treebuilder->context.collect.type = SCRIPT;
- treebuilder->context.collect.string.data.off = 0;
+ treebuilder->context.collect.string.ptr = NULL;
treebuilder->context.collect.string.len = 0;
treebuilder->context.mode = SCRIPT_COLLECT_CHARACTERS;
diff --git a/src/treebuilder/in_row.c b/src/treebuilder/in_row.c
index c7afb1c..7264f5c 100644
--- a/src/treebuilder/in_row.c
+++ b/src/treebuilder/in_row.c
@@ -103,7 +103,8 @@ bool handle_in_row(hubbub_treebuilder *treebuilder,
treebuilder->context.element_stack[
treebuilder->context.current_node].node);
- formatting_list_append(treebuilder, type,
+ formatting_list_append(treebuilder,
+ token->data.tag.ns, type,
treebuilder->context.element_stack[
treebuilder->context.current_node].node,
treebuilder->context.current_node);
diff --git a/src/treebuilder/in_table.c b/src/treebuilder/in_table.c
index ec5173e..70969dd 100644
--- a/src/treebuilder/in_table.c
+++ b/src/treebuilder/in_table.c
@@ -49,10 +49,8 @@ static inline bool process_input_in_table(hubbub_treebuilder *treebuilder,
for (size_t i = 0; i < token->data.tag.n_attributes; i++) {
hubbub_attribute *attr = &token->data.tag.attributes[i];
- if (!hubbub_string_match_ci(treebuilder->input_buffer +
- attr->value.data.off,
- attr->value.len, (uint8_t *) "hidden",
- SLEN("hidden"))) {
+ if (!hubbub_string_match_ci(attr->value.ptr, attr->value.len,
+ (uint8_t *) "hidden", SLEN("hidden"))) {
continue;
}
@@ -120,7 +118,8 @@ bool handle_in_table(hubbub_treebuilder *treebuilder,
treebuilder->tree_handler->ctx,
treebuilder->context.element_stack[
treebuilder->context.current_node].node);
- formatting_list_append(treebuilder, type,
+ formatting_list_append(treebuilder,
+ token->data.tag.ns, type,
treebuilder->context.element_stack[
treebuilder->context.current_node].node,
treebuilder->context.current_node);
@@ -132,9 +131,7 @@ bool handle_in_table(hubbub_treebuilder *treebuilder,
if (type == COL) {
/* Insert colgroup and reprocess */
- tag.name.type = HUBBUB_STRING_PTR;
- tag.name.data.ptr =
- (const uint8_t *) "colgroup";
+ tag.name.ptr = (const uint8_t *) "colgroup";
tag.name.len = SLEN("colgroup");
reprocess = true;
@@ -149,8 +146,7 @@ bool handle_in_table(hubbub_treebuilder *treebuilder,
if (type == TD || type == TH || type == TR) {
/* Insert tbody and reprocess */
- tag.name.type = HUBBUB_STRING_PTR;
- tag.name.data.ptr = (const uint8_t *) "tbody";
+ tag.name.ptr = (const uint8_t *) "tbody";
tag.name.len = SLEN("tbody");
reprocess = true;
diff --git a/src/treebuilder/in_table_body.c b/src/treebuilder/in_table_body.c
index 0bf8df7..6a6c82a 100644
--- a/src/treebuilder/in_table_body.c
+++ b/src/treebuilder/in_table_body.c
@@ -114,8 +114,7 @@ bool handle_in_table_body(hubbub_treebuilder *treebuilder,
/* Manufacture tr tag */
tag.ns = HUBBUB_NS_HTML;
- tag.name.type = HUBBUB_STRING_PTR;
- tag.name.data.ptr = (const uint8_t *) "tr";
+ tag.name.ptr = (const uint8_t *) "tr";
tag.name.len = SLEN("tr");
tag.n_attributes = 0;
diff --git a/src/treebuilder/initial.c b/src/treebuilder/initial.c
index 1bce044..a90af13 100644
--- a/src/treebuilder/initial.c
+++ b/src/treebuilder/initial.c
@@ -120,15 +120,13 @@ static bool lookup_full_quirks(hubbub_treebuilder *treebuilder,
{
size_t i;
- const uint8_t *name = treebuilder->input_buffer + cdoc->name.data.off;
+ const uint8_t *name = cdoc->name.ptr;
size_t name_len = cdoc->name.len;
- const uint8_t *public_id = treebuilder->input_buffer +
- cdoc->public_id.data.off;
+ const uint8_t *public_id = cdoc->public_id.ptr;
size_t public_id_len = cdoc->public_id.len;
- const uint8_t *system_id = treebuilder->input_buffer +
- cdoc->system_id.data.off;
+ const uint8_t *system_id = cdoc->system_id.ptr;
size_t system_id_len = cdoc->system_id.len;
#define S(s) (uint8_t *) s, sizeof s
@@ -183,8 +181,7 @@ static bool lookup_full_quirks(hubbub_treebuilder *treebuilder,
static bool lookup_limited_quirks(hubbub_treebuilder *treebuilder,
const hubbub_doctype *cdoc)
{
- const uint8_t *public_id = treebuilder->input_buffer +
- cdoc->public_id.data.off;
+ const uint8_t *public_id = cdoc->public_id.ptr;
size_t public_id_len = cdoc->public_id.len;
#define S(s) (uint8_t *) s, sizeof s
diff --git a/src/treebuilder/internal.h b/src/treebuilder/internal.h
index ae293a9..5c9eb49 100644
--- a/src/treebuilder/internal.h
+++ b/src/treebuilder/internal.h
@@ -102,16 +102,10 @@ struct hubbub_treebuilder
{
hubbub_tokeniser *tokeniser; /**< Underlying tokeniser */
- const uint8_t *input_buffer; /**< Start of tokeniser's buffer */
- size_t input_buffer_len; /**< Length of input buffer */
-
hubbub_treebuilder_context context;
hubbub_tree_handler *tree_handler;
- hubbub_buffer_handler buffer_handler;
- void *buffer_pw;
-
hubbub_error_handler error_handler;
void *error_pw;
@@ -163,17 +157,22 @@ element_type current_node(hubbub_treebuilder *treebuilder);
element_type prev_node(hubbub_treebuilder *treebuilder);
bool formatting_list_append(hubbub_treebuilder *treebuilder,
- element_type type, void *node, uint32_t stack_index);
+ hubbub_ns ns, element_type type, void *node,
+ uint32_t stack_index);
bool formatting_list_insert(hubbub_treebuilder *treebuilder,
formatting_list_entry *prev, formatting_list_entry *next,
- element_type type, void *node, uint32_t stack_index);
+ hubbub_ns ns, element_type type, void *node,
+ uint32_t stack_index);
bool formatting_list_remove(hubbub_treebuilder *treebuilder,
formatting_list_entry *entry,
- element_type *type, void **node, uint32_t *stack_index);
+ hubbub_ns *ns, element_type *type, void **node,
+ uint32_t *stack_index);
bool formatting_list_replace(hubbub_treebuilder *treebuilder,
formatting_list_entry *entry,
- element_type type, void *node, uint32_t stack_index,
- element_type *otype, void **onode, uint32_t *ostack_index);
+ hubbub_ns ns, element_type type, void *node,
+ uint32_t stack_index,
+ hubbub_ns *ons, element_type *otype, void **onode,
+ uint32_t *ostack_index);
/* in_foreign_content.c */
void adjust_svg_attributes(hubbub_treebuilder *treebuilder,
diff --git a/src/treebuilder/script_collect.c b/src/treebuilder/script_collect.c
index e55b17c..e016409 100644
--- a/src/treebuilder/script_collect.c
+++ b/src/treebuilder/script_collect.c
@@ -30,10 +30,10 @@ bool handle_script_collect_characters(hubbub_treebuilder *treebuilder,
switch (token->type) {
case HUBBUB_TOKEN_CHARACTER:
if (treebuilder->context.collect.string.len == 0) {
- treebuilder->context.collect.string.data.off =
- token->data.character.data.off;
+ treebuilder->context.collect.string.ptr =
+ token->data.character.ptr;
}
- treebuilder->context.collect.string.len +=
+ treebuilder->context.collect.string.len +=
token->data.character.len;
break;
case HUBBUB_TOKEN_END_TAG:
diff --git a/src/treebuilder/treebuilder.c b/src/treebuilder/treebuilder.c
index f739113..c6d8f69 100644
--- a/src/treebuilder/treebuilder.c
+++ b/src/treebuilder/treebuilder.c
@@ -76,9 +76,6 @@ static const struct {
};
-static void hubbub_treebuilder_buffer_handler(const uint8_t *data,
- size_t len, void *pw);
-
/**
* Create a hubbub treebuilder
@@ -103,9 +100,6 @@ hubbub_treebuilder *hubbub_treebuilder_create(hubbub_tokeniser *tokeniser,
tb->tokeniser = tokeniser;
- tb->input_buffer = NULL;
- tb->input_buffer_len = 0;
-
tb->tree_handler = NULL;
memset(&tb->context, 0, sizeof(hubbub_treebuilder_context));
@@ -124,13 +118,8 @@ hubbub_treebuilder *hubbub_treebuilder_create(hubbub_tokeniser *tokeniser,
assert(HTML != 0);
tb->context.element_stack[0].type = 0;
- tb->context.collect.string.type = HUBBUB_STRING_OFF;
-
tb->context.strip_leading_lr = false;
- tb->buffer_handler = NULL;
- tb->buffer_pw = NULL;
-
tb->error_handler = NULL;
tb->error_pw = NULL;
@@ -147,17 +136,7 @@ hubbub_treebuilder *hubbub_treebuilder_create(hubbub_tokeniser *tokeniser,
return NULL;
}
- tokparams.buffer_handler.handler = hubbub_treebuilder_buffer_handler;
- tokparams.buffer_handler.pw = tb;
-
- if (hubbub_tokeniser_setopt(tokeniser, HUBBUB_TOKENISER_BUFFER_HANDLER,
- &tokparams) != HUBBUB_OK) {
- alloc(tb->context.element_stack, 0, pw);
- alloc(tb, 0, pw);
- return NULL;
- }
-
- return tb;
+ return tb;
}
/**
@@ -173,12 +152,6 @@ void hubbub_treebuilder_destroy(hubbub_treebuilder *treebuilder)
if (treebuilder == NULL)
return;
- tokparams.buffer_handler.handler = treebuilder->buffer_handler;
- tokparams.buffer_handler.pw = treebuilder->buffer_pw;
-
- hubbub_tokeniser_setopt(treebuilder->tokeniser,
- HUBBUB_TOKENISER_BUFFER_HANDLER, &tokparams);
-
tokparams.token_handler.handler = NULL;
tokparams.token_handler.pw = NULL;
@@ -253,13 +226,6 @@ hubbub_error hubbub_treebuilder_setopt(hubbub_treebuilder *treebuilder,
return HUBBUB_BADPARM;
switch (type) {
- case HUBBUB_TREEBUILDER_BUFFER_HANDLER:
- treebuilder->buffer_handler = params->buffer_handler.handler;
- treebuilder->buffer_pw = params->buffer_handler.pw;
- treebuilder->buffer_handler(treebuilder->input_buffer,
- treebuilder->input_buffer_len,
- treebuilder->buffer_pw);
- break;
case HUBBUB_TREEBUILDER_ERROR_HANDLER:
treebuilder->error_handler = params->error_handler.handler;
treebuilder->error_pw = params->error_handler.pw;
@@ -276,29 +242,6 @@ hubbub_error hubbub_treebuilder_setopt(hubbub_treebuilder *treebuilder,
}
/**
- * Handle tokeniser buffer moving
- *
- * \param data New location of buffer
- * \param len Length of buffer in bytes
- * \param pw Pointer to treebuilder instance
- */
-void hubbub_treebuilder_buffer_handler(const uint8_t *data,
- size_t len, void *pw)
-{
- hubbub_treebuilder *treebuilder = (hubbub_treebuilder *) pw;
-
- treebuilder->input_buffer = data;
- treebuilder->input_buffer_len = len;
-
- /* Inform client buffer handler, too (if there is one) */
- if (treebuilder->buffer_handler != NULL) {
- treebuilder->buffer_handler(treebuilder->input_buffer,
- treebuilder->input_buffer_len,
- treebuilder->buffer_pw);
- }
-}
-
-/**
* Handle tokeniser emitting a token
*
* \param token The emitted token
@@ -418,8 +361,7 @@ void hubbub_treebuilder_token_handler(const hubbub_token *token,
bool process_characters_expect_whitespace(hubbub_treebuilder *treebuilder,
const hubbub_token *token, bool insert_into_current_node)
{
- const uint8_t *data = treebuilder->input_buffer +
- token->data.character.data.off;
+ const uint8_t *data = token->data.character.ptr;
size_t len = token->data.character.len;
size_t c;
@@ -434,8 +376,7 @@ bool process_characters_expect_whitespace(hubbub_treebuilder *treebuilder,
if (c > 0 && insert_into_current_node) {
hubbub_string temp;
- temp.type = HUBBUB_STRING_OFF;
- temp.data.off = token->data.character.data.off;
+ temp.ptr = data;
temp.len = c;
append_text(treebuilder, &temp);
@@ -444,7 +385,7 @@ bool process_characters_expect_whitespace(hubbub_treebuilder *treebuilder,
/* Non-whitespace characters in token, so reprocess */
if (c != len) {
/* Update token data to strip leading whitespace */
- ((hubbub_token *) token)->data.character.data.off += c;
+ ((hubbub_token *) token)->data.character.ptr += c;
((hubbub_token *) token)->data.character.len -= c;
return true;
@@ -566,7 +507,7 @@ void parse_generic_rcdata(hubbub_treebuilder *treebuilder,
treebuilder->context.collect.mode = treebuilder->context.mode;
treebuilder->context.collect.type = type;
treebuilder->context.collect.node = appended;
- treebuilder->context.collect.string.data.off = 0;
+ treebuilder->context.collect.string.ptr = NULL;
treebuilder->context.collect.string.len = 0;
treebuilder->tree_handler->unref_node(
@@ -651,6 +592,7 @@ void reconstruct_active_formatting_list(hubbub_treebuilder *treebuilder)
while (entry != NULL) {
int success;
void *clone, *appended;
+ hubbub_ns prev_ns;
element_type prev_type;
void *prev_node;
uint32_t prev_stack_index;
@@ -719,9 +661,9 @@ void reconstruct_active_formatting_list(hubbub_treebuilder *treebuilder)
}
if (!formatting_list_replace(treebuilder, entry,
- entry->details.type, appended,
- treebuilder->context.current_node,
- &prev_type, &prev_node,
+ entry->details.ns, entry->details.type,
+ appended, treebuilder->context.current_node,
+ &prev_ns, &prev_type, &prev_node,
&prev_stack_index)) {
/** \todo handle errors */
treebuilder->tree_handler->unref_node(
@@ -748,6 +690,7 @@ void clear_active_formatting_list_to_marker(hubbub_treebuilder *treebuilder)
bool done = false;
while ((entry = treebuilder->context.formatting_list_end) != NULL) {
+ hubbub_ns ns;
element_type type;
void *node;
uint32_t stack_index;
@@ -756,7 +699,7 @@ void clear_active_formatting_list_to_marker(hubbub_treebuilder *treebuilder)
done = true;
if (!formatting_list_remove(treebuilder, entry,
- &type, &node, &stack_index)) {
+ &ns, &type, &node, &stack_index)) {
/** \todo handle errors */
}
@@ -1009,18 +952,9 @@ void append_text(hubbub_treebuilder *treebuilder,
element_type element_type_from_name(hubbub_treebuilder *treebuilder,
const hubbub_string *tag_name)
{
- const uint8_t *name = NULL;
+ const uint8_t *name = tag_name->ptr;
size_t len = tag_name->len;
- switch (tag_name->type) {
- case HUBBUB_STRING_OFF:
- name = treebuilder->input_buffer + tag_name->data.off;
- break;
- case HUBBUB_STRING_PTR:
- name = tag_name->data.ptr;
- break;
- }
-
/** \todo UTF-16 support */
/** \todo optimise this */
@@ -1249,13 +1183,15 @@ element_type prev_node(hubbub_treebuilder *treebuilder)
* Append an element to the end of the list of active formatting elements
*
* \param treebuilder Treebuilder instance containing list
+ * \param ns Namespace of node being inserted
* \param type Type of node being inserted
* \param node Node being inserted
* \param stack_index Index into stack of open elements
* \return True on success, false on memory exhaustion
*/
bool formatting_list_append(hubbub_treebuilder *treebuilder,
- element_type type, void *node, uint32_t stack_index)
+ hubbub_ns ns, element_type type, void *node,
+ uint32_t stack_index)
{
formatting_list_entry *entry;
@@ -1264,6 +1200,7 @@ bool formatting_list_append(hubbub_treebuilder *treebuilder,
if (entry == NULL)
return false;
+ entry->details.ns = ns;
entry->details.type = type;
entry->details.node = node;
entry->stack_index = stack_index;
@@ -1287,6 +1224,7 @@ bool formatting_list_append(hubbub_treebuilder *treebuilder,
* \param treebuilder Treebuilder instance containing list
* \param prev Previous entry
* \param next Next entry
+ * \param ns Namespace of node being inserted
* \param type Type of node being inserted
* \param node Node being inserted
* \param stack_index Index into stack of open elements
@@ -1294,7 +1232,8 @@ bool formatting_list_append(hubbub_treebuilder *treebuilder,
*/
bool formatting_list_insert(hubbub_treebuilder *treebuilder,
formatting_list_entry *prev, formatting_list_entry *next,
- element_type type, void *node, uint32_t stack_index)
+ hubbub_ns ns, element_type type, void *node,
+ uint32_t stack_index)
{
formatting_list_entry *entry;
@@ -1311,6 +1250,7 @@ bool formatting_list_insert(hubbub_treebuilder *treebuilder,
if (entry == NULL)
return false;
+ entry->details.ns = ns;
entry->details.type = type;
entry->details.node = node;
entry->stack_index = stack_index;
@@ -1337,6 +1277,7 @@ bool formatting_list_insert(hubbub_treebuilder *treebuilder,
*
* \param treebuilder Treebuilder instance containing list
* \param entry The item to remove
+ * \param ns Pointer to location to receive namespace of node
* \param type Pointer to location to receive type of node
* \param node Pointer to location to receive node
* \param stack_index Pointer to location to receive stack index
@@ -1344,8 +1285,10 @@ bool formatting_list_insert(hubbub_treebuilder *treebuilder,
*/
bool formatting_list_remove(hubbub_treebuilder *treebuilder,
formatting_list_entry *entry,
- element_type *type, void **node, uint32_t *stack_index)
+ hubbub_ns *ns, element_type *type, void **node,
+ uint32_t *stack_index)
{
+ *ns = entry->details.ns;
*type = entry->details.type;
*node = entry->details.node;
*stack_index = entry->stack_index;
@@ -1370,9 +1313,11 @@ bool formatting_list_remove(hubbub_treebuilder *treebuilder,
*
* \param treebuilder Treebuilder instance containing list
* \param entry The item to replace
+ * \param ns Replacement node namespace
* \param type Replacement node type
* \param node Replacement node
* \param stack_index Replacement stack index
+ * \param ons Pointer to location to receive old namespace
* \param otype Pointer to location to receive old type
* \param onode Pointer to location to receive old node
* \param ostack_index Pointer to location to receive old stack index
@@ -1380,15 +1325,19 @@ bool formatting_list_remove(hubbub_treebuilder *treebuilder,
*/
bool formatting_list_replace(hubbub_treebuilder *treebuilder,
formatting_list_entry *entry,
- element_type type, void *node, uint32_t stack_index,
- element_type *otype, void **onode, uint32_t *ostack_index)
+ hubbub_ns ns, element_type type, void *node,
+ uint32_t stack_index,
+ hubbub_ns *ons, element_type *otype, void **onode,
+ uint32_t *ostack_index)
{
UNUSED(treebuilder);
+ *ons = entry->details.ns;
*otype = entry->details.type;
*onode = entry->details.node;
*ostack_index = entry->stack_index;
+ entry->details.ns = ns;
entry->details.type = type;
entry->details.node = node;
entry->stack_index = stack_index;
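
The hunks above replace the old offset-based hubbub_string (an offset into the shared input buffer, plus a buffer-moved handler to track relocations) with a plain ptr/len pair. A minimal sketch of how a consumer reads a character token after this change; the field names match the hunks, but the helper itself and the assumption that <hubbub/types.h> declares hubbub_token and hubbub_string are illustrative:

#include <stdio.h>

#include <hubbub/types.h>	/* assumed home of hubbub_token / hubbub_string */

/* Print the text carried by a character token.  The string now points
 * directly at the data, so no input-buffer base pointer (and no
 * buffer-moved handler) is needed. */
static void print_characters(const hubbub_token *token)
{
	const hubbub_string *s = &token->data.character;

	printf("%.*s", (int) s->len, (const char *) s->ptr);
}
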
diff --git a/src/treebuilder/treebuilder.h b/src/treebuilder/treebuilder.h
index 9d690b1..67451b8 100644
--- a/src/treebuilder/treebuilder.h
+++ b/src/treebuilder/treebuilder.h
@@ -24,7 +24,6 @@ typedef struct hubbub_treebuilder hubbub_treebuilder;
* Hubbub treebuilder option types
*/
typedef enum hubbub_treebuilder_opttype {
- HUBBUB_TREEBUILDER_BUFFER_HANDLER,
HUBBUB_TREEBUILDER_ERROR_HANDLER,
HUBBUB_TREEBUILDER_TREE_HANDLER,
HUBBUB_TREEBUILDER_DOCUMENT_NODE,
@@ -35,11 +34,6 @@ typedef enum hubbub_treebuilder_opttype {
*/
typedef union hubbub_treebuilder_optparams {
struct {
- hubbub_buffer_handler handler;
- void *pw;
- } buffer_handler;
-
- struct {
hubbub_error_handler handler;
void *pw;
} error_handler;
diff --git a/src/utils/Makefile b/src/utils/Makefile
index 1910dc0..0678442 100644
--- a/src/utils/Makefile
+++ b/src/utils/Makefile
@@ -32,7 +32,7 @@ dirstack_$(sp) := $(d)
d := $(DIR)
# Sources
-SRCS_$(d) := dict.c errors.c utf8.c utf16.c string.c
+SRCS_$(d) := dict.c errors.c string.c
# Append to sources for component
SOURCES += $(addprefix $(d), $(SRCS_$(d)))
diff --git a/src/utils/utf16.c b/src/utils/utf16.c
deleted file mode 100644
index a295109..0000000
--- a/src/utils/utf16.c
+++ /dev/null
@@ -1,239 +0,0 @@
-/*
- * This file is part of Hubbub.
- * Licensed under the MIT License,
- * http://www.opensource.org/licenses/mit-license.php
- * Copyright 2007 John-Mark Bell <jmb@netsurf-browser.org>
- */
-
-/** \file
- * UTF-16 manipulation functions (implementation).
- */
-
-#include <stdbool.h>
-#include <stdlib.h>
-#include <string.h>
-
-#include "utils/utf16.h"
-
-/**
- * Convert a UTF-16 sequence into a single UCS4 character
- *
- * \param s The sequence to process
- * \param len Length of sequence
- * \param ucs4 Pointer to location to receive UCS4 character (host endian)
- * \param clen Pointer to location to receive byte length of UTF-16 sequence
- * \return HUBBUB_OK on success, appropriate error otherwise
- */
-hubbub_error hubbub_utf16_to_ucs4(const uint8_t *s, size_t len,
- uint32_t *ucs4, size_t *clen)
-{
- const uint16_t *ss = (const uint16_t *) (const void *) s;
-
- if (s == NULL || ucs4 == NULL || clen == NULL)
- return HUBBUB_BADPARM;
-
- if (len < 2)
- return HUBBUB_NEEDDATA;
-
- if (*ss < 0xD800 || *ss > 0xDFFF) {
- *ucs4 = *ss;
- *clen = 2;
- } else if (0xD800 <= *ss && *ss <= 0xBFFF) {
- if (len < 4)
- return HUBBUB_NEEDDATA;
-
- if (0xDC00 <= ss[1] && ss[1] <= 0xE000) {
- *ucs4 = (((s[0] >> 6) & 0x1f) + 1) |
- ((s[0] & 0x3f) | (s[1] & 0x3ff));
- *clen = 4;
- } else {
- return HUBBUB_INVALID;
- }
- }
-
- return HUBBUB_OK;
-}
-
-/**
- * Convert a single UCS4 character into a UTF-16 sequence
- *
- * \param ucs4 The character to process (0 <= c <= 0x7FFFFFFF) (host endian)
- * \param s Pointer to 4 byte long output buffer
- * \param len Pointer to location to receive length of multibyte sequence
- * \return HUBBUB_OK on success, appropriate error otherwise
- */
-hubbub_error hubbub_utf16_from_ucs4(uint32_t ucs4, uint8_t *s,
- size_t *len)
-{
- uint16_t *ss = (uint16_t *) (void *) s;
- uint32_t l = 0;
-
- if (s == NULL || len == NULL)
- return HUBBUB_BADPARM;
- else if (ucs4 < 0x10000) {
- *ss = (uint16_t) ucs4;
- l = 2;
- } else if (ucs4 < 0x110000) {
- ss[0] = 0xD800 | (((ucs4 >> 16) & 0x1f) - 1) | (ucs4 >> 10);
- ss[1] = 0xDC00 | (ucs4 & 0x3ff);
- l = 4;
- } else {
- return HUBBUB_INVALID;
- }
-
- *len = l;
-
- return HUBBUB_OK;
-}
-
-/**
- * Calculate the length (in characters) of a bounded UTF-16 string
- *
- * \param s The string
- * \param max Maximum length
- * \param len Pointer to location to receive length of string
- * \return HUBBUB_OK on success, appropriate error otherwise
- */
-hubbub_error hubbub_utf16_length(const uint8_t *s, size_t max,
- size_t *len)
-{
- const uint16_t *ss = (const uint16_t *) (const void *) s;
- const uint16_t *end = (const uint16_t *) (const void *) (s + max);
- int l = 0;
-
- if (s == NULL || len == NULL)
- return HUBBUB_BADPARM;
-
- while (ss < end) {
- if (*ss < 0xD800 || 0xDFFF < *ss)
- ss++;
- else
- ss += 2;
-
- l++;
- }
-
- *len = l;
-
- return HUBBUB_OK;
-}
-
-/**
- * Calculate the length (in bytes) of a UTF-16 character
- *
- * \param s Pointer to start of character
- * \param len Pointer to location to receive length
- * \return HUBBUB_OK on success, appropriate error otherwise
- */
-hubbub_error hubbub_utf16_char_byte_length(const uint8_t *s,
- size_t *len)
-{
- const uint16_t *ss = (const uint16_t *) (const void *) s;
-
- if (s == NULL || len == NULL)
- return HUBBUB_BADPARM;
-
- if (*ss < 0xD800 || 0xDFFF < *ss)
- *len = 2;
- else
- *len = 4;
-
- return HUBBUB_OK;
-}
-
-/**
- * Find previous legal UTF-16 char in string
- *
- * \param s The string
- * \param off Offset in the string to start at
- * \param prevoff Pointer to location to receive offset of first byte of
- * previous legal character
- * \return HUBBUB_OK on success, appropriate error otherwise
- */
-hubbub_error hubbub_utf16_prev(const uint8_t *s, uint32_t off,
- uint32_t *prevoff)
-{
- const uint16_t *ss = (const uint16_t *) (const void *) s;
-
- if (s == NULL || prevoff == NULL)
- return HUBBUB_BADPARM;
-
- if (off < 2)
- *prevoff = 0;
- else if (ss[-1] < 0xDC00 || ss[-1] > 0xDFFF)
- *prevoff = off - 2;
- else
- *prevoff = (off < 4) ? 0 : off - 4;
-
- return HUBBUB_OK;
-}
-
-/**
- * Find next legal UTF-16 char in string
- *
- * \param s The string (assumed valid)
- * \param len Maximum offset in string
- * \param off Offset in the string to start at
- * \param nextoff Pointer to location to receive offset of first byte of
- * next legal character
- * \return HUBBUB_OK on success, appropriate error otherwise
- */
-hubbub_error hubbub_utf16_next(const uint8_t *s, uint32_t len,
- uint32_t off, uint32_t *nextoff)
-{
- const uint16_t *ss = (const uint16_t *) (const void *) s;
-
- if (s == NULL || off >= len || nextoff == NULL)
- return HUBBUB_BADPARM;
-
- if (len - off < 4)
- *nextoff = len;
- else if (ss[1] < 0xD800 || ss[1] > 0xDBFF)
- *nextoff = off + 2;
- else
- *nextoff = (len - off < 6) ? len : off + 4;
-
- return HUBBUB_OK;
-}
-
-/**
- * Find next legal UTF-16 char in string
- *
- * \param s The string (assumed to be of dubious validity)
- * \param len Maximum offset in string
- * \param off Offset in the string to start at
- * \param nextoff Pointer to location to receive offset of first byte of
- * next legal character
- * \return HUBBUB_OK on success, appropriate error otherwise
- */
-hubbub_error hubbub_utf16_next_paranoid(const uint8_t *s,
- uint32_t len, uint32_t off, uint32_t *nextoff)
-{
- const uint16_t *ss = (const uint16_t *) (const void *) s;
-
- if (s == NULL || off >= len || nextoff == NULL)
- return HUBBUB_BADPARM;
-
- while (1) {
- if (len - off < 4) {
- return HUBBUB_NEEDDATA;
- } else if (ss[1] < 0xD800 || ss[1] > 0xDFFF) {
- *nextoff = off + 2;
- break;
- } else if (ss[1] >= 0xD800 && ss[1] <= 0xDBFF) {
- if (len - off < 6)
- return HUBBUB_NEEDDATA;
-
- if (ss[2] >= 0xDC00 && ss[2] <= 0xDFFF) {
- *nextoff = off + 4;
- break;
- } else {
- ss++;
- off += 2;
- }
- }
- }
-
- return HUBBUB_OK;
-}
-
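
The surrogate branch being deleted above cannot be taken as written (its upper bound is 0xBFFF, and the pair is combined byte-wise rather than with the usual offset arithmetic), which is moot now that the file is dropped, presumably in favour of the parserutils routines referenced elsewhere in this change. For comparison, a stand-alone sketch of the standard UTF-16 decode; names and structure are illustrative, not library code:

#include <inttypes.h>
#include <stdio.h>

/* Decode one host-endian UTF-16 sequence into a code point using the
 * standard surrogate-pair formula.  Returns the number of 16-bit units
 * consumed, or 0 on error / insufficient input. */
static size_t utf16_decode(const uint16_t *s, size_t units, uint32_t *cp)
{
	if (units == 0)
		return 0;

	if (s[0] < 0xD800 || s[0] > 0xDFFF) {
		*cp = s[0];		/* not a surrogate: BMP code unit */
		return 1;
	}

	/* A high surrogate (D800..DBFF) must be followed by a low one */
	if (s[0] <= 0xDBFF && units >= 2 &&
			s[1] >= 0xDC00 && s[1] <= 0xDFFF) {
		*cp = 0x10000 + (((uint32_t) (s[0] - 0xD800) << 10) |
				(uint32_t) (s[1] - 0xDC00));
		return 2;
	}

	return 0;			/* unpaired or reversed surrogate */
}

int main(void)
{
	const uint16_t gclef[] = { 0xD834, 0xDD1E };	/* U+1D11E */
	uint32_t cp;

	if (utf16_decode(gclef, 2, &cp) == 2)
		printf("U+%04" PRIX32 "\n", cp);	/* U+1D11E */

	return 0;
}
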
diff --git a/src/utils/utf16.h b/src/utils/utf16.h
deleted file mode 100644
index 2cea38d..0000000
--- a/src/utils/utf16.h
+++ /dev/null
@@ -1,38 +0,0 @@
-/*
- * This file is part of Hubbub.
- * Licensed under the MIT License,
- * http://www.opensource.org/licenses/mit-license.php
- * Copyright 2007 John-Mark Bell <jmb@netsurf-browser.org>
- */
-
-/** \file
- * UTF-16 manipulation functions (interface).
- */
-
-#ifndef hubbub_utils_utf16_h_
-#define hubbub_utils_utf16_h_
-
-#include <inttypes.h>
-
-#include <hubbub/errors.h>
-
-hubbub_error hubbub_utf16_to_ucs4(const uint8_t *s, size_t len,
- uint32_t *ucs4, size_t *clen);
-hubbub_error hubbub_utf16_from_ucs4(uint32_t ucs4, uint8_t *s,
- size_t *len);
-
-hubbub_error hubbub_utf16_length(const uint8_t *s, size_t max,
- size_t *len);
-hubbub_error hubbub_utf16_char_byte_length(const uint8_t *s,
- size_t *len);
-
-hubbub_error hubbub_utf16_prev(const uint8_t *s, uint32_t off,
- uint32_t *prevoff);
-hubbub_error hubbub_utf16_next(const uint8_t *s, uint32_t len,
- uint32_t off, uint32_t *nextoff);
-
-hubbub_error hubbub_utf16_next_paranoid(const uint8_t *s,
- uint32_t len, uint32_t off, uint32_t *nextoff);
-
-#endif
-
diff --git a/src/utils/utf8.c b/src/utils/utf8.c
deleted file mode 100644
index 08a1853..0000000
--- a/src/utils/utf8.c
+++ /dev/null
@@ -1,368 +0,0 @@
-/*
- * This file is part of Hubbub.
- * Licensed under the MIT License,
- * http://www.opensource.org/licenses/mit-license.php
- * Copyright 2007 John-Mark Bell <jmb@netsurf-browser.org>
- */
-
-/** \file
- * UTF-8 manipulation functions (implementation).
- */
-
-#include <stdbool.h>
-#include <stdlib.h>
-#include <string.h>
-
-#include "utils/utf8.h"
-
-/** Number of continuation bytes for a given start byte */
-static const uint8_t numContinuations[256] = {
- 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
- 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
- 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
- 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
- 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
- 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
- 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
- 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
- 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
- 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
- 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
- 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
- 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
- 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
- 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2,
- 3, 3, 3, 3, 3, 3, 3, 3, 4, 4, 4, 4, 5, 5, 5, 5,
-};
-
-/**
- * Convert a UTF-8 multibyte sequence into a single UCS4 character
- *
- * Encoding of UCS values outside the UTF-16 plane has been removed from
- * RFC3629. This function conforms to RFC2279, however.
- *
- * \param s The sequence to process
- * \param len Length of sequence
- * \param ucs4 Pointer to location to receive UCS4 character (host endian)
- * \param clen Pointer to location to receive byte length of UTF-8 sequence
- * \return HUBBUB_OK on success, appropriate error otherwise
- */
-hubbub_error hubbub_utf8_to_ucs4(const uint8_t *s, size_t len,
- uint32_t *ucs4, size_t *clen)
-{
- if (s == NULL || ucs4 == NULL || clen == NULL)
- return HUBBUB_BADPARM;
-
- if (len == 0)
- return HUBBUB_NEEDDATA;
-
- if (*s < 0x80) {
- *ucs4 = *s;
- *clen = 1;
- } else if ((*s & 0xE0) == 0xC0) {
- if (len < 2)
- return HUBBUB_NEEDDATA;
- else if ((*(s+1) & 0xC0) != 0x80)
- return HUBBUB_INVALID;
- else {
- *ucs4 = ((*s & 0x1F) << 6) | (*(s+1) & 0x3F);
- *clen = 2;
- }
- } else if ((*s & 0xF0) == 0xE0) {
- if (len < 3)
- return HUBBUB_NEEDDATA;
- else if ((*(s+1) & 0xC0) != 0x80 ||
- (*(s+2) & 0xC0) != 0x80)
- return HUBBUB_INVALID;
- else {
- *ucs4 = ((*s & 0x0F) << 12) |
- ((*(s+1) & 0x3F) << 6) |
- (*(s+2) & 0x3F);
- *clen = 3;
- }
- } else if ((*s & 0xF8) == 0xF0) {
- if (len < 4)
- return HUBBUB_NEEDDATA;
- else if ((*(s+1) & 0xC0) != 0x80 ||
- (*(s+2) & 0xC0) != 0x80 ||
- (*(s+3) & 0xC0) != 0x80)
- return HUBBUB_INVALID;
- else {
- *ucs4 = ((*s & 0x0F) << 18) |
- ((*(s+1) & 0x3F) << 12) |
- ((*(s+2) & 0x3F) << 6) |
- (*(s+3) & 0x3F);
- *clen = 4;
- }
- } else if ((*s & 0xFC) == 0xF8) {
- if (len < 5)
- return HUBBUB_NEEDDATA;
- else if ((*(s+1) & 0xC0) != 0x80 ||
- (*(s+2) & 0xC0) != 0x80 ||
- (*(s+3) & 0xC0) != 0x80 ||
- (*(s+4) & 0xC0) != 0x80)
- return HUBBUB_INVALID;
- else {
- *ucs4 = ((*s & 0x0F) << 24) |
- ((*(s+1) & 0x3F) << 18) |
- ((*(s+2) & 0x3F) << 12) |
- ((*(s+3) & 0x3F) << 6) |
- (*(s+4) & 0x3F);
- *clen = 5;
- }
- } else if ((*s & 0xFE) == 0xFC) {
- if (len < 6)
- return HUBBUB_NEEDDATA;
- else if ((*(s+1) & 0xC0) != 0x80 ||
- (*(s+2) & 0xC0) != 0x80 ||
- (*(s+3) & 0xC0) != 0x80 ||
- (*(s+4) & 0xC0) != 0x80 ||
- (*(s+5) & 0xC0) != 0x80)
- return HUBBUB_INVALID;
- else {
- *ucs4 = ((*s & 0x0F) << 28) |
- ((*(s+1) & 0x3F) << 24) |
- ((*(s+2) & 0x3F) << 18) |
- ((*(s+3) & 0x3F) << 12) |
- ((*(s+4) & 0x3F) << 6) |
- (*(s+5) & 0x3F);
- *clen = 6;
- }
- } else {
- return HUBBUB_INVALID;
- }
-
- return HUBBUB_OK;
-}
-
-/**
- * Convert a single UCS4 character into a UTF-8 multibyte sequence
- *
- * Encoding of UCS values outside the UTF-16 plane has been removed from
- * RFC3629. This function conforms to RFC2279, however.
- *
- * \param ucs4 The character to process (0 <= c <= 0x7FFFFFFF) (host endian)
- * \param s Pointer to 6 byte long output buffer
- * \param len Pointer to location to receive length of multibyte sequence
- * \return HUBBUB_OK on success, appropriate error otherwise
- */
-hubbub_error hubbub_utf8_from_ucs4(uint32_t ucs4, uint8_t *s,
- size_t *len)
-{
- uint32_t l = 0;
-
- if (s == NULL || len == NULL)
- return HUBBUB_BADPARM;
- else if (ucs4 < 0x80) {
- *s = (uint8_t) ucs4;
- l = 1;
- } else if (ucs4 < 0x800) {
- *s = 0xC0 | ((ucs4 >> 6) & 0x1F);
- *(s+1) = 0x80 | (ucs4 & 0x3F);
- l = 2;
- } else if (ucs4 < 0x10000) {
- *s = 0xE0 | ((ucs4 >> 12) & 0xF);
- *(s+1) = 0x80 | ((ucs4 >> 6) & 0x3F);
- *(s+2) = 0x80 | (ucs4 & 0x3F);
- l = 3;
- } else if (ucs4 < 0x200000) {
- *s = 0xF0 | ((ucs4 >> 18) & 0x7);
- *(s+1) = 0x80 | ((ucs4 >> 12) & 0x3F);
- *(s+2) = 0x80 | ((ucs4 >> 6) & 0x3F);
- *(s+3) = 0x80 | (ucs4 & 0x3F);
- l = 4;
- } else if (ucs4 < 0x4000000) {
- *s = 0xF8 | ((ucs4 >> 24) & 0x3);
- *(s+1) = 0x80 | ((ucs4 >> 18) & 0x3F);
- *(s+2) = 0x80 | ((ucs4 >> 12) & 0x3F);
- *(s+3) = 0x80 | ((ucs4 >> 6) & 0x3F);
- *(s+4) = 0x80 | (ucs4 & 0x3F);
- l = 5;
- } else if (ucs4 <= 0x7FFFFFFF) {
- *s = 0xFC | ((ucs4 >> 30) & 0x1);
- *(s+1) = 0x80 | ((ucs4 >> 24) & 0x3F);
- *(s+2) = 0x80 | ((ucs4 >> 18) & 0x3F);
- *(s+3) = 0x80 | ((ucs4 >> 12) & 0x3F);
- *(s+4) = 0x80 | ((ucs4 >> 6) & 0x3F);
- *(s+5) = 0x80 | (ucs4 & 0x3F);
- l = 6;
- } else {
- return HUBBUB_INVALID;
- }
-
- *len = l;
-
- return HUBBUB_OK;
-}
-
-/**
- * Calculate the length (in characters) of a bounded UTF-8 string
- *
- * \param s The string
- * \param max Maximum length
- * \param len Pointer to location to receive length of string
- * \return HUBBUB_OK on success, appropriate error otherwise
- */
-hubbub_error hubbub_utf8_length(const uint8_t *s, size_t max,
- size_t *len)
-{
- const uint8_t *end = s + max;
- int l = 0;
-
- if (s == NULL || len == NULL)
- return HUBBUB_BADPARM;
-
- while (s < end) {
- if ((*s & 0x80) == 0x00)
- s += 1;
- else if ((*s & 0xE0) == 0xC0)
- s += 2;
- else if ((*s & 0xF0) == 0xE0)
- s += 3;
- else if ((*s & 0xF8) == 0xF0)
- s += 4;
- else if ((*s & 0xFC) == 0xF8)
- s += 5;
- else if ((*s & 0xFE) == 0xFC)
- s += 6;
- else
- return HUBBUB_INVALID;
- l++;
- }
-
- *len = l;
-
- return HUBBUB_OK;
-}
-
-/**
- * Calculate the length (in bytes) of a UTF-8 character
- *
- * \param s Pointer to start of character
- * \param len Pointer to location to receive length
- * \return HUBBUB_OK on success, appropriate error otherwise
- */
-hubbub_error hubbub_utf8_char_byte_length(const uint8_t *s,
- size_t *len)
-{
- if (s == NULL || len == NULL)
- return HUBBUB_BADPARM;
-
- *len = numContinuations[s[0]] + 1 /* Start byte */;
-
- return HUBBUB_OK;
-}
-
-/**
- * Find previous legal UTF-8 char in string
- *
- * \param s The string
- * \param off Offset in the string to start at
- * \param prevoff Pointer to location to receive offset of first byte of
- * previous legal character
- * \return HUBBUB_OK on success, appropriate error otherwise
- */
-hubbub_error hubbub_utf8_prev(const uint8_t *s, uint32_t off,
- uint32_t *prevoff)
-{
- if (s == NULL || prevoff == NULL)
- return HUBBUB_BADPARM;
-
- while (off != 0 && (s[--off] & 0xC0) == 0x80)
- /* do nothing */;
-
- *prevoff = off;
-
- return HUBBUB_OK;
-}
-
-/**
- * Find next legal UTF-8 char in string
- *
- * \param s The string (assumed valid)
- * \param len Maximum offset in string
- * \param off Offset in the string to start at
- * \param nextoff Pointer to location to receive offset of first byte of
- * next legal character
- * \return HUBBUB_OK on success, appropriate error otherwise
- */
-hubbub_error hubbub_utf8_next(const uint8_t *s, uint32_t len,
- uint32_t off, uint32_t *nextoff)
-{
- if (s == NULL || off >= len || nextoff == NULL)
- return HUBBUB_BADPARM;
-
- /* Skip current start byte (if present - may be mid-sequence) */
- if (s[off] < 0x80 || (s[off] & 0xC0) == 0xC0)
- off++;
-
- while (off < len && (s[off] & 0xC0) == 0x80)
- off++;
-
- *nextoff = off;
-
- return HUBBUB_OK;
-}
-
-/**
- * Find next legal UTF-8 char in string
- *
- * \param s The string (assumed to be of dubious validity)
- * \param len Maximum offset in string
- * \param off Offset in the string to start at
- * \param nextoff Pointer to location to receive offset of first byte of
- * next legal character
- * \return HUBBUB_OK on success, appropriate error otherwise
- */
-hubbub_error hubbub_utf8_next_paranoid(const uint8_t *s, uint32_t len,
- uint32_t off, uint32_t *nextoff)
-{
- bool valid;
-
- if (s == NULL || off >= len || nextoff == NULL)
- return HUBBUB_BADPARM;
-
- /* Skip current start byte (if present - may be mid-sequence) */
- if (s[off] < 0x80 || (s[off] & 0xC0) == 0xC0)
- off++;
-
- while (1) {
- /* Find next possible start byte */
- while (off < len && (s[off] & 0xC0) == 0x80)
- off++;
-
- /* Ran off end of data */
- if (off == len || off + numContinuations[s[off]] >= len)
- return HUBBUB_NEEDDATA;
-
- /* Found if start byte is ascii,
- * or next n bytes are valid continuations */
- valid = true;
-
- switch (numContinuations[s[off]]) {
- case 5:
- valid &= ((s[off + 5] & 0xC0) == 0x80);
- case 4:
- valid &= ((s[off + 4] & 0xC0) == 0x80);
- case 3:
- valid &= ((s[off + 3] & 0xC0) == 0x80);
- case 2:
- valid &= ((s[off + 2] & 0xC0) == 0x80);
- case 1:
- valid &= ((s[off + 1] & 0xC0) == 0x80);
- case 0:
- valid &= (s[off + 0] < 0x80);
- }
-
- if (valid)
- break;
-
- /* Otherwise, skip this (invalid) start byte and try again */
- off++;
- }
-
- *nextoff = off;
-
- return HUBBUB_OK;
-}
-
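
The deleted file classifies sequence lengths from the lead byte via the numContinuations table and the mask tests in hubbub_utf8_length(). For reference, the same classification, including the 5- and 6-byte RFC 2279 forms this code deliberately kept, fits in a few lines; a stand-alone sketch, not library code:

#include <inttypes.h>
#include <stdio.h>

/* Length in bytes of a UTF-8 sequence, judged from its lead byte.
 * Returns 0 for continuation bytes and invalid leads. */
static size_t utf8_seq_len(uint8_t lead)
{
	if (lead < 0x80)           return 1;	/* ASCII */
	if ((lead & 0xE0) == 0xC0) return 2;
	if ((lead & 0xF0) == 0xE0) return 3;
	if ((lead & 0xF8) == 0xF0) return 4;
	if ((lead & 0xFC) == 0xF8) return 5;	/* RFC 2279 only */
	if ((lead & 0xFE) == 0xFC) return 6;	/* RFC 2279 only */
	return 0;
}

int main(void)
{
	/* 'a' is ASCII, 0xC2 starts U+00A0, 0xE2 starts U+2022 */
	printf("%zu %zu %zu\n",
			utf8_seq_len('a'),
			utf8_seq_len(0xC2),
			utf8_seq_len(0xE2));

	return 0;
}
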
diff --git a/src/utils/utf8.h b/src/utils/utf8.h
deleted file mode 100644
index f2eedcb..0000000
--- a/src/utils/utf8.h
+++ /dev/null
@@ -1,38 +0,0 @@
-/*
- * This file is part of Hubbub.
- * Licensed under the MIT License,
- * http://www.opensource.org/licenses/mit-license.php
- * Copyright 2007 John-Mark Bell <jmb@netsurf-browser.org>
- */
-
-/** \file
- * UTF-8 manipulation functions (interface).
- */
-
-#ifndef hubbub_utils_utf8_h_
-#define hubbub_utils_utf8_h_
-
-#include <inttypes.h>
-
-#include <hubbub/errors.h>
-
-hubbub_error hubbub_utf8_to_ucs4(const uint8_t *s, size_t len,
- uint32_t *ucs4, size_t *clen);
-hubbub_error hubbub_utf8_from_ucs4(uint32_t ucs4, uint8_t *s,
- size_t *len);
-
-hubbub_error hubbub_utf8_length(const uint8_t *s, size_t max,
- size_t *len);
-hubbub_error hubbub_utf8_char_byte_length(const uint8_t *s,
- size_t *len);
-
-hubbub_error hubbub_utf8_prev(const uint8_t *s, uint32_t off,
- uint32_t *prevoff);
-hubbub_error hubbub_utf8_next(const uint8_t *s, uint32_t len,
- uint32_t off, uint32_t *nextoff);
-
-hubbub_error hubbub_utf8_next_paranoid(const uint8_t *s, uint32_t len,
- uint32_t off, uint32_t *nextoff);
-
-#endif
-
diff --git a/test/INDEX b/test/INDEX
index e3522bc..50a15de 100644
--- a/test/INDEX
+++ b/test/INDEX
@@ -2,23 +2,13 @@
#
# Test Description DataDir
-aliases Encoding alias handling
-cscodec Charset codec implementation cscodec
+tokeniser2 HTML tokeniser (again) tokeniser2
csdetect Charset detection csdetect
dict Generic string dictionary
entities Named entity dictionary
-filter Input stream filtering
hubbub Library initialisation/finalisation
-inputstream Buffered input stream html
parser Public parser API html
-parser-utf16 Public parser API (utf-16 internally) html
tokeniser HTML tokeniser html
-tokeniser2 HTML tokeniser (again) tokeniser2
tokeniser3 HTML tokeniser (byte-by-byte) tokeniser2
tree Treebuilding API html
tree2 Treebuilding API tree-construction
-
-# Regression tests
-regression/cscodec-segv Segfault in charset codecs
-regression/filter-segv Segfault in input filtering
-regression/stream-nomem Inputstream buffer expansion
diff --git a/test/Makefile b/test/Makefile
index 20aa6ce..74df014 100644
--- a/test/Makefile
+++ b/test/Makefile
@@ -39,11 +39,8 @@ CFLAGS := $(CFLAGS) -I$(TOP)/src/ -I$(d) \
LDFLAGS := $(LDFLAGS) `$(PKGCONFIG) $(PKGCONFIGFLAGS) --libs json` -liconv
# Tests
-TESTS_$(d) := aliases cscodec csdetect dict entities filter hubbub \
- inputstream parser parser-utf16 tokeniser tokeniser2 tokeniser3 \
- tree tree2
-TESTS_$(d) := $(TESTS_$(d)) regression/cscodec-segv regression/filter-segv \
- regression/stream-nomem
+TESTS_$(d) := csdetect dict entities hubbub parser \
+ tokeniser tokeniser2 tokeniser3 tree tree2
# Items for top-level makefile to use
ITEMS_CLEAN := $(ITEMS_CLEAN) \
@@ -80,7 +77,7 @@ define compile_test
$(2): $(3) $(TOP)/$(COMPONENT)-debug.a
@$$(ECHO) $$(ECHOFLAGS) "==> $(1)"
@$$(CC) -c -g $$(DEBUGCFLAGS) -o $$@.o $(1)
- @$$(LD) -g -o $$@ $$@.o $$(LDFLAGS) -lhubbub-debug -lgcov
+ @$$(LD) -g -o $$@ $$@.o -lhubbub-debug $$(LDFLAGS) -lgcov
@$$(RM) $$(RMFLAGS) $$@.o
endef
diff --git a/test/aliases.c b/test/aliases.c
deleted file mode 100644
index 1cbf2a4..0000000
--- a/test/aliases.c
+++ /dev/null
@@ -1,61 +0,0 @@
-#include <stdio.h>
-#include <string.h>
-
-#include "charset/aliases.h"
-
-#include "testutils.h"
-
-extern void hubbub_aliases_dump(void);
-
-static void *myrealloc(void *ptr, size_t len, void *pw)
-{
- UNUSED(pw);
-
- return realloc(ptr, len);
-}
-
-int main (int argc, char **argv)
-{
- hubbub_aliases_canon *c;
-
- if (argc != 2) {
- printf("Usage: %s <filename>\n", argv[0]);
- return 1;
- }
-
- hubbub_aliases_create(argv[1], myrealloc, NULL);
-
- hubbub_aliases_dump();
-
- c = hubbub_alias_canonicalise("moose", 5);
- if (c) {
- printf("FAIL - found invalid encoding 'moose'\n");
- return 1;
- }
-
- c = hubbub_alias_canonicalise("csinvariant", 11);
- if (c) {
- printf("%s %d\n", c->name, c->mib_enum);
- } else {
- printf("FAIL - failed finding encoding 'csinvariant'\n");
- return 1;
- }
-
- c = hubbub_alias_canonicalise("nats-sefi-add", 13);
- if (c) {
- printf("%s %d\n", c->name, c->mib_enum);
- } else {
- printf("FAIL - failed finding encoding 'nats-sefi-add'\n");
- return 1;
- }
-
- printf("%d\n", hubbub_mibenum_from_name(c->name, strlen(c->name)));
-
- printf("%s\n", hubbub_mibenum_to_name(c->mib_enum));
-
- hubbub_aliases_destroy(myrealloc, NULL);
-
- printf("PASS\n");
-
- return 0;
-}
diff --git a/test/cscodec.c b/test/cscodec.c
deleted file mode 100644
index 525b275..0000000
--- a/test/cscodec.c
+++ /dev/null
@@ -1,247 +0,0 @@
-#include <stdio.h>
-#include <string.h>
-
-#include <hubbub/hubbub.h>
-
-#include "charset/codec.h"
-#include "utils/utils.h"
-
-#include "testutils.h"
-
-typedef struct line_ctx {
- hubbub_charsetcodec *codec;
-
- size_t buflen;
- size_t bufused;
- uint8_t *buf;
- size_t explen;
- size_t expused;
- uint8_t *exp;
-
- bool indata;
- bool inexp;
-
- hubbub_error exp_ret;
-
- enum { ENCODE, DECODE } dir;
-} line_ctx;
-
-static bool handle_line(const char *data, size_t datalen, void *pw);
-static void run_test(line_ctx *ctx);
-static hubbub_error filter(uint32_t c, uint32_t **output,
- size_t *outputlen, void *pw);
-
-
-static void *myrealloc(void *ptr, size_t len, void *pw)
-{
- UNUSED(pw);
-
- return realloc(ptr, len);
-}
-
-int main(int argc, char **argv)
-{
- line_ctx ctx;
-
- if (argc != 3) {
- printf("Usage: %s <aliases_file> <filename>\n", argv[0]);
- return 1;
- }
-
- assert(hubbub_initialise(argv[1], myrealloc, NULL) == HUBBUB_OK);
-
- assert(hubbub_charsetcodec_create("NATS-SEFI-ADD",
- myrealloc, NULL) == NULL);
-
- ctx.codec = hubbub_charsetcodec_create("UTF-8", myrealloc, NULL);
- assert(ctx.codec != NULL);
-
- ctx.buflen = parse_filesize(argv[2]);
- if (ctx.buflen == 0)
- return 1;
-
- ctx.buf = malloc(2 * ctx.buflen);
- if (ctx.buf == NULL) {
- printf("Failed allocating %u bytes\n",
- (unsigned int) ctx.buflen);
- return 1;
- }
-
- ctx.exp = ctx.buf + ctx.buflen;
- ctx.explen = ctx.buflen;
-
- ctx.buf[0] = '\0';
- ctx.exp[0] = '\0';
- ctx.bufused = 0;
- ctx.expused = 0;
- ctx.indata = false;
- ctx.inexp = false;
- ctx.exp_ret = HUBBUB_OK;
-
- assert(parse_testfile(argv[2], handle_line, &ctx) == true);
-
- /* and run final test */
- if (ctx.bufused > 0 && ctx.buf[ctx.bufused - 1] == '\n')
- ctx.bufused -= 1;
-
- if (ctx.expused > 0 && ctx.exp[ctx.expused - 1] == '\n')
- ctx.expused -= 1;
-
- run_test(&ctx);
-
- free(ctx.buf);
-
- hubbub_charsetcodec_destroy(ctx.codec);
-
- assert(hubbub_finalise(myrealloc, NULL) == HUBBUB_OK);
-
- printf("PASS\n");
-
- return 0;
-}
-
-bool handle_line(const char *data, size_t datalen, void *pw)
-{
- line_ctx *ctx = (line_ctx *) pw;
-
- if (data[0] == '#') {
- if (ctx->inexp) {
- /* This marks end of testcase, so run it */
-
- if (ctx->buf[ctx->bufused - 1] == '\n')
- ctx->bufused -= 1;
-
- if (ctx->exp[ctx->expused - 1] == '\n')
- ctx->expused -= 1;
-
- run_test(ctx);
-
- ctx->buf[0] = '\0';
- ctx->exp[0] = '\0';
- ctx->bufused = 0;
- ctx->expused = 0;
- ctx->exp_ret = HUBBUB_OK;
- }
-
- if (strncasecmp(data+1, "data", 4) == 0) {
- hubbub_charsetcodec_optparams params;
- const char *ptr = data + 6;
-
- ctx->indata = true;
- ctx->inexp = false;
-
- if (strncasecmp(ptr, "decode", 6) == 0)
- ctx->dir = DECODE;
- else
- ctx->dir = ENCODE;
-
- ptr += 7;
-
- if (strncasecmp(ptr, "LOOSE", 5) == 0) {
- params.error_mode.mode =
- HUBBUB_CHARSETCODEC_ERROR_LOOSE;
- ptr += 6;
- } else if (strncasecmp(ptr, "STRICT", 6) == 0) {
- params.error_mode.mode =
- HUBBUB_CHARSETCODEC_ERROR_STRICT;
- ptr += 7;
- } else {
- params.error_mode.mode =
- HUBBUB_CHARSETCODEC_ERROR_TRANSLIT;
- ptr += 9;
- }
-
- assert(hubbub_charsetcodec_setopt(ctx->codec,
- HUBBUB_CHARSETCODEC_ERROR_MODE,
- (hubbub_charsetcodec_optparams *) &params)
- == HUBBUB_OK);
-
- if (strncasecmp(ptr, "filter", 6) == 0) {
- params.filter_func.filter = filter;
- params.filter_func.pw = ctx;
-
- assert(hubbub_charsetcodec_setopt(ctx->codec,
- HUBBUB_CHARSETCODEC_FILTER_FUNC,
- (hubbub_charsetcodec_optparams *)
- &params) == HUBBUB_OK);
- }
- } else if (strncasecmp(data+1, "expected", 8) == 0) {
- ctx->indata = false;
- ctx->inexp = true;
-
- ctx->exp_ret = hubbub_error_from_string(data + 10,
- datalen - 10 - 1 /* \n */);
- } else if (strncasecmp(data+1, "reset", 5) == 0) {
- ctx->indata = false;
- ctx->inexp = false;
-
- hubbub_charsetcodec_reset(ctx->codec);
- }
- } else {
- if (ctx->indata) {
- memcpy(ctx->buf + ctx->bufused, data, datalen);
- ctx->bufused += datalen;
- }
- if (ctx->inexp) {
- memcpy(ctx->exp + ctx->expused, data, datalen);
- ctx->expused += datalen;
- }
- }
-
- return true;
-}
-
-void run_test(line_ctx *ctx)
-{
- static int testnum;
- size_t destlen = ctx->bufused * 4;
- uint8_t dest[destlen];
- uint8_t *pdest = dest;
- const uint8_t *psrc = ctx->buf;
- size_t srclen = ctx->bufused;
- size_t i;
-
- if (ctx->dir == DECODE) {
- assert(hubbub_charsetcodec_decode(ctx->codec,
- &psrc, &srclen,
- &pdest, &destlen) == ctx->exp_ret);
- } else {
- assert(hubbub_charsetcodec_encode(ctx->codec,
- &psrc, &srclen,
- &pdest, &destlen) == ctx->exp_ret);
- }
-
- printf("%d: Read '", ++testnum);
- for (i = 0; i < ctx->expused; i++) {
- printf("%c%c ", "0123456789abcdef"[(dest[i] >> 4) & 0xf],
- "0123456789abcdef"[dest[i] & 0xf]);
- }
- printf("' Expected '");
- for (i = 0; i < ctx->expused; i++) {
- printf("%c%c ", "0123456789abcdef"[(ctx->exp[i] >> 4) & 0xf],
- "0123456789abcdef"[ctx->exp[i] & 0xf]);
- }
- printf("'\n");
-
- assert(memcmp(dest, ctx->exp, ctx->expused) == 0);
-}
-
-hubbub_error filter(uint32_t c, uint32_t **output,
- size_t *outputlen, void *pw)
-{
- static uint32_t outbuf;
-
- UNUSED(pw);
-
- if (c == HUBBUB_CHARSETCODEC_NULL) {
- outbuf = 0;
- return HUBBUB_OK;
- }
-
- outbuf = c;
-
- *output = &outbuf;
- *outputlen = 1;
-
- return HUBBUB_OK;
-}
diff --git a/test/csdetect.c b/test/csdetect.c
index 3b39972..d02efcb 100644
--- a/test/csdetect.c
+++ b/test/csdetect.c
@@ -4,9 +4,10 @@
#include <stdlib.h>
#include <string.h>
+#include <parserutils/charset/mibenum.h>
+
#include <hubbub/hubbub.h>
-#include "charset/aliases.h"
#include "charset/detect.h"
#include "utils/utils.h"
@@ -113,20 +114,21 @@ bool handle_line(const char *data, size_t datalen, void *pw)
void run_test(const uint8_t *data, size_t len, char *expected)
{
- uint16_t mibenum;
- hubbub_charset_source source;
+ uint16_t mibenum = 0;
+ hubbub_charset_source source = HUBBUB_CHARSET_UNKNOWN;
static int testnum;
- assert(hubbub_charset_extract(&data, &len,
+ assert(hubbub_charset_extract(data, len,
&mibenum, &source) == HUBBUB_OK);
assert(mibenum != 0);
printf("%d: Detected charset %s (%d) Source %d Expected %s (%d)\n",
- ++testnum, hubbub_mibenum_to_name(mibenum),
+ ++testnum, parserutils_charset_mibenum_to_name(mibenum),
mibenum, source, expected,
- hubbub_mibenum_from_name(expected, strlen(expected)));
+ parserutils_charset_mibenum_from_name(
+ expected, strlen(expected)));
- assert(mibenum ==
- hubbub_mibenum_from_name(expected, strlen(expected)));
+ assert(mibenum == parserutils_charset_mibenum_from_name(
+ expected, strlen(expected)));
}
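
The detection entry point now takes the data and length by value, and the MIB enum lookups come from parserutils rather than the removed charset/aliases module. A sketch of a caller mirroring the updated run_test() above; only the wrapper function itself is invented here:

#include <inttypes.h>
#include <stdio.h>

#include <parserutils/charset/mibenum.h>

#include <hubbub/hubbub.h>

#include "charset/detect.h"

/* Detect the charset of a buffered document prefix and print the
 * canonical name of whatever was found. */
void report_charset(const uint8_t *data, size_t len)
{
	uint16_t mibenum = 0;
	hubbub_charset_source source = HUBBUB_CHARSET_UNKNOWN;

	if (hubbub_charset_extract(data, len, &mibenum, &source) != HUBBUB_OK)
		return;

	printf("%s (source %d)\n",
			parserutils_charset_mibenum_to_name(mibenum),
			(int) source);
}
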
diff --git a/test/data/cscodec/INDEX b/test/data/cscodec/INDEX
deleted file mode 100644
index 326cff5..0000000
--- a/test/data/cscodec/INDEX
+++ /dev/null
@@ -1,5 +0,0 @@
-# Index file for charset codec tests
-#
-# Test Description
-
-simple.dat Simple tests, designed to validate testdriver \ No newline at end of file
diff --git a/test/data/cscodec/simple.dat b/test/data/cscodec/simple.dat
deleted file mode 100644
index 6a3cad1..0000000
--- a/test/data/cscodec/simple.dat
+++ /dev/null
Binary files differ
diff --git a/test/data/csdetect/INDEX b/test/data/csdetect/INDEX
index e292063..315ce6a 100644
--- a/test/data/csdetect/INDEX
+++ b/test/data/csdetect/INDEX
@@ -7,3 +7,4 @@ non-ascii-meta.dat Tests for meta charsets claiming to be non-ASCII
test-yahoo-jp.dat Yahoo! Japan, from html5lib testcases
tests1.dat Assorted tests, including edge cases, from html5lib
tests2.dat Further tests from html5lib
+regression.dat Regression tests
diff --git a/test/data/csdetect/regression.dat b/test/data/csdetect/regression.dat
new file mode 100644
index 0000000..75e5f14
--- /dev/null
+++ b/test/data/csdetect/regression.dat
@@ -0,0 +1,5 @@
+#data
+<table nowrap>
+#encoding
+windows-1252
+
diff --git a/test/data/tree-construction/INDEX b/test/data/tree-construction/INDEX
index ea258b0..c994b5a 100644
--- a/test/data/tree-construction/INDEX
+++ b/test/data/tree-construction/INDEX
@@ -17,3 +17,4 @@ tests12.dat html5lib tests
after-after-body.dat Tests "after after body" mode
after-after-frameset.dat Tests "after after frameset" mode
after-body.dat Tests "after body" mode
+regression.dat Regression tests
diff --git a/test/data/tree-construction/regression.dat b/test/data/tree-construction/regression.dat
new file mode 100644
index 0000000..0d4d77a
--- /dev/null
+++ b/test/data/tree-construction/regression.dat
@@ -0,0 +1,31 @@
+#data
+<html>
+ <body>
+ <table>
+ <tr>
+ <td>
+ <div>
+ <b>
+ </div>
+ <table></table>
+ </td>
+ </tr>
+ </table>
+ <table></table>
+ <script type="text/javascript"></script>
+ </body>
+</html>
+#errors
+#document
+| <html>
+| <head>
+| <body>
+| <table>
+| <tr>
+| <td>
+| <div>
+| <b>
+| <table>
+| <table>
+| <script>
+
diff --git a/test/filter.c b/test/filter.c
deleted file mode 100644
index 83cce20..0000000
--- a/test/filter.c
+++ /dev/null
@@ -1,355 +0,0 @@
-#include <inttypes.h>
-#include <stdio.h>
-#include <stdlib.h>
-#include <string.h>
-
-#include <hubbub/hubbub.h>
-
-#include "utils/utils.h"
-
-#include "input/filter.h"
-
-#include "testutils.h"
-
-static void *myrealloc(void *ptr, size_t len, void *pw)
-{
- UNUSED(pw);
-
- return realloc(ptr, len);
-}
-
-int main(int argc, char **argv)
-{
- hubbub_filter_optparams params;
- hubbub_filter *input;
- uint8_t inbuf[64], outbuf[64];
- size_t inlen, outlen;
- const uint8_t *in = inbuf;
- uint8_t *out = outbuf;
-
- if (argc != 2) {
- printf("Usage: %s <filename>\n", argv[0]);
- return 1;
- }
-
- /* Initialise library */
- assert(hubbub_initialise(argv[1], myrealloc, NULL) == HUBBUB_OK);
-
- /* Create input filter */
- input = hubbub_filter_create("UTF-8", myrealloc, NULL);
- assert(input);
-
- /* Convert filter to UTF-8 encoding */
- params.encoding.name = "UTF-8";
- assert(hubbub_filter_setopt(input, HUBBUB_FILTER_SET_ENCODING,
- (hubbub_filter_optparams *) &params) == HUBBUB_OK);
-
-
- /* Simple case - valid input & output buffer large enough */
- in = inbuf;
- out = outbuf;
- strcpy((char *) inbuf, "hell\xc2\xa0o!");
- inlen = strlen((const char *) inbuf);
- outbuf[0] = '\0';
- outlen = 64;
-
- assert(hubbub_filter_process_chunk(input, &in, &inlen,
- &out, &outlen) == HUBBUB_OK);
-
- printf("'%.*s' %d '%.*s' %d\n", (int) inlen, in, (int) inlen,
- (int) (out - ((uint8_t *) outbuf)),
- outbuf, (int) outlen);
-
- assert(hubbub_filter_reset(input) == HUBBUB_OK);
-
- assert(memcmp(outbuf, "hell\xc2\xa0o!",
- SLEN("hell\xc2\xa0o!")) == 0);
-
-
- /* Too small an output buffer; no encoding edge cases */
- in = inbuf;
- out = outbuf;
- strcpy((char *) inbuf, "hello!");
- inlen = strlen((const char *) inbuf);
- outbuf[0] = '\0';
- outlen = 5;
-
- assert(hubbub_filter_process_chunk(input, &in, &inlen,
- &out, &outlen) == HUBBUB_NOMEM);
-
- printf("'%.*s' %d '%.*s' %d\n", (int) inlen, in, (int) inlen,
- (int) (out - ((uint8_t *) outbuf)),
- outbuf, (int) outlen);
-
- outlen = 64 - 5 + outlen;
-
- assert(hubbub_filter_process_chunk(input, &in, &inlen,
- &out, &outlen) == HUBBUB_OK);
-
- printf("'%.*s' %d '%.*s' %d\n", (int) inlen, in, (int) inlen,
- (int) (out - ((uint8_t *) outbuf)),
- outbuf, (int) outlen);
-
- assert(hubbub_filter_reset(input) == HUBBUB_OK);
-
- assert(memcmp(outbuf, "hello!",
- SLEN("hello!")) == 0);
-
-
- /* Illegal input sequence; output buffer large enough */
- in = inbuf;
- out = outbuf;
- strcpy((char *) inbuf, "hell\x96o!");
- inlen = strlen((const char *) inbuf);
- outbuf[0] = '\0';
- outlen = 64;
-
- /* Input does loose decoding, converting to U+FFFD if illegal
- * input is encountered */
- assert(hubbub_filter_process_chunk(input, &in, &inlen,
- &out, &outlen) == HUBBUB_OK);
-
- printf("'%.*s' %d '%.*s' %d\n", (int) inlen, in, (int) inlen,
- (int) (out - ((uint8_t *) outbuf)),
- outbuf, (int) outlen);
-
- assert(hubbub_filter_reset(input) == HUBBUB_OK);
-
- assert(memcmp(outbuf, "hell\xef\xbf\xbdo!",
- SLEN("hell\xef\xbf\xbdo!")) == 0);
-
-
- /* Input ends mid-sequence */
- in = inbuf;
- out = outbuf;
- strcpy((char *) inbuf, "hell\xc2\xa0o!");
- inlen = strlen((const char *) inbuf) - 3;
- outbuf[0] = '\0';
- outlen = 64;
-
- assert(hubbub_filter_process_chunk(input, &in, &inlen,
- &out, &outlen) == HUBBUB_OK);
-
- printf("'%.*s' %d '%.*s' %d\n", (int) inlen, in, (int) inlen,
- (int) (out - ((uint8_t *) outbuf)),
- outbuf, (int) outlen);
-
- inlen = 3;
-
- assert(hubbub_filter_process_chunk(input, &in, &inlen,
- &out, &outlen) == HUBBUB_OK);
-
- printf("'%.*s' %d '%.*s' %d\n", (int) inlen, in, (int) inlen,
- (int) (out - ((uint8_t *) outbuf)),
- outbuf, (int) outlen);
-
- assert(hubbub_filter_reset(input) == HUBBUB_OK);
-
- assert(memcmp(outbuf, "hell\xc2\xa0o!",
- SLEN("hell\xc2\xa0o!")) == 0);
-
-
- /* Input ends mid-sequence, but second attempt has too small a
- * buffer, but large enough to write out the incomplete character. */
- in = inbuf;
- out = outbuf;
- strcpy((char *) inbuf, "hell\xc2\xa0o!");
- inlen = strlen((const char *) inbuf) - 3;
- outbuf[0] = '\0';
- outlen = 64;
-
- assert(hubbub_filter_process_chunk(input, &in, &inlen,
- &out, &outlen) == HUBBUB_OK);
-
- printf("'%.*s' %d '%.*s' %d\n", (int) inlen, in, (int) inlen,
- (int) (out - ((uint8_t *) outbuf)),
- outbuf, (int) outlen);
-
- inlen = 3;
- outlen = 3;
-
- assert(hubbub_filter_process_chunk(input, &in, &inlen,
- &out, &outlen) == HUBBUB_NOMEM);
-
- printf("'%.*s' %d '%.*s' %d\n", (int) inlen, in, (int) inlen,
- (int) (out - ((uint8_t *) outbuf)),
- outbuf, (int) outlen);
-
- outlen = 64 - 7;
-
- assert(hubbub_filter_process_chunk(input, &in, &inlen,
- &out, &outlen) == HUBBUB_OK);
-
- printf("'%.*s' %d '%.*s' %d\n", (int) inlen, in, (int) inlen,
- (int) (out - ((uint8_t *) outbuf)),
- outbuf, (int) outlen);
-
- assert(hubbub_filter_reset(input) == HUBBUB_OK);
-
- assert(memcmp(outbuf, "hell\xc2\xa0o!",
- SLEN("hell\xc2\xa0o!")) == 0);
-
-
- /* Input ends mid-sequence, but second attempt has too small a
- * buffer, not large enough to write out the incomplete character. */
- in = inbuf;
- out = outbuf;
- strcpy((char *) inbuf, "hell\xc2\xa0o!");
- inlen = strlen((const char *) inbuf) - 3;
- outbuf[0] = '\0';
- outlen = 64;
-
- assert(hubbub_filter_process_chunk(input, &in, &inlen,
- &out, &outlen) == HUBBUB_OK);
-
- printf("'%.*s' %d '%.*s' %d\n", (int) inlen, in, (int) inlen,
- (int) (out - ((uint8_t *) outbuf)),
- outbuf, (int) outlen);
-
- inlen = 3;
- outlen = 1;
-
- assert(hubbub_filter_process_chunk(input, &in, &inlen,
- &out, &outlen) == HUBBUB_NOMEM);
-
- printf("'%.*s' %d '%.*s' %d\n", (int) inlen, in, (int) inlen,
- (int) (out - ((uint8_t *) outbuf)),
- outbuf, (int) outlen);
-
- outlen = 60;
-
- assert(hubbub_filter_process_chunk(input, &in, &inlen,
- &out, &outlen) == HUBBUB_OK);
-
- printf("'%.*s' %d '%.*s' %d\n", (int) inlen, in, (int) inlen,
- (int) (out - ((uint8_t *) outbuf)),
- outbuf, (int) outlen);
-
- assert(hubbub_filter_reset(input) == HUBBUB_OK);
-
- assert(memcmp(outbuf, "hell\xc2\xa0o!",
- SLEN("hell\xc2\xa0o!")) == 0);
-
-
- /* Input ends mid-sequence, but second attempt contains
- * invalid character */
- in = inbuf;
- out = outbuf;
- strcpy((char *) inbuf, "hell\xc2\xc2o!");
- inlen = strlen((const char *) inbuf) - 3;
- outbuf[0] = '\0';
- outlen = 64;
-
- assert(hubbub_filter_process_chunk(input, &in, &inlen,
- &out, &outlen) == HUBBUB_OK);
-
- printf("'%.*s' %d '%.*s' %d\n", (int) inlen, in, (int) inlen,
- (int) (out - ((uint8_t *) outbuf)),
- outbuf, (int) outlen);
-
- inlen = 3;
-
- /* Input does loose decoding, converting to U+FFFD if illegal
- * input is encountered */
- assert(hubbub_filter_process_chunk(input, &in, &inlen,
- &out, &outlen) == HUBBUB_OK);
-
- printf("'%.*s' %d '%.*s' %d\n", (int) inlen, in, (int) inlen,
- (int) (out - ((uint8_t *) outbuf)),
- outbuf, (int) outlen);
-
- assert(hubbub_filter_reset(input) == HUBBUB_OK);
-
- assert(memcmp(outbuf, "hell\xef\xbf\xbdo!",
- SLEN("hell\xef\xbf\xbdo!")) == 0);
-
-
- /* Input ends mid-sequence, but second attempt contains another
- * incomplete character */
- in = inbuf;
- out = outbuf;
- strcpy((char *) inbuf, "hell\xc2\xa0\xc2\xa1o!");
- inlen = strlen((const char *) inbuf) - 5;
- outbuf[0] = '\0';
- outlen = 64;
-
- assert(hubbub_filter_process_chunk(input, &in, &inlen,
- &out, &outlen) == HUBBUB_OK);
-
- printf("'%.*s' %d '%.*s' %d\n", (int) inlen, in, (int) inlen,
- (int) (out - ((uint8_t *) outbuf)),
- outbuf, (int) outlen);
-
- inlen = 2;
-
- assert(hubbub_filter_process_chunk(input, &in, &inlen,
- &out, &outlen) == HUBBUB_OK);
-
- printf("'%.*s' %d '%.*s' %d\n", (int) inlen, in, (int) inlen,
- (int) (out - ((uint8_t *) outbuf)),
- outbuf, (int) outlen);
-
- inlen = 3;
-
- assert(hubbub_filter_process_chunk(input, &in, &inlen,
- &out, &outlen) == HUBBUB_OK);
-
- printf("'%.*s' %d '%.*s' %d\n", (int) inlen, in, (int) inlen,
- (int) (out - ((uint8_t *) outbuf)),
- outbuf, (int) outlen);
-
- assert(hubbub_filter_reset(input) == HUBBUB_OK);
-
- assert(memcmp(outbuf, "hell\xc2\xa0\xc2\xa1o!",
- SLEN("hell\xc2\xa0\xc2\xa1o!")) == 0);
-
-
- /* Input ends mid-sequence, but second attempt contains insufficient
- * data to complete the incomplete character */
- in = inbuf;
- out = outbuf;
- strcpy((char *) inbuf, "hell\xe2\x80\xa2o!");
- inlen = strlen((const char *) inbuf) - 4;
- outbuf[0] = '\0';
- outlen = 64;
-
- assert(hubbub_filter_process_chunk(input, &in, &inlen,
- &out, &outlen) == HUBBUB_OK);
-
- printf("'%.*s' %d '%.*s' %d\n", (int) inlen, in, (int) inlen,
- (int) (out - ((uint8_t *) outbuf)),
- outbuf, (int) outlen);
-
- inlen = 1;
-
- assert(hubbub_filter_process_chunk(input, &in, &inlen,
- &out, &outlen) == HUBBUB_OK);
-
- printf("'%.*s' %d '%.*s' %d\n", (int) inlen, in, (int) inlen,
- (int) (out - ((uint8_t *) outbuf)),
- outbuf, (int) outlen);
-
- inlen = 3;
-
- assert(hubbub_filter_process_chunk(input, &in, &inlen,
- &out, &outlen) == HUBBUB_OK);
-
- printf("'%.*s' %d '%.*s' %d\n", (int) inlen, in, (int) inlen,
- (int) (out - ((uint8_t *) outbuf)),
- outbuf, (int) outlen);
-
- assert(hubbub_filter_reset(input) == HUBBUB_OK);
-
- assert(memcmp(outbuf, "hell\xe2\x80\xa2o!",
- SLEN("hell\xe2\x80\xa2o!")) == 0);
-
-
- /* Clean up */
- hubbub_filter_destroy(input);
-
- assert(hubbub_finalise(myrealloc, NULL) == HUBBUB_OK);
-
- printf("PASS\n");
-
- return 0;
-}
diff --git a/test/inputstream.c b/test/inputstream.c
deleted file mode 100644
index 3a83419..0000000
--- a/test/inputstream.c
+++ /dev/null
@@ -1,126 +0,0 @@
-#include <inttypes.h>
-#include <stdio.h>
-
-#include <hubbub/hubbub.h>
-
-#include "utils/utils.h"
-
-#include "input/inputstream.h"
-
-#include "testutils.h"
-
-static void buffer_moved_handler(const uint8_t *buffer, size_t len,
- void *pw);
-
-static void *myrealloc(void *ptr, size_t len, void *pw)
-{
- UNUSED(pw);
-
- return realloc(ptr, len);
-}
-
-int main(int argc, char **argv)
-{
- hubbub_inputstream *stream;
- FILE *fp;
- size_t len, origlen;
-#define CHUNK_SIZE (4096)
- uint8_t buf[CHUNK_SIZE];
- uint8_t *isb;
- size_t isblen;
- uint32_t c;
-
- if (argc != 3) {
- printf("Usage: %s <aliases_file> <filename>\n", argv[0]);
- return 1;
- }
-
- /* Initialise library */
- assert(hubbub_initialise(argv[1], myrealloc, NULL) == HUBBUB_OK);
-
- stream = hubbub_inputstream_create("UTF-8", "UTF-8", myrealloc, NULL);
- assert(stream != NULL);
-
- assert(hubbub_inputstream_register_movehandler(stream,
- buffer_moved_handler, NULL) == HUBBUB_OK);
-
- fp = fopen(argv[2], "rb");
- if (fp == NULL) {
- printf("Failed opening %s\n", argv[2]);
- return 1;
- }
-
- fseek(fp, 0, SEEK_END);
- origlen = len = ftell(fp);
- fseek(fp, 0, SEEK_SET);
-
- while (len >= CHUNK_SIZE) {
- fread(buf, 1, CHUNK_SIZE, fp);
-
- assert(hubbub_inputstream_append(stream,
- buf, CHUNK_SIZE) == HUBBUB_OK);
-
- len -= CHUNK_SIZE;
-
- while ((c = hubbub_inputstream_peek(stream)) !=
- HUBBUB_INPUTSTREAM_OOD) {
- size_t len;
- hubbub_inputstream_cur_pos(stream, &len);
- hubbub_inputstream_advance(stream);
- assert(hubbub_inputstream_push_back(stream, c) ==
- HUBBUB_OK);
- hubbub_inputstream_advance(stream);
- }
- }
-
- if (len > 0) {
- fread(buf, 1, len, fp);
-
- assert(hubbub_inputstream_append(stream,
- buf, len) == HUBBUB_OK);
-
- len = 0;
- }
-
- fclose(fp);
-
- assert(hubbub_inputstream_insert(stream,
- (const uint8_t *) "hello!!!",
- SLEN("hello!!!")) == HUBBUB_OK);
-
- assert(hubbub_inputstream_append(stream, NULL, 0) == HUBBUB_OK);
-
- while (hubbub_inputstream_peek(stream) !=
- HUBBUB_INPUTSTREAM_EOF) {
- size_t len;
- hubbub_inputstream_cur_pos(stream, &len);
- hubbub_inputstream_advance(stream);
- }
-
- assert(hubbub_inputstream_claim_buffer(stream, &isb, &isblen) ==
- HUBBUB_OK);
-
- printf("Input size: %zu, Output size: %zu\n", origlen, isblen);
- printf("Buffer at %p\n", isb);
-
- free(isb);
-
- assert(hubbub_inputstream_deregister_movehandler(stream,
- buffer_moved_handler, NULL) == HUBBUB_OK);
-
- hubbub_inputstream_destroy(stream);
-
- assert(hubbub_finalise(myrealloc, NULL) == HUBBUB_OK);
-
- printf("PASS\n");
-
- return 0;
-}
-
-void buffer_moved_handler(const uint8_t *buffer, size_t len,
- void *pw)
-{
- UNUSED(pw);
-
- printf("Buffer moved to: %p (%zu)\n", buffer, len);
-}
diff --git a/test/parser-utf16.c b/test/parser-utf16.c
deleted file mode 100644
index 326de78..0000000
--- a/test/parser-utf16.c
+++ /dev/null
@@ -1,195 +0,0 @@
-#include <inttypes.h>
-#include <stdio.h>
-#include <stdlib.h>
-
-#include <hubbub/hubbub.h>
-
-#include <hubbub/parser.h>
-
-#include "utils/utils.h"
-
-#include "testutils.h"
-
-static const uint8_t *pbuffer;
-
-static void buffer_handler(const uint8_t *buffer, size_t len, void *pw);
-static void token_handler(const hubbub_token *token, void *pw);
-
-static void *myrealloc(void *ptr, size_t len, void *pw)
-{
- UNUSED(pw);
-
- return realloc(ptr, len);
-}
-
-int main(int argc, char **argv)
-{
- hubbub_parser *parser;
- hubbub_parser_optparams params;
- FILE *fp;
- size_t len, origlen;
-#define CHUNK_SIZE (4096)
- uint8_t buf[CHUNK_SIZE];
- const char *charset;
- hubbub_charset_source cssource;
- uint8_t *buffer;
-
- if (argc != 3) {
- printf("Usage: %s <aliases_file> <filename>\n", argv[0]);
- return 1;
- }
-
- /* Initialise library */
- assert(hubbub_initialise(argv[1], myrealloc, NULL) == HUBBUB_OK);
-
- parser = hubbub_parser_create("UTF-8", "UTF-16", myrealloc, NULL);
- assert(parser != NULL);
-
- params.buffer_handler.handler = buffer_handler;
- params.buffer_handler.pw = NULL;
- assert(hubbub_parser_setopt(parser, HUBBUB_PARSER_BUFFER_HANDLER,
- &params) == HUBBUB_OK);
-
- params.token_handler.handler = token_handler;
- params.token_handler.pw = NULL;
- assert(hubbub_parser_setopt(parser, HUBBUB_PARSER_TOKEN_HANDLER,
- &params) == HUBBUB_OK);
-
- fp = fopen(argv[2], "rb");
- if (fp == NULL) {
- printf("Failed opening %s\n", argv[2]);
- return 1;
- }
-
- fseek(fp, 0, SEEK_END);
- origlen = len = ftell(fp);
- fseek(fp, 0, SEEK_SET);
-
- while (len >= CHUNK_SIZE) {
- fread(buf, 1, CHUNK_SIZE, fp);
-
- assert(hubbub_parser_parse_chunk(parser,
- buf, CHUNK_SIZE) == HUBBUB_OK);
-
- len -= CHUNK_SIZE;
- }
-
- if (len > 0) {
- fread(buf, 1, len, fp);
-
- assert(hubbub_parser_parse_chunk(parser,
- buf, len) == HUBBUB_OK);
-
- len = 0;
-
- assert(hubbub_parser_completed(parser) == HUBBUB_OK);
- }
-
- fclose(fp);
-
- charset = hubbub_parser_read_charset(parser, &cssource);
-
- printf("Charset: %s (from %d)\n", charset, cssource);
-
- assert(hubbub_parser_claim_buffer(parser, &buffer, &len) ==
- HUBBUB_OK);
-
- free(buffer);
-
- hubbub_parser_destroy(parser);
-
- assert(hubbub_finalise(myrealloc, NULL) == HUBBUB_OK);
-
- printf("PASS\n");
-
- return 0;
-}
-
-void buffer_handler(const uint8_t *buffer, size_t len, void *pw)
-{
- UNUSED(len);
- UNUSED(pw);
-
- pbuffer = buffer;
-}
-
-void token_handler(const hubbub_token *token, void *pw)
-{
- static const char *token_names[] = {
- "DOCTYPE", "START TAG", "END TAG",
- "COMMENT", "CHARACTERS", "EOF"
- };
- size_t i;
-
- UNUSED(pw);
-
- printf("%s: ", token_names[token->type]);
-
- switch (token->type) {
- case HUBBUB_TOKEN_DOCTYPE:
- printf("'%.*s' %sids:\n",
- (int) token->data.doctype.name.len,
- pbuffer + token->data.doctype.name.data.off,
- token->data.doctype.force_quirks ?
- "(force-quirks) " : "");
-
- if (token->data.doctype.public_missing)
- printf("\tpublic: missing\n");
- else
- printf("\tpublic: '%.*s'\n",
- (int) token->data.doctype.public_id.len,
- pbuffer + token->data.doctype.public_id.data.off);
-
- if (token->data.doctype.system_missing)
- printf("\tsystem: missing\n");
- else
- printf("\tsystem: '%.*s'\n",
- (int) token->data.doctype.system_id.len,
- pbuffer + token->data.doctype.system_id.data.off);
-
- break;
- case HUBBUB_TOKEN_START_TAG:
- printf("'%.*s' %s%s\n",
- (int) token->data.tag.name.len,
- pbuffer + token->data.tag.name.data.off,
- (token->data.tag.self_closing) ?
- "(self-closing) " : "",
- (token->data.tag.n_attributes > 0) ?
- "attributes:" : "");
- for (i = 0; i < token->data.tag.n_attributes; i++) {
- printf("\t'%.*s' = '%.*s'\n",
- (int) token->data.tag.attributes[i].name.len,
- pbuffer + token->data.tag.attributes[i].name.data.off,
- (int) token->data.tag.attributes[i].value.len,
- pbuffer + token->data.tag.attributes[i].value.data.off);
- }
- break;
- case HUBBUB_TOKEN_END_TAG:
- printf("'%.*s' %s%s\n",
- (int) token->data.tag.name.len,
- pbuffer + token->data.tag.name.data.off,
- (token->data.tag.self_closing) ?
- "(self-closing) " : "",
- (token->data.tag.n_attributes > 0) ?
- "attributes:" : "");
- for (i = 0; i < token->data.tag.n_attributes; i++) {
- printf("\t'%.*s' = '%.*s'\n",
- (int) token->data.tag.attributes[i].name.len,
- pbuffer + token->data.tag.attributes[i].name.data.off,
- (int) token->data.tag.attributes[i].value.len,
- pbuffer + token->data.tag.attributes[i].value.data.off);
- }
- break;
- case HUBBUB_TOKEN_COMMENT:
- printf("'%.*s'\n", (int) token->data.comment.len,
- pbuffer + token->data.comment.data.off);
- break;
- case HUBBUB_TOKEN_CHARACTER:
- printf("'%.*s'\n", (int) token->data.character.len,
- pbuffer + token->data.character.data.off);
- break;
- case HUBBUB_TOKEN_EOF:
- printf("\n");
- break;
- }
-}
diff --git a/test/parser.c b/test/parser.c
index 0bc9246..ba8dd31 100644
--- a/test/parser.c
+++ b/test/parser.c
@@ -10,9 +10,6 @@
#include "testutils.h"
-static const uint8_t *pbuffer;
-
-static void buffer_handler(const uint8_t *buffer, size_t len, void *pw);
static void token_handler(const hubbub_token *token, void *pw);
static void *myrealloc(void *ptr, size_t len, void *pw)
@@ -32,7 +29,6 @@ int main(int argc, char **argv)
uint8_t buf[CHUNK_SIZE];
const char *charset;
hubbub_charset_source cssource;
- uint8_t *buffer;
if (argc != 3) {
printf("Usage: %s <aliases_file> <filename>\n", argv[0]);
@@ -45,11 +41,6 @@ int main(int argc, char **argv)
parser = hubbub_parser_create("UTF-8", "UTF-8", myrealloc, NULL);
assert(parser != NULL);
- params.buffer_handler.handler = buffer_handler;
- params.buffer_handler.pw = NULL;
- assert(hubbub_parser_setopt(parser, HUBBUB_PARSER_BUFFER_HANDLER,
- &params) == HUBBUB_OK);
-
params.token_handler.handler = token_handler;
params.token_handler.pw = NULL;
assert(hubbub_parser_setopt(parser, HUBBUB_PARSER_TOKEN_HANDLER,
@@ -91,11 +82,6 @@ int main(int argc, char **argv)
printf("Charset: %s (from %d)\n", charset, cssource);
- assert(hubbub_parser_claim_buffer(parser, &buffer, &len) ==
- HUBBUB_OK);
-
- free(buffer);
-
hubbub_parser_destroy(parser);
assert(hubbub_finalise(myrealloc, NULL) == HUBBUB_OK);
@@ -105,14 +91,6 @@ int main(int argc, char **argv)
return 0;
}
-void buffer_handler(const uint8_t *buffer, size_t len, void *pw)
-{
- UNUSED(len);
- UNUSED(pw);
-
- pbuffer = buffer;
-}
-
void token_handler(const hubbub_token *token, void *pw)
{
static const char *token_names[] = {
@@ -129,7 +107,7 @@ void token_handler(const hubbub_token *token, void *pw)
case HUBBUB_TOKEN_DOCTYPE:
printf("'%.*s' %sids:\n",
(int) token->data.doctype.name.len,
- pbuffer + token->data.doctype.name.data.off,
+ token->data.doctype.name.ptr,
token->data.doctype.force_quirks ?
"(force-quirks) " : "");
@@ -138,20 +116,20 @@ void token_handler(const hubbub_token *token, void *pw)
else
printf("\tpublic: '%.*s'\n",
(int) token->data.doctype.public_id.len,
- pbuffer + token->data.doctype.public_id.data.off);
+ token->data.doctype.public_id.ptr);
if (token->data.doctype.system_missing)
printf("\tsystem: missing\n");
else
printf("\tsystem: '%.*s'\n",
(int) token->data.doctype.system_id.len,
- pbuffer + token->data.doctype.system_id.data.off);
+ token->data.doctype.system_id.ptr);
break;
case HUBBUB_TOKEN_START_TAG:
printf("'%.*s' %s%s\n",
(int) token->data.tag.name.len,
- pbuffer + token->data.tag.name.data.off,
+ token->data.tag.name.ptr,
(token->data.tag.self_closing) ?
"(self-closing) " : "",
(token->data.tag.n_attributes > 0) ?
@@ -159,15 +137,15 @@ void token_handler(const hubbub_token *token, void *pw)
for (i = 0; i < token->data.tag.n_attributes; i++) {
printf("\t'%.*s' = '%.*s'\n",
(int) token->data.tag.attributes[i].name.len,
- pbuffer + token->data.tag.attributes[i].name.data.off,
+ token->data.tag.attributes[i].name.ptr,
(int) token->data.tag.attributes[i].value.len,
- pbuffer + token->data.tag.attributes[i].value.data.off);
+ token->data.tag.attributes[i].value.ptr);
}
break;
case HUBBUB_TOKEN_END_TAG:
printf("'%.*s' %s%s\n",
(int) token->data.tag.name.len,
- pbuffer + token->data.tag.name.data.off,
+ token->data.tag.name.ptr,
(token->data.tag.self_closing) ?
"(self-closing) " : "",
(token->data.tag.n_attributes > 0) ?
@@ -175,18 +153,18 @@ void token_handler(const hubbub_token *token, void *pw)
for (i = 0; i < token->data.tag.n_attributes; i++) {
printf("\t'%.*s' = '%.*s'\n",
(int) token->data.tag.attributes[i].name.len,
- pbuffer + token->data.tag.attributes[i].name.data.off,
+ token->data.tag.attributes[i].name.ptr,
(int) token->data.tag.attributes[i].value.len,
- pbuffer + token->data.tag.attributes[i].value.data.off);
+ token->data.tag.attributes[i].value.ptr);
}
break;
case HUBBUB_TOKEN_COMMENT:
printf("'%.*s'\n", (int) token->data.comment.len,
- pbuffer + token->data.comment.data.off);
+ token->data.comment.ptr);
break;
case HUBBUB_TOKEN_CHARACTER:
printf("'%.*s'\n", (int) token->data.character.len,
- pbuffer + token->data.character.data.off);
+ token->data.character.ptr);
break;
case HUBBUB_TOKEN_EOF:
printf("\n");
diff --git a/test/regression/cscodec-segv.c b/test/regression/cscodec-segv.c
deleted file mode 100644
index ad9894a..0000000
--- a/test/regression/cscodec-segv.c
+++ /dev/null
@@ -1,37 +0,0 @@
-#include <stdio.h>
-
-#include <hubbub/hubbub.h>
-
-#include "charset/codec.h"
-
-#include "testutils.h"
-
-static void *myrealloc(void *ptr, size_t len, void *pw)
-{
- UNUSED(pw);
-
- return realloc(ptr, len);
-}
-
-int main(int argc, char **argv)
-{
- hubbub_charsetcodec *codec;
-
- if (argc != 2) {
- printf("Usage: %s <aliases_file>\n", argv[0]);
- return 1;
- }
-
- assert(hubbub_initialise(argv[1], myrealloc, NULL) == HUBBUB_OK);
-
- codec = hubbub_charsetcodec_create("ISO-8859-1", myrealloc, NULL);
- assert(codec != NULL);
-
- hubbub_charsetcodec_destroy(codec);
-
- assert(hubbub_finalise(myrealloc, NULL) == HUBBUB_OK);
-
- printf("PASS\n");
-
- return 0;
-}
diff --git a/test/regression/filter-segv.c b/test/regression/filter-segv.c
deleted file mode 100644
index 950df61..0000000
--- a/test/regression/filter-segv.c
+++ /dev/null
@@ -1,38 +0,0 @@
-#include <stdio.h>
-#include <stdlib.h>
-
-#include <hubbub/hubbub.h>
-
-#include "input/filter.h"
-
-#include "testutils.h"
-
-static void *myrealloc(void *ptr, size_t len, void *pw)
-{
- UNUSED(pw);
-
- return realloc(ptr, len);
-}
-
-int main(int argc, char **argv)
-{
- hubbub_filter *input;
-
- if (argc != 2) {
- printf("Usage: %s <filename>\n", argv[0]);
- return 1;
- }
-
- assert(hubbub_initialise(argv[1], myrealloc, NULL) == HUBBUB_OK);
-
- input = hubbub_filter_create("UTF-8", myrealloc, NULL);
- assert(input);
-
- hubbub_filter_destroy(input);
-
- assert(hubbub_finalise(myrealloc, NULL) == HUBBUB_OK);
-
- printf("PASS\n");
-
- return 0;
-}
diff --git a/test/regression/stream-nomem.c b/test/regression/stream-nomem.c
deleted file mode 100644
index 7233ac7..0000000
--- a/test/regression/stream-nomem.c
+++ /dev/null
@@ -1,88 +0,0 @@
-#include <stdio.h>
-#include <string.h>
-
-#include <hubbub/hubbub.h>
-
-#include "utils/utils.h"
-
-#include "input/inputstream.h"
-
-#include "testutils.h"
-
-static void *myrealloc(void *ptr, size_t len, void *pw)
-{
- UNUSED(pw);
-
- return realloc(ptr, len);
-}
-
-int main(int argc, char **argv)
-{
- hubbub_inputstream *stream;
-
- /* This is specially calculated so that the inputstream is forced to
- * reallocate (it assumes that the inputstream's buffer chunk size
- * is 4k) */
-#define BUFFER_SIZE (4096 + 4)
- uint8_t input_buffer[BUFFER_SIZE];
- uint8_t *buffer;
- size_t buflen;
- uint32_t c;
-
- if (argc != 2) {
- printf("Usage: %s <aliases_file>\n", argv[0]);
- return 1;
- }
-
- /* Populate the buffer with something sane */
- memset(input_buffer, 'a', BUFFER_SIZE);
- /* Now, set up our test data */
- input_buffer[BUFFER_SIZE - 1] = '5';
- input_buffer[BUFFER_SIZE - 2] = '4';
- input_buffer[BUFFER_SIZE - 3] = '\xbd';
- input_buffer[BUFFER_SIZE - 4] = '\xbf';
- /* This byte will occupy the 4095th byte in the buffer and
- * thus cause the entirety of U+FFFD to be buffered until after
- * the buffer has been enlarged */
- input_buffer[BUFFER_SIZE - 5] = '\xef';
- input_buffer[BUFFER_SIZE - 6] = '3';
- input_buffer[BUFFER_SIZE - 7] = '2';
- input_buffer[BUFFER_SIZE - 8] = '1';
-
- assert(hubbub_initialise(argv[1], myrealloc, NULL) == HUBBUB_OK);
-
- stream = hubbub_inputstream_create("UTF-8", "UTF-8", myrealloc, NULL);
- assert(stream != NULL);
-
- assert(hubbub_inputstream_append(stream, input_buffer, BUFFER_SIZE) ==
- HUBBUB_OK);
-
- assert(hubbub_inputstream_append(stream, NULL, 0) == HUBBUB_OK);
-
- while ((c = hubbub_inputstream_peek(stream)) != HUBBUB_INPUTSTREAM_EOF)
- hubbub_inputstream_advance(stream);
-
- assert(hubbub_inputstream_claim_buffer(stream, &buffer, &buflen) ==
- HUBBUB_OK);
-
- assert(buflen == BUFFER_SIZE);
-
- printf("Buffer: '%.*s'\n", 8, buffer + (BUFFER_SIZE - 8));
-
- assert( buffer[BUFFER_SIZE - 6] == '3' &&
- buffer[BUFFER_SIZE - 5] == (uint8_t) '\xef' &&
- buffer[BUFFER_SIZE - 4] == (uint8_t) '\xbf' &&
- buffer[BUFFER_SIZE - 3] == (uint8_t) '\xbd' &&
- buffer[BUFFER_SIZE - 2] == '4');
-
- free(buffer);
-
- hubbub_inputstream_destroy(stream);
-
- assert(hubbub_finalise(myrealloc, NULL) == HUBBUB_OK);
-
- printf("PASS\n");
-
- return 0;
-}
-
diff --git a/test/tokeniser.c b/test/tokeniser.c
index 2d9577b..0ccf264 100644
--- a/test/tokeniser.c
+++ b/test/tokeniser.c
@@ -1,18 +1,16 @@
#include <inttypes.h>
#include <stdio.h>
+#include <parserutils/input/inputstream.h>
+
#include <hubbub/hubbub.h>
#include "utils/utils.h"
-#include "input/inputstream.h"
#include "tokeniser/tokeniser.h"
#include "testutils.h"
-static const uint8_t *pbuffer;
-
-static void buffer_handler(const uint8_t *buffer, size_t len, void *pw);
static void token_handler(const hubbub_token *token, void *pw);
static void *myrealloc(void *ptr, size_t len, void *pw)
@@ -24,7 +22,7 @@ static void *myrealloc(void *ptr, size_t len, void *pw)
int main(int argc, char **argv)
{
- hubbub_inputstream *stream;
+ parserutils_inputstream *stream;
hubbub_tokeniser *tok;
hubbub_tokeniser_optparams params;
FILE *fp;
@@ -40,17 +38,13 @@ int main(int argc, char **argv)
/* Initialise library */
assert(hubbub_initialise(argv[1], myrealloc, NULL) == HUBBUB_OK);
- stream = hubbub_inputstream_create("UTF-8", "UTF-8", myrealloc, NULL);
+ stream = parserutils_inputstream_create("UTF-8", 0, NULL,
+ myrealloc, NULL);
assert(stream != NULL);
tok = hubbub_tokeniser_create(stream, myrealloc, NULL);
assert(tok != NULL);
- params.buffer_handler.handler = buffer_handler;
- params.buffer_handler.pw = NULL;
- assert(hubbub_tokeniser_setopt(tok, HUBBUB_TOKENISER_BUFFER_HANDLER,
- &params) == HUBBUB_OK);
-
params.token_handler.handler = token_handler;
params.token_handler.pw = NULL;
assert(hubbub_tokeniser_setopt(tok, HUBBUB_TOKENISER_TOKEN_HANDLER,
@@ -69,7 +63,7 @@ int main(int argc, char **argv)
while (len >= CHUNK_SIZE) {
fread(buf, 1, CHUNK_SIZE, fp);
- assert(hubbub_inputstream_append(stream,
+ assert(parserutils_inputstream_append(stream,
buf, CHUNK_SIZE) == HUBBUB_OK);
len -= CHUNK_SIZE;
@@ -80,12 +74,12 @@ int main(int argc, char **argv)
if (len > 0) {
fread(buf, 1, len, fp);
- assert(hubbub_inputstream_append(stream,
+ assert(parserutils_inputstream_append(stream,
buf, len) == HUBBUB_OK);
len = 0;
- assert(hubbub_inputstream_append(stream, NULL, 0) ==
+ assert(parserutils_inputstream_append(stream, NULL, 0) ==
HUBBUB_OK);
assert(hubbub_tokeniser_run(tok) == HUBBUB_OK);
@@ -95,7 +89,7 @@ int main(int argc, char **argv)
hubbub_tokeniser_destroy(tok);
- hubbub_inputstream_destroy(stream);
+ parserutils_inputstream_destroy(stream);
assert(hubbub_finalise(myrealloc, NULL) == HUBBUB_OK);
@@ -104,14 +98,6 @@ int main(int argc, char **argv)
return 0;
}
-void buffer_handler(const uint8_t *buffer, size_t len, void *pw)
-{
- UNUSED(len);
- UNUSED(pw);
-
- pbuffer = buffer;
-}
-
void token_handler(const hubbub_token *token, void *pw)
{
static const char *token_names[] = {
@@ -128,7 +114,7 @@ void token_handler(const hubbub_token *token, void *pw)
case HUBBUB_TOKEN_DOCTYPE:
printf("'%.*s' %sids:\n",
(int) token->data.doctype.name.len,
- pbuffer + token->data.doctype.name.data.off,
+ token->data.doctype.name.ptr,
token->data.doctype.force_quirks ?
"(force-quirks) " : "");
@@ -137,20 +123,20 @@ void token_handler(const hubbub_token *token, void *pw)
else
printf("\tpublic: '%.*s'\n",
(int) token->data.doctype.public_id.len,
- pbuffer + token->data.doctype.public_id.data.off);
+ token->data.doctype.public_id.ptr);
if (token->data.doctype.system_missing)
printf("\tsystem: missing\n");
else
printf("\tsystem: '%.*s'\n",
(int) token->data.doctype.system_id.len,
- pbuffer + token->data.doctype.system_id.data.off);
+ token->data.doctype.system_id.ptr);
break;
case HUBBUB_TOKEN_START_TAG:
printf("'%.*s' %s%s\n",
(int) token->data.tag.name.len,
- pbuffer + token->data.tag.name.data.off,
+ token->data.tag.name.ptr,
(token->data.tag.self_closing) ?
"(self-closing) " : "",
(token->data.tag.n_attributes > 0) ?
@@ -158,15 +144,15 @@ void token_handler(const hubbub_token *token, void *pw)
for (i = 0; i < token->data.tag.n_attributes; i++) {
printf("\t'%.*s' = '%.*s'\n",
(int) token->data.tag.attributes[i].name.len,
- pbuffer + token->data.tag.attributes[i].name.data.off,
+ token->data.tag.attributes[i].name.ptr,
(int) token->data.tag.attributes[i].value.len,
- pbuffer + token->data.tag.attributes[i].value.data.off);
+ token->data.tag.attributes[i].value.ptr);
}
break;
case HUBBUB_TOKEN_END_TAG:
printf("'%.*s' %s%s\n",
(int) token->data.tag.name.len,
- pbuffer + token->data.tag.name.data.off,
+ token->data.tag.name.ptr,
(token->data.tag.self_closing) ?
"(self-closing) " : "",
(token->data.tag.n_attributes > 0) ?
@@ -174,18 +160,18 @@ void token_handler(const hubbub_token *token, void *pw)
for (i = 0; i < token->data.tag.n_attributes; i++) {
printf("\t'%.*s' = '%.*s'\n",
(int) token->data.tag.attributes[i].name.len,
- pbuffer + token->data.tag.attributes[i].name.data.off,
+ token->data.tag.attributes[i].name.ptr,
(int) token->data.tag.attributes[i].value.len,
- pbuffer + token->data.tag.attributes[i].value.data.off);
+ token->data.tag.attributes[i].value.ptr);
}
break;
case HUBBUB_TOKEN_COMMENT:
printf("'%.*s'\n", (int) token->data.comment.len,
- pbuffer + token->data.comment.data.off);
+ token->data.comment.ptr);
break;
case HUBBUB_TOKEN_CHARACTER:
printf("'%.*s'\n", (int) token->data.character.len,
- pbuffer + token->data.character.data.off);
+ token->data.character.ptr);
break;
case HUBBUB_TOKEN_EOF:
printf("\n");
diff --git a/test/tokeniser2.c b/test/tokeniser2.c
index 14ac71d..eb369b4 100644
--- a/test/tokeniser2.c
+++ b/test/tokeniser2.c
@@ -4,11 +4,12 @@
#include <json.h>
+#include <parserutils/input/inputstream.h>
+
#include <hubbub/hubbub.h>
#include "utils/utils.h"
-#include "input/inputstream.h"
#include "tokeniser/tokeniser.h"
#include "testutils.h"
@@ -29,7 +30,6 @@ typedef struct context {
} context;
static void run_test(context *ctx);
-static void buffer_handler(const uint8_t *buffer, size_t len, void *pw);
static void token_handler(const hubbub_token *token, void *pw);
static void *myrealloc(void *ptr, size_t len, void *pw)
@@ -74,6 +74,7 @@ int main(int argc, char **argv)
ctx.last_start_tag = NULL;
ctx.content_model = NULL;
+ ctx.process_cdata = false;
/* Extract settings */
for (entry = json_object_get_object(test)->head; entry;
@@ -119,7 +120,7 @@ int main(int argc, char **argv)
void run_test(context *ctx)
{
- hubbub_inputstream *stream;
+ parserutils_inputstream *stream;
hubbub_tokeniser *tok;
hubbub_tokeniser_optparams params;
int i, max_i;
@@ -138,7 +139,7 @@ void run_test(context *ctx)
ctx->output_index = 0;
ctx->char_off = 0;
- stream = hubbub_inputstream_create("UTF-8", "UTF-8",
+ stream = parserutils_inputstream_create("UTF-8", 0, NULL,
myrealloc, NULL);
assert(stream != NULL);
@@ -152,7 +153,7 @@ void run_test(context *ctx)
snprintf((char *) buf, sizeof buf, "<%s>",
ctx->last_start_tag);
- assert(hubbub_inputstream_append(stream,
+ assert(parserutils_inputstream_append(stream,
buf, strlen(ctx->last_start_tag) + 2) ==
HUBBUB_OK);
@@ -166,12 +167,6 @@ void run_test(context *ctx)
&params) == HUBBUB_OK);
}
- params.buffer_handler.handler = buffer_handler;
- params.buffer_handler.pw = ctx;
- assert(hubbub_tokeniser_setopt(tok,
- HUBBUB_TOKENISER_BUFFER_HANDLER,
- &params) == HUBBUB_OK);
-
params.token_handler.handler = token_handler;
params.token_handler.pw = ctx;
assert(hubbub_tokeniser_setopt(tok,
@@ -204,10 +199,10 @@ void run_test(context *ctx)
HUBBUB_TOKENISER_CONTENT_MODEL,
&params) == HUBBUB_OK);
- assert(hubbub_inputstream_append(stream,
+ assert(parserutils_inputstream_append(stream,
ctx->input, ctx->input_len) == HUBBUB_OK);
- assert(hubbub_inputstream_append(stream, NULL, 0) ==
+ assert(parserutils_inputstream_append(stream, NULL, 0) ==
HUBBUB_OK);
printf("Input: '%.*s' (%d)\n", (int) ctx->input_len,
@@ -218,19 +213,10 @@ void run_test(context *ctx)
hubbub_tokeniser_destroy(tok);
- hubbub_inputstream_destroy(stream);
+ parserutils_inputstream_destroy(stream);
}
}
-void buffer_handler(const uint8_t *buffer, size_t len, void *pw)
-{
- context *ctx = (context *) pw;
-
- UNUSED(len);
-
- ctx->pbuffer = buffer;
-}
-
void token_handler(const hubbub_token *token, void *pw)
{
static const char *token_names[] = {
@@ -277,7 +263,7 @@ void token_handler(const hubbub_token *token, void *pw)
items = json_object_get_array(obj);
- printf("%s: %s\n", token_names[token->type],
+ printf("got %s: expected %s\n", token_names[token->type],
json_object_get_string((struct json_object *)
array_list_get_idx(items, 0)));
@@ -297,9 +283,8 @@ void token_handler(const hubbub_token *token, void *pw)
array_list_get_idx(items, 3));
bool expquirks = !json_object_get_boolean(
array_list_get_idx(items, 4));
- char *gotname = (char *) (ctx->pbuffer +
- token->data.doctype.name.data.off);
- char *gotpub, *gotsys;
+ const char *gotname = (const char *)token->data.doctype.name.ptr;
+ const char *gotpub, *gotsys;
printf("'%.*s' %sids:\n",
(int) token->data.doctype.name.len,
@@ -311,34 +296,36 @@ void token_handler(const hubbub_token *token, void *pw)
gotpub = NULL;
printf("\tpublic: missing\n");
} else {
- gotpub = (char *) (ctx->pbuffer +
- token->data.doctype.public_id.data.off);
- printf("\tpublic: '%.*s'\n",
+ gotpub = (const char *) token->data.doctype.public_id.ptr;
+ printf("\tpublic: '%.*s' (%d)\n",
(int) token->data.doctype.public_id.len,
- gotpub);
+ gotpub,
+ (int) token->data.doctype.public_id.len);
}
if (token->data.doctype.system_missing) {
gotsys = NULL;
printf("\tsystem: missing\n");
} else {
- gotsys = (char *) (ctx->pbuffer +
- token->data.doctype.system_id.data.off);
- printf("\tsystem: '%.*s'\n",
+ gotsys = (const char *) token->data.doctype.system_id.ptr;
+ printf("\tsystem: '%.*s' (%d)\n",
(int) token->data.doctype.system_id.len,
- gotsys);
+ gotsys,
+ token->data.doctype.system_id.len);
}
assert(token->data.doctype.name.len == strlen(expname));
assert(strncmp(gotname, expname, strlen(expname)) == 0);
- assert((exppub == NULL) == (gotpub == NULL));
+ assert((exppub == NULL) ==
+ (token->data.doctype.public_missing == true));
if (exppub) {
assert(token->data.doctype.public_id.len == strlen(exppub));
assert(strncmp(gotpub, exppub, strlen(exppub)) == 0);
}
- assert((expsys == NULL) == (gotsys == NULL));
+ assert((expsys == NULL) ==
+ (token->data.doctype.system_missing == true));
if (gotsys) {
assert(token->data.doctype.system_id.len == strlen(expsys));
assert(strncmp(gotsys, expsys, strlen(expsys)) == 0);
@@ -356,16 +343,22 @@ void token_handler(const hubbub_token *token, void *pw)
bool self_closing = json_object_get_boolean(
array_list_get_idx(items, 3));
- char *tagname = (char *) (ctx->pbuffer +
- token->data.tag.name.data.off);
+ const char *tagname = (const char *)
+ token->data.tag.name.ptr;
- printf("'%.*s' %s%s\n",
+ printf("expected: '%s' %s\n",
+ expname,
+ (self_closing) ? "(self-closing) " : "");
+
+ printf(" got: '%.*s' %s\n",
(int) token->data.tag.name.len,
tagname,
(token->data.tag.self_closing) ?
- "(self-closing) " : "",
- (token->data.tag.n_attributes > 0) ?
- "attributes:" : "");
+ "(self-closing) " : "");
+
+ if (token->data.tag.n_attributes > 0) {
+ printf("attributes:\n");
+ }
assert(token->data.tag.name.len == strlen(expname));
assert(strncmp(tagname, expname, strlen(expname)) == 0);
@@ -379,12 +372,12 @@ void token_handler(const hubbub_token *token, void *pw)
char *expname = (char *) expattrs->k;
char *expval = json_object_get_string(
(struct json_object *) expattrs->v);
- char *gotname = (char *) (ctx->pbuffer +
- token->data.tag.attributes[i].name.data.off);
+ const char *gotname = (const char *)
+ token->data.tag.attributes[i].name.ptr;
size_t namelen =
token->data.tag.attributes[i].name.len;
- char *gotval = (char *) (ctx->pbuffer +
- token->data.tag.attributes[i].value.data.off);
+ const char *gotval = (const char *)
+ token->data.tag.attributes[i].value.ptr;
size_t vallen =
token->data.tag.attributes[i].value.len;
@@ -408,8 +401,8 @@ void token_handler(const hubbub_token *token, void *pw)
{
char *expname = json_object_get_string(
array_list_get_idx(items, 1));
- char *tagname = (char *) (ctx->pbuffer +
- token->data.tag.name.data.off);
+ const char *tagname = (const char *)
+ token->data.tag.name.ptr;
printf("'%.*s' %s\n",
(int) token->data.tag.name.len,
@@ -425,10 +418,12 @@ void token_handler(const hubbub_token *token, void *pw)
{
char *expstr = json_object_get_string(
array_list_get_idx(items, 1));
- char *gotstr = (char *) (ctx->pbuffer +
- token->data.comment.data.off);
+ const char *gotstr = (const char *)
+ token->data.comment.ptr;
- printf("'%.*s'\n", (int) token->data.comment.len, gotstr);
+ printf("expected: '%s'\n", expstr);
+ printf(" got: '%.*s'\n",
+ (int) token->data.comment.len, gotstr);
assert(token->data.comment.len == strlen(expstr));
assert(strncmp(gotstr, expstr, strlen(expstr)) == 0);
@@ -439,9 +434,9 @@ void token_handler(const hubbub_token *token, void *pw)
int expstrlen;
char *expstr = json_object_get_string_len(
array_list_get_idx(items, 1), &expstrlen);
- char *gotstr = (char *) (ctx->pbuffer +
- token->data.character.data.off);
- size_t len = min(token->data.character.len,
+ const char *gotstr = (const char *)
+ token->data.character.ptr;
+ size_t len = min(token->data.character.len,
expstrlen - ctx->char_off);
printf("expected: '%.*s'\n", (int) len, expstr + ctx->char_off);
@@ -457,7 +452,7 @@ void token_handler(const hubbub_token *token, void *pw)
hubbub_token t;
t.type = HUBBUB_TOKEN_CHARACTER;
- t.data.character.data.off += len;
+ t.data.character.ptr += len;
t.data.character.len -= len;
ctx->char_off = 0;
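(Aside, not part of the patch.) With strings now carried by pointer, tokeniser2 and tokeniser3 check DOCTYPE public and system ids against the public_missing/system_missing flags rather than against a NULL pointer. A condensed sketch of that check follows; the helper name and parameters are hypothetical, while the assertions mirror those in the hunks.

#include <assert.h>
#include <string.h>

#include <hubbub/hubbub.h>

/* Hypothetical helper mirroring the updated DOCTYPE assertions. */
static void check_doctype_ids(const hubbub_token *token,
        const char *exppub, const char *expsys)
{
    assert((exppub == NULL) == token->data.doctype.public_missing);
    if (exppub != NULL) {
        assert(token->data.doctype.public_id.len == strlen(exppub));
        assert(strncmp((const char *) token->data.doctype.public_id.ptr,
                exppub, strlen(exppub)) == 0);
    }

    assert((expsys == NULL) == token->data.doctype.system_missing);
    if (expsys != NULL) {
        assert(token->data.doctype.system_id.len == strlen(expsys));
        assert(strncmp((const char *) token->data.doctype.system_id.ptr,
                expsys, strlen(expsys)) == 0);
    }
}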
diff --git a/test/tokeniser3.c b/test/tokeniser3.c
index 76b1d07..05f57b4 100644
--- a/test/tokeniser3.c
+++ b/test/tokeniser3.c
@@ -4,18 +4,17 @@
#include <json.h>
+#include <parserutils/input/inputstream.h>
+
#include <hubbub/hubbub.h>
#include "utils/utils.h"
-#include "input/inputstream.h"
#include "tokeniser/tokeniser.h"
#include "testutils.h"
typedef struct context {
- const uint8_t *pbuffer;
-
const uint8_t *input;
size_t input_len;
@@ -29,7 +28,6 @@ typedef struct context {
} context;
static void run_test(context *ctx);
-static void buffer_handler(const uint8_t *buffer, size_t len, void *pw);
static void token_handler(const hubbub_token *token, void *pw);
static void *myrealloc(void *ptr, size_t len, void *pw)
@@ -74,6 +72,7 @@ int main(int argc, char **argv)
ctx.last_start_tag = NULL;
ctx.content_model = NULL;
+ ctx.process_cdata = false;
/* Extract settings */
for (entry = json_object_get_object(test)->head; entry;
@@ -117,7 +116,7 @@ int main(int argc, char **argv)
void run_test(context *ctx)
{
- hubbub_inputstream *stream;
+ parserutils_inputstream *stream;
hubbub_tokeniser *tok;
hubbub_tokeniser_optparams params;
int i, max_i;
@@ -137,7 +136,7 @@ void run_test(context *ctx)
ctx->output_index = 0;
ctx->char_off = 0;
- stream = hubbub_inputstream_create("UTF-8", "UTF-8",
+ stream = parserutils_inputstream_create("UTF-8", 0, NULL,
myrealloc, NULL);
assert(stream != NULL);
@@ -151,7 +150,7 @@ void run_test(context *ctx)
snprintf((char *) buf, sizeof buf, "<%s>",
ctx->last_start_tag);
- assert(hubbub_inputstream_append(stream,
+ assert(parserutils_inputstream_append(stream,
buf, strlen(ctx->last_start_tag) + 2) ==
HUBBUB_OK);
@@ -165,12 +164,6 @@ void run_test(context *ctx)
&params) == HUBBUB_OK);
}
- params.buffer_handler.handler = buffer_handler;
- params.buffer_handler.pw = ctx;
- assert(hubbub_tokeniser_setopt(tok,
- HUBBUB_TOKENISER_BUFFER_HANDLER,
- &params) == HUBBUB_OK);
-
params.token_handler.handler = token_handler;
params.token_handler.pw = ctx;
assert(hubbub_tokeniser_setopt(tok,
@@ -208,33 +201,24 @@ void run_test(context *ctx)
(int) ctx->input_len);
for (j = 0; j < ctx->input_len; j++) {
- assert(hubbub_inputstream_append(stream,
+ assert(parserutils_inputstream_append(stream,
ctx->input + j, 1) ==
HUBBUB_OK);
assert(hubbub_tokeniser_run(tok) == HUBBUB_OK);
}
- assert(hubbub_inputstream_append(stream, NULL, 0) ==
+ assert(parserutils_inputstream_append(stream, NULL, 0) ==
HUBBUB_OK);
assert(hubbub_tokeniser_run(tok) == HUBBUB_OK);
hubbub_tokeniser_destroy(tok);
- hubbub_inputstream_destroy(stream);
+ parserutils_inputstream_destroy(stream);
}
}
-void buffer_handler(const uint8_t *buffer, size_t len, void *pw)
-{
- context *ctx = (context *) pw;
-
- UNUSED(len);
-
- ctx->pbuffer = buffer;
-}
-
void token_handler(const hubbub_token *token, void *pw)
{
static const char *token_names[] = {
@@ -301,9 +285,9 @@ void token_handler(const hubbub_token *token, void *pw)
array_list_get_idx(items, 3));
bool expquirks = !json_object_get_boolean(
array_list_get_idx(items, 4));
- char *gotname = (char *) (ctx->pbuffer +
- token->data.doctype.name.data.off);
- char *gotpub, *gotsys;
+ const char *gotname = (const char *)
+ token->data.doctype.name.ptr;
+ const char *gotpub, *gotsys;
printf("'%.*s' %sids:\n",
(int) token->data.doctype.name.len,
@@ -315,34 +299,38 @@ void token_handler(const hubbub_token *token, void *pw)
gotpub = NULL;
printf("\tpublic: missing\n");
} else {
- gotpub = (char *) (ctx->pbuffer +
- token->data.doctype.public_id.data.off);
- printf("\tpublic: '%.*s'\n",
+ gotpub = (const char *)
+ token->data.doctype.public_id.ptr;
+ printf("\tpublic: '%.*s' (%d)\n",
(int) token->data.doctype.public_id.len,
- gotpub);
+ gotpub,
+ (int) token->data.doctype.public_id.len);
}
if (token->data.doctype.system_missing) {
gotsys = NULL;
printf("\tsystem: missing\n");
} else {
- gotsys = (char *) (ctx->pbuffer +
- token->data.doctype.system_id.data.off);
- printf("\tsystem: '%.*s'\n",
+ gotsys = (const char *)
+ token->data.doctype.system_id.ptr;
+ printf("\tsystem: '%.*s' (%d)\n",
(int) token->data.doctype.system_id.len,
- gotsys);
+ gotsys,
+ (int) token->data.doctype.system_id.len);
}
assert(token->data.doctype.name.len == strlen(expname));
assert(strncmp(gotname, expname, strlen(expname)) == 0);
- assert((exppub == NULL) == (gotpub == NULL));
+ assert((exppub == NULL) ==
+ (token->data.doctype.public_missing == true));
if (exppub) {
assert(token->data.doctype.public_id.len == strlen(exppub));
assert(strncmp(gotpub, exppub, strlen(exppub)) == 0);
}
- assert((expsys == NULL) == (gotsys == NULL));
+ assert((expsys == NULL) ==
+ (token->data.doctype.system_missing == true));
if (gotsys) {
assert(token->data.doctype.system_id.len == strlen(expsys));
assert(strncmp(gotsys, expsys, strlen(expsys)) == 0);
@@ -360,16 +348,22 @@ void token_handler(const hubbub_token *token, void *pw)
bool self_closing = json_object_get_boolean(
array_list_get_idx(items, 3));
- char *tagname = (char *) (ctx->pbuffer +
- token->data.tag.name.data.off);
+ const char *tagname = (const char *)
+ token->data.tag.name.ptr;
- printf("'%.*s' %s%s\n",
+ printf("expected: '%s' %s\n",
+ expname,
+ (self_closing) ? "(self-closing) " : "");
+
+ printf(" got: '%.*s' %s\n",
(int) token->data.tag.name.len,
tagname,
(token->data.tag.self_closing) ?
- "(self-closing) " : "",
- (token->data.tag.n_attributes > 0) ?
- "attributes:" : "");
+ "(self-closing) " : "");
+
+ if (token->data.tag.n_attributes > 0) {
+ printf("attributes:\n");
+ }
assert(token->data.tag.name.len == strlen(expname));
assert(strncmp(tagname, expname, strlen(expname)) == 0);
@@ -383,12 +377,12 @@ void token_handler(const hubbub_token *token, void *pw)
char *expname = (char *) expattrs->k;
char *expval = json_object_get_string(
(struct json_object *) expattrs->v);
- char *gotname = (char *) (ctx->pbuffer +
- token->data.tag.attributes[i].name.data.off);
+ const char *gotname = (const char *)
+ token->data.tag.attributes[i].name.ptr;
size_t namelen =
token->data.tag.attributes[i].name.len;
- char *gotval = (char *) (ctx->pbuffer +
- token->data.tag.attributes[i].value.data.off);
+ const char *gotval = (const char *)
+ token->data.tag.attributes[i].value.ptr;
size_t vallen =
token->data.tag.attributes[i].value.len;
@@ -412,8 +406,8 @@ void token_handler(const hubbub_token *token, void *pw)
{
char *expname = json_object_get_string(
array_list_get_idx(items, 1));
- char *tagname = (char *) (ctx->pbuffer +
- token->data.tag.name.data.off);
+ const char *tagname = (const char *)
+ token->data.tag.name.ptr;
printf("'%.*s' %s\n",
(int) token->data.tag.name.len,
@@ -429,10 +423,12 @@ void token_handler(const hubbub_token *token, void *pw)
{
char *expstr = json_object_get_string(
array_list_get_idx(items, 1));
- char *gotstr = (char *) (ctx->pbuffer +
- token->data.comment.data.off);
+ const char *gotstr = (const char *)
+ token->data.comment.ptr;
- printf("'%.*s'\n", (int) token->data.comment.len, gotstr);
+ printf("expected: '%s'\n", expstr);
+ printf(" got: '%.*s'\n",
+ (int) token->data.comment.len, gotstr);
assert(token->data.comment.len == strlen(expstr));
assert(strncmp(gotstr, expstr, strlen(expstr)) == 0);
@@ -443,8 +439,8 @@ void token_handler(const hubbub_token *token, void *pw)
int expstrlen;
char *expstr = json_object_get_string_len(
array_list_get_idx(items, 1), &expstrlen);
- char *gotstr = (char *) (ctx->pbuffer +
- token->data.character.data.off);
+ const char *gotstr = (const char *)
+ token->data.character.ptr;
size_t len = min(token->data.character.len,
expstrlen - ctx->char_off);
@@ -462,7 +458,7 @@ void token_handler(const hubbub_token *token, void *pw)
hubbub_token t;
t.type = HUBBUB_TOKEN_CHARACTER;
- t.data.character.data.off += len;
+ t.data.character.ptr += len;
t.data.character.len -= len;
ctx->char_off = 0;
diff --git a/test/tree.c b/test/tree.c
index a854491..ae947a0 100644
--- a/test/tree.c
+++ b/test/tree.c
@@ -29,9 +29,6 @@ static uintptr_t node_counter;
node_ref_alloc += NODE_REF_CHUNK; \
}
-static const uint8_t *pbuffer;
-
-static void buffer_handler(const uint8_t *buffer, size_t len, void *pw);
static int create_comment(void *ctx, const hubbub_string *data, void **result);
static int create_doctype(void *ctx, const hubbub_doctype *doctype,
void **result);
@@ -79,22 +76,6 @@ static void *myrealloc(void *ptr, size_t len, void *pw)
return realloc(ptr, len);
}
-static const uint8_t *ptr_from_hubbub_string(const hubbub_string *string)
-{
- const uint8_t *data;
-
- switch (string->type) {
- case HUBBUB_STRING_OFF:
- data = pbuffer + string->data.off;
- break;
- case HUBBUB_STRING_PTR:
- data = string->data.ptr;
- break;
- }
-
- return data;
-}
-
int main(int argc, char **argv)
{
hubbub_parser *parser;
@@ -105,7 +86,6 @@ int main(int argc, char **argv)
uint8_t buf[CHUNK_SIZE];
const char *charset;
hubbub_charset_source cssource;
- uint8_t *buffer;
bool passed = true;
if (argc != 3) {
@@ -126,11 +106,6 @@ int main(int argc, char **argv)
parser = hubbub_parser_create("UTF-8", "UTF-8", myrealloc, NULL);
assert(parser != NULL);
- params.buffer_handler.handler = buffer_handler;
- params.buffer_handler.pw = NULL;
- assert(hubbub_parser_setopt(parser, HUBBUB_PARSER_BUFFER_HANDLER,
- &params) == HUBBUB_OK);
-
params.tree_handler = &tree_handler;
assert(hubbub_parser_setopt(parser, HUBBUB_PARSER_TREE_HANDLER,
&params) == HUBBUB_OK);
@@ -176,11 +151,6 @@ int main(int argc, char **argv)
printf("Charset: %s (from %d)\n", charset, cssource);
- assert(hubbub_parser_claim_buffer(parser, &buffer, &len) ==
- HUBBUB_OK);
-
- free(buffer);
-
hubbub_parser_destroy(parser);
assert(hubbub_finalise(myrealloc, NULL) == HUBBUB_OK);
@@ -200,18 +170,10 @@ int main(int argc, char **argv)
return 0;
}
-void buffer_handler(const uint8_t *buffer, size_t len, void *pw)
-{
- UNUSED(len);
- UNUSED(pw);
-
- pbuffer = buffer;
-}
-
int create_comment(void *ctx, const hubbub_string *data, void **result)
{
printf("Creating (%" PRIuPTR ") [comment '%.*s']\n", ++node_counter,
- (int) data->len, ptr_from_hubbub_string(data));
+ (int) data->len, data->ptr);
GROW_REF
node_ref[node_counter] = 0;
@@ -226,8 +188,7 @@ int create_comment(void *ctx, const hubbub_string *data, void **result)
int create_doctype(void *ctx, const hubbub_doctype *doctype, void **result)
{
printf("Creating (%" PRIuPTR ") [doctype '%.*s']\n", ++node_counter,
- (int) doctype->name.len,
- ptr_from_hubbub_string(&doctype->name));
+ (int) doctype->name.len, doctype->name.ptr);
GROW_REF
node_ref[node_counter] = 0;
@@ -242,7 +203,7 @@ int create_doctype(void *ctx, const hubbub_doctype *doctype, void **result)
int create_element(void *ctx, const hubbub_tag *tag, void **result)
{
printf("Creating (%" PRIuPTR ") [element '%.*s']\n", ++node_counter,
- (int) tag->name.len, ptr_from_hubbub_string(&tag->name));
+ (int) tag->name.len, tag->name.ptr);
GROW_REF
node_ref[node_counter] = 0;
@@ -257,11 +218,11 @@ int create_element(void *ctx, const hubbub_tag *tag, void **result)
int create_text(void *ctx, const hubbub_string *data, void **result)
{
printf("Creating (%" PRIuPTR ") [text '%.*s']\n", ++node_counter,
- (int) data->len, ptr_from_hubbub_string(data));
+ (int) data->len, data->ptr);
GROW_REF
node_ref[node_counter] = 0;
-
+
ref_node(ctx, (void *) node_counter);
*result = (void *) node_counter;
diff --git a/test/tree2.c b/test/tree2.c
index 53876fb..0f181ef 100644
--- a/test/tree2.c
+++ b/test/tree2.c
@@ -74,10 +74,6 @@ node_t *Document;
static void node_print(buf_t *buf, node_t *node, unsigned depth);
-
-static const uint8_t *pbuffer;
-
-static void buffer_handler(const uint8_t *buffer, size_t len, void *pw);
static int create_comment(void *ctx, const hubbub_string *data, void **result);
static int create_doctype(void *ctx, const hubbub_doctype *doctype,
void **result);
@@ -123,29 +119,23 @@ static hubbub_tree_handler tree_handler = {
static void *myrealloc(void *ptr, size_t len, void *pw)
{
- UNUSED(pw);
-
- return realloc(ptr, len);
-}
+ void *ret;
-static const uint8_t *ptr_from_hubbub_string(const hubbub_string *string)
-{
- const uint8_t *data;
+ UNUSED(pw);
- switch (string->type) {
- case HUBBUB_STRING_OFF:
- data = pbuffer + string->data.off;
- break;
- case HUBBUB_STRING_PTR:
- data = string->data.ptr;
- break;
+ /* A half-arsed attempt at filling freshly allocated space with junk. */
+ if (ptr == NULL) {
+ ret = malloc(len);
+ if (ret != NULL)
+ memset(ret, 0xdf, len);
+ } else {
+ ret = realloc(ptr, len);
}
- return data;
+ return ret;
}
-
/*
* Create, initialise, and return, a parser instance.
*/
@@ -157,11 +147,6 @@ static hubbub_parser *setup_parser(void)
parser = hubbub_parser_create("UTF-8", "UTF-8", myrealloc, NULL);
assert(parser != NULL);
- params.buffer_handler.handler = buffer_handler;
- params.buffer_handler.pw = NULL;
- assert(hubbub_parser_setopt(parser, HUBBUB_PARSER_BUFFER_HANDLER,
- &params) == HUBBUB_OK);
-
params.tree_handler = &tree_handler;
assert(hubbub_parser_setopt(parser, HUBBUB_PARSER_TREE_HANDLER,
&params) == HUBBUB_OK);
@@ -174,14 +159,6 @@ static hubbub_parser *setup_parser(void)
}
-void buffer_handler(const uint8_t *buffer, size_t len, void *pw)
-{
- UNUSED(len);
- UNUSED(pw);
-
- pbuffer = buffer;
-}
-
/*** Buffer handling bits ***/
static void buf_clear(buf_t *buf)
@@ -371,8 +348,8 @@ int create_comment(void *ctx, const hubbub_string *data, void **result)
node_t *node = calloc(1, sizeof *node);
node->type = COMMENT;
- node->data.content = strndup((char *)ptr_from_hubbub_string(data),
- data->len);
+ node->data.content = strndup((const char *) data->ptr, data->len);
+ node->refcnt = 1;
node->refcnt = 1;
*result = node;
@@ -386,20 +363,18 @@ int create_doctype(void *ctx, const hubbub_doctype *doctype, void **result)
node->type = DOCTYPE;
node->data.doctype.name = strndup(
- (char *)ptr_from_hubbub_string(&doctype->name),
+ (const char *) doctype->name.ptr,
doctype->name.len);
if (!doctype->public_missing) {
node->data.doctype.public_id = strndup(
- (char *)ptr_from_hubbub_string(
- &doctype->public_id),
+ (const char *) doctype->public_id.ptr,
doctype->public_id.len);
}
if (!doctype->system_missing) {
node->data.doctype.system_id = strndup(
- (char *)ptr_from_hubbub_string(
- &doctype->system_id),
+ (const char *) doctype->system_id.ptr,
doctype->system_id.len);
}
node->refcnt = 1;
@@ -418,7 +393,7 @@ int create_element(void *ctx, const hubbub_tag *tag, void **result)
node->type = ELEMENT;
node->data.element.ns = tag->ns;
node->data.element.name = strndup(
- (char *)ptr_from_hubbub_string(&tag->name),
+ (const char *) tag->name.ptr,
tag->name.len);
node->data.element.n_attrs = tag->n_attributes;
@@ -432,12 +407,12 @@ int create_element(void *ctx, const hubbub_tag *tag, void **result)
attr->ns = tag->attributes[i].ns;
- attr->name = strndup((char *)ptr_from_hubbub_string(
- &tag->attributes[i].name),
+ attr->name = strndup(
+ (const char *) tag->attributes[i].name.ptr,
tag->attributes[i].name.len);
- attr->value = strndup((char *)ptr_from_hubbub_string(
- &tag->attributes[i].value),
+ attr->value = strndup(
+ (const char *) tag->attributes[i].value.ptr,
tag->attributes[i].value.len);
}
node->refcnt = 1;
@@ -452,8 +427,8 @@ int create_text(void *ctx, const hubbub_string *data, void **result)
node_t *node = calloc(1, sizeof *node);
node->type = CHARACTER;
- node->data.content = strndup((char *)ptr_from_hubbub_string(data),
- data->len);
+ node->data.content = strndup((const char *) data->ptr, data->len);
+ node->refcnt = 1;
node->refcnt = 1;
*result = node;
@@ -778,12 +753,12 @@ int add_attributes(void *ctx, void *vnode,
attr->ns = attributes[i].ns;
- attr->name = strndup((char *)ptr_from_hubbub_string(
- &attributes[i].name),
+ attr->name = strndup(
+ (const char *) attributes[i].name.ptr,
attributes[i].name.len);
- attr->value = strndup((char *)ptr_from_hubbub_string(
- &attributes[i].value),
+ attr->value = strndup(
+ (const char *) attributes[i].value.ptr,
attributes[i].value.len);
}
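(Aside, not part of the patch.) The tree tests never retain a hubbub_string's ptr after the callback returns: they copy the bytes out with strndup while the callback runs. A one-line helper capturing that idiom is sketched below; the name is hypothetical, and it relies on strndup being available, exactly as tree2.c already does.

#include <string.h>

#include <hubbub/hubbub.h>

/* Hypothetical helper: take a private, NUL-terminated copy of a string. */
static char *hubbub_string_dup(const hubbub_string *str)
{
    return strndup((const char *) str->ptr, str->len);
}

Usage would follow the pattern in create_text above, e.g. node->data.content = hubbub_string_dup(data);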