From 7b30a5520cfb56e651f0eb4da85a3e07747da7dc Mon Sep 17 00:00:00 2001
From: John Mark Bell
Date: Sat, 23 Jun 2007 22:40:25 +0000
Subject: Import hubbub -- an HTML parsing library. Plenty of work still to do (like tree generation ;)

svn path=/trunk/hubbub/; revision=3359
---
NOTE(review): this patch was recovered from a whitespace-collapsed web
extraction. Line breaks and indentation are reconstructed. The operands of
the #include directives had been stripped as markup and are restored here
from what each header uses (size_t, bool, uint8_t/uint32_t, and the sibling
hubbub headers) -- confirm them against the upstream repository. E-mail
addresses stripped from the From: and Copyright lines are NOT restored.

 include/hubbub/errors.h    | 29 ++++++++++++++
 include/hubbub/functypes.h | 37 ++++++++++++++++++
 include/hubbub/hubbub.h    | 23 +++++++++++
 include/hubbub/parser.h    | 84 +++++++++++++++++++++++++++++++++++++++
 include/hubbub/types.h     | 97 ++++++++++++++++++++++++++++++++++++++++++++++
 5 files changed, 270 insertions(+)
 create mode 100644 include/hubbub/errors.h
 create mode 100644 include/hubbub/functypes.h
 create mode 100644 include/hubbub/hubbub.h
 create mode 100644 include/hubbub/parser.h
 create mode 100644 include/hubbub/types.h

(limited to 'include')

diff --git a/include/hubbub/errors.h b/include/hubbub/errors.h
new file mode 100644
index 0000000..c3b1f5d
--- /dev/null
+++ b/include/hubbub/errors.h
@@ -0,0 +1,29 @@
+/*
+ * This file is part of Hubbub.
+ * Licensed under the MIT License,
+ *                http://www.opensource.org/licenses/mit-license.php
+ * Copyright 2007 John-Mark Bell
+ */
+
+#ifndef hubbub_errors_h_
+#define hubbub_errors_h_
+
+#include <stddef.h>
+
+typedef enum hubbub_error {
+	HUBBUB_OK = 0,
+
+	HUBBUB_NOMEM = 1,
+	HUBBUB_BADPARM = 2,
+	HUBBUB_INVALID = 3,
+	HUBBUB_FILENOTFOUND = 4,
+	HUBBUB_NEEDDATA = 5,
+} hubbub_error;
+
+/* Convert a hubbub error value to a string */
+const char *hubbub_error_to_string(hubbub_error error);
+/* Convert a string to a hubbub error value */
+hubbub_error hubbub_error_from_string(const char *str, size_t len);
+
+#endif
+
diff --git a/include/hubbub/functypes.h b/include/hubbub/functypes.h
new file mode 100644
index 0000000..aa3e649
--- /dev/null
+++ b/include/hubbub/functypes.h
@@ -0,0 +1,37 @@
+/*
+ * This file is part of Hubbub.
+ * Licensed under the MIT License,
+ *                http://www.opensource.org/licenses/mit-license.php
+ * Copyright 2007 John-Mark Bell
+ */
+
+#ifndef hubbub_functypes_h_
+#define hubbub_functypes_h_
+
+#include <stdlib.h>
+
+#include <hubbub/types.h>
+
+/* Type of allocation function for hubbub */
+typedef void *(*hubbub_alloc)(void *ptr, size_t size, void *pw);
+
+/**
+ * Type of token handling function
+ */
+typedef void (*hubbub_token_handler)(const hubbub_token *token, void *pw);
+
+/**
+ * Type of document buffer handling function
+ */
+typedef void (*hubbub_buffer_handler)(const uint8_t *data,
+		size_t len, void *pw);
+
+/**
+ * Type of parse error handling function
+ */
+typedef void (*hubbub_error_handler)(uint32_t line, uint32_t col,
+		const char *message, void *pw);
+
+
+#endif
+
diff --git a/include/hubbub/hubbub.h b/include/hubbub/hubbub.h
new file mode 100644
index 0000000..8a15eca
--- /dev/null
+++ b/include/hubbub/hubbub.h
@@ -0,0 +1,23 @@
+/*
+ * This file is part of Hubbub.
+ * Licensed under the MIT License,
+ *                http://www.opensource.org/licenses/mit-license.php
+ * Copyright 2007 John-Mark Bell
+ */
+
+#ifndef hubbub_h_
+#define hubbub_h_
+
+#include <hubbub/errors.h>
+#include <hubbub/functypes.h>
+#include <hubbub/types.h>
+
+/* Initialise the Hubbub library for use */
+hubbub_error hubbub_initialise(const char *aliases_file,
+		hubbub_alloc alloc, void *pw);
+
+/* Clean up after Hubbub */
+hubbub_error hubbub_finalise(hubbub_alloc alloc, void *pw);
+
+#endif
+
diff --git a/include/hubbub/parser.h b/include/hubbub/parser.h
new file mode 100644
index 0000000..cdf8664
--- /dev/null
+++ b/include/hubbub/parser.h
@@ -0,0 +1,84 @@
+/*
+ * This file is part of Hubbub.
+ * Licensed under the MIT License,
+ *                http://www.opensource.org/licenses/mit-license.php
+ * Copyright 2007 John-Mark Bell
+ */
+
+#ifndef hubbub_parser_h_
+#define hubbub_parser_h_
+
+#include <inttypes.h>
+
+#include <hubbub/errors.h>
+#include <hubbub/functypes.h>
+#include <hubbub/types.h>
+
+typedef struct hubbub_parser hubbub_parser;
+
+/**
+ * Hubbub parser option types
+ */
+typedef enum hubbub_parser_opttype {
+	HUBBUB_PARSER_TOKEN_HANDLER,
+	HUBBUB_PARSER_BUFFER_HANDLER,
+	HUBBUB_PARSER_ERROR_HANDLER,
+	HUBBUB_PARSER_CONTENT_MODEL,
+} hubbub_parser_opttype;
+
+/**
+ * Hubbub parser option parameters
+ */
+typedef union hubbub_parser_optparams {
+	struct {
+		hubbub_token_handler handler;
+		void *pw;
+	} token_handler;
+
+	struct {
+		hubbub_buffer_handler handler;
+		void *pw;
+	} buffer_handler;
+
+	struct {
+		hubbub_error_handler handler;
+		void *pw;
+	} error_handler;
+
+	struct {
+		hubbub_content_model model;
+	} content_model;
+} hubbub_parser_optparams;
+
+/* Create a hubbub parser */
+hubbub_parser *hubbub_parser_create(const char *enc, const char *int_enc,
+		hubbub_alloc alloc, void *pw);
+/* Destroy a hubbub parser */
+void hubbub_parser_destroy(hubbub_parser *parser);
+
+/* Configure a hubbub parser */
+hubbub_error hubbub_parser_setopt(hubbub_parser *parser,
+		hubbub_parser_opttype type,
+		hubbub_parser_optparams *params);
+
+/* Pass a chunk of data to a hubbub parser for parsing */
+/* This data is encoded in the input charset */
+hubbub_error hubbub_parser_parse_chunk(hubbub_parser *parser,
+		uint8_t *data, size_t len);
+/* Pass a chunk of extraneous data to a hubbub parser for parsing */
+/* This data is UTF-8 encoded */
+hubbub_error hubbub_parser_parse_extraneous_chunk(hubbub_parser *parser,
+		uint8_t *data, size_t len);
+/* Inform the parser that the last chunk of data has been parsed */
+hubbub_error hubbub_parser_completed(hubbub_parser *parser);
+
+/* Read the document charset */
+const char *hubbub_parser_read_charset(hubbub_parser *parser,
+		hubbub_charset_source *source);
+
+/* Claim ownership of the document buffer */
+hubbub_error hubbub_parser_claim_buffer(hubbub_parser *parser,
+		uint8_t **buffer, size_t *len);
+
+#endif
+
diff --git a/include/hubbub/types.h b/include/hubbub/types.h
new file mode 100644
index 0000000..57518ae
--- /dev/null
+++ b/include/hubbub/types.h
@@ -0,0 +1,97 @@
+/*
+ * This file is part of Hubbub.
+ * Licensed under the MIT License,
+ *                http://www.opensource.org/licenses/mit-license.php
+ * Copyright 2007 John-Mark Bell
+ */
+
+#ifndef hubbub_types_h_
+#define hubbub_types_h_
+
+#include <stdbool.h>
+#include <inttypes.h>
+
+/** Source of charset information, in order of importance
+ * A client-dictated charset will override all others.
+ * A document-specified charset will override autodetection or the default */
+typedef enum hubbub_charset_source {
+	HUBBUB_CHARSET_UNKNOWN = 0,	/**< Unknown */
+	HUBBUB_CHARSET_DEFAULT = 1,	/**< Default setting */
+	HUBBUB_CHARSET_DETECTED = 2,	/**< Autodetected */
+	HUBBUB_CHARSET_DOCUMENT = 3,	/**< Defined in document */
+	HUBBUB_CHARSET_DICTATED = 4,	/**< Dictated by client */
+} hubbub_charset_source;
+
+/**
+ * Content model flag
+ */
+typedef enum hubbub_content_model {
+	HUBBUB_CONTENT_MODEL_PCDATA,
+	HUBBUB_CONTENT_MODEL_RCDATA,
+	HUBBUB_CONTENT_MODEL_CDATA,
+	HUBBUB_CONTENT_MODEL_PLAINTEXT
+} hubbub_content_model;
+
+/**
+ * Type of an emitted token
+ */
+typedef enum hubbub_token_type {
+	HUBBUB_TOKEN_DOCTYPE,
+	HUBBUB_TOKEN_START_TAG,
+	HUBBUB_TOKEN_END_TAG,
+	HUBBUB_TOKEN_COMMENT,
+	HUBBUB_TOKEN_CHARACTER,
+	HUBBUB_TOKEN_EOF
+} hubbub_token_type;
+
+/**
+ * Tokeniser string type
+ */
+typedef struct hubbub_string {
+	uint32_t data_off;		/**< Byte offset of string start */
+	size_t len;			/**< Byte length of string */
+} hubbub_string;
+
+/**
+ * Tag attribute data
+ */
+typedef struct hubbub_attribute {
+	hubbub_string name;		/**< Attribute name */
+	hubbub_string value;		/**< Attribute value */
+} hubbub_attribute;
+
+/**
+ * Data for doctype token
+ */
+typedef struct hubbub_doctype {
+	hubbub_string name;		/**< Doctype name */
+	bool correct;			/**< Doctype validity flag */
+} hubbub_doctype;
+
+/**
+ * Data for a tag
+ */
+typedef struct hubbub_tag {
+	hubbub_string name;		/**< Tag name */
+	uint32_t n_attributes;		/**< Count of attributes */
+	hubbub_attribute *attributes;	/**< Array of attribute data */
+} hubbub_tag;
+
+/**
+ * Token data
+ */
+typedef struct hubbub_token {
+	hubbub_token_type type;
+
+	union {
+		hubbub_doctype doctype;
+
+		hubbub_tag tag;
+
+		hubbub_string comment;
+
+		hubbub_string character;
+	} data;
+} hubbub_token;
+
+#endif
-- 
cgit v1.2.3