From e0e38d906c8974bb22a0368a9709af9590362927 Mon Sep 17 00:00:00 2001
From: John Mark Bell
Date: Sun, 30 Sep 2007 21:10:50 +0000
Subject: DOM Strings are now capable of containing either UTF-8 or UTF-16 encoded data.

The charset used for strings within a document is specified at document creation time. Whilst it is possible to mix charsets within a document, it's not recommended.

Things that need fixing:
 + dom_string_get_data() doesn't return the charset. It would be better to permit the client to request a charset for the data to be returned in.
 + Interned node name strings will break if the document is UTF-16 (dom_document_create()). In fact, these could quite happily be globals, rather than allocating a set for each document.
 + Other uses of the DOM string constructors need checking for sanity.
 + DOM Strings need to gain more utility APIs (such as getting the character length of a string, string concatenation, etc.).

svn path=/trunk/dom/; revision=3614
---
 src/utils/utf8.h | 38 ++++++++++++++++++++++++++++++++++++++
 1 file changed, 38 insertions(+)
 create mode 100644 src/utils/utf8.h

(limited to 'src/utils/utf8.h')

diff --git a/src/utils/utf8.h b/src/utils/utf8.h
new file mode 100644
index 0000000..154dbb8
--- /dev/null
+++ b/src/utils/utf8.h
@@ -0,0 +1,38 @@
+/*
+ * This file is part of libdom.
+ * Licensed under the MIT License,
+ * http://www.opensource.org/licenses/mit-license.php
+ * Copyright 2007 John-Mark Bell
+ */
+
+/** \file
+ * UTF-8 manipulation functions (interface).
+ */
+
+#ifndef dom_utils_utf8_h_
+#define dom_utils_utf8_h_
+
+#include <stddef.h>
+#include <stdint.h>
+#include "utils/charset_errors.h"
+/* UTF-8 <-> UCS-4 conversion, judging by the names (TODO confirm); no 'inline' on bare prototypes (C99 6.7.4). */
+charset_error _dom_utf8_to_ucs4(const uint8_t *s, size_t len,
+		uint32_t *ucs4, size_t *clen);
+charset_error _dom_utf8_from_ucs4(uint32_t ucs4, uint8_t *s,
+		size_t *len);
+/* Length queries, judging by the names; the units of *len are not visible here (TODO confirm). */
+charset_error _dom_utf8_length(const uint8_t *s, size_t max,
+		size_t *len);
+charset_error _dom_utf8_char_byte_length(const uint8_t *s,
+		size_t *len);
+/* Offset stepping within s, judging by the names (TODO confirm against the implementation). */
+charset_error _dom_utf8_prev(const uint8_t *s, uint32_t off,
+		uint32_t *prevoff);
+charset_error _dom_utf8_next(const uint8_t *s, uint32_t len,
+		uint32_t off, uint32_t *nextoff);
+/* Presumably a stricter variant of _dom_utf8_next (TODO confirm against the implementation). */
+charset_error _dom_utf8_next_paranoid(const uint8_t *s, uint32_t len,
+		uint32_t off, uint32_t *nextoff);
+
+#endif
+
-- 
cgit v1.2.3