path: root/src/utils/utf8.h
diff options
author: John Mark Bell <> 2007-09-30 21:10:50 +0000
committer: John Mark Bell <> 2007-09-30 21:10:50 +0000
commit: e0e38d906c8974bb22a0368a9709af9590362927 (patch)
tree: f7f4c1acff769e9a7f6cd0f1c037ba2c28a66593 /src/utils/utf8.h
parent: 49e419c9b75cc149e7f4c898c31aed33f4b2c960 (diff)
DOM Strings are now capable of containing either UTF-8 or UTF-16 encoded data.
The charset used for strings within a document is specified at document creation time. Whilst it is possible to mix charsets within a document, it is not recommended.
Things that need fixing:
+ dom_string_get_data() doesn't return the charset. It would be better to permit the client to request the charset in which the data is returned.
+ Interned node name strings will break if the document is UTF-16 (dom_document_create()). In fact, these could quite happily be globals, rather than allocating a set for each document.
+ Other uses of the DOM string constructors need checking for sanity.
+ DOM strings need to gain more utility APIs (such as getting the character length of a string, string concatenation, etc.).
svn path=/trunk/dom/; revision=3614
Diffstat (limited to 'src/utils/utf8.h')
1 files changed, 38 insertions, 0 deletions
diff --git a/src/utils/utf8.h b/src/utils/utf8.h
new file mode 100644
index 0000000..154dbb8
--- /dev/null
+++ b/src/utils/utf8.h
@@ -0,0 +1,38 @@
+/*
+ * This file is part of libdom.
+ * Licensed under the MIT License,
+ *
+ * Copyright 2007 John-Mark Bell <>
+ */
+/** \file
+ * UTF-8 manipulation functions (interface).
+ */
+#ifndef dom_utils_utf8_h_
+#define dom_utils_utf8_h_
+#include <inttypes.h>
+#include <stddef.h>
+#include "utils/charset_errors.h"
+/**
+ * Convert a UTF-8 byte sequence to a UCS-4 code point.
+ *
+ * Plain (non-inline) prototype: this header carries no definition.
+ *
+ * NOTE(review): the parameter semantics below are inferred from the
+ * signature alone; confirm against the implementation.
+ *
+ * \param s     Input bytes (not modified through this pointer)
+ * \param len   Presumably the number of bytes available at \a s
+ * \param ucs4  Presumably receives the decoded code point
+ * \param clen  Presumably receives the byte length of the decoded character
+ * \return A charset_error status code
+ */
+charset_error _dom_utf8_to_ucs4(const uint8_t *s, size_t len,
+		uint32_t *ucs4, size_t *clen);
+/**
+ * Convert a UCS-4 code point to its UTF-8 byte sequence.
+ *
+ * Plain (non-inline) prototype: this header carries no definition.
+ *
+ * NOTE(review): the parameter semantics below are inferred from the
+ * signature alone; confirm against the implementation. In particular,
+ * check whether \a len is output-only or also carries the buffer size in.
+ *
+ * \param ucs4  Code point to convert
+ * \param s     Presumably the caller-supplied buffer that receives the bytes
+ * \param len   Presumably receives the number of bytes written to \a s
+ * \return A charset_error status code
+ */
+charset_error _dom_utf8_from_ucs4(uint32_t ucs4, uint8_t *s,
+		size_t *len);
+/**
+ * Compute the length of a UTF-8 string.
+ *
+ * Plain (non-inline) prototype: this header carries no definition.
+ *
+ * NOTE(review): the parameter semantics below are inferred from the
+ * signature alone; confirm against the implementation (notably whether
+ * the result is counted in characters or in bytes).
+ *
+ * \param s    Input bytes (not modified through this pointer)
+ * \param max  Presumably the maximum number of bytes of \a s to examine
+ * \param len  Presumably receives the computed length
+ * \return A charset_error status code
+ */
+charset_error _dom_utf8_length(const uint8_t *s, size_t max,
+		size_t *len);
+/**
+ * Determine the byte length of a single UTF-8 character.
+ *
+ * Plain (non-inline) prototype: this header carries no definition.
+ *
+ * NOTE(review): the parameter semantics below are inferred from the
+ * signature alone; confirm against the implementation. No buffer length
+ * is passed, so check how many bytes at \a s the implementation reads.
+ *
+ * \param s    Presumably points at the first byte of the character
+ * \param len  Presumably receives the character's length in bytes
+ * \return A charset_error status code
+ */
+charset_error _dom_utf8_char_byte_length(const uint8_t *s,
+		size_t *len);
+/**
+ * Locate the character preceding a given offset in a UTF-8 string.
+ *
+ * Plain (non-inline) prototype: this header carries no definition.
+ *
+ * NOTE(review): the parameter semantics below are inferred from the
+ * signature alone; confirm against the implementation.
+ *
+ * \param s        Input bytes (not modified through this pointer)
+ * \param off      Presumably the byte offset in \a s to search back from
+ * \param prevoff  Presumably receives the byte offset of the previous
+ *                 character
+ * \return A charset_error status code
+ */
+charset_error _dom_utf8_prev(const uint8_t *s, uint32_t off,
+		uint32_t *prevoff);
+/**
+ * Locate the character following a given offset in a UTF-8 string.
+ *
+ * Plain (non-inline) prototype: this header carries no definition.
+ *
+ * NOTE(review): the parameter semantics below are inferred from the
+ * signature alone; confirm against the implementation.
+ *
+ * \param s        Input bytes (not modified through this pointer)
+ * \param len      Presumably the length of \a s in bytes
+ * \param off      Presumably the byte offset in \a s to search forward from
+ * \param nextoff  Presumably receives the byte offset of the next character
+ * \return A charset_error status code
+ */
+charset_error _dom_utf8_next(const uint8_t *s, uint32_t len,
+		uint32_t off, uint32_t *nextoff);
+/**
+ * Locate the character following a given offset in a UTF-8 string
+ * ("paranoid" variant of _dom_utf8_next, with an identical signature).
+ *
+ * Plain (non-inline) prototype: this header carries no definition.
+ *
+ * NOTE(review): the parameter semantics below are inferred from the
+ * signature alone, and how this variant differs from _dom_utf8_next
+ * (presumably stricter checking of the input) is not visible here;
+ * confirm against the implementation.
+ *
+ * \param s        Input bytes (not modified through this pointer)
+ * \param len      Presumably the length of \a s in bytes
+ * \param off      Presumably the byte offset in \a s to search forward from
+ * \param nextoff  Presumably receives the byte offset of the next character
+ * \return A charset_error status code
+ */
+charset_error _dom_utf8_next_paranoid(const uint8_t *s, uint32_t len,
+		uint32_t off, uint32_t *nextoff);