summary refs log tree commit diff
path: root/utils/utf8.h
diff options
context:
space:
mode:
author Vincent Sanders <vince@netsurf-browser.org> 2014-01-28 21:40:13 +0000
committer Vincent Sanders <vince@netsurf-browser.org> 2014-01-28 21:40:13 +0000
commit 654da2ffb5abf2afe9532f1d0cb77ed88f8a97cc (patch)
tree efbc3d97a8650b682b7181e748698de9c555d80d /utils/utf8.h
parent 4b760c7e499e2f5cb1d06242d5f186e14e94496a (diff)
download netsurf-654da2ffb5abf2afe9532f1d0cb77ed88f8a97cc.tar.gz
netsurf-654da2ffb5abf2afe9532f1d0cb77ed88f8a97cc.tar.bz2
move utf8 conversion routines to use nserror instead of their own error enum
Diffstat (limited to 'utils/utf8.h')
-rw-r--r-- utils/utf8.h 130
1 files changed, 117 insertions, 13 deletions
diff --git a/utils/utf8.h b/utils/utf8.h
index 68d42d3ea..7c450b5c3 100644
--- a/utils/utf8.h
+++ b/utils/utf8.h
@@ -26,40 +26,144 @@
#include <stdbool.h>
#include <stdint.h>
-typedef enum {
- UTF8_CONVERT_OK,
- UTF8_CONVERT_NOMEM,
- UTF8_CONVERT_BADENC
-} utf8_convert_ret;
+#include "utils/errors.h"
+/**
+ * Convert a UTF-8 multibyte sequence into a single UCS4 character
+ *
+ * Encoding of UCS values outside the UTF-16 plane has been removed from
+ * RFC3629. This function conforms to RFC2279, however.
+ *
+ * \param s The sequence to process
+ * \param l Length of sequence
+ * \return UCS4 character
+ */
uint32_t utf8_to_ucs4(const char *s, size_t l);
+
+/**
+ * Convert a single UCS4 character into a UTF-8 multibyte sequence
+ *
+ * Encoding of UCS values outside the UTF-16 plane has been removed from
+ * RFC3629. This function conforms to RFC2279, however.
+ *
+ * \param c The character to process (0 <= c <= 0x7FFFFFFF)
+ * \param s Pointer to 6 byte long output buffer
+ * \return Length of multibyte sequence
+ */
size_t utf8_from_ucs4(uint32_t c, char *s);
+
+/**
+ * Calculate the length (in characters) of a NUL-terminated UTF-8 string
+ *
+ * \param s The string
+ * \return Length of string
+ */
size_t utf8_length(const char *s);
+
+/**
+ * Calculate the length (in characters) of a bounded UTF-8 string
+ *
+ * \param s The string
+ * \param l Maximum length of input (in bytes)
+ * \return Length of string, in characters
+ */
size_t utf8_bounded_length(const char *s, size_t l);
+
+/**
+ * Calculate the length (in bytes) of a bounded UTF-8 string
+ *
+ * \param s The string
+ * \param l Maximum length of input (in bytes)
+ * \param c Maximum number of characters to measure
+ * \return Length of string, in bytes
+ */
size_t utf8_bounded_byte_length(const char *s, size_t l, size_t c);
+/**
+ * Calculate the length (in bytes) of a UTF-8 character
+ *
+ * \param s Pointer to start of character
+ * \return Length of character, in bytes
+ */
size_t utf8_char_byte_length(const char *s);
+
+/**
+ * Find previous legal UTF-8 char in string
+ *
+ * \param s The string
+ * \param o Offset in the string to start at
+ * \return Offset of first byte of previous legal character
+ */
size_t utf8_prev(const char *s, size_t o);
+
+/**
+ * Find next legal UTF-8 char in string
+ *
+ * \param s The string
+ * \param l Maximum offset in string
+ * \param o Offset in the string to start at
+ * \return Offset of first byte of next legal character
+ */
size_t utf8_next(const char *s, size_t l, size_t o);
-utf8_convert_ret utf8_to_enc(const char *string, const char *encname,
+
+/**
+ * Convert a UTF8 string into the named encoding
+ *
+ * \param string The NUL-terminated string to convert
+ * \param encname The encoding name (suitable for passing to iconv)
+ * \param len Length of input string to consider (in bytes), or 0
+ * \param result Pointer to location to store result (allocated on heap)
+ * \return standard nserror value
+ */
+nserror utf8_to_enc(const char *string, const char *encname,
size_t len, char **result);
-utf8_convert_ret utf8_from_enc(const char *string, const char *encname,
+
+/**
+ * Convert a string in the named encoding into a UTF-8 string
+ *
+ * \param string The NUL-terminated string to convert
+ * \param encname The encoding name (suitable for passing to iconv)
+ * \param len Length of input string to consider (in bytes), or 0
+ * \param result Pointer to location to store result (allocated on heap)
+ * \return standard nserror value
+ */
+nserror utf8_from_enc(const char *string, const char *encname,
size_t len, char **result, size_t *result_len);
-utf8_convert_ret utf8_to_html(const char *string, const char *encname,
+/**
+ * Convert a UTF-8 encoded string into a string of the given encoding,
+ * applying HTML escape sequences where necessary.
+ *
+ * \param string String to convert (NUL-terminated)
+ * \param encname Name of encoding to convert to
+ * \param len Length, in bytes, of the input string, or 0
+ * \param result Pointer to location to receive result
+ * \return standard nserror code
+ */
+nserror utf8_to_html(const char *string, const char *encname,
size_t len, char **result);
+/**
+ * Save the given utf8 text to a file, converting to local encoding.
+ *
+ * \param utf8_text text to save to file
+ * \param path pathname to save to
+ * \return true iff the save succeeded
+ */
bool utf8_save_text(const char *utf8_text, const char *path);
+
+/**
+ * Finalise the UTF-8 library
+ */
+nserror utf8_finalise(void);
+
/* These two are platform specific */
-utf8_convert_ret utf8_to_local_encoding(const char *string, size_t len,
- char **result);
-utf8_convert_ret utf8_from_local_encoding(const char *string, size_t len,
- char **result);
+nserror utf8_to_local_encoding(const char *string, size_t len, char **result);
+nserror utf8_from_local_encoding(const char *string, size_t len, char **result);
-void utf8_finalise(void);
#endif