summaryrefslogtreecommitdiff
diff options
context:
space:
mode:
authorMichael Drake <mike@smoothartist.com>2016-11-21 14:22:39 +0000
committerSteven G. Johnson <stevenj@mit.edu>2016-11-21 09:22:39 -0500
commit70bbed8626e902d8c1e2b8277b0c61efb8a460bb (patch)
treee8a3bbc9ba7034cd38be424009b4845688fb9919
parentcaef918abd0a9425b3942df3859c7bea7b8986e0 (diff)
downloadlibutf8proc-70bbed8626e902d8c1e2b8277b0c61efb8a460bb.tar.gz
libutf8proc-70bbed8626e902d8c1e2b8277b0c61efb8a460bb.tar.bz2
Tlsa/ucs4 normalize (#88)
* Split codepoint sequence normalisation out into separate function. This creates utf8proc_normalize_utf32() which takes and returns a UTF-32 string, applying the following options: - UTF8PROC_NLF2LS - UTF8PROC_NLF2PS - UTF8PROC_NLF2LF - UTF8PROC_STRIPCC - UTF8PROC_COMPOSE - UTF8PROC_STABLE The utf8proc_reencode() function has been updated to call the new utf8proc_normalize_utf32(). * Update code documentation: utf8proc_reencode handles UTF8PROC_CHARBOUND.
-rw-r--r--utf8proc.c13
-rw-r--r--utf8proc.h32
2 files changed, 40 insertions, 5 deletions
diff --git a/utf8proc.c b/utf8proc.c
index ba5143a..8dc583e 100644
--- a/utf8proc.c
+++ b/utf8proc.c
@@ -545,9 +545,8 @@ UTF8PROC_DLLEXPORT utf8proc_ssize_t utf8proc_decompose(
return wpos;
}
-UTF8PROC_DLLEXPORT utf8proc_ssize_t utf8proc_reencode(utf8proc_int32_t *buffer, utf8proc_ssize_t length, utf8proc_option_t options) {
- /* UTF8PROC_NULLTERM option will be ignored, 'length' is never ignored
- ASSERT: 'buffer' has one spare byte of free space at the end! */
+UTF8PROC_DLLEXPORT utf8proc_ssize_t utf8proc_normalize_utf32(utf8proc_int32_t *buffer, utf8proc_ssize_t length, utf8proc_option_t options) {
+ /* UTF8PROC_NULLTERM option will be ignored, 'length' is never ignored */
if (options & (UTF8PROC_NLF2LS | UTF8PROC_NLF2PS | UTF8PROC_STRIPCC)) {
utf8proc_ssize_t rpos;
utf8proc_ssize_t wpos = 0;
@@ -655,6 +654,14 @@ UTF8PROC_DLLEXPORT utf8proc_ssize_t utf8proc_reencode(utf8proc_int32_t *buffer,
}
length = wpos;
}
+ return length;
+}
+
+UTF8PROC_DLLEXPORT utf8proc_ssize_t utf8proc_reencode(utf8proc_int32_t *buffer, utf8proc_ssize_t length, utf8proc_option_t options) {
+ /* UTF8PROC_NULLTERM option will be ignored, 'length' is never ignored
+ ASSERT: 'buffer' has one spare byte of free space at the end! */
+ length = utf8proc_normalize_utf32(buffer, length, options);
+ if (length < 0) return length;
{
utf8proc_ssize_t rpos, wpos = 0;
utf8proc_int32_t uc;
diff --git a/utf8proc.h b/utf8proc.h
index 186641d..a81ab75 100644
--- a/utf8proc.h
+++ b/utf8proc.h
@@ -491,8 +491,34 @@ UTF8PROC_DLLEXPORT utf8proc_ssize_t utf8proc_decompose(
);
/**
+ * Normalizes the sequence of `length` codepoints pointed to by `buffer`
+ * in-place (i.e., the result is also stored in `buffer`).
+ *
+ * @param buffer the (native-endian UTF-32) unicode codepoints to re-encode.
+ * @param length the length (in codepoints) of the buffer.
+ * @param options a bitwise or (`|`) of one or more of the following flags:
+ * - @ref UTF8PROC_NLF2LS - convert LF, CRLF, CR and NEL into LS
+ * - @ref UTF8PROC_NLF2PS - convert LF, CRLF, CR and NEL into PS
+ * - @ref UTF8PROC_NLF2LF - convert LF, CRLF, CR and NEL into LF
+ * - @ref UTF8PROC_STRIPCC - strip or convert all non-affected control characters
+ * - @ref UTF8PROC_COMPOSE - try to combine decomposed codepoints into composite
+ * codepoints
+ * - @ref UTF8PROC_STABLE - prohibit combining characters that would violate
+ * the unicode versioning stability
+ *
+ * @return
+ * In case of success, the length (in codepoints) of the normalized UTF-32 string is
+ * returned; otherwise, a negative error code is returned (@ref utf8proc_errmsg).
+ *
+ * @warning The entries of the array pointed to by `str` have to be in the
+ * range `0x0000` to `0x10FFFF`. Otherwise, the program might crash!
+ */
+UTF8PROC_DLLEXPORT utf8proc_ssize_t utf8proc_normalize_utf32(utf8proc_int32_t *buffer, utf8proc_ssize_t length, utf8proc_option_t options);
+
+/**
* Reencodes the sequence of `length` codepoints pointed to by `buffer`
* UTF-8 data in-place (i.e., the result is also stored in `buffer`).
+ * Can optionally normalize the UTF-32 sequence prior to UTF-8 conversion.
*
* @param buffer the (native-endian UTF-32) unicode codepoints to re-encode.
* @param length the length (in codepoints) of the buffer.
@@ -505,10 +531,12 @@ UTF8PROC_DLLEXPORT utf8proc_ssize_t utf8proc_decompose(
* codepoints
* - @ref UTF8PROC_STABLE - prohibit combining characters that would violate
* the unicode versioning stability
+ * - @ref UTF8PROC_CHARBOUND - insert 0xFF bytes before each grapheme cluster
*
* @return
- * In case of success, the length (in bytes) of the resulting UTF-8 string is
- * returned; otherwise, a negative error code is returned (@ref utf8proc_errmsg).
+ * In case of success, the length (in bytes) of the resulting nul-terminated
+ * UTF-8 string is returned; otherwise, a negative error code is returned
+ * (@ref utf8proc_errmsg).
*
* @warning The amount of free space pointed to by `buffer` must
* exceed the amount of the input data by one byte, and the