/* * This file is part of libdom. * Licensed under the MIT License, * http://www.opensource.org/licenses/mit-license.php * Copyright 2007 John-Mark Bell */ #include #include #include #include #include "core/document.h" #include "utils/utils.h" #include "utils/utf8.h" #include "utils/utf16.h" /** * A DOM string * * DOM strings store either a pointer to allocated data, a pointer * to constant data or an offset into a document buffer. * * They are reference counted so freeing is performed correctly. */ struct dom_string { enum { DOM_STRING_PTR, DOM_STRING_CONST_PTR, DOM_STRING_OFFSET, DOM_STRING_PTR_NODOC } type; /**< String type */ dom_string_charset charset; /**< Charset of string */ union { uint8_t *ptr; const uint8_t *cptr; uint32_t offset; } data; /**< Type-specific data */ size_t len; /**< Byte length of string */ union { struct dom_document *doc; /**< Owning document */ struct { dom_alloc alloc; /**< Memory (de)allocation * function */ void *pw; /**< Client-specific data */ } nodoc; } ctx; /**< Allocation context */ uint32_t refcnt; /**< Reference count */ }; static struct dom_string empty_string = { .type = DOM_STRING_CONST_PTR, .charset = DOM_STRING_UTF8, .data.ptr = NULL, .len = 0, .ctx.doc = NULL, .refcnt = 1 }; static dom_exception __dom_string_get_data(struct dom_string *str, const uint8_t **data, size_t *len); /** * Claim a reference on a DOM string * * \param str The string to claim a reference on */ void dom_string_ref(struct dom_string *str) { str->refcnt++; } /** * Release a reference on a DOM string * * \param str The string to release the reference from * * If the reference count reaches zero, any memory claimed by the * string will be released */ void dom_string_unref(struct dom_string *str) { if (--str->refcnt == 0) { if (str->type == DOM_STRING_PTR_NODOC) { str->ctx.nodoc.alloc(str->data.ptr, 0, str->ctx.nodoc.pw); str->ctx.nodoc.alloc(str, 0, str->ctx.nodoc.pw); } else { if (str->type == DOM_STRING_PTR) { dom_document_alloc(str->ctx.doc, str->data.ptr, 0); } dom_document_alloc(str->ctx.doc, str, 0); } } } /** * Create a DOM string from an offset into the document buffer * * \param doc The document in which the string resides * \param off Offset from start of document buffer * \param len Length, in bytes, of string * \param str Pointer to location to receive pointer to new string * \return DOM_NO_ERR on success, DOM_NO_MEM_ERR on memory exhaustion * * The returned string will already be referenced, so there is no need * to explicitly reference it. */ dom_exception dom_string_create_from_off(struct dom_document *doc, uint32_t off, size_t len, struct dom_string **str) { struct dom_string *ret; ret = dom_document_alloc(doc, NULL, sizeof(struct dom_string)); if (ret == NULL) return DOM_NO_MEM_ERR; ret->type = DOM_STRING_OFFSET; ret->charset = dom_document_get_charset(doc); ret->data.offset = off; ret->len = len; ret->ctx.doc = doc; ret->refcnt = 1; *str = ret; return DOM_NO_ERR; } /** * Create a DOM string from a string of characters * * \param doc The document in which the string resides * \param ptr Pointer to string of characters * \param len Length, in bytes, of string of characters * \param str Pointer to location to receive pointer to new string * \return DOM_NO_ERR on success, DOM_NO_MEM_ERR on memory exhaustion * * The returned string will already be referenced, so there is no need * to explicitly reference it. * * The string of characters passed in will be copied for use by the * returned DOM string. */ dom_exception dom_string_create_from_ptr(struct dom_document *doc, const uint8_t *ptr, size_t len, struct dom_string **str) { struct dom_string *ret; ret = dom_document_alloc(doc, NULL, sizeof(struct dom_string)); if (ret == NULL) return DOM_NO_MEM_ERR; ret->data.ptr = dom_document_alloc(doc, NULL, len); if (ret->data.ptr == NULL) { dom_document_alloc(doc, ret, 0); return DOM_NO_MEM_ERR; } ret->type = DOM_STRING_PTR; ret->charset = dom_document_get_charset(doc); memcpy(ret->data.ptr, ptr, len); ret->len = len; ret->ctx.doc = doc; ret->refcnt = 1; *str = ret; return DOM_NO_ERR; } /** * Create a DOM string from a constant string of characters * * \param doc The document in which the string resides * \param ptr Pointer to string of characters * \param len Length, in bytes, of string of characters * \param str Pointer to location to receive pointer to new string * \return DOM_NO_ERR on success, DOM_NO_MEM_ERR on memory exhaustion * * The returned string will already be referenced, so there is no need * to explicitly reference it. * * The string of characters passed in will _not_ be copied for use by the * returned DOM string. */ dom_exception dom_string_create_from_const_ptr(struct dom_document *doc, const uint8_t *ptr, size_t len, struct dom_string **str) { struct dom_string *ret; ret = dom_document_alloc(doc, NULL, sizeof(struct dom_string)); if (ret == NULL) return DOM_NO_MEM_ERR; ret->type = DOM_STRING_CONST_PTR; ret->charset = dom_document_get_charset(doc); ret->data.cptr = ptr; ret->len = len; ret->ctx.doc = doc; ret->refcnt = 1; *str = ret; return DOM_NO_ERR; } /** * Create a DOM string from a string of characters that does not belong * to a document * * \param alloc Memory (de)allocation function * \param pw Pointer to client-specific private data * \param charset The charset of the string * \param ptr Pointer to string of characters * \param len Length, in bytes, of string of characters * \param str Pointer to location to receive result * \return DOM_NO_ERR on success, DOM_NO_MEM_ERR on memory exhaustion * * The returned string will already be referenced, so there is no need * to explicitly reference it. * * The string of characters passed in will be copied for use by the * returned DOM string. */ dom_exception dom_string_create_from_ptr_no_doc(dom_alloc alloc, void *pw, dom_string_charset charset, const uint8_t *ptr, size_t len, struct dom_string **str) { struct dom_string *ret; ret = alloc(NULL, sizeof(struct dom_string), pw); if (ret == NULL) return DOM_NO_MEM_ERR; ret->data.ptr = alloc(NULL, len, pw); if (ret->data.ptr == NULL) { alloc(ret, 0, pw); return DOM_NO_MEM_ERR; } ret->type = DOM_STRING_PTR_NODOC; ret->charset = charset; memcpy(ret->data.ptr, ptr, len); ret->len = len; ret->ctx.nodoc.alloc = alloc; ret->ctx.nodoc.pw = pw; ret->refcnt = 1; *str = ret; return DOM_NO_ERR; } /** * Case sensitively compare two DOM strings * * \param s1 The first string to compare * \param s2 The second string to compare * \return 0 if strings match, non-0 otherwise * * NULL and "" will match. */ int dom_string_cmp(struct dom_string *s1, struct dom_string *s2) { const uint8_t *d1 = NULL; const uint8_t *d2 = NULL; size_t l1, l2; dom_exception err; err = __dom_string_get_data(s1, &d1, &l1); if (err != DOM_NO_ERR) return 1; /* arbitrary */ err = __dom_string_get_data(s2, &d2, &l2); if (err != DOM_NO_ERR) return 1; /* arbitrary */ while (l1 > 0 && l2 > 0) { uint32_t c1, c2; size_t cl1, cl2; charset_error err; err = (s1->charset == DOM_STRING_UTF8) ? _dom_utf8_to_ucs4(d1, l1, &c1, &cl1) : _dom_utf16_to_ucs4(d1, l1, &c1, &cl1); if (err != CHARSET_OK) { } err = (s2->charset == DOM_STRING_UTF8) ? _dom_utf8_to_ucs4(d2, l2, &c2, &cl2) : _dom_utf16_to_ucs4(d2, l2, &c2, &cl2); if (err != CHARSET_OK) { } if (c1 != c2) { return (int)(c1 - c2); } d1 += cl1; d2 += cl2; l1 -= cl1; l2 -= cl2; } return (int)(l1 - l2); } /** * Case insensitively compare two DOM strings * * \param s1 The first string to compare * \param s2 The second string to compare * \return 0 if strings match, non-0 otherwise * * NULL and "" will match. */ int dom_string_icmp(struct dom_string *s1, struct dom_string *s2) { const uint8_t *d1 = NULL; const uint8_t *d2 = NULL; size_t l1, l2; dom_exception err; err = __dom_string_get_data(s1, &d1, &l1); if (err != DOM_NO_ERR) return 1; /* arbitrary */ err = __dom_string_get_data(s2, &d2, &l2); if (err != DOM_NO_ERR) return 1; /* arbitrary */ while (l1 > 0 && l2 > 0) { uint32_t c1, c2; size_t cl1, cl2; charset_error err; err = (s1->charset == DOM_STRING_UTF8) ? _dom_utf8_to_ucs4(d1, l1, &c1, &cl1) : _dom_utf16_to_ucs4(d1, l1, &c1, &cl1); if (err != CHARSET_OK) { } err = (s2->charset == DOM_STRING_UTF8) ? _dom_utf8_to_ucs4(d2, l2, &c2, &cl2) : _dom_utf16_to_ucs4(d2, l2, &c2, &cl2); if (err != CHARSET_OK) { } /** \todo improved lower-casing algorithm */ if (tolower(c1) != tolower(c2)) { return (int)(tolower(c1) - tolower(c2)); } d1 += cl1; d2 += cl2; l1 -= cl1; l2 -= cl2; } return (int)(l1 - l2); } /** * Get the index of the first occurrence of a character in a dom string * * \param str The string to search in * \param chr UCS4 value to look for * \return Character index of found character, or -1 if none found */ uint32_t dom_string_index(struct dom_string *str, uint32_t chr) { const uint8_t *s; size_t clen, slen; uint32_t c, index; charset_error err; __dom_string_get_data(str, &s, &slen); index = 0; while (slen > 0) { if (str->charset == DOM_STRING_UTF8) { err = _dom_utf8_to_ucs4(s, slen, &c, &clen); } else { err = _dom_utf16_to_ucs4(s, slen, &c, &clen); } if (err != CHARSET_OK) { return (uint32_t) -1; } if (c == chr) { return index; } s += clen; slen -= clen; index++; } return (uint32_t) -1; } /** * Get the index of the last occurrence of a character in a dom string * * \param str The string to search in * \param chr UCS4 value to look for * \return Character index of found character, or -1 if none found */ uint32_t dom_string_rindex(struct dom_string *str, uint32_t chr) { const uint8_t *s; size_t clen, slen; uint32_t c, index; charset_error err; __dom_string_get_data(str, &s, &slen); index = dom_string_length(str); while (slen > 0) { if (str->charset == DOM_STRING_UTF8) { err = _dom_utf8_prev(s, slen, &clen); if (err == CHARSET_OK) { err = _dom_utf8_to_ucs4(s + clen, slen - clen, &c, &clen); } } else { err = _dom_utf16_prev(s, slen, &clen); if (err == CHARSET_OK) { err = _dom_utf16_to_ucs4(s + clen, slen - clen, &c, &clen); } } if (err != CHARSET_OK) { return (uint32_t) -1; } if (c == chr) { return index; } slen -= clen; index--; } return (uint32_t) -1; } /** * Get the length, in characters, of a dom string * * \param str The string to measure the length of * \return The length of the string, in characters */ uint32_t dom_string_length(struct dom_string *str) { const uint8_t *s; size_t slen; uint32_t clen; charset_error err; __dom_string_get_data(str, &s, &slen); if (str->charset == DOM_STRING_UTF8) { err = _dom_utf8_length(s, slen, &clen); } else { err = _dom_utf16_length(s, slen, &clen); } if (err != CHARSET_OK) { return 0; } return clen; } /** * Concatenate two dom strings * * \param s1 The first string * \param s2 The second string * \param result Pointer to location to receive result * \return DOM_NO_ERR on success, DOM_NO_MEM_ERR on memory exhaustion * * The returned string will be allocated using the allocation details * stored in ::s1. * * The returned string will have its reference count increased. The client * should dereference it once it has finished with it. */ dom_exception dom_string_concat(struct dom_string *s1, struct dom_string *s2, struct dom_string **result) { struct dom_string *concat; const uint8_t *s; size_t slen; if (s1->type == DOM_STRING_PTR_NODOC) { concat = s1->ctx.nodoc.alloc(NULL, sizeof(struct dom_string), s1->ctx.nodoc.pw); } else { concat = dom_document_alloc(s1->ctx.doc, NULL, sizeof(struct dom_string)); } if (concat == NULL) { return DOM_NO_MEM_ERR; } /** \todo support attempted concatenation of mismatched charsets */ if (s1->type == DOM_STRING_PTR_NODOC) { concat->data.ptr = s1->ctx.nodoc.alloc(NULL, s1->len + s2->len, s1->ctx.nodoc.pw); } else { concat->data.ptr = dom_document_alloc(s1->ctx.doc, NULL, s1->len + s2->len); } if (concat->data.ptr == NULL) { if (s1->type == DOM_STRING_PTR_NODOC) { s1->ctx.nodoc.alloc(concat, 0, s1->ctx.nodoc.pw); } else { dom_document_alloc(s1->ctx.doc, concat, 0); } return DOM_NO_MEM_ERR; } concat->type = (s1->type == DOM_STRING_PTR_NODOC) ? DOM_STRING_PTR_NODOC : DOM_STRING_PTR; concat->charset = s1->charset; __dom_string_get_data(s1, &s, &slen); memcpy(concat->data.ptr, s, slen); __dom_string_get_data(s2, &s, &slen); memcpy(concat->data.ptr + s1->len, s, slen); concat->len = s1->len + s2->len; if (concat->type == DOM_STRING_PTR_NODOC) { concat->ctx.nodoc.alloc = s1->ctx.nodoc.alloc; concat->ctx.nodoc.pw = s1->ctx.nodoc.pw; } else { concat->ctx.doc = s1->ctx.doc; } concat->refcnt = 1; *result = concat; return DOM_NO_ERR; } /** * Extract a substring from a dom string * * \param str The string to extract from * \param i1 The character index of the start of the substring * \param i2 The character index of the end of the substring * \param result Pointer to location to receive result * \return DOM_NO_ERR on success, DOM_NO_MEM_ERR on memory exhaustion * * The returned string will be allocated using the allocation details * stored in ::str. * * The returned string will have its reference count increased. The client * should dereference it once it has finished with it. */ dom_exception dom_string_substr(struct dom_string *str, uint32_t i1, uint32_t i2, struct dom_string **result) { const uint8_t *s; size_t slen; size_t b1, b2; charset_error err; __dom_string_get_data(str, &s, &slen); /* Initialise the byte index of the start to 0 */ b1 = 0; /* Make the end a character offset from the start */ i2 -= i1; /* Calculate the byte index of the start */ while (i1 > 0) { if (str->charset == DOM_STRING_UTF8) { err = _dom_utf8_next(s, slen - b1, b1, &b1); } else { err = _dom_utf16_next(s, slen - b1, b1, &b1); } if (err != CHARSET_OK) { return DOM_NO_MEM_ERR; } i1--; } /* Initialise the byte index of the end to that of the start */ b2 = b1; /* Calculate the byte index of the end */ while (i2 > 0) { if (str->charset == DOM_STRING_UTF8) { err = _dom_utf8_next(s, slen - b2, b2, &b2); } else { err = _dom_utf16_next(s, slen - b2, b2, &b2); } if (err != CHARSET_OK) { return DOM_NO_MEM_ERR; } i2--; } /* Create a string from the specified byte range */ return (str->type == DOM_STRING_PTR_NODOC) ? dom_string_create_from_ptr_no_doc( str->ctx.nodoc.alloc, str->ctx.nodoc.pw, str->charset, s + b1, b2 - b1, result) : dom_string_create_from_ptr(str->ctx.doc, s + b1, b2 - b1, result); } /** * Insert data into a dom string at the given location * * \param target Pointer to string to insert into * \param source Pointer to string to insert * \param offset Character offset of location to insert at * \param result Pointer to location to receive result * \return DOM_NO_ERR on success, * DOM_NO_MEM_ERR on memory exhaustion, * DOM_INDEX_SIZE_ERR if ::offset > len(::target). * * The returned string will be allocated using the allocation details * stored in ::target. * * The returned string will have its reference count increased. The client * should dereference it once it has finished with it. */ dom_exception dom_string_insert(struct dom_string *target, struct dom_string *source, uint32_t offset, struct dom_string **result) { struct dom_string *res; const uint8_t *t, *s; uint32_t tlen, slen, clen; uint32_t ins = 0; charset_error err; __dom_string_get_data(target, &t, &tlen); __dom_string_get_data(source, &s, &slen); clen = dom_string_length(target); if (offset > clen) return DOM_INDEX_SIZE_ERR; /* Calculate the byte index of the insertion point */ if (offset == clen) { /* Optimisation for append */ offset = 0; ins = tlen; } else { while (offset > 0) { if (target->charset == DOM_STRING_UTF8) { err = _dom_utf8_next(t, tlen - ins, ins, &ins); } else { err = _dom_utf16_next(t, tlen - ins, ins, &ins); } if (err != CHARSET_OK) { return DOM_NO_MEM_ERR; } offset--; } } /* Allocate result string */ if (target->type == DOM_STRING_PTR_NODOC) { res = target->ctx.nodoc.alloc(NULL, sizeof(struct dom_string), target->ctx.nodoc.pw); } else { res = dom_document_alloc(target->ctx.doc, NULL, sizeof(struct dom_string)); } if (res == NULL) { return DOM_NO_MEM_ERR; } /** \todo support insertion of a string from a different charset */ /* Allocate data buffer for result contents */ if (target->type == DOM_STRING_PTR_NODOC) { res->data.ptr = target->ctx.nodoc.alloc(NULL, tlen + slen, target->ctx.nodoc.pw); } else { res->data.ptr = dom_document_alloc(target->ctx.doc, NULL, tlen + slen); } if (res->data.ptr == NULL) { if (target->type == DOM_STRING_PTR_NODOC) { target->ctx.nodoc.alloc(res, 0, target->ctx.nodoc.pw); } else { dom_document_alloc(target->ctx.doc, res, 0); } return DOM_NO_MEM_ERR; } /* Populate result members */ res->type = (target->type == DOM_STRING_PTR_NODOC) ? DOM_STRING_PTR_NODOC : DOM_STRING_PTR; res->charset = target->charset; /* Copy initial portion of target, if any, into result */ if (ins > 0) { memcpy(res->data.ptr, t, ins); } /* Copy inserted data into result */ memcpy(res->data.ptr + ins, s, slen); /* Copy remainder of target, if any, into result */ if (tlen - ins > 0) { memcpy(res->data.ptr + ins + slen, t + ins, tlen - ins); } res->len = tlen + slen; if (res->type == DOM_STRING_PTR_NODOC) { res->ctx.nodoc.alloc = target->ctx.nodoc.alloc; res->ctx.nodoc.pw = target->ctx.nodoc.pw; } else { res->ctx.doc = target->ctx.doc; } res->refcnt = 1; *result = res; return DOM_NO_ERR; } /** * Replace a section of a dom string * * \param target Pointer to string of which to replace a section * \param source Pointer to replacement string * \param i1 Character index of start of region to replace * \param i2 Character index of end of region to replace * \param result Pointer to location to receive result * \return DOM_NO_ERR on success, DOM_NO_MEM_ERR on memory exhaustion. * * The returned string will be allocated using the allocation details * stored in ::target. * * The returned string will have its reference count increased. The client * should dereference it once it has finished with it. */ dom_exception dom_string_replace(struct dom_string *target, struct dom_string *source, uint32_t i1, uint32_t i2, struct dom_string **result) { struct dom_string *res; const uint8_t *t, *s; uint32_t tlen, slen; uint32_t b1, b2; charset_error err; __dom_string_get_data(target, &t, &tlen); __dom_string_get_data(source, &s, &slen); /* Initialise the byte index of the start to 0 */ b1 = 0; /* Make the end a character offset from the start */ i2 -= i1; /* Calculate the byte index of the start */ while (i1 > 0) { if (target->charset == DOM_STRING_UTF8) { err = _dom_utf8_next(s, slen - b1, b1, &b1); } else { err = _dom_utf16_next(s, slen - b1, b1, &b1); } if (err != CHARSET_OK) { return DOM_NO_MEM_ERR; } i1--; } /* Initialise the byte index of the end to that of the start */ b2 = b1; /* Calculate the byte index of the end */ while (i2 > 0) { if (target->charset == DOM_STRING_UTF8) { err = _dom_utf8_next(s, slen - b2, b2, &b2); } else { err = _dom_utf16_next(s, slen - b2, b2, &b2); } if (err != CHARSET_OK) { return DOM_NO_MEM_ERR; } i2--; } /* Allocate result string */ if (target->type == DOM_STRING_PTR_NODOC) { res = target->ctx.nodoc.alloc(NULL, sizeof(struct dom_string), target->ctx.nodoc.pw); } else { res = dom_document_alloc(target->ctx.doc, NULL, sizeof(struct dom_string)); } if (res == NULL) { return DOM_NO_MEM_ERR; } /** \todo support insertion of a string from a different charset */ /* Allocate data buffer for result contents */ if (target->type == DOM_STRING_PTR_NODOC) { res->data.ptr = target->ctx.nodoc.alloc(NULL, tlen + slen - (b2 - b1), target->ctx.nodoc.pw); } else { res->data.ptr = dom_document_alloc(target->ctx.doc, NULL, tlen + slen - (b2 - b1)); } if (res->data.ptr == NULL) { if (target->type == DOM_STRING_PTR_NODOC) { target->ctx.nodoc.alloc(res, 0, target->ctx.nodoc.pw); } else { dom_document_alloc(target->ctx.doc, res, 0); } return DOM_NO_MEM_ERR; } /* Populate result members */ res->type = (target->type == DOM_STRING_PTR_NODOC) ? DOM_STRING_PTR_NODOC : DOM_STRING_PTR; res->charset = target->charset; /* Copy initial portion of target, if any, into result */ if (b1 > 0) { memcpy(res->data.ptr, t, b1); } /* Copy replacement data into result */ if (slen > 0) { memcpy(res->data.ptr + b1, s, slen); } /* Copy remainder of target, if any, into result */ if (tlen - b2 > 0) { memcpy(res->data.ptr + b1 + slen, t + b2, tlen - b2); } res->len = tlen + slen - (b2 - b1); if (res->type == DOM_STRING_PTR_NODOC) { res->ctx.nodoc.alloc = target->ctx.nodoc.alloc; res->ctx.nodoc.pw = target->ctx.nodoc.pw; } else { res->ctx.doc = target->ctx.doc; } res->refcnt = 1; *result = res; return DOM_NO_ERR; } /** * Duplicate a dom string * * \param str The string to duplicate * \param result Pointer to location to receive result * \return DOM_NO_ERR on success, DOM_NO_MEM_ERR on memory exhaustion * * The returned string will be allocated using the allocation details * stored in ::str. * * The returned string will have its reference count increased. The client * should dereference it once it has finished with it. */ dom_exception dom_string_dup(struct dom_string *str, struct dom_string **result) { const uint8_t *s; size_t slen; __dom_string_get_data(str, &s, &slen); return str->type == DOM_STRING_PTR_NODOC ? dom_string_create_from_ptr_no_doc( str->ctx.nodoc.alloc, str->ctx.nodoc.pw, str->charset, s, slen, result) : dom_string_create_from_ptr(str->ctx.doc, s, slen, result); } /** * Calculate a hash value from a dom string * * \param str The string to calculate a hash of * \return The hash value associated with the string */ uint32_t dom_string_hash(struct dom_string *str) { const uint8_t *s; size_t slen; uint32_t hash = 0x01000193; __dom_string_get_data(str, &s, &slen); while (slen > 0) { hash *= 0x01000193; hash ^= *s; s++; slen--; } return hash; } /* */ /*---------------------------------------------------------------------------*/ /* */ /** * Get a pointer to the string of characters within a DOM string * * \param str Pointer to DOM string to retrieve pointer from * \param data Pointer to location to receive data * \param len Pointer to location to receive byte length of data * \return DOM_NO_ERR on success * * The caller must have previously claimed a reference on the DOM string. * The returned pointer must not be freed. */ dom_exception __dom_string_get_data(struct dom_string *str, const uint8_t **data, size_t *len) { /* Assume that a NULL str pointer indicates the empty string */ if (str == NULL) str = &empty_string; switch (str->type) { case DOM_STRING_PTR: *data = str->data.ptr; break; case DOM_STRING_CONST_PTR: *data = str->data.cptr; break; case DOM_STRING_OFFSET: *data = dom_document_get_base(str->ctx.doc) + str->data.offset; break; case DOM_STRING_PTR_NODOC: *data = str->data.ptr; break; } *len = str->len; return DOM_NO_ERR; }