summaryrefslogtreecommitdiff
path: root/src/charset/aliases.c
diff options
context:
space:
mode:
authorAndrew Sidwell <andy@entai.co.uk>2008-08-07 18:27:36 +0000
committerAndrew Sidwell <andy@entai.co.uk>2008-08-07 18:27:36 +0000
commitea5862718eaa728765ee0fb96e8c38a9aa4a653c (patch)
tree0cb2ee12d7dc40e53b3aad907f4348eebf353930 /src/charset/aliases.c
parentf4e860d40933bcfac31f5da33ef44c5389691f68 (diff)
downloadlibparserutils-ea5862718eaa728765ee0fb96e8c38a9aa4a653c.tar.gz
libparserutils-ea5862718eaa728765ee0fb96e8c38a9aa4a653c.tar.bz2
Make the lpu alias comparison stuff ignore punctuation characters, as per HTML5
svn path=/trunk/libparserutils/; revision=4947
Diffstat (limited to 'src/charset/aliases.c')
-rw-r--r--src/charset/aliases.c66
1 files changed, 60 insertions, 6 deletions
diff --git a/src/charset/aliases.c b/src/charset/aliases.c
index 1e7e6ea..ce098fb 100644
--- a/src/charset/aliases.c
+++ b/src/charset/aliases.c
@@ -12,6 +12,8 @@
#include <stdlib.h>
#include <string.h>
+#include <assert.h>
+
#include "charset/aliases.h"
#include "utils/utils.h"
@@ -32,6 +34,7 @@ static parserutils_error parserutils_charset_create_alias(const char *alias,
static parserutils_charset_aliases_canon *parserutils_charset_create_canon(
const char *canon, uint16_t mibenum,
parserutils_alloc alloc, void *pw);
+static int aliascmp(const char *s1, const char *s2, size_t s2_len);
static uint32_t parserutils_charset_hash_val(const char *alias, size_t len);
/**
@@ -246,6 +249,54 @@ bool parserutils_charset_mibenum_is_unicode(uint16_t mibenum)
mibenum == utf32be || mibenum == utf32le);
}
/**
 * Test whether a byte is ASCII whitespace or ASCII punctuation.
 *
 * Matches 0x09-0x0D (tab, LF, VT, FF, CR), 0x20-0x2F (space and
 * !"#$%&'()*+,-./), 0x3A-0x40 (:;<=>?@), 0x5B-0x60 ([\]^_`) and
 * 0x7B-0x7E ({|}~).  Everything else, including NUL and bytes with the
 * top bit set, is reported as not matching.
 *
 * The argument is evaluated more than once, so it must not have side
 * effects.
 */
#define IS_PUNCT_OR_SPACE(x) \
	((0x09 <= (x) && (x) <= 0x0D) || \
	 (0x20 <= (x) && (x) <= 0x2F) || \
	 (0x3A <= (x) && (x) <= 0x40) || \
	 (0x5B <= (x) && (x) <= 0x60) || \
	 (0x7B <= (x) && (x) <= 0x7E))


/**
 * Compare name "s1" to name "s2" (of size s2_len) case-insensitively
 * and ignoring ASCII whitespace and punctuation characters.
 *
 * See http://www.whatwg.org/specs/web-apps/current-work/#character0
 *
 * \param s1      Alias to compare to (NUL-terminated)
 * \param s2      Alias to compare (need not be NUL-terminated)
 * \param s2_len  Length of "s2" in bytes; must be non-zero
 * \returns 0 if equal, 1 otherwise
 *
 * Only the first s2_len bytes of "s2" are ever examined: bytes that
 * follow the alias in the caller's buffer never influence the result.
 */
int aliascmp(const char *s1, const char *s2, size_t s2_len)
{
	size_t s2_pos = 0;

	assert(s2_len != 0);

	for (;;) {
		/* Skip ignorable characters in both names.  The NUL that
		 * terminates s1 is not ignorable, so this loop stops there. */
		while (IS_PUNCT_OR_SPACE(*s1))
			s1++;

		/* Check the bound before touching s2[s2_pos], so that we
		 * neither read past the end of s2 nor step s2_pos beyond
		 * s2_len (which would defeat the end-of-input test below). */
		while (s2_pos < s2_len && IS_PUNCT_OR_SPACE(s2[s2_pos]))
			s2_pos++;

		/* s2 exhausted: equal only if s1 is exhausted as well */
		if (s2_pos == s2_len)
			return (*s1 == '\0') ? 0 : 1;

		/* s1 exhausted while s2 still has significant characters */
		if (*s1 == '\0')
			return 1;

		/* <ctype.h> functions require a value representable as
		 * unsigned char; plain char may be negative. */
		if (tolower((unsigned char) *s1) !=
				tolower((unsigned char) s2[s2_pos]))
			return 1;

		s1++;
		s2_pos++;
	}
}
+
+
/**
* Retrieve the canonical form of an alias name
*
@@ -266,15 +317,13 @@ parserutils_charset_aliases_canon *parserutils_charset_alias_canonicalise(
hash = parserutils_charset_hash_val(alias, len);
for (c = canon_tab[hash]; c; c = c->next)
- if (c->name_len == len &&
- strncasecmp(c->name, alias, len) == 0)
+ if (aliascmp(c->name, alias, len) == 0)
break;
if (c)
return c;
for (a = alias_tab[hash]; a; a = a->next)
- if (a->name_len == len &&
- strncasecmp(a->name, alias, len) == 0)
+ if (aliascmp(a->name, alias, len) == 0)
break;
if (a)
return a->canon;
@@ -371,8 +420,13 @@ uint32_t parserutils_charset_hash_val(const char *alias, size_t len)
if (alias == NULL)
return 0;
- while (len--)
- h = (h * 33) ^ (*s++ & ~0x20); /* case insensitive */
+ while (len--) {
+ if (IS_PUNCT_OR_SPACE(*s)) {
+ s++;
+ } else {
+ h = (h * 33) ^ (*s++ & ~0x20); /* case insensitive */
+ }
+ }
return h % HASH_SIZE;
}