From db0856606a7f2da053a2a7ff2cee12b69d23cb52 Mon Sep 17 00:00:00 2001
From: Chris Young
Date: Fri, 30 May 2014 20:03:04 +0100
Subject: IDNA2008 support.

---
 utils/Makefile |   3 +-
 utils/idna.c   | 625 +++++++++++++++++++++++++++++++++++++++++++++++++++++++++
 utils/idna.h   |  54 +++++
 utils/nsurl.c  |  21 +-
 4 files changed, 697 insertions(+), 6 deletions(-)
 create mode 100644 utils/idna.c
 create mode 100644 utils/idna.h

diff --git a/utils/Makefile b/utils/Makefile
index c808b91ba..b1a6df4fc 100644
--- a/utils/Makefile
+++ b/utils/Makefile
@@ -2,6 +2,7 @@
 
 S_UTILS := base64.c corestrings.c filename.c filepath.c hashtable.c \
 	libdom.c locale.c log.c messages.c nsurl.c talloc.c url.c \
-	utf8.c utils.c useragent.c bloom.c nsoption.c file.c
+	utf8.c utils.c useragent.c bloom.c nsoption.c file.c idna.c \
+	punycode.c utf8proc.c
 
 S_UTILS := $(addprefix utils/,$(S_UTILS))
diff --git a/utils/idna.c b/utils/idna.c
new file mode 100644
index 000000000..826d041fb
--- /dev/null
+++ b/utils/idna.c
@@ -0,0 +1,625 @@
+/*
+ * Copyright 2014 Chris Young
+ *
+ * This file is part of NetSurf, http://www.netsurf-browser.org/
+ *
+ * NetSurf is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License as published by
+ * the Free Software Foundation; version 2 of the License.
+ *
+ * NetSurf is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program.  If not, see <http://www.gnu.org/licenses/>.
+ */
+
+/** \file
+ * NetSurf international domain name handling (implementation).
+ */
+
+#include <assert.h>
+#include <stdint.h>
+#include <stdlib.h>
+#include <string.h>
+
+#include "utils/errors.h"
+#include "utils/idna.h"
+#include "utils/idna_props.h"
+#include "utils/log.h"
+#include "utils/punycode.h"
+#include "utils/utf8.h"
+#include "utils/utf8proc.h"
+#include "utils/utils.h"
+
+
+int32_t idna_contexto[] = {
+	/* CONTEXTO codepoints which have a rule defined.
+	 * The list is terminated by a 0 entry. */
+	0x00b7, 0x0375, 0x05f3, 0x05f4, 0x30fb, 0x0660, 0x0661,
+	0x0662, 0x0663, 0x0664, 0x0665, 0x0666, 0x0667, 0x0668,
+	0x0669, 0x06f0, 0x06f1, 0x06f2, 0x06f3, 0x06f4, 0x06f5,
+	0x06f6, 0x06f7, 0x06f8, 0x06f9, 0
+};
+
+
+/**
+ * Find the IDNA property of a UCS-4 codepoint
+ *
+ * Scans the idna_derived range table until an entry with a zero
+ * property is reached.
+ *
+ * \param cp	Unicode codepoint
+ * \return IDNA property of the first range containing cp, or
+ *         IDNA_P_DISALLOWED if no range contains it
+ */
+static idna_property idna__cp_property(int32_t cp)
+{
+	const idna_table *t;
+
+	for (t = idna_derived; t->p.property; t++) {
+		if ((cp >= t->start) && (cp <= t->end))
+			return t->p.property;
+	}
+
+	return IDNA_P_DISALLOWED;
+}
+
+
+/**
+ * Find the Joining_Type property of a UCS-4 codepoint
+ *
+ * Scans the idna_joiningtype range table until an entry with a zero
+ * joining type is reached.
+ *
+ * \param cp	Unicode codepoint
+ * \return JT property of the first range containing cp, or
+ *         IDNA_UNICODE_JT_U if no range contains it
+ */
+static idna_unicode_jt idna__jt_property(int32_t cp)
+{
+	const idna_table *t;
+
+	for (t = idna_joiningtype; t->p.jt; t++) {
+		if ((cp >= t->start) && (cp <= t->end))
+			return t->p.jt;
+	}
+
+	return IDNA_UNICODE_JT_U;
+}
+
+
+/**
+ * Check if a CONTEXTO codepoint has a rule defined
+ *
+ * \param cp	Unicode codepoint
+ * \return true if cp appears in idna_contexto, false otherwise
+ */
+static bool idna__contexto_rule(int32_t cp)
+{
+	const int32_t *t;
+
+	/* The table pointer must advance on every iteration; without
+	 * that the scan never reaches the 0 terminator. */
+	for (t = idna_contexto; *t != 0; t++) {
+		if (*t == cp)
+			return true;
+	}
+
+	return false;
+}
+
+
+/**
+ * Check if a CONTEXTJ codepoint has a rule defined,
+ * and conforms to that rule.
+ *
+ * Only U+200C (ZERO WIDTH NON-JOINER) and U+200D (ZERO WIDTH JOINER)
+ * have rules here; any other codepoint is reported as non-conforming.
+ *
+ * \param label	UCS-4 string
+ * \param index	character in the string which is CONTEXTJ
+ * \param len	length of label (in characters/codepoints)
+ * \return true if conforming
+ */
+static bool idna__contextj_rule(int32_t *label, int index, size_t len)
+{
+	const utf8proc_property_t *unicode_props;
+	idna_unicode_jt joining_type;
+	int i;
+	bool match;
+
+	/* These CONTEXTJ rules are defined at
+	 * http://www.iana.org/assignments/idna-tables-5.2.0/idna-tables-5.2.0.xml
+	 */
+
+	/* Never index outside the label */
+	if ((index < 0) || ((size_t)index >= len))
+		return false;
+
+	if (label[index] == 0x200c) {
+		if (index == 0) return false; /* No previous character */
+		unicode_props = utf8proc_get_property(label[index - 1]);
+		if (unicode_props->combining_class == UTF8PROC_CCC_VIRAMA)
+			return true;
+
+		/* Look for a left/dual joining character followed by a
+		 * transparent one somewhere before the ZWNJ */
+		match = false;
+		for (i = 0; i < (index - 1); i++) {
+			joining_type = idna__jt_property(label[i]);
+			if (((joining_type == IDNA_UNICODE_JT_L) ||
+			     (joining_type == IDNA_UNICODE_JT_D)) &&
+			    (idna__jt_property(label[i+1]) == IDNA_UNICODE_JT_T)) {
+				match = true;
+				break;
+			}
+		}
+
+		if (match == false) return false;
+
+		/* The remaining checks need a character after the ZWNJ.
+		 * If it is the last one the rule cannot be met, and
+		 * label[index + 1] would be past the end of the label. */
+		if ((size_t)index + 1 >= len) return false;
+
+		if (idna__jt_property(label[index+1]) != IDNA_UNICODE_JT_T)
+			return false;
+
+		/* Look for a right/dual joining character after the ZWNJ */
+		for (i = (index + 1); i < (int)len; i++) {
+			joining_type = idna__jt_property(label[i]);
+			if ((joining_type == IDNA_UNICODE_JT_R) ||
+			    (joining_type == IDNA_UNICODE_JT_D)) {
+				return true;
+			}
+		}
+
+		return false;
+
+	} else if (label[index] == 0x200d) {
+		if (index == 0) return false; /* No previous character */
+		unicode_props = utf8proc_get_property(label[index - 1]);
+		if (unicode_props->combining_class == UTF8PROC_CCC_VIRAMA)
+			return true;
+		return false;
+	}
+
+	/* No rule defined */
+	return false;
+}
+
+
+/**
+ * Convert a UTF-8 string to UCS-4
+ *
+ * \param utf8_label	UTF-8 string containing host label
+ * \param len	Length of host label (in bytes)
+ * \param ucs4_label	Pointer to update with the output
+ * \param ucs4_len	Pointer to update with the length
+ * \return NSERROR_OK on success, appropriate error otherwise
+ *
+ * If return value != NSERROR_OK, output will be left untouched.
+ *
+ * On success the caller owns *ucs4_label and must free() it.
+ */
+static nserror idna__utf8_to_ucs4(const char *utf8_label, size_t len, int32_t **ucs4_label, size_t *ucs4_len)
+{
+	int32_t *nfc_label;
+	ssize_t nfc_size;
+	/* Buffer capacity in codepoints.  The same figure is used for
+	 * the allocation and for the size given to utf8proc, so the two
+	 * cannot disagree.
+	 * NOTE(review): upstream utf8proc documents bufsize as a count
+	 * of codepoints - confirm against the bundled utf8proc.h */
+	size_t capacity = len * 4;
+
+	nfc_label = malloc(capacity * sizeof(*nfc_label));
+	if (nfc_label == NULL) return NSERROR_NOMEM;
+
+	nfc_size = utf8proc_decompose((const uint8_t *)utf8_label, len,
+		nfc_label, capacity, UTF8PROC_STABLE | UTF8PROC_COMPOSE);
+	if ((nfc_size < 0) || ((size_t)nfc_size > capacity)) {
+		/* Error, or the result did not fit: do not leak */
+		free(nfc_label);
+		return NSERROR_NOMEM;
+	}
+
+	nfc_size = utf8proc_normalise(nfc_label, nfc_size,
+		UTF8PROC_STABLE | UTF8PROC_COMPOSE);
+	if (nfc_size < 0) {
+		free(nfc_label);
+		return NSERROR_NOMEM;
+	}
+
+	*ucs4_label = nfc_label;
+	*ucs4_len = nfc_size;
+
+	return NSERROR_OK;
+}
+
+
+/**
+ * Convert a UCS-4 string to UTF-8
+ *
+ * \param ucs4_label	UCS-4 string containing host label
+ * \param ucs4_len	Length of host label (in characters/codepoints)
+ * \param utf8_label	Pointer to update with the output
+ * \param utf8_len	Pointer to update with the length
+ * \return NSERROR_OK on success, appropriate error otherwise
+ *
+ * If return value != NSERROR_OK, output will be left untouched.
+ * On success the caller owns *utf8_label and must free() it.
+ */
+static nserror idna__ucs4_to_utf8(const int32_t *ucs4_label, size_t ucs4_len, char **utf8_label, size_t *utf8_len)
+{
+	int32_t *nfc_label;
+	ssize_t nfc_size;
+
+	/* utf8proc_reencode() converts the buffer in place, so work on
+	 * a private copy; one spare byte is allocated beyond the input.
+	 * NOTE(review): presumably for the terminator - confirm against
+	 * the bundled utf8proc */
+	nfc_label = malloc(1 + ucs4_len * 4);
+	if (nfc_label == NULL) return NSERROR_NOMEM;
+	memcpy(nfc_label, ucs4_label, ucs4_len * 4);
+
+	nfc_size = utf8proc_reencode(nfc_label, ucs4_len,
+		UTF8PROC_STABLE | UTF8PROC_COMPOSE);
+	if (nfc_size < 0) {
+		free(nfc_label);
+		return NSERROR_NOMEM;
+	}
+
+	*utf8_label = (char *)nfc_label;
+	*utf8_len = nfc_size;
+
+	return NSERROR_OK;
+}
+
+
+/**
+ * Convert a host label in UCS-4 to an ACE version
+ *
+ * \param ucs4_label	UCS-4 NFC string containing host label
+ * \param len	Length of host label (in characters/codepoints)
+ * \param ace_label	ASCII-compatible encoded version
+ * \param out_len	Length of ace_label
+ * \return NSERROR_OK on success, appropriate error otherwise
+ *
+ * If return value != NSERROR_OK, output will be left untouched.
+ *
+ * On success the caller owns *ace_label and must free() it.
+ */
+static nserror idna__ucs4_to_ace(int32_t *ucs4_label, size_t len, char **ace_label, size_t *out_len)
+{
+	char punycode[65];	/* max length of host label + NULL */
+	enum punycode_status status;
+	/* Room left for punycode output after the prefix and terminator */
+	size_t output_length = sizeof(punycode) - SLEN("xn--") - 1;
+	char *ace;
+
+	punycode[0] = 'x';
+	punycode[1] = 'n';
+	punycode[2] = '-';
+	punycode[3] = '-';
+
+	status = punycode_encode(len, (const punycode_uint *)ucs4_label,
+		NULL, &output_length, punycode + 4);
+	if (status == punycode_bad_input) {
+		LOG(("Bad input"));
+		return NSERROR_BAD_ENCODING;
+	} else if (status == punycode_big_output) {
+		LOG(("Output too big"));
+		return NSERROR_NOMEM;
+	} else if (status == punycode_overflow) {
+		LOG(("Overflow"));
+		return NSERROR_NOMEM;
+	}
+
+	output_length += SLEN("xn--");
+	punycode[output_length] = '\0';
+
+	/* Only publish the result once the copy has succeeded, so the
+	 * outputs really are untouched on failure */
+	ace = strdup(punycode);
+	if (ace == NULL) return NSERROR_NOMEM;
+
+	*ace_label = ace;
+	*out_len = output_length;
+
+	return NSERROR_OK;
+}
+
+
+/**
+ * Convert a host label in ACE format to UCS-4
+ *
+ * \param ace_label	ASCII string containing host label
+ * \param ace_len	Length of host label
+ * \param ucs4_label	Pointer to hold UCS4 decoded version
+ * \param ucs4_len	Pointer to hold length of ucs4_label
+ * \return NSERROR_OK on success, appropriate error otherwise
+ *
+ * If return value != NSERROR_OK, output will be left untouched.
+ *
+ * On success the caller owns *ucs4_label and must free() it.
+ */
+static nserror idna__ace_to_ucs4(const char *ace_label, size_t ace_len, int32_t **ucs4_label, size_t *ucs4_len)
+{
+	int32_t *ucs4;
+	enum punycode_status status;
+	size_t output_length = ace_len; /* never exceeds input length */
+
+	/* A label shorter than the prefix would make ace_len - 4 wrap */
+	if (ace_len < SLEN("xn--")) return NSERROR_BAD_ENCODING;
+
+	/* The header should always have been checked before calling */
+	assert((ace_label[0] == 'x') && (ace_label[1] == 'n') &&
+		(ace_label[2] == '-') && (ace_label[3] == '-'));
+
+	/* One spare entry so the terminator written below always fits */
+	ucs4 = malloc((output_length + 1) * sizeof(*ucs4));
+	if (ucs4 == NULL) return NSERROR_NOMEM;
+
+	status = punycode_decode(ace_len - 4, ace_label + 4,
+		&output_length, (punycode_uint *)ucs4, NULL);
+
+	if (status == punycode_bad_input) {
+		LOG(("Bad input"));
+		free(ucs4);
+		return NSERROR_BAD_ENCODING;
+	} else if (status == punycode_big_output) {
+		LOG(("Output too big"));
+		free(ucs4);
+		return NSERROR_NOMEM;
+	} else if (status == punycode_overflow) {
+		LOG(("Overflow"));
+		free(ucs4);
+		return NSERROR_NOMEM;
+	}
+
+	ucs4[output_length] = '\0';
+
+	*ucs4_label = ucs4;
+	*ucs4_len = output_length;
+
+	return NSERROR_OK;
+}
+
+
+/**
+ * Find the length of a host label
+ *
+ * A label ends at '.', ':', a NUL byte, or after max_length bytes,
+ * whichever comes first.
+ *
+ * \param host	String containing a host or FQDN
+ * \param max_length	Length of host string to search (in bytes)
+ * \return Distance to next separator character or end of string
+ */
+static size_t idna__host_label_length(const char *host, size_t max_length)
+{
+	size_t length;
+
+	for (length = 0; length < max_length; length++) {
+		char c = host[length];
+
+		if ((c == '.') || (c == ':') || (c == '\0'))
+			break;
+	}
+
+	return length;
+}
+
+
+/**
+ * Check if a host label is valid for IDNA2008
+ *
+ * \param label	Host label to check (UCS-4)
+ * \param len	Length of host label (in characters/codepoints)
+ * \return true if compliant, false otherwise
+ */
+static bool idna__is_valid(int32_t *label, size_t len)
+{
+	const utf8proc_property_t *unicode_props;
+	idna_property idna_prop;
+	size_t i = 0;
+
+	/* An empty label has no characters to inspect */
+	if (len == 0) return false;
+
+	/* 1. Check that the string is NFC.
+	 * This check is skipped as the conversion to Unicode
+	 * does normalisation as part of the conversion.
+	 */
+
+	/* 2. Check characters 3 and 4 are not '--'.
+	 * Only labels with at least four characters have them. */
+	if ((len >= 4) && (label[2] == 0x002d) && (label[3] == 0x002d)) {
+		LOG(("Check failed: characters 2 and 3 are '--'"));
+		return false;
+	}
+
+	/* 3. Check the first character is not a combining mark */
+	unicode_props = utf8proc_get_property(label[0]);
+
+	if ((unicode_props->category == UTF8PROC_CATEGORY_MN) ||
+		(unicode_props->category == UTF8PROC_CATEGORY_MC) ||
+		(unicode_props->category == UTF8PROC_CATEGORY_ME)) {
+		LOG(("Check failed: character 0 is a combining mark"));
+		return false;
+	}
+
+	/* The casts in the LOG calls below make the arguments match
+	 * the %d and %x conversions. */
+	for (i = 0; i < len; i++) {
+		idna_prop = idna__cp_property(label[i]);
+
+		/* 4. Check characters not DISALLOWED by RFC5892 */
+		if (idna_prop == IDNA_P_DISALLOWED) {
+			LOG(("Check failed: character %d (%x) is DISALLOWED", (int)i, (unsigned int)label[i]));
+			return false;
+		}
+
+		/* 5. Check CONTEXTJ characters conform to defined rules */
+		if (idna_prop == IDNA_P_CONTEXTJ) {
+			if (idna__contextj_rule(label, i, len) == false) {
+				LOG(("Check failed: character %d (%x) does not conform to CONTEXTJ rule", (int)i, (unsigned int)label[i]));
+				return false;
+			}
+		}
+
+		/* 6. Check CONTEXTO characters have a rule defined */
+		/*\todo optionally we can check conformance to this rule */
+		if (idna_prop == IDNA_P_CONTEXTO) {
+			if (idna__contexto_rule(label[i]) == false) {
+				LOG(("Check failed: character %d (%x) has no CONTEXTO rule defined", (int)i, (unsigned int)label[i]));
+				return false;
+			}
+		}
+
+		/* 7. Check characters are not UNASSIGNED */
+		if (idna_prop == IDNA_P_UNASSIGNED) {
+			LOG(("Check failed: character %d (%x) is UNASSIGNED", (int)i, (unsigned int)label[i]));
+			return false;
+		}
+
+		/*\todo 8. (optionally) check Bidi compliance */
+	}
+
+	return true;
+}
+
+
+/**
+ * Check if a host label is LDH
+ *
+ * LDH means letters, digits and hyphens only, with no hyphen in the
+ * first or last position.
+ *
+ * \param label	Host label to check
+ * \param len	Length of host label
+ * \return true if LDH compliant, false otherwise
+ */
+static bool idna__is_ldh(const char *label, size_t len)
+{
+	size_t i;
+
+	/* An empty label has no last character to inspect */
+	if (len == 0) return false;
+
+	/* Check for leading or trailing hyphens */
+	if ((label[0] == '-') || (label[len - 1] == '-'))
+		return false;
+
+	/* Check for non-alphanumeric, non-hyphen characters */
+	for (i = 0; i < len; i++) {
+		char c = label[i];
+
+		if (c == '-') continue;
+		if ((c >= '0') && (c <= '9')) continue;
+		if ((c >= 'a') && (c <= 'z')) continue;
+		if ((c >= 'A') && (c <= 'Z')) continue;
+
+		return false;
+	}
+
+	return true;
+}
+
+
+/**
+ * Check if a host label appears to be ACE
+ *
+ * \param label	Host label to check
+ * \param len	Length of host label
+ * \return true if ACE compliant, false otherwise
+ */
+static bool idna__is_ace(const char *label, size_t len)
+{
+	/* Too short to hold the ACE prefix */
+	if (len < SLEN("xn--"))
+		return false;
+
+	/* Check it is a valid DNS string */
+	if (idna__is_ldh(label, len) == false)
+		return false;
+
+	/* Check the ACE prefix is present */
+	if ((label[0] == 'x') && (label[1] == 'n') &&
+		(label[2] == '-') && (label[3] == '-'))
+		return true;
+
+	return false;
+}
+
+
+/**
+ * Verify an ACE label is valid
+ *
+ * The label is decoded and re-encoded; it is accepted only if the
+ * result is identical to the input.
+ *
+ * \param label	Host label to check
+ * \param len	Length of label
+ * \return true if valid, false otherwise
+ */
+static bool idna__verify(const char *label, size_t len)
+{
+	nserror error;
+	int32_t *ucs4;
+	char *ace;
+	size_t ucs4_len, ace_len;
+	bool same;
+
+	error = idna__ace_to_ucs4(label, len,
+			&ucs4, &ucs4_len);
+	if (error != NSERROR_OK) return false;
+
+	error = idna__ucs4_to_ace(ucs4, ucs4_len,
+			&ace, &ace_len);
+	free(ucs4);
+	if (error != NSERROR_OK) return false;
+
+	same = (len == ace_len) && (strncmp(label, ace, len) == 0);
+	if (same == false) {
+		LOG(("Re-encoded ACE label %s does not match input", ace));
+	}
+	free(ace);
+
+	return same;
+}
+
+
+/* exported interface 
documented in idna.h */
+nserror idna_encode(const char *host, size_t len, char **ace_host, size_t *ace_len)
+{
+	nserror error;
+	int32_t *ucs4_host;
+	size_t label_len, output_len, ucs4_len, fqdn_len = 0;
+	char fqdn[256];
+	char *output, *result, *fqdn_p = fqdn;
+
+	while ((label_len = idna__host_label_length(host, len)) != 0) {
+		if (idna__is_ldh(host, label_len) == false) {
+			/* This string is IDN or invalid */
+
+			/* Convert to Unicode */
+			error = idna__utf8_to_ucs4(host, label_len,
+					&ucs4_host, &ucs4_len);
+			if (error != NSERROR_OK) return error;
+
+			/* Check this is valid for conversion */
+			if (idna__is_valid(ucs4_host, ucs4_len) == false) {
+				free(ucs4_host);
+				return NSERROR_BAD_URL;
+			}
+
+			/* Convert to ACE */
+			error = idna__ucs4_to_ace(ucs4_host, ucs4_len,
+					&output, &output_len);
+			free(ucs4_host);
+			if (error != NSERROR_OK) return error;
+
+			/* The label and the separator written after it
+			 * must both fit in fqdn */
+			if (output_len >= sizeof(fqdn) - fqdn_len) {
+				free(output);
+				return NSERROR_BAD_URL;
+			}
+			memcpy(fqdn_p, output, output_len);
+			free(output);
+			fqdn_p += output_len;
+			fqdn_len += output_len;
+		} else {
+			/* This is already a DNS-valid ASCII string */
+			if ((idna__is_ace(host, label_len) == true) &&
+				(idna__verify(host, label_len) == false)) {
+				LOG(("Cannot verify ACE label %s", host));
+				return NSERROR_BAD_URL;
+			}
+			if (label_len >= sizeof(fqdn) - fqdn_len)
+				return NSERROR_BAD_URL;
+			memcpy(fqdn_p, host, label_len);
+			fqdn_p += label_len;
+			fqdn_len += label_len;
+		}
+
+		*fqdn_p = '.';
+		fqdn_p++;
+		fqdn_len++;
+
+		host += label_len;
+		/* Stop once the caller-supplied length is used up, rather
+		 * than look at bytes beyond it (len would wrap below) */
+		if (label_len >= len) break;
+		if ((*host == '\0') || (*host == ':')) break;
+		host++;
+		len = len - label_len - 1;
+	}
+
+	/* No labels at all (empty host, or one starting with a
+	 * separator): there is no trailing '.' to strip */
+	if (fqdn_len == 0) return NSERROR_BAD_URL;
+
+	/* Replace the final separator with the terminator */
+	fqdn_p--;
+	*fqdn_p = '\0';
+
+	result = strdup(fqdn);
+	if (result == NULL) return NSERROR_NOMEM;
+
+	*ace_host = result;
+	*ace_len = fqdn_len - 1; /* last character is NULL */
+
+	return NSERROR_OK;
+}
+
+
+/* exported interface documented in idna.h */
+nserror idna_decode(const char *ace_host, size_t ace_len, char **host, size_t *host_len)
+{
+	nserror error;
+	int32_t *ucs4_host;
+	size_t label_len, output_len, ucs4_len, fqdn_len = 0;
+	char fqdn[256];
+	char *output, *result, *fqdn_p = fqdn;
+
+	while ((label_len = idna__host_label_length(ace_host, ace_len)) != 0) {
+		if (idna__is_ace(ace_host, label_len) == true) {
+			/* This string is DNS-valid and (probably) encoded */
+
+			/* Decode to Unicode */
+			error = idna__ace_to_ucs4(ace_host, label_len,
+					&ucs4_host, &ucs4_len);
+			if (error != NSERROR_OK) return error;
+
+			/* Convert to UTF-8 */
+			error = idna__ucs4_to_utf8(ucs4_host, ucs4_len,
+					&output, &output_len);
+			free(ucs4_host);
+			if (error != NSERROR_OK) return error;
+
+			/* The label and the separator written after it
+			 * must both fit in fqdn */
+			if (output_len >= sizeof(fqdn) - fqdn_len) {
+				free(output);
+				return NSERROR_BAD_URL;
+			}
+			/* output_len is what idna__ucs4_to_utf8 reported
+			 * for output; copy exactly that many bytes */
+			memcpy(fqdn_p, output, output_len);
+			free(output);
+			fqdn_p += output_len;
+			fqdn_len += output_len;
+		} else {
+			/* Not ACE */
+			if (label_len >= sizeof(fqdn) - fqdn_len)
+				return NSERROR_BAD_URL;
+			memcpy(fqdn_p, ace_host, label_len);
+			fqdn_p += label_len;
+			fqdn_len += label_len;
+		}
+
+		*fqdn_p = '.';
+		fqdn_p++;
+		fqdn_len++;
+
+		ace_host += label_len;
+		/* Stop once the caller-supplied length is used up, rather
+		 * than look at bytes beyond it (ace_len would wrap below) */
+		if (label_len >= ace_len) break;
+		if ((*ace_host == '\0') || (*ace_host == ':')) break;
+		ace_host++;
+		ace_len = ace_len - label_len - 1;
+	}
+
+	/* No labels at all: there is no trailing '.' to strip */
+	if (fqdn_len == 0) return NSERROR_BAD_URL;
+
+	/* Replace the final separator with the terminator */
+	fqdn_p--;
+	*fqdn_p = '\0';
+
+	result = strdup(fqdn);
+	if (result == NULL) return NSERROR_NOMEM;
+
+	*host = result;
+	*host_len = fqdn_len - 1; /* last character is NULL */
+
+	return NSERROR_OK;
+}
+
diff --git a/utils/idna.h b/utils/idna.h
new file mode 100644
index 000000000..c59a22763
--- /dev/null
+++ b/utils/idna.h
@@ -0,0 +1,54 @@
+/*
+ * Copyright 2014 Chris Young
+ *
+ * This file is part of NetSurf, http://www.netsurf-browser.org/
+ *
+ * NetSurf is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License as published by
+ * the Free Software Foundation; version 2 of the License.
+ *
+ * NetSurf is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program.  If not, see <http://www.gnu.org/licenses/>.
+ */
+
+/** \file
+ * NetSurf international domain name handling (interface).
+ */
+
+#ifndef _NETSURF_UTILS_IDNA_H_
+#define _NETSURF_UTILS_IDNA_H_
+
+/* size_t and nserror are used by the prototypes below */
+#include <stddef.h>
+
+#include "utils/errors.h"
+
+/**
+ * Convert a hostname to an ACE version suitable for DNS lookup
+ *
+ * \param host	String containing host
+ * \param len	Length of host string
+ * \param ace_host	Pointer to update with the output
+ * \param ace_len	Pointer to update with length of ace_host
+ * \return NSERROR_OK on success, appropriate error otherwise
+ *
+ * If return value != NSERROR_OK, output will be left untouched.
+ * On success *ace_host is heap allocated; the caller must free() it.
+ */
+nserror idna_encode(const char *host, size_t len, char **ace_host, size_t *ace_len);
+
+
+/**
+ * Convert a hostname from ACE to UTF-8 suitable for display
+ *
+ * \param ace_host	String containing host
+ * \param ace_len	Length of host string
+ * \param host	Pointer to update with the output
+ * \param host_len	Pointer to update with length of host
+ * \return NSERROR_OK on success, appropriate error otherwise
+ *
+ * If return value != NSERROR_OK, output will be left untouched.
+ * On success *host is heap allocated; the caller must free() it.
+ */
+nserror idna_decode(const char *ace_host, size_t ace_len, char **host, size_t *host_len);
+
+#endif
+
diff --git a/utils/nsurl.c b/utils/nsurl.c
index 5c0a48511..7e8792cff 100644
--- a/utils/nsurl.c
+++ b/utils/nsurl.c
@@ -28,6 +28,7 @@
 
 #include "utils/corestrings.h"
 #include "utils/errors.h"
+#include "utils/idna.h"
 #include "utils/log.h"
 #include "utils/nsurl.h"
 #include "utils/utils.h"
@@ -690,8 +691,10 @@ static nserror nsurl__create_from_section(const char * const url_s,
 	const char *pos;
 	const char *pos_url_s;
 	char *norm_start = pos_norm;
+	char *host;
 	size_t copy_len;
 	size_t length;
+	size_t host_len;
 	enum {
 		NSURL_F_NO_PORT		= (1 << 0)
 	} flags = 0;
@@ -756,7 +759,8 @@ static nserror nsurl__create_from_section(const char * const url_s,
 				continue;
 			}
 
-			if (nsurl__is_unreserved(ascii_offset) == false) {
+			if ((section != URL_SCHEME && section != URL_HOST) &&
+				(nsurl__is_unreserved(ascii_offset) == false)) {
 				/* This character should be escaped after all,
 				 * just let it get copied */
 				copy_len += 3;
@@ -778,7 +782,8 @@ 
static nserror nsurl__create_from_section(const char * const url_s,
 
 			length -= 2;
 
-		} else if (nsurl__is_no_escape(*pos) == false) {
+		} else if ((section != URL_SCHEME && section != URL_HOST) &&
+				(nsurl__is_no_escape(*pos) == false)) {
 
 			/* This needs to be escaped */
 			if (copy_len > 0) {
@@ -955,9 +960,16 @@ static nserror nsurl__create_from_section(const char * const url_s,
 			}
 
 			/* host */
-			if (lwc_intern_string(norm_start, length,
-					&url->host) != lwc_error_ok) {
-				return NSERROR_NOMEM;
+			/* Encode host according to IDNA2008 */
+			if (idna_encode(norm_start, length, &host, &host_len) == NSERROR_OK) {
+				if (lwc_intern_string(host, host_len,
+						&url->host) != lwc_error_ok) {
+					free(host);
+					return NSERROR_NOMEM;
+				}
+				free(host);
+			} else {
+				return NSERROR_BAD_URL;
 			}
 		}
 
-- 
cgit v1.2.3