/* * This file is part of NetSurf, http://netsurf.sourceforge.net/ * Licensed under the GNU General Public License, * http://www.opensource.org/licenses/gpl-license * Copyright 2004 James Bursa */ /** \file * URL parsing and joining (implementation). */ #include #include #include #include #include #include #include #include "netsurf/utils/log.h" #include "netsurf/utils/url.h" #include "netsurf/utils/utils.h" regex_t url_re, url_up_re, url_nice_re; /** * Initialise URL routines. * * Compiles regular expressions required by the url_ functions. */ void url_init(void) { /* regex from RFC 2396 */ regcomp_wrapper(&url_re, "^[[:space:]]*(([a-zA-Z][-a-zA-Z0-9+.]*):)?" "(//([^/?#[:space:]]*))?([^?#[:space:]]*)" "(\\?([^#[:space:]]*))?(#([^[:space:]]*))?" "[[:space:]]*$", REG_EXTENDED); regcomp_wrapper(&url_up_re, "/([^/]|[.][^./]|[^./][.]|[^/][^/][^/]+)?/[.][.](/|$)", REG_EXTENDED); regcomp_wrapper(&url_nice_re, "^([^.]{0,4}[.])?([^.][^.][.])?([^/?&;.=]*)" "(=[^/?&;.]*)?[/?&;.]", REG_EXTENDED); } /** * Normalize a URL. * * \param url an absolute URL * \return cleaned up url, allocated on the heap, or 0 on failure * * If there is no scheme, http:// is added. The scheme and host are * lower-cased. Default ports are removed (http only). An empty path is * replaced with "/". Characters are unescaped if safe. */ url_func_result url_normalize(const char *url, char **result) { char c; int m; int i; size_t len; bool http = false; regmatch_t match[10]; *result = NULL; if ((m = regexec(&url_re, url, 10, match, 0))) { LOG(("url '%s' failed to match regex", url)); return URL_FUNC_FAILED; } len = strlen(url); if (match[1].rm_so == -1) { /* scheme missing: add http:// and reparse */ /* LOG(("scheme missing: using http"));*/ if ((*result = malloc(len + 13)) == NULL) { LOG(("malloc failed")); return URL_FUNC_NOMEM; } strcpy(*result, "http://"); strcpy(*result + sizeof("http://")-1, url); if ((m = regexec(&url_re, *result, 10, match, 0))) { LOG(("url '%s' failed to match regex", (*result))); free(*result); return URL_FUNC_FAILED; } len += sizeof("http://")-1; } else { if ((*result = malloc(len + 6)) == NULL) { LOG(("malloc failed")); return URL_FUNC_NOMEM; } strcpy(*result, url); } /*for (unsigned int i = 0; i != 10; i++) { if (match[i].rm_so == -1) continue; fprintf(stderr, "%i: '%.*s'\n", i, match[i].rm_eo - match[i].rm_so, res + match[i].rm_so); }*/ /* see RFC 2616 section 3.2.3 */ /* make scheme lower-case */ if (match[2].rm_so != -1) { for (i = match[2].rm_so; i != match[2].rm_eo; i++) (*result)[i] = tolower((*result)[i]); if (match[2].rm_eo == 4 && (*result)[0] == 'h' && (*result)[1] == 't' && (*result)[2] == 't' && (*result)[3] == 'p') http = true; } /* make empty path into "/" */ if (match[5].rm_so != -1 && match[5].rm_so == match[5].rm_eo) { memmove((*result) + match[5].rm_so + 1, (*result) + match[5].rm_so, len - match[5].rm_so + 1); (*result)[match[5].rm_so] = '/'; len++; } /* make host lower-case */ if (match[4].rm_so != -1) { for (i = match[4].rm_so; i != match[4].rm_eo; i++) { if ((*result)[i] == ':') { if (http && (*result)[i + 1] == '8' && (*result)[i + 2] == '0' && i + 3 == match[4].rm_eo) { memmove((*result) + i, (*result) + i + 3, len - match[4].rm_eo); len -= 3; (*result)[len] = '\0'; } else if (i + 1 == match[4].rm_eo) { memmove((*result) + i, (*result) + i + 1, len - match[4].rm_eo); len--; (*result)[len] = '\0'; } break; } (*result)[i] = tolower((*result)[i]); } } /* unescape non-"reserved" escaped characters */ for (i = 0; (unsigned)i != len; i++) { if ((*result)[i] != '%') continue; c = tolower((*result)[i + 1]); if ('0' <= c && c <= '9') m = 16 * (c - '0'); else if ('a' <= c && c <= 'f') m = 16 * (c - 'a' + 10); else continue; c = tolower((*result)[i + 2]); if ('0' <= c && c <= '9') m += c - '0'; else if ('a' <= c && c <= 'f') m += c - 'a' + 10; else continue; if (m <= 0x20 || strchr(";/?:@&=+$," "<>#%\"{}|\\^[]`", m)) { i += 2; continue; } (*result)[i] = m; memmove((*result) + i + 1, (*result) + i + 3, len - i - 2); len -= 2; } return URL_FUNC_OK; } /** * Resolve a relative URL to absolute form. * * \param rel relative URL * \param base base URL, must be absolute and cleaned as by url_normalize() * \return an absolute URL, allocated on the heap, or 0 on failure */ url_func_result url_join(const char *rel, const char *base, char **result) { int m; int i, j; char *buf = 0; const char *scheme = 0, *authority = 0, *path = 0, *query = 0, *fragment = 0; int scheme_len = 0, authority_len = 0, path_len = 0, query_len = 0, fragment_len = 0; regmatch_t base_match[10]; regmatch_t rel_match[10]; regmatch_t up_match[3]; (*result) = 0; /* see RFC 2396 section 5.2 */ m = regexec(&url_re, base, 10, base_match, 0); if (m) { LOG(("base url '%s' failed to match regex", base)); return URL_FUNC_FAILED; } /*for (unsigned int i = 0; i != 10; i++) { if (base_match[i].rm_so == -1) continue; fprintf(stderr, "%i: '%.*s'\n", i, base_match[i].rm_eo - base_match[i].rm_so, base + base_match[i].rm_so); }*/ if (base_match[2].rm_so == -1) { LOG(("base url '%s' is not absolute", base)); return URL_FUNC_FAILED; } scheme = base + base_match[2].rm_so; scheme_len = base_match[2].rm_eo - base_match[2].rm_so; if (base_match[4].rm_so != -1) { authority = base + base_match[4].rm_so; authority_len = base_match[4].rm_eo - base_match[4].rm_so; } path = base + base_match[5].rm_so; path_len = base_match[5].rm_eo - base_match[5].rm_so; /* 1) */ m = regexec(&url_re, rel, 10, rel_match, 0); if (m) { LOG(("relative url '%s' failed to match regex", rel)); return URL_FUNC_FAILED; } /* 2) */ /* base + "#s" = (current document)#s (see Appendix C.1) */ /** \todo does (current document) include the query? */ if (rel_match[9].rm_so != -1) { fragment = rel + rel_match[9].rm_so; fragment_len = rel_match[9].rm_eo - rel_match[9].rm_so; } if (rel_match[5].rm_so == rel_match[5].rm_eo && rel_match[2].rm_so == -1 && rel_match[4].rm_so == -1 && rel_match[6].rm_so == -1) { goto step7; } if (rel_match[7].rm_so != -1) { query = rel + rel_match[7].rm_so; query_len = rel_match[7].rm_eo - rel_match[7].rm_so; } /* 3) */ if (rel_match[2].rm_so != -1) { scheme = rel + rel_match[2].rm_so; scheme_len = rel_match[2].rm_eo - rel_match[2].rm_so; authority = 0; authority_len = 0; if (rel_match[4].rm_so != -1) { authority = rel + rel_match[4].rm_so; authority_len = rel_match[4].rm_eo - rel_match[4].rm_so; } path = rel + rel_match[5].rm_so; path_len = rel_match[5].rm_eo - rel_match[5].rm_so; goto step7; } /* 4) */ if (rel_match[4].rm_so != -1) { authority = rel + rel_match[4].rm_so; authority_len = rel_match[4].rm_eo - rel_match[4].rm_so; path = rel + rel_match[5].rm_so; path_len = rel_match[5].rm_eo - rel_match[5].rm_so; goto step7; } /* 5) */ if (rel[rel_match[5].rm_so] == '/') { path = rel + rel_match[5].rm_so; path_len = rel_match[5].rm_eo - rel_match[5].rm_so; goto step7; } /* 6) */ buf = malloc(path_len + rel_match[5].rm_eo + 10); if (!buf) { LOG(("malloc failed")); return URL_FUNC_NOMEM; } /* a) */ strncpy(buf, path, path_len); for (; path_len != 0 && buf[path_len - 1] != '/'; path_len--) ; /* b) */ strncpy(buf + path_len, rel + rel_match[5].rm_so, rel_match[5].rm_eo - rel_match[5].rm_so); path_len += rel_match[5].rm_eo - rel_match[5].rm_so; /* c) */ buf[path_len] = 0; for (i = j = 0; j != path_len; ) { if (j && buf[j - 1] == '/' && buf[j] == '.' && buf[j + 1] == '/') j += 2; else buf[i++] = buf[j++]; } path_len = i; /* d) */ if (2 <= path_len && buf[path_len - 2] == '/' && buf[path_len - 1] == '.') path_len--; /* e) and f) */ while (1) { buf[path_len] = 0; m = regexec(&url_up_re, buf, 3, up_match, 0); if (m) break; if (up_match[1].rm_eo + 4 <= path_len) { memmove(buf + up_match[1].rm_so, buf + up_match[1].rm_eo + 4, path_len - up_match[1].rm_eo - 4); path_len -= up_match[1].rm_eo - up_match[1].rm_so + 4; } else path_len -= up_match[1].rm_eo - up_match[1].rm_so + 3; } buf[path_len] = 0; path = buf; step7: /* 7) */ (*result) = malloc(scheme_len + 1 + 2 + authority_len + path_len + 1 + 1 + query_len + 1 + fragment_len + 1); if (!(*result)) { LOG(("malloc failed")); free(buf); return URL_FUNC_NOMEM; } strncpy((*result), scheme, scheme_len); (*result)[scheme_len] = ':'; i = scheme_len + 1; if (authority) { (*result)[i++] = '/'; (*result)[i++] = '/'; strncpy((*result) + i, authority, authority_len); i += authority_len; } if (path_len) { strncpy((*result) + i, path, path_len); i += path_len; } else { (*result)[i++] = '/'; } if (query) { (*result)[i++] = '?'; strncpy((*result) + i, query, query_len); i += query_len; } if (fragment) { (*result)[i++] = '#'; strncpy((*result) + i, fragment, fragment_len); i += fragment_len; } (*result)[i] = 0; free(buf); return URL_FUNC_OK; } /** * Return the host name from an URL. * * \param url an absolute URL * \returns host name allocated on heap, or 0 on failure */ url_func_result url_host(const char *url, char **result) { int m; regmatch_t match[10]; (*result) = 0; m = regexec(&url_re, url, 10, match, 0); if (m) { LOG(("url '%s' failed to match regex", url)); return URL_FUNC_FAILED; } if (match[4].rm_so == -1) return URL_FUNC_FAILED; (*result) = malloc(match[4].rm_eo - match[4].rm_so + 1); if (!(*result)) { LOG(("malloc failed")); return URL_FUNC_NOMEM; } strncpy((*result), url + match[4].rm_so, match[4].rm_eo - match[4].rm_so); (*result)[match[4].rm_eo - match[4].rm_so] = 0; return URL_FUNC_OK; } /** * Return the scheme name from an URL * * \param url an absolute URL * \param result pointer to pointer to buffer to hold scheme name * \return URL_FUNC_OK on success */ url_func_result url_scheme(const char *url, char **result) { int m; regmatch_t match[10]; (*result) = 0; m = regexec(&url_re, url, 10, match, 0); if (m) { LOG(("url '%s' failed to match regex", url)); return URL_FUNC_FAILED; } if (match[2].rm_so == -1) return URL_FUNC_FAILED; (*result) = malloc(match[2].rm_eo - match[2].rm_so + 1); if (!(*result)) { LOG(("malloc failed")); return URL_FUNC_NOMEM; } strncpy((*result), url + match[2].rm_so, match[2].rm_eo - match[2].rm_so); (*result)[match[2].rm_eo - match[2].rm_so] = 0; return URL_FUNC_OK; } /** * Attempt to find a nice filename for a URL. * * \param url an absolute URL * \returns filename allocated on heap, or 0 on memory exhaustion */ url_func_result url_nice(const char *url, char **result) { unsigned int i, j, k = 0, so; unsigned int len; const char *colon; char buf[40]; char *rurl; int m; regmatch_t match[10]; /* just in case */ (*result) = 0; (*result) = malloc(40); if (!(*result)) return URL_FUNC_NOMEM; len = strlen(url); assert(len != 0); rurl = malloc(len + 1); if (!rurl) { free((*result)); return URL_FUNC_NOMEM; } /* reverse url into rurl */ for (i = 0, j = len - 1; i != len; i++, j--) rurl[i] = url[j]; rurl[len] = 0; /* prepare a fallback: always succeeds */ colon = strchr(url, ':'); if (colon) url = colon + 1; strncpy((*result), url, 15); (*result)[15] = 0; for (i = 0; (*result)[i]; i++) if (!isalnum((*result)[i])) (*result)[i] = '_'; /* append nice pieces */ j = 0; do { m = regexec(&url_nice_re, rurl + j, 10, match, 0); if (m) break; if (match[3].rm_so != match[3].rm_eo) { so = match[3].rm_so; i = match[3].rm_eo - so; if (15 < i) { so = match[3].rm_eo - 15; i = 15; } if (15 < k + i) break; if (k) k++; strncpy(buf + k, rurl + j + so, i); k += i; buf[k] = 160; /* nbsp */ } j += match[0].rm_eo; } while (j != len); if (k == 0) { free(rurl); return URL_FUNC_OK; } /* reverse back */ for (i = 0, j = k - 1; i != k; i++, j--) (*result)[i] = buf[j]; (*result)[k] = 0; for (i = 0; i != k; i++) if ((*result)[i] != (char) 0xa0 && !isalnum((*result)[i])) (*result)[i] = '_'; free(rurl); return URL_FUNC_OK; } #ifdef TEST int main(int argc, char *argv[]) { int i; url_func_result res; char *s; url_init(); for (i = 1; i != argc; i++) { /* printf("==> '%s'\n", argv[i]); res = url_normalize(argv[i], &s); if (res == URL_FUNC_OK) { printf("<== '%s'\n", s); free(s); }*/ /* printf("==> '%s'\n", argv[i]); res = url_host(argv[i], &s); if (res == URL_FUNC_OK) { printf("<== '%s'\n", s); free(s); }*/ if (1 != i) { res = url_join(argv[i], argv[1], &s); if (res == URL_FUNC_OK) { printf("'%s' + '%s' \t= '%s'\n", argv[1], argv[i], s); free(s); } } /* res = url_nice(argv[i], &s); if (res == URL_FUNC_OK) { printf("'%s'\n", s); free(s); }*/ } return 0; } void regcomp_wrapper(regex_t *preg, const char *regex, int cflags) { char errbuf[200]; int r; r = regcomp(preg, regex, cflags); if (r) { regerror(r, preg, errbuf, sizeof errbuf); fprintf(stderr, "Failed to compile regexp '%s'\n", regex); fprintf(stderr, "error: %s\n", errbuf); exit(1); } } #endif