From a0414fd967cb0eb6d3257b380e4cc88a0d839854 Mon Sep 17 00:00:00 2001 From: Richard Wilson Date: Tue, 27 Jun 2006 17:59:32 +0000 Subject: Massively optimise common URL functions. This should (a) drastically reduce the start-up time for users with a large history/hotlist, and (b) decrease the loading time for pages with a sizable number of references. svn path=/trunk/netsurf/; revision=2656 --- utils/url.c | 559 ++++++++++++++++++++++++++++++++++-------------------------- 1 file changed, 321 insertions(+), 238 deletions(-) diff --git a/utils/url.c b/utils/url.c index 0d1332bce..3feaf2e4b 100644 --- a/utils/url.c +++ b/utils/url.c @@ -2,6 +2,7 @@ * This file is part of NetSurf, http://netsurf.sourceforge.net/ * Licensed under the GNU General Public License, * http://www.opensource.org/licenses/gpl-license + * Copyright 2006 Richard Wilson * Copyright 2005 James Bursa * Copyright 2005 John M Bell */ @@ -22,6 +23,24 @@ #include "netsurf/utils/url.h" #include "netsurf/utils/utils.h" +struct url_components { + union { + char *storage; /* buffer used for all the following data */ + int *users; + } internal; + char *scheme; + char *authority; + char *path; + char *query; + char *fragment; +}; + +url_func_result url_get_components(const char *url, + struct url_components *result); +void url_destroy_components(struct url_components *result); + +char *cached_url = NULL; +struct url_components cached_components; regex_t url_re, url_up_re; @@ -225,42 +244,34 @@ url_func_result url_join(const char *rel, const char *base, char **result) regmatch_t rel_match[10]; regmatch_t up_match[3]; + url_func_result status; + struct url_components components; + (*result) = 0; - /* see RFC 2396 section 5.2 */ - m = regexec(&url_re, base, 10, base_match, 0); - if (m) { - LOG(("base url '%s' failed to match regex", base)); - return URL_FUNC_FAILED; - } - /*for (unsigned int i = 0; i != 10; i++) { - if (base_match[i].rm_so == -1) - continue; - fprintf(stderr, "%i: '%.*s'\n", i, - base_match[i].rm_eo - base_match[i].rm_so, - base + base_match[i].rm_so); - }*/ - if (base_match[URL_RE_SCHEME].rm_so == -1) { - LOG(("base url '%s' is not absolute", base)); - return URL_FUNC_FAILED; - } - scheme = base + base_match[URL_RE_SCHEME].rm_so; - scheme_len = base_match[URL_RE_SCHEME].rm_eo - - base_match[URL_RE_SCHEME].rm_so; - if (base_match[URL_RE_AUTHORITY].rm_so != -1) { - authority = base + base_match[URL_RE_AUTHORITY].rm_so; - authority_len = base_match[URL_RE_AUTHORITY].rm_eo - - base_match[URL_RE_AUTHORITY].rm_so; + assert(base); + + /* break down the base url */ + status = url_get_components(base, &components); + if (status != URL_FUNC_OK) { + LOG(("base url '%s' failed to get components", base)); + return URL_FUNC_FAILED; } - path = base + base_match[URL_RE_PATH].rm_so; - path_len = base_match[URL_RE_PATH].rm_eo - - base_match[URL_RE_PATH].rm_so; + + scheme = components.scheme; + scheme_len = strlen(scheme); + authority = components.authority; + if (authority) + authority_len = strlen(authority); + path = components.path; + path_len = strlen(path); /* 1) */ m = regexec(&url_re, rel, 10, rel_match, 0); if (m) { LOG(("relative url '%s' failed to match regex", rel)); + url_destroy_components(&components); return URL_FUNC_FAILED; } @@ -339,6 +350,7 @@ url_func_result url_join(const char *rel, const char *base, char **result) buf = malloc(path_len + rel_match[URL_RE_PATH].rm_eo + 10); if (!buf) { LOG(("malloc failed")); + url_destroy_components(&components); return URL_FUNC_NOMEM; } /* a) */ @@ -393,6 +405,7 @@ step7: /* 7) */ if (!(*result)) { LOG(("malloc failed")); free(buf); + url_destroy_components(&components); return URL_FUNC_NOMEM; } @@ -424,6 +437,7 @@ step7: /* 7) */ (*result)[i] = 0; free(buf); + url_destroy_components(&components); return URL_FUNC_OK; } @@ -439,38 +453,41 @@ step7: /* 7) */ url_func_result url_host(const char *url, char **result) { - int m; - regmatch_t match[10]; - - (*result) = 0; + url_func_result status; + struct url_components components; + char *host_start, *host_end; - m = regexec(&url_re, url, 10, match, 0); - if (m) { - LOG(("url '%s' failed to match regex", url)); - return URL_FUNC_FAILED; - } - if (match[URL_RE_AUTHORITY].rm_so == -1) - return URL_FUNC_FAILED; + assert(url); - (*result) = malloc(match[URL_RE_AUTHORITY].rm_eo - - match[URL_RE_AUTHORITY].rm_so + 1); - if (!(*result)) { - LOG(("malloc failed")); - return URL_FUNC_NOMEM; + status = url_get_components(url, &components); + if (status == URL_FUNC_OK) { + if (!components.authority) { + url_destroy_components(&components); + return URL_FUNC_FAILED; + } + host_start = strchr(components.authority, '@'); + host_start = host_start ? host_start + 1 : components.authority; + host_end = strchr(host_start, ':'); + if (!host_end) + host_end = components.authority + + strlen(components.authority); + + *result = malloc(host_end - host_start + 1); + if (!(*result)) { + url_destroy_components(&components); + return URL_FUNC_FAILED; + } + memcpy((*result), host_start, host_end - host_start); + (*result)[host_end - host_start] = '\0'; } - strncpy((*result), url + match[URL_RE_AUTHORITY].rm_so, - match[URL_RE_AUTHORITY].rm_eo - - match[URL_RE_AUTHORITY].rm_so); - (*result)[match[URL_RE_AUTHORITY].rm_eo - - match[URL_RE_AUTHORITY].rm_so] = 0; - - return URL_FUNC_OK; + url_destroy_components(&components); + return status; } /** * Return the scheme name from an URL. - * + * * See RFC 3986, 3.1 for reference. * * \param url an absolute URL @@ -480,37 +497,19 @@ url_func_result url_host(const char *url, char **result) url_func_result url_scheme(const char *url, char **result) { - const char *scheme_end; - + url_func_result status; + struct url_components components; + assert(url); - - /* ensure the first character is alpha */ - if (!isalpha(*url)) - return URL_FUNC_FAILED; - - /* continue checking until the end marker (':') of the scheme for - * the format ALPHA *( ALPHA / DIGIT / "+" / "-" / "." ) */ - for (scheme_end = url; - ((*scheme_end != '\0') && (*scheme_end != ':')); - scheme_end++) { - if (!isalnum(*scheme_end) && - (*scheme_end != '+') && - (*scheme_end != '-') && - (*scheme_end != '.')) - return URL_FUNC_FAILED; - } - if (*scheme_end == '\0') - return URL_FUNC_FAILED; - - /* make a copy of the result for the caller */ - (*result) = malloc(scheme_end - url + 1); - if (!(*result)) { - LOG(("malloc failed")); - return URL_FUNC_NOMEM; + + status = url_get_components(url, &components); + if (status == URL_FUNC_OK) { + *result = strdup(components.scheme); + if (!(*result)) + status = URL_FUNC_NOMEM; } - strncpy((*result), url, scheme_end - url); - (*result)[scheme_end - url] = '\0'; - return URL_FUNC_OK; + url_destroy_components(&components); + return status; } @@ -524,40 +523,27 @@ url_func_result url_scheme(const char *url, char **result) url_func_result url_canonical_root(const char *url, char **result) { - int m, scheme_len, authority_len; - regmatch_t match[10]; + url_func_result status; + struct url_components components; - (*result) = 0; - - m = regexec(&url_re, url, 10, match, 0); - if (m) { - LOG(("url '%s' failed to match regex", url)); - return URL_FUNC_FAILED; - } - if (match[URL_RE_SCHEME].rm_so == -1 || - match[URL_RE_AUTHORITY].rm_so == -1) - return URL_FUNC_FAILED; - - scheme_len = match[URL_RE_SCHEME].rm_eo - match[URL_RE_SCHEME].rm_so; - authority_len = match[URL_RE_AUTHORITY].rm_eo - - match[URL_RE_AUTHORITY].rm_so; + assert(url); - (*result) = malloc(scheme_len + 1 + 2 + authority_len + 1); - if (!(*result)) { - LOG(("malloc failed")); - return URL_FUNC_NOMEM; + status = url_get_components(url, &components); + if (status == URL_FUNC_OK) { + if ((!components.scheme) || (!components.authority)) { + status = URL_FUNC_FAILED; + } else { + *result = malloc(strlen(components.scheme) + + strlen(components.authority) + 4); + if (!(*result)) + status = URL_FUNC_NOMEM; + else + sprintf((*result), "%s://%s", components.scheme, + components.authority); + } } - - strncpy((*result), url + match[URL_RE_SCHEME].rm_so, scheme_len); - m = scheme_len; - (*result)[m++] = ':'; - (*result)[m++] = '/'; - (*result)[m++] = '/'; - strncpy((*result) + m, url + match[URL_RE_AUTHORITY].rm_so, - authority_len); - (*result)[m + authority_len] = '\0'; - - return URL_FUNC_OK; + url_destroy_components(&components); + return status; } @@ -571,57 +557,42 @@ url_func_result url_canonical_root(const char *url, char **result) url_func_result url_strip_lqf(const char *url, char **result) { - int m, scheme_len, authority_len, path_len = 0; - regmatch_t match[10]; - - (*result) = 0; - - m = regexec(&url_re, url, 10, match, 0); - if (m) { - LOG(("url '%s' failed to match regex", url)); - return URL_FUNC_FAILED; - } - if (match[URL_RE_SCHEME].rm_so == -1 || - match[URL_RE_AUTHORITY].rm_so == -1) - return URL_FUNC_FAILED; + url_func_result status; + struct url_components components; + int len, path_len; - scheme_len = match[URL_RE_SCHEME].rm_eo - match[URL_RE_SCHEME].rm_so; - authority_len = match[URL_RE_AUTHORITY].rm_eo - - match[URL_RE_AUTHORITY].rm_so; - if (match[URL_RE_PATH].rm_so != -1) - path_len = match[URL_RE_PATH].rm_eo - - match[URL_RE_PATH].rm_so; - - (*result) = malloc(scheme_len + 1 + 2 + authority_len + - (path_len ? path_len : 1) + 1); - if (!(*result)) { - LOG(("malloc failed")); - return URL_FUNC_NOMEM; - } - - strncpy((*result), url + match[URL_RE_SCHEME].rm_so, scheme_len); - m = scheme_len; - (*result)[m++] = ':'; - (*result)[m++] = '/'; - (*result)[m++] = '/'; - strncpy((*result) + m, url + match[URL_RE_AUTHORITY].rm_so, - authority_len); - m += authority_len; + assert(url); - if (path_len) { - strncpy((*result) + m, url + match[URL_RE_AUTHORITY].rm_so, - path_len); - for (; path_len != 0 && (*result)[m + path_len - 1] != '/'; - path_len--) - /* do nothing */; - m += path_len; + status = url_get_components(url, &components); + if (status == URL_FUNC_OK) { + if ((!components.scheme) || (!components.authority) || + (!components.path)) { + status = URL_FUNC_FAILED; + } else { + if (strcmp(components.path, "/")) { + path_len = strlen(components.path); + if (components.path[path_len - 1] == '/') + path_len--; + while (components.path[path_len - 1] != '/') + path_len--; + } else { + path_len = 1; + } + len = strlen(components.scheme) + + strlen(components.authority) + + path_len + 4; + *result = malloc(len); + if (!(*result)) + status = URL_FUNC_NOMEM; + else + snprintf((*result), len, "%s://%s%s", + components.scheme, + components.authority, + components.path); + } } - else - (*result)[m++] = '/'; - - (*result)[m] = '\0'; - - return URL_FUNC_OK; + url_destroy_components(&components); + return status; } @@ -635,52 +606,29 @@ url_func_result url_strip_lqf(const char *url, char **result) url_func_result url_plq(const char *url, char **result) { - int m, path_len = 0, query_len = 0; - regmatch_t match[10]; - - (*result) = 0; - - m = regexec(&url_re, url, 10, match, 0); - if (m) { - LOG(("url '%s' failed to match regex", url)); - return URL_FUNC_FAILED; - } - if (match[URL_RE_SCHEME].rm_so == -1 || - match[URL_RE_AUTHORITY].rm_so == -1) - return URL_FUNC_FAILED; - - if (match[URL_RE_PATH].rm_so != -1) - path_len = match[URL_RE_PATH].rm_eo - - match[URL_RE_PATH].rm_so; - if (match[URL_RE_QUERY].rm_so != -1) - query_len = match[URL_RE_QUERY].rm_eo - - match[URL_RE_QUERY].rm_so; - - (*result) = malloc((path_len ? path_len : 1) + query_len + 1 + 1); - if (!(*result)) { - LOG(("malloc failed")); - return URL_FUNC_NOMEM; - } + url_func_result status; + struct url_components components; - m = 0; - if (path_len) { - strncpy((*result), url + match[URL_RE_PATH].rm_so, - path_len); - m += path_len; - } - else - (*result)[m++] = '/'; + assert(url); - if (query_len) { - (*result)[m++] = '?'; - strncpy((*result) + m, url + match[URL_RE_QUERY].rm_so, - query_len); - m += query_len; + status = url_get_components(url, &components); + if (status == URL_FUNC_OK) { + if ((components.query) && (strlen(components.query) > 0)) { + *result = malloc(strlen(components.path) + + strlen(components.query) + 2); + if (!(*result)) + status = URL_FUNC_NOMEM; + else + sprintf((*result), "%s?%s", components.path, + components.query); + } else { + *result = strdup(components.path); + if (!(*result)) + status = URL_FUNC_NOMEM; + } } - - (*result)[m] = '\0'; - - return URL_FUNC_OK; + url_destroy_components(&components); + return status; } @@ -694,45 +642,30 @@ url_func_result url_plq(const char *url, char **result) url_func_result url_path(const char *url, char **result) { - int m, path_len = 0; - regmatch_t match[10]; - - (*result) = 0; - - m = regexec(&url_re, url, 10, match, 0); - if (m) { - LOG(("url '%s' failed to match regex", url)); - return URL_FUNC_FAILED; - } - if (match[URL_RE_SCHEME].rm_so == -1 || - match[URL_RE_AUTHORITY].rm_so == -1) - return URL_FUNC_FAILED; - - if (match[URL_RE_PATH].rm_so != -1) - path_len = match[URL_RE_PATH].rm_eo - - match[URL_RE_PATH].rm_so; + url_func_result status; + struct url_components components; + int len; - (*result) = malloc((path_len ? path_len : 1) + 1); - if (!(*result)) { - LOG(("malloc failed")); - return URL_FUNC_NOMEM; - } + assert(url); - m = 0; - if (path_len > 1) { - strncpy((*result), url + match[URL_RE_PATH].rm_so, - path_len); - for (; path_len != 0 && (*result)[m + path_len - 1] != '/'; - path_len--) - /* do nothing */; - m += path_len; + status = url_get_components(url, &components); + if (status == URL_FUNC_OK) { + if (!components.path) { + status = URL_FUNC_FAILED; + } else { + len = strlen(components.path); + while (components.path[len - 1] != '/') + len--; + *result = malloc(len + 2); + if (!(*result)) + status = URL_FUNC_NOMEM; + else + snprintf((*result), len + 1, "%s", + components.path); + } } - else - (*result)[m++] = '/'; - - (*result)[m] = '\0'; - - return URL_FUNC_OK; + url_destroy_components(&components); + return status; } @@ -902,6 +835,156 @@ url_func_result url_escape(const char *unescaped, char **result) } +/** + * Split a URL into separate components + * + * URLs passed to this function are assumed to be valid and no error checking + * or recovery is attempted. + * + * See RFC 3986 for reference. + * + * \param url an absolute URL + * \param result pointer to buffer to hold components + * \return URL_FUNC_OK on success + */ + +url_func_result url_get_components(const char *url, + struct url_components *result) +{ + char *storage_end; + const char *scheme; + const char *authority; + const char *path; + const char *query; + const char *fragment; + + assert(url); + + /* used cached components as a preference */ + if (cached_url && !strcmp(url, cached_url)) { + *result = cached_components; + result->internal.users[0]++; + return URL_FUNC_OK; + } + + /* clear the cache */ + free(cached_url); + cached_url = NULL; + url_destroy_components(&cached_components); + memset(result, 0x00, sizeof(struct url_components)); + + + /* get enough storage space for a URL with termination at each node */ + result->internal.storage = malloc(strlen(url) + sizeof(int *) + 8); + if (!result->internal.storage) + return URL_FUNC_NOMEM; + result->internal.users[0] = 1; + storage_end = (char *)(result->internal.users + 1); + + + /* extract the scheme */ + scheme = strchr(url, ':'); + if (!scheme) { + url_destroy_components(result); + return URL_FUNC_FAILED; + } + memcpy(storage_end, url, scheme - url); + storage_end[scheme - url] = '\0'; + result->scheme = storage_end; + storage_end += scheme - url + 1; + + + /* look for an authority */ + authority = ++scheme; + if ((authority[0] == '/') && (authority[1] == '/')) { + authority = strchr(scheme + 2, '/'); + if (!authority) { + url_destroy_components(result); + return URL_FUNC_FAILED; + } + memcpy(storage_end, scheme + 2, authority - scheme - 2); + storage_end[authority - scheme - 2] = '\0'; + result->authority = storage_end; + storage_end += authority - scheme - 1; + } + + + /* extract the path (can be empty) */ + path = authority; + if ((*path != '?') && (*path != '#') && (*path != '\0')) { + path = strpbrk(path, "?#"); + if (!path) + path = authority + strlen(authority); + } + + /* substitute an empty path for a '/' */ + if (path == authority) { + *storage_end++ = '/'; + *storage_end++ = '\0'; + } else { + memcpy(storage_end, authority, path - authority); + storage_end[path - authority] = '\0'; + result->path = storage_end; + storage_end += path - authority + 1; + } + + + /* look for a query */ + query = path; + if (*query == '?') { + query = strchr(query, '#'); + if (!query) + query = path + strlen(path); + memcpy(storage_end, path + 1, query - path - 1); + storage_end[query - path - 1] = '\0'; + result->query = storage_end; + storage_end += query - path; + } + + + /* look for a fragment */ + fragment = query; + if (*fragment == '#') { + fragment = query + strlen(query); + + /* make a copy of the result for the caller */ + memcpy(storage_end, query + 1, fragment - query - 1); + storage_end[fragment - query - 1] = '\0'; + result->fragment = storage_end; +// storage_end += fragment - query; + } + + + /* cache our values */ + cached_url = strdup(url); + if (cached_url) { + result->internal.users[0]++; + cached_components = *result; + } + +/* fprintf(stderr, "u:%s\ns:%s\na:%s\np:%s\nq:%s\nf:%s\n", + url, result->scheme, result->authority, + result->path, result->query, result->fragment); +*/ return URL_FUNC_OK; +} + + +/** + * Release some url components from memory + * + * \param result pointer to buffer containing components + */ +void url_destroy_components(struct url_components *result) +{ + assert(result); + + if (result->internal.users) { + result->internal.users[0]--; + if (result->internal.users[0] == 0) + free(result->internal.storage); + } +} + #ifdef TEST int main(int argc, char *argv[]) -- cgit v1.2.3