From c09eb457df1962f5b014214874b2beffd69141a4 Mon Sep 17 00:00:00 2001 From: John Mark Bell Date: Sun, 9 Apr 2006 23:21:13 +0000 Subject: Unify information databases svn path=/trunk/netsurf/; revision=2519 --- content/authdb.c | 366 --------- content/authdb.h | 18 - content/certdb.c | 154 ---- content/certdb.h | 18 - content/fetch.c | 15 +- content/fetchcache.c | 1 - content/url_store.c | 750 ----------------- content/url_store.h | 61 -- content/urldb.c | 2231 ++++++++++++++++++++++++++++++++++++++++++++++++++ content/urldb.h | 65 ++ 10 files changed, 2301 insertions(+), 1378 deletions(-) delete mode 100644 content/authdb.c delete mode 100644 content/authdb.h delete mode 100644 content/certdb.c delete mode 100644 content/certdb.h delete mode 100644 content/url_store.c delete mode 100644 content/url_store.h create mode 100644 content/urldb.c create mode 100644 content/urldb.h (limited to 'content') diff --git a/content/authdb.c b/content/authdb.c deleted file mode 100644 index f97adb1b0..000000000 --- a/content/authdb.c +++ /dev/null @@ -1,366 +0,0 @@ -/* - * This file is part of NetSurf, http://netsurf.sourceforge.net/ - * Licensed under the GNU General Public License, - * http://www.opensource.org/licenses/gpl-license - * Copyright 2006 John M Bell - */ - -/** \file - * HTTP authentication database (implementation) - * - * Authentication details are stored hashed by canonical root URI - * (absoluteURI with no abs_path part - see RFC 2617) for fast lookup. - * - * A protection space is specified by the root URI and a case sensitive - * realm match. User-agents may preemptively send authentication details - * for locations within a currently known protected space (i.e: - * Given a known realm URI of scheme://authority/path/to/realm/ - * the URI scheme://authority/path/to/realm/foo/ can be assumed to - * be within the protection space.) 
- * - * In order to deal with realms within realms, the realm details are stored - * such that the most specific URI comes first (where "most specific" is - * classed as the one with the longest abs_path segment). - * - * Realms spanning domains are stored multiple times (once per domain). - * - * Where a higher level resource is found to be within a known realm, the - * existing match is replaced with the new one (i.e: - * Given a known realm of scheme://authority/path/to/realm/ (uri1) - * and the newly-acquired knowledge that scheme://authority/path/to/ (uri2) - * lies within the same realm, the realm details for uri1 are replaced with - * those for uri2. - in most cases, this is likely to be a simple - * replacement of the realm URI) - * - * There is currently no mechanism for retaining authentication details over - * sessions. - */ -#include -#include -#include -#include -#include "netsurf/content/authdb.h" -#define NDEBUG -#include "netsurf/utils/log.h" -#include "netsurf/utils/url.h" - -#define HASH_SIZE 77 - -struct realm_details { - char *realm; /**< Realm identifier */ - char *url; /**< Base URL of realm */ - char *auth; /**< Authentication details */ - struct realm_details *next; - struct realm_details *prev; -}; - -struct auth_entry { - char *root_url; /**< Canonical root URL of realms */ - struct realm_details *realms; /**< List of realms on this host */ - struct auth_entry *next; -}; - -static struct auth_entry *auth_table[HASH_SIZE]; - -static unsigned int authdb_hash(const char *s); -static struct realm_details *authdb_get_rd(const char *canon, - const char *url, const char *realm); -static void authdb_dump(void); - -/** - * Insert an entry into the database, potentially replacing any - * existing entry. - * - * \param url Absolute URL to resource - * \param realm Authentication realm containing resource - * \param auth Authentication details in form "username:password" - * \return true on success, false on error. 
- */ -bool authdb_insert(const char *url, const char *realm, const char *auth) -{ - char *canon, *stripped; - unsigned int hash; - struct realm_details *rd; - struct auth_entry *entry; - url_func_result ret; - - assert(url && realm && auth); - - LOG(("Adding '%s' - '%s'", url, realm)); - - ret = url_canonical_root(url, &canon); - if (ret != URL_FUNC_OK) - return false; - - LOG(("'%s'", canon)); - - ret = url_strip_lqf(url, &stripped); - if (ret != URL_FUNC_OK) { - free(canon); - return false; - } - - hash = authdb_hash(canon); - - /* Look for existing entry */ - for (entry = auth_table[hash]; entry; entry = entry->next) - if (strcmp(entry->root_url, canon) == 0) - break; - - rd = authdb_get_rd(canon, stripped, realm); - if (rd) { - /* We have a match */ - if (strlen(stripped) < strlen(rd->url)) { - /* more generic, so update URL and move to - * appropriate location in list (s.t. the invariant - * that most specific URLs come first is maintained) - */ - struct realm_details *r, *s; - char *temp = strdup(auth); - - if (!temp) { - free(temp); - free(stripped); - free(canon); - return false; - } - - free(rd->url); - rd->url = stripped; - - free(rd->auth); - rd->auth = temp; - - for (r = rd->next; r; r = s) { - s = r->next; - if (strlen(r->url) > strlen(rd->url)) { - rd->next->prev = rd->prev; - if (rd->prev) - rd->prev->next = rd->next; - else - entry->realms = r; - - rd->prev = r; - rd->next = r->next; - if (r->next) - r->next->prev = rd; - r->next = rd; - } - } - } - else if (strlen(stripped) == strlen(rd->url)) { - /* exact match, so replace auth details */ - char *temp = strdup(auth); - if (!temp) { - free(stripped); - free(canon); - return false; - } - - free(rd->auth); - rd->auth = temp; - - free(stripped); - } - /* otherwise, nothing to do */ - - free(canon); - return true; - } - - /* no existing entry => create one */ - rd = malloc(sizeof(struct realm_details)); - if (!rd) { - free(stripped); - free(canon); - return false; - } - - rd->realm = strdup(realm); - 
rd->auth = strdup(auth); - rd->url = stripped; - rd->prev = 0; - - if (!rd->realm || !rd->auth || ret != URL_FUNC_OK) { - free(rd->url); - free(rd->auth); - free(rd->realm); - free(rd); - free(canon); - return false; - } - - if (entry) { - /* found => add to it */ - rd->next = entry->realms; - if (entry->realms) - entry->realms->prev = rd; - entry->realms = rd; - - free(canon); - return true; - } - - /* not found => create new */ - entry = malloc(sizeof(struct auth_entry)); - if (!entry) { - free(rd->url); - free(rd->auth); - free(rd->realm); - free(rd); - free(canon); - return false; - } - - rd->next = 0; - entry->root_url = canon; - entry->realms = rd; - entry->next = auth_table[hash]; - auth_table[hash] = entry; - - return true; -} - -/** - * Find realm details entry - * - * \param canon Canonical root URL - * \param url Stripped URL to resource - * \param realm Realm containing resource - * \return Realm details or NULL if not found - */ -struct realm_details *authdb_get_rd(const char *canon, const char *url, - const char *realm) -{ - struct auth_entry *entry; - struct realm_details *ret; - - assert(canon && url); - - for (entry = auth_table[authdb_hash(canon)]; entry; - entry = entry->next) - if (strcmp(entry->root_url, canon) == 0) - break; - - if (!entry) - return NULL; - - for (ret = entry->realms; ret; ret = ret->next) { - if (strcmp(ret->realm, realm)) - /* skip realms that don't match */ - continue; - if (strlen(url) >= strlen(ret->url) && - !strncmp(url, ret->url, strlen(ret->url))) - /* If the requested URL is of equal or greater - * specificity than the stored one, but is within - * the same realm, then use the more generic details - */ - return ret; - else if (strncmp(url, ret->url, strlen(url)) == 0) { - /* We have a more general URL in the same realm */ - return ret; - } - } - - return NULL; -} - -/** - * Retrieve authentication details for an URL from the database - * - * \param url Absolute URL to consider - * \return authentication details, or 
NULL if none found. - */ -const char *authdb_get(const char *url) -{ - char *canon, *stripped; - struct auth_entry *entry; - struct realm_details *rd; - url_func_result ret; - - assert(url); - - LOG(("Searching for '%s'", url)); - - authdb_dump(); - - ret = url_canonical_root(url, &canon); - if (ret != URL_FUNC_OK) - return NULL; - - ret = url_strip_lqf(url, &stripped); - if (ret != URL_FUNC_OK) { - free(canon); - return NULL; - } - - /* Find auth entry */ - for (entry = auth_table[authdb_hash(canon)]; entry; - entry = entry->next) - if (strcmp(entry->root_url, canon) == 0) - break; - - if (!entry) { - free(stripped); - free(canon); - return NULL; - } - - LOG(("Found entry")); - - /* Find realm details */ - for (rd = entry->realms; rd; rd = rd->next) - if (strlen(stripped) >= strlen(rd->url) && - !strncmp(stripped, rd->url, strlen(rd->url))) - break; - - if (!rd) { - free(stripped); - free(canon); - return NULL; - } - - LOG(("Found realm")); - - free(stripped); - free(canon); - return rd->auth; -} - -/** - * Hash function for keys. 
- */ -unsigned int authdb_hash(const char *s) -{ - unsigned int i, z = 0, m; - if (!s) - return 0; - - m = strlen(s); - - for (i = 0; i != m && s[i]; i++) - z += s[i] & 0x1f; /* lower 5 bits, case insensitive */ - return z % HASH_SIZE; -} - -/** - * Dump contents of auth db to stderr - */ -void authdb_dump(void) -{ -#ifndef NDEBUG - int i; - struct auth_entry *e; - struct realm_details *r; - - for (i = 0; i != HASH_SIZE; i++) { - LOG(("%d:", i)); - for (e = auth_table[i]; e; e = e->next) { - LOG(("\t%s", e->root_url)); - for (r = e->realms; r; r = r->next) { - LOG(("\t\t%s - %s", r->url, r->realm)); - } - } - } -#endif -} diff --git a/content/authdb.h b/content/authdb.h deleted file mode 100644 index ece7b763d..000000000 --- a/content/authdb.h +++ /dev/null @@ -1,18 +0,0 @@ -/* - * This file is part of NetSurf, http://netsurf.sourceforge.net/ - * Licensed under the GNU General Public License, - * http://www.opensource.org/licenses/gpl-license - * Copyright 2006 John M Bell - */ - -/** \file - * HTTP authentication database (interface) - */ - -#ifndef _NETSURF_CONTENT_AUTHDB_H_ -#define _NETSURF_CONTENT_AUTHDB_H_ - -bool authdb_insert(const char *url, const char *realm, const char *auth); -const char *authdb_get(const char *url); - -#endif diff --git a/content/certdb.c b/content/certdb.c deleted file mode 100644 index 78c6ec04f..000000000 --- a/content/certdb.c +++ /dev/null @@ -1,154 +0,0 @@ -/* - * This file is part of NetSurf, http://netsurf.sourceforge.net/ - * Licensed under the GNU General Public License, - * http://www.opensource.org/licenses/gpl-license - * Copyright 2006 John M Bell - */ - -/** \file - * HTTPS certificate verification database (implementation) - * - * URLs of servers with invalid SSL certificates are stored hashed by - * canonical root URI (absoluteURI with no abs_path part - see RFC 2617) - * for fast lookup. 
- */ -#include -#include -#include -#include -#include "netsurf/utils/config.h" -#include "netsurf/content/certdb.h" -#define NDEBUG -#include "netsurf/utils/log.h" -#include "netsurf/utils/url.h" - -#define HASH_SIZE 77 - -#ifdef WITH_SSL - -struct cert_entry { - char *root_url; /**< Canonical root URL */ - struct cert_entry *next; -}; - -static struct cert_entry *cert_table[HASH_SIZE]; - -static unsigned int certdb_hash(const char *s); -static void certdb_dump(void); - -/** - * Insert an entry into the database - * - * \param url Absolute URL to resource - * \return true on success, false on error. - */ -bool certdb_insert(const char *url) -{ - char *canon; - unsigned int hash; - struct cert_entry *entry; - url_func_result ret; - - assert(url); - - LOG(("Adding '%s'", url)); - - ret = url_canonical_root(url, &canon); - if (ret != URL_FUNC_OK) - return false; - - LOG(("'%s'", canon)); - - hash = certdb_hash(canon); - - /* Look for existing entry */ - for (entry = cert_table[hash]; entry; entry = entry->next) { - if (strcmp(entry->root_url, canon) == 0) { - free(canon); - return true; - } - } - - /* not found => create new */ - entry = malloc(sizeof(struct cert_entry)); - if (!entry) { - free(canon); - return false; - } - - entry->root_url = canon; - entry->next = cert_table[hash]; - cert_table[hash] = entry; - - return true; -} - -/** - * Retrieve certificate details for an URL from the database - * - * \param url Absolute URL to consider - * \return certificate details, or NULL if none found. 
- */ -const char *certdb_get(const char *url) -{ - char *canon; - struct cert_entry *entry; - url_func_result ret; - - assert(url); - - LOG(("Searching for '%s'", url)); - - certdb_dump(); - - ret = url_canonical_root(url, &canon); - if (ret != URL_FUNC_OK) - return NULL; - - /* Find cert entry */ - for (entry = cert_table[certdb_hash(canon)]; entry; - entry = entry->next) { - if (strcmp(entry->root_url, canon) == 0) { - free(canon); - return entry->root_url; - } - } - - return NULL; -} - -/** - * Hash function for keys. - */ -unsigned int certdb_hash(const char *s) -{ - unsigned int i, z = 0, m; - if (!s) - return 0; - - m = strlen(s); - - for (i = 0; i != m && s[i]; i++) - z += s[i] & 0x1f; /* lower 5 bits, case insensitive */ - return z % HASH_SIZE; -} - -/** - * Dump contents of auth db to stderr - */ -void certdb_dump(void) -{ -#ifndef NDEBUG - int i; - struct cert_entry *e; - - for (i = 0; i != HASH_SIZE; i++) { - LOG(("%d:", i)); - for (e = cert_table[i]; e; e = e->next) { - LOG(("\t%s", e->root_url)); - } - } -#endif -} - -#endif diff --git a/content/certdb.h b/content/certdb.h deleted file mode 100644 index 28aa88664..000000000 --- a/content/certdb.h +++ /dev/null @@ -1,18 +0,0 @@ -/* - * This file is part of NetSurf, http://netsurf.sourceforge.net/ - * Licensed under the GNU General Public License, - * http://www.opensource.org/licenses/gpl-license - * Copyright 2006 John M Bell - */ - -/** \file - * HTTPS certificate verification database (interface) - */ - -#ifndef _NETSURF_CONTENT_CERTDB_H_ -#define _NETSURF_CONTENT_CERTDB_H_ - -bool certdb_insert(const char *url); -const char *certdb_get(const char *url); - -#endif diff --git a/content/fetch.c b/content/fetch.c index 4ba322067..bfbf715a0 100644 --- a/content/fetch.c +++ b/content/fetch.c @@ -31,13 +31,8 @@ #ifdef WITH_SSL #include "openssl/ssl.h" #endif -#ifdef WITH_AUTH -#include "netsurf/content/authdb.h" -#endif -#ifdef WITH_SSL -#include "netsurf/content/certdb.h" -#endif #include 
"netsurf/content/fetch.h" +#include "netsurf/content/urldb.h" #include "netsurf/desktop/options.h" #include "netsurf/render/form.h" #define NDEBUG @@ -158,7 +153,7 @@ static int fetch_cert_verify_callback(X509_STORE_CTX *x509_ctx, void *parm); ring = 0; \ } \ element->r_next = element->r_prev = 0 - + /** Find the element (by hostname) in the given ring, leave it in the * provided element variable */ @@ -483,7 +478,7 @@ static bool ns_internal_initiate_fetch(struct fetch *fetch, CURL *handle) fetch->curl_handle = 0; return false; } - + /* add to the global curl multi handle */ codem = curl_multi_add_handle(fetch_curl_multi, fetch->curl_handle); assert(codem == CURLM_OK || codem == CURLM_CALL_MULTI_PERFORM); @@ -649,7 +644,7 @@ CURLcode fetch_set_options(struct fetch *f) SETOPT(CURLOPT_COOKIEJAR, 0); } #ifdef WITH_AUTH - if ((auth = authdb_get(f->url)) != NULL) { + if ((auth = urldb_get_auth_details(f->url)) != NULL) { SETOPT(CURLOPT_HTTPAUTH, CURLAUTH_ANY); SETOPT(CURLOPT_USERPWD, auth); } else { @@ -677,7 +672,7 @@ CURLcode fetch_set_options(struct fetch *f) } #ifdef WITH_SSL - if (certdb_get(f->url) != NULL) { + if (urldb_get_cert_permissions(f->url)) { /* Disable certificate verification */ SETOPT(CURLOPT_SSL_VERIFYPEER, 0L); SETOPT(CURLOPT_SSL_VERIFYHOST, 0L); diff --git a/content/fetchcache.c b/content/fetchcache.c index 47f24e89c..bc8907f14 100644 --- a/content/fetchcache.c +++ b/content/fetchcache.c @@ -23,7 +23,6 @@ #include "netsurf/content/content.h" #include "netsurf/content/fetchcache.h" #include "netsurf/content/fetch.h" -#include "netsurf/content/url_store.h" #include "netsurf/utils/log.h" #include "netsurf/utils/messages.h" #include "netsurf/utils/talloc.h" diff --git a/content/url_store.c b/content/url_store.c deleted file mode 100644 index fde956e46..000000000 --- a/content/url_store.c +++ /dev/null @@ -1,750 +0,0 @@ -/* - * This file is part of NetSurf, http://netsurf.sourceforge.net/ - * Licensed under the GNU General Public License, - * 
http://www.opensource.org/licenses/gpl-license - * Copyright 2005 Richard Wilson - */ - -/** \file - * Central repository for URL data (implementation). - */ - -#include -#include -#include -#include -#include -#include -#include "netsurf/content/url_store.h" -#include "netsurf/image/bitmap.h" -#include "netsurf/desktop/options.h" -#ifdef riscos -#include "netsurf/riscos/bitmap.h" -#endif -#include "netsurf/utils/log.h" -#include "netsurf/utils/url.h" -#include "netsurf/utils/utils.h" - - -#define ITERATIONS_BEFORE_TEST 32 -#define MAXIMUM_URL_LENGTH 1024 - -struct hostname_data *url_store_hostnames = NULL; - -static struct hostname_data *url_store_find_hostname(const char *url); -static struct hostname_data *url_store_match_hostname( - struct hostname_data *previous); - -/* used for faster matching */ -static size_t current_match_url_length; -static char *current_match_scheme; -static int current_match_scheme_length; -static char *current_match_hostname; -static int current_match_hostname_length; -static bool current_match_www_test; - -/* used for faster searching */ -static struct hostname_data *last_hostname_found = NULL; - -/** - * Returns the hostname data for the specified URL. If no hostname - * data is currently available then it is created. 
- * - * \param url the url to find hostname data for - * \return the current hostname data, or NULL if memory exhausted - */ -struct hostname_data *url_store_find_hostname(const char *url) -{ - struct hostname_data *first = url_store_hostnames; - struct hostname_data *search; - struct hostname_data *result; - url_func_result res; - char *hostname = NULL; - int hostname_length; - int compare; - int fast_exit_counter = ITERATIONS_BEFORE_TEST; - const char *host_test; - - assert(url); - - /* as the URL is normalised, we optimise the hostname finding for http:// */ - if (!strncmp("http://", url, 7)) { - /* check for duplicate hostname calls */ - if ((last_hostname_found) && - (!strncmp(last_hostname_found->hostname, url + 7, - last_hostname_found->hostname_length))) { - /* ensure it isn't comparing 'foo.com' to 'foo.com.au' etc */ - if (url[last_hostname_found->hostname_length + 7] != '.') - return last_hostname_found; - } - - /* check for a hostname match */ - for (host_test = url + 7; - ((*host_test > 32) && (*host_test != '/')); - *host_test++); - hostname_length = host_test - url - 7; - host_test = url + 7; - if ((last_hostname_found) && - (strncmp(host_test, - last_hostname_found->hostname, - hostname_length) > 0)) - first = last_hostname_found; - for (search = first; search; search = search->next) { - if (search->hostname_length == hostname_length) { - compare = strncmp(host_test, search->hostname, - hostname_length); - if (compare == 0) { - last_hostname_found = search; - return search; - } else if (compare < 0) - break; - } - } - - /* allocate a new hostname */ - hostname = malloc(hostname_length + 1); - if (!hostname) - return NULL; - memcpy(hostname, host_test, hostname_length); - hostname[hostname_length] = '\0'; - } else { - /* no quick match found, fallback */ - res = url_host(url, &hostname); - switch (res) { - case URL_FUNC_OK: - break; - case URL_FUNC_NOMEM: - return NULL; - case URL_FUNC_FAILED: - hostname = strdup("file:/"); /* for 'file:/' */ - if 
(!hostname) - return NULL; - break; - default: - assert(0); - } - hostname_length = strlen(hostname); - } - - /* try to find a matching hostname fairly quickly */ - if ((last_hostname_found) && - (strcmp(hostname, last_hostname_found->hostname) > 0)) - first = last_hostname_found; - for (search = first; search; search = search->next) { - if ((fast_exit_counter <= 0) || - (search->hostname_length == hostname_length)) { - compare = strcmp(hostname, search->hostname); - if (compare == 0) { - free(hostname); - last_hostname_found = search; - return search; - } else if (compare < 0) - break; - fast_exit_counter = ITERATIONS_BEFORE_TEST; - } else { - fast_exit_counter--; - } - } - - /* no hostname is available: create a new one */ - result = malloc(sizeof *result); - if (!result) { - free(hostname); - return NULL; - } - result->hostname = hostname; - result->hostname_length = hostname_length; - result->url = 0; - result->previous = 0; - result->next = 0; - last_hostname_found = result; - - /* simple case: no current hostnames */ - if (!url_store_hostnames) { - url_store_hostnames = result; - return result; - } - - /* worst case scenario: the place we need to link is within the last - * section of the hostname list so we have no reference to work back - * from. rather than slowing with the very common case of searching, - * we take a speed hit for this case and simply move to the very end - * of the hostname list ready to work backwards. */ - if (!search) - for (search = url_store_hostnames; search->next; - search = search->next) - ; - - /* we can now simply scan backwards as we know roughly where we need - * to link to (we either had an early exit from the searching so we - * know we're in the block following where we need to link, or we're - * at the very end of the list as we were in the last block.) 
*/ - while ((search) && (strcmp(hostname, search->hostname) < 0)) - search = search->previous; - - /* simple case: our new hostname is the first in the list */ - if (!search) { - result->next = url_store_hostnames; - url_store_hostnames->previous = result; - url_store_hostnames = result; - return result; - } - - /* general case: link in after the found hostname */ - result->previous = search; - result->next = search->next; - if (search->next) - search->next->previous = result; - search->next = result; - return result; -} - - -/** - * Returns the url data for the specified URL. If no url - * data is currently available then it is created. - * - * \param url a normalized url to find hostname data for - * \return the current hostname data, or NULL if memory exhausted - */ -struct url_content *url_store_find(const char *url) { - struct hostname_data *hostname_data; - struct url_data *search; - struct url_data *result; - size_t url_length; - int compare; - int fast_exit_counter = ITERATIONS_BEFORE_TEST; - - assert(url); - - /* find the corresponding hostname data */ - hostname_data = url_store_find_hostname(url); - if (!hostname_data) - return NULL; - - /* move to the start of the leafname */ - url_length = strlen(url); - - /* try to find a matching url fairly quickly */ - for (search = hostname_data->url; search; search = search->next) { - if ((fast_exit_counter <= 0) || - (search->data.url_length == url_length)) { - compare = strcmp(url, search->data.url); - if (compare == 0) - return &search->data; - else if (compare < 0) - break; - fast_exit_counter = ITERATIONS_BEFORE_TEST; - } else { - fast_exit_counter--; - } - } - - /* no URL is available: create a new one */ - result = calloc(1, sizeof(struct url_data)); - if (!result) - return NULL; - result->data.url = malloc(url_length + 1); - if (!result->data.url) { - free(result); - return NULL; - } - memcpy(result->data.url, url, url_length + 1); - result->data.url_length = url_length; - result->parent = hostname_data; - 
- /* simple case: no current URLs */ - if (!hostname_data->url) { - hostname_data->url = result; - return &result->data; - } - - /* worst case scenario: the place we need to link is within the last - * section of the URL list so we have no reference to work back - * from. rather than slowing with the very common case of searching, - * we take a speed hit for this case and simply move to the very end - * of the URL list ready to work backwards. */ - if (!search) - for (search = hostname_data->url; search->next; - search = search->next) - ; - - /* we can now simply scan backwards as we know roughly where we need - * to link to (we either had an early exit from the searching so we - * know we're in the block following where we need to link, or we're - * at the very end of the list as we were in the last block.) */ - while ((search) && (strcmp(url, search->data.url) < 0)) - search = search->previous; - - /* simple case: our new hostname is the first in the list */ - if (!search) { - result->next = hostname_data->url; - hostname_data->url->previous = result; - hostname_data->url = result; - return &result->data; - } - - /* general case: link in after the found hostname */ - result->previous = search; - result->next = search->next; - if (search->next) - search->next->previous = result; - search->next = result; - return &result->data; -} - - -/** - * Returns the next hostname that matches a part of the specified URL. 
- * - * The following variables must be initialised prior to calling: - * - * - current_match_scheme - * - current_match_hostname - * - current_match_hostname_length; - * - * \param url a normalized url to find the next match for - * \param current the current hostname to search forward from, or NULL - * \return the next matching hostname, or NULL - */ -struct hostname_data *url_store_match_hostname( - struct hostname_data *current) { - int compare; - - assert(current_match_hostname); - - /* advance to the next hostname */ - if (!current) - current = url_store_hostnames; - else - current = current->next; - - /* skip past hostname data without URLs */ - for (; current && (!current->url); current = current->next); - - while (current) { - if (current->hostname_length >= current_match_hostname_length) { - compare = strncmp(current_match_hostname, current->hostname, - current_match_hostname_length); - if (compare == 0) - return current; - else if ((compare < 0) && !current_match_www_test) - break; - } - /* special case: if hostname is not www then try it */ - if (current_match_www_test && ((current->hostname_length - 4) >= - current_match_hostname_length) && - (!strncmp(current->hostname, "www.", 4)) && - (!strncmp(current_match_hostname, - current->hostname + 4, - current_match_hostname_length))) - return current; - - /* move to next hostname with URLs */ - current = current->next; - for (; current && (!current->url); current = current->next); - } - return NULL; -} - - - -/** - * Returns the complete URL for the next matched stored URL. 
- * - * \param url a normalized url to find the next match for - * \param reference internal reference (NULL for first call) - * \return the next URL that matches - */ -struct url_content *url_store_match(const char *url, struct url_data **reference) { - struct hostname_data *hostname; - struct url_data *search = NULL; - url_func_result res; - - assert(url); - - if (!url_store_hostnames) - return NULL; - - /* find the scheme and first URL, not necessarily matching */ - if (!*reference) { - /* the hostname match is constant throughout */ - if (current_match_hostname) - free(current_match_hostname); - current_match_hostname = NULL; - res = url_host(url, ¤t_match_hostname); - switch (res) { - case URL_FUNC_OK: - break; - case URL_FUNC_NOMEM: - return NULL; - case URL_FUNC_FAILED: - /* for 'file:/' */ - current_match_hostname = strdup("file:/"); - if (!current_match_hostname) - return NULL; - break; - default: - assert(0); - } - current_match_hostname_length = strlen(current_match_hostname); - /* the scheme is constant throughout */ - if (current_match_scheme) - free(current_match_scheme); - current_match_scheme = NULL; - res = url_scheme(url, ¤t_match_scheme); - if (res != URL_FUNC_OK) - return NULL; - current_match_scheme_length = strlen(current_match_scheme); - /* the url is constant throughout */ - current_match_url_length = strlen(url); - current_match_www_test = (!strcmp(current_match_scheme, "http") && - strncmp(url + 4 + 3, "www.", 4)); /* 'http' + '://' */ - /* get our initial reference */ - hostname = url_store_match_hostname(NULL); - if (!hostname) - return NULL; - } else { - search = *reference; - hostname = search->parent; - } - - /* work through all our strings, ignoring the scheme and 'www.' 
*/ - while (hostname) { - - /* get the next URL to test */ - if (!search) - search = hostname->url; - else - search = search->next; - - /* loop past end of list, or search */ - if (!search) { - hostname = url_store_match_hostname(hostname); - if (!hostname) - return NULL; - } else if (search->data.visits > 0) { - /* straight match */ - if ((search->data.url_length >= current_match_url_length) && - (!strncmp(search->data.url, url, - current_match_url_length))) { - *reference = search; - return &search->data; - } - /* try with 'www.' inserted after the scheme */ - if (current_match_www_test && - ((search->data.url_length - 4) >= - current_match_url_length) && - (!strncmp(search->data.url, - current_match_scheme, - current_match_scheme_length)) && - (!strncmp(search->data.url + - current_match_scheme_length + 3, - "www.", 4)) && - (!strncmp(search->data.url + - current_match_scheme_length + 7, - url + - current_match_scheme_length + 3, - current_match_url_length - - current_match_scheme_length - 3))) { - *reference = search; - return &search->data; - } - } - } - return NULL; -} - - -/** - * Converts a text string into one suitable for URL matching. 
- * - * \param text the text to search with - * \return URL matching string allocated on heap, or NULL on error - */ -char *url_store_match_string(const char *text) { - url_func_result res; - char *url; - - assert(text); - - res = url_normalize(text, &url); - if (res != URL_FUNC_OK) - return NULL; - - /* drop the '/' from the end if it was added when normalizing */ - if ((url[strlen(url) - 1] == '/') && (text[strlen(text) - 1] != '/')) - url[strlen(url) - 1] = '\0'; - return url; -} - - -/** - * Loads the current contents of the URL store from disk - * - * \param file the file to load options from - */ -void url_store_load(const char *file) { - char s[MAXIMUM_URL_LENGTH]; - struct hostname_data *hostname; - struct url_data *result; - int urls; - int i; - int version; - int length; - FILE *fp; - - LOG(("Loading URL file")); - - fp = fopen(file, "r"); - if (!fp) { - LOG(("Failed to open file '%s' for reading", file)); - return; - } - - if (!fgets(s, MAXIMUM_URL_LENGTH, fp)) - return; - version = atoi(s); - if (version < 102) { - LOG(("Unsupported URL file version.")); - return; - } - if (version > 105) { - LOG(("Unknown URL file version.")); - return; - } - - last_hostname_found = NULL; - while (fgets(s, MAXIMUM_URL_LENGTH, fp)) { - /* get the hostname */ - length = strlen(s) - 1; - s[length] = '\0'; - - /* skip data that has ended up with a host of '' */ - if (length == 0) { - if (!fgets(s, MAXIMUM_URL_LENGTH, fp)) - break; - urls = atoi(s); - for (i = 0; i < (6 * urls); i++) - if (!fgets(s, MAXIMUM_URL_LENGTH, fp)) - break; - continue; - } - - /* add the host at the tail */ - if (version == 105) { - hostname = malloc(sizeof *hostname); - if (!hostname) - die("Insufficient memory to create hostname"); - hostname->hostname = malloc(length + 1); - if (!hostname->hostname) - die("Insufficient memory to create hostname"); - memcpy(hostname->hostname, s, length + 1); - hostname->hostname_length = length; - hostname->url = 0; - hostname->previous = last_hostname_found; - 
if (!hostname->previous) - url_store_hostnames = hostname; - else - last_hostname_found->next = hostname; - hostname->next = 0; - last_hostname_found = hostname; - } else { - hostname = url_store_find_hostname(s); - if (!hostname) - break; - } - if (!fgets(s, MAXIMUM_URL_LENGTH, fp)) - break; - urls = atoi(s); - - /* load the non-corrupt data */ - for (i = 0; i < urls; i++) { - if (!fgets(s, MAXIMUM_URL_LENGTH, fp)) - break; - length = strlen(s) - 1; - s[length] = '\0'; - result = calloc(1, sizeof(struct url_data)); - if (!result) - die("Insufficient memory to create URL"); - result->data.url_length = length; - result->data.url = malloc(length + 1); - if (!result->data.url) - die("Insufficient memory to create URL"); - memcpy(result->data.url, s, length + 1); - result->parent = hostname; - result->next = hostname->url; - if (hostname->url) - hostname->url->previous = result; - hostname->url = result; - if (!fgets(s, MAXIMUM_URL_LENGTH, fp)) - break; - result->data.visits = atoi(s); - if (version == 102) { - /* ignore requests */ - if (!fgets(s, MAXIMUM_URL_LENGTH, fp)) - break; - /* ignore thumbnail size */ - if (!fgets(s, MAXIMUM_URL_LENGTH, fp)) - break; - /* set last visit as today to retain */ - result->data.last_visit = time(NULL); - } else { - if (!fgets(s, MAXIMUM_URL_LENGTH, fp)) - break; - result->data.last_visit = atoi(s); - if (!fgets(s, MAXIMUM_URL_LENGTH, fp)) - break; - result->data.type = atoi(s); - } - if (!fgets(s, MAXIMUM_URL_LENGTH, fp)) - break; -#ifdef riscos - if (strlen(s) == 12) { - /* ensure filename is 'XX.XX.XX.XX' */ - if ((s[2] == '.') && (s[5] == '.') && - (s[8] == '.')) { - s[11] = '\0'; - result->data.thumbnail = - bitmap_create_file(s); - } - } -#endif - if (version >= 104) { - if (!fgets(s, MAXIMUM_URL_LENGTH, fp)) - break; - length = strlen(s) - 1; - if (length > 0) { - s[length] = '\0'; - result->data.title = malloc(length + 1); - if (result->data.title) - memcpy(result->data.title, s, - length + 1); - } - } - } - } - fclose(fp); 
- LOG(("Successfully loaded URL file")); -} - - -/** - * Saves the current contents of the URL store to disk - * - * \param file the file to load options from - */ -void url_store_save(const char *file) { - struct hostname_data *search; - struct url_data *url; - int url_count; - const char *thumb_file; - char *s; - int i; - FILE *fp; -#ifdef riscos - struct bitmap *bitmap; -#endif - time_t min_date; - char *title; - - fp = fopen(file, "w"); - if (!fp) { - LOG(("Failed to open file '%s' for writing", file)); - return; - } - - /* get the minimum date for expiry */ - min_date = time(NULL) - (60 * 60 * 24) * option_expire_url; - - /* file format version number */ - fprintf(fp, "105\n"); - for (search = url_store_hostnames; search; search = search->next) { - url_count = 0; - for (url = search->url; url; url = url->next) - if ((url->data.last_visit > min_date) && - (url->data.visits > 0) && - (url->data.url_length < - MAXIMUM_URL_LENGTH)) { - url_count++; - } - if (url_count > 0) { - fprintf(fp, "%s\n%i\n", search->hostname, url_count); - for (url = search->url; url && url->next; - url = url->next); - for (; url; url = url->previous) - if ((url->data.last_visit > min_date) && - (url->data.visits > 0) && - (url->data.url_length < - MAXIMUM_URL_LENGTH)) { - thumb_file = ""; -#ifdef riscos - bitmap = url->data.thumbnail; - if (bitmap) - thumb_file = bitmap->filename; -#endif - - if (url->data.title) { - s = url->data.title; - for (i = 0; s[i] != '\0'; - i++) - if (s[i] < 32) - s[i] = ' '; - for (--i; - ((i > 0) && - (s[i] == ' ')); - i--) - s[i] = '\0'; - - title = url->data.title; - } - else - title = ""; - fprintf(fp, "%s\n%i\n%i\n%i\n%s\n%s\n", - url->data.url, - url->data.visits, - (int) url->data. - last_visit, - url->data.type, - thumb_file, - title); - } - } - } - fclose(fp); -} - - -/** - * Associates a thumbnail with a specified URL. 
- */ -void url_store_add_thumbnail(const char *url, struct bitmap *bitmap) { - struct url_content *content; - - content = url_store_find(url); - if (content) { - if (content->thumbnail) - bitmap_destroy(content->thumbnail); - content->thumbnail = bitmap; - } -} - - -/** - * Gets the thumbnail associated with a given URL. - */ -struct bitmap *url_store_get_thumbnail(const char *url) { - struct url_content *content; - - content = url_store_find(url); - if (content) - return content->thumbnail; - return NULL; -} - - -int url_store_compare_last_visit(const void *a, const void *b) { - struct url_content * const *url_a = (struct url_content * const *)a; - struct url_content * const *url_b = (struct url_content * const *)b; - return ((*url_a)->last_visit - (*url_b)->last_visit); -} diff --git a/content/url_store.h b/content/url_store.h deleted file mode 100644 index c10bc90d0..000000000 --- a/content/url_store.h +++ /dev/null @@ -1,61 +0,0 @@ -/* - * This file is part of NetSurf, http://netsurf.sourceforge.net/ - * Licensed under the GNU General Public License, - * http://www.opensource.org/licenses/gpl-license - * Copyright 2005 Richard Wilson - */ - -/** \file - * Central repository for URL data (interface). 
- */ - -#ifndef _NETSURF_CONTENT_URLSTORE_H_ -#define _NETSURF_CONTENT_URLSTORE_H_ - -#include -#include "netsurf/content/content_type.h" - -struct bitmap; - - -struct hostname_data { - char *hostname; /**< Hostname (lowercase) */ - int hostname_length; /**< Length of hostname */ - struct url_data *url; /**< URLs for this host */ - struct hostname_data *previous; /**< Previous hostname */ - struct hostname_data *next; /**< Next hostname */ -}; - - -struct url_content { - struct bitmap *thumbnail; /**< Thumbnail, or NULL */ - char *url; /**< URL (including hostname) */ - char *title; /**< Page title */ - size_t url_length; /**< Length of URL (including hostname) */ - unsigned int visits; /**< Number of times visited */ - time_t last_visit; /**< The time() of the last visit */ - content_type type; /**< The content type */ -}; - -struct url_data { - struct url_content data; /**< Stored URL content data */ - struct url_data *previous; /**< Previous URL */ - struct url_data *next; /**< Next URL */ - struct hostname_data *parent; /**< Parent hostname data */ -}; - -extern struct hostname_data *url_store_hostnames; - -struct url_content *url_store_find(const char *url); -struct url_content *url_store_match(const char *url, struct url_data **reference); -char *url_store_match_string(const char *text); - -void url_store_add_thumbnail(const char *url, struct bitmap *bitmap); -struct bitmap *url_store_get_thumbnail(const char *url); - -void url_store_load(const char *file); -void url_store_save(const char *file); - -int url_store_compare_last_visit(const void *, const void *); - -#endif diff --git a/content/urldb.c b/content/urldb.c new file mode 100644 index 000000000..c7a798a92 --- /dev/null +++ b/content/urldb.c @@ -0,0 +1,2231 @@ +/* + * This file is part of NetSurf, http://netsurf.sourceforge.net/ + * Licensed under the GNU General Public License, + * http://www.opensource.org/licenses/gpl-license + * Copyright 2006 John M Bell + */ + +/** \file + * Unified URL 
information database (implementation) + * + * URLs are stored in a tree-based structure as follows: + * + * The host component is extracted from each URL and, if a FQDN, split on + * every '.'.The tree is constructed by inserting each FQDN segment in + * reverse order. Duplicate nodes are merged. + * + * If the host part of an URL is an IP address, then this is added to the + * tree verbatim (as if it were a TLD). + * + * This provides something looking like: + * + * root (a sentinel) + * | + * ------------------------------------------------- + * | | | | | | | + * com edu gov 127.0.0.1 net org uk TLDs + * | | | | | | + * google ... ... ... ... co 2LDs + * | | + * www bbc Hosts/Subdomains + * | + * www ... + * + * Each of the nodes in this tree is a struct host_part. This stores the + * FQDN segment (or IP address) with which the node is concerned. Each node + * may contain further information about paths on a host (struct path_data) + * or SSL certificate processing on a host-wide basis + * (host_part::permit_invalid_certs). + * + * Path data is concerned with storing various metadata about the path in + * question. This includes global history data, HTTP authentication details + * and any associated HTTP cookies. This is stored as a tree of path segments + * hanging off the relevant host_part node. + * + * Therefore, to find the last visited time of the URL + * http://www.example.com/path/to/resource.html, the FQDN tree would be + * traversed in the order root -> "com" -> "example" -> "www". The "www" + * node would have attached to it a tree of struct path_data: + * + * (sentinel) + * | + * path + * | + * to + * | + * resource.html + * + * This represents the absolute path "/path/to/resource.html". The leaf node + * "resource.html" contains the last visited time of the resource. + * + * The mechanism described above is, however, not particularly conducive to + * fast searching of the database for a given URL (or URLs beginning with a + * given prefix). 
Therefore, an ancillary data structure is used to enable
+ * fast searching. This structure simply reflects the contents of the
+ * database, with entries being added/removed at the same time as for the
+ * core database. In order to ensure that degenerate cases are kept to a
+ * minimum, we use an AAtree. This is an approximation of a Red-Black tree
+ * with similar performance characteristics, but with a significantly
+ * simpler implementation. Entries in this tree comprise pointers to the
+ * leaf nodes of the host tree described above.
+ */
+
+#include <assert.h>
+#include <ctype.h>
+#include <stdbool.h>
+#include <stdio.h>
+#include <stdlib.h>
+#include <string.h>
+#include <time.h>
+#include "netsurf/image/bitmap.h"
of entries in ::fragment */ + char **fragment; /**< Array of fragments */ + + struct bitmap *thumb; /**< Thumbnail image of resource */ + struct url_internal_data url; /**< URL data for resource */ + struct auth_data auth; /**< Authentication data for resource */ + struct cookie *cookies; /**< Cookies associated with resource */ + + struct path_data *next; /**< Next sibling */ + struct path_data *prev; /**< Previous sibling */ + struct path_data *parent; /**< Parent path segment */ + struct path_data *children; /**< Child path segments */ + struct path_data *last; /**< Last child */ +}; + +struct host_part { + /**< Known paths on this host. This _must_ be first so that + * struct host_part *h = (struct host_part *)mypath; works */ + struct path_data paths; + bool permit_invalid_certs; /**< Allow access to SSL protected + * resources on this host without + * verifying certificate authenticity + */ + + char *part; /**< Part of host string */ + + struct host_part *next; /**< Next sibling */ + struct host_part *prev; /**< Previous sibling */ + struct host_part *parent; /**< Parent host part */ + struct host_part *children; /**< Child host parts */ +}; + +struct search_node { + const struct host_part *data; /**< Host tree entry */ + + unsigned int level; /**< Node level */ + + struct search_node *left; /**< Left subtree */ + struct search_node *right; /**< Right subtree */ +}; + +/* Saving */ +static void urldb_save_search_tree(struct search_node *root, FILE *fp); +static void urldb_count_urls(const struct path_data *root, time_t expiry, + unsigned int *count); +static void urldb_write_urls(const struct path_data *parent, + const char *host, FILE *fp, char **path, int *path_alloc, + int *path_used, time_t expiry); + +/* Iteration */ +static bool urldb_iterate_partial_host(struct search_node *root, + const char *prefix, bool (*callback)(const char *url)); +static bool urldb_iterate_partial_path(const struct path_data *parent, + const char *host, const char *prefix, + 
char **path, int *path_alloc, int *path_used, + bool (*callback)(const char *url)); +static bool urldb_iterate_entries_host(struct search_node *parent, + bool (*callback)(const char *url)); +static bool urldb_iterate_entries_path(const char *host, char **path, + int *path_alloc, int *path_used, + const struct path_data *parent, + bool (*callback)(const char *url)); + +/* Insertion */ +static struct host_part *urldb_add_host_node(const char *part, + struct host_part *parent); +static struct host_part *urldb_add_host(const char *host); +static struct path_data *urldb_add_path_node(const char *scheme, + unsigned int port, const char *segment, const char *fragment, + struct path_data *parent); +static struct path_data *urldb_add_path(const char *scheme, + unsigned int port, const struct host_part *host, + const char *path, const char *fragment); +static int urldb_add_path_fragment_cmp(const void *a, const void *b); +static struct path_data *urldb_add_path_fragment(struct path_data *segment, + const char *fragment); + +/* Lookup */ +static struct path_data *urldb_find_url(const char *url); +static struct path_data *urldb_match_path(const struct path_data *parent, + const char *path, const char *scheme, unsigned short port); + +/* Dump */ +static void urldb_dump_hosts(struct host_part *parent); +static void urldb_dump_paths(struct path_data *parent); +static void urldb_dump_search(struct search_node *parent, int depth); + +/* Search tree */ +static struct search_node *urldb_search_insert(struct search_node *root, + const struct host_part *data); +static struct search_node *urldb_search_insert_internal( + struct search_node *root, struct search_node *n); +static struct search_node *urldb_search_remove(struct search_node *root, + const struct host_part *data); +static const struct host_part *urldb_search_find(struct search_node *root, + const char *host); +static struct search_node *urldb_search_skew(struct search_node *root); +static struct search_node 
*urldb_search_split(struct search_node *root); +static int urldb_search_match_host(const struct host_part *a, + const struct host_part *b); +static int urldb_search_match_string(const struct host_part *a, + const char *b); +static int urldb_search_match_prefix(const struct host_part *a, + const char *b); + +/** Root database handle */ +static struct host_part db_root; + +/** Search trees - one per letter + 1 for IPs */ +#define NUM_SEARCH_TREES 27 +#define ST_IP 0 +#define ST_DN 1 +static struct search_node empty = { 0, 0, &empty, &empty }; +static struct search_node *search_trees[NUM_SEARCH_TREES] = { + &empty, &empty, &empty, &empty, &empty, &empty, &empty, &empty, + &empty, &empty, &empty, &empty, &empty, &empty, &empty, &empty, + &empty, &empty, &empty, &empty, &empty, &empty, &empty, &empty, + &empty, &empty, &empty +}; + +/** + * Import an URL database from file, replacing any existing database + * + * \param filename Name of file containing data + */ +void urldb_load(const char *filename) +{ +#define MAXIMUM_URL_LENGTH 4096 + char s[MAXIMUM_URL_LENGTH]; + struct host_part *h; + int urls; + int i; + int version; + int length; + FILE *fp; + + /** \todo optimise */ + + assert(filename); + + LOG(("Loading URL file")); + + fp = fopen(filename, "r"); + if (!fp) { + LOG(("Failed to open file '%s' for reading", filename)); + return; + } + + if (!fgets(s, MAXIMUM_URL_LENGTH, fp)) + return; + version = atoi(s); + if (version < 105) { + LOG(("Unsupported URL file version.")); + return; + } + if (version > 105) { + LOG(("Unknown URL file version.")); + return; + } + + while (fgets(s, MAXIMUM_URL_LENGTH, fp)) { + /* get the hostname */ + length = strlen(s) - 1; + s[length] = '\0'; + + /* skip data that has ended up with a host of '' */ + if (length == 0) { + if (!fgets(s, MAXIMUM_URL_LENGTH, fp)) + break; + urls = atoi(s); + for (i = 0; i < (6 * urls); i++) + if (!fgets(s, MAXIMUM_URL_LENGTH, fp)) + break; + continue; + } + + h = urldb_add_host(s); + if (!h) + 
die("Memory exhausted whilst loading URL file"); + + if (!fgets(s, MAXIMUM_URL_LENGTH, fp)) + break; + urls = atoi(s); + + /* load the non-corrupt data */ + for (i = 0; i < urls; i++) { + struct path_data *p = NULL; + + if (!fgets(s, MAXIMUM_URL_LENGTH, fp)) + break; + length = strlen(s) - 1; + s[length] = '\0'; + + urldb_add_url(s); + p = urldb_find_url(s); + + if (!fgets(s, MAXIMUM_URL_LENGTH, fp)) + break; + if (p) + p->url.visits = (unsigned int)atoi(s); + + if (!fgets(s, MAXIMUM_URL_LENGTH, fp)) + break; + if (p) + p->url.last_visit = (time_t)atoi(s); + + if (!fgets(s, MAXIMUM_URL_LENGTH, fp)) + break; + if (p) + p->url.type = (content_type)atoi(s); + + if (!fgets(s, MAXIMUM_URL_LENGTH, fp)) + break; +#ifdef riscos + if (p && strlen(s) == 12) { + /* ensure filename is 'XX.XX.XX.XX' */ + if ((s[2] == '.') && (s[5] == '.') && + (s[8] == '.')) { + s[11] = '\0'; + p->thumb = bitmap_create_file(s); + } + } +#endif + + if (!fgets(s, MAXIMUM_URL_LENGTH, fp)) + break; + length = strlen(s) - 1; + if (p && length > 0) { + s[length] = '\0'; + p->url.title = malloc(length + 1); + if (p->url.title) + memcpy(p->url.title, s, length + 1); + } + } + } + + fclose(fp); + LOG(("Successfully loaded URL file")); +#undef MAXIMUM_URL_LENGTH +} + +/** + * Export the current database to file + * + * \param filename Name of file to export to + */ +void urldb_save(const char *filename) +{ + FILE *fp; + int i; + + assert(filename); + + fp = fopen(filename, "w"); + if (!fp) { + LOG(("Failed to open file '%s' for writing", filename)); + return; + } + + /* file format version number */ + fprintf(fp, "105\n"); + + for (i = 0; i != NUM_SEARCH_TREES; i++) { + urldb_save_search_tree(search_trees[i], fp); + } + + fclose(fp); +} + +/** + * Save a search (sub)tree + * + * \param root Root of (sub)tree to save + * \param fp File to write to + */ +void urldb_save_search_tree(struct search_node *parent, FILE *fp) +{ + char host[256]; + const struct host_part *h; + unsigned int path_count = 0; + char 
*path, *p, *end; + int path_alloc = 64, path_used = 2; + time_t expiry = time(NULL) - (60 * 60 * 24) * option_expire_url; + + if (parent == &empty) + return; + + urldb_save_search_tree(parent->left, fp); + + path = malloc(path_alloc); + if (!path) + return; + + path[0] = '/'; + path[1] = '\0'; + + for (h = parent->data, p = host, end = host + sizeof host; + h && h != &db_root && p < end; h = h->parent) { + int written = snprintf(p, end - p, "%s%s", h->part, + (h->parent && h->parent->parent) ? "." : ""); + if (written < 0) { + free(path); + return; + } + p += written; + } + + urldb_count_urls(&parent->data->paths, expiry, &path_count); + + if (path_count > 0) { + fprintf(fp, "%s\n%i\n", host, path_count); + + urldb_write_urls(&parent->data->paths, host, fp, + &path, &path_alloc, &path_used, expiry); + } + + free(path); + + urldb_save_search_tree(parent->right, fp); +} + +/** + * Count number of URLs associated with a host + * + * \param root Root of path data tree + * \param expiry Expiry time for URLs + * \param count Pointer to count + */ +void urldb_count_urls(const struct path_data *root, time_t expiry, + unsigned int *count) +{ + const struct path_data *p; + + if (!root->children) { + if ((root->url.last_visit > expiry) && + (root->url.visits > 0)) + (*count)++; + } + + for (p = root->children; p; p = p->next) + urldb_count_urls(p, expiry, count); +} + +/** + * Write URLs associated with a host + * + * \param parent Root of (sub)tree to write + * \param host Current host name + * \param fp File to write to + * \param path Current path string + * \param path_alloc Allocated size of path + * \param path_used Used size of path + * \param expiry Expiry time of URLs + */ +void urldb_write_urls(const struct path_data *parent, const char *host, + FILE *fp, char **path, int *path_alloc, int *path_used, + time_t expiry) +{ + const struct path_data *p; + int i; + int pused = *path_used; + + if (!parent->children) { + /* leaf node */ + if (!((parent->url.last_visit > 
expiry) && + (parent->url.visits > 0))) + /* expired */ + return; + + fprintf(fp, "%s://%s", parent->scheme, host); + + if (parent->port) + fprintf(fp,":%d", parent->port); + + fprintf(fp, "%s\n", *path); + + /** \todo handle fragments? */ + + fprintf(fp, "%i\n%i\n%i\n", parent->url.visits, + (int)parent->url.last_visit, + (int)parent->url.type); + +#ifdef riscos + if (parent->thumb) + fprintf(fp, "%s\n", parent->thumb->filename); +#else + fprintf(fp, "\n"); +#endif + + if (parent->url.title) { + char *s = parent->url.title; + for (i = 0; s[i] != '\0'; i++) + if (s[i] < 32) + s[i] = ' '; + for (--i; ((i > 0) && (s[i] == ' ')); i--) + s[i] = '\0'; + fprintf(fp, "%s\n", parent->url.title); + } else + fprintf(fp, "\n"); + } + + for (p = parent->children; p; p = p->next) { + int len = *path_used + strlen(p->segment) + 1; + if (*path_alloc < len) { + char *temp = realloc(*path, + (len > 64) ? len : *path_alloc + 64); + if (!temp) + return; + *path = temp; + *path_alloc = (len > 64) ? len : *path_alloc + 64; + } + + strcat(*path, p->segment); + if (p->children) { + strcat(*path, "/"); + } else { + len -= 1; + } + + *path_used = len; + + urldb_write_urls(p, host, fp, path, path_alloc, path_used, + expiry); + + /* restore path to its state on entry to this function */ + *path_used = pused; + (*path)[pused - 1] = '\0'; + } +} + +/** + * Insert an URL into the database + * + * \param url URL to insert + * \return true on success, false otherwise + */ +bool urldb_add_url(const char *url) +{ + struct host_part *h; + struct path_data *p; + char *fragment = NULL, *host, *plq, *scheme, *colon; + unsigned short port; + url_func_result ret; + assert(url); + + /** \todo consider file: URLs */ + + host = strchr(url, '#'); + if (host) { + fragment = strdup(host+1); + if (!fragment) + return false; + } + + /* extract host */ + ret = url_host(url, &host); + if (ret != URL_FUNC_OK) + return false; + + /* extract path, leafname, query */ + ret = url_plq(url, &plq); + if (ret != 
URL_FUNC_OK) { + free(host); + free(fragment); + return false; + } + + /* extract scheme */ + ret = url_scheme(url, &scheme); + if (ret != URL_FUNC_OK) { + free(plq); + free(host); + free(fragment); + return false; + } + + colon = strrchr(host, ':'); + if (!colon) { + port = 0; + } else { + *colon = '\0'; + port = atoi(colon + 1); + } + + /* Get host entry */ + h = urldb_add_host(host); + if (!h) { + free(scheme); + free(plq); + free(host); + free(fragment); + return false; + } + + /* Get path entry */ + p = urldb_add_path(scheme, port, h, plq, fragment); + if (!p) { + free(scheme); + free(plq); + free(host); + free(fragment); + return false; + } + + return true; +} + +/** + * Set an URL's title string, replacing any existing one + * + * \param url The URL to look for + * \param title The title string to use (copied) + */ +void urldb_set_url_title(const char *url, const char *title) +{ + struct path_data *p; + char *temp; + + assert(url && title); + + p = urldb_find_url(url); + if (!p) + return; + + temp = strdup(title); + if (!temp) + return; + + free(p->url.title); + p->url.title = temp; +} + +/** + * Set an URL's content type + * + * \param url The URL to look for + * \param type The type to set + */ +void urldb_set_url_content_type(const char *url, content_type type) +{ + struct path_data *p; + + assert(url); + + p = urldb_find_url(url); + if (!p) + return; + + p->url.type = type; +} + +/** + * Update an URL's visit data + * + * \param url The URL to update + */ +void urldb_update_url_visit_data(const char *url) +{ + struct path_data *p; + + assert(url); + + p = urldb_find_url(url); + if (!p) + return; + + p->url.last_visit = time(NULL); + p->url.visits++; +} + +/** + * Reset an URL's visit statistics + * + * \param url The URL to reset + */ +void urldb_reset_url_visit_data(const char *url) +{ + struct path_data *p; + + assert(url); + + p = urldb_find_url(url); + if (!p) + return; + + p->url.last_visit = (time_t)0; + p->url.visits = 0; +} + + +/** + * Find data 
for an URL. + * + * \param url Absolute URL to look for + * \return Pointer to result struct, or NULL + */ +const struct url_data *urldb_get_url_data(const char *url) +{ + struct path_data *p; + + assert(url); + + p = urldb_find_url(url); + if (!p) + return NULL; + + return (struct url_data *)&p->url; +} + +/** + * Look up authentication details in database + * + * \param url Absolute URL to search for + * \return Pointer to authentication details, or NULL if not found + */ +const char *urldb_get_auth_details(const char *url) +{ + struct path_data *p, *q; + + assert(url); + + /* add to the db, so our lookup will work */ + urldb_add_url(url); + + p = urldb_find_url(url); + if (!p) + return NULL; + + for (; p; p = p->parent) { + /* The parent path entry is stored hung off the + * parent entry with an empty (not NULL) segment string. + * We look for this here. + */ + for (q = p->children; q; q = q->next) { + if (strlen(q->segment) == 0) + break; + } + + if (q && q->auth.realm && q->auth.auth) + break; + } + + if (!q) + return NULL; + + return q->auth.auth; +} + +/** + * Retrieve certificate verification permissions from database + * + * \param url Absolute URL to search for + * \return true to permit connections to hosts with invalid certificates, + * false otherwise. 
+ */ +bool urldb_get_cert_permissions(const char *url) +{ + struct path_data *p; + struct host_part *h; + + assert(url); + + p = urldb_find_url(url); + if (!p) + return false; + + for (; p && p->parent; p = p->parent) + /* do nothing */; + + h = (struct host_part *)p; + + return h->permit_invalid_certs; +} + +/** + * Set authentication data for an URL + * + * \param url The URL to consider + * \param realm The authentication realm + * \param auth The authentication details (in form username:password) + */ +void urldb_set_auth_details(const char *url, const char *realm, + const char *auth) +{ + struct path_data *p; + char *t1, *t2; + + assert(url && realm && auth); + + /* add url, in case it's missing */ + urldb_add_url(url); + + p = urldb_find_url(url); + if (!p) + return; + + /** \todo search subtree for same realm/auth details + * and remove them (as the lookup routine searches up the tree) */ + + t1 = strdup(realm); + t2 = strdup(auth); + + if (!t1 || !t2) { + free(t1); + free(t2); + return; + } + + free(p->auth.realm); + free(p->auth.auth); + + p->auth.realm = t1; + p->auth.auth = t2; +} + +/** + * Set certificate verification permissions + * + * \param url URL to consider + * \param permit Set to true to allow invalid certificates + */ +void urldb_set_cert_permissions(const char *url, bool permit) +{ + struct path_data *p; + struct host_part *h; + + assert(url); + + /* add url, in case it's missing */ + urldb_add_url(url); + + p = urldb_find_url(url); + if (!p) + return; + + for (; p && p->parent; p = p->parent) + /* do nothing */; + + h = (struct host_part *)p; + + h->permit_invalid_certs = permit; +} + +/** + * Set thumbnail for url, replacing any existing thumbnail + * + * \param url Absolute URL to consider + * \param bitmap Opaque pointer to thumbnail data + */ +void urldb_set_thumbnail(const char *url, struct bitmap *bitmap) +{ + struct path_data *p; + + assert(url && bitmap); + + p = urldb_find_url(url); + if (!p) + return; + + if (p->thumb) + 
bitmap_destroy(p->thumb); + + p->thumb = bitmap; +} + +/** + * Retrieve thumbnail data for given URL + * + * \param url Absolute URL to search for + * \return Pointer to thumbnail data, or NULL if not found. + */ +const struct bitmap *urldb_get_thumbnail(const char *url) +{ + struct path_data *p; + + assert(url); + + p = urldb_find_url(url); + if (!p) + return NULL; + + return p->thumb; +} + +/** + * Iterate over entries in the database which match the given prefix + * + * \param prefix Prefix to match + * \param callback Callback function + */ +void urldb_iterate_partial(const char *prefix, + bool (*callback)(const char *url)) +{ + char host[256]; + char buf[260]; /* max domain + "www." */ + const char *slash; + struct search_node *tree; + const struct host_part *h; + + assert(prefix && callback); + + slash = strchr(prefix, '/'); + + if (*prefix >= '0' && *prefix <= '9') + tree = search_trees[ST_IP]; + else if (isalpha(*prefix)) + tree = search_trees[ST_DN + tolower(*prefix) - 'a']; + else + return; + + if (slash) { + /* if there's a slash in the input, then we can + * assume that we're looking for a path */ + char *path, *domain = host; + int path_alloc = 64, path_used = 2; + + snprintf(host, sizeof host, "%.*s", slash - prefix, prefix); + + h = urldb_search_find(tree, host); + if (!h) { + int len = slash - prefix; + + if ((len == 1 && tolower(host[0]) != 'w') || + (len == 2 && (tolower(host[0]) != 'w' || + tolower(host[1]) != 'w')) || + (len >= 3 && + strncasecmp(host, "www", 3))) { + snprintf(buf, sizeof buf, "www.%s", host); + h = urldb_search_find( + search_trees[ST_DN + 'w' - 'a'], + buf); + if (!h) + return; + domain = buf; + } else + return; + } + + path = malloc(path_alloc); + if (!path) + return; + + path[0] = '/'; + path[1] = '\0'; + + urldb_iterate_partial_path(&h->paths, domain, slash + 1, + &path, &path_alloc, &path_used, callback); + + free(path); + } else { + int len = strlen(prefix); + + /* looking for hosts */ + if 
(!urldb_iterate_partial_host(tree, prefix, callback)) + return; + + if ((len == 1 && tolower(prefix[0]) != 'w') || + (len == 2 && (tolower(prefix[0]) != 'w' || + tolower(prefix[1]) != 'w')) || + (len >= 3 && + strncasecmp(prefix, "www", 3))) { + /* now look for www.prefix */ + snprintf(buf, sizeof buf, "www.%s", prefix); + if(!urldb_iterate_partial_host( + search_trees[ST_DN + 'w' - 'a'], + buf, callback)) + return; + } + } +} + +/** + * Partial host iterator (internal) + * + * \param root Root of (sub)tree to traverse + * \param prefix Prefix to match + * \param callback Callback function + * \return true to continue, false otherwise + */ +bool urldb_iterate_partial_host(struct search_node *root, const char *prefix, + bool (*callback)(const char *url)) +{ + int c; + const struct host_part *h; + char domain[256], *p, *end; + char *path; + int path_alloc = 64, path_used = 2; + + assert(root && prefix && callback); + + if (root == &empty) + return true; + + c = urldb_search_match_prefix(root->data, prefix); + + if (c > 0) + /* No match => look in left subtree */ + return urldb_iterate_partial_host(root->left, prefix, + callback); + else if (c < 0) + /* No match => look in right subtree */ + return urldb_iterate_partial_host(root->right, prefix, + callback); + else { + /* Match => iterate over l/r subtrees & process this node */ + if (!urldb_iterate_partial_host(root->left, prefix, + callback)) + return false; + + /* Generate host string */ + for (h = root->data, p = domain, + end = domain + sizeof domain; + h && h != &db_root && p < end; + h = h->parent) { + int written = snprintf(p, end - p, "%s%s", h->part, + (h->parent && h->parent->parent) ? "." 
: ""); + if (written < 0) + return false; + p += written; + } + + path = malloc(path_alloc); + if (!path) + return false; + + path[0] = '/'; + path[1] = '\0'; + + /* and extract all paths attached to this host */ + if (!urldb_iterate_entries_path(domain, &path, &path_alloc, + &path_used, &root->data->paths, callback)) { + free(path); + return false; + } + + free(path); + + if (!urldb_iterate_partial_host(root->right, prefix, + callback)) + return false; + } + + return true; +} + +/** + * Partial path iterator (internal) + * + * \param parent Root of (sub)tree to traverse + * \param host Host string + * \param prefix Prefix to match + * \param path The built path up to this point + * \param path_alloc Allocated size of path + * \param path_used Used size of path + * \param callback Callback function + * \return true to continue, false otherwise + */ +bool urldb_iterate_partial_path(const struct path_data *parent, + const char *host, const char *prefix, + char **path, int *path_alloc, int *path_used, + bool (*callback)(const char *url)) +{ + const struct path_data *p; + const char *slash, *end = prefix + strlen(prefix); + int pused = *path_used; + int c; + + slash = strchr(prefix, '/'); + if (!slash) + slash = end; + + if (slash == prefix && *prefix == '/') + /* Ignore "//" */ + return true; + + for (p = parent->children; p; p = p->next) { + if ((c = strncasecmp(p->segment, prefix, slash - prefix)) < 0) + /* didn't match, but may be more */ + continue; + else if (c > 0) + /* no more possible matches */ + break; + + /* prefix matches so far */ + int len = *path_used + strlen(p->segment) + 1; + if (*path_alloc < len) { + char *temp = realloc(*path, + (len > 64) ? len : *path_alloc + 64); + if (!temp) + return false; + *path = temp; + *path_alloc = (len > 64) ? 
len : *path_alloc + 64; + } + + strcat(*path, p->segment); + if (p->children) + strcat(*path, "/"); + else + len -= 1; + + *path_used = len; + + if (slash == end) { + /* we've run out of prefix, so all + * paths below this one match */ + if (!urldb_iterate_entries_path(host, path, + path_alloc, path_used, p, callback)) + return false; + } else { + /* more prefix to go => recurse */ + if (!urldb_iterate_partial_path(p, host, slash + 1, + path, path_alloc, path_used, + callback)) + return false; + } + + /* restore path to that from input for next child */ + *path_used = pused; + (*path)[pused - 1] = '\0'; + } + + return true; +} + +/** + * Iterate over all entries in database + * + * \param callback Function to callback for each entry + */ +void urldb_iterate_entries(bool (*callback)(const char *url)) +{ + int i; + + assert(callback); + + for (i = 0; i < NUM_SEARCH_TREES; i++) { + if (!urldb_iterate_entries_host(search_trees[i], + callback)) + break; + } +} + +/** + * Host data iterator (internal) + * + * \param parent Root of subtree to iterate over + * \param callback Callback function + * \return true to continue, false otherwise + */ +bool urldb_iterate_entries_host(struct search_node *parent, + bool (*callback)(const char *url)) +{ + char domain[256], *p, *end; + const struct host_part *h; + char *path; + int path_alloc = 64, path_used = 2; + + if (parent == &empty) + return true; + + if (!urldb_iterate_entries_host(parent->left, callback)) + return false; + + for (h = parent->data, p = domain, end = domain + sizeof domain; + h && h != &db_root && p < end; h = h->parent) { + int written = snprintf(p, end - p, "%s%s", h->part, + (h->parent && h->parent->parent) ? "." 
: ""); + if (written < 0) + return false; + p += written; + } + + path = malloc(path_alloc); + if (!path) + return false; + + path[0] = '/'; + path[1] = '\0'; + + if (!urldb_iterate_entries_path(domain, &path, &path_alloc, + &path_used, &parent->data->paths, callback)) { + free(path); + return false; + } + + free(path); + + if (!urldb_iterate_entries_host(parent->right, callback)) + return false; + + return true; +} + +/** + * Path data iterator (internal) + * + * \param host Host component of output URI + * \param path The built path up to this point + * \param path_alloc Allocated size of path + * \param path_used Used size of path + * \param parent Root of subtree to iterate over + * \param callback Callback function to call + * \return true to continue, false otherwise + */ +bool urldb_iterate_entries_path(const char *host, char **path, + int *path_alloc, int *path_used, + const struct path_data *parent, + bool (*callback)(const char *url)) +{ + const struct path_data *p; + int pused = *path_used; + + if (!parent->children) { + /* leaf node */ + int schemelen = strlen(parent->scheme); + int hostlen = strlen(host); + int prefixlen = schemelen + 3 /* :// */ + + hostlen + 6 /* :NNNNN */; + static char *url; + static int url_alloc; + int written; + + if (url_alloc < *path_used + prefixlen + 2) { + char *temp = realloc(url, *path_used + prefixlen + 2); + if (!temp) + return false; + url = temp; + url_alloc = *path_used + prefixlen + 2; + } + + written = sprintf(url, "%s://%s", parent->scheme, host); + if (written < 0) { + return false; + } + + if (parent->port) { + written = sprintf(url + schemelen + 3 + hostlen, + ":%d", parent->port); + if (written < 0) { + return false; + } + written += schemelen + 3 + hostlen; + } + + written = sprintf(url + written, "%s", *path); + if (written < 0) { + return false; + } + + /** \todo handle fragments? 
*/ + + if (!callback(url)) + return false; + } + + for (p = parent->children; p; p = p->next) { + int len = *path_used + strlen(p->segment) + 1; + if (*path_alloc < len) { + char *temp = realloc(*path, + (len > 64) ? len : *path_alloc + 64); + if (!temp) + return false; + *path = temp; + *path_alloc = (len > 64) ? len : *path_alloc + 64; + } + + strcat(*path, p->segment); + if (p->children) { + strcat(*path, "/"); + } else { + len -= 1; + } + + *path_used = len; + + if (!urldb_iterate_entries_path(host, path, path_alloc, + path_used, p, callback)) + return false; + + /* restore path to its state on entry to this function */ + *path_used = pused; + (*path)[pused - 1] = '\0'; + } + + return true; +} + +/** + * Add a host node to the tree + * + * \param part Host segment to add (or whole IP address) (copied) + * \param parent Parent node to add to + * \return Pointer to added node, or NULL on memory exhaustion + */ +struct host_part *urldb_add_host_node(const char *part, + struct host_part *parent) +{ + struct host_part *d; + + assert(part && parent); + + d = calloc(1, sizeof(struct host_part)); + if (!d) + return NULL; + + d->part = strdup(part); + if (!d->part) { + free(d); + return NULL; + } + + d->next = parent->children; + if (parent->children) + parent->children->prev = d; + d->parent = parent; + parent->children = d; + + return d; +} + +/** + * Add a host to the database, creating any intermediate entries + * + * \param host Hostname to add + * \return Pointer to leaf node, or NULL on memory exhaustion + */ +struct host_part *urldb_add_host(const char *host) +{ + struct host_part *d = (struct host_part *) &db_root, *e; + struct search_node *s; + char buf[256]; /* 256 bytes is sufficient - domain names are + * limited to 255 chars. 
*/ + char *part; + + assert(host); + + if (*(host) >= '0' && *(host) <= '9') { + /* Host is an IP, so simply add as TLD */ + + /* Check for existing entry */ + for (e = d->children; e; e = e->next) + if (strcasecmp(host, e->part) == 0) + /* found => return it */ + return e; + + d = urldb_add_host_node(host, d); + + s = urldb_search_insert(search_trees[ST_IP], d); + if (!s) { + /* failed */ + d = NULL; + } else { + search_trees[ST_IP] = s; + } + + return d; + } + + /* Copy host string, so we can corrupt it */ + strncpy(buf, host, sizeof buf); + buf[sizeof buf - 1] = '\0'; + + /* Process FQDN segments backwards */ + do { + part = strrchr(buf, '.'); + if (!part) { + /* last segment */ + /* Check for existing entry */ + for (e = d->children; e; e = e->next) + if (strcasecmp(buf, e->part) == 0) + break; + + if (e) { + d = e; + } else { + d = urldb_add_host_node(buf, d); + } + + /* And insert into search tree */ + if (d) { + if (isalpha(*buf)) { + struct search_node **r; + r = &search_trees[ + tolower(*buf) - 'a' + ST_DN]; + + s = urldb_search_insert(*r, d); + if (!s) { + /* failed */ + d = NULL; + } else { + *r = s; + } + } else { + d = NULL; + } + } + break; + } + + /* Check for existing entry */ + for (e = d->children; e; e = e->next) + if (strcasecmp(part + 1, e->part) == 0) + break; + + d = e ? 
e : urldb_add_host_node(part + 1, d); + if (!d) + break; + + *part = '\0'; + } while (1); + + return d; +} + +/** + * Add a path node to the tree + * + * \param scheme URL scheme associated with path (copied) + * \param port Port number on host associated with path + * \param segment Path segment to add (copied) + * \param fragment URL fragment (copied), or NULL + * \param parent Parent node to add to + * \return Pointer to added node, or NULL on memory exhaustion + */ +struct path_data *urldb_add_path_node(const char *scheme, unsigned int port, + const char *segment, const char *fragment, + struct path_data *parent) +{ + struct path_data *d, *e; + + assert(scheme && segment && parent); + + d = calloc(1, sizeof(struct path_data)); + if (!d) + return NULL; + + d->scheme = strdup(scheme); + if (!d->scheme) { + free(d); + return NULL; + } + + d->port = port; + + d->segment = strdup(segment); + if (!d->segment) { + free(d->scheme); + free(d); + return NULL; + } + + if (fragment) { + if (!urldb_add_path_fragment(d, fragment)) { + free(d->segment); + free(d->scheme); + free(d); + return NULL; + } + } + + for (e = parent->children; e; e = e->next) + if (strcmp(e->segment, d->segment) > 0) + break; + + if (e) { + d->prev = e->prev; + d->next = e; + if (e->prev) + e->prev->next = d; + else + parent->children = d; + e->prev = d; + } else if (!parent->children) { + d->prev = d->next = NULL; + parent->children = parent->last = d; + } else { + d->next = NULL; + d->prev = parent->last; + parent->last->next = d; + parent->last = d; + } + d->parent = parent; + + return d; +} + +/** + * Add a path to the database, creating any intermediate entries + * + * \param scheme URL scheme associated with path + * \param port Port number on host associated with path + * \param host Host tree node to attach to + * \param path Absolute path to add + * \param fragment URL fragment, or NULL + * \return Pointer to leaf node, or NULL on memory exhaustion + */ +struct path_data 
*urldb_add_path(const char *scheme, unsigned int port, + const struct host_part *host, const char *path, + const char *fragment) +{ + struct path_data *d, *e; + char *buf; + char *segment, *slash; + + assert(scheme && host && path); + + d = (struct path_data *) &host->paths; + + /* Copy path string, so we can corrupt it */ + buf = malloc(strlen(path) + 1); + if (!buf) + return NULL; + + /* + 1 to strip leading '/' */ + strcpy(buf, path + 1); + + segment = buf; + + /* Process path segments */ + do { + slash = strchr(segment, '/'); + if (!slash) { + /* last segment */ + /* look for existing entry */ + for (e = d->children; e; e = e->next) + if (strcmp(segment, e->segment) == 0 && + strcasecmp(scheme, + e->scheme) == 0 && + e->port == port) + break; + + d = e ? urldb_add_path_fragment(e, fragment) : + urldb_add_path_node(scheme, port, + segment, fragment, d); + break; + } + + *slash = '\0'; + + /* look for existing entry */ + for (e = d->children; e; e = e->next) + if (strcmp(segment, e->segment) == 0 && + strcasecmp(scheme, e->scheme) == 0 && + e->port == port) + break; + + d = e ? 
e : urldb_add_path_node(scheme, port, segment, + NULL, d); + if (!d) + break; + + segment = slash + 1; + } while (1); + + free(buf); + + return d; +} + +/** + * Fragment comparator callback for qsort + */ +int urldb_add_path_fragment_cmp(const void *a, const void *b) +{ + return strcasecmp(*((const char **) a), *((const char **) b)); +} + +/** + * Add a fragment to a path segment + * + * \param segment Path segment to add to + * \param fragment Fragment to add (copied), or NULL + * \return segment or NULL on memory exhaustion + */ +struct path_data *urldb_add_path_fragment(struct path_data *segment, + const char *fragment) +{ + char **temp; + + assert(segment); + + /* If no fragment, this function is a NOP + * This may seem strange, but it makes the rest + * of the code cleaner */ + if (!fragment) + return segment; + + temp = realloc(segment->fragment, + (segment->frag_cnt + 1) * sizeof(char *)); + if (!temp) + return NULL; + + segment->fragment = temp; + segment->fragment[segment->frag_cnt] = strdup(fragment); + if (!segment->fragment[segment->frag_cnt]) { + /* Don't free temp - it's now our buffer */ + return NULL; + } + + segment->frag_cnt++; + + /* We want fragments in alphabetical order, so sort them + * It may prove better to insert in alphabetical order instead */ + qsort(segment->fragment, segment->frag_cnt, sizeof (char *), + urldb_add_path_fragment_cmp); + + return segment; +} + +/** + * Find an URL in the database + * + * \param url The URL to find + * \return Pointer to path data, or NULL if not found + */ +struct path_data *urldb_find_url(const char *url) +{ + const struct host_part *h; + struct path_data *p; + struct search_node *tree; + char *host, *plq, *scheme, *colon; + unsigned short port; + url_func_result ret; + + assert(url); + + /** \todo consider file: URLs */ + + /* extract host */ + ret = url_host(url, &host); + if (ret != URL_FUNC_OK) + return NULL; + + /* extract path, leafname, query */ + ret = url_plq(url, &plq); + if (ret != 
URL_FUNC_OK) { + free(host); + return NULL; + } + + /* extract scheme */ + ret = url_scheme(url, &scheme); + if (ret != URL_FUNC_OK) { + free(plq); + free(host); + return NULL; + } + + colon = strrchr(host, ':'); + if (!colon) { + port = 0; + } else { + *colon = '\0'; + port = atoi(colon + 1); + } + + if (*host >= '0' && *host <= '9') + tree = search_trees[ST_IP]; + else if (isalpha(*host)) + tree = search_trees[ST_DN + tolower(*host) - 'a']; + else { + free(plq); + free(host); + free(scheme); + return NULL; + } + + h = urldb_search_find(tree, host); + if (!h) { + free(plq); + free(host); + free(scheme); + return NULL; + } + + p = urldb_match_path(&h->paths, plq, scheme, port); + + free(plq); + free(host); + free(scheme); + + return p; +} + +/** + * Match a path string + * + * \param parent Path (sub)tree to look in + * \param path The path to search for + * \param scheme The URL scheme associated with the path + * \param port The port associated with the path + * \return Pointer to path data or NULL if not found. 
+ */ +struct path_data *urldb_match_path(const struct path_data *parent, + const char *path, const char *scheme, unsigned short port) +{ + struct path_data *p; + const char *slash; + + if (*path == '\0') + return (struct path_data *)parent; + + slash = strchr(path + 1, '/'); + if (!slash) + slash = path + strlen(path); + + for (p = parent->children; p; p = p->next) { + if (strncmp(p->segment, path + 1, slash - path - 1) == 0 && + strcmp(p->scheme, scheme) == 0 && + p->port == port) + break; + } + + if (p) { + return urldb_match_path(p, slash, scheme, port); + } + + return NULL; +} + +/** + * Dump URL database to stderr + */ +void urldb_dump(void) +{ + int i; + + urldb_dump_hosts(&db_root); + + for (i = 0; i != NUM_SEARCH_TREES; i++) + urldb_dump_search(search_trees[i], 0); +} + +/** + * Dump URL database hosts to stderr + * + * \param parent Parent node of tree to dump + */ +void urldb_dump_hosts(struct host_part *parent) +{ + struct host_part *h; + + if (parent->part) { + LOG(("%s", parent->part)); + + LOG(("\t%s invalid SSL certs", + parent->permit_invalid_certs ? 
"Permits" : "Denies")); + } + + /* Dump path data */ + urldb_dump_paths(&parent->paths); + + /* and recurse */ + for (h = parent->children; h; h = h->next) + urldb_dump_hosts(h); +} + +/** + * Dump URL database paths to stderr + * + * \param parent Parent node of tree to dump + */ +void urldb_dump_paths(struct path_data *parent) +{ + struct path_data *p; + unsigned int i; + + if (parent->segment) { + LOG(("\t%s : %u", parent->scheme, parent->port)); + + LOG(("\t\t'%s'", parent->segment)); + + for (i = 0; i != parent->frag_cnt; i++) + LOG(("\t\t\t#%s", parent->fragment[i])); + } + + /* and recurse */ + for (p = parent->children; p; p = p->next) + urldb_dump_paths(p); +} + +/** + * Dump search tree + * + * \param parent Parent node of tree to dump + * \param depth Tree depth + */ +void urldb_dump_search(struct search_node *parent, int depth) +{ + const struct host_part *h; + int i; + + if (parent == &empty) + return; + + urldb_dump_search(parent->left, depth + 1); + + for (i = 0; i != depth; i++) + fputc(' ', stderr); + + for (h = parent->data; h; h = h->parent) { + fprintf(stderr, "%s", h->part); + if (h->parent && h->parent->parent) + fputc('.', stderr); + } + + fputc('\n', stderr); + + urldb_dump_search(parent->right, depth + 1); +} + +/** + * Insert a node into the search tree + * + * \param root Root of tree to insert into + * \param data User data to insert + * \return Pointer to updated root, or NULL if failed + */ +struct search_node *urldb_search_insert(struct search_node *root, + const struct host_part *data) +{ + struct search_node *n; + + assert(root && data); + + n = malloc(sizeof(struct search_node)); + if (!n) + return NULL; + + n->level = 1; + n->data = data; + n->left = n->right = ∅ + + root = urldb_search_insert_internal(root, n); + + return root; +} + +/** + * Insert node into search tree + * + * \param root Root of (sub)tree to insert into + * \param n Node to insert + * \return Pointer to updated root + */ +struct search_node 
*urldb_search_insert_internal(struct search_node *root, + struct search_node *n) +{ + assert(root && n); + + if (root == &empty) { + root = n; + } else { + int c = urldb_search_match_host(root->data, n->data); + + if (c > 0) { + root->left = urldb_search_insert_internal( + root->left, n); + } else if (c < 0) { + root->right = urldb_search_insert_internal( + root->right, n); + } else { + /* exact match */ + free(n); + return root; + } + + root = urldb_search_skew(root); + root = urldb_search_split(root); + } + + return root; +} + +/** + * Delete a node from a search tree + * + * \param root Tree to remove from + * \param data Data to delete + * \return Updated root of tree + */ +struct search_node *urldb_search_remove(struct search_node *root, + const struct host_part *data) +{ + static struct search_node *last, *deleted; + + assert(root && data); + + if (root != &empty) { + int c = urldb_search_match_host(root->data, data); + + last = root; + if (c > 0) { + root->left = urldb_search_remove(root->left, data); + } else { + deleted = root; + root->right = urldb_search_remove(root->right, data); + } + } + + if (root == last) { + if (deleted != &empty && + urldb_search_match_host(deleted->data, + data) == 0) { + deleted->data = last->data; + deleted = ∅ + root = root->right; + } + } else { + if (root->left->level < root->level - 1 || + root->right->level < root->level - 1) { + if (root->right->level > --root->level) + root->right->level = root->level; + + root = urldb_search_skew(root); + root->right = urldb_search_skew(root->right); + root->right->right = + urldb_search_skew(root->right->right); + root = urldb_search_split(root); + root->right = urldb_search_split(root->right); + } + } + + return root; +} + +/** + * Find a node in a search tree + * + * \param root Tree to look in + * \param host Host to find + * \return Pointer to host tree node, or NULL if not found + */ +const struct host_part *urldb_search_find(struct search_node *root, + const char *host) +{ + int 
c; + + assert(root && host); + + if (root == &empty) { + return NULL; + } + + c = urldb_search_match_string(root->data, host); + + if (c > 0) + return urldb_search_find(root->left, host); + else if (c < 0) + return urldb_search_find(root->right, host); + else + return root->data; +} + +/** + * Compare a pair of host_parts + * + * \param a + * \param b + * \return 0 if match, non-zero, otherwise + */ +int urldb_search_match_host(const struct host_part *a, + const struct host_part *b) +{ + int ret; + + assert(a && b); + + /* traverse up tree to root, comparing parts as we go. */ + for (; a && b; a = a->parent, b = b->parent) + if ((ret = strcasecmp(a->part, b->part)) != 0) + /* They differ => return the difference here */ + return ret; + + /* If we get here then either: + * a) The path lengths differ + * or b) The hosts are identical + */ + if (a && !b) + /* len(a) > len(b) */ + return 1; + else if (!a && b) + /* len(a) < len(b) */ + return -1; + + /* identical */ + return 0; +} + +/** + * Compare host_part with a string + * + * \param a + * \param b + * \return 0 if match, non-zero, otherwise + */ +int urldb_search_match_string(const struct host_part *a, + const char *b) +{ + const char *end, *dot; + int plen, ret; + + assert(a && b); + + if (*b >= '0' && *b <= '9') { + /* IP address */ + return strcasecmp(a->part, b); + } + + end = b + strlen(b); + + while (b < end && a) { + dot = strchr(b, '.'); + if (!dot) { + /* last segment */ + dot = end; + } + + /* Compare strings (length limited) */ + if ((ret = strncasecmp(a->part, b, dot - b)) != 0) + /* didn't match => return difference */ + return ret; + + /* The strings matched, now check that the lengths do, too */ + plen = strlen(a->part); + + if (plen > dot - b) + /* len(a) > len(b) */ + return 1; + else if (plen < dot - b) + /* len(a) < len(b) */ + return -1; + + b = dot + 1; + a = a->parent; + } + + /* If we get here then either: + * a) The path lengths differ + * or b) The hosts are identical + */ + if (a && a != 
&db_root && b >= end) + /* len(a) > len(b) */ + return 1; + else if (!a && b < end) + /* len(a) < len(b) */ + return -1; + + /* Identical */ + return 0; +} + +/** + * Compare host_part with prefix + * + * \param a + * \param b + * \return 0 if match, non-zero, otherwise + */ +int urldb_search_match_prefix(const struct host_part *a, + const char *b) +{ + const char *end, *dot; + int plen, ret; + + assert(a && b); + + if (*b >= '0' && *b <= '9') { + /* IP address */ + return strncasecmp(a->part, b, strlen(b)); + } + + end = b + strlen(b); + + while (b < end && a) { + dot = strchr(b, '.'); + if (!dot) { + /* last segment */ + dot = end; + } + + /* Compare strings (length limited) */ + if ((ret = strncasecmp(a->part, b, dot - b)) != 0) + /* didn't match => return difference */ + return ret; + + /* The strings matched */ + if (dot < end) { + /* Consider segment lengths only in the case + * where the prefix contains segments */ + plen = strlen(a->part); + if (plen > dot - b) + /* len(a) > len(b) */ + return 1; + else if (plen < dot - b) + /* len(a) < len(b) */ + return -1; + } + + b = dot + 1; + a = a->parent; + } + + /* If we get here then either: + * a) The path lengths differ + * or b) The hosts are identical + */ + if (a && a != &db_root && b >= end) + /* len(a) > len(b) => prefix matches */ + return 0; + else if (!a && b < end) + /* len(a) < len(b) => prefix does not match */ + return -1; + + /* Identical */ + return 0; +} + +/** + * Rotate a subtree right + * + * \param root Root of subtree to rotate + * \return new root of subtree + */ +struct search_node *urldb_search_skew(struct search_node *root) +{ + struct search_node *temp; + + assert(root); + + if (root->left->level == root->level) { + temp = root->left; + root->left = temp->right; + temp->right = root; + root = temp; + } + + return root; +} + +/** + * Rotate a node left, increasing the parent's level + * + * \param root Root of subtree to rotate + * \return New root of subtree + */ +struct search_node 
*urldb_search_split(struct search_node *root) +{ + struct search_node *temp; + + assert(root); + + if (root->right->right->level == root->level) { + temp = root->right; + root->right = temp->left; + temp->left = root; + root = temp; + + root->level++; + } + + return root; +} + +#ifdef TEST +int main(void) +{ + struct host_part *h; + struct path_data *p; + + h = urldb_add_host("127.0.0.1"); + if (!h) { + LOG(("failed adding host")); + return 1; + } + + /* Get host entry */ + h = urldb_add_host("netsurf.strcprstskrzkrk.co.uk"); + if (!h) { + LOG(("failed adding host")); + return 1; + } + + /* Get path entry */ + p = urldb_add_path("http", 80, h, "/path/to/resource.htm?a=b", "zz"); + if (!p) { + LOG(("failed adding path")); + return 1; + } + + p = urldb_add_path("http", 80, h, "/path/to/resource.htm?a=b", "aa"); + if (!p) { + LOG(("failed adding path")); + return 1; + } + + p = urldb_add_path("http", 80, h, "/path/to/resource.htm?a=b", "yy"); + if (!p) { + LOG(("failed adding path")); + return 1; + } + + urldb_dump(); + + return 0; +} +#endif diff --git a/content/urldb.h b/content/urldb.h new file mode 100644 index 000000000..9d59271d2 --- /dev/null +++ b/content/urldb.h @@ -0,0 +1,65 @@ +/* + * This file is part of NetSurf, http://netsurf.sourceforge.net/ + * Licensed under the GNU General Public License, + * http://www.opensource.org/licenses/gpl-license + * Copyright 2006 John M Bell + */ + +/** \file + * Unified URL information database (interface) + */ + +#ifndef _NETSURF_CONTENT_URLDB_H_ +#define _NETSURF_CONTENT_URLDB_H_ + +#include +#include +#include "netsurf/content/content_type.h" + +struct url_data { + const char *title; /**< Resource title */ + unsigned int visits; /**< Visit count */ + time_t last_visit; /**< Last visit time */ + content_type type; /**< Type of resource */ +}; + +struct bitmap; + +/* Persistence support */ +void urldb_load(const char *filename); +void urldb_save(const char *filename); + +/* URL insertion */ +bool urldb_add_url(const char 
*url); + +/* URL data modification / lookup */ +void urldb_set_url_title(const char *url, const char *title); +void urldb_set_url_content_type(const char *url, content_type type); +void urldb_update_url_visit_data(const char *url); +void urldb_reset_url_visit_data(const char *url); +const struct url_data *urldb_get_url_data(const char *url); + +/* Authentication modification / lookup */ +void urldb_set_auth_details(const char *url, const char *realm, + const char *auth); +const char *urldb_get_auth_details(const char *url); + +/* SSL certificate permissions */ +void urldb_set_cert_permissions(const char *url, bool permit); +bool urldb_get_cert_permissions(const char *url); + +/* Thumbnail handling */ +void urldb_set_thumbnail(const char *url, struct bitmap *bitmap); +const struct bitmap *urldb_get_thumbnail(const char *url); + +/* URL completion */ +void urldb_iterate_partial(const char *prefix, + bool (*callback)(const char *url)); + +/* Iteration */ +void urldb_iterate_entries(bool (*callback)(const char *url)); + +/* Debug */ +void urldb_dump(void); + +#endif -- cgit v1.2.3