/* * This file is part of NetSurf, http://netsurf.sourceforge.net/ * Licensed under the GNU General Public License, * http://www.opensource.org/licenses/gpl-license * Copyright 2005 Richard Wilson */ /** \file * Central repository for URL data (implementation). */ #include #include #include #include #include #include #include "netsurf/content/url_store.h" #include "netsurf/image/bitmap.h" #include "netsurf/desktop/options.h" #ifdef riscos #include "netsurf/riscos/bitmap.h" #endif #include "netsurf/utils/log.h" #include "netsurf/utils/url.h" #include "netsurf/utils/utils.h" #define ITERATIONS_BEFORE_TEST 32 #define MAXIMUM_URL_LENGTH 1024 struct hostname_data *url_store_hostnames = NULL; static struct hostname_data *url_store_find_hostname(const char *url); static struct hostname_data *url_store_match_hostname( struct hostname_data *previous); /* used for faster matching */ static size_t current_match_url_length; static char *current_match_scheme; static int current_match_scheme_length; static char *current_match_hostname; static int current_match_hostname_length; static bool current_match_www_test; /* used for faster searching */ static struct hostname_data *last_hostname_found = NULL; /** * Returns the hostname data for the specified URL. If no hostname * data is currently available then it is created. * * \param url the url to find hostname data for * \return the current hostname data, or NULL if memory exhausted */ struct hostname_data *url_store_find_hostname(const char *url) { struct hostname_data *first = url_store_hostnames; struct hostname_data *search; struct hostname_data *result; url_func_result res; char *hostname = NULL; int hostname_length; int compare; int fast_exit_counter = ITERATIONS_BEFORE_TEST; const char *host_test; assert(url); /* as the URL is normalised, we optimise the hostname finding for http:// */ if (!strncmp("http://", url, 7)) { /* check for duplicate hostname calls */ if ((last_hostname_found) && (!strncmp(last_hostname_found->hostname, url + 7, last_hostname_found->hostname_length))) { /* ensure it isn't comparing 'foo.com' to 'foo.com.au' etc */ if (url[last_hostname_found->hostname_length + 7] != '.') return last_hostname_found; } /* check for a hostname match */ for (host_test = url + 7; ((*host_test > 32) && (*host_test != '/')); *host_test++); hostname_length = host_test - url - 7; host_test = url + 7; if ((last_hostname_found) && (strncmp(host_test, last_hostname_found->hostname, hostname_length) > 0)) first = last_hostname_found; for (search = first; search; search = search->next) { if (search->hostname_length == hostname_length) { compare = strncmp(host_test, search->hostname, hostname_length); if (compare == 0) { last_hostname_found = search; return search; } else if (compare < 0) break; } } /* allocate a new hostname */ hostname = malloc(hostname_length + 1); if (!hostname) return NULL; strncpy(hostname, host_test, hostname_length); hostname[hostname_length] = '\0'; } else { /* no quick match found, fallback */ res = url_host(url, &hostname); switch (res) { case URL_FUNC_OK: break; case URL_FUNC_NOMEM: return NULL; case URL_FUNC_FAILED: hostname = strdup("file:/"); /* for 'file:/' */ if (!hostname) return NULL; break; default: assert(0); } hostname_length = strlen(hostname); } /* try to find a matching hostname fairly quickly */ if ((last_hostname_found) && (strcmp(hostname, last_hostname_found->hostname) > 0)) first = last_hostname_found; for (search = first; search; search = search->next) { if ((fast_exit_counter <= 0) || (search->hostname_length == hostname_length)) { compare = strcmp(hostname, search->hostname); if (compare == 0) { free(hostname); last_hostname_found = search; return search; } else if (compare < 0) break; fast_exit_counter = ITERATIONS_BEFORE_TEST; } else { fast_exit_counter--; } } /* no hostname is available: create a new one */ result = malloc(sizeof *result); if (!result) { free(hostname); return NULL; } result->hostname = hostname; result->hostname_length = hostname_length; result->url = 0; result->previous = 0; result->next = 0; last_hostname_found = result; /* simple case: no current hostnames */ if (!url_store_hostnames) { url_store_hostnames = result; return result; } /* worst case scenario: the place we need to link is within the last * section of the hostname list so we have no reference to work back * from. rather than slowing with the very common case of searching, * we take a speed hit for this case and simply move to the very end * of the hostname list ready to work backwards. */ if (!search) for (search = url_store_hostnames; search->next; search = search->next) ; /* we can now simply scan backwards as we know roughly where we need * to link to (we either had an early exit from the searching so we * know we're in the block following where we need to link, or we're * at the very end of the list as we were in the last block.) */ while ((search) && (strcmp(hostname, search->hostname) < 0)) search = search->previous; /* simple case: our new hostname is the first in the list */ if (!search) { result->next = url_store_hostnames; url_store_hostnames->previous = result; url_store_hostnames = result; return result; } /* general case: link in after the found hostname */ result->previous = search; result->next = search->next; if (search->next) search->next->previous = result; search->next = result; return result; } /** * Returns the url data for the specified URL. If no url * data is currently available then it is created. * * \param url a normalized url to find hostname data for * \return the current hostname data, or NULL if memory exhausted */ struct url_content *url_store_find(const char *url) { struct hostname_data *hostname_data; struct url_data *search; struct url_data *result; size_t url_length; int compare; int fast_exit_counter = ITERATIONS_BEFORE_TEST; assert(url); /* find the corresponding hostname data */ hostname_data = url_store_find_hostname(url); if (!hostname_data) return NULL; /* move to the start of the leafname */ url_length = strlen(url); /* try to find a matching url fairly quickly */ for (search = hostname_data->url; search; search = search->next) { if ((fast_exit_counter <= 0) || (search->data.url_length == url_length)) { compare = strcmp(url, search->data.url); if (compare == 0) return &search->data; else if (compare < 0) break; fast_exit_counter = ITERATIONS_BEFORE_TEST; } else { fast_exit_counter--; } } /* no URL is available: create a new one */ result = calloc(1, sizeof(struct url_data)); if (!result) return NULL; result->data.url = malloc(url_length + 1); if (!result->data.url) { free(result); return NULL; } strcpy(result->data.url, url); result->data.url_length = url_length; result->parent = hostname_data; /* simple case: no current URLs */ if (!hostname_data->url) { hostname_data->url = result; return &result->data; } /* worst case scenario: the place we need to link is within the last * section of the URL list so we have no reference to work back * from. rather than slowing with the very common case of searching, * we take a speed hit for this case and simply move to the very end * of the URL list ready to work backwards. */ if (!search) for (search = hostname_data->url; search->next; search = search->next) ; /* we can now simply scan backwards as we know roughly where we need * to link to (we either had an early exit from the searching so we * know we're in the block following where we need to link, or we're * at the very end of the list as we were in the last block.) */ while ((search) && (strcmp(url, search->data.url) < 0)) search = search->previous; /* simple case: our new hostname is the first in the list */ if (!search) { result->next = hostname_data->url; hostname_data->url->previous = result; hostname_data->url = result; return &result->data; } /* general case: link in after the found hostname */ result->previous = search; result->next = search->next; if (search->next) search->next->previous = result; search->next = result; return &result->data; } /** * Returns the next hostname that matches a part of the specified URL. * * The following variables must be initialised prior to calling: * * - current_match_scheme * - current_match_hostname * - current_match_hostname_length; * * \param url a normalized url to find the next match for * \param current the current hostname to search forward from, or NULL * \return the next matching hostname, or NULL */ struct hostname_data *url_store_match_hostname( struct hostname_data *current) { int compare; assert(current_match_hostname); /* advance to the next hostname */ if (!current) current = url_store_hostnames; else current = current->next; /* skip past hostname data without URLs */ for (; current && (!current->url); current = current->next); while (current) { if (current->hostname_length >= current_match_hostname_length) { compare = strncmp(current_match_hostname, current->hostname, current_match_hostname_length); if (compare == 0) return current; else if ((compare < 0) && !current_match_www_test) break; } /* special case: if hostname is not www then try it */ if (current_match_www_test && ((current->hostname_length - 4) >= current_match_hostname_length) && (!strncmp(current->hostname, "www.", 4)) && (!strncmp(current_match_hostname, current->hostname + 4, current_match_hostname_length))) return current; /* move to next hostname with URLs */ current = current->next; for (; current && (!current->url); current = current->next); } return NULL; } /** * Returns the complete URL for the next matched stored URL. * * \param url a normalized url to find the next match for * \param reference internal reference (NULL for first call) * \return the next URL that matches */ char *url_store_match(const char *url, struct url_data **reference) { struct hostname_data *hostname; struct url_data *search = NULL; url_func_result res; assert(url); if (!url_store_hostnames) return NULL; /* find the scheme and first URL, not necessarily matching */ if (!*reference) { /* the hostname match is constant throughout */ if (current_match_hostname) free(current_match_hostname); current_match_hostname = NULL; res = url_host(url, ¤t_match_hostname); switch (res) { case URL_FUNC_OK: break; case URL_FUNC_NOMEM: return NULL; case URL_FUNC_FAILED: /* for 'file:/' */ current_match_hostname = strdup("file:/"); if (!current_match_hostname) return NULL; break; default: assert(0); } current_match_hostname_length = strlen(current_match_hostname); /* the scheme is constant throughout */ if (current_match_scheme) free(current_match_scheme); current_match_scheme = NULL; res = url_scheme(url, ¤t_match_scheme); if (res != URL_FUNC_OK) return NULL; current_match_scheme_length = strlen(current_match_scheme); /* the url is constant throughout */ current_match_url_length = strlen(url); current_match_www_test = (!strcmp(current_match_scheme, "http") && strncmp(url + 4 + 3, "www.", 4)); /* 'http' + '://' */ /* get our initial reference */ hostname = url_store_match_hostname(NULL); if (!hostname) return NULL; } else { search = *reference; hostname = search->parent; } /* work through all our strings, ignoring the scheme and 'www.' */ while (hostname) { /* get the next URL to test */ if (!search) search = hostname->url; else search = search->next; /* loop past end of list, or search */ if (!search) { hostname = url_store_match_hostname(hostname); if (!hostname) return NULL; } else if (search->data.visits > 0) { /* straight match */ if ((search->data.url_length >= current_match_url_length) && (!strncmp(search->data.url, url, current_match_url_length))) { *reference = search; return search->data.url; } /* try with 'www.' inserted after the scheme */ if (current_match_www_test && ((search->data.url_length - 4) >= current_match_url_length) && (!strncmp(search->data.url, current_match_scheme, current_match_scheme_length)) && (!strncmp(search->data.url + current_match_scheme_length + 3, "www.", 4)) && (!strncmp(search->data.url + current_match_scheme_length + 7, url + current_match_scheme_length + 3, current_match_url_length - current_match_scheme_length - 3))) { *reference = search; return search->data.url; } } } return NULL; } /** * Converts a text string into one suitable for URL matching. * * \param text the text to search with * \return URL matching string allocated on heap, or NULL on error */ char *url_store_match_string(const char *text) { url_func_result res; char *url; assert(text); res = url_normalize(text, &url); if (res != URL_FUNC_OK) return NULL; /* drop the '/' from the end if it was added when normalizing */ if ((url[strlen(url) - 1] == '/') && (text[strlen(text) - 1] != '/')) url[strlen(url) - 1] = '\0'; return url; } /** * Loads the current contents of the URL store from disk * * \param file the file to load options from */ void url_store_load(const char *file) { char s[MAXIMUM_URL_LENGTH]; struct hostname_data *hostname; struct url_data *result; int urls; int i; int version; int length; FILE *fp; LOG(("Loading URL file")); fp = fopen(file, "r"); if (!fp) { LOG(("Failed to open file '%s' for reading", file)); return; } if (!fgets(s, MAXIMUM_URL_LENGTH, fp)) return; version = atoi(s); if (version < 102) { LOG(("Unsupported URL file version.")); return; } if (version > 105) { LOG(("Unknown URL file version.")); return; } last_hostname_found = NULL; while (fgets(s, MAXIMUM_URL_LENGTH, fp)) { /* get the hostname */ length = strlen(s) - 1; if (s[length] == '\n') s[length] = '\0'; /* skip data that has ended up with a host of '' */ if (length == 0) { if (!fgets(s, MAXIMUM_URL_LENGTH, fp)) break; urls = atoi(s); for (i = 0; i < (6 * urls); i++) if (!fgets(s, MAXIMUM_URL_LENGTH, fp)) break; continue; } /* add the host at the tail */ if (version == 105) { hostname = malloc(sizeof *hostname); if (!hostname) die("Insufficient memory to create hostname"); hostname->hostname = strdup(s); if (!hostname->hostname) die("Insufficient memory to create hostname"); hostname->hostname_length = length; hostname->url = 0; hostname->previous = last_hostname_found; if (!hostname->previous) url_store_hostnames = hostname; else last_hostname_found->next = hostname; hostname->next = 0; last_hostname_found = hostname; } else { hostname = url_store_find_hostname(s); if (!hostname) break; } if (!fgets(s, MAXIMUM_URL_LENGTH, fp)) break; urls = atoi(s); /* load the non-corrupt data */ for (i = 0; i < urls; i++) { if (!fgets(s, MAXIMUM_URL_LENGTH, fp)) break; for (length = 0; s[length] > 32; length++); s[length] = 0x00; result = calloc(1, sizeof(struct url_data)); if (!result) die("Insufficient memory to create URL"); result->data.url_length = length; result->data.url = strdup(s); if (!result->data.url) die("Insufficient memory to create URL"); result->parent = hostname; result->next = hostname->url; if (hostname->url) hostname->url->previous = result; hostname->url = result; if (!fgets(s, MAXIMUM_URL_LENGTH, fp)) break; result->data.visits = atoi(s); if (version == 102) { /* ignore requests */ if (!fgets(s, MAXIMUM_URL_LENGTH, fp)) break; /* ignore thumbnail size */ if (!fgets(s, MAXIMUM_URL_LENGTH, fp)) break; /* set last visit as today to retain */ result->data.last_visit = time(NULL); } else { if (!fgets(s, MAXIMUM_URL_LENGTH, fp)) break; result->data.last_visit = atoi(s); if (!fgets(s, MAXIMUM_URL_LENGTH, fp)) break; result->data.type = atoi(s); } if (!fgets(s, MAXIMUM_URL_LENGTH, fp)) break; #ifdef riscos for (length = 0; s[length] > 32; length++); s[length] = 0x00; if (length == 11) { /* ensure filename is 'XX.XX.XX.XX' */ if ((s[2] == '.') && (s[5] == '.') && (s[8] == '.')) result->data.thumbnail = bitmap_create_file(s); } #endif if (version >= 104) { if (!fgets(s, MAXIMUM_URL_LENGTH, fp)) break; for (length = 0; s[length] >= 32; length++); s[length] = 0x00; if (length > 0) result->data.title = strdup(s); } } } fclose(fp); LOG(("Successfully loaded URL file")); } /** * Saves the current contents of the URL store to disk * * \param file the file to load options from */ void url_store_save(const char *file) { struct hostname_data *search; struct url_data *url; int url_count; const char *thumb_file; char *s; int i; FILE *fp; #ifdef riscos struct bitmap *bitmap; #endif time_t min_date; char *title; fp = fopen(file, "w"); if (!fp) { LOG(("Failed to open file '%s' for writing", file)); return; } /* get the minimum date for expiry */ min_date = time(NULL) - (60 * 60 * 24) * option_expire_url; /* file format version number */ fprintf(fp, "105\n"); for (search = url_store_hostnames; search; search = search->next) { url_count = 0; for (url = search->url; url; url = url->next) if ((url->data.last_visit > min_date) && (url->data.visits > 0) && (url->data.url_length < MAXIMUM_URL_LENGTH)) { url_count++; } if (url_count > 0) { fprintf(fp, "%s\n%i\n", search->hostname, url_count); for (url = search->url; url && url->next; url = url->next); for (; url; url = url->previous) if ((url->data.last_visit > min_date) && (url->data.visits > 0) && (url->data.url_length < MAXIMUM_URL_LENGTH)) { thumb_file = ""; #ifdef riscos bitmap = url->data.thumbnail; if (bitmap) thumb_file = bitmap->filename; #endif if (url->data.title) { s = url->data.title; for (i = 0; s[i] != '\0'; i++) if (s[i] < 32) s[i] = ' '; for (--i; ((i > 0) && (s[i] == ' ')); i--) s[i] = '\0'; title = url->data.title; } else title = ""; fprintf(fp, "%s\n%i\n%i\n%i\n%s\n%s\n", url->data.url, url->data.visits, (int) url->data. last_visit, url->data.type, thumb_file, title); } } } fclose(fp); } /** * Associates a thumbnail with a specified URL. */ void url_store_add_thumbnail(const char *url, struct bitmap *bitmap) { struct url_content *content; content = url_store_find(url); if (content) { if (content->thumbnail) bitmap_destroy(content->thumbnail); content->thumbnail = bitmap; } } /** * Gets the thumbnail associated with a given URL. */ struct bitmap *url_store_get_thumbnail(const char *url) { struct url_content *content; content = url_store_find(url); if (content) return content->thumbnail; return NULL; } int url_store_compare_last_visit(const void *a, const void *b) { struct url_content * const *url_a = (struct url_content * const *)a; struct url_content * const *url_b = (struct url_content * const *)b; return ((*url_a)->last_visit - (*url_b)->last_visit); }