diff options
author | Rob Kendrick (humdrum) <rob.kendrick@codethink.co.uk> | 2013-05-20 13:13:43 +0100 |
---|---|---|
committer | Rob Kendrick (humdrum) <rob.kendrick@codethink.co.uk> | 2013-05-20 13:13:43 +0100 |
commit | d78c9ccd1003f7ea933819c84d05ca0332f4afaf (patch) | |
tree | cf78d72518e5fe076bda864987353b791bbc168d | |
parent | 0088273dba9fb6cfde673f2a951320a5055e7075 (diff) | |
parent | f571aebbdbee36256e3c243e555887996e4b7dff (diff) | |
download | netsurf-d78c9ccd1003f7ea933819c84d05ca0332f4afaf.tar.gz netsurf-d78c9ccd1003f7ea933819c84d05ca0332f4afaf.tar.bz2 |
Merge branch 'rjek/bloom'
-rw-r--r-- | content/urldb.c | 41 | ||||
-rw-r--r-- | utils/Makefile | 4 | ||||
-rw-r--r-- | utils/bloom.c | 163 | ||||
-rw-r--r-- | utils/bloom.h | 99 |
4 files changed, 304 insertions, 3 deletions
diff --git a/content/urldb.c b/content/urldb.c index e3cc1d73d..059c3c687 100644 --- a/content/urldb.c +++ b/content/urldb.c @@ -107,6 +107,7 @@ #include "utils/filename.h" #include "utils/url.h" #include "utils/utils.h" +#include "utils/bloom.h" struct cookie_internal_data { char *name; /**< Cookie name */ @@ -327,6 +328,17 @@ static int loaded_cookie_file_version; #define MIN_URL_FILE_VERSION 106 #define URL_FILE_VERSION 106 +/* Bloom filter used for short-circuting the false case of "is this + * URL in the database?". BLOOM_SIZE controls how large the filter is + * in bytes. Primitive experimentation shows that for a filter of X + * bytes filled with X items, searching for X items not in the filter + * has a 5% false-positive rate. We set it to 32kB, which should be + * enough for all but the largest databases, while not being shockingly + * wasteful on memory. + */ +static struct bloom_filter *url_bloom; +#define BLOOM_SIZE (1024 * 32) + /** * Import an URL database from file, replacing any existing database * @@ -346,7 +358,10 @@ void urldb_load(const char *filename) assert(filename); - LOG(("Loading URL file")); + LOG(("Loading URL file %s", filename)); + + if (url_bloom == NULL) + url_bloom = bloom_create(BLOOM_SIZE); fp = fopen(filename, "r"); if (!fp) { @@ -456,6 +471,11 @@ void urldb_load(const char *filename) die("Memory exhausted whilst loading " "URL file"); } + + if (url_bloom != NULL) { + uint32_t hash = nsurl_hash(nsurl); + bloom_insert_hash(url_bloom, hash); + } /* Copy and merge path/query strings */ if (nsurl_get(nsurl, NSURL_PATH | NSURL_QUERY, @@ -782,6 +802,14 @@ bool urldb_add_url(nsurl *url) unsigned int port_int; assert(url); + + if (url_bloom == NULL) + url_bloom = bloom_create(BLOOM_SIZE); + + if (url_bloom != NULL) { + uint32_t hash = nsurl_hash(url); + bloom_insert_hash(url_bloom, hash); + } /* Copy and merge path/query strings */ if (nsurl_get(url, NSURL_PATH | NSURL_QUERY, &path_query, &len) != @@ -1857,6 +1885,13 @@ struct path_data *urldb_find_url(nsurl *url) bool match; assert(url); + + if (url_bloom != NULL) { + if (bloom_search_hash(url_bloom, + nsurl_hash(url)) == false) { + return NULL; + } + } scheme = nsurl_get_component(url, NSURL_SCHEME); if (scheme == NULL) @@ -3951,6 +3986,10 @@ void urldb_destroy(void) b = a->next; urldb_destroy_host_tree(a); } + + /* And the bloom filter */ + if (url_bloom != NULL) + bloom_destroy(url_bloom); } /** diff --git a/utils/Makefile b/utils/Makefile index ed34e9557..071e4fec1 100644 --- a/utils/Makefile +++ b/utils/Makefile @@ -2,6 +2,6 @@ S_UTILS := base64.c corestrings.c filename.c filepath.c hashtable.c \ libdom.c locale.c log.c messages.c nsurl.c talloc.c url.c \ - utf8.c utils.c useragent.c + utf8.c utils.c useragent.c bloom.c -S_UTILS := $(addprefix utils/,$(S_UTILS))
\ No newline at end of file +S_UTILS := $(addprefix utils/,$(S_UTILS)) diff --git a/utils/bloom.c b/utils/bloom.c new file mode 100644 index 000000000..1b07d6f1b --- /dev/null +++ b/utils/bloom.c @@ -0,0 +1,163 @@ +/* + * Copyright 2013 Rob Kendrick <rjek@netsurf-browser.org> + * + * This file is part of NetSurf, http://www.netsurf-browser.org/ + * + * NetSurf is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License as published by + * the Free Software Foundation; version 2 of the License. + * + * NetSurf is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. + * + * You should have received a copy of the GNU General Public License + * along with this program. If not, see <http://www.gnu.org/licenses/>. + */ + +/** \file + * Trivial bloom filter */ + +#include <stdlib.h> +#include "utils/bloom.h" + +/** + * Hash a string, returning a 32bit value. The hash algorithm used is + * Fowler Noll Vo - a very fast and simple hash, ideal for short strings. + * See http://en.wikipedia.org/wiki/Fowler_Noll_Vo_hash for more details. + * + * \param datum The string to hash. + * \param len size_t of data length. + * \return The calculated hash value for the datum. + */ + +static inline uint32_t fnv(const char *datum, size_t len) +{ + uint32_t z = 0x811c9dc5; + + if (datum == NULL) + return 0; + + while (len--) { + z *= 0x01000193; + z ^= *datum++; + } + + return z; +} + +struct bloom_filter { + size_t size; + uint32_t items; + uint8_t filter[]; +}; + +struct bloom_filter *bloom_create(size_t size) +{ + struct bloom_filter *r = calloc(sizeof(*r) + size, 1); + + if (r == NULL) + return NULL; + + r->size = size; + + return r; +} + +void bloom_destroy(struct bloom_filter *b) +{ + free(b); +} + +void bloom_insert_str(struct bloom_filter *b, const char *s, size_t z) +{ + uint32_t hash = fnv(s, z); + bloom_insert_hash(b, hash); +} + +void bloom_insert_hash(struct bloom_filter *b, uint32_t hash) +{ + unsigned int index = hash % (b->size << 3); + unsigned int byte_index = index >> 3; + unsigned int bit_index = index & 7; + + b->filter[byte_index] |= (1 << bit_index); + b->items++; +} + +bool bloom_search_str(struct bloom_filter *b, const char *s, size_t z) +{ + uint32_t hash = fnv(s, z); + return bloom_search_hash(b, hash); +} + +bool bloom_search_hash(struct bloom_filter *b, uint32_t hash) +{ + unsigned int index = hash % (b->size << 3); + unsigned int byte_index = index >> 3; + unsigned int bit_index = index & 7; + + return (b->filter[byte_index] & (1 << bit_index)) != 0; +} + +uint32_t bloom_items(struct bloom_filter *b) +{ + return b->items; +} + +#ifdef TEST_RIG + +#include <stdio.h> +#include <string.h> +#include <assert.h> + +int main(int argc, char *arg[]) +{ + struct bloom_filter *b = bloom_create(8192); + FILE *dict = fopen("/usr/share/dict/words", "r"); + char buf[BUFSIZ]; + int false_positives = 0, total = 0; + + for (int i = 0; i < 8192; i++) { + fscanf(dict, "%s", buf); + printf("adding %s\n", buf); + bloom_insert_str(b, buf, strlen(buf)); + } + + printf("adding NetSurf\n"); + + bloom_insert_str(b, "NetSurf", 7); + printf("checking NetSurf (should be true)\n"); + assert(bloom_search_str(b, "NetSurf", 7)); + + fseek(dict, 0, SEEK_SET); + + for (int i = 0; i < 8192; i++) { + fscanf(dict, "%s", buf); + printf("checking %s (should be true)\n", buf); + assert(bloom_search_str(b, buf, strlen(buf))); + + total++; + } + + for (int i = 0; i < 8192; i++) { + fscanf(dict, "%s", buf); + printf("checking %s (should be false)\n", buf); + if (bloom_search_str(b, buf, strlen(buf)) == true) + false_positives++; + total++; + } + + printf("false positives: %d of %d, %f%%\n", + false_positives, total, + ((float)false_positives / total) * 100); + + fclose(dict); + bloom_destroy(b); + + return 0; +} + +#endif /* TEST_RIG */ + diff --git a/utils/bloom.h b/utils/bloom.h new file mode 100644 index 000000000..4a7bd3800 --- /dev/null +++ b/utils/bloom.h @@ -0,0 +1,99 @@ +/* + * Copyright 2013 Rob Kendrick <rjek@netsurf-browser.org> + * + * This file is part of NetSurf, http://www.netsurf-browser.org/ + * + * NetSurf is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License as published by + * the Free Software Foundation; version 2 of the License. + * + * NetSurf is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. + * + * You should have received a copy of the GNU General Public License + * along with this program. If not, see <http://www.gnu.org/licenses/>. + */ + +/** \file + * Trivial bloom filter */ + +#ifndef _NETSURF_UTILS_BLOOM_H_ +#define _NETSURF_UTILS_BLOOM_H_ + +#include <stdbool.h> +#include <stddef.h> +#include <stdint.h> + +struct bloom_filter; + +/** + * Create a new bloom filter. + * + * \param size Size of bloom filter in bytes + * \return Handle for newly-created bloom filter, or NULL + */ +struct bloom_filter *bloom_create(size_t size); + +/** + * Destroy a previously-created bloom filter + * + * \param b Bloom filter to destroy + */ +void bloom_destroy(struct bloom_filter *b); + +/** + * Insert a string of given length (may include NULs) into the filter, + * using an internal hash function. + * + * \param b Bloom filter to add to + * \param s Pointer to data + * \param z Length of data + */ +void bloom_insert_str(struct bloom_filter *b, const char *s, size_t z); + +/** + * Insert a given hash value into the filter, should you already have + * one to hand. + * + * \param b Bloom filter to add to + * \param hash Value to add + */ +void bloom_insert_hash(struct bloom_filter *b, uint32_t hash); + +/** + * Search the filter for the given string, assuming it was added by + * bloom_insert_str(). May return false-positives. + * + * \param b Bloom filter to search + * \param s Pointer to data to search for + * \param z Length of data + * + * \return False if never added, True if it might have been. + */ +bool bloom_search_str(struct bloom_filter *b, const char *s, size_t z); + +/** + * Search the filter for the given hash value, assuming it was added by + * bloom_insert_hash(). May return false-positives. + * + * \param b Bloom filter to search + * \param hash Hash value to search for + * + * \return False if never added, True if it might have been. + */ +bool bloom_search_hash(struct bloom_filter *b, uint32_t hash); + +/** + * Find out how many items have been added to this bloom filter. This + * is useful for deciding the size of a new bloom filter should you + * need to rehash it. + * + * \param b Bloom filter to examine + * + * \return Number of items that have been added + */ +uint32_t bloom_items(struct bloom_filter *b); + +#endif |