summaryrefslogtreecommitdiff
path: root/content/url_store.c
blob: 9d22b24637a4f3ef96459a77270c0c0db25cdd7b (plain)
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
373
374
375
376
377
378
379
380
381
382
383
384
385
386
387
388
389
390
391
392
393
394
395
396
397
398
399
400
401
402
403
404
405
406
407
408
409
410
411
412
413
414
415
416
417
418
419
420
421
422
423
424
425
426
427
428
429
430
431
432
433
434
435
436
437
438
439
440
441
442
443
444
445
446
447
448
449
450
451
452
453
454
455
456
457
458
459
460
461
462
463
464
465
466
467
468
469
470
471
472
473
474
475
476
477
478
479
480
481
482
483
484
485
486
487
488
489
490
491
492
493
494
495
496
497
498
499
500
501
502
503
504
505
506
507
508
509
510
511
512
513
514
515
516
517
518
519
520
521
522
523
524
525
526
527
528
529
530
531
532
533
534
535
536
537
538
539
540
541
542
543
544
545
546
547
548
549
550
551
552
553
554
555
556
557
558
559
560
561
562
563
564
565
566
567
568
569
570
571
572
573
574
575
576
577
578
579
580
581
582
583
584
585
586
587
588
589
590
591
592
593
594
595
596
597
598
599
600
601
602
603
604
605
606
607
608
609
610
611
612
613
614
615
616
617
618
619
620
621
622
623
624
625
626
627
628
629
630
631
632
633
634
635
636
637
638
639
640
641
642
643
644
645
646
647
648
649
650
651
652
653
654
655
656
657
658
659
660
661
662
663
664
665
666
667
668
669
670
671
672
673
674
675
676
677
678
679
680
681
682
683
684
685
686
687
688
689
690
691
692
693
694
695
696
697
698
699
700
701
702
703
704
705
/*
 * This file is part of NetSurf, http://netsurf.sourceforge.net/
 * Licensed under the GNU General Public License,
 *		  http://www.opensource.org/licenses/gpl-license
 * Copyright 2005 Richard Wilson <info@tinct.net>
 */

/** \file
 * Central repository for URL data (implementation).
 */

#include <assert.h>
#include <stdbool.h>
#include <stdio.h>
#include <stdlib.h>
#include <string.h>
#include <time.h>
#include "netsurf/content/url_store.h"
#include "netsurf/image/bitmap.h"
#include "netsurf/desktop/options.h"
#ifdef riscos
#include "netsurf/riscos/bitmap.h"
#endif
#include "netsurf/utils/log.h"
#include "netsurf/utils/url.h"
#include "netsurf/utils/utils.h"


/* Number of consecutive list entries with a non-matching stored length that
 * the search loops skip before forcing a full strcmp() on the next entry, so
 * that the sorted-order early exit still gets a chance to trigger. */
#define ITERATIONS_BEFORE_TEST 32
/* Size of the line buffer used by url_store_load(); url_store_save() omits
 * URLs whose length is not below this value. */
#define MAXIMUM_URL_LENGTH 1024

/* Head of the doubly-linked list of hostnames; url_store_find_hostname()
 * links new entries in strcmp() order. */
struct hostname_data *url_store_hostnames = NULL;

static struct hostname_data *url_store_find_hostname(const char *url);
static struct hostname_data *url_store_match_hostname(const char *url,
		struct hostname_data *previous);
/* Scheme obtained via url_scheme() on the most recent initial
 * (*reference == NULL) call to url_store_match(); freed and replaced on
 * the next such call. */
static char *url_store_match_scheme = NULL;

/* Entry most recently found or created by url_store_find_hostname(); used
 * there as a shortcut and as a later starting point for list walks. */
static struct hostname_data *last_hostname_found = NULL;

/**
 * Returns the hostname data for the specified URL. If no hostname
 * data is currently available then it is created and linked into
 * url_store_hostnames, which is kept in strcmp() order.
 *
 * The entry returned is remembered in last_hostname_found; later calls use
 * it as a shortcut for repeated lookups of the same host and as a starting
 * point for the list walk when the sought hostname sorts after it.
 *
 * For "http://" URLs the hostname is taken directly from the URL text (all
 * characters after the scheme up to the first '/' or character <= 32); for
 * anything else url_host() is used, with "file:/" substituted when it
 * reports URL_FUNC_FAILED.
 *
 * \param  url  the url to find hostname data for
 * \return  the current hostname data, or NULL if memory exhausted
 */
struct hostname_data *url_store_find_hostname(const char *url)
{
	struct hostname_data *first = url_store_hostnames;
	struct hostname_data *search;
	struct hostname_data *result;
	url_func_result res;
	char *hostname = NULL;
	int hostname_length;
	int compare;
	int fast_exit_counter = ITERATIONS_BEFORE_TEST;
	const char *host_test;

	assert(url);

	/* as the URL is normalised, we optimise the hostname finding for http:// */
	if (!strncmp("http://", url, 7)) {
		/* find the extent of the hostname within the URL */
		for (host_test = url + 7;
				((*host_test > 32) && (*host_test != '/'));
				host_test++)
			;
		hostname_length = host_test - url - 7;
		host_test = url + 7;

		/* check for duplicate hostname calls. the cached entry is
		 * only reused on an exact match so that the shortcut can
		 * never return a different entry from the full search
		 * (e.g. 'foo.co' for 'foo.com', or 'foo.com' for
		 * 'foo.com.au') */
		if ((last_hostname_found) &&
				(last_hostname_found->hostname_length ==
					hostname_length) &&
				(!strncmp(last_hostname_found->hostname,
					host_test, hostname_length)))
			return last_hostname_found;

		/* check for a hostname match, starting from the cached entry
		 * if the sought hostname sorts after it */
		if ((last_hostname_found) &&
				(strncmp(host_test,
					last_hostname_found->hostname,
					hostname_length) > 0))
			first = last_hostname_found;
		for (search = first; search; search = search->next) {
			if (search->hostname_length == hostname_length) {
				compare = strncmp(host_test, search->hostname,
						hostname_length);
				if (compare == 0) {
					last_hostname_found = search;
					return search;
				} else if (compare < 0)
					break;
			}
		}

		/* allocate a new hostname */
		hostname = malloc(hostname_length + 1);
		if (!hostname)
			return NULL;
		memcpy(hostname, host_test, hostname_length);
		hostname[hostname_length] = '\0';
	} else {
		/* no quick match found, fallback */
		res = url_host(url, &hostname);
		switch (res) {
			case URL_FUNC_OK:
				break;
			case URL_FUNC_NOMEM:
				return NULL;
			case URL_FUNC_FAILED:
				hostname = strdup("file:/");	/* for 'file:/' */
				if (!hostname)
					return NULL;
				break;
			default:
				assert(0);
				/* not reached in debug builds; avoid using
				 * an unset hostname when asserts are off */
				return NULL;
		}
		hostname_length = strlen(hostname);
	}

	/* try to find a matching hostname fairly quickly: entries whose
	 * stored length differs are skipped without a string compare, but
	 * after ITERATIONS_BEFORE_TEST skips a compare is forced so the
	 * sorted-order early exit can still fire */
	if ((last_hostname_found) &&
			(strcmp(hostname, last_hostname_found->hostname) > 0))
		first = last_hostname_found;
	for (search = first; search; search = search->next) {
		if ((fast_exit_counter <= 0) ||
				(search->hostname_length == hostname_length)) {
			compare = strcmp(hostname, search->hostname);
			if (compare == 0) {
				free(hostname);
				last_hostname_found = search;
				return search;
			} else if (compare < 0)
				break;
			fast_exit_counter = ITERATIONS_BEFORE_TEST;
		} else {
			fast_exit_counter--;
		}
	}

	/* no hostname is available: create a new one (it takes ownership
	 * of the hostname string) */
	result = malloc(sizeof *result);
	if (!result) {
		free(hostname);
		return NULL;
	}
	result->hostname = hostname;
	result->hostname_length = hostname_length;
	result->url = 0;
	result->previous = 0;
	result->next = 0;
	last_hostname_found = result;

	/* simple case: no current hostnames */
	if (!url_store_hostnames) {
		url_store_hostnames = result;
		return result;
	}

	/* worst case scenario: the place we need to link is within the last
	 * section of the hostname list so we have no reference to work back
	 * from. rather than slowing with the very common case of searching,
	 * we take a speed hit for this case and simply move to the very end
	 * of the hostname list ready to work backwards. */
	if (!search)
		for (search = url_store_hostnames; search->next;
				search = search->next)
			;

	/* we can now simply scan backwards as we know roughly where we need
	 * to link to (we either had an early exit from the searching so we
	 * know we're in the block following where we need to link, or we're
	 * at the very end of the list as we were in the last block.) */
	while ((search) && (strcmp(hostname, search->hostname) < 0))
		search = search->previous;

	/* simple case: our new hostname is the first in the list */
	if (!search) {
		result->next = url_store_hostnames;
		url_store_hostnames->previous = result;
		url_store_hostnames = result;
		return result;
	}

	/* general case: link in after the found hostname */
	result->previous = search;
	result->next = search->next;
	if (search->next)
		search->next->previous = result;
	search->next = result;
	return result;
}


/**
 * Returns the stored data for the specified URL. If the URL is not yet
 * known, a zero-initialised entry holding a copy of the URL is created
 * and linked, in strcmp() order, into the URL list of its hostname.
 *
 * \param  url  a normalized url to find data for
 * \return  the data for the URL, or NULL if memory exhausted
 */
struct url_content *url_store_find(const char *url) {
	struct hostname_data *host;
	struct url_data *node;
	struct url_data *entry;
	size_t length;
	int order;
	int skips_left = ITERATIONS_BEFORE_TEST;

	assert(url);

	/* every URL hangs off the data for its hostname */
	host = url_store_find_hostname(url);
	if (host == NULL)
		return NULL;

	length = strlen(url);

	/* look for an existing entry. entries of a different length are
	 * passed over without a string compare, except that a compare is
	 * forced once ITERATIONS_BEFORE_TEST entries in a row have been
	 * passed over, so the sorted-order early exit can still trigger */
	node = host->url;
	while (node != NULL) {
		if ((skips_left > 0) && (node->data.url_length != length)) {
			skips_left--;
		} else {
			order = strcmp(url, node->data.url);
			if (order == 0)
				return &node->data;
			if (order < 0)
				break;
			skips_left = ITERATIONS_BEFORE_TEST;
		}
		node = node->next;
	}

	/* not found: build a new entry */
	entry = calloc(1, sizeof *entry);
	if (entry == NULL)
		return NULL;
	entry->data.url = malloc(length + 1);
	if (entry->data.url == NULL) {
		free(entry);
		return NULL;
	}
	memcpy(entry->data.url, url, length + 1);
	entry->data.url_length = length;
	entry->parent = host;

	/* first URL for this hostname */
	if (host->url == NULL) {
		host->url = entry;
		return &entry->data;
	}

	/* the search ran off the end of the list, so there is no nearby
	 * entry to work back from: start from the tail instead */
	if (node == NULL) {
		node = host->url;
		while (node->next != NULL)
			node = node->next;
	}

	/* walk backwards to the last entry that does not sort after the
	 * new URL */
	while ((node != NULL) && (strcmp(url, node->data.url) < 0))
		node = node->previous;

	if (node == NULL) {
		/* the new URL sorts before everything: make it the head */
		entry->next = host->url;
		host->url->previous = entry;
		host->url = entry;
	} else {
		/* link in directly after the entry found */
		entry->previous = node;
		entry->next = node->next;
		if (node->next != NULL)
			node->next->previous = entry;
		node->next = entry;
	}
	return &entry->data;
}


/**
 * Returns the next hostname, among those that have at least one URL,
 * whose name begins with the host part of the specified URL. When that
 * host part does not itself begin with "www.", stored hostnames of the
 * form "www." followed by the host part are matched as well.
 *
 * \param url	   a normalized url to find the next match for
 * \param current  the current hostname to search forward from, or NULL
 *		   to start from the head of the list
 * \return the next matching hostname, or NULL
 *
 * \todo  distinguish between out-of-memory and no more results in return
 */
struct hostname_data *url_store_match_hostname(const char *url,
		struct hostname_data *current) {
	struct hostname_data *match = NULL;
	char *prefix;
	int prefix_length;
	int order;
	bool try_www;

	assert(url);

	/* extract the host part to match against */
	switch (url_host(url, &prefix)) {
	case URL_FUNC_OK:
		break;
	case URL_FUNC_NOMEM:
		return NULL;
	case URL_FUNC_FAILED:
		prefix = strdup("file:/");		/* for 'file:/' */
		if (prefix == NULL)
			return NULL;
		break;
	default:
		assert(0);
	}
	prefix_length = strlen(prefix);
	try_www = (strncmp(prefix, "www.", 4) != 0);

	/* step past the entry returned last time, or begin at the head */
	current = current ? current->next : url_store_hostnames;

	for (;;) {
		/* hostnames without URLs are never returned */
		while (current && !current->url)
			current = current->next;
		if (!current)
			break;

		/* direct prefix match */
		if (current->hostname_length >= prefix_length) {
			order = strncmp(prefix, current->hostname,
					prefix_length);
			if (order == 0) {
				match = current;
				break;
			}
			/* the list is sorted, so nothing further can match
			 * directly; only stop if there is no 'www.' form
			 * still to look for */
			if ((order < 0) && !try_www)
				break;
		}

		/* prefix match against the part after a leading 'www.' */
		if (try_www &&
				((current->hostname_length - 4) >=
					prefix_length) &&
				(strncmp(current->hostname, "www.", 4) == 0) &&
				(strncmp(prefix, current->hostname + 4,
					prefix_length) == 0)) {
			match = current;
			break;
		}

		current = current->next;
	}

	free(prefix);
	return match;
}



/**
 * Returns the complete URL for the next matched stored URL.
 *
 * Only URLs with a visit count above zero are considered. A stored URL
 * matches if it begins with the given url, or - for "http" URLs whose
 * host does not begin with "www." - if it is the given url with "www."
 * inserted after the "://".
 *
 * The scheme of the url is captured on the initial call (*reference ==
 * NULL) and reused on continuation calls, so continuation calls should be
 * made with the same url.
 *
 * \param url	     a normalized url to find the next match for
 * \param reference  internal reference; must point to NULL for the first
 *		     call and is updated on each successful match
 * \return the next URL that matches (the store's own string, not a copy),
 *	   or NULL if there are no more matches or an error occurred
 */
char *url_store_match(const char *url, struct url_data **reference) {
	struct hostname_data *hostname;
	struct url_data *search = NULL;
	int scheme_length;
	size_t url_length;
	url_func_result res;
	bool www_test;

	assert(url);

	if (!url_store_hostnames)
		return NULL;

	/* find the scheme and first URL, not necessarily matching */
	if (!*reference) {
		hostname = url_store_match_hostname(url, NULL);
		if (!hostname)
			return NULL;
		free(url_store_match_scheme);
		url_store_match_scheme = NULL;
		res = url_scheme(url, &url_store_match_scheme);
		if (res != URL_FUNC_OK)
			return NULL;
	} else {
		search = *reference;
		hostname = search->parent;
	}

	scheme_length = strlen(url_store_match_scheme);
	url_length = strlen(url);
	/* the 'www.' variant is only tried for http URLs; the length check
	 * ensures the URL actually extends past 'http' + '://' before the
	 * text following it is examined */
	www_test = (!strcmp(url_store_match_scheme, "http") &&
			(url_length >= (size_t) scheme_length + 3) &&
			strncmp(url + 4 + 3, "www.", 4)); /* 'http' + '://' */

	/* work through all our strings, ignoring the scheme and 'www.' */
	while (hostname) {

		/* get the next URL to test */
		if (!search)
			search = hostname->url;
		else
			search = search->next;

		/* loop past end of list, or search */
		if (!search) {
			/* this hostname is exhausted: move to the next
			 * matching one; search stays NULL so the next pass
			 * starts at its first URL */
			hostname = url_store_match_hostname(url, hostname);
			if (!hostname)
				return NULL;
		} else if (search->data.visits > 0) {
			/* straight match */
			if ((search->data.url_length >= url_length) &&
					(!strncmp(search->data.url, url,
							url_length))) {
				*reference = search;
				return search->data.url;
			}
			/* try with 'www.' inserted after the scheme */
			if (www_test && ((search->data.url_length - 4) >=
					url_length) &&
				(!strncmp(search->data.url,
						url_store_match_scheme,
						scheme_length)) &&
				(!strncmp(search->data.url + scheme_length + 3,
						"www.", 4)) &&
				(!strncmp(search->data.url + scheme_length + 7,
						url + scheme_length + 3,
						url_length - scheme_length - 3))) {
				*reference = search;
				return search->data.url;
			}
		}
	}
	return NULL;
}


/**
 * Converts a text string into one suitable for URL matching.
 *
 * The text is passed through url_normalize(); if the normalized form ends
 * in '/' but the text given did not, that trailing '/' is removed again so
 * that a partially typed URL still works as a prefix.
 *
 * \param text	     the text to search with
 * \return URL matching string allocated on heap (the caller frees it),
 *	   or NULL on error
 */
char *url_store_match_string(const char *text) {
	url_func_result res;
	char *url;
	size_t url_length;
	size_t text_length;

	assert(text);

	res = url_normalize(text, &url);
	if (res != URL_FUNC_OK)
		return NULL;

	/* drop the '/' from the end if it was added when normalizing. the
	 * lengths are checked first so that an empty string is never
	 * indexed at -1 */
	url_length = strlen(url);
	text_length = strlen(text);
	if ((url_length > 0) && (url[url_length - 1] == '/') &&
			((text_length == 0) ||
				(text[text_length - 1] != '/')))
		url[url_length - 1] = '\0';
	return url;
}


/**
 * Loads the current contents of the URL store from disk
 *
 * The file starts with a version line (102 to 104 are accepted). It is
 * followed by one group per hostname: a line naming the host, a line
 * giving the number of URL records, then that many records. Each record
 * holds the URL and the visit count, followed by
 *   - version 102: two ignored lines (requests, thumbnail size);
 *   - later versions: the last visit time and the content type;
 * then a thumbnail filename line (only used in RISC OS builds) and, for
 * version 104, a title line.
 *
 * Loading stops quietly at the first read failure or failed allocation.
 *
 * \param file  the file to load the URL store from
 */
void url_store_load(const char *file) {
	char s[MAXIMUM_URL_LENGTH];
	struct hostname_data *hostname;
	struct url_data *result;
	int urls;
	int i;
	int version;
	int length;
	FILE *fp;

	LOG(("Loading URL file"));

	fp = fopen(file, "r");
	if (!fp) {
		LOG(("Failed to open file '%s' for reading", file));
		return;
	}

	if (!fgets(s, MAXIMUM_URL_LENGTH, fp)) {
		fclose(fp);
		return;
	}
	version = atoi(s);
	if (version < 102) {
		LOG(("Unsupported URL file version."));
		fclose(fp);
		return;
	}
	if (version > 104) {
		LOG(("Unknown URL file version."));
		fclose(fp);
		return;
	}

	while (fgets(s, MAXIMUM_URL_LENGTH, fp)) {
		/* strip the newline, taking care with an empty buffer */
		length = strlen(s);
		if ((length > 0) && (s[length - 1] == '\n'))
			s[length - 1] = '\0';
		hostname = url_store_find_hostname(s);
		if (!hostname)
			break;
		if (!fgets(s, MAXIMUM_URL_LENGTH, fp))
			break;
		urls = atoi(s);
		for (i = 0; i < urls; i++) {
			if (!fgets(s, MAXIMUM_URL_LENGTH, fp))
				break;
			/* the URL ends at the first space/control character */
			for (length = 0; s[length] > 32; length++);
			s[length] = 0x00;
			result = calloc(1, sizeof(struct url_data));
			if (!result) {
				/* give up entirely: carrying on would parse
				 * the rest of this record as hostnames */
				goto done;
			}
			result->data.url_length = length;
			result->data.url = strdup(s);
			if (!result->data.url)
				die("Insufficient memory");
			/* link at the head of the hostname's URL list;
			 * url_store_save() writes each list tail-first, so
			 * this restores the saved order */
			result->parent = hostname;
			result->next = hostname->url;
			if (hostname->url)
				hostname->url->previous = result;
			hostname->url = result;
			if (!fgets(s, MAXIMUM_URL_LENGTH, fp))
				break;
			result->data.visits = atoi(s);
			if (version == 102) {
			  	/* ignore requests */
				if (!fgets(s, MAXIMUM_URL_LENGTH, fp))
					break;
				/* ignore thumbnail size */
				if (!fgets(s, MAXIMUM_URL_LENGTH, fp))
					break;
				/* set last visit as today to retain */
				result->data.last_visit = time(NULL);
			} else {
				if (!fgets(s, MAXIMUM_URL_LENGTH, fp))
					break;
				result->data.last_visit = atoi(s);
				if (!fgets(s, MAXIMUM_URL_LENGTH, fp))
					break;
				result->data.type = atoi(s);
			}
			/* thumbnail filename line */
			if (!fgets(s, MAXIMUM_URL_LENGTH, fp))
				break;
#ifdef riscos
			for (length = 0; s[length] > 32; length++);
			s[length] = 0x00;
			if (length == 11) {
				/* ensure filename is 'XX.XX.XX.XX' */
				if ((s[2] == '.') && (s[5] == '.') &&
						(s[8] == '.'))
					result->data.thumbnail =
							bitmap_create_file(s);
			}
#endif
			if (version == 104) {
				if (!fgets(s, MAXIMUM_URL_LENGTH, fp))
					break;
				/* the title ends at the first control
				 * character; bytes are compared as unsigned
				 * so that non-ASCII text does not end the
				 * title where plain char is signed */
				for (length = 0;
						(unsigned char) s[length] >= 32;
						length++);
				s[length] = 0x00;
				if (length > 0)
					result->data.title = strdup(s);
			}
		}
	}
done:
	fclose(fp);
	LOG(("Successfully loaded URL file"));
}


/**
 * Saves the current contents of the URL store to disk
 *
 * The file is written as format version 104. Only URLs that have been
 * visited, whose last visit is more recent than option_expire_url days
 * ago, and whose length is below MAXIMUM_URL_LENGTH are written;
 * hostnames with no such URLs are omitted.
 *
 * Stored titles are tidied in place as a side effect: control characters
 * become spaces and trailing spaces are removed, so each title occupies
 * exactly one line.
 *
 * \param file  the file to save the URL store to
 */
void url_store_save(const char *file) {
	struct hostname_data *search;
	struct url_data *url;
	int url_count;
	char *normal;
	const char *thumb_file;
	char *s;
	int i;
	FILE *fp;
#ifdef riscos
	struct bitmap *bitmap;
#endif
	time_t min_date;
	const char *title;

	fp = fopen(file, "w");
	if (!fp) {
		LOG(("Failed to open file '%s' for writing", file));
		return;
	}

	/* get the minimum date for expiry */
	min_date = time(NULL) - (60 * 60 * 24) * option_expire_url;
	LOG(("%d", (int) min_date));

	/* file format version number */
	fprintf(fp, "104\n");
	for (search = url_store_hostnames; search; search = search->next) {
		/* count the URLs that will be written for this hostname */
		url_count = 0;
		for (url = search->url; url; url = url->next)
			if ((url->data.last_visit > min_date) &&
					(url->data.visits > 0) &&
					(url->data.url_length <
						MAXIMUM_URL_LENGTH)) {
				url_count++;
			}
		if (url_count == 0)
			continue;

		normal = url_store_match_string(search->hostname);
		if (!normal)
			continue;

		fprintf(fp, "%s\n%i\n", normal, url_count);
		/* write from the tail backwards: url_store_load() links each
		 * URL it reads at the head of the list, so this keeps the
		 * list order across a save and load */
		for (url = search->url; url && url->next;
				url = url->next);
		for (; url; url = url->previous)
			if ((url->data.last_visit > min_date) &&
					(url->data.visits > 0) &&
					(url->data.url_length <
						MAXIMUM_URL_LENGTH)) {
				thumb_file = "";
#ifdef riscos
				bitmap = url->data.thumbnail;
				if (bitmap)
					thumb_file = bitmap->filename;
#endif

				if (url->data.title) {
					s = url->data.title;
					/* control characters would break the
					 * line-based format. bytes are
					 * compared as unsigned so non-ASCII
					 * text is left alone where plain
					 * char is signed */
					for (i = 0; s[i] != '\0'; i++)
						if ((unsigned char) s[i] < 32)
							s[i] = ' ';
					/* strip trailing spaces */
					for (--i;
						((i > 0) &&
						(s[i] == ' '));
							i--)
						s[i] = '\0';

					title = url->data.title;
				}
				else
					title = "";
				fprintf(fp, "%s\n%i\n%i\n%i\n%s\n%s\n",
						url->data.url,
						url->data.visits,
						(int) url->data.
							last_visit,
						url->data.type,
						thumb_file,
						title);
			}
		free(normal);
	}
	fclose(fp);
}


/**
 * Associates a thumbnail with a specified URL.
 *
 * Any different thumbnail previously held for the URL is destroyed with
 * bitmap_destroy(). If no entry for the URL can be found or created,
 * nothing is stored and the bitmap is left untouched.
 *
 * \param url	  the url to associate the thumbnail with
 * \param bitmap  the thumbnail to store
 */
void url_store_add_thumbnail(const char *url, struct bitmap *bitmap) {
	struct url_content *content;

	content = url_store_find(url);
	if (!content)
		return;

	/* re-adding the thumbnail already held must not destroy it, or the
	 * store would be left holding a dangling pointer */
	if (content->thumbnail == bitmap)
		return;

	if (content->thumbnail)
		bitmap_destroy(content->thumbnail);
	content->thumbnail = bitmap;
}


/**
 * Gets the thumbnail associated with a given URL.
 *
 * The lookup goes through url_store_find(), so an entry for the URL is
 * created if none existed.
 *
 * \param url  the url to get the thumbnail for
 * \return the thumbnail held for the URL (may be NULL), or NULL if no
 *	   entry could be found or created
 */
struct bitmap *url_store_get_thumbnail(const char *url) {
	struct url_content *entry = url_store_find(url);

	return entry ? entry->thumbnail : NULL;
}


/**
 * Compares two URL entries by the time of their last visit.
 *
 * The arguments are pointers to elements of an array of
 * struct url_content pointers; the signature is the one qsort() expects
 * of a comparison function.
 *
 * \param a  pointer to the first struct url_content pointer
 * \param b  pointer to the second struct url_content pointer
 * \return negative, zero or positive as the first entry was last visited
 *	   before, at the same time as, or after the second
 */
int url_store_compare_last_visit(const void *a, const void *b) {
	struct url_content * const *url_a = a;
	struct url_content * const *url_b = b;

	/* compare rather than subtract: a difference of two timestamps need
	 * not fit in an int, and narrowing it could flip the sign */
	return ((*url_a)->last_visit > (*url_b)->last_visit) -
			((*url_a)->last_visit < (*url_b)->last_visit);
}