Merge branches/MarkieB/gtkmain to trunk.

svn path=/trunk/netsurf/; revision=9729
author: John Mark Bell <jmb@netsurf-browser.org> 2009-12-17 23:55:02 +0000
committer: John Mark Bell <jmb@netsurf-browser.org> 2009-12-17 23:55:02 +0000
commit: 355799ce0bbb078237dfc1ae9874bbc5342acbc4 (patch)
tree: 7ca980c01c0d4d1d55a3b7b15418c95c5618afae /desktop/save_complete.c
parent: 4346b2b62b940182575e6612e46234355afa083c (diff)
download: netsurf-355799ce0bbb078237dfc1ae9874bbc5342acbc4.tar.gz
netsurf-355799ce0bbb078237dfc1ae9874bbc5342acbc4.tar.bz2
1 files changed, 755 insertions, 0 deletions
diff --git a/desktop/save_complete.c b/desktop/save_complete.c
new file mode 100644
index 000000000..48438908d
--- /dev/null
+++ b/desktop/save_complete.c
@@ -0,0 +1,755 @@
+/*
+ * Copyright 2004 John M Bell <jmb202@ecs.soton.ac.uk>
+ * Copyright 2004-2007 James Bursa <bursa@users.sourceforge.net>
+ *
+ * This file is part of NetSurf, http://www.netsurf-browser.org/
+ *
+ * NetSurf is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License as published by
+ * the Free Software Foundation; version 2 of the License.
+ *
+ * NetSurf is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program.  If not, see <http://www.gnu.org/licenses/>.
+ */
+
+/** \file
+ * Save HTML document with dependencies (implementation).
+ */
+
+#include "utils/config.h"
+
+#define _GNU_SOURCE /* for strndup */
+#include <assert.h>
+#include <ctype.h>
+#include <errno.h>
+#include <stdio.h>
+#include <string.h>
+#include <sys/types.h>
+#include <regex.h>
+#include <libxml/HTMLtree.h>
+#include <libxml/parserInternals.h>
+#include "utils/config.h"
+#include "content/content.h"
+#include "css/css.h"
+#include "render/box.h"
+#include "desktop/save_complete.h"
+#include "utils/log.h"
+#include "utils/url.h"
+#include "utils/utils.h"
+
+/** Compiled regular expression matching a CSS \@import rule. It is compiled
+ * by save_complete_init() and used by rewrite_stylesheet_urls(). */
+regex_t save_complete_import_re;
+
+/** An entry in the singly linked list of contents recorded during a save. */
+struct save_complete_entry {
+	struct content *content; /**< Content recorded by this entry */
+	struct save_complete_entry *next; /**< Next entry in list */
+};
+
+/* Forward declarations of the file-local helpers defined below. The list
+ * of recorded contents is passed explicitly: as a pointer to the head where
+ * a function may add entries, and as the head itself where it only reads. */
+static bool save_complete_html(struct content *c, const char *path,
+		bool index, struct save_complete_entry **list);
+static bool save_imported_sheets(struct content *c, const char *path,
+		struct save_complete_entry **list);
+static char * rewrite_stylesheet_urls(const char *source, unsigned int size,
+		int *osize, const char *base,
+		struct save_complete_entry *list);
+static bool rewrite_document_urls(xmlDoc *doc, const char *base,
+		struct save_complete_entry *list);
+static bool rewrite_urls(xmlNode *n, const char *base,
+		struct save_complete_entry *list);
+static bool rewrite_url(xmlNode *n, const char *attr, const char *base,
+		struct save_complete_entry *list);
+static bool save_complete_list_add(struct content *content,
+		struct save_complete_entry **list);
+static struct content * save_complete_list_find(const char *url,
+		struct save_complete_entry *list);
+static bool save_complete_list_check(struct content *content,
+		struct save_complete_entry *list);
+/* The matching definition of save_complete_list_dump() is disabled. */
+/* static void save_complete_list_dump(void); */
+static bool save_complete_inventory(const char *path,
+		struct save_complete_entry *list);
+
+/**
+ * Save an HTML page with all dependencies.
+ *
+ * The page is saved first; only if that succeeds is the inventory file
+ * written. The list of recorded contents is released before returning,
+ * whatever the outcome.
+ *
+ * \param  c     CONTENT_HTML to save
+ * \param  path  directory to save to (must exist)
+ * \return  true on success, false on error and error reported
+ */
+
+bool save_complete(struct content *c, const char *path)
+{
+	struct save_complete_entry *saved = NULL;
+	struct save_complete_entry *entry;
+	bool ok;
+
+	/* && short-circuits, so no inventory is written for a failed save */
+	ok = save_complete_html(c, path, true, &saved) &&
+			save_complete_inventory(path, saved);
+
+	/* release the list nodes; the contents they refer to are not freed */
+	while ((entry = saved) != NULL) {
+		saved = entry->next;
+		free(entry);
+	}
+
+	return ok;
+}
+
+
+/**
+ * Save the dependencies and document of an HTML content.
+ *
+ * Saves the stylesheets and objects used by \a c, then a copy of its
+ * document in which all known URLs have been rewritten. No type or list
+ * membership test is made on \a c itself, so this may be called for a
+ * content which has only just been added to the list.
+ *
+ * \param  c      CONTENT_HTML to save
+ * \param  path   directory to save to (must exist)
+ * \param  index  true to save as "index"
+ * \param  list   list of contents recorded so far, extended as more are saved
+ * \return  true on success, false on error
+ */
+
+static bool save_complete_html_contents(struct content *c, const char *path,
+		bool index, struct save_complete_entry **list)
+{
+	char filename[256];
+	unsigned int i;
+	xmlDocPtr doc;
+	bool res;
+
+	/* save stylesheets, ignoring the base and adblocking sheets */
+	for (i = STYLESHEET_START; i != c->data.html.stylesheet_count; i++) {
+		struct content *css = c->data.html.stylesheets[i].c;
+		char *source;
+		int source_len;
+		bool is_style;
+
+		if (!css)
+			continue;
+		if (save_complete_list_check(css, *list))
+			continue;
+
+		/* a sheet whose URL is the document's base URL is treated as
+		 * coming from a <style> element rather than a separate file */
+		is_style = (strcmp(css->url, c->data.html.base_url) == 0);
+
+		if (!is_style) {
+			if (!save_complete_list_add(css, list)) {
+				warn_user("NoMemory", 0);
+				return false;
+			}
+		}
+
+		if (!save_imported_sheets(css, path, list))
+			return false;
+
+		if (is_style)
+			continue; /* don't save <style> elements */
+
+		/* saved files are named after the address of their content */
+		snprintf(filename, sizeof filename, "%p", (void *) css);
+		source = rewrite_stylesheet_urls(css->source_data,
+				css->source_size, &source_len, css->url,
+				*list);
+		if (!source) {
+			warn_user("NoMemory", 0);
+			return false;
+		}
+		res = save_complete_gui_save(path, filename, source_len,
+				source, CONTENT_CSS);
+		free(source);
+		if (res == false)
+			return false;
+	}
+
+	/* save objects */
+	for (i = 0; i != c->data.html.object_count; i++) {
+		struct content *obj = c->data.html.object[i].content;
+
+		/* skip difficult content types */
+		if (!obj || obj->type >= CONTENT_OTHER || !obj->source_data)
+			continue;
+		if (save_complete_list_check(obj, *list))
+			continue;
+
+		if (!save_complete_list_add(obj, list)) {
+			warn_user("NoMemory", 0);
+			return false;
+		}
+
+		if (obj->type == CONTENT_HTML) {
+			/* obj is in the list by now, so save_complete_html()
+			 * would take it for already saved and write nothing;
+			 * save its contents directly instead. Recursion still
+			 * terminates because each content is added to the
+			 * list before it is descended into. */
+			if (!save_complete_html_contents(obj, path, false,
+					list))
+				return false;
+			continue;
+		}
+
+		snprintf(filename, sizeof filename, "%p", (void *) obj);
+		res = save_complete_gui_save(path, filename,
+				obj->source_size, obj->source_data, obj->type);
+		if (res == false)
+			return false;
+	}
+
+	/*save_complete_list_dump();*/
+
+	/* work on a copy so the live document is left untouched */
+	doc = xmlCopyDoc(c->data.html.document, 1);
+	if (doc == NULL) {
+		warn_user("NoMemory", 0);
+		return false;
+	}
+
+	/* rewrite all urls we know about */
+	if (!rewrite_document_urls(doc, c->data.html.base_url, *list)) {
+		xmlFreeDoc(doc);
+		warn_user("NoMemory", 0);
+		return false;
+	}
+
+	/* save the html file out last of all */
+	if (index)
+		snprintf(filename, sizeof filename, "index");
+	else
+		snprintf(filename, sizeof filename, "%p", (void *) c);
+
+	/* cleared so that a failure which leaves errno unset can be told
+	 * apart from one which sets it */
+	errno = 0;
+	if (save_complete_htmlSaveFileFormat(path, filename, doc, 0, 0) == -1) {
+		if (errno)
+			warn_user("SaveError", strerror(errno));
+		else
+			warn_user("SaveError", "htmlSaveFileFormat failed");
+
+		xmlFreeDoc(doc);
+		return false;
+	}
+
+	xmlFreeDoc(doc);
+
+	return true;
+}
+
+
+/**
+ * Save an HTML page with all dependencies, recursing through imported pages.
+ *
+ * A content which is already in the list is taken to have been saved and
+ * is skipped.
+ *
+ * \param  c      CONTENT_HTML to save
+ * \param  path   directory to save to (must exist)
+ * \param  index  true to save as "index"
+ * \param  list   list of contents recorded so far, extended as more are saved
+ * \return  true on success, false on error and error reported
+ */
+
+bool save_complete_html(struct content *c, const char *path, bool index,
+		struct save_complete_entry **list)
+{
+	if (c->type != CONTENT_HTML)
+		return false;
+
+	if (save_complete_list_check(c, *list))
+		return true;
+
+	return save_complete_html_contents(c, path, index, list);
+}
+
+
+/**
+ * Save stylesheets imported by a CONTENT_CSS.
+ *
+ * Each import not yet in the list is recorded, its own imports are saved
+ * recursively, and then the sheet itself is written with its \@import
+ * rules rewritten.
+ *
+ * \param  c     a CONTENT_CSS
+ * \param  path  path to save to
+ * \param  list  list of contents recorded so far, extended as sheets are saved
+ * \return  true on success, false on error and error reported
+ */
+
+bool save_imported_sheets(struct content *c, const char *path,
+		struct save_complete_entry **list)
+{
+	unsigned int idx;
+
+	for (idx = 0; idx != c->data.css.import_count; idx++) {
+		struct content *sheet = c->data.css.imports[idx].c;
+		char leafname[256];
+		char *rewritten;
+		int rewritten_len;
+		bool saved;
+
+		/* nothing to do for missing or already recorded sheets */
+		if (!sheet || save_complete_list_check(sheet, *list))
+			continue;
+
+		if (!save_complete_list_add(sheet, list)) {
+			warn_user("NoMemory", 0);
+			return false;
+		}
+
+		/* depth first: the sheets this one imports are saved before
+		 * its own rules are rewritten */
+		if (!save_imported_sheets(sheet, path, list))
+			return false;
+
+		rewritten = rewrite_stylesheet_urls(sheet->source_data,
+				sheet->source_size, &rewritten_len,
+				sheet->url, *list);
+		if (!rewritten) {
+			warn_user("NoMemory", 0);
+			return false;
+		}
+
+		/* the saved file is named after the address of the content */
+		snprintf(leafname, sizeof leafname, "%p", sheet);
+		saved = save_complete_gui_save(path, leafname, rewritten_len,
+				rewritten, CONTENT_CSS);
+		free(rewritten);
+		if (!saved)
+			return false;
+	}
+
+	return true;
+}
+
+
+/**
+ * Initialise the save_complete module.
+ *
+ * Compiles the extended, case-insensitive regular expression which matches
+ * an \@import rule into save_complete_import_re. rewrite_stylesheet_urls()
+ * executes that expression, so this must have run before it is called.
+ *
+ * The URL may be written in five ways, each with its own capture group:
+ * group 2 for a double-quoted string, 4 for a single-quoted string, 6 and 8
+ * for url() around a double- or single-quoted string, and 10 for url()
+ * around an unquoted value. Group 1 encloses all five alternatives.
+ */
+
+void save_complete_init(void)
+{
+	/* Match an @import rule - see CSS 2.1 G.1. */
+	regcomp_wrapper(&save_complete_import_re,
+			"@import"		/* IMPORT_SYM */
+			"[ \t\r\n\f]*"		/* S* */
+			/* 1 */
+			"("			/* [ */
+			/* 2 3 */
+			"\"(([^\"]|[\\]\")*)\""	/* STRING (approximated) */
+			"|"
+			/* 4 5 */
+			"'(([^']|[\\]')*)'"
+			"|"			/* | */
+			"url\\([ \t\r\n\f]*"	/* URI (approximated) */
+			     /* 6 7 */
+			     "\"(([^\"]|[\\]\")*)\""
+			     "[ \t\r\n\f]*\\)"
+			"|"
+			"url\\([ \t\r\n\f]*"
+			    /* 8 9 */
+			     "'(([^']|[\\]')*)'"
+			     "[ \t\r\n\f]*\\)"
+			"|"
+			"url\\([ \t\r\n\f]*"
+			   /* 10 */
+			     "([^) \t\r\n\f]*)"
+			     "[ \t\r\n\f]*\\)"
+			")",			/* ] */
+			REG_EXTENDED | REG_ICASE);
+}
+
+
+/**
+ * Rewrite stylesheet \@import rules for save complete.
+ *
+ * Every \@import whose URL, resolved against \a base, belongs to a content
+ * in \a list is replaced by an \@import of that content's saved leafname
+ * (its address printed with %p). All other text is copied unchanged.
+ *
+ * @param  source  stylesheet source (need not be 0-terminated)
+ * @param  size    size of source
+ * @param  osize   updated with the size of the result
+ * @param  base    url of stylesheet
+ * @param  list    list of contents recorded so far
+ * @return  converted source, or 0 on out of memory. The result is not
+ *          0-terminated and must be freed by the caller.
+ */
+
+char * rewrite_stylesheet_urls(const char *source, unsigned int size,
+		int *osize, const char *base,
+		struct save_complete_entry *list)
+{
+	/* room for "@import '<pointer>'" however wide %p is printed */
+	enum { REPLACEMENT_MAX = 64 };
+	char replacement[REPLACEMENT_MAX];
+	char *text;
+	char *res;
+	unsigned int offset = 0;
+	unsigned int i;
+	size_t imports = 0;
+	regmatch_t match[11];
+
+	/* count occurrences of @import to (over)estimate result size */
+	/* can't use strstr because source is not 0-terminated string */
+	for (i = 0; size >= 7 && i <= size - 7; i++) {
+		if (source[i] == '@' &&
+				tolower((unsigned char) source[i + 1]) == 'i' &&
+				tolower((unsigned char) source[i + 2]) == 'm' &&
+				tolower((unsigned char) source[i + 3]) == 'p' &&
+				tolower((unsigned char) source[i + 4]) == 'o' &&
+				tolower((unsigned char) source[i + 5]) == 'r' &&
+				tolower((unsigned char) source[i + 6]) == 't')
+			imports++;
+	}
+
+	/* regexec() requires a 0-terminated string, so match against a
+	 * terminated copy instead of reading past the end of source */
+	text = malloc((size_t) size + 1);
+	if (!text)
+		return 0;
+	if (size != 0)
+		memcpy(text, source, size);
+	text[size] = '\0';
+
+	/* each replacement adds fewer than REPLACEMENT_MAX bytes; the extra
+	 * byte keeps the request non-zero for an empty stylesheet */
+	res = malloc((size_t) size + imports * REPLACEMENT_MAX + 1);
+	if (!res) {
+		free(text);
+		return 0;
+	}
+	*osize = 0;
+
+	while (offset < size) {
+		const char *url = NULL;
+		int url_len = 0;
+		char *url_copy;
+		char *joined = NULL;
+		struct content *content = NULL;
+		url_func_result result;
+		size_t match_len;
+		int written = -1;
+		int group;
+
+		if (regexec(&save_complete_import_re, text + offset,
+				11, match, 0) != 0)
+			break;
+
+		/* the URL is held by the even-numbered group of whichever
+		 * alternative matched */
+		for (group = 2; group <= 10; group += 2) {
+			if (match[group].rm_so != -1) {
+				url = text + offset + match[group].rm_so;
+				url_len = match[group].rm_eo -
+						match[group].rm_so;
+				break;
+			}
+		}
+		assert(url);
+
+		url_copy = strndup(url, url_len);
+		if (!url_copy)
+			goto no_memory;
+		result = url_join(url_copy, base, &joined);
+		free(url_copy);
+		if (result == URL_FUNC_NOMEM)
+			goto no_memory;
+
+		/* copy data before match */
+		memcpy(res + *osize, text + offset, match[0].rm_so);
+		*osize += (int) match[0].rm_so;
+
+		if (result == URL_FUNC_OK) {
+			content = save_complete_list_find(joined, list);
+			/* the joined URL is only needed for the lookup */
+			free(joined);
+		}
+
+		if (content)
+			written = snprintf(replacement, sizeof replacement,
+					"@import '%p'", (void *) content);
+
+		if (written > 0 && (size_t) written < sizeof replacement) {
+			/* replace import */
+			memcpy(res + *osize, replacement, written);
+			*osize += written;
+		} else {
+			/* copy import: the URL is unknown, or the replacement
+			 * could not be formatted in full */
+			match_len = match[0].rm_eo - match[0].rm_so;
+			memcpy(res + *osize, text + offset + match[0].rm_so,
+					match_len);
+			*osize += (int) match_len;
+		}
+
+		assert(0 < match[0].rm_eo);
+		offset += match[0].rm_eo;
+	}
+
+	/* copy rest of source */
+	if (offset < size) {
+		memcpy(res + *osize, text + offset, size - offset);
+		*osize += size - offset;
+	}
+
+	free(text);
+	return res;
+
+no_memory:
+	free(text);
+	free(res);
+	return 0;
+}
+
+
+/**
+ * Rewrite URLs in a HTML document to be relative.
+ *
+ * \param  doc   root of the document tree
+ * \param  base  base url of document
+ * \param  list  list of contents recorded so far
+ * \return  true on success, false on out of memory
+ */
+
+bool rewrite_document_urls(xmlDoc *doc, const char *base,
+		struct save_complete_entry *list)
+{
+	xmlNode *node = doc->children;
+
+	while (node) {
+		/* rewrite_urls() unlinks and frees a <base> element, so the
+		 * sibling has to be fetched before the node is processed */
+		xmlNode *next = node->next;
+
+		if (node->type == XML_ELEMENT_NODE &&
+				!rewrite_urls(node, base, list))
+			return false;
+
+		node = next;
+	}
+
+	return true;
+}
+
+
+/**
+ * Traverse tree, rewriting URLs as we go.
+ *
+ * \param  n     xmlNode of type XML_ELEMENT_NODE to rewrite
+ * \param  base  base url of document
+ * \param  list  list of contents recorded so far
+ * \return  true on success, false on out of memory
+ *
+ * URLs in the tree rooted at element n are rewritten. If n is a <base>
+ * element it is unlinked from the tree and freed, so the caller must not
+ * use n after this returns.
+ */
+
+bool rewrite_urls(xmlNode *n, const char *base,
+		struct save_complete_entry *list)
+{
+	xmlNode *child;
+
+	assert(n->type == XML_ELEMENT_NODE);
+
+	/**
+	 * We only need to consider the following cases:
+	 *
+	 * Attribute:      Elements:
+	 *
+	 * 1)   data         <object>
+	 * 2)   href         <a> <area> <link>
+	 * 3)   src          <script> <input> <frame> <iframe> <img>
+	 * 4)   n/a          <style>
+	 * 5)   n/a          any <base> tag
+	 * 6)   background   any (except those above)
+	 */
+	if (!n->name) {
+		/* ignore */
+	}
+	/* 1 */
+	else if (strcmp((const char *) n->name, "object") == 0) {
+		if (!rewrite_url(n, "data", base, list))
+			return false;
+	}
+	/* 2 */
+	else if (strcmp((const char *) n->name, "a") == 0 ||
+			strcmp((const char *) n->name, "area") == 0 ||
+			strcmp((const char *) n->name, "link") == 0) {
+		if (!rewrite_url(n, "href", base, list))
+			return false;
+	}
+	/* 3 */
+	else if (strcmp((const char *) n->name, "frame") == 0 ||
+			strcmp((const char *) n->name, "iframe") == 0 ||
+			strcmp((const char *) n->name, "input") == 0 ||
+			strcmp((const char *) n->name, "img") == 0 ||
+			strcmp((const char *) n->name, "script") == 0) {
+		if (!rewrite_url(n, "src", base, list))
+			return false;
+	}
+	/* 4 */
+	else if (strcmp((const char *) n->name, "style") == 0) {
+		for (child = n->children; child != 0; child = child->next) {
+			xmlChar *content;
+			char *rewritten;
+			int len;
+
+			/* Get current content */
+			content = xmlNodeGetContent(child);
+			if (!content)
+				/* unfortunately we don't know if this is
+				 * due to memory exhaustion, or because
+				 * there is no content for this node */
+				continue;
+
+			/* Rewrite @import rules */
+			rewritten = rewrite_stylesheet_urls(
+					(const char *) content,
+					strlen((const char *) content),
+					&len, base, list);
+			xmlFree(content);
+			if (!rewritten)
+				return false;
+
+			/* set new content */
+			xmlNodeSetContentLen(child,
+					(const xmlChar *) rewritten,
+					len);
+			/* the node keeps its own copy of the text, so the
+			 * rewritten buffer is ours to release */
+			free(rewritten);
+		}
+
+		return true;
+	}
+	/* 5 */
+	else if (strcmp((const char *) n->name, "base") == 0) {
+		/* simply remove any <base> tags from the document */
+		xmlUnlinkNode(n);
+		xmlFreeNode(n);
+		/* base tags have no content, so there's no point recursing
+		 * additionally, we've just destroyed this node, so trying
+		 * to recurse would result in bad things happening */
+		return true;
+	}
+	/* 6 */
+	else {
+		if (!rewrite_url(n, "background", base, list))
+			return false;
+	}
+
+	/* now recurse */
+	for (child = n->children; child;) {
+		/* we must extract the next child now, as if the current
+		 * child is a <base> element, it will be removed from the
+		 * tree (see 5, above), thus preventing extraction of the
+		 * next child */
+		xmlNode *next = child->next;
+		if (child->type == XML_ELEMENT_NODE) {
+			if (!rewrite_urls(child, base, list))
+				return false;
+		}
+		child = next;
+	}
+
+	return true;
+}
+
+
+/**
+ * Rewrite an URL in a HTML document.
+ *
+ * The attribute value is resolved against \a base. If the resulting URL
+ * belongs to a content in \a list the attribute is set to that content's
+ * address printed with %p, otherwise it is set to the resolved URL. A node
+ * without the attribute, or a value which cannot be resolved, is left as
+ * it is.
+ *
+ * \param  n     The node to modify
+ * \param  attr  The html attribute to modify
+ * \param  base  base url of document
+ * \param  list  list of contents recorded so far
+ * \return  true on success, false on out of memory
+ */
+
+bool rewrite_url(xmlNode *n, const char *attr, const char *base,
+		struct save_complete_entry *list)
+{
+	const xmlChar *name = (const xmlChar *) attr;
+	char *original;
+	char *absolute;
+	char leafname[20];
+	const char *value;
+	struct content *target;
+	url_func_result status;
+	bool updated;
+
+	/* nothing to rewrite when the attribute is absent */
+	if (!xmlHasProp(n, name))
+		return true;
+
+	original = (char *) xmlGetProp(n, name);
+	if (!original)
+		return false;
+
+	status = url_join(original, base, &absolute);
+	xmlFree(original);
+
+	if (status == URL_FUNC_NOMEM)
+		return false;
+	if (status != URL_FUNC_OK)
+		return true;
+
+	target = save_complete_list_find(absolute, list);
+	if (target) {
+		/* found a match: refer to the saved file */
+		snprintf(leafname, sizeof leafname, "%p", target);
+		value = leafname;
+	} else {
+		/* no match found: keep the resolved URL */
+		value = absolute;
+	}
+
+	updated = xmlSetProp(n, name, (const xmlChar *) value) != NULL;
+	free(absolute);
+
+	return updated;
+}
+
+
+/**
+ * Add a content to the list of recorded contents.
+ *
+ * The new entry becomes the head of the list.
+ *
+ * \param  content  content to add
+ * \param  list     head of the list, updated to point at the new entry
+ * \return  true on success, false on out of memory
+ */
+
+bool save_complete_list_add(struct content *content,
+		struct save_complete_entry **list)
+{
+	struct save_complete_entry *node = malloc(sizeof *node);
+
+	if (node == NULL)
+		return false;
+
+	node->next = *list;
+	node->content = content;
+	*list = node;
+
+	return true;
+}
+
+
+/**
+ * Look up a url in the list of recorded contents.
+ *
+ * URLs are compared with strcmp(), so only an exact match is found.
+ *
+ * \param  url   url to find
+ * \param  list  head of the list to search
+ * \return  content if found, 0 otherwise
+ */
+
+struct content * save_complete_list_find(const char *url,
+		struct save_complete_entry *list)
+{
+	struct save_complete_entry *cursor = list;
+
+	while (cursor) {
+		struct content *candidate = cursor->content;
+
+		if (strcmp(url, candidate->url) == 0)
+			return candidate;
+
+		cursor = cursor->next;
+	}
+
+	return 0;
+}
+
+
+/**
+ * Look up a content in the list of recorded contents.
+ *
+ * Contents are compared by address.
+ *
+ * \param  content  pointer to content
+ * \param  list     head of the list to search
+ * \return  true if the content is in the list
+ */
+
+bool save_complete_list_check(struct content *content,
+		struct save_complete_entry *list)
+{
+	struct save_complete_entry *cursor = list;
+
+	/* stop at the first entry holding this content, or at the end */
+	while (cursor != NULL && cursor->content != content)
+		cursor = cursor->next;
+
+	return cursor != NULL;
+}
+
+
+#if 0
+/**
+ * Dump save complete list to stderr
+ *
+ * Disabled debugging aid. It walks a variable named save_complete_list,
+ * which is not declared in the code visible in this file, and takes no
+ * list argument, so it would need a list parameter like the other helpers
+ * before it could be re-enabled.
+ */
+void save_complete_list_dump(void)
+{
+	struct save_complete_entry *entry;
+	for (entry = save_complete_list; entry; entry = entry->next)
+		fprintf(stderr, "%p : %s\n", entry->content,
+						entry->content->url);
+}
+#endif
+
+
+/**
+ * Create the inventory file listing original URLs.
+ *
+ * One line is written per list entry: the content's address printed with
+ * %p, a space, then the content's URL.
+ *
+ * \param  path  directory to create the "Inventory" file in
+ * \param  list  list of contents to describe
+ * \return  true on success, false on error and error reported
+ */
+
+bool save_complete_inventory(const char *path,
+		struct save_complete_entry *list)
+{
+	char urlpath[256];
+	FILE *fp;
+	char *pathstring;
+	/* the URL prefix below already supplies the leading '/' */
+	const char *standardpath = (path[0] == '/') ? path + 1 : path;
+	struct save_complete_entry *entry;
+	int len;
+	int saved_errno = 0;
+	bool ok = true;
+
+	len = snprintf(urlpath, sizeof urlpath, "file:///%s/Inventory",
+			standardpath);
+	if (len < 0 || (size_t) len >= sizeof urlpath) {
+		/* a truncated URL would name some other file */
+		warn_user("SaveError", strerror(ENAMETOOLONG));
+		return false;
+	}
+
+	pathstring = url_to_path(urlpath);
+	if (pathstring == NULL) {
+		warn_user("NoMemory", 0);
+		return false;
+	}
+
+	fp = fopen(pathstring, "w");
+	/* keep fopen()'s error: the calls which follow may change errno */
+	saved_errno = errno;
+	free(pathstring);
+	if (!fp) {
+		LOG(("fopen(): errno = %i", saved_errno));
+		warn_user("SaveError", strerror(saved_errno));
+		return false;
+	}
+
+	for (entry = list; entry; entry = entry->next) {
+		if (fprintf(fp, "%p %s\n", (void *) entry->content,
+				entry->content->url) < 0) {
+			saved_errno = errno;
+			ok = false;
+			break;
+		}
+	}
+
+	/* buffered data is written out on close, so that can fail too; the
+	 * first error seen is the one reported */
+	if (fclose(fp) != 0 && ok) {
+		saved_errno = errno;
+		ok = false;
+	}
+
+	if (!ok)
+		warn_user("SaveError", strerror(saved_errno));
+
+	return ok;
+}
+
author	John Mark Bell <jmb@netsurf-browser.org>	2009-12-17 23:55:02 +0000
committer	John Mark Bell <jmb@netsurf-browser.org>	2009-12-17 23:55:02 +0000
commit	355799ce0bbb078237dfc1ae9874bbc5342acbc4 (patch)
tree	7ca980c01c0d4d1d55a3b7b15418c95c5618afae /desktop/save_complete.c
parent	4346b2b62b940182575e6612e46234355afa083c (diff)
download	netsurf-355799ce0bbb078237dfc1ae9874bbc5342acbc4.tar.gz netsurf-355799ce0bbb078237dfc1ae9874bbc5342acbc4.tar.bz2