/* * Copyright 2012 John-Mark Bell * Copyright 2004-2007 James Bursa * * This file is part of NetSurf, http://www.netsurf-browser.org/ * * NetSurf is free software; you can redistribute it and/or modify * it under the terms of the GNU General Public License as published by * the Free Software Foundation; version 2 of the License. * * NetSurf is distributed in the hope that it will be useful, * but WITHOUT ANY WARRANTY; without even the implied warranty of * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the * GNU General Public License for more details. * * You should have received a copy of the GNU General Public License * along with this program. If not, see . */ /** \file * Save HTML document with dependencies (implementation). */ #include #include #include #include #include #include #include #include #include #include "utils/config.h" #include "utils/corestrings.h" #include "utils/log.h" #include "utils/nsurl.h" #include "utils/utf8.h" #include "utils/utils.h" #include "utils/file.h" #include "utils/messages.h" #include "content/content.h" #include "content/hlcache.h" #include "css/css.h" #include "render/box.h" #include "render/html.h" #include "desktop/save_complete.h" regex_t save_complete_import_re; /** An entry in save_complete_list. */ typedef struct save_complete_entry { hlcache_handle *content; struct save_complete_entry *next; /**< Next entry in list */ } save_complete_entry; typedef struct save_complete_ctx { const char *path; save_complete_entry *list; save_complete_set_type_cb set_type; nsurl *base; FILE *fp; enum { STATE_NORMAL, STATE_IN_STYLE } iter_state; } save_complete_ctx; typedef enum { EVENT_ENTER, EVENT_LEAVE } save_complete_event_type; static bool save_complete_save_html(save_complete_ctx *ctx, hlcache_handle *c, bool index); static bool save_complete_save_imported_sheets(save_complete_ctx *ctx, struct nscss_import *imports, uint32_t import_count); static void save_complete_ctx_initialise(save_complete_ctx *ctx, const char *path, save_complete_set_type_cb set_type) { ctx->path = path; ctx->list = NULL; ctx->set_type = set_type; } static void save_complete_ctx_finalise(save_complete_ctx *ctx) { save_complete_entry *list = ctx->list; while (list != NULL) { save_complete_entry *next = list->next; free(list); list = next; } } static bool save_complete_ctx_add_content(save_complete_ctx *ctx, hlcache_handle *content) { save_complete_entry *entry; entry = malloc(sizeof (*entry)); if (entry == NULL) return false; entry->content = content; entry->next = ctx->list; ctx->list = entry; return true; } static hlcache_handle *save_complete_ctx_find_content(save_complete_ctx *ctx, const nsurl *url) { save_complete_entry *entry; for (entry = ctx->list; entry != NULL; entry = entry->next) if (nsurl_compare(url, hlcache_handle_get_url(entry->content), NSURL_COMPLETE)) return entry->content; return NULL; } static bool save_complete_ctx_has_content(save_complete_ctx *ctx, hlcache_handle *content) { save_complete_entry *entry; for (entry = ctx->list; entry != NULL; entry = entry->next) if (entry->content == content) return true; return false; } static bool save_complete_save_buffer(save_complete_ctx *ctx, const char *leafname, const char *data, size_t data_len, lwc_string *mime_type) { nserror ret; FILE *fp; char *fname = NULL; ret = netsurf_mkpath(&fname, NULL, 2, ctx->path, leafname); if (ret != NSERROR_OK) { warn_user(messages_get_errorcode(ret), 0); return false; } fp = fopen(fname, "wb"); if (fp == NULL) { free(fname); LOG(("fopen(): errno = %i", errno)); warn_user("SaveError", strerror(errno)); return false; } fwrite(data, sizeof(*data), data_len, fp); fclose(fp); if (ctx->set_type != NULL) { ctx->set_type(fname, mime_type); } free(fname); return true; } /** * Rewrite stylesheet \@import rules for save complete. * * \param ctx Save complete context. * \param source stylesheet source. * \param size size of source. * \param base url of stylesheet. * \param osize updated with the size of the result. * \return converted source, or NULL on out of memory. */ static char *save_complete_rewrite_stylesheet_urls(save_complete_ctx *ctx, const char *source, unsigned long size, const nsurl *base, unsigned long *osize) { char *rewritten; unsigned long offset = 0; unsigned int imports = 0; nserror error; /* count number occurrences of @import to (over)estimate result size */ /* can't use strstr because source is not 0-terminated string */ for (offset = 0; SLEN("@import") < size && offset <= size - SLEN("@import"); offset++) { if (source[offset] == '@' && tolower(source[offset + 1]) == 'i' && tolower(source[offset + 2]) == 'm' && tolower(source[offset + 3]) == 'p' && tolower(source[offset + 4]) == 'o' && tolower(source[offset + 5]) == 'r' && tolower(source[offset + 6]) == 't') imports++; } rewritten = malloc(size + imports * 20); if (rewritten == NULL) return NULL; *osize = 0; offset = 0; while (offset < size) { const char *import_url = NULL; char *import_url_copy; int import_url_len = 0; nsurl *url = NULL; regmatch_t match[11]; int m = regexec(&save_complete_import_re, source + offset, 11, match, 0); if (m) break; if (match[2].rm_so != -1) { import_url = source + offset + match[2].rm_so; import_url_len = match[2].rm_eo - match[2].rm_so; } else if (match[4].rm_so != -1) { import_url = source + offset + match[4].rm_so; import_url_len = match[4].rm_eo - match[4].rm_so; } else if (match[6].rm_so != -1) { import_url = source + offset + match[6].rm_so; import_url_len = match[6].rm_eo - match[6].rm_so; } else if (match[8].rm_so != -1) { import_url = source + offset + match[8].rm_so; import_url_len = match[8].rm_eo - match[8].rm_so; } else if (match[10].rm_so != -1) { import_url = source + offset + match[10].rm_so; import_url_len = match[10].rm_eo - match[10].rm_so; } assert(import_url != NULL); import_url_copy = strndup(import_url, import_url_len); if (import_url_copy == NULL) { free(rewritten); return NULL; } error = nsurl_join(base, import_url_copy, &url); free(import_url_copy); if (error == NSERROR_NOMEM) { free(rewritten); return NULL; } /* copy data before match */ memcpy(rewritten + *osize, source + offset, match[0].rm_so); *osize += match[0].rm_so; if (url != NULL) { hlcache_handle *content; content = save_complete_ctx_find_content(ctx, url); if (content != NULL) { /* replace import */ char buf[64]; snprintf(buf, sizeof buf, "@import '%p'", content); memcpy(rewritten + *osize, buf, strlen(buf)); *osize += strlen(buf); } else { /* copy import */ memcpy(rewritten + *osize, source + offset + match[0].rm_so, match[0].rm_eo - match[0].rm_so); *osize += match[0].rm_eo - match[0].rm_so; } nsurl_unref(url); } else { /* copy import */ memcpy(rewritten + *osize, source + offset + match[0].rm_so, match[0].rm_eo - match[0].rm_so); *osize += match[0].rm_eo - match[0].rm_so; } assert(0 < match[0].rm_eo); offset += match[0].rm_eo; } /* copy rest of source */ if (offset < size) { memcpy(rewritten + *osize, source + offset, size - offset); *osize += size - offset; } return rewritten; } static bool save_complete_save_stylesheet(save_complete_ctx *ctx, hlcache_handle *css) { const char *css_data; unsigned long css_size; char *source; unsigned long source_len; struct nscss_import *imports; uint32_t import_count; lwc_string *type; char filename[32]; bool result; if (save_complete_ctx_has_content(ctx, css)) return true; if (save_complete_ctx_add_content(ctx, css) == false) { warn_user("NoMemory", 0); return false; } imports = nscss_get_imports(css, &import_count); if (save_complete_save_imported_sheets(ctx, imports, import_count) == false) return false; css_data = content_get_source_data(css, &css_size); source = save_complete_rewrite_stylesheet_urls(ctx, css_data, css_size, hlcache_handle_get_url(css), &source_len); if (source == NULL) { warn_user("NoMemory", 0); return false; } type = content_get_mime_type(css); if (type == NULL) { free(source); return false; } snprintf(filename, sizeof filename, "%p", css); result = save_complete_save_buffer(ctx, filename, source, source_len, type); lwc_string_unref(type); free(source); return result; } static bool save_complete_save_imported_sheets(save_complete_ctx *ctx, struct nscss_import *imports, uint32_t import_count) { uint32_t i; for (i = 0; i < import_count; i++) { /* treat a valid content as a stylesheet to save */ if ((imports[i].c != NULL) && (save_complete_save_stylesheet(ctx, imports[i].c) == false)) { return false; } } return true; } static bool save_complete_save_html_stylesheet(save_complete_ctx *ctx, struct html_stylesheet *sheet) { if (sheet->sheet == NULL) return true; return save_complete_save_stylesheet(ctx, sheet->sheet); } static bool save_complete_save_html_stylesheets(save_complete_ctx *ctx, hlcache_handle *c) { struct html_stylesheet *sheets; unsigned int i, count; sheets = html_get_stylesheets(c, &count); for (i = STYLESHEET_START; i != count; i++) { if (save_complete_save_html_stylesheet(ctx, &sheets[i]) == false) return false; } return true; } static bool save_complete_save_html_object(save_complete_ctx *ctx, hlcache_handle *obj) { const char *obj_data; unsigned long obj_size; lwc_string *type; bool result; char filename[32]; if (content_get_type(obj) == CONTENT_NONE) return true; obj_data = content_get_source_data(obj, &obj_size); if (obj_data == NULL) return true; if (save_complete_ctx_has_content(ctx, obj)) return true; if (save_complete_ctx_add_content(ctx, obj) == false) { warn_user("NoMemory", 0); return false; } if (content_get_type(obj) == CONTENT_HTML) { return save_complete_save_html(ctx, obj, false); } snprintf(filename, sizeof filename, "%p", obj); type = content_get_mime_type(obj); if (type == NULL) return false; result = save_complete_save_buffer(ctx, filename, obj_data, obj_size, type); lwc_string_unref(type); return result; } static bool save_complete_save_html_objects(save_complete_ctx *ctx, hlcache_handle *c) { struct content_html_object *object; unsigned int count; object = html_get_objects(c, &count); for (; object != NULL; object = object->next) { if ((object->content != NULL) && (object->box != NULL)) { if (save_complete_save_html_object(ctx, object->content) == false) return false; } } return true; } static bool save_complete_libdom_treewalk(dom_node *root, bool (*callback)(dom_node *node, save_complete_event_type event_type, void *ctx), void *ctx) { dom_node *node; node = dom_node_ref(root); /* tree root */ while (node != NULL) { dom_node *next = NULL; dom_exception exc; exc = dom_node_get_first_child(node, &next); if (exc != DOM_NO_ERR) { dom_node_unref(node); break; } if (next != NULL) { /* 1. children */ dom_node_unref(node); node = next; } else { exc = dom_node_get_next_sibling(node, &next); if (exc != DOM_NO_ERR) { dom_node_unref(node); break; } if (next != NULL) { /* 2. siblings */ if (callback(node, EVENT_LEAVE, ctx) == false) { return false; } dom_node_unref(node); node = next; } else { /* 3. ancestor siblings */ while (node != NULL) { exc = dom_node_get_next_sibling(node, &next); if (exc != DOM_NO_ERR) { dom_node_unref(node); node = NULL; break; } if (next != NULL) { dom_node_unref(next); break; } exc = dom_node_get_parent_node(node, &next); if (exc != DOM_NO_ERR) { dom_node_unref(node); node = NULL; break; } if (callback(node, EVENT_LEAVE, ctx) == false) { return false; } dom_node_unref(node); node = next; } if (node == NULL) break; exc = dom_node_get_next_sibling(node, &next); if (exc != DOM_NO_ERR) { dom_node_unref(node); break; } if (callback(node, EVENT_LEAVE, ctx) == false) { return false; } dom_node_unref(node); node = next; } } assert(node != NULL); if (callback(node, EVENT_ENTER, ctx) == false) { return false; /* callback caused early termination */ } } return true; } static bool save_complete_rewrite_url_value(save_complete_ctx *ctx, const char *value, size_t value_len) { nsurl *url; hlcache_handle *content; char *escaped; nserror error; error = nsurl_join(ctx->base, value, &url); if (error == NSERROR_NOMEM) return false; if (url != NULL) { content = save_complete_ctx_find_content(ctx, url); if (content != NULL) { /* found a match */ nsurl_unref(url); fprintf(ctx->fp, "\"%p\"", content); } else { /* no match found */ error = utf8_to_html(nsurl_access(url), "UTF-8", nsurl_length(url), &escaped); nsurl_unref(url); if (error != NSERROR_OK) return false; fprintf(ctx->fp, "\"%s\"", escaped); free(escaped); } } else { error = utf8_to_html(value, "UTF-8", value_len, &escaped); if (error != NSERROR_OK) return false; fprintf(ctx->fp, "\"%s\"", escaped); free(escaped); } return true; } static bool save_complete_write_value(save_complete_ctx *ctx, const char *value, size_t value_len) { char *escaped; nserror ret; ret = utf8_to_html(value, "UTF-8", value_len, &escaped); if (ret != NSERROR_OK) return false; fprintf(ctx->fp, "\"%s\"", escaped); free(escaped); return true; } static bool save_complete_handle_attr_value(save_complete_ctx *ctx, dom_string *node_name, dom_string *attr_name, dom_string *attr_value) { const char *node_data = dom_string_data(node_name); size_t node_len = dom_string_byte_length(node_name); const char *name_data = dom_string_data(attr_name); size_t name_len = dom_string_byte_length(attr_name); const char *value_data = dom_string_data(attr_value); size_t value_len = dom_string_byte_length(attr_value); /** * We only need to consider the following cases: * * Attribute: Elements: * * 1) data * 2) href * 3) src