/* * Copyright 2007 James Bursa * * This file is part of NetSurf, http://www.netsurf-browser.org/ * * NetSurf is free software; you can redistribute it and/or modify * it under the terms of the GNU General Public License as published by * the Free Software Foundation; version 2 of the License. * * NetSurf is distributed in the hope that it will be useful, * but WITHOUT ANY WARRANTY; without even the implied warranty of * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the * GNU General Public License for more details. * * You should have received a copy of the GNU General Public License * along with this program. If not, see . */ /** \file * Content for text/html (implementation). */ #define _GNU_SOURCE /* for strndup() */ #include #include #include #include #include #include #include "utils/config.h" #include "content/content.h" #include "content/fetch.h" #include "content/fetchcache.h" #include "desktop/browser.h" #include "desktop/gui.h" #include "desktop/options.h" #include "image/bitmap.h" #include "render/box.h" #include "render/font.h" #include "render/form.h" #include "render/html.h" #include "render/imagemap.h" #include "render/layout.h" #include "utils/log.h" #include "utils/messages.h" #include "utils/talloc.h" #include "utils/url.h" #include "utils/utils.h" #define CHUNK 4096 /* Change these to 1 to cause a dump to stderr of the frameset or box * when the trees have been built. 
*/ #define ALWAYS_DUMP_FRAMESET 0 #define ALWAYS_DUMP_BOX 0 static void html_convert_css_callback(content_msg msg, struct content *css, intptr_t p1, intptr_t p2, union content_msg_data data); static bool html_meta_refresh(struct content *c, xmlNode *head); static bool html_head(struct content *c, xmlNode *head); static bool html_find_stylesheets(struct content *c, xmlNode *html); static bool html_process_style_element(struct content *c, xmlNode *style); static void html_object_callback(content_msg msg, struct content *object, intptr_t p1, intptr_t p2, union content_msg_data data); static void html_object_done(struct box *box, struct content *object, bool background); static void html_object_failed(struct box *box, struct content *content, bool background); static bool html_object_type_permitted(const content_type type, const content_type *permitted_types); static void html_object_refresh(void *p); static void html_destroy_frameset(struct content_html_frames *frameset); static void html_destroy_iframe(struct content_html_iframe *iframe); static void html_set_status(struct content *c, const char *extra); #if ALWAYS_DUMP_FRAMESET static void html_dump_frameset(struct content_html_frames *frame, unsigned int depth); #endif static const char empty_document[] = "" "" "" "Empty document" "" "" "

Empty document

" "

The document sent by the server is empty.

" "" ""; /** * Create a CONTENT_HTML. * * The content_html_data structure is initialized and the HTML parser is * created. */ bool html_create(struct content *c, const char *params[]) { unsigned int i; struct content_html_data *html = &c->data.html; union content_msg_data msg_data; binding_error error; html->parser_binding = NULL; html->document = 0; html->encoding = 0; html->base_url = c->url; html->base_target = NULL; html->layout = 0; html->background_colour = TRANSPARENT; html->stylesheet_count = 0; html->stylesheet_content = 0; html->style = 0; html->working_stylesheet = 0; html->object_count = 0; html->object = 0; html->forms = 0; html->imagemaps = 0; html->bw = 0; html->frameset = 0; html->iframe = 0; html->page = 0; html->index = 0; html->box = 0; html->font_func = &nsfont; for (i = 0; params[i]; i += 2) { if (strcasecmp(params[i], "charset") == 0) { html->encoding = talloc_strdup(c, params[i + 1]); if (!html->encoding) { error = BINDING_NOMEM; goto error; } html->encoding_source = ENCODING_SOURCE_HEADER; break; } } /* Create the parser binding */ error = binding_create_tree(c, html->encoding, &html->parser_binding); if (error == BINDING_BADENCODING && html->encoding != NULL) { /* Ok, we don't support the declared encoding. Bailing out * isn't exactly user-friendly, so fall back to autodetect */ talloc_free(html->encoding); html->encoding = NULL; error = binding_create_tree(c, html->encoding, &html->parser_binding); } if (error != BINDING_OK) goto error; return true; error: if (error == BINDING_BADENCODING) { LOG(("Bad encoding: %s", html->encoding ? html->encoding : "")); msg_data.error = messages_get("ParsingFail"); } else msg_data.error = messages_get("NoMemory"); content_broadcast(c, CONTENT_MSG_ERROR, msg_data); return false; } /** * Process data for CONTENT_HTML. * * The data is parsed in chunks of size CHUNK, multitasking in between. 
*/ bool html_process_data(struct content *c, char *data, unsigned int size) { unsigned long x; binding_error err; const char *encoding; for (x = 0; x + CHUNK <= size; x += CHUNK) { err = binding_parse_chunk(c->data.html.parser_binding, (uint8_t *) data + x, CHUNK); if (err == BINDING_ENCODINGCHANGE) { goto encoding_change; } gui_multitask(); } err = binding_parse_chunk(c->data.html.parser_binding, (uint8_t *) data + x, (size - x)); if (err == BINDING_ENCODINGCHANGE) { goto encoding_change; } return true; encoding_change: /* Retrieve new encoding */ encoding = binding_get_encoding( c->data.html.parser_binding, &c->data.html.encoding_source); if (c->data.html.encoding != NULL) talloc_free(c->data.html.encoding); c->data.html.encoding = talloc_strdup(c, encoding); if (c->data.html.encoding == NULL) { union content_msg_data msg_data; msg_data.error = messages_get("NoMemory"); content_broadcast(c, CONTENT_MSG_ERROR, msg_data); return false; } /* Destroy binding */ binding_destroy_tree(c->data.html.parser_binding); /* Create new binding, using the new encoding */ err = binding_create_tree(c, c->data.html.encoding, &c->data.html.parser_binding); if (err == BINDING_BADENCODING) { /* Ok, we don't support the declared encoding. Bailing out * isn't exactly user-friendly, so fall back to Windows-1252 */ talloc_free(c->data.html.encoding); c->data.html.encoding = talloc_strdup(c, "Windows-1252"); if (c->data.html.encoding == NULL) { union content_msg_data msg_data; msg_data.error = messages_get("NoMemory"); content_broadcast(c, CONTENT_MSG_ERROR, msg_data); return false; } err = binding_create_tree(c, c->data.html.encoding, &c->data.html.parser_binding); } if (err != BINDING_OK) { union content_msg_data msg_data; if (err == BINDING_BADENCODING) { LOG(("Bad encoding: %s", c->data.html.encoding ? 
c->data.html.encoding : "")); msg_data.error = messages_get("ParsingFail"); } else msg_data.error = messages_get("NoMemory"); content_broadcast(c, CONTENT_MSG_ERROR, msg_data); return false; } /* Recurse to reprocess all that data. This is safe because * the encoding is now specified at parser-start which means * it cannot be changed again. */ return html_process_data(c, c->source_data, c->source_size); } /** * Convert a CONTENT_HTML for display. * * The following steps are carried out in order: * * - parsing to an XML tree is completed * - stylesheets are fetched * - the XML tree is converted to a box tree and object fetches are started * - the box tree is laid out * * On exit, the content status will be either CONTENT_STATUS_DONE if the * document is completely loaded or CONTENT_STATUS_READY if objects are still * being fetched. */ bool html_convert(struct content *c, int width, int height) { xmlNode *html, *head; union content_msg_data msg_data; unsigned int time_before, time_taken; #ifdef WITH_HUBBUB struct form *f; #endif /* finish parsing */ if (c->source_size == 0) { binding_error err; /* Destroy current binding */ binding_destroy_tree(c->data.html.parser_binding); /* Also, any existing encoding information, * as it's not guaranteed to match the error page. */ talloc_free(c->data.html.encoding); c->data.html.encoding = NULL; /* Create new binding, using default charset */ err = binding_create_tree(c, NULL, &c->data.html.parser_binding); if (err != BINDING_OK) { union content_msg_data msg_data; if (err == BINDING_BADENCODING) { LOG(("Bad encoding: %s", c->data.html.encoding ? 
c->data.html.encoding : ""));
				msg_data.error = messages_get("ParsingFail");
			} else
				msg_data.error = messages_get("NoMemory");

			content_broadcast(c, CONTENT_MSG_ERROR, msg_data);
			return false;
		}

		/* Process the error page: the built-in empty_document text is
		 * fed through the normal data path in place of the (empty)
		 * source data */
		if (html_process_data(c, (char *) empty_document,
				SLEN(empty_document)) == false)
			return false;
	}

	/* Complete the parse and fetch the document from the binding */
	binding_parse_completed(c->data.html.parser_binding);
	c->data.html.document =
			binding_get_document(c->data.html.parser_binding);
	/*xmlDebugDumpDocument(stderr, c->data.html.document);*/
	if (!c->data.html.document) {
		LOG(("Parsing failed"));
		msg_data.error = messages_get("ParsingFail");
		content_broadcast(c, CONTENT_MSG_ERROR, msg_data);
		return false;
	}

	/* If no encoding has been recorded so far, take the one the binding
	 * reports, together with its source */
	if (c->data.html.encoding == NULL) {
		const char *encoding = binding_get_encoding(
				c->data.html.parser_binding,
				&c->data.html.encoding_source);

		c->data.html.encoding = talloc_strdup(c, encoding);
		if (c->data.html.encoding == NULL) {
			msg_data.error = messages_get("NoMemory");
			content_broadcast(c, CONTENT_MSG_ERROR, msg_data);
			return false;
		}
	}

	/* locate html and head elements */
	html = xmlDocGetRootElement(c->data.html.document);
	if (html == 0 || strcmp((const char *) html->name, "html") != 0) {
		LOG(("html element not found"));
		msg_data.error = messages_get("ParsingFail");
		content_broadcast(c, CONTENT_MSG_ERROR, msg_data);
		return false;
	}

	/* head candidate is the first element child of html (non-element
	 * nodes are skipped); it is discarded below unless it is named
	 * "head" */
	for (head = html->children;
			head != 0 && head->type != XML_ELEMENT_NODE;
			head = head->next)
		;
	if (head && strcmp((const char *) head->name, "head") != 0) {
		head = 0;
		LOG(("head element not found"));
	}

	if (head) {
		/* any failure of html_head() is reported as NoMemory */
		if (!html_head(c, head)) {
			msg_data.error = messages_get("NoMemory");
			content_broadcast(c, CONTENT_MSG_ERROR, msg_data);
			return false;
		}

		/* handle meta refresh (reports its own errors) */
		if (!html_meta_refresh(c, head))
			return false;
	}

	/* get stylesheets */
	if (!html_find_stylesheets(c, html))
		return false;

#ifdef WITH_HUBBUB
	/* Retrieve forms from parser; the list is walked via the prev
	 * links */
	c->data.html.forms = binding_get_forms(c->data.html.parser_binding);
	for (f = c->data.html.forms; f != NULL; f = f->prev) {
		char *action;
		url_func_result res;

		/* Make all actions absolute, resolving against the document
		 * base URL.
		 * NOTE(review): every url_join() failure, not only memory
		 * exhaustion, is reported as NoMemory here. */
		res = url_join(f->action, c->data.html.base_url, &action);
		if (res != URL_FUNC_OK) {
			msg_data.error = messages_get("NoMemory");
			content_broadcast(c, CONTENT_MSG_ERROR, msg_data);
			return false;
		}

		/* replace the form's action with the joined URL */
		free(f->action);
		f->action = action;

		/* Ensure each form has a document encoding */
		if (f->document_charset == NULL) {
			f->document_charset = strdup(c->data.html.encoding);
			if (f->document_charset == NULL) {
				msg_data.error = messages_get("NoMemory");
				content_broadcast(c, CONTENT_MSG_ERROR,
						msg_data);
				return false;
			}
		}
	}
#endif

	/* convert xml tree to box tree */
	LOG(("XML to box"));
	content_set_status(c, messages_get("Processing"));
	/* NOTE(review): msg_data has not been assigned on this path when it
	 * is passed with CONTENT_MSG_STATUS - confirm receivers ignore it */
	content_broadcast(c, CONTENT_MSG_STATUS, msg_data);
	if (!xml_to_box(html, c)) {
		msg_data.error = messages_get("NoMemory");
		content_broadcast(c, CONTENT_MSG_ERROR, msg_data);
		return false;
	}
#if ALWAYS_DUMP_BOX
	box_dump(c->data.html.layout->children, 0);
#endif
#if ALWAYS_DUMP_FRAMESET
	if (c->data.html.frameset)
		html_dump_frameset(c->data.html.frameset, 0);
#endif

	/* extract image maps - can't do this sensibly in xml_to_box */
	if (!imagemap_extract(html, c)) {
		LOG(("imagemap extraction failed"));
		msg_data.error = messages_get("NoMemory");
		content_broadcast(c, CONTENT_MSG_ERROR, msg_data);
		return false;
	}
	/*imagemap_dump(c);*/

	/* layout the box tree, timing how long the layout takes */
	html_set_status(c, messages_get("Formatting"));
	content_broadcast(c, CONTENT_MSG_STATUS, msg_data);
	LOG(("Layout document"));
	time_before = wallclock();
	html_reformat(c, width, height);
	time_taken = wallclock() - time_before;
	/* NOTE(review): time_taken is unsigned but is logged with %d */
	LOG(("Layout took %dcs", time_taken));
	/* earliest time for the next reformat: now plus the larger of
	 * option_min_reflow_period and 1.25 times the measured layout time */
	c->reformat_time = wallclock() +
		((time_taken < option_min_reflow_period ?
option_min_reflow_period : time_taken * 1.25)); LOG(("Scheduling relayout no sooner than %dcs", c->reformat_time - wallclock())); /*box_dump(c->data.html.layout->children, 0);*/ /* Destroy the parser binding */ binding_destroy_tree(c->data.html.parser_binding); c->data.html.parser_binding = NULL; if (c->active == 0) c->status = CONTENT_STATUS_DONE; else c->status = CONTENT_STATUS_READY; html_set_status(c, ""); return true; } /** * Process elements in . * * \param c content structure * \param head xml node of head element * \return true on success, false on memory exhaustion * * The title and base href are extracted if present. */ bool html_head(struct content *c, xmlNode *head) { xmlNode *node; xmlChar *s; c->title = 0; for (node = head->children; node != 0; node = node->next) { if (node->type != XML_ELEMENT_NODE) continue; LOG(("Node: %s", node->name)); if (!c->title && strcmp((const char *) node->name, "title") == 0) { xmlChar *title = xmlNodeGetContent(node); char *title2; if (!title) return false; title2 = squash_whitespace((const char *) title); xmlFree(title); if (!title2) return false; c->title = talloc_strdup(c, title2); free(title2); if (!c->title) return false; } else if (strcmp((const char *) node->name, "base") == 0) { char *href = (char *) xmlGetProp(node, (const xmlChar *) "href"); if (href) { char *url; url_func_result res; res = url_normalize(href, &url); if (res == URL_FUNC_OK) { c->data.html.base_url = talloc_strdup(c, url); free(url); } xmlFree(href); } /* don't use the central values to ease freeing later on */ if ((s = xmlGetProp(node, (const xmlChar *) "target"))) { if ((!strcasecmp((const char *) s, "_blank")) || (!strcasecmp((const char *) s, "_top")) || (!strcasecmp((const char *) s, "_parent")) || (!strcasecmp((const char *) s, "_self")) || ('a' <= s[0] && s[0] <= 'z') || ('A' <= s[0] && s[0] <= 'Z')) { /* [6.16] */ c->data.html.base_target = talloc_strdup(c, (const char *) s); if (!c->data.html.base_target) { xmlFree(s); return false; } } 
xmlFree(s); } } } return true; } /** * Search for meta refresh * * http://wp.netscape.com/assist/net_sites/pushpull.html * * \param c content structure * \param head xml node of head element * \return true on success, false otherwise (error reported) */ bool html_meta_refresh(struct content *c, xmlNode *head) { xmlNode *n; xmlChar *equiv, *content; union content_msg_data msg_data; char *url, *end, *refresh = NULL, quote = 0; url_func_result res; for (n = head == 0 ? 0 : head->children; n; n = n->next) { if (n->type != XML_ELEMENT_NODE) continue; /* Recurse into noscript elements */ if (strcmp((const char *) n->name, "noscript") == 0) { if (!html_meta_refresh(c, n)) { /* Some error occurred */ return false; } else if (c->refresh) { /* Meta refresh found - stop */ return true; } } if (strcmp((const char *) n->name, "meta")) { continue; } equiv = xmlGetProp(n, (const xmlChar *) "http-equiv"); if (!equiv) continue; if (strcasecmp((const char *) equiv, "refresh")) { xmlFree(equiv); continue; } xmlFree(equiv); content = xmlGetProp(n, (const xmlChar *) "content"); if (!content) continue; end = (char *) content + strlen((const char *) content); /* content := *LWS intpart fracpart? *LWS [';' *LWS *1url *LWS] * intpart := 1*DIGIT * fracpart := 1*('.' | DIGIT) * url := "url" *LWS '=' *LWS (url-nq | url-sq | url-dq) * url-nq := *urlchar * url-sq := "'" *(urlchar | '"') "'" * url-dq := '"' *(urlchar | "'") '"' * urlchar := [#x9#x21#x23-#x26#x28-#x7E] | nonascii * nonascii := [#x80-#xD7FF#xE000-#xFFFD#x10000-#x10FFFF] */ /* *LWS intpart */ msg_data.delay = (int)strtol((char *) content, &url, 10); /* a very small delay and self-referencing URL can cause a loop * that grinds machines to a halt. To prevent this we set a * minimum refresh delay of 1s. */ if (msg_data.delay < 1) msg_data.delay = 1; /* fracpart? 
(ignored, as delay is integer only) */ while (url < end && (('0' <= *url && *url <= '9') || *url == '.')) { url++; } /* *LWS */ while (url < end && isspace(*url)) { url++; } /* ';' */ if (url < end && *url == ';') url++; /* *LWS */ while (url < end && isspace(*url)) { url++; } if (url == end) { /* Just delay specified, so refresh current page */ xmlFree(content); c->refresh = talloc_strdup(c, c->url); if (!c->refresh) { msg_data.error = messages_get("NoMemory"); content_broadcast(c, CONTENT_MSG_ERROR, msg_data); return false; } content_broadcast(c, CONTENT_MSG_REFRESH, msg_data); break; } /* "url" */ if (url <= end - 3) { if (strncasecmp(url, "url", 3) == 0) { url += 3; } else { /* Unexpected input, ignore this header */ continue; } } else { /* Insufficient input, ignore this header */ continue; } /* *LWS */ while (url < end && isspace(*url)) { url++; } /* '=' */ if (url < end) { if (*url == '=') { url++; } else { /* Unexpected input, ignore this header */ continue; } } else { /* Insufficient input, ignore this header */ continue; } /* *LWS */ while (url < end && isspace(*url)) { url++; } /* '"' or "'" */ if (url < end && (*url == '"' || *url == '\'')) { quote = *url; url++; } /* Start of URL */ refresh = url; if (quote != 0) { /* url-sq | url-dq */ while (url < end && *url != quote) url++; } else { /* url-nq */ while (url < end && !isspace(*url)) url++; } /* '"' or "'" or *LWS (we don't care) */ if (url < end) *url = '\0'; res = url_join(refresh, c->data.html.base_url, &refresh); xmlFree(content); if (res == URL_FUNC_NOMEM) { msg_data.error = messages_get("NoMemory"); content_broadcast(c, CONTENT_MSG_ERROR, msg_data); return false; } else if (res == URL_FUNC_FAILED) { /* This isn't fatal so carry on looking */ continue; } c->refresh = talloc_strdup(c, refresh); free(refresh); if (!c->refresh) { msg_data.error = messages_get("NoMemory"); content_broadcast(c, CONTENT_MSG_ERROR, msg_data); return false; } content_broadcast(c, CONTENT_MSG_REFRESH, msg_data); } return 
true; } /** * Process inline stylesheets and fetch linked stylesheets. * * Uses STYLE and LINK elements inside and outside HEAD * * \param c content structure * \param head xml node of html element * \return true on success, false if an error occurred */ bool html_find_stylesheets(struct content *c, xmlNode *html) { xmlNode *node; char *rel, *type, *media, *href, *url, *url2; unsigned int i = STYLESHEET_START; unsigned int last_active = 0; union content_msg_data msg_data; url_func_result res; struct content **stylesheet_content; /* stylesheet 0 is the base style sheet, * stylesheet 1 is the adblocking stylesheet, * stylesheet 2 is any