/* * This file is part of NetSurf, http://netsurf.sourceforge.net/ * Licensed under the GNU General Public License, * http://www.opensource.org/licenses/gpl-license * Copyright 2005 James Bursa */ /** \file * Content for text/html (implementation). */ #include #include #include #include #include #include #include "libxml/parserInternals.h" #include "netsurf/utils/config.h" #include "netsurf/content/content.h" #include "netsurf/content/fetch.h" #include "netsurf/content/fetchcache.h" #include "netsurf/desktop/browser.h" #include "netsurf/desktop/gui.h" #include "netsurf/desktop/options.h" #include "netsurf/render/box.h" #include "netsurf/render/font.h" #include "netsurf/render/html.h" #include "netsurf/render/imagemap.h" #include "netsurf/render/layout.h" #include "netsurf/utils/log.h" #include "netsurf/utils/messages.h" #include "netsurf/utils/talloc.h" #include "netsurf/utils/url.h" #include "netsurf/utils/utils.h" #define CHUNK 4096 static bool html_set_parser_encoding(struct content *c, const char *encoding); static const char *html_detect_encoding(const char *data, unsigned int size); static void html_convert_css_callback(content_msg msg, struct content *css, intptr_t p1, intptr_t p2, union content_msg_data data); static bool html_meta_refresh(struct content *c, xmlNode *head); static bool html_head(struct content *c, xmlNode *head); static bool html_find_stylesheets(struct content *c, xmlNode *head); static void html_object_callback(content_msg msg, struct content *object, intptr_t p1, intptr_t p2, union content_msg_data data); static void html_object_done(struct box *box, struct content *object, bool background); static void html_object_failed(struct box *box, struct content *content, bool background); static bool html_object_type_permitted(const content_type type, const content_type *permitted_types); static void html_object_refresh(void *p); static bool html_find_frame(struct content *c, const char *frame, struct content **page, unsigned int *i); /** * 
Create a CONTENT_HTML. * * The content_html_data structure is initialized and the HTML parser is * created. */ bool html_create(struct content *c, const char *params[]) { unsigned int i; struct content_html_data *html = &c->data.html; union content_msg_data msg_data; html->parser = 0; html->encoding_handler = 0; html->encoding = 0; html->getenc = true; html->base_url = c->url; html->layout = 0; html->background_colour = TRANSPARENT; html->stylesheet_count = 0; html->stylesheet_content = 0; html->style = 0; html->working_stylesheet = 0; html->object_count = 0; html->object = 0; html->forms = 0; html->imagemaps = 0; html->bw = 0; html->page = 0; html->index = 0; html->box = 0; for (i = 0; params[i]; i += 2) { if (strcasecmp(params[i], "charset") == 0) { html->encoding = talloc_strdup(c, params[i + 1]); if (!html->encoding) goto no_memory; html->encoding_source = ENCODING_SOURCE_HEADER; html->getenc = false; break; } } html->parser = htmlCreatePushParserCtxt(0, 0, "", 0, 0, XML_CHAR_ENCODING_NONE); if (!html->parser) goto no_memory; if (html->encoding) { /* an encoding was specified in the Content-Type header */ if (!html_set_parser_encoding(c, html->encoding)) return false; } return true; no_memory: /* memory allocated will be freed in html_destroy() */ msg_data.error = messages_get("NoMemory"); content_broadcast(c, CONTENT_MSG_ERROR, msg_data); warn_user("NoMemory", 0); return false; } /** * Process data for CONTENT_HTML. * * The data is parsed in chunks of size CHUNK, multitasking in between. */ bool html_process_data(struct content *c, char *data, unsigned int size) { unsigned long x; if (c->data.html.getenc) { /* No encoding was specified in the Content-Type header. * Attempt to detect if the encoding is not 8-bit. If the * encoding is 8-bit, leave the parser unchanged, so that it * searches for a . 
*/ const char *encoding; encoding = html_detect_encoding(data, size); if (encoding) { if (!html_set_parser_encoding(c, encoding)) return false; c->data.html.encoding = talloc_strdup(c, encoding); if (!c->data.html.encoding) return false; c->data.html.encoding_source = ENCODING_SOURCE_DETECTED; } c->data.html.getenc = false; } for (x = 0; x + CHUNK <= size; x += CHUNK) { htmlParseChunk(c->data.html.parser, data + x, CHUNK, 0); gui_multitask(); } htmlParseChunk(c->data.html.parser, data + x, (int) (size - x), 0); return true; } /** * Set the HTML parser character encoding. * * \param c content of type CONTENT_HTML * \param encoding name of encoding * \return true on success, false on error and error reported */ bool html_set_parser_encoding(struct content *c, const char *encoding) { struct content_html_data *html = &c->data.html; xmlError *error; char error_message[500]; union content_msg_data msg_data; html->encoding_handler = xmlFindCharEncodingHandler(encoding); if (!html->encoding_handler) { /* either out of memory, or no handler available */ /* assume no handler available, which is not a fatal error */ LOG(("no encoding handler for \"%s\"", encoding)); /* \todo warn user and ask them to install iconv? */ return true; } xmlCtxtResetLastError(html->parser); if (xmlSwitchToEncoding(html->parser, html->encoding_handler)) { error = xmlCtxtGetLastError(html->parser); snprintf(error_message, sizeof error_message, "%s xmlSwitchToEncoding(): %s", messages_get("MiscError"), error ? error->message : "failed"); msg_data.error = error_message; content_broadcast(c, CONTENT_MSG_ERROR, msg_data); return false; } return true; } /** * Attempt to detect the encoding of some HTML data. 
*
 * \param  data  HTML source data
 * \param  size  length of data
 * \return a constant string giving the encoding, or 0 if the encoding
 *         appears to be some 8-bit encoding
 *
 * Only the first four bytes are examined, so 0 is returned when fewer than
 * four bytes are supplied.
 */

const char *html_detect_encoding(const char *data, unsigned int size)
{
	/* Compare bytes as unsigned: where plain char is signed, a byte such
	 * as 0xfe reads as a negative value and never equals 0xfe. */
	const unsigned char *d = (const unsigned char *) data;

	/* this detection assumes that the first two characters are <= 0xff */
	if (size < 4)
		return 0;

	if (d[0] == 0xfe && d[1] == 0xff)          /* BOM fe ff */
		return "UTF-16BE";
	else if (d[0] == 0xff && d[1] == 0xfe)     /* BOM ff fe */
		return "UTF-16LE";
	else if (d[0] == 0x00 && d[1] != 0x00 &&
			d[2] == 0x00 && d[3] != 0x00)  /* 00 xx 00 xx */
		return "UTF-16BE";
	else if (d[0] != 0x00 && d[1] == 0x00 &&
			d[2] != 0x00 && d[3] == 0x00)  /* xx 00 xx 00 */
		return "UTF-16LE";
	else if (d[0] == 0x00 && d[1] == 0x00 &&
			d[2] == 0x00 && d[3] != 0x00)  /* 00 00 00 xx */
		return "ISO-10646-UCS-4";
	else if (d[0] != 0x00 && d[1] == 0x00 &&
			d[2] == 0x00 && d[3] == 0x00)  /* xx 00 00 00 */
		return "ISO-10646-UCS-4";

	return 0;
}


/**
 * Convert a CONTENT_HTML for display.
 *
 * The following steps are carried out in order:
 *
 *  - parsing to an XML tree is completed
 *  - stylesheets are fetched
 *  - the XML tree is converted to a box tree and object fetches are started
 *  - the box tree is laid out
 *
 * On exit, the content status will be either CONTENT_STATUS_DONE if the
 * document is completely loaded or CONTENT_STATUS_READY if objects are still
 * being fetched.
*/

bool html_convert(struct content *c, int width, int height)
{
	xmlDoc *document;
	xmlNode *html, *head;
	union content_msg_data msg_data;

	/* finish parsing */
	htmlParseChunk(c->data.html.parser, "", 0, 1);
	document = c->data.html.parser->myDoc;
	/*xmlDebugDumpDocument(stderr, c->data.html.parser->myDoc);*/
	htmlFreeParserCtxt(c->data.html.parser);
	c->data.html.parser = 0;
	if (!document) {
		LOG(("Parsing failed"));
		msg_data.error = messages_get("ParsingFail");
		content_broadcast(c, CONTENT_MSG_ERROR, msg_data);
		return false;
	}

	/* From here on the document is owned by this function: the parser
	 * context has gone, so every exit path must free it. */

	if (!c->data.html.encoding && document->encoding) {
		/* The encoding was not in headers or detected, and the parser
		 * found one in the document (recorded as coming from a
		 * <meta> element). */
		c->data.html.encoding = talloc_strdup(c,
				(const char *) document->encoding);
		if (!c->data.html.encoding)
			goto no_memory;
		c->data.html.encoding_source = ENCODING_SOURCE_META;
	}

	/* locate html and head elements */
	for (html = document->children;
			html != 0 && html->type != XML_ELEMENT_NODE;
			html = html->next)
		;
	if (html == 0 || strcmp((const char *) html->name, "html") != 0) {
		LOG(("html element not found"));
		xmlFreeDoc(document);
		msg_data.error = messages_get("ParsingFail");
		content_broadcast(c, CONTENT_MSG_ERROR, msg_data);
		return false;
	}
	for (head = html->children;
			head != 0 && head->type != XML_ELEMENT_NODE;
			head = head->next)
		;
	if (head && strcmp((const char *) head->name, "head") != 0) {
		head = 0;
		LOG(("head element not found"));
	}

	if (head) {
		if (!html_head(c, head))
			goto no_memory;

		/* handle meta refresh */
		if (!html_meta_refresh(c, head)) {
			/* html_meta_refresh() has reported the error */
			xmlFreeDoc(document);
			return false;
		}
	}

	/* get stylesheets */
	if (!html_find_stylesheets(c, head))
		goto no_memory;

	/* convert xml tree to box tree */
	LOG(("XML to box"));
	content_set_status(c, messages_get("Processing"));
	content_broadcast(c, CONTENT_MSG_STATUS, msg_data);
	if (!xml_to_box(html, c))
		goto no_memory;
	/*box_dump(c->data.html.layout->children, 0);*/

	/* extract image maps - can't do this sensibly in xml_to_box */
	if (!imagemap_extract(html, c)) {
		LOG(("imagemap extraction failed"));
		goto no_memory;
	}
	/*imagemap_dump(c);*/

	/* XML tree not required past this point */
	xmlFreeDoc(document);

	/* layout the box tree */
	content_set_status(c, messages_get("Formatting"));
	content_broadcast(c, CONTENT_MSG_STATUS, msg_data);
	LOG(("Layout document"));
	layout_document(c, width, height);
	/*box_dump(c->data.html.layout->children, 0);*/
	c->width = c->data.html.layout->descendant_x1;
	c->height = c->data.html.layout->descendant_y1;

	c->size = talloc_total_size(c);

	if (c->active == 0) {
		c->status = CONTENT_STATUS_DONE;
		content_set_status(c, messages_get("Done"));
	} else {
		c->status = CONTENT_STATUS_READY;
		content_set_status(c, messages_get("FetchObjs"), c->active);
	}

	return true;

no_memory:
	/* common failure exit: release the XML tree, then report */
	xmlFreeDoc(document);
	msg_data.error = messages_get("NoMemory");
	content_broadcast(c, CONTENT_MSG_ERROR, msg_data);
	return false;
}


/**
 * Search for meta refresh
 *
 * http://wp.netscape.com/assist/net_sites/pushpull.html
 *
 * \param  c     content structure
 * \param  head  xml node of head element
 * \return true on success, false otherwise (error reported)
 *
 * The search stops at the first meta refresh that yields a usable URL; on
 * success c->refresh is set and CONTENT_MSG_REFRESH is broadcast with the
 * parsed delay.
 */

bool html_meta_refresh(struct content *c, xmlNode *head)
{
	xmlNode *n;
	xmlChar *equiv, *content;
	union content_msg_data msg_data;
	char *url, *end, *refresh;
	url_func_result res;

	for (n = head == 0 ? 0 : head->children; n; n = n->next) {
		if (n->type != XML_ELEMENT_NODE)
			continue;

		if (strcmp((const char *) n->name, "meta"))
			continue;

		equiv = xmlGetProp(n, (const xmlChar *) "http-equiv");
		if (!equiv)
			continue;

		if (strcasecmp((const char *) equiv, "refresh")) {
			xmlFree(equiv);
			continue;
		}

		xmlFree(equiv);

		content = xmlGetProp(n, (const xmlChar *) "content");
		if (!content)
			continue;

		end = (char *) content + strlen((const char *) content);

		/* url is left pointing just past the parsed number */
		msg_data.delay = (int) strtol((char *) content, &url, 10);

		if (url == end) {
			/* Just delay specified, so refresh current page */
			xmlFree(content);

			c->refresh = talloc_strdup(c, c->url);
			if (!c->refresh) {
				msg_data.error = messages_get("NoMemory");
				content_broadcast(c,
					CONTENT_MSG_ERROR, msg_data);
				return false;
			}

			content_broadcast(c, CONTENT_MSG_REFRESH, msg_data);
			break;
		}

		/* Scan for "url=".  The remaining length is compared rather
		 * than computing end - 4, which would point before the
		 * buffer when the attribute is shorter than 4 bytes. */
		while (end - url >= 4 && strncasecmp(url, "url=", 4) != 0)
			url++;

		if (end - url >= 4) {
			res = url_join(url + 4, c->data.html.base_url,
					&refresh);

			xmlFree(content);

			if (res == URL_FUNC_NOMEM) {
				msg_data.error = messages_get("NoMemory");
				content_broadcast(c,
					CONTENT_MSG_ERROR, msg_data);
				return false;
			} else if (res == URL_FUNC_FAILED) {
				/* This isn't fatal so carry on looking */
				continue;
			}

			c->refresh = talloc_strdup(c, refresh);

			free(refresh);

			if (!c->refresh) {
				msg_data.error = messages_get("NoMemory");
				content_broadcast(c,
					CONTENT_MSG_ERROR, msg_data);
				return false;
			}

			content_broadcast(c, CONTENT_MSG_REFRESH, msg_data);
			break;
		}

		xmlFree(content);
	}

	return true;
}


/**
 * Process elements in <head>.
 *
 * \param  c     content structure
 * \param  head  xml node of head element
 * \return  true on success, false on memory exhaustion
 *
 * The title and base href are extracted if present.
*/

bool html_head(struct content *c, xmlNode *head)
{
	xmlNode *node;

	c->title = 0;

	for (node = head->children; node != 0; node = node->next) {
		if (node->type != XML_ELEMENT_NODE)
			continue;

		if (!c->title && strcmp((const char *) node->name,
				"title") == 0) {
			/* only the first title element is used */
			xmlChar *title = xmlNodeGetContent(node);
			if (!title)
				return false;
			char *title2 = squash_whitespace((const char *) title);
			xmlFree(title);
			if (!title2)
				return false;
			c->title = talloc_strdup(c, title2);
			free(title2);
			if (!c->title)
				return false;

		} else if (strcmp((const char *) node->name, "base") == 0) {
			char *href = (char *) xmlGetProp(node,
					(const xmlChar *) "href");
			if (href) {
				char *url;
				url_func_result res;
				res = url_normalize(href, &url);
				xmlFree(href);
				if (res == URL_FUNC_NOMEM)
					return false;
				if (res == URL_FUNC_OK) {
					/* Copy before replacing base_url, so
					 * an allocation failure leaves the
					 * previous base URL in place. */
					char *base = talloc_strdup(c, url);
					free(url);
					if (!base)
						return false;
					c->data.html.base_url = base;
				}
				/* any other result: keep the current base */
			}
		}
	}

	return true;
}


/**
 * Process inline stylesheets and fetch linked stylesheets.
 *
 * \param  c     content structure
 * \param  head  xml node of head element, or 0 if none
 * \return  true on success, false on memory exhaustion
 */

bool html_find_stylesheets(struct content *c, xmlNode *head)
{
	xmlNode *node, *node2;
	char *rel, *type, *media, *href, *data, *url;
	unsigned int i = STYLESHEET_START;
	unsigned int last_active = 0;
	union content_msg_data msg_data;
	url_func_result res;
	struct content **stylesheet_content;

	/* stylesheet 0 is the base style sheet,
	 * stylesheet 1 is the adblocking stylesheet,
	 * stylesheet 2 is any