From 7a71363d767f88f54e80e4bbe07eaef54e6fbbee Mon Sep 17 00:00:00 2001 From: James Bursa Date: Sat, 1 Jan 2005 22:05:21 +0000 Subject: [project @ 2005-01-01 22:05:20 by bursa] xcalloc/xrealloc/xstrdup-purge week, part 5. Improve and simplify encoding detection. svn path=/import/netsurf/; revision=1416 --- !NetSurf/Resources/de/Messages | 5 + !NetSurf/Resources/en/Messages | 5 + !NetSurf/Resources/fr/Messages | 5 + render/box.c | 24 +-- render/html.c | 353 +++++++++++++++++++++++++++-------------- render/html.h | 30 ++-- riscos/menus.c | 32 ++-- 7 files changed, 308 insertions(+), 146 deletions(-) diff --git a/!NetSurf/Resources/de/Messages b/!NetSurf/Resources/de/Messages index c26fd02f1..701b2cd82 100644 --- a/!NetSurf/Resources/de/Messages +++ b/!NetSurf/Resources/de/Messages @@ -259,6 +259,11 @@ ScrollH:Ziehen der Leiste scrollt horizontal ScrollPRight:Anklicken scrollt eine Seite nach rechts ScrollRight:Pfeil anklicken scrollt rechts +Encoding0:from HTTP headers +Encoding1:detected +Encoding2:from +EncodingUnk:Unknown + # Interactive help HelpToolbar0:Schaltet zurück auf die zuvor dargestellte Seite.|MDer Inhalt wird dabei nicht aktualisiert. HelpToolbar1:Schaltet vorwärts auf die nächste Seite.|MDer Inhalt wird dabei nicht aktualisiert. diff --git a/!NetSurf/Resources/en/Messages b/!NetSurf/Resources/en/Messages index 6ff639879..d375b633e 100644 --- a/!NetSurf/Resources/en/Messages +++ b/!NetSurf/Resources/en/Messages @@ -259,6 +259,11 @@ ScrollH:Drag the bar to scroll horizontally ScrollPRight:Click to scroll right one page ScrollRight:Click the arrow to scroll right +Encoding0:from HTTP headers +Encoding1:detected +Encoding2:from +EncodingUnk:Unknown + # Interactive help HelpToolbar0:\Tback button.|M\Straverse back one page in the history tree.|MDoes not resubmit form information. HelpToolbar1:\Tforward button.|M\Straverse forward one page in the history tree.|MDoes not resubmit form information. 
diff --git a/!NetSurf/Resources/fr/Messages b/!NetSurf/Resources/fr/Messages index e2a9af949..16d20343f 100644 --- a/!NetSurf/Resources/fr/Messages +++ b/!NetSurf/Resources/fr/Messages @@ -259,6 +259,11 @@ ScrollH:Drag the bar to scroll horizontally ScrollPRight:Click to scroll right one page ScrollRight:Click the arrow to scroll right +Encoding0:from HTTP headers +Encoding1:detected +Encoding2:from +EncodingUnk:Unknown + # Interactive help HelpToolbar0:\Tle bouton de retour.|M\Srevenir d'une page en arrière dans l'historique.|MNe renvoie pas l'information de formulaire. HelpToolbar1:\Tle bouton d'avance.|M\Savancer d'une page dans l'historique.|MNe renvoie pas l'information de formulaire. diff --git a/render/box.c b/render/box.c index 131060dde..bf055eac4 100644 --- a/render/box.c +++ b/render/box.c @@ -731,8 +731,9 @@ end: if (!url) return false; /* start fetch */ - html_fetch_object(content, url, box, image_types, - content->available_width, 1000, true); + if (!html_fetch_object(content, url, box, image_types, + content->available_width, 1000, true)) + return false; } return true; @@ -1120,8 +1121,9 @@ struct box_result box_image(xmlNode *n, struct box_status *status, return (struct box_result) {box, false, false}; /* start fetch */ - html_fetch_object(status->content, url, box, image_types, - status->content->available_width, 1000, false); + if (!html_fetch_object(status->content, url, box, image_types, + status->content->available_width, 1000, false)) + return (struct box_result) {0, false, true}; return (struct box_result) {box, false, false}; } @@ -1551,10 +1553,11 @@ struct box_result box_input(xmlNode *n, struct box_status *status, */ if (res == URL_FUNC_OK && strcasecmp(url, status->content->data.html.base_url) != 0) - html_fetch_object(status->content, url, box, + if (!html_fetch_object(status->content, url, box, image_types, status->content->available_width, - 1000, false); + 1000, false)) + goto no_memory; xmlFree(s); } @@ -2942,7 +2945,8 @@ bool 
plugin_decode(struct content* content, char* url, struct box* box, * handle when we fetch it (if the type was not specified or is * different to that given in the attributes). */ - html_fetch_object(content, url, box, 0, 1000, 1000, false); + if (!html_fetch_object(content, url, box, 0, 1000, 1000, false)) + return false; return true; } @@ -3152,8 +3156,10 @@ struct box_result box_frameset(xmlNode *n, struct box_status *status, LOG(("frame, url '%s'", url)); - html_fetch_object(status->content, url, object_box, 0, - object_width, object_height, false); + if (!html_fetch_object(status->content, url, + object_box, 0, + object_width, object_height, false)) + return (struct box_result) {0, false, true}; xmlFree(s); c = c->next; diff --git a/render/html.c b/render/html.c index 9b0bbf2ee..33b2a96c2 100644 --- a/render/html.c +++ b/render/html.c @@ -34,10 +34,12 @@ #define CHUNK 4096 +static bool html_set_parser_encoding(struct content *c, const char *encoding); +static const char *html_detect_encoding(const char *data, unsigned int size); static void html_convert_css_callback(content_msg msg, struct content *css, void *p1, void *p2, union content_msg_data data); -static void html_head(struct content *c, xmlNode *head); -static void html_find_stylesheets(struct content *c, xmlNode *head); +static bool html_head(struct content *c, xmlNode *head); +static bool html_find_stylesheets(struct content *c, xmlNode *head); static void html_object_callback(content_msg msg, struct content *object, void *p1, void *p2, union content_msg_data data); static void html_object_done(struct box *box, struct content *object, @@ -58,44 +60,11 @@ bool html_create(struct content *c, const char *params[]) unsigned int i; struct content_html_data *html = &c->data.html; union content_msg_data msg_data; - xmlCharEncoding encXML = XML_CHAR_ENCODING_NONE; - const char *encStr = NULL; - html->encoding = NULL; + html->parser = 0; + html->encoding_handler = 0; + html->encoding = 0; html->getenc = true; - 
- for (i = 0; params[i]; i += 2) { - if (strcasecmp(params[i], "charset") == 0) { - encXML = xmlParseCharEncoding(params[i + 1]); - if (encXML != XML_CHAR_ENCODING_ERROR - && encXML != XML_CHAR_ENCODING_NONE) { - /* encoding specified - trust the server... */ - html->encoding = xstrdup(xmlGetCharEncodingName(encXML)); - html->getenc = false; - } else { - encStr = xstrdup(params[i + 1]); - } - break; - } - } - - html->parser = htmlCreatePushParserCtxt(0, 0, "", 0, 0, encXML); - if (encStr != NULL) { - xmlCharEncodingHandlerPtr handler; - if ((handler = xmlFindCharEncodingHandler(encStr)) != NULL) { - if (xmlSwitchToEncoding(html->parser, handler) == 0) { - html->encoding = encStr; - html->getenc = false; - } else { - LOG(("xmlSwitchToEncoding failed for <%s>\n", encStr)); - free((void *)encStr); - } - } else { - LOG(("xmlFindCharEncodingHandler() failed for <%s>\n", encStr)); - free((void *)encStr); - } - } - html->base_url = xstrdup(c->url); html->base_url = strdup(c->url); html->layout = 0; html->background_colour = TRANSPARENT; @@ -106,26 +75,43 @@ bool html_create(struct content *c, const char *params[]) html->object_count = 0; html->object = 0; html->imagemaps = 0; - html->string_pool = pool_create(8000); html->box_pool = pool_create(sizeof (struct box) * 100); + html->string_pool = pool_create(8000); html->bw = 0; - if (!html->parser || !html->base_url || !html->string_pool || - !html->box_pool) { - htmlFreeParserCtxt(html->parser); - free(html->base_url); - if (html->string_pool) - pool_destroy(html->string_pool); - if (html->box_pool) - pool_destroy(html->box_pool); + if (!html->base_url || !html->string_pool || !html->box_pool) + goto no_memory; - msg_data.error = messages_get("NoMemory"); - content_broadcast(c, CONTENT_MSG_ERROR, msg_data); - warn_user("NoMemory", 0); - return false; + for (i = 0; params[i]; i += 2) { + if (strcasecmp(params[i], "charset") == 0) { + html->encoding = strdup(params[i + 1]); + if (!html->encoding) + goto no_memory; + 
html->encoding_source = ENCODING_SOURCE_HEADER; + html->getenc = false; + break; + } + } + + html->parser = htmlCreatePushParserCtxt(0, 0, "", 0, 0, + XML_CHAR_ENCODING_NONE); + if (!html->parser) + goto no_memory; + + if (html->encoding) { + /* an encoding was specified in the Content-Type header */ + if (!html_set_parser_encoding(c, html->encoding)) + return false; } return true; + +no_memory: + /* memory allocated will be freed in html_destroy() */ + msg_data.error = messages_get("NoMemory"); + content_broadcast(c, CONTENT_MSG_ERROR, msg_data); + warn_user("NoMemory", 0); + return false; } @@ -139,16 +125,22 @@ bool html_process_data(struct content *c, char *data, unsigned int size) { unsigned long x; - /* First time through, check if we need to detect the encoding - * if so, detect it and reset the parser instance with it. - * Do this detection only once. - */ if (c->data.html.getenc) { - xmlCharEncoding encoding = xmlDetectCharEncoding(data, size); - if (encoding != XML_CHAR_ENCODING_ERROR && - encoding != XML_CHAR_ENCODING_NONE) { - xmlSwitchEncoding(c->data.html.parser, encoding); - c->data.html.encoding = xstrdup(xmlGetCharEncodingName(encoding)); + /* No encoding was specified in the Content-Type header. + * Attempt to detect if the encoding is not 8-bit. If the + * encoding is 8-bit, leave the parser unchanged, so that it + * searches for a <meta http-equiv="content-type" + * content="text/html; charset=...">. */ + const char *encoding; + encoding = html_detect_encoding(data, size); + if (encoding) { + if (!html_set_parser_encoding(c, encoding)) + return false; + c->data.html.encoding = strdup(encoding); + if (!c->data.html.encoding) + return false; + c->data.html.encoding_source = + ENCODING_SOURCE_DETECTED; } c->data.html.getenc = false; } @@ -163,6 +155,80 @@ bool html_process_data(struct content *c, char *data, unsigned int size) } +/** + * Set the HTML parser character encoding. 
+ * + * \param c content of type CONTENT_HTML + * \param encoding name of encoding + * \return true on success, false on error and error reported + */ + +bool html_set_parser_encoding(struct content *c, const char *encoding) +{ + struct content_html_data *html = &c->data.html; + xmlError *error; + char error_message[500]; + union content_msg_data msg_data; + + html->encoding_handler = xmlFindCharEncodingHandler(encoding); + if (!html->encoding_handler) { + /* either out of memory, or no handler available */ + /* assume no handler available, which is not a fatal error */ + LOG(("no encoding handler for \"%s\"", encoding)); + /* \todo warn user and ask them to install iconv? */ + return true; + } + + xmlCtxtResetLastError(html->parser); + if (xmlSwitchToEncoding(html->parser, html->encoding_handler)) { + error = xmlCtxtGetLastError(html->parser); + snprintf(error_message, sizeof error_message, + "%s xmlSwitchToEncoding(): %s", + messages_get("MiscError"), + error ? error->message : "failed"); + msg_data.error = error_message; + content_broadcast(c, CONTENT_MSG_ERROR, msg_data); + return false; + } + + return true; +} + + +/** + * Attempt to detect the encoding of some HTML data. 
+ * + * \param data HTML source data + * \param size length of data + * \return a constant string giving the encoding, or 0 if the encoding + * appears to be some 8-bit encoding + */ + +const char *html_detect_encoding(const char *data, unsigned int size) +{ + /* this detection assumes that the first two characters are <= 0xff; BOM bytes are compared as unsigned char since plain char may be signed */ + if (size < 4) + return 0; + if ((unsigned char) data[0] == 0xfe && (unsigned char) data[1] == 0xff) /* BOM fe ff */ + return "UTF-16BE"; + else if ((unsigned char) data[0] == 0xff && (unsigned char) data[1] == 0xfe) /* BOM ff fe */ + return "UTF-16LE"; + else if (data[0] == 0x00 && data[1] != 0x00 && + data[2] == 0x00 && data[3] != 0x00) /* 00 xx 00 xx */ + return "UTF-16BE"; + else if (data[0] != 0x00 && data[1] == 0x00 && + data[2] != 0x00 && data[3] == 0x00) /* xx 00 xx 00 */ + return "UTF-16LE"; + else if (data[0] == 0x00 && data[1] == 0x00 && + data[2] == 0x00 && data[3] != 0x00) /* 00 00 00 xx */ + return "ISO-10646-UCS-4"; + else if (data[0] != 0x00 && data[1] == 0x00 && + data[2] == 0x00 && data[3] == 0x00) /* xx 00 00 00 */ + return "ISO-10646-UCS-4"; + return 0; +} + + /** * Convert a CONTENT_HTML for display. * @@ -196,11 +262,19 @@ bool html_convert(struct content *c, int width, int height) content_broadcast(c, CONTENT_MSG_ERROR, msg_data); return false; } - /* Last change to pick the Content-Type charset information if the - * server didn't send it (or we're reading the HTML from disk) - */ - if (c->data.html.encoding == NULL && document->encoding != NULL) - c->data.html.encoding = xstrdup(document->encoding); + + if (!c->data.html.encoding && document->encoding) { + /* The encoding was not in headers or detected, and the parser + * found a <meta http-equiv="content-type" + * content="text/html; charset=...">. 
*/ + c->data.html.encoding = strdup(document->encoding); + if (!c->data.html.encoding) { + msg_data.error = messages_get("NoMemory"); + content_broadcast(c, CONTENT_MSG_ERROR, msg_data); + return false; + } + c->data.html.encoding_source = ENCODING_SOURCE_META; + } /* locate html and head elements */ for (html = document->children; @@ -223,11 +297,20 @@ bool html_convert(struct content *c, int width, int height) LOG(("head element not found")); } - if (head != 0) - html_head(c, head); + if (head) { + if (!html_head(c, head)) { + msg_data.error = messages_get("NoMemory"); + content_broadcast(c, CONTENT_MSG_ERROR, msg_data); + return false; + } + } /* get stylesheets */ - html_find_stylesheets(c, head); + if (!html_find_stylesheets(c, head)) { + msg_data.error = messages_get("NoMemory"); + content_broadcast(c, CONTENT_MSG_ERROR, msg_data); + return false; + } /* convert xml tree to box tree */ LOG(("XML to box")); @@ -279,11 +362,12 @@ bool html_convert(struct content *c, int width, int height) * * \param c content structure * \param head xml node of head element + * \return true on success, false on memory exhaustion * * The title and base href are extracted if present. 
*/ -void html_head(struct content *c, xmlNode *head) +bool html_head(struct content *c, xmlNode *head) { xmlNode *node; @@ -295,7 +379,11 @@ void html_head(struct content *c, xmlNode *head) if (!c->title && strcmp(node->name, "title") == 0) { xmlChar *title = xmlNodeGetContent(node); + if (!title) + return false; c->title = squash_whitespace(title); + if (!c->title) + return false; xmlFree(title); } else if (strcmp(node->name, "base") == 0) { @@ -312,6 +400,7 @@ void html_head(struct content *c, xmlNode *head) } } } + return true; } @@ -320,9 +409,10 @@ void html_head(struct content *c, xmlNode *head) * * \param c content structure * \param head xml node of head element, or 0 if none + * \return true on success, false on memory exhaustion */ -void html_find_stylesheets(struct content *c, xmlNode *head) +bool html_find_stylesheets(struct content *c, xmlNode *head) { xmlNode *node, *node2; char *rel, *type, *media, *href, *data, *url; @@ -330,11 +420,15 @@ void html_find_stylesheets(struct content *c, xmlNode *head) unsigned int last_active = 0; union content_msg_data msg_data; url_func_result res; + struct content **stylesheet_content; /* stylesheet 0 is the base style sheet, * stylesheet 1 is the adblocking stylesheet, * stylesheet 2 is any