/* * This file is part of NetSurf, http://netsurf.sourceforge.net/ * Licensed under the GNU General Public License, * http://www.opensource.org/licenses/gpl-license * Copyright 2003 James Bursa * Copyright 2003 Phil Mellor */ /** \file * Fetching of data from a URL (implementation). * * This implementation uses libcurl's 'multi' interface. * * Active fetches are held in the linked list fetch_list. There may be at most * one fetch in progress from each host. Any further fetches are queued until * the previous one ends. * * Invariant: only the fetch at the head of each queue is in progress, ie. * queue_prev == 0 <=> curl_handle != 0 * and queue_prev != 0 <=> curl_handle == 0. */ #include #include #include #include #include #include "curl/curl.h" #include "libxml/uri.h" #include "netsurf/utils/config.h" #include "netsurf/content/fetch.h" #ifdef riscos #include "netsurf/desktop/gui.h" #endif #include "netsurf/desktop/options.h" #ifdef WITH_AUTH #include "netsurf/desktop/401login.h" #endif #ifdef WITH_POST #include "netsurf/render/form.h" #endif #include "netsurf/utils/log.h" #include "netsurf/utils/messages.h" #include "netsurf/utils/utils.h" bool fetch_active; /**< Fetches in progress, please call fetch_poll(). */ /** Information for a single fetch. */ struct fetch { CURL * curl_handle; /**< cURL handle if being fetched, or 0. */ void (*callback)(fetch_msg msg, void *p, char *data, unsigned long size); /**< Callback function. */ bool had_headers; /**< Headers have been processed. */ bool in_callback; /**< Waiting for return from callback. */ bool aborting; /**< Abort requested in callback. */ bool only_2xx; /**< Only HTTP 2xx responses acceptable. */ char *url; /**< URL. */ char *referer; /**< URL for Referer header. */ char error_buffer[CURL_ERROR_SIZE + 10]; /**< Error buffer for cURL. */ void *p; /**< Private data for callback. */ struct curl_slist *headers; /**< List of request headers. */ char *host; /**< Host part of URL. */ char *location; /**< Response Location header, or 0. */ unsigned long content_length; /**< Response Content-Length, or 0. */ #ifdef WITH_AUTH char *realm; /**< HTTP Auth Realm */ #endif #ifdef WITH_POST char *post_urlenc; /**< Url encoded POST string, or 0. */ struct HttpPost *post_multipart; /**< Multipart post data, or 0. */ #endif struct fetch *queue_prev; /**< Previous fetch for this host. */ struct fetch *queue_next; /**< Next fetch for this host. */ struct fetch *prev; /**< Previous active fetch in ::fetch_list. */ struct fetch *next; /**< Next active fetch in ::fetch_list. */ }; static const char * const user_agent = "NetSurf"; static CURLM * curl_multi; /**< Global cURL multi handle. */ static struct fetch *fetch_list = 0; /**< List of active fetches. */ static size_t fetch_curl_data(void * data, size_t size, size_t nmemb, struct fetch *f); static size_t fetch_curl_header(char * data, size_t size, size_t nmemb, struct fetch *f); static bool fetch_process_headers(struct fetch *f); #ifdef WITH_POST static struct HttpPost *fetch_post_convert(struct form_successful_control *control); #endif #ifdef riscos static char * ca_bundle; /**< SSL certificate bundle filename. */ extern const char * const NETSURF_DIR; #endif /** * Initialise the fetcher. * * Must be called once before any other function. */ void fetch_init(void) { CURLcode code; code = curl_global_init(CURL_GLOBAL_ALL); if (code != CURLE_OK) die("curl_global_init failed"); curl_multi = curl_multi_init(); if (curl_multi == 0) die("curl_multi_init failed"); #ifdef riscos ca_bundle = xcalloc(strlen(NETSURF_DIR) + 100, 1); sprintf(ca_bundle, "%s.Resources.ca-bundle", NETSURF_DIR); LOG(("ca_bundle '%s'", ca_bundle)); #endif } /** * Clean up for quit. * * Must be called before exiting. */ void fetch_quit(void) { CURLMcode codem; codem = curl_multi_cleanup(curl_multi); if (codem != CURLM_OK) LOG(("curl_multi_cleanup failed: ignoring")); curl_global_cleanup(); } /** * Start fetching data for the given URL. * * The function returns immediately. The fetch may be queued for later * processing. * * A pointer to an opaque struct fetch is returned, which can be passed to * fetch_abort() to abort the fetch at any time. Returns 0 if the URL is * invalid. * * The caller must supply a callback function which is called when anything * interesting happens. The callback function is first called with msg * FETCH_TYPE, with the Content-Type header in data, then one or more times * with FETCH_DATA with some data for the url, and finally with * FETCH_FINISHED. Alternatively, FETCH_ERROR indicates an error occurred: * data contains an error message. FETCH_REDIRECT may replace the FETCH_TYPE, * FETCH_DATA, FETCH_FINISHED sequence if the server sends a replacement URL. * * Some private data can be passed as the last parameter to fetch_start, and * callbacks will contain this. */ struct fetch * fetch_start(char *url, char *referer, void (*callback)(fetch_msg msg, void *p, char *data, unsigned long size), void *p, bool only_2xx #ifdef WITH_POST , char *post_urlenc, struct form_successful_control *post_multipart #endif #ifdef WITH_COOKIES , bool cookies #endif ) { struct fetch *fetch = xcalloc(1, sizeof(*fetch)), *host_fetch; CURLcode code; CURLMcode codem; xmlURI *uri; #ifdef WITH_AUTH struct login *li; #endif LOG(("fetch %p, url '%s'", fetch, url)); uri = xmlParseURI(url); if (uri == 0) { LOG(("warning: failed to parse url")); return 0; } /* construct a new fetch structure */ fetch->callback = callback; fetch->had_headers = false; fetch->in_callback = false; fetch->aborting = false; fetch->only_2xx = only_2xx; fetch->url = xstrdup(url); fetch->referer = 0; if (referer != 0) fetch->referer = xstrdup(referer); fetch->p = p; fetch->headers = 0; fetch->host = 0; if (uri->server != 0) fetch->host = xstrdup(uri->server); fetch->content_length = 0; #ifdef WITH_POST fetch->post_urlenc = 0; fetch->post_multipart = 0; if (post_urlenc) fetch->post_urlenc = xstrdup(post_urlenc); else if (post_multipart) fetch->post_multipart = fetch_post_convert(post_multipart); #endif fetch->queue_prev = 0; fetch->queue_next = 0; fetch->prev = 0; fetch->next = 0; xmlFreeURI(uri); /* look for a fetch from the same host */ if (fetch->host != 0) { for (host_fetch = fetch_list; host_fetch != 0 && (host_fetch->host == 0 || strcasecmp(host_fetch->host, fetch->host) != 0); host_fetch = host_fetch->next) ; if (host_fetch != 0) { /* fetch from this host in progress: queue the new fetch */ LOG(("queueing")); fetch->curl_handle = 0; /* queue at end */ for (; host_fetch->queue_next; host_fetch = host_fetch->queue_next) ; fetch->queue_prev = host_fetch; host_fetch->queue_next = fetch; return fetch; } } fetch->next = fetch_list; if (fetch_list != 0) fetch_list->prev = fetch; fetch_list = fetch; fetch_active = true; /* create the curl easy handle */ fetch->curl_handle = curl_easy_init(); assert(fetch->curl_handle != 0); /* TODO: handle curl errors */ code = curl_easy_setopt(fetch->curl_handle, CURLOPT_VERBOSE, 1); assert(code == CURLE_OK); code = curl_easy_setopt(fetch->curl_handle, CURLOPT_URL, fetch->url); assert(code == CURLE_OK); code = curl_easy_setopt(fetch->curl_handle, CURLOPT_PRIVATE, fetch); assert(code == CURLE_OK); code = curl_easy_setopt(fetch->curl_handle, CURLOPT_ERRORBUFFER, fetch->error_buffer); assert(code == CURLE_OK); code = curl_easy_setopt(fetch->curl_handle, CURLOPT_WRITEFUNCTION, fetch_curl_data); assert(code == CURLE_OK); code = curl_easy_setopt(fetch->curl_handle, CURLOPT_WRITEDATA, fetch); assert(code == CURLE_OK); code = curl_easy_setopt(fetch->curl_handle, CURLOPT_HEADERFUNCTION, fetch_curl_header); assert(code == CURLE_OK); code = curl_easy_setopt(fetch->curl_handle, CURLOPT_WRITEHEADER, fetch); assert(code == CURLE_OK); code = curl_easy_setopt(fetch->curl_handle, CURLOPT_USERAGENT, user_agent); assert(code == CURLE_OK); if (referer != 0) { code = curl_easy_setopt(fetch->curl_handle, CURLOPT_REFERER, referer); assert(code == CURLE_OK); } #ifdef riscos code = curl_easy_setopt(fetch->curl_handle, CURLOPT_CAINFO, ca_bundle); assert(code == CURLE_OK); #endif code = curl_easy_setopt(fetch->curl_handle, CURLOPT_LOW_SPEED_LIMIT, 1L); assert(code == CURLE_OK); code = curl_easy_setopt(fetch->curl_handle, CURLOPT_LOW_SPEED_TIME, 60L); assert(code == CURLE_OK); code = curl_easy_setopt(fetch->curl_handle, CURLOPT_NOSIGNAL, 1L); assert(code == CURLE_OK); code = curl_easy_setopt(fetch->curl_handle, CURLOPT_CONNECTTIMEOUT, 60L); assert(code == CURLE_OK); /* custom request headers */ fetch->headers = 0; /* remove curl default headers */ fetch->headers = curl_slist_append(fetch->headers, "Accept:"); fetch->headers = curl_slist_append(fetch->headers, "Pragma:"); code = curl_easy_setopt(fetch->curl_handle, CURLOPT_HTTPHEADER, fetch->headers); assert(code == CURLE_OK); /* use proxy if options dictate this */ if (OPTIONS.http) { code = curl_easy_setopt(fetch->curl_handle, CURLOPT_PROXY, OPTIONS.http_proxy); assert(code == CURLE_OK); code = curl_easy_setopt(fetch->curl_handle, CURLOPT_PROXYPORT, (long)OPTIONS.http_port); assert(code == CURLE_OK); } /* HTTP auth */ #ifdef WITH_AUTH if ((li=login_list_get(url)) != NULL) { code = curl_easy_setopt(fetch->curl_handle, CURLOPT_HTTPAUTH, (long)CURLAUTH_ANY); assert(code == CURLE_OK); code = curl_easy_setopt(fetch->curl_handle, CURLOPT_USERPWD, li->logindetails); assert(code == CURLE_OK); } #endif /* POST */ #ifdef WITH_POST if (fetch->post_urlenc) { code = curl_easy_setopt(fetch->curl_handle, CURLOPT_POSTFIELDS, fetch->post_urlenc); assert(code == CURLE_OK); } else if (fetch->post_multipart) { code = curl_easy_setopt(fetch->curl_handle, CURLOPT_HTTPPOST, fetch->post_multipart); assert(code == CURLE_OK); } #endif /* Cookies */ #ifdef WITH_COOKIES if (cookies) { code = curl_easy_setopt(fetch->curl_handle, CURLOPT_COOKIEFILE, messages_get("cookiefile")); assert(code == CURLE_OK); code = curl_easy_setopt(fetch->curl_handle, CURLOPT_COOKIEJAR, messages_get("cookiejar")); assert(code == CURLE_OK); } #endif /* add to the global curl multi handle */ codem = curl_multi_add_handle(curl_multi, fetch->curl_handle); assert(codem == CURLM_OK || codem == CURLM_CALL_MULTI_PERFORM); return fetch; } /** * Stop a fetch. */ void fetch_abort(struct fetch *f) { CURLMcode codem; #ifdef WITH_AUTH struct login *li; #endif assert(f != 0); LOG(("fetch %p, url '%s'", f, f->url)); if (f->in_callback) { LOG(("in callback: will abort later")); f->aborting = true; return; } /* remove from list of fetches */ if (f->prev == 0) fetch_list = f->next; else f->prev->next = f->next; if (f->next != 0) f->next->prev = f->prev; /* remove from curl multi handle */ if (f->curl_handle) { codem = curl_multi_remove_handle(curl_multi, f->curl_handle); assert(codem == CURLM_OK); } if (f->curl_handle && f->queue_next) { /* start a queued fetch for this host, reusing the handle for this host */ struct fetch *fetch = f->queue_next; CURLcode code; CURLMcode codem; LOG(("starting queued %p '%s'", fetch, fetch->url)); fetch->prev = 0; fetch->next = fetch_list; if (fetch_list != 0) fetch_list->prev = fetch; fetch_list = fetch; fetch->queue_prev = 0; fetch->curl_handle = f->curl_handle; code = curl_easy_setopt(fetch->curl_handle, CURLOPT_URL, fetch->url); assert(code == CURLE_OK); code = curl_easy_setopt(fetch->curl_handle, CURLOPT_PRIVATE, fetch); assert(code == CURLE_OK); code = curl_easy_setopt(fetch->curl_handle, CURLOPT_ERRORBUFFER, fetch->error_buffer); assert(code == CURLE_OK); code = curl_easy_setopt(fetch->curl_handle, CURLOPT_WRITEDATA, fetch); assert(code == CURLE_OK); code = curl_easy_setopt(fetch->curl_handle, CURLOPT_WRITEHEADER, fetch); assert(code == CURLE_OK); /* TODO: remove referer header if fetch->referer == 0 */ /*if (fetch->referer != 0)*/ { code = curl_easy_setopt(fetch->curl_handle, CURLOPT_REFERER, fetch->referer); assert(code == CURLE_OK); } /* HTTP auth */ #ifdef WITH_AUTH if ((li=login_list_get(f->url)) != NULL) { code = curl_easy_setopt(fetch->curl_handle, CURLOPT_HTTPAUTH, (long)CURLAUTH_ANY); assert(code == CURLE_OK); code = curl_easy_setopt(fetch->curl_handle, CURLOPT_USERPWD, li->logindetails); assert(code == CURLE_OK); } #endif /* POST */ #ifdef WITH_POST if (fetch->post_urlenc) { code = curl_easy_setopt(fetch->curl_handle, CURLOPT_POSTFIELDS, fetch->post_urlenc); assert(code == CURLE_OK); } else if (fetch->post_multipart) { code = curl_easy_setopt(fetch->curl_handle, CURLOPT_HTTPPOST, fetch->post_multipart); assert(code == CURLE_OK); } else { code = curl_easy_setopt(fetch->curl_handle, CURLOPT_POST, 0); assert(code == CURLE_OK); code = curl_easy_setopt(fetch->curl_handle, CURLOPT_HTTPPOST, 0); assert(code == CURLE_OK); } #endif /* add to the global curl multi handle */ codem = curl_multi_add_handle(curl_multi, fetch->curl_handle); assert(codem == CURLM_OK || codem == CURLM_CALL_MULTI_PERFORM); } else { if (f->curl_handle) curl_easy_cleanup(f->curl_handle); if (f->headers) curl_slist_free_all(f->headers); if (f->queue_prev) f->queue_prev->queue_next = f->queue_next; if (f->queue_next) f->queue_next->queue_prev = f->queue_prev; } xfree(f->url); free(f->host); free(f->referer); free(f->location); #ifdef WITH_AUTH free(f->realm); #endif #ifdef WITH_POST free(f->post_urlenc); if (f->post_multipart) curl_formfree(f->post_multipart); #endif xfree(f); } /** * Do some work on current fetches. * * Must be called regularly to make progress on fetches. */ void fetch_poll(void) { CURLcode code; CURLMcode codem; int running, queue; bool finished; CURLMsg * curl_msg; struct fetch *f; void *p; void (*callback)(fetch_msg msg, void *p, char *data, unsigned long size); /* do any possible work on the current fetches */ do { codem = curl_multi_perform(curl_multi, &running); assert(codem == CURLM_OK || codem == CURLM_CALL_MULTI_PERFORM); } while (codem == CURLM_CALL_MULTI_PERFORM); /* process curl results */ curl_msg = curl_multi_info_read(curl_multi, &queue); while (curl_msg) { switch (curl_msg->msg) { case CURLMSG_DONE: /* find the structure associated with this fetch */ code = curl_easy_getinfo(curl_msg->easy_handle, CURLINFO_PRIVATE, &f); assert(code == CURLE_OK); LOG(("CURLMSG_DONE, result %i", curl_msg->data.result)); /* inform the caller that the fetch is done */ finished = false; callback = f->callback; p = f->p; if (curl_msg->data.result == CURLE_OK) { /* fetch completed normally */ if (!f->had_headers && fetch_process_headers(f)) ; /* redirect with no body or similar */ else finished = true; } else if (curl_msg->data.result != CURLE_WRITE_ERROR) { /* CURLE_WRITE_ERROR occurs when fetch_curl_data * returns 0, which we use to abort intentionally */ callback(FETCH_ERROR, f->p, f->error_buffer, 0); } /* clean up fetch */ fetch_abort(f); /* postponed until after abort so that queue fetches are started */ if (finished) callback(FETCH_FINISHED, p, 0, 0); break; default: assert(0); } curl_msg = curl_multi_info_read(curl_multi, &queue); } if (!fetch_list) fetch_active = false; } /** * Callback function for cURL. */ size_t fetch_curl_data(void * data, size_t size, size_t nmemb, struct fetch *f) { f->in_callback = true; LOG(("fetch %p, size %u", f, size * nmemb)); if (!f->had_headers && fetch_process_headers(f)) { f->in_callback = false; return 0; } /* send data to the caller */ LOG(("FETCH_DATA")); f->callback(FETCH_DATA, f->p, data, size * nmemb); f->in_callback = false; return size * nmemb; } /** * Callback function for headers. */ size_t fetch_curl_header(char * data, size_t size, size_t nmemb, struct fetch *f) { int i; size *= nmemb; if (12 < size && strncasecmp(data, "Location:", 9) == 0) { /* extract Location header */ f->location = xcalloc(size, 1); for (i = 9; data[i] == ' ' || data[i] == '\t'; i++) ; strncpy(f->location, data + i, size - i); for (i = size - i - 1; f->location[i] == ' ' || f->location[i] == '\t' || f->location[i] == '\r' || f->location[i] == '\n'; i--) f->location[i] = '\0'; } else if (15 < size && strncasecmp(data, "Content-Length:", 15) == 0) { /* extract Content-Length header */ for (i = 15; data[i] == ' ' || data[i] == '\t'; i++) ; if ('0' <= data[i] && data[i] <= '9') f->content_length = atol(data + i); #ifdef WITH_AUTH } else if (16 < size && strncasecmp(data, "WWW-Authenticate",16) == 0) { /* extract Realm from WWW-Authenticate header */ f->realm = xcalloc(size, 1); for (i=16;(unsigned int)i!=strlen(data);i++) if(data[i]=='=')break; strncpy(f->realm, data+i+2, size-i-5); #endif } return size; } /** * Find the status code and content type and inform the caller. * * Return true if the fetch is being aborted. */ bool fetch_process_headers(struct fetch *f) { long http_code; const char *type; CURLcode code; f->had_headers = true; code = curl_easy_getinfo(f->curl_handle, CURLINFO_HTTP_CODE, &http_code); assert(code == CURLE_OK); LOG(("HTTP status code %li", http_code)); /* handle HTTP redirects (3xx response codes) */ if (300 <= http_code && http_code < 400 && f->location != 0) { LOG(("FETCH_REDIRECT, '%s'", f->location)); f->callback(FETCH_REDIRECT, f->p, f->location, 0); return true; } /* handle HTTP 401 (Authentication errors) */ #ifdef WITH_AUTH if (http_code == 401) { f->callback(FETCH_AUTH, f->p, f->realm,0); return true; } #endif /* handle HTTP errors (non 2xx response codes) */ if (f->only_2xx && strncmp(f->url, "http", 4) == 0 && (http_code < 200 || 299 < http_code)) { f->callback(FETCH_ERROR, f->p, messages_get("Not2xx"), 0); return true; } /* find MIME type from headers or filetype for local files */ code = curl_easy_getinfo(f->curl_handle, CURLINFO_CONTENT_TYPE, &type); assert(code == CURLE_OK); if (type == 0) { type = "text/html"; if (strncmp(f->url, "file:///", 8) == 0) { char *url_path; url_path = curl_unescape(f->url + 8, (int) strlen(f->url) - 8); type = fetch_filetype(url_path); curl_free(url_path); } else if (strncmp(f->url, "file:/", 6) == 0) { char *url_path; url_path = curl_unescape(f->url + 6, (int) strlen(f->url) - 6); type = fetch_filetype(url_path); curl_free(url_path); } } LOG(("FETCH_TYPE, '%s'", type)); f->callback(FETCH_TYPE, f->p, type, f->content_length); if (f->aborting) return true; return false; } /** * Convert a list of struct ::form_successful_control to a list of * struct HttpPost for libcurl. */ #ifdef WITH_POST struct HttpPost *fetch_post_convert(struct form_successful_control *control) { struct HttpPost *post = 0, *last = 0; for (; control; control = control->next) { curl_formadd(&post, &last, CURLFORM_COPYNAME, control->name, CURLFORM_COPYCONTENTS, control->value, CURLFORM_END); } return post; } #endif /** * testing framework */ #ifdef TEST #include struct test {char *url; struct fetch *f;}; void callback(fetch_msg msg, struct test *t, char *data, unsigned long size) { printf("%s: ", t->url); switch (msg) { case FETCH_TYPE: printf("FETCH_TYPE '%s'", data); break; case FETCH_DATA: printf("FETCH_DATA %lu", size); break; case FETCH_FINISHED: printf("FETCH_FINISHED"); break; case FETCH_ERROR: printf("FETCH_ERROR '%s'", data); break; default: assert(0); } printf("\n"); } struct test test[] = { {"http://127.0.0.1/", 0}, {"http://netsurf.strcprstskrzkrk.co.uk/", 0}, {"http://www.oxfordstudent.com/", 0}, {"http://www.google.co.uk/", 0}, {"http://news.bbc.co.uk/", 0}, {"http://doesnt.exist/", 0}, {"blah://blah", 0}, }; int main(void) { int i; fetch_init(); for (i = 0; i != sizeof(test) / sizeof(test[0]); i++) test[i].f = fetch_start(test[i].url, 0, callback, &test[i]); while (1) { fetch_poll(); sleep(1); } return 0; } #endif