From 9f3a082658a979269838d4682f9abfb21a90f679 Mon Sep 17 00:00:00 2001
From: Michael Drake <tlsa@netsurf-browser.org>
Date: Wed, 21 Sep 2011 14:36:42 +0000
Subject: New URL handling (unused atm).

svn path=/trunk/netsurf/; revision=12843
---
 utils/errors.h |    4 +-
 utils/nsurl.c  | 1338 ++++++++++++++++++++++++++++++++++++++++++++++++++++++++
 utils/nsurl.h  |  130 ++++++
 3 files changed, 1471 insertions(+), 1 deletion(-)
 create mode 100644 utils/nsurl.c
 create mode 100644 utils/nsurl.h

(limited to 'utils')

diff --git a/utils/errors.h b/utils/errors.h
index 5f1bd631b..546709703 100644
--- a/utils/errors.h
+++ b/utils/errors.h
@@ -45,7 +45,9 @@ typedef enum {
 
 	NSERROR_BAD_ENCODING,		/**< The character set is unknown */
 
-	NSERROR_NEED_DATA		/**< More data needed */
+	NSERROR_NEED_DATA,		/**< More data needed */
+
+	NSERROR_BAD_URL			/**< Bad URL */
 } nserror;
 
 #endif
diff --git a/utils/nsurl.c b/utils/nsurl.c
new file mode 100644
index 000000000..063a36ce2
--- /dev/null
+++ b/utils/nsurl.c
@@ -0,0 +1,1338 @@
+/*
+ * Copyright 2011 Michael Drake <tlsa@netsurf-browser.org>
+ *
+ * This file is part of NetSurf, http://www.netsurf-browser.org/
+ *
+ * NetSurf is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License as published by
+ * the Free Software Foundation; version 2 of the License.
+ *
+ * NetSurf is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program.  If not, see <http://www.gnu.org/licenses/>.
+ */
+
+/** \file
+ * NetSurf URL handling (implementation).
+ */
+
+#include <assert.h>
+#include <ctype.h>
+#include <libwapcaplet/libwapcaplet.h>
+#include <stdlib.h>
+#include <string.h>
+
+#include "utils/errors.h"
+#include "utils/log.h"
+#include "utils/nsurl.h"
+#include "utils/utils.h"
+
+
+/* Define to enable NSURL debugging */
+#undef NSURL_DEBUG
+
+
+/**
+ * NetSurf URL object
+ *
+ * [scheme]://[username][:password]@[host]:[port][/path][?query][#fragment]
+ *
+ * Each component is held as a libwapcaplet string (lwc_string).  A component
+ * that the URL does not have is left as NULL.
+ */
+struct nsurl {
+	lwc_string *scheme;	/**< may be NULL */
+	lwc_string *username;	/**< may be NULL */
+	lwc_string *password;	/**< may be NULL */
+	lwc_string *host;	/**< may be NULL */
+	lwc_string *port;	/**< may be NULL */
+	lwc_string *path;	/**< may be NULL */
+	lwc_string *query;	/**< may be NULL */
+	lwc_string *fragment;	/**< may be NULL */
+};
+
+
+/**
+ * Marker set, indicating positions of sections within a URL string
+ *
+ * Every marker is an offset, in bytes, from the beginning of the URL string.
+ */
+struct url_markers {
+	size_t start; /**< start of URL (after any leading whitespace) */
+	size_t scheme_end; /**< the ':' which ends the scheme */
+	size_t authority; /**< start of authority (after its leading '/'s) */
+
+	size_t colon_first; /**< first ':' in authority; left == authority if none */
+	size_t at; /**< first '@' in authority; left == authority if none */
+	size_t colon_last; /**< a later ':' in authority; left == authority if none */
+
+	size_t path; /**< start of path (end of authority) */
+	size_t query; /**< start of query (end of path) */
+	size_t fragment; /**< start of fragment (end of query) */
+
+	size_t end; /**< end of URL (before any trailing whitespace) */
+};
+
+
+/**
+ * Sections of a URL
+ *
+ * Selects which span of a marked-out URL string is to be normalised and
+ * turned into URL object components.
+ */
+enum url_sections {
+	URL_SCHEME,		/* gives the scheme component */
+	URL_CREDENTIALS,	/* gives the username and password components */
+	URL_HOST,		/* gives the host and port components */
+	URL_PATH,		/* gives the path component */
+	URL_QUERY,		/* gives the query component */
+	URL_FRAGMENT		/* gives the fragment component */
+};
+
+
+/**
+ * Take a new reference to a URL component
+ *
+ * Evaluates to NULL if c is NULL, otherwise to the result of referencing c.
+ * The expansion is fully parenthesised so it is safe inside any expression.
+ * Note that c is evaluated twice.
+ */
+#define nsurl__component_copy(c) (((c) == NULL) ? NULL : lwc_string_ref(c))
+
+#define nsurl__component_compare(c1, c2, match)		\
+	if (c1 && c2)					\
+		lwc_string_isequal(c1, c2, match);	\
+	else if (c1 || c2)				\
+		*match = false;
+
+
+/**
+ * Obtains a set of markers delimiting sections in a URL string
+ *
+ * \param url_s		URL string ('\0' terminated)
+ * \param markers	Updated to mark sections in the URL string
+ * \param joining	True iff URL string is a relative URL for joining
+ *
+ * All markers are offsets from url_s.  Leading and trailing whitespace is
+ * excluded from the marked range, and on return the markers satisfy
+ * start <= path <= query <= fragment <= end.
+ */
+static void nsurl__get_string_markers(const char * const url_s,
+		struct url_markers *markers, bool joining)
+{
+	const char *pos = url_s; /** current position in url_s */
+	bool is_http = false;
+
+	/* Initialise marker set */
+	struct url_markers marker = { 0, 0, 0,   0, 0, 0,   0, 0, 0,   0 };
+
+	/* Skip any leading whitespace in url_s.
+	 * (ctype functions need a value representable as unsigned char) */
+	while (isspace((unsigned char)*pos))
+		pos++;
+
+	/* Record start point */
+	marker.start = pos - url_s;
+
+	/* Until found otherwise, every section is empty and sits at the
+	 * start.  scheme_end is included so that (scheme_end - start) can't
+	 * wrap when there is leading whitespace but no scheme. */
+	marker.scheme_end = marker.authority = marker.colon_first =
+			marker.at = marker.colon_last = marker.path =
+			marker.start;
+
+	/* Get scheme */
+	if (isalpha((unsigned char)*pos)) {
+		pos++;
+
+		while (*pos != ':' && *pos != '\0') {
+
+			if (!isalnum((unsigned char)*pos) && *pos != '+' &&
+					*pos != '-' && *pos != '.') {
+				/* This character is not valid in the
+				 * scheme */
+				break;
+			}
+			pos++;
+		}
+
+		if (*pos == ':') {
+			/* This delimits the end of the scheme */
+			const char *scheme = url_s + marker.start;
+			size_t off;
+
+			marker.scheme_end = pos - url_s;
+
+			off = marker.scheme_end - marker.start;
+
+			/* Detect http(s) for scheme specific normalisation.
+			 * The fifth character is only examined when the
+			 * scheme is five characters long. */
+			if ((off == SLEN("http") || off == SLEN("https")) &&
+					(scheme[0] == 'h' || scheme[0] == 'H') &&
+					(scheme[1] == 't' || scheme[1] == 'T') &&
+					(scheme[2] == 't' || scheme[2] == 'T') &&
+					(scheme[3] == 'p' || scheme[3] == 'P') &&
+					(off == SLEN("http") ||
+					 scheme[4] == 's' ||
+					 scheme[4] == 'S')) {
+				is_http = true;
+			}
+
+			/* Skip over colon */
+			pos++;
+
+			/* Mark place as start of authority */
+			marker.authority = marker.colon_first = marker.at =
+					marker.colon_last = marker.path =
+					pos - url_s;
+
+		} else {
+			/* Not found a scheme.  The characters scanned above
+			 * belong to a later section, so go back to the start
+			 * and let that section's parsing see them. */
+			pos = url_s + marker.start;
+
+			if (joining == false) {
+				/* Assuming no scheme == http */
+				is_http = true;
+			}
+		}
+	}
+
+	/* Get authority
+	 *
+	 * If this is a relative url that is to be joined onto a base URL, we
+	 * require two slashes to be certain we correctly handle a missing
+	 * authority.
+	 *
+	 * If this URL is not getting joined, we are less strict in the case of
+	 * http(s) and will accept any number of slashes, including 0.
+	 */
+	if (*pos != '\0' && ((joining == false && is_http == true) ||
+			(*pos == '/' && *(pos + 1) == '/'))) {
+		/* Skip over leading slashes */
+		if (is_http == false) {
+			if (*pos == '/') pos++;
+			if (*pos == '/') pos++;
+		} else {
+			while (*pos == '/')
+				pos++;
+		}
+
+		marker.authority = marker.colon_first = marker.at =
+				marker.colon_last = marker.path = pos - url_s;
+
+		/* Need to get (or complete) the authority.
+		 * The terminator is tested before each step, so an empty
+		 * authority at the end of the string is not stepped over. */
+		while (*pos != '\0') {
+			if (*pos == '/' || *pos == '?' || *pos == '#') {
+				/* End of the authority */
+				break;
+
+			} else if (*pos == ':' && marker.colon_first ==
+					marker.authority) {
+				/* could be username:password or host:port
+				 * separator */
+				marker.colon_first = pos - url_s;
+
+			} else if (*pos == ':' && marker.colon_first !=
+					marker.authority) {
+				/* could be host:port separator */
+				marker.colon_last = pos - url_s;
+
+			} else if (*pos == '@' && marker.at ==
+					marker.authority) {
+				/* Credentials @ host separator */
+				marker.at = pos - url_s;
+			}
+			pos++;
+		}
+
+		marker.path = pos - url_s;
+	}
+
+	/* Get path
+	 *
+	 * Needs to start with '/' if there's no authority
+	 */
+	if (*pos == '/' || ((marker.path == marker.authority) &&
+			(*pos != '?') && (*pos != '#') && (*pos != '\0'))) {
+		while (*(++pos) != '\0') {
+			if (*pos == '?' || *pos == '#') {
+				/* End of the path */
+				break;
+			}
+		}
+	}
+
+	marker.query = pos - url_s;
+
+	/* Get query */
+	if (*pos == '?') {
+		while (*(++pos) != '\0') {
+			if (*pos == '#') {
+				/* End of the query */
+				break;
+			}
+		}
+	}
+
+	marker.fragment = pos - url_s;
+
+	/* Get fragment */
+	if (*pos == '#') {
+		while (*(++pos) != '\0')
+			;
+	}
+
+	/* We got to the end of url_s.
+	 * Need to skip back over trailing whitespace to find end of URL,
+	 * without ever stepping back past the start of the URL. */
+	while (pos > url_s + marker.start &&
+			isspace((unsigned char)*(pos - 1)))
+		pos--;
+	marker.end = pos - url_s;
+
+	/* Trailing whitespace was scanned as part of the last section.
+	 * Pull back any section start that now lies beyond the end, so no
+	 * section has a negative (wrapped) length. */
+	if (marker.fragment > marker.end)
+		marker.fragment = marker.end;
+	if (marker.query > marker.end)
+		marker.query = marker.end;
+	if (marker.path > marker.end)
+		marker.path = marker.end;
+
+	/* Got all the URL components pegged out now */
+	*markers = marker;
+}
+
+
+/**
+ * Remove dot segments from a path, as per rfc 3986, 5.2.4
+ *
+ * \param path		path to remove dot segments from ('\0' terminated)
+ * \param output	path with dot segments removed
+ * \return size of output
+ *
+ * The output is not '\0' terminated; use the returned size.  No more bytes
+ * are written to output than there are characters in path.
+ */
+static size_t nsurl__remove_dot_segments(char *path, char *output)
+{
+	char *path_pos = path;
+	char *output_pos = output;
+
+	while (*path_pos != '\0') {
+#ifdef NSURL_DEBUG
+		LOG((" in:%s", path_pos));
+		LOG(("out:%.*s", output_pos - output, output));
+#endif
+		if (*path_pos == '.') {
+			if (*(path_pos + 1) == '.' &&
+					*(path_pos + 2) == '/') {
+				/* Found prefix of "../" */
+				path_pos += SLEN("../");
+				continue;
+
+			} else if (*(path_pos + 1) == '/') {
+				/* Found prefix of "./" */
+				path_pos += SLEN("./");
+				continue;
+
+			} else if (*(path_pos + 1) == '\0') {
+				/* Found "." at end of path */
+
+				/* End of input path */
+				break;
+
+			} else if (*(path_pos + 1) == '.' &&
+					*(path_pos + 2) == '\0') {
+				/* Found ".." at end of path */
+
+				/* End of input path */
+				break;
+			}
+		} else if (*path_pos == '/' && *(path_pos + 1) == '.') {
+			if (*(path_pos + 2) == '/') {
+				/* Found prefix of "/./" */
+				path_pos += SLEN("/.");
+				continue;
+
+			} else if (*(path_pos + 2) == '\0') {
+				/* Found "/." at end of path */
+				*(output_pos++) = '/';
+
+				/* End of input path */
+				break;
+
+			} else if (*(path_pos + 2) == '.') {
+				if (*(path_pos + 3) == '/') {
+					/* Found prefix of "/../" */
+					path_pos += SLEN("/..");
+
+					/* Drop last output segment, and the
+					 * '/' before it; the '/' still in the
+					 * input gets copied next time round */
+					if (output_pos > output)
+						output_pos--;
+					while (output_pos > output &&
+							*output_pos != '/')
+						output_pos--;
+
+					continue;
+
+				} else if (*(path_pos + 3) == '\0') {
+					/* Found "/.." at end of path.
+					 * Drop last output segment and the
+					 * '/' before it, then the "/.." is
+					 * replaced by a single '/' */
+					if (output_pos > output)
+						output_pos--;
+					while (output_pos > output &&
+							*output_pos != '/')
+						output_pos--;
+					*(output_pos++) = '/';
+
+					/* End of input path */
+					break;
+				}
+			}
+		}
+		/* Copy first character into output path */
+		*output_pos++ = *path_pos++;
+
+		/* Copy up to but not including next '/' */
+		while ((*path_pos != '/') && (*path_pos != '\0'))
+			*output_pos++ = *path_pos++;
+	}
+
+	return output_pos - output;
+}
+
+
+/**
+ * Find the size of the biggest section that a marker set describes
+ *
+ * \param m	markers delimiting url sections in a string
+ * \return the length of the longest section
+ *
+ * The host span is measured from the '@' marker, and the credentials span
+ * from the start of the authority.
+ */
+static size_t nsurl__get_longest_section(struct url_markers *m)
+{
+	/* Section lengths, in URL order */
+	const size_t span[] = {
+		m->scheme_end - m->start,	/* scheme */
+		m->at - m->authority,		/* credentials */
+		m->path - m->at,		/* host */
+		m->query - m->path,		/* path */
+		m->fragment - m->query,		/* query */
+		m->end - m->fragment		/* fragment */
+	};
+	size_t longest = span[0];
+	size_t i;
+
+	for (i = 1; i < sizeof(span) / sizeof(span[0]); i++) {
+		if (longest < span[i])
+			longest = span[i];
+	}
+
+	return longest;
+}
+
+
+/**
+ * Converts two hexadecimal digits to a single number
+ *
+ * \param c1	most significant hex digit
+ * \param c2	least significant hex digit
+ * \return the total value of the two digit hex number, or 0 if either
+ *         character is not a hex digit
+ *
+ * For unescaping url encoded characters.  Hex digits are accepted in either
+ * case, since "%7e" and "%7E" are the same escape.
+ */
+static inline int nsurl__get_ascii_offset(char c1, char c2)
+{
+	int offset;
+
+	/* Use 1st char as most significant hex digit.
+	 * (ctype functions need a value representable as unsigned char) */
+	if (isdigit((unsigned char)c1))
+		offset = 16 * (c1 - '0');
+	else if (c1 >= 'a' && c1 <= 'f')
+		offset = 16 * (c1 - 'a' + 10);
+	else if (c1 >= 'A' && c1 <= 'F')
+		offset = 16 * (c1 - 'A' + 10);
+	else
+		/* TODO: return something special to indicate error? */
+		return 0;
+
+	/* Use 2nd char as least significant hex digit and sum */
+	if (isdigit((unsigned char)c2))
+		offset += c2 - '0';
+	else if (c2 >= 'a' && c2 <= 'f')
+		offset += c2 - 'a' + 10;
+	else if (c2 >= 'A' && c2 <= 'F')
+		offset += c2 - 'A' + 10;
+	else
+		/* TODO: return something special to indicate error? */
+		return 0;
+
+	return offset;
+}
+
+
+/**
+ * Create the components of a NetSurf URL object for a section of a URL string
+ *
+ * \param url_s		URL string
+ * \param section	Sets which section of URL string is to be normalised
+ * \param pegs		Set of markers delimiting the URL string's sections
+ * \param pos_norm	A buffer large enough for the normalised string (*3 + 1)
+ * \param url		A NetSurf URL object, to which components may be added
+ * \return NSERROR_OK on success, appropriate error otherwise
+ *
+ * The section of url_s is normalised appropriately:
+ *  - escapes of characters that need no escaping are unescaped,
+ *  - whitespace is escaped,
+ *  - the scheme and host are lower cased.
+ *
+ * When handling URL_HOST, url->scheme is consulted to decide whether a port
+ * of 80 is redundant, so the scheme should already have been set in url.
+ */
+static nserror nsurl__create_from_section(const char * const url_s,
+		const enum url_sections section,
+		const struct url_markers *pegs,
+		char *pos_norm,
+		nsurl *url)
+{
+	int ascii_offset;
+	int start = 0;
+	int end = 0;
+	const char *pos;
+	const char *pos_url_s;
+	char *norm_start = pos_norm;
+	size_t copy_len;
+	size_t length;
+	enum {
+		NSURL_F_IS_HTTP		= (1 << 0),
+		NSURL_F_IS_HTTPS	= (1 << 1),
+		NSURL_F_NO_PORT		= (1 << 2)
+	} flags = 0;
+
+	switch (section) {
+	case URL_SCHEME:
+		start = pegs->start;
+		end = pegs->scheme_end;
+		break;
+
+	case URL_CREDENTIALS:
+		start = pegs->authority;
+		end = pegs->at;
+		break;
+
+	case URL_HOST:
+		/* Host starts after the '@', if there is one */
+		start = (pegs->at == pegs->authority &&
+				*(url_s + pegs->at) != '@') ?
+				pegs->at :
+				pegs->at + 1;
+		end = pegs->path;
+		break;
+
+	case URL_PATH:
+		start = pegs->path;
+		end = pegs->query;
+		break;
+
+	case URL_QUERY:
+		start = pegs->query;
+		end = pegs->fragment;
+		break;
+
+	case URL_FRAGMENT:
+		start = pegs->fragment;
+		end = pegs->end;
+		break;
+	}
+
+	length = end - start;
+
+	/* Stage 1: Normalise the required section
+	 *
+	 * Runs of characters that can be used as they are get counted in
+	 * copy_len, and are copied in one go when a character needing
+	 * special treatment is met (or the section ends). */
+
+	pos = pos_url_s = url_s + start;
+	copy_len = 0;
+	for (; pos < url_s + end; pos++) {
+		if (*pos == '%' && (pos + 2 < url_s + end)) {
+			/* Might be an escaped character needing unescaped */
+
+			/* Find which character which was escaped */
+			ascii_offset = nsurl__get_ascii_offset(*(pos + 1),
+					*(pos + 2));
+
+			if (ascii_offset <= 0x20 ||
+					strchr(";/?:@&=+$,<>#%\"{}|\\^[]`",
+							ascii_offset) ||
+					ascii_offset >= 0x7f) {
+				/* This character should be escaped after all,
+				 * just let it get copied */
+				copy_len += 3;
+				pos += 2;
+				continue;
+			}
+
+			if (copy_len > 0) {
+				/* Copy up to here */
+				memcpy(pos_norm, pos_url_s, copy_len);
+				pos_norm += copy_len;
+				copy_len = 0;
+			}
+
+			/* Put the unescaped character in the normalised URL */
+			*(pos_norm++) = (char)ascii_offset;
+			pos += 2;
+			pos_url_s = pos + 1;
+
+			length -= 2;
+
+		} else if (isspace((unsigned char)*pos)) {
+			/* This whitespace needs to be escaped */
+
+			if (copy_len > 0) {
+				/* Copy up to here */
+				memcpy(pos_norm, pos_url_s, copy_len);
+				pos_norm += copy_len;
+				copy_len = 0;
+			}
+			/* escape */
+
+			*(pos_norm++) = '%';
+			*(pos_norm++) = digit2lowcase_hex(*pos >> 4);
+			*(pos_norm++) = digit2lowcase_hex(*pos & 0xf);
+			pos_url_s = pos + 1;
+
+			length += 2;
+
+		} else if ((section == URL_SCHEME || section == URL_HOST) &&
+				isupper((unsigned char)*pos)) {
+			/* Lower case this letter */
+
+			if (copy_len > 0) {
+				/* Copy up to here */
+				memcpy(pos_norm, pos_url_s, copy_len);
+				pos_norm += copy_len;
+				copy_len = 0;
+			}
+			/* Copy lower cased letter into normalised URL */
+			*(pos_norm++) = tolower((unsigned char)*pos);
+			pos_url_s = pos + 1;
+
+		} else {
+			/* This character is safe in normalised URL */
+			copy_len++;
+		}
+	}
+
+	if (copy_len > 0) {
+		/* Copy up to here */
+		memcpy(pos_norm, pos_url_s, copy_len);
+		pos_norm += copy_len;
+	}
+
+	/* Mark end of section */
+	(*pos_norm) = '\0';
+
+	/* Stage 2: Create the URL components for the required section */
+	switch (section) {
+	case URL_SCHEME:
+		if (length == 0 || (length == SLEN("http") &&
+				strncmp(norm_start, "http",
+						SLEN("http")) == 0)) {
+			flags |= NSURL_F_IS_HTTP;
+		} else if (length == SLEN("https") &&
+				strncmp(norm_start, "https",
+						SLEN("https")) == 0) {
+			flags |= NSURL_F_IS_HTTPS;
+		}
+
+		if (length == 0) {
+			/* No scheme, assuming http, and add to URL */
+			if (lwc_intern_string("http", SLEN("http"),
+					&url->scheme) != lwc_error_ok) {
+				return NSERROR_NOMEM;
+			}
+		} else {
+			/* Add scheme to URL */
+			if (lwc_intern_string(norm_start, length,
+					&url->scheme) != lwc_error_ok) {
+				return NSERROR_NOMEM;
+			}
+		}
+
+		break;
+
+	case URL_CREDENTIALS:
+		if (length != 0 && *norm_start != ':') {
+			char *sec_start = norm_start;
+			if (pegs->colon_first != pegs->authority &&
+					pegs->at > pegs->colon_first + 1) {
+				/* there's a password */
+				sec_start += pegs->colon_first -
+						pegs->authority + 1;
+				if (lwc_intern_string(sec_start,
+						pegs->at - pegs->colon_first -1,
+						&url->password) !=
+						lwc_error_ok) {
+					return NSERROR_NOMEM;
+				}
+
+				/* update start pos and length for username */
+				sec_start = norm_start;
+				length -= pegs->at - pegs->colon_first;
+			} else if (pegs->colon_first != pegs->authority &&
+					pegs->at == pegs->colon_first + 1) {
+				/* strip username colon */
+				length--;
+			}
+
+			/* Username */
+			if (lwc_intern_string(sec_start, length,
+					&url->username) != lwc_error_ok) {
+				return NSERROR_NOMEM;
+			}
+		}
+
+		break;
+
+	case URL_HOST:
+		if (length != 0) {
+			size_t colon = 0;	/* colon's offset in url_s */
+			size_t colon_off = 0;	/* colon's offset in section */
+			char *sec_start = norm_start;
+
+			/* The flags don't survive from the call which handled
+			 * the scheme, so find out whether the scheme is http
+			 * from the URL object itself. */
+			if (url->scheme != NULL &&
+					lwc_string_length(url->scheme) ==
+							SLEN("http") &&
+					strncmp(lwc_string_data(url->scheme),
+						"http", SLEN("http")) == 0) {
+				flags |= NSURL_F_IS_HTTP;
+			}
+
+			if (pegs->at < pegs->colon_first &&
+					pegs->colon_last == pegs->authority) {
+				/* There's one colon and it's after @ marker */
+				colon = pegs->colon_first;
+			} else if (pegs->colon_last != pegs->authority) {
+				/* There's more than one colon */
+				colon = pegs->colon_last;
+			} else {
+				/* There's no colon that could be a port
+				 * separator */
+				flags |= NSURL_F_NO_PORT;
+			}
+
+			if (!(flags & NSURL_F_NO_PORT)) {
+				/* Determine whether colon is a port separator.
+				 *
+				 * The offset is taken from the start of the
+				 * section, not from the @ marker; they differ
+				 * by one when there is an '@'. */
+				colon_off = colon - (size_t)start;
+				sec_start += colon_off;
+				while (++sec_start < norm_start + length) {
+					if (!isdigit((unsigned char)
+							*sec_start)) {
+						/* Character after port isn't a
+						 * digit; not a port separator
+						 */
+						flags |= NSURL_F_NO_PORT;
+						break;
+					}
+				}
+			}
+
+			if (!(flags & NSURL_F_NO_PORT)) {
+				/* There's a port */
+				size_t port_len = length - (colon_off + 1);
+
+				sec_start = norm_start + colon_off + 1;
+				if (flags & NSURL_F_IS_HTTP &&
+						port_len == 2 &&
+						*sec_start == '8' &&
+						*(sec_start + 1) == '0') {
+					/* Scheme is http, and port is default
+					 * (80) */
+					flags |= NSURL_F_NO_PORT;
+				}
+
+				if (port_len == 0) {
+					/* No space for a port after the colon
+					 */
+					flags |= NSURL_F_NO_PORT;
+				}
+
+				/* Add non-redundant ports to NetSurf URL */
+				if (!(flags & NSURL_F_NO_PORT) &&
+						lwc_intern_string(sec_start,
+						port_len,
+						&url->port) != lwc_error_ok) {
+					return NSERROR_NOMEM;
+				}
+
+				/* update length for host */
+				length = colon_off;
+			}
+
+			/* host */
+			if (lwc_intern_string(norm_start, length,
+					&url->host) != lwc_error_ok) {
+				return NSERROR_NOMEM;
+			}
+		}
+
+		break;
+
+	case URL_PATH:
+		if (length != 0) {
+			if (lwc_intern_string(norm_start, length,
+					&url->path) != lwc_error_ok) {
+				return NSERROR_NOMEM;
+			}
+		} else if (url->host != NULL) {
+			/* Set empty path to "/", if there's a host */
+			if (lwc_intern_string("/", SLEN("/"),
+					&url->path) != lwc_error_ok) {
+				return NSERROR_NOMEM;
+			}
+		}
+
+		break;
+
+	case URL_QUERY:
+		if (length != 0) {
+			if (lwc_intern_string(norm_start, length,
+					&url->query) != lwc_error_ok) {
+				return NSERROR_NOMEM;
+			}
+		}
+
+		break;
+
+	case URL_FRAGMENT:
+		if (length != 0) {
+			if (lwc_intern_string(norm_start, length,
+					&url->fragment) != lwc_error_ok) {
+				return NSERROR_NOMEM;
+			}
+		}
+
+		break;
+	}
+
+	return NSERROR_OK;
+}
+
+
+#ifdef NSURL_DEBUG
+/**
+ * Log each component that a NetSurf URL has
+ *
+ * \param url	The NetSurf URL to dump components of
+ *
+ * Components which are absent (NULL) are not logged.
+ */
+static void nsurl__dump(const nsurl *url)
+{
+	if (url->scheme != NULL) {
+		LOG(("  Scheme: %s", lwc_string_data(url->scheme)));
+	}
+
+	if (url->username != NULL) {
+		LOG(("Username: %s", lwc_string_data(url->username)));
+	}
+
+	if (url->password != NULL) {
+		LOG(("Password: %s", lwc_string_data(url->password)));
+	}
+
+	if (url->host != NULL) {
+		LOG(("    Host: %s", lwc_string_data(url->host)));
+	}
+
+	if (url->port != NULL) {
+		LOG(("    Port: %s", lwc_string_data(url->port)));
+	}
+
+	if (url->path != NULL) {
+		LOG(("    Path: %s", lwc_string_data(url->path)));
+	}
+
+	if (url->query != NULL) {
+		LOG(("   Query: %s", lwc_string_data(url->query)));
+	}
+
+	if (url->fragment != NULL) {
+		LOG(("Fragment: %s", lwc_string_data(url->fragment)));
+	}
+}
+#endif
+
+
+/******************************************************************************
+ * NetSurf URL Public API                                                     *
+ ******************************************************************************/
+
+/* exported interface, documented in nsurl.h
+ *
+ * On any failure the partly built object is destroyed and *url is set to
+ * NULL, so nothing is returned in url unless NSERROR_OK is returned.
+ */
+nserror nsurl_create(const char * const url_s, nsurl **url)
+{
+	/* Sections, in the order they are built.  The scheme goes first,
+	 * and the host before the path, since later sections look at what
+	 * has already been put in the URL object. */
+	static const enum url_sections sections[] = {
+		URL_SCHEME,
+		URL_CREDENTIALS,
+		URL_HOST,
+		URL_PATH,
+		URL_QUERY,
+		URL_FRAGMENT
+	};
+	struct url_markers m;
+	size_t length;
+	size_t i;
+	char *buff;
+	nserror e = NSERROR_OK;
+
+	assert(url_s != NULL);
+
+	/* Peg out the URL sections */
+	nsurl__get_string_markers(url_s, &m, false);
+
+	/* Get the length of the longest section */
+	length = nsurl__get_longest_section(&m);
+
+	/* The buffer size below must not wrap around */
+	if (length > ((size_t)-1 - 1) / 3) {
+		*url = NULL;
+		return NSERROR_NOMEM;
+	}
+
+	/* Create NetSurf URL object */
+	*url = calloc(1, sizeof(nsurl));
+	if (*url == NULL) {
+		return NSERROR_NOMEM;
+	}
+
+	/* Allocate enough memory to url escape the longest section */
+	buff = malloc(length * 3 + 1);
+	if (buff == NULL) {
+		nsurl_destroy(*url);
+		*url = NULL;
+		return NSERROR_NOMEM;
+	}
+
+	/* Build NetSurf URL object from sections, stopping at first error */
+	for (i = 0; i < sizeof(sections) / sizeof(sections[0]) &&
+			e == NSERROR_OK; i++) {
+		e = nsurl__create_from_section(url_s, sections[i], &m,
+				buff, *url);
+	}
+
+	/* Finished with buffer */
+	free(buff);
+
+	if (e != NSERROR_OK) {
+		/* Don't hand back a half built URL */
+		nsurl_destroy(*url);
+		*url = NULL;
+		return NSERROR_NOMEM;
+	}
+
+	return NSERROR_OK;
+}
+
+
+/* exported interface, documented in nsurl.h */
+nserror nsurl_destroy(nsurl *url)
+{
+	size_t i;
+
+	assert(url != NULL);
+
+#ifdef NSURL_DEBUG
+	nsurl__dump(url);
+#endif
+
+	/* The components, in the order their references are dropped */
+	lwc_string *components[] = {
+		url->scheme,
+		url->username,
+		url->password,
+		url->host,
+		url->port,
+		url->path,
+		url->query,
+		url->fragment
+	};
+
+	/* Drop the reference held on each component that exists */
+	for (i = 0; i < sizeof(components) / sizeof(components[0]); i++) {
+		if (components[i] != NULL) {
+			lwc_string_unref(components[i]);
+		}
+	}
+
+	/* Now the URL object itself can go */
+	free(url);
+
+	return NSERROR_OK;
+}
+
+
+/* exported interface, documented in nsurl.h */
+nserror nsurl_compare(const nsurl *url1, const nsurl *url2,
+		nsurl_component parts, bool *match)
+{
+	size_t i;
+
+	assert(url1 != NULL);
+	assert(url2 != NULL);
+
+	/* Component pairs, in the order they get compared.
+	 * Path, host and query first, since they're most likely to differ */
+	const struct {
+		nsurl_component part;
+		lwc_string *c1;
+		lwc_string *c2;
+	} pairs[] = {
+		{ NSURL_PATH,		url1->path,	url2->path },
+		{ NSURL_HOST,		url1->host,	url2->host },
+		{ NSURL_QUERY,		url1->query,	url2->query },
+		{ NSURL_SCHEME,		url1->scheme,	url2->scheme },
+		{ NSURL_USERNAME,	url1->username,	url2->username },
+		{ NSURL_PASSWORD,	url1->password,	url2->password },
+		{ NSURL_PORT,		url1->port,	url2->port },
+		{ NSURL_FRAGMENT,	url1->fragment,	url2->fragment }
+	};
+
+	/* Match, until a requested component is found to differ */
+	*match = true;
+
+	for (i = 0; i < sizeof(pairs) / sizeof(pairs[0]); i++) {
+		if (!(parts & pairs[i].part)) {
+			/* Not asked to compare this component */
+			continue;
+		}
+
+		nsurl__component_compare(pairs[i].c1, pairs[i].c2, match);
+
+		if (*match == false) {
+			/* No need to look at the rest */
+			return NSERROR_OK;
+		}
+	}
+
+	*match = true;
+	return NSERROR_OK;
+}
+
+
+/* exported interface, documented in nsurl.h
+ *
+ * Builds a '\0' terminated string from those components of url which are
+ * both present and asked for in parts.  The string is allocated with malloc
+ * and returned in *url_s, with its length (excluding the terminator) in
+ * *url_l.  Nothing in this function frees the string; presumably that is up
+ * to the caller -- confirm against the documentation in nsurl.h.
+ *
+ * Returns NSERROR_BAD_URL, without setting *url_s, if there would be nothing
+ * to output, and NSERROR_NOMEM if the allocation fails.
+ */
+nserror nsurl_get(const nsurl *url, nsurl_component parts,
+		char **url_s, size_t *url_l)
+{
+	/* Component lengths.  Each is only set, and only read, when the
+	 * matching flag below is set. */
+	size_t scheme;
+	size_t username;
+	size_t password;
+	size_t host;
+	size_t port;
+	size_t path;
+	size_t query;
+	size_t fragment;
+	char *pos;
+	enum {
+		NSURL_F_SCHEME			= (1 << 0),
+		NSURL_F_SCHEME_PUNCTUATION	= (1 << 1),
+		NSURL_F_AUTHORITY_PUNCTUATION	= (1 << 2),
+		NSURL_F_USERNAME		= (1 << 3),
+		NSURL_F_PASSWORD		= (1 << 4),
+		NSURL_F_CREDENTIALS_PUNCTUATION	= (1 << 5),
+		NSURL_F_HOST			= (1 << 6),
+		NSURL_F_PORT			= (1 << 7),
+		NSURL_F_AUTHORITY		= (NSURL_F_USERNAME |
+							NSURL_F_PASSWORD |
+							NSURL_F_HOST |
+							NSURL_F_PORT),
+		NSURL_F_PATH			= (1 << 8),
+		NSURL_F_QUERY			= (1 << 9),
+		NSURL_F_FRAGMENT		= (1 << 10)
+	} flags = 0;
+
+	/* Intersection of required parts and available parts gives
+	 * the output parts */
+	if (url->scheme && parts & NSURL_SCHEME)
+		flags |= NSURL_F_SCHEME;
+	if (url->username && parts & NSURL_USERNAME)
+		flags |= NSURL_F_USERNAME;
+	if (url->password && parts & NSURL_PASSWORD)
+		flags |= NSURL_F_PASSWORD;
+	if (url->host && parts & NSURL_HOST)
+		flags |= NSURL_F_HOST;
+	if (url->port && parts & NSURL_PORT)
+		flags |= NSURL_F_PORT;
+	if (url->path && parts & NSURL_PATH)
+		flags |= NSURL_F_PATH;
+	if (url->query && parts & NSURL_QUERY)
+		flags |= NSURL_F_QUERY;
+	if (url->fragment && parts & NSURL_FRAGMENT)
+		flags |= NSURL_F_FRAGMENT;
+
+	/* Turn on any spanned punctuation:
+	 *  ":"  when the scheme is output and more than the scheme was asked
+	 *       for,
+	 *  "//" when the scheme is output along with something else, and the
+	 *       URL's path begins with '/',
+	 *  "@"  when credentials and host are both output. */
+	if ((flags & NSURL_F_SCHEME) && (parts > NSURL_SCHEME))
+		flags |= NSURL_F_SCHEME_PUNCTUATION;
+	if ((flags & NSURL_F_SCHEME) && (flags > NSURL_F_SCHEME) &&
+			url->path && lwc_string_data(url->path)[0] == '/')
+		flags |= NSURL_F_AUTHORITY_PUNCTUATION;
+	if ((flags & (NSURL_F_USERNAME | NSURL_F_PASSWORD)) &&
+				flags & NSURL_F_HOST)
+		flags |= NSURL_F_CREDENTIALS_PUNCTUATION;
+
+	/* Get total output length */
+	*url_l = 0;
+
+	if (flags & NSURL_F_SCHEME) {
+		scheme = lwc_string_length(url->scheme);
+		*url_l += scheme;
+	}
+
+	if (flags & NSURL_F_SCHEME_PUNCTUATION)
+		*url_l += SLEN(":");
+
+	if (flags & NSURL_F_AUTHORITY_PUNCTUATION)
+		*url_l += SLEN("//");
+
+	if (flags & NSURL_F_USERNAME) {
+		username = lwc_string_length(url->username);
+		*url_l += username;
+	}
+
+	if (flags & NSURL_F_PASSWORD) {
+		password = lwc_string_length(url->password);
+		*url_l += SLEN(":") + password;
+	}
+
+	if (flags & NSURL_F_CREDENTIALS_PUNCTUATION)
+		*url_l += SLEN("@");
+
+	if (flags & NSURL_F_HOST) {
+		host = lwc_string_length(url->host);
+		*url_l += host;
+	}
+
+	if (flags & NSURL_F_PORT) {
+		port = lwc_string_length(url->port);
+		*url_l += SLEN(":") + port;
+	}
+
+	/* No separator is counted (or output) here for the path, query or
+	 * fragment; only the stored component text is used. */
+	if (flags & NSURL_F_PATH) {
+		path = lwc_string_length(url->path);
+		*url_l += path;
+	}
+
+	if (flags & NSURL_F_QUERY) {
+		query = lwc_string_length(url->query);
+		*url_l += query;
+	}
+
+	if (flags & NSURL_F_FRAGMENT) {
+		fragment = lwc_string_length(url->fragment);
+		*url_l += fragment;
+	}
+
+	/* Nothing to output; *url_s is not touched in this case */
+	if (*url_l == 0)
+		return NSERROR_BAD_URL;
+
+	/* Allocate memory for url string */
+	*url_s = malloc(*url_l + 1); /* adding 1 for '\0' */
+	if (*url_s == NULL) {
+		return NSERROR_NOMEM;
+	}
+
+	/* Copy the required parts into the url string.
+	 * This must write exactly what was counted above, in the same
+	 * order. */
+	pos = *url_s;
+
+	if (flags & NSURL_F_SCHEME) {
+		memcpy(pos, lwc_string_data(url->scheme), scheme);
+		pos += scheme;
+	}
+
+	if (flags & NSURL_F_SCHEME_PUNCTUATION) {
+		*(pos++) = ':';
+	}
+
+	if (flags & NSURL_F_AUTHORITY_PUNCTUATION) {
+		*(pos++) = '/';
+		*(pos++) = '/';
+	}
+
+	if (flags & NSURL_F_USERNAME) {
+		memcpy(pos, lwc_string_data(url->username), username);
+		pos += username;
+	}
+
+	if (flags & NSURL_F_PASSWORD) {
+		*(pos++) = ':';
+		memcpy(pos, lwc_string_data(url->password), password);
+		pos += password;
+	}
+
+	if (flags & NSURL_F_CREDENTIALS_PUNCTUATION) {
+		*(pos++) = '@';
+	}
+
+	if (flags & NSURL_F_HOST) {
+		memcpy(pos, lwc_string_data(url->host), host);
+		pos += host;
+	}
+
+	if (flags & NSURL_F_PORT) {
+		*(pos++) = ':';
+		memcpy(pos, lwc_string_data(url->port), port);
+		pos += port;
+	}
+
+	if (flags & NSURL_F_PATH) {
+		memcpy(pos, lwc_string_data(url->path), path);
+		pos += path;
+	}
+
+	if (flags & NSURL_F_QUERY) {
+		memcpy(pos, lwc_string_data(url->query), query);
+		pos += query;
+	}
+
+	if (flags & NSURL_F_FRAGMENT) {
+		memcpy(pos, lwc_string_data(url->fragment), fragment);
+		pos += fragment;
+	}
+
+	*pos = '\0';
+
+	return NSERROR_OK;
+}
+
+
+/* exported interface, documented in nsurl.h
+ *
+ * The joined URL takes each component from rel if rel has it, and otherwise
+ * from base, in the manner of rfc 3986, 5.2.2.  A relative path in rel is
+ * merged with base's path, and dot segments are removed from the result.
+ *
+ * On any failure the partly built object is destroyed and *joined is set to
+ * NULL.
+ */
+nserror nsurl_join(const nsurl *base, const char *rel, nsurl **joined)
+{
+	struct url_markers m;
+	size_t length;
+	size_t base_path_len;	/* 0 if base has no path */
+	size_t rel_path_len;
+	char *buff;
+	char *buff_pos;
+	char *buff_start;
+	nserror error = NSERROR_OK;
+	enum {
+		NSURL_F_REL		=  0,
+		NSURL_F_BASE_SCHEME	= (1 << 0),
+		NSURL_F_BASE_AUTHORITY	= (1 << 1),
+		NSURL_F_BASE_PATH	= (1 << 2),
+		NSURL_F_MERGED_PATH	= (1 << 3),
+		NSURL_F_BASE_QUERY	= (1 << 4)
+	} joined_parts;
+
+	assert(base != NULL);
+	assert(rel != NULL);
+
+	/* Peg out the URL sections */
+	nsurl__get_string_markers(rel, &m, true);
+
+	/* Get the length of the longest section */
+	length = nsurl__get_longest_section(&m);
+
+	/* The base URL need not have a path */
+	base_path_len = (base->path != NULL) ?
+			lwc_string_length(base->path) : 0;
+	rel_path_len = m.query - m.path;
+
+	/* Initially assume that the joined URL can be formed entirely from
+	 * the relative URL. */
+	joined_parts = NSURL_F_REL;
+
+	/* Update joined_parts to indicate any required parts from the
+	 * base URL. */
+	if (m.scheme_end - m.start == 0) {
+		/* The relative url has no scheme.
+		 * Use base URL's scheme. */
+		joined_parts |= NSURL_F_BASE_SCHEME;
+
+		if (m.path - m.authority == 0) {
+			/* The relative URL has no authority.
+			 * Use base URL's authority. */
+			joined_parts |= NSURL_F_BASE_AUTHORITY;
+
+			if (rel_path_len == 0) {
+				/* The relative URL has no path.
+				 * Use base URL's path. */
+				joined_parts |= NSURL_F_BASE_PATH;
+
+				if (m.fragment - m.query == 0) {
+					/* The relative URL has no query.
+					 * Use base URL's query. */
+					joined_parts |= NSURL_F_BASE_QUERY;
+				}
+
+			} else if (*(rel + m.path) != '/') {
+				/* Relative URL has relative path */
+				joined_parts |= NSURL_F_MERGED_PATH;
+			}
+		}
+	}
+
+	/* Create NetSurf URL object */
+	*joined = calloc(1, sizeof(nsurl));
+	if (*joined == NULL) {
+		return NSERROR_NOMEM;
+	}
+
+	/* Allocate enough memory to url escape the longest section, plus
+	 * space for path merging (if required). */
+	if (joined_parts & NSURL_F_MERGED_PATH) {
+		/* Need to merge paths */
+		length += base_path_len;
+	}
+	length *= 4;
+	/* Plus space for removing dots from path */
+	length += rel_path_len + base_path_len;
+	buff = malloc(length + 5);
+	if (buff == NULL) {
+		nsurl_destroy(*joined);
+		*joined = NULL;
+		return NSERROR_NOMEM;
+	}
+
+	buff_start = buff_pos = buff;
+
+	/* Form joined URL from base or rel components, as appropriate */
+
+	if (joined_parts & NSURL_F_BASE_SCHEME)
+		(*joined)->scheme = nsurl__component_copy(base->scheme);
+	else
+		error |= nsurl__create_from_section(rel, URL_SCHEME, &m,
+				buff, *joined);
+
+	if (joined_parts & NSURL_F_BASE_AUTHORITY) {
+		(*joined)->username = nsurl__component_copy(base->username);
+		(*joined)->password = nsurl__component_copy(base->password);
+		(*joined)->host = nsurl__component_copy(base->host);
+		(*joined)->port = nsurl__component_copy(base->port);
+	} else {
+		error |= nsurl__create_from_section(rel, URL_CREDENTIALS, &m,
+				buff, *joined);
+		error |= nsurl__create_from_section(rel, URL_HOST, &m,
+				buff, *joined);
+	}
+
+	if (joined_parts & NSURL_F_BASE_PATH) {
+		(*joined)->path = nsurl__component_copy(base->path);
+
+	} else {
+		/* The path comes (at least partly) from rel.
+		 *
+		 * The buffer is used in three consecutive pieces:
+		 *   1. the raw path, '\0' terminated
+		 *   2. that path with dot segments removed
+		 *   3. space for the normalised path
+		 */
+		struct url_markers m_path;
+		size_t new_length;
+
+		if (joined_parts & NSURL_F_MERGED_PATH) {
+			if (base->path == NULL) {
+				/* No base path to merge with.  If the base
+				 * has a host, append relative path to "/". */
+				if (base->host != NULL)
+					*(buff_pos++) = '/';
+
+			} else {
+				/* Append relative path to all but last
+				 * segment of base path. */
+				size_t path_end = base_path_len;
+				const char *path =
+						lwc_string_data(base->path);
+
+				while (*(path + path_end) != '/' &&
+						path_end != 0) {
+					path_end--;
+				}
+				if (*(path + path_end) == '/')
+					path_end++;
+
+				/* Copy the base part */
+				memcpy(buff_pos, path, path_end);
+				buff_pos += path_end;
+			}
+		}
+
+		/* Copy the relative part */
+		memcpy(buff_pos, rel + m.path, rel_path_len);
+		buff_pos += rel_path_len;
+
+		/* add termination to string */
+		*(buff_pos++) = '\0';
+
+		new_length = nsurl__remove_dot_segments(buff, buff_pos);
+
+		/* Only the path and query markers are looked at when the
+		 * URL_PATH section is created */
+		m_path.path = 0;
+		m_path.query = new_length;
+
+		buff_start = buff_pos + new_length;
+		error |= nsurl__create_from_section(buff_pos, URL_PATH, &m_path,
+				buff_start, *joined);
+	}
+
+	if (joined_parts & NSURL_F_BASE_QUERY)
+		(*joined)->query = nsurl__component_copy(base->query);
+	else
+		error |= nsurl__create_from_section(rel, URL_QUERY, &m,
+				buff, *joined);
+
+	error |= nsurl__create_from_section(rel, URL_FRAGMENT, &m,
+			buff, *joined);
+
+	/* Free temporary buffer */
+	free(buff);
+
+	if (error != NSERROR_OK) {
+		/* Don't hand back a half built URL */
+		nsurl_destroy(*joined);
+		*joined = NULL;
+		return NSERROR_NOMEM;
+	}
+
+	return NSERROR_OK;
+}
+
diff --git a/utils/nsurl.h b/utils/nsurl.h
new file mode 100644
index 000000000..7b77c7f1e
--- /dev/null
+++ b/utils/nsurl.h
@@ -0,0 +1,130 @@
+/*
+ * Copyright 2011 Michael Drake <tlsa@netsurf-browser.org>
+ *
+ * This file is part of NetSurf, http://www.netsurf-browser.org/
+ *
+ * NetSurf is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License as published by
+ * the Free Software Foundation; version 2 of the License.
+ *
+ * NetSurf is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program.  If not, see <http://www.gnu.org/licenses/>.
+ */
+
+/** \file
+ * NetSurf URL handling (interface).
+ */
+
+#ifndef _NETSURF_UTILS_NSURL_H_
+#define _NETSURF_UTILS_NSURL_H_
+#include <stdbool.h>
+#include <stddef.h>
+#include "utils/errors.h"
+
+/** NetSurf URL object (opaque: struct nsurl is not defined in this header) */
+typedef struct nsurl nsurl;
+
+/** URL component flags; each basic flag is one bit so flags can be ORed */
+typedef enum nsurl_component {
+	NSURL_SCHEME		= (1 << 0),
+	NSURL_USERNAME		= (1 << 1),
+	NSURL_PASSWORD		= (1 << 2),
+	NSURL_CREDENTIALS	= NSURL_USERNAME | NSURL_PASSWORD,
+	NSURL_HOST		= (1 << 3),
+	NSURL_PORT		= (1 << 4),
+	NSURL_AUTHORITY		= NSURL_CREDENTIALS | NSURL_HOST | NSURL_PORT,
+	NSURL_PATH		= (1 << 5),
+	NSURL_QUERY		= (1 << 6),
+	NSURL_COMPLETE		= NSURL_SCHEME | NSURL_AUTHORITY |
+				  NSURL_PATH | NSURL_QUERY, /**< No fragment */
+	NSURL_FRAGMENT		= (1 << 7),
+	NSURL_WITH_FRAGMENT	= NSURL_COMPLETE | NSURL_FRAGMENT
+} nsurl_component;
+
+
+/**
+ * Create a NetSurf URL object from a URL string
+ *
+ * \param url_s	  String to create NetSurf URL from (not modified)
+ * \param url	  Returns a NetSurf URL
+ * \return NSERROR_OK on success, appropriate error otherwise
+ *
+ * If return value != NSERROR_OK, nothing will be returned in url.
+ *
+ * It is up to the client to call nsurl_destroy when they are finished with
+ * the created object.
+ */
+nserror nsurl_create(const char *url_s, nsurl **url);
+
+
+/**
+ * Destroy a NetSurf URL object
+ *
+ * \param url	  NetSurf URL to destroy (from nsurl_create or nsurl_join)
+ * \return NSERROR_OK on success, appropriate error otherwise
+ */
+nserror nsurl_destroy(nsurl *url);
+
+
+/**
+ * Compare two URLs
+ *
+ * \param url1	  First NetSurf URL
+ * \param url2	  Second NetSurf URL
+ * \param parts	  The URL components to be compared (nsurl_component flags)
+ * \param match	  Returns true if url1 and url2 matched, else false
+ * \return NSERROR_OK on success, appropriate error otherwise
+ *
+ * If return value != NSERROR_OK, match will be false.
+ */
+nserror nsurl_compare(const nsurl *url1, const nsurl *url2,
+		nsurl_component parts, bool *match);
+
+
+/**
+ * Get URL (section) as a string, from a NetSurf URL object
+ *
+ * \param url	  NetSurf URL
+ * \param parts	  The required URL components (nsurl_component flags).
+ * \param url_s	  Returns a url string
+ * \param url_l	  Returns length of url_s
+ * \return NSERROR_OK on success, appropriate error otherwise
+ *
+ * If return value != NSERROR_OK, nothing will be returned in url_s or url_l.
+ *
+ * The string returned in url_s is owned by the client and it is up to them
+ * to free it.  It includes a trailing '\0'.
+ *
+ * The length returned in url_l excludes the trailing '\0'.
+ *
+ * It is not enforced that the required URL components be consecutive;
+ * however, non-consecutive URL components generally make no sense.  The
+ * exception is removal of credentials from a URL, such as for display in
+ * the browser window URL bar.  'NSURL_COMPLETE & ~NSURL_PASSWORD' would
+ * remove the password from a complete URL.
+ */
+nserror nsurl_get(const nsurl *url, nsurl_component parts,
+		char **url_s, size_t *url_l);
+
+
+/**
+ * Join a base url to a relative link part, creating a new NetSurf URL object
+ *
+ * \param base	  NetSurf URL containing the base to join rel to
+ * \param rel	  String containing the relative link part
+ * \param joined  Returns joined NetSurf URL
+ * \return NSERROR_OK on success, appropriate error otherwise
+ *
+ * If return value != NSERROR_OK, nothing will be returned in joined.
+ *
+ * It is up to the client to call nsurl_destroy when they are finished with
+ * the created object.
+ */
+nserror nsurl_join(const nsurl *base, const char *rel, nsurl **joined);
+
+#endif
-- 
cgit v1.2.3