From 3f6d2a9f0c89f42fccff3b9bd8c79ed96ef0e5b8 Mon Sep 17 00:00:00 2001
From: John Tytgat <joty@netsurf-browser.org>
Date: Sun, 25 May 2008 15:04:22 +0000
Subject: Contribution from Philip Boulain <prb@ecs.soton.ac.uk>:

This makes url_normalize take care of whitespace in a fairly useful way,
consistent with other browsers:

 - Leading and trailing whitespace is trimmed
 - Internal whitespace is urlescaped

For example,
 "  http://www.google.co.uk/search?q=hello world  "
becomes
 "http://www.google.co.uk/search?q=hello%20world"

Explicit trailing whitespace, e.g. "...hello world%20", is left alone.

The upshot is that if you sloppily copy-paste a URL from IRC or whatnot
into the address bar, NetSurf no longer silently ignores you if you
caught some adjacent whitespace.

svn path=/trunk/netsurf/; revision=4198
---
 utils/url.c   | 117 ++++++++++++++++++++++++++++++++++++----------------------
 utils/utils.h |  12 ++++++
 2 files changed, 85 insertions(+), 44 deletions(-)

diff --git a/utils/url.c b/utils/url.c
index b272a903a..a9b46ad0c 100644
--- a/utils/url.c
+++ b/utils/url.c
@@ -127,41 +127,69 @@ url_func_result url_normalize(const char *url, char **result)
 {
 	char c;
 	int m;
-	int i;
+	size_t i;
 	size_t len;
+	size_t bufsize;
+	char* norm;
 	bool http = false;
 	regmatch_t match[10];
 
 	*result = NULL;
 
-	if ((m = regexec(&url_re, url, 10, match, 0))) {
+	/* skip past any leading whitespace (likely if URL was copy-pasted) */
+	while (isspace(*url))
+		url++;
+
+	/* allocate sufficiently large buffer for new URL */
+	len = strlen(url);
+	bufsize = len + sizeof("http://")-1 + sizeof("/")-1 + 1; /* 'http://' + '/' + '\0' */
+	/* work out how much extra to leave for internal whitespace */
+	for(i = 0; i < len; i++) {
+		if(isspace(url[i])) bufsize += 2; /* ' ' -> '%20' */
+	}
+	if ((norm = malloc(bufsize)) == NULL) {
+		LOG(("malloc failed"));
+		return URL_FUNC_NOMEM;
+	}
+	*result = norm;
+	strcpy(norm, url);
+
+	/* truncate trailing whitespace (significant should be uriencoded) */
+	for (i = len - 1; (i > 0) && isspace(norm[i]); i--) {
+		norm[i] = '\0';
+		len--;
+	}
+
+	/* encode any remaining (internal) whitespace */
+	for (i = 0; i < len; i++) {
+		if(isspace(norm[i])) {
+			char space = norm[i];
+			memmove(norm + i + 2, norm + i, 1 + len - i);
+			len += 2;
+			norm[  i] = '%';
+			norm[++i] = digit2lowcase_hex(space >> 4);
+			norm[++i] = digit2lowcase_hex(space & 0xf);
+		}
+	}
+
+	/* finally verify that it's actually an URL we're working on
+	 * (RFC regex too fussy to tolerate above WSP problems) */
+	if ((m = regexec(&url_re, norm, 10, match, 0))) {
 		LOG(("url '%s' failed to match regex", url));
 		return URL_FUNC_FAILED;
 	}
 
-	len = strlen(url);
-
 	if (match[URL_RE_SCHEME].rm_so == -1) {
 		/* scheme missing: add http:// and reparse */
 /*		LOG(("scheme missing: using http"));*/
-		if ((*result = malloc(len + 13)) == NULL) {
-			LOG(("malloc failed"));
-			return URL_FUNC_NOMEM;
-		}
-		strcpy(*result, "http://");
-		strcpy(*result + sizeof("http://")-1, url);
-		if ((m = regexec(&url_re, *result, 10, match, 0))) {
-			LOG(("url '%s' failed to match regex", (*result)));
-			free(*result);
+		memmove(norm + sizeof("http://")-1, norm, len + 1);
+		memcpy(norm, "http://", sizeof("http://")-1); /* do NOT copy null */
+		len += 7;
+		if ((m = regexec(&url_re, norm, 10, match, 0))) {
+			LOG(("url '%s' failed to match regex", norm));
+			free(norm);
 			return URL_FUNC_FAILED;
 		}
-		len += sizeof("http://")-1;
-	} else {
-		if ((*result = malloc(len + 6)) == NULL) {
-			LOG(("malloc failed"));
-			return URL_FUNC_NOMEM;
-		}
-		strcpy(*result, url);
 	}
 
 	/*for (unsigned int i = 0; i != 10; i++) {
@@ -177,22 +205,22 @@ url_func_result url_normalize(const char *url, char **result)
 	if (match[URL_RE_SCHEME].rm_so != -1) {
 		for (i = match[URL_RE_SCHEME].rm_so;
 				i != match[URL_RE_SCHEME].rm_eo; i++)
-			(*result)[i] = tolower((*result)[i]);
+			norm[i] = tolower(norm[i]);
 		if (match[URL_RE_SCHEME].rm_eo == 4
-				&& (*result)[0] == 'h'
-				&& (*result)[1] == 't'
-				&& (*result)[2] == 't'
-				&& (*result)[3] == 'p')
+				&& norm[0] == 'h'
+				&& norm[1] == 't'
+				&& norm[2] == 't'
+				&& norm[3] == 'p')
 			http = true;
 	}
 
 	/* make empty path into "/" */
 	if (match[URL_RE_PATH].rm_so != -1 &&
 			match[URL_RE_PATH].rm_so == match[URL_RE_PATH].rm_eo) {
-		memmove((*result) + match[URL_RE_PATH].rm_so + 1,
-				(*result) + match[URL_RE_PATH].rm_so,
+		memmove(norm + match[URL_RE_PATH].rm_so + 1,
+				norm + match[URL_RE_PATH].rm_so,
 				len - match[URL_RE_PATH].rm_so + 1);
-		(*result)[match[URL_RE_PATH].rm_so] = '/';
+		norm[match[URL_RE_PATH].rm_so] = '/';
 		len++;
 	}
 
@@ -200,45 +228,45 @@ url_func_result url_normalize(const char *url, char **result)
 	if (match[URL_RE_AUTHORITY].rm_so != -1) {
 		for (i = match[URL_RE_AUTHORITY].rm_so;
 				i != match[URL_RE_AUTHORITY].rm_eo; i++) {
-			if ((*result)[i] == ':') {
-				if (http && (*result)[i + 1] == '8' &&
-						(*result)[i + 2] == '0' &&
+			if (norm[i] == ':' && (i + 3) < len) {
+				if (http && norm[i + 1] == '8' &&
+						norm[i + 2] == '0' &&
 						i + 3 ==
 						match[URL_RE_AUTHORITY].rm_eo) {
-					memmove((*result) + i,
-							(*result) + i + 3,
+					memmove(norm + i,
+							norm + i + 3,
 							len -
 							match[URL_RE_AUTHORITY].
 							rm_eo);
 					len -= 3;
-					(*result)[len] = '\0';
+					norm[len] = '\0';
 				} else if (i + 1 == match[4].rm_eo) {
-					memmove((*result) + i,
-							(*result) + i + 1,
+					memmove(norm + i,
+							norm + i + 1,
 							len -
 							match[URL_RE_AUTHORITY].
 							rm_eo);
 					len--;
-					(*result)[len] = '\0';
+					norm[len] = '\0';
 				}
 				break;
 			}
-			(*result)[i] = tolower((*result)[i]);
+			norm[i] = tolower(norm[i]);
 		}
 	}
 
 	/* unescape non-"reserved" escaped characters */
-	for (i = 0; (unsigned)i != len; i++) {
-		if ((*result)[i] != '%')
+	for (i = 0; i + 2 < len; i++) {
+		if (norm[i] != '%')
 			continue;
-		c = tolower((*result)[i + 1]);
+		c = tolower(norm[i + 1]);
 		if ('0' <= c && c <= '9')
 			m = 16 * (c - '0');
 		else if ('a' <= c && c <= 'f')
 			m = 16 * (c - 'a' + 10);
 		else
 			continue;
-		c = tolower((*result)[i + 2]);
+		c = tolower(norm[i + 2]);
 		if ('0' <= c && c <= '9')
 			m += c - '0';
 		else if ('a' <= c && c <= 'f')
@@ -252,11 +280,12 @@ url_func_result url_normalize(const char *url, char **result)
 			continue;
 		}
 
-		(*result)[i] = m;
-		memmove((*result) + i + 1, (*result) + i + 3, len - i - 2);
+		norm[i] = m;
+		memmove(norm + i + 1, norm + i + 3, len - i - 2);
 		len -= 2;
 	}
 
+	/* norm and *result point to same memory, so just return ok */
 	return URL_FUNC_OK;
 }
 
diff --git a/utils/utils.h b/utils/utils.h
index 3a0417008..8f3516e1d 100644
--- a/utils/utils.h
+++ b/utils/utils.h
@@ -25,6 +25,7 @@
 #include <stdlib.h>
 #include <sys/types.h>
 #include <regex.h>
+#include <assert.h>
 
 #ifndef NOF_ELEMENTS
 #define NOF_ELEMENTS(array) (sizeof(array)/sizeof(*(array)))
@@ -71,6 +72,17 @@ char *strcasestr(const char *haystack, const char *needle);
 #endif
 unsigned int wallclock(void);
 
+/**
+ * Return a hex digit for the given numerical value.
+ *
+ * \return character in range 0-9a-f
+ */
+inline static char digit2lowcase_hex(unsigned char digit) {
+	assert(digit < 16);
+	return "0123456789abcdef"[digit];
+}
+
+
 /* Platform specific functions */
 void die(const char * const error);
 void warn_user(const char *warning, const char *detail);
-- 
cgit v1.2.3