summary | refs | log | tree | commit | diff
path: root/src/lex
diff options
context:
space:
mode:
author John Mark Bell <jmb@netsurf-browser.org> 2009-05-26 01:05:46 +0000
committer John Mark Bell <jmb@netsurf-browser.org> 2009-05-26 01:05:46 +0000
commit 1d0f11e0afd72c458d035e737edf00fe8639e5b1 (patch)
tree c33399858abb64f9281def986424824ce89cebfc /src/lex
parent eb6a36696e4eedd330ba966ae56329d29bc7ebf0 (diff)
download libcss-1d0f11e0afd72c458d035e737edf00fe8639e5b1.tar.gz
libcss-1d0f11e0afd72c458d035e737edf00fe8639e5b1.tar.bz2
Fix tests for unicode escape sequences to actually have the correct expected values.
Fix expected value for a CHAR containing solely a \. Add new tests for invalid unicode escape sequences (out-of-range, lonely surrogates, etc). Add test to ensure that \000D gets converted to \000A. Fix unicode escape sequence handling to pass the above tests. Also ensure it correctly handles the whitespace character after the escape sequence.

svn path=/trunk/libcss/; revision=7549
Diffstat (limited to 'src/lex')
-rw-r--r-- src/lex/lex.c 63
1 files changed, 57 insertions, 6 deletions
diff --git a/src/lex/lex.c b/src/lex/lex.c
index 70fcdd1..a3e7545 100644
--- a/src/lex/lex.c
+++ b/src/lex/lex.c
@@ -117,7 +117,7 @@ struct css_lexer
do { \
css_error error; \
error = appendToTokenData((lexer), \
- (const uint8_t*) (data), (len)); \
+ (const uint8_t *) (data), (len)); \
if (error != CSS_OK) \
return error; \
(lexer)->bytesReadForToken += (len); \
@@ -1981,11 +1981,55 @@ css_error consumeUnicode(css_lexer *lexer, uint32_t ucs)
}
}
+ /* Sanitise UCS4 character */
+ if (ucs > 0x10FFFF || ucs <= 0x0008 || ucs == 0x000B ||
+ (0x000E <= ucs && ucs <= 0x001F) ||
+ (0x007F <= ucs && ucs <= 0x009F) ||
+ (0xD800 <= ucs && ucs <= 0xDFFF) ||
+ (0xFDD0 <= ucs && ucs <= 0xFDEF) ||
+ (ucs & 0xFFFE) == 0xFFFE) {
+ ucs = 0xFFFD;
+ } else if (ucs == 0x000D) {
+ ucs = 0x000A;
+ }
+
/* Convert our UCS4 character to UTF-8 */
perror = parserutils_charset_utf8_from_ucs4(ucs, &utf8data, &utf8len);
assert(perror == PARSERUTILS_OK);
- /* Append it to the token data (unescaped buffer already set up) */
+ /* Attempt to read a trailing whitespace character */
+ perror = parserutils_inputstream_peek(lexer->input,
+ lexer->bytesReadForToken, &cptr, &clen);
+ if (perror != PARSERUTILS_OK && perror != PARSERUTILS_EOF) {
+ /* Rewind what we've read */
+ lexer->bytesReadForToken = bytesReadInit;
+ return css_error_from_parserutils_error(perror);
+ }
+
+ if (perror == PARSERUTILS_OK && *cptr == '\r') {
+ /* Potential CRLF */
+ const uint8_t *pCR = cptr;
+
+ perror = parserutils_inputstream_peek(lexer->input,
+ lexer->bytesReadForToken + 1, &cptr, &clen);
+ if (perror != PARSERUTILS_OK && perror != PARSERUTILS_EOF) {
+ /* Rewind what we've read */
+ lexer->bytesReadForToken = bytesReadInit;
+ return css_error_from_parserutils_error(perror);
+ }
+
+ if (perror == PARSERUTILS_OK && *cptr == '\n') {
+ /* CRLF -- account for CR */
+ lexer->bytesReadForToken += 1;
+ } else {
+ /* Stray CR -- restore for later */
+ cptr = pCR;
+ clen = 1;
+ perror = PARSERUTILS_OK;
+ }
+ }
+
+ /* Append char. to the token data (unescaped buffer already set up) */
/* We can't use the APPEND() macro here as we want to rewind correctly
* on error. Additionally, lexer->bytesReadForToken has already been
* advanced */
@@ -1997,16 +2041,23 @@ css_error consumeUnicode(css_lexer *lexer, uint32_t ucs)
return error;
}
- /* Finally, attempt to skip a whitespace character */
+ /* Deal with the whitespace character */
if (perror == PARSERUTILS_EOF)
return CSS_OK;
- if (isSpace(c)) {
+ if (isSpace(*cptr)) {
lexer->bytesReadForToken += clen;
}
- /* +2 for '\' and first digit */
- lexer->currentCol += lexer->bytesReadForToken - bytesReadInit + 2;
+ /* Fixup cursor position */
+ if (*cptr == '\r' || *cptr == '\n' || *cptr == '\f') {
+ lexer->currentCol = 1;
+ lexer->currentLine++;
+ } else {
+ /* +2 for '\' and first digit */
+ lexer->currentCol += lexer->bytesReadForToken -
+ bytesReadInit + 2;
+ }
return CSS_OK;
}