summary | refs | log | tree | commit | diff
path: root/src/lex
diff options
context:
space:
mode:
Diffstat (limited to 'src/lex')
-rw-r--r-- src/lex/lex.c 63
1 file changed, 57 insertions, 6 deletions
diff --git a/src/lex/lex.c b/src/lex/lex.c
index 70fcdd1..a3e7545 100644
--- a/src/lex/lex.c
+++ b/src/lex/lex.c
@@ -117,7 +117,7 @@ struct css_lexer
do { \
css_error error; \
error = appendToTokenData((lexer), \
- (const uint8_t*) (data), (len)); \
+ (const uint8_t *) (data), (len)); \
if (error != CSS_OK) \
return error; \
(lexer)->bytesReadForToken += (len); \
@@ -1981,11 +1981,55 @@ css_error consumeUnicode(css_lexer *lexer, uint32_t ucs)
}
}
+ /* Sanitise UCS4 character */
+ if (ucs > 0x10FFFF || ucs <= 0x0008 || ucs == 0x000B ||
+ (0x000E <= ucs && ucs <= 0x001F) ||
+ (0x007F <= ucs && ucs <= 0x009F) ||
+ (0xD800 <= ucs && ucs <= 0xDFFF) ||
+ (0xFDD0 <= ucs && ucs <= 0xFDEF) ||
+ (ucs & 0xFFFE) == 0xFFFE) {
+ ucs = 0xFFFD;
+ } else if (ucs == 0x000D) {
+ ucs = 0x000A;
+ }
+
/* Convert our UCS4 character to UTF-8 */
perror = parserutils_charset_utf8_from_ucs4(ucs, &utf8data, &utf8len);
assert(perror == PARSERUTILS_OK);
- /* Append it to the token data (unescaped buffer already set up) */
+ /* Attempt to read a trailing whitespace character */
+ perror = parserutils_inputstream_peek(lexer->input,
+ lexer->bytesReadForToken, &cptr, &clen);
+ if (perror != PARSERUTILS_OK && perror != PARSERUTILS_EOF) {
+ /* Rewind what we've read */
+ lexer->bytesReadForToken = bytesReadInit;
+ return css_error_from_parserutils_error(perror);
+ }
+
+ if (perror == PARSERUTILS_OK && *cptr == '\r') {
+ /* Potential CRLF */
+ const uint8_t *pCR = cptr;
+
+ perror = parserutils_inputstream_peek(lexer->input,
+ lexer->bytesReadForToken + 1, &cptr, &clen);
+ if (perror != PARSERUTILS_OK && perror != PARSERUTILS_EOF) {
+ /* Rewind what we've read */
+ lexer->bytesReadForToken = bytesReadInit;
+ return css_error_from_parserutils_error(perror);
+ }
+
+ if (perror == PARSERUTILS_OK && *cptr == '\n') {
+ /* CRLF -- account for CR */
+ lexer->bytesReadForToken += 1;
+ } else {
+ /* Stray CR -- restore for later */
+ cptr = pCR;
+ clen = 1;
+ perror = PARSERUTILS_OK;
+ }
+ }
+
+ /* Append char. to the token data (unescaped buffer already set up) */
/* We can't use the APPEND() macro here as we want to rewind correctly
* on error. Additionally, lexer->bytesReadForToken has already been
* advanced */
@@ -1997,16 +2041,23 @@ css_error consumeUnicode(css_lexer *lexer, uint32_t ucs)
return error;
}
- /* Finally, attempt to skip a whitespace character */
+ /* Deal with the whitespace character */
if (perror == PARSERUTILS_EOF)
return CSS_OK;
- if (isSpace(c)) {
+ if (isSpace(*cptr)) {
lexer->bytesReadForToken += clen;
}
- /* +2 for '\' and first digit */
- lexer->currentCol += lexer->bytesReadForToken - bytesReadInit + 2;
+ /* Fixup cursor position */
+ if (*cptr == '\r' || *cptr == '\n' || *cptr == '\f') {
+ lexer->currentCol = 1;
+ lexer->currentLine++;
+ } else {
+ /* +2 for '\' and first digit */
+ lexer->currentCol += lexer->bytesReadForToken -
+ bytesReadInit + 2;
+ }
return CSS_OK;
}