Added the RAWTEXT content model. Also removed an `if (c == '-')` condition because I felt it was extraneous, with no clear logic, and not according to the specs. Also fixed a severe bug in the handling of the tag name state. In all, 3 more test files now give a PASS.

author: Rupinder Singh Khokhar <rsk1coder99@gmail.com> 2014-06-17 00:54:12 +0530
committer: Rupinder Singh Khokhar <rsk1coder99@gmail.com> 2014-07-09 10:04:21 +0530
commit: 8c55e32256f4081d097cd7114fcf5e307a8a9288 (patch)
tree: 4377a7d60ab894e15b3da93d1191a5090603f09b
parent: e68a4b8ac410f402d12308ce7d63083b78d7ee89 (diff)
download: libhubbub-8c55e32256f4081d097cd7114fcf5e307a8a9288.tar.gz
libhubbub-8c55e32256f4081d097cd7114fcf5e307a8a9288.tar.bz2
2 files changed, 96 insertions, 98 deletions
diff --git a/src/tokeniser/tokeniser.c b/src/tokeniser/tokeniser.c
index 64eaf30..7a54df9 100644
--- a/src/tokeniser/tokeniser.c
+++ b/src/tokeniser/tokeniser.c
@@ -693,33 +693,13 @@ hubbub_error hubbub_tokeniser_handle_data(hubbub_tokeniser *tokeniser)
 			/* Don't eat the '&'; it'll be handled by entity
 			 * consumption */
 			break;
-		} else if (c == '-' &&
-				tokeniser->escape_flag == false &&
-				(tokeniser->content_model ==
-						HUBBUB_CONTENT_MODEL_RCDATA ||
-				tokeniser->content_model ==
-						HUBBUB_CONTENT_MODEL_CDATA) &&
-				tokeniser->context.pending >= 3) {
-			size_t ignore;
-			error = parserutils_inputstream_peek(
-					tokeniser->input,
-					tokeniser->context.pending - 3,
-					&cptr,
-					&ignore);
-
-			assert(error == PARSERUTILS_OK);
-
-			if (strncmp((char *)cptr,
-					"<!--", SLEN("<!--")) == 0) {
-				tokeniser->escape_flag = true;
-			}
-
-			tokeniser->context.pending += len;
 		} else if (c == '<' && (tokeniser->content_model ==
 						HUBBUB_CONTENT_MODEL_PCDATA ||
 					((tokeniser->content_model ==
 						HUBBUB_CONTENT_MODEL_RCDATA ||
 					tokeniser->content_model ==
+						HUBBUB_CONTENT_MODEL_RAWTEXT ||
+					tokeniser->content_model ==
 						HUBBUB_CONTENT_MODEL_CDATA) &&
 				tokeniser->escape_flag == false))) {
 			if (tokeniser->context.pending > 0) {
@@ -910,6 +890,7 @@ hubbub_error hubbub_tokeniser_handle_tag_open(hubbub_tokeniser *tokeniser)
 
 		tokeniser->state = STATE_CLOSE_TAG_OPEN;
 	} else if (tokeniser->content_model == HUBBUB_CONTENT_MODEL_RCDATA ||
+			tokeniser->content_model == HUBBUB_CONTENT_MODEL_RAWTEXT ||
 			tokeniser->content_model ==
 					HUBBUB_CONTENT_MODEL_CDATA) {
 		/* Return to data state with '<' still in "chars" */
@@ -982,6 +963,7 @@ hubbub_error hubbub_tokeniser_handle_close_tag_open(hubbub_tokeniser *tokeniser)
 	/**\todo fragment case */
 
 	if (tokeniser->content_model == HUBBUB_CONTENT_MODEL_RCDATA ||
+			tokeniser->content_model == HUBBUB_CONTENT_MODEL_RAWTEXT ||
 			tokeniser->content_model ==
 					HUBBUB_CONTENT_MODEL_CDATA) {
 		uint8_t *start_tag_name =
@@ -1037,73 +1019,67 @@ hubbub_error hubbub_tokeniser_handle_close_tag_open(hubbub_tokeniser *tokeniser)
 		}
 	}
 
-	if (ctx->close_tag_match.match == false &&
-			tokeniser->content_model !=
-					HUBBUB_CONTENT_MODEL_PCDATA) {
-		/* We should emit "</" here, but instead we leave it in the
-		 * buffer so the data state emits it with any characters
-		 * following it */
-		tokeniser->state = STATE_DATA;
-	} else {
-		error = parserutils_inputstream_peek(tokeniser->input,
-				tokeniser->context.pending, &cptr, &len);
 
-		if (error == PARSERUTILS_EOF) {
-			/** \todo parse error */
+	error = parserutils_inputstream_peek(tokeniser->input,
+			tokeniser->context.pending, &cptr, &len);
 
-			/* Return to data state with "</" pending */
-			tokeniser->state = STATE_DATA;
-			return HUBBUB_OK;
-		} else if (error != PARSERUTILS_OK) {
-			return hubbub_error_from_parserutils_error(error);
-		}
+	if (error == PARSERUTILS_EOF) {
+		/** \todo parse error */
+		/* Return to data state with "</" pending */
+		tokeniser->state = STATE_DATA;
+		return HUBBUB_OK;
+	} else if (error != PARSERUTILS_OK) {
+		return hubbub_error_from_parserutils_error(error);
+	}
 
-		c = *cptr;
+	c = *cptr;
 
-		if ('A' <= c && c <= 'Z') {
-			uint8_t lc = (c + 0x20);
-			START_BUF(tokeniser->context.current_tag.name,
-					&lc, len);
-			tokeniser->context.current_tag.n_attributes = 0;
+	if ('A' <= c && c <= 'Z') {
+		uint8_t lc = (c + 0x20);
+		START_BUF(tokeniser->context.current_tag.name,
+				&lc, len);
+		tokeniser->context.current_tag.n_attributes = 0;
 
-			tokeniser->context.current_tag_type =
-					HUBBUB_TOKEN_END_TAG;
+		tokeniser->context.current_tag_type =
+				HUBBUB_TOKEN_END_TAG;
 
-			tokeniser->context.pending += len;
+		tokeniser->context.pending += len;
 
-			tokeniser->state = STATE_TAG_NAME;
-		} else if ('a' <= c && c <= 'z') {
-			START_BUF(tokeniser->context.current_tag.name,
-					cptr, len);
-			tokeniser->context.current_tag.n_attributes = 0;
+		tokeniser->state = STATE_TAG_NAME;
+	} else if ('a' <= c && c <= 'z') {
+		START_BUF(tokeniser->context.current_tag.name,
+				cptr, len);
+		tokeniser->context.current_tag.n_attributes = 0;
 
-			tokeniser->context.current_tag_type =
-					HUBBUB_TOKEN_END_TAG;
+		tokeniser->context.current_tag_type =
+				HUBBUB_TOKEN_END_TAG;
 
-			tokeniser->context.pending += len;
+		tokeniser->context.pending += len;
 
-			tokeniser->state = STATE_TAG_NAME;
-		} else if (c == '>') {
-			/* Cursor still at "</", need to collect ">" */
-			tokeniser->context.pending += len;
+		tokeniser->state = STATE_TAG_NAME;
+	} else if(tokeniser->content_model == HUBBUB_CONTENT_MODEL_RCDATA ||
+			tokeniser->content_model == HUBBUB_CONTENT_MODEL_RAWTEXT) {
+		tokeniser->state = STATE_DATA;
+	} else if (c == '>') {
+		/* Cursor still at "</", need to collect ">" */
+		tokeniser->context.pending += len;
 
-			/* Now need to advance past "</>" */
-			parserutils_inputstream_advance(tokeniser->input,
-					tokeniser->context.pending);
-			tokeniser->context.pending = 0;
+		/* Now need to advance past "</>" */
+		parserutils_inputstream_advance(tokeniser->input,
+				tokeniser->context.pending);
+		tokeniser->context.pending = 0;
 
-			/** \todo parse error */
-			tokeniser->state = STATE_DATA;
-		} else {
-			/** \todo parse error */
+		/** \todo parse error */
+		tokeniser->state = STATE_DATA;
+	} else {
+		/** \todo parse error */
 
-			/* Cursor still at "</", need to advance past it */
-			parserutils_inputstream_advance(tokeniser->input,
-					tokeniser->context.pending);
-			tokeniser->context.pending = 0;
+		/* Cursor still at "</", need to advance past it */
+		parserutils_inputstream_advance(tokeniser->input,
+				tokeniser->context.pending);
+		tokeniser->context.pending = 0;
 
-			tokeniser->state = STATE_BOGUS_COMMENT;
-		}
+		tokeniser->state = STATE_BOGUS_COMMENT;
 	}
 
 	return HUBBUB_OK;
@@ -1131,36 +1107,58 @@ hubbub_error hubbub_tokeniser_handle_tag_name(hubbub_tokeniser *tokeniser)
 	if (error != PARSERUTILS_OK) {
 		if (error == PARSERUTILS_EOF) {
 			tokeniser->state = STATE_DATA;
-			parserutils_inputstream_advance(tokeniser->input,
-					tokeniser->context.pending);
-			return HUBBUB_OK;
+			if(tokeniser->content_model == HUBBUB_CONTENT_MODEL_RCDATA ||
+					tokeniser->content_model == HUBBUB_CONTENT_MODEL_RAWTEXT) {
+				return emit_current_chars(tokeniser);
+			} else {
+				parserutils_inputstream_advance(tokeniser->input,
+						tokeniser->context.pending);
+				return HUBBUB_OK;
+			}
 		} else {
 			return hubbub_error_from_parserutils_error(error);
 		}
 	}
 
 	c = *cptr;
-
-	if (c == '\t' || c == '\n' || c == '\f' || c == ' ' || c == '\r') {
-		tokeniser->context.pending += len;
-		tokeniser->state = STATE_BEFORE_ATTRIBUTE_NAME;
-	} else if (c == '>') {
-		tokeniser->context.pending += len;
-		tokeniser->state = STATE_DATA;
-		return emit_current_tag(tokeniser);
-	} else if (c == '\0') {
-		COLLECT(ctag->name, u_fffd, sizeof(u_fffd));
-		tokeniser->context.pending += len;
-	} else if (c == '/') {
-		tokeniser->context.pending += len;
-		tokeniser->state = STATE_SELF_CLOSING_START_TAG;
-	} else if ('A' <= c && c <= 'Z') {
+	if ('A' <= c && c <= 'Z') {
 		uint8_t lc = (c + 0x20);
 		COLLECT(ctag->name, &lc, len);
 		tokeniser->context.pending += len;
-	} else {
-		COLLECT(ctag->name, cptr, len);
+	} else if('a' <=c && c <= 'z') {
+		COLLECT(ctag->name, &c, len);
 		tokeniser->context.pending += len;
+	} else if (tokeniser->context.close_tag_match.match == false &&
+			tokeniser->content_model !=
+					HUBBUB_CONTENT_MODEL_PCDATA) {
+		/* We should emit "</" here, but instead we leave it in the
+		 * buffer so the data state emits it with any characters
+		 * following it */
+		tokeniser->state = STATE_DATA;
+
+		return emit_current_chars(tokeniser);
+	} else {
+		if (c == '\t' || c == '\n' || c == '\f' || c == ' ' || c == '\r') {
+			tokeniser->context.pending += len;
+			tokeniser->state = STATE_BEFORE_ATTRIBUTE_NAME;
+		} else if (c == '>') {
+			tokeniser->context.pending += len;
+			tokeniser->state = STATE_DATA;
+			return emit_current_tag(tokeniser);
+		} else if (c == '\0') {
+			COLLECT(ctag->name, u_fffd, sizeof(u_fffd));
+			tokeniser->context.pending += len;
+		} else if (c == '/') {
+			tokeniser->context.pending += len;
+			tokeniser->state = STATE_SELF_CLOSING_START_TAG;
+		} else if(tokeniser->content_model == HUBBUB_CONTENT_MODEL_RCDATA ||
+			tokeniser->content_model == HUBBUB_CONTENT_MODEL_RAWTEXT) {
+			tokeniser->state = STATE_DATA;
+			return emit_current_chars(tokeniser);
+		} else {
+			COLLECT(ctag->name, cptr, len);
+			tokeniser->context.pending += len;
+		}
 	}
 
 	return HUBBUB_OK;
diff --git a/test/data/tokeniser2/INDEX b/test/data/tokeniser2/INDEX
index 33b9060..4b9e037 100644
--- a/test/data/tokeniser2/INDEX
+++ b/test/data/tokeniser2/INDEX
@@ -6,11 +6,11 @@ test1.test		html5lib tests (part 1)
 test2.test		html5lib tests (part 2)
 test3.test		html5lib tests (part 3)
 test4.test		html5lib tests (part 4)
-#contentModelFlags.test	html5lib content model tests
+contentModelFlags.test	html5lib content model tests
 entities.test		html5lib entity tests
-#escapeFlag.test		html5lib escape flag tests
+escapeFlag.test		html5lib escape flag tests
 numericEntities.test	html5lib numeric entities tests
-#unicodeChars.test	html5lib unicode character tests
+unicodeChars.test	html5lib unicode character tests
 cdata.test		CDATA section tests
 regression.test		Regression tests
 #domjs.test		NA
author	Rupinder Singh Khokhar <rsk1coder99@gmail.com>	2014-06-17 00:54:12 +0530
committer	Rupinder Singh Khokhar <rsk1coder99@gmail.com>	2014-07-09 10:04:21 +0530
commit	8c55e32256f4081d097cd7114fcf5e307a8a9288 (patch)
tree	4377a7d60ab894e15b3da93d1191a5090603f09b
parent	e68a4b8ac410f402d12308ce7d63083b78d7ee89 (diff)
download	libhubbub-8c55e32256f4081d097cd7114fcf5e307a8a9288.tar.gz libhubbub-8c55e32256f4081d097cd7114fcf5e307a8a9288.tar.bz2