From a501b83d9be45e80b59fc8eca8e1816f467b4662 Mon Sep 17 00:00:00 2001
From: Rupinder Singh Khokhar
Date: Fri, 13 Jun 2014 00:51:59 +0530
Subject: Updating Named Entities API in tokeniser

---
 build/make-entities.pl | 27 +++++++++++++++++++++++----
 1 file changed, 23 insertions(+), 4 deletions(-)

(limited to 'build/make-entities.pl')

diff --git a/build/make-entities.pl b/build/make-entities.pl
index 7492052..ff4113f 100644
--- a/build/make-entities.pl
+++ b/build/make-entities.pl
@@ -4,8 +4,10 @@
 # http://www.opensource.org/licenses/mit-license.php
 # Copyright 2010 Daniel Silverstone
 # John-Mark Bell
+# Rupinder Singh Khokhar
 
 use strict;
+use Encode;
 
 use constant ENTITIES_FILE => 'build/Entities';
 use constant ENTITIES_INC => 'src/tokeniser/entities.inc';
@@ -21,8 +23,25 @@ while (my $line = <INFILE>) {
    next if ($line eq '');
    my @elements = split /\s+/, $line;
    my $entity = shift @elements;
-   my $code = shift @elements;
-   $entities{$entity} = $code;
+   my $len = 0;
+   my $code="";
+   while (@elements) {
+      my $ucs4 = hex(shift(@elements));
+      my $utf8 = Encode::encode_utf8(pack('U', $ucs4));
+      $len += length($utf8);
+      if ($ucs4 < 0x20 || $ucs4 == 0x22 ||
+          $ucs4 == 0x27 || $ucs4 == 0x5C ||
+          $ucs4 >= 0x7f) {
+         # Unsafe, or non-ASCII: emit escaped
+         for my $octet (unpack('C*', $utf8)) {
+            $code = $code . sprintf("\\x%02x", $octet);
+         }
+      } else {
+         # Safe: emit value
+         $code = $code . $utf8;
+      }
+   }
+   $entities{$entity} = "(const uint8_t *)\"$code\",$len";
 }
 
 close(INFILE);
@@ -113,9 +132,9 @@ foreach my $node (@nodelist) {
    $split = ord($split) if ($split ne '');
    $split = 0 if ($split eq '');
 
-   $value = "0" unless defined($value);
+   $value = "NULL,0" unless defined($value);
 
-   $output .= "\t{ $split, $lt, $eq, $gt, $value },\n";
+   $output .= "\t{ $split, $lt, $eq, $gt, {$value} },\n";
 }
 
 $output .= "};\n\n";
-- 
cgit v1.2.3