From 397a1eabea5d7bca2f5f9831ac9431b5b85017fc Mon Sep 17 00:00:00 2001 From: "Steven G. Johnson" Date: Fri, 12 Dec 2014 16:27:49 -0500 Subject: update graphemes for Unicode 7, add utf8proc_grapheme_break function --- data_generator.rb | 22 +++++++++++----------- 1 file changed, 11 insertions(+), 11 deletions(-) (limited to 'data_generator.rb') diff --git a/data_generator.rb b/data_generator.rb index f0e7aa5..1439038 100644 --- a/data_generator.rb +++ b/data_generator.rb @@ -75,13 +75,13 @@ $ignorable_list.each_line do |entry| end end -$grapheme_extend_list = File.read("DerivedCoreProperties.txt")[/# Derived Property: Grapheme_Extend.*?# Total code points:/m] -$grapheme_extend = [] -$grapheme_extend_list.each_line do |entry| - if entry =~ /^([0-9A-F]+)\.\.([0-9A-F]+)/ - $1.hex.upto($2.hex) { |e2| $grapheme_extend << e2 } - elsif entry =~ /^[0-9A-F]+/ - $grapheme_extend << $&.hex +$grapheme_boundclass_list = File.read("GraphemeBreakProperty.txt") +$grapheme_boundclass = Hash.new("UTF8PROC_BOUNDCLASS_OTHER") +$grapheme_boundclass_list.each_line do |entry| + if entry =~ /^([0-9A-F]+)\.\.([0-9A-F]+)\s*;\s*([A-Za-z_]+)/ + $1.hex.upto($2.hex) { |e2| $grapheme_boundclass[e2] = "UTF8PROC_BOUNDCLASS_" + $3.upcase } + elsif entry =~ /^([0-9A-F]+)\s*;\s*([A-Za-z_]+)/ + $grapheme_boundclass[$1.hex] = "UTF8PROC_BOUNDCLASS_" + $2.upcase end end @@ -161,18 +161,18 @@ class UnicodeChar "#{str2c bidi_class, 'BIDI_CLASS'}, " << "#{str2c decomp_type, 'DECOMP_TYPE'}, " << "#{ary2c decomp_mapping}, " << - "#{bidi_mirrored}, " << + "#{ary2c case_folding}, " << "#{uppercase_mapping or -1}, " << "#{lowercase_mapping or -1}, " << "#{titlecase_mapping or -1}, " << "#{comb1_indicies[code] ? (comb1_indicies[code]*comb2_indicies.keys.length) : -1 }, #{comb2_indicies[code] or -1}, " << + "#{bidi_mirrored}, " << "#{$exclusions.include?(code) or $excl_version.include?(code)}, " << "#{$ignorable.include?(code)}, " << "#{%W[Zl Zp Cc Cf].include?(category) and not [0x200C, 0x200D].include?(category)}, " << - "#{$grapheme_extend.include?(code)}, " << - "#{ary2c case_folding}},\n" + "#{$grapheme_boundclass[code]}},\n" end end @@ -295,7 +295,7 @@ end $stdout << "};\n\n" $stdout << "const utf8proc_property_t utf8proc_properties[] = {\n" -$stdout << " {0, 0, 0, 0, NULL, false, -1, -1, -1, -1, -1, false},\n" +$stdout << " {0, 0, 0, 0, NULL, NULL, -1, -1, -1, -1, -1, false,false,false,false, UTF8PROC_BOUNDCLASS_OTHER},\n" properties.each { |line| $stdout << line } -- cgit v1.2.3