summary | refs | log | tree | commit | diff
path: root/data_generator.rb
diff options
context:
space:
mode:
author: Steven G. Johnson <stevenj@mit.edu> 2014-12-12 16:27:49 -0500
committer: Steven G. Johnson <stevenj@mit.edu> 2014-12-12 16:30:31 -0500
commit: 397a1eabea5d7bca2f5f9831ac9431b5b85017fc (patch)
tree: cb113b817ce4cd76594b1fa2db827b66b7909148 /data_generator.rb
parent: 539d2cc2024f494b1e3292d4730bdc96390e1361 (diff)
download: libutf8proc-397a1eabea5d7bca2f5f9831ac9431b5b85017fc.tar.gz
          libutf8proc-397a1eabea5d7bca2f5f9831ac9431b5b85017fc.tar.bz2
update graphemes for Unicode 7, add utf8proc_grapheme_break function
Diffstat (limited to 'data_generator.rb')
-rw-r--r-- data_generator.rb 22
1 files changed, 11 insertions, 11 deletions
diff --git a/data_generator.rb b/data_generator.rb
index f0e7aa5..1439038 100644
--- a/data_generator.rb
+++ b/data_generator.rb
@@ -75,13 +75,13 @@ $ignorable_list.each_line do |entry|
end
end
-$grapheme_extend_list = File.read("DerivedCoreProperties.txt")[/# Derived Property: Grapheme_Extend.*?# Total code points:/m]
-$grapheme_extend = []
-$grapheme_extend_list.each_line do |entry|
- if entry =~ /^([0-9A-F]+)\.\.([0-9A-F]+)/
- $1.hex.upto($2.hex) { |e2| $grapheme_extend << e2 }
- elsif entry =~ /^[0-9A-F]+/
- $grapheme_extend << $&.hex
+$grapheme_boundclass_list = File.read("GraphemeBreakProperty.txt")
+$grapheme_boundclass = Hash.new("UTF8PROC_BOUNDCLASS_OTHER")
+$grapheme_boundclass_list.each_line do |entry|
+ if entry =~ /^([0-9A-F]+)\.\.([0-9A-F]+)\s*;\s*([A-Za-z_]+)/
+ $1.hex.upto($2.hex) { |e2| $grapheme_boundclass[e2] = "UTF8PROC_BOUNDCLASS_" + $3.upcase }
+ elsif entry =~ /^([0-9A-F]+)\s*;\s*([A-Za-z_]+)/
+ $grapheme_boundclass[$1.hex] = "UTF8PROC_BOUNDCLASS_" + $2.upcase
end
end
@@ -161,18 +161,18 @@ class UnicodeChar
"#{str2c bidi_class, 'BIDI_CLASS'}, " <<
"#{str2c decomp_type, 'DECOMP_TYPE'}, " <<
"#{ary2c decomp_mapping}, " <<
- "#{bidi_mirrored}, " <<
+ "#{ary2c case_folding}, " <<
"#{uppercase_mapping or -1}, " <<
"#{lowercase_mapping or -1}, " <<
"#{titlecase_mapping or -1}, " <<
"#{comb1_indicies[code] ?
(comb1_indicies[code]*comb2_indicies.keys.length) : -1
}, #{comb2_indicies[code] or -1}, " <<
+ "#{bidi_mirrored}, " <<
"#{$exclusions.include?(code) or $excl_version.include?(code)}, " <<
"#{$ignorable.include?(code)}, " <<
"#{%W[Zl Zp Cc Cf].include?(category) and not [0x200C, 0x200D].include?(category)}, " <<
- "#{$grapheme_extend.include?(code)}, " <<
- "#{ary2c case_folding}},\n"
+ "#{$grapheme_boundclass[code]}},\n"
end
end
@@ -295,7 +295,7 @@ end
$stdout << "};\n\n"
$stdout << "const utf8proc_property_t utf8proc_properties[] = {\n"
-$stdout << " {0, 0, 0, 0, NULL, false, -1, -1, -1, -1, -1, false},\n"
+$stdout << " {0, 0, 0, 0, NULL, NULL, -1, -1, -1, -1, -1, false,false,false,false, UTF8PROC_BOUNDCLASS_OTHER},\n"
properties.each { |line|
$stdout << line
}