#!/usr/bin/env ruby
#
# This script has been updated to accept more command-line arguments:
#
#    -u, --url                        URL to process
#    -m, --machine                    Machine name
#    -p, --properties                 Properties to add to the machine
#    -o, --output                     Write output to file
#
# Updated by: Marty Schoch <marty.schoch@gmail.com>
#
# This script uses the unicode spec to generate a Ragel state machine
# that recognizes unicode alphanumeric characters.  It generates 5
# character classes: uupper, ulower, ualpha, udigit, and ualnum.
# Currently supported encodings are UTF-8 [default] and UCS-4.
#
# Usage: unicode2ragel.rb [options]
#    -e, --encoding [ucs4 | utf8]     Data encoding
#    -h, --help                       Show this message
#
# This script was originally written as part of the Ferret search
# engine library.
#
# Author: Rakan El-Khalil <rakan@well.com>

require 'optparse'
require 'open-uri'

ENCODINGS = [ :utf8, :ucs4 ]
ALPHTYPES = { :utf8 => "byte", :ucs4 => "rune" }
DEFAULT_CHART_URL = "http://www.unicode.org/Public/5.1.0/ucd/DerivedCoreProperties.txt"
DEFAULT_MACHINE_NAME= "WChar"

###
# Display vars & default option

TOTAL_WIDTH = 80
RANGE_WIDTH = 23
@encoding = :utf8
@chart_url = DEFAULT_CHART_URL
machine_name = DEFAULT_MACHINE_NAME
properties = []
@output = $stdout

###
# Option parsing

cli_opts = OptionParser.new do |opts|
  opts.on("-e", "--encoding [ucs4 | utf8]", "Data encoding") do |o|
    # The argument is declared optional, so a bare "-e" yields nil;
    # keep the default encoding in that case instead of crashing.
    @encoding = o.downcase.to_sym if o
  end
  opts.on("-h", "--help", "Show this message") do
    puts opts
    exit
  end
  opts.on("-u", "--url URL", "URL to process") do |o|
    @chart_url = o
  end
  opts.on("-m", "--machine MACHINE_NAME", "Machine name") do |o|
    machine_name = o
  end
  opts.on("-p", "--properties x,y,z", Array, "Properties to add to machine") do |o|
    properties = o
  end
  opts.on("-o", "--output FILE", "output file") do |o|
    @output = File.new(o, "w+")
  end
end

cli_opts.parse(ARGV)
unless ENCODINGS.member? @encoding
  puts "Invalid encoding: #{@encoding}"
  puts cli_opts
  exit
end

##
# Downloads the document at url and yields every line belonging to
# the given property as a pair: the codepoint range (an Integer
# Range parsed from the hex column) and the trailing description
# (the text after the last '#' on the line).

def each_alpha( url, property )
  URI.open( url ) do |file|
    file.each_line do |line|
      next if line =~ /^#/
      next if line !~ /; #{property} *#/

      # Limit the split so a ';' inside the trailing comment cannot
      # truncate the description.
      range, description = line.split(/;/, 2)
      range = range.strip
      description = description.sub(/.*#/, '').strip

      if range =~ /\.\./
        start, stop = range.split '..'
      else
        start = stop = range
      end

      yield start.hex .. stop.hex, description
    end
  end
end

###
# Formats to hex at minimum width, padded to a whole number of bytes
# (i.e. an even number of hex digits).

def to_hex( n )
  r = "%0X" % n
  r = "0#{r}" unless (r.length % 2).zero?
  r
end

###
# UCS4 is just a straight hex conversion of the unicode codepoint.
# Returns a one-element array holding "0xAA" or "0xAA..0xBB".

def to_ucs4( range )
  rangestr = "0x" + to_hex(range.begin)
  rangestr << "..0x" + to_hex(range.end) if range.begin != range.end
  [ rangestr ]
end

##
# 0x00     - 0x7f     -> 0zzzzzzz[7]
# 0x80     - 0x7ff    -> 110yyyyy[5] 10zzzzzz[6]
# 0x800    - 0xffff   -> 1110xxxx[4] 10yyyyyy[6] 10zzzzzz[6]
# 0x010000 - 0x10ffff -> 11110www[3] 10xxxxxx[6] 10yyyyyy[6] 10zzzzzz[6]

UTF8_BOUNDARIES = [0x7f, 0x7ff, 0xffff, 0x10ffff]

###
# Encodes codepoint n as UTF-8 and returns the encoded bytes as one
# hex string (e.g. 0x20AC -> "E282AC").  Codepoints above 0x10ffff
# fall through every branch and encode as "00".

def to_utf8_enc( n )
  r = 0
  if n <= 0x7f
    r = n
  elsif n <= 0x7ff
    y = 0xc0 | (n >> 6)
    z = 0x80 | (n & 0x3f)
    r = y << 8 | z
  elsif n <= 0xffff
    x = 0xe0 | (n >> 12)
    y = 0x80 | (n >>  6) & 0x3f
    z = 0x80 |  n        & 0x3f
    r = x << 16 | y << 8 | z
  elsif n <= 0x10ffff
    w = 0xf0 | (n >> 18)
    x = 0x80 | (n >> 12) & 0x3f
    y = 0x80 | (n >>  6) & 0x3f
    z = 0x80 |  n        & 0x3f
    r = w << 24 | x << 16 | y << 8 | z
  end

  to_hex(r)
end

###
# Inverse of to_utf8_enc: takes the hex string of a UTF-8 byte
# sequence and returns the decoded codepoint.  The branch is chosen
# from the magnitude of the packed bytes, so a lone byte above 0x7f
# is handled by the two-byte branch (only its low six bits survive).

def from_utf8_enc( n )
  n = n.hex
  r = 0
  if n <= 0x7f
    r = n
  elsif n <= 0xdfff
    y = (n >> 8) & 0x1f
    z =  n       & 0x3f
    r = y << 6 | z
  elsif n <= 0xefffff
    x = (n >> 16) & 0x0f
    y = (n >>  8) & 0x3f
    z =  n        & 0x3f
    # x sits above the two 6-bit continuation payloads: shift by 12.
    r = x << 12 | y << 6 | z
  elsif n <= 0xf7ffffff
    w = (n >> 24) & 0x07
    x = (n >> 16) & 0x3f
    y = (n >>  8) & 0x3f
    z =  n        & 0x3f
    r = w << 18 | x << 12 | y << 6 | z
  end
  r
end

###
# Given a range, splits it up into ranges that can be continuously
# encoded into utf8.  Eg: 0x00 .. 0xff => [0x00..0x7f, 0x80..0xff]
# This is not strictly needed since the current [5.1] unicode standard
# doesn't have ranges that straddle utf8 boundaries.  This is included
# for completeness as there is no telling if that will ever change.
#
# Anything above the last boundary (0x10ffff) is dropped, so a range
# lying entirely beyond it yields an empty array.

def utf8_ranges( range )
  ranges = []
  UTF8_BOUNDARIES.each do |max|
    if range.begin <= max
      if range.end <= max
        ranges << range
        return ranges
      end

      ranges << (range.begin .. max)
      range = (max + 1) .. range.end
    end
  end
  ranges
end

###
# Turns a pair of equal-length hex byte strings (start, stop) into a
# list of Ragel byte-range sequences covering everything in between.
# Works one byte (two hex digits) at a time, recursing on the rest.

def build_range( start, stop )
  size = start.size/2
  left = size - 1
  return [""] if size < 1

  a = start[0..1]
  b = stop[0..1]

  ###
  # Shared prefix

  if a == b
    return build_range(start[2..-1], stop[2..-1]).map do |elt|
      "0x#{a} " + elt
    end
  end

  ###
  # Unshared prefix, end of run

  return ["0x#{a}..0x#{b} "] if left.zero?

  ###
  # Unshared prefix, not end of run
  # Range can be 0x123456..0x56789A
  # Which is equivalent to:
  #     0x123456 .. 0x12FFFF
  #     0x130000 .. 0x55FFFF
  #     0x560000 .. 0x56789A

  ret = []
  ret << build_range(start, a + "FF" * left)

  ###
  # Only generate middle range if need be.

  if a.hex+1 != b.hex
    max = to_hex(b.hex - 1)
    max = "FF" if b == "FF"
    ret << "0x#{to_hex(a.hex+1)}..0x#{max} " + "0x00..0xFF " * left
  end

  ###
  # Don't generate last range if it is covered by first range

  ret << build_range(b + "00" * left, stop) unless b == "FF"
  ret.flatten
end

###
# Converts a codepoint range into the list of UTF-8 byte-range
# sequences that match it.  Always returns an array (empty when the
# range lies entirely above 0x10ffff).

def to_utf8( range )
  utf8_ranges( range ).flat_map do |r|
    build_range to_utf8_enc(r.begin), to_utf8_enc(r.end)
  end
end

##
# Perform a 3-way comparison of the number of codepoints advertised by
# the unicode spec for the given range, the originally parsed range,
# and the resulting utf8 encoded range.
#
# count_codepoints multiplies together the width of every "0xA..0xB"
# element in one generated code string; single values count as 1.

def count_codepoints( code )
  code.split(' ').inject(1) do |acc, elt|
    m = elt.match(/0x(.+)\.\.0x(.+)/)
    if m.nil?
      acc
    elsif @encoding == :utf8
      acc * (from_utf8_enc(m[2]) - from_utf8_enc(m[1]) + 1)
    else
      acc * (m[2].hex - m[1].hex + 1)
    end
  end
end

###
# True when the count from the spec description ("[N]", default 1),
# the size of the parsed range and the sum over the generated codes
# all agree.

def is_valid?( range, desc, codes )
  spec_count = (desc[/\[(\d+)\]/, 1] || 1).to_i
  range_count = range.end - range.begin + 1

  sum = codes.inject(0) { |acc, elt| acc + count_codepoints(elt) }
  sum == spec_count && sum == range_count
end

##
# Generate the state machine to @output (stdout unless -o was given).
# name is the Ragel definition name, property the unicode property
# whose ranges are fetched from @chart_url.

def generate_machine( name, property )
  pipe = " "
  @output.puts " #{name} = "
  each_alpha( @chart_url, property ) do |range, desc|

    codes = (@encoding == :ucs4) ? to_ucs4(range) : to_utf8(range)

    #raise "Invalid encoding of range #{range}: #{codes.inspect}" unless
    #  is_valid? range, desc, codes

    # Column width: widest code, but never narrower than RANGE_WIDTH.
    # Seeding with RANGE_WIDTH also keeps this safe for an empty list.
    range_width = [RANGE_WIDTH, *codes.map { |a| a.size }].max

    # Whatever is left of the line (minus decoration) is for the text.
    desc_width = TOTAL_WIDTH - range_width - 11

    if desc.size > desc_width
      desc = desc[0..desc_width - 4] + "..."
    end

    codes.each_with_index do |r, idx|
      # Only the first line of a multi-line entry carries the text.
      desc = "" unless idx.zero?
      code = "%-#{range_width}s" % r
      @output.puts " #{pipe} #{code} ##{desc}"
      pipe = "|"
    end
  end
  @output.puts " ;"
  @output.puts ""
end

@output.puts <<EOF
# The following Ragel file was autogenerated with #{$0}
# from: #{@chart_url}
#
# It defines #{properties}.
#
# To use this, make sure that your alphtype is set to #{ALPHTYPES[@encoding]},
# and that your input is in #{@encoding}.

%%{
 machine #{machine_name};

EOF

properties.each { |x| generate_machine( x, x ) }

@output.puts <<EOF
}%%
EOF

# Flush and release the file opened by -o; never close stdout.
@output.close unless @output.equal?($stdout)