1#!/usr/bin/env ruby
2#
# This script has been updated to accept more command-line arguments:
4#
5#    -u, --url                        URL to process
6#    -m, --machine                    Machine name
7#    -p, --properties                 Properties to add to the machine
8#    -o, --output                     Write output to file
9#
10# Updated by: Marty Schoch <marty.schoch@gmail.com>
11#
12# This script uses the unicode spec to generate a Ragel state machine
13# that recognizes unicode alphanumeric characters.  It generates 5
14# character classes: uupper, ulower, ualpha, udigit, and ualnum.
15# Currently supported encodings are UTF-8 [default] and UCS-4.
16#
17# Usage: unicode2ragel.rb [options]
18#    -e, --encoding [ucs4 | utf8]     Data encoding
19#    -h, --help                       Show this message
20#
21# This script was originally written as part of the Ferret search
22# engine library.
23#
24# Author: Rakan El-Khalil <rakan@well.com>
25
26require 'optparse'
27require 'open-uri'
28
# Encodings the generator can emit, and the Ragel alphtype that the
# generated header tells the user to select for each of them.
# All constants are frozen: they are only ever read or used as initial
# values, never modified in place.
ENCODINGS = [ :utf8, :ucs4 ].freeze
ALPHTYPES = { :utf8 => "byte", :ucs4 => "rune" }.freeze
# Values used when -u/--url and -m/--machine are not given.
DEFAULT_CHART_URL = "http://www.unicode.org/Public/5.1.0/ucd/DerivedCoreProperties.txt".freeze
DEFAULT_MACHINE_NAME = "WChar".freeze

###
# Display vars & default option

# TOTAL_WIDTH is the column budget of one generated line; RANGE_WIDTH is
# the minimum width of the column holding the byte/codepoint patterns.
TOTAL_WIDTH = 80
RANGE_WIDTH = 23
@encoding = :utf8
@chart_url = DEFAULT_CHART_URL
machine_name = DEFAULT_MACHINE_NAME
properties = []
@output = $stdout
44
###
# Option parsing
#
# Each handler overwrites one of the defaults set above.  Errors (unknown
# options, missing arguments, unsupported encoding) are reported on stderr
# together with the usage text, and the script exits with status 1.

cli_opts = OptionParser.new do |opts|
  opts.on("-e", "--encoding [ucs4 | utf8]", "Data encoding") do |o|
    # The argument is declared optional, so a bare "-e" hands us nil.
    # to_s turns that into an empty symbol, which the ENCODINGS check
    # below rejects with a proper message instead of a NoMethodError.
    @encoding = o.to_s.downcase.to_sym
  end
  opts.on("-h", "--help", "Show this message") do
    puts opts
    exit
  end
  opts.on("-u", "--url URL", "URL to process") do |o|
    @chart_url = o
  end
  opts.on("-m", "--machine MACHINE_NAME", "Machine name") do |o|
    machine_name = o
  end
  opts.on("-p", "--properties x,y,z", Array, "Properties to add to machine") do |o|
    properties = o
  end
  opts.on("-o", "--output FILE", "output file") do |o|
    # "w+" truncates an existing file; the handle stays open for the
    # remainder of the script, which writes everything through @output.
    @output = File.new(o, "w+")
  end
end

begin
  cli_opts.parse(ARGV)
rescue OptionParser::ParseError => e
  # Unknown switch or missing required argument: show what went wrong
  # and how to call the script rather than a stack trace.
  $stderr.puts e.message
  $stderr.puts cli_opts
  exit 1
end

unless ENCODINGS.member? @encoding
  $stderr.puts "Invalid encoding: #{@encoding}"
  $stderr.puts cli_opts
  exit 1
end
76
##
# Opens the document at +url+ with URI.open and yields, for every line
# that assigns +property+, the codepoint range and the description.
#
# Lines starting with "#" and lines that do not contain
# "; <property> #" are skipped.  The range is the first ";"-separated
# field ("0041..005A" or a single "00AA") converted to a Range of
# Integers; a single codepoint becomes a one-element range.  The
# description is the text after the last "#" on the line, stripped.
#
# The description is taken from the whole line rather than from the
# second field, so lines with more than two fields (for example
# "0915..0939 ; InCB; Consonant # ...") are handled as well.
#
# +property+ is interpolated into a regular expression unescaped.

def each_alpha( url, property )
  # Built once: the pattern does not change from line to line.
  wanted = /; #{property} *#/

  URI.open( url ) do |file|
    file.each_line do |line|
      next if line.start_with?('#')
      next unless line.match?(wanted)

      range = line.split(';', 2).first.strip
      # The match above guarantees that the line contains a "#".
      description = line.rpartition('#').last.strip

      start, stop = range.split('..')
      stop ||= start

      yield start.hex .. stop.hex, description
    end
  end
end
100
###
# Renders +n+ as uppercase hexadecimal.  A single "0" is prepended when
# the digit count is odd, so the result always describes whole bytes.

def to_hex( n )
  digits = format("%0X", n)
  digits.length.odd? ? "0" + digits : digits
end
109
###
# In UCS-4 a codepoint is its own encoding, so a range is written as
# plain hex: "0xLOW..0xHIGH", or just "0xLOW" when both ends are equal.
# Returns a one-element Array holding that string.

def to_ucs4( range )
  low, high = range.begin, range.end
  text = "0x#{to_hex(low)}"
  text += "..0x#{to_hex(high)}" unless low == high
  [ text ]
end
118
##
# UTF-8 bit layout per codepoint interval (bit counts in brackets):
#
# 0x00     - 0x7f     -> 0zzzzzzz[7]
# 0x80     - 0x7ff    -> 110yyyyy[5] 10zzzzzz[6]
# 0x800    - 0xffff   -> 1110xxxx[4] 10yyyyyy[6] 10zzzzzz[6]
# 0x010000 - 0x10ffff -> 11110www[3] 10xxxxxx[6] 10yyyyyy[6] 10zzzzzz[6]
#
# UTF8_BOUNDARIES holds the upper end of each interval, in ascending
# order.  Frozen because it is a lookup table that is only iterated.

UTF8_BOUNDARIES = [0x7f, 0x7ff, 0xffff, 0x10ffff].freeze
126
###
# Encodes codepoint +n+ as UTF-8 and returns the encoded bytes as one
# uppercase hex string (two digits per byte, via to_hex).  Codepoints
# above 0x10ffff are not encoded and produce "00".

def to_utf8_enc( n )
  # Continuation byte carrying the six bits of n that start at +shift+.
  cont = ->(shift) { 0x80 | ((n >> shift) & 0x3f) }

  packed =
    if n <= 0x7f
      n
    elsif n <= 0x7ff
      (0xc0 | (n >> 6)) << 8 | cont.call(0)
    elsif n <= 0xffff
      (0xe0 | (n >> 12)) << 16 | cont.call(6) << 8 | cont.call(0)
    elsif n <= 0x10ffff
      (0xf0 | (n >> 18)) << 24 | cont.call(12) << 16 | cont.call(6) << 8 | cont.call(0)
    else
      0
    end

  to_hex(packed)
end
150
###
# Decodes a UTF-8 sequence given as a hex string (two digits per byte,
# e.g. "E282AC") back into its codepoint.  The sequence length is
# inferred from the magnitude of the packed value:
#
#   <= 0x7f        one byte
#   <= 0xdfff      two bytes    (5 + 6 payload bits)
#   <= 0xefffff    three bytes  (4 + 6 + 6 payload bits)
#   <= 0xf7ffffff  four bytes   (3 + 6 + 6 + 6 payload bits)
#
# Larger values return 0.  A lone byte above 0x7f (such as "BF") lands
# in the two-byte branch and therefore yields its low six bits.

def from_utf8_enc( n )
  n = n.hex
  r = 0
  if n <= 0x7f
    r = n
  elsif n <= 0xdfff
    y = (n >> 8) & 0x1f
    z =  n       & 0x3f
    r = y << 6 | z
  elsif n <= 0xefffff
    x = (n >> 16) & 0x0f
    y = (n >>  8) & 0x3f
    z =  n        & 0x3f
    # The lead byte's four bits sit above two six-bit groups: shift by 12.
    r = x << 12 | y << 6 | z
  elsif n <= 0xf7ffffff
    w = (n >> 24) & 0x07
    x = (n >> 16) & 0x3f
    y = (n >>  8) & 0x3f
    z =  n        & 0x3f
    r = w << 18 | x << 12 | y << 6 | z
  end
  r
end
174
###
# Cuts +range+ at the UTF8_BOUNDARIES so that every piece is encoded
# with a single UTF-8 sequence length.
# Eg: 0x00 .. 0xff => [0x00..0x7f, 0x80..0xff]
# The current [5.1] unicode standard has no ranges that straddle these
# boundaries, so this exists for completeness in case that ever changes.
# Whatever lies above the last boundary is left out of the result.

def utf8_ranges( range )
  remaining = range
  UTF8_BOUNDARIES.each_with_object([]) do |limit, pieces|
    # Boundaries below the start of what is left do not apply.
    next if remaining.begin > limit

    # Everything left fits under this boundary: done.
    return (pieces << remaining) if remaining.end <= limit

    pieces << (remaining.begin .. limit)
    remaining = (limit + 1) .. remaining.end
  end
end
197
###
# Expands the interval +start+ .. +stop+ into Ragel alternatives.  Both
# arguments are hex strings (two uppercase digits per byte) of the same
# length.  Each returned string is a sequence of space-terminated byte
# patterns, one per byte: either a single byte ("0xE1 ") or a byte range
# ("0x80..0xBF ").  An empty +start+ yields [""].

def build_range( start, stop )
  byte_count = start.size / 2
  return [""] if byte_count < 1

  trailing = byte_count - 1
  lo = start[0, 2]
  hi = stop[0, 2]

  # Same leading byte: emit it once and recurse on the remaining bytes.
  if lo == hi
    return build_range(start[2..-1], stop[2..-1]).map { |rest| "0x#{lo} " + rest }
  end

  # Differing bytes in the last position: a plain byte range ends the run.
  return ["0x#{lo}..0x#{hi} "] if trailing.zero?

  # Differing leading bytes with more bytes to follow.
  # 0x123456..0x56789A is covered by three groups:
  #     0x123456 .. 0x12FFFF   (head)
  #     0x130000 .. 0x55FFFF   (body)
  #     0x560000 .. 0x56789A   (tail)
  hi_is_ff = hi == "FF"
  any_trailing = "0x00..0xFF " * trailing

  pieces = build_range(start, lo + "FF" * trailing)

  # The body only exists when the two leading bytes are not adjacent.
  unless lo.hex + 1 == hi.hex
    body_top = hi_is_ff ? "FF" : to_hex(hi.hex - 1)
    pieces << "0x#{to_hex(lo.hex + 1)}..0x#{body_top} " + any_trailing
  end

  # With a leading FF the body is extended to include it, so no tail.
  pieces.concat(build_range(hi + "00" * trailing, stop)) unless hi_is_ff
  pieces
end
246
###
# Turns a codepoint range into the list of UTF-8 byte patterns that
# match it: the range is split with utf8_ranges, both ends of every
# piece are encoded with to_utf8_enc, and build_range expands each
# encoded pair.  Always returns an Array; it is empty when utf8_ranges
# produces no pieces (flat_map is used because flatten! would return
# nil in that case).

def to_utf8( range )
  utf8_ranges( range ).flat_map do |r|
    build_range(to_utf8_enc(r.begin), to_utf8_enc(r.end))
  end
end
254
##
# Counts how many values one generated pattern +code+ matches.  The
# pattern is split on spaces; every "0xA..0xB" element contributes the
# size of its range, any other element contributes a factor of one, and
# the factors are multiplied together.  With the utf8 encoding the range
# ends are decoded through from_utf8_enc first; otherwise they are read
# as plain hex.

def count_codepoints( code )
  factors = code.split(' ').map do |elt|
    m = /0x(.+)\.\.0x(.+)/.match(elt)
    next 1 unless m

    low, high = m[1], m[2]
    if @encoding == :utf8
      from_utf8_enc(high) - from_utf8_enc(low) + 1
    else
      high.hex - low.hex + 1
    end
  end

  factors.inject(1) { |product, factor| product * factor }
end
273
###
# Three-way consistency check for one chart entry.  Compares
#   * the count advertised in +desc+ as "[N]" (1 when absent),
#   * the size of the parsed +range+, and
#   * the sum of count_codepoints over the generated +codes+.
# Returns true only when all three agree.

def is_valid?( range, desc, codes )
  bracketed  = /\[(\d+)\]/.match(desc)
  advertised = bracketed ? bracketed[1].to_i : 1
  spanned    = range.end - range.begin + 1

  generated = codes.inject(0) { |total, code| total + count_codepoints(code) }
  generated == advertised && generated == spanned
end
282
##
# Writes one Ragel definition called +name+ to @output.  Every entry
# that each_alpha yields for +property+ from @chart_url is encoded for
# @encoding (to_ucs4 or to_utf8) and printed as one alternative per
# pattern.  The first pattern of an entry carries the entry's
# description as a trailing comment, shortened with "..." when it would
# not fit; alternatives after the very first one are prefixed with "|".

def generate_machine( name, property )
  lead = " "
  @output.puts "    #{name} = "

  each_alpha( @chart_url, property ) do |range, desc|
    codes = @encoding == :ucs4 ? to_ucs4(range) : to_utf8(range)

    # Validation is currently switched off:
    #raise "Invalid encoding of range #{range}: #{codes.inspect}" unless
    #  is_valid? range, desc, codes

    # The pattern column is at least RANGE_WIDTH wide; a wider pattern
    # takes the extra room away from the description.
    widest = codes.map(&:size).max
    column = widest < RANGE_WIDTH ? RANGE_WIDTH : widest
    room   = TOTAL_WIDTH - column - 11

    desc = desc[0..room - 4] + "..." if desc.size > room

    codes.each_with_index do |pattern, idx|
      note   = idx.zero? ? desc : ""
      padded = format("%-#{column}s", pattern)
      @output.puts "      #{lead} #{padded} ##{note}"
      lead = "|"
    end
  end

  @output.puts "      ;"
  @output.puts ""
end
316
# Emit the Ragel file to @output: a comment header describing how the
# file was produced, the opening of the machine section, one definition
# per requested property, and the closing marker.
banner = <<EOF
# The following Ragel file was autogenerated with #{$0}
# from: #{@chart_url}
#
# It defines #{properties}.
#
# To use this, make sure that your alphtype is set to #{ALPHTYPES[@encoding]},
# and that your input is in #{@encoding}.

%%{
    machine #{machine_name};

EOF
@output.puts banner

# Each property doubles as the name of the definition generated for it.
properties.each { |property| generate_machine( property, property ) }

@output.puts "}%%"
336