1#!/usr/bin/env ruby
2
3require 'tomoe-test-utils'
4
5require 'uconv'
6require 'suikyo/suikyo'
7
# Command line: the first argument is the path to Unihan.txt, the second
# the path to the kanjidic2 XML file.  Both are consumed from ARGV.
unihan_txt, kanjidic2_xml = ARGV.shift, ARGV.shift
10
# C comment warning readers that the output is machine-generated.  It is
# interpolated into the preamble printed by generate_header.
DO_NOT_EDIT_HEADER = <<-BANNER
/*
  DO NOT EDIT!
  THIS FILE IS GENERATED FROM Unihan.txt:
    ftp://ftp.unicode.org/Public/UNIDATA/Unihan.zip
*/
BANNER
18
# Suikyo converters shared by romaji_to_hiragana and romaji_to_katakana:
# one built from the "romaji-kana" table, one from "hiragana-katakana".
@romaji_to_hiragana, @hiragana_to_katakana =
  ["romaji-kana", "hiragana-katakana"].collect {|table| Suikyo.new(table)}
21
# Runs the given block with $KCODE set to +code+ and puts the previous
# $KCODE value back afterwards, even when the block raises.
#
# Returns whatever the block returns.
def kcode(code)
  saved_kcode = $KCODE
  begin
    $KCODE = code
    yield
  ensure
    $KCODE = saved_kcode
  end
end
29
# Converts +euc+ with Uconv.euctou8 and returns the converted string.
def euc_to_utf8(euc)
  utf8 = Uconv.euctou8(euc)
  utf8
end
33
# Converts +romaji+ with the @romaji_to_hiragana converter and returns the
# result passed through euc_to_utf8.
#
# The conversion runs under kcode("e"), and a single space is appended to
# the input before it is handed to the converter.
def romaji_to_hiragana(romaji)
  kcode("e") {
    hiragana_euc = @romaji_to_hiragana.convert("#{romaji} ")
    euc_to_utf8(hiragana_euc)
  }
end
39
# Converts +romaji+ in two steps -- first with the @romaji_to_hiragana
# converter, then with the @hiragana_to_katakana converter -- and returns
# the result passed through euc_to_utf8.
#
# Both conversions run under kcode("e"), and a single space is appended to
# the input of each converter.
def romaji_to_katakana(romaji)
  kcode("e") {
    hiragana_euc = @romaji_to_hiragana.convert("#{romaji} ")
    katakana_euc = @hiragana_to_katakana.convert("#{hiragana_euc} ")
    euc_to_utf8(katakana_euc)
  }
end
46
# Turns +ucs4+, a string of hexadecimal digits, into an integer and
# returns what TomoeTestUtils::Unicode.ucs4_to_utf8 gives for it.
#
# Raises ArgumentError (from Kernel#Integer) when +ucs4+ is not valid
# hexadecimal.
def ucs4_to_utf8(ucs4)
  code_point = Integer("0x#{ucs4}")
  TomoeTestUtils::Unicode.ucs4_to_utf8(code_point)
end
50
# Returns the value of TomoeTestUtils::Unicode.utf8_to_ucs4(+utf8+)
# formatted as upper-case hexadecimal without any prefix or padding.
def utf8_to_ucs4(utf8)
  code_point = TomoeTestUtils::Unicode.utf8_to_ucs4(utf8)
  "%X" % code_point
end
54
# Memoizes the value of the given block in "<filename>.cache".
#
# When the cache file exists and is strictly newer than +filename+, the
# marshalled value stored in it is returned and the block is not run.
# Otherwise the block is run, its value is marshalled into the cache file
# and returned.  A cache file that cannot be unmarshalled is ignored and
# rebuilt.
#
# filename:: path of the source file the cached value is derived from.
#
# Returns the cached or freshly computed value.
def cache(filename)
  cache_file = "#{filename}.cache"
  # File.exist? instead of the deprecated File.exists?, which no longer
  # exists in current Ruby versions.
  if File.exist?(cache_file) && File.mtime(cache_file) > File.mtime(filename)
    begin
      # Marshal data is binary, so read it in binary mode.
      # NOTE: Marshal.load must only ever see trusted data; the cache file
      # is expected to have been written by this script.
      return Marshal.load(File.open(cache_file, "rb") {|f| f.read})
    rescue ArgumentError, TypeError
      # Truncated, corrupt or incompatible cache: fall through and rebuild.
    end
  end

  result = yield
  File.open(cache_file, "wb") {|f| f.write(Marshal.dump(result))}
  result
end
68
# Translates one Unihan field into the [key, value] pair stored for a
# character.
#
# key::   Unihan field name, e.g. "kJapaneseOn".
# value:: raw field value as a string.
#
# Returns:
# * "kCompatibilityVariant" -> [:variants, array of ucs4_to_utf8 results,
#   one per whitespace-separated "U+XXXX" item]
# * "kJapaneseKun" -> [:ja_kuns, each item through romaji_to_hiragana]
# * "kJapaneseOn"  -> [:ja_ons, each item through romaji_to_katakana]
# * "kTotalStrokes" -> [:n_strokes, value] (value is left as a string)
# * anything else  -> [key, value] unchanged
def parse_unihan_entry(key, value)
  case key
  when "kCompatibilityVariant"
    variants = value.split.collect do |code_point|
      # Strip only the "U+" marker.  The leading hex digit belongs to the
      # code point: dropping a "2" would turn U+20122 into U+0122.
      ucs4_to_utf8(code_point.sub(/^U\+/, ''))
    end
    [:variants, variants]
  when "kJapaneseKun"
    [:ja_kuns, value.split.collect {|kun| romaji_to_hiragana(kun)}]
  when "kJapaneseOn"
    [:ja_ons, value.split.collect {|on| romaji_to_katakana(on)}]
  when "kTotalStrokes"
    [:n_strokes, value]
  else
    [key, value]
  end
end
83
# Parses the Unihan data file at +unihan_txt+.  The result is memoized on
# disk through cache.
#
# Lines starting with "#" are skipped.  Lines of the form
# "U+<hex> <field> <value>" are passed through parse_unihan_entry and
# stored; any other line is reported on STDERR as "Unknown line".
#
# unihan_txt:: path of the file to read.
#
# Returns a Hash that maps the upper-cased hexadecimal code point string
# to a Hash of the key/value pairs returned by parse_unihan_entry.
def parse_unihan_txt(unihan_txt)
  cache(unihan_txt) do
    infos = {}
    # The block form closes the file handle once parsing is done.
    File.open(unihan_txt) do |input|
      input.each do |line|
        case line
        when /^#/
          next
        # The value is matched lazily so that trailing whitespace (spaces
        # or the "\r" of a CRLF line ending) is not kept as part of it.
        when /^U\+([\da-fA-F]+)\s+([a-zA-Z_]+)\s*(.*?)\s*$/u
          ucs4 = $1.upcase
          key = $2
          value = $3

          infos[ucs4] ||= {}
          key, value = parse_unihan_entry(key, value)
          infos[ucs4][key] = value
        else
          STDERR.puts "Unknown line: #{line}"
        end
      end
    end
    infos
  end
end
106
# Folds the characters of the dictionary at +kanjidic2_xml+ into +infos+.
#
# The file is opened as a non-editable Tomoe::DictXML and searched with an
# empty Tomoe::Query.  For every candidate's character the entry keyed by
# its utf8_to_ucs4 value is updated (or created):
#
# * :n_strokes is taken from the character only when not already set
# * JA_ON / JA_KUN readings are appended to :ja_ons / :ja_kuns; other
#   reading types are ignored
# * the character's radicals are appended to :radicals
# * its variant, when present, is appended to :variants
# * every key/value pair the character yields is stored in :meta_data
#
# Duplicates are removed from the list-valued entries.
#
# Returns +infos+, which is modified in place.
def merge_kanjidic2_xml(kanjidic2_xml, infos)
  dictionary = Tomoe::DictXML.new("filename" => kanjidic2_xml,
                                  "editable" => false)
  dictionary.search(Tomoe::Query.new).each do |candidate|
    character = candidate.char
    code_point = utf8_to_ucs4(character.utf8)
    entry = infos[code_point] || {}

    entry[:n_strokes] ||= character.n_strokes

    entry[:ja_ons] ||= []
    entry[:ja_kuns] ||= []
    character.readings.each do |reading|
      bucket = case reading.type
               when Tomoe::Reading::JA_ON then :ja_ons
               when Tomoe::Reading::JA_KUN then :ja_kuns
               end
      entry[bucket] << reading.reading if bucket
    end
    [:ja_ons, :ja_kuns].each {|list| entry[list].uniq!}

    (entry[:radicals] ||= []).concat(character.radicals).uniq!

    variant = character.variant
    if variant
      (entry[:variants] ||= []) << variant
      entry[:variants].uniq!
    end

    meta_data = (entry[:meta_data] ||= {})
    character.each {|key, value| meta_data[key] = value}

    infos[code_point] = entry
  end
  infos
end
148
# Escapes +text+ for use inside a double-quoted C string literal:
# backslashes and double quotes get a leading backslash and newlines
# become the two-character sequence \n.
def c_string_literal_escape(text)
  text.to_s.gsub(/[\\"]/) {|char| "\\#{char}"}.gsub(/\n/) {"\\n"}
end

# Prints a static C array definition "static <declarator>[] = {...};"
# with one indented, comma-terminated line per element of +rows+.
def puts_c_array(declarator, rows)
  puts("static #{declarator}[] = {")
  rows.each {|row| puts("    #{row},")}
  puts("};")
end

# Prints the generated C source to standard output.
#
# The output consists of a fixed preamble (including DO_NOT_EDIT_HEADER
# and the struct definitions), then for every character one static array
# per non-empty list (readings, radicals, variants, meta data), and
# finally the "tomoe_unihan_infos" table that references those arrays, or
# NULL / 0 for empty lists.
#
# infos:: enumerable of [ucs4, info] pairs.  +ucs4+ is the hexadecimal
#         code point string used in the array names and passed to
#         ucs4_to_utf8.  +info+ is a Hash read through the keys
#         :n_strokes (-1 is emitted when missing), :ja_kuns, :ja_ons,
#         :radicals, :variants and :meta_data.
#
# All strings are escaped before they are placed in C string literals, and
# the +info+ hashes are left unmodified.
def generate_header(infos)
  prefix = "tomoe_unihan_"

  puts <<-EOH
/* -*- Mode: C; tab-width: 4; indent-tabs-mode: nil; c-basic-offset: 4 -*- */
#{DO_NOT_EDIT_HEADER}

#include "tomoe-unihan.h"

#include <tomoe-char.h>

typedef struct _TomoeUnihanReading TomoeUnihanReading;
typedef struct _TomoeUnihanMetaData TomoeUnihanMetaData;
typedef struct _TomoeUnihanInfo TomoeUnihanInfo;

struct _TomoeUnihanReading {
    TomoeReadingType  type;
    gchar            *reading;
};

struct _TomoeUnihanMetaData {
    gchar *key;
    gchar *value;
};

struct _TomoeUnihanInfo {
    gchar                *utf8;
    gint                  n_strokes;
    TomoeUnihanReading   *readings;
    gint                  readings_size;
    gchar               **radicals;
    gint                  radicals_size;
    gchar               **variants;
    gint                  variants_size;
    TomoeUnihanMetaData  *meta_data;
    gint                  meta_data_size;
};

EOH

  # Emit the per-character arrays and remember, for each character, the
  # lines of its entry in the final table.
  info_rows = infos.collect do |ucs4, info|
    readings = []
    # Kun readings come before on readings.
    [[:ja_kuns, "TOMOE_READING_JA_KUN"],
     [:ja_ons, "TOMOE_READING_JA_ON"]].each do |info_key, type|
      (info[info_key] || []).each do |reading|
        readings << "{#{type}, \"#{c_string_literal_escape(reading)}\"}"
      end
    end
    radicals = (info[:radicals] || []).collect do |radical|
      "\"#{c_string_literal_escape(radical)}\""
    end
    variants = (info[:variants] || []).collect do |variant|
      "\"#{c_string_literal_escape(variant)}\""
    end
    meta_data = (info[:meta_data] || {}).collect do |key, value|
      "{\"#{c_string_literal_escape(key)}\", " +
        "\"#{c_string_literal_escape(value)}\"}"
    end

    lists = [["TomoeUnihanReading ", "readings", readings],
             ["gchar *", "radicals", radicals],
             ["gchar *", "variants", variants],
             ["TomoeUnihanMetaData ", "meta_data", meta_data]]
    # [array name, element count] for each list; NULL / 0 when empty.
    references = lists.collect do |c_type, kind, rows|
      if rows.empty?
        ["NULL", "0"]
      else
        name = "#{prefix}#{kind}_#{ucs4}"
        puts_c_array("#{c_type}#{name}", rows)
        [name, "G_N_ELEMENTS(#{name})"]
      end
    end

    utf8 = ucs4_to_utf8(ucs4)
    n_strokes = info[:n_strokes] || -1
    fields = references.flatten
    ["    {\"#{utf8}\", #{n_strokes}, #{fields[0]}, #{fields[1]},",
     "     #{fields[2]}, #{fields[3]}, #{fields[4]}, #{fields[5]},",
     "     #{fields[6]}, #{fields[7]}},"]
  end

  puts("static TomoeUnihanInfo #{prefix}infos[] = {")
  info_rows.each do |lines|
    lines.each {|line| puts(line)}
  end
  puts("};")
end
269
# Driver: parse Unihan.txt, merge the kanjidic2 data into it, order the
# entries by their code point string and print the generated C source.
unihan_infos = parse_unihan_txt(unihan_txt)
merged_infos = merge_kanjidic2_xml(kanjidic2_xml, unihan_infos)
infos = merged_infos.to_a.sort_by {|ucs4, _info| ucs4}
generate_header(infos)
277