#!/usr/bin/env ruby

# Reads Unihan.txt and kanjidic2.xml, merges the per-character information
# found in both, and prints a C source file defining a static
# TomoeUnihanInfo table on standard output.
#
# Usage: <this script> Unihan.txt kanjidic2.xml > output.c

require 'tomoe-test-utils'

require 'uconv'
require 'suikyo/suikyo'

unihan_txt = ARGV.shift
kanjidic2_xml = ARGV.shift

# Fail early with a readable message instead of a TypeError from
# File.mtime(nil) deep inside the cache helper.
if unihan_txt.nil? || kanjidic2_xml.nil?
  abort("Usage: #{$0} Unihan.txt kanjidic2.xml")
end

DO_NOT_EDIT_HEADER = <<-EOH
/*
  DO NOT EDIT!
  THIS FILE IS GENERATED FROM Unihan.txt:
    ftp://ftp.unicode.org/Public/UNIDATA/Unihan.zip
*/
EOH

# Suikyo conversion tables used by the romaji_to_* helpers below.
@romaji_to_hiragana = Suikyo.new("romaji-kana")
@hiragana_to_katakana = Suikyo.new("hiragana-katakana")

# Runs the given block with $KCODE set to +code+ and restores the previous
# value afterwards, even when the block raises.
# Returns the value of the block.
def kcode(code)
  old_kcode = $KCODE
  $KCODE = code
  yield
ensure
  $KCODE = old_kcode
end

# Converts an EUC encoded string to UTF-8 using Uconv.
def euc_to_utf8(euc)
  Uconv.euctou8(euc)
end

# Converts +romaji+ to hiragana (returned as UTF-8).
# The conversion itself runs with $KCODE set to "e" and its result is passed
# through euc_to_utf8.
def romaji_to_hiragana(romaji)
  kcode("e") do
    # NOTE(review): the trailing space presumably forces Suikyo to flush a
    # pending final syllable -- confirm against the Suikyo documentation.
    euc_to_utf8(@romaji_to_hiragana.convert("#{romaji} "))
  end
end

# Converts +romaji+ to katakana (returned as UTF-8) by chaining the
# romaji -> hiragana and hiragana -> katakana tables.
def romaji_to_katakana(romaji)
  kcode("e") do
    hiragana = @romaji_to_hiragana.convert("#{romaji} ")
    euc_to_utf8(@hiragana_to_katakana.convert("#{hiragana} "))
  end
end

# Converts a code point given as a hexadecimal string (e.g. "4E00") to the
# UTF-8 string for that character.
# Integer() raises ArgumentError when +ucs4+ is not valid hexadecimal.
def ucs4_to_utf8(ucs4)
  TomoeTestUtils::Unicode.ucs4_to_utf8(Integer("0x#{ucs4}"))
end

# Converts a UTF-8 character to its code point, formatted as an upper-case
# hexadecimal string without any prefix.
def utf8_to_ucs4(utf8)
  "%X" % TomoeTestUtils::Unicode.utf8_to_ucs4(utf8)
end

# Escapes +string+ so that it can be placed between double quotes in the
# generated C source: backslashes and double quotes get a leading backslash.
# The block form of gsub is used so that the replacement is taken literally.
def escape_c_string(string)
  string.to_s.gsub(/[\\"]/) {|char| "\\#{char}"}
end

# Memoizes the result of the given block on disk.
#
# The result is stored, marshalled, in "<filename>.cache". The cached value
# is returned without running the block when the cache file exists and is
# newer than +filename+. A cache file that cannot be unmarshalled is ignored
# and regenerated.
#
# NOTE: the cache is read with Marshal.load, so the cache file must come
# from a trusted location.
def cache(filename)
  cache_path = "#{filename}.cache"
  if File.exist?(cache_path) and
      (File.mtime(cache_path) > File.mtime(filename))
    begin
      # Read in binary mode: the file is written with "wb" below.
      data = File.open(cache_path, "rb") {|f| f.read}
      return Marshal.load(data)
    rescue ArgumentError, TypeError
      # Truncated data raises ArgumentError, an incompatible or garbage
      # file raises TypeError. Either way fall through and rebuild.
    end
  end

  result = yield
  File.open(cache_path, "wb") {|f| f.print(Marshal.dump(result))}
  result
end

# Translates one Unihan property into the [key, value] pair stored for a
# character.
#
# Known properties are mapped to symbol keys (:variants, :ja_kuns, :ja_ons,
# :n_strokes) with converted values; every other property is returned
# unchanged as [key, value] with its original string key.
def parse_unihan_entry(key, value)
  case key
  when "kCompatibilityVariant"
    # NOTE(review): the pattern also strips an optional leading "2" after
    # "U+", which would turn e.g. "U+20B63" into "0B63". Kept as is because
    # the intent cannot be determined from this file -- confirm.
    [:variants, value.split.collect {|v| ucs4_to_utf8(v.sub(/^U\+2?/, ''))}]
  when "kJapaneseKun"
    [:ja_kuns, value.split.collect {|k| romaji_to_hiragana(k)}]
  when "kJapaneseOn"
    [:ja_ons, value.split.collect {|o| romaji_to_katakana(o)}]
  when "kTotalStrokes"
    [:n_strokes, value]
  else
    [key, value]
  end
end

# Parses +unihan_txt+ into a hash that maps upper-case hexadecimal code
# point strings to per-character info hashes (see parse_unihan_entry).
#
# Lines starting with "#" are skipped; lines that do not look like a Unihan
# record are reported on standard error. The result is cached on disk
# through #cache.
def parse_unihan_txt(unihan_txt)
  cache(unihan_txt) do
    infos = {}
    # File.foreach closes the file when iteration ends.
    File.foreach(unihan_txt) do |line|
      case line
      when /^#/
        next
      # Property names may contain digits (e.g. kGB0, kJis0, kJis1), so
      # digits are accepted in the key to keep such names intact.
      when /^U\+([\da-fA-F]+)\s+([a-zA-Z_\d]+)\s*(.*)\s*$/u
        ucs4 = $1.upcase
        key = $2
        value = $3

        infos[ucs4] ||= {}
        key, value = parse_unihan_entry(key, value)
        infos[ucs4][key] = value
      else
        STDERR.puts "Unknown line: #{line}"
      end
    end
    infos
  end
end

# Merges every character found in the kanjidic2 dictionary into +infos+.
#
# For each character: the stroke count is filled in when missing; readings,
# radicals and the variant are appended to the existing lists, which are
# then de-duplicated; meta data entries are copied into info[:meta_data].
# +infos+ is modified in place and also returned.
def merge_kanjidic2_xml(kanjidic2_xml, infos)
  dict = Tomoe::DictXML.new("filename" => kanjidic2_xml, "editable" => false)
  dict.search(Tomoe::Query.new).each do |cand|
    char = cand.char
    ucs4 = utf8_to_ucs4(char.utf8)
    info = infos[ucs4] || {}

    # A stroke count already taken from Unihan wins over kanjidic2.
    info[:n_strokes] ||= char.n_strokes

    info[:ja_ons] ||= []
    info[:ja_kuns] ||= []
    char.readings.each do |reading|
      case reading.type
      when Tomoe::Reading::JA_ON
        info[:ja_ons] << reading.reading
      when Tomoe::Reading::JA_KUN
        info[:ja_kuns] << reading.reading
      end
    end
    info[:ja_ons].uniq!
    info[:ja_kuns].uniq!

    info[:radicals] ||= []
    info[:radicals].concat(char.radicals)
    info[:radicals].uniq!

    if char.variant
      info[:variants] ||= []
      info[:variants] << char.variant
      info[:variants].uniq!
    end

    info[:meta_data] ||= {}
    char.each do |key, value|
      info[:meta_data][key] = value
    end

    infos[ucs4] = info
  end
  infos
end

# Prints one static C array definition.
# +declarator+ is the element type followed by the array name, and +rows+
# are the already formatted initializers, one per element.
def puts_c_array(declarator, rows)
  puts("static #{declarator}[] = {")
  rows.each do |row|
    puts("    #{row},")
  end
  puts("};")
end

# Returns the [pointer, size] expressions used in the info table for one of
# the per-character arrays: the array name with G_N_ELEMENTS() when the
# array was emitted, ["NULL", "0"] otherwise.
def c_array_reference(prefix, kind, ucs4, present)
  return ["NULL", "0"] unless present
  name = "#{prefix}#{kind}_#{ucs4}"
  [name, "G_N_ELEMENTS(#{name})"]
end

# Prints the generated C source on standard output.
#
# +infos+ is an enumerable of [ucs4, info] pairs. For every character the
# non-empty readings, radicals, variants and meta data are first emitted as
# static arrays, then a TomoeUnihanInfo table referring to them is emitted.
#
# Side effect: sets :have_readings, :have_radicals, :have_variants and
# :have_meta_data in the info hashes to remember which arrays were emitted.
def generate_header(infos)
  prefix = "tomoe_unihan_"

  puts <<-EOH
/* -*- Mode: C; tab-width: 4; indent-tabs-mode: nil; c-basic-offset: 4 -*- */
#{DO_NOT_EDIT_HEADER}

#include "tomoe-unihan.h"

#include <tomoe-char.h>

typedef struct _TomoeUnihanReading TomoeUnihanReading;
typedef struct _TomoeUnihanMetaData TomoeUnihanMetaData;
typedef struct _TomoeUnihanInfo TomoeUnihanInfo;

struct _TomoeUnihanReading {
    TomoeReadingType type;
    gchar *reading;
};

struct _TomoeUnihanMetaData {
    gchar *key;
    gchar *value;
};

struct _TomoeUnihanInfo {
    gchar *utf8;
    gint n_strokes;
    TomoeUnihanReading *readings;
    gint readings_size;
    gchar **radicals;
    gint radicals_size;
    gchar **variants;
    gint variants_size;
    TomoeUnihanMetaData *meta_data;
    gint meta_data_size;
};

EOH

  # First pass: the per-character arrays.
  infos.each do |ucs4, info|
    # Kun readings come first, then on readings.
    readings = []
    (info[:ja_kuns] || []).each do |kun|
      readings << ["TOMOE_READING_JA_KUN", kun]
    end
    (info[:ja_ons] || []).each do |on|
      readings << ["TOMOE_READING_JA_ON", on]
    end

    unless readings.empty?
      info[:have_readings] = true
      rows = readings.collect do |type, reading|
        "{#{type}, \"#{escape_c_string(reading)}\"}"
      end
      puts_c_array("TomoeUnihanReading #{prefix}readings_#{ucs4}", rows)
    end

    radicals = info[:radicals] || []
    unless radicals.empty?
      info[:have_radicals] = true
      rows = radicals.collect {|radical| "\"#{escape_c_string(radical)}\""}
      puts_c_array("gchar *#{prefix}radicals_#{ucs4}", rows)
    end

    variants = info[:variants] || []
    unless variants.empty?
      info[:have_variants] = true
      rows = variants.collect {|variant| "\"#{escape_c_string(variant)}\""}
      puts_c_array("gchar *#{prefix}variants_#{ucs4}", rows)
    end

    meta_data = info[:meta_data] || {}
    unless meta_data.empty?
      info[:have_meta_data] = true
      rows = meta_data.collect do |key, value|
        "{\"#{escape_c_string(key)}\", \"#{escape_c_string(value)}\"}"
      end
      puts_c_array("TomoeUnihanMetaData #{prefix}meta_data_#{ucs4}", rows)
    end
  end

  # Second pass: the table that ties everything together.
  puts("static TomoeUnihanInfo #{prefix}infos[] = {")
  infos.each do |ucs4, info|
    utf8 = escape_c_string(ucs4_to_utf8(ucs4))
    n_strokes = info[:n_strokes] || -1
    readings, readings_size =
      c_array_reference(prefix, "readings", ucs4, info[:have_readings])
    radicals, radicals_size =
      c_array_reference(prefix, "radicals", ucs4, info[:have_radicals])
    variants, variants_size =
      c_array_reference(prefix, "variants", ucs4, info[:have_variants])
    meta_data, meta_data_size =
      c_array_reference(prefix, "meta_data", ucs4, info[:have_meta_data])

    puts("    {\"#{utf8}\", #{n_strokes}, #{readings}, #{readings_size},")
    puts("     #{radicals}, #{radicals_size}, #{variants}, #{variants_size},")
    puts("     #{meta_data}, #{meta_data_size}},")
  end
  puts("};")
end

infos = parse_unihan_txt(unihan_txt)
# NOTE(review): keys are hexadecimal strings, so this ordering is
# lexicographic ("20000" sorts before "4E00"), not numeric -- confirm that
# consumers of the generated table do not rely on code point order.
infos = merge_kanjidic2_xml(kanjidic2_xml, infos).sort_by do |ucs4, _info|
  ucs4
end
generate_header(infos)