1#!/usr/bin/env python 2# -*- coding: utf-8 -*- 3# 4# Script for decoding Lingea Dictionary (.trd) file 5# Result is <header>\t<definition> file, convertable easily 6# by stardict-editor from package stardict-tools into native 7# Stardict dictionary (stardict.sf.net and www.stardict.org) 8# 9# Copyright (C) 2007 - Klokan Petr Přidal (www.klokan.cz) 10# 11# Based on script CobuildConv.rb by Nomad 12# http://hp.vector.co.jp/authors/VA005784/cobuild/cobuildconv.html 13# 14# Version history: 15# 0.4 (30.10.2007) Patch by Petr Dlouhy, optional HTML generation 16# 0.3 (28.10.2007) Patch by Petr Dlouhy, cleanup, bugfix. More dictionaries. 17# 0.2 (19.7.2007) Changes, documentation, first 100% dictionary 18# 0.1 (20.5.2006) Initial version based on Nomad specs 19# 20# Supported dictionaries: 21# - Lingea Německý Kapesní slovník 22# - Lingea Anglický Kapesní slovník 23# - Lingea 2002 series (theoretically) 24# 25# Modified by: 26# - Petr Dlouhy (petr.dlouhy | email.cz) 27# Generalization of data block rules, sampleFlag 0x04, sound out fix, data phrase prefix with comment (0x04) 28# HTML output, debugging patch, options on command line 29# 30# <write your name here> 31# 32# This library is free software; you can redistribute it and/or 33# modify it under the terms of the GNU Library General Public 34# License as published by the Free Software Foundation; either 35# version 2 of the License, or (at your option) any later version. 36# 37# This library is distributed in the hope that it will be useful, 38# but WITHOUT ANY WARRANTY; without even the implied warranty of 39# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU 40# Library General Public License for more details. 41# 42# You should have received a copy of the GNU Library General Public 43# License along with this library; if not, write to the 44# Free Software Foundation, Inc., 59 Temple Place - Suite 330, 45# Boston, MA 02111-1307, USA. 46 47# VERSION 48VERSION = "0.4" 49 50import getopt, sys 51def usage(): 52 print "Lingea Dictionary Decoder" 53 print "-------------------------" 54 print "Version: %s" % VERSION 55 print "Copyright (C) 2007 - Klokan Petr Pridal, Petr Dlouhy" 56 print 57 print "Usage: python lingea-trd-decoder.py DICTIONARY.trd > DICTIONARY.tab" 58 print "Result convertion by stardict-tools: /usr/lib/stardict-tools/tabfile" 59 print 60 print " -o <num> --out-style : Output style" 61 print " 0 no tags" 62 print " 1 \\n tags" 63 print " 2 html tags" 64 print " -h --help : Print this message" 65 print " -d --debug : Degub" 66 print " -r --debug-header : Degub - print headers" 67 print " -a --debug-all : Degub - print all records" 68 print " -l --debug-limit : Degub limit" 69 print 70 print "For HTML support in StarDict dictionary .ifo has to contain:" 71 print "sametypesequence=g" 72 print "!!! Change the .ifo file after generation by tabfile !!!" 73 print 74 75try: 76 opts, args = getopt.getopt(sys.argv[1:], "hdo:ral:", ["help", "debug", "out-style=", "debug-header", "debug-all", "debug-limit="]) 77except getopt.GetoptError: 78 usage() 79 print "ERROR: Bad option" 80 sys.exit(2) 81 82import locale 83DEBUG = False 84OUTSTYLE = 2 85DEBUGHEADER = False 86DEBUGALL = False 87DEBUGLIMIT = 1 88for o, a in opts: 89 if o in ("-d", "-debug"): 90 # DEBUGING !!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!! 91 DEBUG = True 92 if o in ("-o", "--out-style"): 93 # output style 94 OUTSTYLE = locale.atoi(a) 95 if OUTSTYLE > 2: 96 usage() 97 print "ERROR: Output style not specified" 98 if o in ("-r", "--debug-header"): 99 # If DEBUG and DEBUGHEADER, then print just all header records 100 DEBUGHEADER = True 101 if o in ("-a", "--debug-all"): 102 # If DEBUG and DEBUGALL then print debug info for all records 103 DEBUGALL = True 104 if o in ("-h", "--help"): 105 usage() 106 sys.exit(0) 107 if o in ("-l", "--debug-limit"): 108 # Number of wrong records for printing to stop during debugging 109 DEBUGLIMIT = locale.atoi(a) 110# FILENAME is a first parameter on the commandline now 111 112if len(args) == 1: 113 FILENAME = args[0] 114else: 115 usage() 116 print "ERROR: You have to specify .trd file to decode" 117 sys.exit(2) 118 119from struct import * 120import re 121 122alpha = ['\x00', 'a','b','c','d','e','f','g','h','i', 123 'j','k','l','m','n','o','p','q','r','s', 124 't','u','v','w','x','y','z','#AL27#','#AL28#','#AL29#', 125 '#AL30#','#AL31#', ' ', '.', '<', '>', ',', ';', '-', '#AL39#', 126 '#GRAVE#', '#ACUTE#', '#CIRC#', '#TILDE#', '#UML#', '#AL45#', '#AL46#', '#CARON#', '#AL48#', '#CEDIL#', 127 '#AL50#', '#AL51#', '#GREEK#', '#AL53#', '#AL54#', '#AL55#', '#AL56#', '#AL57#', '#AL58#', '#SYMBOL#', 128 '#AL60#', '#UPCASE#', '#SPECIAL#', '#UNICODE#'] # 4 bytes after unicode 129 130upcase = ['#UP0#','#UP1#','#UP2#','#UP3#','#UP4#','#UP5#','#UP6#','#UP7#','#UP8#','#UP9#', 131 '#UP10#','#UP11#','#UP12#','#UP13#','#UP14#','#UP15#','#UP16#','#UP17#','#UP18#','#UP19#', 132 '#UP20#','#UP21#','#UP22#','#UP23#','#UP24#','#UP25#','#UP26#','#UP27#','#UP28#','#UP29#', 133 '#UP30#','#UP31#','A','B','C','D','E','F','G','H', 134 'I','J','K','L','M','N','O','P','Q','R', 135 'S','T','U','V','W','X','Y','Z','#UP58#','#UP59#', 136 '#UP60#','#UP61#','#UP62#','#UP63#'] 137 138upcase_pron = ['#pr0#', '#pr1#','#pr2#','#pr3#','#pr4#','#pr5#','#pr6#','#pr7#','#pr8#','#pr9#', 139 '#pr10#', '#pr11#','#pr12#','#pr13#','#pr14#','#pr15#','#pr16#','#pr17#','#pr18#','#pr19#', 140 '#pr20#', '#pr21#','#pr22#','#pr23#','#pr24#','#pr25#','#pr26#','#pr27#','#pr28#','#pr29#', 141 '#pr30#', '#pr31#','ɑ','#pr33#','ʧ','ð','ə','ɜ','#pr38#','æ', 142 'ɪ', 'ɭ','#pr42#','ŋ','#pr44#','ɳ','ɔ','#pr47#','ɒ','ɽ', 143 'ʃ', 'θ','ʊ','ʌ','#pr54#','#pr55#','#pr56#','ʒ','#pr58#','#pr59#', 144 '#pr60#', '#pr61#','#pr62#','#pr63#'] 145 146symbol = ['#SY0#', '#SY1#','#SY2#','#SY3#','§','#SY5#','#SY6#','#SY7#','#SY8#','#SY9#', 147 '#SY10#', '#SY11#','#SY12#','#SY13#','#SY14#','™','#SY16#','#SY17#','¢','£', 148 '#SY20#', '#SY21#','#SY22#','#SY23#','©','#SY25#','#SY26#','#SY27#','®','°', 149 '#SY30#', '²','³','#SY33#','#SY34#','#SY35#','¹','#SY37#','#SY38#','#SY39#', 150 '½', '#SY41#','#SY42#','×','÷','#SY45#','#SY46#','#SY47#','#SY48#','#SY49#', 151 '#SY50#', '#SY51#','#SY52#','#SY53#','#SY54#','#SY55#','#SY56#','#SY57#','#SY58#','#SY59#', 152 '#SY60#', '#SY61#','#SY62#','#SY63#'] 153 154special = ['#SP0#', '!','"','#','$','%','&','\'','(',')', 155 '*', '+','#SP12#','#SP13#','#SP14#','/','0','1','2','3', 156 '4', '5','6','7','8','9',':',';','<','=', 157 '>', '?','@','[','\\',']','^','_','`','{', 158 '|', '}','~','#SP43#','#SP44#','#SP45#','#SP46#','#SP47#','#SP48#','#SP49#', 159 '#SP50#', '#SP51#','#SP52#','#SP53#','#SP54#','#SP55#','#SP56#','#SP57#','#SP58#','#SP59#', 160 '#SP60#', '#SP61#','#SP62#','#SP63#'] 161 162wordclass = ('#0#','n:','adj:','pron:','#4#','v:','adv:','prep:','#8#','#9#', 163 'intr:','phr:','#12#','#13#','#14#','#15#','#16#','#17#','#18#','#19#', 164 '#20#','#21#','#22#','#23#','#24#','#25#','#26#','#27#','#28#','#29#', 165 '#30#','#31#') 166 167if OUTSTYLE == 0: 168 tag = { 169 'db':('' ,''), #Data begining 170 'rn':('' ,'\t'), #Record name 171 'va':('' ,' '), #Header variant 172 'wc':('(' ,')'), #WordClass 173 'pa':('' ,' '), #Header parts 174 'fo':('(' ,') '), #Header forms 175 'on':('(' ,')' ), #Header origin note 176 'pr':('[' ,']'), #Header pronunciation 177 'dv':('{' ,'} '), #Header dataVariant 178 'sa':('`' ,'`' ), #Data sample 179 'sw':('' ,''), #Data sample wordclass; is no printed by Lingea 180 'do':('`' ,'`' ), #Data origin note 181 'df':('' ,' '), #Data definition 182 'ps':('"' ,'" '), #Data phrase short form 183 'pg':('"' ,' = '), #Data phrase green 184 'pc':('`' ,'`'), #Data phrase comment; this comment is not printed by Lingea), but it seems useful 185 'p1':('"' ,' = '), #Data phrase 1 186 'p2':('' ,'" ' ), #Data phrase 2 187 'sp':('"' ,' = ' ),#Data simple phrase 188 'b1':('"' ,' = '), #Data phrase (block) 1 189 'b2':('" ' ,''), #Data phrase (block) 2 190 } 191if OUTSTYLE == 1: 192 tag = { 193 'db':('•' ,''), #Data begining 194 'rn':('' ,'\t'), #Record name 195 'va':('' ,' '), #Header variant 196 'wc':('' ,'\\n'), #WordClass 197 'pa':('' ,':\\n'), #Header parts 198 'fo':('(' ,') '), #Header forms 199 'on':('(' ,')\\n' ), #Header origin note 200 'pr':('[' ,']\\n'), #Header pronunciation 201 'dv':('{' ,'} '), #Header dataVariant 202 'sa':(' ' ,'\\n' ), #Data sample 203 'sw':('' ,''), #Data sample wordclass; is not printed by Lingea 204 'do':(' ' ,' ' ), #Data origin note 205 'df':(' ' ,'\\n'), #Data definition 206 'ps':(' ' ,'\\n'), #Data phrase short form 207 'pg':(' ' ,' '), #Data phrase green 208 'pc':(' ' ,' '), #Data phrase comment; this comment is not printed by Lingea), but it seems useful 209 'p1':(' ' ,' '), #Data phrase 1 210 'p2':(' ' ,'\\n' ), #Data phrase 2 211 'sp':('' ,'\\n' ), #Data simple phrase 212 'b1':('"' ,' = '), #Data phrase (block) 1 213 'b2':('" ' ,''), #Data phrase (block) 2 214 } 215if OUTSTYLE == 2: 216 tag = { 217 'db':('•' ,''), #Data begining 218 'rn':('' ,'\t'), #Record name 219 'va':('' ,' '), #Header variant 220 'wc':('<span size="larger" color="darkred" weight="bold">','</span>\\n'), #WordClass 221 'pa':('<span size="larger" color="darkred" weight="bold">',':</span>\\n'), #Header parts 222 'fo':('(' ,') '), #Header forms 223 'on':('<span color="blue">(' ,')</span>\\n' ), #Header origin note 224 'pr':('[' ,']\\n'), #Header pronunciation 225 'dv':('{' ,'} '), #Header dataVariant 226 'sa':(' <span color="darkred" weight="bold">' ,'</span>\\n' ), #Data sample 227 'sw':('' ,''), #Data sample wordclass; is not printed by Lingea 228 'do':(' <span color="darkred" weight="bold">' ,'</span> ' ), #Data origin note 229 'df':(' <span weight="bold">' ,'</span>\\n'), #Data definition 230 'ps':(' <span color="dimgray" weight="bold">' ,'</span>\\n'), #Data phrase short form 231 'pg':(' <span color="darkgreen" style="italic">' ,'</span> '), #Data phrase green 232 'pc':(' <span color="darkgreen" style="italic">' ,'</span> '), #Data phrase comment; this comment is not printed by Lingea), but it seems useful 233 'p1':(' <span color="dimgray" style="italic">' ,'</span> '), #Data phrase 1 234 'p2':(' ' ,'\\n' ), #Data phrase 2 235 'sp':('<span color="cyan">' ,'</span>\\n' ), #Data simple phrase 236 'b1':('"' ,' = '), #Data phrase (block) 1 237 'b2':('" ' ,''), #Data phrase (block) 2 238 } 239 240 241 242# Print color debug functions 243purple = lambda c: '\x1b[1;35m'+c+'\x1b[0m' 244blue = lambda c: '\x1b[1;34m'+c+'\x1b[0m' 245cyan = lambda c: '\x1b[36m'+c+'\x1b[0m' 246gray = lambda c: '\x1b[1m'+c+'\x1b[0m' 247 248def getRec(n): 249 """Get data stream for record of given number""" 250 if n >= 0 and n < entryCount: 251 f.seek(index[n]) 252 return f.read(index[n+1] - index[n]) 253 else: 254 return '' 255 256def decode_alpha( stream, nullstop=True): 257 """Decode 6-bit encoding data stream from the begining untit first NULL""" 258 offset = 0 259 triple = 0 260 result = [] 261 while triple < len( stream ): 262 if offset % 4 == 0: 263 c = stream[triple] >> 2 264 triple += 1 265 if offset % 4 == 1: 266 c = (stream[triple-1] & 3) << 4 | stream[triple] >> 4 267 triple += 1 268 if offset % 4 == 2: 269 c = (stream[triple-1] & 15) << 2 | (stream[triple] & 192) >> 6 270 triple += 1 271 if offset % 4 == 3: 272 c = stream[triple-1] & 63 273 if c == 0 and nullstop: 274 break 275 offset += 1 276 # TODO: ENCODE UNICODE 4 BYTE STREAM!!! and but it after #UNICODE# as unichr() 277 result.append(c) 278 return decode_alpha_postprocessing(result), triple - 1 279 280 281def decode_alpha_postprocessing( input ): 282 """Lowlevel alphabet decoding postprocessing, combines tuples into one character""" 283 result = "" 284 input.extend([0x00]*5) 285 286 # UPCASE, UPCASE_PRON, SYMBOL, SPECIAL 287 skip = False 288 for i in range(0,len(input)-1): 289 if skip: 290 skip = False 291 continue 292 293 bc = input[i] 294 c = alpha[bc] 295 bc1 = input[i+1] 296 c1 = alpha[bc1] 297 298 if bc < 40: 299 result += c 300 else: 301 if c == "#GRAVE#": 302 if c1 == 'a': result += 'à' 303 else: result += '#GRAVE%s#' % c1 304 elif c == "#UML#": 305 if c1 == 'o': result += 'ö' 306 elif c1 == 'u': result += 'ü' 307 elif c1 == 'a': result += 'ä' 308 elif c1 == ' ': result += 'Ä' 309 elif c1 == '#AL46#': result += 'Ö' 310 elif c1 == '#GREEK#': result += 'Ü' 311 else: result += '#UML%s#' % c1 312 elif c == "#ACUTE#": 313 if c1 == 'a': result += 'á' 314 elif c1 == 'e': result += 'é' 315 elif c1 == 'i': result += 'í' 316 elif c1 == 'o': result += 'ó' 317 elif c1 == 'u': result += 'ú' 318 elif c1 == 'y': result += 'ý' 319 elif c1 == ' ': result += 'Á' 320 elif c1 == '#GRAVE#': result += 'Í' 321 else: result += '#ACUTE%s#' % c1 322 elif c == "#CARON#": 323 if c1 == 'r': result += 'ř' 324 elif c1 == 'c': result += 'č' 325 elif c1 == 's': result += 'š' 326 elif c1 == 'z': result += 'ž' 327 elif c1 == 'e': result += 'ě' 328 elif c1 == 'd': result += 'ď' 329 elif c1 == 't': result += 'ť' 330 elif c1 == 'a': result += 'å' 331 elif c1 == 'u': result += 'ů' 332 elif c1 == 'n': result += 'ň' 333 elif c1 == '<': result += 'Č' 334 elif c1 == '#CEDIL#': result += 'Ř' 335 elif c1 == '#AL50#': result += 'Š' 336 elif c1 == '#AL57#': result += 'Ž' 337 else: result += '#CARON%s#' % c1 338 elif c == "#UPCASE#": 339 result += upcase[bc1] 340 elif c == "#SYMBOL#": 341 result += symbol[bc1] 342 elif c == "#AL51#": 343 if c1 == 's': result += 'ß' 344 elif c == "#AL48#": 345 result += "#AL48#%s" % c1 346 elif c == "#SPECIAL#": 347 result += special[bc1] 348 elif c == "#UNICODE#": 349 result += '#UNICODE%s#' % bc1 350 elif c == "#CIRC#": 351 if c1 == 'a': result += 'â' 352 else: result += '#CARON%s#' % c1 353 else: 354 result += '%sX%s#' % (c[:-1], bc1) 355 skip = True 356 return result 357 358def pronunciation_encode(s): 359 """Encode pronunciation upcase symbols into IPA symbols""" 360 for i in range(0, 64): 361 s = s.replace(upcase[i], upcase_pron[i]) 362 return s 363 364re_d = re.compile(r'<d(.*?)>') 365re_w = re.compile(r'<w(.*?)>') 366re_y = re.compile(r'<y(.*?)>') 367re_c = re.compile(r'<c(.*?)>') 368 369def decode_tag_postprocessing(input): 370 """Decode and replace tags used in lingea dictionaries; decode internal tags""" 371 s = input 372 373 # General information in http://www.david-zbiral.cz/El-slovniky-plnaverze.htm#_Toc151656799 374 # TODO: Better output handling 375 376 if OUTSTYLE == 0: 377 # ?? <d...> 378 s = re_d.sub(r'(\1)',s) 379 # ?? <w...> 380 s = re_w.sub(r'(\1)',s) 381 # ?? <y...> 382 s = re_y.sub(r'(\1)',s) 383 # ?? <c...> 384 s = re_c.sub(r'(\1)',s) 385 # ... 386 if OUTSTYLE == 1: 387 # ?? <d...> 388 s = re_d.sub(r'(\1)',s) 389 # ?? <w...> 390 s = re_w.sub(r'(\1)',s) 391 # ?? <y...> 392 s = re_y.sub(r'(\1)',s) 393 # ?? <c...> 394 s = re_c.sub(r'(\1)',s) 395 # ... 396 if OUTSTYLE == 2: 397 # ?? <d...> 398 s = re_d.sub(r'<span size="small" color="blue">(\1)</span>',s) 399 # ?? <w...> 400 s = re_w.sub(r'<span size="small" color="blue" style="italic">\1</span>',s) 401 # ?? <y...> 402 s = re_y.sub(r'<span size="small" color="blue" style="italic">\1</span>',s) 403 # ?? <c...> 404 s = re_c.sub(r'<span size="small" color="blue" style="italic">\1</span>',s) 405 # ... 406 407 return s 408 409def toBin( b ): 410 """Prettify debug output format: hex(bin)dec""" 411 original = b 412 r = 0; 413 i = 1; 414 while b > 0: 415 if b & 0x01 != 0: r += i 416 i *= 10 417 b = b >> 1 418 return "0x%02X(%08d)%03d" % (original, r, original) 419 420 421def out( comment = "", skip = False): 422 """Read next byte or string (with skip=True) and output DEBUG info""" 423 global bs, pos 424 s, triple = decode_alpha(bs[pos:]) 425 s = s.split('\x00')[0] # give me string until first NULL 426 if (comment.find('%') != -1): 427 if skip: 428 comment = comment % s 429 else: 430 comment = comment % bs[pos] 431 if DEBUG: print "%03d %s %s | %s | %03d" % (pos, toBin(bs[pos]),comment, s, (triple + pos)) 432 if skip: 433 pos += triple + 1 434 return s.replace('`','') # Remove '`' character from words 435 else: 436 pos += 1 437 return bs[pos-1] 438 439outInt = lambda c: out(c) 440outStr = lambda c: out(c, True) 441 442def decode(stream): 443 """Decode byte stream of one record, return decoded string with formatting in utf""" 444 result = "" 445 global bs, pos 446 # stream - data byte stream for one record 447 bs = unpack("<%sB" % len(stream), stream) 448 # bs - list of bytes from stream 449 450 pos = 0 451 itemCount = outInt("ItemCount: %s") # Number of blocks in the record 452 mainFlag = outInt("MainFlag: %s") 453 454 # HEADER BLOCK 455 # ------------ 456 if mainFlag & 0x01: 457 headerFlag = outInt("HeaderFlag: %s") # Blocks in header 458 if headerFlag & 0x01: 459 result += tag['rn'][0] + outStr("Header record name: %s").replace('_','') + tag['rn'][1] # Remove character '_' from index 460 if headerFlag & 0x02: 461 result += tag['va'][0] + outStr("Header variant: %s") + tag['va'][1] 462 if headerFlag & 0x04: 463 s = outInt("Header wordclass: %s") 464 if s < 32: 465 result += tag['wc'][0] + wordclass[s] + tag['wc'][1] 466 else: 467 raise "Header wordclass out of range in: %s" % result 468 if headerFlag & 0x08: 469 result += tag['pa'][0] + outStr("Header parts: %s") + tag['pa'][1] 470 if headerFlag & 0x10: 471 result += tag['fo'][0] + outStr("Header forms: %s") + tag['fo'][1] 472 if headerFlag & 0x20: 473 result += tag['on'][0] + outStr("Header origin note: %s") + tag['on'][1] 474 if headerFlag & 0x80: 475 result += tag['pr'][0] + pronunciation_encode(outStr("Header pronunciation: %s")) + tag['pr'][1] 476 477 # Header data block 478 if mainFlag & 0x02: 479 headerFlag = outInt("Header dataFlag: %s") # Blocks in header 480 if headerFlag & 0x02: 481 result += tag['dv'][0] + outStr("Header dataVariant: %s")+ tag['dv'][1] 482 483 # ??? Link elsewhere 484 pass 485 486 # SOUND DATA REFERENCE 487 if mainFlag & 0x80: 488 outInt("Sound reference byte #1: %s") 489 outInt("Sound reference byte #2: %s") 490 outInt("Sound reference byte #3: %s") 491 outInt("Sound reference byte #4: %s") 492 outInt("Sound reference byte #5: %s") 493 #out("Sound data reference (5 bytes)", 6) 494 495 # TODO: Test all mainFlags in header!!!! 496 497 #result += ': ' 498 li = 0 499 500 #print just every first word class identifier 501 # TODO: this is not systematic (should be handled by output) 502 global lastWordClass 503 lastWordClass = 0 504 505 # DATA BLOCK(S) 506 # ------------- 507 for i in range(0, itemCount): 508 item = tag['db'][0] + tag['db'][1] 509 ol = False 510 dataFlag = outInt("DataFlag: %s -----------------------------") 511 if dataFlag & 0x01: # small index 512 sampleFlag = outInt("Data sampleFlag: %s") 513 if sampleFlag & 0x01: 514 result += tag['sa'][0] + outStr("Data sample: %s") + tag['sa'][1] 515 if sampleFlag & 0x04: 516 s = outInt("Data wordclass: %s") 517 if s != lastWordClass: 518 if s < 32: 519 result += tag['wc'][0] + wordclass[s] + tag['wc'][1] 520 else: 521 raise "Header wordclass out of range in: %s" % result 522 lastWordClass = s 523 if sampleFlag & 0x08: 524 result += tag['sw'][0] + outStr("Data sample wordclass: %s") + tag['sw'][1] 525 if sampleFlag & 0x10: 526 outInt("Data sample Int: %s") 527 outInt("Data sample Int: %s") 528 outInt("Data sample Int: %s") 529 if sampleFlag & 0x20: 530 item += tag['do'][0] + outStr("Data origin note: %s") + tag['do'][1] 531 if sampleFlag & 0x80: 532 item += " " 533 result += tag['pr'][0] + pronunciation_encode(outStr("Data sample pronunciation: %s")) + tag['pr'][1] 534 if dataFlag & 0x02: 535 item += " " 536 subFlag = outInt("Data subFlag: %s") 537 if subFlag == 0x80: 538 outStr("Data sub prefix: %s") 539 # It seams that data sub prefix content is ignored and there is a generated number for the whole block instead. 540 li += 1 541 ol = True 542 if dataFlag & 0x04: # chart 543 pass # ??? 544 if dataFlag & 0x08: # reference 545 item += tag['df'][0] + outStr("Data definition: %s") + tag['df'][1] 546 if dataFlag & 0x10: 547 pass # ??? 548 if dataFlag & 0x20: # phrase 549 phraseFlag1 = outInt("Data phraseFlag1: %s") 550 if phraseFlag1 & 0x01: 551 item += tag['ps'][0] + outStr("Data phrase short form: %s") + tag['ps'][1] 552 if phraseFlag1 & 0x02: 553 phraseCount = outInt("Data phraseCount: %s") 554 for i in range(0, phraseCount): 555 phraseComment = outInt("Data phrase prefix") 556 if phraseComment & 0x04: 557 item += tag['pc'][0] + outStr("Data phrase comment: %s") + tag['pc'][1] 558 item += tag['p1'][0] + outStr("Data phrase 1: %s") + tag['p1'][1] 559 item += tag['p2'][0] + outStr("Data phrase 2: %s") + tag['p2'][1] 560 if phraseFlag1 & 0x04: 561 phraseCount = outInt("Data phraseCount: %s") 562 for i in range(0, phraseCount): 563 phraseComment = outInt("Data phrase prefix") 564 if phraseComment & 0x04: 565 item += tag['pc'][0] + outStr("Data phrase 1: %s") + tag['pc'][1] 566 item += tag['pg'][0] + outStr("Data phrase comment: %s") + tag['pg'][1] 567 item += tag['p2'][0] + outStr("Data phrase 2: %s") + tag['p2'][1] 568 if phraseFlag1 & 0x08: 569 phraseCount = outInt("Data simple phraseCount: %s") 570 for i in range(0, phraseCount): 571 item += " " 572 item += tag['sp'][0] + outStr("Data simple phrase: %s") + tag['sp'][1] 573 if phraseFlag1 & 0x40: 574 item += tag['ps'][0] + outStr("Data phrase short form: %s") + tag['ps'][1] 575 576 577 # TODO: be careful in changing the rules, to have back compatibility! 578 if dataFlag & 0x40: # reference, related language 579 #0x01 synonym ? 580 #0x02 antonym ? 581 pass 582 if dataFlag & 0x80: # Phrase block 583 flags = [ 584 out("Data phrase block: %s"), 585 out("Data phrase block: %s"), 586 out("Data phrase block: %s"), 587 out("Data phrase block: %s"), 588 out("Data phrase block: %s"), 589 out("Data phrase block: %s"), 590 out("Data phrase block: %s"), 591 out("Data phrase block: %s")] 592 if flags == [0x80,0x80,0xF9,0xDF,0x9D,0x00,0x0B,0x01]: 593 result += "\\nphr: " 594 li = 1 595 ol = True 596 item += tag['b1'][0]+outStr("Data phrase 1: %s") + tag['b1'][1] 597 out("Data phrase block: %s") 598 out("Data phrase block: %s") 599 out("Data phrase block: %s") 600 out("Data phrase block: %s") 601 item += tag['ds'][0] + outStr("Data phrase 2: %s") + tag['ds'][1] 602 if flags == [0x80,0x80,0xF9,0xDF,0x9D,0x00,0x23,0x01]: 603 result += "\\nphr: " 604 li = 1 605 ol = True 606 item += tag['b1'][0]+outStr("Data phrase 1: %s") + tag['b1'][1] 607 out("Data phrase block: %s") 608 out("Data phrase block: %s") 609 out("Data phrase block: %s") 610 out("Data phrase block: %s") 611 out("Data phrase block: %s") 612 item += tag['ds'][0] + outStr("Data phrase 2: %s") + tag['ds'][1] 613 if ol: 614 result += "\\n%d. %s" % (li, item) 615 else: 616 result += item 617 618 ok = True 619 while pos < len(stream): 620 ok = (out() == 0x00) and ok 621 622 if ok: 623 result += '\n' 624 625 return decode_tag_postprocessing(result) 626 627################################################################ 628# MAIN 629################################################################ 630 631 632f = open(FILENAME,'rb') 633 634# DECODE HEADER OF FILE 635 636copyright = unpack("<64s",f.read(64))[0] 637a = unpack("<16L",f.read(64)) 638 639entryCount = a[4] 640indexBaseCount = a[6] 641indexOffsetCount = a[7] 642pos1 = a[8] 643indexPos = a[9] 644bodyPos = a[10] 645smallIndex = (a[3] == 2052) 646 647# DECODE INDEX STRUCTURE OF FILE 648 649index = [] 650f.seek(indexPos) 651bases = unpack("<%sL" % indexBaseCount, f.read(indexBaseCount * 4)) 652if smallIndex: # In small dictionaries every base is used 4-times 653 bases4 = [] 654 for i in bases: 655 bases4.extend([i,i,i,i]) 656 bases = bases4 657for b in bases: 658 offsets = unpack("<64H", f.read(64*2)) 659 for o in offsets: 660 if len(index) < indexOffsetCount: 661 #print "Index %s: %s + %s + %s * 4 = %s" % (len(index), bodyPos, b, o, toBin(bodyPos + b + o * 4)) 662 index.append(bodyPos + b + o * 4) 663 664# DECODE RECORDS 665 666if DEBUG: 667 # PRINTOUT DEBUG OF FIRST <DEBUGLIMIT> WRONG RECORDS: 668 for i in range(1,entryCount): 669 if not DEBUGALL: 670 DEBUG = False 671 s = decode(getRec(i)) 672 if DEBUGHEADER: 673 # print s.split('\t')[0] 674 print s 675 if DEBUGLIMIT > 0 and not s.endswith('\n'): 676 DEBUG = True 677 print "-"*80 678 print "%s) at address %s" % (i, toBin(index[i])) 679 print 680 s = decode(getRec(i)) 681 print s 682 DEBUGLIMIT -= 1 683 DEBUG = True 684else: 685 # DECODE EACH RECORD AND PRINT IT IN FORMAT FOR stardict-editor <term>\t<definition> 686 for i in range(1,entryCount): 687 s = decode(getRec(i)) 688 if s.endswith('\n'): 689 print s, 690 else: 691 print s 692 print "!!! RECORD STRUCTURE DECODING ERROR !!!" 693 print "Please run this script in DEBUG mode and repair DATA BLOCK(S) section in function decode()" 694 print "If you succeed with whole dictionary send report (name of the dictionary and source code of script) to slovniky@googlegroups.com" 695 break 696