1#!/usr/bin/env python
2# -*- coding: utf-8 -*-
3#
# Script for decoding a Lingea Dictionary (.trd) file.
# The result is a <header>\t<definition> file that can easily be converted
# by stardict-editor (from the stardict-tools package) into a native
# StarDict dictionary (stardict.sf.net and www.stardict.org)
8#
9# Copyright (C) 2007 - Klokan Petr Přidal (www.klokan.cz)
10#
11# Based on script CobuildConv.rb by Nomad
12# http://hp.vector.co.jp/authors/VA005784/cobuild/cobuildconv.html
13#
14# Version history:
15# 0.4 (30.10.2007) Patch by Petr Dlouhy, optional HTML generation
16# 0.3 (28.10.2007) Patch by Petr Dlouhy, cleanup, bugfix. More dictionaries.
17# 0.2 (19.7.2007) Changes, documentation, first 100% dictionary
18# 0.1 (20.5.2006) Initial version based on Nomad specs
19#
20# Supported dictionaries:
21# - Lingea Německý Kapesní slovník
22# - Lingea Anglický Kapesní slovník
23# - Lingea 2002 series (theoretically)
24#
25# Modified by:
26# - Petr Dlouhy (petr.dlouhy | email.cz)
27# Generalization of data block rules, sampleFlag 0x04, sound out fix, data phrase prefix with comment (0x04)
28# HTML output, debugging patch, options on command line
29#
30# <write your name here>
31#
32# This library is free software; you can redistribute it and/or
33# modify it under the terms of the GNU Library General Public
34# License as published by the Free Software Foundation; either
35# version 2 of the License, or (at your option) any later version.
36#
37# This library is distributed in the hope that it will be useful,
38# but WITHOUT ANY WARRANTY; without even the implied warranty of
39# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
40# Library General Public License for more details.
41#
42# You should have received a copy of the GNU Library General Public
43# License along with this library; if not, write to the
44# Free Software Foundation, Inc., 59 Temple Place - Suite 330,
45# Boston, MA 02111-1307, USA.
46
# VERSION
VERSION = "0.4"

import getopt, sys
def usage():
    """Print the command-line help text to standard output.

    The text consists of the program banner (including VERSION), the
    invocation syntax, the list of supported options and a note about the
    .ifo change needed for HTML output.  The output ends with an empty line.
    """
    help_lines = (
        "Lingea Dictionary Decoder",
        "-------------------------",
        "Version: %s" % VERSION,
        "Copyright (C) 2007 - Klokan Petr Pridal, Petr Dlouhy",
        "",
        "Usage: python lingea-trd-decoder.py DICTIONARY.trd > DICTIONARY.tab",
        "Result conversion by stardict-tools: /usr/lib/stardict-tools/tabfile",
        "",
        "    -o <num>      --out-style        : Output style",
        "                                          0   no tags",
        "                                          1   \\n tags",
        "                                          2   html tags",
        "    -h            --help             : Print this message",
        "    -d            --debug            : Debug",
        "    -r            --debug-header     : Debug - print headers",
        "    -a            --debug-all        : Debug - print all records",
        # -l takes a value (getopt spec "l:" / "debug-limit=")
        "    -l <num>      --debug-limit      : Debug limit",
        "",
        "For HTML support in StarDict dictionary .ifo has to contain:",
        "sametypesequence=g",
        "!!! Change the .ifo file after generation by tabfile !!!",
        "",
    )
    for line in help_lines:
        # print(x) with a single argument behaves exactly like the
        # print statement, so this form works on Python 2 as well.
        print(line)
74
# Parse the command line: options first, then exactly one .trd file name.
try:
    opts, args = getopt.getopt(sys.argv[1:], "hdo:ral:", ["help", "debug", "out-style=", "debug-header", "debug-all", "debug-limit="])
except getopt.GetoptError:
    usage()
    print("ERROR: Bad option")
    sys.exit(2)

import locale
DEBUG = False        # print a byte-by-byte trace while decoding
OUTSTYLE = 2         # 0 = no tags, 1 = \n tags, 2 = html tags
DEBUGHEADER = False  # with DEBUG: print every decoded record
DEBUGALL = False     # with DEBUG: trace all records, not only the wrong ones
DEBUGLIMIT = 1       # with DEBUG: number of wrong records to print in detail
for o, a in opts:
    # getopt reports long options with two leading dashes ("--debug")
    if o in ("-d", "--debug"):
        DEBUG = True
    elif o in ("-o", "--out-style"):
        # output style
        try:
            OUTSTYLE = locale.atoi(a)
        except ValueError:
            OUTSTYLE = None
        if OUTSTYLE not in (0, 1, 2):
            # Only styles 0..2 have a tag table; going on would fail later
            # with an undefined 'tag', so stop here.
            usage()
            print("ERROR: Output style has to be 0, 1 or 2")
            sys.exit(2)
    elif o in ("-r", "--debug-header"):
        # If DEBUG and DEBUGHEADER, then print just all header records
        DEBUGHEADER = True
    elif o in ("-a", "--debug-all"):
        # If DEBUG and DEBUGALL then print debug info for all records
        DEBUGALL = True
    elif o in ("-h", "--help"):
        usage()
        sys.exit(0)
    elif o in ("-l", "--debug-limit"):
        # Number of wrong records for printing to stop during debugging
        try:
            DEBUGLIMIT = locale.atoi(a)
        except ValueError:
            usage()
            print("ERROR: Debug limit has to be a number")
            sys.exit(2)

# FILENAME is the first (and only) positional parameter on the command line
if len(args) != 1:
    usage()
    print("ERROR: You have to specify .trd file to decode")
    sys.exit(2)
FILENAME = args[0]
118
119from struct import *
120import re
121
# Table mapping a 6-bit code (0-63) to its character.
# decode_alpha_postprocessing() emits codes below 40 as they are; codes from
# 40 up act as a prefix that is combined with the following code (accent,
# upper case, symbol, special character, ...).
# NOTE(review): the '#...#' entries look like placeholders for codes whose
# meaning has not been identified yet - confirm.
alpha = ['\x00', 'a','b','c','d','e','f','g','h','i',
    'j','k','l','m','n','o','p','q','r','s',
    't','u','v','w','x','y','z','#AL27#','#AL28#','#AL29#',
    '#AL30#','#AL31#', ' ', '.', '<', '>', ',', ';', '-', '#AL39#',
        '#GRAVE#', '#ACUTE#', '#CIRC#', '#TILDE#', '#UML#', '#AL45#', '#AL46#', '#CARON#', '#AL48#', '#CEDIL#',
        '#AL50#', '#AL51#', '#GREEK#', '#AL53#', '#AL54#', '#AL55#', '#AL56#', '#AL57#', '#AL58#', '#SYMBOL#',
        '#AL60#', '#UPCASE#', '#SPECIAL#', '#UNICODE#'] # 4 bytes after unicode

# Table indexed by the code that follows an '#UPCASE#' prefix
# (see decode_alpha_postprocessing()).
upcase = ['#UP0#','#UP1#','#UP2#','#UP3#','#UP4#','#UP5#','#UP6#','#UP7#','#UP8#','#UP9#',
    '#UP10#','#UP11#','#UP12#','#UP13#','#UP14#','#UP15#','#UP16#','#UP17#','#UP18#','#UP19#',
    '#UP20#','#UP21#','#UP22#','#UP23#','#UP24#','#UP25#','#UP26#','#UP27#','#UP28#','#UP29#',
    '#UP30#','#UP31#','A','B','C','D','E','F','G','H',
    'I','J','K','L','M','N','O','P','Q','R',
    'S','T','U','V','W','X','Y','Z','#UP58#','#UP59#',
    '#UP60#','#UP61#','#UP62#','#UP63#']

# Replacement table for pronunciation strings: pronunciation_encode()
# replaces every upcase[i] found in the text by upcase_pron[i].
upcase_pron = ['#pr0#', '#pr1#','#pr2#','#pr3#','#pr4#','#pr5#','#pr6#','#pr7#','#pr8#','#pr9#',
    '#pr10#', '#pr11#','#pr12#','#pr13#','#pr14#','#pr15#','#pr16#','#pr17#','#pr18#','#pr19#',
    '#pr20#', '#pr21#','#pr22#','#pr23#','#pr24#','#pr25#','#pr26#','#pr27#','#pr28#','#pr29#',
    '#pr30#', '#pr31#','ɑ','#pr33#','ʧ','ð','ə','ɜ','#pr38#','æ',
    'ɪ', 'ɭ','#pr42#','ŋ','#pr44#','ɳ','ɔ','#pr47#','ɒ','ɽ',
    'ʃ', 'θ','ʊ','ʌ','#pr54#','#pr55#','#pr56#','ʒ','#pr58#','#pr59#',
    '#pr60#', '#pr61#','#pr62#','#pr63#']

# Table indexed by the code that follows a '#SYMBOL#' prefix.
symbol = ['#SY0#', '#SY1#','#SY2#','#SY3#','§','#SY5#','#SY6#','#SY7#','#SY8#','#SY9#',
    '#SY10#', '#SY11#','#SY12#','#SY13#','#SY14#','™','#SY16#','#SY17#','¢','£',
    '#SY20#', '#SY21#','#SY22#','#SY23#','©','#SY25#','#SY26#','#SY27#','®','°',
    '#SY30#', '²','³','#SY33#','#SY34#','#SY35#','¹','#SY37#','#SY38#','#SY39#',
    '½', '#SY41#','#SY42#','×','÷','#SY45#','#SY46#','#SY47#','#SY48#','#SY49#',
    '#SY50#', '#SY51#','#SY52#','#SY53#','#SY54#','#SY55#','#SY56#','#SY57#','#SY58#','#SY59#',
    '#SY60#', '#SY61#','#SY62#','#SY63#']

# Table indexed by the code that follows a '#SPECIAL#' prefix.
special = ['#SP0#', '!','"','#','$','%','&','\'','(',')',
    '*', '+','#SP12#','#SP13#','#SP14#','/','0','1','2','3',
    '4', '5','6','7','8','9',':',';','<','=',
    '>', '?','@','[','\\',']','^','_','`','{',
    '|', '}','~','#SP43#','#SP44#','#SP45#','#SP46#','#SP47#','#SP48#','#SP49#',
    '#SP50#', '#SP51#','#SP52#','#SP53#','#SP54#','#SP55#','#SP56#','#SP57#','#SP58#','#SP59#',
    '#SP60#', '#SP61#','#SP62#','#SP63#']

# Word class labels indexed by the wordclass byte of a record; decode()
# accepts only values below 32.
wordclass = ('#0#','n:','adj:','pron:','#4#','v:','adv:','prep:','#8#','#9#',
    'intr:','phr:','#12#','#13#','#14#','#15#','#16#','#17#','#18#','#19#',
    '#20#','#21#','#22#','#23#','#24#','#25#','#26#','#27#','#28#','#29#',
    '#30#','#31#')
166
# Output markup for the selected OUTSTYLE.  Every entry is an
# (opening, closing) pair of strings that decode() puts around the
# corresponding field of a record.  Note that '\\n' is a backslash followed
# by the letter n, not a newline character.
if OUTSTYLE == 0:
    # Style 0: no tags
    tag = {
           'db':(''   ,''),    #Data beginning
           'rn':(''   ,'\t'),  #Record name
           'va':(''   ,' '),   #Header variant
           'wc':('('  ,')'),   #WordClass
           'pa':(''   ,' '),   #Header parts
           'fo':('('  ,') '),  #Header forms
           'on':('('  ,')' ),  #Header origin note
           'pr':('['  ,']'),   #Header pronunciation
           'dv':('{'  ,'} '),  #Header dataVariant
           'sa':('`'  ,'`' ),  #Data sample
           'sw':(''   ,''),    #Data sample wordclass; is not printed by Lingea
           'do':('`'  ,'`' ),  #Data origin note
           'df':(''   ,' '),   #Data definition
           'ps':('"'  ,'" '),  #Data phrase short form
           'pg':('"'  ,' = '), #Data phrase green
           'pc':('`'  ,'`'),   #Data phrase comment; this comment is not printed by Lingea, but it seems useful
           'p1':('"'  ,' = '), #Data phrase 1
           'p2':(''   ,'" ' ), #Data phrase 2
           'sp':('"'  ,' = ' ),#Data simple phrase
           'b1':('"'  ,' = '), #Data phrase (block) 1
           'b2':('" ' ,''),    #Data phrase (block) 2
           }
if OUTSTYLE == 1:
    # Style 1: \n tags
    tag = {
           'db':('•'       ,''),      #Data beginning
           'rn':(''        ,'\t'),    #Record name
           'va':(''        ,' '),     #Header variant
           'wc':(''        ,'\\n'),   #WordClass
           'pa':(''        ,':\\n'),  #Header parts
           'fo':('('       ,') '),    #Header forms
           'on':('('       ,')\\n' ), #Header origin note
           'pr':('['       ,']\\n'),  #Header pronunciation
           'dv':('{'       ,'} '),    #Header dataVariant
           'sa':('    '    ,'\\n' ),  #Data sample
           'sw':(''        ,''),      #Data sample wordclass; is not printed by Lingea
           'do':('    '    ,' ' ),    #Data origin note
           'df':('    '    ,'\\n'),   #Data definition
           'ps':('    '    ,'\\n'),   #Data phrase short form
           'pg':('    '    ,' '),     #Data phrase green
           'pc':('    '    ,' '),     #Data phrase comment; this comment is not printed by Lingea, but it seems useful
           'p1':('    '    ,' '),     #Data phrase 1
           'p2':('      '  ,'\\n' ),  #Data phrase 2
           'sp':(''        ,'\\n' ),  #Data simple phrase
           'b1':('"'       ,' = '),   #Data phrase (block) 1
           'b2':('" '      ,''),      #Data phrase (block) 2
          }
if OUTSTYLE == 2:
    # Style 2: html tags
    tag = {
           'db':('•'                                                 ,''),              #Data beginning
           'rn':(''                                                  ,'\t'),            #Record name
           'va':(''                                                  ,' '),             #Header variant
           'wc':('<span size="larger" color="darkred" weight="bold">','</span>\\n'),    #WordClass
           'pa':('<span size="larger" color="darkred" weight="bold">',':</span>\\n'),   #Header parts
           'fo':('('                                                 ,') '),            #Header forms
           'on':('<span color="blue">('                              ,')</span>\\n' ),  #Header origin note
           'pr':('['                                                 ,']\\n'),          #Header pronunciation
           'dv':('{'                                                 ,'} '),            #Header dataVariant
           'sa':('    <span color="darkred" weight="bold">'          ,'</span>\\n' ),   #Data sample
           'sw':(''                                                  ,''),              #Data sample wordclass; is not printed by Lingea
           'do':('    <span color="darkred" weight="bold">'          ,'</span> ' ),     #Data origin note
           'df':('    <span weight="bold">'                          ,'</span>\\n'),    #Data definition
           'ps':('    <span color="dimgray" weight="bold">'          ,'</span>\\n'),    #Data phrase short form
           'pg':('    <span color="darkgreen" style="italic">'       ,'</span> '),      #Data phrase green
           'pc':('    <span color="darkgreen" style="italic">'       ,'</span> '),      #Data phrase comment; this comment is not printed by Lingea, but it seems useful
           'p1':('    <span color="dimgray" style="italic">'         ,'</span> '),      #Data phrase 1
           'p2':('      '                                            ,'\\n' ),          #Data phrase 2
           'sp':('<span color="cyan">'                               ,'</span>\\n' ),   #Data simple phrase
           'b1':('"'                                                 ,' = '),           #Data phrase (block) 1
           'b2':('" '                                                ,''),              #Data phrase (block) 2
          }
239
240
241
# Helpers for coloured debug output: each one wraps its argument in an
# ANSI escape sequence and resets the terminal attributes afterwards.
def purple(c):
    """Return c wrapped in the escape sequence '1;35'."""
    return '\x1b[1;35m' + c + '\x1b[0m'

def blue(c):
    """Return c wrapped in the escape sequence '1;34'."""
    return '\x1b[1;34m' + c + '\x1b[0m'

def cyan(c):
    """Return c wrapped in the escape sequence '36'."""
    return '\x1b[36m' + c + '\x1b[0m'

def gray(c):
    """Return c wrapped in the escape sequence '1'."""
    return '\x1b[1m' + c + '\x1b[0m'
247
def getRec(n):
    """Return the raw data of record number n.

    The record occupies the bytes between index[n] and index[n+1] of the
    opened dictionary file f.  For n outside 0..entryCount-1 an empty
    string is returned.
    """
    if n < 0 or n >= entryCount:
        return ''
    f.seek(index[n])
    length = index[n+1] - index[n]
    return f.read(length)
255
def decode_alpha( stream, nullstop=True):
    """Unpack 6-bit codes from a sequence of byte values and decode them.

    Four 6-bit codes are packed into every three bytes.  Unpacking starts at
    the beginning of stream and ends when the bytes run out or, if nullstop
    is true, at the first code equal to zero.

    Returns a tuple (text, last): text is the result of
    decode_alpha_postprocessing() for the unpacked codes, last is the
    number of bytes read from stream minus one.
    """
    codes = []
    consumed = 0   # bytes of stream read so far
    count = 0      # codes unpacked so far; count % 4 is the slot in the triple
    while consumed < len( stream ):
        slot = count % 4
        if slot == 0:
            # upper six bits of the first byte
            code = stream[consumed] >> 2
            consumed += 1
        elif slot == 1:
            # lower two bits of the first byte + upper four of the second
            code = (stream[consumed-1] & 3) << 4 | stream[consumed] >> 4
            consumed += 1
        elif slot == 2:
            # lower four bits of the second byte + upper two of the third
            code = (stream[consumed-1] & 15) << 2 | (stream[consumed] & 192) >> 6
            consumed += 1
        else:
            # lower six bits of the third byte; no new byte is read
            code = stream[consumed-1] & 63
        if nullstop and code == 0:
            break
        count += 1
        # TODO: ENCODE UNICODE 4 BYTE STREAM!!! and put it after #UNICODE# as unichr()
        codes.append(code)
    return decode_alpha_postprocessing(codes), consumed - 1
279
280
# Accent prefix -> (characters produced for the known following characters,
#                   placeholder format used for any other following character)
_ACCENT_COMBINATIONS = {
    "#GRAVE#": ({'a': 'à'}, '#GRAVE%s#'),
    "#UML#":   ({'o': 'ö', 'u': 'ü', 'a': 'ä',
                 ' ': 'Ä', '#AL46#': 'Ö', '#GREEK#': 'Ü'}, '#UML%s#'),
    "#ACUTE#": ({'a': 'á', 'e': 'é', 'i': 'í', 'o': 'ó', 'u': 'ú', 'y': 'ý',
                 ' ': 'Á', '#GRAVE#': 'Í'}, '#ACUTE%s#'),
    "#CARON#": ({'r': 'ř', 'c': 'č', 's': 'š', 'z': 'ž', 'e': 'ě', 'd': 'ď',
                 't': 'ť', 'a': 'å', 'u': 'ů', 'n': 'ň',
                 '<': 'Č', '#CEDIL#': 'Ř', '#AL50#': 'Š', '#AL57#': 'Ž'}, '#CARON%s#'),
    "#CIRC#":  ({'a': 'â'}, '#CIRC%s#'),
    "#AL51#":  ({'s': 'ß'}, '#AL51%s#'),
}

def decode_alpha_postprocessing( input ):
    """Lowlevel alphabet decoding postprocessing, combines tuples into one character.

    input is a list of 6-bit codes (indexes into the alpha table).  Codes
    below 40 are translated directly; a code from 40 up is a prefix that is
    combined with the code that follows it (accented letter, upper case
    letter, symbol, special character).  Combinations that are not known
    are written as a '#...#' placeholder so that they stay visible in the
    output.

    The list passed in is not modified.  The returned string ends with NUL
    characters produced by the internal padding; callers cut the text at
    the first NUL.
    """
    # Pad a copy, so that the code after a trailing prefix always exists
    # and the caller's list stays untouched.
    codes = list(input) + [0x00] * 5
    pieces = []

    skip = False
    for i in range(len(codes) - 1):
        if skip:
            # this code was already consumed together with its prefix
            skip = False
            continue

        bc = codes[i]
        c = alpha[bc]
        bc1 = codes[i+1]
        c1 = alpha[bc1]

        if bc < 40:
            pieces.append(c)
            continue

        # UPCASE, SYMBOL, SPECIAL, accents: combine with the next code
        if c in _ACCENT_COMBINATIONS:
            letters, placeholder = _ACCENT_COMBINATIONS[c]
            pieces.append(letters.get(c1, placeholder % c1))
        elif c == "#UPCASE#":
            pieces.append(upcase[bc1])
        elif c == "#SYMBOL#":
            pieces.append(symbol[bc1])
        elif c == "#SPECIAL#":
            pieces.append(special[bc1])
        elif c == "#AL48#":
            pieces.append("#AL48#%s" % c1)
        elif c == "#UNICODE#":
            pieces.append('#UNICODE%s#' % bc1)
        else:
            pieces.append('%sX%s#' % (c[:-1], bc1))
        skip = True
    return "".join(pieces)
357
def pronunciation_encode(s):
    """Return s with the upper case symbols replaced by pronunciation symbols.

    Every occurrence of an entry of the upcase table is replaced by the
    entry of upcase_pron at the same position; the 64 replacements are
    applied one after another in table order.
    """
    for plain, phonetic in zip(upcase[:64], upcase_pron[:64]):
        s = s.replace(plain, phonetic)
    return s
363
# Patterns for the internal <d...>, <w...>, <y...> and <c...> tags
re_d = re.compile(r'<d(.*?)>')
re_w = re.compile(r'<w(.*?)>')
re_y = re.compile(r'<y(.*?)>')
re_c = re.compile(r'<c(.*?)>')

def decode_tag_postprocessing(input):
    """Replace the internal Lingea tags in input according to OUTSTYLE.

    For OUTSTYLE 0 and 1 the content of every <d...>, <w...>, <y...> and
    <c...> tag is put into parentheses; for OUTSTYLE 2 it is wrapped in a
    <span> element.  For any other OUTSTYLE the text is returned unchanged.
    """
    # General information in http://www.david-zbiral.cz/El-slovniky-plnaverze.htm#_Toc151656799
    # TODO: Better output handling
    s = input

    if OUTSTYLE in (0, 1):
        # the meaning of the individual tags is not known; all are treated alike
        for pattern in (re_d, re_w, re_y, re_c):
            s = pattern.sub(r'(\1)', s)
    elif OUTSTYLE == 2:
        # <d...> keeps the parentheses, the other tags are set in italics
        s = re_d.sub(r'<span size="small" color="blue">(\1)</span>', s)
        for pattern in (re_w, re_y, re_c):
            s = pattern.sub(r'<span size="small" color="blue" style="italic">\1</span>', s)

    return s
408
def toBin( b ):
    """Format a number for the debug output as hex(bin)dec.

    Example: 5 is formatted as '0x05(00000101)005'.  The binary part is
    padded with zeros to at least eight digits.
    """
    digits = ''
    rest = b
    while rest > 0:
        # prepend the lowest bit, then move on to the next one
        digits = str(rest & 0x01) + digits
        rest = rest >> 1
    # the binary digits are printed as if they were a decimal number
    as_decimal = int(digits or '0')
    return "0x%02X(%08d)%03d" % (b, as_decimal, b)
419
420
def out( comment = "", skip = False):
    """Read the next byte or string from the current record.

    The record is the global byte sequence bs, the read position is the
    global pos; pos is moved behind the value that was read.

    comment -- label for the DEBUG trace; a '%' placeholder in it is
               filled with the value that was read
    skip    -- False: read one byte and return it as a number
               True:  read a packed string (see decode_alpha) and return
                      it with all '`' characters removed

    With DEBUG set, one trace line is printed for every call.
    """
    global bs, pos
    # The packed string is needed only when a string is read or when the
    # trace line is printed; plain byte reads do not have to decode it.
    if skip or DEBUG:
        s, triple  = decode_alpha(bs[pos:])
        s = s.split('\x00')[0] # give me string until first NULL
    if '%' in comment:
        if skip:
            comment = comment % s
        else:
            comment = comment % bs[pos]
    if DEBUG:
        print("%03d %s %s | %s | %03d" % (pos, toBin(bs[pos]),comment, s, (triple + pos)))
    if skip:
        pos += triple + 1
        return s.replace('`','') # Remove '`' character from words
    pos += 1
    return bs[pos-1]

def outInt(c):
    """Read one byte from the record; c is the label for the DEBUG trace."""
    return out(c)

def outStr(c):
    """Read one packed string from the record; c is the label for the DEBUG trace."""
    return out(c, True)
441
def decode(stream):
    """Decode byte stream of one record, return decoded string with formatting in utf.

    stream -- data byte stream for one record (as returned by getRec)

    The record is read sequentially through out()/outInt()/outStr(), which
    share the globals bs (byte values of the record) and pos (read
    position).  The decoded fields are wrapped in the markup of the tag
    table and the internal tags are replaced by decode_tag_postprocessing().

    The returned string ends with a newline only when every byte left
    after the last data block is zero (or no byte is left); callers use
    the missing newline as the sign of a record that was not decoded
    correctly.

    Raises ValueError when a wordclass byte is not below 32.
    """
    global bs, pos, lastWordClass
    result = ""
    # bs - sequence of byte values of the record
    bs = unpack("<%sB" % len(stream), stream)

    pos = 0
    itemCount = outInt("ItemCount: %s") # Number of blocks in the record
    mainFlag = outInt("MainFlag: %s")

    # HEADER BLOCK
    # ------------
    if mainFlag & 0x01:
        headerFlag = outInt("HeaderFlag: %s") # Blocks in header
        if headerFlag & 0x01:
            result += tag['rn'][0] + outStr("Header record name: %s").replace('_','') + tag['rn'][1]  # Remove character '_' from index
        if headerFlag & 0x02:
            result += tag['va'][0] + outStr("Header variant: %s") + tag['va'][1]
        if headerFlag & 0x04:
            s = outInt("Header wordclass: %s")
            if s < 32:
                result += tag['wc'][0] + wordclass[s] + tag['wc'][1]
            else:
                raise ValueError("Header wordclass out of range in: %s" % result)
        if headerFlag & 0x08:
            result += tag['pa'][0] + outStr("Header parts: %s") + tag['pa'][1]
        if headerFlag & 0x10:
            result += tag['fo'][0] + outStr("Header forms: %s") + tag['fo'][1]
        if headerFlag & 0x20:
            result += tag['on'][0] + outStr("Header origin note: %s") +  tag['on'][1]
        if headerFlag & 0x80:
            result += tag['pr'][0] + pronunciation_encode(outStr("Header pronunciation: %s")) + tag['pr'][1]

    # Header data block
    if mainFlag & 0x02:
        headerFlag = outInt("Header dataFlag: %s") # Blocks in header
        if headerFlag & 0x02:
            result += tag['dv'][0] + outStr("Header dataVariant: %s")+ tag['dv'][1]

    # ??? Link elsewhere

    # SOUND DATA REFERENCE: five bytes that are read and ignored
    if mainFlag & 0x80:
        for n in range(1, 6):
            outInt("Sound reference byte #%d: %%s" % n)

    # TODO: Test all mainFlags in header!!!!

    # li - number printed in front of an item of a numbered list
    li = 0

    # print just every first word class identifier
    # TODO: this is not systematic (should be handled by output)
    lastWordClass = 0

    # DATA BLOCK(S)
    # -------------
    for i in range(0, itemCount):
        item = tag['db'][0] + tag['db'][1]
        ol = False   # True: the item is written as a numbered list entry
        dataFlag = outInt("DataFlag: %s -----------------------------")
        if dataFlag & 0x01: # small index
            sampleFlag = outInt("Data sampleFlag: %s")
            if sampleFlag & 0x01:
                result += tag['sa'][0] + outStr("Data sample: %s") +  tag['sa'][1]
            if sampleFlag & 0x04:
                s = outInt("Data wordclass: %s")
                if s != lastWordClass:
                    if s < 32:
                        result += tag['wc'][0] + wordclass[s] + tag['wc'][1]
                    else:
                        raise ValueError("Data wordclass out of range in: %s" % result)
                lastWordClass = s
            if sampleFlag & 0x08:
                result += tag['sw'][0] + outStr("Data sample wordclass: %s") + tag['sw'][1]
            if sampleFlag & 0x10:
                # three bytes that are read and ignored
                for _ in range(3):
                    outInt("Data sample Int: %s")
            if sampleFlag & 0x20:
                item += tag['do'][0] + outStr("Data origin note: %s") + tag['do'][1]
            if sampleFlag & 0x80:
                item += "    "
                result += tag['pr'][0] + pronunciation_encode(outStr("Data sample pronunciation: %s")) + tag['pr'][1]
        if dataFlag & 0x02:
            item += "    "
            subFlag = outInt("Data subFlag: %s")
            if subFlag == 0x80:
                outStr("Data sub prefix: %s")
                # It seems that data sub prefix content is ignored and there is a generated number for the whole block instead.
                li += 1
                ol = True
        # dataFlag & 0x04 (chart): ??? nothing is read
        if dataFlag & 0x08: # reference
            item += tag['df'][0] + outStr("Data definition: %s") + tag['df'][1]
        # dataFlag & 0x10: ??? nothing is read
        if dataFlag & 0x20: # phrase
            phraseFlag1 = outInt("Data phraseFlag1: %s")
            if phraseFlag1 & 0x01:
                item += tag['ps'][0] + outStr("Data phrase short form: %s") + tag['ps'][1]
            if phraseFlag1 & 0x02:
                phraseCount = outInt("Data phraseCount: %s")
                for _ in range(0, phraseCount):
                    phraseComment = outInt("Data phrase prefix")
                    if phraseComment & 0x04:
                        item += tag['pc'][0] + outStr("Data phrase comment: %s")  + tag['pc'][1]
                    item += tag['p1'][0] + outStr("Data phrase 1: %s") + tag['p1'][1]
                    item += tag['p2'][0] + outStr("Data phrase 2: %s") + tag['p2'][1]
            if phraseFlag1 & 0x04:
                phraseCount = outInt("Data phraseCount: %s")
                for _ in range(0, phraseCount):
                    phraseComment = outInt("Data phrase prefix")
                    # NOTE(review): the two debug labels below look swapped
                    # compared with the 0x02 branch - confirm before changing.
                    if phraseComment & 0x04:
                        item += tag['pc'][0] + outStr("Data phrase 1: %s")  + tag['pc'][1]
                    item += tag['pg'][0] + outStr("Data phrase comment: %s")  + tag['pg'][1]
                    item += tag['p2'][0] + outStr("Data phrase 2: %s") +  tag['p2'][1]
            if phraseFlag1 & 0x08:
                phraseCount = outInt("Data simple phraseCount: %s")
                for _ in range(0, phraseCount):
                    item += "    "
                    item += tag['sp'][0] + outStr("Data simple phrase: %s") +  tag['sp'][1]
            if phraseFlag1 & 0x40:
                item += tag['ps'][0] + outStr("Data phrase short form: %s") + tag['ps'][1]

            # TODO: be careful in changing the rules, to have back compatibility!
        # dataFlag & 0x40 (reference, related language): nothing is read
        #   0x01 synonym ?
        #   0x02 antonym ?
        if dataFlag & 0x80: # Phrase block
            flags = [out("Data phrase block: %s") for _ in range(8)]
            # Two known 8-byte signatures; they differ in the 7th byte and
            # in the number of bytes skipped between the two phrases.
            for signature, padding in (
                    ([0x80,0x80,0xF9,0xDF,0x9D,0x00,0x0B,0x01], 4),
                    ([0x80,0x80,0xF9,0xDF,0x9D,0x00,0x23,0x01], 5)):
                if flags == signature:
                    result += "\\nphr: "
                    li = 1
                    ol = True
                    item += tag['b1'][0]+outStr("Data phrase 1: %s") + tag['b1'][1]
                    for _ in range(padding):
                        out("Data phrase block: %s")
                    # 'b2' is the "phrase (block) 2" markup of the tag table
                    item += tag['b2'][0] + outStr("Data phrase 2: %s") + tag['b2'][1]
        if ol:
            result += "\\n%d. %s" % (li, item)
        else:
            result += item

    # Everything behind the last block has to be zero, otherwise the
    # structure of the record was not understood.
    ok = True
    while pos < len(stream):
        ok = (out() == 0x00) and ok

    if ok:
        result += '\n'

    return decode_tag_postprocessing(result)
626
627################################################################
628# MAIN
629################################################################
630
631
f = open(FILENAME,'rb')

# DECODE HEADER OF FILE
# 64 bytes of text followed by sixteen little-endian 32-bit numbers

copyright = unpack("<64s",f.read(64))[0]
a = unpack("<16L",f.read(64))

entryCount, indexBaseCount, indexOffsetCount = a[4], a[6], a[7]
pos1, indexPos, bodyPos = a[8], a[9], a[10]
smallIndex = (a[3] == 2052)

# DECODE INDEX STRUCTURE OF FILE
# index[n] is the file position of record n: bodyPos + base + offset * 4.
# The bases are stored first, followed by one group of 64 offsets per base.

index = []
f.seek(indexPos)
bases = unpack("<%sL" % indexBaseCount, f.read(indexBaseCount * 4))
if smallIndex: # In small dictionaries every base is used 4-times
    bases4 = []
    for i in bases:
        bases4.extend([i] * 4)
    bases = bases4
for b in bases:
    offsets = unpack("<64H", f.read(64*2))
    # take only as many offsets as are still missing in the index
    room = indexOffsetCount - len(index)
    index.extend([bodyPos + b + o * 4 for o in offsets[:room]])
663
# DECODE RECORDS

if not DEBUG:
    # DECODE EACH RECORD AND PRINT IT IN FORMAT FOR stardict-editor <term>\t<definition>
    for i in range(1,entryCount):
        s = decode(getRec(i))
        if not s.endswith('\n'):
            # a record without the final newline was not decoded correctly
            print(s)
            print("!!! RECORD STRUCTURE DECODING ERROR !!!")
            print("Please run this script in DEBUG mode and repair DATA BLOCK(S) section in function decode()")
            print("If you succeed with whole dictionary send report (name of the dictionary and source code of script) to slovniky@googlegroups.com")
            break
        # the record already ends with a newline
        sys.stdout.write(s)
else:
    # PRINTOUT DEBUG OF FIRST <DEBUGLIMIT> WRONG RECORDS:
    for i in range(1,entryCount):
        if not DEBUGALL:
            # first pass without the trace; it is repeated below with the
            # trace switched on if the record turns out to be wrong
            DEBUG = False
        s = decode(getRec(i))
        if DEBUGHEADER:
            # print s.split('\t')[0]
            print(s)
        if DEBUGLIMIT > 0 and not s.endswith('\n'):
            DEBUG = True
            print("-"*80)
            print("%s) at address %s" % (i, toBin(index[i])))
            print("")
            s = decode(getRec(i))
            print(s)
            DEBUGLIMIT -= 1
    DEBUG = True
696