1#!python3
2
3"""
4FSA DICTIONARY BUILDER
5
6by Olivier R.
7License: MPL 2
8
9This tool encodes lexicon into an indexable binary dictionary
10Input files MUST be encoded in UTF-8.
11"""
12
13import sys
14import os
15import collections
16import json
17import time
18import re
19import traceback
20
21from . import str_transform as st
22from .progressbar import ProgressBar
23
24
25
26dLexiconData = {}
27
28def readFile (spf):
29    "generator: read file <spf> and return for each line a list of elements separated by a tabulation."
30    print(" < Read lexicon: " + spf)
31    if os.path.isfile(spf):
32        dLexiconData.clear()
33        with open(spf, "r", encoding="utf-8") as hSrc:
34            for sLine in hSrc:
35                sLine = sLine.strip()
36                if sLine.startswith("##") :
37                    m = re.match("## *(\\w+) *:(.*)$", sLine)
38                    if m:
39                        dLexiconData[m.group(1)] = m.group(2).strip()
40                elif sLine and not sLine.startswith("#"):
41                    yield sLine.split("\t")
42        if dLexiconData:
43            print("Data from dictionary:")
44            print(dLexiconData)
45    else:
46        raise OSError("# Error. File not found or not loadable: " + spf)
47
48
49
50class DAWG:
51    """DIRECT ACYCLIC WORD GRAPH"""
52    # This code is inspired from Steve Hanov’s DAWG, 2011. (http://stevehanov.ca/blog/index.php?id=115)
53    # We store suffix/affix codes and tags within the graph after the “real” word.
54    # A word is a list of numbers [ c1, c2, c3 . . . cN, iAffix, iTags]
55    # Each arc is an index in self.lArcVal, where are stored characters, suffix/affix codes for stemming and tags.
56    # Important: As usual, the last node (after ‘iTags’) is tagged final, AND the node after ‘cN’ is ALSO tagged final.
57
58    def __init__ (self, src, cStemming, sLangCode, sLangName="", sDicName="", sDescription="", sSelectFilterRegex=""):
59        print("===== Direct Acyclic Word Graph - Minimal Acyclic Finite State Automaton =====")
60        cStemming = cStemming.upper()
61        if cStemming == "A":
62            funcStemmingGen = st.defineAffixCode
63        elif cStemming == "S":
64            funcStemmingGen = st.defineSuffixCode
65        elif cStemming == "N":
66            funcStemmingGen = st.noStemming
67        else:
68            raise ValueError("# Error. Unknown stemming code: {}".format(cStemming))
69
70        aEntry = set()
71        lChar = ['']; dChar = {}; nChar = 1; dCharOccur = {}
72        lAff  = [];   dAff  = {}; nAff  = 0; dAffOccur = {}
73        lTag  = [];   dTag  = {}; nTag  = 0; dTagOccur = {}
74
75        self.a2grams = set()
76
77        try:
78            zFilter = re.compile(sSelectFilterRegex)  if sSelectFilterRegex  else None
79        except re.error:
80            print("# Error. Wrong filter regex. Filter ignored: ", zFilter)
81            traceback.print_exc()
82            zFilter = None
83
84        # read lexicon
85        if isinstance(src, str):
86            iterable = readFile(src)
87        else:
88            iterable = src
89        for sFlex, sStem, sTag in iterable:
90            if not zFilter or zFilter.search(sTag):
91                self.a2grams.update(st.getNgrams(sFlex))
92                addWordToCharDict(sFlex)
93                # chars
94                for c in sFlex:
95                    if c not in dChar:
96                        dChar[c] = nChar
97                        lChar.append(c)
98                        nChar += 1
99                    dCharOccur[c] = dCharOccur.get(c, 0) + 1
100                # affixes to find stem from flexion
101                sAff = funcStemmingGen(sFlex, sStem)
102                if sAff not in dAff:
103                    dAff[sAff] = nAff
104                    lAff.append(sAff)
105                    nAff += 1
106                dAffOccur[sAff] = dAffOccur.get(sAff, 0) + 1
107                # tags
108                if sTag not in dTag:
109                    dTag[sTag] = nTag
110                    lTag.append(sTag)
111                    nTag += 1
112                dTagOccur[sTag] = dTagOccur.get(sTag, 0) + 1
113                aEntry.add((sFlex, dAff[sAff], dTag[sTag]))
114        if not aEntry:
115            raise ValueError("# Error. Empty lexicon")
116
117        # Preparing DAWG
118        print(" > Preparing list of words")
119        print(" Filter: " + (sSelectFilterRegex or "[None]"))
120        lVal = lChar + lAff + lTag
121        lWord = [ [dChar[c] for c in sFlex] + [iAff+nChar] + [iTag+nChar+nAff]  for sFlex, iAff, iTag in aEntry ]
122        aEntry = None
123
124        # Dictionary of arc values occurrency, to sort arcs of each node
125        dValOccur = dict( [ (dChar[c], dCharOccur[c])  for c in dChar ] \
126                        + [ (dAff[aff]+nChar, dAffOccur[aff]) for aff in dAff ] \
127                        + [ (dTag[tag]+nChar+nAff, dTagOccur[tag]) for tag in dTag ] )
128
129        self.sFileName = src  if isinstance(src, str)  else "[None]"
130        self.sLangCode = sLangCode
131        self.sLangName = sLangName
132        self.sDicName = sDicName
133        self.sDescription = sDescription
134        if dLexiconData:
135            self.sLangCode = dLexiconData.get("LangCode", self.sLangCode)
136            self.sLangName = dLexiconData.get("LangName", self.sLangName)
137            self.sDicName = dLexiconData.get("DicName", self.sDicName)
138            self.sDescription = dLexiconData.get("Description", self.sDescription)
139        self.nEntry = len(lWord)
140        self.aPreviousEntry = []
141        DawgNode.resetNextId()
142        self.oRoot = DawgNode()
143        self.lUncheckedNodes = []  # list of nodes that have not been checked for duplication.
144        self.lMinimizedNodes = {}  # list of unique nodes that have been checked for duplication.
145        self.lSortedNodes = []     # version 2 and 3
146        self.nNode = 0
147        self.nArc = 0
148        self.dChar = dChar
149        self.nChar = len(dChar)
150        self.nAff = nAff
151        self.lArcVal = lVal
152        self.nArcVal = len(lVal)
153        self.nTag = self.nArcVal - self.nChar - nAff
154        self.cStemming = cStemming
155        if cStemming == "A":
156            self.funcStemming = st.changeWordWithAffixCode
157        elif cStemming == "S":
158            self.funcStemming = st.changeWordWithSuffixCode
159        else:
160            self.funcStemming = st.noStemming
161
162        # calculated later
163        self.nBytesNodeAddress = 1
164        self.nBytesArc = 0
165        self.nBytesOffset = 0
166        self.nMaxOffset = 0
167
168        # build
169        lWord.sort()
170        oProgBar = ProgressBar(0, len(lWord))
171        for aEntry in lWord:
172            self.insert(aEntry)
173            oProgBar.increment(1)
174        oProgBar.done()
175        self.finish()
176        self.countNodes()
177        self.countArcs()
178        self.sortNodes()         # version 2 and 3
179        self.sortNodeArcs(dValOccur)
180        #self.sortNodeArcs2 (self.oRoot, "")
181        self.displayInfo()
182
183    # BUILD DAWG
184    def insert (self, aEntry):
185        "insert a new entry (insertion must be made in alphabetical order)."
186        if aEntry < self.aPreviousEntry:
187            sys.exit("# Error: Words must be inserted in alphabetical order.")
188
189        # find common prefix between word and previous word
190        nCommonPrefix = 0
191        for i in range(min(len(aEntry), len(self.aPreviousEntry))):
192            if aEntry[i] != self.aPreviousEntry[i]:
193                break
194            nCommonPrefix += 1
195
196        # Check the lUncheckedNodes for redundant nodes, proceeding from last
197        # one down to the common prefix size. Then truncate the list at that point.
198        self._minimize(nCommonPrefix)
199
200        # add the suffix, starting from the correct node mid-way through the graph
201        if not self.lUncheckedNodes:
202            oNode = self.oRoot
203        else:
204            oNode = self.lUncheckedNodes[-1][2]
205
206        iChar = nCommonPrefix
207        for c in aEntry[nCommonPrefix:]:
208            oNextNode = DawgNode()
209            oNode.arcs[c] = oNextNode
210            self.lUncheckedNodes.append((oNode, c, oNextNode))
211            if iChar == (len(aEntry) - 2):
212                oNode.final = True
213            iChar += 1
214            oNode = oNextNode
215        oNode.final = True
216        self.aPreviousEntry = aEntry
217
218    def finish (self):
219        "minimize unchecked nodes"
220        self._minimize(0)
221
222    def _minimize (self, downTo):
223        # proceed from the leaf up to a certain point
224        for i in range( len(self.lUncheckedNodes)-1, downTo-1, -1 ):
225            oNode, char, oChildNode = self.lUncheckedNodes[i]
226            if oChildNode in self.lMinimizedNodes:
227                # replace the child with the previously encountered one
228                oNode.arcs[char] = self.lMinimizedNodes[oChildNode]
229            else:
230                # add the state to the minimized nodes.
231                self.lMinimizedNodes[oChildNode] = oChildNode
232            self.lUncheckedNodes.pop()
233
234    def countNodes (self):
235        "count the number of nodes of the whole word graph"
236        self.nNode = len(self.lMinimizedNodes)
237
238    def countArcs (self):
239        "count the number of arcs in the whole word graph"
240        self.nArc = len(self.oRoot.arcs)
241        for oNode in self.lMinimizedNodes:
242            self.nArc += len(oNode.arcs)
243
244    def sortNodeArcs (self, dValOccur):
245        "sort arcs of each node according to <dValOccur>"
246        print(" > Sort node arcs")
247        self.oRoot.sortArcs(dValOccur)
248        for oNode in self.lMinimizedNodes:
249            oNode.sortArcs(dValOccur)
250
251    def sortNodeArcs2 (self, oNode, cPrevious=""):
252        "sort arcs of each node depending on the previous char"
253        # recursive function
254        dCharOccur = getCharOrderAfterChar(cPrevious)
255        if dCharOccur:
256            oNode.sortArcs2(dCharOccur, self.lArcVal)
257        for nArcVal, oNextNode in oNode.arcs.items():
258            self.sortNodeArcs2(oNextNode, self.lArcVal[nArcVal])
259
260    def sortNodes (self):
261        "sort nodes"
262        print(" > Sort nodes")
263        for oNode in self.oRoot.arcs.values():
264            self._parseNodes(oNode)
265
266    def _parseNodes (self, oNode):
267        # Warning: recursive method
268        if oNode.pos > 0:
269            return
270        oNode.setPos()
271        self.lSortedNodes.append(oNode)
272        for oNextNode in oNode.arcs.values():
273            self._parseNodes(oNextNode)
274
275    def lookup (self, sWord):
276        "return True if <sWord> is within the word graph (debugging)"
277        oNode = self.oRoot
278        for c in sWord:
279            if self.dChar.get(c, '') not in oNode.arcs:
280                return False
281            oNode = oNode.arcs[self.dChar[c]]
282        return oNode.final
283
284    def morph (self, sWord):
285        "return a string of the morphologies of <sWord> (debugging)"
286        oNode = self.oRoot
287        for c in sWord:
288            if self.dChar.get(c, '') not in oNode.arcs:
289                return ''
290            oNode = oNode.arcs[self.dChar[c]]
291        if oNode.final:
292            s = "* "
293            for arc in oNode.arcs:
294                if arc >= self.nChar:
295                    s += " [" + self.funcStemming(sWord, self.lArcVal[arc])
296                    oNode2 = oNode.arcs[arc]
297                    for arc2 in oNode2.arcs:
298                        s += " / " + self.lArcVal[arc2]
299                    s += "]"
300            return s
301        return ''
302
303    def displayInfo (self):
304        "display informations about the word graph"
305        print(" * {:<12} {:>16,}".format("Entries:", self.nEntry))
306        print(" * {:<12} {:>16,}".format("Characters:", self.nChar))
307        print(" * {:<12} {:>16,}".format("Affixes:", self.nAff))
308        print(" * {:<12} {:>16,}".format("Tags:", self.nTag))
309        print(" * {:<12} {:>16,}".format("Arc values:", self.nArcVal))
310        print(" * {:<12} {:>16,}".format("Nodes:", self.nNode))
311        print(" * {:<12} {:>16,}".format("Arcs:", self.nArc))
312        print(" * {:<12} {:>16,}".format("2grams:", len(self.a2grams)))
313        print(" * {:<12} {:>16}".format("Stemming:", self.cStemming + "FX"))
314
315    def getArcStats (self):
316        "return a string with statistics about nodes and arcs"
317        d = {}
318        for oNode in self.lMinimizedNodes:
319            n = len(oNode.arcs)
320            d[n] = d.get(n, 0) + 1
321        s = " * Nodes:\n"
322        for n in d:
323            s = s + " {:>9} nodes have {:>3} arcs\n".format(d[n], n)
324        return s
325
326    def writeInfo (self, sPathFile):
327        "write informations in file <sPathFile>"
328        print(" > Write informations")
329        with open(sPathFile, 'w', encoding='utf-8', newline="\n") as hDst:
330            hDst.write(self.getArcStats())
331            hDst.write("\n * Values:\n")
332            for i, s in enumerate(self.lArcVal):
333                hDst.write(" {:>6}. {}\n".format(i, s))
334            hDst.close()
335
336    def select (self, sPattern=""):
337        "generator: returns all entries which morphology fits <sPattern>"
338        zPattern = None
339        if sPattern:
340            try:
341                zPattern = re.compile(sPattern)
342            except re.error:
343                print("# Error in regex pattern")
344                traceback.print_exc()
345        yield from self._select(zPattern, self.oRoot, "")
346
347    def _select (self, zPattern, oNode, sWord):
348        # recursive generator
349        for nVal, oNextNode in oNode.arcs.items():
350            if nVal <= self.nChar:
351                # simple character
352                yield from self._select(zPattern, oNextNode, sWord + self.lArcVal[nVal])
353            else:
354                sEntry = sWord + "\t" + self.funcStemming(sWord, self.lArcVal[nVal])
355                for nMorphVal, _ in oNextNode.arcs.items():
356                    if not zPattern or zPattern.search(self.lArcVal[nMorphVal]):
357                        yield sEntry + "\t" + self.lArcVal[nMorphVal]
358
359
360    # BINARY CONVERSION
361    def _calculateBinary (self, nCompressionMethod):
362        print(" > Write DAWG as an indexable binary dictionary [method: %d]" % nCompressionMethod)
363        if nCompressionMethod == 1:
364            self.nBytesArc = ( (self.nArcVal.bit_length() + 2) // 8 ) + 1   # We add 2 bits. See DawgNode.convToBytes1()
365            self.nBytesOffset = 0
366            self._calcNumBytesNodeAddress()
367            self._calcNodesAddress1()
368        elif nCompressionMethod == 2:
369            self.nBytesArc = ( (self.nArcVal.bit_length() + 3) // 8 ) + 1   # We add 3 bits. See DawgNode.convToBytes2()
370            self.nBytesOffset = 0
371            self._calcNumBytesNodeAddress()
372            self._calcNodesAddress2()
373        elif nCompressionMethod == 3:
374            self.nBytesArc = ( (self.nArcVal.bit_length() + 3) // 8 ) + 1   # We add 3 bits. See DawgNode.convToBytes3()
375            self.nBytesOffset = 1
376            self.nMaxOffset = (2 ** (self.nBytesOffset * 8)) - 1
377            self._calcNumBytesNodeAddress()
378            self._calcNodesAddress3()
379        else:
380            print(" # Error: unknown compression method")
381        print("   Arc values (chars, affixes and tags): {}  ->  {} bytes".format( self.nArcVal, len("\t".join(self.lArcVal).encode("utf-8")) ))
382        print("   Arc size: {} bytes, Address size: {} bytes   ->   {} * {} = {} bytes".format( self.nBytesArc, self.nBytesNodeAddress, \
383                                                                                                self.nBytesArc+self.nBytesNodeAddress, self.nArc, \
384                                                                                                (self.nBytesArc+self.nBytesNodeAddress)*self.nArc ))
385
386    def _calcNumBytesNodeAddress (self):
387        "how many bytes needed to store all nodes/arcs in the binary dictionary"
388        self.nBytesNodeAddress = 1
389        while ((self.nBytesArc + self.nBytesNodeAddress) * self.nArc) > (2 ** (self.nBytesNodeAddress * 8)):
390            self.nBytesNodeAddress += 1
391
392    def _calcNodesAddress1 (self):
393        nBytesNode = self.nBytesArc + self.nBytesNodeAddress
394        iAddr = len(self.oRoot.arcs) * nBytesNode
395        for oNode in self.lMinimizedNodes:
396            oNode.addr = iAddr
397            iAddr += max(len(oNode.arcs), 1) * nBytesNode
398
399    def _calcNodesAddress2 (self):
400        nBytesNode = self.nBytesArc + self.nBytesNodeAddress
401        iAddr = len(self.oRoot.arcs) * nBytesNode
402        for oNode in self.lSortedNodes:
403            oNode.addr = iAddr
404            iAddr += max(len(oNode.arcs), 1) * nBytesNode
405            for oNextNode in oNode.arcs.values():
406                if (oNode.pos + 1) == oNextNode.pos:
407                    iAddr -= self.nBytesNodeAddress
408                    #break
409
410    def _calcNodesAddress3 (self):
411        nBytesNode = self.nBytesArc + self.nBytesNodeAddress
412        # theorical nodes size if only addresses and no offset
413        self.oRoot.size = len(self.oRoot.arcs) * nBytesNode
414        for oNode in self.lSortedNodes:
415            oNode.size = max(len(oNode.arcs), 1) * nBytesNode
416        # rewind and calculate dropdown from the end, several times
417        nDiff = self.nBytesNodeAddress - self.nBytesOffset
418        bEnd = False
419        while not bEnd:
420            bEnd = True
421            # recalculate addresses
422            iAddr = self.oRoot.size
423            for oNode in self.lSortedNodes:
424                oNode.addr = iAddr
425                iAddr += oNode.size
426            # rewind and calculate dropdown from the end, several times
427            for i in range(self.nNode-1, -1, -1):
428                nSize = max(len(self.lSortedNodes[i].arcs), 1) * nBytesNode
429                for oNextNode in self.lSortedNodes[i].arcs.values():
430                    if 1 < (oNextNode.addr - self.lSortedNodes[i].addr) < self.nMaxOffset:
431                        nSize -= nDiff
432                if self.lSortedNodes[i].size != nSize:
433                    self.lSortedNodes[i].size = nSize
434                    bEnd = False
435
436    def getBinaryAsJSON (self, nCompressionMethod=1, bBinaryDictAsHexString=True):
437        "return a JSON string containing all necessary data of the dictionary (compressed as a binary string)"
438        self._calculateBinary(nCompressionMethod)
439        byDic = b""
440        if nCompressionMethod == 1:
441            byDic = self.oRoot.convToBytes1(self.nBytesArc, self.nBytesNodeAddress)
442            for oNode in self.lMinimizedNodes:
443                byDic += oNode.convToBytes1(self.nBytesArc, self.nBytesNodeAddress)
444        elif nCompressionMethod == 2:
445            byDic = self.oRoot.convToBytes2(self.nBytesArc, self.nBytesNodeAddress)
446            for oNode in self.lSortedNodes:
447                byDic += oNode.convToBytes2(self.nBytesArc, self.nBytesNodeAddress)
448        elif nCompressionMethod == 3:
449            byDic = self.oRoot.convToBytes3(self.nBytesArc, self.nBytesNodeAddress, self.nBytesOffset)
450            for oNode in self.lSortedNodes:
451                byDic += oNode.convToBytes3(self.nBytesArc, self.nBytesNodeAddress, self.nBytesOffset)
452        return {
453            "sHeader": "/grammalecte-fsa/",
454            "sLangCode": self.sLangCode,
455            "sLangName": self.sLangName,
456            "sDicName": self.sDicName,
457            "sDescription": self.sDescription,
458            "sFileName": self.sFileName,
459            "sDate": self._getDate(),
460            "nEntry": self.nEntry,
461            "nChar": self.nChar,
462            "nAff": self.nAff,
463            "nTag": self.nTag,
464            "cStemming": self.cStemming,
465            "dChar": self.dChar,
466            "nNode": self.nNode,
467            "nArc": self.nArc,
468            "nArcVal": self.nArcVal,
469            "lArcVal": self.lArcVal,
470            "nCompressionMethod": nCompressionMethod,
471            "nBytesArc": self.nBytesArc,
472            "nBytesNodeAddress": self.nBytesNodeAddress,
473            "nBytesOffset": self.nBytesOffset,
474            # Mozilla’s JS parser don’t like file bigger than 4 Mb!
475            # So, if necessary, we use an hexadecimal string, that we will convert later in Firefox’s extension.
476            # https://github.com/mozilla/addons-linter/issues/1361
477            "sByDic": byDic.hex()  if bBinaryDictAsHexString  else [ e  for e in byDic ],
478            "l2grams": list(self.a2grams)
479        }
480
481    def writeAsJSObject (self, spfDst, nCompressionMethod, bInJSModule=False, bBinaryDictAsHexString=True):
482        "write a file (JSON or JS module) with all the necessary data"
483        if not spfDst.endswith(".json"):
484            spfDst += "."+str(nCompressionMethod)+".json"
485        with open(spfDst, "w", encoding="utf-8", newline="\n") as hDst:
486            if bInJSModule:
487                hDst.write('// JavaScript\n// Generated data (do not edit)\n\n"use strict";\n\nconst dictionary = ')
488            hDst.write( json.dumps(self.getBinaryAsJSON(nCompressionMethod, bBinaryDictAsHexString), ensure_ascii=False) )
489            if bInJSModule:
490                hDst.write(";\n\nexports.dictionary = dictionary;\n")
491
492    def writeBinary (self, sPathFile, nCompressionMethod, bDebug=False):
493        """
494        Save as a binary file.
495
496        Format of the binary indexable dictionary:
497        Each section is separated with 4 bytes of \0
498
499        - Section Header:
500            /grammalecte-fsa/[compression method]
501                * compression method is an ASCII string
502
503        - Section Informations:
504            /[lang code]
505            /[lang name]
506            /[dictionary name]
507            /[date creation]
508            /[number of chars]
509            /[number of bytes for each arc]
510            /[number of bytes for each address node]
511            /[number of entries]
512            /[number of nodes]
513            /[number of arcs]
514            /[number of affixes]
515                * each field is a ASCII string
516            /[stemming code]
517                * "S" means stems are generated by /suffix_code/,
518                  "A" means they are generated by /affix_code/
519                  See defineSuffixCode() and defineAffixCode() for details.
520                  "N" means no stemming
521
522        - Section Values:
523                * a list of strings encoded in binary from utf-8, each value separated with a tabulation
524
525        - Section Word Graph (nodes / arcs)
526                * A list of nodes which are a list of arcs with an address of the next node.
527                  See DawgNode.convToBytes() for details.
528
529        - Section 2grams:
530                * A list of 2grams (as strings: 2 chars) encoded in binary from utf-8, each value separated with a tabulation
531        """
532        self._calculateBinary(nCompressionMethod)
533        if not sPathFile.endswith(".bdic"):
534            sPathFile += "."+str(nCompressionMethod)+".bdic"
535        with open(sPathFile, 'wb') as hDst:
536            # header
537            hDst.write("/grammalecte-fsa/{}/".format(nCompressionMethod).encode("utf-8"))
538            hDst.write(b"\0\0\0\0")
539            # infos
540            sInfo = "{}//{}//{}//{}//{}//{}//{}//{}//{}//{}//{}//{}//{}".format(self.sLangCode, self.sLangName, self.sDicName, self.sDescription, self._getDate(), \
541                                                                                self.nChar, self.nBytesArc, self.nBytesNodeAddress, \
542                                                                                self.nEntry, self.nNode, self.nArc, self.nAff, self.cStemming)
543            hDst.write(sInfo.encode("utf-8"))
544            hDst.write(b"\0\0\0\0")
545            # lArcVal
546            hDst.write("\t".join(self.lArcVal).encode("utf-8"))
547            hDst.write(b"\0\0\0\0")
548            # 2grams
549            hDst.write("\t".join(self.a2grams).encode("utf-8"))
550            hDst.write(b"\0\0\0\0")
551            # DAWG: nodes / arcs
552            if nCompressionMethod == 1:
553                hDst.write(self.oRoot.convToBytes1(self.nBytesArc, self.nBytesNodeAddress))
554                for oNode in self.lMinimizedNodes:
555                    hDst.write(oNode.convToBytes1(self.nBytesArc, self.nBytesNodeAddress))
556            elif nCompressionMethod == 2:
557                hDst.write(self.oRoot.convToBytes2(self.nBytesArc, self.nBytesNodeAddress))
558                for oNode in self.lSortedNodes:
559                    hDst.write(oNode.convToBytes2(self.nBytesArc, self.nBytesNodeAddress))
560            elif nCompressionMethod == 3:
561                hDst.write(self.oRoot.convToBytes3(self.nBytesArc, self.nBytesNodeAddress, self.nBytesOffset))
562                for oNode in self.lSortedNodes:
563                    hDst.write(oNode.convToBytes3(self.nBytesArc, self.nBytesNodeAddress, self.nBytesOffset))
564        if bDebug:
565            self._writeNodes(sPathFile, nCompressionMethod)
566
567    def _getDate (self):
568        return time.strftime("%Y-%m-%d %H:%M:%S")
569
570    def _writeNodes (self, sPathFile, nCompressionMethod):
571        "for debugging only"
572        print(" > Write nodes")
573        with open(sPathFile+".nodes."+str(nCompressionMethod)+".txt", 'w', encoding='utf-8', newline="\n") as hDst:
574            if nCompressionMethod == 1:
575                hDst.write(self.oRoot.getTxtRepr1(self.nBytesArc, self.lArcVal)+"\n")
576                #hDst.write( ''.join( [ "%02X " %  z  for z in self.oRoot.convToBytes1(self.nBytesArc, self.nBytesNodeAddress) ] ).strip() )
577                for oNode in self.lMinimizedNodes:
578                    hDst.write(oNode.getTxtRepr1(self.nBytesArc, self.lArcVal)+"\n")
579            if nCompressionMethod == 2:
580                hDst.write(self.oRoot.getTxtRepr2(self.nBytesArc, self.lArcVal)+"\n")
581                for oNode in self.lSortedNodes:
582                    hDst.write(oNode.getTxtRepr2(self.nBytesArc, self.lArcVal)+"\n")
583            if nCompressionMethod == 3:
584                hDst.write(self.oRoot.getTxtRepr3(self.nBytesArc, self.nBytesOffset, self.lArcVal)+"\n")
585                #hDst.write( ''.join( [ "%02X " %  z  for z in self.oRoot.convToBytes3(self.nBytesArc, self.nBytesNodeAddress, self.nBytesOffset) ] ).strip() )
586                for oNode in self.lSortedNodes:
587                    hDst.write(oNode.getTxtRepr3(self.nBytesArc, self.nBytesOffset, self.lArcVal)+"\n")
588
589
590
591class DawgNode:
592    """Node of the word graph"""
593
594    NextId = 0
595    NextPos = 1 # (version 2)
596
597    def __init__ (self):
598        self.i = DawgNode.NextId
599        DawgNode.NextId += 1
600        self.final = False
601        self.arcs = {}          # key: arc value; value: a node
602        self.addr = 0           # address in the binary dictionary
603        self.pos = 0            # position in the binary dictionary (version 2)
604        self.size = 0           # size of node in bytes (version 3)
605
606    @classmethod
607    def resetNextId (cls):
608        "set NextId to 0 "
609        cls.NextId = 0
610
611    def setPos (self): # version 2
612        "define a position for node (version 2)"
613        self.pos = DawgNode.NextPos
614        DawgNode.NextPos += 1
615
616    def __str__ (self):
617        s = "Node " + str(self.i) + " @ " + str(self.addr) + (" [final]" if self.final else "") + "\n"
618        for arc, node in self.arcs.items():
619            s += "  " +str(arc)
620            s += " > " + str(node.i)
621            s += " @ " + str(node.addr) + "\n"
622        return s
623
624    def __repr__ (self):
625        # Caution! this function is used for hashing and comparison!
626        sFinalChar = "1"  if self.final  else "0"
627        l = [sFinalChar]
628        for arc, node in self.arcs.items():
629            l.append(str(arc))
630            l.append(str(node.i))
631        return "_".join(l)
632
633    def __hash__ (self):
634        # Used as a key in a python dictionary.
635        return self.__repr__().__hash__()
636
637    def __eq__ (self, other):
638        # Used as a key in a python dictionary.
639        # Nodes are equivalent if they have identical arcs, and each identical arc leads to identical states.
640        return self.__repr__() == other.__repr__()
641
642    def sortArcs (self, dValOccur):
643        "sort arcs of node according to <dValOccur>"
644        self.arcs = collections.OrderedDict(sorted(self.arcs.items(), key=lambda t: dValOccur.get(t[0], 0), reverse=True))
645
646    def sortArcs2 (self, dValOccur, lArcVal):
647        "sort arcs of each node depending on the previous char"
648        self.arcs = collections.OrderedDict(sorted(self.arcs.items(), key=lambda t: dValOccur.get(lArcVal[t[0]], 0), reverse=True))
649
650    # VERSION 1 =====================================================================================================
651    def convToBytes1 (self, nBytesArc, nBytesNodeAddress):
652        """
653        Convert to bytes (method 1).
654
655        Node scheme:
656        - Arc length is defined by nBytesArc
657        - Address length is defined by nBytesNodeAddress
658
659        |                Arc                |                         Address of next node                          |
660        |                                   |                                                                       |
661         ┏━━━━━━━━━━━━━━━┓ ┏━━━━━━━━━━━━━━━┓ ┏━━━━━━━━━━━━━━━┓ ┏━━━━━━━━━━━━━━━┓ ┏━━━━━━━━━━━━━━━┓ ┏━━━━━━━━━━━━━━━┓
662         ┃ ┃ ┃ ┃ ┃ ┃ ┃ ┃ ┃ ┃ ┃ ┃ ┃ ┃ ┃ ┃ ┃ ┃ ┃ ┃ ┃ ┃ ┃ ┃ ┃ ┃ ┃ ┃ ┃ ┃ ┃ ┃ ┃ ┃ ┃ ┃ ┃ ┃ ┃ ┃ ┃ ┃ ┃ ┃ ┃ ┃ ┃ ┃ ┃ ┃ ┃ ┃ ┃ ┃
663         ┗━━━━━━━━━━━━━━━┛ ┗━━━━━━━━━━━━━━━┛ ┗━━━━━━━━━━━━━━━┛ ┗━━━━━━━━━━━━━━━┛ ┗━━━━━━━━━━━━━━━┛ ┗━━━━━━━━━━━━━━━┛
664         [...]
665         ┏━━━━━━━━━━━━━━━┓ ┏━━━━━━━━━━━━━━━┓ ┏━━━━━━━━━━━━━━━┓ ┏━━━━━━━━━━━━━━━┓ ┏━━━━━━━━━━━━━━━┓ ┏━━━━━━━━━━━━━━━┓
666         ┃ ┃ ┃ ┃ ┃ ┃ ┃ ┃ ┃ ┃ ┃ ┃ ┃ ┃ ┃ ┃ ┃ ┃ ┃ ┃ ┃ ┃ ┃ ┃ ┃ ┃ ┃ ┃ ┃ ┃ ┃ ┃ ┃ ┃ ┃ ┃ ┃ ┃ ┃ ┃ ┃ ┃ ┃ ┃ ┃ ┃ ┃ ┃ ┃ ┃ ┃ ┃ ┃ ┃
667         ┗━━━━━━━━━━━━━━━┛ ┗━━━━━━━━━━━━━━━┛ ┗━━━━━━━━━━━━━━━┛ ┗━━━━━━━━━━━━━━━┛ ┗━━━━━━━━━━━━━━━┛ ┗━━━━━━━━━━━━━━━┛
668          ^ ^
669          ┃ ┃
670          ┃ ┃
671          ┃ ┗━━━ if 1, last arc of this node
672          ┗━━━━━ if 1, this node is final (only on the first arc)
673        """
674        nArc = len(self.arcs)
675        nFinalNodeMask = 1 << ((nBytesArc*8)-1)
676        nFinalArcMask = 1 << ((nBytesArc*8)-2)
677        if not nArc:
678            val = nFinalNodeMask | nFinalArcMask
679            by = val.to_bytes(nBytesArc, byteorder='big')
680            by += (0).to_bytes(nBytesNodeAddress, byteorder='big')
681            return by
682        by = b""
683        for i, arc in enumerate(self.arcs, 1):
684            val = arc
685            if i == 1 and self.final:
686                val = val | nFinalNodeMask
687            if i == nArc:
688                val = val | nFinalArcMask
689            by += val.to_bytes(nBytesArc, byteorder='big')
690            by += self.arcs[arc].addr.to_bytes(nBytesNodeAddress, byteorder='big')
691        return by
692
693    def getTxtRepr1 (self, nBytesArc, lVal):
694        "return representation as string of node (method 1)"
695        nArc = len(self.arcs)
696        nFinalNodeMask = 1 << ((nBytesArc*8)-1)
697        nFinalArcMask = 1 << ((nBytesArc*8)-2)
698        s = "i{:_>10} -- #{:_>10}\n".format(self.i, self.addr)
699        if not nArc:
700            s += "  {:<20}  {:0>16}  i{:_>10}   #{:_>10}\n".format("", bin(nFinalNodeMask | nFinalArcMask)[2:], "0", "0")
701            return s
702        for i, arc in enumerate(self.arcs, 1):
703            val = arc
704            if i == 1 and self.final:
705                val = val | nFinalNodeMask
706            if i == nArc:
707                val = val | nFinalArcMask
708            s += "  {:<20}  {:0>16}  i{:_>10}   #{:_>10}\n".format(lVal[arc], bin(val)[2:], self.arcs[arc].i, self.arcs[arc].addr)
709        return s
710
711    # VERSION 2 =====================================================================================================
712    def convToBytes2 (self, nBytesArc, nBytesNodeAddress):
713        """
714        Convert to bytes (method 2).
715
716        Node scheme:
717        - Arc length is defined by nBytesArc
718        - Address length is defined by nBytesNodeAddress
719
720        |                Arc                |                         Address of next node                          |
721        |                                   |                                                                       |
722         ┏━━━━━━━━━━━━━━━┓ ┏━━━━━━━━━━━━━━━┓ ┏━━━━━━━━━━━━━━━┓ ┏━━━━━━━━━━━━━━━┓ ┏━━━━━━━━━━━━━━━┓ ┏━━━━━━━━━━━━━━━┓
723         ┃ ┃ ┃ ┃ ┃ ┃ ┃ ┃ ┃ ┃ ┃ ┃ ┃ ┃ ┃ ┃ ┃ ┃ ┃ ┃ ┃ ┃ ┃ ┃ ┃ ┃ ┃ ┃ ┃ ┃ ┃ ┃ ┃ ┃ ┃ ┃ ┃ ┃ ┃ ┃ ┃ ┃ ┃ ┃ ┃ ┃ ┃ ┃ ┃ ┃ ┃ ┃ ┃ ┃
724         ┗━━━━━━━━━━━━━━━┛ ┗━━━━━━━━━━━━━━━┛ ┗━━━━━━━━━━━━━━━┛ ┗━━━━━━━━━━━━━━━┛ ┗━━━━━━━━━━━━━━━┛ ┗━━━━━━━━━━━━━━━┛
725         [...]
726         ┏━━━━━━━━━━━━━━━┓ ┏━━━━━━━━━━━━━━━┓ ┏━━━━━━━━━━━━━━━┓ ┏━━━━━━━━━━━━━━━┓ ┏━━━━━━━━━━━━━━━┓ ┏━━━━━━━━━━━━━━━┓
727         ┃ ┃ ┃ ┃ ┃ ┃ ┃ ┃ ┃ ┃ ┃ ┃ ┃ ┃ ┃ ┃ ┃ ┃ ┃ ┃ ┃ ┃ ┃ ┃ ┃ ┃ ┃ ┃ ┃ ┃ ┃ ┃ ┃ ┃ ┃ ┃ ┃ ┃ ┃ ┃ ┃ ┃ ┃ ┃ ┃ ┃ ┃ ┃ ┃ ┃ ┃ ┃ ┃ ┃
728         ┗━━━━━━━━━━━━━━━┛ ┗━━━━━━━━━━━━━━━┛ ┗━━━━━━━━━━━━━━━┛ ┗━━━━━━━━━━━━━━━┛ ┗━━━━━━━━━━━━━━━┛ ┗━━━━━━━━━━━━━━━┛
729          ^ ^ ^
730          ┃ ┃ ┃
731          ┃ ┃ ┗━━ if 1, caution, no address: next node is the following node
732          ┃ ┗━━━━ if 1, last arc of this node
733          ┗━━━━━━ if 1, this node is final (only on the first arc)
734        """
735        nArc = len(self.arcs)
736        nFinalNodeMask = 1 << ((nBytesArc*8)-1)
737        nFinalArcMask = 1 << ((nBytesArc*8)-2)
738        nNextNodeMask = 1 << ((nBytesArc*8)-3)
739        if not nArc:
740            val = nFinalNodeMask | nFinalArcMask
741            by = val.to_bytes(nBytesArc, byteorder='big')
742            by += (0).to_bytes(nBytesNodeAddress, byteorder='big')
743            return by
744        by = b""
745        for i, arc in enumerate(self.arcs, 1):
746            val = arc
747            if i == 1 and self.final:
748                val = val | nFinalNodeMask
749            if i == nArc:
750                val = val | nFinalArcMask
751            if (self.pos + 1) == self.arcs[arc].pos and self.i != 0:
752                val = val | nNextNodeMask
753                by += val.to_bytes(nBytesArc, byteorder='big')
754            else:
755                by += val.to_bytes(nBytesArc, byteorder='big')
756                by += self.arcs[arc].addr.to_bytes(nBytesNodeAddress, byteorder='big')
757        return by
758
759    def getTxtRepr2 (self, nBytesArc, lVal):
760        "return representation as string of node (method 2)"
761        nArc = len(self.arcs)
762        nFinalNodeMask = 1 << ((nBytesArc*8)-1)
763        nFinalArcMask = 1 << ((nBytesArc*8)-2)
764        nNextNodeMask = 1 << ((nBytesArc*8)-3)
765        s = "i{:_>10} -- #{:_>10}\n".format(self.i, self.addr)
766        if not nArc:
767            s += "  {:<20}  {:0>16}  i{:_>10}   #{:_>10}\n".format("", bin(nFinalNodeMask | nFinalArcMask)[2:], "0", "0")
768            return s
769        for i, arc in enumerate(self.arcs, 1):
770            val = arc
771            if i == 1 and self.final:
772                val = val | nFinalNodeMask
773            if i == nArc:
774                val = val | nFinalArcMask
775            if (self.pos + 1) == self.arcs[arc].pos  and self.i != 0:
776                val = val | nNextNodeMask
777                s += "  {:<20}  {:0>16}\n".format(lVal[arc], bin(val)[2:])
778            else:
779                s += "  {:<20}  {:0>16}  i{:_>10}   #{:_>10}\n".format(lVal[arc], bin(val)[2:], self.arcs[arc].i, self.arcs[arc].addr)
780        return s
781
782    # VERSION 3 =====================================================================================================
783    def convToBytes3 (self, nBytesArc, nBytesNodeAddress, nBytesOffset):
784        """
785        Convert to bytes (method 3).
786
787        Node scheme:
788        - Arc length is defined by nBytesArc
789        - Address length is defined by nBytesNodeAddress
790        - Offset length is defined by nBytesOffset
791
792        |                Arc                |            Address of next node  or  offset to next node              |
793        |                                   |                                                                       |
794         ┏━━━━━━━━━━━━━━━┓ ┏━━━━━━━━━━━━━━━┓ ┏━━━━━━━━━━━━━━━┓ ┏━━━━━━━━━━━━━━━┓ ┏━━━━━━━━━━━━━━━┓ ┏━━━━━━━━━━━━━━━┓
795         ┃1┃0┃0┃ ┃ ┃ ┃ ┃ ┃ ┃ ┃ ┃ ┃ ┃ ┃ ┃ ┃ ┃ ┃ ┃ ┃ ┃ ┃ ┃ ┃ ┃ ┃ ┃ ┃ ┃ ┃ ┃ ┃ ┃ ┃ ┃ ┃ ┃ ┃ ┃ ┃ ┃ ┃ ┃ ┃ ┃ ┃ ┃ ┃ ┃ ┃ ┃ ┃ ┃
796         ┗━━━━━━━━━━━━━━━┛ ┗━━━━━━━━━━━━━━━┛ ┗━━━━━━━━━━━━━━━┛ ┗━━━━━━━━━━━━━━━┛ ┗━━━━━━━━━━━━━━━┛ ┗━━━━━━━━━━━━━━━┛
797         [...]
798         ┏━━━━━━━━━━━━━━━┓ ┏━━━━━━━━━━━━━━━┓ ┏━━━━━━━━━━━━━━━┓
799         ┃0┃0┃1┃ ┃ ┃ ┃ ┃ ┃ ┃ ┃ ┃ ┃ ┃ ┃ ┃ ┃ ┃ ┃ ┃ ┃ ┃ ┃ ┃ ┃ ┃ ┃     Offsets are shorter than addresses
800         ┗━━━━━━━━━━━━━━━┛ ┗━━━━━━━━━━━━━━━┛ ┗━━━━━━━━━━━━━━━┛
801         ┏━━━━━━━━━━━━━━━┓ ┏━━━━━━━━━━━━━━━┓ ┏━━━━━━━━━━━━━━━┓ ┏━━━━━━━━━━━━━━━┓ ┏━━━━━━━━━━━━━━━┓ ┏━━━━━━━━━━━━━━━┓
802         ┃0┃1┃0┃ ┃ ┃ ┃ ┃ ┃ ┃ ┃ ┃ ┃ ┃ ┃ ┃ ┃ ┃ ┃ ┃ ┃ ┃ ┃ ┃ ┃ ┃ ┃ ┃ ┃ ┃ ┃ ┃ ┃ ┃ ┃ ┃ ┃ ┃ ┃ ┃ ┃ ┃ ┃ ┃ ┃ ┃ ┃ ┃ ┃ ┃ ┃ ┃ ┃ ┃
803         ┗━━━━━━━━━━━━━━━┛ ┗━━━━━━━━━━━━━━━┛ ┗━━━━━━━━━━━━━━━┛ ┗━━━━━━━━━━━━━━━┛ ┗━━━━━━━━━━━━━━━┛ ┗━━━━━━━━━━━━━━━┛
804
805          ^ ^ ^
806          ┃ ┃ ┃
807          ┃ ┃ ┗━━ if 1, offset instead of address of next node
808          ┃ ┗━━━━ if 1, last arc of this node
809          ┗━━━━━━ if 1, this node is final (only on the first arc)
810        """
811        nArc = len(self.arcs)
812        nFinalNodeMask = 1 << ((nBytesArc*8)-1)
813        nFinalArcMask = 1 << ((nBytesArc*8)-2)
814        nNextNodeMask = 1 << ((nBytesArc*8)-3)
815        nMaxOffset = (2 ** (nBytesOffset * 8)) - 1
816        if not nArc:
817            val = nFinalNodeMask | nFinalArcMask
818            by = val.to_bytes(nBytesArc, byteorder='big')
819            by += (0).to_bytes(nBytesNodeAddress, byteorder='big')
820            return by
821        by = b""
822        for i, arc in enumerate(self.arcs, 1):
823            val = arc
824            if i == 1 and self.final:
825                val = val | nFinalNodeMask
826            if i == nArc:
827                val = val | nFinalArcMask
828            if 1 < (self.arcs[arc].addr - self.addr) < nMaxOffset and self.i != 0:
829                val = val | nNextNodeMask
830                by += val.to_bytes(nBytesArc, byteorder='big')
831                by += (self.arcs[arc].addr-self.addr).to_bytes(nBytesOffset, byteorder='big')
832            else:
833                by += val.to_bytes(nBytesArc, byteorder='big')
834                by += self.arcs[arc].addr.to_bytes(nBytesNodeAddress, byteorder='big')
835        return by
836
837    def getTxtRepr3 (self, nBytesArc, nBytesOffset, lVal):
838        "return representation as string of node (method 3)"
839        nArc = len(self.arcs)
840        nFinalNodeMask = 1 << ((nBytesArc*8)-1)
841        nFinalArcMask = 1 << ((nBytesArc*8)-2)
842        nNextNodeMask = 1 << ((nBytesArc*8)-3)
843        nMaxOffset = (2 ** (nBytesOffset * 8)) - 1
844        s = "i{:_>10} -- #{:_>10}  ({})\n".format(self.i, self.addr, self.size)
845        if not nArc:
846            s += "  {:<20}  {:0>16}  i{:_>10}   #{:_>10}\n".format("", bin(nFinalNodeMask | nFinalArcMask)[2:], "0", "0")
847            return s
848        for i, arc in enumerate(self.arcs, 1):
849            val = arc
850            if i == 1 and self.final:
851                val = val | nFinalNodeMask
852            if i == nArc:
853                val = val | nFinalArcMask
854            if 1 < (self.arcs[arc].addr - self.addr) < nMaxOffset and self.i != 0:
855                val = val | nNextNodeMask
856                s += "  {:<20}  {:0>16}  i{:_>10}   +{:_>10}\n".format(lVal[arc], bin(val)[2:], self.arcs[arc].i, self.arcs[arc].addr - self.addr)
857            else:
858                s += "  {:<20}  {:0>16}  i{:_>10}   #{:_>10}\n".format(lVal[arc], bin(val)[2:], self.arcs[arc].i, self.arcs[arc].addr)
859        return s
860
861
862
863# Another attempt to sort node arcs
864
865_dCharOrder = {
866    # key: previous char, value: dictionary of chars {c: nValue}
867    "": {}
868}
869
870
871def addWordToCharDict (sWord):
872    "for each character of <sWord>, count how many times it appears after the previous character, and store result in a <_dCharOrder>"
873    cPrevious = ""
874    for cChar in sWord:
875        if cPrevious not in _dCharOrder:
876            _dCharOrder[cPrevious] = {}
877        _dCharOrder[cPrevious][cChar] = _dCharOrder[cPrevious].get(cChar, 0) + 1
878        cPrevious = cChar
879
880
881def getCharOrderAfterChar (cChar):
882    "return a dictionary of chars with number of times it appears after character <cChar>"
883    return _dCharOrder.get(cChar, None)
884
885
886def displayCharOrder ():
887    "display how many times each character appear after another one"
888    for key, value in _dCharOrder.items():
889        print("[" + key + "]: ", ", ".join([ c+":"+str(n)  for c, n  in  sorted(value.items(), key=lambda t: t[1], reverse=True) ]))
890