#!python3

"""
FSA DICTIONARY BUILDER

by Olivier R.
License: MPL 2

This tool encodes lexicon into an indexable binary dictionary
Input files MUST be encoded in UTF-8.
"""

import sys
import os
import collections
import json
import time
import re
import traceback

from . import str_transform as st
from .progressbar import ProgressBar



# Metadata found in the lexicon file ("## key: value" lines), filled by readFile()
dLexiconData = {}

def readFile (spf):
    "generator: read file <spf> and return for each line a list of elements separated by a tabulation."
    print(" < Read lexicon: " + spf)
    if not os.path.isfile(spf):
        raise OSError("# Error. File not found or not loadable: " + spf)
    dLexiconData.clear()
    with open(spf, "r", encoding="utf-8") as hSrc:
        for sRow in hSrc:
            sRow = sRow.strip()
            if sRow.startswith("##"):
                # metadata line: "## Key: value" -> stored in the module-level dictionary
                m = re.match("## *(\\w+) *:(.*)$", sRow)
                if m:
                    dLexiconData[m.group(1)] = m.group(2).strip()
            elif sRow and not sRow.startswith("#"):
                # data line: tab-separated fields (flexion, stem, tags)
                yield sRow.split("\t")
    if dLexiconData:
        print("Data from dictionary:")
        print(dLexiconData)
class DAWG:
    """DIRECT ACYCLIC WORD GRAPH"""
    # This code is inspired from Steve Hanov’s DAWG, 2011. (http://stevehanov.ca/blog/index.php?id=115)
    # We store suffix/affix codes and tags within the graph after the “real” word.
    # A word is a list of numbers [ c1, c2, c3 . . . cN, iAffix, iTags]
    # Each arc is an index in self.lArcVal, where are stored characters, suffix/affix codes for stemming and tags.
    # Important: As usual, the last node (after ‘iTags’) is tagged final, AND the node after ‘cN’ is ALSO tagged final.

    def __init__ (self, src, cStemming, sLangCode, sLangName="", sDicName="", sDescription="", sSelectFilterRegex=""):
        """Build the word graph.

        <src>: path of the lexicon file, or an iterable of (flexion, stem, tags) triples.
        <cStemming>: "A" (affix codes), "S" (suffix codes) or "N" (no stemming).
        <sSelectFilterRegex>: if given, only entries whose tags match this regex are kept.
        Raises ValueError on unknown stemming code or empty lexicon.
        """
        print("===== Direct Acyclic Word Graph - Minimal Acyclic Finite State Automaton =====")
        cStemming = cStemming.upper()
        if cStemming == "A":
            funcStemmingGen = st.defineAffixCode
        elif cStemming == "S":
            funcStemmingGen = st.defineSuffixCode
        elif cStemming == "N":
            funcStemmingGen = st.noStemming
        else:
            raise ValueError("# Error. Unknown stemming code: {}".format(cStemming))

        aEntry = set()
        lChar = ['']; dChar = {}; nChar = 1; dCharOccur = {}
        lAff = []; dAff = {}; nAff = 0; dAffOccur = {}
        lTag = []; dTag = {}; nTag = 0; dTagOccur = {}

        self.a2grams = set()

        try:
            zFilter = re.compile(sSelectFilterRegex) if sSelectFilterRegex else None
        except re.error:
            # BUG FIX: the handler used to print <zFilter>, which is unbound when
            # re.compile() raises, causing a NameError that masked the real error.
            print("# Error. Wrong filter regex. Filter ignored: ", sSelectFilterRegex)
            traceback.print_exc()
            zFilter = None

        # read lexicon
        if isinstance(src, str):
            iterable = readFile(src)
        else:
            iterable = src
        for sFlex, sStem, sTag in iterable:
            if not zFilter or zFilter.search(sTag):
                self.a2grams.update(st.getNgrams(sFlex))
                addWordToCharDict(sFlex)
                # chars
                for c in sFlex:
                    if c not in dChar:
                        dChar[c] = nChar
                        lChar.append(c)
                        nChar += 1
                    dCharOccur[c] = dCharOccur.get(c, 0) + 1
                # affixes to find stem from flexion
                sAff = funcStemmingGen(sFlex, sStem)
                if sAff not in dAff:
                    dAff[sAff] = nAff
                    lAff.append(sAff)
                    nAff += 1
                dAffOccur[sAff] = dAffOccur.get(sAff, 0) + 1
                # tags
                if sTag not in dTag:
                    dTag[sTag] = nTag
                    lTag.append(sTag)
                    nTag += 1
                dTagOccur[sTag] = dTagOccur.get(sTag, 0) + 1
                aEntry.add((sFlex, dAff[sAff], dTag[sTag]))
        if not aEntry:
            raise ValueError("# Error. Empty lexicon")

        # Preparing DAWG
        print(" > Preparing list of words")
        print(" Filter: " + (sSelectFilterRegex or "[None]"))
        lVal = lChar + lAff + lTag
        # a word is: char arc values, then affix arc value, then tag arc value
        # (affix and tag indices are shifted past the character values)
        lWord = [ [dChar[c] for c in sFlex] + [iAff+nChar] + [iTag+nChar+nAff] for sFlex, iAff, iTag in aEntry ]
        aEntry = None

        # Dictionary of arc values occurrency, to sort arcs of each node
        dValOccur = dict( [ (dChar[c], dCharOccur[c]) for c in dChar ] \
                        + [ (dAff[aff]+nChar, dAffOccur[aff]) for aff in dAff ] \
                        + [ (dTag[tag]+nChar+nAff, dTagOccur[tag]) for tag in dTag ] )

        self.sFileName = src if isinstance(src, str) else "[None]"
        self.sLangCode = sLangCode
        self.sLangName = sLangName
        self.sDicName = sDicName
        self.sDescription = sDescription
        if dLexiconData:
            # metadata found in the lexicon file overrides the constructor arguments
            self.sLangCode = dLexiconData.get("LangCode", self.sLangCode)
            self.sLangName = dLexiconData.get("LangName", self.sLangName)
            self.sDicName = dLexiconData.get("DicName", self.sDicName)
            self.sDescription = dLexiconData.get("Description", self.sDescription)
        self.nEntry = len(lWord)
        self.aPreviousEntry = []
        DawgNode.resetNextId()
        self.oRoot = DawgNode()
        self.lUncheckedNodes = []                   # list of nodes that have not been checked for duplication.
        self.lMinimizedNodes = {}                   # list of unique nodes that have been checked for duplication.
        self.lSortedNodes = []                      # version 2 and 3
        self.nNode = 0
        self.nArc = 0
        self.dChar = dChar
        self.nChar = len(dChar)
        self.nAff = nAff
        self.lArcVal = lVal
        self.nArcVal = len(lVal)
        self.nTag = self.nArcVal - self.nChar - nAff
        self.cStemming = cStemming
        if cStemming == "A":
            self.funcStemming = st.changeWordWithAffixCode
        elif cStemming == "S":
            self.funcStemming = st.changeWordWithSuffixCode
        else:
            self.funcStemming = st.noStemming

        # calculated later
        self.nBytesNodeAddress = 1
        self.nBytesArc = 0
        self.nBytesOffset = 0
        self.nMaxOffset = 0

        # build
        lWord.sort()
        oProgBar = ProgressBar(0, len(lWord))
        for aEntry in lWord:
            self.insert(aEntry)
            oProgBar.increment(1)
        oProgBar.done()
        self.finish()
        self.countNodes()
        self.countArcs()
        self.sortNodes()                            # version 2 and 3
        self.sortNodeArcs(dValOccur)
        #self.sortNodeArcs2 (self.oRoot, "")
        self.displayInfo()

    # BUILD DAWG
    def insert (self, aEntry):
        "insert a new entry (insertion must be made in alphabetical order)."
        if aEntry < self.aPreviousEntry:
            sys.exit("# Error: Words must be inserted in alphabetical order.")

        # find common prefix between word and previous word
        nCommonPrefix = 0
        for i in range(min(len(aEntry), len(self.aPreviousEntry))):
            if aEntry[i] != self.aPreviousEntry[i]:
                break
            nCommonPrefix += 1

        # Check the lUncheckedNodes for redundant nodes, proceeding from last
        # one down to the common prefix size. Then truncate the list at that point.
        self._minimize(nCommonPrefix)

        # add the suffix, starting from the correct node mid-way through the graph
        if not self.lUncheckedNodes:
            oNode = self.oRoot
        else:
            oNode = self.lUncheckedNodes[-1][2]

        iChar = nCommonPrefix
        for c in aEntry[nCommonPrefix:]:
            oNextNode = DawgNode()
            oNode.arcs[c] = oNextNode
            self.lUncheckedNodes.append((oNode, c, oNextNode))
            if iChar == (len(aEntry) - 2):
                # the node just after the last "real" character is also final
                # (the 2 trailing values are the affix and tag arcs)
                oNode.final = True
            iChar += 1
            oNode = oNextNode
        oNode.final = True
        self.aPreviousEntry = aEntry

    def finish (self):
        "minimize unchecked nodes"
        self._minimize(0)

    def _minimize (self, downTo):
        "merge duplicated nodes, proceeding from the leaf up to <downTo>"
        for i in range( len(self.lUncheckedNodes)-1, downTo-1, -1 ):
            oNode, char, oChildNode = self.lUncheckedNodes[i]
            if oChildNode in self.lMinimizedNodes:
                # replace the child with the previously encountered one
                oNode.arcs[char] = self.lMinimizedNodes[oChildNode]
            else:
                # add the state to the minimized nodes.
                self.lMinimizedNodes[oChildNode] = oChildNode
            self.lUncheckedNodes.pop()

    def countNodes (self):
        "count the number of nodes of the whole word graph"
        self.nNode = len(self.lMinimizedNodes)

    def countArcs (self):
        "count the number of arcs in the whole word graph"
        self.nArc = len(self.oRoot.arcs)
        for oNode in self.lMinimizedNodes:
            self.nArc += len(oNode.arcs)

    def sortNodeArcs (self, dValOccur):
        "sort arcs of each node according to <dValOccur>"
        print(" > Sort node arcs")
        self.oRoot.sortArcs(dValOccur)
        for oNode in self.lMinimizedNodes:
            oNode.sortArcs(dValOccur)

    def sortNodeArcs2 (self, oNode, cPrevious=""):
        "sort arcs of each node depending on the previous char"
        # recursive function
        dCharOccur = getCharOrderAfterChar(cPrevious)
        if dCharOccur:
            oNode.sortArcs2(dCharOccur, self.lArcVal)
        for nArcVal, oNextNode in oNode.arcs.items():
            self.sortNodeArcs2(oNextNode, self.lArcVal[nArcVal])

    def sortNodes (self):
        "sort nodes"
        print(" > Sort nodes")
        for oNode in self.oRoot.arcs.values():
            self._parseNodes(oNode)

    def _parseNodes (self, oNode):
        "assign a position to each reachable node, depth-first (recursive)"
        if oNode.pos > 0:
            return
        oNode.setPos()
        self.lSortedNodes.append(oNode)
        for oNextNode in oNode.arcs.values():
            self._parseNodes(oNextNode)

    def lookup (self, sWord):
        "return True if <sWord> is within the word graph (debugging)"
        oNode = self.oRoot
        for c in sWord:
            if self.dChar.get(c, '') not in oNode.arcs:
                return False
            oNode = oNode.arcs[self.dChar[c]]
        return oNode.final

    def morph (self, sWord):
        "return a string of the morphologies of <sWord> (debugging)"
        oNode = self.oRoot
        for c in sWord:
            if self.dChar.get(c, '') not in oNode.arcs:
                return ''
            oNode = oNode.arcs[self.dChar[c]]
        if oNode.final:
            s = "* "
            for arc in oNode.arcs:
                if arc >= self.nChar:
                    # affix arc: rebuild the stem, then list the tags of the next node
                    s += " [" + self.funcStemming(sWord, self.lArcVal[arc])
                    oNode2 = oNode.arcs[arc]
                    for arc2 in oNode2.arcs:
                        s += " / " + self.lArcVal[arc2]
                    s += "]"
            return s
        return ''

    def displayInfo (self):
        "display informations about the word graph"
        print(" * {:<12} {:>16,}".format("Entries:", self.nEntry))
        print(" * {:<12} {:>16,}".format("Characters:", self.nChar))
        print(" * {:<12} {:>16,}".format("Affixes:", self.nAff))
        print(" * {:<12} {:>16,}".format("Tags:", self.nTag))
        print(" * {:<12} {:>16,}".format("Arc values:", self.nArcVal))
        print(" * {:<12} {:>16,}".format("Nodes:", self.nNode))
        print(" * {:<12} {:>16,}".format("Arcs:", self.nArc))
        print(" * {:<12} {:>16,}".format("2grams:", len(self.a2grams)))
        print(" * {:<12} {:>16}".format("Stemming:", self.cStemming + "FX"))

    def getArcStats (self):
        "return a string with statistics about nodes and arcs"
        d = {}
        for oNode in self.lMinimizedNodes:
            n = len(oNode.arcs)
            d[n] = d.get(n, 0) + 1
        s = " * Nodes:\n"
        for n in d:
            s = s + " {:>9} nodes have {:>3} arcs\n".format(d[n], n)
        return s

    def writeInfo (self, sPathFile):
        "write informations in file <sPathFile>"
        print(" > Write informations")
        with open(sPathFile, 'w', encoding='utf-8', newline="\n") as hDst:
            hDst.write(self.getArcStats())
            hDst.write("\n * Values:\n")
            for i, s in enumerate(self.lArcVal):
                hDst.write(" {:>6}. {}\n".format(i, s))
            # note: the redundant hDst.close() was removed — the <with> block closes the file

    def select (self, sPattern=""):
        "generator: returns all entries which morphology fits <sPattern>"
        zPattern = None
        if sPattern:
            try:
                zPattern = re.compile(sPattern)
            except re.error:
                print("# Error in regex pattern")
                traceback.print_exc()
        yield from self._select(zPattern, self.oRoot, "")

    def _select (self, zPattern, oNode, sWord):
        "recursive generator yielding 'flexion\\tstem\\ttags' entries matching <zPattern>"
        for nVal, oNextNode in oNode.arcs.items():
            if nVal <= self.nChar:
                # simple character
                yield from self._select(zPattern, oNextNode, sWord + self.lArcVal[nVal])
            else:
                sEntry = sWord + "\t" + self.funcStemming(sWord, self.lArcVal[nVal])
                for nMorphVal, _ in oNextNode.arcs.items():
                    if not zPattern or zPattern.search(self.lArcVal[nMorphVal]):
                        yield sEntry + "\t" + self.lArcVal[nMorphVal]

    # BINARY CONVERSION
    def _calculateBinary (self, nCompressionMethod):
        "compute arc/address/offset byte sizes and node addresses for <nCompressionMethod>"
        print(" > Write DAWG as an indexable binary dictionary [method: %d]" % nCompressionMethod)
        if nCompressionMethod == 1:
            self.nBytesArc = ( (self.nArcVal.bit_length() + 2) // 8 ) + 1    # We add 2 bits. See DawgNode.convToBytes1()
            self.nBytesOffset = 0
            self._calcNumBytesNodeAddress()
            self._calcNodesAddress1()
        elif nCompressionMethod == 2:
            self.nBytesArc = ( (self.nArcVal.bit_length() + 3) // 8 ) + 1    # We add 3 bits. See DawgNode.convToBytes2()
            self.nBytesOffset = 0
            self._calcNumBytesNodeAddress()
            self._calcNodesAddress2()
        elif nCompressionMethod == 3:
            self.nBytesArc = ( (self.nArcVal.bit_length() + 3) // 8 ) + 1    # We add 3 bits. See DawgNode.convToBytes3()
            self.nBytesOffset = 1
            self.nMaxOffset = (2 ** (self.nBytesOffset * 8)) - 1
            self._calcNumBytesNodeAddress()
            self._calcNodesAddress3()
        else:
            print(" # Error: unknown compression method")
        print(" Arc values (chars, affixes and tags): {} -> {} bytes".format( self.nArcVal, len("\t".join(self.lArcVal).encode("utf-8")) ))
        print(" Arc size: {} bytes, Address size: {} bytes -> {} * {} = {} bytes".format( self.nBytesArc, self.nBytesNodeAddress, \
                                                                                          self.nBytesArc+self.nBytesNodeAddress, self.nArc, \
                                                                                          (self.nBytesArc+self.nBytesNodeAddress)*self.nArc ))

    def _calcNumBytesNodeAddress (self):
        "how many bytes needed to store all nodes/arcs in the binary dictionary"
        self.nBytesNodeAddress = 1
        while ((self.nBytesArc + self.nBytesNodeAddress) * self.nArc) > (2 ** (self.nBytesNodeAddress * 8)):
            self.nBytesNodeAddress += 1

    def _calcNodesAddress1 (self):
        "assign a binary address to each node (method 1: fixed-size arcs)"
        nBytesNode = self.nBytesArc + self.nBytesNodeAddress
        iAddr = len(self.oRoot.arcs) * nBytesNode
        for oNode in self.lMinimizedNodes:
            oNode.addr = iAddr
            iAddr += max(len(oNode.arcs), 1) * nBytesNode

    def _calcNodesAddress2 (self):
        "assign a binary address to each node (method 2: address omitted when the next node immediately follows)"
        nBytesNode = self.nBytesArc + self.nBytesNodeAddress
        iAddr = len(self.oRoot.arcs) * nBytesNode
        for oNode in self.lSortedNodes:
            oNode.addr = iAddr
            iAddr += max(len(oNode.arcs), 1) * nBytesNode
            for oNextNode in oNode.arcs.values():
                if (oNode.pos + 1) == oNextNode.pos:
                    iAddr -= self.nBytesNodeAddress
                    #break

    def _calcNodesAddress3 (self):
        "assign a binary address to each node (method 3: short offsets instead of addresses when possible)"
        nBytesNode = self.nBytesArc + self.nBytesNodeAddress
        # theorical nodes size if only addresses and no offset
        self.oRoot.size = len(self.oRoot.arcs) * nBytesNode
        for oNode in self.lSortedNodes:
            oNode.size = max(len(oNode.arcs), 1) * nBytesNode
        # rewind and calculate dropdown from the end, several times
        # (shrinking one node may bring other nodes within offset range, so iterate to a fixed point)
        nDiff = self.nBytesNodeAddress - self.nBytesOffset
        bEnd = False
        while not bEnd:
            bEnd = True
            # recalculate addresses
            iAddr = self.oRoot.size
            for oNode in self.lSortedNodes:
                oNode.addr = iAddr
                iAddr += oNode.size
            # rewind and calculate dropdown from the end, several times
            for i in range(self.nNode-1, -1, -1):
                nSize = max(len(self.lSortedNodes[i].arcs), 1) * nBytesNode
                for oNextNode in self.lSortedNodes[i].arcs.values():
                    if 1 < (oNextNode.addr - self.lSortedNodes[i].addr) < self.nMaxOffset:
                        nSize -= nDiff
                if self.lSortedNodes[i].size != nSize:
                    self.lSortedNodes[i].size = nSize
                    bEnd = False

    def getBinaryAsJSON (self, nCompressionMethod=1, bBinaryDictAsHexString=True):
        "return a JSON string containing all necessary data of the dictionary (compressed as a binary string)"
        self._calculateBinary(nCompressionMethod)
        byDic = b""
        if nCompressionMethod == 1:
            byDic = self.oRoot.convToBytes1(self.nBytesArc, self.nBytesNodeAddress)
            for oNode in self.lMinimizedNodes:
                byDic += oNode.convToBytes1(self.nBytesArc, self.nBytesNodeAddress)
        elif nCompressionMethod == 2:
            byDic = self.oRoot.convToBytes2(self.nBytesArc, self.nBytesNodeAddress)
            for oNode in self.lSortedNodes:
                byDic += oNode.convToBytes2(self.nBytesArc, self.nBytesNodeAddress)
        elif nCompressionMethod == 3:
            byDic = self.oRoot.convToBytes3(self.nBytesArc, self.nBytesNodeAddress, self.nBytesOffset)
            for oNode in self.lSortedNodes:
                byDic += oNode.convToBytes3(self.nBytesArc, self.nBytesNodeAddress, self.nBytesOffset)
        return {
            "sHeader": "/grammalecte-fsa/",
            "sLangCode": self.sLangCode,
            "sLangName": self.sLangName,
            "sDicName": self.sDicName,
            "sDescription": self.sDescription,
            "sFileName": self.sFileName,
            "sDate": self._getDate(),
            "nEntry": self.nEntry,
            "nChar": self.nChar,
            "nAff": self.nAff,
            "nTag": self.nTag,
            "cStemming": self.cStemming,
            "dChar": self.dChar,
            "nNode": self.nNode,
            "nArc": self.nArc,
            "nArcVal": self.nArcVal,
            "lArcVal": self.lArcVal,
            "nCompressionMethod": nCompressionMethod,
            "nBytesArc": self.nBytesArc,
            "nBytesNodeAddress": self.nBytesNodeAddress,
            "nBytesOffset": self.nBytesOffset,
            # Mozilla’s JS parser don’t like file bigger than 4 Mb!
            # So, if necessary, we use an hexadecimal string, that we will convert later in Firefox’s extension.
            # https://github.com/mozilla/addons-linter/issues/1361
            "sByDic": byDic.hex() if bBinaryDictAsHexString else [ e for e in byDic ],
            "l2grams": list(self.a2grams)
        }

    def writeAsJSObject (self, spfDst, nCompressionMethod, bInJSModule=False, bBinaryDictAsHexString=True):
        "write a file (JSON or JS module) with all the necessary data"
        if not spfDst.endswith(".json"):
            spfDst += "."+str(nCompressionMethod)+".json"
        with open(spfDst, "w", encoding="utf-8", newline="\n") as hDst:
            if bInJSModule:
                hDst.write('// JavaScript\n// Generated data (do not edit)\n\n"use strict";\n\nconst dictionary = ')
            hDst.write( json.dumps(self.getBinaryAsJSON(nCompressionMethod, bBinaryDictAsHexString), ensure_ascii=False) )
            if bInJSModule:
                hDst.write(";\n\nexports.dictionary = dictionary;\n")

    def writeBinary (self, sPathFile, nCompressionMethod, bDebug=False):
        """
        Save as a binary file.

        Format of the binary indexable dictionary:
        Each section is separated with 4 bytes of \0

        - Section Header:
            /grammalecte-fsa/[compression method]
                * compression method is an ASCII string

        - Section Informations:
            /[lang code]
            /[lang name]
            /[dictionary name]
            /[date creation]
            /[number of chars]
            /[number of bytes for each arc]
            /[number of bytes for each address node]
            /[number of entries]
            /[number of nodes]
            /[number of arcs]
            /[number of affixes]
                * each field is a ASCII string
            /[stemming code]
                * "S" means stems are generated by /suffix_code/,
                  "A" means they are generated by /affix_code/
                  See defineSuffixCode() and defineAffixCode() for details.
                  "N" means no stemming

        - Section Values:
                * a list of strings encoded in binary from utf-8, each value separated with a tabulation

        - Section Word Graph (nodes / arcs)
                * A list of nodes which are a list of arcs with an address of the next node.
                  See DawgNode.convToBytes() for details.

        - Section 2grams:
                * A list of 2grams (as strings: 2 chars) encoded in binary from utf-8, each value separated with a tabulation
        """
        self._calculateBinary(nCompressionMethod)
        if not sPathFile.endswith(".bdic"):
            sPathFile += "."+str(nCompressionMethod)+".bdic"
        with open(sPathFile, 'wb') as hDst:
            # header
            hDst.write("/grammalecte-fsa/{}/".format(nCompressionMethod).encode("utf-8"))
            hDst.write(b"\0\0\0\0")
            # infos
            sInfo = "{}//{}//{}//{}//{}//{}//{}//{}//{}//{}//{}//{}//{}".format(self.sLangCode, self.sLangName, self.sDicName, self.sDescription, self._getDate(), \
                                                                                self.nChar, self.nBytesArc, self.nBytesNodeAddress, \
                                                                                self.nEntry, self.nNode, self.nArc, self.nAff, self.cStemming)
            hDst.write(sInfo.encode("utf-8"))
            hDst.write(b"\0\0\0\0")
            # lArcVal
            hDst.write("\t".join(self.lArcVal).encode("utf-8"))
            hDst.write(b"\0\0\0\0")
            # 2grams
            hDst.write("\t".join(self.a2grams).encode("utf-8"))
            hDst.write(b"\0\0\0\0")
            # DAWG: nodes / arcs
            if nCompressionMethod == 1:
                hDst.write(self.oRoot.convToBytes1(self.nBytesArc, self.nBytesNodeAddress))
                for oNode in self.lMinimizedNodes:
                    hDst.write(oNode.convToBytes1(self.nBytesArc, self.nBytesNodeAddress))
            elif nCompressionMethod == 2:
                hDst.write(self.oRoot.convToBytes2(self.nBytesArc, self.nBytesNodeAddress))
                for oNode in self.lSortedNodes:
                    hDst.write(oNode.convToBytes2(self.nBytesArc, self.nBytesNodeAddress))
            elif nCompressionMethod == 3:
                hDst.write(self.oRoot.convToBytes3(self.nBytesArc, self.nBytesNodeAddress, self.nBytesOffset))
                for oNode in self.lSortedNodes:
                    hDst.write(oNode.convToBytes3(self.nBytesArc, self.nBytesNodeAddress, self.nBytesOffset))
        if bDebug:
            self._writeNodes(sPathFile, nCompressionMethod)

    def _getDate (self):
        "return the current local date/time as 'YYYY-MM-DD HH:MM:SS'"
        return time.strftime("%Y-%m-%d %H:%M:%S")

    def _writeNodes (self, sPathFile, nCompressionMethod):
        "for debugging only"
        print(" > Write nodes")
        with open(sPathFile+".nodes."+str(nCompressionMethod)+".txt", 'w', encoding='utf-8', newline="\n") as hDst:
            if nCompressionMethod == 1:
                hDst.write(self.oRoot.getTxtRepr1(self.nBytesArc, self.lArcVal)+"\n")
                #hDst.write( ''.join( [ "%02X " % z for z in self.oRoot.convToBytes1(self.nBytesArc, self.nBytesNodeAddress) ] ).strip() )
                for oNode in self.lMinimizedNodes:
                    hDst.write(oNode.getTxtRepr1(self.nBytesArc, self.lArcVal)+"\n")
            if nCompressionMethod == 2:
                hDst.write(self.oRoot.getTxtRepr2(self.nBytesArc, self.lArcVal)+"\n")
                for oNode in self.lSortedNodes:
                    hDst.write(oNode.getTxtRepr2(self.nBytesArc, self.lArcVal)+"\n")
            if nCompressionMethod == 3:
                hDst.write(self.oRoot.getTxtRepr3(self.nBytesArc, self.nBytesOffset, self.lArcVal)+"\n")
                #hDst.write( ''.join( [ "%02X " % z for z in self.oRoot.convToBytes3(self.nBytesArc, self.nBytesNodeAddress, self.nBytesOffset) ] ).strip() )
                for oNode in self.lSortedNodes:
                    hDst.write(oNode.getTxtRepr3(self.nBytesArc, self.nBytesOffset, self.lArcVal)+"\n")



class DawgNode:
    """Node of the word graph"""

    NextId = 0
    NextPos = 1 # (version 2)

    def __init__ (self):
        self.i = DawgNode.NextId        # unique node id
        DawgNode.NextId += 1
        self.final = False              # True if a word ends on this node
        self.arcs = {}                  # key: arc value; value: a node
        self.addr = 0                   # address in the binary dictionary
        self.pos = 0                    # position in the binary dictionary (version 2)
        self.size = 0                   # size of node in bytes (version 3)

    @classmethod
    def resetNextId (cls):
        "set NextId to 0 "
        cls.NextId = 0

    def setPos (self): # version 2
        "define a position for node (version 2)"
        self.pos = DawgNode.NextPos
        DawgNode.NextPos += 1

    def __str__ (self):
        s = "Node " + str(self.i) + " @ " + str(self.addr) + (" [final]" if self.final else "") + "\n"
        for arc, node in self.arcs.items():
            s += " " + str(arc)
            s += " > " + str(node.i)
            s += " @ " + str(node.addr) + "\n"
        return s

    def __repr__ (self):
        # Caution! this function is used for hashing and comparison!
        sFinalChar = "1" if self.final else "0"
        l = [sFinalChar]
        for arc, node in self.arcs.items():
            l.append(str(arc))
            l.append(str(node.i))
        return "_".join(l)

    def __hash__ (self):
        # Used as a key in a python dictionary.
        return self.__repr__().__hash__()

    def __eq__ (self, other):
        # Used as a key in a python dictionary.
        # Nodes are equivalent if they have identical arcs, and each identical arc leads to identical states.
        return self.__repr__() == other.__repr__()

    def sortArcs (self, dValOccur):
        "sort arcs of node according to <dValOccur>"
        self.arcs = collections.OrderedDict(sorted(self.arcs.items(), key=lambda t: dValOccur.get(t[0], 0), reverse=True))

    def sortArcs2 (self, dValOccur, lArcVal):
        "sort arcs of each node depending on the previous char"
        self.arcs = collections.OrderedDict(sorted(self.arcs.items(), key=lambda t: dValOccur.get(lArcVal[t[0]], 0), reverse=True))

    # VERSION 1 =====================================================================================================
    def convToBytes1 (self, nBytesArc, nBytesNodeAddress):
        """
        Convert to bytes (method 1).

        Node scheme: a sequence of arcs; each arc is
        [arc value: nBytesArc bytes][address of next node: nBytesNodeAddress bytes].
        Within the arc value, the highest bit means "this node is final" (set on the
        first arc only), the second highest bit means "last arc of this node".
        A node without arcs is stored as one arc with both flags set and a null address.
        """
        nArc = len(self.arcs)
        nFinalNodeMask = 1 << ((nBytesArc*8)-1)
        nFinalArcMask = 1 << ((nBytesArc*8)-2)
        if not nArc:
            val = nFinalNodeMask | nFinalArcMask
            by = val.to_bytes(nBytesArc, byteorder='big')
            by += (0).to_bytes(nBytesNodeAddress, byteorder='big')
            return by
        by = b""
        for i, arc in enumerate(self.arcs, 1):
            val = arc
            if i == 1 and self.final:
                val = val | nFinalNodeMask
            if i == nArc:
                val = val | nFinalArcMask
            by += val.to_bytes(nBytesArc, byteorder='big')
            by += self.arcs[arc].addr.to_bytes(nBytesNodeAddress, byteorder='big')
        return by

    def getTxtRepr1 (self, nBytesArc, lVal):
        "return representation as string of node (method 1)"
        nArc = len(self.arcs)
        nFinalNodeMask = 1 << ((nBytesArc*8)-1)
        nFinalArcMask = 1 << ((nBytesArc*8)-2)
        s = "i{:_>10} -- #{:_>10}\n".format(self.i, self.addr)
        if not nArc:
            s += " {:<20} {:0>16} i{:_>10} #{:_>10}\n".format("", bin(nFinalNodeMask | nFinalArcMask)[2:], "0", "0")
            return s
        for i, arc in enumerate(self.arcs, 1):
            val = arc
            if i == 1 and self.final:
                val = val | nFinalNodeMask
            if i == nArc:
                val = val | nFinalArcMask
            s += " {:<20} {:0>16} i{:_>10} #{:_>10}\n".format(lVal[arc], bin(val)[2:], self.arcs[arc].i, self.arcs[arc].addr)
        return s

    # VERSION 2 =====================================================================================================
    def convToBytes2 (self, nBytesArc, nBytesNodeAddress):
        """
        Convert to bytes (method 2).

        Same scheme as method 1, plus a third flag bit in the arc value:
        if set, no address is stored — the next node is the node that
        immediately follows in the binary stream.
        Flag bits (from highest): final node (first arc only), last arc, next-node-follows.
        """
        nArc = len(self.arcs)
        nFinalNodeMask = 1 << ((nBytesArc*8)-1)
        nFinalArcMask = 1 << ((nBytesArc*8)-2)
        nNextNodeMask = 1 << ((nBytesArc*8)-3)
        if not nArc:
            val = nFinalNodeMask | nFinalArcMask
            by = val.to_bytes(nBytesArc, byteorder='big')
            by += (0).to_bytes(nBytesNodeAddress, byteorder='big')
            return by
        by = b""
        for i, arc in enumerate(self.arcs, 1):
            val = arc
            if i == 1 and self.final:
                val = val | nFinalNodeMask
            if i == nArc:
                val = val | nFinalArcMask
            if (self.pos + 1) == self.arcs[arc].pos and self.i != 0:
                val = val | nNextNodeMask
                by += val.to_bytes(nBytesArc, byteorder='big')
            else:
                by += val.to_bytes(nBytesArc, byteorder='big')
                by += self.arcs[arc].addr.to_bytes(nBytesNodeAddress, byteorder='big')
        return by

    def getTxtRepr2 (self, nBytesArc, lVal):
        "return representation as string of node (method 2)"
        nArc = len(self.arcs)
        nFinalNodeMask = 1 << ((nBytesArc*8)-1)
        nFinalArcMask = 1 << ((nBytesArc*8)-2)
        nNextNodeMask = 1 << ((nBytesArc*8)-3)
        s = "i{:_>10} -- #{:_>10}\n".format(self.i, self.addr)
        if not nArc:
            s += " {:<20} {:0>16} i{:_>10} #{:_>10}\n".format("", bin(nFinalNodeMask | nFinalArcMask)[2:], "0", "0")
            return s
        for i, arc in enumerate(self.arcs, 1):
            val = arc
            if i == 1 and self.final:
                val = val | nFinalNodeMask
            if i == nArc:
                val = val | nFinalArcMask
            if (self.pos + 1) == self.arcs[arc].pos and self.i != 0:
                val = val | nNextNodeMask
                s += " {:<20} {:0>16}\n".format(lVal[arc], bin(val)[2:])
            else:
                s += " {:<20} {:0>16} i{:_>10} #{:_>10}\n".format(lVal[arc], bin(val)[2:], self.arcs[arc].i, self.arcs[arc].addr)
        return s

    # VERSION 3 =====================================================================================================
    def convToBytes3 (self, nBytesArc, nBytesNodeAddress, nBytesOffset):
        """
        Convert to bytes (method 3).

        Same scheme as method 1, plus a third flag bit in the arc value:
        if set, the arc stores a short offset (nBytesOffset bytes) to the next
        node instead of a full address (nBytesNodeAddress bytes) — offsets are
        shorter than addresses, which shrinks the dictionary.
        Flag bits (from highest): final node (first arc only), last arc, offset-instead-of-address.
        """
        nArc = len(self.arcs)
        nFinalNodeMask = 1 << ((nBytesArc*8)-1)
        nFinalArcMask = 1 << ((nBytesArc*8)-2)
        nNextNodeMask = 1 << ((nBytesArc*8)-3)
        nMaxOffset = (2 ** (nBytesOffset * 8)) - 1
        if not nArc:
            val = nFinalNodeMask | nFinalArcMask
            by = val.to_bytes(nBytesArc, byteorder='big')
            by += (0).to_bytes(nBytesNodeAddress, byteorder='big')
            return by
        by = b""
        for i, arc in enumerate(self.arcs, 1):
            val = arc
            if i == 1 and self.final:
                val = val | nFinalNodeMask
            if i == nArc:
                val = val | nFinalArcMask
            if 1 < (self.arcs[arc].addr - self.addr) < nMaxOffset and self.i != 0:
                val = val | nNextNodeMask
                by += val.to_bytes(nBytesArc, byteorder='big')
                by += (self.arcs[arc].addr-self.addr).to_bytes(nBytesOffset, byteorder='big')
            else:
                by += val.to_bytes(nBytesArc, byteorder='big')
                by += self.arcs[arc].addr.to_bytes(nBytesNodeAddress, byteorder='big')
        return by

    def getTxtRepr3 (self, nBytesArc, nBytesOffset, lVal):
        "return representation as string of node (method 3)"
        nArc = len(self.arcs)
        nFinalNodeMask = 1 << ((nBytesArc*8)-1)
        nFinalArcMask = 1 << ((nBytesArc*8)-2)
        nNextNodeMask = 1 << ((nBytesArc*8)-3)
        nMaxOffset = (2 ** (nBytesOffset * 8)) - 1
        s = "i{:_>10} -- #{:_>10} ({})\n".format(self.i, self.addr, self.size)
        if not nArc:
            s += " {:<20} {:0>16} i{:_>10} #{:_>10}\n".format("", bin(nFinalNodeMask | nFinalArcMask)[2:], "0", "0")
            return s
        for i, arc in enumerate(self.arcs, 1):
            val = arc
            if i == 1 and self.final:
                val = val | nFinalNodeMask
            if i == nArc:
                val = val | nFinalArcMask
            if 1 < (self.arcs[arc].addr - self.addr) < nMaxOffset and self.i != 0:
                val = val | nNextNodeMask
                s += " {:<20} {:0>16} i{:_>10} +{:_>10}\n".format(lVal[arc], bin(val)[2:], self.arcs[arc].i, self.arcs[arc].addr - self.addr)
            else:
                s += " {:<20} {:0>16} i{:_>10} #{:_>10}\n".format(lVal[arc], bin(val)[2:], self.arcs[arc].i, self.arcs[arc].addr)
        return s



# Another attempt to sort node arcs

_dCharOrder = {
    # key: previous char, value: dictionary of chars {c: nValue}
    "": {}
}


def addWordToCharDict (sWord):
    "for each character of <sWord>, count how many times it appears after the previous character, and store result in a <_dCharOrder>"
    cPrevious = ""
    for cChar in sWord:
        if cPrevious not in _dCharOrder:
            _dCharOrder[cPrevious] = {}
        _dCharOrder[cPrevious][cChar] = _dCharOrder[cPrevious].get(cChar, 0) + 1
        cPrevious = cChar


def getCharOrderAfterChar (cChar):
    "return a dictionary of chars with number of times it appears after character <cChar>"
    return _dCharOrder.get(cChar, None)


def displayCharOrder ():
    "display how many times each character appear after another one"
    for key, value in _dCharOrder.items():
        print("[" + key + "]: ", ", ".join([ c+":"+str(n) for c, n in sorted(value.items(), key=lambda t: t[1], reverse=True) ]))