import collections
import heapq
import logging
import math
import numbers
import random
import re

from .utils import _progress, _is_string
from .exception import MorfessorException, SegmentOnlyModelException

_logger = logging.getLogger(__name__)


def _constructions_to_str(constructions):
    """Return a readable string for a list of constructions."""
    if _is_string(constructions[0]):
        # Constructions are strings
        return ' + '.join(constructions)
    else:
        # Constructions are not strings (should be tuples of strings)
        return ' + '.join(map(lambda x: ' '.join(x), constructions))


# rcount = root count (from corpus)
# count = total count of the node
# splitloc = integer or tuple. Location(s) of the possible splits for virtual
#            constructions; empty tuple or 0 if real construction
ConstrNode = collections.namedtuple('ConstrNode',
                                    ['rcount', 'count', 'splitloc'])


class BaselineModel(object):
    """Morfessor Baseline model class.

    Implements training of and segmenting with a Morfessor model. The model
    is completely agnostic to whether it is used with lists of strings
    (finding phrases in sentences) or strings of characters (finding morphs
    in words).

    """

    penalty = -9999.9

    def __init__(self, forcesplit_list=None, corpusweight=None,
                 use_skips=False, nosplit_re=None):
        """Initialize a new model instance.

        Arguments:
            forcesplit_list: force segmentations on the characters in
                             the given list
            corpusweight: weight for the corpus cost
            use_skips: randomly skip frequently occurring constructions
                       to speed up training
            nosplit_re: regular expression string for preventing splitting
                        in certain contexts

        """

        # In analyses for each construction a ConstrNode is stored. All
        # training data has a rcount (real count) > 0. All real morphemes
        # have no split locations.
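        # For example (hypothetical values), an entry might look like
        #   self._analyses['reopened'] == ConstrNode(rcount=2, count=2,
        #                                            splitloc=(2, 6))
        # i.e. 'reopened' occurred twice in the corpus and is currently a
        # virtual construction split as 're + open + ed'.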
        self._analyses = {}

        # Flag to indicate the model is only useful for segmentation
        self._segment_only = False

        # Cost variables
        self._lexicon_coding = LexiconEncoding()
        self._corpus_coding = CorpusEncoding(self._lexicon_coding)
        self._annot_coding = None

        # Set corpus weight updater
        self.set_corpus_weight_updater(corpusweight)

        # Configuration variables
        self._use_skips = use_skips  # Random skips for frequent constructions
        self._supervised = False

        # Counter for random skipping
        self._counter = collections.Counter()
        if forcesplit_list is None:
            self.forcesplit_list = []
        else:
            self.forcesplit_list = forcesplit_list
        if nosplit_re is None:
            self.nosplit_re = None
        else:
            self.nosplit_re = re.compile(nosplit_re, re.UNICODE)

        # Used only for (semi-)supervised learning
        self.annotations = None

    def set_corpus_weight_updater(self, corpus_weight):
        if corpus_weight is None:
            self._corpus_weight_updater = FixedCorpusWeight(1.0)
        elif isinstance(corpus_weight, numbers.Number):
            self._corpus_weight_updater = FixedCorpusWeight(corpus_weight)
        else:
            self._corpus_weight_updater = corpus_weight

        self._corpus_weight_updater.update(self, 0)

    def _check_segment_only(self):
        if self._segment_only:
            raise SegmentOnlyModelException()

    @property
    def tokens(self):
        """Return the number of construction tokens."""
        return self._corpus_coding.tokens

    @property
    def types(self):
        """Return the number of construction types."""
        return self._corpus_coding.types - 1  # do not include boundary

    def _add_compound(self, compound, c):
        """Add compound with count c to data."""
        self._corpus_coding.boundaries += c
        self._modify_construction_count(compound, c)
        oldrc = self._analyses[compound].rcount
        self._analyses[compound] = \
            self._analyses[compound]._replace(rcount=oldrc + c)

    def _remove(self, construction):
        """Remove construction from model."""
        rcount, count, splitloc = self._analyses[construction]
        self._modify_construction_count(construction, -count)
        return rcount, count

    def _random_split(self, compound, threshold):
        """Return a random split for compound.

        Arguments:
            compound: compound to split
            threshold: probability of splitting at each position

        """
        splitloc = tuple(i for i in range(1, len(compound))
                         if random.random() < threshold)
        return self._splitloc_to_segmentation(compound, splitloc)

    def _set_compound_analysis(self, compound, parts, ptype='rbranch'):
        """Set the analysis of compound according to the given segmentation.

        Arguments:
            compound: compound to split
            parts: desired constructions of the compound
            ptype: type of the parse tree to use

        If ptype is 'rbranch', the analysis is stored internally as a
        right-branching tree. If ptype is 'flat', the analysis is stored
        directly to the compound's node.
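
        For example (hypothetical), parts ['re', 'open', 'ed'] with
        ptype 'flat' stores splitloc (2, 6) on the node of 'reopened'.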

        """
        if len(parts) == 1:
            rcount, count = self._remove(compound)
            self._analyses[compound] = ConstrNode(rcount, 0, tuple())
            self._modify_construction_count(compound, count)
        elif ptype == 'flat':
            rcount, count = self._remove(compound)
            splitloc = self.segmentation_to_splitloc(parts)
            self._analyses[compound] = ConstrNode(rcount, count, splitloc)
            for constr in parts:
                self._modify_construction_count(constr, count)
        elif ptype == 'rbranch':
            construction = compound
            for p in range(len(parts)):
                rcount, count = self._remove(construction)
                prefix = parts[p]
                if p == len(parts) - 1:
                    self._analyses[construction] = ConstrNode(rcount, 0, 0)
                    self._modify_construction_count(construction, count)
                else:
                    suffix = self._join_constructions(parts[p + 1:])
                    self._analyses[construction] = ConstrNode(rcount, count,
                                                              len(prefix))
                    self._modify_construction_count(prefix, count)
                    self._modify_construction_count(suffix, count)
                    construction = suffix
        else:
            raise MorfessorException("Unknown parse type '%s'" % ptype)

    def _update_annotation_choices(self):
        """Update the selection of alternative analyses in annotations.

        For semi-supervised models, select the most likely alternative
        analyses included in the annotations of the compounds.

        """
        if not self._supervised:
            return

        # Collect constructions from the most probable segmentations
        # and add missing compounds also to the unannotated data
        constructions = collections.Counter()
        for compound, alternatives in self.annotations.items():
            if compound not in self._analyses:
                self._add_compound(compound, 1)

            analysis, cost = self._best_analysis(alternatives)
            for m in analysis:
                constructions[m] += self._analyses[compound].rcount

        # Apply the selected constructions in annotated corpus coding
        self._annot_coding.set_constructions(constructions)
        for m, f in constructions.items():
            count = 0
            if m in self._analyses and not self._analyses[m].splitloc:
                count = self._analyses[m].count
            self._annot_coding.set_count(m, count)

    def _best_analysis(self, choices):
        """Select the best analysis out of the given choices."""
        bestcost = None
        bestanalysis = None
        for analysis in choices:
            cost = 0.0
            for m in analysis:
                if m in self._analyses and not self._analyses[m].splitloc:
                    cost += (math.log(self._corpus_coding.tokens) -
                             math.log(self._analyses[m].count))
                else:
                    cost -= self.penalty  # penalty is negative
            if bestcost is None or cost < bestcost:
                bestcost = cost
                bestanalysis = analysis
        return bestanalysis, bestcost

    def _force_split(self, compound):
        """Return forced split of the compound.
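
        For example, with forcesplit_list ['-'], the compound 'top-hat'
        is split into ['top', '-', 'hat'].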

        """
        if len(self.forcesplit_list) == 0:
            return [compound]
        clen = len(compound)
        j = 0
        parts = []
        for i in range(0, clen):
            if compound[i] in self.forcesplit_list:
                if len(compound[j:i]) > 0:
                    parts.append(compound[j:i])
                parts.append(compound[i:i + 1])
                j = i + 1
        if j < clen:
            parts.append(compound[j:])
        return [p for p in parts if len(p) > 0]

    def _test_skip(self, construction):
        """Return true if construction should be skipped."""
        if construction in self._counter:
            t = self._counter[construction]
            if random.random() > 1.0 / max(1, t):
                return True
        self._counter[construction] += 1
        return False

    def _viterbi_optimize(self, compound, addcount=0, maxlen=30):
        """Optimize segmentation of the compound using the Viterbi algorithm.

        Arguments:
            compound: compound to optimize
            addcount: constant for additive smoothing of Viterbi probs
            maxlen: maximum length for a construction

        Returns list of segments.

        """
        clen = len(compound)
        if clen == 1:  # Single atom
            return [compound]
        if self._use_skips and self._test_skip(compound):
            return self.segment(compound)
        # Collect forced subsegments
        parts = self._force_split(compound)
        # Use Viterbi algorithm to optimize the subsegments
        constructions = []
        for part in parts:
            constructions += self.viterbi_segment(part, addcount=addcount,
                                                  maxlen=maxlen)[0]
        self._set_compound_analysis(compound, constructions, ptype='flat')
        return constructions

    def _recursive_optimize(self, compound):
        """Optimize segmentation of the compound using recursive splitting.

        Returns list of segments.

        """
        if len(compound) == 1:  # Single atom
            return [compound]
        if self._use_skips and self._test_skip(compound):
            return self.segment(compound)
        # Collect forced subsegments
        parts = self._force_split(compound)
        if len(parts) == 1:
            # just one part
            return self._recursive_split(compound)
        self._set_compound_analysis(compound, parts)
        # Use recursive algorithm to optimize the subsegments
        constructions = []
        for part in parts:
            constructions += self._recursive_split(part)
        return constructions

    def _recursive_split(self, construction):
        """Optimize segmentation of the construction by recursive splitting.

        Returns list of segments.

        """
        if len(construction) == 1:  # Single atom
            return [construction]
        if self._use_skips and self._test_skip(construction):
            return self.segment(construction)
        rcount, count = self._remove(construction)

        # Check all binary splits and no split
        self._modify_construction_count(construction, count)
        mincost = self.get_cost()
        self._modify_construction_count(construction, -count)
        splitloc = 0
        for i in range(1, len(construction)):
            if (self.nosplit_re and
                    self.nosplit_re.match(construction[(i - 1):(i + 1)])):
                continue
            prefix = construction[:i]
            suffix = construction[i:]
            self._modify_construction_count(prefix, count)
            self._modify_construction_count(suffix, count)
            cost = self.get_cost()
            self._modify_construction_count(prefix, -count)
            self._modify_construction_count(suffix, -count)
            if cost <= mincost:
                mincost = cost
                splitloc = i

        if splitloc:
            # Virtual construction
            self._analyses[construction] = ConstrNode(rcount, count,
                                                      splitloc)
            prefix = construction[:splitloc]
            suffix = construction[splitloc:]
            self._modify_construction_count(prefix, count)
            self._modify_construction_count(suffix, count)
            lp = self._recursive_split(prefix)
            if suffix != prefix:
                return lp + self._recursive_split(suffix)
            else:
                return lp + lp
        else:
            # Real construction
            self._analyses[construction] = ConstrNode(rcount, 0, tuple())
            self._modify_construction_count(construction, count)
            return [construction]

    def _modify_construction_count(self, construction, dcount):
        """Modify the count of construction by dcount.

        For virtual constructions, recurses to child nodes in the
        tree. For real constructions, adds/removes construction
        to/from the lexicon whenever necessary.
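
        For example (hypothetical entry), decreasing the count of a
        virtual construction 'uneasy' with splitloc (2,) also decreases
        the counts of its children 'un' and 'easy'.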

        """
        if construction in self._analyses:
            rcount, count, splitloc = self._analyses[construction]
        else:
            rcount, count, splitloc = 0, 0, 0
        newcount = count + dcount
        if newcount == 0:
            del self._analyses[construction]
        else:
            self._analyses[construction] = ConstrNode(rcount, newcount,
                                                      splitloc)
        if splitloc:
            # Virtual construction
            children = self._splitloc_to_segmentation(construction, splitloc)
            for child in children:
                self._modify_construction_count(child, dcount)
        else:
            # Real construction
            self._corpus_coding.update_count(construction, count, newcount)
            if self._supervised:
                self._annot_coding.update_count(construction, count,
                                                newcount)

            if count == 0 and newcount > 0:
                self._lexicon_coding.add(construction)
            elif count > 0 and newcount == 0:
                self._lexicon_coding.remove(construction)

    def _epoch_update(self, epoch_num):
        """Do model updates that are necessary between training epochs.

        The argument is the number of training epochs finished.

        In practice, this does two things:
        - If random skipping is in use, reset construction counters.
        - If semi-supervised learning is in use and there are alternative
          analyses in the annotated data, select the annotations that are
          most likely given the model parameters. If not hand-set, update
          the weight of the annotated corpus.

        This method should also be run prior to training (with the
        epoch number argument as 0).

        """
        forced_epochs = 0
        if self._corpus_weight_updater.update(self, epoch_num):
            forced_epochs += 2

        if self._use_skips:
            self._counter = collections.Counter()
        if self._supervised:
            self._update_annotation_choices()
            self._annot_coding.update_weight()

        return forced_epochs

    @staticmethod
    def segmentation_to_splitloc(constructions):
        """Return a list of split locations for a segmented compound."""
        splitloc = []
        i = 0
        for c in constructions:
            i += len(c)
            splitloc.append(i)
        return tuple(splitloc[:-1])

    @staticmethod
    def _splitloc_to_segmentation(compound, splitloc):
        """Return segmentation corresponding to the list of split
        locations."""
        if isinstance(splitloc, numbers.Number):
            return [compound[:splitloc], compound[splitloc:]]
        parts = []
        startpos = 0
        endpos = 0
        for i in range(len(splitloc)):
            endpos = splitloc[i]
            parts.append(compound[startpos:endpos])
            startpos = endpos
        parts.append(compound[endpos:])
        return parts

    @staticmethod
    def _join_constructions(constructions):
        """Join the constructions by addition; works for both lists and
        strings.
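
        For example, ['ab', 'cd'] gives 'abcd', and [('a',), ('b',)]
        gives ('a', 'b').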

        """
        result = type(constructions[0])()
        for c in constructions:
            result += c
        return result

    def get_compounds(self):
        """Return the compound types stored by the model."""
        self._check_segment_only()
        return [w for w, node in self._analyses.items()
                if node.rcount > 0]

    def get_constructions(self):
        """Return a list of the present constructions and their counts."""
        return sorted((c, node.count) for c, node in self._analyses.items()
                      if not node.splitloc)

    def get_cost(self):
        """Return current model encoding cost."""
        cost = self._corpus_coding.get_cost() + self._lexicon_coding.get_cost()
        if self._supervised:
            return cost + self._annot_coding.get_cost()
        else:
            return cost

    def get_segmentations(self):
        """Retrieve segmentations for all compounds encoded by the model."""
        self._check_segment_only()
        for w in sorted(self._analyses.keys()):
            c = self._analyses[w].rcount
            if c > 0:
                yield c, w, self.segment(w)

    def load_data(self, data, freqthreshold=1, count_modifier=None,
                  init_rand_split=None):
        """Load data to initialize the model for batch training.

        Arguments:
            data: iterator of (count, compound_atoms) tuples
            freqthreshold: discard compounds that occur fewer than the
                           given number of times in the corpus (default 1)
            count_modifier: function for adjusting the counts of each
                            compound
            init_rand_split: if given, randomly split each compound, using
                             init_rand_split as the probability for each
                             split position

        Adds the compounds in the corpus to the model lexicon. Returns
        the total cost.

        """
        self._check_segment_only()
        totalcount = collections.Counter()
        for count, atoms in data:
            if len(atoms) > 0:
                totalcount[atoms] += count

        for atoms, count in totalcount.items():
            if count < freqthreshold:
                continue
            if count_modifier is not None:
                self._add_compound(atoms, count_modifier(count))
            else:
                self._add_compound(atoms, count)

            if init_rand_split is not None and init_rand_split > 0:
                parts = self._random_split(atoms, init_rand_split)
                self._set_compound_analysis(atoms, parts)

        return self.get_cost()

    def load_segmentations(self, segmentations):
        """Load model from existing segmentations.

        The argument should be an iterator providing a count, a
        compound, and its segmentation.

        """
        self._check_segment_only()
        for count, compound, segmentation in segmentations:
            self._add_compound(compound, count)
            self._set_compound_analysis(compound, segmentation)

    def set_annotations(self, annotations, annotatedcorpusweight=None):
        """Prepare model for semi-supervised learning with given
        annotations.

        """
        self._check_segment_only()
        self._supervised = True
        self.annotations = annotations
        self._annot_coding = AnnotatedCorpusEncoding(
            self._corpus_coding, weight=annotatedcorpusweight)
        self._annot_coding.boundaries = len(self.annotations)

    def segment(self, compound):
        """Segment the compound by looking it up in the model analyses.

        Raises KeyError if compound is not present in the training
        data. For segmenting new words, use viterbi_segment(compound).
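
        For example (hypothetical), segment('uneasy') may return
        ['un', 'easy'] if that analysis was learned during training.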

        """
        self._check_segment_only()
        rcount, count, splitloc = self._analyses[compound]
        constructions = []
        if splitloc:
            for child in self._splitloc_to_segmentation(compound, splitloc):
                constructions += self.segment(child)
        else:
            constructions.append(compound)
        return constructions

    def train_batch(self, algorithm='recursive', algorithm_params=(),
                    finish_threshold=0.005, max_epochs=None):
        """Train the model in batch fashion.

        The model is trained with the data already loaded into the model
        (by using an existing model or calling one of the load_ methods).

        In each iteration (epoch) all compounds in the training data are
        optimized once, in a random order. If applicable, corpus weight,
        annotation cost, and random split counters are recalculated after
        each iteration.

        Arguments:
            algorithm: string in ('recursive', 'viterbi') that indicates
                       the splitting algorithm used.
            algorithm_params: parameters passed to the splitting algorithm.
            finish_threshold: the stopping threshold. Training stops when
                              the improvement of the last iteration is
                              smaller than finish_threshold * #boundaries
            max_epochs: maximum number of epochs to train
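
        A minimal usage sketch (assuming `data` is an iterator of
        (count, compound) pairs):

            model = BaselineModel()
            model.load_data(data)
            epochs, cost = model.train_batch()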

        """
        epochs = 0
        forced_epochs = max(1, self._epoch_update(epochs))
        newcost = self.get_cost()
        compounds = list(self.get_compounds())
        _logger.info("Compounds in training data: %s types / %s tokens",
                     len(compounds), self._corpus_coding.boundaries)

        _logger.info("Starting batch training")
        _logger.info("Epochs: %s\tCost: %s", epochs, newcost)

        while True:
            # One epoch
            random.shuffle(compounds)

            for w in _progress(compounds):
                if algorithm == 'recursive':
                    segments = self._recursive_optimize(w, *algorithm_params)
                elif algorithm == 'viterbi':
                    segments = self._viterbi_optimize(w, *algorithm_params)
                else:
                    raise MorfessorException("unknown algorithm '%s'" %
                                             algorithm)
                _logger.debug("#%s -> %s", w, _constructions_to_str(segments))
            epochs += 1

            _logger.debug("Cost before epoch update: %s", self.get_cost())
            forced_epochs = max(forced_epochs, self._epoch_update(epochs))
            oldcost = newcost
            newcost = self.get_cost()

            _logger.info("Epochs: %s\tCost: %s", epochs, newcost)
            if (forced_epochs == 0 and
                    newcost >= oldcost - finish_threshold *
                    self._corpus_coding.boundaries):
                break
            if forced_epochs > 0:
                forced_epochs -= 1
            if max_epochs is not None and epochs >= max_epochs:
                _logger.info("Max number of epochs reached, stop training")
                break
        _logger.info("Done.")
        return epochs, newcost

    def train_online(self, data, count_modifier=None, epoch_interval=10000,
                     algorithm='recursive', algorithm_params=(),
                     init_rand_split=None, max_epochs=None):
        """Train the model in online fashion.

        The model is trained with the data provided in the data argument.
        As an example, the data could come from a generator linked to
        standard input for live monitoring of the splitting.

        All compounds from data are only optimized once. After online
        training, batch training could be used for further optimization.

        Epochs are defined as a fixed number of compounds. After each epoch
        (as in batch training), the annotation cost and random split
        counters are recalculated if applicable.

        Arguments:
            data: iterator of (_, compound_atoms) tuples. The first
                  argument is ignored, as every occurrence of the
                  compound is taken with count 1
            count_modifier: function for adjusting the counts of each
                            compound
            epoch_interval: number of compounds to process before starting
                            a new epoch
            algorithm: string in ('recursive', 'viterbi') that indicates
                       the splitting algorithm used.
            algorithm_params: parameters passed to the splitting algorithm.
            init_rand_split: probability of randomly splitting a compound
                             at any position when initializing the model.
                             None or 0 means no random splitting.
            max_epochs: maximum number of epochs to train

        """
        self._check_segment_only()
        if count_modifier is not None:
            counts = {}

        _logger.info("Starting online training")

        epochs = 0
        i = 0
        more_tokens = True
        while more_tokens:
            self._epoch_update(epochs)
            newcost = self.get_cost()
            _logger.info("Tokens processed: %s\tCost: %s", i, newcost)

            for _ in _progress(range(epoch_interval)):
                try:
                    _, w = next(data)
                except StopIteration:
                    more_tokens = False
                    break

                if len(w) == 0:
                    # Newline in corpus
                    continue

                if count_modifier is not None:
                    if w not in counts:
                        c = 0
                        counts[w] = 1
                        addc = 1
                    else:
                        c = counts[w]
                        counts[w] = c + 1
                        addc = count_modifier(c + 1) - count_modifier(c)
                    if addc > 0:
                        self._add_compound(w, addc)
                else:
                    self._add_compound(w, 1)
                if init_rand_split is not None and init_rand_split > 0:
                    parts = self._random_split(w, init_rand_split)
                    self._set_compound_analysis(w, parts)
                if algorithm == 'recursive':
                    segments = self._recursive_optimize(w, *algorithm_params)
                elif algorithm == 'viterbi':
                    segments = self._viterbi_optimize(w, *algorithm_params)
                else:
                    raise MorfessorException("unknown algorithm '%s'" %
                                             algorithm)
                _logger.debug("#%s: %s -> %s", i, w,
                              _constructions_to_str(segments))
                i += 1

            epochs += 1
            if max_epochs is not None and epochs >= max_epochs:
                _logger.info("Max number of epochs reached, stop training")
                break

        self._epoch_update(epochs)
        newcost = self.get_cost()
        _logger.info("Tokens processed: %s\tCost: %s", i, newcost)
        return epochs, newcost

    def viterbi_segment(self, compound, addcount=1.0, maxlen=30):
        """Find optimal segmentation using the Viterbi algorithm.

        Arguments:
            compound: compound to be segmented
            addcount: constant for additive smoothing (0 = no smoothing)
            maxlen: maximum length for the constructions

        If additive smoothing is applied, new complex construction types
        can be selected during the search. Without smoothing, only new
        single-atom constructions can be selected.

        Returns the most probable segmentation and its log-probability.

        """
        clen = len(compound)
        grid = [(0.0, None)]
        if self._corpus_coding.tokens + self._corpus_coding.boundaries + \
                addcount > 0:
            logtokens = math.log(self._corpus_coding.tokens +
                                 self._corpus_coding.boundaries + addcount)
        else:
            logtokens = 0
        badlikelihood = clen * logtokens + 1.0
        # Viterbi main loop
        for t in range(1, clen + 1):
            # Select the best path to current node.
            # Note that we can come from any node in history.
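            # grid[t] holds the best (cumulative cost, backpointer) pair
            # for a segmentation of compound[:t]; the cost of extending
            # from position pt is the negative log-probability of the
            # construction compound[pt:t].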
            bestpath = None
            bestcost = None
            if self.nosplit_re and t < clen and \
                    self.nosplit_re.match(compound[(t-1):(t+1)]):
                grid.append((clen*badlikelihood, t-1))
                continue
            for pt in range(max(0, t - maxlen), t):
                if grid[pt][0] is None:
                    continue
                cost = grid[pt][0]
                construction = compound[pt:t]
                if (construction in self._analyses and
                        not self._analyses[construction].splitloc):
                    if self._analyses[construction].count <= 0:
                        raise MorfessorException(
                            "Construction count of '%s' is %s" %
                            (construction,
                             self._analyses[construction].count))
                    cost += (logtokens -
                             math.log(self._analyses[construction].count +
                                      addcount))
                elif addcount > 0:
                    if self._corpus_coding.tokens == 0:
                        cost += (addcount * math.log(addcount) +
                                 self._lexicon_coding.get_codelength(
                                     construction)
                                 / self._corpus_coding.weight)
                    else:
                        cost += (logtokens - math.log(addcount) +
                                 (((self._lexicon_coding.boundaries +
                                    addcount) *
                                   math.log(self._lexicon_coding.boundaries
                                            + addcount))
                                  - (self._lexicon_coding.boundaries
                                     * math.log(
                                         self._lexicon_coding.boundaries))
                                  + self._lexicon_coding.get_codelength(
                                      construction))
                                 / self._corpus_coding.weight)
                elif len(construction) == 1:
                    cost += badlikelihood
                elif self.nosplit_re:
                    # Some splits are forbidden, so longer unknown
                    # constructions have to be allowed
                    cost += len(construction) * badlikelihood
                else:
                    continue
                if bestcost is None or cost < bestcost:
                    bestcost = cost
                    bestpath = pt
            grid.append((bestcost, bestpath))
        constructions = []
        cost, path = grid[-1]
        lt = clen + 1
        while path is not None:
            t = path
            constructions.append(compound[t:lt])
            path = grid[t][1]
            lt = t
        constructions.reverse()
        # Add boundary cost
        cost += (math.log(self._corpus_coding.tokens +
                          self._corpus_coding.boundaries) -
                 math.log(self._corpus_coding.boundaries))
        return constructions, cost

    def forward_logprob(self, compound):
        """Find log-probability of a compound using the forward algorithm.

        Arguments:
            compound: compound to process

        Returns the (negative) log-probability of the compound. If the
        probability is zero, returns a number that is larger than the
        value defined by the penalty attribute of the model object.

        """
        clen = len(compound)
        grid = [0.0]
        if self._corpus_coding.tokens + self._corpus_coding.boundaries > 0:
            logtokens = math.log(self._corpus_coding.tokens +
                                 self._corpus_coding.boundaries)
        else:
            logtokens = 0
        # Forward main loop
        for t in range(1, clen + 1):
            # Sum probabilities from all paths to the current node.
            # Note that we can come from any node in history.
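            # grid[t] stores -log P(compound[:t]); psum below accumulates
            # the sum over pt of P(compound[:pt]) * P(compound[pt:t]),
            # hence the exp/log round trip.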
            psum = 0.0
            for pt in range(0, t):
                cost = grid[pt]
                construction = compound[pt:t]
                if (construction in self._analyses and
                        not self._analyses[construction].splitloc):
                    if self._analyses[construction].count <= 0:
                        raise MorfessorException(
                            "Construction count of '%s' is %s" %
                            (construction,
                             self._analyses[construction].count))
                    cost += (logtokens -
                             math.log(self._analyses[construction].count))
                else:
                    continue
                psum += math.exp(-cost)
            if psum > 0:
                grid.append(-math.log(psum))
            else:
                grid.append(-self.penalty)
        cost = grid[-1]
        # Add boundary cost
        cost += (math.log(self._corpus_coding.tokens +
                          self._corpus_coding.boundaries) -
                 math.log(self._corpus_coding.boundaries))
        return cost

    def viterbi_nbest(self, compound, n, addcount=1.0, maxlen=30):
        """Find top-n optimal segmentations using the Viterbi algorithm.

        Arguments:
            compound: compound to be segmented
            n: how many segmentations to return
            addcount: constant for additive smoothing (0 = no smoothing)
            maxlen: maximum length for the constructions

        If additive smoothing is applied, new complex construction types
        can be selected during the search. Without smoothing, only new
        single-atom constructions can be selected.

        Returns the n most probable segmentations and their
        log-probabilities.

        """
        clen = len(compound)
        grid = [[(0.0, None, None)]]
        if self._corpus_coding.tokens + self._corpus_coding.boundaries + \
                addcount > 0:
            logtokens = math.log(self._corpus_coding.tokens +
                                 self._corpus_coding.boundaries + addcount)
        else:
            logtokens = 0
        badlikelihood = clen * logtokens + 1.0
        # Viterbi main loop
        for t in range(1, clen + 1):
            # Select the best paths to the current node.
            # Note that we can come from any node in history.
            bestn = []
            if self.nosplit_re and t < clen and \
                    self.nosplit_re.match(compound[(t-1):(t+1)]):
                grid.append([(-clen*badlikelihood, t-1, -1)])
                continue
            for pt in range(max(0, t - maxlen), t):
                for k in range(len(grid[pt])):
                    if grid[pt][k][0] is None:
                        continue
                    cost = grid[pt][k][0]
                    construction = compound[pt:t]
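                    # Note the sign convention: unlike viterbi_segment(),
                    # the terms below are subtracted (cost -= ...), so cost
                    # accumulates the log-probability, larger is better,
                    # and the min-heap bestn keeps the n best paths by
                    # evicting the worst candidate.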
                    if (construction in self._analyses and
                            not self._analyses[construction].splitloc):
                        if self._analyses[construction].count <= 0:
                            raise MorfessorException(
                                "Construction count of '%s' is %s" %
                                (construction,
                                 self._analyses[construction].count))
                        cost -= (logtokens -
                                 math.log(
                                     self._analyses[construction].count +
                                     addcount))
                    elif addcount > 0:
                        if self._corpus_coding.tokens == 0:
                            cost -= (addcount * math.log(addcount) +
                                     self._lexicon_coding.get_codelength(
                                         construction)
                                     / self._corpus_coding.weight)
                        else:
                            cost -= (logtokens - math.log(addcount) +
                                     (((self._lexicon_coding.boundaries +
                                        addcount) *
                                       math.log(
                                           self._lexicon_coding.boundaries
                                           + addcount))
                                      - (self._lexicon_coding.boundaries
                                         * math.log(
                                             self._lexicon_coding.boundaries))
                                      + self._lexicon_coding.get_codelength(
                                          construction))
                                     / self._corpus_coding.weight)
                    elif len(construction) == 1:
                        cost -= badlikelihood
                    elif self.nosplit_re:
                        # Some splits are forbidden, so longer unknown
                        # constructions have to be allowed
                        cost -= len(construction) * badlikelihood
                    else:
                        continue
                    if len(bestn) < n:
                        heapq.heappush(bestn, (cost, pt, k))
                    else:
                        heapq.heappushpop(bestn, (cost, pt, k))
            grid.append(bestn)
        results = []
        for k in range(len(grid[-1])):
            constructions = []
            cost, path, ki = grid[-1][k]
            lt = clen + 1
            while path is not None:
                t = path
                constructions.append(compound[t:lt])
                path = grid[t][ki][1]
                ki = grid[t][ki][2]
                lt = t
            constructions.reverse()
            # Add boundary cost
            cost -= (math.log(self._corpus_coding.tokens +
                              self._corpus_coding.boundaries) -
                     math.log(self._corpus_coding.boundaries))
            results.append((-cost, constructions))
        return [(constr, cost) for cost, constr in sorted(results)]

    def get_corpus_coding_weight(self):
        return self._corpus_coding.weight

    def set_corpus_coding_weight(self, weight):
        self._check_segment_only()
        self._corpus_coding.weight = weight

    def make_segment_only(self):
        """Reduce the size of this model by removing all non-morphs from
        the analyses. After calling this method, any call to a method that
        would change the state of the model raises an exception.

        """
        self._segment_only = True
        self._analyses = {k: v for (k, v) in self._analyses.items()
                          if not v.splitloc}

    def clear_segmentation(self):
        for compound in list(self.get_compounds()):
            self._set_compound_analysis(compound, [compound])


class CorpusWeight(object):
    @classmethod
    def move_direction(cls, model, direction, epoch):
        if direction != 0:
            weight = model.get_corpus_coding_weight()
            if direction > 0:
                weight *= 1 + 2.0 / epoch
            else:
                weight *= 1.0 / (1 + 2.0 / epoch)
            model.set_corpus_coding_weight(weight)
            _logger.info("Corpus weight set to %s", weight)
            return True
        return False


class FixedCorpusWeight(CorpusWeight):
    def __init__(self, weight):
        self.weight = weight

    def update(self, model, _):
        model.set_corpus_coding_weight(self.weight)
        return False


class AnnotationCorpusWeight(CorpusWeight):
    """Class for using development annotations to update the corpus weight
    during batch training.

    """

    def __init__(self, devel_set, threshold=0.01):
        self.data = devel_set
        self.threshold = threshold

    def update(self, model, epoch):
        """Tune the model corpus weight based on the precision and
        recall of the development data, trying to keep them equal.

        """
        if epoch < 1:
            return False
        tmp = self.data.items()
        wlist, annotations = zip(*tmp)
        segments = [model.viterbi_segment(w)[0] for w in wlist]
        d = self._estimate_segmentation_dir(segments, annotations)

        return self.move_direction(model, d, epoch)

    @classmethod
    def _boundary_recall(cls, prediction, reference):
        """Calculate average boundary recall for given segmentations.
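
        For example, a compound whose only predicted analysis is
        ['ab', 'cde'] and whose only reference analysis is
        ['ab', 'c', 'de'] matches one of the two reference boundaries,
        giving a recall of 0.5 for that compound.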

        """
        rec_total = 0
        rec_sum = 0.0
        for pre_list, ref_list in zip(prediction, reference):
            best = -1
            for ref in ref_list:
                # list of internal boundary positions
                ref_b = set(BaselineModel.segmentation_to_splitloc(ref))
                if len(ref_b) == 0:
                    best = 1.0
                    break
                for pre in pre_list:
                    pre_b = set(BaselineModel.segmentation_to_splitloc(pre))
                    r = len(ref_b.intersection(pre_b)) / float(len(ref_b))
                    if r > best:
                        best = r
            if best >= 0:
                rec_sum += best
                rec_total += 1
        return rec_sum, rec_total

    @classmethod
    def _bpr_evaluation(cls, prediction, reference):
        """Return boundary precision, recall, and F-score for
        segmentations."""
        rec_s, rec_t = cls._boundary_recall(prediction, reference)
        pre_s, pre_t = cls._boundary_recall(reference, prediction)
        rec = rec_s / rec_t
        pre = pre_s / pre_t
        f = 2.0 * pre * rec / (pre + rec)
        return pre, rec, f

    def _estimate_segmentation_dir(self, segments, annotations):
        """Estimate if the given compounds are under- or oversegmented.

        The decision is based on the difference between boundary precision
        and recall values for the given sample of segmented data.

        Arguments:
            segments: list of predicted segmentations
            annotations: list of reference segmentations

        Return 1 in the case of oversegmentation, -1 in the case of
        undersegmentation, and 0 if no changes are required.

        """
        pre, rec, f = self._bpr_evaluation([[x] for x in segments],
                                           annotations)
        _logger.info("Boundary evaluation: precision %.4f; recall %.4f",
                     pre, rec)
        if abs(pre - rec) < self.threshold:
            return 0
        elif rec > pre:
            return 1
        else:
            return -1


class MorphLengthCorpusWeight(CorpusWeight):
    def __init__(self, morph_length, threshold=0.01):
        self.morph_length = morph_length
        self.threshold = threshold

    def update(self, model, epoch):
        if epoch < 1:
            return False
        cur_length = self.calc_morph_length(model)

        _logger.info("Current morph-length: %s", cur_length)

        if (abs(self.morph_length - cur_length) / self.morph_length >
                self.threshold):
            d = abs(self.morph_length - cur_length) / (self.morph_length -
                                                       cur_length)
            return self.move_direction(model, d, epoch)
        return False

    @classmethod
    def calc_morph_length(cls, model):
        total_constructions = 0
        total_atoms = 0
        for compound in model.get_compounds():
            constructions = model.segment(compound)
            for construction in constructions:
                total_constructions += 1
                total_atoms += len(construction)
        if total_constructions > 0:
            return float(total_atoms) / total_constructions
        else:
            return 0.0


class NumMorphCorpusWeight(CorpusWeight):
    def __init__(self, num_morph_types, threshold=0.01):
        self.num_morph_types = num_morph_types
        self.threshold = threshold

    def update(self, model, epoch):
        if epoch < 1:
            return False
        cur_morph_types = model._lexicon_coding.boundaries

        _logger.info("Number of morph types: %s", cur_morph_types)

        if (abs(self.num_morph_types - cur_morph_types) /
                self.num_morph_types > self.threshold):
            d = (abs(self.num_morph_types - cur_morph_types) /
                 (self.num_morph_types - cur_morph_types))
            return self.move_direction(model, d, epoch)
        return False


class Encoding(object):
    """Base class for calculating the entropy (encoding length) of a corpus
    or lexicon.

    Commonly subclassed to redefine specific methods.
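
    The cost returned by get_cost() is

        weight * (n*log(n) - b*log(b) - sum_c f(c)*log(f(c))
                  + permutations_cost())
        + frequency_distribution_cost()

    where n = tokens + boundaries, b = boundaries, and f(c) is the token
    count of construction c.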

    """
    def __init__(self, weight=1.0):
        """Initialize class.

        Arguments:
            weight: weight used for this encoding

        """
        self.logtokensum = 0.0
        self.tokens = 0
        self.boundaries = 0
        self.weight = weight

    # constant used for speeding up logfactorial calculations with Stirling's
    # approximation
    _log2pi = math.log(2 * math.pi)

    @property
    def types(self):
        """Define the number of types as 0. types is made a property method
        to ensure easy redefinition in subclasses.

        """
        return 0

    @classmethod
    def _logfactorial(cls, n):
        """Calculate logarithm of n!.

        For large n (n > 20), use Stirling's approximation.

        """
        if n < 2:
            return 0.0
        if n < 20:
            return math.log(math.factorial(n))
        logn = math.log(n)
        return n * logn - n + 0.5 * (logn + cls._log2pi)

    def frequency_distribution_cost(self):
        """Calculate -log[(u - 1)! (v - u)! / (v - 1)!]

        v is the number of tokens+boundaries and u the number of types

        """
        if self.types < 2:
            return 0.0
        tokens = self.tokens + self.boundaries
        return (self._logfactorial(tokens - 1) -
                self._logfactorial(self.types - 1) -
                self._logfactorial(tokens - self.types))

    def permutations_cost(self):
        """The permutations cost for the encoding."""
        return -self._logfactorial(self.boundaries)

    def update_count(self, construction, old_count, new_count):
        """Update the counts in the encoding."""
        self.tokens += new_count - old_count
        if old_count > 1:
            self.logtokensum -= old_count * math.log(old_count)
        if new_count > 1:
            self.logtokensum += new_count * math.log(new_count)

    def get_cost(self):
        """Calculate the cost for encoding the corpus/lexicon."""
        if self.boundaries == 0:
            return 0.0

        n = self.tokens + self.boundaries
        return ((n * math.log(n)
                 - self.boundaries * math.log(self.boundaries)
                 - self.logtokensum
                 + self.permutations_cost()) * self.weight
                + self.frequency_distribution_cost())


class CorpusEncoding(Encoding):
    """Encoding for the corpus.

    The basic difference to a normal encoding is that the number of types
    is not stored directly but fetched from the lexicon encoding. The cost
    function also contains no permutation cost.

    """
    def __init__(self, lexicon_encoding, weight=1.0):
        super(CorpusEncoding, self).__init__(weight)
        self.lexicon_encoding = lexicon_encoding

    @property
    def types(self):
        """Return the number of types of the corpus, which is the same as
        the number of boundaries in the lexicon + 1.

        """
        return self.lexicon_encoding.boundaries + 1

    def frequency_distribution_cost(self):
        """Calculate -log[(M - 1)! (N - M)! / (N - 1)!] for M types and N
        tokens.

        """
        if self.types < 2:
            return 0.0
        tokens = self.tokens
        return (self._logfactorial(tokens - 1) -
                self._logfactorial(self.types - 2) -
                self._logfactorial(tokens - self.types + 1))

    def get_cost(self):
        """Override for the Encoding get_cost function. A corpus does not
        have a permutation cost.
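
        Concretely, the cost is weight * (n*log(n) - b*log(b)
        - sum_c f(c)*log(f(c))) + frequency_distribution_cost(), with
        n = tokens + boundaries and b = boundaries.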

        """
        if self.boundaries == 0:
            return 0.0

        n = self.tokens + self.boundaries
        return ((n * math.log(n)
                 - self.boundaries * math.log(self.boundaries)
                 - self.logtokensum) * self.weight
                + self.frequency_distribution_cost())


class AnnotatedCorpusEncoding(Encoding):
    """Encoding the cost of an Annotated Corpus.

    In this encoding constructions that are missing are penalized.

    """
    def __init__(self, corpus_coding, weight=None, penalty=-9999.9):
        """Initialize encoding with appropriate meta data.

        Arguments:
            corpus_coding: CorpusEncoding instance used for retrieving the
                           number of tokens and boundaries in the corpus
            weight: weight of this encoding. If the weight is None, it is
                    updated automatically to be in balance with the corpus
            penalty: log penalty used for missing constructions

        """
        super(AnnotatedCorpusEncoding, self).__init__()
        self.do_update_weight = True
        self.weight = 1.0
        if weight is not None:
            self.do_update_weight = False
            self.weight = weight
        self.corpus_coding = corpus_coding
        self.penalty = penalty
        self.constructions = collections.Counter()

    def set_constructions(self, constructions):
        """Method for re-initializing the constructions. The counts of the
        constructions must still be set with calls to set_count.

        """
        self.constructions = constructions
        self.tokens = sum(constructions.values())
        self.logtokensum = 0.0

    def set_count(self, construction, count):
        """Set an initial count for each construction. Missing
        constructions are penalized.

        """
        annot_count = self.constructions[construction]
        if count > 0:
            self.logtokensum += annot_count * math.log(count)
        else:
            self.logtokensum += annot_count * self.penalty

    def update_count(self, construction, old_count, new_count):
        """Update the counts in the encoding, setting (or removing) a
        penalty for missing constructions.

        """
        if construction in self.constructions:
            annot_count = self.constructions[construction]
            if old_count > 0:
                self.logtokensum -= annot_count * math.log(old_count)
            else:
                self.logtokensum -= annot_count * self.penalty
            if new_count > 0:
                self.logtokensum += annot_count * math.log(new_count)
            else:
                self.logtokensum += annot_count * self.penalty

    def update_weight(self):
        """Update the weight of the encoding by taking the ratio of the
        corpus boundaries and annotated boundaries.

        """
        if not self.do_update_weight:
            return
        old = self.weight
        self.weight = (self.corpus_coding.weight *
                       float(self.corpus_coding.boundaries) /
                       self.boundaries)
        if self.weight != old:
            _logger.info("Corpus weight of annotated data set to %s",
                         self.weight)

    def get_cost(self):
        """Return the cost of the annotated corpus."""
        if self.boundaries == 0:
            return 0.0
        n = self.tokens + self.boundaries
        return ((n * math.log(self.corpus_coding.tokens +
                              self.corpus_coding.boundaries)
                 - self.boundaries * math.log(self.corpus_coding.boundaries)
                 - self.logtokensum) * self.weight)


class LexiconEncoding(Encoding):
    """Class for calculating the encoding cost for the lexicon."""

    def __init__(self):
        """Initialize Lexicon Encoding."""
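        # The lexicon is encoded as the concatenation of its
        # constructions' atoms; self.atoms counts how often each atom
        # occurs across all constructions in the lexicon.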
        super(LexiconEncoding, self).__init__()
        self.atoms = collections.Counter()

    @property
    def types(self):
        """Return the number of different atoms in the lexicon + 1 for the
        compound-end-token.

        """
        return len(self.atoms) + 1

    def add(self, construction):
        """Add a construction to the lexicon, automatically updating the
        counts of its atoms.

        """
        self.boundaries += 1
        for atom in construction:
            c = self.atoms[atom]
            self.atoms[atom] = c + 1
            self.update_count(atom, c, c + 1)

    def remove(self, construction):
        """Remove construction from the lexicon, automatically updating the
        counts of its atoms.

        """
        self.boundaries -= 1
        for atom in construction:
            c = self.atoms[atom]
            self.atoms[atom] = c - 1
            self.update_count(atom, c, c - 1)

    def get_codelength(self, construction):
        """Return an approximate codelength for a new construction."""
        l = len(construction) + 1
        cost = l * math.log(self.tokens + l)
        cost -= math.log(self.boundaries + 1)
        for atom in construction:
            if atom in self.atoms:
                c = self.atoms[atom]
            else:
                c = 1
            cost -= math.log(c)
        return cost