import collections
import heapq
import logging
import math
import numbers
import random
import re

from .utils import _progress, _is_string
from .exception import MorfessorException, SegmentOnlyModelException

_logger = logging.getLogger(__name__)


def _constructions_to_str(constructions):
    """Return a readable string for a list of constructions."""
    if _is_string(constructions[0]):
        # Constructions are strings
        return ' + '.join(constructions)
    else:
        # Constructions are not strings (should be tuples of strings)
        return ' + '.join(map(lambda x: ' '.join(x), constructions))
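
# For example (illustrative):
#     _constructions_to_str(['kahvi', 'kone'])     -> 'kahvi + kone'
#     _constructions_to_str([('a', 'b'), ('c',)])  -> 'a b + c'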


# rcount = root count (from corpus)
# count = total count of the node
# splitloc = integer or tuple. Location(s) of the possible splits for virtual
#            constructions; empty tuple or 0 if real construction
ConstrNode = collections.namedtuple('ConstrNode',
                                    ['rcount', 'count', 'splitloc'])
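
# For example (illustrative): a real construction occurring six times in the
# corpus could be stored as ConstrNode(rcount=6, count=6, splitloc=()),
# whereas 'kahvikone' analysed virtually as 'kahvi' + 'kone' would be
# ConstrNode(rcount=1, count=1, splitloc=(5,)).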


class BaselineModel(object):
    """Morfessor Baseline model class.

    Implements training of and segmenting with a Morfessor model. The model
    is completely agnostic to whether it is used with lists of strings
    (finding phrases in sentences) or strings of characters (finding morphs
    in words).

    """

    penalty = -9999.9

    def __init__(self, forcesplit_list=None, corpusweight=None,
                 use_skips=False, nosplit_re=None):
        """Initialize a new model instance.

        Arguments:
            forcesplit_list: force segmentations on the characters in
                               the given list
            corpusweight: weight for the corpus cost
            use_skips: randomly skip frequently occurring constructions
                         to speed up training
            nosplit_re: regular expression string for preventing splitting
                          in certain contexts

        """

        # In analyses for each construction a ConstrNode is stored. All
        # training data has an rcount (real count) > 0. All real morphemes
        # have no split locations.
        self._analyses = {}

        # Flag to indicate the model is only useful for segmentation
        self._segment_only = False

        # Cost variables
        self._lexicon_coding = LexiconEncoding()
        self._corpus_coding = CorpusEncoding(self._lexicon_coding)
        self._annot_coding = None

        # Set corpus weight updater
        self.set_corpus_weight_updater(corpusweight)

        # Configuration variables
        self._use_skips = use_skips  # Random skips for frequent constructions
        self._supervised = False

        # Counter for random skipping
        self._counter = collections.Counter()
        if forcesplit_list is None:
            self.forcesplit_list = []
        else:
            self.forcesplit_list = forcesplit_list
        if nosplit_re is None:
            self.nosplit_re = None
        else:
            self.nosplit_re = re.compile(nosplit_re, re.UNICODE)

        # Used only for (semi-)supervised learning
        self.annotations = None

    def set_corpus_weight_updater(self, corpus_weight):
        if corpus_weight is None:
            self._corpus_weight_updater = FixedCorpusWeight(1.0)
        elif isinstance(corpus_weight, numbers.Number):
            self._corpus_weight_updater = FixedCorpusWeight(corpus_weight)
        else:
            self._corpus_weight_updater = corpus_weight

        self._corpus_weight_updater.update(self, 0)

    def _check_segment_only(self):
        if self._segment_only:
            raise SegmentOnlyModelException()

    @property
    def tokens(self):
        """Return the number of construction tokens."""
        return self._corpus_coding.tokens

    @property
    def types(self):
        """Return the number of construction types."""
        return self._corpus_coding.types - 1  # do not include boundary

    def _add_compound(self, compound, c):
        """Add compound with count c to data."""
        self._corpus_coding.boundaries += c
        self._modify_construction_count(compound, c)
        oldrc = self._analyses[compound].rcount
        self._analyses[compound] = \
            self._analyses[compound]._replace(rcount=oldrc + c)

    def _remove(self, construction):
        """Remove construction from model."""
        rcount, count, splitloc = self._analyses[construction]
        self._modify_construction_count(construction, -count)
        return rcount, count

    def _random_split(self, compound, threshold):
        """Return a random split for compound.

        Arguments:
            compound: compound to split
            threshold: probability of splitting at each position

        """
        splitloc = tuple(i for i in range(1, len(compound))
                         if random.random() < threshold)
        return self._splitloc_to_segmentation(compound, splitloc)
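
    # Illustrative behavior: a threshold of 0.0 never splits, while 1.0
    # splits at every position, so _random_split('kahvi', 1.0) returns
    # ['k', 'a', 'h', 'v', 'i'].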

    def _set_compound_analysis(self, compound, parts, ptype='rbranch'):
        """Set the analysis of the compound according to the given
        segmentation.

        Arguments:
            compound: compound to split
            parts: desired constructions of the compound
            ptype: type of the parse tree to use

        If ptype is 'rbranch', the analysis is stored internally as a
        right-branching tree. If ptype is 'flat', the analysis is stored
        directly to the compound's node.

        """
        if len(parts) == 1:
            rcount, count = self._remove(compound)
            self._analyses[compound] = ConstrNode(rcount, 0, tuple())
            self._modify_construction_count(compound, count)
        elif ptype == 'flat':
            rcount, count = self._remove(compound)
            splitloc = self.segmentation_to_splitloc(parts)
            self._analyses[compound] = ConstrNode(rcount, count, splitloc)
            for constr in parts:
                self._modify_construction_count(constr, count)
        elif ptype == 'rbranch':
            construction = compound
            for p in range(len(parts)):
                rcount, count = self._remove(construction)
                prefix = parts[p]
                if p == len(parts) - 1:
                    self._analyses[construction] = ConstrNode(rcount, 0, 0)
                    self._modify_construction_count(construction, count)
                else:
                    suffix = self._join_constructions(parts[p + 1:])
                    self._analyses[construction] = ConstrNode(rcount, count,
                                                              len(prefix))
                    self._modify_construction_count(prefix, count)
                    self._modify_construction_count(suffix, count)
                    construction = suffix
        else:
            raise MorfessorException("Unknown parse type '%s'" % ptype)

    def _update_annotation_choices(self):
        """Update the selection of alternative analyses in annotations.

        For semi-supervised models, select the most likely alternative
        analyses included in the annotations of the compounds.

        """
        if not self._supervised:
            return

        # Collect constructions from the most probable segmentations
        # and add missing compounds also to the unannotated data
        constructions = collections.Counter()
        for compound, alternatives in self.annotations.items():
            if compound not in self._analyses:
                self._add_compound(compound, 1)

            analysis, cost = self._best_analysis(alternatives)
            for m in analysis:
                constructions[m] += self._analyses[compound].rcount

        # Apply the selected constructions in annotated corpus coding
        self._annot_coding.set_constructions(constructions)
        for m, f in constructions.items():
            count = 0
            if m in self._analyses and not self._analyses[m].splitloc:
                count = self._analyses[m].count
            self._annot_coding.set_count(m, count)

    def _best_analysis(self, choices):
        """Select the best analysis out of the given choices."""
        bestcost = None
        bestanalysis = None
        for analysis in choices:
            cost = 0.0
            for m in analysis:
                if m in self._analyses and not self._analyses[m].splitloc:
                    cost += (math.log(self._corpus_coding.tokens) -
                             math.log(self._analyses[m].count))
                else:
                    cost -= self.penalty  # penalty is negative
            if bestcost is None or cost < bestcost:
                bestcost = cost
                bestanalysis = analysis
        return bestanalysis, bestcost

    def _force_split(self, compound):
        """Return forced split of the compound."""
        if len(self.forcesplit_list) == 0:
            return [compound]
        clen = len(compound)
        j = 0
        parts = []
        for i in range(0, clen):
            if compound[i] in self.forcesplit_list:
                if len(compound[j:i]) > 0:
                    parts.append(compound[j:i])
                parts.append(compound[i:i + 1])
                j = i + 1
        if j < clen:
            parts.append(compound[j:])
        return [p for p in parts if len(p) > 0]
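
    # For example (illustrative): with forcesplit_list = ['-'],
    # _force_split('pop-up') returns ['pop', '-', 'up'].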

    def _test_skip(self, construction):
        """Return true if construction should be skipped."""
        if construction in self._counter:
            t = self._counter[construction]
            if random.random() > 1.0 / max(1, t):
                return True
        self._counter[construction] += 1
        return False

    def _viterbi_optimize(self, compound, addcount=0, maxlen=30):
        """Optimize segmentation of the compound using the Viterbi algorithm.

        Arguments:
          compound: compound to optimize
          addcount: constant for additive smoothing of Viterbi probs
          maxlen: maximum length for a construction

        Returns list of segments.

        """
        clen = len(compound)
        if clen == 1:  # Single atom
            return [compound]
        if self._use_skips and self._test_skip(compound):
            return self.segment(compound)
        # Collect forced subsegments
        parts = self._force_split(compound)
        # Use Viterbi algorithm to optimize the subsegments
        constructions = []
        for part in parts:
            constructions += self.viterbi_segment(part, addcount=addcount,
                                                  maxlen=maxlen)[0]
        self._set_compound_analysis(compound, constructions, ptype='flat')
        return constructions

    def _recursive_optimize(self, compound):
        """Optimize segmentation of the compound using recursive splitting.

        Returns list of segments.

        """
        if len(compound) == 1:  # Single atom
            return [compound]
        if self._use_skips and self._test_skip(compound):
            return self.segment(compound)
        # Collect forced subsegments
        parts = self._force_split(compound)
        if len(parts) == 1:
            # just one part
            return self._recursive_split(compound)
        self._set_compound_analysis(compound, parts)
        # Use recursive algorithm to optimize the subsegments
        constructions = []
        for part in parts:
            constructions += self._recursive_split(part)
        return constructions

    def _recursive_split(self, construction):
        """Optimize segmentation of the construction by recursive splitting.

        Returns list of segments.

        """
        if len(construction) == 1:  # Single atom
            return [construction]
        if self._use_skips and self._test_skip(construction):
            return self.segment(construction)
        rcount, count = self._remove(construction)

        # Check all binary splits and no split
        self._modify_construction_count(construction, count)
        mincost = self.get_cost()
        self._modify_construction_count(construction, -count)
        splitloc = 0
        for i in range(1, len(construction)):
            if (self.nosplit_re and
                    self.nosplit_re.match(construction[(i - 1):(i + 1)])):
                continue
            prefix = construction[:i]
            suffix = construction[i:]
            self._modify_construction_count(prefix, count)
            self._modify_construction_count(suffix, count)
            cost = self.get_cost()
            self._modify_construction_count(prefix, -count)
            self._modify_construction_count(suffix, -count)
            if cost <= mincost:
                mincost = cost
                splitloc = i

        if splitloc:
            # Virtual construction
            self._analyses[construction] = ConstrNode(rcount, count,
                                                      splitloc)
            prefix = construction[:splitloc]
            suffix = construction[splitloc:]
            self._modify_construction_count(prefix, count)
            self._modify_construction_count(suffix, count)
            lp = self._recursive_split(prefix)
            if suffix != prefix:
                return lp + self._recursive_split(suffix)
            else:
                return lp + lp
        else:
            # Real construction
            self._analyses[construction] = ConstrNode(rcount, 0, tuple())
            self._modify_construction_count(construction, count)
            return [construction]

    def _modify_construction_count(self, construction, dcount):
        """Modify the count of construction by dcount.

        For virtual constructions, recurses to child nodes in the
        tree. For real constructions, adds/removes construction
        to/from the lexicon whenever necessary.

        """
        if construction in self._analyses:
            rcount, count, splitloc = self._analyses[construction]
        else:
            rcount, count, splitloc = 0, 0, 0
        newcount = count + dcount
        if newcount == 0:
            del self._analyses[construction]
        else:
            self._analyses[construction] = ConstrNode(rcount, newcount,
                                                      splitloc)
        if splitloc:
            # Virtual construction
            children = self._splitloc_to_segmentation(construction, splitloc)
            for child in children:
                self._modify_construction_count(child, dcount)
        else:
            # Real construction
            self._corpus_coding.update_count(construction, count, newcount)
            if self._supervised:
                self._annot_coding.update_count(construction, count, newcount)

            if count == 0 and newcount > 0:
                self._lexicon_coding.add(construction)
            elif count > 0 and newcount == 0:
                self._lexicon_coding.remove(construction)

    def _epoch_update(self, epoch_num):
        """Do model updates that are necessary between training epochs.

        The argument is the number of training epochs finished.

        In practice, this does two things:
        - If random skipping is in use, reset construction counters.
        - If semi-supervised learning is in use and there are alternative
          analyses in the annotated data, select the annotations that are
          most likely given the model parameters. If not hand-set, update
          the weight of the annotated corpus.

        This method should also be run prior to training (with the
        epoch number argument as 0).

        """
        forced_epochs = 0
        if self._corpus_weight_updater.update(self, epoch_num):
            forced_epochs += 2

        if self._use_skips:
            self._counter = collections.Counter()
        if self._supervised:
            self._update_annotation_choices()
            self._annot_coding.update_weight()

        return forced_epochs

    @staticmethod
    def segmentation_to_splitloc(constructions):
        """Return a tuple of split locations for a segmented compound."""
        splitloc = []
        i = 0
        for c in constructions:
            i += len(c)
            splitloc.append(i)
        return tuple(splitloc[:-1])

    @staticmethod
    def _splitloc_to_segmentation(compound, splitloc):
        """Return segmentation corresponding to the split locations."""
        if isinstance(splitloc, numbers.Number):
            return [compound[:splitloc], compound[splitloc:]]
        parts = []
        startpos = 0
        endpos = 0
        for i in range(len(splitloc)):
            endpos = splitloc[i]
            parts.append(compound[startpos:endpos])
            startpos = endpos
        parts.append(compound[endpos:])
        return parts
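
    # These two helpers are inverses of each other (illustrative):
    #     segmentation_to_splitloc(('kah', 'vi', 'kone'))  -> (3, 5)
    #     _splitloc_to_segmentation('kahvikone', (3, 5))
    #         -> ['kah', 'vi', 'kone']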

    @staticmethod
    def _join_constructions(constructions):
        """Append the constructions after each other by addition. Works for
        both lists and strings.

        """
        result = type(constructions[0])()
        for c in constructions:
            result += c
        return result

    def get_compounds(self):
        """Return the compound types stored by the model."""
        self._check_segment_only()
        return [w for w, node in self._analyses.items()
                if node.rcount > 0]

    def get_constructions(self):
        """Return a list of the present constructions and their counts."""
        return sorted((c, node.count) for c, node in self._analyses.items()
                      if not node.splitloc)

    def get_cost(self):
        """Return current model encoding cost."""
        cost = (self._corpus_coding.get_cost() +
                self._lexicon_coding.get_cost())
        if self._supervised:
            return cost + self._annot_coding.get_cost()
        else:
            return cost

    def get_segmentations(self):
        """Retrieve segmentations for all compounds encoded by the model."""
        self._check_segment_only()
        for w in sorted(self._analyses.keys()):
            c = self._analyses[w].rcount
            if c > 0:
                yield c, w, self.segment(w)

    def load_data(self, data, freqthreshold=1, count_modifier=None,
                  init_rand_split=None):
        """Load data to initialize the model for batch training.

        Arguments:
            data: iterator of (count, compound_atoms) tuples
            freqthreshold: discard compounds that occur fewer than the
                             given number of times in the corpus
                             (default 1)
            count_modifier: function for adjusting the counts of each
                              compound
            init_rand_split: if given, the probability of splitting each
                               compound at each position when initializing
                               the model

        Adds the compounds in the corpus to the model lexicon. Returns
        the total cost.

        """
        self._check_segment_only()
        totalcount = collections.Counter()
        for count, atoms in data:
            if len(atoms) > 0:
                totalcount[atoms] += count

        for atoms, count in totalcount.items():
            if count < freqthreshold:
                continue
            if count_modifier is not None:
                self._add_compound(atoms, count_modifier(count))
            else:
                self._add_compound(atoms, count)

            if init_rand_split is not None and init_rand_split > 0:
                parts = self._random_split(atoms, init_rand_split)
                self._set_compound_analysis(atoms, parts)

        return self.get_cost()
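
    # A minimal data-format sketch (illustrative): atoms may be plain
    # strings (characters as atoms) or tuples of strings (words as atoms):
    #     model.load_data([(10, 'kahvi'), (6, 'kahvikone'), (4, 'kone')])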

    def load_segmentations(self, segmentations):
        """Load model from existing segmentations.

        The argument should be an iterator providing a count, a
        compound, and its segmentation.

        """
        self._check_segment_only()
        for count, compound, segmentation in segmentations:
            self._add_compound(compound, count)
            self._set_compound_analysis(compound, segmentation)

    def set_annotations(self, annotations, annotatedcorpusweight=None):
        """Prepare model for semi-supervised learning with given
        annotations.

        """
        self._check_segment_only()
        self._supervised = True
        self.annotations = annotations
        self._annot_coding = AnnotatedCorpusEncoding(
            self._corpus_coding, weight=annotatedcorpusweight)
        self._annot_coding.boundaries = len(self.annotations)

    def segment(self, compound):
        """Segment the compound by looking it up in the model analyses.

        Raises KeyError if compound is not present in the training
        data. For segmenting new words, use viterbi_segment(compound).

        """
        self._check_segment_only()
        rcount, count, splitloc = self._analyses[compound]
        constructions = []
        if splitloc:
            for child in self._splitloc_to_segmentation(compound,
                                                        splitloc):
                constructions += self.segment(child)
        else:
            constructions.append(compound)
        return constructions

    def train_batch(self, algorithm='recursive', algorithm_params=(),
                    finish_threshold=0.005, max_epochs=None):
        """Train the model in batch fashion.

        The model is trained with the data already loaded into the model (by
        using an existing model or calling one of the load_ methods).

        In each iteration (epoch) all compounds in the training data are
        optimized once, in a random order. If applicable, corpus weight,
        annotation cost, and random split counters are recalculated after
        each iteration.

        Arguments:
            algorithm: string in ('recursive', 'viterbi') that indicates
                         the splitting algorithm used.
            algorithm_params: parameters passed to the splitting algorithm.
            finish_threshold: the stopping threshold. Training stops when
                                the improvement of the last iteration is
                                smaller than finish_threshold * #boundaries
            max_epochs: maximum number of epochs to train

        """
        epochs = 0
        forced_epochs = max(1, self._epoch_update(epochs))
        newcost = self.get_cost()
        compounds = list(self.get_compounds())
        _logger.info("Compounds in training data: %s types / %s tokens",
                     len(compounds), self._corpus_coding.boundaries)

        _logger.info("Starting batch training")
        _logger.info("Epochs: %s\tCost: %s", epochs, newcost)

        while True:
            # One epoch
            random.shuffle(compounds)

            for w in _progress(compounds):
                if algorithm == 'recursive':
                    segments = self._recursive_optimize(w, *algorithm_params)
                elif algorithm == 'viterbi':
                    segments = self._viterbi_optimize(w, *algorithm_params)
                else:
                    raise MorfessorException("unknown algorithm '%s'" %
                                             algorithm)
                _logger.debug("#%s -> %s", w, _constructions_to_str(segments))
            epochs += 1

            _logger.debug("Cost before epoch update: %s", self.get_cost())
            forced_epochs = max(forced_epochs, self._epoch_update(epochs))
            oldcost = newcost
            newcost = self.get_cost()

            _logger.info("Epochs: %s\tCost: %s", epochs, newcost)
            if (forced_epochs == 0 and
                    newcost >= oldcost - finish_threshold *
                    self._corpus_coding.boundaries):
                break
            if forced_epochs > 0:
                forced_epochs -= 1
            if max_epochs is not None and epochs >= max_epochs:
                _logger.info("Max number of epochs reached, stop training")
                break
        _logger.info("Done.")
        return epochs, newcost
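
    # A hedged end-to-end sketch (illustrative; 'corpus' is assumed to be
    # an iterator of (count, atoms) tuples):
    #     model = BaselineModel()
    #     model.load_data(corpus)
    #     model.train_batch(algorithm='recursive')
    #     model.segment('kahvikone')             # compound seen in training
    #     model.viterbi_segment('kahvinkeitin')  # works for unseen compounds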

    def train_online(self, data, count_modifier=None, epoch_interval=10000,
                     algorithm='recursive', algorithm_params=(),
                     init_rand_split=None, max_epochs=None):
        """Train the model in online fashion.

        The model is trained with the data provided in the data argument.
        For example, the data could come from a generator reading from
        standard input, allowing live monitoring of the splitting.

        All compounds from data are only optimized once. After online
        training, batch training could be used for further optimization.

        Epochs are defined as a fixed number of compounds. After each epoch
        (as in batch training), the annotation cost and random split
        counters are recalculated if applicable.

        Arguments:
            data: iterator of (_, compound_atoms) tuples. The first
                    argument is ignored, as every occurrence of the
                    compound is taken with count 1
            count_modifier: function for adjusting the counts of each
                              compound
            epoch_interval: number of compounds to process before starting
                              a new epoch
            algorithm: string in ('recursive', 'viterbi') that indicates
                         the splitting algorithm used.
            algorithm_params: parameters passed to the splitting algorithm.
            init_rand_split: probability of splitting a compound at each
                               position when initializing the model. None
                               or 0 means no random splitting.
            max_epochs: maximum number of epochs to train

        """
        self._check_segment_only()
        if count_modifier is not None:
            counts = {}

        _logger.info("Starting online training")

        epochs = 0
        i = 0
        more_tokens = True
        while more_tokens:
            self._epoch_update(epochs)
            newcost = self.get_cost()
            _logger.info("Tokens processed: %s\tCost: %s", i, newcost)

            for _ in _progress(range(epoch_interval)):
                try:
                    _, w = next(data)
                except StopIteration:
                    more_tokens = False
                    break

                if len(w) == 0:
                    # Newline in corpus
                    continue

                if count_modifier is not None:
                    if w not in counts:
                        c = 0
                        counts[w] = 1
                        addc = 1
                    else:
                        c = counts[w]
                        counts[w] = c + 1
                        addc = count_modifier(c + 1) - count_modifier(c)
                    if addc > 0:
                        self._add_compound(w, addc)
                else:
                    self._add_compound(w, 1)
                if init_rand_split is not None and init_rand_split > 0:
                    parts = self._random_split(w, init_rand_split)
                    self._set_compound_analysis(w, parts)
                if algorithm == 'recursive':
                    segments = self._recursive_optimize(w, *algorithm_params)
                elif algorithm == 'viterbi':
                    segments = self._viterbi_optimize(w, *algorithm_params)
                else:
                    raise MorfessorException("unknown algorithm '%s'" %
                                             algorithm)
                _logger.debug("#%s: %s -> %s", i, w,
                              _constructions_to_str(segments))
                i += 1

            epochs += 1
            if max_epochs is not None and epochs >= max_epochs:
                _logger.info("Max number of epochs reached, stop training")
                break

        self._epoch_update(epochs)
        newcost = self.get_cost()
        _logger.info("Tokens processed: %s\tCost: %s", i, newcost)
        return epochs, newcost

    def viterbi_segment(self, compound, addcount=1.0, maxlen=30):
        """Find optimal segmentation using the Viterbi algorithm.

        Arguments:
          compound: compound to be segmented
          addcount: constant for additive smoothing (0 = no smoothing)
          maxlen: maximum length for the constructions

        If additive smoothing is applied, new complex construction types can
        be selected during the search. Without smoothing, only new
        single-atom constructions can be selected.

        Returns the most probable segmentation and its log-probability.

        """
        clen = len(compound)
        grid = [(0.0, None)]
        if self._corpus_coding.tokens + self._corpus_coding.boundaries + \
                addcount > 0:
            logtokens = math.log(self._corpus_coding.tokens +
                                 self._corpus_coding.boundaries + addcount)
        else:
            logtokens = 0
        badlikelihood = clen * logtokens + 1.0
        # Viterbi main loop
        for t in range(1, clen + 1):
            # Select the best path to current node.
            # Note that we can come from any node in history.
            bestpath = None
            bestcost = None
            if self.nosplit_re and t < clen and \
                    self.nosplit_re.match(compound[(t-1):(t+1)]):
                grid.append((clen*badlikelihood, t-1))
                continue
            for pt in range(max(0, t - maxlen), t):
                if grid[pt][0] is None:
                    continue
                cost = grid[pt][0]
                construction = compound[pt:t]
                if (construction in self._analyses and
                        not self._analyses[construction].splitloc):
                    if self._analyses[construction].count <= 0:
                        raise MorfessorException(
                            "Construction count of '%s' is %s" %
                            (construction,
                             self._analyses[construction].count))
                    cost += (logtokens -
                             math.log(self._analyses[construction].count +
                                      addcount))
                elif addcount > 0:
                    if self._corpus_coding.tokens == 0:
                        cost += (addcount * math.log(addcount) +
                                 self._lexicon_coding.get_codelength(
                                     construction)
                                 / self._corpus_coding.weight)
                    else:
                        cost += (logtokens - math.log(addcount) +
                                 (((self._lexicon_coding.boundaries +
                                    addcount) *
                                   math.log(self._lexicon_coding.boundaries
                                            + addcount))
                                  - (self._lexicon_coding.boundaries
                                     * math.log(
                                         self._lexicon_coding.boundaries))
                                  + self._lexicon_coding.get_codelength(
                                      construction))
                                 / self._corpus_coding.weight)
                elif len(construction) == 1:
                    cost += badlikelihood
                elif self.nosplit_re:
                    # Some splits are forbidden, so longer unknown
                    # constructions have to be allowed
                    cost += len(construction) * badlikelihood
                else:
                    continue
                if bestcost is None or cost < bestcost:
                    bestcost = cost
                    bestpath = pt
            grid.append((bestcost, bestpath))
        constructions = []
        cost, path = grid[-1]
        lt = clen + 1
        while path is not None:
            t = path
            constructions.append(compound[t:lt])
            path = grid[t][1]
            lt = t
        constructions.reverse()
        # Add boundary cost
        cost += (math.log(self._corpus_coding.tokens +
                          self._corpus_coding.boundaries) -
                 math.log(self._corpus_coding.boundaries))
        return constructions, cost
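
    # For example (illustrative), after training:
    #     segments, cost = model.viterbi_segment('kahvinkeitin')
    # where cost is the negative log-probability of the returned analysis.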

    def forward_logprob(self, compound):
        """Find log-probability of a compound using the forward algorithm.

        Arguments:
          compound: compound to process

        Returns the (negative) log-probability of the compound. If the
        probability is zero, returns a number that is larger than the
        value defined by the penalty attribute of the model object.

        """
        clen = len(compound)
        grid = [0.0]
        if self._corpus_coding.tokens + self._corpus_coding.boundaries > 0:
            logtokens = math.log(self._corpus_coding.tokens +
                                 self._corpus_coding.boundaries)
        else:
            logtokens = 0
        # Forward main loop
        for t in range(1, clen + 1):
            # Sum probabilities from all paths to the current node.
            # Note that we can come from any node in history.
            psum = 0.0
            for pt in range(0, t):
                cost = grid[pt]
                construction = compound[pt:t]
                if (construction in self._analyses and
                        not self._analyses[construction].splitloc):
                    if self._analyses[construction].count <= 0:
                        raise MorfessorException(
                            "Construction count of '%s' is %s" %
                            (construction,
                             self._analyses[construction].count))
                    cost += (logtokens -
                             math.log(self._analyses[construction].count))
                else:
                    continue
                psum += math.exp(-cost)
            if psum > 0:
                grid.append(-math.log(psum))
            else:
                grid.append(-self.penalty)
        cost = grid[-1]
        # Add boundary cost
        cost += (math.log(self._corpus_coding.tokens +
                          self._corpus_coding.boundaries) -
                 math.log(self._corpus_coding.boundaries))
        return cost

    def viterbi_nbest(self, compound, n, addcount=1.0, maxlen=30):
        """Find top-n optimal segmentations using the Viterbi algorithm.

        Arguments:
          compound: compound to be segmented
          n: how many segmentations to return
          addcount: constant for additive smoothing (0 = no smoothing)
          maxlen: maximum length for the constructions

        If additive smoothing is applied, new complex construction types can
        be selected during the search. Without smoothing, only new
        single-atom constructions can be selected.

        Returns the n most probable segmentations and their
        log-probabilities.

        """
        clen = len(compound)
        grid = [[(0.0, None, None)]]
        if self._corpus_coding.tokens + self._corpus_coding.boundaries + \
                addcount > 0:
            logtokens = math.log(self._corpus_coding.tokens +
                                 self._corpus_coding.boundaries + addcount)
        else:
            logtokens = 0
        badlikelihood = clen * logtokens + 1.0
        # Viterbi main loop
        for t in range(1, clen + 1):
            # Select the best path to current node.
            # Note that we can come from any node in history.
            bestn = []
            if self.nosplit_re and t < clen and \
                    self.nosplit_re.match(compound[(t-1):(t+1)]):
                grid.append([(-clen*badlikelihood, t-1, -1)])
                continue
            for pt in range(max(0, t - maxlen), t):
                for k in range(len(grid[pt])):
                    if grid[pt][k][0] is None:
                        continue
                    cost = grid[pt][k][0]
                    construction = compound[pt:t]
                    if (construction in self._analyses and
                            not self._analyses[construction].splitloc):
                        if self._analyses[construction].count <= 0:
                            raise MorfessorException(
                                "Construction count of '%s' is %s" %
                                (construction,
                                 self._analyses[construction].count))
                        cost -= (logtokens -
                                 math.log(self._analyses[construction].count +
                                          addcount))
                    elif addcount > 0:
                        if self._corpus_coding.tokens == 0:
                            cost -= (addcount * math.log(addcount) +
                                     self._lexicon_coding.get_codelength(
                                         construction)
                                     / self._corpus_coding.weight)
                        else:
                            cost -= (logtokens - math.log(addcount) +
                                     (((self._lexicon_coding.boundaries +
                                        addcount) *
                                       math.log(self._lexicon_coding.boundaries
                                                + addcount))
                                      - (self._lexicon_coding.boundaries
                                         * math.log(
                                             self._lexicon_coding.boundaries))
                                      + self._lexicon_coding.get_codelength(
                                          construction))
                                     / self._corpus_coding.weight)
                    elif len(construction) == 1:
                        cost -= badlikelihood
                    elif self.nosplit_re:
                        # Some splits are forbidden, so longer unknown
                        # constructions have to be allowed
                        cost -= len(construction) * badlikelihood
                    else:
                        continue
                    if len(bestn) < n:
                        heapq.heappush(bestn, (cost, pt, k))
                    else:
                        heapq.heappushpop(bestn, (cost, pt, k))
            grid.append(bestn)
        results = []
        for k in range(len(grid[-1])):
            constructions = []
            cost, path, ki = grid[-1][k]
            lt = clen + 1
            while path is not None:
                t = path
                constructions.append(compound[t:lt])
                path = grid[t][ki][1]
                ki = grid[t][ki][2]
                lt = t
            constructions.reverse()
            # Add boundary cost
            cost -= (math.log(self._corpus_coding.tokens +
                              self._corpus_coding.boundaries) -
                     math.log(self._corpus_coding.boundaries))
            results.append((-cost, constructions))
        return [(constr, cost) for cost, constr in sorted(results)]

    def get_corpus_coding_weight(self):
        return self._corpus_coding.weight

    def set_corpus_coding_weight(self, weight):
        self._check_segment_only()
        self._corpus_coding.weight = weight

    def make_segment_only(self):
        """Reduce the size of this model by removing all non-morphs from the
        analyses. After calling this method it is no longer possible to call
        any method that would change the state of the model; attempting to
        do so raises an exception.

        """
        self._segment_only = True
        self._analyses = {k: v for (k, v) in self._analyses.items()
                          if not v.splitloc}

    def clear_segmentation(self):
        for compound in list(self.get_compounds()):
            self._set_compound_analysis(compound, [compound])


class CorpusWeight(object):
    @classmethod
    def move_direction(cls, model, direction, epoch):
        if direction != 0:
            weight = model.get_corpus_coding_weight()
            if direction > 0:
                weight *= 1 + 2.0 / epoch
            else:
                weight *= 1.0 / (1 + 2.0 / epoch)
            model.set_corpus_coding_weight(weight)
            _logger.info("Corpus weight set to %s", weight)
            return True
        return False


class FixedCorpusWeight(CorpusWeight):
    def __init__(self, weight):
        self.weight = weight

    def update(self, model, _):
        model.set_corpus_coding_weight(self.weight)
        return False


class AnnotationCorpusWeight(CorpusWeight):
    """Class for using development annotations to update the corpus weight
    during batch training.

    """

    def __init__(self, devel_set, threshold=0.01):
        self.data = devel_set
        self.threshold = threshold

    def update(self, model, epoch):
        """Tune model corpus weight based on the precision and
        recall of the development data, trying to keep them equal"""
        if epoch < 1:
            return False
        tmp = self.data.items()
        wlist, annotations = zip(*tmp)
        segments = [model.viterbi_segment(w)[0] for w in wlist]
        d = self._estimate_segmentation_dir(segments, annotations)

        return self.move_direction(model, d, epoch)

    @classmethod
    def _boundary_recall(cls, prediction, reference):
        """Calculate average boundary recall for given segmentations."""
        rec_total = 0
        rec_sum = 0.0
        for pre_list, ref_list in zip(prediction, reference):
            best = -1
            for ref in ref_list:
                # list of internal boundary positions
                ref_b = set(BaselineModel.segmentation_to_splitloc(ref))
                if len(ref_b) == 0:
                    best = 1.0
                    break
                for pre in pre_list:
                    pre_b = set(BaselineModel.segmentation_to_splitloc(pre))
                    r = len(ref_b.intersection(pre_b)) / float(len(ref_b))
                    if r > best:
                        best = r
            if best >= 0:
                rec_sum += best
                rec_total += 1
        return rec_sum, rec_total

    @classmethod
    def _bpr_evaluation(cls, prediction, reference):
        """Return boundary precision, recall, and F-score for segmentations."""
        rec_s, rec_t = cls._boundary_recall(prediction, reference)
        pre_s, pre_t = cls._boundary_recall(reference, prediction)
        rec = rec_s / rec_t
        pre = pre_s / pre_t
        f = 2.0 * pre * rec / (pre + rec)
        return pre, rec, f

    def _estimate_segmentation_dir(self, segments, annotations):
        """Estimate if the given compounds are under- or oversegmented.

        The decision is based on the difference between boundary precision
        and recall values for the given sample of segmented data.

        Arguments:
          segments: list of predicted segmentations
          annotations: list of reference segmentations

        Return 1 in the case of oversegmentation, -1 in the case of
        undersegmentation, and 0 if no changes are required.

        """
        pre, rec, f = self._bpr_evaluation([[x] for x in segments],
                                           annotations)
        _logger.info("Boundary evaluation: precision %.4f; recall %.4f",
                     pre, rec)
        if abs(pre - rec) < self.threshold:
            return 0
        elif rec > pre:
            return 1
        else:
            return -1


class MorphLengthCorpusWeight(CorpusWeight):
    def __init__(self, morph_length, threshold=0.01):
        self.morph_length = morph_length
        self.threshold = threshold

    def update(self, model, epoch):
        if epoch < 1:
            return False
        cur_length = self.calc_morph_length(model)

        _logger.info("Current morph-length: %s", cur_length)

        if (abs(self.morph_length - cur_length) / self.morph_length >
                self.threshold):
            d = abs(self.morph_length - cur_length) / (self.morph_length
                                                       - cur_length)
            return self.move_direction(model, d, epoch)
        return False

    @classmethod
    def calc_morph_length(cls, model):
        total_constructions = 0
        total_atoms = 0
        for compound in model.get_compounds():
            constructions = model.segment(compound)
            for construction in constructions:
                total_constructions += 1
                total_atoms += len(construction)
        if total_constructions > 0:
            return float(total_atoms) / total_constructions
        else:
            return 0.0


class NumMorphCorpusWeight(CorpusWeight):
    def __init__(self, num_morph_types, threshold=0.01):
        self.num_morph_types = num_morph_types
        self.threshold = threshold

    def update(self, model, epoch):
        if epoch < 1:
            return False
        cur_morph_types = model._lexicon_coding.boundaries

        _logger.info("Number of morph types: %s", cur_morph_types)

        if (abs(self.num_morph_types - cur_morph_types) / self.num_morph_types
                > self.threshold):
            d = (abs(self.num_morph_types - cur_morph_types) /
                 (self.num_morph_types - cur_morph_types))
            return self.move_direction(model, d, epoch)
        return False


class Encoding(object):
    """Base class for calculating the entropy (encoding length) of a corpus
    or lexicon.

    Commonly subclassed to redefine specific methods.

    """
    def __init__(self, weight=1.0):
        """Initialize the encoding.

        Arguments:
            weight: weight used for this encoding

        """
        self.logtokensum = 0.0
        self.tokens = 0
        self.boundaries = 0
        self.weight = weight

    # constant used for speeding up logfactorial calculations with Stirling's
    # approximation
    _log2pi = math.log(2 * math.pi)

    @property
    def types(self):
        """Define number of types as 0. types is made a property method to
        ensure easy redefinition in subclasses

        """
        return 0

    @classmethod
    def _logfactorial(cls, n):
        """Calculate logarithm of n!.

        For large n (n >= 20), use Stirling's approximation.

        """
        if n < 2:
            return 0.0
        if n < 20:
            return math.log(math.factorial(n))
        logn = math.log(n)
        return n * logn - n + 0.5 * (logn + cls._log2pi)
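
    # Sanity check (illustrative): for n = 20, math.log(math.factorial(20))
    # gives about 42.336, while the Stirling branch above gives about
    # 42.331, an error already well below 0.1 %.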

    def frequency_distribution_cost(self):
        """Calculate -log[(u - 1)! (v - u)! / (v - 1)!]

        v is the number of tokens+boundaries and u the number of types

        """
        if self.types < 2:
            return 0.0
        tokens = self.tokens + self.boundaries
        return (self._logfactorial(tokens - 1) -
                self._logfactorial(self.types - 1) -
                self._logfactorial(tokens - self.types))

    def permutations_cost(self):
        """The permutations cost for the encoding."""
        return -self._logfactorial(self.boundaries)

    def update_count(self, construction, old_count, new_count):
        """Update the counts in the encoding."""
        self.tokens += new_count - old_count
        if old_count > 1:
            self.logtokensum -= old_count * math.log(old_count)
        if new_count > 1:
            self.logtokensum += new_count * math.log(new_count)

    def get_cost(self):
        """Calculate the cost for encoding the corpus/lexicon"""
        if self.boundaries == 0:
            return 0.0

        n = self.tokens + self.boundaries
        return ((n * math.log(n)
                 - self.boundaries * math.log(self.boundaries)
                 - self.logtokensum
                 + self.permutations_cost()) * self.weight
                + self.frequency_distribution_cost())


class CorpusEncoding(Encoding):
    """Encoding of the corpus.

    The basic difference to a normal encoding is that the number of types
    is not stored directly but fetched from the lexicon encoding. Also,
    the cost function does not contain any permutation cost.

    """
    def __init__(self, lexicon_encoding, weight=1.0):
        super(CorpusEncoding, self).__init__(weight)
        self.lexicon_encoding = lexicon_encoding

    @property
    def types(self):
        """Return the number of types of the corpus, which is the same as
        the number of boundaries in the lexicon + 1

        """
        return self.lexicon_encoding.boundaries + 1

    def frequency_distribution_cost(self):
        """Calculate -log[(M - 1)! (N - M)! / (N - 1)!] for M types and N
        tokens.

        """
        if self.types < 2:
            return 0.0
        tokens = self.tokens
        return (self._logfactorial(tokens - 1) -
                self._logfactorial(self.types - 2) -
                self._logfactorial(tokens - self.types + 1))

    def get_cost(self):
        """Override for the Encoding get_cost function. A corpus does not
        have a permutation cost

        """
        if self.boundaries == 0:
            return 0.0

        n = self.tokens + self.boundaries
        return ((n * math.log(n)
                 - self.boundaries * math.log(self.boundaries)
                 - self.logtokensum) * self.weight
                + self.frequency_distribution_cost())


class AnnotatedCorpusEncoding(Encoding):
    """Encoding the cost of an annotated corpus.

    In this encoding, constructions that are missing are penalized.

    """
    def __init__(self, corpus_coding, weight=None, penalty=-9999.9):
        """Initialize the encoding with the appropriate metadata.

        Arguments:
            corpus_coding: CorpusEncoding instance used for retrieving the
                             number of tokens and boundaries in the corpus
            weight: The weight of this encoding. If the weight is None,
                      it is updated automatically to be in balance with the
                      corpus
            penalty: log penalty used for missing constructions

        """
        super(AnnotatedCorpusEncoding, self).__init__()
        self.do_update_weight = True
        self.weight = 1.0
        if weight is not None:
            self.do_update_weight = False
            self.weight = weight
        self.corpus_coding = corpus_coding
        self.penalty = penalty
        self.constructions = collections.Counter()

    def set_constructions(self, constructions):
        """Re-initialize the constructions. The counts of the constructions
        must still be set with calls to set_count.

        """
        self.constructions = constructions
        self.tokens = sum(constructions.values())
        self.logtokensum = 0.0

    def set_count(self, construction, count):
        """Set an initial count for each construction. Missing constructions
        are penalized.

        """
        annot_count = self.constructions[construction]
        if count > 0:
            self.logtokensum += annot_count * math.log(count)
        else:
            self.logtokensum += annot_count * self.penalty

    def update_count(self, construction, old_count, new_count):
        """Update the counts in the encoding, setting (or removing) a
        penalty for missing constructions.

        """
        if construction in self.constructions:
            annot_count = self.constructions[construction]
            if old_count > 0:
                self.logtokensum -= annot_count * math.log(old_count)
            else:
                self.logtokensum -= annot_count * self.penalty
            if new_count > 0:
                self.logtokensum += annot_count * math.log(new_count)
            else:
                self.logtokensum += annot_count * self.penalty

    def update_weight(self):
        """Update the weight of the encoding by taking the ratio of the
        corpus boundaries and annotated boundaries.

        """
        if not self.do_update_weight:
            return
        old = self.weight
        self.weight = (self.corpus_coding.weight *
                       float(self.corpus_coding.boundaries) / self.boundaries)
        if self.weight != old:
            _logger.info("Corpus weight of annotated data set to %s",
                         self.weight)

    def get_cost(self):
        """Return the cost of the annotated corpus."""
        if self.boundaries == 0:
            return 0.0
        n = self.tokens + self.boundaries
        return ((n * math.log(self.corpus_coding.tokens +
                              self.corpus_coding.boundaries)
                 - self.boundaries * math.log(self.corpus_coding.boundaries)
                 - self.logtokensum) * self.weight)


class LexiconEncoding(Encoding):
    """Class for calculating the encoding cost for the lexicon."""

    def __init__(self):
        """Initialize the lexicon encoding."""
        super(LexiconEncoding, self).__init__()
        self.atoms = collections.Counter()

    @property
    def types(self):
        """Return the number of different atoms in the lexicon + 1 for the
        compound-end-token

        """
        return len(self.atoms) + 1

    def add(self, construction):
        """Add a construction to the lexicon, updating automatically the
        count for its atoms

        """
        self.boundaries += 1
        for atom in construction:
            c = self.atoms[atom]
            self.atoms[atom] = c + 1
            self.update_count(atom, c, c + 1)

    def remove(self, construction):
        """Remove construction from the lexicon, updating automatically the
        count for its atoms

        """
        self.boundaries -= 1
        for atom in construction:
            c = self.atoms[atom]
            self.atoms[atom] = c - 1
            self.update_count(atom, c, c - 1)

    def get_codelength(self, construction):
        """Return an approximate codelength for new construction."""
        length = len(construction) + 1
        cost = length * math.log(self.tokens + length)
        cost -= math.log(self.boundaries + 1)
        for atom in construction:
            if atom in self.atoms:
                c = self.atoms[atom]
            else:
                c = 1
            cost -= math.log(c)
        return cost