1# -*- coding: utf-8 -*-
2# Natural Language Toolkit: IBM Model 4
3#
4# Copyright (C) 2001-2019 NLTK Project
5# Author: Tah Wei Hoon <hoon.tw@gmail.com>
6# URL: <http://nltk.org/>
7# For license information, see LICENSE.TXT
8
9"""
10Translation model that reorders output words based on their type and
11distance from other related words in the output sentence.
12
13IBM Model 4 improves the distortion model of Model 3, motivated by the
14observation that certain words tend to be re-ordered in a predictable
15way relative to one another. For example, <adjective><noun> in English
16usually has its order flipped as <noun><adjective> in French.
17
18Model 4 requires words in the source and target vocabularies to be
19categorized into classes. This can be linguistically driven, like parts
20of speech (adjective, nouns, prepositions, etc). Word classes can also
21be obtained by statistical methods. The original IBM Model 4 uses an
22information theoretic approach to group words into 50 classes for each
23vocabulary.
24
25Terminology:
26Cept:
27    A source word with non-zero fertility i.e. aligned to one or more
28    target words.
29Tablet:
30    The set of target word(s) aligned to a cept.
31Head of cept:
32    The first word of the tablet of that cept.
33Center of cept:
34    The average position of the words in that cept's tablet. If the
35    value is not an integer, the ceiling is taken.
36    For example, for a tablet with words in positions 2, 5, 6 in the
37    target sentence, the center of the corresponding cept is
38    ceil((2 + 5 + 6) / 3) = 5
39Displacement:
40    For a head word, defined as (position of head word - position of
41    previous cept's center). Can be positive or negative.
42    For a non-head word, defined as (position of non-head word -
43    position of previous word in the same tablet). Always positive,
44    because successive words in a tablet are assumed to appear to the
45    right of the previous word.
46
47In contrast to Model 3 which reorders words in a tablet independently of
48other words, Model 4 distinguishes between three cases.
49(1) Words generated by NULL are distributed uniformly.
50(2) For a head word t, its position is modeled by the probability
51    d_head(displacement | word_class_s(s),word_class_t(t)),
52    where s is the previous cept, and word_class_s and word_class_t maps
53    s and t to a source and target language word class respectively.
54(3) For a non-head word t, its position is modeled by the probability
55    d_non_head(displacement | word_class_t(t))
56
57The EM algorithm used in Model 4 is:
58E step - In the training data, collect counts, weighted by prior
59         probabilities.
60         (a) count how many times a source language word is translated
61             into a target language word
62         (b) for a particular word class, count how many times a head
63             word is located at a particular displacement from the
64             previous cept's center
65         (c) for a particular word class, count how many times a
66             non-head word is located at a particular displacement from
67             the previous target word
68         (d) count how many times a source word is aligned to phi number
69             of target words
70         (e) count how many times NULL is aligned to a target word
71
72M step - Estimate new probabilities based on the counts from the E step
73
74Like Model 3, there are too many possible alignments to consider. Thus,
75a hill climbing approach is used to sample good candidates.
76
77
78Notations:
79i: Position in the source sentence
80    Valid values are 0 (for NULL), 1, 2, ..., length of source sentence
81j: Position in the target sentence
82    Valid values are 1, 2, ..., length of target sentence
83l: Number of words in the source sentence, excluding NULL
84m: Number of words in the target sentence
85s: A word in the source language
86t: A word in the target language
87phi: Fertility, the number of target words produced by a source word
88p1: Probability that a target word produced by a source word is
89    accompanied by another target word that is aligned to NULL
90p0: 1 - p1
91dj: Displacement, Δj
92
93
94References:
95Philipp Koehn. 2010. Statistical Machine Translation.
96Cambridge University Press, New York.
97
98Peter E Brown, Stephen A. Della Pietra, Vincent J. Della Pietra, and
99Robert L. Mercer. 1993. The Mathematics of Statistical Machine
100Translation: Parameter Estimation. Computational Linguistics, 19 (2),
101263-311.
102"""
103
104from __future__ import division
105
106import warnings
107from collections import defaultdict
108from math import factorial
109
110from nltk.translate import AlignedSent
111from nltk.translate import Alignment
112from nltk.translate import IBMModel
113from nltk.translate import IBMModel3
114from nltk.translate.ibm_model import Counts
115from nltk.translate.ibm_model import longest_target_sentence_length
116
117
118class IBMModel4(IBMModel):
119    """
120    Translation model that reorders output words based on their type and
121    their distance from other related words in the output sentence
122
123    >>> bitext = []
124    >>> bitext.append(AlignedSent(['klein', 'ist', 'das', 'haus'], ['the', 'house', 'is', 'small']))
125    >>> bitext.append(AlignedSent(['das', 'haus', 'war', 'ja', 'groß'], ['the', 'house', 'was', 'big']))
126    >>> bitext.append(AlignedSent(['das', 'buch', 'ist', 'ja', 'klein'], ['the', 'book', 'is', 'small']))
127    >>> bitext.append(AlignedSent(['ein', 'haus', 'ist', 'klein'], ['a', 'house', 'is', 'small']))
128    >>> bitext.append(AlignedSent(['das', 'haus'], ['the', 'house']))
129    >>> bitext.append(AlignedSent(['das', 'buch'], ['the', 'book']))
130    >>> bitext.append(AlignedSent(['ein', 'buch'], ['a', 'book']))
131    >>> bitext.append(AlignedSent(['ich', 'fasse', 'das', 'buch', 'zusammen'], ['i', 'summarize', 'the', 'book']))
132    >>> bitext.append(AlignedSent(['fasse', 'zusammen'], ['summarize']))
133    >>> src_classes = {'the': 0, 'a': 0, 'small': 1, 'big': 1, 'house': 2, 'book': 2, 'is': 3, 'was': 3, 'i': 4, 'summarize': 5 }
134    >>> trg_classes = {'das': 0, 'ein': 0, 'haus': 1, 'buch': 1, 'klein': 2, 'groß': 2, 'ist': 3, 'war': 3, 'ja': 4, 'ich': 5, 'fasse': 6, 'zusammen': 6 }
135
136    >>> ibm4 = IBMModel4(bitext, 5, src_classes, trg_classes)
137
138    >>> print(round(ibm4.translation_table['buch']['book'], 3))
139    1.0
140    >>> print(round(ibm4.translation_table['das']['book'], 3))
141    0.0
142    >>> print(round(ibm4.translation_table['ja'][None], 3))
143    1.0
144
145    >>> print(round(ibm4.head_distortion_table[1][0][1], 3))
146    1.0
147    >>> print(round(ibm4.head_distortion_table[2][0][1], 3))
148    0.0
149    >>> print(round(ibm4.non_head_distortion_table[3][6], 3))
150    0.5
151
152    >>> print(round(ibm4.fertility_table[2]['summarize'], 3))
153    1.0
154    >>> print(round(ibm4.fertility_table[1]['book'], 3))
155    1.0
156
157    >>> print(ibm4.p1)
158    0.033...
159
160    >>> test_sentence = bitext[2]
161    >>> test_sentence.words
162    ['das', 'buch', 'ist', 'ja', 'klein']
163    >>> test_sentence.mots
164    ['the', 'book', 'is', 'small']
165    >>> test_sentence.alignment
166    Alignment([(0, 0), (1, 1), (2, 2), (3, None), (4, 3)])
167
168    """
169
170    def __init__(
171        self,
172        sentence_aligned_corpus,
173        iterations,
174        source_word_classes,
175        target_word_classes,
176        probability_tables=None,
177    ):
178        """
179        Train on ``sentence_aligned_corpus`` and create a lexical
180        translation model, distortion models, a fertility model, and a
181        model for generating NULL-aligned words.
182
183        Translation direction is from ``AlignedSent.mots`` to
184        ``AlignedSent.words``.
185
186        :param sentence_aligned_corpus: Sentence-aligned parallel corpus
187        :type sentence_aligned_corpus: list(AlignedSent)
188
189        :param iterations: Number of iterations to run training algorithm
190        :type iterations: int
191
192        :param source_word_classes: Lookup table that maps a source word
193            to its word class, the latter represented by an integer id
194        :type source_word_classes: dict[str]: int
195
196        :param target_word_classes: Lookup table that maps a target word
197            to its word class, the latter represented by an integer id
198        :type target_word_classes: dict[str]: int
199
200        :param probability_tables: Optional. Use this to pass in custom
201            probability values. If not specified, probabilities will be
202            set to a uniform distribution, or some other sensible value.
203            If specified, all the following entries must be present:
204            ``translation_table``, ``alignment_table``,
205            ``fertility_table``, ``p1``, ``head_distortion_table``,
206            ``non_head_distortion_table``. See ``IBMModel`` and
207            ``IBMModel4`` for the type and purpose of these tables.
208        :type probability_tables: dict[str]: object
209        """
210        super(IBMModel4, self).__init__(sentence_aligned_corpus)
211        self.reset_probabilities()
212        self.src_classes = source_word_classes
213        self.trg_classes = target_word_classes
214
215        if probability_tables is None:
216            # Get probabilities from IBM model 3
217            ibm3 = IBMModel3(sentence_aligned_corpus, iterations)
218            self.translation_table = ibm3.translation_table
219            self.alignment_table = ibm3.alignment_table
220            self.fertility_table = ibm3.fertility_table
221            self.p1 = ibm3.p1
222            self.set_uniform_probabilities(sentence_aligned_corpus)
223        else:
224            # Set user-defined probabilities
225            self.translation_table = probability_tables['translation_table']
226            self.alignment_table = probability_tables['alignment_table']
227            self.fertility_table = probability_tables['fertility_table']
228            self.p1 = probability_tables['p1']
229            self.head_distortion_table = probability_tables['head_distortion_table']
230            self.non_head_distortion_table = probability_tables[
231                'non_head_distortion_table'
232            ]
233
234        for n in range(0, iterations):
235            self.train(sentence_aligned_corpus)
236
237    def reset_probabilities(self):
238        super(IBMModel4, self).reset_probabilities()
239        self.head_distortion_table = defaultdict(
240            lambda: defaultdict(lambda: defaultdict(lambda: self.MIN_PROB))
241        )
242        """
243        dict[int][int][int]: float. Probability(displacement of head
244        word | word class of previous cept,target word class).
245        Values accessed as ``distortion_table[dj][src_class][trg_class]``.
246        """
247
248        self.non_head_distortion_table = defaultdict(
249            lambda: defaultdict(lambda: self.MIN_PROB)
250        )
251        """
252        dict[int][int]: float. Probability(displacement of non-head
253        word | target word class).
254        Values accessed as ``distortion_table[dj][trg_class]``.
255        """
256
257    def set_uniform_probabilities(self, sentence_aligned_corpus):
258        """
259        Set distortion probabilities uniformly to
260        1 / cardinality of displacement values
261        """
262        max_m = longest_target_sentence_length(sentence_aligned_corpus)
263
264        # The maximum displacement is m-1, when a word is in the last
265        # position m of the target sentence and the previously placed
266        # word is in the first position.
267        # Conversely, the minimum displacement is -(m-1).
268        # Thus, the displacement range is (m-1) - (-(m-1)). Note that
269        # displacement cannot be zero and is not included in the range.
270        if max_m <= 1:
271            initial_prob = IBMModel.MIN_PROB
272        else:
273            initial_prob = 1 / (2 * (max_m - 1))
274        if initial_prob < IBMModel.MIN_PROB:
275            warnings.warn(
276                "A target sentence is too long ("
277                + str(max_m)
278                + " words). Results may be less accurate."
279            )
280
281        for dj in range(1, max_m):
282            self.head_distortion_table[dj] = defaultdict(
283                lambda: defaultdict(lambda: initial_prob)
284            )
285            self.head_distortion_table[-dj] = defaultdict(
286                lambda: defaultdict(lambda: initial_prob)
287            )
288            self.non_head_distortion_table[dj] = defaultdict(lambda: initial_prob)
289            self.non_head_distortion_table[-dj] = defaultdict(lambda: initial_prob)
290
291    def train(self, parallel_corpus):
292        counts = Model4Counts()
293        for aligned_sentence in parallel_corpus:
294            m = len(aligned_sentence.words)
295
296            # Sample the alignment space
297            sampled_alignments, best_alignment = self.sample(aligned_sentence)
298            # Record the most probable alignment
299            aligned_sentence.alignment = Alignment(
300                best_alignment.zero_indexed_alignment()
301            )
302
303            # E step (a): Compute normalization factors to weigh counts
304            total_count = self.prob_of_alignments(sampled_alignments)
305
306            # E step (b): Collect counts
307            for alignment_info in sampled_alignments:
308                count = self.prob_t_a_given_s(alignment_info)
309                normalized_count = count / total_count
310
311                for j in range(1, m + 1):
312                    counts.update_lexical_translation(
313                        normalized_count, alignment_info, j
314                    )
315                    counts.update_distortion(
316                        normalized_count,
317                        alignment_info,
318                        j,
319                        self.src_classes,
320                        self.trg_classes,
321                    )
322
323                counts.update_null_generation(normalized_count, alignment_info)
324                counts.update_fertility(normalized_count, alignment_info)
325
326        # M step: Update probabilities with maximum likelihood estimates
327        # If any probability is less than MIN_PROB, clamp it to MIN_PROB
328        existing_alignment_table = self.alignment_table
329        self.reset_probabilities()
330        self.alignment_table = existing_alignment_table  # don't retrain
331
332        self.maximize_lexical_translation_probabilities(counts)
333        self.maximize_distortion_probabilities(counts)
334        self.maximize_fertility_probabilities(counts)
335        self.maximize_null_generation_probabilities(counts)
336
337    def maximize_distortion_probabilities(self, counts):
338        head_d_table = self.head_distortion_table
339        for dj, src_classes in counts.head_distortion.items():
340            for s_cls, trg_classes in src_classes.items():
341                for t_cls in trg_classes:
342                    estimate = (
343                        counts.head_distortion[dj][s_cls][t_cls]
344                        / counts.head_distortion_for_any_dj[s_cls][t_cls]
345                    )
346                    head_d_table[dj][s_cls][t_cls] = max(estimate, IBMModel.MIN_PROB)
347
348        non_head_d_table = self.non_head_distortion_table
349        for dj, trg_classes in counts.non_head_distortion.items():
350            for t_cls in trg_classes:
351                estimate = (
352                    counts.non_head_distortion[dj][t_cls]
353                    / counts.non_head_distortion_for_any_dj[t_cls]
354                )
355                non_head_d_table[dj][t_cls] = max(estimate, IBMModel.MIN_PROB)
356
357    def prob_t_a_given_s(self, alignment_info):
358        """
359        Probability of target sentence and an alignment given the
360        source sentence
361        """
362        return IBMModel4.model4_prob_t_a_given_s(alignment_info, self)
363
364    @staticmethod  # exposed for Model 5 to use
365    def model4_prob_t_a_given_s(alignment_info, ibm_model):
366        probability = 1.0
367        MIN_PROB = IBMModel.MIN_PROB
368
369        def null_generation_term():
370            # Binomial distribution: B(m - null_fertility, p1)
371            value = 1.0
372            p1 = ibm_model.p1
373            p0 = 1 - p1
374            null_fertility = alignment_info.fertility_of_i(0)
375            m = len(alignment_info.trg_sentence) - 1
376            value *= pow(p1, null_fertility) * pow(p0, m - 2 * null_fertility)
377            if value < MIN_PROB:
378                return MIN_PROB
379
380            # Combination: (m - null_fertility) choose null_fertility
381            for i in range(1, null_fertility + 1):
382                value *= (m - null_fertility - i + 1) / i
383            return value
384
385        def fertility_term():
386            value = 1.0
387            src_sentence = alignment_info.src_sentence
388            for i in range(1, len(src_sentence)):
389                fertility = alignment_info.fertility_of_i(i)
390                value *= (
391                    factorial(fertility)
392                    * ibm_model.fertility_table[fertility][src_sentence[i]]
393                )
394                if value < MIN_PROB:
395                    return MIN_PROB
396            return value
397
398        def lexical_translation_term(j):
399            t = alignment_info.trg_sentence[j]
400            i = alignment_info.alignment[j]
401            s = alignment_info.src_sentence[i]
402            return ibm_model.translation_table[t][s]
403
404        def distortion_term(j):
405            t = alignment_info.trg_sentence[j]
406            i = alignment_info.alignment[j]
407            if i == 0:
408                # case 1: t is aligned to NULL
409                return 1.0
410            if alignment_info.is_head_word(j):
411                # case 2: t is the first word of a tablet
412                previous_cept = alignment_info.previous_cept(j)
413                src_class = None
414                if previous_cept is not None:
415                    previous_s = alignment_info.src_sentence[previous_cept]
416                    src_class = ibm_model.src_classes[previous_s]
417                trg_class = ibm_model.trg_classes[t]
418                dj = j - alignment_info.center_of_cept(previous_cept)
419                return ibm_model.head_distortion_table[dj][src_class][trg_class]
420
421            # case 3: t is a subsequent word of a tablet
422            previous_position = alignment_info.previous_in_tablet(j)
423            trg_class = ibm_model.trg_classes[t]
424            dj = j - previous_position
425            return ibm_model.non_head_distortion_table[dj][trg_class]
426
427        # end nested functions
428
429        # Abort computation whenever probability falls below MIN_PROB at
430        # any point, since MIN_PROB can be considered as zero
431        probability *= null_generation_term()
432        if probability < MIN_PROB:
433            return MIN_PROB
434
435        probability *= fertility_term()
436        if probability < MIN_PROB:
437            return MIN_PROB
438
439        for j in range(1, len(alignment_info.trg_sentence)):
440            probability *= lexical_translation_term(j)
441            if probability < MIN_PROB:
442                return MIN_PROB
443
444            probability *= distortion_term(j)
445            if probability < MIN_PROB:
446                return MIN_PROB
447
448        return probability
449
450
451class Model4Counts(Counts):
452    """
453    Data object to store counts of various parameters during training.
454    Includes counts for distortion.
455    """
456
457    def __init__(self):
458        super(Model4Counts, self).__init__()
459        self.head_distortion = defaultdict(
460            lambda: defaultdict(lambda: defaultdict(lambda: 0.0))
461        )
462        self.head_distortion_for_any_dj = defaultdict(lambda: defaultdict(lambda: 0.0))
463        self.non_head_distortion = defaultdict(lambda: defaultdict(lambda: 0.0))
464        self.non_head_distortion_for_any_dj = defaultdict(lambda: 0.0)
465
466    def update_distortion(self, count, alignment_info, j, src_classes, trg_classes):
467        i = alignment_info.alignment[j]
468        t = alignment_info.trg_sentence[j]
469        if i == 0:
470            # case 1: t is aligned to NULL
471            pass
472        elif alignment_info.is_head_word(j):
473            # case 2: t is the first word of a tablet
474            previous_cept = alignment_info.previous_cept(j)
475            if previous_cept is not None:
476                previous_src_word = alignment_info.src_sentence[previous_cept]
477                src_class = src_classes[previous_src_word]
478            else:
479                src_class = None
480            trg_class = trg_classes[t]
481            dj = j - alignment_info.center_of_cept(previous_cept)
482            self.head_distortion[dj][src_class][trg_class] += count
483            self.head_distortion_for_any_dj[src_class][trg_class] += count
484        else:
485            # case 3: t is a subsequent word of a tablet
486            previous_j = alignment_info.previous_in_tablet(j)
487            trg_class = trg_classes[t]
488            dj = j - previous_j
489            self.non_head_distortion[dj][trg_class] += count
490            self.non_head_distortion_for_any_dj[trg_class] += count
491