1# -*- coding: utf-8 -*-
2# Natural Language Toolkit: IBM Model 1
3#
4# Copyright (C) 2001-2013 NLTK Project
5# Author: Chin Yee Lee <c.lee32@student.unimelb.edu.au>
6#         Hengfeng Li <hengfeng12345@gmail.com>
7#         Ruxin Hou <r.hou@student.unimelb.edu.au>
8#         Calvin Tanujaya Lim <c.tanujayalim@gmail.com>
9# Based on earlier version by:
10#         Will Zhang <wilzzha@gmail.com>
11#         Guan Gui <ggui@student.unimelb.edu.au>
12# URL: <http://nltk.org/>
13# For license information, see LICENSE.TXT
14
15"""
16Lexical translation model that ignores word order.
17
18In IBM Model 1, word order is ignored for simplicity. As long as the
19word alignments are equivalent, it doesn't matter where the word occurs
20in the source or target sentence. Thus, the following three alignments
21are equally likely.
22
23Source: je mange du jambon
24Target: i eat some ham
25Alignment: (0,0) (1,1) (2,2) (3,3)
26
27Source: je mange du jambon
28Target: some ham eat i
29Alignment: (0,2) (1,3) (2,1) (3,1)
30
31Source: du jambon je mange
32Target: eat i some ham
33Alignment: (0,3) (1,2) (2,0) (3,1)
34
35Note that an alignment is represented here as
36(word_index_in_target, word_index_in_source).
37
38The EM algorithm used in Model 1 is:
39E step - In the training data, count how many times a source language
40         word is translated into a target language word, weighted by
41         the prior probability of the translation.
42
43M step - Estimate the new probability of translation based on the
44         counts from the Expectation step.
45
46
47Notations:
48i: Position in the source sentence
49    Valid values are 0 (for NULL), 1, 2, ..., length of source sentence
50j: Position in the target sentence
51    Valid values are 1, 2, ..., length of target sentence
52s: A word in the source language
53t: A word in the target language
54
55
56References:
57Philipp Koehn. 2010. Statistical Machine Translation.
58Cambridge University Press, New York.
59
60Peter E Brown, Stephen A. Della Pietra, Vincent J. Della Pietra, and
61Robert L. Mercer. 1993. The Mathematics of Statistical Machine
62Translation: Parameter Estimation. Computational Linguistics, 19 (2),
63263-311.
64"""
65
66from __future__ import division
67from collections import defaultdict
68from nltk.translate import AlignedSent
69from nltk.translate import Alignment
70from nltk.translate import IBMModel
71from nltk.translate.ibm_model import Counts
72import warnings
73
74
75class IBMModel1(IBMModel):
76    """
77    Lexical translation model that ignores word order
78
79    >>> bitext = []
80    >>> bitext.append(AlignedSent(['klein', 'ist', 'das', 'haus'], ['the', 'house', 'is', 'small']))
81    >>> bitext.append(AlignedSent(['das', 'haus', 'ist', 'ja', 'groß'], ['the', 'house', 'is', 'big']))
82    >>> bitext.append(AlignedSent(['das', 'buch', 'ist', 'ja', 'klein'], ['the', 'book', 'is', 'small']))
83    >>> bitext.append(AlignedSent(['das', 'haus'], ['the', 'house']))
84    >>> bitext.append(AlignedSent(['das', 'buch'], ['the', 'book']))
85    >>> bitext.append(AlignedSent(['ein', 'buch'], ['a', 'book']))
86
87    >>> ibm1 = IBMModel1(bitext, 5)
88
89    >>> print(ibm1.translation_table['buch']['book'])
90    0.889...
91    >>> print(ibm1.translation_table['das']['book'])
92    0.061...
93    >>> print(ibm1.translation_table['buch'][None])
94    0.113...
95    >>> print(ibm1.translation_table['ja'][None])
96    0.072...
97
98    >>> test_sentence = bitext[2]
99    >>> test_sentence.words
100    ['das', 'buch', 'ist', 'ja', 'klein']
101    >>> test_sentence.mots
102    ['the', 'book', 'is', 'small']
103    >>> test_sentence.alignment
104    Alignment([(0, 0), (1, 1), (2, 2), (3, 2), (4, 3)])
105
106    """
107
108    def __init__(self, sentence_aligned_corpus, iterations, probability_tables=None):
109        """
110        Train on ``sentence_aligned_corpus`` and create a lexical
111        translation model.
112
113        Translation direction is from ``AlignedSent.mots`` to
114        ``AlignedSent.words``.
115
116        :param sentence_aligned_corpus: Sentence-aligned parallel corpus
117        :type sentence_aligned_corpus: list(AlignedSent)
118
119        :param iterations: Number of iterations to run training algorithm
120        :type iterations: int
121
122        :param probability_tables: Optional. Use this to pass in custom
123            probability values. If not specified, probabilities will be
124            set to a uniform distribution, or some other sensible value.
125            If specified, the following entry must be present:
126            ``translation_table``.
127            See ``IBMModel`` for the type and purpose of this table.
128        :type probability_tables: dict[str]: object
129        """
130        super(IBMModel1, self).__init__(sentence_aligned_corpus)
131
132        if probability_tables is None:
133            self.set_uniform_probabilities(sentence_aligned_corpus)
134        else:
135            # Set user-defined probabilities
136            self.translation_table = probability_tables['translation_table']
137
138        for n in range(0, iterations):
139            self.train(sentence_aligned_corpus)
140
141        self.align_all(sentence_aligned_corpus)
142
143    def set_uniform_probabilities(self, sentence_aligned_corpus):
144        initial_prob = 1 / len(self.trg_vocab)
145        if initial_prob < IBMModel.MIN_PROB:
146            warnings.warn(
147                "Target language vocabulary is too large ("
148                + str(len(self.trg_vocab))
149                + " words). "
150                "Results may be less accurate."
151            )
152
153        for t in self.trg_vocab:
154            self.translation_table[t] = defaultdict(lambda: initial_prob)
155
156    def train(self, parallel_corpus):
157        counts = Counts()
158        for aligned_sentence in parallel_corpus:
159            trg_sentence = aligned_sentence.words
160            src_sentence = [None] + aligned_sentence.mots
161
162            # E step (a): Compute normalization factors to weigh counts
163            total_count = self.prob_all_alignments(src_sentence, trg_sentence)
164
165            # E step (b): Collect counts
166            for t in trg_sentence:
167                for s in src_sentence:
168                    count = self.prob_alignment_point(s, t)
169                    normalized_count = count / total_count[t]
170                    counts.t_given_s[t][s] += normalized_count
171                    counts.any_t_given_s[s] += normalized_count
172
173        # M step: Update probabilities with maximum likelihood estimate
174        self.maximize_lexical_translation_probabilities(counts)
175
176    def prob_all_alignments(self, src_sentence, trg_sentence):
177        """
178        Computes the probability of all possible word alignments,
179        expressed as a marginal distribution over target words t
180
181        Each entry in the return value represents the contribution to
182        the total alignment probability by the target word t.
183
184        To obtain probability(alignment | src_sentence, trg_sentence),
185        simply sum the entries in the return value.
186
187        :return: Probability of t for all s in ``src_sentence``
188        :rtype: dict(str): float
189        """
190        alignment_prob_for_t = defaultdict(lambda: 0.0)
191        for t in trg_sentence:
192            for s in src_sentence:
193                alignment_prob_for_t[t] += self.prob_alignment_point(s, t)
194        return alignment_prob_for_t
195
196    def prob_alignment_point(self, s, t):
197        """
198        Probability that word ``t`` in the target sentence is aligned to
199        word ``s`` in the source sentence
200        """
201        return self.translation_table[t][s]
202
203    def prob_t_a_given_s(self, alignment_info):
204        """
205        Probability of target sentence and an alignment given the
206        source sentence
207        """
208        prob = 1.0
209
210        for j, i in enumerate(alignment_info.alignment):
211            if j == 0:
212                continue  # skip the dummy zeroeth element
213            trg_word = alignment_info.trg_sentence[j]
214            src_word = alignment_info.src_sentence[i]
215            prob *= self.translation_table[trg_word][src_word]
216
217        return max(prob, IBMModel.MIN_PROB)
218
219    def align_all(self, parallel_corpus):
220        for sentence_pair in parallel_corpus:
221            self.align(sentence_pair)
222
223    def align(self, sentence_pair):
224        """
225        Determines the best word alignment for one sentence pair from
226        the corpus that the model was trained on.
227
228        The best alignment will be set in ``sentence_pair`` when the
229        method returns. In contrast with the internal implementation of
230        IBM models, the word indices in the ``Alignment`` are zero-
231        indexed, not one-indexed.
232
233        :param sentence_pair: A sentence in the source language and its
234            counterpart sentence in the target language
235        :type sentence_pair: AlignedSent
236        """
237        best_alignment = []
238
239        for j, trg_word in enumerate(sentence_pair.words):
240            # Initialize trg_word to align with the NULL token
241            best_prob = max(self.translation_table[trg_word][None], IBMModel.MIN_PROB)
242            best_alignment_point = None
243            for i, src_word in enumerate(sentence_pair.mots):
244                align_prob = self.translation_table[trg_word][src_word]
245                if align_prob >= best_prob:  # prefer newer word in case of tie
246                    best_prob = align_prob
247                    best_alignment_point = i
248
249            best_alignment.append((j, best_alignment_point))
250
251        sentence_pair.alignment = Alignment(best_alignment)
252