# -*- coding: utf-8 -*-
# Natural Language Toolkit: IBM Model 1
#
# Copyright (C) 2001-2013 NLTK Project
# Author: Chin Yee Lee <c.lee32@student.unimelb.edu.au>
#         Hengfeng Li <hengfeng12345@gmail.com>
#         Ruxin Hou <r.hou@student.unimelb.edu.au>
#         Calvin Tanujaya Lim <c.tanujayalim@gmail.com>
# Based on earlier version by:
#         Will Zhang <wilzzha@gmail.com>
#         Guan Gui <ggui@student.unimelb.edu.au>
# URL: <http://nltk.org/>
# For license information, see LICENSE.TXT

"""
Lexical translation model that ignores word order.

In IBM Model 1, word order is ignored for simplicity. As long as the
word alignments are equivalent, it doesn't matter where the word occurs
in the source or target sentence. Thus, the following three alignments
are equally likely.

Source: je mange du jambon
Target: i eat some ham
Alignment: (0,0) (1,1) (2,2) (3,3)

Source: je mange du jambon
Target: some ham eat i
Alignment: (0,2) (1,3) (2,1) (3,0)

Source: du jambon je mange
Target: eat i some ham
Alignment: (0,3) (1,2) (2,0) (3,1)

Note that an alignment is represented here as
(word_index_in_target, word_index_in_source).

The EM algorithm used in Model 1 is:
E step - In the training data, count how many times a source language
         word is translated into a target language word, weighted by
         the prior probability of the translation.

M step - Estimate the new probability of translation based on the
         counts from the Expectation step.


Notations:
i: Position in the source sentence
    Valid values are 0 (for NULL), 1, 2, ..., length of source sentence
j: Position in the target sentence
    Valid values are 1, 2, ..., length of target sentence
s: A word in the source language
t: A word in the target language


References:
Philipp Koehn. 2010. Statistical Machine Translation.
Cambridge University Press, New York.

Peter E Brown, Stephen A. Della Pietra, Vincent J. Della Pietra, and
Robert L. Mercer.
1993. The Mathematics of Statistical Machine
Translation: Parameter Estimation. Computational Linguistics, 19 (2),
263-311.
"""

from __future__ import division
from collections import defaultdict
from nltk.translate import AlignedSent
from nltk.translate import Alignment
from nltk.translate import IBMModel
from nltk.translate.ibm_model import Counts
import warnings


class IBMModel1(IBMModel):
    """
    Lexical translation model that ignores word order

    >>> bitext = []
    >>> bitext.append(AlignedSent(['klein', 'ist', 'das', 'haus'], ['the', 'house', 'is', 'small']))
    >>> bitext.append(AlignedSent(['das', 'haus', 'ist', 'ja', 'groß'], ['the', 'house', 'is', 'big']))
    >>> bitext.append(AlignedSent(['das', 'buch', 'ist', 'ja', 'klein'], ['the', 'book', 'is', 'small']))
    >>> bitext.append(AlignedSent(['das', 'haus'], ['the', 'house']))
    >>> bitext.append(AlignedSent(['das', 'buch'], ['the', 'book']))
    >>> bitext.append(AlignedSent(['ein', 'buch'], ['a', 'book']))

    >>> ibm1 = IBMModel1(bitext, 5)

    >>> print(ibm1.translation_table['buch']['book'])
    0.889...
    >>> print(ibm1.translation_table['das']['book'])
    0.061...
    >>> print(ibm1.translation_table['buch'][None])
    0.113...
    >>> print(ibm1.translation_table['ja'][None])
    0.072...

    >>> test_sentence = bitext[2]
    >>> test_sentence.words
    ['das', 'buch', 'ist', 'ja', 'klein']
    >>> test_sentence.mots
    ['the', 'book', 'is', 'small']
    >>> test_sentence.alignment
    Alignment([(0, 0), (1, 1), (2, 2), (3, 2), (4, 3)])

    """

    def __init__(self, sentence_aligned_corpus, iterations, probability_tables=None):
        """
        Train on ``sentence_aligned_corpus`` and create a lexical
        translation model.

        Translation direction is from ``AlignedSent.mots`` to
        ``AlignedSent.words``.

        :param sentence_aligned_corpus: Sentence-aligned parallel corpus
        :type sentence_aligned_corpus: list(AlignedSent)

        :param iterations: Number of iterations to run training algorithm
        :type iterations: int

        :param probability_tables: Optional. Use this to pass in custom
            probability values. If not specified, probabilities will be
            set to a uniform distribution, or some other sensible value.
            If specified, the following entry must be present:
            ``translation_table``.
            See ``IBMModel`` for the type and purpose of this table.
        :type probability_tables: dict[str]: object
        """
        # Base class builds the vocabularies and default probability tables.
        super(IBMModel1, self).__init__(sentence_aligned_corpus)

        if probability_tables is None:
            self.set_uniform_probabilities(sentence_aligned_corpus)
        else:
            # Set user-defined probabilities
            self.translation_table = probability_tables['translation_table']

        # Run the EM algorithm for the requested number of iterations.
        for n in range(0, iterations):
            self.train(sentence_aligned_corpus)

        # Store the best alignment found for each sentence pair back
        # into the corpus itself.
        self.align_all(sentence_aligned_corpus)

    def set_uniform_probabilities(self, sentence_aligned_corpus):
        # Every translation is initially considered equally likely:
        # 1 / (size of target vocabulary).
        initial_prob = 1 / len(self.trg_vocab)
        # With a huge vocabulary the uniform probability may underflow
        # below the model's minimum representable probability.
        if initial_prob < IBMModel.MIN_PROB:
            warnings.warn(
                "Target language vocabulary is too large ("
                + str(len(self.trg_vocab))
                + " words). "
                "Results may be less accurate."
            )

        # defaultdict so that unseen source words also get the uniform value.
        for t in self.trg_vocab:
            self.translation_table[t] = defaultdict(lambda: initial_prob)

    def train(self, parallel_corpus):
        """
        Run one iteration of the EM algorithm over ``parallel_corpus``,
        updating ``self.translation_table`` in place.
        """
        counts = Counts()
        for aligned_sentence in parallel_corpus:
            trg_sentence = aligned_sentence.words
            # Prepend NULL so target words may align to no source word.
            src_sentence = [None] + aligned_sentence.mots

            # E step (a): Compute normalization factors to weigh counts
            total_count = self.prob_all_alignments(src_sentence, trg_sentence)

            # E step (b): Collect counts
            for t in trg_sentence:
                for s in src_sentence:
                    count = self.prob_alignment_point(s, t)
                    normalized_count = count / total_count[t]
                    counts.t_given_s[t][s] += normalized_count
                    counts.any_t_given_s[s] += normalized_count

        # M step: Update probabilities with maximum likelihood estimate
        self.maximize_lexical_translation_probabilities(counts)

    def prob_all_alignments(self, src_sentence, trg_sentence):
        """
        Computes the probability of all possible word alignments,
        expressed as a marginal distribution over target words t

        Each entry in the return value represents the contribution to
        the total alignment probability by the target word t.

        To obtain probability(alignment | src_sentence, trg_sentence),
        simply sum the entries in the return value.

        :return: Probability of t for all s in ``src_sentence``
        :rtype: dict(str): float
        """
        alignment_prob_for_t = defaultdict(lambda: 0.0)
        for t in trg_sentence:
            for s in src_sentence:
                alignment_prob_for_t[t] += self.prob_alignment_point(s, t)
        return alignment_prob_for_t

    def prob_alignment_point(self, s, t):
        """
        Probability that word ``t`` in the target sentence is aligned to
        word ``s`` in the source sentence
        """
        return self.translation_table[t][s]

    def prob_t_a_given_s(self, alignment_info):
        """
        Probability of target sentence and an alignment given the
        source sentence
        """
        prob = 1.0

        for j, i in enumerate(alignment_info.alignment):
            if j == 0:
                continue  # skip the dummy zeroeth element
            trg_word = alignment_info.trg_sentence[j]
            src_word = alignment_info.src_sentence[i]
            prob *= self.translation_table[trg_word][src_word]

        # Floor the result so downstream computations never see a
        # zero (or underflowed) probability.
        return max(prob, IBMModel.MIN_PROB)

    def align_all(self, parallel_corpus):
        # Compute and store the best alignment for every sentence pair.
        for sentence_pair in parallel_corpus:
            self.align(sentence_pair)

    def align(self, sentence_pair):
        """
        Determines the best word alignment for one sentence pair from
        the corpus that the model was trained on.

        The best alignment will be set in ``sentence_pair`` when the
        method returns. In contrast with the internal implementation of
        IBM models, the word indices in the ``Alignment`` are zero-
        indexed, not one-indexed.

        :param sentence_pair: A sentence in the source language and its
            counterpart sentence in the target language
        :type sentence_pair: AlignedSent
        """
        best_alignment = []

        for j, trg_word in enumerate(sentence_pair.words):
            # Initialize trg_word to align with the NULL token
            best_prob = max(self.translation_table[trg_word][None], IBMModel.MIN_PROB)
            best_alignment_point = None
            for i, src_word in enumerate(sentence_pair.mots):
                align_prob = self.translation_table[trg_word][src_word]
                if align_prob >= best_prob:  # prefer newer word in case of tie
                    best_prob = align_prob
                    best_alignment_point = i

            best_alignment.append((j, best_alignment_point))

        sentence_pair.alignment = Alignment(best_alignment)