1# -*- coding: utf-8 -*- 2# Natural Language Toolkit: IBM Model 4 3# 4# Copyright (C) 2001-2019 NLTK Project 5# Author: Tah Wei Hoon <hoon.tw@gmail.com> 6# URL: <http://nltk.org/> 7# For license information, see LICENSE.TXT 8 9""" 10Translation model that reorders output words based on their type and 11distance from other related words in the output sentence. 12 13IBM Model 4 improves the distortion model of Model 3, motivated by the 14observation that certain words tend to be re-ordered in a predictable 15way relative to one another. For example, <adjective><noun> in English 16usually has its order flipped as <noun><adjective> in French. 17 18Model 4 requires words in the source and target vocabularies to be 19categorized into classes. This can be linguistically driven, like parts 20of speech (adjective, nouns, prepositions, etc). Word classes can also 21be obtained by statistical methods. The original IBM Model 4 uses an 22information theoretic approach to group words into 50 classes for each 23vocabulary. 24 25Terminology: 26Cept: 27 A source word with non-zero fertility i.e. aligned to one or more 28 target words. 29Tablet: 30 The set of target word(s) aligned to a cept. 31Head of cept: 32 The first word of the tablet of that cept. 33Center of cept: 34 The average position of the words in that cept's tablet. If the 35 value is not an integer, the ceiling is taken. 36 For example, for a tablet with words in positions 2, 5, 6 in the 37 target sentence, the center of the corresponding cept is 38 ceil((2 + 5 + 6) / 3) = 5 39Displacement: 40 For a head word, defined as (position of head word - position of 41 previous cept's center). Can be positive or negative. 42 For a non-head word, defined as (position of non-head word - 43 position of previous word in the same tablet). Always positive, 44 because successive words in a tablet are assumed to appear to the 45 right of the previous word. 46 47In contrast to Model 3 which reorders words in a tablet independently of 48other words, Model 4 distinguishes between three cases. 49(1) Words generated by NULL are distributed uniformly. 50(2) For a head word t, its position is modeled by the probability 51 d_head(displacement | word_class_s(s),word_class_t(t)), 52 where s is the previous cept, and word_class_s and word_class_t maps 53 s and t to a source and target language word class respectively. 54(3) For a non-head word t, its position is modeled by the probability 55 d_non_head(displacement | word_class_t(t)) 56 57The EM algorithm used in Model 4 is: 58E step - In the training data, collect counts, weighted by prior 59 probabilities. 60 (a) count how many times a source language word is translated 61 into a target language word 62 (b) for a particular word class, count how many times a head 63 word is located at a particular displacement from the 64 previous cept's center 65 (c) for a particular word class, count how many times a 66 non-head word is located at a particular displacement from 67 the previous target word 68 (d) count how many times a source word is aligned to phi number 69 of target words 70 (e) count how many times NULL is aligned to a target word 71 72M step - Estimate new probabilities based on the counts from the E step 73 74Like Model 3, there are too many possible alignments to consider. Thus, 75a hill climbing approach is used to sample good candidates. 76 77 78Notations: 79i: Position in the source sentence 80 Valid values are 0 (for NULL), 1, 2, ..., length of source sentence 81j: Position in the target sentence 82 Valid values are 1, 2, ..., length of target sentence 83l: Number of words in the source sentence, excluding NULL 84m: Number of words in the target sentence 85s: A word in the source language 86t: A word in the target language 87phi: Fertility, the number of target words produced by a source word 88p1: Probability that a target word produced by a source word is 89 accompanied by another target word that is aligned to NULL 90p0: 1 - p1 91dj: Displacement, Δj 92 93 94References: 95Philipp Koehn. 2010. Statistical Machine Translation. 96Cambridge University Press, New York. 97 98Peter E Brown, Stephen A. Della Pietra, Vincent J. Della Pietra, and 99Robert L. Mercer. 1993. The Mathematics of Statistical Machine 100Translation: Parameter Estimation. Computational Linguistics, 19 (2), 101263-311. 102""" 103 104from __future__ import division 105 106import warnings 107from collections import defaultdict 108from math import factorial 109 110from nltk.translate import AlignedSent 111from nltk.translate import Alignment 112from nltk.translate import IBMModel 113from nltk.translate import IBMModel3 114from nltk.translate.ibm_model import Counts 115from nltk.translate.ibm_model import longest_target_sentence_length 116 117 118class IBMModel4(IBMModel): 119 """ 120 Translation model that reorders output words based on their type and 121 their distance from other related words in the output sentence 122 123 >>> bitext = [] 124 >>> bitext.append(AlignedSent(['klein', 'ist', 'das', 'haus'], ['the', 'house', 'is', 'small'])) 125 >>> bitext.append(AlignedSent(['das', 'haus', 'war', 'ja', 'groß'], ['the', 'house', 'was', 'big'])) 126 >>> bitext.append(AlignedSent(['das', 'buch', 'ist', 'ja', 'klein'], ['the', 'book', 'is', 'small'])) 127 >>> bitext.append(AlignedSent(['ein', 'haus', 'ist', 'klein'], ['a', 'house', 'is', 'small'])) 128 >>> bitext.append(AlignedSent(['das', 'haus'], ['the', 'house'])) 129 >>> bitext.append(AlignedSent(['das', 'buch'], ['the', 'book'])) 130 >>> bitext.append(AlignedSent(['ein', 'buch'], ['a', 'book'])) 131 >>> bitext.append(AlignedSent(['ich', 'fasse', 'das', 'buch', 'zusammen'], ['i', 'summarize', 'the', 'book'])) 132 >>> bitext.append(AlignedSent(['fasse', 'zusammen'], ['summarize'])) 133 >>> src_classes = {'the': 0, 'a': 0, 'small': 1, 'big': 1, 'house': 2, 'book': 2, 'is': 3, 'was': 3, 'i': 4, 'summarize': 5 } 134 >>> trg_classes = {'das': 0, 'ein': 0, 'haus': 1, 'buch': 1, 'klein': 2, 'groß': 2, 'ist': 3, 'war': 3, 'ja': 4, 'ich': 5, 'fasse': 6, 'zusammen': 6 } 135 136 >>> ibm4 = IBMModel4(bitext, 5, src_classes, trg_classes) 137 138 >>> print(round(ibm4.translation_table['buch']['book'], 3)) 139 1.0 140 >>> print(round(ibm4.translation_table['das']['book'], 3)) 141 0.0 142 >>> print(round(ibm4.translation_table['ja'][None], 3)) 143 1.0 144 145 >>> print(round(ibm4.head_distortion_table[1][0][1], 3)) 146 1.0 147 >>> print(round(ibm4.head_distortion_table[2][0][1], 3)) 148 0.0 149 >>> print(round(ibm4.non_head_distortion_table[3][6], 3)) 150 0.5 151 152 >>> print(round(ibm4.fertility_table[2]['summarize'], 3)) 153 1.0 154 >>> print(round(ibm4.fertility_table[1]['book'], 3)) 155 1.0 156 157 >>> print(ibm4.p1) 158 0.033... 159 160 >>> test_sentence = bitext[2] 161 >>> test_sentence.words 162 ['das', 'buch', 'ist', 'ja', 'klein'] 163 >>> test_sentence.mots 164 ['the', 'book', 'is', 'small'] 165 >>> test_sentence.alignment 166 Alignment([(0, 0), (1, 1), (2, 2), (3, None), (4, 3)]) 167 168 """ 169 170 def __init__( 171 self, 172 sentence_aligned_corpus, 173 iterations, 174 source_word_classes, 175 target_word_classes, 176 probability_tables=None, 177 ): 178 """ 179 Train on ``sentence_aligned_corpus`` and create a lexical 180 translation model, distortion models, a fertility model, and a 181 model for generating NULL-aligned words. 182 183 Translation direction is from ``AlignedSent.mots`` to 184 ``AlignedSent.words``. 185 186 :param sentence_aligned_corpus: Sentence-aligned parallel corpus 187 :type sentence_aligned_corpus: list(AlignedSent) 188 189 :param iterations: Number of iterations to run training algorithm 190 :type iterations: int 191 192 :param source_word_classes: Lookup table that maps a source word 193 to its word class, the latter represented by an integer id 194 :type source_word_classes: dict[str]: int 195 196 :param target_word_classes: Lookup table that maps a target word 197 to its word class, the latter represented by an integer id 198 :type target_word_classes: dict[str]: int 199 200 :param probability_tables: Optional. Use this to pass in custom 201 probability values. If not specified, probabilities will be 202 set to a uniform distribution, or some other sensible value. 203 If specified, all the following entries must be present: 204 ``translation_table``, ``alignment_table``, 205 ``fertility_table``, ``p1``, ``head_distortion_table``, 206 ``non_head_distortion_table``. See ``IBMModel`` and 207 ``IBMModel4`` for the type and purpose of these tables. 208 :type probability_tables: dict[str]: object 209 """ 210 super(IBMModel4, self).__init__(sentence_aligned_corpus) 211 self.reset_probabilities() 212 self.src_classes = source_word_classes 213 self.trg_classes = target_word_classes 214 215 if probability_tables is None: 216 # Get probabilities from IBM model 3 217 ibm3 = IBMModel3(sentence_aligned_corpus, iterations) 218 self.translation_table = ibm3.translation_table 219 self.alignment_table = ibm3.alignment_table 220 self.fertility_table = ibm3.fertility_table 221 self.p1 = ibm3.p1 222 self.set_uniform_probabilities(sentence_aligned_corpus) 223 else: 224 # Set user-defined probabilities 225 self.translation_table = probability_tables['translation_table'] 226 self.alignment_table = probability_tables['alignment_table'] 227 self.fertility_table = probability_tables['fertility_table'] 228 self.p1 = probability_tables['p1'] 229 self.head_distortion_table = probability_tables['head_distortion_table'] 230 self.non_head_distortion_table = probability_tables[ 231 'non_head_distortion_table' 232 ] 233 234 for n in range(0, iterations): 235 self.train(sentence_aligned_corpus) 236 237 def reset_probabilities(self): 238 super(IBMModel4, self).reset_probabilities() 239 self.head_distortion_table = defaultdict( 240 lambda: defaultdict(lambda: defaultdict(lambda: self.MIN_PROB)) 241 ) 242 """ 243 dict[int][int][int]: float. Probability(displacement of head 244 word | word class of previous cept,target word class). 245 Values accessed as ``distortion_table[dj][src_class][trg_class]``. 246 """ 247 248 self.non_head_distortion_table = defaultdict( 249 lambda: defaultdict(lambda: self.MIN_PROB) 250 ) 251 """ 252 dict[int][int]: float. Probability(displacement of non-head 253 word | target word class). 254 Values accessed as ``distortion_table[dj][trg_class]``. 255 """ 256 257 def set_uniform_probabilities(self, sentence_aligned_corpus): 258 """ 259 Set distortion probabilities uniformly to 260 1 / cardinality of displacement values 261 """ 262 max_m = longest_target_sentence_length(sentence_aligned_corpus) 263 264 # The maximum displacement is m-1, when a word is in the last 265 # position m of the target sentence and the previously placed 266 # word is in the first position. 267 # Conversely, the minimum displacement is -(m-1). 268 # Thus, the displacement range is (m-1) - (-(m-1)). Note that 269 # displacement cannot be zero and is not included in the range. 270 if max_m <= 1: 271 initial_prob = IBMModel.MIN_PROB 272 else: 273 initial_prob = 1 / (2 * (max_m - 1)) 274 if initial_prob < IBMModel.MIN_PROB: 275 warnings.warn( 276 "A target sentence is too long (" 277 + str(max_m) 278 + " words). Results may be less accurate." 279 ) 280 281 for dj in range(1, max_m): 282 self.head_distortion_table[dj] = defaultdict( 283 lambda: defaultdict(lambda: initial_prob) 284 ) 285 self.head_distortion_table[-dj] = defaultdict( 286 lambda: defaultdict(lambda: initial_prob) 287 ) 288 self.non_head_distortion_table[dj] = defaultdict(lambda: initial_prob) 289 self.non_head_distortion_table[-dj] = defaultdict(lambda: initial_prob) 290 291 def train(self, parallel_corpus): 292 counts = Model4Counts() 293 for aligned_sentence in parallel_corpus: 294 m = len(aligned_sentence.words) 295 296 # Sample the alignment space 297 sampled_alignments, best_alignment = self.sample(aligned_sentence) 298 # Record the most probable alignment 299 aligned_sentence.alignment = Alignment( 300 best_alignment.zero_indexed_alignment() 301 ) 302 303 # E step (a): Compute normalization factors to weigh counts 304 total_count = self.prob_of_alignments(sampled_alignments) 305 306 # E step (b): Collect counts 307 for alignment_info in sampled_alignments: 308 count = self.prob_t_a_given_s(alignment_info) 309 normalized_count = count / total_count 310 311 for j in range(1, m + 1): 312 counts.update_lexical_translation( 313 normalized_count, alignment_info, j 314 ) 315 counts.update_distortion( 316 normalized_count, 317 alignment_info, 318 j, 319 self.src_classes, 320 self.trg_classes, 321 ) 322 323 counts.update_null_generation(normalized_count, alignment_info) 324 counts.update_fertility(normalized_count, alignment_info) 325 326 # M step: Update probabilities with maximum likelihood estimates 327 # If any probability is less than MIN_PROB, clamp it to MIN_PROB 328 existing_alignment_table = self.alignment_table 329 self.reset_probabilities() 330 self.alignment_table = existing_alignment_table # don't retrain 331 332 self.maximize_lexical_translation_probabilities(counts) 333 self.maximize_distortion_probabilities(counts) 334 self.maximize_fertility_probabilities(counts) 335 self.maximize_null_generation_probabilities(counts) 336 337 def maximize_distortion_probabilities(self, counts): 338 head_d_table = self.head_distortion_table 339 for dj, src_classes in counts.head_distortion.items(): 340 for s_cls, trg_classes in src_classes.items(): 341 for t_cls in trg_classes: 342 estimate = ( 343 counts.head_distortion[dj][s_cls][t_cls] 344 / counts.head_distortion_for_any_dj[s_cls][t_cls] 345 ) 346 head_d_table[dj][s_cls][t_cls] = max(estimate, IBMModel.MIN_PROB) 347 348 non_head_d_table = self.non_head_distortion_table 349 for dj, trg_classes in counts.non_head_distortion.items(): 350 for t_cls in trg_classes: 351 estimate = ( 352 counts.non_head_distortion[dj][t_cls] 353 / counts.non_head_distortion_for_any_dj[t_cls] 354 ) 355 non_head_d_table[dj][t_cls] = max(estimate, IBMModel.MIN_PROB) 356 357 def prob_t_a_given_s(self, alignment_info): 358 """ 359 Probability of target sentence and an alignment given the 360 source sentence 361 """ 362 return IBMModel4.model4_prob_t_a_given_s(alignment_info, self) 363 364 @staticmethod # exposed for Model 5 to use 365 def model4_prob_t_a_given_s(alignment_info, ibm_model): 366 probability = 1.0 367 MIN_PROB = IBMModel.MIN_PROB 368 369 def null_generation_term(): 370 # Binomial distribution: B(m - null_fertility, p1) 371 value = 1.0 372 p1 = ibm_model.p1 373 p0 = 1 - p1 374 null_fertility = alignment_info.fertility_of_i(0) 375 m = len(alignment_info.trg_sentence) - 1 376 value *= pow(p1, null_fertility) * pow(p0, m - 2 * null_fertility) 377 if value < MIN_PROB: 378 return MIN_PROB 379 380 # Combination: (m - null_fertility) choose null_fertility 381 for i in range(1, null_fertility + 1): 382 value *= (m - null_fertility - i + 1) / i 383 return value 384 385 def fertility_term(): 386 value = 1.0 387 src_sentence = alignment_info.src_sentence 388 for i in range(1, len(src_sentence)): 389 fertility = alignment_info.fertility_of_i(i) 390 value *= ( 391 factorial(fertility) 392 * ibm_model.fertility_table[fertility][src_sentence[i]] 393 ) 394 if value < MIN_PROB: 395 return MIN_PROB 396 return value 397 398 def lexical_translation_term(j): 399 t = alignment_info.trg_sentence[j] 400 i = alignment_info.alignment[j] 401 s = alignment_info.src_sentence[i] 402 return ibm_model.translation_table[t][s] 403 404 def distortion_term(j): 405 t = alignment_info.trg_sentence[j] 406 i = alignment_info.alignment[j] 407 if i == 0: 408 # case 1: t is aligned to NULL 409 return 1.0 410 if alignment_info.is_head_word(j): 411 # case 2: t is the first word of a tablet 412 previous_cept = alignment_info.previous_cept(j) 413 src_class = None 414 if previous_cept is not None: 415 previous_s = alignment_info.src_sentence[previous_cept] 416 src_class = ibm_model.src_classes[previous_s] 417 trg_class = ibm_model.trg_classes[t] 418 dj = j - alignment_info.center_of_cept(previous_cept) 419 return ibm_model.head_distortion_table[dj][src_class][trg_class] 420 421 # case 3: t is a subsequent word of a tablet 422 previous_position = alignment_info.previous_in_tablet(j) 423 trg_class = ibm_model.trg_classes[t] 424 dj = j - previous_position 425 return ibm_model.non_head_distortion_table[dj][trg_class] 426 427 # end nested functions 428 429 # Abort computation whenever probability falls below MIN_PROB at 430 # any point, since MIN_PROB can be considered as zero 431 probability *= null_generation_term() 432 if probability < MIN_PROB: 433 return MIN_PROB 434 435 probability *= fertility_term() 436 if probability < MIN_PROB: 437 return MIN_PROB 438 439 for j in range(1, len(alignment_info.trg_sentence)): 440 probability *= lexical_translation_term(j) 441 if probability < MIN_PROB: 442 return MIN_PROB 443 444 probability *= distortion_term(j) 445 if probability < MIN_PROB: 446 return MIN_PROB 447 448 return probability 449 450 451class Model4Counts(Counts): 452 """ 453 Data object to store counts of various parameters during training. 454 Includes counts for distortion. 455 """ 456 457 def __init__(self): 458 super(Model4Counts, self).__init__() 459 self.head_distortion = defaultdict( 460 lambda: defaultdict(lambda: defaultdict(lambda: 0.0)) 461 ) 462 self.head_distortion_for_any_dj = defaultdict(lambda: defaultdict(lambda: 0.0)) 463 self.non_head_distortion = defaultdict(lambda: defaultdict(lambda: 0.0)) 464 self.non_head_distortion_for_any_dj = defaultdict(lambda: 0.0) 465 466 def update_distortion(self, count, alignment_info, j, src_classes, trg_classes): 467 i = alignment_info.alignment[j] 468 t = alignment_info.trg_sentence[j] 469 if i == 0: 470 # case 1: t is aligned to NULL 471 pass 472 elif alignment_info.is_head_word(j): 473 # case 2: t is the first word of a tablet 474 previous_cept = alignment_info.previous_cept(j) 475 if previous_cept is not None: 476 previous_src_word = alignment_info.src_sentence[previous_cept] 477 src_class = src_classes[previous_src_word] 478 else: 479 src_class = None 480 trg_class = trg_classes[t] 481 dj = j - alignment_info.center_of_cept(previous_cept) 482 self.head_distortion[dj][src_class][trg_class] += count 483 self.head_distortion_for_any_dj[src_class][trg_class] += count 484 else: 485 # case 3: t is a subsequent word of a tablet 486 previous_j = alignment_info.previous_in_tablet(j) 487 trg_class = trg_classes[t] 488 dj = j - previous_j 489 self.non_head_distortion[dj][trg_class] += count 490 self.non_head_distortion_for_any_dj[trg_class] += count 491