1 /*************************************************************************
2 * *
3 * (C) Copyright 2004. Media Research Centre at the *
4 * Sociology and Communications Department of the *
5 * Budapest University of Technology and Economics. *
6 * *
7 * Developed by Daniel Varga. *
8 * *
9 * From hunalign; for license see ../AUTHORS and ../COPYING.hunalign *
10 * *
11 *************************************************************************/
12 #include <apertium/tmx_alignment.h>
13
14 #include <apertium/tmx_words.h> // For SentenceList
15 #include <apertium/tmx_dictionary.h> // For FrequencyMap
16 #include <apertium/string_utils.h>
17
18 #include <iostream>
19 #include <map>
20 #include <set>
21 #include <algorithm>
22
23 // Copypaste-elve. TODO Elhelyezni.
24 #define massert(e) if (!(e)) { std::wcerr << #e << " failed" << std::endl; throw "assert"; }
25
operator <<(std::ostream & os,std::pair<int,int> p)26 std::ostream& operator<<( std::ostream& os, std::pair<int,int> p )
27 {
28 os << p.first << "," << p.second;
29 return os;
30 }
31
32 namespace TMXAligner
33 {
34
35
36 // Attention, the two-sentence length is the first argument. Usually the Hungarian is, but not here.
37 // The bigger the better. closeness is always smaller than bestScore.
closeness(double twoSentenceLength,double oneSentenceLength)38 double closeness( double twoSentenceLength, double oneSentenceLength )
39 {
40 const double bestScore = 0.3;
41 const double quasiglobal_closenessMultiplier = 0.3;
42
43 double ratio;
44
45 if (twoSentenceLength>oneSentenceLength)
46 {
47 ratio = (twoSentenceLength+1)/(oneSentenceLength+1);
48 }
49 else
50 {
51 ratio = (oneSentenceLength+1)/(twoSentenceLength+1);
52 }
53
54 ratio -= 1.0;
55
56 // assert(ratio>=0);
57 return bestScore - quasiglobal_closenessMultiplier * ratio;
58 }
59
60 const unsigned char Diag = 1;
61 const unsigned char HuSkip = 2;
62 const unsigned char EnSkip = 3;
63 const unsigned char HuHuEnSkip = 4;
64 const unsigned char HuEnEnSkip = 5;
65 const unsigned char Dead = 6;
66
buildDynProgMatrix(const AlignMatrix & w,const SentenceValues & huLength,const SentenceValues & enLength,QuasiDiagonal<double> & v,TrelliMatrix & trellis)67 void buildDynProgMatrix( const AlignMatrix& w, const SentenceValues& huLength, const SentenceValues& enLength,
68 QuasiDiagonal<double>& v, TrelliMatrix& trellis )
69 {
70 const int huBookSize = w.size();
71
72
73 int huPos,enPos;
74
75 // v[huPos][enPos] gives the similarity of the [0,huPos) and [0,enPos) intervals.
76 // The smaller value, the better similarity. (Unlike in the original similarity matrix w, where bigger is better.)
77
78 double infinity = 1e6;
79
80 for ( huPos=0; huPos<=huBookSize; ++huPos )
81 {
82 int rowStart = v.rowStart(huPos);
83 int rowEnd = v.rowEnd(huPos);
84 for ( enPos=rowStart; enPos<rowEnd; ++enPos )
85 {
86 double& val = v.cell(huPos,enPos);
87 unsigned char& trail = trellis.cell(huPos,enPos);
88
89 bool quasiglobal_knightsMoveAllowed = true;
90 if (quasiglobal_knightsMoveAllowed)
91 {
92 double lengthFitness(0);
93
94 bool quasiglobal_lengthFitnessApplied = true;
95
96 // The array is indexed by the step directions. The smaller value, the better.
97 double values[Dead];
98 int i;
99 for ( i=1; i<Dead; ++i )
100 values[i] = infinity;
101
102 if (huPos>0)
103 {
104 values[HuSkip] = v[huPos-1][enPos] - skipScore;
105 }
106
107 if (enPos>0)
108 {
109 values[EnSkip] = v[huPos][enPos-1] - skipScore;
110 }
111
112 if ((huPos>0) && (enPos>0))
113 {
114 if (quasiglobal_lengthFitnessApplied)
115 {
116 lengthFitness = closeness(huLength[huPos-1], enLength[enPos-1]);
117 }
118 else
119 {
120 lengthFitness = 0;
121 }
122
123 values[Diag] = v[huPos-1][enPos-1] - w[huPos-1][enPos-1] - lengthFitness ;
124 }
125
126 const double dotLength = 2.0 ;
127
128 if ((huPos>1) && (enPos>0))
129 {
130 if (quasiglobal_lengthFitnessApplied)
131 {
132 lengthFitness = closeness(huLength[huPos-2]+huLength[huPos-1]+dotLength, enLength[enPos-1]);
133 }
134 else
135 {
136 lengthFitness = 0;
137 }
138
139 }
140
141 if ((huPos>0) && (enPos>1))
142 {
143 if (quasiglobal_lengthFitnessApplied)
144 {
145 // Attention, the two-sentence length is the first argument. Usually the Hungarian is the first argument, but not here.
146 lengthFitness = closeness(enLength[enPos-2]+enLength[enPos-1]+dotLength, huLength[huPos-1]);
147 }
148 else
149 {
150 lengthFitness = 0;
151 }
152
153 const double& a = w[huPos-1][enPos-1] ;
154 const double& b = w[huPos-1][enPos-2] ;
155 values[HuEnEnSkip] = v[huPos-1][enPos-2] - ( a<b ? a : b ) - skipScore - lengthFitness ; // The worse of the two crossed square.
156 }
157
158 unsigned char direction = Dead;
159 double bestValue = infinity;
160 for ( i=1; i<Dead; ++i )
161 {
162 if (values[i]<bestValue)
163 {
164 bestValue = values[i];
165 direction = i;
166 }
167 }
168
169 trail = direction;
170 if (direction==Dead)
171 {
172 val = 0;
173 }
174 else
175 {
176 val = bestValue;
177 }
178 }
179 else // (!quasiglobal_knightsMoveAllowed)
180 {
181 int borderCase = ( (huPos==0) ? 0 : 2 ) + ( (enPos==0) ? 0 : 1 ) ;
182
183 switch (borderCase)
184 {
185 case 0:
186 {
187 val = 0;
188 trail = Dead;
189 break;
190 }
191 case 1: // huPos==0
192 {
193 val = v[0][enPos-1] - skipScore ;
194 trail = EnSkip;
195 break;
196 }
197 case 2: // enPos==0
198 {
199 val = v[huPos-1][0] - skipScore ;
200 trail = HuSkip;
201 break;
202 }
203 case 3:
204 {
205 double x = v[huPos-1][enPos] - skipScore ;
206 double y = v[huPos] [enPos-1] - skipScore ;
207 double xy = v[huPos-1][enPos-1] - w[huPos-1][enPos-1] ;
208
209 double best = xy;
210 trail = Diag;
211 if (x<best)
212 {
213 best = x;
214 trail = HuSkip;
215 }
216 if (y<best)
217 {
218 best = y;
219 trail = EnSkip;
220 }
221 val = best;
222 break;
223 }
224 }
225 }
226 }
227 }
228 }
229
trelliToLadder(const TrelliMatrix & trellis,Trail & bestTrail)230 void trelliToLadder( const TrelliMatrix& trellis, Trail& bestTrail )
231 {
232 bestTrail.clear();
233
234 // The -1 is needed because the trellis matrix is one larger than the similarity matrix.
235 // This points to its downmost rightmost element.
236 const int huBookSize = trellis.size()-1;
237 const int enBookSize = trellis.otherSize()-1;
238
239 int huPos=huBookSize;
240 int enPos=enBookSize;
241
242 bool logging = false;
243
244 if (logging) std::wcerr << std::endl;
245
246 bool over = false;
247 bool hopelesslyBadTrail = false;
248 bestTrail.push_back(std::make_pair(huPos,enPos));
249
250 while (true)
251 {
252 unsigned char trelli = trellis[huPos][enPos];
253
254 if ((huPos==0) || (enPos==0))
255 break;
256
257 switch (trelli)
258 {
259 case Diag :
260 {
261 --huPos;
262 --enPos;
263 break;
264 }
265 case HuSkip :
266 {
267 --huPos;
268 break;
269 }
270 case EnSkip :
271 {
272 --enPos;
273 break;
274 }
275 case HuHuEnSkip :
276 {
277 huPos -= 2;
278 --enPos;
279 break;
280 }
281 case HuEnEnSkip :
282 {
283 --huPos;
284 enPos -= 2;
285 break;
286 }
287 case Dead :
288 {
289 over = true;
290 break;
291 }
292 default:
293 {
294 hopelesslyBadTrail = true;
295 over = true;
296 break;
297 }
298 }
299
300 if (over)
301 break;
302
303 bestTrail.push_back(std::make_pair(huPos,enPos));
304
305 if (logging)
306 {
307 std::wcerr << huPos << " \t" << enPos << std::endl;
308 }
309
310 }
311
312 if (hopelesslyBadTrail)
313 {
314 bestTrail.clear();
315 bestTrail.push_back(std::make_pair(huBookSize,enBookSize));
316 bestTrail.push_back(std::make_pair(0,0));
317 std::wcerr << "Error: hopelessly bad trail." << std::endl;
318 }
319
320 std::reverse(bestTrail.begin(), bestTrail.end() );
321 }
322
323
align(const AlignMatrix & w,const SentenceValues & huLength,const SentenceValues & enLength,Trail & bestTrail,AlignMatrix & v)324 void align( const AlignMatrix& w, const SentenceValues& huLength, const SentenceValues& enLength,
325 Trail& bestTrail, AlignMatrix& v )
326 {
327 const int huBookSize = w.size();
328 const int enBookSize = w.otherSize();
329 const int thickness = w.thickness();
330
331 massert(w.size()+1 == v.size());
332 massert(w.otherSize()+1 == v.otherSize());
333
334 TrelliMatrix trellis( huBookSize+1,enBookSize+1,thickness, Dead );
335
336 buildDynProgMatrix( w, huLength, enLength, v, trellis );
337
338 // std::wcerr << "Matrix built." << std::endl;
339
340 trelliToLadder( trellis, bestTrail );
341
342 // std::wcerr << "Trail found." << std::endl;
343 }
344
345
oneToOne(const Trail & bestTrail,int pos)346 bool oneToOne( const Trail& bestTrail, int pos )
347 {
348 return (
349 ( bestTrail[pos+1].first -bestTrail[pos].first == 1 )
350 &&
351 ( bestTrail[pos+1].second-bestTrail[pos].second == 1 )
352 );
353 }
354
355
countIntersectionOfTrails(const Trail & sx,const Trail & sy)356 int countIntersectionOfTrails( const Trail& sx, const Trail& sy )
357 {
358 int inter(0);
359
360 Trail::const_iterator sxt = sx.begin();
361 Trail::const_iterator syt = sy.begin();
362 Trail::const_iterator sxe = sx.end();
363 Trail::const_iterator sye = sy.end();
364 for ( ; sxt!=sxe && syt!=sye ; )
365 {
366 if ( *sxt < *syt )
367 ++sxt;
368 else if ( *sxt > *syt )
369 ++syt;
370 else
371 {
372 ++inter;
373 ++sxt;
374 ++syt;
375 }
376 }
377 return inter;
378 }
379
380
381 // A bit of an abuse of the fact that Trail and BisentenceList are typedef'd to the same structure.
scoreTrailOrBisentenceList(const Trail & trailAuto,const Trail & trailHand)382 double scoreTrailOrBisentenceList( const Trail& trailAuto, const Trail& trailHand )
383 {
384 int score = countIntersectionOfTrails( trailAuto, trailHand );
385
386 std::wcerr << trailAuto.size()-score << " misaligned out of " << trailHand.size() << " correct items, "
387 << trailAuto.size() << " bets." << std::endl;
388
389 std::wcerr << "Precision: " << 1.0*score/trailAuto.size()
390 << ", Recall: " << 1.0*score/trailHand.size() << std::endl;
391
392 double ratio = 1.0*(trailAuto.size()-score)/trailAuto.size();
393 return ratio;
394 }
395
396
trailToBisentenceList(const Trail & bestTrail,BisentenceList & bisentenceList)397 void trailToBisentenceList( const Trail& bestTrail,
398 BisentenceList& bisentenceList )
399 {
400 bisentenceList.clear();
401
402 int trailSize = bestTrail.size();
403
404 for ( int pos=0; pos<trailSize-1; ++pos )
405 {
406 if ( oneToOne(bestTrail,pos) )
407 {
408 bisentenceList.push_back(bestTrail [pos]);
409 }
410 }
411 }
412
413
scoreBisentenceList(const BisentenceList & bisentenceListAuto,const Trail & trailHand)414 double scoreBisentenceList( const BisentenceList& bisentenceListAuto, const Trail& trailHand )
415 {
416 BisentenceList bisentenceListHand;
417 trailToBisentenceList( trailHand, bisentenceListHand );
418
419 double score = scoreTrailOrBisentenceList( bisentenceListAuto, bisentenceListHand ) ;
420
421 return score;
422 }
423
scoreTrail(const Trail & trailAuto,const Trail & trailHand)424 double scoreTrail( const Trail& trailAuto, const Trail& trailHand )
425 {
426 return ( scoreTrailOrBisentenceList( trailAuto, trailHand ) );
427 }
428
429
setBox(AlignMatrix & m,int huPos,int enPos,int radius,int insideOfRadiusValue)430 void setBox( AlignMatrix& m, int huPos, int enPos, int radius, int insideOfRadiusValue )
431 {
432 for ( int x=huPos-radius; x<=huPos+radius; ++x )
433 {
434 for ( int y=enPos-radius; y<=enPos+radius; ++y )
435 {
436 if ( (x>=0) && (x<m.size()) && (y>=0) && (y<m.otherSize()) )
437 {
438 m.cell(x,y) = insideOfRadiusValue ; // ToDo: Should this be (y,x) instead? Function has args y,x not x,y. Fix here or function
439 }
440 }
441 }
442 }
443
444 // Fills the complement of the radius of the trail with minus infties.
445 // The return value true means success. Failure means that during the fill,
446 // we intersected the outside of the quasidiagonal area.
447 // In this case, the operation is not finished.
borderDetailedAlignMatrix(AlignMatrix & alignMatrix,const Trail & trail,int radius)448 bool borderDetailedAlignMatrix( AlignMatrix& alignMatrix, const Trail& trail, int radius )
449 {
450 int huBookSize = alignMatrix.size();
451
452 int huPos, enPos;
453 for ( huPos=0; huPos<huBookSize; ++huPos )
454 {
455 int rowStart = alignMatrix.rowStart(huPos);
456 int rowEnd = alignMatrix.rowEnd(huPos);
457 for ( enPos=rowStart; enPos<rowEnd; ++enPos )
458 {
459 alignMatrix.cell(huPos,enPos) = outsideOfRadiusValue;
460 }
461 }
462
463 // We seriously use the fact that many-to-zero segments are subdivided into one-to-zero segments.
464 // Inside setBox, an exception is thrown if we try to write outside the quasidiagonal.
465 // If we catch such an exception, it means that the quasidiagonal is not thick enough.
466 // In this case, we abandon the whole align, just to be sure.
467 try
468 {
469 for ( size_t i=0; i<trail.size(); ++i )
470 {
471 setBox( alignMatrix, trail[i].first, trail[i].second, radius, insideOfRadiusValue );
472 }
473 }
474 catch ( const char* errorType )
475 {
476 massert( std::string(errorType) == "out of quasidiagonal" )
477 return false;
478 }
479
480 bool verify = true;
481 if (verify)
482 {
483 int numberOfEvaluatedItems(0);
484 for ( huPos=0; huPos<huBookSize; ++huPos )
485 {
486 int rowStart = alignMatrix.rowStart(huPos);
487 int rowEnd = alignMatrix.rowEnd(huPos);
488 for ( enPos=rowStart; enPos<rowEnd; ++enPos )
489 {
490 if (alignMatrix[huPos][enPos]==insideOfRadiusValue)
491 {
492 ++numberOfEvaluatedItems;
493 }
494 }
495 }
496
497 std::wcerr << numberOfEvaluatedItems << " items inside the border." << std::endl;
498 }
499
500 return true;
501 }
502
503 template <class T>
dumpAlignMatrix(const QuasiDiagonal<T> & alignMatrix)504 void dumpAlignMatrix( const QuasiDiagonal<T>& alignMatrix )
505 {
506 int huPos,enPos;
507
508 int huBookSize = alignMatrix.size();
509 int enBookSize = alignMatrix.otherSize();
510
511 for ( huPos=0; huPos<huBookSize; ++huPos )
512 {
513 for ( enPos=0; enPos<enBookSize; ++enPos )
514 {
515 int start = alignMatrix.rowStart(huPos);
516 int end = alignMatrix.rowEnd (huPos);
517
518 if ( (enPos<start) || (enPos>=end) )
519 {
520 std::cout << "-1\t";
521 continue;
522 }
523
524 std::cout << alignMatrix[huPos][enPos] << "\t";
525 }
526 std::cout << std::endl;
527 }
528 }
529
dumpAlignMatrix(const QuasiDiagonal<int> & alignMatrix,bool graphical)530 void dumpAlignMatrix( const QuasiDiagonal<int>& alignMatrix, bool graphical )
531 {
532 int huPos,enPos;
533
534 int huBookSize = alignMatrix.size();
535 int enBookSize = alignMatrix.otherSize();
536
537 for ( huPos=0; huPos<huBookSize; ++huPos )
538 {
539 for ( enPos=0; enPos<enBookSize; ++enPos )
540 {
541 int start = alignMatrix.rowStart(huPos);
542 int end = alignMatrix.rowEnd (huPos);
543
544 if ( (enPos<start) || (enPos>=end) )
545 {
546 if (graphical)
547 {
548 std::cout << " ";
549 }
550 else
551 {
552 std::cout << "-1\t";
553 }
554 continue;
555 }
556
557 if (graphical)
558 {
559 char c(' ');
560 switch (alignMatrix[huPos][enPos])
561 {
562 case 0: c=' '; break;
563 case 1: c='.'; break;
564 case 2: c=':'; break;
565 case 3: c='|'; break;
566 case 4: c='+'; break;
567 default: c='X'; break;
568 }
569 std::cout << c << " ";
570 }
571 else
572 {
573 std::cout << alignMatrix[huPos][enPos] << "\t";
574 }
575 }
576 std::cout << std::endl;
577 }
578 }
579
dumpTrelliMatrix(const TrelliMatrix & trellis)580 void dumpTrelliMatrix( const TrelliMatrix& trellis )
581 {
582 std::map<int, std::string> directions;
583
584 directions[Diag] = "HuEn";
585 directions[HuSkip] = "Hu";
586 directions[EnSkip] = "En";
587 directions[HuHuEnSkip] = "HuHuEn";
588 directions[HuEnEnSkip] = "HuEnEn";
589 directions[Dead] = "Dead";
590
591 int huPos,enPos;
592
593 int huBookSize = trellis.size();
594 int enBookSize = trellis.otherSize();
595
596 for ( huPos=0; huPos<huBookSize; ++huPos )
597 {
598 for ( enPos=0; enPos<enBookSize; ++enPos )
599 {
600 int start = trellis.rowStart(huPos);
601 int end = trellis.rowEnd (huPos);
602
603 if ( (enPos<start) || (enPos>=end) )
604 {
605 std::cout << "-1\t";
606 continue;
607 }
608
609 std::cout << directions[trellis[huPos][enPos]] << "\t";
610 }
611 std::cout << std::endl;
612 }
613 }
614
615 } // namespace TMXAligner
616