1 /*************************************************************************
2 *                                                                        *
3 *  (C) Copyright 2004. Media Research Centre at the                      *
4 *  Sociology and Communications Department of the                        *
5 *  Budapest University of Technology and Economics.                      *
6 *                                                                        *
7 *  Developed by Daniel Varga.                                            *
8 *                                                                        *
9 *  From hunalign; for license see ../AUTHORS and ../COPYING.hunalign     *
10 *                                                                        *
11 *************************************************************************/
12 #include <apertium/tmx_alignment.h>
13 
14 #include <apertium/tmx_words.h> // For SentenceList
15 #include <apertium/tmx_dictionary.h> // For FrequencyMap
16 #include <apertium/string_utils.h>
17 
18 #include <iostream>
19 #include <map>
20 #include <set>
21 #include <algorithm>
22 
23 // Copypaste-elve. TODO Elhelyezni.
24 #define massert(e) if (!(e)) { std::wcerr << #e << " failed" << std::endl; throw "assert"; }
25 
operator <<(std::ostream & os,std::pair<int,int> p)26 std::ostream& operator<<( std::ostream& os, std::pair<int,int> p )
27 {
28   os << p.first << "," << p.second;
29   return os;
30 }
31 
32 namespace TMXAligner
33 {
34 
35 
36 // Attention, the two-sentence length is the first argument. Usually the Hungarian is, but not here.
37 // The bigger the better. closeness is always smaller than bestScore.
closeness(double twoSentenceLength,double oneSentenceLength)38 double closeness( double twoSentenceLength, double oneSentenceLength )
39 {
40   const double bestScore = 0.3;
41   const double quasiglobal_closenessMultiplier = 0.3;
42 
43   double ratio;
44 
45   if (twoSentenceLength>oneSentenceLength)
46   {
47     ratio = (twoSentenceLength+1)/(oneSentenceLength+1);
48   }
49   else
50   {
51     ratio = (oneSentenceLength+1)/(twoSentenceLength+1);
52   }
53 
54   ratio -= 1.0;
55 
56   // assert(ratio>=0);
57   return bestScore - quasiglobal_closenessMultiplier * ratio;
58 }
59 
60 const unsigned char Diag = 1;
61 const unsigned char HuSkip = 2;
62 const unsigned char EnSkip = 3;
63 const unsigned char HuHuEnSkip = 4;
64 const unsigned char HuEnEnSkip = 5;
65 const unsigned char Dead = 6;
66 
buildDynProgMatrix(const AlignMatrix & w,const SentenceValues & huLength,const SentenceValues & enLength,QuasiDiagonal<double> & v,TrelliMatrix & trellis)67 void buildDynProgMatrix( const AlignMatrix& w, const SentenceValues& huLength, const SentenceValues& enLength,
68                          QuasiDiagonal<double>& v, TrelliMatrix& trellis )
69 {
70   const int huBookSize = w.size();
71 
72 
73   int huPos,enPos;
74 
75   // v[huPos][enPos] gives the similarity of the [0,huPos) and [0,enPos) intervals.
76   // The smaller value, the better similarity. (Unlike in the original similarity matrix w, where bigger is better.)
77 
78   double infinity = 1e6;
79 
80   for ( huPos=0; huPos<=huBookSize; ++huPos )
81   {
82     int rowStart = v.rowStart(huPos);
83     int rowEnd   = v.rowEnd(huPos);
84     for ( enPos=rowStart; enPos<rowEnd; ++enPos )
85     {
86       double& val = v.cell(huPos,enPos);
87       unsigned char& trail = trellis.cell(huPos,enPos);
88 
89       bool quasiglobal_knightsMoveAllowed = true;
90       if (quasiglobal_knightsMoveAllowed)
91       {
92         double lengthFitness(0);
93 
94         bool quasiglobal_lengthFitnessApplied = true;
95 
96         // The array is indexed by the step directions. The smaller value, the better.
97         double values[Dead];
98         int i;
99         for ( i=1; i<Dead; ++i )
100           values[i] = infinity;
101 
102         if (huPos>0)
103         {
104           values[HuSkip] = v[huPos-1][enPos]   - skipScore;
105         }
106 
107         if (enPos>0)
108         {
109           values[EnSkip] = v[huPos][enPos-1]   - skipScore;
110         }
111 
112         if ((huPos>0) && (enPos>0))
113         {
114           if (quasiglobal_lengthFitnessApplied)
115           {
116             lengthFitness = closeness(huLength[huPos-1], enLength[enPos-1]);
117           }
118           else
119           {
120             lengthFitness = 0;
121           }
122 
123           values[Diag] = v[huPos-1][enPos-1] - w[huPos-1][enPos-1] - lengthFitness ;
124         }
125 
126         const double dotLength = 2.0 ;
127 
128         if ((huPos>1) && (enPos>0))
129         {
130           if (quasiglobal_lengthFitnessApplied)
131           {
132             lengthFitness = closeness(huLength[huPos-2]+huLength[huPos-1]+dotLength, enLength[enPos-1]);
133           }
134           else
135           {
136             lengthFitness = 0;
137           }
138 
139         }
140 
141         if ((huPos>0) && (enPos>1))
142         {
143           if (quasiglobal_lengthFitnessApplied)
144           {
145             // Attention, the two-sentence length is the first argument. Usually the Hungarian is the first argument, but not here.
146             lengthFitness = closeness(enLength[enPos-2]+enLength[enPos-1]+dotLength, huLength[huPos-1]);
147           }
148           else
149           {
150             lengthFitness = 0;
151           }
152 
153           const double& a = w[huPos-1][enPos-1] ;
154           const double& b = w[huPos-1][enPos-2] ;
155           values[HuEnEnSkip] = v[huPos-1][enPos-2] - ( a<b ? a : b ) - skipScore - lengthFitness ; // The worse of the two crossed square.
156         }
157 
158         unsigned char direction = Dead;
159         double bestValue = infinity;
160         for ( i=1; i<Dead; ++i )
161         {
162           if (values[i]<bestValue)
163           {
164             bestValue = values[i];
165             direction = i;
166           }
167         }
168 
169         trail = direction;
170         if (direction==Dead)
171         {
172           val = 0;
173         }
174         else
175         {
176           val = bestValue;
177         }
178       }
179       else // (!quasiglobal_knightsMoveAllowed)
180       {
181         int borderCase = ( (huPos==0) ? 0 : 2 ) + ( (enPos==0) ? 0 : 1 ) ;
182 
183         switch (borderCase)
184         {
185         case 0:
186           {
187             val = 0;
188             trail = Dead;
189             break;
190           }
191         case 1: // huPos==0
192           {
193             val = v[0][enPos-1] - skipScore ;
194             trail = EnSkip;
195             break;
196           }
197         case 2: // enPos==0
198           {
199             val = v[huPos-1][0] - skipScore ;
200             trail = HuSkip;
201             break;
202           }
203         case 3:
204           {
205             double x  = v[huPos-1][enPos]   - skipScore ;
206             double y  = v[huPos]  [enPos-1] - skipScore ;
207             double xy = v[huPos-1][enPos-1] - w[huPos-1][enPos-1] ;
208 
209             double best = xy;
210             trail = Diag;
211             if (x<best)
212             {
213               best = x;
214               trail = HuSkip;
215             }
216             if (y<best)
217             {
218               best = y;
219               trail = EnSkip;
220             }
221             val = best;
222             break;
223           }
224         }
225       }
226     }
227   }
228 }
229 
trelliToLadder(const TrelliMatrix & trellis,Trail & bestTrail)230 void trelliToLadder( const TrelliMatrix& trellis, Trail& bestTrail )
231 {
232   bestTrail.clear();
233 
234   // The -1 is needed because the trellis matrix is one larger than the similarity matrix.
235   // This points to its downmost rightmost element.
236   const int huBookSize = trellis.size()-1;
237   const int enBookSize = trellis.otherSize()-1;
238 
239   int huPos=huBookSize;
240   int enPos=enBookSize;
241 
242   bool logging = false;
243 
244   if (logging) std::wcerr << std::endl;
245 
246   bool over = false;
247   bool hopelesslyBadTrail = false;
248   bestTrail.push_back(std::make_pair(huPos,enPos));
249 
250   while (true)
251   {
252     unsigned char trelli = trellis[huPos][enPos];
253 
254     if ((huPos==0) || (enPos==0))
255       break;
256 
257     switch (trelli)
258     {
259     case Diag :
260     {
261       --huPos;
262       --enPos;
263       break;
264     }
265     case HuSkip :
266     {
267       --huPos;
268       break;
269     }
270     case EnSkip :
271     {
272       --enPos;
273       break;
274     }
275     case HuHuEnSkip :
276     {
277       huPos -= 2;
278       --enPos;
279       break;
280     }
281     case HuEnEnSkip :
282     {
283       --huPos;
284       enPos -= 2;
285       break;
286     }
287     case Dead :
288     {
289       over = true;
290       break;
291     }
292     default:
293     {
294       hopelesslyBadTrail = true;
295       over = true;
296       break;
297     }
298     }
299 
300     if (over)
301       break;
302 
303     bestTrail.push_back(std::make_pair(huPos,enPos));
304 
305     if (logging)
306     {
307       std::wcerr << huPos << " \t" << enPos << std::endl;
308     }
309 
310   }
311 
312   if (hopelesslyBadTrail)
313   {
314     bestTrail.clear();
315     bestTrail.push_back(std::make_pair(huBookSize,enBookSize));
316     bestTrail.push_back(std::make_pair(0,0));
317     std::wcerr << "Error: hopelessly bad trail." << std::endl;
318   }
319 
320   std::reverse(bestTrail.begin(),  bestTrail.end()  );
321 }
322 
323 
align(const AlignMatrix & w,const SentenceValues & huLength,const SentenceValues & enLength,Trail & bestTrail,AlignMatrix & v)324 void align( const AlignMatrix& w, const SentenceValues& huLength, const SentenceValues& enLength,
325             Trail& bestTrail, AlignMatrix& v )
326 {
327   const int huBookSize = w.size();
328   const int enBookSize = w.otherSize();
329   const int thickness  = w.thickness();
330 
331   massert(w.size()+1 == v.size());
332   massert(w.otherSize()+1 == v.otherSize());
333 
334   TrelliMatrix trellis( huBookSize+1,enBookSize+1,thickness, Dead );
335 
336   buildDynProgMatrix( w, huLength, enLength, v, trellis );
337 
338 //  std::wcerr << "Matrix built." << std::endl;
339 
340   trelliToLadder( trellis, bestTrail );
341 
342 //  std::wcerr << "Trail found." << std::endl;
343 }
344 
345 
oneToOne(const Trail & bestTrail,int pos)346 bool oneToOne( const Trail& bestTrail, int pos )
347 {
348   return (
349       ( bestTrail[pos+1].first -bestTrail[pos].first  == 1 )
350         &&
351       ( bestTrail[pos+1].second-bestTrail[pos].second == 1 )
352      );
353 }
354 
355 
countIntersectionOfTrails(const Trail & sx,const Trail & sy)356 int countIntersectionOfTrails( const Trail& sx, const Trail& sy )
357 {
358   int inter(0);
359 
360   Trail::const_iterator sxt = sx.begin();
361   Trail::const_iterator syt = sy.begin();
362   Trail::const_iterator sxe = sx.end();
363   Trail::const_iterator sye = sy.end();
364   for ( ; sxt!=sxe && syt!=sye ; )
365   {
366     if ( *sxt < *syt )
367       ++sxt;
368     else if ( *sxt > *syt )
369       ++syt;
370     else
371     {
372       ++inter;
373       ++sxt;
374       ++syt;
375     }
376   }
377   return inter;
378 }
379 
380 
381 // A bit of an abuse of the fact that Trail and BisentenceList are typedef'd to the same structure.
scoreTrailOrBisentenceList(const Trail & trailAuto,const Trail & trailHand)382 double scoreTrailOrBisentenceList( const Trail& trailAuto, const Trail& trailHand )
383 {
384   int score = countIntersectionOfTrails( trailAuto, trailHand );
385 
386   std::wcerr << trailAuto.size()-score << " misaligned out of " << trailHand.size() << " correct items, "
387     << trailAuto.size() << " bets." << std::endl;
388 
389   std::wcerr << "Precision: " << 1.0*score/trailAuto.size()
390     << ", Recall: " << 1.0*score/trailHand.size() << std::endl;
391 
392   double ratio = 1.0*(trailAuto.size()-score)/trailAuto.size();
393   return ratio;
394 }
395 
396 
trailToBisentenceList(const Trail & bestTrail,BisentenceList & bisentenceList)397 void trailToBisentenceList( const Trail& bestTrail,
398                             BisentenceList& bisentenceList )
399 {
400   bisentenceList.clear();
401 
402   int trailSize = bestTrail.size();
403 
404   for ( int pos=0; pos<trailSize-1; ++pos )
405   {
406     if ( oneToOne(bestTrail,pos) )
407     {
408       bisentenceList.push_back(bestTrail  [pos]);
409     }
410   }
411 }
412 
413 
scoreBisentenceList(const BisentenceList & bisentenceListAuto,const Trail & trailHand)414 double scoreBisentenceList( const BisentenceList& bisentenceListAuto, const Trail& trailHand )
415 {
416   BisentenceList bisentenceListHand;
417   trailToBisentenceList( trailHand, bisentenceListHand );
418 
419   double score = scoreTrailOrBisentenceList( bisentenceListAuto, bisentenceListHand ) ;
420 
421   return score;
422 }
423 
scoreTrail(const Trail & trailAuto,const Trail & trailHand)424 double scoreTrail( const Trail& trailAuto, const Trail& trailHand )
425 {
426   return ( scoreTrailOrBisentenceList( trailAuto, trailHand ) );
427 }
428 
429 
setBox(AlignMatrix & m,int huPos,int enPos,int radius,int insideOfRadiusValue)430 void setBox( AlignMatrix& m, int huPos, int enPos, int radius, int insideOfRadiusValue )
431 {
432   for ( int x=huPos-radius; x<=huPos+radius; ++x )
433   {
434     for ( int y=enPos-radius; y<=enPos+radius; ++y )
435     {
436       if ( (x>=0) && (x<m.size()) && (y>=0) && (y<m.otherSize()) )
437       {
438         m.cell(x,y) = insideOfRadiusValue ; // ToDo: Should this be (y,x) instead? Function has args y,x not x,y. Fix here or function
439       }
440     }
441   }
442 }
443 
444 // Fills the complement of the radius of the trail with minus infties.
445 // The return value true means success. Failure means that during the fill,
446 // we intersected the outside of the quasidiagonal area.
447 // In this case, the operation is not finished.
borderDetailedAlignMatrix(AlignMatrix & alignMatrix,const Trail & trail,int radius)448 bool borderDetailedAlignMatrix( AlignMatrix& alignMatrix, const Trail& trail, int radius )
449 {
450   int huBookSize = alignMatrix.size();
451 
452   int huPos, enPos;
453   for ( huPos=0; huPos<huBookSize; ++huPos )
454   {
455     int rowStart = alignMatrix.rowStart(huPos);
456     int rowEnd   = alignMatrix.rowEnd(huPos);
457     for ( enPos=rowStart; enPos<rowEnd; ++enPos )
458     {
459       alignMatrix.cell(huPos,enPos) = outsideOfRadiusValue;
460     }
461   }
462 
463   // We seriously use the fact that many-to-zero segments are subdivided into one-to-zero segments.
464   // Inside setBox, an exception is thrown if we try to write outside the quasidiagonal.
465   // If we catch such an exception, it means that the quasidiagonal is not thick enough.
466   // In this case, we abandon the whole align, just to be sure.
467   try
468   {
469     for ( size_t i=0; i<trail.size(); ++i )
470     {
471       setBox( alignMatrix, trail[i].first, trail[i].second, radius, insideOfRadiusValue );
472     }
473   }
474   catch ( const char* errorType )
475   {
476     massert( std::string(errorType) == "out of quasidiagonal" )
477     return false;
478   }
479 
480   bool verify = true;
481   if (verify)
482   {
483     int numberOfEvaluatedItems(0);
484     for ( huPos=0; huPos<huBookSize; ++huPos )
485     {
486       int rowStart = alignMatrix.rowStart(huPos);
487       int rowEnd   = alignMatrix.rowEnd(huPos);
488       for ( enPos=rowStart; enPos<rowEnd; ++enPos )
489       {
490         if (alignMatrix[huPos][enPos]==insideOfRadiusValue)
491         {
492           ++numberOfEvaluatedItems;
493         }
494       }
495     }
496 
497     std::wcerr << numberOfEvaluatedItems << " items inside the border." << std::endl;
498   }
499 
500   return true;
501 }
502 
503 template <class T>
dumpAlignMatrix(const QuasiDiagonal<T> & alignMatrix)504 void dumpAlignMatrix( const QuasiDiagonal<T>& alignMatrix )
505 {
506   int huPos,enPos;
507 
508   int huBookSize = alignMatrix.size();
509   int enBookSize = alignMatrix.otherSize();
510 
511   for ( huPos=0; huPos<huBookSize; ++huPos )
512   {
513     for ( enPos=0; enPos<enBookSize; ++enPos )
514     {
515       int start = alignMatrix.rowStart(huPos);
516       int end   = alignMatrix.rowEnd  (huPos);
517 
518       if ( (enPos<start) || (enPos>=end) )
519       {
520         std::cout << "-1\t";
521         continue;
522       }
523 
524       std::cout << alignMatrix[huPos][enPos] << "\t";
525     }
526     std::cout << std::endl;
527   }
528 }
529 
dumpAlignMatrix(const QuasiDiagonal<int> & alignMatrix,bool graphical)530 void dumpAlignMatrix( const QuasiDiagonal<int>& alignMatrix, bool graphical )
531 {
532   int huPos,enPos;
533 
534   int huBookSize = alignMatrix.size();
535   int enBookSize = alignMatrix.otherSize();
536 
537   for ( huPos=0; huPos<huBookSize; ++huPos )
538   {
539     for ( enPos=0; enPos<enBookSize; ++enPos )
540     {
541       int start = alignMatrix.rowStart(huPos);
542       int end   = alignMatrix.rowEnd  (huPos);
543 
544       if ( (enPos<start) || (enPos>=end) )
545       {
546         if (graphical)
547         {
548           std::cout << "   ";
549         }
550         else
551         {
552           std::cout << "-1\t";
553         }
554         continue;
555       }
556 
557       if (graphical)
558       {
559         char c(' ');
560         switch (alignMatrix[huPos][enPos])
561         {
562           case 0: c=' '; break;
563           case 1: c='.'; break;
564           case 2: c=':'; break;
565           case 3: c='|'; break;
566           case 4: c='+'; break;
567           default: c='X'; break;
568         }
569         std::cout << c << " ";
570       }
571       else
572       {
573         std::cout << alignMatrix[huPos][enPos] << "\t";
574       }
575     }
576     std::cout << std::endl;
577   }
578 }
579 
dumpTrelliMatrix(const TrelliMatrix & trellis)580 void dumpTrelliMatrix( const TrelliMatrix& trellis )
581 {
582   std::map<int, std::string> directions;
583 
584   directions[Diag] = "HuEn";
585   directions[HuSkip] = "Hu";
586   directions[EnSkip] = "En";
587   directions[HuHuEnSkip] = "HuHuEn";
588   directions[HuEnEnSkip] = "HuEnEn";
589   directions[Dead] = "Dead";
590 
591   int huPos,enPos;
592 
593   int huBookSize = trellis.size();
594   int enBookSize = trellis.otherSize();
595 
596   for ( huPos=0; huPos<huBookSize; ++huPos )
597   {
598     for ( enPos=0; enPos<enBookSize; ++enPos )
599     {
600       int start = trellis.rowStart(huPos);
601       int end   = trellis.rowEnd  (huPos);
602 
603       if ( (enPos<start) || (enPos>=end) )
604       {
605         std::cout << "-1\t";
606         continue;
607       }
608 
609       std::cout << directions[trellis[huPos][enPos]] << "\t";
610     }
611     std::cout << std::endl;
612   }
613 }
614 
615 } // namespace TMXAligner
616