1 /****************************************************************************
2 **
3 ** Copyright (C) 2015 The Qt Company Ltd.
4 ** Contact: http://www.qt.io/licensing/
5 **
6 ** This file is part of the Qt Linguist of the Qt Toolkit.
7 **
8 ** $QT_BEGIN_LICENSE:LGPL$
9 ** Commercial License Usage
10 ** Licensees holding valid commercial Qt licenses may use this file in
11 ** accordance with the commercial license agreement provided with the
12 ** Software or, alternatively, in accordance with the terms contained in
13 ** a written agreement between you and The Qt Company. For licensing terms
14 ** and conditions see http://www.qt.io/terms-conditions. For further
15 ** information use the contact form at http://www.qt.io/contact-us.
16 **
17 ** GNU Lesser General Public License Usage
18 ** Alternatively, this file may be used under the terms of the GNU Lesser
19 ** General Public License version 2.1 or version 3 as published by the Free
20 ** Software Foundation and appearing in the file LICENSE.LGPLv21 and
21 ** LICENSE.LGPLv3 included in the packaging of this file. Please review the
22 ** following information to ensure the GNU Lesser General Public License
23 ** requirements will be met: https://www.gnu.org/licenses/lgpl.html and
24 ** http://www.gnu.org/licenses/old-licenses/lgpl-2.1.html.
25 **
26 ** As a special exception, The Qt Company gives you certain additional
27 ** rights. These rights are described in The Qt Company LGPL Exception
28 ** version 1.1, included in the file LGPL_EXCEPTION.txt in this package.
29 **
30 ** GNU General Public License Usage
31 ** Alternatively, this file may be used under the terms of the GNU
32 ** General Public License version 3.0 as published by the Free Software
33 ** Foundation and appearing in the file LICENSE.GPL included in the
34 ** packaging of this file.  Please review the following information to
35 ** ensure the GNU General Public License version 3.0 requirements will be
36 ** met: http://www.gnu.org/copyleft/gpl.html.
37 **
38 ** $QT_END_LICENSE$
39 **
40 ****************************************************************************/
41 
42 #include "simtexth.h"
43 #include "translator.h"
44 
45 #include <QtCore/QByteArray>
46 #include <QtCore/QString>
47 #include <QtCore/QList>
48 
49 
50 QT_BEGIN_NAMESPACE
51 
52 typedef QList<TranslatorMessage> TML;
53 
54 /*
55   How similar are two texts?  The approach used here relies on co-occurrence
56   matrices and is very efficient.
57 
58   Let's see with an example: how similar are "here" and "hither"?  The
59   co-occurrence matrix M for "here" is M[h,e] = 1, M[e,r] = 1, M[r,e] = 1, and 0
60   elsewhere; the matrix N for "hither" is N[h,i] = 1, N[i,t] = 1, ...,
61   N[h,e] = 1, N[e,r] = 1, and 0 elsewhere.  The union U of both matrices is the
62   matrix U[i,j] = max { M[i,j], N[i,j] }, and the intersection V is
63   V[i,j] = min { M[i,j], N[i,j] }.  The score for a pair of texts is
64 
65       score = (sum of V[i,j] over all i, j) / (sum of U[i,j] over all i, j),
66 
67   a formula suggested by Arnt Gulbrandsen.  Here we have
68 
69       score = 2 / 6,
70 
71   or one third.
72 
73   The implementation differs from this in a few details.  Most importantly,
74   repetitions are ignored; for input "xxx", M[x,x] equals 1, not 2.
75 */
76 
77 /*
78   Every character is assigned to one of 20 buckets so that the co-occurrence
79   matrix requires only 20 * 20 = 400 bits, not 256 * 256 = 65536 bits or even
80   more if we want the whole Unicode.  Which character falls in which bucket is
81   arbitrary.
82 
83   The second half of the table is a replica of the first half, because of
84   laziness.
85 */
86 static const int indexOf[256] = {
87     0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,
88     0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,
89 //      !   "   #   $   %   &   '   (   )   *   +   ,   -   .   /
90     0,  2,  6,  7,  10, 12, 15, 19, 2,  6,  7,  10, 12, 15, 19, 0,
91 //  0   1   2   3   4   5   6   7   8   9   :   ;   <   =   >   ?
92     1,  3,  4,  5,  8,  9,  11, 13, 14, 16, 2,  6,  7,  10, 12, 15,
93 //  @   A   B   C   D   E   F   G   H   I   J   K   L   M   N   O
94     0,  1,  2,  3,  4,  5,  6,  7,  8,  9,  6,  10, 11, 12, 13, 14,
95 //  P   Q   R   S   T   U   V   W   X   Y   Z   [   \   ]   ^   _
96     15, 12, 16, 17, 18, 19, 2,  10, 15, 7,  19, 2,  6,  7,  10, 0,
97 //  `   a   b   c   d   e   f   g   h   i   j   k   l   m   n   o
98     0,  1,  2,  3,  4,  5,  6,  7,  8,  9,  6,  10, 11, 12, 13, 14,
99 //  p   q   r   s   t   u   v   w   x   y   z   {   |   }   ~
100     15, 12, 16, 17, 18, 19, 2,  10, 15, 7,  19, 2,  6,  7,  10, 0,
101 
102     0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,
103     0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,
104     0,  2,  6,  7,  10, 12, 15, 19, 2,  6,  7,  10, 12, 15, 19, 0,
105     1,  3,  4,  5,  8,  9,  11, 13, 14, 16, 2,  6,  7,  10, 12, 15,
106     0,  1,  2,  3,  4,  5,  6,  7,  8,  9,  6,  10, 11, 12, 13, 14,
107     15, 12, 16, 17, 18, 19, 2,  10, 15, 7,  19, 2,  6,  7,  10, 0,
108     0,  1,  2,  3,  4,  5,  6,  7,  8,  9,  6,  10, 11, 12, 13, 14,
109     15, 12, 16, 17, 18, 19, 2,  10, 15, 7,  19, 2,  6,  7,  10, 0
110 };
111 
112 /*
113   The entry bitCount[i] (for i between 0 and 255) is the number of bits used to
114   represent i in binary.
115 */
116 static const int bitCount[256] = {
117     0,  1,  1,  2,  1,  2,  2,  3,  1,  2,  2,  3,  2,  3,  3,  4,
118     1,  2,  2,  3,  2,  3,  3,  4,  2,  3,  3,  4,  3,  4,  4,  5,
119     1,  2,  2,  3,  2,  3,  3,  4,  2,  3,  3,  4,  3,  4,  4,  5,
120     2,  3,  3,  4,  3,  4,  4,  5,  3,  4,  4,  5,  4,  5,  5,  6,
121     1,  2,  2,  3,  2,  3,  3,  4,  2,  3,  3,  4,  3,  4,  4,  5,
122     2,  3,  3,  4,  3,  4,  4,  5,  3,  4,  4,  5,  4,  5,  5,  6,
123     2,  3,  3,  4,  3,  4,  4,  5,  3,  4,  4,  5,  4,  5,  5,  6,
124     3,  4,  4,  5,  4,  5,  5,  6,  4,  5,  5,  6,  5,  6,  6,  7,
125     1,  2,  2,  3,  2,  3,  3,  4,  2,  3,  3,  4,  3,  4,  4,  5,
126     2,  3,  3,  4,  3,  4,  4,  5,  3,  4,  4,  5,  4,  5,  5,  6,
127     2,  3,  3,  4,  3,  4,  4,  5,  3,  4,  4,  5,  4,  5,  5,  6,
128     3,  4,  4,  5,  4,  5,  5,  6,  4,  5,  5,  6,  5,  6,  6,  7,
129     2,  3,  3,  4,  3,  4,  4,  5,  3,  4,  4,  5,  4,  5,  5,  6,
130     3,  4,  4,  5,  4,  5,  5,  6,  4,  5,  5,  6,  5,  6,  6,  7,
131     3,  4,  4,  5,  4,  5,  5,  6,  4,  5,  5,  6,  5,  6,  6,  7,
132     4,  5,  5,  6,  5,  6,  6,  7,  5,  6,  6,  7,  6,  7,  7,  8
133 };
134 
135 struct CoMatrix
136 {
137     /*
138       The matrix has 20 * 20 = 400 entries.  This requires 50 bytes, or 13
139       words.  Some operations are performed on words for more efficiency.
140     */
141     union {
142         quint8 b[52];
143         quint32 w[13];
144     };
145 
CoMatrixCoMatrix146     CoMatrix() { memset( b, 0, 52 ); }
147 
CoMatrixCoMatrix148     CoMatrix(const QString &str)
149     {
150         QByteArray ba = str.toUtf8();
151         const char *text = ba.constData();
152         char c = '\0', d;
153         memset( b, 0, 52 );
154         /*
155           The Knuth books are not in the office only for show; they help make
156           loops 30% faster and 20% as readable.
157         */
158         while ( (d = *text) != '\0' ) {
159             setCoOccurence( c, d );
160             if ( (c = *++text) != '\0' ) {
161                 setCoOccurence( d, c );
162                 text++;
163             }
164         }
165     }
166 
setCoOccurenceCoMatrix167     void setCoOccurence( char c, char d ) {
168         int k = indexOf[(uchar) c] + 20 * indexOf[(uchar) d];
169         b[k >> 3] |= (1 << (k & 0x7));
170     }
171 
worthCoMatrix172     int worth() const {
173         int w = 0;
174         for ( int i = 0; i < 50; i++ )
175             w += bitCount[b[i]];
176         return w;
177     }
178 };
179 
reunion(const CoMatrix & m,const CoMatrix & n)180 static inline CoMatrix reunion(const CoMatrix &m, const CoMatrix &n)
181 {
182     CoMatrix p;
183     for (int i = 0; i < 13; ++i)
184         p.w[i] = m.w[i] | n.w[i];
185     return p;
186 }
187 
intersection(const CoMatrix & m,const CoMatrix & n)188 static inline CoMatrix intersection(const CoMatrix &m, const CoMatrix &n)
189 {
190     CoMatrix p;
191     for (int i = 0; i < 13; ++i)
192         p.w[i] = m.w[i] & n.w[i];
193     return p;
194 }
195 
StringSimilarityMatcher(const QString & stringToMatch)196 StringSimilarityMatcher::StringSimilarityMatcher(const QString &stringToMatch)
197 {
198     m_cm = new CoMatrix(stringToMatch);
199     m_length = stringToMatch.length();
200 }
201 
getSimilarityScore(const QString & strCandidate)202 int StringSimilarityMatcher::getSimilarityScore(const QString &strCandidate)
203 {
204     CoMatrix cmTarget(strCandidate);
205     int delta = qAbs(m_length - strCandidate.size());
206     int score = ( (intersection(*m_cm, cmTarget).worth() + 1) << 10 ) /
207         ( reunion(*m_cm, cmTarget).worth() + (delta << 1) + 1 );
208     return score;
209 }
210 
~StringSimilarityMatcher()211 StringSimilarityMatcher::~StringSimilarityMatcher()
212 {
213     delete m_cm;
214 }
215 
216 /**
217  * Checks how similar two strings are.
218  * The return value is the score, and a higher score is more similar
219  * than one with a low score.
220  * Linguist considers a score over 190 to be a good match.
221  * \sa StringSimilarityMatcher
222  */
getSimilarityScore(const QString & str1,const QString & str2)223 int getSimilarityScore(const QString &str1, const QString &str2)
224 {
225     CoMatrix cmTarget(str2);
226     CoMatrix cm(str1);
227     int delta = qAbs(str1.size() - str2.size());
228 
229     int score = ( (intersection(cm, cmTarget).worth() + 1) << 10 )
230         / ( reunion(cm, cmTarget).worth() + (delta << 1) + 1 );
231 
232     return score;
233 }
234 
similarTextHeuristicCandidates(const Translator * tor,const QString & text,int maxCandidates)235 CandidateList similarTextHeuristicCandidates(const Translator *tor,
236     const QString &text, int maxCandidates)
237 {
238     QList<int> scores;
239     CandidateList candidates;
240 
241     foreach (const TranslatorMessage &mtm, tor->messages()) {
242         if (mtm.type() == TranslatorMessage::Unfinished
243             || mtm.translation().isEmpty())
244             continue;
245 
246         QString s = mtm.sourceText();
247         int score = getSimilarityScore(s, text);
248 
249         if (candidates.size() == maxCandidates && score > scores[maxCandidates - 1] )
250             candidates.removeLast();
251 
252         if (candidates.size() < maxCandidates && score >= textSimilarityThreshold) {
253             Candidate cand( s, mtm.translation() );
254 
255             int i;
256             for (i = 0; i < candidates.size(); i++) {
257                 if (score >= scores.at(i)) {
258                     if (score == scores.at(i)) {
259                         if (candidates.at(i) == cand)
260                             goto continue_outer_loop;
261                     } else {
262                         break;
263                     }
264                 }
265             }
266             scores.insert(i, score);
267             candidates.insert(i, cand);
268         }
269         continue_outer_loop:
270         ;
271     }
272     return candidates;
273 }
274 
275 QT_END_NAMESPACE
276