1 /****************************************************************************
2 **
3 ** Copyright (C) 2015 The Qt Company Ltd.
4 ** Contact: http://www.qt.io/licensing/
5 **
6 ** This file is part of the Qt Linguist of the Qt Toolkit.
7 **
8 ** $QT_BEGIN_LICENSE:LGPL$
9 ** Commercial License Usage
10 ** Licensees holding valid commercial Qt licenses may use this file in
11 ** accordance with the commercial license agreement provided with the
12 ** Software or, alternatively, in accordance with the terms contained in
13 ** a written agreement between you and The Qt Company. For licensing terms
14 ** and conditions see http://www.qt.io/terms-conditions. For further
15 ** information use the contact form at http://www.qt.io/contact-us.
16 **
17 ** GNU Lesser General Public License Usage
18 ** Alternatively, this file may be used under the terms of the GNU Lesser
19 ** General Public License version 2.1 or version 3 as published by the Free
20 ** Software Foundation and appearing in the file LICENSE.LGPLv21 and
21 ** LICENSE.LGPLv3 included in the packaging of this file. Please review the
22 ** following information to ensure the GNU Lesser General Public License
23 ** requirements will be met: https://www.gnu.org/licenses/lgpl.html and
24 ** http://www.gnu.org/licenses/old-licenses/lgpl-2.1.html.
25 **
26 ** As a special exception, The Qt Company gives you certain additional
27 ** rights. These rights are described in The Qt Company LGPL Exception
28 ** version 1.1, included in the file LGPL_EXCEPTION.txt in this package.
29 **
30 ** GNU General Public License Usage
31 ** Alternatively, this file may be used under the terms of the GNU
32 ** General Public License version 3.0 as published by the Free Software
33 ** Foundation and appearing in the file LICENSE.GPL included in the
34 ** packaging of this file. Please review the following information to
35 ** ensure the GNU General Public License version 3.0 requirements will be
36 ** met: http://www.gnu.org/copyleft/gpl.html.
37 **
38 ** $QT_END_LICENSE$
39 **
40 ****************************************************************************/
41
42 #include "simtexth.h"
43 #include "translator.h"
44
45 #include <QtCore/QByteArray>
46 #include <QtCore/QString>
47 #include <QtCore/QList>
48
49
50 QT_BEGIN_NAMESPACE
51
52 typedef QList<TranslatorMessage> TML;
53
54 /*
55 How similar are two texts? The approach used here relies on co-occurrence
56 matrices and is very efficient.
57
58 Let's see with an example: how similar are "here" and "hither"? The
59 co-occurrence matrix M for "here" is M[h,e] = 1, M[e,r] = 1, M[r,e] = 1, and 0
60 elsewhere; the matrix N for "hither" is N[h,i] = 1, N[i,t] = 1, ...,
61 N[h,e] = 1, N[e,r] = 1, and 0 elsewhere. The union U of both matrices is the
62 matrix U[i,j] = max { M[i,j], N[i,j] }, and the intersection V is
63 V[i,j] = min { M[i,j], N[i,j] }. The score for a pair of texts is
64
65 score = (sum of V[i,j] over all i, j) / (sum of U[i,j] over all i, j),
66
67 a formula suggested by Arnt Gulbrandsen. Here we have
68
69 score = 2 / 6,
70
71 or one third.
72
73 The implementation differs from this in a few details. Most importantly,
74 repetitions are ignored; for input "xxx", M[x,x] equals 1, not 2.
75 */
76
77 /*
78 Every character is assigned to one of 20 buckets so that the co-occurrence
79 matrix requires only 20 * 20 = 400 bits, not 256 * 256 = 65536 bits or even
80 more if we want the whole Unicode. Which character falls in which bucket is
81 arbitrary.
82
83 The second half of the table is a replica of the first half, because of
84 laziness.
85 */
86 static const int indexOf[256] = {
87 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
88 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
89 // ! " # $ % & ' ( ) * + , - . /
90 0, 2, 6, 7, 10, 12, 15, 19, 2, 6, 7, 10, 12, 15, 19, 0,
91 // 0 1 2 3 4 5 6 7 8 9 : ; < = > ?
92 1, 3, 4, 5, 8, 9, 11, 13, 14, 16, 2, 6, 7, 10, 12, 15,
93 // @ A B C D E F G H I J K L M N O
94 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 6, 10, 11, 12, 13, 14,
95 // P Q R S T U V W X Y Z [ \ ] ^ _
96 15, 12, 16, 17, 18, 19, 2, 10, 15, 7, 19, 2, 6, 7, 10, 0,
97 // ` a b c d e f g h i j k l m n o
98 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 6, 10, 11, 12, 13, 14,
99 // p q r s t u v w x y z { | } ~
100 15, 12, 16, 17, 18, 19, 2, 10, 15, 7, 19, 2, 6, 7, 10, 0,
101
102 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
103 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
104 0, 2, 6, 7, 10, 12, 15, 19, 2, 6, 7, 10, 12, 15, 19, 0,
105 1, 3, 4, 5, 8, 9, 11, 13, 14, 16, 2, 6, 7, 10, 12, 15,
106 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 6, 10, 11, 12, 13, 14,
107 15, 12, 16, 17, 18, 19, 2, 10, 15, 7, 19, 2, 6, 7, 10, 0,
108 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 6, 10, 11, 12, 13, 14,
109 15, 12, 16, 17, 18, 19, 2, 10, 15, 7, 19, 2, 6, 7, 10, 0
110 };
111
112 /*
113 The entry bitCount[i] (for i between 0 and 255) is the number of bits used to
114 represent i in binary.
115 */
116 static const int bitCount[256] = {
117 0, 1, 1, 2, 1, 2, 2, 3, 1, 2, 2, 3, 2, 3, 3, 4,
118 1, 2, 2, 3, 2, 3, 3, 4, 2, 3, 3, 4, 3, 4, 4, 5,
119 1, 2, 2, 3, 2, 3, 3, 4, 2, 3, 3, 4, 3, 4, 4, 5,
120 2, 3, 3, 4, 3, 4, 4, 5, 3, 4, 4, 5, 4, 5, 5, 6,
121 1, 2, 2, 3, 2, 3, 3, 4, 2, 3, 3, 4, 3, 4, 4, 5,
122 2, 3, 3, 4, 3, 4, 4, 5, 3, 4, 4, 5, 4, 5, 5, 6,
123 2, 3, 3, 4, 3, 4, 4, 5, 3, 4, 4, 5, 4, 5, 5, 6,
124 3, 4, 4, 5, 4, 5, 5, 6, 4, 5, 5, 6, 5, 6, 6, 7,
125 1, 2, 2, 3, 2, 3, 3, 4, 2, 3, 3, 4, 3, 4, 4, 5,
126 2, 3, 3, 4, 3, 4, 4, 5, 3, 4, 4, 5, 4, 5, 5, 6,
127 2, 3, 3, 4, 3, 4, 4, 5, 3, 4, 4, 5, 4, 5, 5, 6,
128 3, 4, 4, 5, 4, 5, 5, 6, 4, 5, 5, 6, 5, 6, 6, 7,
129 2, 3, 3, 4, 3, 4, 4, 5, 3, 4, 4, 5, 4, 5, 5, 6,
130 3, 4, 4, 5, 4, 5, 5, 6, 4, 5, 5, 6, 5, 6, 6, 7,
131 3, 4, 4, 5, 4, 5, 5, 6, 4, 5, 5, 6, 5, 6, 6, 7,
132 4, 5, 5, 6, 5, 6, 6, 7, 5, 6, 6, 7, 6, 7, 7, 8
133 };
134
135 struct CoMatrix
136 {
137 /*
138 The matrix has 20 * 20 = 400 entries. This requires 50 bytes, or 13
139 words. Some operations are performed on words for more efficiency.
140 */
141 union {
142 quint8 b[52];
143 quint32 w[13];
144 };
145
CoMatrixCoMatrix146 CoMatrix() { memset( b, 0, 52 ); }
147
CoMatrixCoMatrix148 CoMatrix(const QString &str)
149 {
150 QByteArray ba = str.toUtf8();
151 const char *text = ba.constData();
152 char c = '\0', d;
153 memset( b, 0, 52 );
154 /*
155 The Knuth books are not in the office only for show; they help make
156 loops 30% faster and 20% as readable.
157 */
158 while ( (d = *text) != '\0' ) {
159 setCoOccurence( c, d );
160 if ( (c = *++text) != '\0' ) {
161 setCoOccurence( d, c );
162 text++;
163 }
164 }
165 }
166
setCoOccurenceCoMatrix167 void setCoOccurence( char c, char d ) {
168 int k = indexOf[(uchar) c] + 20 * indexOf[(uchar) d];
169 b[k >> 3] |= (1 << (k & 0x7));
170 }
171
worthCoMatrix172 int worth() const {
173 int w = 0;
174 for ( int i = 0; i < 50; i++ )
175 w += bitCount[b[i]];
176 return w;
177 }
178 };
179
reunion(const CoMatrix & m,const CoMatrix & n)180 static inline CoMatrix reunion(const CoMatrix &m, const CoMatrix &n)
181 {
182 CoMatrix p;
183 for (int i = 0; i < 13; ++i)
184 p.w[i] = m.w[i] | n.w[i];
185 return p;
186 }
187
intersection(const CoMatrix & m,const CoMatrix & n)188 static inline CoMatrix intersection(const CoMatrix &m, const CoMatrix &n)
189 {
190 CoMatrix p;
191 for (int i = 0; i < 13; ++i)
192 p.w[i] = m.w[i] & n.w[i];
193 return p;
194 }
195
StringSimilarityMatcher(const QString & stringToMatch)196 StringSimilarityMatcher::StringSimilarityMatcher(const QString &stringToMatch)
197 {
198 m_cm = new CoMatrix(stringToMatch);
199 m_length = stringToMatch.length();
200 }
201
getSimilarityScore(const QString & strCandidate)202 int StringSimilarityMatcher::getSimilarityScore(const QString &strCandidate)
203 {
204 CoMatrix cmTarget(strCandidate);
205 int delta = qAbs(m_length - strCandidate.size());
206 int score = ( (intersection(*m_cm, cmTarget).worth() + 1) << 10 ) /
207 ( reunion(*m_cm, cmTarget).worth() + (delta << 1) + 1 );
208 return score;
209 }
210
~StringSimilarityMatcher()211 StringSimilarityMatcher::~StringSimilarityMatcher()
212 {
213 delete m_cm;
214 }
215
216 /**
217 * Checks how similar two strings are.
218 * The return value is the score, and a higher score is more similar
219 * than one with a low score.
220 * Linguist considers a score over 190 to be a good match.
221 * \sa StringSimilarityMatcher
222 */
getSimilarityScore(const QString & str1,const QString & str2)223 int getSimilarityScore(const QString &str1, const QString &str2)
224 {
225 CoMatrix cmTarget(str2);
226 CoMatrix cm(str1);
227 int delta = qAbs(str1.size() - str2.size());
228
229 int score = ( (intersection(cm, cmTarget).worth() + 1) << 10 )
230 / ( reunion(cm, cmTarget).worth() + (delta << 1) + 1 );
231
232 return score;
233 }
234
similarTextHeuristicCandidates(const Translator * tor,const QString & text,int maxCandidates)235 CandidateList similarTextHeuristicCandidates(const Translator *tor,
236 const QString &text, int maxCandidates)
237 {
238 QList<int> scores;
239 CandidateList candidates;
240
241 foreach (const TranslatorMessage &mtm, tor->messages()) {
242 if (mtm.type() == TranslatorMessage::Unfinished
243 || mtm.translation().isEmpty())
244 continue;
245
246 QString s = mtm.sourceText();
247 int score = getSimilarityScore(s, text);
248
249 if (candidates.size() == maxCandidates && score > scores[maxCandidates - 1] )
250 candidates.removeLast();
251
252 if (candidates.size() < maxCandidates && score >= textSimilarityThreshold) {
253 Candidate cand( s, mtm.translation() );
254
255 int i;
256 for (i = 0; i < candidates.size(); i++) {
257 if (score >= scores.at(i)) {
258 if (score == scores.at(i)) {
259 if (candidates.at(i) == cand)
260 goto continue_outer_loop;
261 } else {
262 break;
263 }
264 }
265 }
266 scores.insert(i, score);
267 candidates.insert(i, cand);
268 }
269 continue_outer_loop:
270 ;
271 }
272 return candidates;
273 }
274
275 QT_END_NAMESPACE
276