1 /* -*- Mode: C++; tab-width: 4; indent-tabs-mode: nil; c-basic-offset: 4 -*- */
2 /*
3  * This file is part of the LibreOffice project.
4  *
5  * This Source Code Form is subject to the terms of the Mozilla Public
6  * License, v. 2.0. If a copy of the MPL was not distributed with this
7  * file, You can obtain one at http://mozilla.org/MPL/2.0/.
8  *
9  * This file incorporates work covered by the following license notice:
10  *
11  *   Licensed to the Apache Software Foundation (ASF) under one or more
12  *   contributor license agreements. See the NOTICE file distributed
13  *   with this work for additional information regarding copyright
14  *   ownership. The ASF licenses this file to you under the Apache
15  *   License, Version 2.0 (the "License"); you may not use this file
16  *   except in compliance with the License. You may obtain a copy of
17  *   the License at http://www.apache.org/licenses/LICENSE-2.0 .
18  */
19 
20 /*
21 
22     Weighted Levenshtein Distance
23     including wildcards
24     '*' for any number (0 or more) of arbitrary characters
25     '?' for exactly one arbitrary character
26     escapable with  backslash, "\*" or "\?"
27 
28     Return:
29         WLD if WLD <= nLimit, else nLimit+1
30 
31     or, if bSplitCount:
32         WLD if WLD <= nLimit
33         -WLD if Replace and Insert and Delete <= nLimit
34         else nLimit+1
35 
36     Recursive definition of WLD:
37 
38     WLD( X(i), Y(j) ) = min( WLD( X(i-1), Y(j-1) ) + p(i,j) ,
39                              WLD( X(i)  , Y(j-1) ) + q      ,
40                              WLD( X(i-1), Y(j)   ) + r      )
41 
42     X(i)   := the first i characters of the word X
43     Y(j)   := the first j characters of the word Y
44     p(i,j) := 0 if i-th character of X == j-th character of Y,
45               p else
46 
47     Boundary conditions:
48     WLD( X(0), Y(j) ) := j*q  (Y created by j inserts)
49     WLD( X(i), Y(0) ) := i*r  (Y created by i deletes)
50     WLD( X(0), Y(0) ) := 0
51 
52     Instead of recursions a dynamic algorithm is used.
53 
54     See also: German computer magazine
55     c't 07/89 pages 192-208 and c't 03/94 pages 230-239
56 */
57 
58 #include <algorithm>
59 #include <numeric>
60 
61 #include "levdis.hxx"
62 
63 #define LEVDISBIG   (nLimit + 1)    // Return value if distance > nLimit
64 #define LEVDISDOUBLEBUF 2048        // no doubling atop this border
65 
Impl_WLD_StringLen(const sal_Unicode * pStr)66 static sal_Int32 Impl_WLD_StringLen( const sal_Unicode* pStr )
67 {
68     const sal_Unicode* pTempStr = pStr;
69     while( *pTempStr )
70         pTempStr++;
71     return static_cast<sal_Int32>(pTempStr-pStr);
72 }
73 
74 // Distance from string to pattern
WLD(const sal_Unicode * cString,sal_Int32 nStringLen)75 int WLevDistance::WLD( const sal_Unicode* cString, sal_Int32 nStringLen )
76 {
77     int nSPMin = 0;     // penalty point Minimum
78     int nRepS = 0;      // for SplitCount
79 
80     // length difference between pattern and string
81     int nLenDiff = nPatternLen - nStars - nStringLen;
82     // more insertions or deletions necessary as the limit? Then leave
83     if ( (nLenDiff * nInsQ0 > nLimit)
84             || ((nStars == 0) && (nLenDiff * nDelR0 < -nLimit)) )
85         return LEVDISBIG;
86 
87      // comparative String greater than  instantaneous array
88     // -> adapt array size
89     if ( nStringLen >= nArrayLen )
90     {
91         // increase size much more to avoid reallocation
92         if ( nStringLen < LEVDISDOUBLEBUF )
93             nArrayLen = 2 * nStringLen;
94         else
95             nArrayLen = nStringLen + 1;
96         npDistance = aDisMem.NewMem( nArrayLen );
97     }
98 
99     // Calculate start values of the second column (first pattern value).
100     // First column (0-Len pattern) is always zero .. nStringLen * nInsQ0,
101     // therefore the minimum is 0
102     if ( nPatternLen == 0 )
103     {
104         // Count of deletions to reach pattern
105         for ( sal_Int32 i=0; i <= nStringLen; i++ )
106             npDistance[i] = i * nDelR0;
107     }
108     else if ( cpPattern[0] == '*' && bpPatIsWild[0] )
109     {
110         // instead of a '*' you can fit in anything
111         for ( sal_Int32 i=0; i <= nStringLen; i++ )
112             npDistance[i] = 0;
113     }
114     else
115     {
116         sal_Unicode c;
117         int nP;
118         c = cpPattern[0];
119         if ( c == '?' && bpPatIsWild[0] )
120             nP = 0;     // a '?' could be any character.
121         else
122             // Minimum of replacement and deletion+insertion weighting
123             nP = std::min({ nRepP0, nRepP0, nDelR0 + nInsQ0 });
124         npDistance[0] = nInsQ0;     // start with simple insert
125         npDistance[1] = nInsQ0;
126         npDistance[2] = nInsQ0;
127         int nReplacePos = -1;       // tristate flag
128         int nDelCnt = 0;
129         for ( sal_Int32 i=1; i <= nStringLen; i++, nDelCnt += nDelR0 )
130         {
131             if ( cString[i-1] == c )
132                 nP = 0;     // Replace from this position is 0
133             // Deletions to match pattern + Replace
134             npDistance[i] = nDelCnt + nP;
135             if ( bSplitCount )
136             {
137                 if ( nReplacePos < 0 && nP )
138                 {   // this position will be replaced
139                     nRepS++;
140                     nReplacePos = i;
141                 }
142                 else if ( nReplacePos > 0 && !nP )
143                 {
144                     // same count of c
145                     int nBalance = levdisbalance( 0, i-1, c, cString, nStringLen );
146                     if ( !nBalance )
147                     {   // one was replaced that was an insertion instead
148                         nRepS--;
149                         nReplacePos = 0;
150                     }
151                 }
152             }
153         }
154         nSPMin = std::min({ npDistance[0], npDistance[1], npDistance[2] });
155     }
156 
157     // calculate distance matrix
158     sal_Int32 j = 0;        // for all columns of the pattern, till limit is not reached
159     while ( (j < nPatternLen-1)
160             && nSPMin <= (bSplitCount ? 2 * nLimit : nLimit) )
161     {
162         sal_Unicode c;
163         int nP, nQ, nR, nPij, d2;
164 
165         j++;
166         c = cpPattern[j];
167         if ( bpPatIsWild[j] )   // '*' or '?' not escaped
168             nP = 0;     // could be replaced without penalty
169         else
170             nP = nRepP0;
171         if ( c == '*' && bpPatIsWild[j] )
172         {
173             nQ = 0;     // insertion and deletion without penalty
174             nR = 0;
175         }
176         else
177         {
178             nQ = nInsQ0;    // usual weighting
179             nR = nDelR0;
180         }
181         d2 = npDistance[0];
182         // increase insert count to get from null string to pattern
183         npDistance[0] = npDistance[0] + nQ;
184         nSPMin = npDistance[0];
185         int nReplacePos = -1;       // tristate flag
186         // for each pattern column run through the string
187         for ( sal_Int32 i=1; i <= nStringLen; i++ )
188         {
189             int d1 = d2;            // WLD( X(i-1), Y(j-1) )
190             d2 = npDistance[i];     // WLD( X(i)  , Y(j-1) )
191             if ( cString[i-1] == c )
192             {
193                 nPij = 0;           // p(i,j)
194                 if ( nReplacePos < 0 )
195                 {
196                     // same count of c
197                     int nBalance = levdisbalance( j, i-1, c, cString, nStringLen );
198                     if ( !nBalance )
199                         nReplacePos = 0;    // no replacement
200                 }
201             }
202             else
203                 nPij = nP;
204             // WLD( X(i), Y(j) ) = min( WLD( X(i-1), Y(j-1) ) + p(i,j) ,
205             //                          WLD( X(i)  , Y(j-1) ) + q      ,
206             //                          WLD( X(i-1), Y(j)   ) + r      )
207             npDistance[i] = std::min({ d1 + nPij, d2 + nQ, npDistance[i-1] + nR });
208             if ( npDistance[i] < nSPMin )
209                 nSPMin = npDistance[i];
210             if ( bSplitCount )
211             {
212                 if ( nReplacePos < 0 && nPij && npDistance[i] == d1 + nPij )
213                 {   // this position will be replaced
214                     nRepS++;
215                     nReplacePos = i;
216                 }
217                 else if ( nReplacePos > 0 && !nPij )
218                 {
219                     // character is equal in string and pattern
220                     //
221                     // If from this point:
222                     // * pattern and string have the same count of this
223                     //   character
224                     // * and character count is the same before this position
225                     // then the replace was none.
226                     //
227                     // Scrambled letters are recognized here and the nRepS
228                     // replacement is withdrawn, whereby the double limit kicks
229                     // in.
230 
231                     // Same count of c
232                     int nBalance = levdisbalance( j, i-1, c, cString, nStringLen );
233                     if ( !nBalance )
234                     {   // one was replaced that was an insertion instead
235                         nRepS--;
236                         nReplacePos = 0;
237                     }
238                 }
239             }
240         }
241     }
242     if ( (nSPMin <= nLimit) && (npDistance[nStringLen] <= nLimit) )
243         return npDistance[nStringLen];
244     else
245     {
246         if ( bSplitCount )
247         {
248             if ( nRepS && nLenDiff > 0 )
249                 nRepS -= nLenDiff;      // Inserts were counted
250             if ( (nSPMin <= 2 * nLimit)
251                     && (npDistance[nStringLen] <= 2 * nLimit)
252                     && (nRepS * nRepP0 <= nLimit) )
253                 return -npDistance[nStringLen];
254             return LEVDISBIG;
255         }
256         return LEVDISBIG;
257     }
258 }
259 
260 // Calculating      nLimit,   nReplP0,    nInsQ0,     nDelR0,     bSplitCount
261 // from user values           nOtherX,    nShorterY,  nLongerZ,   bRelaxed
CalcLPQR(int nX,int nY,int nZ,bool bRelaxed)262 void WLevDistance::CalcLPQR( int nX, int nY, int nZ, bool bRelaxed )
263 {
264     if ( nX < 0 ) nX = 0;       // only positive values
265     if ( nY < 0 ) nY = 0;
266     if ( nZ < 0 ) nZ = 0;
267     if (0 == std::min({ nX, nY, nZ })) // at least one 0
268     {
269         int nMid, nMax;
270         nMax = std::max({ nX, nY, nZ }); // either 0 for three 0s or Max
271         if ( 0 == (nMid = Mid3( nX, nY, nZ )) ) // even two 0
272             nLimit = nMax;                      // either 0 or the only one >0
273         else                                    // one is 0
274             nLimit = std::lcm( nMid, nMax );
275     }
276     else                                        // all three of them are not 0
277         nLimit = std::lcm(std::lcm(nX, nY), nZ);
278     nRepP0 = ( nX ? nLimit / nX : nLimit + 1 );
279     nInsQ0 = ( nY ? nLimit / nY : nLimit + 1 );
280     nDelR0 = ( nZ ? nLimit / nZ : nLimit + 1 );
281     bSplitCount = bRelaxed;
282 }
283 
284 // The value in the middle
Mid3(int x,int y,int z)285 int WLevDistance::Mid3( int x, int y, int z )
286 {
287     int min = std::min({ x, y, z });
288     if ( x == min )
289         return std::min(y, z);
290     else if ( y == min )
291         return std::min(x, z);
292     else        // z == min
293         return std::min(x, y);
294 }
295 
296 // initialize data from CTOR
InitData(const sal_Unicode * cPattern)297 void WLevDistance::InitData( const sal_Unicode* cPattern )
298 {
299     cpPattern = aPatMem.GetcPtr();
300     bpPatIsWild = aPatMem.GetbPtr();
301     npDistance = aDisMem.GetPtr();
302     nStars = 0;
303     const sal_Unicode* cp1 = cPattern;
304     sal_Unicode* cp2 = cpPattern;
305     bool* bp = bpPatIsWild;
306     // copy pattern, count asterisks, escaped Jokers
307     while ( *cp1 )
308     {
309         if ( *cp1 == '\\' )     // maybe escaped
310         {
311             if ( *(cp1+1) == '*' || *(cp1+1) == '?' )   // next Joker?
312             {
313                 cp1++;          // skip '\\'
314                 nPatternLen--;
315             }
316             *bp++ = false;
317         }
318         else if ( *cp1 == '*' || *cp1 == '?' )      // Joker
319         {
320             if ( *cp1 == '*' )
321                 nStars++;
322             *bp++ = true;
323         }
324         else
325             *bp++ = false;
326         *cp2++ = *cp1++;
327     }
328     *cp2 = '\0';
329 }
330 
WLevDistance(const sal_Unicode * cPattern,int nOtherX,int nShorterY,int nLongerZ,bool bRelaxed)331 WLevDistance::WLevDistance( const sal_Unicode* cPattern,
332                             int nOtherX, int nShorterY, int nLongerZ,
333                             bool bRelaxed ) :
334     nPatternLen( Impl_WLD_StringLen(cPattern) ),
335     aPatMem( nPatternLen + 1 ),
336     nArrayLen( nPatternLen + 1 ),
337     aDisMem( nArrayLen )
338 {
339     InitData( cPattern );
340     CalcLPQR( nOtherX, nShorterY, nLongerZ, bRelaxed );
341 }
342 
343 // CopyCTor
WLevDistance(const WLevDistance & rWLD)344 WLevDistance::WLevDistance( const WLevDistance& rWLD ) :
345     nPatternLen( rWLD.nPatternLen ),
346     aPatMem( nPatternLen + 1 ),
347     nArrayLen( nPatternLen + 1 ),
348     aDisMem( nArrayLen ),
349     nLimit( rWLD.nLimit ),
350     nRepP0( rWLD.nRepP0 ),
351     nInsQ0( rWLD.nInsQ0 ),
352     nDelR0( rWLD.nDelR0 ),
353     nStars( rWLD.nStars ),
354     bSplitCount( rWLD.bSplitCount )
355 {
356     cpPattern = aPatMem.GetcPtr();
357     bpPatIsWild = aPatMem.GetbPtr();
358     npDistance = aDisMem.GetPtr();
359     sal_Int32 i;
360     for ( i=0; i<nPatternLen; i++ )
361     {
362         cpPattern[i] = rWLD.cpPattern[i];
363         bpPatIsWild[i] = rWLD.bpPatIsWild[i];
364     }
365     cpPattern[i] = '\0';
366 }
367 
368 // DTor
~WLevDistance()369 WLevDistance::~WLevDistance()
370 {
371 }
372 
373 /* vim:set shiftwidth=4 softtabstop=4 expandtab: */
374