LibreOffice Module i18npool (master) 1
levdis.cxx
Go to the documentation of this file.
1/* -*- Mode: C++; tab-width: 4; indent-tabs-mode: nil; c-basic-offset: 4 -*- */
2/*
3 * This file is part of the LibreOffice project.
4 *
5 * This Source Code Form is subject to the terms of the Mozilla Public
6 * License, v. 2.0. If a copy of the MPL was not distributed with this
7 * file, You can obtain one at http://mozilla.org/MPL/2.0/.
8 *
9 * This file incorporates work covered by the following license notice:
10 *
11 * Licensed to the Apache Software Foundation (ASF) under one or more
12 * contributor license agreements. See the NOTICE file distributed
13 * with this work for additional information regarding copyright
14 * ownership. The ASF licenses this file to you under the Apache
15 * License, Version 2.0 (the "License"); you may not use this file
16 * except in compliance with the License. You may obtain a copy of
17 * the License at http://www.apache.org/licenses/LICENSE-2.0 .
18 */
19
20/*
21
22 Weighted Levenshtein Distance
23 including wildcards
24 '*' for any number (0 or more) of arbitrary characters
25 '?' for exactly one arbitrary character
26 escapable with backslash, "\*" or "\?"
27
28 Return:
29 WLD if WLD <= nLimit, else nLimit+1
30
31 or, if bSplitCount:
32 WLD if WLD <= nLimit
33 -WLD if Replace and Insert and Delete <= nLimit
34 else nLimit+1
35
36 Recursive definition of WLD:
37
38 WLD( X(i), Y(j) ) = min( WLD( X(i-1), Y(j-1) ) + p(i,j) ,
39 WLD( X(i) , Y(j-1) ) + q ,
40 WLD( X(i-1), Y(j) ) + r )
41
42 X(i) := the first i characters of the word X
43 Y(j) := the first j characters of the word Y
44 p(i,j) := 0 if i-th character of X == j-th character of Y,
45 p else
46
47 Boundary conditions:
48 WLD( X(0), Y(j) ) := j*q (Y created by j inserts)
49 WLD( X(i), Y(0) ) := i*r (Y created by i deletes)
50 WLD( X(0), Y(0) ) := 0
51
52 Instead of recursions a dynamic algorithm is used.
53
54 See also: German computer magazine
55 c't 07/89 pages 192-208 and c't 03/94 pages 230-239
56*/
57
58#include <algorithm>
59#include <numeric>
60
61#include "levdis.hxx"
62
63#define LEVDISBIG (nLimit + 1) // Return value if distance > nLimit
64#define LEVDISDOUBLEBUF 2048 // no doubling atop this border
65
66static sal_Int32 Impl_WLD_StringLen( const sal_Unicode* pStr )
67{
68 const sal_Unicode* pTempStr = pStr;
69 while( *pTempStr )
70 pTempStr++;
71 return static_cast<sal_Int32>(pTempStr-pStr);
72}
73
74// Distance from string to pattern
75int WLevDistance::WLD( const sal_Unicode* cString, sal_Int32 nStringLen )
76{
77 int nSPMin = 0; // penalty point Minimum
78 int nRepS = 0; // for SplitCount
79
80 // length difference between pattern and string
81 int nLenDiff = nPatternLen - nStars - nStringLen;
82 // more insertions or deletions necessary as the limit? Then leave
83 if ( (nLenDiff * nInsQ0 > nLimit)
84 || ((nStars == 0) && (nLenDiff * nDelR0 < -nLimit)) )
85 return LEVDISBIG;
86
87 // comparative String greater than instantaneous array
88 // -> adapt array size
89 if ( nStringLen >= nArrayLen )
90 {
91 // increase size much more to avoid reallocation
92 if ( nStringLen < LEVDISDOUBLEBUF )
93 nArrayLen = 2 * nStringLen;
94 else
95 nArrayLen = nStringLen + 1;
97 }
98
99 // Calculate start values of the second column (first pattern value).
100 // First column (0-Len pattern) is always zero .. nStringLen * nInsQ0,
101 // therefore the minimum is 0
102 if ( nPatternLen == 0 )
103 {
104 // Count of deletions to reach pattern
105 for ( sal_Int32 i=0; i <= nStringLen; i++ )
106 npDistance[i] = i * nDelR0;
107 }
108 else if ( cpPattern[0] == '*' && bpPatIsWild[0] )
109 {
110 // instead of a '*' you can fit in anything
111 for ( sal_Int32 i=0; i <= nStringLen; i++ )
112 npDistance[i] = 0;
113 }
114 else
115 {
116 sal_Unicode c;
117 int nP;
118 c = cpPattern[0];
119 if ( c == '?' && bpPatIsWild[0] )
120 nP = 0; // a '?' could be any character.
121 else
122 // Minimum of replacement and deletion+insertion weighting
123 nP = std::min({ nRepP0, nRepP0, nDelR0 + nInsQ0 });
124 npDistance[0] = nInsQ0; // start with simple insert
125 npDistance[1] = nInsQ0;
126 npDistance[2] = nInsQ0;
127 int nReplacePos = -1; // tristate flag
128 int nDelCnt = 0;
129 for ( sal_Int32 i=1; i <= nStringLen; i++, nDelCnt += nDelR0 )
130 {
131 if ( cString[i-1] == c )
132 nP = 0; // Replace from this position is 0
133 // Deletions to match pattern + Replace
134 npDistance[i] = nDelCnt + nP;
135 if ( bSplitCount )
136 {
137 if ( nReplacePos < 0 && nP )
138 { // this position will be replaced
139 nRepS++;
140 nReplacePos = i;
141 }
142 else if ( nReplacePos > 0 && !nP )
143 {
144 // same count of c
145 int nBalance = levdisbalance( 0, i-1, c, cString, nStringLen );
146 if ( !nBalance )
147 { // one was replaced that was an insertion instead
148 nRepS--;
149 nReplacePos = 0;
150 }
151 }
152 }
153 }
154 nSPMin = std::min({ npDistance[0], npDistance[1], npDistance[2] });
155 }
156
157 // calculate distance matrix
158 sal_Int32 j = 0; // for all columns of the pattern, till limit is not reached
159 while ( (j < nPatternLen-1)
160 && nSPMin <= (bSplitCount ? 2 * nLimit : nLimit) )
161 {
162 sal_Unicode c;
163 int nP, nQ, nR, nPij, d2;
164
165 j++;
166 c = cpPattern[j];
167 if ( bpPatIsWild[j] ) // '*' or '?' not escaped
168 nP = 0; // could be replaced without penalty
169 else
170 nP = nRepP0;
171 if ( c == '*' && bpPatIsWild[j] )
172 {
173 nQ = 0; // insertion and deletion without penalty
174 nR = 0;
175 }
176 else
177 {
178 nQ = nInsQ0; // usual weighting
179 nR = nDelR0;
180 }
181 d2 = npDistance[0];
182 // increase insert count to get from null string to pattern
183 npDistance[0] = npDistance[0] + nQ;
184 nSPMin = npDistance[0];
185 int nReplacePos = -1; // tristate flag
186 // for each pattern column run through the string
187 for ( sal_Int32 i=1; i <= nStringLen; i++ )
188 {
189 int d1 = d2; // WLD( X(i-1), Y(j-1) )
190 d2 = npDistance[i]; // WLD( X(i) , Y(j-1) )
191 if ( cString[i-1] == c )
192 {
193 nPij = 0; // p(i,j)
194 if ( nReplacePos < 0 )
195 {
196 // same count of c
197 int nBalance = levdisbalance( j, i-1, c, cString, nStringLen );
198 if ( !nBalance )
199 nReplacePos = 0; // no replacement
200 }
201 }
202 else
203 nPij = nP;
204 // WLD( X(i), Y(j) ) = min( WLD( X(i-1), Y(j-1) ) + p(i,j) ,
205 // WLD( X(i) , Y(j-1) ) + q ,
206 // WLD( X(i-1), Y(j) ) + r )
207 npDistance[i] = std::min({ d1 + nPij, d2 + nQ, npDistance[i-1] + nR });
208 if ( npDistance[i] < nSPMin )
209 nSPMin = npDistance[i];
210 if ( bSplitCount )
211 {
212 if ( nReplacePos < 0 && nPij && npDistance[i] == d1 + nPij )
213 { // this position will be replaced
214 nRepS++;
215 nReplacePos = i;
216 }
217 else if ( nReplacePos > 0 && !nPij )
218 {
219 // character is equal in string and pattern
220 //
221 // If from this point:
222 // * pattern and string have the same count of this
223 // character
224 // * and character count is the same before this position
225 // then the replace was none.
226 //
227 // Scrambled letters are recognized here and the nRepS
228 // replacement is withdrawn, whereby the double limit kicks
229 // in.
230
231 // Same count of c
232 int nBalance = levdisbalance( j, i-1, c, cString, nStringLen );
233 if ( !nBalance )
234 { // one was replaced that was an insertion instead
235 nRepS--;
236 nReplacePos = 0;
237 }
238 }
239 }
240 }
241 }
242 if ( (nSPMin <= nLimit) && (npDistance[nStringLen] <= nLimit) )
243 return npDistance[nStringLen];
244 else
245 {
246 if ( bSplitCount )
247 {
248 if ( nRepS && nLenDiff > 0 )
249 nRepS -= nLenDiff; // Inserts were counted
250 if ( (nSPMin <= 2 * nLimit)
251 && (npDistance[nStringLen] <= 2 * nLimit)
252 && (nRepS * nRepP0 <= nLimit) )
253 return -npDistance[nStringLen];
254 return LEVDISBIG;
255 }
256 return LEVDISBIG;
257 }
258}
259
260// Calculating nLimit, nReplP0, nInsQ0, nDelR0, bSplitCount
261// from user values nOtherX, nShorterY, nLongerZ, bRelaxed
262void WLevDistance::CalcLPQR( int nX, int nY, int nZ, bool bRelaxed )
263{
264 if ( nX < 0 ) nX = 0; // only positive values
265 if ( nY < 0 ) nY = 0;
266 if ( nZ < 0 ) nZ = 0;
267 if (0 == std::min({ nX, nY, nZ })) // at least one 0
268 {
269 int nMid, nMax;
270 nMax = std::max({ nX, nY, nZ }); // either 0 for three 0s or Max
271 if ( 0 == (nMid = Mid3( nX, nY, nZ )) ) // even two 0
272 nLimit = nMax; // either 0 or the only one >0
273 else // one is 0
274 nLimit = std::lcm( nMid, nMax );
275 }
276 else // all three of them are not 0
277 nLimit = std::lcm(std::lcm(nX, nY), nZ);
278 nRepP0 = ( nX ? nLimit / nX : nLimit + 1 );
279 nInsQ0 = ( nY ? nLimit / nY : nLimit + 1 );
280 nDelR0 = ( nZ ? nLimit / nZ : nLimit + 1 );
281 bSplitCount = bRelaxed;
282}
283
284// The value in the middle
285int WLevDistance::Mid3( int x, int y, int z )
286{
287 int min = std::min({ x, y, z });
288 if ( x == min )
289 return std::min(y, z);
290 else if ( y == min )
291 return std::min(x, z);
292 else // z == min
293 return std::min(x, y);
294}
295
296// initialize data from CTOR
297void WLevDistance::InitData( const sal_Unicode* cPattern )
298{
302 nStars = 0;
303 const sal_Unicode* cp1 = cPattern;
304 sal_Unicode* cp2 = cpPattern;
305 bool* bp = bpPatIsWild;
306 // copy pattern, count asterisks, escaped Jokers
307 while ( *cp1 )
308 {
309 if ( *cp1 == '\\' ) // maybe escaped
310 {
311 if ( *(cp1+1) == '*' || *(cp1+1) == '?' ) // next Joker?
312 {
313 cp1++; // skip '\\'
314 nPatternLen--;
315 }
316 *bp++ = false;
317 }
318 else if ( *cp1 == '*' || *cp1 == '?' ) // Joker
319 {
320 if ( *cp1 == '*' )
321 nStars++;
322 *bp++ = true;
323 }
324 else
325 *bp++ = false;
326 *cp2++ = *cp1++;
327 }
328 *cp2 = '\0';
329}
330
332 int nOtherX, int nShorterY, int nLongerZ,
333 bool bRelaxed ) :
334 nPatternLen( Impl_WLD_StringLen(cPattern) ),
335 aPatMem( nPatternLen + 1 ),
336 nArrayLen( nPatternLen + 1 ),
337 aDisMem( nArrayLen )
338{
339 InitData( cPattern );
340 CalcLPQR( nOtherX, nShorterY, nLongerZ, bRelaxed );
341}
342
343// CopyCTor
345 nPatternLen( rWLD.nPatternLen ),
346 aPatMem( nPatternLen + 1 ),
347 nArrayLen( nPatternLen + 1 ),
348 aDisMem( nArrayLen ),
349 nLimit( rWLD.nLimit ),
350 nRepP0( rWLD.nRepP0 ),
351 nInsQ0( rWLD.nInsQ0 ),
352 nDelR0( rWLD.nDelR0 ),
353 nStars( rWLD.nStars ),
354 bSplitCount( rWLD.bSplitCount )
355{
359 sal_Int32 i;
360 for ( i=0; i<nPatternLen; i++ )
361 {
362 cpPattern[i] = rWLD.cpPattern[i];
363 bpPatIsWild[i] = rWLD.bpPatIsWild[i];
364 }
365 cpPattern[i] = '\0';
366}
367
368// DTor
370{
371}
372
373/* vim:set shiftwidth=4 softtabstop=4 expandtab: */
int * NewMem(size_t s)
Definition: levdis.hxx:121
int * GetPtr() const
Definition: levdis.hxx:120
sal_Unicode * GetcPtr() const
Definition: levdis.hxx:108
bool * GetbPtr() const
Definition: levdis.hxx:109
Weighted Levenshtein Distance (WLD)
Definition: levdis.hxx:134
int levdisbalance(sal_Int32 jj, sal_Int32 ii, sal_Unicode c, const sal_Unicode *cString, sal_Int32 nStringLen) const
Definition: levdis.hxx:181
sal_Int32 nArrayLen
length of distance array
Definition: levdis.hxx:139
int nRepP0
replacement weigh
Definition: levdis.hxx:143
bool * bpPatIsWild
pointer to bool array whether pattern is wildcard
Definition: levdis.hxx:138
int nDelR0
deletion weigh
Definition: levdis.hxx:145
bool bSplitCount
if TRUE, Rep/Ins/Del are counted separately
Definition: levdis.hxx:147
WLevDisDistanceMem aDisMem
manage allocation of distance array
Definition: levdis.hxx:140
static int Mid3(int x, int y, int z)
middle value of 3 values
Definition: levdis.cxx:285
int nLimit
WLD limit replacements/insertions/deletions.
Definition: levdis.hxx:142
sal_Int32 nPatternLen
length of pattern
Definition: levdis.hxx:135
void InitData(const sal_Unicode *cPattern)
Definition: levdis.cxx:297
int WLD(const sal_Unicode *cString, sal_Int32 nStringLen)
Calculate the Weighted Levenshtein Distance from string to pattern.
Definition: levdis.cxx:75
WLevDisPatternMem aPatMem
manage allocation of pattern array
Definition: levdis.hxx:136
int nStars
count of '*' wildcards in pattern
Definition: levdis.hxx:146
int * npDistance
pointer to distance array
Definition: levdis.hxx:141
sal_Unicode * cpPattern
pointer to pattern array
Definition: levdis.hxx:137
WLevDistance(const sal_Unicode *cPattern, int nOtherX, int nShorterY, int nLongerZ, bool bRelaxed)
CTor with user input.
Definition: levdis.cxx:331
int nInsQ0
insertion weigh
Definition: levdis.hxx:144
void CalcLPQR(int nOtherX, int nShorterY, int nLongerZ, bool bRelaxed)
Calculate the internal weighs corresponding to the user input values.
Definition: levdis.cxx:262
float y
float x
float z
#define LEVDISDOUBLEBUF
Definition: levdis.cxx:64
#define LEVDISBIG
Definition: levdis.cxx:63
static sal_Int32 Impl_WLD_StringLen(const sal_Unicode *pStr)
Definition: levdis.cxx:66
int i
SwNodeOffset min(const SwNodeOffset &a, const SwNodeOffset &b)
sal_uInt16 sal_Unicode