LibreOffice Module i18npool (master)  1
levdis.cxx
Go to the documentation of this file.
1 /* -*- Mode: C++; tab-width: 4; indent-tabs-mode: nil; c-basic-offset: 4 -*- */
2 /*
3  * This file is part of the LibreOffice project.
4  *
5  * This Source Code Form is subject to the terms of the Mozilla Public
6  * License, v. 2.0. If a copy of the MPL was not distributed with this
7  * file, You can obtain one at http://mozilla.org/MPL/2.0/.
8  *
9  * This file incorporates work covered by the following license notice:
10  *
11  * Licensed to the Apache Software Foundation (ASF) under one or more
12  * contributor license agreements. See the NOTICE file distributed
13  * with this work for additional information regarding copyright
14  * ownership. The ASF licenses this file to you under the Apache
15  * License, Version 2.0 (the "License"); you may not use this file
16  * except in compliance with the License. You may obtain a copy of
17  * the License at http://www.apache.org/licenses/LICENSE-2.0 .
18  */
19 
20 /*
21 
22  Weighted Levenshtein Distance
23  including wildcards
24  '*' for any number (0 or more) of arbitrary characters
25  '?' for exactly one arbitrary character
26  escapable with backslash, "\*" or "\?"
27 
28  Return:
29  WLD if WLD <= nLimit, else nLimit+1
30 
31  or, if bSplitCount:
32  WLD if WLD <= nLimit
33  -WLD if Replace and Insert and Delete <= nLimit
34  else nLimit+1
35 
36  Recursive definition of WLD:
37 
38  WLD( X(i), Y(j) ) = min( WLD( X(i-1), Y(j-1) ) + p(i,j) ,
39  WLD( X(i) , Y(j-1) ) + q ,
40  WLD( X(i-1), Y(j) ) + r )
41 
42  X(i) := the first i characters of the word X
43  Y(j) := the first j characters of the word Y
44  p(i,j) := 0 if i-th character of X == j-th character of Y,
45  p else
46 
47  Boundary conditions:
48  WLD( X(0), Y(j) ) := j*q (Y created by j inserts)
49  WLD( X(i), Y(0) ) := i*r (Y created by i deletes)
50  WLD( X(0), Y(0) ) := 0
51 
52  Instead of recursions a dynamic algorithm is used.
53 
54  See also: German computer magazine
55  c't 07/89 pages 192-208 and c't 03/94 pages 230-239
56 */
57 
58 #include <algorithm>
59 #include <numeric>
60 
61 #include "levdis.hxx"
62 
63 #define LEVDISBIG (nLimit + 1) // Return value if distance > nLimit
64 #define LEVDISDOUBLEBUF 2048 // no doubling atop this border
65 
66 static sal_Int32 Impl_WLD_StringLen( const sal_Unicode* pStr )
67 {
68  const sal_Unicode* pTempStr = pStr;
69  while( *pTempStr )
70  pTempStr++;
71  return static_cast<sal_Int32>(pTempStr-pStr);
72 }
73 
74 // Distance from string to pattern
75 int WLevDistance::WLD( const sal_Unicode* cString, sal_Int32 nStringLen )
76 {
77  int nSPMin = 0; // penalty point Minimum
78  int nRepS = 0; // for SplitCount
79 
80  // length difference between pattern and string
81  int nLenDiff = nPatternLen - nStars - nStringLen;
82  // more insertions or deletions necessary as the limit? Then leave
83  if ( (nLenDiff * nInsQ0 > nLimit)
84  || ((nStars == 0) && (nLenDiff * nDelR0 < -nLimit)) )
85  return LEVDISBIG;
86 
87  // comparative String greater than instantaneous array
88  // -> adapt array size
89  if ( nStringLen >= nArrayLen )
90  {
91  // increase size much more to avoid reallocation
92  if ( nStringLen < LEVDISDOUBLEBUF )
93  nArrayLen = 2 * nStringLen;
94  else
95  nArrayLen = nStringLen + 1;
97  }
98 
99  // Calculate start values of the second column (first pattern value).
100  // First column (0-Len pattern) is always zero .. nStringLen * nInsQ0,
101  // therefore the minimum is 0
102  if ( nPatternLen == 0 )
103  {
104  // Count of deletions to reach pattern
105  for ( sal_Int32 i=0; i <= nStringLen; i++ )
106  npDistance[i] = i * nDelR0;
107  }
108  else if ( cpPattern[0] == '*' && bpPatIsWild[0] )
109  {
110  // instead of a '*' you can fit in anything
111  for ( sal_Int32 i=0; i <= nStringLen; i++ )
112  npDistance[i] = 0;
113  }
114  else
115  {
116  sal_Unicode c;
117  int nP;
118  c = cpPattern[0];
119  if ( c == '?' && bpPatIsWild[0] )
120  nP = 0; // a '?' could be any character.
121  else
122  // Minimum of replacement and deletion+insertion weighting
123  nP = std::min({ nRepP0, nRepP0, nDelR0 + nInsQ0 });
124  npDistance[0] = nInsQ0; // start with simple insert
125  npDistance[1] = nInsQ0;
126  npDistance[2] = nInsQ0;
127  int nReplacePos = -1; // tristate flag
128  int nDelCnt = 0;
129  for ( sal_Int32 i=1; i <= nStringLen; i++, nDelCnt += nDelR0 )
130  {
131  if ( cString[i-1] == c )
132  nP = 0; // Replace from this position is 0
133  // Deletions to match pattern + Replace
134  npDistance[i] = nDelCnt + nP;
135  if ( bSplitCount )
136  {
137  if ( nReplacePos < 0 && nP )
138  { // this position will be replaced
139  nRepS++;
140  nReplacePos = i;
141  }
142  else if ( nReplacePos > 0 && !nP )
143  {
144  // same count of c
145  int nBalance = levdisbalance( 0, i-1, c, cString, nStringLen );
146  if ( !nBalance )
147  { // one was replaced that was an insertion instead
148  nRepS--;
149  nReplacePos = 0;
150  }
151  }
152  }
153  }
154  nSPMin = std::min({ npDistance[0], npDistance[1], npDistance[2] });
155  }
156 
157  // calculate distance matrix
158  sal_Int32 j = 0; // for all columns of the pattern, till limit is not reached
159  while ( (j < nPatternLen-1)
160  && nSPMin <= (bSplitCount ? 2 * nLimit : nLimit) )
161  {
162  sal_Unicode c;
163  int nP, nQ, nR, nPij, d2;
164 
165  j++;
166  c = cpPattern[j];
167  if ( bpPatIsWild[j] ) // '*' or '?' not escaped
168  nP = 0; // could be replaced without penalty
169  else
170  nP = nRepP0;
171  if ( c == '*' && bpPatIsWild[j] )
172  {
173  nQ = 0; // insertion and deletion without penalty
174  nR = 0;
175  }
176  else
177  {
178  nQ = nInsQ0; // usual weighting
179  nR = nDelR0;
180  }
181  d2 = npDistance[0];
182  // increase insert count to get from null string to pattern
183  npDistance[0] = npDistance[0] + nQ;
184  nSPMin = npDistance[0];
185  int nReplacePos = -1; // tristate flag
186  // for each pattern column run through the string
187  for ( sal_Int32 i=1; i <= nStringLen; i++ )
188  {
189  int d1 = d2; // WLD( X(i-1), Y(j-1) )
190  d2 = npDistance[i]; // WLD( X(i) , Y(j-1) )
191  if ( cString[i-1] == c )
192  {
193  nPij = 0; // p(i,j)
194  if ( nReplacePos < 0 )
195  {
196  // same count of c
197  int nBalance = levdisbalance( j, i-1, c, cString, nStringLen );
198  if ( !nBalance )
199  nReplacePos = 0; // no replacement
200  }
201  }
202  else
203  nPij = nP;
204  // WLD( X(i), Y(j) ) = min( WLD( X(i-1), Y(j-1) ) + p(i,j) ,
205  // WLD( X(i) , Y(j-1) ) + q ,
206  // WLD( X(i-1), Y(j) ) + r )
207  npDistance[i] = std::min({ d1 + nPij, d2 + nQ, npDistance[i-1] + nR });
208  if ( npDistance[i] < nSPMin )
209  nSPMin = npDistance[i];
210  if ( bSplitCount )
211  {
212  if ( nReplacePos < 0 && nPij && npDistance[i] == d1 + nPij )
213  { // this position will be replaced
214  nRepS++;
215  nReplacePos = i;
216  }
217  else if ( nReplacePos > 0 && !nPij )
218  {
219  // character is equal in string and pattern
220  //
221  // If from this point:
222  // * pattern and string have the same count of this
223  // character
224  // * and character count is the same before this position
225  // then the replace was none.
226  //
227  // Scrambled letters are recognized here and the nRepS
228  // replacement is withdrawn, whereby the double limit kicks
229  // in.
230 
231  // Same count of c
232  int nBalance = levdisbalance( j, i-1, c, cString, nStringLen );
233  if ( !nBalance )
234  { // one was replaced that was an insertion instead
235  nRepS--;
236  nReplacePos = 0;
237  }
238  }
239  }
240  }
241  }
242  if ( (nSPMin <= nLimit) && (npDistance[nStringLen] <= nLimit) )
243  return npDistance[nStringLen];
244  else
245  {
246  if ( bSplitCount )
247  {
248  if ( nRepS && nLenDiff > 0 )
249  nRepS -= nLenDiff; // Inserts were counted
250  if ( (nSPMin <= 2 * nLimit)
251  && (npDistance[nStringLen] <= 2 * nLimit)
252  && (nRepS * nRepP0 <= nLimit) )
253  return -npDistance[nStringLen];
254  return LEVDISBIG;
255  }
256  return LEVDISBIG;
257  }
258 }
259 
260 // Calculating nLimit, nReplP0, nInsQ0, nDelR0, bSplitCount
261 // from user values nOtherX, nShorterY, nLongerZ, bRelaxed
262 void WLevDistance::CalcLPQR( int nX, int nY, int nZ, bool bRelaxed )
263 {
264  if ( nX < 0 ) nX = 0; // only positive values
265  if ( nY < 0 ) nY = 0;
266  if ( nZ < 0 ) nZ = 0;
267  if (0 == std::min({ nX, nY, nZ })) // at least one 0
268  {
269  int nMid, nMax;
270  nMax = std::max({ nX, nY, nZ }); // either 0 for three 0s or Max
271  if ( 0 == (nMid = Mid3( nX, nY, nZ )) ) // even two 0
272  nLimit = nMax; // either 0 or the only one >0
273  else // one is 0
274  nLimit = std::lcm( nMid, nMax );
275  }
276  else // all three of them are not 0
277  nLimit = std::lcm(std::lcm(nX, nY), nZ);
278  nRepP0 = ( nX ? nLimit / nX : nLimit + 1 );
279  nInsQ0 = ( nY ? nLimit / nY : nLimit + 1 );
280  nDelR0 = ( nZ ? nLimit / nZ : nLimit + 1 );
281  bSplitCount = bRelaxed;
282 }
283 
284 // The value in the middle
285 int WLevDistance::Mid3( int x, int y, int z )
286 {
287  int min = std::min({ x, y, z });
288  if ( x == min )
289  return std::min(y, z);
290  else if ( y == min )
291  return std::min(x, z);
292  else // z == min
293  return std::min(x, y);
294 }
295 
296 // initialize data from CTOR
297 void WLevDistance::InitData( const sal_Unicode* cPattern )
298 {
302  nStars = 0;
303  const sal_Unicode* cp1 = cPattern;
304  sal_Unicode* cp2 = cpPattern;
305  bool* bp = bpPatIsWild;
306  // copy pattern, count asterisks, escaped Jokers
307  while ( *cp1 )
308  {
309  if ( *cp1 == '\\' ) // maybe escaped
310  {
311  if ( *(cp1+1) == '*' || *(cp1+1) == '?' ) // next Joker?
312  {
313  cp1++; // skip '\\'
314  nPatternLen--;
315  }
316  *bp++ = false;
317  }
318  else if ( *cp1 == '*' || *cp1 == '?' ) // Joker
319  {
320  if ( *cp1 == '*' )
321  nStars++;
322  *bp++ = true;
323  }
324  else
325  *bp++ = false;
326  *cp2++ = *cp1++;
327  }
328  *cp2 = '\0';
329 }
330 
332  int nOtherX, int nShorterY, int nLongerZ,
333  bool bRelaxed ) :
334  nPatternLen( Impl_WLD_StringLen(cPattern) ),
335  aPatMem( nPatternLen + 1 ),
336  nArrayLen( nPatternLen + 1 ),
337  aDisMem( nArrayLen )
338 {
339  InitData( cPattern );
340  CalcLPQR( nOtherX, nShorterY, nLongerZ, bRelaxed );
341 }
342 
343 // CopyCTor
345  nPatternLen( rWLD.nPatternLen ),
346  aPatMem( nPatternLen + 1 ),
347  nArrayLen( nPatternLen + 1 ),
348  aDisMem( nArrayLen ),
349  nLimit( rWLD.nLimit ),
350  nRepP0( rWLD.nRepP0 ),
351  nInsQ0( rWLD.nInsQ0 ),
352  nDelR0( rWLD.nDelR0 ),
353  nStars( rWLD.nStars ),
354  bSplitCount( rWLD.bSplitCount )
355 {
359  sal_Int32 i;
360  for ( i=0; i<nPatternLen; i++ )
361  {
362  cpPattern[i] = rWLD.cpPattern[i];
363  bpPatIsWild[i] = rWLD.bpPatIsWild[i];
364  }
365  cpPattern[i] = '\0';
366 }
367 
368 // DTor
370 {
371 }
372 
373 /* vim:set shiftwidth=4 softtabstop=4 expandtab: */
WLevDistance(const sal_Unicode *cPattern, int nOtherX, int nShorterY, int nLongerZ, bool bRelaxed)
CTor with user input.
Definition: levdis.cxx:331
int nStars
count of '*' wildcards in pattern
Definition: levdis.hxx:146
sal_Unicode * cpPattern
pointer to pattern array
Definition: levdis.hxx:137
#define LEVDISBIG
Definition: levdis.cxx:63
int nRepP0
replacement weigh
Definition: levdis.hxx:143
int * NewMem(size_t s)
Definition: levdis.hxx:121
void InitData(const sal_Unicode *cPattern)
Definition: levdis.cxx:297
sal_uInt16 sal_Unicode
#define min(a, b)
int nDelR0
deletion weigh
Definition: levdis.hxx:145
bool * bpPatIsWild
pointer to bool array whether pattern is wildcard
Definition: levdis.hxx:138
int levdisbalance(sal_Int32 jj, sal_Int32 ii, sal_Unicode c, const sal_Unicode *cString, sal_Int32 nStringLen)
Definition: levdis.hxx:181
sal_Int32 nArrayLen
length of distance array
Definition: levdis.hxx:139
sal_Unicode * GetcPtr() const
Definition: levdis.hxx:108
void CalcLPQR(int nOtherX, int nShorterY, int nLongerZ, bool bRelaxed)
Calculate the internal weighs corresponding to the user input values.
Definition: levdis.cxx:262
int * GetPtr() const
Definition: levdis.hxx:120
int i
int nLimit
WLD limit replacements/insertions/deletions.
Definition: levdis.hxx:142
int * npDistance
pointer to distance array
Definition: levdis.hxx:141
int WLD(const sal_Unicode *cString, sal_Int32 nStringLen)
Calculate the Weighted Levenshtein Distance from string to pattern.
Definition: levdis.cxx:75
Weighted Levenshtein Distance (WLD)
Definition: levdis.hxx:133
sal_Int32 nPatternLen
length of pattern
Definition: levdis.hxx:135
#define LEVDISDOUBLEBUF
Definition: levdis.cxx:64
static int Mid3(int x, int y, int z)
middle value of 3 values
Definition: levdis.cxx:285
static sal_Int32 Impl_WLD_StringLen(const sal_Unicode *pStr)
Definition: levdis.cxx:66
WLevDisPatternMem aPatMem
manage allocation of pattern array
Definition: levdis.hxx:136
bool * GetbPtr() const
Definition: levdis.hxx:109
int nInsQ0
insertion weigh
Definition: levdis.hxx:144
bool bSplitCount
if TRUE, Rep/Ins/Del are counted separately
Definition: levdis.hxx:147
WLevDisDistanceMem aDisMem
manage allocation of distance array
Definition: levdis.hxx:140