LibreOffice Module i18npool (master) 1
levdis.hxx
Go to the documentation of this file.
1/* -*- Mode: C++; tab-width: 4; indent-tabs-mode: nil; c-basic-offset: 4 -*- */
2/*
3 * This file is part of the LibreOffice project.
4 *
5 * This Source Code Form is subject to the terms of the Mozilla Public
6 * License, v. 2.0. If a copy of the MPL was not distributed with this
7 * file, You can obtain one at http://mozilla.org/MPL/2.0/.
8 *
9 * This file incorporates work covered by the following license notice:
10 *
11 * Licensed to the Apache Software Foundation (ASF) under one or more
12 * contributor license agreements. See the NOTICE file distributed
13 * with this work for additional information regarding copyright
14 * ownership. The ASF licenses this file to you under the Apache
15 * License, Version 2.0 (the "License"); you may not use this file
16 * except in compliance with the License. You may obtain a copy of
17 * the License at http://www.apache.org/licenses/LICENSE-2.0 .
18 */
19
20#ifndef INCLUDED_I18NPOOL_SOURCE_SEARCH_LEVDIS_HXX
21#define INCLUDED_I18NPOOL_SOURCE_SEARCH_LEVDIS_HXX
22
23#include <sal/types.h>
24#include <memory>
25
26// Sensible default values for a user interface could be:
27// LEVDISDEFAULT_XOTHER 2
28// Maximum X replacements to match query, found data may be different by X
29// characters from query.
30// LEVDISDEFAULT_YSHORTER 1
31// Maximum Y insertions to match query, found data may be Y characters
32// shorter than query.
33// LEVDISDEFAULT_ZLONGER 3
34// Maximum Z deletions to match query, found data may be Z characters
35// longer than query.
36// LEVDISDEFAULT_RELAXED TRUE
37// Use relaxed SplitCount instead of mathematical WLD.
38//
39// Joker/wildcards ('?' and '*') of course do not count as
40// replacement/insertion/deletion. At a '?' a replacement is not counted, for a
41// '*' the found data may be any number of characters longer than the query.
42//
43// Strict mathematical WLD: EITHER maximum X replacements OR Y characters
44// shorter OR Z characters longer.
45// Relaxed SplitCount: maximum X replacements AND/OR Y character shorter AND/OR
46// Z characters longer. Any combination of actions is valid.
47//
48// The value range for X,Y,Z is 0..33 to keep the limit within a 16 bit signed
49// integer, 31*32*33 is the maximum limit, LCM(31,32,33) == 32736.
50//
51// The corresponding internal default weigh values for these user interface
52// values would be:
53// LEVDISDEFAULTLIMIT 6
54// Default nLimit, matches x=2, y=1, z=3, p=3, q=6, r=2
55// LEVDISDEFAULT_P0 3
56// Default nRepP0, weight of replacements.
57// LEVDISDEFAULT_Q0 6
58// Default nInsQ0, weight of insertions.
59// LEVDISDEFAULT_R0 2
60// Default nDelR0, weight of deletions.
61
62// The transformation of user input values to weighs is done using CalcLPQR().
63// One caveat, if the WLD reaches nLimit due to nDelR0 (i.e. data string is nZ
64// characters longer than pattern) then no character can be replaced any more.
65// This can be circumvented by increasing nX or/and nZ, but of course with the
66// side effect of being less strict then... or the other solution is to use
67// relaxed SplitCount (see below), which is the default when using CalcLPQR().
68//
69// Attention: shorter = WLD.Insert, longer = WLD.Delete
70//
71// View and counting is always from data string to pattern, a deletion means
72// that something is deleted from data to match pattern.
73//
74// Deletions weigh less in this example because usually less is known than is
75// searched for. Replacements get middle weight, for example because of
76// misspellings. Insertions are expensive.
77//
78// Another example: P0 = 1, Q0 = 4, R0 = 4, Limit = 3
79// Allowed are maximum 4 replacements, no insertion, no deletion.
80// Matches the user interface values X = 3, Y = 0, Z = 0
81//
82// bSplitCount: if TRUE, Rep/Ins/Del are counted differently. The return value
83// of WLD() then isn't necessarily the Levenshtein-Distance, but can be
84// negative (-WLD) if the WLD is greater than nLimit but single values are
85// within the limits.
86// For the above default values that could mean: even if the found string is
87// already 2 characters longer (nLongerZ), 3 replacements (nOtherX) can be made
88// to reach pattern. Additionally, character swaps count as one replacement.
89// Mathematically completely incorrect, but meets user expectations ;-)
90//
91// Explanation: in the real WLD all actions are withdrawn from a common 100%
92// pool, if one gets all there's nothing left for others. With bSplitCount
93// replacements have their own pool.
94
95
98{
99 std::unique_ptr<sal_Unicode[]> cp;
100 std::unique_ptr<bool[]> bp;
101public:
102 explicit WLevDisPatternMem( sal_Int32 s )
103 : cp(new sal_Unicode[s])
104 , bp(new bool[s])
105 {
106 }
107
108 sal_Unicode* GetcPtr() const { return cp.get(); }
109 bool* GetbPtr() const { return bp.get(); }
110};
111
113{
114 std::unique_ptr<int[]> p;
115public:
116 explicit WLevDisDistanceMem( size_t s )
117 {
118 NewMem(s);
119 }
120 int* GetPtr() const { return p.get(); }
121 int* NewMem( size_t s )
122 {
123 p.reset(new int[ s<3 ? 3 : s ]);
124 return p.get();
125 }
126};
127
134{
135 sal_Int32 nPatternLen;
139 sal_Int32 nArrayLen;
142 int nLimit;
143 int nRepP0;
144 int nInsQ0;
145 int nDelR0;
146 int nStars;
148
149 void InitData( const sal_Unicode* cPattern );
150 static int Mid3( int x, int y, int z );
151
152public:
153
160 WLevDistance( const sal_Unicode* cPattern, int nOtherX, int nShorterY,
161 int nLongerZ, bool bRelaxed );
162
163 WLevDistance( const WLevDistance& rWLD );
165
167 int WLD( const sal_Unicode* cString, sal_Int32 nStringLen );
168
172 void CalcLPQR( int nOtherX, int nShorterY, int nLongerZ,
173 bool bRelaxed );
174
175 int GetLimit() const { return nLimit; }
176
177 // Calculate current balance, keep this inline for performance reasons!
178 // c == cpPattern[jj] == cString[ii]
179 // First seek up to found place, if the balance is still equal there then
180 // also compare after the found place.
181 int levdisbalance(sal_Int32 jj, sal_Int32 ii, sal_Unicode c, const sal_Unicode* cString, sal_Int32 nStringLen) const
182 {
183 int nBalance = 0;
184
185 if ( jj != ii )
186 {
187 sal_Int32 k;
188 if ( jj > 0 )
189 for ( k=0; k < jj; k++ )
190 if ( cpPattern[k] == c )
191 nBalance++;
192 if ( ii > 0 )
193 for ( k=0; k < ii; k++ )
194 if ( cString[k] == c )
195 nBalance--;
196 if ( !nBalance )
197 {
198 for ( k=jj+1; k < nPatternLen; k++ )
199 if ( cpPattern[k] == c )
200 nBalance++;
201 for ( k=ii+1; k < nStringLen; k++ )
202 if ( cString[k] == c )
203 nBalance--;
204 }
205 }
206
207 return nBalance;
208 }
209};
210
211
212#endif
213
214/* vim:set shiftwidth=4 softtabstop=4 expandtab: */
std::unique_ptr< int[]> p
Definition: levdis.hxx:114
WLevDisDistanceMem(size_t s)
Definition: levdis.hxx:116
int * NewMem(size_t s)
Definition: levdis.hxx:121
int * GetPtr() const
Definition: levdis.hxx:120
"Safe" memory allocation in ctor
Definition: levdis.hxx:98
sal_Unicode * GetcPtr() const
Definition: levdis.hxx:108
std::unique_ptr< bool[]> bp
Definition: levdis.hxx:100
bool * GetbPtr() const
Definition: levdis.hxx:109
WLevDisPatternMem(sal_Int32 s)
Definition: levdis.hxx:102
std::unique_ptr< sal_Unicode[]> cp
Definition: levdis.hxx:99
Weighted Levenshtein Distance (WLD)
Definition: levdis.hxx:134
int levdisbalance(sal_Int32 jj, sal_Int32 ii, sal_Unicode c, const sal_Unicode *cString, sal_Int32 nStringLen) const
Definition: levdis.hxx:181
sal_Int32 nArrayLen
length of distance array
Definition: levdis.hxx:139
int GetLimit() const
Definition: levdis.hxx:175
int nRepP0
replacement weigh
Definition: levdis.hxx:143
bool * bpPatIsWild
pointer to bool array whether pattern is wildcard
Definition: levdis.hxx:138
int nDelR0
deletion weigh
Definition: levdis.hxx:145
bool bSplitCount
if TRUE, Rep/Ins/Del are counted separately
Definition: levdis.hxx:147
WLevDisDistanceMem aDisMem
manage allocation of distance array
Definition: levdis.hxx:140
static int Mid3(int x, int y, int z)
middle value of 3 values
Definition: levdis.cxx:285
int nLimit
WLD limit replacements/insertions/deletions.
Definition: levdis.hxx:142
sal_Int32 nPatternLen
length of pattern
Definition: levdis.hxx:135
void InitData(const sal_Unicode *cPattern)
Definition: levdis.cxx:297
int WLD(const sal_Unicode *cString, sal_Int32 nStringLen)
Calculate the Weighted Levenshtein Distance from string to pattern.
Definition: levdis.cxx:75
WLevDisPatternMem aPatMem
manage allocation of pattern array
Definition: levdis.hxx:136
int nStars
count of '*' wildcards in pattern
Definition: levdis.hxx:146
int * npDistance
pointer to distance array
Definition: levdis.hxx:141
sal_Unicode * cpPattern
pointer to pattern array
Definition: levdis.hxx:137
WLevDistance(const sal_Unicode *cPattern, int nOtherX, int nShorterY, int nLongerZ, bool bRelaxed)
CTor with user input.
Definition: levdis.cxx:331
int nInsQ0
insertion weigh
Definition: levdis.hxx:144
void CalcLPQR(int nOtherX, int nShorterY, int nLongerZ, bool bRelaxed)
Calculate the internal weighs corresponding to the user input values.
Definition: levdis.cxx:262
sal_uInt16 sal_Unicode