LibreOffice Module lingucomponent (master) 1
lingutil.cxx
Go to the documentation of this file.
1/* -*- Mode: C++; tab-width: 4; indent-tabs-mode: nil; c-basic-offset: 4 -*- */
2/*
3 * This file is part of the LibreOffice project.
4 *
5 * This Source Code Form is subject to the terms of the Mozilla Public
6 * License, v. 2.0. If a copy of the MPL was not distributed with this
7 * file, You can obtain one at http://mozilla.org/MPL/2.0/.
8 *
9 * This file incorporates work covered by the following license notice:
10 *
11 * Licensed to the Apache Software Foundation (ASF) under one or more
12 * contributor license agreements. See the NOTICE file distributed
13 * with this work for additional information regarding copyright
14 * ownership. The ASF licenses this file to you under the Apache
15 * License, Version 2.0 (the "License"); you may not use this file
16 * except in compliance with the License. You may obtain a copy of
17 * the License at http://www.apache.org/licenses/LICENSE-2.0 .
18 */
19
20#if defined(_WIN32)
21#if !defined WIN32_LEAN_AND_MEAN
22# define WIN32_LEAN_AND_MEAN
23#endif
24#include <windows.h>
25#endif
26
#include <osl/thread.h>
#include <osl/file.hxx>
#include <osl/process.h>
#include <tools/debug.hxx>
#include <tools/urlobj.hxx>
#include <unotools/bootstrap.hxx>
#include <unotools/lingucfg.hxx>
#include <rtl/bootstrap.hxx>
#include <rtl/ustring.hxx>
#include <rtl/string.hxx>
#include <rtl/tencinfo.h>
#include <linguistic/misc.hxx>

#include <set>
#include <vector>
#include <string.h>

#include "lingutil.hxx"

#include <sal/macros.h>
50
51using namespace ::com::sun::star;
52
53#if defined(_WIN32)
54OString Win_AddLongPathPrefix( const OString &rPathName )
55{
56 constexpr OStringLiteral WIN32_LONG_PATH_PREFIX = "\\\\?\\";
57 if (!rPathName.match(WIN32_LONG_PATH_PREFIX)) return WIN32_LONG_PATH_PREFIX + rPathName;
58 return rPathName;
59}
60#endif //defined(_WIN32)
61
62#if defined SYSTEM_DICTS || defined IOS
63// find old style dictionaries in system directories
64static void GetOldStyleDicsInDir(
65 OUString const & aSystemDir, OUString const & aFormatName,
66 std::u16string_view aSystemSuffix, std::u16string_view aSystemPrefix,
67 std::set< OUString >& aDicLangInUse,
68 std::vector< SvtLinguConfigDictionaryEntry >& aRes )
69{
70 osl::Directory aSystemDicts(aSystemDir);
71 if (aSystemDicts.open() != osl::FileBase::E_None)
72 return;
73
74 osl::DirectoryItem aItem;
75 osl::FileStatus aFileStatus(osl_FileStatus_Mask_FileURL);
76 while (aSystemDicts.getNextItem(aItem) == osl::FileBase::E_None)
77 {
78 aItem.getFileStatus(aFileStatus);
79 OUString sPath = aFileStatus.getFileURL();
80 if (sPath.endsWith(aSystemSuffix))
81 {
82 sal_Int32 nStartIndex = sPath.lastIndexOf('/') + 1;
83 if (!sPath.match(aSystemPrefix, nStartIndex))
84 continue;
85 OUString sChunk = sPath.copy(nStartIndex + aSystemPrefix.size(),
86 sPath.getLength() - aSystemSuffix.size() -
87 nStartIndex - aSystemPrefix.size());
88 if (sChunk.isEmpty())
89 continue;
90
91 // We prefer (now) to use language tags.
92 // Avoid feeding in the older LANG_REGION scheme to the BCP47
93 // ctor as that triggers use of liblangtag and initializes its
94 // database which we do not want during startup. Convert
95 // instead.
96 sChunk = sChunk.replace( '_', '-');
97
98 // There's a known exception to the rule, the dreaded
99 // hu_HU_u8.dic of the myspell-hu package, see
100 // http://packages.debian.org/search?arch=any&searchon=contents&keywords=hu_HU_u8.dic
101 // This was ignored because unknown in the old implementation,
102 // truncate to the known locale and either insert because hu_HU
103 // wasn't encountered yet, or skip because it was. It doesn't
104 // really matter because the proper new-style hu_HU dictionary
105 // will take precedence anyway if installed with a Hungarian
106 // languagepack. Again, this is only to not pull in all
107 // liblangtag and stuff during startup, the result would be
108 // !isValidBcp47() and the dictionary ignored.
109 if (sChunk == "hu-HU-u8")
110 sChunk = "hu-HU";
111
112 LanguageTag aLangTag(sChunk, true);
113 if (!aLangTag.isValidBcp47())
114 continue;
115
116 // Thus we first get the language of the dictionary
117 const OUString& aLocaleName(aLangTag.getBcp47());
118
119 if (aDicLangInUse.insert(aLocaleName).second)
120 {
121 // add the dictionary to the resulting vector
123 aDicEntry.aLocations = { sPath };
124 aDicEntry.aFormatName = aFormatName;
125 if (aLocaleName == u"ar")
126 aDicEntry.aLocaleNames = {
127 aLocaleName,
128 u"ar-AE", u"ar-BH", u"ar-DJ", u"ar-DZ", u"ar-EG",
129 u"ar-ER", u"ar-IL", u"ar-IQ", u"ar-JO", u"ar-KM",
130 u"ar-KW", u"ar-LB", u"ar-LY", u"ar-MA", u"ar-MR",
131 u"ar-OM", u"ar-PS", u"ar-QA", u"ar-SA", u"ar-SD",
132 u"ar-SO", u"ar-SY", u"ar-TD", u"ar-TN", u"ar-YE"
133 };
134 else
135 aDicEntry.aLocaleNames = { aLocaleName };
136 aRes.push_back( aDicEntry );
137 }
138 }
139 }
140}
141#endif
142
143// build list of old style dictionaries (not as extensions) to use.
144// User installed dictionaries (the ones residing in the user paths)
145// will get precedence over system installed ones for the same language.
146std::vector< SvtLinguConfigDictionaryEntry > GetOldStyleDics( const char *pDicType )
147{
148 std::vector< SvtLinguConfigDictionaryEntry > aRes;
149
150 if (!pDicType)
151 return aRes;
152
153 OUString aFormatName;
154 OUString aDicExtension;
155#if defined SYSTEM_DICTS || defined IOS
156 OUString aSystemDir;
157 OUString aSystemPrefix;
158 OUString aSystemSuffix;
159#endif
160 if (strcmp( pDicType, "DICT" ) == 0)
161 {
162 aFormatName = "DICT_SPELL";
163 aDicExtension = ".dic";
164#ifdef SYSTEM_DICTS
165 aSystemDir = DICT_SYSTEM_DIR;
166 aSystemSuffix = aDicExtension;
167#elif defined IOS
168 aSystemDir = "$BRAND_BASE_DIR/share/spell";
169 rtl::Bootstrap::expandMacros(aSystemDir);
170 aSystemSuffix = ".dic";
171#endif
172 }
173 else if (strcmp( pDicType, "HYPH" ) == 0)
174 {
175 aFormatName = "DICT_HYPH";
176 aDicExtension = ".dic";
177#ifdef SYSTEM_DICTS
178 aSystemDir = HYPH_SYSTEM_DIR;
179 aSystemPrefix = "hyph_";
180 aSystemSuffix = aDicExtension;
181#endif
182 }
183 else if (strcmp( pDicType, "THES" ) == 0)
184 {
185 aFormatName = "DICT_THES";
186 aDicExtension = ".dat";
187#ifdef SYSTEM_DICTS
188 aSystemDir = THES_SYSTEM_DIR;
189 aSystemPrefix = "th_";
190 aSystemSuffix = "_v2.dat";
191#elif defined IOS
192 aSystemDir = "$BRAND_BASE_DIR/share/thes";
193 rtl::Bootstrap::expandMacros(aSystemDir);
194 aSystemPrefix = "th_";
195 aSystemSuffix = "_v2.dat";
196#endif
197 }
198
199 if (aFormatName.isEmpty() || aDicExtension.isEmpty())
200 return aRes;
201
202#if defined SYSTEM_DICTS || defined IOS
203 // set of languages to remember the language where it is already
204 // decided to make use of the dictionary.
205 std::set< OUString > aDicLangInUse;
206
207#ifndef IOS
208 // follow the hunspell tool's example and check DICPATH for preferred dictionaries
209 rtl_uString * pSearchPath = nullptr;
210 osl_getEnvironment(OUString("DICPATH").pData, &pSearchPath);
211
212 if (pSearchPath)
213 {
214 OUString aSearchPath(pSearchPath);
215 rtl_uString_release(pSearchPath);
216
217 sal_Int32 nIndex = 0;
218 do
219 {
220 OUString aSystem( aSearchPath.getToken(0, ':', nIndex) );
221 OUString aCWD;
222 OUString aRelative;
223 OUString aAbsolute;
224
226 continue;
227 if (osl::FileBase::getFileURLFromSystemPath(aSystem, aRelative)
228 != osl::FileBase::E_None)
229 continue;
230 if (osl::FileBase::getAbsoluteFileURL(aCWD, aRelative, aAbsolute)
231 != osl::FileBase::E_None)
232 continue;
233
234 // GetOldStyleDicsInDir will make sure the dictionary is the right
235 // type based on its prefix, that way hyphen, mythes and regular
236 // dictionaries can live in one directory
237 GetOldStyleDicsInDir(aAbsolute, aFormatName, aSystemSuffix,
238 aSystemPrefix, aDicLangInUse, aRes);
239 }
240 while (nIndex != -1);
241 }
242#endif
243
244 // load system directories last so that DICPATH prevails
245 GetOldStyleDicsInDir(aSystemDir, aFormatName, aSystemSuffix, aSystemPrefix,
246 aDicLangInUse, aRes);
247#endif
248
249 return aRes;
250}
251
253 std::vector< SvtLinguConfigDictionaryEntry > &rNewStyleDics,
254 const std::vector< SvtLinguConfigDictionaryEntry > &rOldStyleDics )
255{
256 // get list of languages supported by new style dictionaries
257 std::set< OUString > aNewStyleLanguages;
258 for (auto const& newStyleDic : rNewStyleDics)
259 {
260 const uno::Sequence< OUString > aLocaleNames(newStyleDic.aLocaleNames);
261 sal_Int32 nLocaleNames = aLocaleNames.getLength();
262 for (sal_Int32 k = 0; k < nLocaleNames; ++k)
263 {
264 aNewStyleLanguages.insert( aLocaleNames[k] );
265 }
266 }
267
268 // now check all old style dictionaries if they will add a not yet
269 // added language. If so add them to the resulting vector
270 for (auto const& oldStyleDic : rOldStyleDics)
271 {
272 sal_Int32 nOldStyleDics = oldStyleDic.aLocaleNames.getLength();
273
274 // old style dics should only have one language listed...
275 DBG_ASSERT( nOldStyleDics, "old style dictionary with more than one language found!");
276 if (nOldStyleDics > 0)
277 {
278 if (linguistic::LinguIsUnspecified( oldStyleDic.aLocaleNames[0]))
279 {
280 OSL_FAIL( "old style dictionary with invalid language found!" );
281 continue;
282 }
283
284 // language not yet added?
285 if (aNewStyleLanguages.find( oldStyleDic.aLocaleNames[0] ) == aNewStyleLanguages.end())
286 rNewStyleDics.push_back(oldStyleDic);
287 }
288 else
289 {
290 OSL_FAIL( "old style dictionary with no language found!" );
291 }
292 }
293}
294
295rtl_TextEncoding getTextEncodingFromCharset(const char* pCharset)
296{
297 // default result: used to indicate that we failed to get the proper encoding
298 rtl_TextEncoding eRet = RTL_TEXTENCODING_DONTKNOW;
299
300 if (pCharset)
301 {
302 eRet = rtl_getTextEncodingFromMimeCharset(pCharset);
303 if (eRet == RTL_TEXTENCODING_DONTKNOW)
304 eRet = rtl_getTextEncodingFromUnixCharset(pCharset);
305 if (eRet == RTL_TEXTENCODING_DONTKNOW)
306 {
307 if (strcmp("ISCII-DEVANAGARI", pCharset) == 0)
308 eRet = RTL_TEXTENCODING_ISCII_DEVANAGARI;
309 }
310 }
311 return eRet;
312}
313
314/* vim:set shiftwidth=4 softtabstop=4 expandtab: */
static bool getProcessWorkingDir(OUString &rUrl)
#define DBG_ASSERT(sCon, aError)
float u
sal_Int32 nIndex
std::vector< SvtLinguConfigDictionaryEntry > GetOldStyleDics(const char *pDicType)
Definition: lingutil.cxx:146
void MergeNewStyleDicsAndOldStyleDics(std::vector< SvtLinguConfigDictionaryEntry > &rNewStyleDics, const std::vector< SvtLinguConfigDictionaryEntry > &rOldStyleDics)
Definition: lingutil.cxx:252
rtl_TextEncoding getTextEncodingFromCharset(const char *pCharset)
Definition: lingutil.cxx:295
std::unique_ptr< sal_Int32[]> pData
bool LinguIsUnspecified(LanguageType nLanguage)
css::uno::Sequence< OUString > aLocaleNames
css::uno::Sequence< OUString > aLocations