LibreOffice Module lingucomponent (master)  1
lingutil.cxx
Go to the documentation of this file.
1 /* -*- Mode: C++; tab-width: 4; indent-tabs-mode: nil; c-basic-offset: 4 -*- */
2 /*
3  * This file is part of the LibreOffice project.
4  *
5  * This Source Code Form is subject to the terms of the Mozilla Public
6  * License, v. 2.0. If a copy of the MPL was not distributed with this
7  * file, You can obtain one at http://mozilla.org/MPL/2.0/.
8  *
9  * This file incorporates work covered by the following license notice:
10  *
11  * Licensed to the Apache Software Foundation (ASF) under one or more
12  * contributor license agreements. See the NOTICE file distributed
13  * with this work for additional information regarding copyright
14  * ownership. The ASF licenses this file to you under the Apache
15  * License, Version 2.0 (the "License"); you may not use this file
16  * except in compliance with the License. You may obtain a copy of
17  * the License at http://www.apache.org/licenses/LICENSE-2.0 .
18  */
19 
20 #if defined(_WIN32)
21 #if !defined WIN32_LEAN_AND_MEAN
22 # define WIN32_LEAN_AND_MEAN
23 #endif
24 #include <windows.h>
25 #endif
26 
27 #include <osl/thread.h>
28 #include <osl/file.hxx>
29 #include <osl/process.h>
30 #include <tools/debug.hxx>
31 #include <tools/urlobj.hxx>
33 #include <i18nlangtag/mslangid.hxx>
34 #include <unotools/bootstrap.hxx>
35 #include <unotools/lingucfg.hxx>
36 #include <unotools/pathoptions.hxx>
37 #include <rtl/ustring.hxx>
38 #include <rtl/string.hxx>
39 #include <rtl/tencinfo.h>
40 #include <linguistic/misc.hxx>
41 
42 #include <set>
43 #include <vector>
44 #include <string.h>
45 
46 #include "lingutil.hxx"
47 
48 #include <sal/macros.h>
49 
50 using namespace ::com::sun::star;
51 
52 #if defined(_WIN32)
53 OString Win_AddLongPathPrefix( const OString &rPathName )
54 {
55 #define WIN32_LONG_PATH_PREFIX "\\\\?\\"
56  if (!rPathName.match(WIN32_LONG_PATH_PREFIX)) return WIN32_LONG_PATH_PREFIX + rPathName;
57  return rPathName;
58 }
59 #endif // defined(_WIN32)
60 
61 #ifdef SYSTEM_DICTS
62 // find old style dictionaries in system directories
63 static void GetOldStyleDicsInDir(
64  OUString const & aSystemDir, OUString const & aFormatName,
65  OUString const & aSystemSuffix, OUString const & aSystemPrefix,
66  std::set< OUString >& aDicLangInUse,
67  std::vector< SvtLinguConfigDictionaryEntry >& aRes )
68 {
69  osl::Directory aSystemDicts(aSystemDir);
70  if (aSystemDicts.open() == osl::FileBase::E_None)
71  {
72  osl::DirectoryItem aItem;
73  osl::FileStatus aFileStatus(osl_FileStatus_Mask_FileURL);
74  while (aSystemDicts.getNextItem(aItem) == osl::FileBase::E_None)
75  {
76  aItem.getFileStatus(aFileStatus);
77  OUString sPath = aFileStatus.getFileURL();
78  if (sPath.endsWith(aSystemSuffix))
79  {
80  sal_Int32 nStartIndex = sPath.lastIndexOf('/') + 1;
81  if (!sPath.match(aSystemPrefix, nStartIndex))
82  continue;
83  OUString sChunk = sPath.copy(nStartIndex + aSystemPrefix.getLength(),
84  sPath.getLength() - aSystemSuffix.getLength() -
85  nStartIndex - aSystemPrefix.getLength());
86  if (sChunk.isEmpty())
87  continue;
88 
89  // We prefer (now) to use language tags.
90  // Avoid feeding in the older LANG_REGION scheme to the BCP47
91  // ctor as that triggers use of liblangtag and initializes its
92  // database which we do not want during startup. Convert
93  // instead.
94  sChunk = sChunk.replace( '_', '-');
95 
96  // There's a known exception to the rule, the dreaded
97  // hu_HU_u8.dic of the myspell-hu package, see
98  // http://packages.debian.org/search?arch=any&searchon=contents&keywords=hu_HU_u8.dic
99  // This was ignored because unknown in the old implementation,
100  // truncate to the known locale and either insert because hu_HU
101  // wasn't encountered yet, or skip because it was. It doesn't
102  // really matter because the proper new-style hu_HU dictionary
103  // will take precedence anyway if installed with a Hungarian
104  // languagepack. Again, this is only to not pull in all
105  // liblangtag and stuff during startup, the result would be
106  // !isValidBcp47() and the dictionary ignored.
107  if (sChunk == "hu-HU-u8")
108  sChunk = "hu-HU";
109 
110  LanguageTag aLangTag(sChunk, true);
111  if (!aLangTag.isValidBcp47())
112  continue;
113 
114  // Thus we first get the language of the dictionary
115  const OUString& aLocaleName(aLangTag.getBcp47());
116 
117  if (aDicLangInUse.insert(aLocaleName).second)
118  {
119  // add the dictionary to the resulting vector
121  aDicEntry.aLocations.realloc(1);
122  aDicEntry.aLocaleNames.realloc(1);
123  aDicEntry.aLocations[0] = sPath;
124  aDicEntry.aFormatName = aFormatName;
125  aDicEntry.aLocaleNames[0] = aLocaleName;
126  aRes.push_back( aDicEntry );
127  }
128  }
129  }
130  }
131 }
132 #endif
133 
134 // build list of old style dictionaries (not as extensions) to use.
135 // User installed dictionaries (the ones residing in the user paths)
136 // will get precedence over system installed ones for the same language.
137 std::vector< SvtLinguConfigDictionaryEntry > GetOldStyleDics( const char *pDicType )
138 {
139  std::vector< SvtLinguConfigDictionaryEntry > aRes;
140 
141  if (!pDicType)
142  return aRes;
143 
144  OUString aFormatName;
145  OUString aDicExtension;
146 #ifdef SYSTEM_DICTS
147  OUString aSystemDir;
148  OUString aSystemPrefix;
149  OUString aSystemSuffix;
150 #endif
151  if (strcmp( pDicType, "DICT" ) == 0)
152  {
153  aFormatName = "DICT_SPELL";
154  aDicExtension = ".dic";
155 #ifdef SYSTEM_DICTS
156  aSystemDir = DICT_SYSTEM_DIR;
157  aSystemSuffix = aDicExtension;
158 #endif
159  }
160  else if (strcmp( pDicType, "HYPH" ) == 0)
161  {
162  aFormatName = "DICT_HYPH";
163  aDicExtension = ".dic";
164 #ifdef SYSTEM_DICTS
165  aSystemDir = HYPH_SYSTEM_DIR;
166  aSystemPrefix = "hyph_";
167  aSystemSuffix = aDicExtension;
168 #endif
169  }
170  else if (strcmp( pDicType, "THES" ) == 0)
171  {
172  aFormatName = "DICT_THES";
173  aDicExtension = ".dat";
174 #ifdef SYSTEM_DICTS
175  aSystemDir = THES_SYSTEM_DIR;
176  aSystemPrefix = "th_";
177  aSystemSuffix = "_v2.dat";
178 #endif
179  }
180 
181  if (aFormatName.isEmpty() || aDicExtension.isEmpty())
182  return aRes;
183 
184 #ifdef SYSTEM_DICTS
185  // set of languages to remember the language where it is already
186  // decided to make use of the dictionary.
187  std::set< OUString > aDicLangInUse;
188 
189  // follow the hunspell tool's example and check DICPATH for preferred dictionaries
190  rtl_uString * pSearchPath = nullptr;
191  osl_getEnvironment(OUString("DICPATH").pData, &pSearchPath);
192 
193  if (pSearchPath)
194  {
195  OUString aSearchPath(pSearchPath);
196  rtl_uString_release(pSearchPath);
197 
198  sal_Int32 nIndex = 0;
199  do
200  {
201  OUString aSystem = aSearchPath.getToken(0, ':', nIndex);
202  OUString aCWD;
203  OUString aRelative;
204  OUString aAbsolute;
205 
207  continue;
208  if (osl::FileBase::getFileURLFromSystemPath(aSystem, aRelative)
209  != osl::FileBase::E_None)
210  continue;
211  if (osl::FileBase::getAbsoluteFileURL(aCWD, aRelative, aAbsolute)
212  != osl::FileBase::E_None)
213  continue;
214 
215  // GetOldStyleDicsInDir will make sure the dictionary is the right
216  // type based on its prefix, that way hyphen, mythes and regular
217  // dictionaries can live in one directory
218  GetOldStyleDicsInDir(aAbsolute, aFormatName, aSystemSuffix,
219  aSystemPrefix, aDicLangInUse, aRes);
220  }
221  while (nIndex != -1);
222  }
223 
224  // load system directories last so that DICPATH prevails
225  GetOldStyleDicsInDir(aSystemDir, aFormatName, aSystemSuffix, aSystemPrefix,
226  aDicLangInUse, aRes);
227 #endif
228 
229  return aRes;
230 }
231 
233  std::vector< SvtLinguConfigDictionaryEntry > &rNewStyleDics,
234  const std::vector< SvtLinguConfigDictionaryEntry > &rOldStyleDics )
235 {
236  // get list of languages supported by new style dictionaries
237  std::set< OUString > aNewStyleLanguages;
238  for (auto const& newStyleDic : rNewStyleDics)
239  {
240  const uno::Sequence< OUString > aLocaleNames(newStyleDic.aLocaleNames);
241  sal_Int32 nLocaleNames = aLocaleNames.getLength();
242  for (sal_Int32 k = 0; k < nLocaleNames; ++k)
243  {
244  aNewStyleLanguages.insert( aLocaleNames[k] );
245  }
246  }
247 
248  // now check all old style dictionaries if they will add a not yet
249  // added language. If so add them to the resulting vector
250  for (auto const& oldStyleDic : rOldStyleDics)
251  {
252  sal_Int32 nOldStyleDics = oldStyleDic.aLocaleNames.getLength();
253 
254  // old style dics should only have one language listed...
255  DBG_ASSERT( nOldStyleDics, "old style dictionary with more than one language found!");
256  if (nOldStyleDics > 0)
257  {
258  if (linguistic::LinguIsUnspecified( oldStyleDic.aLocaleNames[0]))
259  {
260  OSL_FAIL( "old style dictionary with invalid language found!" );
261  continue;
262  }
263 
264  // language not yet added?
265  if (aNewStyleLanguages.find( oldStyleDic.aLocaleNames[0] ) == aNewStyleLanguages.end())
266  rNewStyleDics.push_back(oldStyleDic);
267  }
268  else
269  {
270  OSL_FAIL( "old style dictionary with no language found!" );
271  }
272  }
273 }
274 
275 rtl_TextEncoding getTextEncodingFromCharset(const sal_Char* pCharset)
276 {
277  // default result: used to indicate that we failed to get the proper encoding
278  rtl_TextEncoding eRet = RTL_TEXTENCODING_DONTKNOW;
279 
280  if (pCharset)
281  {
282  eRet = rtl_getTextEncodingFromMimeCharset(pCharset);
283  if (eRet == RTL_TEXTENCODING_DONTKNOW)
284  eRet = rtl_getTextEncodingFromUnixCharset(pCharset);
285  if (eRet == RTL_TEXTENCODING_DONTKNOW)
286  {
287  if (strcmp("ISCII-DEVANAGARI", pCharset) == 0)
288  eRet = RTL_TEXTENCODING_ISCII_DEVANAGARI;
289  }
290  }
291  return eRet;
292 }
293 
294 /* vim:set shiftwidth=4 softtabstop=4 expandtab: */
static bool getProcessWorkingDir(OUString &rUrl)
std::vector< SvtLinguConfigDictionaryEntry > GetOldStyleDics(const char *pDicType)
Definition: lingutil.cxx:137
char sal_Char
bool LinguIsUnspecified(LanguageType nLanguage)
#define DBG_ASSERT(sCon, aError)
css::uno::Sequence< OUString > aLocaleNames
void MergeNewStyleDicsAndOldStyleDics(std::vector< SvtLinguConfigDictionaryEntry > &rNewStyleDics, const std::vector< SvtLinguConfigDictionaryEntry > &rOldStyleDics)
Definition: lingutil.cxx:232
css::uno::Sequence< OUString > aLocations
rtl_TextEncoding getTextEncodingFromCharset(const sal_Char *pCharset)
Definition: lingutil.cxx:275