LibreOffice Module helpcompiler (master) 1
HelpIndexer.cxx
Go to the documentation of this file.
1/* -*- Mode: C++; tab-width: 4; indent-tabs-mode: nil; c-basic-offset: 4 -*- */
2/*
3 * This file is part of the LibreOffice project.
4 *
5 * This Source Code Form is subject to the terms of the Mozilla Public
6 * License, v. 2.0. If a copy of the MPL was not distributed with this
7 * file, You can obtain one at http://mozilla.org/MPL/2.0/.
8 */
9
11
12#include <rtl/string.hxx>
13#include <rtl/uri.hxx>
15#include <osl/file.hxx>
16#include <osl/thread.h>
17#include <o3tl/string_view.hxx>
18#include <memory>
19#include <utility>
20
21#include "LuceneHelper.hxx"
22#include <CLucene.h>
23#include <CLucene/analysis/LanguageBasedAnalyzer.h>
24
25#if defined _WIN32
27#include <prewin.h>
28#include <postwin.h>
29#endif
30
31using namespace lucene::document;
32
33HelpIndexer::HelpIndexer(OUString lang, OUString module,
34 std::u16string_view srcDir, std::u16string_view outDir)
35 : d_lang(std::move(lang)), d_module(std::move(module))
36{
37 d_indexDir = outDir + OUStringChar('/') + d_module + ".idxl";
38 d_captionDir = OUString::Concat(srcDir) + "/caption";
39 d_contentDir = OUString::Concat(srcDir) + "/content";
40}
41
#if defined _WIN32
namespace
{
// Calls constructor with ustrPath converted to an 8-bit string and returns
// whatever constructor returns.
//
// The first attempt uses the path converted to the thread text encoding. If
// that attempt throws CLuceneError, the call is retried once with the short
// (8.3) form of the path. If no short form can be obtained, the original
// CLuceneError is rethrown; an exception from the second attempt propagates
// to the caller.
template <class Constructor>
auto TryWithUnicodePathWorkaround(const OUString& ustrPath, const Constructor& constructor)
{
    const rtl_TextEncoding eEncoding = osl_getThreadTextEncoding();
    OString sNarrowPath = OUStringToOString(ustrPath, eEncoding);
    try
    {
        // Attempt 1: the long path in thread encoding (ACP on Windows).
        return constructor(sNarrowPath);
    }
    catch (const CLuceneError&)
    {
        // The path may contain characters that ACP cannot represent. Lucene
        // offers no API taking Unicode strings: it takes 8-bit strings and
        // hands them to CRT functions without conversion.
        //
        // Attempt 2: the short name, which should consist of ASCII characters
        // only. This does not help (the original long name comes back) when
        // 8.3 name creation is disabled in the OS or volume settings.
        wchar_t aShortName[32767];
        const auto nWritten
            = GetShortPathNameW(o3tl::toW(ustrPath.getStr()), aShortName, std::size(aShortName));
        if (nWritten == 0)
            throw; // no short name available: surface the original error
        sNarrowPath = OUStringToOString(o3tl::toU(aShortName), eEncoding);
        return constructor(sNarrowPath);
    }
}
}
#endif
73
75{
76 if (!scanForFiles())
77 return false;
78
79 try
80 {
81 std::u16string_view sLang = o3tl::getToken(d_lang, 0, '-');
82 bool bUseCJK = sLang == u"ja" || sLang == u"ko" || sLang == u"zh";
83
84 // Construct the analyzer appropriate for the given language
85 std::unique_ptr<lucene::analysis::Analyzer> analyzer;
86 if (bUseCJK)
87 analyzer.reset(new lucene::analysis::LanguageBasedAnalyzer(L"cjk"));
88 else
89 analyzer.reset(new lucene::analysis::standard::StandardAnalyzer());
90
91 OUString ustrSystemPath;
92 osl::File::getSystemPathFromFileURL(d_indexDir, ustrSystemPath);
93
94#if defined _WIN32
95 // Make sure the path exists, or GetShortPathNameW (if attempted) will fail.
96 osl::Directory::createPath(d_indexDir);
97 auto writer = TryWithUnicodePathWorkaround(ustrSystemPath, [&analyzer](const OString& s) {
98 return std::make_unique<lucene::index::IndexWriter>(s.getStr(), analyzer.get(), true);
99 });
100#else
101 OString indexDirStr = OUStringToOString(ustrSystemPath, osl_getThreadTextEncoding());
102 auto writer = std::make_unique<lucene::index::IndexWriter>(indexDirStr.getStr(),
103 analyzer.get(), true);
104#endif
105
106 //Double limit of tokens allowed, otherwise we'll get a too-many-tokens
107 //exception for ja help. Could alternative ignore the exception and get
108 //truncated results as per java-Lucene apparently
109 writer->setMaxFieldLength(lucene::index::IndexWriter::DEFAULT_MAX_FIELD_LENGTH*2);
110
111 // Index the identified help files
112 Document doc;
113 for (auto const& elem : d_files)
114 {
115 helpDocument(elem, &doc);
116 writer->addDocument(&doc);
117 doc.clear();
118 }
119
120 // Optimize the index
121 writer->optimize();
122 }
123 catch (CLuceneError &e)
124 {
126 return false;
127 }
128
129 return true;
130}
131
132
135 return false;
136 }
138 return false;
139 }
140 return true;
141}
142
143bool HelpIndexer::scanForFiles(OUString const & path) {
144
145 osl::Directory dir(path);
146 if (osl::FileBase::E_None != dir.open()) {
147 d_error = "Error reading directory " + path;
148 return false;
149 }
150
151 osl::DirectoryItem item;
152 osl::FileStatus fileStatus(osl_FileStatus_Mask_FileName | osl_FileStatus_Mask_Type);
153 while (dir.getNextItem(item) == osl::FileBase::E_None) {
154 item.getFileStatus(fileStatus);
155 if (fileStatus.getFileType() == osl::FileStatus::Regular) {
156 d_files.insert(fileStatus.getFileName());
157 }
158 }
159
160 return true;
161}
162
163void HelpIndexer::helpDocument(OUString const & fileName, Document *doc) const {
164 // Add the help path as an indexed, untokenized field.
165
166 OUString path = "#HLP#" + d_module + "/" + fileName;
167 std::vector<TCHAR> aPath(OUStringToTCHARVec(path));
168 doc->add(*_CLNEW Field(_T("path"), aPath.data(), int(Field::STORE_YES) | int(Field::INDEX_UNTOKENIZED)));
169
170 OUString sEscapedFileName =
171 rtl::Uri::encode(fileName,
172 rtl_UriCharClassUric, rtl_UriEncodeIgnoreEscapes, RTL_TEXTENCODING_UTF8);
173
174 // Add the caption as a field.
175 OUString captionPath = d_captionDir + "/" + sEscapedFileName;
176 doc->add(*_CLNEW Field(_T("caption"), helpFileReader(captionPath), int(Field::STORE_NO) | int(Field::INDEX_TOKENIZED)));
177
178 // Add the content as a field.
179 OUString contentPath = d_contentDir + "/" + sEscapedFileName;
180 doc->add(*_CLNEW Field(_T("content"), helpFileReader(contentPath), int(Field::STORE_NO) | int(Field::INDEX_TOKENIZED)));
181}
182
183lucene::util::Reader *HelpIndexer::helpFileReader(OUString const & path) {
184 osl::File file(path);
185 if (osl::FileBase::E_None == file.open(osl_File_OpenFlag_Read)) {
186 file.close();
187 OUString ustrSystemPath;
188 osl::File::getSystemPathFromFileURL(path, ustrSystemPath);
189#if defined _WIN32
190 return TryWithUnicodePathWorkaround(ustrSystemPath, [](const OString& s) {
191 return _CLNEW lucene::util::FileReader(s.getStr(), "UTF-8");
192 });
193#else
194 OString pathStr = OUStringToOString(ustrSystemPath, osl_getThreadTextEncoding());
195 return _CLNEW lucene::util::FileReader(pathStr.getStr(), "UTF-8");
196#endif
197 } else {
198 return _CLNEW lucene::util::StringReader(L"");
199 }
200}
201
202/* vim:set shiftwidth=4 softtabstop=4 expandtab: */
std::vector< TCHAR > OUStringToTCHARVec(OUString const &rStr)
bool indexDocuments()
Run the indexer.
Definition: HelpIndexer.cxx:74
bool scanForFiles()
Scan the caption & contents directories for help files.
OUString d_module
Definition: HelpIndexer.hxx:36
OUString d_captionDir
Definition: HelpIndexer.hxx:37
std::set< OUString > d_files
Definition: HelpIndexer.hxx:41
static lucene::util::Reader * helpFileReader(OUString const &path)
Create a reader for the given file, and create an "empty" reader in case the file doesn't exist.
OUString d_contentDir
Definition: HelpIndexer.hxx:38
OUString d_error
Definition: HelpIndexer.hxx:40
OUString d_indexDir
Definition: HelpIndexer.hxx:39
HelpIndexer(OUString lang, OUString module, std::u16string_view srcDir, std::u16string_view outDir)
Definition: HelpIndexer.cxx:33
void helpDocument(OUString const &fileName, lucene::document::Document *doc) const
Fill the Document with information on the given help file.
OUString d_lang
Definition: HelpIndexer.hxx:35
float u
static std::string encode(const std::string &rIn)
Definition: HelpLinker.cxx:203
OUString runtimeToOUString(char const *runtimeString)
std::basic_string_view< charT, traits > getToken(std::basic_string_view< charT, traits > sv, charT delimiter, std::size_t &position)
module
OString OUStringToOString(std::u16string_view str, ConnectionSettings const *settings)