LibreOffice Module sc (master) 1
htmldataprovider.cxx
Go to the documentation of this file.
1/* -*- Mode: C++; tab-width: 4; indent-tabs-mode: nil; c-basic-offset: 4 -*- */
2/*
3 * This file is part of the LibreOffice project.
4 *
5 * This Source Code Form is subject to the terms of the Mozilla Public
6 * License, v. 2.0. If a copy of the MPL was not distributed with this
7 * file, You can obtain one at http://mozilla.org/MPL/2.0/.
8 */
9
10#include "htmldataprovider.hxx"
11#include <datamapper.hxx>
13#include <salhelper/thread.hxx>
14#include <utility>
15#include <vcl/svapp.hxx>
16#include <tools/stream.hxx>
17
18#include <libxml/HTMLparser.h>
19
20#include <libxml/xpath.h>
21
22#include <comphelper/string.hxx>
23
24namespace sc {
25
27{
29 OUString maURL;
30 OUString maID;
31 const std::vector<std::shared_ptr<sc::DataTransformation>> maDataTransformations;
32 std::function<void()> maImportFinishedHdl;
33
34 void handleTable(xmlNodePtr pTable);
35 void handleRow(xmlNodePtr pRow, SCROW nRow);
36 void skipHeadBody(xmlNodePtr pSkip, SCROW& rRow);
37 void handleCell(xmlNodePtr pCell, SCROW nRow, SCCOL nCol);
38
39public:
40 HTMLFetchThread(ScDocument& rDoc, const OUString&, const OUString& rID, std::function<void()> aImportFinishedHdl,
41 std::vector<std::shared_ptr<sc::DataTransformation>>&& rTransformations);
42
43 virtual void execute() override;
44};
45
47 ScDocument& rDoc, const OUString& rURL, const OUString& rID,
48 std::function<void()> aImportFinishedHdl,
49 std::vector<std::shared_ptr<sc::DataTransformation>>&& rTransformations)
50 : salhelper::Thread("HTML Fetch Thread")
51 , mrDocument(rDoc)
52 , maURL(rURL)
53 , maID(rID)
54 , maDataTransformations(std::move(rTransformations))
55 , maImportFinishedHdl(std::move(aImportFinishedHdl))
56{
57}
58
59namespace {
60
61OString toString(const xmlChar* pStr)
62{
63 return OString(reinterpret_cast<const char*>(pStr), xmlStrlen(pStr));
64}
65
66OUString trim_string(const OUString& aStr)
67{
68 OUString aOldString;
69 OUString aString = aStr;
70 do
71 {
72 aOldString = aString;
73 aString = comphelper::string::strip(aString, ' ');
74 aString = comphelper::string::strip(aString, '\n');
75 aString = comphelper::string::strip(aString, '\r');
76 aString = comphelper::string::strip(aString, '\t');
77 }
78 while (aOldString != aString);
79
80 return aString;
81}
82
83OUString get_node_str(xmlNodePtr pNode)
84{
85 OUStringBuffer aStr;
86 for (xmlNodePtr cur_node = pNode->children; cur_node; cur_node = cur_node->next)
87 {
88 if (cur_node->type == XML_TEXT_NODE)
89 {
90 OUString aString = OStringToOUString(toString(cur_node->content), RTL_TEXTENCODING_UTF8);
91 aStr.append(trim_string(aString));
92 }
93 else if (cur_node->type == XML_ELEMENT_NODE)
94 {
95 aStr.append(get_node_str(cur_node));
96 }
97 }
98
99 return aStr.makeStringAndClear();
100}
101
102}
103
104void HTMLFetchThread::handleCell(xmlNodePtr pCellNode, SCROW nRow, SCCOL nCol)
105{
106 OUStringBuffer aStr;
107 for (xmlNodePtr cur_node = pCellNode->children; cur_node; cur_node = cur_node->next)
108 {
109 if (cur_node->type == XML_TEXT_NODE)
110 {
111 OUString aString = OStringToOUString(toString(cur_node->content), RTL_TEXTENCODING_UTF8);
112 aStr.append(trim_string(aString));
113 }
114 else if (cur_node->type == XML_ELEMENT_NODE)
115 {
116 aStr.append(get_node_str(cur_node));
117 }
118 }
119
120 if (!aStr.isEmpty())
121 {
122 OUString aCellStr = aStr.makeStringAndClear();
123 mrDocument.SetString(nCol, nRow, 0, aCellStr);
124 }
125}
126
127void HTMLFetchThread::handleRow(xmlNodePtr pRowNode, SCROW nRow)
128{
129 sal_Int32 nCol = 0;
130 for (xmlNodePtr cur_node = pRowNode->children; cur_node; cur_node = cur_node->next)
131 {
132 if (cur_node->type == XML_ELEMENT_NODE)
133 {
134 OString aNodeName = toString(cur_node->name);
135 if (aNodeName == "td" || aNodeName == "th")
136 {
137 handleCell(cur_node, nRow, nCol);
138 ++nCol;
139 }
140 }
141 }
142}
143
144void HTMLFetchThread::skipHeadBody(xmlNodePtr pSkipElement, SCROW& rRow)
145{
146 for (xmlNodePtr cur_node = pSkipElement->children; cur_node; cur_node = cur_node->next)
147 {
148 if (cur_node->type == XML_ELEMENT_NODE)
149 {
150 OString aNodeName = toString(cur_node->name);
151 if (aNodeName == "tr")
152 {
153 handleRow(cur_node, rRow);
154 ++rRow;
155 }
156
157 }
158 }
159}
160
161void HTMLFetchThread::handleTable(xmlNodePtr pTable)
162{
163 sal_Int32 nRow = 0;
164 for (xmlNodePtr cur_node = pTable->children; cur_node; cur_node = cur_node->next)
165 {
166 if (cur_node->type == XML_ELEMENT_NODE)
167 {
168 OString aNodeName = toString(cur_node->name);
169 if (aNodeName == "tr")
170 {
171 handleRow(cur_node, nRow);
172 ++nRow;
173 }
174 else if (aNodeName == "thead" || aNodeName == "tbody")
175 {
176 skipHeadBody(cur_node, nRow);
177 }
178 }
179 }
180}
181
183{
184 OStringBuffer aBuffer(64000);
186
187 if (aBuffer.isEmpty())
188 return;
189
190 htmlDocPtr pHtmlPtr = htmlParseDoc(reinterpret_cast<xmlChar*>(const_cast<char*>(aBuffer.getStr())), nullptr);
191
192 OString aID = OUStringToOString(maID, RTL_TEXTENCODING_UTF8);
193 xmlXPathContextPtr pXmlXpathCtx = xmlXPathNewContext(pHtmlPtr);
194 xmlXPathObjectPtr pXmlXpathObj = xmlXPathEvalExpression(BAD_CAST(aID.getStr()), pXmlXpathCtx);
195
196 if (!pXmlXpathObj)
197 {
198 xmlXPathFreeContext(pXmlXpathCtx);
199 return;
200 }
201 xmlNodeSetPtr pXmlNodes = pXmlXpathObj->nodesetval;
202
203 if (!pXmlNodes)
204 {
205 xmlXPathFreeNodeSetList(pXmlXpathObj);
206 xmlXPathFreeContext(pXmlXpathCtx);
207 return;
208 }
209
210 if (pXmlNodes->nodeNr == 0)
211 {
212 xmlXPathFreeNodeSet(pXmlNodes);
213 xmlXPathFreeNodeSetList(pXmlXpathObj);
214 xmlXPathFreeContext(pXmlXpathCtx);
215 return;
216 }
217
218 xmlNodePtr pNode = pXmlNodes->nodeTab[0];
219 handleTable(pNode);
220
221 xmlXPathFreeNodeSet(pXmlNodes);
222 xmlXPathFreeNodeSetList(pXmlXpathObj);
223 xmlXPathFreeContext(pXmlXpathCtx);
224
225 for (auto& itr : maDataTransformations)
226 {
227 itr->Transform(mrDocument);
228 }
229
230 SolarMutexGuard aGuard;
232}
233
235 DataProvider(rDataSource),
236 mpDocument(pDoc)
237{
238}
239
241{
242 if (mxHTMLFetchThread.is())
243 {
244 SolarMutexReleaser aReleaser;
245 mxHTMLFetchThread->join();
246 }
247}
248
250{
251 // already importing data
252 if (mpDoc)
253 return;
254
255 mpDoc.reset(new ScDocument(SCDOCMODE_CLIP));
256 mpDoc->ResetClip(mpDocument, SCTAB(0));
259 mxHTMLFetchThread->launch();
260
261 if (mbDeterministic)
262 {
263 SolarMutexReleaser aReleaser;
264 mxHTMLFetchThread->join();
265 }
266}
267
269{
271}
272
273const OUString& HTMLDataProvider::GetURL() const
274{
275 return mrDataSource.getURL();
276}
277
278}
279
280/* vim:set shiftwidth=4 softtabstop=4 expandtab: */
SC_DLLPUBLIC bool SetString(SCCOL nCol, SCROW nRow, SCTAB nTab, const OUString &rString, const ScSetStringParam *pParam=nullptr)
Definition: document.cxx:3391
Abstract class for all data provider.
static std::unique_ptr< SvStream > FetchStreamFromURL(const OUString &, OStringBuffer &rBuffer)
bool mbDeterministic
If true make the threaded import deterministic for the tests.
sc::ExternalDataSource & mrDataSource
const OUString & getID() const
const std::vector< std::shared_ptr< sc::DataTransformation > > & getDataTransformation() const
const OUString & getURL() const
ScDBDataManager * getDBManager()
ScDocumentUniquePtr mpDoc
HTMLDataProvider(ScDocument *pDoc, sc::ExternalDataSource &rDataSource)
virtual const OUString & GetURL() const override
virtual void Import() override
rtl::Reference< HTMLFetchThread > mxHTMLFetchThread
virtual ~HTMLDataProvider() override
void handleCell(xmlNodePtr pCell, SCROW nRow, SCCOL nCol)
void skipHeadBody(xmlNodePtr pSkip, SCROW &rRow)
HTMLFetchThread(ScDocument &rDoc, const OUString &, const OUString &rID, std::function< void()> aImportFinishedHdl, std::vector< std::shared_ptr< sc::DataTransformation > > &&rTransformations)
virtual void execute() override
const std::vector< std::shared_ptr< sc::DataTransformation > > maDataTransformations
std::function< void()> maImportFinishedHdl
void handleRow(xmlNodePtr pRow, SCROW nRow)
void handleTable(xmlNodePtr pTable)
void WriteToDoc(ScDocument &rDoc)
@ SCDOCMODE_CLIP
Definition: document.hxx:257
aStr
OString strip(const OString &rIn, char c)
OString OUStringToOString(std::u16string_view str, ConnectionSettings const *settings)
CAUTION! The following defines must be in the same namespace as the respective type.
Definition: broadcast.cxx:15
OUString toString(OptionInfo const *info)
const URL maURL
sal_Int16 SCTAB
Definition: types.hxx:22
sal_Int16 SCCOL
Definition: types.hxx:21
sal_Int32 SCROW
Definition: types.hxx:17
std::unique_ptr< char[]> aBuffer