LibreOffice Module sc (master)  1
htmldataprovider.cxx
Go to the documentation of this file.
1 /* -*- Mode: C++; tab-width: 4; indent-tabs-mode: nil; c-basic-offset: 4 -*- */
2 /*
3  * This file is part of the LibreOffice project.
4  *
5  * This Source Code Form is subject to the terms of the Mozilla Public
6  * License, v. 2.0. If a copy of the MPL was not distributed with this
7  * file, You can obtain one at http://mozilla.org/MPL/2.0/.
8  */
9 
10 #include "htmldataprovider.hxx"
11 #include <datamapper.hxx>
12 #include <datatransformation.hxx>
13 #include <salhelper/thread.hxx>
14 #include <vcl/svapp.hxx>
15 #include <tools/stream.hxx>
16 
17 #include <libxml/HTMLparser.h>
18 
19 #include <libxml/xpath.h>
20 
21 #include <comphelper/string.hxx>
22 #include <utility>
23 
24 namespace sc {
25 
27 {
29  OUString maURL;
30  OUString maID;
31  const std::vector<std::shared_ptr<sc::DataTransformation>> maDataTransformations;
32  std::function<void()> maImportFinishedHdl;
33 
34  void handleTable(xmlNodePtr pTable);
35  void handleRow(xmlNodePtr pRow, SCROW nRow);
36  void skipHeadBody(xmlNodePtr pSkip, SCROW& rRow);
37  void handleCell(xmlNodePtr pCell, SCROW nRow, SCCOL nCol);
38 
39 public:
40  HTMLFetchThread(ScDocument& rDoc, const OUString&, const OUString& rID, std::function<void()> aImportFinishedHdl,
41  const std::vector<std::shared_ptr<sc::DataTransformation>>& rTransformations);
42 
43  virtual void execute() override;
44 };
45 
47  ScDocument& rDoc, const OUString& rURL, const OUString& rID,
48  std::function<void()> aImportFinishedHdl,
49  const std::vector<std::shared_ptr<sc::DataTransformation>>& rTransformations)
50  : salhelper::Thread("HTML Fetch Thread")
51  , mrDocument(rDoc)
52  , maURL(rURL)
53  , maID(rID)
54  , maDataTransformations(rTransformations)
55  , maImportFinishedHdl(std::move(aImportFinishedHdl))
56 {
57 }
58 
59 namespace {
60 
61 OString toString(const xmlChar* pStr)
62 {
63  return OString(reinterpret_cast<const char*>(pStr), xmlStrlen(pStr));
64 }
65 
66 OUString trim_string(const OUString& aStr)
67 {
68  OUString aOldString;
69  OUString aString = aStr;
70  do
71  {
72  aOldString = aString;
73  aString = comphelper::string::strip(aString, ' ');
74  aString = comphelper::string::strip(aString, '\n');
75  aString = comphelper::string::strip(aString, '\r');
76  aString = comphelper::string::strip(aString, '\t');
77  }
78  while (aOldString != aString);
79 
80  return aString;
81 }
82 
83 OUString get_node_str(xmlNodePtr pNode)
84 {
85  OUStringBuffer aStr;
86  for (xmlNodePtr cur_node = pNode->children; cur_node; cur_node = cur_node->next)
87  {
88  if (cur_node->type == XML_TEXT_NODE)
89  {
90  OUString aString = OStringToOUString(toString(cur_node->content), RTL_TEXTENCODING_UTF8);
91  aStr.append(trim_string(aString));
92  }
93  else if (cur_node->type == XML_ELEMENT_NODE)
94  {
95  aStr.append(get_node_str(cur_node));
96  }
97  }
98 
99  return aStr.makeStringAndClear();
100 }
101 
102 }
103 
104 void HTMLFetchThread::handleCell(xmlNodePtr pCellNode, SCROW nRow, SCCOL nCol)
105 {
106  OUStringBuffer aStr;
107  for (xmlNodePtr cur_node = pCellNode->children; cur_node; cur_node = cur_node->next)
108  {
109  if (cur_node->type == XML_TEXT_NODE)
110  {
111  OUString aString = OStringToOUString(toString(cur_node->content), RTL_TEXTENCODING_UTF8);
112  aStr.append(trim_string(aString));
113  }
114  else if (cur_node->type == XML_ELEMENT_NODE)
115  {
116  aStr.append(get_node_str(cur_node));
117  }
118  }
119 
120  if (!aStr.isEmpty())
121  {
122  OUString aCellStr = aStr.makeStringAndClear();
123  mrDocument.SetString(nCol, nRow, 0, aCellStr);
124  }
125 }
126 
127 void HTMLFetchThread::handleRow(xmlNodePtr pRowNode, SCROW nRow)
128 {
129  sal_Int32 nCol = 0;
130  for (xmlNodePtr cur_node = pRowNode->children; cur_node; cur_node = cur_node->next)
131  {
132  if (cur_node->type == XML_ELEMENT_NODE)
133  {
134  OString aNodeName = toString(cur_node->name);
135  if (aNodeName == "td" || aNodeName == "th")
136  {
137  handleCell(cur_node, nRow, nCol);
138  ++nCol;
139  }
140  }
141  }
142 }
143 
144 void HTMLFetchThread::skipHeadBody(xmlNodePtr pSkipElement, SCROW& rRow)
145 {
146  for (xmlNodePtr cur_node = pSkipElement->children; cur_node; cur_node = cur_node->next)
147  {
148  if (cur_node->type == XML_ELEMENT_NODE)
149  {
150  OString aNodeName = toString(cur_node->name);
151  if (aNodeName == "tr")
152  {
153  handleRow(cur_node, rRow);
154  ++rRow;
155  }
156 
157  }
158  }
159 }
160 
161 void HTMLFetchThread::handleTable(xmlNodePtr pTable)
162 {
163  sal_Int32 nRow = 0;
164  for (xmlNodePtr cur_node = pTable->children; cur_node; cur_node = cur_node->next)
165  {
166  if (cur_node->type == XML_ELEMENT_NODE)
167  {
168  OString aNodeName = toString(cur_node->name);
169  if (aNodeName == "tr")
170  {
171  handleRow(cur_node, nRow);
172  ++nRow;
173  }
174  else if (aNodeName == "thead" || aNodeName == "tbody")
175  {
176  skipHeadBody(cur_node, nRow);
177  }
178  }
179  }
180 }
181 
183 {
184  OStringBuffer aBuffer(64000);
185  DataProvider::FetchStreamFromURL(maURL, aBuffer);
186 
187  if (aBuffer.isEmpty())
188  return;
189 
190  htmlDocPtr pHtmlPtr = htmlParseDoc(reinterpret_cast<xmlChar*>(const_cast<char*>(aBuffer.getStr())), nullptr);
191 
192  OString aID = OUStringToOString(maID, RTL_TEXTENCODING_UTF8);
193  xmlXPathContextPtr pXmlXpathCtx = xmlXPathNewContext(pHtmlPtr);
194  xmlXPathObjectPtr pXmlXpathObj = xmlXPathEvalExpression(BAD_CAST(aID.getStr()), pXmlXpathCtx);
195 
196  if (!pXmlXpathObj)
197  {
198  xmlXPathFreeContext(pXmlXpathCtx);
199  return;
200  }
201  xmlNodeSetPtr pXmlNodes = pXmlXpathObj->nodesetval;
202 
203  if (!pXmlNodes)
204  {
205  xmlXPathFreeNodeSetList(pXmlXpathObj);
206  xmlXPathFreeContext(pXmlXpathCtx);
207  return;
208  }
209 
210  if (pXmlNodes->nodeNr == 0)
211  {
212  xmlXPathFreeNodeSet(pXmlNodes);
213  xmlXPathFreeNodeSetList(pXmlXpathObj);
214  xmlXPathFreeContext(pXmlXpathCtx);
215  return;
216  }
217 
218  xmlNodePtr pNode = pXmlNodes->nodeTab[0];
219  handleTable(pNode);
220 
221  xmlXPathFreeNodeSet(pXmlNodes);
222  xmlXPathFreeNodeSetList(pXmlXpathObj);
223  xmlXPathFreeContext(pXmlXpathCtx);
224 
225  for (auto& itr : maDataTransformations)
226  {
227  itr->Transform(mrDocument);
228  }
229 
230  SolarMutexGuard aGuard;
232 }
233 
235  DataProvider(rDataSource),
236  mpDocument(pDoc)
237 {
238 }
239 
241 {
242  if (mxHTMLFetchThread.is())
243  {
244  SolarMutexReleaser aReleaser;
245  mxHTMLFetchThread->join();
246  }
247 }
248 
250 {
251  // already importing data
252  if (mpDoc)
253  return;
254 
255  mpDoc.reset(new ScDocument(SCDOCMODE_CLIP));
256  mpDoc->ResetClip(mpDocument, SCTAB(0));
259  mxHTMLFetchThread->launch();
260 
261  if (mbDeterministic)
262  {
263  SolarMutexReleaser aReleaser;
264  mxHTMLFetchThread->join();
265  }
266 }
267 
269 {
271  mxHTMLFetchThread.clear();
272  mpDoc.reset();
273 }
274 
275 const OUString& HTMLDataProvider::GetURL() const
276 {
277  return mrDataSource.getURL();
278 }
279 
280 }
281 
282 /* vim:set shiftwidth=4 softtabstop=4 expandtab: */
sc::ExternalDataSource & mrDataSource
OString strip(std::string_view rIn, char c)
Abstract class for all data provider.
virtual ~HTMLDataProvider() override
std::function< void()> maImportFinishedHdl
bool mbDeterministic
If true make the threaded import deterministic for the tests.
OString OUStringToOString(std::u16string_view str, ConnectionSettings const *settings)
HTMLFetchThread(ScDocument &rDoc, const OUString &, const OUString &rID, std::function< void()> aImportFinishedHdl, const std::vector< std::shared_ptr< sc::DataTransformation >> &rTransformations)
static std::unique_ptr< SvStream > FetchStreamFromURL(const OUString &, OStringBuffer &rBuffer)
const std::vector< std::shared_ptr< sc::DataTransformation > > maDataTransformations
void WriteToDoc(ScDocument &rDoc)
SC_DLLPUBLIC bool SetString(SCCOL nCol, SCROW nRow, SCTAB nTab, const OUString &rString, const ScSetStringParam *pParam=nullptr)
Definition: document.cxx:3413
virtual const OUString & GetURL() const override
ScDocumentUniquePtr mpDoc
sal_Int16 SCCOL
Definition: types.hxx:21
void skipHeadBody(xmlNodePtr pSkip, SCROW &rRow)
void handleRow(xmlNodePtr pRow, SCROW nRow)
const OUString & getURL() const
rtl::Reference< HTMLFetchThread > mxHTMLFetchThread
HTMLDataProvider(ScDocument *pDoc, sc::ExternalDataSource &rDataSource)
void handleCell(xmlNodePtr pCell, SCROW nRow, SCCOL nCol)
ScDBDataManager * getDBManager()
sal_Int32 SCROW
Definition: types.hxx:17
std::unique_ptr< char[]> aBuffer
virtual void Import() override
void handleTable(xmlNodePtr pTable)
virtual void execute() override
const std::vector< std::shared_ptr< sc::DataTransformation > > & getDataTransformation() const
const OUString & getID() const
sal_Int16 SCTAB
Definition: types.hxx:22
OUString toString(OptionInfo const *info)