LibreOffice Module sax (master) 1
fastparser.cxx
Go to the documentation of this file.
1/* -*- Mode: C++; tab-width: 4; indent-tabs-mode: nil; c-basic-offset: 4 -*- */
2/*
3 * This file is part of the LibreOffice project.
4 *
5 * This Source Code Form is subject to the terms of the Mozilla Public
6 * License, v. 2.0. If a copy of the MPL was not distributed with this
7 * file, You can obtain one at http://mozilla.org/MPL/2.0/.
8 *
9 * This file incorporates work covered by the following license notice:
10 *
11 * Licensed to the Apache Software Foundation (ASF) under one or more
12 * contributor license agreements. See the NOTICE file distributed
13 * with this work for additional information regarding copyright
14 * ownership. The ASF licenses this file to you under the Apache
15 * License, Version 2.0 (the "License"); you may not use this file
16 * except in compliance with the License. You may obtain a copy of
17 * the License at http://www.apache.org/licenses/LICENSE-2.0 .
18 */
19
20#include <sax/fastparser.hxx>
21#include <sax/fastattribs.hxx>
22#include <utility>
23#include <xml2utf.hxx>
24
25#include <com/sun/star/io/XSeekable.hpp>
26#include <com/sun/star/lang/DisposedException.hpp>
27#include <com/sun/star/lang/IllegalArgumentException.hpp>
28#include <com/sun/star/uno/XComponentContext.hpp>
29#include <com/sun/star/xml/sax/FastToken.hpp>
30#include <com/sun/star/xml/sax/SAXParseException.hpp>
31#include <com/sun/star/xml/sax/XFastContextHandler.hpp>
35#include <osl/conditn.hxx>
36#include <rtl/ref.hxx>
37#include <rtl/ustrbuf.hxx>
38#include <sal/log.hxx>
39#include <salhelper/thread.hxx>
41#include <o3tl/string_view.hxx>
42
43#include <queue>
44#include <memory>
45#include <mutex>
46#include <optional>
47#include <stack>
48#include <string_view>
49#include <unordered_map>
50#include <vector>
51#include <cassert>
52#include <cstring>
53#include <libxml/parser.h>
54
// Counterpart of libxml's BAD_CAST: reinterprets a libxml string pointer
// (xmlChar based) as a plain const char pointer without copying anything.
#define XML_CAST(str) reinterpret_cast<const char*>(str)
57
58using namespace ::osl;
59using namespace ::cppu;
60using namespace ::com::sun::star::uno;
61using namespace ::com::sun::star::lang;
62using namespace ::com::sun::star::xml::sax;
63using namespace ::com::sun::star::io;
64using namespace com::sun::star;
65using namespace sax_fastparser;
66
67static void NormalizeURI( OUString& rName );
68
69namespace {
70
71struct Event;
72class FastLocatorImpl;
73struct NamespaceDefine;
74struct Entity;
75
76typedef std::unordered_map< OUString, sal_Int32 > NamespaceMap;
77
78struct EventList
79{
80 std::vector<Event> maEvents;
81 bool mbIsAttributesEmpty;
82};
83
// Kind of a queued parser event; selects what the consumer does with it.
enum class CallbackType
{
    START_ELEMENT,          // an element was opened
    END_ELEMENT,            // an element was closed
    CHARACTERS,             // character data
    PROCESSING_INSTRUCTION, // a processing instruction (target, data)
    DONE,                   // end marker: no further events follow
    EXCEPTION               // the producer failed; consumer has to throw
};
85
86struct Event
87{
88 CallbackType maType;
89 sal_Int32 mnElementToken;
90 OUString msNamespace;
91 OUString msElementName;
94 OUString msChars;
95};
96
97struct NameWithToken
98{
99 OUString msName;
100 sal_Int32 mnToken;
101
102 NameWithToken(OUString sName, sal_Int32 nToken) :
103 msName(std::move(sName)), mnToken(nToken) {}
104};
105
106struct SaxContext
107{
108 Reference< XFastContextHandler > mxContext;
109 sal_Int32 mnElementToken;
110 std::optional<OUString> moNamespace;
111 std::optional<OUString> moElementName;
112
113 SaxContext( sal_Int32 nElementToken, const OUString& aNamespace, const OUString& aElementName ):
114 mnElementToken(nElementToken)
115 {
116 if (nElementToken == FastToken::DONTKNOW)
117 {
118 moNamespace = aNamespace;
119 moElementName = aElementName;
120 }
121 }
122};
123
124struct ParserData
125{
126 css::uno::Reference< css::xml::sax::XFastDocumentHandler > mxDocumentHandler;
128 css::uno::Reference< css::xml::sax::XErrorHandler > mxErrorHandler;
129 css::uno::Reference< css::xml::sax::XFastNamespaceHandler >mxNamespaceHandler;
130
131 ParserData();
132};
133
134struct NamespaceDefine
135{
136 OString maPrefix;
137 sal_Int32 mnToken;
138 OUString maNamespaceURL;
139
140 NamespaceDefine( OString aPrefix, sal_Int32 nToken, OUString aNamespaceURL )
141 : maPrefix(std::move( aPrefix )), mnToken( nToken ), maNamespaceURL(std::move( aNamespaceURL )) {}
142 NamespaceDefine() : mnToken(-1) {}
143};
144
145// Entity binds all information needed for a single file | single call of parseStream
146struct Entity : public ParserData
147{
148 // Amount of work producer sends to consumer in one iteration:
149 static const size_t mnEventListSize = 1000;
150
151 // unique for each Entity instance:
152
153 // Number of valid events in mxProducedEvents:
154 size_t mnProducedEventsSize;
155 std::optional<EventList> mxProducedEvents;
156 std::queue<EventList> maPendingEvents;
157 std::queue<EventList> maUsedEvents;
158 std::mutex maEventProtector;
159
160 static const size_t mnEventLowWater = 4;
161 static const size_t mnEventHighWater = 8;
162 osl::Condition maConsumeResume;
163 osl::Condition maProduceResume;
164 // Event we use to store data if threading is disabled:
165 Event maSharedEvent;
166
167 // copied in copy constructor:
168
169 // Allow to disable threading for small documents:
170 bool mbEnableThreads;
171 css::xml::sax::InputSource maStructSource;
172 xmlParserCtxtPtr mpParser;
174
175 // Exceptions cannot be thrown through the C-XmlParser (possible
176 // resource leaks), therefore any exception thrown by a UNO callback
177 // must be saved somewhere until the C-XmlParser is stopped.
178 css::uno::Any maSavedException;
179 std::mutex maSavedExceptionMutex;
180 void saveException( const Any & e );
181 // Thread-safe check if maSavedException has value
182 bool hasException();
183 void throwException( const ::rtl::Reference< FastLocatorImpl > &xDocumentLocator,
184 bool mbDuringParse );
185
186 std::stack< NameWithToken, std::vector<NameWithToken> > maNamespaceStack;
187 /* Context for main thread consuming events.
188 * startElement() stores the data, which characters() and endElement() uses
189 */
190 std::stack< SaxContext, std::vector<SaxContext> > maContextStack;
191 // Determines which elements of maNamespaceDefines are valid in current context
192 std::stack< sal_uInt32, std::vector<sal_uInt32> > maNamespaceCount;
193 std::vector< NamespaceDefine > maNamespaceDefines;
194
195 explicit Entity( const ParserData& rData );
196 Entity( const Entity& rEntity ) = delete;
197 Entity& operator=( const Entity& rEntity ) = delete;
198 void startElement( Event const *pEvent );
199 void characters( const OUString& sChars );
200 void endElement();
201 void processingInstruction( const OUString& rTarget, const OUString& rData );
202 EventList& getEventList();
203 Event& getEvent( CallbackType aType );
204};
205
206// Stuff for custom entity names
207struct ReplacementPair
208{
209 OUString name;
210 OUString replacement;
211};
212inline bool operator<(const ReplacementPair& lhs, const ReplacementPair& rhs)
213{
214 return lhs.name < rhs.name;
215}
216inline bool operator<(const ReplacementPair& lhs, const char* rhs)
217{
218 return lhs.name.compareToAscii(rhs) < 0;
219}
220
221} // namespace
222
223namespace sax_fastparser {
224
226{
227public:
228 explicit FastSaxParserImpl();
230
231private:
232 std::vector<ReplacementPair> m_Replacements;
233 std::vector<xmlEntityPtr> m_TemporalEntities;
234
235public:
236 // XFastParser
240 void parseStream( const css::xml::sax::InputSource& aInputSource );
242 void setFastDocumentHandler( const css::uno::Reference< css::xml::sax::XFastDocumentHandler >& Handler );
244 void setTokenHandler( const css::uno::Reference< css::xml::sax::XFastTokenHandler >& Handler );
247 void registerNamespace( const OUString& NamespaceURL, sal_Int32 NamespaceToken );
250 OUString const & getNamespaceURL( std::u16string_view rPrefix );
252 void setErrorHandler( const css::uno::Reference< css::xml::sax::XErrorHandler >& Handler );
254 void setNamespaceHandler( const css::uno::Reference< css::xml::sax::XFastNamespaceHandler >& Handler);
255 // Fake DTD file
257 const ::css::uno::Sequence<::css::beans::Pair<::rtl::OUString, ::rtl::OUString>>& replacements);
258
259 // called by the C callbacks of the libxml2 parser
260 void callbackStartElement( const xmlChar *localName , const xmlChar* prefix, const xmlChar* URI,
261 int numNamespaces, const xmlChar** namespaces, int numAttributes, const xmlChar **attributes );
262 void callbackEndElement();
263 void callbackCharacters( const xmlChar* s, int nLen );
264 void callbackProcessingInstruction( const xmlChar *target, const xmlChar *data );
265 xmlEntityPtr callbackGetEntity( const xmlChar *name );
266
267 void pushEntity(const ParserData&, xml::sax::InputSource const&);
268 void popEntity();
269 Entity& getEntity() { return *mpTop; }
270 void parse();
271 void produce( bool bForceFlush = false );
274
275private:
276 bool consume(EventList&);
277 void deleteUsedEvents();
279 void addUnknownElementWithPrefix(const xmlChar **attributes, int i, rtl::Reference< FastAttributeList > const & xAttributes);
280
281 sal_Int32 GetToken( const xmlChar* pName );
283 sal_Int32 GetTokenWithPrefix( const xmlChar* pPrefix, const xmlChar* pName );
285 OUString const & GetNamespaceURL( std::string_view rPrefix );
286 sal_Int32 GetNamespaceToken( const OUString& rNamespaceURL );
287 sal_Int32 GetTokenWithContextNamespace( sal_Int32 nNamespaceToken, const xmlChar* pName );
288 void DefineNamespace( const OString& rPrefix, const OUString& namespaceURL );
289
290private:
291 std::mutex maMutex;
293 NamespaceMap maNamespaceMap;
294
295 ParserData maData;
296
297 Entity *mpTop;
298 std::stack< Entity > maEntities;
299 std::vector<char> pendingCharacters;
300};
301
302} // namespace sax_fastparser
303
304namespace {
305
306class ParserThread: public salhelper::Thread
307{
308 FastSaxParserImpl *mpParser;
309public:
310 explicit ParserThread(FastSaxParserImpl *pParser): Thread("Parser"), mpParser(pParser) {}
311private:
312 virtual void execute() override
313 {
314 try
315 {
316 mpParser->parse();
317 }
318 catch (...)
319 {
320 Entity &rEntity = mpParser->getEntity();
321 rEntity.getEvent( CallbackType::EXCEPTION );
322 mpParser->produce( true );
323 }
324 }
325};
326
327extern "C" {
328
329static void call_callbackStartElement(void *userData, const xmlChar *localName , const xmlChar* prefix, const xmlChar* URI,
330 int numNamespaces, const xmlChar** namespaces, int numAttributes, int /*defaultedAttributes*/, const xmlChar **attributes)
331{
332 FastSaxParserImpl* pFastParser = static_cast<FastSaxParserImpl*>( userData );
333 pFastParser->callbackStartElement( localName, prefix, URI, numNamespaces, namespaces, numAttributes, attributes );
334}
335
336static void call_callbackEndElement(void *userData, const xmlChar* /*localName*/, const xmlChar* /*prefix*/, const xmlChar* /*URI*/)
337{
338 FastSaxParserImpl* pFastParser = static_cast<FastSaxParserImpl*>( userData );
339 pFastParser->callbackEndElement();
340}
341
342static void call_callbackCharacters( void *userData , const xmlChar *s , int nLen )
343{
344 FastSaxParserImpl* pFastParser = static_cast<FastSaxParserImpl*>( userData );
345 pFastParser->callbackCharacters( s, nLen );
346}
347
348static void call_callbackProcessingInstruction( void *userData, const xmlChar *target, const xmlChar *data )
349{
350 FastSaxParserImpl* pFastParser = static_cast<FastSaxParserImpl*>( userData );
351 pFastParser->callbackProcessingInstruction( target, data );
352}
353
354static xmlEntityPtr call_callbackGetEntity( void *userData, const xmlChar *name)
355{
356 FastSaxParserImpl* pFastParser = static_cast<FastSaxParserImpl*>( userData );
357 return pFastParser->callbackGetEntity( name );
358}
359
360}
361
362class FastLocatorImpl : public WeakImplHelper< XLocator >
363{
364public:
365 explicit FastLocatorImpl(FastSaxParserImpl *p) : mpParser(p) {}
366
367 void dispose() { mpParser = nullptr; }
369 void checkDispose() const { if( !mpParser ) throw DisposedException(); }
370
371 //XLocator
372 virtual sal_Int32 SAL_CALL getColumnNumber() override;
373 virtual sal_Int32 SAL_CALL getLineNumber() override;
374 virtual OUString SAL_CALL getPublicId() override;
375 virtual OUString SAL_CALL getSystemId() override;
376
377private:
378 FastSaxParserImpl *mpParser;
379};
380
381sal_Int32 SAL_CALL FastLocatorImpl::getColumnNumber()
382{
383 checkDispose();
384 return xmlSAX2GetColumnNumber( mpParser->getEntity().mpParser );
385}
386
387sal_Int32 SAL_CALL FastLocatorImpl::getLineNumber()
388{
389 checkDispose();
390 return xmlSAX2GetLineNumber( mpParser->getEntity().mpParser );
391}
392
393OUString SAL_CALL FastLocatorImpl::getPublicId()
394{
395 checkDispose();
396 return mpParser->getEntity().maStructSource.sPublicId;
397}
398
399OUString SAL_CALL FastLocatorImpl::getSystemId()
400{
401 checkDispose();
402 return mpParser->getEntity().maStructSource.sSystemId;
403}
404
405ParserData::ParserData()
406{}
407
408Entity::Entity(const ParserData& rData)
409 : ParserData(rData)
410 , mnProducedEventsSize(0)
411 , mbEnableThreads(false)
412 , mpParser(nullptr)
413{
414}
415
416void Entity::startElement( Event const *pEvent )
417{
418 const sal_Int32& nElementToken = pEvent->mnElementToken;
419 const OUString& aNamespace = pEvent->msNamespace;
420 const OUString& aElementName = pEvent->msElementName;
421
422 // Use un-wrapped pointers to avoid significant acquire/release overhead
423 XFastContextHandler *pParentContext = nullptr;
424 if( !maContextStack.empty() )
425 {
426 pParentContext = maContextStack.top().mxContext.get();
427 if( !pParentContext )
428 {
429 maContextStack.push( SaxContext(nElementToken, aNamespace, aElementName) );
430 return;
431 }
432 }
433
434 maContextStack.push( SaxContext( nElementToken, aNamespace, aElementName ) );
435
436 try
437 {
438 const Reference< XFastAttributeList > & xAttr( pEvent->mxAttributes );
439 Reference< XFastContextHandler > xContext;
440
441 if ( mxNamespaceHandler.is() )
442 {
443 const Sequence< xml::Attribute > NSDeclAttribs = pEvent->mxDeclAttributes->getUnknownAttributes();
444 for (const auto& rNSDeclAttrib : NSDeclAttribs)
445 {
446 mxNamespaceHandler->registerNamespace( rNSDeclAttrib.Name, rNSDeclAttrib.Value );
447 }
448 }
449
450 if( nElementToken == FastToken::DONTKNOW )
451 {
452 if( pParentContext )
453 xContext = pParentContext->createUnknownChildContext( aNamespace, aElementName, xAttr );
454 else if( mxDocumentHandler.is() )
455 xContext = mxDocumentHandler->createUnknownChildContext( aNamespace, aElementName, xAttr );
456
457 if( xContext.is() )
458 {
459 xContext->startUnknownElement( aNamespace, aElementName, xAttr );
460 }
461 }
462 else
463 {
464 if( pParentContext )
465 xContext = pParentContext->createFastChildContext( nElementToken, xAttr );
466 else if( mxDocumentHandler.is() )
467 xContext = mxDocumentHandler->createFastChildContext( nElementToken, xAttr );
468
469 if( xContext.is() )
470 xContext->startFastElement( nElementToken, xAttr );
471 }
472 // swap the reference we own in to avoid referencing thrash.
473 maContextStack.top().mxContext = std::move( xContext );
474 }
475 catch (...)
476 {
477 saveException( ::cppu::getCaughtException() );
478 }
479}
480
481void Entity::characters( const OUString& sChars )
482{
483 if (maContextStack.empty())
484 {
485 // Malformed XML stream !?
486 return;
487 }
488
489 XFastContextHandler * pContext( maContextStack.top().mxContext.get() );
490 if( pContext ) try
491 {
492 pContext->characters( sChars );
493 }
494 catch (...)
495 {
496 saveException( ::cppu::getCaughtException() );
497 }
498}
499
500void Entity::endElement()
501{
502 if (maContextStack.empty())
503 {
504 // Malformed XML stream !?
505 return;
506 }
507
508 const SaxContext& aContext = maContextStack.top();
509 XFastContextHandler* pContext( aContext.mxContext.get() );
510 if( pContext )
511 try
512 {
513 sal_Int32 nElementToken = aContext.mnElementToken;
514 if( nElementToken != FastToken::DONTKNOW )
515 pContext->endFastElement( nElementToken );
516 else
517 pContext->endUnknownElement( *aContext.moNamespace, *aContext.moElementName );
518 }
519 catch (...)
520 {
521 saveException( ::cppu::getCaughtException() );
522 }
523 maContextStack.pop();
524}
525
526void Entity::processingInstruction( const OUString& rTarget, const OUString& rData )
527{
528 if( mxDocumentHandler.is() ) try
529 {
530 mxDocumentHandler->processingInstruction( rTarget, rData );
531 }
532 catch (...)
533 {
534 saveException( ::cppu::getCaughtException() );
535 }
536}
537
538EventList& Entity::getEventList()
539{
540 if (!mxProducedEvents)
541 {
542 std::unique_lock aGuard(maEventProtector);
543 if (!maUsedEvents.empty())
544 {
545 mxProducedEvents = std::move(maUsedEvents.front());
546 maUsedEvents.pop();
547 aGuard.unlock(); // unlock
548 mnProducedEventsSize = 0;
549 }
550 if (!mxProducedEvents)
551 {
552 mxProducedEvents.emplace();
553 mxProducedEvents->maEvents.resize(mnEventListSize);
554 mxProducedEvents->mbIsAttributesEmpty = false;
555 mnProducedEventsSize = 0;
556 }
557 }
558 return *mxProducedEvents;
559}
560
561Event& Entity::getEvent( CallbackType aType )
562{
563 if (!mbEnableThreads)
564 return maSharedEvent;
565
566 EventList& rEventList = getEventList();
567 if (mnProducedEventsSize == rEventList.maEvents.size())
568 {
569 SAL_WARN_IF(!maSavedException.hasValue(), "sax",
570 "Event vector should only exceed " << mnEventListSize <<
571 " temporarily while an exception is pending");
572 rEventList.maEvents.resize(mnProducedEventsSize + 1);
573 }
574 Event& rEvent = rEventList.maEvents[mnProducedEventsSize++];
575 rEvent.maType = aType;
576 return rEvent;
577}
578
579OUString lclGetErrorMessage( xmlParserCtxtPtr ctxt, std::u16string_view sSystemId, sal_Int32 nLine )
580{
581 const char* pMessage;
582 xmlErrorPtr error = xmlCtxtGetLastError( ctxt );
583 if( error && error->message )
584 pMessage = error->message;
585 else
586 pMessage = "unknown error";
587 return OUString::Concat("[") + sSystemId + " line " + OUString::number(nLine) + "]: " +
588 OUString(pMessage, strlen(pMessage), RTL_TEXTENCODING_ASCII_US);
589}
590
591// throw an exception, but avoid callback if
592// during a threaded produce
593void Entity::throwException( const ::rtl::Reference< FastLocatorImpl > &xDocumentLocator,
594 bool mbDuringParse )
595{
596 // Error during parsing !
597 Any savedException;
598 {
599 std::scoped_lock g(maSavedExceptionMutex);
600 if (maSavedException.hasValue())
601 {
602 savedException.setValue(&maSavedException, cppu::UnoType<decltype(maSavedException)>::get());
603 }
604 }
605 SAXParseException aExcept(
606 lclGetErrorMessage( mpParser,
607 xDocumentLocator->getSystemId(),
608 xDocumentLocator->getLineNumber() ),
609 Reference< XInterface >(),
610 savedException,
611 xDocumentLocator->getPublicId(),
612 xDocumentLocator->getSystemId(),
613 xDocumentLocator->getLineNumber(),
614 xDocumentLocator->getColumnNumber()
615 );
616
617 // error handler is set, it may throw the exception
618 if( !mbDuringParse || !mbEnableThreads )
619 {
620 if (mxErrorHandler.is() )
621 mxErrorHandler->fatalError( Any( aExcept ) );
622 }
623
624 // error handler has not thrown, but parsing must stop => throw ourselves
625 throw aExcept;
626}
627
628// In the single threaded case we emit events via our C
629// callbacks, so any exception caught must be queued up until
630// we can safely re-throw it from our C++ parent of parse()
631
632// If multi-threaded, we need to push an EXCEPTION event, at
633// which point we transfer ownership of maSavedException to
634// the consuming thread.
635void Entity::saveException( const Any & e )
636{
637 // fdo#81214 - allow the parser to run on after an exception,
638 // unexpectedly some 'startElements' produce a UNO_QUERY_THROW
639 // for XComponent; and yet expect to continue parsing.
640 SAL_WARN("sax", "Unexpected exception from XML parser " << exceptionToString(e));
641 std::scoped_lock g(maSavedExceptionMutex);
642 if (maSavedException.hasValue())
643 {
644 SAL_INFO("sax.fastparser", "discarding exception, already have one");
645 }
646 else
647 {
648 maSavedException = e;
649 }
650}
651
652bool Entity::hasException()
653{
654 std::scoped_lock g(maSavedExceptionMutex);
655 return maSavedException.hasValue();
656}
657
658} // namespace
659
660namespace sax_fastparser {
661
663 m_bIgnoreMissingNSDecl(false),
664 m_bDisableThreadedParser(false),
665 mpTop(nullptr)
666{
667 mxDocumentLocator.set( new FastLocatorImpl( this ) );
668}
669
671{
672 if( mxDocumentLocator.is() )
673 mxDocumentLocator->dispose();
674 for (auto& entity : m_TemporalEntities)
675 {
676 if (!entity)
677 continue;
678 xmlNodePtr pPtr = reinterpret_cast<xmlNodePtr>(entity);
679 xmlUnlinkNode(pPtr);
680 xmlFreeNode(pPtr);
681 }
682}
683
684void FastSaxParserImpl::DefineNamespace( const OString& rPrefix, const OUString& namespaceURL )
685{
686 Entity& rEntity = getEntity();
687 assert(!rEntity.maNamespaceCount.empty()); // need a context!
688
689 sal_uInt32 nOffset = rEntity.maNamespaceCount.top()++;
690 if( rEntity.maNamespaceDefines.size() <= nOffset )
691 rEntity.maNamespaceDefines.resize( rEntity.maNamespaceDefines.size() + 64 );
692
693 rEntity.maNamespaceDefines[nOffset] = NamespaceDefine( rPrefix, GetNamespaceToken( namespaceURL ), namespaceURL );
694}
695
696sal_Int32 FastSaxParserImpl::GetToken(const xmlChar* pName)
697{
698 return FastTokenHandlerBase::getTokenFromChars( getEntity(). mxTokenHandler.get(),
699 XML_CAST( pName ) ); // uses utf-8
700}
701
702sal_Int32 FastSaxParserImpl::GetTokenWithPrefix( const xmlChar* pPrefix, const xmlChar* pName )
703{
704 Entity& rEntity = getEntity();
705 if (rEntity.maNamespaceCount.empty())
706 return FastToken::DONTKNOW;
707
708 std::string_view sPrefix(XML_CAST(pPrefix));
709 sal_uInt32 nNamespace = rEntity.maNamespaceCount.top();
710 while( nNamespace-- )
711 {
712 const auto & rNamespaceDefine = rEntity.maNamespaceDefines[nNamespace];
713 if( rNamespaceDefine.maPrefix == sPrefix )
714 return GetTokenWithContextNamespace(rNamespaceDefine.mnToken, pName);
715 }
716
718 throw SAXException("No namespace defined for " + OStringToOUString(sPrefix,
719 RTL_TEXTENCODING_UTF8), {}, {});
720
721 return FastToken::DONTKNOW;
722}
723
724sal_Int32 FastSaxParserImpl::GetNamespaceToken( const OUString& rNamespaceURL )
725{
726 NamespaceMap::iterator aIter( maNamespaceMap.find( rNamespaceURL ) );
727 if( aIter != maNamespaceMap.end() )
728 return (*aIter).second;
729 else
730 return FastToken::DONTKNOW;
731}
732
733OUString const & FastSaxParserImpl::GetNamespaceURL( std::string_view rPrefix )
734{
735 Entity& rEntity = getEntity();
736 if( !rEntity.maNamespaceCount.empty() )
737 {
738 sal_uInt32 nNamespace = rEntity.maNamespaceCount.top();
739 while( nNamespace-- )
740 if( rEntity.maNamespaceDefines[nNamespace].maPrefix == rPrefix )
741 return rEntity.maNamespaceDefines[nNamespace].maNamespaceURL;
742 }
743
744 throw SAXException("No namespace defined for " + OUString::fromUtf8(rPrefix),
745 Reference< XInterface >(), Any());
746}
747
748sal_Int32 FastSaxParserImpl::GetTokenWithContextNamespace( sal_Int32 nNamespaceToken, const xmlChar* pName )
749{
750 if( nNamespaceToken != FastToken::DONTKNOW )
751 {
752 sal_Int32 nNameToken = GetToken( pName );
753 if( nNameToken != FastToken::DONTKNOW )
754 return nNamespaceToken | nNameToken;
755 }
756
757 return FastToken::DONTKNOW;
758}
759
760namespace
761{
762 class ParserCleanup
763 {
764 private:
766 Entity& m_rEntity;
768 public:
769 ParserCleanup(FastSaxParserImpl& rParser, Entity& rEntity)
770 : m_rParser(rParser)
771 , m_rEntity(rEntity)
772 {
773 }
774 ~ParserCleanup()
775 {
776 if (m_rEntity.mpParser)
777 {
778 if (m_rEntity.mpParser->myDoc)
779 xmlFreeDoc(m_rEntity.mpParser->myDoc);
780 xmlFreeParserCtxt(m_rEntity.mpParser);
781 }
782 joinThread();
784 }
785 void setThread(const rtl::Reference<ParserThread> &xParser)
786 {
787 m_xParser = xParser;
788 }
789 void joinThread()
790 {
791 if (m_xParser.is())
792 {
794 m_xParser.clear();
795 xToJoin->join();
796 }
797 }
798 };
799}
800/***************
801*
802* parseStream does Parser-startup initializations. The FastSaxParser::parse() method does
803* the file-specific initialization work. (During a parser run, external files may be opened)
804*
805****************/
806void FastSaxParserImpl::parseStream(const InputSource& rStructSource)
807{
808 xmlInitParser();
809
810 // Only one text at one time
811 std::unique_lock guard( maMutex );
812
813 pushEntity(maData, rStructSource);
814 Entity& rEntity = getEntity();
815 ParserCleanup aEnsureFree(*this, rEntity);
816
817 // start the document
818 if( rEntity.mxDocumentHandler.is() )
819 {
820 rEntity.mxDocumentHandler->setDocumentLocator( mxDocumentLocator );
821 rEntity.mxDocumentHandler->startDocument();
822 }
823
824#ifdef EMSCRIPTEN
825 rEntity.mbEnableThreads = false;
826#else
827 if (!getenv("SAX_DISABLE_THREADS") && !m_bDisableThreadedParser)
828 {
829 Reference<css::io::XSeekable> xSeekable(rEntity.maStructSource.aInputStream, UNO_QUERY);
830 // available() is not __really__ relevant here, but leave it in as a heuristic for non-seekable streams
831 rEntity.mbEnableThreads = (xSeekable.is() && xSeekable->getLength() > 10000)
832 || (rEntity.maStructSource.aInputStream->available() > 10000);
833 }
834#endif
835
836 if (rEntity.mbEnableThreads)
837 {
838 rtl::Reference<ParserThread> xParser = new ParserThread(this);
839 xParser->launch();
840 aEnsureFree.setThread(xParser);
841 bool done = false;
842 do {
843 rEntity.maConsumeResume.wait();
844 rEntity.maConsumeResume.reset();
845
846 std::unique_lock aGuard(rEntity.maEventProtector);
847 while (!rEntity.maPendingEvents.empty())
848 {
849 if (rEntity.maPendingEvents.size() <= Entity::mnEventLowWater)
850 rEntity.maProduceResume.set(); // start producer again
851
852 EventList aEventList = std::move(rEntity.maPendingEvents.front());
853 rEntity.maPendingEvents.pop();
854 aGuard.unlock(); // unlock
855
856 if (!consume(aEventList))
857 done = true;
858
859 aGuard.lock(); // lock
860
861 if ( rEntity.maPendingEvents.size() <= Entity::mnEventLowWater )
862 {
863 aGuard.unlock();
864 for (auto& rEvent : aEventList.maEvents)
865 {
866 if (rEvent.mxAttributes.is())
867 {
868 rEvent.mxAttributes->clear();
869 if( rEntity.mxNamespaceHandler.is() )
870 rEvent.mxDeclAttributes->clear();
871 }
872 aEventList.mbIsAttributesEmpty = true;
873 }
874 aGuard.lock();
875 }
876
877 rEntity.maUsedEvents.push(std::move(aEventList));
878 }
879 } while (!done);
880 aEnsureFree.joinThread();
882
883 // callbacks used inside XML_Parse may have caught an exception
884 // No need to lock maSavedExceptionMutex here because parser
885 // thread is joined.
886 if( rEntity.maSavedException.hasValue() )
887 rEntity.throwException( mxDocumentLocator, true );
888 }
889 else
890 {
891 parse();
892 }
893
894 // finish document
895 if( rEntity.mxDocumentHandler.is() )
896 {
897 rEntity.mxDocumentHandler->endDocument();
898 }
899}
900
901void FastSaxParserImpl::setFastDocumentHandler( const Reference< XFastDocumentHandler >& Handler )
902{
903 maData.mxDocumentHandler = Handler;
904}
905
906void FastSaxParserImpl::setTokenHandler( const Reference< XFastTokenHandler >& xHandler )
907{
908 assert( dynamic_cast< FastTokenHandlerBase *>( xHandler.get() ) && "we expect this handler to be a subclass of FastTokenHandlerBase" );
909 maData.mxTokenHandler = dynamic_cast< FastTokenHandlerBase *>( xHandler.get() );
910}
911
912void FastSaxParserImpl::registerNamespace( const OUString& NamespaceURL, sal_Int32 NamespaceToken )
913{
914 if( NamespaceToken < FastToken::NAMESPACE )
915 throw IllegalArgumentException("Invalid namespace token " + OUString::number(NamespaceToken), css::uno::Reference<css::uno::XInterface >(), 0);
916
917 if( GetNamespaceToken( NamespaceURL ) == FastToken::DONTKNOW )
918 {
919 maNamespaceMap[ NamespaceURL ] = NamespaceToken;
920 return;
921 }
922 throw IllegalArgumentException("namespace URL is already registered: " + NamespaceURL, css::uno::Reference<css::uno::XInterface >(), 0);
923}
924
925OUString const & FastSaxParserImpl::getNamespaceURL( std::u16string_view rPrefix )
926{
927 try
928 {
929 return GetNamespaceURL( OUStringToOString( rPrefix, RTL_TEXTENCODING_UTF8 ) );
930 }
931 catch (const Exception&)
932 {
933 }
934 throw IllegalArgumentException();
935}
936
937void FastSaxParserImpl::setErrorHandler(const Reference< XErrorHandler > & Handler)
938{
939 maData.mxErrorHandler = Handler;
940}
941
942void FastSaxParserImpl::setNamespaceHandler( const Reference< XFastNamespaceHandler >& Handler )
943{
944 maData.mxNamespaceHandler = Handler;
945}
946
948 const ::css::uno::Sequence<::css::beans::Pair<::rtl::OUString, ::rtl::OUString>>& replacements)
949{
950 m_Replacements.resize(replacements.size());
951 for (size_t i = 0; i < replacements.size(); ++i)
952 {
953 m_Replacements[i].name = replacements[i].First;
954 m_Replacements[i].replacement = replacements[i].Second;
955 }
956 if (m_Replacements.size() > 1)
957 std::sort(m_Replacements.begin(), m_Replacements.end());
958}
959
961{
962 Entity& rEntity = getEntity();
963 std::unique_lock aGuard(rEntity.maEventProtector);
964
965 while (!rEntity.maUsedEvents.empty())
966 {
967 { // the block makes sure that aEventList is destructed outside the lock
968 EventList aEventList = std::move(rEntity.maUsedEvents.front());
969 rEntity.maUsedEvents.pop();
970
971 aGuard.unlock(); // unlock
972 }
973
974 aGuard.lock(); // lock
975 }
976}
977
978void FastSaxParserImpl::produce( bool bForceFlush )
979{
980 Entity& rEntity = getEntity();
981 if (!(bForceFlush ||
982 rEntity.mnProducedEventsSize >= Entity::mnEventListSize))
983 return;
984
985 std::unique_lock aGuard(rEntity.maEventProtector);
986
987 while (rEntity.maPendingEvents.size() >= Entity::mnEventHighWater)
988 { // pause parsing for a bit
989 aGuard.unlock(); // unlock
990 rEntity.maProduceResume.wait();
991 rEntity.maProduceResume.reset();
992 aGuard.lock(); // lock
993 }
994
995 rEntity.maPendingEvents.push(std::move(*rEntity.mxProducedEvents));
996 rEntity.mxProducedEvents.reset();
997 assert(!rEntity.mxProducedEvents);
998
999 aGuard.unlock(); // unlock
1000
1001 rEntity.maConsumeResume.set();
1002}
1003
1004bool FastSaxParserImpl::consume(EventList& rEventList)
1005{
1006 Entity& rEntity = getEntity();
1007 rEventList.mbIsAttributesEmpty = false;
1008 for (auto& rEvent : rEventList.maEvents)
1009 {
1010 switch (rEvent.maType)
1011 {
1012 case CallbackType::START_ELEMENT:
1013 rEntity.startElement( &rEvent );
1014 break;
1015 case CallbackType::END_ELEMENT:
1016 rEntity.endElement();
1017 break;
1018 case CallbackType::CHARACTERS:
1019 rEntity.characters( rEvent.msChars );
1020 break;
1021 case CallbackType::PROCESSING_INSTRUCTION:
1022 rEntity.processingInstruction(
1023 rEvent.msNamespace, rEvent.msElementName ); // ( target, data )
1024 break;
1025 case CallbackType::DONE:
1026 return false;
1027 case CallbackType::EXCEPTION:
1028 rEntity.throwException( mxDocumentLocator, false );
1029 [[fallthrough]]; // avoid unreachable code warning with some compilers
1030 default:
1031 assert(false);
1032 return false;
1033 }
1034 }
1035 return true;
1036}
1037
1038void FastSaxParserImpl::pushEntity(const ParserData& rEntityData,
1039 xml::sax::InputSource const& rSource)
1040{
1041 if (!rSource.aInputStream.is())
1042 throw SAXException("No input source", Reference<XInterface>(), Any());
1043
1044 maEntities.emplace(rEntityData);
1045 mpTop = &maEntities.top();
1046
1047 mpTop->maStructSource = rSource;
1048
1049 mpTop->maConverter.setInputStream(mpTop->maStructSource.aInputStream);
1050 if (!mpTop->maStructSource.sEncoding.isEmpty())
1051 {
1052 mpTop->maConverter.setEncoding(OUStringToOString(mpTop->maStructSource.sEncoding, RTL_TEXTENCODING_ASCII_US));
1053 }
1054}
1055
1057{
1058 maEntities.pop();
1059 mpTop = !maEntities.empty() ? &maEntities.top() : nullptr;
1060}
1061
1062// starts parsing with actual parser !
1064{
1065 const int BUFFER_SIZE = 16 * 1024;
1067
1068 Entity& rEntity = getEntity();
1069
1070 // set all necessary C-Callbacks
1071 static xmlSAXHandler callbacks;
1072 callbacks.startElementNs = call_callbackStartElement;
1073 callbacks.endElementNs = call_callbackEndElement;
1074 callbacks.characters = call_callbackCharacters;
1075 callbacks.processingInstruction = call_callbackProcessingInstruction;
1076 callbacks.getEntity = call_callbackGetEntity;
1077 callbacks.initialized = XML_SAX2_MAGIC;
1078 int nRead = 0;
1079 do
1080 {
1081 nRead = rEntity.maConverter.readAndConvert( seqOut, BUFFER_SIZE );
1082 if( nRead <= 0 )
1083 {
1084 if( rEntity.mpParser != nullptr )
1085 {
1086 if( xmlParseChunk( rEntity.mpParser, reinterpret_cast<const char*>(seqOut.getConstArray()), 0, 1 ) != XML_ERR_OK )
1087 rEntity.throwException( mxDocumentLocator, true );
1088 if (rEntity.hasException())
1089 rEntity.throwException(mxDocumentLocator, true);
1090 }
1091 break;
1092 }
1093
1094 bool bContinue = true;
1095 if( rEntity.mpParser == nullptr )
1096 {
1097 // create parser with proper encoding (needs the first chunk of data)
1098 rEntity.mpParser = xmlCreatePushParserCtxt( &callbacks, this,
1099 reinterpret_cast<const char*>(seqOut.getConstArray()), nRead, nullptr );
1100 if( !rEntity.mpParser )
1101 throw SAXException("Couldn't create parser", Reference< XInterface >(), Any() );
1102
1103 // Tell libxml2 parser to decode entities in attribute values.
1104 // Also allow XML attribute values which are larger than 10MB, because this used to work
1105 // with expat.
1106 // coverity[unsafe_xml_parse_config] - entity support is required
1107 xmlCtxtUseOptions(rEntity.mpParser, XML_PARSE_NOENT | XML_PARSE_HUGE);
1108 }
1109 else
1110 {
1111 bContinue = xmlParseChunk( rEntity.mpParser, reinterpret_cast<const char*>(seqOut.getConstArray()), nRead, 0 )
1112 == XML_ERR_OK;
1113 }
1114
1115 // callbacks used inside XML_Parse may have caught an exception
1116 if (!bContinue)
1117 {
1118 rEntity.throwException( mxDocumentLocator, true );
1119 }
1120 if (rEntity.hasException())
1121 {
1122 rEntity.throwException( mxDocumentLocator, true );
1123 }
1124 } while( nRead > 0 );
1125 rEntity.getEvent( CallbackType::DONE );
1126 if( rEntity.mbEnableThreads )
1127 produce( true );
1128}
1129
1130// The C-Callbacks
1131void FastSaxParserImpl::callbackStartElement(const xmlChar *localName , const xmlChar* prefix, const xmlChar* URI,
1132 int numNamespaces, const xmlChar** namespaces, int numAttributes, const xmlChar **attributes)
1133{
1134 if (!pendingCharacters.empty())
1136 Entity& rEntity = getEntity();
1137 if( rEntity.maNamespaceCount.empty() )
1138 {
1139 rEntity.maNamespaceCount.push(0);
1140 DefineNamespace( "xml", "http://www.w3.org/XML/1998/namespace");
1141 }
1142 else
1143 {
1144 rEntity.maNamespaceCount.push( rEntity.maNamespaceCount.top() );
1145 }
1146
1147 // create attribute map and process namespace instructions
1148 Event& rEvent = rEntity.getEvent( CallbackType::START_ELEMENT );
1149 bool bIsAttributesEmpty = false;
1150 if ( rEntity.mbEnableThreads )
1151 bIsAttributesEmpty = rEntity.getEventList().mbIsAttributesEmpty;
1152
1153 if (rEvent.mxAttributes.is())
1154 {
1155 if( !bIsAttributesEmpty )
1156 rEvent.mxAttributes->clear();
1157 }
1158 else
1159 rEvent.mxAttributes.set(
1160 new FastAttributeList( rEntity.mxTokenHandler.get() ) );
1161
1162 if( rEntity.mxNamespaceHandler.is() )
1163 {
1164 if (rEvent.mxDeclAttributes.is())
1165 {
1166 if( !bIsAttributesEmpty )
1167 rEvent.mxDeclAttributes->clear();
1168 }
1169 else
1170 rEvent.mxDeclAttributes.set(
1171 new FastAttributeList( rEntity.mxTokenHandler.get() ) );
1172 }
1173
1174 OUString sNamespace;
1175 sal_Int32 nNamespaceToken = FastToken::DONTKNOW;
1176 if (!rEntity.maNamespaceStack.empty())
1177 {
1178 sNamespace = rEntity.maNamespaceStack.top().msName;
1179 nNamespaceToken = rEntity.maNamespaceStack.top().mnToken;
1180 }
1181
1182 try
1183 {
1184 /* #158414# Each element may define new namespaces, also for attributes.
1185 First, process all namespaces, second, process the attributes after namespaces
1186 have been initialized. */
1187
1188 // #158414# first: get namespaces
1189 for (int i = 0; i < numNamespaces * 2; i += 2)
1190 {
1191 // namespaces[] is (prefix/URI)
1192 if( namespaces[ i ] != nullptr )
1193 {
1194 OString aPrefix( XML_CAST( namespaces[ i ] ));
1195 OUString namespaceURL( XML_CAST( namespaces[ i + 1 ] ), strlen( XML_CAST( namespaces[ i + 1 ] )), RTL_TEXTENCODING_UTF8 );
1196 NormalizeURI( namespaceURL );
1197 DefineNamespace(aPrefix, namespaceURL);
1198 if( rEntity.mxNamespaceHandler.is() )
1199 rEvent.mxDeclAttributes->addUnknown( OString( XML_CAST( namespaces[ i ] ) ), OString( XML_CAST( namespaces[ i + 1 ] ) ) );
1200 }
1201 else
1202 {
1203 // default namespace
1204 sNamespace = OUString( XML_CAST( namespaces[ i + 1 ] ), strlen( XML_CAST( namespaces[ i + 1 ] )), RTL_TEXTENCODING_UTF8 );
1205 NormalizeURI( sNamespace );
1206 nNamespaceToken = GetNamespaceToken( sNamespace );
1207 if( rEntity.mxNamespaceHandler.is() )
1208 rEvent.mxDeclAttributes->addUnknown( "", OString( XML_CAST( namespaces[ i + 1 ] ) ) );
1209 }
1210 }
1211
1212 if ( rEntity.mxTokenHandler.is() )
1213 {
1214 // #158414# second: fill attribute list with other attributes
1215 rEvent.mxAttributes->reserve( numAttributes );
1216 for (int i = 0; i < numAttributes * 5; i += 5)
1217 {
1218 // attributes[] is ( localname / prefix / nsURI / valueBegin / valueEnd )
1219 if( attributes[ i + 1 ] != nullptr )
1220 {
1221 sal_Int32 nAttributeToken = GetTokenWithPrefix(attributes[ i + 1 ], attributes[ i ]);
1222 if( nAttributeToken != FastToken::DONTKNOW )
1223 rEvent.mxAttributes->add( nAttributeToken, std::string_view(XML_CAST( attributes[ i + 3 ] ), attributes[ i + 4 ] - attributes[ i + 3 ]) );
1224 else
1225 addUnknownElementWithPrefix(attributes, i, rEvent.mxAttributes);
1226 }
1227 else
1228 {
1229 sal_Int32 nAttributeToken = GetToken(attributes[ i ]);
1230 if( nAttributeToken != FastToken::DONTKNOW )
1231 rEvent.mxAttributes->add( nAttributeToken, std::string_view(XML_CAST( attributes[ i + 3 ] ), attributes[ i + 4 ] - attributes[ i + 3 ]) );
1232 else
1233 {
1234 SAL_WARN("xmloff", "unknown attribute " << XML_CAST( attributes[ i ] ) << "=" <<
1235 OString( XML_CAST( attributes[ i + 3 ] ), attributes[ i + 4 ] - attributes[ i + 3 ] ));
1236 rEvent.mxAttributes->addUnknown( XML_CAST( attributes[ i ] ),
1237 OString( XML_CAST( attributes[ i + 3 ] ), attributes[ i + 4 ] - attributes[ i + 3 ] ));
1238 }
1239 }
1240 }
1241
1242 if( prefix != nullptr )
1243 rEvent.mnElementToken = GetTokenWithPrefix(prefix, localName);
1244 else if( !sNamespace.isEmpty() )
1245 rEvent.mnElementToken = GetTokenWithContextNamespace(nNamespaceToken, localName);
1246 else
1247 rEvent.mnElementToken = GetToken(localName);
1248 }
1249 else
1250 {
1251 for (int i = 0; i < numAttributes * 5; i += 5)
1252 {
1253 if( attributes[ i + 1 ] != nullptr )
1254 addUnknownElementWithPrefix(attributes, i, rEvent.mxAttributes);
1255 else
1256 rEvent.mxAttributes->addUnknown( XML_CAST( attributes[ i ] ),
1257 OString( XML_CAST( attributes[ i + 3 ] ), attributes[ i + 4 ] - attributes[ i + 3 ] ));
1258 }
1259
1260 rEvent.mnElementToken = FastToken::DONTKNOW;
1261 }
1262
1263 if( rEvent.mnElementToken == FastToken::DONTKNOW )
1264 {
1265 OUString aElementPrefix;
1266 if( prefix != nullptr )
1267 {
1268 if ( !m_bIgnoreMissingNSDecl || URI != nullptr )
1269 sNamespace = OUString( XML_CAST( URI ), strlen( XML_CAST( URI )), RTL_TEXTENCODING_UTF8 );
1270 else
1271 sNamespace.clear();
1272 nNamespaceToken = GetNamespaceToken( sNamespace );
1273 aElementPrefix = OUString( XML_CAST( prefix ), strlen( XML_CAST( prefix )), RTL_TEXTENCODING_UTF8 );
1274 }
1275 OUString aElementLocalName( XML_CAST( localName ), strlen( XML_CAST( localName )), RTL_TEXTENCODING_UTF8 );
1276 rEvent.msNamespace = sNamespace;
1277 if( aElementPrefix.isEmpty() )
1278 rEvent.msElementName = std::move(aElementLocalName);
1279 else
1280 rEvent.msElementName = aElementPrefix + ":" + aElementLocalName;
1281 }
1282 else // token is always preferred.
1283 rEvent.msElementName.clear();
1284
1285 rEntity.maNamespaceStack.push( NameWithToken(sNamespace, nNamespaceToken) );
1286 if (rEntity.mbEnableThreads)
1287 produce();
1288 else
1289 {
1290 SAL_INFO("sax.fastparser", " startElement line " << mxDocumentLocator->getLineNumber() << " column " << mxDocumentLocator->getColumnNumber() << " " << ( prefix ? XML_CAST(prefix) : "(null)" ) << ":" << localName);
1291 rEntity.startElement( &rEvent );
1292 }
1293 }
1294 catch (...)
1295 {
1296 rEntity.saveException( ::cppu::getCaughtException() );
1297 }
1298}
1299
1300void FastSaxParserImpl::addUnknownElementWithPrefix(const xmlChar **attributes, int i, rtl::Reference< FastAttributeList > const & xAttributes)
1301{
1302 OUString aNamespaceURI;
1303 if ( !m_bIgnoreMissingNSDecl || attributes[i + 2] != nullptr )
1304 aNamespaceURI = OUString( XML_CAST( attributes[ i + 2 ] ), strlen( XML_CAST( attributes[ i + 2 ] )), RTL_TEXTENCODING_UTF8 );
1305 const OString& rPrefix = OString( XML_CAST( attributes[ i + 1 ] ));
1306 const OString& rLocalName = OString( XML_CAST( attributes[ i ] ));
1307 OString aQualifiedName = (rPrefix.isEmpty())? rLocalName : rPrefix + ":" + rLocalName;
1308 xAttributes->addUnknown( aNamespaceURI, aQualifiedName,
1309 OString( XML_CAST( attributes[ i + 3 ] ), attributes[ i + 4 ] - attributes[ i + 3 ] ));
1310 SAL_INFO("xmloff", "unknown element " << aQualifiedName << " " << aNamespaceURI);
1311}
1312
1314{
1315 if (!pendingCharacters.empty())
1317 Entity& rEntity = getEntity();
1318 SAL_WARN_IF(rEntity.maNamespaceCount.empty(), "sax", "Empty NamespaceCount");
1319 if( !rEntity.maNamespaceCount.empty() )
1320 rEntity.maNamespaceCount.pop();
1321
1322 SAL_WARN_IF(rEntity.maNamespaceStack.empty(), "sax", "Empty NamespaceStack");
1323 if( !rEntity.maNamespaceStack.empty() )
1324 rEntity.maNamespaceStack.pop();
1325
1326 rEntity.getEvent( CallbackType::END_ELEMENT );
1327 if (rEntity.mbEnableThreads)
1328 produce();
1329 else
1330 rEntity.endElement();
1331}
1332
1333void FastSaxParserImpl::callbackCharacters( const xmlChar* s, int nLen )
1334{
1335 // SAX interface allows that the characters callback splits content of one XML node
1336 // (e.g. because there's an entity that needs decoding), however for consumers it's
1337 // simpler FastSaxParser's character callback provides the whole string at once,
1338 // so merge data from possible multiple calls and send them at once (before the element
1339 // ends or another one starts).
1340 //
1341 // We use a std::vector<char> to avoid calling into the OUString constructor more than once when
1342 // we have multiple callbackCharacters() calls that we have to merge, which happens surprisingly
1343 // often in writer documents.
1344 int nOriginalLen = pendingCharacters.size();
1345 pendingCharacters.resize(nOriginalLen + nLen);
1346 memcpy(pendingCharacters.data() + nOriginalLen, s, nLen);
1347}
1348
1350{
1351 Entity& rEntity = getEntity();
1352 OUString sChars( pendingCharacters.data(), pendingCharacters.size(), RTL_TEXTENCODING_UTF8 );
1353 if (rEntity.mbEnableThreads)
1354 {
1355 Event& rEvent = rEntity.getEvent( CallbackType::CHARACTERS );
1356 rEvent.msChars = std::move(sChars);
1357 produce();
1358 }
1359 else
1360 rEntity.characters( sChars );
1361 pendingCharacters.resize(0);
1362}
1363
1364void FastSaxParserImpl::callbackProcessingInstruction( const xmlChar *target, const xmlChar *data )
1365{
1366 if (!pendingCharacters.empty())
1368 Entity& rEntity = getEntity();
1369 Event& rEvent = rEntity.getEvent( CallbackType::PROCESSING_INSTRUCTION );
1370
1371 // This event is very rare, so no need to waste extra space for this
1372 // Using namespace and element strings to be target and data in that order.
1373 rEvent.msNamespace = OUString( XML_CAST( target ), strlen( XML_CAST( target ) ), RTL_TEXTENCODING_UTF8 );
1374 if ( data != nullptr )
1375 rEvent.msElementName = OUString( XML_CAST( data ), strlen( XML_CAST( data ) ), RTL_TEXTENCODING_UTF8 );
1376 else
1377 rEvent.msElementName.clear();
1378
1379 if (rEntity.mbEnableThreads)
1380 produce();
1381 else
1382 rEntity.processingInstruction( rEvent.msNamespace, rEvent.msElementName );
1383}
1384
1385xmlEntityPtr FastSaxParserImpl::callbackGetEntity( const xmlChar *name )
1386{
1387 if( !name )
1388 return xmlGetPredefinedEntity(name);
1389 const char* dname = XML_CAST(name);
1390 int lname = strlen(dname);
1391 if( lname == 0 )
1392 return xmlGetPredefinedEntity(name);
1393 if (m_Replacements.size() > 0)
1394 {
1395 auto it = std::lower_bound(m_Replacements.begin(), m_Replacements.end(), dname);
1396 if (it != m_Replacements.end() && it->name.compareToAscii(dname) == 0)
1397 {
1398 xmlEntityPtr entpt = xmlNewEntity(
1399 nullptr, name, XML_INTERNAL_GENERAL_ENTITY, nullptr, nullptr,
1400 BAD_CAST(OUStringToOString(it->replacement, RTL_TEXTENCODING_UTF8).getStr()));
1401 m_TemporalEntities.push_back(entpt);
1402 return entpt;
1403 }
1404 }
1405 if( lname < 2 )
1406 return xmlGetPredefinedEntity(name);
1407 if ( dname[0] == '#' )
1408 {
1409 sal_uInt32 cval = 0;
1410 if( dname[1] == 'x' || dname[1] == 'X' )
1411 {
1412 if( lname < 3 )
1413 return xmlGetPredefinedEntity(name);
1414 cval = static_cast<sal_uInt32>( strtoul( dname + 2, nullptr, 16 ) );
1415 if( cval == 0 )
1416 return xmlGetPredefinedEntity(name);
1417 OUString vname( &cval, 1 );
1418 xmlEntityPtr entpt
1419 = xmlNewEntity(nullptr, name, XML_INTERNAL_GENERAL_ENTITY, nullptr, nullptr,
1420 BAD_CAST(OUStringToOString(vname, RTL_TEXTENCODING_UTF8).getStr()));
1421 m_TemporalEntities.push_back(entpt);
1422 return entpt;
1423 }
1424 else
1425 {
1426 cval = static_cast<sal_uInt32>( strtoul( dname + 2, nullptr, 10 ) );
1427 if( cval == 0 )
1428 return xmlGetPredefinedEntity(name);
1429 OUString vname(&cval, 1);
1430 xmlEntityPtr entpt
1431 = xmlNewEntity(nullptr, name, XML_INTERNAL_GENERAL_ENTITY, nullptr, nullptr,
1432 BAD_CAST(OUStringToOString(vname, RTL_TEXTENCODING_UTF8).getStr()));
1433 m_TemporalEntities.push_back(entpt);
1434 return entpt;
1435 }
1436 }
1437 return xmlGetPredefinedEntity(name);
1438}
1439
1441
1443{
1444}
1445
1446void SAL_CALL
1447FastSaxParser::initialize(css::uno::Sequence< css::uno::Any > const& rArguments)
1448{
1449 if (!rArguments.hasElements())
1450 return;
1451
1452 OUString str;
1453 if ( !(rArguments[0] >>= str) )
1454 throw IllegalArgumentException();
1455
1456 if ( str == "IgnoreMissingNSDecl" )
1457 mpImpl->m_bIgnoreMissingNSDecl = true;
1458 else if ( str == "DoSmeplease" )
1459 ; //just ignore as this is already immune to billion laughs
1460 else if ( str == "DisableThreadedParser" )
1461 mpImpl->m_bDisableThreadedParser = true;
1462 else
1463 throw IllegalArgumentException();
1464
1465}
1466
1467void FastSaxParser::parseStream( const xml::sax::InputSource& aInputSource )
1468{
1469 mpImpl->parseStream(aInputSource);
1470}
1471
1472void FastSaxParser::setFastDocumentHandler( const uno::Reference<xml::sax::XFastDocumentHandler>& Handler )
1473{
1474 mpImpl->setFastDocumentHandler(Handler);
1475}
1476
1477void FastSaxParser::setTokenHandler( const uno::Reference<xml::sax::XFastTokenHandler>& Handler )
1478{
1479 mpImpl->setTokenHandler(Handler);
1480}
1481
1482void FastSaxParser::registerNamespace( const OUString& NamespaceURL, sal_Int32 NamespaceToken )
1483{
1484 mpImpl->registerNamespace(NamespaceURL, NamespaceToken);
1485}
1486
1487OUString FastSaxParser::getNamespaceURL( const OUString& rPrefix )
1488{
1489 return mpImpl->getNamespaceURL(rPrefix);
1490}
1491
1492void FastSaxParser::setErrorHandler( const uno::Reference< xml::sax::XErrorHandler >& Handler )
1493{
1494 mpImpl->setErrorHandler(Handler);
1495}
1496
1497void FastSaxParser::setEntityResolver( const uno::Reference< xml::sax::XEntityResolver >& )
1498{
1499 // not implemented
1500}
1501
1502void FastSaxParser::setLocale( const lang::Locale& )
1503{
1504 // not implemented
1505}
1506
1507void FastSaxParser::setNamespaceHandler( const uno::Reference< css::xml::sax::XFastNamespaceHandler >& Handler)
1508{
1509 mpImpl->setNamespaceHandler(Handler);
1510}
1511
1513{
1514 return "com.sun.star.comp.extensions.xml.sax.FastParser";
1515}
1516
1518 const ::css::uno::Sequence<::css::beans::Pair<::rtl::OUString, ::rtl::OUString>>& replacements)
1519{
1520 mpImpl->setCustomEntityNames(replacements);
1521}
1522
1523sal_Bool FastSaxParser::supportsService( const OUString& ServiceName )
1524{
1525 return cppu::supportsService(this, ServiceName);
1526}
1527
1529{
1530 return { "com.sun.star.xml.sax.FastParser" };
1531}
1532
1533} // namespace sax_fastparser
1534
1535extern "C" SAL_DLLPUBLIC_EXPORT css::uno::XInterface *
1537 css::uno::XComponentContext *,
1538 css::uno::Sequence<css::uno::Any> const &)
1539{
1540 return cppu::acquire(new FastSaxParser);
1541}
1542
1543// ----------------------------------------------------------
1544// copy of the code in xmloff/source/core/namespace.cxx, which adds namespace aliases
1545// for various dodgy namespace decls in the wild.
1546
1547static bool NormalizeW3URI( OUString& rName );
1548static bool NormalizeOasisURN( OUString& rName );
1549
1550static void NormalizeURI( OUString& rName )
1551{
1552 // try OASIS + W3 URI normalization
1553 bool bSuccess = NormalizeOasisURN( rName );
1554 if( ! bSuccess )
1555 NormalizeW3URI( rName );
1556}
1557
1558constexpr OUStringLiteral XML_URI_W3_PREFIX(u"http://www.w3.org/");
1559constexpr OUStringLiteral XML_URI_XFORMS_SUFFIX(u"/xforms");
1560constexpr OUStringLiteral XML_N_XFORMS_1_0(u"http://www.w3.org/2002/xforms");
1561constexpr OUStringLiteral XML_N_SVG(u"http://www.w3.org/2000/svg");
1562constexpr OUStringLiteral XML_N_SVG_COMPAT(u"urn:oasis:names:tc:opendocument:xmlns:svg-compatible:1.0");
1563constexpr OUStringLiteral XML_N_FO(u"http://www.w3.org/1999/XSL/Format");
1564constexpr OUStringLiteral XML_N_FO_COMPAT(u"urn:oasis:names:tc:opendocument:xmlns:xsl-fo-compatible:1.0");
1565constexpr OUStringLiteral XML_N_SMIL(u"http://www.w3.org/2001/SMIL20/");
1566constexpr OUStringLiteral XML_N_SMIL_OLD(u"http://www.w3.org/2001/SMIL20");
1567constexpr OUStringLiteral XML_N_SMIL_COMPAT(u"urn:oasis:names:tc:opendocument:xmlns:smil-compatible:1.0");
1568constexpr OUStringLiteral XML_URN_OASIS_NAMES_TC(u"urn:oasis:names:tc");
1569constexpr OUStringLiteral XML_XMLNS(u"xmlns");
1570constexpr OUStringLiteral XML_OPENDOCUMENT(u"opendocument");
1571constexpr OUStringLiteral XML_1_0(u"1.0");
1572
1573static bool NormalizeW3URI( OUString& rName )
1574{
1575 // check if URI matches:
1576 // http://www.w3.org/[0-9]*/[:letter:]*
1577 // (year)/(WG name)
1578 // For the following WG/standards names:
1579 // - xforms
1580
1581 bool bSuccess = false;
1582 const OUString& sURIPrefix = XML_URI_W3_PREFIX;
1583 if( rName.startsWith( sURIPrefix ) )
1584 {
1585 const OUString& sURISuffix = XML_URI_XFORMS_SUFFIX ;
1586 sal_Int32 nCompareFrom = rName.getLength() - sURISuffix.getLength();
1587 if( rName.subView( nCompareFrom ) == sURISuffix )
1588 {
1589 // found W3 prefix, and xforms suffix
1590 rName = XML_N_XFORMS_1_0;
1591 bSuccess = true;
1592 }
1593 }
1594 return bSuccess;
1595}
1596
1597static bool NormalizeOasisURN( OUString& rName )
1598{
1599 // #i38644#
1600 // we exported the wrong namespace for smil, so we correct this here on load
1601 // for older documents
1602 if( rName == XML_N_SVG )
1603 {
1604 rName = XML_N_SVG_COMPAT;
1605 return true;
1606 }
1607 else if( rName == XML_N_FO )
1608 {
1609 rName = XML_N_FO_COMPAT;
1610 return true;
1611 }
1612 else if( rName == XML_N_SMIL || rName == XML_N_SMIL_OLD )
1613 {
1614 rName = XML_N_SMIL_COMPAT;
1615 return true;
1616 }
1617
1618
1619 // Check if URN matches
1620 // :urn:oasis:names:tc:[^:]*:xmlns:[^:]*:1.[^:]*
1621 // |---| |---| |-----|
1622 // TC-Id Sub-Id Version
1623
1624 sal_Int32 nNameLen = rName.getLength();
1625 // :urn:oasis:names:tc.*
1626 const OUString& rOasisURN = XML_URN_OASIS_NAMES_TC;
1627 if( !rName.startsWith( rOasisURN ) )
1628 return false;
1629
1630 // :urn:oasis:names:tc:.*
1631 sal_Int32 nPos = rOasisURN.getLength();
1632 if( nPos >= nNameLen || rName[nPos] != ':' )
1633 return false;
1634
1635 // :urn:oasis:names:tc:[^:]:.*
1636 sal_Int32 nTCIdStart = nPos+1;
1637 sal_Int32 nTCIdEnd = rName.indexOf( ':', nTCIdStart );
1638 if( -1 == nTCIdEnd )
1639 return false;
1640
1641 // :urn:oasis:names:tc:[^:]:xmlns.*
1642 nPos = nTCIdEnd + 1;
1643 std::u16string_view sTmp( rName.subView( nPos ) );
1644 const OUString& rXMLNS = XML_XMLNS;
1645 if( !o3tl::starts_with(sTmp, rXMLNS ) )
1646 return false;
1647
1648 // :urn:oasis:names:tc:[^:]:xmlns:.*
1649 nPos += rXMLNS.getLength();
1650 if( nPos >= nNameLen || rName[nPos] != ':' )
1651 return false;
1652
1653 // :urn:oasis:names:tc:[^:]:xmlns:[^:]*:.*
1654 nPos = rName.indexOf( ':', nPos+1 );
1655 if( -1 == nPos )
1656 return false;
1657
1658 // :urn:oasis:names:tc:[^:]:xmlns:[^:]*:[^:][^:][^:][^:]*
1659 sal_Int32 nVersionStart = nPos+1;
1660 if( nVersionStart+2 >= nNameLen ||
1661 -1 != rName.indexOf( ':', nVersionStart ) )
1662 return false;
1663
1664 // :urn:oasis:names:tc:[^:]:xmlns:[^:]*:1\.[^:][^:]*
1665 if( rName[nVersionStart] != '1' || rName[nVersionStart+1] != '.' )
1666 return false;
1667
1668 // replace [tcid] with current TCID and version with current version.
1669
1670 rName = rName.subView( 0, nTCIdStart ) +
1672 rName.subView( nTCIdEnd, nVersionStart-nTCIdEnd ) +
1673 XML_1_0;
1674
1675 return true;
1676}
1677
1678
1679/* vim:set shiftwidth=4 softtabstop=4 expandtab: */
SharedBitmapDescriptor mpTop
const char * pName
Thread(char const *name)
virtual void execute()=0
sal_Int32 GetNamespaceToken(const OUString &rNamespaceURL)
Definition: fastparser.cxx:724
xmlEntityPtr callbackGetEntity(const xmlChar *name)
void setCustomEntityNames(const ::css::uno::Sequence<::css::beans::Pair<::rtl::OUString, ::rtl::OUString > > &replacements)
Definition: fastparser.cxx:947
void addUnknownElementWithPrefix(const xmlChar **attributes, int i, rtl::Reference< FastAttributeList > const &xAttributes)
void setFastDocumentHandler(const css::uno::Reference< css::xml::sax::XFastDocumentHandler > &Handler)
Definition: fastparser.cxx:901
std::vector< ReplacementPair > m_Replacements
Definition: fastparser.cxx:232
void callbackProcessingInstruction(const xmlChar *target, const xmlChar *data)
std::vector< char > pendingCharacters
Entity stack for each call of parseStream().
Definition: fastparser.cxx:299
void pushEntity(const ParserData &, xml::sax::InputSource const &)
void callbackCharacters(const xmlChar *s, int nLen)
std::stack< Entity > maEntities
std::stack::top() is amazingly slow => cache this.
Definition: fastparser.cxx:298
void setNamespaceHandler(const css::uno::Reference< css::xml::sax::XFastNamespaceHandler > &Handler)
Definition: fastparser.cxx:942
Entity * mpTop
Cached parser configuration for next call of parseStream().
Definition: fastparser.cxx:297
sal_Int32 GetToken(const xmlChar *pName)
Definition: fastparser.cxx:696
OUString const & GetNamespaceURL(std::string_view rPrefix)
Definition: fastparser.cxx:733
void DefineNamespace(const OString &rPrefix, const OUString &namespaceURL)
Definition: fastparser.cxx:684
void setErrorHandler(const css::uno::Reference< css::xml::sax::XErrorHandler > &Handler)
Definition: fastparser.cxx:937
std::vector< xmlEntityPtr > m_TemporalEntities
Definition: fastparser.cxx:233
::rtl::Reference< FastLocatorImpl > mxDocumentLocator
Definition: fastparser.cxx:292
OUString const & getNamespaceURL(std::u16string_view rPrefix)
Definition: fastparser.cxx:925
sal_Int32 GetTokenWithPrefix(const xmlChar *pPrefix, const xmlChar *pName)
Definition: fastparser.cxx:702
void setTokenHandler(const css::uno::Reference< css::xml::sax::XFastTokenHandler > &Handler)
Definition: fastparser.cxx:906
void registerNamespace(const OUString &NamespaceURL, sal_Int32 NamespaceToken)
Definition: fastparser.cxx:912
void parseStream(const css::xml::sax::InputSource &aInputSource)
Definition: fastparser.cxx:806
sal_Int32 GetTokenWithContextNamespace(sal_Int32 nNamespaceToken, const xmlChar *pName)
Definition: fastparser.cxx:748
void produce(bool bForceFlush=false)
Definition: fastparser.cxx:978
void callbackStartElement(const xmlChar *localName, const xmlChar *prefix, const xmlChar *URI, int numNamespaces, const xmlChar **namespaces, int numAttributes, const xmlChar **attributes)
std::mutex maMutex
Protecting whole parseStream() execution.
Definition: fastparser.cxx:291
virtual void SAL_CALL setErrorHandler(const css::uno::Reference< css::xml::sax::XErrorHandler > &Handler) override
virtual void SAL_CALL setLocale(const css::lang::Locale &rLocale) override
virtual ~FastSaxParser() override
virtual void SAL_CALL setCustomEntityNames(const ::css::uno::Sequence< ::css::beans::Pair<::rtl::OUString, ::rtl::OUString > > &replacements) override
std::unique_ptr< FastSaxParserImpl > mpImpl
Definition: fastparser.hxx:48
virtual void SAL_CALL parseStream(const css::xml::sax::InputSource &aInputSource) override
virtual void SAL_CALL initialize(css::uno::Sequence< css::uno::Any > const &rArguments) override
virtual void SAL_CALL setNamespaceHandler(const css::uno::Reference< css::xml::sax::XFastNamespaceHandler > &Handler) override
virtual OUString SAL_CALL getNamespaceURL(const OUString &rPrefix) override
virtual void SAL_CALL setFastDocumentHandler(const css::uno::Reference< css::xml::sax::XFastDocumentHandler > &Handler) override
virtual sal_Bool SAL_CALL supportsService(const OUString &ServiceName) override
virtual css::uno::Sequence< OUString > SAL_CALL getSupportedServiceNames() override
virtual OUString SAL_CALL getImplementationName() override
virtual void SAL_CALL setEntityResolver(const css::uno::Reference< css::xml::sax::XEntityResolver > &Resolver) override
virtual void SAL_CALL setTokenHandler(const css::uno::Reference< css::xml::sax::XFastTokenHandler > &Handler) override
virtual void SAL_CALL registerNamespace(const OUString &NamespaceURL, sal_Int32 NamespaceToken) override
A native C++ interface to tokenisation.
Definition: fastattribs.hxx:57
static sal_Int32 getTokenFromChars(const FastTokenHandlerBase *pTokenHandler, std::string_view str)
Client method to attempt the use of this interface if possible.
OString exceptionToString(const css::uno::Any &caught)
uno::Reference< uno::XComponentContext > mxContext
std::deque< ScriptEventDescriptor > aEventList
OUString maPrefix
constexpr OUStringLiteral XML_N_SMIL_COMPAT(u"urn:oasis:names:tc:opendocument:xmlns:smil-compatible:1.0")
constexpr OUStringLiteral XML_N_XFORMS_1_0(u"http://www.w3.org/2002/xforms")
static void NormalizeURI(OUString &rName)
constexpr OUStringLiteral XML_URN_OASIS_NAMES_TC(u"urn:oasis:names:tc")
constexpr OUStringLiteral XML_URI_W3_PREFIX(u"http://www.w3.org/")
FastSaxParserImpl & m_rParser
Definition: fastparser.cxx:765
Entity & m_rEntity
Definition: fastparser.cxx:766
static bool NormalizeOasisURN(OUString &rName)
static bool NormalizeW3URI(OUString &rName)
constexpr OUStringLiteral XML_XMLNS(u"xmlns")
#define XML_CAST(str)
Definition: fastparser.cxx:56
constexpr OUStringLiteral XML_N_SMIL_OLD(u"http://www.w3.org/2001/SMIL20")
constexpr OUStringLiteral XML_N_SVG(u"http://www.w3.org/2000/svg")
constexpr OUStringLiteral XML_URI_XFORMS_SUFFIX(u"/xforms")
constexpr OUStringLiteral XML_N_SVG_COMPAT(u"urn:oasis:names:tc:opendocument:xmlns:svg-compatible:1.0")
constexpr OUStringLiteral XML_N_FO_COMPAT(u"urn:oasis:names:tc:opendocument:xmlns:xsl-fo-compatible:1.0")
constexpr OUStringLiteral XML_OPENDOCUMENT(u"opendocument")
constexpr OUStringLiteral XML_N_FO(u"http://www.w3.org/1999/XSL/Format")
constexpr OUStringLiteral XML_1_0(u"1.0")
constexpr OUStringLiteral XML_N_SMIL(u"http://www.w3.org/2001/SMIL20/")
rtl::Reference< ParserThread > m_xParser
Definition: fastparser.cxx:767
SAL_DLLPUBLIC_EXPORT css::uno::XInterface * com_sun_star_comp_extensions_xml_sax_FastParser_get_implementation(css::uno::XComponentContext *, css::uno::Sequence< css::uno::Any > const &)
OUString sName
OUString sPrefix
const char * name
void * p
sal_uInt16 nPos
#define SAL_WARN_IF(condition, area, stream)
#define SAL_WARN(area, stream)
#define SAL_INFO(area, stream)
@ Exception
bool CPPUHELPER_DLLPUBLIC supportsService(css::lang::XServiceInfo *implementation, rtl::OUString const &name)
void SAL_CALL throwException(Any const &exc)
int i
constexpr bool starts_with(std::basic_string_view< charT, traits > sv, std::basic_string_view< charT, traits > x) noexcept
OString OUStringToOString(std::u16string_view str, ConnectionSettings const *settings)
void dispose()
DefTokenId nToken
OReadStatusBarDocumentHandler::StatusBar_XML_Namespace nNamespace
OUString msName
OUString maType
unsigned char sal_Bool
ImpEventQueue maEvents
bool operator<(const wwFont &r1, const wwFont &r2)
#define BUFFER_SIZE