26#pragma warning(disable:4996)
27#pragma warning(disable:4503)
31#include <boost/spirit/include/classic_core.hpp>
32#include <boost/spirit/include/classic_utility.hpp>
33#include <boost/spirit/include/classic_error_handling.hpp>
34#include <boost/spirit/include/classic_file_iterator.hpp>
35#include <boost/bind/bind.hpp>
40#include <rtl/strbuf.hxx>
41#include <rtl/ustrbuf.hxx>
49#pragma warning(disable:4996)
50#pragma warning(disable:4503)
54using namespace boost::spirit::classic;
// Default constructor: constructs the member buffer m_aBuf with the argument 256
// (presumably an initial capacity — confirm against the type of m_aBuf).
// NOTE(review): the enclosing class header is not visible in this listing.
63 StringEmitContext() :
m_aBuf(256) {}
// Override of a base-class virtual: appends nLen bytes starting at pBuf
// (reinterpreted as const char*) to m_aBuf.
// NOTE(review): the opening brace and the return statement are not visible in this listing.
65 virtual bool write(
const void* pBuf,
unsigned int nLen )
noexcept override
67 m_aBuf.append(
static_cast<const char*
>(pBuf), nLen );
// Override of a base-class virtual: returns the current length of m_aBuf.
70 virtual unsigned int getCurPos() noexcept
override {
return m_aBuf.getLength(); }
// Override of a base-class virtual taking an offset and a length.
// NOTE(review): the body of this override is not visible in this listing.
71 virtual bool copyOrigBytes(
unsigned int nOrigOffset,
unsigned int nLen )
noexcept override
// Override of a base-class virtual: the visible statement copies nLen bytes from
// m_aBuf's data, starting at byte offset nOrigOffset, into pBuf.
// NOTE(review): any range check guarding the memcpy and the return statement(s)
// are on lines missing from this listing — confirm before relying on bounds safety.
74 virtual unsigned int readOrigBytes(
unsigned int nOrigOffset,
unsigned int nLen,
void* pBuf )
noexcept override
78 memcpy( pBuf,
m_aBuf.getStr()+nOrigOffset, nLen );
// Boost.Spirit (classic) grammar, parameterised on the input iterator type.
// It derives from grammar<> with itself as the template argument, and its
// member functions are used as semantic actions by the nested definition.
87template<
class iteratorT >
88class PDFGrammar :
public grammar< PDFGrammar<iteratorT> >
// Constructor: zero-initialises m_fDouble and stores the given iterator
// (moved) in m_aGlobalBegin.
92 explicit PDFGrammar( iteratorT first )
93 : m_fDouble( 0.0 ), m_aGlobalBegin(
std::move(
first )) {}
// NOTE(review): the header of the enclosing function is not visible in this
// listing (presumably the destructor — confirm).
// If the object stack is not empty, deletes its first (bottom) element only;
// the other raw pointers on the stack are not deleted here.
96 if( !m_aObjectStack.empty() )
97 delete m_aObjectStack.front();
// Unsigned integers collected by the uint_p actions; consumed (back first) by
// haveFile, beginObject and pushObjectRef.
101 std::vector< unsigned int > m_aUIntStack;
// Stack of raw PDFEntry pointers for the containers currently being parsed.
102 std::vector< PDFEntry* > m_aObjectStack;
// Holds the message text built in endDict so that the char pointer handed to
// parseError refers to storage owned by the grammar object.
103 OString m_aErrorString;
// Iterator passed to the constructor.
104 iteratorT m_aGlobalBegin;
// Functor used through functor_parser<pdf_string_parser> for the body of a
// parenthesised string. The visible lines show a scan loop that runs until the
// scanner is at its end, tracks a brace level, and finally returns -1 when the
// end of input was reached, otherwise the counted length.
// NOTE(review): most of the loop body (including how nBraceLevel and len are
// updated) is on lines missing from this listing.
107 struct pdf_string_parser
109 typedef nil_t result_t;
110 template <
typename ScannerT>
112 operator()(ScannerT
const& scan, result_t&)
const
114 std::ptrdiff_t len = 0;
117 while( ! scan.at_end() )
123 if( nBraceLevel < 0 )
132 if( scan.first == scan.last )
// Reaching the end of input is reported as -1; otherwise the length is returned.
138 return scan.at_end() ? -1 : len;
// Constructor of the grammar's nested definition: builds all rules. Semantic
// actions are attached with boost::bind and call member functions of the
// grammar object with the matched iterator range (_1, _2), or with the parsed
// value (_1) for the numeric helper actions.
// NOTE(review): several rule fragments and the #else/#endif directives that
// pair with the #ifdef USE_ASSIGN_ACTOR lines are missing from this listing.
142 template<
typename ScannerT >
145 explicit definition(
const PDFGrammar<iteratorT>& rSelf )
147 using namespace boost::placeholders;
// The bound member functions are non-const, so constness is cast away from rSelf.
149 PDFGrammar<iteratorT>* pSelf =
const_cast< PDFGrammar<iteratorT>*
>( &rSelf );
// comment: '%' followed by characters other than CR/LF, terminated by eol_p.
153 comment = lexeme_d[ (ch_p(
'%') >> *(~ch_p(
'\r') & ~ch_p(
'\n')) >> eol_p)[boost::bind(&PDFGrammar::pushComment, pSelf, _1, _2 )] ];
// boolean: the literal "true" or "false".
155 boolean = (str_p(
"true") | str_p(
"false"))[boost::bind(&PDFGrammar::pushBool, pSelf, _1, _2)];
// stream: "stream", any characters up to "endstream", then "endstream";
// the whole match is passed to emitStream.
159 stream = (str_p(
"stream") >> *(anychar_p - str_p(
"endstream")) >> str_p(
"endstream"))[boost::bind(&PDFGrammar::emitStream, pSelf, _1, _2 )];
// NOTE(review): the beginning of this rule is not visible in this listing. The
// visible part matches a run of characters excluding the listed whitespace and
// delimiter characters and NUL, and passes it to pushName.
163 >> (*(anychar_p-chset_p(
"\t\n\f\r ()<>[]{}/%")-ch_p(
'\0')))
164 [boost::bind(&PDFGrammar::pushName, pSelf, _1, _2)] ];
// stringtype: either '(' ... ')' with the body scanned by pdf_string_parser,
// or '<' hex digits '>'.
171 stringtype = ( ( ch_p(
'(') >> functor_parser<pdf_string_parser>() >> ch_p(
')') ) |
172 ( ch_p(
'<') >> *xdigit_p >> ch_p(
'>') ) )
173 [boost::bind(&PDFGrammar::pushString,pSelf, _1, _2)];
175 null_object = str_p(
"null" )[boost::bind(&PDFGrammar::pushNull, pSelf, _1, _2)];
// objectref: two unsigned integers are pushed onto m_aUIntStack. With
// USE_ASSIGN_ACTOR the push_back_a actor is used; the second definition
// (presumably the #else branch) uses a bound member function instead.
// NOTE(review): the lines between the second uint_p and the closing
// parenthesis are not visible in this listing.
177 #ifdef USE_ASSIGN_ACTOR
178 objectref = ( uint_p[push_back_a(pSelf->m_aUIntStack)]
179 >> uint_p[push_back_a(pSelf->m_aUIntStack)]
182 )[boost::bind(&PDFGrammar::pushObjectRef, pSelf, _1, _2)];
184 objectref = ( uint_p[boost::bind(&PDFGrammar::push_back_action_uint, pSelf, _1)]
185 >> uint_p[boost::bind(&PDFGrammar::push_back_action_uint, pSelf, _1)]
188 )[boost::bind(&PDFGrammar::pushObjectRef, pSelf, _1, _2)];
// simple_type: objectref is tried before the real-number alternative. The
// parsed real is stored in m_fDouble first; pushDouble is then invoked.
191 #ifdef USE_ASSIGN_ACTOR
192 simple_type = objectref |
name |
193 ( real_p[assign_a(pSelf->m_fDouble)] >> eps_p )
194 [boost::bind(&PDFGrammar::pushDouble, pSelf, _1, _2)]
195 | stringtype |
boolean | null_object;
197 simple_type = objectref |
name |
198 ( real_p[boost::bind(&PDFGrammar::assign_action_double, pSelf, _1)] >> eps_p )
199 [boost::bind(&PDFGrammar::pushDouble, pSelf, _1, _2)]
200 | stringtype |
boolean | null_object;
// Container delimiters: each one triggers the matching begin/end action.
203 dict_begin = str_p(
"<<" )[boost::bind(&PDFGrammar::beginDict, pSelf, _1, _2)];
204 dict_end = str_p(
">>" )[boost::bind(&PDFGrammar::endDict, pSelf, _1, _2)];
206 array_begin = str_p(
"[")[boost::bind(&PDFGrammar::beginArray,pSelf, _1, _2)];
207 array_end = str_p(
"]")[boost::bind(&PDFGrammar::endArray,pSelf, _1, _2)];
// object_begin: two unsigned integers followed by "obj"; both integers are
// pushed onto m_aUIntStack before beginObject runs.
209 #ifdef USE_ASSIGN_ACTOR
210 object_begin= uint_p[push_back_a(pSelf->m_aUIntStack)]
211 >> uint_p[push_back_a(pSelf->m_aUIntStack)]
212 >> str_p(
"obj" )[boost::bind(&PDFGrammar::beginObject, pSelf, _1, _2)];
214 object_begin= uint_p[boost::bind(&PDFGrammar::push_back_action_uint, pSelf, _1)]
215 >> uint_p[boost::bind(&PDFGrammar::push_back_action_uint, pSelf, _1)]
216 >> str_p(
"obj" )[boost::bind(&PDFGrammar::beginObject, pSelf, _1, _2)];
218 object_end = str_p(
"endobj" )[boost::bind(&PDFGrammar::endObject, pSelf, _1, _2)];
// xref: "xref", two unsigned integers, then one or more entries; the visible
// entry parts are 10 digits, 5 digits, 'n' or 'f', and two space characters.
// NOTE(review): the separators between these parts are not visible in this listing.
220 xref = str_p(
"xref" ) >> uint_p >> uint_p
222 +( repeat_p(10)[digit_p]
224 >> repeat_p(5)[digit_p]
226 >> ( ch_p(
'n') | ch_p(
'f') )
227 >> repeat_p(2)[space_p]
230 dict_element= dict_begin | comment | simple_type
231 | array_begin | array_end | dict_end;
// NOTE(review): the remainder of the object rule is not visible in this listing.
233 object = object_begin
// trailer: starts with "trailer" (beginTrailer) and ends with "%%EOF" (endTrailer).
238 trailer = str_p(
"trailer" )[boost::bind(&PDFGrammar::beginTrailer,pSelf,_1,_2)]
240 >> str_p(
"startxref")
242 >> str_p(
"%%EOF")[boost::bind(&PDFGrammar::endTrailer,pSelf,_1,_2)];
// pdfrule (start rule): an optional header line carrying two unsigned integers
// (reported through haveFile), followed by any number of comments, objects, or
// xref/trailer pairs.
// NOTE(review): the leading part of the header pattern is not visible in this listing.
244 #ifdef USE_ASSIGN_ACTOR
245 pdfrule = ! (lexeme_d[
247 >> uint_p[push_back_a(pSelf->m_aUIntStack)]
249 >> uint_p[push_back_a(pSelf->m_aUIntStack)]
250 >> *((~ch_p(
'\r') & ~ch_p(
'\n')))
252 ])[boost::bind(&PDFGrammar::haveFile,pSelf, _1, _2)]
253 >> *( comment |
object | ( xref >> trailer ) );
255 pdfrule = ! (lexeme_d[
257 >> uint_p[boost::bind(&PDFGrammar::push_back_action_uint, pSelf, _1)]
259 >> uint_p[boost::bind(&PDFGrammar::push_back_action_uint, pSelf, _1)]
260 >> *(~ch_p(
'\r') & ~ch_p(
'\n'))
262 ])[boost::bind(&PDFGrammar::haveFile,pSelf, _1, _2)]
263 >> *( comment |
object | ( xref >> trailer ) );
// Rule objects of the grammar definition, all typed on the scanner.
// NOTE(review): no assignment to `array` or `value` is visible in this listing.
266 rule< ScannerT > comment,
stream, boolean,
name, stringtype, null_object, simple_type,
267 objectref, array,
value, dict_element, dict_begin, dict_end,
268 array_begin, array_end, object, object_begin, object_end,
269 xref, trailer, pdfrule;
// Entry point of the grammar definition: returns the top-level rule pdfrule.
271 const rule< ScannerT >&
start()
const {
return pdfrule; }
// Helper semantic action, compiled only when USE_ASSIGN_ACTOR is not defined:
// appends the parsed unsigned integer to m_aUIntStack.
// NOTE(review): the matching #endif is not visible in this listing.
274 #ifndef USE_ASSIGN_ACTOR
275 void push_back_action_uint(
unsigned int i )
277 m_aUIntStack.push_back( i );
// Helper semantic action bound to real_p in the simple_type rule; receives the
// parsed double.
// NOTE(review): the body is not visible in this listing (presumably it stores d
// in m_fDouble, which pushDouble reads — confirm).
279 void assign_action_double(
double d )
// Reports a parse failure by calling throw_ with the location and the message
// as descriptor. pMessage is passed on as a pointer, not copied here.
285 static void parseError(
const char* pMessage, iteratorT pLocation )
287 throw_( pLocation, pMessage );
// Builds an OString from the characters in [first, last) by appending each
// dereferenced element to an OStringBuffer constructed with the argument 32.
// NOTE(review): the statement advancing `first` is on a line missing from this listing.
290 OString iteratorToString( iteratorT first, iteratorT last )
const
292 OStringBuffer
aStr( 32 );
293 while( first != last )
295 aStr.append( *first );
298 return aStr.makeStringAndClear();
// Semantic action for the header part of pdfrule. When the object stack is
// empty, the two integers on m_aUIntStack become the file's minor (popped
// first) and major version, and pFile is pushed onto the object stack.
// The "found file header in unusual place" error is presumably raised in the
// else branch.
// NOTE(review): the creation of pFile and the else line are not visible in this listing.
301 void haveFile( iteratorT pBegin, SAL_UNUSED_PARAMETER iteratorT )
303 if( m_aObjectStack.empty() )
306 pFile->
m_nMinor = m_aUIntStack.back();
307 m_aUIntStack.pop_back();
308 pFile->
m_nMajor = m_aUIntStack.back();
309 m_aUIntStack.pop_back();
310 m_aObjectStack.push_back( pFile );
313 parseError(
"found file header in unusual place", pBegin );
// Semantic action for the comment rule. If nothing is on the object stack yet,
// a new PDFPart is pushed first; a null pContainer is reported as
// "comment without container".
// NOTE(review): how pContainer is obtained and how the comment is stored are
// on lines missing from this listing.
316 void pushComment( iteratorT first, iteratorT last )
321 if( m_aObjectStack.empty() )
322 m_aObjectStack.push_back(
new PDFPart() );
324 if( pContainer ==
nullptr )
325 parseError(
"comment without container", first );
// Common helper for the push*/begin* actions: takes ownership of pNewValue and
// looks for a container on top of m_aObjectStack to receive it. pMsg collects
// the reason when no suitable container is found and is passed to parseError
// together with pPos.
// NOTE(review): many lines of this function (declarations, braces, the actual
// insertion and the condition guarding the final parseError) are missing from
// this listing; the comments below describe only what is visible.
329 void insertNewValue( std::unique_ptr<PDFEntry> pNewValue, iteratorT pPos )
332 const char* pMsg =
nullptr;
333 if( ! m_aObjectStack.empty() )
// The top of the stack is considered as a container candidate.
335 pContainer =
dynamic_cast<PDFContainer*
>(m_aObjectStack.back());
// Containers that are neither a dictionary nor an array get special handling.
338 if(
dynamic_cast<PDFDict*
>(pContainer) ==
nullptr &&
339 dynamic_cast<PDFArray*
>(pContainer) ==
nullptr )
348 pMsg =
"second value for object";
349 pContainer =
nullptr;
// A dictionary value is checked against a trailer's m_pDict.
352 else if(
dynamic_cast<PDFDict*
>(pNewValue.get()) )
357 if( pTrailer->
m_pDict ==
nullptr )
360 pContainer =
nullptr;
363 pContainer =
nullptr;
366 pContainer =
nullptr;
377 pMsg =
"array without container";
379 pMsg =
"value without container";
381 parseError( pMsg, pPos );
// Semantic action for a name: wraps the matched text [first, last) in a
// PDFName and inserts it, using `first` as the error position.
385 void pushName( iteratorT first, iteratorT last )
387 insertNewValue( std::make_unique<PDFName>(iteratorToString(first,last)), first );
// Semantic action for a real number: inserts a PDFNumber built from m_fDouble
// (the matched range itself is not re-parsed here).
390 void pushDouble( iteratorT first, SAL_UNUSED_PARAMETER iteratorT )
392 insertNewValue( std::make_unique<PDFNumber>(m_fDouble), first );
// Semantic action for a string: wraps the matched text [first, last),
// delimiters included as matched by the stringtype rule, in a PDFString.
395 void pushString( iteratorT first, iteratorT last )
397 insertNewValue( std::make_unique<PDFString>(iteratorToString(first,last)), first );
// Semantic action for a boolean. The boolean rule matches only "true" or
// "false", so a match length of 4 identifies "true".
400 void pushBool( iteratorT first, iteratorT last )
402 insertNewValue( std::make_unique<PDFBool>( last-first == 4 ), first );
// Semantic action for the "null" keyword: inserts a PDFNull.
405 void pushNull( iteratorT first, SAL_UNUSED_PARAMETER iteratorT )
407 insertNewValue( std::make_unique<PDFNull>(), first );
// Semantic action for "<num> <gen> obj". Pushes a new PDFPart if the object
// stack is empty, then takes the generation (pushed last, so popped first) and
// the object number from m_aUIntStack. The object is pushed onto the stack when
// the container is a PDFFile or PDFPart; "object in wrong place" is presumably
// reported otherwise.
// NOTE(review): the creation of pObj, the lookup of pContainer and the else
// line are not visible in this listing.
411 void beginObject( iteratorT first, SAL_UNUSED_PARAMETER iteratorT )
413 if( m_aObjectStack.empty() )
414 m_aObjectStack.push_back(
new PDFPart() );
416 unsigned int nGeneration = m_aUIntStack.back();
417 m_aUIntStack.pop_back();
418 unsigned int nObject = m_aUIntStack.back();
419 m_aUIntStack.pop_back();
426 (
dynamic_cast<PDFFile*
>(pContainer) ||
427 dynamic_cast<PDFPart*
>(pContainer) ) )
430 m_aObjectStack.push_back( pObj );
433 parseError(
"object in wrong place", first );
// Semantic action for "endobj": reports an error if the object stack is empty
// or its top is not a PDFObject; the top entry is popped (presumably only in
// the remaining else branch, whose `else` line is missing from this listing).
436 void endObject( iteratorT first, SAL_UNUSED_PARAMETER iteratorT )
438 if( m_aObjectStack.empty() )
439 parseError(
"endobj without obj", first );
440 else if(
dynamic_cast<PDFObject*
>(m_aObjectStack.back()) ==
nullptr )
441 parseError(
"spurious endobj", first );
443 m_aObjectStack.pop_back();
// Semantic action for an indirect reference: takes the generation (popped
// first) and the object number from m_aUIntStack and inserts a PDFObjectRef.
446 void pushObjectRef( iteratorT first, SAL_UNUSED_PARAMETER iteratorT )
448 unsigned int nGeneration = m_aUIntStack.back();
449 m_aUIntStack.pop_back();
450 unsigned int nObject = m_aUIntStack.back();
451 m_aUIntStack.pop_back();
452 insertNewValue( std::make_unique<PDFObjectRef>(nObject,nGeneration), first );
// Semantic action for "<<": hands pDict to insertNewValue (which takes
// ownership through the unique_ptr) and then pushes the same raw pointer onto
// the object stack so that following values land in this dictionary.
// NOTE(review): the creation of pDict is not visible in this listing.
455 void beginDict( iteratorT first, SAL_UNUSED_PARAMETER iteratorT )
460 insertNewValue( std::unique_ptr<PDFEntry>(pDict), first );
462 m_aObjectStack.push_back( pDict );
// Semantic action for ">>": reports an error if the object stack is empty or
// its top is not a PDFDict, and pops the dictionary from the stack. For an
// offending element, an error text is assembled in a StringEmitContext, stored
// in m_aErrorString and passed to parseError.
// NOTE(review): the else line, the lookup of pOffender and the condition
// guarding the error path are not visible in this listing.
464 void endDict( iteratorT first, SAL_UNUSED_PARAMETER iteratorT )
467 if( m_aObjectStack.empty() )
468 parseError(
"dictionary end without begin", first );
469 else if( (pDict =
dynamic_cast<PDFDict*
>(m_aObjectStack.back())) ==
nullptr )
470 parseError(
"spurious dictionary end", first );
472 m_aObjectStack.pop_back();
477 StringEmitContext aCtx;
// 30 is the character count of the literal prefix written here.
478 aCtx.write(
"offending dictionary element: ", 30 );
479 pOffender->
emit( aCtx );
// The text is kept in a member so the pointer given to parseError does not
// refer to the local aCtx.
480 m_aErrorString = aCtx.getString();
481 parseError( m_aErrorString.getStr(), first );
// Semantic action for "[": hands pArray to insertNewValue (which takes
// ownership through the unique_ptr) and then pushes the same raw pointer onto
// the object stack so that following values land in this array.
// NOTE(review): the creation of pArray is not visible in this listing.
485 void beginArray( iteratorT first, SAL_UNUSED_PARAMETER iteratorT )
490 insertNewValue( std::unique_ptr<PDFEntry>(pArray), first );
492 m_aObjectStack.push_back( pArray );
// Semantic action for "]": reports an error if the object stack is empty or
// its top is not a PDFArray; the top entry is popped (presumably only in the
// remaining else branch, whose `else` line is missing from this listing).
495 void endArray( iteratorT first, SAL_UNUSED_PARAMETER iteratorT )
497 if( m_aObjectStack.empty() )
498 parseError(
"array end without begin", first );
499 else if(
dynamic_cast<PDFArray*
>(m_aObjectStack.back()) ==
nullptr )
500 parseError(
"spurious array end", first );
502 m_aObjectStack.pop_back();
// Semantic action for the stream rule. The visible lines are error paths only:
// an empty object stack, a second stream for the same object, and a final
// "stream without object" case.
// NOTE(review): the conditions for the last two errors and the code that
// creates and attaches the stream are not visible in this listing.
505 void emitStream( iteratorT first, iteratorT last )
507 if( m_aObjectStack.empty() )
508 parseError(
"stream without object", first );
513 parseError(
"multiple streams in object", first );
525 parseError(
"stream without object", first );
// Semantic action for "trailer". Pushes a new PDFPart if the object stack is
// empty. The trailer is pushed onto the stack when the container is a PDFFile
// or PDFPart; "trailer in wrong place" is presumably reported otherwise.
// NOTE(review): the creation of pTrailer, the lookup of pContainer and the
// else line are not visible in this listing.
528 void beginTrailer( iteratorT first, SAL_UNUSED_PARAMETER iteratorT )
530 if( m_aObjectStack.empty() )
531 m_aObjectStack.push_back(
new PDFPart() );
538 (
dynamic_cast<PDFFile*
>(pContainer) ||
539 dynamic_cast<PDFPart*
>(pContainer) ) )
542 m_aObjectStack.push_back( pTrailer );
545 parseError(
"trailer in wrong place", first );
// Semantic action for "%%EOF": reports an error if the object stack is empty
// or its top is not a PDFTrailer; the top entry is popped (presumably only in
// the remaining else branch, whose `else` line is missing from this listing).
548 void endTrailer( iteratorT first, SAL_UNUSED_PARAMETER iteratorT )
550 if( m_aObjectStack.empty() )
551 parseError(
"%%EOF without trailer", first );
552 else if(
dynamic_cast<PDFTrailer*
>(m_aObjectStack.back()) ==
nullptr )
553 parseError(
"spurious %%EOF", first );
555 m_aObjectStack.pop_back();
// Parses a PDF held in memory. A PDFGrammar over const char* is run through
// boost::spirit::classic::parse with space_p as the skip parser; parser_error
// exceptions are caught and logged. Afterwards the entry on top of the
// grammar's object stack is handed to the returned unique_ptr and removed from
// the stack.
// NOTE(review): the try line, the remaining parse arguments (including how
// nLen is used), the #else/#endif directives and the condition guarding the
// pRet.reset call are missing from this listing.
562std::unique_ptr<PDFEntry> PDFReader::read(
const char* pBuffer,
unsigned int nLen )
564 PDFGrammar<const char*> aGrammar( pBuffer );
// In debug builds the parse result is kept so that it can be logged below.
568#if OSL_DEBUG_LEVEL > 0
569 boost::spirit::classic::parse_info<const char*> aInfo =
571 boost::spirit::classic::parse( pBuffer,
574 boost::spirit::classic::space_p );
575#if OSL_DEBUG_LEVEL > 0
576 SAL_INFO(
"sdext.pdfimport.pdfparse",
"parseinfo: stop = " << aInfo.stop <<
" (buff=" << pBuffer <<
", offset = " << aInfo.stop - pBuffer <<
"), hit = " << (aInfo.hit ? OUString(
"true") : OUString(
"false")) <<
", full = " << (aInfo.full ? OUString(
"true") : OUString(
"false")) <<
", length = " <<
static_cast<int>(aInfo.length) );
// Errors raised through parseError arrive here with a const char* descriptor.
579 catch(
const parser_error<const char*, const char*>& rError )
581#if OSL_DEBUG_LEVEL > 0
// Debug aid: collect the dynamic type names of everything left on the object stack.
583 unsigned int nElem = aGrammar.m_aObjectStack.size();
584 for(
unsigned int i = 0;
i < nElem;
i++ )
585 aTmp += OString::Concat(
" ") +
typeid( *(aGrammar.m_aObjectStack[
i]) ).name();
587 SAL_WARN(
"sdext.pdfimport.pdfparse",
"parse error: " << rError.descriptor <<
" at buffer pos " << rError.where - pBuffer <<
", object stack: " << aTmp);
593 std::unique_ptr<PDFEntry> pRet;
594 unsigned int nEntries = aGrammar.m_aObjectStack.size();
// Ownership of the top stack entry moves to pRet (presumably only when exactly
// one entry is left, given the `nEntries > 1` warning branch below — confirm).
597 pRet.reset(aGrammar.m_aObjectStack.back());
598 aGrammar.m_aObjectStack.pop_back();
600#if OSL_DEBUG_LEVEL > 0
601 else if( nEntries > 1 )
602 SAL_WARN(
"sdext.pdfimport.pdfparse",
"error got " << nEntries <<
" stack objects in parse" );
// NOTE(review): the signature of this function is not visible in this listing;
// it works on a file name (pFileName). Two strategies are visible, presumably
// selected by preprocessor conditionals on lines missing from this listing:
//  1. read the whole file with C stdio into a malloc'ed buffer and delegate to
//     read( pBuf, nLen );
//  2. parse directly from the file through a boost file_iterator.
620 std::unique_ptr<PDFEntry> pRet;
621 FILE* fp = fopen( pFileName,
"rb" );
// The file size is determined by seeking to the end.
// NOTE(review): the result of ftell is cast to unsigned int without a check
// for failure on the visible lines.
624 fseek( fp, 0, SEEK_END );
625 unsigned int nLen =
static_cast<unsigned int>(ftell( fp ));
626 fseek( fp, 0, SEEK_SET );
627 char* pBuf =
static_cast<char*
>(std::malloc( nLen ));
// NOTE(review): the return value of fread is discarded, so a short read is
// not detected here.
630 fread( pBuf, 1, nLen, fp );
631 pRet =
read( pBuf, nLen );
// File-iterator variant: the grammar is instantiated on file_iterator<>.
638 file_iterator<> file_start( pFileName );
641 file_iterator<> file_end = file_start.make_end();
642 PDFGrammar< file_iterator<> > aGrammar( file_start );
// In debug builds the parse result is kept so that it can be logged below.
646#if OSL_DEBUG_LEVEL > 0
647 boost::spirit::classic::parse_info< file_iterator<> > aInfo =
649 boost::spirit::classic::parse( file_start,
652 boost::spirit::classic::space_p );
653#if OSL_DEBUG_LEVEL > 0
654 SAL_INFO(
"sdext.pdfimport.pdfparse",
"parseinfo: stop at offset = " << aInfo.stop - file_start <<
", hit = " << (aInfo.hit ?
"true" :
"false") <<
", full = " << (aInfo.full ?
"true" :
"false") <<
", length = " << aInfo.length);
// Errors raised through parseError arrive here with a const char* descriptor.
657 catch(
const parser_error<
const char*, file_iterator<> >& rError )
659 SAL_WARN(
"sdext.pdfimport.pdfparse",
"parse error: " << rError.descriptor <<
" at buffer pos " << rError.where - file_start);
660#if OSL_DEBUG_LEVEL > 0
// Debug aid: collect the dynamic type names of everything left on the object stack.
662 unsigned int nElem = aGrammar.m_aObjectStack.size();
663 for(
unsigned int i = 0;
i < nElem;
i++ )
666 aTmp.appendAscii(
typeid( *(aGrammar.m_aObjectStack[
i]) ).name());
668 SAL_WARN(
"sdext.pdfimport.pdfparse",
"parse error object stack: " << aTmp.makeStringAndClear());
672 std::unique_ptr<PDFEntry> pRet;
673 unsigned int nEntries = aGrammar.m_aObjectStack.size();
// Ownership of the top stack entry moves to pRet (presumably only when exactly
// one entry is left, given the `nEntries > 1` warning branch below — confirm).
676 pRet.reset(aGrammar.m_aObjectStack.back());
677 aGrammar.m_aObjectStack.pop_back();
679#if OSL_DEBUG_LEVEL > 0
680 else if( nEntries > 1 )
682 SAL_WARN(
"sdext.pdfimport.pdfparse",
"error got " << nEntries <<
" stack objects in parse");
// Debug aid: log the dynamic type of every entry left on the stack.
683 for(
unsigned int i = 0;
i < nEntries;
i++ )
685 SAL_WARN(
"sdext.pdfimport.pdfparse",
typeid(*aGrammar.m_aObjectStack[
i]).name());
690 SAL_WARN(
"sdext.pdfimport.pdfparse",
"(type " <<
typeid(*aGrammar.m_aObjectStack[
i]).name() <<
")");
virtual bool copyOrigBytes(unsigned int nOrigOffset, unsigned int nLen)=0
virtual unsigned int readOrigBytes(unsigned int nOrigOffset, unsigned int nLen, void *pBuf)=0
virtual bool write(const void *pBuf, unsigned int nLen)=0
virtual unsigned int getCurPos()=0
Reference< XOutputStream > stream
static osl::File * pStream
#define SAL_WARN(area, stream)
#define SAL_INFO(area, stream)
OUString getString(const Any &_rAny)
constexpr OUStringLiteral first
constexpr std::enable_if_t< std::is_signed_v< T >, std::make_unsigned_t< T > > make_unsigned(T value)
uno::Sequence< sal_Int8 > m_aBuf
std::vector< std::unique_ptr< PDFEntry > > m_aSubElements
virtual bool emit(EmitContext &rWriteContext) const =0
unsigned int m_nGeneration
static std::unique_ptr< PDFEntry > read(const char *pFileName)