LibreOffice Module sdext (master) 1
pdfunzip.cxx
Go to the documentation of this file.
1/* -*- Mode: C++; tab-width: 4; indent-tabs-mode: nil; c-basic-offset: 4 -*- */
2/*
3 * This file is part of the LibreOffice project.
4 *
5 * This Source Code Form is subject to the terms of the Mozilla Public
6 * License, v. 2.0. If a copy of the MPL was not distributed with this
7 * file, You can obtain one at http://mozilla.org/MPL/2.0/.
8 *
9 * This file incorporates work covered by the following license notice:
10 *
11 * Licensed to the Apache Software Foundation (ASF) under one or more
12 * contributor license agreements. See the NOTICE file distributed
13 * with this work for additional information regarding copyright
14 * ownership. The ASF licenses this file to you under the Apache
15 * License, Version 2.0 (the "License"); you may not use this file
16 * except in compliance with the License. You may obtain a copy of
17 * the License at http://www.apache.org/licenses/LICENSE-2.0 .
18 */
19
20
21#include <stdio.h>
22#include <string_view>
23
24#include <sal/main.h>
25#include <osl/file.h>
26#include <osl/thread.h>
27#include <rtl/alloc.h>
28#include <rtl/ustring.hxx>
29#include <rtl/strbuf.hxx>
30#include <o3tl/string_view.hxx>
31
32#include <pdfparse.hxx>
33
34using namespace pdfparse;
35
36
/** Prints the command line usage summary to stdout.

    @param pExe  name of the executable, substituted into every usage line
*/
static void printHelp( const char* pExe )
{
    fprintf( stdout,
    "USAGE: %s [-h,--help]\n"
    " %s [-pw, --password <password>] <inputfile> [<outputfile>]\n"
    " %s <-a, --extract-add-streams> [-pw, --password <password>] <inputfile> [<outputfile>]\n"
    " %s <-f, --extract-fonts> [-pw, --password <password>] <inputfile> [<outputfile>]\n"
    " %s <-o, --extract-objects> <o0>[:<g0>][,<o1>[:<g1>][,...]] [-pw, --password <password>] <inputfile> [<outputfile>]\n"
    " -h, --help: show help\n"
    " -a, --extract-add-streams: extracts additional streams to outputfile_object\n"
    " and prints the mimetype found to stdout\n"
    // closing parenthesis was missing in the original help text
    " -f, --extract-fonts: extracts fonts (currently only type1 and truetype are supported)\n"
    " -o, --extract-objects: extracts object streams, the syntax of the argument is comma separated\n"
    " object numbers, where object number and generation number are separated by \':\'\n"
    " an omitted generation number defaults to 0\n"
    " -pw, --password: use password for decryption\n"
    "\n"
    "note: -f, -a, -o and normal unzip operation are mutually exclusive\n"
    , pExe, pExe, pExe, pExe, pExe );
}
57
58namespace {
59
60class FileEmitContext : public EmitContext
61{
62 oslFileHandle m_aHandle;
63 oslFileHandle m_aReadHandle;
64 unsigned int m_nReadLen;
65
66 void openReadFile( const char* pOrigName );
67
68 public:
69 FileEmitContext( const char* pFileName, const char* pOrigName, const PDFContainer* pTop );
70 virtual ~FileEmitContext() override;
71
72 virtual bool write( const void* pBuf, unsigned int nLen ) noexcept override;
73 virtual unsigned int getCurPos() noexcept override;
74 virtual bool copyOrigBytes( unsigned int nOrigOffset, unsigned int nLen ) noexcept override;
75 virtual unsigned int readOrigBytes( unsigned int nOrigOffset, unsigned int nLen, void* pBuf ) noexcept override;
76};
77
78}
79
80FileEmitContext::FileEmitContext( const char* pFileName, const char* pOrigName, const PDFContainer* pTop )
81 : EmitContext( pTop ),
82 m_aHandle( nullptr ),
83 m_aReadHandle( nullptr ),
84 m_nReadLen( 0 )
85{
86 OUString aSysFile(
87 OStringToOUString( std::string_view( pFileName ), osl_getThreadTextEncoding() ) );
88 OUString aURL;
89 if( osl_getFileURLFromSystemPath( aSysFile.pData, &aURL.pData ) != osl_File_E_None )
90 {
91 fprintf( stderr, "filename conversion \"%s\" failed\n", pFileName );
92 return;
93 }
94
95 if( osl_openFile( aURL.pData, &m_aHandle, osl_File_OpenFlag_Write ) == osl_File_E_None )
96 {
97 if( osl_setFileSize( m_aHandle, 0 ) != osl_File_E_None )
98 {
99 fprintf( stderr, "could not truncate %s\n", pFileName );
100 osl_closeFile( m_aHandle );
101 m_aHandle = nullptr;
102 }
103 }
104 else if( osl_openFile( aURL.pData, &m_aHandle,
105 osl_File_OpenFlag_Write |osl_File_OpenFlag_Create ) != osl_File_E_None )
106 {
107 fprintf( stderr, "could not open %s\n", pFileName );
108 return;
109 }
110 m_bDeflate = true;
111
112 openReadFile( pOrigName );
113}
114
115FileEmitContext::~FileEmitContext()
116{
117 if( m_aHandle )
118 osl_closeFile( m_aHandle );
119 if( m_aReadHandle )
120 osl_closeFile( m_aReadHandle );
121}
122
123void FileEmitContext::openReadFile( const char* pInFile )
124{
125 OUString aSysFile(
126 OStringToOUString( std::string_view( pInFile ), osl_getThreadTextEncoding() ) );
127 OUString aURL;
128 if( osl_getFileURLFromSystemPath( aSysFile.pData, &aURL.pData ) != osl_File_E_None )
129 {
130 fprintf( stderr, "filename conversion \"%s\" failed\n", pInFile );
131 return;
132 }
133
134 if( osl_openFile( aURL.pData, &m_aReadHandle, osl_File_OpenFlag_Read ) != osl_File_E_None )
135 {
136 fprintf( stderr, "could not open %s\n", pInFile );
137 return;
138 }
139
140 if( osl_setFilePos( m_aReadHandle, osl_Pos_End, 0 ) != osl_File_E_None )
141 {
142 fprintf( stderr, "could not seek to end of %s\n", pInFile );
143 osl_closeFile( m_aReadHandle );
144 return;
145 }
146
147 sal_uInt64 nFileSize = 0;
148 if( osl_getFilePos( m_aReadHandle, &nFileSize ) != osl_File_E_None )
149 {
150 fprintf( stderr, "could not get end pos of %s\n", pInFile );
151 osl_closeFile( m_aReadHandle );
152 return;
153 }
154
155 m_nReadLen = static_cast<unsigned int>(nFileSize);
156}
157
158bool FileEmitContext::write( const void* pBuf, unsigned int nLen ) noexcept
159{
160 if( ! m_aHandle )
161 return false;
162
163 sal_uInt64 nWrite = static_cast<sal_uInt64>(nLen);
164 sal_uInt64 nWritten = 0;
165 return (osl_writeFile( m_aHandle, pBuf, nWrite, &nWritten ) == osl_File_E_None)
166 && nWrite == nWritten;
167}
168
169unsigned int FileEmitContext::getCurPos() noexcept
170{
171 sal_uInt64 nFileSize = 0;
172 if( m_aHandle )
173 {
174 if( osl_getFilePos( m_aHandle, &nFileSize ) != osl_File_E_None )
175 nFileSize = 0;
176 }
177 return static_cast<unsigned int>(nFileSize);
178}
179
180bool FileEmitContext::copyOrigBytes( unsigned int nOrigOffset, unsigned int nLen ) noexcept
181{
182 if( nOrigOffset + nLen > m_nReadLen )
183 return false;
184
185 if( osl_setFilePos( m_aReadHandle, osl_Pos_Absolut, nOrigOffset ) != osl_File_E_None )
186 {
187 fprintf( stderr, "could not seek to offset %u\n", nOrigOffset );
188 return false;
189 }
190 void* pBuf = std::malloc( nLen );
191 if( ! pBuf )
192 return false;
193 sal_uInt64 nBytesRead = 0;
194 if( osl_readFile( m_aReadHandle, pBuf, nLen, &nBytesRead ) != osl_File_E_None
195 || nBytesRead != static_cast<sal_uInt64>(nLen) )
196 {
197 fprintf( stderr, "could not read %u bytes\n", nLen );
198 std::free( pBuf );
199 return false;
200 }
201 bool bRet = write( pBuf, nLen );
202 std::free( pBuf );
203 return bRet;
204}
205
206unsigned int FileEmitContext::readOrigBytes( unsigned int nOrigOffset, unsigned int nLen, void* pBuf ) noexcept
207{
208 if( nOrigOffset + nLen > m_nReadLen )
209 return 0;
210
211 if( osl_setFilePos( m_aReadHandle, osl_Pos_Absolut, nOrigOffset ) != osl_File_E_None )
212 {
213 fprintf( stderr, "could not seek to offset %u\n", nOrigOffset );
214 return 0;
215 }
216 sal_uInt64 nBytesRead = 0;
217 if( osl_readFile( m_aReadHandle, pBuf, nLen, &nBytesRead ) != osl_File_E_None )
218 return 0;
219 return static_cast<unsigned int>(nBytesRead);
220}
221
222typedef int(*PDFFileHdl)(const char*, const char*, PDFFile*);
223
224static int handleFile( const char* pInFile, const char* pOutFile, const char* pPassword, PDFFileHdl pHdl )
225{
226 int nRet = 0;
227 std::unique_ptr<PDFEntry> pEntry = pdfparse::PDFReader::read( pInFile );
228 if( pEntry )
229 {
230 PDFFile* pPDFFile = dynamic_cast<PDFFile*>(pEntry.get());
231 if( pPDFFile )
232 {
233 fprintf( stdout, "have a %s PDF file\n", pPDFFile->isEncrypted() ? "encrypted" : "unencrypted" );
234 if( pPassword )
235 fprintf( stdout, "password %s\n",
236 pPDFFile->setupDecryptionData( pPassword ) ? "matches" : "does not match" );
237 nRet = pHdl( pInFile, pOutFile, pPDFFile );
238 }
239 else
240 nRet = 20;
241 }
242 return nRet;
243}
244
245static int write_unzipFile( const char* pInFile, const char* pOutFile, PDFFile* pPDFFile )
246{
247 FileEmitContext aContext( pOutFile, pInFile, pPDFFile );
248 aContext.m_bDecrypt = pPDFFile->isEncrypted();
249 pPDFFile->emit(aContext);
250 return 0;
251}
252
253static int write_addStreamArray( const char* pOutFile, PDFArray* pStreams, PDFFile* pPDFFile, const char* pInFile )
254{
255 int nRet = 0;
256 unsigned int nArrayElements = pStreams->m_aSubElements.size();
257 for( unsigned int i = 0; i < nArrayElements-1 && nRet == 0; i++ )
258 {
259 PDFName* pMimeType = dynamic_cast<PDFName*>(pStreams->m_aSubElements[i].get());
260 PDFObjectRef* pStreamRef = dynamic_cast<PDFObjectRef*>(pStreams->m_aSubElements[i+1].get());
261 if( ! pMimeType )
262 fprintf( stderr, "error: no mimetype element\n" );
263 if( ! pStreamRef )
264 fprintf( stderr, "error: no stream ref element\n" );
265 if( pMimeType && pStreamRef )
266 {
267 fprintf( stdout, "found stream %d %d with mimetype %s\n",
268 pStreamRef->m_nNumber, pStreamRef->m_nGeneration,
269 pMimeType->m_aName.getStr() );
270 PDFObject* pObject = pPDFFile->findObject( pStreamRef->m_nNumber, pStreamRef->m_nGeneration );
271 if( pObject )
272 {
273 OString aOutStream = pOutFile +
274 OString::Concat("_stream_") +
275 OString::number( sal_Int32(pStreamRef->m_nNumber) ) +
276 "_" +
277 OString::number( sal_Int32(pStreamRef->m_nGeneration) );
278 FileEmitContext aContext( aOutStream.getStr(), pInFile, pPDFFile );
279 aContext.m_bDecrypt = pPDFFile->isEncrypted();
280 pObject->writeStream( aContext, pPDFFile );
281 }
282 else
283 {
284 fprintf( stderr, "object not found\n" );
285 nRet = 121;
286 }
287 }
288 else
289 nRet = 120;
290 }
291 return nRet;
292}
293
294static int write_addStreams( const char* pInFile, const char* pOutFile, PDFFile* pPDFFile )
295{
296 // find all trailers
297 int nRet = 0;
298 unsigned int nElements = pPDFFile->m_aSubElements.size();
299 for( unsigned i = 0; i < nElements && nRet == 0; i++ )
300 {
301 PDFTrailer* pTrailer = dynamic_cast<PDFTrailer*>(pPDFFile->m_aSubElements[i].get());
302 if( pTrailer && pTrailer->m_pDict )
303 {
304 // search for AdditionalStreams entry
305 auto add_stream = pTrailer->m_pDict->m_aMap.find( "AdditionalStreams" );
306 if( add_stream != pTrailer->m_pDict->m_aMap.end() )
307 {
308 PDFArray* pStreams = dynamic_cast<PDFArray*>(add_stream->second);
309 if( pStreams )
310 nRet = write_addStreamArray( pOutFile, pStreams, pPDFFile, pInFile );
311 }
312 }
313 }
314 return nRet;
315}
316
317static int write_fonts( const char* i_pInFile, const char* i_pOutFile, PDFFile* i_pPDFFile )
318{
319 unsigned int nElements = i_pPDFFile->m_aSubElements.size();
320 for (unsigned i = 0; i < nElements; i++)
321 {
322 // search FontDescriptors
323 PDFObject* pObj = dynamic_cast<PDFObject*>(i_pPDFFile->m_aSubElements[i].get());
324 if( ! pObj )
325 continue;
326 PDFDict* pDict = dynamic_cast<PDFDict*>(pObj->m_pObject);
327 if( ! pDict )
328 continue;
329
330 std::unordered_map<OString,PDFEntry*>::iterator map_it =
331 pDict->m_aMap.find( "Type" );
332 if( map_it == pDict->m_aMap.end() )
333 continue;
334
335 PDFName* pName = dynamic_cast<PDFName*>(map_it->second);
336 if( ! pName )
337 continue;
338 if( pName->m_aName != "FontDescriptor" )
339 continue;
340
341 // the font name will be helpful, also there must be one in
342 // a font descriptor
343 map_it = pDict->m_aMap.find( "FontName" );
344 if( map_it == pDict->m_aMap.end() )
345 continue;
346 pName = dynamic_cast<PDFName*>(map_it->second);
347 if( ! pName )
348 continue;
349 OString aFontName( pName->m_aName );
350
351 PDFObjectRef* pStreamRef = nullptr;
352 const char* pFileType = nullptr;
353 // we have a font descriptor, try for a type 1 font
354 map_it = pDict->m_aMap.find( "FontFile" );
355 if( map_it != pDict->m_aMap.end() )
356 {
357 pStreamRef = dynamic_cast<PDFObjectRef*>(map_it->second);
358 if( pStreamRef )
359 pFileType = "pfa";
360 }
361
362 // perhaps it's a truetype file ?
363 if( ! pStreamRef )
364 {
365 map_it = pDict->m_aMap.find( "FontFile2" );
366 if( map_it != pDict->m_aMap.end() )
367 {
368 pStreamRef = dynamic_cast<PDFObjectRef*>(map_it->second);
369 if( pStreamRef )
370 pFileType = "ttf";
371 }
372 }
373
374 if( ! pStreamRef )
375 continue;
376
377 PDFObject* pStream = i_pPDFFile->findObject( pStreamRef );
378 if( ! pStream )
379 continue;
380
381 OStringBuffer aOutStream( OString::Concat(i_pOutFile)
382 + "_font_"
383 + OString::number( sal_Int32(pStreamRef->m_nNumber) )
384 + "_"
385 + OString::number( sal_Int32(pStreamRef->m_nGeneration) )
386 + "_"
387 + aFontName );
388 if( pFileType )
389 {
390 aOutStream.append( OString::Concat(".") + pFileType );
391 }
392 FileEmitContext aContext( aOutStream.getStr(), i_pInFile, i_pPDFFile );
393 aContext.m_bDecrypt = i_pPDFFile->isEncrypted();
394 pStream->writeStream( aContext, i_pPDFFile );
395 }
396 return 0;
397}
398
399static std::vector< std::pair< sal_Int32, sal_Int32 > > s_aEmitObjects;
400
401static int write_objects( const char* i_pInFile, const char* i_pOutFile, PDFFile* i_pPDFFile )
402{
403 unsigned int nElements = s_aEmitObjects.size();
404 for (unsigned i = 0; i < nElements; i++)
405 {
406 sal_Int32 nObject = s_aEmitObjects[i].first;
407 sal_Int32 nGeneration = s_aEmitObjects[i].second;
408 PDFObject* pStream = i_pPDFFile->findObject( nObject, nGeneration );
409 if( ! pStream )
410 {
411 fprintf( stderr, "object %d %d not found !\n", static_cast<int>(nObject), static_cast<int>(nGeneration) );
412 continue;
413 }
414
415 OString aOutStream = i_pOutFile +
416 OString::Concat("_stream_") +
417 OString::number( nObject ) +
418 "_" +
419 OString::number( nGeneration );
420 FileEmitContext aContext( aOutStream.getStr(), i_pInFile, i_pPDFFile );
421 aContext.m_bDecrypt = i_pPDFFile->isEncrypted();
422 pStream->writeStream( aContext, i_pPDFFile );
423 }
424 return 0;
425}
426
428{
429 const char* pInFile = nullptr;
430 const char* pOutFile = nullptr;
431 const char* pPassword = nullptr;
432 OStringBuffer aOutFile( 256 );
434
435 for( int nArg = 1; nArg < argc; nArg++ )
436 {
437 if( argv[nArg][0] == '-' )
438 {
439 if( ! rtl_str_compare( "-pw", argv[nArg] ) ||
440 ! rtl_str_compare( "--password" , argv[nArg] ) )
441 {
442 if( nArg == argc-1 )
443 {
444 fprintf( stderr, "no password given\n" );
445 return 1;
446 }
447 nArg++;
448 pPassword = argv[nArg];
449 }
450 else if( ! rtl_str_compare( "-h", argv[nArg] ) ||
451 ! rtl_str_compare( "--help", argv[nArg] ) )
452 {
453 printHelp( argv[0] );
454 return 0;
455 }
456 else if( ! rtl_str_compare( "-a", argv[nArg] ) ||
457 ! rtl_str_compare( "--extract-add-streams", argv[nArg] ) )
458 {
459 aHdl = write_addStreams;
460 }
461 else if( ! rtl_str_compare( "-f", argv[nArg] ) ||
462 ! rtl_str_compare( "--extract-fonts", argv[nArg] ) )
463 {
464 aHdl = write_fonts;
465 }
466 else if( ! rtl_str_compare( "-o", argv[nArg] ) ||
467 ! rtl_str_compare( "--extract-objects", argv[nArg] ) )
468 {
469 aHdl = write_objects;
470 nArg++;
471 if( nArg < argc )
472 {
473 OString aObjs( argv[nArg] );
474 sal_Int32 nIndex = 0;
475 while( nIndex != -1 )
476 {
477 OString aToken( aObjs.getToken( 0, ',', nIndex ) );
478 sal_Int32 nObject = 0;
479 sal_Int32 nGeneration = 0;
480 sal_Int32 nGenIndex = 0;
481 nObject = o3tl::toInt32( o3tl::getToken( aToken, 0, ':', nGenIndex ) );
482 if( nGenIndex != -1 )
483 nGeneration = o3tl::toInt32( o3tl::getToken(aToken, 0, ':', nGenIndex ));
484 s_aEmitObjects.push_back( std::pair<sal_Int32,sal_Int32>(nObject,nGeneration) );
485 }
486 }
487 }
488 else
489 {
490 fprintf( stderr, "unrecognized option \"%s\"\n",
491 argv[nArg] );
492 printHelp( argv[0] );
493 return 1;
494 }
495 }
496 else if( pInFile == nullptr )
497 pInFile = argv[nArg];
498 else if( pOutFile == nullptr )
499 pOutFile = argv[nArg];
500 }
501 if( ! pInFile )
502 {
503 fprintf( stderr, "no input file given\n" );
504 return 10;
505 }
506 if( ! pOutFile )
507 {
508 OString aFile( pInFile );
509 if( aFile.getLength() > 0 )
510 {
511 if( aFile.getLength() > 4 )
512 {
513 if( aFile.matchIgnoreAsciiCase( ".pdf", aFile.getLength()-4 ) )
514 aOutFile.append( pInFile, aFile.getLength() - 4 );
515 else
516 aOutFile.append( aFile );
517 }
518 aOutFile.append( "_unzip.pdf" );
519 pOutFile = aOutFile.getStr();
520 }
521 else
522 {
523 fprintf( stderr, "no output file given\n" );
524 return 11;
525 }
526 }
527
528 return handleFile( pInFile, pOutFile, pPassword, aHdl );
529}
530
531/* vim:set shiftwidth=4 softtabstop=4 expandtab: */
const char * pName
virtual bool write(const void *pBuf, unsigned int nLen)=0
virtual unsigned int getCurPos()=0
sal_Int32 nElements
URL aURL
EmbeddedObjectRef * pObject
static osl::File * pStream
Definition: emitcontext.cxx:32
unsigned int m_nReadLen
Definition: filterdet.cxx:56
oslFileHandle m_aReadHandle
Definition: filterdet.cxx:55
sal_Int32 nIndex
int i
sal_Int32 toInt32(std::u16string_view str, sal_Int16 radix=10)
std::basic_string_view< charT, traits > getToken(std::basic_string_view< charT, traits > sv, charT delimiter, std::size_t &position)
static std::vector< std::pair< sal_Int32, sal_Int32 > > s_aEmitObjects
Definition: pdfunzip.cxx:399
static int write_objects(const char *i_pInFile, const char *i_pOutFile, PDFFile *i_pPDFFile)
Definition: pdfunzip.cxx:401
static int write_addStreams(const char *pInFile, const char *pOutFile, PDFFile *pPDFFile)
Definition: pdfunzip.cxx:294
static int write_fonts(const char *i_pInFile, const char *i_pOutFile, PDFFile *i_pPDFFile)
Definition: pdfunzip.cxx:317
int(* PDFFileHdl)(const char *, const char *, PDFFile *)
Definition: pdfunzip.cxx:222
static int write_unzipFile(const char *pInFile, const char *pOutFile, PDFFile *pPDFFile)
Definition: pdfunzip.cxx:245
static int handleFile(const char *pInFile, const char *pOutFile, const char *pPassword, PDFFileHdl pHdl)
Definition: pdfunzip.cxx:224
static void printHelp(const char *pExe)
Definition: pdfunzip.cxx:37
SAL_IMPLEMENT_MAIN_WITH_ARGS(argc, argv)
Definition: pdfunzip.cxx:427
static int write_addStreamArray(const char *pOutFile, PDFArray *pStreams, PDFFile *pPDFFile, const char *pInFile)
Definition: pdfunzip.cxx:253
const wchar_t *typedef int(__stdcall *DllNativeUnregProc)(int
PDFObject * findObject(unsigned int nNumber, unsigned int nGeneration) const
Definition: pdfentries.cxx:475
std::vector< std::unique_ptr< PDFEntry > > m_aSubElements
Definition: pdfparse.hxx:162
virtual bool emit(EmitContext &rWriteContext) const override
bool setupDecryptionData(const OString &rPwd) const
bool isEncrypted() const
unsigned int m_nNumber
Definition: pdfparse.hxx:140
unsigned int m_nGeneration
Definition: pdfparse.hxx:141
PDFEntry * m_pObject
Definition: pdfparse.hxx:261
static std::unique_ptr< PDFEntry > read(const char *pFileName)
Definition: pdfparse.cxx:609