LibreOffice Module sdext (master)  1
pdfunzip.cxx
Go to the documentation of this file.
1 /* -*- Mode: C++; tab-width: 4; indent-tabs-mode: nil; c-basic-offset: 4 -*- */
2 /*
3  * This file is part of the LibreOffice project.
4  *
5  * This Source Code Form is subject to the terms of the Mozilla Public
6  * License, v. 2.0. If a copy of the MPL was not distributed with this
7  * file, You can obtain one at http://mozilla.org/MPL/2.0/.
8  *
9  * This file incorporates work covered by the following license notice:
10  *
11  * Licensed to the Apache Software Foundation (ASF) under one or more
12  * contributor license agreements. See the NOTICE file distributed
13  * with this work for additional information regarding copyright
14  * ownership. The ASF licenses this file to you under the Apache
15  * License, Version 2.0 (the "License"); you may not use this file
16  * except in compliance with the License. You may obtain a copy of
17  * the License at http://www.apache.org/licenses/LICENSE-2.0 .
18  */
19 
20 
21 #include <stdio.h>
22 #include <sal/main.h>
23 #include <osl/file.h>
24 #include <osl/thread.h>
25 #include <rtl/alloc.h>
26 #include <rtl/ustring.hxx>
27 #include <rtl/strbuf.hxx>
28 
29 #include <pdfparse.hxx>
30 
31 using namespace pdfparse;
32 
33 
/** Print the command line usage summary to stdout.

    @param pExe
    name of the executable as it should appear in the usage lines;
    it is substituted once for each of the five "USAGE" forms
*/
static void printHelp( const char* pExe )
{
    fprintf( stdout,
        "USAGE: %s [-h,--help]\n"
        " %s [-pw, --password <password>] <inputfile> [<outputfile>]\n"
        " %s <-a, --extract-add-streams> [-pw, --password <password>] <inputfile> [<outputfile>]\n"
        " %s <-f, --extract-fonts> [-pw, --password <password>] <inputfile> [<outputfile>]\n"
        // the second generation placeholder is written <g1> to match <g0>
        " %s <-o, --extract-objects> <o0>[:<g0>][,<o1>[:<g1>][,...]] [-pw, --password <password>] <inputfile> [<outputfile>]\n"
        " -h, --help: show help\n"
        " -a, --extract-add-streams: extracts additional streams to outputfile_object\n"
        " and prints the mimetype found to stdout\n"
        // closing parenthesis was missing in the original text
        " -f, --extract-fonts: extracts fonts (currently only type1 and truetype are supported)\n"
        " -o, --extract-objects: extracts object streams, the syntax of the argument is comma separated\n"
        " object numbers, where object number and generation number are separated by \':\'\n"
        " an omitted generation number defaults to 0\n"
        " -pw, --password: use password for decryption\n"
        "\n"
        "note: -f, -a, -o and normal unzip operation are mutually exclusive\n"
        , pExe, pExe, pExe, pExe, pExe );
}
54 
55 namespace {
56 
57 class FileEmitContext : public EmitContext
58 {
59  oslFileHandle m_aHandle;
60  oslFileHandle m_aReadHandle;
61  unsigned int m_nReadLen;
62 
63  void openReadFile( const char* pOrigName );
64 
65  public:
66  FileEmitContext( const char* pFileName, const char* pOrigName, const PDFContainer* pTop );
67  virtual ~FileEmitContext() override;
68 
69  virtual bool write( const void* pBuf, unsigned int nLen ) throw() override;
70  virtual unsigned int getCurPos() throw() override;
71  virtual bool copyOrigBytes( unsigned int nOrigOffset, unsigned int nLen ) throw() override;
72  virtual unsigned int readOrigBytes( unsigned int nOrigOffset, unsigned int nLen, void* pBuf ) throw() override;
73 };
74 
75 }
76 
77 FileEmitContext::FileEmitContext( const char* pFileName, const char* pOrigName, const PDFContainer* pTop )
78  : EmitContext( pTop ),
79  m_aHandle( nullptr ),
80  m_aReadHandle( nullptr ),
81  m_nReadLen( 0 )
82 {
83  OUString aSysFile( OStringToOUString( OString( pFileName ), osl_getThreadTextEncoding() ) );
84  OUString aURL;
85  if( osl_getFileURLFromSystemPath( aSysFile.pData, &aURL.pData ) != osl_File_E_None )
86  {
87  fprintf( stderr, "filename conversion \"%s\" failed\n", pFileName );
88  return;
89  }
90 
91  if( osl_openFile( aURL.pData, &m_aHandle, osl_File_OpenFlag_Write ) == osl_File_E_None )
92  {
93  if( osl_setFileSize( m_aHandle, 0 ) != osl_File_E_None )
94  {
95  fprintf( stderr, "could not truncate %s\n", pFileName );
96  osl_closeFile( m_aHandle );
97  m_aHandle = nullptr;
98  }
99  }
100  else if( osl_openFile( aURL.pData, &m_aHandle,
101  osl_File_OpenFlag_Write |osl_File_OpenFlag_Create ) != osl_File_E_None )
102  {
103  fprintf( stderr, "could not open %s\n", pFileName );
104  return;
105  }
106  m_bDeflate = true;
107 
108  openReadFile( pOrigName );
109 }
110 
111 FileEmitContext::~FileEmitContext()
112 {
113  if( m_aHandle )
114  osl_closeFile( m_aHandle );
115  if( m_aReadHandle )
116  osl_closeFile( m_aReadHandle );
117 }
118 
119 void FileEmitContext::openReadFile( const char* pInFile )
120 {
121  OUString aSysFile( OStringToOUString( OString( pInFile ), osl_getThreadTextEncoding() ) );
122  OUString aURL;
123  if( osl_getFileURLFromSystemPath( aSysFile.pData, &aURL.pData ) != osl_File_E_None )
124  {
125  fprintf( stderr, "filename conversion \"%s\" failed\n", pInFile );
126  return;
127  }
128 
129  if( osl_openFile( aURL.pData, &m_aReadHandle, osl_File_OpenFlag_Read ) != osl_File_E_None )
130  {
131  fprintf( stderr, "could not open %s\n", pInFile );
132  return;
133  }
134 
135  if( osl_setFilePos( m_aReadHandle, osl_Pos_End, 0 ) != osl_File_E_None )
136  {
137  fprintf( stderr, "could not seek to end of %s\n", pInFile );
138  osl_closeFile( m_aReadHandle );
139  return;
140  }
141 
142  sal_uInt64 nFileSize = 0;
143  if( osl_getFilePos( m_aReadHandle, &nFileSize ) != osl_File_E_None )
144  {
145  fprintf( stderr, "could not get end pos of %s\n", pInFile );
146  osl_closeFile( m_aReadHandle );
147  return;
148  }
149 
150  m_nReadLen = static_cast<unsigned int>(nFileSize);
151 }
152 
153 bool FileEmitContext::write( const void* pBuf, unsigned int nLen ) throw()
154 {
155  if( ! m_aHandle )
156  return false;
157 
158  sal_uInt64 nWrite = static_cast<sal_uInt64>(nLen);
159  sal_uInt64 nWritten = 0;
160  return (osl_writeFile( m_aHandle, pBuf, nWrite, &nWritten ) == osl_File_E_None)
161  && nWrite == nWritten;
162 }
163 
164 unsigned int FileEmitContext::getCurPos() throw()
165 {
166  sal_uInt64 nFileSize = 0;
167  if( m_aHandle )
168  {
169  if( osl_getFilePos( m_aHandle, &nFileSize ) != osl_File_E_None )
170  nFileSize = 0;
171  }
172  return static_cast<unsigned int>(nFileSize);
173 }
174 
175 bool FileEmitContext::copyOrigBytes( unsigned int nOrigOffset, unsigned int nLen ) throw()
176 {
177  if( nOrigOffset + nLen > m_nReadLen )
178  return false;
179 
180  if( osl_setFilePos( m_aReadHandle, osl_Pos_Absolut, nOrigOffset ) != osl_File_E_None )
181  {
182  fprintf( stderr, "could not seek to offset %u\n", nOrigOffset );
183  return false;
184  }
185  void* pBuf = std::malloc( nLen );
186  if( ! pBuf )
187  return false;
188  sal_uInt64 nBytesRead = 0;
189  if( osl_readFile( m_aReadHandle, pBuf, nLen, &nBytesRead ) != osl_File_E_None
190  || nBytesRead != static_cast<sal_uInt64>(nLen) )
191  {
192  fprintf( stderr, "could not read %u bytes\n", nLen );
193  std::free( pBuf );
194  return false;
195  }
196  bool bRet = write( pBuf, nLen );
197  std::free( pBuf );
198  return bRet;
199 }
200 
201 unsigned int FileEmitContext::readOrigBytes( unsigned int nOrigOffset, unsigned int nLen, void* pBuf ) throw()
202 {
203  if( nOrigOffset + nLen > m_nReadLen )
204  return 0;
205 
206  if( osl_setFilePos( m_aReadHandle, osl_Pos_Absolut, nOrigOffset ) != osl_File_E_None )
207  {
208  fprintf( stderr, "could not seek to offset %u\n", nOrigOffset );
209  return 0;
210  }
211  sal_uInt64 nBytesRead = 0;
212  if( osl_readFile( m_aReadHandle, pBuf, nLen, &nBytesRead ) != osl_File_E_None )
213  return 0;
214  return static_cast<unsigned int>(nBytesRead);
215 }
216 
217 typedef int(*PDFFileHdl)(const char*, const char*, PDFFile*);
218 
219 static int handleFile( const char* pInFile, const char* pOutFile, const char* pPassword, PDFFileHdl pHdl )
220 {
221  int nRet = 0;
222  std::unique_ptr<PDFEntry> pEntry = pdfparse::PDFReader::read( pInFile );
223  if( pEntry )
224  {
225  PDFFile* pPDFFile = dynamic_cast<PDFFile*>(pEntry.get());
226  if( pPDFFile )
227  {
228  fprintf( stdout, "have a %s PDF file\n", pPDFFile->isEncrypted() ? "encrypted" : "unencrypted" );
229  if( pPassword )
230  fprintf( stdout, "password %s\n",
231  pPDFFile->setupDecryptionData( pPassword ) ? "matches" : "does not match" );
232  nRet = pHdl( pInFile, pOutFile, pPDFFile );
233  }
234  else
235  nRet = 20;
236  }
237  return nRet;
238 }
239 
240 static int write_unzipFile( const char* pInFile, const char* pOutFile, PDFFile* pPDFFile )
241 {
242  FileEmitContext aContext( pOutFile, pInFile, pPDFFile );
243  aContext.m_bDecrypt = pPDFFile->isEncrypted();
244  pPDFFile->emit(aContext);
245  return 0;
246 }
247 
248 static int write_addStreamArray( const char* pOutFile, PDFArray* pStreams, PDFFile* pPDFFile, const char* pInFile )
249 {
250  int nRet = 0;
251  unsigned int nArrayElements = pStreams->m_aSubElements.size();
252  for( unsigned int i = 0; i < nArrayElements-1 && nRet == 0; i++ )
253  {
254  PDFName* pMimeType = dynamic_cast<PDFName*>(pStreams->m_aSubElements[i].get());
255  PDFObjectRef* pStreamRef = dynamic_cast<PDFObjectRef*>(pStreams->m_aSubElements[i+1].get());
256  if( ! pMimeType )
257  fprintf( stderr, "error: no mimetype element\n" );
258  if( ! pStreamRef )
259  fprintf( stderr, "error: no stream ref element\n" );
260  if( pMimeType && pStreamRef )
261  {
262  fprintf( stdout, "found stream %d %d with mimetype %s\n",
263  pStreamRef->m_nNumber, pStreamRef->m_nGeneration,
264  pMimeType->m_aName.getStr() );
265  PDFObject* pObject = pPDFFile->findObject( pStreamRef->m_nNumber, pStreamRef->m_nGeneration );
266  if( pObject )
267  {
268  OString aOutStream = pOutFile +
269  OStringLiteral("_stream_") +
270  OString::number( sal_Int32(pStreamRef->m_nNumber) ) +
271  "_" +
272  OString::number( sal_Int32(pStreamRef->m_nGeneration) );
273  FileEmitContext aContext( aOutStream.getStr(), pInFile, pPDFFile );
274  aContext.m_bDecrypt = pPDFFile->isEncrypted();
275  pObject->writeStream( aContext, pPDFFile );
276  }
277  else
278  {
279  fprintf( stderr, "object not found\n" );
280  nRet = 121;
281  }
282  }
283  else
284  nRet = 120;
285  }
286  return nRet;
287 }
288 
289 static int write_addStreams( const char* pInFile, const char* pOutFile, PDFFile* pPDFFile )
290 {
291  // find all trailers
292  int nRet = 0;
293  unsigned int nElements = pPDFFile->m_aSubElements.size();
294  for( unsigned i = 0; i < nElements && nRet == 0; i++ )
295  {
296  PDFTrailer* pTrailer = dynamic_cast<PDFTrailer*>(pPDFFile->m_aSubElements[i].get());
297  if( pTrailer && pTrailer->m_pDict )
298  {
299  // search for AdditionalStreams entry
300  auto add_stream = pTrailer->m_pDict->m_aMap.find( "AdditionalStreams" );
301  if( add_stream != pTrailer->m_pDict->m_aMap.end() )
302  {
303  PDFArray* pStreams = dynamic_cast<PDFArray*>(add_stream->second);
304  if( pStreams )
305  nRet = write_addStreamArray( pOutFile, pStreams, pPDFFile, pInFile );
306  }
307  }
308  }
309  return nRet;
310 }
311 
312 static int write_fonts( const char* i_pInFile, const char* i_pOutFile, PDFFile* i_pPDFFile )
313 {
314  unsigned int nElements = i_pPDFFile->m_aSubElements.size();
315  for (unsigned i = 0; i < nElements; i++)
316  {
317  // search FontDescriptors
318  PDFObject* pObj = dynamic_cast<PDFObject*>(i_pPDFFile->m_aSubElements[i].get());
319  if( ! pObj )
320  continue;
321  PDFDict* pDict = dynamic_cast<PDFDict*>(pObj->m_pObject);
322  if( ! pDict )
323  continue;
324 
325  std::unordered_map<OString,PDFEntry*>::iterator map_it =
326  pDict->m_aMap.find( "Type" );
327  if( map_it == pDict->m_aMap.end() )
328  continue;
329 
330  PDFName* pName = dynamic_cast<PDFName*>(map_it->second);
331  if( ! pName )
332  continue;
333  if( pName->m_aName != "FontDescriptor" )
334  continue;
335 
336  // the font name will be helpful, also there must be one in
337  // a font descriptor
338  map_it = pDict->m_aMap.find( "FontName" );
339  if( map_it == pDict->m_aMap.end() )
340  continue;
341  pName = dynamic_cast<PDFName*>(map_it->second);
342  if( ! pName )
343  continue;
344  OString aFontName( pName->m_aName );
345 
346  PDFObjectRef* pStreamRef = nullptr;
347  const char* pFileType = nullptr;
348  // we have a font descriptor, try for a type 1 font
349  map_it = pDict->m_aMap.find( "FontFile" );
350  if( map_it != pDict->m_aMap.end() )
351  {
352  pStreamRef = dynamic_cast<PDFObjectRef*>(map_it->second);
353  if( pStreamRef )
354  pFileType = "pfa";
355  }
356 
357  // perhaps it's a truetype file ?
358  if( ! pStreamRef )
359  {
360  map_it = pDict->m_aMap.find( "FontFile2" );
361  if( map_it != pDict->m_aMap.end() )
362  {
363  pStreamRef = dynamic_cast<PDFObjectRef*>(map_it->second);
364  if( pStreamRef )
365  pFileType = "ttf";
366  }
367  }
368 
369  if( ! pStreamRef )
370  continue;
371 
372  PDFObject* pStream = i_pPDFFile->findObject( pStreamRef );
373  if( ! pStream )
374  continue;
375 
376  OStringBuffer aOutStream( i_pOutFile );
377  aOutStream.append( "_font_" );
378  aOutStream.append( sal_Int32(pStreamRef->m_nNumber) );
379  aOutStream.append( "_" );
380  aOutStream.append( sal_Int32(pStreamRef->m_nGeneration) );
381  aOutStream.append( "_" );
382  aOutStream.append( aFontName );
383  if( pFileType )
384  {
385  aOutStream.append( "." );
386  aOutStream.append( pFileType );
387  }
388  FileEmitContext aContext( aOutStream.getStr(), i_pInFile, i_pPDFFile );
389  aContext.m_bDecrypt = i_pPDFFile->isEncrypted();
390  pStream->writeStream( aContext, i_pPDFFile );
391  }
392  return 0;
393 }
394 
395 static std::vector< std::pair< sal_Int32, sal_Int32 > > s_aEmitObjects;
396 
397 static int write_objects( const char* i_pInFile, const char* i_pOutFile, PDFFile* i_pPDFFile )
398 {
399  unsigned int nElements = s_aEmitObjects.size();
400  for (unsigned i = 0; i < nElements; i++)
401  {
402  sal_Int32 nObject = s_aEmitObjects[i].first;
403  sal_Int32 nGeneration = s_aEmitObjects[i].second;
404  PDFObject* pStream = i_pPDFFile->findObject( nObject, nGeneration );
405  if( ! pStream )
406  {
407  fprintf( stderr, "object %d %d not found !\n", static_cast<int>(nObject), static_cast<int>(nGeneration) );
408  continue;
409  }
410 
411  OString aOutStream = i_pOutFile +
412  OStringLiteral("_stream_") +
413  OString::number( nObject ) +
414  "_" +
415  OString::number( nGeneration );
416  FileEmitContext aContext( aOutStream.getStr(), i_pInFile, i_pPDFFile );
417  aContext.m_bDecrypt = i_pPDFFile->isEncrypted();
418  pStream->writeStream( aContext, i_pPDFFile );
419  }
420  return 0;
421 }
422 
424 {
425  const char* pInFile = nullptr;
426  const char* pOutFile = nullptr;
427  const char* pPassword = nullptr;
428  OStringBuffer aOutFile( 256 );
430 
431  for( int nArg = 1; nArg < argc; nArg++ )
432  {
433  if( argv[nArg][0] == '-' )
434  {
435  if( ! rtl_str_compare( "-pw", argv[nArg] ) ||
436  ! rtl_str_compare( "--password" , argv[nArg] ) )
437  {
438  if( nArg == argc-1 )
439  {
440  fprintf( stderr, "no password given\n" );
441  return 1;
442  }
443  nArg++;
444  pPassword = argv[nArg];
445  }
446  else if( ! rtl_str_compare( "-h", argv[nArg] ) ||
447  ! rtl_str_compare( "--help", argv[nArg] ) )
448  {
449  printHelp( argv[0] );
450  return 0;
451  }
452  else if( ! rtl_str_compare( "-a", argv[nArg] ) ||
453  ! rtl_str_compare( "--extract-add-streams", argv[nArg] ) )
454  {
455  aHdl = write_addStreams;
456  }
457  else if( ! rtl_str_compare( "-f", argv[nArg] ) ||
458  ! rtl_str_compare( "--extract-fonts", argv[nArg] ) )
459  {
460  aHdl = write_fonts;
461  }
462  else if( ! rtl_str_compare( "-o", argv[nArg] ) ||
463  ! rtl_str_compare( "--extract-objects", argv[nArg] ) )
464  {
465  aHdl = write_objects;
466  nArg++;
467  if( nArg < argc )
468  {
469  OString aObjs( argv[nArg] );
470  sal_Int32 nIndex = 0;
471  while( nIndex != -1 )
472  {
473  OString aToken( aObjs.getToken( 0, ',', nIndex ) );
474  sal_Int32 nObject = 0;
475  sal_Int32 nGeneration = 0;
476  sal_Int32 nGenIndex = 0;
477  nObject = aToken.getToken( 0, ':', nGenIndex ).toInt32();
478  if( nGenIndex != -1 )
479  nGeneration = aToken.getToken( 0, ':', nGenIndex ).toInt32();
480  s_aEmitObjects.push_back( std::pair<sal_Int32,sal_Int32>(nObject,nGeneration) );
481  }
482  }
483  }
484  else
485  {
486  fprintf( stderr, "unrecognized option \"%s\"\n",
487  argv[nArg] );
488  printHelp( argv[0] );
489  return 1;
490  }
491  }
492  else if( pInFile == nullptr )
493  pInFile = argv[nArg];
494  else if( pOutFile == nullptr )
495  pOutFile = argv[nArg];
496  }
497  if( ! pInFile )
498  {
499  fprintf( stderr, "no input file given\n" );
500  return 10;
501  }
502  if( ! pOutFile )
503  {
504  OString aFile( pInFile );
505  if( aFile.getLength() > 0 )
506  {
507  if( aFile.getLength() > 4 )
508  {
509  if( aFile.matchIgnoreAsciiCase( ".pdf", aFile.getLength()-4 ) )
510  aOutFile.append( pInFile, aFile.getLength() - 4 );
511  else
512  aOutFile.append( aFile );
513  }
514  aOutFile.append( "_unzip.pdf" );
515  pOutFile = aOutFile.getStr();
516  }
517  else
518  {
519  fprintf( stderr, "no output file given\n" );
520  return 11;
521  }
522  }
523 
524  return handleFile( pInFile, pOutFile, pPassword, aHdl );
525 }
526 
527 /* vim:set shiftwidth=4 softtabstop=4 expandtab: */
static int write_fonts(const char *i_pInFile, const char *i_pOutFile, PDFFile *i_pPDFFile)
Definition: pdfunzip.cxx:312
URL aURL
sal_Int32 nIndex
static int write_unzipFile(const char *pInFile, const char *pOutFile, PDFFile *pPDFFile)
Definition: pdfunzip.cxx:240
static std::unique_ptr< PDFEntry > read(const char *pFileName)
Definition: pdfparse.cxx:602
const wchar_t *typedef int(__stdcall *DllNativeUnregProc)(int
PDFObject * findObject(unsigned int nNumber, unsigned int nGeneration) const
Definition: pdfentries.cxx:475
EmbeddedObjectRef * pObject
unsigned int const m_nGeneration
Definition: pdfparse.hxx:139
std::vector< std::unique_ptr< PDFEntry > > m_aSubElements
Definition: pdfparse.hxx:160
sal_Int32 nElements
sal_uInt16 char * pName
static void printHelp(const char *pExe)
Definition: pdfunzip.cxx:34
oslFileHandle m_aReadHandle
Definition: filterdet.cxx:54
int i
unsigned int m_nReadLen
Definition: filterdet.cxx:55
int(* PDFFileHdl)(const char *, const char *, PDFFile *)
Definition: pdfunzip.cxx:217
unsigned int const m_nNumber
Definition: pdfparse.hxx:138
static std::vector< std::pair< sal_Int32, sal_Int32 > > s_aEmitObjects
Definition: pdfunzip.cxx:395
bool isEncrypted() const
virtual bool emit(EmitContext &rWriteContext) const override
SAL_IMPLEMENT_MAIN_WITH_ARGS(argc, argv)
Definition: pdfunzip.cxx:423
static int handleFile(const char *pInFile, const char *pOutFile, const char *pPassword, PDFFileHdl pHdl)
Definition: pdfunzip.cxx:219
PDFEntry * m_pObject
Definition: pdfparse.hxx:259
static int write_addStreamArray(const char *pOutFile, PDFArray *pStreams, PDFFile *pPDFFile, const char *pInFile)
Definition: pdfunzip.cxx:248
static int write_addStreams(const char *pInFile, const char *pOutFile, PDFFile *pPDFFile)
Definition: pdfunzip.cxx:289
bool setupDecryptionData(const OString &rPwd) const
static osl::File * pStream
Definition: emitcontext.cxx:32
static int write_objects(const char *i_pInFile, const char *i_pOutFile, PDFFile *i_pPDFFile)
Definition: pdfunzip.cxx:397
void writeStream(EmitContext &rContext, const PDFFile *pPDFFile) const
Definition: pdfentries.cxx:783