LibreOffice Module sdext (master)  1
pdfunzip.cxx
Go to the documentation of this file.
1 /* -*- Mode: C++; tab-width: 4; indent-tabs-mode: nil; c-basic-offset: 4 -*- */
2 /*
3  * This file is part of the LibreOffice project.
4  *
5  * This Source Code Form is subject to the terms of the Mozilla Public
6  * License, v. 2.0. If a copy of the MPL was not distributed with this
7  * file, You can obtain one at http://mozilla.org/MPL/2.0/.
8  *
9  * This file incorporates work covered by the following license notice:
10  *
11  * Licensed to the Apache Software Foundation (ASF) under one or more
12  * contributor license agreements. See the NOTICE file distributed
13  * with this work for additional information regarding copyright
14  * ownership. The ASF licenses this file to you under the Apache
15  * License, Version 2.0 (the "License"); you may not use this file
16  * except in compliance with the License. You may obtain a copy of
17  * the License at http://www.apache.org/licenses/LICENSE-2.0 .
18  */
19 
20 
21 #include <stdio.h>
22 #include <string_view>
23 
24 #include <sal/main.h>
25 #include <osl/file.h>
26 #include <osl/thread.h>
27 #include <rtl/alloc.h>
28 #include <rtl/ustring.hxx>
29 #include <rtl/strbuf.hxx>
30 
31 #include <pdfparse.hxx>
32 
33 using namespace pdfparse;
34 
35 
/** Print the command line synopsis and option descriptions to stdout.

    @param pExe
    name of the executable; it is substituted into each of the five usage lines
*/
static void printHelp( const char* pExe )
{
    fprintf( stdout,
        "USAGE: %s [-h,--help]\n"
        " %s [-pw, --password <password>] <inputfile> [<outputfile>]\n"
        " %s <-a, --extract-add-streams> [-pw, --password <password>] <inputfile> [<outputfile>]\n"
        " %s <-f, --extract-fonts> [-pw, --password <password>] <inputfile> [<outputfile>]\n"
        " %s <-o, --extract-objects> <o0>[:<g0>][,<o1>[:g1][,...]] [-pw, --password <password>] <inputfile> [<outputfile>]\n"
        " -h, --help: show help\n"
        " -a, --extract-add-streams: extracts additional streams to outputfile_object\n"
        " and prints the mimetype found to stdout\n"
        // the closing parenthesis was missing in the original help text
        " -f, --extract-fonts: extracts fonts (currently only type1 and truetype are supported)\n"
        " -o, --extract-objects: extracts object streams, the syntax of the argument is comma separated\n"
        " object numbers, where object number and generation number are separated by \':\'\n"
        " an omitted generation number defaults to 0\n"
        " -pw, --password: use password for decryption\n"
        "\n"
        "note: -f, -a, -o and normal unzip operation are mutually exclusive\n"
        , pExe, pExe, pExe, pExe, pExe );
}
56 
57 namespace {
58 
59 class FileEmitContext : public EmitContext
60 {
61  oslFileHandle m_aHandle;
62  oslFileHandle m_aReadHandle;
63  unsigned int m_nReadLen;
64 
65  void openReadFile( const char* pOrigName );
66 
67  public:
68  FileEmitContext( const char* pFileName, const char* pOrigName, const PDFContainer* pTop );
69  virtual ~FileEmitContext() override;
70 
71  virtual bool write( const void* pBuf, unsigned int nLen ) noexcept override;
72  virtual unsigned int getCurPos() noexcept override;
73  virtual bool copyOrigBytes( unsigned int nOrigOffset, unsigned int nLen ) noexcept override;
74  virtual unsigned int readOrigBytes( unsigned int nOrigOffset, unsigned int nLen, void* pBuf ) noexcept override;
75 };
76 
77 }
78 
79 FileEmitContext::FileEmitContext( const char* pFileName, const char* pOrigName, const PDFContainer* pTop )
80  : EmitContext( pTop ),
81  m_aHandle( nullptr ),
82  m_aReadHandle( nullptr ),
83  m_nReadLen( 0 )
84 {
85  OUString aSysFile(
86  OStringToOUString( std::string_view( pFileName ), osl_getThreadTextEncoding() ) );
87  OUString aURL;
88  if( osl_getFileURLFromSystemPath( aSysFile.pData, &aURL.pData ) != osl_File_E_None )
89  {
90  fprintf( stderr, "filename conversion \"%s\" failed\n", pFileName );
91  return;
92  }
93 
94  if( osl_openFile( aURL.pData, &m_aHandle, osl_File_OpenFlag_Write ) == osl_File_E_None )
95  {
96  if( osl_setFileSize( m_aHandle, 0 ) != osl_File_E_None )
97  {
98  fprintf( stderr, "could not truncate %s\n", pFileName );
99  osl_closeFile( m_aHandle );
100  m_aHandle = nullptr;
101  }
102  }
103  else if( osl_openFile( aURL.pData, &m_aHandle,
104  osl_File_OpenFlag_Write |osl_File_OpenFlag_Create ) != osl_File_E_None )
105  {
106  fprintf( stderr, "could not open %s\n", pFileName );
107  return;
108  }
109  m_bDeflate = true;
110 
111  openReadFile( pOrigName );
112 }
113 
114 FileEmitContext::~FileEmitContext()
115 {
116  if( m_aHandle )
117  osl_closeFile( m_aHandle );
118  if( m_aReadHandle )
119  osl_closeFile( m_aReadHandle );
120 }
121 
122 void FileEmitContext::openReadFile( const char* pInFile )
123 {
124  OUString aSysFile(
125  OStringToOUString( std::string_view( pInFile ), osl_getThreadTextEncoding() ) );
126  OUString aURL;
127  if( osl_getFileURLFromSystemPath( aSysFile.pData, &aURL.pData ) != osl_File_E_None )
128  {
129  fprintf( stderr, "filename conversion \"%s\" failed\n", pInFile );
130  return;
131  }
132 
133  if( osl_openFile( aURL.pData, &m_aReadHandle, osl_File_OpenFlag_Read ) != osl_File_E_None )
134  {
135  fprintf( stderr, "could not open %s\n", pInFile );
136  return;
137  }
138 
139  if( osl_setFilePos( m_aReadHandle, osl_Pos_End, 0 ) != osl_File_E_None )
140  {
141  fprintf( stderr, "could not seek to end of %s\n", pInFile );
142  osl_closeFile( m_aReadHandle );
143  return;
144  }
145 
146  sal_uInt64 nFileSize = 0;
147  if( osl_getFilePos( m_aReadHandle, &nFileSize ) != osl_File_E_None )
148  {
149  fprintf( stderr, "could not get end pos of %s\n", pInFile );
150  osl_closeFile( m_aReadHandle );
151  return;
152  }
153 
154  m_nReadLen = static_cast<unsigned int>(nFileSize);
155 }
156 
157 bool FileEmitContext::write( const void* pBuf, unsigned int nLen ) noexcept
158 {
159  if( ! m_aHandle )
160  return false;
161 
162  sal_uInt64 nWrite = static_cast<sal_uInt64>(nLen);
163  sal_uInt64 nWritten = 0;
164  return (osl_writeFile( m_aHandle, pBuf, nWrite, &nWritten ) == osl_File_E_None)
165  && nWrite == nWritten;
166 }
167 
168 unsigned int FileEmitContext::getCurPos() noexcept
169 {
170  sal_uInt64 nFileSize = 0;
171  if( m_aHandle )
172  {
173  if( osl_getFilePos( m_aHandle, &nFileSize ) != osl_File_E_None )
174  nFileSize = 0;
175  }
176  return static_cast<unsigned int>(nFileSize);
177 }
178 
179 bool FileEmitContext::copyOrigBytes( unsigned int nOrigOffset, unsigned int nLen ) noexcept
180 {
181  if( nOrigOffset + nLen > m_nReadLen )
182  return false;
183 
184  if( osl_setFilePos( m_aReadHandle, osl_Pos_Absolut, nOrigOffset ) != osl_File_E_None )
185  {
186  fprintf( stderr, "could not seek to offset %u\n", nOrigOffset );
187  return false;
188  }
189  void* pBuf = std::malloc( nLen );
190  if( ! pBuf )
191  return false;
192  sal_uInt64 nBytesRead = 0;
193  if( osl_readFile( m_aReadHandle, pBuf, nLen, &nBytesRead ) != osl_File_E_None
194  || nBytesRead != static_cast<sal_uInt64>(nLen) )
195  {
196  fprintf( stderr, "could not read %u bytes\n", nLen );
197  std::free( pBuf );
198  return false;
199  }
200  bool bRet = write( pBuf, nLen );
201  std::free( pBuf );
202  return bRet;
203 }
204 
205 unsigned int FileEmitContext::readOrigBytes( unsigned int nOrigOffset, unsigned int nLen, void* pBuf ) noexcept
206 {
207  if( nOrigOffset + nLen > m_nReadLen )
208  return 0;
209 
210  if( osl_setFilePos( m_aReadHandle, osl_Pos_Absolut, nOrigOffset ) != osl_File_E_None )
211  {
212  fprintf( stderr, "could not seek to offset %u\n", nOrigOffset );
213  return 0;
214  }
215  sal_uInt64 nBytesRead = 0;
216  if( osl_readFile( m_aReadHandle, pBuf, nLen, &nBytesRead ) != osl_File_E_None )
217  return 0;
218  return static_cast<unsigned int>(nBytesRead);
219 }
220 
221 typedef int(*PDFFileHdl)(const char*, const char*, PDFFile*);
222 
223 static int handleFile( const char* pInFile, const char* pOutFile, const char* pPassword, PDFFileHdl pHdl )
224 {
225  int nRet = 0;
226  std::unique_ptr<PDFEntry> pEntry = pdfparse::PDFReader::read( pInFile );
227  if( pEntry )
228  {
229  PDFFile* pPDFFile = dynamic_cast<PDFFile*>(pEntry.get());
230  if( pPDFFile )
231  {
232  fprintf( stdout, "have a %s PDF file\n", pPDFFile->isEncrypted() ? "encrypted" : "unencrypted" );
233  if( pPassword )
234  fprintf( stdout, "password %s\n",
235  pPDFFile->setupDecryptionData( pPassword ) ? "matches" : "does not match" );
236  nRet = pHdl( pInFile, pOutFile, pPDFFile );
237  }
238  else
239  nRet = 20;
240  }
241  return nRet;
242 }
243 
244 static int write_unzipFile( const char* pInFile, const char* pOutFile, PDFFile* pPDFFile )
245 {
246  FileEmitContext aContext( pOutFile, pInFile, pPDFFile );
247  aContext.m_bDecrypt = pPDFFile->isEncrypted();
248  pPDFFile->emit(aContext);
249  return 0;
250 }
251 
252 static int write_addStreamArray( const char* pOutFile, PDFArray* pStreams, PDFFile* pPDFFile, const char* pInFile )
253 {
254  int nRet = 0;
255  unsigned int nArrayElements = pStreams->m_aSubElements.size();
256  for( unsigned int i = 0; i < nArrayElements-1 && nRet == 0; i++ )
257  {
258  PDFName* pMimeType = dynamic_cast<PDFName*>(pStreams->m_aSubElements[i].get());
259  PDFObjectRef* pStreamRef = dynamic_cast<PDFObjectRef*>(pStreams->m_aSubElements[i+1].get());
260  if( ! pMimeType )
261  fprintf( stderr, "error: no mimetype element\n" );
262  if( ! pStreamRef )
263  fprintf( stderr, "error: no stream ref element\n" );
264  if( pMimeType && pStreamRef )
265  {
266  fprintf( stdout, "found stream %d %d with mimetype %s\n",
267  pStreamRef->m_nNumber, pStreamRef->m_nGeneration,
268  pMimeType->m_aName.getStr() );
269  PDFObject* pObject = pPDFFile->findObject( pStreamRef->m_nNumber, pStreamRef->m_nGeneration );
270  if( pObject )
271  {
272  OString aOutStream = pOutFile +
273  OString::Concat("_stream_") +
274  OString::number( sal_Int32(pStreamRef->m_nNumber) ) +
275  "_" +
276  OString::number( sal_Int32(pStreamRef->m_nGeneration) );
277  FileEmitContext aContext( aOutStream.getStr(), pInFile, pPDFFile );
278  aContext.m_bDecrypt = pPDFFile->isEncrypted();
279  pObject->writeStream( aContext, pPDFFile );
280  }
281  else
282  {
283  fprintf( stderr, "object not found\n" );
284  nRet = 121;
285  }
286  }
287  else
288  nRet = 120;
289  }
290  return nRet;
291 }
292 
293 static int write_addStreams( const char* pInFile, const char* pOutFile, PDFFile* pPDFFile )
294 {
295  // find all trailers
296  int nRet = 0;
297  unsigned int nElements = pPDFFile->m_aSubElements.size();
298  for( unsigned i = 0; i < nElements && nRet == 0; i++ )
299  {
300  PDFTrailer* pTrailer = dynamic_cast<PDFTrailer*>(pPDFFile->m_aSubElements[i].get());
301  if( pTrailer && pTrailer->m_pDict )
302  {
303  // search for AdditionalStreams entry
304  auto add_stream = pTrailer->m_pDict->m_aMap.find( "AdditionalStreams" );
305  if( add_stream != pTrailer->m_pDict->m_aMap.end() )
306  {
307  PDFArray* pStreams = dynamic_cast<PDFArray*>(add_stream->second);
308  if( pStreams )
309  nRet = write_addStreamArray( pOutFile, pStreams, pPDFFile, pInFile );
310  }
311  }
312  }
313  return nRet;
314 }
315 
316 static int write_fonts( const char* i_pInFile, const char* i_pOutFile, PDFFile* i_pPDFFile )
317 {
318  unsigned int nElements = i_pPDFFile->m_aSubElements.size();
319  for (unsigned i = 0; i < nElements; i++)
320  {
321  // search FontDescriptors
322  PDFObject* pObj = dynamic_cast<PDFObject*>(i_pPDFFile->m_aSubElements[i].get());
323  if( ! pObj )
324  continue;
325  PDFDict* pDict = dynamic_cast<PDFDict*>(pObj->m_pObject);
326  if( ! pDict )
327  continue;
328 
329  std::unordered_map<OString,PDFEntry*>::iterator map_it =
330  pDict->m_aMap.find( "Type" );
331  if( map_it == pDict->m_aMap.end() )
332  continue;
333 
334  PDFName* pName = dynamic_cast<PDFName*>(map_it->second);
335  if( ! pName )
336  continue;
337  if( pName->m_aName != "FontDescriptor" )
338  continue;
339 
340  // the font name will be helpful, also there must be one in
341  // a font descriptor
342  map_it = pDict->m_aMap.find( "FontName" );
343  if( map_it == pDict->m_aMap.end() )
344  continue;
345  pName = dynamic_cast<PDFName*>(map_it->second);
346  if( ! pName )
347  continue;
348  OString aFontName( pName->m_aName );
349 
350  PDFObjectRef* pStreamRef = nullptr;
351  const char* pFileType = nullptr;
352  // we have a font descriptor, try for a type 1 font
353  map_it = pDict->m_aMap.find( "FontFile" );
354  if( map_it != pDict->m_aMap.end() )
355  {
356  pStreamRef = dynamic_cast<PDFObjectRef*>(map_it->second);
357  if( pStreamRef )
358  pFileType = "pfa";
359  }
360 
361  // perhaps it's a truetype file ?
362  if( ! pStreamRef )
363  {
364  map_it = pDict->m_aMap.find( "FontFile2" );
365  if( map_it != pDict->m_aMap.end() )
366  {
367  pStreamRef = dynamic_cast<PDFObjectRef*>(map_it->second);
368  if( pStreamRef )
369  pFileType = "ttf";
370  }
371  }
372 
373  if( ! pStreamRef )
374  continue;
375 
376  PDFObject* pStream = i_pPDFFile->findObject( pStreamRef );
377  if( ! pStream )
378  continue;
379 
380  OStringBuffer aOutStream( i_pOutFile );
381  aOutStream.append( "_font_" );
382  aOutStream.append( sal_Int32(pStreamRef->m_nNumber) );
383  aOutStream.append( "_" );
384  aOutStream.append( sal_Int32(pStreamRef->m_nGeneration) );
385  aOutStream.append( "_" );
386  aOutStream.append( aFontName );
387  if( pFileType )
388  {
389  aOutStream.append( "." );
390  aOutStream.append( pFileType );
391  }
392  FileEmitContext aContext( aOutStream.getStr(), i_pInFile, i_pPDFFile );
393  aContext.m_bDecrypt = i_pPDFFile->isEncrypted();
394  pStream->writeStream( aContext, i_pPDFFile );
395  }
396  return 0;
397 }
398 
399 static std::vector< std::pair< sal_Int32, sal_Int32 > > s_aEmitObjects;
400 
401 static int write_objects( const char* i_pInFile, const char* i_pOutFile, PDFFile* i_pPDFFile )
402 {
403  unsigned int nElements = s_aEmitObjects.size();
404  for (unsigned i = 0; i < nElements; i++)
405  {
406  sal_Int32 nObject = s_aEmitObjects[i].first;
407  sal_Int32 nGeneration = s_aEmitObjects[i].second;
408  PDFObject* pStream = i_pPDFFile->findObject( nObject, nGeneration );
409  if( ! pStream )
410  {
411  fprintf( stderr, "object %d %d not found !\n", static_cast<int>(nObject), static_cast<int>(nGeneration) );
412  continue;
413  }
414 
415  OString aOutStream = i_pOutFile +
416  OString::Concat("_stream_") +
417  OString::number( nObject ) +
418  "_" +
419  OString::number( nGeneration );
420  FileEmitContext aContext( aOutStream.getStr(), i_pInFile, i_pPDFFile );
421  aContext.m_bDecrypt = i_pPDFFile->isEncrypted();
422  pStream->writeStream( aContext, i_pPDFFile );
423  }
424  return 0;
425 }
426 
428 {
429  const char* pInFile = nullptr;
430  const char* pOutFile = nullptr;
431  const char* pPassword = nullptr;
432  OStringBuffer aOutFile( 256 );
434 
435  for( int nArg = 1; nArg < argc; nArg++ )
436  {
437  if( argv[nArg][0] == '-' )
438  {
439  if( ! rtl_str_compare( "-pw", argv[nArg] ) ||
440  ! rtl_str_compare( "--password" , argv[nArg] ) )
441  {
442  if( nArg == argc-1 )
443  {
444  fprintf( stderr, "no password given\n" );
445  return 1;
446  }
447  nArg++;
448  pPassword = argv[nArg];
449  }
450  else if( ! rtl_str_compare( "-h", argv[nArg] ) ||
451  ! rtl_str_compare( "--help", argv[nArg] ) )
452  {
453  printHelp( argv[0] );
454  return 0;
455  }
456  else if( ! rtl_str_compare( "-a", argv[nArg] ) ||
457  ! rtl_str_compare( "--extract-add-streams", argv[nArg] ) )
458  {
459  aHdl = write_addStreams;
460  }
461  else if( ! rtl_str_compare( "-f", argv[nArg] ) ||
462  ! rtl_str_compare( "--extract-fonts", argv[nArg] ) )
463  {
464  aHdl = write_fonts;
465  }
466  else if( ! rtl_str_compare( "-o", argv[nArg] ) ||
467  ! rtl_str_compare( "--extract-objects", argv[nArg] ) )
468  {
469  aHdl = write_objects;
470  nArg++;
471  if( nArg < argc )
472  {
473  OString aObjs( argv[nArg] );
474  sal_Int32 nIndex = 0;
475  while( nIndex != -1 )
476  {
477  OString aToken( aObjs.getToken( 0, ',', nIndex ) );
478  sal_Int32 nObject = 0;
479  sal_Int32 nGeneration = 0;
480  sal_Int32 nGenIndex = 0;
481  nObject = aToken.getToken( 0, ':', nGenIndex ).toInt32();
482  if( nGenIndex != -1 )
483  nGeneration = aToken.getToken( 0, ':', nGenIndex ).toInt32();
484  s_aEmitObjects.push_back( std::pair<sal_Int32,sal_Int32>(nObject,nGeneration) );
485  }
486  }
487  }
488  else
489  {
490  fprintf( stderr, "unrecognized option \"%s\"\n",
491  argv[nArg] );
492  printHelp( argv[0] );
493  return 1;
494  }
495  }
496  else if( pInFile == nullptr )
497  pInFile = argv[nArg];
498  else if( pOutFile == nullptr )
499  pOutFile = argv[nArg];
500  }
501  if( ! pInFile )
502  {
503  fprintf( stderr, "no input file given\n" );
504  return 10;
505  }
506  if( ! pOutFile )
507  {
508  OString aFile( pInFile );
509  if( aFile.getLength() > 0 )
510  {
511  if( aFile.getLength() > 4 )
512  {
513  if( aFile.matchIgnoreAsciiCase( ".pdf", aFile.getLength()-4 ) )
514  aOutFile.append( pInFile, aFile.getLength() - 4 );
515  else
516  aOutFile.append( aFile );
517  }
518  aOutFile.append( "_unzip.pdf" );
519  pOutFile = aOutFile.getStr();
520  }
521  else
522  {
523  fprintf( stderr, "no output file given\n" );
524  return 11;
525  }
526  }
527 
528  return handleFile( pInFile, pOutFile, pPassword, aHdl );
529 }
530 
531 /* vim:set shiftwidth=4 softtabstop=4 expandtab: */
static int write_fonts(const char *i_pInFile, const char *i_pOutFile, PDFFile *i_pPDFFile)
Definition: pdfunzip.cxx:316
URL aURL
sal_Int32 nIndex
static int write_unzipFile(const char *pInFile, const char *pOutFile, PDFFile *pPDFFile)
Definition: pdfunzip.cxx:244
static std::unique_ptr< PDFEntry > read(const char *pFileName)
Definition: pdfparse.cxx:608
const wchar_t *typedef int(__stdcall *DllNativeUnregProc)(int
PDFObject * findObject(unsigned int nNumber, unsigned int nGeneration) const
Definition: pdfentries.cxx:475
unsigned int m_nNumber
Definition: pdfparse.hxx:139
unsigned int m_nGeneration
Definition: pdfparse.hxx:140
EmbeddedObjectRef * pObject
std::vector< std::unique_ptr< PDFEntry > > m_aSubElements
Definition: pdfparse.hxx:161
sal_Int32 nElements
sal_uInt16 char * pName
int i
static void printHelp(const char *pExe)
Definition: pdfunzip.cxx:36
oslFileHandle m_aReadHandle
Definition: filterdet.cxx:54
unsigned int m_nReadLen
Definition: filterdet.cxx:55
int(* PDFFileHdl)(const char *, const char *, PDFFile *)
Definition: pdfunzip.cxx:221
static std::vector< std::pair< sal_Int32, sal_Int32 > > s_aEmitObjects
Definition: pdfunzip.cxx:399
bool isEncrypted() const
virtual bool emit(EmitContext &rWriteContext) const override
SAL_IMPLEMENT_MAIN_WITH_ARGS(argc, argv)
Definition: pdfunzip.cxx:427
static int handleFile(const char *pInFile, const char *pOutFile, const char *pPassword, PDFFileHdl pHdl)
Definition: pdfunzip.cxx:223
PDFEntry * m_pObject
Definition: pdfparse.hxx:260
static int write_addStreamArray(const char *pOutFile, PDFArray *pStreams, PDFFile *pPDFFile, const char *pInFile)
Definition: pdfunzip.cxx:252
static int write_addStreams(const char *pInFile, const char *pOutFile, PDFFile *pPDFFile)
Definition: pdfunzip.cxx:293
bool setupDecryptionData(const OString &rPwd) const
static osl::File * pStream
Definition: emitcontext.cxx:32
static int write_objects(const char *i_pInFile, const char *i_pOutFile, PDFFile *i_pPDFFile)
Definition: pdfunzip.cxx:401
void writeStream(EmitContext &rContext, const PDFFile *pPDFFile) const
Definition: pdfentries.cxx:782