1 /* -*- Mode: C++; tab-width: 4; indent-tabs-mode: nil; c-basic-offset: 4 -*- */
2 /*
3  * This file is part of the LibreOffice project.
4  *
5  * This Source Code Form is subject to the terms of the Mozilla Public
6  * License, v. 2.0. If a copy of the MPL was not distributed with this
7  * file, You can obtain one at http://mozilla.org/MPL/2.0/.
8  *
9  * This file incorporates work covered by the following license notice:
10  *
11  *   Licensed to the Apache Software Foundation (ASF) under one or more
12  *   contributor license agreements. See the NOTICE file distributed
13  *   with this work for additional information regarding copyright
14  *   ownership. The ASF licenses this file to you under the Apache
15  *   License, Version 2.0 (the "License"); you may not use this file
16  *   except in compliance with the License. You may obtain a copy of
17  *   the License at http://www.apache.org/licenses/LICENSE-2.0 .
18  */
19 
20 
21 #include <stdio.h>
22 #include <string_view>
23 
24 #include <sal/main.h>
25 #include <osl/file.h>
26 #include <osl/thread.h>
27 #include <rtl/alloc.h>
28 #include <rtl/ustring.hxx>
29 #include <rtl/strbuf.hxx>
30 
31 #include <pdfparse.hxx>
32 
33 using namespace pdfparse;
34 
35 
/** Print the command line usage of the tool to stdout.

    @param pExe
    name of the executable as it should appear in the usage lines
 */
static void printHelp( const char* pExe )
{
    // pExe is passed once per "%s" in the five usage lines below
    fprintf( stdout,
    "USAGE: %s [-h,--help]\n"
    "       %s [-pw, --password <password>] <inputfile> [<outputfile>]\n"
    "       %s <-a, --extract-add-streams> [-pw, --password <password>] <inputfile> [<outputfile>]\n"
    "       %s <-f, --extract-fonts> [-pw, --password <password>] <inputfile> [<outputfile>]\n"
    "       %s <-o, --extract-objects> <o0>[:<g0>][,<o1>[:g1][,...]] [-pw, --password <password>] <inputfile> [<outputfile>]\n"
    "  -h, --help: show help\n"
    "  -a, --extract-add-streams: extracts additional streams to outputfile_object\n"
    "      and prints the mimetype found to stdout\n"
    "  -f, --extract-fonts: extracts fonts (currently only type1 and truetype are supported)\n"
    "  -o, --extract-objects: extracts object streams, the syntax of the argument is comma separated\n"
    "      object numbers, where object number and generation number are separated by \':\'\n"
    "      an omitted generation number defaults to 0\n"
    "  -pw, --password: use password for decryption\n"
    "\n"
    "note: -f, -a, -o and normal unzip operation are mutually exclusive\n"
    , pExe, pExe, pExe, pExe, pExe );
}
56 
57 namespace {
58 
59 class FileEmitContext : public EmitContext
60 {
61     oslFileHandle m_aHandle;
62     oslFileHandle m_aReadHandle;
63     unsigned int  m_nReadLen;
64 
65     void openReadFile( const char* pOrigName );
66 
67     public:
68     FileEmitContext( const char* pFileName, const char* pOrigName, const PDFContainer* pTop );
69     virtual ~FileEmitContext() override;
70 
71     virtual bool write( const void* pBuf, unsigned int nLen ) noexcept override;
72     virtual unsigned int getCurPos() noexcept override;
73     virtual bool copyOrigBytes( unsigned int nOrigOffset, unsigned int nLen ) noexcept override;
74     virtual unsigned int readOrigBytes( unsigned int nOrigOffset, unsigned int nLen, void* pBuf ) noexcept override;
75 };
76 
77 }
78 
FileEmitContext(const char * pFileName,const char * pOrigName,const PDFContainer * pTop)79 FileEmitContext::FileEmitContext( const char* pFileName, const char* pOrigName, const PDFContainer* pTop )
80     : EmitContext( pTop ),
81       m_aHandle( nullptr ),
82       m_aReadHandle( nullptr ),
83       m_nReadLen( 0 )
84 {
85     OUString aSysFile(
86         OStringToOUString( std::string_view( pFileName ), osl_getThreadTextEncoding() ) );
87     OUString aURL;
88     if( osl_getFileURLFromSystemPath( aSysFile.pData, &aURL.pData ) != osl_File_E_None )
89     {
90         fprintf( stderr, "filename conversion \"%s\" failed\n", pFileName );
91         return;
92     }
93 
94     if( osl_openFile( aURL.pData, &m_aHandle, osl_File_OpenFlag_Write ) == osl_File_E_None )
95     {
96         if( osl_setFileSize( m_aHandle, 0 ) != osl_File_E_None )
97         {
98             fprintf( stderr, "could not truncate %s\n", pFileName );
99             osl_closeFile( m_aHandle );
100             m_aHandle = nullptr;
101         }
102     }
103     else if( osl_openFile( aURL.pData, &m_aHandle,
104             osl_File_OpenFlag_Write |osl_File_OpenFlag_Create ) != osl_File_E_None )
105     {
106         fprintf( stderr, "could not open %s\n", pFileName );
107         return;
108     }
109     m_bDeflate = true;
110 
111     openReadFile( pOrigName );
112 }
113 
~FileEmitContext()114 FileEmitContext::~FileEmitContext()
115 {
116     if( m_aHandle )
117         osl_closeFile( m_aHandle );
118     if( m_aReadHandle )
119         osl_closeFile( m_aReadHandle );
120 }
121 
openReadFile(const char * pInFile)122 void FileEmitContext::openReadFile( const char* pInFile )
123 {
124     OUString aSysFile(
125         OStringToOUString( std::string_view( pInFile ), osl_getThreadTextEncoding() ) );
126     OUString aURL;
127     if( osl_getFileURLFromSystemPath( aSysFile.pData, &aURL.pData ) != osl_File_E_None )
128     {
129         fprintf( stderr, "filename conversion \"%s\" failed\n", pInFile );
130         return;
131     }
132 
133     if( osl_openFile( aURL.pData, &m_aReadHandle, osl_File_OpenFlag_Read ) != osl_File_E_None )
134     {
135         fprintf( stderr, "could not open %s\n", pInFile );
136         return;
137     }
138 
139     if( osl_setFilePos( m_aReadHandle, osl_Pos_End, 0 ) != osl_File_E_None )
140     {
141         fprintf( stderr, "could not seek to end of %s\n", pInFile );
142         osl_closeFile( m_aReadHandle );
143         return;
144     }
145 
146     sal_uInt64 nFileSize = 0;
147     if( osl_getFilePos( m_aReadHandle, &nFileSize ) != osl_File_E_None )
148     {
149         fprintf( stderr, "could not get end pos of %s\n", pInFile );
150         osl_closeFile( m_aReadHandle );
151         return;
152     }
153 
154     m_nReadLen = static_cast<unsigned int>(nFileSize);
155 }
156 
write(const void * pBuf,unsigned int nLen)157 bool FileEmitContext::write( const void* pBuf, unsigned int nLen ) noexcept
158 {
159     if( ! m_aHandle )
160         return false;
161 
162     sal_uInt64 nWrite = static_cast<sal_uInt64>(nLen);
163     sal_uInt64 nWritten = 0;
164     return (osl_writeFile( m_aHandle, pBuf, nWrite, &nWritten ) == osl_File_E_None)
165            && nWrite == nWritten;
166 }
167 
getCurPos()168 unsigned int FileEmitContext::getCurPos() noexcept
169 {
170     sal_uInt64 nFileSize = 0;
171     if( m_aHandle )
172     {
173         if( osl_getFilePos( m_aHandle, &nFileSize ) != osl_File_E_None )
174             nFileSize = 0;
175     }
176     return static_cast<unsigned int>(nFileSize);
177 }
178 
copyOrigBytes(unsigned int nOrigOffset,unsigned int nLen)179 bool FileEmitContext::copyOrigBytes( unsigned int nOrigOffset, unsigned int nLen ) noexcept
180 {
181     if( nOrigOffset + nLen > m_nReadLen )
182         return false;
183 
184     if( osl_setFilePos( m_aReadHandle, osl_Pos_Absolut, nOrigOffset ) != osl_File_E_None )
185     {
186         fprintf( stderr, "could not seek to offset %u\n", nOrigOffset );
187         return false;
188     }
189     void* pBuf = std::malloc( nLen );
190     if( ! pBuf )
191         return false;
192     sal_uInt64 nBytesRead = 0;
193     if( osl_readFile( m_aReadHandle, pBuf, nLen, &nBytesRead ) != osl_File_E_None
194         || nBytesRead != static_cast<sal_uInt64>(nLen) )
195     {
196         fprintf( stderr, "could not read %u bytes\n", nLen );
197         std::free( pBuf );
198         return false;
199     }
200     bool bRet = write( pBuf, nLen );
201     std::free( pBuf );
202     return bRet;
203 }
204 
readOrigBytes(unsigned int nOrigOffset,unsigned int nLen,void * pBuf)205 unsigned int FileEmitContext::readOrigBytes( unsigned int nOrigOffset, unsigned int nLen, void* pBuf ) noexcept
206 {
207     if( nOrigOffset + nLen > m_nReadLen )
208         return 0;
209 
210     if( osl_setFilePos( m_aReadHandle, osl_Pos_Absolut, nOrigOffset ) != osl_File_E_None )
211     {
212         fprintf( stderr, "could not seek to offset %u\n", nOrigOffset );
213         return 0;
214     }
215     sal_uInt64 nBytesRead = 0;
216     if( osl_readFile( m_aReadHandle, pBuf, nLen, &nBytesRead ) != osl_File_E_None )
217         return 0;
218     return static_cast<unsigned int>(nBytesRead);
219 }
220 
221 typedef int(*PDFFileHdl)(const char*, const char*, PDFFile*);
222 
handleFile(const char * pInFile,const char * pOutFile,const char * pPassword,PDFFileHdl pHdl)223 static int handleFile( const char* pInFile, const char* pOutFile, const char* pPassword, PDFFileHdl pHdl )
224 {
225     int nRet = 0;
226     std::unique_ptr<PDFEntry> pEntry = pdfparse::PDFReader::read( pInFile );
227     if( pEntry )
228     {
229         PDFFile* pPDFFile = dynamic_cast<PDFFile*>(pEntry.get());
230         if( pPDFFile )
231         {
232             fprintf( stdout, "have a %s PDF file\n", pPDFFile->isEncrypted() ? "encrypted" : "unencrypted" );
233             if( pPassword )
234                 fprintf( stdout, "password %s\n",
235                          pPDFFile->setupDecryptionData( pPassword ) ? "matches" : "does not match" );
236             nRet = pHdl( pInFile, pOutFile, pPDFFile );
237         }
238         else
239             nRet = 20;
240     }
241     return nRet;
242 }
243 
write_unzipFile(const char * pInFile,const char * pOutFile,PDFFile * pPDFFile)244 static int write_unzipFile( const char* pInFile, const char* pOutFile, PDFFile* pPDFFile )
245 {
246     FileEmitContext aContext( pOutFile, pInFile, pPDFFile );
247     aContext.m_bDecrypt = pPDFFile->isEncrypted();
248     pPDFFile->emit(aContext);
249     return 0;
250 }
251 
write_addStreamArray(const char * pOutFile,PDFArray * pStreams,PDFFile * pPDFFile,const char * pInFile)252 static int write_addStreamArray( const char* pOutFile, PDFArray* pStreams, PDFFile* pPDFFile, const char* pInFile )
253 {
254     int nRet = 0;
255     unsigned int nArrayElements = pStreams->m_aSubElements.size();
256     for( unsigned int i = 0; i < nArrayElements-1 && nRet == 0; i++ )
257     {
258         PDFName* pMimeType = dynamic_cast<PDFName*>(pStreams->m_aSubElements[i].get());
259         PDFObjectRef* pStreamRef = dynamic_cast<PDFObjectRef*>(pStreams->m_aSubElements[i+1].get());
260         if( ! pMimeType )
261             fprintf( stderr, "error: no mimetype element\n" );
262         if( ! pStreamRef )
263             fprintf( stderr, "error: no stream ref element\n" );
264         if( pMimeType && pStreamRef )
265         {
266             fprintf( stdout, "found stream %d %d with mimetype %s\n",
267                      pStreamRef->m_nNumber, pStreamRef->m_nGeneration,
268                      pMimeType->m_aName.getStr() );
269             PDFObject* pObject = pPDFFile->findObject( pStreamRef->m_nNumber, pStreamRef->m_nGeneration );
270             if( pObject )
271             {
272                 OString aOutStream = pOutFile +
273                     OString::Concat("_stream_") +
274                     OString::number( sal_Int32(pStreamRef->m_nNumber) ) +
275                     "_" +
276                     OString::number( sal_Int32(pStreamRef->m_nGeneration) );
277                 FileEmitContext aContext( aOutStream.getStr(), pInFile, pPDFFile );
278                 aContext.m_bDecrypt = pPDFFile->isEncrypted();
279                 pObject->writeStream( aContext, pPDFFile );
280             }
281             else
282             {
283                 fprintf( stderr, "object not found\n" );
284                 nRet = 121;
285             }
286         }
287         else
288             nRet = 120;
289     }
290     return nRet;
291 }
292 
write_addStreams(const char * pInFile,const char * pOutFile,PDFFile * pPDFFile)293 static int write_addStreams( const char* pInFile, const char* pOutFile, PDFFile* pPDFFile )
294 {
295     // find all trailers
296     int nRet = 0;
297     unsigned int nElements = pPDFFile->m_aSubElements.size();
298     for( unsigned i = 0; i < nElements && nRet == 0; i++ )
299     {
300         PDFTrailer* pTrailer = dynamic_cast<PDFTrailer*>(pPDFFile->m_aSubElements[i].get());
301         if( pTrailer && pTrailer->m_pDict )
302         {
303             // search for AdditionalStreams entry
304             auto add_stream = pTrailer->m_pDict->m_aMap.find( "AdditionalStreams" );
305             if( add_stream != pTrailer->m_pDict->m_aMap.end() )
306             {
307                 PDFArray* pStreams = dynamic_cast<PDFArray*>(add_stream->second);
308                 if( pStreams )
309                     nRet = write_addStreamArray( pOutFile, pStreams, pPDFFile, pInFile );
310             }
311         }
312     }
313     return nRet;
314 }
315 
write_fonts(const char * i_pInFile,const char * i_pOutFile,PDFFile * i_pPDFFile)316 static int write_fonts( const char* i_pInFile, const char* i_pOutFile, PDFFile* i_pPDFFile )
317 {
318     unsigned int nElements = i_pPDFFile->m_aSubElements.size();
319     for (unsigned i = 0; i < nElements; i++)
320     {
321         // search FontDescriptors
322         PDFObject* pObj = dynamic_cast<PDFObject*>(i_pPDFFile->m_aSubElements[i].get());
323         if( ! pObj )
324             continue;
325         PDFDict* pDict = dynamic_cast<PDFDict*>(pObj->m_pObject);
326         if( ! pDict )
327             continue;
328 
329         std::unordered_map<OString,PDFEntry*>::iterator map_it =
330                 pDict->m_aMap.find( "Type" );
331         if( map_it == pDict->m_aMap.end() )
332             continue;
333 
334         PDFName* pName = dynamic_cast<PDFName*>(map_it->second);
335         if( ! pName )
336             continue;
337         if( pName->m_aName != "FontDescriptor" )
338             continue;
339 
340         // the font name will be helpful, also there must be one in
341         // a font descriptor
342         map_it = pDict->m_aMap.find( "FontName" );
343         if( map_it == pDict->m_aMap.end() )
344             continue;
345         pName = dynamic_cast<PDFName*>(map_it->second);
346         if( ! pName )
347             continue;
348         OString aFontName( pName->m_aName );
349 
350         PDFObjectRef* pStreamRef = nullptr;
351         const char* pFileType = nullptr;
352         // we have a font descriptor, try for a type 1 font
353         map_it = pDict->m_aMap.find( "FontFile" );
354         if( map_it != pDict->m_aMap.end() )
355         {
356             pStreamRef = dynamic_cast<PDFObjectRef*>(map_it->second);
357             if( pStreamRef )
358                 pFileType = "pfa";
359         }
360 
361         // perhaps it's a truetype file ?
362         if( ! pStreamRef )
363         {
364             map_it  = pDict->m_aMap.find( "FontFile2" );
365             if( map_it != pDict->m_aMap.end() )
366             {
367                 pStreamRef = dynamic_cast<PDFObjectRef*>(map_it->second);
368                 if( pStreamRef )
369                     pFileType = "ttf";
370             }
371         }
372 
373         if( ! pStreamRef )
374             continue;
375 
376         PDFObject* pStream = i_pPDFFile->findObject( pStreamRef );
377         if( ! pStream )
378             continue;
379 
380         OStringBuffer aOutStream( i_pOutFile );
381         aOutStream.append( "_font_" );
382         aOutStream.append( sal_Int32(pStreamRef->m_nNumber) );
383         aOutStream.append( "_" );
384         aOutStream.append( sal_Int32(pStreamRef->m_nGeneration) );
385         aOutStream.append( "_" );
386         aOutStream.append( aFontName );
387         if( pFileType )
388         {
389             aOutStream.append( "." );
390             aOutStream.append( pFileType );
391         }
392         FileEmitContext aContext( aOutStream.getStr(), i_pInFile, i_pPDFFile );
393         aContext.m_bDecrypt = i_pPDFFile->isEncrypted();
394         pStream->writeStream( aContext, i_pPDFFile );
395     }
396     return 0;
397 }
398 
399 static std::vector< std::pair< sal_Int32, sal_Int32 > > s_aEmitObjects;
400 
write_objects(const char * i_pInFile,const char * i_pOutFile,PDFFile * i_pPDFFile)401 static int write_objects( const char* i_pInFile, const char* i_pOutFile, PDFFile* i_pPDFFile )
402 {
403     unsigned int nElements = s_aEmitObjects.size();
404     for (unsigned i = 0; i < nElements; i++)
405     {
406         sal_Int32 nObject     = s_aEmitObjects[i].first;
407         sal_Int32 nGeneration = s_aEmitObjects[i].second;
408         PDFObject* pStream = i_pPDFFile->findObject( nObject, nGeneration );
409         if( ! pStream )
410         {
411             fprintf( stderr, "object %d %d not found !\n", static_cast<int>(nObject), static_cast<int>(nGeneration) );
412             continue;
413         }
414 
415         OString aOutStream = i_pOutFile +
416             OString::Concat("_stream_") +
417             OString::number( nObject ) +
418             "_"  +
419             OString::number( nGeneration );
420         FileEmitContext aContext( aOutStream.getStr(), i_pInFile, i_pPDFFile );
421         aContext.m_bDecrypt = i_pPDFFile->isEncrypted();
422         pStream->writeStream( aContext, i_pPDFFile );
423     }
424     return 0;
425 }
426 
SAL_IMPLEMENT_MAIN_WITH_ARGS(argc,argv)427 SAL_IMPLEMENT_MAIN_WITH_ARGS( argc, argv )
428 {
429     const char* pInFile = nullptr;
430     const char* pOutFile = nullptr;
431     const char* pPassword = nullptr;
432     OStringBuffer aOutFile( 256 );
433     PDFFileHdl aHdl = write_unzipFile;
434 
435     for( int nArg = 1; nArg < argc; nArg++ )
436     {
437         if( argv[nArg][0] == '-' )
438         {
439             if( ! rtl_str_compare( "-pw", argv[nArg] ) ||
440                 ! rtl_str_compare( "--password" , argv[nArg] ) )
441             {
442                 if( nArg == argc-1 )
443                 {
444                     fprintf( stderr, "no password given\n" );
445                     return 1;
446                 }
447                 nArg++;
448                 pPassword = argv[nArg];
449             }
450             else if( ! rtl_str_compare( "-h", argv[nArg] ) ||
451                 ! rtl_str_compare( "--help", argv[nArg] ) )
452             {
453                 printHelp( argv[0] );
454                 return 0;
455             }
456             else if( ! rtl_str_compare( "-a", argv[nArg] ) ||
457                 ! rtl_str_compare( "--extract-add-streams", argv[nArg] ) )
458             {
459                 aHdl = write_addStreams;
460             }
461             else if( ! rtl_str_compare( "-f", argv[nArg] ) ||
462                 ! rtl_str_compare( "--extract-fonts", argv[nArg] ) )
463             {
464                 aHdl = write_fonts;
465             }
466             else if( ! rtl_str_compare( "-o", argv[nArg] ) ||
467                 ! rtl_str_compare( "--extract-objects", argv[nArg] ) )
468             {
469                 aHdl = write_objects;
470                 nArg++;
471                 if( nArg < argc )
472                 {
473                     OString aObjs( argv[nArg] );
474                     sal_Int32 nIndex = 0;
475                     while( nIndex != -1 )
476                     {
477                         OString aToken( aObjs.getToken( 0, ',', nIndex ) );
478                         sal_Int32 nObject = 0;
479                         sal_Int32 nGeneration = 0;
480                         sal_Int32 nGenIndex = 0;
481                         nObject = aToken.getToken( 0, ':', nGenIndex ).toInt32();
482                         if( nGenIndex != -1 )
483                             nGeneration = aToken.getToken( 0, ':', nGenIndex ).toInt32();
484                         s_aEmitObjects.push_back( std::pair<sal_Int32,sal_Int32>(nObject,nGeneration) );
485                     }
486                 }
487             }
488             else
489             {
490                 fprintf( stderr, "unrecognized option \"%s\"\n",
491                          argv[nArg] );
492                 printHelp( argv[0] );
493                 return 1;
494             }
495         }
496         else if( pInFile == nullptr )
497             pInFile = argv[nArg];
498         else if( pOutFile == nullptr )
499             pOutFile = argv[nArg];
500     }
501     if( ! pInFile )
502     {
503         fprintf( stderr, "no input file given\n" );
504         return 10;
505     }
506     if( ! pOutFile )
507     {
508         OString aFile( pInFile );
509         if( aFile.getLength() > 0 )
510         {
511             if( aFile.getLength() > 4 )
512             {
513                 if( aFile.matchIgnoreAsciiCase( ".pdf", aFile.getLength()-4 ) )
514                     aOutFile.append( pInFile, aFile.getLength() - 4 );
515                 else
516                     aOutFile.append( aFile );
517             }
518             aOutFile.append( "_unzip.pdf" );
519             pOutFile = aOutFile.getStr();
520         }
521         else
522         {
523             fprintf( stderr, "no output file given\n" );
524             return 11;
525         }
526     }
527 
528     return handleFile( pInFile, pOutFile, pPassword, aHdl );
529 }
530 
531 /* vim:set shiftwidth=4 softtabstop=4 expandtab: */
532