1 /* -*- Mode: C++; tab-width: 4; indent-tabs-mode: nil; c-basic-offset: 4 -*- */
2 /*
3 * This file is part of the LibreOffice project.
4 *
5 * This Source Code Form is subject to the terms of the Mozilla Public
6 * License, v. 2.0. If a copy of the MPL was not distributed with this
7 * file, You can obtain one at http://mozilla.org/MPL/2.0/.
8 *
9 * This file incorporates work covered by the following license notice:
10 *
11 * Licensed to the Apache Software Foundation (ASF) under one or more
12 * contributor license agreements. See the NOTICE file distributed
13 * with this work for additional information regarding copyright
14 * ownership. The ASF licenses this file to you under the Apache
15 * License, Version 2.0 (the "License"); you may not use this file
16 * except in compliance with the License. You may obtain a copy of
17 * the License at http://www.apache.org/licenses/LICENSE-2.0 .
18 */
19
20
21 #include "filterdet.hxx"
22 #include "inc/pdfihelper.hxx"
23 #include "inc/pdfparse.hxx"
24
25 #include <osl/file.h>
26 #include <osl/thread.h>
27 #include <rtl/digest.h>
28 #include <sal/log.hxx>
29 #include <com/sun/star/io/IOException.hpp>
30 #include <com/sun/star/io/XInputStream.hpp>
31 #include <com/sun/star/io/XStream.hpp>
32 #include <com/sun/star/io/XSeekable.hpp>
33 #include <com/sun/star/io/TempFile.hpp>
34 #include <com/sun/star/task/XInteractionHandler.hpp>
35 #include <comphelper/fileurl.hxx>
36 #include <comphelper/hash.hxx>
37 #include <cppuhelper/supportsservice.hxx>
38 #include <tools/diagnose_ex.h>
39 #include <memory>
40 #include <string.h>
41
42 using namespace com::sun::star;
43
44 namespace pdfi
45 {
46
47 // TODO(T3): locking/thread safety
48
49 class FileEmitContext : public pdfparse::EmitContext
50 {
51 private:
52 oslFileHandle m_aReadHandle;
53 unsigned int m_nReadLen;
54 uno::Reference< io::XStream > m_xContextStream;
55 uno::Reference< io::XSeekable > m_xSeek;
56 uno::Reference< io::XOutputStream > m_xOut;
57
58 public:
59 FileEmitContext( const OUString& rOrigFile,
60 const uno::Reference< uno::XComponentContext >& xContext,
61 const pdfparse::PDFContainer* pTop );
62 virtual ~FileEmitContext() override;
63
64 virtual bool write( const void* pBuf, unsigned int nLen ) override;
65 virtual unsigned int getCurPos() override;
66 virtual bool copyOrigBytes( unsigned int nOrigOffset, unsigned int nLen ) override;
67 virtual unsigned int readOrigBytes( unsigned int nOrigOffset, unsigned int nLen, void* pBuf ) override;
68
getContextStream() const69 const uno::Reference< io::XStream >& getContextStream() const { return m_xContextStream; }
70 };
71
FileEmitContext(const OUString & rOrigFile,const uno::Reference<uno::XComponentContext> & xContext,const pdfparse::PDFContainer * pTop)72 FileEmitContext::FileEmitContext( const OUString& rOrigFile,
73 const uno::Reference< uno::XComponentContext >& xContext,
74 const pdfparse::PDFContainer* pTop ) :
75 pdfparse::EmitContext( pTop ),
76 m_aReadHandle(nullptr),
77 m_nReadLen(0),
78 m_xContextStream(),
79 m_xSeek(),
80 m_xOut()
81 {
82 m_xContextStream.set( io::TempFile::create(xContext), uno::UNO_QUERY_THROW );
83 m_xOut = m_xContextStream->getOutputStream();
84 m_xSeek.set(m_xOut, uno::UNO_QUERY_THROW );
85
86 oslFileError aErr = osl_File_E_None;
87 if( (aErr=osl_openFile( rOrigFile.pData,
88 &m_aReadHandle,
89 osl_File_OpenFlag_Read )) == osl_File_E_None )
90 {
91 if( (aErr=osl_setFilePos( m_aReadHandle,
92 osl_Pos_End,
93 0 )) == osl_File_E_None )
94 {
95 sal_uInt64 nFileSize = 0;
96 if( (aErr=osl_getFilePos( m_aReadHandle,
97 &nFileSize )) == osl_File_E_None )
98 {
99 m_nReadLen = static_cast<unsigned int>(nFileSize);
100 }
101 }
102 if( aErr != osl_File_E_None )
103 {
104 osl_closeFile( m_aReadHandle );
105 m_aReadHandle = nullptr;
106 }
107 }
108 m_bDeflate = true;
109 }
110
~FileEmitContext()111 FileEmitContext::~FileEmitContext()
112 {
113 if( m_aReadHandle )
114 osl_closeFile( m_aReadHandle );
115 }
116
write(const void * pBuf,unsigned int nLen)117 bool FileEmitContext::write( const void* pBuf, unsigned int nLen )
118 {
119 if( ! m_xOut.is() )
120 return false;
121
122 uno::Sequence< sal_Int8 > aSeq( nLen );
123 memcpy( aSeq.getArray(), pBuf, nLen );
124 m_xOut->writeBytes( aSeq );
125 return true;
126 }
127
getCurPos()128 unsigned int FileEmitContext::getCurPos()
129 {
130 unsigned int nPos = 0;
131 if( m_xSeek.is() )
132 {
133 nPos = static_cast<unsigned int>( m_xSeek->getPosition() );
134 }
135 return nPos;
136 }
137
copyOrigBytes(unsigned int nOrigOffset,unsigned int nLen)138 bool FileEmitContext::copyOrigBytes( unsigned int nOrigOffset, unsigned int nLen )
139 {
140 if( nOrigOffset + nLen > m_nReadLen )
141 return false;
142
143 if( osl_setFilePos( m_aReadHandle, osl_Pos_Absolut, nOrigOffset ) != osl_File_E_None )
144 return false;
145
146 uno::Sequence< sal_Int8 > aSeq( nLen );
147
148 sal_uInt64 nBytesRead = 0;
149 if( osl_readFile( m_aReadHandle,
150 aSeq.getArray(),
151 nLen,
152 &nBytesRead ) != osl_File_E_None
153 || nBytesRead != static_cast<sal_uInt64>(nLen) )
154 {
155 return false;
156 }
157
158 m_xOut->writeBytes( aSeq );
159 return true;
160 }
161
readOrigBytes(unsigned int nOrigOffset,unsigned int nLen,void * pBuf)162 unsigned int FileEmitContext::readOrigBytes( unsigned int nOrigOffset, unsigned int nLen, void* pBuf )
163 {
164 if( nOrigOffset + nLen > m_nReadLen )
165 return 0;
166
167 if( osl_setFilePos( m_aReadHandle,
168 osl_Pos_Absolut,
169 nOrigOffset ) != osl_File_E_None )
170 {
171 return 0;
172 }
173
174 sal_uInt64 nBytesRead = 0;
175 if( osl_readFile( m_aReadHandle,
176 pBuf,
177 nLen,
178 &nBytesRead ) != osl_File_E_None )
179 {
180 return 0;
181 }
182 return static_cast<unsigned int>(nBytesRead);
183 }
184
185
PDFDetector(const uno::Reference<uno::XComponentContext> & xContext)186 PDFDetector::PDFDetector( const uno::Reference< uno::XComponentContext >& xContext) :
187 PDFDetectorBase( m_aMutex ),
188 m_xContext( xContext )
189 {}
190
191 // XExtendedFilterDetection
detect(uno::Sequence<beans::PropertyValue> & rFilterData)192 OUString SAL_CALL PDFDetector::detect( uno::Sequence< beans::PropertyValue >& rFilterData )
193 {
194 osl::MutexGuard const guard( m_aMutex );
195 bool bSuccess = false;
196
197 // get the InputStream carrying the PDF content
198 uno::Reference< io::XInputStream > xInput;
199 uno::Reference< io::XStream > xEmbedStream;
200 OUString aOutFilterName, aOutTypeName;
201 OUString aURL;
202 OUString aPwd;
203 const beans::PropertyValue* pAttribs = rFilterData.getConstArray();
204 sal_Int32 nAttribs = rFilterData.getLength();
205 sal_Int32 nFilterNamePos = -1;
206 sal_Int32 nPwdPos = -1;
207 for( sal_Int32 i = 0; i < nAttribs; i++ )
208 {
209 OUString aVal( "<no string>" );
210 pAttribs[i].Value >>= aVal;
211 SAL_INFO( "sdext.pdfimport", "doDetection: Attrib: " + pAttribs[i].Name + " = " + aVal);
212
213 if ( pAttribs[i].Name == "InputStream" )
214 pAttribs[i].Value >>= xInput;
215 else if ( pAttribs[i].Name == "URL" )
216 pAttribs[i].Value >>= aURL;
217 else if ( pAttribs[i].Name == "FilterName" )
218 nFilterNamePos = i;
219 else if ( pAttribs[i].Name == "Password" )
220 {
221 nPwdPos = i;
222 pAttribs[i].Value >>= aPwd;
223 }
224 }
225 if( xInput.is() )
226 {
227 oslFileHandle aFile = nullptr;
228 try {
229 uno::Reference< io::XSeekable > xSeek( xInput, uno::UNO_QUERY );
230 if( xSeek.is() )
231 xSeek->seek( 0 );
232 // read the first 1024 byte (see PDF reference implementation note 12)
233 const sal_Int32 nHeaderSize = 1024;
234 uno::Sequence< sal_Int8 > aBuf( nHeaderSize );
235 sal_uInt64 nBytes = xInput->readBytes( aBuf, nHeaderSize );
236 if( nBytes > 5 )
237 {
238 const sal_Int8* pBytes = aBuf.getConstArray();
239 for( sal_uInt64 i = 0; i < nBytes-5; i++ )
240 {
241 if( pBytes[i] == '%' &&
242 pBytes[i+1] == 'P' &&
243 pBytes[i+2] == 'D' &&
244 pBytes[i+3] == 'F' &&
245 pBytes[i+4] == '-' )
246 {
247 bSuccess = true;
248 break;
249 }
250 }
251 }
252
253 // check for hybrid PDF
254 if( bSuccess &&
255 ( aURL.isEmpty() || !comphelper::isFileUrl(aURL) )
256 )
257 {
258 sal_uInt64 nWritten = 0;
259 if( osl_createTempFile( nullptr, &aFile, &aURL.pData ) != osl_File_E_None )
260 {
261 bSuccess = false;
262 }
263 else
264 {
265 SAL_INFO( "sdext.pdfimport", "created temp file " + aURL );
266
267 osl_writeFile( aFile, aBuf.getConstArray(), nBytes, &nWritten );
268
269 SAL_WARN_IF( nWritten != nBytes, "sdext.pdfimport", "writing of header bytes failed" );
270
271 if( nWritten == nBytes )
272 {
273 const sal_uInt32 nBufSize = 4096;
274 aBuf = uno::Sequence<sal_Int8>(nBufSize);
275 // copy the bytes
276 do
277 {
278 nBytes = xInput->readBytes( aBuf, nBufSize );
279 if( nBytes > 0 )
280 {
281 osl_writeFile( aFile, aBuf.getConstArray(), nBytes, &nWritten );
282 if( nWritten != nBytes )
283 {
284 bSuccess = false;
285 break;
286 }
287 }
288 } while( nBytes == nBufSize );
289 }
290 }
291 osl_closeFile( aFile );
292 }
293 } catch (const css::io::IOException &) {
294 TOOLS_WARN_EXCEPTION("sdext.pdfimport", "caught");
295 return OUString();
296 }
297 OUString aEmbedMimetype;
298 xEmbedStream = getAdditionalStream( aURL, aEmbedMimetype, aPwd, m_xContext, rFilterData, false );
299 if( aFile )
300 osl_removeFile( aURL.pData );
301 if( !aEmbedMimetype.isEmpty() )
302 {
303 if( aEmbedMimetype == "application/vnd.oasis.opendocument.text"
304 || aEmbedMimetype == "application/vnd.oasis.opendocument.text-master" )
305 aOutFilterName = "writer_pdf_addstream_import";
306 else if ( aEmbedMimetype == "application/vnd.oasis.opendocument.presentation" )
307 aOutFilterName = "impress_pdf_addstream_import";
308 else if( aEmbedMimetype == "application/vnd.oasis.opendocument.graphics"
309 || aEmbedMimetype == "application/vnd.oasis.opendocument.drawing" )
310 aOutFilterName = "draw_pdf_addstream_import";
311 else if ( aEmbedMimetype == "application/vnd.oasis.opendocument.spreadsheet" )
312 aOutFilterName = "calc_pdf_addstream_import";
313 }
314 }
315
316 if( bSuccess )
317 {
318 if( !aOutFilterName.isEmpty() )
319 {
320 if( nFilterNamePos == -1 )
321 {
322 nFilterNamePos = nAttribs;
323 rFilterData.realloc( ++nAttribs );
324 rFilterData[ nFilterNamePos ].Name = "FilterName";
325 }
326 aOutTypeName = "pdf_Portable_Document_Format";
327
328 rFilterData[nFilterNamePos].Value <<= aOutFilterName;
329 if( xEmbedStream.is() )
330 {
331 rFilterData.realloc( ++nAttribs );
332 rFilterData[nAttribs-1].Name = "EmbeddedSubstream";
333 rFilterData[nAttribs-1].Value <<= xEmbedStream;
334 }
335 if( !aPwd.isEmpty() )
336 {
337 if( nPwdPos == -1 )
338 {
339 nPwdPos = nAttribs;
340 rFilterData.realloc( ++nAttribs );
341 rFilterData[ nPwdPos ].Name = "Password";
342 }
343 rFilterData[ nPwdPos ].Value <<= aPwd;
344 }
345 }
346 else
347 {
348 if( nFilterNamePos == -1 )
349 {
350 nFilterNamePos = nAttribs;
351 rFilterData.realloc( ++nAttribs );
352 rFilterData[ nFilterNamePos ].Name = "FilterName";
353 }
354
355 const sal_Int32 nDocumentType = 0; //const sal_Int32 nDocumentType = queryDocumentTypeDialog(m_xContext,aURL);
356 if( nDocumentType < 0 )
357 {
358 return OUString();
359 }
360 else switch( nDocumentType )
361 {
362 case 0:
363 rFilterData[nFilterNamePos].Value <<= OUString( "draw_pdf_import" );
364 break;
365
366 case 1:
367 rFilterData[nFilterNamePos].Value <<= OUString( "impress_pdf_import" );
368 break;
369
370 case 2:
371 rFilterData[nFilterNamePos].Value <<= OUString( "writer_pdf_import" );
372 break;
373
374 default:
375 assert(!"Unexpected case");
376 }
377
378 aOutTypeName = "pdf_Portable_Document_Format";
379 }
380 }
381
382 return aOutTypeName;
383 }
384
getImplementationName()385 OUString PDFDetector::getImplementationName()
386 {
387 return "org.libreoffice.comp.documents.PDFDetector";
388 }
389
supportsService(OUString const & ServiceName)390 sal_Bool PDFDetector::supportsService(OUString const & ServiceName)
391 {
392 return cppu::supportsService(this, ServiceName);
393 }
394
getSupportedServiceNames()395 css::uno::Sequence<OUString> PDFDetector::getSupportedServiceNames()
396 {
397 return css::uno::Sequence<OUString>{"com.sun.star.document.ImportFilter"};
398 }
399
checkDocChecksum(const OUString & rInPDFFileURL,sal_uInt32 nBytes,const OUString & rChkSum)400 bool checkDocChecksum( const OUString& rInPDFFileURL,
401 sal_uInt32 nBytes,
402 const OUString& rChkSum )
403 {
404 if( rChkSum.getLength() != 2* RTL_DIGEST_LENGTH_MD5 )
405 {
406 SAL_INFO(
407 "sdext.pdfimport",
408 "checksum of length " << rChkSum.getLength() << ", expected "
409 << 2*RTL_DIGEST_LENGTH_MD5);
410 return false;
411 }
412
413 // prepare checksum to test
414 sal_uInt8 nTestChecksum[ RTL_DIGEST_LENGTH_MD5 ];
415 const sal_Unicode* pChar = rChkSum.getStr();
416 for(sal_uInt8 & rn : nTestChecksum)
417 {
418 sal_uInt8 nByte = sal_uInt8( ( (*pChar >= '0' && *pChar <= '9') ? *pChar - '0' :
419 ( (*pChar >= 'A' && *pChar <= 'F') ? *pChar - 'A' + 10 :
420 ( (*pChar >= 'a' && *pChar <= 'f') ? *pChar - 'a' + 10 :
421 0 ) ) ) );
422 nByte <<= 4;
423 pChar++;
424 nByte |= ( (*pChar >= '0' && *pChar <= '9') ? *pChar - '0' :
425 ( (*pChar >= 'A' && *pChar <= 'F') ? *pChar - 'A' + 10 :
426 ( (*pChar >= 'a' && *pChar <= 'f') ? *pChar - 'a' + 10 :
427 0 ) ) );
428 pChar++;
429 rn = nByte;
430 }
431
432 // open file and calculate actual checksum up to index nBytes
433 ::std::vector<unsigned char> nChecksum;
434 ::comphelper::Hash aDigest(::comphelper::HashType::MD5);
435 oslFileHandle aRead = nullptr;
436 oslFileError aErr = osl_File_E_None;
437 if( (aErr = osl_openFile(rInPDFFileURL.pData,
438 &aRead,
439 osl_File_OpenFlag_Read )) == osl_File_E_None )
440 {
441 sal_uInt8 aBuf[4096];
442 sal_uInt32 nCur = 0;
443 sal_uInt64 nBytesRead = 0;
444 while( nCur < nBytes )
445 {
446 sal_uInt32 nPass = std::min<sal_uInt32>(nBytes - nCur, sizeof( aBuf ));
447 if( (aErr = osl_readFile( aRead, aBuf, nPass, &nBytesRead)) != osl_File_E_None
448 || nBytesRead == 0 )
449 {
450 break;
451 }
452 nPass = static_cast<sal_uInt32>(nBytesRead);
453 nCur += nPass;
454 aDigest.update(aBuf, nPass);
455 }
456
457 nChecksum = aDigest.finalize();
458 osl_closeFile( aRead );
459 }
460
461 // compare the contents
462 return nChecksum.size() == RTL_DIGEST_LENGTH_MD5
463 && (0 == memcmp(nChecksum.data(), nTestChecksum, nChecksum.size()));
464 }
465
/** Extracts the additional (embedded) document stream of a hybrid PDF.

    The file is parsed with pdfparse and its trailers are visited from last
    to first. A trailer qualifies if it carries a "DocChecksum" name and an
    "AdditionalStreams" array with at least two elements, and if the checksum
    matches the file content up to the trailer's offset. The first array
    element is taken as mimetype name, the second as reference to the stream
    object, which is then written to a temp file stream.

    @param rInPDFFileURL  file URL of the PDF document
    @param rOutMimetype   receives the mimetype of the embedded stream; also
                          set when the stream exists but cannot be decrypted
                          without asking the user
    @param io_rPwd        password to try for encrypted documents; passed to
                          getPassword() when the user is asked
    @param xContext       component context used for creating the temp file
    @param rFilterData    searched for an "InteractionHandler" entry
    @param bMayUseUI      whether asking the user for a password is allowed

    @return the stream holding the embedded document, or an empty reference
            if none could be extracted
*/
uno::Reference< io::XStream > getAdditionalStream( const OUString&                                rInPDFFileURL,
                                                   OUString&                                      rOutMimetype,
                                                   OUString&                                      io_rPwd,
                                                   const uno::Reference<uno::XComponentContext>&  xContext,
                                                   const uno::Sequence<beans::PropertyValue>&     rFilterData,
                                                   bool                                           bMayUseUI )
{
    uno::Reference< io::XStream > xResult;

    // the parser wants a system path in the thread's text encoding
    OUString aSystemPath;
    if( osl_getSystemPathFromFileURL( rInPDFFileURL.pData, &aSystemPath.pData ) != osl_File_E_None )
        return xResult;
    const OString aNativePath( OUStringToOString( aSystemPath, osl_getThreadTextEncoding() ) );

    std::unique_ptr<pdfparse::PDFEntry> pParsed( pdfparse::PDFReader::read( aNativePath.getStr() ));
    // dynamic_cast of a null pointer yields null, so this also covers a failed parse
    pdfparse::PDFFile* const pDoc = dynamic_cast<pdfparse::PDFFile*>( pParsed.get() );
    if( pDoc == nullptr )
        return xResult;

    // walk the top-level elements backwards, looking at trailers only
    for( unsigned int nIdx = pDoc->m_aSubElements.size(); nIdx-- > 0; )
    {
        pdfparse::PDFTrailer* pTrailer = dynamic_cast<pdfparse::PDFTrailer*>( pDoc->m_aSubElements[nIdx].get() );
        if( !pTrailer || !pTrailer->m_pDict )
            continue;

        auto& rTrailerMap = pTrailer->m_pDict->m_aMap;

        // search document checksum entry
        auto itChecksum = rTrailerMap.find( "DocChecksum" );
        if( itChecksum == rTrailerMap.end() )
        {
            SAL_INFO( "sdext.pdfimport", "no DocChecksum entry" );
            continue;
        }
        pdfparse::PDFName* pChkSumName = dynamic_cast<pdfparse::PDFName*>( itChecksum->second );
        if( pChkSumName == nullptr )
        {
            SAL_INFO( "sdext.pdfimport", "no name for DocChecksum entry" );
            continue;
        }

        // search for AdditionalStreams entry
        auto itStreams = rTrailerMap.find( "AdditionalStreams" );
        if( itStreams == rTrailerMap.end() )
        {
            SAL_INFO( "sdext.pdfimport", "no AdditionalStreams entry" );
            continue;
        }
        pdfparse::PDFArray* pStreams = dynamic_cast<pdfparse::PDFArray*>( itStreams->second );
        if( ! pStreams || pStreams->m_aSubElements.size() < 2 )
        {
            SAL_INFO( "sdext.pdfimport", "AdditionalStreams array too small" );
            continue;
        }

        // check checksum
        OUString aChkSum = pChkSumName->getFilteredName();
        if( ! checkDocChecksum( rInPDFFileURL, pTrailer->m_nOffset, aChkSum ) )
            continue;

        // extract addstream and mimetype
        pdfparse::PDFName* pMimeType = dynamic_cast<pdfparse::PDFName*>( pStreams->m_aSubElements[0].get() );
        pdfparse::PDFObjectRef* pStreamRef = dynamic_cast<pdfparse::PDFObjectRef*>( pStreams->m_aSubElements[1].get() );

        SAL_WARN_IF( !pMimeType, "sdext.pdfimport", "error: no mimetype element" );
        SAL_WARN_IF( !pStreamRef, "sdext.pdfimport", "error: no stream ref element" );

        if( !pMimeType || !pStreamRef )
            continue;

        pdfparse::PDFObject* pObject = pDoc->findObject( pStreamRef->m_nNumber, pStreamRef->m_nGeneration );
        SAL_WARN_IF( !pObject, "sdext.pdfimport", "object not found" );
        if( !pObject )
            continue;

        if( pDoc->isEncrypted() )
        {
            bool bAuthenticated = false;

            // first try the password we were given, if any
            if( !io_rPwd.isEmpty() )
            {
                OString aIsoPwd = OUStringToOString( io_rPwd,
                                                     RTL_TEXTENCODING_ISO_8859_1 );
                bAuthenticated = pDoc->setupDecryptionData( aIsoPwd.getStr() );
            }

            if( ! bAuthenticated )
            {
                uno::Reference< task::XInteractionHandler > xIntHdl;
                for( const beans::PropertyValue& rAttrib : rFilterData )
                {
                    if ( rAttrib.Name == "InteractionHandler" )
                        rAttrib.Value >>= xIntHdl;
                }

                if( ! bMayUseUI || ! xIntHdl.is() )
                {
                    // cannot ask: report the mimetype, but hand out no stream
                    rOutMimetype = pMimeType->getFilteredName();
                    xResult.clear();
                    break;
                }

                OUString aDocName( rInPDFFileURL.copy( rInPDFFileURL.lastIndexOf( '/' )+1 ) );

                // keep asking until the password fits or the request is not answered
                bool bEntered = false;
                do
                {
                    bEntered = getPassword( xIntHdl, io_rPwd, ! bEntered, aDocName );
                    OString aIsoPwd = OUStringToOString( io_rPwd,
                                                         RTL_TEXTENCODING_ISO_8859_1 );
                    bAuthenticated = pDoc->setupDecryptionData( aIsoPwd.getStr() );
                } while( bEntered && ! bAuthenticated );
            }

            if( ! bAuthenticated )
                continue;
        }

        rOutMimetype = pMimeType->getFilteredName();
        FileEmitContext aContext( rInPDFFileURL,
                                  xContext,
                                  pDoc );
        aContext.m_bDecrypt = pDoc->isEncrypted();
        pObject->writeStream( aContext, pDoc );
        xResult = aContext.getContextStream();
        break; // success
    }

    return xResult;
}
594
595 }
596
597 /* vim:set shiftwidth=4 softtabstop=4 expandtab: */
598