1 /* -*- Mode: C++; tab-width: 4; indent-tabs-mode: nil; c-basic-offset: 4 -*- */
2 /*
3  * This file is part of the LibreOffice project.
4  *
5  * This Source Code Form is subject to the terms of the Mozilla Public
6  * License, v. 2.0. If a copy of the MPL was not distributed with this
7  * file, You can obtain one at http://mozilla.org/MPL/2.0/.
8  *
9  * This file incorporates work covered by the following license notice:
10  *
11  *   Licensed to the Apache Software Foundation (ASF) under one or more
12  *   contributor license agreements. See the NOTICE file distributed
13  *   with this work for additional information regarding copyright
14  *   ownership. The ASF licenses this file to you under the Apache
15  *   License, Version 2.0 (the "License"); you may not use this file
16  *   except in compliance with the License. You may obtain a copy of
17  *   the License at http://www.apache.org/licenses/LICENSE-2.0 .
18  */
19 
20 #include "scdetect.hxx"
21 
22 #include <sal/macros.h>
23 
24 #include <com/sun/star/beans/PropertyValue.hpp>
25 #include <com/sun/star/uno/XComponentContext.hpp>
26 #include <cppuhelper/supportsservice.hxx>
27 #include <com/sun/star/io/XInputStream.hpp>
28 #include <unotools/mediadescriptor.hxx>
29 #include <sfx2/docfile.hxx>
30 #include <sfx2/docfilt.hxx>
31 #include <sfx2/fcontnr.hxx>
32 #include <tools/solar.h>
33 
34 using namespace ::com::sun::star;
35 using utl::MediaDescriptor;
36 
37 namespace {
38 
39 // table with search pattern
40 // meaning of the sequences
41 // 0x00??: the exact byte 0x?? must be at that place
42 // 0x0100: read over a byte (don't care)
43 // 0x02nn: a byte of 0xnn variations follows
44 // 0x8000: recognition finished
45 
46 #define M_DC        0x0100
47 #define M_ALT(CNT)  (0x0200+(CNT))
48 #define M_ENDE      0x8000
49 
50 const sal_uInt16 pLotus[] =      // Lotus 1/1A/2
51     { 0x0000, 0x0000, 0x0002, 0x0000,
52     M_ALT(2), 0x0004, 0x0006,
53     0x0004, M_ENDE };
54 
55 const sal_uInt16 pLotusNew[] =   // Lotus >= 9.7
56     { 0x0000, 0x0000, M_DC, 0x0000,     // Rec# + Len (0x1a)
57       M_ALT(3), 0x0003, 0x0004, 0x0005, // File Revision Code 97->ME
58       0x0010, 0x0004, 0x0000, 0x0000,
59       M_ENDE };
60 
61 const sal_uInt16 pLotus2[] =     // Lotus >3
62     { 0x0000, 0x0000, 0x001A, 0x0000,   // Rec# + Len (26)
63     M_ALT(2), 0x0000, 0x0002,         // File Revision Code
64     0x0010,
65     0x0004, 0x0000,                   // File Revision Subcode
66     M_ENDE };
67 
68 const sal_uInt16 pQPro[] =
69        { 0x0000, 0x0000, 0x0002, 0x0000,
70          M_ALT(4), 0x0001, 0x0002, // WB1, WB2
71          0x0006, 0x0007,           // QPro 6/7 (?)
72          0x0010,
73          M_ENDE };
74 
75 const sal_uInt16 pDIF1[] =       // DIF with CR-LF
76     {
77     'T', 'A', 'B', 'L', 'E',
78     M_DC, M_DC,
79     '0', ',', '1',
80     M_DC, M_DC,
81     '\"',
82     M_ENDE };
83 
84 const sal_uInt16 pDIF2[] =       // DIF with CR or LF
85     {
86     'T', 'A', 'B', 'L', 'E',
87     M_DC,
88     '0', ',', '1',
89     M_DC,
90     '\"',
91     M_ENDE };
92 
93 const sal_uInt16 pSylk[] =       // Sylk
94     {
95     'I', 'D', ';',
96     M_ALT(3), 'P', 'N', 'E',        // 'P' plus undocumented Excel extensions 'N' and 'E'
97     M_ENDE };
98 
detectThisFormat(SvStream & rStr,const sal_uInt16 * pSearch)99 bool detectThisFormat(SvStream& rStr, const sal_uInt16* pSearch)
100 {
101     sal_uInt8 nByte;
102     rStr.Seek( 0 ); // in the beginning everything was bad...
103     rStr.ReadUChar( nByte );
104     bool bSync = true;
105     while( !rStr.eof() && bSync )
106     {
107         sal_uInt16 nMuster = *pSearch;
108 
109         if( nMuster < 0x0100 )
110         { // compare bytes
111             if( static_cast<sal_uInt8>(nMuster) != nByte )
112                 bSync = false;
113         }
114         else if( nMuster & M_DC )
115         { // don't care
116         }
117         else if( nMuster & M_ALT(0) )
118         { // alternative Bytes
119             sal_uInt8 nCntAlt = static_cast<sal_uInt8>(nMuster);
120             bSync = false;          // first unsynchron
121             while( nCntAlt > 0 )
122             {
123                 pSearch++;
124                 if( static_cast<sal_uInt8>(*pSearch) == nByte )
125                     bSync = true;   // only now synchronization
126                 nCntAlt--;
127             }
128         }
129         else if( nMuster & M_ENDE )
130         { // Format detected
131             return true;
132         }
133 
134         pSearch++;
135         rStr.ReadUChar( nByte );
136     }
137 
138     return false;
139 }
140 
141 }
142 
ScFilterDetect()143 ScFilterDetect::ScFilterDetect()
144 {
145 }
146 
~ScFilterDetect()147 ScFilterDetect::~ScFilterDetect()
148 {
149 }
150 
#if 0
// Disabled: this helper has no callers any more. It is kept for the time
// being in case the check can be transferred to the now centralized ascii
// detection code in the filter module.
static sal_Bool lcl_MayBeAscii( SvStream& rStream )
{
    // ASCII/CSV is considered possible if
    //  - there are no null bytes at all, or
    //  - a Byte Order Mark is present, or
    //  - for Unicode UCS2/UTF-16, all null bytes are on either even or
    //    uneven byte positions.

    rStream.Seek(STREAM_SEEK_TO_BEGIN);

    const size_t nBufSize = 2048;
    sal_uInt16 aBuffer[ nBufSize ];
    sal_uLong nBytesRead = rStream.Read( reinterpret_cast<sal_uInt8*>(aBuffer), nBufSize*2);

    // A Unicode BOM file may contain null bytes, so accept it right away.
    const bool bHasBOM = nBytesRead >= 2 && (aBuffer[0] == 0xfffe || aBuffer[0] == 0xfeff);
    if ( bHasBOM )
        return sal_True;

    // Each half of nMask stays 0xff for as long as no null byte was seen in
    // the corresponding byte of any 16-bit word read so far.
    sal_uInt16 nMask = 0xffff;
    const sal_uLong nWords = nBytesRead / 2;
    for ( sal_uLong nWord = 0; nWord < nWords && nMask; ++nWord )
    {
        const sal_uInt16 nVal = aBuffer[nWord] & nMask;
        if ( (nVal & 0x00ff) == 0 )
            nMask &= 0xff00;
        if ( (nVal & 0xff00) == 0 )
            nMask &= 0x00ff;
    }

    // Null bytes in both halves clear the mask completely.
    return nMask != 0;
}
#endif
189 
lcl_MayBeDBase(SvStream & rStream)190 static bool lcl_MayBeDBase( SvStream& rStream )
191 {
192     // Look for dbf marker, see connectivity/source/inc/dbase/DTable.hxx
193     // DBFType for values.
194     const sal_uInt8 nValidMarks[] = {
195         0x03, 0x04, 0x05, 0x30, 0x31, 0x43, 0xB3, 0x83, 0x8b, 0x8e, 0xf5 };
196     sal_uInt8 nMark;
197     rStream.Seek(STREAM_SEEK_TO_BEGIN);
198     rStream.ReadUChar( nMark );
199     bool bValidMark = false;
200     for (size_t i=0; i < SAL_N_ELEMENTS(nValidMarks) && !bValidMark; ++i)
201     {
202         if (nValidMarks[i] == nMark)
203             bValidMark = true;
204     }
205     if ( !bValidMark )
206         return false;
207 
208     const size_t nHeaderBlockSize = 32;
209     // Empty dbf is >= 32*2+1 bytes in size.
210     const size_t nEmptyDbf = nHeaderBlockSize * 2 + 1;
211 
212     sal_uLong nSize = rStream.TellEnd();
213     if ( nSize < nEmptyDbf )
214         return false;
215 
216     // count of records at 4
217     rStream.Seek(4);
218     sal_uInt32 nRecords(0);
219     rStream.ReadUInt32(nRecords);
220 
221     // length of header starts at 8
222     rStream.Seek(8);
223     sal_uInt16 nHeaderLen;
224     rStream.ReadUInt16( nHeaderLen );
225 
226     // size of record at 10
227     sal_uInt16 nRecordSize(0);
228     rStream.ReadUInt16(nRecordSize);
229 
230     if ( nHeaderLen < nEmptyDbf || nSize < nHeaderLen )
231         return false;
232 
233     // see DTable.cxx ODbaseTable::readHeader()
234     if (0 == nRecordSize)
235         return false;
236 
237     // see DTable.cxx ODbaseTable::construct() line 546
238     if (0 == nRecords)
239     {
240         nRecords = (nSize - nHeaderLen) / nRecordSize;
241     }
242 
243     // tdf#84834 sanity check of size
244     // tdf#106423: a dbf file can have 0 record, so no need to check nRecords
245     if (nSize < nHeaderLen + nRecords * sal_uInt64(nRecordSize))
246         return false;
247 
248     // Last byte of header must be 0x0d, this is how it's specified.
249     // #i9581#,#i26407# but some applications don't follow the specification
250     // and pad the header with one byte 0x00 to reach an
251     // even boundary. Some (#i88577# ) even pad more or pad using a 0x1a ^Z
252     // control character (#i8857#). This results in:
253     // Last byte of header must be 0x0d on 32 bytes boundary.
254     sal_uInt16 nBlocks = (nHeaderLen - 1) / nHeaderBlockSize;
255     sal_uInt8 nEndFlag = 0;
256     while ( nBlocks > 1 && nEndFlag != 0x0d ) {
257         rStream.Seek( nBlocks-- * nHeaderBlockSize );
258         rStream.ReadUChar( nEndFlag );
259     }
260 
261     return ( 0x0d == nEndFlag );
262 }
263 
detect(uno::Sequence<beans::PropertyValue> & lDescriptor)264 OUString SAL_CALL ScFilterDetect::detect( uno::Sequence<beans::PropertyValue>& lDescriptor )
265 {
266     MediaDescriptor aMediaDesc( lDescriptor );
267     OUString aTypeName = aMediaDesc.getUnpackedValueOrDefault( MediaDescriptor::PROP_TYPENAME(), OUString() );
268     uno::Reference< io::XInputStream > xStream ( aMediaDesc[MediaDescriptor::PROP_INPUTSTREAM()], uno::UNO_QUERY );
269     if ( !xStream.is() )
270         return OUString();
271 
272     SfxMedium aMedium;
273     aMedium.UseInteractionHandler( false );
274     aMedium.setStreamToLoadFrom( xStream, true );
275 
276     SvStream* pStream = aMedium.GetInStream();
277     if ( !pStream || pStream->GetError() )
278         // No stream, no detection.
279         return OUString();
280 
281     const char* pSearchFilterName = nullptr;
282     if (aTypeName == "calc_Lotus")
283     {
284         if (!detectThisFormat(*pStream, pLotus) && !detectThisFormat(*pStream, pLotusNew) && !detectThisFormat(*pStream, pLotus2))
285             return OUString();
286 
287         pSearchFilterName = "Lotus";
288     }
289     else if (aTypeName == "calc_QPro")
290     {
291         if (!detectThisFormat(*pStream, pQPro))
292             return OUString();
293 
294         pSearchFilterName = "Quattro Pro 6.0";
295     }
296     else if (aTypeName == "calc_SYLK")
297     {
298         if (!detectThisFormat(*pStream, pSylk))
299             return OUString();
300 
301         pSearchFilterName = "SYLK";
302     }
303     else if (aTypeName == "calc_DIF")
304     {
305         if (!detectThisFormat(*pStream, pDIF1) && !detectThisFormat(*pStream, pDIF2))
306             return OUString();
307 
308         pSearchFilterName = "DIF";
309     }
310     else if (aTypeName == "calc_dBase")
311     {
312         if (!lcl_MayBeDBase(*pStream))
313             return OUString();
314 
315         pSearchFilterName = "dBase";
316     }
317     else
318         return OUString();
319 
320     SfxFilterMatcher aMatcher("scalc");
321     std::shared_ptr<const SfxFilter> pFilter = aMatcher.GetFilter4FilterName(OUString::createFromAscii(pSearchFilterName));
322 
323     if (!pFilter)
324         return OUString();
325 
326     aMediaDesc[MediaDescriptor::PROP_FILTERNAME()] <<= pFilter->GetName();
327     aMediaDesc >> lDescriptor;
328     return aTypeName;
329 }
330 
getImplementationName()331 OUString SAL_CALL ScFilterDetect::getImplementationName()
332 {
333     return "com.sun.star.comp.calc.FormatDetector";
334 }
335 
supportsService(const OUString & sServiceName)336 sal_Bool ScFilterDetect::supportsService( const OUString& sServiceName )
337 {
338     return cppu::supportsService(this, sServiceName);
339 }
340 
getSupportedServiceNames()341 css::uno::Sequence<OUString> ScFilterDetect::getSupportedServiceNames()
342 {
343     uno::Sequence<OUString> seqServiceNames { "com.sun.star.frame.ExtendedTypeDetection" };
344     return seqServiceNames;
345 }
346 
347 extern "C" SAL_DLLPUBLIC_EXPORT css::uno::XInterface*
com_sun_star_comp_calc_FormatDetector_get_implementation(css::uno::XComponentContext *,css::uno::Sequence<css::uno::Any> const &)348 com_sun_star_comp_calc_FormatDetector_get_implementation(css::uno::XComponentContext* /*context*/,
349                                                          css::uno::Sequence<css::uno::Any> const &)
350 {
351     return cppu::acquire(new ScFilterDetect);
352 }
353 
354 
355 /* vim:set shiftwidth=4 softtabstop=4 expandtab: */
356