1""" Parse xls up to some point
2
3Read storages, (sub-)streams, records from xls file
4"""
5#
6# === LICENSE ==================================================================
7
8# xls_parser is copyright (c) 2014-2019 Philippe Lagadec (http://www.decalage.info)
9# All rights reserved.
10#
11# Redistribution and use in source and binary forms, with or without modification,
12# are permitted provided that the following conditions are met:
13#
14#  * Redistributions of source code must retain the above copyright notice, this
15#    list of conditions and the following disclaimer.
16#  * Redistributions in binary form must reproduce the above copyright notice,
17#    this list of conditions and the following disclaimer in the documentation
18#    and/or other materials provided with the distribution.
19#
20# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND
21# ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
22# WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
23# DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE
24# FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
25# DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
26# SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
27# CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
28# OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
29# OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
30
31#------------------------------------------------------------------------------
32# CHANGELOG:
33# 2017-11-02 v0.1 CH: - first version
34# 2017-11-02 v0.2 CH: - move some code to record_base.py
35#                        (to avoid copy-and-paste in ppt_parser.py)
36# 2019-01-30 v0.54 PL: - fixed import to avoid mixing installed oletools
37#                        and dev version
38
39__version__ = '0.54'
40
41# -----------------------------------------------------------------------------
42#  TODO:
43#  - parse more record types (ExternName, ...)
44#  - check what bad stuff can be in other storages: Embedded ("MBD..."), Linked
45#    ("LNK..."), "MsoDataStore" and OleStream ('\001Ole')
46#
47# -----------------------------------------------------------------------------
48#  REFERENCES:
49#  - [MS-XLS]: Excel Binary File Format (.xls) Structure Specification
50#    https://msdn.microsoft.com/en-us/library/office/cc313154(v=office.14).aspx
51#  - Understanding the Excel .xls Binary File Format
52#    https://msdn.microsoft.com/en-us/library/office/gg615597(v=office.14).aspx
53#
54# -- IMPORTS ------------------------------------------------------------------
55
56import sys
57import os.path
58from struct import unpack
59import logging
60
61# little hack to allow absolute imports even if oletools is not installed.
62# Copied from olevba.py
63PARENT_DIR = os.path.normpath(os.path.dirname(os.path.dirname(
64    os.path.abspath(__file__))))
65if PARENT_DIR not in sys.path:
66    sys.path.insert(0, PARENT_DIR)
67del PARENT_DIR
68from oletools import record_base
69
70
71# === PYTHON 2+3 SUPPORT ======================================================
72
# Python 3 has no ``unichr`` builtin; ``chr`` does the same job there, so
# alias it to let the string helpers below run on both major versions.
if sys.version_info >= (3,):
    unichr = chr
75
76###############################################################################
77# Helpers
78###############################################################################
79
80
def is_xls(filename):
    """
    Check whether the given file is an excel file in ole format.

    The file is opened as XlsFile and its streams are searched for a
    WorkbookStream. Any exception raised on the way is logged at debug level
    and interpreted as "this is not an xls file". The file is closed again
    before returning.

    todo: could further check that workbook stream starts with a globals
    substream.
    See also: oleid.OleID.check_excel

    :param filename: passed on unchanged to the XlsFile constructor
    :returns: True if a workbook stream was found, False otherwise
    """
    candidate = None
    found_workbook = False
    try:
        candidate = XlsFile(filename)
        # stops at the first workbook stream
        found_workbook = any(isinstance(stream, WorkbookStream)
                             for stream in candidate.iter_streams())
    except Exception:
        logging.debug('Ignoring exception in is_xls, assume is not xls',
                      exc_info=True)
    finally:
        if candidate is not None:
            candidate.close()
    return found_workbook
104
105
def read_unicode(data, start_idx, n_chars):
    """ read a unicode string from a XLUnicodeStringNoCch structure

    The structure starts with a flags byte of which only the lowest bit
    (fHighByte) carries information; the remaining 7 bits are reserved and
    are ignored here:

    - fHighByte == 0: one byte per character; only the low byte of each
      character is saved, all high bytes are 0
    - fHighByte == 1: two bytes per character

    :param data: bytes that contain the structure
    :param int start_idx: index of the flags byte within `data`
    :param int n_chars: number of characters (not bytes) of the string
    :returns: tuple (text, end_idx) where end_idx is the index of the first
              byte in `data` behind the string
    """
    # slicing (instead of indexing) gives a length-1 str/bytes on py2 and py3
    two_bytes_per_char = ord(data[start_idx:start_idx+1]) & 0x01
    if two_bytes_per_char:
        return read_unicode_2byte(data, start_idx+1, n_chars)

    end_idx = start_idx + 1 + n_chars
    # High byte 0 means code points 0x00-0xff, which is exactly what latin-1
    # maps byte values to. Decoding as ascii would fail for bytes >= 0x80.
    return data[start_idx+1:end_idx].decode('latin-1'), end_idx
116
117
def read_unicode_2byte(data, start_idx, n_chars):
    """ decode a string whose characters are stored as 2 bytes each

    Every character is read as a little-endian unsigned 16bit integer and
    converted to the unicode character with that code point.

    :param data: bytes that contain the string
    :param int start_idx: index of the first byte of the string in `data`
    :param int n_chars: number of characters (not bytes) to read
    :returns: tuple (text, end_idx) with end_idx = start_idx + 2 * n_chars
    """
    end_idx = start_idx + 2 * n_chars
    if n_chars >= 256:
        # long string: unpack character by character; slower but avoids
        # creating one big format string and one big tuple
        code_points = (unpack('<H', data[pos:pos+2])[0]
                       for pos in range(start_idx, end_idx, 2))
    else:
        # short string: unpack all characters with a single call
        code_points = unpack('<' + n_chars * 'H', data[start_idx:end_idx])
    return u''.join(unichr(code_point) for code_point in code_points), end_idx
128
129
130###############################################################################
131# File, Storage, Stream
132###############################################################################
133
class XlsFile(record_base.OleRecordFile):
    """ ole file in xls format; most of its streams are made up of records """

    @classmethod
    def stream_class_for_name(cls, stream_name):
        """ select the class that represents a stream; helper for iter_streams

        returns WorkbookStream for the stream named 'Workbook' and the generic
        XlsStream for every other name
        """
        return WorkbookStream if stream_name == 'Workbook' else XlsStream
143
144
class XlsStream(record_base.OleRecordStream):
    """ generic stream of an xls file, read as a sequence of records """

    def read_record_head(self):
        """ read the 4 bytes at the start of a record

        These are two little-endian unsigned 16bit integers, first the type
        and then the size of the record.

        returns (type, size, other) where other is always None
        """
        head = self.stream.read(4)
        return unpack('<HH', head) + (None, )

    @classmethod
    def record_class_for_type(cls, rec_type):
        """ determine a class for given record type

        This generic stream does not distinguish record types.

        returns (clz, force_read), which here always is (XlsRecord, False)
        """
        return XlsRecord, False
163
164
class WorkbookStream(XlsStream):
    """ the stream of an excel file that holds most of the information """

    @classmethod
    def record_class_for_type(cls, rec_type):
        """ determine a class for given record type

        Bof, Eof and SupBook records have classes of their own, all other
        types are represented by the generic XlsRecord.

        returns (clz, force_read); force_read is True for Bof and SupBook
        """
        # (record class, force_read) for all specialized record classes
        specialized = (
            (XlsRecordBof, True),
            (XlsRecordEof, False),
            (XlsRecordSupBook, True),
        )
        for record_class, force_read in specialized:
            if rec_type == record_class.TYPE:
                return record_class, force_read
        return XlsRecord, False
182
183
class XlsbStream(record_base.OleRecordStream):
    """ binary part of an xlsb file; these usually have a record structure """

    #: selects the highest bit of a byte (signals that another byte follows)
    HIGH_BIT_MASK = 0b10000000
    #: selects the 7 lower bits of a byte (the bits that carry the value)
    LOW7_BIT_MASK = 0b01111111

    def read_record_head(self):
        """ read the variable-length head of a record

        The type takes 1 or 2 bytes, the size 1 to 4 bytes. Each byte
        contributes its 7 low bits, with the first byte read holding the
        lowest bits. A set high bit means that another byte follows.

        returns (type, size, other) where other is always None
        """
        first = ord(self.stream.read(1))
        rec_type = first
        if first & self.HIGH_BIT_MASK:
            # type has a second byte; combine the 7 low bits of both
            second = ord(self.stream.read(1))
            rec_type = (first & self.LOW7_BIT_MASK) \
                | ((second & self.LOW7_BIT_MASK) << 7)

        rec_size = 0
        for shift in (0, 7, 14, 21):     # at most 4 bytes for the size
            size_byte = ord(self.stream.read(1))
            rec_size |= (size_byte & self.LOW7_BIT_MASK) << shift
            if not (size_byte & self.HIGH_BIT_MASK):
                break                    # high bit cleared --> last byte
        return rec_type, rec_size, None

    @classmethod
    def record_class_for_type(cls, rec_type):
        """ determine a class for given record type

        Only XlsbBeginSupBook has a class of its own, all other types are
        represented by the generic XlsbRecord.

        returns (clz, force_read)
        """
        if rec_type != XlsbBeginSupBook.TYPE:
            return XlsbRecord, False
        return XlsbBeginSupBook, True
224
225
226###############################################################################
227# RECORDS
228###############################################################################
229
# Names of record types that show up often in xls streams but have no
# XlsRecord subclass of their own (yet). Maps record type number to name.
FREQUENT_RECORDS = {
    156: 'BuiltInFnGroupCount',
    2147: 'BookExt',
    442: 'CodeName',
    66: 'CodePage',
    4195: 'Dat',
    2154: 'DataLabExt',
    2155: 'DataLabExtContents',
    215: 'DBCell',
    220: 'DbOrParmQry',
    2051: 'DBQueryExt',
    2166: 'DConn',
    35: 'ExternName',
    23: 'ExternSheet',
    255: 'ExtSST',
    2052: 'ExtString',
    2151: 'FeatHdr',
    91: 'FileSharing',
    1054: 'Format',
    49: 'Font',
    2199: 'GUIDTypeLib',
    440: 'HLink',
    225: 'InterfaceHdr',
    226: 'InterfaceEnd',
    523: 'Index',
    24: 'Lbl',
    193: 'Mms',
    93: 'Obj',
    4135: 'ObjectLink',
    2058: 'OleDbConn',
    222: 'OleObjectSize',
    2214: 'RichTextStream',
    2146: 'SheetExt',
    1212: 'ShrFmla',
    2060: 'SxViewExt',
    2136: 'SxViewLink',
    2049: 'WebPub',
    224: 'XF (formatting)',
    2173: 'XFExt (formatting)',
    659: 'Style',
    2194: 'StyleExt',
}
273
#: names of record types found in binary parts of xlsb files; maps record
#: type number to name
FREQUENT_RECORDS_XLSB = {
    588: 'BrtEndSupBook',
    667: 'BrtSupAddin',
    355: 'BrtSupBookSrc',
    586: 'BrtSupNameBits',
    584: 'BrtSupNameBool',
    587: 'BrtSupNameEnd',
    581: 'BrtSupNameErr',
    585: 'BrtSupNameFmla',
    583: 'BrtSupNameNil',
    580: 'BrtSupNameNum',
    582: 'BrtSupNameSt',
    577: 'BrtSupNameStart',
    579: 'BrtSupNameValueEnd',
    578: 'BrtSupNameValueStart',
    358: 'BrtSupSame',
    357: 'BrtSupSelf',
    359: 'BrtSupTabs',
}
294
295
class XlsRecord(record_base.OleRecordBase):
    """ a record, the basic unit of data in the workbook stream """

    #: biggest size a record in an xls stream can have (xlsb has no such
    #: limit)
    MAX_SIZE = 8224

    def _type_str(self):
        """ describe the type of this record

        returns the name from FREQUENT_RECORDS if the type is listed there,
        otherwise a generic text containing the type number. Subclasses
        override this to create their own __str__ more easily.
        """
        if self.type in FREQUENT_RECORDS:
            return FREQUENT_RECORDS[self.type]
        return 'XlsRecord type {0}'.format(self.type)
308
309
class XlsRecordBof(XlsRecord):
    """ BOF record; found at the beginning of substreams """
    TYPE = 2057
    SIZE = 16
    #: names of the substream types, indexed by value of attribute doctype
    DOCTYPES = {0x5: 'workbook', 0x10: 'dialog/worksheet',
                0x20: 'chart', 0x40: 'macro'}

    def finish_constructing(self, _):
        """ set attribute doctype from the record data

        Only the doctype (little-endian unsigned 16bit integer in bytes 2
        and 3 of the data) is parsed, the rest of the data is ignored.
        doctype is None if the record has no data.
        """
        if self.data is not None:
            (self.doctype, ) = unpack('<H', self.data[2:4])
        else:
            self.doctype = None

    def _type_str(self):
        """ describe this record, including the name of the substream type """
        substream_name = self.DOCTYPES.get(self.doctype, 'unknown')
        return 'BOF Record ({0} substream)'.format(substream_name)
329
330
class XlsRecordEof(XlsRecord):
    """ EOF record; found at the end of substreams """
    #: record type number of EOF records
    TYPE = 10
    #: EOF records have no data
    SIZE = 0

    def _type_str(self):
        """ describe this record; the text is the same for all EOF records """
        return 'EOF Record'
338
339
class XlsRecordSupBook(XlsRecord):
    """ SupBook record, which specifies a supporting link

    "... The collection of records specifies the contents of an external
    workbook, DDE data source, or OLE data source." (MS-XLS, paragraph 2.4.271)

    Attributes set when constructing: ctab, cch, virt_path (all None if the
    record has no data) and support_link_type (one of the LINK_TYPE_*
    constants).
    """

    TYPE = 430

    LINK_TYPE_UNKNOWN = 'unknown'
    LINK_TYPE_SELF = 'self-referencing'
    LINK_TYPE_ADDIN = 'addin-referencing'
    LINK_TYPE_UNUSED = 'unused'
    LINK_TYPE_SAMESHEET = 'same-sheet'
    LINK_TYPE_OLE_DDE = 'ole/dde data source'
    LINK_TYPE_EXTERNAL = 'external workbook'

    def finish_constructing(self, _):
        """ parse the record data; called at end of constructor

        Reads ctab and cch (two little-endian unsigned 16bit integers) and,
        if cch is a string length (1 to 0xff), the string virt_path. The link
        type is then derived from these three values.

        raises ValueError if there is data but the record size is below 4
        """
        # defaults for records without data
        self.ctab = self.cch = self.virt_path = None
        self.support_link_type = self.LINK_TYPE_UNKNOWN
        if self.data is None:
            return

        if self.size < 4:
            raise ValueError('not enough data (size is {0} but need >= 4)'
                             .format(self.size))
        self.ctab, self.cch = unpack('<HH', self.data[:4])

        # cch is the length of virt_path only if it is in range 1 - 0xff;
        # the variable rgst that follows virt_path is ignored
        if 0 < self.cch <= 0xff:
            self.virt_path = read_unicode(self.data, 4, self.cch)[0]
        else:
            self.virt_path = u''

        self.support_link_type = self._determine_link_type()

    def _determine_link_type(self):
        """ derive the link type from attributes ctab, cch and virt_path """
        if self.cch == 0x401:    # ctab is undefined and to be ignored
            return self.LINK_TYPE_SELF
        if self.ctab == 0x1 and self.cch == 0x3A01:
            # next records must be ExternName with all add-in functions
            return self.LINK_TYPE_ADDIN
        if self.virt_path == u'\u0020':   # space ; ctab can be anything
            return self.LINK_TYPE_UNUSED
        if self.virt_path == u'\u0000':
            return self.LINK_TYPE_SAMESHEET
        if not self.virt_path:
            return self.LINK_TYPE_UNKNOWN
        if self.ctab == 0x0:
            return self.LINK_TYPE_OLE_DDE
        if self.ctab > 0:
            return self.LINK_TYPE_EXTERNAL
        return self.LINK_TYPE_UNKNOWN

    def _type_str(self):
        """ describe this record, including the type of link """
        return 'SupBook Record ({0})'.format(self.support_link_type)
395
396
class XlsbRecord(record_base.OleRecordBase):
    """ record from a binary part of an xlsb file

    Similar to an xls record, but there is no MAX_SIZE and the type numbers
    have different meanings.
    """

    #: no size limit for records in xlsb parts
    MAX_SIZE = None

    def _type_str(self):
        """ describe the type of this record

        returns the name from FREQUENT_RECORDS_XLSB if the type is listed
        there, otherwise a generic text containing the type number.
        Subclasses override this to create their own __str__ more easily.
        """
        if self.type in FREQUENT_RECORDS_XLSB:
            return FREQUENT_RECORDS_XLSB[self.type]
        return 'XlsbRecord type {0}'.format(self.type)
411
412
class XlsbBeginSupBook(XlsbRecord):
    """ Record beginning an external link in xlsb file

    contains information about the link itself (e.g. for DDE the link is
    string1 + ' ' + string2)

    Attributes set by finish_constructing:

    - sbt: link type number as found in the data; None if there is no data
    - link_type: one of the LINK_TYPE_* constants
    - string1, string2: the two strings of the record; '' if a string is
      missing or could not be read
    """

    TYPE = 360
    LINK_TYPE_WORKBOOK = 'workbook'
    LINK_TYPE_DDE = 'DDE'
    LINK_TYPE_OLE = 'OLE'
    LINK_TYPE_UNEXPECTED = 'unexpected'
    LINK_TYPE_UNKNOWN = 'unknown'

    def finish_constructing(self, _):
        """ parse link type and both strings from the record data

        The data starts with the link type (sbt, little-endian unsigned 16bit
        integer), followed by two strings. Each string is preceded by its
        length in characters as a little-endian unsigned 32bit integer.

        Problems with the strings are logged as warnings and leave the
        affected string empty.
        """
        # defaults, so all attributes exist even if there is no data
        self.sbt = None
        self.link_type = self.LINK_TYPE_UNKNOWN
        self.string1 = ''
        self.string2 = ''
        if self.data is None:
            return

        self.sbt = unpack('<H', self.data[0:2])[0]
        if self.sbt == 0:
            self.link_type = self.LINK_TYPE_WORKBOOK
        elif self.sbt == 1:
            self.link_type = self.LINK_TYPE_DDE
        elif self.sbt == 2:
            self.link_type = self.LINK_TYPE_OLE
        else:
            # report the complete 16bit value, not just the first byte
            logging.warning('Unexpected link type {0} encountered'
                            .format(self.sbt))
            self.link_type = self.LINK_TYPE_UNEXPECTED

        self.string1, start_idx = self._read_string(2)
        if start_idx is None:
            # Without a trustworthy length of string1 it is not known where
            # string2 starts, so do not try to parse it.
            return
        self.string2 = self._read_string(start_idx, ' for string2')[0]

    def _read_string(self, start_idx, log_suffix=''):
        """ read one length-prefixed string with 2 bytes per character

        :param int start_idx: index of the 4-byte length field in self.data
        :param str log_suffix: text appended to the "impossible length"
                               warning
        :returns: tuple (text, next_idx); next_idx is the index of the first
                  byte behind the string or None if that cannot be determined
        """
        chars_start = start_idx + 4
        if len(self.data) < chars_start:
            # guards the unpack below against truncated data
            logging.warning('Not enough data for string length at index {0}'
                            .format(start_idx) + log_suffix)
            return '', None

        n_chars = unpack('<I', self.data[start_idx:chars_start])[0]
        if n_chars == 0xFFFFFFFF:
            logging.warning('Max string length 0xFFFFFFFF is not allowed')
            # no characters can follow such a length field, so whatever
            # comes next starts right behind it
            return '', chars_start
        if self.size < n_chars*2 + chars_start:
            logging.warning('Impossible string length {0} for data length {1}'
                            .format(n_chars, self.size) + log_suffix)
            return '', None
        return read_unicode_2byte(self.data, chars_start, n_chars)

    def _type_str(self):
        """ describe this record, including link type and both strings """
        return 'XlsbBeginSupBook Record ({0}, "{1}", "{2}")' \
               .format(self.link_type, self.string1, self.string2)
469
470
471###############################################################################
472# XLSB Binary Parts
473###############################################################################
474
475
def parse_xlsb_part(file_stream, _, filename):
    """ iterate over the records in a binary part (bin file) of an xlsb file

    This is a generator: it wraps file_stream into an XlsbStream and yields
    that stream's records one by one. The XlsbStream is closed when the
    iteration ends, no matter whether it ends regularly or with an exception.

    :param file_stream: stream to read from; must have an attribute `size`
    :param _: ignored
    :param filename: passed on to the XlsbStream constructor
    """
    record_stream = None
    try:
        record_stream = XlsbStream(file_stream, file_stream.size, filename,
                                   record_base.STGTY_STREAM)
        for rec in record_stream.iter_records():
            yield rec
    finally:
        if record_stream is not None:
            record_stream.close()
489
490
491###############################################################################
492# TESTING
493###############################################################################
494
495
if __name__ == '__main__':
    # Hand the command line arguments and this module's file and stream
    # classes to record_base.test; its return value becomes the exit code.
    exit_code = record_base.test(sys.argv[1:], XlsFile, WorkbookStream)
    sys.exit(exit_code)
498