1#!/usr/bin/env python
2
3"""
4record_base.py
5
6Common stuff for ole files whose streams are a sequence of record structures.
7This is the case for xls and ppt, so classes are bases for xls_parser.py and
8ppt_record_parser.py .
9"""
10
11# === LICENSE ==================================================================
12
13# record_base is copyright (c) 2014-2019 Philippe Lagadec (http://www.decalage.info)
14# All rights reserved.
15#
16# Redistribution and use in source and binary forms, with or without
17# modification, are permitted provided that the following conditions are met:
18#
19#  * Redistributions of source code must retain the above copyright notice,
20#    this list of conditions and the following disclaimer.
21#  * Redistributions in binary form must reproduce the above copyright notice,
22#    this list of conditions and the following disclaimer in the documentation
23#    and/or other materials provided with the distribution.
24#
25# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
26# AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
27# IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
28# ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE
29# LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
30# CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
31# SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
32# INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
33# CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
34# ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
35# POSSIBILITY OF SUCH DAMAGE.
36
37from __future__ import print_function
38
39# -----------------------------------------------------------------------------
40# CHANGELOG:
41# 2017-11-30 v0.01 CH: - first version based on xls_parser
42# 2018-09-11 v0.54 PL: - olefile is now a dependency
43# 2019-01-30       PL: - fixed import to avoid mixing installed oletools
44#                        and dev version
45
46__version__ = '0.54'
47
48# -----------------------------------------------------------------------------
49# TODO:
50# - read DocumentSummaryInformation first to get more info about streams
51#   (maybe content type or so; identify streams that are never record-based)
52#   Or use oleid to avoid same functionality in several files
53# - think about integrating this with olefile itself
54
55# -----------------------------------------------------------------------------
56#  REFERENCES:
57#  - [MS-XLS]: Excel Binary File Format (.xls) Structure Specification
58#    https://msdn.microsoft.com/en-us/library/office/cc313154(v=office.14).aspx
59#  - Understanding the Excel .xls Binary File Format
60#    https://msdn.microsoft.com/en-us/library/office/gg615597(v=office.14).aspx
61#  - [MS-PPT]
62
63
64import sys
65import os.path
66from io import SEEK_CUR
67import logging
68
69import olefile
70
71# little hack to allow absolute imports even if oletools is not installed.
72PARENT_DIR = os.path.normpath(os.path.dirname(os.path.dirname(
73    os.path.abspath(__file__))))
74if PARENT_DIR not in sys.path:
75    sys.path.insert(0, PARENT_DIR)
76del PARENT_DIR
77from oletools import oleid
78
79
80###############################################################################
81# Helpers
82###############################################################################
83
# module-level alias for the olefile class that OleRecordFile is built on
OleFileIO = olefile.OleFileIO

# directory-entry type codes, taken over from olefile
STGTY_EMPTY = olefile.STGTY_EMPTY            # 0
STGTY_STORAGE = olefile.STGTY_STORAGE        # 1
STGTY_STREAM = olefile.STGTY_STREAM          # 2
STGTY_LOCKBYTES = olefile.STGTY_LOCKBYTES    # 3
STGTY_PROPERTY = olefile.STGTY_PROPERTY      # 4
STGTY_ROOT = olefile.STGTY_ROOT              # 5

# additional type code assigned here (a literal, not read from olefile)
STGTY_SUBSTREAM = 10

# short textual name for each of the type codes above
ENTRY_TYPE2STR = dict([
    (STGTY_EMPTY, 'empty'),
    (STGTY_STORAGE, 'storage'),
    (STGTY_STREAM, 'stream'),
    (STGTY_LOCKBYTES, 'lock-bytes'),
    (STGTY_PROPERTY, 'property'),
    (STGTY_ROOT, 'root'),
    (STGTY_SUBSTREAM, 'substream'),
])
102
103
def enable_olefile_logging():
    """ turn on logging of the olefile module

    Simply forwards to olefile.enable_logging(); useful e.g. to get debug
    output from OleFileIO. Returns nothing.
    """
    switch_on = olefile.enable_logging
    switch_on()
107
108
109###############################################################################
110# Base Classes
111###############################################################################
112
113
# Stream names that OleRecordFile.iter_streams wraps into an
# OleSummaryInformationStream; both start with the control character \x05
SUMMARY_INFORMATION_STREAM_NAMES = tuple(
    '\x05' + base_name
    for base_name in ('SummaryInformation', 'DocumentSummaryInformation'))
116
117
class OleRecordFile(olefile.OleFileIO):
    """ an OLE compound file whose streams have (mostly) record structure

    'record structure' meaning that streams are a sequence of records. Records
    are structures with information about type and size in their first bytes
    and type-dependent data of given size after that.

    Subclass of OleFileIO!
    """

    def open(self, filename, *args, **kwargs):
        """ open an ole file; all arguments are forwarded to OleFileIO.open

        Returns nothing.
        """
        # explicit base-class call, as in the original code (not super())
        OleFileIO.open(self, filename, *args, **kwargs)

    @classmethod
    def stream_class_for_name(cls, stream_name):
        """ helper for iter_streams, must be overwritten in subclasses

        will not be called for SUMMARY_INFORMATION_STREAM_NAMES (unless the
        stream is an orphan, see iter_streams).

        :param stream_name: name of the directory entry of the stream
        :returns: the class to wrap the stream in; this base implementation
                  always returns the abstract OleRecordStream
        """
        return OleRecordStream    # this is an abstract class!

    def iter_streams(self):
        """ find all streams, including orphans

        Generator over all directory entries of type STGTY_STREAM. Entries
        that are not part of the directory tree (None in self.direntries) are
        loaded explicitly and yielded with name None.

        A yielded stream object is only valid until the next one is requested:
        it is closed when the generator is resumed, and also when the
        generator is closed or abandoned early (e.g. the caller leaves its
        loop through `break` or an exception).
        """
        logging.debug('Finding streams in ole file')

        for sid, direntry in enumerate(self.direntries):
            is_orphan = direntry is None
            if is_orphan:
                # this direntry is not part of the tree --> unused or orphan
                direntry = self._load_direntry(sid)
            is_stream = direntry.entry_type == olefile.STGTY_STREAM

            if is_stream:
                description = 'is stream of size {}'.format(direntry.size)
            else:
                # use .get: an entry with an unexpected type code must not
                # abort the whole iteration with a KeyError
                type_name = ENTRY_TYPE2STR.get(
                    direntry.entry_type,
                    'unknown type {}'.format(direntry.entry_type))
                description = 'no stream ({})'.format(type_name)
            logging.debug('direntry {:2d} {}: {}'.format(
                sid, '[orphan]' if is_orphan else direntry.name, description))

            if not is_stream:
                continue

            if not is_orphan and \
                    direntry.name in SUMMARY_INFORMATION_STREAM_NAMES:
                clz = OleSummaryInformationStream
            else:
                clz = self.stream_class_for_name(direntry.name)
            stream = clz(self._open(direntry.isectStart, direntry.size),
                         direntry.size,
                         None if is_orphan else direntry.name,
                         direntry.entry_type)
            try:
                yield stream
            finally:
                # also runs if the consumer stops iterating early
                stream.close()
167
168
class OleRecordStream(object):
    """ a stream found in an OleRecordFile

    Holds the open stream handle together with size, name and entry type of
    the stream. The name may be None (OleRecordFile.iter_streams does that for
    orphans); str() shows '[orphan]' in that case.

    abstract base class, subclasses have to implement read_record_head
    """

    def __init__(self, stream, size, name, stream_type):
        """ create a record stream

        :param stream: open file-like handle; iter_records calls its tell(),
                       read() and seek() methods, close() its close()
        :param size: size of stream, iter_records stops once tell() reaches it
        :param name: name of the stream or None
        :param stream_type: one of the keys of ENTRY_TYPE2STR
        :raises ValueError: if stream_type is not a key of ENTRY_TYPE2STR
        """
        self.stream = stream
        self.size = size
        self.name = name
        if stream_type not in ENTRY_TYPE2STR:
            raise ValueError('Unknown stream type: {0}'.format(stream_type))
        self.stream_type = stream_type

    def read_record_head(self):
        """ read first few bytes of record to determine size and type

        Abstract base method, to be implemented in subclasses.

        returns (rec_type, rec_size, other) where other will be forwarded to
        record constructors
        """
        raise NotImplementedError('Abstract method '
                                  'OleRecordStream.read_record_head called')

    @classmethod
    def record_class_for_type(cls, rec_type):
        """ determine a class for given record type

        Only a base implementation. Create subclasses of OleRecordBase and
        return those when appropriate.

        returns (clz, force_read); if force_read is True, iter_records reads
        the record data even if it was called with fill_data=False
        """
        return OleRecordBase, False

    def iter_records(self, fill_data=False):
        """ yield all records in this stream

        Stream must be positioned at start of records (e.g. start of stream).

        :param fill_data: if True, the data of every record is read and given
                          to the record constructor. Otherwise data is only
                          read for record types whose class requests it (see
                          record_class_for_type); for all others the data is
                          skipped and the record gets data=None
        :raises IOError: if fewer bytes than announced in the record head
                         could be read
        """
        while True:
            # unpacking as in olevba._extract_vba
            pos = self.stream.tell()
            if pos >= self.size:
                break

            # read first few bytes, determine record type and size
            rec_type, rec_size, other = self.read_record_head()

            # determine what class to wrap this into
            rec_clz, force_read = self.record_class_for_type(rec_type)

            if fill_data or force_read:
                data = self.stream.read(rec_size)
                if len(data) != rec_size:
                    raise IOError('Unexpected end of stream ({0} < {1})'
                                  .format(len(data), rec_size))
            else:
                # not interested in the data, just jump to the next record
                self.stream.seek(rec_size, SEEK_CUR)
                data = None
            # pos is the position of the record head, not of the data
            rec_object = rec_clz(rec_type, rec_size, other, pos, data)

            # "We are microsoft, we do not always adhere to our specifications"
            rec_object.read_some_more(self.stream)
            yield rec_object

    def close(self):
        """ close the underlying stream handle """
        self.stream.close()

    def __str__(self):
        """ short textual representation: class, name, entry type and size """
        # closing bracket added; it was missing from the original format
        return '[{0} {1} (type {2}, size {3})]' \
               .format(self.__class__.__name__,
                       self.name or '[orphan]',
                       ENTRY_TYPE2STR[self.stream_type],
                       self.size)
249
250
class OleSummaryInformationStream(OleRecordStream):
    """ stream class used for the two summary information streams

    These are the streams listed in SUMMARY_INFORMATION_STREAM_NAMES. They
    are not parsed here at all. OleFileIO reads quite some info from them.
    For more info see [MS-OSHARED] 2.3.3 and [MS-OLEPS] 2.21 and references
    therein.

    See also: info read in oleid.py.
    """
    def iter_records(self, fill_data=False):
        """ generator without any items; fill_data is ignored """
        # looping over an empty tuple: the yield is never reached, it only
        # makes this function a generator
        for nothing in ():
            yield nothing
263
264
class OleRecordBase(object):
    """ base class for records found in an OleRecordStream

    Every record has attributes type, size, pos and data; data may be None.
    Subclasses can restrict type and size through the class attributes below.
    """

    # subclasses for exactly one record type set this to that type
    TYPE = None

    # subclasses may limit the record size (MAX_SIZE) or fix it (SIZE)
    MAX_SIZE = None
    SIZE = None

    def __init__(self, type, size, more_data, pos, data):
        """ create a record and validate type and size

        more_data is only handed to finish_constructing, which ignores it in
        this base class.

        raises ValueError if type or size contradict TYPE, SIZE or MAX_SIZE
        """
        required_type = self.TYPE
        if required_type is not None and type != required_type:
            message = 'Wrong subclass {0} for type {1}'
            raise ValueError(message.format(self.__class__.__name__, type))
        self.type = type

        exact_size = self.SIZE
        if exact_size is not None and size != exact_size:
            message = 'Wrong size {0} for record type {1}'
            raise ValueError(message.format(size, type))
        upper_limit = self.MAX_SIZE
        if upper_limit is not None and size > upper_limit:
            message = 'Wrong size: {0} > MAX_SIZE for record type {1}'
            raise ValueError(message.format(size, type))

        self.size, self.pos, self.data = size, pos, data
        self.finish_constructing(more_data)

    def finish_constructing(self, more_data):
        """ hook called at the end of the constructor

        Subclasses overwrite this to keep more_data (the third value returned
        by OleRecordStream.read_record_head) and/or to parse self.data.

        The base implementation does nothing.

        Keep in mind that self.data is None if the record data was not read.
        An implementation should define the same attributes in either case,
        for example::

            def finish_constructing(self, more_data):
                self.more = more_data
                self.attr1 = None
                self.attr2 = None
                if self.data:
                    self.attr1, self.attr2 = struct.unpack('<HH', self.data)
        """
        return None

    def read_some_more(self, stream):
        """ hook to read data located in stream behind this record

        Background: for CurrentUserAtom in the "Current User" stream of ppt
        files, the last attribute (user name in unicode) is located *behind*
        the record data. Thank you, Microsoft!

        Only read here if you are sure that this does not mess up the records
        that follow!

        The base implementation does nothing. Subclasses may overwrite it
        (like PptRecordUserAtom where no record should follow.)
        """
        return None

    def _type_str(self):
        """ first part of the text created by __str__, base implementation """
        return '{name} type {rec_type}'.format(name=self.__class__.__name__,
                                               rec_type=self.type)

    def __str__(self):
        """ short text with class name, type, size and position of record """
        size_and_pos = ' (size {0} from {1})]'.format(self.size, self.pos)
        return ''.join(['[', self._type_str(), size_and_pos])
338
339
340###############################################################################
341# TESTING
342###############################################################################
343
344
def test(filenames, ole_file_class=OleRecordFile,
         must_parse=None, do_per_record=None, verbose=False):
    """ parse all given file names and print rough structure

    :param filenames: names of files to check; files that are not ole files
                      are skipped
    :param ole_file_class: class used to open each file, is called with the
                           file name and must provide iter_streams and close
    :param must_parse: None or a class or tuple of classes (is used as second
                       argument to isinstance). If this is empty/None, every
                       error during parsing of a stream is re-raised.
                       Otherwise only errors in streams that are instances of
                       must_parse are re-raised; for other streams the error
                       is logged and the next stream is parsed.
    :param do_per_record: None or a function that is called with each record
    :param verbose: set log level to DEBUG instead of INFO
    :returns: 2 if no file name was given, 0 otherwise
    """
    logging.basicConfig(level=logging.DEBUG if verbose else logging.INFO)
    if do_per_record is None:
        def do_per_record(record):         # pylint: disable=function-redefined
            pass   # do nothing
    if not filenames:
        logging.info('need file name[s]')
        return 2
    for filename in filenames:
        logging.info('checking file {0}'.format(filename))
        if not olefile.isOleFile(filename):
            logging.info('not an ole file - skip')
            continue
        ole = ole_file_class(filename)

        try:
            for stream in ole.iter_streams():
                logging.info('  parse ' + str(stream))
                try:
                    for record in stream.iter_records():
                        logging.info('    ' + str(record))
                        do_per_record(record)
                except Exception:
                    if not must_parse or isinstance(stream, must_parse):
                        raise
                    logging.info('  failed to parse', exc_info=True)
        finally:
            # release the file handle, also if an error is re-raised above
            ole.close()
    return 0
380
381
if __name__ == '__main__':
    # run the rough-structure test on all files given on the command line
    # and use its return value as exit status
    exit_status = test(sys.argv[1:])
    sys.exit(exit_status)
384