#!/usr/bin/env python

"""
record_base.py

Common stuff for ole files whose streams are a sequence of record structures.
This is the case for xls and ppt, so classes are bases for xls_parser.py and
ppt_record_parser.py .
"""

# === LICENSE ==================================================================

# record_base is copyright (c) 2014-2019 Philippe Lagadec (http://www.decalage.info)
# All rights reserved.
#
# Redistribution and use in source and binary forms, with or without
# modification, are permitted provided that the following conditions are met:
#
#  * Redistributions of source code must retain the above copyright notice,
#    this list of conditions and the following disclaimer.
#  * Redistributions in binary form must reproduce the above copyright notice,
#    this list of conditions and the following disclaimer in the documentation
#    and/or other materials provided with the distribution.
#
# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
# AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
# IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
# ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE
# LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
# CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
# SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
# INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
# CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
# ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
# POSSIBILITY OF SUCH DAMAGE.

from __future__ import print_function

# -----------------------------------------------------------------------------
# CHANGELOG:
# 2017-11-30 v0.01 CH: - first version based on xls_parser
# 2018-09-11 v0.54 PL: - olefile is now a dependency
# 2019-01-30       PL: - fixed import to avoid mixing installed oletools
#                        and dev version

__version__ = '0.54'

# -----------------------------------------------------------------------------
# TODO:
# - read DocumentSummaryInformation first to get more info about streams
#   (maybe content type or so; identify streams that are never record-based)
#   Or use oleid to avoid same functionality in several files
# - think about integrating this with olefile itself

# -----------------------------------------------------------------------------
# REFERENCES:
# - [MS-XLS]: Excel Binary File Format (.xls) Structure Specification
#   https://msdn.microsoft.com/en-us/library/office/cc313154(v=office.14).aspx
# - Understanding the Excel .xls Binary File Format
#   https://msdn.microsoft.com/en-us/library/office/gg615597(v=office.14).aspx
# - [MS-PPT]


import sys
import os.path
from io import SEEK_CUR
import logging

import olefile

# little hack to allow absolute imports even if oletools is not installed.
PARENT_DIR = os.path.normpath(os.path.dirname(os.path.dirname(
    os.path.abspath(__file__))))
if PARENT_DIR not in sys.path:
    sys.path.insert(0, PARENT_DIR)
del PARENT_DIR
from oletools import oleid


###############################################################################
# Helpers
###############################################################################

# Re-exported olefile names so that users of this module need not import
# olefile themselves.
OleFileIO = olefile.OleFileIO
STGTY_EMPTY = olefile.STGTY_EMPTY          # 0
STGTY_STORAGE = olefile.STGTY_STORAGE      # 1
STGTY_STREAM = olefile.STGTY_STREAM        # 2
STGTY_LOCKBYTES = olefile.STGTY_LOCKBYTES  # 3
STGTY_PROPERTY = olefile.STGTY_PROPERTY    # 4
STGTY_ROOT = olefile.STGTY_ROOT            # 5
STGTY_SUBSTREAM = 10    # defined locally, not taken from olefile

# Human-readable names for entry types, used in log and __str__ output.
ENTRY_TYPE2STR = {
    olefile.STGTY_EMPTY: 'empty',
    olefile.STGTY_STORAGE: 'storage',
    olefile.STGTY_STREAM: 'stream',
    olefile.STGTY_LOCKBYTES: 'lock-bytes',
    olefile.STGTY_PROPERTY: 'property',
    olefile.STGTY_ROOT: 'root',
    STGTY_SUBSTREAM: 'substream'
}


def enable_olefile_logging():
    """ enable logging olefile e.g., to get debug info from OleFileIO """
    olefile.enable_logging()


###############################################################################
# Base Classes
###############################################################################


# Streams with these names are wrapped in OleSummaryInformationStream instead
# of the class returned by OleRecordFile.stream_class_for_name.
SUMMARY_INFORMATION_STREAM_NAMES = ('\x05SummaryInformation',
                                    '\x05DocumentSummaryInformation')


class OleRecordFile(olefile.OleFileIO):
    """ an OLE compound file whose streams have (mostly) record structure

    'record structure' meaning that streams are a sequence of records. Records
    are structure with information about type and size in their first bytes
    and type-dependent data of given size after that.

    Subclass of OleFileIO!
    """

    def open(self, filename, *args, **kwargs):
        """Call OleFileIO.open, forwarding all arguments unchanged."""
        OleFileIO.open(self, filename, *args, **kwargs)

    @classmethod
    def stream_class_for_name(cls, stream_name):
        """ helper for iter_streams, must be overwritten in subclasses

        will not be called for SUMMARY_INFORMATION_STREAM_NAMES

        :param stream_name: name of the directory entry (None for orphans)
        :returns: class (not instance) used to wrap the stream
        """
        return OleRecordStream    # this is an abstract class!

    def iter_streams(self):
        """ find all streams, including orphans

        Yields one stream object per directory entry of type STGTY_STREAM.
        Each yielded object is only valid until the next one is requested:
        it is closed as soon as the generator advances, is closed, or is
        finalized (e.g. because the consuming loop was left early).
        """
        logging.debug('Finding streams in ole file')

        for sid, direntry in enumerate(self.direntries):
            is_orphan = direntry is None
            if is_orphan:
                # this direntry is not part of the tree --> unused or orphan
                direntry = self._load_direntry(sid)
            is_stream = direntry.entry_type == olefile.STGTY_STREAM

            if is_stream:
                description = 'is stream of size {0}'.format(direntry.size)
            else:
                # .get: entries of a malformed file may carry a type that is
                # not in ENTRY_TYPE2STR; a debug message must not raise then
                description = 'no stream ({0})'.format(
                    ENTRY_TYPE2STR.get(direntry.entry_type, 'unknown'))
            logging.debug('direntry %2d %s: %s', sid,
                          '[orphan]' if is_orphan else direntry.name,
                          description)
            if not is_stream:
                continue

            if not is_orphan and \
                    direntry.name in SUMMARY_INFORMATION_STREAM_NAMES:
                clz = OleSummaryInformationStream
            else:
                clz = self.stream_class_for_name(direntry.name)
            stream = clz(self._open(direntry.isectStart, direntry.size),
                         direntry.size,
                         None if is_orphan else direntry.name,
                         direntry.entry_type)
            # try/finally so the stream is also closed when the consumer
            # stops iterating early (exception, break, return)
            try:
                yield stream
            finally:
                stream.close()


class OleRecordStream(object):
    """ a stream found in an OleRecordFile

    Always has a name and a size (both read-only). Has an OleFileStream handle.

    abstract base class
    """

    def __init__(self, stream, size, name, stream_type):
        """ create a record stream wrapper

        :param stream: file-like object to read records from
        :param size: size of the stream in bytes
        :param name: name of the stream (None for orphans)
        :param stream_type: one of the keys of ENTRY_TYPE2STR
        :raises ValueError: if stream_type is not a key of ENTRY_TYPE2STR
        """
        self.stream = stream
        self.size = size
        self.name = name
        if stream_type not in ENTRY_TYPE2STR:
            raise ValueError('Unknown stream type: {0}'.format(stream_type))
        self.stream_type = stream_type

    def read_record_head(self):
        """ read first few bytes of record to determine size and type

        Abstract base method, to be implemented in subclasses.

        returns (rec_type, rec_size, other) where other will be forwarded to
        record constructors
        """
        raise NotImplementedError('Abstract method '
                                  'OleRecordStream.read_record_head called')

    @classmethod
    def record_class_for_type(cls, rec_type):
        """ determine a class for given record type

        Only a base implementation. Create subclasses of OleRecordBase and
        return those when appropriate.

        returns (clz, force_read)
        """
        return OleRecordBase, False

    def iter_records(self, fill_data=False):
        """ yield all records in this stream

        Stream must be positioned at start of records (e.g. start of stream).

        :param fill_data: if True, the data of every record is read and handed
                          to the record constructor. Otherwise data is only
                          read for record classes that request it through
                          force_read; all other records are skipped over and
                          get data=None.
        :raises IOError: if fewer bytes than announced in the record head
                         could be read
        """
        while True:
            # unpacking as in olevba._extract_vba
            pos = self.stream.tell()
            if pos >= self.size:
                break

            # read first few bytes, determine record type and size
            rec_type, rec_size, other = self.read_record_head()
            # logging.debug('Record type {0} of size {1}'
            #               .format(rec_type, rec_size))

            # determine what class to wrap this into
            rec_clz, force_read = self.record_class_for_type(rec_type)

            if fill_data or force_read:
                data = self.stream.read(rec_size)
                if len(data) != rec_size:
                    raise IOError('Unexpected end of stream ({0} < {1})'
                                  .format(len(data), rec_size))
            else:
                # data not wanted: jump to the head of the next record
                self.stream.seek(rec_size, SEEK_CUR)
                data = None
            rec_object = rec_clz(rec_type, rec_size, other, pos, data)

            # "We are microsoft, we do not always adhere to our specifications"
            rec_object.read_some_more(self.stream)
            yield rec_object

    def close(self):
        """ close the underlying stream handle """
        self.stream.close()

    def __str__(self):
        """ short textual representation: class, name, type and size """
        return '[{0} {1} (type {2}, size {3})]' \
               .format(self.__class__.__name__,
                       self.name or '[orphan]',
                       ENTRY_TYPE2STR[self.stream_type],
                       self.size)


class OleSummaryInformationStream(OleRecordStream):
    """ stream for \\x05SummaryInformation and \\x05DocumentSummaryInformation

    Do nothing so far. OleFileIO reads quite some info from this. For more info
    see [MS-OSHARED] 2.3.3 and [MS-OLEPS] 2.21 and references therein.

    See also: info read in oleid.py.
    """
    def iter_records(self, fill_data=False):
        """ yields nothing, stops at once """
        return
        yield   # required to make this a generator pylint: disable=unreachable


class OleRecordBase(object):
    """ a record found in an OleRecordStream

    always has a type and a size, also pos and data
    """

    # for subclasses with a fixed type
    TYPE = None

    # (max) size of subclasses
    MAX_SIZE = None
    SIZE = None

    def __init__(self, type, size, more_data, pos, data):
        """ create a record; more_data is discarded

        :param type: record type as returned by read_record_head
        :param size: record size as returned by read_record_head
        :param more_data: third value returned by read_record_head; only
                          forwarded to finish_constructing
        :param pos: stream position at which the record head starts
        :param data: record data or None if it was not read
        :raises ValueError: if type does not match a fixed TYPE of the class,
                            or size violates the class's SIZE or MAX_SIZE
        """
        if self.TYPE is not None and type != self.TYPE:
            raise ValueError('Wrong subclass {0} for type {1}'
                             .format(self.__class__.__name__, type))
        self.type = type
        if self.SIZE is not None and size != self.SIZE:
            raise ValueError('Wrong size {0} for record type {1}'
                             .format(size, type))
        elif self.MAX_SIZE is not None and size > self.MAX_SIZE:
            raise ValueError('Wrong size: {0} > MAX_SIZE for record type {1}'
                             .format(size, type))
        self.size = size
        self.pos = pos
        self.data = data
        self.finish_constructing(more_data)

    def finish_constructing(self, more_data):
        """ finish constructing this record

        Can save more_data from OleRecordStream.read_record_head and/or parse
        data (if it was read).

        Base implementation, does nothing. To be overwritten in subclasses.

        Implementations should take into account that self.data may be None.
        Should create the same attributes, whether data is present or not. Eg::

            def finish_constructing(self, more_data):
                self.more = more_data
                self.attr1 = None
                self.attr2 = None
                if self.data:
                    self.attr1, self.attr2 = struct.unpack('<HH', self.data)
        """
        pass

    def read_some_more(self, stream):
        """ Read some more data from stream after end of this record

        Found that for CurrentUserAtom in "Current User" stream of ppt files,
        the last attribute (user name in unicode) is found *behind* the record
        data. Thank you, Microsoft!

        Do this only if you are certain you will not mess up the following
        records!

        This base implementation does nothing. For optional overwriting in
        subclasses (like PptRecordUserAtom where no record should follow.)
        """
        return

    def _type_str(self):
        """ helper for __str__, base implementation """
        return '{0} type {1}'.format(self.__class__.__name__, self.type)

    def __str__(self):
        """ create a short but informative textual representation of self """
        return '[' + self._type_str() + \
               ' (size {0} from {1})]'.format(self.size, self.pos)


###############################################################################
# TESTING
###############################################################################


def test(filenames, ole_file_class=OleRecordFile,
         must_parse=None, do_per_record=None, verbose=False):
    """ parse all given file names and print rough structure

    If an error occurs while parsing a stream whose class is in must_parse,
    the error is raised. Errors in other streams are only logged. If
    must_parse is empty or None, every error is raised.

    :param filenames: names of the files to parse
    :param ole_file_class: class used to open each file
    :param must_parse: class or tuple of classes as accepted by isinstance
    :param do_per_record: callable invoked with every record found
    :param verbose: if True, log at DEBUG instead of INFO level
    :returns: 2 if no file names were given, 0 otherwise
    """
    logging.basicConfig(level=logging.DEBUG if verbose else logging.INFO)
    if do_per_record is None:
        def do_per_record(record):         # pylint: disable=function-redefined
            pass   # do nothing
    if not filenames:
        logging.info('need file name[s]')
        return 2
    for filename in filenames:
        logging.info('checking file {0}'.format(filename))
        if not olefile.isOleFile(filename):
            logging.info('not an ole file - skip')
            continue
        ole = ole_file_class(filename)

        # make sure the file is closed again, also if parsing raises
        try:
            for stream in ole.iter_streams():
                logging.info(' parse ' + str(stream))
                try:
                    for record in stream.iter_records():
                        logging.info(' ' + str(record))
                        do_per_record(record)
                except Exception:
                    if not must_parse or isinstance(stream, must_parse):
                        raise
                    logging.info(' failed to parse', exc_info=True)
        finally:
            ole.close()
    return 0


if __name__ == '__main__':
    sys.exit(test(sys.argv[1:]))