1""" Parse xls up to some point 2 3Read storages, (sub-)streams, records from xls file 4""" 5# 6# === LICENSE ================================================================== 7 8# xls_parser is copyright (c) 2014-2019 Philippe Lagadec (http://www.decalage.info) 9# All rights reserved. 10# 11# Redistribution and use in source and binary forms, with or without modification, 12# are permitted provided that the following conditions are met: 13# 14# * Redistributions of source code must retain the above copyright notice, this 15# list of conditions and the following disclaimer. 16# * Redistributions in binary form must reproduce the above copyright notice, 17# this list of conditions and the following disclaimer in the documentation 18# and/or other materials provided with the distribution. 19# 20# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND 21# ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED 22# WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE 23# DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE 24# FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL 25# DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR 26# SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER 27# CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, 28# OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE 29# OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
#------------------------------------------------------------------------------
# CHANGELOG:
# 2017-11-02 v0.1  CH: - first version
# 2017-11-02 v0.2  CH: - move some code to record_base.py
#                        (to avoid copy-and-paste in ppt_parser.py)
# 2019-01-30 v0.54 PL: - fixed import to avoid mixing installed oletools
#                        and dev version

__version__ = '0.54'

# -----------------------------------------------------------------------------
# TODO:
# - parse more record types (ExternName, ...)
# - check what bad stuff can be in other storages: Embedded ("MBD..."), Linked
#   ("LNK..."), "MsoDataStore" and OleStream ('\001Ole')
#
# -----------------------------------------------------------------------------
# REFERENCES:
# - [MS-XLS]: Excel Binary File Format (.xls) Structure Specification
#   https://msdn.microsoft.com/en-us/library/office/cc313154(v=office.14).aspx
# - Understanding the Excel .xls Binary File Format
#   https://msdn.microsoft.com/en-us/library/office/gg615597(v=office.14).aspx
#
# -- IMPORTS ------------------------------------------------------------------

import sys
import os.path
from struct import unpack
import logging

# little hack to allow absolute imports even if oletools is not installed.
# Copied from olevba.py
PARENT_DIR = os.path.normpath(os.path.dirname(os.path.dirname(
    os.path.abspath(__file__))))
if PARENT_DIR not in sys.path:
    sys.path.insert(0, PARENT_DIR)
del PARENT_DIR
from oletools import record_base


# === PYTHON 2+3 SUPPORT ======================================================

if sys.version_info[0] >= 3:
    unichr = chr

###############################################################################
# Helpers
###############################################################################


def is_xls(filename):
    """
    determine whether a given file is an excel ole file

    returns True if given file is an ole file and contains a Workbook stream

    Any exception raised while opening or iterating the file is logged at
    debug level and treated as "not an xls file"; the file is always closed.

    todo: could further check that workbook stream starts with a globals
          substream.
    See also: oleid.OleID.check_excel
    """
    xls_file = None
    try:
        xls_file = XlsFile(filename)
        for stream in xls_file.iter_streams():
            if isinstance(stream, WorkbookStream):
                return True
    except Exception:
        # deliberate best-effort: anything that cannot be parsed is "not xls"
        logging.debug('Ignoring exception in is_xls, assume is not xls',
                      exc_info=True)
    finally:
        if xls_file is not None:
            xls_file.close()
    return False


def read_unicode(data, start_idx, n_chars):
    """ read a unicode string from a XLUnicodeStringNoCch structure

    :param data: bytes holding the structure
    :param start_idx: index in data of the flags byte preceding the characters
    :param n_chars: number of characters (not bytes) to read
    :returns: tuple (text, end_idx), end_idx being the index of the first
              byte after the string
    """
    # Only the lowest bit of the flags byte (fHighByte) is meaningful:
    #   0 --> only low-bytes are saved, all high bytes are 0
    #   1 --> 2 bytes per character
    # The remaining bits are reserved, so do not compare the whole byte.
    flags = ord(data[start_idx:start_idx+1])
    if flags & 0x01:
        return read_unicode_2byte(data, start_idx+1, n_chars)

    end_idx = start_idx + 1 + n_chars
    # A stored byte b stands for the character with code point b (its high
    # byte is 0). That is exactly latin-1; 'ascii' would raise on b >= 0x80.
    return data[start_idx+1:end_idx].decode('latin-1'), end_idx


def read_unicode_2byte(data, start_idx, n_chars):
    """ read a unicode string with characters encoded by 2 bytes

    :param data: bytes holding the string
    :param start_idx: index in data of the first character
    :param n_chars: number of 2-byte little-endian characters to read
    :returns: tuple (text, end_idx), end_idx being the index of the first
              byte after the string
    """
    end_idx = start_idx + n_chars * 2
    if n_chars < 256:  # faster version, long format string for unpack
        unichars = (unichr(val) for val in
                    unpack('<' + 'H'*n_chars, data[start_idx:end_idx]))
    else:              # slower version but less memory-extensive
        unichars = (unichr(unpack('<H', data[data_idx:data_idx+2])[0])
                    for data_idx in range(start_idx, end_idx, 2))
    return u''.join(unichars), end_idx


###############################################################################
# File, Storage, Stream
###############################################################################

class XlsFile(record_base.OleRecordFile):
    """ An xls file has most streams made up of records """

    @classmethod
    def stream_class_for_name(cls, stream_name):
        """ helper for iter_streams

        returns WorkbookStream for the stream named 'Workbook', XlsStream for
        every other stream name
        """
        if stream_name == 'Workbook':
            return WorkbookStream
        return XlsStream


class XlsStream(record_base.OleRecordStream):
    """ most streams in xls file consist of records """

    def read_record_head(self):
        """ read first few bytes of record to determine size and type

        Reads 4 bytes: two little-endian unsigned 16bit values.

        returns (type, size, other) where other is None
        """
        rec_type, rec_size = unpack('<HH', self.stream.read(4))
        return rec_type, rec_size, None

    @classmethod
    def record_class_for_type(cls, rec_type):
        """ determine a class for given record type

        This generic stream does not distinguish record types.

        returns (clz, force_read)
        """
        return XlsRecord, False


class WorkbookStream(XlsStream):
    """ Stream in excel file that holds most info """

    @classmethod
    def record_class_for_type(cls, rec_type):
        """ determine a class for given record type

        BOF and SupBook records request force_read=True since their
        finish_constructing parses the record data.

        returns (clz, force_read)
        """
        if rec_type == XlsRecordBof.TYPE:
            return XlsRecordBof, True
        if rec_type == XlsRecordEof.TYPE:
            return XlsRecordEof, False
        if rec_type == XlsRecordSupBook.TYPE:
            return XlsRecordSupBook, True
        return XlsRecord, False


class XlsbStream(record_base.OleRecordStream):
    """ binary stream of an xlsb file, usually have a record structure """

    HIGH_BIT_MASK = 0b10000000
    LOW7_BIT_MASK = 0b01111111

    def read_record_head(self):
        """ read first few bytes of record to determine size and type

        Type takes 1 or 2 bytes, size 1 to 4 bytes. In both, each byte
        contributes its 7 low bits and its high bit tells whether another
        byte follows.

        returns (type, size, other) where other is None
        """
        val = ord(self.stream.read(1))
        if val & self.HIGH_BIT_MASK:    # high bit of the low byte is 1
            val2 = ord(self.stream.read(1))         # need another byte
            # combine 7 low bits of each byte
            rec_type = (val & self.LOW7_BIT_MASK) + \
                       ((val2 & self.LOW7_BIT_MASK) << 7)
        else:
            rec_type = val

        rec_size = 0
        shift = 0
        for _ in range(4):      # rec_size needs up to 4 byte
            val = ord(self.stream.read(1))
            rec_size += (val & self.LOW7_BIT_MASK) << shift
            shift += 7
            if (val & self.HIGH_BIT_MASK) == 0:   # high-bit is 0 --> done
                break
        return rec_type, rec_size, None

    @classmethod
    def record_class_for_type(cls, rec_type):
        """ determine a class for given record type

        returns (clz, force_read)
        """
        if rec_type == XlsbBeginSupBook.TYPE:
            return XlsbBeginSupBook, True
        return XlsbRecord, False


###############################################################################
# RECORDS
###############################################################################

# records that appear often but do not need their own XlsRecord subclass (yet)
FREQUENT_RECORDS = {
    156: 'BuiltInFnGroupCount',
    2147: 'BookExt',
    442: 'CodeName',
    66: 'CodePage',
    4195: 'Dat',
    2154: 'DataLabExt',
    2155: 'DataLabExtContents',
    215: 'DBCell',
    220: 'DbOrParmQry',
    2051: 'DBQueryExt',
    2166: 'DConn',
    35: 'ExternName',
    23: 'ExternSheet',
    255: 'ExtSST',
    2052: 'ExtString',
    2151: 'FeatHdr',
    91: 'FileSharing',
    1054: 'Format',
    49: 'Font',
    2199: 'GUIDTypeLib',
    440: 'HLink',
    225: 'InterfaceHdr',
    226: 'InterfaceEnd',
    523: 'Index',
    24: 'Lbl',
    193: 'Mms',
    93: 'Obj',
    4135: 'ObjectLink',
    2058: 'OleDbConn',
    222: 'OleObjectSize',
    2214: 'RichTextStream',
    2146: 'SheetExt',
    1212: 'ShrFmla',
    2060: 'SxViewExt',
    2136: 'SxViewLink',
    2049: 'WebPub',
    224: 'XF (formatting)',
    2173: 'XFExt (formatting)',
    659: 'Style',
    2194: 'StyleExt',
}

#: records found in xlsb binary parts
FREQUENT_RECORDS_XLSB = {
    588: 'BrtEndSupBook',
    667: 'BrtSupAddin',
    355: 'BrtSupBookSrc',
    586: 'BrtSupNameBits',
    584: 'BrtSupNameBool',
    587: 'BrtSupNameEnd',
    581: 'BrtSupNameErr',
    585: 'BrtSupNameFmla',
    583: 'BrtSupNameNil',
    580: 'BrtSupNameNum',
    582: 'BrtSupNameSt',
    577: 'BrtSupNameStart',
    579: 'BrtSupNameValueEnd',
    578: 'BrtSupNameValueStart',
    358: 'BrtSupSame',
    357: 'BrtSupSelf',
    359: 'BrtSupTabs',
}


class XlsRecord(record_base.OleRecordBase):
    """ basic building block of data in workbook stream """

    #: max size of a record in xls stream (does not apply to xlsb)
    MAX_SIZE = 8224

    def _type_str(self):
        """ simplification for subclasses to create their own __str__

        returns the name from FREQUENT_RECORDS if the type is listed there,
        a generic text containing the numeric type otherwise
        """
        try:
            return FREQUENT_RECORDS[self.type]
        except KeyError:
            return 'XlsRecord type {0}'.format(self.type)


class XlsRecordBof(XlsRecord):
    """ record found at beginning of substreams """
    TYPE = 2057
    SIZE = 16
    # types of substreams
    DOCTYPES = {
        0x5: 'workbook',
        0x10: 'dialog/worksheet',
        0x20: 'chart',
        0x40: 'macro',
    }

    def finish_constructing(self, _):
        """ set self.doctype from record data (None if data was not read) """
        if self.data is None:
            self.doctype = None
            return
        # parse data (only doctype, ignore rest)
        self.doctype = unpack('<H', self.data[2:4])[0]

    def _type_str(self):
        """ describe record including the kind of substream it starts """
        return 'BOF Record ({0} substream)'.format(
            self.DOCTYPES.get(self.doctype, 'unknown'))


class XlsRecordEof(XlsRecord):
    """ record found at end of substreams """
    TYPE = 10
    SIZE = 0

    def _type_str(self):
        """ constant description, EOF records carry no further info """
        return 'EOF Record'


class XlsRecordSupBook(XlsRecord):
    """ The SupBook record specifies a supporting link

    "... The collection of records specifies the contents of an external
    workbook, DDE data source, or OLE data source." (MS-XLS, paragraph 2.4.271)
    """

    TYPE = 430

    LINK_TYPE_UNKNOWN = 'unknown'
    LINK_TYPE_SELF = 'self-referencing'
    LINK_TYPE_ADDIN = 'addin-referencing'
    LINK_TYPE_UNUSED = 'unused'
    LINK_TYPE_SAMESHEET = 'same-sheet'
    LINK_TYPE_OLE_DDE = 'ole/dde data source'
    LINK_TYPE_EXTERNAL = 'external workbook'

    def finish_constructing(self, _):
        """Finish constructing this record; called at end of constructor.

        Sets attributes ctab, cch, virt_path and support_link_type. Without
        record data they keep their defaults (None / LINK_TYPE_UNKNOWN).

        :raises ValueError: if the record holds fewer than 4 bytes
        """
        # set defaults
        self.ctab = None
        self.cch = None
        self.virt_path = None
        self.support_link_type = self.LINK_TYPE_UNKNOWN
        if self.data is None:
            return

        # parse data
        if self.size < 4:
            raise ValueError('not enough data (size is {0} but need >= 4)'
                             .format(self.size))
        self.ctab, self.cch = unpack('<HH', self.data[:4])
        if 0 < self.cch <= 0xff:
            # this is the length of virt_path
            self.virt_path, _ = read_unicode(self.data, 4, self.cch)
        else:
            # cch is a marker value (or 0), there is no path to read
            self.virt_path = u''
        # ignore variable rgst

        if self.cch == 0x401:    # ctab is undefined and to be ignored
            self.support_link_type = self.LINK_TYPE_SELF
        elif self.ctab == 0x1 and self.cch == 0x3A01:
            self.support_link_type = self.LINK_TYPE_ADDIN
            # next records must be ExternName with all add-in functions
        elif self.virt_path == u'\u0020':   # space ; ctab can be anything
            self.support_link_type = self.LINK_TYPE_UNUSED
        elif self.virt_path == u'\u0000':
            self.support_link_type = self.LINK_TYPE_SAMESHEET
        elif self.ctab == 0x0 and self.virt_path:
            self.support_link_type = self.LINK_TYPE_OLE_DDE
        elif self.ctab > 0 and self.virt_path:
            self.support_link_type = self.LINK_TYPE_EXTERNAL

    def _type_str(self):
        """ describe record including the determined link type """
        return 'SupBook Record ({0})'.format(self.support_link_type)


class XlsbRecord(record_base.OleRecordBase):
    """ like an xls record, but from binary part of xlsb file

    has no MAX_SIZE and types have different meanings
    """

    MAX_SIZE = None

    def _type_str(self):
        """ simplification for subclasses to create their own __str__

        returns the name from FREQUENT_RECORDS_XLSB if the type is listed
        there, a generic text containing the numeric type otherwise
        """
        try:
            return FREQUENT_RECORDS_XLSB[self.type]
        except KeyError:
            return 'XlsbRecord type {0}'.format(self.type)


class XlsbBeginSupBook(XlsbRecord):
    """ Record beginning an external link in xlsb file

    contains information about the link itself (e.g. for DDE the link is
    string1 + ' ' + string2)
    """

    TYPE = 360
    LINK_TYPE_WORKBOOK = 'workbook'
    LINK_TYPE_DDE = 'DDE'
    LINK_TYPE_OLE = 'OLE'
    LINK_TYPE_UNEXPECTED = 'unexpected'
    LINK_TYPE_UNKNOWN = 'unknown'

    def finish_constructing(self, _):
        """ parse link type and the two strings from record data

        Sets attributes sbt (raw link type value, None without data),
        link_type, string1 and string2. A string whose length field is
        invalid is left empty and a warning is logged.
        """
        self.sbt = None
        self.link_type = self.LINK_TYPE_UNKNOWN
        self.string1 = ''
        self.string2 = ''
        if self.data is None:
            return

        self.sbt = unpack('<H', self.data[0:2])[0]
        if self.sbt == 0:
            self.link_type = self.LINK_TYPE_WORKBOOK
        elif self.sbt == 1:
            self.link_type = self.LINK_TYPE_DDE
        elif self.sbt == 2:
            self.link_type = self.LINK_TYPE_OLE
        else:
            # report the complete 16bit value, not just the first data byte
            logging.warning('Unexpected link type {0} encountered'
                            .format(self.sbt))
            self.link_type = self.LINK_TYPE_UNEXPECTED

        parsed = self._read_wide_string(2)
        if parsed is None:
            # start of string2 is unknown if string1 could not be read
            return
        self.string1, next_idx = parsed

        parsed = self._read_wide_string(next_idx, ' for string2')
        if parsed is not None:
            self.string2 = parsed[0]

    def _read_wide_string(self, start_idx, warn_suffix=''):
        """ read a string preceded by its 32bit character count

        :param start_idx: index in self.data of the 4-byte length field
        :param warn_suffix: text appended to the "impossible length" warning
        :returns: tuple (text, end_idx) or None if the length is invalid
        """
        n_chars = unpack('<I', self.data[start_idx:start_idx+4])[0]
        if n_chars == 0xFFFFFFFF:
            logging.warning('Max string length 0xFFFFFFFF is not allowed')
            return None
        if self.size < n_chars*2 + start_idx+4:
            logging.warning('Impossible string length {0} for data length {1}'
                            .format(n_chars, self.size) + warn_suffix)
            return None
        return read_unicode_2byte(self.data, start_idx+4, n_chars)

    def _type_str(self):
        """ describe record including link type and both strings """
        return 'XlsbBeginSupBook Record ({0}, "{1}", "{2}")' \
               .format(self.link_type, self.string1, self.string2)


###############################################################################
# XLSB Binary Parts
###############################################################################


def parse_xlsb_part(file_stream, _, filename):
    """ Excel xlsb files also have bin files with record structure. iter!

    Generator yielding the records of one binary part. The second positional
    argument is not used. Exceptions propagate to the caller; the stream
    wrapper is closed in any case.

    :param file_stream: stream to read from; must have attribute size
    :param filename: name handed to the XlsbStream constructor
    """
    xlsb_stream = None
    try:
        xlsb_stream = XlsbStream(file_stream, file_stream.size, filename,
                                 record_base.STGTY_STREAM)
        for record in xlsb_stream.iter_records():
            yield record
    finally:
        if xlsb_stream is not None:
            xlsb_stream.close()


###############################################################################
# TESTING
###############################################################################


if __name__ == '__main__':
    sys.exit(record_base.test(sys.argv[1:], XlsFile, WorkbookStream))