#!/usr/local/bin/python3.8
#
# This Source Code Form is subject to the terms of the Mozilla Public
# License, v. 2.0. If a copy of the MPL was not distributed with this
# file, You can obtain one at http://mozilla.org/MPL/2.0/.
#
from builtins import range
import sys, os.path, optparse
sys.path.insert(0, sys.path[0]+"/msodump.zip")
import traceback

from msodumper import ole, xlsstream, globals, node, xlsmodel, olestream
from msodumper import xlsparser, msocrypto

from msodumper.globals import error

def equalsName (name, array):
    """Return True when every byte of 'name' equals the matching int in 'array'.

    name  -- directory entry name; each position is read through
             globals.indexbytes().
    array -- sequence of expected integer byte values.

    The two must also have the same length to be considered equal."""

    if len(name) != len(array):
        return False

    return all(globals.indexbytes(name, i) == expected
               for i, expected in enumerate(array))

def isOleStream (dirname):
    r"""Determine whether or not a stream is an OLE stream.

According to the spec, an OLE stream is always named '\1Ole'."""

    name = [0x01, 0x4F, 0x6C, 0x65] # 0x01, 'Ole'
    return equalsName(dirname, name)

def isCompObjStream (dirname):
    r"""Determine whether or not a stream is named '\1CompObj'."""

    name = [0x01, 0x43, 0x6F, 0x6D, 0x70, 0x4F, 0x62, 0x6A] # 0x01, 'CompObj'
    return equalsName(dirname, name)

class XLDumper(object):
    """Dump the content of one XLS file in flat, XML or canonical XML form.

    The file is parsed lazily: nothing is read until one of dump(),
    dumpXML() or dumpCanonicalXML() is called."""

    def __init__ (self, filepath, params):
        """Remember the input path and the dump parameters.

        filepath -- path of the file to dump.
        params   -- parameter object handed to xlsstream.XLStream and
                    consulted for the 'utf8' output flag."""
        self.filepath = filepath
        self.params = params
        self.strm = None      # xlsstream.XLStream, set by __parseFile()
        self.strmData = None  # xlsstream.StreamData, set by __parseFile()

    def __printDirHeader (self, direntry, byteLen):
        """Print the banner that precedes the dump of one directory entry.

        direntry -- directory entry being dumped.
        byteLen  -- size of its stream in bytes (only shown for streams)."""
        dirname = direntry.Name
        dirname = globals.encodeName(dirname)
        globals.outputln("")
        globals.outputln("="*globals.OutputWidth)
        if direntry.isStorage():
            globals.outputln("%s (storage)"%dirname)
        else:
            globals.outputln("%s (stream, size: %d bytes)"%(dirname, byteLen))
        globals.outputln("-"*globals.OutputWidth)

    def __parseFile (self):
        """Read the whole input file and build the XLStream on top of it."""
        # The context manager guarantees the handle is closed even when
        # reading fails part-way through.
        with open(self.filepath, 'rb') as infile:
            chars = infile.read()
        self.strmData = xlsstream.StreamData()
        self.strm = xlsstream.XLStream(chars, self.params, self.strmData)

    def dumpXML (self):
        """Dump the Workbook stream as XML built from the record handlers."""
        self.__parseFile()
        dirs = self.strm.getDirectoryEntries()
        docroot = node.Root()
        root = docroot.appendElement('xls-dump')

        for d in dirs:
            if d.Name != b"Workbook":
                # for now, we only dump the Workbook directory stream.
                continue

            dirstrm = self.strm.getDirectoryStream(d)
            data = self.__readSubStreamXML(dirstrm)
            self.__dumpDataAsXML(data, root)

        node.prettyPrint(globals.utfwriter(), docroot, utf8 = self.params.utf8)

    def dumpCanonicalXML (self):
        """Dump the Workbook stream as XML generated from the workbook model."""
        self.__parseFile()
        docroot = node.Root()
        root = docroot.appendElement('xls-dump')

        dirEntries = self.strm.getDirectoryEntries()
        for entry in dirEntries:
            dirname = entry.Name
            if dirname != b"Workbook":
                # for now, we only dump the Workbook directory stream.
                continue

            dirstrm = self.strm.getDirectoryStream(entry)
            wbmodel = self.__buildWorkbookModel(dirstrm)
            wbmodel.encrypted = self.strmData.encrypted
            root.appendChild(wbmodel.createDOM())

        node.prettyPrint(globals.utfwriter(), docroot, utf8 = self.params.utf8)

    def dump (self):
        """Print the container structure, then dump every directory entry.

        Known streams are decoded by their dedicated reader; any other
        stream is printed as raw bytes."""
        self.__parseFile()
        self.strm.printStreamInfo()
        self.strm.printHeader()
        self.strm.printMSAT()
        self.strm.printSAT()
        self.strm.printSSAT()
        self.strm.printDirectory()
        dirEntries = self.strm.getDirectoryEntries()
        for entry in dirEntries:
            dirname = entry.Name
            if len(dirname) == 0:
                continue

            dirstrm = self.strm.getDirectoryStream(entry)
            self.__printDirHeader(entry, len(dirstrm.bytes))
            if entry.isStorage():
                # storages only get their header printed.
                continue

            elif dirname == b"Workbook":
                # keep reading sub-streams until the stream is exhausted.
                success = True
                while success:
                    success = self.__readSubStream(dirstrm)

            elif dirname == b"Revision Log":
                dirstrm.type = xlsstream.DirType.RevisionLog
                self.__readSubStream(dirstrm)

            elif dirname == b"EncryptionInfo":
                globals.dumpBytes(dirstrm.bytes, 512)
                globals.outputln("-"*globals.OutputWidth)
                info = msocrypto.EncryptionInfo(dirstrm.bytes)
                info.read()
                info.output()

            elif self.strmData.isPivotCacheStream(dirname):
                dirstrm.type = xlsstream.DirType.PivotTableCache
                self.__readSubStream(dirstrm)
            elif isOleStream(dirname):
                self.__readOleStream(dirstrm)
            elif isCompObjStream(dirname):
                self.__readCompObjStream(dirstrm)
            else:
                globals.dumpBytes(dirstrm.bytes, 512)

    def __readSubStream (self, strm):
        """Read records until one with header 0x000A has been read.

        Returns True when such a record was reached, or False when
        xlsstream.EndOfStream was raised first."""
        try:
            # read bytes from BOF to EOF.
            header = 0x0000
            while header != 0x000A:
                header = strm.readRecord()
            return True
        except xlsstream.EndOfStream:
            return False

    def __readOleStream (self, dirstrm):
        """Hand the bytes of an OLE stream to olestream.OLEStream."""
        strm = olestream.OLEStream(dirstrm.bytes)
        strm.read()

    def __readCompObjStream (self, dirstrm):
        """Hand the bytes of a CompObj stream to olestream.CompObjStream.

        A parse failure is reported through globals.error() and is not
        propagated."""
        try:
            strm = olestream.CompObjStream(dirstrm.bytes)
            strm.read()
        except olestream.CompObjStreamError:
            globals.error("failed to parse CompObj stream.\n")

    def __dumpDataAsXML(self, data, root):
        """Recursively append 'data' below the 'root' element.

        data -- a tuple (name, attrs-dict[, children]) or (name, children),
                a list of such items, or anything else (ignored).
        root -- element the new nodes are appended to."""
        if isinstance(data, tuple):
            newRoot = root.appendElement(data[0])
            if isinstance(data[1], dict): # attrs
                # dict.iteritems() does not exist on Python 3; items() is
                # the equivalent call.
                for key, val in data[1].items():
                    newRoot.setAttr(key, val)
                if len(data) > 2: # data has a list of children
                    self.__dumpDataAsXML(data[2], newRoot)
            else:
                self.__dumpDataAsXML(data[1], newRoot)
        elif isinstance(data, list):
            for x in data:
                self.__dumpDataAsXML(x, root)
        else:
            pass # we're skipping all unknown elems

    def __readSubStreamXML (self, strm):
        """Collect every record handler of 'strm' and return the parsed data."""
        handlers = []
        try:
            while True:
                handler = strm.getNextRecordHandler()
                handlers.append(handler)
        except xlsstream.EndOfStream:
            # normal termination: the stream has no more records.
            pass
        parser = xlsparser.XlsParser(handlers)
        return parser.dumpData()

    def __buildWorkbookModel (self, strm):
        """Fill a new xlsmodel.Workbook from 'strm' until the stream ends."""
        model = xlsmodel.Workbook()
        try:
            while True:
                strm.fillModel(model)
        except xlsstream.EndOfStream:
            # normal termination: the stream has no more records.
            pass

        return model

def main ():
    """Parse the command line and run the dumper in the requested mode.

    Exits with status 1 on a missing file argument, an unknown dump mode,
    or a failed canonical XML dump when --catch is given."""
    parser = optparse.OptionParser()
    parser.add_option("-d", "--debug", action="store_true", dest="debug", default=False,
        help="Turn on debug mode")
    parser.add_option("--show-sector-chain", action="store_true", dest="show_sector_chain", default=False,
        help="Show sector chain information at the start of the output.")
    parser.add_option("--show-stream-pos", action="store_true", dest="show_stream_pos", default=False,
        help="Show the position of each record relative to the stream.")
    parser.add_option("--dump-mode", dest="dump_mode", default="flat", metavar="MODE",
        help="Specify the dump mode.  Possible values are: 'flat', 'xml', or 'canonical-xml'.  The default value is 'flat'.")
    parser.add_option("--catch", action="store_true", dest="catch_exceptions", default=False,
        help="Catch exceptions and try to continue.")
    parser.add_option("--utf-8", action="store_true", dest="utf8", default=False,
        help="Output strings as UTF-8.")
    options, args = parser.parse_args()
    params = globals.params
    params.debug = options.debug
    params.showSectorChain = options.show_sector_chain
    params.showStreamPos = options.show_stream_pos
    params.catchExceptions = options.catch_exceptions
    params.utf8 = options.utf8

    if len(args) < 1:
        globals.error("takes at least one argument\n")
        parser.print_help()
        sys.exit(1)

    dumper = XLDumper(args[0], params)
    if options.dump_mode == 'flat':
        dumper.dump()
    elif options.dump_mode == 'xml':
        dumper.dumpXML()
    elif options.dump_mode == 'canonical-xml' or options.dump_mode == 'cxml':
        try:
            dumper.dumpCanonicalXML()
        except Exception:
            # With --catch the traceback is shown and the run ends with an
            # error status; without it the exception propagates unchanged.
            if globals.params.catchExceptions:
                traceback.print_exc()
            else:
                raise
            # Trailing newline added to match the other error messages.
            globals.error("Dump failed\n")
            sys.exit(1)
    else:
        error("unknown dump mode: '%s'\n"%options.dump_mode)
        parser.print_help()
        sys.exit(1)

if __name__ == '__main__':
    main()

# vim:set filetype=python shiftwidth=4 softtabstop=4 expandtab: