1// Copyright 2013 Richard Lehane. All rights reserved. 2// 3// Licensed under the Apache License, Version 2.0 (the "License"); 4// you may not use this file except in compliance with the License. 5// You may obtain a copy of the License at 6// 7// http://www.apache.org/licenses/LICENSE-2.0 8// 9// Unless required by applicable law or agreed to in writing, software 10// distributed under the License is distributed on an "AS IS" BASIS, 11// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 12// See the License for the specific language governing permissions and 13// limitations under the License. 14 15// Package mscfb implements a reader for Microsoft's Compound File Binary File Format (http://msdn.microsoft.com/en-us/library/dd942138.aspx). 16// 17// The Compound File Binary File Format is also known as the Object Linking and Embedding (OLE) or Component Object Model (COM) format and was used by many 18// early MS software such as MS Office. 19// 20// Example: 21// file, _ := os.Open("test/test.doc") 22// defer file.Close() 23// doc, err := mscfb.New(file) 24// if err != nil { 25// log.Fatal(err) 26// } 27// for entry, err := doc.Next(); err == nil; entry, err = doc.Next() { 28// buf := make([]byte, 512) 29// i, _ := entry.Read(buf) 30// if i > 0 { 31// fmt.Println(buf[:i]) 32// } 33// fmt.Println(entry.Name) 34// } 35package mscfb 36 37import ( 38 "encoding/binary" 39 "io" 40 "strconv" 41 "time" 42) 43 44func fileOffset(ss, sn uint32) int64 { 45 return int64((sn + 1) * ss) 46} 47 48const ( 49 signature uint64 = 0xE11AB1A1E011CFD0 50 miniStreamSectorSize uint32 = 64 51 miniStreamCutoffSize int64 = 4096 52 dirEntrySize uint32 = 128 //128 bytes 53) 54 55const ( 56 maxRegSect uint32 = 0xFFFFFFFA // Maximum regular sector number 57 difatSect uint32 = 0xFFFFFFFC //Specifies a DIFAT sector in the FAT 58 fatSect uint32 = 0xFFFFFFFD // Specifies a FAT sector in the FAT 59 endOfChain uint32 = 0xFFFFFFFE // End of linked chain of sectors 60 freeSect uint32 = 0xFFFFFFFF // Speficies unallocated sector in the FAT, Mini FAT or DIFAT 61 maxRegStreamID uint32 = 0xFFFFFFFA // maximum regular stream ID 62 noStream uint32 = 0xFFFFFFFF // empty pointer 63) 64 65const lenHeader int = 8 + 16 + 10 + 6 + 12 + 8 + 16 + 109*4 66 67type headerFields struct { 68 signature uint64 69 _ [16]byte //CLSID - ignore, must be null 70 minorVersion uint16 //Version number for non-breaking changes. This field SHOULD be set to 0x003E if the major version field is either 0x0003 or 0x0004. 71 majorVersion uint16 //Version number for breaking changes. This field MUST be set to either 0x0003 (version 3) or 0x0004 (version 4). 72 _ [2]byte //byte order - ignore, must be little endian 73 sectorSize uint16 //This field MUST be set to 0x0009, or 0x000c, depending on the Major Version field. This field specifies the sector size of the compound file as a power of 2. If Major Version is 3, then the Sector Shift MUST be 0x0009, specifying a sector size of 512 bytes. If Major Version is 4, then the Sector Shift MUST be 0x000C, specifying a sector size of 4096 bytes. 74 _ [2]byte // ministream sector size - ignore, must be 64 bytes 75 _ [6]byte // reserved - ignore, not used 76 numDirectorySectors uint32 //This integer field contains the count of the number of directory sectors in the compound file. If Major Version is 3, then the Number of Directory Sectors MUST be zero. This field is not supported for version 3 compound files. 77 numFatSectors uint32 //This integer field contains the count of the number of FAT sectors in the compound file. 78 directorySectorLoc uint32 //This integer field contains the starting sector number for the directory stream. 79 _ [4]byte // transaction - ignore, not used 80 _ [4]byte // mini stream size cutooff - ignore, must be 4096 bytes 81 miniFatSectorLoc uint32 //This integer field contains the starting sector number for the mini FAT. 82 numMiniFatSectors uint32 //This integer field contains the count of the number of mini FAT sectors in the compound file. 83 difatSectorLoc uint32 //This integer field contains the starting sector number for the DIFAT. 84 numDifatSectors uint32 //This integer field contains the count of the number of DIFAT sectors in the compound file. 85 initialDifats [109]uint32 //The first 109 difat sectors are included in the header 86} 87 88func makeHeader(b []byte) *headerFields { 89 h := &headerFields{} 90 h.signature = binary.LittleEndian.Uint64(b[:8]) 91 h.minorVersion = binary.LittleEndian.Uint16(b[24:26]) 92 h.majorVersion = binary.LittleEndian.Uint16(b[26:28]) 93 h.sectorSize = binary.LittleEndian.Uint16(b[30:32]) 94 h.numDirectorySectors = binary.LittleEndian.Uint32(b[40:44]) 95 h.numFatSectors = binary.LittleEndian.Uint32(b[44:48]) 96 h.directorySectorLoc = binary.LittleEndian.Uint32(b[48:52]) 97 h.miniFatSectorLoc = binary.LittleEndian.Uint32(b[60:64]) 98 h.numMiniFatSectors = binary.LittleEndian.Uint32(b[64:68]) 99 h.difatSectorLoc = binary.LittleEndian.Uint32(b[68:72]) 100 h.numDifatSectors = binary.LittleEndian.Uint32(b[72:76]) 101 var idx int 102 for i := 76; i < 512; i = i + 4 { 103 h.initialDifats[idx] = binary.LittleEndian.Uint32(b[i : i+4]) 104 idx++ 105 } 106 return h 107} 108 109type header struct { 110 *headerFields 111 difats []uint32 112 miniFatLocs []uint32 113 miniStreamLocs []uint32 // chain of sectors containing the ministream 114} 115 116func (r *Reader) setHeader() error { 117 buf, err := r.readAt(0, lenHeader) 118 if err != nil { 119 return err 120 } 121 r.header = &header{headerFields: makeHeader(buf)} 122 // sanity check - check signature 123 if r.header.signature != signature { 124 return Error{ErrFormat, "bad signature", int64(r.header.signature)} 125 } 126 // check for legal sector size 127 if r.header.sectorSize == 0x0009 || r.header.sectorSize == 0x000c { 128 r.sectorSize = uint32(1 << r.header.sectorSize) 129 } else { 130 return Error{ErrFormat, "illegal sector size", int64(r.header.sectorSize)} 131 } 132 // check for DIFAT overflow 133 if r.header.numDifatSectors > 0 { 134 sz := (r.sectorSize / 4) - 1 135 if int(r.header.numDifatSectors*sz+109) < 0 { 136 return Error{ErrFormat, "DIFAT int overflow", int64(r.header.numDifatSectors)} 137 } 138 if r.header.numDifatSectors*sz+109 > r.header.numFatSectors+sz { 139 return Error{ErrFormat, "num DIFATs exceeds FAT sectors", int64(r.header.numDifatSectors)} 140 } 141 } 142 // check for mini FAT overflow 143 if r.header.numMiniFatSectors > 0 { 144 if int(r.sectorSize/4*r.header.numMiniFatSectors) < 0 { 145 return Error{ErrFormat, "mini FAT int overflow", int64(r.header.numMiniFatSectors)} 146 } 147 if r.header.numMiniFatSectors > r.header.numFatSectors*(r.sectorSize/miniStreamSectorSize) { 148 return Error{ErrFormat, "num mini FATs exceeds FAT sectors", int64(r.header.numFatSectors)} 149 } 150 } 151 return nil 152} 153 154func (r *Reader) setDifats() error { 155 r.header.difats = r.header.initialDifats[:] 156 // return early if no extra DIFAT sectors 157 if r.header.numDifatSectors == 0 { 158 return nil 159 } 160 sz := (r.sectorSize / 4) - 1 161 n := make([]uint32, 109, r.header.numDifatSectors*sz+109) 162 copy(n, r.header.difats) 163 r.header.difats = n 164 off := r.header.difatSectorLoc 165 for i := 0; i < int(r.header.numDifatSectors); i++ { 166 buf, err := r.readAt(fileOffset(r.sectorSize, off), int(r.sectorSize)) 167 if err != nil { 168 return Error{ErrFormat, "error setting DIFAT(" + err.Error() + ")", int64(off)} 169 } 170 for j := 0; j < int(sz); j++ { 171 r.header.difats = append(r.header.difats, binary.LittleEndian.Uint32(buf[j*4:j*4+4])) 172 } 173 off = binary.LittleEndian.Uint32(buf[len(buf)-4:]) 174 } 175 return nil 176} 177 178// set the ministream FAT and sector slices in the header 179func (r *Reader) setMiniStream() error { 180 // do nothing if there is no ministream 181 if r.direntries[0].startingSectorLoc == endOfChain || r.header.miniFatSectorLoc == endOfChain || r.header.numMiniFatSectors == 0 { 182 return nil 183 } 184 // build a slice of minifat sectors (akin to the DIFAT slice) 185 c := int(r.header.numMiniFatSectors) 186 r.header.miniFatLocs = make([]uint32, c) 187 r.header.miniFatLocs[0] = r.header.miniFatSectorLoc 188 for i := 1; i < c; i++ { 189 loc, err := r.findNext(r.header.miniFatLocs[i-1], false) 190 if err != nil { 191 return Error{ErrFormat, "setting mini stream (" + err.Error() + ")", int64(r.header.miniFatLocs[i-1])} 192 } 193 r.header.miniFatLocs[i] = loc 194 } 195 // build a slice of ministream sectors 196 c = int(r.sectorSize / 4 * r.header.numMiniFatSectors) 197 r.header.miniStreamLocs = make([]uint32, 0, c) 198 sn := r.direntries[0].startingSectorLoc 199 var err error 200 for sn != endOfChain { 201 r.header.miniStreamLocs = append(r.header.miniStreamLocs, sn) 202 sn, err = r.findNext(sn, false) 203 if err != nil { 204 return Error{ErrFormat, "setting mini stream (" + err.Error() + ")", int64(sn)} 205 } 206 } 207 return nil 208} 209 210func (r *Reader) readAt(offset int64, length int) ([]byte, error) { 211 if r.slicer { 212 b, err := r.ra.(slicer).Slice(offset, length) 213 if err != nil { 214 return nil, Error{ErrRead, "slicer read error (" + err.Error() + ")", offset} 215 } 216 return b, nil 217 } 218 if length > len(r.buf) { 219 return nil, Error{ErrRead, "read length greater than read buffer", int64(length)} 220 } 221 if _, err := r.ra.ReadAt(r.buf[:length], offset); err != nil { 222 return nil, Error{ErrRead, err.Error(), offset} 223 } 224 return r.buf[:length], nil 225} 226 227func (r *Reader) getOffset(sn uint32, mini bool) (int64, error) { 228 if mini { 229 num := r.sectorSize / 64 230 sec := int(sn / num) 231 if sec >= len(r.header.miniStreamLocs) { 232 return 0, Error{ErrRead, "minisector number is outside minisector range", int64(sec)} 233 } 234 dif := sn % num 235 return int64((r.header.miniStreamLocs[sec]+1)*r.sectorSize + dif*64), nil 236 } 237 return fileOffset(r.sectorSize, sn), nil 238} 239 240// check the FAT sector for the next sector in a chain 241func (r *Reader) findNext(sn uint32, mini bool) (uint32, error) { 242 entries := r.sectorSize / 4 243 index := int(sn / entries) // find position in DIFAT or minifat array 244 var sect uint32 245 if mini { 246 if index < 0 || index >= len(r.header.miniFatLocs) { 247 return 0, Error{ErrRead, "minisector index is outside miniFAT range", int64(index)} 248 } 249 sect = r.header.miniFatLocs[index] 250 } else { 251 if index < 0 || index >= len(r.header.difats) { 252 return 0, Error{ErrRead, "FAT index is outside DIFAT range", int64(index)} 253 } 254 sect = r.header.difats[index] 255 } 256 fatIndex := sn % entries // find position within FAT or MiniFAT sector 257 offset := fileOffset(r.sectorSize, sect) + int64(fatIndex*4) 258 buf, err := r.readAt(offset, 4) 259 if err != nil { 260 return 0, Error{ErrRead, "bad read finding next sector (" + err.Error() + ")", offset} 261 } 262 return binary.LittleEndian.Uint32(buf), nil 263} 264 265// Reader provides sequential access to the contents of a MS compound file (MSCFB) 266type Reader struct { 267 slicer bool 268 sectorSize uint32 269 buf []byte 270 header *header 271 File []*File // File is an ordered slice of final directory entries. 272 direntries []*File // unordered raw directory entries 273 entry int 274 275 ra io.ReaderAt 276 wa io.WriterAt 277} 278 279// New returns a MSCFB reader 280func New(ra io.ReaderAt) (*Reader, error) { 281 r := &Reader{ra: ra} 282 if _, ok := ra.(slicer); ok { 283 r.slicer = true 284 } else { 285 r.buf = make([]byte, lenHeader) 286 } 287 if err := r.setHeader(); err != nil { 288 return nil, err 289 } 290 // resize the buffer to 4096 if sector size isn't 512 291 if !r.slicer && int(r.sectorSize) > len(r.buf) { 292 r.buf = make([]byte, r.sectorSize) 293 } 294 if err := r.setDifats(); err != nil { 295 return nil, err 296 } 297 if err := r.setDirEntries(); err != nil { 298 return nil, err 299 } 300 if err := r.setMiniStream(); err != nil { 301 return nil, err 302 } 303 if err := r.traverse(); err != nil { 304 return nil, err 305 } 306 return r, nil 307} 308 309// ID returns the CLSID (class ID) field from the root directory entry 310func (r *Reader) ID() string { 311 return r.File[0].ID() 312} 313 314// Created returns the created field from the root directory entry 315func (r *Reader) Created() time.Time { 316 return r.File[0].Created() 317} 318 319// Modified returns the last modified field from the root directory entry 320func (r *Reader) Modified() time.Time { 321 return r.File[0].Modified() 322} 323 324// Next iterates to the next directory entry. 325// This isn't necessarily an adjacent *File within the File slice, but is based on the Left Sibling, Right Sibling and Child information in directory entries. 326func (r *Reader) Next() (*File, error) { 327 r.entry++ 328 if r.entry >= len(r.File) { 329 return nil, io.EOF 330 } 331 return r.File[r.entry], nil 332} 333 334// Read the current directory entry 335func (r *Reader) Read(b []byte) (n int, err error) { 336 if r.entry >= len(r.File) { 337 return 0, io.EOF 338 } 339 return r.File[r.entry].Read(b) 340} 341 342// Debug provides granular information from an mscfb file to assist with debugging 343func (r *Reader) Debug() map[string][]uint32 { 344 ret := map[string][]uint32{ 345 "sector size": []uint32{r.sectorSize}, 346 "mini fat locs": r.header.miniFatLocs, 347 "mini stream locs": r.header.miniStreamLocs, 348 "directory sector": []uint32{r.header.directorySectorLoc}, 349 "mini stream start/size": []uint32{r.File[0].startingSectorLoc, binary.LittleEndian.Uint32(r.File[0].streamSize[:])}, 350 } 351 for f, err := r.Next(); err == nil; f, err = r.Next() { 352 ret[f.Name+" start/size"] = []uint32{f.startingSectorLoc, binary.LittleEndian.Uint32(f.streamSize[:])} 353 } 354 return ret 355} 356 357const ( 358 // ErrFormat reports issues with the MSCFB's header structures 359 ErrFormat = iota 360 // ErrRead reports issues attempting to read MSCFB streams 361 ErrRead 362 // ErrSeek reports seek issues 363 ErrSeek 364 // ErrWrite reports write issues 365 ErrWrite 366 // ErrTraverse reports issues attempting to traverse the child-parent-sibling relations 367 // between MSCFB storage objects 368 ErrTraverse 369) 370 371type Error struct { 372 typ int 373 msg string 374 val int64 375} 376 377func (e Error) Error() string { 378 return "mscfb: " + e.msg + "; " + strconv.FormatInt(e.val, 10) 379} 380 381// Typ gives the type of MSCFB error 382func (e Error) Typ() int { 383 return e.typ 384} 385 386// Slicer interface avoids a copy by obtaining a byte slice directly from the underlying reader 387type slicer interface { 388 Slice(offset int64, length int) ([]byte, error) 389} 390