1// Copyright 2013 Richard Lehane. All rights reserved.
2//
3// Licensed under the Apache License, Version 2.0 (the "License");
4// you may not use this file except in compliance with the License.
5// You may obtain a copy of the License at
6//
7//     http://www.apache.org/licenses/LICENSE-2.0
8//
9// Unless required by applicable law or agreed to in writing, software
10// distributed under the License is distributed on an "AS IS" BASIS,
11// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12// See the License for the specific language governing permissions and
13// limitations under the License.
14
15// Package mscfb implements a reader for Microsoft's Compound File Binary File Format (http://msdn.microsoft.com/en-us/library/dd942138.aspx).
16//
// The Compound File Binary File Format is also known as the Object Linking and Embedding (OLE) or Component Object Model (COM) format and was used by many
// early Microsoft applications, such as MS Office.
19//
20// Example:
21//   file, _ := os.Open("test/test.doc")
22//   defer file.Close()
23//   doc, err := mscfb.New(file)
24//   if err != nil {
25//     log.Fatal(err)
26//   }
27//   for entry, err := doc.Next(); err == nil; entry, err = doc.Next() {
28//     buf := make([]byte, 512)
29//     i, _ := entry.Read(buf)
30//     if i > 0 {
31//       fmt.Println(buf[:i])
32//     }
33//     fmt.Println(entry.Name)
34//   }
35package mscfb
36
37import (
38	"encoding/binary"
39	"io"
40	"strconv"
41	"time"
42)
43
// fileOffset returns the byte offset at which regular sector sn starts, given a
// sector size of ss bytes. The +1 skips the first sector-sized region of the
// file, which is where the header is read from.
//
// The product is computed in 64 bits: a uint32 multiplication wraps for large
// sector numbers, and sn+1 wraps to 0 for 0xFFFFFFFF, either of which would
// yield an offset pointing at unrelated data near the start of the file.
func fileOffset(ss, sn uint32) int64 {
	return (int64(sn) + 1) * int64(ss)
}
47
// Fixed values used when reading a compound file.
const (
	// signature is the value the first eight bytes of the header must decode to.
	signature uint64 = 0xE11AB1A1E011CFD0
	// miniStreamSectorSize is the size of a mini stream sector (64 bytes).
	miniStreamSectorSize uint32 = 64
	// miniStreamCutoffSize is the mini stream cutoff size (4096 bytes).
	miniStreamCutoffSize int64 = 4096
	// dirEntrySize is the size of a directory entry (128 bytes).
	dirEntrySize uint32 = 128
)
54
// Reserved 32-bit values that appear in place of sector numbers and stream IDs.
const (
	// maxRegSect is the maximum regular sector number.
	maxRegSect uint32 = 0xFFFFFFFA
	// difatSect specifies a DIFAT sector in the FAT.
	difatSect uint32 = 0xFFFFFFFC
	// fatSect specifies a FAT sector in the FAT.
	fatSect uint32 = 0xFFFFFFFD
	// endOfChain marks the end of a linked chain of sectors.
	endOfChain uint32 = 0xFFFFFFFE
	// freeSect specifies an unallocated sector in the FAT, Mini FAT or DIFAT.
	freeSect uint32 = 0xFFFFFFFF
	// maxRegStreamID is the maximum regular stream ID.
	maxRegStreamID uint32 = 0xFFFFFFFA
	// noStream is an empty pointer.
	noStream uint32 = 0xFFFFFFFF
)
64
// lenHeader is the number of header bytes read from the start of the file:
// 76 bytes of fixed fields followed by 109 four-byte DIFAT entries, 512 in all.
const lenHeader int = 76 + 109*4
66
// headerFields holds the values decoded from the compound file header, in the
// order they appear on disk. Fields named _ stand for regions that are not
// decoded.
type headerFields struct {
	signature uint64
	// CLSID - ignored, must be null.
	_ [16]byte
	// Version number for non-breaking changes. SHOULD be 0x003E when the
	// major version is 0x0003 or 0x0004.
	minorVersion uint16
	// Version number for breaking changes. MUST be 0x0003 (version 3) or
	// 0x0004 (version 4).
	majorVersion uint16
	// Byte order - ignored, must be little endian.
	_ [2]byte
	// Sector shift: the sector size expressed as a power of two. MUST be
	// 0x0009 (512-byte sectors) for major version 3 and 0x000C (4096-byte
	// sectors) for major version 4.
	sectorSize uint16
	// Mini stream sector size - ignored, must be 64 bytes.
	_ [2]byte
	// Reserved - ignored, not used.
	_ [6]byte
	// Count of directory sectors. MUST be zero for major version 3, which
	// does not support this field.
	numDirectorySectors uint32
	// Count of FAT sectors in the compound file.
	numFatSectors uint32
	// Starting sector number of the directory stream.
	directorySectorLoc uint32
	// Transaction - ignored, not used.
	_ [4]byte
	// Mini stream size cutoff - ignored, must be 4096 bytes.
	_ [4]byte
	// Starting sector number of the mini FAT.
	miniFatSectorLoc uint32
	// Count of mini FAT sectors in the compound file.
	numMiniFatSectors uint32
	// Starting sector number of the DIFAT.
	difatSectorLoc uint32
	// Count of DIFAT sectors in the compound file.
	numDifatSectors uint32
	// The first 109 DIFAT entries are stored in the header itself.
	initialDifats [109]uint32
}
87
88func makeHeader(b []byte) *headerFields {
89	h := &headerFields{}
90	h.signature = binary.LittleEndian.Uint64(b[:8])
91	h.minorVersion = binary.LittleEndian.Uint16(b[24:26])
92	h.majorVersion = binary.LittleEndian.Uint16(b[26:28])
93	h.sectorSize = binary.LittleEndian.Uint16(b[30:32])
94	h.numDirectorySectors = binary.LittleEndian.Uint32(b[40:44])
95	h.numFatSectors = binary.LittleEndian.Uint32(b[44:48])
96	h.directorySectorLoc = binary.LittleEndian.Uint32(b[48:52])
97	h.miniFatSectorLoc = binary.LittleEndian.Uint32(b[60:64])
98	h.numMiniFatSectors = binary.LittleEndian.Uint32(b[64:68])
99	h.difatSectorLoc = binary.LittleEndian.Uint32(b[68:72])
100	h.numDifatSectors = binary.LittleEndian.Uint32(b[72:76])
101	var idx int
102	for i := 76; i < 512; i = i + 4 {
103		h.initialDifats[idx] = binary.LittleEndian.Uint32(b[i : i+4])
104		idx++
105	}
106	return h
107}
108
109type header struct {
110	*headerFields
111	difats         []uint32
112	miniFatLocs    []uint32
113	miniStreamLocs []uint32 // chain of sectors containing the ministream
114}
115
116func (r *Reader) setHeader() error {
117	buf, err := r.readAt(0, lenHeader)
118	if err != nil {
119		return err
120	}
121	r.header = &header{headerFields: makeHeader(buf)}
122	// sanity check - check signature
123	if r.header.signature != signature {
124		return Error{ErrFormat, "bad signature", int64(r.header.signature)}
125	}
126	// check for legal sector size
127	if r.header.sectorSize == 0x0009 || r.header.sectorSize == 0x000c {
128		r.sectorSize = uint32(1 << r.header.sectorSize)
129	} else {
130		return Error{ErrFormat, "illegal sector size", int64(r.header.sectorSize)}
131	}
132	// check for DIFAT overflow
133	if r.header.numDifatSectors > 0 {
134		sz := (r.sectorSize / 4) - 1
135		if int(r.header.numDifatSectors*sz+109) < 0 {
136			return Error{ErrFormat, "DIFAT int overflow", int64(r.header.numDifatSectors)}
137		}
138		if r.header.numDifatSectors*sz+109 > r.header.numFatSectors+sz {
139			return Error{ErrFormat, "num DIFATs exceeds FAT sectors", int64(r.header.numDifatSectors)}
140		}
141	}
142	// check for mini FAT overflow
143	if r.header.numMiniFatSectors > 0 {
144		if int(r.sectorSize/4*r.header.numMiniFatSectors) < 0 {
145			return Error{ErrFormat, "mini FAT int overflow", int64(r.header.numMiniFatSectors)}
146		}
147		if r.header.numMiniFatSectors > r.header.numFatSectors*(r.sectorSize/miniStreamSectorSize) {
148			return Error{ErrFormat, "num mini FATs exceeds FAT sectors", int64(r.header.numFatSectors)}
149		}
150	}
151	return nil
152}
153
154func (r *Reader) setDifats() error {
155	r.header.difats = r.header.initialDifats[:]
156	// return early if no extra DIFAT sectors
157	if r.header.numDifatSectors == 0 {
158		return nil
159	}
160	sz := (r.sectorSize / 4) - 1
161	n := make([]uint32, 109, r.header.numDifatSectors*sz+109)
162	copy(n, r.header.difats)
163	r.header.difats = n
164	off := r.header.difatSectorLoc
165	for i := 0; i < int(r.header.numDifatSectors); i++ {
166		buf, err := r.readAt(fileOffset(r.sectorSize, off), int(r.sectorSize))
167		if err != nil {
168			return Error{ErrFormat, "error setting DIFAT(" + err.Error() + ")", int64(off)}
169		}
170		for j := 0; j < int(sz); j++ {
171			r.header.difats = append(r.header.difats, binary.LittleEndian.Uint32(buf[j*4:j*4+4]))
172		}
173		off = binary.LittleEndian.Uint32(buf[len(buf)-4:])
174	}
175	return nil
176}
177
178// set the ministream FAT and sector slices in the header
179func (r *Reader) setMiniStream() error {
180	// do nothing if there is no ministream
181	if r.direntries[0].startingSectorLoc == endOfChain || r.header.miniFatSectorLoc == endOfChain || r.header.numMiniFatSectors == 0 {
182		return nil
183	}
184	// build a slice of minifat sectors (akin to the DIFAT slice)
185	c := int(r.header.numMiniFatSectors)
186	r.header.miniFatLocs = make([]uint32, c)
187	r.header.miniFatLocs[0] = r.header.miniFatSectorLoc
188	for i := 1; i < c; i++ {
189		loc, err := r.findNext(r.header.miniFatLocs[i-1], false)
190		if err != nil {
191			return Error{ErrFormat, "setting mini stream (" + err.Error() + ")", int64(r.header.miniFatLocs[i-1])}
192		}
193		r.header.miniFatLocs[i] = loc
194	}
195	// build a slice of ministream sectors
196	c = int(r.sectorSize / 4 * r.header.numMiniFatSectors)
197	r.header.miniStreamLocs = make([]uint32, 0, c)
198	sn := r.direntries[0].startingSectorLoc
199	var err error
200	for sn != endOfChain {
201		r.header.miniStreamLocs = append(r.header.miniStreamLocs, sn)
202		sn, err = r.findNext(sn, false)
203		if err != nil {
204			return Error{ErrFormat, "setting mini stream (" + err.Error() + ")", int64(sn)}
205		}
206	}
207	return nil
208}
209
210func (r *Reader) readAt(offset int64, length int) ([]byte, error) {
211	if r.slicer {
212		b, err := r.ra.(slicer).Slice(offset, length)
213		if err != nil {
214			return nil, Error{ErrRead, "slicer read error (" + err.Error() + ")", offset}
215		}
216		return b, nil
217	}
218	if length > len(r.buf) {
219		return nil, Error{ErrRead, "read length greater than read buffer", int64(length)}
220	}
221	if _, err := r.ra.ReadAt(r.buf[:length], offset); err != nil {
222		return nil, Error{ErrRead, err.Error(), offset}
223	}
224	return r.buf[:length], nil
225}
226
227func (r *Reader) getOffset(sn uint32, mini bool) (int64, error) {
228	if mini {
229		num := r.sectorSize / 64
230		sec := int(sn / num)
231		if sec >= len(r.header.miniStreamLocs) {
232			return 0, Error{ErrRead, "minisector number is outside minisector range", int64(sec)}
233		}
234		dif := sn % num
235		return int64((r.header.miniStreamLocs[sec]+1)*r.sectorSize + dif*64), nil
236	}
237	return fileOffset(r.sectorSize, sn), nil
238}
239
240// check the FAT sector for the next sector in a chain
241func (r *Reader) findNext(sn uint32, mini bool) (uint32, error) {
242	entries := r.sectorSize / 4
243	index := int(sn / entries) // find position in DIFAT or minifat array
244	var sect uint32
245	if mini {
246		if index < 0 || index >= len(r.header.miniFatLocs) {
247			return 0, Error{ErrRead, "minisector index is outside miniFAT range", int64(index)}
248		}
249		sect = r.header.miniFatLocs[index]
250	} else {
251		if index < 0 || index >= len(r.header.difats) {
252			return 0, Error{ErrRead, "FAT index is outside DIFAT range", int64(index)}
253		}
254		sect = r.header.difats[index]
255	}
256	fatIndex := sn % entries // find position within FAT or MiniFAT sector
257	offset := fileOffset(r.sectorSize, sect) + int64(fatIndex*4)
258	buf, err := r.readAt(offset, 4)
259	if err != nil {
260		return 0, Error{ErrRead, "bad read finding next sector (" + err.Error() + ")", offset}
261	}
262	return binary.LittleEndian.Uint32(buf), nil
263}
264
265// Reader provides sequential access to the contents of a MS compound file (MSCFB)
266type Reader struct {
267	slicer     bool
268	sectorSize uint32
269	buf        []byte
270	header     *header
271	File       []*File // File is an ordered slice of final directory entries.
272	direntries []*File // unordered raw directory entries
273	entry      int
274
275	ra io.ReaderAt
276	wa io.WriterAt
277}
278
279// New returns a MSCFB reader
280func New(ra io.ReaderAt) (*Reader, error) {
281	r := &Reader{ra: ra}
282	if _, ok := ra.(slicer); ok {
283		r.slicer = true
284	} else {
285		r.buf = make([]byte, lenHeader)
286	}
287	if err := r.setHeader(); err != nil {
288		return nil, err
289	}
290	// resize the buffer to 4096 if sector size isn't 512
291	if !r.slicer && int(r.sectorSize) > len(r.buf) {
292		r.buf = make([]byte, r.sectorSize)
293	}
294	if err := r.setDifats(); err != nil {
295		return nil, err
296	}
297	if err := r.setDirEntries(); err != nil {
298		return nil, err
299	}
300	if err := r.setMiniStream(); err != nil {
301		return nil, err
302	}
303	if err := r.traverse(); err != nil {
304		return nil, err
305	}
306	return r, nil
307}
308
309// ID returns the CLSID (class ID) field from the root directory entry
310func (r *Reader) ID() string {
311	return r.File[0].ID()
312}
313
314// Created returns the created field from the root directory entry
315func (r *Reader) Created() time.Time {
316	return r.File[0].Created()
317}
318
319// Modified returns the last modified field from the root directory entry
320func (r *Reader) Modified() time.Time {
321	return r.File[0].Modified()
322}
323
324// Next iterates to the next directory entry.
325// This isn't necessarily an adjacent *File within the File slice, but is based on the Left Sibling, Right Sibling and Child information in directory entries.
326func (r *Reader) Next() (*File, error) {
327	r.entry++
328	if r.entry >= len(r.File) {
329		return nil, io.EOF
330	}
331	return r.File[r.entry], nil
332}
333
334// Read the current directory entry
335func (r *Reader) Read(b []byte) (n int, err error) {
336	if r.entry >= len(r.File) {
337		return 0, io.EOF
338	}
339	return r.File[r.entry].Read(b)
340}
341
342// Debug provides granular information from an mscfb file to assist with debugging
343func (r *Reader) Debug() map[string][]uint32 {
344	ret := map[string][]uint32{
345		"sector size":            []uint32{r.sectorSize},
346		"mini fat locs":          r.header.miniFatLocs,
347		"mini stream locs":       r.header.miniStreamLocs,
348		"directory sector":       []uint32{r.header.directorySectorLoc},
349		"mini stream start/size": []uint32{r.File[0].startingSectorLoc, binary.LittleEndian.Uint32(r.File[0].streamSize[:])},
350	}
351	for f, err := r.Next(); err == nil; f, err = r.Next() {
352		ret[f.Name+" start/size"] = []uint32{f.startingSectorLoc, binary.LittleEndian.Uint32(f.streamSize[:])}
353	}
354	return ret
355}
356
// Categories of Error, as returned by Error.Typ.
const (
	// ErrFormat: a problem with the MSCFB's header structures.
	ErrFormat = iota
	// ErrRead: a problem while attempting to read MSCFB streams.
	ErrRead
	// ErrSeek: a seek problem.
	ErrSeek
	// ErrWrite: a write problem.
	ErrWrite
	// ErrTraverse: a problem while attempting to traverse the
	// child-parent-sibling relations between MSCFB storage objects.
	ErrTraverse
)
370
// Error is the error value produced by this package. It carries an error
// category (one of the Err* constants), a message, and a number relevant to
// the failure, such as a file offset or sector number.
type Error struct {
	typ int
	msg string
	val int64
}

// Error implements the error interface. The text has the form
// "mscfb: <message>; <value>", with the value in decimal.
func (e Error) Error() string {
	text := []byte("mscfb: " + e.msg + "; ")
	return string(strconv.AppendInt(text, e.val, 10))
}

// Typ gives the type of MSCFB error
func (e Error) Typ() int {
	kind := e.typ
	return kind
}
385
// slicer is implemented by underlying readers that can hand back a byte slice
// for a given offset and length directly, which avoids copying the data into
// the Reader's own buffer.
type slicer interface {
	Slice(off int64, n int) ([]byte, error)
}
390