1// Copyright 2016 The Go Authors. All rights reserved.
2// Use of this source code is governed by a BSD-style
3// license that can be found in the LICENSE file.
4
5package tar
6
7import "strings"
8
// Format represents the tar archive format.
//
// The original tar format was introduced in Unix V7.
// Since then, there have been multiple competing formats attempting to
// standardize or extend the V7 format to overcome its limitations.
// The most common formats are the USTAR, PAX, and GNU formats,
// each with its own advantages and limitations.
//
// The following table captures the capabilities of each format:
//
//	                  |  USTAR |       PAX |       GNU
//	------------------+--------+-----------+----------
//	Name              |   256B | unlimited | unlimited
//	Linkname          |   100B | unlimited | unlimited
//	Size              | uint33 | unlimited |    uint89
//	Mode              | uint21 |    uint21 |    uint57
//	Uid/Gid           | uint21 | unlimited |    uint57
//	Uname/Gname       |    32B | unlimited |       32B
//	ModTime           | uint33 | unlimited |     int89
//	AccessTime        |    n/a | unlimited |     int89
//	ChangeTime        |    n/a | unlimited |     int89
//	Devmajor/Devminor | uint21 |    uint21 |    uint57
//	------------------+--------+-----------+----------
//	string encoding   |  ASCII |     UTF-8 |    binary
//	sub-second times  |     no |       yes |        no
//	sparse files      |     no |       yes |       yes
//
// The table's upper portion shows the Header fields, where each format reports
// the maximum number of bytes allowed for each string field and
// the integer type used to store each numeric field
// (where timestamps are stored as the number of seconds since the Unix epoch).
//
// The table's lower portion shows specialized features of each format,
// such as supported string encodings, support for sub-second timestamps,
// or support for sparse files.
//
// The Writer currently provides no support for sparse files.
type Format int
47
// Constants to identify various tar formats.
const (
	// Deliberately hide the meaning of constants from public API.
	//
	// The integer division by 4 turns the first two values of the sequence
	// into zero, so FormatUnknown is 0 and formatV7 is the first non-zero
	// value; every later constant is the next power of two.
	_ Format = (1 << iota) / 4 // Sequence of 0, 0, 1, 2, 4, 8, etc...

	// FormatUnknown indicates that the format is unknown.
	FormatUnknown

	// formatV7 represents the format of the original Unix V7 tar tool
	// prior to standardization.
	formatV7

	// FormatUSTAR represents the USTAR header format defined in POSIX.1-1988.
	//
	// While this format is compatible with most tar readers,
	// the format has several limitations making it unsuitable for some usages.
	// Most notably, it cannot support sparse files, files larger than 8GiB,
	// filenames larger than 256 characters, and non-ASCII filenames.
	//
	// Reference:
	//	http://pubs.opengroup.org/onlinepubs/9699919799/utilities/pax.html#tag_20_92_13_06
	FormatUSTAR

	// FormatPAX represents the PAX header format defined in POSIX.1-2001.
	//
	// PAX extends USTAR by writing a special file with Typeflag TypeXHeader
	// preceding the original header. This file contains a set of key-value
	// records, which are used to overcome USTAR's shortcomings, in addition to
	// providing the ability to have sub-second resolution for timestamps.
	//
	// Some newer formats add their own extensions to PAX by defining their
	// own keys and assigning certain semantic meaning to the associated values.
	// For example, sparse file support in PAX is implemented using keys
	// defined by the GNU manual (e.g., "GNU.sparse.map").
	//
	// Reference:
	//	http://pubs.opengroup.org/onlinepubs/009695399/utilities/pax.html
	FormatPAX

	// FormatGNU represents the GNU header format.
	//
	// The GNU header format is older than the USTAR and PAX standards and
	// is not compatible with them. The GNU format supports
	// arbitrary file sizes, filenames of arbitrary encoding and length,
	// sparse files, and other features.
	//
	// It is recommended that PAX be chosen over GNU unless the target
	// application can only parse GNU formatted archives.
	//
	// Reference:
	//	http://www.gnu.org/software/tar/manual/html_node/Standard.html
	FormatGNU

	// formatSTAR represents Schily's tar format, which is incompatible
	// with USTAR.
	// This does not cover STAR extensions to the PAX format; these fall under
	// the PAX format.
	formatSTAR

	// formatMax is the next value in the sequence after the last defined
	// format. Format.String uses it as the exclusive upper bound when it
	// walks the individual format bits.
	formatMax
)
107
108func (f Format) has(f2 Format) bool   { return f&f2 != 0 }
109func (f *Format) mayBe(f2 Format)     { *f |= f2 }
110func (f *Format) mayOnlyBe(f2 Format) { *f &= f2 }
111func (f *Format) mustNotBe(f2 Format) { *f &^= f2 }
112
113var formatNames = map[Format]string{
114	formatV7: "V7", FormatUSTAR: "USTAR", FormatPAX: "PAX", FormatGNU: "GNU", formatSTAR: "STAR",
115}
116
117func (f Format) String() string {
118	var ss []string
119	for f2 := Format(1); f2 < formatMax; f2 <<= 1 {
120		if f.has(f2) {
121			ss = append(ss, formatNames[f2])
122		}
123	}
124	switch len(ss) {
125	case 0:
126		return "<unknown>"
127	case 1:
128		return ss[0]
129	default:
130		return "(" + strings.Join(ss, " | ") + ")"
131	}
132}
133
// Magics used to identify various formats.
const (
	// Magic and version values of a GNU header.
	magicGNU   = "ustar "
	versionGNU = " \x00"

	// Magic and version values of a USTAR or PAX header.
	// STAR headers are written with the same pair.
	magicUSTAR   = "ustar\x00"
	versionUSTAR = "00"

	// Trailer value that, alongside magicUSTAR, marks a STAR header.
	trailerSTAR = "tar\x00"
)
140
// Size constants from various tar specifications.
const (
	// Size of each block in a tar stream.
	blockSize = 512

	// Max length of the name field in USTAR format.
	nameSize = 100

	// Max length of the prefix field in USTAR format.
	prefixSize = 155
)
147
148// blockPadding computes the number of bytes needed to pad offset up to the
149// nearest block edge where 0 <= n < blockSize.
150func blockPadding(offset int64) (n int64) {
151	return -offset & (blockSize - 1)
152}
153
154var zeroBlock block
155
156type block [blockSize]byte
157
158// Convert block to any number of formats.
159func (b *block) V7() *headerV7       { return (*headerV7)(b) }
160func (b *block) GNU() *headerGNU     { return (*headerGNU)(b) }
161func (b *block) STAR() *headerSTAR   { return (*headerSTAR)(b) }
162func (b *block) USTAR() *headerUSTAR { return (*headerUSTAR)(b) }
163func (b *block) Sparse() sparseArray { return (sparseArray)(b[:]) }
164
165// GetFormat checks that the block is a valid tar header based on the checksum.
166// It then attempts to guess the specific format based on magic values.
167// If the checksum fails, then FormatUnknown is returned.
168func (b *block) GetFormat() Format {
169	// Verify checksum.
170	var p parser
171	value := p.parseOctal(b.V7().Chksum())
172	chksum1, chksum2 := b.ComputeChecksum()
173	if p.err != nil || (value != chksum1 && value != chksum2) {
174		return FormatUnknown
175	}
176
177	// Guess the magic values.
178	magic := string(b.USTAR().Magic())
179	version := string(b.USTAR().Version())
180	trailer := string(b.STAR().Trailer())
181	switch {
182	case magic == magicUSTAR && trailer == trailerSTAR:
183		return formatSTAR
184	case magic == magicUSTAR:
185		return FormatUSTAR | FormatPAX
186	case magic == magicGNU && version == versionGNU:
187		return FormatGNU
188	default:
189		return formatV7
190	}
191}
192
193// SetFormat writes the magic values necessary for specified format
194// and then updates the checksum accordingly.
195func (b *block) SetFormat(format Format) {
196	// Set the magic values.
197	switch {
198	case format.has(formatV7):
199		// Do nothing.
200	case format.has(FormatGNU):
201		copy(b.GNU().Magic(), magicGNU)
202		copy(b.GNU().Version(), versionGNU)
203	case format.has(formatSTAR):
204		copy(b.STAR().Magic(), magicUSTAR)
205		copy(b.STAR().Version(), versionUSTAR)
206		copy(b.STAR().Trailer(), trailerSTAR)
207	case format.has(FormatUSTAR | FormatPAX):
208		copy(b.USTAR().Magic(), magicUSTAR)
209		copy(b.USTAR().Version(), versionUSTAR)
210	default:
211		panic("invalid format")
212	}
213
214	// Update checksum.
215	// This field is special in that it is terminated by a NULL then space.
216	var f formatter
217	field := b.V7().Chksum()
218	chksum, _ := b.ComputeChecksum() // Possible values are 256..128776
219	f.formatOctal(field[:7], chksum) // Never fails since 128776 < 262143
220	field[7] = ' '
221}
222
223// ComputeChecksum computes the checksum for the header block.
224// POSIX specifies a sum of the unsigned byte values, but the Sun tar used
225// signed byte values.
226// We compute and return both.
227func (b *block) ComputeChecksum() (unsigned, signed int64) {
228	for i, c := range b {
229		if 148 <= i && i < 156 {
230			c = ' ' // Treat the checksum field itself as all spaces.
231		}
232		unsigned += int64(c)
233		signed += int64(int8(c))
234	}
235	return unsigned, signed
236}
237
238// Reset clears the block with all zeros.
239func (b *block) Reset() {
240	*b = block{}
241}
242
243type headerV7 [blockSize]byte
244
245func (h *headerV7) Name() []byte     { return h[000:][:100] }
246func (h *headerV7) Mode() []byte     { return h[100:][:8] }
247func (h *headerV7) UID() []byte      { return h[108:][:8] }
248func (h *headerV7) GID() []byte      { return h[116:][:8] }
249func (h *headerV7) Size() []byte     { return h[124:][:12] }
250func (h *headerV7) ModTime() []byte  { return h[136:][:12] }
251func (h *headerV7) Chksum() []byte   { return h[148:][:8] }
252func (h *headerV7) TypeFlag() []byte { return h[156:][:1] }
253func (h *headerV7) LinkName() []byte { return h[157:][:100] }
254
255type headerGNU [blockSize]byte
256
257func (h *headerGNU) V7() *headerV7       { return (*headerV7)(h) }
258func (h *headerGNU) Magic() []byte       { return h[257:][:6] }
259func (h *headerGNU) Version() []byte     { return h[263:][:2] }
260func (h *headerGNU) UserName() []byte    { return h[265:][:32] }
261func (h *headerGNU) GroupName() []byte   { return h[297:][:32] }
262func (h *headerGNU) DevMajor() []byte    { return h[329:][:8] }
263func (h *headerGNU) DevMinor() []byte    { return h[337:][:8] }
264func (h *headerGNU) AccessTime() []byte  { return h[345:][:12] }
265func (h *headerGNU) ChangeTime() []byte  { return h[357:][:12] }
266func (h *headerGNU) Sparse() sparseArray { return (sparseArray)(h[386:][:24*4+1]) }
267func (h *headerGNU) RealSize() []byte    { return h[483:][:12] }
268
269type headerSTAR [blockSize]byte
270
271func (h *headerSTAR) V7() *headerV7      { return (*headerV7)(h) }
272func (h *headerSTAR) Magic() []byte      { return h[257:][:6] }
273func (h *headerSTAR) Version() []byte    { return h[263:][:2] }
274func (h *headerSTAR) UserName() []byte   { return h[265:][:32] }
275func (h *headerSTAR) GroupName() []byte  { return h[297:][:32] }
276func (h *headerSTAR) DevMajor() []byte   { return h[329:][:8] }
277func (h *headerSTAR) DevMinor() []byte   { return h[337:][:8] }
278func (h *headerSTAR) Prefix() []byte     { return h[345:][:131] }
279func (h *headerSTAR) AccessTime() []byte { return h[476:][:12] }
280func (h *headerSTAR) ChangeTime() []byte { return h[488:][:12] }
281func (h *headerSTAR) Trailer() []byte    { return h[508:][:4] }
282
283type headerUSTAR [blockSize]byte
284
285func (h *headerUSTAR) V7() *headerV7     { return (*headerV7)(h) }
286func (h *headerUSTAR) Magic() []byte     { return h[257:][:6] }
287func (h *headerUSTAR) Version() []byte   { return h[263:][:2] }
288func (h *headerUSTAR) UserName() []byte  { return h[265:][:32] }
289func (h *headerUSTAR) GroupName() []byte { return h[297:][:32] }
290func (h *headerUSTAR) DevMajor() []byte  { return h[329:][:8] }
291func (h *headerUSTAR) DevMinor() []byte  { return h[337:][:8] }
292func (h *headerUSTAR) Prefix() []byte    { return h[345:][:155] }
293
294type sparseArray []byte
295
296func (s sparseArray) Entry(i int) sparseElem { return (sparseElem)(s[i*24:]) }
297func (s sparseArray) IsExtended() []byte     { return s[24*s.MaxEntries():][:1] }
298func (s sparseArray) MaxEntries() int        { return len(s) / 24 }
299
// sparseElem is a byte slice read as two consecutive 12-byte fields:
// Offset, then Length.
type sparseElem []byte

// Offset returns the first 12 bytes of s.
func (s sparseElem) Offset() []byte {
	return s[:12]
}

// Length returns the 12 bytes of s that follow the Offset field.
func (s sparseElem) Length() []byte {
	afterOffset := s[12:]
	return afterOffset[:12]
}
304