1/*
2Copyright 2011 The Perkeep Authors
3
4Licensed under the Apache License, Version 2.0 (the "License");
5you may not use this file except in compliance with the License.
6You may obtain a copy of the License at
7
8     http://www.apache.org/licenses/LICENSE-2.0
9
10Unless required by applicable law or agreed to in writing, software
11distributed under the License is distributed on an "AS IS" BASIS,
12WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
13See the License for the specific language governing permissions and
14limitations under the License.
15*/
16
17// Package schema manipulates Camlistore schema blobs.
18//
19// A schema blob is a JSON-encoded blob that describes other blobs.
20// See documentation in Perkeep's doc/schema/ directory.
21package schema // import "perkeep.org/pkg/schema"
22
23import (
24	"bytes"
25	"context"
26	"crypto/rand"
27	"encoding/base64"
28	"encoding/json"
29	"errors"
30	"fmt"
31	"hash"
32	"io"
33	"log"
34	"os"
35	"regexp"
36	"strconv"
37	"strings"
38	"sync"
39	"time"
40	"unicode/utf8"
41
42	"github.com/bradfitz/latlong"
43	"perkeep.org/pkg/blob"
44
45	"github.com/rwcarlsen/goexif/exif"
46	"github.com/rwcarlsen/goexif/tiff"
47	"go4.org/strutil"
48	"go4.org/types"
49)
50
51func init() {
52	// Intern common strings as used by schema blobs (camliType values), to reduce
53	// index memory usage, which uses strutil.StringFromBytes.
54	strutil.RegisterCommonString(
55		"bytes",
56		"claim",
57		"directory",
58		"file",
59		"permanode",
60		"share",
61		"static-set",
62		"symlink",
63	)
64}
65
// MaxSchemaBlobSize represents the upper bound for how large
// a schema blob may be.
const MaxSchemaBlobSize = 1 << 20

var (
	// ErrNoCamliVersion is returned when a schema map lacks the
	// required "camliVersion" key.
	ErrNoCamliVersion = errors.New("schema: no camliVersion key in map")
)

// clockNow is a function variable so the notion of the current time
// can be overridden.
var clockNow = time.Now
75
// StatHasher is the pair of filesystem operations needed when
// describing a file: stat'ing it (without following symlinks) and
// hashing its contents into a blob ref.
type StatHasher interface {
	Lstat(fileName string) (os.FileInfo, error)
	Hash(fileName string) (blob.Ref, error)
}
80
// File is the interface returned when opening a DirectoryEntry that
// is a regular file.
type File interface {
	io.Closer
	io.ReaderAt
	io.Reader
	// Size returns the file's size in bytes.
	Size() int64
}
89
// Directory is a read-only interface to a "directory" schema blob.
type Directory interface {
	// Readdir reads the contents of the directory associated with dr
	// and returns an array of up to n DirectoryEntries structures.
	// Subsequent calls on the same file will yield further
	// DirectoryEntries.
	// If n > 0, Readdir returns at most n DirectoryEntry structures. In
	// this case, if Readdir returns an empty slice, it will return
	// a non-nil error explaining why. At the end of a directory,
	// the error is io.EOF.
	// If n <= 0, Readdir returns all the DirectoryEntries from the
	// directory in a single slice. In this case, if Readdir succeeds
	// (reads all the way to the end of the directory), it returns the
	// slice and a nil error. If it encounters an error before the
	// end of the directory, Readdir returns the DirectoryEntry read
	// until that point and a non-nil error.
	Readdir(ctx context.Context, n int) ([]DirectoryEntry, error)
}
108
// Symlink is the read-only interface to a "symlink" schema blob.
type Symlink interface {
	// .. TODO
}

// FIFO is the read-only interface to a "fifo" schema blob.
type FIFO interface {
	// .. TODO
}

// Socket is the read-only interface to a "socket" schema blob.
type Socket interface {
	// .. TODO
}
122
// DirectoryEntry is a read-only interface to an entry in a (static)
// directory.
type DirectoryEntry interface {
	// CamliType returns the schema blob's "camliType" field.
	// This may be "file", "directory", "symlink", or other more
	// obscure types added in the future.
	CamliType() string

	// FileName returns the entry's base filename.
	FileName() string
	// BlobRef returns the blobref of the entry's schema blob.
	BlobRef() blob.Ref

	File(ctx context.Context) (File, error)           // if camliType is "file"
	Directory(ctx context.Context) (Directory, error) // if camliType is "directory"
	Symlink() (Symlink, error)                        // if camliType is "symlink"
	FIFO() (FIFO, error)                              // if camliType is "fifo"
	Socket() (Socket, error)                          // if camliType is "socket"
}
140
// dirEntry is the default implementation of DirectoryEntry.
// The fr and dr fields lazily cache the opened reader for the
// underlying file or directory.
type dirEntry struct {
	ss      superset
	fetcher blob.Fetcher
	fr      *FileReader // or nil if not a file
	dr      *DirReader  // or nil if not a directory
}

// A SearchQuery must be of type *search.SearchQuery.
// This type breaks an otherwise-circular dependency.
type SearchQuery interface{}
152
153func (de *dirEntry) CamliType() string {
154	return de.ss.Type
155}
156
157func (de *dirEntry) FileName() string {
158	return de.ss.FileNameString()
159}
160
161func (de *dirEntry) BlobRef() blob.Ref {
162	return de.ss.BlobRef
163}
164
165func (de *dirEntry) File(ctx context.Context) (File, error) {
166	if de.fr == nil {
167		if de.ss.Type != "file" {
168			return nil, fmt.Errorf("DirectoryEntry is camliType %q, not %q", de.ss.Type, "file")
169		}
170		fr, err := NewFileReader(ctx, de.fetcher, de.ss.BlobRef)
171		if err != nil {
172			return nil, err
173		}
174		de.fr = fr
175	}
176	return de.fr, nil
177}
178
179func (de *dirEntry) Directory(ctx context.Context) (Directory, error) {
180	if de.dr == nil {
181		if de.ss.Type != "directory" {
182			return nil, fmt.Errorf("DirectoryEntry is camliType %q, not %q", de.ss.Type, "directory")
183		}
184		dr, err := NewDirReader(ctx, de.fetcher, de.ss.BlobRef)
185		if err != nil {
186			return nil, err
187		}
188		de.dr = dr
189	}
190	return de.dr, nil
191}
192
193func (de *dirEntry) Symlink() (Symlink, error) {
194	return 0, errors.New("TODO: Symlink not implemented")
195}
196
197func (de *dirEntry) FIFO() (FIFO, error) {
198	return 0, errors.New("TODO: FIFO not implemented")
199}
200
201func (de *dirEntry) Socket() (Socket, error) {
202	return 0, errors.New("TODO: Socket not implemented")
203}
204
205// newDirectoryEntry takes a superset and returns a DirectoryEntry if
206// the Supserset is valid and represents an entry in a directory.  It
207// must by of type "file", "directory", "symlink" or "socket".
208// TODO: "char", block", probably.  later.
209func newDirectoryEntry(fetcher blob.Fetcher, ss *superset) (DirectoryEntry, error) {
210	if ss == nil {
211		return nil, errors.New("ss was nil")
212	}
213	if !ss.BlobRef.Valid() {
214		return nil, errors.New("ss.BlobRef was invalid")
215	}
216	switch ss.Type {
217	case "file", "directory", "symlink", "fifo", "socket":
218		// Okay
219	default:
220		return nil, fmt.Errorf("invalid DirectoryEntry camliType of %q", ss.Type)
221	}
222	de := &dirEntry{ss: *ss, fetcher: fetcher} // defensive copy
223	return de, nil
224}
225
226// NewDirectoryEntryFromBlobRef takes a BlobRef and returns a
227//  DirectoryEntry if the BlobRef contains a type "file", "directory",
228//  "symlink", "fifo" or "socket".
229// TODO: ""char", "block", probably.  later.
230func NewDirectoryEntryFromBlobRef(ctx context.Context, fetcher blob.Fetcher, blobRef blob.Ref) (DirectoryEntry, error) {
231	ss := new(superset)
232	err := ss.setFromBlobRef(ctx, fetcher, blobRef)
233	if err != nil {
234		return nil, fmt.Errorf("schema/filereader: can't fill superset: %v", err)
235	}
236	return newDirectoryEntry(fetcher, ss)
237}
238
// superset represents the superset of common Perkeep JSON schema
// keys as a convenient json.Unmarshal target.
// TODO(bradfitz): unexport this type. Getting too gross. Move to schema.Blob
type superset struct {
	// BlobRef isn't for a particular metadata blob field, but included
	// for convenience.
	BlobRef blob.Ref

	// Fields common to all schema blobs:
	Version int    `json:"camliVersion"`
	Type    string `json:"camliType"`

	// Signature fields, as found on signed (claim) blobs:
	Signer blob.Ref `json:"camliSigner"`
	Sig    string   `json:"camliSig"`

	// Claim fields:
	ClaimType string         `json:"claimType"`
	ClaimDate types.Time3339 `json:"claimDate"`

	// Attribute-claim fields (set/add/del-attribute):
	Permanode blob.Ref `json:"permaNode"`
	Attribute string   `json:"attribute"`
	Value     string   `json:"value"`

	// FileName and FileNameBytes represent one of the two
	// representations of file names in schema blobs.  They should
	// not be accessed directly.  Use the FileNameString accessor
	// instead, which also sanitizes malicious values.
	FileName      string        `json:"fileName"`
	FileNameBytes []interface{} `json:"fileNameBytes"`

	// SymlinkTarget and SymlinkTargetBytes are the analogous pair of
	// representations for a symlink's target; see SymlinkTargetString.
	SymlinkTarget      string        `json:"symlinkTarget"`
	SymlinkTargetBytes []interface{} `json:"symlinkTargetBytes"`

	// Unix metadata; see FileMode, MapUid, MapGid and ModTime.
	UnixPermission string `json:"unixPermission"`
	UnixOwnerId    int    `json:"unixOwnerId"`
	UnixOwner      string `json:"unixOwner"`
	UnixGroupId    int    `json:"unixGroupId"`
	UnixGroup      string `json:"unixGroup"`
	UnixMtime      string `json:"unixMtime"`
	UnixCtime      string `json:"unixCtime"`
	UnixAtime      string `json:"unixAtime"`

	// Parts are references to the data chunks of a regular file (or a "bytes" schema blob).
	// See doc/schema/bytes.txt and doc/schema/files/file.txt.
	Parts []*BytesPart `json:"parts"`

	Entries   blob.Ref   `json:"entries"`   // for directories, a blobref to a static-set
	Members   []blob.Ref `json:"members"`   // for static sets (for directory static-sets: blobrefs to child dirs/files)
	MergeSets []blob.Ref `json:"mergeSets"` // each is a "sub static-set", that has either Members or MergeSets. For large dirs.

	// Search allows a "share" blob to share an entire search. Contrast with "target".
	Search SearchQuery `json:"search"`
	// Target is a "share" blob's target (the thing being shared)
	// Or it is the object being deleted in a DeleteClaim claim.
	Target blob.Ref `json:"target"`
	// Transitive is a property of a "share" blob.
	Transitive bool `json:"transitive"`
	// AuthType is a "share" blob's authentication type that is required.
	// Currently (2013-01-02) just "haveref" (if you know the share's blobref,
	// you get access: the secret URL model)
	AuthType string         `json:"authType"`
	Expires  types.Time3339 `json:"expires"` // or zero for no expiration
}
300
301func parseSuperset(r io.Reader) (*superset, error) {
302	var ss superset
303	if err := json.NewDecoder(io.LimitReader(r, MaxSchemaBlobSize)).Decode(&ss); err != nil {
304		return nil, err
305	}
306	return &ss, nil
307}
308
// BlobFromReader returns a new Blob from the provided Reader r,
// which should be the body of the provided blobref.
// Note: the hash checksum is not verified.
func BlobFromReader(ref blob.Ref, r io.Reader) (*Blob, error) {
	if !ref.Valid() {
		return nil, errors.New("schema.BlobFromReader: invalid blobref")
	}
	// Tee everything read into buf so the raw JSON text can be kept
	// alongside the decoded superset.
	var buf bytes.Buffer
	tee := io.TeeReader(r, &buf)
	ss, err := parseSuperset(tee)
	if err != nil {
		return nil, err
	}
	// The JSON decoder may stop before the end of r. Anything left
	// over must be ASCII whitespace, or the input isn't a pure JSON
	// schema blob.
	var wb [16]byte
	afterObj := 0
	for {
		n, err := tee.Read(wb[:])
		afterObj += n
		for i := 0; i < n; i++ {
			if !isASCIIWhite(wb[i]) {
				return nil, fmt.Errorf("invalid bytes after JSON schema blob in %v", ref)
			}
		}
		if afterObj > MaxSchemaBlobSize {
			// Excessive trailing whitespace; the size check below
			// rejects it, since buf includes these bytes too.
			break
		}
		if err == io.EOF {
			break
		}
		if err != nil {
			return nil, err
		}
	}
	json := buf.String()
	if len(json) > MaxSchemaBlobSize {
		return nil, fmt.Errorf("schema: metadata blob %v is over expected limit; size=%d", ref, len(json))
	}
	return &Blob{ref, json, ss}, nil
}
348
// isASCIIWhite reports whether b is an ASCII space, tab, carriage
// return, or newline.
func isASCIIWhite(b byte) bool {
	return b == ' ' || b == '\t' || b == '\r' || b == '\n'
}
356
// BytesPart is the type representing one of the "parts" in a "file"
// or "bytes" JSON schema.
//
// See doc/schema/bytes.txt and doc/schema/files/file.txt.
type BytesPart struct {
	// Size is the number of bytes that this part contributes to the overall segment.
	Size uint64 `json:"size"`

	// At most one of BlobRef or BytesRef must be non-zero
	// (Valid), but it's illegal for both.
	// If neither are set, this BytesPart represents Size zero bytes.
	// BlobRef refers to raw bytes. BytesRef references a "bytes" schema blob.
	BlobRef  blob.Ref `json:"blobRef,omitempty"`
	BytesRef blob.Ref `json:"bytesRef,omitempty"`

	// Offset optionally specifies the offset into BlobRef to skip
	// when reading Size bytes.
	Offset uint64 `json:"offset,omitempty"`
}
376
// stringFromMixedArray joins a slice of either strings or float64
// values (as retrieved from JSON decoding) into a string. These are
// used for non-UTF8 filenames in "fileNameBytes" fields. The strings
// are UTF-8 segments and the float64s (actually uint8 values) are
// byte values. Elements of any other type are skipped.
func stringFromMixedArray(parts []interface{}) string {
	var out bytes.Buffer
	for _, p := range parts {
		switch v := p.(type) {
		case string:
			out.WriteString(v)
		case float64:
			out.WriteByte(byte(v))
		}
	}
	return out.String()
}
396
397// mixedArrayFromString is the inverse of stringFromMixedArray. It
398// splits a string to a series of either UTF-8 strings and non-UTF-8
399// bytes.
400func mixedArrayFromString(s string) (parts []interface{}) {
401	for len(s) > 0 {
402		if n := utf8StrLen(s); n > 0 {
403			parts = append(parts, s[:n])
404			s = s[n:]
405		} else {
406			parts = append(parts, s[0])
407			s = s[1:]
408		}
409	}
410	return parts
411}
412
// utf8StrLen returns how many prefix bytes of s are valid UTF-8.
//
// Fix: the RuneError check must be an `if`, not a `for`. With `for`,
// a properly encoded U+FFFD (whose decode size is 3, not 1) makes the
// inner loop spin forever, since r never changes inside it.
func utf8StrLen(s string) int {
	for i, r := range s {
		if r == utf8.RuneError {
			// The RuneError value can be an error
			// sentinel value (if it's size 1) or the same
			// value encoded properly. Decode it to see if
			// it's the 1 byte sentinel value.
			_, size := utf8.DecodeRuneInString(s[i:])
			if size == 1 {
				return i
			}
		}
	}
	return len(s)
}
429
430func (ss *superset) SumPartsSize() (size uint64) {
431	for _, part := range ss.Parts {
432		size += uint64(part.Size)
433	}
434	return size
435}
436
437func (ss *superset) SymlinkTargetString() string {
438	if ss.SymlinkTarget != "" {
439		return ss.SymlinkTarget
440	}
441	return stringFromMixedArray(ss.SymlinkTargetBytes)
442}
443
444// FileNameString returns the schema blob's base filename.
445//
446// If the fileName field of the blob accidentally or maliciously
447// contains a slash, this function returns an empty string instead.
448func (ss *superset) FileNameString() string {
449	v := ss.FileName
450	if v == "" {
451		v = stringFromMixedArray(ss.FileNameBytes)
452	}
453	if v != "" {
454		if strings.Contains(v, "/") {
455			// Bogus schema blob; ignore.
456			return ""
457		}
458		if strings.Contains(v, "\\") {
459			// Bogus schema blob; ignore.
460			return ""
461		}
462	}
463	return v
464}
465
466func (ss *superset) HasFilename(name string) bool {
467	return ss.FileNameString() == name
468}
469
470func (b *Blob) FileMode() os.FileMode {
471	// TODO: move this to a different type, off *Blob
472	return b.ss.FileMode()
473}
474
475func (ss *superset) FileMode() os.FileMode {
476	var mode os.FileMode
477	hasPerm := ss.UnixPermission != ""
478	if hasPerm {
479		m64, err := strconv.ParseUint(ss.UnixPermission, 8, 64)
480		if err == nil {
481			mode = mode | os.FileMode(m64)
482		}
483	}
484
485	// TODO: add other types (block, char, etc)
486	switch ss.Type {
487	case "directory":
488		mode = mode | os.ModeDir
489	case "file":
490		// No extra bit.
491	case "symlink":
492		mode = mode | os.ModeSymlink
493	case "fifo":
494		mode = mode | os.ModeNamedPipe
495	case "socket":
496		mode = mode | os.ModeSocket
497	}
498	if !hasPerm {
499		switch ss.Type {
500		case "directory":
501			mode |= 0755
502		default:
503			mode |= 0644
504		}
505	}
506	return mode
507}
508
509// MapUid returns the most appropriate mapping from this file's owner
510// to the local machine's owner, trying first a match by name,
511// followed by just mapping the number through directly.
512func (b *Blob) MapUid() int { return b.ss.MapUid() }
513
514// MapGid returns the most appropriate mapping from this file's group
515// to the local machine's group, trying first a match by name,
516// followed by just mapping the number through directly.
517func (b *Blob) MapGid() int { return b.ss.MapGid() }
518
519func (ss *superset) MapUid() int {
520	if ss.UnixOwner != "" {
521		uid, ok := getUidFromName(ss.UnixOwner)
522		if ok {
523			return uid
524		}
525	}
526	return ss.UnixOwnerId // TODO: will be 0 if unset, which isn't ideal
527}
528
529func (ss *superset) MapGid() int {
530	if ss.UnixGroup != "" {
531		gid, ok := getGidFromName(ss.UnixGroup)
532		if ok {
533			return gid
534		}
535	}
536	return ss.UnixGroupId // TODO: will be 0 if unset, which isn't ideal
537}
538
539func (ss *superset) ModTime() time.Time {
540	if ss.UnixMtime == "" {
541		return time.Time{}
542	}
543	t, err := time.Parse(time.RFC3339, ss.UnixMtime)
544	if err != nil {
545		return time.Time{}
546	}
547	return t
548}
549
550var DefaultStatHasher = &defaultStatHasher{}
551
552type defaultStatHasher struct{}
553
554func (d *defaultStatHasher) Lstat(fileName string) (os.FileInfo, error) {
555	return os.Lstat(fileName)
556}
557
558func (d *defaultStatHasher) Hash(fileName string) (blob.Ref, error) {
559	h := blob.NewHash()
560	file, err := os.Open(fileName)
561	if err != nil {
562		return blob.Ref{}, err
563	}
564	defer file.Close()
565	_, err = io.Copy(h, file)
566	if err != nil {
567		return blob.Ref{}, err
568	}
569	return blob.RefFromHash(h), nil
570}
571
// maxStaticSetMembers is the maximum number of static-set members in a
// static-set schema. As noted in
// https://github.com/camlistore/camlistore/issues/924 , 33k members result in a
// 1.7MB blob, so 10k members seems reasonable to stay under the MaxSchemaBlobSize (1MB)
// limit. This is not a const, so we can lower it during tests and test the logic
// without having to create thousands of blobs.
var maxStaticSetMembers = 10000

// NewStaticSet returns the "static-set" schema for a directory. Its members
// should be populated with SetStaticSetMembers.
func NewStaticSet() *Builder {
	return base(1, "static-set")
}
584
585// SetStaticSetMembers sets the given members as the static-set members of this
586// builder. If the members are so numerous that they would not fit on a schema
587// blob, they are spread (recursively, if needed) onto sub static-sets. In which
588// case, these subsets are set as "mergeSets" of this builder. All the created
589// subsets are returned, so the caller can upload them along with the top
590// static-set created from this builder.
591// SetStaticSetMembers panics if bb isn't a "static-set" claim type.
592func (bb *Builder) SetStaticSetMembers(members []blob.Ref) []*Blob {
593	if bb.Type() != "static-set" {
594		panic("called SetStaticSetMembers on non static-set")
595	}
596
597	if len(members) <= maxStaticSetMembers {
598		ms := make([]string, len(members))
599		for i := range members {
600			ms[i] = members[i].String()
601		}
602		bb.m["members"] = ms
603		return nil
604	}
605
606	// too many members to fit in one static-set, so we spread them in
607	// several sub static-sets.
608	subsetsNumber := len(members) / maxStaticSetMembers
609	var perSubset int
610	if subsetsNumber < maxStaticSetMembers {
611		// this means we can fill each subset up to maxStaticSetMembers,
612		// and stash the rest in one last subset.
613		perSubset = maxStaticSetMembers
614	} else {
615		// otherwise we need to divide the members evenly in
616		// (maxStaticSetMembers - 1) subsets, and each of these subsets
617		// will also (recursively) have subsets of its own. There might
618		// also be a rest in one last subset, as above.
619		subsetsNumber = maxStaticSetMembers - 1
620		perSubset = len(members) / subsetsNumber
621	}
622	// only the subsets at this level
623	subsets := make([]*Blob, 0, subsetsNumber)
624	// subsets at this level, plus all the children subsets.
625	allSubsets := make([]*Blob, 0, subsetsNumber)
626	for i := 0; i < subsetsNumber; i++ {
627		ss := NewStaticSet()
628		subss := ss.SetStaticSetMembers(members[i*perSubset : (i+1)*perSubset])
629		subsets = append(subsets, ss.Blob())
630		allSubsets = append(allSubsets, ss.Blob())
631		for _, v := range subss {
632			allSubsets = append(allSubsets, v)
633		}
634	}
635
636	// Deal with the rest (of the euclidian division)
637	if perSubset*subsetsNumber < len(members) {
638		ss := NewStaticSet()
639		ss.SetStaticSetMembers(members[perSubset*subsetsNumber:])
640		allSubsets = append(allSubsets, ss.Blob())
641		subsets = append(subsets, ss.Blob())
642	}
643
644	mss := make([]string, len(subsets))
645	for i := range subsets {
646		mss[i] = subsets[i].BlobRef().String()
647	}
648	bb.m["mergeSets"] = mss
649	return allSubsets
650}
651
652func base(version int, ctype string) *Builder {
653	return &Builder{map[string]interface{}{
654		"camliVersion": version,
655		"camliType":    ctype,
656	}}
657}
658
659// NewUnsignedPermanode returns a new random permanode, not yet signed.
660func NewUnsignedPermanode() *Builder {
661	bb := base(1, "permanode")
662	chars := make([]byte, 20)
663	_, err := io.ReadFull(rand.Reader, chars)
664	if err != nil {
665		panic("error reading random bytes: " + err.Error())
666	}
667	bb.m["random"] = base64.StdEncoding.EncodeToString(chars)
668	return bb
669}
670
671// NewPlannedPermanode returns a permanode with a fixed key.  Like
672// NewUnsignedPermanode, this builder is also not yet signed.  Callers of
673// NewPlannedPermanode must sign the map with a fixed claimDate and
674// GPG date to create consistent JSON encodings of the Map (its
675// blobref), between runs.
676func NewPlannedPermanode(key string) *Builder {
677	bb := base(1, "permanode")
678	bb.m["key"] = key
679	return bb
680}
681
682// NewHashPlannedPermanode returns a planned permanode with the sum
683// of the hash, prefixed with "sha1-", as the key.
684func NewHashPlannedPermanode(h hash.Hash) *Builder {
685	return NewPlannedPermanode(blob.RefFromHash(h).String())
686}
687
688// JSON returns the map m encoded as JSON in its
689// recommended canonical form. The canonical form is readable with newlines and indentation,
690// and always starts with the header bytes:
691//
692//   {"camliVersion":
693//
694func mapJSON(m map[string]interface{}) (string, error) {
695	version, hasVersion := m["camliVersion"]
696	if !hasVersion {
697		return "", ErrNoCamliVersion
698	}
699	delete(m, "camliVersion")
700	jsonBytes, err := json.MarshalIndent(m, "", "  ")
701	if err != nil {
702		return "", err
703	}
704	m["camliVersion"] = version
705	var buf bytes.Buffer
706	fmt.Fprintf(&buf, "{\"camliVersion\": %v,\n", version)
707	buf.Write(jsonBytes[2:])
708	return buf.String(), nil
709}
710
711// NewFileMap returns a new builder of a type "file" schema for the provided fileName.
712// The chunk parts of the file are not populated.
713func NewFileMap(fileName string) *Builder {
714	return newCommonFilenameMap(fileName).SetType("file")
715}
716
717// NewDirMap returns a new builder of a type "directory" schema for the provided fileName.
718func NewDirMap(fileName string) *Builder {
719	return newCommonFilenameMap(fileName).SetType("directory")
720}
721
722func newCommonFilenameMap(fileName string) *Builder {
723	bb := base(1, "" /* no type yet */)
724	if fileName != "" {
725		bb.SetFileName(fileName)
726	}
727	return bb
728}
729
730var populateSchemaStat []func(schemaMap map[string]interface{}, fi os.FileInfo)
731
732func NewCommonFileMap(fileName string, fi os.FileInfo) *Builder {
733	bb := newCommonFilenameMap(fileName)
734	// Common elements (from file-common.txt)
735	if fi.Mode()&os.ModeSymlink == 0 {
736		bb.m["unixPermission"] = fmt.Sprintf("0%o", fi.Mode().Perm())
737	}
738
739	// OS-specific population; defined in schema_posix.go, etc. (not on App Engine)
740	for _, f := range populateSchemaStat {
741		f(bb.m, fi)
742	}
743
744	if mtime := fi.ModTime(); !mtime.IsZero() {
745		bb.m["unixMtime"] = RFC3339FromTime(mtime)
746	}
747	return bb
748}
749
750// PopulateParts sets the "parts" field of the blob with the provided
751// parts.  The sum of the sizes of parts must match the provided size
752// or an error is returned.  Also, each BytesPart may only contain either
753// a BytesPart or a BlobRef, but not both.
754func (bb *Builder) PopulateParts(size int64, parts []BytesPart) error {
755	return populateParts(bb.m, size, parts)
756}
757
758func populateParts(m map[string]interface{}, size int64, parts []BytesPart) error {
759	sumSize := int64(0)
760	mparts := make([]map[string]interface{}, len(parts))
761	for idx, part := range parts {
762		mpart := make(map[string]interface{})
763		mparts[idx] = mpart
764		switch {
765		case part.BlobRef.Valid() && part.BytesRef.Valid():
766			return errors.New("schema: part contains both BlobRef and BytesRef")
767		case part.BlobRef.Valid():
768			mpart["blobRef"] = part.BlobRef.String()
769		case part.BytesRef.Valid():
770			mpart["bytesRef"] = part.BytesRef.String()
771		default:
772			return errors.New("schema: part must contain either a BlobRef or BytesRef")
773		}
774		mpart["size"] = part.Size
775		sumSize += int64(part.Size)
776		if part.Offset != 0 {
777			mpart["offset"] = part.Offset
778		}
779	}
780	if sumSize != size {
781		return fmt.Errorf("schema: declared size %d doesn't match sum of parts size %d", size, sumSize)
782	}
783	m["parts"] = mparts
784	return nil
785}
786
787func newBytes() *Builder {
788	return base(1, "bytes")
789}
790
// ClaimType is one of the valid "claimType" fields in a "claim" schema blob. See doc/schema/claims/.
type ClaimType string

// The claim types known to this package.
const (
	SetAttributeClaim ClaimType = "set-attribute"
	AddAttributeClaim ClaimType = "add-attribute"
	DelAttributeClaim ClaimType = "del-attribute"
	ShareClaim        ClaimType = "share"
	// DeleteClaim deletes a permanode or another claim.
	// A delete claim can itself be deleted, and so on.
	DeleteClaim ClaimType = "delete"
)
803
// claimParam is used to populate a claim map when building a new claim.
// Which fields are meaningful depends on claimType; see populateClaimMap.
type claimParam struct {
	claimType ClaimType // required

	// Params specific to *Attribute claims:
	permanode blob.Ref // modified permanode
	attribute string   // required
	value     string   // optional if Type == DelAttributeClaim

	// Params specific to ShareClaim claims:
	authType   string
	transitive bool

	// Params specific to ShareClaim and DeleteClaim claims.
	target blob.Ref
}
820
821func newClaim(claims ...*claimParam) *Builder {
822	bb := base(1, "claim")
823	bb.SetClaimDate(clockNow())
824	if len(claims) == 1 {
825		cp := claims[0]
826		populateClaimMap(bb.m, cp)
827		return bb
828	}
829	var claimList []interface{}
830	for _, cp := range claims {
831		m := map[string]interface{}{}
832		populateClaimMap(m, cp)
833		claimList = append(claimList, m)
834	}
835	bb.m["claimType"] = "multi"
836	bb.m["claims"] = claimList
837	return bb
838}
839
840func populateClaimMap(m map[string]interface{}, cp *claimParam) {
841	m["claimType"] = string(cp.claimType)
842	switch cp.claimType {
843	case ShareClaim:
844		m["authType"] = cp.authType
845		m["transitive"] = cp.transitive
846	case DeleteClaim:
847		m["target"] = cp.target.String()
848	default:
849		m["permaNode"] = cp.permanode.String()
850		m["attribute"] = cp.attribute
851		if !(cp.claimType == DelAttributeClaim && cp.value == "") {
852			m["value"] = cp.value
853		}
854	}
855}
856
857// NewShareRef creates a *Builder for a "share" claim.
858func NewShareRef(authType string, transitive bool) *Builder {
859	return newClaim(&claimParam{
860		claimType:  ShareClaim,
861		authType:   authType,
862		transitive: transitive,
863	})
864}
865
866func NewSetAttributeClaim(permaNode blob.Ref, attr, value string) *Builder {
867	return newClaim(&claimParam{
868		permanode: permaNode,
869		claimType: SetAttributeClaim,
870		attribute: attr,
871		value:     value,
872	})
873}
874
875func NewAddAttributeClaim(permaNode blob.Ref, attr, value string) *Builder {
876	return newClaim(&claimParam{
877		permanode: permaNode,
878		claimType: AddAttributeClaim,
879		attribute: attr,
880		value:     value,
881	})
882}
883
884// NewDelAttributeClaim creates a new claim to remove value from the
885// values set for the attribute attr of permaNode. If value is empty then
886// all the values for attribute are cleared.
887func NewDelAttributeClaim(permaNode blob.Ref, attr, value string) *Builder {
888	return newClaim(&claimParam{
889		permanode: permaNode,
890		claimType: DelAttributeClaim,
891		attribute: attr,
892		value:     value,
893	})
894}
895
896// NewDeleteClaim creates a new claim to delete a target claim or permanode.
897func NewDeleteClaim(target blob.Ref) *Builder {
898	return newClaim(&claimParam{
899		target:    target,
900		claimType: DeleteClaim,
901	})
902}
903
// ShareHaveRef is the auth type specifying that if you "have the
// reference" (know the blobref to the haveref share blob), then you
// have access to the referenced object from that share blob.
// This is the "send a link to a friend" access model.
const ShareHaveRef = "haveref"
909
910// UnknownLocation is a magic timezone value used when the actual location
911// of a time is unknown. For instance, EXIF files commonly have a time without
912// a corresponding location or timezone offset.
913var UnknownLocation = time.FixedZone("Unknown", -60) // 1 minute west
914
915// IsZoneKnown reports whether t is in a known timezone.
916// Perkeep uses the magic timezone offset of 1 minute west of UTC
917// to mean that the timezone wasn't known.
918func IsZoneKnown(t time.Time) bool {
919	if t.Location() == UnknownLocation {
920		return false
921	}
922	if _, off := t.Zone(); off == -60 {
923		return false
924	}
925	return true
926}
927
928// RFC3339FromTime returns an RFC3339-formatted time.
929//
930// If the timezone is known, the time will be converted to UTC and
931// returned with a "Z" suffix. For unknown zones, the timezone will be
932// "-00:01" (1 minute west of UTC).
933//
934// Fractional seconds are only included if the time has fractional
935// seconds.
936func RFC3339FromTime(t time.Time) string {
937	if IsZoneKnown(t) {
938		t = t.UTC()
939	}
940	if t.UnixNano()%1e9 == 0 {
941		return t.Format(time.RFC3339)
942	}
943	return t.Format(time.RFC3339Nano)
944}
945
var bytesCamliVersion = []byte("camliVersion")

// LikelySchemaBlob returns quickly whether buf likely contains (or is
// the prefix of) a schema blob: it must start with '{' and contain
// the string "camliVersion".
func LikelySchemaBlob(buf []byte) bool {
	return len(buf) > 0 &&
		buf[0] == '{' &&
		bytes.Contains(buf, bytesCamliVersion)
}
956
957// findSize checks if v is an *os.File or if it has
958// a Size() int64 method, to find its size.
959// It returns 0, false otherwise.
960func findSize(v interface{}) (size int64, ok bool) {
961	if fi, ok := v.(*os.File); ok {
962		v, _ = fi.Stat()
963	}
964	if sz, ok := v.(interface {
965		Size() int64
966	}); ok {
967		return sz.Size(), true
968	}
969	// For bytes.Reader, strings.Reader, etc:
970	if li, ok := v.(interface {
971		Len() int
972	}); ok {
973		ln := int64(li.Len()) // unread portion, typically
974		// If it's also a seeker, remove add any seek offset:
975		if sk, ok := v.(io.Seeker); ok {
976			if cur, err := sk.Seek(0, 1); err == nil {
977				ln += cur
978			}
979		}
980		return ln, true
981	}
982	return 0, false
983}
984
// FileTime returns the best guess of the file's creation time (or modtime).
// If the file doesn't have its own metadata indicating the creation time (such as in EXIF),
// FileTime uses the modification time from the file system.
// If there was a valid EXIF but an error while trying to get a date from it,
// it logs the error and tries the other methods.
func FileTime(f io.ReaderAt) (time.Time, error) {
	var ct time.Time
	// defaultTime is the fallback: the filesystem's modtime, which is
	// only available when f is an *os.File.
	defaultTime := func() (time.Time, error) {
		if osf, ok := f.(*os.File); ok {
			fi, err := osf.Stat()
			if err != nil {
				return ct, fmt.Errorf("Failed to find a modtime: stat: %v", err)
			}
			return fi.ModTime(), nil
		}
		return ct, errors.New("all methods failed to find a creation time or modtime")
	}

	size, ok := findSize(f)
	if !ok {
		size = 256 << 10 // enough to get the EXIF
	}
	r := io.NewSectionReader(f, 0, size)
	var tiffErr error
	ex, err := exif.Decode(r)
	if err != nil {
		// Remember the decode error: a non-critical error still lets
		// DateTime below be attempted, and the GPS check later needs it.
		tiffErr = err
		if exif.IsShortReadTagValueError(err) {
			// Not enough bytes to decide; caller may retry with more data.
			return ct, io.ErrUnexpectedEOF
		}
		if exif.IsCriticalError(err) || exif.IsExifError(err) {
			return defaultTime()
		}
	}
	ct, err = ex.DateTime()
	if err != nil {
		return defaultTime()
	}
	// If the EXIF file only had local timezone, but it did have
	// GPS, then lookup the timezone and correct the time.
	if ct.Location() == time.Local {
		if exif.IsGPSError(tiffErr) {
			log.Printf("Invalid EXIF GPS data: %v", tiffErr)
			return ct, nil
		}
		if lat, long, err := ex.LatLong(); err == nil {
			if loc := lookupLocation(latlong.LookupZoneName(lat, long)); loc != nil {
				if t, err := exifDateTimeInLocation(ex, loc); err == nil {
					return t, nil
				}
			}
		} else if !exif.IsTagNotPresentError(err) {
			log.Printf("Invalid EXIF GPS data: %v", err)
		}
	}
	return ct, nil
}
1042
1043// This is basically a copy of the exif.Exif.DateTime() method, except:
1044//   * it takes a *time.Location to assume
1045//   * the caller already assumes there's no timezone offset or GPS time
1046//     in the EXIF, so any of that code can be ignored.
1047func exifDateTimeInLocation(x *exif.Exif, loc *time.Location) (time.Time, error) {
1048	tag, err := x.Get(exif.DateTimeOriginal)
1049	if err != nil {
1050		tag, err = x.Get(exif.DateTime)
1051		if err != nil {
1052			return time.Time{}, err
1053		}
1054	}
1055	if tag.Format() != tiff.StringVal {
1056		return time.Time{}, errors.New("DateTime[Original] not in string format")
1057	}
1058	const exifTimeLayout = "2006:01:02 15:04:05"
1059	dateStr := strings.TrimRight(string(tag.Val), "\x00")
1060	return time.ParseInLocation(exifTimeLayout, dateStr, loc)
1061}
1062
1063var zoneCache struct {
1064	sync.RWMutex
1065	m map[string]*time.Location
1066}
1067
1068func lookupLocation(zone string) *time.Location {
1069	if zone == "" {
1070		return nil
1071	}
1072	zoneCache.RLock()
1073	l, ok := zoneCache.m[zone]
1074	zoneCache.RUnlock()
1075	if ok {
1076		return l
1077	}
1078	// could use singleflight here, but doesn't really
1079	// matter if two callers both do this.
1080	loc, err := time.LoadLocation(zone)
1081
1082	zoneCache.Lock()
1083	if zoneCache.m == nil {
1084		zoneCache.m = make(map[string]*time.Location)
1085	}
1086	zoneCache.m[zone] = loc // even if nil
1087	zoneCache.Unlock()
1088
1089	if err != nil {
1090		log.Printf("failed to lookup timezone %q: %v", zone, err)
1091		return nil
1092	}
1093	return loc
1094}
1095
// boringTitlePattern matches filenames automatically generated by
// cameras and similar software.
var boringTitlePattern = regexp.MustCompile(`^(?:IMG_|DSC|PANO_|ESR_).*$`)

// IsInterestingTitle returns whether title would be interesting information as
// a title for a permanode. For example, filenames automatically created by
// cameras, such as IMG_XXXX.JPG, do not add any interesting value.
func IsInterestingTitle(title string) bool {
	return !boringTitlePattern.MatchString(title)
}
1104