package suture

// FIXMES in progress:
// 1. Ensure the supervisor actually gets to the terminated state for the
//     unstopped service report.
// 2. Save the unstopped service report in the supervisor.

import (
	"context"
	"errors"
	"fmt"
	"log"
	"math"
	"math/rand"
	"runtime"
	"sync"
	"time"
)

const (
	notRunning = iota
	normal
	paused
	terminated
)

type supervisorID uint32
type serviceID uint32

// ErrSupervisorNotRunning is returned by some methods if the supervisor is
// not running, either because it has not been started or because it has
// been terminated.
var ErrSupervisorNotRunning = errors.New("supervisor not running")
/*
Supervisor is the core type of this package. A Supervisor manages a set of
services, restarting them as configured when they fail.

Supervisors should be constructed either by New or NewSimple.

Once constructed, a Supervisor should be started in one of three ways:

 1. Calling .Serve(ctx).
 2. Calling .ServeBackground(ctx).
 3. Adding it to an existing Supervisor.

Calling Serve will cause the supervisor to run until the passed-in
context is cancelled. Often one of the last lines of the "main" func for a
program will be to call one of the Serve methods.

Calling ServeBackground will CORRECTLY start the supervisor running in a
new goroutine. It is risky to directly run

  go supervisor.Serve(ctx)

because that creates a brief race condition as the supervisor starts up,
if you try to .Add() services immediately afterward.
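
For example, a minimal sketch of a safe startup sequence, assuming ctx is a
context.Context and svc is a hypothetical value implementing Service:

  supervisor := NewSimple("root")
  errC := supervisor.ServeBackground(ctx)
  supervisor.Add(svc) // safe: ServeBackground has already synchronized startup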

*/
type Supervisor struct {
	Name string

	spec Spec

	services             map[serviceID]serviceWithName
	cancellations        map[serviceID]context.CancelFunc
	servicesShuttingDown map[serviceID]serviceWithName
	lastFail             time.Time
	failures             float64
	restartQueue         []serviceID
	serviceCounter       serviceID
	control              chan supervisorMessage
	notifyServiceDone    chan serviceID
	resumeTimer          <-chan time.Time
	liveness             chan struct{}

	// Despite the context package's recommendation against storing a
	// context in a struct, the way suture works makes this the
	// exceptional case where it is reasonable.
	ctxMutex sync.Mutex
	ctx      context.Context
	// This function cancels this supervisor specifically.
	ctxCancel func()

	getNow       func() time.Time
	getAfterChan func(time.Duration) <-chan time.Time

	m sync.Mutex

	// The unstopped service report is generated when we finish
	// stopping.
	unstoppedServiceReport UnstoppedServiceReport

	// malign leftovers
	id    supervisorID
	state uint8
}

/*

New is the full constructor function for a supervisor.

The name is a friendly human name for the supervisor, used in logging. Suture
does not care if this is unique, but it is good for your sanity if it is.

If not set, the following values are used:

 * EventHook:         A function is created that uses log.Print.
 * FailureDecay:      30 seconds
 * FailureThreshold:  5 failures
 * FailureBackoff:    15 seconds
 * Timeout:           10 seconds
 * BackoffJitter:     DefaultJitter

The EventHook function will be called when errors occur. Suture will log the
following:

 * When a service has failed, with a descriptive message about the
   current backoff status, and whether it was immediately restarted
 * When the supervisor has gone into its backoff mode, and when it
   exits it
 * When a service fails to stop

FailureDecay, FailureThreshold, and FailureBackoff control how failures
are handled, in order to avoid the supervisor failure case where the
program does nothing but restart failed services. If you do not
care how failures behave, the default values should be fine for the
vast majority of services, but if you want the details:

The supervisor tracks the number of failures that have occurred, with an
exponential decay on the count. Every FailureDecay seconds, the number of
failures that have occurred is cut in half. (This is done smoothly with an
exponential function.) When a failure occurs, the number of failures
is incremented by one. When the number of failures passes the
FailureThreshold, the entire supervisor waits for FailureBackoff seconds
before attempting any further restarts, at which point it resets its
failure count to zero.
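
As a worked example with the defaults: if the failure count stands at 4 and
30 seconds (one FailureDecay interval) pass without a failure, the count
decays to 2; a failure arriving at that moment raises it to 3, which is
still below the default FailureThreshold of 5, so the service is restarted
immediately.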

Timeout is how long Suture will wait for a service to properly terminate.

The PassThroughPanics option can be set to let panics in services propagate
and crash the program, should this be desirable.

DontPropagateTermination indicates whether this supervisor tree will
propagate ErrTerminateSupervisorTree if a child service returns it. If false,
this supervisor will itself return an error that will terminate its
parent. If true, it will merely return ErrDoNotRestart. It defaults to false.
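
For example, a sketch of constructing a supervisor with explicit settings
(these values happen to match the defaults):

  supervisor := New("workers", Spec{
      FailureDecay:     30,
      FailureThreshold: 5,
      FailureBackoff:   15 * time.Second,
      Timeout:          10 * time.Second,
  })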

*/
func New(name string, spec Spec) *Supervisor {
	spec.configureDefaults(name)

	return &Supervisor{
		name,

		spec,

		// services
		make(map[serviceID]serviceWithName),
		// cancellations
		make(map[serviceID]context.CancelFunc),
		// servicesShuttingDown
		make(map[serviceID]serviceWithName),
		// lastFail, deliberately the zero time
		time.Time{},
		// failures
		0,
		// restartQueue
		make([]serviceID, 0, 1),
		// serviceCounter
		0,
		// control
		make(chan supervisorMessage),
		// notifyServiceDone
		make(chan serviceID),
		// resumeTimer
		make(chan time.Time),

		// liveness
		make(chan struct{}),

		// ctxMutex
		sync.Mutex{},
		// ctx
		nil,
		// ctxCancel
		nil,

		// the tests can override these for testing threshold
		// behavior
		// getNow
		time.Now,
		// getAfterChan
		time.After,

		// m
		sync.Mutex{},

		// unstoppedServiceReport
		nil,

		// id
		nextSupervisorID(),
		// state
		notRunning,
	}
}

func serviceName(service Service) (serviceName string) {
	stringer, canStringer := service.(fmt.Stringer)
	if canStringer {
		serviceName = stringer.String()
	} else {
		serviceName = fmt.Sprintf("%#v", service)
	}
	return
}

// NewSimple is a convenience function to create a supervisor with just a
// name and the sensible defaults.
func NewSimple(name string) *Supervisor {
	return New(name, Spec{})
}

// HasSupervisor is an interface that indicates the given struct contains a
// supervisor. If the struct is either already a *Supervisor, or it embeds
// a *Supervisor, this will already be implemented for you. Otherwise, a
// struct containing a supervisor will need to implement this in order to
// participate in the event hook propagation and the recursive
// UnstoppedServiceReport.
//
// It is legal for GetSupervisor to return nil, in which case
// the supervisor-specific behaviors will simply be ignored.
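//
// As a sketch, a struct that holds (rather than embeds) a supervisor might
// implement it like this (Wrapper is a hypothetical type):
//
//	type Wrapper struct {
//		sup *Supervisor
//	}
//
//	func (w *Wrapper) GetSupervisor() *Supervisor { return w.sup }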
type HasSupervisor interface {
	GetSupervisor() *Supervisor
}

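// GetSupervisor implements the HasSupervisor interface by returning the
// supervisor itself.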
func (s *Supervisor) GetSupervisor() *Supervisor {
	return s
}

/*
Add adds a service to this supervisor.

If the supervisor is currently running, the service will be started
immediately. If the supervisor has not been started yet, the service
will be started when the supervisor is. If the supervisor was already stopped,
this is a no-op returning an empty service-token.

The returned ServiceToken may be passed to the Remove method of the Supervisor
to terminate the service.

As a special behavior, if the service added is itself a supervisor, the
supervisor being added will copy the EventHook function from the Supervisor it
is being added to. This allows a Supervisor's construction to be factored
out from the configuration of its logging. Note that this unconditionally
overwrites the child Supervisor's event hook.
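
For example, with svc again a hypothetical Service:

  token := supervisor.Add(svc)
  // ... later, to stop and remove the service:
  err := supervisor.Remove(token)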

*/
func (s *Supervisor) Add(service Service) ServiceToken {
	if s == nil {
		panic("can't add service to nil *suture.Supervisor")
	}

	if hasSupervisor, ok := service.(HasSupervisor); ok {
		supervisor := hasSupervisor.GetSupervisor()
		if supervisor != nil {
			supervisor.spec.EventHook = s.spec.EventHook
		}
	}

	s.m.Lock()
	if s.state == notRunning {
		id := s.serviceCounter
		s.serviceCounter++

		s.services[id] = serviceWithName{service, serviceName(service)}
		s.restartQueue = append(s.restartQueue, id)

		s.m.Unlock()
		return ServiceToken{uint64(s.id)<<32 | uint64(id)}
	}
	s.m.Unlock()

	response := make(chan serviceID)
	if s.sendControl(addService{service, serviceName(service), response}) != nil {
		return ServiceToken{}
	}
	return ServiceToken{uint64(s.id)<<32 | uint64(<-response)}
}

// ServeBackground starts running a supervisor in its own goroutine. When
// this method returns, the supervisor is guaranteed to be in a running state.
// The returned one-buffered channel receives the error returned by .Serve.
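//
// A usage sketch, assuming ctx and cancel come from context.WithCancel:
//
//	errC := supervisor.ServeBackground(ctx)
//	// ... run until shutdown is requested ...
//	cancel()      // stop the supervisor
//	err := <-errC // typically context.Canceled after a cancellation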
func (s *Supervisor) ServeBackground(ctx context.Context) <-chan error {
	errChan := make(chan error, 1)
	go func() {
		errChan <- s.Serve(ctx)
	}()
	s.sync()
	return errChan
}

/*
Serve starts the supervisor. You should call this on the top-level supervisor,
but nothing else.
*/
func (s *Supervisor) Serve(ctx context.Context) error {
	// The context documentation says it is the user's responsibility to
	// never pass a nil context; tolerate it here by substituting
	// context.Background().
	if ctx == nil {
		ctx = context.Background()
	}

	if s == nil {
		panic("Can't serve with a nil *suture.Supervisor")
	}
	// Take a separate cancellation function so this tree can be
	// independently cancelled.
	ctx, myCancel := context.WithCancel(ctx)
	s.ctxMutex.Lock()
	s.ctx = ctx
	s.ctxMutex.Unlock()
	s.ctxCancel = myCancel

	if s.id == 0 {
		panic("Can't call Serve on an incorrectly-constructed *suture.Supervisor")
	}

	s.m.Lock()
	if s.state == normal || s.state == paused {
		s.m.Unlock()
		panic("Called .Serve() on a supervisor that is already Serve()ing")
	}

	s.state = normal
	s.m.Unlock()

	defer func() {
		s.m.Lock()
		s.state = terminated
		s.m.Unlock()
	}()

	// for all the services I currently know about, start them
	for _, id := range s.restartQueue {
		namedService, present := s.services[id]
		if present {
			s.runService(ctx, namedService.Service, id)
		}
	}
	s.restartQueue = make([]serviceID, 0, 1)

	for {
		select {
		case <-ctx.Done():
			s.stopSupervisor()
			return ctx.Err()
		case m := <-s.control:
			switch msg := m.(type) {
			case serviceFailed:
				s.handleFailedService(ctx, msg.id, msg.panicMsg, msg.stacktrace, true)
			case serviceEnded:
				_, monitored := s.services[msg.id]
				if monitored {
					cancel := s.cancellations[msg.id]
					if isErr(msg.err, ErrDoNotRestart) || isErr(msg.err, context.Canceled) || isErr(msg.err, context.DeadlineExceeded) {
						delete(s.services, msg.id)
						delete(s.cancellations, msg.id)
						go cancel()
					} else if isErr(msg.err, ErrTerminateSupervisorTree) {
						s.stopSupervisor()
						if s.spec.DontPropagateTermination {
							return ErrDoNotRestart
						} else {
							return msg.err
						}
					} else {
						s.handleFailedService(ctx, msg.id, msg.err, nil, false)
					}
				}
			case addService:
				id := s.serviceCounter
				s.serviceCounter++

				s.services[id] = serviceWithName{msg.service, msg.name}
				s.runService(ctx, msg.service, id)

				msg.response <- id
			case removeService:
				s.removeService(msg.id, msg.notification)
			case stopSupervisor:
				msg.done <- s.stopSupervisor()
				return nil
			case listServices:
				services := []Service{}
				for _, service := range s.services {
					services = append(services, service.Service)
				}
				msg.c <- services
			case syncSupervisor:
				// deliberately does nothing; it exists only to
				// introduce a sync point via the channel receive
			case panicSupervisor:
				// used only by tests
				panic("Panicking as requested!")
			}
		case id := <-s.notifyServiceDone:
			delete(s.servicesShuttingDown, id)
		case <-s.resumeTimer:
			// We're resuming normal operation after a pause due to
			// excessive thrashing.
			// FIXME: Ought to permit some spacing of these functions, rather
			// than simply hammering through them
			s.m.Lock()
			s.state = normal
			s.m.Unlock()
			s.failures = 0
			s.spec.EventHook(EventResume{s, s.Name})
			for _, id := range s.restartQueue {
				namedService, present := s.services[id]
				if present {
					s.runService(ctx, namedService.Service, id)
				}
			}
			s.restartQueue = make([]serviceID, 0, 1)
		}
	}
}

// UnstoppedServiceReport will return a report of what services failed to
// stop when the supervisor was stopped. This call will return when the
// supervisor is done shutting down. It will hang on a supervisor that has
// not been stopped, because it will not be "done shutting down".
//
// Calling this on a supervisor will return a report for the whole
// supervisor tree under it.
//
// WARNING: Technically, any use of the returned data structure is a
// TOCTOU violation:
// https://en.wikipedia.org/wiki/Time-of-check_to_time-of-use
// Since the data structure was generated and returned to you, any of these
// services may have stopped since then.
//
// However, this can still be useful information at program teardown
// time. For instance, logging that a service failed to stop as expected is
// still useful, as even if it shuts down later, it was still later than
// you expected.
//
// But if you cast the Service objects back to their underlying objects and
// start trying to manipulate them ("shut down harder!"), be sure to
// account for the possibility that they are in fact shut down before you
// get to them.
//
// If there are no services to report, the UnstoppedServiceReport will be
// nil. A zero-length constructed slice is never returned.
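//
// A teardown-time sketch:
//
//	report, err := supervisor.UnstoppedServiceReport()
//	if err == nil {
//		for _, unstopped := range report {
//			log.Printf("service failed to stop: %s", unstopped.Name)
//		}
//	}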
func (s *Supervisor) UnstoppedServiceReport() (UnstoppedServiceReport, error) {
	// the only thing that ever happens to this channel is getting
	// closed when the supervisor terminates.
	<-s.liveness

	// FIXME: Recurse on the supervisors
	return s.unstoppedServiceReport, nil
}

func (s *Supervisor) handleFailedService(ctx context.Context, id serviceID, err interface{}, stacktrace []byte, panic bool) {
	now := s.getNow()

	if s.lastFail.IsZero() {
		s.lastFail = now
		s.failures = 1.0
	} else {
		sinceLastFail := now.Sub(s.lastFail).Seconds()
		intervals := sinceLastFail / s.spec.FailureDecay
		s.failures = s.failures*math.Pow(.5, intervals) + 1
	}

	if s.failures > s.spec.FailureThreshold {
		s.m.Lock()
		s.state = paused
		s.m.Unlock()
		s.spec.EventHook(EventBackoff{s, s.Name})
		s.resumeTimer = s.getAfterChan(
			s.spec.BackoffJitter.Jitter(s.spec.FailureBackoff))
	}

	s.lastFail = now

	failedService, monitored := s.services[id]

	// It is possible for a service to be no longer monitored
	// by the time we get here. In that case, just ignore it.
	if monitored {
		s.m.Lock()
		curState := s.state
		s.m.Unlock()
		if curState == normal {
			s.runService(ctx, failedService.Service, id)
		} else {
			s.restartQueue = append(s.restartQueue, id)
		}
		if panic {
			s.spec.EventHook(EventServicePanic{
				Supervisor:       s,
				SupervisorName:   s.Name,
				Service:          failedService.Service,
				ServiceName:      failedService.name,
				CurrentFailures:  s.failures,
				FailureThreshold: s.spec.FailureThreshold,
				Restarting:       curState == normal,
				PanicMsg:         err.(string),
				Stacktrace:       string(stacktrace),
			})
		} else {
			e := EventServiceTerminate{
				Supervisor:       s,
				SupervisorName:   s.Name,
				Service:          failedService.Service,
				ServiceName:      failedService.name,
				CurrentFailures:  s.failures,
				FailureThreshold: s.spec.FailureThreshold,
				Restarting:       curState == normal,
			}
			if err != nil {
				e.Err = err
			}
			s.spec.EventHook(e)
		}
	}
}

func (s *Supervisor) runService(ctx context.Context, service Service, id serviceID) {
	childCtx, cancel := context.WithCancel(ctx)
	done := make(chan struct{})
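	// Wrap the underlying context cancellation so that callers invoking
	// the stored cancellation block until the service goroutine has
	// actually returned, not merely been signaled to stop.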
	blockingCancellation := func() {
		cancel()
		<-done
	}
	s.cancellations[id] = blockingCancellation
	go func() {
		if !s.spec.PassThroughPanics {
			defer func() {
				if r := recover(); r != nil {
					buf := make([]byte, 65535)
					written := runtime.Stack(buf, false)
					buf = buf[:written]
					// fmt.Sprint tolerates panic values that
					// are not strings.
					s.fail(id, fmt.Sprint(r), buf)
				}
			}()
		}

		err := service.Serve(childCtx)
		cancel()
		close(done)

		s.serviceEnded(id, err)
	}()
}

func (s *Supervisor) removeService(id serviceID, notificationChan chan struct{}) {
	namedService, present := s.services[id]
	if present {
		cancel := s.cancellations[id]
		delete(s.services, id)
		delete(s.cancellations, id)

		s.servicesShuttingDown[id] = namedService
		go func() {
			successChan := make(chan struct{})
			go func() {
				cancel()
				close(successChan)
				if notificationChan != nil {
					notificationChan <- struct{}{}
				}
			}()

			select {
			case <-successChan:
				// Life is good!
			case <-s.getAfterChan(s.spec.Timeout):
				s.spec.EventHook(EventStopTimeout{
					s, s.Name,
					namedService.Service, namedService.name})
			}
			s.notifyServiceDone <- id
		}()
	} else {
		if notificationChan != nil {
			notificationChan <- struct{}{}
		}
	}
}

func (s *Supervisor) stopSupervisor() UnstoppedServiceReport {
	notifyDone := make(chan serviceID, len(s.services))

	for id, namedService := range s.services {
		cancel := s.cancellations[id]
		delete(s.services, id)
		delete(s.cancellations, id)
		s.servicesShuttingDown[id] = namedService
		go func(sID serviceID) {
			cancel()
			notifyDone <- sID
		}(id)
	}

	timeout := s.getAfterChan(s.spec.Timeout)

SHUTTING_DOWN_SERVICES:
	for len(s.servicesShuttingDown) > 0 {
		select {
		case id := <-notifyDone:
			delete(s.servicesShuttingDown, id)
		case id := <-s.notifyServiceDone:
			delete(s.servicesShuttingDown, id)
		case <-timeout:
			for _, namedService := range s.servicesShuttingDown {
				s.spec.EventHook(EventStopTimeout{
					s, s.Name,
					namedService.Service, namedService.name,
				})
			}

			// failed remove statements will log the errors.
			break SHUTTING_DOWN_SERVICES
		}
	}

	// If nothing else has cancelled our context, we should now.
	s.ctxCancel()

	// Indicate that we're done shutting down
	defer close(s.liveness)

	if len(s.servicesShuttingDown) == 0 {
		return nil
	} else {
		report := UnstoppedServiceReport{}
		for id, namedService := range s.servicesShuttingDown {
			report = append(report, UnstoppedService{
				SupervisorPath: []*Supervisor{s},
				Service:        namedService.Service,
				Name:           namedService.name,
				ServiceToken:   ServiceToken{uint64(s.id)<<32 | uint64(id)},
			})
		}
		s.m.Lock()
		s.unstoppedServiceReport = report
		s.m.Unlock()
		return report
	}
}

// String implements the fmt.Stringer interface.
func (s *Supervisor) String() string {
	return s.Name
}

// sendControl abstracts checking for the supervisor to still be running
// when we send a message. This prevents blocking when sending to a
// cancelled supervisor.
func (s *Supervisor) sendControl(sm supervisorMessage) error {
	var doneChan <-chan struct{}
	s.ctxMutex.Lock()
	if s.ctx == nil {
		s.ctxMutex.Unlock()
		return ErrSupervisorNotStarted
	}
	doneChan = s.ctx.Done()
	s.ctxMutex.Unlock()

	select {
	case s.control <- sm:
		return nil
	case <-doneChan:
		return ErrSupervisorNotRunning
	}
}

/*
Remove will remove the given service from the Supervisor, and attempt to
stop it by cancelling its context. The ServiceToken comes from the Add()
call. This returns without waiting for the service to stop.
*/
func (s *Supervisor) Remove(id ServiceToken) error {
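	// The top 32 bits of a token identify the supervisor that issued it;
	// the low 32 bits identify the service within that supervisor.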
	sID := supervisorID(id.id >> 32)
	if sID != s.id {
		return ErrWrongSupervisor
	}
	err := s.sendControl(removeService{serviceID(id.id & 0xffffffff), nil})
	if err == ErrSupervisorNotRunning {
		// No meaningful error handling if the supervisor is stopped.
		return nil
	}
	return err
}

/*
RemoveAndWait will remove the given service from the Supervisor and attempt
to stop it. It will wait up to the given timeout value for the service to
terminate. A timeout value of 0 means to wait forever.

If a nil error is returned from this function, then the service was
terminated normally. If either the supervisor terminates or the timeout
passes, ErrTimeout is returned. (If this isn't even the right supervisor,
ErrWrongSupervisor is returned.)
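
For example, with svc a hypothetical Service:

  token := supervisor.Add(svc)
  // ...
  switch err := supervisor.RemoveAndWait(token, 5*time.Second); err {
  case nil:
      // the service stopped cleanly
  case ErrTimeout:
      // the service did not stop within five seconds
  }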
*/
func (s *Supervisor) RemoveAndWait(id ServiceToken, timeout time.Duration) error {
	sID := supervisorID(id.id >> 32)
	if sID != s.id {
		return ErrWrongSupervisor
	}

	var timeoutC <-chan time.Time

	if timeout > 0 {
		timer := time.NewTimer(timeout)
		defer timer.Stop()
		timeoutC = timer.C
	}

	notificationC := make(chan struct{})

	sentControlErr := s.sendControl(removeService{serviceID(id.id & 0xffffffff), notificationC})

	if sentControlErr != nil {
		return sentControlErr
	}

	select {
	case <-notificationC:
		// normal case; the service is terminated.
		return nil

	// This occurs if the entire supervisor ends without the service
	// having terminated, and includes the timeout the supervisor
	// itself waited before closing the liveness channel.
	case <-s.ctx.Done():
		return ErrTimeout

	// The local timeout.
	case <-timeoutC:
		return ErrTimeout
	}
}

/*

Services returns a []Service containing a snapshot of the services this
Supervisor is managing.

*/
func (s *Supervisor) Services() []Service {
	ls := listServices{make(chan []Service)}

	if s.sendControl(ls) == nil {
		return <-ls.c
	}
	return nil
}

var currentSupervisorIDL sync.Mutex
var currentSupervisorID uint32

func nextSupervisorID() supervisorID {
	currentSupervisorIDL.Lock()
	defer currentSupervisorIDL.Unlock()
	currentSupervisorID++
	return supervisorID(currentSupervisorID)
}

// ServiceToken is an opaque identifier that can be used to terminate a service that
// has been Add()ed to a Supervisor.
type ServiceToken struct {
	id uint64
}

// An UnstoppedService is the component member of an
// UnstoppedServiceReport.
//
// The SupervisorPath is the path down the supervisor tree to the given
// service.
type UnstoppedService struct {
	SupervisorPath []*Supervisor
	Service        Service
	Name           string
	ServiceToken   ServiceToken
}

// An UnstoppedServiceReport is returned by the UnstoppedServiceReport
// method, reporting which services failed to stop.
type UnstoppedServiceReport []UnstoppedService

type serviceWithName struct {
	Service Service
	name    string
}

// Jitter returns the sum of the input duration and a random jitter. It is
// compatible with the jitter functions in github.com/lthibault/jitterbug.
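//
// As a sketch, a custom implementation that adds a fixed delay might look
// like this (FixedJitter is a hypothetical type, not part of this package):
//
//	type FixedJitter struct{ Extra time.Duration }
//
//	func (f FixedJitter) Jitter(d time.Duration) time.Duration {
//		return d + f.Extra
//	}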
type Jitter interface {
	Jitter(time.Duration) time.Duration
}

// NoJitter does not apply any jitter to the input duration.
type NoJitter struct{}

// Jitter leaves the input duration d unchanged.
func (NoJitter) Jitter(d time.Duration) time.Duration { return d }

// DefaultJitter is the jitter function that is applied when Spec.BackoffJitter
// is set to nil.
type DefaultJitter struct {
	rand *rand.Rand
}

// Jitter will jitter the backoff time by uniformly distributing it into
// the range [FailureBackoff, 1.5 * FailureBackoff).
func (dj *DefaultJitter) Jitter(d time.Duration) time.Duration {
	// this is only called by the core supervisor loop, so it is
	// single-thread safe.
	if dj.rand == nil {
		dj.rand = rand.New(rand.NewSource(time.Now().UnixNano()))
	}
	jitter := dj.rand.Float64() / 2
	return d + time.Duration(float64(d)*jitter)
}

// ErrWrongSupervisor is returned by the (*Supervisor).Remove method
// if you pass a ServiceToken from the wrong Supervisor.
var ErrWrongSupervisor = errors.New("wrong supervisor for this service token, no service removed")

// ErrTimeout is returned when an attempt to RemoveAndWait for a service to
// stop has timed out.
var ErrTimeout = errors.New("waiting for service to stop has timed out")

// ErrSupervisorNotTerminated is returned when asking for an unstopped
// service report before the supervisor has been terminated.
var ErrSupervisorNotTerminated = errors.New("supervisor not terminated")

// ErrSupervisorNotStarted is returned if you try to send control messages
// to a supervisor that has not started yet. See the note on the Supervisor
// struct about the legal ways to start a supervisor.
var ErrSupervisorNotStarted = errors.New("supervisor not started yet")

// Spec is used to pass arguments to the New function to create a
// supervisor. See the New function for full documentation.
type Spec struct {
	EventHook                EventHook
	FailureDecay             float64
	FailureThreshold         float64
	FailureBackoff           time.Duration
	BackoffJitter            Jitter
	Timeout                  time.Duration
	PassThroughPanics        bool
	DontPropagateTermination bool
}

func (s *Spec) configureDefaults(supervisorName string) {
	if s.FailureDecay == 0 {
		s.FailureDecay = 30
	}
	if s.FailureThreshold == 0 {
		s.FailureThreshold = 5
	}
	if s.FailureBackoff == 0 {
		s.FailureBackoff = time.Second * 15
	}
	if s.BackoffJitter == nil {
		s.BackoffJitter = &DefaultJitter{}
	}
	if s.Timeout == 0 {
		s.Timeout = time.Second * 10
	}

	// set up the default event hook
	if s.EventHook == nil {
		s.EventHook = func(e Event) {
			log.Print(e)
		}
	}
}