1package suture 2 3// FIXMES in progress: 4// 1. Ensure the supervisor actually gets to the terminated state for the 5// unstopped service report. 6// 2. Save the unstopped service report in the supervisor. 7 8import ( 9 "context" 10 "errors" 11 "fmt" 12 "log" 13 "math" 14 "math/rand" 15 "runtime" 16 "sync" 17 "time" 18) 19 20const ( 21 notRunning = iota 22 normal 23 paused 24 terminated 25) 26 27type supervisorID uint32 28type serviceID uint32 29 30// ErrSupervisorNotRunning is returned by some methods if the supervisor is 31// not running, either because it has not been started or because it has 32// been terminated. 33var ErrSupervisorNotRunning = errors.New("supervisor not running") 34 35/* 36Supervisor is the core type of the module that represents a Supervisor. 37 38Supervisors should be constructed either by New or NewSimple. 39 40Once constructed, a Supervisor should be started in one of three ways: 41 42 1. Calling .Serve(ctx). 43 2. Calling .ServeBackground(ctx). 44 3. Adding it to an existing Supervisor. 45 46Calling Serve will cause the supervisor to run until the passed-in 47context is cancelled. Often one of the last lines of the "main" func for a 48program will be to call one of the Serve methods. 49 50Calling ServeBackground will CORRECTLY start the supervisor running in a 51new goroutine. It is risky to directly run 52 53 go supervisor.Serve() 54 55because that will briefly create a race condition as it starts up, if you 56try to .Add() services immediately afterward. 57 58*/ 59type Supervisor struct { 60 Name string 61 62 spec Spec 63 64 services map[serviceID]serviceWithName 65 cancellations map[serviceID]context.CancelFunc 66 servicesShuttingDown map[serviceID]serviceWithName 67 lastFail time.Time 68 failures float64 69 restartQueue []serviceID 70 serviceCounter serviceID 71 control chan supervisorMessage 72 notifyServiceDone chan serviceID 73 resumeTimer <-chan time.Time 74 liveness chan struct{} 75 76 // despite the recommendation in the context package to avoid 77 // holding this in a struct, I think due to the function of suture 78 // and the way it works, I think it's OK in this case. This is the 79 // exceptional case, basically. 80 ctxMutex sync.Mutex 81 ctx context.Context 82 // This function cancels this supervisor specifically. 83 ctxCancel func() 84 85 getNow func() time.Time 86 getAfterChan func(time.Duration) <-chan time.Time 87 88 m sync.Mutex 89 90 // The unstopped service report is generated when we finish 91 // stopping. 92 unstoppedServiceReport UnstoppedServiceReport 93 94 // malign leftovers 95 id supervisorID 96 state uint8 97} 98 99/* 100 101New is the full constructor function for a supervisor. 102 103The name is a friendly human name for the supervisor, used in logging. Suture 104does not care if this is unique, but it is good for your sanity if it is. 105 106If not set, the following values are used: 107 108 * EventHook: A function is created that uses log.Print. 109 * FailureDecay: 30 seconds 110 * FailureThreshold: 5 failures 111 * FailureBackoff: 15 seconds 112 * Timeout: 10 seconds 113 * BackoffJitter: DefaultJitter 114 115The EventHook function will be called when errors occur. Suture will log the 116following: 117 118 * When a service has failed, with a descriptive message about the 119 current backoff status, and whether it was immediately restarted 120 * When the supervisor has gone into its backoff mode, and when it 121 exits it 122 * When a service fails to stop 123 124The failureRate, failureThreshold, and failureBackoff controls how failures 125are handled, in order to avoid the supervisor failure case where the 126program does nothing but restarting failed services. If you do not 127care how failures behave, the default values should be fine for the 128vast majority of services, but if you want the details: 129 130The supervisor tracks the number of failures that have occurred, with an 131exponential decay on the count. Every FailureDecay seconds, the number of 132failures that have occurred is cut in half. (This is done smoothly with an 133exponential function.) When a failure occurs, the number of failures 134is incremented by one. When the number of failures passes the 135FailureThreshold, the entire service waits for FailureBackoff seconds 136before attempting any further restarts, at which point it resets its 137failure count to zero. 138 139Timeout is how long Suture will wait for a service to properly terminate. 140 141The PassThroughPanics options can be set to let panics in services propagate 142and crash the program, should this be desirable. 143 144DontPropagateTermination indicates whether this supervisor tree will 145propagate a ErrTerminateTree if a child process returns it. If false, 146this supervisor will itself return an error that will terminate its 147parent. If true, it will merely return ErrDoNotRestart. false by default. 148 149*/ 150func New(name string, spec Spec) *Supervisor { 151 spec.configureDefaults(name) 152 153 return &Supervisor{ 154 name, 155 156 spec, 157 158 // services 159 make(map[serviceID]serviceWithName), 160 // cancellations 161 make(map[serviceID]context.CancelFunc), 162 // servicesShuttingDown 163 make(map[serviceID]serviceWithName), 164 // lastFail, deliberately the zero time 165 time.Time{}, 166 // failures 167 0, 168 // restartQueue 169 make([]serviceID, 0, 1), 170 // serviceCounter 171 0, 172 // control 173 make(chan supervisorMessage), 174 // notifyServiceDone 175 make(chan serviceID), 176 // resumeTimer 177 make(chan time.Time), 178 179 // liveness 180 make(chan struct{}), 181 182 sync.Mutex{}, 183 // ctx 184 nil, 185 // myCancel 186 nil, 187 188 // the tests can override these for testing threshold 189 // behavior 190 // getNow 191 time.Now, 192 // getAfterChan 193 time.After, 194 195 // m 196 sync.Mutex{}, 197 198 // unstoppedServiceReport 199 nil, 200 201 // id 202 nextSupervisorID(), 203 // state 204 notRunning, 205 } 206} 207 208func serviceName(service Service) (serviceName string) { 209 stringer, canStringer := service.(fmt.Stringer) 210 if canStringer { 211 serviceName = stringer.String() 212 } else { 213 serviceName = fmt.Sprintf("%#v", service) 214 } 215 return 216} 217 218// NewSimple is a convenience function to create a service with just a name 219// and the sensible defaults. 220func NewSimple(name string) *Supervisor { 221 return New(name, Spec{}) 222} 223 224// HasSupervisor is an interface that indicates the given struct contains a 225// supervisor. If the struct is either already a *Supervisor, or it embeds 226// a *Supervisor, this will already be implemented for you. Otherwise, a 227// struct containing a supervisor will need to implement this in order to 228// participate in the log function propagation and recursive 229// UnstoppedService report. 230// 231// It is legal for GetSupervisor to return nil, in which case 232// the supervisor-specific behaviors will simply be ignored. 233type HasSupervisor interface { 234 GetSupervisor() *Supervisor 235} 236 237func (s *Supervisor) GetSupervisor() *Supervisor { 238 return s 239} 240 241/* 242Add adds a service to this supervisor. 243 244If the supervisor is currently running, the service will be started 245immediately. If the supervisor has not been started yet, the service 246will be started when the supervisor is. If the supervisor was already stopped, 247this is a no-op returning an empty service-token. 248 249The returned ServiceID may be passed to the Remove method of the Supervisor 250to terminate the service. 251 252As a special behavior, if the service added is itself a supervisor, the 253supervisor being added will copy the EventHook function from the Supervisor it 254is being added to. This allows factoring out providing a Supervisor 255from its logging. This unconditionally overwrites the child Supervisor's 256logging functions. 257 258*/ 259func (s *Supervisor) Add(service Service) ServiceToken { 260 if s == nil { 261 panic("can't add service to nil *suture.Supervisor") 262 } 263 264 if hasSupervisor, isHaveSupervisor := service.(HasSupervisor); isHaveSupervisor { 265 supervisor := hasSupervisor.GetSupervisor() 266 if supervisor != nil { 267 supervisor.spec.EventHook = s.spec.EventHook 268 } 269 } 270 271 s.m.Lock() 272 if s.state == notRunning { 273 id := s.serviceCounter 274 s.serviceCounter++ 275 276 s.services[id] = serviceWithName{service, serviceName(service)} 277 s.restartQueue = append(s.restartQueue, id) 278 279 s.m.Unlock() 280 return ServiceToken{uint64(s.id)<<32 | uint64(id)} 281 } 282 s.m.Unlock() 283 284 response := make(chan serviceID) 285 if s.sendControl(addService{service, serviceName(service), response}) != nil { 286 return ServiceToken{} 287 } 288 return ServiceToken{uint64(s.id)<<32 | uint64(<-response)} 289} 290 291// ServeBackground starts running a supervisor in its own goroutine. When 292// this method returns, the supervisor is guaranteed to be in a running state. 293// The returned one-buffered channel receives the error returned by .Serve. 294func (s *Supervisor) ServeBackground(ctx context.Context) <-chan error { 295 errChan := make(chan error, 1) 296 go func() { 297 errChan <- s.Serve(ctx) 298 }() 299 s.sync() 300 return errChan 301} 302 303/* 304Serve starts the supervisor. You should call this on the top-level supervisor, 305but nothing else. 306*/ 307func (s *Supervisor) Serve(ctx context.Context) error { 308 // context documentation suggests that it is legal for functions to 309 // take nil contexts, it's user's responsibility to never pass them in. 310 if ctx == nil { 311 ctx = context.Background() 312 } 313 314 if s == nil { 315 panic("Can't serve with a nil *suture.Supervisor") 316 } 317 // Take a separate cancellation function so this tree can be 318 // indepedently cancelled. 319 ctx, myCancel := context.WithCancel(ctx) 320 s.ctxMutex.Lock() 321 s.ctx = ctx 322 s.ctxMutex.Unlock() 323 s.ctxCancel = myCancel 324 325 if s.id == 0 { 326 panic("Can't call Serve on an incorrectly-constructed *suture.Supervisor") 327 } 328 329 s.m.Lock() 330 if s.state == normal || s.state == paused { 331 s.m.Unlock() 332 panic("Called .Serve() on a supervisor that is already Serve()ing") 333 } 334 335 s.state = normal 336 s.m.Unlock() 337 338 defer func() { 339 s.m.Lock() 340 s.state = terminated 341 s.m.Unlock() 342 }() 343 344 // for all the services I currently know about, start them 345 for _, id := range s.restartQueue { 346 namedService, present := s.services[id] 347 if present { 348 s.runService(ctx, namedService.Service, id) 349 } 350 } 351 s.restartQueue = make([]serviceID, 0, 1) 352 353 for { 354 select { 355 case <-ctx.Done(): 356 s.stopSupervisor() 357 return ctx.Err() 358 case m := <-s.control: 359 switch msg := m.(type) { 360 case serviceFailed: 361 s.handleFailedService(ctx, msg.id, msg.panicMsg, msg.stacktrace, true) 362 case serviceEnded: 363 _, monitored := s.services[msg.id] 364 if monitored { 365 cancel := s.cancellations[msg.id] 366 if isErr(msg.err, ErrDoNotRestart) || isErr(msg.err, context.Canceled) || isErr(msg.err, context.DeadlineExceeded) { 367 delete(s.services, msg.id) 368 delete(s.cancellations, msg.id) 369 go cancel() 370 } else if isErr(msg.err, ErrTerminateSupervisorTree) { 371 s.stopSupervisor() 372 if s.spec.DontPropagateTermination { 373 return ErrDoNotRestart 374 } else { 375 return msg.err 376 } 377 } else { 378 s.handleFailedService(ctx, msg.id, msg.err, nil, false) 379 } 380 } 381 case addService: 382 id := s.serviceCounter 383 s.serviceCounter++ 384 385 s.services[id] = serviceWithName{msg.service, msg.name} 386 s.runService(ctx, msg.service, id) 387 388 msg.response <- id 389 case removeService: 390 s.removeService(msg.id, msg.notification) 391 case stopSupervisor: 392 msg.done <- s.stopSupervisor() 393 return nil 394 case listServices: 395 services := []Service{} 396 for _, service := range s.services { 397 services = append(services, service.Service) 398 } 399 msg.c <- services 400 case syncSupervisor: 401 // this does nothing on purpose; its sole purpose is to 402 // introduce a sync point via the channel receive 403 case panicSupervisor: 404 // used only by tests 405 panic("Panicking as requested!") 406 } 407 case serviceEnded := <-s.notifyServiceDone: 408 delete(s.servicesShuttingDown, serviceEnded) 409 case <-s.resumeTimer: 410 // We're resuming normal operation after a pause due to 411 // excessive thrashing 412 // FIXME: Ought to permit some spacing of these functions, rather 413 // than simply hammering through them 414 s.m.Lock() 415 s.state = normal 416 s.m.Unlock() 417 s.failures = 0 418 s.spec.EventHook(EventResume{s, s.Name}) 419 for _, id := range s.restartQueue { 420 namedService, present := s.services[id] 421 if present { 422 s.runService(ctx, namedService.Service, id) 423 } 424 } 425 s.restartQueue = make([]serviceID, 0, 1) 426 } 427 } 428} 429 430// UnstoppedServiceReport will return a report of what services failed to 431// stop when the supervisor was stopped. This call will return when the 432// supervisor is done shutting down. It will hang on a supervisor that has 433// not been stopped, because it will not be "done shutting down". 434// 435// Calling this on a supervisor will return a report for the whole 436// supervisor tree under it. 437// 438// WARNING: Technically, any use of the returned data structure is a 439// TOCTOU violation: 440// https://en.wikipedia.org/wiki/Time-of-check_to_time-of-use 441// Since the data structure was generated and returned to you, any of these 442// services may have stopped since then. 443// 444// However, this can still be useful information at program teardown 445// time. For instance, logging that a service failed to stop as expected is 446// still useful, as even if it shuts down later, it was still later than 447// you expected. 448// 449// But if you cast the Service objects back to their underlying objects and 450// start trying to manipulate them ("shut down harder!"), be sure to 451// account for the possibility they are in fact shut down before you get 452// them. 453// 454// If there are no services to report, the UnstoppedServiceReport will be 455// nil. A zero-length constructed slice is never returned. 456func (s *Supervisor) UnstoppedServiceReport() (UnstoppedServiceReport, error) { 457 // the only thing that ever happens to this channel is getting 458 // closed when the supervisor terminates. 459 _, _ = <-s.liveness 460 461 // FIXME: Recurse on the supervisors 462 return s.unstoppedServiceReport, nil 463} 464 465func (s *Supervisor) handleFailedService(ctx context.Context, id serviceID, err interface{}, stacktrace []byte, panic bool) { 466 now := s.getNow() 467 468 if s.lastFail.IsZero() { 469 s.lastFail = now 470 s.failures = 1.0 471 } else { 472 sinceLastFail := now.Sub(s.lastFail).Seconds() 473 intervals := sinceLastFail / s.spec.FailureDecay 474 s.failures = s.failures*math.Pow(.5, intervals) + 1 475 } 476 477 if s.failures > s.spec.FailureThreshold { 478 s.m.Lock() 479 s.state = paused 480 s.m.Unlock() 481 s.spec.EventHook(EventBackoff{s, s.Name}) 482 s.resumeTimer = s.getAfterChan( 483 s.spec.BackoffJitter.Jitter(s.spec.FailureBackoff)) 484 } 485 486 s.lastFail = now 487 488 failedService, monitored := s.services[id] 489 490 // It is possible for a service to be no longer monitored 491 // by the time we get here. In that case, just ignore it. 492 if monitored { 493 s.m.Lock() 494 curState := s.state 495 s.m.Unlock() 496 if curState == normal { 497 s.runService(ctx, failedService.Service, id) 498 } else { 499 s.restartQueue = append(s.restartQueue, id) 500 } 501 if panic { 502 s.spec.EventHook(EventServicePanic{ 503 Supervisor: s, 504 SupervisorName: s.Name, 505 Service: failedService.Service, 506 ServiceName: failedService.name, 507 CurrentFailures: s.failures, 508 FailureThreshold: s.spec.FailureThreshold, 509 Restarting: curState == normal, 510 PanicMsg: err.(string), 511 Stacktrace: string(stacktrace), 512 }) 513 } else { 514 e := EventServiceTerminate{ 515 Supervisor: s, 516 SupervisorName: s.Name, 517 Service: failedService.Service, 518 ServiceName: failedService.name, 519 CurrentFailures: s.failures, 520 FailureThreshold: s.spec.FailureThreshold, 521 Restarting: curState == normal, 522 } 523 if err != nil { 524 e.Err = err 525 } 526 s.spec.EventHook(e) 527 } 528 } 529} 530 531func (s *Supervisor) runService(ctx context.Context, service Service, id serviceID) { 532 childCtx, cancel := context.WithCancel(ctx) 533 done := make(chan struct{}) 534 blockingCancellation := func() { 535 cancel() 536 <-done 537 } 538 s.cancellations[id] = blockingCancellation 539 go func() { 540 if !s.spec.PassThroughPanics { 541 defer func() { 542 if r := recover(); r != nil { 543 buf := make([]byte, 65535) 544 written := runtime.Stack(buf, false) 545 buf = buf[:written] 546 s.fail(id, r.(string), buf) 547 } 548 }() 549 } 550 551 err := service.Serve(childCtx) 552 cancel() 553 close(done) 554 555 s.serviceEnded(id, err) 556 }() 557} 558 559func (s *Supervisor) removeService(id serviceID, notificationChan chan struct{}) { 560 namedService, present := s.services[id] 561 if present { 562 cancel := s.cancellations[id] 563 delete(s.services, id) 564 delete(s.cancellations, id) 565 566 s.servicesShuttingDown[id] = namedService 567 go func() { 568 successChan := make(chan struct{}) 569 go func() { 570 cancel() 571 close(successChan) 572 if notificationChan != nil { 573 notificationChan <- struct{}{} 574 } 575 }() 576 577 select { 578 case <-successChan: 579 // Life is good! 580 case <-s.getAfterChan(s.spec.Timeout): 581 s.spec.EventHook(EventStopTimeout{ 582 s, s.Name, 583 namedService.Service, namedService.name}) 584 } 585 s.notifyServiceDone <- id 586 }() 587 } else { 588 if notificationChan != nil { 589 notificationChan <- struct{}{} 590 } 591 } 592} 593 594func (s *Supervisor) stopSupervisor() UnstoppedServiceReport { 595 notifyDone := make(chan serviceID, len(s.services)) 596 597 for id, namedService := range s.services { 598 cancel := s.cancellations[id] 599 delete(s.services, id) 600 delete(s.cancellations, id) 601 s.servicesShuttingDown[id] = namedService 602 go func(sID serviceID) { 603 cancel() 604 notifyDone <- sID 605 }(id) 606 } 607 608 timeout := s.getAfterChan(s.spec.Timeout) 609 610SHUTTING_DOWN_SERVICES: 611 for len(s.servicesShuttingDown) > 0 { 612 select { 613 case id := <-notifyDone: 614 delete(s.servicesShuttingDown, id) 615 case serviceID := <-s.notifyServiceDone: 616 delete(s.servicesShuttingDown, serviceID) 617 case <-timeout: 618 for _, namedService := range s.servicesShuttingDown { 619 s.spec.EventHook(EventStopTimeout{ 620 s, s.Name, 621 namedService.Service, namedService.name, 622 }) 623 } 624 625 // failed remove statements will log the errors. 626 break SHUTTING_DOWN_SERVICES 627 } 628 } 629 630 // If nothing else has cancelled our context, we should now. 631 s.ctxCancel() 632 633 // Indicate that we're done shutting down 634 defer close(s.liveness) 635 636 if len(s.servicesShuttingDown) == 0 { 637 return nil 638 } else { 639 report := UnstoppedServiceReport{} 640 for serviceID, serviceWithName := range s.servicesShuttingDown { 641 report = append(report, UnstoppedService{ 642 SupervisorPath: []*Supervisor{s}, 643 Service: serviceWithName.Service, 644 Name: serviceWithName.name, 645 ServiceToken: ServiceToken{uint64(s.id)<<32 | uint64(serviceID)}, 646 }) 647 } 648 s.m.Lock() 649 s.unstoppedServiceReport = report 650 s.m.Unlock() 651 return report 652 } 653} 654 655// String implements the fmt.Stringer interface. 656func (s *Supervisor) String() string { 657 return s.Name 658} 659 660// sendControl abstracts checking for the supervisor to still be running 661// when we send a message. This prevents blocking when sending to a 662// cancelled supervisor. 663func (s *Supervisor) sendControl(sm supervisorMessage) error { 664 var doneChan <-chan struct{} 665 s.ctxMutex.Lock() 666 if s.ctx == nil { 667 s.ctxMutex.Unlock() 668 return ErrSupervisorNotStarted 669 } 670 doneChan = s.ctx.Done() 671 s.ctxMutex.Unlock() 672 673 select { 674 case s.control <- sm: 675 return nil 676 case <-doneChan: 677 return ErrSupervisorNotRunning 678 } 679} 680 681/* 682Remove will remove the given service from the Supervisor, and attempt to Stop() it. 683The ServiceID token comes from the Add() call. This returns without waiting 684for the service to stop. 685*/ 686func (s *Supervisor) Remove(id ServiceToken) error { 687 sID := supervisorID(id.id >> 32) 688 if sID != s.id { 689 return ErrWrongSupervisor 690 } 691 err := s.sendControl(removeService{serviceID(id.id & 0xffffffff), nil}) 692 if err == ErrSupervisorNotRunning { 693 // No meaningful error handling if the supervisor is stopped. 694 return nil 695 } 696 return err 697} 698 699/* 700RemoveAndWait will remove the given service from the Supervisor and attempt 701to Stop() it. It will wait up to the given timeout value for the service to 702terminate. A timeout value of 0 means to wait forever. 703 704If a nil error is returned from this function, then the service was 705terminated normally. If either the supervisor terminates or the timeout 706passes, ErrTimeout is returned. (If this isn't even the right supervisor 707ErrWrongSupervisor is returned.) 708*/ 709func (s *Supervisor) RemoveAndWait(id ServiceToken, timeout time.Duration) error { 710 sID := supervisorID(id.id >> 32) 711 if sID != s.id { 712 return ErrWrongSupervisor 713 } 714 715 var timeoutC <-chan time.Time 716 717 if timeout > 0 { 718 timer := time.NewTimer(timeout) 719 defer timer.Stop() 720 timeoutC = timer.C 721 } 722 723 notificationC := make(chan struct{}) 724 725 sentControlErr := s.sendControl(removeService{serviceID(id.id & 0xffffffff), notificationC}) 726 727 if sentControlErr != nil { 728 return sentControlErr 729 } 730 731 select { 732 case <-notificationC: 733 // normal case; the service is terminated. 734 return nil 735 736 // This occurs if the entire supervisor ends without the service 737 // having terminated, and includes the timeout the supervisor 738 // itself waited before closing the liveness channel. 739 case <-s.ctx.Done(): 740 return ErrTimeout 741 742 // The local timeout. 743 case <-timeoutC: 744 return ErrTimeout 745 } 746} 747 748/* 749 750Services returns a []Service containing a snapshot of the services this 751Supervisor is managing. 752 753*/ 754func (s *Supervisor) Services() []Service { 755 ls := listServices{make(chan []Service)} 756 757 if s.sendControl(ls) == nil { 758 return <-ls.c 759 } 760 return nil 761} 762 763var currentSupervisorIDL sync.Mutex 764var currentSupervisorID uint32 765 766func nextSupervisorID() supervisorID { 767 currentSupervisorIDL.Lock() 768 defer currentSupervisorIDL.Unlock() 769 currentSupervisorID++ 770 return supervisorID(currentSupervisorID) 771} 772 773// ServiceToken is an opaque identifier that can be used to terminate a service that 774// has been Add()ed to a Supervisor. 775type ServiceToken struct { 776 id uint64 777} 778 779// An UnstoppedService is the component member of an 780// UnstoppedServiceReport. 781// 782// The SupervisorPath is the path down the supervisor tree to the given 783// service. 784type UnstoppedService struct { 785 SupervisorPath []*Supervisor 786 Service Service 787 Name string 788 ServiceToken ServiceToken 789} 790 791// An UnstoppedServiceReport will be returned by StopWithReport, reporting 792// which services failed to stop. 793type UnstoppedServiceReport []UnstoppedService 794 795type serviceWithName struct { 796 Service Service 797 name string 798} 799 800// Jitter returns the sum of the input duration and a random jitter. It is 801// compatible with the jitter functions in github.com/lthibault/jitterbug. 802type Jitter interface { 803 Jitter(time.Duration) time.Duration 804} 805 806// NoJitter does not apply any jitter to the input duration 807type NoJitter struct{} 808 809// Jitter leaves the input duration d unchanged. 810func (NoJitter) Jitter(d time.Duration) time.Duration { return d } 811 812// DefaultJitter is the jitter function that is applied when spec.BackoffJitter 813// is set to nil. 814type DefaultJitter struct { 815 rand *rand.Rand 816} 817 818// Jitter will jitter the backoff time by uniformly distributing it into 819// the range [FailureBackoff, 1.5 * FailureBackoff). 820func (dj *DefaultJitter) Jitter(d time.Duration) time.Duration { 821 // this is only called by the core supervisor loop, so it is 822 // single-thread safe. 823 if dj.rand == nil { 824 dj.rand = rand.New(rand.NewSource(time.Now().UnixNano())) 825 } 826 jitter := dj.rand.Float64() / 2 827 return d + time.Duration(float64(d)*jitter) 828} 829 830// ErrWrongSupervisor is returned by the (*Supervisor).Remove method 831// if you pass a ServiceToken from the wrong Supervisor. 832var ErrWrongSupervisor = errors.New("wrong supervisor for this service token, no service removed") 833 834// ErrTimeout is returned when an attempt to RemoveAndWait for a service to 835// stop has timed out. 836var ErrTimeout = errors.New("waiting for service to stop has timed out") 837 838// ErrSupervisorNotTerminated is returned when asking for a stopped service 839// report before the supervisor has been terminated. 840var ErrSupervisorNotTerminated = errors.New("supervisor not terminated") 841 842// ErrSupervisorNotStarted is returned if you try to send control messages 843// to a supervisor that has not started yet. See note on Supervisor struct 844// about the legal ways to start a supervisor. 845var ErrSupervisorNotStarted = errors.New("supervisor not started yet") 846 847// Spec is used to pass arguments to the New function to create a 848// supervisor. See the New function for full documentation. 849type Spec struct { 850 EventHook EventHook 851 FailureDecay float64 852 FailureThreshold float64 853 FailureBackoff time.Duration 854 BackoffJitter Jitter 855 Timeout time.Duration 856 PassThroughPanics bool 857 DontPropagateTermination bool 858} 859 860func (s *Spec) configureDefaults(supervisorName string) { 861 if s.FailureDecay == 0 { 862 s.FailureDecay = 30 863 } 864 if s.FailureThreshold == 0 { 865 s.FailureThreshold = 5 866 } 867 if s.FailureBackoff == 0 { 868 s.FailureBackoff = time.Second * 15 869 } 870 if s.BackoffJitter == nil { 871 s.BackoffJitter = &DefaultJitter{} 872 } 873 if s.Timeout == 0 { 874 s.Timeout = time.Second * 10 875 } 876 877 // set up the default logging handlers 878 if s.EventHook == nil { 879 s.EventHook = func(e Event) { 880 log.Print(e) 881 } 882 } 883} 884