1package child 2 3import ( 4 "errors" 5 "fmt" 6 "io" 7 "log" 8 "math/rand" 9 "os" 10 "os/exec" 11 "strings" 12 "sync" 13 "syscall" 14 "time" 15) 16 17func init() { 18 // Seed the default rand Source with current time to produce better random 19 // numbers used with splay 20 rand.Seed(time.Now().UnixNano()) 21} 22 23var ( 24 // ErrMissingCommand is the error returned when no command is specified 25 // to run. 26 ErrMissingCommand = errors.New("missing command") 27 28 // ExitCodeOK is the default OK exit code. 29 ExitCodeOK = 0 30 31 // ExitCodeError is the default error code returned when the child exits with 32 // an error without a more specific code. 33 ExitCodeError = 127 34) 35 36// Child is a wrapper around a child process which can be used to send signals 37// and manage the processes' lifecycle. 38type Child struct { 39 sync.RWMutex 40 41 stdin io.Reader 42 stdout, stderr io.Writer 43 command string 44 args []string 45 env []string 46 47 timeout time.Duration 48 49 reloadSignal os.Signal 50 51 killSignal os.Signal 52 killTimeout time.Duration 53 54 splay time.Duration 55 56 // cmd is the actual child process under management. 57 cmd *exec.Cmd 58 59 // exitCh is the channel where the processes exit will be returned. 60 exitCh chan int 61 62 // stopLock is the mutex to lock when stopping. stopCh is the circuit breaker 63 // to force-terminate any waiting splays to kill the process now. stopped is 64 // a boolean that tells us if we have previously been stopped. 65 stopLock sync.RWMutex 66 stopCh chan struct{} 67 stopped bool 68} 69 70// NewInput is input to the NewChild function. 71type NewInput struct { 72 // Stdin is the io.Reader where input will come from. This is sent directly to 73 // the child process. Stdout and Stderr represent the io.Writer objects where 74 // the child process will send output and errorput. 75 Stdin io.Reader 76 Stdout, Stderr io.Writer 77 78 // Command is the name of the command to execute. Args are the list of 79 // arguments to pass when starting the command. 80 Command string 81 Args []string 82 83 // Timeout is the maximum amount of time to allow the command to execute. If 84 // set to 0, the command is permitted to run infinitely. 85 Timeout time.Duration 86 87 // Env represents the condition of the child processes' environment 88 // variables. Only these environment variables will be given to the child, so 89 // it is the responsibility of the caller to include the parent processes 90 // environment, if required. This should be in the key=value format. 91 Env []string 92 93 // ReloadSignal is the signal to send to reload this process. This value may 94 // be nil. 95 ReloadSignal os.Signal 96 97 // KillSignal is the signal to send to gracefully kill this process. This 98 // value may be nil. 99 KillSignal os.Signal 100 101 // KillTimeout is the amount of time to wait for the process to gracefully 102 // terminate before force-killing. 103 KillTimeout time.Duration 104 105 // Splay is the maximum random amount of time to wait before sending signals. 106 // This option helps reduce the thundering herd problem by effectively 107 // sleeping for a random amount of time before sending the signal. This 108 // prevents multiple processes from all signaling at the same time. This value 109 // may be zero (which disables the splay entirely). 110 Splay time.Duration 111} 112 113// New creates a new child process for management with high-level APIs for 114// sending signals to the child process, restarting the child process, and 115// gracefully terminating the child process. 116func New(i *NewInput) (*Child, error) { 117 if i == nil { 118 i = new(NewInput) 119 } 120 121 if len(i.Command) == 0 { 122 return nil, ErrMissingCommand 123 } 124 125 child := &Child{ 126 stdin: i.Stdin, 127 stdout: i.Stdout, 128 stderr: i.Stderr, 129 command: i.Command, 130 args: i.Args, 131 env: i.Env, 132 timeout: i.Timeout, 133 reloadSignal: i.ReloadSignal, 134 killSignal: i.KillSignal, 135 killTimeout: i.KillTimeout, 136 splay: i.Splay, 137 stopCh: make(chan struct{}, 1), 138 } 139 140 return child, nil 141} 142 143// ExitCh returns the current exit channel for this child process. This channel 144// may change if the process is restarted, so implementers must not cache this 145// value. 146func (c *Child) ExitCh() <-chan int { 147 c.RLock() 148 defer c.RUnlock() 149 return c.exitCh 150} 151 152// Pid returns the pid of the child process. If no child process exists, 0 is 153// returned. 154func (c *Child) Pid() int { 155 c.RLock() 156 defer c.RUnlock() 157 return c.pid() 158} 159 160// Command returns the human-formatted command with arguments. 161func (c *Child) Command() string { 162 list := append([]string{c.command}, c.args...) 163 return strings.Join(list, " ") 164} 165 166// Start starts and begins execution of the child process. A buffered channel 167// is returned which is where the command's exit code will be returned upon 168// exit. Any errors that occur prior to starting the command will be returned 169// as the second error argument, but any errors returned by the command after 170// execution will be returned as a non-zero value over the exit code channel. 171func (c *Child) Start() error { 172 log.Printf("[INFO] (child) spawning: %s", c.Command()) 173 c.Lock() 174 defer c.Unlock() 175 return c.start() 176} 177 178// Signal sends the signal to the child process, returning any errors that 179// occur. 180func (c *Child) Signal(s os.Signal) error { 181 log.Printf("[INFO] (child) receiving signal %q", s.String()) 182 c.RLock() 183 defer c.RUnlock() 184 return c.signal(s) 185} 186 187// Reload sends the reload signal to the child process and does not wait for a 188// response. If no reload signal was provided, the process is restarted and 189// replaces the process attached to this Child. 190func (c *Child) Reload() error { 191 if c.reloadSignal == nil { 192 log.Printf("[INFO] (child) restarting process") 193 194 // Take a full lock because start is going to replace the process. We also 195 // want to make sure that no other routines attempt to send reload signals 196 // during this transition. 197 c.Lock() 198 defer c.Unlock() 199 200 c.kill(false) 201 return c.start() 202 } 203 204 log.Printf("[INFO] (child) reloading process") 205 206 // We only need a read lock here because neither the process nor the exit 207 // channel are changing. 208 c.RLock() 209 defer c.RUnlock() 210 211 return c.reload() 212} 213 214// Kill sends the kill signal to the child process and waits for successful 215// termination. If no kill signal is defined, the process is killed with the 216// most aggressive kill signal. If the process does not gracefully stop within 217// the provided KillTimeout, the process is force-killed. If a splay was 218// provided, this function will sleep for a random period of time between 0 and 219// the provided splay value to reduce the thundering herd problem. This function 220// does not return any errors because it guarantees the process will be dead by 221// the return of the function call. 222func (c *Child) Kill() { 223 log.Printf("[INFO] (child) killing process") 224 c.Lock() 225 defer c.Unlock() 226 c.kill(false) 227} 228 229// Stop behaves almost identical to Kill except it suppresses future processes 230// from being started by this child and it prevents the killing of the child 231// process from sending its value back up the exit channel. This is useful 232// when doing a graceful shutdown of an application. 233func (c *Child) Stop() { 234 c.internalStop(false) 235} 236 237// StopImmediately behaves almost identical to Stop except it does not wait 238// for any random splay if configured. This is used for performing a fast 239// shutdown of consul-template and its children when a kill signal is received. 240func (c *Child) StopImmediately() { 241 c.internalStop(true) 242} 243 244func (c *Child) internalStop(immediately bool) { 245 log.Printf("[INFO] (child) stopping process") 246 247 c.Lock() 248 defer c.Unlock() 249 250 c.stopLock.Lock() 251 defer c.stopLock.Unlock() 252 if c.stopped { 253 log.Printf("[WARN] (child) already stopped") 254 return 255 } 256 c.kill(immediately) 257 close(c.stopCh) 258 c.stopped = true 259} 260 261func (c *Child) start() error { 262 cmd := exec.Command(c.command, c.args...) 263 cmd.Stdin = c.stdin 264 cmd.Stdout = c.stdout 265 cmd.Stderr = c.stderr 266 cmd.Env = c.env 267 if err := cmd.Start(); err != nil { 268 return err 269 } 270 c.cmd = cmd 271 272 // Create a new exitCh so that previously invoked commands (if any) don't 273 // cause us to exit, and start a goroutine to wait for that process to end. 274 exitCh := make(chan int, 1) 275 go func() { 276 var code int 277 err := cmd.Wait() 278 if err == nil { 279 code = ExitCodeOK 280 } else { 281 code = ExitCodeError 282 if exiterr, ok := err.(*exec.ExitError); ok { 283 if status, ok := exiterr.Sys().(syscall.WaitStatus); ok { 284 code = status.ExitStatus() 285 } 286 } 287 } 288 289 // If the child is in the process of killing, do not send a response back 290 // down the exit channel. 291 c.stopLock.RLock() 292 defer c.stopLock.RUnlock() 293 if !c.stopped { 294 select { 295 case <-c.stopCh: 296 case exitCh <- code: 297 } 298 } 299 300 close(exitCh) 301 }() 302 303 c.exitCh = exitCh 304 305 // If a timeout was given, start the timer to wait for the child to exit 306 if c.timeout != 0 { 307 select { 308 case code := <-exitCh: 309 if code != 0 { 310 return fmt.Errorf( 311 "command exited with a non-zero exit status:\n"+ 312 "\n"+ 313 " %s\n"+ 314 "\n"+ 315 "This is assumed to be a failure. Please ensure the command\n"+ 316 "exits with a zero exit status.", 317 c.Command(), 318 ) 319 } 320 case <-time.After(c.timeout): 321 // Force-kill the process 322 c.stopLock.Lock() 323 defer c.stopLock.Unlock() 324 if c.cmd != nil && c.cmd.Process != nil { 325 c.cmd.Process.Kill() 326 } 327 328 return fmt.Errorf( 329 "command did not exit within %q:\n"+ 330 "\n"+ 331 " %s\n"+ 332 "\n"+ 333 "Commands must exit in a timely manner in order for processing to\n"+ 334 "continue. Consider using a process supervisor or utilizing the\n"+ 335 "built-in exec mode instead.", 336 c.timeout, 337 c.Command(), 338 ) 339 } 340 } 341 342 return nil 343} 344 345func (c *Child) pid() int { 346 if !c.running() { 347 return 0 348 } 349 return c.cmd.Process.Pid 350} 351 352func (c *Child) signal(s os.Signal) error { 353 if !c.running() { 354 return nil 355 } 356 return c.cmd.Process.Signal(s) 357} 358 359func (c *Child) reload() error { 360 select { 361 case <-c.stopCh: 362 case <-c.randomSplay(): 363 } 364 365 return c.signal(c.reloadSignal) 366} 367 368// kill sends the signal to kill the process using the configured signal 369// if set, else the default system signal 370func (c *Child) kill(immediately bool) { 371 372 if !c.running() { 373 log.Printf("[DEBUG] (child) Kill() called but process dead; not waiting for splay.") 374 return 375 } else if immediately { 376 log.Printf("[DEBUG] (child) Kill() called but performing immediate shutdown; not waiting for splay.") 377 } else { 378 select { 379 case <-c.stopCh: 380 case <-c.randomSplay(): 381 } 382 } 383 384 exited := false 385 process := c.cmd.Process 386 387 if c.killSignal != nil { 388 if err := process.Signal(c.killSignal); err == nil { 389 // Wait a few seconds for it to exit 390 killCh := make(chan struct{}, 1) 391 go func() { 392 defer close(killCh) 393 process.Wait() 394 }() 395 396 select { 397 case <-c.stopCh: 398 case <-killCh: 399 exited = true 400 case <-time.After(c.killTimeout): 401 } 402 } 403 } 404 405 if !exited { 406 process.Kill() 407 } 408 409 c.cmd = nil 410} 411 412func (c *Child) running() bool { 413 select { 414 case <-c.exitCh: 415 return false 416 default: 417 } 418 return c.cmd != nil && c.cmd.Process != nil 419} 420 421func (c *Child) randomSplay() <-chan time.Time { 422 if c.splay == 0 { 423 return time.After(0) 424 } 425 426 ns := c.splay.Nanoseconds() 427 offset := rand.Int63n(ns) 428 t := time.Duration(offset) 429 430 log.Printf("[DEBUG] (child) waiting %.2fs for random splay", t.Seconds()) 431 432 return time.After(t) 433} 434