1package child
2
3import (
4	"errors"
5	"fmt"
6	"io"
7	"log"
8	"math/rand"
9	"os"
10	"os/exec"
11	"strings"
12	"sync"
13	"syscall"
14	"time"
15)
16
func init() {
	// The splay delays are drawn from the default math/rand Source; seed it
	// from the wall clock so different runs pick different delays.
	seed := time.Now().UnixNano()
	rand.Seed(seed)
}
22
var (
	// ExitCodeOK is the exit code reported when the child exits without error.
	ExitCodeOK = 0

	// ExitCodeError is the exit code reported when the child fails but no more
	// specific exit status can be extracted from the failure.
	ExitCodeError = 127

	// ErrMissingCommand is returned by New when the input does not name a
	// command to run.
	ErrMissingCommand = errors.New("missing command")
)
35
// Child wraps a single child process and exposes a high-level API for sending
// it signals and managing its lifecycle (start, reload, kill, stop).
type Child struct {
	// The embedded RWMutex guards cmd and exitCh; the exported methods acquire
	// it before touching either.
	sync.RWMutex

	// stdin, stdout and stderr are attached to every process spawned by start.
	stdin  io.Reader
	stdout io.Writer
	stderr io.Writer

	// command, args and env describe the process to spawn.
	command string
	args    []string
	env     []string

	// timeout, when non-zero, is how long start waits for the process to exit
	// before force-killing it and returning an error.
	timeout time.Duration

	// reloadSignal is the signal sent by Reload. When it is nil, Reload
	// restarts the process instead.
	reloadSignal os.Signal

	// killSignal is sent first when killing the process, and killTimeout bounds
	// how long to wait for it to exit afterwards before force-killing.
	killSignal  os.Signal
	killTimeout time.Duration

	// splay is the upper bound of the random delay applied before reload and
	// kill signals are sent.
	splay time.Duration

	// cmd is the process currently under management. It is reset to nil once
	// the process has been killed.
	cmd *exec.Cmd

	// exitCh is the channel on which the current process's exit code is
	// delivered. start replaces it every time a new process is spawned.
	exitCh chan int

	// stopLock guards stopped. stopCh is the circuit breaker that is closed on
	// stop to cut short any pending splay or kill wait. stopped records that
	// the child has already been stopped.
	stopLock sync.RWMutex
	stopCh   chan struct{}
	stopped  bool
}
69
// NewInput carries the settings New uses to build a Child.
type NewInput struct {
	// Stdin is the io.Reader handed to the child process as its standard
	// input.
	Stdin io.Reader

	// Stdout and Stderr are the io.Writers that receive the child process's
	// standard output and standard error respectively.
	Stdout io.Writer
	Stderr io.Writer

	// Command is the name of the command to execute, and Args is the list of
	// arguments passed to it when it is started.
	Command string
	Args    []string

	// Timeout is the maximum amount of time the command may run. A value of 0
	// lets the command run indefinitely.
	Timeout time.Duration

	// Env is the list of environment variables given to the child process, in
	// key=value format. The parent's environment is not merged in, so callers
	// must include it themselves if the child needs it.
	//
	// NOTE(review): the value is assigned directly to exec.Cmd.Env, and os/exec
	// documents that a nil Env makes the child use the parent's environment;
	// confirm whether callers depend on that.
	Env []string

	// ReloadSignal is the signal sent to make the process reload. It may be
	// nil, in which case a reload restarts the process instead.
	ReloadSignal os.Signal

	// KillSignal is the signal sent to ask the process to terminate
	// gracefully. It may be nil, in which case the process is force-killed.
	KillSignal os.Signal

	// KillTimeout is how long to wait for the process to terminate after
	// KillSignal has been sent before force-killing it.
	KillTimeout time.Duration

	// Splay is the maximum random amount of time to wait before sending
	// signals. Sleeping for a random period keeps many processes from all
	// signaling at the same moment (the thundering herd problem). A value of
	// zero disables the splay entirely.
	Splay time.Duration
}
112
113// New creates a new child process for management with high-level APIs for
114// sending signals to the child process, restarting the child process, and
115// gracefully terminating the child process.
116func New(i *NewInput) (*Child, error) {
117	if i == nil {
118		i = new(NewInput)
119	}
120
121	if len(i.Command) == 0 {
122		return nil, ErrMissingCommand
123	}
124
125	child := &Child{
126		stdin:        i.Stdin,
127		stdout:       i.Stdout,
128		stderr:       i.Stderr,
129		command:      i.Command,
130		args:         i.Args,
131		env:          i.Env,
132		timeout:      i.Timeout,
133		reloadSignal: i.ReloadSignal,
134		killSignal:   i.KillSignal,
135		killTimeout:  i.KillTimeout,
136		splay:        i.Splay,
137		stopCh:       make(chan struct{}, 1),
138	}
139
140	return child, nil
141}
142
143// ExitCh returns the current exit channel for this child process. This channel
144// may change if the process is restarted, so implementers must not cache this
145// value.
146func (c *Child) ExitCh() <-chan int {
147	c.RLock()
148	defer c.RUnlock()
149	return c.exitCh
150}
151
152// Pid returns the pid of the child process. If no child process exists, 0 is
153// returned.
154func (c *Child) Pid() int {
155	c.RLock()
156	defer c.RUnlock()
157	return c.pid()
158}
159
160// Command returns the human-formatted command with arguments.
161func (c *Child) Command() string {
162	list := append([]string{c.command}, c.args...)
163	return strings.Join(list, " ")
164}
165
// Start spawns the child process and begins managing it. It returns an error
// if the command cannot be launched or, when a timeout is configured, if the
// command exits with a non-zero status or does not exit within that timeout.
// Once started, the exit code of the process is delivered on the channel
// returned by ExitCh rather than through this return value.
func (c *Child) Start() error {
	log.Printf("[INFO] (child) spawning: %s", c.Command())
	// Take the write lock: start replaces both the managed process and the exit
	// channel.
	c.Lock()
	defer c.Unlock()
	return c.start()
}
177
178// Signal sends the signal to the child process, returning any errors that
179// occur.
180func (c *Child) Signal(s os.Signal) error {
181	log.Printf("[INFO] (child) receiving signal %q", s.String())
182	c.RLock()
183	defer c.RUnlock()
184	return c.signal(s)
185}
186
187// Reload sends the reload signal to the child process and does not wait for a
188// response. If no reload signal was provided, the process is restarted and
189// replaces the process attached to this Child.
190func (c *Child) Reload() error {
191	if c.reloadSignal == nil {
192		log.Printf("[INFO] (child) restarting process")
193
194		// Take a full lock because start is going to replace the process. We also
195		// want to make sure that no other routines attempt to send reload signals
196		// during this transition.
197		c.Lock()
198		defer c.Unlock()
199
200		c.kill(false)
201		return c.start()
202	}
203
204	log.Printf("[INFO] (child) reloading process")
205
206	// We only need a read lock here because neither the process nor the exit
207	// channel are changing.
208	c.RLock()
209	defer c.RUnlock()
210
211	return c.reload()
212}
213
// Kill terminates the child process. If a kill signal is configured it is sent
// first and the process is given up to KillTimeout to exit; if no kill signal
// is configured, the signal cannot be delivered, or the process does not exit
// in time, the process is force-killed. When a splay is configured, Kill first
// sleeps for a random period between 0 and the splay value to reduce the
// thundering herd problem. Kill is a no-op when no process is running. It
// returns no error: failures while signaling or force-killing are not reported
// to the caller.
func (c *Child) Kill() {
	log.Printf("[INFO] (child) killing process")
	// Take the write lock: kill clears the managed process.
	c.Lock()
	defer c.Unlock()
	// false: honor the configured splay before signaling.
	c.kill(false)
}
228
// Stop behaves like Kill, but additionally marks the child as stopped so that
// the exit of the killed process is not sent on the exit channel. This is
// useful when doing a graceful shutdown of an application. Calling Stop on a
// child that is already stopped only logs a warning.
//
// NOTE(review): nothing visible in this file prevents Start or Reload from
// spawning a new process after Stop; confirm whether callers rely on future
// starts being suppressed.
func (c *Child) Stop() {
	// false: honor the configured splay before killing.
	c.internalStop(false)
}
236
// StopImmediately behaves like Stop except that it does not wait for the
// random splay, if one is configured, before killing the process. This is used
// for performing a fast shutdown of consul-template and its children when a
// kill signal is received.
func (c *Child) StopImmediately() {
	// true: skip the splay and kill right away.
	c.internalStop(true)
}
243
// internalStop is the shared implementation of Stop and StopImmediately. It
// kills the running process (skipping the splay when immediately is true),
// closes stopCh to release anything waiting on it, and marks the child as
// stopped. If the child is already stopped it only logs a warning.
func (c *Child) internalStop(immediately bool) {
	log.Printf("[INFO] (child) stopping process")

	c.Lock()
	defer c.Unlock()

	// stopLock is held for the whole sequence, so the exit-watching goroutine
	// launched by start cannot read stopped until it has been set below.
	c.stopLock.Lock()
	defer c.stopLock.Unlock()
	if c.stopped {
		log.Printf("[WARN] (child) already stopped")
		return
	}
	c.kill(immediately)
	// Closing stopCh wakes every receiver; the stopped check above ensures it
	// is closed only once.
	close(c.stopCh)
	c.stopped = true
}
260
// start spawns the configured command, records it as the managed process and
// installs a fresh exit channel for it. Start and Reload call it while holding
// the write lock.
//
// A background goroutine waits for the process and sends its exit code on the
// new channel (unless the child has been stopped), then closes the channel.
// When a timeout is configured, start additionally blocks until the process
// exits or the timeout elapses; a non-zero exit code, or an elapsed timeout
// (after which the process is force-killed), is returned as an error. Errors
// from launching the command are returned unchanged.
func (c *Child) start() error {
	cmd := exec.Command(c.command, c.args...)
	cmd.Stdin = c.stdin
	cmd.Stdout = c.stdout
	cmd.Stderr = c.stderr
	cmd.Env = c.env
	if err := cmd.Start(); err != nil {
		return err
	}
	c.cmd = cmd

	// Create a new exitCh so that previously invoked commands (if any) don't
	// cause us to exit, and start a goroutine to wait for that process to end.
	exitCh := make(chan int, 1)
	go func() {
		var code int
		err := cmd.Wait()
		if err == nil {
			code = ExitCodeOK
		} else {
			// Fall back to the generic error code unless the failure carries a
			// wait status from which a specific exit status can be read.
			code = ExitCodeError
			if exiterr, ok := err.(*exec.ExitError); ok {
				if status, ok := exiterr.Sys().(syscall.WaitStatus); ok {
					code = status.ExitStatus()
				}
			}
		}

		// If the child is in the process of killing, do not send a response back
		// down the exit channel.
		c.stopLock.RLock()
		defer c.stopLock.RUnlock()
		if !c.stopped {
			select {
			case <-c.stopCh:
			case exitCh <- code:
			}
		}

		// Always close the channel so receivers are released even when no code
		// was sent.
		close(exitCh)
	}()

	c.exitCh = exitCh

	// If a timeout was given, start the timer to wait for the child to exit
	if c.timeout != 0 {
		select {
		// This receive takes the exit code off exitCh, so later receivers only
		// observe the closed channel.
		case code := <-exitCh:
			if code != 0 {
				return fmt.Errorf(
					"command exited with a non-zero exit status:\n"+
						"\n"+
						"    %s\n"+
						"\n"+
						"This is assumed to be a failure. Please ensure the command\n"+
						"exits with a zero exit status.",
					c.Command(),
				)
			}
		case <-time.After(c.timeout):
			// Force-kill the process. stopLock stays held until start returns,
			// so the goroutine above cannot get past its RLock before then.
			c.stopLock.Lock()
			defer c.stopLock.Unlock()
			if c.cmd != nil && c.cmd.Process != nil {
				c.cmd.Process.Kill()
			}

			return fmt.Errorf(
				"command did not exit within %q:\n"+
					"\n"+
					"    %s\n"+
					"\n"+
					"Commands must exit in a timely manner in order for processing to\n"+
					"continue. Consider using a process supervisor or utilizing the\n"+
					"built-in exec mode instead.",
				c.timeout,
				c.Command(),
			)
		}
	}

	return nil
}
344
345func (c *Child) pid() int {
346	if !c.running() {
347		return 0
348	}
349	return c.cmd.Process.Pid
350}
351
352func (c *Child) signal(s os.Signal) error {
353	if !c.running() {
354		return nil
355	}
356	return c.cmd.Process.Signal(s)
357}
358
359func (c *Child) reload() error {
360	select {
361	case <-c.stopCh:
362	case <-c.randomSplay():
363	}
364
365	return c.signal(c.reloadSignal)
366}
367
368// kill sends the signal to kill the process using the configured signal
369// if set, else the default system signal
370func (c *Child) kill(immediately bool) {
371
372	if !c.running() {
373		log.Printf("[DEBUG] (child) Kill() called but process dead; not waiting for splay.")
374		return
375	} else if immediately {
376		log.Printf("[DEBUG] (child) Kill() called but performing immediate shutdown; not waiting for splay.")
377	} else {
378		select {
379		case <-c.stopCh:
380		case <-c.randomSplay():
381		}
382	}
383
384	exited := false
385	process := c.cmd.Process
386
387	if c.killSignal != nil {
388		if err := process.Signal(c.killSignal); err == nil {
389			// Wait a few seconds for it to exit
390			killCh := make(chan struct{}, 1)
391			go func() {
392				defer close(killCh)
393				process.Wait()
394			}()
395
396			select {
397			case <-c.stopCh:
398			case <-killCh:
399				exited = true
400			case <-time.After(c.killTimeout):
401			}
402		}
403	}
404
405	if !exited {
406		process.Kill()
407	}
408
409	c.cmd = nil
410}
411
412func (c *Child) running() bool {
413	select {
414	case <-c.exitCh:
415		return false
416	default:
417	}
418	return c.cmd != nil && c.cmd.Process != nil
419}
420
421func (c *Child) randomSplay() <-chan time.Time {
422	if c.splay == 0 {
423		return time.After(0)
424	}
425
426	ns := c.splay.Nanoseconds()
427	offset := rand.Int63n(ns)
428	t := time.Duration(offset)
429
430	log.Printf("[DEBUG] (child) waiting %.2fs for random splay", t.Seconds())
431
432	return time.After(t)
433}
434