package daemon // import "github.com/docker/docker/daemon"

import (
	"bytes"
	"context"
	"fmt"
	"runtime"
	"strings"
	"sync"
	"time"

	"github.com/docker/docker/api/types"
	"github.com/docker/docker/api/types/strslice"
	"github.com/docker/docker/container"
	"github.com/docker/docker/daemon/exec"
	"github.com/sirupsen/logrus"
)

const (
	// Longest healthcheck probe output message to store. Longer messages will be truncated.
	maxOutputLen = 4096

	// Default interval between probe runs (from the end of one probe to the start of the next).
	// Also the time before the first probe.
	defaultProbeInterval = 30 * time.Second

	// The maximum length of time a single probe run should take. If the probe takes longer
	// than this, the check is considered to have failed.
	defaultProbeTimeout = 30 * time.Second

	// The time given for the container to start before the health check starts considering
	// the container unstable. Defaults to none.
	defaultStartPeriod = 0 * time.Second

	// Default number of consecutive failures of the health check
	// for the container to be considered unhealthy.
	defaultProbeRetries = 3

	// Maximum number of log entries to record per container.
	maxLogEntries = 5
)

const (
	// Exit status codes that can be returned by the probe command.

	exitStatusHealthy = 0 // Container is healthy
)
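
// Any non-zero exit status is treated as a probe failure. The Dockerfile
// reference additionally reserves status 1 for "unhealthy" and status 2 for
// future use, but the daemon only distinguishes zero from non-zero here.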

// probe implementations know how to run a particular type of probe.
type probe interface {
	// Perform one run of the check. Returns the exit code and an optional
	// short diagnostic string.
	run(context.Context, *Daemon, *container.Container) (*types.HealthcheckResult, error)
}

// cmdProbe implements the "CMD" probe type.
type cmdProbe struct {
	// Run the command with the system's default shell instead of execing it directly.
	shell bool
}
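
// For example, a Dockerfile health check in shell form, such as
//
//	HEALTHCHECK CMD curl -f http://localhost/ || exit 1
//
// arrives here as Test = ["CMD-SHELL", "curl -f http://localhost/ || exit 1"]
// and is wrapped in the container's shell (see getShell), while the exec form
// ["CMD", "curl", "-f", "http://localhost/"] is executed directly.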

// Exec the healthcheck command in the container.
// Returns the exit code and probe output (if any).
func (p *cmdProbe) run(ctx context.Context, d *Daemon, cntr *container.Container) (*types.HealthcheckResult, error) {
	cmdSlice := strslice.StrSlice(cntr.Config.Healthcheck.Test)[1:]
	if p.shell {
		cmdSlice = append(getShell(cntr), cmdSlice...)
	}
	entrypoint, args := d.getEntrypointAndArgs(strslice.StrSlice{}, cmdSlice)
	execConfig := exec.NewConfig()
	execConfig.OpenStdin = false
	execConfig.OpenStdout = true
	execConfig.OpenStderr = true
	execConfig.ContainerID = cntr.ID
	execConfig.DetachKeys = []byte{}
	execConfig.Entrypoint = entrypoint
	execConfig.Args = args
	execConfig.Tty = false
	execConfig.Privileged = false
	execConfig.User = cntr.Config.User
	execConfig.WorkingDir = cntr.Config.WorkingDir

	linkedEnv, err := d.setupLinkedContainers(cntr)
	if err != nil {
		return nil, err
	}
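	// Give the probe the container's environment, including any
	// linked-container variables.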
	execConfig.Env = container.ReplaceOrAppendEnvValues(cntr.CreateDaemonEnvironment(execConfig.Tty, linkedEnv), execConfig.Env)

	d.registerExecCommand(cntr, execConfig)
	attributes := map[string]string{
		"execID": execConfig.ID,
	}
	d.LogContainerEventWithAttributes(cntr, "exec_create: "+execConfig.Entrypoint+" "+strings.Join(execConfig.Args, " "), attributes)

	output := &limitedBuffer{}
	err = d.ContainerExecStart(ctx, execConfig.ID, nil, output, output)
	if err != nil {
		return nil, err
	}
	info, err := d.getExecConfig(execConfig.ID)
	if err != nil {
		return nil, err
	}
	if info.ExitCode == nil {
		return nil, fmt.Errorf("healthcheck for container %s has no exit code", cntr.ID)
	}
	// Note: Go's json package will handle invalid UTF-8 for us
	out := output.String()
	return &types.HealthcheckResult{
		End:      time.Now(),
		ExitCode: *info.ExitCode,
		Output:   out,
	}, nil
}

// Update the container's Status.Health struct based on the latest probe's result.
func handleProbeResult(d *Daemon, c *container.Container, result *types.HealthcheckResult, done chan struct{}) {
	c.Lock()
	defer c.Unlock()

	// The probe may have been cancelled while we waited for the lock; if so,
	// ignore the result.
	select {
	case <-done:
		return
	default:
	}

	retries := c.Config.Healthcheck.Retries
	if retries <= 0 {
		retries = defaultProbeRetries
	}

	h := c.State.Health
	oldStatus := h.Status()

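	// Keep only the most recent maxLogEntries probe results, dropping the
	// oldest entry to make room for the new one.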
	if len(h.Log) >= maxLogEntries {
		h.Log = append(h.Log[len(h.Log)+1-maxLogEntries:], result)
	} else {
		h.Log = append(h.Log, result)
	}

	if result.ExitCode == exitStatusHealthy {
		h.FailingStreak = 0
		h.SetStatus(types.Healthy)
	} else { // Failure (including invalid exit code)
		shouldIncrementStreak := true

		// If the container is starting (i.e. we have never had a successful
		// health check), check whether we are still within the start period;
		// if so, do not increment the failure streak.
		if h.Status() == types.Starting {
			startPeriod := timeoutWithDefault(c.Config.Healthcheck.StartPeriod, defaultStartPeriod)
			timeSinceStart := result.Start.Sub(c.State.StartedAt)

			// If still within the start period, then don't increment failing streak.
			if timeSinceStart < startPeriod {
				shouldIncrementStreak = false
			}
		}

		if shouldIncrementStreak {
			h.FailingStreak++

			if h.FailingStreak >= retries {
				h.SetStatus(types.Unhealthy)
			}
		}
		// Else we're starting or healthy. Stay in that state.
	}

	// Replicate Health status changes.
	if err := c.CheckpointTo(d.containersReplica); err != nil {
		// Queries will be inconsistent until the next probe runs or another
		// state mutation checkpoints the container.
		logrus.Errorf("Error replicating health state for container %s: %v", c.ID, err)
	}

	current := h.Status()
	if oldStatus != current {
		d.LogContainerEvent(c, "health_status: "+current)
	}
}

// Run the container's monitoring thread until notified via "stop".
// There is never more than one monitor thread running per container at a time.
func monitor(d *Daemon, c *container.Container, stop chan struct{}, probe probe) {
	probeTimeout := timeoutWithDefault(c.Config.Healthcheck.Timeout, defaultProbeTimeout)
	probeInterval := timeoutWithDefault(c.Config.Healthcheck.Interval, defaultProbeInterval)

	intervalTimer := time.NewTimer(probeInterval)
	defer intervalTimer.Stop()

	for {
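		// Reset (rather than recreate) the timer on each iteration; the
		// interval is measured from the end of one probe run to the start of
		// the next.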
		intervalTimer.Reset(probeInterval)

		select {
		case <-stop:
			logrus.Debugf("Stop healthcheck monitoring for container %s (received while idle)", c.ID)
			return
		case <-intervalTimer.C:
			logrus.Debugf("Running health check for container %s ...", c.ID)
			startTime := time.Now()
			ctx, cancelProbe := context.WithTimeout(context.Background(), probeTimeout)
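			// Buffered (capacity 1) so the probe goroutine can send its result
			// and exit without waiting for the receiver.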
			results := make(chan *types.HealthcheckResult, 1)
			go func() {
				healthChecksCounter.Inc()
				result, err := probe.run(ctx, d, c)
				if err != nil {
					healthChecksFailedCounter.Inc()
					logrus.Warnf("Health check for container %s error: %v", c.ID, err)
					results <- &types.HealthcheckResult{
						ExitCode: -1,
						Output:   err.Error(),
						Start:    startTime,
						End:      time.Now(),
					}
				} else {
					result.Start = startTime
					logrus.Debugf("Health check for container %s done (exitCode=%d)", c.ID, result.ExitCode)
					results <- result
				}
				close(results)
			}()
			select {
			case <-stop:
				logrus.Debugf("Stop healthcheck monitoring for container %s (received while probing)", c.ID)
				cancelProbe()
				// Wait for the probe to exit (it might take a while to respond to the
				// TERM signal, and we don't want dying probes to pile up).
				<-results
				return
			case result := <-results:
				handleProbeResult(d, c, result, stop)
				// Probe finished in time; cancel the timeout context.
				cancelProbe()
			case <-ctx.Done():
				logrus.Debugf("Health check for container %s taking too long", c.ID)
				handleProbeResult(d, c, &types.HealthcheckResult{
					ExitCode: -1,
					Output:   fmt.Sprintf("Health check exceeded timeout (%v)", probeTimeout),
					Start:    startTime,
					End:      time.Now(),
				}, stop)
				cancelProbe()
				// Wait for the probe to exit (it might take a while to respond to the
				// TERM signal, and we don't want dying probes to pile up).
				<-results
			}
		}
	}
}

// Get a suitable probe implementation for the container's healthcheck configuration.
// Nil will be returned if no healthcheck was configured or NONE was set.
func getProbe(c *container.Container) probe {
	config := c.Config.Healthcheck
	if config == nil || len(config.Test) == 0 {
		return nil
	}
	switch config.Test[0] {
	case "CMD":
		return &cmdProbe{shell: false}
	case "CMD-SHELL":
		return &cmdProbe{shell: true}
	case "NONE":
		return nil
	default:
		logrus.Warnf("Unknown healthcheck type '%s' (expected 'CMD', 'CMD-SHELL', or 'NONE') in container %s", config.Test[0], c.ID)
		return nil
	}
}

// Ensure the health-check monitor is running (or stopped) to match the
// container's current state.
// Called from monitor.go, with c locked.
func (daemon *Daemon) updateHealthMonitor(c *container.Container) {
	h := c.State.Health
	if h == nil {
		return // No healthcheck configured
	}

	probe := getProbe(c)
	wantRunning := c.Running && !c.Paused && probe != nil
	if wantRunning {
		if stop := h.OpenMonitorChannel(); stop != nil {
			go monitor(daemon, c, stop, probe)
		}
	} else {
		h.CloseMonitorChannel()
	}
}

// Reset the health state for a newly-started, restarted or restored container.
// initHealthMonitor is called from monitor.go and we should never be running
// two instances at once.
// Called with c locked.
func (daemon *Daemon) initHealthMonitor(c *container.Container) {
	// If no healthcheck is set up then don't init the monitor.
	if getProbe(c) == nil {
		return
	}

	// This is needed in case we're auto-restarting.
	daemon.stopHealthchecks(c)

	if h := c.State.Health; h != nil {
		h.SetStatus(types.Starting)
		h.FailingStreak = 0
	} else {
		h := &container.Health{}
		h.SetStatus(types.Starting)
		c.State.Health = h
	}

	daemon.updateHealthMonitor(c)
}

// Called when the container is being stopped (whether because the health check is
// failing or for any other reason).
func (daemon *Daemon) stopHealthchecks(c *container.Container) {
	h := c.State.Health
	if h != nil {
		h.CloseMonitorChannel()
	}
}

// Buffer up to maxOutputLen bytes. Further data is discarded.
type limitedBuffer struct {
	buf       bytes.Buffer
	mu        sync.Mutex
	truncated bool // indicates that data has been lost
}
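
// For example (hypothetical usage): writing 5000 bytes to a fresh
// limitedBuffer stores only the first maxOutputLen (4096) bytes; String()
// then returns that prefix with "..." appended to flag the truncation.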

// Append to limitedBuffer while there is room.
func (b *limitedBuffer) Write(data []byte) (int, error) {
	b.mu.Lock()
	defer b.mu.Unlock()

	bufLen := b.buf.Len()
	dataLen := len(data)
	keep := min(maxOutputLen-bufLen, dataLen)
	if keep > 0 {
		b.buf.Write(data[:keep])
	}
	if keep < dataLen {
		b.truncated = true
	}
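	// Report the full length as written so callers (e.g. io.Copy) don't
	// treat the truncation as an error.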
	return dataLen, nil
}

// The contents of the buffer, with "..." appended if it overflowed.
func (b *limitedBuffer) String() string {
	b.mu.Lock()
	defer b.mu.Unlock()

	out := b.buf.String()
	if b.truncated {
		out = out + "..."
	}
	return out
}

// If configuredValue is zero, use defaultValue instead.
func timeoutWithDefault(configuredValue time.Duration, defaultValue time.Duration) time.Duration {
	if configuredValue == 0 {
		return defaultValue
	}
	return configuredValue
}

func min(x, y int) int {
	if x < y {
		return x
	}
	return y
}

// getShell returns the shell used to wrap "CMD-SHELL" probes.
func getShell(cntr *container.Container) []string {
	if len(cntr.Config.Shell) != 0 {
		return cntr.Config.Shell
	}
	if runtime.GOOS != "windows" {
		return []string{"/bin/sh", "-c"}
	}
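	// The daemon is running on Windows here; a Linux container (OS mismatch)
	// still needs a POSIX shell.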
	if cntr.OS != runtime.GOOS {
		return []string{"/bin/sh", "-c"}
	}
	return []string{"cmd", "/S", "/C"}
}