1package main
2
3import (
4	"context"
5	"fmt"
6	"os"
7	"path/filepath"
8	"sync"
9
10	"github.com/Microsoft/hcsshim/internal/oc"
11	"go.opencensus.io/trace"
12
13	"github.com/Microsoft/hcsshim/internal/hcsoci"
14	"github.com/Microsoft/hcsshim/internal/oci"
15	"github.com/Microsoft/hcsshim/internal/uvm"
16	"github.com/Microsoft/hcsshim/osversion"
17	eventstypes "github.com/containerd/containerd/api/events"
18	"github.com/containerd/containerd/errdefs"
19	"github.com/containerd/containerd/runtime"
20	"github.com/containerd/containerd/runtime/v2/task"
21	specs "github.com/opencontainers/runtime-spec/specs-go"
22	"github.com/pkg/errors"
23	"golang.org/x/sync/errgroup"
24)
25
26// shimPod represents the logical grouping of all tasks in a single set of
27// shared namespaces. The pod sandbox (container) is represented by the task
28// that matches the `shimPod.ID()`
29type shimPod interface {
30	// ID is the id of the task representing the pause (sandbox) container.
31	ID() string
32	// CreateTask creates a workload task within this pod named `tid` with
33	// settings `s`.
34	//
35	// If `tid==ID()` or `tid` is the same as any other task in this pod, this
36	// pod MUST return `errdefs.ErrAlreadyExists`.
37	CreateTask(ctx context.Context, req *task.CreateTaskRequest, s *specs.Spec) (shimTask, error)
38	// GetTask returns a task in this pod that matches `tid`.
39	//
40	// If `tid` is not found, this pod MUST return `errdefs.ErrNotFound`.
41	GetTask(tid string) (shimTask, error)
42	// KillTask sends `signal` to task that matches `tid`.
43	//
44	// If `tid` is not found, this pod MUST return `errdefs.ErrNotFound`.
45	//
46	// If `tid==ID() && eid == "" && all == true` this pod will send `signal` to
47	// all tasks in the pod and lastly send `signal` to the sandbox itself.
48	//
49	// If `all == true && eid != ""` this pod MUST return
50	// `errdefs.ErrFailedPrecondition`.
51	//
52	// A call to `KillTask` is only valid when the exec found by `tid,eid` is in
53	// the `shimExecStateRunning, shimExecStateExited` states. If the exec is
54	// not in this state this pod MUST return `errdefs.ErrFailedPrecondition`.
55	KillTask(ctx context.Context, tid, eid string, signal uint32, all bool) error
56}
57
58func createPod(ctx context.Context, events publisher, req *task.CreateTaskRequest, s *specs.Spec) (_ shimPod, err error) {
59	ctx, span := trace.StartSpan(ctx, "createPod")
60	defer span.End()
61	defer func() {
62		oc.SetSpanStatus(span, err)
63	}()
64	span.AddAttributes(trace.StringAttribute("tid", req.ID))
65
66	if osversion.Get().Build < osversion.RS5 {
67		return nil, errors.Wrapf(errdefs.ErrFailedPrecondition, "pod support is not available on Windows versions previous to RS5 (%d)", osversion.RS5)
68	}
69
70	ct, sid, err := oci.GetSandboxTypeAndID(s.Annotations)
71	if err != nil {
72		return nil, err
73	}
74	if ct != oci.KubernetesContainerTypeSandbox {
75		return nil, errors.Wrapf(
76			errdefs.ErrFailedPrecondition,
77			"expected annotation: '%s': '%s' got '%s'",
78			oci.KubernetesContainerTypeAnnotation,
79			oci.KubernetesContainerTypeSandbox,
80			ct)
81	}
82	if sid != req.ID {
83		return nil, errors.Wrapf(
84			errdefs.ErrFailedPrecondition,
85			"expected annotation '%s': '%s' got '%s'",
86			oci.KubernetesSandboxIDAnnotation,
87			req.ID,
88			sid)
89	}
90
91	owner := filepath.Base(os.Args[0])
92	isWCOW := oci.IsWCOW(s)
93
94	var parent *uvm.UtilityVM
95	if oci.IsIsolated(s) {
96		// Create the UVM parent
97		opts, err := oci.SpecToUVMCreateOpts(ctx, s, fmt.Sprintf("%s@vm", req.ID), owner)
98		if err != nil {
99			return nil, err
100		}
101		switch opts.(type) {
102		case *uvm.OptionsLCOW:
103			lopts := (opts).(*uvm.OptionsLCOW)
104			parent, err = uvm.CreateLCOW(ctx, lopts)
105			if err != nil {
106				return nil, err
107			}
108		case *uvm.OptionsWCOW:
109			wopts := (opts).(*uvm.OptionsWCOW)
110
111			// In order for the UVM sandbox.vhdx not to collide with the actual
112			// nested Argon sandbox.vhdx we append the \vm folder to the last
113			// entry in the list.
114			layersLen := len(s.Windows.LayerFolders)
115			layers := make([]string, layersLen)
116			copy(layers, s.Windows.LayerFolders)
117
118			vmPath := filepath.Join(layers[layersLen-1], "vm")
119			err := os.MkdirAll(vmPath, 0)
120			if err != nil {
121				return nil, err
122			}
123			layers[layersLen-1] = vmPath
124			wopts.LayerFolders = layers
125
126			parent, err = uvm.CreateWCOW(ctx, wopts)
127			if err != nil {
128				return nil, err
129			}
130		}
131		err = parent.Start(ctx)
132		if err != nil {
133			parent.Close()
134			return nil, err
135		}
136	} else if !isWCOW {
137		return nil, errors.Wrap(errdefs.ErrFailedPrecondition, "oci spec does not contain WCOW or LCOW spec")
138	}
139	defer func() {
140		// clean up the uvm if we fail any further operations
141		if err != nil && parent != nil {
142			parent.Close()
143		}
144	}()
145
146	p := pod{
147		events: events,
148		id:     req.ID,
149		host:   parent,
150	}
151	// TOOD: JTERRY75 - There is a bug in the compartment activation for Windows
152	// Process isolated that requires us to create the real pause container to
153	// hold the network compartment open. This is not required for Windows
154	// Hypervisor isolated. When we have a build that supports this for Windows
155	// Process isolated make sure to move back to this model.
156	if isWCOW && parent != nil {
157		// For WCOW we fake out the init task since we dont need it. We only
158		// need to provision the guest network namespace if this is hypervisor
159		// isolated. Process isolated WCOW gets the namespace endpoints
160		// automatically.
161		if parent != nil {
162			nsid := ""
163			if s.Windows != nil && s.Windows.Network != nil {
164				nsid = s.Windows.Network.NetworkNamespace
165			}
166
167			if nsid != "" {
168				endpoints, err := hcsoci.GetNamespaceEndpoints(ctx, nsid)
169				if err != nil {
170					return nil, err
171				}
172				err = parent.AddNetNS(ctx, nsid)
173				if err != nil {
174					return nil, err
175				}
176				err = parent.AddEndpointsToNS(ctx, nsid, endpoints)
177				if err != nil {
178					return nil, err
179				}
180			}
181		}
182		p.sandboxTask = newWcowPodSandboxTask(ctx, events, req.ID, req.Bundle, parent)
183		// Publish the created event. We only do this for a fake WCOW task. A
184		// HCS Task will event itself based on actual process lifetime.
185		events.publishEvent(
186			ctx,
187			runtime.TaskCreateEventTopic,
188			&eventstypes.TaskCreate{
189				ContainerID: req.ID,
190				Bundle:      req.Bundle,
191				Rootfs:      req.Rootfs,
192				IO: &eventstypes.TaskIO{
193					Stdin:    req.Stdin,
194					Stdout:   req.Stdout,
195					Stderr:   req.Stderr,
196					Terminal: req.Terminal,
197				},
198				Checkpoint: "",
199				Pid:        0,
200			})
201	} else {
202		if isWCOW {
203			// The pause container activation will immediately exit on Windows
204			// because there is no command. We forcibly update the command here
205			// to keep it alive.
206			s.Process.CommandLine = "cmd /c ping -t 127.0.0.1 > nul"
207		}
208		// LCOW (and WCOW Process Isolated for the time being) requires a real
209		// task for the sandbox.
210		lt, err := newHcsTask(ctx, events, parent, true, req, s)
211		if err != nil {
212			return nil, err
213		}
214		p.sandboxTask = lt
215	}
216
217	return &p, nil
218}
219
220var _ = (shimPod)(&pod{})
221
222type pod struct {
223	events publisher
224	// id is the id of the sandbox task when the pod is created.
225	//
226	// It MUST be treated as read only in the lifetime of the pod.
227	id string
228	// sandboxTask is the task that represents the sandbox.
229	//
230	// Note: The invariant `id==sandboxTask.ID()` MUST be true.
231	//
232	// It MUST be treated as read only in the lifetime of the pod.
233	sandboxTask shimTask
234	// host is the UtilityVM that is hosting `sandboxTask` if the task is
235	// hypervisor isolated.
236	//
237	// It MUST be treated as read only in the lifetime of the pod.
238	host *uvm.UtilityVM
239
240	// wcl is the worload create mutex. All calls to CreateTask must hold this
241	// lock while the ID reservation takes place. Once the ID is held it is safe
242	// to release the lock to allow concurrent creates.
243	wcl           sync.Mutex
244	workloadTasks sync.Map
245}
246
247func (p *pod) ID() string {
248	return p.id
249}
250
251func (p *pod) CreateTask(ctx context.Context, req *task.CreateTaskRequest, s *specs.Spec) (_ shimTask, err error) {
252	if req.ID == p.id {
253		return nil, errors.Wrapf(errdefs.ErrAlreadyExists, "task with id: '%s' already exists", req.ID)
254	}
255	e, _ := p.sandboxTask.GetExec("")
256	if e.State() != shimExecStateRunning {
257		return nil, errors.Wrapf(errdefs.ErrFailedPrecondition, "task with id: '%s' cannot be created in pod: '%s' which is not running", req.ID, p.id)
258	}
259
260	p.wcl.Lock()
261	_, loaded := p.workloadTasks.LoadOrStore(req.ID, nil)
262	if loaded {
263		return nil, errors.Wrapf(errdefs.ErrAlreadyExists, "task with id: '%s' already exists id pod: '%s'", req.ID, p.id)
264	}
265	p.wcl.Unlock()
266	defer func() {
267		if err != nil {
268			p.workloadTasks.Delete(req.ID)
269		}
270	}()
271
272	ct, sid, err := oci.GetSandboxTypeAndID(s.Annotations)
273	if err != nil {
274		return nil, err
275	}
276	if ct != oci.KubernetesContainerTypeContainer {
277		return nil, errors.Wrapf(
278			errdefs.ErrFailedPrecondition,
279			"expected annotation: '%s': '%s' got '%s'",
280			oci.KubernetesContainerTypeAnnotation,
281			oci.KubernetesContainerTypeContainer,
282			ct)
283	}
284	if sid != p.id {
285		return nil, errors.Wrapf(
286			errdefs.ErrFailedPrecondition,
287			"expected annotation '%s': '%s' got '%s'",
288			oci.KubernetesSandboxIDAnnotation,
289			p.id,
290			sid)
291	}
292
293	st, err := newHcsTask(ctx, p.events, p.host, false, req, s)
294	if err != nil {
295		return nil, err
296	}
297
298	p.workloadTasks.Store(req.ID, st)
299	return st, nil
300}
301
302func (p *pod) GetTask(tid string) (shimTask, error) {
303	if tid == p.id {
304		return p.sandboxTask, nil
305	}
306	raw, loaded := p.workloadTasks.Load(tid)
307	if !loaded {
308		return nil, errors.Wrapf(errdefs.ErrNotFound, "task with id: '%s' not found", tid)
309	}
310	return raw.(shimTask), nil
311}
312
313func (p *pod) KillTask(ctx context.Context, tid, eid string, signal uint32, all bool) error {
314	t, err := p.GetTask(tid)
315	if err != nil {
316		return err
317	}
318	if all && eid != "" {
319		return errors.Wrapf(errdefs.ErrFailedPrecondition, "cannot signal all with non empty ExecID: '%s'", eid)
320	}
321	eg := errgroup.Group{}
322	if all && tid == p.id {
323		// We are in a kill all on the sandbox task. Signal everything.
324		p.workloadTasks.Range(func(key, value interface{}) bool {
325			wt := value.(shimTask)
326			eg.Go(func() error {
327				return wt.KillExec(ctx, eid, signal, all)
328			})
329
330			// iterate all
331			return false
332		})
333	}
334	eg.Go(func() error {
335		return t.KillExec(ctx, eid, signal, all)
336	})
337	return eg.Wait()
338}
339