1// +build !windows
2
3/*
4   Copyright The containerd Authors.
5
6   Licensed under the Apache License, Version 2.0 (the "License");
7   you may not use this file except in compliance with the License.
8   You may obtain a copy of the License at
9
10       http://www.apache.org/licenses/LICENSE-2.0
11
12   Unless required by applicable law or agreed to in writing, software
13   distributed under the License is distributed on an "AS IS" BASIS,
14   WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
15   See the License for the specific language governing permissions and
16   limitations under the License.
17*/
18
19package run
20
21import (
22	gocontext "context"
23	"path/filepath"
24	"strconv"
25	"strings"
26
27	"github.com/containerd/containerd"
28	"github.com/containerd/containerd/cmd/ctr/commands"
29	"github.com/containerd/containerd/contrib/nvidia"
30	"github.com/containerd/containerd/contrib/seccomp"
31	"github.com/containerd/containerd/oci"
32	"github.com/containerd/containerd/platforms"
33	"github.com/containerd/containerd/runtime/v2/runc/options"
34	"github.com/opencontainers/runtime-spec/specs-go"
35	"github.com/pkg/errors"
36	"github.com/sirupsen/logrus"
37	"github.com/urfave/cli"
38)
39
40var platformRunFlags = []cli.Flag{
41	cli.StringFlag{
42		Name:  "runc-binary",
43		Usage: "specify runc-compatible binary",
44	},
45	cli.BoolFlag{
46		Name:  "runc-systemd-cgroup",
47		Usage: "start runc with systemd cgroup manager",
48	},
49	cli.StringFlag{
50		Name:  "uidmap",
51		Usage: "run inside a user namespace with the specified UID mapping range; specified with the format `container-uid:host-uid:length`",
52	},
53	cli.StringFlag{
54		Name:  "gidmap",
55		Usage: "run inside a user namespace with the specified GID mapping range; specified with the format `container-gid:host-gid:length`",
56	},
57	cli.BoolFlag{
58		Name:  "remap-labels",
59		Usage: "provide the user namespace ID remapping to the snapshotter via label options; requires snapshotter support",
60	},
61	cli.Float64Flag{
62		Name:  "cpus",
63		Usage: "set the CFS cpu qouta",
64		Value: 0.0,
65	},
66}
67
68// NewContainer creates a new container
69func NewContainer(ctx gocontext.Context, client *containerd.Client, context *cli.Context) (containerd.Container, error) {
70	var (
71		id     string
72		config = context.IsSet("config")
73	)
74	if config {
75		id = context.Args().First()
76	} else {
77		id = context.Args().Get(1)
78	}
79
80	var (
81		opts  []oci.SpecOpts
82		cOpts []containerd.NewContainerOpts
83		spec  containerd.NewContainerOpts
84	)
85
86	cOpts = append(cOpts, containerd.WithContainerLabels(commands.LabelArgs(context.StringSlice("label"))))
87	if config {
88		opts = append(opts, oci.WithSpecFromFile(context.String("config")))
89	} else {
90		var (
91			ref = context.Args().First()
92			//for container's id is Args[1]
93			args = context.Args()[2:]
94		)
95		opts = append(opts, oci.WithDefaultSpec(), oci.WithDefaultUnixDevices)
96		if ef := context.String("env-file"); ef != "" {
97			opts = append(opts, oci.WithEnvFile(ef))
98		}
99		opts = append(opts, oci.WithEnv(context.StringSlice("env")))
100		opts = append(opts, withMounts(context))
101
102		if context.Bool("rootfs") {
103			rootfs, err := filepath.Abs(ref)
104			if err != nil {
105				return nil, err
106			}
107			opts = append(opts, oci.WithRootFSPath(rootfs))
108		} else {
109			snapshotter := context.String("snapshotter")
110			var image containerd.Image
111			i, err := client.ImageService().Get(ctx, ref)
112			if err != nil {
113				return nil, err
114			}
115			if ps := context.String("platform"); ps != "" {
116				platform, err := platforms.Parse(ps)
117				if err != nil {
118					return nil, err
119				}
120				image = containerd.NewImageWithPlatform(client, i, platforms.Only(platform))
121			} else {
122				image = containerd.NewImage(client, i)
123			}
124
125			unpacked, err := image.IsUnpacked(ctx, snapshotter)
126			if err != nil {
127				return nil, err
128			}
129			if !unpacked {
130				if err := image.Unpack(ctx, snapshotter); err != nil {
131					return nil, err
132				}
133			}
134			opts = append(opts, oci.WithImageConfig(image))
135			cOpts = append(cOpts,
136				containerd.WithImage(image),
137				containerd.WithSnapshotter(snapshotter))
138			if uidmap, gidmap := context.String("uidmap"), context.String("gidmap"); uidmap != "" && gidmap != "" {
139				uidMap, err := parseIDMapping(uidmap)
140				if err != nil {
141					return nil, err
142				}
143				gidMap, err := parseIDMapping(gidmap)
144				if err != nil {
145					return nil, err
146				}
147				opts = append(opts,
148					oci.WithUserNamespace([]specs.LinuxIDMapping{uidMap}, []specs.LinuxIDMapping{gidMap}))
149				// use snapshotter opts or the remapped snapshot support to shift the filesystem
150				// currently the only snapshotter known to support the labels is fuse-overlayfs:
151				// https://github.com/AkihiroSuda/containerd-fuse-overlayfs
152				if context.Bool("remap-labels") {
153					cOpts = append(cOpts, containerd.WithNewSnapshot(id, image,
154						containerd.WithRemapperLabels(0, uidMap.HostID, 0, gidMap.HostID, uidMap.Size)))
155				} else {
156					cOpts = append(cOpts, containerd.WithRemappedSnapshot(id, image, uidMap.HostID, gidMap.HostID))
157				}
158			} else {
159				// Even when "read-only" is set, we don't use KindView snapshot here. (#1495)
160				// We pass writable snapshot to the OCI runtime, and the runtime remounts it as read-only,
161				// after creating some mount points on demand.
162				cOpts = append(cOpts, containerd.WithNewSnapshot(id, image))
163			}
164			cOpts = append(cOpts, containerd.WithImageStopSignal(image, "SIGTERM"))
165		}
166		if context.Bool("read-only") {
167			opts = append(opts, oci.WithRootFSReadonly())
168		}
169		if len(args) > 0 {
170			opts = append(opts, oci.WithProcessArgs(args...))
171		}
172		if cwd := context.String("cwd"); cwd != "" {
173			opts = append(opts, oci.WithProcessCwd(cwd))
174		}
175		if context.Bool("tty") {
176			opts = append(opts, oci.WithTTY)
177		}
178		if context.Bool("privileged") {
179			opts = append(opts, oci.WithPrivileged, oci.WithAllDevicesAllowed, oci.WithHostDevices)
180		}
181		if context.Bool("net-host") {
182			opts = append(opts, oci.WithHostNamespace(specs.NetworkNamespace), oci.WithHostHostsFile, oci.WithHostResolvconf)
183		}
184		if context.Bool("seccomp") {
185			opts = append(opts, seccomp.WithDefaultProfile())
186		}
187		if cpus := context.Float64("cpus"); cpus > 0.0 {
188			var (
189				period = uint64(100000)
190				quota  = int64(cpus * 100000.0)
191			)
192			opts = append(opts, oci.WithCPUCFS(quota, period))
193		}
194
195		quota := context.Int64("cpu-quota")
196		period := context.Uint64("cpu-period")
197		if quota != -1 || period != 0 {
198			if cpus := context.Float64("cpus"); cpus > 0.0 {
199				return nil, errors.New("cpus and quota/period should be used separately")
200			}
201			opts = append(opts, oci.WithCPUCFS(quota, period))
202		}
203
204		joinNs := context.StringSlice("with-ns")
205		for _, ns := range joinNs {
206			parts := strings.Split(ns, ":")
207			if len(parts) != 2 {
208				return nil, errors.New("joining a Linux namespace using --with-ns requires the format 'nstype:path'")
209			}
210			if !validNamespace(parts[0]) {
211				return nil, errors.New("the Linux namespace type specified in --with-ns is not valid: " + parts[0])
212			}
213			opts = append(opts, oci.WithLinuxNamespace(specs.LinuxNamespace{
214				Type: specs.LinuxNamespaceType(parts[0]),
215				Path: parts[1],
216			}))
217		}
218		if context.IsSet("gpus") {
219			opts = append(opts, nvidia.WithGPUs(nvidia.WithDevices(context.Int("gpus")), nvidia.WithAllCapabilities))
220		}
221		if context.IsSet("allow-new-privs") {
222			opts = append(opts, oci.WithNewPrivileges)
223		}
224		if context.IsSet("cgroup") {
225			// NOTE: can be set to "" explicitly for disabling cgroup.
226			opts = append(opts, oci.WithCgroup(context.String("cgroup")))
227		}
228		limit := context.Uint64("memory-limit")
229		if limit != 0 {
230			opts = append(opts, oci.WithMemoryLimit(limit))
231		}
232		for _, dev := range context.StringSlice("device") {
233			opts = append(opts, oci.WithLinuxDevice(dev, "rwm"))
234		}
235	}
236
237	runtimeOpts, err := getRuntimeOptions(context)
238	if err != nil {
239		return nil, err
240	}
241	cOpts = append(cOpts, containerd.WithRuntime(context.String("runtime"), runtimeOpts))
242
243	opts = append(opts, oci.WithAnnotations(commands.LabelArgs(context.StringSlice("label"))))
244	var s specs.Spec
245	spec = containerd.WithSpec(&s, opts...)
246
247	cOpts = append(cOpts, spec)
248
249	// oci.WithImageConfig (WithUsername, WithUserID) depends on access to rootfs for resolving via
250	// the /etc/{passwd,group} files. So cOpts needs to have precedence over opts.
251	return client.NewContainer(ctx, id, cOpts...)
252}
253
254func getRuncOptions(context *cli.Context) (*options.Options, error) {
255	runtimeOpts := &options.Options{}
256	if runcBinary := context.String("runc-binary"); runcBinary != "" {
257		runtimeOpts.BinaryName = runcBinary
258	}
259	if context.Bool("runc-systemd-cgroup") {
260		if context.String("cgroup") == "" {
261			// runc maps "machine.slice:foo:deadbeef" to "/machine.slice/foo-deadbeef.scope"
262			return nil, errors.New("option --runc-systemd-cgroup requires --cgroup to be set, e.g. \"machine.slice:foo:deadbeef\"")
263		}
264		runtimeOpts.SystemdCgroup = true
265	}
266
267	return runtimeOpts, nil
268}
269
270func getRuntimeOptions(context *cli.Context) (interface{}, error) {
271	// validate first
272	if (context.String("runc-binary") != "" || context.Bool("runc-systemd-cgroup")) &&
273		context.String("runtime") != "io.containerd.runc.v2" {
274		return nil, errors.New("specifying runc-binary and runc-systemd-cgroup is only supported for \"io.containerd.runc.v2\" runtime")
275	}
276
277	if context.String("runtime") == "io.containerd.runc.v2" {
278		return getRuncOptions(context)
279	}
280
281	return nil, nil
282}
283
284func getNewTaskOpts(context *cli.Context) []containerd.NewTaskOpts {
285	var (
286		tOpts []containerd.NewTaskOpts
287	)
288	if context.Bool("no-pivot") {
289		tOpts = append(tOpts, containerd.WithNoPivotRoot)
290	}
291	if uidmap := context.String("uidmap"); uidmap != "" {
292		uidMap, err := parseIDMapping(uidmap)
293		if err != nil {
294			logrus.WithError(err).Warn("unable to parse uidmap; defaulting to uid 0 IO ownership")
295		}
296		tOpts = append(tOpts, containerd.WithUIDOwner(uidMap.HostID))
297	}
298	if gidmap := context.String("gidmap"); gidmap != "" {
299		gidMap, err := parseIDMapping(gidmap)
300		if err != nil {
301			logrus.WithError(err).Warn("unable to parse gidmap; defaulting to gid 0 IO ownership")
302		}
303		tOpts = append(tOpts, containerd.WithGIDOwner(gidMap.HostID))
304	}
305	return tOpts
306}
307
308func parseIDMapping(mapping string) (specs.LinuxIDMapping, error) {
309	parts := strings.Split(mapping, ":")
310	if len(parts) != 3 {
311		return specs.LinuxIDMapping{}, errors.New("user namespace mappings require the format `container-id:host-id:size`")
312	}
313	cID, err := strconv.ParseUint(parts[0], 0, 32)
314	if err != nil {
315		return specs.LinuxIDMapping{}, errors.Wrapf(err, "invalid container id for user namespace remapping")
316	}
317	hID, err := strconv.ParseUint(parts[1], 0, 32)
318	if err != nil {
319		return specs.LinuxIDMapping{}, errors.Wrapf(err, "invalid host id for user namespace remapping")
320	}
321	size, err := strconv.ParseUint(parts[2], 0, 32)
322	if err != nil {
323		return specs.LinuxIDMapping{}, errors.Wrapf(err, "invalid size for user namespace remapping")
324	}
325	return specs.LinuxIDMapping{
326		ContainerID: uint32(cID),
327		HostID:      uint32(hID),
328		Size:        uint32(size),
329	}, nil
330}
331
332func validNamespace(ns string) bool {
333	linuxNs := specs.LinuxNamespaceType(ns)
334	switch linuxNs {
335	case specs.PIDNamespace,
336		specs.NetworkNamespace,
337		specs.UTSNamespace,
338		specs.MountNamespace,
339		specs.UserNamespace,
340		specs.IPCNamespace,
341		specs.CgroupNamespace:
342		return true
343	default:
344		return false
345	}
346}
347