1package main 2 3import ( 4 "context" 5 "fmt" 6 "os" 7 "path/filepath" 8 "sync" 9 10 "github.com/Microsoft/hcsshim/internal/oc" 11 "go.opencensus.io/trace" 12 13 "github.com/Microsoft/hcsshim/internal/hcsoci" 14 "github.com/Microsoft/hcsshim/internal/oci" 15 "github.com/Microsoft/hcsshim/internal/uvm" 16 "github.com/Microsoft/hcsshim/osversion" 17 eventstypes "github.com/containerd/containerd/api/events" 18 "github.com/containerd/containerd/errdefs" 19 "github.com/containerd/containerd/runtime" 20 "github.com/containerd/containerd/runtime/v2/task" 21 specs "github.com/opencontainers/runtime-spec/specs-go" 22 "github.com/pkg/errors" 23 "golang.org/x/sync/errgroup" 24) 25 26// shimPod represents the logical grouping of all tasks in a single set of 27// shared namespaces. The pod sandbox (container) is represented by the task 28// that matches the `shimPod.ID()` 29type shimPod interface { 30 // ID is the id of the task representing the pause (sandbox) container. 31 ID() string 32 // CreateTask creates a workload task within this pod named `tid` with 33 // settings `s`. 34 // 35 // If `tid==ID()` or `tid` is the same as any other task in this pod, this 36 // pod MUST return `errdefs.ErrAlreadyExists`. 37 CreateTask(ctx context.Context, req *task.CreateTaskRequest, s *specs.Spec) (shimTask, error) 38 // GetTask returns a task in this pod that matches `tid`. 39 // 40 // If `tid` is not found, this pod MUST return `errdefs.ErrNotFound`. 41 GetTask(tid string) (shimTask, error) 42 // KillTask sends `signal` to task that matches `tid`. 43 // 44 // If `tid` is not found, this pod MUST return `errdefs.ErrNotFound`. 45 // 46 // If `tid==ID() && eid == "" && all == true` this pod will send `signal` to 47 // all tasks in the pod and lastly send `signal` to the sandbox itself. 48 // 49 // If `all == true && eid != ""` this pod MUST return 50 // `errdefs.ErrFailedPrecondition`. 51 // 52 // A call to `KillTask` is only valid when the exec found by `tid,eid` is in 53 // the `shimExecStateRunning, shimExecStateExited` states. If the exec is 54 // not in this state this pod MUST return `errdefs.ErrFailedPrecondition`. 55 KillTask(ctx context.Context, tid, eid string, signal uint32, all bool) error 56} 57 58func createPod(ctx context.Context, events publisher, req *task.CreateTaskRequest, s *specs.Spec) (_ shimPod, err error) { 59 ctx, span := trace.StartSpan(ctx, "createPod") 60 defer span.End() 61 defer func() { 62 oc.SetSpanStatus(span, err) 63 }() 64 span.AddAttributes(trace.StringAttribute("tid", req.ID)) 65 66 if osversion.Get().Build < osversion.RS5 { 67 return nil, errors.Wrapf(errdefs.ErrFailedPrecondition, "pod support is not available on Windows versions previous to RS5 (%d)", osversion.RS5) 68 } 69 70 ct, sid, err := oci.GetSandboxTypeAndID(s.Annotations) 71 if err != nil { 72 return nil, err 73 } 74 if ct != oci.KubernetesContainerTypeSandbox { 75 return nil, errors.Wrapf( 76 errdefs.ErrFailedPrecondition, 77 "expected annotation: '%s': '%s' got '%s'", 78 oci.KubernetesContainerTypeAnnotation, 79 oci.KubernetesContainerTypeSandbox, 80 ct) 81 } 82 if sid != req.ID { 83 return nil, errors.Wrapf( 84 errdefs.ErrFailedPrecondition, 85 "expected annotation '%s': '%s' got '%s'", 86 oci.KubernetesSandboxIDAnnotation, 87 req.ID, 88 sid) 89 } 90 91 owner := filepath.Base(os.Args[0]) 92 isWCOW := oci.IsWCOW(s) 93 94 var parent *uvm.UtilityVM 95 if oci.IsIsolated(s) { 96 // Create the UVM parent 97 opts, err := oci.SpecToUVMCreateOpts(ctx, s, fmt.Sprintf("%s@vm", req.ID), owner) 98 if err != nil { 99 return nil, err 100 } 101 switch opts.(type) { 102 case *uvm.OptionsLCOW: 103 lopts := (opts).(*uvm.OptionsLCOW) 104 parent, err = uvm.CreateLCOW(ctx, lopts) 105 if err != nil { 106 return nil, err 107 } 108 case *uvm.OptionsWCOW: 109 wopts := (opts).(*uvm.OptionsWCOW) 110 111 // In order for the UVM sandbox.vhdx not to collide with the actual 112 // nested Argon sandbox.vhdx we append the \vm folder to the last 113 // entry in the list. 114 layersLen := len(s.Windows.LayerFolders) 115 layers := make([]string, layersLen) 116 copy(layers, s.Windows.LayerFolders) 117 118 vmPath := filepath.Join(layers[layersLen-1], "vm") 119 err := os.MkdirAll(vmPath, 0) 120 if err != nil { 121 return nil, err 122 } 123 layers[layersLen-1] = vmPath 124 wopts.LayerFolders = layers 125 126 parent, err = uvm.CreateWCOW(ctx, wopts) 127 if err != nil { 128 return nil, err 129 } 130 } 131 err = parent.Start(ctx) 132 if err != nil { 133 parent.Close() 134 return nil, err 135 } 136 } else if !isWCOW { 137 return nil, errors.Wrap(errdefs.ErrFailedPrecondition, "oci spec does not contain WCOW or LCOW spec") 138 } 139 defer func() { 140 // clean up the uvm if we fail any further operations 141 if err != nil && parent != nil { 142 parent.Close() 143 } 144 }() 145 146 p := pod{ 147 events: events, 148 id: req.ID, 149 host: parent, 150 } 151 // TOOD: JTERRY75 - There is a bug in the compartment activation for Windows 152 // Process isolated that requires us to create the real pause container to 153 // hold the network compartment open. This is not required for Windows 154 // Hypervisor isolated. When we have a build that supports this for Windows 155 // Process isolated make sure to move back to this model. 156 if isWCOW && parent != nil { 157 // For WCOW we fake out the init task since we dont need it. We only 158 // need to provision the guest network namespace if this is hypervisor 159 // isolated. Process isolated WCOW gets the namespace endpoints 160 // automatically. 161 if parent != nil { 162 nsid := "" 163 if s.Windows != nil && s.Windows.Network != nil { 164 nsid = s.Windows.Network.NetworkNamespace 165 } 166 167 if nsid != "" { 168 endpoints, err := hcsoci.GetNamespaceEndpoints(ctx, nsid) 169 if err != nil { 170 return nil, err 171 } 172 err = parent.AddNetNS(ctx, nsid) 173 if err != nil { 174 return nil, err 175 } 176 err = parent.AddEndpointsToNS(ctx, nsid, endpoints) 177 if err != nil { 178 return nil, err 179 } 180 } 181 } 182 p.sandboxTask = newWcowPodSandboxTask(ctx, events, req.ID, req.Bundle, parent) 183 // Publish the created event. We only do this for a fake WCOW task. A 184 // HCS Task will event itself based on actual process lifetime. 185 events.publishEvent( 186 ctx, 187 runtime.TaskCreateEventTopic, 188 &eventstypes.TaskCreate{ 189 ContainerID: req.ID, 190 Bundle: req.Bundle, 191 Rootfs: req.Rootfs, 192 IO: &eventstypes.TaskIO{ 193 Stdin: req.Stdin, 194 Stdout: req.Stdout, 195 Stderr: req.Stderr, 196 Terminal: req.Terminal, 197 }, 198 Checkpoint: "", 199 Pid: 0, 200 }) 201 } else { 202 if isWCOW { 203 // The pause container activation will immediately exit on Windows 204 // because there is no command. We forcibly update the command here 205 // to keep it alive. 206 s.Process.CommandLine = "cmd /c ping -t 127.0.0.1 > nul" 207 } 208 // LCOW (and WCOW Process Isolated for the time being) requires a real 209 // task for the sandbox. 210 lt, err := newHcsTask(ctx, events, parent, true, req, s) 211 if err != nil { 212 return nil, err 213 } 214 p.sandboxTask = lt 215 } 216 217 return &p, nil 218} 219 220var _ = (shimPod)(&pod{}) 221 222type pod struct { 223 events publisher 224 // id is the id of the sandbox task when the pod is created. 225 // 226 // It MUST be treated as read only in the lifetime of the pod. 227 id string 228 // sandboxTask is the task that represents the sandbox. 229 // 230 // Note: The invariant `id==sandboxTask.ID()` MUST be true. 231 // 232 // It MUST be treated as read only in the lifetime of the pod. 233 sandboxTask shimTask 234 // host is the UtilityVM that is hosting `sandboxTask` if the task is 235 // hypervisor isolated. 236 // 237 // It MUST be treated as read only in the lifetime of the pod. 238 host *uvm.UtilityVM 239 240 // wcl is the worload create mutex. All calls to CreateTask must hold this 241 // lock while the ID reservation takes place. Once the ID is held it is safe 242 // to release the lock to allow concurrent creates. 243 wcl sync.Mutex 244 workloadTasks sync.Map 245} 246 247func (p *pod) ID() string { 248 return p.id 249} 250 251func (p *pod) CreateTask(ctx context.Context, req *task.CreateTaskRequest, s *specs.Spec) (_ shimTask, err error) { 252 if req.ID == p.id { 253 return nil, errors.Wrapf(errdefs.ErrAlreadyExists, "task with id: '%s' already exists", req.ID) 254 } 255 e, _ := p.sandboxTask.GetExec("") 256 if e.State() != shimExecStateRunning { 257 return nil, errors.Wrapf(errdefs.ErrFailedPrecondition, "task with id: '%s' cannot be created in pod: '%s' which is not running", req.ID, p.id) 258 } 259 260 p.wcl.Lock() 261 _, loaded := p.workloadTasks.LoadOrStore(req.ID, nil) 262 if loaded { 263 return nil, errors.Wrapf(errdefs.ErrAlreadyExists, "task with id: '%s' already exists id pod: '%s'", req.ID, p.id) 264 } 265 p.wcl.Unlock() 266 defer func() { 267 if err != nil { 268 p.workloadTasks.Delete(req.ID) 269 } 270 }() 271 272 ct, sid, err := oci.GetSandboxTypeAndID(s.Annotations) 273 if err != nil { 274 return nil, err 275 } 276 if ct != oci.KubernetesContainerTypeContainer { 277 return nil, errors.Wrapf( 278 errdefs.ErrFailedPrecondition, 279 "expected annotation: '%s': '%s' got '%s'", 280 oci.KubernetesContainerTypeAnnotation, 281 oci.KubernetesContainerTypeContainer, 282 ct) 283 } 284 if sid != p.id { 285 return nil, errors.Wrapf( 286 errdefs.ErrFailedPrecondition, 287 "expected annotation '%s': '%s' got '%s'", 288 oci.KubernetesSandboxIDAnnotation, 289 p.id, 290 sid) 291 } 292 293 st, err := newHcsTask(ctx, p.events, p.host, false, req, s) 294 if err != nil { 295 return nil, err 296 } 297 298 p.workloadTasks.Store(req.ID, st) 299 return st, nil 300} 301 302func (p *pod) GetTask(tid string) (shimTask, error) { 303 if tid == p.id { 304 return p.sandboxTask, nil 305 } 306 raw, loaded := p.workloadTasks.Load(tid) 307 if !loaded { 308 return nil, errors.Wrapf(errdefs.ErrNotFound, "task with id: '%s' not found", tid) 309 } 310 return raw.(shimTask), nil 311} 312 313func (p *pod) KillTask(ctx context.Context, tid, eid string, signal uint32, all bool) error { 314 t, err := p.GetTask(tid) 315 if err != nil { 316 return err 317 } 318 if all && eid != "" { 319 return errors.Wrapf(errdefs.ErrFailedPrecondition, "cannot signal all with non empty ExecID: '%s'", eid) 320 } 321 eg := errgroup.Group{} 322 if all && tid == p.id { 323 // We are in a kill all on the sandbox task. Signal everything. 324 p.workloadTasks.Range(func(key, value interface{}) bool { 325 wt := value.(shimTask) 326 eg.Go(func() error { 327 return wt.KillExec(ctx, eid, signal, all) 328 }) 329 330 // iterate all 331 return false 332 }) 333 } 334 eg.Go(func() error { 335 return t.KillExec(ctx, eid, signal, all) 336 }) 337 return eg.Wait() 338} 339