1package structs
2
3import (
4	"bytes"
5	"container/heap"
6	"crypto/md5"
7	"crypto/sha1"
8	"crypto/sha256"
9	"crypto/sha512"
10	"encoding/base32"
11	"encoding/base64"
12	"encoding/hex"
13	"errors"
14	"fmt"
15	"hash"
16	"hash/crc32"
17	"math"
18	"net"
19	"os"
20	"path/filepath"
21	"reflect"
22	"regexp"
23	"sort"
24	"strconv"
25	"strings"
26	"time"
27
28	"github.com/hashicorp/nomad/lib/cpuset"
29
30	"github.com/hashicorp/cronexpr"
31	"github.com/hashicorp/go-msgpack/codec"
32	"github.com/hashicorp/go-multierror"
33	"github.com/hashicorp/go-version"
34	"github.com/mitchellh/copystructure"
35	"golang.org/x/crypto/blake2b"
36
37	"github.com/hashicorp/nomad/acl"
38	"github.com/hashicorp/nomad/command/agent/host"
39	"github.com/hashicorp/nomad/command/agent/pprof"
40	"github.com/hashicorp/nomad/helper"
41	"github.com/hashicorp/nomad/helper/args"
42	"github.com/hashicorp/nomad/helper/constraints/semver"
43	"github.com/hashicorp/nomad/helper/uuid"
44	"github.com/hashicorp/nomad/lib/kheap"
45	psstructs "github.com/hashicorp/nomad/plugins/shared/structs"
46)
47
var (
	// validPolicyName is used to validate a policy name: alphanumerics and
	// dashes only, 1-128 characters.
	validPolicyName = regexp.MustCompile("^[a-zA-Z0-9-]{1,128}$")

	// b32 is a lowercase base32 encoding for use in URL friendly service hashes
	// (standard base32 alphabet, lowercased, no padding changes).
	b32 = base32.NewEncoding(strings.ToLower("abcdefghijklmnopqrstuvwxyz234567"))
)
55
// MessageType identifies the kind of Raft log entry so the FSM can dispatch
// it to the right apply handler.
type MessageType uint8

// note: new raft message types need to be added to the end of this
// list of message types, since the values are persisted in the Raft log
// and must remain stable across versions.
const (
	NodeRegisterRequestType                      MessageType = 0
	NodeDeregisterRequestType                    MessageType = 1
	NodeUpdateStatusRequestType                  MessageType = 2
	NodeUpdateDrainRequestType                   MessageType = 3
	JobRegisterRequestType                       MessageType = 4
	JobDeregisterRequestType                     MessageType = 5
	EvalUpdateRequestType                        MessageType = 6
	EvalDeleteRequestType                        MessageType = 7
	AllocUpdateRequestType                       MessageType = 8
	AllocClientUpdateRequestType                 MessageType = 9
	ReconcileJobSummariesRequestType             MessageType = 10
	VaultAccessorRegisterRequestType             MessageType = 11
	VaultAccessorDeregisterRequestType           MessageType = 12
	ApplyPlanResultsRequestType                  MessageType = 13
	DeploymentStatusUpdateRequestType            MessageType = 14
	DeploymentPromoteRequestType                 MessageType = 15
	DeploymentAllocHealthRequestType             MessageType = 16
	DeploymentDeleteRequestType                  MessageType = 17
	JobStabilityRequestType                      MessageType = 18
	ACLPolicyUpsertRequestType                   MessageType = 19
	ACLPolicyDeleteRequestType                   MessageType = 20
	ACLTokenUpsertRequestType                    MessageType = 21
	ACLTokenDeleteRequestType                    MessageType = 22
	ACLTokenBootstrapRequestType                 MessageType = 23
	AutopilotRequestType                         MessageType = 24
	UpsertNodeEventsType                         MessageType = 25
	JobBatchDeregisterRequestType                MessageType = 26
	AllocUpdateDesiredTransitionRequestType      MessageType = 27
	NodeUpdateEligibilityRequestType             MessageType = 28
	BatchNodeUpdateDrainRequestType              MessageType = 29
	SchedulerConfigRequestType                   MessageType = 30
	NodeBatchDeregisterRequestType               MessageType = 31
	ClusterMetadataRequestType                   MessageType = 32
	ServiceIdentityAccessorRegisterRequestType   MessageType = 33
	ServiceIdentityAccessorDeregisterRequestType MessageType = 34
	CSIVolumeRegisterRequestType                 MessageType = 35
	CSIVolumeDeregisterRequestType               MessageType = 36
	CSIVolumeClaimRequestType                    MessageType = 37
	ScalingEventRegisterRequestType              MessageType = 38
	CSIVolumeClaimBatchRequestType               MessageType = 39
	CSIPluginDeleteRequestType                   MessageType = 40
	EventSinkUpsertRequestType                   MessageType = 41
	EventSinkDeleteRequestType                   MessageType = 42
	BatchEventSinkUpdateProgressType             MessageType = 43
	OneTimeTokenUpsertRequestType                MessageType = 44
	OneTimeTokenDeleteRequestType                MessageType = 45
	OneTimeTokenExpireRequestType                MessageType = 46

	// Namespace types were moved from enterprise and therefore start at 64
	NamespaceUpsertRequestType MessageType = 64
	NamespaceDeleteRequestType MessageType = 65
)
113
const (
	// IgnoreUnknownTypeFlag is set along with a MessageType
	// to indicate that the message type can be safely ignored
	// if it is not recognized. This is for future proofing, so
	// that new commands can be added in a way that won't cause
	// old servers to crash when the FSM attempts to process them.
	IgnoreUnknownTypeFlag MessageType = 128

	// MsgTypeTestSetup is used during testing when calling state store
	// methods directly that require an FSM MessageType
	MsgTypeTestSetup MessageType = IgnoreUnknownTypeFlag

	// ApiMajorVersion is returned as part of the Status.Version request.
	// It should be incremented anytime the APIs are changed in a way
	// that would break clients for sane client versioning.
	ApiMajorVersion = 1

	// ApiMinorVersion is returned as part of the Status.Version request.
	// It should be incremented anytime the APIs are changed to allow
	// for sane client versioning. Minor changes should be compatible
	// within the major version.
	ApiMinorVersion = 1

	// Keys used in the Status.Version response map.
	ProtocolVersion = "protocol"
	APIMajorVersion = "api.major"
	APIMinorVersion = "api.minor"

	// Getter modes control how a downloaded artifact is interpreted.
	GetterModeAny  = "any"
	GetterModeFile = "file"
	GetterModeDir  = "dir"

	// maxPolicyDescriptionLength limits a policy description length
	maxPolicyDescriptionLength = 256

	// maxTokenNameLength limits an ACL token name length
	maxTokenNameLength = 256

	// ACLClientToken and ACLManagementToken are the only types of tokens
	ACLClientToken     = "client"
	ACLManagementToken = "management"

	// DefaultNamespace is the default namespace.
	DefaultNamespace            = "default"
	DefaultNamespaceDescription = "Default shared namespace"

	// AllNamespacesSentinel is the value used as a namespace RPC value
	// to indicate that endpoints must search in all namespaces
	AllNamespacesSentinel = "*"

	// maxNamespaceDescriptionLength limits a namespace description length
	maxNamespaceDescriptionLength = 256

	// JitterFraction is the limit to the amount of jitter we apply
	// to a user specified MaxQueryTime. We divide the specified time by
	// the fraction. So 16 == 6.25% limit of jitter. This jitter is also
	// applied to RPCHoldTimeout.
	JitterFraction = 16

	// MaxRetainedNodeEvents is the maximum number of node events that will be
	// retained for a single node
	MaxRetainedNodeEvents = 10

	// MaxRetainedNodeScores is the number of top scoring nodes for which we
	// retain scoring metadata
	MaxRetainedNodeScores = 5

	// NormScorerName is the name of the normalized scorer in scoring metadata.
	NormScorerName = "normalized-score"

	// MaxBlockingRPCQueryTime is used to bound the limit of a blocking query
	MaxBlockingRPCQueryTime = 300 * time.Second

	// DefaultBlockingRPCQueryTime is the amount of time we block waiting for a change
	// if no time is specified. Previously we would wait the MaxBlockingRPCQueryTime.
	DefaultBlockingRPCQueryTime = 300 * time.Second
)
190
var (
	// validNamespaceName is used to validate a namespace name: alphanumerics
	// and dashes only, 1-128 characters (same shape as validPolicyName).
	validNamespaceName = regexp.MustCompile("^[a-zA-Z0-9-]{1,128}$")
)
195
// NamespacedID is a tuple of an ID and a namespace
type NamespacedID struct {
	ID        string
	Namespace string
}

// NewNamespacedID returns a new namespaced ID given the ID and namespace
func NewNamespacedID(id, ns string) NamespacedID {
	return NamespacedID{ID: id, Namespace: ns}
}

// String renders the tuple in a human-readable form for logs and errors.
func (n NamespacedID) String() string {
	return fmt.Sprintf("<ns: %q, id: %q>", n.Namespace, n.ID)
}
213
// RPCInfo is used to describe common information about query
type RPCInfo interface {
	RequestRegion() string
	IsRead() bool
	AllowStaleRead() bool
	IsForwarded() bool
	SetForwarded()
	// TimeToBlock returns how long this request may block waiting for a change.
	TimeToBlock() time.Duration
	// SetTimeToBlock sets how long this request can block. The requested time may not be possible,
	// so Callers should readback TimeToBlock. E.g. you cannot set time to block at all on WriteRequests
	// and it cannot exceed MaxBlockingRPCQueryTime
	SetTimeToBlock(t time.Duration)
}
227
// InternalRpcInfo allows adding internal RPC metadata to an RPC. This struct
// should NOT be replicated in the API package as it is internal only.
type InternalRpcInfo struct {
	// Forwarded marks whether the RPC has been forwarded.
	Forwarded bool
}

// IsForwarded reports whether the RPC was forwarded from another server.
func (i *InternalRpcInfo) IsForwarded() bool {
	return i.Forwarded
}

// SetForwarded records that the RPC is being forwarded from another server.
func (i *InternalRpcInfo) SetForwarded() {
	i.Forwarded = true
}
244
// QueryOptions is used to specify various flags for read queries
type QueryOptions struct {
	// The target region for this query
	Region string

	// Namespace is the target namespace for the query.
	//
	// Since handlers do not have a default value set they should access
	// the Namespace via the RequestNamespace method.
	//
	// Requests accessing specific namespaced objects must check ACLs
	// against the namespace of the object, not the namespace in the
	// request.
	Namespace string

	// If set, wait until query exceeds given index. Must be provided
	// with MaxQueryTime.
	MinQueryIndex uint64

	// Provided with MinQueryIndex to wait for change.
	MaxQueryTime time.Duration

	// If set, any follower can service the request. Results
	// may be arbitrarily stale.
	AllowStale bool

	// If set, used as prefix for resource list searches
	Prefix string

	// AuthToken is secret portion of the ACL token used for the request
	AuthToken string

	// PerPage is the number of entries to be returned in queries that support
	// paginated lists.
	PerPage int32

	// NextToken is the token used indicate where to start paging for queries
	// that support paginated lists.
	NextToken string

	// InternalRpcInfo carries server-internal forwarding metadata.
	InternalRpcInfo
}
287
288// TimeToBlock returns MaxQueryTime adjusted for maximums and defaults
289// it will return 0 if this is not a blocking query
290func (q QueryOptions) TimeToBlock() time.Duration {
291	if q.MinQueryIndex == 0 {
292		return 0
293	}
294	if q.MaxQueryTime > MaxBlockingRPCQueryTime {
295		return MaxBlockingRPCQueryTime
296	} else if q.MaxQueryTime <= 0 {
297		return DefaultBlockingRPCQueryTime
298	}
299	return q.MaxQueryTime
300}
301
302func (q QueryOptions) SetTimeToBlock(t time.Duration) {
303	q.MaxQueryTime = t
304}
305
306func (q QueryOptions) RequestRegion() string {
307	return q.Region
308}
309
310// RequestNamespace returns the request's namespace or the default namespace if
311// no explicit namespace was sent.
312//
313// Requests accessing specific namespaced objects must check ACLs against the
314// namespace of the object, not the namespace in the request.
315func (q QueryOptions) RequestNamespace() string {
316	if q.Namespace == "" {
317		return DefaultNamespace
318	}
319	return q.Namespace
320}
321
322// QueryOption only applies to reads, so always true
323func (q QueryOptions) IsRead() bool {
324	return true
325}
326
327func (q QueryOptions) AllowStaleRead() bool {
328	return q.AllowStale
329}
330
// AgentPprofRequest is used to request a pprof report for a given node.
type AgentPprofRequest struct {
	// ReqType specifies the profile to use
	ReqType pprof.ReqType

	// Profile specifies the runtime/pprof profile to lookup and generate.
	Profile string

	// Seconds is the number of seconds to capture a profile
	Seconds int

	// Debug specifies if pprof profile should include debug output
	Debug int

	// GC specifies if the profile should call runtime.GC() before
	// running its profile. This is only used for "heap" profiles
	GC int

	// NodeID is the node we want to track the logs of
	NodeID string

	// ServerID is the server we want to track the logs of
	ServerID string

	QueryOptions
}
357
// AgentPprofResponse is used to return a generated pprof profile
type AgentPprofResponse struct {
	// AgentID is the ID of the agent that fulfilled the request
	AgentID string

	// Payload is the generated pprof profile
	Payload []byte

	// HTTPHeaders are a set of key value pairs to be applied as
	// HTTP headers for a specific runtime profile
	HTTPHeaders map[string]string
}
370
// WriteRequest carries the common fields for all write RPCs.
type WriteRequest struct {
	// The target region for this write
	Region string

	// Namespace is the target namespace for the write.
	//
	// Since RPC handlers do not have a default value set they should
	// access the Namespace via the RequestNamespace method.
	//
	// Requests accessing specific namespaced objects must check ACLs
	// against the namespace of the object, not the namespace in the
	// request.
	Namespace string

	// AuthToken is secret portion of the ACL token used for the request
	AuthToken string

	// InternalRpcInfo carries server-internal forwarding metadata.
	InternalRpcInfo
}
390
391func (w WriteRequest) TimeToBlock() time.Duration {
392	return 0
393}
394
395func (w WriteRequest) SetTimeToBlock(_ time.Duration) {
396}
397
398func (w WriteRequest) RequestRegion() string {
399	// The target region for this request
400	return w.Region
401}
402
403// RequestNamespace returns the request's namespace or the default namespace if
404// no explicit namespace was sent.
405//
406// Requests accessing specific namespaced objects must check ACLs against the
407// namespace of the object, not the namespace in the request.
408func (w WriteRequest) RequestNamespace() string {
409	if w.Namespace == "" {
410		return DefaultNamespace
411	}
412	return w.Namespace
413}
414
415// WriteRequest only applies to writes, always false
416func (w WriteRequest) IsRead() bool {
417	return false
418}
419
420func (w WriteRequest) AllowStaleRead() bool {
421	return false
422}
423
// QueryMeta allows a query response to include potentially
// useful metadata about a query
type QueryMeta struct {
	// Index is the Raft index associated with the read
	Index uint64

	// If AllowStale is used, this is time elapsed since
	// last contact between the follower and leader. This
	// can be used to gauge staleness.
	LastContact time.Duration

	// KnownLeader is used to indicate if there is a known leader node
	KnownLeader bool
}

// WriteMeta allows a write response to include potentially
// useful metadata about the write
type WriteMeta struct {
	// Index is the Raft index associated with the write
	Index uint64
}
445
// NodeRegisterRequest is used for Node.Register endpoint
// to register a node as being a schedulable entity.
type NodeRegisterRequest struct {
	Node      *Node
	NodeEvent *NodeEvent
	WriteRequest
}

// NodeDeregisterRequest is used for Node.Deregister endpoint
// to deregister a node as being a schedulable entity.
type NodeDeregisterRequest struct {
	NodeID string
	WriteRequest
}

// NodeBatchDeregisterRequest is used for Node.BatchDeregister endpoint
// to deregister a batch of nodes from being schedulable entities.
type NodeBatchDeregisterRequest struct {
	NodeIDs []string
	WriteRequest
}
467
// NodeServerInfo is used in NodeUpdateResponse to return Nomad server
// information used in RPC server lists.
type NodeServerInfo struct {
	// RPCAdvertiseAddr is the IP endpoint that a Nomad Server wishes to
	// be contacted at for RPCs.
	RPCAdvertiseAddr string

	// RPCMajorVersion is the major version number the Nomad Server
	// supports
	RPCMajorVersion int32

	// RPCMinorVersion is the minor version number the Nomad Server
	// supports
	RPCMinorVersion int32

	// Datacenter is the datacenter that a Nomad server belongs to
	Datacenter string
}
486
// NodeUpdateStatusRequest is used for Node.UpdateStatus endpoint
// to update the status of a node.
type NodeUpdateStatusRequest struct {
	NodeID    string
	Status    string
	NodeEvent *NodeEvent
	// UpdatedAt represents server time of receiving request (unix nanos)
	UpdatedAt int64
	WriteRequest
}

// NodeUpdateDrainRequest is used for updating the drain strategy
type NodeUpdateDrainRequest struct {
	NodeID        string
	DrainStrategy *DrainStrategy

	// MarkEligible marks the node as eligible if removing the drain strategy.
	MarkEligible bool

	// NodeEvent is the event added to the node
	NodeEvent *NodeEvent

	// UpdatedAt represents server time of receiving request
	UpdatedAt int64

	// Meta is user-provided metadata relating to the drain operation
	Meta map[string]string

	WriteRequest
}
516
// BatchNodeUpdateDrainRequest is used for updating the drain strategy for a
// batch of nodes
type BatchNodeUpdateDrainRequest struct {
	// Updates is a mapping of nodes to their updated drain strategy
	Updates map[string]*DrainUpdate

	// NodeEvents is a mapping of the node to the event to add to the node
	NodeEvents map[string]*NodeEvent

	// UpdatedAt represents server time of receiving request
	UpdatedAt int64

	WriteRequest
}

// DrainUpdate is used to update the drain of a node
type DrainUpdate struct {
	// DrainStrategy is the new strategy for the node
	DrainStrategy *DrainStrategy

	// MarkEligible marks the node as eligible if removing the drain strategy.
	MarkEligible bool
}
540
// NodeUpdateEligibilityRequest is used for updating the scheduling eligibility
type NodeUpdateEligibilityRequest struct {
	NodeID      string
	Eligibility string

	// NodeEvent is the event added to the node
	NodeEvent *NodeEvent

	// UpdatedAt represents server time of receiving request
	UpdatedAt int64

	WriteRequest
}

// NodeEvaluateRequest is used to re-evaluate the node
type NodeEvaluateRequest struct {
	NodeID string
	WriteRequest
}

// NodeSpecificRequest is used when we just need to specify a target node
type NodeSpecificRequest struct {
	NodeID   string
	SecretID string
	QueryOptions
}
567
// JobRegisterRequest is used for Job.Register endpoint
// to register a job as being a schedulable entity.
type JobRegisterRequest struct {
	Job *Job

	// If EnforceIndex is set then the job will only be registered if the passed
	// JobModifyIndex matches the current Jobs index. If the index is zero, the
	// register only occurs if the job is new.
	EnforceIndex   bool
	JobModifyIndex uint64

	// PreserveCounts indicates that during job update, existing task group
	// counts should be preserved, over those specified in the new job spec
	// PreserveCounts is ignored for newly created jobs.
	PreserveCounts bool

	// PolicyOverride is set when the user is attempting to override any policies
	PolicyOverride bool

	// Eval is the evaluation that is associated with the job registration
	Eval *Evaluation

	WriteRequest
}

// JobDeregisterRequest is used for Job.Deregister endpoint
// to deregister a job as being a schedulable entity.
type JobDeregisterRequest struct {
	JobID string

	// Purge controls whether the deregister purges the job from the system or
	// whether the job is just marked as stopped and will be removed by the
	// garbage collector
	Purge bool

	// Global controls whether all regions of a multi-region job are
	// deregistered. It is ignored for single-region jobs.
	Global bool

	// Eval is the evaluation to create that's associated with job deregister
	Eval *Evaluation

	WriteRequest
}
612
// JobBatchDeregisterRequest is used to batch deregister jobs and upsert
// evaluations.
type JobBatchDeregisterRequest struct {
	// Jobs is the set of jobs to deregister
	Jobs map[NamespacedID]*JobDeregisterOptions

	// Evals is the set of evaluations to create.
	Evals []*Evaluation

	WriteRequest
}

// JobDeregisterOptions configures how a job is deregistered.
type JobDeregisterOptions struct {
	// Purge controls whether the deregister purges the job from the system or
	// whether the job is just marked as stopped and will be removed by the
	// garbage collector
	Purge bool
}

// JobEvaluateRequest is used when we just need to re-evaluate a target job
type JobEvaluateRequest struct {
	JobID       string
	EvalOptions EvalOptions
	WriteRequest
}

// EvalOptions is used to encapsulate options when forcing a job evaluation
type EvalOptions struct {
	ForceReschedule bool
}

// JobSpecificRequest is used when we just need to specify a target job
type JobSpecificRequest struct {
	JobID string
	// All requests all versions of the job, not just the latest.
	All bool
	QueryOptions
}

// JobListRequest is used to parameterize a list request
type JobListRequest struct {
	QueryOptions
}
656
// JobPlanRequest is used for the Job.Plan endpoint to trigger a dry-run
// evaluation of the Job.
type JobPlanRequest struct {
	Job  *Job
	Diff bool // Toggles an annotated diff
	// PolicyOverride is set when the user is attempting to override any policies
	PolicyOverride bool
	WriteRequest
}

// JobScaleRequest is used for the Job.Scale endpoint to scale one of the
// scaling targets in a job
type JobScaleRequest struct {
	JobID   string
	Target  map[string]string
	Count   *int64
	Message string
	Error   bool
	Meta    map[string]interface{}
	// PolicyOverride is set when the user is attempting to override any policies
	PolicyOverride bool
	WriteRequest
}
680
681// Validate is used to validate the arguments in the request
682func (r *JobScaleRequest) Validate() error {
683	namespace := r.Target[ScalingTargetNamespace]
684	if namespace != "" && namespace != r.RequestNamespace() {
685		return NewErrRPCCoded(400, "namespace in payload did not match header")
686	}
687
688	jobID := r.Target[ScalingTargetJob]
689	if jobID != "" && jobID != r.JobID {
690		return fmt.Errorf("job ID in payload did not match URL")
691	}
692
693	groupName := r.Target[ScalingTargetGroup]
694	if groupName == "" {
695		return NewErrRPCCoded(400, "missing task group name for scaling action")
696	}
697
698	if r.Count != nil {
699		if *r.Count < 0 {
700			return NewErrRPCCoded(400, "scaling action count can't be negative")
701		}
702
703		if r.Error {
704			return NewErrRPCCoded(400, "scaling action should not contain count if error is true")
705		}
706
707		truncCount := int(*r.Count)
708		if int64(truncCount) != *r.Count {
709			return NewErrRPCCoded(400,
710				fmt.Sprintf("new scaling count is too large for TaskGroup.Count (int): %v", r.Count))
711		}
712	}
713
714	return nil
715}
716
// JobSummaryRequest is used when we just need to get a specific job summary
type JobSummaryRequest struct {
	JobID string
	QueryOptions
}

// JobScaleStatusRequest is used to get the scale status for a job
type JobScaleStatusRequest struct {
	JobID string
	QueryOptions
}

// JobDispatchRequest is used to dispatch a job based on a parameterized job
type JobDispatchRequest struct {
	JobID   string
	Payload []byte
	Meta    map[string]string
	WriteRequest
}

// JobValidateRequest is used to validate a job
type JobValidateRequest struct {
	Job *Job
	WriteRequest
}
742
// JobRevertRequest is used to revert a job to a prior version.
type JobRevertRequest struct {
	// JobID is the ID of the job being reverted
	JobID string

	// JobVersion the version to revert to.
	JobVersion uint64

	// EnforcePriorVersion if set will enforce that the job is at the given
	// version before reverting.
	EnforcePriorVersion *uint64

	// ConsulToken is the Consul token that proves the submitter of the job revert
	// has access to the Service Identity policies associated with the job's
	// Consul Connect enabled services. This field is only used to transfer the
	// token and is not stored after the Job revert.
	ConsulToken string

	// VaultToken is the Vault token that proves the submitter of the job revert
	// has access to any Vault policies specified in the targeted job version. This
	// field is only used to transfer the token and is not stored after the Job
	// revert.
	VaultToken string

	WriteRequest
}

// JobStabilityRequest is used to marked a job as stable.
type JobStabilityRequest struct {
	// JobID and JobVersion identify the job version to set stability on
	JobID      string
	JobVersion uint64

	// Stable is the stability value to set
	Stable bool
	WriteRequest
}

// JobStabilityResponse is the response when marking a job as stable.
type JobStabilityResponse struct {
	WriteMeta
}
785
// NodeListRequest is used to parameterize a list request
type NodeListRequest struct {
	QueryOptions

	// Fields selects which node fields to include in the stubs
	Fields *NodeStubFields
}

// EvalUpdateRequest is used for upserting evaluations.
type EvalUpdateRequest struct {
	Evals     []*Evaluation
	EvalToken string
	WriteRequest
}

// EvalDeleteRequest is used for deleting an evaluation.
type EvalDeleteRequest struct {
	Evals  []string
	Allocs []string
	WriteRequest
}

// EvalSpecificRequest is used when we just need to specify a target evaluation
type EvalSpecificRequest struct {
	EvalID string
	QueryOptions
}

// EvalAckRequest is used to Ack/Nack a specific evaluation
type EvalAckRequest struct {
	EvalID string
	Token  string
	WriteRequest
}

// EvalDequeueRequest is used when we want to dequeue an evaluation
type EvalDequeueRequest struct {
	Schedulers       []string
	Timeout          time.Duration
	SchedulerVersion uint16
	WriteRequest
}

// EvalListRequest is used to list the evaluations
type EvalListRequest struct {
	QueryOptions
}
832
// PlanRequest is used to submit an allocation plan to the leader
type PlanRequest struct {
	Plan *Plan
	WriteRequest
}

// ApplyPlanResultsRequest is used by the planner to apply a Raft transaction
// committing the result of a plan.
type ApplyPlanResultsRequest struct {
	// AllocUpdateRequest holds the allocation updates to be made by the
	// scheduler.
	AllocUpdateRequest

	// Deployment is the deployment created or updated as a result of a
	// scheduling event.
	Deployment *Deployment

	// DeploymentUpdates is a set of status updates to apply to the given
	// deployments. This allows the scheduler to cancel any unneeded deployment
	// because the job is stopped or the update block is removed.
	DeploymentUpdates []*DeploymentStatusUpdate

	// EvalID is the eval ID of the plan being applied. The modify index of the
	// evaluation is updated as part of applying the plan to ensure that subsequent
	// scheduling events for the same job will wait for the index that last produced
	// state changes. This is necessary for blocked evaluations since they can be
	// processed many times, potentially making state updates, without the state of
	// the evaluation itself being updated.
	EvalID string

	// COMPAT 0.11
	// NodePreemptions is a slice of allocations from other lower priority jobs
	// that are preempted. Preempted allocations are marked as evicted.
	// Deprecated: Replaced with AllocsPreempted which contains only the diff
	NodePreemptions []*Allocation

	// AllocsPreempted is a slice of allocation diffs from other lower priority jobs
	// that are preempted. Preempted allocations are marked as evicted.
	AllocsPreempted []*AllocationDiff

	// PreemptionEvals is a slice of follow up evals for jobs whose allocations
	// have been preempted to place allocs in this plan
	PreemptionEvals []*Evaluation
}
877
// AllocUpdateRequest is used to submit changes to allocations, either
// to cause evictions or to assign new allocations. Both can be done
// within a single transaction
type AllocUpdateRequest struct {
	// COMPAT 0.11
	// Alloc is the list of new allocations to assign
	// Deprecated: Replaced with two separate slices, one containing stopped allocations
	// and another containing updated allocations
	Alloc []*Allocation

	// AllocsStopped are allocations to stop. Contains only the diff, not
	// the entire allocation
	AllocsStopped []*AllocationDiff

	// AllocsUpdated are new or updated allocations
	AllocsUpdated []*Allocation

	// Evals is the list of new evaluations to create
	// Evals are valid only when used in the Raft RPC
	Evals []*Evaluation

	// Job is the shared parent job of the allocations.
	// It is pulled out since it is common to reduce payload size.
	Job *Job

	WriteRequest
}

// AllocUpdateDesiredTransitionRequest is used to submit changes to allocations
// desired transition state.
type AllocUpdateDesiredTransitionRequest struct {
	// Allocs is the mapping of allocation ids to their desired state
	// transition
	Allocs map[string]*DesiredTransition

	// Evals is the set of evaluations to create
	Evals []*Evaluation

	WriteRequest
}
917
// AllocStopRequest is used to stop and reschedule a running Allocation.
type AllocStopRequest struct {
	AllocID string

	WriteRequest
}

// AllocStopResponse is the response to an `AllocStopRequest`
type AllocStopResponse struct {
	// EvalID is the id of the follow up evaluation for the rescheduled alloc.
	EvalID string

	WriteMeta
}
932
// AllocListRequest is used to request a list of allocations
type AllocListRequest struct {
	QueryOptions

	// Fields selects which allocation fields to include in the stubs
	Fields *AllocStubFields
}

// AllocSpecificRequest is used to query a specific allocation
type AllocSpecificRequest struct {
	AllocID string
	QueryOptions
}

// AllocSignalRequest is used to signal a specific allocation
type AllocSignalRequest struct {
	AllocID string
	Task    string
	Signal  string
	QueryOptions
}

// AllocsGetRequest is used to query a set of allocations
type AllocsGetRequest struct {
	AllocIDs []string
	QueryOptions
}

// AllocRestartRequest is used to restart a specific allocations tasks.
type AllocRestartRequest struct {
	AllocID  string
	TaskName string

	QueryOptions
}

// PeriodicForceRequest is used to force a specific periodic job.
type PeriodicForceRequest struct {
	JobID string
	WriteRequest
}
973
// ServerMembersResponse has the list of servers in a cluster
type ServerMembersResponse struct {
	ServerName   string
	ServerRegion string
	ServerDC     string
	Members      []*ServerMember
}

// ServerMember holds information about a Nomad server agent in a cluster
type ServerMember struct {
	Name        string
	Addr        net.IP
	Port        uint16
	Tags        map[string]string
	Status      string
	ProtocolMin uint8
	ProtocolMax uint8
	ProtocolCur uint8
	DelegateMin uint8
	DelegateMax uint8
	DelegateCur uint8
}

// ClusterMetadata is used to store per-cluster metadata.
type ClusterMetadata struct {
	ClusterID  string
	CreateTime int64
}
1002
// DeriveVaultTokenRequest is used to request wrapped Vault tokens for the
// following tasks in the given allocation
type DeriveVaultTokenRequest struct {
	NodeID   string
	SecretID string
	AllocID  string
	Tasks    []string
	QueryOptions
}

// VaultAccessorsRequest is used to operate on a set of Vault accessors
type VaultAccessorsRequest struct {
	Accessors []*VaultAccessor
}

// VaultAccessor is a reference to a created Vault token on behalf of
// an allocation's task.
type VaultAccessor struct {
	AllocID     string
	Task        string
	NodeID      string
	Accessor    string
	CreationTTL int

	// Raft Indexes
	CreateIndex uint64
}

// DeriveVaultTokenResponse returns the wrapped tokens for each requested task
type DeriveVaultTokenResponse struct {
	// Tasks is a mapping between the task name and the wrapped token
	Tasks map[string]string

	// Error stores any error that occurred. Errors are stored here so we can
	// communicate whether it is retryable
	Error *RecoverableError

	QueryMeta
}

// GenericRequest is used to request where no
// specific information is needed.
type GenericRequest struct {
	QueryOptions
}
1048
// DeploymentListRequest is used to list the deployments
type DeploymentListRequest struct {
	QueryOptions
}

// DeploymentDeleteRequest is used for deleting deployments.
type DeploymentDeleteRequest struct {
	Deployments []string
	WriteRequest
}

// DeploymentStatusUpdateRequest is used to update the status of a deployment as
// well as optionally creating an evaluation atomically.
type DeploymentStatusUpdateRequest struct {
	// Eval, if set, is used to create an evaluation at the same time as
	// updating the status of a deployment.
	Eval *Evaluation

	// DeploymentUpdate is a status update to apply to the given
	// deployment.
	DeploymentUpdate *DeploymentStatusUpdate

	// Job is used to optionally upsert a job. This is used when setting the
	// allocation health results in a deployment failure and the deployment
	// auto-reverts to the latest stable job.
	Job *Job
}

// DeploymentAllocHealthRequest is used to set the health of a set of
// allocations as part of a deployment.
type DeploymentAllocHealthRequest struct {
	DeploymentID string

	// Marks these allocations as healthy, allow further allocations
	// to be rolled.
	HealthyAllocationIDs []string

	// Any unhealthy allocations fail the deployment
	UnhealthyAllocationIDs []string

	WriteRequest
}

// ApplyDeploymentAllocHealthRequest is used to apply an alloc health request via Raft
type ApplyDeploymentAllocHealthRequest struct {
	DeploymentAllocHealthRequest

	// Timestamp is the timestamp to use when setting the allocations health.
	Timestamp time.Time

	// An optional field to update the status of a deployment
	DeploymentUpdate *DeploymentStatusUpdate

	// Job is used to optionally upsert a job. This is used when setting the
	// allocation health results in a deployment failure and the deployment
	// auto-reverts to the latest stable job.
	Job *Job

	// An optional evaluation to create after setting the allocation health
	Eval *Evaluation
}

// DeploymentPromoteRequest is used to promote task groups in a deployment
type DeploymentPromoteRequest struct {
	DeploymentID string

	// All is to promote all task groups
	All bool

	// Groups is used to set the promotion status per task group
	Groups []string

	WriteRequest
}

// ApplyDeploymentPromoteRequest is used to apply a promotion request via Raft
type ApplyDeploymentPromoteRequest struct {
	DeploymentPromoteRequest

	// An optional evaluation to create after promoting the canaries
	Eval *Evaluation
}

// DeploymentPauseRequest is used to pause a deployment
type DeploymentPauseRequest struct {
	DeploymentID string

	// Pause sets the pause status
	Pause bool

	WriteRequest
}

// DeploymentRunRequest is used to remotely start a pending deployment.
// Used only for multiregion deployments.
type DeploymentRunRequest struct {
	DeploymentID string

	WriteRequest
}

// DeploymentUnblockRequest is used to remotely unblock a deployment.
// Used only for multiregion deployments.
type DeploymentUnblockRequest struct {
	DeploymentID string

	WriteRequest
}

// DeploymentCancelRequest is used to remotely cancel a deployment.
// Used only for multiregion deployments.
type DeploymentCancelRequest struct {
	DeploymentID string

	WriteRequest
}

// DeploymentSpecificRequest is used to make a request specific to a particular
// deployment
type DeploymentSpecificRequest struct {
	DeploymentID string
	QueryOptions
}

// DeploymentFailRequest is used to fail a particular deployment
type DeploymentFailRequest struct {
	DeploymentID string
	WriteRequest
}
1178
// ScalingPolicySpecificRequest is used when we just need to specify a target scaling policy
type ScalingPolicySpecificRequest struct {
	ID string
	QueryOptions
}

// SingleScalingPolicyResponse is used to return a single scaling policy
type SingleScalingPolicyResponse struct {
	Policy *ScalingPolicy
	QueryMeta
}

// ScalingPolicyListRequest is used to parameterize a scaling policy list request
type ScalingPolicyListRequest struct {
	Job  string
	Type string
	QueryOptions
}

// ScalingPolicyListResponse is used for a list request
type ScalingPolicyListResponse struct {
	Policies []*ScalingPolicyListStub
	QueryMeta
}

// SingleDeploymentResponse is used to respond with a single deployment
type SingleDeploymentResponse struct {
	Deployment *Deployment
	QueryMeta
}

// GenericResponse is used to respond to a request where no
// specific response information is needed.
type GenericResponse struct {
	WriteMeta
}

// VersionResponse is used for the Status.Version response
type VersionResponse struct {
	Build    string
	Versions map[string]int
	QueryMeta
}
1222
// JobRegisterResponse is used to respond to a job registration
type JobRegisterResponse struct {
	EvalID          string
	EvalCreateIndex uint64
	JobModifyIndex  uint64

	// Warnings contains any warnings about the given job. These may include
	// deprecation warnings.
	Warnings string

	QueryMeta
}

// JobDeregisterResponse is used to respond to a job deregistration
type JobDeregisterResponse struct {
	EvalID          string
	EvalCreateIndex uint64
	JobModifyIndex  uint64
	VolumeEvalID    string
	VolumeEvalIndex uint64
	QueryMeta
}

// JobBatchDeregisterResponse is used to respond to a batch job deregistration
type JobBatchDeregisterResponse struct {
	// JobEvals maps the job to its created evaluation
	JobEvals map[NamespacedID]string
	QueryMeta
}

// JobValidateResponse is the response from validate request
type JobValidateResponse struct {
	// DriverConfigValidated indicates whether the agent validated the driver
	// config
	DriverConfigValidated bool

	// ValidationErrors is a list of validation errors
	ValidationErrors []string

	// Error is a string version of any error that may have occurred
	Error string

	// Warnings contains any warnings about the given job. These may include
	// deprecation warnings.
	Warnings string
}

// NodeUpdateResponse is used to respond to a node update
type NodeUpdateResponse struct {
	HeartbeatTTL    time.Duration
	EvalIDs         []string
	EvalCreateIndex uint64
	NodeModifyIndex uint64

	// Features informs clients what enterprise features are allowed
	Features uint64

	// LeaderRPCAddr is the RPC address of the current Raft Leader.  If
	// empty, the current Nomad Server is in the minority of a partition.
	LeaderRPCAddr string

	// NumNodes is the number of Nomad nodes attached to this quorum of
	// Nomad Servers at the time of the response.  This value can
	// fluctuate based on the health of the cluster between heartbeats.
	NumNodes int32

	// Servers is the full list of known Nomad servers in the local
	// region.
	Servers []*NodeServerInfo

	QueryMeta
}
1295
// NodeDrainUpdateResponse is used to respond to a node drain update
type NodeDrainUpdateResponse struct {
	NodeModifyIndex uint64
	EvalIDs         []string
	EvalCreateIndex uint64
	WriteMeta
}

// NodeEligibilityUpdateResponse is used to respond to a node eligibility update
type NodeEligibilityUpdateResponse struct {
	NodeModifyIndex uint64
	EvalIDs         []string
	EvalCreateIndex uint64
	WriteMeta
}

// NodeAllocsResponse is used to return allocs for a single node
type NodeAllocsResponse struct {
	Allocs []*Allocation
	QueryMeta
}

// NodeClientAllocsResponse is used to return allocs meta data for a single node
type NodeClientAllocsResponse struct {
	// Allocs maps allocation IDs to their allocation's modify index.
	Allocs map[string]uint64

	// MigrateTokens are used when ACLs are enabled to allow cross node,
	// authenticated access to sticky volumes
	MigrateTokens map[string]string

	QueryMeta
}

// SingleNodeResponse is used to return a single node
type SingleNodeResponse struct {
	Node *Node
	QueryMeta
}

// NodeListResponse is used for a list request
type NodeListResponse struct {
	Nodes []*NodeListStub
	QueryMeta
}

// SingleJobResponse is used to return a single job
type SingleJobResponse struct {
	Job *Job
	QueryMeta
}
1346
// JobSummaryResponse is used to return a single job summary
type JobSummaryResponse struct {
	JobSummary *JobSummary
	QueryMeta
}

// JobScaleStatusResponse is used to return the scale status for a job
type JobScaleStatusResponse struct {
	JobScaleStatus *JobScaleStatus
	QueryMeta
}

// JobScaleStatus summarizes the current scale state of a job, broken down
// per task group.
type JobScaleStatus struct {
	JobID          string
	Namespace      string
	JobCreateIndex uint64
	JobModifyIndex uint64
	JobStopped     bool
	TaskGroups     map[string]*TaskGroupScaleStatus
}

// TaskGroupScaleStatus is used to return the scale status for a given task group
type TaskGroupScaleStatus struct {
	Desired   int
	Placed    int
	Running   int
	Healthy   int
	Unhealthy int
	Events    []*ScalingEvent
}

// JobDispatchResponse is used to respond to the dispatch of a parameterized
// job, identifying the dispatched child job and its follow-up evaluation.
type JobDispatchResponse struct {
	DispatchedJobID string
	EvalID          string
	EvalCreateIndex uint64
	JobCreateIndex  uint64
	WriteMeta
}

// JobListResponse is used for a list request
type JobListResponse struct {
	Jobs []*JobListStub
	QueryMeta
}

// JobVersionsRequest is used to get a jobs versions
type JobVersionsRequest struct {
	JobID string
	Diffs bool
	QueryOptions
}

// JobVersionsResponse is used for a job get versions request
type JobVersionsResponse struct {
	Versions []*Job
	Diffs    []*JobDiff
	QueryMeta
}
1405
// JobPlanResponse is used to respond to a job plan request
type JobPlanResponse struct {
	// Annotations stores annotations explaining decisions the scheduler made.
	Annotations *PlanAnnotations

	// FailedTGAllocs is the placement failures per task group.
	FailedTGAllocs map[string]*AllocMetric

	// JobModifyIndex is the modification index of the job. The value can be
	// used when running `nomad run` to ensure that the Job wasn't modified
	// since the last plan. If the job is being created, the value is zero.
	JobModifyIndex uint64

	// CreatedEvals is the set of evaluations created by the scheduler. The
	// reasons for this can be rolling-updates or blocked evals.
	CreatedEvals []*Evaluation

	// Diff contains the diff of the job and annotations on whether the change
	// causes an in-place update or create/destroy
	Diff *JobDiff

	// NextPeriodicLaunch is the time duration till the job would be launched if
	// submitted.
	NextPeriodicLaunch time.Time

	// Warnings contains any warnings about the given job. These may include
	// deprecation warnings.
	Warnings string

	WriteMeta
}
1437
// SingleAllocResponse is used to return a single allocation
type SingleAllocResponse struct {
	Alloc *Allocation
	QueryMeta
}

// AllocsGetResponse is used to return a set of allocations
type AllocsGetResponse struct {
	Allocs []*Allocation
	QueryMeta
}

// JobAllocationsResponse is used to return the allocations for a job
type JobAllocationsResponse struct {
	Allocations []*AllocListStub
	QueryMeta
}

// JobEvaluationsResponse is used to return the evaluations for a job
type JobEvaluationsResponse struct {
	Evaluations []*Evaluation
	QueryMeta
}

// SingleEvalResponse is used to return a single evaluation
type SingleEvalResponse struct {
	Eval *Evaluation
	QueryMeta
}

// EvalDequeueResponse is used to return from a dequeue
type EvalDequeueResponse struct {
	Eval  *Evaluation
	Token string

	// WaitIndex is the Raft index the worker should wait until invoking the
	// scheduler.
	WaitIndex uint64

	QueryMeta
}
1479
1480// GetWaitIndex is used to retrieve the Raft index in which state should be at
1481// or beyond before invoking the scheduler.
1482func (e *EvalDequeueResponse) GetWaitIndex() uint64 {
1483	// Prefer the wait index sent. This will be populated on all responses from
1484	// 0.7.0 and above
1485	if e.WaitIndex != 0 {
1486		return e.WaitIndex
1487	} else if e.Eval != nil {
1488		return e.Eval.ModifyIndex
1489	}
1490
1491	// This should never happen
1492	return 1
1493}
1494
// PlanResponse is used to return from a PlanRequest
type PlanResponse struct {
	Result *PlanResult
	WriteMeta
}

// AllocListResponse is used for a list request
type AllocListResponse struct {
	Allocations []*AllocListStub
	QueryMeta
}

// DeploymentListResponse is used for a list request
type DeploymentListResponse struct {
	Deployments []*Deployment
	QueryMeta
}

// EvalListResponse is used for a list request
type EvalListResponse struct {
	Evaluations []*Evaluation
	QueryMeta
}

// EvalAllocationsResponse is used to return the allocations for an evaluation
type EvalAllocationsResponse struct {
	Allocations []*AllocListStub
	QueryMeta
}

// PeriodicForceResponse is used to respond to a periodic job force launch
type PeriodicForceResponse struct {
	EvalID          string
	EvalCreateIndex uint64
	WriteMeta
}

// DeploymentUpdateResponse is used to respond to a deployment change. The
// response will include the modify index of the deployment as well as details
// of any triggered evaluation.
type DeploymentUpdateResponse struct {
	EvalID                string
	EvalCreateIndex       uint64
	DeploymentModifyIndex uint64

	// RevertedJobVersion is the version the job was reverted to. If unset, the
	// job wasn't reverted
	RevertedJobVersion *uint64

	WriteMeta
}

// NodeConnQueryResponse is used to respond to a query of whether a server has
// a connection to a specific Node
type NodeConnQueryResponse struct {
	// Connected indicates whether a connection to the Client exists
	Connected bool

	// Established marks the time at which the connection was established
	Established time.Time

	QueryMeta
}

// HostDataRequest is used by /agent/host to retrieve data about the agent's host system. If
// ServerID or NodeID is specified, the request is forwarded to the remote agent
type HostDataRequest struct {
	ServerID string
	NodeID   string
	QueryOptions
}

// HostDataResponse contains the HostData content
type HostDataResponse struct {
	AgentID  string
	HostData *host.HostData
}

// EmitNodeEventsRequest is a request to update the node events source
// with a new client-side event
type EmitNodeEventsRequest struct {
	// NodeEvents are a map where the key is a node id, and value is a list of
	// events for that node
	NodeEvents map[string][]*NodeEvent

	WriteRequest
}

// EmitNodeEventsResponse is a response to the client about the status of
// the node event source update.
type EmitNodeEventsResponse struct {
	WriteMeta
}
1588
// Values for NodeEvent.Subsystem identifying which part of the client
// generated a node event.
const (
	NodeEventSubsystemDrain     = "Drain"
	NodeEventSubsystemDriver    = "Driver"
	NodeEventSubsystemHeartbeat = "Heartbeat"
	NodeEventSubsystemCluster   = "Cluster"
	NodeEventSubsystemStorage   = "Storage"
)
1596
// NodeEvent is a single unit representing a node's state change. Events are
// retained on the Node (see Node.Events) and surfaced to operators.
type NodeEvent struct {
	Message     string
	Subsystem   string
	Details     map[string]string
	Timestamp   time.Time
	CreateIndex uint64
}
1605
1606func (ne *NodeEvent) String() string {
1607	var details []string
1608	for k, v := range ne.Details {
1609		details = append(details, fmt.Sprintf("%s: %s", k, v))
1610	}
1611
1612	return fmt.Sprintf("Message: %s, Subsystem: %s, Details: %s, Timestamp: %s", ne.Message, ne.Subsystem, strings.Join(details, ","), ne.Timestamp.String())
1613}
1614
1615func (ne *NodeEvent) Copy() *NodeEvent {
1616	c := new(NodeEvent)
1617	*c = *ne
1618	c.Details = helper.CopyMapStringString(ne.Details)
1619	return c
1620}
1621
1622// NewNodeEvent generates a new node event storing the current time as the
1623// timestamp
1624func NewNodeEvent() *NodeEvent {
1625	return &NodeEvent{Timestamp: time.Now()}
1626}
1627
1628// SetMessage is used to set the message on the node event
1629func (ne *NodeEvent) SetMessage(msg string) *NodeEvent {
1630	ne.Message = msg
1631	return ne
1632}
1633
1634// SetSubsystem is used to set the subsystem on the node event
1635func (ne *NodeEvent) SetSubsystem(sys string) *NodeEvent {
1636	ne.Subsystem = sys
1637	return ne
1638}
1639
1640// SetTimestamp is used to set the timestamp on the node event
1641func (ne *NodeEvent) SetTimestamp(ts time.Time) *NodeEvent {
1642	ne.Timestamp = ts
1643	return ne
1644}
1645
1646// AddDetail is used to add a detail to the node event
1647func (ne *NodeEvent) AddDetail(k, v string) *NodeEvent {
1648	if ne.Details == nil {
1649		ne.Details = make(map[string]string, 1)
1650	}
1651	ne.Details[k] = v
1652	return ne
1653}
1654
const (
	// NodeStatusInit marks a node that is still initializing.
	NodeStatusInit = "initializing"

	// NodeStatusReady marks a node that has registered and may be assigned
	// work.
	NodeStatusReady = "ready"

	// NodeStatusDown marks a node as down; this is the only terminal node
	// status (see Node.TerminalStatus).
	NodeStatusDown = "down"
)

// ShouldDrainNode checks if a given node status should trigger an
// evaluation. Some states don't require any further action.
func ShouldDrainNode(status string) bool {
	switch status {
	case NodeStatusDown:
		return true
	case NodeStatusInit, NodeStatusReady:
		return false
	default:
		// An unknown status indicates a programming error.
		panic(fmt.Sprintf("unhandled node status %s", status))
	}
}

// ValidNodeStatus is used to check if a node status is valid
func ValidNodeStatus(status string) bool {
	switch status {
	case NodeStatusInit, NodeStatusReady, NodeStatusDown:
		return true
	}
	return false
}
1683
const (
	// NodeSchedulingEligible and Ineligible marks the node as eligible or not,
	// respectively, for receiving allocations. This is orthogonal to the node
	// status being ready.
	NodeSchedulingEligible   = "eligible"
	NodeSchedulingIneligible = "ineligible"
)
1691
// DrainSpec describes a Node's desired drain behavior.
type DrainSpec struct {
	// Deadline is the duration after StartTime when the remaining
	// allocations on a draining Node should be told to stop.
	Deadline time.Duration

	// IgnoreSystemJobs allows system jobs to remain on the node even though
	// it has been marked for draining.
	IgnoreSystemJobs bool
}

// DrainStrategy describes a Node's drain behavior.
type DrainStrategy struct {
	// DrainSpec is the user declared drain specification
	DrainSpec

	// ForceDeadline is the deadline time for the drain after which drains will
	// be forced
	ForceDeadline time.Time

	// StartedAt is the time the drain process started
	StartedAt time.Time
}

// Copy returns a copy of the drain strategy; the struct holds no reference
// types so a value copy suffices. A nil receiver yields nil.
func (d *DrainStrategy) Copy() *DrainStrategy {
	if d == nil {
		return nil
	}

	c := *d
	return &c
}

// DeadlineTime returns a boolean whether the drain strategy allows an infinite
// duration or otherwise the deadline time. The force drain is captured by the
// deadline time being in the past.
func (d *DrainStrategy) DeadlineTime() (infinite bool, deadline time.Time) {
	// Treat the nil case as a force drain so during an upgrade where a node
	// may not have a drain strategy but has Drain set to true, it is treated
	// as a force to mimic old behavior.
	if d == nil {
		return false, time.Time{}
	}

	ns := d.Deadline.Nanoseconds()
	if ns < 0 {
		// A negative deadline forces the drain: the zero time is always past.
		return false, time.Time{}
	}
	if ns == 0 {
		// A zero deadline means the drain may run forever.
		return true, time.Time{}
	}
	return false, d.ForceDeadline
}

// Equal returns true when both strategies are nil, or when both are non-nil
// and every field matches.
func (d *DrainStrategy) Equal(o *DrainStrategy) bool {
	if d == nil || o == nil {
		return d == o
	}

	return d.ForceDeadline == o.ForceDeadline &&
		d.Deadline == o.Deadline &&
		d.IgnoreSystemJobs == o.IgnoreSystemJobs
}
1768
const (
	// DrainStatuses are the various states a drain can be in, as reflected in DrainMetadata
	DrainStatusDraining DrainStatus = "draining"
	DrainStatusComplete DrainStatus = "complete"
	DrainStatusCanceled DrainStatus = "canceled"
)

// DrainStatus describes the state of a node drain operation.
type DrainStatus string
1777
// DrainMetadata contains information about the most recent drain operation for a given Node.
type DrainMetadata struct {
	// StartedAt is the time that the drain operation started. This is equal to Node.DrainStrategy.StartedAt,
	// if it exists
	StartedAt time.Time

	// UpdatedAt is the time that this struct was most recently updated, either via API action
	// or drain completion
	UpdatedAt time.Time

	// Status reflects the status of the drain operation.
	Status DrainStatus

	// AccessorID is the accessor ID of the ACL token used in the most recent API operation against this drain
	AccessorID string

	// Meta includes the operator-submitted metadata about this drain operation
	Meta map[string]string
}
1797
1798func (m *DrainMetadata) Copy() *DrainMetadata {
1799	if m == nil {
1800		return nil
1801	}
1802	c := new(DrainMetadata)
1803	*c = *m
1804	c.Meta = helper.CopyMapStringString(m.Meta)
1805	return c
1806}
1807
// Node is a representation of a schedulable client node
type Node struct {
	// ID is a unique identifier for the node. It can be constructed
	// by doing a concatenation of the Name and Datacenter as a simple
	// approach. Alternatively a UUID may be used.
	ID string

	// SecretID is an ID that is only known by the Node and the set of Servers.
	// It is not accessible via the API and is used to authenticate nodes
	// conducting privileged activities.
	SecretID string

	// Datacenter for this node
	Datacenter string

	// Node name
	Name string

	// HTTPAddr is the address on which the Nomad client is listening for http
	// requests
	HTTPAddr string

	// TLSEnabled indicates if the Agent has TLS enabled for the HTTP API
	TLSEnabled bool

	// Attributes is an arbitrary set of key/value
	// data that can be used for constraints. Examples
	// include "kernel.name=linux", "arch=386", "driver.docker=1",
	// "docker.runtime=1.8.3"
	Attributes map[string]string

	// NodeResources captures the available resources on the client.
	NodeResources *NodeResources

	// ReservedResources captures the set resources on the client that are
	// reserved from scheduling.
	ReservedResources *NodeReservedResources

	// Resources is the available resources on the client.
	// For example 'cpu=2' 'memory=2048'
	// COMPAT(0.10): Remove after 0.10
	Resources *Resources

	// Reserved is the set of resources that are reserved,
	// and should be subtracted from the total resources for
	// the purposes of scheduling. This may provide certain
	// high-watermark tolerances or because of external schedulers
	// consuming resources.
	// COMPAT(0.10): Remove after 0.10
	Reserved *Resources

	// Links are used to 'link' this client to external
	// systems. For example 'consul=foo.dc1' 'aws=i-83212'
	// 'ami=ami-123'
	Links map[string]string

	// Meta is used to associate arbitrary metadata with this
	// client. This is opaque to Nomad.
	Meta map[string]string

	// NodeClass is an opaque identifier used to group nodes
	// together for the purpose of determining scheduling pressure.
	NodeClass string

	// ComputedClass is a unique id that identifies nodes with a common set of
	// attributes and capabilities.
	ComputedClass string

	// DrainStrategy determines the node's draining behavior.
	// Will be non-nil only while draining.
	DrainStrategy *DrainStrategy

	// SchedulingEligibility determines whether this node will receive new
	// placements.
	SchedulingEligibility string

	// Status of this node
	Status string

	// StatusDescription is meant to provide more human useful information
	StatusDescription string

	// StatusUpdatedAt is the time stamp at which the state of the node was
	// updated
	StatusUpdatedAt int64

	// Events is the most recent set of events generated for the node,
	// retaining only MaxRetainedNodeEvents number at a time
	Events []*NodeEvent

	// Drivers is a map of driver names to current driver information
	Drivers map[string]*DriverInfo

	// CSIControllerPlugins is a map of plugin names to current CSI Plugin info
	CSIControllerPlugins map[string]*CSIInfo
	// CSINodePlugins is a map of plugin names to current CSI Plugin info
	CSINodePlugins map[string]*CSIInfo

	// HostVolumes is a map of host volume names to their configuration
	HostVolumes map[string]*ClientHostVolumeConfig

	// LastDrain contains metadata about the most recent drain operation
	LastDrain *DrainMetadata

	// Raft Indexes
	CreateIndex uint64
	ModifyIndex uint64
}
1916
1917// Sanitize returns a copy of the Node omitting confidential fields
1918// It only returns a copy if the Node contains the confidential fields
1919func (n *Node) Sanitize() *Node {
1920	if n == nil {
1921		return nil
1922	}
1923	if n.SecretID == "" {
1924		return n
1925	}
1926	clean := n.Copy()
1927	clean.SecretID = ""
1928	return clean
1929}
1930
1931// Ready returns true if the node is ready for running allocations
1932func (n *Node) Ready() bool {
1933	return n.Status == NodeStatusReady && n.DrainStrategy == nil && n.SchedulingEligibility == NodeSchedulingEligible
1934}
1935
// Canonicalize normalizes a node in place: it derives the scheduling
// eligibility from the drain state and upgrades pre-0.12 network resources to
// the node-specific network struct. Safe to call on a nil receiver.
func (n *Node) Canonicalize() {
	if n == nil {
		return
	}

	// Ensure SchedulingEligibility is correctly set whenever draining so the plan applier and other scheduling logic
	// only need to check SchedulingEligibility when determining whether a placement is feasible on a node.
	if n.DrainStrategy != nil {
		n.SchedulingEligibility = NodeSchedulingIneligible
	} else if n.SchedulingEligibility == "" {
		n.SchedulingEligibility = NodeSchedulingEligible
	}

	// COMPAT remove in 1.0
	// In v0.12.0 we introduced a separate node specific network resource struct
	// so we need to convert any pre 0.12 clients to the correct struct
	if n.NodeResources != nil && n.NodeResources.NodeNetworks == nil {
		if n.NodeResources.Networks != nil {
			// Build one NodeNetworkResource per legacy network entry.
			for _, nr := range n.NodeResources.Networks {
				nnr := &NodeNetworkResource{
					Mode:   nr.Mode,
					Speed:  nr.MBits,
					Device: nr.Device,
				}
				if nr.IP != "" {
					// Legacy entries carry a single IP; expose it under the
					// "default" alias.
					nnr.Addresses = []NodeNetworkAddress{
						{
							Alias:   "default",
							Address: nr.IP,
						},
					}
				}
				n.NodeResources.NodeNetworks = append(n.NodeResources.NodeNetworks, nnr)
			}
		}
	}
}
1973
// Copy returns a deep copy of the node: scalar fields are copied by value and
// every reference-typed field (maps, slices, nested structs) is duplicated so
// the copy is fully independent. A nil receiver yields nil.
func (n *Node) Copy() *Node {
	if n == nil {
		return nil
	}
	nn := new(Node)
	*nn = *n
	nn.Attributes = helper.CopyMapStringString(nn.Attributes)
	nn.Resources = nn.Resources.Copy()
	nn.Reserved = nn.Reserved.Copy()
	nn.NodeResources = nn.NodeResources.Copy()
	nn.ReservedResources = nn.ReservedResources.Copy()
	nn.Links = helper.CopyMapStringString(nn.Links)
	nn.Meta = helper.CopyMapStringString(nn.Meta)
	nn.Events = copyNodeEvents(n.Events)
	nn.DrainStrategy = nn.DrainStrategy.Copy()
	nn.LastDrain = nn.LastDrain.Copy()
	nn.CSIControllerPlugins = copyNodeCSI(nn.CSIControllerPlugins)
	nn.CSINodePlugins = copyNodeCSI(nn.CSINodePlugins)
	nn.Drivers = copyNodeDrivers(n.Drivers)
	nn.HostVolumes = copyNodeHostVolumes(n.HostVolumes)
	return nn
}
1996
1997// copyNodeEvents is a helper to copy a list of NodeEvent's
1998func copyNodeEvents(events []*NodeEvent) []*NodeEvent {
1999	l := len(events)
2000	if l == 0 {
2001		return nil
2002	}
2003
2004	c := make([]*NodeEvent, l)
2005	for i, event := range events {
2006		c[i] = event.Copy()
2007	}
2008	return c
2009}
2010
2011// copyNodeCSI is a helper to copy a map of CSIInfo
2012func copyNodeCSI(plugins map[string]*CSIInfo) map[string]*CSIInfo {
2013	l := len(plugins)
2014	if l == 0 {
2015		return nil
2016	}
2017
2018	c := make(map[string]*CSIInfo, l)
2019	for plugin, info := range plugins {
2020		c[plugin] = info.Copy()
2021	}
2022
2023	return c
2024}
2025
2026// copyNodeDrivers is a helper to copy a map of DriverInfo
2027func copyNodeDrivers(drivers map[string]*DriverInfo) map[string]*DriverInfo {
2028	l := len(drivers)
2029	if l == 0 {
2030		return nil
2031	}
2032
2033	c := make(map[string]*DriverInfo, l)
2034	for driver, info := range drivers {
2035		c[driver] = info.Copy()
2036	}
2037	return c
2038}
2039
2040// copyNodeHostVolumes is a helper to copy a map of string to Volume
2041func copyNodeHostVolumes(volumes map[string]*ClientHostVolumeConfig) map[string]*ClientHostVolumeConfig {
2042	l := len(volumes)
2043	if l == 0 {
2044		return nil
2045	}
2046
2047	c := make(map[string]*ClientHostVolumeConfig, l)
2048	for volume, v := range volumes {
2049		c[volume] = v.Copy()
2050	}
2051
2052	return c
2053}
2054
2055// TerminalStatus returns if the current status is terminal and
2056// will no longer transition.
2057func (n *Node) TerminalStatus() bool {
2058	switch n.Status {
2059	case NodeStatusDown:
2060		return true
2061	default:
2062		return false
2063	}
2064}
2065
2066// COMPAT(0.11): Remove in 0.11
2067// ComparableReservedResources returns the reserved resouces on the node
2068// handling upgrade paths. Reserved networks must be handled separately. After
2069// 0.11 calls to this should be replaced with:
2070// node.ReservedResources.Comparable()
2071func (n *Node) ComparableReservedResources() *ComparableResources {
2072	// See if we can no-op
2073	if n.Reserved == nil && n.ReservedResources == nil {
2074		return nil
2075	}
2076
2077	// Node already has 0.9+ behavior
2078	if n.ReservedResources != nil {
2079		return n.ReservedResources.Comparable()
2080	}
2081
2082	// Upgrade path
2083	return &ComparableResources{
2084		Flattened: AllocatedTaskResources{
2085			Cpu: AllocatedCpuResources{
2086				CpuShares: int64(n.Reserved.CPU),
2087			},
2088			Memory: AllocatedMemoryResources{
2089				MemoryMB: int64(n.Reserved.MemoryMB),
2090			},
2091		},
2092		Shared: AllocatedSharedResources{
2093			DiskMB: int64(n.Reserved.DiskMB),
2094		},
2095	}
2096}
2097
2098// COMPAT(0.11): Remove in 0.11
2099// ComparableResources returns the resouces on the node
2100// handling upgrade paths. Networking must be handled separately. After 0.11
2101// calls to this should be replaced with: node.NodeResources.Comparable()
2102func (n *Node) ComparableResources() *ComparableResources {
2103	// Node already has 0.9+ behavior
2104	if n.NodeResources != nil {
2105		return n.NodeResources.Comparable()
2106	}
2107
2108	// Upgrade path
2109	return &ComparableResources{
2110		Flattened: AllocatedTaskResources{
2111			Cpu: AllocatedCpuResources{
2112				CpuShares: int64(n.Resources.CPU),
2113			},
2114			Memory: AllocatedMemoryResources{
2115				MemoryMB: int64(n.Resources.MemoryMB),
2116			},
2117		},
2118		Shared: AllocatedSharedResources{
2119			DiskMB: int64(n.Resources.DiskMB),
2120		},
2121	}
2122}
2123
2124// Stub returns a summarized version of the node
2125func (n *Node) Stub(fields *NodeStubFields) *NodeListStub {
2126
2127	addr, _, _ := net.SplitHostPort(n.HTTPAddr)
2128
2129	s := &NodeListStub{
2130		Address:               addr,
2131		ID:                    n.ID,
2132		Datacenter:            n.Datacenter,
2133		Name:                  n.Name,
2134		NodeClass:             n.NodeClass,
2135		Version:               n.Attributes["nomad.version"],
2136		Drain:                 n.DrainStrategy != nil,
2137		SchedulingEligibility: n.SchedulingEligibility,
2138		Status:                n.Status,
2139		StatusDescription:     n.StatusDescription,
2140		Drivers:               n.Drivers,
2141		HostVolumes:           n.HostVolumes,
2142		LastDrain:             n.LastDrain,
2143		CreateIndex:           n.CreateIndex,
2144		ModifyIndex:           n.ModifyIndex,
2145	}
2146
2147	if fields != nil {
2148		if fields.Resources {
2149			s.NodeResources = n.NodeResources
2150			s.ReservedResources = n.ReservedResources
2151		}
2152	}
2153
2154	return s
2155}
2156
// NodeListStub is used to return a subset of node information
// for the node list.
type NodeListStub struct {
	// Address is the host portion of the node's advertised HTTP address.
	Address               string
	ID                    string
	Datacenter            string
	Name                  string
	NodeClass             string
	// Version is the node's "nomad.version" attribute.
	Version               string
	// Drain is true when the node has an active drain strategy.
	Drain                 bool
	SchedulingEligibility string
	Status                string
	StatusDescription     string
	Drivers               map[string]*DriverInfo
	HostVolumes           map[string]*ClientHostVolumeConfig
	// NodeResources and ReservedResources are only populated when the
	// caller requests them via NodeStubFields.Resources.
	NodeResources         *NodeResources         `json:",omitempty"`
	ReservedResources     *NodeReservedResources `json:",omitempty"`
	LastDrain             *DrainMetadata
	CreateIndex           uint64
	ModifyIndex           uint64
}
2178
// NodeStubFields defines which fields are included in the NodeListStub.
type NodeStubFields struct {
	// Resources, when true, includes NodeResources and ReservedResources
	// in the stub.
	Resources bool
}
2183
// Resources is used to define the resources available
// on a client
type Resources struct {
	CPU         int // cpu to allocate; mutually exclusive with Cores (see Validate)
	Cores       int // whole cores to reserve; mutually exclusive with CPU (see Validate)
	MemoryMB    int // memory to allocate, in MB
	MemoryMaxMB int // optional memory ceiling in MB; must be >= MemoryMB when set (see Validate)
	DiskMB      int // disk in MB; only valid at the task group level (see Validate)
	IOPS        int // COMPAT(0.10): Only being used to issue warnings
	Networks    Networks
	Devices     ResourceDevices
}
2196
const (
	// BytesInMegabyte is the number of bytes per megabyte, used to convert
	// MB-denominated resource values to bytes.
	BytesInMegabyte = 1024 * 1024
)
2200
2201// DefaultResources is a small resources object that contains the
2202// default resources requests that we will provide to an object.
2203// ---  THIS FUNCTION IS REPLICATED IN api/resources.go and should
2204// be kept in sync.
2205func DefaultResources() *Resources {
2206	return &Resources{
2207		CPU:      100,
2208		Cores:    0,
2209		MemoryMB: 300,
2210	}
2211}
2212
2213// MinResources is a small resources object that contains the
2214// absolute minimum resources that we will provide to an object.
2215// This should not be confused with the defaults which are
2216// provided in Canonicalize() ---  THIS FUNCTION IS REPLICATED IN
2217// api/resources.go and should be kept in sync.
2218func MinResources() *Resources {
2219	return &Resources{
2220		CPU:      1,
2221		Cores:    0,
2222		MemoryMB: 10,
2223	}
2224}
2225
2226// DiskInBytes returns the amount of disk resources in bytes.
2227func (r *Resources) DiskInBytes() int64 {
2228	return int64(r.DiskMB * BytesInMegabyte)
2229}
2230
2231func (r *Resources) Validate() error {
2232	var mErr multierror.Error
2233
2234	if r.Cores > 0 && r.CPU > 0 {
2235		mErr.Errors = append(mErr.Errors, errors.New("Task can only ask for 'cpu' or 'cores' resource, not both."))
2236	}
2237
2238	if err := r.MeetsMinResources(); err != nil {
2239		mErr.Errors = append(mErr.Errors, err)
2240	}
2241
2242	// Ensure the task isn't asking for disk resources
2243	if r.DiskMB > 0 {
2244		mErr.Errors = append(mErr.Errors, errors.New("Task can't ask for disk resources, they have to be specified at the task group level."))
2245	}
2246
2247	for i, d := range r.Devices {
2248		if err := d.Validate(); err != nil {
2249			mErr.Errors = append(mErr.Errors, fmt.Errorf("device %d failed validation: %v", i+1, err))
2250		}
2251	}
2252
2253	if r.MemoryMaxMB != 0 && r.MemoryMaxMB < r.MemoryMB {
2254		mErr.Errors = append(mErr.Errors, fmt.Errorf("MemoryMaxMB value (%d) should be larger than MemoryMB value (%d)", r.MemoryMaxMB, r.MemoryMB))
2255	}
2256
2257	return mErr.ErrorOrNil()
2258}
2259
2260// Merge merges this resource with another resource.
2261// COMPAT(0.10): Remove in 0.10
2262func (r *Resources) Merge(other *Resources) {
2263	if other.CPU != 0 {
2264		r.CPU = other.CPU
2265	}
2266	if other.Cores != 0 {
2267		r.Cores = other.Cores
2268	}
2269	if other.MemoryMB != 0 {
2270		r.MemoryMB = other.MemoryMB
2271	}
2272	if other.MemoryMaxMB != 0 {
2273		r.MemoryMaxMB = other.MemoryMaxMB
2274	}
2275	if other.DiskMB != 0 {
2276		r.DiskMB = other.DiskMB
2277	}
2278	if len(other.Networks) != 0 {
2279		r.Networks = other.Networks
2280	}
2281	if len(other.Devices) != 0 {
2282		r.Devices = other.Devices
2283	}
2284}
2285
2286// COMPAT(0.10): Remove in 0.10
2287func (r *Resources) Equals(o *Resources) bool {
2288	if r == o {
2289		return true
2290	}
2291	if r == nil || o == nil {
2292		return false
2293	}
2294	return r.CPU == o.CPU &&
2295		r.Cores == o.Cores &&
2296		r.MemoryMB == o.MemoryMB &&
2297		r.MemoryMaxMB == o.MemoryMaxMB &&
2298		r.DiskMB == o.DiskMB &&
2299		r.IOPS == o.IOPS &&
2300		r.Networks.Equals(&o.Networks) &&
2301		r.Devices.Equals(&o.Devices)
2302}
2303
// COMPAT(0.10): Remove in 0.10
// ResourceDevices are part of Resources.
// The slice is treated as a set keyed by RequestedDevice.Name (see Equals).
type ResourceDevices []*RequestedDevice
2307
2308// COMPAT(0.10): Remove in 0.10
2309// Equals ResourceDevices as set keyed by Name
2310func (d *ResourceDevices) Equals(o *ResourceDevices) bool {
2311	if d == o {
2312		return true
2313	}
2314	if d == nil || o == nil {
2315		return false
2316	}
2317	if len(*d) != len(*o) {
2318		return false
2319	}
2320	m := make(map[string]*RequestedDevice, len(*d))
2321	for _, e := range *d {
2322		m[e.Name] = e
2323	}
2324	for _, oe := range *o {
2325		de, ok := m[oe.Name]
2326		if !ok || !de.Equals(oe) {
2327			return false
2328		}
2329	}
2330	return true
2331}
2332
2333// COMPAT(0.10): Remove in 0.10
2334func (r *Resources) Canonicalize() {
2335	// Ensure that an empty and nil slices are treated the same to avoid scheduling
2336	// problems since we use reflect DeepEquals.
2337	if len(r.Networks) == 0 {
2338		r.Networks = nil
2339	}
2340	if len(r.Devices) == 0 {
2341		r.Devices = nil
2342	}
2343
2344	for _, n := range r.Networks {
2345		n.Canonicalize()
2346	}
2347}
2348
2349// MeetsMinResources returns an error if the resources specified are less than
2350// the minimum allowed.
2351// This is based on the minimums defined in the Resources type
2352// COMPAT(0.10): Remove in 0.10
2353func (r *Resources) MeetsMinResources() error {
2354	var mErr multierror.Error
2355	minResources := MinResources()
2356	if r.CPU < minResources.CPU && r.Cores == 0 {
2357		mErr.Errors = append(mErr.Errors, fmt.Errorf("minimum CPU value is %d; got %d", minResources.CPU, r.CPU))
2358	}
2359	if r.MemoryMB < minResources.MemoryMB {
2360		mErr.Errors = append(mErr.Errors, fmt.Errorf("minimum MemoryMB value is %d; got %d", minResources.MemoryMB, r.MemoryMB))
2361	}
2362	return mErr.ErrorOrNil()
2363}
2364
2365// Copy returns a deep copy of the resources
2366func (r *Resources) Copy() *Resources {
2367	if r == nil {
2368		return nil
2369	}
2370	newR := new(Resources)
2371	*newR = *r
2372
2373	// Copy the network objects
2374	newR.Networks = r.Networks.Copy()
2375
2376	// Copy the devices
2377	if r.Devices != nil {
2378		n := len(r.Devices)
2379		newR.Devices = make([]*RequestedDevice, n)
2380		for i := 0; i < n; i++ {
2381			newR.Devices[i] = r.Devices[i].Copy()
2382		}
2383	}
2384
2385	return newR
2386}
2387
// NetIndex finds the matching net index using device name
// COMPAT(0.10): Remove in 0.10
// Delegates to Networks.NetIndex; returns -1 when no network with the same
// device name exists.
func (r *Resources) NetIndex(n *NetworkResource) int {
	return r.Networks.NetIndex(n)
}
2393
2394// Add adds the resources of the delta to this, potentially
2395// returning an error if not possible.
2396// COMPAT(0.10): Remove in 0.10
2397func (r *Resources) Add(delta *Resources) {
2398	if delta == nil {
2399		return
2400	}
2401
2402	r.CPU += delta.CPU
2403	r.MemoryMB += delta.MemoryMB
2404	if delta.MemoryMaxMB > 0 {
2405		r.MemoryMaxMB += delta.MemoryMaxMB
2406	} else {
2407		r.MemoryMaxMB += delta.MemoryMB
2408	}
2409	r.DiskMB += delta.DiskMB
2410
2411	for _, n := range delta.Networks {
2412		// Find the matching interface by IP or CIDR
2413		idx := r.NetIndex(n)
2414		if idx == -1 {
2415			r.Networks = append(r.Networks, n.Copy())
2416		} else {
2417			r.Networks[idx].Add(n)
2418		}
2419	}
2420}
2421
// COMPAT(0.10): Remove in 0.10
// GoString renders the dereferenced struct in Go syntax for debugging.
func (r *Resources) GoString() string {
	return fmt.Sprintf("*%#v", *r)
}
2426
// NodeNetworkResource is used to describe a fingerprinted network of a node
type NodeNetworkResource struct {
	Mode string // host for physical networks, cni/<name> for cni networks

	// The following apply only to host networks
	Device     string // interface name
	MacAddress string
	Speed      int

	Addresses []NodeNetworkAddress // not valid for cni, for bridge there will only be 1 ip
}
2438
// Equals reports deep structural equality with o via reflect.DeepEqual,
// including the Addresses slice (order-sensitive).
func (n *NodeNetworkResource) Equals(o *NodeNetworkResource) bool {
	return reflect.DeepEqual(n, o)
}
2442
2443func (n *NodeNetworkResource) HasAlias(alias string) bool {
2444	for _, addr := range n.Addresses {
2445		if addr.Alias == alias {
2446			return true
2447		}
2448	}
2449	return false
2450}
2451
// NodeNetworkAF is the address family of a node network address.
type NodeNetworkAF string

const (
	// NodeNetworkAF_IPv4 marks an IPv4 address.
	NodeNetworkAF_IPv4 NodeNetworkAF = "ipv4"
	// NodeNetworkAF_IPv6 marks an IPv6 address.
	NodeNetworkAF_IPv6 NodeNetworkAF = "ipv6"
)
2458
// NodeNetworkAddress is a single address on a fingerprinted node network.
type NodeNetworkAddress struct {
	Family        NodeNetworkAF
	Alias         string
	Address       string
	// ReservedPorts holds reserved ports as a string — presumably a
	// comma-separated list or range spec; TODO confirm the expected format.
	ReservedPorts string
	Gateway       string // default route for this address
}
2466
// AllocatedPortMapping describes a single port chosen for an allocation
// and where it lands on the host.
type AllocatedPortMapping struct {
	Label  string // port label from the jobspec
	Value  int    // port number on the host
	To     int    // port inside the task's network namespace
	HostIP string // host IP the port is bound to
}
2473
// AllocatedPorts is the list of port mappings chosen for an allocation.
type AllocatedPorts []AllocatedPortMapping
2475
2476func (p AllocatedPorts) Get(label string) (AllocatedPortMapping, bool) {
2477	for _, port := range p {
2478		if port.Label == label {
2479			return port, true
2480		}
2481	}
2482
2483	return AllocatedPortMapping{}, false
2484}
2485
// Port describes a single port requested by a jobspec network stanza.
type Port struct {
	// Label is the key for HCL port stanzas: port "foo" {}
	Label string

	// Value is the static or dynamic port value. For dynamic ports this
	// will be 0 in the jobspec and set by the scheduler.
	Value int

	// To is the port inside a network namespace where this port is
	// forwarded. -1 is an internal sentinel value used by Consul Connect
	// to mean "same as the host port."
	To int

	// HostNetwork is the name of the network this port should be assigned
	// to. Jobs with a HostNetwork set can only be placed on nodes with
	// that host network available.
	HostNetwork string
}
2504
// DNSConfig holds the DNS settings (resolvers, search domains, and
// resolver options) applied to a network.
type DNSConfig struct {
	Servers  []string
	Searches []string
	Options  []string
}
2510
2511func (d *DNSConfig) Copy() *DNSConfig {
2512	if d == nil {
2513		return nil
2514	}
2515	newD := new(DNSConfig)
2516	newD.Servers = make([]string, len(d.Servers))
2517	copy(newD.Servers, d.Servers)
2518	newD.Searches = make([]string, len(d.Searches))
2519	copy(newD.Searches, d.Searches)
2520	newD.Options = make([]string, len(d.Options))
2521	copy(newD.Options, d.Options)
2522	return newD
2523}
2524
// NetworkResource is used to represent available network
// resources
type NetworkResource struct {
	Mode          string     // Mode of the network
	Device        string     // Name of the device
	CIDR          string     // CIDR block of addresses
	IP            string     // Host IP address
	MBits         int        // Throughput
	DNS           *DNSConfig // DNS Configuration
	ReservedPorts []Port     // Host Reserved ports
	DynamicPorts  []Port     // Host Dynamically assigned ports
}
2537
2538func (nr *NetworkResource) Hash() uint32 {
2539	var data []byte
2540	data = append(data, []byte(fmt.Sprintf("%s%s%s%s%d", nr.Mode, nr.Device, nr.CIDR, nr.IP, nr.MBits))...)
2541
2542	for i, port := range nr.ReservedPorts {
2543		data = append(data, []byte(fmt.Sprintf("r%d%s%d%d", i, port.Label, port.Value, port.To))...)
2544	}
2545
2546	for i, port := range nr.DynamicPorts {
2547		data = append(data, []byte(fmt.Sprintf("d%d%s%d%d", i, port.Label, port.Value, port.To))...)
2548	}
2549
2550	return crc32.ChecksumIEEE(data)
2551}
2552
// Equals reports whether two network resources are identical, defined as
// having equal Hash() values. Hash covers Mode, Device, CIDR, IP, MBits and
// both port lists, but not DNS; distinct values could in principle collide
// on the CRC32.
func (nr *NetworkResource) Equals(other *NetworkResource) bool {
	return nr.Hash() == other.Hash()
}
2556
2557func (n *NetworkResource) Canonicalize() {
2558	// Ensure that an empty and nil slices are treated the same to avoid scheduling
2559	// problems since we use reflect DeepEquals.
2560	if len(n.ReservedPorts) == 0 {
2561		n.ReservedPorts = nil
2562	}
2563	if len(n.DynamicPorts) == 0 {
2564		n.DynamicPorts = nil
2565	}
2566
2567	for i, p := range n.DynamicPorts {
2568		if p.HostNetwork == "" {
2569			n.DynamicPorts[i].HostNetwork = "default"
2570		}
2571	}
2572	for i, p := range n.ReservedPorts {
2573		if p.HostNetwork == "" {
2574			n.ReservedPorts[i].HostNetwork = "default"
2575		}
2576	}
2577}
2578
2579// Copy returns a deep copy of the network resource
2580func (n *NetworkResource) Copy() *NetworkResource {
2581	if n == nil {
2582		return nil
2583	}
2584	newR := new(NetworkResource)
2585	*newR = *n
2586	if n.ReservedPorts != nil {
2587		newR.ReservedPorts = make([]Port, len(n.ReservedPorts))
2588		copy(newR.ReservedPorts, n.ReservedPorts)
2589	}
2590	if n.DynamicPorts != nil {
2591		newR.DynamicPorts = make([]Port, len(n.DynamicPorts))
2592		copy(newR.DynamicPorts, n.DynamicPorts)
2593	}
2594	return newR
2595}
2596
2597// Add adds the resources of the delta to this, potentially
2598// returning an error if not possible.
2599func (n *NetworkResource) Add(delta *NetworkResource) {
2600	if len(delta.ReservedPorts) > 0 {
2601		n.ReservedPorts = append(n.ReservedPorts, delta.ReservedPorts...)
2602	}
2603	n.MBits += delta.MBits
2604	n.DynamicPorts = append(n.DynamicPorts, delta.DynamicPorts...)
2605}
2606
// GoString renders the dereferenced struct in Go syntax for debugging.
func (n *NetworkResource) GoString() string {
	return fmt.Sprintf("*%#v", *n)
}
2610
2611// PortLabels returns a map of port labels to their assigned host ports.
2612func (n *NetworkResource) PortLabels() map[string]int {
2613	num := len(n.ReservedPorts) + len(n.DynamicPorts)
2614	labelValues := make(map[string]int, num)
2615	for _, port := range n.ReservedPorts {
2616		labelValues[port.Label] = port.Value
2617	}
2618	for _, port := range n.DynamicPorts {
2619		labelValues[port.Label] = port.Value
2620	}
2621	return labelValues
2622}
2623
// Networks defined for a task on the Resources struct.
// Compared as a set in Equals; indexed by device name in NetIndex.
type Networks []*NetworkResource
2626
2627func (ns Networks) Copy() Networks {
2628	if len(ns) == 0 {
2629		return nil
2630	}
2631
2632	out := make([]*NetworkResource, len(ns))
2633	for i := range ns {
2634		out[i] = ns[i].Copy()
2635	}
2636	return out
2637}
2638
2639// Port assignment and IP for the given label or empty values.
2640func (ns Networks) Port(label string) AllocatedPortMapping {
2641	for _, n := range ns {
2642		for _, p := range n.ReservedPorts {
2643			if p.Label == label {
2644				return AllocatedPortMapping{
2645					Label:  label,
2646					Value:  p.Value,
2647					To:     p.To,
2648					HostIP: n.IP,
2649				}
2650			}
2651		}
2652		for _, p := range n.DynamicPorts {
2653			if p.Label == label {
2654				return AllocatedPortMapping{
2655					Label:  label,
2656					Value:  p.Value,
2657					To:     p.To,
2658					HostIP: n.IP,
2659				}
2660			}
2661		}
2662	}
2663	return AllocatedPortMapping{}
2664}
2665
2666func (ns Networks) NetIndex(n *NetworkResource) int {
2667	for idx, net := range ns {
2668		if net.Device == n.Device {
2669			return idx
2670		}
2671	}
2672	return -1
2673}
2674
// RequestedDevice is used to request a device for a task.
type RequestedDevice struct {
	// Name is the request name. The possible values are as follows:
	// * <type>: A single value only specifies the type of request.
	// * <vendor>/<type>: A single slash delimiter assumes the vendor and type of device is specified.
	// * <vendor>/<type>/<name>: Two slash delimiters assume vendor, type and specific model are specified.
	//
	// Examples are as follows:
	// * "gpu"
	// * "nvidia/gpu"
	// * "nvidia/gpu/GTX2080Ti"
	//
	// See ID() for how the string is parsed into a DeviceIdTuple.
	Name string

	// Count is the number of requested devices
	Count uint64

	// Constraints are a set of constraints to apply when selecting the device
	// to use.
	Constraints Constraints

	// Affinities are a set of affinities to apply when selecting the device
	// to use.
	Affinities Affinities
}
2699
2700func (r *RequestedDevice) Equals(o *RequestedDevice) bool {
2701	if r == o {
2702		return true
2703	}
2704	if r == nil || o == nil {
2705		return false
2706	}
2707	return r.Name == o.Name &&
2708		r.Count == o.Count &&
2709		r.Constraints.Equals(&o.Constraints) &&
2710		r.Affinities.Equals(&o.Affinities)
2711}
2712
2713func (r *RequestedDevice) Copy() *RequestedDevice {
2714	if r == nil {
2715		return nil
2716	}
2717
2718	nr := *r
2719	nr.Constraints = CopySliceConstraints(nr.Constraints)
2720	nr.Affinities = CopySliceAffinities(nr.Affinities)
2721
2722	return &nr
2723}
2724
2725func (r *RequestedDevice) ID() *DeviceIdTuple {
2726	if r == nil || r.Name == "" {
2727		return nil
2728	}
2729
2730	parts := strings.SplitN(r.Name, "/", 3)
2731	switch len(parts) {
2732	case 1:
2733		return &DeviceIdTuple{
2734			Type: parts[0],
2735		}
2736	case 2:
2737		return &DeviceIdTuple{
2738			Vendor: parts[0],
2739			Type:   parts[1],
2740		}
2741	default:
2742		return &DeviceIdTuple{
2743			Vendor: parts[0],
2744			Type:   parts[1],
2745			Name:   parts[2],
2746		}
2747	}
2748}
2749
2750func (r *RequestedDevice) Validate() error {
2751	if r == nil {
2752		return nil
2753	}
2754
2755	var mErr multierror.Error
2756	if r.Name == "" {
2757		_ = multierror.Append(&mErr, errors.New("device name must be given as one of the following: type, vendor/type, or vendor/type/name"))
2758	}
2759
2760	for idx, constr := range r.Constraints {
2761		// Ensure that the constraint doesn't use an operand we do not allow
2762		switch constr.Operand {
2763		case ConstraintDistinctHosts, ConstraintDistinctProperty:
2764			outer := fmt.Errorf("Constraint %d validation failed: using unsupported operand %q", idx+1, constr.Operand)
2765			_ = multierror.Append(&mErr, outer)
2766		default:
2767			if err := constr.Validate(); err != nil {
2768				outer := fmt.Errorf("Constraint %d validation failed: %s", idx+1, err)
2769				_ = multierror.Append(&mErr, outer)
2770			}
2771		}
2772	}
2773	for idx, affinity := range r.Affinities {
2774		if err := affinity.Validate(); err != nil {
2775			outer := fmt.Errorf("Affinity %d validation failed: %s", idx+1, err)
2776			_ = multierror.Append(&mErr, outer)
2777		}
2778	}
2779
2780	return mErr.ErrorOrNil()
2781}
2782
// NodeResources is used to define the resources available on a client node.
type NodeResources struct {
	Cpu          NodeCpuResources
	Memory       NodeMemoryResources
	Disk         NodeDiskResources
	// Networks are the legacy task-level network resources.
	Networks     Networks
	// NodeNetworks are the fingerprinted host networks, keyed in practice
	// by Device (see Merge and NodeNetworksEquals).
	NodeNetworks []*NodeNetworkResource
	Devices      []*NodeDeviceResource
}
2792
2793func (n *NodeResources) Copy() *NodeResources {
2794	if n == nil {
2795		return nil
2796	}
2797
2798	newN := new(NodeResources)
2799	*newN = *n
2800
2801	// Copy the networks
2802	newN.Networks = n.Networks.Copy()
2803
2804	// Copy the devices
2805	if n.Devices != nil {
2806		devices := len(n.Devices)
2807		newN.Devices = make([]*NodeDeviceResource, devices)
2808		for i := 0; i < devices; i++ {
2809			newN.Devices[i] = n.Devices[i].Copy()
2810		}
2811	}
2812
2813	return newN
2814}
2815
2816// Comparable returns a comparable version of the nodes resources. This
2817// conversion can be lossy so care must be taken when using it.
2818func (n *NodeResources) Comparable() *ComparableResources {
2819	if n == nil {
2820		return nil
2821	}
2822
2823	c := &ComparableResources{
2824		Flattened: AllocatedTaskResources{
2825			Cpu: AllocatedCpuResources{
2826				CpuShares:     n.Cpu.CpuShares,
2827				ReservedCores: n.Cpu.ReservableCpuCores,
2828			},
2829			Memory: AllocatedMemoryResources{
2830				MemoryMB: n.Memory.MemoryMB,
2831			},
2832			Networks: n.Networks,
2833		},
2834		Shared: AllocatedSharedResources{
2835			DiskMB: n.Disk.DiskMB,
2836		},
2837	}
2838	return c
2839}
2840
2841func (n *NodeResources) Merge(o *NodeResources) {
2842	if o == nil {
2843		return
2844	}
2845
2846	n.Cpu.Merge(&o.Cpu)
2847	n.Memory.Merge(&o.Memory)
2848	n.Disk.Merge(&o.Disk)
2849
2850	if len(o.Networks) != 0 {
2851		n.Networks = append(n.Networks, o.Networks...)
2852	}
2853
2854	if len(o.Devices) != 0 {
2855		n.Devices = o.Devices
2856	}
2857
2858	if len(o.NodeNetworks) != 0 {
2859		lookupNetwork := func(nets []*NodeNetworkResource, name string) (int, *NodeNetworkResource) {
2860			for i, nw := range nets {
2861				if nw.Device == name {
2862					return i, nw
2863				}
2864			}
2865			return 0, nil
2866		}
2867
2868		for _, nw := range o.NodeNetworks {
2869			if i, nnw := lookupNetwork(n.NodeNetworks, nw.Device); nnw != nil {
2870				n.NodeNetworks[i] = nw
2871			} else {
2872				n.NodeNetworks = append(n.NodeNetworks, nw)
2873			}
2874		}
2875	}
2876}
2877
2878func (n *NodeResources) Equals(o *NodeResources) bool {
2879	if o == nil && n == nil {
2880		return true
2881	} else if o == nil {
2882		return false
2883	} else if n == nil {
2884		return false
2885	}
2886
2887	if !n.Cpu.Equals(&o.Cpu) {
2888		return false
2889	}
2890	if !n.Memory.Equals(&o.Memory) {
2891		return false
2892	}
2893	if !n.Disk.Equals(&o.Disk) {
2894		return false
2895	}
2896	if !n.Networks.Equals(&o.Networks) {
2897		return false
2898	}
2899
2900	// Check the devices
2901	if !DevicesEquals(n.Devices, o.Devices) {
2902		return false
2903	}
2904
2905	if !NodeNetworksEquals(n.NodeNetworks, o.NodeNetworks) {
2906		return false
2907	}
2908
2909	return true
2910}
2911
2912// Equals equates Networks as a set
2913func (ns *Networks) Equals(o *Networks) bool {
2914	if ns == o {
2915		return true
2916	}
2917	if ns == nil || o == nil {
2918		return false
2919	}
2920	if len(*ns) != len(*o) {
2921		return false
2922	}
2923SETEQUALS:
2924	for _, ne := range *ns {
2925		for _, oe := range *o {
2926			if ne.Equals(oe) {
2927				continue SETEQUALS
2928			}
2929		}
2930		return false
2931	}
2932	return true
2933}
2934
2935// DevicesEquals returns true if the two device arrays are set equal
2936func DevicesEquals(d1, d2 []*NodeDeviceResource) bool {
2937	if len(d1) != len(d2) {
2938		return false
2939	}
2940	idMap := make(map[DeviceIdTuple]*NodeDeviceResource, len(d1))
2941	for _, d := range d1 {
2942		idMap[*d.ID()] = d
2943	}
2944	for _, otherD := range d2 {
2945		if d, ok := idMap[*otherD.ID()]; !ok || !d.Equals(otherD) {
2946			return false
2947		}
2948	}
2949
2950	return true
2951}
2952
2953func NodeNetworksEquals(n1, n2 []*NodeNetworkResource) bool {
2954	if len(n1) != len(n2) {
2955		return false
2956	}
2957
2958	netMap := make(map[string]*NodeNetworkResource, len(n1))
2959	for _, n := range n1 {
2960		netMap[n.Device] = n
2961	}
2962	for _, otherN := range n2 {
2963		if n, ok := netMap[otherN.Device]; !ok || !n.Equals(otherN) {
2964			return false
2965		}
2966	}
2967
2968	return true
2969
2970}
2971
// NodeCpuResources captures the CPU resources of the node.
type NodeCpuResources struct {
	// CpuShares is the CPU shares available. This is calculated by number of
	// cores multiplied by the core frequency.
	CpuShares int64

	// TotalCpuCores is the total number of cores on the machine. This includes cores not in
	// the agent's cpuset if on a linux platform
	TotalCpuCores uint16

	// ReservableCpuCores is the set of cpus which are available to be reserved on the Node.
	// This value is currently only reported on Linux platforms which support cgroups and is
	// discovered by inspecting the cpuset of the agent's cgroup.
	ReservableCpuCores []uint16
}
2987
2988func (n *NodeCpuResources) Merge(o *NodeCpuResources) {
2989	if o == nil {
2990		return
2991	}
2992
2993	if o.CpuShares != 0 {
2994		n.CpuShares = o.CpuShares
2995	}
2996
2997	if o.TotalCpuCores != 0 {
2998		n.TotalCpuCores = o.TotalCpuCores
2999	}
3000
3001	if len(o.ReservableCpuCores) != 0 {
3002		n.ReservableCpuCores = o.ReservableCpuCores
3003	}
3004}
3005
3006func (n *NodeCpuResources) Equals(o *NodeCpuResources) bool {
3007	if o == nil && n == nil {
3008		return true
3009	} else if o == nil {
3010		return false
3011	} else if n == nil {
3012		return false
3013	}
3014
3015	if n.CpuShares != o.CpuShares {
3016		return false
3017	}
3018
3019	if n.TotalCpuCores != o.TotalCpuCores {
3020		return false
3021	}
3022
3023	if len(n.ReservableCpuCores) != len(o.ReservableCpuCores) {
3024		return false
3025	}
3026	for i := range n.ReservableCpuCores {
3027		if n.ReservableCpuCores[i] != o.ReservableCpuCores[i] {
3028			return false
3029		}
3030	}
3031	return true
3032}
3033
// SharesPerCore returns the CPU shares available per core, i.e. CpuShares
// divided evenly across TotalCpuCores.
// NOTE(review): divides by TotalCpuCores with no zero guard — this panics
// if fingerprinting has not set a core count; confirm callers guarantee a
// non-zero value.
func (n *NodeCpuResources) SharesPerCore() int64 {
	return n.CpuShares / int64(n.TotalCpuCores)
}
3037
// NodeMemoryResources captures the memory resources of the node
type NodeMemoryResources struct {
	// MemoryMB is the total available memory on the node
	MemoryMB int64
}
3043
3044func (n *NodeMemoryResources) Merge(o *NodeMemoryResources) {
3045	if o == nil {
3046		return
3047	}
3048
3049	if o.MemoryMB != 0 {
3050		n.MemoryMB = o.MemoryMB
3051	}
3052}
3053
3054func (n *NodeMemoryResources) Equals(o *NodeMemoryResources) bool {
3055	if o == nil && n == nil {
3056		return true
3057	} else if o == nil {
3058		return false
3059	} else if n == nil {
3060		return false
3061	}
3062
3063	if n.MemoryMB != o.MemoryMB {
3064		return false
3065	}
3066
3067	return true
3068}
3069
// NodeDiskResources captures the disk resources of the node
type NodeDiskResources struct {
	// DiskMB is the total available disk space on the node
	DiskMB int64
}
3075
3076func (n *NodeDiskResources) Merge(o *NodeDiskResources) {
3077	if o == nil {
3078		return
3079	}
3080	if o.DiskMB != 0 {
3081		n.DiskMB = o.DiskMB
3082	}
3083}
3084
3085func (n *NodeDiskResources) Equals(o *NodeDiskResources) bool {
3086	if o == nil && n == nil {
3087		return true
3088	} else if o == nil {
3089		return false
3090	} else if n == nil {
3091		return false
3092	}
3093
3094	if n.DiskMB != o.DiskMB {
3095		return false
3096	}
3097
3098	return true
3099}
3100
// DeviceIdTuple is the tuple that identifies a device
type DeviceIdTuple struct {
	Vendor string // e.g. "nvidia"; may be empty when only a type was requested
	Type   string // e.g. "gpu"
	Name   string // specific model; may be empty
}
3107
3108func (d *DeviceIdTuple) String() string {
3109	if d == nil {
3110		return ""
3111	}
3112
3113	return fmt.Sprintf("%s/%s/%s", d.Vendor, d.Type, d.Name)
3114}
3115
3116// Matches returns if this Device ID is a superset of the passed ID.
3117func (id *DeviceIdTuple) Matches(other *DeviceIdTuple) bool {
3118	if other == nil {
3119		return false
3120	}
3121
3122	if other.Name != "" && other.Name != id.Name {
3123		return false
3124	}
3125
3126	if other.Vendor != "" && other.Vendor != id.Vendor {
3127		return false
3128	}
3129
3130	if other.Type != "" && other.Type != id.Type {
3131		return false
3132	}
3133
3134	return true
3135}
3136
3137// Equals returns if this Device ID is the same as the passed ID.
3138func (id *DeviceIdTuple) Equals(o *DeviceIdTuple) bool {
3139	if id == nil && o == nil {
3140		return true
3141	} else if id == nil || o == nil {
3142		return false
3143	}
3144
3145	return o.Vendor == id.Vendor && o.Type == id.Type && o.Name == id.Name
3146}
3147
// NodeDeviceResource captures a set of devices sharing a common
// vendor/type/device_name tuple.
type NodeDeviceResource struct {
	Vendor     string
	Type       string
	Name       string
	// Instances are the individual devices in the group, keyed by ID in
	// equality checks.
	Instances  []*NodeDevice
	Attributes map[string]*psstructs.Attribute
}
3157
3158func (n *NodeDeviceResource) ID() *DeviceIdTuple {
3159	if n == nil {
3160		return nil
3161	}
3162
3163	return &DeviceIdTuple{
3164		Vendor: n.Vendor,
3165		Type:   n.Type,
3166		Name:   n.Name,
3167	}
3168}
3169
3170func (n *NodeDeviceResource) Copy() *NodeDeviceResource {
3171	if n == nil {
3172		return nil
3173	}
3174
3175	// Copy the primitives
3176	nn := *n
3177
3178	// Copy the device instances
3179	if l := len(nn.Instances); l != 0 {
3180		nn.Instances = make([]*NodeDevice, 0, l)
3181		for _, d := range n.Instances {
3182			nn.Instances = append(nn.Instances, d.Copy())
3183		}
3184	}
3185
3186	// Copy the Attributes
3187	nn.Attributes = psstructs.CopyMapStringAttribute(nn.Attributes)
3188
3189	return &nn
3190}
3191
// Equals reports whether two device groups are identical: same
// vendor/type/name identity, same attributes, and the same set of
// instances keyed by device ID.
func (n *NodeDeviceResource) Equals(o *NodeDeviceResource) bool {
	if o == nil && n == nil {
		return true
	} else if o == nil {
		return false
	} else if n == nil {
		return false
	}

	// Identity fields must match exactly.
	if n.Vendor != o.Vendor {
		return false
	} else if n.Type != o.Type {
		return false
	} else if n.Name != o.Name {
		return false
	}

	// Check the attributes
	if len(n.Attributes) != len(o.Attributes) {
		return false
	}
	// NOTE(review): v != otherV compares *psstructs.Attribute pointers, not
	// values — two semantically equal attributes allocated separately will
	// report unequal. Confirm whether value comparison was intended here.
	for k, v := range n.Attributes {
		if otherV, ok := o.Attributes[k]; !ok || v != otherV {
			return false
		}
	}

	// Check the instances as a set keyed by device ID.
	if len(n.Instances) != len(o.Instances) {
		return false
	}
	idMap := make(map[string]*NodeDevice, len(n.Instances))
	for _, d := range n.Instances {
		idMap[d.ID] = d
	}
	for _, otherD := range o.Instances {
		if d, ok := idMap[otherD.ID]; !ok || !d.Equals(otherD) {
			return false
		}
	}

	return true
}
3235
// NodeDevice is an instance of a particular device.
type NodeDevice struct {
	// ID is the ID of the device. It is used as the key when comparing
	// instance sets in NodeDeviceResource.Equals.
	ID string

	// Healthy captures whether the device is healthy.
	Healthy bool

	// HealthDescription is used to provide a human readable description of why
	// the device may be unhealthy.
	HealthDescription string

	// Locality stores HW locality information for the node to optionally be
	// used when making placement decisions.
	Locality *NodeDeviceLocality
}
3252
3253func (n *NodeDevice) Equals(o *NodeDevice) bool {
3254	if o == nil && n == nil {
3255		return true
3256	} else if o == nil {
3257		return false
3258	} else if n == nil {
3259		return false
3260	}
3261
3262	if n.ID != o.ID {
3263		return false
3264	} else if n.Healthy != o.Healthy {
3265		return false
3266	} else if n.HealthDescription != o.HealthDescription {
3267		return false
3268	} else if !n.Locality.Equals(o.Locality) {
3269		return false
3270	}
3271
3272	return false
3273}
3274
3275func (n *NodeDevice) Copy() *NodeDevice {
3276	if n == nil {
3277		return nil
3278	}
3279
3280	// Copy the primitives
3281	nn := *n
3282
3283	// Copy the locality
3284	nn.Locality = nn.Locality.Copy()
3285
3286	return &nn
3287}
3288
// NodeDeviceLocality stores information about the devices hardware locality on
// the node.
type NodeDeviceLocality struct {
	// PciBusID is the PCI Bus ID for the device.
	PciBusID string
}
3295
3296func (n *NodeDeviceLocality) Equals(o *NodeDeviceLocality) bool {
3297	if o == nil && n == nil {
3298		return true
3299	} else if o == nil {
3300		return false
3301	} else if n == nil {
3302		return false
3303	}
3304
3305	if n.PciBusID != o.PciBusID {
3306		return false
3307	}
3308
3309	return true
3310}
3311
3312func (n *NodeDeviceLocality) Copy() *NodeDeviceLocality {
3313	if n == nil {
3314		return nil
3315	}
3316
3317	// Copy the primitives
3318	nn := *n
3319	return &nn
3320}
3321
// NodeReservedResources is used to capture the resources on a client node that
// should be reserved and not made available to jobs.
type NodeReservedResources struct {
	Cpu      NodeReservedCpuResources     // reserved CPU shares and cores
	Memory   NodeReservedMemoryResources  // reserved memory
	Disk     NodeReservedDiskResources    // reserved disk
	Networks NodeReservedNetworkResources // reserved host ports
}
3330
3331func (n *NodeReservedResources) Copy() *NodeReservedResources {
3332	if n == nil {
3333		return nil
3334	}
3335	newN := new(NodeReservedResources)
3336	*newN = *n
3337	return newN
3338}
3339
3340// Comparable returns a comparable version of the node's reserved resources. The
3341// returned resources doesn't contain any network information. This conversion
3342// can be lossy so care must be taken when using it.
3343func (n *NodeReservedResources) Comparable() *ComparableResources {
3344	if n == nil {
3345		return nil
3346	}
3347
3348	c := &ComparableResources{
3349		Flattened: AllocatedTaskResources{
3350			Cpu: AllocatedCpuResources{
3351				CpuShares:     n.Cpu.CpuShares,
3352				ReservedCores: n.Cpu.ReservedCpuCores,
3353			},
3354			Memory: AllocatedMemoryResources{
3355				MemoryMB: n.Memory.MemoryMB,
3356			},
3357		},
3358		Shared: AllocatedSharedResources{
3359			DiskMB: n.Disk.DiskMB,
3360		},
3361	}
3362	return c
3363}
3364
// NodeReservedCpuResources captures the reserved CPU resources of the node.
type NodeReservedCpuResources struct {
	CpuShares        int64    // reserved CPU shares
	ReservedCpuCores []uint16 // specific CPU core IDs reserved on the node
}
3370
// NodeReservedMemoryResources captures the reserved memory resources of the
// node, in megabytes.
type NodeReservedMemoryResources struct {
	MemoryMB int64
}
3375
// NodeReservedDiskResources captures the reserved disk resources of the node,
// in megabytes.
type NodeReservedDiskResources struct {
	DiskMB int64
}
3380
// NodeReservedNetworkResources captures the reserved network resources of the node.
type NodeReservedNetworkResources struct {
	// ReservedHostPorts is the set of ports reserved on all host network
	// interfaces. Its format is a comma separate list of integers or integer
	// ranges. (80,443,1000-2000,2005)
	ReservedHostPorts string
}
3388
// ParseReservedHostPorts parses the ReservedHostPorts string and returns the
// individual reserved host port numbers.
func (n *NodeReservedNetworkResources) ParseReservedHostPorts() ([]uint64, error) {
	return ParsePortRanges(n.ReservedHostPorts)
}
3393
// AllocatedResources is the set of resources to be used by an allocation.
type AllocatedResources struct {
	// Tasks is a mapping of task name to the resources for the task.
	Tasks          map[string]*AllocatedTaskResources
	// TaskLifecycles is a mapping of task name to its lifecycle config; it is
	// consulted by Comparable when flattening per-task resources.
	TaskLifecycles map[string]*TaskLifecycleConfig

	// Shared is the set of resource that are shared by all tasks in the group.
	Shared AllocatedSharedResources
}
3403
3404func (a *AllocatedResources) Copy() *AllocatedResources {
3405	if a == nil {
3406		return nil
3407	}
3408
3409	out := AllocatedResources{
3410		Shared: a.Shared.Copy(),
3411	}
3412
3413	if a.Tasks != nil {
3414		out.Tasks = make(map[string]*AllocatedTaskResources, len(out.Tasks))
3415		for task, resource := range a.Tasks {
3416			out.Tasks[task] = resource.Copy()
3417		}
3418	}
3419	if a.TaskLifecycles != nil {
3420		out.TaskLifecycles = make(map[string]*TaskLifecycleConfig, len(out.TaskLifecycles))
3421		for task, lifecycle := range a.TaskLifecycles {
3422			out.TaskLifecycles[task] = lifecycle.Copy()
3423		}
3424
3425	}
3426
3427	return &out
3428}
3429
// Comparable returns a comparable version of the allocations allocated
// resources. This conversion can be lossy so care must be taken when using it.
//
// Tasks are bucketed by lifecycle hook: tasks with no lifecycle config count
// as "main", prestart tasks split into sidecar vs ephemeral, and poststop
// tasks form their own bucket. The buckets are then combined so the result
// reflects the peak resources the group needs at any one time.
func (a *AllocatedResources) Comparable() *ComparableResources {
	if a == nil {
		return nil
	}

	c := &ComparableResources{
		Shared: a.Shared,
	}

	prestartSidecarTasks := &AllocatedTaskResources{}
	prestartEphemeralTasks := &AllocatedTaskResources{}
	main := &AllocatedTaskResources{}
	poststopTasks := &AllocatedTaskResources{}

	// Sum each task's resources into its lifecycle bucket.
	for taskName, r := range a.Tasks {
		lc := a.TaskLifecycles[taskName]
		if lc == nil {
			main.Add(r)
		} else if lc.Hook == TaskLifecycleHookPrestart {
			if lc.Sidecar {
				prestartSidecarTasks.Add(r)
			} else {
				prestartEphemeralTasks.Add(r)
			}
		} else if lc.Hook == TaskLifecycleHookPoststop {
			poststopTasks.Add(r)
		}
	}

	// Ephemeral prestart and poststop tasks do not run concurrently with the
	// main tasks, so take the per-resource max across those phases, then add
	// the always-running sidecars on top.
	// update this loop to account for lifecycle hook
	prestartEphemeralTasks.Max(main)
	prestartEphemeralTasks.Max(poststopTasks)
	prestartSidecarTasks.Add(prestartEphemeralTasks)
	c.Flattened.Add(prestartSidecarTasks)

	// Add network resources that are at the task group level
	for _, network := range a.Shared.Networks {
		c.Flattened.Add(&AllocatedTaskResources{
			Networks: []*NetworkResource{network},
		})
	}

	return c
}
3476
3477// OldTaskResources returns the pre-0.9.0 map of task resources
3478func (a *AllocatedResources) OldTaskResources() map[string]*Resources {
3479	m := make(map[string]*Resources, len(a.Tasks))
3480	for name, res := range a.Tasks {
3481		m[name] = &Resources{
3482			CPU:         int(res.Cpu.CpuShares),
3483			MemoryMB:    int(res.Memory.MemoryMB),
3484			MemoryMaxMB: int(res.Memory.MemoryMaxMB),
3485			Networks:    res.Networks,
3486		}
3487	}
3488
3489	return m
3490}
3491
3492func (a *AllocatedResources) Canonicalize() {
3493	a.Shared.Canonicalize()
3494
3495	for _, r := range a.Tasks {
3496		for _, nw := range r.Networks {
3497			for _, port := range append(nw.DynamicPorts, nw.ReservedPorts...) {
3498				a.Shared.Ports = append(a.Shared.Ports, AllocatedPortMapping{
3499					Label:  port.Label,
3500					Value:  port.Value,
3501					To:     port.To,
3502					HostIP: nw.IP,
3503				})
3504			}
3505		}
3506	}
3507}
3508
// AllocatedTaskResources are the set of resources allocated to a task.
type AllocatedTaskResources struct {
	Cpu      AllocatedCpuResources      // CPU shares and reserved cores
	Memory   AllocatedMemoryResources   // memory and memory_max
	Networks Networks                   // task-level network resources
	Devices  []*AllocatedDeviceResource // allocated device instances
}
3516
3517func (a *AllocatedTaskResources) Copy() *AllocatedTaskResources {
3518	if a == nil {
3519		return nil
3520	}
3521	newA := new(AllocatedTaskResources)
3522	*newA = *a
3523
3524	// Copy the networks
3525	newA.Networks = a.Networks.Copy()
3526
3527	// Copy the devices
3528	if newA.Devices != nil {
3529		n := len(a.Devices)
3530		newA.Devices = make([]*AllocatedDeviceResource, n)
3531		for i := 0; i < n; i++ {
3532			newA.Devices[i] = a.Devices[i].Copy()
3533		}
3534	}
3535
3536	return newA
3537}
3538
// NetIndex finds the matching net index using device name by delegating to
// Networks.NetIndex.
func (a *AllocatedTaskResources) NetIndex(n *NetworkResource) int {
	return a.Networks.NetIndex(n)
}
3543
3544func (a *AllocatedTaskResources) Add(delta *AllocatedTaskResources) {
3545	if delta == nil {
3546		return
3547	}
3548
3549	a.Cpu.Add(&delta.Cpu)
3550	a.Memory.Add(&delta.Memory)
3551
3552	for _, n := range delta.Networks {
3553		// Find the matching interface by IP or CIDR
3554		idx := a.NetIndex(n)
3555		if idx == -1 {
3556			a.Networks = append(a.Networks, n.Copy())
3557		} else {
3558			a.Networks[idx].Add(n)
3559		}
3560	}
3561
3562	for _, d := range delta.Devices {
3563		// Find the matching device
3564		idx := AllocatedDevices(a.Devices).Index(d)
3565		if idx == -1 {
3566			a.Devices = append(a.Devices, d.Copy())
3567		} else {
3568			a.Devices[idx].Add(d)
3569		}
3570	}
3571}
3572
3573func (a *AllocatedTaskResources) Max(other *AllocatedTaskResources) {
3574	if other == nil {
3575		return
3576	}
3577
3578	a.Cpu.Max(&other.Cpu)
3579	a.Memory.Max(&other.Memory)
3580
3581	for _, n := range other.Networks {
3582		// Find the matching interface by IP or CIDR
3583		idx := a.NetIndex(n)
3584		if idx == -1 {
3585			a.Networks = append(a.Networks, n.Copy())
3586		} else {
3587			a.Networks[idx].Add(n)
3588		}
3589	}
3590
3591	for _, d := range other.Devices {
3592		// Find the matching device
3593		idx := AllocatedDevices(a.Devices).Index(d)
3594		if idx == -1 {
3595			a.Devices = append(a.Devices, d.Copy())
3596		} else {
3597			a.Devices[idx].Add(d)
3598		}
3599	}
3600}
3601
3602// Comparable turns AllocatedTaskResources into ComparableResources
3603// as a helper step in preemption
3604func (a *AllocatedTaskResources) Comparable() *ComparableResources {
3605	ret := &ComparableResources{
3606		Flattened: AllocatedTaskResources{
3607			Cpu: AllocatedCpuResources{
3608				CpuShares:     a.Cpu.CpuShares,
3609				ReservedCores: a.Cpu.ReservedCores,
3610			},
3611			Memory: AllocatedMemoryResources{
3612				MemoryMB:    a.Memory.MemoryMB,
3613				MemoryMaxMB: a.Memory.MemoryMaxMB,
3614			},
3615		},
3616	}
3617	ret.Flattened.Networks = append(ret.Flattened.Networks, a.Networks...)
3618	return ret
3619}
3620
3621// Subtract only subtracts CPU and Memory resources. Network utilization
3622// is managed separately in NetworkIndex
3623func (a *AllocatedTaskResources) Subtract(delta *AllocatedTaskResources) {
3624	if delta == nil {
3625		return
3626	}
3627
3628	a.Cpu.Subtract(&delta.Cpu)
3629	a.Memory.Subtract(&delta.Memory)
3630}
3631
// AllocatedSharedResources are the set of resources allocated to a task group.
type AllocatedSharedResources struct {
	Networks Networks       // group-level networks
	DiskMB   int64          // ephemeral disk, in megabytes
	Ports    AllocatedPorts // group-level port mappings
}
3638
3639func (a AllocatedSharedResources) Copy() AllocatedSharedResources {
3640	return AllocatedSharedResources{
3641		Networks: a.Networks.Copy(),
3642		DiskMB:   a.DiskMB,
3643		Ports:    a.Ports,
3644	}
3645}
3646
3647func (a *AllocatedSharedResources) Add(delta *AllocatedSharedResources) {
3648	if delta == nil {
3649		return
3650	}
3651	a.Networks = append(a.Networks, delta.Networks...)
3652	a.DiskMB += delta.DiskMB
3653
3654}
3655
3656func (a *AllocatedSharedResources) Subtract(delta *AllocatedSharedResources) {
3657	if delta == nil {
3658		return
3659	}
3660
3661	diff := map[*NetworkResource]bool{}
3662	for _, n := range delta.Networks {
3663		diff[n] = true
3664	}
3665	var nets Networks
3666	for _, n := range a.Networks {
3667		if _, ok := diff[n]; !ok {
3668			nets = append(nets, n)
3669		}
3670	}
3671	a.Networks = nets
3672	a.DiskMB -= delta.DiskMB
3673}
3674
3675func (a *AllocatedSharedResources) Canonicalize() {
3676	if len(a.Networks) > 0 {
3677		if len(a.Networks[0].DynamicPorts)+len(a.Networks[0].ReservedPorts) > 0 && len(a.Ports) == 0 {
3678			for _, ports := range [][]Port{a.Networks[0].DynamicPorts, a.Networks[0].ReservedPorts} {
3679				for _, p := range ports {
3680					a.Ports = append(a.Ports, AllocatedPortMapping{
3681						Label:  p.Label,
3682						Value:  p.Value,
3683						To:     p.To,
3684						HostIP: a.Networks[0].IP,
3685					})
3686				}
3687			}
3688		}
3689	}
3690}
3691
// AllocatedCpuResources captures the allocated CPU resources.
type AllocatedCpuResources struct {
	CpuShares     int64    // allocated CPU shares
	ReservedCores []uint16 // specific CPU core IDs reserved for the allocation
}
3697
3698func (a *AllocatedCpuResources) Add(delta *AllocatedCpuResources) {
3699	if delta == nil {
3700		return
3701	}
3702
3703	a.CpuShares += delta.CpuShares
3704
3705	a.ReservedCores = cpuset.New(a.ReservedCores...).Union(cpuset.New(delta.ReservedCores...)).ToSlice()
3706}
3707
3708func (a *AllocatedCpuResources) Subtract(delta *AllocatedCpuResources) {
3709	if delta == nil {
3710		return
3711	}
3712
3713	a.CpuShares -= delta.CpuShares
3714	a.ReservedCores = cpuset.New(a.ReservedCores...).Difference(cpuset.New(delta.ReservedCores...)).ToSlice()
3715}
3716
3717func (a *AllocatedCpuResources) Max(other *AllocatedCpuResources) {
3718	if other == nil {
3719		return
3720	}
3721
3722	if other.CpuShares > a.CpuShares {
3723		a.CpuShares = other.CpuShares
3724	}
3725
3726	if len(other.ReservedCores) > len(a.ReservedCores) {
3727		a.ReservedCores = other.ReservedCores
3728	}
3729}
3730
// AllocatedMemoryResources captures the allocated memory resources.
type AllocatedMemoryResources struct {
	MemoryMB    int64 // guaranteed memory, in megabytes
	MemoryMaxMB int64 // maximum memory, in megabytes; 0 is treated as MemoryMB in Add/Subtract
}
3736
3737func (a *AllocatedMemoryResources) Add(delta *AllocatedMemoryResources) {
3738	if delta == nil {
3739		return
3740	}
3741
3742	a.MemoryMB += delta.MemoryMB
3743	if delta.MemoryMaxMB != 0 {
3744		a.MemoryMaxMB += delta.MemoryMaxMB
3745	} else {
3746		a.MemoryMaxMB += delta.MemoryMB
3747	}
3748}
3749
3750func (a *AllocatedMemoryResources) Subtract(delta *AllocatedMemoryResources) {
3751	if delta == nil {
3752		return
3753	}
3754
3755	a.MemoryMB -= delta.MemoryMB
3756	if delta.MemoryMaxMB != 0 {
3757		a.MemoryMaxMB -= delta.MemoryMaxMB
3758	} else {
3759		a.MemoryMaxMB -= delta.MemoryMB
3760	}
3761}
3762
3763func (a *AllocatedMemoryResources) Max(other *AllocatedMemoryResources) {
3764	if other == nil {
3765		return
3766	}
3767
3768	if other.MemoryMB > a.MemoryMB {
3769		a.MemoryMB = other.MemoryMB
3770	}
3771	if other.MemoryMaxMB > a.MemoryMaxMB {
3772		a.MemoryMaxMB = other.MemoryMaxMB
3773	}
3774}
3775
// AllocatedDevices is a list of allocated device resources, matched by their
// vendor/type/name ID tuple via Index.
type AllocatedDevices []*AllocatedDeviceResource
3777
3778// Index finds the matching index using the passed device. If not found, -1 is
3779// returned.
3780func (a AllocatedDevices) Index(d *AllocatedDeviceResource) int {
3781	if d == nil {
3782		return -1
3783	}
3784
3785	for i, o := range a {
3786		if o.ID().Equals(d.ID()) {
3787			return i
3788		}
3789	}
3790
3791	return -1
3792}
3793
// AllocatedDeviceResource captures a set of allocated devices.
type AllocatedDeviceResource struct {
	// Vendor, Type, and Name are used to select the plugin to request the
	// device IDs from. Together they form the device ID tuple (see ID).
	Vendor string
	Type   string
	Name   string

	// DeviceIDs is the set of allocated devices
	DeviceIDs []string
}
3805
3806func (a *AllocatedDeviceResource) ID() *DeviceIdTuple {
3807	if a == nil {
3808		return nil
3809	}
3810
3811	return &DeviceIdTuple{
3812		Vendor: a.Vendor,
3813		Type:   a.Type,
3814		Name:   a.Name,
3815	}
3816}
3817
3818func (a *AllocatedDeviceResource) Add(delta *AllocatedDeviceResource) {
3819	if delta == nil {
3820		return
3821	}
3822
3823	a.DeviceIDs = append(a.DeviceIDs, delta.DeviceIDs...)
3824}
3825
3826func (a *AllocatedDeviceResource) Copy() *AllocatedDeviceResource {
3827	if a == nil {
3828		return a
3829	}
3830
3831	na := *a
3832
3833	// Copy the devices
3834	na.DeviceIDs = make([]string, len(a.DeviceIDs))
3835	for i, id := range a.DeviceIDs {
3836		na.DeviceIDs[i] = id
3837	}
3838
3839	return &na
3840}
3841
// ComparableResources is the set of resources allocated to a task group but
// not keyed by Task, making it easier to compare.
type ComparableResources struct {
	Flattened AllocatedTaskResources   // task-level resources summed across tasks
	Shared    AllocatedSharedResources // group-level shared resources
}
3848
3849func (c *ComparableResources) Add(delta *ComparableResources) {
3850	if delta == nil {
3851		return
3852	}
3853
3854	c.Flattened.Add(&delta.Flattened)
3855	c.Shared.Add(&delta.Shared)
3856}
3857
3858func (c *ComparableResources) Subtract(delta *ComparableResources) {
3859	if delta == nil {
3860		return
3861	}
3862
3863	c.Flattened.Subtract(&delta.Flattened)
3864	c.Shared.Subtract(&delta.Shared)
3865}
3866
3867func (c *ComparableResources) Copy() *ComparableResources {
3868	if c == nil {
3869		return nil
3870	}
3871	newR := new(ComparableResources)
3872	*newR = *c
3873	return newR
3874}
3875
3876// Superset checks if one set of resources is a superset of another. This
3877// ignores network resources, and the NetworkIndex should be used for that.
3878func (c *ComparableResources) Superset(other *ComparableResources) (bool, string) {
3879	if c.Flattened.Cpu.CpuShares < other.Flattened.Cpu.CpuShares {
3880		return false, "cpu"
3881	}
3882
3883	if len(c.Flattened.Cpu.ReservedCores) > 0 && !cpuset.New(c.Flattened.Cpu.ReservedCores...).IsSupersetOf(cpuset.New(other.Flattened.Cpu.ReservedCores...)) {
3884		return false, "cores"
3885	}
3886	if c.Flattened.Memory.MemoryMB < other.Flattened.Memory.MemoryMB {
3887		return false, "memory"
3888	}
3889	if c.Shared.DiskMB < other.Shared.DiskMB {
3890		return false, "disk"
3891	}
3892	return true, ""
3893}
3894
// NetIndex finds the matching net index using device name by delegating to
// the flattened networks' NetIndex.
func (c *ComparableResources) NetIndex(n *NetworkResource) int {
	return c.Flattened.Networks.NetIndex(n)
}
3899
const (
	// JobTypeCore is reserved for internal system tasks and is
	// always handled by the CoreScheduler.
	JobTypeCore    = "_core"
	JobTypeService = "service"
	JobTypeBatch   = "batch"
	JobTypeSystem  = "system"
)
3908
const (
	JobStatusPending = "pending" // Pending means the job is waiting on scheduling
	JobStatusRunning = "running" // Running means the job has non-terminal allocations
	JobStatusDead    = "dead"    // Dead means all evaluations and allocations are terminal
)
3914
const (
	// JobMinPriority is the minimum allowed priority
	JobMinPriority = 1

	// JobDefaultPriority is the default priority if not
	// specified.
	JobDefaultPriority = 50

	// JobMaxPriority is the maximum allowed priority
	JobMaxPriority = 100

	// Ensure CoreJobPriority is higher than any user
	// specified job so that it gets priority. This is important
	// for the system to remain healthy.
	CoreJobPriority = JobMaxPriority * 2

	// JobTrackedVersions is the number of historic job versions that are
	// kept.
	JobTrackedVersions = 6

	// JobTrackedScalingEvents is the number of scaling events that are
	// kept for a single task group.
	JobTrackedScalingEvents = 20
)
3939
// Job is the scope of a scheduling request to Nomad. It is the largest
// scoped object, and is a named collection of task groups. Each task group
// is further composed of tasks. A task group (TG) is the unit of scheduling
// however.
type Job struct {
	// Stop marks whether the user has stopped the job. A stopped job will
	// have all created allocations stopped and acts as a way to stop a job
	// without purging it from the system. This allows existing allocs to be
	// queried and the job to be inspected as it is being killed.
	Stop bool

	// Region is the Nomad region that handles scheduling this job
	Region string

	// Namespace is the namespace the job is submitted into.
	Namespace string

	// ID is a unique identifier for the job per region. It can be
	// specified hierarchically like LineOfBiz/OrgName/Team/Project
	ID string

	// ParentID is the unique identifier of the job that spawned this job.
	ParentID string

	// Name is the logical name of the job used to refer to it. This is unique
	// per region, but not unique globally.
	Name string

	// Type is used to control various behaviors about the job. Most jobs
	// are service jobs, meaning they are expected to be long lived.
	// Some jobs are batch oriented meaning they run and then terminate.
	// This can be extended in the future to support custom schedulers.
	Type string

	// Priority is used to control scheduling importance and if this job
	// can preempt other jobs. Valid values are in [JobMinPriority,
	// JobMaxPriority]; see Validate.
	Priority int

	// AllAtOnce is used to control if incremental scheduling of task groups
	// is allowed or if we must do a gang scheduling of the entire job. This
	// can slow down larger jobs if resources are not available.
	AllAtOnce bool

	// Datacenters contains all the datacenters this job is allowed to span
	Datacenters []string

	// Constraints can be specified at a job level and apply to
	// all the task groups and tasks.
	Constraints []*Constraint

	// Affinities can be specified at the job level to express
	// scheduling preferences that apply to all groups and tasks
	Affinities []*Affinity

	// Spread can be specified at the job level to express spreading
	// allocations across a desired attribute, such as datacenter
	Spreads []*Spread

	// TaskGroups are the collections of task groups that this job needs
	// to run. Each task group is an atomic unit of scheduling and placement.
	TaskGroups []*TaskGroup

	// See agent.ApiJobToStructJob
	// Update provides defaults for the TaskGroup Update stanzas
	Update UpdateStrategy

	// Multiregion specifies how the job is deployed across multiple regions.
	Multiregion *Multiregion

	// Periodic is used to define the interval the job is run at.
	Periodic *PeriodicConfig

	// ParameterizedJob is used to specify the job as a parameterized job
	// for dispatching.
	ParameterizedJob *ParameterizedJobConfig

	// Dispatched is used to identify if the Job has been dispatched from a
	// parameterized job.
	Dispatched bool

	// Payload is the payload supplied when the job was dispatched.
	Payload []byte

	// Meta is used to associate arbitrary metadata with this
	// job. This is opaque to Nomad.
	Meta map[string]string

	// ConsulToken is the Consul token that proves the submitter of the job has
	// access to the Service Identity policies associated with the job's
	// Consul Connect enabled services. This field is only used to transfer the
	// token and is not stored after Job submission.
	ConsulToken string

	// ConsulNamespace is the Consul namespace
	ConsulNamespace string

	// VaultToken is the Vault token that proves the submitter of the job has
	// access to the specified Vault policies. This field is only used to
	// transfer the token and is not stored after Job submission.
	VaultToken string

	// VaultNamespace is the Vault namespace
	VaultNamespace string

	// NomadTokenID is the Accessor ID of the ACL token (if any)
	// used to register this version of the job. Used by deploymentwatcher.
	NomadTokenID string

	// Status is the current scheduling status of the job (see the JobStatus*
	// constants).
	Status string

	// StatusDescription is meant to provide more human useful information
	StatusDescription string

	// Stable marks a job as stable. Stability is only defined on "service" and
	// "system" jobs. The stability of a job will be set automatically as part
	// of a deployment and can be manually set via APIs. This field is updated
	// when the status of a corresponding deployment transitions to Failed
	// or Successful. This field is not meaningful for jobs that don't have an
	// update stanza.
	Stable bool

	// Version is a monotonically increasing version number that is incremented
	// on each job register.
	Version uint64

	// SubmitTime is the time at which the job was submitted as a UnixNano in
	// UTC
	SubmitTime int64

	// Raft Indexes
	CreateIndex    uint64
	ModifyIndex    uint64
	JobModifyIndex uint64
}
4074
4075// NamespacedID returns the namespaced id useful for logging
4076func (j *Job) NamespacedID() *NamespacedID {
4077	return &NamespacedID{
4078		ID:        j.ID,
4079		Namespace: j.Namespace,
4080	}
4081}
4082
4083// Canonicalize is used to canonicalize fields in the Job. This should be
4084// called when registering a Job.
4085func (j *Job) Canonicalize() {
4086	if j == nil {
4087		return
4088	}
4089
4090	// Ensure that an empty and nil map are treated the same to avoid scheduling
4091	// problems since we use reflect DeepEquals.
4092	if len(j.Meta) == 0 {
4093		j.Meta = nil
4094	}
4095
4096	// Ensure the job is in a namespace.
4097	if j.Namespace == "" {
4098		j.Namespace = DefaultNamespace
4099	}
4100
4101	for _, tg := range j.TaskGroups {
4102		tg.Canonicalize(j)
4103	}
4104
4105	if j.ParameterizedJob != nil {
4106		j.ParameterizedJob.Canonicalize()
4107	}
4108
4109	if j.Multiregion != nil {
4110		j.Multiregion.Canonicalize()
4111	}
4112
4113	if j.Periodic != nil {
4114		j.Periodic.Canonicalize()
4115	}
4116}
4117
4118// Copy returns a deep copy of the Job. It is expected that callers use recover.
4119// This job can panic if the deep copy failed as it uses reflection.
4120func (j *Job) Copy() *Job {
4121	if j == nil {
4122		return nil
4123	}
4124	nj := new(Job)
4125	*nj = *j
4126	nj.Datacenters = helper.CopySliceString(nj.Datacenters)
4127	nj.Constraints = CopySliceConstraints(nj.Constraints)
4128	nj.Affinities = CopySliceAffinities(nj.Affinities)
4129	nj.Multiregion = nj.Multiregion.Copy()
4130
4131	if j.TaskGroups != nil {
4132		tgs := make([]*TaskGroup, len(nj.TaskGroups))
4133		for i, tg := range nj.TaskGroups {
4134			tgs[i] = tg.Copy()
4135		}
4136		nj.TaskGroups = tgs
4137	}
4138
4139	nj.Periodic = nj.Periodic.Copy()
4140	nj.Meta = helper.CopyMapStringString(nj.Meta)
4141	nj.ParameterizedJob = nj.ParameterizedJob.Copy()
4142	return nj
4143}
4144
// Validate is used to check a job for reasonable configuration. All problems
// found are aggregated into a single multierror rather than stopping at the
// first one; nil is returned when the job is valid.
func (j *Job) Validate() error {
	var mErr multierror.Error

	// Identity fields: region, ID, name, and namespace.
	if j.Region == "" && j.Multiregion == nil {
		mErr.Errors = append(mErr.Errors, errors.New("Missing job region"))
	}
	if j.ID == "" {
		mErr.Errors = append(mErr.Errors, errors.New("Missing job ID"))
	} else if strings.Contains(j.ID, " ") {
		mErr.Errors = append(mErr.Errors, errors.New("Job ID contains a space"))
	} else if strings.Contains(j.ID, "\000") {
		mErr.Errors = append(mErr.Errors, errors.New("Job ID contains a null character"))
	}
	if j.Name == "" {
		mErr.Errors = append(mErr.Errors, errors.New("Missing job name"))
	} else if strings.Contains(j.Name, "\000") {
		mErr.Errors = append(mErr.Errors, errors.New("Job Name contains a null character"))
	}
	if j.Namespace == "" {
		mErr.Errors = append(mErr.Errors, errors.New("Job must be in a namespace"))
	}
	// Type must be one of the known scheduler types.
	switch j.Type {
	case JobTypeCore, JobTypeService, JobTypeBatch, JobTypeSystem:
	case "":
		mErr.Errors = append(mErr.Errors, errors.New("Missing job type"))
	default:
		mErr.Errors = append(mErr.Errors, fmt.Errorf("Invalid job type: %q", j.Type))
	}
	if j.Priority < JobMinPriority || j.Priority > JobMaxPriority {
		mErr.Errors = append(mErr.Errors, fmt.Errorf("Job priority must be between [%d, %d]", JobMinPriority, JobMaxPriority))
	}
	// Multiregion jobs may omit datacenters; everything else needs at least one.
	if len(j.Datacenters) == 0 && !j.IsMultiregion() {
		mErr.Errors = append(mErr.Errors, errors.New("Missing job datacenters"))
	} else {
		for _, v := range j.Datacenters {
			if v == "" {
				mErr.Errors = append(mErr.Errors, errors.New("Job datacenter must be non-empty string"))
			}
		}
	}
	if len(j.TaskGroups) == 0 {
		mErr.Errors = append(mErr.Errors, errors.New("Missing job task groups"))
	}
	for idx, constr := range j.Constraints {
		if err := constr.Validate(); err != nil {
			outer := fmt.Errorf("Constraint %d validation failed: %s", idx+1, err)
			mErr.Errors = append(mErr.Errors, outer)
		}
	}
	// Affinities are rejected outright for system jobs, validated otherwise.
	if j.Type == JobTypeSystem {
		if j.Affinities != nil {
			mErr.Errors = append(mErr.Errors, fmt.Errorf("System jobs may not have an affinity stanza"))
		}
	} else {
		for idx, affinity := range j.Affinities {
			if err := affinity.Validate(); err != nil {
				outer := fmt.Errorf("Affinity %d validation failed: %s", idx+1, err)
				mErr.Errors = append(mErr.Errors, outer)
			}
		}
	}

	// Spreads follow the same rule as affinities for system jobs.
	if j.Type == JobTypeSystem {
		if j.Spreads != nil {
			mErr.Errors = append(mErr.Errors, fmt.Errorf("System jobs may not have a spread stanza"))
		}
	} else {
		for idx, spread := range j.Spreads {
			if err := spread.Validate(); err != nil {
				outer := fmt.Errorf("Spread %d validation failed: %s", idx+1, err)
				mErr.Errors = append(mErr.Errors, outer)
			}
		}
	}

	// Check for duplicate task groups
	taskGroups := make(map[string]int)
	for idx, tg := range j.TaskGroups {
		if tg.Name == "" {
			mErr.Errors = append(mErr.Errors, fmt.Errorf("Job task group %d missing name", idx+1))
		} else if existing, ok := taskGroups[tg.Name]; ok {
			mErr.Errors = append(mErr.Errors, fmt.Errorf("Job task group %d redefines '%s' from group %d", idx+1, tg.Name, existing+1))
		} else {
			taskGroups[tg.Name] = idx
		}

		if tg.ShutdownDelay != nil && *tg.ShutdownDelay < 0 {
			mErr.Errors = append(mErr.Errors, errors.New("ShutdownDelay must be a positive value"))
		}

		if tg.StopAfterClientDisconnect != nil && *tg.StopAfterClientDisconnect != 0 {
			if *tg.StopAfterClientDisconnect > 0 &&
				!(j.Type == JobTypeBatch || j.Type == JobTypeService) {
				mErr.Errors = append(mErr.Errors, errors.New("stop_after_client_disconnect can only be set in batch and service jobs"))
			} else if *tg.StopAfterClientDisconnect < 0 {
				mErr.Errors = append(mErr.Errors, errors.New("stop_after_client_disconnect must be a positive value"))
			}
		}

		if j.Type == "system" && tg.Count > 1 {
			mErr.Errors = append(mErr.Errors,
				fmt.Errorf("Job task group %s has count %d. Count cannot exceed 1 with system scheduler",
					tg.Name, tg.Count))
		}
	}

	// Validate the task group
	for _, tg := range j.TaskGroups {
		if err := tg.Validate(j); err != nil {
			outer := fmt.Errorf("Task group %s validation failed: %v", tg.Name, err)
			mErr.Errors = append(mErr.Errors, outer)
		}
	}

	// Validate periodic is only used with batch jobs.
	if j.IsPeriodic() && j.Periodic.Enabled {
		if j.Type != JobTypeBatch {
			mErr.Errors = append(mErr.Errors,
				fmt.Errorf("Periodic can only be used with %q scheduler", JobTypeBatch))
		}

		if err := j.Periodic.Validate(); err != nil {
			mErr.Errors = append(mErr.Errors, err)
		}
	}

	// Parameterized jobs are likewise batch-only.
	if j.IsParameterized() {
		if j.Type != JobTypeBatch {
			mErr.Errors = append(mErr.Errors,
				fmt.Errorf("Parameterized job can only be used with %q scheduler", JobTypeBatch))
		}

		if err := j.ParameterizedJob.Validate(); err != nil {
			mErr.Errors = append(mErr.Errors, err)
		}
	}

	if j.IsMultiregion() {
		if err := j.Multiregion.Validate(j.Type, j.Datacenters); err != nil {
			mErr.Errors = append(mErr.Errors, err)
		}
	}

	return mErr.ErrorOrNil()
}
4291
4292// Warnings returns a list of warnings that may be from dubious settings or
4293// deprecation warnings.
4294func (j *Job) Warnings() error {
4295	var mErr multierror.Error
4296
4297	// Check the groups
4298	ap := 0
4299	for _, tg := range j.TaskGroups {
4300		if err := tg.Warnings(j); err != nil {
4301			outer := fmt.Errorf("Group %q has warnings: %v", tg.Name, err)
4302			mErr.Errors = append(mErr.Errors, outer)
4303		}
4304		if tg.Update != nil && tg.Update.AutoPromote {
4305			ap += 1
4306		}
4307	}
4308
4309	// Check AutoPromote, should be all or none
4310	if ap > 0 && ap < len(j.TaskGroups) {
4311		err := fmt.Errorf("auto_promote must be true for all groups to enable automatic promotion")
4312		mErr.Errors = append(mErr.Errors, err)
4313	}
4314
4315	return mErr.ErrorOrNil()
4316}
4317
4318// LookupTaskGroup finds a task group by name
4319func (j *Job) LookupTaskGroup(name string) *TaskGroup {
4320	for _, tg := range j.TaskGroups {
4321		if tg.Name == name {
4322			return tg
4323		}
4324	}
4325	return nil
4326}
4327
4328// CombinedTaskMeta takes a TaskGroup and Task name and returns the combined
4329// meta data for the task. When joining Job, Group and Task Meta, the precedence
4330// is by deepest scope (Task > Group > Job).
4331func (j *Job) CombinedTaskMeta(groupName, taskName string) map[string]string {
4332	group := j.LookupTaskGroup(groupName)
4333	if group == nil {
4334		return j.Meta
4335	}
4336
4337	var meta map[string]string
4338
4339	task := group.LookupTask(taskName)
4340	if task != nil {
4341		meta = helper.CopyMapStringString(task.Meta)
4342	}
4343
4344	if meta == nil {
4345		meta = make(map[string]string, len(group.Meta)+len(j.Meta))
4346	}
4347
4348	// Add the group specific meta
4349	for k, v := range group.Meta {
4350		if _, ok := meta[k]; !ok {
4351			meta[k] = v
4352		}
4353	}
4354
4355	// Add the job specific meta
4356	for k, v := range j.Meta {
4357		if _, ok := meta[k]; !ok {
4358			meta[k] = v
4359		}
4360	}
4361
4362	return meta
4363}
4364
4365// Stopped returns if a job is stopped.
4366func (j *Job) Stopped() bool {
4367	return j == nil || j.Stop
4368}
4369
4370// HasUpdateStrategy returns if any task group in the job has an update strategy
4371func (j *Job) HasUpdateStrategy() bool {
4372	for _, tg := range j.TaskGroups {
4373		if !tg.Update.IsEmpty() {
4374			return true
4375		}
4376	}
4377
4378	return false
4379}
4380
4381// Stub is used to return a summary of the job
4382func (j *Job) Stub(summary *JobSummary) *JobListStub {
4383	return &JobListStub{
4384		ID:                j.ID,
4385		Namespace:         j.Namespace,
4386		ParentID:          j.ParentID,
4387		Name:              j.Name,
4388		Datacenters:       j.Datacenters,
4389		Multiregion:       j.Multiregion,
4390		Type:              j.Type,
4391		Priority:          j.Priority,
4392		Periodic:          j.IsPeriodic(),
4393		ParameterizedJob:  j.IsParameterized(),
4394		Stop:              j.Stop,
4395		Status:            j.Status,
4396		StatusDescription: j.StatusDescription,
4397		CreateIndex:       j.CreateIndex,
4398		ModifyIndex:       j.ModifyIndex,
4399		JobModifyIndex:    j.JobModifyIndex,
4400		SubmitTime:        j.SubmitTime,
4401		JobSummary:        summary,
4402	}
4403}
4404
4405// IsPeriodic returns whether a job is periodic.
4406func (j *Job) IsPeriodic() bool {
4407	return j.Periodic != nil
4408}
4409
4410// IsPeriodicActive returns whether the job is an active periodic job that will
4411// create child jobs
4412func (j *Job) IsPeriodicActive() bool {
4413	return j.IsPeriodic() && j.Periodic.Enabled && !j.Stopped() && !j.IsParameterized()
4414}
4415
4416// IsParameterized returns whether a job is parameterized job.
4417func (j *Job) IsParameterized() bool {
4418	return j.ParameterizedJob != nil && !j.Dispatched
4419}
4420
4421// IsMultiregion returns whether a job is multiregion
4422func (j *Job) IsMultiregion() bool {
4423	return j.Multiregion != nil && j.Multiregion.Regions != nil && len(j.Multiregion.Regions) > 0
4424}
4425
4426// VaultPolicies returns the set of Vault policies per task group, per task
4427func (j *Job) VaultPolicies() map[string]map[string]*Vault {
4428	policies := make(map[string]map[string]*Vault, len(j.TaskGroups))
4429
4430	for _, tg := range j.TaskGroups {
4431		tgPolicies := make(map[string]*Vault, len(tg.Tasks))
4432
4433		for _, task := range tg.Tasks {
4434			if task.Vault == nil {
4435				continue
4436			}
4437
4438			tgPolicies[task.Name] = task.Vault
4439		}
4440
4441		if len(tgPolicies) != 0 {
4442			policies[tg.Name] = tgPolicies
4443		}
4444	}
4445
4446	return policies
4447}
4448
4449// ConnectTasks returns the set of Consul Connect enabled tasks defined on the
4450// job that will require a Service Identity token in the case that Consul ACLs
4451// are enabled. The TaskKind.Value is the name of the Consul service.
4452//
4453// This method is meaningful only after the Job has passed through the job
4454// submission Mutator functions.
4455func (j *Job) ConnectTasks() []TaskKind {
4456	var kinds []TaskKind
4457	for _, tg := range j.TaskGroups {
4458		for _, task := range tg.Tasks {
4459			if task.Kind.IsConnectProxy() ||
4460				task.Kind.IsConnectNative() ||
4461				task.Kind.IsAnyConnectGateway() {
4462				kinds = append(kinds, task.Kind)
4463			}
4464		}
4465	}
4466	return kinds
4467}
4468
4469// RequiredSignals returns a mapping of task groups to tasks to their required
4470// set of signals
4471func (j *Job) RequiredSignals() map[string]map[string][]string {
4472	signals := make(map[string]map[string][]string)
4473
4474	for _, tg := range j.TaskGroups {
4475		for _, task := range tg.Tasks {
4476			// Use this local one as a set
4477			taskSignals := make(map[string]struct{})
4478
4479			// Check if the Vault change mode uses signals
4480			if task.Vault != nil && task.Vault.ChangeMode == VaultChangeModeSignal {
4481				taskSignals[task.Vault.ChangeSignal] = struct{}{}
4482			}
4483
4484			// If a user has specified a KillSignal, add it to required signals
4485			if task.KillSignal != "" {
4486				taskSignals[task.KillSignal] = struct{}{}
4487			}
4488
4489			// Check if any template change mode uses signals
4490			for _, t := range task.Templates {
4491				if t.ChangeMode != TemplateChangeModeSignal {
4492					continue
4493				}
4494
4495				taskSignals[t.ChangeSignal] = struct{}{}
4496			}
4497
4498			// Flatten and sort the signals
4499			l := len(taskSignals)
4500			if l == 0 {
4501				continue
4502			}
4503
4504			flat := make([]string, 0, l)
4505			for sig := range taskSignals {
4506				flat = append(flat, sig)
4507			}
4508
4509			sort.Strings(flat)
4510			tgSignals, ok := signals[tg.Name]
4511			if !ok {
4512				tgSignals = make(map[string][]string)
4513				signals[tg.Name] = tgSignals
4514			}
4515			tgSignals[task.Name] = flat
4516		}
4517
4518	}
4519
4520	return signals
4521}
4522
// SpecChanged determines if the functional specification has changed between
// two job versions.
//
// Server-maintained bookkeeping fields (status, version, raft indexes,
// submit time) are copied from the receiver onto a copy of the new job so
// that only user-controlled spec fields influence the comparison.
func (j *Job) SpecChanged(new *Job) bool {
	if j == nil {
		return new != nil
	}

	// Create a copy of the new job
	c := new.Copy()

	// Update the new job so we can do a reflect
	c.Status = j.Status
	c.StatusDescription = j.StatusDescription
	c.Stable = j.Stable
	c.Version = j.Version
	c.CreateIndex = j.CreateIndex
	c.ModifyIndex = j.ModifyIndex
	c.JobModifyIndex = j.JobModifyIndex
	c.SubmitTime = j.SubmitTime

	// cgbaker: FINISH: probably need some consideration of scaling policy ID here

	// Deep equals the jobs
	return !reflect.DeepEqual(j, c)
}
4548
4549func (j *Job) SetSubmitTime() {
4550	j.SubmitTime = time.Now().UTC().UnixNano()
4551}
4552
// JobListStub is used to return a subset of job information
// for the job list. It mirrors a few Job fields plus derived flags
// (Periodic, ParameterizedJob) and an optional attached JobSummary.
type JobListStub struct {
	ID                string
	ParentID          string
	Name              string
	Namespace         string `json:",omitempty"`
	Datacenters       []string
	Multiregion       *Multiregion
	Type              string
	Priority          int
	Periodic          bool // derived via Job.IsPeriodic
	ParameterizedJob  bool // derived via Job.IsParameterized
	Stop              bool
	Status            string
	StatusDescription string
	JobSummary        *JobSummary
	CreateIndex       uint64
	ModifyIndex       uint64
	JobModifyIndex    uint64
	SubmitTime        int64
}
4575
// JobSummary summarizes the state of the allocations of a job
type JobSummary struct {
	// JobID is the ID of the job the summary is for
	JobID string

	// Namespace is the namespace of the job and its summary
	Namespace string

	// Summary contains the summary per task group for the Job,
	// keyed by task group name.
	Summary map[string]TaskGroupSummary

	// Children contains a summary for the children of this job.
	Children *JobChildrenSummary

	// Raft Indexes
	CreateIndex uint64
	ModifyIndex uint64
}
4594
4595// Copy returns a new copy of JobSummary
4596func (js *JobSummary) Copy() *JobSummary {
4597	newJobSummary := new(JobSummary)
4598	*newJobSummary = *js
4599	newTGSummary := make(map[string]TaskGroupSummary, len(js.Summary))
4600	for k, v := range js.Summary {
4601		newTGSummary[k] = v
4602	}
4603	newJobSummary.Summary = newTGSummary
4604	newJobSummary.Children = newJobSummary.Children.Copy()
4605	return newJobSummary
4606}
4607
// JobChildrenSummary contains the summary of children job statuses.
// Counts are grouped by the child job's lifecycle state.
type JobChildrenSummary struct {
	Pending int64
	Running int64
	Dead    int64
}
4614
4615// Copy returns a new copy of a JobChildrenSummary
4616func (jc *JobChildrenSummary) Copy() *JobChildrenSummary {
4617	if jc == nil {
4618		return nil
4619	}
4620
4621	njc := new(JobChildrenSummary)
4622	*njc = *jc
4623	return njc
4624}
4625
// TaskGroupSummary summarizes the state of all the allocations of a
// particular TaskGroup, counted by allocation status.
type TaskGroupSummary struct {
	Queued   int
	Complete int
	Failed   int
	Running  int
	Starting int
	Lost     int
}
4636
const (
	// Checks uses any registered health check state in combination with task
	// states to determine if an allocation is healthy.
	UpdateStrategyHealthCheck_Checks = "checks"

	// TaskStates uses the task states of an allocation to determine if the
	// allocation is healthy.
	UpdateStrategyHealthCheck_TaskStates = "task_states"

	// Manual allows the operator to manually signal to Nomad when an
	// allocation is healthy. This allows more advanced health checking that is
	// outside of the scope of Nomad.
	UpdateStrategyHealthCheck_Manual = "manual"
)
4651
var (
	// DefaultUpdateStrategy provides a baseline that can be used to upgrade
	// jobs with the old policy or for populating field defaults.
	DefaultUpdateStrategy = &UpdateStrategy{
		Stagger:          30 * time.Second,
		MaxParallel:      1,
		HealthCheck:      UpdateStrategyHealthCheck_Checks,
		MinHealthyTime:   10 * time.Second,
		HealthyDeadline:  5 * time.Minute,
		ProgressDeadline: 10 * time.Minute,
		AutoRevert:       false,
		AutoPromote:      false,
		Canary:           0,
	}
)
4667
// UpdateStrategy is used to modify how updates are done. See
// UpdateStrategy.Validate for the constraints between fields.
type UpdateStrategy struct {
	// Stagger is used to determine the rate at which allocations are migrated
	// due to down or draining nodes.
	Stagger time.Duration

	// MaxParallel is how many updates can be done in parallel
	MaxParallel int

	// HealthCheck specifies the mechanism in which allocations are marked
	// healthy or unhealthy as part of a deployment. One of the
	// UpdateStrategyHealthCheck_* constants.
	HealthCheck string

	// MinHealthyTime is the minimum time an allocation must be in the healthy
	// state before it is marked as healthy, unblocking more allocations to be
	// rolled.
	MinHealthyTime time.Duration

	// HealthyDeadline is the time in which an allocation must be marked as
	// healthy before it is automatically transitioned to unhealthy. This time
	// period doesn't count against the MinHealthyTime.
	HealthyDeadline time.Duration

	// ProgressDeadline is the time in which an allocation as part of the
	// deployment must transition to healthy. If no allocation becomes healthy
	// after the deadline, the deployment is marked as failed. If the deadline
	// is zero, the first failure causes the deployment to fail.
	ProgressDeadline time.Duration

	// AutoRevert declares that if a deployment fails because of unhealthy
	// allocations, there should be an attempt to auto-revert the job to a
	// stable version.
	AutoRevert bool

	// AutoPromote declares that the deployment should be promoted when all canaries are
	// healthy
	AutoPromote bool

	// Canary is the number of canaries to deploy when a change to the task
	// group is detected.
	Canary int
}
4710
4711func (u *UpdateStrategy) Copy() *UpdateStrategy {
4712	if u == nil {
4713		return nil
4714	}
4715
4716	copy := new(UpdateStrategy)
4717	*copy = *u
4718	return copy
4719}
4720
// Validate checks the update strategy's fields for internal consistency and
// returns a multierror of every violation found. A nil strategy is valid.
func (u *UpdateStrategy) Validate() error {
	if u == nil {
		return nil
	}

	var mErr multierror.Error
	// HealthCheck must be one of the known mechanisms.
	switch u.HealthCheck {
	case UpdateStrategyHealthCheck_Checks, UpdateStrategyHealthCheck_TaskStates, UpdateStrategyHealthCheck_Manual:
	default:
		_ = multierror.Append(&mErr, fmt.Errorf("Invalid health check given: %q", u.HealthCheck))
	}

	if u.MaxParallel < 0 {
		_ = multierror.Append(&mErr, fmt.Errorf("Max parallel can not be less than zero: %d < 0", u.MaxParallel))
	}
	if u.Canary < 0 {
		_ = multierror.Append(&mErr, fmt.Errorf("Canary count can not be less than zero: %d < 0", u.Canary))
	}
	// Auto promotion is meaningless without canaries to promote.
	if u.Canary == 0 && u.AutoPromote {
		_ = multierror.Append(&mErr, fmt.Errorf("Auto Promote requires a Canary count greater than zero"))
	}
	if u.MinHealthyTime < 0 {
		_ = multierror.Append(&mErr, fmt.Errorf("Minimum healthy time may not be less than zero: %v", u.MinHealthyTime))
	}
	if u.HealthyDeadline <= 0 {
		_ = multierror.Append(&mErr, fmt.Errorf("Healthy deadline must be greater than zero: %v", u.HealthyDeadline))
	}
	if u.ProgressDeadline < 0 {
		_ = multierror.Append(&mErr, fmt.Errorf("Progress deadline must be zero or greater: %v", u.ProgressDeadline))
	}
	// NOTE(review): the next two checks reject equality (>=) but their
	// messages print ">" — misleading when the values are equal; confirm
	// before changing user-facing text.
	if u.MinHealthyTime >= u.HealthyDeadline {
		_ = multierror.Append(&mErr, fmt.Errorf("Minimum healthy time must be less than healthy deadline: %v > %v", u.MinHealthyTime, u.HealthyDeadline))
	}
	if u.ProgressDeadline != 0 && u.HealthyDeadline >= u.ProgressDeadline {
		_ = multierror.Append(&mErr, fmt.Errorf("Healthy deadline must be less than progress deadline: %v > %v", u.HealthyDeadline, u.ProgressDeadline))
	}
	if u.Stagger <= 0 {
		_ = multierror.Append(&mErr, fmt.Errorf("Stagger must be greater than zero: %v", u.Stagger))
	}

	return mErr.ErrorOrNil()
}
4763
4764func (u *UpdateStrategy) IsEmpty() bool {
4765	if u == nil {
4766		return true
4767	}
4768
4769	return u.MaxParallel == 0
4770}
4771
4772// TODO(alexdadgar): Remove once no longer used by the scheduler.
4773// Rolling returns if a rolling strategy should be used
4774func (u *UpdateStrategy) Rolling() bool {
4775	return u.Stagger > 0 && u.MaxParallel > 0
4776}
4777
// Multiregion is the multiregion deployment configuration of a job: an
// optional rollout strategy plus the set of participating regions.
type Multiregion struct {
	Strategy *MultiregionStrategy
	Regions  []*MultiregionRegion
}
4782
4783func (m *Multiregion) Canonicalize() {
4784	if m.Strategy == nil {
4785		m.Strategy = &MultiregionStrategy{}
4786	}
4787	if m.Regions == nil {
4788		m.Regions = []*MultiregionRegion{}
4789	}
4790}
4791
4792// Diff indicates whether the multiregion config has changed
4793func (m *Multiregion) Diff(m2 *Multiregion) bool {
4794	return !reflect.DeepEqual(m, m2)
4795}
4796
4797func (m *Multiregion) Copy() *Multiregion {
4798	if m == nil {
4799		return nil
4800	}
4801	copy := new(Multiregion)
4802	if m.Strategy != nil {
4803		copy.Strategy = &MultiregionStrategy{
4804			MaxParallel: m.Strategy.MaxParallel,
4805			OnFailure:   m.Strategy.OnFailure,
4806		}
4807	}
4808	for _, region := range m.Regions {
4809		copyRegion := &MultiregionRegion{
4810			Name:        region.Name,
4811			Count:       region.Count,
4812			Datacenters: []string{},
4813			Meta:        map[string]string{},
4814		}
4815		copyRegion.Datacenters = append(copyRegion.Datacenters, region.Datacenters...)
4816		for k, v := range region.Meta {
4817			copyRegion.Meta[k] = v
4818		}
4819		copy.Regions = append(copy.Regions, copyRegion)
4820	}
4821	return copy
4822}
4823
// MultiregionStrategy controls how a multiregion deployment rolls out across
// regions.
type MultiregionStrategy struct {
	MaxParallel int    // number of regions deployed concurrently
	OnFailure   string // action taken when a region's deployment fails
}

// MultiregionRegion describes one region participating in a multiregion job.
type MultiregionRegion struct {
	Name        string
	Count       int
	Datacenters []string
	Meta        map[string]string
}
4835
// Namespace allows logically grouping jobs and their associated objects.
type Namespace struct {
	// Name is the name of the namespace
	Name string

	// Description is a human readable description of the namespace
	Description string

	// Quota is the quota specification that the namespace should account
	// against.
	Quota string

	// Hash is the hash of the namespace which is used to efficiently replicate
	// cross-regions. It is computed over Name, Description, and Quota by
	// SetHash.
	Hash []byte

	// Raft Indexes
	CreateIndex uint64
	ModifyIndex uint64
}
4856
4857func (n *Namespace) Validate() error {
4858	var mErr multierror.Error
4859
4860	// Validate the name and description
4861	if !validNamespaceName.MatchString(n.Name) {
4862		err := fmt.Errorf("invalid name %q. Must match regex %s", n.Name, validNamespaceName)
4863		mErr.Errors = append(mErr.Errors, err)
4864	}
4865	if len(n.Description) > maxNamespaceDescriptionLength {
4866		err := fmt.Errorf("description longer than %d", maxNamespaceDescriptionLength)
4867		mErr.Errors = append(mErr.Errors, err)
4868	}
4869
4870	return mErr.ErrorOrNil()
4871}
4872
4873// SetHash is used to compute and set the hash of the namespace
4874func (n *Namespace) SetHash() []byte {
4875	// Initialize a 256bit Blake2 hash (32 bytes)
4876	hash, err := blake2b.New256(nil)
4877	if err != nil {
4878		panic(err)
4879	}
4880
4881	// Write all the user set fields
4882	_, _ = hash.Write([]byte(n.Name))
4883	_, _ = hash.Write([]byte(n.Description))
4884	_, _ = hash.Write([]byte(n.Quota))
4885
4886	// Finalize the hash
4887	hashVal := hash.Sum(nil)
4888
4889	// Set and return the hash
4890	n.Hash = hashVal
4891	return hashVal
4892}
4893
4894func (n *Namespace) Copy() *Namespace {
4895	nc := new(Namespace)
4896	*nc = *n
4897	nc.Hash = make([]byte, len(n.Hash))
4898	copy(nc.Hash, n.Hash)
4899	return nc
4900}
4901
// NamespaceListRequest is used to request a list of namespaces
type NamespaceListRequest struct {
	QueryOptions
}

// NamespaceListResponse is used for a list request
type NamespaceListResponse struct {
	Namespaces []*Namespace
	QueryMeta
}

// NamespaceSpecificRequest is used to query a specific namespace by name.
type NamespaceSpecificRequest struct {
	Name string
	QueryOptions
}

// SingleNamespaceResponse is used to return a single namespace
type SingleNamespaceResponse struct {
	Namespace *Namespace
	QueryMeta
}

// NamespaceSetRequest is used to query a set of namespaces by name.
type NamespaceSetRequest struct {
	Namespaces []string
	QueryOptions
}

// NamespaceSetResponse is used to return a set of namespaces
type NamespaceSetResponse struct {
	Namespaces map[string]*Namespace // Keyed by namespace Name
	QueryMeta
}

// NamespaceDeleteRequest is used to delete a set of namespaces
type NamespaceDeleteRequest struct {
	Namespaces []string
	WriteRequest
}

// NamespaceUpsertRequest is used to upsert a set of namespaces
type NamespaceUpsertRequest struct {
	Namespaces []*Namespace
	WriteRequest
}
4948
const (
	// PeriodicSpecCron is used for a cron spec.
	PeriodicSpecCron = "cron"

	// PeriodicSpecTest is only used by unit tests. It is a sorted, comma
	// separated list of unix timestamps at which to launch.
	PeriodicSpecTest = "_internal_test"
)
4957
// PeriodicConfig defines the interval a job should be run at.
type PeriodicConfig struct {
	// Enabled determines if the job should be run periodically.
	Enabled bool

	// Spec specifies the interval the job should be run as. It is parsed based
	// on the SpecType.
	Spec string

	// SpecType defines the format of the spec. One of the PeriodicSpec*
	// constants.
	SpecType string

	// ProhibitOverlap enforces that spawned jobs do not run in parallel.
	ProhibitOverlap bool

	// TimeZone is the user specified string that determines the time zone to
	// launch against. The time zones must be specified from IANA Time Zone
	// database, such as "America/New_York".
	// Reference: https://en.wikipedia.org/wiki/List_of_tz_database_time_zones
	// Reference: https://www.iana.org/time-zones
	TimeZone string

	// location is the time zone to evaluate the launch time against,
	// resolved from TimeZone by Canonicalize.
	location *time.Location
}
4983
4984func (p *PeriodicConfig) Copy() *PeriodicConfig {
4985	if p == nil {
4986		return nil
4987	}
4988	np := new(PeriodicConfig)
4989	*np = *p
4990	return np
4991}
4992
4993func (p *PeriodicConfig) Validate() error {
4994	if !p.Enabled {
4995		return nil
4996	}
4997
4998	var mErr multierror.Error
4999	if p.Spec == "" {
5000		_ = multierror.Append(&mErr, fmt.Errorf("Must specify a spec"))
5001	}
5002
5003	// Check if we got a valid time zone
5004	if p.TimeZone != "" {
5005		if _, err := time.LoadLocation(p.TimeZone); err != nil {
5006			_ = multierror.Append(&mErr, fmt.Errorf("Invalid time zone %q: %v", p.TimeZone, err))
5007		}
5008	}
5009
5010	switch p.SpecType {
5011	case PeriodicSpecCron:
5012		// Validate the cron spec
5013		if _, err := cronexpr.Parse(p.Spec); err != nil {
5014			_ = multierror.Append(&mErr, fmt.Errorf("Invalid cron spec %q: %v", p.Spec, err))
5015		}
5016	case PeriodicSpecTest:
5017		// No-op
5018	default:
5019		_ = multierror.Append(&mErr, fmt.Errorf("Unknown periodic specification type %q", p.SpecType))
5020	}
5021
5022	return mErr.ErrorOrNil()
5023}
5024
5025func (p *PeriodicConfig) Canonicalize() {
5026	// Load the location
5027	l, err := time.LoadLocation(p.TimeZone)
5028	if err != nil {
5029		p.location = time.UTC
5030	}
5031
5032	p.location = l
5033}
5034
5035// CronParseNext is a helper that parses the next time for the given expression
5036// but captures any panic that may occur in the underlying library.
5037func CronParseNext(e *cronexpr.Expression, fromTime time.Time, spec string) (t time.Time, err error) {
5038	defer func() {
5039		if recover() != nil {
5040			t = time.Time{}
5041			err = fmt.Errorf("failed parsing cron expression: %q", spec)
5042		}
5043	}()
5044
5045	return e.Next(fromTime), nil
5046}
5047
5048// Next returns the closest time instant matching the spec that is after the
5049// passed time. If no matching instance exists, the zero value of time.Time is
5050// returned. The `time.Location` of the returned value matches that of the
5051// passed time.
5052func (p *PeriodicConfig) Next(fromTime time.Time) (time.Time, error) {
5053	switch p.SpecType {
5054	case PeriodicSpecCron:
5055		e, err := cronexpr.Parse(p.Spec)
5056		if err != nil {
5057			return time.Time{}, fmt.Errorf("failed parsing cron expression: %q: %v", p.Spec, err)
5058		}
5059		return CronParseNext(e, fromTime, p.Spec)
5060	case PeriodicSpecTest:
5061		split := strings.Split(p.Spec, ",")
5062		if len(split) == 1 && split[0] == "" {
5063			return time.Time{}, nil
5064		}
5065
5066		// Parse the times
5067		times := make([]time.Time, len(split))
5068		for i, s := range split {
5069			unix, err := strconv.Atoi(s)
5070			if err != nil {
5071				return time.Time{}, nil
5072			}
5073
5074			times[i] = time.Unix(int64(unix), 0)
5075		}
5076
5077		// Find the next match
5078		for _, next := range times {
5079			if fromTime.Before(next) {
5080				return next, nil
5081			}
5082		}
5083	}
5084
5085	return time.Time{}, nil
5086}
5087
5088// GetLocation returns the location to use for determining the time zone to run
5089// the periodic job against.
5090func (p *PeriodicConfig) GetLocation() *time.Location {
5091	// Jobs pre 0.5.5 will not have this
5092	if p.location != nil {
5093		return p.location
5094	}
5095
5096	return time.UTC
5097}
5098
const (
	// PeriodicLaunchSuffix is the string appended to the periodic jobs ID
	// when launching derived instances of it.
	PeriodicLaunchSuffix = "/periodic-"
)
5104
// PeriodicLaunch tracks the last launch time of a periodic job.
type PeriodicLaunch struct {
	ID        string    // ID of the periodic job.
	Namespace string    // Namespace of the periodic job
	Launch    time.Time // The last launch time.

	// Raft Indexes
	CreateIndex uint64
	ModifyIndex uint64
}
5115
const (
	// Dispatch payload requirements for parameterized jobs.
	DispatchPayloadForbidden = "forbidden"
	DispatchPayloadOptional  = "optional"
	DispatchPayloadRequired  = "required"

	// DispatchLaunchSuffix is the string appended to the parameterized job's ID
	// when dispatching instances of it.
	DispatchLaunchSuffix = "/dispatch-"
)
5125
// ParameterizedJobConfig is used to configure the parameterized job
type ParameterizedJobConfig struct {
	// Payload configures the payload requirements; one of the
	// DispatchPayload* constants.
	Payload string

	// MetaRequired is metadata keys that must be specified by the dispatcher
	MetaRequired []string

	// MetaOptional is metadata keys that may be specified by the dispatcher
	MetaOptional []string
}
5137
5138func (d *ParameterizedJobConfig) Validate() error {
5139	var mErr multierror.Error
5140	switch d.Payload {
5141	case DispatchPayloadOptional, DispatchPayloadRequired, DispatchPayloadForbidden:
5142	default:
5143		_ = multierror.Append(&mErr, fmt.Errorf("Unknown payload requirement: %q", d.Payload))
5144	}
5145
5146	// Check that the meta configurations are disjoint sets
5147	disjoint, offending := helper.SliceSetDisjoint(d.MetaRequired, d.MetaOptional)
5148	if !disjoint {
5149		_ = multierror.Append(&mErr, fmt.Errorf("Required and optional meta keys should be disjoint. Following keys exist in both: %v", offending))
5150	}
5151
5152	return mErr.ErrorOrNil()
5153}
5154
5155func (d *ParameterizedJobConfig) Canonicalize() {
5156	if d.Payload == "" {
5157		d.Payload = DispatchPayloadOptional
5158	}
5159}
5160
5161func (d *ParameterizedJobConfig) Copy() *ParameterizedJobConfig {
5162	if d == nil {
5163		return nil
5164	}
5165	nd := new(ParameterizedJobConfig)
5166	*nd = *d
5167	nd.MetaOptional = helper.CopySliceString(nd.MetaOptional)
5168	nd.MetaRequired = helper.CopySliceString(nd.MetaRequired)
5169	return nd
5170}
5171
5172// DispatchedID returns an ID appropriate for a job dispatched against a
5173// particular parameterized job
5174func DispatchedID(templateID string, t time.Time) string {
5175	u := uuid.Generate()[:8]
5176	return fmt.Sprintf("%s%s%d-%s", templateID, DispatchLaunchSuffix, t.Unix(), u)
5177}
5178
// DispatchPayloadConfig configures how a task gets its input from a job dispatch
type DispatchPayloadConfig struct {
	// File specifies a relative path to where the input data should be written
	File string
}
5184
5185func (d *DispatchPayloadConfig) Copy() *DispatchPayloadConfig {
5186	if d == nil {
5187		return nil
5188	}
5189	nd := new(DispatchPayloadConfig)
5190	*nd = *d
5191	return nd
5192}
5193
5194func (d *DispatchPayloadConfig) Validate() error {
5195	// Verify the destination doesn't escape
5196	escaped, err := PathEscapesAllocDir("task/local/", d.File)
5197	if err != nil {
5198		return fmt.Errorf("invalid destination path: %v", err)
5199	} else if escaped {
5200		return fmt.Errorf("destination escapes allocation directory")
5201	}
5202
5203	return nil
5204}
5205
const (
	// Valid task lifecycle hook names.
	TaskLifecycleHookPrestart  = "prestart"
	TaskLifecycleHookPoststart = "poststart"
	TaskLifecycleHookPoststop  = "poststop"
)

// TaskLifecycleConfig describes when a task runs relative to the main tasks
// of its group.
type TaskLifecycleConfig struct {
	Hook    string // one of the TaskLifecycleHook* constants
	Sidecar bool
}
5216
5217func (d *TaskLifecycleConfig) Copy() *TaskLifecycleConfig {
5218	if d == nil {
5219		return nil
5220	}
5221	nd := new(TaskLifecycleConfig)
5222	*nd = *d
5223	return nd
5224}
5225
5226func (d *TaskLifecycleConfig) Validate() error {
5227	if d == nil {
5228		return nil
5229	}
5230
5231	switch d.Hook {
5232	case TaskLifecycleHookPrestart:
5233	case TaskLifecycleHookPoststart:
5234	case TaskLifecycleHookPoststop:
5235	case "":
5236		return fmt.Errorf("no lifecycle hook provided")
5237	default:
5238		return fmt.Errorf("invalid hook: %v", d.Hook)
5239	}
5240
5241	return nil
5242}
5243
var (
	// These default restart policies needs to be in sync with
	// Canonicalize in api/tasks.go

	// DefaultServiceJobRestartPolicy is the default restart policy applied
	// to service jobs.
	DefaultServiceJobRestartPolicy = RestartPolicy{
		Delay:    15 * time.Second,
		Attempts: 2,
		Interval: 30 * time.Minute,
		Mode:     RestartPolicyModeFail,
	}
	// DefaultBatchJobRestartPolicy is the default restart policy applied to
	// batch jobs.
	DefaultBatchJobRestartPolicy = RestartPolicy{
		Delay:    15 * time.Second,
		Attempts: 3,
		Interval: 24 * time.Hour,
		Mode:     RestartPolicyModeFail,
	}
)
5261
var (
	// These default reschedule policies needs to be in sync with
	// NewDefaultReschedulePolicy in api/tasks.go

	// DefaultServiceJobReschedulePolicy is the default reschedule policy
	// applied to service jobs: unlimited, exponentially backed off attempts.
	DefaultServiceJobReschedulePolicy = ReschedulePolicy{
		Delay:         30 * time.Second,
		DelayFunction: "exponential",
		MaxDelay:      1 * time.Hour,
		Unlimited:     true,
	}
	// DefaultBatchJobReschedulePolicy is the default reschedule policy
	// applied to batch jobs: a single attempt per 24h interval.
	DefaultBatchJobReschedulePolicy = ReschedulePolicy{
		Attempts:      1,
		Interval:      24 * time.Hour,
		Delay:         5 * time.Second,
		DelayFunction: "constant",
	}
)
5279
const (
	// RestartPolicyModeDelay causes an artificial delay till the next interval is
	// reached when the specified attempts have been reached in the interval.
	RestartPolicyModeDelay = "delay"

	// RestartPolicyModeFail causes a job to fail if the specified number of
	// attempts are reached within an interval.
	RestartPolicyModeFail = "fail"

	// RestartPolicyMinInterval is the minimum interval that is accepted for a
	// restart policy.
	RestartPolicyMinInterval = 5 * time.Second

	// ReasonWithinPolicy describes restart events that are within policy
	ReasonWithinPolicy = "Restart within policy"
)
5296
// JobScalingEvents contains the scaling events for a given job
type JobScalingEvents struct {
	Namespace string
	JobID     string

	// This map is indexed by target; currently, this is just task group
	// the indexed array is sorted from newest to oldest event
	// the array should have less than JobTrackedScalingEvents entries
	ScalingEvents map[string][]*ScalingEvent

	// Raft index
	ModifyIndex uint64
}
5310
// NewScalingEvent constructs a ScalingEvent carrying the given message,
// stamped with the current time.
func NewScalingEvent(message string) *ScalingEvent {
	e := &ScalingEvent{
		Message: message,
		Time:    time.Now().Unix(),
	}
	return e
}

// ScalingEvent describes a scaling event against a Job.
type ScalingEvent struct {
	// Time is the Unix timestamp of the event. NOTE(review): the original
	// comment said "Unix Nanosecond timestamp", but NewScalingEvent stores
	// seconds via time.Now().Unix() — confirm which consumers expect.
	Time int64

	// Count is the new scaling count, if provided.
	Count *int64

	// PreviousCount is the count at the time of the scaling event.
	PreviousCount int64

	// Message is the human-readable description of the scaling event.
	Message string

	// Error indicates an error state for this scaling event.
	Error bool

	// Meta is a map of metadata returned during a scaling event.
	Meta map[string]interface{}

	// EvalID is the ID for an evaluation if one was created as part of a
	// scaling event.
	EvalID *string

	// CreateIndex is the Raft index at which the event was recorded.
	CreateIndex uint64
}
5345
5346func (e *ScalingEvent) SetError(error bool) *ScalingEvent {
5347	e.Error = error
5348	return e
5349}
5350
5351func (e *ScalingEvent) SetMeta(meta map[string]interface{}) *ScalingEvent {
5352	e.Meta = meta
5353	return e
5354}
5355
5356func (e *ScalingEvent) SetEvalID(evalID string) *ScalingEvent {
5357	e.EvalID = &evalID
5358	return e
5359}
5360
// ScalingEventRequest is used by the Job.Scale endpoint
// to register scaling events.
type ScalingEventRequest struct {
	// Namespace, JobID, and TaskGroup identify the scaling target.
	Namespace string
	JobID     string
	TaskGroup string

	// ScalingEvent is the event to record against the target.
	ScalingEvent *ScalingEvent
}
5370
// ScalingPolicy specifies the scaling policy for a scaling target
type ScalingPolicy struct {
	// ID is a generated UUID used for looking up the scaling policy
	ID string

	// Type is the type of scaling performed by the policy; defaults to
	// ScalingPolicyTypeHorizontal (see Canonicalize)
	Type string

	// Target contains information about the target of the scaling policy, like job and group
	Target map[string]string

	// Policy is an opaque description of the scaling policy, passed to the autoscaler
	Policy map[string]interface{}

	// Min is the minimum allowable scaling count for this target
	Min int64

	// Max is the maximum allowable scaling count for this target
	Max int64

	// Enabled indicates whether this policy has been enabled/disabled
	Enabled bool

	// CreateIndex and ModifyIndex are the Raft indexes at which the policy
	// was created and last modified
	CreateIndex uint64
	ModifyIndex uint64
}
5397
5398// JobKey returns a key that is unique to a job-scoped target, useful as a map
5399// key. This uses the policy type, plus target (group and task).
5400func (p *ScalingPolicy) JobKey() string {
5401	return p.Type + "\000" +
5402		p.Target[ScalingTargetGroup] + "\000" +
5403		p.Target[ScalingTargetTask]
5404}
5405
const (
	// Keys into ScalingPolicy.Target identifying what the policy applies to.
	ScalingTargetNamespace = "Namespace"
	ScalingTargetJob       = "Job"
	ScalingTargetGroup     = "Group"
	ScalingTargetTask      = "Task"

	// ScalingPolicyTypeHorizontal is the default policy type; see
	// ScalingPolicy.Canonicalize.
	ScalingPolicyTypeHorizontal = "horizontal"
)
5414
5415func (p *ScalingPolicy) Canonicalize() {
5416	if p.Type == "" {
5417		p.Type = ScalingPolicyTypeHorizontal
5418	}
5419}
5420
5421func (p *ScalingPolicy) Copy() *ScalingPolicy {
5422	if p == nil {
5423		return nil
5424	}
5425
5426	opaquePolicyConfig, err := copystructure.Copy(p.Policy)
5427	if err != nil {
5428		panic(err.Error())
5429	}
5430
5431	c := ScalingPolicy{
5432		ID:          p.ID,
5433		Policy:      opaquePolicyConfig.(map[string]interface{}),
5434		Enabled:     p.Enabled,
5435		Type:        p.Type,
5436		Min:         p.Min,
5437		Max:         p.Max,
5438		CreateIndex: p.CreateIndex,
5439		ModifyIndex: p.ModifyIndex,
5440	}
5441	c.Target = make(map[string]string, len(p.Target))
5442	for k, v := range p.Target {
5443		c.Target[k] = v
5444	}
5445	return &c
5446}
5447
5448func (p *ScalingPolicy) Validate() error {
5449	if p == nil {
5450		return nil
5451	}
5452
5453	var mErr multierror.Error
5454
5455	// Check policy type and target
5456	if p.Type == "" {
5457		mErr.Errors = append(mErr.Errors, fmt.Errorf("missing scaling policy type"))
5458	} else {
5459		mErr.Errors = append(mErr.Errors, p.validateType().Errors...)
5460	}
5461
5462	// Check Min and Max
5463	if p.Max < 0 {
5464		mErr.Errors = append(mErr.Errors,
5465			fmt.Errorf("maximum count must be specified and non-negative"))
5466	} else if p.Max < p.Min {
5467		mErr.Errors = append(mErr.Errors,
5468			fmt.Errorf("maximum count must not be less than minimum count"))
5469	}
5470
5471	if p.Min < 0 {
5472		mErr.Errors = append(mErr.Errors,
5473			fmt.Errorf("minimum count must be specified and non-negative"))
5474	}
5475
5476	return mErr.ErrorOrNil()
5477}
5478
5479func (p *ScalingPolicy) validateTargetHorizontal() (mErr multierror.Error) {
5480	if len(p.Target) == 0 {
5481		// This is probably not a Nomad horizontal policy
5482		return
5483	}
5484
5485	// Nomad horizontal policies should have Namespace, Job and TaskGroup
5486	if p.Target[ScalingTargetNamespace] == "" {
5487		mErr.Errors = append(mErr.Errors, fmt.Errorf("missing target namespace"))
5488	}
5489	if p.Target[ScalingTargetJob] == "" {
5490		mErr.Errors = append(mErr.Errors, fmt.Errorf("missing target job"))
5491	}
5492	if p.Target[ScalingTargetGroup] == "" {
5493		mErr.Errors = append(mErr.Errors, fmt.Errorf("missing target group"))
5494	}
5495	return
5496}
5497
5498// Diff indicates whether the specification for a given scaling policy has changed
5499func (p *ScalingPolicy) Diff(p2 *ScalingPolicy) bool {
5500	copy := *p2
5501	copy.ID = p.ID
5502	copy.CreateIndex = p.CreateIndex
5503	copy.ModifyIndex = p.ModifyIndex
5504	return !reflect.DeepEqual(*p, copy)
5505}
5506
// TargetTaskGroup updates a ScalingPolicy target to specify a given task
// group within a job, returning the policy to allow call chaining.
func (p *ScalingPolicy) TargetTaskGroup(job *Job, tg *TaskGroup) *ScalingPolicy {
	p.Target = map[string]string{
		ScalingTargetNamespace: job.Namespace,
		ScalingTargetJob:       job.ID,
		ScalingTargetGroup:     tg.Name,
	}
	return p
}
5516
5517// TargetTask updates a ScalingPolicy target to specify a given task
5518func (p *ScalingPolicy) TargetTask(job *Job, tg *TaskGroup, task *Task) *ScalingPolicy {
5519	p.TargetTaskGroup(job, tg)
5520	p.Target[ScalingTargetTask] = task.Name
5521	return p
5522}
5523
5524func (p *ScalingPolicy) Stub() *ScalingPolicyListStub {
5525	stub := &ScalingPolicyListStub{
5526		ID:          p.ID,
5527		Type:        p.Type,
5528		Target:      make(map[string]string),
5529		Enabled:     p.Enabled,
5530		CreateIndex: p.CreateIndex,
5531		ModifyIndex: p.ModifyIndex,
5532	}
5533	for k, v := range p.Target {
5534		stub.Target[k] = v
5535	}
5536	return stub
5537}
5538
5539// GetScalingPolicies returns a slice of all scaling scaling policies for this job
5540func (j *Job) GetScalingPolicies() []*ScalingPolicy {
5541	ret := make([]*ScalingPolicy, 0)
5542
5543	for _, tg := range j.TaskGroups {
5544		if tg.Scaling != nil {
5545			ret = append(ret, tg.Scaling)
5546		}
5547	}
5548
5549	ret = append(ret, j.GetEntScalingPolicies()...)
5550
5551	return ret
5552}
5553
// ScalingPolicyListStub is used to return a subset of scaling policy information
// for the scaling policy list; see ScalingPolicy.Stub.
type ScalingPolicyListStub struct {
	ID          string
	Enabled     bool
	Type        string
	Target      map[string]string
	CreateIndex uint64
	ModifyIndex uint64
}
5564
// RestartPolicy configures how Tasks are restarted when they crash or fail.
type RestartPolicy struct {
	// Attempts is the number of restarts that will occur in an interval.
	Attempts int

	// Interval is a duration in which we can limit the number of restarts
	// within.
	Interval time.Duration

	// Delay is the time between a failure and a restart.
	Delay time.Duration

	// Mode controls what happens when the task restarts more than attempt times
	// in an interval; one of RestartPolicyModeDelay or RestartPolicyModeFail.
	Mode string
}
5581
5582func (r *RestartPolicy) Copy() *RestartPolicy {
5583	if r == nil {
5584		return nil
5585	}
5586	nrp := new(RestartPolicy)
5587	*nrp = *r
5588	return nrp
5589}
5590
5591func (r *RestartPolicy) Validate() error {
5592	var mErr multierror.Error
5593	switch r.Mode {
5594	case RestartPolicyModeDelay, RestartPolicyModeFail:
5595	default:
5596		_ = multierror.Append(&mErr, fmt.Errorf("Unsupported restart mode: %q", r.Mode))
5597	}
5598
5599	// Check for ambiguous/confusing settings
5600	if r.Attempts == 0 && r.Mode != RestartPolicyModeFail {
5601		_ = multierror.Append(&mErr, fmt.Errorf("Restart policy %q with %d attempts is ambiguous", r.Mode, r.Attempts))
5602	}
5603
5604	if r.Interval.Nanoseconds() < RestartPolicyMinInterval.Nanoseconds() {
5605		_ = multierror.Append(&mErr, fmt.Errorf("Interval can not be less than %v (got %v)", RestartPolicyMinInterval, r.Interval))
5606	}
5607	if time.Duration(r.Attempts)*r.Delay > r.Interval {
5608		_ = multierror.Append(&mErr,
5609			fmt.Errorf("Nomad can't restart the TaskGroup %v times in an interval of %v with a delay of %v", r.Attempts, r.Interval, r.Delay))
5610	}
5611	return mErr.ErrorOrNil()
5612}
5613
5614func NewRestartPolicy(jobType string) *RestartPolicy {
5615	switch jobType {
5616	case JobTypeService, JobTypeSystem:
5617		rp := DefaultServiceJobRestartPolicy
5618		return &rp
5619	case JobTypeBatch:
5620		rp := DefaultBatchJobRestartPolicy
5621		return &rp
5622	}
5623	return nil
5624}
5625
// ReschedulePolicyMinInterval is the smallest Interval accepted by
// ReschedulePolicy.Validate.
const ReschedulePolicyMinInterval = 15 * time.Second

// ReschedulePolicyMinDelay is the smallest Delay (and MaxDelay) accepted
// by ReschedulePolicy.Validate.
const ReschedulePolicyMinDelay = 5 * time.Second

// RescheduleDelayFunctions enumerates the valid DelayFunction values.
var RescheduleDelayFunctions = [...]string{"constant", "exponential", "fibonacci"}
5630
// ReschedulePolicy configures how Tasks are rescheduled when they crash or fail.
type ReschedulePolicy struct {
	// Attempts limits the number of rescheduling attempts that can occur in an interval.
	Attempts int

	// Interval is a duration in which we can limit the number of reschedule attempts.
	Interval time.Duration

	// Delay is a minimum duration to wait between reschedule attempts.
	// The delay function determines how much subsequent reschedule attempts are delayed by.
	Delay time.Duration

	// DelayFunction determines how the delay progressively changes on subsequent reschedule
	// attempts. Valid values are "exponential", "constant", and "fibonacci"
	// (see RescheduleDelayFunctions).
	DelayFunction string

	// MaxDelay is an upper bound on the delay.
	MaxDelay time.Duration

	// Unlimited allows infinite rescheduling attempts. Only allowed when delay is set
	// between reschedule attempts.
	Unlimited bool
}
5654
5655func (r *ReschedulePolicy) Copy() *ReschedulePolicy {
5656	if r == nil {
5657		return nil
5658	}
5659	nrp := new(ReschedulePolicy)
5660	*nrp = *r
5661	return nrp
5662}
5663
5664func (r *ReschedulePolicy) Enabled() bool {
5665	enabled := r != nil && (r.Attempts > 0 || r.Unlimited)
5666	return enabled
5667}
5668
5669// Validate uses different criteria to validate the reschedule policy
5670// Delay must be a minimum of 5 seconds
5671// Delay Ceiling is ignored if Delay Function is "constant"
5672// Number of possible attempts is validated, given the interval, delay and delay function
5673func (r *ReschedulePolicy) Validate() error {
5674	if !r.Enabled() {
5675		return nil
5676	}
5677	var mErr multierror.Error
5678	// Check for ambiguous/confusing settings
5679	if r.Attempts > 0 {
5680		if r.Interval <= 0 {
5681			_ = multierror.Append(&mErr, fmt.Errorf("Interval must be a non zero value if Attempts > 0"))
5682		}
5683		if r.Unlimited {
5684			_ = multierror.Append(&mErr, fmt.Errorf("Reschedule Policy with Attempts = %v, Interval = %v, "+
5685				"and Unlimited = %v is ambiguous", r.Attempts, r.Interval, r.Unlimited))
5686			_ = multierror.Append(&mErr, errors.New("If Attempts >0, Unlimited cannot also be set to true"))
5687		}
5688	}
5689
5690	delayPreCheck := true
5691	// Delay should be bigger than the default
5692	if r.Delay.Nanoseconds() < ReschedulePolicyMinDelay.Nanoseconds() {
5693		_ = multierror.Append(&mErr, fmt.Errorf("Delay cannot be less than %v (got %v)", ReschedulePolicyMinDelay, r.Delay))
5694		delayPreCheck = false
5695	}
5696
5697	// Must use a valid delay function
5698	if !isValidDelayFunction(r.DelayFunction) {
5699		_ = multierror.Append(&mErr, fmt.Errorf("Invalid delay function %q, must be one of %q", r.DelayFunction, RescheduleDelayFunctions))
5700		delayPreCheck = false
5701	}
5702
5703	// Validate MaxDelay if not using linear delay progression
5704	if r.DelayFunction != "constant" {
5705		if r.MaxDelay.Nanoseconds() < ReschedulePolicyMinDelay.Nanoseconds() {
5706			_ = multierror.Append(&mErr, fmt.Errorf("Max Delay cannot be less than %v (got %v)", ReschedulePolicyMinDelay, r.Delay))
5707			delayPreCheck = false
5708		}
5709		if r.MaxDelay < r.Delay {
5710			_ = multierror.Append(&mErr, fmt.Errorf("Max Delay cannot be less than Delay %v (got %v)", r.Delay, r.MaxDelay))
5711			delayPreCheck = false
5712		}
5713
5714	}
5715
5716	// Validate Interval and other delay parameters if attempts are limited
5717	if !r.Unlimited {
5718		if r.Interval.Nanoseconds() < ReschedulePolicyMinInterval.Nanoseconds() {
5719			_ = multierror.Append(&mErr, fmt.Errorf("Interval cannot be less than %v (got %v)", ReschedulePolicyMinInterval, r.Interval))
5720		}
5721		if !delayPreCheck {
5722			// We can't cross validate the rest of the delay params if delayPreCheck fails, so return early
5723			return mErr.ErrorOrNil()
5724		}
5725		crossValidationErr := r.validateDelayParams()
5726		if crossValidationErr != nil {
5727			_ = multierror.Append(&mErr, crossValidationErr)
5728		}
5729	}
5730	return mErr.ErrorOrNil()
5731}
5732
5733func isValidDelayFunction(delayFunc string) bool {
5734	for _, value := range RescheduleDelayFunctions {
5735		if value == delayFunc {
5736			return true
5737		}
5738	}
5739	return false
5740}
5741
5742func (r *ReschedulePolicy) validateDelayParams() error {
5743	ok, possibleAttempts, recommendedInterval := r.viableAttempts()
5744	if ok {
5745		return nil
5746	}
5747	var mErr multierror.Error
5748	if r.DelayFunction == "constant" {
5749		_ = multierror.Append(&mErr, fmt.Errorf("Nomad can only make %v attempts in %v with initial delay %v and "+
5750			"delay function %q", possibleAttempts, r.Interval, r.Delay, r.DelayFunction))
5751	} else {
5752		_ = multierror.Append(&mErr, fmt.Errorf("Nomad can only make %v attempts in %v with initial delay %v, "+
5753			"delay function %q, and delay ceiling %v", possibleAttempts, r.Interval, r.Delay, r.DelayFunction, r.MaxDelay))
5754	}
5755	_ = multierror.Append(&mErr, fmt.Errorf("Set the interval to at least %v to accommodate %v attempts", recommendedInterval.Round(time.Second), r.Attempts))
5756	return mErr.ErrorOrNil()
5757}
5758
// viableAttempts computes, for the configured delay function, whether
// r.Attempts reschedule attempts fit within r.Interval. It returns:
//   - valid: whether all configured attempts fit in the interval
//   - possibleAttempts: how many attempts actually fit when they do not
//   - recommendedInterval: an interval large enough for all attempts
//
// An unrecognized delay function yields (false, 0, 0).
func (r *ReschedulePolicy) viableAttempts() (bool, int, time.Duration) {
	var possibleAttempts int
	var recommendedInterval time.Duration
	valid := true
	switch r.DelayFunction {
	case "constant":
		// Every attempt waits exactly r.Delay.
		recommendedInterval = time.Duration(r.Attempts) * r.Delay
		if r.Interval < recommendedInterval {
			possibleAttempts = int(r.Interval / r.Delay)
			valid = false
		}
	case "exponential":
		// Delay doubles each attempt (2^i * Delay) and is capped at MaxDelay.
		// NOTE(review): when the cap is not hit, recommendedInterval is
		// assigned (=) rather than accumulated (+=), while the capped branch
		// accumulates — confirm this asymmetry is intentional.
		for i := 0; i < r.Attempts; i++ {
			nextDelay := time.Duration(math.Pow(2, float64(i))) * r.Delay
			if nextDelay > r.MaxDelay {
				nextDelay = r.MaxDelay
				recommendedInterval += nextDelay
			} else {
				recommendedInterval = nextDelay
			}
			if recommendedInterval < r.Interval {
				possibleAttempts++
			}
		}
		if possibleAttempts < r.Attempts {
			valid = false
		}
	case "fibonacci":
		// Delays follow a Fibonacci progression seeded with (Delay, Delay);
		// once MaxDelay is reached the progression grows linearly by MaxDelay.
		var slots []time.Duration
		slots = append(slots, r.Delay)
		slots = append(slots, r.Delay)
		reachedCeiling := false
		for i := 2; i < r.Attempts; i++ {
			var nextDelay time.Duration
			if reachedCeiling {
				//switch to linear
				nextDelay = slots[i-1] + r.MaxDelay
			} else {
				nextDelay = slots[i-1] + slots[i-2]
				if nextDelay > r.MaxDelay {
					nextDelay = r.MaxDelay
					reachedCeiling = true
				}
			}
			slots = append(slots, nextDelay)
		}
		recommendedInterval = slots[len(slots)-1]
		if r.Interval < recommendedInterval {
			valid = false
			// calculate possible attempts: the first slot exceeding the
			// interval bounds how many attempts fit.
			for i := 0; i < len(slots); i++ {
				if slots[i] > r.Interval {
					possibleAttempts = i
					break
				}
			}
		}
	default:
		return false, 0, 0
	}
	if possibleAttempts < 0 { // can happen if delay is bigger than interval
		possibleAttempts = 0
	}
	return valid, possibleAttempts, recommendedInterval
}
5824
5825func NewReschedulePolicy(jobType string) *ReschedulePolicy {
5826	switch jobType {
5827	case JobTypeService:
5828		rp := DefaultServiceJobReschedulePolicy
5829		return &rp
5830	case JobTypeBatch:
5831		rp := DefaultBatchJobReschedulePolicy
5832		return &rp
5833	}
5834	return nil
5835}
5836
const (
	// MigrateStrategyHealthChecks selects the "checks" migration health mode.
	MigrateStrategyHealthChecks = "checks"
	// MigrateStrategyHealthStates selects the "task_states" migration health mode.
	MigrateStrategyHealthStates = "task_states"
)

// MigrateStrategy configures how allocations for a task group are migrated.
type MigrateStrategy struct {
	// MaxParallel bounds how many allocations are migrated at once.
	MaxParallel int
	// HealthCheck is one of the MigrateStrategyHealth* constants.
	HealthCheck string
	// MinHealthyTime and HealthyDeadline bound how long a migrated
	// allocation must be, and may take to become, healthy.
	MinHealthyTime  time.Duration
	HealthyDeadline time.Duration
}

// DefaultMigrateStrategy is used for backwards compat with pre-0.8 Allocations
// that lack an update strategy.
//
// This function should match its counterpart in api/tasks.go
func DefaultMigrateStrategy() *MigrateStrategy {
	m := MigrateStrategy{
		MaxParallel:     1,
		HealthCheck:     MigrateStrategyHealthChecks,
		MinHealthyTime:  10 * time.Second,
		HealthyDeadline: 5 * time.Minute,
	}
	return &m
}
5861
5862func (m *MigrateStrategy) Validate() error {
5863	var mErr multierror.Error
5864
5865	if m.MaxParallel < 0 {
5866		_ = multierror.Append(&mErr, fmt.Errorf("MaxParallel must be >= 0 but found %d", m.MaxParallel))
5867	}
5868
5869	switch m.HealthCheck {
5870	case MigrateStrategyHealthChecks, MigrateStrategyHealthStates:
5871		// ok
5872	case "":
5873		if m.MaxParallel > 0 {
5874			_ = multierror.Append(&mErr, fmt.Errorf("Missing HealthCheck"))
5875		}
5876	default:
5877		_ = multierror.Append(&mErr, fmt.Errorf("Invalid HealthCheck: %q", m.HealthCheck))
5878	}
5879
5880	if m.MinHealthyTime < 0 {
5881		_ = multierror.Append(&mErr, fmt.Errorf("MinHealthyTime is %s and must be >= 0", m.MinHealthyTime))
5882	}
5883
5884	if m.HealthyDeadline < 0 {
5885		_ = multierror.Append(&mErr, fmt.Errorf("HealthyDeadline is %s and must be >= 0", m.HealthyDeadline))
5886	}
5887
5888	if m.MinHealthyTime > m.HealthyDeadline {
5889		_ = multierror.Append(&mErr, fmt.Errorf("MinHealthyTime must be less than HealthyDeadline"))
5890	}
5891
5892	return mErr.ErrorOrNil()
5893}
5894
// TaskGroup is an atomic unit of placement. Each task group belongs to
// a job and may contain any number of tasks. A task group supports running
// in many replicas using the same configuration.
type TaskGroup struct {
	// Name of the task group
	Name string

	// Count is the number of replicas of this task group that should
	// be scheduled.
	Count int

	// Update is used to control the update strategy for this task group
	Update *UpdateStrategy

	// Migrate is used to control the migration strategy for this task group
	Migrate *MigrateStrategy

	// Constraints can be specified at a task group level and apply to
	// all the tasks contained.
	Constraints []*Constraint

	// Scaling is the list of autoscaling policies for the TaskGroup
	Scaling *ScalingPolicy

	// RestartPolicy of a TaskGroup
	RestartPolicy *RestartPolicy

	// Tasks are the collection of tasks that this task group needs to run
	Tasks []*Task

	// EphemeralDisk is the disk resources that the task group requests
	EphemeralDisk *EphemeralDisk

	// Meta is used to associate arbitrary metadata with this
	// task group. This is opaque to Nomad.
	Meta map[string]string

	// ReschedulePolicy is used to configure how the scheduler should
	// retry failed allocations.
	ReschedulePolicy *ReschedulePolicy

	// Affinities can be specified at the task group level to express
	// scheduling preferences.
	Affinities []*Affinity

	// Spread can be specified at the task group level to express spreading
	// allocations across a desired attribute, such as datacenter
	Spreads []*Spread

	// Networks are the network configuration for the task group. This can be
	// overridden in the task.
	Networks Networks

	// Consul configuration specific to this task group
	Consul *Consul

	// Services this group provides
	Services []*Service

	// Volumes is a map of volumes that have been requested by the task group.
	Volumes map[string]*VolumeRequest

	// ShutdownDelay is the amount of time to wait between deregistering
	// group services in consul and stopping tasks.
	ShutdownDelay *time.Duration

	// StopAfterClientDisconnect, if set, configures the client to stop the task group
	// after this duration since the last known good heartbeat
	StopAfterClientDisconnect *time.Duration
}
5965
// Copy returns a deep copy of the task group, or nil for a nil receiver.
// The value copy handles all scalar fields; reference-typed fields are then
// replaced with their own copies.
func (tg *TaskGroup) Copy() *TaskGroup {
	if tg == nil {
		return nil
	}
	ntg := new(TaskGroup)
	*ntg = *tg
	ntg.Update = ntg.Update.Copy()
	ntg.Constraints = CopySliceConstraints(ntg.Constraints)
	ntg.RestartPolicy = ntg.RestartPolicy.Copy()
	ntg.ReschedulePolicy = ntg.ReschedulePolicy.Copy()
	ntg.Affinities = CopySliceAffinities(ntg.Affinities)
	ntg.Spreads = CopySliceSpreads(ntg.Spreads)
	ntg.Volumes = CopyMapVolumeRequest(ntg.Volumes)
	ntg.Scaling = ntg.Scaling.Copy()
	ntg.Consul = ntg.Consul.Copy()

	// Copy the network objects
	if tg.Networks != nil {
		n := len(tg.Networks)
		ntg.Networks = make([]*NetworkResource, n)
		for i := 0; i < n; i++ {
			ntg.Networks[i] = tg.Networks[i].Copy()
		}
	}

	// Deep copy each task.
	if tg.Tasks != nil {
		tasks := make([]*Task, len(ntg.Tasks))
		for i, t := range ntg.Tasks {
			tasks[i] = t.Copy()
		}
		ntg.Tasks = tasks
	}

	ntg.Meta = helper.CopyMapStringString(ntg.Meta)

	if tg.EphemeralDisk != nil {
		ntg.EphemeralDisk = tg.EphemeralDisk.Copy()
	}

	if tg.Services != nil {
		ntg.Services = make([]*Service, len(tg.Services))
		for i, s := range tg.Services {
			ntg.Services[i] = s.Copy()
		}
	}

	// NOTE(review): these assignments copy the pointers, not the pointed-to
	// durations, so the copy aliases the original's ShutdownDelay and
	// StopAfterClientDisconnect values (and the assignments are redundant
	// after the struct copy above) — confirm deep copies are not required.
	if tg.ShutdownDelay != nil {
		ntg.ShutdownDelay = tg.ShutdownDelay
	}

	if tg.StopAfterClientDisconnect != nil {
		ntg.StopAfterClientDisconnect = tg.StopAfterClientDisconnect
	}

	return ntg
}
6022
6023// Canonicalize is used to canonicalize fields in the TaskGroup.
6024func (tg *TaskGroup) Canonicalize(job *Job) {
6025	// Ensure that an empty and nil map are treated the same to avoid scheduling
6026	// problems since we use reflect DeepEquals.
6027	if len(tg.Meta) == 0 {
6028		tg.Meta = nil
6029	}
6030
6031	// Set the default restart policy.
6032	if tg.RestartPolicy == nil {
6033		tg.RestartPolicy = NewRestartPolicy(job.Type)
6034	}
6035
6036	if tg.ReschedulePolicy == nil {
6037		tg.ReschedulePolicy = NewReschedulePolicy(job.Type)
6038	}
6039
6040	// Canonicalize Migrate for service jobs
6041	if job.Type == JobTypeService && tg.Migrate == nil {
6042		tg.Migrate = DefaultMigrateStrategy()
6043	}
6044
6045	// Set a default ephemeral disk object if the user has not requested for one
6046	if tg.EphemeralDisk == nil {
6047		tg.EphemeralDisk = DefaultEphemeralDisk()
6048	}
6049
6050	if tg.Scaling != nil {
6051		tg.Scaling.Canonicalize()
6052	}
6053
6054	for _, service := range tg.Services {
6055		service.Canonicalize(job.Name, tg.Name, "group")
6056	}
6057
6058	for _, network := range tg.Networks {
6059		network.Canonicalize()
6060	}
6061
6062	for _, task := range tg.Tasks {
6063		task.Canonicalize(job, tg)
6064	}
6065}
6066
// Validate is used to check a task group for reasonable configuration.
// All problems found are accumulated and returned as a single multierror
// (nil when the group is valid). The job is needed because several rules
// depend on the job type.
func (tg *TaskGroup) Validate(j *Job) error {
	var mErr multierror.Error
	if tg.Name == "" {
		mErr.Errors = append(mErr.Errors, errors.New("Missing task group name"))
	} else if strings.Contains(tg.Name, "\000") {
		mErr.Errors = append(mErr.Errors, errors.New("Task group name contains null character"))
	}
	if tg.Count < 0 {
		mErr.Errors = append(mErr.Errors, errors.New("Task group count can't be negative"))
	}
	if len(tg.Tasks) == 0 {
		// could be a lone consul gateway inserted by the connect mutator
		mErr.Errors = append(mErr.Errors, errors.New("Missing tasks for task group"))
	}

	// Validate each constraint, reporting the 1-based index on failure.
	for idx, constr := range tg.Constraints {
		if err := constr.Validate(); err != nil {
			outer := fmt.Errorf("Constraint %d validation failed: %s", idx+1, err)
			mErr.Errors = append(mErr.Errors, outer)
		}
	}
	// System jobs may not use affinities; otherwise validate each one.
	if j.Type == JobTypeSystem {
		if tg.Affinities != nil {
			mErr.Errors = append(mErr.Errors, fmt.Errorf("System jobs may not have an affinity stanza"))
		}
	} else {
		for idx, affinity := range tg.Affinities {
			if err := affinity.Validate(); err != nil {
				outer := fmt.Errorf("Affinity %d validation failed: %s", idx+1, err)
				mErr.Errors = append(mErr.Errors, outer)
			}
		}
	}

	// A restart policy is required (Canonicalize normally provides one).
	if tg.RestartPolicy != nil {
		if err := tg.RestartPolicy.Validate(); err != nil {
			mErr.Errors = append(mErr.Errors, err)
		}
	} else {
		mErr.Errors = append(mErr.Errors, fmt.Errorf("Task Group %v should have a restart policy", tg.Name))
	}

	// System jobs may not use spreads; otherwise validate each one.
	if j.Type == JobTypeSystem {
		if tg.Spreads != nil {
			mErr.Errors = append(mErr.Errors, fmt.Errorf("System jobs may not have a spread stanza"))
		}
	} else {
		for idx, spread := range tg.Spreads {
			if err := spread.Validate(); err != nil {
				outer := fmt.Errorf("Spread %d validation failed: %s", idx+1, err)
				mErr.Errors = append(mErr.Errors, outer)
			}
		}
	}

	// System jobs are never rescheduled; all other job types require a
	// reschedule policy (Canonicalize normally provides one).
	if j.Type == JobTypeSystem {
		if tg.ReschedulePolicy != nil {
			mErr.Errors = append(mErr.Errors, fmt.Errorf("System jobs should not have a reschedule policy"))
		}
	} else {
		if tg.ReschedulePolicy != nil {
			if err := tg.ReschedulePolicy.Validate(); err != nil {
				mErr.Errors = append(mErr.Errors, err)
			}
		} else {
			mErr.Errors = append(mErr.Errors, fmt.Errorf("Task Group %v should have a reschedule policy", tg.Name))
		}
	}

	// An ephemeral disk is required (Canonicalize normally provides one).
	if tg.EphemeralDisk != nil {
		if err := tg.EphemeralDisk.Validate(); err != nil {
			mErr.Errors = append(mErr.Errors, err)
		}
	} else {
		mErr.Errors = append(mErr.Errors, fmt.Errorf("Task Group %v should have an ephemeral disk object", tg.Name))
	}

	// Validate the update strategy
	if u := tg.Update; u != nil {
		switch j.Type {
		case JobTypeService, JobTypeSystem:
		default:
			mErr.Errors = append(mErr.Errors, fmt.Errorf("Job type %q does not allow update block", j.Type))
		}
		if err := u.Validate(); err != nil {
			mErr.Errors = append(mErr.Errors, err)
		}
	}

	// Validate the migration strategy; only service jobs may migrate.
	switch j.Type {
	case JobTypeService:
		if tg.Migrate != nil {
			if err := tg.Migrate.Validate(); err != nil {
				mErr.Errors = append(mErr.Errors, err)
			}
		}
	default:
		if tg.Migrate != nil {
			mErr.Errors = append(mErr.Errors, fmt.Errorf("Job type %q does not allow migrate block", j.Type))
		}
	}

	// Check that task names are non-empty and unique, and that there is
	// only one leader task if any.
	tasks := make(map[string]int)
	leaderTasks := 0
	for idx, task := range tg.Tasks {
		if task.Name == "" {
			mErr.Errors = append(mErr.Errors, fmt.Errorf("Task %d missing name", idx+1))
		} else if existing, ok := tasks[task.Name]; ok {
			mErr.Errors = append(mErr.Errors, fmt.Errorf("Task %d redefines '%s' from task %d", idx+1, task.Name, existing+1))
		} else {
			tasks[task.Name] = idx
		}

		if task.Leader {
			leaderTasks++
		}
	}

	if leaderTasks > 1 {
		mErr.Errors = append(mErr.Errors, fmt.Errorf("Only one task may be marked as leader"))
	}

	// Validate the volume requests; the canary count influences per-volume
	// validation.
	var canaries int
	if tg.Update != nil {
		canaries = tg.Update.Canary
	}
	for name, volReq := range tg.Volumes {
		if err := volReq.Validate(canaries); err != nil {
			mErr.Errors = append(mErr.Errors, fmt.Errorf(
				"Task group volume validation for %s failed: %v", name, err))
		}
	}

	// Validate task group and task network resources
	if err := tg.validateNetworks(); err != nil {
		outer := fmt.Errorf("Task group network validation failed: %v", err)
		mErr.Errors = append(mErr.Errors, outer)
	}

	// Validate task group and task services
	if err := tg.validateServices(); err != nil {
		outer := fmt.Errorf("Task group service validation failed: %v", err)
		mErr.Errors = append(mErr.Errors, outer)
	}

	// Validate group service script-checks
	if err := tg.validateScriptChecksInGroupServices(); err != nil {
		outer := fmt.Errorf("Task group service check validation failed: %v", err)
		mErr.Errors = append(mErr.Errors, outer)
	}

	// Validate the scaling policy
	if err := tg.validateScalingPolicy(j); err != nil {
		outer := fmt.Errorf("Task group scaling policy validation failed: %v", err)
		mErr.Errors = append(mErr.Errors, outer)
	}

	// Validate the tasks
	for _, task := range tg.Tasks {
		// Validate the task does not reference undefined volume mounts
		for i, mnt := range task.VolumeMounts {
			if mnt.Volume == "" {
				mErr.Errors = append(mErr.Errors, fmt.Errorf("Task %s has a volume mount (%d) referencing an empty volume", task.Name, i))
				continue
			}

			if _, ok := tg.Volumes[mnt.Volume]; !ok {
				mErr.Errors = append(mErr.Errors, fmt.Errorf("Task %s has a volume mount (%d) referencing undefined volume %s", task.Name, i, mnt.Volume))
				continue
			}
		}

		if err := task.Validate(tg.EphemeralDisk, j.Type, tg.Services, tg.Networks); err != nil {
			outer := fmt.Errorf("Task %s validation failed: %v", task.Name, err)
			mErr.Errors = append(mErr.Errors, outer)
		}
	}
	return mErr.ErrorOrNil()
}
6250
6251func (tg *TaskGroup) validateNetworks() error {
6252	var mErr multierror.Error
6253	portLabels := make(map[string]string)
6254	// host_network -> static port tracking
6255	staticPortsIndex := make(map[string]map[int]string)
6256
6257	for _, net := range tg.Networks {
6258		for _, port := range append(net.ReservedPorts, net.DynamicPorts...) {
6259			if other, ok := portLabels[port.Label]; ok {
6260				mErr.Errors = append(mErr.Errors, fmt.Errorf("Port label %s already in use by %s", port.Label, other))
6261			} else {
6262				portLabels[port.Label] = "taskgroup network"
6263			}
6264
6265			if port.Value != 0 {
6266				hostNetwork := port.HostNetwork
6267				if hostNetwork == "" {
6268					hostNetwork = "default"
6269				}
6270				staticPorts, ok := staticPortsIndex[hostNetwork]
6271				if !ok {
6272					staticPorts = make(map[int]string)
6273				}
6274				// static port
6275				if other, ok := staticPorts[port.Value]; ok {
6276					err := fmt.Errorf("Static port %d already reserved by %s", port.Value, other)
6277					mErr.Errors = append(mErr.Errors, err)
6278				} else if port.Value > math.MaxUint16 {
6279					err := fmt.Errorf("Port %s (%d) cannot be greater than %d", port.Label, port.Value, math.MaxUint16)
6280					mErr.Errors = append(mErr.Errors, err)
6281				} else {
6282					staticPorts[port.Value] = fmt.Sprintf("taskgroup network:%s", port.Label)
6283					staticPortsIndex[hostNetwork] = staticPorts
6284				}
6285			}
6286
6287			if port.To < -1 {
6288				err := fmt.Errorf("Port %q cannot be mapped to negative value %d", port.Label, port.To)
6289				mErr.Errors = append(mErr.Errors, err)
6290			} else if port.To > math.MaxUint16 {
6291				err := fmt.Errorf("Port %q cannot be mapped to a port (%d) greater than %d", port.Label, port.To, math.MaxUint16)
6292				mErr.Errors = append(mErr.Errors, err)
6293			}
6294		}
6295	}
6296	// Check for duplicate tasks or port labels, and no duplicated static ports
6297	for _, task := range tg.Tasks {
6298		if task.Resources == nil {
6299			continue
6300		}
6301
6302		for _, net := range task.Resources.Networks {
6303			for _, port := range append(net.ReservedPorts, net.DynamicPorts...) {
6304				if other, ok := portLabels[port.Label]; ok {
6305					mErr.Errors = append(mErr.Errors, fmt.Errorf("Port label %s already in use by %s", port.Label, other))
6306				}
6307
6308				if port.Value != 0 {
6309					hostNetwork := port.HostNetwork
6310					if hostNetwork == "" {
6311						hostNetwork = "default"
6312					}
6313					staticPorts, ok := staticPortsIndex[hostNetwork]
6314					if !ok {
6315						staticPorts = make(map[int]string)
6316					}
6317					if other, ok := staticPorts[port.Value]; ok {
6318						err := fmt.Errorf("Static port %d already reserved by %s", port.Value, other)
6319						mErr.Errors = append(mErr.Errors, err)
6320					} else if port.Value > math.MaxUint16 {
6321						err := fmt.Errorf("Port %s (%d) cannot be greater than %d", port.Label, port.Value, math.MaxUint16)
6322						mErr.Errors = append(mErr.Errors, err)
6323					} else {
6324						staticPorts[port.Value] = fmt.Sprintf("%s:%s", task.Name, port.Label)
6325						staticPortsIndex[hostNetwork] = staticPorts
6326					}
6327				}
6328			}
6329		}
6330	}
6331	return mErr.ErrorOrNil()
6332}
6333
6334// validateServices runs Service.Validate() on group-level services,
6335// checks that group services do not conflict with task services and that
6336// group service checks that refer to tasks only refer to tasks that exist.
6337func (tg *TaskGroup) validateServices() error {
6338	var mErr multierror.Error
6339	knownTasks := make(map[string]struct{})
6340	knownServices := make(map[string]struct{})
6341
6342	// Create a map of known tasks and their services so we can compare
6343	// vs the group-level services and checks
6344	for _, task := range tg.Tasks {
6345		knownTasks[task.Name] = struct{}{}
6346		if task.Services == nil {
6347			continue
6348		}
6349		for _, service := range task.Services {
6350			if _, ok := knownServices[service.Name+service.PortLabel]; ok {
6351				mErr.Errors = append(mErr.Errors, fmt.Errorf("Service %s is duplicate", service.Name))
6352			}
6353			for _, check := range service.Checks {
6354				if check.TaskName != "" {
6355					mErr.Errors = append(mErr.Errors, fmt.Errorf("Check %s is invalid: only task group service checks can be assigned tasks", check.Name))
6356				}
6357			}
6358			knownServices[service.Name+service.PortLabel] = struct{}{}
6359		}
6360	}
6361	for i, service := range tg.Services {
6362		if err := service.Validate(); err != nil {
6363			outer := fmt.Errorf("Service[%d] %s validation failed: %s", i, service.Name, err)
6364			mErr.Errors = append(mErr.Errors, outer)
6365			// we break here to avoid the risk of crashing on null-pointer
6366			// access in a later step, accepting that we might miss out on
6367			// error messages to provide the user.
6368			continue
6369		}
6370		if service.AddressMode == AddressModeDriver {
6371			mErr.Errors = append(mErr.Errors, fmt.Errorf("service %q cannot use address_mode=\"driver\", only services defined in a \"task\" block can use this mode", service.Name))
6372		}
6373		if _, ok := knownServices[service.Name+service.PortLabel]; ok {
6374			mErr.Errors = append(mErr.Errors, fmt.Errorf("Service %s is duplicate", service.Name))
6375		}
6376		knownServices[service.Name+service.PortLabel] = struct{}{}
6377		for _, check := range service.Checks {
6378			if check.TaskName != "" {
6379				if check.Type != ServiceCheckScript && check.Type != ServiceCheckGRPC {
6380					mErr.Errors = append(mErr.Errors,
6381						fmt.Errorf("Check %s invalid: only script and gRPC checks should have tasks", check.Name))
6382				}
6383				if check.AddressMode == AddressModeDriver {
6384					mErr.Errors = append(mErr.Errors, fmt.Errorf("Check %q invalid: cannot use address_mode=\"driver\", only checks defined in a \"task\" service block can use this mode", service.Name))
6385				}
6386				if _, ok := knownTasks[check.TaskName]; !ok {
6387					mErr.Errors = append(mErr.Errors,
6388						fmt.Errorf("Check %s invalid: refers to non-existent task %s", check.Name, check.TaskName))
6389				}
6390			}
6391		}
6392	}
6393	return mErr.ErrorOrNil()
6394}
6395
6396// validateScriptChecksInGroupServices ensures group-level services with script
6397// checks know what task driver to use. Either the service.task or service.check.task
6398// parameter must be configured.
6399func (tg *TaskGroup) validateScriptChecksInGroupServices() error {
6400	var mErr multierror.Error
6401	for _, service := range tg.Services {
6402		if service.TaskName == "" {
6403			for _, check := range service.Checks {
6404				if check.Type == "script" && check.TaskName == "" {
6405					mErr.Errors = append(mErr.Errors,
6406						fmt.Errorf("Service [%s]->%s or Check %s must specify task parameter",
6407							tg.Name, service.Name, check.Name,
6408						))
6409				}
6410			}
6411		}
6412	}
6413	return mErr.ErrorOrNil()
6414}
6415
6416// validateScalingPolicy ensures that the scaling policy has consistent
6417// min and max, not in conflict with the task group count
6418func (tg *TaskGroup) validateScalingPolicy(j *Job) error {
6419	if tg.Scaling == nil {
6420		return nil
6421	}
6422
6423	var mErr multierror.Error
6424
6425	err := tg.Scaling.Validate()
6426	if err != nil {
6427		// prefix scaling policy errors
6428		if me, ok := err.(*multierror.Error); ok {
6429			for _, e := range me.Errors {
6430				mErr.Errors = append(mErr.Errors, fmt.Errorf("Scaling policy invalid: %s", e))
6431			}
6432		}
6433	}
6434
6435	if tg.Scaling.Max < int64(tg.Count) {
6436		mErr.Errors = append(mErr.Errors,
6437			fmt.Errorf("Scaling policy invalid: task group count must not be greater than maximum count in scaling policy"))
6438	}
6439
6440	if int64(tg.Count) < tg.Scaling.Min && !(j.IsMultiregion() && tg.Count == 0 && j.Region == "global") {
6441		mErr.Errors = append(mErr.Errors,
6442			fmt.Errorf("Scaling policy invalid: task group count must not be less than minimum count in scaling policy"))
6443	}
6444
6445	return mErr.ErrorOrNil()
6446}
6447
6448// Warnings returns a list of warnings that may be from dubious settings or
6449// deprecation warnings.
6450func (tg *TaskGroup) Warnings(j *Job) error {
6451	var mErr multierror.Error
6452
6453	// Validate the update strategy
6454	if u := tg.Update; u != nil {
6455		// Check the counts are appropriate
6456		if u.MaxParallel > tg.Count && !(j.IsMultiregion() && tg.Count == 0) {
6457			mErr.Errors = append(mErr.Errors,
6458				fmt.Errorf("Update max parallel count is greater than task group count (%d > %d). "+
6459					"A destructive change would result in the simultaneous replacement of all allocations.", u.MaxParallel, tg.Count))
6460		}
6461	}
6462
6463	// Check for mbits network field
6464	if len(tg.Networks) > 0 && tg.Networks[0].MBits > 0 {
6465		mErr.Errors = append(mErr.Errors, fmt.Errorf("mbits has been deprecated as of Nomad 0.12.0. Please remove mbits from the network block"))
6466	}
6467
6468	for _, t := range tg.Tasks {
6469		if err := t.Warnings(); err != nil {
6470			err = multierror.Prefix(err, fmt.Sprintf("Task %q:", t.Name))
6471			mErr.Errors = append(mErr.Errors, err)
6472		}
6473	}
6474
6475	return mErr.ErrorOrNil()
6476}
6477
6478// LookupTask finds a task by name
6479func (tg *TaskGroup) LookupTask(name string) *Task {
6480	for _, t := range tg.Tasks {
6481		if t.Name == name {
6482			return t
6483		}
6484	}
6485	return nil
6486}
6487
6488// UsesConnect for convenience returns true if the TaskGroup contains at least
6489// one service that makes use of Consul Connect features.
6490//
6491// Currently used for validating that the task group contains one or more connect
6492// aware services before generating a service identity token.
6493func (tg *TaskGroup) UsesConnect() bool {
6494	for _, service := range tg.Services {
6495		if service.Connect != nil {
6496			if service.Connect.IsNative() || service.Connect.HasSidecar() || service.Connect.IsGateway() {
6497				return true
6498			}
6499		}
6500	}
6501	return false
6502}
6503
6504// UsesConnectGateway for convenience returns true if the TaskGroup contains at
6505// least one service that makes use of Consul Connect Gateway features.
6506func (tg *TaskGroup) UsesConnectGateway() bool {
6507	for _, service := range tg.Services {
6508		if service.Connect != nil {
6509			if service.Connect.IsGateway() {
6510				return true
6511			}
6512		}
6513	}
6514	return false
6515}
6516
// GoString implements fmt.GoStringer, rendering the dereferenced TaskGroup
// in Go syntax for debugging output.
func (tg *TaskGroup) GoString() string {
	return fmt.Sprintf("*%#v", *tg)
}
6520
// CheckRestart describes if and when a task should be restarted based on
// failing health checks.
type CheckRestart struct {
	Limit          int           // Restart the task after this many consecutive unhealthy intervals
	Grace          time.Duration // Grace time to give tasks after starting to get healthy before counting checks
	IgnoreWarnings bool          // If true, treat checks in `warning` status as passing
}
6528
6529func (c *CheckRestart) Copy() *CheckRestart {
6530	if c == nil {
6531		return nil
6532	}
6533
6534	nc := new(CheckRestart)
6535	*nc = *c
6536	return nc
6537}
6538
6539func (c *CheckRestart) Equals(o *CheckRestart) bool {
6540	if c == nil || o == nil {
6541		return c == o
6542	}
6543
6544	if c.Limit != o.Limit {
6545		return false
6546	}
6547
6548	if c.Grace != o.Grace {
6549		return false
6550	}
6551
6552	if c.IgnoreWarnings != o.IgnoreWarnings {
6553		return false
6554	}
6555
6556	return true
6557}
6558
6559func (c *CheckRestart) Validate() error {
6560	if c == nil {
6561		return nil
6562	}
6563
6564	var mErr multierror.Error
6565	if c.Limit < 0 {
6566		mErr.Errors = append(mErr.Errors, fmt.Errorf("limit must be greater than or equal to 0 but found %d", c.Limit))
6567	}
6568
6569	if c.Grace < 0 {
6570		mErr.Errors = append(mErr.Errors, fmt.Errorf("grace period must be greater than or equal to 0 but found %d", c.Grace))
6571	}
6572
6573	return mErr.ErrorOrNil()
6574}
6575
const (
	// DefaultKillTimeout is the default timeout between signaling a task that
	// it will be killed and forcefully killing it.
	DefaultKillTimeout = 5 * time.Second
)
6581
// LogConfig provides configuration for log rotation
type LogConfig struct {
	MaxFiles      int // maximum number of rotated log files to retain
	MaxFileSizeMB int // maximum size of each log file, in megabytes
}
6587
6588func (l *LogConfig) Equals(o *LogConfig) bool {
6589	if l == nil || o == nil {
6590		return l == o
6591	}
6592
6593	if l.MaxFiles != o.MaxFiles {
6594		return false
6595	}
6596
6597	if l.MaxFileSizeMB != o.MaxFileSizeMB {
6598		return false
6599	}
6600
6601	return true
6602}
6603
6604func (l *LogConfig) Copy() *LogConfig {
6605	if l == nil {
6606		return nil
6607	}
6608	return &LogConfig{
6609		MaxFiles:      l.MaxFiles,
6610		MaxFileSizeMB: l.MaxFileSizeMB,
6611	}
6612}
6613
6614// DefaultLogConfig returns the default LogConfig values.
6615func DefaultLogConfig() *LogConfig {
6616	return &LogConfig{
6617		MaxFiles:      10,
6618		MaxFileSizeMB: 10,
6619	}
6620}
6621
6622// Validate returns an error if the log config specified are less than
6623// the minimum allowed.
6624func (l *LogConfig) Validate() error {
6625	var mErr multierror.Error
6626	if l.MaxFiles < 1 {
6627		mErr.Errors = append(mErr.Errors, fmt.Errorf("minimum number of files is 1; got %d", l.MaxFiles))
6628	}
6629	if l.MaxFileSizeMB < 1 {
6630		mErr.Errors = append(mErr.Errors, fmt.Errorf("minimum file size is 1MB; got %d", l.MaxFileSizeMB))
6631	}
6632	return mErr.ErrorOrNil()
6633}
6634
// Task is a single process typically that is executed as part of a task group.
type Task struct {
	// Name of the task
	Name string

	// Driver is used to control which driver is used
	Driver string

	// User is used to determine which user will run the task. It defaults to
	// the same user the Nomad client is being run as.
	User string

	// Config is provided to the driver to initialize
	Config map[string]interface{}

	// Map of environment variables to be used by the driver
	Env map[string]string

	// List of service definitions exposed by the Task
	Services []*Service

	// Vault is used to define the set of Vault policies that this task should
	// have access to.
	Vault *Vault

	// Templates are the set of templates to be rendered for the task.
	Templates []*Template

	// Constraints can be specified at a task level and apply only to
	// the particular task.
	Constraints []*Constraint

	// Affinities can be specified at the task level to express
	// scheduling preferences
	Affinities []*Affinity

	// Resources is the resources needed by this task
	Resources *Resources

	// RestartPolicy of a TaskGroup
	RestartPolicy *RestartPolicy

	// DispatchPayload configures how the task retrieves its input from a dispatch
	DispatchPayload *DispatchPayloadConfig

	// Lifecycle configures when this task runs within the allocation's
	// lifecycle relative to the other tasks of the group.
	Lifecycle *TaskLifecycleConfig

	// Meta is used to associate arbitrary metadata with this
	// task. This is opaque to Nomad.
	Meta map[string]string

	// KillTimeout is the time between signaling a task that it will be
	// killed and killing it.
	KillTimeout time.Duration

	// LogConfig provides configuration for log rotation
	LogConfig *LogConfig

	// Artifacts is a list of artifacts to download and extract before running
	// the task.
	Artifacts []*TaskArtifact

	// Leader marks the task as the leader within the group. When the leader
	// task exits, other tasks will be gracefully terminated.
	Leader bool

	// ShutdownDelay is the duration of the delay between deregistering a
	// task from Consul and sending it a signal to shutdown. See #2441
	ShutdownDelay time.Duration

	// VolumeMounts is a list of Volume name <-> mount configurations that will be
	// attached to this task.
	VolumeMounts []*VolumeMount

	// ScalingPolicies is a list of scaling policies scoped to this task
	ScalingPolicies []*ScalingPolicy

	// KillSignal is the kill signal to use for the task. This is an optional
	// specification and defaults to SIGINT
	KillSignal string

	// Used internally to manage tasks according to their TaskKind. Initial use case
	// is for Consul Connect
	Kind TaskKind

	// CSIPluginConfig is used to configure the plugin supervisor for the task.
	CSIPluginConfig *TaskCSIPluginConfig
}
6723
6724// UsesConnect is for conveniently detecting if the Task is able to make use
6725// of Consul Connect features. This will be indicated in the TaskKind of the
6726// Task, which exports known types of Tasks. UsesConnect will be true if the
6727// task is a connect proxy, connect native, or is a connect gateway.
6728func (t *Task) UsesConnect() bool {
6729	return t.Kind.IsConnectNative() || t.UsesConnectSidecar()
6730}
6731
6732func (t *Task) UsesConnectSidecar() bool {
6733	return t.Kind.IsConnectProxy() || t.Kind.IsAnyConnectGateway()
6734}
6735
// Copy returns a deep copy of the task, or nil for a nil receiver. All
// nested reference-typed fields are duplicated so the copy can be mutated
// without affecting the original.
func (t *Task) Copy() *Task {
	if t == nil {
		return nil
	}
	// Start with a shallow copy, then replace every reference-typed field.
	nt := new(Task)
	*nt = *t
	nt.Env = helper.CopyMapStringString(nt.Env)

	if t.Services != nil {
		services := make([]*Service, len(nt.Services))
		for i, s := range nt.Services {
			services[i] = s.Copy()
		}
		nt.Services = services
	}

	nt.Constraints = CopySliceConstraints(nt.Constraints)
	nt.Affinities = CopySliceAffinities(nt.Affinities)
	nt.VolumeMounts = CopySliceVolumeMount(nt.VolumeMounts)
	nt.CSIPluginConfig = nt.CSIPluginConfig.Copy()

	nt.Vault = nt.Vault.Copy()
	nt.Resources = nt.Resources.Copy()
	nt.LogConfig = nt.LogConfig.Copy()
	nt.Meta = helper.CopyMapStringString(nt.Meta)
	nt.DispatchPayload = nt.DispatchPayload.Copy()
	nt.Lifecycle = nt.Lifecycle.Copy()

	if t.Artifacts != nil {
		artifacts := make([]*TaskArtifact, 0, len(t.Artifacts))
		for _, a := range nt.Artifacts {
			artifacts = append(artifacts, a.Copy())
		}
		nt.Artifacts = artifacts
	}

	// Config is arbitrary nested driver data; copystructure performs the
	// deep copy. A failure here means the value is uncopyable, which is a
	// programmer error, hence the panic.
	if i, err := copystructure.Copy(nt.Config); err != nil {
		panic(err.Error())
	} else {
		nt.Config = i.(map[string]interface{})
	}

	if t.Templates != nil {
		templates := make([]*Template, len(t.Templates))
		for i, tmpl := range nt.Templates {
			templates[i] = tmpl.Copy()
		}
		nt.Templates = templates
	}

	return nt
}
6788
6789// Canonicalize canonicalizes fields in the task.
6790func (t *Task) Canonicalize(job *Job, tg *TaskGroup) {
6791	// Ensure that an empty and nil map are treated the same to avoid scheduling
6792	// problems since we use reflect DeepEquals.
6793	if len(t.Meta) == 0 {
6794		t.Meta = nil
6795	}
6796	if len(t.Config) == 0 {
6797		t.Config = nil
6798	}
6799	if len(t.Env) == 0 {
6800		t.Env = nil
6801	}
6802
6803	for _, service := range t.Services {
6804		service.Canonicalize(job.Name, tg.Name, t.Name)
6805	}
6806
6807	// If Resources are nil initialize them to defaults, otherwise canonicalize
6808	if t.Resources == nil {
6809		t.Resources = DefaultResources()
6810	} else {
6811		t.Resources.Canonicalize()
6812	}
6813
6814	if t.RestartPolicy == nil {
6815		t.RestartPolicy = tg.RestartPolicy
6816	}
6817
6818	// Set the default timeout if it is not specified.
6819	if t.KillTimeout == 0 {
6820		t.KillTimeout = DefaultKillTimeout
6821	}
6822
6823	if t.Vault != nil {
6824		t.Vault.Canonicalize()
6825	}
6826
6827	for _, template := range t.Templates {
6828		template.Canonicalize()
6829	}
6830}
6831
// GoString implements fmt.GoStringer, rendering the dereferenced Task in Go
// syntax for debugging output.
func (t *Task) GoString() string {
	return fmt.Sprintf("*%#v", *t)
}
6835
// Validate is used to check a task for reasonable configuration. It checks
// the name, driver, timeouts, resources, log config, constraints,
// affinities, services, artifacts, Vault, templates, dispatch payload,
// lifecycle, Connect kind, volume mounts, and CSI plugin config,
// accumulating all problems into a single multierror.
func (t *Task) Validate(ephemeralDisk *EphemeralDisk, jobType string, tgServices []*Service, tgNetworks Networks) error {
	var mErr multierror.Error
	if t.Name == "" {
		mErr.Errors = append(mErr.Errors, errors.New("Missing task name"))
	}
	if strings.ContainsAny(t.Name, `/\`) {
		// We enforce this so that when creating the directory on disk it will
		// not have any slashes.
		mErr.Errors = append(mErr.Errors, errors.New("Task name cannot include slashes"))
	} else if strings.Contains(t.Name, "\000") {
		// NOTE(review): this branch is only reached when the name has no
		// slashes, so a name with both defects reports only the slash error.
		mErr.Errors = append(mErr.Errors, errors.New("Task name cannot include null characters"))
	}
	if t.Driver == "" {
		mErr.Errors = append(mErr.Errors, errors.New("Missing task driver"))
	}
	if t.KillTimeout < 0 {
		mErr.Errors = append(mErr.Errors, errors.New("KillTimeout must be a positive value"))
	}
	if t.ShutdownDelay < 0 {
		mErr.Errors = append(mErr.Errors, errors.New("ShutdownDelay must be a positive value"))
	}

	// Validate the resources.
	if t.Resources == nil {
		mErr.Errors = append(mErr.Errors, errors.New("Missing task resources"))
	} else if err := t.Resources.Validate(); err != nil {
		mErr.Errors = append(mErr.Errors, err)
	}

	// Validate the log config
	if t.LogConfig == nil {
		mErr.Errors = append(mErr.Errors, errors.New("Missing Log Config"))
	} else if err := t.LogConfig.Validate(); err != nil {
		mErr.Errors = append(mErr.Errors, err)
	}

	// Validate constraints; distinct_hosts/distinct_property only make sense
	// at higher levels than a single task.
	for idx, constr := range t.Constraints {
		if err := constr.Validate(); err != nil {
			outer := fmt.Errorf("Constraint %d validation failed: %s", idx+1, err)
			mErr.Errors = append(mErr.Errors, outer)
		}

		switch constr.Operand {
		case ConstraintDistinctHosts, ConstraintDistinctProperty:
			outer := fmt.Errorf("Constraint %d has disallowed Operand at task level: %s", idx+1, constr.Operand)
			mErr.Errors = append(mErr.Errors, outer)
		}
	}

	// System jobs may not use affinities at all; otherwise validate each one.
	if jobType == JobTypeSystem {
		if t.Affinities != nil {
			mErr.Errors = append(mErr.Errors, fmt.Errorf("System jobs may not have an affinity stanza"))
		}
	} else {
		for idx, affinity := range t.Affinities {
			if err := affinity.Validate(); err != nil {
				outer := fmt.Errorf("Affinity %d validation failed: %s", idx+1, err)
				mErr.Errors = append(mErr.Errors, outer)
			}
		}
	}

	// Validate Services
	if err := validateServices(t, tgNetworks); err != nil {
		mErr.Errors = append(mErr.Errors, err)
	}

	// The worst-case log storage must fit within the ephemeral disk.
	if t.LogConfig != nil && ephemeralDisk != nil {
		logUsage := (t.LogConfig.MaxFiles * t.LogConfig.MaxFileSizeMB)
		if ephemeralDisk.SizeMB <= logUsage {
			mErr.Errors = append(mErr.Errors,
				fmt.Errorf("log storage (%d MB) must be less than requested disk capacity (%d MB)",
					logUsage, ephemeralDisk.SizeMB))
		}
	}

	for idx, artifact := range t.Artifacts {
		if err := artifact.Validate(); err != nil {
			outer := fmt.Errorf("Artifact %d validation failed: %v", idx+1, err)
			mErr.Errors = append(mErr.Errors, outer)
		}
	}

	if t.Vault != nil {
		if err := t.Vault.Validate(); err != nil {
			mErr.Errors = append(mErr.Errors, fmt.Errorf("Vault validation failed: %v", err))
		}
	}

	// Validate templates, and reject two templates rendering to the same
	// destination path.
	destinations := make(map[string]int, len(t.Templates))
	for idx, tmpl := range t.Templates {
		if err := tmpl.Validate(); err != nil {
			outer := fmt.Errorf("Template %d validation failed: %s", idx+1, err)
			mErr.Errors = append(mErr.Errors, outer)
		}

		if other, ok := destinations[tmpl.DestPath]; ok {
			outer := fmt.Errorf("Template %d has same destination as %d", idx+1, other)
			mErr.Errors = append(mErr.Errors, outer)
		} else {
			destinations[tmpl.DestPath] = idx + 1
		}
	}

	// Validate the dispatch payload block if there
	if t.DispatchPayload != nil {
		if err := t.DispatchPayload.Validate(); err != nil {
			mErr.Errors = append(mErr.Errors, fmt.Errorf("Dispatch Payload validation failed: %v", err))
		}
	}

	// Validate the Lifecycle block if there
	if t.Lifecycle != nil {
		if err := t.Lifecycle.Validate(); err != nil {
			mErr.Errors = append(mErr.Errors, fmt.Errorf("Lifecycle validation failed: %v", err))
		}

	}

	// Validation for TaskKind field which is used for Consul Connect integration
	if t.Kind.IsConnectProxy() {
		// This task is a Connect proxy so it should not have service stanzas
		if len(t.Services) > 0 {
			mErr.Errors = append(mErr.Errors, fmt.Errorf("Connect proxy task must not have a service stanza"))
		}
		if t.Leader {
			mErr.Errors = append(mErr.Errors, fmt.Errorf("Connect proxy task must not have leader set"))
		}

		// Ensure the proxy task has a corresponding service entry
		serviceErr := ValidateConnectProxyService(t.Kind.Value(), tgServices)
		if serviceErr != nil {
			mErr.Errors = append(mErr.Errors, serviceErr)
		}
	}

	// Validation for volumes
	for idx, vm := range t.VolumeMounts {
		if !MountPropagationModeIsValid(vm.PropagationMode) {
			mErr.Errors = append(mErr.Errors, fmt.Errorf("Volume Mount (%d) has an invalid propagation mode: \"%s\"", idx, vm.PropagationMode))
		}
	}

	// Validate CSI Plugin Config
	if t.CSIPluginConfig != nil {
		if t.CSIPluginConfig.ID == "" {
			mErr.Errors = append(mErr.Errors, fmt.Errorf("CSIPluginConfig must have a non-empty PluginID"))
		}

		if !CSIPluginTypeIsValid(t.CSIPluginConfig.Type) {
			mErr.Errors = append(mErr.Errors, fmt.Errorf("CSIPluginConfig PluginType must be one of 'node', 'controller', or 'monolith', got: \"%s\"", t.CSIPluginConfig.Type))
		}

		// TODO: Investigate validation of the PluginMountDir. Not much we can do apart from check IsAbs until after we understand its execution environment though :(
	}

	return mErr.ErrorOrNil()
}
6995
6996// validateServices takes a task and validates the services within it are valid
6997// and reference ports that exist.
6998func validateServices(t *Task, tgNetworks Networks) error {
6999	var mErr multierror.Error
7000
7001	// Ensure that services don't ask for nonexistent ports and their names are
7002	// unique.
7003	servicePorts := make(map[string]map[string]struct{})
7004	addServicePort := func(label, service string) {
7005		if _, ok := servicePorts[label]; !ok {
7006			servicePorts[label] = map[string]struct{}{}
7007		}
7008		servicePorts[label][service] = struct{}{}
7009	}
7010	knownServices := make(map[string]struct{})
7011	for i, service := range t.Services {
7012		if err := service.Validate(); err != nil {
7013			outer := fmt.Errorf("service[%d] %+q validation failed: %s", i, service.Name, err)
7014			mErr.Errors = append(mErr.Errors, outer)
7015		}
7016
7017		if service.AddressMode == AddressModeAlloc {
7018			mErr.Errors = append(mErr.Errors, fmt.Errorf("service %q cannot use address_mode=\"alloc\", only services defined in a \"group\" block can use this mode", service.Name))
7019		}
7020
7021		// Ensure that services with the same name are not being registered for
7022		// the same port
7023		if _, ok := knownServices[service.Name+service.PortLabel]; ok {
7024			mErr.Errors = append(mErr.Errors, fmt.Errorf("service %q is duplicate", service.Name))
7025		}
7026		knownServices[service.Name+service.PortLabel] = struct{}{}
7027
7028		if service.PortLabel != "" {
7029			if service.AddressMode == "driver" {
7030				// Numeric port labels are valid for address_mode=driver
7031				_, err := strconv.Atoi(service.PortLabel)
7032				if err != nil {
7033					// Not a numeric port label, add it to list to check
7034					addServicePort(service.PortLabel, service.Name)
7035				}
7036			} else {
7037				addServicePort(service.PortLabel, service.Name)
7038			}
7039		}
7040
7041		// connect block is only allowed on group level
7042		if service.Connect != nil {
7043			mErr.Errors = append(mErr.Errors, fmt.Errorf("service %q cannot have \"connect\" block, only services defined in a \"group\" block can", service.Name))
7044		}
7045
7046		// Ensure that check names are unique and have valid ports
7047		knownChecks := make(map[string]struct{})
7048		for _, check := range service.Checks {
7049			if _, ok := knownChecks[check.Name]; ok {
7050				mErr.Errors = append(mErr.Errors, fmt.Errorf("check %q is duplicate", check.Name))
7051			}
7052			knownChecks[check.Name] = struct{}{}
7053
7054			if check.AddressMode == AddressModeAlloc {
7055				mErr.Errors = append(mErr.Errors, fmt.Errorf("check %q cannot use address_mode=\"alloc\", only checks defined in a \"group\" service block can use this mode", service.Name))
7056			}
7057
7058			if !check.RequiresPort() {
7059				// No need to continue validating check if it doesn't need a port
7060				continue
7061			}
7062
7063			effectivePort := check.PortLabel
7064			if effectivePort == "" {
7065				// Inherits from service
7066				effectivePort = service.PortLabel
7067			}
7068
7069			if effectivePort == "" {
7070				mErr.Errors = append(mErr.Errors, fmt.Errorf("check %q is missing a port", check.Name))
7071				continue
7072			}
7073
7074			isNumeric := false
7075			portNumber, err := strconv.Atoi(effectivePort)
7076			if err == nil {
7077				isNumeric = true
7078			}
7079
7080			// Numeric ports are fine for address_mode = "driver"
7081			if check.AddressMode == "driver" && isNumeric {
7082				if portNumber <= 0 {
7083					mErr.Errors = append(mErr.Errors, fmt.Errorf("check %q has invalid numeric port %d", check.Name, portNumber))
7084				}
7085				continue
7086			}
7087
7088			if isNumeric {
7089				mErr.Errors = append(mErr.Errors, fmt.Errorf(`check %q cannot use a numeric port %d without setting address_mode="driver"`, check.Name, portNumber))
7090				continue
7091			}
7092
7093			// PortLabel must exist, report errors by its parent service
7094			addServicePort(effectivePort, service.Name)
7095		}
7096	}
7097
7098	// Get the set of group port labels.
7099	portLabels := make(map[string]struct{})
7100	if len(tgNetworks) > 0 {
7101		ports := tgNetworks[0].PortLabels()
7102		for portLabel := range ports {
7103			portLabels[portLabel] = struct{}{}
7104		}
7105	}
7106
7107	// COMPAT(0.13)
7108	// Append the set of task port labels. (Note that network resources on the
7109	// task resources are deprecated, but we must let them continue working; a
7110	// warning will be emitted on job submission).
7111	if t.Resources != nil {
7112		for _, network := range t.Resources.Networks {
7113			for portLabel := range network.PortLabels() {
7114				portLabels[portLabel] = struct{}{}
7115			}
7116		}
7117	}
7118
7119	// Iterate over a sorted list of keys to make error listings stable
7120	keys := make([]string, 0, len(servicePorts))
7121	for p := range servicePorts {
7122		keys = append(keys, p)
7123	}
7124	sort.Strings(keys)
7125
7126	// Ensure all ports referenced in services exist.
7127	for _, servicePort := range keys {
7128		services := servicePorts[servicePort]
7129		_, ok := portLabels[servicePort]
7130		if !ok {
7131			names := make([]string, 0, len(services))
7132			for name := range services {
7133				names = append(names, name)
7134			}
7135
7136			// Keep order deterministic
7137			sort.Strings(names)
7138			joined := strings.Join(names, ", ")
7139			err := fmt.Errorf("port label %q referenced by services %v does not exist", servicePort, joined)
7140			mErr.Errors = append(mErr.Errors, err)
7141		}
7142	}
7143
7144	// Ensure address mode is valid
7145	return mErr.ErrorOrNil()
7146}
7147
7148func (t *Task) Warnings() error {
7149	var mErr multierror.Error
7150
7151	// Validate the resources
7152	if t.Resources != nil && t.Resources.IOPS != 0 {
7153		mErr.Errors = append(mErr.Errors, fmt.Errorf("IOPS has been deprecated as of Nomad 0.9.0. Please remove IOPS from resource stanza."))
7154	}
7155
7156	if t.Resources != nil && len(t.Resources.Networks) != 0 {
7157		mErr.Errors = append(mErr.Errors, fmt.Errorf("task network resources have been deprecated as of Nomad 0.12.0. Please configure networking via group network block."))
7158	}
7159
7160	for idx, tmpl := range t.Templates {
7161		if err := tmpl.Warnings(); err != nil {
7162			err = multierror.Prefix(err, fmt.Sprintf("Template[%d]", idx))
7163			mErr.Errors = append(mErr.Errors, err)
7164		}
7165	}
7166
7167	return mErr.ErrorOrNil()
7168}
7169
// TaskKind identifies the special kinds of tasks using the following format:
// `<kind_name>(:<identifier>)`. The TaskKind can optionally include an identifier that
// is opaque to the Task. This identifier can be used to relate the task to some
// other entity based on the kind.
//
// For example, a task may have the TaskKind of `connect-proxy:service` where
// 'connect-proxy' is the kind name and 'service' is the identifier that relates the
// task to the service name of which it is a connect proxy for.
type TaskKind string
7179
7180func NewTaskKind(name, identifier string) TaskKind {
7181	return TaskKind(fmt.Sprintf("%s:%s", name, identifier))
7182}
7183
7184// Name returns the kind name portion of the TaskKind
7185func (k TaskKind) Name() string {
7186	return strings.Split(string(k), ":")[0]
7187}
7188
7189// Value returns the identifier of the TaskKind or an empty string if it doesn't
7190// include one.
7191func (k TaskKind) Value() string {
7192	if s := strings.SplitN(string(k), ":", 2); len(s) > 1 {
7193		return s[1]
7194	}
7195	return ""
7196}
7197
7198func (k TaskKind) hasPrefix(prefix string) bool {
7199	return strings.HasPrefix(string(k), prefix+":") && len(k) > len(prefix)+1
7200}
7201
// IsConnectProxy returns true if the TaskKind is connect-proxy with a
// non-empty identifier.
func (k TaskKind) IsConnectProxy() bool {
	return k.hasPrefix(ConnectProxyPrefix)
}
7206
// IsConnectNative returns true if the TaskKind is connect-native with a
// non-empty identifier.
func (k TaskKind) IsConnectNative() bool {
	return k.hasPrefix(ConnectNativePrefix)
}
7211
// IsConnectIngress returns true if the TaskKind is connect-ingress with a
// non-empty identifier.
func (k TaskKind) IsConnectIngress() bool {
	return k.hasPrefix(ConnectIngressPrefix)
}
7215
// IsConnectTerminating returns true if the TaskKind is connect-terminating
// with a non-empty identifier.
func (k TaskKind) IsConnectTerminating() bool {
	return k.hasPrefix(ConnectTerminatingPrefix)
}
7219
7220func (k TaskKind) IsAnyConnectGateway() bool {
7221	switch {
7222	case k.IsConnectIngress():
7223		return true
7224	case k.IsConnectTerminating():
7225		return true
7226	default:
7227		return false
7228	}
7229}
7230
const (
	// ConnectProxyPrefix is the prefix used for fields referencing a Consul Connect
	// Proxy
	ConnectProxyPrefix = "connect-proxy"

	// ConnectNativePrefix is the prefix used for fields referencing a Connect
	// Native Task
	ConnectNativePrefix = "connect-native"

	// ConnectIngressPrefix is the prefix used for fields referencing a Consul
	// Connect Ingress Gateway Proxy.
	ConnectIngressPrefix = "connect-ingress"

	// ConnectTerminatingPrefix is the prefix used for fields referencing a Consul
	// Connect Terminating Gateway Proxy.
	ConnectTerminatingPrefix = "connect-terminating"

	// ConnectMeshPrefix is the prefix used for fields referencing a Consul Connect
	// Mesh Gateway Proxy.
	//
	// Not yet supported.
	// ConnectMeshPrefix = "connect-mesh"
)
7255
7256// ValidateConnectProxyService checks that the service that is being
7257// proxied by this task exists in the task group and contains
7258// valid Connect config.
7259func ValidateConnectProxyService(serviceName string, tgServices []*Service) error {
7260	found := false
7261	names := make([]string, 0, len(tgServices))
7262	for _, svc := range tgServices {
7263		if svc.Connect == nil || svc.Connect.SidecarService == nil {
7264			continue
7265		}
7266
7267		if svc.Name == serviceName {
7268			found = true
7269			break
7270		}
7271
7272		// Build up list of mismatched Connect service names for error
7273		// reporting.
7274		names = append(names, svc.Name)
7275	}
7276
7277	if !found {
7278		if len(names) == 0 {
7279			return fmt.Errorf("No Connect services in task group with Connect proxy (%q)", serviceName)
7280		} else {
7281			return fmt.Errorf("Connect proxy service name (%q) not found in Connect services from task group: %s", serviceName, names)
7282		}
7283	}
7284
7285	return nil
7286}
7287
// Set of change modes controlling what happens to a task when one of its
// templates is re-rendered.
const (
	// TemplateChangeModeNoop marks that no action should be taken if the
	// template is re-rendered
	TemplateChangeModeNoop = "noop"

	// TemplateChangeModeSignal marks that the task should be signaled if the
	// template is re-rendered
	TemplateChangeModeSignal = "signal"

	// TemplateChangeModeRestart marks that the task should be restarted if the
	// template is re-rendered
	TemplateChangeModeRestart = "restart"
)

var (
	// TemplateChangeModeInvalidError is the error for when an invalid change
	// mode is given
	TemplateChangeModeInvalidError = errors.New("Invalid change mode. Must be one of the following: noop, signal, restart")
)
7307
// Template represents a template configuration to be rendered for a given task
type Template struct {
	// SourcePath is the path to the template to be rendered
	SourcePath string

	// DestPath is the path to where the template should be rendered
	DestPath string

	// EmbeddedTmpl stores the raw template. This is useful for smaller templates
	// where they are embedded in the job file rather than sent as an artifact
	EmbeddedTmpl string

	// ChangeMode indicates what should be done if the template is re-rendered.
	// Must be one of the TemplateChangeMode* constants.
	ChangeMode string

	// ChangeSignal is the signal that should be sent if the change mode
	// requires it.
	ChangeSignal string

	// Splay is used to avoid coordinated restarts of processes by applying a
	// random wait between 0 and the given splay value before signalling the
	// application of a change
	Splay time.Duration

	// Perms is the permission the file should be written out with, given as
	// an octal string (e.g. "0644").
	Perms string

	// LeftDelim and RightDelim are optional configurations to control what
	// delimiter is utilized when parsing the template.
	LeftDelim  string
	RightDelim string

	// Envvars enables exposing the template as environment variables
	// instead of as a file. The template must be of the form:
	//
	//	VAR_NAME_1={{ key service/my-key }}
	//	VAR_NAME_2=raw string and {{ env "attr.kernel.name" }}
	//
	// Lines will be split on the initial "=" with the first part being the
	// key name and the second part the value.
	// Empty lines and lines starting with # will be ignored, but to avoid
	// escaping issues #s within lines will not be treated as comments.
	Envvars bool

	// VaultGrace is the grace duration between lease renewal and reacquiring a
	// secret. If the lease of a secret is less than the grace, a new secret is
	// acquired.
	// COMPAT(0.12) VaultGrace has been ignored by Vault since Vault v0.5.
	VaultGrace time.Duration
}
7358
7359// DefaultTemplate returns a default template.
7360func DefaultTemplate() *Template {
7361	return &Template{
7362		ChangeMode: TemplateChangeModeRestart,
7363		Splay:      5 * time.Second,
7364		Perms:      "0644",
7365	}
7366}
7367
7368func (t *Template) Copy() *Template {
7369	if t == nil {
7370		return nil
7371	}
7372	copy := new(Template)
7373	*copy = *t
7374	return copy
7375}
7376
7377func (t *Template) Canonicalize() {
7378	if t.ChangeSignal != "" {
7379		t.ChangeSignal = strings.ToUpper(t.ChangeSignal)
7380	}
7381}
7382
7383func (t *Template) Validate() error {
7384	var mErr multierror.Error
7385
7386	// Verify we have something to render
7387	if t.SourcePath == "" && t.EmbeddedTmpl == "" {
7388		_ = multierror.Append(&mErr, fmt.Errorf("Must specify a source path or have an embedded template"))
7389	}
7390
7391	// Verify we can render somewhere
7392	if t.DestPath == "" {
7393		_ = multierror.Append(&mErr, fmt.Errorf("Must specify a destination for the template"))
7394	}
7395
7396	// Verify the destination doesn't escape
7397	escaped, err := PathEscapesAllocDir("task", t.DestPath)
7398	if err != nil {
7399		mErr.Errors = append(mErr.Errors, fmt.Errorf("invalid destination path: %v", err))
7400	} else if escaped {
7401		mErr.Errors = append(mErr.Errors, fmt.Errorf("destination escapes allocation directory"))
7402	}
7403
7404	// Verify a proper change mode
7405	switch t.ChangeMode {
7406	case TemplateChangeModeNoop, TemplateChangeModeRestart:
7407	case TemplateChangeModeSignal:
7408		if t.ChangeSignal == "" {
7409			_ = multierror.Append(&mErr, fmt.Errorf("Must specify signal value when change mode is signal"))
7410		}
7411		if t.Envvars {
7412			_ = multierror.Append(&mErr, fmt.Errorf("cannot use signals with env var templates"))
7413		}
7414	default:
7415		_ = multierror.Append(&mErr, TemplateChangeModeInvalidError)
7416	}
7417
7418	// Verify the splay is positive
7419	if t.Splay < 0 {
7420		_ = multierror.Append(&mErr, fmt.Errorf("Must specify positive splay value"))
7421	}
7422
7423	// Verify the permissions
7424	if t.Perms != "" {
7425		if _, err := strconv.ParseUint(t.Perms, 8, 12); err != nil {
7426			_ = multierror.Append(&mErr, fmt.Errorf("Failed to parse %q as octal: %v", t.Perms, err))
7427		}
7428	}
7429
7430	return mErr.ErrorOrNil()
7431}
7432
7433func (t *Template) Warnings() error {
7434	var mErr multierror.Error
7435
7436	// Deprecation notice for vault_grace
7437	if t.VaultGrace != 0 {
7438		mErr.Errors = append(mErr.Errors, fmt.Errorf("VaultGrace has been deprecated as of Nomad 0.11 and ignored since Vault 0.5. Please remove VaultGrace / vault_grace from template stanza."))
7439	}
7440
7441	return mErr.ErrorOrNil()
7442}
7443
// AllocStateField identifies which allocation-level field an AllocState
// event changed.
type AllocStateField uint8

const (
	// AllocStateFieldClientStatus identifies the allocation's client status.
	AllocStateFieldClientStatus AllocStateField = iota
)

// AllocState records a single event that changes the state of the whole
// allocation: which field changed, its new value, and when.
type AllocState struct {
	Field AllocStateField
	Value string
	Time  time.Time
}
7456
// TaskHandle is an optional handle to a task propagated to the servers for use
// by remote tasks. Since remote tasks are not implicitly lost when the node
// they are assigned to is down, their state is migrated to the replacement
// allocation.
//
// Minimal set of fields from plugins/drivers/task_handle.go:TaskHandle
type TaskHandle struct {
	// Version of driver state. Used by the driver to gracefully handle
	// plugin upgrades.
	Version int

	// Driver-specific state containing a handle to the remote task.
	DriverState []byte
}
7471
7472func (h *TaskHandle) Copy() *TaskHandle {
7473	if h == nil {
7474		return nil
7475	}
7476
7477	newTH := TaskHandle{
7478		Version:     h.Version,
7479		DriverState: make([]byte, len(h.DriverState)),
7480	}
7481	copy(newTH.DriverState, h.DriverState)
7482	return &newTH
7483}
7484
// Set of possible states for a task.
const (
	TaskStatePending = "pending" // The task is waiting to be run.
	TaskStateRunning = "running" // The task is currently running.
	TaskStateDead    = "dead"    // Terminal state of task.
)

// TaskState tracks the current state of a task and events that caused state
// transitions.
type TaskState struct {
	// The current state of the task; one of the TaskState* constants above.
	State string

	// Failed marks a task as having failed
	Failed bool

	// Restarts is the number of times the task has restarted
	Restarts uint64

	// LastRestart is the time the task last restarted. It is updated each time the
	// task restarts
	LastRestart time.Time

	// StartedAt is the time the task is started. It is updated each time the
	// task starts
	StartedAt time.Time

	// FinishedAt is the time at which the task transitioned to dead and will
	// not be started again.
	FinishedAt time.Time

	// Series of task events that transition the state of the task.
	Events []*TaskEvent

	// Experimental - TaskHandle is based on drivers.TaskHandle and used
	// by remote task drivers to migrate task handles between allocations.
	TaskHandle *TaskHandle
}
7523
7524// NewTaskState returns a TaskState initialized in the Pending state.
7525func NewTaskState() *TaskState {
7526	return &TaskState{
7527		State: TaskStatePending,
7528	}
7529}
7530
7531// Canonicalize ensures the TaskState has a State set. It should default to
7532// Pending.
7533func (ts *TaskState) Canonicalize() {
7534	if ts.State == "" {
7535		ts.State = TaskStatePending
7536	}
7537}
7538
7539func (ts *TaskState) Copy() *TaskState {
7540	if ts == nil {
7541		return nil
7542	}
7543	newTS := new(TaskState)
7544	*newTS = *ts
7545
7546	if ts.Events != nil {
7547		newTS.Events = make([]*TaskEvent, len(ts.Events))
7548		for i, e := range ts.Events {
7549			newTS.Events[i] = e.Copy()
7550		}
7551	}
7552
7553	newTS.TaskHandle = ts.TaskHandle.Copy()
7554	return newTS
7555}
7556
7557// Successful returns whether a task finished successfully. This doesn't really
7558// have meaning on a non-batch allocation because a service and system
7559// allocation should not finish.
7560func (ts *TaskState) Successful() bool {
7561	return ts.State == TaskStateDead && !ts.Failed
7562}
7563
// Set of well-known task event types; PopulateEventDisplayMessage keys off
// these to build a human friendly description.
const (
	// TaskSetupFailure indicates that the task could not be started due to a
	// setup failure.
	TaskSetupFailure = "Setup Failure"

	// TaskDriverFailure indicates that the task could not be started due to a
	// failure in the driver. TaskDriverFailure is considered Recoverable.
	TaskDriverFailure = "Driver Failure"

	// TaskReceived signals that the task has been pulled by the client at the
	// given timestamp.
	TaskReceived = "Received"

	// TaskFailedValidation indicates the task was invalid and as such was not run.
	// TaskFailedValidation is not considered Recoverable.
	TaskFailedValidation = "Failed Validation"

	// TaskStarted signals that the task was started and its timestamp can be
	// used to determine the running length of the task.
	TaskStarted = "Started"

	// TaskTerminated indicates that the task was started and exited.
	TaskTerminated = "Terminated"

	// TaskKilling indicates a kill signal has been sent to the task.
	TaskKilling = "Killing"

	// TaskKilled indicates a user has killed the task.
	TaskKilled = "Killed"

	// TaskRestarting indicates that task terminated and is being restarted.
	TaskRestarting = "Restarting"

	// TaskNotRestarting indicates that the task has failed and is not being
	// restarted because it has exceeded its restart policy.
	TaskNotRestarting = "Not Restarting"

	// TaskRestartSignal indicates that the task has been signalled to be
	// restarted
	TaskRestartSignal = "Restart Signaled"

	// TaskSignaling indicates that the task is being signalled.
	TaskSignaling = "Signaling"

	// TaskDownloadingArtifacts means the task is downloading the artifacts
	// specified in the task.
	TaskDownloadingArtifacts = "Downloading Artifacts"

	// TaskArtifactDownloadFailed indicates that downloading the artifacts
	// failed.
	TaskArtifactDownloadFailed = "Failed Artifact Download"

	// TaskBuildingTaskDir indicates that the task directory/chroot is being
	// built.
	TaskBuildingTaskDir = "Building Task Directory"

	// TaskSetup indicates the task runner is setting up the task environment
	TaskSetup = "Task Setup"

	// TaskDiskExceeded indicates that one of the tasks in a taskgroup has
	// exceeded the requested disk resources.
	TaskDiskExceeded = "Disk Resources Exceeded"

	// TaskSiblingFailed indicates that a sibling task in the task group has
	// failed.
	TaskSiblingFailed = "Sibling Task Failed"

	// TaskDriverMessage is an informational event message emitted by
	// drivers such as when they're performing a long running action like
	// downloading an image.
	TaskDriverMessage = "Driver"

	// TaskLeaderDead indicates that the leader task within the task group has
	// finished.
	TaskLeaderDead = "Leader Task Dead"

	// TaskMainDead indicates that the main tasks have died
	TaskMainDead = "Main Tasks Dead"

	// TaskHookFailed indicates that one of the hooks for a task failed.
	TaskHookFailed = "Task hook failed"

	// TaskRestoreFailed indicates Nomad was unable to reattach to a
	// restored task.
	TaskRestoreFailed = "Failed Restoring Task"

	// TaskPluginUnhealthy indicates that a plugin managed by Nomad became unhealthy
	TaskPluginUnhealthy = "Plugin became unhealthy"

	// TaskPluginHealthy indicates that a plugin managed by Nomad became healthy
	TaskPluginHealthy = "Plugin became healthy"
)
7655
// TaskEvent is an event that affects the state of a task and contains meta-data
// appropriate to the event's type.
type TaskEvent struct {
	// Type is one of the Task* event type constants.
	Type string
	Time int64 // Unix Nanosecond timestamp

	Message string // A possible message explaining the termination of the task.

	// DisplayMessage is a human friendly message about the event
	DisplayMessage string

	// Details is a map with annotated info about the event
	Details map[string]string

	// DEPRECATION NOTICE: The following fields are deprecated and will be removed
	// in a future release. Field values are available in the Details map.

	// FailsTask marks whether this event fails the task.
	// Deprecated, use Details["fails_task"] to access this.
	FailsTask bool

	// Restart fields.
	// Deprecated, use Details["restart_reason"] to access this.
	RestartReason string

	// Setup Failure fields.
	// Deprecated, use Details["setup_error"] to access this.
	SetupError string

	// Driver Failure fields.
	// Deprecated, use Details["driver_error"] to access this.
	DriverError string // A driver error occurred while starting the task.

	// Task Terminated Fields.

	// Deprecated, use Details["exit_code"] to access this.
	ExitCode int // The exit code of the task.

	// Deprecated, use Details["signal"] to access this.
	Signal int // The signal that terminated the task.

	// Killing fields
	// Deprecated, use Details["kill_timeout"] to access this.
	KillTimeout time.Duration

	// Task Killed Fields.
	// Deprecated, use Details["kill_error"] to access this.
	KillError string // Error killing the task.

	// KillReason is the reason the task was killed
	// Deprecated, use Details["kill_reason"] to access this.
	KillReason string

	// TaskRestarting fields.
	// Deprecated, use Details["start_delay"] to access this.
	StartDelay int64 // The sleep period before restarting the task in unix nanoseconds.

	// Artifact Download fields
	// Deprecated, use Details["download_error"] to access this.
	DownloadError string // Error downloading artifacts

	// Validation fields
	// Deprecated, use Details["validation_error"] to access this.
	ValidationError string // Validation error

	// The maximum allowed task disk size.
	// Deprecated, use Details["disk_limit"] to access this.
	DiskLimit int64

	// Name of the sibling task that caused termination of the task that
	// the TaskEvent refers to.
	// Deprecated, use Details["failed_sibling"] to access this.
	FailedSibling string

	// VaultError is the error from token renewal
	// Deprecated, use Details["vault_renewal_error"] to access this.
	VaultError string

	// TaskSignalReason indicates the reason the task is being signalled.
	// Deprecated, use Details["task_signal_reason"] to access this.
	TaskSignalReason string

	// TaskSignal is the signal that was sent to the task
	// Deprecated, use Details["task_signal"] to access this.
	TaskSignal string

	// DriverMessage indicates a driver action being taken.
	// Deprecated, use Details["driver_message"] to access this.
	DriverMessage string

	// GenericSource is the source of a message.
	// Deprecated, is redundant with event type.
	GenericSource string
}
7750
// PopulateEventDisplayMessage backfills DisplayMessage from the event's Type
// and legacy detail fields. It is a no-op on a nil event or when a display
// message is already set; it mutates only DisplayMessage.
func (event *TaskEvent) PopulateEventDisplayMessage() {
	// Build up the description based on the event type.
	if event == nil { //TODO(preetha) needs investigation alloc_runner's Run method sends a nil event when sigterming nomad. Why?
		return
	}

	// An explicitly set display message wins; never overwrite it.
	if event.DisplayMessage != "" {
		return
	}

	var desc string
	switch event.Type {
	case TaskSetup:
		desc = event.Message
	case TaskStarted:
		desc = "Task started by client"
	case TaskReceived:
		desc = "Task received by client"
	case TaskFailedValidation:
		if event.ValidationError != "" {
			desc = event.ValidationError
		} else {
			desc = "Validation of task failed"
		}
	case TaskSetupFailure:
		if event.SetupError != "" {
			desc = event.SetupError
		} else {
			desc = "Task setup failed"
		}
	case TaskDriverFailure:
		if event.DriverError != "" {
			desc = event.DriverError
		} else {
			desc = "Failed to start task"
		}
	case TaskDownloadingArtifacts:
		desc = "Client is downloading artifacts"
	case TaskArtifactDownloadFailed:
		if event.DownloadError != "" {
			desc = event.DownloadError
		} else {
			desc = "Failed to download artifacts"
		}
	case TaskKilling:
		if event.KillReason != "" {
			desc = event.KillReason
		} else if event.KillTimeout != 0 {
			desc = fmt.Sprintf("Sent interrupt. Waiting %v before force killing", event.KillTimeout)
		} else {
			desc = "Sent interrupt"
		}
	case TaskKilled:
		if event.KillError != "" {
			desc = event.KillError
		} else {
			desc = "Task successfully killed"
		}
	case TaskTerminated:
		// Exit code is always shown; signal and exit message are appended
		// only when present.
		var parts []string
		parts = append(parts, fmt.Sprintf("Exit Code: %d", event.ExitCode))

		if event.Signal != 0 {
			parts = append(parts, fmt.Sprintf("Signal: %d", event.Signal))
		}

		if event.Message != "" {
			parts = append(parts, fmt.Sprintf("Exit Message: %q", event.Message))
		}
		desc = strings.Join(parts, ", ")
	case TaskRestarting:
		in := fmt.Sprintf("Task restarting in %v", time.Duration(event.StartDelay))
		if event.RestartReason != "" && event.RestartReason != ReasonWithinPolicy {
			desc = fmt.Sprintf("%s - %s", event.RestartReason, in)
		} else {
			desc = in
		}
	case TaskNotRestarting:
		if event.RestartReason != "" {
			desc = event.RestartReason
		} else {
			desc = "Task exceeded restart policy"
		}
	case TaskSiblingFailed:
		if event.FailedSibling != "" {
			desc = fmt.Sprintf("Task's sibling %q failed", event.FailedSibling)
		} else {
			desc = "Task's sibling failed"
		}
	case TaskSignaling:
		sig := event.TaskSignal
		reason := event.TaskSignalReason

		if sig == "" && reason == "" {
			desc = "Task being sent a signal"
		} else if sig == "" {
			desc = reason
		} else if reason == "" {
			desc = fmt.Sprintf("Task being sent signal %v", sig)
		} else {
			desc = fmt.Sprintf("Task being sent signal %v: %v", sig, reason)
		}
	case TaskRestartSignal:
		if event.RestartReason != "" {
			desc = event.RestartReason
		} else {
			desc = "Task signaled to restart"
		}
	case TaskDriverMessage:
		desc = event.DriverMessage
	case TaskLeaderDead:
		desc = "Leader Task in Group dead"
	case TaskMainDead:
		desc = "Main tasks in the group died"
	default:
		desc = event.Message
	}

	event.DisplayMessage = desc
}
7871
7872func (te *TaskEvent) GoString() string {
7873	return fmt.Sprintf("%v - %v", te.Time, te.Type)
7874}
7875
// SetDisplayMessage sets the display message of TaskEvent and returns the
// event to allow call chaining.
func (te *TaskEvent) SetDisplayMessage(msg string) *TaskEvent {
	te.DisplayMessage = msg
	return te
}
7881
7882// SetMessage sets the message of TaskEvent
7883func (te *TaskEvent) SetMessage(msg string) *TaskEvent {
7884	te.Message = msg
7885	te.Details["message"] = msg
7886	return te
7887}
7888
7889func (te *TaskEvent) Copy() *TaskEvent {
7890	if te == nil {
7891		return nil
7892	}
7893	copy := new(TaskEvent)
7894	*copy = *te
7895	return copy
7896}
7897
7898func NewTaskEvent(event string) *TaskEvent {
7899	return &TaskEvent{
7900		Type:    event,
7901		Time:    time.Now().UnixNano(),
7902		Details: make(map[string]string),
7903	}
7904}
7905
7906// SetSetupError is used to store an error that occurred while setting up the
7907// task
7908func (e *TaskEvent) SetSetupError(err error) *TaskEvent {
7909	if err != nil {
7910		e.SetupError = err.Error()
7911		e.Details["setup_error"] = err.Error()
7912	}
7913	return e
7914}
7915
7916func (e *TaskEvent) SetFailsTask() *TaskEvent {
7917	e.FailsTask = true
7918	e.Details["fails_task"] = "true"
7919	return e
7920}
7921
7922func (e *TaskEvent) SetDriverError(err error) *TaskEvent {
7923	if err != nil {
7924		e.DriverError = err.Error()
7925		e.Details["driver_error"] = err.Error()
7926	}
7927	return e
7928}
7929
7930func (e *TaskEvent) SetExitCode(c int) *TaskEvent {
7931	e.ExitCode = c
7932	e.Details["exit_code"] = fmt.Sprintf("%d", c)
7933	return e
7934}
7935
7936func (e *TaskEvent) SetSignal(s int) *TaskEvent {
7937	e.Signal = s
7938	e.Details["signal"] = fmt.Sprintf("%d", s)
7939	return e
7940}
7941
// SetSignalText records the string form of a signal in the event details.
// Unlike SetSignal, it does not populate the deprecated numeric Signal field.
func (e *TaskEvent) SetSignalText(s string) *TaskEvent {
	e.Details["signal"] = s
	return e
}
7946
7947func (e *TaskEvent) SetExitMessage(err error) *TaskEvent {
7948	if err != nil {
7949		e.Message = err.Error()
7950		e.Details["exit_message"] = err.Error()
7951	}
7952	return e
7953}
7954
7955func (e *TaskEvent) SetKillError(err error) *TaskEvent {
7956	if err != nil {
7957		e.KillError = err.Error()
7958		e.Details["kill_error"] = err.Error()
7959	}
7960	return e
7961}
7962
7963func (e *TaskEvent) SetKillReason(r string) *TaskEvent {
7964	e.KillReason = r
7965	e.Details["kill_reason"] = r
7966	return e
7967}
7968
7969func (e *TaskEvent) SetRestartDelay(delay time.Duration) *TaskEvent {
7970	e.StartDelay = int64(delay)
7971	e.Details["start_delay"] = fmt.Sprintf("%d", delay)
7972	return e
7973}
7974
7975func (e *TaskEvent) SetRestartReason(reason string) *TaskEvent {
7976	e.RestartReason = reason
7977	e.Details["restart_reason"] = reason
7978	return e
7979}
7980
7981func (e *TaskEvent) SetTaskSignalReason(r string) *TaskEvent {
7982	e.TaskSignalReason = r
7983	e.Details["task_signal_reason"] = r
7984	return e
7985}
7986
7987func (e *TaskEvent) SetTaskSignal(s os.Signal) *TaskEvent {
7988	e.TaskSignal = s.String()
7989	e.Details["task_signal"] = s.String()
7990	return e
7991}
7992
7993func (e *TaskEvent) SetDownloadError(err error) *TaskEvent {
7994	if err != nil {
7995		e.DownloadError = err.Error()
7996		e.Details["download_error"] = err.Error()
7997	}
7998	return e
7999}
8000
8001func (e *TaskEvent) SetValidationError(err error) *TaskEvent {
8002	if err != nil {
8003		e.ValidationError = err.Error()
8004		e.Details["validation_error"] = err.Error()
8005	}
8006	return e
8007}
8008
8009func (e *TaskEvent) SetKillTimeout(timeout time.Duration) *TaskEvent {
8010	e.KillTimeout = timeout
8011	e.Details["kill_timeout"] = timeout.String()
8012	return e
8013}
8014
8015func (e *TaskEvent) SetDiskLimit(limit int64) *TaskEvent {
8016	e.DiskLimit = limit
8017	e.Details["disk_limit"] = fmt.Sprintf("%d", limit)
8018	return e
8019}
8020
8021func (e *TaskEvent) SetFailedSibling(sibling string) *TaskEvent {
8022	e.FailedSibling = sibling
8023	e.Details["failed_sibling"] = sibling
8024	return e
8025}
8026
8027func (e *TaskEvent) SetVaultRenewalError(err error) *TaskEvent {
8028	if err != nil {
8029		e.VaultError = err.Error()
8030		e.Details["vault_renewal_error"] = err.Error()
8031	}
8032	return e
8033}
8034
8035func (e *TaskEvent) SetDriverMessage(m string) *TaskEvent {
8036	e.DriverMessage = m
8037	e.Details["driver_message"] = m
8038	return e
8039}
8040
// SetOOMKilled annotates the event with whether the task was OOM-killed.
// Stored only in Details; there is no dedicated struct field.
func (e *TaskEvent) SetOOMKilled(oom bool) *TaskEvent {
	e.Details["oom_killed"] = strconv.FormatBool(oom)
	return e
}
8045
// TaskArtifact is an artifact to download before running the task.
type TaskArtifact struct {
	// GetterSource is the source to download an artifact using go-getter
	GetterSource string

	// GetterOptions are options to use when downloading the artifact using
	// go-getter. The optional "checksum" entry is validated by
	// validateChecksum.
	GetterOptions map[string]string

	// GetterHeaders are headers to use when downloading the artifact using
	// go-getter.
	GetterHeaders map[string]string

	// GetterMode is the go-getter.ClientMode for fetching resources.
	// Defaults to "any" but can be set to "file" or "dir".
	GetterMode string

	// RelativeDest is the download destination given relative to the task's
	// directory.
	RelativeDest string
}
8067
8068func (ta *TaskArtifact) Copy() *TaskArtifact {
8069	if ta == nil {
8070		return nil
8071	}
8072	return &TaskArtifact{
8073		GetterSource:  ta.GetterSource,
8074		GetterOptions: helper.CopyMapStringString(ta.GetterOptions),
8075		GetterHeaders: helper.CopyMapStringString(ta.GetterHeaders),
8076		GetterMode:    ta.GetterMode,
8077		RelativeDest:  ta.RelativeDest,
8078	}
8079}
8080
// GoString formats the artifact with %+v for debugging output. (It uses %+v
// rather than %#v, so fmt does not re-invoke GoString recursively.)
func (ta *TaskArtifact) GoString() string {
	return fmt.Sprintf("%+v", ta)
}
8084
// hashStringMap appends a deterministic hash of m onto h by feeding each
// key/value pair in sorted-key order, so equal maps always yield equal
// digests regardless of map iteration order.
func hashStringMap(h hash.Hash, m map[string]string) {
	sorted := make(sort.StringSlice, 0, len(m))
	for key := range m {
		sorted = append(sorted, key)
	}
	sorted.Sort()

	for _, key := range sorted {
		_, _ = h.Write([]byte(key))
		_, _ = h.Write([]byte(m[key]))
	}
}
8097
// Hash creates a unique identifier for a TaskArtifact as the same GetterSource
// may be specified multiple times with different destinations.
//
// The write order below is part of the hash's definition: changing it would
// change every artifact's identifier, so keep it stable.
func (ta *TaskArtifact) Hash() string {
	// A nil key is expected to always be accepted here; the panic guards
	// that invariant rather than a runtime condition.
	h, err := blake2b.New256(nil)
	if err != nil {
		panic(err)
	}

	_, _ = h.Write([]byte(ta.GetterSource))

	// Maps are folded in with sorted keys so the digest is deterministic.
	hashStringMap(h, ta.GetterOptions)
	hashStringMap(h, ta.GetterHeaders)

	_, _ = h.Write([]byte(ta.GetterMode))
	_, _ = h.Write([]byte(ta.RelativeDest))
	return base64.RawStdEncoding.EncodeToString(h.Sum(nil))
}
8115
// PathEscapesAllocDir returns if the given path escapes the allocation
// directory.
//
// The prefix is joined to the path (e.g. "task/local"), and this function
// checks if path escapes the alloc dir, NOT the prefix directory within the
// alloc dir. With prefix="task/local", it will return false for "../secret",
// but true for the "../../../../../../root" path; only the latter escapes
// the alloc dir.
func PathEscapesAllocDir(prefix, path string) (bool, error) {
	// Synthetic absolute alloc dir to resolve the candidate path against.
	allocDir, err := filepath.Abs(filepath.Join("/", "alloc-dir/", "alloc-id/"))
	if err != nil {
		return false, err
	}

	// Resolve the candidate to an absolute, cleaned path under the alloc dir.
	resolved, err := filepath.Abs(filepath.Join(allocDir, prefix, path))
	if err != nil {
		return false, err
	}

	// If the cleaned relative path climbs out of the alloc dir, it begins
	// with "..".
	rel, err := filepath.Rel(allocDir, resolved)
	if err != nil {
		return false, err
	}
	return strings.HasPrefix(rel, ".."), nil
}
8140
8141func (ta *TaskArtifact) Validate() error {
8142	// Verify the source
8143	var mErr multierror.Error
8144	if ta.GetterSource == "" {
8145		mErr.Errors = append(mErr.Errors, fmt.Errorf("source must be specified"))
8146	}
8147
8148	switch ta.GetterMode {
8149	case "":
8150		// Default to any
8151		ta.GetterMode = GetterModeAny
8152	case GetterModeAny, GetterModeFile, GetterModeDir:
8153		// Ok
8154	default:
8155		mErr.Errors = append(mErr.Errors, fmt.Errorf("invalid artifact mode %q; must be one of: %s, %s, %s",
8156			ta.GetterMode, GetterModeAny, GetterModeFile, GetterModeDir))
8157	}
8158
8159	escaped, err := PathEscapesAllocDir("task", ta.RelativeDest)
8160	if err != nil {
8161		mErr.Errors = append(mErr.Errors, fmt.Errorf("invalid destination path: %v", err))
8162	} else if escaped {
8163		mErr.Errors = append(mErr.Errors, fmt.Errorf("destination escapes allocation directory"))
8164	}
8165
8166	if err := ta.validateChecksum(); err != nil {
8167		mErr.Errors = append(mErr.Errors, err)
8168	}
8169
8170	return mErr.ErrorOrNil()
8171}
8172
8173func (ta *TaskArtifact) validateChecksum() error {
8174	check, ok := ta.GetterOptions["checksum"]
8175	if !ok {
8176		return nil
8177	}
8178
8179	// Job struct validation occurs before interpolation resolution can be effective.
8180	// Skip checking if checksum contain variable reference, and artifacts fetching will
8181	// eventually fail, if checksum is indeed invalid.
8182	if args.ContainsEnv(check) {
8183		return nil
8184	}
8185
8186	check = strings.TrimSpace(check)
8187	if check == "" {
8188		return fmt.Errorf("checksum value cannot be empty")
8189	}
8190
8191	parts := strings.Split(check, ":")
8192	if l := len(parts); l != 2 {
8193		return fmt.Errorf(`checksum must be given as "type:value"; got %q`, check)
8194	}
8195
8196	checksumVal := parts[1]
8197	checksumBytes, err := hex.DecodeString(checksumVal)
8198	if err != nil {
8199		return fmt.Errorf("invalid checksum: %v", err)
8200	}
8201
8202	checksumType := parts[0]
8203	expectedLength := 0
8204	switch checksumType {
8205	case "md5":
8206		expectedLength = md5.Size
8207	case "sha1":
8208		expectedLength = sha1.Size
8209	case "sha256":
8210		expectedLength = sha256.Size
8211	case "sha512":
8212		expectedLength = sha512.Size
8213	default:
8214		return fmt.Errorf("unsupported checksum type: %s", checksumType)
8215	}
8216
8217	if len(checksumBytes) != expectedLength {
8218		return fmt.Errorf("invalid %s checksum: %v", checksumType, checksumVal)
8219	}
8220
8221	return nil
8222}
8223
// Constraint operands understood by the scheduler; these values appear in
// the Operand field of a Constraint.
const (
	ConstraintDistinctProperty  = "distinct_property"
	ConstraintDistinctHosts     = "distinct_hosts"
	ConstraintRegex             = "regexp"
	ConstraintVersion           = "version"
	ConstraintSemver            = "semver"
	ConstraintSetContains       = "set_contains"
	ConstraintSetContainsAll    = "set_contains_all"
	ConstraintSetContainsAny    = "set_contains_any"
	ConstraintAttributeIsSet    = "is_set"
	ConstraintAttributeIsNotSet = "is_not_set"
)
8236
// Constraint is used to restrict placement options. The LTarget is typically
// a node attribute, the Operand selects the comparison (see the Constraint*
// operand constants and Validate), and the RTarget is the value compared
// against.
type Constraint struct {
	LTarget string // Left-hand target
	RTarget string // Right-hand target
	Operand string // Constraint operand (<=, <, =, !=, >, >=), contains, near
	str     string // Memoized string form, built lazily by String
}
8244
8245// Equal checks if two constraints are equal
8246func (c *Constraint) Equals(o *Constraint) bool {
8247	return c == o ||
8248		c.LTarget == o.LTarget &&
8249			c.RTarget == o.RTarget &&
8250			c.Operand == o.Operand
8251}
8252
// Equal is an alias for Equals, provided for naming consistency with other
// types in this package.
func (c *Constraint) Equal(o *Constraint) bool {
	return c.Equals(o)
}
8256
8257func (c *Constraint) Copy() *Constraint {
8258	if c == nil {
8259		return nil
8260	}
8261	nc := new(Constraint)
8262	*nc = *c
8263	return nc
8264}
8265
8266func (c *Constraint) String() string {
8267	if c.str != "" {
8268		return c.str
8269	}
8270	c.str = fmt.Sprintf("%s %s %s", c.LTarget, c.Operand, c.RTarget)
8271	return c.str
8272}
8273
// Validate checks that the constraint has an operand and that the operand's
// requirements on LTarget/RTarget are satisfied. All problems found are
// collected into a single multierror.
func (c *Constraint) Validate() error {
	var mErr multierror.Error
	if c.Operand == "" {
		mErr.Errors = append(mErr.Errors, errors.New("Missing constraint operand"))
	}

	// requireLtarget specifies whether the constraint requires an LTarget to be
	// provided.
	requireLtarget := true

	// Perform additional validation based on operand
	switch c.Operand {
	case ConstraintDistinctHosts:
		// distinct_hosts applies to nodes as a whole, so no attribute
		// (LTarget) is needed.
		requireLtarget = false
	case ConstraintSetContainsAll, ConstraintSetContainsAny, ConstraintSetContains:
		if c.RTarget == "" {
			mErr.Errors = append(mErr.Errors, fmt.Errorf("Set contains constraint requires an RTarget"))
		}
	case ConstraintRegex:
		// Compile eagerly so a bad pattern is rejected at submit time rather
		// than surfacing during scheduling.
		if _, err := regexp.Compile(c.RTarget); err != nil {
			mErr.Errors = append(mErr.Errors, fmt.Errorf("Regular expression failed to compile: %v", err))
		}
	case ConstraintVersion:
		if _, err := version.NewConstraint(c.RTarget); err != nil {
			mErr.Errors = append(mErr.Errors, fmt.Errorf("Version constraint is invalid: %v", err))
		}
	case ConstraintSemver:
		if _, err := semver.NewConstraint(c.RTarget); err != nil {
			mErr.Errors = append(mErr.Errors, fmt.Errorf("Semver constraint is invalid: %v", err))
		}
	case ConstraintDistinctProperty:
		// If a count is set, make sure it is convertible to a uint64
		if c.RTarget != "" {
			count, err := strconv.ParseUint(c.RTarget, 10, 64)
			if err != nil {
				mErr.Errors = append(mErr.Errors, fmt.Errorf("Failed to convert RTarget %q to uint64: %v", c.RTarget, err))
			} else if count < 1 {
				mErr.Errors = append(mErr.Errors, fmt.Errorf("Distinct Property must have an allowed count of 1 or greater: %d < 1", count))
			}
		}
	case ConstraintAttributeIsSet, ConstraintAttributeIsNotSet:
		// Presence checks are unary; an RTarget is meaningless here.
		if c.RTarget != "" {
			mErr.Errors = append(mErr.Errors, fmt.Errorf("Operator %q does not support an RTarget", c.Operand))
		}
	case "=", "==", "is", "!=", "not", "<", "<=", ">", ">=":
		if c.RTarget == "" {
			mErr.Errors = append(mErr.Errors, fmt.Errorf("Operator %q requires an RTarget", c.Operand))
		}
	default:
		mErr.Errors = append(mErr.Errors, fmt.Errorf("Unknown constraint type %q", c.Operand))
	}

	// Ensure we have an LTarget for the constraints that need one
	if requireLtarget && c.LTarget == "" {
		mErr.Errors = append(mErr.Errors, fmt.Errorf("No LTarget provided but is required by constraint"))
	}

	return mErr.ErrorOrNil()
}
8333
8334type Constraints []*Constraint
8335
8336// Equals compares Constraints as a set
8337func (xs *Constraints) Equals(ys *Constraints) bool {
8338	if xs == ys {
8339		return true
8340	}
8341	if xs == nil || ys == nil {
8342		return false
8343	}
8344	if len(*xs) != len(*ys) {
8345		return false
8346	}
8347SETEQUALS:
8348	for _, x := range *xs {
8349		for _, y := range *ys {
8350			if x.Equals(y) {
8351				continue SETEQUALS
8352			}
8353		}
8354		return false
8355	}
8356	return true
8357}
8358
// Affinity is used to score placement options based on a weight. Unlike a
// Constraint, a non-matching affinity does not exclude a node; it only
// affects its score.
type Affinity struct {
	LTarget string // Left-hand target
	RTarget string // Right-hand target
	Operand string // Affinity operand (<=, <, =, !=, >, >=), set_contains_all, set_contains_any
	Weight  int8   // Weight applied to nodes that match the affinity. Can be negative
	str     string // Memoized string form, built lazily by String
}
8367
8368// Equal checks if two affinities are equal
8369func (a *Affinity) Equals(o *Affinity) bool {
8370	return a == o ||
8371		a.LTarget == o.LTarget &&
8372			a.RTarget == o.RTarget &&
8373			a.Operand == o.Operand &&
8374			a.Weight == o.Weight
8375}
8376
// Equal is an alias for Equals, provided for naming consistency with other
// types in this package.
func (a *Affinity) Equal(o *Affinity) bool {
	return a.Equals(o)
}
8380
8381func (a *Affinity) Copy() *Affinity {
8382	if a == nil {
8383		return nil
8384	}
8385	na := new(Affinity)
8386	*na = *a
8387	return na
8388}
8389
8390func (a *Affinity) String() string {
8391	if a.str != "" {
8392		return a.str
8393	}
8394	a.str = fmt.Sprintf("%s %s %s %v", a.LTarget, a.Operand, a.RTarget, a.Weight)
8395	return a.str
8396}
8397
// Validate checks that the affinity has an operand, an LTarget, a non-zero
// weight within [-100, 100], and that the operand's RTarget requirements are
// satisfied. All problems found are collected into a single multierror.
func (a *Affinity) Validate() error {
	var mErr multierror.Error
	if a.Operand == "" {
		mErr.Errors = append(mErr.Errors, errors.New("Missing affinity operand"))
	}

	// Perform additional validation based on operand
	switch a.Operand {
	case ConstraintSetContainsAll, ConstraintSetContainsAny, ConstraintSetContains:
		if a.RTarget == "" {
			mErr.Errors = append(mErr.Errors, fmt.Errorf("Set contains operators require an RTarget"))
		}
	case ConstraintRegex:
		// Compile eagerly so a bad pattern is rejected at submit time rather
		// than surfacing during scheduling.
		if _, err := regexp.Compile(a.RTarget); err != nil {
			mErr.Errors = append(mErr.Errors, fmt.Errorf("Regular expression failed to compile: %v", err))
		}
	case ConstraintVersion:
		if _, err := version.NewConstraint(a.RTarget); err != nil {
			mErr.Errors = append(mErr.Errors, fmt.Errorf("Version affinity is invalid: %v", err))
		}
	case ConstraintSemver:
		if _, err := semver.NewConstraint(a.RTarget); err != nil {
			mErr.Errors = append(mErr.Errors, fmt.Errorf("Semver affinity is invalid: %v", err))
		}
	case "=", "==", "is", "!=", "not", "<", "<=", ">", ">=":
		if a.RTarget == "" {
			mErr.Errors = append(mErr.Errors, fmt.Errorf("Operator %q requires an RTarget", a.Operand))
		}
	default:
		mErr.Errors = append(mErr.Errors, fmt.Errorf("Unknown affinity operator %q", a.Operand))
	}

	// Ensure we have an LTarget
	if a.LTarget == "" {
		mErr.Errors = append(mErr.Errors, fmt.Errorf("No LTarget provided but is required"))
	}

	// Ensure that weight is between -100 and 100, and not zero
	if a.Weight == 0 {
		mErr.Errors = append(mErr.Errors, fmt.Errorf("Affinity weight cannot be zero"))
	}

	if a.Weight > 100 || a.Weight < -100 {
		mErr.Errors = append(mErr.Errors, fmt.Errorf("Affinity weight must be within the range [-100,100]"))
	}

	return mErr.ErrorOrNil()
}
8446
// Spread is used to specify desired distribution of allocations according to weight
type Spread struct {
	// Attribute is the node attribute used as the spread criteria
	Attribute string

	// Weight is the relative weight of this spread, useful when there are multiple
	// spread and affinities
	Weight int8

	// SpreadTarget is used to describe desired percentages for each attribute value
	SpreadTarget []*SpreadTarget

	// str is the memoized string representation, built lazily by String.
	str string
}
8462
8463type Affinities []*Affinity
8464
8465// Equals compares Affinities as a set
8466func (xs *Affinities) Equals(ys *Affinities) bool {
8467	if xs == ys {
8468		return true
8469	}
8470	if xs == nil || ys == nil {
8471		return false
8472	}
8473	if len(*xs) != len(*ys) {
8474		return false
8475	}
8476SETEQUALS:
8477	for _, x := range *xs {
8478		for _, y := range *ys {
8479			if x.Equals(y) {
8480				continue SETEQUALS
8481			}
8482		}
8483		return false
8484	}
8485	return true
8486}
8487
8488func (s *Spread) Copy() *Spread {
8489	if s == nil {
8490		return nil
8491	}
8492	ns := new(Spread)
8493	*ns = *s
8494
8495	ns.SpreadTarget = CopySliceSpreadTarget(s.SpreadTarget)
8496	return ns
8497}
8498
8499func (s *Spread) String() string {
8500	if s.str != "" {
8501		return s.str
8502	}
8503	s.str = fmt.Sprintf("%s %s %v", s.Attribute, s.SpreadTarget, s.Weight)
8504	return s.str
8505}
8506
// Validate checks the spread stanza: a non-empty attribute, a weight in
// (0, 100], unique target values, per-target percentages of at most 100, and
// a total percentage of at most 100. All problems found are collected into a
// single multierror.
func (s *Spread) Validate() error {
	var mErr multierror.Error
	if s.Attribute == "" {
		mErr.Errors = append(mErr.Errors, errors.New("Missing spread attribute"))
	}
	// NOTE(review): the message says "from 0 to 100" but the check also
	// rejects a weight of exactly 0.
	if s.Weight <= 0 || s.Weight > 100 {
		mErr.Errors = append(mErr.Errors, errors.New("Spread stanza must have a positive weight from 0 to 100"))
	}
	// seen tracks target values already encountered so duplicates can be
	// reported.
	seen := make(map[string]struct{})
	// sumPercent accumulates in a uint32 so summing many uint8 percentages
	// cannot overflow.
	sumPercent := uint32(0)

	for _, target := range s.SpreadTarget {
		// Make sure there are no duplicates
		_, ok := seen[target.Value]
		if !ok {
			seen[target.Value] = struct{}{}
		} else {
			mErr.Errors = append(mErr.Errors, fmt.Errorf("Spread target value %q already defined", target.Value))
		}
		if target.Percent > 100 {
			mErr.Errors = append(mErr.Errors, fmt.Errorf("Spread target percentage for value %q must be between 0 and 100", target.Value))
		}
		sumPercent += uint32(target.Percent)
	}
	if sumPercent > 100 {
		mErr.Errors = append(mErr.Errors, fmt.Errorf("Sum of spread target percentages must not be greater than 100%%; got %d%%", sumPercent))
	}
	return mErr.ErrorOrNil()
}
8536
// SpreadTarget is used to specify desired percentages for each attribute value
type SpreadTarget struct {
	// Value is a single attribute value, like "dc1"
	Value string

	// Percent is the desired percentage of allocs
	Percent uint8

	// str is the memoized string representation, built lazily by String.
	str string
}
8548
8549func (s *SpreadTarget) Copy() *SpreadTarget {
8550	if s == nil {
8551		return nil
8552	}
8553
8554	ns := new(SpreadTarget)
8555	*ns = *s
8556	return ns
8557}
8558
8559func (s *SpreadTarget) String() string {
8560	if s.str != "" {
8561		return s.str
8562	}
8563	s.str = fmt.Sprintf("%q %v%%", s.Value, s.Percent)
8564	return s.str
8565}
8566
// EphemeralDisk is an ephemeral disk object
type EphemeralDisk struct {
	// Sticky indicates whether the allocation is sticky to a node
	Sticky bool

	// SizeMB is the size of the local disk, in megabytes
	SizeMB int

	// Migrate determines if Nomad client should migrate the allocation dir for
	// sticky allocations
	Migrate bool
}
8579
8580// DefaultEphemeralDisk returns a EphemeralDisk with default configurations
8581func DefaultEphemeralDisk() *EphemeralDisk {
8582	return &EphemeralDisk{
8583		SizeMB: 300,
8584	}
8585}
8586
8587// Validate validates EphemeralDisk
8588func (d *EphemeralDisk) Validate() error {
8589	if d.SizeMB < 10 {
8590		return fmt.Errorf("minimum DiskMB value is 10; got %d", d.SizeMB)
8591	}
8592	return nil
8593}
8594
8595// Copy copies the EphemeralDisk struct and returns a new one
8596func (d *EphemeralDisk) Copy() *EphemeralDisk {
8597	ld := new(EphemeralDisk)
8598	*ld = *d
8599	return ld
8600}
8601
var (
	// VaultUnrecoverableError matches unrecoverable errors returned by a Vault
	// server: responses mentioning status codes 400, 403, or 404, which will
	// not succeed on retry.
	VaultUnrecoverableError = regexp.MustCompile(`Code:\s+40(0|3|4)`)
)
8607
// VaultChangeMode* are the possible behaviors a task can have when its Vault
// token is re-acquired; see Vault.ChangeMode.
const (
	// VaultChangeModeNoop takes no action when a new token is retrieved.
	VaultChangeModeNoop = "noop"

	// VaultChangeModeSignal signals the task when a new token is retrieved.
	VaultChangeModeSignal = "signal"

	// VaultChangeModeRestart restarts the task when a new token is retrieved.
	VaultChangeModeRestart = "restart"
)
8618
// Vault stores the set of permissions a task needs access to from Vault.
type Vault struct {
	// Policies is the set of policies that the task needs access to
	Policies []string

	// Namespace is the vault namespace that should be used.
	Namespace string

	// Env marks whether the Vault Token should be exposed as an environment
	// variable
	Env bool

	// ChangeMode is used to configure the task's behavior when the Vault
	// token changes because the original token could not be renewed in time.
	// Valid values are the VaultChangeMode* constants.
	ChangeMode string

	// ChangeSignal is the signal sent to the task when a new token is
	// retrieved. This is only valid when using the signal change mode.
	ChangeSignal string
}
8639
8640func DefaultVaultBlock() *Vault {
8641	return &Vault{
8642		Env:        true,
8643		ChangeMode: VaultChangeModeRestart,
8644	}
8645}
8646
8647// Copy returns a copy of this Vault block.
8648func (v *Vault) Copy() *Vault {
8649	if v == nil {
8650		return nil
8651	}
8652
8653	nv := new(Vault)
8654	*nv = *v
8655	return nv
8656}
8657
8658func (v *Vault) Canonicalize() {
8659	if v.ChangeSignal != "" {
8660		v.ChangeSignal = strings.ToUpper(v.ChangeSignal)
8661	}
8662}
8663
// Validate returns if the Vault block is valid. A nil block is valid. At
// least one policy is required, the "root" policy is forbidden, and a signal
// name must accompany the signal change mode.
func (v *Vault) Validate() error {
	if v == nil {
		return nil
	}

	var mErr multierror.Error
	if len(v.Policies) == 0 {
		_ = multierror.Append(&mErr, fmt.Errorf("Policy list cannot be empty"))
	}

	for _, p := range v.Policies {
		// Granting root would give tasks unrestricted Vault access.
		if p == "root" {
			_ = multierror.Append(&mErr, fmt.Errorf("Can not specify \"root\" policy"))
		}
	}

	switch v.ChangeMode {
	case VaultChangeModeSignal:
		if v.ChangeSignal == "" {
			_ = multierror.Append(&mErr, fmt.Errorf("Signal must be specified when using change mode %q", VaultChangeModeSignal))
		}
	case VaultChangeModeNoop, VaultChangeModeRestart:
		// Valid modes that require no extra configuration.
	default:
		_ = multierror.Append(&mErr, fmt.Errorf("Unknown change mode %q", v.ChangeMode))
	}

	return mErr.ErrorOrNil()
}
8693
const (
	// DeploymentStatuses are the various states a deployment can be be in
	DeploymentStatusRunning    = "running"
	DeploymentStatusPaused     = "paused"
	DeploymentStatusFailed     = "failed"
	DeploymentStatusSuccessful = "successful"
	DeploymentStatusCancelled  = "cancelled"
	DeploymentStatusPending    = "pending"
	DeploymentStatusBlocked    = "blocked"
	DeploymentStatusUnblocking = "unblocking"

	// TODO Statuses and Descriptions do not match 1:1 and we sometimes use the Description as a status flag

	// DeploymentStatusDescriptions are the various descriptions of the states a
	// deployment can be in.
	DeploymentStatusDescriptionRunning               = "Deployment is running"
	DeploymentStatusDescriptionRunningNeedsPromotion = "Deployment is running but requires manual promotion"
	DeploymentStatusDescriptionRunningAutoPromotion  = "Deployment is running pending automatic promotion"
	DeploymentStatusDescriptionPaused                = "Deployment is paused"
	DeploymentStatusDescriptionSuccessful            = "Deployment completed successfully"
	DeploymentStatusDescriptionStoppedJob            = "Cancelled because job is stopped"
	DeploymentStatusDescriptionNewerJob              = "Cancelled due to newer version of job"
	DeploymentStatusDescriptionFailedAllocations     = "Failed due to unhealthy allocations"
	DeploymentStatusDescriptionProgressDeadline      = "Failed due to progress deadline"
	DeploymentStatusDescriptionFailedByUser          = "Deployment marked as failed"

	// The following descriptions are used only in multiregion deployments,
	// where one region's deployment can block on or fail due to a peer
	// region's deployment.
	DeploymentStatusDescriptionFailedByPeer   = "Failed because of an error in peer region"
	DeploymentStatusDescriptionBlocked        = "Deployment is complete but waiting for peer region"
	DeploymentStatusDescriptionUnblocking     = "Deployment is unblocking remaining regions"
	DeploymentStatusDescriptionPendingForPeer = "Deployment is pending, waiting for peer region"
)
8726
// DeploymentStatusDescriptionRollback is used to get the status description of
// a deployment when rolling back to an older job.
func DeploymentStatusDescriptionRollback(baseDescription string, jobVersion uint64) string {
	return baseDescription + " - rolling back to job version " + strconv.FormatUint(jobVersion, 10)
}
8732
// DeploymentStatusDescriptionRollbackNoop is used to get the status description of
// a deployment when rolling back is not possible because it has the same specification
func DeploymentStatusDescriptionRollbackNoop(baseDescription string, jobVersion uint64) string {
	return baseDescription +
		" - not rolling back to stable job version " +
		strconv.FormatUint(jobVersion, 10) +
		" as current job has same specification"
}
8738
// DeploymentStatusDescriptionNoRollbackTarget is used to get the status description of
// a deployment when there is no target to rollback to but autorevert is desired.
func DeploymentStatusDescriptionNoRollbackTarget(baseDescription string) string {
	return baseDescription + " - no stable job version to auto revert to"
}
8744
// Deployment is the object that represents a job deployment which is used to
// transition a job between versions.
type Deployment struct {
	// ID is a generated UUID for the deployment
	ID string

	// Namespace is the namespace the deployment is created in
	Namespace string

	// JobID is the job the deployment is created for
	JobID string

	// JobVersion is the version of the job at which the deployment is tracking
	JobVersion uint64

	// JobModifyIndex is the ModifyIndex of the job which the deployment is
	// tracking.
	JobModifyIndex uint64

	// JobSpecModifyIndex is the JobModifyIndex of the job which the
	// deployment is tracking.
	JobSpecModifyIndex uint64

	// JobCreateIndex is the create index of the job which the deployment is
	// tracking. It is needed so that if the job gets stopped and reran we can
	// present the correct list of deployments for the job and not old ones.
	JobCreateIndex uint64

	// IsMultiregion specifies if deployment is part of a multiregion
	// deployment.
	IsMultiregion bool

	// TaskGroups is the set of task groups effected by the deployment and their
	// current deployment status, keyed by task group name.
	TaskGroups map[string]*DeploymentState

	// Status is the status of the deployment; one of the DeploymentStatus*
	// constants.
	Status string

	// StatusDescription allows a human readable description of the deployment
	// status.
	StatusDescription string

	// CreateIndex and ModifyIndex are the raft indexes at which the
	// deployment was created and last modified.
	CreateIndex uint64
	ModifyIndex uint64
}
8790
// NewDeployment creates a new deployment given the job. The deployment gets
// a fresh UUID, snapshots the job's version and raft indexes, starts in the
// running state, and pre-sizes the per-task-group state map.
func NewDeployment(job *Job) *Deployment {
	return &Deployment{
		ID:                 uuid.Generate(),
		Namespace:          job.Namespace,
		JobID:              job.ID,
		JobVersion:         job.Version,
		JobModifyIndex:     job.ModifyIndex,
		JobSpecModifyIndex: job.JobModifyIndex,
		JobCreateIndex:     job.CreateIndex,
		IsMultiregion:      job.IsMultiregion(),
		Status:             DeploymentStatusRunning,
		StatusDescription:  DeploymentStatusDescriptionRunning,
		TaskGroups:         make(map[string]*DeploymentState, len(job.TaskGroups)),
	}
}
8807
8808func (d *Deployment) Copy() *Deployment {
8809	if d == nil {
8810		return nil
8811	}
8812
8813	c := &Deployment{}
8814	*c = *d
8815
8816	c.TaskGroups = nil
8817	if l := len(d.TaskGroups); d.TaskGroups != nil {
8818		c.TaskGroups = make(map[string]*DeploymentState, l)
8819		for tg, s := range d.TaskGroups {
8820			c.TaskGroups[tg] = s.Copy()
8821		}
8822	}
8823
8824	return c
8825}
8826
8827// Active returns whether the deployment is active or terminal.
8828func (d *Deployment) Active() bool {
8829	switch d.Status {
8830	case DeploymentStatusRunning, DeploymentStatusPaused, DeploymentStatusBlocked, DeploymentStatusUnblocking, DeploymentStatusPending:
8831		return true
8832	default:
8833		return false
8834	}
8835}
8836
8837// GetID is a helper for getting the ID when the object may be nil
8838func (d *Deployment) GetID() string {
8839	if d == nil {
8840		return ""
8841	}
8842	return d.ID
8843}
8844
8845// HasPlacedCanaries returns whether the deployment has placed canaries
8846func (d *Deployment) HasPlacedCanaries() bool {
8847	if d == nil || len(d.TaskGroups) == 0 {
8848		return false
8849	}
8850	for _, group := range d.TaskGroups {
8851		if len(group.PlacedCanaries) != 0 {
8852			return true
8853		}
8854	}
8855	return false
8856}
8857
8858// RequiresPromotion returns whether the deployment requires promotion to
8859// continue
8860func (d *Deployment) RequiresPromotion() bool {
8861	if d == nil || len(d.TaskGroups) == 0 || d.Status != DeploymentStatusRunning {
8862		return false
8863	}
8864	for _, group := range d.TaskGroups {
8865		if group.DesiredCanaries > 0 && !group.Promoted {
8866			return true
8867		}
8868	}
8869	return false
8870}
8871
8872// HasAutoPromote determines if all taskgroups are marked auto_promote
8873func (d *Deployment) HasAutoPromote() bool {
8874	if d == nil || len(d.TaskGroups) == 0 || d.Status != DeploymentStatusRunning {
8875		return false
8876	}
8877	for _, group := range d.TaskGroups {
8878		if !group.AutoPromote {
8879			return false
8880		}
8881	}
8882	return true
8883}
8884
8885func (d *Deployment) GoString() string {
8886	base := fmt.Sprintf("Deployment ID %q for job %q has status %q (%v):", d.ID, d.JobID, d.Status, d.StatusDescription)
8887	for group, state := range d.TaskGroups {
8888		base += fmt.Sprintf("\nTask Group %q has state:\n%#v", group, state)
8889	}
8890	return base
8891}
8892
// DeploymentState tracks the state of a deployment for a given task group.
type DeploymentState struct {
	// AutoRevert marks whether the task group has indicated the job should be
	// reverted on failure
	AutoRevert bool

	// AutoPromote marks promotion triggered automatically by healthy canaries
	// copied from TaskGroup UpdateStrategy in scheduler.reconcile
	AutoPromote bool

	// ProgressDeadline is the deadline by which an allocation must transition
	// to healthy before the deployment is considered failed. This value is set
	// by the jobspec `update.progress_deadline` field.
	ProgressDeadline time.Duration

	// RequireProgressBy is the time by which an allocation must transition to
	// healthy before the deployment is considered failed. This value is reset
	// to "now" + ProgressDeadline when an allocation updates the deployment.
	RequireProgressBy time.Time

	// Promoted marks whether the canaries have been promoted
	Promoted bool

	// PlacedCanaries is the set of placed canary allocations, by alloc ID.
	PlacedCanaries []string

	// DesiredCanaries is the number of canaries that should be created.
	DesiredCanaries int

	// DesiredTotal is the total number of allocations that should be created as
	// part of the deployment.
	DesiredTotal int

	// PlacedAllocs is the number of allocations that have been placed
	PlacedAllocs int

	// HealthyAllocs is the number of allocations that have been marked healthy.
	HealthyAllocs int

	// UnhealthyAllocs are allocations that have been marked as unhealthy.
	UnhealthyAllocs int
}
8935
8936func (d *DeploymentState) GoString() string {
8937	base := fmt.Sprintf("\tDesired Total: %d", d.DesiredTotal)
8938	base += fmt.Sprintf("\n\tDesired Canaries: %d", d.DesiredCanaries)
8939	base += fmt.Sprintf("\n\tPlaced Canaries: %#v", d.PlacedCanaries)
8940	base += fmt.Sprintf("\n\tPromoted: %v", d.Promoted)
8941	base += fmt.Sprintf("\n\tPlaced: %d", d.PlacedAllocs)
8942	base += fmt.Sprintf("\n\tHealthy: %d", d.HealthyAllocs)
8943	base += fmt.Sprintf("\n\tUnhealthy: %d", d.UnhealthyAllocs)
8944	base += fmt.Sprintf("\n\tAutoRevert: %v", d.AutoRevert)
8945	base += fmt.Sprintf("\n\tAutoPromote: %v", d.AutoPromote)
8946	return base
8947}
8948
8949func (d *DeploymentState) Copy() *DeploymentState {
8950	c := &DeploymentState{}
8951	*c = *d
8952	c.PlacedCanaries = helper.CopySliceString(d.PlacedCanaries)
8953	return c
8954}
8955
// DeploymentStatusUpdate is used to update the status of a given deployment
type DeploymentStatusUpdate struct {
	// DeploymentID is the ID of the deployment to update
	DeploymentID string

	// Status is the new status of the deployment; one of the
	// DeploymentStatus* constants.
	Status string

	// StatusDescription is the new status description of the deployment.
	StatusDescription string
}
8967
// RescheduleTracker encapsulates previous reschedule events
type RescheduleTracker struct {
	// Events is the ordered history of reschedule attempts.
	Events []*RescheduleEvent
}
8972
8973func (rt *RescheduleTracker) Copy() *RescheduleTracker {
8974	if rt == nil {
8975		return nil
8976	}
8977	nt := &RescheduleTracker{}
8978	*nt = *rt
8979	rescheduleEvents := make([]*RescheduleEvent, 0, len(rt.Events))
8980	for _, tracker := range rt.Events {
8981		rescheduleEvents = append(rescheduleEvents, tracker.Copy())
8982	}
8983	nt.Events = rescheduleEvents
8984	return nt
8985}
8986
// RescheduleEvent is used to keep track of previous attempts at rescheduling an allocation
type RescheduleEvent struct {
	// RescheduleTime is the timestamp of a reschedule attempt, as Unix
	// nanoseconds.
	RescheduleTime int64

	// PrevAllocID is the ID of the previous allocation being restarted
	PrevAllocID string

	// PrevNodeID is the node ID of the previous allocation
	PrevNodeID string

	// Delay is the reschedule delay associated with the attempt
	Delay time.Duration
}
9001
9002func NewRescheduleEvent(rescheduleTime int64, prevAllocID string, prevNodeID string, delay time.Duration) *RescheduleEvent {
9003	return &RescheduleEvent{RescheduleTime: rescheduleTime,
9004		PrevAllocID: prevAllocID,
9005		PrevNodeID:  prevNodeID,
9006		Delay:       delay}
9007}
9008
9009func (re *RescheduleEvent) Copy() *RescheduleEvent {
9010	if re == nil {
9011		return nil
9012	}
9013	copy := new(RescheduleEvent)
9014	*copy = *re
9015	return copy
9016}
9017
// DesiredTransition is used to mark an allocation as having a desired state
// transition. This information can be used by the scheduler to make the
// correct decision. All fields are pointers so that "unset" can be
// distinguished from an explicit false (see Merge).
type DesiredTransition struct {
	// Migrate is used to indicate that this allocation should be stopped and
	// migrated to another node.
	Migrate *bool

	// Reschedule is used to indicate that this allocation is eligible to be
	// rescheduled. Most allocations are automatically eligible for
	// rescheduling, so this field is only required when an allocation is not
	// automatically eligible. An example is an allocation that is part of a
	// deployment.
	Reschedule *bool

	// ForceReschedule is used to indicate that this allocation must be rescheduled.
	// This field is only used when operators want to force a placement even if
	// a failed allocation is not eligible to be rescheduled
	ForceReschedule *bool
}
9038
9039// Merge merges the two desired transitions, preferring the values from the
9040// passed in object.
9041func (d *DesiredTransition) Merge(o *DesiredTransition) {
9042	if o.Migrate != nil {
9043		d.Migrate = o.Migrate
9044	}
9045
9046	if o.Reschedule != nil {
9047		d.Reschedule = o.Reschedule
9048	}
9049
9050	if o.ForceReschedule != nil {
9051		d.ForceReschedule = o.ForceReschedule
9052	}
9053}
9054
9055// ShouldMigrate returns whether the transition object dictates a migration.
9056func (d *DesiredTransition) ShouldMigrate() bool {
9057	return d.Migrate != nil && *d.Migrate
9058}
9059
9060// ShouldReschedule returns whether the transition object dictates a
9061// rescheduling.
9062func (d *DesiredTransition) ShouldReschedule() bool {
9063	return d.Reschedule != nil && *d.Reschedule
9064}
9065
9066// ShouldForceReschedule returns whether the transition object dictates a
9067// forced rescheduling.
9068func (d *DesiredTransition) ShouldForceReschedule() bool {
9069	if d == nil {
9070		return false
9071	}
9072	return d.ForceReschedule != nil && *d.ForceReschedule
9073}
9074
// AllocDesiredStatus* are the desired states the server can assign an
// allocation.
const (
	AllocDesiredStatusRun   = "run"   // Allocation should run
	AllocDesiredStatusStop  = "stop"  // Allocation should stop
	AllocDesiredStatusEvict = "evict" // Allocation should stop, and was evicted
)

// AllocClientStatus* are the states a client reports for an allocation it is
// running.
const (
	AllocClientStatusPending  = "pending"
	AllocClientStatusRunning  = "running"
	AllocClientStatusComplete = "complete"
	AllocClientStatusFailed   = "failed"
	AllocClientStatusLost     = "lost"
)
9088
// Allocation is used to allocate the placement of a task group to a node.
type Allocation struct {
	// msgpack omit empty fields during serialization
	_struct bool `codec:",omitempty"` // nolint: structcheck

	// ID of the allocation (UUID)
	ID string

	// Namespace is the namespace the allocation is created in
	Namespace string

	// ID of the evaluation that generated this allocation
	EvalID string

	// Name is a logical name of the allocation, of the form
	// "<job>.<taskgroup>[<index>]" (see Index).
	Name string

	// NodeID is the node this is being placed on
	NodeID string

	// NodeName is the name of the node this is being placed on.
	NodeName string

	// Job is the parent job of the task group being allocated.
	// This is copied at allocation time to avoid issues if the job
	// definition is updated.
	JobID string
	Job   *Job

	// TaskGroup is the name of the task group that should be run
	TaskGroup string

	// COMPAT(0.11): Remove in 0.11
	// Resources is the total set of resources allocated as part
	// of this allocation of the task group. Dynamic ports will be set by
	// the scheduler.
	Resources *Resources

	// SharedResources are the resources that are shared by all the tasks in an
	// allocation
	// Deprecated: use AllocatedResources.Shared instead.
	// Keep field to allow us to handle upgrade paths from old versions
	SharedResources *Resources

	// TaskResources is the set of resources allocated to each
	// task. These should sum to the total Resources. Dynamic ports will be
	// set by the scheduler.
	// Deprecated: use AllocatedResources.Tasks instead.
	// Keep field to allow us to handle upgrade paths from old versions
	TaskResources map[string]*Resources

	// AllocatedResources is the total resources allocated for the task group.
	AllocatedResources *AllocatedResources

	// Metrics associated with this allocation
	Metrics *AllocMetric

	// DesiredStatus of the allocation on the client; one of the
	// AllocDesiredStatus* constants.
	DesiredStatus string

	// DesiredDescription is meant to provide more human useful information
	// about the desired status.
	DesiredDescription string

	// DesiredTransition is used to indicate that a state transition
	// is desired for a given reason.
	DesiredTransition DesiredTransition

	// ClientStatus of the allocation on the client; one of the
	// AllocClientStatus* constants.
	ClientStatus string

	// ClientDescription is meant to provide more human useful information
	// about the client status.
	ClientDescription string

	// TaskStates stores the state of each task, keyed by task name.
	TaskStates map[string]*TaskState

	// AllocStates track meta data associated with changes to the state of the whole allocation, like becoming lost
	AllocStates []*AllocState

	// PreviousAllocation is the allocation that this allocation is replacing
	PreviousAllocation string

	// NextAllocation is the allocation that this allocation is being replaced by
	NextAllocation string

	// DeploymentID identifies an allocation as being created from a
	// particular deployment
	DeploymentID string

	// DeploymentStatus captures the status of the allocation as part of the
	// given deployment
	DeploymentStatus *AllocDeploymentStatus

	// RescheduleTracker captures details of previous reschedule attempts of
	// the allocation.
	RescheduleTracker *RescheduleTracker

	// NetworkStatus captures networking details of an allocation known at runtime
	NetworkStatus *AllocNetworkStatus

	// FollowupEvalID captures a follow up evaluation created to handle a failed allocation
	// that can be rescheduled in the future
	FollowupEvalID string

	// PreemptedAllocations captures IDs of any allocations that were preempted
	// in order to place this allocation
	PreemptedAllocations []string

	// PreemptedByAllocation tracks the alloc ID of the allocation that caused this allocation
	// to stop running because it got preempted
	PreemptedByAllocation string

	// Raft Indexes
	CreateIndex uint64
	ModifyIndex uint64

	// AllocModifyIndex is not updated when the client updates allocations. This
	// lets the client pull only the allocs updated by the server.
	AllocModifyIndex uint64

	// CreateTime is the time the allocation has finished scheduling and been
	// verified by the plan applier.
	CreateTime int64

	// ModifyTime is the time the allocation was last updated.
	ModifyTime int64
}
9215
9216// ConsulNamespace returns the Consul namespace of the task group associated
9217// with this allocation.
9218func (a *Allocation) ConsulNamespace() string {
9219	return a.Job.LookupTaskGroup(a.TaskGroup).Consul.GetNamespace()
9220}
9221
9222func (a *Allocation) JobNamespacedID() NamespacedID {
9223	return NewNamespacedID(a.JobID, a.Namespace)
9224}
9225
9226// Index returns the index of the allocation. If the allocation is from a task
9227// group with count greater than 1, there will be multiple allocations for it.
9228func (a *Allocation) Index() uint {
9229	l := len(a.Name)
9230	prefix := len(a.JobID) + len(a.TaskGroup) + 2
9231	if l <= 3 || l <= prefix {
9232		return uint(0)
9233	}
9234
9235	strNum := a.Name[prefix : len(a.Name)-1]
9236	num, _ := strconv.Atoi(strNum)
9237	return uint(num)
9238}
9239
// Copy provides a copy of the allocation and deep copies the job
func (a *Allocation) Copy() *Allocation {
	return a.copyImpl(true)
}
9244
// CopySkipJob provides a copy of the allocation but doesn't deep copy the job.
// The copy shares the Job pointer with the original.
func (a *Allocation) CopySkipJob() *Allocation {
	return a.copyImpl(false)
}
9249
9250// Canonicalize Allocation to ensure fields are initialized to the expectations
9251// of this version of Nomad. Should be called when restoring persisted
9252// Allocations or receiving Allocations from Nomad agents potentially on an
9253// older version of Nomad.
9254func (a *Allocation) Canonicalize() {
9255	if a.AllocatedResources == nil && a.TaskResources != nil {
9256		ar := AllocatedResources{}
9257
9258		tasks := make(map[string]*AllocatedTaskResources, len(a.TaskResources))
9259		for name, tr := range a.TaskResources {
9260			atr := AllocatedTaskResources{}
9261			atr.Cpu.CpuShares = int64(tr.CPU)
9262			atr.Memory.MemoryMB = int64(tr.MemoryMB)
9263			atr.Networks = tr.Networks.Copy()
9264
9265			tasks[name] = &atr
9266		}
9267		ar.Tasks = tasks
9268
9269		if a.SharedResources != nil {
9270			ar.Shared.DiskMB = int64(a.SharedResources.DiskMB)
9271			ar.Shared.Networks = a.SharedResources.Networks.Copy()
9272		}
9273
9274		a.AllocatedResources = &ar
9275	}
9276
9277	a.Job.Canonicalize()
9278}
9279
9280func (a *Allocation) copyImpl(job bool) *Allocation {
9281	if a == nil {
9282		return nil
9283	}
9284	na := new(Allocation)
9285	*na = *a
9286
9287	if job {
9288		na.Job = na.Job.Copy()
9289	}
9290
9291	na.AllocatedResources = na.AllocatedResources.Copy()
9292	na.Resources = na.Resources.Copy()
9293	na.SharedResources = na.SharedResources.Copy()
9294
9295	if a.TaskResources != nil {
9296		tr := make(map[string]*Resources, len(na.TaskResources))
9297		for task, resource := range na.TaskResources {
9298			tr[task] = resource.Copy()
9299		}
9300		na.TaskResources = tr
9301	}
9302
9303	na.Metrics = na.Metrics.Copy()
9304	na.DeploymentStatus = na.DeploymentStatus.Copy()
9305
9306	if a.TaskStates != nil {
9307		ts := make(map[string]*TaskState, len(na.TaskStates))
9308		for task, state := range na.TaskStates {
9309			ts[task] = state.Copy()
9310		}
9311		na.TaskStates = ts
9312	}
9313
9314	na.RescheduleTracker = a.RescheduleTracker.Copy()
9315	na.PreemptedAllocations = helper.CopySliceString(a.PreemptedAllocations)
9316	return na
9317}
9318
9319// TerminalStatus returns if the desired or actual status is terminal and
9320// will no longer transition.
9321func (a *Allocation) TerminalStatus() bool {
9322	// First check the desired state and if that isn't terminal, check client
9323	// state.
9324	return a.ServerTerminalStatus() || a.ClientTerminalStatus()
9325}
9326
9327// ServerTerminalStatus returns true if the desired state of the allocation is terminal
9328func (a *Allocation) ServerTerminalStatus() bool {
9329	switch a.DesiredStatus {
9330	case AllocDesiredStatusStop, AllocDesiredStatusEvict:
9331		return true
9332	default:
9333		return false
9334	}
9335}
9336
9337// ClientTerminalStatus returns if the client status is terminal and will no longer transition
9338func (a *Allocation) ClientTerminalStatus() bool {
9339	switch a.ClientStatus {
9340	case AllocClientStatusComplete, AllocClientStatusFailed, AllocClientStatusLost:
9341		return true
9342	default:
9343		return false
9344	}
9345}
9346
9347// ShouldReschedule returns if the allocation is eligible to be rescheduled according
9348// to its status and ReschedulePolicy given its failure time
9349func (a *Allocation) ShouldReschedule(reschedulePolicy *ReschedulePolicy, failTime time.Time) bool {
9350	// First check the desired state
9351	switch a.DesiredStatus {
9352	case AllocDesiredStatusStop, AllocDesiredStatusEvict:
9353		return false
9354	default:
9355	}
9356	switch a.ClientStatus {
9357	case AllocClientStatusFailed:
9358		return a.RescheduleEligible(reschedulePolicy, failTime)
9359	default:
9360		return false
9361	}
9362}
9363
9364// RescheduleEligible returns if the allocation is eligible to be rescheduled according
9365// to its ReschedulePolicy and the current state of its reschedule trackers
9366func (a *Allocation) RescheduleEligible(reschedulePolicy *ReschedulePolicy, failTime time.Time) bool {
9367	if reschedulePolicy == nil {
9368		return false
9369	}
9370	attempts := reschedulePolicy.Attempts
9371	interval := reschedulePolicy.Interval
9372	enabled := attempts > 0 || reschedulePolicy.Unlimited
9373	if !enabled {
9374		return false
9375	}
9376	if reschedulePolicy.Unlimited {
9377		return true
9378	}
9379	// Early return true if there are no attempts yet and the number of allowed attempts is > 0
9380	if (a.RescheduleTracker == nil || len(a.RescheduleTracker.Events) == 0) && attempts > 0 {
9381		return true
9382	}
9383	attempted := 0
9384	for j := len(a.RescheduleTracker.Events) - 1; j >= 0; j-- {
9385		lastAttempt := a.RescheduleTracker.Events[j].RescheduleTime
9386		timeDiff := failTime.UTC().UnixNano() - lastAttempt
9387		if timeDiff < interval.Nanoseconds() {
9388			attempted += 1
9389		}
9390	}
9391	return attempted < attempts
9392}
9393
9394// LastEventTime is the time of the last task event in the allocation.
9395// It is used to determine allocation failure time. If the FinishedAt field
9396// is not set, the alloc's modify time is used
9397func (a *Allocation) LastEventTime() time.Time {
9398	var lastEventTime time.Time
9399	if a.TaskStates != nil {
9400		for _, s := range a.TaskStates {
9401			if lastEventTime.IsZero() || s.FinishedAt.After(lastEventTime) {
9402				lastEventTime = s.FinishedAt
9403			}
9404		}
9405	}
9406
9407	if lastEventTime.IsZero() {
9408		return time.Unix(0, a.ModifyTime).UTC()
9409	}
9410	return lastEventTime
9411}
9412
9413// ReschedulePolicy returns the reschedule policy based on the task group
9414func (a *Allocation) ReschedulePolicy() *ReschedulePolicy {
9415	tg := a.Job.LookupTaskGroup(a.TaskGroup)
9416	if tg == nil {
9417		return nil
9418	}
9419	return tg.ReschedulePolicy
9420}
9421
9422// MigrateStrategy returns the migrate strategy based on the task group
9423func (a *Allocation) MigrateStrategy() *MigrateStrategy {
9424	tg := a.Job.LookupTaskGroup(a.TaskGroup)
9425	if tg == nil {
9426		return nil
9427	}
9428	return tg.Migrate
9429}
9430
// NextRescheduleTime returns a time on or after which the allocation is eligible to be rescheduled,
// and whether the next reschedule time is within policy's interval if the policy doesn't allow unlimited reschedules
func (a *Allocation) NextRescheduleTime() (time.Time, bool) {
	failTime := a.LastEventTime()
	reschedulePolicy := a.ReschedulePolicy()
	// Only failed allocations with a known failure time and a reschedule
	// policy can produce a next reschedule time.
	if a.DesiredStatus == AllocDesiredStatusStop || a.ClientStatus != AllocClientStatusFailed || failTime.IsZero() || reschedulePolicy == nil {
		return time.Time{}, false
	}

	// The next attempt happens once the policy's delay function elapses
	// after the failure time.
	nextDelay := a.NextDelay()
	nextRescheduleTime := failTime.Add(nextDelay)
	// Unlimited policies are always eligible; limited ones with no prior
	// attempts are eligible when at least one attempt is allowed.
	rescheduleEligible := reschedulePolicy.Unlimited || (reschedulePolicy.Attempts > 0 && a.RescheduleTracker == nil)
	if reschedulePolicy.Attempts > 0 && a.RescheduleTracker != nil && a.RescheduleTracker.Events != nil {
		// Check for eligibility based on the interval if max attempts is set
		attempted := 0
		for j := len(a.RescheduleTracker.Events) - 1; j >= 0; j-- {
			lastAttempt := a.RescheduleTracker.Events[j].RescheduleTime
			timeDiff := failTime.UTC().UnixNano() - lastAttempt
			if timeDiff < reschedulePolicy.Interval.Nanoseconds() {
				attempted += 1
			}
		}
		// Eligible only when attempts remain in the interval and the delay
		// itself still fits inside the interval.
		rescheduleEligible = attempted < reschedulePolicy.Attempts && nextDelay < reschedulePolicy.Interval
	}
	return nextRescheduleTime, rescheduleEligible
}
9457
9458// ShouldClientStop tests an alloc for StopAfterClientDisconnect configuration
9459func (a *Allocation) ShouldClientStop() bool {
9460	tg := a.Job.LookupTaskGroup(a.TaskGroup)
9461	if tg == nil ||
9462		tg.StopAfterClientDisconnect == nil ||
9463		*tg.StopAfterClientDisconnect == 0*time.Nanosecond {
9464		return false
9465	}
9466	return true
9467}
9468
9469// WaitClientStop uses the reschedule delay mechanism to block rescheduling until
9470// StopAfterClientDisconnect's block interval passes
9471func (a *Allocation) WaitClientStop() time.Time {
9472	tg := a.Job.LookupTaskGroup(a.TaskGroup)
9473
9474	// An alloc can only be marked lost once, so use the first lost transition
9475	var t time.Time
9476	for _, s := range a.AllocStates {
9477		if s.Field == AllocStateFieldClientStatus &&
9478			s.Value == AllocClientStatusLost {
9479			t = s.Time
9480			break
9481		}
9482	}
9483
9484	// On the first pass, the alloc hasn't been marked lost yet, and so we start
9485	// counting from now
9486	if t.IsZero() {
9487		t = time.Now().UTC()
9488	}
9489
9490	// Find the max kill timeout
9491	kill := DefaultKillTimeout
9492	for _, t := range tg.Tasks {
9493		if t.KillTimeout > kill {
9494			kill = t.KillTimeout
9495		}
9496	}
9497
9498	return t.Add(*tg.StopAfterClientDisconnect + kill)
9499}
9500
// NextDelay returns a duration after which the allocation can be rescheduled.
// It is calculated according to the delay function and previous reschedule attempts.
func (a *Allocation) NextDelay() time.Duration {
	policy := a.ReschedulePolicy()
	// Can be nil if the task group was updated to remove its reschedule policy
	if policy == nil {
		return 0
	}
	delayDur := policy.Delay
	// With no prior reschedule attempts the base delay applies.
	if a.RescheduleTracker == nil || a.RescheduleTracker.Events == nil || len(a.RescheduleTracker.Events) == 0 {
		return delayDur
	}
	events := a.RescheduleTracker.Events
	switch policy.DelayFunction {
	case "exponential":
		// Double the delay used by the most recent attempt.
		delayDur = a.RescheduleTracker.Events[len(a.RescheduleTracker.Events)-1].Delay * 2
	case "fibonacci":
		if len(events) >= 2 {
			fibN1Delay := events[len(events)-1].Delay
			fibN2Delay := events[len(events)-2].Delay
			// Handle reset of delay ceiling which should cause
			// a new series to start
			if fibN2Delay == policy.MaxDelay && fibN1Delay == policy.Delay {
				delayDur = fibN1Delay
			} else {
				delayDur = fibN1Delay + fibN2Delay
			}
		}
	default:
		// Constant (or unrecognized) delay functions reuse the base delay.
		return delayDur
	}
	// Clamp a growing delay to the configured ceiling.
	if policy.MaxDelay > 0 && delayDur > policy.MaxDelay {
		delayDur = policy.MaxDelay
		// check if delay needs to be reset

		lastRescheduleEvent := a.RescheduleTracker.Events[len(a.RescheduleTracker.Events)-1]
		timeDiff := a.LastEventTime().UTC().UnixNano() - lastRescheduleEvent.RescheduleTime
		// If the alloc ran longer than the ceiling since the last attempt,
		// restart the delay series from the base delay.
		if timeDiff > delayDur.Nanoseconds() {
			delayDur = policy.Delay
		}

	}

	return delayDur
}
9546
9547// Terminated returns if the allocation is in a terminal state on a client.
9548func (a *Allocation) Terminated() bool {
9549	if a.ClientStatus == AllocClientStatusFailed ||
9550		a.ClientStatus == AllocClientStatusComplete ||
9551		a.ClientStatus == AllocClientStatusLost {
9552		return true
9553	}
9554	return false
9555}
9556
// SetStop updates the allocation in place to a DesiredStatus stop, with the
// given ClientStatus and description, and records the client status
// transition in AllocStates.
func (a *Allocation) SetStop(clientStatus, clientDesc string) {
	a.DesiredStatus = AllocDesiredStatusStop
	a.ClientStatus = clientStatus
	a.ClientDescription = clientDesc
	a.AppendState(AllocStateFieldClientStatus, clientStatus)
}
9564
// AppendState creates and appends an AllocState entry recording the time of the state
// transition. Used to mark the transition to lost
func (a *Allocation) AppendState(field AllocStateField, value string) {
	a.AllocStates = append(a.AllocStates, &AllocState{
		Field: field,
		Value: value,
		Time:  time.Now().UTC(),
	})
}
9574
9575// RanSuccessfully returns whether the client has ran the allocation and all
9576// tasks finished successfully. Critically this function returns whether the
9577// allocation has ran to completion and not just that the alloc has converged to
9578// its desired state. That is to say that a batch allocation must have finished
9579// with exit code 0 on all task groups. This doesn't really have meaning on a
9580// non-batch allocation because a service and system allocation should not
9581// finish.
9582func (a *Allocation) RanSuccessfully() bool {
9583	// Handle the case the client hasn't started the allocation.
9584	if len(a.TaskStates) == 0 {
9585		return false
9586	}
9587
9588	// Check to see if all the tasks finished successfully in the allocation
9589	allSuccess := true
9590	for _, state := range a.TaskStates {
9591		allSuccess = allSuccess && state.Successful()
9592	}
9593
9594	return allSuccess
9595}
9596
9597// ShouldMigrate returns if the allocation needs data migration
9598func (a *Allocation) ShouldMigrate() bool {
9599	if a.PreviousAllocation == "" {
9600		return false
9601	}
9602
9603	if a.DesiredStatus == AllocDesiredStatusStop || a.DesiredStatus == AllocDesiredStatusEvict {
9604		return false
9605	}
9606
9607	tg := a.Job.LookupTaskGroup(a.TaskGroup)
9608
9609	// if the task group is nil or the ephemeral disk block isn't present then
9610	// we won't migrate
9611	if tg == nil || tg.EphemeralDisk == nil {
9612		return false
9613	}
9614
9615	// We won't migrate any data is the user hasn't enabled migration or the
9616	// disk is not marked as sticky
9617	if !tg.EphemeralDisk.Migrate || !tg.EphemeralDisk.Sticky {
9618		return false
9619	}
9620
9621	return true
9622}
9623
// SetEventDisplayMessages populates the display message if its not already set,
// a temporary fix to handle old allocations that don't have it.
// This method will be removed in a future release.
func (a *Allocation) SetEventDisplayMessages() {
	setDisplayMsg(a.TaskStates)
}
9630
// COMPAT(0.11): Remove in 0.11
// ComparableResources returns the resources on the allocation
// handling upgrade paths. After 0.11 calls to this should be replaced with:
// alloc.AllocatedResources.Comparable()
func (a *Allocation) ComparableResources() *ComparableResources {
	// Alloc already has 0.9+ behavior
	if a.AllocatedResources != nil {
		return a.AllocatedResources.Comparable()
	}

	// Upgrade path: reconstruct from the deprecated resource fields.
	var resources *Resources
	if a.Resources != nil {
		resources = a.Resources
	} else if a.TaskResources != nil {
		resources = new(Resources)
		resources.Add(a.SharedResources)
		for _, taskResource := range a.TaskResources {
			resources.Add(taskResource)
		}
	}

	// NOTE(review): if both Resources and TaskResources are nil, resources
	// stays nil and the dereferences below panic — presumably callers
	// guarantee at least one is set; confirm.
	return &ComparableResources{
		Flattened: AllocatedTaskResources{
			Cpu: AllocatedCpuResources{
				CpuShares: int64(resources.CPU),
			},
			Memory: AllocatedMemoryResources{
				MemoryMB: int64(resources.MemoryMB),
			},
			Networks: resources.Networks,
		},
		Shared: AllocatedSharedResources{
			DiskMB: int64(resources.DiskMB),
		},
	}
}
9668
9669// LookupTask by name from the Allocation. Returns nil if the Job is not set, the
9670// TaskGroup does not exist, or the task name cannot be found.
9671func (a *Allocation) LookupTask(name string) *Task {
9672	if a.Job == nil {
9673		return nil
9674	}
9675
9676	tg := a.Job.LookupTaskGroup(a.TaskGroup)
9677	if tg == nil {
9678		return nil
9679	}
9680
9681	return tg.LookupTask(name)
9682}
9683
// Stub returns a list stub for the allocation. The fields argument controls
// optional fields; a nil fields value yields the default stub contents
// (task states included, allocated resources omitted).
func (a *Allocation) Stub(fields *AllocStubFields) *AllocListStub {
	s := &AllocListStub{
		ID:                    a.ID,
		EvalID:                a.EvalID,
		Name:                  a.Name,
		Namespace:             a.Namespace,
		NodeID:                a.NodeID,
		NodeName:              a.NodeName,
		JobID:                 a.JobID,
		JobType:               a.Job.Type,
		JobVersion:            a.Job.Version,
		TaskGroup:             a.TaskGroup,
		DesiredStatus:         a.DesiredStatus,
		DesiredDescription:    a.DesiredDescription,
		ClientStatus:          a.ClientStatus,
		ClientDescription:     a.ClientDescription,
		DesiredTransition:     a.DesiredTransition,
		TaskStates:            a.TaskStates,
		DeploymentStatus:      a.DeploymentStatus,
		FollowupEvalID:        a.FollowupEvalID,
		RescheduleTracker:     a.RescheduleTracker,
		PreemptedAllocations:  a.PreemptedAllocations,
		PreemptedByAllocation: a.PreemptedByAllocation,
		CreateIndex:           a.CreateIndex,
		ModifyIndex:           a.ModifyIndex,
		CreateTime:            a.CreateTime,
		ModifyTime:            a.ModifyTime,
	}

	// Apply the optional field selection.
	if fields != nil {
		if fields.Resources {
			s.AllocatedResources = a.AllocatedResources
		}
		if !fields.TaskStates {
			s.TaskStates = nil
		}
	}

	return s
}
9725
// AllocationDiff converts an Allocation type to an AllocationDiff type
// If at any time, modification are made to AllocationDiff so that an
// Allocation can no longer be safely converted to AllocationDiff,
// this method should be changed accordingly.
func (a *Allocation) AllocationDiff() *AllocationDiff {
	// Safe because AllocationDiff is a named type with identical fields.
	return (*AllocationDiff)(a)
}
9733
// AllocationDiff is another named type for Allocation (to use the same fields),
// which is used to represent the delta for an Allocation. If you need a method
// defined on the allocation, consider defining it on AllocationDiff as well
// so the two types stay interchangeable.
type AllocationDiff Allocation
9738
// AllocListStub is used to return a subset of alloc information
type AllocListStub struct {
	ID                    string
	EvalID                string
	Name                  string
	Namespace             string
	NodeID                string
	NodeName              string
	JobID                 string
	JobType               string
	JobVersion            uint64
	TaskGroup             string
	// AllocatedResources is only populated when the stub is built with
	// AllocStubFields.Resources set (see Allocation.Stub).
	AllocatedResources *AllocatedResources `json:",omitempty"`
	DesiredStatus      string
	DesiredDescription string
	ClientStatus       string
	ClientDescription  string
	DesiredTransition  DesiredTransition
	// TaskStates is cleared when the stub is built with
	// AllocStubFields.TaskStates unset.
	TaskStates            map[string]*TaskState
	DeploymentStatus      *AllocDeploymentStatus
	FollowupEvalID        string
	RescheduleTracker     *RescheduleTracker
	PreemptedAllocations  []string
	PreemptedByAllocation string
	// Raft indexes and wall-clock timestamps copied from the allocation.
	CreateIndex uint64
	ModifyIndex uint64
	CreateTime  int64
	ModifyTime  int64
}
9768
// SetEventDisplayMessages populates the display message if its not already set,
// a temporary fix to handle old allocations that don't have it.
// This method will be removed in a future release.
func (a *AllocListStub) SetEventDisplayMessages() {
	setDisplayMsg(a.TaskStates)
}
9775
9776func setDisplayMsg(taskStates map[string]*TaskState) {
9777	for _, taskState := range taskStates {
9778		for _, event := range taskState.Events {
9779			event.PopulateEventDisplayMessage()
9780		}
9781	}
9782}
9783
// AllocStubFields defines which fields are included in the AllocListStub.
type AllocStubFields struct {
	// Resources includes resource-related fields if true.
	Resources bool

	// TaskStates removes the TaskStates field if false (default is to
	// include TaskStates).
	TaskStates bool
}
9793
9794func NewAllocStubFields() *AllocStubFields {
9795	return &AllocStubFields{
9796		// Maintain backward compatibility by retaining task states by
9797		// default.
9798		TaskStates: true,
9799	}
9800}
9801
// AllocMetric is used to track various metrics while attempting
// to make an allocation. These are used to debug a job, or to better
// understand the pressure within the system.
type AllocMetric struct {
	// NodesEvaluated is the number of nodes that were evaluated
	NodesEvaluated int

	// NodesFiltered is the number of nodes filtered due to a constraint
	NodesFiltered int

	// NodesAvailable is the number of nodes available for evaluation per DC.
	NodesAvailable map[string]int

	// ClassFiltered is the number of nodes filtered by class
	ClassFiltered map[string]int

	// ConstraintFiltered is the number of failures caused by constraint,
	// keyed by the constraint's string form.
	ConstraintFiltered map[string]int

	// NodesExhausted is the number of nodes skipped due to being
	// exhausted of at least one resource
	NodesExhausted int

	// ClassExhausted is the number of nodes exhausted by class
	ClassExhausted map[string]int

	// DimensionExhausted provides the count by dimension or reason
	// (e.g. "cpu", "memory").
	DimensionExhausted map[string]int

	// QuotaExhausted provides the exhausted dimensions
	QuotaExhausted []string

	// ResourcesExhausted provides the amount of resources exhausted by task
	// during the allocation placement
	ResourcesExhausted map[string]*Resources

	// Scores is the scores of the final few nodes remaining
	// for placement. The top score is typically selected.
	// Deprecated: Replaced by ScoreMetaData in Nomad 0.9
	Scores map[string]float64

	// ScoreMetaData is a slice of top scoring nodes displayed in the CLI
	ScoreMetaData []*NodeScoreMeta

	// nodeScoreMeta is used to keep scores for a single node id. It is cleared out after
	// we receive normalized score during the last step of the scoring stack.
	nodeScoreMeta *NodeScoreMeta

	// topScores is used to maintain a heap of the top K nodes with
	// the highest normalized score
	topScores *kheap.ScoreHeap

	// AllocationTime is a measure of how long the allocation
	// attempt took. This can affect performance and SLAs.
	AllocationTime time.Duration

	// CoalescedFailures indicates the number of other
	// allocations that were coalesced into this failed allocation.
	// This is to prevent creating many failed allocations for a
	// single task group.
	CoalescedFailures int
}
9864
9865func (a *AllocMetric) Copy() *AllocMetric {
9866	if a == nil {
9867		return nil
9868	}
9869	na := new(AllocMetric)
9870	*na = *a
9871	na.NodesAvailable = helper.CopyMapStringInt(na.NodesAvailable)
9872	na.ClassFiltered = helper.CopyMapStringInt(na.ClassFiltered)
9873	na.ConstraintFiltered = helper.CopyMapStringInt(na.ConstraintFiltered)
9874	na.ClassExhausted = helper.CopyMapStringInt(na.ClassExhausted)
9875	na.DimensionExhausted = helper.CopyMapStringInt(na.DimensionExhausted)
9876	na.QuotaExhausted = helper.CopySliceString(na.QuotaExhausted)
9877	na.Scores = helper.CopyMapStringFloat64(na.Scores)
9878	na.ScoreMetaData = CopySliceNodeScoreMeta(na.ScoreMetaData)
9879	return na
9880}
9881
9882func (a *AllocMetric) EvaluateNode() {
9883	a.NodesEvaluated += 1
9884}
9885
9886func (a *AllocMetric) FilterNode(node *Node, constraint string) {
9887	a.NodesFiltered += 1
9888	if node != nil && node.NodeClass != "" {
9889		if a.ClassFiltered == nil {
9890			a.ClassFiltered = make(map[string]int)
9891		}
9892		a.ClassFiltered[node.NodeClass] += 1
9893	}
9894	if constraint != "" {
9895		if a.ConstraintFiltered == nil {
9896			a.ConstraintFiltered = make(map[string]int)
9897		}
9898		a.ConstraintFiltered[constraint] += 1
9899	}
9900}
9901
9902func (a *AllocMetric) ExhaustedNode(node *Node, dimension string) {
9903	a.NodesExhausted += 1
9904	if node != nil && node.NodeClass != "" {
9905		if a.ClassExhausted == nil {
9906			a.ClassExhausted = make(map[string]int)
9907		}
9908		a.ClassExhausted[node.NodeClass] += 1
9909	}
9910	if dimension != "" {
9911		if a.DimensionExhausted == nil {
9912			a.DimensionExhausted = make(map[string]int)
9913		}
9914		a.DimensionExhausted[dimension] += 1
9915	}
9916}
9917
9918func (a *AllocMetric) ExhaustQuota(dimensions []string) {
9919	if a.QuotaExhausted == nil {
9920		a.QuotaExhausted = make([]string, 0, len(dimensions))
9921	}
9922
9923	a.QuotaExhausted = append(a.QuotaExhausted, dimensions...)
9924}
9925
9926// ExhaustResources updates the amount of resources exhausted for the
9927// allocation because of the given task group.
9928func (a *AllocMetric) ExhaustResources(tg *TaskGroup) {
9929	if a.DimensionExhausted == nil {
9930		return
9931	}
9932
9933	if a.ResourcesExhausted == nil {
9934		a.ResourcesExhausted = make(map[string]*Resources)
9935	}
9936
9937	for _, t := range tg.Tasks {
9938		exhaustedResources := a.ResourcesExhausted[t.Name]
9939		if exhaustedResources == nil {
9940			exhaustedResources = &Resources{}
9941		}
9942
9943		if a.DimensionExhausted["memory"] > 0 {
9944			exhaustedResources.MemoryMB += t.Resources.MemoryMB
9945		}
9946
9947		if a.DimensionExhausted["cpu"] > 0 {
9948			exhaustedResources.CPU += t.Resources.CPU
9949		}
9950
9951		a.ResourcesExhausted[t.Name] = exhaustedResources
9952	}
9953}
9954
// ScoreNode is used to gather top K scoring nodes in a heap
func (a *AllocMetric) ScoreNode(node *Node, name string, score float64) {
	// Create nodeScoreMeta lazily if its the first time or if its a new node
	if a.nodeScoreMeta == nil || a.nodeScoreMeta.NodeID != node.ID {
		a.nodeScoreMeta = &NodeScoreMeta{
			NodeID: node.ID,
			Scores: make(map[string]float64),
		}
	}
	if name == NormScorerName {
		a.nodeScoreMeta.NormScore = score
		// Once we have the normalized score we can push to the heap
		// that tracks top K by normalized score

		// Create the heap if its not there already
		if a.topScores == nil {
			a.topScores = kheap.NewScoreHeap(MaxRetainedNodeScores)
		}
		heap.Push(a.topScores, a.nodeScoreMeta)

		// Clear out this entry because its now in the heap
		a.nodeScoreMeta = nil
	} else {
		// A non-normalized scorer: accumulate its score under the scorer name
		// until the normalized score arrives.
		a.nodeScoreMeta.Scores[name] = score
	}
}
9981
// PopulateScoreMetaData populates a map of scorer to scoring metadata
// The map is populated by popping elements from a heap of top K scores
// maintained per scorer
func (a *AllocMetric) PopulateScoreMetaData() {
	// Nothing scored yet.
	if a.topScores == nil {
		return
	}

	if a.ScoreMetaData == nil {
		a.ScoreMetaData = make([]*NodeScoreMeta, a.topScores.Len())
	}
	// NOTE(review): if ScoreMetaData is already non-nil but shorter than the
	// heap, the indexed write below would panic — presumably this is only
	// called once per metric; confirm.
	heapItems := a.topScores.GetItemsReverse()
	for i, item := range heapItems {
		a.ScoreMetaData[i] = item.(*NodeScoreMeta)
	}
}
9998
// NodeScoreMeta captures scoring meta data derived from
// different scoring factors.
type NodeScoreMeta struct {
	// NodeID is the node being scored.
	NodeID string
	// Scores maps scorer name to that scorer's raw score.
	Scores map[string]float64
	// NormScore is the final normalized score used for ranking.
	NormScore float64
}
10006
10007func (s *NodeScoreMeta) Copy() *NodeScoreMeta {
10008	if s == nil {
10009		return nil
10010	}
10011	ns := new(NodeScoreMeta)
10012	*ns = *s
10013	return ns
10014}
10015
// String returns a human-readable "<node> <normScore> <scores>" rendering.
func (s *NodeScoreMeta) String() string {
	return fmt.Sprintf("%s %f %v", s.NodeID, s.NormScore, s.Scores)
}
10019
// Score returns the normalized score, satisfying the heap item interface
// used by kheap.ScoreHeap.
func (s *NodeScoreMeta) Score() float64 {
	return s.NormScore
}
10023
// Data returns the metadata itself, satisfying the heap item interface
// used by kheap.ScoreHeap.
func (s *NodeScoreMeta) Data() interface{} {
	return s
}
10027
// AllocNetworkStatus captures the status of an allocation's network during runtime.
// Depending on the network mode, an allocation's address may need to be known to other
// systems in Nomad such as service registration.
type AllocNetworkStatus struct {
	// InterfaceName is the network interface serving the allocation.
	InterfaceName string
	// Address is the allocation's runtime network address.
	Address string
	// DNS holds the DNS configuration applied to the allocation, if any.
	DNS *DNSConfig
}
10036
10037func (a *AllocNetworkStatus) Copy() *AllocNetworkStatus {
10038	if a == nil {
10039		return nil
10040	}
10041	return &AllocNetworkStatus{
10042		InterfaceName: a.InterfaceName,
10043		Address:       a.Address,
10044		DNS:           a.DNS.Copy(),
10045	}
10046}
10047
// AllocDeploymentStatus captures the status of the allocation as part of the
// deployment. This can include things like if the allocation has been marked as
// healthy.
type AllocDeploymentStatus struct {
	// Healthy marks whether the allocation has been marked healthy or unhealthy
	// as part of a deployment. It can be unset if it has neither been marked
	// healthy or unhealthy.
	Healthy *bool

	// Timestamp is the time at which the health status was set.
	Timestamp time.Time

	// Canary marks whether the allocation is a canary or not. A canary that has
	// been promoted will have this field set to false.
	Canary bool

	// ModifyIndex is the raft index in which the deployment status was last
	// changed.
	ModifyIndex uint64
}

// HasHealth returns true if the allocation has its health set.
func (a *AllocDeploymentStatus) HasHealth() bool {
	if a == nil {
		return false
	}
	return a.Healthy != nil
}

// IsHealthy returns if the allocation is marked as healthy as part of a
// deployment
func (a *AllocDeploymentStatus) IsHealthy() bool {
	if a == nil || a.Healthy == nil {
		return false
	}
	return *a.Healthy
}

// IsUnhealthy returns if the allocation is marked as unhealthy as part of a
// deployment
func (a *AllocDeploymentStatus) IsUnhealthy() bool {
	if a == nil || a.Healthy == nil {
		return false
	}
	return !*a.Healthy
}

// IsCanary returns if the allocation is marked as a canary
func (a *AllocDeploymentStatus) IsCanary() bool {
	return a != nil && a.Canary
}

// Copy returns a deep copy of the deployment status, or nil for a nil
// receiver. The Healthy pointer is duplicated so mutating the copy does not
// affect the original.
func (a *AllocDeploymentStatus) Copy() *AllocDeploymentStatus {
	if a == nil {
		return nil
	}

	dup := *a
	if a.Healthy != nil {
		healthy := *a.Healthy
		dup.Healthy = &healthy
	}
	return &dup
}
10117
// Evaluation status values. Pending evals are waiting to be processed and
// blocked evals are waiting for capacity; complete, failed, and cancelled
// are terminal (see Evaluation.TerminalStatus).
const (
	EvalStatusBlocked   = "blocked"
	EvalStatusPending   = "pending"
	EvalStatusComplete  = "complete"
	EvalStatusFailed    = "failed"
	EvalStatusCancelled = "canceled"
)
10125
// EvalTrigger* values record what caused an evaluation to be created and are
// stored in Evaluation.TriggeredBy to give operators insight into why an eval
// exists.
const (
	EvalTriggerJobRegister       = "job-register"
	EvalTriggerJobDeregister     = "job-deregister"
	EvalTriggerPeriodicJob       = "periodic-job"
	EvalTriggerNodeDrain         = "node-drain"
	EvalTriggerNodeUpdate        = "node-update"
	EvalTriggerAllocStop         = "alloc-stop"
	EvalTriggerScheduled         = "scheduled"
	EvalTriggerRollingUpdate     = "rolling-update"
	EvalTriggerDeploymentWatcher = "deployment-watcher"
	EvalTriggerFailedFollowUp    = "failed-follow-up"
	EvalTriggerMaxPlans          = "max-plan-attempts"
	EvalTriggerRetryFailedAlloc  = "alloc-failure"
	EvalTriggerQueuedAllocs      = "queued-allocs"
	EvalTriggerPreemption        = "preemption"
	EvalTriggerScaling           = "job-scaling"
)
10143
// CoreJob* values identify the internal garbage-collection jobs run by the
// leader's core scheduler.
const (
	// CoreJobEvalGC is used for the garbage collection of evaluations
	// and allocations. We periodically scan evaluations in a terminal state,
	// in which all the corresponding allocations are also terminal. We
	// delete these out of the system to bound the state.
	CoreJobEvalGC = "eval-gc"

	// CoreJobNodeGC is used for the garbage collection of failed nodes.
	// We periodically scan nodes in a terminal state, and if they have no
	// corresponding allocations we delete these out of the system.
	CoreJobNodeGC = "node-gc"

	// CoreJobJobGC is used for the garbage collection of eligible jobs. We
	// periodically scan garbage collectible jobs and check if both their
	// evaluations and allocations are terminal. If so, we delete these out of
	// the system.
	CoreJobJobGC = "job-gc"

	// CoreJobDeploymentGC is used for the garbage collection of eligible
	// deployments. We periodically scan garbage collectible deployments and
	// check if they are terminal. If so, we delete these out of the system.
	CoreJobDeploymentGC = "deployment-gc"

	// CoreJobCSIVolumeClaimGC is used for the garbage collection of CSI
	// volume claims. We periodically scan volumes to see if no allocs are
	// claiming them. If so, we unclaim the volume.
	CoreJobCSIVolumeClaimGC = "csi-volume-claim-gc"

	// CoreJobCSIPluginGC is used for the garbage collection of CSI plugins.
	// We periodically scan plugins to see if they have no associated volumes
	// or allocs running them. If so, we delete the plugin.
	CoreJobCSIPluginGC = "csi-plugin-gc"

	// CoreJobOneTimeTokenGC is used for the garbage collection of one-time
	// tokens. We periodically scan for expired tokens and delete them.
	CoreJobOneTimeTokenGC = "one-time-token-gc"

	// CoreJobForceGC is used to force garbage collection of all GCable objects.
	CoreJobForceGC = "force-gc"
)
10184
// Evaluation is used anytime we need to apply business logic as a result
// of a change to our desired state (job specification) or the emergent state
// (registered nodes). When the inputs change, we need to "evaluate" them,
// potentially taking action (allocation of work) or doing nothing if the state
// of the world does not require it.
type Evaluation struct {
	// msgpack omit empty fields during serialization
	_struct bool `codec:",omitempty"` // nolint: structcheck

	// ID is a randomly generated UUID used for this evaluation. This
	// is assigned upon the creation of the evaluation.
	ID string

	// Namespace is the namespace the evaluation is created in
	Namespace string

	// Priority is used to control scheduling importance and if this job
	// can preempt other jobs.
	Priority int

	// Type is used to control which schedulers are available to handle
	// this evaluation.
	Type string

	// TriggeredBy is used to give some insight into why this Eval
	// was created. (Job change, node failure, alloc failure, etc).
	// See the EvalTrigger* constants.
	TriggeredBy string

	// JobID is the job this evaluation is scoped to. Evaluations cannot
	// be run in parallel for a given JobID, so we serialize on this.
	JobID string

	// JobModifyIndex is the modify index of the job at the time
	// the evaluation was created
	JobModifyIndex uint64

	// NodeID is the node that was affected triggering the evaluation.
	NodeID string

	// NodeModifyIndex is the modify index of the node at the time
	// the evaluation was created
	NodeModifyIndex uint64

	// DeploymentID is the ID of the deployment that triggered the evaluation.
	DeploymentID string

	// Status of the evaluation. See the EvalStatus* constants.
	Status string

	// StatusDescription is meant to provide more human useful information
	StatusDescription string

	// Wait is a minimum wait time for running the eval. This is used to
	// support a rolling upgrade in versions prior to 0.7.0
	// Deprecated: WaitUntil supersedes this for delayed evals.
	Wait time.Duration

	// WaitUntil is the time when this eval should be run. This is used to
	// support delayed rescheduling of failed allocations
	WaitUntil time.Time

	// NextEval is the evaluation ID for the eval created to do a followup.
	// This is used to support rolling upgrades and failed-follow-up evals, where
	// we need a chain of evaluations.
	NextEval string

	// PreviousEval is the evaluation ID for the eval creating this one to do a followup.
	// This is used to support rolling upgrades and failed-follow-up evals, where
	// we need a chain of evaluations.
	PreviousEval string

	// BlockedEval is the evaluation ID for a created blocked eval. A
	// blocked eval will be created if all allocations could not be placed due
	// to constraints or lacking resources.
	BlockedEval string

	// FailedTGAllocs are task groups which have allocations that could not be
	// made, but the metrics are persisted so that the user can use the feedback
	// to determine the cause.
	FailedTGAllocs map[string]*AllocMetric

	// ClassEligibility tracks computed node classes that have been explicitly
	// marked as eligible or ineligible.
	ClassEligibility map[string]bool

	// QuotaLimitReached records the quota limit that was reached for the
	// evaluation, if any; empty when no quota limit was hit.
	QuotaLimitReached string

	// EscapedComputedClass marks whether the job has constraints that are not
	// captured by computed node classes.
	EscapedComputedClass bool

	// AnnotatePlan triggers the scheduler to provide additional annotations
	// during the evaluation. This should not be set during normal operations.
	AnnotatePlan bool

	// QueuedAllocations is the number of unplaced allocations at the time the
	// evaluation was processed. The map is keyed by Task Group names.
	QueuedAllocations map[string]int

	// LeaderACL provides the ACL token to use when issuing RPCs back to the
	// leader. This will be a valid management token as long as the leader is
	// active. This should not ever be exposed via the API.
	LeaderACL string

	// SnapshotIndex is the Raft index of the snapshot used to process the
	// evaluation. The index will either be set when it has gone through the
	// scheduler or if a blocked evaluation is being created. The index is set
	// in this case so we can determine if an early unblocking is required since
	// capacity has changed since the evaluation was created. This can result in
	// the SnapshotIndex being less than the CreateIndex.
	SnapshotIndex uint64

	// Raft Indexes
	CreateIndex uint64
	ModifyIndex uint64

	// CreateTime and ModifyTime are Unix nanosecond timestamps
	// (see UpdateModifyTime).
	CreateTime int64
	ModifyTime int64
}
10306
10307// TerminalStatus returns if the current status is terminal and
10308// will no longer transition.
10309func (e *Evaluation) TerminalStatus() bool {
10310	switch e.Status {
10311	case EvalStatusComplete, EvalStatusFailed, EvalStatusCancelled:
10312		return true
10313	default:
10314		return false
10315	}
10316}
10317
10318func (e *Evaluation) GoString() string {
10319	return fmt.Sprintf("<Eval %q JobID: %q Namespace: %q>", e.ID, e.JobID, e.Namespace)
10320}
10321
10322func (e *Evaluation) Copy() *Evaluation {
10323	if e == nil {
10324		return nil
10325	}
10326	ne := new(Evaluation)
10327	*ne = *e
10328
10329	// Copy ClassEligibility
10330	if e.ClassEligibility != nil {
10331		classes := make(map[string]bool, len(e.ClassEligibility))
10332		for class, elig := range e.ClassEligibility {
10333			classes[class] = elig
10334		}
10335		ne.ClassEligibility = classes
10336	}
10337
10338	// Copy FailedTGAllocs
10339	if e.FailedTGAllocs != nil {
10340		failedTGs := make(map[string]*AllocMetric, len(e.FailedTGAllocs))
10341		for tg, metric := range e.FailedTGAllocs {
10342			failedTGs[tg] = metric.Copy()
10343		}
10344		ne.FailedTGAllocs = failedTGs
10345	}
10346
10347	// Copy queued allocations
10348	if e.QueuedAllocations != nil {
10349		queuedAllocations := make(map[string]int, len(e.QueuedAllocations))
10350		for tg, num := range e.QueuedAllocations {
10351			queuedAllocations[tg] = num
10352		}
10353		ne.QueuedAllocations = queuedAllocations
10354	}
10355
10356	return ne
10357}
10358
10359// ShouldEnqueue checks if a given evaluation should be enqueued into the
10360// eval_broker
10361func (e *Evaluation) ShouldEnqueue() bool {
10362	switch e.Status {
10363	case EvalStatusPending:
10364		return true
10365	case EvalStatusComplete, EvalStatusFailed, EvalStatusBlocked, EvalStatusCancelled:
10366		return false
10367	default:
10368		panic(fmt.Sprintf("unhandled evaluation (%s) status %s", e.ID, e.Status))
10369	}
10370}
10371
10372// ShouldBlock checks if a given evaluation should be entered into the blocked
10373// eval tracker.
10374func (e *Evaluation) ShouldBlock() bool {
10375	switch e.Status {
10376	case EvalStatusBlocked:
10377		return true
10378	case EvalStatusComplete, EvalStatusFailed, EvalStatusPending, EvalStatusCancelled:
10379		return false
10380	default:
10381		panic(fmt.Sprintf("unhandled evaluation (%s) status %s", e.ID, e.Status))
10382	}
10383}
10384
10385// MakePlan is used to make a plan from the given evaluation
10386// for a given Job
10387func (e *Evaluation) MakePlan(j *Job) *Plan {
10388	p := &Plan{
10389		EvalID:          e.ID,
10390		Priority:        e.Priority,
10391		Job:             j,
10392		NodeUpdate:      make(map[string][]*Allocation),
10393		NodeAllocation:  make(map[string][]*Allocation),
10394		NodePreemptions: make(map[string][]*Allocation),
10395	}
10396	if j != nil {
10397		p.AllAtOnce = j.AllAtOnce
10398	}
10399	return p
10400}
10401
// NextRollingEval creates an evaluation to followup this eval for rolling updates.
// The new eval is pending, chained to this one via PreviousEval, and carries
// the given minimum wait time. Note: this sets the deprecated Wait field.
func (e *Evaluation) NextRollingEval(wait time.Duration) *Evaluation {
	now := time.Now().UTC().UnixNano()
	return &Evaluation{
		ID:             uuid.Generate(),
		Namespace:      e.Namespace,
		Priority:       e.Priority,
		Type:           e.Type,
		TriggeredBy:    EvalTriggerRollingUpdate,
		JobID:          e.JobID,
		JobModifyIndex: e.JobModifyIndex,
		Status:         EvalStatusPending,
		Wait:           wait,
		PreviousEval:   e.ID,
		CreateTime:     now,
		ModifyTime:     now,
	}
}
10420
// CreateBlockedEval creates a blocked evaluation to followup this eval to place any
// failed allocations. It takes the classes marked explicitly eligible or
// ineligible, whether the job has escaped computed node classes and whether the
// quota limit was reached. The new eval is created in the blocked state with
// a queued-allocs trigger and is chained to this one via PreviousEval.
func (e *Evaluation) CreateBlockedEval(classEligibility map[string]bool,
	escaped bool, quotaReached string, failedTGAllocs map[string]*AllocMetric) *Evaluation {
	now := time.Now().UTC().UnixNano()
	return &Evaluation{
		ID:                   uuid.Generate(),
		Namespace:            e.Namespace,
		Priority:             e.Priority,
		Type:                 e.Type,
		TriggeredBy:          EvalTriggerQueuedAllocs,
		JobID:                e.JobID,
		JobModifyIndex:       e.JobModifyIndex,
		Status:               EvalStatusBlocked,
		PreviousEval:         e.ID,
		FailedTGAllocs:       failedTGAllocs,
		ClassEligibility:     classEligibility,
		EscapedComputedClass: escaped,
		QuotaLimitReached:    quotaReached,
		CreateTime:           now,
		ModifyTime:           now,
	}
}
10446
// CreateFailedFollowUpEval creates a follow up evaluation when the current one
// has been marked as failed because it has hit the delivery limit and will not
// be retried by the eval_broker. Callers should copy the created eval's ID
// into the old eval's NextEval field. Note: this sets the deprecated Wait
// field to delay the follow-up.
func (e *Evaluation) CreateFailedFollowUpEval(wait time.Duration) *Evaluation {
	now := time.Now().UTC().UnixNano()
	return &Evaluation{
		ID:             uuid.Generate(),
		Namespace:      e.Namespace,
		Priority:       e.Priority,
		Type:           e.Type,
		TriggeredBy:    EvalTriggerFailedFollowUp,
		JobID:          e.JobID,
		JobModifyIndex: e.JobModifyIndex,
		Status:         EvalStatusPending,
		Wait:           wait,
		PreviousEval:   e.ID,
		CreateTime:     now,
		ModifyTime:     now,
	}
}
10468
10469// UpdateModifyTime takes into account that clocks on different servers may be
10470// slightly out of sync. Even in case of a leader change, this method will
10471// guarantee that ModifyTime will always be after CreateTime.
10472func (e *Evaluation) UpdateModifyTime() {
10473	now := time.Now().UTC().UnixNano()
10474	if now <= e.CreateTime {
10475		e.ModifyTime = e.CreateTime + 1
10476	} else {
10477		e.ModifyTime = now
10478	}
10479}
10480
// Plan is used to submit a commit plan for task allocations. These
// are submitted to the leader which verifies that resources have
// not been overcommitted before admitting the plan.
type Plan struct {
	// msgpack omit empty fields during serialization
	_struct bool `codec:",omitempty"` // nolint: structcheck

	// EvalID is the evaluation ID this plan is associated with
	EvalID string

	// EvalToken is used to prevent a split-brain processing of
	// an evaluation. There should only be a single scheduler running
	// an Eval at a time, but this could be violated after a leadership
	// transition. This unique token is used to reject plans that are
	// being submitted from a different leader.
	EvalToken string

	// Priority is the priority of the upstream job
	Priority int

	// AllAtOnce is used to control if incremental scheduling of task groups
	// is allowed or if we must do a gang scheduling of the entire job.
	// If this is false, a plan may be partially applied. Otherwise, the
	// entire plan must be able to make progress.
	AllAtOnce bool

	// Job is the parent job of all the allocations in the Plan.
	// Since a Plan only involves a single Job, we can reduce the size
	// of the plan by only including it once.
	Job *Job

	// NodeUpdate contains all the allocations for each node. For each node,
	// this is a list of the allocations to update to either stop or evict.
	NodeUpdate map[string][]*Allocation

	// NodeAllocation contains all the allocations for each node.
	// The evicts must be considered prior to the allocations.
	NodeAllocation map[string][]*Allocation

	// Annotations contains annotations by the scheduler to be used by operators
	// to understand the decisions made by the scheduler. See also
	// Evaluation.AnnotatePlan, which requests these annotations.
	Annotations *PlanAnnotations

	// Deployment is the deployment created or updated by the scheduler that
	// should be applied by the planner.
	Deployment *Deployment

	// DeploymentUpdates is a set of status updates to apply to the given
	// deployments. This allows the scheduler to cancel any unneeded deployment
	// because the job is stopped or the update block is removed.
	DeploymentUpdates []*DeploymentStatusUpdate

	// NodePreemptions is a map from node id to a set of allocations from other
	// lower priority jobs that are preempted. Preempted allocations are marked
	// as evicted.
	NodePreemptions map[string][]*Allocation

	// SnapshotIndex is the Raft index of the snapshot used to create the
	// Plan. The leader will wait to evaluate the plan until its StateStore
	// has reached at least this index.
	SnapshotIndex uint64
}
10543
// AppendStoppedAlloc marks an allocation to be stopped. The clientStatus of the
// allocation may be optionally set by passing in a non-empty value. A copy of
// the allocation (with Job and Resources stripped to shrink the plan) is
// appended to the plan's NodeUpdate list for the allocation's node; the
// original allocation is not mutated.
func (p *Plan) AppendStoppedAlloc(alloc *Allocation, desiredDesc, clientStatus, followupEvalID string) {
	// Shallow copy so the caller's allocation is left untouched.
	newAlloc := new(Allocation)
	*newAlloc = *alloc

	// If the job is not set in the plan we are deregistering a job so we
	// extract the job from the allocation.
	if p.Job == nil && newAlloc.Job != nil {
		p.Job = newAlloc.Job
	}

	// Normalize the job
	newAlloc.Job = nil

	// Strip the resources as it can be rebuilt.
	newAlloc.Resources = nil

	newAlloc.DesiredStatus = AllocDesiredStatusStop
	newAlloc.DesiredDescription = desiredDesc

	if clientStatus != "" {
		newAlloc.ClientStatus = clientStatus
	}

	// NOTE(review): the state entry is appended even when clientStatus is
	// empty — confirm this is intentional before changing.
	newAlloc.AppendState(AllocStateFieldClientStatus, clientStatus)

	if followupEvalID != "" {
		newAlloc.FollowupEvalID = followupEvalID
	}

	node := alloc.NodeID
	existing := p.NodeUpdate[node]
	p.NodeUpdate[node] = append(existing, newAlloc)
}
10579
// AppendPreemptedAlloc is used to append an allocation that's being preempted to the plan.
// To minimize the size of the plan, this only sets a minimal set of fields in the allocation:
// identity, the evict desired status, the preempting allocation's ID, and the
// resources needed to re-check placement feasibility.
func (p *Plan) AppendPreemptedAlloc(alloc *Allocation, preemptingAllocID string) {
	newAlloc := &Allocation{}
	newAlloc.ID = alloc.ID
	newAlloc.JobID = alloc.JobID
	newAlloc.Namespace = alloc.Namespace
	newAlloc.DesiredStatus = AllocDesiredStatusEvict
	newAlloc.PreemptedByAllocation = preemptingAllocID

	desiredDesc := fmt.Sprintf("Preempted by alloc ID %v", preemptingAllocID)
	newAlloc.DesiredDescription = desiredDesc

	// TaskResources are needed by the plan applier to check if allocations fit
	// after removing preempted allocations
	if alloc.AllocatedResources != nil {
		newAlloc.AllocatedResources = alloc.AllocatedResources
	} else {
		// COMPAT Remove in version 0.11
		newAlloc.TaskResources = alloc.TaskResources
		newAlloc.SharedResources = alloc.SharedResources
	}

	// Append this alloc to slice for this node
	node := alloc.NodeID
	existing := p.NodePreemptions[node]
	p.NodePreemptions[node] = append(existing, newAlloc)
}
10608
10609func (p *Plan) PopUpdate(alloc *Allocation) {
10610	existing := p.NodeUpdate[alloc.NodeID]
10611	n := len(existing)
10612	if n > 0 && existing[n-1].ID == alloc.ID {
10613		existing = existing[:n-1]
10614		if len(existing) > 0 {
10615			p.NodeUpdate[alloc.NodeID] = existing
10616		} else {
10617			delete(p.NodeUpdate, alloc.NodeID)
10618		}
10619	}
10620}
10621
10622// AppendAlloc appends the alloc to the plan allocations.
10623// Uses the passed job if explicitly passed, otherwise
10624// it is assumed the alloc will use the plan Job version.
10625func (p *Plan) AppendAlloc(alloc *Allocation, job *Job) {
10626	node := alloc.NodeID
10627	existing := p.NodeAllocation[node]
10628
10629	alloc.Job = job
10630
10631	p.NodeAllocation[node] = append(existing, alloc)
10632}
10633
10634// IsNoOp checks if this plan would do nothing
10635func (p *Plan) IsNoOp() bool {
10636	return len(p.NodeUpdate) == 0 &&
10637		len(p.NodeAllocation) == 0 &&
10638		p.Deployment == nil &&
10639		len(p.DeploymentUpdates) == 0
10640}
10641
10642// NormalizeAllocations normalizes allocations to remove fields that can
10643// be fetched from the MemDB instead of sending over the wire
10644func (p *Plan) NormalizeAllocations() {
10645	for _, allocs := range p.NodeUpdate {
10646		for i, alloc := range allocs {
10647			allocs[i] = &Allocation{
10648				ID:                 alloc.ID,
10649				DesiredDescription: alloc.DesiredDescription,
10650				ClientStatus:       alloc.ClientStatus,
10651				FollowupEvalID:     alloc.FollowupEvalID,
10652			}
10653		}
10654	}
10655
10656	for _, allocs := range p.NodePreemptions {
10657		for i, alloc := range allocs {
10658			allocs[i] = &Allocation{
10659				ID:                    alloc.ID,
10660				PreemptedByAllocation: alloc.PreemptedByAllocation,
10661			}
10662		}
10663	}
10664}
10665
// PlanResult is the result of a plan submitted to the leader. It reflects
// only the subset of the plan that was actually committed.
type PlanResult struct {
	// NodeUpdate contains all the updates that were committed.
	NodeUpdate map[string][]*Allocation

	// NodeAllocation contains all the allocations that were committed.
	NodeAllocation map[string][]*Allocation

	// Deployment is the deployment that was committed.
	Deployment *Deployment

	// DeploymentUpdates is the set of deployment updates that were committed.
	DeploymentUpdates []*DeploymentStatusUpdate

	// NodePreemptions is a map from node id to a set of allocations from other
	// lower priority jobs that are preempted. Preempted allocations are marked
	// as stopped.
	NodePreemptions map[string][]*Allocation

	// RefreshIndex is the index the worker should refresh state up to.
	// This allows all evictions and allocations to be materialized.
	// If any allocations were rejected due to stale data (node state,
	// over committed) this can be used to force a worker refresh.
	RefreshIndex uint64

	// AllocIndex is the Raft index in which the evictions and
	// allocations took place. This is used for the write index.
	AllocIndex uint64
}
10695
10696// IsNoOp checks if this plan result would do nothing
10697func (p *PlanResult) IsNoOp() bool {
10698	return len(p.NodeUpdate) == 0 && len(p.NodeAllocation) == 0 &&
10699		len(p.DeploymentUpdates) == 0 && p.Deployment == nil
10700}
10701
10702// FullCommit is used to check if all the allocations in a plan
10703// were committed as part of the result. Returns if there was
10704// a match, and the number of expected and actual allocations.
10705func (p *PlanResult) FullCommit(plan *Plan) (bool, int, int) {
10706	expected := 0
10707	actual := 0
10708	for name, allocList := range plan.NodeAllocation {
10709		didAlloc := p.NodeAllocation[name]
10710		expected += len(allocList)
10711		actual += len(didAlloc)
10712	}
10713	return actual == expected, expected, actual
10714}
10715
// PlanAnnotations holds annotations made by the scheduler to give further debug
// information to operators. See also Evaluation.AnnotatePlan, which requests
// these annotations.
type PlanAnnotations struct {
	// DesiredTGUpdates is the set of desired updates per task group.
	DesiredTGUpdates map[string]*DesiredUpdates

	// PreemptedAllocs is the set of allocations to be preempted to make the placement successful.
	PreemptedAllocs []*AllocListStub
}
10725
// DesiredUpdates is the set of changes the scheduler would like to make given
// sufficient resources and cluster capacity.
type DesiredUpdates struct {
	Ignore            uint64 // allocations left untouched
	Place             uint64 // new placements
	Migrate           uint64 // allocations moved to another node
	Stop              uint64 // allocations stopped
	InPlaceUpdate     uint64 // allocations updated without replacement
	DestructiveUpdate uint64 // allocations replaced
	Canary            uint64 // canary placements
	Preemptions       uint64 // allocations preempted to make room
}

// GoString returns a compact debug representation of the desired updates.
// Fix: the Preemptions count was previously omitted from the output even
// though the struct tracks it.
func (d *DesiredUpdates) GoString() string {
	return fmt.Sprintf("(place %d) (inplace %d) (destructive %d) (stop %d) (migrate %d) (ignore %d) (canary %d) (preemptions %d)",
		d.Place, d.InPlaceUpdate, d.DestructiveUpdate, d.Stop, d.Migrate, d.Ignore, d.Canary, d.Preemptions)
}
10743
// MsgpackHandle is a shared handle for encoding/decoding of structs,
// configured once at package initialization.
var MsgpackHandle = func() *codec.MsgpackHandle {
	h := &codec.MsgpackHandle{}
	// Decode msgpack raw/binary values into Go strings.
	h.RawToString = true

	// maintain binary format from time prior to upgrading latest ugorji
	h.BasicHandle.TimeNotBuiltin = true

	// Sets the default type for decoding a map into a nil interface{}.
	// This is necessary in particular because we store the driver configs as a
	// nil interface{}.
	h.MapType = reflect.TypeOf(map[string]interface{}(nil))

	// only review struct codec tags
	h.TypeInfos = codec.NewTypeInfos([]string{"codec"})

	return h
}()
10762
10763// Decode is used to decode a MsgPack encoded object
10764func Decode(buf []byte, out interface{}) error {
10765	return codec.NewDecoder(bytes.NewReader(buf), MsgpackHandle).Decode(out)
10766}
10767
10768// Encode is used to encode a MsgPack object with type prefix
10769func Encode(t MessageType, msg interface{}) ([]byte, error) {
10770	var buf bytes.Buffer
10771	buf.WriteByte(uint8(t))
10772	err := codec.NewEncoder(&buf, MsgpackHandle).Encode(msg)
10773	return buf.Bytes(), err
10774}
10775
// KeyringResponse is a unified key response and can be used for install,
// remove, use, as well as listing key queries.
type KeyringResponse struct {
	Messages map[string]string // per-node messages keyed by node
	Keys     map[string]int    // key listing with usage counts
	NumNodes int               // number of nodes that responded
}

// KeyringRequest is request objects for serf key operations.
type KeyringRequest struct {
	// Key is the serf key the operation applies to.
	Key string
}
10788
// RecoverableError wraps an error and marks whether it is recoverable and could
// be retried or it is fatal.
type RecoverableError struct {
	Err         string
	Recoverable bool
}

// NewRecoverableError wraps e, marking it recoverable or fatal. A nil error
// yields nil.
func NewRecoverableError(e error, recoverable bool) error {
	if e == nil {
		return nil
	}

	return &RecoverableError{
		Err:         e.Error(),
		Recoverable: recoverable,
	}
}

// WrapRecoverable wraps an existing error in a new RecoverableError with a new
// message. If the error was recoverable before the returned error is as well;
// otherwise it is unrecoverable.
func WrapRecoverable(msg string, err error) error {
	return &RecoverableError{Err: msg, Recoverable: IsRecoverable(err)}
}

// Error returns the wrapped error message.
func (r *RecoverableError) Error() string {
	return r.Err
}

// IsRecoverable reports whether the error may be retried.
func (r *RecoverableError) IsRecoverable() bool {
	return r.Recoverable
}

// IsUnrecoverable reports whether the error is fatal.
func (r *RecoverableError) IsUnrecoverable() bool {
	return !r.Recoverable
}

// Recoverable is an interface for errors to implement to indicate whether or
// not they are fatal or recoverable.
type Recoverable interface {
	error
	IsRecoverable() bool
}

// IsRecoverable returns true if error is a RecoverableError with
// Recoverable=true. Otherwise false is returned.
func IsRecoverable(e error) bool {
	re, ok := e.(Recoverable)
	return ok && re.IsRecoverable()
}
10843
10844// WrappedServerError wraps an error and satisfies
10845// both the Recoverable and the ServerSideError interfaces
10846type WrappedServerError struct {
10847	Err error
10848}
10849
10850// NewWrappedServerError is used to create a wrapped server side error
10851func NewWrappedServerError(e error) error {
10852	return &WrappedServerError{
10853		Err: e,
10854	}
10855}
10856
10857func (r *WrappedServerError) IsRecoverable() bool {
10858	return IsRecoverable(r.Err)
10859}
10860
10861func (r *WrappedServerError) Error() string {
10862	return r.Err.Error()
10863}
10864
10865func (r *WrappedServerError) IsServerSide() bool {
10866	return true
10867}
10868
10869// ServerSideError is an interface for errors to implement to indicate
10870// errors occurring after the request makes it to a server
10871type ServerSideError interface {
10872	error
10873	IsServerSide() bool
10874}
10875
10876// IsServerSide returns true if error is a wrapped
10877// server side error
10878func IsServerSide(e error) bool {
10879	if se, ok := e.(ServerSideError); ok {
10880		return se.IsServerSide()
10881	}
10882	return false
10883}
10884
// ACLPolicy is used to represent an ACL policy
type ACLPolicy struct {
	Name        string      // Unique name
	Description string      // Human readable
	Rules       string      // HCL or JSON format
	RulesJSON   *acl.Policy // Generated from Rules on read
	Hash        []byte      // Hash of the user-set fields; see SetHash
	CreateIndex uint64      // Raft index at creation
	ModifyIndex uint64      // Raft index of last modification
}
10895
10896// SetHash is used to compute and set the hash of the ACL policy
10897func (c *ACLPolicy) SetHash() []byte {
10898	// Initialize a 256bit Blake2 hash (32 bytes)
10899	hash, err := blake2b.New256(nil)
10900	if err != nil {
10901		panic(err)
10902	}
10903
10904	// Write all the user set fields
10905	_, _ = hash.Write([]byte(c.Name))
10906	_, _ = hash.Write([]byte(c.Description))
10907	_, _ = hash.Write([]byte(c.Rules))
10908
10909	// Finalize the hash
10910	hashVal := hash.Sum(nil)
10911
10912	// Set and return the hash
10913	c.Hash = hashVal
10914	return hashVal
10915}
10916
10917func (a *ACLPolicy) Stub() *ACLPolicyListStub {
10918	return &ACLPolicyListStub{
10919		Name:        a.Name,
10920		Description: a.Description,
10921		Hash:        a.Hash,
10922		CreateIndex: a.CreateIndex,
10923		ModifyIndex: a.ModifyIndex,
10924	}
10925}
10926
10927func (a *ACLPolicy) Validate() error {
10928	var mErr multierror.Error
10929	if !validPolicyName.MatchString(a.Name) {
10930		err := fmt.Errorf("invalid name '%s'", a.Name)
10931		mErr.Errors = append(mErr.Errors, err)
10932	}
10933	if _, err := acl.Parse(a.Rules); err != nil {
10934		err = fmt.Errorf("failed to parse rules: %v", err)
10935		mErr.Errors = append(mErr.Errors, err)
10936	}
10937	if len(a.Description) > maxPolicyDescriptionLength {
10938		err := fmt.Errorf("description longer than %d", maxPolicyDescriptionLength)
10939		mErr.Errors = append(mErr.Errors, err)
10940	}
10941	return mErr.ErrorOrNil()
10942}
10943
// ACLPolicyListStub is used for listing ACL policies
type ACLPolicyListStub struct {
	Name        string // Unique policy name
	Description string // Human readable description
	Hash        []byte // Hash of the user-set fields, see ACLPolicy.SetHash
	CreateIndex uint64
	ModifyIndex uint64
}

// ACLPolicyListRequest is used to request a list of policies
type ACLPolicyListRequest struct {
	QueryOptions
}

// ACLPolicySpecificRequest is used to query a specific policy
type ACLPolicySpecificRequest struct {
	Name string // Name of the policy to look up
	QueryOptions
}

// ACLPolicySetRequest is used to query a set of policies
type ACLPolicySetRequest struct {
	Names []string // Names of the policies to fetch
	QueryOptions
}

// ACLPolicyListResponse is used for a list request
type ACLPolicyListResponse struct {
	Policies []*ACLPolicyListStub
	QueryMeta
}

// SingleACLPolicyResponse is used to return a single policy
type SingleACLPolicyResponse struct {
	Policy *ACLPolicy
	QueryMeta
}

// ACLPolicySetResponse is used to return a set of policies
type ACLPolicySetResponse struct {
	Policies map[string]*ACLPolicy // Keyed by policy name
	QueryMeta
}

// ACLPolicyDeleteRequest is used to delete a set of policies
type ACLPolicyDeleteRequest struct {
	Names []string // Names of the policies to delete
	WriteRequest
}

// ACLPolicyUpsertRequest is used to upsert a set of policies
type ACLPolicyUpsertRequest struct {
	Policies []*ACLPolicy
	WriteRequest
}
10999
// ACLToken represents a client token which is used to Authenticate
type ACLToken struct {
	AccessorID  string    // Public Accessor ID (UUID)
	SecretID    string    // Secret ID, private (UUID)
	Name        string    // Human friendly name
	Type        string    // Client or Management
	Policies    []string  // Policies this token ties to
	Global      bool      // Global or Region local
	Hash        []byte    // Hash of the user-set fields, see SetHash
	CreateTime  time.Time // Time of creation
	CreateIndex uint64
	ModifyIndex uint64
}
11013
11014func (a *ACLToken) Copy() *ACLToken {
11015	c := new(ACLToken)
11016	*c = *a
11017
11018	c.Policies = make([]string, len(a.Policies))
11019	copy(c.Policies, a.Policies)
11020	c.Hash = make([]byte, len(a.Hash))
11021	copy(c.Hash, a.Hash)
11022
11023	return c
11024}
11025
var (
	// AnonymousACLToken is used when no SecretID is provided, and the
	// request is made anonymously.
	AnonymousACLToken = &ACLToken{
		AccessorID: "anonymous",
		Name:       "Anonymous Token",
		Type:       ACLClientToken,
		Policies:   []string{"anonymous"},
		Global:     false,
	}
)
11037
// ACLTokenListStub is the list-friendly subset of an ACLToken's
// fields; notably it omits the private SecretID.
type ACLTokenListStub struct {
	AccessorID  string
	Name        string
	Type        string
	Policies    []string
	Global      bool
	Hash        []byte
	CreateTime  time.Time
	CreateIndex uint64
	ModifyIndex uint64
}
11049
11050// SetHash is used to compute and set the hash of the ACL token
11051func (a *ACLToken) SetHash() []byte {
11052	// Initialize a 256bit Blake2 hash (32 bytes)
11053	hash, err := blake2b.New256(nil)
11054	if err != nil {
11055		panic(err)
11056	}
11057
11058	// Write all the user set fields
11059	_, _ = hash.Write([]byte(a.Name))
11060	_, _ = hash.Write([]byte(a.Type))
11061	for _, policyName := range a.Policies {
11062		_, _ = hash.Write([]byte(policyName))
11063	}
11064	if a.Global {
11065		_, _ = hash.Write([]byte("global"))
11066	} else {
11067		_, _ = hash.Write([]byte("local"))
11068	}
11069
11070	// Finalize the hash
11071	hashVal := hash.Sum(nil)
11072
11073	// Set and return the hash
11074	a.Hash = hashVal
11075	return hashVal
11076}
11077
11078func (a *ACLToken) Stub() *ACLTokenListStub {
11079	return &ACLTokenListStub{
11080		AccessorID:  a.AccessorID,
11081		Name:        a.Name,
11082		Type:        a.Type,
11083		Policies:    a.Policies,
11084		Global:      a.Global,
11085		Hash:        a.Hash,
11086		CreateTime:  a.CreateTime,
11087		CreateIndex: a.CreateIndex,
11088		ModifyIndex: a.ModifyIndex,
11089	}
11090}
11091
11092// Validate is used to check a token for reasonableness
11093func (a *ACLToken) Validate() error {
11094	var mErr multierror.Error
11095	if len(a.Name) > maxTokenNameLength {
11096		mErr.Errors = append(mErr.Errors, fmt.Errorf("token name too long"))
11097	}
11098	switch a.Type {
11099	case ACLClientToken:
11100		if len(a.Policies) == 0 {
11101			mErr.Errors = append(mErr.Errors, fmt.Errorf("client token missing policies"))
11102		}
11103	case ACLManagementToken:
11104		if len(a.Policies) != 0 {
11105			mErr.Errors = append(mErr.Errors, fmt.Errorf("management token cannot be associated with policies"))
11106		}
11107	default:
11108		mErr.Errors = append(mErr.Errors, fmt.Errorf("token type must be client or management"))
11109	}
11110	return mErr.ErrorOrNil()
11111}
11112
11113// PolicySubset checks if a given set of policies is a subset of the token
11114func (a *ACLToken) PolicySubset(policies []string) bool {
11115	// Hot-path the management tokens, superset of all policies.
11116	if a.Type == ACLManagementToken {
11117		return true
11118	}
11119	associatedPolicies := make(map[string]struct{}, len(a.Policies))
11120	for _, policy := range a.Policies {
11121		associatedPolicies[policy] = struct{}{}
11122	}
11123	for _, policy := range policies {
11124		if _, ok := associatedPolicies[policy]; !ok {
11125			return false
11126		}
11127	}
11128	return true
11129}
11130
// ACLTokenListRequest is used to request a list of tokens
type ACLTokenListRequest struct {
	GlobalOnly bool // presumably restricts results to global tokens — confirm against handler
	QueryOptions
}

// ACLTokenSpecificRequest is used to query a specific token
type ACLTokenSpecificRequest struct {
	AccessorID string // Public accessor ID of the token to fetch
	QueryOptions
}

// ACLTokenSetRequest is used to query a set of tokens
type ACLTokenSetRequest struct {
	AccessorIDS []string // Public accessor IDs of the tokens to fetch
	QueryOptions
}

// ACLTokenListResponse is used for a list request
type ACLTokenListResponse struct {
	Tokens []*ACLTokenListStub
	QueryMeta
}

// SingleACLTokenResponse is used to return a single token
type SingleACLTokenResponse struct {
	Token *ACLToken
	QueryMeta
}

// ACLTokenSetResponse is used to return a set of token
type ACLTokenSetResponse struct {
	Tokens map[string]*ACLToken // Keyed by Accessor ID
	QueryMeta
}

// ResolveACLTokenRequest is used to resolve a specific token
type ResolveACLTokenRequest struct {
	SecretID string // Private secret ID identifying the token
	QueryOptions
}

// ResolveACLTokenResponse is used to resolve a single token
type ResolveACLTokenResponse struct {
	Token *ACLToken
	QueryMeta
}

// ACLTokenDeleteRequest is used to delete a set of tokens
type ACLTokenDeleteRequest struct {
	AccessorIDs []string // Public accessor IDs of the tokens to delete
	WriteRequest
}

// ACLTokenBootstrapRequest is used to bootstrap ACLs
type ACLTokenBootstrapRequest struct {
	Token      *ACLToken // Not client specifiable
	ResetIndex uint64    // Reset index is used to clear the bootstrap token
	WriteRequest
}

// ACLTokenUpsertRequest is used to upsert a set of tokens
type ACLTokenUpsertRequest struct {
	Tokens []*ACLToken
	WriteRequest
}

// ACLTokenUpsertResponse is used to return from an ACLTokenUpsertRequest
type ACLTokenUpsertResponse struct {
	Tokens []*ACLToken
	WriteMeta
}

// OneTimeToken is used to log into the web UI using a token provided by the
// command line.
type OneTimeToken struct {
	OneTimeSecretID string    // Single-use secret exchanged for the backing token
	AccessorID      string    // Accessor ID of the backing ACL token
	ExpiresAt       time.Time // Time after which the one-time token is no longer valid
	CreateIndex     uint64
	ModifyIndex     uint64
}

// OneTimeTokenUpsertRequest is the request for a UpsertOneTimeToken RPC
type OneTimeTokenUpsertRequest struct {
	WriteRequest
}

// OneTimeTokenUpsertResponse is the response to a UpsertOneTimeToken RPC.
type OneTimeTokenUpsertResponse struct {
	OneTimeToken *OneTimeToken
	WriteMeta
}

// OneTimeTokenExchangeRequest is a request to swap the one-time token with
// the backing ACL token
type OneTimeTokenExchangeRequest struct {
	OneTimeSecretID string // Single-use secret being exchanged
	WriteRequest
}

// OneTimeTokenExchangeResponse is the response to swapping the one-time token
// with the backing ACL token
type OneTimeTokenExchangeResponse struct {
	Token *ACLToken
	WriteMeta
}

// OneTimeTokenDeleteRequest is a request to delete a group of one-time tokens
type OneTimeTokenDeleteRequest struct {
	AccessorIDs []string // Accessor IDs whose one-time tokens should be deleted
	WriteRequest
}

// OneTimeTokenExpireRequest is a request to delete all expired one-time tokens
type OneTimeTokenExpireRequest struct {
	WriteRequest
}
11249
// RpcError is used for serializing errors with a potential error code
type RpcError struct {
	Message string // Human readable error message
	Code    *int64 // Optional error code; nil when no code applies
}
11255
11256func NewRpcError(err error, code *int64) *RpcError {
11257	return &RpcError{
11258		Message: err.Error(),
11259		Code:    code,
11260	}
11261}
11262
// Error returns the serialized message, satisfying the error interface.
func (r *RpcError) Error() string {
	return r.Message
}
11266