1package scheduler
2
3import (
4	"testing"
5
6	"github.com/hashicorp/nomad/helper/uuid"
7	"github.com/hashicorp/nomad/nomad/mock"
8	"github.com/hashicorp/nomad/nomad/structs"
9	psstructs "github.com/hashicorp/nomad/plugins/shared/structs"
10	"github.com/stretchr/testify/require"
11)
12
13// deviceRequest takes the name, count and potential constraints and affinities
14// and returns a device request.
15func deviceRequest(name string, count uint64,
16	constraints []*structs.Constraint, affinities []*structs.Affinity) *structs.RequestedDevice {
17	return &structs.RequestedDevice{
18		Name:        name,
19		Count:       count,
20		Constraints: constraints,
21		Affinities:  affinities,
22	}
23}
24
25// devNode returns a node containing two devices, an nvidia gpu and an intel
26// FPGA.
27func devNode() *structs.Node {
28	n := mock.NvidiaNode()
29	n.NodeResources.Devices = append(n.NodeResources.Devices, &structs.NodeDeviceResource{
30		Type:   "fpga",
31		Vendor: "intel",
32		Name:   "F100",
33		Attributes: map[string]*psstructs.Attribute{
34			"memory": psstructs.NewIntAttribute(4, psstructs.UnitGiB),
35		},
36		Instances: []*structs.NodeDevice{
37			{
38				ID:      uuid.Generate(),
39				Healthy: true,
40			},
41			{
42				ID:      uuid.Generate(),
43				Healthy: false,
44			},
45		},
46	})
47	return n
48}
49
50// multipleNvidiaNode returns a node containing multiple nvidia device types.
51func multipleNvidiaNode() *structs.Node {
52	n := mock.NvidiaNode()
53	n.NodeResources.Devices = append(n.NodeResources.Devices, &structs.NodeDeviceResource{
54		Type:   "gpu",
55		Vendor: "nvidia",
56		Name:   "2080ti",
57		Attributes: map[string]*psstructs.Attribute{
58			"memory":           psstructs.NewIntAttribute(11, psstructs.UnitGiB),
59			"cuda_cores":       psstructs.NewIntAttribute(4352, ""),
60			"graphics_clock":   psstructs.NewIntAttribute(1350, psstructs.UnitMHz),
61			"memory_bandwidth": psstructs.NewIntAttribute(14, psstructs.UnitGBPerS),
62		},
63		Instances: []*structs.NodeDevice{
64			{
65				ID:      uuid.Generate(),
66				Healthy: true,
67			},
68			{
69				ID:      uuid.Generate(),
70				Healthy: true,
71			},
72		},
73	})
74	return n
75
76}
77
78// collectInstanceIDs returns the IDs of the device instances
79func collectInstanceIDs(devices ...*structs.NodeDeviceResource) []string {
80	var out []string
81	for _, d := range devices {
82		for _, i := range d.Instances {
83			out = append(out, i.ID)
84		}
85	}
86	return out
87}
88
89// Test that asking for a device that isn't fully specified works.
90func TestDeviceAllocator_Allocate_GenericRequest(t *testing.T) {
91	require := require.New(t)
92	_, ctx := testContext(t)
93	n := devNode()
94	d := newDeviceAllocator(ctx, n)
95	require.NotNil(d)
96
97	// Build the request
98	ask := deviceRequest("gpu", 1, nil, nil)
99
100	out, score, err := d.AssignDevice(ask)
101	require.NotNil(out)
102	require.Zero(score)
103	require.NoError(err)
104
105	// Check that we got the nvidia device
106	require.Len(out.DeviceIDs, 1)
107	require.Contains(collectInstanceIDs(n.NodeResources.Devices[0]), out.DeviceIDs[0])
108}
109
110// Test that asking for a device that is fully specified works.
111func TestDeviceAllocator_Allocate_FullyQualifiedRequest(t *testing.T) {
112	require := require.New(t)
113	_, ctx := testContext(t)
114	n := devNode()
115	d := newDeviceAllocator(ctx, n)
116	require.NotNil(d)
117
118	// Build the request
119	ask := deviceRequest("intel/fpga/F100", 1, nil, nil)
120
121	out, score, err := d.AssignDevice(ask)
122	require.NotNil(out)
123	require.Zero(score)
124	require.NoError(err)
125
126	// Check that we got the nvidia device
127	require.Len(out.DeviceIDs, 1)
128	require.Contains(collectInstanceIDs(n.NodeResources.Devices[1]), out.DeviceIDs[0])
129}
130
131// Test that asking for a device with too much count doesn't place
132func TestDeviceAllocator_Allocate_NotEnoughInstances(t *testing.T) {
133	require := require.New(t)
134	_, ctx := testContext(t)
135	n := devNode()
136	d := newDeviceAllocator(ctx, n)
137	require.NotNil(d)
138
139	// Build the request
140	ask := deviceRequest("gpu", 4, nil, nil)
141
142	out, _, err := d.AssignDevice(ask)
143	require.Nil(out)
144	require.Error(err)
145	require.Contains(err.Error(), "no devices match request")
146}
147
148// Test that asking for a device with constraints works
149func TestDeviceAllocator_Allocate_Constraints(t *testing.T) {
150	n := multipleNvidiaNode()
151	nvidia0 := n.NodeResources.Devices[0]
152	nvidia1 := n.NodeResources.Devices[1]
153
154	cases := []struct {
155		Name           string
156		Constraints    []*structs.Constraint
157		ExpectedDevice *structs.NodeDeviceResource
158		NoPlacement    bool
159	}{
160		{
161			Name: "gpu",
162			Constraints: []*structs.Constraint{
163				{
164					LTarget: "${device.attr.cuda_cores}",
165					Operand: ">",
166					RTarget: "4000",
167				},
168			},
169			ExpectedDevice: nvidia1,
170		},
171		{
172			Name: "gpu",
173			Constraints: []*structs.Constraint{
174				{
175					LTarget: "${device.attr.cuda_cores}",
176					Operand: "<",
177					RTarget: "4000",
178				},
179			},
180			ExpectedDevice: nvidia0,
181		},
182		{
183			Name: "nvidia/gpu",
184			Constraints: []*structs.Constraint{
185				// First two are shared across both devices
186				{
187					LTarget: "${device.attr.memory_bandwidth}",
188					Operand: ">",
189					RTarget: "10 GB/s",
190				},
191				{
192					LTarget: "${device.attr.memory}",
193					Operand: "is",
194					RTarget: "11264 MiB",
195				},
196				{
197					LTarget: "${device.attr.graphics_clock}",
198					Operand: ">",
199					RTarget: "1.4 GHz",
200				},
201			},
202			ExpectedDevice: nvidia0,
203		},
204		{
205			Name:        "intel/gpu",
206			NoPlacement: true,
207		},
208		{
209			Name: "nvidia/gpu",
210			Constraints: []*structs.Constraint{
211				{
212					LTarget: "${device.attr.memory_bandwidth}",
213					Operand: ">",
214					RTarget: "10 GB/s",
215				},
216				{
217					LTarget: "${device.attr.memory}",
218					Operand: "is",
219					RTarget: "11264 MiB",
220				},
221				// Rules both out
222				{
223					LTarget: "${device.attr.graphics_clock}",
224					Operand: ">",
225					RTarget: "2.4 GHz",
226				},
227			},
228			NoPlacement: true,
229		},
230	}
231
232	for _, c := range cases {
233		t.Run(c.Name, func(t *testing.T) {
234			require := require.New(t)
235			_, ctx := testContext(t)
236			d := newDeviceAllocator(ctx, n)
237			require.NotNil(d)
238
239			// Build the request
240			ask := deviceRequest(c.Name, 1, c.Constraints, nil)
241
242			out, score, err := d.AssignDevice(ask)
243			if c.NoPlacement {
244				require.Nil(out)
245			} else {
246				require.NotNil(out)
247				require.Zero(score)
248				require.NoError(err)
249
250				// Check that we got the nvidia device
251				require.Len(out.DeviceIDs, 1)
252				require.Contains(collectInstanceIDs(c.ExpectedDevice), out.DeviceIDs[0])
253			}
254		})
255	}
256}
257
258// Test that asking for a device with affinities works
259func TestDeviceAllocator_Allocate_Affinities(t *testing.T) {
260	n := multipleNvidiaNode()
261	nvidia0 := n.NodeResources.Devices[0]
262	nvidia1 := n.NodeResources.Devices[1]
263
264	cases := []struct {
265		Name           string
266		Affinities     []*structs.Affinity
267		ExpectedDevice *structs.NodeDeviceResource
268		ZeroScore      bool
269	}{
270		{
271			Name: "gpu",
272			Affinities: []*structs.Affinity{
273				{
274					LTarget: "${device.attr.cuda_cores}",
275					Operand: ">",
276					RTarget: "4000",
277					Weight:  60,
278				},
279			},
280			ExpectedDevice: nvidia1,
281		},
282		{
283			Name: "gpu",
284			Affinities: []*structs.Affinity{
285				{
286					LTarget: "${device.attr.cuda_cores}",
287					Operand: "<",
288					RTarget: "4000",
289					Weight:  10,
290				},
291			},
292			ExpectedDevice: nvidia0,
293		},
294		{
295			Name: "gpu",
296			Affinities: []*structs.Affinity{
297				{
298					LTarget: "${device.attr.cuda_cores}",
299					Operand: ">",
300					RTarget: "4000",
301					Weight:  -20,
302				},
303			},
304			ZeroScore:      true,
305			ExpectedDevice: nvidia0,
306		},
307		{
308			Name: "nvidia/gpu",
309			Affinities: []*structs.Affinity{
310				// First two are shared across both devices
311				{
312					LTarget: "${device.attr.memory_bandwidth}",
313					Operand: ">",
314					RTarget: "10 GB/s",
315					Weight:  20,
316				},
317				{
318					LTarget: "${device.attr.memory}",
319					Operand: "is",
320					RTarget: "11264 MiB",
321					Weight:  20,
322				},
323				{
324					LTarget: "${device.attr.graphics_clock}",
325					Operand: ">",
326					RTarget: "1.4 GHz",
327					Weight:  90,
328				},
329			},
330			ExpectedDevice: nvidia0,
331		},
332	}
333
334	for _, c := range cases {
335		t.Run(c.Name, func(t *testing.T) {
336			require := require.New(t)
337			_, ctx := testContext(t)
338			d := newDeviceAllocator(ctx, n)
339			require.NotNil(d)
340
341			// Build the request
342			ask := deviceRequest(c.Name, 1, nil, c.Affinities)
343
344			out, score, err := d.AssignDevice(ask)
345			require.NotNil(out)
346			require.NoError(err)
347			if c.ZeroScore {
348				require.Zero(score)
349			} else {
350				require.NotZero(score)
351			}
352
353			// Check that we got the nvidia device
354			require.Len(out.DeviceIDs, 1)
355			require.Contains(collectInstanceIDs(c.ExpectedDevice), out.DeviceIDs[0])
356		})
357	}
358}
359