// SPDX-License-Identifier: GPL-2.0
/*
 * VMware Balloon driver.
 *
 * Copyright (C) 2000-2018, VMware, Inc. All Rights Reserved.
 *
 * This is the VMware physical memory management driver for Linux. The
 * driver acts like a "balloon" that can be inflated to reclaim physical
 * pages by reserving them in the guest and invalidating them in the
 * monitor, freeing up the underlying machine pages so they can be
 * allocated to other guests. The balloon can also be deflated to allow
 * the guest to use more physical memory. Higher level policies can
 * control the sizes of balloons in VMs in order to manage physical
 * memory resources.
 */

//#define DEBUG
#define pr_fmt(fmt) KBUILD_MODNAME ": " fmt

#include <linux/types.h>
#include <linux/kernel.h>
#include <linux/mm.h>
#include <linux/vmalloc.h>
#include <linux/sched.h>
#include <linux/module.h>
#include <linux/workqueue.h>
#include <linux/debugfs.h>
#include <linux/seq_file.h>
#include <linux/vmw_vmci_defs.h>
#include <linux/vmw_vmci_api.h>
#include <asm/hypervisor.h>

MODULE_AUTHOR("VMware, Inc.");
MODULE_DESCRIPTION("VMware Memory Control (Balloon) Driver");
MODULE_VERSION("1.5.0.0-k");
MODULE_ALIAS("dmi:*:svnVMware*:*");
MODULE_ALIAS("vmware_vmmemctl");
MODULE_LICENSE("GPL");

/*
 * Use __GFP_HIGHMEM to allow pages from HIGHMEM zone. We don't allow wait
 * (__GFP_RECLAIM) for huge page allocations. Use __GFP_NOWARN to suppress
 * page allocation failure warnings. Disallow access to emergency low-memory
 * pools.
 */
#define VMW_HUGE_PAGE_ALLOC_FLAGS	(__GFP_HIGHMEM|__GFP_NOWARN|	\
					 __GFP_NOMEMALLOC)

/*
 * Use __GFP_HIGHMEM to allow pages from HIGHMEM zone. We allow lightweight
 * reclamation (__GFP_NORETRY). Use __GFP_NOWARN to suppress page allocation
 * failure warnings. Disallow access to emergency low-memory pools.
 */
#define VMW_PAGE_ALLOC_FLAGS		(__GFP_HIGHMEM|__GFP_NOWARN|	\
					 __GFP_NOMEMALLOC|__GFP_NORETRY)

/* Maximum number of refused pages we accumulate during an inflation cycle */
#define VMW_BALLOON_MAX_REFUSED		16

/*
 * Hypervisor communication port definitions.
 */
#define VMW_BALLOON_HV_PORT		0x5670
#define VMW_BALLOON_HV_MAGIC		0x456c6d6f
#define VMW_BALLOON_GUEST_ID		1	/* Linux */
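
/*
 * For reference: VMW_BALLOON_HV_MAGIC is ASCII "Elmo" (0x45 0x6c 0x6d
 * 0x6f), and commands are issued with an "inl" on the balloon-specific
 * port 0x5670; see __vmballoon_cmd() below.
 */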

enum vmwballoon_capabilities {
	/*
	 * Bit 0 is reserved and not associated with any capability.
	 */
	VMW_BALLOON_BASIC_CMDS			= (1 << 1),
	VMW_BALLOON_BATCHED_CMDS		= (1 << 2),
	VMW_BALLOON_BATCHED_2M_CMDS		= (1 << 3),
	VMW_BALLOON_SIGNALLED_WAKEUP_CMD	= (1 << 4),
};

#define VMW_BALLOON_CAPABILITIES	(VMW_BALLOON_BASIC_CMDS \
					| VMW_BALLOON_BATCHED_CMDS \
					| VMW_BALLOON_BATCHED_2M_CMDS \
					| VMW_BALLOON_SIGNALLED_WAKEUP_CMD)

#define VMW_BALLOON_2M_ORDER		(PMD_SHIFT - PAGE_SHIFT)
#define VMW_BALLOON_NUM_PAGE_SIZES	(2)
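
/*
 * On x86 with 4KB base pages, PMD_SHIFT - PAGE_SHIFT = 21 - 12 = 9, so a
 * 2MB ballooned page spans 1 << 9 = 512 small pages; vmballoon_page_size()
 * below relies on this when accounting the balloon size in 4KB pages.
 */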

/*
 * Backdoor commands availability:
 *
 * START, GET_TARGET and GUEST_ID are always available.
 *
 * VMW_BALLOON_BASIC_CMDS:
 *	LOCK and UNLOCK commands.
 * VMW_BALLOON_BATCHED_CMDS:
 *	BATCHED_LOCK and BATCHED_UNLOCK commands.
 * VMW_BALLOON_BATCHED_2M_CMDS:
 *	BATCHED_2M_LOCK and BATCHED_2M_UNLOCK commands.
 * VMW_BALLOON_SIGNALLED_WAKEUP_CMD:
 *	VMW_BALLOON_CMD_VMCI_DOORBELL_SET command.
 */
#define VMW_BALLOON_CMD_START			0
#define VMW_BALLOON_CMD_GET_TARGET		1
#define VMW_BALLOON_CMD_LOCK			2
#define VMW_BALLOON_CMD_UNLOCK			3
#define VMW_BALLOON_CMD_GUEST_ID		4
#define VMW_BALLOON_CMD_BATCHED_LOCK		6
#define VMW_BALLOON_CMD_BATCHED_UNLOCK		7
#define VMW_BALLOON_CMD_BATCHED_2M_LOCK		8
#define VMW_BALLOON_CMD_BATCHED_2M_UNLOCK	9
#define VMW_BALLOON_CMD_VMCI_DOORBELL_SET	10

#define VMW_BALLOON_CMD_NUM			11

/* error codes */
#define VMW_BALLOON_SUCCESS		        0
#define VMW_BALLOON_FAILURE		        -1
#define VMW_BALLOON_ERROR_CMD_INVALID	        1
#define VMW_BALLOON_ERROR_PPN_INVALID	        2
#define VMW_BALLOON_ERROR_PPN_LOCKED	        3
#define VMW_BALLOON_ERROR_PPN_UNLOCKED	        4
#define VMW_BALLOON_ERROR_PPN_PINNED	        5
#define VMW_BALLOON_ERROR_PPN_NOTNEEDED	        6
#define VMW_BALLOON_ERROR_RESET		        7
#define VMW_BALLOON_ERROR_BUSY		        8

#define VMW_BALLOON_SUCCESS_WITH_CAPABILITIES	(0x03000000)

#define VMW_BALLOON_CMD_WITH_TARGET_MASK			\
	((1UL << VMW_BALLOON_CMD_GET_TARGET)		|	\
	 (1UL << VMW_BALLOON_CMD_LOCK)			|	\
	 (1UL << VMW_BALLOON_CMD_UNLOCK)		|	\
	 (1UL << VMW_BALLOON_CMD_BATCHED_LOCK)		|	\
	 (1UL << VMW_BALLOON_CMD_BATCHED_UNLOCK)	|	\
	 (1UL << VMW_BALLOON_CMD_BATCHED_2M_LOCK)	|	\
	 (1UL << VMW_BALLOON_CMD_BATCHED_2M_UNLOCK))

static const char * const vmballoon_cmd_names[] = {
	[VMW_BALLOON_CMD_START]			= "start",
	[VMW_BALLOON_CMD_GET_TARGET]		= "target",
	[VMW_BALLOON_CMD_LOCK]			= "lock",
	[VMW_BALLOON_CMD_UNLOCK]		= "unlock",
	[VMW_BALLOON_CMD_GUEST_ID]		= "guestType",
	[VMW_BALLOON_CMD_BATCHED_LOCK]		= "batchLock",
	[VMW_BALLOON_CMD_BATCHED_UNLOCK]	= "batchUnlock",
	[VMW_BALLOON_CMD_BATCHED_2M_LOCK]	= "2m-lock",
	[VMW_BALLOON_CMD_BATCHED_2M_UNLOCK]	= "2m-unlock",
	[VMW_BALLOON_CMD_VMCI_DOORBELL_SET]	= "doorbellSet"
};
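
/*
 * Note: command number 5 is not assigned, so the designated initializers
 * above leave a NULL hole at index 5; vmballoon_debug_show() skips NULL
 * entries when printing per-command statistics.
 */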

#ifdef CONFIG_DEBUG_FS
struct vmballoon_stats {
	unsigned int timer;
	unsigned int doorbell;

	/* allocation statistics */
	unsigned int alloc[VMW_BALLOON_NUM_PAGE_SIZES];
	unsigned int alloc_fail[VMW_BALLOON_NUM_PAGE_SIZES];
	unsigned int refused_alloc[VMW_BALLOON_NUM_PAGE_SIZES];
	unsigned int refused_free[VMW_BALLOON_NUM_PAGE_SIZES];
	unsigned int free[VMW_BALLOON_NUM_PAGE_SIZES];

	/* Monitor operations. */
	unsigned long ops[VMW_BALLOON_CMD_NUM];
	unsigned long ops_fail[VMW_BALLOON_CMD_NUM];
};

#define STATS_INC(stat) (stat)++
#else
#define STATS_INC(stat)
#endif

static DEFINE_STATIC_KEY_TRUE(vmw_balloon_batching);

struct vmballoon_page_size {
	/* list of reserved physical pages */
	struct list_head pages;

	/* transient list of non-balloonable pages */
	struct list_head refused_pages;
	unsigned int n_refused_pages;
};

/**
 * struct vmballoon_batch_entry - a batch entry for lock or unlock.
 *
 * @status: the status of the operation, which is written by the hypervisor.
 * @reserved: reserved for future use. Must be set to zero.
 * @pfn: the physical frame number of the page to be locked or unlocked.
 */
struct vmballoon_batch_entry {
	u64 status : 5;
	u64 reserved : PAGE_SHIFT - 5;
	u64 pfn : 52;
} __packed;
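
/*
 * The bit-fields above pack to exactly 64 bits with 4KB pages (5 +
 * (PAGE_SHIFT - 5) + 52 = 64 when PAGE_SHIFT == 12), so one batch page
 * holds PAGE_SIZE / sizeof(struct vmballoon_batch_entry) = 512 entries.
 */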

struct vmballoon {
	struct vmballoon_page_size page_sizes[VMW_BALLOON_NUM_PAGE_SIZES];

	/* supported page sizes. 1 == 4k pages only, 2 == 4k and 2m pages */
	unsigned supported_page_sizes;

	/* balloon size in pages */
	unsigned int size;
	unsigned int target;

	/* reset flag */
	bool reset_required;

	unsigned long capabilities;

	/**
	 * @batch_page: pointer to communication batch page.
	 *
	 * When batching is used, batch_page points to a page, which holds up
	 * to @batch_max_pages entries for locking or unlocking.
	 */
	struct vmballoon_batch_entry *batch_page;

	unsigned int batch_max_pages;
	struct page *page;

#ifdef CONFIG_DEBUG_FS
	/* statistics */
	struct vmballoon_stats stats;

	/* debugfs file exporting statistics */
	struct dentry *dbg_entry;
#endif

	struct sysinfo sysinfo;

	struct delayed_work dwork;

	struct vmci_handle vmci_doorbell;
};

static struct vmballoon balloon;

static inline unsigned long
__vmballoon_cmd(struct vmballoon *b, unsigned long cmd, unsigned long arg1,
		unsigned long arg2, unsigned long *result)
{
	unsigned long status, dummy1, dummy2, dummy3, local_result;

	STATS_INC(b->stats.ops[cmd]);

	asm volatile ("inl %%dx" :
		"=a"(status),
		"=c"(dummy1),
		"=d"(dummy2),
		"=b"(local_result),
		"=S"(dummy3) :
		"0"(VMW_BALLOON_HV_MAGIC),
		"1"(cmd),
		"2"(VMW_BALLOON_HV_PORT),
		"3"(arg1),
		"4"(arg2) :
		"memory");

	/* update the result if needed */
	if (result)
		*result = (cmd == VMW_BALLOON_CMD_START) ? dummy1 :
							   local_result;

	/* update target when applicable */
	if (status == VMW_BALLOON_SUCCESS &&
	    ((1ul << cmd) & VMW_BALLOON_CMD_WITH_TARGET_MASK))
		b->target = local_result;

	if (status != VMW_BALLOON_SUCCESS &&
	    status != VMW_BALLOON_SUCCESS_WITH_CAPABILITIES) {
		STATS_INC(b->stats.ops_fail[cmd]);
		pr_debug("%s: %s [0x%lx,0x%lx) failed, returned %ld\n",
			 __func__, vmballoon_cmd_names[cmd], arg1, arg2,
			 status);
	}

	/* mark reset required accordingly */
	if (status == VMW_BALLOON_ERROR_RESET)
		b->reset_required = true;

	return status;
}
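
/*
 * For reference, the register protocol implied by the asm constraints
 * above: %eax carries the magic in and the status out, %ecx the command,
 * %edx the port, %ebx arg1 and the result, and %esi arg2. START is
 * special: its result (the capability set) comes back in %ecx, not %ebx.
 */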

static __always_inline unsigned long
vmballoon_cmd(struct vmballoon *b, unsigned long cmd, unsigned long arg1,
	      unsigned long arg2)
{
	unsigned long dummy;

	return __vmballoon_cmd(b, cmd, arg1, arg2, &dummy);
}

/*
 * Send the "start" command to the host, communicating the supported version
 * of the protocol.
 */
static bool vmballoon_send_start(struct vmballoon *b, unsigned long req_caps)
{
	unsigned long status, capabilities;
	bool success;

	status = __vmballoon_cmd(b, VMW_BALLOON_CMD_START, req_caps, 0,
				 &capabilities);

	switch (status) {
	case VMW_BALLOON_SUCCESS_WITH_CAPABILITIES:
		b->capabilities = capabilities;
		success = true;
		break;
	case VMW_BALLOON_SUCCESS:
		b->capabilities = VMW_BALLOON_BASIC_CMDS;
		success = true;
		break;
	default:
		success = false;
	}

	/*
	 * 2MB pages are only supported with batching. If batching is for some
	 * reason disabled, do not use 2MB pages, since otherwise the legacy
	 * mechanism is used with 2MB pages, causing a failure.
	 */
	if ((b->capabilities & VMW_BALLOON_BATCHED_2M_CMDS) &&
	    (b->capabilities & VMW_BALLOON_BATCHED_CMDS))
		b->supported_page_sizes = 2;
	else
		b->supported_page_sizes = 1;

	return success;
}
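
/*
 * Usage sketch (illustrative): during reset the driver first offers the
 * full VMW_BALLOON_CAPABILITIES set and, if batching setup later fails,
 * retries with a null capability set:
 *
 *	if (!vmballoon_send_start(b, VMW_BALLOON_CAPABILITIES))
 *		return;
 *
 * (see vmballoon_reset() below).
 */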

/*
 * Communicate the guest type to the host so that it can adjust the
 * ballooning algorithm to the one most appropriate for the guest. This
 * command is normally issued after sending the "start" command and is part
 * of the standard reset sequence.
 */
static bool vmballoon_send_guest_id(struct vmballoon *b)
{
	unsigned long status;

	status = vmballoon_cmd(b, VMW_BALLOON_CMD_GUEST_ID,
			       VMW_BALLOON_GUEST_ID, 0);

	if (status == VMW_BALLOON_SUCCESS)
		return true;

	return false;
}

static u16 vmballoon_page_size(bool is_2m_page)
{
	if (is_2m_page)
		return 1 << VMW_BALLOON_2M_ORDER;

	return 1;
}

/*
 * Retrieve the desired balloon size from the host.
 */
static bool vmballoon_send_get_target(struct vmballoon *b)
{
	unsigned long status;
	unsigned long limit;
	u32 limit32;

	/*
	 * si_meminfo() is cheap. Moreover, we want to provide a dynamic
	 * max balloon size later. So let us call si_meminfo() every
	 * iteration.
	 */
	si_meminfo(&b->sysinfo);
	limit = b->sysinfo.totalram;

	/* Ensure limit fits in 32 bits */
	limit32 = (u32)limit;
	if (limit != limit32)
		return false;

	status = vmballoon_cmd(b, VMW_BALLOON_CMD_GET_TARGET, limit, 0);

	if (status == VMW_BALLOON_SUCCESS)
		return true;

	return false;
}
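
/*
 * Note: VMW_BALLOON_CMD_GET_TARGET is part of
 * VMW_BALLOON_CMD_WITH_TARGET_MASK, so a successful call above also
 * updates b->target as a side effect inside __vmballoon_cmd().
 */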

static struct page *vmballoon_alloc_page(bool is_2m_page)
{
	if (is_2m_page)
		return alloc_pages(VMW_HUGE_PAGE_ALLOC_FLAGS,
				   VMW_BALLOON_2M_ORDER);

	return alloc_page(VMW_PAGE_ALLOC_FLAGS);
}

static void vmballoon_free_page(struct page *page, bool is_2m_page)
{
	if (is_2m_page)
		__free_pages(page, VMW_BALLOON_2M_ORDER);
	else
		__free_page(page);
}

/*
 * Quickly release all pages allocated for the balloon. This function is
 * called when the host decides to "reset" the balloon for one reason or
 * another. Unlike a normal "deflate" we do not (shall not) notify the host
 * of the pages being released.
 */
static void vmballoon_pop(struct vmballoon *b)
{
	struct page *page, *next;
	unsigned is_2m_pages;

	for (is_2m_pages = 0; is_2m_pages < VMW_BALLOON_NUM_PAGE_SIZES;
			is_2m_pages++) {
		struct vmballoon_page_size *page_size =
				&b->page_sizes[is_2m_pages];
		u16 size_per_page = vmballoon_page_size(is_2m_pages);

		list_for_each_entry_safe(page, next, &page_size->pages, lru) {
			list_del(&page->lru);
			vmballoon_free_page(page, is_2m_pages);
			STATS_INC(b->stats.free[is_2m_pages]);
			b->size -= size_per_page;
			cond_resched();
		}
	}

	/* Clearing the batch_page unconditionally has no adverse effect */
	free_page((unsigned long)b->batch_page);
	b->batch_page = NULL;
}

/**
 * vmballoon_status_page - returns the status of (un)lock operation
 *
 * @b: pointer to the balloon.
 * @idx: index for the page for which the operation is performed.
 * @p: pointer to where the page struct is returned.
 *
 * Following a lock or unlock operation, returns the status of the operation
 * for an individual page. Provides the page that the operation was performed
 * on in the @p argument.
 *
 * Returns: The status of a lock or unlock operation for an individual page.
 */
static unsigned long vmballoon_status_page(struct vmballoon *b, int idx,
					   struct page **p)
{
	if (static_branch_likely(&vmw_balloon_batching)) {
		/* batching mode */
		*p = pfn_to_page(b->batch_page[idx].pfn);
		return b->batch_page[idx].status;
	}

	/* non-batching mode */
	*p = b->page;

	/*
	 * If a failure occurs, the indication will be provided in the status
	 * of the entire operation, which is considered before the individual
	 * page status. So for non-batching mode, the indication is always of
	 * success.
	 */
	return VMW_BALLOON_SUCCESS;
}

/**
 * vmballoon_lock_op - notifies the host about inflated/deflated pages.
 * @b: pointer to the balloon.
 * @num_pages: number of inflated/deflated pages.
 * @is_2m_pages: whether the page(s) are 2M (or 4k).
 * @lock: whether the operation is lock (or unlock).
 *
 * Notify the host about page(s) that were ballooned (or removed from the
 * balloon) so that the host can use them without fear that the guest will
 * need them (or stop using them, since the guest does). The host may reject
 * some pages; we need to check the return value and maybe submit a different
 * page. The pages that are inflated/deflated are pointed to by @b->page.
 *
 * Return: result as provided by the hypervisor.
 */
static unsigned long vmballoon_lock_op(struct vmballoon *b,
				       unsigned int num_pages,
				       bool is_2m_pages, bool lock)
{
	unsigned long cmd, pfn;

	if (static_branch_likely(&vmw_balloon_batching)) {
		if (lock)
			cmd = is_2m_pages ? VMW_BALLOON_CMD_BATCHED_2M_LOCK :
					    VMW_BALLOON_CMD_BATCHED_LOCK;
		else
			cmd = is_2m_pages ? VMW_BALLOON_CMD_BATCHED_2M_UNLOCK :
					    VMW_BALLOON_CMD_BATCHED_UNLOCK;

		pfn = PHYS_PFN(virt_to_phys(b->batch_page));
	} else {
		cmd = lock ? VMW_BALLOON_CMD_LOCK : VMW_BALLOON_CMD_UNLOCK;
		pfn = page_to_pfn(b->page);

		/* In non-batching mode, PFNs must fit in 32 bits */
		if (unlikely(pfn != (u32)pfn))
			return VMW_BALLOON_ERROR_PPN_INVALID;
	}

	return vmballoon_cmd(b, cmd, pfn, num_pages);
}

static int vmballoon_lock(struct vmballoon *b, unsigned int num_pages,
			  bool is_2m_pages)
{
	unsigned long batch_status;
	int i;
	u16 size_per_page = vmballoon_page_size(is_2m_pages);

	batch_status = vmballoon_lock_op(b, num_pages, is_2m_pages, true);

	for (i = 0; i < num_pages; i++) {
		unsigned long status;
		struct page *p;
		struct vmballoon_page_size *page_size =
				&b->page_sizes[is_2m_pages];

		status = vmballoon_status_page(b, i, &p);

		/*
		 * Failure of the whole batch overrides the result of a
		 * single operation.
		 */
		if (batch_status != VMW_BALLOON_SUCCESS)
			status = batch_status;

		if (status == VMW_BALLOON_SUCCESS) {
			/* track allocated page */
			list_add(&p->lru, &page_size->pages);

			/* update balloon size */
			b->size += size_per_page;
			continue;
		}

		/* Error occurred */
		STATS_INC(b->stats.refused_alloc[is_2m_pages]);

		/*
		 * Place the page on the list of non-balloonable pages
		 * and retry allocation, unless we already accumulated
		 * too many of them, in which case take a breather.
		 */
		list_add(&p->lru, &page_size->refused_pages);
		page_size->n_refused_pages++;
	}

	return batch_status == VMW_BALLOON_SUCCESS ? 0 : -EIO;
}
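
/*
 * Note: pages the hypervisor refused stay on page_size->refused_pages and
 * are freed by vmballoon_release_refused_pages() once inflation finishes
 * or once VMW_BALLOON_MAX_REFUSED (16) of them have accumulated.
 */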

/*
 * Release the pages allocated for the balloon. Note that we first notify
 * the host so it can make sure the pages will be available for the guest
 * to use, if needed.
 */
static int vmballoon_unlock(struct vmballoon *b, unsigned int num_pages,
			    bool is_2m_pages)
{
	int i;
	unsigned long batch_status;
	u16 size_per_page = vmballoon_page_size(is_2m_pages);

	batch_status = vmballoon_lock_op(b, num_pages, is_2m_pages, false);

	for (i = 0; i < num_pages; i++) {
		struct vmballoon_page_size *page_size;
		unsigned long status;
		struct page *p;

		status = vmballoon_status_page(b, i, &p);
		page_size = &b->page_sizes[is_2m_pages];

		/*
		 * Failure of the whole batch overrides the result of a
		 * single operation.
		 */
		if (batch_status != VMW_BALLOON_SUCCESS)
			status = batch_status;

		if (status != VMW_BALLOON_SUCCESS) {
			/*
			 * That page wasn't successfully unlocked by the
			 * hypervisor, re-add it to the list of pages owned by
			 * the balloon driver.
			 */
			list_add(&p->lru, &page_size->pages);
		} else {
			/* deallocate page */
			vmballoon_free_page(p, is_2m_pages);
			STATS_INC(b->stats.free[is_2m_pages]);

			/* update balloon size */
			b->size -= size_per_page;
		}
	}

	return batch_status == VMW_BALLOON_SUCCESS ? 0 : -EIO;
}

/*
 * Release pages that were allocated while attempting to inflate the
 * balloon but were refused by the host for one reason or another.
 */
static void vmballoon_release_refused_pages(struct vmballoon *b,
		bool is_2m_pages)
{
	struct page *page, *next;
	struct vmballoon_page_size *page_size =
			&b->page_sizes[is_2m_pages];

	list_for_each_entry_safe(page, next, &page_size->refused_pages, lru) {
		list_del(&page->lru);
		vmballoon_free_page(page, is_2m_pages);
		STATS_INC(b->stats.refused_free[is_2m_pages]);
	}

	page_size->n_refused_pages = 0;
}

static void vmballoon_add_page(struct vmballoon *b, int idx, struct page *p)
{
	if (static_branch_likely(&vmw_balloon_batching))
		b->batch_page[idx] = (struct vmballoon_batch_entry)
					{ .pfn = page_to_pfn(p) };
	else
		b->page = p;
}

/**
 * vmballoon_change - retrieve the required balloon change
 *
 * @b: pointer to the balloon.
 *
 * Return: the required change for the balloon size. A positive number
 * indicates inflation, a negative number indicates deflation.
 */
static int64_t vmballoon_change(struct vmballoon *b)
{
	int64_t size, target;

	size = b->size;
	target = b->target;

	/*
	 * We must cast first because of int sizes; otherwise we might get
	 * huge positives instead of negatives.
	 */

	if (b->reset_required)
		return 0;

	/* consider a 2MB slack on deflate, unless the balloon is emptied */
	if (target < size && size - target < vmballoon_page_size(true) &&
	    target != 0)
		return 0;

	return target - size;
}
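
/*
 * Worked example: with size == 1000 and target == 900 (in 4KB pages), the
 * gap of 100 pages is below the 512-page (2MB) slack, so the function
 * returns 0 and no deflation occurs; a target of 0 always deflates fully.
 */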

/*
 * Inflate the balloon towards its target size. Note that we try to limit
 * the rate of allocation to make sure we are not choking the rest of the
 * system.
 */
static void vmballoon_inflate(struct vmballoon *b)
{
	unsigned int num_pages = 0;
	int error = 0;
	bool is_2m_pages;

	/*
	 * First try NOSLEEP page allocations to inflate the balloon.
	 *
	 * If we do not throttle nosleep allocations, we can drain all
	 * free pages in the guest quickly (if the balloon target is high).
	 * As a side-effect, draining free pages helps to inform (force)
	 * the guest to start swapping if the balloon target is not met yet,
	 * which is a desired behavior. However, the balloon driver can
	 * consume all available CPU cycles if too many pages are allocated
	 * in a second. Therefore, we throttle nosleep allocations even when
	 * the guest is not under memory pressure. OTOH, if we have already
	 * predicted that the guest is under memory pressure, then we
	 * slow down page allocations considerably.
	 */

	/*
	 * Start with 2MB pages when both sides support them; on allocation
	 * failure we fall back to 4k pages below.
	 */
	is_2m_pages = b->supported_page_sizes == VMW_BALLOON_NUM_PAGE_SIZES;

	while ((int64_t)(num_pages * vmballoon_page_size(is_2m_pages)) <
	       vmballoon_change(b)) {
		struct page *page;

		STATS_INC(b->stats.alloc[is_2m_pages]);
		page = vmballoon_alloc_page(is_2m_pages);
		if (!page) {
			STATS_INC(b->stats.alloc_fail[is_2m_pages]);
			if (is_2m_pages) {
				vmballoon_lock(b, num_pages, true);

				/*
				 * ignore errors from locking as we now switch
				 * to 4k pages and we might get different
				 * errors.
				 */

				num_pages = 0;
				is_2m_pages = false;
				continue;
			}
			break;
		}

		vmballoon_add_page(b, num_pages++, page);
		if (num_pages == b->batch_max_pages) {
			struct vmballoon_page_size *page_size =
					&b->page_sizes[is_2m_pages];

			error = vmballoon_lock(b, num_pages, is_2m_pages);

			num_pages = 0;

			/*
			 * Stop allocating this page size if we already
			 * accumulated too many pages that the hypervisor
			 * refused.
			 */
			if (page_size->n_refused_pages >=
			    VMW_BALLOON_MAX_REFUSED) {
				if (!is_2m_pages)
					break;

				/*
				 * Release the refused pages as we move to 4k
				 * pages.
				 */
				vmballoon_release_refused_pages(b, true);
				is_2m_pages = false;
			}

			if (error)
				break;
		}

		cond_resched();
	}

	if (num_pages > 0)
		vmballoon_lock(b, num_pages, is_2m_pages);

	vmballoon_release_refused_pages(b, true);
	vmballoon_release_refused_pages(b, false);
}

/*
 * Decrease the size of the balloon, allowing the guest to use more memory.
 */
static void vmballoon_deflate(struct vmballoon *b)
{
	unsigned is_2m_pages;

	/* free pages to reach target */
	for (is_2m_pages = 0; is_2m_pages < b->supported_page_sizes;
			is_2m_pages++) {
		struct page *page, *next;
		unsigned int num_pages = 0;
		struct vmballoon_page_size *page_size =
				&b->page_sizes[is_2m_pages];

		list_for_each_entry_safe(page, next, &page_size->pages, lru) {
			if ((int64_t)(num_pages *
				      vmballoon_page_size(is_2m_pages)) >=
					-vmballoon_change(b))
				break;

			list_del(&page->lru);
			vmballoon_add_page(b, num_pages++, page);

			if (num_pages == b->batch_max_pages) {
				int error;

				error = vmballoon_unlock(b, num_pages,
						       is_2m_pages);
				num_pages = 0;
				if (error)
					return;
			}

			cond_resched();
		}

		if (num_pages > 0)
			vmballoon_unlock(b, num_pages, is_2m_pages);
	}
}

/**
 * vmballoon_deinit_batching - disables batching mode.
 *
 * @b: pointer to &struct vmballoon.
 *
 * Disables batching, by deallocating the page for communication with the
 * hypervisor and disabling the static key to indicate that batching is off.
 */
static void vmballoon_deinit_batching(struct vmballoon *b)
{
	free_page((unsigned long)b->batch_page);
	b->batch_page = NULL;
	static_branch_disable(&vmw_balloon_batching);
	b->batch_max_pages = 1;
}
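
/*
 * In non-batching mode a single page is communicated per hypervisor call
 * (batch_max_pages == 1) and its PFN travels in b->page rather than in the
 * batch page; see vmballoon_add_page() and vmballoon_lock_op() above.
 */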

/**
 * vmballoon_init_batching - enable batching mode.
 *
 * @b: pointer to &struct vmballoon.
 *
 * Enables batching, by allocating a page for communication with the
 * hypervisor and enabling the static_key to use batching.
 *
 * Return: zero on success or an appropriate error-code.
 */
static int vmballoon_init_batching(struct vmballoon *b)
{
	struct page *page;

	page = alloc_page(GFP_KERNEL | __GFP_ZERO);
	if (!page)
		return -ENOMEM;

	b->batch_page = page_address(page);
	b->batch_max_pages = PAGE_SIZE / sizeof(struct vmballoon_batch_entry);

	static_branch_enable(&vmw_balloon_batching);

	return 0;
}

/*
 * Receive notification and resize balloon
 */
static void vmballoon_doorbell(void *client_data)
{
	struct vmballoon *b = client_data;

	STATS_INC(b->stats.doorbell);

	mod_delayed_work(system_freezable_wq, &b->dwork, 0);
}

/*
 * Clean up vmci doorbell
 */
static void vmballoon_vmci_cleanup(struct vmballoon *b)
{
	vmballoon_cmd(b, VMW_BALLOON_CMD_VMCI_DOORBELL_SET,
		      VMCI_INVALID_ID, VMCI_INVALID_ID);

	if (!vmci_handle_is_invalid(b->vmci_doorbell)) {
		vmci_doorbell_destroy(b->vmci_doorbell);
		b->vmci_doorbell = VMCI_INVALID_HANDLE;
	}
}

/*
 * Initialize the vmci doorbell, to get notified as soon as the balloon
 * target changes.
 */
static int vmballoon_vmci_init(struct vmballoon *b)
{
	unsigned long error;

	if ((b->capabilities & VMW_BALLOON_SIGNALLED_WAKEUP_CMD) == 0)
		return 0;

	error = vmci_doorbell_create(&b->vmci_doorbell, VMCI_FLAG_DELAYED_CB,
				     VMCI_PRIVILEGE_FLAG_RESTRICTED,
				     vmballoon_doorbell, b);

	if (error != VMCI_SUCCESS)
		goto fail;

	error = __vmballoon_cmd(b, VMW_BALLOON_CMD_VMCI_DOORBELL_SET,
				b->vmci_doorbell.context,
				b->vmci_doorbell.resource, NULL);

	if (error != VMW_BALLOON_SUCCESS)
		goto fail;

	return 0;
fail:
	vmballoon_vmci_cleanup(b);
	return -EIO;
}

/*
 * Perform the standard reset sequence by popping the balloon (in case it
 * is not empty) and then restarting the protocol. This operation normally
 * happens when the host responds with VMW_BALLOON_ERROR_RESET to a command.
 */
static void vmballoon_reset(struct vmballoon *b)
{
	int error;

	vmballoon_vmci_cleanup(b);

	/* free all pages, skipping monitor unlock */
	vmballoon_pop(b);

	if (!vmballoon_send_start(b, VMW_BALLOON_CAPABILITIES))
		return;

	if ((b->capabilities & VMW_BALLOON_BATCHED_CMDS) != 0) {
		if (vmballoon_init_batching(b)) {
			/*
			 * We failed to initialize batching, inform the monitor
			 * about it by sending a null capability.
			 *
			 * The guest will retry in one second.
			 */
			vmballoon_send_start(b, 0);
			return;
		}
	} else if ((b->capabilities & VMW_BALLOON_BASIC_CMDS) != 0) {
		vmballoon_deinit_batching(b);
	}

	b->reset_required = false;

	error = vmballoon_vmci_init(b);
	if (error)
		pr_err("failed to initialize vmci doorbell\n");

	if (!vmballoon_send_guest_id(b))
		pr_err("failed to send guest ID to the host\n");
}

/**
 * vmballoon_work - periodic balloon worker for reset, inflation and deflation.
 *
 * @work: pointer to the &work_struct which is provided by the workqueue.
 *
 * Resets the protocol if needed, gets the new size and adjusts the balloon
 * as needed. Re-arms itself to run again in one second.
 */
static void vmballoon_work(struct work_struct *work)
{
	struct delayed_work *dwork = to_delayed_work(work);
	struct vmballoon *b = container_of(dwork, struct vmballoon, dwork);
	int64_t change = 0;

	STATS_INC(b->stats.timer);

	if (b->reset_required)
		vmballoon_reset(b);

	if (vmballoon_send_get_target(b))
		change = vmballoon_change(b);

	if (change != 0) {
		pr_debug("%s - size: %u, target %u\n", __func__,
			 b->size, b->target);

		if (change > 0)
			vmballoon_inflate(b);
		else  /* (change < 0) */
			vmballoon_deflate(b);
	}

	/*
	 * We are using a freezable workqueue so that balloon operations are
	 * stopped while the system transitions to/from sleep/hibernation.
	 */
	queue_delayed_work(system_freezable_wq,
			   dwork, round_jiffies_relative(HZ));
}

/*
 * DEBUGFS Interface
 */
#ifdef CONFIG_DEBUG_FS

static int vmballoon_debug_show(struct seq_file *f, void *offset)
{
	struct vmballoon *b = f->private;
	struct vmballoon_stats *stats = &b->stats;
	int i;

	/* format capabilities info */
	seq_printf(f,
		   "balloon capabilities:   %#4x\n"
		   "used capabilities:      %#4lx\n"
		   "is resetting:           %c\n",
		   VMW_BALLOON_CAPABILITIES, b->capabilities,
		   b->reset_required ? 'y' : 'n');

	/* format size info */
	seq_printf(f,
		   "target:             %8d pages\n"
		   "current:            %8d pages\n",
		   b->target, b->size);

	for (i = 0; i < VMW_BALLOON_CMD_NUM; i++) {
		if (vmballoon_cmd_names[i] == NULL)
			continue;

		seq_printf(f, "%-22s: %16lu (%lu failed)\n",
			   vmballoon_cmd_names[i], stats->ops[i],
			   stats->ops_fail[i]);
	}

	seq_printf(f,
		   "\n"
		   "timer:              %8u\n"
		   "doorbell:           %8u\n"
		   "prim2mAlloc:        %8u (%4u failed)\n"
		   "prim4kAlloc:        %8u (%4u failed)\n"
		   "prim2mFree:         %8u\n"
		   "primFree:           %8u\n"
		   "err2mAlloc:         %8u\n"
		   "errAlloc:           %8u\n"
		   "err2mFree:          %8u\n"
		   "errFree:            %8u\n",
		   stats->timer,
		   stats->doorbell,
		   stats->alloc[true], stats->alloc_fail[true],
		   stats->alloc[false], stats->alloc_fail[false],
		   stats->free[true],
		   stats->free[false],
		   stats->refused_alloc[true], stats->refused_alloc[false],
		   stats->refused_free[true], stats->refused_free[false]);

	return 0;
}

static int vmballoon_debug_open(struct inode *inode, struct file *file)
{
	return single_open(file, vmballoon_debug_show, inode->i_private);
}

static const struct file_operations vmballoon_debug_fops = {
	.owner		= THIS_MODULE,
	.open		= vmballoon_debug_open,
	.read		= seq_read,
	.llseek		= seq_lseek,
	.release	= single_release,
};

static int __init vmballoon_debugfs_init(struct vmballoon *b)
{
	int error;

	b->dbg_entry = debugfs_create_file("vmmemctl", S_IRUGO, NULL, b,
					   &vmballoon_debug_fops);
	if (IS_ERR(b->dbg_entry)) {
		error = PTR_ERR(b->dbg_entry);
		pr_err("failed to create debugfs entry, error: %d\n", error);
		return error;
	}

	return 0;
}

static void __exit vmballoon_debugfs_exit(struct vmballoon *b)
{
	debugfs_remove(b->dbg_entry);
}

#else

static inline int vmballoon_debugfs_init(struct vmballoon *b)
{
	return 0;
}

static inline void vmballoon_debugfs_exit(struct vmballoon *b)
{
}

#endif	/* CONFIG_DEBUG_FS */

static int __init vmballoon_init(void)
{
	int error;
	unsigned is_2m_pages;

	/*
	 * Check if we are running on VMware's hypervisor and bail out
	 * if we are not.
	 */
	if (x86_hyper_type != X86_HYPER_VMWARE)
		return -ENODEV;

	for (is_2m_pages = 0; is_2m_pages < VMW_BALLOON_NUM_PAGE_SIZES;
			is_2m_pages++) {
		INIT_LIST_HEAD(&balloon.page_sizes[is_2m_pages].pages);
		INIT_LIST_HEAD(&balloon.page_sizes[is_2m_pages].refused_pages);
	}

	INIT_DELAYED_WORK(&balloon.dwork, vmballoon_work);

	error = vmballoon_debugfs_init(&balloon);
	if (error)
		return error;

	balloon.vmci_doorbell = VMCI_INVALID_HANDLE;
	balloon.batch_page = NULL;
	balloon.page = NULL;
	balloon.reset_required = true;

	queue_delayed_work(system_freezable_wq, &balloon.dwork, 0);

	return 0;
}
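
/*
 * Note: reset_required is set above so that the first invocation of
 * vmballoon_work() goes through vmballoon_reset(), which performs the
 * START/GUEST_ID handshake and sets up batching and the VMCI doorbell.
 */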

/*
 * Using late_initcall() instead of module_init() allows the balloon to use
 * the VMCI doorbell even when the balloon is built into the kernel. Otherwise
 * the VMCI is probed only after the balloon is initialized. If the balloon is
 * used as a module, late_initcall() is equivalent to module_init().
 */
late_initcall(vmballoon_init);

static void __exit vmballoon_exit(void)
{
	vmballoon_vmci_cleanup(&balloon);
	cancel_delayed_work_sync(&balloon.dwork);

	vmballoon_debugfs_exit(&balloon);

	/*
	 * Deallocate all reserved memory, and reset the connection with the
	 * monitor. Reset the connection before deallocating memory to avoid
	 * the potential for additional spurious resets from the guest
	 * touching the deallocated pages.
	 */
	vmballoon_send_start(&balloon, 0);
	vmballoon_pop(&balloon);
}
module_exit(vmballoon_exit);
1147