1 /*
2 * Copyright (C) 2013 The Android Open Source Project
3 *
4 * Licensed under the Apache License, Version 2.0 (the "License");
5 * you may not use this file except in compliance with the License.
6 * You may obtain a copy of the License at
7 *
8 * http://www.apache.org/licenses/LICENSE-2.0
9 *
10 * Unless required by applicable law or agreed to in writing, software
11 * distributed under the License is distributed on an "AS IS" BASIS,
12 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
13 * See the License for the specific language governing permissions and
14 * limitations under the License.
15 */
16
17 #define LOG_TAG "lowmemorykiller"
18
19 #include <dirent.h>
20 #include <errno.h>
21 #include <inttypes.h>
22 #include <pwd.h>
23 #include <sched.h>
24 #include <signal.h>
25 #include <stdbool.h>
26 #include <stdlib.h>
27 #include <string.h>
28 #include <sys/cdefs.h>
29 #include <sys/epoll.h>
30 #include <sys/eventfd.h>
31 #include <sys/mman.h>
32 #include <sys/resource.h>
33 #include <sys/socket.h>
34 #include <sys/syscall.h>
35 #include <sys/sysinfo.h>
36 #include <sys/time.h>
37 #include <sys/types.h>
38 #include <time.h>
39 #include <unistd.h>
40
41 #include <cutils/properties.h>
42 #include <cutils/sched_policy.h>
43 #include <cutils/sockets.h>
44 #include <lmkd.h>
45 #include <log/log.h>
46 #include <log/log_event_list.h>
47 #include <log/log_time.h>
48 #include <psi/psi.h>
49 #include <system/thread_defs.h>
50
51 #include "statslog.h"
52
53 /*
54 * Define LMKD_TRACE_KILLS to record lmkd kills in kernel traces
55 * to profile and correlate with OOM kills
56 */
#ifdef LMKD_TRACE_KILLS

/* Tracing enabled: pull in the atrace helpers with the "always" tag. */
#define ATRACE_TAG ATRACE_TAG_ALWAYS
#include <cutils/trace.h>

/*
 * Both macros expand to ATRACE_INT keyed by the calling function's name:
 * the start macro passes the pid, the end macro passes 0.
 * NOTE: these expansions already end with ';'.
 */
#define TRACE_KILL_START(pid) ATRACE_INT(__FUNCTION__, pid);
#define TRACE_KILL_END() ATRACE_INT(__FUNCTION__, 0);

#else /* LMKD_TRACE_KILLS */

/* Tracing disabled: no-op expressions; pid is still evaluated and discarded. */
#define TRACE_KILL_START(pid) ((void)(pid))
#define TRACE_KILL_END() ((void)0)

#endif /* LMKD_TRACE_KILLS */
71
/* Provide __unused when the toolchain headers did not define it */
#ifndef __unused
#define __unused __attribute__((__unused__))
#endif

/* Locations of the memcg and /proc files referenced by this file */
#define MEMCG_SYSFS_PATH "/dev/memcg/"
#define MEMCG_MEMORY_USAGE "/dev/memcg/memory.usage_in_bytes"
#define MEMCG_MEMORYSW_USAGE "/dev/memcg/memory.memsw.usage_in_bytes"
#define ZONEINFO_PATH "/proc/zoneinfo"
#define MEMINFO_PATH "/proc/meminfo"
#define VMSTAT_PATH "/proc/vmstat"
/* Tag searched for in /proc/<pid>/status by proc_get_tgid() */
#define PROC_STATUS_TGID_FIELD "Tgid:"
/* Size of the small path/line buffers used in this file */
#define LINE_MAX 128

#define PERCEPTIBLE_APP_ADJ 200

/* Android Logger event logtags (see event.logtags) */
#define KILLINFO_LOG_TAG 10195355

/* gid containing AID_SYSTEM required */
#define INKERNEL_MINFREE_PATH "/sys/module/lowmemorykiller/parameters/minfree"
#define INKERNEL_ADJ_PATH "/sys/module/lowmemorykiller/parameters/adj"

/* Element count of a true array (not valid for pointers) */
#define ARRAY_SIZE(x) (sizeof(x) / sizeof(*(x)))
/* 8 MiB; unit multiplied by the memcg soft limit factor in cmd_procprio() */
#define EIGHT_MEGA (1 << 23)

/* Minimum spacing between accepted cmd_target() requests */
#define TARGET_UPDATE_MIN_INTERVAL_MS 1000

/* Derived from NS_PER_SEC/US_PER_SEC/MS_PER_SEC, which are defined outside this file */
#define NS_PER_MS (NS_PER_SEC / MS_PER_SEC)
#define US_PER_MS (US_PER_SEC / MS_PER_SEC)

/* Defined as ProcessList.SYSTEM_ADJ in ProcessList.java */
#define SYSTEM_ADJ (-900)

/* Two-level stringification so that macro arguments are expanded first */
#define STRINGIFY(x) STRINGIFY_INTERNAL(x)
#define STRINGIFY_INTERNAL(x) #x
107
/*
 * PSI monitor tracking window size.
 * PSI monitor generates events at most once per window,
 * therefore we poll memory state for the duration of
 * PSI_WINDOW_SIZE_MS after the event happens.
 */
#define PSI_WINDOW_SIZE_MS 1000
/* Polling period after PSI signal when pressure is high */
#define PSI_POLL_PERIOD_SHORT_MS 10
/* Polling period after PSI signal when pressure is low */
#define PSI_POLL_PERIOD_LONG_MS 100

/* NOTE: function-like macros; each argument may be evaluated twice */
#define min(a, b) (((a) < (b)) ? (a) : (b))
#define max(a, b) (((a) > (b)) ? (a) : (b))

/* NOTE(review): presumably the rate limit for failure reports; not used in this part of the file */
#define FAIL_REPORT_RLIMIT_MS 1000
124
/*
 * System property defaults
 *
 * NOTE(review): the *_LOWRAM variants are presumably selected for low-RAM
 * devices; the selection code is outside this part of the file.
 */
/* ro.lmk.swap_free_low_percentage property defaults */
#define DEF_LOW_SWAP_LOWRAM 10
#define DEF_LOW_SWAP 20
/* ro.lmk.thrashing_limit property defaults */
#define DEF_THRASHING_LOWRAM 30
#define DEF_THRASHING 100
/* ro.lmk.thrashing_limit_decay property defaults */
#define DEF_THRASHING_DECAY_LOWRAM 50
#define DEF_THRASHING_DECAY 10
/* ro.lmk.psi_partial_stall_ms property defaults */
#define DEF_PARTIAL_STALL_LOWRAM 200
#define DEF_PARTIAL_STALL 70
/* ro.lmk.psi_complete_stall_ms property defaults */
#define DEF_COMPLETE_STALL 700
142
/*
 * Thin wrapper issuing the pidfd_open system call through syscall(2).
 * Returns the raw syscall result narrowed to int.
 */
static inline int sys_pidfd_open(pid_t pid, unsigned int flags) {
    long rc = syscall(__NR_pidfd_open, pid, flags);

    return (int)rc;
}
146
/*
 * Thin wrapper issuing the pidfd_send_signal system call through syscall(2).
 * Returns the raw syscall result narrowed to int.
 */
static inline int sys_pidfd_send_signal(int pidfd, int sig, siginfo_t *info,
                                        unsigned int flags) {
    long rc = syscall(__NR_pidfd_send_signal, pidfd, sig, info, flags);

    return (int)rc;
}
151
/*
 * default to old in-kernel interface if no memory pressure events
 *
 * While this is true the cmd_proc* handlers only maintain task-name stats
 * and do not touch the userspace process table.
 */
static bool use_inkernel_interface = true;
/* When true, cmd_target() also writes the levels to the in-kernel module parameters */
static bool has_inkernel_module;
155
/* memory pressure levels */
enum vmpressure_level {
    VMPRESS_LEVEL_LOW = 0,
    VMPRESS_LEVEL_MEDIUM,
    VMPRESS_LEVEL_CRITICAL,
    VMPRESS_LEVEL_COUNT /* number of levels; used to size per-level arrays */
};

/* Level names, listed in the same order as enum vmpressure_level */
static const char *level_name[] = {
    "low",
    "medium",
    "critical"
};
169
/*
 * Free-page bounds; both members start at -1.
 * NOTE(review): presumably refreshed on low-pressure events; the code that
 * updates them is outside this part of the file.
 */
struct {
    int64_t min_nr_free_pages; /* recorded but not used yet */
    int64_t max_nr_free_pages;
} low_pressure_mem = { -1, -1 };
174
/* One PSI trigger description: the stall type and its threshold in milliseconds */
struct psi_threshold {
    enum psi_stall_type stall_type;
    int threshold_ms;
};
179
/* Per-pressure-level state, indexed by enum vmpressure_level */
static int level_oomadj[VMPRESS_LEVEL_COUNT];
static int mpevfd[VMPRESS_LEVEL_COUNT] = { -1, -1, -1 };
/* When true, cmd_procprio() opens a pidfd for each newly registered process */
static bool pidfd_supported;
/*
 * pid_remove() leaves a pidfd equal to this value open so that it can be
 * closed later by the code waiting for the kill to complete.
 */
static int last_kill_pid_or_fd = -1;
static struct timespec last_kill_tm;

/* lmkd configurable parameters */
static bool debug_process_killing;
static bool enable_pressure_upgrade;
static int64_t upgrade_pressure;
static int64_t downgrade_pressure;
static bool low_ram_device;
static bool kill_heaviest_task;
static unsigned long kill_timeout_ms;
static bool use_minfree_levels;
/* When true, cmd_procprio() also writes a per-process memcg soft limit */
static bool per_app_memcg;
static int swap_free_low_percentage;
static int psi_partial_stall_ms;
static int psi_complete_stall_ms;
static int thrashing_limit_pct;
static int thrashing_limit_decay_pct;
static bool use_psi_monitors = false;
static struct kernel_poll_info kpoll_info;
/* One PSI threshold per pressure level (array is sized by VMPRESS_LEVEL_COUNT) */
static struct psi_threshold psi_thresholds[VMPRESS_LEVEL_COUNT] = {
    { PSI_SOME, 70 },    /* 70ms out of 1sec for partial stall */
    { PSI_SOME, 100 },   /* 100ms out of 1sec for partial stall */
    { PSI_FULL, 70 },    /* 70ms out of 1sec for complete stall */
};

/* Android event-log context; NOTE(review): its users are outside this part of the file */
static android_log_context ctx;
210
/*
 * Requested change to the polling state, stored in polling_params.update.
 * NOTE(review): the code acting on these values is outside this part of the file.
 */
enum polling_update {
    POLLING_DO_NOT_CHANGE,
    POLLING_START,
    POLLING_STOP,
    POLLING_PAUSE,
    POLLING_RESUME,
};
218
/*
 * Data used for periodic polling for the memory state of the device.
 * Note that when system is not polling poll_handler is set to NULL,
 * when polling starts poll_handler gets set and is reset back to
 * NULL when polling stops.
 *
 * NOTE(review): the remaining field roles are inferred from their names;
 * the code that maintains them is outside this part of the file.
 */
struct polling_params {
    struct event_handler_info* poll_handler;
    struct event_handler_info* paused_handler;
    struct timespec poll_start_tm;
    struct timespec last_poll_tm;
    int polling_interval_ms;
    enum polling_update update;
};
233
/*
 * data required to handle events
 *
 * handler receives an int, the event mask and the polling parameters;
 * presumably the int is the 'data' member stored here (the dispatch code is
 * outside this part of the file).
 */
struct event_handler_info {
    int data;
    void (*handler)(int data, uint32_t events, struct polling_params *poll_params);
};

/* data required to handle socket events */
struct sock_event_handler_info {
    int sock; /* socket fd; ctrl_data_close() resets it to -1 */
    struct event_handler_info handler_info;
};
245
/* max supported number of data connections */
#define MAX_DATA_CONN 2

/* socket event handler data */
static struct sock_event_handler_info ctrl_sock;
static struct sock_event_handler_info data_sock[MAX_DATA_CONN];

/* vmpressure event handler data */
static struct event_handler_info vmpressure_hinfo[VMPRESS_LEVEL_COUNT];

/*
 * 1 ctrl listen socket, 2 ctrl data socket, 3 memory pressure levels,
 * 1 lmk events + 1 fd to wait for process death
 */
#define MAX_EPOLL_EVENTS (1 + MAX_DATA_CONN + VMPRESS_LEVEL_COUNT + 1 + 1)
static int epollfd;
/* Decremented by ctrl_data_close() when a data connection is dropped */
static int maxevents;

/* OOM score values used by both kernel and framework */
#define OOM_SCORE_ADJ_MIN (-1000)
#define OOM_SCORE_ADJ_MAX 1000

/* minfree/oom_adj target pairs stored by cmd_target(); the first lowmem_targets_size entries are valid */
static int lowmem_adj[MAX_TARGETS];
static int lowmem_minfree[MAX_TARGETS];
static int lowmem_targets_size;
271
/* Fields to parse in /proc/zoneinfo */
/* zoneinfo per-zone fields */
enum zoneinfo_zone_field {
    ZI_ZONE_NR_FREE_PAGES = 0,
    ZI_ZONE_MIN,
    ZI_ZONE_LOW,
    ZI_ZONE_HIGH,
    ZI_ZONE_PRESENT,
    ZI_ZONE_NR_FREE_CMA,
    ZI_ZONE_FIELD_COUNT
};

/* Field names, listed in the same order as enum zoneinfo_zone_field */
static const char* const zoneinfo_zone_field_names[ZI_ZONE_FIELD_COUNT] = {
    "nr_free_pages",
    "min",
    "low",
    "high",
    "present",
    "nr_free_cma",
};

/* zoneinfo per-zone special fields */
enum zoneinfo_zone_spec_field {
    ZI_ZONE_SPEC_PROTECTION = 0,
    ZI_ZONE_SPEC_PAGESETS,
    ZI_ZONE_SPEC_FIELD_COUNT,
};

/* Field names, listed in the same order as enum zoneinfo_zone_spec_field */
static const char* const zoneinfo_zone_spec_field_names[ZI_ZONE_SPEC_FIELD_COUNT] = {
    "protection:",
    "pagesets",
};
304
/* see __MAX_NR_ZONES definition in kernel mmzone.h */
#define MAX_NR_ZONES 6

/*
 * Per-zone values; 'arr' overlays 'field' so that a value can be stored by
 * its enum zoneinfo_zone_field index and read back by name.
 */
union zoneinfo_zone_fields {
    struct {
        int64_t nr_free_pages;
        int64_t min;
        int64_t low;
        int64_t high;
        int64_t present;
        int64_t nr_free_cma;
    } field;
    int64_t arr[ZI_ZONE_FIELD_COUNT];
};

/* Data kept for a single memory zone */
struct zoneinfo_zone {
    union zoneinfo_zone_fields fields;
    int64_t protection[MAX_NR_ZONES];
    /* NOTE(review): presumably the largest entry of protection[]; computed outside this part of the file */
    int64_t max_protection;
};
325
/* zoneinfo per-node fields */
enum zoneinfo_node_field {
    ZI_NODE_NR_INACTIVE_FILE = 0,
    ZI_NODE_NR_ACTIVE_FILE,
    ZI_NODE_WORKINGSET_REFAULT,
    ZI_NODE_FIELD_COUNT
};

/* Field names, listed in the same order as enum zoneinfo_node_field */
static const char* const zoneinfo_node_field_names[ZI_NODE_FIELD_COUNT] = {
    "nr_inactive_file",
    "nr_active_file",
    "workingset_refault",
};

/* Per-node values; 'arr' overlays 'field' for access by enum zoneinfo_node_field index */
union zoneinfo_node_fields {
    struct {
        int64_t nr_inactive_file;
        int64_t nr_active_file;
        int64_t workingset_refault;
    } field;
    int64_t arr[ZI_NODE_FIELD_COUNT];
};

/* Data kept for a single memory node and its zones */
struct zoneinfo_node {
    int id;
    int zone_count; /* presumably the number of valid entries in zones[] */
    struct zoneinfo_zone zones[MAX_NR_ZONES];
    union zoneinfo_node_fields fields;
};
355
/* for now two memory nodes is more than enough */
#define MAX_NR_NODES 2

/*
 * Parsed /proc/zoneinfo contents plus totals.
 * NOTE(review): the total_* members are presumably sums over all nodes;
 * the parser filling this structure is outside this part of the file.
 */
struct zoneinfo {
    int node_count;
    struct zoneinfo_node nodes[MAX_NR_NODES];
    int64_t totalreserve_pages;
    int64_t total_inactive_file;
    int64_t total_active_file;
    int64_t total_workingset_refault;
};
367
/* Fields to parse in /proc/meminfo */
enum meminfo_field {
    MI_NR_FREE_PAGES = 0,
    MI_CACHED,
    MI_SWAP_CACHED,
    MI_BUFFERS,
    MI_SHMEM,
    MI_UNEVICTABLE,
    MI_TOTAL_SWAP,
    MI_FREE_SWAP,
    MI_ACTIVE_ANON,
    MI_INACTIVE_ANON,
    MI_ACTIVE_FILE,
    MI_INACTIVE_FILE,
    MI_SRECLAIMABLE,
    MI_SUNRECLAIM,
    MI_KERNEL_STACK,
    MI_PAGE_TABLES,
    MI_ION_HELP,
    MI_ION_HELP_POOL,
    MI_CMA_FREE,
    MI_FIELD_COUNT
};

/* /proc/meminfo tags, listed in the same order as enum meminfo_field */
static const char* const meminfo_field_names[MI_FIELD_COUNT] = {
    "MemFree:",
    "Cached:",
    "SwapCached:",
    "Buffers:",
    "Shmem:",
    "Unevictable:",
    "SwapTotal:",
    "SwapFree:",
    "Active(anon):",
    "Inactive(anon):",
    "Active(file):",
    "Inactive(file):",
    "SReclaimable:",
    "SUnreclaim:",
    "KernelStack:",
    "PageTables:",
    "ION_heap:",
    "ION_heap_pool:",
    "CmaFree:",
};
413
/*
 * Parsed /proc/meminfo values. 'arr' overlays the first MI_FIELD_COUNT
 * members of 'field' for access by enum meminfo_field index; the calculated
 * member nr_file_pages lies beyond 'arr' and is reachable only by name.
 */
union meminfo {
    struct {
        int64_t nr_free_pages;
        int64_t cached;
        int64_t swap_cached;
        int64_t buffers;
        int64_t shmem;
        int64_t unevictable;
        int64_t total_swap;
        int64_t free_swap;
        int64_t active_anon;
        int64_t inactive_anon;
        int64_t active_file;
        int64_t inactive_file;
        int64_t sreclaimable;
        int64_t sunreclaimable;
        int64_t kernel_stack;
        int64_t page_tables;
        int64_t ion_heap;
        int64_t ion_heap_pool;
        int64_t cma_free;
        /* fields below are calculated rather than read from the file */
        int64_t nr_file_pages;
    } field;
    int64_t arr[MI_FIELD_COUNT];
};
440
/* Fields to parse in /proc/vmstat */
enum vmstat_field {
    VS_FREE_PAGES,
    VS_INACTIVE_FILE,
    VS_ACTIVE_FILE,
    VS_WORKINGSET_REFAULT,
    VS_PGSCAN_KSWAPD,
    VS_PGSCAN_DIRECT,
    VS_PGSCAN_DIRECT_THROTTLE,
    VS_FIELD_COUNT
};

/*
 * /proc/vmstat field names, listed in the same order as enum vmstat_field.
 * The table is sized by VS_FIELD_COUNT (its own enum, not the meminfo one)
 * so that every slot holds a valid string and none is left NULL.
 */
static const char* const vmstat_field_names[VS_FIELD_COUNT] = {
    "nr_free_pages",
    "nr_inactive_file",
    "nr_active_file",
    "workingset_refault",
    "pgscan_kswapd",
    "pgscan_direct",
    "pgscan_direct_throttle",
};
462
/* Parsed /proc/vmstat values; 'arr' overlays 'field' for access by enum vmstat_field index */
union vmstat {
    struct {
        int64_t nr_free_pages;
        int64_t nr_inactive_file;
        int64_t nr_active_file;
        int64_t workingset_refault;
        int64_t pgscan_kswapd;
        int64_t pgscan_direct;
        int64_t pgscan_direct_throttle;
    } field;
    int64_t arr[VS_FIELD_COUNT];
};
475
/* Result of match_field() */
enum field_match_result {
    NO_MATCH,      /* the name is not one of the known fields */
    PARSE_FAIL,    /* the name matched but its value could not be parsed */
    PARSE_SUCCESS  /* the name matched and its value was parsed */
};
481
/* Link of a circular doubly-linked list; an empty list head points at itself */
struct adjslot_list {
    struct adjslot_list *next;
    struct adjslot_list *prev;
};

/* Record of a process registered through cmd_procprio() */
struct proc {
    struct adjslot_list asl;    /* link in procadjslot_list[ADJTOSLOT(oomadj)] */
    int pid;
    int pidfd;                  /* -1 when no pidfd was opened for the process */
    uid_t uid;
    int oomadj;
    struct proc *pidhash_next;  /* next record in the same pidhash bucket */
};

/* File handle reused across reads by reread_file(); fd must be -1 before the first read */
struct reread_data {
    const char* const filename;
    int fd;
};
500
/* Hash table of registered processes keyed by pid; buckets are chained via pidhash_next */
#define PIDHASH_SZ 1024
static struct proc *pidhash[PIDHASH_SZ];
#define pid_hashfn(x) ((((x) >> 8) ^ (x)) & (PIDHASH_SZ - 1))

/* Maps an oom_score_adj value to a zero-based slot (OOM_SCORE_ADJ_MIN maps to slot 0) */
#define ADJTOSLOT(adj) ((adj) + -OOM_SCORE_ADJ_MIN)
#define ADJTOSLOT_COUNT (ADJTOSLOT(OOM_SCORE_ADJ_MAX) + 1)
/* One list head per oom_score_adj value; the heads are (re)initialized in cmd_procpurge() */
static struct adjslot_list procadjslot_list[ADJTOSLOT_COUNT];

#define MAX_DISTINCT_OOM_ADJ 32
#define KILLCNT_INVALID_IDX 0xFF
/*
 * Because killcnt array is sparse a two-level indirection is used
 * to keep the size small. killcnt_idx stores index of the element in
 * killcnt array. Index KILLCNT_INVALID_IDX indicates an unused slot.
 */
static uint8_t killcnt_idx[ADJTOSLOT_COUNT];
static uint16_t killcnt[MAX_DISTINCT_OOM_ADJ];
/* Next unused element of killcnt[] */
static int killcnt_free_idx = 0;
/* Incremented on every inc_killcnt() call */
static uint32_t killcnt_total = 0;

/* PAGE_SIZE / 1024 */
static long page_k;
523
/*
 * Limit value to the range [low, high]. The upper bound is applied first,
 * so low wins when the bounds are inverted.
 */
static int clamp(int low, int high, int value) {
    int capped = min(value, high);

    return max(capped, low);
}
527
/*
 * Parse a base-10 integer from the beginning of str.
 *
 * Leading whitespace and trailing non-digit text are accepted (strtoll
 * semantics), so input like "123 kB" parses as 123.
 *
 * Returns true and stores the value in *ret on success. Returns false and
 * leaves *ret untouched when no digits are found or when the value does not
 * fit in int64_t. The caller's errno is preserved.
 */
static bool parse_int64(const char* str, int64_t* ret) {
    char* endptr;
    int saved_errno = errno;
    long long val;
    bool out_of_range;

    /* strtoll reports overflow only through errno, so clear it first */
    errno = 0;
    val = strtoll(str, &endptr, 10);
    /* the explicit bounds matter only where long long is wider than 64 bits */
    out_of_range = (errno == ERANGE) || val > INT64_MAX || val < INT64_MIN;
    errno = saved_errno;

    if (str == endptr || out_of_range) {
        return false;
    }
    *ret = (int64_t)val;
    return true;
}
537
/*
 * Look up name among the first field_count entries of field_names.
 * Returns the index of the first exact match, or -1 if there is none.
 */
static int find_field(const char* name, const char* const field_names[], int field_count) {
    int idx = 0;

    while (idx < field_count) {
        if (strcmp(name, field_names[idx]) == 0) {
            return idx;
        }
        idx++;
    }
    return -1;
}
546
match_field(const char * cp,const char * ap,const char * const field_names[],int field_count,int64_t * field,int * field_idx)547 static enum field_match_result match_field(const char* cp, const char* ap,
548 const char* const field_names[],
549 int field_count, int64_t* field,
550 int *field_idx) {
551 int i = find_field(cp, field_names, field_count);
552 if (i < 0) {
553 return NO_MATCH;
554 }
555 *field_idx = i;
556 return parse_int64(ap, field) ? PARSE_SUCCESS : PARSE_FAIL;
557 }
558
/*
 * Read the file behind fd starting at offset 0 until either max_len bytes
 * have been stored in buf or EOF is reached. Uses pread, so the descriptor's
 * own file position is not consulted.
 * Returns the number of bytes read, or -1 if a read fails.
 */
static ssize_t read_all(int fd, char *buf, size_t max_len)
{
    size_t remaining = max_len;
    ssize_t total = 0;

    while (remaining > 0) {
        ssize_t chunk = TEMP_FAILURE_RETRY(pread(fd, buf + total, remaining, (off_t)total));

        if (chunk == -1) {
            return -1;
        }
        if (chunk == 0) {
            /* EOF */
            break;
        }
        total += chunk;
        remaining -= chunk;
    }

    return total;
}
584
585 /*
586 * Read a new or already opened file from the beginning.
587 * If the file has not been opened yet data->fd should be set to -1.
588 * To be used with files which are read often and possibly during high
589 * memory pressure to minimize file opening which by itself requires kernel
590 * memory allocation and might result in a stall on memory stressed system.
591 */
reread_file(struct reread_data * data)592 static char *reread_file(struct reread_data *data) {
593 /* start with page-size buffer and increase if needed */
594 static ssize_t buf_size = PAGE_SIZE;
595 static char *new_buf, *buf = NULL;
596 ssize_t size;
597
598 if (data->fd == -1) {
599 /* First-time buffer initialization */
600 if (!buf && (buf = malloc(buf_size)) == NULL) {
601 return NULL;
602 }
603
604 data->fd = TEMP_FAILURE_RETRY(open(data->filename, O_RDONLY | O_CLOEXEC));
605 if (data->fd < 0) {
606 ALOGE("%s open: %s", data->filename, strerror(errno));
607 return NULL;
608 }
609 }
610
611 while (true) {
612 size = read_all(data->fd, buf, buf_size - 1);
613 if (size < 0) {
614 ALOGE("%s read: %s", data->filename, strerror(errno));
615 close(data->fd);
616 data->fd = -1;
617 return NULL;
618 }
619 if (size < buf_size - 1) {
620 break;
621 }
622 /*
623 * Since we are reading /proc files we can't use fstat to find out
624 * the real size of the file. Double the buffer size and keep retrying.
625 */
626 if ((new_buf = realloc(buf, buf_size * 2)) == NULL) {
627 errno = ENOMEM;
628 return NULL;
629 }
630 buf = new_buf;
631 buf_size *= 2;
632 }
633 buf[size] = 0;
634
635 return buf;
636 }
637
pid_lookup(int pid)638 static struct proc *pid_lookup(int pid) {
639 struct proc *procp;
640
641 for (procp = pidhash[pid_hashfn(pid)]; procp && procp->pid != pid;
642 procp = procp->pidhash_next)
643 ;
644
645 return procp;
646 }
647
adjslot_insert(struct adjslot_list * head,struct adjslot_list * new)648 static void adjslot_insert(struct adjslot_list *head, struct adjslot_list *new)
649 {
650 struct adjslot_list *next = head->next;
651 new->prev = head;
652 new->next = next;
653 next->prev = new;
654 head->next = new;
655 }
656
adjslot_remove(struct adjslot_list * old)657 static void adjslot_remove(struct adjslot_list *old)
658 {
659 struct adjslot_list *prev = old->prev;
660 struct adjslot_list *next = old->next;
661 next->prev = prev;
662 prev->next = next;
663 }
664
adjslot_tail(struct adjslot_list * head)665 static struct adjslot_list *adjslot_tail(struct adjslot_list *head) {
666 struct adjslot_list *asl = head->prev;
667
668 return asl == head ? NULL : asl;
669 }
670
proc_slot(struct proc * procp)671 static void proc_slot(struct proc *procp) {
672 int adjslot = ADJTOSLOT(procp->oomadj);
673
674 adjslot_insert(&procadjslot_list[adjslot], &procp->asl);
675 }
676
proc_unslot(struct proc * procp)677 static void proc_unslot(struct proc *procp) {
678 adjslot_remove(&procp->asl);
679 }
680
proc_insert(struct proc * procp)681 static void proc_insert(struct proc *procp) {
682 int hval = pid_hashfn(procp->pid);
683
684 procp->pidhash_next = pidhash[hval];
685 pidhash[hval] = procp;
686 proc_slot(procp);
687 }
688
pid_remove(int pid)689 static int pid_remove(int pid) {
690 int hval = pid_hashfn(pid);
691 struct proc *procp;
692 struct proc *prevp;
693
694 for (procp = pidhash[hval], prevp = NULL; procp && procp->pid != pid;
695 procp = procp->pidhash_next)
696 prevp = procp;
697
698 if (!procp)
699 return -1;
700
701 if (!prevp)
702 pidhash[hval] = procp->pidhash_next;
703 else
704 prevp->pidhash_next = procp->pidhash_next;
705
706 proc_unslot(procp);
707 /*
708 * Close pidfd here if we are not waiting for corresponding process to die,
709 * in which case stop_wait_for_proc_kill() will close the pidfd later
710 */
711 if (procp->pidfd >= 0 && procp->pidfd != last_kill_pid_or_fd) {
712 close(procp->pidfd);
713 }
714 free(procp);
715 return 0;
716 }
717
718 /*
719 * Write a string to a file.
720 * Returns false if the file does not exist.
721 */
writefilestring(const char * path,const char * s,bool err_if_missing)722 static bool writefilestring(const char *path, const char *s,
723 bool err_if_missing) {
724 int fd = open(path, O_WRONLY | O_CLOEXEC);
725 ssize_t len = strlen(s);
726 ssize_t ret;
727
728 if (fd < 0) {
729 if (err_if_missing) {
730 ALOGE("Error opening %s; errno=%d", path, errno);
731 }
732 return false;
733 }
734
735 ret = TEMP_FAILURE_RETRY(write(fd, s, len));
736 if (ret < 0) {
737 ALOGE("Error writing %s; errno=%d", path, errno);
738 } else if (ret < len) {
739 ALOGE("Short write on %s; length=%zd", path, ret);
740 }
741
742 close(fd);
743 return true;
744 }
745
get_time_diff_ms(struct timespec * from,struct timespec * to)746 static inline long get_time_diff_ms(struct timespec *from,
747 struct timespec *to) {
748 return (to->tv_sec - from->tv_sec) * (long)MS_PER_SEC +
749 (to->tv_nsec - from->tv_nsec) / (long)NS_PER_MS;
750 }
751
proc_get_tgid(int pid)752 static int proc_get_tgid(int pid) {
753 char path[PATH_MAX];
754 char buf[PAGE_SIZE];
755 int fd;
756 ssize_t size;
757 char *pos;
758 int64_t tgid = -1;
759
760 snprintf(path, PATH_MAX, "/proc/%d/status", pid);
761 fd = open(path, O_RDONLY | O_CLOEXEC);
762 if (fd < 0) {
763 return -1;
764 }
765
766 size = read_all(fd, buf, sizeof(buf) - 1);
767 if (size < 0) {
768 goto out;
769 }
770 buf[size] = 0;
771
772 pos = buf;
773 while (true) {
774 pos = strstr(pos, PROC_STATUS_TGID_FIELD);
775 /* Stop if TGID tag not found or found at the line beginning */
776 if (pos == NULL || pos == buf || pos[-1] == '\n') {
777 break;
778 }
779 pos++;
780 }
781
782 if (pos == NULL) {
783 goto out;
784 }
785
786 pos += strlen(PROC_STATUS_TGID_FIELD);
787 while (*pos == ' ') pos++;
788 parse_int64(pos, &tgid);
789
790 out:
791 close(fd);
792 return (int)tgid;
793 }
794
proc_get_size(int pid)795 static int proc_get_size(int pid) {
796 char path[PATH_MAX];
797 char line[LINE_MAX];
798 int fd;
799 int rss = 0;
800 int total;
801 ssize_t ret;
802
803 /* gid containing AID_READPROC required */
804 snprintf(path, PATH_MAX, "/proc/%d/statm", pid);
805 fd = open(path, O_RDONLY | O_CLOEXEC);
806 if (fd == -1)
807 return -1;
808
809 ret = read_all(fd, line, sizeof(line) - 1);
810 if (ret < 0) {
811 close(fd);
812 return -1;
813 }
814 line[ret] = '\0';
815
816 sscanf(line, "%d %d ", &total, &rss);
817 close(fd);
818 return rss;
819 }
820
proc_get_name(int pid,char * buf,size_t buf_size)821 static char *proc_get_name(int pid, char *buf, size_t buf_size) {
822 char path[PATH_MAX];
823 int fd;
824 char *cp;
825 ssize_t ret;
826
827 /* gid containing AID_READPROC required */
828 snprintf(path, PATH_MAX, "/proc/%d/cmdline", pid);
829 fd = open(path, O_RDONLY | O_CLOEXEC);
830 if (fd == -1) {
831 return NULL;
832 }
833 ret = read_all(fd, buf, buf_size - 1);
834 close(fd);
835 if (ret < 0) {
836 return NULL;
837 }
838 buf[ret] = '\0';
839
840 cp = strchr(buf, ' ');
841 if (cp) {
842 *cp = '\0';
843 }
844
845 return buf;
846 }
847
/*
 * Handle the LMK_PROCPRIO command: set the oom_score_adj of a process and
 * register it (or update its registration) in the userspace process table.
 *
 * Steps:
 *  - validate the requested oomadj and reject tasks that are not thread
 *    group leaders;
 *  - write the value to /proc/<pid>/oom_score_adj (a failure to open the
 *    file is taken to mean the process is gone);
 *  - with the in-kernel interface, only record the task name and return;
 *  - with per-app memcgs, derive and write the memcg soft limit;
 *  - create a new record, or move the existing one to the list matching
 *    the new oomadj.
 */
static void cmd_procprio(LMKD_CTRL_PACKET packet) {
    struct proc *procp;
    char path[LINE_MAX];
    char val[20];
    int soft_limit_mult;
    struct lmk_procprio params;
    bool is_system_server;
    struct passwd *pwdrec;
    int tgid;

    lmkd_pack_get_procprio(packet, &params);

    if (params.oomadj < OOM_SCORE_ADJ_MIN ||
        params.oomadj > OOM_SCORE_ADJ_MAX) {
        ALOGE("Invalid PROCPRIO oomadj argument %d", params.oomadj);
        return;
    }

    /* Check if registered process is a thread group leader */
    tgid = proc_get_tgid(params.pid);
    if (tgid >= 0 && tgid != params.pid) {
        ALOGE("Attempt to register a task that is not a thread group leader (tid %d, tgid %d)",
            params.pid, tgid);
        return;
    }

    /* gid containing AID_READPROC required */
    /* CAP_SYS_RESOURCE required */
    /* CAP_DAC_OVERRIDE required */
    snprintf(path, sizeof(path), "/proc/%d/oom_score_adj", params.pid);
    snprintf(val, sizeof(val), "%d", params.oomadj);
    if (!writefilestring(path, val, false)) {
        ALOGW("Failed to open %s; errno=%d: process %d might have been killed",
              path, errno, params.pid);
        /* If this file does not exist the process is dead. */
        return;
    }

    if (use_inkernel_interface) {
        /* path is reused as scratch space for the task name */
        stats_store_taskname(params.pid, proc_get_name(params.pid, path, sizeof(path)),
                             kpoll_info.poll_fd);
        return;
    }

    if (per_app_memcg) {
        if (params.oomadj >= 900) {
            soft_limit_mult = 0;
        } else if (params.oomadj >= 800) {
            soft_limit_mult = 0;
        } else if (params.oomadj >= 700) {
            soft_limit_mult = 0;
        } else if (params.oomadj >= 600) {
            // Launcher should be perceptible, don't kill it.
            params.oomadj = PERCEPTIBLE_APP_ADJ;
            soft_limit_mult = 1;
        } else if (params.oomadj >= 500) {
            soft_limit_mult = 0;
        } else if (params.oomadj >= 400) {
            soft_limit_mult = 0;
        } else if (params.oomadj >= 300) {
            soft_limit_mult = 1;
        } else if (params.oomadj >= 200) {
            soft_limit_mult = 8;
        } else if (params.oomadj >= 100) {
            soft_limit_mult = 10;
        } else if (params.oomadj >= 0) {
            soft_limit_mult = 20;
        } else {
            // Persistent processes will have a large
            // soft limit 512MB.
            soft_limit_mult = 64;
        }

        snprintf(path, sizeof(path), MEMCG_SYSFS_PATH
                 "apps/uid_%d/pid_%d/memory.soft_limit_in_bytes",
                 params.uid, params.pid);
        snprintf(val, sizeof(val), "%d", soft_limit_mult * EIGHT_MEGA);

        /*
         * system_server process has no memcg under /dev/memcg/apps but should be
         * registered with lmkd. This is the best way so far to identify it.
         */
        is_system_server = (params.oomadj == SYSTEM_ADJ &&
                            (pwdrec = getpwnam("system")) != NULL &&
                            params.uid == pwdrec->pw_uid);
        writefilestring(path, val, !is_system_server);
    }

    procp = pid_lookup(params.pid);
    if (!procp) {
        int pidfd = -1;

        if (pidfd_supported) {
            pidfd = TEMP_FAILURE_RETRY(sys_pidfd_open(params.pid, 0));
            if (pidfd < 0) {
                ALOGE("pidfd_open for pid %d failed; errno=%d", params.pid, errno);
                return;
            }
        }

        procp = calloc(1, sizeof(struct proc));
        if (!procp) {
            // Oh, the irony. May need to rebuild our state.
            /* no record will own the pidfd, so release it here */
            if (pidfd >= 0) {
                close(pidfd);
            }
            return;
        }

        procp->pid = params.pid;
        procp->pidfd = pidfd;
        procp->uid = params.uid;
        procp->oomadj = params.oomadj;
        proc_insert(procp);
    } else {
        /* already registered: move the record to the list of its new oomadj */
        proc_unslot(procp);
        procp->oomadj = params.oomadj;
        proc_slot(procp);
    }
}
965
/*
 * Handle the LMK_PROCREMOVE command: forget the process named in the packet.
 * With the in-kernel interface only the stored task name is dropped;
 * otherwise the record is removed from the userspace process table.
 */
static void cmd_procremove(LMKD_CTRL_PACKET packet) {
    struct lmk_procremove params;

    lmkd_pack_get_procremove(packet, &params);

    if (!use_inkernel_interface) {
        /*
         * WARNING: pid_remove() frees the record, so nothing may use it
         * afterwards. Keep this as the last action.
         */
        pid_remove(params.pid);
        return;
    }

    stats_remove_taskname(params.pid, kpoll_info.poll_fd);
}
981
cmd_procpurge()982 static void cmd_procpurge() {
983 int i;
984 struct proc *procp;
985 struct proc *next;
986
987 if (use_inkernel_interface) {
988 stats_purge_tasknames();
989 return;
990 }
991
992 for (i = 0; i <= ADJTOSLOT(OOM_SCORE_ADJ_MAX); i++) {
993 procadjslot_list[i].next = &procadjslot_list[i];
994 procadjslot_list[i].prev = &procadjslot_list[i];
995 }
996
997 for (i = 0; i < PIDHASH_SZ; i++) {
998 procp = pidhash[i];
999 while (procp) {
1000 next = procp->pidhash_next;
1001 free(procp);
1002 procp = next;
1003 }
1004 }
1005 memset(&pidhash[0], 0, sizeof(pidhash));
1006 }
1007
inc_killcnt(int oomadj)1008 static void inc_killcnt(int oomadj) {
1009 int slot = ADJTOSLOT(oomadj);
1010 uint8_t idx = killcnt_idx[slot];
1011
1012 if (idx == KILLCNT_INVALID_IDX) {
1013 /* index is not assigned for this oomadj */
1014 if (killcnt_free_idx < MAX_DISTINCT_OOM_ADJ) {
1015 killcnt_idx[slot] = killcnt_free_idx;
1016 killcnt[killcnt_free_idx] = 1;
1017 killcnt_free_idx++;
1018 } else {
1019 ALOGW("Number of distinct oomadj levels exceeds %d",
1020 MAX_DISTINCT_OOM_ADJ);
1021 }
1022 } else {
1023 /*
1024 * wraparound is highly unlikely and is detectable using total
1025 * counter because it has to be equal to the sum of all counters
1026 */
1027 killcnt[idx]++;
1028 }
1029 /* increment total kill counter */
1030 killcnt_total++;
1031 }
1032
get_killcnt(int min_oomadj,int max_oomadj)1033 static int get_killcnt(int min_oomadj, int max_oomadj) {
1034 int slot;
1035 int count = 0;
1036
1037 if (min_oomadj > max_oomadj)
1038 return 0;
1039
1040 /* special case to get total kill count */
1041 if (min_oomadj > OOM_SCORE_ADJ_MAX)
1042 return killcnt_total;
1043
1044 while (min_oomadj <= max_oomadj &&
1045 (slot = ADJTOSLOT(min_oomadj)) < ADJTOSLOT_COUNT) {
1046 uint8_t idx = killcnt_idx[slot];
1047 if (idx != KILLCNT_INVALID_IDX) {
1048 count += killcnt[idx];
1049 }
1050 min_oomadj++;
1051 }
1052
1053 return count;
1054 }
1055
/*
 * Handle the LMK_GETKILLCNT command: return the kill count for the oomadj
 * range carried in the packet. Always 0 with the in-kernel interface,
 * because the kernel driver does not expose this information.
 */
static int cmd_getkillcnt(LMKD_CTRL_PACKET packet) {
    struct lmk_getkillcnt params;
    int result = 0;

    if (!use_inkernel_interface) {
        lmkd_pack_get_getkillcnt(packet, &params);
        result = get_killcnt(params.min_oomadj, params.max_oomadj);
    }

    return result;
}
1068
/*
 * Handle the LMK_TARGET command: store ntargets (minfree, oom_adj) pairs
 * from the packet, publish them in the sys.lmk.minfree_levels property and,
 * when the in-kernel module is present, write them to its parameters.
 *
 * Requests with an invalid target count are ignored, as are requests that
 * arrive less than TARGET_UPDATE_MIN_INTERVAL_MS after the last accepted one.
 */
static void cmd_target(int ntargets, LMKD_CTRL_PACKET packet) {
    int i;
    struct lmk_target target;
    char minfree_str[PROPERTY_VALUE_MAX];
    char *pstr = minfree_str;
    char *pend = minfree_str + sizeof(minfree_str);
    bool str_full = false;
    static struct timespec last_req_tm;
    struct timespec curr_tm;

    if (ntargets < 1 || ntargets > (int)ARRAY_SIZE(lowmem_adj))
        return;

    /*
     * Ratelimit minfree updates to once per TARGET_UPDATE_MIN_INTERVAL_MS
     * to prevent DoS attacks
     */
    if (clock_gettime(CLOCK_MONOTONIC_COARSE, &curr_tm) != 0) {
        ALOGE("Failed to get current time");
        return;
    }

    if (get_time_diff_ms(&last_req_tm, &curr_tm) <
        TARGET_UPDATE_MIN_INTERVAL_MS) {
        ALOGE("Ignoring frequent updated to lmkd limits");
        return;
    }

    last_req_tm = curr_tm;

    for (i = 0; i < ntargets; i++) {
        lmkd_pack_get_target(packet, i, &target);
        lowmem_minfree[i] = target.minfree;
        lowmem_adj[i] = target.oom_adj_score;

        /*
         * Once the property string is full only the formatting stops.
         * The remaining targets must still be stored, otherwise entries
         * below lowmem_targets_size would keep stale values.
         */
        if (str_full) {
            continue;
        }

        pstr += snprintf(pstr, pend - pstr, "%d:%d,", target.minfree,
            target.oom_adj_score);
        if (pstr >= pend) {
            /* no more space in the buffer; the string stays truncated */
            pstr = pend;
            str_full = true;
        }
    }

    lowmem_targets_size = ntargets;

    /* Override the last extra comma */
    pstr[-1] = '\0';
    property_set("sys.lmk.minfree_levels", minfree_str);

    if (has_inkernel_module) {
        char minfreestr[128];
        char killpriostr[128];

        minfreestr[0] = '\0';
        killpriostr[0] = '\0';

        /* build the comma-separated lists; zeros are written when the in-kernel interface is not in use */
        for (i = 0; i < lowmem_targets_size; i++) {
            char val[40];

            if (i) {
                strlcat(minfreestr, ",", sizeof(minfreestr));
                strlcat(killpriostr, ",", sizeof(killpriostr));
            }

            snprintf(val, sizeof(val), "%d", use_inkernel_interface ? lowmem_minfree[i] : 0);
            strlcat(minfreestr, val, sizeof(minfreestr));
            snprintf(val, sizeof(val), "%d", use_inkernel_interface ? lowmem_adj[i] : 0);
            strlcat(killpriostr, val, sizeof(killpriostr));
        }

        writefilestring(INKERNEL_MINFREE_PATH, minfreestr, true);
        writefilestring(INKERNEL_ADJ_PATH, killpriostr, true);
    }
}
1143
ctrl_data_close(int dsock_idx)1144 static void ctrl_data_close(int dsock_idx) {
1145 struct epoll_event epev;
1146
1147 ALOGI("closing lmkd data connection");
1148 if (epoll_ctl(epollfd, EPOLL_CTL_DEL, data_sock[dsock_idx].sock, &epev) == -1) {
1149 // Log a warning and keep going
1150 ALOGW("epoll_ctl for data connection socket failed; errno=%d", errno);
1151 }
1152 maxevents--;
1153
1154 close(data_sock[dsock_idx].sock);
1155 data_sock[dsock_idx].sock = -1;
1156 }
1157
ctrl_data_read(int dsock_idx,char * buf,size_t bufsz)1158 static int ctrl_data_read(int dsock_idx, char *buf, size_t bufsz) {
1159 int ret = 0;
1160
1161 ret = TEMP_FAILURE_RETRY(read(data_sock[dsock_idx].sock, buf, bufsz));
1162
1163 if (ret == -1) {
1164 ALOGE("control data socket read failed; errno=%d", errno);
1165 } else if (ret == 0) {
1166 ALOGE("Got EOF on control data socket");
1167 ret = -1;
1168 }
1169
1170 return ret;
1171 }
1172
ctrl_data_write(int dsock_idx,char * buf,size_t bufsz)1173 static int ctrl_data_write(int dsock_idx, char *buf, size_t bufsz) {
1174 int ret = 0;
1175
1176 ret = TEMP_FAILURE_RETRY(write(data_sock[dsock_idx].sock, buf, bufsz));
1177
1178 if (ret == -1) {
1179 ALOGE("control data socket write failed; errno=%d", errno);
1180 } else if (ret == 0) {
1181 ALOGE("Got EOF on control data socket");
1182 ret = -1;
1183 }
1184
1185 return ret;
1186 }
1187
ctrl_command_handler(int dsock_idx)1188 static void ctrl_command_handler(int dsock_idx) {
1189 LMKD_CTRL_PACKET packet;
1190 int len;
1191 enum lmk_cmd cmd;
1192 int nargs;
1193 int targets;
1194 int kill_cnt;
1195
1196 len = ctrl_data_read(dsock_idx, (char *)packet, CTRL_PACKET_MAX_SIZE);
1197 if (len <= 0)
1198 return;
1199
1200 if (len < (int)sizeof(int)) {
1201 ALOGE("Wrong control socket read length len=%d", len);
1202 return;
1203 }
1204
1205 cmd = lmkd_pack_get_cmd(packet);
1206 nargs = len / sizeof(int) - 1;
1207 if (nargs < 0)
1208 goto wronglen;
1209
1210 switch(cmd) {
1211 case LMK_TARGET:
1212 targets = nargs / 2;
1213 if (nargs & 0x1 || targets > (int)ARRAY_SIZE(lowmem_adj))
1214 goto wronglen;
1215 cmd_target(targets, packet);
1216 break;
1217 case LMK_PROCPRIO:
1218 if (nargs != 3)
1219 goto wronglen;
1220 cmd_procprio(packet);
1221 break;
1222 case LMK_PROCREMOVE:
1223 if (nargs != 1)
1224 goto wronglen;
1225 cmd_procremove(packet);
1226 break;
1227 case LMK_PROCPURGE:
1228 if (nargs != 0)
1229 goto wronglen;
1230 cmd_procpurge();
1231 break;
1232 case LMK_GETKILLCNT:
1233 if (nargs != 2)
1234 goto wronglen;
1235 kill_cnt = cmd_getkillcnt(packet);
1236 len = lmkd_pack_set_getkillcnt_repl(packet, kill_cnt);
1237 if (ctrl_data_write(dsock_idx, (char *)packet, len) != len)
1238 return;
1239 break;
1240 default:
1241 ALOGE("Received unknown command code %d", cmd);
1242 return;
1243 }
1244
1245 return;
1246
1247 wronglen:
1248 ALOGE("Wrong control socket read length cmd=%d len=%d", cmd, len);
1249 }
1250
/*
 * Epoll callback for a data connection. data carries the connection's slot
 * index; a command is processed only when the socket is readable.
 */
static void ctrl_data_handler(int data, uint32_t events,
                              struct polling_params *poll_params __unused) {
    if (!(events & EPOLLIN)) {
        return;
    }
    ctrl_command_handler(data);
}
1257
get_free_dsock()1258 static int get_free_dsock() {
1259 for (int i = 0; i < MAX_DATA_CONN; i++) {
1260 if (data_sock[i].sock < 0) {
1261 return i;
1262 }
1263 }
1264 return -1;
1265 }
1266
/*
 * Epoll callback for the listening control socket: accept a new client and
 * register the resulting data socket with epoll. On success the slot's
 * handler_info is set up to dispatch to ctrl_data_handler() and maxevents
 * is incremented.
 */
static void ctrl_connect_handler(int data __unused, uint32_t events __unused,
                                 struct polling_params *poll_params __unused) {
    struct epoll_event epev;
    int free_dscock_idx = get_free_dsock();

    if (free_dscock_idx < 0) {
        /*
         * Number of data connections exceeded max supported. This should not
         * happen but if it does we drop all existing connections and accept
         * the new one. This prevents inactive connections from monopolizing
         * data socket and if we drop ActivityManager connection it will
         * immediately reconnect.
         */
        for (int i = 0; i < MAX_DATA_CONN; i++) {
            ctrl_data_close(i);
        }
        free_dscock_idx = 0;
    }

    data_sock[free_dscock_idx].sock = accept(ctrl_sock.sock, NULL, NULL);
    if (data_sock[free_dscock_idx].sock < 0) {
        ALOGE("lmkd control socket accept failed; errno=%d", errno);
        return;
    }

    ALOGI("lmkd data connection established");
    /* use data to store data connection idx */
    data_sock[free_dscock_idx].handler_info.data = free_dscock_idx;
    data_sock[free_dscock_idx].handler_info.handler = ctrl_data_handler;
    epev.events = EPOLLIN;
    epev.data.ptr = (void *)&(data_sock[free_dscock_idx].handler_info);
    if (epoll_ctl(epollfd, EPOLL_CTL_ADD, data_sock[free_dscock_idx].sock, &epev) == -1) {
        ALOGE("epoll_ctl for data connection socket failed; errno=%d", errno);
        /*
         * The socket was never added to the epoll set and never counted in
         * maxevents, so release it directly. ctrl_data_close() must not be
         * used here: it would decrement maxevents without a matching
         * increment and attempt to remove an fd that was never registered.
         */
        close(data_sock[free_dscock_idx].sock);
        data_sock[free_dscock_idx].sock = -1;
        return;
    }
    maxevents++;
}
1305
1306 /*
1307 * /proc/zoneinfo parsing routines
1308 * Expected file format is:
1309 *
1310 * Node <node_id>, zone <zone_name>
1311 * (
1312 * per-node stats
1313 * (<per-node field name> <value>)+
1314 * )?
1315 * (pages free <value>
1316 * (<per-zone field name> <value>)+
1317 * pagesets
1318 * (<unused fields>)*
1319 * )+
1320 * ...
1321 */
zoneinfo_parse_protection(char * buf,struct zoneinfo_zone * zone)1322 static void zoneinfo_parse_protection(char *buf, struct zoneinfo_zone *zone) {
1323 int zone_idx;
1324 int64_t max = 0;
1325 char *save_ptr;
1326
1327 for (buf = strtok_r(buf, "(), ", &save_ptr), zone_idx = 0;
1328 buf && zone_idx < MAX_NR_ZONES;
1329 buf = strtok_r(NULL, "), ", &save_ptr), zone_idx++) {
1330 long long zoneval = strtoll(buf, &buf, 0);
1331 if (zoneval > max) {
1332 max = (zoneval > INT64_MAX) ? INT64_MAX : zoneval;
1333 }
1334 zone->protection[zone_idx] = zoneval;
1335 }
1336 zone->max_protection = max;
1337 }
1338
zoneinfo_parse_zone(char ** buf,struct zoneinfo_zone * zone)1339 static int zoneinfo_parse_zone(char **buf, struct zoneinfo_zone *zone) {
1340 for (char *line = strtok_r(NULL, "\n", buf); line;
1341 line = strtok_r(NULL, "\n", buf)) {
1342 char *cp;
1343 char *ap;
1344 char *save_ptr;
1345 int64_t val;
1346 int field_idx;
1347 enum field_match_result match_res;
1348
1349 cp = strtok_r(line, " ", &save_ptr);
1350 if (!cp) {
1351 return false;
1352 }
1353
1354 field_idx = find_field(cp, zoneinfo_zone_spec_field_names, ZI_ZONE_SPEC_FIELD_COUNT);
1355 if (field_idx >= 0) {
1356 /* special field */
1357 if (field_idx == ZI_ZONE_SPEC_PAGESETS) {
1358 /* no mode fields we are interested in */
1359 return true;
1360 }
1361
1362 /* protection field */
1363 ap = strtok_r(NULL, ")", &save_ptr);
1364 if (ap) {
1365 zoneinfo_parse_protection(ap, zone);
1366 }
1367 continue;
1368 }
1369
1370 ap = strtok_r(NULL, " ", &save_ptr);
1371 if (!ap) {
1372 continue;
1373 }
1374
1375 match_res = match_field(cp, ap, zoneinfo_zone_field_names, ZI_ZONE_FIELD_COUNT,
1376 &val, &field_idx);
1377 if (match_res == PARSE_FAIL) {
1378 return false;
1379 }
1380 if (match_res == PARSE_SUCCESS) {
1381 zone->fields.arr[field_idx] = val;
1382 }
1383 if (field_idx == ZI_ZONE_PRESENT && val == 0) {
1384 /* zone is not populated, stop parsing it */
1385 return true;
1386 }
1387 }
1388 return false;
1389 }
1390
zoneinfo_parse_node(char ** buf,struct zoneinfo_node * node)1391 static int zoneinfo_parse_node(char **buf, struct zoneinfo_node *node) {
1392 int fields_to_match = ZI_NODE_FIELD_COUNT;
1393
1394 for (char *line = strtok_r(NULL, "\n", buf); line;
1395 line = strtok_r(NULL, "\n", buf)) {
1396 char *cp;
1397 char *ap;
1398 char *save_ptr;
1399 int64_t val;
1400 int field_idx;
1401 enum field_match_result match_res;
1402
1403 cp = strtok_r(line, " ", &save_ptr);
1404 if (!cp) {
1405 return false;
1406 }
1407
1408 ap = strtok_r(NULL, " ", &save_ptr);
1409 if (!ap) {
1410 return false;
1411 }
1412
1413 match_res = match_field(cp, ap, zoneinfo_node_field_names, ZI_NODE_FIELD_COUNT,
1414 &val, &field_idx);
1415 if (match_res == PARSE_FAIL) {
1416 return false;
1417 }
1418 if (match_res == PARSE_SUCCESS) {
1419 node->fields.arr[field_idx] = val;
1420 fields_to_match--;
1421 if (!fields_to_match) {
1422 return true;
1423 }
1424 }
1425 }
1426 return false;
1427 }
1428
zoneinfo_parse(struct zoneinfo * zi)1429 static int zoneinfo_parse(struct zoneinfo *zi) {
1430 static struct reread_data file_data = {
1431 .filename = ZONEINFO_PATH,
1432 .fd = -1,
1433 };
1434 char *buf;
1435 char *save_ptr;
1436 char *line;
1437 char zone_name[LINE_MAX + 1];
1438 struct zoneinfo_node *node = NULL;
1439 int node_idx = 0;
1440 int zone_idx = 0;
1441
1442 memset(zi, 0, sizeof(struct zoneinfo));
1443
1444 if ((buf = reread_file(&file_data)) == NULL) {
1445 return -1;
1446 }
1447
1448 for (line = strtok_r(buf, "\n", &save_ptr); line;
1449 line = strtok_r(NULL, "\n", &save_ptr)) {
1450 int node_id;
1451 if (sscanf(line, "Node %d, zone %" STRINGIFY(LINE_MAX) "s", &node_id, zone_name) == 2) {
1452 if (!node || node->id != node_id) {
1453 /* new node is found */
1454 if (node) {
1455 node->zone_count = zone_idx + 1;
1456 node_idx++;
1457 if (node_idx == MAX_NR_NODES) {
1458 /* max node count exceeded */
1459 ALOGE("%s parse error", file_data.filename);
1460 return -1;
1461 }
1462 }
1463 node = &zi->nodes[node_idx];
1464 node->id = node_id;
1465 zone_idx = 0;
1466 if (!zoneinfo_parse_node(&save_ptr, node)) {
1467 ALOGE("%s parse error", file_data.filename);
1468 return -1;
1469 }
1470 } else {
1471 /* new zone is found */
1472 zone_idx++;
1473 }
1474 if (!zoneinfo_parse_zone(&save_ptr, &node->zones[zone_idx])) {
1475 ALOGE("%s parse error", file_data.filename);
1476 return -1;
1477 }
1478 }
1479 }
1480 if (!node) {
1481 ALOGE("%s parse error", file_data.filename);
1482 return -1;
1483 }
1484 node->zone_count = zone_idx + 1;
1485 zi->node_count = node_idx + 1;
1486
1487 /* calculate totals fields */
1488 for (node_idx = 0; node_idx < zi->node_count; node_idx++) {
1489 node = &zi->nodes[node_idx];
1490 for (zone_idx = 0; zone_idx < node->zone_count; zone_idx++) {
1491 struct zoneinfo_zone *zone = &zi->nodes[node_idx].zones[zone_idx];
1492 zi->totalreserve_pages += zone->max_protection + zone->fields.field.high;
1493 }
1494 zi->total_inactive_file += node->fields.field.nr_inactive_file;
1495 zi->total_active_file += node->fields.field.nr_active_file;
1496 zi->total_workingset_refault += node->fields.field.workingset_refault;
1497 }
1498 return 0;
1499 }
1500
1501 /* /proc/meminfo parsing routines */
meminfo_parse_line(char * line,union meminfo * mi)1502 static bool meminfo_parse_line(char *line, union meminfo *mi) {
1503 char *cp = line;
1504 char *ap;
1505 char *save_ptr;
1506 int64_t val;
1507 int field_idx;
1508 enum field_match_result match_res;
1509
1510 cp = strtok_r(line, " ", &save_ptr);
1511 if (!cp) {
1512 return false;
1513 }
1514
1515 ap = strtok_r(NULL, " ", &save_ptr);
1516 if (!ap) {
1517 return false;
1518 }
1519
1520 match_res = match_field(cp, ap, meminfo_field_names, MI_FIELD_COUNT,
1521 &val, &field_idx);
1522 if (match_res == PARSE_SUCCESS) {
1523 mi->arr[field_idx] = val / page_k;
1524 }
1525 return (match_res != PARSE_FAIL);
1526 }
1527
meminfo_parse(union meminfo * mi)1528 static int meminfo_parse(union meminfo *mi) {
1529 static struct reread_data file_data = {
1530 .filename = MEMINFO_PATH,
1531 .fd = -1,
1532 };
1533 char *buf;
1534 char *save_ptr;
1535 char *line;
1536
1537 memset(mi, 0, sizeof(union meminfo));
1538
1539 if ((buf = reread_file(&file_data)) == NULL) {
1540 return -1;
1541 }
1542
1543 for (line = strtok_r(buf, "\n", &save_ptr); line;
1544 line = strtok_r(NULL, "\n", &save_ptr)) {
1545 if (!meminfo_parse_line(line, mi)) {
1546 ALOGE("%s parse error", file_data.filename);
1547 return -1;
1548 }
1549 }
1550 mi->field.nr_file_pages = mi->field.cached + mi->field.swap_cached +
1551 mi->field.buffers;
1552
1553 return 0;
1554 }
1555
1556 /* /proc/vmstat parsing routines */
vmstat_parse_line(char * line,union vmstat * vs)1557 static bool vmstat_parse_line(char *line, union vmstat *vs) {
1558 char *cp;
1559 char *ap;
1560 char *save_ptr;
1561 int64_t val;
1562 int field_idx;
1563 enum field_match_result match_res;
1564
1565 cp = strtok_r(line, " ", &save_ptr);
1566 if (!cp) {
1567 return false;
1568 }
1569
1570 ap = strtok_r(NULL, " ", &save_ptr);
1571 if (!ap) {
1572 return false;
1573 }
1574
1575 match_res = match_field(cp, ap, vmstat_field_names, VS_FIELD_COUNT,
1576 &val, &field_idx);
1577 if (match_res == PARSE_SUCCESS) {
1578 vs->arr[field_idx] = val;
1579 }
1580 return (match_res != PARSE_FAIL);
1581 }
1582
vmstat_parse(union vmstat * vs)1583 static int vmstat_parse(union vmstat *vs) {
1584 static struct reread_data file_data = {
1585 .filename = VMSTAT_PATH,
1586 .fd = -1,
1587 };
1588 char *buf;
1589 char *save_ptr;
1590 char *line;
1591
1592 memset(vs, 0, sizeof(union vmstat));
1593
1594 if ((buf = reread_file(&file_data)) == NULL) {
1595 return -1;
1596 }
1597
1598 for (line = strtok_r(buf, "\n", &save_ptr); line;
1599 line = strtok_r(NULL, "\n", &save_ptr)) {
1600 if (!vmstat_parse_line(line, vs)) {
1601 ALOGE("%s parse error", file_data.filename);
1602 return -1;
1603 }
1604 }
1605
1606 return 0;
1607 }
1608
killinfo_log(struct proc * procp,int min_oom_score,int tasksize,int kill_reason,union meminfo * mi)1609 static void killinfo_log(struct proc* procp, int min_oom_score, int tasksize,
1610 int kill_reason, union meminfo *mi) {
1611 /* log process information */
1612 android_log_write_int32(ctx, procp->pid);
1613 android_log_write_int32(ctx, procp->uid);
1614 android_log_write_int32(ctx, procp->oomadj);
1615 android_log_write_int32(ctx, min_oom_score);
1616 android_log_write_int32(ctx, (int32_t)min(tasksize * page_k, INT32_MAX));
1617 android_log_write_int32(ctx, kill_reason);
1618
1619 /* log meminfo fields */
1620 for (int field_idx = 0; field_idx < MI_FIELD_COUNT; field_idx++) {
1621 android_log_write_int32(ctx, (int32_t)min(mi->arr[field_idx] * page_k, INT32_MAX));
1622 }
1623
1624 android_log_write_list(ctx, LOG_ID_EVENTS);
1625 android_log_reset(ctx);
1626 }
1627
proc_adj_lru(int oomadj)1628 static struct proc *proc_adj_lru(int oomadj) {
1629 return (struct proc *)adjslot_tail(&procadjslot_list[ADJTOSLOT(oomadj)]);
1630 }
1631
proc_get_heaviest(int oomadj)1632 static struct proc *proc_get_heaviest(int oomadj) {
1633 struct adjslot_list *head = &procadjslot_list[ADJTOSLOT(oomadj)];
1634 struct adjslot_list *curr = head->next;
1635 struct proc *maxprocp = NULL;
1636 int maxsize = 0;
1637 while (curr != head) {
1638 int pid = ((struct proc *)curr)->pid;
1639 int tasksize = proc_get_size(pid);
1640 if (tasksize <= 0) {
1641 struct adjslot_list *next = curr->next;
1642 pid_remove(pid);
1643 curr = next;
1644 } else {
1645 if (tasksize > maxsize) {
1646 maxsize = tasksize;
1647 maxprocp = (struct proc *)curr;
1648 }
1649 curr = curr->next;
1650 }
1651 }
1652 return maxprocp;
1653 }
1654
/*
 * Apply scheduling priority prio and cpuset policy sp to every thread of
 * process pid by iterating over /proc/<pid>/task. Failures are logged as
 * warnings and do not stop the iteration.
 */
static void set_process_group_and_prio(int pid, SchedPolicy sp, int prio) {
    char proc_path[PATH_MAX];
    struct dirent* de;
    DIR* d;

    snprintf(proc_path, sizeof(proc_path), "/proc/%d/task", pid);
    d = opendir(proc_path);
    if (d == NULL) {
        ALOGW("Failed to open %s; errno=%d: process pid(%d) might have died", proc_path, errno,
              pid);
        return;
    }

    for (de = readdir(d); de != NULL; de = readdir(d)) {
        int t_pid;

        /* skip entries whose name starts with a dot */
        if (de->d_name[0] == '.') {
            continue;
        }

        t_pid = atoi(de->d_name);
        if (t_pid == 0) {
            ALOGW("Failed to get t_pid for '%s' of pid(%d)", de->d_name, pid);
            continue;
        }

        /* ESRCH is not reported: the thread is already gone */
        if (setpriority(PRIO_PROCESS, t_pid, prio) != 0 && errno != ESRCH) {
            ALOGW("Unable to raise priority of killing t_pid (%d): errno=%d", t_pid, errno);
        }

        if (set_cpuset_policy(t_pid, sp) != 0) {
            ALOGW("Failed to set_cpuset_policy on pid(%d) t_pid(%d) to %d", pid, t_pid, (int)sp);
        }
    }
    closedir(d);
}
1689
is_kill_pending(void)1690 static bool is_kill_pending(void) {
1691 char buf[24];
1692
1693 if (last_kill_pid_or_fd < 0) {
1694 return false;
1695 }
1696
1697 if (pidfd_supported) {
1698 return true;
1699 }
1700
1701 /* when pidfd is not supported base the decision on /proc/<pid> existence */
1702 snprintf(buf, sizeof(buf), "/proc/%d/", last_kill_pid_or_fd);
1703 if (access(buf, F_OK) == 0) {
1704 return true;
1705 }
1706
1707 return false;
1708 }
1709
is_waiting_for_kill(void)1710 static bool is_waiting_for_kill(void) {
1711 return pidfd_supported && last_kill_pid_or_fd >= 0;
1712 }
1713
stop_wait_for_proc_kill(bool finished)1714 static void stop_wait_for_proc_kill(bool finished) {
1715 struct epoll_event epev;
1716
1717 if (last_kill_pid_or_fd < 0) {
1718 return;
1719 }
1720
1721 if (debug_process_killing) {
1722 struct timespec curr_tm;
1723
1724 if (clock_gettime(CLOCK_MONOTONIC_COARSE, &curr_tm) != 0) {
1725 /*
1726 * curr_tm is used here merely to report kill duration, so this failure is not fatal.
1727 * Log an error and continue.
1728 */
1729 ALOGE("Failed to get current time");
1730 }
1731
1732 if (finished) {
1733 ALOGI("Process got killed in %ldms",
1734 get_time_diff_ms(&last_kill_tm, &curr_tm));
1735 } else {
1736 ALOGI("Stop waiting for process kill after %ldms",
1737 get_time_diff_ms(&last_kill_tm, &curr_tm));
1738 }
1739 }
1740
1741 if (pidfd_supported) {
1742 /* unregister fd */
1743 if (epoll_ctl(epollfd, EPOLL_CTL_DEL, last_kill_pid_or_fd, &epev) != 0) {
1744 ALOGE("epoll_ctl for last killed process failed; errno=%d", errno);
1745 return;
1746 }
1747 maxevents--;
1748 close(last_kill_pid_or_fd);
1749 }
1750
1751 last_kill_pid_or_fd = -1;
1752 }
1753
/*
 * Epoll callback registered by start_wait_for_proc_kill(): requests that
 * polling be resumed and ends the wait as finished.
 */
static void kill_done_handler(int data __unused, uint32_t events __unused,
                              struct polling_params *poll_params) {
    poll_params->update = POLLING_RESUME;
    stop_wait_for_proc_kill(true);
}
1759
start_wait_for_proc_kill(int pid_or_fd)1760 static void start_wait_for_proc_kill(int pid_or_fd) {
1761 static struct event_handler_info kill_done_hinfo = { 0, kill_done_handler };
1762 struct epoll_event epev;
1763
1764 if (last_kill_pid_or_fd >= 0) {
1765 /* Should not happen but if it does we should stop previous wait */
1766 ALOGE("Attempt to wait for a kill while another wait is in progress");
1767 stop_wait_for_proc_kill(false);
1768 }
1769
1770 last_kill_pid_or_fd = pid_or_fd;
1771
1772 if (!pidfd_supported) {
1773 /* If pidfd is not supported just store PID and exit */
1774 return;
1775 }
1776
1777 epev.events = EPOLLIN;
1778 epev.data.ptr = (void *)&kill_done_hinfo;
1779 if (epoll_ctl(epollfd, EPOLL_CTL_ADD, last_kill_pid_or_fd, &epev) != 0) {
1780 ALOGE("epoll_ctl for last kill failed; errno=%d", errno);
1781 close(last_kill_pid_or_fd);
1782 last_kill_pid_or_fd = -1;
1783 return;
1784 }
1785 maxevents++;
1786 }
1787
1788 /* Kill one process specified by procp. Returns the size of the process killed */
kill_one_process(struct proc * procp,int min_oom_score,int kill_reason,const char * kill_desc,union meminfo * mi,struct timespec * tm)1789 static int kill_one_process(struct proc* procp, int min_oom_score, int kill_reason,
1790 const char *kill_desc, union meminfo *mi, struct timespec *tm) {
1791 int pid = procp->pid;
1792 int pidfd = procp->pidfd;
1793 uid_t uid = procp->uid;
1794 int tgid;
1795 char *taskname;
1796 int tasksize;
1797 int r;
1798 int result = -1;
1799 struct memory_stat *mem_st;
1800 char buf[LINE_MAX];
1801
1802 tgid = proc_get_tgid(pid);
1803 if (tgid >= 0 && tgid != pid) {
1804 ALOGE("Possible pid reuse detected (pid %d, tgid %d)!", pid, tgid);
1805 goto out;
1806 }
1807
1808 taskname = proc_get_name(pid, buf, sizeof(buf));
1809 if (!taskname) {
1810 goto out;
1811 }
1812
1813 tasksize = proc_get_size(pid);
1814 if (tasksize <= 0) {
1815 goto out;
1816 }
1817
1818 mem_st = stats_read_memory_stat(per_app_memcg, pid, uid);
1819
1820 TRACE_KILL_START(pid);
1821
1822 /* CAP_KILL required */
1823 if (pidfd < 0) {
1824 start_wait_for_proc_kill(pid);
1825 r = kill(pid, SIGKILL);
1826 } else {
1827 start_wait_for_proc_kill(pidfd);
1828 r = sys_pidfd_send_signal(pidfd, SIGKILL, NULL, 0);
1829 }
1830
1831 TRACE_KILL_END();
1832
1833 if (r) {
1834 stop_wait_for_proc_kill(false);
1835 ALOGE("kill(%d): errno=%d", pid, errno);
1836 /* Delete process record even when we fail to kill so that we don't get stuck on it */
1837 goto out;
1838 }
1839
1840 set_process_group_and_prio(pid, SP_FOREGROUND, ANDROID_PRIORITY_HIGHEST);
1841
1842 last_kill_tm = *tm;
1843
1844 inc_killcnt(procp->oomadj);
1845
1846 killinfo_log(procp, min_oom_score, tasksize, kill_reason, mi);
1847
1848 if (kill_desc) {
1849 ALOGI("Kill '%s' (%d), uid %d, oom_adj %d to free %ldkB; reason: %s", taskname, pid,
1850 uid, procp->oomadj, tasksize * page_k, kill_desc);
1851 } else {
1852 ALOGI("Kill '%s' (%d), uid %d, oom_adj %d to free %ldkB", taskname, pid,
1853 uid, procp->oomadj, tasksize * page_k);
1854 }
1855
1856 stats_write_lmk_kill_occurred(LMK_KILL_OCCURRED, uid, taskname,
1857 procp->oomadj, min_oom_score, tasksize, mem_st);
1858
1859 result = tasksize;
1860
1861 out:
1862 /*
1863 * WARNING: After pid_remove() procp is freed and can't be used!
1864 * Therefore placed at the end of the function.
1865 */
1866 pid_remove(pid);
1867 return result;
1868 }
1869
1870 /*
1871 * Find one process to kill at or above the given oom_adj level.
1872 * Returns size of the killed process.
1873 */
find_and_kill_process(int min_score_adj,int kill_reason,const char * kill_desc,union meminfo * mi,struct timespec * tm)1874 static int find_and_kill_process(int min_score_adj, int kill_reason, const char *kill_desc,
1875 union meminfo *mi, struct timespec *tm) {
1876 int i;
1877 int killed_size = 0;
1878 bool lmk_state_change_start = false;
1879
1880 for (i = OOM_SCORE_ADJ_MAX; i >= min_score_adj; i--) {
1881 struct proc *procp;
1882
1883 while (true) {
1884 procp = kill_heaviest_task ?
1885 proc_get_heaviest(i) : proc_adj_lru(i);
1886
1887 if (!procp)
1888 break;
1889
1890 killed_size = kill_one_process(procp, min_score_adj, kill_reason, kill_desc, mi, tm);
1891 if (killed_size >= 0) {
1892 if (!lmk_state_change_start) {
1893 lmk_state_change_start = true;
1894 stats_write_lmk_state_changed(LMK_STATE_CHANGED,
1895 LMK_STATE_CHANGE_START);
1896 }
1897 break;
1898 }
1899 }
1900 if (killed_size) {
1901 break;
1902 }
1903 }
1904
1905 if (lmk_state_change_start) {
1906 stats_write_lmk_state_changed(LMK_STATE_CHANGED, LMK_STATE_CHANGE_STOP);
1907 }
1908
1909 return killed_size;
1910 }
1911
get_memory_usage(struct reread_data * file_data)1912 static int64_t get_memory_usage(struct reread_data *file_data) {
1913 int ret;
1914 int64_t mem_usage;
1915 char *buf;
1916
1917 if ((buf = reread_file(file_data)) == NULL) {
1918 return -1;
1919 }
1920
1921 if (!parse_int64(buf, &mem_usage)) {
1922 ALOGE("%s parse error", file_data->filename);
1923 return -1;
1924 }
1925 if (mem_usage == 0) {
1926 ALOGE("No memory!");
1927 return -1;
1928 }
1929 return mem_usage;
1930 }
1931
record_low_pressure_levels(union meminfo * mi)1932 void record_low_pressure_levels(union meminfo *mi) {
1933 if (low_pressure_mem.min_nr_free_pages == -1 ||
1934 low_pressure_mem.min_nr_free_pages > mi->field.nr_free_pages) {
1935 if (debug_process_killing) {
1936 ALOGI("Low pressure min memory update from %" PRId64 " to %" PRId64,
1937 low_pressure_mem.min_nr_free_pages, mi->field.nr_free_pages);
1938 }
1939 low_pressure_mem.min_nr_free_pages = mi->field.nr_free_pages;
1940 }
1941 /*
1942 * Free memory at low vmpressure events occasionally gets spikes,
1943 * possibly a stale low vmpressure event with memory already
1944 * freed up (no memory pressure should have been reported).
1945 * Ignore large jumps in max_nr_free_pages that would mess up our stats.
1946 */
1947 if (low_pressure_mem.max_nr_free_pages == -1 ||
1948 (low_pressure_mem.max_nr_free_pages < mi->field.nr_free_pages &&
1949 mi->field.nr_free_pages - low_pressure_mem.max_nr_free_pages <
1950 low_pressure_mem.max_nr_free_pages * 0.1)) {
1951 if (debug_process_killing) {
1952 ALOGI("Low pressure max memory update from %" PRId64 " to %" PRId64,
1953 low_pressure_mem.max_nr_free_pages, mi->field.nr_free_pages);
1954 }
1955 low_pressure_mem.max_nr_free_pages = mi->field.nr_free_pages;
1956 }
1957 }
1958
upgrade_level(enum vmpressure_level level)1959 enum vmpressure_level upgrade_level(enum vmpressure_level level) {
1960 return (enum vmpressure_level)((level < VMPRESS_LEVEL_CRITICAL) ?
1961 level + 1 : level);
1962 }
1963
downgrade_level(enum vmpressure_level level)1964 enum vmpressure_level downgrade_level(enum vmpressure_level level) {
1965 return (enum vmpressure_level)((level > VMPRESS_LEVEL_LOW) ?
1966 level - 1 : level);
1967 }
1968
/*
 * Zone watermarks ordered from lowest to highest; a smaller value means a
 * lower watermark. WMARK_NONE means that no watermark is breached.
 */
enum zone_watermark {
    WMARK_MIN = 0,
    WMARK_LOW = 1,
    WMARK_HIGH = 2,
    WMARK_NONE = 3
};
1975
/*
 * Watermark levels summed over all populated zones; each is compared
 * against the free page count to find the lowest breached watermark.
 */
struct zone_watermarks {
    long high_wmark;    /* sum of per-zone max protection + high watermark */
    long low_wmark;     /* sum of per-zone max protection + low watermark */
    long min_wmark;     /* sum of per-zone max protection + min watermark */
};
1981
1982 /*
1983 * Returns lowest breached watermark or WMARK_NONE.
1984 */
get_lowest_watermark(union meminfo * mi,struct zone_watermarks * watermarks)1985 static enum zone_watermark get_lowest_watermark(union meminfo *mi,
1986 struct zone_watermarks *watermarks)
1987 {
1988 int64_t nr_free_pages = mi->field.nr_free_pages - mi->field.cma_free;
1989
1990 if (nr_free_pages < watermarks->min_wmark) {
1991 return WMARK_MIN;
1992 }
1993 if (nr_free_pages < watermarks->low_wmark) {
1994 return WMARK_LOW;
1995 }
1996 if (nr_free_pages < watermarks->high_wmark) {
1997 return WMARK_HIGH;
1998 }
1999 return WMARK_NONE;
2000 }
2001
calc_zone_watermarks(struct zoneinfo * zi,struct zone_watermarks * watermarks)2002 void calc_zone_watermarks(struct zoneinfo *zi, struct zone_watermarks *watermarks) {
2003 memset(watermarks, 0, sizeof(struct zone_watermarks));
2004
2005 for (int node_idx = 0; node_idx < zi->node_count; node_idx++) {
2006 struct zoneinfo_node *node = &zi->nodes[node_idx];
2007 for (int zone_idx = 0; zone_idx < node->zone_count; zone_idx++) {
2008 struct zoneinfo_zone *zone = &node->zones[zone_idx];
2009
2010 if (!zone->fields.field.present) {
2011 continue;
2012 }
2013
2014 watermarks->high_wmark += zone->max_protection + zone->fields.field.high;
2015 watermarks->low_wmark += zone->max_protection + zone->fields.field.low;
2016 watermarks->min_wmark += zone->max_protection + zone->fields.field.min;
2017 }
2018 }
2019 }
2020
mp_event_psi(int data,uint32_t events,struct polling_params * poll_params)2021 static void mp_event_psi(int data, uint32_t events, struct polling_params *poll_params) {
2022 enum kill_reasons {
2023 NONE = -1, /* To denote no kill condition */
2024 PRESSURE_AFTER_KILL = 0,
2025 NOT_RESPONDING,
2026 LOW_SWAP_AND_THRASHING,
2027 LOW_MEM_AND_SWAP,
2028 LOW_MEM_AND_THRASHING,
2029 DIRECT_RECL_AND_THRASHING,
2030 KILL_REASON_COUNT
2031 };
2032 enum reclaim_state {
2033 NO_RECLAIM = 0,
2034 KSWAPD_RECLAIM,
2035 DIRECT_RECLAIM,
2036 };
2037 static int64_t init_ws_refault;
2038 static int64_t base_file_lru;
2039 static int64_t init_pgscan_kswapd;
2040 static int64_t init_pgscan_direct;
2041 static int64_t swap_low_threshold;
2042 static bool killing;
2043 static int thrashing_limit;
2044 static bool in_reclaim;
2045 static struct zone_watermarks watermarks;
2046 static struct timespec wmark_update_tm;
2047
2048 union meminfo mi;
2049 union vmstat vs;
2050 struct timespec curr_tm;
2051 int64_t thrashing = 0;
2052 bool swap_is_low = false;
2053 enum vmpressure_level level = (enum vmpressure_level)data;
2054 enum kill_reasons kill_reason = NONE;
2055 bool cycle_after_kill = false;
2056 enum reclaim_state reclaim = NO_RECLAIM;
2057 enum zone_watermark wmark = WMARK_NONE;
2058 char kill_desc[LINE_MAX];
2059 bool cut_thrashing_limit = false;
2060 int min_score_adj = 0;
2061
2062 /* Skip while still killing a process */
2063 if (is_kill_pending()) {
2064 goto no_kill;
2065 }
2066 /*
2067 * Process is dead, stop waiting. This has no effect if pidfds are supported and
2068 * death notification already caused waiting to stop.
2069 */
2070 stop_wait_for_proc_kill(true);
2071
2072 if (clock_gettime(CLOCK_MONOTONIC_COARSE, &curr_tm) != 0) {
2073 ALOGE("Failed to get current time");
2074 return;
2075 }
2076
2077 if (vmstat_parse(&vs) < 0) {
2078 ALOGE("Failed to parse vmstat!");
2079 return;
2080 }
2081
2082 if (meminfo_parse(&mi) < 0) {
2083 ALOGE("Failed to parse meminfo!");
2084 return;
2085 }
2086
2087 /* Reset states after process got killed */
2088 if (killing) {
2089 killing = false;
2090 cycle_after_kill = true;
2091 /* Reset file-backed pagecache size and refault amounts after a kill */
2092 base_file_lru = vs.field.nr_inactive_file + vs.field.nr_active_file;
2093 init_ws_refault = vs.field.workingset_refault;
2094 }
2095
2096 /* Check free swap levels */
2097 if (swap_free_low_percentage) {
2098 if (!swap_low_threshold) {
2099 swap_low_threshold = mi.field.total_swap * swap_free_low_percentage / 100;
2100 }
2101 swap_is_low = mi.field.free_swap < swap_low_threshold;
2102 }
2103
2104 /* Identify reclaim state */
2105 if (vs.field.pgscan_direct > init_pgscan_direct) {
2106 init_pgscan_direct = vs.field.pgscan_direct;
2107 init_pgscan_kswapd = vs.field.pgscan_kswapd;
2108 reclaim = DIRECT_RECLAIM;
2109 } else if (vs.field.pgscan_kswapd > init_pgscan_kswapd) {
2110 init_pgscan_kswapd = vs.field.pgscan_kswapd;
2111 reclaim = KSWAPD_RECLAIM;
2112 } else {
2113 in_reclaim = false;
2114 /* Skip if system is not reclaiming */
2115 goto no_kill;
2116 }
2117
2118 if (!in_reclaim) {
2119 /* Record file-backed pagecache size when entering reclaim cycle */
2120 base_file_lru = vs.field.nr_inactive_file + vs.field.nr_active_file;
2121 init_ws_refault = vs.field.workingset_refault;
2122 thrashing_limit = thrashing_limit_pct;
2123 } else {
2124 /* Calculate what % of the file-backed pagecache refaulted so far */
2125 thrashing = (vs.field.workingset_refault - init_ws_refault) * 100 / base_file_lru;
2126 }
2127 in_reclaim = true;
2128
2129 /*
2130 * Refresh watermarks once per min in case user updated one of the margins.
2131 * TODO: b/140521024 replace this periodic update with an API for AMS to notify LMKD
2132 * that zone watermarks were changed by the system software.
2133 */
2134 if (watermarks.high_wmark == 0 || get_time_diff_ms(&wmark_update_tm, &curr_tm) > 60000) {
2135 struct zoneinfo zi;
2136
2137 if (zoneinfo_parse(&zi) < 0) {
2138 ALOGE("Failed to parse zoneinfo!");
2139 return;
2140 }
2141
2142 calc_zone_watermarks(&zi, &watermarks);
2143 wmark_update_tm = curr_tm;
2144 }
2145
2146 /* Find out which watermark is breached if any */
2147 wmark = get_lowest_watermark(&mi, &watermarks);
2148
2149 /*
2150 * TODO: move this logic into a separate function
2151 * Decide if killing a process is necessary and record the reason
2152 */
2153 if (cycle_after_kill && wmark < WMARK_LOW) {
2154 /*
2155 * Prevent kills not freeing enough memory which might lead to OOM kill.
2156 * This might happen when a process is consuming memory faster than reclaim can
2157 * free even after a kill. Mostly happens when running memory stress tests.
2158 */
2159 kill_reason = PRESSURE_AFTER_KILL;
2160 strncpy(kill_desc, "min watermark is breached even after kill", sizeof(kill_desc));
2161 } else if (level == VMPRESS_LEVEL_CRITICAL && events != 0) {
2162 /*
2163 * Device is too busy reclaiming memory which might lead to ANR.
2164 * Critical level is triggered when PSI complete stall (all tasks are blocked because
2165 * of the memory congestion) breaches the configured threshold.
2166 */
2167 kill_reason = NOT_RESPONDING;
2168 strncpy(kill_desc, "device is not responding", sizeof(kill_desc));
2169 } else if (swap_is_low && thrashing > thrashing_limit_pct) {
2170 /* Page cache is thrashing while swap is low */
2171 kill_reason = LOW_SWAP_AND_THRASHING;
2172 snprintf(kill_desc, sizeof(kill_desc), "device is low on swap (%" PRId64
2173 "kB < %" PRId64 "kB) and thrashing (%" PRId64 "%%)",
2174 mi.field.free_swap * page_k, swap_low_threshold * page_k, thrashing);
2175 } else if (swap_is_low && wmark < WMARK_HIGH) {
2176 /* Both free memory and swap are low */
2177 kill_reason = LOW_MEM_AND_SWAP;
2178 snprintf(kill_desc, sizeof(kill_desc), "%s watermark is breached and swap is low (%"
2179 PRId64 "kB < %" PRId64 "kB)", wmark > WMARK_LOW ? "min" : "low",
2180 mi.field.free_swap * page_k, swap_low_threshold * page_k);
2181 } else if (wmark < WMARK_HIGH && thrashing > thrashing_limit) {
2182 /* Page cache is thrashing while memory is low */
2183 kill_reason = LOW_MEM_AND_THRASHING;
2184 snprintf(kill_desc, sizeof(kill_desc), "%s watermark is breached and thrashing (%"
2185 PRId64 "%%)", wmark > WMARK_LOW ? "min" : "low", thrashing);
2186 cut_thrashing_limit = true;
2187 /* Do not kill perceptible apps because of thrashing */
2188 min_score_adj = PERCEPTIBLE_APP_ADJ;
2189 } else if (reclaim == DIRECT_RECLAIM && thrashing > thrashing_limit) {
2190 /* Page cache is thrashing while in direct reclaim (mostly happens on lowram devices) */
2191 kill_reason = DIRECT_RECL_AND_THRASHING;
2192 snprintf(kill_desc, sizeof(kill_desc), "device is in direct reclaim and thrashing (%"
2193 PRId64 "%%)", thrashing);
2194 cut_thrashing_limit = true;
2195 /* Do not kill perceptible apps because of thrashing */
2196 min_score_adj = PERCEPTIBLE_APP_ADJ;
2197 }
2198
2199 /* Kill a process if necessary */
2200 if (kill_reason != NONE) {
2201 int pages_freed = find_and_kill_process(min_score_adj, kill_reason, kill_desc, &mi,
2202 &curr_tm);
2203 if (pages_freed > 0) {
2204 killing = true;
2205 if (cut_thrashing_limit) {
2206 /*
                 * Cut thrashing limit by thrashing_limit_decay_pct percentage of the current
2208 * thrashing limit until the system stops thrashing.
2209 */
2210 thrashing_limit = (thrashing_limit * (100 - thrashing_limit_decay_pct)) / 100;
2211 }
2212 }
2213 }
2214
2215 no_kill:
2216 /* Do not poll if kernel supports pidfd waiting */
2217 if (is_waiting_for_kill()) {
2218 /* Pause polling if we are waiting for process death notification */
2219 poll_params->update = POLLING_PAUSE;
2220 return;
2221 }
2222
2223 /*
2224 * Start polling after initial PSI event;
2225 * extend polling while device is in direct reclaim or process is being killed;
2226 * do not extend when kswapd reclaims because that might go on for a long time
2227 * without causing memory pressure
2228 */
2229 if (events || killing || reclaim == DIRECT_RECLAIM) {
2230 poll_params->update = POLLING_START;
2231 }
2232
2233 /* Decide the polling interval */
2234 if (swap_is_low || killing) {
2235 /* Fast polling during and after a kill or when swap is low */
2236 poll_params->polling_interval_ms = PSI_POLL_PERIOD_SHORT_MS;
2237 } else {
2238 /* By default use long intervals */
2239 poll_params->polling_interval_ms = PSI_POLL_PERIOD_LONG_MS;
2240 }
2241 }
2242
mp_event_common(int data,uint32_t events,struct polling_params * poll_params)2243 static void mp_event_common(int data, uint32_t events, struct polling_params *poll_params) {
2244 int ret;
2245 unsigned long long evcount;
2246 int64_t mem_usage, memsw_usage;
2247 int64_t mem_pressure;
2248 enum vmpressure_level lvl;
2249 union meminfo mi;
2250 struct zoneinfo zi;
2251 struct timespec curr_tm;
2252 static unsigned long kill_skip_count = 0;
2253 enum vmpressure_level level = (enum vmpressure_level)data;
2254 long other_free = 0, other_file = 0;
2255 int min_score_adj;
2256 int minfree = 0;
2257 static struct reread_data mem_usage_file_data = {
2258 .filename = MEMCG_MEMORY_USAGE,
2259 .fd = -1,
2260 };
2261 static struct reread_data memsw_usage_file_data = {
2262 .filename = MEMCG_MEMORYSW_USAGE,
2263 .fd = -1,
2264 };
2265
2266 if (debug_process_killing) {
2267 ALOGI("%s memory pressure event is triggered", level_name[level]);
2268 }
2269
2270 if (!use_psi_monitors) {
2271 /*
2272 * Check all event counters from low to critical
2273 * and upgrade to the highest priority one. By reading
2274 * eventfd we also reset the event counters.
2275 */
2276 for (lvl = VMPRESS_LEVEL_LOW; lvl < VMPRESS_LEVEL_COUNT; lvl++) {
2277 if (mpevfd[lvl] != -1 &&
2278 TEMP_FAILURE_RETRY(read(mpevfd[lvl],
2279 &evcount, sizeof(evcount))) > 0 &&
2280 evcount > 0 && lvl > level) {
2281 level = lvl;
2282 }
2283 }
2284 }
2285
2286 /* Start polling after initial PSI event */
2287 if (use_psi_monitors && events) {
2288 /* Override polling params only if current event is more critical */
2289 if (!poll_params->poll_handler || data > poll_params->poll_handler->data) {
2290 poll_params->polling_interval_ms = PSI_POLL_PERIOD_SHORT_MS;
2291 poll_params->update = POLLING_START;
2292 }
2293 }
2294
2295 if (clock_gettime(CLOCK_MONOTONIC_COARSE, &curr_tm) != 0) {
2296 ALOGE("Failed to get current time");
2297 return;
2298 }
2299
2300 if (kill_timeout_ms && get_time_diff_ms(&last_kill_tm, &curr_tm) < kill_timeout_ms) {
2301 /*
2302 * If we're within the no-kill timeout, see if there's pending reclaim work
2303 * from the last killed process. If so, skip killing for now.
2304 */
2305 if (is_kill_pending()) {
2306 kill_skip_count++;
2307 return;
2308 }
2309 /*
2310 * Process is dead, stop waiting. This has no effect if pidfds are supported and
2311 * death notification already caused waiting to stop.
2312 */
2313 stop_wait_for_proc_kill(true);
2314 } else {
2315 /*
2316 * Killing took longer than no-kill timeout. Stop waiting for the last process
2317 * to die because we are ready to kill again.
2318 */
2319 stop_wait_for_proc_kill(false);
2320 }
2321
2322 if (kill_skip_count > 0) {
2323 ALOGI("%lu memory pressure events were skipped after a kill!",
2324 kill_skip_count);
2325 kill_skip_count = 0;
2326 }
2327
2328 if (meminfo_parse(&mi) < 0 || zoneinfo_parse(&zi) < 0) {
2329 ALOGE("Failed to get free memory!");
2330 return;
2331 }
2332
2333 if (use_minfree_levels) {
2334 int i;
2335
2336 other_free = mi.field.nr_free_pages - zi.totalreserve_pages;
2337 if (mi.field.nr_file_pages > (mi.field.shmem + mi.field.unevictable + mi.field.swap_cached)) {
2338 other_file = (mi.field.nr_file_pages - mi.field.shmem -
2339 mi.field.unevictable - mi.field.swap_cached);
2340 } else {
2341 other_file = 0;
2342 }
2343
2344 min_score_adj = OOM_SCORE_ADJ_MAX + 1;
2345 for (i = 0; i < lowmem_targets_size; i++) {
2346 minfree = lowmem_minfree[i];
2347 if (other_free < minfree && other_file < minfree) {
2348 min_score_adj = lowmem_adj[i];
2349 break;
2350 }
2351 }
2352
2353 if (min_score_adj == OOM_SCORE_ADJ_MAX + 1) {
2354 if (debug_process_killing) {
2355 ALOGI("Ignore %s memory pressure event "
2356 "(free memory=%ldkB, cache=%ldkB, limit=%ldkB)",
2357 level_name[level], other_free * page_k, other_file * page_k,
2358 (long)lowmem_minfree[lowmem_targets_size - 1] * page_k);
2359 }
2360 return;
2361 }
2362
2363 goto do_kill;
2364 }
2365
2366 if (level == VMPRESS_LEVEL_LOW) {
2367 record_low_pressure_levels(&mi);
2368 }
2369
2370 if (level_oomadj[level] > OOM_SCORE_ADJ_MAX) {
2371 /* Do not monitor this pressure level */
2372 return;
2373 }
2374
2375 if ((mem_usage = get_memory_usage(&mem_usage_file_data)) < 0) {
2376 goto do_kill;
2377 }
2378 if ((memsw_usage = get_memory_usage(&memsw_usage_file_data)) < 0) {
2379 goto do_kill;
2380 }
2381
2382 // Calculate percent for swappinness.
2383 mem_pressure = (mem_usage * 100) / memsw_usage;
2384
2385 if (enable_pressure_upgrade && level != VMPRESS_LEVEL_CRITICAL) {
2386 // We are swapping too much.
2387 if (mem_pressure < upgrade_pressure) {
2388 level = upgrade_level(level);
2389 if (debug_process_killing) {
2390 ALOGI("Event upgraded to %s", level_name[level]);
2391 }
2392 }
2393 }
2394
2395 // If we still have enough swap space available, check if we want to
2396 // ignore/downgrade pressure events.
2397 if (mi.field.free_swap >=
2398 mi.field.total_swap * swap_free_low_percentage / 100) {
2399 // If the pressure is larger than downgrade_pressure lmk will not
2400 // kill any process, since enough memory is available.
2401 if (mem_pressure > downgrade_pressure) {
2402 if (debug_process_killing) {
2403 ALOGI("Ignore %s memory pressure", level_name[level]);
2404 }
2405 return;
2406 } else if (level == VMPRESS_LEVEL_CRITICAL && mem_pressure > upgrade_pressure) {
2407 if (debug_process_killing) {
2408 ALOGI("Downgrade critical memory pressure");
2409 }
2410 // Downgrade event, since enough memory available.
2411 level = downgrade_level(level);
2412 }
2413 }
2414
2415 do_kill:
2416 if (low_ram_device) {
2417 /* For Go devices kill only one task */
2418 if (find_and_kill_process(level_oomadj[level], -1, NULL, &mi, &curr_tm) == 0) {
2419 if (debug_process_killing) {
2420 ALOGI("Nothing to kill");
2421 }
2422 }
2423 } else {
2424 int pages_freed;
2425 static struct timespec last_report_tm;
2426 static unsigned long report_skip_count = 0;
2427
2428 if (!use_minfree_levels) {
2429 /* Free up enough memory to downgrate the memory pressure to low level */
2430 if (mi.field.nr_free_pages >= low_pressure_mem.max_nr_free_pages) {
2431 if (debug_process_killing) {
2432 ALOGI("Ignoring pressure since more memory is "
2433 "available (%" PRId64 ") than watermark (%" PRId64 ")",
2434 mi.field.nr_free_pages, low_pressure_mem.max_nr_free_pages);
2435 }
2436 return;
2437 }
2438 min_score_adj = level_oomadj[level];
2439 }
2440
2441 pages_freed = find_and_kill_process(min_score_adj, -1, NULL, &mi, &curr_tm);
2442
2443 if (pages_freed == 0) {
2444 /* Rate limit kill reports when nothing was reclaimed */
2445 if (get_time_diff_ms(&last_report_tm, &curr_tm) < FAIL_REPORT_RLIMIT_MS) {
2446 report_skip_count++;
2447 return;
2448 }
2449 }
2450
2451 /* Log whenever we kill or when report rate limit allows */
2452 if (use_minfree_levels) {
2453 ALOGI("Reclaimed %ldkB, cache(%ldkB) and "
2454 "free(%" PRId64 "kB)-reserved(%" PRId64 "kB) below min(%ldkB) for oom_adj %d",
2455 pages_freed * page_k,
2456 other_file * page_k, mi.field.nr_free_pages * page_k,
2457 zi.totalreserve_pages * page_k,
2458 minfree * page_k, min_score_adj);
2459 } else {
2460 ALOGI("Reclaimed %ldkB at oom_adj %d",
2461 pages_freed * page_k, min_score_adj);
2462 }
2463
2464 if (report_skip_count > 0) {
2465 ALOGI("Suppressed %lu failed kill reports", report_skip_count);
2466 report_skip_count = 0;
2467 }
2468
2469 last_report_tm = curr_tm;
2470 }
2471 if (is_waiting_for_kill()) {
2472 /* pause polling if we are waiting for process death notification */
2473 poll_params->update = POLLING_PAUSE;
2474 }
2475 }
2476
init_mp_psi(enum vmpressure_level level,bool use_new_strategy)2477 static bool init_mp_psi(enum vmpressure_level level, bool use_new_strategy) {
2478 int fd;
2479
2480 /* Do not register a handler if threshold_ms is not set */
2481 if (!psi_thresholds[level].threshold_ms) {
2482 return true;
2483 }
2484
2485 fd = init_psi_monitor(psi_thresholds[level].stall_type,
2486 psi_thresholds[level].threshold_ms * US_PER_MS,
2487 PSI_WINDOW_SIZE_MS * US_PER_MS);
2488
2489 if (fd < 0) {
2490 return false;
2491 }
2492
2493 vmpressure_hinfo[level].handler = use_new_strategy ? mp_event_psi : mp_event_common;
2494 vmpressure_hinfo[level].data = level;
2495 if (register_psi_monitor(epollfd, fd, &vmpressure_hinfo[level]) < 0) {
2496 destroy_psi_monitor(fd);
2497 return false;
2498 }
2499 maxevents++;
2500 mpevfd[level] = fd;
2501
2502 return true;
2503 }
2504
destroy_mp_psi(enum vmpressure_level level)2505 static void destroy_mp_psi(enum vmpressure_level level) {
2506 int fd = mpevfd[level];
2507
2508 if (unregister_psi_monitor(epollfd, fd) < 0) {
2509 ALOGE("Failed to unregister psi monitor for %s memory pressure; errno=%d",
2510 level_name[level], errno);
2511 }
2512 destroy_psi_monitor(fd);
2513 mpevfd[level] = -1;
2514 }
2515
init_psi_monitors()2516 static bool init_psi_monitors() {
2517 /*
2518 * When PSI is used on low-ram devices or on high-end devices without memfree levels
2519 * use new kill strategy based on zone watermarks, free swap and thrashing stats
2520 */
2521 bool use_new_strategy =
2522 property_get_bool("ro.lmk.use_new_strategy", low_ram_device || !use_minfree_levels);
2523
2524 /* In default PSI mode override stall amounts using system properties */
2525 if (use_new_strategy) {
2526 /* Do not use low pressure level */
2527 psi_thresholds[VMPRESS_LEVEL_LOW].threshold_ms = 0;
2528 psi_thresholds[VMPRESS_LEVEL_MEDIUM].threshold_ms = psi_partial_stall_ms;
2529 psi_thresholds[VMPRESS_LEVEL_CRITICAL].threshold_ms = psi_complete_stall_ms;
2530 }
2531
2532 if (!init_mp_psi(VMPRESS_LEVEL_LOW, use_new_strategy)) {
2533 return false;
2534 }
2535 if (!init_mp_psi(VMPRESS_LEVEL_MEDIUM, use_new_strategy)) {
2536 destroy_mp_psi(VMPRESS_LEVEL_LOW);
2537 return false;
2538 }
2539 if (!init_mp_psi(VMPRESS_LEVEL_CRITICAL, use_new_strategy)) {
2540 destroy_mp_psi(VMPRESS_LEVEL_MEDIUM);
2541 destroy_mp_psi(VMPRESS_LEVEL_LOW);
2542 return false;
2543 }
2544 return true;
2545 }
2546
init_mp_common(enum vmpressure_level level)2547 static bool init_mp_common(enum vmpressure_level level) {
2548 int mpfd;
2549 int evfd;
2550 int evctlfd;
2551 char buf[256];
2552 struct epoll_event epev;
2553 int ret;
2554 int level_idx = (int)level;
2555 const char *levelstr = level_name[level_idx];
2556
2557 /* gid containing AID_SYSTEM required */
2558 mpfd = open(MEMCG_SYSFS_PATH "memory.pressure_level", O_RDONLY | O_CLOEXEC);
2559 if (mpfd < 0) {
2560 ALOGI("No kernel memory.pressure_level support (errno=%d)", errno);
2561 goto err_open_mpfd;
2562 }
2563
2564 evctlfd = open(MEMCG_SYSFS_PATH "cgroup.event_control", O_WRONLY | O_CLOEXEC);
2565 if (evctlfd < 0) {
2566 ALOGI("No kernel memory cgroup event control (errno=%d)", errno);
2567 goto err_open_evctlfd;
2568 }
2569
2570 evfd = eventfd(0, EFD_NONBLOCK | EFD_CLOEXEC);
2571 if (evfd < 0) {
2572 ALOGE("eventfd failed for level %s; errno=%d", levelstr, errno);
2573 goto err_eventfd;
2574 }
2575
2576 ret = snprintf(buf, sizeof(buf), "%d %d %s", evfd, mpfd, levelstr);
2577 if (ret >= (ssize_t)sizeof(buf)) {
2578 ALOGE("cgroup.event_control line overflow for level %s", levelstr);
2579 goto err;
2580 }
2581
2582 ret = TEMP_FAILURE_RETRY(write(evctlfd, buf, strlen(buf) + 1));
2583 if (ret == -1) {
2584 ALOGE("cgroup.event_control write failed for level %s; errno=%d",
2585 levelstr, errno);
2586 goto err;
2587 }
2588
2589 epev.events = EPOLLIN;
2590 /* use data to store event level */
2591 vmpressure_hinfo[level_idx].data = level_idx;
2592 vmpressure_hinfo[level_idx].handler = mp_event_common;
2593 epev.data.ptr = (void *)&vmpressure_hinfo[level_idx];
2594 ret = epoll_ctl(epollfd, EPOLL_CTL_ADD, evfd, &epev);
2595 if (ret == -1) {
2596 ALOGE("epoll_ctl for level %s failed; errno=%d", levelstr, errno);
2597 goto err;
2598 }
2599 maxevents++;
2600 mpevfd[level] = evfd;
2601 close(evctlfd);
2602 return true;
2603
2604 err:
2605 close(evfd);
2606 err_eventfd:
2607 close(evctlfd);
2608 err_open_evctlfd:
2609 close(mpfd);
2610 err_open_mpfd:
2611 return false;
2612 }
2613
/*
 * Event handler registered for the in-kernel lmk poll descriptor. It forwards the event
 * to the handler stored in kpoll_info, passing kpoll_info.poll_fd; none of its own
 * arguments are used.
 */
static void kernel_event_handler(int data __unused, uint32_t events __unused,
                                 struct polling_params *poll_params __unused) {
    kpoll_info.handler(kpoll_info.poll_fd);
}
2618
init(void)2619 static int init(void) {
2620 static struct event_handler_info kernel_poll_hinfo = { 0, kernel_event_handler };
2621 struct reread_data file_data = {
2622 .filename = ZONEINFO_PATH,
2623 .fd = -1,
2624 };
2625 struct epoll_event epev;
2626 int pidfd;
2627 int i;
2628 int ret;
2629
2630 page_k = sysconf(_SC_PAGESIZE);
2631 if (page_k == -1)
2632 page_k = PAGE_SIZE;
2633 page_k /= 1024;
2634
2635 epollfd = epoll_create(MAX_EPOLL_EVENTS);
2636 if (epollfd == -1) {
2637 ALOGE("epoll_create failed (errno=%d)", errno);
2638 return -1;
2639 }
2640
2641 // mark data connections as not connected
2642 for (int i = 0; i < MAX_DATA_CONN; i++) {
2643 data_sock[i].sock = -1;
2644 }
2645
2646 ctrl_sock.sock = android_get_control_socket("lmkd");
2647 if (ctrl_sock.sock < 0) {
2648 ALOGE("get lmkd control socket failed");
2649 return -1;
2650 }
2651
2652 ret = listen(ctrl_sock.sock, MAX_DATA_CONN);
2653 if (ret < 0) {
2654 ALOGE("lmkd control socket listen failed (errno=%d)", errno);
2655 return -1;
2656 }
2657
2658 epev.events = EPOLLIN;
2659 ctrl_sock.handler_info.handler = ctrl_connect_handler;
2660 epev.data.ptr = (void *)&(ctrl_sock.handler_info);
2661 if (epoll_ctl(epollfd, EPOLL_CTL_ADD, ctrl_sock.sock, &epev) == -1) {
2662 ALOGE("epoll_ctl for lmkd control socket failed (errno=%d)", errno);
2663 return -1;
2664 }
2665 maxevents++;
2666
2667 has_inkernel_module = !access(INKERNEL_MINFREE_PATH, W_OK);
2668 use_inkernel_interface = has_inkernel_module;
2669
2670 if (use_inkernel_interface) {
2671 ALOGI("Using in-kernel low memory killer interface");
2672 if (init_poll_kernel(&kpoll_info)) {
2673 epev.events = EPOLLIN;
2674 epev.data.ptr = (void*)&kernel_poll_hinfo;
2675 if (epoll_ctl(epollfd, EPOLL_CTL_ADD, kpoll_info.poll_fd, &epev) != 0) {
2676 ALOGE("epoll_ctl for lmk events failed (errno=%d)", errno);
2677 close(kpoll_info.poll_fd);
2678 kpoll_info.poll_fd = -1;
2679 } else {
2680 maxevents++;
2681 }
2682 }
2683 } else {
2684 /* Try to use psi monitor first if kernel has it */
2685 use_psi_monitors = property_get_bool("ro.lmk.use_psi", true) &&
2686 init_psi_monitors();
2687 /* Fall back to vmpressure */
2688 if (!use_psi_monitors &&
2689 (!init_mp_common(VMPRESS_LEVEL_LOW) ||
2690 !init_mp_common(VMPRESS_LEVEL_MEDIUM) ||
2691 !init_mp_common(VMPRESS_LEVEL_CRITICAL))) {
2692 ALOGE("Kernel does not support memory pressure events or in-kernel low memory killer");
2693 return -1;
2694 }
2695 if (use_psi_monitors) {
2696 ALOGI("Using psi monitors for memory pressure detection");
2697 } else {
2698 ALOGI("Using vmpressure for memory pressure detection");
2699 }
2700 }
2701
2702 for (i = 0; i <= ADJTOSLOT(OOM_SCORE_ADJ_MAX); i++) {
2703 procadjslot_list[i].next = &procadjslot_list[i];
2704 procadjslot_list[i].prev = &procadjslot_list[i];
2705 }
2706
2707 memset(killcnt_idx, KILLCNT_INVALID_IDX, sizeof(killcnt_idx));
2708
2709 /*
2710 * Read zoneinfo as the biggest file we read to create and size the initial
2711 * read buffer and avoid memory re-allocations during memory pressure
2712 */
2713 if (reread_file(&file_data) == NULL) {
2714 ALOGE("Failed to read %s: %s", file_data.filename, strerror(errno));
2715 }
2716
2717 /* check if kernel supports pidfd_open syscall */
2718 pidfd = TEMP_FAILURE_RETRY(sys_pidfd_open(getpid(), 0));
2719 if (pidfd < 0) {
2720 pidfd_supported = (errno != ENOSYS);
2721 } else {
2722 pidfd_supported = true;
2723 close(pidfd);
2724 }
2725 ALOGI("Process polling is %s", pidfd_supported ? "supported" : "not supported" );
2726
2727 return 0;
2728 }
2729
call_handler(struct event_handler_info * handler_info,struct polling_params * poll_params,uint32_t events)2730 static void call_handler(struct event_handler_info* handler_info,
2731 struct polling_params *poll_params, uint32_t events) {
2732 struct timespec curr_tm;
2733
2734 handler_info->handler(handler_info->data, events, poll_params);
2735 clock_gettime(CLOCK_MONOTONIC_COARSE, &curr_tm);
2736 poll_params->last_poll_tm = curr_tm;
2737
2738 switch (poll_params->update) {
2739 case POLLING_START:
2740 /*
2741 * Poll for the duration of PSI_WINDOW_SIZE_MS after the
2742 * initial PSI event because psi events are rate-limited
2743 * at one per sec.
2744 */
2745 poll_params->poll_start_tm = curr_tm;
2746 poll_params->poll_handler = handler_info;
2747 break;
2748 case POLLING_STOP:
2749 poll_params->poll_handler = NULL;
2750 break;
2751 case POLLING_PAUSE:
2752 poll_params->paused_handler = handler_info;
2753 poll_params->poll_handler = NULL;
2754 break;
2755 case POLLING_RESUME:
2756 poll_params->poll_start_tm = curr_tm;
2757 poll_params->poll_handler = poll_params->paused_handler;
2758 break;
2759 case POLLING_DO_NOT_CHANGE:
2760 if (get_time_diff_ms(&poll_params->poll_start_tm, &curr_tm) > PSI_WINDOW_SIZE_MS) {
2761 /* Polled for the duration of PSI window, time to stop */
2762 poll_params->poll_handler = NULL;
2763 }
2764 /* WARNING: skipping the rest of the function */
2765 return;
2766 }
2767 poll_params->update = POLLING_DO_NOT_CHANGE;
2768 }
2769
mainloop(void)2770 static void mainloop(void) {
2771 struct event_handler_info* handler_info;
2772 struct polling_params poll_params;
2773 struct timespec curr_tm;
2774 struct epoll_event *evt;
2775 long delay = -1;
2776
2777 poll_params.poll_handler = NULL;
2778 poll_params.update = POLLING_DO_NOT_CHANGE;
2779
2780 while (1) {
2781 struct epoll_event events[maxevents];
2782 int nevents;
2783 int i;
2784
2785 if (poll_params.poll_handler) {
2786 bool poll_now;
2787
2788 clock_gettime(CLOCK_MONOTONIC_COARSE, &curr_tm);
2789 if (poll_params.poll_handler == poll_params.paused_handler) {
2790 /*
2791 * Just transitioned into POLLING_RESUME. Reset paused_handler
2792 * and poll immediately
2793 */
2794 poll_params.paused_handler = NULL;
2795 poll_now = true;
2796 nevents = 0;
2797 } else {
2798 /* Calculate next timeout */
2799 delay = get_time_diff_ms(&poll_params.last_poll_tm, &curr_tm);
2800 delay = (delay < poll_params.polling_interval_ms) ?
2801 poll_params.polling_interval_ms - delay : poll_params.polling_interval_ms;
2802
2803 /* Wait for events until the next polling timeout */
2804 nevents = epoll_wait(epollfd, events, maxevents, delay);
2805
2806 /* Update current time after wait */
2807 clock_gettime(CLOCK_MONOTONIC_COARSE, &curr_tm);
2808 poll_now = (get_time_diff_ms(&poll_params.last_poll_tm, &curr_tm) >=
2809 poll_params.polling_interval_ms);
2810 }
2811 if (poll_now) {
2812 call_handler(poll_params.poll_handler, &poll_params, 0);
2813 }
2814 } else {
2815 /* Wait for events with no timeout */
2816 nevents = epoll_wait(epollfd, events, maxevents, -1);
2817 }
2818
2819 if (nevents == -1) {
2820 if (errno == EINTR)
2821 continue;
2822 ALOGE("epoll_wait failed (errno=%d)", errno);
2823 continue;
2824 }
2825
2826 /*
2827 * First pass to see if any data socket connections were dropped.
2828 * Dropped connection should be handled before any other events
2829 * to deallocate data connection and correctly handle cases when
2830 * connection gets dropped and reestablished in the same epoll cycle.
2831 * In such cases it's essential to handle connection closures first.
2832 */
2833 for (i = 0, evt = &events[0]; i < nevents; ++i, evt++) {
2834 if ((evt->events & EPOLLHUP) && evt->data.ptr) {
2835 ALOGI("lmkd data connection dropped");
2836 handler_info = (struct event_handler_info*)evt->data.ptr;
2837 ctrl_data_close(handler_info->data);
2838 }
2839 }
2840
2841 /* Second pass to handle all other events */
2842 for (i = 0, evt = &events[0]; i < nevents; ++i, evt++) {
2843 if (evt->events & EPOLLERR) {
2844 ALOGD("EPOLLERR on event #%d", i);
2845 }
2846 if (evt->events & EPOLLHUP) {
2847 /* This case was handled in the first pass */
2848 continue;
2849 }
2850 if (evt->data.ptr) {
2851 handler_info = (struct event_handler_info*)evt->data.ptr;
2852 call_handler(handler_info, &poll_params, evt->events);
2853 }
2854 }
2855 }
2856 }
2857
/*
 * Daemon entry point.
 *
 * Loads the ro.lmk.* / ro.config.* tunables, sets up logging and statslog, and calls
 * init(). When init() succeeds and the in-kernel lmk interface is not in use, the
 * process locks its memory and switches to SCHED_FIFO (both best effort, failures are
 * only logged) before entering mainloop(). Command line arguments are unused.
 *
 * Returns 0; this is only reached when init() fails, since mainloop() does not return.
 */
int main(int argc __unused, char **argv __unused) {
    struct sched_param param = {
        .sched_priority = 1,
    };

    /* By default disable low level vmpressure events */
    level_oomadj[VMPRESS_LEVEL_LOW] =
        property_get_int32("ro.lmk.low", OOM_SCORE_ADJ_MAX + 1);
    level_oomadj[VMPRESS_LEVEL_MEDIUM] =
        property_get_int32("ro.lmk.medium", 800);
    level_oomadj[VMPRESS_LEVEL_CRITICAL] =
        property_get_int32("ro.lmk.critical", 0);
    debug_process_killing = property_get_bool("ro.lmk.debug", false);

    /* By default disable upgrade/downgrade logic */
    enable_pressure_upgrade =
        property_get_bool("ro.lmk.critical_upgrade", false);
    upgrade_pressure =
        (int64_t)property_get_int32("ro.lmk.upgrade_pressure", 100);
    downgrade_pressure =
        (int64_t)property_get_int32("ro.lmk.downgrade_pressure", 100);
    kill_heaviest_task =
        property_get_bool("ro.lmk.kill_heaviest_task", false);
    low_ram_device = property_get_bool("ro.config.low_ram", false);
    kill_timeout_ms =
        (unsigned long)property_get_int32("ro.lmk.kill_timeout_ms", 0);
    use_minfree_levels =
        property_get_bool("ro.lmk.use_minfree_levels", false);
    per_app_memcg =
        property_get_bool("ro.config.per_app_memcg", low_ram_device);
    /* The remaining defaults depend on whether this is a low-ram device */
    swap_free_low_percentage = clamp(0, 100, property_get_int32("ro.lmk.swap_free_low_percentage",
        low_ram_device ? DEF_LOW_SWAP_LOWRAM : DEF_LOW_SWAP));
    psi_partial_stall_ms = property_get_int32("ro.lmk.psi_partial_stall_ms",
        low_ram_device ? DEF_PARTIAL_STALL_LOWRAM : DEF_PARTIAL_STALL);
    psi_complete_stall_ms = property_get_int32("ro.lmk.psi_complete_stall_ms",
        DEF_COMPLETE_STALL);
    thrashing_limit_pct = max(0, property_get_int32("ro.lmk.thrashing_limit",
        low_ram_device ? DEF_THRASHING_LOWRAM : DEF_THRASHING));
    thrashing_limit_decay_pct = clamp(0, 100, property_get_int32("ro.lmk.thrashing_limit_decay",
        low_ram_device ? DEF_THRASHING_DECAY_LOWRAM : DEF_THRASHING_DECAY));

    ctx = create_android_logger(KILLINFO_LOG_TAG);

    statslog_init();

    /* init() returns 0 on success */
    if (!init()) {
        if (!use_inkernel_interface) {
            /*
             * MCL_ONFAULT pins pages as they fault instead of loading
             * everything immediately all at once. (Which would be bad,
             * because as of this writing, we have a lot of mapped pages we
             * never use.) Old kernels will see MCL_ONFAULT and fail with
             * EINVAL; we ignore this failure.
             *
             * N.B. read the man page for mlockall. MCL_CURRENT | MCL_ONFAULT
             * pins ⊆ MCL_CURRENT, converging to just MCL_CURRENT as we fault
             * in pages.
             */
            /* CAP_IPC_LOCK required */
            if (mlockall(MCL_CURRENT | MCL_FUTURE | MCL_ONFAULT) && (errno != EINVAL)) {
                ALOGW("mlockall failed %s", strerror(errno));
            }

            /* CAP_NICE required */
            if (sched_setscheduler(0, SCHED_FIFO, &param)) {
                ALOGW("set SCHED_FIFO failed %s", strerror(errno));
            }
        }

        mainloop();
    }

    statslog_destroy();

    android_log_destroy(&ctx);

    ALOGI("exiting");
    return 0;
}
2937