1 /*
2  * Copyright (C) 2013 The Android Open Source Project
3  *
4  * Licensed under the Apache License, Version 2.0 (the "License");
5  * you may not use this file except in compliance with the License.
6  * You may obtain a copy of the License at
7  *
8  *      http://www.apache.org/licenses/LICENSE-2.0
9  *
10  * Unless required by applicable law or agreed to in writing, software
11  * distributed under the License is distributed on an "AS IS" BASIS,
12  * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
13  * See the License for the specific language governing permissions and
14  * limitations under the License.
15  */
16 
17 #define LOG_TAG "lowmemorykiller"
18 
19 #include <dirent.h>
20 #include <errno.h>
21 #include <inttypes.h>
22 #include <pwd.h>
23 #include <sched.h>
24 #include <signal.h>
25 #include <stdbool.h>
26 #include <stdlib.h>
27 #include <string.h>
28 #include <sys/cdefs.h>
29 #include <sys/epoll.h>
30 #include <sys/eventfd.h>
31 #include <sys/mman.h>
32 #include <sys/resource.h>
33 #include <sys/socket.h>
34 #include <sys/syscall.h>
35 #include <sys/sysinfo.h>
36 #include <sys/time.h>
37 #include <sys/types.h>
38 #include <time.h>
39 #include <unistd.h>
40 
41 #include <cutils/properties.h>
42 #include <cutils/sched_policy.h>
43 #include <cutils/sockets.h>
44 #include <lmkd.h>
45 #include <log/log.h>
46 #include <log/log_event_list.h>
47 #include <log/log_time.h>
48 #include <psi/psi.h>
49 #include <system/thread_defs.h>
50 
51 #include "statslog.h"
52 
53 /*
54  * Define LMKD_TRACE_KILLS to record lmkd kills in kernel traces
55  * to profile and correlate with OOM kills
56  */
57 #ifdef LMKD_TRACE_KILLS
58 
59 #define ATRACE_TAG ATRACE_TAG_ALWAYS
60 #include <cutils/trace.h>
61 
62 #define TRACE_KILL_START(pid) ATRACE_INT(__FUNCTION__, pid);
63 #define TRACE_KILL_END()      ATRACE_INT(__FUNCTION__, 0);
64 
65 #else /* LMKD_TRACE_KILLS */
66 
67 #define TRACE_KILL_START(pid) ((void)(pid))
68 #define TRACE_KILL_END() ((void)0)
69 
70 #endif /* LMKD_TRACE_KILLS */
71 
72 #ifndef __unused
73 #define __unused __attribute__((__unused__))
74 #endif
75 
76 #define MEMCG_SYSFS_PATH "/dev/memcg/"
77 #define MEMCG_MEMORY_USAGE "/dev/memcg/memory.usage_in_bytes"
78 #define MEMCG_MEMORYSW_USAGE "/dev/memcg/memory.memsw.usage_in_bytes"
79 #define ZONEINFO_PATH "/proc/zoneinfo"
80 #define MEMINFO_PATH "/proc/meminfo"
81 #define VMSTAT_PATH "/proc/vmstat"
82 #define PROC_STATUS_TGID_FIELD "Tgid:"
83 #define LINE_MAX 128
84 
85 #define PERCEPTIBLE_APP_ADJ 200
86 
87 /* Android Logger event logtags (see event.logtags) */
88 #define KILLINFO_LOG_TAG 10195355
89 
90 /* gid containing AID_SYSTEM required */
91 #define INKERNEL_MINFREE_PATH "/sys/module/lowmemorykiller/parameters/minfree"
92 #define INKERNEL_ADJ_PATH "/sys/module/lowmemorykiller/parameters/adj"
93 
94 #define ARRAY_SIZE(x)   (sizeof(x) / sizeof(*(x)))
95 #define EIGHT_MEGA (1 << 23)
96 
97 #define TARGET_UPDATE_MIN_INTERVAL_MS 1000
98 
99 #define NS_PER_MS (NS_PER_SEC / MS_PER_SEC)
100 #define US_PER_MS (US_PER_SEC / MS_PER_SEC)
101 
102 /* Defined as ProcessList.SYSTEM_ADJ in ProcessList.java */
103 #define SYSTEM_ADJ (-900)
104 
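/* Expand a macro's value (e.g. LINE_MAX) into a string literal, e.g. for sscanf field widths */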
105 #define STRINGIFY(x) STRINGIFY_INTERNAL(x)
106 #define STRINGIFY_INTERNAL(x) #x
107 
108 /*
109  * PSI monitor tracking window size.
110  * PSI monitor generates events at most once per window,
111  * therefore we poll memory state for the duration of
112  * PSI_WINDOW_SIZE_MS after the event happens.
113  */
114 #define PSI_WINDOW_SIZE_MS 1000
115 /* Polling period after PSI signal when pressure is high */
116 #define PSI_POLL_PERIOD_SHORT_MS 10
117 /* Polling period after PSI signal when pressure is low */
118 #define PSI_POLL_PERIOD_LONG_MS 100
119 
120 #define min(a, b) (((a) < (b)) ? (a) : (b))
121 #define max(a, b) (((a) > (b)) ? (a) : (b))
122 
123 #define FAIL_REPORT_RLIMIT_MS 1000
124 
125 /*
126  * System property defaults
127  */
128 /* ro.lmk.swap_free_low_percentage property defaults */
129 #define DEF_LOW_SWAP_LOWRAM 10
130 #define DEF_LOW_SWAP 20
131 /* ro.lmk.thrashing_limit property defaults */
132 #define DEF_THRASHING_LOWRAM 30
133 #define DEF_THRASHING 100
134 /* ro.lmk.thrashing_limit_decay property defaults */
135 #define DEF_THRASHING_DECAY_LOWRAM 50
136 #define DEF_THRASHING_DECAY 10
137 /* ro.lmk.psi_partial_stall_ms property defaults */
138 #define DEF_PARTIAL_STALL_LOWRAM 200
139 #define DEF_PARTIAL_STALL 70
140 /* ro.lmk.psi_complete_stall_ms property defaults */
141 #define DEF_COMPLETE_STALL 700
142 
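/*
 * Thin wrappers around the pidfd_open/pidfd_send_signal system calls, invoked
 * through syscall() directly, presumably because libc wrappers for these
 * calls are not guaranteed to be available.
 */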
143 static inline int sys_pidfd_open(pid_t pid, unsigned int flags) {
144     return syscall(__NR_pidfd_open, pid, flags);
145 }
146 
147 static inline int sys_pidfd_send_signal(int pidfd, int sig, siginfo_t *info,
148                                         unsigned int flags) {
149     return syscall(__NR_pidfd_send_signal, pidfd, sig, info, flags);
150 }
151 
152 /* default to old in-kernel interface if no memory pressure events */
153 static bool use_inkernel_interface = true;
154 static bool has_inkernel_module;
155 
156 /* memory pressure levels */
157 enum vmpressure_level {
158     VMPRESS_LEVEL_LOW = 0,
159     VMPRESS_LEVEL_MEDIUM,
160     VMPRESS_LEVEL_CRITICAL,
161     VMPRESS_LEVEL_COUNT
162 };
163 
164 static const char *level_name[] = {
165     "low",
166     "medium",
167     "critical"
168 };
169 
170 struct {
171     int64_t min_nr_free_pages; /* recorded but not used yet */
172     int64_t max_nr_free_pages;
173 } low_pressure_mem = { -1, -1 };
174 
175 struct psi_threshold {
176     enum psi_stall_type stall_type;
177     int threshold_ms;
178 };
179 
180 static int level_oomadj[VMPRESS_LEVEL_COUNT];
181 static int mpevfd[VMPRESS_LEVEL_COUNT] = { -1, -1, -1 };
182 static bool pidfd_supported;
183 static int last_kill_pid_or_fd = -1;
184 static struct timespec last_kill_tm;
185 
186 /* lmkd configurable parameters */
187 static bool debug_process_killing;
188 static bool enable_pressure_upgrade;
189 static int64_t upgrade_pressure;
190 static int64_t downgrade_pressure;
191 static bool low_ram_device;
192 static bool kill_heaviest_task;
193 static unsigned long kill_timeout_ms;
194 static bool use_minfree_levels;
195 static bool per_app_memcg;
196 static int swap_free_low_percentage;
197 static int psi_partial_stall_ms;
198 static int psi_complete_stall_ms;
199 static int thrashing_limit_pct;
200 static int thrashing_limit_decay_pct;
201 static bool use_psi_monitors = false;
202 static struct kernel_poll_info kpoll_info;
203 static struct psi_threshold psi_thresholds[VMPRESS_LEVEL_COUNT] = {
204     { PSI_SOME, 70 },    /* 70ms out of 1sec for partial stall */
205     { PSI_SOME, 100 },   /* 100ms out of 1sec for partial stall */
206     { PSI_FULL, 70 },    /* 70ms out of 1sec for complete stall */
207 };
208 
209 static android_log_context ctx;
210 
211 enum polling_update {
212     POLLING_DO_NOT_CHANGE,
213     POLLING_START,
214     POLLING_STOP,
215     POLLING_PAUSE,
216     POLLING_RESUME,
217 };
218 
219 /*
220  * Data used for periodic polling for the memory state of the device.
221  * Note that when the system is not polling, poll_handler is set to NULL;
222  * it gets set when polling starts and is reset back to NULL when polling
223  * stops.
224  */
225 struct polling_params {
226     struct event_handler_info* poll_handler;
227     struct event_handler_info* paused_handler;
228     struct timespec poll_start_tm;
229     struct timespec last_poll_tm;
230     int polling_interval_ms;
231     enum polling_update update;
232 };
233 
234 /* data required to handle events */
235 struct event_handler_info {
236     int data;
237     void (*handler)(int data, uint32_t events, struct polling_params *poll_params);
238 };
239 
240 /* data required to handle socket events */
241 struct sock_event_handler_info {
242     int sock;
243     struct event_handler_info handler_info;
244 };
245 
246 /* max supported number of data connections */
247 #define MAX_DATA_CONN 2
248 
249 /* socket event handler data */
250 static struct sock_event_handler_info ctrl_sock;
251 static struct sock_event_handler_info data_sock[MAX_DATA_CONN];
252 
253 /* vmpressure event handler data */
254 static struct event_handler_info vmpressure_hinfo[VMPRESS_LEVEL_COUNT];
255 
256 /*
257  * 1 ctrl listen socket, 2 ctrl data sockets, 3 memory pressure levels,
258  * 1 lmk events fd + 1 fd to wait for process death
259  */
260 #define MAX_EPOLL_EVENTS (1 + MAX_DATA_CONN + VMPRESS_LEVEL_COUNT + 1 + 1)
261 static int epollfd;
262 static int maxevents;
263 
264 /* OOM score values used by both kernel and framework */
265 #define OOM_SCORE_ADJ_MIN       (-1000)
266 #define OOM_SCORE_ADJ_MAX       1000
267 
268 static int lowmem_adj[MAX_TARGETS];
269 static int lowmem_minfree[MAX_TARGETS];
270 static int lowmem_targets_size;
271 
272 /* Fields to parse in /proc/zoneinfo */
273 /* zoneinfo per-zone fields */
274 enum zoneinfo_zone_field {
275     ZI_ZONE_NR_FREE_PAGES = 0,
276     ZI_ZONE_MIN,
277     ZI_ZONE_LOW,
278     ZI_ZONE_HIGH,
279     ZI_ZONE_PRESENT,
280     ZI_ZONE_NR_FREE_CMA,
281     ZI_ZONE_FIELD_COUNT
282 };
283 
284 static const char* const zoneinfo_zone_field_names[ZI_ZONE_FIELD_COUNT] = {
285     "nr_free_pages",
286     "min",
287     "low",
288     "high",
289     "present",
290     "nr_free_cma",
291 };
292 
293 /* zoneinfo per-zone special fields */
294 enum zoneinfo_zone_spec_field {
295     ZI_ZONE_SPEC_PROTECTION = 0,
296     ZI_ZONE_SPEC_PAGESETS,
297     ZI_ZONE_SPEC_FIELD_COUNT,
298 };
299 
300 static const char* const zoneinfo_zone_spec_field_names[ZI_ZONE_SPEC_FIELD_COUNT] = {
301     "protection:",
302     "pagesets",
303 };
304 
305 /* see __MAX_NR_ZONES definition in kernel mmzone.h */
306 #define MAX_NR_ZONES 6
307 
308 union zoneinfo_zone_fields {
309     struct {
310         int64_t nr_free_pages;
311         int64_t min;
312         int64_t low;
313         int64_t high;
314         int64_t present;
315         int64_t nr_free_cma;
316     } field;
317     int64_t arr[ZI_ZONE_FIELD_COUNT];
318 };
319 
320 struct zoneinfo_zone {
321     union zoneinfo_zone_fields fields;
322     int64_t protection[MAX_NR_ZONES];
323     int64_t max_protection;
324 };
325 
326 /* zoneinfo per-node fields */
327 enum zoneinfo_node_field {
328     ZI_NODE_NR_INACTIVE_FILE = 0,
329     ZI_NODE_NR_ACTIVE_FILE,
330     ZI_NODE_WORKINGSET_REFAULT,
331     ZI_NODE_FIELD_COUNT
332 };
333 
334 static const char* const zoneinfo_node_field_names[ZI_NODE_FIELD_COUNT] = {
335     "nr_inactive_file",
336     "nr_active_file",
337     "workingset_refault",
338 };
339 
340 union zoneinfo_node_fields {
341     struct {
342         int64_t nr_inactive_file;
343         int64_t nr_active_file;
344         int64_t workingset_refault;
345     } field;
346     int64_t arr[ZI_NODE_FIELD_COUNT];
347 };
348 
349 struct zoneinfo_node {
350     int id;
351     int zone_count;
352     struct zoneinfo_zone zones[MAX_NR_ZONES];
353     union zoneinfo_node_fields fields;
354 };
355 
356 /* for now two memory nodes are more than enough */
357 #define MAX_NR_NODES 2
358 
359 struct zoneinfo {
360     int node_count;
361     struct zoneinfo_node nodes[MAX_NR_NODES];
362     int64_t totalreserve_pages;
363     int64_t total_inactive_file;
364     int64_t total_active_file;
365     int64_t total_workingset_refault;
366 };
367 
368 /* Fields to parse in /proc/meminfo */
369 enum meminfo_field {
370     MI_NR_FREE_PAGES = 0,
371     MI_CACHED,
372     MI_SWAP_CACHED,
373     MI_BUFFERS,
374     MI_SHMEM,
375     MI_UNEVICTABLE,
376     MI_TOTAL_SWAP,
377     MI_FREE_SWAP,
378     MI_ACTIVE_ANON,
379     MI_INACTIVE_ANON,
380     MI_ACTIVE_FILE,
381     MI_INACTIVE_FILE,
382     MI_SRECLAIMABLE,
383     MI_SUNRECLAIM,
384     MI_KERNEL_STACK,
385     MI_PAGE_TABLES,
386     MI_ION_HELP,
387     MI_ION_HELP_POOL,
388     MI_CMA_FREE,
389     MI_FIELD_COUNT
390 };
391 
392 static const char* const meminfo_field_names[MI_FIELD_COUNT] = {
393     "MemFree:",
394     "Cached:",
395     "SwapCached:",
396     "Buffers:",
397     "Shmem:",
398     "Unevictable:",
399     "SwapTotal:",
400     "SwapFree:",
401     "Active(anon):",
402     "Inactive(anon):",
403     "Active(file):",
404     "Inactive(file):",
405     "SReclaimable:",
406     "SUnreclaim:",
407     "KernelStack:",
408     "PageTables:",
409     "ION_heap:",
410     "ION_heap_pool:",
411     "CmaFree:",
412 };
413 
414 union meminfo {
415     struct {
416         int64_t nr_free_pages;
417         int64_t cached;
418         int64_t swap_cached;
419         int64_t buffers;
420         int64_t shmem;
421         int64_t unevictable;
422         int64_t total_swap;
423         int64_t free_swap;
424         int64_t active_anon;
425         int64_t inactive_anon;
426         int64_t active_file;
427         int64_t inactive_file;
428         int64_t sreclaimable;
429         int64_t sunreclaimable;
430         int64_t kernel_stack;
431         int64_t page_tables;
432         int64_t ion_heap;
433         int64_t ion_heap_pool;
434         int64_t cma_free;
435         /* fields below are calculated rather than read from the file */
436         int64_t nr_file_pages;
437     } field;
438     int64_t arr[MI_FIELD_COUNT];
439 };
440 
441 /* Fields to parse in /proc/vmstat */
442 enum vmstat_field {
443     VS_FREE_PAGES,
444     VS_INACTIVE_FILE,
445     VS_ACTIVE_FILE,
446     VS_WORKINGSET_REFAULT,
447     VS_PGSCAN_KSWAPD,
448     VS_PGSCAN_DIRECT,
449     VS_PGSCAN_DIRECT_THROTTLE,
450     VS_FIELD_COUNT
451 };
452 
453 static const char* const vmstat_field_names[VS_FIELD_COUNT] = {
454     "nr_free_pages",
455     "nr_inactive_file",
456     "nr_active_file",
457     "workingset_refault",
458     "pgscan_kswapd",
459     "pgscan_direct",
460     "pgscan_direct_throttle",
461 };
462 
463 union vmstat {
464     struct {
465         int64_t nr_free_pages;
466         int64_t nr_inactive_file;
467         int64_t nr_active_file;
468         int64_t workingset_refault;
469         int64_t pgscan_kswapd;
470         int64_t pgscan_direct;
471         int64_t pgscan_direct_throttle;
472     } field;
473     int64_t arr[VS_FIELD_COUNT];
474 };
475 
476 enum field_match_result {
477     NO_MATCH,
478     PARSE_FAIL,
479     PARSE_SUCCESS
480 };
481 
482 struct adjslot_list {
483     struct adjslot_list *next;
484     struct adjslot_list *prev;
485 };
486 
487 struct proc {
488     struct adjslot_list asl;
489     int pid;
490     int pidfd;
491     uid_t uid;
492     int oomadj;
493     struct proc *pidhash_next;
494 };
495 
496 struct reread_data {
497     const char* const filename;
498     int fd;
499 };
500 
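/*
 * Hash table of all registered processes, keyed by pid. pid_hashfn() folds
 * the upper bits of the pid into the lower bits to spread entries across the
 * PIDHASH_SZ buckets.
 */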
501 #define PIDHASH_SZ 1024
502 static struct proc *pidhash[PIDHASH_SZ];
503 #define pid_hashfn(x) ((((x) >> 8) ^ (x)) & (PIDHASH_SZ - 1))
504 
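/*
 * Registered processes are also indexed by oomadj level: ADJTOSLOT() maps an
 * oom_score_adj in [OOM_SCORE_ADJ_MIN, OOM_SCORE_ADJ_MAX] to a slot in
 * procadjslot_list, each slot holding a linked list of processes at that level.
 */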
505 #define ADJTOSLOT(adj) ((adj) + -OOM_SCORE_ADJ_MIN)
506 #define ADJTOSLOT_COUNT (ADJTOSLOT(OOM_SCORE_ADJ_MAX) + 1)
507 static struct adjslot_list procadjslot_list[ADJTOSLOT_COUNT];
508 
509 #define MAX_DISTINCT_OOM_ADJ 32
510 #define KILLCNT_INVALID_IDX 0xFF
511 /*
512  * Because the killcnt array is sparse, a two-level indirection is used
513  * to keep its size small. killcnt_idx stores the index of the element in
514  * the killcnt array; index KILLCNT_INVALID_IDX indicates an unused slot.
515  */
516 static uint8_t killcnt_idx[ADJTOSLOT_COUNT];
517 static uint16_t killcnt[MAX_DISTINCT_OOM_ADJ];
518 static int killcnt_free_idx = 0;
519 static uint32_t killcnt_total = 0;
520 
521 /* PAGE_SIZE / 1024 */
522 static long page_k;
523 
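/* Clamp value into the [low, high] range */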
524 static int clamp(int low, int high, int value) {
525     return max(min(value, high), low);
526 }
527 
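/* Parse a decimal string into an int64_t; returns false if no digits were consumed */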
528 static bool parse_int64(const char* str, int64_t* ret) {
529     char* endptr;
530     long long val = strtoll(str, &endptr, 10);
531     if (str == endptr || val > INT64_MAX) {
532         return false;
533     }
534     *ret = (int64_t)val;
535     return true;
536 }
537 
538 static int find_field(const char* name, const char* const field_names[], int field_count) {
539     for (int i = 0; i < field_count; i++) {
540         if (!strcmp(name, field_names[i])) {
541             return i;
542         }
543     }
544     return -1;
545 }
546 
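/*
 * Match field name 'cp' against field_names[]; on a match, parse 'ap' as a
 * signed 64-bit value into *field and report the matched index in *field_idx.
 */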
547 static enum field_match_result match_field(const char* cp, const char* ap,
548                                    const char* const field_names[],
549                                    int field_count, int64_t* field,
550                                    int *field_idx) {
551     int i = find_field(cp, field_names, field_count);
552     if (i < 0) {
553         return NO_MATCH;
554     }
555     *field_idx = i;
556     return parse_int64(ap, field) ? PARSE_SUCCESS : PARSE_FAIL;
557 }
558 
559 /*
560  * Read file content from the beginning, up to max_len bytes or EOF,
561  * whichever happens first.
562  */
563 static ssize_t read_all(int fd, char *buf, size_t max_len)
564 {
565     ssize_t ret = 0;
566     off_t offset = 0;
567 
568     while (max_len > 0) {
569         ssize_t r = TEMP_FAILURE_RETRY(pread(fd, buf, max_len, offset));
570         if (r == 0) {
571             break;
572         }
573         if (r == -1) {
574             return -1;
575         }
576         ret += r;
577         buf += r;
578         offset += r;
579         max_len -= r;
580     }
581 
582     return ret;
583 }
584 
585 /*
586  * Read a new or already opened file from the beginning.
587  * If the file has not been opened yet data->fd should be set to -1.
588  * To be used with files that are read often, possibly under high memory
589  * pressure, to minimize file opens, which themselves require kernel memory
590  * allocation and might stall on a memory-stressed system.
591  */
592 static char *reread_file(struct reread_data *data) {
593     /* start with page-size buffer and increase if needed */
594     static ssize_t buf_size = PAGE_SIZE;
595     static char *new_buf, *buf = NULL;
596     ssize_t size;
597 
598     if (data->fd == -1) {
599         /* First-time buffer initialization */
600         if (!buf && (buf = malloc(buf_size)) == NULL) {
601             return NULL;
602         }
603 
604         data->fd = TEMP_FAILURE_RETRY(open(data->filename, O_RDONLY | O_CLOEXEC));
605         if (data->fd < 0) {
606             ALOGE("%s open: %s", data->filename, strerror(errno));
607             return NULL;
608         }
609     }
610 
611     while (true) {
612         size = read_all(data->fd, buf, buf_size - 1);
613         if (size < 0) {
614             ALOGE("%s read: %s", data->filename, strerror(errno));
615             close(data->fd);
616             data->fd = -1;
617             return NULL;
618         }
619         if (size < buf_size - 1) {
620             break;
621         }
622         /*
623          * Since we are reading /proc files we can't use fstat to find out
624          * the real size of the file. Double the buffer size and keep retrying.
625          */
626         if ((new_buf = realloc(buf, buf_size * 2)) == NULL) {
627             errno = ENOMEM;
628             return NULL;
629         }
630         buf = new_buf;
631         buf_size *= 2;
632     }
633     buf[size] = 0;
634 
635     return buf;
636 }
637 
638 static struct proc *pid_lookup(int pid) {
639     struct proc *procp;
640 
641     for (procp = pidhash[pid_hashfn(pid)]; procp && procp->pid != pid;
642          procp = procp->pidhash_next)
643             ;
644 
645     return procp;
646 }
647 
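/*
 * procadjslot_list helpers. Each slot is a circular doubly-linked list whose
 * head acts as a sentinel; new entries are inserted right after the head, so
 * adjslot_tail() returns the least recently added entry at that oomadj level.
 */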
648 static void adjslot_insert(struct adjslot_list *head, struct adjslot_list *new)
649 {
650     struct adjslot_list *next = head->next;
651     new->prev = head;
652     new->next = next;
653     next->prev = new;
654     head->next = new;
655 }
656 
657 static void adjslot_remove(struct adjslot_list *old)
658 {
659     struct adjslot_list *prev = old->prev;
660     struct adjslot_list *next = old->next;
661     next->prev = prev;
662     prev->next = next;
663 }
664 
665 static struct adjslot_list *adjslot_tail(struct adjslot_list *head) {
666     struct adjslot_list *asl = head->prev;
667 
668     return asl == head ? NULL : asl;
669 }
670 
671 static void proc_slot(struct proc *procp) {
672     int adjslot = ADJTOSLOT(procp->oomadj);
673 
674     adjslot_insert(&procadjslot_list[adjslot], &procp->asl);
675 }
676 
677 static void proc_unslot(struct proc *procp) {
678     adjslot_remove(&procp->asl);
679 }
680 
681 static void proc_insert(struct proc *procp) {
682     int hval = pid_hashfn(procp->pid);
683 
684     procp->pidhash_next = pidhash[hval];
685     pidhash[hval] = procp;
686     proc_slot(procp);
687 }
688 
689 static int pid_remove(int pid) {
690     int hval = pid_hashfn(pid);
691     struct proc *procp;
692     struct proc *prevp;
693 
694     for (procp = pidhash[hval], prevp = NULL; procp && procp->pid != pid;
695          procp = procp->pidhash_next)
696             prevp = procp;
697 
698     if (!procp)
699         return -1;
700 
701     if (!prevp)
702         pidhash[hval] = procp->pidhash_next;
703     else
704         prevp->pidhash_next = procp->pidhash_next;
705 
706     proc_unslot(procp);
707     /*
708      * Close the pidfd here unless we are waiting for the corresponding process
709      * to die, in which case stop_wait_for_proc_kill() will close it later
710      */
711     if (procp->pidfd >= 0 && procp->pidfd != last_kill_pid_or_fd) {
712         close(procp->pidfd);
713     }
714     free(procp);
715     return 0;
716 }
717 
718 /*
719  * Write a string to a file.
720  * Returns false if the file could not be opened.
721  */
722 static bool writefilestring(const char *path, const char *s,
723                             bool err_if_missing) {
724     int fd = open(path, O_WRONLY | O_CLOEXEC);
725     ssize_t len = strlen(s);
726     ssize_t ret;
727 
728     if (fd < 0) {
729         if (err_if_missing) {
730             ALOGE("Error opening %s; errno=%d", path, errno);
731         }
732         return false;
733     }
734 
735     ret = TEMP_FAILURE_RETRY(write(fd, s, len));
736     if (ret < 0) {
737         ALOGE("Error writing %s; errno=%d", path, errno);
738     } else if (ret < len) {
739         ALOGE("Short write on %s; length=%zd", path, ret);
740     }
741 
742     close(fd);
743     return true;
744 }
745 
746 static inline long get_time_diff_ms(struct timespec *from,
747                                     struct timespec *to) {
748     return (to->tv_sec - from->tv_sec) * (long)MS_PER_SEC +
749            (to->tv_nsec - from->tv_nsec) / (long)NS_PER_MS;
750 }
751 
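/* Read the "Tgid:" field from /proc/<pid>/status; returns -1 on failure */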
752 static int proc_get_tgid(int pid) {
753     char path[PATH_MAX];
754     char buf[PAGE_SIZE];
755     int fd;
756     ssize_t size;
757     char *pos;
758     int64_t tgid = -1;
759 
760     snprintf(path, PATH_MAX, "/proc/%d/status", pid);
761     fd = open(path, O_RDONLY | O_CLOEXEC);
762     if (fd < 0) {
763         return -1;
764     }
765 
766     size = read_all(fd, buf, sizeof(buf) - 1);
767     if (size < 0) {
768         goto out;
769     }
770     buf[size] = 0;
771 
772     pos = buf;
773     while (true) {
774         pos = strstr(pos, PROC_STATUS_TGID_FIELD);
775         /* Stop if TGID tag not found or found at the line beginning */
776         if (pos == NULL || pos == buf || pos[-1] == '\n') {
777             break;
778         }
779         pos++;
780     }
781 
782     if (pos == NULL) {
783         goto out;
784     }
785 
786     pos += strlen(PROC_STATUS_TGID_FIELD);
787     while (*pos == ' ') pos++;
788     parse_int64(pos, &tgid);
789 
790 out:
791     close(fd);
792     return (int)tgid;
793 }
794 
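/* Return the process RSS in pages, as reported by /proc/<pid>/statm, or -1 on error */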
795 static int proc_get_size(int pid) {
796     char path[PATH_MAX];
797     char line[LINE_MAX];
798     int fd;
799     int rss = 0;
800     int total;
801     ssize_t ret;
802 
803     /* gid containing AID_READPROC required */
804     snprintf(path, PATH_MAX, "/proc/%d/statm", pid);
805     fd = open(path, O_RDONLY | O_CLOEXEC);
806     if (fd == -1)
807         return -1;
808 
809     ret = read_all(fd, line, sizeof(line) - 1);
810     if (ret < 0) {
811         close(fd);
812         return -1;
813     }
814     line[ret] = '\0';
815 
816     sscanf(line, "%d %d ", &total, &rss);
817     close(fd);
818     return rss;
819 }
820 
821 static char *proc_get_name(int pid, char *buf, size_t buf_size) {
822     char path[PATH_MAX];
823     int fd;
824     char *cp;
825     ssize_t ret;
826 
827     /* gid containing AID_READPROC required */
828     snprintf(path, PATH_MAX, "/proc/%d/cmdline", pid);
829     fd = open(path, O_RDONLY | O_CLOEXEC);
830     if (fd == -1) {
831         return NULL;
832     }
833     ret = read_all(fd, buf, buf_size - 1);
834     close(fd);
835     if (ret < 0) {
836         return NULL;
837     }
838     buf[ret] = '\0';
839 
840     cp = strchr(buf, ' ');
841     if (cp) {
842         *cp = '\0';
843     }
844 
845     return buf;
846 }
847 
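/*
 * Handle the LMK_PROCPRIO command: update the target's oom_score_adj, apply a
 * per-app memcg soft limit when configured, and record the process (or its
 * new oomadj) in lmkd's bookkeeping structures.
 */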
848 static void cmd_procprio(LMKD_CTRL_PACKET packet) {
849     struct proc *procp;
850     char path[LINE_MAX];
851     char val[20];
852     int soft_limit_mult;
853     struct lmk_procprio params;
854     bool is_system_server;
855     struct passwd *pwdrec;
856     int tgid;
857 
858     lmkd_pack_get_procprio(packet, &params);
859 
860     if (params.oomadj < OOM_SCORE_ADJ_MIN ||
861         params.oomadj > OOM_SCORE_ADJ_MAX) {
862         ALOGE("Invalid PROCPRIO oomadj argument %d", params.oomadj);
863         return;
864     }
865 
866     /* Check if registered process is a thread group leader */
867     tgid = proc_get_tgid(params.pid);
868     if (tgid >= 0 && tgid != params.pid) {
869         ALOGE("Attempt to register a task that is not a thread group leader (tid %d, tgid %d)",
870             params.pid, tgid);
871         return;
872     }
873 
874     /* gid containing AID_READPROC required */
875     /* CAP_SYS_RESOURCE required */
876     /* CAP_DAC_OVERRIDE required */
877     snprintf(path, sizeof(path), "/proc/%d/oom_score_adj", params.pid);
878     snprintf(val, sizeof(val), "%d", params.oomadj);
879     if (!writefilestring(path, val, false)) {
880         ALOGW("Failed to open %s; errno=%d: process %d might have been killed",
881               path, errno, params.pid);
882         /* If this file does not exist the process is dead. */
883         return;
884     }
885 
886     if (use_inkernel_interface) {
887         stats_store_taskname(params.pid, proc_get_name(params.pid, path, sizeof(path)),
888                              kpoll_info.poll_fd);
889         return;
890     }
891 
892     if (per_app_memcg) {
893         if (params.oomadj >= 900) {
894             soft_limit_mult = 0;
895         } else if (params.oomadj >= 800) {
896             soft_limit_mult = 0;
897         } else if (params.oomadj >= 700) {
898             soft_limit_mult = 0;
899         } else if (params.oomadj >= 600) {
900             // Launcher should be perceptible, don't kill it.
901             params.oomadj = 200;
902             soft_limit_mult = 1;
903         } else if (params.oomadj >= 500) {
904             soft_limit_mult = 0;
905         } else if (params.oomadj >= 400) {
906             soft_limit_mult = 0;
907         } else if (params.oomadj >= 300) {
908             soft_limit_mult = 1;
909         } else if (params.oomadj >= 200) {
910             soft_limit_mult = 8;
911         } else if (params.oomadj >= 100) {
912             soft_limit_mult = 10;
913         } else if (params.oomadj >=   0) {
914             soft_limit_mult = 20;
915         } else {
916             // Persistent processes will have a large
917             // soft limit 512MB.
918             soft_limit_mult = 64;
919         }
920 
921         snprintf(path, sizeof(path), MEMCG_SYSFS_PATH
922                  "apps/uid_%d/pid_%d/memory.soft_limit_in_bytes",
923                  params.uid, params.pid);
924         snprintf(val, sizeof(val), "%d", soft_limit_mult * EIGHT_MEGA);
925 
926         /*
927          * system_server process has no memcg under /dev/memcg/apps but should be
928          * registered with lmkd. This is the best way so far to identify it.
929          */
930         is_system_server = (params.oomadj == SYSTEM_ADJ &&
931                             (pwdrec = getpwnam("system")) != NULL &&
932                             params.uid == pwdrec->pw_uid);
933         writefilestring(path, val, !is_system_server);
934     }
935 
936     procp = pid_lookup(params.pid);
937     if (!procp) {
938         int pidfd = -1;
939 
940         if (pidfd_supported) {
941             pidfd = TEMP_FAILURE_RETRY(sys_pidfd_open(params.pid, 0));
942             if (pidfd < 0) {
943                 ALOGE("pidfd_open for pid %d failed; errno=%d", params.pid, errno);
944                 return;
945             }
946         }
947 
948         procp = calloc(1, sizeof(struct proc));
949         if (!procp) {
950             // Oh, the irony.  May need to rebuild our state.
951             return;
952         }
953 
954         procp->pid = params.pid;
955         procp->pidfd = pidfd;
956         procp->uid = params.uid;
957         procp->oomadj = params.oomadj;
958         proc_insert(procp);
959     } else {
960         proc_unslot(procp);
961         procp->oomadj = params.oomadj;
962         proc_slot(procp);
963     }
964 }
965 
966 static void cmd_procremove(LMKD_CTRL_PACKET packet) {
967     struct lmk_procremove params;
968 
969     lmkd_pack_get_procremove(packet, &params);
970     if (use_inkernel_interface) {
971         stats_remove_taskname(params.pid, kpoll_info.poll_fd);
972         return;
973     }
974 
975     /*
976      * WARNING: After pid_remove() procp is freed and can't be used!
977      * Therefore placed at the end of the function.
978      */
979     pid_remove(params.pid);
980 }
981 
982 static void cmd_procpurge() {
983     int i;
984     struct proc *procp;
985     struct proc *next;
986 
987     if (use_inkernel_interface) {
988         stats_purge_tasknames();
989         return;
990     }
991 
992     for (i = 0; i <= ADJTOSLOT(OOM_SCORE_ADJ_MAX); i++) {
993         procadjslot_list[i].next = &procadjslot_list[i];
994         procadjslot_list[i].prev = &procadjslot_list[i];
995     }
996 
997     for (i = 0; i < PIDHASH_SZ; i++) {
998         procp = pidhash[i];
999         while (procp) {
1000             next = procp->pidhash_next;
1001             free(procp);
1002             procp = next;
1003         }
1004     }
1005     memset(&pidhash[0], 0, sizeof(pidhash));
1006 }
1007 
1008 static void inc_killcnt(int oomadj) {
1009     int slot = ADJTOSLOT(oomadj);
1010     uint8_t idx = killcnt_idx[slot];
1011 
1012     if (idx == KILLCNT_INVALID_IDX) {
1013         /* index is not assigned for this oomadj */
1014         if (killcnt_free_idx < MAX_DISTINCT_OOM_ADJ) {
1015             killcnt_idx[slot] = killcnt_free_idx;
1016             killcnt[killcnt_free_idx] = 1;
1017             killcnt_free_idx++;
1018         } else {
1019             ALOGW("Number of distinct oomadj levels exceeds %d",
1020                 MAX_DISTINCT_OOM_ADJ);
1021         }
1022     } else {
1023         /*
1024          * wraparound is highly unlikely and is detectable using the total
1025          * counter because it has to be equal to the sum of all counters
1026          */
1027         killcnt[idx]++;
1028     }
1029     /* increment total kill counter */
1030     killcnt_total++;
1031 }
1032 
1033 static int get_killcnt(int min_oomadj, int max_oomadj) {
1034     int slot;
1035     int count = 0;
1036 
1037     if (min_oomadj > max_oomadj)
1038         return 0;
1039 
1040     /* special case to get total kill count */
1041     if (min_oomadj > OOM_SCORE_ADJ_MAX)
1042         return killcnt_total;
1043 
1044     while (min_oomadj <= max_oomadj &&
1045            (slot = ADJTOSLOT(min_oomadj)) < ADJTOSLOT_COUNT) {
1046         uint8_t idx = killcnt_idx[slot];
1047         if (idx != KILLCNT_INVALID_IDX) {
1048             count += killcnt[idx];
1049         }
1050         min_oomadj++;
1051     }
1052 
1053     return count;
1054 }
1055 
1056 static int cmd_getkillcnt(LMKD_CTRL_PACKET packet) {
1057     struct lmk_getkillcnt params;
1058 
1059     if (use_inkernel_interface) {
1060         /* kernel driver does not expose this information */
1061         return 0;
1062     }
1063 
1064     lmkd_pack_get_getkillcnt(packet, &params);
1065 
1066     return get_killcnt(params.min_oomadj, params.max_oomadj);
1067 }
1068 
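/*
 * Handle the LMK_TARGET command: record the minfree/oomadj level table,
 * publish it through the sys.lmk.minfree_levels property and, when the
 * in-kernel driver is present, push the values to its module parameters.
 */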
1069 static void cmd_target(int ntargets, LMKD_CTRL_PACKET packet) {
1070     int i;
1071     struct lmk_target target;
1072     char minfree_str[PROPERTY_VALUE_MAX];
1073     char *pstr = minfree_str;
1074     char *pend = minfree_str + sizeof(minfree_str);
1075     static struct timespec last_req_tm;
1076     struct timespec curr_tm;
1077 
1078     if (ntargets < 1 || ntargets > (int)ARRAY_SIZE(lowmem_adj))
1079         return;
1080 
1081     /*
1082      * Ratelimit minfree updates to once per TARGET_UPDATE_MIN_INTERVAL_MS
1083      * to prevent DoS attacks
1084      */
1085     if (clock_gettime(CLOCK_MONOTONIC_COARSE, &curr_tm) != 0) {
1086         ALOGE("Failed to get current time");
1087         return;
1088     }
1089 
1090     if (get_time_diff_ms(&last_req_tm, &curr_tm) <
1091         TARGET_UPDATE_MIN_INTERVAL_MS) {
1092         ALOGE("Ignoring frequent updates to lmkd limits");
1093         return;
1094     }
1095 
1096     last_req_tm = curr_tm;
1097 
1098     for (i = 0; i < ntargets; i++) {
1099         lmkd_pack_get_target(packet, i, &target);
1100         lowmem_minfree[i] = target.minfree;
1101         lowmem_adj[i] = target.oom_adj_score;
1102 
1103         pstr += snprintf(pstr, pend - pstr, "%d:%d,", target.minfree,
1104             target.oom_adj_score);
1105         if (pstr >= pend) {
1106             /* if no more space in the buffer then terminate the loop */
1107             pstr = pend;
1108             break;
1109         }
1110     }
1111 
1112     lowmem_targets_size = ntargets;
1113 
1114     /* Override the last extra comma */
1115     pstr[-1] = '\0';
1116     property_set("sys.lmk.minfree_levels", minfree_str);
1117 
1118     if (has_inkernel_module) {
1119         char minfreestr[128];
1120         char killpriostr[128];
1121 
1122         minfreestr[0] = '\0';
1123         killpriostr[0] = '\0';
1124 
1125         for (i = 0; i < lowmem_targets_size; i++) {
1126             char val[40];
1127 
1128             if (i) {
1129                 strlcat(minfreestr, ",", sizeof(minfreestr));
1130                 strlcat(killpriostr, ",", sizeof(killpriostr));
1131             }
1132 
1133             snprintf(val, sizeof(val), "%d", use_inkernel_interface ? lowmem_minfree[i] : 0);
1134             strlcat(minfreestr, val, sizeof(minfreestr));
1135             snprintf(val, sizeof(val), "%d", use_inkernel_interface ? lowmem_adj[i] : 0);
1136             strlcat(killpriostr, val, sizeof(killpriostr));
1137         }
1138 
1139         writefilestring(INKERNEL_MINFREE_PATH, minfreestr, true);
1140         writefilestring(INKERNEL_ADJ_PATH, killpriostr, true);
1141     }
1142 }
1143 
1144 static void ctrl_data_close(int dsock_idx) {
1145     struct epoll_event epev;
1146 
1147     ALOGI("closing lmkd data connection");
1148     if (epoll_ctl(epollfd, EPOLL_CTL_DEL, data_sock[dsock_idx].sock, &epev) == -1) {
1149         // Log a warning and keep going
1150         ALOGW("epoll_ctl for data connection socket failed; errno=%d", errno);
1151     }
1152     maxevents--;
1153 
1154     close(data_sock[dsock_idx].sock);
1155     data_sock[dsock_idx].sock = -1;
1156 }
1157 
1158 static int ctrl_data_read(int dsock_idx, char *buf, size_t bufsz) {
1159     int ret = 0;
1160 
1161     ret = TEMP_FAILURE_RETRY(read(data_sock[dsock_idx].sock, buf, bufsz));
1162 
1163     if (ret == -1) {
1164         ALOGE("control data socket read failed; errno=%d", errno);
1165     } else if (ret == 0) {
1166         ALOGE("Got EOF on control data socket");
1167         ret = -1;
1168     }
1169 
1170     return ret;
1171 }
1172 
1173 static int ctrl_data_write(int dsock_idx, char *buf, size_t bufsz) {
1174     int ret = 0;
1175 
1176     ret = TEMP_FAILURE_RETRY(write(data_sock[dsock_idx].sock, buf, bufsz));
1177 
1178     if (ret == -1) {
1179         ALOGE("control data socket write failed; errno=%d", errno);
1180     } else if (ret == 0) {
1181         ALOGE("Got EOF on control data socket");
1182         ret = -1;
1183     }
1184 
1185     return ret;
1186 }
1187 
1188 static void ctrl_command_handler(int dsock_idx) {
1189     LMKD_CTRL_PACKET packet;
1190     int len;
1191     enum lmk_cmd cmd;
1192     int nargs;
1193     int targets;
1194     int kill_cnt;
1195 
1196     len = ctrl_data_read(dsock_idx, (char *)packet, CTRL_PACKET_MAX_SIZE);
1197     if (len <= 0)
1198         return;
1199 
1200     if (len < (int)sizeof(int)) {
1201         ALOGE("Wrong control socket read length len=%d", len);
1202         return;
1203     }
1204 
1205     cmd = lmkd_pack_get_cmd(packet);
1206     nargs = len / sizeof(int) - 1;
1207     if (nargs < 0)
1208         goto wronglen;
1209 
1210     switch(cmd) {
1211     case LMK_TARGET:
1212         targets = nargs / 2;
1213         if (nargs & 0x1 || targets > (int)ARRAY_SIZE(lowmem_adj))
1214             goto wronglen;
1215         cmd_target(targets, packet);
1216         break;
1217     case LMK_PROCPRIO:
1218         if (nargs != 3)
1219             goto wronglen;
1220         cmd_procprio(packet);
1221         break;
1222     case LMK_PROCREMOVE:
1223         if (nargs != 1)
1224             goto wronglen;
1225         cmd_procremove(packet);
1226         break;
1227     case LMK_PROCPURGE:
1228         if (nargs != 0)
1229             goto wronglen;
1230         cmd_procpurge();
1231         break;
1232     case LMK_GETKILLCNT:
1233         if (nargs != 2)
1234             goto wronglen;
1235         kill_cnt = cmd_getkillcnt(packet);
1236         len = lmkd_pack_set_getkillcnt_repl(packet, kill_cnt);
1237         if (ctrl_data_write(dsock_idx, (char *)packet, len) != len)
1238             return;
1239         break;
1240     default:
1241         ALOGE("Received unknown command code %d", cmd);
1242         return;
1243     }
1244 
1245     return;
1246 
1247 wronglen:
1248     ALOGE("Wrong control socket read length cmd=%d len=%d", cmd, len);
1249 }
1250 
1251 static void ctrl_data_handler(int data, uint32_t events,
1252                               struct polling_params *poll_params __unused) {
1253     if (events & EPOLLIN) {
1254         ctrl_command_handler(data);
1255     }
1256 }
1257 
1258 static int get_free_dsock() {
1259     for (int i = 0; i < MAX_DATA_CONN; i++) {
1260         if (data_sock[i].sock < 0) {
1261             return i;
1262         }
1263     }
1264     return -1;
1265 }
1266 
1267 static void ctrl_connect_handler(int data __unused, uint32_t events __unused,
1268                                  struct polling_params *poll_params __unused) {
1269     struct epoll_event epev;
1270     int free_dscock_idx = get_free_dsock();
1271 
1272     if (free_dscock_idx < 0) {
1273         /*
1274          * Number of data connections exceeded max supported. This should not
1275          * happen but if it does we drop all existing connections and accept
1276          * the new one. This prevents inactive connections from monopolizing the
1277          * data sockets, and if we drop the ActivityManager connection it will
1278          * immediately reconnect.
1279          */
1280         for (int i = 0; i < MAX_DATA_CONN; i++) {
1281             ctrl_data_close(i);
1282         }
1283         free_dscock_idx = 0;
1284     }
1285 
1286     data_sock[free_dscock_idx].sock = accept(ctrl_sock.sock, NULL, NULL);
1287     if (data_sock[free_dscock_idx].sock < 0) {
1288         ALOGE("lmkd control socket accept failed; errno=%d", errno);
1289         return;
1290     }
1291 
1292     ALOGI("lmkd data connection established");
1293     /* use data to store data connection idx */
1294     data_sock[free_dscock_idx].handler_info.data = free_dscock_idx;
1295     data_sock[free_dscock_idx].handler_info.handler = ctrl_data_handler;
1296     epev.events = EPOLLIN;
1297     epev.data.ptr = (void *)&(data_sock[free_dscock_idx].handler_info);
1298     if (epoll_ctl(epollfd, EPOLL_CTL_ADD, data_sock[free_dscock_idx].sock, &epev) == -1) {
1299         ALOGE("epoll_ctl for data connection socket failed; errno=%d", errno);
1300         ctrl_data_close(free_dscock_idx);
1301         return;
1302     }
1303     maxevents++;
1304 }
1305 
1306 /*
1307  * /proc/zoneinfo parsing routines
1308  * Expected file format is:
1309  *
1310  *   Node <node_id>, zone   <zone_name>
1311  *   (
1312  *    per-node stats
1313  *       (<per-node field name> <value>)+
1314  *   )?
1315  *   (pages free     <value>
1316  *       (<per-zone field name> <value>)+
1317  *    pagesets
1318  *       (<unused fields>)*
1319  *   )+
1320  *   ...
1321  */
1322 static void zoneinfo_parse_protection(char *buf, struct zoneinfo_zone *zone) {
1323     int zone_idx;
1324     int64_t max = 0;
1325     char *save_ptr;
1326 
1327     for (buf = strtok_r(buf, "(), ", &save_ptr), zone_idx = 0;
1328          buf && zone_idx < MAX_NR_ZONES;
1329          buf = strtok_r(NULL, "), ", &save_ptr), zone_idx++) {
1330         long long zoneval = strtoll(buf, &buf, 0);
1331         if (zoneval > max) {
1332             max = (zoneval > INT64_MAX) ? INT64_MAX : zoneval;
1333         }
1334         zone->protection[zone_idx] = zoneval;
1335     }
1336     zone->max_protection = max;
1337 }
1338 
1339 static int zoneinfo_parse_zone(char **buf, struct zoneinfo_zone *zone) {
1340     for (char *line = strtok_r(NULL, "\n", buf); line;
1341          line = strtok_r(NULL, "\n", buf)) {
1342         char *cp;
1343         char *ap;
1344         char *save_ptr;
1345         int64_t val;
1346         int field_idx;
1347         enum field_match_result match_res;
1348 
1349         cp = strtok_r(line, " ", &save_ptr);
1350         if (!cp) {
1351             return false;
1352         }
1353 
1354         field_idx = find_field(cp, zoneinfo_zone_spec_field_names, ZI_ZONE_SPEC_FIELD_COUNT);
1355         if (field_idx >= 0) {
1356             /* special field */
1357             if (field_idx == ZI_ZONE_SPEC_PAGESETS) {
1358                 /* no more fields we are interested in */
1359                 return true;
1360             }
1361 
1362             /* protection field */
1363             ap = strtok_r(NULL, ")", &save_ptr);
1364             if (ap) {
1365                 zoneinfo_parse_protection(ap, zone);
1366             }
1367             continue;
1368         }
1369 
1370         ap = strtok_r(NULL, " ", &save_ptr);
1371         if (!ap) {
1372             continue;
1373         }
1374 
1375         match_res = match_field(cp, ap, zoneinfo_zone_field_names, ZI_ZONE_FIELD_COUNT,
1376             &val, &field_idx);
1377         if (match_res == PARSE_FAIL) {
1378             return false;
1379         }
1380         if (match_res == PARSE_SUCCESS) {
1381             zone->fields.arr[field_idx] = val;
1382         }
1383         if (field_idx == ZI_ZONE_PRESENT && val == 0) {
1384             /* zone is not populated, stop parsing it */
1385             return true;
1386         }
1387     }
1388     return false;
1389 }
1390 
1391 static int zoneinfo_parse_node(char **buf, struct zoneinfo_node *node) {
1392     int fields_to_match = ZI_NODE_FIELD_COUNT;
1393 
1394     for (char *line = strtok_r(NULL, "\n", buf); line;
1395          line = strtok_r(NULL, "\n", buf)) {
1396         char *cp;
1397         char *ap;
1398         char *save_ptr;
1399         int64_t val;
1400         int field_idx;
1401         enum field_match_result match_res;
1402 
1403         cp = strtok_r(line, " ", &save_ptr);
1404         if (!cp) {
1405             return false;
1406         }
1407 
1408         ap = strtok_r(NULL, " ", &save_ptr);
1409         if (!ap) {
1410             return false;
1411         }
1412 
1413         match_res = match_field(cp, ap, zoneinfo_node_field_names, ZI_NODE_FIELD_COUNT,
1414             &val, &field_idx);
1415         if (match_res == PARSE_FAIL) {
1416             return false;
1417         }
1418         if (match_res == PARSE_SUCCESS) {
1419             node->fields.arr[field_idx] = val;
1420             fields_to_match--;
1421             if (!fields_to_match) {
1422                 return true;
1423             }
1424         }
1425     }
1426     return false;
1427 }
1428 
1429 static int zoneinfo_parse(struct zoneinfo *zi) {
1430     static struct reread_data file_data = {
1431         .filename = ZONEINFO_PATH,
1432         .fd = -1,
1433     };
1434     char *buf;
1435     char *save_ptr;
1436     char *line;
1437     char zone_name[LINE_MAX + 1];
1438     struct zoneinfo_node *node = NULL;
1439     int node_idx = 0;
1440     int zone_idx = 0;
1441 
1442     memset(zi, 0, sizeof(struct zoneinfo));
1443 
1444     if ((buf = reread_file(&file_data)) == NULL) {
1445         return -1;
1446     }
1447 
1448     for (line = strtok_r(buf, "\n", &save_ptr); line;
1449          line = strtok_r(NULL, "\n", &save_ptr)) {
1450         int node_id;
1451         if (sscanf(line, "Node %d, zone %" STRINGIFY(LINE_MAX) "s", &node_id, zone_name) == 2) {
1452             if (!node || node->id != node_id) {
1453                 /* new node is found */
1454                 if (node) {
1455                     node->zone_count = zone_idx + 1;
1456                     node_idx++;
1457                     if (node_idx == MAX_NR_NODES) {
1458                         /* max node count exceeded */
1459                         ALOGE("%s parse error", file_data.filename);
1460                         return -1;
1461                     }
1462                 }
1463                 node = &zi->nodes[node_idx];
1464                 node->id = node_id;
1465                 zone_idx = 0;
1466                 if (!zoneinfo_parse_node(&save_ptr, node)) {
1467                     ALOGE("%s parse error", file_data.filename);
1468                     return -1;
1469                 }
1470             } else {
1471                 /* new zone is found */
1472                 zone_idx++;
1473             }
1474             if (!zoneinfo_parse_zone(&save_ptr, &node->zones[zone_idx])) {
1475                 ALOGE("%s parse error", file_data.filename);
1476                 return -1;
1477             }
1478         }
1479     }
1480     if (!node) {
1481         ALOGE("%s parse error", file_data.filename);
1482         return -1;
1483     }
1484     node->zone_count = zone_idx + 1;
1485     zi->node_count = node_idx + 1;
1486 
1487     /* calculate totals fields */
1488     for (node_idx = 0; node_idx < zi->node_count; node_idx++) {
1489         node = &zi->nodes[node_idx];
1490         for (zone_idx = 0; zone_idx < node->zone_count; zone_idx++) {
1491             struct zoneinfo_zone *zone = &zi->nodes[node_idx].zones[zone_idx];
1492             zi->totalreserve_pages += zone->max_protection + zone->fields.field.high;
1493         }
1494         zi->total_inactive_file += node->fields.field.nr_inactive_file;
1495         zi->total_active_file += node->fields.field.nr_active_file;
1496         zi->total_workingset_refault += node->fields.field.workingset_refault;
1497     }
1498     return 0;
1499 }
1500 
1501 /* /proc/meminfo parsing routines */
1502 static bool meminfo_parse_line(char *line, union meminfo *mi) {
1503     char *cp = line;
1504     char *ap;
1505     char *save_ptr;
1506     int64_t val;
1507     int field_idx;
1508     enum field_match_result match_res;
1509 
1510     cp = strtok_r(line, " ", &save_ptr);
1511     if (!cp) {
1512         return false;
1513     }
1514 
1515     ap = strtok_r(NULL, " ", &save_ptr);
1516     if (!ap) {
1517         return false;
1518     }
1519 
1520     match_res = match_field(cp, ap, meminfo_field_names, MI_FIELD_COUNT,
1521         &val, &field_idx);
1522     if (match_res == PARSE_SUCCESS) {
1523         mi->arr[field_idx] = val / page_k;
1524     }
1525     return (match_res != PARSE_FAIL);
1526 }
1527 
1528 static int meminfo_parse(union meminfo *mi) {
1529     static struct reread_data file_data = {
1530         .filename = MEMINFO_PATH,
1531         .fd = -1,
1532     };
1533     char *buf;
1534     char *save_ptr;
1535     char *line;
1536 
1537     memset(mi, 0, sizeof(union meminfo));
1538 
1539     if ((buf = reread_file(&file_data)) == NULL) {
1540         return -1;
1541     }
1542 
1543     for (line = strtok_r(buf, "\n", &save_ptr); line;
1544          line = strtok_r(NULL, "\n", &save_ptr)) {
1545         if (!meminfo_parse_line(line, mi)) {
1546             ALOGE("%s parse error", file_data.filename);
1547             return -1;
1548         }
1549     }
1550     mi->field.nr_file_pages = mi->field.cached + mi->field.swap_cached +
1551         mi->field.buffers;
1552 
1553     return 0;
1554 }
1555 
1556 /* /proc/vmstat parsing routines */
1557 static bool vmstat_parse_line(char *line, union vmstat *vs) {
1558     char *cp;
1559     char *ap;
1560     char *save_ptr;
1561     int64_t val;
1562     int field_idx;
1563     enum field_match_result match_res;
1564 
1565     cp = strtok_r(line, " ", &save_ptr);
1566     if (!cp) {
1567         return false;
1568     }
1569 
1570     ap = strtok_r(NULL, " ", &save_ptr);
1571     if (!ap) {
1572         return false;
1573     }
1574 
1575     match_res = match_field(cp, ap, vmstat_field_names, VS_FIELD_COUNT,
1576         &val, &field_idx);
1577     if (match_res == PARSE_SUCCESS) {
1578         vs->arr[field_idx] = val;
1579     }
1580     return (match_res != PARSE_FAIL);
1581 }
1582 
1583 static int vmstat_parse(union vmstat *vs) {
1584     static struct reread_data file_data = {
1585         .filename = VMSTAT_PATH,
1586         .fd = -1,
1587     };
1588     char *buf;
1589     char *save_ptr;
1590     char *line;
1591 
1592     memset(vs, 0, sizeof(union vmstat));
1593 
1594     if ((buf = reread_file(&file_data)) == NULL) {
1595         return -1;
1596     }
1597 
1598     for (line = strtok_r(buf, "\n", &save_ptr); line;
1599          line = strtok_r(NULL, "\n", &save_ptr)) {
1600         if (!vmstat_parse_line(line, vs)) {
1601             ALOGE("%s parse error", file_data.filename);
1602             return -1;
1603         }
1604     }
1605 
1606     return 0;
1607 }
1608 
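/*
 * Emit a KILLINFO_LOG_TAG event describing the killed process along with a
 * snapshot of the /proc/meminfo fields, each value capped to INT32_MAX.
 */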
1609 static void killinfo_log(struct proc* procp, int min_oom_score, int tasksize,
1610                          int kill_reason, union meminfo *mi) {
1611     /* log process information */
1612     android_log_write_int32(ctx, procp->pid);
1613     android_log_write_int32(ctx, procp->uid);
1614     android_log_write_int32(ctx, procp->oomadj);
1615     android_log_write_int32(ctx, min_oom_score);
1616     android_log_write_int32(ctx, (int32_t)min(tasksize * page_k, INT32_MAX));
1617     android_log_write_int32(ctx, kill_reason);
1618 
1619     /* log meminfo fields */
1620     for (int field_idx = 0; field_idx < MI_FIELD_COUNT; field_idx++) {
1621         android_log_write_int32(ctx, (int32_t)min(mi->arr[field_idx] * page_k, INT32_MAX));
1622     }
1623 
1624     android_log_write_list(ctx, LOG_ID_EVENTS);
1625     android_log_reset(ctx);
1626 }
1627 
1628 static struct proc *proc_adj_lru(int oomadj) {
1629     return (struct proc *)adjslot_tail(&procadjslot_list[ADJTOSLOT(oomadj)]);
1630 }
1631 
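/*
 * Find the largest process registered at the given oomadj level. Entries whose
 * size can no longer be read are assumed dead and are removed along the way.
 */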
1632 static struct proc *proc_get_heaviest(int oomadj) {
1633     struct adjslot_list *head = &procadjslot_list[ADJTOSLOT(oomadj)];
1634     struct adjslot_list *curr = head->next;
1635     struct proc *maxprocp = NULL;
1636     int maxsize = 0;
1637     while (curr != head) {
1638         int pid = ((struct proc *)curr)->pid;
1639         int tasksize = proc_get_size(pid);
1640         if (tasksize <= 0) {
1641             struct adjslot_list *next = curr->next;
1642             pid_remove(pid);
1643             curr = next;
1644         } else {
1645             if (tasksize > maxsize) {
1646                 maxsize = tasksize;
1647                 maxprocp = (struct proc *)curr;
1648             }
1649             curr = curr->next;
1650         }
1651     }
1652     return maxprocp;
1653 }
1654 
1655 static void set_process_group_and_prio(int pid, SchedPolicy sp, int prio) {
1656     DIR* d;
1657     char proc_path[PATH_MAX];
1658     struct dirent* de;
1659 
1660     snprintf(proc_path, sizeof(proc_path), "/proc/%d/task", pid);
1661     if (!(d = opendir(proc_path))) {
1662         ALOGW("Failed to open %s; errno=%d: process pid(%d) might have died", proc_path, errno,
1663               pid);
1664         return;
1665     }
1666 
1667     while ((de = readdir(d))) {
1668         int t_pid;
1669 
1670         if (de->d_name[0] == '.') continue;
1671         t_pid = atoi(de->d_name);
1672 
1673         if (!t_pid) {
1674             ALOGW("Failed to get t_pid for '%s' of pid(%d)", de->d_name, pid);
1675             continue;
1676         }
1677 
1678         if (setpriority(PRIO_PROCESS, t_pid, prio) && errno != ESRCH) {
1679             ALOGW("Unable to raise priority of killing t_pid (%d): errno=%d", t_pid, errno);
1680         }
1681 
1682         if (set_cpuset_policy(t_pid, sp)) {
1683             ALOGW("Failed to set_cpuset_policy on pid(%d) t_pid(%d) to %d", pid, t_pid, (int)sp);
1684             continue;
1685         }
1686     }
1687     closedir(d);
1688 }
1689 
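/*
 * Returns true while the previously killed process is still being reaped.
 * With pidfd support the pending state is cleared by kill_done_handler();
 * without it, the existence of /proc/<pid> is used as the signal.
 */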
1690 static bool is_kill_pending(void) {
1691     char buf[24];
1692 
1693     if (last_kill_pid_or_fd < 0) {
1694         return false;
1695     }
1696 
1697     if (pidfd_supported) {
1698         return true;
1699     }
1700 
1701     /* when pidfd is not supported, base the decision on /proc/<pid> existence */
1702     snprintf(buf, sizeof(buf), "/proc/%d/", last_kill_pid_or_fd);
1703     if (access(buf, F_OK) == 0) {
1704         return true;
1705     }
1706 
1707     return false;
1708 }
1709 
1710 static bool is_waiting_for_kill(void) {
1711     return pidfd_supported && last_kill_pid_or_fd >= 0;
1712 }
1713 
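/*
 * Stop tracking the last killed process. When pidfds are in use this also
 * unregisters the pidfd from epoll and closes it; 'finished' only selects
 * which debug message gets logged.
 */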
1714 static void stop_wait_for_proc_kill(bool finished) {
1715     struct epoll_event epev;
1716 
1717     if (last_kill_pid_or_fd < 0) {
1718         return;
1719     }
1720 
1721     if (debug_process_killing) {
1722         struct timespec curr_tm;
1723 
1724         if (clock_gettime(CLOCK_MONOTONIC_COARSE, &curr_tm) != 0) {
1725             /*
1726              * curr_tm is used here merely to report kill duration, so this failure is not fatal.
1727              * Log an error and continue.
1728              */
1729             ALOGE("Failed to get current time");
1730         }
1731 
1732         if (finished) {
1733             ALOGI("Process got killed in %ldms",
1734                 get_time_diff_ms(&last_kill_tm, &curr_tm));
1735         } else {
1736             ALOGI("Stop waiting for process kill after %ldms",
1737                 get_time_diff_ms(&last_kill_tm, &curr_tm));
1738         }
1739     }
1740 
1741     if (pidfd_supported) {
1742         /* unregister fd */
1743         if (epoll_ctl(epollfd, EPOLL_CTL_DEL, last_kill_pid_or_fd, &epev) != 0) {
1744             ALOGE("epoll_ctl for last killed process failed; errno=%d", errno);
1745             return;
1746         }
1747         maxevents--;
1748         close(last_kill_pid_or_fd);
1749     }
1750 
1751     last_kill_pid_or_fd = -1;
1752 }
1753 
1754 static void kill_done_handler(int data __unused, uint32_t events __unused,
1755                               struct polling_params *poll_params) {
1756     stop_wait_for_proc_kill(true);
1757     poll_params->update = POLLING_RESUME;
1758 }
1759 
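/*
 * Remember the pid or pidfd of the process we just signalled. When pidfds are
 * supported the fd is also added to the epoll set so that kill_done_handler()
 * runs as soon as the process actually dies.
 */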
1760 static void start_wait_for_proc_kill(int pid_or_fd) {
1761     static struct event_handler_info kill_done_hinfo = { 0, kill_done_handler };
1762     struct epoll_event epev;
1763 
1764     if (last_kill_pid_or_fd >= 0) {
1765         /* Should not happen but if it does we should stop previous wait */
1766         ALOGE("Attempt to wait for a kill while another wait is in progress");
1767         stop_wait_for_proc_kill(false);
1768     }
1769 
1770     last_kill_pid_or_fd = pid_or_fd;
1771 
1772     if (!pidfd_supported) {
1773         /* If pidfd is not supported just store PID and exit */
1774         return;
1775     }
1776 
1777     epev.events = EPOLLIN;
1778     epev.data.ptr = (void *)&kill_done_hinfo;
1779     if (epoll_ctl(epollfd, EPOLL_CTL_ADD, last_kill_pid_or_fd, &epev) != 0) {
1780         ALOGE("epoll_ctl for last kill failed; errno=%d", errno);
1781         close(last_kill_pid_or_fd);
1782         last_kill_pid_or_fd = -1;
1783         return;
1784     }
1785     maxevents++;
1786 }
1787 
1788 /* Kill one process specified by procp. Returns the size (in pages) of the killed process or -1 on failure */
1789 static int kill_one_process(struct proc* procp, int min_oom_score, int kill_reason,
1790                             const char *kill_desc, union meminfo *mi, struct timespec *tm) {
1791     int pid = procp->pid;
1792     int pidfd = procp->pidfd;
1793     uid_t uid = procp->uid;
1794     int tgid;
1795     char *taskname;
1796     int tasksize;
1797     int r;
1798     int result = -1;
1799     struct memory_stat *mem_st;
1800     char buf[LINE_MAX];
1801 
1802     tgid = proc_get_tgid(pid);
1803     if (tgid >= 0 && tgid != pid) {
1804         ALOGE("Possible pid reuse detected (pid %d, tgid %d)!", pid, tgid);
1805         goto out;
1806     }
1807 
1808     taskname = proc_get_name(pid, buf, sizeof(buf));
1809     if (!taskname) {
1810         goto out;
1811     }
1812 
1813     tasksize = proc_get_size(pid);
1814     if (tasksize <= 0) {
1815         goto out;
1816     }
1817 
1818     mem_st = stats_read_memory_stat(per_app_memcg, pid, uid);
1819 
1820     TRACE_KILL_START(pid);
1821 
1822     /* CAP_KILL required */
1823     if (pidfd < 0) {
1824         start_wait_for_proc_kill(pid);
1825         r = kill(pid, SIGKILL);
1826     } else {
1827         start_wait_for_proc_kill(pidfd);
1828         r = sys_pidfd_send_signal(pidfd, SIGKILL, NULL, 0);
1829     }
1830 
1831     TRACE_KILL_END();
1832 
1833     if (r) {
1834         stop_wait_for_proc_kill(false);
1835         ALOGE("kill(%d): errno=%d", pid, errno);
1836         /* Delete process record even when we fail to kill so that we don't get stuck on it */
1837         goto out;
1838     }
1839 
1840     set_process_group_and_prio(pid, SP_FOREGROUND, ANDROID_PRIORITY_HIGHEST);
1841 
1842     last_kill_tm = *tm;
1843 
1844     inc_killcnt(procp->oomadj);
1845 
1846     killinfo_log(procp, min_oom_score, tasksize, kill_reason, mi);
1847 
1848     if (kill_desc) {
1849         ALOGI("Kill '%s' (%d), uid %d, oom_adj %d to free %ldkB; reason: %s", taskname, pid,
1850               uid, procp->oomadj, tasksize * page_k, kill_desc);
1851     } else {
1852         ALOGI("Kill '%s' (%d), uid %d, oom_adj %d to free %ldkB", taskname, pid,
1853               uid, procp->oomadj, tasksize * page_k);
1854     }
1855 
1856     stats_write_lmk_kill_occurred(LMK_KILL_OCCURRED, uid, taskname,
1857             procp->oomadj, min_oom_score, tasksize, mem_st);
1858 
1859     result = tasksize;
1860 
1861 out:
1862     /*
1863      * WARNING: After pid_remove() procp is freed and can't be used!
1864      * Therefore placed at the end of the function.
1865      */
1866     pid_remove(pid);
1867     return result;
1868 }
1869 
1870 /*
1871  * Find one process to kill at or above the given oom_adj level.
1872  * Returns size of the killed process.
1873  */
1874 static int find_and_kill_process(int min_score_adj, int kill_reason, const char *kill_desc,
1875                                  union meminfo *mi, struct timespec *tm) {
1876     int i;
1877     int killed_size = 0;
1878     bool lmk_state_change_start = false;
1879 
1880     for (i = OOM_SCORE_ADJ_MAX; i >= min_score_adj; i--) {
1881         struct proc *procp;
1882 
1883         while (true) {
1884             procp = kill_heaviest_task ?
1885                 proc_get_heaviest(i) : proc_adj_lru(i);
1886 
1887             if (!procp)
1888                 break;
1889 
1890             killed_size = kill_one_process(procp, min_score_adj, kill_reason, kill_desc, mi, tm);
1891             if (killed_size >= 0) {
1892                 if (!lmk_state_change_start) {
1893                     lmk_state_change_start = true;
1894                     stats_write_lmk_state_changed(LMK_STATE_CHANGED,
1895                                                   LMK_STATE_CHANGE_START);
1896                 }
1897                 break;
1898             }
1899         }
1900         if (killed_size) {
1901             break;
1902         }
1903     }
1904 
1905     if (lmk_state_change_start) {
1906         stats_write_lmk_state_changed(LMK_STATE_CHANGED, LMK_STATE_CHANGE_STOP);
1907     }
1908 
1909     return killed_size;
1910 }
1911 
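/* Read a memcg usage file and return its value in bytes, or -1 on error. */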
1912 static int64_t get_memory_usage(struct reread_data *file_data) {
1913     int ret;
1914     int64_t mem_usage;
1915     char *buf;
1916 
1917     if ((buf = reread_file(file_data)) == NULL) {
1918         return -1;
1919     }
1920 
1921     if (!parse_int64(buf, &mem_usage)) {
1922         ALOGE("%s parse error", file_data->filename);
1923         return -1;
1924     }
1925     if (mem_usage == 0) {
1926         ALOGE("No memory!");
1927         return -1;
1928     }
1929     return mem_usage;
1930 }
1931 
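/*
 * Track the smallest and largest nr_free_pages observed during low vmpressure
 * events. The maximum is later used to ignore pressure events that arrive when
 * free memory is already above the level at which low pressure was last seen.
 */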
1932 void record_low_pressure_levels(union meminfo *mi) {
1933     if (low_pressure_mem.min_nr_free_pages == -1 ||
1934         low_pressure_mem.min_nr_free_pages > mi->field.nr_free_pages) {
1935         if (debug_process_killing) {
1936             ALOGI("Low pressure min memory update from %" PRId64 " to %" PRId64,
1937                 low_pressure_mem.min_nr_free_pages, mi->field.nr_free_pages);
1938         }
1939         low_pressure_mem.min_nr_free_pages = mi->field.nr_free_pages;
1940     }
1941     /*
1942      * Free memory at low vmpressure events occasionally gets spikes,
1943      * possibly a stale low vmpressure event with memory already
1944      * freed up (no memory pressure should have been reported).
1945      * Ignore large jumps in max_nr_free_pages that would mess up our stats.
1946      */
1947     if (low_pressure_mem.max_nr_free_pages == -1 ||
1948         (low_pressure_mem.max_nr_free_pages < mi->field.nr_free_pages &&
1949          mi->field.nr_free_pages - low_pressure_mem.max_nr_free_pages <
1950          low_pressure_mem.max_nr_free_pages * 0.1)) {
1951         if (debug_process_killing) {
1952             ALOGI("Low pressure max memory update from %" PRId64 " to %" PRId64,
1953                 low_pressure_mem.max_nr_free_pages, mi->field.nr_free_pages);
1954         }
1955         low_pressure_mem.max_nr_free_pages = mi->field.nr_free_pages;
1956     }
1957 }
1958 
1959 enum vmpressure_level upgrade_level(enum vmpressure_level level) {
1960     return (enum vmpressure_level)((level < VMPRESS_LEVEL_CRITICAL) ?
1961         level + 1 : level);
1962 }
1963 
1964 enum vmpressure_level downgrade_level(enum vmpressure_level level) {
1965     return (enum vmpressure_level)((level > VMPRESS_LEVEL_LOW) ?
1966         level - 1 : level);
1967 }
1968 
1969 enum zone_watermark {
1970     WMARK_MIN = 0,
1971     WMARK_LOW,
1972     WMARK_HIGH,
1973     WMARK_NONE
1974 };
1975 
1976 struct zone_watermarks {
1977     long high_wmark;
1978     long low_wmark;
1979     long min_wmark;
1980 };
1981 
1982 /*
1983  * Returns lowest breached watermark or WMARK_NONE.
1984  */
1985 static enum zone_watermark get_lowest_watermark(union meminfo *mi,
1986                                                 struct zone_watermarks *watermarks)
1987 {
1988     int64_t nr_free_pages = mi->field.nr_free_pages - mi->field.cma_free;
1989 
1990     if (nr_free_pages < watermarks->min_wmark) {
1991         return WMARK_MIN;
1992     }
1993     if (nr_free_pages < watermarks->low_wmark) {
1994         return WMARK_LOW;
1995     }
1996     if (nr_free_pages < watermarks->high_wmark) {
1997         return WMARK_HIGH;
1998     }
1999     return WMARK_NONE;
2000 }
2001 
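/*
 * Compute system-wide min/low/high watermarks by summing, over every present
 * zone on every node, the zone watermark plus its highest lowmem protection.
 */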
2002 void calc_zone_watermarks(struct zoneinfo *zi, struct zone_watermarks *watermarks) {
2003     memset(watermarks, 0, sizeof(struct zone_watermarks));
2004 
2005     for (int node_idx = 0; node_idx < zi->node_count; node_idx++) {
2006         struct zoneinfo_node *node = &zi->nodes[node_idx];
2007         for (int zone_idx = 0; zone_idx < node->zone_count; zone_idx++) {
2008             struct zoneinfo_zone *zone = &node->zones[zone_idx];
2009 
2010             if (!zone->fields.field.present) {
2011                 continue;
2012             }
2013 
2014             watermarks->high_wmark += zone->max_protection + zone->fields.field.high;
2015             watermarks->low_wmark += zone->max_protection + zone->fields.field.low;
2016             watermarks->min_wmark += zone->max_protection + zone->fields.field.min;
2017         }
2018     }
2019 }
2020 
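/*
 * PSI event handler implementing the watermark/thrashing based kill strategy.
 * Skips the event while a previous kill is still pending, then samples
 * /proc/vmstat and /proc/meminfo, determines whether kswapd or direct reclaim
 * is active, and tracks thrashing as the percentage of the file-backed LRU
 * that has refaulted since the current reclaim cycle started. System-wide zone
 * watermarks are refreshed at most once per minute. Based on the breached
 * watermark, free swap and the thrashing level, a kill reason is selected and
 * find_and_kill_process() is invoked; polling parameters are then adjusted.
 */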
2021 static void mp_event_psi(int data, uint32_t events, struct polling_params *poll_params) {
2022     enum kill_reasons {
2023         NONE = -1, /* To denote no kill condition */
2024         PRESSURE_AFTER_KILL = 0,
2025         NOT_RESPONDING,
2026         LOW_SWAP_AND_THRASHING,
2027         LOW_MEM_AND_SWAP,
2028         LOW_MEM_AND_THRASHING,
2029         DIRECT_RECL_AND_THRASHING,
2030         KILL_REASON_COUNT
2031     };
2032     enum reclaim_state {
2033         NO_RECLAIM = 0,
2034         KSWAPD_RECLAIM,
2035         DIRECT_RECLAIM,
2036     };
2037     static int64_t init_ws_refault;
2038     static int64_t base_file_lru;
2039     static int64_t init_pgscan_kswapd;
2040     static int64_t init_pgscan_direct;
2041     static int64_t swap_low_threshold;
2042     static bool killing;
2043     static int thrashing_limit;
2044     static bool in_reclaim;
2045     static struct zone_watermarks watermarks;
2046     static struct timespec wmark_update_tm;
2047 
2048     union meminfo mi;
2049     union vmstat vs;
2050     struct timespec curr_tm;
2051     int64_t thrashing = 0;
2052     bool swap_is_low = false;
2053     enum vmpressure_level level = (enum vmpressure_level)data;
2054     enum kill_reasons kill_reason = NONE;
2055     bool cycle_after_kill = false;
2056     enum reclaim_state reclaim = NO_RECLAIM;
2057     enum zone_watermark wmark = WMARK_NONE;
2058     char kill_desc[LINE_MAX];
2059     bool cut_thrashing_limit = false;
2060     int min_score_adj = 0;
2061 
2062     /* Skip while still killing a process */
2063     if (is_kill_pending()) {
2064         goto no_kill;
2065     }
2066     /*
2067      * Process is dead, stop waiting. This has no effect if pidfds are supported and
2068      * death notification already caused waiting to stop.
2069      */
2070     stop_wait_for_proc_kill(true);
2071 
2072     if (clock_gettime(CLOCK_MONOTONIC_COARSE, &curr_tm) != 0) {
2073         ALOGE("Failed to get current time");
2074         return;
2075     }
2076 
2077     if (vmstat_parse(&vs) < 0) {
2078         ALOGE("Failed to parse vmstat!");
2079         return;
2080     }
2081 
2082     if (meminfo_parse(&mi) < 0) {
2083         ALOGE("Failed to parse meminfo!");
2084         return;
2085     }
2086 
2087     /* Reset states after process got killed */
2088     if (killing) {
2089         killing = false;
2090         cycle_after_kill = true;
2091         /* Reset file-backed pagecache size and refault amounts after a kill */
2092         base_file_lru = vs.field.nr_inactive_file + vs.field.nr_active_file;
2093         init_ws_refault = vs.field.workingset_refault;
2094     }
2095 
2096     /* Check free swap levels */
2097     if (swap_free_low_percentage) {
2098         if (!swap_low_threshold) {
2099             swap_low_threshold = mi.field.total_swap * swap_free_low_percentage / 100;
2100         }
2101         swap_is_low = mi.field.free_swap < swap_low_threshold;
2102     }
2103 
2104     /* Identify reclaim state */
2105     if (vs.field.pgscan_direct > init_pgscan_direct) {
2106         init_pgscan_direct = vs.field.pgscan_direct;
2107         init_pgscan_kswapd = vs.field.pgscan_kswapd;
2108         reclaim = DIRECT_RECLAIM;
2109     } else if (vs.field.pgscan_kswapd > init_pgscan_kswapd) {
2110         init_pgscan_kswapd = vs.field.pgscan_kswapd;
2111         reclaim = KSWAPD_RECLAIM;
2112     } else {
2113         in_reclaim = false;
2114         /* Skip if system is not reclaiming */
2115         goto no_kill;
2116     }
2117 
2118     if (!in_reclaim) {
2119         /* Record file-backed pagecache size when entering reclaim cycle */
2120         base_file_lru = vs.field.nr_inactive_file + vs.field.nr_active_file;
2121         init_ws_refault = vs.field.workingset_refault;
2122         thrashing_limit = thrashing_limit_pct;
2123     } else {
2124         /* Calculate what % of the file-backed pagecache refaulted so far */
2125         thrashing = (vs.field.workingset_refault - init_ws_refault) * 100 / base_file_lru;
2126     }
2127     in_reclaim = true;
2128 
2129     /*
2130      * Refresh watermarks once per min in case user updated one of the margins.
2131      * TODO: b/140521024 replace this periodic update with an API for AMS to notify LMKD
2132      * that zone watermarks were changed by the system software.
2133      */
2134     if (watermarks.high_wmark == 0 || get_time_diff_ms(&wmark_update_tm, &curr_tm) > 60000) {
2135         struct zoneinfo zi;
2136 
2137         if (zoneinfo_parse(&zi) < 0) {
2138             ALOGE("Failed to parse zoneinfo!");
2139             return;
2140         }
2141 
2142         calc_zone_watermarks(&zi, &watermarks);
2143         wmark_update_tm = curr_tm;
2144     }
2145 
2146     /* Find out which watermark is breached if any */
2147     wmark = get_lowest_watermark(&mi, &watermarks);
2148 
2149     /*
2150      * TODO: move this logic into a separate function
2151      * Decide if killing a process is necessary and record the reason
2152      */
2153     if (cycle_after_kill && wmark < WMARK_LOW) {
2154         /*
2155          * Prevent kills not freeing enough memory which might lead to OOM kill.
2156          * This might happen when a process is consuming memory faster than reclaim can
2157          * free even after a kill. Mostly happens when running memory stress tests.
2158          */
2159         kill_reason = PRESSURE_AFTER_KILL;
2160         strncpy(kill_desc, "min watermark is breached even after kill", sizeof(kill_desc));
2161     } else if (level == VMPRESS_LEVEL_CRITICAL && events != 0) {
2162         /*
2163          * Device is too busy reclaiming memory which might lead to ANR.
2164          * Critical level is triggered when PSI complete stall (all tasks are blocked because
2165          * of the memory congestion) breaches the configured threshold.
2166          */
2167         kill_reason = NOT_RESPONDING;
2168         strncpy(kill_desc, "device is not responding", sizeof(kill_desc));
2169     } else if (swap_is_low && thrashing > thrashing_limit_pct) {
2170         /* Page cache is thrashing while swap is low */
2171         kill_reason = LOW_SWAP_AND_THRASHING;
2172         snprintf(kill_desc, sizeof(kill_desc), "device is low on swap (%" PRId64
2173             "kB < %" PRId64 "kB) and thrashing (%" PRId64 "%%)",
2174             mi.field.free_swap * page_k, swap_low_threshold * page_k, thrashing);
2175     } else if (swap_is_low && wmark < WMARK_HIGH) {
2176         /* Both free memory and swap are low */
2177         kill_reason = LOW_MEM_AND_SWAP;
2178         snprintf(kill_desc, sizeof(kill_desc), "%s watermark is breached and swap is low (%"
2179             PRId64 "kB < %" PRId64 "kB)", wmark > WMARK_LOW ? "min" : "low",
2180             mi.field.free_swap * page_k, swap_low_threshold * page_k);
2181     } else if (wmark < WMARK_HIGH && thrashing > thrashing_limit) {
2182         /* Page cache is thrashing while memory is low */
2183         kill_reason = LOW_MEM_AND_THRASHING;
2184         snprintf(kill_desc, sizeof(kill_desc), "%s watermark is breached and thrashing (%"
2185             PRId64 "%%)", wmark > WMARK_LOW ? "min" : "low", thrashing);
2186         cut_thrashing_limit = true;
2187         /* Do not kill perceptible apps because of thrashing */
2188         min_score_adj = PERCEPTIBLE_APP_ADJ;
2189     } else if (reclaim == DIRECT_RECLAIM && thrashing > thrashing_limit) {
2190         /* Page cache is thrashing while in direct reclaim (mostly happens on lowram devices) */
2191         kill_reason = DIRECT_RECL_AND_THRASHING;
2192         snprintf(kill_desc, sizeof(kill_desc), "device is in direct reclaim and thrashing (%"
2193             PRId64 "%%)", thrashing);
2194         cut_thrashing_limit = true;
2195         /* Do not kill perceptible apps because of thrashing */
2196         min_score_adj = PERCEPTIBLE_APP_ADJ;
2197     }
2198 
2199     /* Kill a process if necessary */
2200     if (kill_reason != NONE) {
2201         int pages_freed = find_and_kill_process(min_score_adj, kill_reason, kill_desc, &mi,
2202                                                 &curr_tm);
2203         if (pages_freed > 0) {
2204             killing = true;
2205             if (cut_thrashing_limit) {
2206                 /*
2207                  * Cut thrashing limit by thrashing_limit_decay_pct percentage of the current
2208                  * thrashing limit until the system stops thrashing.
2209                  */
2210                 thrashing_limit = (thrashing_limit * (100 - thrashing_limit_decay_pct)) / 100;
2211             }
2212         }
2213     }
2214 
2215 no_kill:
2216     /* Do not poll if kernel supports pidfd waiting */
2217     if (is_waiting_for_kill()) {
2218         /* Pause polling if we are waiting for process death notification */
2219         poll_params->update = POLLING_PAUSE;
2220         return;
2221     }
2222 
2223     /*
2224      * Start polling after initial PSI event;
2225      * extend polling while device is in direct reclaim or process is being killed;
2226      * do not extend when kswapd reclaims because that might go on for a long time
2227      * without causing memory pressure
2228      */
2229     if (events || killing || reclaim == DIRECT_RECLAIM) {
2230         poll_params->update = POLLING_START;
2231     }
2232 
2233     /* Decide the polling interval */
2234     if (swap_is_low || killing) {
2235         /* Fast polling during and after a kill or when swap is low */
2236         poll_params->polling_interval_ms = PSI_POLL_PERIOD_SHORT_MS;
2237     } else {
2238         /* By default use long intervals */
2239         poll_params->polling_interval_ms = PSI_POLL_PERIOD_LONG_MS;
2240     }
2241 }
2242 
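/*
 * Legacy memory pressure handler used with vmpressure events (and with PSI
 * when the old strategy is selected). Determines the effective pressure level,
 * honors the no-kill timeout after a recent kill, and picks min_score_adj
 * either from the minfree/adj tables or from the per-level defaults, possibly
 * upgrading or downgrading the level based on the memcg memory vs. memory+swap
 * usage ratio, before calling find_and_kill_process().
 */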
2243 static void mp_event_common(int data, uint32_t events, struct polling_params *poll_params) {
2244     int ret;
2245     unsigned long long evcount;
2246     int64_t mem_usage, memsw_usage;
2247     int64_t mem_pressure;
2248     enum vmpressure_level lvl;
2249     union meminfo mi;
2250     struct zoneinfo zi;
2251     struct timespec curr_tm;
2252     static unsigned long kill_skip_count = 0;
2253     enum vmpressure_level level = (enum vmpressure_level)data;
2254     long other_free = 0, other_file = 0;
2255     int min_score_adj;
2256     int minfree = 0;
2257     static struct reread_data mem_usage_file_data = {
2258         .filename = MEMCG_MEMORY_USAGE,
2259         .fd = -1,
2260     };
2261     static struct reread_data memsw_usage_file_data = {
2262         .filename = MEMCG_MEMORYSW_USAGE,
2263         .fd = -1,
2264     };
2265 
2266     if (debug_process_killing) {
2267         ALOGI("%s memory pressure event is triggered", level_name[level]);
2268     }
2269 
2270     if (!use_psi_monitors) {
2271         /*
2272          * Check all event counters from low to critical
2273          * and upgrade to the highest priority one. By reading
2274          * eventfd we also reset the event counters.
2275          */
2276         for (lvl = VMPRESS_LEVEL_LOW; lvl < VMPRESS_LEVEL_COUNT; lvl++) {
2277             if (mpevfd[lvl] != -1 &&
2278                 TEMP_FAILURE_RETRY(read(mpevfd[lvl],
2279                                    &evcount, sizeof(evcount))) > 0 &&
2280                 evcount > 0 && lvl > level) {
2281                 level = lvl;
2282             }
2283         }
2284     }
2285 
2286     /* Start polling after initial PSI event */
2287     if (use_psi_monitors && events) {
2288         /* Override polling params only if current event is more critical */
2289         if (!poll_params->poll_handler || data > poll_params->poll_handler->data) {
2290             poll_params->polling_interval_ms = PSI_POLL_PERIOD_SHORT_MS;
2291             poll_params->update = POLLING_START;
2292         }
2293     }
2294 
2295     if (clock_gettime(CLOCK_MONOTONIC_COARSE, &curr_tm) != 0) {
2296         ALOGE("Failed to get current time");
2297         return;
2298     }
2299 
2300     if (kill_timeout_ms && get_time_diff_ms(&last_kill_tm, &curr_tm) < kill_timeout_ms) {
2301         /*
2302          * If we're within the no-kill timeout, see if there's pending reclaim work
2303          * from the last killed process. If so, skip killing for now.
2304          */
2305         if (is_kill_pending()) {
2306             kill_skip_count++;
2307             return;
2308         }
2309         /*
2310          * Process is dead, stop waiting. This has no effect if pidfds are supported and
2311          * death notification already caused waiting to stop.
2312          */
2313         stop_wait_for_proc_kill(true);
2314     } else {
2315         /*
2316          * Killing took longer than no-kill timeout. Stop waiting for the last process
2317          * to die because we are ready to kill again.
2318          */
2319         stop_wait_for_proc_kill(false);
2320     }
2321 
2322     if (kill_skip_count > 0) {
2323         ALOGI("%lu memory pressure events were skipped after a kill!",
2324               kill_skip_count);
2325         kill_skip_count = 0;
2326     }
2327 
2328     if (meminfo_parse(&mi) < 0 || zoneinfo_parse(&zi) < 0) {
2329         ALOGE("Failed to get free memory!");
2330         return;
2331     }
2332 
2333     if (use_minfree_levels) {
2334         int i;
2335 
2336         other_free = mi.field.nr_free_pages - zi.totalreserve_pages;
2337         if (mi.field.nr_file_pages > (mi.field.shmem + mi.field.unevictable + mi.field.swap_cached)) {
2338             other_file = (mi.field.nr_file_pages - mi.field.shmem -
2339                           mi.field.unevictable - mi.field.swap_cached);
2340         } else {
2341             other_file = 0;
2342         }
2343 
2344         min_score_adj = OOM_SCORE_ADJ_MAX + 1;
2345         for (i = 0; i < lowmem_targets_size; i++) {
2346             minfree = lowmem_minfree[i];
2347             if (other_free < minfree && other_file < minfree) {
2348                 min_score_adj = lowmem_adj[i];
2349                 break;
2350             }
2351         }
2352 
2353         if (min_score_adj == OOM_SCORE_ADJ_MAX + 1) {
2354             if (debug_process_killing) {
2355                 ALOGI("Ignore %s memory pressure event "
2356                       "(free memory=%ldkB, cache=%ldkB, limit=%ldkB)",
2357                       level_name[level], other_free * page_k, other_file * page_k,
2358                       (long)lowmem_minfree[lowmem_targets_size - 1] * page_k);
2359             }
2360             return;
2361         }
2362 
2363         goto do_kill;
2364     }
2365 
2366     if (level == VMPRESS_LEVEL_LOW) {
2367         record_low_pressure_levels(&mi);
2368     }
2369 
2370     if (level_oomadj[level] > OOM_SCORE_ADJ_MAX) {
2371         /* Do not monitor this pressure level */
2372         return;
2373     }
2374 
2375     if ((mem_usage = get_memory_usage(&mem_usage_file_data)) < 0) {
2376         goto do_kill;
2377     }
2378     if ((memsw_usage = get_memory_usage(&memsw_usage_file_data)) < 0) {
2379         goto do_kill;
2380     }
2381 
2382     // Calculate percent for swappiness.
2383     mem_pressure = (mem_usage * 100) / memsw_usage;
2384 
2385     if (enable_pressure_upgrade && level != VMPRESS_LEVEL_CRITICAL) {
2386         // We are swapping too much.
2387         if (mem_pressure < upgrade_pressure) {
2388             level = upgrade_level(level);
2389             if (debug_process_killing) {
2390                 ALOGI("Event upgraded to %s", level_name[level]);
2391             }
2392         }
2393     }
2394 
2395     // If we still have enough swap space available, check if we want to
2396     // ignore/downgrade pressure events.
2397     if (mi.field.free_swap >=
2398         mi.field.total_swap * swap_free_low_percentage / 100) {
2399         // If the pressure is larger than downgrade_pressure lmk will not
2400         // kill any process, since enough memory is available.
2401         if (mem_pressure > downgrade_pressure) {
2402             if (debug_process_killing) {
2403                 ALOGI("Ignore %s memory pressure", level_name[level]);
2404             }
2405             return;
2406         } else if (level == VMPRESS_LEVEL_CRITICAL && mem_pressure > upgrade_pressure) {
2407             if (debug_process_killing) {
2408                 ALOGI("Downgrade critical memory pressure");
2409             }
2410             // Downgrade event, since enough memory available.
2411             level = downgrade_level(level);
2412         }
2413     }
2414 
2415 do_kill:
2416     if (low_ram_device) {
2417         /* For Go devices kill only one task */
2418         if (find_and_kill_process(level_oomadj[level], -1, NULL, &mi, &curr_tm) == 0) {
2419             if (debug_process_killing) {
2420                 ALOGI("Nothing to kill");
2421             }
2422         }
2423     } else {
2424         int pages_freed;
2425         static struct timespec last_report_tm;
2426         static unsigned long report_skip_count = 0;
2427 
2428         if (!use_minfree_levels) {
2429             /* Free up enough memory to downgrade the memory pressure to low level */
2430             if (mi.field.nr_free_pages >= low_pressure_mem.max_nr_free_pages) {
2431                 if (debug_process_killing) {
2432                     ALOGI("Ignoring pressure since more memory is "
2433                         "available (%" PRId64 ") than watermark (%" PRId64 ")",
2434                         mi.field.nr_free_pages, low_pressure_mem.max_nr_free_pages);
2435                 }
2436                 return;
2437             }
2438             min_score_adj = level_oomadj[level];
2439         }
2440 
2441         pages_freed = find_and_kill_process(min_score_adj, -1, NULL, &mi, &curr_tm);
2442 
2443         if (pages_freed == 0) {
2444             /* Rate limit kill reports when nothing was reclaimed */
2445             if (get_time_diff_ms(&last_report_tm, &curr_tm) < FAIL_REPORT_RLIMIT_MS) {
2446                 report_skip_count++;
2447                 return;
2448             }
2449         }
2450 
2451         /* Log whenever we kill or when report rate limit allows */
2452         if (use_minfree_levels) {
2453             ALOGI("Reclaimed %ldkB, cache(%ldkB) and "
2454                 "free(%" PRId64 "kB)-reserved(%" PRId64 "kB) below min(%ldkB) for oom_adj %d",
2455                 pages_freed * page_k,
2456                 other_file * page_k, mi.field.nr_free_pages * page_k,
2457                 zi.totalreserve_pages * page_k,
2458                 minfree * page_k, min_score_adj);
2459         } else {
2460             ALOGI("Reclaimed %ldkB at oom_adj %d",
2461                 pages_freed * page_k, min_score_adj);
2462         }
2463 
2464         if (report_skip_count > 0) {
2465             ALOGI("Suppressed %lu failed kill reports", report_skip_count);
2466             report_skip_count = 0;
2467         }
2468 
2469         last_report_tm = curr_tm;
2470     }
2471     if (is_waiting_for_kill()) {
2472         /* pause polling if we are waiting for process death notification */
2473         poll_params->update = POLLING_PAUSE;
2474     }
2475 }
2476 
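/*
 * Create a PSI monitor for the given pressure level and register it with
 * epoll. Levels with threshold_ms set to 0 are silently skipped. The handler
 * is mp_event_psi or mp_event_common depending on the selected strategy.
 */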
2477 static bool init_mp_psi(enum vmpressure_level level, bool use_new_strategy) {
2478     int fd;
2479 
2480     /* Do not register a handler if threshold_ms is not set */
2481     if (!psi_thresholds[level].threshold_ms) {
2482         return true;
2483     }
2484 
2485     fd = init_psi_monitor(psi_thresholds[level].stall_type,
2486         psi_thresholds[level].threshold_ms * US_PER_MS,
2487         PSI_WINDOW_SIZE_MS * US_PER_MS);
2488 
2489     if (fd < 0) {
2490         return false;
2491     }
2492 
2493     vmpressure_hinfo[level].handler = use_new_strategy ? mp_event_psi : mp_event_common;
2494     vmpressure_hinfo[level].data = level;
2495     if (register_psi_monitor(epollfd, fd, &vmpressure_hinfo[level]) < 0) {
2496         destroy_psi_monitor(fd);
2497         return false;
2498     }
2499     maxevents++;
2500     mpevfd[level] = fd;
2501 
2502     return true;
2503 }
2504 
2505 static void destroy_mp_psi(enum vmpressure_level level) {
2506     int fd = mpevfd[level];
2507 
2508     if (unregister_psi_monitor(epollfd, fd) < 0) {
2509         ALOGE("Failed to unregister psi monitor for %s memory pressure; errno=%d",
2510             level_name[level], errno);
2511     }
2512     destroy_psi_monitor(fd);
2513     mpevfd[level] = -1;
2514 }
2515 
2516 static bool init_psi_monitors() {
2517     /*
2518      * When PSI is used on low-ram devices or on high-end devices without minfree levels
2519      * use new kill strategy based on zone watermarks, free swap and thrashing stats
2520      */
2521     bool use_new_strategy =
2522         property_get_bool("ro.lmk.use_new_strategy", low_ram_device || !use_minfree_levels);
2523 
2524     /* In default PSI mode override stall amounts using system properties */
2525     if (use_new_strategy) {
2526         /* Do not use low pressure level */
2527         psi_thresholds[VMPRESS_LEVEL_LOW].threshold_ms = 0;
2528         psi_thresholds[VMPRESS_LEVEL_MEDIUM].threshold_ms = psi_partial_stall_ms;
2529         psi_thresholds[VMPRESS_LEVEL_CRITICAL].threshold_ms = psi_complete_stall_ms;
2530     }
2531 
2532     if (!init_mp_psi(VMPRESS_LEVEL_LOW, use_new_strategy)) {
2533         return false;
2534     }
2535     if (!init_mp_psi(VMPRESS_LEVEL_MEDIUM, use_new_strategy)) {
2536         destroy_mp_psi(VMPRESS_LEVEL_LOW);
2537         return false;
2538     }
2539     if (!init_mp_psi(VMPRESS_LEVEL_CRITICAL, use_new_strategy)) {
2540         destroy_mp_psi(VMPRESS_LEVEL_MEDIUM);
2541         destroy_mp_psi(VMPRESS_LEVEL_LOW);
2542         return false;
2543     }
2544     return true;
2545 }
2546 
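/*
 * Register a memcg vmpressure listener for the given level: open
 * memory.pressure_level and cgroup.event_control in the root memcg, create an
 * eventfd and announce the "<event_fd> <pressure_fd> <level>" triplet to the
 * kernel, then add the eventfd to the epoll set.
 */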
2547 static bool init_mp_common(enum vmpressure_level level) {
2548     int mpfd;
2549     int evfd;
2550     int evctlfd;
2551     char buf[256];
2552     struct epoll_event epev;
2553     int ret;
2554     int level_idx = (int)level;
2555     const char *levelstr = level_name[level_idx];
2556 
2557     /* gid containing AID_SYSTEM required */
2558     mpfd = open(MEMCG_SYSFS_PATH "memory.pressure_level", O_RDONLY | O_CLOEXEC);
2559     if (mpfd < 0) {
2560         ALOGI("No kernel memory.pressure_level support (errno=%d)", errno);
2561         goto err_open_mpfd;
2562     }
2563 
2564     evctlfd = open(MEMCG_SYSFS_PATH "cgroup.event_control", O_WRONLY | O_CLOEXEC);
2565     if (evctlfd < 0) {
2566         ALOGI("No kernel memory cgroup event control (errno=%d)", errno);
2567         goto err_open_evctlfd;
2568     }
2569 
2570     evfd = eventfd(0, EFD_NONBLOCK | EFD_CLOEXEC);
2571     if (evfd < 0) {
2572         ALOGE("eventfd failed for level %s; errno=%d", levelstr, errno);
2573         goto err_eventfd;
2574     }
2575 
2576     ret = snprintf(buf, sizeof(buf), "%d %d %s", evfd, mpfd, levelstr);
2577     if (ret >= (ssize_t)sizeof(buf)) {
2578         ALOGE("cgroup.event_control line overflow for level %s", levelstr);
2579         goto err;
2580     }
2581 
2582     ret = TEMP_FAILURE_RETRY(write(evctlfd, buf, strlen(buf) + 1));
2583     if (ret == -1) {
2584         ALOGE("cgroup.event_control write failed for level %s; errno=%d",
2585               levelstr, errno);
2586         goto err;
2587     }
2588 
2589     epev.events = EPOLLIN;
2590     /* use data to store event level */
2591     vmpressure_hinfo[level_idx].data = level_idx;
2592     vmpressure_hinfo[level_idx].handler = mp_event_common;
2593     epev.data.ptr = (void *)&vmpressure_hinfo[level_idx];
2594     ret = epoll_ctl(epollfd, EPOLL_CTL_ADD, evfd, &epev);
2595     if (ret == -1) {
2596         ALOGE("epoll_ctl for level %s failed; errno=%d", levelstr, errno);
2597         goto err;
2598     }
2599     maxevents++;
2600     mpevfd[level] = evfd;
2601     close(evctlfd);
2602     return true;
2603 
2604 err:
2605     close(evfd);
2606 err_eventfd:
2607     close(evctlfd);
2608 err_open_evctlfd:
2609     close(mpfd);
2610 err_open_mpfd:
2611     return false;
2612 }
2613 
2614 static void kernel_event_handler(int data __unused, uint32_t events __unused,
2615                                  struct polling_params *poll_params __unused) {
2616     kpoll_info.handler(kpoll_info.poll_fd);
2617 }
2618 
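/*
 * One-time initialization: set up the epoll instance and the lmkd control
 * socket, choose the memory pressure detection mechanism (in-kernel
 * lowmemorykiller, PSI monitors or memcg vmpressure events), initialize the
 * process tables, pre-size the file read buffer using /proc/zoneinfo and probe
 * for pidfd_open support. Returns 0 on success.
 */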
2619 static int init(void) {
2620     static struct event_handler_info kernel_poll_hinfo = { 0, kernel_event_handler };
2621     struct reread_data file_data = {
2622         .filename = ZONEINFO_PATH,
2623         .fd = -1,
2624     };
2625     struct epoll_event epev;
2626     int pidfd;
2627     int i;
2628     int ret;
2629 
2630     page_k = sysconf(_SC_PAGESIZE);
2631     if (page_k == -1)
2632         page_k = PAGE_SIZE;
2633     page_k /= 1024;
2634 
2635     epollfd = epoll_create(MAX_EPOLL_EVENTS);
2636     if (epollfd == -1) {
2637         ALOGE("epoll_create failed (errno=%d)", errno);
2638         return -1;
2639     }
2640 
2641     // mark data connections as not connected
2642     for (int i = 0; i < MAX_DATA_CONN; i++) {
2643         data_sock[i].sock = -1;
2644     }
2645 
2646     ctrl_sock.sock = android_get_control_socket("lmkd");
2647     if (ctrl_sock.sock < 0) {
2648         ALOGE("get lmkd control socket failed");
2649         return -1;
2650     }
2651 
2652     ret = listen(ctrl_sock.sock, MAX_DATA_CONN);
2653     if (ret < 0) {
2654         ALOGE("lmkd control socket listen failed (errno=%d)", errno);
2655         return -1;
2656     }
2657 
2658     epev.events = EPOLLIN;
2659     ctrl_sock.handler_info.handler = ctrl_connect_handler;
2660     epev.data.ptr = (void *)&(ctrl_sock.handler_info);
2661     if (epoll_ctl(epollfd, EPOLL_CTL_ADD, ctrl_sock.sock, &epev) == -1) {
2662         ALOGE("epoll_ctl for lmkd control socket failed (errno=%d)", errno);
2663         return -1;
2664     }
2665     maxevents++;
2666 
2667     has_inkernel_module = !access(INKERNEL_MINFREE_PATH, W_OK);
2668     use_inkernel_interface = has_inkernel_module;
2669 
2670     if (use_inkernel_interface) {
2671         ALOGI("Using in-kernel low memory killer interface");
2672         if (init_poll_kernel(&kpoll_info)) {
2673             epev.events = EPOLLIN;
2674             epev.data.ptr = (void*)&kernel_poll_hinfo;
2675             if (epoll_ctl(epollfd, EPOLL_CTL_ADD, kpoll_info.poll_fd, &epev) != 0) {
2676                 ALOGE("epoll_ctl for lmk events failed (errno=%d)", errno);
2677                 close(kpoll_info.poll_fd);
2678                 kpoll_info.poll_fd = -1;
2679             } else {
2680                 maxevents++;
2681             }
2682         }
2683     } else {
2684         /* Try to use psi monitor first if kernel has it */
2685         use_psi_monitors = property_get_bool("ro.lmk.use_psi", true) &&
2686             init_psi_monitors();
2687         /* Fall back to vmpressure */
2688         if (!use_psi_monitors &&
2689             (!init_mp_common(VMPRESS_LEVEL_LOW) ||
2690             !init_mp_common(VMPRESS_LEVEL_MEDIUM) ||
2691             !init_mp_common(VMPRESS_LEVEL_CRITICAL))) {
2692             ALOGE("Kernel does not support memory pressure events or in-kernel low memory killer");
2693             return -1;
2694         }
2695         if (use_psi_monitors) {
2696             ALOGI("Using psi monitors for memory pressure detection");
2697         } else {
2698             ALOGI("Using vmpressure for memory pressure detection");
2699         }
2700     }
2701 
2702     for (i = 0; i <= ADJTOSLOT(OOM_SCORE_ADJ_MAX); i++) {
2703         procadjslot_list[i].next = &procadjslot_list[i];
2704         procadjslot_list[i].prev = &procadjslot_list[i];
2705     }
2706 
2707     memset(killcnt_idx, KILLCNT_INVALID_IDX, sizeof(killcnt_idx));
2708 
2709     /*
2710      * Read zoneinfo as the biggest file we read to create and size the initial
2711      * read buffer and avoid memory re-allocations during memory pressure
2712      */
2713     if (reread_file(&file_data) == NULL) {
2714         ALOGE("Failed to read %s: %s", file_data.filename, strerror(errno));
2715     }
2716 
2717     /* check if kernel supports pidfd_open syscall */
2718     pidfd = TEMP_FAILURE_RETRY(sys_pidfd_open(getpid(), 0));
2719     if (pidfd < 0) {
2720         pidfd_supported = (errno != ENOSYS);
2721     } else {
2722         pidfd_supported = true;
2723         close(pidfd);
2724     }
2725     ALOGI("Process polling is %s", pidfd_supported ? "supported" : "not supported" );
2726 
2727     return 0;
2728 }
2729 
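/*
 * Invoke an event handler and apply the polling decision it recorded in
 * poll_params: start, stop, pause or resume periodic polling. When the
 * decision is left unchanged, polling stops once PSI_WINDOW_SIZE_MS has
 * elapsed since it was started.
 */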
2730 static void call_handler(struct event_handler_info* handler_info,
2731                          struct polling_params *poll_params, uint32_t events) {
2732     struct timespec curr_tm;
2733 
2734     handler_info->handler(handler_info->data, events, poll_params);
2735     clock_gettime(CLOCK_MONOTONIC_COARSE, &curr_tm);
2736     poll_params->last_poll_tm = curr_tm;
2737 
2738     switch (poll_params->update) {
2739     case POLLING_START:
2740         /*
2741          * Poll for the duration of PSI_WINDOW_SIZE_MS after the
2742          * initial PSI event because psi events are rate-limited
2743          * at one per sec.
2744          */
2745         poll_params->poll_start_tm = curr_tm;
2746         poll_params->poll_handler = handler_info;
2747         break;
2748     case POLLING_STOP:
2749         poll_params->poll_handler = NULL;
2750         break;
2751     case POLLING_PAUSE:
2752         poll_params->paused_handler = handler_info;
2753         poll_params->poll_handler = NULL;
2754         break;
2755     case POLLING_RESUME:
2756         poll_params->poll_start_tm = curr_tm;
2757         poll_params->poll_handler = poll_params->paused_handler;
2758         break;
2759     case POLLING_DO_NOT_CHANGE:
2760         if (get_time_diff_ms(&poll_params->poll_start_tm, &curr_tm) > PSI_WINDOW_SIZE_MS) {
2761             /* Polled for the duration of PSI window, time to stop */
2762             poll_params->poll_handler = NULL;
2763         }
2764         /* WARNING: skipping the rest of the function */
2765         return;
2766     }
2767     poll_params->update = POLLING_DO_NOT_CHANGE;
2768 }
2769 
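/*
 * Main event loop. While a polling handler is armed, epoll_wait() is bounded
 * by the remaining polling interval and the handler is invoked when that
 * interval elapses; otherwise the loop blocks until an event arrives. Dropped
 * lmkd data connections (EPOLLHUP) are processed before all other events so
 * that a connection dropped and re-established within one epoll cycle is
 * handled correctly.
 */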
2770 static void mainloop(void) {
2771     struct event_handler_info* handler_info;
2772     struct polling_params poll_params;
2773     struct timespec curr_tm;
2774     struct epoll_event *evt;
2775     long delay = -1;
2776 
2777     poll_params.poll_handler = NULL;
    /* paused_handler is compared against poll_handler below; start from a known NULL value */
    poll_params.paused_handler = NULL;
2778     poll_params.update = POLLING_DO_NOT_CHANGE;
2779 
2780     while (1) {
2781         struct epoll_event events[maxevents];
2782         int nevents;
2783         int i;
2784 
2785         if (poll_params.poll_handler) {
2786             bool poll_now;
2787 
2788             clock_gettime(CLOCK_MONOTONIC_COARSE, &curr_tm);
2789             if (poll_params.poll_handler == poll_params.paused_handler) {
2790                 /*
2791                  * Just transitioned into POLLING_RESUME. Reset paused_handler
2792                  * and poll immediately
2793                  */
2794                 poll_params.paused_handler = NULL;
2795                 poll_now = true;
2796                 nevents = 0;
2797             } else {
2798                 /* Calculate next timeout */
2799                 delay = get_time_diff_ms(&poll_params.last_poll_tm, &curr_tm);
2800                 delay = (delay < poll_params.polling_interval_ms) ?
2801                     poll_params.polling_interval_ms - delay : poll_params.polling_interval_ms;
2802 
2803                 /* Wait for events until the next polling timeout */
2804                 nevents = epoll_wait(epollfd, events, maxevents, delay);
2805 
2806                 /* Update current time after wait */
2807                 clock_gettime(CLOCK_MONOTONIC_COARSE, &curr_tm);
2808                 poll_now = (get_time_diff_ms(&poll_params.last_poll_tm, &curr_tm) >=
2809                     poll_params.polling_interval_ms);
2810             }
2811             if (poll_now) {
2812                 call_handler(poll_params.poll_handler, &poll_params, 0);
2813             }
2814         } else {
2815             /* Wait for events with no timeout */
2816             nevents = epoll_wait(epollfd, events, maxevents, -1);
2817         }
2818 
2819         if (nevents == -1) {
2820             if (errno == EINTR)
2821                 continue;
2822             ALOGE("epoll_wait failed (errno=%d)", errno);
2823             continue;
2824         }
2825 
2826         /*
2827          * First pass to see if any data socket connections were dropped.
2828          * Dropped connection should be handled before any other events
2829          * to deallocate data connection and correctly handle cases when
2830          * connection gets dropped and reestablished in the same epoll cycle.
2831          * In such cases it's essential to handle connection closures first.
2832          */
2833         for (i = 0, evt = &events[0]; i < nevents; ++i, evt++) {
2834             if ((evt->events & EPOLLHUP) && evt->data.ptr) {
2835                 ALOGI("lmkd data connection dropped");
2836                 handler_info = (struct event_handler_info*)evt->data.ptr;
2837                 ctrl_data_close(handler_info->data);
2838             }
2839         }
2840 
2841         /* Second pass to handle all other events */
2842         for (i = 0, evt = &events[0]; i < nevents; ++i, evt++) {
2843             if (evt->events & EPOLLERR) {
2844                 ALOGD("EPOLLERR on event #%d", i);
2845             }
2846             if (evt->events & EPOLLHUP) {
2847                 /* This case was handled in the first pass */
2848                 continue;
2849             }
2850             if (evt->data.ptr) {
2851                 handler_info = (struct event_handler_info*)evt->data.ptr;
2852                 call_handler(handler_info, &poll_params, evt->events);
2853             }
2854         }
2855     }
2856 }
2857 
2858 int main(int argc __unused, char **argv __unused) {
2859     struct sched_param param = {
2860             .sched_priority = 1,
2861     };
2862 
2863     /* By default disable low level vmpressure events */
2864     level_oomadj[VMPRESS_LEVEL_LOW] =
2865         property_get_int32("ro.lmk.low", OOM_SCORE_ADJ_MAX + 1);
2866     level_oomadj[VMPRESS_LEVEL_MEDIUM] =
2867         property_get_int32("ro.lmk.medium", 800);
2868     level_oomadj[VMPRESS_LEVEL_CRITICAL] =
2869         property_get_int32("ro.lmk.critical", 0);
2870     debug_process_killing = property_get_bool("ro.lmk.debug", false);
2871 
2872     /* By default disable upgrade/downgrade logic */
2873     enable_pressure_upgrade =
2874         property_get_bool("ro.lmk.critical_upgrade", false);
2875     upgrade_pressure =
2876         (int64_t)property_get_int32("ro.lmk.upgrade_pressure", 100);
2877     downgrade_pressure =
2878         (int64_t)property_get_int32("ro.lmk.downgrade_pressure", 100);
2879     kill_heaviest_task =
2880         property_get_bool("ro.lmk.kill_heaviest_task", false);
2881     low_ram_device = property_get_bool("ro.config.low_ram", false);
2882     kill_timeout_ms =
2883         (unsigned long)property_get_int32("ro.lmk.kill_timeout_ms", 0);
2884     use_minfree_levels =
2885         property_get_bool("ro.lmk.use_minfree_levels", false);
2886     per_app_memcg =
2887         property_get_bool("ro.config.per_app_memcg", low_ram_device);
2888     swap_free_low_percentage = clamp(0, 100, property_get_int32("ro.lmk.swap_free_low_percentage",
2889         low_ram_device ? DEF_LOW_SWAP_LOWRAM : DEF_LOW_SWAP));
2890     psi_partial_stall_ms = property_get_int32("ro.lmk.psi_partial_stall_ms",
2891         low_ram_device ? DEF_PARTIAL_STALL_LOWRAM : DEF_PARTIAL_STALL);
2892     psi_complete_stall_ms = property_get_int32("ro.lmk.psi_complete_stall_ms",
2893         DEF_COMPLETE_STALL);
2894     thrashing_limit_pct = max(0, property_get_int32("ro.lmk.thrashing_limit",
2895         low_ram_device ? DEF_THRASHING_LOWRAM : DEF_THRASHING));
2896     thrashing_limit_decay_pct = clamp(0, 100, property_get_int32("ro.lmk.thrashing_limit_decay",
2897         low_ram_device ? DEF_THRASHING_DECAY_LOWRAM : DEF_THRASHING_DECAY));
2898 
2899     ctx = create_android_logger(KILLINFO_LOG_TAG);
2900 
2901     statslog_init();
2902 
2903     if (!init()) {
2904         if (!use_inkernel_interface) {
2905             /*
2906              * MCL_ONFAULT pins pages as they fault instead of loading
2907              * everything immediately all at once. (Which would be bad,
2908              * because as of this writing, we have a lot of mapped pages we
2909              * never use.) Old kernels will see MCL_ONFAULT and fail with
2910              * EINVAL; we ignore this failure.
2911              *
2912              * N.B. read the man page for mlockall. MCL_CURRENT | MCL_ONFAULT
2913              * pins ⊆ MCL_CURRENT, converging to just MCL_CURRENT as we fault
2914              * in pages.
2915              */
2916             /* CAP_IPC_LOCK required */
2917             if (mlockall(MCL_CURRENT | MCL_FUTURE | MCL_ONFAULT) && (errno != EINVAL)) {
2918                 ALOGW("mlockall failed %s", strerror(errno));
2919             }
2920 
2921             /* CAP_NICE required */
2922             if (sched_setscheduler(0, SCHED_FIFO, &param)) {
2923                 ALOGW("set SCHED_FIFO failed %s", strerror(errno));
2924             }
2925         }
2926 
2927         mainloop();
2928     }
2929 
2930     statslog_destroy();
2931 
2932     android_log_destroy(&ctx);
2933 
2934     ALOGI("exiting");
2935     return 0;
2936 }
2937