1 /*
2  * Copyright (c) 2017, 2018, Oracle and/or its affiliates. All rights reserved.
3  * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER.
4  *
5  * This code is free software; you can redistribute it and/or modify it
6  * under the terms of the GNU General Public License version 2 only, as
7  * published by the Free Software Foundation.
8  *
9  * This code is distributed in the hope that it will be useful, but WITHOUT
10  * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
11  * FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License
12  * version 2 for more details (a copy is included in the LICENSE file that
13  * accompanied this code).
14  *
15  * You should have received a copy of the GNU General Public License version
16  * 2 along with this work; if not, write to the Free Software Foundation,
17  * Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA.
18  *
19  * Please contact Oracle, 500 Oracle Parkway, Redwood Shores, CA 94065 USA
20  * or visit www.oracle.com if you need additional information or have any
21  * questions.
22  *
23  */
24 
25 #include <string.h>
26 #include <math.h>
27 #include <errno.h>
28 #include "utilities/globalDefinitions.hpp"
29 #include "memory/allocation.hpp"
30 #include "runtime/os.hpp"
31 #include "logging/log.hpp"
32 #include "osContainer_linux.hpp"
33 
// Divisor used by active_processor_count() to turn a cpu.shares value
// into a CPU count (share / PER_CPU_SHARES, rounded up).
#define PER_CPU_SHARES 1024

// Set to true by OSContainer::init() as soon as it starts running.
bool  OSContainer::_is_initialized   = false;
// Set to true at the end of OSContainer::init() once all four cgroup
// subsystems have been located.
bool  OSContainer::_is_containerized = false;
// Threshold at or above which a memory limit read from the cgroup files is
// reported as "unlimited"; init() sets it to LONG_MAX rounded down to a
// multiple of the VM page size.
julong _unlimited_memory;
39 
40 class CgroupSubsystem: CHeapObj<mtInternal> {
41  friend class OSContainer;
42 
43  private:
44     /* mountinfo contents */
45     char *_root;
46     char *_mount_point;
47 
48     /* Constructed subsystem directory */
49     char *_path;
50 
51  public:
CgroupSubsystem(char * root,char * mountpoint)52     CgroupSubsystem(char *root, char *mountpoint) {
53       _root = os::strdup(root);
54       _mount_point = os::strdup(mountpoint);
55       _path = NULL;
56     }
57 
58     /*
59      * Set directory to subsystem specific files based
60      * on the contents of the mountinfo and cgroup files.
61      */
set_subsystem_path(char * cgroup_path)62     void set_subsystem_path(char *cgroup_path) {
63       char buf[MAXPATHLEN+1];
64       if (_root != NULL && cgroup_path != NULL) {
65         if (strcmp(_root, "/") == 0) {
66           int buflen;
67           strncpy(buf, _mount_point, MAXPATHLEN);
68           buf[MAXPATHLEN-1] = '\0';
69           if (strcmp(cgroup_path,"/") != 0) {
70             buflen = strlen(buf);
71             if ((buflen + strlen(cgroup_path)) > (MAXPATHLEN-1)) {
72               return;
73             }
74             strncat(buf, cgroup_path, MAXPATHLEN-buflen);
75             buf[MAXPATHLEN-1] = '\0';
76           }
77           _path = os::strdup(buf);
78         } else {
79           if (strcmp(_root, cgroup_path) == 0) {
80             strncpy(buf, _mount_point, MAXPATHLEN);
81             buf[MAXPATHLEN-1] = '\0';
82             _path = os::strdup(buf);
83           } else {
84             char *p = strstr(_root, cgroup_path);
85             if (p != NULL && p == _root) {
86               if (strlen(cgroup_path) > strlen(_root)) {
87                 int buflen;
88                 strncpy(buf, _mount_point, MAXPATHLEN);
89                 buf[MAXPATHLEN-1] = '\0';
90                 buflen = strlen(buf);
91                 if ((buflen + strlen(cgroup_path)) > (MAXPATHLEN-1)) {
92                   return;
93                 }
94                 strncat(buf, cgroup_path + strlen(_root), MAXPATHLEN-buflen);
95                 buf[MAXPATHLEN-1] = '\0';
96                 _path = os::strdup(buf);
97               }
98             }
99           }
100         }
101       }
102     }
103 
subsystem_path()104     char *subsystem_path() { return _path; }
105 };
106 
// One entry per cgroup controller used by this file.  All start out NULL and
// are assigned by OSContainer::init() while it scans /proc/self/mountinfo.
CgroupSubsystem* memory = NULL;
CgroupSubsystem* cpuset = NULL;
CgroupSubsystem* cpu = NULL;
CgroupSubsystem* cpuacct = NULL;

// Return type name handed to GET_CONTAINER_INFO_CPTR, which casts NULL to it.
typedef char * cptr;
113 
114 PRAGMA_DIAG_PUSH
115 PRAGMA_FORMAT_NONLITERAL_IGNORED
subsystem_file_contents(CgroupSubsystem * c,const char * filename,const char * scan_fmt,T returnval)116 template <typename T> int subsystem_file_contents(CgroupSubsystem* c,
117                                               const char *filename,
118                                               const char *scan_fmt,
119                                               T returnval) {
120   FILE *fp = NULL;
121   char *p;
122   char file[MAXPATHLEN+1];
123   char buf[MAXPATHLEN+1];
124 
125   if (c == NULL) {
126     log_debug(os, container)("subsystem_file_contents: CgroupSubsytem* is NULL");
127     return OSCONTAINER_ERROR;
128   }
129   if (c->subsystem_path() == NULL) {
130     log_debug(os, container)("subsystem_file_contents: subsystem path is NULL");
131     return OSCONTAINER_ERROR;
132   }
133 
134   strncpy(file, c->subsystem_path(), MAXPATHLEN);
135   file[MAXPATHLEN-1] = '\0';
136   int filelen = strlen(file);
137   if ((filelen + strlen(filename)) > (MAXPATHLEN-1)) {
138     log_debug(os, container)("File path too long %s, %s", file, filename);
139     return OSCONTAINER_ERROR;
140   }
141   strncat(file, filename, MAXPATHLEN-filelen);
142   log_trace(os, container)("Path to %s is %s", filename, file);
143   fp = fopen(file, "r");
144   if (fp != NULL) {
145     p = fgets(buf, MAXPATHLEN, fp);
146     if (p != NULL) {
147       int matched = sscanf(p, scan_fmt, returnval);
148       if (matched == 1) {
149         fclose(fp);
150         return 0;
151       } else {
152         log_debug(os, container)("Type %s not found in file %s", scan_fmt, file);
153       }
154     } else {
155       log_debug(os, container)("Empty file %s", file);
156     }
157   } else {
158     log_debug(os, container)("Open of file %s failed, %s", file, os::strerror(errno));
159   }
160   if (fp != NULL)
161     fclose(fp);
162   return OSCONTAINER_ERROR;
163 }
164 PRAGMA_DIAG_POP
165 
/*
 * Declares 'variable' of type return_type in the enclosing scope and fills
 * it from 'filename' of 'subsystem' via subsystem_file_contents().
 * On failure the ENCLOSING function returns OSCONTAINER_ERROR cast to
 * return_type; on success the value is trace-logged with 'logstring'.
 */
#define GET_CONTAINER_INFO(return_type, subsystem, filename,              \
                           logstring, scan_fmt, variable)                 \
  return_type variable;                                                   \
{                                                                         \
  const int status = subsystem_file_contents(subsystem,                   \
                                             filename,                    \
                                             scan_fmt,                    \
                                             &variable);                  \
  if (status != 0) {                                                      \
    return (return_type) OSCONTAINER_ERROR;                               \
  }                                                                       \
                                                                          \
  log_trace(os, container)(logstring, variable);                          \
}
180 
/*
 * String flavour of GET_CONTAINER_INFO: declares the char array
 * 'variable[bufsize]' in the enclosing scope and fills it from 'filename'
 * of 'subsystem' via subsystem_file_contents().
 * On failure the ENCLOSING function returns NULL cast to return_type;
 * on success the string is trace-logged with 'logstring'.
 */
#define GET_CONTAINER_INFO_CPTR(return_type, subsystem, filename,         \
                               logstring, scan_fmt, variable, bufsize)    \
  char variable[bufsize];                                                 \
{                                                                         \
  const int status = subsystem_file_contents(subsystem,                   \
                                             filename,                    \
                                             scan_fmt,                    \
                                             variable);                   \
  if (status != 0) {                                                      \
    return (return_type) NULL;                                            \
  }                                                                       \
                                                                          \
  log_trace(os, container)(logstring, variable);                          \
}
195 
196 /* init
197  *
198  * Initialize the container support and determine if
199  * we are running under cgroup control.
200  */
init()201 void OSContainer::init() {
202   int mountid;
203   int parentid;
204   int major;
205   int minor;
206   FILE *mntinfo = NULL;
207   FILE *cgroup = NULL;
208   char buf[MAXPATHLEN+1];
209   char tmproot[MAXPATHLEN+1];
210   char tmpmount[MAXPATHLEN+1];
211   char tmpbase[MAXPATHLEN+1];
212   char *p;
213   jlong mem_limit;
214 
215   assert(!_is_initialized, "Initializing OSContainer more than once");
216 
217   _is_initialized = true;
218   _is_containerized = false;
219 
220   _unlimited_memory = (LONG_MAX / os::vm_page_size()) * os::vm_page_size();
221 
222   log_trace(os, container)("OSContainer::init: Initializing Container Support");
223   if (!UseContainerSupport) {
224     log_trace(os, container)("Container Support not enabled");
225     return;
226   }
227 
228   /*
229    * Find the cgroup mount point for memory and cpuset
230    * by reading /proc/self/mountinfo
231    *
232    * Example for docker:
233    * 219 214 0:29 /docker/7208cebd00fa5f2e342b1094f7bed87fa25661471a4637118e65f1c995be8a34 /sys/fs/cgroup/memory ro,nosuid,nodev,noexec,relatime - cgroup cgroup rw,memory
234    *
235    * Example for host:
236    * 34 28 0:29 / /sys/fs/cgroup/memory rw,nosuid,nodev,noexec,relatime shared:16 - cgroup cgroup rw,memory
237    */
238   mntinfo = fopen("/proc/self/mountinfo", "r");
239   if (mntinfo == NULL) {
240       log_debug(os, container)("Can't open /proc/self/mountinfo, %s",
241                                os::strerror(errno));
242       return;
243   }
244 
245   while ( (p = fgets(buf, MAXPATHLEN, mntinfo)) != NULL) {
246     // Look for the filesystem type and see if it's cgroup
247     char fstype[MAXPATHLEN+1];
248     fstype[0] = '\0';
249     char *s =  strstr(p, " - ");
250     if (s != NULL &&
251         sscanf(s, " - %s", fstype) == 1 &&
252         strcmp(fstype, "cgroup") == 0) {
253 
254       if (strstr(p, "memory") != NULL) {
255         int matched = sscanf(p, "%d %d %d:%d %s %s",
256                              &mountid,
257                              &parentid,
258                              &major,
259                              &minor,
260                              tmproot,
261                              tmpmount);
262         if (matched == 6) {
263           memory = new CgroupSubsystem(tmproot, tmpmount);
264         }
265         else
266           log_debug(os, container)("Incompatible str containing cgroup and memory: %s", p);
267       } else if (strstr(p, "cpuset") != NULL) {
268         int matched = sscanf(p, "%d %d %d:%d %s %s",
269                              &mountid,
270                              &parentid,
271                              &major,
272                              &minor,
273                              tmproot,
274                              tmpmount);
275         if (matched == 6) {
276           cpuset = new CgroupSubsystem(tmproot, tmpmount);
277         }
278         else {
279           log_debug(os, container)("Incompatible str containing cgroup and cpuset: %s", p);
280         }
281       } else if (strstr(p, "cpu,cpuacct") != NULL || strstr(p, "cpuacct,cpu") != NULL) {
282         int matched = sscanf(p, "%d %d %d:%d %s %s",
283                              &mountid,
284                              &parentid,
285                              &major,
286                              &minor,
287                              tmproot,
288                              tmpmount);
289         if (matched == 6) {
290           cpu = new CgroupSubsystem(tmproot, tmpmount);
291           cpuacct = new CgroupSubsystem(tmproot, tmpmount);
292         }
293         else {
294           log_debug(os, container)("Incompatible str containing cgroup and cpu,cpuacct: %s", p);
295         }
296       } else if (strstr(p, "cpuacct") != NULL) {
297         int matched = sscanf(p, "%d %d %d:%d %s %s",
298                              &mountid,
299                              &parentid,
300                              &major,
301                              &minor,
302                              tmproot,
303                              tmpmount);
304         if (matched == 6) {
305           cpuacct = new CgroupSubsystem(tmproot, tmpmount);
306         }
307         else {
308           log_debug(os, container)("Incompatible str containing cgroup and cpuacct: %s", p);
309         }
310       } else if (strstr(p, "cpu") != NULL) {
311         int matched = sscanf(p, "%d %d %d:%d %s %s",
312                              &mountid,
313                              &parentid,
314                              &major,
315                              &minor,
316                              tmproot,
317                              tmpmount);
318         if (matched == 6) {
319           cpu = new CgroupSubsystem(tmproot, tmpmount);
320         }
321         else {
322           log_debug(os, container)("Incompatible str containing cgroup and cpu: %s", p);
323         }
324       }
325     }
326   }
327 
328   fclose(mntinfo);
329 
330   if (memory == NULL) {
331     log_debug(os, container)("Required cgroup memory subsystem not found");
332     return;
333   }
334   if (cpuset == NULL) {
335     log_debug(os, container)("Required cgroup cpuset subsystem not found");
336     return;
337   }
338   if (cpu == NULL) {
339     log_debug(os, container)("Required cgroup cpu subsystem not found");
340     return;
341   }
342   if (cpuacct == NULL) {
343     log_debug(os, container)("Required cgroup cpuacct subsystem not found");
344     return;
345   }
346 
347   /*
348    * Read /proc/self/cgroup and map host mount point to
349    * local one via /proc/self/mountinfo content above
350    *
351    * Docker example:
352    * 5:memory:/docker/6558aed8fc662b194323ceab5b964f69cf36b3e8af877a14b80256e93aecb044
353    *
354    * Host example:
355    * 5:memory:/user.slice
356    *
357    * Construct a path to the process specific memory and cpuset
358    * cgroup directory.
359    *
360    * For a container running under Docker from memory example above
361    * the paths would be:
362    *
363    * /sys/fs/cgroup/memory
364    *
365    * For a Host from memory example above the path would be:
366    *
367    * /sys/fs/cgroup/memory/user.slice
368    *
369    */
370   cgroup = fopen("/proc/self/cgroup", "r");
371   if (cgroup == NULL) {
372     log_debug(os, container)("Can't open /proc/self/cgroup, %s",
373                              os::strerror(errno));
374     return;
375   }
376 
377   while ( (p = fgets(buf, MAXPATHLEN, cgroup)) != NULL) {
378     int cgno;
379     int matched;
380     char *controller;
381     char *base;
382 
383     /* Skip cgroup number */
384     strsep(&p, ":");
385     /* Get controller and base */
386     controller = strsep(&p, ":");
387     base = strsep(&p, "\n");
388 
389     if (controller != NULL) {
390       if (strstr(controller, "memory") != NULL) {
391         memory->set_subsystem_path(base);
392       } else if (strstr(controller, "cpuset") != NULL) {
393         cpuset->set_subsystem_path(base);
394       } else if (strstr(controller, "cpu,cpuacct") != NULL || strstr(controller, "cpuacct,cpu") != NULL) {
395         cpu->set_subsystem_path(base);
396         cpuacct->set_subsystem_path(base);
397       } else if (strstr(controller, "cpuacct") != NULL) {
398         cpuacct->set_subsystem_path(base);
399       } else if (strstr(controller, "cpu") != NULL) {
400         cpu->set_subsystem_path(base);
401       }
402     }
403   }
404 
405   fclose(cgroup);
406 
407   // We need to update the amount of physical memory now that
408   // command line arguments have been processed.
409   if ((mem_limit = memory_limit_in_bytes()) > 0) {
410     os::Linux::set_physical_memory(mem_limit);
411   }
412 
413   _is_containerized = true;
414 
415 }
416 
container_type()417 const char * OSContainer::container_type() {
418   if (is_containerized()) {
419     return "cgroupv1";
420   } else {
421     return NULL;
422   }
423 }
424 
425 
426 /* memory_limit_in_bytes
427  *
428  * Return the limit of available memory for this process.
429  *
430  * return:
431  *    memory limit in bytes or
432  *    -1 for unlimited
433  *    OSCONTAINER_ERROR for not supported
434  */
memory_limit_in_bytes()435 jlong OSContainer::memory_limit_in_bytes() {
436   GET_CONTAINER_INFO(julong, memory, "/memory.limit_in_bytes",
437                      "Memory Limit is: " JULONG_FORMAT, JULONG_FORMAT, memlimit);
438 
439   if (memlimit >= _unlimited_memory) {
440     log_trace(os, container)("Memory Limit is: Unlimited");
441     return (jlong)-1;
442   }
443   else {
444     return (jlong)memlimit;
445   }
446 }
447 
memory_and_swap_limit_in_bytes()448 jlong OSContainer::memory_and_swap_limit_in_bytes() {
449   GET_CONTAINER_INFO(julong, memory, "/memory.memsw.limit_in_bytes",
450                      "Memory and Swap Limit is: " JULONG_FORMAT, JULONG_FORMAT, memswlimit);
451   if (memswlimit >= _unlimited_memory) {
452     log_trace(os, container)("Memory and Swap Limit is: Unlimited");
453     return (jlong)-1;
454   } else {
455     return (jlong)memswlimit;
456   }
457 }
458 
memory_soft_limit_in_bytes()459 jlong OSContainer::memory_soft_limit_in_bytes() {
460   GET_CONTAINER_INFO(julong, memory, "/memory.soft_limit_in_bytes",
461                      "Memory Soft Limit is: " JULONG_FORMAT, JULONG_FORMAT, memsoftlimit);
462   if (memsoftlimit >= _unlimited_memory) {
463     log_trace(os, container)("Memory Soft Limit is: Unlimited");
464     return (jlong)-1;
465   } else {
466     return (jlong)memsoftlimit;
467   }
468 }
469 
470 /* memory_usage_in_bytes
471  *
472  * Return the amount of used memory for this process.
473  *
474  * return:
475  *    memory usage in bytes or
476  *    -1 for unlimited
477  *    OSCONTAINER_ERROR for not supported
478  */
memory_usage_in_bytes()479 jlong OSContainer::memory_usage_in_bytes() {
480   GET_CONTAINER_INFO(jlong, memory, "/memory.usage_in_bytes",
481                      "Memory Usage is: " JLONG_FORMAT, JLONG_FORMAT, memusage);
482   return memusage;
483 }
484 
485 /* memory_max_usage_in_bytes
486  *
487  * Return the maximum amount of used memory for this process.
488  *
489  * return:
490  *    max memory usage in bytes or
491  *    OSCONTAINER_ERROR for not supported
492  */
memory_max_usage_in_bytes()493 jlong OSContainer::memory_max_usage_in_bytes() {
494   GET_CONTAINER_INFO(jlong, memory, "/memory.max_usage_in_bytes",
495                      "Maximum Memory Usage is: " JLONG_FORMAT, JLONG_FORMAT, memmaxusage);
496   return memmaxusage;
497 }
498 
499 /* active_processor_count
500  *
501  * Calculate an appropriate number of active processors for the
502  * VM to use based on these three inputs.
503  *
504  * cpu affinity
505  * cgroup cpu quota & cpu period
506  * cgroup cpu shares
507  *
508  * Algorithm:
509  *
510  * Determine the number of available CPUs from sched_getaffinity
511  *
512  * If user specified a quota (quota != -1), calculate the number of
513  * required CPUs by dividing quota by period.
514  *
515  * If shares are in effect (shares != -1), calculate the number
516  * of CPUs required for the shares by dividing the share value
517  * by PER_CPU_SHARES.
518  *
519  * All results of division are rounded up to the next whole number.
520  *
521  * If neither shares or quotas have been specified, return the
522  * number of active processors in the system.
523  *
524  * If both shares and quotas have been specified, the results are
525  * based on the flag PreferContainerQuotaForCPUCount.  If true,
526  * return the quota value.  If false return the smallest value
527  * between shares or quotas.
528  *
529  * If shares and/or quotas have been specified, the resulting number
530  * returned will never exceed the number of active processors.
531  *
532  * return:
533  *    number of CPUs
534  */
active_processor_count()535 int OSContainer::active_processor_count() {
536   int quota_count = 0, share_count = 0;
537   int cpu_count, limit_count;
538   int result;
539 
540   cpu_count = limit_count = os::Linux::active_processor_count();
541   int quota  = cpu_quota();
542   int period = cpu_period();
543   int share  = cpu_shares();
544 
545   if (quota > -1 && period > 0) {
546     quota_count = ceilf((float)quota / (float)period);
547     log_trace(os, container)("CPU Quota count based on quota/period: %d", quota_count);
548   }
549   if (share > -1) {
550     share_count = ceilf((float)share / (float)PER_CPU_SHARES);
551     log_trace(os, container)("CPU Share count based on shares: %d", share_count);
552   }
553 
554   // If both shares and quotas are setup results depend
555   // on flag PreferContainerQuotaForCPUCount.
556   // If true, limit CPU count to quota
557   // If false, use minimum of shares and quotas
558   if (quota_count !=0 && share_count != 0) {
559     if (PreferContainerQuotaForCPUCount) {
560       limit_count = quota_count;
561     } else {
562       limit_count = MIN2(quota_count, share_count);
563     }
564   } else if (quota_count != 0) {
565     limit_count = quota_count;
566   } else if (share_count != 0) {
567     limit_count = share_count;
568   }
569 
570   result = MIN2(cpu_count, limit_count);
571   log_trace(os, container)("OSContainer::active_processor_count: %d", result);
572   return result;
573 }
574 
cpu_cpuset_cpus()575 char * OSContainer::cpu_cpuset_cpus() {
576   GET_CONTAINER_INFO_CPTR(cptr, cpuset, "/cpuset.cpus",
577                      "cpuset.cpus is: %s", "%1023s", cpus, 1024);
578   return os::strdup(cpus);
579 }
580 
cpu_cpuset_memory_nodes()581 char * OSContainer::cpu_cpuset_memory_nodes() {
582   GET_CONTAINER_INFO_CPTR(cptr, cpuset, "/cpuset.mems",
583                      "cpuset.mems is: %s", "%1023s", mems, 1024);
584   return os::strdup(mems);
585 }
586 
587 /* cpu_quota
588  *
589  * Return the number of milliseconds per period
590  * process is guaranteed to run.
591  *
592  * return:
593  *    quota time in milliseconds
594  *    -1 for no quota
595  *    OSCONTAINER_ERROR for not supported
596  */
cpu_quota()597 int OSContainer::cpu_quota() {
598   GET_CONTAINER_INFO(int, cpu, "/cpu.cfs_quota_us",
599                      "CPU Quota is: %d", "%d", quota);
600   return quota;
601 }
602 
cpu_period()603 int OSContainer::cpu_period() {
604   GET_CONTAINER_INFO(int, cpu, "/cpu.cfs_period_us",
605                      "CPU Period is: %d", "%d", period);
606   return period;
607 }
608 
609 /* cpu_shares
610  *
611  * Return the amount of cpu shares available to the process
612  *
613  * return:
614  *    Share number (typically a number relative to 1024)
615  *                 (2048 typically expresses 2 CPUs worth of processing)
616  *    -1 for no share setup
617  *    OSCONTAINER_ERROR for not supported
618  */
cpu_shares()619 int OSContainer::cpu_shares() {
620   GET_CONTAINER_INFO(int, cpu, "/cpu.shares",
621                      "CPU Shares is: %d", "%d", shares);
622   // Convert 1024 to no shares setup
623   if (shares == 1024) return -1;
624 
625   return shares;
626 }
627 
628