1 // This file is part of BOINC.
2 // http://boinc.berkeley.edu
3 // Copyright (C) 2008 University of California
4 //
5 // BOINC is free software; you can redistribute it and/or modify it
6 // under the terms of the GNU Lesser General Public License
7 // as published by the Free Software Foundation,
8 // either version 3 of the License, or (at your option) any later version.
9 //
10 // BOINC is distributed in the hope that it will be useful,
11 // but WITHOUT ANY WARRANTY; without even the implied warranty of
12 // MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.
13 // See the GNU Lesser General Public License for more details.
14 //
15 // You should have received a copy of the GNU Lesser General Public License
16 // along with BOINC. If not, see <http://www.gnu.org/licenses/>.
17
18 // monitoring and process control of running apps
19
20 #include "cpp.h"
21
22 #ifdef _WIN32
23 #include "boinc_win.h"
24 #include "win_util.h"
25 #ifdef _MSC_VER
26 #define snprintf _snprintf
27 #endif
28 #ifndef STATUS_SUCCESS
29 #define STATUS_SUCCESS 0x0 // may be in ntstatus.h
30 #endif
31 #ifndef STATUS_DLL_INIT_FAILED
32 #define STATUS_DLL_INIT_FAILED 0xC0000142 // may be in ntstatus.h
33 #endif
34 #ifndef STATUS_DLL_INIT_FAILED_LOGOFF
35 #define STATUS_DLL_INIT_FAILED_LOGOFF 0xC000026B // may be in ntstatus.h
36 #endif
37
38 #else
39 #include "config.h"
40 #include <string>
41 #include <unistd.h>
42
43 #if HAVE_SYS_IPC_H
44 #include <sys/ipc.h>
45 #endif
46 #if HAVE_SYS_RESOURCE_H
47 #include <sys/resource.h>
48 #endif
49 #if HAVE_CSIGNAL
50 #include <csignal>
51 #elif HAVE_SYS_SIGNAL_H
52 #include <sys/signal.h>
53 #elif HAVE_SIGNAL_H
54 #include <signal.h>
55 #endif
56 #if HAVE_SYS_WAIT_H
57 #include <sys/wait.h>
58 #endif
59
60 #include <vector>
61
62 #endif
63
64 using std::vector;
65
66 #include "error_numbers.h"
67 #include "filesys.h"
68 #include "parse.h"
69 #include "shmem.h"
70 #include "str_replace.h"
71 #include "str_util.h"
72 #include "util.h"
73
74 #include "client_msgs.h"
75 #include "client_state.h"
76 #include "file_names.h"
77 #include "proc_control.h"
78 #include "result.h"
79 #include "sandbox.h"
80
81 #include "app.h"
82
83 // Do periodic checks on running apps:
84 // - get latest CPU time and % done info
85 // - check if any has exited, and clean up
86 // - see if any has exceeded its CPU or disk space limits, and abort it
87 //
poll()88 bool ACTIVE_TASK_SET::poll() {
89 bool action;
90 unsigned int i;
91 static double last_time = 0;
92 if (!gstate.clock_change && gstate.now - last_time < TASK_POLL_PERIOD) return false;
93 last_time = gstate.now;
94
95 action = check_app_exited();
96 send_heartbeats();
97 send_trickle_downs();
98 process_control_poll();
99 action |= check_rsc_limits_exceeded();
100 get_msgs();
101 for (i=0; i<active_tasks.size(); i++) {
102 ACTIVE_TASK* atp = active_tasks[i];
103 if (atp->task_state() == PROCESS_ABORT_PENDING) {
104 if (gstate.now > atp->abort_time + ABORT_TIMEOUT) {
105 if (log_flags.task_debug) {
106 msg_printf(atp->result->project, MSG_INFO,
107 "[task] abort request timed out, killing task %s",
108 atp->result->name
109 );
110 }
111 atp->kill_running_task(false);
112 }
113 }
114 if (atp->task_state() == PROCESS_QUIT_PENDING) {
115 if (gstate.now > atp->quit_time + QUIT_TIMEOUT) {
116 if (log_flags.task_debug) {
117 msg_printf(atp->result->project, MSG_INFO,
118 "[task] quit request timed out, killing task %s",
119 atp->result->name
120 );
121 }
122 atp->kill_running_task(true);
123 }
124 }
125 }
126
127 // Check for finish files every 10 sec.
128 // If we already found a finish file, abort the app;
129 // it must be hung somewhere in boinc_finish();
130 //
131 static double last_finish_check_time = 0;
132 if (gstate.clock_change || gstate.now - last_finish_check_time > 10) {
133 last_finish_check_time = gstate.now;
134 for (i=0; i<active_tasks.size(); i++) {
135 ACTIVE_TASK* atp = active_tasks[i];
136 if (atp->task_state() == PROCESS_UNINITIALIZED) continue;
137 if (atp->finish_file_time) {
138 // process is still there 10 sec after it wrote finish file.
139 // abort the job
140 atp->abort_task(EXIT_ABORTED_BY_CLIENT, "finish file present too long");
141 } else if (atp->finish_file_present()) {
142 atp->finish_file_time = gstate.now;
143 }
144 }
145 }
146 if (action) {
147 gstate.set_client_state_dirty("ACTIVE_TASK_SET::poll");
148 }
149
150 return action;
151 }
152
153 #if 0
154 // deprecated; TerminateProcessById() doesn't work if
155 // the process is running as a different user
156 //
157 #ifdef _WIN32
// Terminate every process that is a descendant of this task's main
// process (pid), using a breadth-first walk over the process table.
// Always returns true.
// (Compiled out: this code sits inside an "#if 0" section.)
//
bool ACTIVE_TASK::kill_all_children() {
    unsigned int i,j;
    std::vector<PROCINFO> ps;
    std::vector<PROCINFO> tps;

    // ps: list filled in by procinfo_setup()
    procinfo_setup(ps);

    // tps: work list of processes whose children still need to be found;
    // seeded with the task's main process
    PROCINFO pi;
    pi.id = pid;
    tps.push_back(pi);

    // tps grows while we iterate, so descendants of descendants get visited
    for (i=0; i < tps.size(); i++) {
        PROCINFO tp = tps[i];
        for (j=0; j < ps.size(); j++) {
            PROCINFO p = ps[j];
            if (tp.id == p.parentid) {
                // only follow a child further if we managed to terminate it
                if (TerminateProcessById(p.id)) {
                    tps.push_back(p);
                }
            }
        }
    }
    return true;
}
182 #endif
183 #endif
184
print_descendants(int pid,vector<int> desc,const char * where)185 static void print_descendants(int pid, vector<int>desc, const char* where) {
186 msg_printf(0, MSG_INFO, "%s: PID %d has %d descendants",
187 where, pid, (int)desc.size()
188 );
189 for (unsigned int i=0; i<desc.size(); i++) {
190 msg_printf(0, MSG_INFO, " PID %d", desc[i]);
191 }
192 }
193
194 // Send a quit message, start timer, get descendants
195 //
request_exit()196 int ACTIVE_TASK::request_exit() {
197 if (app_client_shm.shm) {
198 process_control_queue.msg_queue_send(
199 "<quit/>",
200 app_client_shm.shm->process_control_request
201 );
202 }
203 set_task_state(PROCESS_QUIT_PENDING, "request_exit()");
204 quit_time = gstate.now;
205 get_descendants(pid, descendants);
206 if (log_flags.task_debug) {
207 print_descendants(pid, descendants, "request_exit()");
208 }
209 return 0;
210 }
211
212 // Send an abort message, start timer, get descendants
213 //
request_abort()214 int ACTIVE_TASK::request_abort() {
215 if (app_client_shm.shm) {
216 process_control_queue.msg_queue_send(
217 "<abort/>",
218 app_client_shm.shm->process_control_request
219 );
220 }
221 set_task_state(PROCESS_ABORT_PENDING, "request_abort");
222 abort_time = gstate.now;
223 get_descendants(pid, descendants);
224 if (log_flags.task_debug) {
225 print_descendants(pid, descendants, "request_abort()");
226 }
227 return 0;
228 }
229
230 #ifdef _WIN32
kill_app_process(int pid,bool will_restart)231 static void kill_app_process(int pid, bool will_restart) {
232 int retval = 0;
233 retval = kill_program(pid, will_restart?0:EXIT_ABORTED_BY_CLIENT);
234 if (retval && log_flags.task_debug) {
235 msg_printf(0, MSG_INFO,
236 "[task] kill_program(%d) failed: %s",
237 pid, boincerror(retval)
238 );
239 }
240 }
241 #else
kill_app_process(int pid,bool)242 static void kill_app_process(int pid, bool) {
243 int retval = 0;
244 if (g_use_sandbox) {
245 retval = kill_via_switcher(pid);
246 if (retval && log_flags.task_debug) {
247 msg_printf(0, MSG_INFO,
248 "[task] kill_via_switcher(%d) failed: %s (%d)",
249 pid,
250 (retval>=0) ? strerror(errno) : boincerror(retval), retval
251 );
252 }
253 } else {
254 retval = kill_program(pid);
255 if (retval && log_flags.task_debug) {
256 msg_printf(0, MSG_INFO,
257 "[task] kill_program(%d) failed: %s",
258 pid, strerror(errno)
259 );
260 }
261 }
262 }
263 #endif
264
265 // Kill a task whose main process is still running
266 // Just kill the main process; shared mem and subsidiary processes
267 // will be cleaned up after it exits, by cleanup_task();
268 //
// will_restart is passed through to kill_app_process().
// Always returns 0; a failure to kill is only logged
// (by kill_app_process(), if task_debug is set).
//
int ACTIVE_TASK::kill_running_task(bool will_restart) {
    kill_app_process(pid, will_restart);
    return 0;
}
273
274 // Kill any remaining subsidiary processes
275 // of a task whose main process has exited,
276 // namely:
277 // - its descendants (as recently enumerated; it's too late to do that now)
278 // This list will be populated only in the quit and abort cases.
279 // - its "other" processes, e.g. VMs
280 //
kill_subsidiary_processes()281 int ACTIVE_TASK::kill_subsidiary_processes() {
282 unsigned int i;
283 for (i=0; i<other_pids.size(); i++) {
284 kill_app_process(other_pids[i], false);
285 }
286 for (i=0; i<descendants.size(); i++) {
287 kill_app_process(descendants[i], false);
288 }
289 return 0;
290 }
291
292 // We have sent a quit request to the process; see if it's exited.
293 // This is called when the client exits,
294 // or when a project is detached or reset
295 //
has_task_exited()296 bool ACTIVE_TASK::has_task_exited() {
297 bool exited = false;
298
299 if (!process_exists()) return true;
300
301 #ifdef _WIN32
302 unsigned long exit_code;
303 if (GetExitCodeProcess(process_handle, &exit_code)) {
304 if (exit_code != STILL_ACTIVE) {
305 exited = true;
306 }
307 }
308 #else
309 if (waitpid(pid, 0, WNOHANG) == pid) {
310 exited = true;
311 }
312 #endif
313 if (exited) {
314 set_task_state(PROCESS_EXITED, "has_task_exited");
315 cleanup_task();
316 }
317 return exited;
318 }
319
320
limbo_message(ACTIVE_TASK & at)321 static void limbo_message(ACTIVE_TASK& at) {
322 #ifdef _WIN32
323 if (at.result->exit_status == STATUS_DLL_INIT_FAILED) {
324 msg_printf(at.result->project, MSG_INFO,
325 "Task %s exited with a DLL initialization error.",
326 at.result->name
327 );
328 msg_printf(at.result->project, MSG_INFO,
329 "If this happens repeatedly you may need to reboot your computer."
330 );
331 return;
332 }
333 #endif
334 msg_printf(at.result->project, MSG_INFO,
335 "Task %s exited with zero status but no 'finished' file",
336 at.result->name
337 );
338 msg_printf(at.result->project, MSG_INFO,
339 "If this happens repeatedly you may need to reset the project."
340 );
341 }
342
343 // the job just exited. If it's a GPU job,
344 // clear the "schedule_backoff" field of all other jobs
345 // that use the GPU type, in case they're waiting for GPU RAM
346 //
clear_schedule_backoffs(ACTIVE_TASK * atp)347 static void clear_schedule_backoffs(ACTIVE_TASK* atp) {
348 int rt = atp->result->avp->rsc_type();
349 if (rt == RSC_TYPE_CPU) return;
350 for (unsigned int i=0; i<gstate.results.size(); i++) {
351 RESULT* rp = gstate.results[i];
352 if (rp->avp->rsc_type() == rt) {
353 rp->schedule_backoff = 0;
354 }
355 }
356 }
357
358 // handle a task that exited prematurely (i.e. no finish file)
359 //
handle_premature_exit(bool & will_restart)360 void ACTIVE_TASK::handle_premature_exit(bool& will_restart) {
361 // keep count of premature exits;
362 // if this happens 100 times w/o a checkpoint, abort job
363 //
364 premature_exit_count++;
365 if (premature_exit_count > 100) {
366 will_restart = false;
367 set_task_state(PROCESS_ABORTED, "handle_premature_exit");
368 result->exit_status = ERR_TOO_MANY_EXITS;
369 gstate.report_result_error(*result, "too many exit(0)s");
370 result->set_state(RESULT_ABORTED, "handle_premature_exit");
371 } else {
372 will_restart = true;
373 limbo_message(*this);
374 set_task_state(PROCESS_UNINITIALIZED, "handle_premature_exit");
375 }
376 }
377
378 // handle a temporary exit
379 //
handle_temporary_exit(bool & will_restart,double backoff,const char * reason,bool is_notice)380 void ACTIVE_TASK::handle_temporary_exit(
381 bool& will_restart, double backoff, const char* reason, bool is_notice
382 ) {
383 premature_exit_count++;
384 if (premature_exit_count > 100) {
385 will_restart = false;
386 set_task_state(PROCESS_ABORTED, "handle_temporary_exit");
387 result->exit_status = ERR_TOO_MANY_EXITS;
388 gstate.report_result_error(*result, "too many boinc_temporary_exit()s");
389 result->set_state(RESULT_ABORTED, "handle_temporary_exit");
390 } else {
391 if (is_notice) {
392 msg_printf(result->project, MSG_USER_ALERT,
393 "Task postponed: %s", reason
394 );
395 } else {
396 if (log_flags.task) {
397 msg_printf(result->project, MSG_INFO,
398 "task postponed %f sec: %s", backoff, reason
399 );
400 }
401 }
402 will_restart = true;
403 result->schedule_backoff = gstate.now + backoff;
404 safe_strcpy(result->schedule_backoff_reason, reason);
405 set_task_state(PROCESS_UNINITIALIZED, "handle_temporary_exit");
406 }
407 }
408
copy_final_info()409 void ACTIVE_TASK::copy_final_info() {
410 result->final_cpu_time = current_cpu_time;
411 result->final_elapsed_time = elapsed_time;
412 result->final_peak_working_set_size = peak_working_set_size;
413 result->final_peak_swap_size = peak_swap_size;
414 result->final_peak_disk_usage = peak_disk_usage;
415 result->final_bytes_sent = bytes_sent;
416 result->final_bytes_received = bytes_received;
417 }
418
419 // deal with a process that has exited, for whatever reason:
420 // - completion
421 // - crash
422 // - quit or abort message sent by client
423 // - killed by client
424 //
#ifdef _WIN32
// Windows: exit_code is the process exit code
// (from GetExitCodeProcess(); see check_app_exited()).
//
void ACTIVE_TASK::handle_exited_app(unsigned long exit_code) {
    if (log_flags.task_debug) {
        msg_printf(result->project, MSG_INFO,
            "[task] Process for %s exited, exit code %lu, task state %d",
            result->name, exit_code, task_state()
        );
    }
#else
// Non-Windows: stat is the wait status
// (from waitpid(); see check_app_exited()).
//
void ACTIVE_TASK::handle_exited_app(int stat) {
    if (log_flags.task_debug) {
        msg_printf(result->project, MSG_INFO,
            "[task] Process for %s exited, status %d, task state %d",
            result->name, stat, task_state()
        );
    }
#endif
    // From here on the body is shared by both platforms.
    // will_restart: set if the task is to be run again;
    // in that case its output isn't collected below.
    //
    char err_msg[4096];
    bool will_restart = false;

    // get the app's last messages,
    // and copy usage figures to the result
    //
    get_app_status_msg();
    get_trickle_up_msg();
    copy_final_info();

    // if an abort or quit is pending,
    // the process may have exited itself, or we may have killed it.
    // Ignore exit status.
    //
    if (task_state() == PROCESS_ABORT_PENDING) {
        set_task_state(PROCESS_ABORTED, "handle_exited_app");
    } else if (task_state() == PROCESS_QUIT_PENDING) {
        set_task_state(PROCESS_UNINITIALIZED, "handle_exited_app");
        will_restart = true;
    } else {
#ifdef _WIN32
        result->exit_status = exit_code;
        switch(exit_code) {
        case STATUS_SUCCESS:
            // if another process killed the app, it looks like exit(0).
            // So check for the finish file
            //
            if (finish_file_present()) {
                set_task_state(PROCESS_EXITED, "handle_exited_app");
                break;
            }
            // no finish file: it's either a temporary exit
            // (the app wrote a temporary-exit file) or a premature exit
            //
            double x;
            bool is_notice;
            char buf[256];
            safe_strcpy(buf, "");
            if (temporary_exit_file_present(x, buf, is_notice)) {
                handle_temporary_exit(will_restart, x, buf, is_notice);
            } else {
                handle_premature_exit(will_restart);
            }
            break;
        case 0xc000013a:        // control-C??
        case 0x40010004:        // vista shutdown?? can someone explain this?
        case STATUS_DLL_INIT_FAILED:
        case STATUS_DLL_INIT_FAILED_LOGOFF:
            // This can happen because:
            // - The OS is shutting down, and attempting to start
            //   any new application fails automatically.
            // - The OS has run out of desktop heap
            // - (reportedly) The computer has just come out of hibernation
            //
            handle_premature_exit(will_restart);
            break;
        default:
            // any other exit code: report it as an error for the result
            //
            char szError[1024];
            set_task_state(PROCESS_EXITED, "handle_exited_app");
            sprintf(err_msg,
                "%s - exit code %d (0x%x)",
                windows_format_error_string(exit_code, szError, sizeof(szError)),
                exit_code, exit_code
            );
            gstate.report_result_error(*result, err_msg);
            if (log_flags.task_debug) {
                msg_printf(result->project, MSG_INFO,
                    "[task] Process for %s exited",
                    result->name
                );
                msg_printf(result->project, MSG_INFO,
                    "[task] exit code %d (0x%x): %s",
                    exit_code, exit_code,
                    windows_format_error_string(exit_code, szError, sizeof(szError))
                );
            }
            break;
        }
#else
        if (WIFEXITED(stat)) {
            // the process called exit() or returned from main()
            //
            result->exit_status = WEXITSTATUS(stat);

            double x;
            char buf[256];
            bool is_notice;
            if (temporary_exit_file_present(x, buf, is_notice)) {
                handle_temporary_exit(will_restart, x, buf, is_notice);
            } else {
                if (log_flags.task_debug) {
                    msg_printf(result->project, MSG_INFO,
                        "[task] process exited with status %d\n",
                        result->exit_status
                    );
                }
                if (result->exit_status) {
                    // nonzero status: report as an error.
                    // The message shows the status in decimal and hex,
                    // plus the value obtained by OR-ing it with (-1<<8)
                    //
                    set_task_state(PROCESS_EXITED, "handle_exited_app");
                    sprintf(err_msg,
                        "process exited with code %d (0x%x, %d)",
                        result->exit_status, result->exit_status,
                        (-1<<8)|result->exit_status
                    );
                    gstate.report_result_error(*result, err_msg);
                } else {
                    // zero status: it's a normal completion
                    // only if the finish file is there
                    //
                    if (finish_file_present()) {
                        set_task_state(PROCESS_EXITED, "handle_exited_app");
                    } else {
                        handle_premature_exit(will_restart);
                    }
                }
            }
        } else if (WIFSIGNALED(stat)) {
            int got_signal = WTERMSIG(stat);

            if (log_flags.task_debug) {
                msg_printf(result->project, MSG_INFO,
                    "[task] process got signal %d", got_signal
                );
            }

            // if the process was externally killed, let it restart.
            //
            switch (got_signal) {
            case SIGHUP:
            case SIGINT:
            case SIGQUIT:
            case SIGKILL:
            case SIGTERM:
            case SIGSTOP:
                will_restart = true;
                set_task_state(PROCESS_UNINITIALIZED, "handle_exited_app");
                break;
            default:
                // any other signal: report as an error.
                // Note that exit_status gets the whole wait status here,
                // and the signal number goes in the "signal" member.
                //
                result->exit_status = stat;
                set_task_state(PROCESS_WAS_SIGNALED, "handle_exited_app");
                signal = got_signal;
                sprintf(err_msg, "process got signal %d", signal);
                gstate.report_result_error(*result, err_msg);
            }
        } else {
            // neither exited nor signaled
            //
            result->exit_status = EXIT_UNKNOWN;
            set_task_state(PROCESS_EXIT_UNKNOWN, "handle_exited_app");
            gstate.report_result_error(*result, "process exit, unknown");
            msg_printf(result->project, MSG_INTERNAL_ERROR,
                "process exited for unknown reason"
            );
        }
#endif
    }

    // get rid of shared-mem segment and kill subsidiary processes
    //
    cleanup_task();

    // in "run test app" mode the client exits as soon as the app does
    //
    if (gstate.run_test_app) {
        msg_printf(0, MSG_INFO, "test app finished - exiting");
        exit(0);
    }

    // The job won't run again:
    // collect its output files and stderr, and empty the slot directory
    //
    if (!will_restart) {
        copy_output_files();
        int retval = read_stderr_file();
        if (retval) {
            msg_printf(result->project, MSG_INTERNAL_ERROR,
                "read_stderr_file(): %s", boincerror(retval)
            );
        }
        client_clean_out_dir(slot_dir, "handle_exited_app()");
        clear_schedule_backoffs(this);
            // clear scheduling backoffs of jobs waiting for GPU
    }
    gstate.request_schedule_cpus("application exited");
    gstate.request_work_fetch("application exited");
}
609
// structure of a finish file (see boinc_api.cpp):
611 // exit status (int)
612 // message
613 // "notice" or blank line
614 // ... or empty
615 //
616 bool ACTIVE_TASK::finish_file_present() {
617 char path[MAXPATHLEN], buf[1024], buf2[256];
618 safe_strcpy(buf, "");
619 safe_strcpy(buf2, "");
620 sprintf(path, "%s/%s", slot_dir, BOINC_FINISH_CALLED_FILE);
621 FILE* f = boinc_fopen(path, "r");
622 if (!f) return false;
623 fgets(buf, sizeof(buf), f); // read (and discard) exit status
624 char* p = fgets(buf, sizeof(buf), f);
625 if (p && strlen(buf)) {
626 fgets(buf2, sizeof(buf2), f);
627 msg_printf(result->project,
628 strstr(buf2, "notice")?MSG_USER_ALERT:MSG_INFO,
629 "Message from task: %s", buf
630 );
631 }
632 fclose(f);
633 return true;
634 }
635
636 bool ACTIVE_TASK::temporary_exit_file_present(
637 double& x, char* buf, bool& is_notice
638 ) {
639 char path[MAXPATHLEN], buf2[256];
640 sprintf(path, "%s/%s", slot_dir, TEMPORARY_EXIT_FILE);
641 FILE* f = boinc_fopen(path, "r");
642 if (!f) return false;
643 strcpy(buf, "");
644 int y;
645 int n = fscanf(f, "%d", &y);
646 if (n != 1 || y < 0 || y > 86400) {
647 x = 300;
648 } else {
649 x = y;
650 }
651 (void) fgets(buf, 256, f); // read the \n
652 (void) fgets(buf, 256, f);
653 strip_whitespace(buf);
654 is_notice = false;
655 if (fgets(buf2, 256, f)) {
656 if (strstr(buf2, "notice")) {
657 is_notice = true;
658 }
659 }
660 fclose(f);
661 return true;
662 }
663
664 void ACTIVE_TASK_SET::send_trickle_downs() {
665 unsigned int i;
666 ACTIVE_TASK* atp;
667 bool sent;
668 for (i=0; i<active_tasks.size(); i++) {
669 atp = active_tasks[i];
670 if (!atp->process_exists()) continue;
671 if (atp->have_trickle_down) {
672 if (!atp->app_client_shm.shm) continue;
673 sent = atp->app_client_shm.shm->trickle_down.send_msg("<have_trickle_down/>\n");
674 if (sent) atp->have_trickle_down = false;
675 }
676 if (atp->send_upload_file_status) {
677 if (!atp->app_client_shm.shm) continue;
678 sent = atp->app_client_shm.shm->trickle_down.send_msg("<upload_file_status/>\n");
679 if (sent) atp->send_upload_file_status = false;
680 }
681 }
682 }
683
684 void ACTIVE_TASK_SET::send_heartbeats() {
685 unsigned int i;
686 ACTIVE_TASK* atp;
687 char buf[1024];
688 double ar = gstate.available_ram();
689
690 for (i=0; i<active_tasks.size(); i++) {
691 atp = active_tasks[i];
692 if (!atp->process_exists()) continue;
693 if (!atp->app_client_shm.shm) continue;
694 snprintf(buf, sizeof(buf), "<heartbeat/>"
695 "<wss>%e</wss>"
696 "<max_wss>%e</max_wss>",
697 atp->procinfo.working_set_size, ar
698 );
699 if (gstate.network_suspended) {
700 safe_strcat(buf, "<network_suspended/>");
701 }
702 bool sent = atp->app_client_shm.shm->heartbeat.send_msg(buf);
703 if (log_flags.heartbeat_debug) {
704 if (sent) {
705 msg_printf(atp->result->project, MSG_INFO,
706 "[heartbeat] Heartbeat sent to task %s",
707 atp->result->name
708 );
709 } else {
710 msg_printf(atp->result->project, MSG_INFO,
711 "[heartbeat] Heartbeat to task %s failed, previous message unread",
712 atp->result->name
713 );
714 }
715 }
716 }
717 }
718
719 // send queued process-control messages; check for timeout
720 //
721 void ACTIVE_TASK_SET::process_control_poll() {
722 unsigned int i;
723 ACTIVE_TASK* atp;
724
725 for (i=0; i<active_tasks.size(); i++) {
726 atp = active_tasks[i];
727 if (!atp->process_exists()) continue;
728 if (!atp->app_client_shm.shm) continue;
729
730 // if app has had the same message in its send buffer for 180 sec,
731 // assume it's hung and restart it
732 //
733 if (atp->process_control_queue.timeout(180)) {
734 if (log_flags.task_debug) {
735 msg_printf(atp->result->project, MSG_INFO,
736 "Restarting %s - message timeout", atp->result->name
737 );
738 }
739 atp->kill_running_task(true);
740 } else {
741 atp->process_control_queue.msg_queue_poll(
742 atp->app_client_shm.shm->process_control_request
743 );
744 }
745 }
746 }
747
748 // See if any processes have exited
749 //
750 bool ACTIVE_TASK_SET::check_app_exited() {
751 ACTIVE_TASK* atp;
752 bool found = false;
753
754 #ifdef _WIN32
755 unsigned long exit_code;
756 unsigned int i;
757
758 for (i=0; i<active_tasks.size(); i++) {
759 atp = active_tasks[i];
760 if (!atp->process_exists()) continue;
761 if (GetExitCodeProcess(atp->process_handle, &exit_code)) {
762 if (exit_code != STILL_ACTIVE) {
763 found = true;
764 atp->handle_exited_app(exit_code);
765 }
766 } else {
767 if (log_flags.task_debug) {
768 char errmsg[1024];
769 msg_printf(atp->result->project, MSG_INFO,
770 "[task] task %s GetExitCodeProcess() failed - %s GLE %d (0x%x)",
771 atp->result->name,
772 windows_format_error_string(
773 GetLastError(), errmsg, sizeof(errmsg)
774 ),
775 GetLastError(), GetLastError()
776 );
777 }
778
779 // The process doesn't seem to be there.
780 // Mark task as aborted so we don't check it again.
781 //
782 atp->cleanup_task();
783 atp->set_task_state(PROCESS_ABORTED, "check_app_exited");
784 }
785 }
786 #else
787 int pid, stat;
788
789 if ((pid = waitpid(-1, &stat, WNOHANG)) > 0) {
790 atp = lookup_pid(pid);
791 if (!atp) {
792 // if we're running benchmarks, exited process
793 // is probably a benchmark process; don't show error
794 //
795 if (!gstate.benchmarks_running && log_flags.task_debug) {
796 msg_printf(NULL, MSG_INTERNAL_ERROR,
797 "Process %d not found\n", pid
798 );
799 }
800 return false;
801 }
802 atp->handle_exited_app(stat);
803 found = true;
804 }
805 #endif
806
807 return found;
808 }
809
810 // if an app has exceeded its maximum disk usage, abort it
811 //
812 bool ACTIVE_TASK::check_max_disk_exceeded() {
813 double disk_usage;
814 int retval;
815
816 retval = current_disk_usage(disk_usage);
817 if (retval) {
818 msg_printf(this->wup->project, MSG_INTERNAL_ERROR,
819 "Can't get task disk usage: %s", boincerror(retval)
820 );
821 } else {
822 if (disk_usage > max_disk_usage) {
823 msg_printf(
824 result->project, MSG_INFO,
825 "Aborting task %s: exceeded disk limit: %.2fMB > %.2fMB\n",
826 result->name, disk_usage/MEGA, max_disk_usage/MEGA
827 );
828 abort_task(EXIT_DISK_LIMIT_EXCEEDED, "Disk usage limit exceeded");
829 return true;
830 }
831 }
832 return false;
833 }
834
835 // Check if any of the active tasks have exceeded their
836 // resource limits on disk, CPU time or memory
837 //
// TODO: this gets called every 1 sec,
839 // but mem and disk usage are computed less often.
840 // refactor.
841 //
842 bool ACTIVE_TASK_SET::check_rsc_limits_exceeded() {
843 unsigned int i;
844 ACTIVE_TASK *atp;
845 static double last_disk_check_time = 0;
846 bool do_disk_check = false;
847 bool did_anything = false;
848 char buf[256];
849
850 double ram_left = gstate.available_ram();
851 double max_ram = gstate.max_available_ram();
852
853 // Some slot dirs have lots of files,
854 // so only check every min(disk_interval, 300) secs
855 //
856 double min_interval = gstate.global_prefs.disk_interval;
857 if (min_interval < 300) min_interval = 300;
858 if (gstate.clock_change || gstate.now > last_disk_check_time + min_interval) {
859 do_disk_check = true;
860 }
861 for (i=0; i<active_tasks.size(); i++) {
862 atp = active_tasks[i];
863 if (atp->task_state() != PROCESS_EXECUTING) continue;
864 if (!atp->result->non_cpu_intensive() && (atp->elapsed_time > atp->max_elapsed_time)) {
865 sprintf(buf, "exceeded elapsed time limit %.2f (%.2fG/%.2fG)",
866 atp->max_elapsed_time,
867 atp->result->wup->rsc_fpops_bound/1e9,
868 atp->result->avp->flops/1e9
869 );
870 msg_printf(atp->result->project, MSG_INFO,
871 "Aborting task %s: %s", atp->result->name, buf
872 );
873 atp->abort_task(EXIT_TIME_LIMIT_EXCEEDED, buf);
874 did_anything = true;
875 continue;
876 }
877 #if 0
878 // removing this for now because most projects currently
879 // have too-low values of workunit.rsc_memory_bound
880 // (causing lots of aborts)
881 // and I don't think we can expect projects to provide
882 // accurate bounds.
883 //
884 if (atp->procinfo.working_set_size_smoothed > atp->max_mem_usage) {
885 sprintf(buf, "working set size > workunit.rsc_memory_bound: %.2fMB > %.2fMB",
886 atp->procinfo.working_set_size_smoothed/MEGA, atp->max_mem_usage/MEGA
887 );
888 msg_printf(atp->result->project, MSG_INFO,
889 "Aborting task %s: %s",
890 atp->result->name, buf
891 );
892 atp->abort_task(EXIT_MEM_LIMIT_EXCEEDED, buf);
893 did_anything = true;
894 continue;
895 }
896 #endif
897 if (atp->procinfo.working_set_size_smoothed > max_ram) {
898 sprintf(buf, "working set size > client RAM limit: %.2fMB > %.2fMB",
899 atp->procinfo.working_set_size_smoothed/MEGA, max_ram/MEGA
900 );
901 msg_printf(atp->result->project, MSG_INFO,
902 "Aborting task %s: %s",
903 atp->result->name, buf
904 );
905 atp->abort_task(EXIT_MEM_LIMIT_EXCEEDED, buf);
906 did_anything = true;
907 continue;
908 }
909 if (do_disk_check || atp->peak_disk_usage == 0) {
910 if (atp->check_max_disk_exceeded()) {
911 did_anything = true;
912 continue;
913 }
914 }
915
916 // don't count RAM usage of non-CPU-intensive jobs
917 //
918 if (!atp->result->non_cpu_intensive()) {
919 ram_left -= atp->procinfo.working_set_size_smoothed;
920 }
921 }
922 if (ram_left < 0) {
923 gstate.request_schedule_cpus("RAM usage limit exceeded");
924 }
925 if (do_disk_check) {
926 last_disk_check_time = gstate.now;
927 }
928 return did_anything;
929 }
930
931 // If process is running, send it an "abort" message,
932 // Set a flag so that if it doesn't exit within 5 seconds,
933 // kill it by OS-specific mechanism (e.g. KILL signal).
934 // This is done when app has exceeded CPU, disk, or mem limits,
935 // or when the user has requested it.
936 // The task won't be restarted.
937 //
938 int ACTIVE_TASK::abort_task(int exit_status, const char* msg) {
939 if (task_state() == PROCESS_EXECUTING || task_state() == PROCESS_SUSPENDED) {
940 request_abort();
941 } else {
942 set_task_state(PROCESS_ABORTED, "abort_task");
943 }
944 result->exit_status = exit_status;
945 gstate.report_result_error(*result, msg);
946 result->set_state(RESULT_ABORTED, "abort_task");
947 if (task_state() == PROCESS_ABORTED) {
948 copy_final_info();
949 read_stderr_file();
950 }
951 return 0;
952 }
953
954 // check for the stderr file, copy to result record
955 //
956 int ACTIVE_TASK::read_stderr_file() {
957 char* buf1, *buf2;
958 char path[MAXPATHLEN];
959
960 // truncate stderr output to the last 63KB;
961 // it's unlikely that more than that will be useful
962 //
963 int max_len = 63*1024;
964 sprintf(path, "%s/%s", slot_dir, STDERR_FILE);
965 if (!boinc_file_exists(path)) return 0;
966 int retval = read_file_malloc(
967 path, buf1, max_len, !cc_config.stderr_head
968 );
969 if (retval) return retval;
970
971 // if it's a vbox app, check for string in stderr saying
972 // the job failed because CPU VM extensions disabled
973 //
974 if (strstr(app_version->plan_class, "vbox")) {
975 if (strstr(buf1, "ERR_CPU_VM_EXTENSIONS_DISABLED")) {
976 msg_printf(0, MSG_INFO,
977 "Vbox app stderr indicates CPU VM extensions disabled"
978 );
979 gstate.host_info.p_vm_extensions_disabled = true;
980 }
981 }
982
983 buf2 = (char*)malloc(2*max_len);
984 if (!buf2) {
985 free(buf1);
986 return ERR_MALLOC;
987 }
988 non_ascii_escape(buf1, buf2, 2*max_len);
989 result->stderr_out += "<stderr_txt>\n";
990 result->stderr_out += buf2;
991 result->stderr_out += "\n</stderr_txt>\n";
992 free(buf1);
993 free(buf2);
994 return 0;
995 }
996
997 // tell a running app to reread project preferences.
998 // This is called when project prefs change,
999 // or when a user file has finished downloading.
1000 //
1001 // TODO: get rid of this function
1002 //
1003 int ACTIVE_TASK::request_reread_prefs() {
1004 int retval;
1005 APP_INIT_DATA aid;
1006
1007 link_user_files();
1008
1009 init_app_init_data(aid);
1010 retval = write_app_init_file(aid);
1011 if (retval) return retval;
1012 #if 0
1013 graphics_request_queue.msg_queue_send(
1014 xml_graphics_modes[MODE_REREAD_PREFS],
1015 app_client_shm.shm->graphics_request
1016 );
1017 #endif
1018 return 0;
1019 }
1020
1021 // tell a running app to reread the app_info file
1022 // (e.g. because proxy settings have changed: this is for F@h)
1023 //
1024 int ACTIVE_TASK::request_reread_app_info() {
1025 APP_INIT_DATA aid;
1026 init_app_init_data(aid);
1027 int retval = write_app_init_file(aid);
1028 if (retval) return retval;
1029 process_control_queue.msg_queue_send(
1030 "<reread_app_info/>",
1031 app_client_shm.shm->process_control_request
1032 );
1033 return 0;
1034 }
1035
1036 // tell all running apps of a project to reread prefs
1037 //
1038 void ACTIVE_TASK_SET::request_reread_prefs(PROJECT* project) {
1039 unsigned int i;
1040 ACTIVE_TASK* atp;
1041
1042 for (i=0; i<active_tasks.size(); i++) {
1043 atp = active_tasks[i];
1044 if (atp->result->project != project) continue;
1045 if (!atp->process_exists()) continue;
1046 atp->request_reread_prefs();
1047 }
1048 }
1049
1050 void ACTIVE_TASK_SET::request_reread_app_info() {
1051 for (unsigned int i=0; i<active_tasks.size(); i++) {
1052 ACTIVE_TASK* atp = active_tasks[i];
1053 if (!atp->process_exists()) continue;
1054 atp->request_reread_app_info();
1055 }
1056 }
1057
1058
1059 // send quit message to all tasks in the project
1060 // (or all tasks, if proj is NULL).
1061 // If they don't exit in QUIT_TIMEOUT seconds,
1062 // send them a kill signal and wait up to 5 more seconds to exit.
1063 // This is called when the client exits,
1064 // or when a project is detached or reset
1065 //
1066 int ACTIVE_TASK_SET::exit_tasks(PROJECT* proj) {
1067 if (log_flags.task_debug) {
1068 msg_printf(NULL, MSG_INFO, "[task_debug] requesting tasks to exit");
1069 }
1070 request_tasks_exit(proj);
1071
1072 // Wait for tasks to exit normally; if they don't then kill them
1073 //
1074 if (wait_for_exit(QUIT_TIMEOUT, proj)) {
1075 if (log_flags.task_debug) {
1076 msg_printf(NULL, MSG_INFO,
1077 "[task_debug] all tasks haven't exited after %d sec; killing them",
1078 QUIT_TIMEOUT
1079 );
1080 }
1081 kill_tasks(proj);
1082 if (wait_for_exit(5, proj)) {
1083 if (log_flags.task_debug) {
1084 msg_printf(NULL, MSG_INFO,
1085 "[task_debug] tasks still not exited after 5 secs; giving up"
1086 );
1087 }
1088 } else {
1089 if (log_flags.task_debug) {
1090 msg_printf(NULL, MSG_INFO, "[task_debug] all tasks exited");
1091 }
1092 }
1093 } else {
1094 if (log_flags.task_debug) {
1095 msg_printf(NULL, MSG_INFO, "[task_debug] all tasks exited");
1096 }
1097 }
1098
1099 // get final checkpoint_cpu_times
1100 //
1101 get_msgs();
1102
1103 gstate.request_schedule_cpus("exit_tasks");
1104 return 0;
1105 }
1106
1107 // Wait up to wait_time seconds for processes to exit
1108 // If proj is zero, wait for all processes, else that project's
1109 // NOTE: it's bad form to sleep, but it would be complex to avoid it here
1110 //
1111 int ACTIVE_TASK_SET::wait_for_exit(double wait_time, PROJECT* proj) {
1112 bool all_exited;
1113 unsigned int i,n;
1114 ACTIVE_TASK *atp;
1115
1116 for (i=0; i<10; i++) {
1117 all_exited = true;
1118
1119 for (n=0; n<active_tasks.size(); n++) {
1120 atp = active_tasks[n];
1121 if (proj && atp->wup->project != proj) continue;
1122 if (!atp->has_task_exited()) {
1123 all_exited = false;
1124 break;
1125 }
1126 }
1127
1128 if (all_exited) return 0;
1129 boinc_sleep(wait_time/10.0);
1130 }
1131
1132 return ERR_NOT_EXITED;
1133 }
1134
1135 int ACTIVE_TASK_SET::abort_project(PROJECT* project) {
1136 vector<ACTIVE_TASK*>::iterator task_iter;
1137 ACTIVE_TASK* atp;
1138
1139 exit_tasks(project);
1140 task_iter = active_tasks.begin();
1141 while (task_iter != active_tasks.end()) {
1142 atp = *task_iter;
1143 if (atp->result->project == project) {
1144 client_clean_out_dir(atp->slot_dir, "abort_project()");
1145 remove_project_owned_dir(atp->slot_dir);
1146 task_iter = active_tasks.erase(task_iter);
1147 delete atp;
1148 } else {
1149 ++task_iter;
1150 }
1151 }
1152 return 0;
1153 }
1154
1155 // suspend all currently running tasks
1156 // e.g. because on batteries, time of day, benchmarking, CPU throttle, etc.
1157 //
1158 void ACTIVE_TASK_SET::suspend_all(int reason) {
1159 for (unsigned int i=0; i<active_tasks.size(); i++) {
1160 ACTIVE_TASK* atp = active_tasks[i];
1161
1162 // don't suspend if process doesn't exist,
1163 // or if quit/abort is pending.
1164 // If process is currently suspended, proceed;
1165 // the new suspension may require it to be removed from memory.
1166 // E.g. a GPU job may currently be suspended due to CPU throttling,
1167 // and therefore left in memory,
1168 // but this suspension (say, a user request)
1169 // might require it to be removed from memory.
1170 //
1171 switch (atp->task_state()) {
1172 case PROCESS_EXECUTING:
1173 case PROCESS_SUSPENDED:
1174 break;
1175 default:
1176 continue;
1177 }
1178
1179 // special cases for non-CPU-intensive apps
1180 //
1181 if (atp->result->non_cpu_intensive()) {
1182 if (cc_config.dont_suspend_nci) {
1183 continue;
1184 }
1185 if (reason == SUSPEND_REASON_BATTERIES) {
1186 continue;
1187 }
1188 }
1189
1190 // handle CPU throttling separately
1191 //
1192 if (reason == SUSPEND_REASON_CPU_THROTTLE) {
1193 if (atp->result->dont_throttle()) continue;
1194 atp->preempt(REMOVE_NEVER, reason);
1195 continue;
1196 }
1197
1198 #ifdef ANDROID
1199 // On Android, remove apps from memory if on batteries
1200 // no matter what the reason for suspension.
1201 // The message polling in the BOINC runtime system
1202 // imposes an overhead which drains the battery
1203 //
1204 if (gstate.host_info.host_is_running_on_batteries()) {
1205 atp->preempt(REMOVE_ALWAYS);
1206 continue;
1207 }
1208 #endif
1209
1210 switch (reason) {
1211 case SUSPEND_REASON_BENCHMARKS:
1212 atp->preempt(REMOVE_NEVER);
1213 break;
1214 case SUSPEND_REASON_CPU_USAGE:
1215 // If we're suspending because of non-BOINC CPU load,
1216 // don't remove from memory.
1217 // Some systems do a security check when apps are launched,
1218 // which uses a lot of CPU.
1219 // Avoid going into a preemption loop.
1220 //
1221 if (atp->result->non_cpu_intensive()) break;
1222 atp->preempt(REMOVE_NEVER);
1223 break;
1224 case SUSPEND_REASON_BATTERY_OVERHEATED:
1225 case SUSPEND_REASON_BATTERY_CHARGING:
1226 // these conditions can oscillate, so leave apps in mem
1227 //
1228 atp->preempt(REMOVE_NEVER);
1229 break;
1230 default:
1231 atp->preempt(REMOVE_MAYBE_USER);
1232 break;
1233 }
1234 }
1235 }
1236
1237 // resume all currently scheduled tasks
1238 //
1239 void ACTIVE_TASK_SET::unsuspend_all(int reason) {
1240 unsigned int i;
1241 ACTIVE_TASK* atp;
1242 for (i=0; i<active_tasks.size(); i++) {
1243 atp = active_tasks[i];
1244 if (atp->scheduler_state != CPU_SCHED_SCHEDULED) continue;
1245 if (atp->task_state() == PROCESS_UNINITIALIZED) {
1246 if (atp->resume_or_start(false)) {
1247 msg_printf(atp->wup->project, MSG_INTERNAL_ERROR,
1248 "Couldn't restart task %s", atp->result->name
1249 );
1250 }
1251 } else if (atp->task_state() == PROCESS_SUSPENDED) {
1252 atp->unsuspend(reason);
1253 }
1254 }
1255 }
1256
1257 // Check to see if any tasks are running
1258 // called if benchmarking and waiting for suspends to happen
1259 // or the system needs to suspend itself so we are suspending
1260 // the applications
1261 //
1262 bool ACTIVE_TASK_SET::is_task_executing() {
1263 unsigned int i;
1264 ACTIVE_TASK* atp;
1265 for (i=0; i<active_tasks.size(); i++) {
1266 atp = active_tasks[i];
1267 if (atp->task_state() == PROCESS_EXECUTING) {
1268 return true;
1269 }
1270 }
1271 return false;
1272 }
1273
1274 // Send quit message to all app processes
1275 // This is called when the client exits,
1276 // or when a project is detached or reset
1277 //
1278 void ACTIVE_TASK_SET::request_tasks_exit(PROJECT* proj) {
1279 unsigned int i;
1280 ACTIVE_TASK *atp;
1281 for (i=0; i<active_tasks.size(); i++) {
1282 atp = active_tasks[i];
1283 if (proj && atp->wup->project != proj) continue;
1284 if (!atp->process_exists()) continue;
1285 atp->request_exit();
1286 }
1287 }
1288
1289 // Send kill signal to all app processes
1290 // Don't wait for them to exit
1291 //
1292 void ACTIVE_TASK_SET::kill_tasks(PROJECT* proj) {
1293 unsigned int i;
1294 ACTIVE_TASK *atp;
1295 for (i=0; i<active_tasks.size(); i++) {
1296 atp = active_tasks[i];
1297 if (proj && atp->wup->project != proj) continue;
1298 if (!atp->process_exists()) continue;
1299 atp->kill_running_task(true);
1300 }
1301 }
1302
1303 // send a <suspend> message
1304 //
1305 int ACTIVE_TASK::suspend() {
1306 if (!app_client_shm.shm) return 0;
1307 if (task_state() != PROCESS_EXECUTING) {
1308 msg_printf(result->project, MSG_INTERNAL_ERROR,
1309 "ACTIVE_TASK::SUSPEND(): expected task %s to be executing",
1310 result->name
1311 );
1312 }
1313 int n = process_control_queue.msg_queue_purge("<resume/>");
1314 if (n == 0) {
1315 process_control_queue.msg_queue_send(
1316 "<suspend/>",
1317 app_client_shm.shm->process_control_request
1318 );
1319 }
1320 set_task_state(PROCESS_SUSPENDED, "suspend");
1321 return 0;
1322 }
1323
1324 // resume a suspended task
1325 //
1326 int ACTIVE_TASK::unsuspend(int reason) {
1327 if (!app_client_shm.shm) return 0;
1328 if (task_state() != PROCESS_SUSPENDED) {
1329 msg_printf(result->project, MSG_INFO,
1330 "Internal error: expected process %s to be suspended", result->name
1331 );
1332 }
1333 if (log_flags.cpu_sched && reason != SUSPEND_REASON_CPU_THROTTLE) {
1334 msg_printf(result->project, MSG_INFO,
1335 "[cpu_sched] Resuming %s", result->name
1336 );
1337 }
1338 int n = process_control_queue.msg_queue_purge("<suspend/>");
1339 if (n == 0) {
1340 process_control_queue.msg_queue_send(
1341 "<resume/>",
1342 app_client_shm.shm->process_control_request
1343 );
1344 }
1345 set_task_state(PROCESS_EXECUTING, "unsuspend");
1346 return 0;
1347 }
1348
1349 void ACTIVE_TASK::send_network_available() {
1350 if (!app_client_shm.shm) return;
1351 process_control_queue.msg_queue_send(
1352 "<network_available/>",
1353 app_client_shm.shm->process_control_request
1354 );
1355 return;
1356 }
1357
1358 // See if the app has placed a new message in shared mem
1359 // (with CPU done, frac done etc.)
1360 // If so parse it and return true.
1361 //
1362 bool ACTIVE_TASK::get_app_status_msg() {
1363 char msg_buf[MSG_CHANNEL_SIZE];
1364 double fd;
1365 int other_pid;
1366 double dtemp;
1367 static double last_msg_time=0;
1368
1369 if (!app_client_shm.shm) {
1370 msg_printf(result->project, MSG_INFO,
1371 "Task %s: no shared memory segment", result->name
1372 );
1373 return false;
1374 }
1375 if (!app_client_shm.shm->app_status.get_msg(msg_buf)) {
1376 return false;
1377 }
1378 if (log_flags.app_msg_receive) {
1379 msg_printf(this->wup->project, MSG_INFO,
1380 "[app_msg_receive] got msg from slot %d: %s", slot, msg_buf
1381 );
1382 }
1383 want_network = 0;
1384 current_cpu_time = checkpoint_cpu_time = 0.0;
1385 if (parse_double(msg_buf, "<fraction_done>", fd)) {
1386 // fraction_done will be reported as zero
1387 // until the app's first call to boinc_fraction_done().
1388 // So ignore zeros.
1389 //
1390 if (fd) {
1391 fraction_done = fd;
1392 fraction_done_elapsed_time = elapsed_time;
1393 if (!first_fraction_done) {
1394 first_fraction_done = fd;
1395 first_fraction_done_elapsed_time = elapsed_time;
1396 }
1397 if (log_flags.task_debug && (fd<0 || fd>1)) {
1398 if (gstate.now > last_msg_time + 60) {
1399 msg_printf(this->wup->project, MSG_INFO,
1400 "[task_debug] app reported bad fraction done: %f", fd
1401 );
1402 last_msg_time = gstate.now;
1403 }
1404 }
1405 }
1406 }
1407 parse_double(msg_buf, "<current_cpu_time>", current_cpu_time);
1408 parse_double(msg_buf, "<checkpoint_cpu_time>", checkpoint_cpu_time);
1409 parse_double(msg_buf, "<fpops_per_cpu_sec>", result->fpops_per_cpu_sec);
1410 parse_double(msg_buf, "<fpops_cumulative>", result->fpops_cumulative);
1411 parse_double(msg_buf, "<intops_per_cpu_sec>", result->intops_per_cpu_sec);
1412 parse_double(msg_buf, "<intops_cumulative>", result->intops_cumulative);
1413 if (parse_double(msg_buf, "<bytes_sent>", dtemp)) {
1414 if (dtemp > bytes_sent_episode) {
1415 double nbytes = dtemp - bytes_sent_episode;
1416 daily_xfer_history.add(nbytes, true);
1417 bytes_sent += nbytes;
1418 }
1419 bytes_sent_episode = dtemp;
1420 }
1421 if (parse_double(msg_buf, "<bytes_received>", dtemp)) {
1422 if (dtemp > bytes_received_episode) {
1423 double nbytes = dtemp - bytes_received_episode;
1424 daily_xfer_history.add(nbytes, false);
1425 bytes_received += nbytes;
1426 }
1427 bytes_received_episode = dtemp;
1428 }
1429 parse_int(msg_buf, "<want_network>", want_network);
1430 if (parse_int(msg_buf, "<other_pid>", other_pid)) {
1431 // for now, we handle only one of these
1432 other_pids.clear();
1433 other_pids.push_back(other_pid);
1434 }
1435 if (current_cpu_time < 0) {
1436 msg_printf(result->project, MSG_INFO,
1437 "app reporting negative CPU: %f", current_cpu_time
1438 );
1439 current_cpu_time = 0;
1440 }
1441 if (checkpoint_cpu_time < 0) {
1442 msg_printf(result->project, MSG_INFO,
1443 "app reporting negative checkpoint CPU: %f", checkpoint_cpu_time
1444 );
1445 checkpoint_cpu_time = 0;
1446 }
1447 return true;
1448 }
1449
1450 void ACTIVE_TASK::get_graphics_msg() {
1451 char msg_buf[MSG_CHANNEL_SIZE];
1452
1453 if (!app_client_shm.shm) return;
1454 if (app_client_shm.shm->graphics_reply.get_msg(msg_buf)) {
1455 if (log_flags.app_msg_receive) {
1456 msg_printf(this->wup->project, MSG_INFO,
1457 "[app_msg_receive] got msg from slot %d: %s", slot, msg_buf
1458 );
1459 }
1460
1461 parse_str(msg_buf, "<web_graphics_url>", web_graphics_url, sizeof(web_graphics_url));
1462 parse_str(msg_buf, "<remote_desktop_addr>", remote_desktop_addr, sizeof(remote_desktop_addr));
1463 }
1464 }
1465
1466 bool ACTIVE_TASK::get_trickle_up_msg() {
1467 char msg_buf[MSG_CHANNEL_SIZE];
1468 bool found = false;
1469 int retval;
1470
1471 if (!app_client_shm.shm) return false;
1472 if (app_client_shm.shm->trickle_up.get_msg(msg_buf)) {
1473 if (match_tag(msg_buf, "<have_new_trickle_up/>")) {
1474 if (log_flags.app_msg_receive) {
1475 msg_printf(NULL, MSG_INFO,
1476 "[app_msg_receive] got msg from slot %d: %s", slot, msg_buf
1477 );
1478 }
1479 retval = move_trickle_file();
1480 if (!retval) {
1481 wup->project->trickle_up_pending = true;
1482 }
1483 }
1484 if (match_tag(msg_buf, "<have_new_upload_file/>")) {
1485 if (log_flags.app_msg_receive) {
1486 msg_printf(NULL, MSG_INFO,
1487 "[app_msg_receive] got msg from slot %d: %s", slot, msg_buf
1488 );
1489 }
1490 handle_upload_files();
1491 }
1492 found = true;
1493 }
1494 return found;
1495 }
1496
1497 // check for msgs from active tasks,
1498 // and update their elapsed time and other info
1499 //
1500 void ACTIVE_TASK_SET::get_msgs() {
1501 unsigned int i;
1502 ACTIVE_TASK *atp;
1503 double old_time;
1504 static double last_time=0;
1505 double delta_t;
1506 if (!gstate.clock_change && last_time) {
1507 delta_t = gstate.now - last_time;
1508
1509 // Normally this is called every second.
1510 // If delta_t is > 10, we'll assume that a period of hibernation
1511 // or suspension happened, and treat it as zero.
1512 // If negative, must be clock reset. Ignore.
1513 //
1514 if (delta_t > 10 || delta_t < 0) {
1515 delta_t = 0;
1516 }
1517 } else {
1518 delta_t = 0;
1519 }
1520 last_time = gstate.now;
1521
1522 double et_diff = delta_t;
1523 double et_diff_throttle = delta_t * gstate.global_prefs.cpu_usage_limit/100;
1524
1525 for (i=0; i<active_tasks.size(); i++) {
1526 atp = active_tasks[i];
1527 if (!atp->process_exists()) continue;
1528 old_time = atp->checkpoint_cpu_time;
1529 if (atp->scheduler_state == CPU_SCHED_SCHEDULED && !gstate.tasks_suspended) {
1530 double x = atp->result->dont_throttle()?et_diff:et_diff_throttle;
1531 atp->elapsed_time += x;
1532 atp->wup->project->elapsed_time += x;
1533 }
1534 if (atp->get_app_status_msg()) {
1535 if (old_time != atp->checkpoint_cpu_time) {
1536 char buf[256];
1537 sprintf(buf, "%s checkpointed", atp->result->name);
1538 if (atp->overdue_checkpoint) {
1539 gstate.request_schedule_cpus(buf);
1540 }
1541 atp->checkpoint_wall_time = gstate.now;
1542 atp->premature_exit_count = 0;
1543 atp->checkpoint_elapsed_time = atp->elapsed_time;
1544 atp->checkpoint_fraction_done = atp->fraction_done;
1545 atp->checkpoint_fraction_done_elapsed_time = atp->fraction_done_elapsed_time;
1546 if (log_flags.checkpoint_debug) {
1547 msg_printf(atp->wup->project, MSG_INFO,
1548 "[checkpoint] result %s checkpointed",
1549 atp->result->name
1550 );
1551 } else if (log_flags.task_debug) {
1552 msg_printf(atp->wup->project, MSG_INFO,
1553 "[task] result %s checkpointed",
1554 atp->result->name
1555 );
1556 }
1557 atp->write_task_state_file();
1558 }
1559 }
1560 atp->get_trickle_up_msg();
1561 atp->get_graphics_msg();
1562 }
1563 }
1564
1565 // The job just checkpointed.
1566 // Write some state items to a file in the slot dir
1567 // (this avoids rewriting the state file on each checkpoint)
1568 //
1569 void ACTIVE_TASK::write_task_state_file() {
1570 char path[MAXPATHLEN];
1571 sprintf(path, "%s/%s", slot_dir, TASK_STATE_FILENAME);
1572 FILE* f = boinc_fopen(path, "w");
1573 if (!f) return;
1574 fprintf(f,
1575 "<active_task>\n"
1576 " <project_master_url>%s</project_master_url>\n"
1577 " <result_name>%s</result_name>\n"
1578 " <checkpoint_cpu_time>%f</checkpoint_cpu_time>\n"
1579 " <checkpoint_elapsed_time>%f</checkpoint_elapsed_time>\n"
1580 " <fraction_done>%f</fraction_done>\n"
1581 " <peak_working_set_size>%.0f</peak_working_set_size>\n"
1582 " <peak_swap_size>%.0f</peak_swap_size>\n"
1583 " <peak_disk_usage>%.0f</peak_disk_usage>\n"
1584 "</active_task>\n",
1585 result->project->master_url,
1586 result->name,
1587 checkpoint_cpu_time,
1588 checkpoint_elapsed_time,
1589 checkpoint_fraction_done,
1590 peak_working_set_size,
1591 peak_swap_size,
1592 peak_disk_usage
1593 );
1594 fclose(f);
1595 }
1596
1597 // called on startup; read the task state file in case it's more recent
1598 // than the main state file
1599 //
1600 void ACTIVE_TASK::read_task_state_file() {
1601 char buf[4096], path[MAXPATHLEN], s[1024];
1602 sprintf(path, "%s/%s", slot_dir, TASK_STATE_FILENAME);
1603 FILE* f = boinc_fopen(path, "r");
1604 if (!f) return;
1605 buf[0] = 0;
1606 (void) fread(buf, 1, 4096, f);
1607 fclose(f);
1608 buf[4095] = 0;
1609 double x;
1610 // TODO: use XML parser
1611
1612 // sanity checks - project and result name must match
1613 //
1614 if (!parse_str(buf, "<project_master_url>", s, sizeof(s))) {
1615 msg_printf(wup->project, MSG_INTERNAL_ERROR,
1616 "no project URL in task state file"
1617 );
1618 return;
1619 }
1620 if (strcmp(s, result->project->master_url)) {
1621 msg_printf(wup->project, MSG_INTERNAL_ERROR,
1622 "wrong project URL in task state file"
1623 );
1624 return;
1625 }
1626 if (!parse_str(buf, "<result_name>", s, sizeof(s))) {
1627 msg_printf(wup->project, MSG_INTERNAL_ERROR,
1628 "no task name in task state file"
1629 );
1630 return;
1631 }
1632 if (strcmp(s, result->name)) {
1633 msg_printf(wup->project, MSG_INTERNAL_ERROR,
1634 "wrong task name in task state file"
1635 );
1636 return;
1637 }
1638 if (parse_double(buf, "<checkpoint_cpu_time>", x)) {
1639 if (x > checkpoint_cpu_time) {
1640 checkpoint_cpu_time = x;
1641 }
1642 }
1643 if (parse_double(buf, "<checkpoint_elapsed_time>", x)) {
1644 if (x > checkpoint_elapsed_time) {
1645 checkpoint_elapsed_time = x;
1646 }
1647 }
1648 if (parse_double(buf, "<peak_working_set_size>", x)) {
1649 peak_working_set_size = x;
1650 }
1651 if (parse_double(buf, "<peak_swap_size>", x)) {
1652 peak_swap_size = x;
1653 }
1654 if (parse_double(buf, "<peak_disk_usage>", x)) {
1655 peak_disk_usage = x;
1656 }
1657 }
1658