1 // This file is part of BOINC.
2 // http://boinc.berkeley.edu
3 // Copyright (C) 2008 University of California
4 //
5 // BOINC is free software; you can redistribute it and/or modify it
6 // under the terms of the GNU Lesser General Public License
7 // as published by the Free Software Foundation,
8 // either version 3 of the License, or (at your option) any later version.
9 //
10 // BOINC is distributed in the hope that it will be useful,
11 // but WITHOUT ANY WARRANTY; without even the implied warranty of
12 // MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.
13 // See the GNU Lesser General Public License for more details.
14 //
15 // You should have received a copy of the GNU Lesser General Public License
16 // along with BOINC.  If not, see <http://www.gnu.org/licenses/>.
17 
18 // monitoring and process control of running apps
19 
20 #include "cpp.h"
21 
22 #ifdef _WIN32
23 #include "boinc_win.h"
24 #include "win_util.h"
25 #ifdef _MSC_VER
26 #define snprintf _snprintf
27 #endif
28 #ifndef STATUS_SUCCESS
29 #define STATUS_SUCCESS                0x0         // may be in ntstatus.h
30 #endif
31 #ifndef STATUS_DLL_INIT_FAILED
32 #define STATUS_DLL_INIT_FAILED        0xC0000142  // may be in ntstatus.h
33 #endif
34 #ifndef STATUS_DLL_INIT_FAILED_LOGOFF
35 #define STATUS_DLL_INIT_FAILED_LOGOFF 0xC000026B  // may be in ntstatus.h
36 #endif
37 
38 #else
39 #include "config.h"
40 #include <string>
41 #include <unistd.h>
42 
43 #if HAVE_SYS_IPC_H
44 #include <sys/ipc.h>
45 #endif
46 #if HAVE_SYS_RESOURCE_H
47 #include <sys/resource.h>
48 #endif
49 #if HAVE_CSIGNAL
50 #include <csignal>
51 #elif HAVE_SYS_SIGNAL_H
52 #include <sys/signal.h>
53 #elif HAVE_SIGNAL_H
54 #include <signal.h>
55 #endif
56 #if HAVE_SYS_WAIT_H
57 #include <sys/wait.h>
58 #endif
59 
60 #include <vector>
61 
62 #endif
63 
64 using std::vector;
65 
66 #include "error_numbers.h"
67 #include "filesys.h"
68 #include "parse.h"
69 #include "shmem.h"
70 #include "str_replace.h"
71 #include "str_util.h"
72 #include "util.h"
73 
74 #include "client_msgs.h"
75 #include "client_state.h"
76 #include "file_names.h"
77 #include "proc_control.h"
78 #include "result.h"
79 #include "sandbox.h"
80 
81 #include "app.h"
82 
83 // Do periodic checks on running apps:
84 // - get latest CPU time and % done info
85 // - check if any has exited, and clean up
86 // - see if any has exceeded its CPU or disk space limits, and abort it
87 //
poll()88 bool ACTIVE_TASK_SET::poll() {
89     bool action;
90     unsigned int i;
91     static double last_time = 0;
92     if (!gstate.clock_change && gstate.now - last_time < TASK_POLL_PERIOD) return false;
93     last_time = gstate.now;
94 
95     action = check_app_exited();
96     send_heartbeats();
97     send_trickle_downs();
98     process_control_poll();
99     action |= check_rsc_limits_exceeded();
100     get_msgs();
101     for (i=0; i<active_tasks.size(); i++) {
102         ACTIVE_TASK* atp = active_tasks[i];
103         if (atp->task_state() == PROCESS_ABORT_PENDING) {
104             if (gstate.now > atp->abort_time + ABORT_TIMEOUT) {
105                 if (log_flags.task_debug) {
106                     msg_printf(atp->result->project, MSG_INFO,
107                         "[task] abort request timed out, killing task %s",
108                         atp->result->name
109                     );
110                 }
111                 atp->kill_running_task(false);
112             }
113         }
114         if (atp->task_state() == PROCESS_QUIT_PENDING) {
115             if (gstate.now > atp->quit_time + QUIT_TIMEOUT) {
116                 if (log_flags.task_debug) {
117                     msg_printf(atp->result->project, MSG_INFO,
118                         "[task] quit request timed out, killing task %s",
119                         atp->result->name
120                     );
121                 }
122                 atp->kill_running_task(true);
123             }
124         }
125     }
126 
127     // Check for finish files every 10 sec.
128     // If we already found a finish file, abort the app;
129     // it must be hung somewhere in boinc_finish();
130     //
131     static double last_finish_check_time = 0;
132     if (gstate.clock_change || gstate.now - last_finish_check_time > 10) {
133         last_finish_check_time = gstate.now;
134         for (i=0; i<active_tasks.size(); i++) {
135             ACTIVE_TASK* atp = active_tasks[i];
136             if (atp->task_state() == PROCESS_UNINITIALIZED) continue;
137             if (atp->finish_file_time) {
138                 // process is still there 10 sec after it wrote finish file.
139                 // abort the job
140                 atp->abort_task(EXIT_ABORTED_BY_CLIENT, "finish file present too long");
141             } else if (atp->finish_file_present()) {
142                 atp->finish_file_time = gstate.now;
143             }
144         }
145     }
146     if (action) {
147         gstate.set_client_state_dirty("ACTIVE_TASK_SET::poll");
148     }
149 
150     return action;
151 }
152 
153 #if 0
154 // deprecated; TerminateProcessById() doesn't work if
155 // the process is running as a different user
156 //
157 #ifdef _WIN32
158 bool ACTIVE_TASK::kill_all_children() {
159     unsigned int i,j;
160     std::vector<PROCINFO> ps;
161     std::vector<PROCINFO> tps;
162 
163     procinfo_setup(ps);
164 
165     PROCINFO pi;
166     pi.id = pid;
167     tps.push_back(pi);
168 
169     for (i=0; i < tps.size(); i++) {
170         PROCINFO tp = tps[i];
171         for (j=0; j < ps.size(); j++) {
172             PROCINFO p = ps[j];
173             if (tp.id == p.parentid) {
174                 if (TerminateProcessById(p.id)) {
175                     tps.push_back(p);
176                 }
177             }
178         }
179     }
180     return true;
181 }
182 #endif
183 #endif
184 
print_descendants(int pid,vector<int> desc,const char * where)185 static void print_descendants(int pid, vector<int>desc, const char* where) {
186     msg_printf(0, MSG_INFO, "%s: PID %d has %d descendants",
187         where, pid, (int)desc.size()
188     );
189     for (unsigned int i=0; i<desc.size(); i++) {
190         msg_printf(0, MSG_INFO, "   PID %d", desc[i]);
191     }
192 }
193 
194 // Send a quit message, start timer, get descendants
195 //
request_exit()196 int ACTIVE_TASK::request_exit() {
197     if (app_client_shm.shm) {
198         process_control_queue.msg_queue_send(
199             "<quit/>",
200             app_client_shm.shm->process_control_request
201         );
202     }
203     set_task_state(PROCESS_QUIT_PENDING, "request_exit()");
204     quit_time = gstate.now;
205     get_descendants(pid, descendants);
206     if (log_flags.task_debug) {
207         print_descendants(pid, descendants, "request_exit()");
208     }
209     return 0;
210 }
211 
212 // Send an abort message, start timer, get descendants
213 //
request_abort()214 int ACTIVE_TASK::request_abort() {
215     if (app_client_shm.shm) {
216         process_control_queue.msg_queue_send(
217             "<abort/>",
218             app_client_shm.shm->process_control_request
219         );
220     }
221     set_task_state(PROCESS_ABORT_PENDING, "request_abort");
222     abort_time = gstate.now;
223     get_descendants(pid, descendants);
224     if (log_flags.task_debug) {
225         print_descendants(pid, descendants, "request_abort()");
226     }
227     return 0;
228 }
229 
230 #ifdef _WIN32
kill_app_process(int pid,bool will_restart)231 static void kill_app_process(int pid, bool will_restart) {
232     int retval = 0;
233     retval = kill_program(pid, will_restart?0:EXIT_ABORTED_BY_CLIENT);
234     if (retval && log_flags.task_debug) {
235         msg_printf(0, MSG_INFO,
236             "[task] kill_program(%d) failed: %s",
237             pid, boincerror(retval)
238         );
239     }
240 }
241 #else
kill_app_process(int pid,bool)242 static void kill_app_process(int pid, bool) {
243     int retval = 0;
244     if (g_use_sandbox) {
245         retval = kill_via_switcher(pid);
246         if (retval && log_flags.task_debug) {
247             msg_printf(0, MSG_INFO,
248                 "[task] kill_via_switcher(%d) failed: %s (%d)",
249                 pid,
250                 (retval>=0) ? strerror(errno) : boincerror(retval), retval
251             );
252         }
253     } else {
254         retval = kill_program(pid);
255         if (retval && log_flags.task_debug) {
256             msg_printf(0, MSG_INFO,
257                 "[task] kill_program(%d) failed: %s",
258                 pid, strerror(errno)
259             );
260         }
261     }
262 }
263 #endif
264 
265 // Kill a task whose main process is still running
266 // Just kill the main process; shared mem and subsidiary processes
267 // will be cleaned up after it exits, by cleanup_task();
268 //
kill_running_task(bool will_restart)269 int ACTIVE_TASK::kill_running_task(bool will_restart) {
270     kill_app_process(pid, will_restart);
271     return 0;
272 }
273 
274 // Kill any remaining subsidiary processes
275 // of a task whose main process has exited,
276 // namely:
277 // - its descendants (as recently enumerated; it's too late to do that now)
278 //   This list will be populated only in the quit and abort cases.
279 // - its "other" processes, e.g. VMs
280 //
kill_subsidiary_processes()281 int ACTIVE_TASK::kill_subsidiary_processes() {
282     unsigned int i;
283     for (i=0; i<other_pids.size(); i++) {
284         kill_app_process(other_pids[i], false);
285     }
286     for (i=0; i<descendants.size(); i++) {
287         kill_app_process(descendants[i], false);
288     }
289     return 0;
290 }
291 
292 // We have sent a quit request to the process; see if it's exited.
293 // This is called when the client exits,
294 // or when a project is detached or reset
295 //
has_task_exited()296 bool ACTIVE_TASK::has_task_exited() {
297     bool exited = false;
298 
299     if (!process_exists()) return true;
300 
301 #ifdef _WIN32
302     unsigned long exit_code;
303     if (GetExitCodeProcess(process_handle, &exit_code)) {
304         if (exit_code != STILL_ACTIVE) {
305             exited = true;
306         }
307     }
308 #else
309     if (waitpid(pid, 0, WNOHANG) == pid) {
310         exited = true;
311     }
312 #endif
313     if (exited) {
314         set_task_state(PROCESS_EXITED, "has_task_exited");
315         cleanup_task();
316     }
317     return exited;
318 }
319 
320 
limbo_message(ACTIVE_TASK & at)321 static void limbo_message(ACTIVE_TASK& at) {
322 #ifdef _WIN32
323     if (at.result->exit_status == STATUS_DLL_INIT_FAILED) {
324         msg_printf(at.result->project, MSG_INFO,
325             "Task %s exited with a DLL initialization error.",
326             at.result->name
327         );
328         msg_printf(at.result->project, MSG_INFO,
329             "If this happens repeatedly you may need to reboot your computer."
330         );
331         return;
332     }
333 #endif
334     msg_printf(at.result->project, MSG_INFO,
335         "Task %s exited with zero status but no 'finished' file",
336         at.result->name
337     );
338     msg_printf(at.result->project, MSG_INFO,
339         "If this happens repeatedly you may need to reset the project."
340     );
341 }
342 
343 // the job just exited.  If it's a GPU job,
344 // clear the "schedule_backoff" field of all other jobs
345 // that use the GPU type, in case they're waiting for GPU RAM
346 //
clear_schedule_backoffs(ACTIVE_TASK * atp)347 static void clear_schedule_backoffs(ACTIVE_TASK* atp) {
348     int rt = atp->result->avp->rsc_type();
349     if (rt == RSC_TYPE_CPU) return;
350     for (unsigned int i=0; i<gstate.results.size(); i++) {
351         RESULT* rp = gstate.results[i];
352         if (rp->avp->rsc_type() == rt) {
353             rp->schedule_backoff = 0;
354         }
355     }
356 }
357 
358 // handle a task that exited prematurely (i.e. no finish file)
359 //
handle_premature_exit(bool & will_restart)360 void ACTIVE_TASK::handle_premature_exit(bool& will_restart) {
361     // keep count of premature exits;
362     // if this happens 100 times w/o a checkpoint, abort job
363     //
364     premature_exit_count++;
365     if (premature_exit_count > 100) {
366         will_restart = false;
367         set_task_state(PROCESS_ABORTED, "handle_premature_exit");
368         result->exit_status = ERR_TOO_MANY_EXITS;
369         gstate.report_result_error(*result, "too many exit(0)s");
370         result->set_state(RESULT_ABORTED, "handle_premature_exit");
371     } else {
372         will_restart = true;
373         limbo_message(*this);
374         set_task_state(PROCESS_UNINITIALIZED, "handle_premature_exit");
375     }
376 }
377 
378 // handle a temporary exit
379 //
handle_temporary_exit(bool & will_restart,double backoff,const char * reason,bool is_notice)380 void ACTIVE_TASK::handle_temporary_exit(
381     bool& will_restart, double backoff, const char* reason, bool is_notice
382 ) {
383     premature_exit_count++;
384     if (premature_exit_count > 100) {
385         will_restart = false;
386         set_task_state(PROCESS_ABORTED, "handle_temporary_exit");
387         result->exit_status = ERR_TOO_MANY_EXITS;
388         gstate.report_result_error(*result, "too many boinc_temporary_exit()s");
389         result->set_state(RESULT_ABORTED, "handle_temporary_exit");
390     } else {
391         if (is_notice) {
392             msg_printf(result->project, MSG_USER_ALERT,
393                 "Task postponed: %s", reason
394             );
395         } else {
396             if (log_flags.task) {
397                 msg_printf(result->project, MSG_INFO,
398                     "task postponed %f sec: %s", backoff, reason
399                 );
400             }
401         }
402         will_restart = true;
403         result->schedule_backoff = gstate.now + backoff;
404         safe_strcpy(result->schedule_backoff_reason, reason);
405         set_task_state(PROCESS_UNINITIALIZED, "handle_temporary_exit");
406     }
407 }
408 
copy_final_info()409 void ACTIVE_TASK::copy_final_info() {
410     result->final_cpu_time = current_cpu_time;
411     result->final_elapsed_time = elapsed_time;
412     result->final_peak_working_set_size = peak_working_set_size;
413     result->final_peak_swap_size = peak_swap_size;
414     result->final_peak_disk_usage = peak_disk_usage;
415     result->final_bytes_sent = bytes_sent;
416     result->final_bytes_received = bytes_received;
417 }
418 
419 // deal with a process that has exited, for whatever reason:
420 // - completion
421 // - crash
422 // - quit or abort message sent by client
423 // - killed by client
424 //
425 #ifdef _WIN32
handle_exited_app(unsigned long exit_code)426 void ACTIVE_TASK::handle_exited_app(unsigned long exit_code) {
427     if (log_flags.task_debug) {
428         msg_printf(result->project, MSG_INFO,
429             "[task] Process for %s exited, exit code %lu, task state %d",
430             result->name, exit_code, task_state()
431         );
432     }
433 #else
434 void ACTIVE_TASK::handle_exited_app(int stat) {
435     if (log_flags.task_debug) {
436         msg_printf(result->project, MSG_INFO,
437             "[task] Process for %s exited, status %d, task state %d",
438             result->name, stat, task_state()
439         );
440     }
441 #endif
442     char err_msg[4096];
443     bool will_restart = false;
444 
445     get_app_status_msg();
446     get_trickle_up_msg();
447     copy_final_info();
448 
449     // if an abort or quit is pending,
450     // the process may have exited itself, or we may have killed it.
451     // Ignore exit status.
452     //
453     if (task_state() == PROCESS_ABORT_PENDING) {
454         set_task_state(PROCESS_ABORTED, "handle_exited_app");
455     } else if (task_state() == PROCESS_QUIT_PENDING) {
456         set_task_state(PROCESS_UNINITIALIZED, "handle_exited_app");
457         will_restart = true;
458     } else {
459 #ifdef _WIN32
460         result->exit_status = exit_code;
461         switch(exit_code) {
462         case STATUS_SUCCESS:
463             // if another process killed the app, it looks like exit(0).
464             // So check for the finish file
465             //
466             if (finish_file_present()) {
467                 set_task_state(PROCESS_EXITED, "handle_exited_app");
468                 break;
469             }
470             double x;
471             bool is_notice;
472             char buf[256];
473             safe_strcpy(buf, "");
474             if (temporary_exit_file_present(x, buf, is_notice)) {
475                 handle_temporary_exit(will_restart, x, buf, is_notice);
476             } else {
477                 handle_premature_exit(will_restart);
478             }
479             break;
480         case 0xc000013a:        // control-C??
481         case 0x40010004:        // vista shutdown?? can someone explain this?
482         case STATUS_DLL_INIT_FAILED:
483 		case STATUS_DLL_INIT_FAILED_LOGOFF:
484             // This can happen because:
485             // - The OS is shutting down, and attempting to start
486             //   any new application fails automatically.
487             // - The OS has run out of desktop heap
488             // - (reportedly) The computer has just come out of hibernation
489             //
490             handle_premature_exit(will_restart);
491             break;
492         default:
493             char szError[1024];
494             set_task_state(PROCESS_EXITED, "handle_exited_app");
495             sprintf(err_msg,
496                 "%s - exit code %d (0x%x)",
497                 windows_format_error_string(exit_code, szError, sizeof(szError)),
498                 exit_code, exit_code
499             );
500             gstate.report_result_error(*result, err_msg);
501             if (log_flags.task_debug) {
502                 msg_printf(result->project, MSG_INFO,
503                     "[task] Process for %s exited",
504                     result->name
505                 );
506                 msg_printf(result->project, MSG_INFO,
507                     "[task] exit code %d (0x%x): %s",
508                     exit_code, exit_code,
509                     windows_format_error_string(exit_code, szError, sizeof(szError))
510                 );
511             }
512             break;
513         }
514 #else
515         if (WIFEXITED(stat)) {
516             result->exit_status = WEXITSTATUS(stat);
517 
518             double x;
519             char buf[256];
520             bool is_notice;
521             if (temporary_exit_file_present(x, buf, is_notice)) {
522                 handle_temporary_exit(will_restart, x, buf, is_notice);
523             } else {
524                 if (log_flags.task_debug) {
525                     msg_printf(result->project, MSG_INFO,
526                         "[task] process exited with status %d\n",
527                         result->exit_status
528                     );
529                 }
530                 if (result->exit_status) {
531                     set_task_state(PROCESS_EXITED, "handle_exited_app");
532                     sprintf(err_msg,
533                         "process exited with code %d (0x%x, %d)",
534                         result->exit_status, result->exit_status,
535                         (-1<<8)|result->exit_status
536                     );
537                     gstate.report_result_error(*result, err_msg);
538                 } else {
539                     if (finish_file_present()) {
540                         set_task_state(PROCESS_EXITED, "handle_exited_app");
541                     } else {
542                         handle_premature_exit(will_restart);
543                     }
544                 }
545             }
546         } else if (WIFSIGNALED(stat)) {
547             int got_signal = WTERMSIG(stat);
548 
549             if (log_flags.task_debug) {
550                 msg_printf(result->project, MSG_INFO,
551                     "[task] process got signal %d", got_signal
552                 );
553             }
554 
555             // if the process was externally killed, let it restart.
556             //
557             switch (got_signal) {
558             case SIGHUP:
559             case SIGINT:
560             case SIGQUIT:
561             case SIGKILL:
562             case SIGTERM:
563             case SIGSTOP:
564                 will_restart = true;
565                 set_task_state(PROCESS_UNINITIALIZED, "handle_exited_app");
566                 break;
567             default:
568                 result->exit_status = stat;
569                 set_task_state(PROCESS_WAS_SIGNALED, "handle_exited_app");
570                 signal = got_signal;
571                 sprintf(err_msg, "process got signal %d", signal);
572                 gstate.report_result_error(*result, err_msg);
573             }
574         } else {
575             result->exit_status = EXIT_UNKNOWN;
576             set_task_state(PROCESS_EXIT_UNKNOWN, "handle_exited_app");
577             gstate.report_result_error(*result, "process exit, unknown");
578             msg_printf(result->project, MSG_INTERNAL_ERROR,
579                 "process exited for unknown reason"
580             );
581         }
582 #endif
583     }
584 
585     // get rid of shared-mem segment and kill subsidiary processes
586     //
587     cleanup_task();
588 
589     if (gstate.run_test_app) {
590         msg_printf(0, MSG_INFO, "test app finished - exiting");
591         exit(0);
592     }
593 
594     if (!will_restart) {
595         copy_output_files();
596         int retval = read_stderr_file();
597         if (retval) {
598             msg_printf(result->project, MSG_INTERNAL_ERROR,
599                 "read_stderr_file(): %s", boincerror(retval)
600             );
601         }
602         client_clean_out_dir(slot_dir, "handle_exited_app()");
603         clear_schedule_backoffs(this);
604             // clear scheduling backoffs of jobs waiting for GPU
605     }
606     gstate.request_schedule_cpus("application exited");
607     gstate.request_work_fetch("application exited");
608 }
609 
// structure of a finish file (see boinc_api.cpp):
611 // exit status (int)
612 // message
613 // "notice" or blank line
614 // ... or empty
615 //
616 bool ACTIVE_TASK::finish_file_present() {
617     char path[MAXPATHLEN], buf[1024], buf2[256];
618     safe_strcpy(buf, "");
619     safe_strcpy(buf2, "");
620     sprintf(path, "%s/%s", slot_dir, BOINC_FINISH_CALLED_FILE);
621     FILE* f = boinc_fopen(path, "r");
622     if (!f) return false;
623     fgets(buf, sizeof(buf), f);     // read (and discard) exit status
624     char* p = fgets(buf, sizeof(buf), f);
625     if (p && strlen(buf)) {
626         fgets(buf2, sizeof(buf2), f);
627         msg_printf(result->project,
628             strstr(buf2, "notice")?MSG_USER_ALERT:MSG_INFO,
629             "Message from task: %s", buf
630         );
631     }
632     fclose(f);
633     return true;
634 }
635 
636 bool ACTIVE_TASK::temporary_exit_file_present(
637     double& x, char* buf, bool& is_notice
638 ) {
639     char path[MAXPATHLEN], buf2[256];
640     sprintf(path, "%s/%s", slot_dir, TEMPORARY_EXIT_FILE);
641     FILE* f = boinc_fopen(path, "r");
642     if (!f) return false;
643     strcpy(buf, "");
644     int y;
645     int n = fscanf(f, "%d", &y);
646     if (n != 1 || y < 0 || y > 86400) {
647         x = 300;
648     } else {
649         x = y;
650     }
651     (void) fgets(buf, 256, f);     // read the \n
652     (void) fgets(buf, 256, f);
653     strip_whitespace(buf);
654     is_notice = false;
655     if (fgets(buf2, 256, f)) {
656         if (strstr(buf2, "notice")) {
657             is_notice = true;
658         }
659     }
660     fclose(f);
661     return true;
662 }
663 
664 void ACTIVE_TASK_SET::send_trickle_downs() {
665     unsigned int i;
666     ACTIVE_TASK* atp;
667     bool sent;
668     for (i=0; i<active_tasks.size(); i++) {
669         atp = active_tasks[i];
670         if (!atp->process_exists()) continue;
671         if (atp->have_trickle_down) {
672             if (!atp->app_client_shm.shm) continue;
673             sent = atp->app_client_shm.shm->trickle_down.send_msg("<have_trickle_down/>\n");
674             if (sent) atp->have_trickle_down = false;
675         }
676         if (atp->send_upload_file_status) {
677             if (!atp->app_client_shm.shm) continue;
678             sent = atp->app_client_shm.shm->trickle_down.send_msg("<upload_file_status/>\n");
679             if (sent) atp->send_upload_file_status = false;
680        }
681     }
682 }
683 
684 void ACTIVE_TASK_SET::send_heartbeats() {
685     unsigned int i;
686     ACTIVE_TASK* atp;
687     char buf[1024];
688     double ar = gstate.available_ram();
689 
690     for (i=0; i<active_tasks.size(); i++) {
691         atp = active_tasks[i];
692         if (!atp->process_exists()) continue;
693         if (!atp->app_client_shm.shm) continue;
694         snprintf(buf, sizeof(buf), "<heartbeat/>"
695             "<wss>%e</wss>"
696             "<max_wss>%e</max_wss>",
697             atp->procinfo.working_set_size, ar
698         );
699         if (gstate.network_suspended) {
700             safe_strcat(buf, "<network_suspended/>");
701         }
702         bool sent = atp->app_client_shm.shm->heartbeat.send_msg(buf);
703         if (log_flags.heartbeat_debug) {
704             if (sent) {
705                 msg_printf(atp->result->project, MSG_INFO,
706                     "[heartbeat] Heartbeat sent to task %s",
707                     atp->result->name
708                 );
709             } else {
710                 msg_printf(atp->result->project, MSG_INFO,
711                     "[heartbeat] Heartbeat to task %s failed, previous message unread",
712                     atp->result->name
713                 );
714             }
715         }
716     }
717 }
718 
719 // send queued process-control messages; check for timeout
720 //
721 void ACTIVE_TASK_SET::process_control_poll() {
722     unsigned int i;
723     ACTIVE_TASK* atp;
724 
725     for (i=0; i<active_tasks.size(); i++) {
726         atp = active_tasks[i];
727         if (!atp->process_exists()) continue;
728         if (!atp->app_client_shm.shm) continue;
729 
730         // if app has had the same message in its send buffer for 180 sec,
731         // assume it's hung and restart it
732         //
733         if (atp->process_control_queue.timeout(180)) {
734             if (log_flags.task_debug) {
735                 msg_printf(atp->result->project, MSG_INFO,
736                     "Restarting %s - message timeout", atp->result->name
737                 );
738             }
739             atp->kill_running_task(true);
740         } else {
741             atp->process_control_queue.msg_queue_poll(
742                 atp->app_client_shm.shm->process_control_request
743             );
744         }
745     }
746 }
747 
748 // See if any processes have exited
749 //
750 bool ACTIVE_TASK_SET::check_app_exited() {
751     ACTIVE_TASK* atp;
752     bool found = false;
753 
754 #ifdef _WIN32
755     unsigned long exit_code;
756     unsigned int i;
757 
758     for (i=0; i<active_tasks.size(); i++) {
759         atp = active_tasks[i];
760         if (!atp->process_exists()) continue;
761         if (GetExitCodeProcess(atp->process_handle, &exit_code)) {
762             if (exit_code != STILL_ACTIVE) {
763                 found = true;
764                 atp->handle_exited_app(exit_code);
765             }
766         } else {
767             if (log_flags.task_debug) {
768                 char errmsg[1024];
769                 msg_printf(atp->result->project, MSG_INFO,
770                     "[task] task %s GetExitCodeProcess() failed - %s GLE %d (0x%x)",
771                     atp->result->name,
772                     windows_format_error_string(
773                         GetLastError(), errmsg, sizeof(errmsg)
774                     ),
775                     GetLastError(), GetLastError()
776                 );
777             }
778 
779             // The process doesn't seem to be there.
780             // Mark task as aborted so we don't check it again.
781             //
782             atp->cleanup_task();
783             atp->set_task_state(PROCESS_ABORTED, "check_app_exited");
784         }
785     }
786 #else
787     int pid, stat;
788 
789     if ((pid = waitpid(-1, &stat, WNOHANG)) > 0) {
790         atp = lookup_pid(pid);
791         if (!atp) {
792             // if we're running benchmarks, exited process
793             // is probably a benchmark process; don't show error
794             //
795             if (!gstate.benchmarks_running && log_flags.task_debug) {
796                 msg_printf(NULL, MSG_INTERNAL_ERROR,
797                     "Process %d not found\n", pid
798                 );
799             }
800             return false;
801         }
802         atp->handle_exited_app(stat);
803         found = true;
804     }
805 #endif
806 
807     return found;
808 }
809 
810 // if an app has exceeded its maximum disk usage, abort it
811 //
812 bool ACTIVE_TASK::check_max_disk_exceeded() {
813     double disk_usage;
814     int retval;
815 
816     retval = current_disk_usage(disk_usage);
817     if (retval) {
818         msg_printf(this->wup->project, MSG_INTERNAL_ERROR,
819             "Can't get task disk usage: %s", boincerror(retval)
820         );
821     } else {
822         if (disk_usage > max_disk_usage) {
823             msg_printf(
824                 result->project, MSG_INFO,
825                 "Aborting task %s: exceeded disk limit: %.2fMB > %.2fMB\n",
826                 result->name, disk_usage/MEGA, max_disk_usage/MEGA
827             );
828             abort_task(EXIT_DISK_LIMIT_EXCEEDED, "Disk usage limit exceeded");
829             return true;
830         }
831     }
832     return false;
833 }
834 
835 // Check if any of the active tasks have exceeded their
836 // resource limits on disk, CPU time or memory
837 //
// TODO: this gets called every 1 sec,
839 // but mem and disk usage are computed less often.
840 // refactor.
841 //
842 bool ACTIVE_TASK_SET::check_rsc_limits_exceeded() {
843     unsigned int i;
844     ACTIVE_TASK *atp;
845     static double last_disk_check_time = 0;
846     bool do_disk_check = false;
847     bool did_anything = false;
848     char buf[256];
849 
850     double ram_left = gstate.available_ram();
851     double max_ram = gstate.max_available_ram();
852 
853     // Some slot dirs have lots of files,
854     // so only check every min(disk_interval, 300) secs
855     //
856     double min_interval = gstate.global_prefs.disk_interval;
857     if (min_interval < 300) min_interval = 300;
858     if (gstate.clock_change || gstate.now > last_disk_check_time + min_interval) {
859         do_disk_check = true;
860     }
861     for (i=0; i<active_tasks.size(); i++) {
862         atp = active_tasks[i];
863         if (atp->task_state() != PROCESS_EXECUTING) continue;
864         if (!atp->result->non_cpu_intensive() && (atp->elapsed_time > atp->max_elapsed_time)) {
865             sprintf(buf, "exceeded elapsed time limit %.2f (%.2fG/%.2fG)",
866                 atp->max_elapsed_time,
867                 atp->result->wup->rsc_fpops_bound/1e9,
868                 atp->result->avp->flops/1e9
869             );
870             msg_printf(atp->result->project, MSG_INFO,
871                 "Aborting task %s: %s", atp->result->name, buf
872             );
873             atp->abort_task(EXIT_TIME_LIMIT_EXCEEDED, buf);
874             did_anything = true;
875             continue;
876         }
877 #if 0
878         // removing this for now because most projects currently
879         // have too-low values of workunit.rsc_memory_bound
880         // (causing lots of aborts)
881         // and I don't think we can expect projects to provide
882         // accurate bounds.
883         //
884         if (atp->procinfo.working_set_size_smoothed > atp->max_mem_usage) {
885             sprintf(buf, "working set size > workunit.rsc_memory_bound: %.2fMB > %.2fMB",
886                 atp->procinfo.working_set_size_smoothed/MEGA, atp->max_mem_usage/MEGA
887             );
888             msg_printf(atp->result->project, MSG_INFO,
889                 "Aborting task %s: %s",
890                 atp->result->name, buf
891             );
892             atp->abort_task(EXIT_MEM_LIMIT_EXCEEDED, buf);
893             did_anything = true;
894             continue;
895         }
896 #endif
897         if (atp->procinfo.working_set_size_smoothed > max_ram) {
898             sprintf(buf, "working set size > client RAM limit: %.2fMB > %.2fMB",
899                 atp->procinfo.working_set_size_smoothed/MEGA, max_ram/MEGA
900             );
901             msg_printf(atp->result->project, MSG_INFO,
902                 "Aborting task %s: %s",
903                 atp->result->name, buf
904             );
905             atp->abort_task(EXIT_MEM_LIMIT_EXCEEDED, buf);
906             did_anything = true;
907             continue;
908         }
909         if (do_disk_check || atp->peak_disk_usage == 0) {
910             if (atp->check_max_disk_exceeded()) {
911                 did_anything = true;
912                 continue;
913             }
914         }
915 
916         // don't count RAM usage of non-CPU-intensive jobs
917         //
918         if (!atp->result->non_cpu_intensive()) {
919             ram_left -= atp->procinfo.working_set_size_smoothed;
920         }
921     }
922     if (ram_left < 0) {
923         gstate.request_schedule_cpus("RAM usage limit exceeded");
924     }
925     if (do_disk_check) {
926         last_disk_check_time = gstate.now;
927     }
928     return did_anything;
929 }
930 
931 // If process is running, send it an "abort" message,
932 // Set a flag so that if it doesn't exit within 5 seconds,
933 // kill it by OS-specific mechanism (e.g. KILL signal).
934 // This is done when app has exceeded CPU, disk, or mem limits,
935 // or when the user has requested it.
936 // The task won't be restarted.
937 //
938 int ACTIVE_TASK::abort_task(int exit_status, const char* msg) {
939     if (task_state() == PROCESS_EXECUTING || task_state() == PROCESS_SUSPENDED) {
940         request_abort();
941     } else {
942         set_task_state(PROCESS_ABORTED, "abort_task");
943     }
944     result->exit_status = exit_status;
945     gstate.report_result_error(*result, msg);
946     result->set_state(RESULT_ABORTED, "abort_task");
947     if (task_state() == PROCESS_ABORTED) {
948         copy_final_info();
949         read_stderr_file();
950     }
951     return 0;
952 }
953 
954 // check for the stderr file, copy to result record
955 //
956 int ACTIVE_TASK::read_stderr_file() {
957     char* buf1, *buf2;
958     char path[MAXPATHLEN];
959 
960     // truncate stderr output to the last 63KB;
961     // it's unlikely that more than that will be useful
962     //
963     int max_len = 63*1024;
964     sprintf(path, "%s/%s", slot_dir, STDERR_FILE);
965     if (!boinc_file_exists(path)) return 0;
966     int retval  = read_file_malloc(
967         path, buf1, max_len, !cc_config.stderr_head
968     );
969     if (retval) return retval;
970 
971     // if it's a vbox app, check for string in stderr saying
972     // the job failed because CPU VM extensions disabled
973     //
974     if (strstr(app_version->plan_class, "vbox")) {
975         if (strstr(buf1, "ERR_CPU_VM_EXTENSIONS_DISABLED")) {
976             msg_printf(0, MSG_INFO,
977                 "Vbox app stderr indicates CPU VM extensions disabled"
978             );
979             gstate.host_info.p_vm_extensions_disabled = true;
980         }
981     }
982 
983     buf2 = (char*)malloc(2*max_len);
984     if (!buf2) {
985         free(buf1);
986         return ERR_MALLOC;
987     }
988     non_ascii_escape(buf1, buf2, 2*max_len);
989     result->stderr_out += "<stderr_txt>\n";
990     result->stderr_out += buf2;
991     result->stderr_out += "\n</stderr_txt>\n";
992     free(buf1);
993     free(buf2);
994     return 0;
995 }
996 
997 // tell a running app to reread project preferences.
998 // This is called when project prefs change,
999 // or when a user file has finished downloading.
1000 //
1001 // TODO: get rid of this function
1002 //
1003 int ACTIVE_TASK::request_reread_prefs() {
1004     int retval;
1005     APP_INIT_DATA aid;
1006 
1007     link_user_files();
1008 
1009     init_app_init_data(aid);
1010     retval = write_app_init_file(aid);
1011     if (retval) return retval;
1012 #if 0
1013     graphics_request_queue.msg_queue_send(
1014         xml_graphics_modes[MODE_REREAD_PREFS],
1015         app_client_shm.shm->graphics_request
1016     );
1017 #endif
1018     return 0;
1019 }
1020 
1021 // tell a running app to reread the app_info file
1022 // (e.g. because proxy settings have changed: this is for F@h)
1023 //
1024 int ACTIVE_TASK::request_reread_app_info() {
1025     APP_INIT_DATA aid;
1026     init_app_init_data(aid);
1027     int retval = write_app_init_file(aid);
1028     if (retval) return retval;
1029     process_control_queue.msg_queue_send(
1030         "<reread_app_info/>",
1031         app_client_shm.shm->process_control_request
1032     );
1033     return 0;
1034 }
1035 
1036 // tell all running apps of a project to reread prefs
1037 //
1038 void ACTIVE_TASK_SET::request_reread_prefs(PROJECT* project) {
1039     unsigned int i;
1040     ACTIVE_TASK* atp;
1041 
1042     for (i=0; i<active_tasks.size(); i++) {
1043         atp = active_tasks[i];
1044         if (atp->result->project != project) continue;
1045         if (!atp->process_exists()) continue;
1046         atp->request_reread_prefs();
1047     }
1048 }
1049 
1050 void ACTIVE_TASK_SET::request_reread_app_info() {
1051     for (unsigned int i=0; i<active_tasks.size(); i++) {
1052         ACTIVE_TASK* atp = active_tasks[i];
1053         if (!atp->process_exists()) continue;
1054         atp->request_reread_app_info();
1055     }
1056 }
1057 
1058 
1059 // send quit message to all tasks in the project
1060 // (or all tasks, if proj is NULL).
1061 // If they don't exit in QUIT_TIMEOUT seconds,
1062 // send them a kill signal and wait up to 5 more seconds to exit.
1063 // This is called when the client exits,
1064 // or when a project is detached or reset
1065 //
1066 int ACTIVE_TASK_SET::exit_tasks(PROJECT* proj) {
1067     if (log_flags.task_debug) {
1068         msg_printf(NULL, MSG_INFO, "[task_debug] requesting tasks to exit");
1069     }
1070     request_tasks_exit(proj);
1071 
1072     // Wait for tasks to exit normally; if they don't then kill them
1073     //
1074     if (wait_for_exit(QUIT_TIMEOUT, proj)) {
1075         if (log_flags.task_debug) {
1076             msg_printf(NULL, MSG_INFO,
1077                 "[task_debug] all tasks haven't exited after %d sec; killing them",
1078                 QUIT_TIMEOUT
1079             );
1080         }
1081         kill_tasks(proj);
1082         if (wait_for_exit(5, proj)) {
1083             if (log_flags.task_debug) {
1084                 msg_printf(NULL, MSG_INFO,
1085                     "[task_debug] tasks still not exited after 5 secs; giving up"
1086                 );
1087             }
1088         } else {
1089             if (log_flags.task_debug) {
1090                 msg_printf(NULL, MSG_INFO, "[task_debug] all tasks exited");
1091             }
1092         }
1093     } else {
1094         if (log_flags.task_debug) {
1095             msg_printf(NULL, MSG_INFO, "[task_debug] all tasks exited");
1096         }
1097     }
1098 
1099     // get final checkpoint_cpu_times
1100     //
1101     get_msgs();
1102 
1103     gstate.request_schedule_cpus("exit_tasks");
1104     return 0;
1105 }
1106 
1107 // Wait up to wait_time seconds for processes to exit
1108 // If proj is zero, wait for all processes, else that project's
1109 // NOTE: it's bad form to sleep, but it would be complex to avoid it here
1110 //
1111 int ACTIVE_TASK_SET::wait_for_exit(double wait_time, PROJECT* proj) {
1112     bool all_exited;
1113     unsigned int i,n;
1114     ACTIVE_TASK *atp;
1115 
1116     for (i=0; i<10; i++) {
1117         all_exited = true;
1118 
1119         for (n=0; n<active_tasks.size(); n++) {
1120             atp = active_tasks[n];
1121             if (proj && atp->wup->project != proj) continue;
1122             if (!atp->has_task_exited()) {
1123                 all_exited = false;
1124                 break;
1125             }
1126         }
1127 
1128         if (all_exited) return 0;
1129         boinc_sleep(wait_time/10.0);
1130     }
1131 
1132     return ERR_NOT_EXITED;
1133 }
1134 
1135 int ACTIVE_TASK_SET::abort_project(PROJECT* project) {
1136     vector<ACTIVE_TASK*>::iterator task_iter;
1137     ACTIVE_TASK* atp;
1138 
1139     exit_tasks(project);
1140     task_iter = active_tasks.begin();
1141     while (task_iter != active_tasks.end()) {
1142         atp = *task_iter;
1143         if (atp->result->project == project) {
1144             client_clean_out_dir(atp->slot_dir, "abort_project()");
1145             remove_project_owned_dir(atp->slot_dir);
1146             task_iter = active_tasks.erase(task_iter);
1147             delete atp;
1148         } else {
1149             ++task_iter;
1150         }
1151     }
1152     return 0;
1153 }
1154 
1155 // suspend all currently running tasks
1156 // e.g. because on batteries, time of day, benchmarking, CPU throttle, etc.
1157 //
1158 void ACTIVE_TASK_SET::suspend_all(int reason) {
1159     for (unsigned int i=0; i<active_tasks.size(); i++) {
1160         ACTIVE_TASK* atp = active_tasks[i];
1161 
1162         // don't suspend if process doesn't exist,
1163         // or if quit/abort is pending.
1164         // If process is currently suspended, proceed;
1165         // the new suspension may require it to be removed from memory.
1166         // E.g. a GPU job may currently be suspended due to CPU throttling,
1167         // and therefore left in memory,
1168         // but this suspension (say, a user request)
1169         // might require it to be removed from memory.
1170         //
1171         switch (atp->task_state()) {
1172         case PROCESS_EXECUTING:
1173         case PROCESS_SUSPENDED:
1174             break;
1175         default:
1176             continue;
1177         }
1178 
1179         // special cases for non-CPU-intensive apps
1180         //
1181         if (atp->result->non_cpu_intensive()) {
1182             if (cc_config.dont_suspend_nci) {
1183                 continue;
1184             }
1185             if (reason == SUSPEND_REASON_BATTERIES) {
1186                 continue;
1187             }
1188         }
1189 
1190         // handle CPU throttling separately
1191         //
1192         if (reason == SUSPEND_REASON_CPU_THROTTLE) {
1193             if (atp->result->dont_throttle()) continue;
1194             atp->preempt(REMOVE_NEVER, reason);
1195             continue;
1196         }
1197 
1198 #ifdef ANDROID
1199         // On Android, remove apps from memory if on batteries
1200         // no matter what the reason for suspension.
1201         // The message polling in the BOINC runtime system
1202         // imposes an overhead which drains the battery
1203         //
1204         if (gstate.host_info.host_is_running_on_batteries()) {
1205             atp->preempt(REMOVE_ALWAYS);
1206             continue;
1207         }
1208 #endif
1209 
1210         switch (reason) {
1211         case SUSPEND_REASON_BENCHMARKS:
1212             atp->preempt(REMOVE_NEVER);
1213             break;
1214         case SUSPEND_REASON_CPU_USAGE:
1215             // If we're suspending because of non-BOINC CPU load,
1216             // don't remove from memory.
1217             // Some systems do a security check when apps are launched,
1218             // which uses a lot of CPU.
1219             // Avoid going into a preemption loop.
1220             //
1221             if (atp->result->non_cpu_intensive()) break;
1222             atp->preempt(REMOVE_NEVER);
1223             break;
1224         case SUSPEND_REASON_BATTERY_OVERHEATED:
1225         case SUSPEND_REASON_BATTERY_CHARGING:
1226             // these conditions can oscillate, so leave apps in mem
1227             //
1228             atp->preempt(REMOVE_NEVER);
1229             break;
1230         default:
1231             atp->preempt(REMOVE_MAYBE_USER);
1232             break;
1233         }
1234     }
1235 }
1236 
1237 // resume all currently scheduled tasks
1238 //
1239 void ACTIVE_TASK_SET::unsuspend_all(int reason) {
1240     unsigned int i;
1241     ACTIVE_TASK* atp;
1242     for (i=0; i<active_tasks.size(); i++) {
1243         atp = active_tasks[i];
1244         if (atp->scheduler_state != CPU_SCHED_SCHEDULED) continue;
1245         if (atp->task_state() == PROCESS_UNINITIALIZED) {
1246             if (atp->resume_or_start(false)) {
1247                 msg_printf(atp->wup->project, MSG_INTERNAL_ERROR,
1248                     "Couldn't restart task %s", atp->result->name
1249                 );
1250             }
1251         } else if (atp->task_state() == PROCESS_SUSPENDED) {
1252             atp->unsuspend(reason);
1253         }
1254     }
1255 }
1256 
1257 // Check to see if any tasks are running
1258 // called if benchmarking and waiting for suspends to happen
1259 // or the system needs to suspend itself so we are suspending
1260 // the applications
1261 //
1262 bool ACTIVE_TASK_SET::is_task_executing() {
1263     unsigned int i;
1264     ACTIVE_TASK* atp;
1265     for (i=0; i<active_tasks.size(); i++) {
1266         atp = active_tasks[i];
1267         if (atp->task_state() == PROCESS_EXECUTING) {
1268             return true;
1269         }
1270     }
1271     return false;
1272 }
1273 
1274 // Send quit message to all app processes
1275 // This is called when the client exits,
1276 // or when a project is detached or reset
1277 //
1278 void ACTIVE_TASK_SET::request_tasks_exit(PROJECT* proj) {
1279     unsigned int i;
1280     ACTIVE_TASK *atp;
1281     for (i=0; i<active_tasks.size(); i++) {
1282         atp = active_tasks[i];
1283         if (proj && atp->wup->project != proj) continue;
1284         if (!atp->process_exists()) continue;
1285         atp->request_exit();
1286     }
1287 }
1288 
1289 // Send kill signal to all app processes
1290 // Don't wait for them to exit
1291 //
1292 void ACTIVE_TASK_SET::kill_tasks(PROJECT* proj) {
1293     unsigned int i;
1294     ACTIVE_TASK *atp;
1295     for (i=0; i<active_tasks.size(); i++) {
1296         atp = active_tasks[i];
1297         if (proj && atp->wup->project != proj) continue;
1298         if (!atp->process_exists()) continue;
1299         atp->kill_running_task(true);
1300     }
1301 }
1302 
1303 // send a <suspend> message
1304 //
1305 int ACTIVE_TASK::suspend() {
1306     if (!app_client_shm.shm) return 0;
1307     if (task_state() != PROCESS_EXECUTING) {
1308         msg_printf(result->project, MSG_INTERNAL_ERROR,
1309             "ACTIVE_TASK::SUSPEND(): expected task %s to be executing",
1310             result->name
1311         );
1312     }
1313     int n = process_control_queue.msg_queue_purge("<resume/>");
1314     if (n == 0) {
1315         process_control_queue.msg_queue_send(
1316             "<suspend/>",
1317             app_client_shm.shm->process_control_request
1318         );
1319     }
1320     set_task_state(PROCESS_SUSPENDED, "suspend");
1321     return 0;
1322 }
1323 
1324 // resume a suspended task
1325 //
1326 int ACTIVE_TASK::unsuspend(int reason) {
1327     if (!app_client_shm.shm) return 0;
1328     if (task_state() != PROCESS_SUSPENDED) {
1329         msg_printf(result->project, MSG_INFO,
1330             "Internal error: expected process %s to be suspended", result->name
1331         );
1332     }
1333     if (log_flags.cpu_sched && reason != SUSPEND_REASON_CPU_THROTTLE) {
1334         msg_printf(result->project, MSG_INFO,
1335             "[cpu_sched] Resuming %s", result->name
1336         );
1337     }
1338     int n = process_control_queue.msg_queue_purge("<suspend/>");
1339     if (n == 0) {
1340         process_control_queue.msg_queue_send(
1341             "<resume/>",
1342             app_client_shm.shm->process_control_request
1343         );
1344     }
1345     set_task_state(PROCESS_EXECUTING, "unsuspend");
1346     return 0;
1347 }
1348 
1349 void ACTIVE_TASK::send_network_available() {
1350     if (!app_client_shm.shm) return;
1351     process_control_queue.msg_queue_send(
1352         "<network_available/>",
1353         app_client_shm.shm->process_control_request
1354     );
1355     return;
1356 }
1357 
1358 // See if the app has placed a new message in shared mem
1359 // (with CPU done, frac done etc.)
1360 // If so parse it and return true.
1361 //
1362 bool ACTIVE_TASK::get_app_status_msg() {
1363     char msg_buf[MSG_CHANNEL_SIZE];
1364     double fd;
1365     int other_pid;
1366     double dtemp;
1367     static double last_msg_time=0;
1368 
1369     if (!app_client_shm.shm) {
1370         msg_printf(result->project, MSG_INFO,
1371             "Task %s: no shared memory segment", result->name
1372         );
1373         return false;
1374     }
1375     if (!app_client_shm.shm->app_status.get_msg(msg_buf)) {
1376         return false;
1377     }
1378     if (log_flags.app_msg_receive) {
1379         msg_printf(this->wup->project, MSG_INFO,
1380             "[app_msg_receive] got msg from slot %d: %s", slot, msg_buf
1381         );
1382     }
1383     want_network = 0;
1384     current_cpu_time = checkpoint_cpu_time = 0.0;
1385     if (parse_double(msg_buf, "<fraction_done>", fd)) {
1386         // fraction_done will be reported as zero
1387         // until the app's first call to boinc_fraction_done().
1388         // So ignore zeros.
1389         //
1390         if (fd) {
1391             fraction_done = fd;
1392             fraction_done_elapsed_time = elapsed_time;
1393             if (!first_fraction_done) {
1394                 first_fraction_done = fd;
1395                 first_fraction_done_elapsed_time = elapsed_time;
1396             }
1397             if (log_flags.task_debug && (fd<0 || fd>1)) {
1398                 if (gstate.now > last_msg_time + 60) {
1399                     msg_printf(this->wup->project, MSG_INFO,
1400                         "[task_debug] app reported bad fraction done: %f", fd
1401                     );
1402                     last_msg_time = gstate.now;
1403                 }
1404             }
1405         }
1406     }
1407     parse_double(msg_buf, "<current_cpu_time>", current_cpu_time);
1408     parse_double(msg_buf, "<checkpoint_cpu_time>", checkpoint_cpu_time);
1409     parse_double(msg_buf, "<fpops_per_cpu_sec>", result->fpops_per_cpu_sec);
1410     parse_double(msg_buf, "<fpops_cumulative>", result->fpops_cumulative);
1411     parse_double(msg_buf, "<intops_per_cpu_sec>", result->intops_per_cpu_sec);
1412     parse_double(msg_buf, "<intops_cumulative>", result->intops_cumulative);
1413     if (parse_double(msg_buf, "<bytes_sent>", dtemp)) {
1414         if (dtemp > bytes_sent_episode) {
1415             double nbytes = dtemp - bytes_sent_episode;
1416             daily_xfer_history.add(nbytes, true);
1417             bytes_sent += nbytes;
1418         }
1419         bytes_sent_episode = dtemp;
1420     }
1421     if (parse_double(msg_buf, "<bytes_received>", dtemp)) {
1422         if (dtemp > bytes_received_episode) {
1423             double nbytes = dtemp - bytes_received_episode;
1424             daily_xfer_history.add(nbytes, false);
1425             bytes_received += nbytes;
1426         }
1427         bytes_received_episode = dtemp;
1428     }
1429     parse_int(msg_buf, "<want_network>", want_network);
1430     if (parse_int(msg_buf, "<other_pid>", other_pid)) {
1431         // for now, we handle only one of these
1432         other_pids.clear();
1433         other_pids.push_back(other_pid);
1434     }
1435     if (current_cpu_time < 0) {
1436         msg_printf(result->project, MSG_INFO,
1437             "app reporting negative CPU: %f", current_cpu_time
1438         );
1439         current_cpu_time = 0;
1440     }
1441     if (checkpoint_cpu_time < 0) {
1442         msg_printf(result->project, MSG_INFO,
1443             "app reporting negative checkpoint CPU: %f", checkpoint_cpu_time
1444         );
1445         checkpoint_cpu_time = 0;
1446     }
1447     return true;
1448 }
1449 
1450 void ACTIVE_TASK::get_graphics_msg() {
1451     char msg_buf[MSG_CHANNEL_SIZE];
1452 
1453     if (!app_client_shm.shm) return;
1454     if (app_client_shm.shm->graphics_reply.get_msg(msg_buf)) {
1455         if (log_flags.app_msg_receive) {
1456             msg_printf(this->wup->project, MSG_INFO,
1457                 "[app_msg_receive] got msg from slot %d: %s", slot, msg_buf
1458             );
1459         }
1460 
1461         parse_str(msg_buf, "<web_graphics_url>", web_graphics_url, sizeof(web_graphics_url));
1462         parse_str(msg_buf, "<remote_desktop_addr>", remote_desktop_addr, sizeof(remote_desktop_addr));
1463     }
1464 }
1465 
1466 bool ACTIVE_TASK::get_trickle_up_msg() {
1467     char msg_buf[MSG_CHANNEL_SIZE];
1468     bool found = false;
1469     int retval;
1470 
1471     if (!app_client_shm.shm) return false;
1472     if (app_client_shm.shm->trickle_up.get_msg(msg_buf)) {
1473         if (match_tag(msg_buf, "<have_new_trickle_up/>")) {
1474             if (log_flags.app_msg_receive) {
1475                 msg_printf(NULL, MSG_INFO,
1476                     "[app_msg_receive] got msg from slot %d: %s", slot, msg_buf
1477                 );
1478             }
1479             retval = move_trickle_file();
1480             if (!retval) {
1481                 wup->project->trickle_up_pending = true;
1482             }
1483         }
1484         if (match_tag(msg_buf, "<have_new_upload_file/>")) {
1485             if (log_flags.app_msg_receive) {
1486                 msg_printf(NULL, MSG_INFO,
1487                     "[app_msg_receive] got msg from slot %d: %s", slot, msg_buf
1488                 );
1489             }
1490             handle_upload_files();
1491         }
1492         found = true;
1493     }
1494     return found;
1495 }
1496 
1497 // check for msgs from active tasks,
1498 // and update their elapsed time and other info
1499 //
1500 void ACTIVE_TASK_SET::get_msgs() {
1501     unsigned int i;
1502     ACTIVE_TASK *atp;
1503     double old_time;
1504     static double last_time=0;
1505     double delta_t;
1506     if (!gstate.clock_change && last_time) {
1507         delta_t = gstate.now - last_time;
1508 
1509         // Normally this is called every second.
1510         // If delta_t is > 10, we'll assume that a period of hibernation
1511         // or suspension happened, and treat it as zero.
1512         // If negative, must be clock reset.  Ignore.
1513         //
1514         if (delta_t > 10 || delta_t < 0) {
1515             delta_t = 0;
1516         }
1517     } else {
1518         delta_t = 0;
1519     }
1520     last_time = gstate.now;
1521 
1522     double et_diff = delta_t;
1523     double et_diff_throttle = delta_t * gstate.global_prefs.cpu_usage_limit/100;
1524 
1525     for (i=0; i<active_tasks.size(); i++) {
1526         atp = active_tasks[i];
1527         if (!atp->process_exists()) continue;
1528         old_time = atp->checkpoint_cpu_time;
1529         if (atp->scheduler_state == CPU_SCHED_SCHEDULED && !gstate.tasks_suspended) {
1530             double x = atp->result->dont_throttle()?et_diff:et_diff_throttle;
1531             atp->elapsed_time += x;
1532             atp->wup->project->elapsed_time += x;
1533         }
1534         if (atp->get_app_status_msg()) {
1535             if (old_time != atp->checkpoint_cpu_time) {
1536                 char buf[256];
1537                 sprintf(buf, "%s checkpointed", atp->result->name);
1538                 if (atp->overdue_checkpoint) {
1539                     gstate.request_schedule_cpus(buf);
1540                 }
1541                 atp->checkpoint_wall_time = gstate.now;
1542                 atp->premature_exit_count = 0;
1543                 atp->checkpoint_elapsed_time = atp->elapsed_time;
1544                 atp->checkpoint_fraction_done = atp->fraction_done;
1545                 atp->checkpoint_fraction_done_elapsed_time = atp->fraction_done_elapsed_time;
1546                 if (log_flags.checkpoint_debug) {
1547                     msg_printf(atp->wup->project, MSG_INFO,
1548                         "[checkpoint] result %s checkpointed",
1549                         atp->result->name
1550                     );
1551                 } else if (log_flags.task_debug) {
1552                     msg_printf(atp->wup->project, MSG_INFO,
1553                         "[task] result %s checkpointed",
1554                         atp->result->name
1555                     );
1556                 }
1557                 atp->write_task_state_file();
1558             }
1559         }
1560         atp->get_trickle_up_msg();
1561         atp->get_graphics_msg();
1562     }
1563 }
1564 
1565 // The job just checkpointed.
1566 // Write some state items to a file in the slot dir
1567 // (this avoids rewriting the state file on each checkpoint)
1568 //
1569 void ACTIVE_TASK::write_task_state_file() {
1570     char path[MAXPATHLEN];
1571     sprintf(path, "%s/%s", slot_dir, TASK_STATE_FILENAME);
1572     FILE* f = boinc_fopen(path, "w");
1573     if (!f) return;
1574     fprintf(f,
1575         "<active_task>\n"
1576         "    <project_master_url>%s</project_master_url>\n"
1577         "    <result_name>%s</result_name>\n"
1578         "    <checkpoint_cpu_time>%f</checkpoint_cpu_time>\n"
1579         "    <checkpoint_elapsed_time>%f</checkpoint_elapsed_time>\n"
1580         "    <fraction_done>%f</fraction_done>\n"
1581         "    <peak_working_set_size>%.0f</peak_working_set_size>\n"
1582         "    <peak_swap_size>%.0f</peak_swap_size>\n"
1583         "    <peak_disk_usage>%.0f</peak_disk_usage>\n"
1584         "</active_task>\n",
1585         result->project->master_url,
1586         result->name,
1587         checkpoint_cpu_time,
1588         checkpoint_elapsed_time,
1589         checkpoint_fraction_done,
1590         peak_working_set_size,
1591         peak_swap_size,
1592         peak_disk_usage
1593     );
1594     fclose(f);
1595 }
1596 
1597 // called on startup; read the task state file in case it's more recent
1598 // than the main state file
1599 //
1600 void ACTIVE_TASK::read_task_state_file() {
1601     char buf[4096], path[MAXPATHLEN], s[1024];
1602     sprintf(path, "%s/%s", slot_dir, TASK_STATE_FILENAME);
1603     FILE* f = boinc_fopen(path, "r");
1604     if (!f) return;
1605     buf[0] = 0;
1606     (void) fread(buf, 1, 4096, f);
1607     fclose(f);
1608     buf[4095] = 0;
1609     double x;
1610     // TODO: use XML parser
1611 
1612     // sanity checks - project and result name must match
1613     //
1614     if (!parse_str(buf, "<project_master_url>", s, sizeof(s))) {
1615         msg_printf(wup->project, MSG_INTERNAL_ERROR,
1616             "no project URL in task state file"
1617         );
1618         return;
1619     }
1620     if (strcmp(s, result->project->master_url)) {
1621         msg_printf(wup->project, MSG_INTERNAL_ERROR,
1622             "wrong project URL in task state file"
1623         );
1624         return;
1625     }
1626     if (!parse_str(buf, "<result_name>", s, sizeof(s))) {
1627         msg_printf(wup->project, MSG_INTERNAL_ERROR,
1628             "no task name in task state file"
1629         );
1630         return;
1631     }
1632     if (strcmp(s, result->name)) {
1633         msg_printf(wup->project, MSG_INTERNAL_ERROR,
1634             "wrong task name in task state file"
1635         );
1636         return;
1637     }
1638     if (parse_double(buf, "<checkpoint_cpu_time>", x)) {
1639         if (x > checkpoint_cpu_time) {
1640             checkpoint_cpu_time = x;
1641         }
1642     }
1643     if (parse_double(buf, "<checkpoint_elapsed_time>", x)) {
1644         if (x > checkpoint_elapsed_time) {
1645             checkpoint_elapsed_time = x;
1646         }
1647     }
1648     if (parse_double(buf, "<peak_working_set_size>", x)) {
1649         peak_working_set_size = x;
1650     }
1651     if (parse_double(buf, "<peak_swap_size>", x)) {
1652         peak_swap_size = x;
1653     }
1654     if (parse_double(buf, "<peak_disk_usage>", x)) {
1655         peak_disk_usage = x;
1656     }
1657 }
1658