1 // This file is part of BOINC.
2 // http://boinc.berkeley.edu
3 // Copyright (C) 2008 University of California
4 //
5 // BOINC is free software; you can redistribute it and/or modify it
6 // under the terms of the GNU Lesser General Public License
7 // as published by the Free Software Foundation,
8 // either version 3 of the License, or (at your option) any later version.
9 //
10 // BOINC is distributed in the hope that it will be useful,
11 // but WITHOUT ANY WARRANTY; without even the implied warranty of
12 // MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.
13 // See the GNU Lesser General Public License for more details.
14 //
15 // You should have received a copy of the GNU Lesser General Public License
16 // along with BOINC.  If not, see <http://www.gnu.org/licenses/>.
17 
18 // Work fetch logic for CPU, GPU, and other processing resources.
19 // See http://boinc.berkeley.edu/trac/wiki/GpuWorkFetch
20 
21 #ifndef BOINC_WORK_FETCH_H
22 #define BOINC_WORK_FETCH_H
23 
24 #include <vector>
25 #include <deque>
26 
// Resource type codes.
// RSC_TYPE_ANY is negative, so it cannot collide with a real type code
// (presumably it means "no particular resource" -- confirm against callers).
//
#define RSC_TYPE_ANY -1
#define RSC_TYPE_CPU 0
29 
// Why work can't be fetched from a project.
// All codes are nonzero and are numbered consecutively from 1.
//
#define CANT_FETCH_WORK_NON_CPU_INTENSIVE         1
#define CANT_FETCH_WORK_SUSPENDED_VIA_GUI         2
#define CANT_FETCH_WORK_MASTER_URL_FETCH_PENDING  3
#define CANT_FETCH_WORK_MIN_RPC_TIME              4
#define CANT_FETCH_WORK_DONT_REQUEST_MORE_WORK    5
#define CANT_FETCH_WORK_DOWNLOAD_STALLED          6
#define CANT_FETCH_WORK_RESULT_SUSPENDED          7
#define CANT_FETCH_WORK_TOO_MANY_UPLOADS          8
#define CANT_FETCH_WORK_NOT_HIGHEST_PRIORITY      9
#define CANT_FETCH_WORK_DONT_NEED                 10
#define CANT_FETCH_WORK_TOO_MANY_RUNNABLE         11
43 
// Per-resource reason, used when the project-level reason is DONT_NEED.
// NOTE: the value 6 is not assigned in this list; the gap is kept
// as is so that existing codes keep their numbers.
//
#define DONT_FETCH_GPUS_NOT_USABLE    1
#define DONT_FETCH_PREFS              2
#define DONT_FETCH_CONFIG             3
#define DONT_FETCH_NO_APPS            4
#define DONT_FETCH_AMS                5
#define DONT_FETCH_ZERO_SHARE         7
#define DONT_FETCH_BUFFER_FULL        8
#define DONT_FETCH_NOT_HIGHEST_PRIO   9
#define DONT_FETCH_BACKED_OFF         10
#define DONT_FETCH_DEFER_SCHED        11
56 
// Types that this header only refers to by pointer or reference;
// their definitions live elsewhere.
//
struct ACTIVE_TASK;
struct APP_VERSION;
struct PROJECT;
struct RESULT;
struct RSC_WORK_FETCH;
struct SCHEDULER_REPLY;

// One bit per coprocessor instance.
// Must be at least MAX_COPROC_INSTANCES (64) bits wide.
//
typedef long long COPROC_INSTANCE_BITMAP;
66 
67 // state per (resource, project) pair
68 //
69 struct RSC_PROJECT_WORK_FETCH {
70     // the following are persistent (saved in state file)
71     double backoff_time;
72     double backoff_interval;
73 
74     // the following used by REC accounting
75     double secs_this_rec_interval;
76 
77     double queue_est;
78         // an estimate of instance-secs of queued work;
79     bool anonymous_platform_no_apps;
80         // set if this project is anonymous platform
81         // and it has no app version that uses this resource
82     double fetchable_share;
83         // this project's share relative to projects from which
84         // we could probably get work for this resource;
85         // determines how many instances this project deserves
86     int n_runnable_jobs;
87     double sim_nused;
88         // # of instances used at this point in the simulation
89     double nused_total;     // sum of instances over all runnable jobs
90     int ncoprocs_excluded;
91         // number of excluded instances
92     COPROC_INSTANCE_BITMAP non_excluded_instances;
93         // bitmap of non-excluded instances
94         // (i.e. instances this project's jobs can run on)
95     int deadlines_missed;
96     int deadlines_missed_copy;
97         // copy of the above used during schedule_cpus()
98     std::deque<RESULT*> pending;
99     std::deque<RESULT*>::iterator pending_iter;
100     bool has_deferred_job;
101         // This project has a coproc job of the given type for which
102         // the job is deferred because of a temporary_exit() call.
103         // Don't fetch more jobs of this type; they might have same problem
104     int rsc_project_reason;
105         // If zero, it's OK to ask this project for this type of work.
106         // If nonzero, the reason why it's not OK
107 
RSC_PROJECT_WORK_FETCHRSC_PROJECT_WORK_FETCH108     RSC_PROJECT_WORK_FETCH() {
109         backoff_time = 0;
110         backoff_interval = 0;
111         secs_this_rec_interval = 0;
112         queue_est = 0;
113         anonymous_platform_no_apps = false;
114         fetchable_share = 0;
115         n_runnable_jobs = 0;
116         sim_nused = 0;
117         nused_total = 0;
118         ncoprocs_excluded = 0;
119         non_excluded_instances = 0;
120         deadlines_missed = 0;
121         deadlines_missed_copy = 0;
122         pending.clear();
123         has_deferred_job = false;
124         rsc_project_reason = 0;
125     }
126 
resetRSC_PROJECT_WORK_FETCH127     inline void reset() {
128         backoff_time = 0;
129         backoff_interval = 0;
130     }
131 
reset_rec_accountingRSC_PROJECT_WORK_FETCH132     inline void reset_rec_accounting() {
133         secs_this_rec_interval = 0;
134     }
135     int compute_rsc_project_reason(PROJECT*, int rsc_type);
136     void resource_backoff(PROJECT*, const char*);
137     void rr_init();
clear_backoffRSC_PROJECT_WORK_FETCH138     void clear_backoff() {
139         backoff_time = 0;
140         backoff_interval = 0;
141     }
142 };
143 
// Estimates how long a resource will be saturated
// with high-priority jobs.
//
// busy_time[i] is the busy time accumulated so far for instance i.
// ninstances is the number of tracked instances;
// init() keeps it equal to busy_time.size().
//
struct BUSY_TIME_ESTIMATOR {
    std::vector<double> busy_time;
    int ninstances;

    // Start with no instances, so that reset(), update() and
    // get_busy_time() are well defined even if init() was never called.
    //
    BUSY_TIME_ESTIMATOR() : ninstances(0) {}

    // Zero the busy time of every instance.
    //
    void reset() {
        for (int i=0; i<ninstances; i++) {
            busy_time[i] = 0;
        }
    }

    // Track n instances, all with zero busy time.
    // A negative n is treated as zero; passing it on to the vector
    // would turn it into a huge unsigned size.
    //
    void init(int n) {
        ninstances = (n > 0) ? n : 0;
        busy_time.assign(ninstances, 0.0);
    }

    // Called for each high-priority job.
    // Find the least-busy instance, and put this job
    // on that and following instances.
    //
    // dur: time to add to each instance the job occupies
    // nused: number of instances the job uses;
    //      the fractional part is ignored, and jobs using less than
    //      one instance are not counted at all
    //
    void update(double dur, double nused) {
        if (ninstances <= 0) return;
        if (nused < 1) return;

        // first instance with the smallest busy time
        //
        int ibest = 0;
        for (int i=1; i<ninstances; i++) {
            if (busy_time[i] < busy_time[ibest]) {
                ibest = i;
            }
        }

        // charge the job to that instance and the ones after it,
        // wrapping around at the end of the array
        //
        int inused = (int) nused;     // ignore fractional usage
        for (int i=0; i<inused; i++) {
            int j = (ibest + i) % ninstances;
            busy_time[j] += dur;
        }
    }

    // The overall busy time is the busy time of
    // the least busy instance (zero if there are no instances).
    //
    double get_busy_time() const {
        double best = 0;
        for (int i=0; i<ninstances; i++) {
            if (!i || busy_time[i] < best) {
                best = busy_time[i];
            }
        }
        return best;
    }
};
196 
197 // per-resource state
198 //
199 struct RSC_WORK_FETCH {
200     int rsc_type;
201     int ninstances;
202     double relative_speed;   // total FLOPS relative to CPU total FLOPS
203     bool has_exclusions;
204 
205     // the following used/set by rr_simulation():
206     //
207     double shortfall;
208         // seconds of idle instances between now and now+work_buf_total()
209     double nidle_now;
210     double sim_nused;
211     COPROC_INSTANCE_BITMAP sim_used_instances;
212         // bitmap of instances used in simulation,
213         // taking into account GPU exclusions
214     COPROC_INSTANCE_BITMAP sim_excluded_instances;
215         // bitmap of instances not used (i.e. starved because of exclusion)
216     double total_fetchable_share;
217         // total RS of projects from which we could fetch jobs for this device
218     double saturated_time;
219         // estimated time until resource is not saturated
220         // used to calculate work request
221     double deadline_missed_instances;
222         // instance count for jobs that miss deadline
223     BUSY_TIME_ESTIMATOR busy_time_estimator;
224     int dont_fetch_reason;
225 #ifdef SIM
226     double estimated_delay;
227 #endif
228     // the following specify the work request for this resource
229     //
230     double req_secs;
231     double req_instances;
232     // REC accounting
233     double secs_this_rec_interval;
234     // temp in choose_project()
235     PROJECT* found_project;     // a project able to ask for this work
236 
initRSC_WORK_FETCH237     void init(int t, int n, double sp) {
238         rsc_type = t;
239         ninstances = n;
240         relative_speed = sp;
241         busy_time_estimator.init(n);
242     }
243     void rr_init();
244     void update_stats(double sim_now, double dt, double buf_end);
245     void update_busy_time(double dur, double nused);
246     void supplement(PROJECT*);
247     RSC_PROJECT_WORK_FETCH& project_state(PROJECT*);
248     void print_state(const char*);
249     void clear_request();
250     void set_request(PROJECT*);
251     void copy_request(COPROC&);
252     void set_request_excluded(PROJECT*);
253     bool may_have_work(PROJECT*);
254     int cant_fetch(PROJECT*);
255     bool backed_off(PROJECT*);
256     bool uses_starved_excluded_instances(PROJECT*);
reset_rec_accountingRSC_WORK_FETCH257     inline void reset_rec_accounting() {
258         this->secs_this_rec_interval = 0;
259     }
RSC_WORK_FETCHRSC_WORK_FETCH260     RSC_WORK_FETCH() {
261         rsc_type = 0;
262         ninstances = 0;
263         relative_speed = 0;
264         has_exclusions = false;
265         shortfall = 0;
266         nidle_now = 0;
267         sim_nused = 0;
268         sim_used_instances = 0;
269         sim_excluded_instances = 0;
270         total_fetchable_share = 0;
271         saturated_time = 0;
272         deadline_missed_instances = 0;
273         busy_time_estimator.init(0);
274         dont_fetch_reason = 0;
275 #ifdef SIM
276         estimated_delay = 0.0;
277 #endif
278         req_secs = 0.0;
279         req_instances = 0.0;
280         secs_this_rec_interval = 0.0;
281         found_project = NULL;
282     }
283 };
284 
285 
286 // per project state
287 //
288 struct PROJECT_WORK_FETCH {
289     double rec;
290         // recent estimated credit
291     double rec_time;
292         // when it was last updated
293     double rec_temp;
294         // temporary copy used during schedule_cpus() and work fetch
295     double rec_temp_save;
296         // temporary used during RR simulation
297     int project_reason;
298     int compute_project_reason(PROJECT*);
299     int n_runnable_jobs;
300     bool request_if_idle_and_uploading;
301         // Set when a job finishes.
302         // If we're uploading but a resource is idle, make a work request.
303         // If this succeeds, clear the flag.
PROJECT_WORK_FETCHPROJECT_WORK_FETCH304     PROJECT_WORK_FETCH() {
305         memset(this, 0, sizeof(*this));
306     }
307     void reset(PROJECT*);
308     void rr_init(PROJECT*);
309     void print_state(PROJECT*);
310 };
311 
312 // global work fetch state
313 //
314 struct WORK_FETCH {
315     std::vector<PROJECT*> projects_sorted;
316         // projects in decreasing priority order
317     void setup();
318     PROJECT* choose_project();
319         // Find a project to ask for work.
320     PROJECT* non_cpu_intensive_project_needing_work();
321     void piggyback_work_request(PROJECT*);
322         // we're going to contact this project anyway;
323         // piggyback a work request if appropriate.
324     void accumulate_inst_sec(ACTIVE_TASK*, double dt);
325     void write_request(FILE*, PROJECT*);
326     void handle_reply(
327         PROJECT*, SCHEDULER_REPLY*, std::vector<RESULT*>new_results
328     );
329     void set_initial_work_request(PROJECT*);
330     void set_all_requests(PROJECT*);
331     void set_all_requests_hyst(PROJECT*, int rsc_type);
332     void print_state();
333     void init();
334     void rr_init();
335     void clear_request();
336     void compute_shares();
337     void clear_backoffs(APP_VERSION&);
338     void request_string(char*, int);
339     bool requested_work();
340     void copy_requests();
341 };
342 
343 extern RSC_WORK_FETCH rsc_work_fetch[MAX_RSC];
344 extern WORK_FETCH work_fetch;
345 
346 extern void project_priority_init(bool for_work_fetch);
347 extern double project_priority(PROJECT*);
348 extern void adjust_rec_sched(RESULT*);
349 extern void adjust_rec_work_fetch(RESULT*);
350 
351 extern double total_peak_flops();
352 extern const char* project_reason_string(PROJECT* p, char* buf, int len);
353 extern const char* rsc_project_reason_string(int);
354 
355 #endif
356