1 // This file is part of BOINC. 2 // http://boinc.berkeley.edu 3 // Copyright (C) 2008 University of California 4 // 5 // BOINC is free software; you can redistribute it and/or modify it 6 // under the terms of the GNU Lesser General Public License 7 // as published by the Free Software Foundation, 8 // either version 3 of the License, or (at your option) any later version. 9 // 10 // BOINC is distributed in the hope that it will be useful, 11 // but WITHOUT ANY WARRANTY; without even the implied warranty of 12 // MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. 13 // See the GNU Lesser General Public License for more details. 14 // 15 // You should have received a copy of the GNU Lesser General Public License 16 // along with BOINC. If not, see <http://www.gnu.org/licenses/>. 17 18 // Work fetch logic for CPU, GPU, and other processing resources. 19 // See http://boinc.berkeley.edu/trac/wiki/GpuWorkFetch 20 21 #ifndef BOINC_WORK_FETCH_H 22 #define BOINC_WORK_FETCH_H 23 24 #include <vector> 25 #include <deque> 26 27 #define RSC_TYPE_ANY -1 28 #define RSC_TYPE_CPU 0 29 30 // reasons for not fetching work from a project 31 // 32 #define CANT_FETCH_WORK_NON_CPU_INTENSIVE 1 33 #define CANT_FETCH_WORK_SUSPENDED_VIA_GUI 2 34 #define CANT_FETCH_WORK_MASTER_URL_FETCH_PENDING 3 35 #define CANT_FETCH_WORK_MIN_RPC_TIME 4 36 #define CANT_FETCH_WORK_DONT_REQUEST_MORE_WORK 5 37 #define CANT_FETCH_WORK_DOWNLOAD_STALLED 6 38 #define CANT_FETCH_WORK_RESULT_SUSPENDED 7 39 #define CANT_FETCH_WORK_TOO_MANY_UPLOADS 8 40 #define CANT_FETCH_WORK_NOT_HIGHEST_PRIORITY 9 41 #define CANT_FETCH_WORK_DONT_NEED 10 42 #define CANT_FETCH_WORK_TOO_MANY_RUNNABLE 11 43 44 // in case of DONT_NEED, per-resource reason 45 // 46 #define DONT_FETCH_GPUS_NOT_USABLE 1 47 #define DONT_FETCH_PREFS 2 48 #define DONT_FETCH_CONFIG 3 49 #define DONT_FETCH_NO_APPS 4 50 #define DONT_FETCH_AMS 5 51 #define DONT_FETCH_ZERO_SHARE 7 52 #define DONT_FETCH_BUFFER_FULL 8 53 #define DONT_FETCH_NOT_HIGHEST_PRIO 9 54 #define DONT_FETCH_BACKED_OFF 10 55 #define DONT_FETCH_DEFER_SCHED 11 56 57 struct PROJECT; 58 struct RESULT; 59 struct ACTIVE_TASK; 60 struct RSC_WORK_FETCH; 61 struct SCHEDULER_REPLY; 62 struct APP_VERSION; 63 64 typedef long long COPROC_INSTANCE_BITMAP; 65 // should be at least MAX_COPROC_INSTANCES (64) bits 66 67 // state per (resource, project) pair 68 // 69 struct RSC_PROJECT_WORK_FETCH { 70 // the following are persistent (saved in state file) 71 double backoff_time; 72 double backoff_interval; 73 74 // the following used by REC accounting 75 double secs_this_rec_interval; 76 77 double queue_est; 78 // an estimate of instance-secs of queued work; 79 bool anonymous_platform_no_apps; 80 // set if this project is anonymous platform 81 // and it has no app version that uses this resource 82 double fetchable_share; 83 // this project's share relative to projects from which 84 // we could probably get work for this resource; 85 // determines how many instances this project deserves 86 int n_runnable_jobs; 87 double sim_nused; 88 // # of instances used at this point in the simulation 89 double nused_total; // sum of instances over all runnable jobs 90 int ncoprocs_excluded; 91 // number of excluded instances 92 COPROC_INSTANCE_BITMAP non_excluded_instances; 93 // bitmap of non-excluded instances 94 // (i.e. instances this project's jobs can run on) 95 int deadlines_missed; 96 int deadlines_missed_copy; 97 // copy of the above used during schedule_cpus() 98 std::deque<RESULT*> pending; 99 std::deque<RESULT*>::iterator pending_iter; 100 bool has_deferred_job; 101 // This project has a coproc job of the given type for which 102 // the job is deferred because of a temporary_exit() call. 103 // Don't fetch more jobs of this type; they might have same problem 104 int rsc_project_reason; 105 // If zero, it's OK to ask this project for this type of work. 106 // If nonzero, the reason why it's not OK 107 RSC_PROJECT_WORK_FETCHRSC_PROJECT_WORK_FETCH108 RSC_PROJECT_WORK_FETCH() { 109 backoff_time = 0; 110 backoff_interval = 0; 111 secs_this_rec_interval = 0; 112 queue_est = 0; 113 anonymous_platform_no_apps = false; 114 fetchable_share = 0; 115 n_runnable_jobs = 0; 116 sim_nused = 0; 117 nused_total = 0; 118 ncoprocs_excluded = 0; 119 non_excluded_instances = 0; 120 deadlines_missed = 0; 121 deadlines_missed_copy = 0; 122 pending.clear(); 123 has_deferred_job = false; 124 rsc_project_reason = 0; 125 } 126 resetRSC_PROJECT_WORK_FETCH127 inline void reset() { 128 backoff_time = 0; 129 backoff_interval = 0; 130 } 131 reset_rec_accountingRSC_PROJECT_WORK_FETCH132 inline void reset_rec_accounting() { 133 secs_this_rec_interval = 0; 134 } 135 int compute_rsc_project_reason(PROJECT*, int rsc_type); 136 void resource_backoff(PROJECT*, const char*); 137 void rr_init(); clear_backoffRSC_PROJECT_WORK_FETCH138 void clear_backoff() { 139 backoff_time = 0; 140 backoff_interval = 0; 141 } 142 }; 143 144 // estimate the time a resource will be saturated 145 // with high-priority jobs. 146 // 147 struct BUSY_TIME_ESTIMATOR { 148 std::vector<double> busy_time; 149 int ninstances; resetBUSY_TIME_ESTIMATOR150 inline void reset() { 151 for (int i=0; i<ninstances; i++) { 152 busy_time[i] = 0; 153 } 154 } initBUSY_TIME_ESTIMATOR155 inline void init(int n) { 156 ninstances = n; 157 busy_time.resize(n); 158 reset(); 159 } 160 // called for each high-priority job. 161 // Find the least-busy instance, and put this job 162 // on that and following instances 163 // updateBUSY_TIME_ESTIMATOR164 inline void update(double dur, double nused) { 165 if (ninstances==0) return; 166 int i, j; 167 if (nused < 1) return; 168 double best = 0; 169 int ibest = 0; 170 for (i=0; i<ninstances; i++) { 171 if (!i || busy_time[i] < best) { 172 best = busy_time[i]; 173 ibest = i; 174 } 175 } 176 int inused = (int) nused; // ignore fractional usage 177 for (i=0; i<inused; i++) { 178 j = (ibest + i) % ninstances; 179 busy_time[j] += dur; 180 } 181 } 182 183 // the overall busy time is the busy time of 184 // the least busy instance 185 // get_busy_timeBUSY_TIME_ESTIMATOR186 inline double get_busy_time() { 187 double best = 0; 188 for (int i=0; i<ninstances; i++) { 189 if (!i || busy_time[i] < best) { 190 best = busy_time[i]; 191 } 192 } 193 return best; 194 } 195 }; 196 197 // per-resource state 198 // 199 struct RSC_WORK_FETCH { 200 int rsc_type; 201 int ninstances; 202 double relative_speed; // total FLOPS relative to CPU total FLOPS 203 bool has_exclusions; 204 205 // the following used/set by rr_simulation(): 206 // 207 double shortfall; 208 // seconds of idle instances between now and now+work_buf_total() 209 double nidle_now; 210 double sim_nused; 211 COPROC_INSTANCE_BITMAP sim_used_instances; 212 // bitmap of instances used in simulation, 213 // taking into account GPU exclusions 214 COPROC_INSTANCE_BITMAP sim_excluded_instances; 215 // bitmap of instances not used (i.e. starved because of exclusion) 216 double total_fetchable_share; 217 // total RS of projects from which we could fetch jobs for this device 218 double saturated_time; 219 // estimated time until resource is not saturated 220 // used to calculate work request 221 double deadline_missed_instances; 222 // instance count for jobs that miss deadline 223 BUSY_TIME_ESTIMATOR busy_time_estimator; 224 int dont_fetch_reason; 225 #ifdef SIM 226 double estimated_delay; 227 #endif 228 // the following specify the work request for this resource 229 // 230 double req_secs; 231 double req_instances; 232 // REC accounting 233 double secs_this_rec_interval; 234 // temp in choose_project() 235 PROJECT* found_project; // a project able to ask for this work 236 initRSC_WORK_FETCH237 void init(int t, int n, double sp) { 238 rsc_type = t; 239 ninstances = n; 240 relative_speed = sp; 241 busy_time_estimator.init(n); 242 } 243 void rr_init(); 244 void update_stats(double sim_now, double dt, double buf_end); 245 void update_busy_time(double dur, double nused); 246 void supplement(PROJECT*); 247 RSC_PROJECT_WORK_FETCH& project_state(PROJECT*); 248 void print_state(const char*); 249 void clear_request(); 250 void set_request(PROJECT*); 251 void copy_request(COPROC&); 252 void set_request_excluded(PROJECT*); 253 bool may_have_work(PROJECT*); 254 int cant_fetch(PROJECT*); 255 bool backed_off(PROJECT*); 256 bool uses_starved_excluded_instances(PROJECT*); reset_rec_accountingRSC_WORK_FETCH257 inline void reset_rec_accounting() { 258 this->secs_this_rec_interval = 0; 259 } RSC_WORK_FETCHRSC_WORK_FETCH260 RSC_WORK_FETCH() { 261 rsc_type = 0; 262 ninstances = 0; 263 relative_speed = 0; 264 has_exclusions = false; 265 shortfall = 0; 266 nidle_now = 0; 267 sim_nused = 0; 268 sim_used_instances = 0; 269 sim_excluded_instances = 0; 270 total_fetchable_share = 0; 271 saturated_time = 0; 272 deadline_missed_instances = 0; 273 busy_time_estimator.init(0); 274 dont_fetch_reason = 0; 275 #ifdef SIM 276 estimated_delay = 0.0; 277 #endif 278 req_secs = 0.0; 279 req_instances = 0.0; 280 secs_this_rec_interval = 0.0; 281 found_project = NULL; 282 } 283 }; 284 285 286 // per project state 287 // 288 struct PROJECT_WORK_FETCH { 289 double rec; 290 // recent estimated credit 291 double rec_time; 292 // when it was last updated 293 double rec_temp; 294 // temporary copy used during schedule_cpus() and work fetch 295 double rec_temp_save; 296 // temporary used during RR simulation 297 int project_reason; 298 int compute_project_reason(PROJECT*); 299 int n_runnable_jobs; 300 bool request_if_idle_and_uploading; 301 // Set when a job finishes. 302 // If we're uploading but a resource is idle, make a work request. 303 // If this succeeds, clear the flag. PROJECT_WORK_FETCHPROJECT_WORK_FETCH304 PROJECT_WORK_FETCH() { 305 memset(this, 0, sizeof(*this)); 306 } 307 void reset(PROJECT*); 308 void rr_init(PROJECT*); 309 void print_state(PROJECT*); 310 }; 311 312 // global work fetch state 313 // 314 struct WORK_FETCH { 315 std::vector<PROJECT*> projects_sorted; 316 // projects in decreasing priority order 317 void setup(); 318 PROJECT* choose_project(); 319 // Find a project to ask for work. 320 PROJECT* non_cpu_intensive_project_needing_work(); 321 void piggyback_work_request(PROJECT*); 322 // we're going to contact this project anyway; 323 // piggyback a work request if appropriate. 324 void accumulate_inst_sec(ACTIVE_TASK*, double dt); 325 void write_request(FILE*, PROJECT*); 326 void handle_reply( 327 PROJECT*, SCHEDULER_REPLY*, std::vector<RESULT*>new_results 328 ); 329 void set_initial_work_request(PROJECT*); 330 void set_all_requests(PROJECT*); 331 void set_all_requests_hyst(PROJECT*, int rsc_type); 332 void print_state(); 333 void init(); 334 void rr_init(); 335 void clear_request(); 336 void compute_shares(); 337 void clear_backoffs(APP_VERSION&); 338 void request_string(char*, int); 339 bool requested_work(); 340 void copy_requests(); 341 }; 342 343 extern RSC_WORK_FETCH rsc_work_fetch[MAX_RSC]; 344 extern WORK_FETCH work_fetch; 345 346 extern void project_priority_init(bool for_work_fetch); 347 extern double project_priority(PROJECT*); 348 extern void adjust_rec_sched(RESULT*); 349 extern void adjust_rec_work_fetch(RESULT*); 350 351 extern double total_peak_flops(); 352 extern const char* project_reason_string(PROJECT* p, char* buf, int len); 353 extern const char* rsc_project_reason_string(int); 354 355 #endif 356