#! /usr/bin/env python
# encoding: utf-8
# Thomas Nagy, 2017-2018 (ita)

"""
A system for fast partial rebuilds

Creating a large amount of task objects up front can take some time.
By making a few assumptions, it is possible to avoid posting creating
task objects for targets that are already up-to-date.

On a silly benchmark the gain observed for 1M tasks can be 5m->10s
for a single file change.

Usage::

	def options(opt):
		opt.load('fast_partial')

Assumptions:
* Start with a clean build (run "waf distclean" after enabling)
* Mostly for C/C++/Fortran targets with link tasks (object-only targets are not handled)
  try it in the folder generated by utils/genbench.py
* For full project builds: no --targets and no pruning from subfolders
* The installation phase is ignored
* `use=` dependencies are specified up front even across build groups
* Task generator source files are not obtained from globs

Implementation details:
* The first layer obtains file timestamps to recalculate file hashes only
  when necessary (similar to md5_tstamp); the timestamps are then stored
  in a dedicated pickle file
* A second layer associates each task generator to a file set to help
  detecting changes. Task generators are to create their tasks only when
  the related files have been modified. A specific db file is created
  to store such data (5m -> 1m10)
* A third layer binds build context proxies onto task generators, replacing
  the default context. While loading data for the full build uses more memory
  (4GB -> 9GB), partial builds are then much faster (1m10 -> 13s)
* A fourth layer enables a 2-level cache on file signatures to
  reduce the size of the main pickle file (13s -> 10s)
"""

import os
from waflib import Build, Context, Errors, Logs, Task, TaskGen, Utils
from waflib.TaskGen import feature, after_method, taskgen_method
import waflib.Node

# Task generator staleness states
DONE = 0    # nothing changed, no tasks to create
DIRTY = 1   # inputs changed, the task generator must post its tasks
NEEDED = 2  # pulled in through 'use' propagation only

SKIPPABLE = ['cshlib', 'cxxshlib', 'cstlib', 'cxxstlib', 'cprogram', 'cxxprogram']

TSTAMP_DB = '.wafpickle_tstamp_db_file'

SAVED_ATTRS = 'root node_sigs task_sigs imp_sigs raw_deps node_deps'.split()

class bld_proxy(object):
	"""
	A build-context proxy bound to one task generator. Each proxy owns a
	dedicated Node class and pickle file so that partial builds only
	deserialize the data of the task generators actually posted.
	Unknown attribute reads fall through to the real build context;
	attribute writes and deletions are forwarded to it.
	"""
	def __init__(self, bld):
		# bypass our own __setattr__ (which forwards to the real context)
		object.__setattr__(self, 'bld', bld)

		# a per-proxy Node subclass so that pickled nodes do not mix contexts
		object.__setattr__(self, 'node_class', type('Nod3', (waflib.Node.Node,), {}))
		self.node_class.__module__ = 'waflib.Node'
		self.node_class.ctx = self

		object.__setattr__(self, 'root', self.node_class('', None))
		for x in SAVED_ATTRS:
			if x != 'root':
				object.__setattr__(self, x, {})

		self.fix_nodes()

	def __setattr__(self, name, value):
		bld = object.__getattribute__(self, 'bld')
		setattr(bld, name, value)

	def __delattr__(self, name):
		bld = object.__getattribute__(self, 'bld')
		delattr(bld, name)

	def __getattribute__(self, name):
		# proxy-local attributes take precedence over the real context
		try:
			return object.__getattribute__(self, name)
		except AttributeError:
			bld = object.__getattribute__(self, 'bld')
			return getattr(bld, name)

	def __call__(self, *k, **kw):
		return self.bld(*k, **kw)

	def fix_nodes(self):
		# rebind srcnode/path/bldnode to nodes of this proxy's node class
		for x in ('srcnode', 'path', 'bldnode'):
			node = self.root.find_dir(getattr(self.bld, x).abspath())
			object.__setattr__(self, x, node)

	def set_key(self, store_key):
		# suffix identifying the pickle file associated with this proxy
		object.__setattr__(self, 'store_key', store_key)

	def fix_tg_path(self, *tgs):
		# changing Node objects on task generators is possible
		# yet, all Node objects must belong to the same parent
		for tg in tgs:
			tg.path = self.root.make_node(tg.path.abspath())

	def restore(self):
		"""Load the cached build data for this proxy, if present"""
		dbfn = os.path.join(self.variant_dir, Context.DBFILE + self.store_key)
		Logs.debug('rev_use: reading %s', dbfn)
		try:
			data = Utils.readf(dbfn, 'rb')
		except (EnvironmentError, EOFError):
			# handle missing file/empty file
			Logs.debug('rev_use: Could not load the build cache %s (missing)', dbfn)
		else:
			# same lock idiom as store() below; Nod3 must point to this
			# proxy's node class while unpickling
			with waflib.Node.pickle_lock:
				waflib.Node.Nod3 = self.node_class
				try:
					data = Build.cPickle.loads(data)
				except Exception as e:
					Logs.debug('rev_use: Could not pickle the build cache %s: %r', dbfn, e)
				else:
					for x in SAVED_ATTRS:
						object.__setattr__(self, x, data.get(x, {}))
		self.fix_nodes()

	def store(self):
		"""Pickle this proxy's build data atomically (tmp file + rename)"""
		data = {}
		for x in Build.SAVED_ATTRS:
			data[x] = getattr(self, x)
		db = os.path.join(self.variant_dir, Context.DBFILE + self.store_key)

		with waflib.Node.pickle_lock:
			waflib.Node.Nod3 = self.node_class
			try:
				x = Build.cPickle.dumps(data, Build.PROTOCOL)
			except Build.cPickle.PicklingError:
				root = data['root']
				for node_deps in data['node_deps'].values():
					for idx, node in enumerate(node_deps):
						# there may be more cross-context Node objects to fix,
						# but this should be the main source
						node_deps[idx] = root.find_node(node.abspath())
				x = Build.cPickle.dumps(data, Build.PROTOCOL)

		Logs.debug('rev_use: storing %s', db)
		Utils.writef(db + '.tmp', x, m='wb')
		try:
			# preserve ownership of the previous db file where possible
			st = os.stat(db)
			os.remove(db)
			if not Utils.is_win32:
				os.chown(db + '.tmp', st.st_uid, st.st_gid)
		except (AttributeError, OSError):
			pass
		os.rename(db + '.tmp', db)

class bld(Build.BuildContext):
	def __init__(self, **kw):
		super(bld, self).__init__(**kw)
		# global (non-persistent) tstamp/hash cache, see h_file below
		self.hashes_md5_tstamp = {}

	def __call__(self, *k, **kw):
		"""Create a task generator bound to its own bld_proxy"""
		# this is one way of doing it, one could use a task generator method too
		bld = kw['bld'] = bld_proxy(self)
		ret = TaskGen.task_gen(*k, **kw)
		self.task_gen_cache_names = {}
		self.add_to_group(ret, group=kw.get('group'))
		ret.bld = bld
		bld.set_key(ret.path.abspath().replace(os.sep, '') + str(ret.idx))
		return ret

	def is_dirty(self):
		return True

	def store_tstamps(self):
		# Called after a build is finished
		# For each task generator, record all files involved in task objects
		# optimization: done only if there was something built
		do_store = False
		try:
			f_deps = self.f_deps
		except AttributeError:
			f_deps = self.f_deps = {}
			self.f_tstamps = {}

		allfiles = set()
		for g in self.groups:
			for tg in g:
				try:
					staleness = tg.staleness
				except AttributeError:
					staleness = DIRTY

				if staleness != DIRTY:
					# DONE case: there was nothing built
					# NEEDED case: the tg was brought in because of 'use' propagation
					# but nothing really changed for them, there may be incomplete
					# tasks (object files) and in this case it is best to let the next build
					# figure out if an input/output file changed
					continue

				do_cache = False
				for tsk in tg.tasks:
					if tsk.hasrun == Task.SUCCESS:
						do_cache = True
					elif tsk.hasrun == Task.SKIPPED:
						pass
					else:
						# one failed task, clear the cache for this tg
						try:
							del f_deps[(tg.path.abspath(), tg.idx)]
						except KeyError:
							pass
						else:
							# just store the new state because there is a change
							do_store = True

						# skip the rest because there is no valid cache possible
						break
				else:
					if not do_cache:
						# all skipped, but is there anything in cache?
						try:
							f_deps[(tg.path.abspath(), tg.idx)]
						except KeyError:
							# probably cleared because a wscript file changed
							# store it
							do_cache = True

					if do_cache:
						# there was a rebuild, store the data structure too
						tg.bld.store()

						# all tasks skipped but no cache
						# or a successful task build
						do_store = True
						st = set()
						for tsk in tg.tasks:
							st.update(tsk.inputs)
							st.update(self.node_deps.get(tsk.uid(), []))

						# TODO do last/when loading the tgs?
						lst = []
						for k in ('wscript', 'wscript_build'):
							n = tg.path.find_node(k)
							if n:
								n.get_bld_sig()
								lst.append(n.abspath())

						lst.extend(sorted(x.abspath() for x in st))
						allfiles.update(lst)
						f_deps[(tg.path.abspath(), tg.idx)] = lst

		for x in allfiles:
			# f_tstamps has everything, while md5_tstamp can be relatively empty on partial builds
			self.f_tstamps[x] = self.hashes_md5_tstamp[x][0]

		if do_store:
			dbfn = os.path.join(self.variant_dir, TSTAMP_DB)
			Logs.debug('rev_use: storing %s', dbfn)
			dbfn_tmp = dbfn + '.tmp'
			x = Build.cPickle.dumps([self.f_tstamps, f_deps], Build.PROTOCOL)
			Utils.writef(dbfn_tmp, x, m='wb')
			os.rename(dbfn_tmp, dbfn)
			Logs.debug('rev_use: stored %s', dbfn)

	def store(self):
		self.store_tstamps()
		if self.producer.dirty:
			Build.BuildContext.store(self)

	def compute_needed_tgs(self):
		"""
		Mark DIRTY the task generators whose files changed (and everything
		downstream of them through 'use'), then mark NEEDED their ancestors,
		and restore the proxy build data of the whole set.
		"""
		# assume the 'use' keys are not modified during the build phase

		dbfn = os.path.join(self.variant_dir, TSTAMP_DB)
		Logs.debug('rev_use: Loading %s', dbfn)
		try:
			data = Utils.readf(dbfn, 'rb')
		except (EnvironmentError, EOFError):
			Logs.debug('rev_use: Could not load the build cache %s (missing)', dbfn)
			self.f_deps = {}
			self.f_tstamps = {}
		else:
			try:
				self.f_tstamps, self.f_deps = Build.cPickle.loads(data)
			except Exception as e:
				Logs.debug('rev_use: Could not pickle the build cache %s: %r', dbfn, e)
				self.f_deps = {}
				self.f_tstamps = {}
			else:
				Logs.debug('rev_use: Loaded %s', dbfn)

		# 1. obtain task generators that contain rebuilds
		# 2. obtain the 'use' graph and its dual
		stales = set()
		reverse_use_map = Utils.defaultdict(list)
		use_map = Utils.defaultdict(list)

		for g in self.groups:
			for tg in g:
				if tg.is_stale():
					stales.add(tg)

				try:
					lst = tg.use = Utils.to_list(tg.use)
				except AttributeError:
					pass
				else:
					for x in lst:
						try:
							xtg = self.get_tgen_by_name(x)
						except Errors.WafError:
							pass
						else:
							use_map[tg].append(xtg)
							reverse_use_map[xtg].append(tg)

		Logs.debug('rev_use: found %r stale tgs', len(stales))

		# 3. dfs to post downstream tg as stale
		visited = set()
		def mark_down(tg):
			if tg in visited:
				return
			visited.add(tg)
			Logs.debug('rev_use: marking down %r as stale', tg.name)
			tg.staleness = DIRTY
			for x in reverse_use_map[tg]:
				mark_down(x)
		for tg in stales:
			mark_down(tg)

		# 4. dfs to find ancestors tg to mark as needed
		self.needed_tgs = needed_tgs = set()
		def mark_needed(tg):
			if tg in needed_tgs:
				return
			needed_tgs.add(tg)
			if tg.staleness == DONE:
				Logs.debug('rev_use: marking up %r as needed', tg.name)
				tg.staleness = NEEDED
			for x in use_map[tg]:
				mark_needed(x)
		for xx in visited:
			mark_needed(xx)

		# so we have the whole tg trees to post in the set "needed"
		# load their build trees
		for tg in needed_tgs:
			tg.bld.restore()
			tg.bld.fix_tg_path(tg)

		# the stale ones should be fully build, while the needed ones
		# may skip a few tasks, see create_compiled_task and apply_link_after below
		Logs.debug('rev_use: amount of needed task gens: %r', len(needed_tgs))

	def post_group(self):
		# assumption: we can ignore the folder/subfolders cuts
		def tgpost(tg):
			try:
				f = tg.post
			except AttributeError:
				pass
			else:
				f()

		if not self.targets or self.targets == '*':
			for tg in self.groups[self.current_group]:
				# this can cut quite a lot of tg objects
				if tg in self.needed_tgs:
					tgpost(tg)
		else:
			# default implementation
			# (fix: the unbound method must receive the instance)
			return Build.BuildContext.post_group(self)

	def get_build_iterator(self):
		if not self.targets or self.targets == '*':
			self.compute_needed_tgs()
		return Build.BuildContext.get_build_iterator(self)

@taskgen_method
def is_stale(self):
	"""Return True when this task generator must post its tasks"""
	# assume no globs
	self.staleness = DIRTY

	# 1. the case of always stale targets
	if getattr(self, 'always_stale', False):
		return True

	# 2. check if the db file exists
	db = os.path.join(self.bld.variant_dir, Context.DBFILE)
	try:
		dbstat = os.stat(db).st_mtime
	except OSError:
		# fix: the %r placeholder was missing its argument
		Logs.debug('rev_use: must post %r because this is a clean build', self.name)
		return True

	# 3.a check if the configuration exists
	cache_node = self.bld.bldnode.find_node('c4che/build.config.py')
	if not cache_node:
		return True

	# 3.b check if the configuration changed
	if os.stat(cache_node.abspath()).st_mtime > dbstat:
		Logs.debug('rev_use: must post %r because the configuration has changed', self.name)
		return True

	# 3.c any tstamp data?
	try:
		f_deps = self.bld.f_deps
	except AttributeError:
		Logs.debug('rev_use: must post %r because there is no f_deps', self.name)
		return True

	# 4. check if this is the first build (no cache)
	try:
		lst = f_deps[(self.path.abspath(), self.idx)]
	except KeyError:
		Logs.debug('rev_use: must post %r because it has no cached data', self.name)
		return True

	try:
		cache = self.bld.cache_tstamp_rev_use
	except AttributeError:
		cache = self.bld.cache_tstamp_rev_use = {}

	# 5. check that the timestamp of each dependency file listed is unchanged
	f_tstamps = self.bld.f_tstamps
	for x in lst:
		try:
			old_ts = f_tstamps[x]
		except KeyError:
			Logs.debug('rev_use: must post %r because %r is not in cache', self.name, x)
			return True

		try:
			try:
				ts = cache[x]
			except KeyError:
				ts = cache[x] = os.stat(x).st_mtime
		except OSError:
			del f_deps[(self.path.abspath(), self.idx)]
			Logs.debug('rev_use: must post %r because %r does not exist anymore', self.name, x)
			return True
		else:
			if ts != old_ts:
				Logs.debug('rev_use: must post %r because the timestamp on %r changed %r %r', self.name, x, old_ts, ts)
				return True

	self.staleness = DONE
	return False

@taskgen_method
def create_compiled_task(self, name, node):
	# skip the creation of object files
	# assumption: object-only targets are not skippable
	if self.staleness == NEEDED:
		# only libraries/programs can skip object files
		for x in SKIPPABLE:
			if x in self.features:
				return None

	out = '%s.%d.o' % (node.name, self.idx)
	task = self.create_task(name, node, node.parent.find_or_declare(out))
	try:
		self.compiled_tasks.append(task)
	except AttributeError:
		self.compiled_tasks = [task]
	return task

@feature(*SKIPPABLE)
@after_method('apply_link')
def apply_link_after(self):
	# cprogram/cxxprogram might be unnecessary
	if self.staleness != NEEDED:
		return
	for tsk in self.tasks:
		tsk.hasrun = Task.SKIPPED

def path_from(self, node):
	# handle nodes of distinct types (proxy vs real context)
	if node.ctx is not self.ctx:
		node = self.ctx.root.make_node(node.abspath())
	return self.default_path_from(node)
waflib.Node.Node.default_path_from = waflib.Node.Node.path_from
waflib.Node.Node.path_from = path_from

def h_file(self):
	# similar to md5_tstamp.py, but with 2-layer cache
	# global_cache for the build context common for all task generators
	# local_cache for the build context proxy (one by task generator)
	#
	# the global cache is not persistent
	# the local cache is persistent and meant for partial builds
	#
	# assume all calls are made from a single thread
	#
	filename = self.abspath()
	st = os.stat(filename)

	global_cache = self.ctx.bld.hashes_md5_tstamp
	local_cache = self.ctx.hashes_md5_tstamp

	if filename in global_cache:
		# value already calculated in this build
		cval = global_cache[filename]

		# the value in global cache is assumed to be calculated once
		# reverifying it could cause task generators
		# to get distinct tstamp values, thus missing rebuilds
		local_cache[filename] = cval
		return cval[1]

	if filename in local_cache:
		cval = local_cache[filename]
		if cval[0] == st.st_mtime:
			# correct value from a previous build
			# put it in the global cache
			global_cache[filename] = cval
			return cval[1]

	ret = Utils.h_file(filename)
	local_cache[filename] = global_cache[filename] = (st.st_mtime, ret)
	return ret
waflib.Node.Node.h_file = h_file