#! /usr/bin/env python
# encoding: utf-8
# Thomas Nagy, 2017-2018 (ita)

"""
A system for fast partial rebuilds

Creating a large amount of task objects up front can take some time.
By making a few assumptions, it is possible to avoid posting creating
task objects for targets that are already up-to-date.

On a silly benchmark the gain observed for 1M tasks can be 5m->10s
for a single file change.

Usage::

	def options(opt):
		opt.load('fast_partial')

Assumptions:
* Mostly for C/C++/Fortran targets with link tasks (object-only targets are not handled)
* For full project builds: no --targets and no pruning from subfolders
* The installation phase is ignored
* `use=` dependencies are specified up front even across build groups
* Task generator source files are not obtained from globs

Implementation details:
* The first layer obtains file timestamps to recalculate file hashes only
  when necessary (similar to md5_tstamp); the timestamps are then stored
  in a dedicated pickle file
* A second layer associates each task generator to a file set to help
  detecting changes. Task generators are to create their tasks only when
  the related files have been modified. A specific db file is created
  to store such data (5m -> 1m10)
* A third layer binds build context proxies onto task generators, replacing
  the default context. While loading data for the full build uses more memory
  (4GB -> 9GB), partial builds are then much faster (1m10 -> 13s)
* A fourth layer enables a 2-level cache on file signatures to
  reduce the size of the main pickle file (13s -> 10s)
"""

import os
from waflib import Build, Context, Errors, Logs, Task, TaskGen, Utils
from waflib.TaskGen import feature, after_method, taskgen_method
import waflib.Node

# Task generator staleness states
DONE = 0    # nothing to rebuild for this task generator
DIRTY = 1   # inputs changed, tasks must be created and run
NEEDED = 2  # pulled in through 'use' propagation, may skip object files

# Features whose object-file tasks can be skipped when only needed for linking
SKIPPABLE = ['cshlib', 'cxxshlib', 'cstlib', 'cxxstlib', 'cprogram', 'cxxprogram']

# Name of the pickle file holding the {file -> timestamp} map and the tg file sets
TSTAMP_DB = '.wafpickle_tstamp_db_file'

# Attributes serialized per-proxy; mirrors Build.SAVED_ATTRS
SAVED_ATTRS = 'root node_sigs task_sigs imp_sigs raw_deps node_deps'.split()

class bld_proxy(object):
	"""
	A build context proxy bound to a single task generator; it carries its
	own Node class and its own pickled state so that loading/storing build
	data can be done per task generator. Unknown attribute reads/writes are
	forwarded to the real build context.
	"""
	def __init__(self, bld):
		object.__setattr__(self, 'bld', bld)

		# dedicated Node subclass so unpickling binds nodes to this proxy
		object.__setattr__(self, 'node_class', type('Nod3', (waflib.Node.Node,), {}))
		self.node_class.__module__ = 'waflib.Node'
		self.node_class.ctx = self

		object.__setattr__(self, 'root', self.node_class('', None))
		for x in SAVED_ATTRS:
			if x != 'root':
				object.__setattr__(self, x, {})

		self.fix_nodes()

	def __setattr__(self, name, value):
		# attribute writes are forwarded to the real build context
		bld = object.__getattribute__(self, 'bld')
		setattr(bld, name, value)

	def __delattr__(self, name):
		# attribute deletions are forwarded to the real build context
		bld = object.__getattribute__(self, 'bld')
		delattr(bld, name)

	def __getattribute__(self, name):
		# attributes set on the proxy win; everything else comes from the real context
		try:
			return object.__getattribute__(self, name)
		except AttributeError:
			bld = object.__getattribute__(self, 'bld')
			return getattr(bld, name)

	def __call__(self, *k, **kw):
		# creating task generators goes through the real context
		return self.bld(*k, **kw)

	def fix_nodes(self):
		"""Rebind srcnode/path/bldnode into this proxy's own node tree."""
		for x in ('srcnode', 'path', 'bldnode'):
			node = self.root.find_dir(getattr(self.bld, x).abspath())
			object.__setattr__(self, x, node)

	def set_key(self, store_key):
		"""Set the suffix used for this proxy's pickle file name."""
		object.__setattr__(self, 'store_key', store_key)

	def fix_tg_path(self, *tgs):
		# changing Node objects on task generators is possible
		# yet, all Node objects must belong to the same parent
		for tg in tgs:
			tg.path = self.root.make_node(tg.path.abspath())

	def restore(self):
		"""Load this proxy's pickled state (missing/corrupt files are tolerated)."""
		dbfn = os.path.join(self.variant_dir, Context.DBFILE + self.store_key)
		Logs.debug('rev_use: reading %s', dbfn)
		try:
			data = Utils.readf(dbfn, 'rb')
		except (EnvironmentError, EOFError):
			# handle missing file/empty file
			Logs.debug('rev_use: Could not load the build cache %s (missing)', dbfn)
		else:
			try:
				waflib.Node.pickle_lock.acquire()
				# nodes must unpickle into this proxy's dedicated node class
				waflib.Node.Nod3 = self.node_class
				try:
					data = Build.cPickle.loads(data)
				except Exception as e:
					Logs.debug('rev_use: Could not pickle the build cache %s: %r', dbfn, e)
				else:
					for x in SAVED_ATTRS:
						object.__setattr__(self, x, data.get(x, {}))
			finally:
				waflib.Node.pickle_lock.release()
		self.fix_nodes()

	def store(self):
		"""Pickle this proxy's state to its own db file, atomically via a temp file."""
		data = {}
		for x in Build.SAVED_ATTRS:
			data[x] = getattr(self, x)
		db = os.path.join(self.variant_dir, Context.DBFILE + self.store_key)

		try:
			waflib.Node.pickle_lock.acquire()
			waflib.Node.Nod3 = self.node_class
			x = Build.cPickle.dumps(data, Build.PROTOCOL)
		finally:
			waflib.Node.pickle_lock.release()

		Logs.debug('rev_use: storing %s', db)
		Utils.writef(db + '.tmp', x, m='wb')
		try:
			# preserve ownership of the previous db file where possible
			st = os.stat(db)
			os.remove(db)
			if not Utils.is_win32:
				os.chown(db + '.tmp', st.st_uid, st.st_gid)
		except (AttributeError, OSError):
			pass
		os.rename(db + '.tmp', db)

class bld(Build.BuildContext):
	"""
	Build context that binds a :py:class:`bld_proxy` to each task generator
	and prunes up-to-date task generators from the build.
	"""
	def __init__(self, **kw):
		super(bld, self).__init__(**kw)
		# non-persistent {filename: (mtime, hash)} cache shared by all task generators
		self.hashes_md5_tstamp = {}

	def __call__(self, *k, **kw):
		# this is one way of doing it, one could use a task generator method too
		bld = kw['bld'] = bld_proxy(self)
		ret = TaskGen.task_gen(*k, **kw)
		self.task_gen_cache_names = {}
		self.add_to_group(ret, group=kw.get('group'))
		ret.bld = bld
		# unique key per task generator for the proxy's pickle file
		bld.set_key(ret.path.abspath().replace(os.sep, '') + str(ret.idx))
		return ret

	def is_dirty(self):
		return True

	def store_tstamps(self):
		# Called after a build is finished
		# For each task generator, record all files involved in task objects
		# optimization: done only if there was something built
		do_store = False
		try:
			f_deps = self.f_deps
		except AttributeError:
			f_deps = self.f_deps = {}
			self.f_tstamps = {}

		allfiles = set()
		for g in self.groups:
			for tg in g:
				try:
					staleness = tg.staleness
				except AttributeError:
					staleness = DIRTY

				if staleness != DIRTY:
					# DONE case: there was nothing built
					# NEEDED case: the tg was brought in because of 'use' propagation
					# but nothing really changed for them, there may be incomplete
					# tasks (object files) and in this case it is best to let the next build
					# figure out if an input/output file changed
					continue

				do_cache = False
				for tsk in tg.tasks:
					if tsk.hasrun == Task.SUCCESS:
						do_cache = True
						pass
					elif tsk.hasrun == Task.SKIPPED:
						pass
					else:
						# one failed task, clear the cache for this tg
						try:
							del f_deps[(tg.path.abspath(), tg.idx)]
						except KeyError:
							pass
						else:
							# just store the new state because there is a change
							do_store = True

						# skip the rest because there is no valid cache possible
						break
				else:
					if not do_cache:
						# all skipped, but is there anything in cache?
						try:
							f_deps[(tg.path.abspath(), tg.idx)]
						except KeyError:
							# probably cleared because a wscript file changed
							# store it
							do_cache = True

					if do_cache:

						# there was a rebuild, store the data structure too
						tg.bld.store()

						# all tasks skipped but no cache
						# or a successful task build
						do_store = True
						st = set()
						for tsk in tg.tasks:
							st.update(tsk.inputs)
							st.update(self.node_deps.get(tsk.uid(), []))

						# TODO do last/when loading the tgs?
						lst = []
						for k in ('wscript', 'wscript_build'):
							n = tg.path.find_node(k)
							if n:
								n.get_bld_sig()
								lst.append(n.abspath())

						lst.extend(sorted(x.abspath() for x in st))
						allfiles.update(lst)
						f_deps[(tg.path.abspath(), tg.idx)] = lst

		for x in allfiles:
			# f_tstamps has everything, while md5_tstamp can be relatively empty on partial builds
			self.f_tstamps[x] = self.hashes_md5_tstamp[x][0]

		if do_store:
			dbfn = os.path.join(self.variant_dir, TSTAMP_DB)
			Logs.debug('rev_use: storing %s', dbfn)
			dbfn_tmp = dbfn + '.tmp'
			x = Build.cPickle.dumps([self.f_tstamps, f_deps], Build.PROTOCOL)
			Utils.writef(dbfn_tmp, x, m='wb')
			os.rename(dbfn_tmp, dbfn)
			Logs.debug('rev_use: stored %s', dbfn)

	def store(self):
		# persist timestamps first, then the regular build data if anything ran
		self.store_tstamps()
		if self.producer.dirty:
			Build.BuildContext.store(self)

	def compute_needed_tgs(self):
		"""
		Mark stale task generators and propagate staleness through the 'use'
		graph; load proxy build data for the generators that must be posted.
		"""
		# assume the 'use' keys are not modified during the build phase

		dbfn = os.path.join(self.variant_dir, TSTAMP_DB)
		Logs.debug('rev_use: Loading %s', dbfn)
		try:
			data = Utils.readf(dbfn, 'rb')
		except (EnvironmentError, EOFError):
			Logs.debug('rev_use: Could not load the build cache %s (missing)', dbfn)
			self.f_deps = {}
			self.f_tstamps = {}
		else:
			try:
				self.f_tstamps, self.f_deps = Build.cPickle.loads(data)
			except Exception as e:
				Logs.debug('rev_use: Could not pickle the build cache %s: %r', dbfn, e)
				self.f_deps = {}
				self.f_tstamps = {}
			else:
				Logs.debug('rev_use: Loaded %s', dbfn)

		# 1. obtain task generators that contain rebuilds
		# 2. obtain the 'use' graph and its dual
		stales = set()
		reverse_use_map = Utils.defaultdict(list)
		use_map = Utils.defaultdict(list)

		for g in self.groups:
			for tg in g:
				if tg.is_stale():
					stales.add(tg)

				try:
					lst = tg.use = Utils.to_list(tg.use)
				except AttributeError:
					pass
				else:
					for x in lst:
						try:
							xtg = self.get_tgen_by_name(x)
						except Errors.WafError:
							pass
						else:
							use_map[tg].append(xtg)
							reverse_use_map[xtg].append(tg)

		Logs.debug('rev_use: found %r stale tgs', len(stales))

		# 3. dfs to post downstream tg as stale
		visited = set()
		def mark_down(tg):
			if tg in visited:
				return
			visited.add(tg)
			Logs.debug('rev_use: marking down %r as stale', tg.name)
			tg.staleness = DIRTY
			for x in reverse_use_map[tg]:
				mark_down(x)
		for tg in stales:
			mark_down(tg)

		# 4. dfs to find ancestors tg to mark as needed
		self.needed_tgs = needed_tgs = set()
		def mark_needed(tg):
			if tg in needed_tgs:
				return
			needed_tgs.add(tg)
			if tg.staleness == DONE:
				Logs.debug('rev_use: marking up %r as needed', tg.name)
				tg.staleness = NEEDED
			for x in use_map[tg]:
				mark_needed(x)
		for xx in visited:
			mark_needed(xx)

		# so we have the whole tg trees to post in the set "needed"
		# load their build trees
		for tg in needed_tgs:
			tg.bld.restore()
			tg.bld.fix_tg_path(tg)

		# the stale ones should be fully build, while the needed ones
		# may skip a few tasks, see create_compiled_task and apply_link_after below
		Logs.debug('rev_use: amount of needed task gens: %r', len(needed_tgs))

	def post_group(self):
		# assumption: we can ignore the folder/subfolders cuts
		def tgpost(tg):
			try:
				f = tg.post
			except AttributeError:
				pass
			else:
				f()

		if not self.targets or self.targets == '*':
			for tg in self.groups[self.current_group]:
				# this can cut quite a lot of tg objects
				if tg in self.needed_tgs:
					tgpost(tg)
		else:
			# default implementation
			# fix: the unbound method must receive the context instance
			return Build.BuildContext.post_group(self)

	def get_build_iterator(self):
		if not self.targets or self.targets == '*':
			self.compute_needed_tgs()
		return Build.BuildContext.get_build_iterator(self)

@taskgen_method
def is_stale(self):
	"""
	Return True when this task generator must be posted (and set
	``self.staleness`` accordingly); return False when all its recorded
	dependency files have unchanged timestamps.
	"""
	# assume no globs
	self.staleness = DIRTY

	# 1. the case of always stale targets
	if getattr(self, 'always_stale', False):
		return True

	# 2. check if the db file exists
	db = os.path.join(self.bld.variant_dir, Context.DBFILE)
	try:
		dbstat = os.stat(db).st_mtime
	except OSError:
		# fix: the %r placeholder was missing its argument
		Logs.debug('rev_use: must post %r because this is a clean build', self.name)
		return True

	# 3. check if the configuration changed
	if os.stat(self.bld.bldnode.find_node('c4che/build.config.py').abspath()).st_mtime > dbstat:
		Logs.debug('rev_use: must post %r because the configuration has changed', self.name)
		return True

	# 3.a any tstamp data?
	try:
		f_deps = self.bld.f_deps
	except AttributeError:
		Logs.debug('rev_use: must post %r because there is no f_deps', self.name)
		return True

	# 4. check if this is the first build (no cache)
	try:
		lst = f_deps[(self.path.abspath(), self.idx)]
	except KeyError:
		Logs.debug('rev_use: must post %r because there it has no cached data', self.name)
		return True

	try:
		cache = self.bld.cache_tstamp_rev_use
	except AttributeError:
		cache = self.bld.cache_tstamp_rev_use = {}

	# 5. check the timestamp of each dependency files listed is unchanged
	f_tstamps = self.bld.f_tstamps
	for x in lst:
		try:
			old_ts = f_tstamps[x]
		except KeyError:
			Logs.debug('rev_use: must post %r because %r is not in cache', self.name, x)
			return True

		try:
			try:
				ts = cache[x]
			except KeyError:
				ts = cache[x] = os.stat(x).st_mtime
		except OSError:
			del f_deps[(self.path.abspath(), self.idx)]
			Logs.debug('rev_use: must post %r because %r does not exist anymore', self.name, x)
			return True
		else:
			if ts != old_ts:
				Logs.debug('rev_use: must post %r because the timestamp on %r changed %r %r', self.name, x, old_ts, ts)
				return True

	self.staleness = DONE
	return False

@taskgen_method
def create_compiled_task(self, name, node):
	"""
	Create a compilation task for *node*, or return None to skip the
	object file entirely when the task generator is merely NEEDED.
	"""
	# skip the creation of object files
	# assumption: object-only targets are not skippable
	if self.staleness == NEEDED:
		# only libraries/programs can skip object files
		for x in SKIPPABLE:
			if x in self.features:
				return None

	out = '%s.%d.o' % (node.name, self.idx)
	task = self.create_task(name, node, node.parent.find_or_declare(out))
	try:
		self.compiled_tasks.append(task)
	except AttributeError:
		self.compiled_tasks = [task]
	return task

@feature(*SKIPPABLE)
@after_method('apply_link')
def apply_link_after(self):
	"""Mark link tasks of merely-NEEDED task generators as skipped."""
	# cprogram/cxxprogram might be unnecessary
	if self.staleness != NEEDED:
		return
	for tsk in self.tasks:
		tsk.hasrun = Task.SKIPPED

def path_from(self, node):
	# handle nodes of distinct types: remap foreign-context nodes
	# into this node's own tree before computing the relative path
	if node.ctx is not self.ctx:
		node = self.ctx.root.make_node(node.abspath())
	return self.default_path_from(node)
waflib.Node.Node.default_path_from = waflib.Node.Node.path_from
waflib.Node.Node.path_from = path_from

def h_file(self):
	"""Return the file hash, using a 2-layer (global/local) mtime cache."""
	# similar to md5_tstamp.py, but with 2-layer cache
	# global_cache for the build context common for all task generators
	# local_cache for the build context proxy (one by task generator)
	#
	# the global cache is not persistent
	# the local cache is persistent and meant for partial builds
	#
	# assume all calls are made from a single thread
	#
	filename = self.abspath()
	st = os.stat(filename)

	global_cache = self.ctx.bld.hashes_md5_tstamp
	local_cache = self.ctx.hashes_md5_tstamp

	if filename in global_cache:
		# value already calculated in this build
		cval = global_cache[filename]

		# the value in global cache is assumed to be calculated once
		# reverifying it could cause task generators
		# to get distinct tstamp values, thus missing rebuilds
		local_cache[filename] = cval
		return cval[1]

	if filename in local_cache:
		cval = local_cache[filename]
		if cval[0] == st.st_mtime:
			# correct value from a previous build
			# put it in the global cache
			global_cache[filename] = cval
			return cval[1]

	ret = Utils.h_file(filename)
	local_cache[filename] = global_cache[filename] = (st.st_mtime, ret)
	return ret
waflib.Node.Node.h_file = h_file