1#! /usr/bin/env python
2# encoding: utf-8
3# Thomas Nagy, 2017-2018 (ita)
4
5"""
6A system for fast partial rebuilds
7
8Creating a large amount of task objects up front can take some time.
By making a few assumptions, it is possible to avoid creating
task objects for targets that are already up-to-date.
11
12On a silly benchmark the gain observed for 1M tasks can be 5m->10s
13for a single file change.
14
15Usage::
16
17	def options(opt):
18		opt.load('fast_partial')
19
20Assumptions:
21* Start with a clean build (run "waf distclean" after enabling)
22* Mostly for C/C++/Fortran targets with link tasks (object-only targets are not handled)
23  try it in the folder generated by utils/genbench.py
24* For full project builds: no --targets and no pruning from subfolders
25* The installation phase is ignored
26* `use=` dependencies are specified up front even across build groups
27* Task generator source files are not obtained from globs
28
29Implementation details:
30* The first layer obtains file timestamps to recalculate file hashes only
31  when necessary (similar to md5_tstamp); the timestamps are then stored
32  in a dedicated pickle file
33* A second layer associates each task generator to a file set to help
34  detecting changes. Task generators are to create their tasks only when
35  the related files have been modified. A specific db file is created
36  to store such data (5m -> 1m10)
37* A third layer binds build context proxies onto task generators, replacing
38  the default context. While loading data for the full build uses more memory
39  (4GB -> 9GB), partial builds are then much faster (1m10 -> 13s)
40* A fourth layer enables a 2-level cache on file signatures to
41  reduce the size of the main pickle file (13s -> 10s)
42"""
43
44import os
45from waflib import Build, Context, Errors, Logs, Task, TaskGen, Utils
46from waflib.TaskGen import feature, after_method, taskgen_method
47import waflib.Node
48
49DONE = 0
50DIRTY = 1
51NEEDED = 2
52
53SKIPPABLE = ['cshlib', 'cxxshlib', 'cstlib', 'cxxstlib', 'cprogram', 'cxxprogram']
54
55TSTAMP_DB = '.wafpickle_tstamp_db_file'
56
57SAVED_ATTRS = 'root node_sigs task_sigs imp_sigs raw_deps node_deps'.split()
58
class bld_proxy(object):
	"""
	A lightweight stand-in for the build context, bound to a single task
	generator. Attribute reads fall back to the real context, writes are
	forwarded to it, while the SAVED_ATTRS data and the Node tree are kept
	private so that each task generator can pickle/unpickle its own build
	data independently (see :py:meth:`store` and :py:meth:`restore`).
	"""
	def __init__(self, bld):
		object.__setattr__(self, 'bld', bld)

		# a private Node subclass so that pickling does not interfere
		# with the Node objects of the main build context
		object.__setattr__(self, 'node_class', type('Nod3', (waflib.Node.Node,), {}))
		self.node_class.__module__ = 'waflib.Node'
		self.node_class.ctx = self

		object.__setattr__(self, 'root', self.node_class('', None))
		for x in SAVED_ATTRS:
			if x != 'root':
				object.__setattr__(self, x, {})

		self.fix_nodes()

	def __setattr__(self, name, value):
		# attribute writes are forwarded to the real build context
		bld = object.__getattribute__(self, 'bld')
		setattr(bld, name, value)

	def __delattr__(self, name):
		# attribute deletions are forwarded to the real build context
		bld = object.__getattribute__(self, 'bld')
		delattr(bld, name)

	def __getattribute__(self, name):
		# local attributes win; anything else comes from the real context
		try:
			return object.__getattribute__(self, name)
		except AttributeError:
			bld = object.__getattribute__(self, 'bld')
			return getattr(bld, name)

	def __call__(self, *k, **kw):
		# creating a task generator through the proxy delegates to the context
		return self.bld(*k, **kw)

	def fix_nodes(self):
		# rebind srcnode/path/bldnode as nodes of the private node class
		for x in ('srcnode', 'path', 'bldnode'):
			node = self.root.find_dir(getattr(self.bld, x).abspath())
			object.__setattr__(self, x, node)

	def set_key(self, store_key):
		# suffix identifying the pickle file associated with this proxy
		object.__setattr__(self, 'store_key', store_key)

	def fix_tg_path(self, *tgs):
		# changing Node objects on task generators is possible
		# yet, all Node objects must belong to the same parent
		for tg in tgs:
			tg.path = self.root.make_node(tg.path.abspath())

	def restore(self):
		"""
		Load the SAVED_ATTRS data from the pickle file bound to this proxy,
		if it exists; missing or corrupt files leave the data empty.
		"""
		dbfn = os.path.join(self.variant_dir, Context.DBFILE + self.store_key)
		Logs.debug('rev_use: reading %s', dbfn)
		try:
			data = Utils.readf(dbfn, 'rb')
		except (EnvironmentError, EOFError):
			# handle missing file/empty file
			Logs.debug('rev_use: Could not load the build cache %s (missing)', dbfn)
		else:
			# use the lock as a context manager so that it is never released
			# without having been acquired (and for consistency with store())
			with waflib.Node.pickle_lock:
				waflib.Node.Nod3 = self.node_class
				try:
					data = Build.cPickle.loads(data)
				except Exception as e:
					Logs.debug('rev_use: Could not pickle the build cache %s: %r', dbfn, e)
				else:
					for x in SAVED_ATTRS:
						object.__setattr__(self, x, data.get(x, {}))
		self.fix_nodes()

	def store(self):
		"""
		Pickle the SAVED_ATTRS data into the file bound to this proxy,
		writing to a temporary file first and renaming it into place.
		"""
		data = {}
		for x in Build.SAVED_ATTRS:
			data[x] = getattr(self, x)
		db = os.path.join(self.variant_dir, Context.DBFILE + self.store_key)

		with waflib.Node.pickle_lock:
			waflib.Node.Nod3 = self.node_class
			try:
				x = Build.cPickle.dumps(data, Build.PROTOCOL)
			except Build.cPickle.PicklingError:
				root = data['root']
				for node_deps in data['node_deps'].values():
					for idx, node in enumerate(node_deps):
						# there may be more cross-context Node objects to fix,
						# but this should be the main source
						node_deps[idx] = root.find_node(node.abspath())
				x = Build.cPickle.dumps(data, Build.PROTOCOL)

		Logs.debug('rev_use: storing %s', db)
		Utils.writef(db + '.tmp', x, m='wb')
		try:
			st = os.stat(db)
			os.remove(db)
			if not Utils.is_win32:
				# preserve ownership of the previous db file when possible
				os.chown(db + '.tmp', st.st_uid, st.st_gid)
		except (AttributeError, OSError):
			pass
		os.rename(db + '.tmp', db)
158
class bld(Build.BuildContext):
	"""
	A build context that avoids posting task generators whose input files
	are unchanged, by maintaining reverse timestamp and dependency data
	(the TSTAMP_DB pickle file) and per-task-generator build data proxies.
	"""
	def __init__(self, **kw):
		super(bld, self).__init__(**kw)
		# non-persistent file signature cache shared by all proxies (see h_file)
		self.hashes_md5_tstamp = {}

	def __call__(self, *k, **kw):
		"""Create a task generator and bind a bld_proxy object to it"""
		# this is one way of doing it, one could use a task generator method too
		bld = kw['bld'] = bld_proxy(self)
		ret = TaskGen.task_gen(*k, **kw)
		self.task_gen_cache_names = {}
		self.add_to_group(ret, group=kw.get('group'))
		ret.bld = bld
		# the store key must be unique per task generator
		bld.set_key(ret.path.abspath().replace(os.sep, '') + str(ret.idx))
		return ret

	def is_dirty(self):
		# the persistent data is always considered for writing at build end
		return True

	def store_tstamps(self):
		# Called after a build is finished
		# For each task generator, record all files involved in task objects
		# optimization: done only if there was something built
		do_store = False
		try:
			f_deps = self.f_deps
		except AttributeError:
			f_deps = self.f_deps = {}
			self.f_tstamps = {}

		allfiles = set()
		for g in self.groups:
			for tg in g:
				try:
					staleness = tg.staleness
				except AttributeError:
					staleness = DIRTY

				if staleness != DIRTY:
					# DONE case: there was nothing built
					# NEEDED case: the tg was brought in because of 'use' propagation
					# but nothing really changed for them, there may be incomplete
					# tasks (object files) and in this case it is best to let the next build
					# figure out if an input/output file changed
					continue

				do_cache = False
				for tsk in tg.tasks:
					if tsk.hasrun == Task.SUCCESS:
						do_cache = True
					elif tsk.hasrun == Task.SKIPPED:
						pass
					else:
						# one failed task, clear the cache for this tg
						try:
							del f_deps[(tg.path.abspath(), tg.idx)]
						except KeyError:
							pass
						else:
							# just store the new state because there is a change
							do_store = True

						# skip the rest because there is no valid cache possible
						break
				else:
					if not do_cache:
						# all skipped, but is there anything in cache?
						try:
							f_deps[(tg.path.abspath(), tg.idx)]
						except KeyError:
							# probably cleared because a wscript file changed
							# store it
							do_cache = True

					if do_cache:

						# there was a rebuild, store the data structure too
						tg.bld.store()

						# all tasks skipped but no cache
						# or a successful task build
						do_store = True
						st = set()
						for tsk in tg.tasks:
							st.update(tsk.inputs)
							st.update(self.node_deps.get(tsk.uid(), []))

						# TODO do last/when loading the tgs?
						lst = []
						for k in ('wscript', 'wscript_build'):
							n = tg.path.find_node(k)
							if n:
								n.get_bld_sig()
								lst.append(n.abspath())

						lst.extend(sorted(x.abspath() for x in st))
						allfiles.update(lst)
						f_deps[(tg.path.abspath(), tg.idx)] = lst

		for x in allfiles:
			# f_tstamps has everything, while md5_tstamp can be relatively empty on partial builds
			self.f_tstamps[x] = self.hashes_md5_tstamp[x][0]

		if do_store:
			dbfn = os.path.join(self.variant_dir, TSTAMP_DB)
			Logs.debug('rev_use: storing %s', dbfn)
			dbfn_tmp = dbfn + '.tmp'
			x = Build.cPickle.dumps([self.f_tstamps, f_deps], Build.PROTOCOL)
			Utils.writef(dbfn_tmp, x, m='wb')
			# atomic replacement of the timestamp database
			os.rename(dbfn_tmp, dbfn)
			Logs.debug('rev_use: stored %s', dbfn)

	def store(self):
		# persist the reverse dependency data, then the regular build data
		self.store_tstamps()
		if self.producer.dirty:
			Build.BuildContext.store(self)

	def compute_needed_tgs(self):
		"""
		Mark task generators as DIRTY (must rebuild), NEEDED (must be posted
		for 'use' propagation) or DONE, then load the build data proxies of
		the generators that will actually be posted.
		"""
		# assume the 'use' keys are not modified during the build phase

		dbfn = os.path.join(self.variant_dir, TSTAMP_DB)
		Logs.debug('rev_use: Loading %s', dbfn)
		try:
			data = Utils.readf(dbfn, 'rb')
		except (EnvironmentError, EOFError):
			Logs.debug('rev_use: Could not load the build cache %s (missing)', dbfn)
			self.f_deps = {}
			self.f_tstamps = {}
		else:
			try:
				self.f_tstamps, self.f_deps = Build.cPickle.loads(data)
			except Exception as e:
				Logs.debug('rev_use: Could not pickle the build cache %s: %r', dbfn, e)
				self.f_deps = {}
				self.f_tstamps = {}
			else:
				Logs.debug('rev_use: Loaded %s', dbfn)


		# 1. obtain task generators that contain rebuilds
		# 2. obtain the 'use' graph and its dual
		stales = set()
		reverse_use_map = Utils.defaultdict(list)
		use_map = Utils.defaultdict(list)

		for g in self.groups:
			for tg in g:
				if tg.is_stale():
					stales.add(tg)

				try:
					lst = tg.use = Utils.to_list(tg.use)
				except AttributeError:
					pass
				else:
					for x in lst:
						try:
							xtg = self.get_tgen_by_name(x)
						except Errors.WafError:
							pass
						else:
							use_map[tg].append(xtg)
							reverse_use_map[xtg].append(tg)

		Logs.debug('rev_use: found %r stale tgs', len(stales))

		# 3. dfs to post downstream tg as stale
		visited = set()
		def mark_down(tg):
			if tg in visited:
				return
			visited.add(tg)
			Logs.debug('rev_use: marking down %r as stale', tg.name)
			tg.staleness = DIRTY
			for x in reverse_use_map[tg]:
				mark_down(x)
		for tg in stales:
			mark_down(tg)

		# 4. dfs to find ancestors tg to mark as needed
		self.needed_tgs = needed_tgs = set()
		def mark_needed(tg):
			if tg in needed_tgs:
				return
			needed_tgs.add(tg)
			if tg.staleness == DONE:
				Logs.debug('rev_use: marking up %r as needed', tg.name)
				tg.staleness = NEEDED
			for x in use_map[tg]:
				mark_needed(x)
		for xx in visited:
			mark_needed(xx)

		# so we have the whole tg trees to post in the set "needed"
		# load their build trees
		for tg in needed_tgs:
			tg.bld.restore()
			tg.bld.fix_tg_path(tg)

		# the stale ones should be fully build, while the needed ones
		# may skip a few tasks, see create_compiled_task and apply_link_after below
		Logs.debug('rev_use: amount of needed task gens: %r', len(needed_tgs))

	def post_group(self):
		# assumption: we can ignore the folder/subfolders cuts
		def tgpost(tg):
			try:
				f = tg.post
			except AttributeError:
				pass
			else:
				f()

		if not self.targets or self.targets == '*':
			for tg in self.groups[self.current_group]:
				# this can cut quite a lot of tg objects
				if tg in self.needed_tgs:
					tgpost(tg)
		else:
			# default implementation; the parent method is accessed through
			# the class, so self must be passed explicitly (was a TypeError)
			return Build.BuildContext.post_group(self)

	def get_build_iterator(self):
		if not self.targets or self.targets == '*':
			self.compute_needed_tgs()
		return Build.BuildContext.get_build_iterator(self)
385
@taskgen_method
def is_stale(self):
	"""
	Determine whether this task generator must create its tasks again;
	sets self.staleness to DIRTY (default) or DONE accordingly and
	returns True when a rebuild is required.
	"""
	# assume no globs
	self.staleness = DIRTY

	# 1. the case of always stale targets
	if getattr(self, 'always_stale', False):
		return True

	# 2. check if the db file exists
	db = os.path.join(self.bld.variant_dir, Context.DBFILE)
	try:
		dbstat = os.stat(db).st_mtime
	except OSError:
		# the %r argument was missing, causing a logging format error
		Logs.debug('rev_use: must post %r because this is a clean build', self.name)
		return True

	# 3.a check if the configuration exists
	cache_node = self.bld.bldnode.find_node('c4che/build.config.py')
	if not cache_node:
		return True

	# 3.b check if the configuration changed
	if os.stat(cache_node.abspath()).st_mtime > dbstat:
		Logs.debug('rev_use: must post %r because the configuration has changed', self.name)
		return True

	# 3.c any tstamp data?
	try:
		f_deps = self.bld.f_deps
	except AttributeError:
		Logs.debug('rev_use: must post %r because there is no f_deps', self.name)
		return True

	# 4. check if this is the first build (no cache)
	try:
		lst = f_deps[(self.path.abspath(), self.idx)]
	except KeyError:
		Logs.debug('rev_use: must post %r because it has no cached data', self.name)
		return True

	try:
		cache = self.bld.cache_tstamp_rev_use
	except AttributeError:
		cache = self.bld.cache_tstamp_rev_use = {}

	# 5. check the timestamp of each dependency files listed is unchanged
	f_tstamps = self.bld.f_tstamps
	for x in lst:
		try:
			old_ts = f_tstamps[x]
		except KeyError:
			Logs.debug('rev_use: must post %r because %r is not in cache', self.name, x)
			return True

		try:
			try:
				ts = cache[x]
			except KeyError:
				ts = cache[x] = os.stat(x).st_mtime
		except OSError:
			# a dependency file disappeared: invalidate the cached entry
			del f_deps[(self.path.abspath(), self.idx)]
			Logs.debug('rev_use: must post %r because %r does not exist anymore', self.name, x)
			return True
		else:
			if ts != old_ts:
				Logs.debug('rev_use: must post %r because the timestamp on %r changed %r %r', self.name, x, old_ts, ts)
				return True

	self.staleness = DONE
	return False
457
@taskgen_method
def create_compiled_task(self, name, node):
	"""
	Create a compilation task for a source node; returns None instead
	when the task generator is merely NEEDED and has a link feature,
	as the object files are then assumed up-to-date.
	"""
	# skip the creation of object files
	# assumption: object-only targets are not skippable
	if self.staleness == NEEDED and any(feat in self.features for feat in SKIPPABLE):
		# only libraries/programs can skip object files
		return None

	target = node.parent.find_or_declare('%s.%d.o' % (node.name, self.idx))
	tsk = self.create_task(name, node, target)
	if hasattr(self, 'compiled_tasks'):
		self.compiled_tasks.append(tsk)
	else:
		self.compiled_tasks = [tsk]
	return tsk
475
@feature(*SKIPPABLE)
@after_method('apply_link')
def apply_link_after(self):
	"""
	Mark every task of a NEEDED task generator as skipped: the generator
	was only posted for 'use' propagation, nothing has to run.
	"""
	# cprogram/cxxprogram might be unnecessary
	if self.staleness == NEEDED:
		for tsk in self.tasks:
			tsk.hasrun = Task.SKIPPED
484
def path_from(self, node):
	"""
	Compute a relative path between nodes that may belong to distinct
	contexts, by first rebinding foreign nodes into this node's tree.
	"""
	other = node
	if other.ctx is not self.ctx:
		# rebuild the foreign node inside this context's tree
		other = self.ctx.root.make_node(other.abspath())
	return self.default_path_from(other)
waflib.Node.Node.default_path_from = waflib.Node.Node.path_from
waflib.Node.Node.path_from = path_from
492
def h_file(self):
	"""
	Return the signature of a file, similar to md5_tstamp.py but with a
	2-layer cache:

	* the global cache on the build context, common to all task generators,
	  not persistent, and assumed authoritative within one build
	* the local cache on the build context proxy (one per task generator),
	  persistent and meant for partial builds

	All calls are assumed to be made from a single thread.
	"""
	filename = self.abspath()
	st = os.stat(filename)

	global_cache = self.ctx.bld.hashes_md5_tstamp
	local_cache = self.ctx.hashes_md5_tstamp

	try:
		cval = global_cache[filename]
	except KeyError:
		pass
	else:
		# value already calculated in this build; it is assumed computed
		# once, so it is propagated without re-checking the timestamp,
		# else task generators could see distinct tstamps and miss rebuilds
		local_cache[filename] = cval
		return cval[1]

	try:
		cval = local_cache[filename]
	except KeyError:
		pass
	else:
		if cval[0] == st.st_mtime:
			# correct value from a previous build, promote to the global cache
			global_cache[filename] = cval
			return cval[1]

	sig = Utils.h_file(filename)
	cval = (st.st_mtime, sig)
	global_cache[filename] = cval
	local_cache[filename] = cval
	return sig
waflib.Node.Node.h_file = h_file
531
532