# Copyright 2013 The Chromium Authors. All rights reserved.
# Use of this source code is governed by a BSD-style license that can be
# found in the LICENSE file.

from __future__ import print_function

import difflib
import hashlib
import itertools
import json
import os
import sys
import zipfile

from util import build_utils

# When set and a difference is detected, a diff of what changed is printed.
PRINT_EXPLANATIONS = int(os.environ.get('PRINT_BUILD_EXPLANATIONS', 0))

# An escape hatch that causes all targets to be rebuilt.
_FORCE_REBUILD = int(os.environ.get('FORCE_REBUILD', 0))


def CallAndWriteDepfileIfStale(on_stale_md5,
                               options,
                               record_path=None,
                               input_paths=None,
                               input_strings=None,
                               output_paths=None,
                               force=False,
                               pass_changes=False,
                               track_subpaths_allowlist=None,
                               depfile_deps=None):
  """Wraps CallAndRecordIfStale() and writes a depfile if applicable.

  Depfiles are automatically added to output_paths when present in the
  |options| argument. They are then created after |on_stale_md5| is called.

  By default, only python dependencies are added to the depfile. If there are
  other input paths that are not captured by GN deps, then they should be
  listed in depfile_deps. It's important to write paths to the depfile that
  are already captured by GN deps since GN args can cause GN deps to change,
  and such changes are not immediately reflected in depfiles
  (http://crbug.com/589311).
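
  Example (an illustrative sketch, not from a real build script; the |options|
  fields and the _OnStaleMd5 helper below are hypothetical):

    def _OnStaleMd5():
      # Regenerate the output only when an input or flag has changed.
      with open(options.output_file, 'w') as f:
        f.write('generated contents')

    CallAndWriteDepfileIfStale(
        _OnStaleMd5,
        options,
        input_paths=[options.input_file],
        input_strings=['--some-flag'],
        output_paths=[options.output_file])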
  """
  if not output_paths:
    raise Exception('At least one output_path must be specified.')
  input_paths = list(input_paths or [])
  input_strings = list(input_strings or [])
  output_paths = list(output_paths or [])

  input_paths += build_utils.ComputePythonDependencies()

  CallAndRecordIfStale(
      on_stale_md5,
      record_path=record_path,
      input_paths=input_paths,
      input_strings=input_strings,
      output_paths=output_paths,
      force=force,
      pass_changes=pass_changes,
      track_subpaths_allowlist=track_subpaths_allowlist)

  # Write the depfile even when inputs have not changed to ensure build
  # correctness on bots that build with & without patch, when the patch
  # changes the depfile location.
  if hasattr(options, 'depfile') and options.depfile:
    build_utils.WriteDepfile(
        options.depfile, output_paths[0], depfile_deps, add_pydeps=False)


def CallAndRecordIfStale(function,
                         record_path=None,
                         input_paths=None,
                         input_strings=None,
                         output_paths=None,
                         force=False,
                         pass_changes=False,
                         track_subpaths_allowlist=None):
  """Calls function if outputs are stale.

  Outputs are considered stale if:
  - any output_paths are missing, or
  - the contents of any file within input_paths have changed, or
  - the contents of input_strings have changed.

  To debug which files are out-of-date, set the environment variable:
      PRINT_BUILD_EXPLANATIONS=1

  Args:
    function: The function to call.
    record_path: Path to record metadata.
      Defaults to output_paths[0] + '.md5.stamp'
    input_paths: List of paths to calculate an md5 sum on.
    input_strings: List of strings to record verbatim.
    output_paths: List of output paths.
    force: Whether to treat outputs as missing regardless of whether they
      actually are.
    pass_changes: Whether to pass a Changes instance to |function|.
    track_subpaths_allowlist: Relevant only when pass_changes=True. List of .zip
      files from |input_paths| to make subpath information available for.
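
  Example (an illustrative sketch; _OnStale, _Regenerate, and the paths shown
  below are hypothetical):

    def _OnStale(changes):
      if changes.AddedOrModifiedOnly():
        paths = list(changes.IterChangedPaths())
      else:
        paths = list(changes.IterAllPaths())
      _Regenerate(paths)

    CallAndRecordIfStale(
        _OnStale,
        input_paths=['inputs.zip', 'config.flags'],
        input_strings=['--optimize'],
        output_paths=['out/result.zip'],
        pass_changes=True,
        track_subpaths_allowlist=['inputs.zip'])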
  """
  assert record_path or output_paths
  input_paths = input_paths or []
  input_strings = input_strings or []
  output_paths = output_paths or []
  record_path = record_path or output_paths[0] + '.md5.stamp'

  assert record_path.endswith('.stamp'), (
      'record paths must end in \'.stamp\' so that they are easy to find '
      'and delete')

  new_metadata = _Metadata(track_entries=pass_changes or PRINT_EXPLANATIONS)
  new_metadata.AddStrings(input_strings)

  zip_allowlist = set(track_subpaths_allowlist or [])
  for path in input_paths:
    # It's faster to md5 an entire zip file than it is to just locate & hash
    # its central directory (which is what this used to do).
    if path in zip_allowlist:
      entries = _ExtractZipEntries(path)
      new_metadata.AddZipFile(path, entries)
    else:
      new_metadata.AddFile(path, _ComputeTagForPath(path))

  old_metadata = None
  force = force or _FORCE_REBUILD
  missing_outputs = [x for x in output_paths if force or not os.path.exists(x)]
  # When outputs are missing, don't bother gathering change information.
  if not missing_outputs and os.path.exists(record_path):
    with open(record_path, 'r') as jsonfile:
      try:
        old_metadata = _Metadata.FromFile(jsonfile)
      except:  # pylint: disable=bare-except
        pass  # Not yet using new file format.

  changes = Changes(old_metadata, new_metadata, force, missing_outputs)
  if not changes.HasChanges():
    return

  if PRINT_EXPLANATIONS:
    print('=' * 80)
    print('Target is stale: %s' % record_path)
    print(changes.DescribeDifference())
    print('=' * 80)

  args = (changes,) if pass_changes else ()
  function(*args)

  with open(record_path, 'w') as f:
    new_metadata.ToFile(f)


class Changes(object):
  """Provides an API for querying what changed between runs."""

  def __init__(self, old_metadata, new_metadata, force, missing_outputs):
    self.old_metadata = old_metadata
    self.new_metadata = new_metadata
    self.force = force
    self.missing_outputs = missing_outputs

  def _GetOldTag(self, path, subpath=None):
    return self.old_metadata and self.old_metadata.GetTag(path, subpath)

  def HasChanges(self):
    """Returns whether any changes exist."""
    return (self.HasStringChanges()
            or self.old_metadata.FilesMd5() != self.new_metadata.FilesMd5())

  def HasStringChanges(self):
    """Returns whether string metadata changed."""
    return (self.force or not self.old_metadata
            or self.old_metadata.StringsMd5() != self.new_metadata.StringsMd5())

  def AddedOrModifiedOnly(self):
    """Returns whether the only changes were from added or modified (sub)files.

    No missing outputs, no removed paths/subpaths.
    """
    if self.HasStringChanges():
      return False
    if any(self.IterRemovedPaths()):
      return False
    for path in self.IterModifiedPaths():
      if any(self.IterRemovedSubpaths(path)):
        return False
    return True

  def IterAllPaths(self):
    """Generator for paths."""
    return self.new_metadata.IterPaths()

  def IterAllSubpaths(self, path):
    """Generator for subpaths."""
    return self.new_metadata.IterSubpaths(path)

  def IterAddedPaths(self):
    """Generator for paths that were added."""
    for path in self.new_metadata.IterPaths():
      if self._GetOldTag(path) is None:
        yield path

  def IterAddedSubpaths(self, path):
    """Generator for paths that were added within the given zip file."""
    for subpath in self.new_metadata.IterSubpaths(path):
      if self._GetOldTag(path, subpath) is None:
        yield subpath

  def IterRemovedPaths(self):
    """Generator for paths that were removed."""
    if self.old_metadata:
      for path in self.old_metadata.IterPaths():
        if self.new_metadata.GetTag(path) is None:
          yield path

  def IterRemovedSubpaths(self, path):
    """Generator for paths that were removed within the given zip file."""
    if self.old_metadata:
      for subpath in self.old_metadata.IterSubpaths(path):
        if self.new_metadata.GetTag(path, subpath) is None:
          yield subpath

  def IterModifiedPaths(self):
    """Generator for paths whose contents have changed."""
    for path in self.new_metadata.IterPaths():
      old_tag = self._GetOldTag(path)
      new_tag = self.new_metadata.GetTag(path)
      if old_tag is not None and old_tag != new_tag:
        yield path

  def IterModifiedSubpaths(self, path):
    """Generator for paths within a zip file whose contents have changed."""
    for subpath in self.new_metadata.IterSubpaths(path):
      old_tag = self._GetOldTag(path, subpath)
      new_tag = self.new_metadata.GetTag(path, subpath)
      if old_tag is not None and old_tag != new_tag:
        yield subpath

  def IterChangedPaths(self):
    """Generator for all changed paths (added/removed/modified)."""
    return itertools.chain(self.IterRemovedPaths(),
                           self.IterModifiedPaths(),
                           self.IterAddedPaths())

  def IterChangedSubpaths(self, path):
    """Generator for paths within a zip that were added/removed/modified."""
    return itertools.chain(self.IterRemovedSubpaths(path),
                           self.IterModifiedSubpaths(path),
                           self.IterAddedSubpaths(path))

  def DescribeDifference(self):
    """Returns a human-readable description of what changed."""
    if self.force:
      return 'force=True'
    elif self.missing_outputs:
      return 'Outputs do not exist:\n  ' + '\n  '.join(self.missing_outputs)
    elif self.old_metadata is None:
      return 'Previous stamp file not found.'

    if self.old_metadata.StringsMd5() != self.new_metadata.StringsMd5():
      ndiff = difflib.ndiff(self.old_metadata.GetStrings(),
                            self.new_metadata.GetStrings())
      changed = [s for s in ndiff if not s.startswith(' ')]
      return 'Input strings changed:\n  ' + '\n  '.join(changed)

    if self.old_metadata.FilesMd5() == self.new_metadata.FilesMd5():
      return "There's no difference."

    lines = []
    lines.extend('Added: ' + p for p in self.IterAddedPaths())
    lines.extend('Removed: ' + p for p in self.IterRemovedPaths())
    for path in self.IterModifiedPaths():
      lines.append('Modified: ' + path)
      lines.extend('  -> Subpath added: ' + p
                   for p in self.IterAddedSubpaths(path))
      lines.extend('  -> Subpath removed: ' + p
                   for p in self.IterRemovedSubpaths(path))
      lines.extend('  -> Subpath modified: ' + p
                   for p in self.IterModifiedSubpaths(path))
    if lines:
      return 'Input files changed:\n  ' + '\n  '.join(lines)
    return 'I have no idea what changed (there is a bug).'


class _Metadata(object):
  """Data model for tracking change metadata.

  Args:
    track_entries: Enables per-file change tracking. Slower, but required for
        Changes functionality.
  """
  # Schema:
  # {
  #   "files-md5": "VALUE",
  #   "strings-md5": "VALUE",
  #   "input-files": [
  #     {
  #       "path": "path.jar",
  #       "tag": "{MD5 of entries}",
  #       "entries": [
  #         { "path": "org/chromium/base/Foo.class", "tag": "{CRC32}" }, ...
  #       ]
  #     }, {
  #       "path": "path.txt",
  #       "tag": "{MD5}",
  #     }
  #   ],
  #   "input-strings": ["a", "b", ...],
  # }
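  #
  # Example (an illustrative sketch of the write/read round trip; the file
  # names and values shown are hypothetical):
  #
  #   meta = _Metadata(track_entries=True)
  #   meta.AddStrings(['--enable-foo'])
  #   meta.AddFile('in.txt', _ComputeTagForPath('in.txt'))
  #   with open('out.stamp', 'w') as f:
  #     meta.ToFile(f)
  #   with open('out.stamp') as f:
  #     old_meta = _Metadata.FromFile(f)
  #   assert old_meta.FilesMd5() == meta.FilesMd5()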
  def __init__(self, track_entries=False):
    self._track_entries = track_entries
    self._files_md5 = None
    self._strings_md5 = None
    self._files = []
    self._strings = []
    # Map of (path, subpath) -> entry. Created upon first call to _GetEntry().
    self._file_map = None

  @classmethod
  def FromFile(cls, fileobj):
    """Returns a _Metadata initialized from a file object."""
    ret = cls()
    obj = json.load(fileobj)
    ret._files_md5 = obj['files-md5']
    ret._strings_md5 = obj['strings-md5']
    ret._files = obj.get('input-files', [])
    ret._strings = obj.get('input-strings', [])
    return ret

  def ToFile(self, fileobj):
    """Serializes metadata to the given file object."""
    obj = {
        'files-md5': self.FilesMd5(),
        'strings-md5': self.StringsMd5(),
    }
    if self._track_entries:
      obj['input-files'] = sorted(self._files, key=lambda e: e['path'])
      obj['input-strings'] = self._strings

    json.dump(obj, fileobj, indent=2)

  def _AssertNotQueried(self):
    assert self._files_md5 is None
    assert self._strings_md5 is None
    assert self._file_map is None

  def AddStrings(self, values):
    self._AssertNotQueried()
    self._strings.extend(str(v) for v in values)

  def AddFile(self, path, tag):
    """Adds metadata for a non-zip file.

    Args:
      path: Path to the file.
      tag: A short string representative of the file contents.
    """
    self._AssertNotQueried()
    self._files.append({
        'path': path,
        'tag': tag,
    })

  def AddZipFile(self, path, entries):
    """Adds metadata for a zip file.

    Args:
      path: Path to the file.
      entries: List of (subpath, tag) tuples for entries within the zip.
    """
    self._AssertNotQueried()
    tag = _ComputeInlineMd5(itertools.chain((e[0] for e in entries),
                                            (e[1] for e in entries)))
    self._files.append({
        'path': path,
        'tag': tag,
        'entries': [{'path': e[0], 'tag': e[1]} for e in entries],
    })

  def GetStrings(self):
    """Returns the list of input strings."""
    return self._strings

  def FilesMd5(self):
    """Lazily computes and returns the aggregate md5 of input files."""
    if self._files_md5 is None:
      # Omit paths from md5 since temporary files have random names.
      self._files_md5 = _ComputeInlineMd5(
          self.GetTag(p) for p in sorted(self.IterPaths()))
    return self._files_md5

  def StringsMd5(self):
    """Lazily computes and returns the aggregate md5 of input strings."""
    if self._strings_md5 is None:
      self._strings_md5 = _ComputeInlineMd5(self._strings)
    return self._strings_md5

  def _GetEntry(self, path, subpath=None):
    """Returns the JSON entry for the given path / subpath."""
    if self._file_map is None:
      self._file_map = {}
      for entry in self._files:
        self._file_map[(entry['path'], None)] = entry
        for subentry in entry.get('entries', ()):
          self._file_map[(entry['path'], subentry['path'])] = subentry
    return self._file_map.get((path, subpath))

  def GetTag(self, path, subpath=None):
    """Returns the tag for the given path / subpath."""
    ret = self._GetEntry(path, subpath)
    return ret and ret['tag']

  def IterPaths(self):
    """Returns a generator for all top-level paths."""
    return (e['path'] for e in self._files)

  def IterSubpaths(self, path):
    """Returns a generator for all subpaths in the given zip.

    If the given path is not a zip file or doesn't exist, returns an empty
    iterable.
    """
    outer_entry = self._GetEntry(path)
    if not outer_entry:
      return ()
    subentries = outer_entry.get('entries', [])
    return (entry['path'] for entry in subentries)


def _ComputeTagForPath(path):
  stat = os.stat(path)
  if stat.st_size > 1 * 1024 * 1024:
    # Fall back to mtime for large files so that md5_check does not take too
    # long to run.
    return stat.st_mtime
  md5 = hashlib.md5()
  with open(path, 'rb') as f:
    md5.update(f.read())
  return md5.hexdigest()


def _ComputeInlineMd5(iterable):
  """Computes the md5 of the concatenated parameters."""
  md5 = hashlib.md5()
  for item in iterable:
    md5.update(str(item))
  return md5.hexdigest()


def _ExtractZipEntries(path):
  """Returns a list of (path, tag) for all files within |path|.

  The tag is derived from each entry's CRC-32 and compression type.
  """
  entries = []
  with zipfile.ZipFile(path) as zip_file:
    for zip_info in zip_file.infolist():
      # Skip directories and empty files.
      if zip_info.CRC:
        entries.append(
            (zip_info.filename, zip_info.CRC + zip_info.compress_type))
  return entries