1# walk.py -- General implementation of walking commits and their contents.
2# Copyright (C) 2010 Google, Inc.
3#
4# Dulwich is dual-licensed under the Apache License, Version 2.0 and the GNU
5# General Public License as public by the Free Software Foundation; version 2.0
6# or (at your option) any later version. You can redistribute it and/or
7# modify it under the terms of either of these two licenses.
8#
9# Unless required by applicable law or agreed to in writing, software
10# distributed under the License is distributed on an "AS IS" BASIS,
11# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12# See the License for the specific language governing permissions and
13# limitations under the License.
14#
15# You should have received a copy of the licenses; if not, see
16# <http://www.gnu.org/licenses/> for a copy of the GNU General Public License
17# and <http://www.apache.org/licenses/LICENSE-2.0> for a copy of the Apache
18# License, Version 2.0.
19#
20
21"""General implementation of walking commits and their contents."""
22
23
24import collections
25import heapq
26from itertools import chain
27
28from dulwich.diff_tree import (
29    RENAME_CHANGE_TYPES,
30    tree_changes,
31    tree_changes_for_merge,
32    RenameDetector,
33    )
34from dulwich.errors import (
35    MissingCommitError,
36    )
37from dulwich.objects import (
38    Tag,
39    )
40
41ORDER_DATE = 'date'
42ORDER_TOPO = 'topo'
43
44ALL_ORDERS = (ORDER_DATE, ORDER_TOPO)
45
46# Maximum number of commits to walk past a commit time boundary.
47_MAX_EXTRA_COMMITS = 5
48
49
50class WalkEntry(object):
51    """Object encapsulating a single result from a walk."""
52
53    def __init__(self, walker, commit):
54        self.commit = commit
55        self._store = walker.store
56        self._get_parents = walker.get_parents
57        self._changes = {}
58        self._rename_detector = walker.rename_detector
59
60    def changes(self, path_prefix=None):
61        """Get the tree changes for this entry.
62
63        Args:
64          path_prefix: Portion of the path in the repository to
65            use to filter changes. Must be a directory name. Must be
66            a full, valid, path reference (no partial names or wildcards).
67        Returns: For commits with up to one parent, a list of TreeChange
68            objects; if the commit has no parents, these will be relative to
69            the empty tree. For merge commits, a list of lists of TreeChange
70            objects; see dulwich.diff.tree_changes_for_merge.
71        """
72        cached = self._changes.get(path_prefix)
73        if cached is None:
74            commit = self.commit
75            if not self._get_parents(commit):
76                changes_func = tree_changes
77                parent = None
78            elif len(self._get_parents(commit)) == 1:
79                changes_func = tree_changes
80                parent = self._store[self._get_parents(commit)[0]].tree
81                if path_prefix:
82                    mode, subtree_sha = parent.lookup_path(
83                        self._store.__getitem__,
84                        path_prefix,
85                    )
86                    parent = self._store[subtree_sha]
87            else:
88                changes_func = tree_changes_for_merge
89                parent = [
90                        self._store[p].tree for p in self._get_parents(commit)]
91                if path_prefix:
92                    parent_trees = [self._store[p] for p in parent]
93                    parent = []
94                    for p in parent_trees:
95                        try:
96                            mode, st = p.lookup_path(
97                                self._store.__getitem__,
98                                path_prefix,
99                            )
100                        except KeyError:
101                            pass
102                        else:
103                            parent.append(st)
104            commit_tree_sha = commit.tree
105            if path_prefix:
106                commit_tree = self._store[commit_tree_sha]
107                mode, commit_tree_sha = commit_tree.lookup_path(
108                    self._store.__getitem__,
109                    path_prefix,
110                )
111            cached = list(changes_func(
112              self._store, parent, commit_tree_sha,
113              rename_detector=self._rename_detector))
114            self._changes[path_prefix] = cached
115        return self._changes[path_prefix]
116
117    def __repr__(self):
118        return '<WalkEntry commit=%s, changes=%r>' % (
119          self.commit.id, self.changes())
120
121
122class _CommitTimeQueue(object):
123    """Priority queue of WalkEntry objects by commit time."""
124
125    def __init__(self, walker):
126        self._walker = walker
127        self._store = walker.store
128        self._get_parents = walker.get_parents
129        self._excluded = walker.excluded
130        self._pq = []
131        self._pq_set = set()
132        self._seen = set()
133        self._done = set()
134        self._min_time = walker.since
135        self._last = None
136        self._extra_commits_left = _MAX_EXTRA_COMMITS
137        self._is_finished = False
138
139        for commit_id in chain(walker.include, walker.excluded):
140            self._push(commit_id)
141
142    def _push(self, object_id):
143        try:
144            obj = self._store[object_id]
145        except KeyError:
146            raise MissingCommitError(object_id)
147        if isinstance(obj, Tag):
148            self._push(obj.object[1])
149            return
150        # TODO(jelmer): What to do about non-Commit and non-Tag objects?
151        commit = obj
152        if commit.id not in self._pq_set and commit.id not in self._done:
153            heapq.heappush(self._pq, (-commit.commit_time, commit))
154            self._pq_set.add(commit.id)
155            self._seen.add(commit.id)
156
157    def _exclude_parents(self, commit):
158        excluded = self._excluded
159        seen = self._seen
160        todo = [commit]
161        while todo:
162            commit = todo.pop()
163            for parent in self._get_parents(commit):
164                if parent not in excluded and parent in seen:
165                    # TODO: This is inefficient unless the object store does
166                    # some caching (which DiskObjectStore currently does not).
167                    # We could either add caching in this class or pass around
168                    # parsed queue entry objects instead of commits.
169                    todo.append(self._store[parent])
170                excluded.add(parent)
171
172    def next(self):
173        if self._is_finished:
174            return None
175        while self._pq:
176            _, commit = heapq.heappop(self._pq)
177            sha = commit.id
178            self._pq_set.remove(sha)
179            if sha in self._done:
180                continue
181            self._done.add(sha)
182
183            for parent_id in self._get_parents(commit):
184                self._push(parent_id)
185
186            reset_extra_commits = True
187            is_excluded = sha in self._excluded
188            if is_excluded:
189                self._exclude_parents(commit)
190                if self._pq and all(c.id in self._excluded
191                                    for _, c in self._pq):
192                    _, n = self._pq[0]
193                    if self._last and n.commit_time >= self._last.commit_time:
194                        # If the next commit is newer than the last one, we
195                        # need to keep walking in case its parents (which we
196                        # may not have seen yet) are excluded. This gives the
197                        # excluded set a chance to "catch up" while the commit
198                        # is still in the Walker's output queue.
199                        reset_extra_commits = True
200                    else:
201                        reset_extra_commits = False
202
203            if (self._min_time is not None and
204                    commit.commit_time < self._min_time):
205                # We want to stop walking at min_time, but commits at the
206                # boundary may be out of order with respect to their parents.
207                # So we walk _MAX_EXTRA_COMMITS more commits once we hit this
208                # boundary.
209                reset_extra_commits = False
210
211            if reset_extra_commits:
212                # We're not at a boundary, so reset the counter.
213                self._extra_commits_left = _MAX_EXTRA_COMMITS
214            else:
215                self._extra_commits_left -= 1
216                if not self._extra_commits_left:
217                    break
218
219            if not is_excluded:
220                self._last = commit
221                return WalkEntry(self._walker, commit)
222        self._is_finished = True
223        return None
224
225    __next__ = next
226
227
228class Walker(object):
229    """Object for performing a walk of commits in a store.
230
231    Walker objects are initialized with a store and other options and can then
232    be treated as iterators of Commit objects.
233    """
234
235    def __init__(self, store, include, exclude=None, order=ORDER_DATE,
236                 reverse=False, max_entries=None, paths=None,
237                 rename_detector=None, follow=False, since=None, until=None,
238                 get_parents=lambda commit: commit.parents,
239                 queue_cls=_CommitTimeQueue):
240        """Constructor.
241
242        Args:
243          store: ObjectStore instance for looking up objects.
244          include: Iterable of SHAs of commits to include along with their
245            ancestors.
246          exclude: Iterable of SHAs of commits to exclude along with their
247            ancestors, overriding includes.
248          order: ORDER_* constant specifying the order of results.
249            Anything other than ORDER_DATE may result in O(n) memory usage.
250          reverse: If True, reverse the order of output, requiring O(n)
251            memory.
252          max_entries: The maximum number of entries to yield, or None for
253            no limit.
254          paths: Iterable of file or subtree paths to show entries for.
255          rename_detector: diff.RenameDetector object for detecting
256            renames.
257          follow: If True, follow path across renames/copies. Forces a
258            default rename_detector.
259          since: Timestamp to list commits after.
260          until: Timestamp to list commits before.
261          get_parents: Method to retrieve the parents of a commit
262          queue_cls: A class to use for a queue of commits, supporting the
263            iterator protocol. The constructor takes a single argument, the
264            Walker.
265        """
266        # Note: when adding arguments to this method, please also update
267        # dulwich.repo.BaseRepo.get_walker
268        if order not in ALL_ORDERS:
269            raise ValueError('Unknown walk order %s' % order)
270        self.store = store
271        if isinstance(include, bytes):
272            # TODO(jelmer): Really, this should require a single type.
273            # Print deprecation warning here?
274            include = [include]
275        self.include = include
276        self.excluded = set(exclude or [])
277        self.order = order
278        self.reverse = reverse
279        self.max_entries = max_entries
280        self.paths = paths and set(paths) or None
281        if follow and not rename_detector:
282            rename_detector = RenameDetector(store)
283        self.rename_detector = rename_detector
284        self.get_parents = get_parents
285        self.follow = follow
286        self.since = since
287        self.until = until
288
289        self._num_entries = 0
290        self._queue = queue_cls(self)
291        self._out_queue = collections.deque()
292
293    def _path_matches(self, changed_path):
294        if changed_path is None:
295            return False
296        for followed_path in self.paths:
297            if changed_path == followed_path:
298                return True
299            if (changed_path.startswith(followed_path) and
300                    changed_path[len(followed_path)] == b'/'[0]):
301                return True
302        return False
303
304    def _change_matches(self, change):
305        if not change:
306            return False
307
308        old_path = change.old.path
309        new_path = change.new.path
310        if self._path_matches(new_path):
311            if self.follow and change.type in RENAME_CHANGE_TYPES:
312                self.paths.add(old_path)
313                self.paths.remove(new_path)
314            return True
315        elif self._path_matches(old_path):
316            return True
317        return False
318
319    def _should_return(self, entry):
320        """Determine if a walk entry should be returned..
321
322        Args:
323          entry: The WalkEntry to consider.
324        Returns: True if the WalkEntry should be returned by this walk, or
325            False otherwise (e.g. if it doesn't match any requested paths).
326        """
327        commit = entry.commit
328        if self.since is not None and commit.commit_time < self.since:
329            return False
330        if self.until is not None and commit.commit_time > self.until:
331            return False
332        if commit.id in self.excluded:
333            return False
334
335        if self.paths is None:
336            return True
337
338        if len(self.get_parents(commit)) > 1:
339            for path_changes in entry.changes():
340                # For merge commits, only include changes with conflicts for
341                # this path. Since a rename conflict may include different
342                # old.paths, we have to check all of them.
343                for change in path_changes:
344                    if self._change_matches(change):
345                        return True
346        else:
347            for change in entry.changes():
348                if self._change_matches(change):
349                    return True
350        return None
351
352    def _next(self):
353        max_entries = self.max_entries
354        while max_entries is None or self._num_entries < max_entries:
355            entry = next(self._queue)
356            if entry is not None:
357                self._out_queue.append(entry)
358            if entry is None or len(self._out_queue) > _MAX_EXTRA_COMMITS:
359                if not self._out_queue:
360                    return None
361                entry = self._out_queue.popleft()
362                if self._should_return(entry):
363                    self._num_entries += 1
364                    return entry
365        return None
366
367    def _reorder(self, results):
368        """Possibly reorder a results iterator.
369
370        Args:
371          results: An iterator of WalkEntry objects, in the order returned
372            from the queue_cls.
373        Returns: An iterator or list of WalkEntry objects, in the order
374            required by the Walker.
375        """
376        if self.order == ORDER_TOPO:
377            results = _topo_reorder(results, self.get_parents)
378        if self.reverse:
379            results = reversed(list(results))
380        return results
381
382    def __iter__(self):
383        return iter(self._reorder(iter(self._next, None)))
384
385
386def _topo_reorder(entries, get_parents=lambda commit: commit.parents):
387    """Reorder an iterable of entries topologically.
388
389    This works best assuming the entries are already in almost-topological
390    order, e.g. in commit time order.
391
392    Args:
393      entries: An iterable of WalkEntry objects.
394      get_parents: Optional function for getting the parents of a commit.
395    Returns: iterator over WalkEntry objects from entries in FIFO order, except
396        where a parent would be yielded before any of its children.
397    """
398    todo = collections.deque()
399    pending = {}
400    num_children = collections.defaultdict(int)
401    for entry in entries:
402        todo.append(entry)
403        for p in get_parents(entry.commit):
404            num_children[p] += 1
405
406    while todo:
407        entry = todo.popleft()
408        commit = entry.commit
409        commit_id = commit.id
410        if num_children[commit_id]:
411            pending[commit_id] = entry
412            continue
413        for parent_id in get_parents(commit):
414            num_children[parent_id] -= 1
415            if not num_children[parent_id]:
416                parent_entry = pending.pop(parent_id, None)
417                if parent_entry:
418                    todo.appendleft(parent_entry)
419        yield entry
420