1# walk.py -- General implementation of walking commits and their contents. 2# Copyright (C) 2010 Google, Inc. 3# 4# Dulwich is dual-licensed under the Apache License, Version 2.0 and the GNU 5# General Public License as public by the Free Software Foundation; version 2.0 6# or (at your option) any later version. You can redistribute it and/or 7# modify it under the terms of either of these two licenses. 8# 9# Unless required by applicable law or agreed to in writing, software 10# distributed under the License is distributed on an "AS IS" BASIS, 11# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 12# See the License for the specific language governing permissions and 13# limitations under the License. 14# 15# You should have received a copy of the licenses; if not, see 16# <http://www.gnu.org/licenses/> for a copy of the GNU General Public License 17# and <http://www.apache.org/licenses/LICENSE-2.0> for a copy of the Apache 18# License, Version 2.0. 19# 20 21"""General implementation of walking commits and their contents.""" 22 23 24import collections 25import heapq 26from itertools import chain 27 28from dulwich.diff_tree import ( 29 RENAME_CHANGE_TYPES, 30 tree_changes, 31 tree_changes_for_merge, 32 RenameDetector, 33 ) 34from dulwich.errors import ( 35 MissingCommitError, 36 ) 37from dulwich.objects import ( 38 Tag, 39 ) 40 41ORDER_DATE = 'date' 42ORDER_TOPO = 'topo' 43 44ALL_ORDERS = (ORDER_DATE, ORDER_TOPO) 45 46# Maximum number of commits to walk past a commit time boundary. 47_MAX_EXTRA_COMMITS = 5 48 49 50class WalkEntry(object): 51 """Object encapsulating a single result from a walk.""" 52 53 def __init__(self, walker, commit): 54 self.commit = commit 55 self._store = walker.store 56 self._get_parents = walker.get_parents 57 self._changes = {} 58 self._rename_detector = walker.rename_detector 59 60 def changes(self, path_prefix=None): 61 """Get the tree changes for this entry. 62 63 Args: 64 path_prefix: Portion of the path in the repository to 65 use to filter changes. Must be a directory name. Must be 66 a full, valid, path reference (no partial names or wildcards). 67 Returns: For commits with up to one parent, a list of TreeChange 68 objects; if the commit has no parents, these will be relative to 69 the empty tree. For merge commits, a list of lists of TreeChange 70 objects; see dulwich.diff.tree_changes_for_merge. 71 """ 72 cached = self._changes.get(path_prefix) 73 if cached is None: 74 commit = self.commit 75 if not self._get_parents(commit): 76 changes_func = tree_changes 77 parent = None 78 elif len(self._get_parents(commit)) == 1: 79 changes_func = tree_changes 80 parent = self._store[self._get_parents(commit)[0]].tree 81 if path_prefix: 82 mode, subtree_sha = parent.lookup_path( 83 self._store.__getitem__, 84 path_prefix, 85 ) 86 parent = self._store[subtree_sha] 87 else: 88 changes_func = tree_changes_for_merge 89 parent = [ 90 self._store[p].tree for p in self._get_parents(commit)] 91 if path_prefix: 92 parent_trees = [self._store[p] for p in parent] 93 parent = [] 94 for p in parent_trees: 95 try: 96 mode, st = p.lookup_path( 97 self._store.__getitem__, 98 path_prefix, 99 ) 100 except KeyError: 101 pass 102 else: 103 parent.append(st) 104 commit_tree_sha = commit.tree 105 if path_prefix: 106 commit_tree = self._store[commit_tree_sha] 107 mode, commit_tree_sha = commit_tree.lookup_path( 108 self._store.__getitem__, 109 path_prefix, 110 ) 111 cached = list(changes_func( 112 self._store, parent, commit_tree_sha, 113 rename_detector=self._rename_detector)) 114 self._changes[path_prefix] = cached 115 return self._changes[path_prefix] 116 117 def __repr__(self): 118 return '<WalkEntry commit=%s, changes=%r>' % ( 119 self.commit.id, self.changes()) 120 121 122class _CommitTimeQueue(object): 123 """Priority queue of WalkEntry objects by commit time.""" 124 125 def __init__(self, walker): 126 self._walker = walker 127 self._store = walker.store 128 self._get_parents = walker.get_parents 129 self._excluded = walker.excluded 130 self._pq = [] 131 self._pq_set = set() 132 self._seen = set() 133 self._done = set() 134 self._min_time = walker.since 135 self._last = None 136 self._extra_commits_left = _MAX_EXTRA_COMMITS 137 self._is_finished = False 138 139 for commit_id in chain(walker.include, walker.excluded): 140 self._push(commit_id) 141 142 def _push(self, object_id): 143 try: 144 obj = self._store[object_id] 145 except KeyError: 146 raise MissingCommitError(object_id) 147 if isinstance(obj, Tag): 148 self._push(obj.object[1]) 149 return 150 # TODO(jelmer): What to do about non-Commit and non-Tag objects? 151 commit = obj 152 if commit.id not in self._pq_set and commit.id not in self._done: 153 heapq.heappush(self._pq, (-commit.commit_time, commit)) 154 self._pq_set.add(commit.id) 155 self._seen.add(commit.id) 156 157 def _exclude_parents(self, commit): 158 excluded = self._excluded 159 seen = self._seen 160 todo = [commit] 161 while todo: 162 commit = todo.pop() 163 for parent in self._get_parents(commit): 164 if parent not in excluded and parent in seen: 165 # TODO: This is inefficient unless the object store does 166 # some caching (which DiskObjectStore currently does not). 167 # We could either add caching in this class or pass around 168 # parsed queue entry objects instead of commits. 169 todo.append(self._store[parent]) 170 excluded.add(parent) 171 172 def next(self): 173 if self._is_finished: 174 return None 175 while self._pq: 176 _, commit = heapq.heappop(self._pq) 177 sha = commit.id 178 self._pq_set.remove(sha) 179 if sha in self._done: 180 continue 181 self._done.add(sha) 182 183 for parent_id in self._get_parents(commit): 184 self._push(parent_id) 185 186 reset_extra_commits = True 187 is_excluded = sha in self._excluded 188 if is_excluded: 189 self._exclude_parents(commit) 190 if self._pq and all(c.id in self._excluded 191 for _, c in self._pq): 192 _, n = self._pq[0] 193 if self._last and n.commit_time >= self._last.commit_time: 194 # If the next commit is newer than the last one, we 195 # need to keep walking in case its parents (which we 196 # may not have seen yet) are excluded. This gives the 197 # excluded set a chance to "catch up" while the commit 198 # is still in the Walker's output queue. 199 reset_extra_commits = True 200 else: 201 reset_extra_commits = False 202 203 if (self._min_time is not None and 204 commit.commit_time < self._min_time): 205 # We want to stop walking at min_time, but commits at the 206 # boundary may be out of order with respect to their parents. 207 # So we walk _MAX_EXTRA_COMMITS more commits once we hit this 208 # boundary. 209 reset_extra_commits = False 210 211 if reset_extra_commits: 212 # We're not at a boundary, so reset the counter. 213 self._extra_commits_left = _MAX_EXTRA_COMMITS 214 else: 215 self._extra_commits_left -= 1 216 if not self._extra_commits_left: 217 break 218 219 if not is_excluded: 220 self._last = commit 221 return WalkEntry(self._walker, commit) 222 self._is_finished = True 223 return None 224 225 __next__ = next 226 227 228class Walker(object): 229 """Object for performing a walk of commits in a store. 230 231 Walker objects are initialized with a store and other options and can then 232 be treated as iterators of Commit objects. 233 """ 234 235 def __init__(self, store, include, exclude=None, order=ORDER_DATE, 236 reverse=False, max_entries=None, paths=None, 237 rename_detector=None, follow=False, since=None, until=None, 238 get_parents=lambda commit: commit.parents, 239 queue_cls=_CommitTimeQueue): 240 """Constructor. 241 242 Args: 243 store: ObjectStore instance for looking up objects. 244 include: Iterable of SHAs of commits to include along with their 245 ancestors. 246 exclude: Iterable of SHAs of commits to exclude along with their 247 ancestors, overriding includes. 248 order: ORDER_* constant specifying the order of results. 249 Anything other than ORDER_DATE may result in O(n) memory usage. 250 reverse: If True, reverse the order of output, requiring O(n) 251 memory. 252 max_entries: The maximum number of entries to yield, or None for 253 no limit. 254 paths: Iterable of file or subtree paths to show entries for. 255 rename_detector: diff.RenameDetector object for detecting 256 renames. 257 follow: If True, follow path across renames/copies. Forces a 258 default rename_detector. 259 since: Timestamp to list commits after. 260 until: Timestamp to list commits before. 261 get_parents: Method to retrieve the parents of a commit 262 queue_cls: A class to use for a queue of commits, supporting the 263 iterator protocol. The constructor takes a single argument, the 264 Walker. 265 """ 266 # Note: when adding arguments to this method, please also update 267 # dulwich.repo.BaseRepo.get_walker 268 if order not in ALL_ORDERS: 269 raise ValueError('Unknown walk order %s' % order) 270 self.store = store 271 if isinstance(include, bytes): 272 # TODO(jelmer): Really, this should require a single type. 273 # Print deprecation warning here? 274 include = [include] 275 self.include = include 276 self.excluded = set(exclude or []) 277 self.order = order 278 self.reverse = reverse 279 self.max_entries = max_entries 280 self.paths = paths and set(paths) or None 281 if follow and not rename_detector: 282 rename_detector = RenameDetector(store) 283 self.rename_detector = rename_detector 284 self.get_parents = get_parents 285 self.follow = follow 286 self.since = since 287 self.until = until 288 289 self._num_entries = 0 290 self._queue = queue_cls(self) 291 self._out_queue = collections.deque() 292 293 def _path_matches(self, changed_path): 294 if changed_path is None: 295 return False 296 for followed_path in self.paths: 297 if changed_path == followed_path: 298 return True 299 if (changed_path.startswith(followed_path) and 300 changed_path[len(followed_path)] == b'/'[0]): 301 return True 302 return False 303 304 def _change_matches(self, change): 305 if not change: 306 return False 307 308 old_path = change.old.path 309 new_path = change.new.path 310 if self._path_matches(new_path): 311 if self.follow and change.type in RENAME_CHANGE_TYPES: 312 self.paths.add(old_path) 313 self.paths.remove(new_path) 314 return True 315 elif self._path_matches(old_path): 316 return True 317 return False 318 319 def _should_return(self, entry): 320 """Determine if a walk entry should be returned.. 321 322 Args: 323 entry: The WalkEntry to consider. 324 Returns: True if the WalkEntry should be returned by this walk, or 325 False otherwise (e.g. if it doesn't match any requested paths). 326 """ 327 commit = entry.commit 328 if self.since is not None and commit.commit_time < self.since: 329 return False 330 if self.until is not None and commit.commit_time > self.until: 331 return False 332 if commit.id in self.excluded: 333 return False 334 335 if self.paths is None: 336 return True 337 338 if len(self.get_parents(commit)) > 1: 339 for path_changes in entry.changes(): 340 # For merge commits, only include changes with conflicts for 341 # this path. Since a rename conflict may include different 342 # old.paths, we have to check all of them. 343 for change in path_changes: 344 if self._change_matches(change): 345 return True 346 else: 347 for change in entry.changes(): 348 if self._change_matches(change): 349 return True 350 return None 351 352 def _next(self): 353 max_entries = self.max_entries 354 while max_entries is None or self._num_entries < max_entries: 355 entry = next(self._queue) 356 if entry is not None: 357 self._out_queue.append(entry) 358 if entry is None or len(self._out_queue) > _MAX_EXTRA_COMMITS: 359 if not self._out_queue: 360 return None 361 entry = self._out_queue.popleft() 362 if self._should_return(entry): 363 self._num_entries += 1 364 return entry 365 return None 366 367 def _reorder(self, results): 368 """Possibly reorder a results iterator. 369 370 Args: 371 results: An iterator of WalkEntry objects, in the order returned 372 from the queue_cls. 373 Returns: An iterator or list of WalkEntry objects, in the order 374 required by the Walker. 375 """ 376 if self.order == ORDER_TOPO: 377 results = _topo_reorder(results, self.get_parents) 378 if self.reverse: 379 results = reversed(list(results)) 380 return results 381 382 def __iter__(self): 383 return iter(self._reorder(iter(self._next, None))) 384 385 386def _topo_reorder(entries, get_parents=lambda commit: commit.parents): 387 """Reorder an iterable of entries topologically. 388 389 This works best assuming the entries are already in almost-topological 390 order, e.g. in commit time order. 391 392 Args: 393 entries: An iterable of WalkEntry objects. 394 get_parents: Optional function for getting the parents of a commit. 395 Returns: iterator over WalkEntry objects from entries in FIFO order, except 396 where a parent would be yielded before any of its children. 397 """ 398 todo = collections.deque() 399 pending = {} 400 num_children = collections.defaultdict(int) 401 for entry in entries: 402 todo.append(entry) 403 for p in get_parents(entry.commit): 404 num_children[p] += 1 405 406 while todo: 407 entry = todo.popleft() 408 commit = entry.commit 409 commit_id = commit.id 410 if num_children[commit_id]: 411 pending[commit_id] = entry 412 continue 413 for parent_id in get_parents(commit): 414 num_children[parent_id] -= 1 415 if not num_children[parent_id]: 416 parent_entry = pending.pop(parent_id, None) 417 if parent_entry: 418 todo.appendleft(parent_entry) 419 yield entry 420