1# Copyright (C) 2006-2011 Canonical Ltd
2#
3# This program is free software; you can redistribute it and/or modify
4# it under the terms of the GNU General Public License as published by
5# the Free Software Foundation; either version 2 of the License, or
6# (at your option) any later version.
7#
8# This program is distributed in the hope that it will be useful,
9# but WITHOUT ANY WARRANTY; without even the implied warranty of
10# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
11# GNU General Public License for more details.
12#
13# You should have received a copy of the GNU General Public License
14# along with this program; if not, write to the Free Software
15# Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
16
17"""DirState objects record the state of a directory and its bzr metadata.
18
19Pseudo EBNF grammar for the state file. Fields are separated by NULLs, and
lines by NL. The field delimiters are omitted in the grammar, line delimiters
21are not - this is done for clarity of reading. All string data is in utf8.
22
23::
24
25    MINIKIND = "f" | "d" | "l" | "a" | "r" | "t";
26    NL = "\\n";
27    NULL = "\\0";
28    WHOLE_NUMBER = {digit}, digit;
29    BOOLEAN = "y" | "n";
30    REVISION_ID = a non-empty utf8 string;
31
32    dirstate format = header line, full checksum, row count, parent details,
33     ghost_details, entries;
34    header line = "#bazaar dirstate flat format 3", NL;
35    full checksum = "crc32: ", ["-"], WHOLE_NUMBER, NL;
36    row count = "num_entries: ", WHOLE_NUMBER, NL;
37    parent_details = WHOLE NUMBER, {REVISION_ID}* NL;
38    ghost_details = WHOLE NUMBER, {REVISION_ID}*, NL;
39    entries = {entry};
40    entry = entry_key, current_entry_details, {parent_entry_details};
41    entry_key = dirname,  basename, fileid;
42    current_entry_details = common_entry_details, working_entry_details;
43    parent_entry_details = common_entry_details, history_entry_details;
44    common_entry_details = MINIKIND, fingerprint, size, executable
45    working_entry_details = packed_stat
46    history_entry_details = REVISION_ID;
47    executable = BOOLEAN;
48    size = WHOLE_NUMBER;
49    fingerprint = a nonempty utf8 sequence with meaning defined by minikind.
50
51Given this definition, the following is useful to know::
52
53    entry (aka row) - all the data for a given key.
54    entry[0]: The key (dirname, basename, fileid)
55    entry[0][0]: dirname
56    entry[0][1]: basename
57    entry[0][2]: fileid
58    entry[1]: The tree(s) data for this path and id combination.
59    entry[1][0]: The current tree
60    entry[1][1]: The second tree
61
62For an entry for a tree, we have (using tree 0 - current tree) to demonstrate::
63
64    entry[1][0][0]: minikind
65    entry[1][0][1]: fingerprint
66    entry[1][0][2]: size
67    entry[1][0][3]: executable
68    entry[1][0][4]: packed_stat
69
70OR (for non tree-0)::
71
72    entry[1][1][4]: revision_id
73
74There may be multiple rows at the root, one per id present in the root, so the
75in memory root row is now::
76
77    self._dirblocks[0] -> ('', [entry ...]),
78
79and the entries in there are::
80
81    entries[0][0]: b''
82    entries[0][1]: b''
83    entries[0][2]: file_id
84    entries[1][0]: The tree data for the current tree for this fileid at /
85    etc.
86
87Kinds::
88
89   b'r' is a relocated entry: This path is not present in this tree with this
90        id, but the id can be found at another location. The fingerprint is
91        used to point to the target location.
92   b'a' is an absent entry: In that tree the id is not present at this path.
93   b'd' is a directory entry: This path in this tree is a directory with the
94        current file id. There is no fingerprint for directories.
95   b'f' is a file entry: As for directory, but it's a file. The fingerprint is
96        the sha1 value of the file's canonical form, i.e. after any read
97        filters have been applied to the convenience form stored in the working
98        tree.
99   b'l' is a symlink entry: As for directory, but a symlink. The fingerprint is
100        the link target.
101   b't' is a reference to a nested subtree; the fingerprint is the referenced
102        revision.
103
104Ordering:
105
106The entries on disk and in memory are ordered according to the following keys::
107
108    directory, as a list of components
109    filename
110    file-id
111
112--- Format 1 had the following different definition: ---
113
114::
115
116    rows = dirname, NULL, basename, NULL, MINIKIND, NULL, fileid_utf8, NULL,
117        WHOLE NUMBER (* size *), NULL, packed stat, NULL, sha1|symlink target,
118        {PARENT ROW}
119    PARENT ROW = NULL, revision_utf8, NULL, MINIKIND, NULL, dirname, NULL,
120        basename, NULL, WHOLE NUMBER (* size *), NULL, "y" | "n", NULL,
121        SHA1
122
123PARENT ROW's are emitted for every parent that is not in the ghosts details
124line. That is, if the parents are foo, bar, baz, and the ghosts are bar, then
125each row will have a PARENT ROW for foo and baz, but not for bar.
126
127
128In any tree, a kind of 'moved' indicates that the fingerprint field
129(which we treat as opaque data specific to the 'kind' anyway) has the
130details for the id of this row in that tree.
131
132I'm strongly tempted to add a id->path index as well, but I think that
133where we need id->path mapping; we also usually read the whole file, so
134I'm going to skip that for the moment, as we have the ability to locate
135via bisect any path in any tree, and if we lookup things by path, we can
136accumulate an id->path mapping as we go, which will tend to match what we
137looked for.
138
139I plan to implement this asap, so please speak up now to alter/tweak the
140design - and once we stabilise on this, I'll update the wiki page for
141it.
142
143The rationale for all this is that we want fast operations for the
144common case (diff/status/commit/merge on all files) and extremely fast
operations for the less common but still frequent case (status/diff/commit
146on specific files). Operations on specific files involve a scan for all
147the children of a path, *in every involved tree*, which the current
148format did not accommodate.
149----
150
151Design priorities:
 1. Fast end to end use for bzr's top 5 use cases. (commit/diff/status/merge/???)
153 2. fall back current object model as needed.
154 3. scale usably to the largest trees known today - say 50K entries. (mozilla
155    is an example of this)
156
157
158Locking:
159
160 Eventually reuse dirstate objects across locks IFF the dirstate file has not
161 been modified, but will require that we flush/ignore cached stat-hit data
162 because we won't want to restat all files on disk just because a lock was
163 acquired, yet we cannot trust the data after the previous lock was released.
164
165Memory representation::
166
 vector of all directories, and vector of the children ?
168   i.e.
169     root_entries = (direntry for root, [parent_direntries_for_root]),
170     dirblocks = [
171     ('', ['data for achild', 'data for bchild', 'data for cchild'])
172     ('dir', ['achild', 'cchild', 'echild'])
173     ]
174    - single bisect to find N subtrees from a path spec
175    - in-order for serialisation - this is 'dirblock' grouping.
176    - insertion of a file '/a' affects only the '/' child-vector, that is, to
      insert 10K elements from scratch does not generate O(N^2) memoves of a
178      single vector, rather each individual, which tends to be limited to a
179      manageable number. Will scale badly on trees with 10K entries in a
180      single directory. compare with Inventory.InventoryDirectory which has
181      a dictionary for the children. No bisect capability, can only probe for
182      exact matches, or grab all elements and sort.
183    - What's the risk of error here? Once we have the base format being processed
184      we should have a net win regardless of optimality. So we are going to
185      go with what seems reasonable.
186
187open questions:
188
189Maybe we should do a test profile of the core structure - 10K simulated
190searches/lookups/etc?
191
192Objects for each row?
193The lifetime of Dirstate objects is current per lock, but see above for
194possible extensions. The lifetime of a row from a dirstate is expected to be
195very short in the optimistic case: which we are optimising for. For instance,
196subtree status will determine from analysis of the disk data what rows need to
197be examined at all, and will be able to determine from a single row whether
198that file has altered or not, so we are aiming to process tens of thousands of
199entries each second within the dirstate context, before exposing anything to
200the larger codebase. This suggests we want the time for a single file
201comparison to be < 0.1 milliseconds. That would give us 10000 paths per second
processed, and to scale to 100 thousand we'll need another order of magnitude to do
203that. Now, as the lifetime for all unchanged entries is the time to parse, stat
204the file on disk, and then immediately discard, the overhead of object creation
205becomes a significant cost.
206
207Figures: Creating a tuple from 3 elements was profiled at 0.0625
208microseconds, whereas creating a object which is subclassed from tuple was
2090.500 microseconds, and creating an object with 3 elements and slots was 3
210microseconds long. 0.1 milliseconds is 100 microseconds, and ideally we'll get
211down to 10 microseconds for the total processing - having 33% of that be object
212creation is a huge overhead. There is a potential cost in using tuples within
213each row which is that the conditional code to do comparisons may be slower
214than method invocation, but method invocation is known to be slow due to stack
frame creation, so avoiding methods in these tight inner loops is unfortunately
216desirable. We can consider a pyrex version of this with objects in future if
217desired.
218
219"""
220
221import bisect
222import contextlib
223import errno
224import operator
225import os
226from stat import S_IEXEC
227import stat
228import sys
229import time
230import zlib
231
232from . import (
233    inventory,
234    )
235from .. import (
236    cache_utf8,
237    config,
238    debug,
239    errors,
240    lock,
241    osutils,
242    static_tuple,
243    trace,
244    urlutils,
245    )
246from .inventorytree import InventoryTreeChange
247
248
# Windows-specific error codes (the rough equivalent of ENOTDIR on POSIX).
# They are defined in pywin32.winerror, but we don't want a strong dependency
# on pywin32 for just a pair of error codes.
ERROR_PATH_NOT_FOUND = 3
ERROR_DIRECTORY = 267
254
255
class DirstateCorrupt(errors.BzrError):
    """Raised when the on-disk dirstate file cannot be interpreted."""

    _fmt = "The dirstate file (%(state)s) appears to be corrupt: %(msg)s"

    def __init__(self, state, msg):
        """Record the corrupt state object/path and a description.

        :param state: the dirstate (or its path) that appears corrupt.
        :param msg: human-readable description of the detected problem.
        """
        super(DirstateCorrupt, self).__init__()
        self.state = state
        self.msg = msg
264
265
class SHA1Provider(object):
    """Interface for objects that can supply the sha1 of an on-disk file."""

    def sha1(self, abspath):
        """Return the sha1 digest of the file at ``abspath``.

        :param abspath: a filesystem-encoded or unicode absolute path.
        """
        raise NotImplementedError(self.sha1)

    def stat_and_sha1(self, abspath):
        """Return a (stat, sha1) pair for the file at ``abspath``.

        :param abspath: a filesystem-encoded or unicode absolute path.

        Note: the stat value describes the physical file, while the sha1
        may be computed from its canonical (filtered) content.
        """
        raise NotImplementedError(self.stat_and_sha1)
287
288
class DefaultSHA1Provider(SHA1Provider):
    """SHA1Provider that hashes file content straight off the filesystem."""

    def sha1(self, abspath):
        """Return the sha1 digest of the file at ``abspath``."""
        return osutils.sha_file_by_name(abspath)

    def stat_and_sha1(self, abspath):
        """Return a (stat, sha1) pair for the file at ``abspath``."""
        with open(abspath, 'rb') as f:
            # stat the open file descriptor so stat and content refer to
            # the same file even if the path is replaced concurrently.
            stat_value = os.fstat(f.fileno())
            digest = osutils.sha_file(f)
        return stat_value, digest
302
303
class DirState(object):
    """Record directory and metadata state for fast access.

    A dirstate is a specialised data structure for managing local working
    tree state information. It's not yet well defined whether it is platform
    specific, and if it is how we detect/parameterize that.

    Dirstates use the usual lock_write, lock_read and unlock mechanisms.
    Unlike most bzr disk formats, DirStates must be locked for reading, using
    lock_read.  (This is an os file lock internally.)  This is necessary
    because the file can be rewritten in place.

    DirStates must be explicitly written with save() to commit changes; just
    unlocking them does not write the changes to disk.
    """

    # Mapping from the 'kind' strings used by the wider codebase to the
    # single-byte minikind codes stored on disk (see the module docstring).
    _kind_to_minikind = {
        'absent': b'a',
        'file': b'f',
        'directory': b'd',
        'relocated': b'r',
        'symlink': b'l',
        'tree-reference': b't',
        }
    # Inverse of _kind_to_minikind: minikind byte -> kind string.
    _minikind_to_kind = {
        b'a': 'absent',
        b'f': 'file',
        b'd': 'directory',
        b'l': 'symlink',
        b'r': 'relocated',
        b't': 'tree-reference',
        }
    # Mapping from stat file-type bits to the corresponding minikind byte.
    _stat_to_minikind = {
        stat.S_IFDIR: b'd',
        stat.S_IFREG: b'f',
        stat.S_IFLNK: b'l',
    }
    _to_yesno = {True: b'y', False: b'n'}  # TODO profile the performance gain
    # of using int conversion rather than a dict here. AND BLAME ANDREW IF
    # it is faster.

    # TODO: jam 20070221 Figure out what to do if we have a record that exceeds
    #       the BISECT_PAGE_SIZE. For now, we just have to make it large enough
    #       that we are sure a single record will always fit.
    BISECT_PAGE_SIZE = 4096

    # Possible values of _header_state and _dirblock_state (see __init__).
    NOT_IN_MEMORY = 0
    IN_MEMORY_UNMODIFIED = 1
    IN_MEMORY_MODIFIED = 2
    IN_MEMORY_HASH_MODIFIED = 3  # Only hash-cache updates

    # A pack_stat (the x's) that is just noise and will never match the output
    # of base64 encode.
    NULLSTAT = b'x' * 32
    # The tree details recorded for a parent tree where an entry is absent.
    NULL_PARENT_DETAILS = static_tuple.StaticTuple(b'a', b'', 0, False, b'')

    # On-disk header lines identifying the dirstate serialisation format.
    HEADER_FORMAT_2 = b'#bazaar dirstate flat format 2\n'
    HEADER_FORMAT_3 = b'#bazaar dirstate flat format 3\n'
362
    def __init__(self, path, sha1_provider, worth_saving_limit=0,
                 use_filesystem_for_exec=True):
        """Create a DirState object.

        :param path: The path at which the dirstate file on disk should live.
        :param sha1_provider: an object meeting the SHA1Provider interface.
        :param worth_saving_limit: when the exact number of hash changed
            entries is known, only bother saving the dirstate if more than
            this count of entries have changed.
            -1 means never save hash changes, 0 means always save hash changes.
        :param use_filesystem_for_exec: Whether to trust the filesystem
            for executable bit information
        """
        # _header_state and _dirblock_state represent the current state
        # of the dirstate metadata and the per-row data respectively.
        # NOT_IN_MEMORY indicates that no data is in memory
        # IN_MEMORY_UNMODIFIED indicates that what we have in memory
        #   is the same as is on disk
        # IN_MEMORY_MODIFIED indicates that we have a modified version
        #   of what is on disk.
        # In future we will add more granularity, for instance _dirblock_state
        # will probably support partially-in-memory as a separate variable,
        # allowing for partially-in-memory unmodified and partially-in-memory
        # modified states.
        self._header_state = DirState.NOT_IN_MEMORY
        self._dirblock_state = DirState.NOT_IN_MEMORY
        # If true, an error has been detected while updating the dirstate, and
        # for safety we're not going to commit to disk.
        self._changes_aborted = False
        # In-memory entries grouped per directory (see the module docstring).
        self._dirblocks = []
        # Parent revisions recorded but not present locally (ghosts).
        self._ghosts = []
        # Revision ids of the parent trees covered by this dirstate.
        self._parents = []
        self._state_file = None
        self._filename = path
        self._lock_token = None
        self._lock_state = None
        # Lazily-built map from file_id to entry keys; see _add_to_id_index.
        self._id_index = None
        # a map from packed_stat to sha's.
        self._packed_stat_index = None
        # Byte offset in the state file where the entry data starts.
        self._end_of_header = None
        self._cutoff_time = None
        self._split_path_cache = {}
        self._bisect_page_size = DirState.BISECT_PAGE_SIZE
        self._sha1_provider = sha1_provider
        # With -Dhashcache, route sha1 computation through a logging wrapper.
        if 'hashcache' in debug.debug_flags:
            self._sha1_file = self._sha1_file_and_mutter
        else:
            self._sha1_file = self._sha1_provider.sha1
        # These two attributes provide a simple cache for lookups into the
        # dirstate in-memory vectors. By probing respectively for the last
        # block, and for the next entry, we save nearly 2 bisections per path
        # during commit.
        self._last_block_index = None
        self._last_entry_index = None
        # The set of known hash changes
        self._known_hash_changes = set()
        # How many hash changed entries can we have without saving
        self._worth_saving_limit = worth_saving_limit
        self._config_stack = config.LocationStack(urlutils.local_path_to_url(
            path))
        self._use_filesystem_for_exec = use_filesystem_for_exec
424
425    def __repr__(self):
426        return "%s(%r)" % \
427            (self.__class__.__name__, self._filename)
428
429    def _mark_modified(self, hash_changed_entries=None, header_modified=False):
430        """Mark this dirstate as modified.
431
432        :param hash_changed_entries: if non-None, mark just these entries as
433            having their hash modified.
434        :param header_modified: mark the header modified as well, not just the
435            dirblocks.
436        """
437        #trace.mutter_callsite(3, "modified hash entries: %s", hash_changed_entries)
438        if hash_changed_entries:
439            self._known_hash_changes.update(
440                [e[0] for e in hash_changed_entries])
441            if self._dirblock_state in (DirState.NOT_IN_MEMORY,
442                                        DirState.IN_MEMORY_UNMODIFIED):
443                # If the dirstate is already marked a IN_MEMORY_MODIFIED, then
444                # that takes precedence.
445                self._dirblock_state = DirState.IN_MEMORY_HASH_MODIFIED
446        else:
447            # TODO: Since we now have a IN_MEMORY_HASH_MODIFIED state, we
448            #       should fail noisily if someone tries to set
449            #       IN_MEMORY_MODIFIED but we don't have a write-lock!
450            # We don't know exactly what changed so disable smart saving
451            self._dirblock_state = DirState.IN_MEMORY_MODIFIED
452        if header_modified:
453            self._header_state = DirState.IN_MEMORY_MODIFIED
454
455    def _mark_unmodified(self):
456        """Mark this dirstate as unmodified."""
457        self._header_state = DirState.IN_MEMORY_UNMODIFIED
458        self._dirblock_state = DirState.IN_MEMORY_UNMODIFIED
459        self._known_hash_changes = set()
460
    def add(self, path, file_id, kind, stat, fingerprint):
        """Add a path to be tracked.

        :param path: The path within the dirstate - b'' is the root, 'foo' is the
            path foo within the root, 'foo/bar' is the path bar within foo
            within the root.
        :param file_id: The file id of the path being added.
        :param kind: The kind of the path, as a string like 'file',
            'directory', etc.
        :param stat: The output of os.lstat for the path.
        :param fingerprint: The sha value of the file's canonical form (i.e.
            after any read filters have been applied),
            or the target of a symlink,
            or the referenced revision id for tree-references,
            or b'' for directories.
        """
        # adding a file:
        # find the block its in.
        # find the location in the block.
        # check its not there
        # add it.
        # ------- copied from inventory.ensure_normalized_name - keep synced.
        # --- normalized_filename wants a unicode basename only, so get one.
        dirname, basename = osutils.split(path)
        # we don't import normalized_filename directly because we want to be
        # able to change the implementation at runtime for tests.
        norm_name, can_access = osutils.normalized_filename(basename)
        if norm_name != basename:
            if can_access:
                basename = norm_name
            else:
                raise errors.InvalidNormalization(path)
        # you should never have files called . or ..; just add the directory
        # in the parent, or according to the special treatment for the root
        if basename == '.' or basename == '..':
            raise inventory.InvalidEntryName(path)
        # now that we've normalised, we need the correct utf8 path and
        # dirname and basename elements. This single encode and split should be
        # faster than three separate encodes.
        utf8path = (dirname + '/' + basename).strip('/').encode('utf8')
        dirname, basename = osutils.split(utf8path)
        # uses __class__ for speed; the check is needed for safety
        if file_id.__class__ is not bytes:
            raise AssertionError(
                "must be a utf8 file_id not %s" % (type(file_id), ))
        # Make sure the file_id does not exist in this tree
        rename_from = None
        file_id_entry = self._get_entry(
            0, fileid_utf8=file_id, include_deleted=True)
        if file_id_entry != (None, None):
            if file_id_entry[1][0][0] == b'a':
                # The id is currently absent at its old location, so treat
                # this add as the second half of a rename: point the old
                # entry at the new path via a relocation record.
                if file_id_entry[0] != (dirname, basename, file_id):
                    # set the old name's current operation to rename
                    self.update_minimal(file_id_entry[0],
                                        b'r',
                                        path_utf8=b'',
                                        packed_stat=b'',
                                        fingerprint=utf8path
                                        )
                    rename_from = file_id_entry[0][0:2]
            else:
                # The id is already live at a different path in this tree:
                # adding it again is an error.
                path = osutils.pathjoin(
                    file_id_entry[0][0], file_id_entry[0][1])
                kind = DirState._minikind_to_kind[file_id_entry[1][0][0]]
                info = '%s:%s' % (kind, path)
                raise inventory.DuplicateFileId(file_id, info)
        first_key = (dirname, basename, b'')
        block_index, present = self._find_block_index_from_key(first_key)
        if present:
            # check the path is not in the tree
            block = self._dirblocks[block_index][1]
            entry_index, _ = self._find_entry_index(first_key, block)
            while (entry_index < len(block) and
                   block[entry_index][0][0:2] == first_key[0:2]):
                if block[entry_index][1][0][0] not in (b'a', b'r'):
                    # this path is in the dirstate in the current tree.
                    raise Exception("adding already added path!")
                entry_index += 1
        else:
            # The block where we want to put the file is not present. But it
            # might be because the directory was empty, or not loaded yet. Look
            # for a parent entry, if not found, raise NotVersionedError
            parent_dir, parent_base = osutils.split(dirname)
            parent_block_idx, parent_entry_idx, _, parent_present = \
                self._get_block_entry_index(parent_dir, parent_base, 0)
            if not parent_present:
                raise errors.NotVersionedError(path, str(self))
            self._ensure_block(parent_block_idx, parent_entry_idx, dirname)
        block = self._dirblocks[block_index][1]
        entry_key = (dirname, basename, file_id)
        if stat is None:
            # No stat supplied: record NULLSTAT, which can never match a
            # real packed stat, so the file will be re-examined later.
            size = 0
            packed_stat = DirState.NULLSTAT
        else:
            size = stat.st_size
            packed_stat = pack_stat(stat)
        parent_info = self._empty_parent_info()
        minikind = DirState._kind_to_minikind[kind]
        if rename_from is not None:
            # Record, in the first parent tree, the path the id came from.
            if rename_from[0]:
                old_path_utf8 = b'%s/%s' % rename_from
            else:
                old_path_utf8 = rename_from[1]
            parent_info[0] = (b'r', old_path_utf8, 0, False, b'')
        # Build the new entry's current-tree details per kind; directories
        # carry no fingerprint and tree-references no size.
        if kind == 'file':
            entry_data = entry_key, [
                (minikind, fingerprint, size, False, packed_stat),
                ] + parent_info
        elif kind == 'directory':
            entry_data = entry_key, [
                (minikind, b'', 0, False, packed_stat),
                ] + parent_info
        elif kind == 'symlink':
            entry_data = entry_key, [
                (minikind, fingerprint, size, False, packed_stat),
                ] + parent_info
        elif kind == 'tree-reference':
            entry_data = entry_key, [
                (minikind, fingerprint, 0, False, packed_stat),
                ] + parent_info
        else:
            raise errors.BzrError('unknown kind %r' % kind)
        entry_index, present = self._find_entry_index(entry_key, block)
        if not present:
            block.insert(entry_index, entry_data)
        else:
            # An entry with this key already exists; it must be marked
            # absent, otherwise this add is a duplicate.
            if block[entry_index][1][0][0] != b'a':
                raise AssertionError(" %r(%r) already added" %
                                     (basename, file_id))
            block[entry_index][1][0] = entry_data[1][0]

        if kind == 'directory':
            # insert a new dirblock
            self._ensure_block(block_index, entry_index, utf8path)
        self._mark_modified()
        if self._id_index:
            self._add_to_id_index(self._id_index, entry_key)
598
599    def _bisect(self, paths):
600        """Bisect through the disk structure for specific rows.
601
602        :param paths: A list of paths to find
603        :return: A dict mapping path => entries for found entries. Missing
604                 entries will not be in the map.
605                 The list is not sorted, and entries will be populated
606                 based on when they were read.
607        """
608        self._requires_lock()
609        # We need the file pointer to be right after the initial header block
610        self._read_header_if_needed()
611        # If _dirblock_state was in memory, we should just return info from
612        # there, this function is only meant to handle when we want to read
613        # part of the disk.
614        if self._dirblock_state != DirState.NOT_IN_MEMORY:
615            raise AssertionError("bad dirblock state %r" %
616                                 self._dirblock_state)
617
618        # The disk representation is generally info + '\0\n\0' at the end. But
619        # for bisecting, it is easier to treat this as '\0' + info + '\0\n'
620        # Because it means we can sync on the '\n'
621        state_file = self._state_file
622        file_size = os.fstat(state_file.fileno()).st_size
623        # We end up with 2 extra fields, we should have a trailing '\n' to
624        # ensure that we read the whole record, and we should have a precursur
625        # b'' which ensures that we start after the previous '\n'
626        entry_field_count = self._fields_per_entry() + 1
627
628        low = self._end_of_header
629        high = file_size - 1  # Ignore the final '\0'
630        # Map from (dir, name) => entry
631        found = {}
632
633        # Avoid infinite seeking
634        max_count = 30 * len(paths)
635        count = 0
636        # pending is a list of places to look.
637        # each entry is a tuple of low, high, dir_names
638        #   low -> the first byte offset to read (inclusive)
639        #   high -> the last byte offset (inclusive)
640        #   dir_names -> The list of (dir, name) pairs that should be found in
641        #                the [low, high] range
642        pending = [(low, high, paths)]
643
644        page_size = self._bisect_page_size
645
646        fields_to_entry = self._get_fields_to_entry()
647
648        while pending:
649            low, high, cur_files = pending.pop()
650
651            if not cur_files or low >= high:
652                # Nothing to find
653                continue
654
655            count += 1
656            if count > max_count:
657                raise errors.BzrError('Too many seeks, most likely a bug.')
658
659            mid = max(low, (low + high - page_size) // 2)
660
661            state_file.seek(mid)
662            # limit the read size, so we don't end up reading data that we have
663            # already read.
664            read_size = min(page_size, (high - mid) + 1)
665            block = state_file.read(read_size)
666
667            start = mid
668            entries = block.split(b'\n')
669
670            if len(entries) < 2:
671                # We didn't find a '\n', so we cannot have found any records.
672                # So put this range back and try again. But we know we have to
673                # increase the page size, because a single read did not contain
674                # a record break (so records must be larger than page_size)
675                page_size *= 2
676                pending.append((low, high, cur_files))
677                continue
678
679            # Check the first and last entries, in case they are partial, or if
680            # we don't care about the rest of this page
681            first_entry_num = 0
682            first_fields = entries[0].split(b'\0')
683            if len(first_fields) < entry_field_count:
684                # We didn't get the complete first entry
685                # so move start, and grab the next, which
686                # should be a full entry
687                start += len(entries[0]) + 1
688                first_fields = entries[1].split(b'\0')
689                first_entry_num = 1
690
691            if len(first_fields) <= 2:
692                # We didn't even get a filename here... what do we do?
693                # Try a large page size and repeat this query
694                page_size *= 2
695                pending.append((low, high, cur_files))
696                continue
697            else:
698                # Find what entries we are looking for, which occur before and
699                # after this first record.
700                after = start
701                if first_fields[1]:
702                    first_path = first_fields[1] + b'/' + first_fields[2]
703                else:
704                    first_path = first_fields[2]
705                first_loc = _bisect_path_left(cur_files, first_path)
706
707                # These exist before the current location
708                pre = cur_files[:first_loc]
709                # These occur after the current location, which may be in the
710                # data we read, or might be after the last entry
711                post = cur_files[first_loc:]
712
713            if post and len(first_fields) >= entry_field_count:
714                # We have files after the first entry
715
716                # Parse the last entry
717                last_entry_num = len(entries) - 1
718                last_fields = entries[last_entry_num].split(b'\0')
719                if len(last_fields) < entry_field_count:
720                    # The very last hunk was not complete,
721                    # read the previous hunk
722                    after = mid + len(block) - len(entries[-1])
723                    last_entry_num -= 1
724                    last_fields = entries[last_entry_num].split(b'\0')
725                else:
726                    after = mid + len(block)
727
728                if last_fields[1]:
729                    last_path = last_fields[1] + b'/' + last_fields[2]
730                else:
731                    last_path = last_fields[2]
732                last_loc = _bisect_path_right(post, last_path)
733
734                middle_files = post[:last_loc]
735                post = post[last_loc:]
736
737                if middle_files:
738                    # We have files that should occur in this block
739                    # (>= first, <= last)
740                    # Either we will find them here, or we can mark them as
741                    # missing.
742
743                    if middle_files[0] == first_path:
744                        # We might need to go before this location
745                        pre.append(first_path)
746                    if middle_files[-1] == last_path:
747                        post.insert(0, last_path)
748
749                    # Find out what paths we have
750                    paths = {first_path: [first_fields]}
751                    # last_path might == first_path so we need to be
752                    # careful if we should append rather than overwrite
753                    if last_entry_num != first_entry_num:
754                        paths.setdefault(last_path, []).append(last_fields)
755                    for num in range(first_entry_num + 1, last_entry_num):
756                        # TODO: jam 20070223 We are already splitting here, so
757                        #       shouldn't we just split the whole thing rather
758                        #       than doing the split again in add_one_record?
759                        fields = entries[num].split(b'\0')
760                        if fields[1]:
761                            path = fields[1] + b'/' + fields[2]
762                        else:
763                            path = fields[2]
764                        paths.setdefault(path, []).append(fields)
765
766                    for path in middle_files:
767                        for fields in paths.get(path, []):
768                            # offset by 1 because of the opening '\0'
769                            # consider changing fields_to_entry to avoid the
770                            # extra list slice
771                            entry = fields_to_entry(fields[1:])
772                            found.setdefault(path, []).append(entry)
773
774            # Now we have split up everything into pre, middle, and post, and
775            # we have handled everything that fell in 'middle'.
776            # We add 'post' first, so that we prefer to seek towards the
777            # beginning, so that we will tend to go as early as we need, and
778            # then only seek forward after that.
779            if post:
780                pending.append((after, high, post))
781            if pre:
782                pending.append((low, start - 1, pre))
783
784        # Consider that we may want to return the directory entries in sorted
785        # order. For now, we just return them in whatever order we found them,
786        # and leave it up to the caller if they care if it is ordered or not.
787        return found
788
    def _bisect_dirblocks(self, dir_list):
        """Bisect through the disk structure to find entries in given dirs.

        _bisect_dirblocks is meant to find the contents of directories, which
        differs from _bisect, which only finds individual entries.

        :param dir_list: A sorted list of directory names ['', 'dir', 'foo'].
        :return: A map from dir => entries_for_dir
        """
        # TODO: jam 20070223 A lot of the bisecting logic could be shared
        #       between this and _bisect. It would require parameterizing the
        #       inner loop with a function, though. We should evaluate the
        #       performance difference.
        self._requires_lock()
        # We need the file pointer to be right after the initial header block
        self._read_header_if_needed()
        # If _dirblock_state was in memory, we should just return info from
        # there, this function is only meant to handle when we want to read
        # part of the disk.
        if self._dirblock_state != DirState.NOT_IN_MEMORY:
            raise AssertionError("bad dirblock state %r" %
                                 self._dirblock_state)
        # The disk representation is generally info + '\0\n\0' at the end. But
        # for bisecting, it is easier to treat this as '\0' + info + '\0\n'
        # Because it means we can sync on the '\n'
        state_file = self._state_file
        file_size = os.fstat(state_file.fileno()).st_size
        # We end up with 2 extra fields, we should have a trailing '\n' to
        # ensure that we read the whole record, and we should have a precursor
        # b'' which ensures that we start after the previous '\n'
        entry_field_count = self._fields_per_entry() + 1

        low = self._end_of_header
        high = file_size - 1  # Ignore the final '\0'
        # Map from dir => entry
        found = {}

        # Avoid infinite seeking
        max_count = 30 * len(dir_list)
        count = 0
        # pending is a list of places to look.
        # each entry is a tuple of low, high, dir_names
        #   low -> the first byte offset to read (inclusive)
        #   high -> the last byte offset (inclusive)
        #   dirs -> The list of directories that should be found in
        #                the [low, high] range
        pending = [(low, high, dir_list)]

        page_size = self._bisect_page_size

        fields_to_entry = self._get_fields_to_entry()

        while pending:
            low, high, cur_dirs = pending.pop()

            if not cur_dirs or low >= high:
                # Nothing to find
                continue

            count += 1
            if count > max_count:
                raise errors.BzrError('Too many seeks, most likely a bug.')

            # Bias the midpoint backwards by half a page so the page read
            # below tends to end near 'high' (never seeking before 'low').
            mid = max(low, (low + high - page_size) // 2)

            state_file.seek(mid)
            # limit the read size, so we don't end up reading data that we have
            # already read.
            read_size = min(page_size, (high - mid) + 1)
            block = state_file.read(read_size)

            start = mid
            # Records are newline-terminated, so splitting on '\n' gives us
            # candidate records; the first and last pieces may be partial.
            entries = block.split(b'\n')

            if len(entries) < 2:
                # We didn't find a '\n', so we cannot have found any records.
                # So put this range back and try again. But we know we have to
                # increase the page size, because a single read did not contain
                # a record break (so records must be larger than page_size)
                page_size *= 2
                pending.append((low, high, cur_dirs))
                continue

            # Check the first and last entries, in case they are partial, or if
            # we don't care about the rest of this page
            first_entry_num = 0
            first_fields = entries[0].split(b'\0')
            if len(first_fields) < entry_field_count:
                # We didn't get the complete first entry
                # so move start, and grab the next, which
                # should be a full entry
                start += len(entries[0]) + 1
                first_fields = entries[1].split(b'\0')
                first_entry_num = 1

            if len(first_fields) <= 1:
                # We didn't even get a dirname here... what do we do?
                # Try a large page size and repeat this query
                page_size *= 2
                pending.append((low, high, cur_dirs))
                continue
            else:
                # Find what entries we are looking for, which occur before and
                # after this first record.
                after = start
                # field 0 is the b'' precursor, so field 1 is the dirname.
                first_dir = first_fields[1]
                first_loc = bisect.bisect_left(cur_dirs, first_dir)

                # These exist before the current location
                pre = cur_dirs[:first_loc]
                # These occur after the current location, which may be in the
                # data we read, or might be after the last entry
                post = cur_dirs[first_loc:]

            if post and len(first_fields) >= entry_field_count:
                # We have records to look at after the first entry

                # Parse the last entry
                last_entry_num = len(entries) - 1
                last_fields = entries[last_entry_num].split(b'\0')
                if len(last_fields) < entry_field_count:
                    # The very last hunk was not complete,
                    # read the previous hunk
                    after = mid + len(block) - len(entries[-1])
                    last_entry_num -= 1
                    last_fields = entries[last_entry_num].split(b'\0')
                else:
                    after = mid + len(block)

                last_dir = last_fields[1]
                last_loc = bisect.bisect_right(post, last_dir)

                # middle_files are the directories that should be found
                # somewhere inside this page.
                middle_files = post[:last_loc]
                post = post[last_loc:]

                if middle_files:
                    # We have files that should occur in this block
                    # (>= first, <= last)
                    # Either we will find them here, or we can mark them as
                    # missing.

                    if middle_files[0] == first_dir:
                        # We might need to go before this location
                        pre.append(first_dir)
                    if middle_files[-1] == last_dir:
                        # The last dir's contents may continue past this page,
                        # so also keep searching forward from 'after'.
                        post.insert(0, last_dir)

                    # Find out what paths we have
                    paths = {first_dir: [first_fields]}
                    # last_dir might == first_dir so we need to be
                    # careful if we should append rather than overwrite
                    if last_entry_num != first_entry_num:
                        paths.setdefault(last_dir, []).append(last_fields)
                    for num in range(first_entry_num + 1, last_entry_num):
                        # TODO: jam 20070223 We are already splitting here, so
                        #       shouldn't we just split the whole thing rather
                        #       than doing the split again in add_one_record?
                        fields = entries[num].split(b'\0')
                        paths.setdefault(fields[1], []).append(fields)

                    for cur_dir in middle_files:
                        for fields in paths.get(cur_dir, []):
                            # offset by 1 because of the opening '\0'
                            # consider changing fields_to_entry to avoid the
                            # extra list slice
                            entry = fields_to_entry(fields[1:])
                            found.setdefault(cur_dir, []).append(entry)

            # Now we have split up everything into pre, middle, and post, and
            # we have handled everything that fell in 'middle'.
            # We add 'post' first, so that we prefer to seek towards the
            # beginning, so that we will tend to go as early as we need, and
            # then only seek forward after that.
            if post:
                pending.append((after, high, post))
            if pre:
                pending.append((low, start - 1, pre))

        return found
968
    def _bisect_recursive(self, paths):
        """Bisect for entries for all paths and their children.

        This will use bisect to find all records for the supplied paths. It
        will then continue to bisect for any records which are marked as
        directories. (and renames?)

        :param paths: A sorted list of (dir, name) pairs
             eg: [('', b'a'), ('', b'f'), ('a/b', b'c')]
        :return: A dictionary mapping (dir, name, file_id) => [tree_info]
        """
        # Map from (dir, name, file_id) => [tree_info]
        found = {}

        # (dir, name) pairs already recorded in 'found'; used below to avoid
        # re-searching rename targets we have already seen.
        found_dir_names = set()

        # Directories that have been read
        processed_dirs = set()
        # Get the ball rolling with the first bisect for all entries.
        newly_found = self._bisect(paths)

        # Worklist loop: each pass may discover more directories (and rename
        # targets) that in turn need to be read from disk.
        while newly_found:
            # Directories that need to be read
            pending_dirs = set()
            # Rename targets that need a direct per-entry search.
            paths_to_search = set()
            for entry_list in newly_found.values():
                for dir_name_id, trees_info in entry_list:
                    found[dir_name_id] = trees_info
                    found_dir_names.add(dir_name_id[:2])
                    is_dir = False
                    for tree_info in trees_info:
                        minikind = tree_info[0]
                        if minikind == b'd':
                            if is_dir:
                                # We already processed this one as a directory,
                                # we don't need to do the extra work again.
                                continue
                            subdir, name, file_id = dir_name_id
                            path = osutils.pathjoin(subdir, name)
                            is_dir = True
                            if path not in processed_dirs:
                                pending_dirs.add(path)
                        elif minikind == b'r':
                            # Rename, we need to directly search the target
                            # which is contained in the fingerprint column
                            dir_name = osutils.split(tree_info[1])
                            if dir_name[0] in pending_dirs:
                                # This entry will be found in the dir search
                                continue
                            if dir_name not in found_dir_names:
                                paths_to_search.add(tree_info[1])
            # Now we have a list of paths to look for directly, and
            # directory blocks that need to be read.
            # newly_found is mixing the keys between (dir, name) and path
            # entries, but that is okay, because we only really care about the
            # targets.
            newly_found = self._bisect(sorted(paths_to_search))
            newly_found.update(self._bisect_dirblocks(sorted(pending_dirs)))
            processed_dirs.update(pending_dirs)
        return found
1029
    def _discard_merge_parents(self):
        """Discard any parents trees beyond the first.

        Note that if this fails the dirstate is corrupted.

        After this function returns the dirstate contains 2 trees, neither of
        which are ghosted.
        """
        self._read_header_if_needed()
        parents = self.get_parent_ids()
        if len(parents) < 1:
            # No parent trees at all: nothing beyond the first to discard.
            return
        # only require all dirblocks if we are doing a full-pass removal.
        self._read_dirblocks_if_needed()
        # (tree0_minikind, tree1_minikind) pairs for which an entry carries no
        # useful data once the extra parents are gone (minikinds b'a'/b'r' in
        # both remaining trees); such entries are deleted below.
        dead_patterns = {(b'a', b'r'), (b'a', b'a'),
                         (b'r', b'r'), (b'r', b'a')}

        def iter_entries_removable():
            # Yield each entry so the caller can trim its parent records;
            # when the caller resumes us, test the *mutated* entry against
            # dead_patterns and remember it for deletion if it is now dead.
            for block in self._dirblocks:
                deleted_positions = []
                for pos, entry in enumerate(block[1]):
                    yield entry
                    if (entry[1][0][0], entry[1][1][0]) in dead_patterns:
                        deleted_positions.append(pos)
                if deleted_positions:
                    if len(deleted_positions) == len(block[1]):
                        del block[1][:]
                    else:
                        # Delete from the end so earlier indices stay valid.
                        for pos in reversed(deleted_positions):
                            del block[1][pos]
        # if the first parent is a ghost:
        if parents[0] in self.get_ghosts():
            # Replace every parent record with a single null parent record.
            empty_parent = [DirState.NULL_PARENT_DETAILS]
            for entry in iter_entries_removable():
                entry[1][1:] = empty_parent
        else:
            # Keep tree 0 and the first parent tree; drop the rest.
            for entry in iter_entries_removable():
                del entry[1][2:]

        self._ghosts = []
        self._parents = [parents[0]]
        self._mark_modified(header_modified=True)
1072
1073    def _empty_parent_info(self):
1074        return [DirState.NULL_PARENT_DETAILS] * (len(self._parents) -
1075                                                 len(self._ghosts))
1076
1077    def _ensure_block(self, parent_block_index, parent_row_index, dirname):
1078        """Ensure a block for dirname exists.
1079
1080        This function exists to let callers which know that there is a
1081        directory dirname ensure that the block for it exists. This block can
1082        fail to exist because of demand loading, or because a directory had no
1083        children. In either case it is not an error. It is however an error to
1084        call this if there is no parent entry for the directory, and thus the
1085        function requires the coordinates of such an entry to be provided.
1086
1087        The root row is special cased and can be indicated with a parent block
1088        and row index of -1
1089
1090        :param parent_block_index: The index of the block in which dirname's row
1091            exists.
1092        :param parent_row_index: The index in the parent block where the row
1093            exists.
1094        :param dirname: The utf8 dirname to ensure there is a block for.
1095        :return: The index for the block.
1096        """
1097        if dirname == b'' and parent_row_index == 0 and parent_block_index == 0:
1098            # This is the signature of the root row, and the
1099            # contents-of-root row is always index 1
1100            return 1
1101        # the basename of the directory must be the end of its full name.
1102        if not (parent_block_index == -1 and
1103                parent_block_index == -1 and dirname == b''):
1104            if not dirname.endswith(
1105                    self._dirblocks[parent_block_index][1][parent_row_index][0][1]):
1106                raise AssertionError("bad dirname %r" % dirname)
1107        block_index, present = self._find_block_index_from_key(
1108            (dirname, b'', b''))
1109        if not present:
1110            # In future, when doing partial parsing, this should load and
1111            # populate the entire block.
1112            self._dirblocks.insert(block_index, (dirname, []))
1113        return block_index
1114
1115    def _entries_to_current_state(self, new_entries):
1116        """Load new_entries into self.dirblocks.
1117
1118        Process new_entries into the current state object, making them the active
1119        state.  The entries are grouped together by directory to form dirblocks.
1120
1121        :param new_entries: A sorted list of entries. This function does not sort
1122            to prevent unneeded overhead when callers have a sorted list already.
1123        :return: Nothing.
1124        """
1125        if new_entries[0][0][0:2] != (b'', b''):
1126            raise AssertionError(
1127                "Missing root row %r" % (new_entries[0][0],))
1128        # The two blocks here are deliberate: the root block and the
1129        # contents-of-root block.
1130        self._dirblocks = [(b'', []), (b'', [])]
1131        current_block = self._dirblocks[0][1]
1132        current_dirname = b''
1133        root_key = (b'', b'')
1134        append_entry = current_block.append
1135        for entry in new_entries:
1136            if entry[0][0] != current_dirname:
1137                # new block - different dirname
1138                current_block = []
1139                current_dirname = entry[0][0]
1140                self._dirblocks.append((current_dirname, current_block))
1141                append_entry = current_block.append
1142            # append the entry to the current block
1143            append_entry(entry)
1144        self._split_root_dirblock_into_contents()
1145
1146    def _split_root_dirblock_into_contents(self):
1147        """Split the root dirblocks into root and contents-of-root.
1148
1149        After parsing by path, we end up with root entries and contents-of-root
1150        entries in the same block. This loop splits them out again.
1151        """
1152        # The above loop leaves the "root block" entries mixed with the
1153        # "contents-of-root block". But we don't want an if check on
1154        # all entries, so instead we just fix it up here.
1155        if self._dirblocks[1] != (b'', []):
1156            raise ValueError("bad dirblock start %r" % (self._dirblocks[1],))
1157        root_block = []
1158        contents_of_root_block = []
1159        for entry in self._dirblocks[0][1]:
1160            if not entry[0][1]:  # This is a root entry
1161                root_block.append(entry)
1162            else:
1163                contents_of_root_block.append(entry)
1164        self._dirblocks[0] = (b'', root_block)
1165        self._dirblocks[1] = (b'', contents_of_root_block)
1166
1167    def _entries_for_path(self, path):
1168        """Return a list with all the entries that match path for all ids."""
1169        dirname, basename = os.path.split(path)
1170        key = (dirname, basename, b'')
1171        block_index, present = self._find_block_index_from_key(key)
1172        if not present:
1173            # the block which should contain path is absent.
1174            return []
1175        result = []
1176        block = self._dirblocks[block_index][1]
1177        entry_index, _ = self._find_entry_index(key, block)
1178        # we may need to look at multiple entries at this path: walk while the specific_files match.
1179        while (entry_index < len(block) and
1180               block[entry_index][0][0:2] == key[0:2]):
1181            result.append(block[entry_index])
1182            entry_index += 1
1183        return result
1184
1185    def _entry_to_line(self, entry):
1186        """Serialize entry to a NULL delimited line ready for _get_output_lines.
1187
1188        :param entry: An entry_tuple as defined in the module docstring.
1189        """
1190        entire_entry = list(entry[0])
1191        for tree_number, tree_data in enumerate(entry[1]):
1192            # (minikind, fingerprint, size, executable, tree_specific_string)
1193            entire_entry.extend(tree_data)
1194            # 3 for the key, 5 for the fields per tree.
1195            tree_offset = 3 + tree_number * 5
1196            # minikind
1197            entire_entry[tree_offset + 0] = tree_data[0]
1198            # size
1199            entire_entry[tree_offset + 2] = b'%d' % tree_data[2]
1200            # executable
1201            entire_entry[tree_offset + 3] = DirState._to_yesno[tree_data[3]]
1202        return b'\0'.join(entire_entry)
1203
1204    def _fields_per_entry(self):
1205        """How many null separated fields should be in each entry row.
1206
1207        Each line now has an extra '\\n' field which is not used
1208        so we just skip over it
1209
1210        entry size::
1211            3 fields for the key
1212            + number of fields per tree_data (5) * tree count
1213            + newline
1214         """
1215        tree_count = 1 + self._num_present_parents()
1216        return 3 + 5 * tree_count + 1
1217
1218    def _find_block(self, key, add_if_missing=False):
1219        """Return the block that key should be present in.
1220
1221        :param key: A dirstate entry key.
1222        :return: The block tuple.
1223        """
1224        block_index, present = self._find_block_index_from_key(key)
1225        if not present:
1226            if not add_if_missing:
1227                # check to see if key is versioned itself - we might want to
1228                # add it anyway, because dirs with no entries dont get a
1229                # dirblock at parse time.
1230                # This is an uncommon branch to take: most dirs have children,
1231                # and most code works with versioned paths.
1232                parent_base, parent_name = osutils.split(key[0])
1233                if not self._get_block_entry_index(parent_base, parent_name, 0)[3]:
1234                    # some parent path has not been added - its an error to add
1235                    # this child
1236                    raise errors.NotVersionedError(key[0:2], str(self))
1237            self._dirblocks.insert(block_index, (key[0], []))
1238        return self._dirblocks[block_index]
1239
1240    def _find_block_index_from_key(self, key):
1241        """Find the dirblock index for a key.
1242
1243        :return: The block index, True if the block for the key is present.
1244        """
1245        if key[0:2] == (b'', b''):
1246            return 0, True
1247        try:
1248            if (self._last_block_index is not None and
1249                    self._dirblocks[self._last_block_index][0] == key[0]):
1250                return self._last_block_index, True
1251        except IndexError:
1252            pass
1253        block_index = bisect_dirblock(self._dirblocks, key[0], 1,
1254                                      cache=self._split_path_cache)
1255        # _right returns one-past-where-key is so we have to subtract
1256        # one to use it. we use _right here because there are two
1257        # b'' blocks - the root, and the contents of root
1258        # we always have a minimum of 2 in self._dirblocks: root and
1259        # root-contents, and for b'', we get 2 back, so this is
1260        # simple and correct:
1261        present = (block_index < len(self._dirblocks) and
1262                   self._dirblocks[block_index][0] == key[0])
1263        self._last_block_index = block_index
1264        # Reset the entry index cache to the beginning of the block.
1265        self._last_entry_index = -1
1266        return block_index, present
1267
1268    def _find_entry_index(self, key, block):
1269        """Find the entry index for a key in a block.
1270
1271        :return: The entry index, True if the entry for the key is present.
1272        """
1273        len_block = len(block)
1274        try:
1275            if self._last_entry_index is not None:
1276                # mini-bisect here.
1277                entry_index = self._last_entry_index + 1
1278                # A hit is when the key is after the last slot, and before or
1279                # equal to the next slot.
1280                if ((entry_index > 0 and block[entry_index - 1][0] < key) and
1281                        key <= block[entry_index][0]):
1282                    self._last_entry_index = entry_index
1283                    present = (block[entry_index][0] == key)
1284                    return entry_index, present
1285        except IndexError:
1286            pass
1287        entry_index = bisect.bisect_left(block, (key, []))
1288        present = (entry_index < len_block and
1289                   block[entry_index][0] == key)
1290        self._last_entry_index = entry_index
1291        return entry_index, present
1292
1293    @staticmethod
1294    def from_tree(tree, dir_state_filename, sha1_provider=None):
1295        """Create a dirstate from a bzr Tree.
1296
1297        :param tree: The tree which should provide parent information and
1298            inventory ids.
1299        :param sha1_provider: an object meeting the SHA1Provider interface.
1300            If None, a DefaultSHA1Provider is used.
1301        :return: a DirState object which is currently locked for writing.
1302            (it was locked by DirState.initialize)
1303        """
1304        result = DirState.initialize(dir_state_filename,
1305                                     sha1_provider=sha1_provider)
1306        try:
1307            with contextlib.ExitStack() as exit_stack:
1308                exit_stack.enter_context(tree.lock_read())
1309                parent_ids = tree.get_parent_ids()
1310                num_parents = len(parent_ids)
1311                parent_trees = []
1312                for parent_id in parent_ids:
1313                    parent_tree = tree.branch.repository.revision_tree(
1314                        parent_id)
1315                    parent_trees.append((parent_id, parent_tree))
1316                    exit_stack.enter_context(parent_tree.lock_read())
1317                result.set_parent_trees(parent_trees, [])
1318                result.set_state_from_inventory(tree.root_inventory)
1319        except:
1320            # The caller won't have a chance to unlock this, so make sure we
1321            # cleanup ourselves
1322            result.unlock()
1323            raise
1324        return result
1325
1326    def _check_delta_is_valid(self, delta):
1327        delta = list(inventory._check_delta_unique_ids(
1328                     inventory._check_delta_unique_old_paths(
1329                         inventory._check_delta_unique_new_paths(
1330                             inventory._check_delta_ids_match_entry(
1331                                 inventory._check_delta_ids_are_valid(
1332                                     inventory._check_delta_new_path_entry_both_or_None(delta)))))))
1333
1334        def delta_key(d):
1335            (old_path, new_path, file_id, new_entry) = d
1336            if old_path is None:
1337                old_path = ''
1338            if new_path is None:
1339                new_path = ''
1340            return (old_path, new_path, file_id, new_entry)
1341        delta.sort(key=delta_key, reverse=True)
1342        return delta
1343
    def update_by_delta(self, delta):
        """Apply an inventory delta to the dirstate for tree 0

        This is the workhorse for apply_inventory_delta in dirstate based
        trees.

        :param delta: An inventory delta.  See Inventory.apply_delta for
            details.
        :raises errors.InconsistentDelta: if the delta is inconsistent with
            the current dirstate contents (repeated ids, wrong paths,
            missing parents, ...); the dirstate is then marked aborted and
            must not be saved.
        """
        self._read_dirblocks_if_needed()
        # NOTE(review): `encode` is bound but never used below; paths are
        # encoded via str.encode('utf-8') directly.
        encode = cache_utf8.encode
        # file_id -> (key, minikind, executable, fingerprint, new_path) for
        # entries to add, and file_id -> old_path_utf8 for entries to remove.
        insertions = {}
        removals = {}
        # Accumulate parent references (path_utf8, id), to check for parentless
        # items or items placed under files/links/tree-references. We get
        # references from every item in the delta that is not a deletion and
        # is not itself the root.
        parents = set()
        # Added ids must not be in the dirstate already. This set holds those
        # ids.
        new_ids = set()
        # This loop transforms the delta to single atomic operations that can
        # be executed and validated.
        delta = self._check_delta_is_valid(delta)
        for old_path, new_path, file_id, inv_entry in delta:
            if not isinstance(file_id, bytes):
                raise AssertionError(
                    "must be a utf8 file_id not %s" % (type(file_id), ))
            if (file_id in insertions) or (file_id in removals):
                self._raise_invalid(old_path or new_path, file_id,
                                    "repeated file_id")
            if old_path is not None:
                old_path = old_path.encode('utf-8')
                removals[file_id] = old_path
            else:
                new_ids.add(file_id)
            if new_path is not None:
                if inv_entry is None:
                    self._raise_invalid(new_path, file_id,
                                        "new_path with no entry")
                new_path = new_path.encode('utf-8')
                dirname_utf8, basename = osutils.split(new_path)
                if basename:
                    # Not the root: remember the parent so it can be checked
                    # after the delta is applied.
                    parents.add((dirname_utf8, inv_entry.parent_id))
                key = (dirname_utf8, basename, file_id)
                minikind = DirState._kind_to_minikind[inv_entry.kind]
                if minikind == b't':
                    # Tree references record the referenced revision as
                    # their fingerprint.
                    fingerprint = inv_entry.reference_revision or b''
                else:
                    fingerprint = b''
                insertions[file_id] = (key, minikind, inv_entry.executable,
                                       fingerprint, new_path)
            # Transform moves into delete+add pairs
            if None not in (old_path, new_path):
                # A rename: every child under the old path must move too,
                # unless the delta already has an explicit operation for it.
                for child in self._iter_child_entries(0, old_path):
                    if child[0][2] in insertions or child[0][2] in removals:
                        continue
                    child_dirname = child[0][0]
                    child_basename = child[0][1]
                    minikind = child[1][0][0]
                    fingerprint = child[1][0][4]
                    executable = child[1][0][3]
                    old_child_path = osutils.pathjoin(child_dirname,
                                                      child_basename)
                    removals[child[0][2]] = old_child_path
                    # Re-root the child's dirname under the new path.
                    child_suffix = child_dirname[len(old_path):]
                    new_child_dirname = (new_path + child_suffix)
                    key = (new_child_dirname, child_basename, child[0][2])
                    new_child_path = osutils.pathjoin(new_child_dirname,
                                                      child_basename)
                    insertions[child[0][2]] = (key, minikind, executable,
                                               fingerprint, new_child_path)
        self._check_delta_ids_absent(new_ids, delta, 0)
        try:
            # Apply removals first, then insertions, then validate parents.
            self._apply_removals(removals.items())
            self._apply_insertions(insertions.values())
            # Validate parents
            self._after_delta_check_parents(parents, 0)
        except errors.BzrError as e:
            self._changes_aborted = True
            if 'integrity error' not in str(e):
                raise
            # _get_entry raises BzrError when a request is inconsistent; we
            # want such errors to be shown as InconsistentDelta - and that
            # fits the behaviour we trigger.
            raise errors.InconsistentDeltaDelta(delta,
                                                "error from _get_entry. %s" % (e,))
1431
    def _apply_removals(self, removals):
        """Remove entries from tree 0, as part of update_by_delta.

        :param removals: An iterable of (file_id, path_utf8) pairs. They are
            processed in reverse lexicographic order of path so that children
            are removed before their parent directories.
        :raises errors.InconsistentDelta: if a path is absent, carries a
            different file id, or a removed directory still has live
            children.
        """
        for file_id, path in sorted(removals, reverse=True,
                                    key=operator.itemgetter(1)):
            dirname, basename = osutils.split(path)
            block_i, entry_i, d_present, f_present = \
                self._get_block_entry_index(dirname, basename, 0)
            try:
                entry = self._dirblocks[block_i][1][entry_i]
            except IndexError:
                self._raise_invalid(path, file_id,
                                    "Wrong path for old path.")
            if not f_present or entry[1][0][0] in (b'a', b'r'):
                # Not present in tree 0, or present only as an absent or
                # relocated record.
                self._raise_invalid(path, file_id,
                                    "Wrong path for old path.")
            if file_id != entry[0][2]:
                self._raise_invalid(path, file_id,
                                    "Attempt to remove path has wrong id - found %r."
                                    % entry[0][2])
            self._make_absent(entry)
            # See if we have a malformed delta: deleting a directory must not
            # leave crud behind. This increases the number of bisects needed
            # substantially, but deletion or renames of large numbers of paths
            # is rare enough it shouldn't be an issue (famous last words?) RBC
            # 20080730.
            block_i, entry_i, d_present, f_present = \
                self._get_block_entry_index(path, b'', 0)
            if d_present:
                # The dir block is still present in the dirstate; this could
                # be due to it being in a parent tree, or a corrupt delta.
                for child_entry in self._dirblocks[block_i][1]:
                    if child_entry[1][0][0] not in (b'r', b'a'):
                        self._raise_invalid(path, entry[0][2],
                                            "The file id was deleted but its children were "
                                            "not deleted.")
1466
1467    def _apply_insertions(self, adds):
1468        try:
1469            for key, minikind, executable, fingerprint, path_utf8 in sorted(adds):
1470                self.update_minimal(key, minikind, executable, fingerprint,
1471                                    path_utf8=path_utf8)
1472        except errors.NotVersionedError:
1473            self._raise_invalid(path_utf8.decode('utf8'), key[2],
1474                                "Missing parent")
1475
    def update_basis_by_delta(self, delta, new_revid):
        """Update the parents of this tree after a commit.

        This gives the tree one parent, with revision id new_revid. The
        inventory delta is applied to the current basis tree to generate the
        inventory for the parent new_revid, and all other parent trees are
        discarded.

        Note that an exception during the operation of this method will leave
        the dirstate in a corrupt state where it should not be saved.

        :param new_revid: The new revision id for the trees parent.
        :param delta: An inventory delta (see apply_inventory_delta) describing
            the changes from the current left most parent revision to new_revid.
        :raises errors.InconsistentDelta: if the delta does not apply cleanly
            to the current basis tree.
        """
        self._read_dirblocks_if_needed()
        self._discard_merge_parents()
        if self._ghosts != []:
            # Rewriting a basis that has ghost parents is not supported.
            raise NotImplementedError(self.update_basis_by_delta)
        if len(self._parents) == 0:
            # setup a blank tree, the most simple way.
            empty_parent = DirState.NULL_PARENT_DETAILS
            for entry in self._iter_entries():
                entry[1].append(empty_parent)
            self._parents.append(new_revid)

        self._parents[0] = new_revid

        delta = self._check_delta_is_valid(delta)
        # The delta is split into three operation lists; each element carries
        # (old_path_utf8, new_path_utf8, file_id, entry_details[, real_op]).
        adds = []
        changes = []
        deletes = []
        # The paths this function accepts are unicode and must be encoded as we
        # go.
        encode = cache_utf8.encode
        inv_to_entry = self._inv_entry_to_details
        # delta is now (deletes, changes), (adds) in reverse lexographical
        # order.
        # deletes in reverse lexographic order are safe to process in situ.
        # renames are not, as a rename from any path could go to a path
        # lexographically lower, so we transform renames into delete, add pairs,
        # expanding them recursively as needed.
        # At the same time, to reduce interface friction we convert the input
        # inventory entries to dirstate.
        root_only = ('', '')
        # Accumulate parent references (path_utf8, id), to check for parentless
        # items or items placed under files/links/tree-references. We get
        # references from every item in the delta that is not a deletion and
        # is not itself the root.
        parents = set()
        # Added ids must not be in the dirstate already. This set holds those
        # ids.
        new_ids = set()
        for old_path, new_path, file_id, inv_entry in delta:
            if file_id.__class__ is not bytes:
                raise AssertionError(
                    "must be a utf8 file_id not %s" % (type(file_id), ))
            if inv_entry is not None and file_id != inv_entry.file_id:
                self._raise_invalid(new_path, file_id,
                                    "mismatched entry file_id %r" % inv_entry)
            if new_path is None:
                new_path_utf8 = None
            else:
                if inv_entry is None:
                    self._raise_invalid(new_path, file_id,
                                        "new_path with no entry")
                new_path_utf8 = encode(new_path)
                # note the parent for validation
                dirname_utf8, basename_utf8 = osutils.split(new_path_utf8)
                if basename_utf8:
                    parents.add((dirname_utf8, inv_entry.parent_id))
            if old_path is None:
                old_path_utf8 = None
            else:
                old_path_utf8 = encode(old_path)
            if old_path is None:
                # A pure add (True marks it as a real add, not a rename half).
                adds.append((None, new_path_utf8, file_id,
                             inv_to_entry(inv_entry), True))
                new_ids.add(file_id)
            elif new_path is None:
                # A pure delete.
                deletes.append((old_path_utf8, None, file_id, None, True))
            elif (old_path, new_path) == root_only:
                # change things in-place
                # Note: the case of a parent directory changing its file_id
                #       tends to break optimizations here, because officially
                #       the file has actually been moved, it just happens to
                #       end up at the same path. If we can figure out how to
                #       handle that case, we can avoid a lot of add+delete
                #       pairs for objects that stay put.
                # elif old_path == new_path:
                changes.append((old_path_utf8, new_path_utf8, file_id,
                                inv_to_entry(inv_entry)))
            else:
                # Renames:
                # Because renames must preserve their children we must have
                # processed all relocations and removes before hand. The sort
                # order ensures we've examined the child paths, but we also
                # have to execute the removals, or the split to an add/delete
                # pair will result in the deleted item being reinserted, or
                # renamed items being reinserted twice - and possibly at the
                # wrong place. Splitting into a delete/add pair also simplifies
                # the handling of entries with (b'f', ...), (b'r' ...) because
                # the target of the b'r' is old_path here, and we add that to
                # deletes, meaning that the add handler does not need to check
                # for b'r' items on every pass.
                self._update_basis_apply_deletes(deletes)
                deletes = []
                # Split into an add/delete pair recursively.
                adds.append((old_path_utf8, new_path_utf8, file_id,
                             inv_to_entry(inv_entry), False))
                # Expunge deletes that we've seen so that deleted/renamed
                # children of a rename directory are handled correctly.
                new_deletes = reversed(list(
                    self._iter_child_entries(1, old_path_utf8)))
                # Remove the current contents of the tree at orig_path, and
                # reinsert at the correct new path.
                for entry in new_deletes:
                    child_dirname, child_basename, child_file_id = entry[0]
                    if child_dirname:
                        source_path = child_dirname + b'/' + child_basename
                    else:
                        source_path = child_basename
                    if new_path_utf8:
                        target_path = \
                            new_path_utf8 + source_path[len(old_path_utf8):]
                    else:
                        if old_path_utf8 == b'':
                            raise AssertionError("cannot rename directory to"
                                                 " itself")
                        # Moving to the root: strip the old prefix and slash.
                        target_path = source_path[len(old_path_utf8) + 1:]
                    adds.append(
                        (None, target_path, entry[0][2], entry[1][1], False))
                    deletes.append(
                        (source_path, target_path, entry[0][2], None, False))
                deletes.append(
                    (old_path_utf8, new_path_utf8, file_id, None, False))

        self._check_delta_ids_absent(new_ids, delta, 1)
        try:
            # Finish expunging deletes/first half of renames.
            self._update_basis_apply_deletes(deletes)
            # Reinstate second half of renames and new paths.
            self._update_basis_apply_adds(adds)
            # Apply in-situ changes.
            self._update_basis_apply_changes(changes)
            # Validate parents
            self._after_delta_check_parents(parents, 1)
        except errors.BzrError as e:
            self._changes_aborted = True
            if 'integrity error' not in str(e):
                raise
            # _get_entry raises BzrError when a request is inconsistent; we
            # want such errors to be shown as InconsistentDelta - and that
            # fits the behaviour we trigger.
            raise errors.InconsistentDeltaDelta(delta,
                                                "error from _get_entry. %s" % (e,))

        self._mark_modified(header_modified=True)
        # The id index is stale after a basis rewrite; drop it so it is
        # rebuilt on next use.
        self._id_index = None
        return
1636
1637    def _check_delta_ids_absent(self, new_ids, delta, tree_index):
1638        """Check that none of the file_ids in new_ids are present in a tree."""
1639        if not new_ids:
1640            return
1641        id_index = self._get_id_index()
1642        for file_id in new_ids:
1643            for key in id_index.get(file_id, ()):
1644                block_i, entry_i, d_present, f_present = \
1645                    self._get_block_entry_index(key[0], key[1], tree_index)
1646                if not f_present:
1647                    # In a different tree
1648                    continue
1649                entry = self._dirblocks[block_i][1][entry_i]
1650                if entry[0][2] != file_id:
1651                    # Different file_id, so not what we want.
1652                    continue
1653                self._raise_invalid((b"%s/%s" % key[0:2]).decode('utf8'), file_id,
1654                                    "This file_id is new in the delta but already present in "
1655                                    "the target")
1656
    def _raise_invalid(self, path, file_id, reason):
        """Mark the dirstate as aborted and raise InconsistentDelta.

        Once this has fired the in-memory state is suspect and must not be
        saved.

        :param path: The path (unicode or utf8) the problem was found at.
        :param file_id: The file id involved.
        :param reason: A human readable description of the inconsistency.
        """
        self._changes_aborted = True
        raise errors.InconsistentDelta(path, file_id, reason)
1660
    def _update_basis_apply_adds(self, adds):
        """Apply a sequence of adds to tree 1 during update_basis_by_delta.

        They may be adds, or renames that have been split into add/delete
        pairs.

        :param adds: A sequence of adds. Each add is a tuple:
            (None, new_path_utf8, file_id, (entry_details), real_add). real_add
            is False when the add is the second half of a remove-and-reinsert
            pair created to handle renames and deletes.
        :raises errors.InconsistentDelta: if an add conflicts with existing
            dirstate contents.
        """
        # Adds are accumulated partly from renames, so can be in any input
        # order - sort it.
        # TODO: we may want to sort in dirblocks order. That way each entry
        #       will end up in the same directory, allowing the _get_entry
        #       fast-path for looking up 2 items in the same dir work.
        adds.sort(key=lambda x: x[1])
        # adds is now in lexographic order, which places all parents before
        # their children, so we can process it linearly.
        st = static_tuple.StaticTuple
        for old_path, new_path, file_id, new_details, real_add in adds:
            dirname, basename = osutils.split(new_path)
            entry_key = st(dirname, basename, file_id)
            block_index, present = self._find_block_index_from_key(entry_key)
            if not present:
                # The block where we want to put the file is not present.
                # However, it might have just been an empty directory. Look for
                # the parent in the basis-so-far before throwing an error.
                parent_dir, parent_base = osutils.split(dirname)
                parent_block_idx, parent_entry_idx, _, parent_present = \
                    self._get_block_entry_index(parent_dir, parent_base, 1)
                if not parent_present:
                    self._raise_invalid(new_path, file_id,
                                        "Unable to find block for this record."
                                        " Was the parent added?")
                self._ensure_block(parent_block_idx, parent_entry_idx, dirname)

            block = self._dirblocks[block_index][1]
            entry_index, present = self._find_entry_index(entry_key, block)
            if real_add:
                # A real add must not carry an old path; that would mean the
                # rename-splitting logic above mislabelled it.
                if old_path is not None:
                    self._raise_invalid(new_path, file_id,
                                        'considered a real add but still had old_path at %s'
                                        % (old_path,))
            if present:
                entry = block[entry_index]
                basis_kind = entry[1][1][0]
                if basis_kind == b'a':
                    # The basis slot was absent; fill it in.
                    entry[1][1] = new_details
                elif basis_kind == b'r':
                    raise NotImplementedError()
                else:
                    self._raise_invalid(new_path, file_id,
                                        "An entry was marked as a new add"
                                        " but the basis target already existed")
            else:
                # The exact key was not found in the block. However, we need to
                # check if there is a key next to us that would have matched.
                # We only need to check 2 locations, because there are only 2
                # trees present.
                for maybe_index in range(entry_index - 1, entry_index + 1):
                    if maybe_index < 0 or maybe_index >= len(block):
                        continue
                    maybe_entry = block[maybe_index]
                    if maybe_entry[0][:2] != (dirname, basename):
                        # Just a random neighbor
                        continue
                    if maybe_entry[0][2] == file_id:
                        raise AssertionError(
                            '_find_entry_index didnt find a key match'
                            ' but walking the data did, for %s'
                            % (entry_key,))
                    basis_kind = maybe_entry[1][1][0]
                    if basis_kind not in (b'a', b'r'):
                        self._raise_invalid(new_path, file_id,
                                            "we have an add record for path, but the path"
                                            " is already present with another file_id %s"
                                            % (maybe_entry[0][2],))

                entry = (entry_key, [DirState.NULL_PARENT_DETAILS,
                                     new_details])
                block.insert(entry_index, entry)

            active_kind = entry[1][0][0]
            if active_kind == b'a':
                # The active record shows up as absent, this could be genuine,
                # or it could be present at some other location. We need to
                # verify.
                id_index = self._get_id_index()
                # The id_index may not be perfectly accurate for tree1, because
                # we haven't been keeping it updated. However, it should be
                # fine for tree0, and that gives us enough info for what we
                # need
                keys = id_index.get(file_id, ())
                for key in keys:
                    block_i, entry_i, d_present, f_present = \
                        self._get_block_entry_index(key[0], key[1], 0)
                    if not f_present:
                        continue
                    active_entry = self._dirblocks[block_i][1][entry_i]
                    if (active_entry[0][2] != file_id):
                        # Some other file is at this path, we don't need to
                        # link it.
                        continue
                    real_active_kind = active_entry[1][0][0]
                    if real_active_kind in (b'a', b'r'):
                        # We found a record, which was not *this* record,
                        # which matches the file_id, but is not actually
                        # present. Something seems *really* wrong.
                        self._raise_invalid(new_path, file_id,
                                            "We found a tree0 entry that doesnt make sense")
                    # Now, we've found a tree0 entry which matches the file_id
                    # but is at a different location. So update them to be
                    # rename records.
                    active_dir, active_name = active_entry[0][:2]
                    if active_dir:
                        active_path = active_dir + b'/' + active_name
                    else:
                        active_path = active_name
                    active_entry[1][1] = st(b'r', new_path, 0, False, b'')
                    entry[1][0] = st(b'r', active_path, 0, False, b'')
            elif active_kind == b'r':
                raise NotImplementedError()

            new_kind = new_details[0]
            if new_kind == b'd':
                # A directory needs its (possibly empty) block to exist.
                self._ensure_block(block_index, entry_index, new_path)
1788
    def _update_basis_apply_changes(self, changes):
        """Apply a sequence of changes to tree 1 during update_basis_by_delta.

        :param changes: A sequence of changes. Each change is a tuple:
            (path_utf8, path_utf8, file_id, (entry_details))
        :raises errors.InconsistentDelta: if a changed entry is not present
            in the basis tree.
        """
        for old_path, new_path, file_id, new_details in changes:
            # The entry for this file_id must already be present in tree 1
            # (the basis) - we are changing it in place, not adding it.
            entry = self._get_entry(1, file_id, new_path)
            if entry[0] is None or entry[1][1][0] in (b'a', b'r'):
                self._raise_invalid(new_path, file_id,
                                    'changed entry considered not present')
            entry[1][1] = new_details
1802
    def _update_basis_apply_deletes(self, deletes):
        """Apply a sequence of deletes to tree 1 during update_basis_by_delta.

        They may be deletes, or renames that have been split into add/delete
        pairs.

        :param deletes: A sequence of deletes. Each delete is a tuple:
            (old_path_utf8, new_path_utf8, file_id, None, real_delete).
            real_delete is True when the desired outcome is an actual deletion
            rather than the rename handling logic temporarily deleting a path
            during the replacement of a parent.
        :raises errors.InconsistentDelta: if a delete does not match the
            current basis contents.
        """
        null = DirState.NULL_PARENT_DETAILS
        for old_path, new_path, file_id, _, real_delete in deletes:
            # A real delete must have no new_path; a rename-half must have
            # one. Anything else is a malformed tuple.
            if real_delete != (new_path is None):
                self._raise_invalid(old_path, file_id, "bad delete delta")
            # the entry for this file_id must be in tree 1.
            dirname, basename = osutils.split(old_path)
            block_index, entry_index, dir_present, file_present = \
                self._get_block_entry_index(dirname, basename, 1)
            if not file_present:
                self._raise_invalid(old_path, file_id,
                                    'basis tree does not contain removed entry')
            entry = self._dirblocks[block_index][1][entry_index]
            # The state of the entry in the 'active' WT
            active_kind = entry[1][0][0]
            if entry[0][2] != file_id:
                self._raise_invalid(old_path, file_id,
                                    'mismatched file_id in tree 1')
            # dir_block holds any children that must be checked as also
            # deleted; empty unless the entry was a directory.
            dir_block = ()
            old_kind = entry[1][1][0]
            if active_kind in b'ar':
                # The active tree doesn't have this file_id.
                # The basis tree is changing this record. If this is a
                # rename, then we don't want the record here at all
                # anymore. If it is just an in-place change, we want the
                # record here, but we'll add it if we need to. So we just
                # delete it
                if active_kind == b'r':
                    active_path = entry[1][0][1]
                    active_entry = self._get_entry(0, file_id, active_path)
                    if active_entry[1][1][0] != b'r':
                        self._raise_invalid(old_path, file_id,
                                            "Dirstate did not have matching rename entries")
                    elif active_entry[1][0][0] in b'ar':
                        self._raise_invalid(old_path, file_id,
                                            "Dirstate had a rename pointing at an inactive"
                                            " tree0")
                    active_entry[1][1] = null
                del self._dirblocks[block_index][1][entry_index]
                if old_kind == b'd':
                    # This was a directory, and the active tree says it
                    # doesn't exist, and now the basis tree says it doesn't
                    # exist. Remove its dirblock if present
                    (dir_block_index,
                     present) = self._find_block_index_from_key(
                        (old_path, b'', b''))
                    if present:
                        dir_block = self._dirblocks[dir_block_index][1]
                        if not dir_block:
                            # This entry is empty, go ahead and just remove it
                            del self._dirblocks[dir_block_index]
            else:
                # There is still an active record, so just mark this
                # removed.
                entry[1][1] = null
                block_i, entry_i, d_present, f_present = \
                    self._get_block_entry_index(old_path, b'', 1)
                if d_present:
                    dir_block = self._dirblocks[block_i][1]
            for child_entry in dir_block:
                child_basis_kind = child_entry[1][1][0]
                if child_basis_kind not in b'ar':
                    self._raise_invalid(old_path, file_id,
                                        "The file id was deleted but its children were "
                                        "not deleted.")
1879
1880    def _after_delta_check_parents(self, parents, index):
1881        """Check that parents required by the delta are all intact.
1882
1883        :param parents: An iterable of (path_utf8, file_id) tuples which are
1884            required to be present in tree 'index' at path_utf8 with id file_id
1885            and be a directory.
1886        :param index: The column in the dirstate to check for parents in.
1887        """
1888        for dirname_utf8, file_id in parents:
1889            # Get the entry - the ensures that file_id, dirname_utf8 exists and
1890            # has the right file id.
1891            entry = self._get_entry(index, file_id, dirname_utf8)
1892            if entry[1] is None:
1893                self._raise_invalid(dirname_utf8.decode('utf8'),
1894                                    file_id, "This parent is not present.")
1895            # Parents of things must be directories
1896            if entry[1][index][0] != b'd':
1897                self._raise_invalid(dirname_utf8.decode('utf8'),
1898                                    file_id, "This parent is not a directory.")
1899
    def _observed_sha1(self, entry, sha1, stat_value,
                       _stat_to_minikind=_stat_to_minikind):
        """Note the sha1 of a file.

        :param entry: The entry the sha1 is for.
        :param sha1: The observed sha1.
        :param stat_value: The os.lstat for the file.
        """
        try:
            # 0o170000 masks st_mode down to the file-type bits.
            minikind = _stat_to_minikind[stat_value.st_mode & 0o170000]
        except KeyError:
            # Unhandled kind
            return None
        if minikind == b'f':
            if self._cutoff_time is None:
                self._sha_cutoff_time()
            # Only cache the sha1 for files older than the cutoff; newer
            # files could still be modified without the stat changing
            # detectably (see _sha_cutoff_time).
            if (stat_value.st_mtime < self._cutoff_time
                    and stat_value.st_ctime < self._cutoff_time):
                entry[1][0] = (b'f', sha1, stat_value.st_size, entry[1][0][3],
                               pack_stat(stat_value))
                self._mark_modified([entry])
1921
1922    def _sha_cutoff_time(self):
1923        """Return cutoff time.
1924
1925        Files modified more recently than this time are at risk of being
1926        undetectably modified and so can't be cached.
1927        """
1928        # Cache the cutoff time as long as we hold a lock.
1929        # time.time() isn't super expensive (approx 3.38us), but
1930        # when you call it 50,000 times it adds up.
1931        # For comparison, os.lstat() costs 7.2us if it is hot.
1932        self._cutoff_time = int(time.time()) - 3
1933        return self._cutoff_time
1934
    def _lstat(self, abspath, entry):
        """Return the os.lstat value for this path.

        :param abspath: The path to stat.
        :param entry: The dirstate entry; accepted for interface
            compatibility but not consulted here.
        """
        return os.lstat(abspath)
1938
    def _sha1_file_and_mutter(self, abspath):
        """Return the sha1 of abspath via the sha1 provider, logging the read.

        When -Dhashcache is turned on, this is monkey-patched in to log
        file reads.
        """
        trace.mutter("dirstate sha1 " + abspath)
        return self._sha1_provider.sha1(abspath)
1944
1945    def _is_executable(self, mode, old_executable):
1946        """Is this file executable?"""
1947        if self._use_filesystem_for_exec:
1948            return bool(S_IEXEC & mode)
1949        else:
1950            return old_executable
1951
1952    def _read_link(self, abspath, old_link):
1953        """Read the target of a symlink"""
1954        # TODO: jam 200700301 On Win32, this could just return the value
1955        #       already in memory. However, this really needs to be done at a
1956        #       higher level, because there either won't be anything on disk,
1957        #       or the thing on disk will be a file.
1958        fs_encoding = osutils._fs_enc
1959        if isinstance(abspath, str):
1960            # abspath is defined as the path to pass to lstat. readlink is
1961            # buggy in python < 2.6 (it doesn't encode unicode path into FS
1962            # encoding), so we need to encode ourselves knowing that unicode
1963            # paths are produced by UnicodeDirReader on purpose.
1964            abspath = abspath.encode(fs_encoding)
1965        target = os.readlink(abspath)
1966        if fs_encoding not in ('utf-8', 'ascii'):
1967            # Change encoding if needed
1968            target = target.decode(fs_encoding).encode('UTF-8')
1969        return target
1970
    def get_ghosts(self):
        """Return a list of the parent tree revision ids that are ghosts.

        Note: this returns the internal list itself, not a copy.
        """
        self._read_header_if_needed()
        return self._ghosts
1975
1976    def get_lines(self):
1977        """Serialise the entire dirstate to a sequence of lines."""
1978        if (self._header_state == DirState.IN_MEMORY_UNMODIFIED and
1979                self._dirblock_state == DirState.IN_MEMORY_UNMODIFIED):
1980            # read what's on disk.
1981            self._state_file.seek(0)
1982            return self._state_file.readlines()
1983        lines = []
1984        lines.append(self._get_parents_line(self.get_parent_ids()))
1985        lines.append(self._get_ghosts_line(self._ghosts))
1986        lines.extend(self._iter_entry_lines())
1987        return self._get_output_lines(lines)
1988
1989    def _get_ghosts_line(self, ghost_ids):
1990        """Create a line for the state file for ghost information."""
1991        return b'\0'.join([b'%d' % len(ghost_ids)] + ghost_ids)
1992
1993    def _get_parents_line(self, parent_ids):
1994        """Create a line for the state file for parents information."""
1995        return b'\0'.join([b'%d' % len(parent_ids)] + parent_ids)
1996
1997    def _iter_entry_lines(self):
1998        """Create lines for entries."""
1999        return map(self._entry_to_line, self._iter_entries())
2000
2001    def _get_fields_to_entry(self):
2002        """Get a function which converts entry fields into a entry record.
2003
2004        This handles size and executable, as well as parent records.
2005
2006        :return: A function which takes a list of fields, and returns an
2007            appropriate record for storing in memory.
2008        """
2009        # This is intentionally unrolled for performance
2010        num_present_parents = self._num_present_parents()
2011        if num_present_parents == 0:
2012            def fields_to_entry_0_parents(fields, _int=int):
2013                path_name_file_id_key = (fields[0], fields[1], fields[2])
2014                return (path_name_file_id_key, [
2015                    (  # Current tree
2016                        fields[3],                # minikind
2017                        fields[4],                # fingerprint
2018                        _int(fields[5]),          # size
2019                        fields[6] == b'y',         # executable
2020                        fields[7],                # packed_stat or revision_id
2021                    )])
2022            return fields_to_entry_0_parents
2023        elif num_present_parents == 1:
2024            def fields_to_entry_1_parent(fields, _int=int):
2025                path_name_file_id_key = (fields[0], fields[1], fields[2])
2026                return (path_name_file_id_key, [
2027                    (  # Current tree
2028                        fields[3],                # minikind
2029                        fields[4],                # fingerprint
2030                        _int(fields[5]),          # size
2031                        fields[6] == b'y',         # executable
2032                        fields[7],                # packed_stat or revision_id
2033                    ),
2034                    (  # Parent 1
2035                        fields[8],                # minikind
2036                        fields[9],                # fingerprint
2037                        _int(fields[10]),         # size
2038                        fields[11] == b'y',        # executable
2039                        fields[12],               # packed_stat or revision_id
2040                    ),
2041                    ])
2042            return fields_to_entry_1_parent
2043        elif num_present_parents == 2:
2044            def fields_to_entry_2_parents(fields, _int=int):
2045                path_name_file_id_key = (fields[0], fields[1], fields[2])
2046                return (path_name_file_id_key, [
2047                    (  # Current tree
2048                        fields[3],                # minikind
2049                        fields[4],                # fingerprint
2050                        _int(fields[5]),          # size
2051                        fields[6] == b'y',         # executable
2052                        fields[7],                # packed_stat or revision_id
2053                    ),
2054                    (  # Parent 1
2055                        fields[8],                # minikind
2056                        fields[9],                # fingerprint
2057                        _int(fields[10]),         # size
2058                        fields[11] == b'y',        # executable
2059                        fields[12],               # packed_stat or revision_id
2060                    ),
2061                    (  # Parent 2
2062                        fields[13],               # minikind
2063                        fields[14],               # fingerprint
2064                        _int(fields[15]),         # size
2065                        fields[16] == b'y',        # executable
2066                        fields[17],               # packed_stat or revision_id
2067                    ),
2068                    ])
2069            return fields_to_entry_2_parents
2070        else:
2071            def fields_to_entry_n_parents(fields, _int=int):
2072                path_name_file_id_key = (fields[0], fields[1], fields[2])
2073                trees = [(fields[cur],                # minikind
2074                          fields[cur + 1],              # fingerprint
2075                          _int(fields[cur + 2]),        # size
2076                          fields[cur + 3] == b'y',       # executable
2077                          fields[cur + 4],              # stat or revision_id
2078                          ) for cur in range(3, len(fields) - 1, 5)]
2079                return path_name_file_id_key, trees
2080            return fields_to_entry_n_parents
2081
2082    def get_parent_ids(self):
2083        """Return a list of the parent tree ids for the directory state."""
2084        self._read_header_if_needed()
2085        return list(self._parents)
2086
2087    def _get_block_entry_index(self, dirname, basename, tree_index):
2088        """Get the coordinates for a path in the state structure.
2089
2090        :param dirname: The utf8 dirname to lookup.
2091        :param basename: The utf8 basename to lookup.
2092        :param tree_index: The index of the tree for which this lookup should
2093            be attempted.
2094        :return: A tuple describing where the path is located, or should be
2095            inserted. The tuple contains four fields: the block index, the row
2096            index, the directory is present (boolean), the entire path is
2097            present (boolean).  There is no guarantee that either
2098            coordinate is currently reachable unless the found field for it is
2099            True. For instance, a directory not present in the searched tree
2100            may be returned with a value one greater than the current highest
2101            block offset. The directory present field will always be True when
2102            the path present field is True. The directory present field does
2103            NOT indicate that the directory is present in the searched tree,
2104            rather it indicates that there are at least some files in some
2105            tree present there.
2106        """
2107        self._read_dirblocks_if_needed()
2108        key = dirname, basename, b''
2109        block_index, present = self._find_block_index_from_key(key)
2110        if not present:
2111            # no such directory - return the dir index and 0 for the row.
2112            return block_index, 0, False, False
2113        block = self._dirblocks[block_index][1]  # access the entries only
2114        entry_index, present = self._find_entry_index(key, block)
2115        # linear search through entries at this path to find the one
2116        # requested.
2117        while entry_index < len(block) and block[entry_index][0][1] == basename:
2118            if block[entry_index][1][tree_index][0] not in (b'a', b'r'):
2119                # neither absent or relocated
2120                return block_index, entry_index, True, True
2121            entry_index += 1
2122        return block_index, entry_index, True, False
2123
    def _get_entry(self, tree_index, fileid_utf8=None, path_utf8=None,
                   include_deleted=False):
        """Get the dirstate entry for path in tree tree_index.

        If either file_id or path is supplied, it is used as the key to lookup.
        If both are supplied, the fastest lookup is used, and an error is
        raised if they do not both point at the same row.

        :param tree_index: The index of the tree we wish to locate this path
            in. If the path is present in that tree, the entry containing its
            details is returned, otherwise (None, None) is returned
            0 is the working tree, higher indexes are successive parent
            trees.
        :param fileid_utf8: A utf8 file_id to look up.
        :param path_utf8: An utf8 path to be looked up.
        :param include_deleted: If True, and performing a lookup via
            fileid_utf8 rather than path_utf8, return an entry for deleted
            (absent) paths.
        :return: The dirstate entry tuple for path, or (None, None)
        """
        self._read_dirblocks_if_needed()
        if path_utf8 is not None:
            if not isinstance(path_utf8, bytes):
                raise errors.BzrError('path_utf8 is not bytes: %s %r'
                                      % (type(path_utf8), path_utf8))
            # path lookups are faster
            dirname, basename = osutils.split(path_utf8)
            block_index, entry_index, dir_present, file_present = \
                self._get_block_entry_index(dirname, basename, tree_index)
            if not file_present:
                return None, None
            entry = self._dirblocks[block_index][1][entry_index]
            if not (entry[0][2] and entry[1][tree_index][0] not in (b'a', b'r')):
                raise AssertionError('unversioned entry?')
            if fileid_utf8:
                # Both keys supplied: cross-check that the row found via the
                # path carries the expected file id.
                if entry[0][2] != fileid_utf8:
                    self._changes_aborted = True
                    raise errors.BzrError('integrity error ? : mismatching'
                                          ' tree_index, file_id and path')
            return entry
        else:
            # file-id lookup: the id index yields candidate entry keys.
            possible_keys = self._get_id_index().get(fileid_utf8, ())
            if not possible_keys:
                return None, None
            for key in possible_keys:
                block_index, present = \
                    self._find_block_index_from_key(key)
                # strange, probably indicates an out of date
                # id index - for now, allow this.
                if not present:
                    continue
                # WARNING: DO not change this code to use _get_block_entry_index
                # as that function is not suitable: it does not use the key
                # to lookup, and thus the wrong coordinates are returned.
                block = self._dirblocks[block_index][1]
                entry_index, present = self._find_entry_index(key, block)
                if present:
                    entry = self._dirblocks[block_index][1][entry_index]
                    # TODO: We might want to assert that entry[0][2] ==
                    #       fileid_utf8.
                    # GZ 2017-06-09: Hoist set of minkinds somewhere
                    if entry[1][tree_index][0] in {b'f', b'd', b'l', b't'}:
                        # this is the result we are looking for: the
                        # real home of this file_id in this tree.
                        return entry
                    if entry[1][tree_index][0] == b'a':
                        # there is no home for this entry in this tree
                        if include_deleted:
                            return entry
                        return None, None
                    if entry[1][tree_index][0] != b'r':
                        raise AssertionError(
                            "entry %r has invalid minikind %r for tree %r"
                            % (entry,
                               entry[1][tree_index][0],
                               tree_index))
                    # Relocated (b'r'): the second detail field holds the
                    # path where the real entry lives; follow it.
                    real_path = entry[1][tree_index][1]
                    return self._get_entry(tree_index, fileid_utf8=fileid_utf8,
                                           path_utf8=real_path)
            return None, None
2204
2205    @classmethod
2206    def initialize(cls, path, sha1_provider=None):
2207        """Create a new dirstate on path.
2208
2209        The new dirstate will be an empty tree - that is it has no parents,
2210        and only a root node - which has id ROOT_ID.
2211
2212        :param path: The name of the file for the dirstate.
2213        :param sha1_provider: an object meeting the SHA1Provider interface.
2214            If None, a DefaultSHA1Provider is used.
2215        :return: A write-locked DirState object.
2216        """
2217        # This constructs a new DirState object on a path, sets the _state_file
2218        # to a new empty file for that path. It then calls _set_data() with our
2219        # stock empty dirstate information - a root with ROOT_ID, no children,
2220        # and no parents. Finally it calls save() to ensure that this data will
2221        # persist.
2222        if sha1_provider is None:
2223            sha1_provider = DefaultSHA1Provider()
2224        result = cls(path, sha1_provider)
2225        # root dir and root dir contents with no children.
2226        empty_tree_dirblocks = [(b'', []), (b'', [])]
2227        # a new root directory, with a NULLSTAT.
2228        empty_tree_dirblocks[0][1].append(
2229            ((b'', b'', inventory.ROOT_ID), [
2230                (b'd', b'', 0, False, DirState.NULLSTAT),
2231            ]))
2232        result.lock_write()
2233        try:
2234            result._set_data([], empty_tree_dirblocks)
2235            result.save()
2236        except:
2237            result.unlock()
2238            raise
2239        return result
2240
2241    @staticmethod
2242    def _inv_entry_to_details(inv_entry):
2243        """Convert an inventory entry (from a revision tree) to state details.
2244
2245        :param inv_entry: An inventory entry whose sha1 and link targets can be
2246            relied upon, and which has a revision set.
2247        :return: A details tuple - the details for a single tree at a path +
2248            id.
2249        """
2250        kind = inv_entry.kind
2251        minikind = DirState._kind_to_minikind[kind]
2252        tree_data = inv_entry.revision
2253        if kind == 'directory':
2254            fingerprint = b''
2255            size = 0
2256            executable = False
2257        elif kind == 'symlink':
2258            if inv_entry.symlink_target is None:
2259                fingerprint = b''
2260            else:
2261                fingerprint = inv_entry.symlink_target.encode('utf8')
2262            size = 0
2263            executable = False
2264        elif kind == 'file':
2265            fingerprint = inv_entry.text_sha1 or b''
2266            size = inv_entry.text_size or 0
2267            executable = inv_entry.executable
2268        elif kind == 'tree-reference':
2269            fingerprint = inv_entry.reference_revision or b''
2270            size = 0
2271            executable = False
2272        else:
2273            raise Exception("can't pack %s" % inv_entry)
2274        return static_tuple.StaticTuple(minikind, fingerprint, size,
2275                                        executable, tree_data)
2276
    def _iter_child_entries(self, tree_index, path_utf8):
        """Iterate over all the entries that are children of path_utf8.

        This only returns entries that are present (not in b'a', b'r') in
        tree_index. tree_index data is not refreshed, so if tree 0 is used,
        results may differ from that obtained if paths were statted to
        determine what ones were directories.

        Asking for the children of a non-directory will return an empty
        iterator.
        """
        # Breadth-first walk: process one generation of directories, queuing
        # any subdirectories encountered for the next pass.
        pending_dirs = []
        next_pending_dirs = [path_utf8]
        absent = (b'a', b'r')
        while next_pending_dirs:
            pending_dirs = next_pending_dirs
            next_pending_dirs = []
            for path in pending_dirs:
                block_index, present = self._find_block_index_from_key(
                    (path, b'', b''))
                if block_index == 0:
                    # Block 0 holds the root entry itself; the root's
                    # children live in block 1.
                    block_index = 1
                    if len(self._dirblocks) == 1:
                        # asked for the children of the root with no other
                        # contents.
                        return
                if not present:
                    # children of a non-directory asked for.
                    continue
                block = self._dirblocks[block_index]
                for entry in block[1]:
                    kind = entry[1][tree_index][0]
                    if kind not in absent:
                        yield entry
                    if kind == b'd':
                        # Queue this subdirectory's relpath for the next
                        # generation of the walk.
                        if entry[0][0]:
                            path = entry[0][0] + b'/' + entry[0][1]
                        else:
                            path = entry[0][1]
                        next_pending_dirs.append(path)
2317
2318    def _iter_entries(self):
2319        """Iterate over all the entries in the dirstate.
2320
2321        Each yelt item is an entry in the standard format described in the
2322        docstring of breezy.dirstate.
2323        """
2324        self._read_dirblocks_if_needed()
2325        for directory in self._dirblocks:
2326            for entry in directory[1]:
2327                yield entry
2328
2329    def _get_id_index(self):
2330        """Get an id index of self._dirblocks.
2331
2332        This maps from file_id => [(directory, name, file_id)] entries where
2333        that file_id appears in one of the trees.
2334        """
2335        if self._id_index is None:
2336            id_index = {}
2337            for key, tree_details in self._iter_entries():
2338                self._add_to_id_index(id_index, key)
2339            self._id_index = id_index
2340        return self._id_index
2341
2342    def _add_to_id_index(self, id_index, entry_key):
2343        """Add this entry to the _id_index mapping."""
2344        # This code used to use a set for every entry in the id_index. However,
2345        # it is *rare* to have more than one entry. So a set is a large
2346        # overkill. And even when we do, we won't ever have more than the
2347        # number of parent trees. Which is still a small number (rarely >2). As
2348        # such, we use a simple tuple, and do our own uniqueness checks. While
2349        # the 'in' check is O(N) since N is nicely bounded it shouldn't ever
2350        # cause quadratic failure.
2351        file_id = entry_key[2]
2352        entry_key = static_tuple.StaticTuple.from_sequence(entry_key)
2353        if file_id not in id_index:
2354            id_index[file_id] = static_tuple.StaticTuple(entry_key,)
2355        else:
2356            entry_keys = id_index[file_id]
2357            if entry_key not in entry_keys:
2358                id_index[file_id] = entry_keys + (entry_key,)
2359
    def _remove_from_id_index(self, id_index, entry_key):
        """Remove this entry from the _id_index mapping.

        It is a programming error to call this when the entry_key is not
        already present.
        """
        file_id = entry_key[2]
        entry_keys = list(id_index[file_id])
        # list.remove raises ValueError when entry_key is absent, surfacing
        # the programming error described above.
        entry_keys.remove(entry_key)
        id_index[file_id] = static_tuple.StaticTuple.from_sequence(entry_keys)
2370
2371    def _get_output_lines(self, lines):
2372        """Format lines for final output.
2373
2374        :param lines: A sequence of lines containing the parents list and the
2375            path lines.
2376        """
2377        output_lines = [DirState.HEADER_FORMAT_3]
2378        lines.append(b'')  # a final newline
2379        inventory_text = b'\0\n\0'.join(lines)
2380        output_lines.append(b'crc32: %d\n' % (zlib.crc32(inventory_text),))
2381        # -3, 1 for num parents, 1 for ghosts, 1 for final newline
2382        num_entries = len(lines) - 3
2383        output_lines.append(b'num_entries: %d\n' % (num_entries,))
2384        output_lines.append(inventory_text)
2385        return output_lines
2386
    def _make_deleted_row(self, fileid_utf8, parents):
        """Return a deleted row for fileid_utf8.

        NOTE(review): the first element here is a 7-tuple, not the usual
        (dirname, basename, file_id) entry key used elsewhere - confirm
        whether any caller still depends on this layout.
        """
        return (b'/', b'RECYCLED.BIN', b'file', fileid_utf8, 0, DirState.NULLSTAT,
                b''), parents
2391
    def _num_present_parents(self):
        """The number of parent entries in each record row.

        Ghosts are subtracted - presumably because ghost parents carry no
        per-row details (confirm against the serialisation format).
        """
        return len(self._parents) - len(self._ghosts)
2395
2396    @classmethod
2397    def on_file(cls, path, sha1_provider=None, worth_saving_limit=0,
2398                use_filesystem_for_exec=True):
2399        """Construct a DirState on the file at path "path".
2400
2401        :param path: The path at which the dirstate file on disk should live.
2402        :param sha1_provider: an object meeting the SHA1Provider interface.
2403            If None, a DefaultSHA1Provider is used.
2404        :param worth_saving_limit: when the exact number of hash changed
2405            entries is known, only bother saving the dirstate if more than
2406            this count of entries have changed. -1 means never save.
2407        :param use_filesystem_for_exec: Whether to trust the filesystem
2408            for executable bit information
2409        :return: An unlocked DirState object, associated with the given path.
2410        """
2411        if sha1_provider is None:
2412            sha1_provider = DefaultSHA1Provider()
2413        result = cls(path, sha1_provider,
2414                     worth_saving_limit=worth_saving_limit,
2415                     use_filesystem_for_exec=use_filesystem_for_exec)
2416        return result
2417
    def _read_dirblocks_if_needed(self):
        """Read in all the dirblocks from the file if they are not in memory.

        This populates self._dirblocks, and sets self._dirblock_state to
        IN_MEMORY_UNMODIFIED. It is not currently ready for incremental block
        loading.
        """
        self._read_header_if_needed()
        if self._dirblock_state == DirState.NOT_IN_MEMORY:
            # _read_dirblocks is a module-level helper defined elsewhere in
            # this file.
            _read_dirblocks(self)
2428
2429    def _read_header(self):
2430        """This reads in the metadata header, and the parent ids.
2431
2432        After reading in, the file should be positioned at the null
2433        just before the start of the first record in the file.
2434
2435        :return: (expected crc checksum, number of entries, parent list)
2436        """
2437        self._read_prelude()
2438        parent_line = self._state_file.readline()
2439        info = parent_line.split(b'\0')
2440        num_parents = int(info[0])
2441        self._parents = info[1:-1]
2442        ghost_line = self._state_file.readline()
2443        info = ghost_line.split(b'\0')
2444        num_ghosts = int(info[1])
2445        self._ghosts = info[2:-1]
2446        self._header_state = DirState.IN_MEMORY_UNMODIFIED
2447        self._end_of_header = self._state_file.tell()
2448
    def _read_header_if_needed(self):
        """Read the header of the dirstate file if needed."""
        # inline this as it will be called a lot
        if not self._lock_token:
            # Reading without a lock would let concurrent writers invalidate
            # the cached state underneath us.
            raise errors.ObjectNotLocked(self)
        if self._header_state == DirState.NOT_IN_MEMORY:
            self._read_header()
2456
2457    def _read_prelude(self):
2458        """Read in the prelude header of the dirstate file.
2459
2460        This only reads in the stuff that is not connected to the crc
2461        checksum. The position will be correct to read in the rest of
2462        the file and check the checksum after this point.
2463        The next entry in the file should be the number of parents,
2464        and their ids. Followed by a newline.
2465        """
2466        header = self._state_file.readline()
2467        if header != DirState.HEADER_FORMAT_3:
2468            raise errors.BzrError(
2469                'invalid header line: %r' % (header,))
2470        crc_line = self._state_file.readline()
2471        if not crc_line.startswith(b'crc32: '):
2472            raise errors.BzrError('missing crc32 checksum: %r' % crc_line)
2473        self.crc_expected = int(crc_line[len(b'crc32: '):-1])
2474        num_entries_line = self._state_file.readline()
2475        if not num_entries_line.startswith(b'num_entries: '):
2476            raise errors.BzrError('missing num_entries line')
2477        self._num_entries = int(num_entries_line[len(b'num_entries: '):-1])
2478
    def sha1_from_stat(self, path, stat_result):
        """Find a cached sha1 given a stat lookup.

        :param path: Unused here - presumably kept for interface symmetry
            with other sha1 helpers (confirm against callers).
        :param stat_result: The os.lstat result to look up by.
        :return: The cached sha1, or None if the packed stat is unknown.
        """
        return self._get_packed_stat_index().get(pack_stat(stat_result), None)
2482
2483    def _get_packed_stat_index(self):
2484        """Get a packed_stat index of self._dirblocks."""
2485        if self._packed_stat_index is None:
2486            index = {}
2487            for key, tree_details in self._iter_entries():
2488                if tree_details[0][0] == b'f':
2489                    index[tree_details[0][4]] = tree_details[0][1]
2490            self._packed_stat_index = index
2491        return self._packed_stat_index
2492
    def save(self):
        """Save any pending changes created during this session.

        We reuse the existing file, because that prevents race conditions with
        file creation, and use oslocks on it to prevent concurrent modification
        and reads - because dirstate's incremental data aggregation is not
        compatible with reading a modified file, and replacing a file in use by
        another process is impossible on Windows.

        A dirstate in read only mode should be smart enough though to validate
        that the file has not changed, and otherwise discard its cache and
        start over, to allow for fine grained read lock duration, so 'status'
        wont block 'commit' - for example.
        """
        if self._changes_aborted:
            # Should this be a warning? For now, I'm expecting that places that
            # mark it inconsistent will warn, making a warning here redundant.
            trace.mutter('Not saving DirState because '
                         '_changes_aborted is set.')
            return
        # TODO: Since we now distinguish IN_MEMORY_MODIFIED from
        #       IN_MEMORY_HASH_MODIFIED, we should only fail quietly if we fail
        #       to save an IN_MEMORY_HASH_MODIFIED, and fail *noisily* if we
        #       fail to save IN_MEMORY_MODIFIED
        if not self._worth_saving():
            return

        grabbed_write_lock = False
        if self._lock_state != 'w':
            grabbed_write_lock, new_lock = self._lock_token.temporary_write_lock()
            # Switch over to the new lock, as the old one may be closed.
            # TODO: jam 20070315 We should validate the disk file has
            #       not changed contents, since temporary_write_lock may
            #       not be an atomic operation.
            self._lock_token = new_lock
            self._state_file = new_lock.f
            if not grabbed_write_lock:
                # We couldn't grab a write lock, so we switch back to a read one
                return
        try:
            lines = self.get_lines()
            self._state_file.seek(0)
            self._state_file.writelines(lines)
            # Truncate in case the new serialisation is shorter than what
            # was previously on disk.
            self._state_file.truncate()
            self._state_file.flush()
            self._maybe_fdatasync()
            self._mark_unmodified()
        finally:
            if grabbed_write_lock:
                # Drop back to the read lock we temporarily upgraded from.
                self._lock_token = self._lock_token.restore_read_lock()
                self._state_file = self._lock_token.f
                # TODO: jam 20070315 We should validate the disk file has
                #       not changed contents. Since restore_read_lock may
                #       not be an atomic operation.
2547
    def _maybe_fdatasync(self):
        """Flush to disk if possible and if not configured off."""
        # Controlled by the 'dirstate.fdatasync' configuration option.
        if self._config_stack.get('dirstate.fdatasync'):
            osutils.fdatasync(self._state_file.fileno())
2552
2553    def _worth_saving(self):
2554        """Is it worth saving the dirstate or not?"""
2555        if (self._header_state == DirState.IN_MEMORY_MODIFIED
2556                or self._dirblock_state == DirState.IN_MEMORY_MODIFIED):
2557            return True
2558        if self._dirblock_state == DirState.IN_MEMORY_HASH_MODIFIED:
2559            if self._worth_saving_limit == -1:
2560                # We never save hash changes when the limit is -1
2561                return False
2562            # If we're using smart saving and only a small number of
2563            # entries have changed their hash, don't bother saving. John has
2564            # suggested using a heuristic here based on the size of the
2565            # changed files and/or tree. For now, we go with a configurable
2566            # number of changes, keeping the calculation time
2567            # as low overhead as possible. (This also keeps all existing
2568            # tests passing as the default is 0, i.e. always save.)
2569            if len(self._known_hash_changes) >= self._worth_saving_limit:
2570                return True
2571        return False
2572
2573    def _set_data(self, parent_ids, dirblocks):
2574        """Set the full dirstate data in memory.
2575
2576        This is an internal function used to completely replace the objects
2577        in memory state. It puts the dirstate into state 'full-dirty'.
2578
2579        :param parent_ids: A list of parent tree revision ids.
2580        :param dirblocks: A list containing one tuple for each directory in the
2581            tree. Each tuple contains the directory path and a list of entries
2582            found in that directory.
2583        """
2584        # our memory copy is now authoritative.
2585        self._dirblocks = dirblocks
2586        self._mark_modified(header_modified=True)
2587        self._parents = list(parent_ids)
2588        self._id_index = None
2589        self._packed_stat_index = None
2590
2591    def set_path_id(self, path, new_id):
2592        """Change the id of path to new_id in the current working tree.
2593
2594        :param path: The path inside the tree to set - b'' is the root, 'foo'
2595            is the path foo in the root.
2596        :param new_id: The new id to assign to the path. This must be a utf8
2597            file id (not unicode, and not None).
2598        """
2599        self._read_dirblocks_if_needed()
2600        if len(path):
2601            # TODO: logic not written
2602            raise NotImplementedError(self.set_path_id)
2603        # TODO: check new id is unique
2604        entry = self._get_entry(0, path_utf8=path)
2605        if entry[0][2] == new_id:
2606            # Nothing to change.
2607            return
2608        if new_id.__class__ != bytes:
2609            raise AssertionError(
2610                "must be a utf8 file_id not %s" % (type(new_id), ))
2611        # mark the old path absent, and insert a new root path
2612        self._make_absent(entry)
2613        self.update_minimal((b'', b'', new_id), b'd',
2614                            path_utf8=b'', packed_stat=entry[1][0][4])
2615        self._mark_modified()
2616
    def set_parent_trees(self, trees, ghosts):
        """Set the parent trees for the dirstate.

        Rebuilds the full per-entry details matrix: column 0 (the working
        tree) is kept, and one column is regenerated for each non-ghost
        parent tree.

        :param trees: A list of revision_id, tree tuples. tree must be provided
            even if the revision_id refers to a ghost: supply an empty tree in
            this case.
        :param ghosts: A list of the revision_ids that are ghosts at the time
            of setting.
        """
        # TODO: generate a list of parent indexes to preserve to save
        # processing specific parent trees. In the common case one tree will
        # be preserved - the left most parent.
        # TODO: if the parent tree is a dirstate, we might want to walk them
        # all by path in parallel for 'optimal' common-case performance.
        # generate new root row.
        self._read_dirblocks_if_needed()
        # TODO future sketch: Examine the existing parents to generate a change
        # map and then walk the new parent trees only, mapping them into the
        # dirstate. Walk the dirstate at the same time to remove unreferenced
        # entries.
        # for now:
        # sketch: loop over all entries in the dirstate, cherry picking
        # entries from the parent trees, if they are not ghost trees.
        # after we finish walking the dirstate, all entries not in the dirstate
        # are deletes, so we want to append them to the end as per the design
        # discussions. So do a set difference on ids with the parents to
        # get deletes, and add them to the end.
        # During the update process we need to answer the following questions:
        # - find other keys containing a fileid in order to create cross-path
        #   links. We don't trivially use the inventory from other trees
        #   because this leads to either double touching, or to accessing
        #   missing keys,
        # - find other keys containing a path
        # We accumulate each entry via this dictionary, including the root
        by_path = {}
        id_index = {}
        # we could do parallel iterators, but because file id data may be
        # scattered throughout, we don't save on index overhead: we have to
        # look at everything anyway. We can probably save cycles by reusing
        # parent data and doing an incremental update when adding an
        # additional parent, but for now the common cases are adding a new
        # parent (merge), and replacing completely (commit), and commit is
        # more common: so optimise merge later.

        # ---- start generation of full tree mapping data
        # what trees should we use? Ghosts contribute no entries.
        parent_trees = [tree for rev_id, tree in trees if rev_id not in ghosts]
        # how many trees do we end up with
        parent_count = len(parent_trees)
        st = static_tuple.StaticTuple

        # one: the current tree. Seed each row with the working-tree details
        # plus an absent placeholder per parent column.
        for entry in self._iter_entries():
            # skip entries not in the current tree
            if entry[1][0][0] in (b'a', b'r'):  # absent, relocated
                continue
            by_path[entry[0]] = [entry[1][0]] + \
                [DirState.NULL_PARENT_DETAILS] * parent_count
            # TODO: Possibly inline this, since we know it isn't present yet
            #       id_index[entry[0][2]] = (entry[0],)
            self._add_to_id_index(id_index, entry[0])

        # now the parent trees:
        for tree_index, tree in enumerate(parent_trees):
            # the index is off by one, adjust it: column 0 is the working tree.
            tree_index = tree_index + 1
            # when we add new locations for a fileid we need these ranges for
            # any fileid in this tree as we set the by_path[id] to:
            # already_processed_tree_details + new_details + new_location_suffix
            # the suffix is from tree_index+1:parent_count+1.
            new_location_suffix = [
                DirState.NULL_PARENT_DETAILS] * (parent_count - tree_index)
            # now stitch in all the entries from this tree
            last_dirname = None
            for path, entry in tree.iter_entries_by_dir():
                # here we process each trees details for each item in the tree.
                # we first update any existing entries for the id at other paths,
                # then we either create or update the entry for the id at the
                # right path, and finally we add (if needed) a mapping from
                # file_id to this path. We do it in this order to allow us to
                # avoid checking all known paths for the id when generating a
                # new entry at this path: by adding the id->path mapping last,
                # all the mappings are valid and have correct relocation
                # records where needed.
                file_id = entry.file_id
                path_utf8 = path.encode('utf8')
                dirname, basename = osutils.split(path_utf8)
                if dirname == last_dirname:
                    # Try to re-use objects as much as possible
                    dirname = last_dirname
                else:
                    last_dirname = dirname
                new_entry_key = st(dirname, basename, file_id)
                # tree index consistency: All other paths for this id in this tree
                # index must point to the correct path.
                entry_keys = id_index.get(file_id, ())
                for entry_key in entry_keys:
                    # TODO:PROFILING: It might be faster to just update
                    # rather than checking if we need to, and then overwrite
                    # the one we are located at.
                    if entry_key != new_entry_key:
                        # this file id is at a different path in one of the
                        # other trees, so put absent pointers there
                        # This is the vertical axis in the matrix, all pointing
                        # to the real path.
                        by_path[entry_key][tree_index] = st(b'r', path_utf8, 0,
                                                            False, b'')
                # by path consistency: Insert into an existing path record
                # (trivial), or add a new one with relocation pointers for the
                # other tree indexes.
                if new_entry_key in entry_keys:
                    # there is already an entry where this data belongs, just
                    # insert it.
                    by_path[new_entry_key][tree_index] = \
                        self._inv_entry_to_details(entry)
                else:
                    # add relocated entries to the horizontal axis - this row
                    # mapping from path,id. We need to look up the correct path
                    # for the indexes from 0 to tree_index -1
                    new_details = []
                    for lookup_index in range(tree_index):
                        # boundary case: this is the first occurrence of
                        # file_id so there are no id_indexes, possibly take
                        # this out of the loop?
                        if not len(entry_keys):
                            new_details.append(DirState.NULL_PARENT_DETAILS)
                        else:
                            # grab any one entry, use it to find the right path.
                            a_key = next(iter(entry_keys))
                            if by_path[a_key][lookup_index][0] in (b'r', b'a'):
                                # its a pointer or missing statement, use it as
                                # is.
                                new_details.append(
                                    by_path[a_key][lookup_index])
                            else:
                                # we have the right key, make a pointer to it.
                                real_path = (b'/'.join(a_key[0:2])).strip(b'/')
                                new_details.append(st(b'r', real_path, 0, False,
                                                      b''))
                    new_details.append(self._inv_entry_to_details(entry))
                    new_details.extend(new_location_suffix)
                    by_path[new_entry_key] = new_details
                    self._add_to_id_index(id_index, new_entry_key)
        # --- end generation of full tree mappings

        # sort and output all the entries
        new_entries = self._sort_entries(by_path.items())
        self._entries_to_current_state(new_entries)
        # Record all parents (including ghosts) in the header.
        self._parents = [rev_id for rev_id, tree in trees]
        self._ghosts = list(ghosts)
        self._mark_modified(header_modified=True)
        self._id_index = id_index
2769
2770    def _sort_entries(self, entry_list):
2771        """Given a list of entries, sort them into the right order.
2772
2773        This is done when constructing a new dirstate from trees - normally we
2774        try to keep everything in sorted blocks all the time, but sometimes
2775        it's easier to sort after the fact.
2776        """
2777        # When sorting, we usually have 10x more entries than directories. (69k
2778        # total entries, 4k directories). So cache the results of splitting.
2779        # Saving time and objects. Also, use StaticTuple to avoid putting all
2780        # of these object into python's garbage collector.
2781        split_dirs = {}
2782
2783        def _key(entry, _split_dirs=split_dirs, _st=static_tuple.StaticTuple):
2784            # sort by: directory parts, file name, file id
2785            dirpath, fname, file_id = entry[0]
2786            try:
2787                split = _split_dirs[dirpath]
2788            except KeyError:
2789                split = _st.from_sequence(dirpath.split(b'/'))
2790                _split_dirs[dirpath] = split
2791            return _st(split, fname, file_id)
2792        return sorted(entry_list, key=_key)
2793
    def set_state_from_inventory(self, new_inv):
        """Set new_inv as the current state.

        This API is called by tree transform, and will usually occur with
        existing parent trees.

        :param new_inv: The inventory object to set current state from.
        """
        if 'evil' in debug.debug_flags:
            trace.mutter_callsite(1,
                                  "set_state_from_inventory called; please mutate the tree instead")
        tracing = 'dirstate' in debug.debug_flags
        if tracing:
            trace.mutter("set_state_from_inventory trace:")
        self._read_dirblocks_if_needed()
        # sketch:
        # Two iterators: current data and new data, both in dirblock order.
        # We zip them together, which tells about entries that are new in the
        # inventory, or removed in the inventory, or present in both and
        # possibly changed.
        #
        # You might think we could just synthesize a new dirstate directly
        # since we're processing it in the right order.  However, we need to
        # also consider there may be any number of parent trees and relocation
        # pointers, and we don't want to duplicate that here.
        new_iterator = new_inv.iter_entries_by_dir()
        # we will be modifying the dirstate, so we need a stable iterator. In
        # future we might write one, for now we just clone the state into a
        # list using a copy so that we see every original item and don't have
        # to adjust the position when items are inserted or deleted in the
        # underlying dirstate.
        old_iterator = iter(list(self._iter_entries()))
        # both must have roots so this is safe:
        current_new = next(new_iterator)
        current_old = next(old_iterator)

        def advance(iterator):
            # Step an iterator, mapping exhaustion to None so the merge
            # loop below can test for the end condition explicitly.
            try:
                return next(iterator)
            except StopIteration:
                return None
        while current_new or current_old:
            # skip entries in old that are not really there
            if current_old and current_old[1][0][0] in (b'a', b'r'):
                # relocated or absent
                current_old = advance(old_iterator)
                continue
            if current_new:
                # convert new into dirblock style
                new_path_utf8 = current_new[0].encode('utf8')
                new_dirname, new_basename = osutils.split(new_path_utf8)
                new_id = current_new[1].file_id
                new_entry_key = (new_dirname, new_basename, new_id)
                current_new_minikind = \
                    DirState._kind_to_minikind[current_new[1].kind]
                if current_new_minikind == b't':
                    fingerprint = current_new[1].reference_revision or b''
                else:
                    # We normally only insert or remove records, or update
                    # them when it has significantly changed.  Then we want to
                    # erase its fingerprint.  Unaffected records should
                    # normally not be updated at all.
                    fingerprint = b''
            else:
                # for safety disable variables
                new_path_utf8 = new_dirname = new_basename = new_id = \
                    new_entry_key = None
            # 5 cases, we don't have a value that is strictly greater than
            # everything, so we make both end conditions explicit
            if not current_old:
                # old is finished: insert current_new into the state.
                if tracing:
                    trace.mutter("Appending from new '%s'.",
                                 new_path_utf8.decode('utf8'))
                self.update_minimal(new_entry_key, current_new_minikind,
                                    executable=current_new[1].executable,
                                    path_utf8=new_path_utf8, fingerprint=fingerprint,
                                    fullscan=True)
                current_new = advance(new_iterator)
            elif not current_new:
                # new is finished
                if tracing:
                    trace.mutter("Truncating from old '%s/%s'.",
                                 current_old[0][0].decode('utf8'),
                                 current_old[0][1].decode('utf8'))
                self._make_absent(current_old)
                current_old = advance(old_iterator)
            elif new_entry_key == current_old[0]:
                # same -  common case
                # We're looking at the same path and id in both the dirstate
                # and inventory, so just need to update the fields in the
                # dirstate from the one in the inventory.
                # TODO: update the record if anything significant has changed.
                # the minimal required trigger is if the execute bit or cached
                # kind has changed.
                if (current_old[1][0][3] != current_new[1].executable or
                        current_old[1][0][0] != current_new_minikind):
                    if tracing:
                        trace.mutter("Updating in-place change '%s'.",
                                     new_path_utf8.decode('utf8'))
                    self.update_minimal(current_old[0], current_new_minikind,
                                        executable=current_new[1].executable,
                                        path_utf8=new_path_utf8, fingerprint=fingerprint,
                                        fullscan=True)
                # both sides are dealt with, move on
                current_old = advance(old_iterator)
                current_new = advance(new_iterator)
            elif (lt_by_dirs(new_dirname, current_old[0][0])
                  or (new_dirname == current_old[0][0] and
                      new_entry_key[1:] < current_old[0][1:])):
                # new comes before:
                # add a entry for this and advance new
                if tracing:
                    trace.mutter("Inserting from new '%s'.",
                                 new_path_utf8.decode('utf8'))
                self.update_minimal(new_entry_key, current_new_minikind,
                                    executable=current_new[1].executable,
                                    path_utf8=new_path_utf8, fingerprint=fingerprint,
                                    fullscan=True)
                current_new = advance(new_iterator)
            else:
                # we've advanced past the place where the old key would be,
                # without seeing it in the new list.  so it must be gone.
                if tracing:
                    trace.mutter("Deleting from old '%s/%s'.",
                                 current_old[0][0].decode('utf8'),
                                 current_old[0][1].decode('utf8'))
                self._make_absent(current_old)
                current_old = advance(old_iterator)
        self._mark_modified()
        # Cached indexes refer to the old entries; drop them for rebuild.
        self._id_index = None
        self._packed_stat_index = None
        if tracing:
            trace.mutter("set_state_from_inventory complete.")
2928
2929    def set_state_from_scratch(self, working_inv, parent_trees, parent_ghosts):
2930        """Wipe the currently stored state and set it to something new.
2931
2932        This is a hard-reset for the data we are working with.
2933        """
2934        # Technically, we really want a write lock, but until we write, we
2935        # don't really need it.
2936        self._requires_lock()
2937        # root dir and root dir contents with no children. We have to have a
2938        # root for set_state_from_inventory to work correctly.
2939        empty_root = ((b'', b'', inventory.ROOT_ID),
2940                      [(b'd', b'', 0, False, DirState.NULLSTAT)])
2941        empty_tree_dirblocks = [(b'', [empty_root]), (b'', [])]
2942        self._set_data([], empty_tree_dirblocks)
2943        self.set_state_from_inventory(working_inv)
2944        self.set_parent_trees(parent_trees, parent_ghosts)
2945
    def _make_absent(self, current_old):
        """Mark current_old - an entry - as absent for tree 0.

        :param current_old: A (key, details-list) dirstate entry whose
            working-tree column should be removed.
        :return: True if this was the last details entry for the entry key:
            that is, if the underlying block has had the entry removed, thus
            shrinking in length.
        """
        # build up paths that this id will be left at after the change is made,
        # so we can update their cross references in tree 0
        all_remaining_keys = set()
        # Don't check the working tree, because it's going.
        for details in current_old[1][1:]:
            if details[0] not in (b'a', b'r'):  # absent, relocated
                # A parent tree has the id live at this same path.
                all_remaining_keys.add(current_old[0])
            elif details[0] == b'r':  # relocated
                # record the key for the real path.
                all_remaining_keys.add(
                    tuple(osutils.split(details[1])) + (current_old[0][2],))
            # absent rows are not present at any path.
        last_reference = current_old[0] not in all_remaining_keys
        if last_reference:
            # the current row consists entirely of the current item (being
            # marked absent), and relocated or absent entries for the other
            # trees: Remove it, it's meaningless.
            block = self._find_block(current_old[0])
            entry_index, present = self._find_entry_index(
                current_old[0], block[1])
            if not present:
                raise AssertionError(
                    'could not find entry for %s' % (current_old,))
            block[1].pop(entry_index)
            # if we have an id_index in use, remove this key from it for this id.
            if self._id_index is not None:
                self._remove_from_id_index(self._id_index, current_old[0])
        # update all remaining keys for this id to record it as absent. The
        # existing details may either be the record we are marking as deleted
        # (if there were other trees with the id present at this path), or may
        # be relocations.
        for update_key in all_remaining_keys:
            update_block_index, present = \
                self._find_block_index_from_key(update_key)
            if not present:
                raise AssertionError(
                    'could not find block for %s' % (update_key,))
            update_entry_index, present = \
                self._find_entry_index(
                    update_key, self._dirblocks[update_block_index][1])
            if not present:
                raise AssertionError(
                    'could not find entry for %s' % (update_key,))
            update_tree_details = self._dirblocks[update_block_index][1][update_entry_index][1]
            # it must not be absent at the moment
            if update_tree_details[0][0] == b'a':  # absent
                raise AssertionError('bad row %r' % (update_tree_details,))
            update_tree_details[0] = DirState.NULL_PARENT_DETAILS
        self._mark_modified()
        return last_reference
3003
3004    def update_minimal(self, key, minikind, executable=False, fingerprint=b'',
3005                       packed_stat=None, size=0, path_utf8=None, fullscan=False):
3006        """Update an entry to the state in tree 0.
3007
3008        This will either create a new entry at 'key' or update an existing one.
3009        It also makes sure that any other records which might mention this are
3010        updated as well.
3011
3012        :param key: (dir, name, file_id) for the new entry
3013        :param minikind: The type for the entry (b'f' == 'file', b'd' ==
3014                'directory'), etc.
3015        :param executable: Should the executable bit be set?
3016        :param fingerprint: Simple fingerprint for new entry: canonical-form
3017            sha1 for files, referenced revision id for subtrees, etc.
3018        :param packed_stat: Packed stat value for new entry.
3019        :param size: Size information for new entry
3020        :param path_utf8: key[0] + '/' + key[1], just passed in to avoid doing
3021                extra computation.
3022        :param fullscan: If True then a complete scan of the dirstate is being
3023            done and checking for duplicate rows should not be done. This
3024            should only be set by set_state_from_inventory and similar methods.
3025
3026        If packed_stat and fingerprint are not given, they're invalidated in
3027        the entry.
3028        """
3029        block = self._find_block(key)[1]
3030        if packed_stat is None:
3031            packed_stat = DirState.NULLSTAT
3032        # XXX: Some callers pass b'' as the packed_stat, and it seems to be
3033        # sometimes present in the dirstate - this seems oddly inconsistent.
3034        # mbp 20071008
3035        entry_index, present = self._find_entry_index(key, block)
3036        new_details = (minikind, fingerprint, size, executable, packed_stat)
3037        id_index = self._get_id_index()
3038        if not present:
3039            # New record. Check there isn't a entry at this path already.
3040            if not fullscan:
3041                low_index, _ = self._find_entry_index(key[0:2] + (b'',), block)
3042                while low_index < len(block):
3043                    entry = block[low_index]
3044                    if entry[0][0:2] == key[0:2]:
3045                        if entry[1][0][0] not in (b'a', b'r'):
3046                            # This entry has the same path (but a different id) as
3047                            # the new entry we're adding, and is present in ths
3048                            # tree.
3049                            self._raise_invalid(
3050                                (b"%s/%s" % key[0:2]).decode('utf8'), key[2],
3051                                "Attempt to add item at path already occupied by "
3052                                "id %r" % entry[0][2])
3053                        low_index += 1
3054                    else:
3055                        break
3056            # new entry, synthesis cross reference here,
3057            existing_keys = id_index.get(key[2], ())
3058            if not existing_keys:
3059                # not currently in the state, simplest case
3060                new_entry = key, [new_details] + self._empty_parent_info()
3061            else:
3062                # present at one or more existing other paths.
3063                # grab one of them and use it to generate parent
3064                # relocation/absent entries.
3065                new_entry = key, [new_details]
3066                # existing_keys can be changed as we iterate.
3067                for other_key in tuple(existing_keys):
3068                    # change the record at other to be a pointer to this new
3069                    # record. The loop looks similar to the change to
3070                    # relocations when updating an existing record but its not:
3071                    # the test for existing kinds is different: this can be
3072                    # factored out to a helper though.
3073                    other_block_index, present = self._find_block_index_from_key(
3074                        other_key)
3075                    if not present:
3076                        raise AssertionError('could not find block for %s' % (
3077                            other_key,))
3078                    other_block = self._dirblocks[other_block_index][1]
3079                    other_entry_index, present = self._find_entry_index(
3080                        other_key, other_block)
3081                    if not present:
3082                        raise AssertionError(
3083                            'update_minimal: could not find other entry for %s'
3084                            % (other_key,))
3085                    if path_utf8 is None:
3086                        raise AssertionError('no path')
3087                    # Turn this other location into a reference to the new
3088                    # location. This also updates the aliased iterator
3089                    # (current_old in set_state_from_inventory) so that the old
3090                    # entry, if not already examined, is skipped over by that
3091                    # loop.
3092                    other_entry = other_block[other_entry_index]
3093                    other_entry[1][0] = (b'r', path_utf8, 0, False, b'')
3094                    if self._maybe_remove_row(other_block, other_entry_index,
3095                                              id_index):
3096                        # If the row holding this was removed, we need to
3097                        # recompute where this entry goes
3098                        entry_index, _ = self._find_entry_index(key, block)
3099
3100                # This loop:
3101                # adds a tuple to the new details for each column
3102                #  - either by copying an existing relocation pointer inside that column
3103                #  - or by creating a new pointer to the right row inside that column
3104                num_present_parents = self._num_present_parents()
3105                if num_present_parents:
3106                    # TODO: This re-evaluates the existing_keys set, do we need
3107                    #       to do that ourselves?
3108                    other_key = list(existing_keys)[0]
3109                for lookup_index in range(1, num_present_parents + 1):
3110                    # grab any one entry, use it to find the right path.
3111                    # TODO: optimise this to reduce memory use in highly
3112                    # fragmented situations by reusing the relocation
3113                    # records.
3114                    update_block_index, present = \
3115                        self._find_block_index_from_key(other_key)
3116                    if not present:
3117                        raise AssertionError(
3118                            'could not find block for %s' % (other_key,))
3119                    update_entry_index, present = \
3120                        self._find_entry_index(
3121                            other_key, self._dirblocks[update_block_index][1])
3122                    if not present:
3123                        raise AssertionError(
3124                            'update_minimal: could not find entry for %s' % (other_key,))
3125                    update_details = self._dirblocks[update_block_index][1][update_entry_index][1][lookup_index]
3126                    if update_details[0] in (b'a', b'r'):  # relocated, absent
3127                        # its a pointer or absent in lookup_index's tree, use
3128                        # it as is.
3129                        new_entry[1].append(update_details)
3130                    else:
3131                        # we have the right key, make a pointer to it.
3132                        pointer_path = osutils.pathjoin(*other_key[0:2])
3133                        new_entry[1].append(
3134                            (b'r', pointer_path, 0, False, b''))
3135            block.insert(entry_index, new_entry)
3136            self._add_to_id_index(id_index, key)
3137        else:
3138            # Does the new state matter?
3139            block[entry_index][1][0] = new_details
3140            # parents cannot be affected by what we do.
3141            # other occurences of this id can be found
3142            # from the id index.
3143            # ---
3144            # tree index consistency: All other paths for this id in this tree
3145            # index must point to the correct path. We have to loop here because
3146            # we may have passed entries in the state with this file id already
3147            # that were absent - where parent entries are - and they need to be
3148            # converted to relocated.
3149            if path_utf8 is None:
3150                raise AssertionError('no path')
3151            existing_keys = id_index.get(key[2], ())
3152            if key not in existing_keys:
3153                raise AssertionError('We found the entry in the blocks, but'
3154                                     ' the key is not in the id_index.'
3155                                     ' key: %s, existing_keys: %s' % (key, existing_keys))
3156            for entry_key in existing_keys:
3157                # TODO:PROFILING: It might be faster to just update
3158                # rather than checking if we need to, and then overwrite
3159                # the one we are located at.
3160                if entry_key != key:
3161                    # this file id is at a different path in one of the
3162                    # other trees, so put absent pointers there
3163                    # This is the vertical axis in the matrix, all pointing
3164                    # to the real path.
3165                    block_index, present = self._find_block_index_from_key(
3166                        entry_key)
3167                    if not present:
3168                        raise AssertionError('not present: %r', entry_key)
3169                    entry_index, present = self._find_entry_index(
3170                        entry_key, self._dirblocks[block_index][1])
3171                    if not present:
3172                        raise AssertionError('not present: %r', entry_key)
3173                    self._dirblocks[block_index][1][entry_index][1][0] = \
3174                        (b'r', path_utf8, 0, False, b'')
3175        # add a containing dirblock if needed.
3176        if new_details[0] == b'd':
3177            # GZ 2017-06-09: Using pathjoin why?
3178            subdir_key = (osutils.pathjoin(*key[0:2]), b'', b'')
3179            block_index, present = self._find_block_index_from_key(subdir_key)
3180            if not present:
3181                self._dirblocks.insert(block_index, (subdir_key[0], []))
3182
3183        self._mark_modified()
3184
3185    def _maybe_remove_row(self, block, index, id_index):
3186        """Remove index if it is absent or relocated across the row.
3187
3188        id_index is updated accordingly.
3189        :return: True if we removed the row, False otherwise
3190        """
3191        present_in_row = False
3192        entry = block[index]
3193        for column in entry[1]:
3194            if column[0] not in (b'a', b'r'):
3195                present_in_row = True
3196                break
3197        if not present_in_row:
3198            block.pop(index)
3199            self._remove_from_id_index(id_index, entry[0])
3200            return True
3201        return False
3202
3203    def _validate(self):
3204        """Check that invariants on the dirblock are correct.
3205
3206        This can be useful in debugging; it shouldn't be necessary in
3207        normal code.
3208
3209        This must be called with a lock held.
3210        """
3211        # NOTE: This must always raise AssertionError not just assert,
3212        # otherwise it may not behave properly under python -O
3213        #
3214        # TODO: All entries must have some content that's not b'a' or b'r',
3215        # otherwise it could just be removed.
3216        #
3217        # TODO: All relocations must point directly to a real entry.
3218        #
3219        # TODO: No repeated keys.
3220        #
3221        # -- mbp 20070325
3222        from pprint import pformat
3223        self._read_dirblocks_if_needed()
3224        if len(self._dirblocks) > 0:
3225            if not self._dirblocks[0][0] == b'':
3226                raise AssertionError(
3227                    "dirblocks don't start with root block:\n"
3228                    + pformat(self._dirblocks))
3229        if len(self._dirblocks) > 1:
3230            if not self._dirblocks[1][0] == b'':
3231                raise AssertionError(
3232                    "dirblocks missing root directory:\n"
3233                    + pformat(self._dirblocks))
3234        # the dirblocks are sorted by their path components, name, and dir id
3235        dir_names = [d[0].split(b'/')
3236                     for d in self._dirblocks[1:]]
3237        if dir_names != sorted(dir_names):
3238            raise AssertionError(
3239                "dir names are not in sorted order:\n" +
3240                pformat(self._dirblocks) +
3241                "\nkeys:\n" +
3242                pformat(dir_names))
3243        for dirblock in self._dirblocks:
3244            # within each dirblock, the entries are sorted by filename and
3245            # then by id.
3246            for entry in dirblock[1]:
3247                if dirblock[0] != entry[0][0]:
3248                    raise AssertionError(
3249                        "entry key for %r"
3250                        "doesn't match directory name in\n%r" %
3251                        (entry, pformat(dirblock)))
3252            if dirblock[1] != sorted(dirblock[1]):
3253                raise AssertionError(
3254                    "dirblock for %r is not sorted:\n%s" %
3255                    (dirblock[0], pformat(dirblock)))
3256
3257        def check_valid_parent():
3258            """Check that the current entry has a valid parent.
3259
3260            This makes sure that the parent has a record,
3261            and that the parent isn't marked as "absent" in the
3262            current tree. (It is invalid to have a non-absent file in an absent
3263            directory.)
3264            """
3265            if entry[0][0:2] == (b'', b''):
3266                # There should be no parent for the root row
3267                return
3268            parent_entry = self._get_entry(tree_index, path_utf8=entry[0][0])
3269            if parent_entry == (None, None):
3270                raise AssertionError(
3271                    "no parent entry for: %s in tree %s"
3272                    % (this_path, tree_index))
3273            if parent_entry[1][tree_index][0] != b'd':
3274                raise AssertionError(
3275                    "Parent entry for %s is not marked as a valid"
3276                    " directory. %s" % (this_path, parent_entry,))
3277
3278        # For each file id, for each tree: either
3279        # the file id is not present at all; all rows with that id in the
3280        # key have it marked as 'absent'
3281        # OR the file id is present under exactly one name; any other entries
3282        # that mention that id point to the correct name.
3283        #
3284        # We check this with a dict per tree pointing either to the present
3285        # name, or None if absent.
3286        tree_count = self._num_present_parents() + 1
3287        id_path_maps = [{} for _ in range(tree_count)]
3288        # Make sure that all renamed entries point to the correct location.
3289        for entry in self._iter_entries():
3290            file_id = entry[0][2]
3291            this_path = osutils.pathjoin(entry[0][0], entry[0][1])
3292            if len(entry[1]) != tree_count:
3293                raise AssertionError(
3294                    "wrong number of entry details for row\n%s"
3295                    ",\nexpected %d" %
3296                    (pformat(entry), tree_count))
3297            absent_positions = 0
3298            for tree_index, tree_state in enumerate(entry[1]):
3299                this_tree_map = id_path_maps[tree_index]
3300                minikind = tree_state[0]
3301                if minikind in (b'a', b'r'):
3302                    absent_positions += 1
3303                # have we seen this id before in this column?
3304                if file_id in this_tree_map:
3305                    previous_path, previous_loc = this_tree_map[file_id]
3306                    # any later mention of this file must be consistent with
3307                    # what was said before
3308                    if minikind == b'a':
3309                        if previous_path is not None:
3310                            raise AssertionError(
3311                                "file %s is absent in row %r but also present "
3312                                "at %r" %
3313                                (file_id.decode('utf-8'), entry, previous_path))
3314                    elif minikind == b'r':
3315                        target_location = tree_state[1]
3316                        if previous_path != target_location:
3317                            raise AssertionError(
3318                                "file %s relocation in row %r but also at %r"
3319                                % (file_id, entry, previous_path))
3320                    else:
3321                        # a file, directory, etc - may have been previously
3322                        # pointed to by a relocation, which must point here
3323                        if previous_path != this_path:
3324                            raise AssertionError(
3325                                "entry %r inconsistent with previous path %r "
3326                                "seen at %r" %
3327                                (entry, previous_path, previous_loc))
3328                        check_valid_parent()
3329                else:
3330                    if minikind == b'a':
3331                        # absent; should not occur anywhere else
3332                        this_tree_map[file_id] = None, this_path
3333                    elif minikind == b'r':
3334                        # relocation, must occur at expected location
3335                        this_tree_map[file_id] = tree_state[1], this_path
3336                    else:
3337                        this_tree_map[file_id] = this_path, this_path
3338                        check_valid_parent()
3339            if absent_positions == tree_count:
3340                raise AssertionError(
3341                    "entry %r has no data for any tree." % (entry,))
3342        if self._id_index is not None:
3343            for file_id, entry_keys in self._id_index.items():
3344                for entry_key in entry_keys:
3345                    # Check that the entry in the map is pointing to the same
3346                    # file_id
3347                    if entry_key[2] != file_id:
3348                        raise AssertionError(
3349                            'file_id %r did not match entry key %s'
3350                            % (file_id, entry_key))
3351                    # And that from this entry key, we can look up the original
3352                    # record
3353                    block_index, present = self._find_block_index_from_key(
3354                        entry_key)
3355                    if not present:
3356                        raise AssertionError(
3357                            'missing block for entry key: %r', entry_key)
3358                    entry_index, present = self._find_entry_index(
3359                        entry_key, self._dirblocks[block_index][1])
3360                    if not present:
3361                        raise AssertionError(
3362                            'missing entry for key: %r', entry_key)
3363                if len(entry_keys) != len(set(entry_keys)):
3364                    raise AssertionError(
3365                        'id_index contained non-unique data for %s'
3366                        % (entry_keys,))
3367
3368    def _wipe_state(self):
3369        """Forget all state information about the dirstate."""
3370        self._header_state = DirState.NOT_IN_MEMORY
3371        self._dirblock_state = DirState.NOT_IN_MEMORY
3372        self._changes_aborted = False
3373        self._parents = []
3374        self._ghosts = []
3375        self._dirblocks = []
3376        self._id_index = None
3377        self._packed_stat_index = None
3378        self._end_of_header = None
3379        self._cutoff_time = None
3380        self._split_path_cache = {}
3381
3382    def lock_read(self):
3383        """Acquire a read lock on the dirstate."""
3384        if self._lock_token is not None:
3385            raise errors.LockContention(self._lock_token)
3386        # TODO: jam 20070301 Rather than wiping completely, if the blocks are
3387        #       already in memory, we could read just the header and check for
3388        #       any modification. If not modified, we can just leave things
3389        #       alone
3390        self._lock_token = lock.ReadLock(self._filename)
3391        self._lock_state = 'r'
3392        self._state_file = self._lock_token.f
3393        self._wipe_state()
3394        return lock.LogicalLockResult(self.unlock)
3395
3396    def lock_write(self):
3397        """Acquire a write lock on the dirstate."""
3398        if self._lock_token is not None:
3399            raise errors.LockContention(self._lock_token)
3400        # TODO: jam 20070301 Rather than wiping completely, if the blocks are
3401        #       already in memory, we could read just the header and check for
3402        #       any modification. If not modified, we can just leave things
3403        #       alone
3404        self._lock_token = lock.WriteLock(self._filename)
3405        self._lock_state = 'w'
3406        self._state_file = self._lock_token.f
3407        self._wipe_state()
3408        return lock.LogicalLockResult(self.unlock, self._lock_token)
3409
3410    def unlock(self):
3411        """Drop any locks held on the dirstate."""
3412        if self._lock_token is None:
3413            raise errors.LockNotHeld(self)
3414        # TODO: jam 20070301 Rather than wiping completely, if the blocks are
3415        #       already in memory, we could read just the header and check for
3416        #       any modification. If not modified, we can just leave things
3417        #       alone
3418        self._state_file = None
3419        self._lock_state = None
3420        self._lock_token.unlock()
3421        self._lock_token = None
3422        self._split_path_cache = {}
3423
3424    def _requires_lock(self):
3425        """Check that a lock is currently held by someone on the dirstate."""
3426        if not self._lock_token:
3427            raise errors.ObjectNotLocked(self)
3428
3429
def py_update_entry(state, entry, abspath, stat_value,
                    _stat_to_minikind=DirState._stat_to_minikind):
    """Update the entry based on what is actually on disk.

    This function only calculates the sha if it needs to - if the entry is
    uncachable, or clearly different to the first parent's entry, no sha
    is calculated, and None is returned.

    :param state: The dirstate this entry is in.
    :param entry: This is the dirblock entry for the file in question.
    :param abspath: The path on disk for this file.
    :param stat_value: The stat value done on the path.
    :param _stat_to_minikind: Private: maps the file-type bits of a stat
        mode to a minikind byte; bound at def time as a speed optimisation.
    :return: None, or The sha1 hexdigest of the file (40 bytes) or link
        target of a symlink.
    """
    try:
        # 0o170000 is the S_IFMT mask: keep only the file-type bits of
        # st_mode before looking up the minikind.
        minikind = _stat_to_minikind[stat_value.st_mode & 0o170000]
    except KeyError:
        # Unhandled kind
        return None
    packed_stat = pack_stat(stat_value)
    # The current (working tree) details are always column 0 of the entry.
    (saved_minikind, saved_link_or_sha1, saved_file_size,
     saved_executable, saved_packed_stat) = entry[1][0]
    if not isinstance(saved_minikind, bytes):
        raise TypeError(saved_minikind)

    if minikind == b'd' and saved_minikind == b't':
        # A tree-reference stats as a directory on disk; keep the recorded
        # b't' minikind rather than demoting it to a plain directory.
        minikind = b't'
    if (minikind == saved_minikind
            and packed_stat == saved_packed_stat):
        # The stat hasn't changed since we saved, so we can re-use the
        # saved sha hash.
        if minikind == b'd':
            return None

        # size should also be in packed_stat
        if saved_file_size == stat_value.st_size:
            return saved_link_or_sha1

    # If we have gotten this far, that means that we need to actually
    # process this entry.
    link_or_sha1 = None
    # worth_saving tracks whether the in-memory update justifies marking
    # the dirstate modified (see the _mark_modified call at the end).
    worth_saving = True
    if minikind == b'f':
        executable = state._is_executable(stat_value.st_mode,
                                          saved_executable)
        if state._cutoff_time is None:
            state._sha_cutoff_time()
        # Only cache a sha1 when the file's mtime/ctime are older than the
        # cutoff (i.e. the stat can be trusted not to race with writes) and
        # there is a parent tree entry that is not absent.
        if (stat_value.st_mtime < state._cutoff_time
            and stat_value.st_ctime < state._cutoff_time
            and len(entry[1]) > 1
                and entry[1][1][0] != b'a'):
            # Could check for size changes for further optimised
            # avoidance of sha1's. However the most prominent case of
            # over-shaing is during initial add, which this catches.
            # Besides, if content filtering happens, size and sha
            # are calculated at the same time, so checking just the size
            # gains nothing w.r.t. performance.
            link_or_sha1 = state._sha1_file(abspath)
            entry[1][0] = (b'f', link_or_sha1, stat_value.st_size,
                           executable, packed_stat)
        else:
            # Stat is too fresh to trust: record DirState.NULLSTAT so the
            # sha is not treated as cached on a later pass.
            entry[1][0] = (b'f', b'', stat_value.st_size,
                           executable, DirState.NULLSTAT)
            worth_saving = False
    elif minikind == b'd':
        link_or_sha1 = None
        entry[1][0] = (b'd', b'', 0, False, packed_stat)
        if saved_minikind != b'd':
            # This changed from something into a directory. Make sure we
            # have a directory block for it. This doesn't happen very
            # often, so this doesn't have to be super fast.
            block_index, entry_index, dir_present, file_present = \
                state._get_block_entry_index(entry[0][0], entry[0][1], 0)
            state._ensure_block(block_index, entry_index,
                                osutils.pathjoin(entry[0][0], entry[0][1]))
        else:
            worth_saving = False
    elif minikind == b'l':
        if saved_minikind == b'l':
            # Kind unchanged; re-reading the link target below is not on
            # its own worth flushing the dirstate.
            worth_saving = False
        link_or_sha1 = state._read_link(abspath, saved_link_or_sha1)
        if state._cutoff_time is None:
            state._sha_cutoff_time()
        if (stat_value.st_mtime < state._cutoff_time
                and stat_value.st_ctime < state._cutoff_time):
            entry[1][0] = (b'l', link_or_sha1, stat_value.st_size,
                           False, packed_stat)
        else:
            # Too fresh to cache safely; see the file case above.
            entry[1][0] = (b'l', b'', stat_value.st_size,
                           False, DirState.NULLSTAT)
    if worth_saving:
        state._mark_modified([entry])
    return link_or_sha1
3524
3525
class ProcessEntryPython(object):
    """Pure-Python engine for computing iter_changes deltas from a dirstate.

    Compares entries between a source tree index and the target tree index
    (which must be 0 — see __init__), optionally consulting the filesystem
    via update_entry.
    """

    # __slots__ removes the per-instance __dict__; every attribute assigned
    # in __init__ must therefore be declared here.
    __slots__ = ["old_dirname_to_file_id", "new_dirname_to_file_id",
                 "last_source_parent", "last_target_parent", "include_unchanged",
                 "partial", "use_filesystem_for_exec", "utf8_decode",
                 "searched_specific_files", "search_specific_files",
                 "searched_exact_paths", "search_specific_file_parents", "seen_ids",
                 "state", "source_index", "target_index", "want_unversioned", "tree"]
3534
3535    def __init__(self, include_unchanged, use_filesystem_for_exec,
3536                 search_specific_files, state, source_index, target_index,
3537                 want_unversioned, tree):
3538        self.old_dirname_to_file_id = {}
3539        self.new_dirname_to_file_id = {}
3540        # Are we doing a partial iter_changes?
3541        self.partial = search_specific_files != {''}
3542        # Using a list so that we can access the values and change them in
3543        # nested scope. Each one is [path, file_id, entry]
3544        self.last_source_parent = [None, None]
3545        self.last_target_parent = [None, None]
3546        self.include_unchanged = include_unchanged
3547        self.use_filesystem_for_exec = use_filesystem_for_exec
3548        self.utf8_decode = cache_utf8._utf8_decode
3549        # for all search_indexs in each path at or under each element of
3550        # search_specific_files, if the detail is relocated: add the id, and
3551        # add the relocated path as one to search if its not searched already.
3552        # If the detail is not relocated, add the id.
3553        self.searched_specific_files = set()
3554        # When we search exact paths without expanding downwards, we record
3555        # that here.
3556        self.searched_exact_paths = set()
3557        self.search_specific_files = search_specific_files
3558        # The parents up to the root of the paths we are searching.
3559        # After all normal paths are returned, these specific items are returned.
3560        self.search_specific_file_parents = set()
3561        # The ids we've sent out in the delta.
3562        self.seen_ids = set()
3563        self.state = state
3564        self.source_index = source_index
3565        self.target_index = target_index
3566        if target_index != 0:
3567            # A lot of code in here depends on target_index == 0
3568            raise errors.BzrError('unsupported target index')
3569        self.want_unversioned = want_unversioned
3570        self.tree = tree
3571
3572    def _process_entry(self, entry, path_info, pathjoin=osutils.pathjoin):
3573        """Compare an entry and real disk to generate delta information.
3574
3575        :param path_info: top_relpath, basename, kind, lstat, abspath for
3576            the path of entry. If None, then the path is considered absent in
3577            the target (Perhaps we should pass in a concrete entry for this ?)
3578            Basename is returned as a utf8 string because we expect this
3579            tuple will be ignored, and don't want to take the time to
3580            decode.
3581        :return: (iter_changes_result, changed). If the entry has not been
3582            handled then changed is None. Otherwise it is False if no content
3583            or metadata changes have occurred, and True if any content or
3584            metadata change has occurred. If self.include_unchanged is True then
3585            if changed is not None, iter_changes_result will always be a result
3586            tuple. Otherwise, iter_changes_result is None unless changed is
3587            True.
3588        """
3589        if self.source_index is None:
3590            source_details = DirState.NULL_PARENT_DETAILS
3591        else:
3592            source_details = entry[1][self.source_index]
3593        # GZ 2017-06-09: Eck, more sets.
3594        _fdltr = {b'f', b'd', b'l', b't', b'r'}
3595        _fdlt = {b'f', b'd', b'l', b't'}
3596        _ra = (b'r', b'a')
3597        target_details = entry[1][self.target_index]
3598        target_minikind = target_details[0]
3599        if path_info is not None and target_minikind in _fdlt:
3600            if not (self.target_index == 0):
3601                raise AssertionError()
3602            link_or_sha1 = update_entry(self.state, entry,
3603                                        abspath=path_info[4], stat_value=path_info[3])
3604            # The entry may have been modified by update_entry
3605            target_details = entry[1][self.target_index]
3606            target_minikind = target_details[0]
3607        else:
3608            link_or_sha1 = None
3609        file_id = entry[0][2]
3610        source_minikind = source_details[0]
3611        if source_minikind in _fdltr and target_minikind in _fdlt:
3612            # claimed content in both: diff
3613            #   r    | fdlt   |      | add source to search, add id path move and perform
3614            #        |        |      | diff check on source-target
3615            #   r    | fdlt   |  a   | dangling file that was present in the basis.
3616            #        |        |      | ???
3617            if source_minikind == b'r':
3618                # add the source to the search path to find any children it
3619                # has.  TODO ? : only add if it is a container ?
3620                if not osutils.is_inside_any(self.searched_specific_files,
3621                                             source_details[1]):
3622                    self.search_specific_files.add(source_details[1])
3623                # generate the old path; this is needed for stating later
3624                # as well.
3625                old_path = source_details[1]
3626                old_dirname, old_basename = os.path.split(old_path)
3627                path = pathjoin(entry[0][0], entry[0][1])
3628                old_entry = self.state._get_entry(self.source_index,
3629                                                  path_utf8=old_path)
3630                # update the source details variable to be the real
3631                # location.
3632                if old_entry == (None, None):
3633                    raise DirstateCorrupt(self.state._filename,
3634                                          "entry '%s/%s' is considered renamed from %r"
3635                                          " but source does not exist\n"
3636                                          "entry: %s" % (entry[0][0], entry[0][1], old_path, entry))
3637                source_details = old_entry[1][self.source_index]
3638                source_minikind = source_details[0]
3639            else:
3640                old_dirname = entry[0][0]
3641                old_basename = entry[0][1]
3642                old_path = path = None
3643            if path_info is None:
3644                # the file is missing on disk, show as removed.
3645                content_change = True
3646                target_kind = None
3647                target_exec = False
3648            else:
3649                # source and target are both versioned and disk file is present.
3650                target_kind = path_info[2]
3651                if target_kind == 'directory':
3652                    if path is None:
3653                        old_path = path = pathjoin(old_dirname, old_basename)
3654                    self.new_dirname_to_file_id[path] = file_id
3655                    if source_minikind != b'd':
3656                        content_change = True
3657                    else:
3658                        # directories have no fingerprint
3659                        content_change = False
3660                    target_exec = False
3661                elif target_kind == 'file':
3662                    if source_minikind != b'f':
3663                        content_change = True
3664                    else:
3665                        # Check the sha. We can't just rely on the size as
3666                        # content filtering may mean differ sizes actually
3667                        # map to the same content
3668                        if link_or_sha1 is None:
3669                            # Stat cache miss:
3670                            statvalue, link_or_sha1 = \
3671                                self.state._sha1_provider.stat_and_sha1(
3672                                    path_info[4])
3673                            self.state._observed_sha1(entry, link_or_sha1,
3674                                                      statvalue)
3675                        content_change = (link_or_sha1 != source_details[1])
3676                    # Target details is updated at update_entry time
3677                    if self.use_filesystem_for_exec:
3678                        # We don't need S_ISREG here, because we are sure
3679                        # we are dealing with a file.
3680                        target_exec = bool(stat.S_IEXEC & path_info[3].st_mode)
3681                    else:
3682                        target_exec = target_details[3]
3683                elif target_kind == 'symlink':
3684                    if source_minikind != b'l':
3685                        content_change = True
3686                    else:
3687                        content_change = (link_or_sha1 != source_details[1])
3688                    target_exec = False
3689                elif target_kind == 'tree-reference':
3690                    if source_minikind != b't':
3691                        content_change = True
3692                    else:
3693                        content_change = False
3694                    target_exec = False
3695                else:
3696                    if path is None:
3697                        path = pathjoin(old_dirname, old_basename)
3698                    raise errors.BadFileKindError(path, path_info[2])
3699            if source_minikind == b'd':
3700                if path is None:
3701                    old_path = path = pathjoin(old_dirname, old_basename)
3702                self.old_dirname_to_file_id[old_path] = file_id
3703            # parent id is the entry for the path in the target tree
3704            if old_basename and old_dirname == self.last_source_parent[0]:
3705                source_parent_id = self.last_source_parent[1]
3706            else:
3707                try:
3708                    source_parent_id = self.old_dirname_to_file_id[old_dirname]
3709                except KeyError:
3710                    source_parent_entry = self.state._get_entry(self.source_index,
3711                                                                path_utf8=old_dirname)
3712                    source_parent_id = source_parent_entry[0][2]
3713                if source_parent_id == entry[0][2]:
3714                    # This is the root, so the parent is None
3715                    source_parent_id = None
3716                else:
3717                    self.last_source_parent[0] = old_dirname
3718                    self.last_source_parent[1] = source_parent_id
3719            new_dirname = entry[0][0]
3720            if entry[0][1] and new_dirname == self.last_target_parent[0]:
3721                target_parent_id = self.last_target_parent[1]
3722            else:
3723                try:
3724                    target_parent_id = self.new_dirname_to_file_id[new_dirname]
3725                except KeyError:
3726                    # TODO: We don't always need to do the lookup, because the
3727                    #       parent entry will be the same as the source entry.
3728                    target_parent_entry = self.state._get_entry(self.target_index,
3729                                                                path_utf8=new_dirname)
3730                    if target_parent_entry == (None, None):
3731                        raise AssertionError(
3732                            "Could not find target parent in wt: %s\nparent of: %s"
3733                            % (new_dirname, entry))
3734                    target_parent_id = target_parent_entry[0][2]
3735                if target_parent_id == entry[0][2]:
3736                    # This is the root, so the parent is None
3737                    target_parent_id = None
3738                else:
3739                    self.last_target_parent[0] = new_dirname
3740                    self.last_target_parent[1] = target_parent_id
3741
3742            source_exec = source_details[3]
3743            changed = (content_change
3744                       or source_parent_id != target_parent_id
3745                       or old_basename != entry[0][1]
3746                       or source_exec != target_exec
3747                       )
3748            if not changed and not self.include_unchanged:
3749                return None, False
3750            else:
3751                if old_path is None:
3752                    old_path = path = pathjoin(old_dirname, old_basename)
3753                    old_path_u = self.utf8_decode(old_path)[0]
3754                    path_u = old_path_u
3755                else:
3756                    old_path_u = self.utf8_decode(old_path)[0]
3757                    if old_path == path:
3758                        path_u = old_path_u
3759                    else:
3760                        path_u = self.utf8_decode(path)[0]
3761                source_kind = DirState._minikind_to_kind[source_minikind]
3762                return InventoryTreeChange(
3763                    entry[0][2],
3764                    (old_path_u, path_u),
3765                    content_change,
3766                    (True, True),
3767                    (source_parent_id, target_parent_id),
3768                    (self.utf8_decode(old_basename)[
3769                     0], self.utf8_decode(entry[0][1])[0]),
3770                    (source_kind, target_kind),
3771                    (source_exec, target_exec)), changed
3772        elif source_minikind in b'a' and target_minikind in _fdlt:
3773            # looks like a new file
3774            path = pathjoin(entry[0][0], entry[0][1])
3775            # parent id is the entry for the path in the target tree
3776            # TODO: these are the same for an entire directory: cache em.
3777            parent_id = self.state._get_entry(self.target_index,
3778                                              path_utf8=entry[0][0])[0][2]
3779            if parent_id == entry[0][2]:
3780                parent_id = None
3781            if path_info is not None:
3782                # Present on disk:
3783                if self.use_filesystem_for_exec:
3784                    # We need S_ISREG here, because we aren't sure if this
3785                    # is a file or not.
3786                    target_exec = bool(
3787                        stat.S_ISREG(path_info[3].st_mode)
3788                        and stat.S_IEXEC & path_info[3].st_mode)
3789                else:
3790                    target_exec = target_details[3]
3791                return InventoryTreeChange(
3792                    entry[0][2],
3793                    (None, self.utf8_decode(path)[0]),
3794                    True,
3795                    (False, True),
3796                    (None, parent_id),
3797                    (None, self.utf8_decode(entry[0][1])[0]),
3798                    (None, path_info[2]),
3799                    (None, target_exec)), True
3800            else:
3801                # Its a missing file, report it as such.
3802                return InventoryTreeChange(
3803                    entry[0][2],
3804                    (None, self.utf8_decode(path)[0]),
3805                    False,
3806                    (False, True),
3807                    (None, parent_id),
3808                    (None, self.utf8_decode(entry[0][1])[0]),
3809                    (None, None),
3810                    (None, False)), True
3811        elif source_minikind in _fdlt and target_minikind in b'a':
3812            # unversioned, possibly, or possibly not deleted: we dont care.
3813            # if its still on disk, *and* theres no other entry at this
3814            # path [we dont know this in this routine at the moment -
3815            # perhaps we should change this - then it would be an unknown.
3816            old_path = pathjoin(entry[0][0], entry[0][1])
3817            # parent id is the entry for the path in the target tree
3818            parent_id = self.state._get_entry(
3819                self.source_index, path_utf8=entry[0][0])[0][2]
3820            if parent_id == entry[0][2]:
3821                parent_id = None
3822            return InventoryTreeChange(
3823                entry[0][2],
3824                (self.utf8_decode(old_path)[0], None),
3825                True,
3826                (True, False),
3827                (parent_id, None),
3828                (self.utf8_decode(entry[0][1])[0], None),
3829                (DirState._minikind_to_kind[source_minikind], None),
3830                (source_details[3], None)), True
3831        elif source_minikind in _fdlt and target_minikind in b'r':
3832            # a rename; could be a true rename, or a rename inherited from
3833            # a renamed parent. TODO: handle this efficiently. Its not
3834            # common case to rename dirs though, so a correct but slow
3835            # implementation will do.
3836            if not osutils.is_inside_any(self.searched_specific_files,
3837                                         target_details[1]):
3838                self.search_specific_files.add(target_details[1])
3839        elif source_minikind in _ra and target_minikind in _ra:
3840            # neither of the selected trees contain this file,
3841            # so skip over it. This is not currently directly tested, but
3842            # is indirectly via test_too_much.TestCommands.test_conflicts.
3843            pass
3844        else:
3845            raise AssertionError("don't know how to compare "
3846                                 "source_minikind=%r, target_minikind=%r"
3847                                 % (source_minikind, target_minikind))
3848        return None, None
3849
    def __iter__(self):
        """Return self: this object serves as its own iterator."""
        return self
3852
3853    def _gather_result_for_consistency(self, result):
3854        """Check a result we will yield to make sure we are consistent later.
3855
3856        This gathers result's parents into a set to output later.
3857
3858        :param result: A result tuple.
3859        """
3860        if not self.partial or not result.file_id:
3861            return
3862        self.seen_ids.add(result.file_id)
3863        new_path = result.path[1]
3864        if new_path:
3865            # Not the root and not a delete: queue up the parents of the path.
3866            self.search_specific_file_parents.update(
3867                p.encode('utf8') for p in osutils.parent_directories(new_path))
3868            # Add the root directory which parent_directories does not
3869            # provide.
3870            self.search_specific_file_parents.add(b'')
3871
3872    def iter_changes(self):
3873        """Iterate over the changes."""
3874        utf8_decode = cache_utf8._utf8_decode
3875        _lt_by_dirs = lt_by_dirs
3876        _process_entry = self._process_entry
3877        search_specific_files = self.search_specific_files
3878        searched_specific_files = self.searched_specific_files
3879        splitpath = osutils.splitpath
3880        # sketch:
3881        # compare source_index and target_index at or under each element of search_specific_files.
3882        # follow the following comparison table. Note that we only want to do diff operations when
3883        # the target is fdl because thats when the walkdirs logic will have exposed the pathinfo
3884        # for the target.
3885        # cases:
3886        #
3887        # Source | Target | disk | action
3888        #   r    | fdlt   |      | add source to search, add id path move and perform
3889        #        |        |      | diff check on source-target
3890        #   r    | fdlt   |  a   | dangling file that was present in the basis.
3891        #        |        |      | ???
3892        #   r    |  a     |      | add source to search
3893        #   r    |  a     |  a   |
3894        #   r    |  r     |      | this path is present in a non-examined tree, skip.
3895        #   r    |  r     |  a   | this path is present in a non-examined tree, skip.
3896        #   a    | fdlt   |      | add new id
3897        #   a    | fdlt   |  a   | dangling locally added file, skip
3898        #   a    |  a     |      | not present in either tree, skip
3899        #   a    |  a     |  a   | not present in any tree, skip
3900        #   a    |  r     |      | not present in either tree at this path, skip as it
3901        #        |        |      | may not be selected by the users list of paths.
3902        #   a    |  r     |  a   | not present in either tree at this path, skip as it
3903        #        |        |      | may not be selected by the users list of paths.
3904        #  fdlt  | fdlt   |      | content in both: diff them
3905        #  fdlt  | fdlt   |  a   | deleted locally, but not unversioned - show as deleted ?
3906        #  fdlt  |  a     |      | unversioned: output deleted id for now
3907        #  fdlt  |  a     |  a   | unversioned and deleted: output deleted id
3908        #  fdlt  |  r     |      | relocated in this tree, so add target to search.
3909        #        |        |      | Dont diff, we will see an r,fd; pair when we reach
3910        #        |        |      | this id at the other path.
3911        #  fdlt  |  r     |  a   | relocated in this tree, so add target to search.
3912        #        |        |      | Dont diff, we will see an r,fd; pair when we reach
3913        #        |        |      | this id at the other path.
3914
3915        # TODO: jam 20070516 - Avoid the _get_entry lookup overhead by
3916        #       keeping a cache of directories that we have seen.
3917
3918        while search_specific_files:
3919            # TODO: the pending list should be lexically sorted?  the
3920            # interface doesn't require it.
3921            current_root = search_specific_files.pop()
3922            current_root_unicode = current_root.decode('utf8')
3923            searched_specific_files.add(current_root)
3924            # process the entries for this containing directory: the rest will be
3925            # found by their parents recursively.
3926            root_entries = self.state._entries_for_path(current_root)
3927            root_abspath = self.tree.abspath(current_root_unicode)
3928            try:
3929                root_stat = os.lstat(root_abspath)
3930            except OSError as e:
3931                if e.errno == errno.ENOENT:
3932                    # the path does not exist: let _process_entry know that.
3933                    root_dir_info = None
3934                else:
3935                    # some other random error: hand it up.
3936                    raise
3937            else:
3938                root_dir_info = (b'', current_root,
3939                                 osutils.file_kind_from_stat_mode(
3940                                     root_stat.st_mode), root_stat,
3941                                 root_abspath)
3942                if root_dir_info[2] == 'directory':
3943                    if self.tree._directory_is_tree_reference(
3944                            current_root.decode('utf8')):
3945                        root_dir_info = root_dir_info[:2] + \
3946                            ('tree-reference',) + root_dir_info[3:]
3947
3948            if not root_entries and not root_dir_info:
3949                # this specified path is not present at all, skip it.
3950                continue
3951            path_handled = False
3952            for entry in root_entries:
3953                result, changed = _process_entry(entry, root_dir_info)
3954                if changed is not None:
3955                    path_handled = True
3956                    if changed:
3957                        self._gather_result_for_consistency(result)
3958                    if changed or self.include_unchanged:
3959                        yield result
3960            if self.want_unversioned and not path_handled and root_dir_info:
3961                new_executable = bool(
3962                    stat.S_ISREG(root_dir_info[3].st_mode)
3963                    and stat.S_IEXEC & root_dir_info[3].st_mode)
3964                yield InventoryTreeChange(
3965                    None,
3966                    (None, current_root_unicode),
3967                    True,
3968                    (False, False),
3969                    (None, None),
3970                    (None, splitpath(current_root_unicode)[-1]),
3971                    (None, root_dir_info[2]),
3972                    (None, new_executable)
3973                    )
3974            initial_key = (current_root, b'', b'')
3975            block_index, _ = self.state._find_block_index_from_key(initial_key)
3976            if block_index == 0:
3977                # we have processed the total root already, but because the
3978                # initial key matched it we should skip it here.
3979                block_index += 1
3980            if root_dir_info and root_dir_info[2] == 'tree-reference':
3981                current_dir_info = None
3982            else:
3983                dir_iterator = osutils._walkdirs_utf8(
3984                    root_abspath, prefix=current_root)
3985                try:
3986                    current_dir_info = next(dir_iterator)
3987                except OSError as e:
3988                    # on win32, python2.4 has e.errno == ERROR_DIRECTORY, but
3989                    # python 2.5 has e.errno == EINVAL,
3990                    #            and e.winerror == ERROR_DIRECTORY
3991                    e_winerror = getattr(e, 'winerror', None)
3992                    win_errors = (ERROR_DIRECTORY, ERROR_PATH_NOT_FOUND)
3993                    # there may be directories in the inventory even though
3994                    # this path is not a file on disk: so mark it as end of
3995                    # iterator
3996                    if e.errno in (errno.ENOENT, errno.ENOTDIR, errno.EINVAL):
3997                        current_dir_info = None
3998                    elif (sys.platform == 'win32'
3999                          and (e.errno in win_errors or
4000                               e_winerror in win_errors)):
4001                        current_dir_info = None
4002                    else:
4003                        raise
4004                else:
4005                    if current_dir_info[0][0] == b'':
4006                        # remove .bzr from iteration
4007                        bzr_index = bisect.bisect_left(
4008                            current_dir_info[1], (b'.bzr',))
4009                        if current_dir_info[1][bzr_index][0] != b'.bzr':
4010                            raise AssertionError()
4011                        del current_dir_info[1][bzr_index]
4012            # walk until both the directory listing and the versioned metadata
4013            # are exhausted.
4014            if (block_index < len(self.state._dirblocks) and
4015                osutils.is_inside(current_root,
4016                                  self.state._dirblocks[block_index][0])):
4017                current_block = self.state._dirblocks[block_index]
4018            else:
4019                current_block = None
4020            while (current_dir_info is not None or
4021                   current_block is not None):
4022                if (current_dir_info and current_block
4023                        and current_dir_info[0][0] != current_block[0]):
4024                    if _lt_by_dirs(current_dir_info[0][0], current_block[0]):
4025                        # filesystem data refers to paths not covered by the dirblock.
4026                        # this has two possibilities:
4027                        # A) it is versioned but empty, so there is no block for it
4028                        # B) it is not versioned.
4029
4030                        # if (A) then we need to recurse into it to check for
4031                        # new unknown files or directories.
4032                        # if (B) then we should ignore it, because we don't
4033                        # recurse into unknown directories.
4034                        path_index = 0
4035                        while path_index < len(current_dir_info[1]):
4036                            current_path_info = current_dir_info[1][path_index]
4037                            if self.want_unversioned:
4038                                if current_path_info[2] == 'directory':
4039                                    if self.tree._directory_is_tree_reference(
4040                                            current_path_info[0].decode('utf8')):
4041                                        current_path_info = current_path_info[:2] + \
4042                                            ('tree-reference',) + \
4043                                            current_path_info[3:]
4044                                new_executable = bool(
4045                                    stat.S_ISREG(current_path_info[3].st_mode)
4046                                    and stat.S_IEXEC & current_path_info[3].st_mode)
4047                                yield InventoryTreeChange(
4048                                    None,
4049                                    (None, utf8_decode(current_path_info[0])[0]),
4050                                    True,
4051                                    (False, False),
4052                                    (None, None),
4053                                    (None, utf8_decode(current_path_info[1])[0]),
4054                                    (None, current_path_info[2]),
4055                                    (None, new_executable))
4056                            # dont descend into this unversioned path if it is
4057                            # a dir
4058                            if current_path_info[2] in ('directory',
4059                                                        'tree-reference'):
4060                                del current_dir_info[1][path_index]
4061                                path_index -= 1
4062                            path_index += 1
4063
4064                        # This dir info has been handled, go to the next
4065                        try:
4066                            current_dir_info = next(dir_iterator)
4067                        except StopIteration:
4068                            current_dir_info = None
4069                    else:
4070                        # We have a dirblock entry for this location, but there
4071                        # is no filesystem path for this. This is most likely
4072                        # because a directory was removed from the disk.
4073                        # We don't have to report the missing directory,
4074                        # because that should have already been handled, but we
4075                        # need to handle all of the files that are contained
4076                        # within.
4077                        for current_entry in current_block[1]:
4078                            # entry referring to file not present on disk.
4079                            # advance the entry only, after processing.
4080                            result, changed = _process_entry(
4081                                current_entry, None)
4082                            if changed is not None:
4083                                if changed:
4084                                    self._gather_result_for_consistency(result)
4085                                if changed or self.include_unchanged:
4086                                    yield result
4087                        block_index += 1
4088                        if (block_index < len(self.state._dirblocks) and
4089                            osutils.is_inside(current_root,
4090                                              self.state._dirblocks[block_index][0])):
4091                            current_block = self.state._dirblocks[block_index]
4092                        else:
4093                            current_block = None
4094                    continue
4095                entry_index = 0
4096                if current_block and entry_index < len(current_block[1]):
4097                    current_entry = current_block[1][entry_index]
4098                else:
4099                    current_entry = None
4100                advance_entry = True
4101                path_index = 0
4102                if current_dir_info and path_index < len(current_dir_info[1]):
4103                    current_path_info = current_dir_info[1][path_index]
4104                    if current_path_info[2] == 'directory':
4105                        if self.tree._directory_is_tree_reference(
4106                                current_path_info[0].decode('utf8')):
4107                            current_path_info = current_path_info[:2] + \
4108                                ('tree-reference',) + current_path_info[3:]
4109                else:
4110                    current_path_info = None
4111                advance_path = True
4112                path_handled = False
4113                while (current_entry is not None or
4114                       current_path_info is not None):
4115                    if current_entry is None:
4116                        # the check for path_handled when the path is advanced
4117                        # will yield this path if needed.
4118                        pass
4119                    elif current_path_info is None:
4120                        # no path is fine: the per entry code will handle it.
4121                        result, changed = _process_entry(
4122                            current_entry, current_path_info)
4123                        if changed is not None:
4124                            if changed:
4125                                self._gather_result_for_consistency(result)
4126                            if changed or self.include_unchanged:
4127                                yield result
4128                    elif (current_entry[0][1] != current_path_info[1]
4129                          or current_entry[1][self.target_index][0] in (b'a', b'r')):
4130                        # The current path on disk doesn't match the dirblock
4131                        # record. Either the dirblock is marked as absent, or
4132                        # the file on disk is not present at all in the
4133                        # dirblock. Either way, report about the dirblock
4134                        # entry, and let other code handle the filesystem one.
4135
4136                        # Compare the basename for these files to determine
4137                        # which comes first
4138                        if current_path_info[1] < current_entry[0][1]:
4139                            # extra file on disk: pass for now, but only
4140                            # increment the path, not the entry
4141                            advance_entry = False
4142                        else:
4143                            # entry referring to file not present on disk.
4144                            # advance the entry only, after processing.
4145                            result, changed = _process_entry(
4146                                current_entry, None)
4147                            if changed is not None:
4148                                if changed:
4149                                    self._gather_result_for_consistency(result)
4150                                if changed or self.include_unchanged:
4151                                    yield result
4152                            advance_path = False
4153                    else:
4154                        result, changed = _process_entry(
4155                            current_entry, current_path_info)
4156                        if changed is not None:
4157                            path_handled = True
4158                            if changed:
4159                                self._gather_result_for_consistency(result)
4160                            if changed or self.include_unchanged:
4161                                yield result
4162                    if advance_entry and current_entry is not None:
4163                        entry_index += 1
4164                        if entry_index < len(current_block[1]):
4165                            current_entry = current_block[1][entry_index]
4166                        else:
4167                            current_entry = None
4168                    else:
4169                        advance_entry = True  # reset the advance flaga
4170                    if advance_path and current_path_info is not None:
4171                        if not path_handled:
4172                            # unversioned in all regards
4173                            if self.want_unversioned:
4174                                new_executable = bool(
4175                                    stat.S_ISREG(current_path_info[3].st_mode)
4176                                    and stat.S_IEXEC & current_path_info[3].st_mode)
4177                                try:
4178                                    relpath_unicode = utf8_decode(
4179                                        current_path_info[0])[0]
4180                                except UnicodeDecodeError:
4181                                    raise errors.BadFilenameEncoding(
4182                                        current_path_info[0], osutils._fs_enc)
4183                                yield InventoryTreeChange(
4184                                    None,
4185                                    (None, relpath_unicode),
4186                                    True,
4187                                    (False, False),
4188                                    (None, None),
4189                                    (None, utf8_decode(current_path_info[1])[0]),
4190                                    (None, current_path_info[2]),
4191                                    (None, new_executable))
4192                            # dont descend into this unversioned path if it is
4193                            # a dir
4194                            if current_path_info[2] in ('directory'):
4195                                del current_dir_info[1][path_index]
4196                                path_index -= 1
4197                        # dont descend the disk iterator into any tree
4198                        # paths.
4199                        if current_path_info[2] == 'tree-reference':
4200                            del current_dir_info[1][path_index]
4201                            path_index -= 1
4202                        path_index += 1
4203                        if path_index < len(current_dir_info[1]):
4204                            current_path_info = current_dir_info[1][path_index]
4205                            if current_path_info[2] == 'directory':
4206                                if self.tree._directory_is_tree_reference(
4207                                        current_path_info[0].decode('utf8')):
4208                                    current_path_info = current_path_info[:2] + \
4209                                        ('tree-reference',) + \
4210                                        current_path_info[3:]
4211                        else:
4212                            current_path_info = None
4213                        path_handled = False
4214                    else:
4215                        advance_path = True  # reset the advance flagg.
4216                if current_block is not None:
4217                    block_index += 1
4218                    if (block_index < len(self.state._dirblocks) and
4219                        osutils.is_inside(current_root,
4220                                          self.state._dirblocks[block_index][0])):
4221                        current_block = self.state._dirblocks[block_index]
4222                    else:
4223                        current_block = None
4224                if current_dir_info is not None:
4225                    try:
4226                        current_dir_info = next(dir_iterator)
4227                    except StopIteration:
4228                        current_dir_info = None
4229        for result in self._iter_specific_file_parents():
4230            yield result
4231
4232    def _iter_specific_file_parents(self):
4233        """Iter over the specific file parents."""
4234        while self.search_specific_file_parents:
4235            # Process the parent directories for the paths we were iterating.
4236            # Even in extremely large trees this should be modest, so currently
4237            # no attempt is made to optimise.
4238            path_utf8 = self.search_specific_file_parents.pop()
4239            if osutils.is_inside_any(self.searched_specific_files, path_utf8):
4240                # We've examined this path.
4241                continue
4242            if path_utf8 in self.searched_exact_paths:
4243                # We've examined this path.
4244                continue
4245            path_entries = self.state._entries_for_path(path_utf8)
4246            # We need either one or two entries. If the path in
4247            # self.target_index has moved (so the entry in source_index is in
4248            # 'ar') then we need to also look for the entry for this path in
4249            # self.source_index, to output the appropriate delete-or-rename.
4250            selected_entries = []
            found_item = False
            for candidate_entry in path_entries:
                # Find entries present in target at this path:
                # (minikinds b'a' and b'r' mark rows that are not actually
                # present at this path — presumably absent/relocated; the
                # file-format grammar lists both minikinds.)
                if candidate_entry[1][self.target_index][0] not in (b'a', b'r'):
                    found_item = True
                    selected_entries.append(candidate_entry)
                # Find entries present in source at this path:
                elif (self.source_index is not None and
                      candidate_entry[1][self.source_index][0] not in (b'a', b'r')):
                    found_item = True
                    if candidate_entry[1][self.target_index][0] == b'a':
                        # Deleted, emit it here.
                        selected_entries.append(candidate_entry)
                    else:
                        # renamed, emit it when we process the directory it
                        # ended up at.
                        self.search_specific_file_parents.add(
                            candidate_entry[1][self.target_index][1])
            if not found_item:
                # path_entries was looked up for this parent path, so at
                # least one row must be live in source or target; anything
                # else indicates corrupt dirstate bookkeeping.
                raise AssertionError(
                    "Missing entry for specific path parent %r, %r" % (
                        path_utf8, path_entries))
            path_info = self._path_info(path_utf8, path_utf8.decode('utf8'))
            for entry in selected_entries:
                # Skip file ids whose changes were already emitted earlier
                # in this iteration.
                if entry[0][2] in self.seen_ids:
                    continue
                result, changed = self._process_entry(entry, path_info)
                if changed is None:
                    raise AssertionError(
                        "Got entry<->path mismatch for specific path "
                        "%r entry %r path_info %r " % (
                            path_utf8, entry, path_info))
                # Only include changes - we're outside the users requested
                # expansion.
                if changed:
                    self._gather_result_for_consistency(result)
                    if (result.kind[0] == 'directory' and
                            result.kind[1] != 'directory'):
                        # This stopped being a directory, the old children have
                        # to be included.
                        if entry[1][self.source_index][0] == b'r':
                            # renamed, take the source path
                            entry_path_utf8 = entry[1][self.source_index][1]
                        else:
                            entry_path_utf8 = path_utf8
                        # Locate the dirblock that holds this directory's
                        # former children.
                        initial_key = (entry_path_utf8, b'', b'')
                        block_index, _ = self.state._find_block_index_from_key(
                            initial_key)
                        if block_index == 0:
                            # The children of the root are in block index 1.
                            block_index += 1
                        current_block = None
                        if block_index < len(self.state._dirblocks):
                            current_block = self.state._dirblocks[block_index]
                            if not osutils.is_inside(
                                    entry_path_utf8, current_block[0]):
                                # No entries for this directory at all.
                                current_block = None
                        if current_block is not None:
                            # Queue each child that existed in source so it
                            # gets reported on a later pass.
                            for entry in current_block[1]:
                                if entry[1][self.source_index][0] in (b'a', b'r'):
                                    # Not in the source tree, so doesn't have to be
                                    # included.
                                    continue
                                # Path of the entry itself.

                                self.search_specific_file_parents.add(
                                    osutils.pathjoin(*entry[0][:2]))
                if changed or self.include_unchanged:
                    yield result
            self.searched_exact_paths.add(path_utf8)
4322
4323    def _path_info(self, utf8_path, unicode_path):
4324        """Generate path_info for unicode_path.
4325
4326        :return: None if unicode_path does not exist, or a path_info tuple.
4327        """
4328        abspath = self.tree.abspath(unicode_path)
4329        try:
4330            stat = os.lstat(abspath)
4331        except OSError as e:
4332            if e.errno == errno.ENOENT:
4333                # the path does not exist.
4334                return None
4335            else:
4336                raise
4337        utf8_basename = utf8_path.rsplit(b'/', 1)[-1]
4338        dir_info = (utf8_path, utf8_basename,
4339                    osutils.file_kind_from_stat_mode(stat.st_mode), stat,
4340                    abspath)
4341        if dir_info[2] == 'directory':
4342            if self.tree._directory_is_tree_reference(
4343                    unicode_path):
4344                self.root_dir_info = self.root_dir_info[:2] + \
4345                    ('tree-reference',) + self.root_dir_info[3:]
4346        return dir_info
4347
4348
# Try to load the compiled form if possible
try:
    # Prefer the C/Pyrex-compiled helpers: identical API to the pure-Python
    # module, but substantially faster for large trees.
    from ._dirstate_helpers_pyx import (
        _read_dirblocks,
        bisect_dirblock,
        _bisect_path_left,
        _bisect_path_right,
        lt_by_dirs,
        pack_stat,
        ProcessEntryC as _process_entry,
        update_entry as update_entry,
        )
except ImportError as e:
    # The extension is not built or failed to load: record the failure
    # (so it can be reported once) and fall back to the pure-Python
    # implementations of the same helpers.
    osutils.failed_to_load_extension(e)
    from ._dirstate_helpers_py import (
        _read_dirblocks,
        bisect_dirblock,
        _bisect_path_left,
        _bisect_path_right,
        lt_by_dirs,
        pack_stat,
        )
    # FIXME: It would be nice to be able to track moved lines so that the
    # corresponding python code can be moved to the _dirstate_helpers_py
    # module. I don't want to break the history for this important piece of
    # code so I left the code here -- vila 20090622
    update_entry = py_update_entry
    _process_entry = ProcessEntryPython
4377