1# patch.py -- For dealing with packed-style patches.
2# Copyright (C) 2009-2013 Jelmer Vernooij <jelmer@jelmer.uk>
3#
4# Dulwich is dual-licensed under the Apache License, Version 2.0 and the GNU
# General Public License as published by the Free Software Foundation; version 2.0
6# or (at your option) any later version. You can redistribute it and/or
7# modify it under the terms of either of these two licenses.
8#
9# Unless required by applicable law or agreed to in writing, software
10# distributed under the License is distributed on an "AS IS" BASIS,
11# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12# See the License for the specific language governing permissions and
13# limitations under the License.
14#
15# You should have received a copy of the licenses; if not, see
16# <http://www.gnu.org/licenses/> for a copy of the GNU General Public License
17# and <http://www.apache.org/licenses/LICENSE-2.0> for a copy of the Apache
18# License, Version 2.0.
19#
20
21"""Classes for dealing with git am-style patches.
22
23These patches are basically unified diffs with some extra metadata tacked
24on.
25"""
26
27from difflib import SequenceMatcher
28import email.parser
29import time
30
31from dulwich.objects import (
32    Blob,
33    Commit,
34    S_ISGITLINK,
35    )
36
# Number of leading bytes that is_binary() inspects for NUL characters.
FIRST_FEW_BYTES = 8000
38
39
def write_commit_patch(f, commit, contents, progress, version=None,
                       encoding=None):
    """Write a git-am-style patch for a single commit.

    The output consists of "From", "From:", "Date:" and "Subject:" header
    lines, a "---" separator, the output of the ``diffstat`` program when it
    can be started, the diff itself and a trailing "-- " signature followed
    by a version string.

    Args:
      f: File-like object to write bytes to
      commit: Commit object; its id, commit_time, author and message are
        used
      contents: Diff to embed, as bytes or str (str is encoded using
        ``encoding``)
      progress: Tuple with current patch number and total.
      version: Version string for the signature line; the Dulwich version
        is used when None
      encoding: Encoding for the generated text; falls back to
        ``f.encoding`` and finally to "ascii"
    Returns:
      None; everything is written to ``f``
    """
    # f.encoding can exist but be None, so a final fallback is required
    # before the value is handed to str.encode().
    encoding = encoding or getattr(f, "encoding", None) or "ascii"
    if isinstance(contents, str):
        contents = contents.encode(encoding)
    (num, total) = progress
    f.write(b"From " + commit.id + b" " +
            time.ctime(commit.commit_time).encode(encoding) + b"\n")
    f.write(b"From: " + commit.author + b"\n")
    # NOTE: strftime() without a time tuple formats the current local time,
    # so the Date header reflects when the patch is written, not the commit.
    f.write(b"Date: " +
            time.strftime("%a, %d %b %Y %H:%M:%S %Z").encode(encoding) + b"\n")
    f.write(("Subject: [PATCH %d/%d] " % (num, total)).encode(encoding) +
            commit.message + b"\n")
    f.write(b"\n")
    f.write(b"---\n")
    try:
        import subprocess
        p = subprocess.Popen(["diffstat"], stdout=subprocess.PIPE,
                             stdin=subprocess.PIPE)
    except (ImportError, OSError):
        pass  # diffstat not available?
    else:
        # Feed the diff to diffstat and include its summary before the diff
        (diffstat, _) = p.communicate(contents)
        f.write(diffstat)
        f.write(b"\n")
    f.write(contents)
    f.write(b"-- \n")
    if version is None:
        from dulwich import __version__ as dulwich_version
        f.write(b"Dulwich %d.%d.%d\n" % dulwich_version)
    else:
        f.write(version.encode(encoding) + b"\n")
80
81
def get_summary(commit):
    """Determine the summary line for use in a filename.

    Args:
      commit: Commit
    Returns: First line of the commit message with spaces replaced by
      dashes; an empty string when the message is empty
    """
    # Undecodable bytes are replaced rather than raising
    decoded = commit.message.decode(errors='replace')
    message_lines = decoded.splitlines()
    if not message_lines:
        # An empty message has no first line to summarise
        return ""
    return message_lines[0].replace(" ", "-")
91
92
93#  Unified Diff
94def _format_range_unified(start, stop):
95    'Convert range to the "ed" format'
96    # Per the diff spec at http://www.unix.org/single_unix_specification/
97    beginning = start + 1  # lines start numbering with one
98    length = stop - start
99    if length == 1:
100        return '{}'.format(beginning)
101    if not length:
102        beginning -= 1  # empty ranges begin at line just before the range
103    return '{},{}'.format(beginning, length)
104
105
def unified_diff(a, b, fromfile='', tofile='', fromfiledate='',
                 tofiledate='', n=3, lineterm='\n'):
    """Generate a unified diff between two lists of byte lines.

    Like difflib.unified_diff (this is based on the Python 2.7 version of
    that function), but it yields bytes and adds the "No newline at end of
    file" marker, as "git diff" does, after a removed or added line that
    lacks a trailing newline.

    Args:
      a: Lines of the old content, as bytestrings
      b: Lines of the new content, as bytestrings
      fromfile: Name of the old file, as bytes (a str is also accepted)
      tofile: Name of the new file, as bytes (a str is also accepted)
      fromfiledate: Optional text appended after a tab to the "---" line
      tofiledate: Optional text appended after a tab to the "+++" line
      n: Number of context lines around each change
      lineterm: Terminator for the file header and hunk header lines
    Returns: Iterator over the bytestrings making up the diff
    """
    def _name_bytes(name):
        # File names are normally bytes and are emitted unchanged; a str
        # (such as the '' default) is encoded so it can be concatenated.
        if isinstance(name, bytes):
            return name
        return name.encode('utf-8')

    started = False
    for group in SequenceMatcher(None, a, b).get_grouped_opcodes(n):
        if not started:
            # The file header is only emitted once there is a first hunk
            started = True
            fromdate = '\t{}'.format(fromfiledate) if fromfiledate else ''
            todate = '\t{}'.format(tofiledate) if tofiledate else ''
            yield (b'--- ' + _name_bytes(fromfile) +
                   (fromdate + lineterm).encode('ascii'))
            yield (b'+++ ' + _name_bytes(tofile) +
                   (todate + lineterm).encode('ascii'))

        first, last = group[0], group[-1]
        file1_range = _format_range_unified(first[1], last[2])
        file2_range = _format_range_unified(first[3], last[4])
        yield '@@ -{} +{} @@{}'.format(
            file1_range,
            file2_range,
            lineterm
            ).encode('ascii')

        for tag, i1, i2, j1, j2 in group:
            if tag == 'equal':
                for line in a[i1:i2]:
                    yield b' ' + line
                continue
            if tag in ('replace', 'delete'):
                for line in a[i1:i2]:
                    if line[-1:] != b'\n':
                        # Terminate the line and flag the missing newline
                        line += b'\n\\ No newline at end of file\n'
                    yield b'-' + line
            if tag in ('replace', 'insert'):
                for line in b[j1:j2]:
                    if line[-1:] != b'\n':
                        line += b'\n\\ No newline at end of file\n'
                    yield b'+' + line
154
155
def is_binary(content):
    """Tell whether content looks binary.

    Only the first FIRST_FEW_BYTES bytes are examined; content counts as
    binary when that prefix contains a NUL byte.

    Args:
      content: Bytestring to check for binary content
    Returns: True if a NUL byte was found in the examined prefix
    """
    head = content[:FIRST_FEW_BYTES]
    return b'\0' in head
163
164
def shortid(hexsha):
    """Abbreviate a hex sha to its first seven characters.

    Args:
      hexsha: Hex sha as a bytestring, or None
    Returns: The seven-byte prefix, or seven zeros when hexsha is None
    """
    return hexsha[:7] if hexsha is not None else b"0" * 7
170
171
def patch_filename(p, root):
    """Build the file name shown in a patch.

    Args:
      p: Path as a bytestring, or None for a missing file
      root: Prefix to put in front of the path (e.g. b"a" or b"b")
    Returns: root and path joined by a slash, or b"/dev/null" if p is None
    """
    return b"/dev/null" if p is None else root + b"/" + p
177
178
def write_object_diff(f, store, old_file, new_file, diff_binary=False):
    """Write the diff between two versions of an object.

    The diff header is written first. After that either a one-line
    "Binary files ... differ" notice or a unified diff of the two contents
    follows.

    Args:
      f: File-like object to write to
      store: Store to retrieve objects from, if necessary
      old_file: (path, mode, hexsha) tuple
      new_file: (path, mode, hexsha) tuple
      diff_binary: Whether to diff files even if they
        are considered binary files by is_binary().

    Note: the tuple elements should be None for nonexistent files
    """
    old_path, old_mode, old_id = old_file
    new_path, new_mode, new_id = new_file
    old_label = patch_filename(old_path, b"a")
    new_label = patch_filename(new_path, b"b")

    def load_blob(mode, hexsha):
        # A missing side is represented by an empty blob
        if hexsha is None:
            return Blob.from_string(b'')
        # Gitlinks are shown as a synthetic "Subproject commit" line
        if S_ISGITLINK(mode):
            return Blob.from_string(b"Subproject commit " + hexsha + b"\n")
        return store[hexsha]

    def split_lines(blob):
        return blob.splitlines() if blob else []

    header = gen_diff_header(
        (old_path, new_path), (old_mode, new_mode), (old_id, new_id))
    f.writelines(header)

    old_blob = load_blob(old_mode, old_id)
    new_blob = load_blob(new_mode, new_id)
    treat_as_binary = not diff_binary and (
        is_binary(old_blob.data) or is_binary(new_blob.data))
    if treat_as_binary:
        f.write(
            b"Binary files " + old_label + b" and " + new_label
            + b" differ\n")
        return
    f.writelines(unified_diff(
        split_lines(old_blob), split_lines(new_blob), old_label, new_label))
227
228
229# TODO(jelmer): Support writing unicode, rather than bytes.
def gen_diff_header(paths, modes, shas):
    """Generate the pieces of a blob diff header.

    Yields the "diff --git" line, file mode lines when the modes differ,
    and the "index" line (which is yielded in several pieces).

    Args:
      paths: Tuple with old and new path
      modes: Tuple with old and new modes
      shas: Tuple with old and new shas
    Returns: Iterator over bytestrings
    """
    old_path, new_path = paths
    old_mode, new_mode = modes
    old_sha, new_sha = shas

    # When only one of the paths is known, it is used for both sides of
    # the "diff --git" line.
    if old_path is None:
        old_path = new_path
    elif new_path is None:
        new_path = old_path
    old_label = patch_filename(old_path, b"a")
    new_label = patch_filename(new_path, b"b")
    yield b"diff --git " + old_label + b" " + new_label + b"\n"

    if old_mode != new_mode:
        if new_mode is None:
            yield ("deleted file mode %o\n" % old_mode).encode('ascii')
        else:
            if old_mode is not None:
                yield ("old file mode %o\n" % old_mode).encode('ascii')
            yield ("new file mode %o\n" % new_mode).encode('ascii')

    yield b"index " + shortid(old_sha) + b".." + shortid(new_sha)
    # The mode is appended to the index line only when both sides have one
    if not (new_mode is None or old_mode is None):
        yield (" %o" % new_mode).encode('ascii')
    yield b"\n"
260
261
262# TODO(jelmer): Support writing unicode, rather than bytes.
def write_blob_diff(f, old_file, new_file):
    """Write the diff between two blobs.

    Args:
      f: File-like object to write to
      old_file: (path, mode, blob) tuple; the blob is None if nonexisting
      new_file: (path, mode, blob) tuple; the blob is None if nonexisting

    Note: The use of write_object_diff is recommended over this function.
    """
    old_path, old_mode, old_blob = old_file
    new_path, new_mode, new_blob = new_file
    old_label = patch_filename(old_path, b"a")
    new_label = patch_filename(new_path, b"b")

    def split_lines(blob):
        return [] if blob is None else blob.splitlines()

    # A missing blob has no id; the header generator handles None itself
    blob_ids = (getattr(old_blob, "id", None), getattr(new_blob, "id", None))
    f.writelines(gen_diff_header(
        (old_path, new_path), (old_mode, new_mode), blob_ids))
    f.writelines(unified_diff(
        split_lines(old_blob), split_lines(new_blob), old_label, new_label))
290
291
def write_tree_diff(f, store, old_tree, new_tree, diff_binary=False):
    """Write the diff between two trees.

    Every change reported by ``store.tree_changes`` is written with
    write_object_diff.

    Args:
      f: File-like object to write to.
      store: Object store used to compute the changes and read objects
      old_tree: Old tree id
      new_tree: New tree id
      diff_binary: Whether to diff files even if they
        are considered binary files by is_binary().
    """
    for paths, modes, shas in store.tree_changes(old_tree, new_tree):
        (path_before, path_after) = paths
        (mode_before, mode_after) = modes
        (sha_before, sha_after) = shas
        before = (path_before, mode_before, sha_before)
        after = (path_after, mode_after, sha_after)
        write_object_diff(f, store, before, after, diff_binary=diff_binary)
306
307
def git_am_patch_split(f, encoding=None):
    """Read a git-am-style patch from a file and split it into its parts.

    Args:
      f: File-like object to parse
      encoding: Encoding to use when creating Git objects; falls back to
        ``f.encoding`` and then to "ascii"
    Returns: Tuple with commit object, diff contents and git version
    """
    if not encoding:
        encoding = getattr(f, "encoding", "ascii") or "ascii"
    raw = f.read()
    # Bytes input goes through the bytes parser when the email package
    # offers one; everything else goes through the str parser.
    if isinstance(raw, bytes) and getattr(email.parser, "BytesParser", None):
        message = email.parser.BytesParser().parsebytes(raw)
    else:
        message = email.parser.Parser().parsestr(raw)
    return parse_patch_message(message, encoding)
327
328
def parse_patch_message(msg, encoding=None):
    """Extract a Commit object and patch from an e-mail message.

    The subject (minus any leading "[PATCH ...] " tag) becomes the first
    line of the commit message. Body lines up to a "---" line are appended
    to the message, lines up to a "-- " line form the diff, and the line
    after that is returned as the version.

    Args:
      msg: An email message (email.message.Message)
      encoding: Encoding to use to encode Git commits; "ascii" is used
        when None
    Returns: Tuple with commit object, diff contents and git version
      (None when there is no line after the diff)
    """
    # The parameter defaults to None, which str.encode() does not accept
    encoding = encoding or "ascii"
    c = Commit()
    sender = msg["from"].encode(encoding)
    c.author = sender
    c.committer = sender

    subject = msg["subject"]
    tag_start = subject.find("[PATCH")
    if tag_start != -1:
        tag_end = subject.find("] ", tag_start)
        # An unterminated tag leaves the subject untouched instead of
        # raising
        if tag_end != -1:
            subject = subject[tag_end + 2:]
    message_parts = [(subject.replace("\n", "") + "\n").encode(encoding)]

    body = msg.get_payload(decode=True)
    line_iter = iter(body.splitlines(True))

    first = True
    for line in line_iter:
        if line == b"---\n":
            break
        if first and line.startswith(b"From: "):
            # A leading "From: " body line overrides the author taken from
            # the message header
            c.author = line[len(b"From: "):].rstrip()
        elif first:
            # Blank line between the summary and the rest of the message
            message_parts.append(b"\n" + line)
        else:
            message_parts.append(line)
        first = False
    c.message = b"".join(message_parts)

    # The same iterator is reused, so this continues after the "---" line
    diff_parts = []
    for line in line_iter:
        if line == b"-- \n":
            break
        diff_parts.append(line)
    diff = b"".join(diff_parts)

    try:
        version = next(line_iter).rstrip(b"\n")
    except StopIteration:
        version = None
    return c, diff, version
375