1# protocol.py -- Shared parts of the git protocols
2# Copyright (C) 2008 John Carr <john.carr@unrouted.co.uk>
3# Copyright (C) 2008-2012 Jelmer Vernooij <jelmer@jelmer.uk>
4#
# Dulwich is dual-licensed under the Apache License, Version 2.0 and the GNU
# General Public License as published by the Free Software Foundation; version
# 2.0 or (at your option) any later version. You can redistribute it and/or
# modify it under the terms of either of these two licenses.
9#
10# Unless required by applicable law or agreed to in writing, software
11# distributed under the License is distributed on an "AS IS" BASIS,
12# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
13# See the License for the specific language governing permissions and
14# limitations under the License.
15#
16# You should have received a copy of the licenses; if not, see
17# <http://www.gnu.org/licenses/> for a copy of the GNU General Public License
18# and <http://www.apache.org/licenses/LICENSE-2.0> for a copy of the Apache
19# License, Version 2.0.
20#
21
22"""Generic functions for talking the git smart server protocol."""
23
24from io import BytesIO
25from os import (
26    SEEK_END,
27    )
28import socket
29
30import dulwich
31from dulwich.errors import (
32    HangupException,
33    GitProtocolError,
34    )
35
# TCP port for the git protocol (git://).
TCP_GIT_PORT = 9418

# Forty ASCII '0' characters.
ZERO_SHA = b"0" * 40

# Acknowledgement modes, as returned by ack_type().
SINGLE_ACK = 0
MULTI_ACK = 1
MULTI_ACK_DETAILED = 2

# Sideband channel numbers.
# pack data
SIDE_BAND_CHANNEL_DATA = 1
# progress messages
SIDE_BAND_CHANNEL_PROGRESS = 2
# fatal error message just before stream aborts
SIDE_BAND_CHANNEL_FATAL = 3

# Capability names, as byte strings.
CAPABILITY_DEEPEN_SINCE = b'deepen-since'
CAPABILITY_DEEPEN_NOT = b'deepen-not'
CAPABILITY_DEEPEN_RELATIVE = b'deepen-relative'
CAPABILITY_DELETE_REFS = b'delete-refs'
CAPABILITY_INCLUDE_TAG = b'include-tag'
CAPABILITY_MULTI_ACK = b'multi_ack'
CAPABILITY_MULTI_ACK_DETAILED = b'multi_ack_detailed'
CAPABILITY_NO_DONE = b'no-done'
CAPABILITY_NO_PROGRESS = b'no-progress'
CAPABILITY_OFS_DELTA = b'ofs-delta'
CAPABILITY_QUIET = b'quiet'
CAPABILITY_REPORT_STATUS = b'report-status'
CAPABILITY_SHALLOW = b'shallow'
CAPABILITY_SIDE_BAND = b'side-band'
CAPABILITY_SIDE_BAND_64K = b'side-band-64k'
CAPABILITY_THIN_PACK = b'thin-pack'
CAPABILITY_AGENT = b'agent'
CAPABILITY_SYMREF = b'symref'

# Magic ref that is used to attach capabilities to when
# there are no refs. Should always be set to ZERO_SHA.
CAPABILITIES_REF = b'capabilities^{}'

# Capabilities included in both the upload and the receive sets below.
COMMON_CAPABILITIES = [
    CAPABILITY_OFS_DELTA,
    CAPABILITY_SIDE_BAND,
    CAPABILITY_SIDE_BAND_64K,
    CAPABILITY_AGENT,
    CAPABILITY_NO_PROGRESS]
# Common capabilities plus the upload-specific ones.
KNOWN_UPLOAD_CAPABILITIES = set(COMMON_CAPABILITIES + [
    CAPABILITY_THIN_PACK,
    CAPABILITY_MULTI_ACK,
    CAPABILITY_MULTI_ACK_DETAILED,
    CAPABILITY_INCLUDE_TAG,
    CAPABILITY_DEEPEN_SINCE,
    CAPABILITY_SYMREF,
    CAPABILITY_SHALLOW,
    CAPABILITY_DEEPEN_NOT,
    CAPABILITY_DEEPEN_RELATIVE,
    ])
# Common capabilities plus report-status.
KNOWN_RECEIVE_CAPABILITIES = set(COMMON_CAPABILITIES + [
    CAPABILITY_REPORT_STATUS])
93
94
def agent_string():
    """Return the agent identifier for the running dulwich version.

    The components of ``dulwich.__version__`` are joined with dots, so a
    version tuple of any length is accepted (not only exactly three
    integers).

    :return: ``b'dulwich/<version>'`` encoded as ASCII bytes.
    """
    version = '.'.join(str(part) for part in dulwich.__version__)
    return ('dulwich/' + version).encode('ascii')
97
98
def capability_agent():
    """Build the ``agent`` capability for this implementation.

    :return: The agent capability name and agent_string(), joined by ``=``.
    """
    return b'='.join((CAPABILITY_AGENT, agent_string()))
101
102
def capability_symref(from_ref, to_ref):
    """Build a ``symref`` capability describing one symbolic ref.

    :param from_ref: Name of the symbolic ref, as bytes.
    :param to_ref: Name of the ref it points at, as bytes.
    :return: ``symref=<from_ref>:<to_ref>`` as bytes.
    """
    prefix = CAPABILITY_SYMREF + b'='
    return prefix + from_ref + b':' + to_ref
105
106
def extract_capability_names(capabilities):
    """Collect the names of the given capabilities, dropping any values.

    :param capabilities: Iterable of capability byte strings, each either
        ``name`` or ``name=value``.
    :return: A set with the name part of every capability.
    """
    names = set()
    for capability in capabilities:
        names.add(parse_capability(capability)[0])
    return names
109
110
def parse_capability(capability):
    """Split a capability into its name and optional value.

    :param capability: Capability as bytes, either ``name`` or ``name=value``.
    :return: Tuple ``(name, value)``; value is None when there is no ``=``.
        Only the first ``=`` separates name from value.
    """
    name, separator, value = capability.partition(b'=')
    if separator:
        return (name, value)
    return (name, None)
116
117
def symref_capabilities(symrefs):
    """Build one ``symref`` capability per symbolic ref.

    :param symrefs: Iterable of argument tuples for capability_symref().
    :return: List of symref capabilities, in input order.
    """
    capabilities = []
    for ref_pair in symrefs:
        capabilities.append(capability_symref(*ref_pair))
    return capabilities
120
121
# Command keywords, as byte strings.
COMMAND_DEEPEN = b'deepen'
COMMAND_SHALLOW = b'shallow'
COMMAND_UNSHALLOW = b'unshallow'
COMMAND_DONE = b'done'
COMMAND_WANT = b'want'
COMMAND_HAVE = b'have'
128
129
class ProtocolFile(object):
    """Minimal file-like adapter for network ops that expect a file object.

    The supplied callables are exposed directly as ``read`` and ``write``;
    ``tell`` and ``close`` exist only to satisfy the interface.
    """

    def __init__(self, read, write):
        """Store the callables used as this object's read and write.

        :param read: Callable used as the ``read`` attribute.
        :param write: Callable used as the ``write`` attribute.
        """
        self.write = write
        self.read = read

    def close(self):
        """Do nothing; there is no underlying resource to release here."""
        return None

    def tell(self):
        """Return None; no position is tracked."""
        return None
142
143
def pkt_line(data):
    """Frame a payload as a pkt-line.

    :param data: The payload to wrap, as bytes, or None for a flush-pkt.
    :return: The payload preceded by its four-hex-digit length (the length
        counts the four header bytes too); for None, the flush-pkt
        ``b'0000'``.
    """
    if data is not None:
        header = '{:04x}'.format(len(data) + 4).encode('ascii')
        return header + data
    return b'0000'
154
155
class Protocol(object):
    """Class for interacting with a remote git process over the wire.

    Parts of the git wire protocol use 'pkt-lines' to communicate. A pkt-line
    consists of the length of the line as a 4-byte hex string, followed by the
    payload data. The length includes the 4-byte header. The special line
    '0000' indicates the end of a section of input and is called a 'flush-pkt'.

    For details on the pkt-line format, see the cgit distribution:
        Documentation/technical/protocol-common.txt
    """

    def __init__(self, read, write, close=None, report_activity=None):
        """Create a protocol wrapper around a pair of I/O callables.

        :param read: Callable taking a byte count and returning the bytes
            read.
        :param write: Callable taking the bytes to send.
        :param close: Optional callable invoked by close().
        :param report_activity: Optional callable invoked as
            ``report_activity(num_bytes, 'read')`` or
            ``report_activity(num_bytes, 'write')``.
        """
        self.read = read
        self.write = write
        self._close = close
        self.report_activity = report_activity
        # Holds at most one unread pkt-line; see unread_pkt_line().
        self._readahead = None

    def close(self):
        """Invoke the close callable, if one was supplied."""
        if self._close:
            self._close()

    def __enter__(self):
        return self

    def __exit__(self, exc_type, exc_val, exc_tb):
        self.close()

    def read_pkt_line(self):
        """Reads a pkt-line from the remote git process.

        This method may read from the readahead buffer; see unread_pkt_line.

        :return: The next string from the stream, without the length prefix, or
            None for a flush-pkt ('0000'). An empty pkt-line ('0004') yields
            an empty byte string.
        :raise HangupException: If the stream is at EOF.
        :raise GitProtocolError: If the length prefix is 1, 2 or 3 (smaller
            than the header itself), if the payload is shorter than
            announced, or if the read raises socket.error.
        """
        if self._readahead is None:
            read = self.read
        else:
            # Consume the single buffered pkt-line instead of the wire.
            read = self._readahead.read
            self._readahead = None

        try:
            sizestr = read(4)
            if not sizestr:
                raise HangupException()
            size = int(sizestr, 16)
            if size == 0:
                if self.report_activity:
                    self.report_activity(4, 'read')
                return None
            if size < 4:
                # The length includes the 4-byte header, so 1-3 can never be
                # valid; refuse rather than calling read() with a negative
                # count.
                raise GitProtocolError(
                    'Invalid pkt-line length prefix %04x' % size)
            if self.report_activity:
                self.report_activity(size, 'read')
            if size == 4:
                # Empty payload: do not issue a zero-byte read, which some
                # read callables (e.g. ReceivableProtocol.read) reject.
                return b''
            pkt_contents = read(size-4)
        except socket.error as e:
            raise GitProtocolError(e)
        else:
            if len(pkt_contents) + 4 != size:
                raise GitProtocolError(
                    'Length of pkt read %04x does not match length prefix %04x'
                    % (len(pkt_contents) + 4, size))
            return pkt_contents

    def eof(self):
        """Test whether the protocol stream has reached EOF.

        Note that this refers to the actual stream EOF and not just a
        flush-pkt.

        :return: True if the stream is at EOF, False otherwise.
        """
        try:
            next_line = self.read_pkt_line()
        except HangupException:
            return True
        # Not at EOF: push the line back so the next read sees it again.
        self.unread_pkt_line(next_line)
        return False

    def unread_pkt_line(self, data):
        """Unread a single line of data into the readahead buffer.

        This method can be used to unread a single pkt-line into a fixed
        readahead buffer.

        :param data: The data to unread, without the length prefix.
        :raise ValueError: If more than one pkt-line is unread.
        """
        if self._readahead is not None:
            raise ValueError('Attempted to unread multiple pkt-lines.')
        self._readahead = BytesIO(pkt_line(data))

    def read_pkt_seq(self):
        """Read a sequence of pkt-lines from the remote git process.

        Iteration also stops at an empty pkt-line, because the loop tests
        the truthiness of each line.

        :return: Yields each line of data up to but not including the next
            flush-pkt.
        """
        pkt = self.read_pkt_line()
        while pkt:
            yield pkt
            pkt = self.read_pkt_line()

    def write_pkt_line(self, line):
        """Sends a pkt-line to the remote git process.

        :param line: A string containing the data to send, without the length
            prefix, or None to send a flush-pkt.
        :raise GitProtocolError: If the write raises socket.error.
        """
        try:
            line = pkt_line(line)
            self.write(line)
            if self.report_activity:
                self.report_activity(len(line), 'write')
        except socket.error as e:
            raise GitProtocolError(e)

    def write_file(self):
        """Return a writable file-like object for this protocol."""

        class ProtocolFile(object):
            """Write-only file object that counts the bytes written."""

            def __init__(self, proto):
                self._proto = proto
                self._offset = 0

            def write(self, data):
                # Data is passed to the protocol's raw write callable, not
                # wrapped in pkt-lines.
                self._proto.write(data)
                self._offset += len(data)

            def tell(self):
                return self._offset

            def close(self):
                pass

        return ProtocolFile(self)

    def write_sideband(self, channel, blob):
        """Write multiplexed data to the sideband.

        The blob is split into chunks of at most 65515 bytes, each sent as
        its own pkt-line prefixed with the channel byte. An empty blob sends
        nothing.

        :param channel: An int specifying the channel to write to.
        :param blob: A blob of data (as a string) to send on this channel.
        """
        # a pktline can be a max of 65520. a sideband line can therefore be
        # 65520-5 = 65515
        # WTF: Why have the len in ASCII, but the channel in binary.
        while blob:
            self.write_pkt_line(bytes(bytearray([channel])) + blob[:65515])
            blob = blob[65515:]

    def send_cmd(self, cmd, *args):
        """Send a command and some arguments to a git server.

        Only used for the TCP git protocol (git://).

        :param cmd: The remote service to access.
        :param args: List of arguments to send to remove service.
        """
        self.write_pkt_line(cmd + b" " + b"".join([(a + b"\0") for a in args]))

    def read_cmd(self):
        """Read a command and some arguments from the git client

        Only used for the TCP git protocol (git://).

        :return: A tuple of (command, [list of arguments]).
        :raise GitProtocolError: If a flush-pkt is received instead of a
            command line.
        :raise AssertionError: If the line has no space-separated argument
            section, or the arguments do not end with a NUL byte.
        """
        line = self.read_pkt_line()
        if line is None:
            raise GitProtocolError('Expected a command but got a flush-pkt')
        # partition() leaves args empty when there is no space, so the check
        # below rejects such lines instead of mis-slicing them.
        cmd, _, args = line.partition(b" ")
        # Kept as an assert so callers still see AssertionError for a
        # malformed command line.
        assert args[-1:] == b"\x00"
        return cmd, args[:-1].split(b"\0")
329
330
# Used as the default ``rbufsize`` argument of ReceivableProtocol.
_RBUFSIZE = 8192  # Default read buffer size.
332
333
class ReceivableProtocol(Protocol):
    """Variant of Protocol that allows reading up to a size without blocking.

    This class has a recv() method that behaves like socket.recv() in addition
    to a read() method.

    If you want to read n bytes from the wire and block until exactly n bytes
    (or EOF) are read, use read(n). If you want to read at most n bytes from
    the wire but don't care if you get less, use recv(n). Note that recv(n)
    will still block until at least one byte is read.
    """

    def __init__(self, recv, write, close=None, report_activity=None,
                 rbufsize=_RBUFSIZE):
        """Create a protocol on top of a recv-style callable.

        :param recv: Callable taking a maximum byte count and returning the
            bytes received; an empty result is treated as end of input by
            read().
        :param write: Callable taking the bytes to send.
        :param close: Optional callable invoked by close().
        :param report_activity: Optional activity callback, passed through to
            Protocol.
        :param rbufsize: Number of bytes requested from recv when recv()
            needs to refill its buffer.
        """
        # This class's own buffered read() is handed to Protocol as the read
        # callable.
        super(ReceivableProtocol, self).__init__(
                self.read, write, close=close, report_activity=report_activity)
        self._recv = recv
        # Bytes received but not yet handed out; the stream position marks
        # the start of the unconsumed part.
        self._rbuf = BytesIO()
        self._rbufsize = rbufsize

    def read(self, size):
        """Read size bytes, calling recv repeatedly until they are available.

        :param size: Number of bytes wanted; must be greater than zero.
        :return: The bytes read. Fewer than size bytes are returned only if
            recv returned no data before size bytes were collected.
        """
        # From _fileobj.read in socket.py in the Python 2.6.5 standard library,
        # with the following modifications:
        #  - omit the size <= 0 branch
        #  - seek back to start rather than 0 in case some buffer has been
        #    consumed.
        #  - use SEEK_END instead of the magic number.
        # Copyright (c) 2001-2010 Python Software Foundation; All Rights
        # Reserved
        # Licensed under the Python Software Foundation License.
        # TODO: see if buffer is more efficient than cBytesIO.
        assert size > 0

        # Our use of BytesIO rather than lists of string objects returned by
        # recv() minimizes memory usage and fragmentation that occurs when
        # rbufsize is large compared to the typical return value of recv().
        buf = self._rbuf
        start = buf.tell()
        buf.seek(0, SEEK_END)
        # buffer may have been partially consumed by recv()
        buf_len = buf.tell() - start
        if buf_len >= size:
            # Already have size bytes in our buffer?  Extract and return.
            buf.seek(start)
            rv = buf.read(size)
            # Keep whatever is left over in a fresh buffer positioned at 0.
            self._rbuf = BytesIO()
            self._rbuf.write(buf.read())
            self._rbuf.seek(0)
            return rv

        self._rbuf = BytesIO()  # reset _rbuf.  we consume it via buf.
        while True:
            left = size - buf_len
            # recv() will malloc the amount of memory given as its
            # parameter even though it often returns much less data
            # than that.  The returned data string is short lived
            # as we copy it into a BytesIO and free it.  This avoids
            # fragmentation issues on many platforms.
            data = self._recv(left)
            if not data:
                # No more data: return what has been collected so far.
                break
            n = len(data)
            if n == size and not buf_len:
                # Shortcut.  Avoid buffer data copies when:
                # - We have no data in our buffer.
                # AND
                # - Our call to recv returned exactly the
                #   number of bytes we were asked to read.
                return data
            if n == left:
                # This chunk completes the request.
                buf.write(data)
                del data  # explicit free
                break
            assert n <= left, "_recv(%d) returned %d bytes" % (left, n)
            buf.write(data)
            buf_len += n
            del data  # explicit free
            # assert buf_len == buf.tell()
        # Return everything from the first unconsumed byte onwards.
        buf.seek(start)
        return buf.read()

    def recv(self, size):
        """Return at most size bytes, preferring already-buffered data.

        The recv callable is invoked only when the internal buffer holds no
        unread data, and then at most once per call.

        :param size: Maximum number of bytes to return; must be greater than
            zero.
        :return: Up to size bytes.
        """
        assert size > 0

        # Measure the buffer's total length without losing the read position.
        buf = self._rbuf
        start = buf.tell()
        buf.seek(0, SEEK_END)
        buf_len = buf.tell()
        buf.seek(start)

        left = buf_len - start
        if not left:
            # only read from the wire if our read buffer is exhausted
            data = self._recv(self._rbufsize)
            if len(data) == size:
                # shortcut: skip the buffer if we read exactly size bytes
                return data
            # Replace the exhausted buffer with the newly received data.
            buf = BytesIO()
            buf.write(data)
            buf.seek(0)
            del data  # explicit free
            self._rbuf = buf
        return buf.read(size)
437
438
def extract_capabilities(text):
    """Split a line into its leading text and a trailing capability list.

    The capability list, when present, follows a NUL byte and is separated
    by single spaces.

    :param text: Line to examine, as bytes.
    :return: Tuple of (text before the NUL, list of capabilities). Without a
        NUL byte the text is returned unchanged with an empty list.
    """
    if b"\0" in text:
        head, caps = text.rstrip().split(b"\0")
        return (head, caps.strip().split(b" "))
    return text, []
449
450
def extract_want_line_capabilities(text):
    """Split a want line into the want itself and its capability list.

    Unlike other lines, a want line separates its capabilities from the rest
    of the line with a space rather than a NUL byte:

        want obj-id cap1 cap2 ...

    :param text: Want line to examine, as bytes.
    :return: Tuple of (first two space-separated fields joined by a space,
        list of the remaining fields). With fewer than three fields the text
        is returned unchanged with an empty list.
    """
    fields = text.rstrip().split(b" ")
    if len(fields) >= 3:
        want_part = b" ".join(fields[:2])
        return (want_part, fields[2:])
    return text, []
466
467
def ack_type(capabilities):
    """Pick the acknowledgement mode implied by a capabilities list.

    :param capabilities: Container of capability names, as bytes.
    :return: MULTI_ACK_DETAILED if ``multi_ack_detailed`` is present,
        otherwise MULTI_ACK if ``multi_ack`` is present, otherwise
        SINGLE_ACK.
    """
    # Checked in order, so the detailed variant takes precedence.
    ack_modes = (
        (b'multi_ack_detailed', MULTI_ACK_DETAILED),
        (b'multi_ack', MULTI_ACK),
    )
    for capability, mode in ack_modes:
        if capability in capabilities:
            return mode
    return SINGLE_ACK
475
476
class BufferedPktLineWriter(object):
    """Writer that wraps its data in pkt-lines and has an independent buffer.

    Consecutive calls to write() wrap the data in a pkt-line and then buffers
    it until enough lines have been written such that their total length
    (including length prefix) reach the buffer size.
    """

    def __init__(self, write, bufsize=65515):
        """Initialize the BufferedPktLineWriter.

        :param write: A write callback for the underlying writer.
        :param bufsize: The internal buffer size, including length prefixes.
        """
        self._write = write
        self._bufsize = bufsize
        self._wbuf = BytesIO()
        # Number of bytes currently held in _wbuf.
        self._buflen = 0

    def write(self, data):
        """Write data, wrapping it in a pkt-line.

        When the new line fills the buffer, the part that fits is appended,
        the buffer is flushed, and the rest of the line starts the next
        buffer.

        :param data: Payload to wrap, or None for a flush-pkt.
        """
        line = pkt_line(data)
        line_len = len(line)
        # Bytes by which this line would reach or exceed the buffer size.
        over = self._buflen + line_len - self._bufsize
        if over >= 0:
            start = line_len - over
            self._wbuf.write(line[:start])
            self.flush()
        else:
            start = 0
        saved = line[start:]
        self._wbuf.write(saved)
        self._buflen += len(saved)

    def flush(self):
        """Flush all data from the buffer.

        Buffered bytes, if any, are passed to the write callback in a single
        call, and the buffer and its length counter are reset.
        """
        data = self._wbuf.getvalue()
        if data:
            self._write(data)
        # Reset the counter that write() consults; without this the buffer
        # would still look full after being emptied.
        self._buflen = 0
        self._wbuf = BytesIO()
518
519
class PktLineParser(object):
    """Incremental pkt-line parser that reports each complete packet.

    Data may arrive in arbitrary fragments; bytes that do not yet form a
    complete pkt-line are kept until more data is supplied.
    """

    def __init__(self, handle_pkt):
        """Create a parser.

        :param handle_pkt: Callback invoked with the payload of each complete
            pkt-line, or with None for a flush-pkt.
        """
        self._readahead = BytesIO()
        self.handle_pkt = handle_pkt

    def parse(self, data):
        """Feed a fragment of data and dispatch every packet it completes.

        :param data: The next bytes of the stream.
        """
        self._readahead.write(data)
        pending = self._readahead.getvalue()
        while len(pending) >= 4:
            length = int(pending[:4], 16)
            if length == 0:
                # flush-pkt: only the four header bytes are consumed.
                packet, advance = None, 4
            elif length <= len(pending):
                packet, advance = pending[4:length], length
            else:
                # The packet is not complete yet; wait for more data.
                break
            self.handle_pkt(packet)
            pending = pending[advance:]
        # Keep the unparsed remainder; writing (rather than constructing the
        # buffer from it) leaves the position at the end for the next append.
        leftover = BytesIO()
        leftover.write(pending)
        self._readahead = leftover

    def get_tail(self):
        """Return the bytes received so far that were not part of a packet."""
        return self._readahead.getvalue()
551