1# web.py -- WSGI smart-http server
2# Copyright (C) 2010 Google, Inc.
3# Copyright (C) 2012 Jelmer Vernooij <jelmer@jelmer.uk>
4#
5# Dulwich is dual-licensed under the Apache License, Version 2.0 and the GNU
# General Public License as published by the Free Software Foundation; version 2.0
7# or (at your option) any later version. You can redistribute it and/or
8# modify it under the terms of either of these two licenses.
9#
10# Unless required by applicable law or agreed to in writing, software
11# distributed under the License is distributed on an "AS IS" BASIS,
12# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
13# See the License for the specific language governing permissions and
14# limitations under the License.
15#
16# You should have received a copy of the licenses; if not, see
17# <http://www.gnu.org/licenses/> for a copy of the GNU General Public License
18# and <http://www.apache.org/licenses/LICENSE-2.0> for a copy of the Apache
19# License, Version 2.0.
20#
21
22"""HTTP server for dulwich that implements the git smart HTTP protocol."""
23
24from io import BytesIO
25import shutil
26import tempfile
27import gzip
28import os
29import re
30import sys
31import time
32from wsgiref.simple_server import (
33    WSGIRequestHandler,
34    ServerHandler,
35    WSGIServer,
36    make_server,
37    )
38
39try:
40    from urlparse import parse_qs
41except ImportError:
42    from urllib.parse import parse_qs
43
44
45from dulwich import log_utils
46from dulwich.protocol import (
47    ReceivableProtocol,
48    )
49from dulwich.repo import (
50    NotGitRepository,
51    Repo,
52    )
53from dulwich.server import (
54    DictBackend,
55    DEFAULT_HANDLERS,
56    generate_info_refs,
57    generate_objects_info_packs,
58    )
59
60
# Module-level logger shared by every handler and server class in this file.
logger = log_utils.getLogger(__name__)


# HTTP status strings (success as well as error statuses), used as the
# ``status`` argument when a response is started.
HTTP_OK = '200 OK'
HTTP_NOT_FOUND = '404 Not Found'
HTTP_FORBIDDEN = '403 Forbidden'
HTTP_ERROR = '500 Internal Server Error'
69
70
def date_time_string(timestamp=None):
    """Format a timestamp as an HTTP date string.

    Args:
      timestamp: Seconds since the epoch. When None, the current time is
        used.
    Returns: The date rendered in GMT, e.g.
        'Thu, 01 Jan 1970 00:00:00 GMT'.
    """
    # From BaseHTTPRequestHandler.date_time_string in BaseHTTPServer.py in the
    # Python 2.6.5 standard library, following modifications:
    #  - Made a global rather than an instance method.
    #  - weekdayname and monthname are renamed and locals rather than class
    #    variables.
    # Copyright (c) 2001-2010 Python Software Foundation; All Rights Reserved
    weekdays = ['Mon', 'Tue', 'Wed', 'Thu', 'Fri', 'Sat', 'Sun']
    # Index 0 is a placeholder because struct_time months are 1-based.
    months = [None,
              'Jan', 'Feb', 'Mar', 'Apr', 'May', 'Jun',
              'Jul', 'Aug', 'Sep', 'Oct', 'Nov', 'Dec']
    if timestamp is None:
        timestamp = time.time()
    year, month, day, hh, mm, ss, wd, _yday, _isdst = time.gmtime(timestamp)
    # The zone designator must be the literal "GMT"; the previous "GMD" was a
    # typo that produced unparseable Date/Expires header values.
    return '%s, %02d %3s %4d %02d:%02d:%02d GMT' % (
            weekdays[wd], day, months[month], year, hh, mm, ss)
87
88
def url_prefix(mat):
    """Return the part of the matched URL that precedes the match.

    Args:
      mat: A regex match object.
    Returns: The text of the searched string before the start of the match,
        with surrounding slashes removed and a single leading slash added
        (so the result never ends with a slash unless it is just '/').
    """
    before_match = mat.string[:mat.start()]
    trimmed = before_match.strip('/')
    return '/' + trimmed
99
100
def get_repo(backend, mat):
    """Open the repository addressed by the URL prefix of a regex match.

    Args:
      backend: Object providing open_repository(path).
      mat: A regex match object for the request path.
    Returns: Whatever backend.open_repository returns for the prefix.
    """
    repo_path = url_prefix(mat)
    return backend.open_repository(repo_path)
104
105
def send_file(req, f, content_type):
    """Stream a file-like object to the client in fixed-size chunks.

    Args:
      req: The HTTPGitRequest object to send output to.
      f: An open file-like object to send, or None if there is no file;
        it is closed once streaming ends.
      content_type: The MIME type for the file.
    Returns: Iterator over the contents of the file, as chunks. When f is
        None the only item is the body of a 404 response; on IOError the
        body of a 500 response is yielded.
    """
    if f is None:
        yield req.not_found('File not found')
        return
    try:
        req.respond(HTTP_OK, content_type)
        chunk = f.read(10240)
        while chunk:
            yield chunk
            chunk = f.read(10240)
    except IOError:
        yield req.error('Error reading file')
    finally:
        # Always release the file, even if the consumer stops iterating.
        f.close()
129
130
def _url_to_path(url):
    """Convert a slash-separated URL fragment to use the OS path separator."""
    segments = url.split('/')
    return os.path.sep.join(segments)
133
134
def get_text_file(req, backend, mat):
    """Serve the repository file named by the matched URL as text/plain.

    The response is marked as non-cacheable before the file is looked up.

    Args:
      req: The HTTPGitRequest for this request.
      backend: Backend used to open the repository.
      mat: A regex match object for the request path.
    Returns: Iterator over the response body chunks.
    """
    req.nocache()
    path = _url_to_path(mat.group())
    logger.info('Sending plain text file %s', path)
    repo = get_repo(backend, mat)
    named_file = repo.get_named_file(path)
    return send_file(req, named_file, 'text/plain')
141
142
def get_loose_object(req, backend, mat):
    """Serve a single loose object in its legacy on-disk encoding.

    Args:
      req: The HTTPGitRequest for this request.
      backend: Backend used to open the repository.
      mat: A regex match whose groups 1 and 2 are the two parts of the
        hex object id taken from the URL.
    Returns: Iterator yielding the object data, or the body of a 404/500
        response when the object is missing or cannot be read.
    """
    hex_sha = ''.join(mat.group(1, 2)).encode('ascii')
    logger.info('Sending loose object %s', hex_sha)
    store = get_repo(backend, mat).object_store
    if not store.contains_loose(hex_sha):
        yield req.not_found('Object not found')
        return
    try:
        payload = store[hex_sha].as_legacy_object()
    except IOError:
        yield req.error('Error reading object')
        return
    # Only advertise long-lived caching once the object was read.
    req.cache_forever()
    req.respond(HTTP_OK, 'application/x-git-loose-object')
    yield payload
158
159
def get_pack_file(req, backend, mat):
    """Serve the pack file named by the matched URL, cacheable forever.

    Args:
      req: The HTTPGitRequest for this request.
      backend: Backend used to open the repository.
      mat: A regex match object for the request path.
    Returns: Iterator over the response body chunks.
    """
    req.cache_forever()
    path = _url_to_path(mat.group())
    logger.info('Sending pack file %s', path)
    repo = get_repo(backend, mat)
    pack_file = repo.get_named_file(path)
    return send_file(req, pack_file, 'application/x-git-packed-objects')
166
167
def get_idx_file(req, backend, mat):
    """Serve the pack index file named by the matched URL, cacheable forever.

    Args:
      req: The HTTPGitRequest for this request.
      backend: Backend used to open the repository.
      mat: A regex match object for the request path.
    Returns: Iterator over the response body chunks.
    """
    req.cache_forever()
    path = _url_to_path(mat.group())
    # Distinguish index requests from pack requests in the log; the message
    # used to be a verbatim copy of the one in get_pack_file.
    logger.info('Sending pack index file %s', path)
    return send_file(req, get_repo(backend, mat).get_named_file(path),
                     'application/x-git-packed-objects-toc')
174
175
def get_info_refs(req, backend, mat):
    """Answer a GET for info/refs, in smart or dumb mode.

    When the query string names a service and the request is not in dumb
    mode, the matching handler writes a ref advertisement through the
    write callable returned by req.respond, and nothing is yielded.
    Otherwise the refs are emitted as a plain text listing.

    Args:
      req: The HTTPGitRequest for this request.
      backend: Backend used to open the repository.
      mat: A regex match object for the request path.
    Returns: Iterator over response body chunks. Yields a 404 body if the
        repository cannot be opened and a 403 body if the requested
        service has no registered handler.
    """
    # QUERY_STRING may be absent from a WSGI environ, so do not index it
    # directly.
    params = parse_qs(req.environ.get('QUERY_STRING', ''))
    service = params.get('service', [None])[0]
    try:
        repo = get_repo(backend, mat)
    except NotGitRepository as e:
        yield req.not_found(str(e))
        return
    if service and not req.dumb:
        try:
            handler_cls = req.handlers.get(service.encode('ascii'), None)
        except UnicodeError:
            # Handlers are keyed by ASCII bytes, so a non-ASCII service name
            # cannot match any of them; treat it as unsupported instead of
            # letting the encoding error escape.
            handler_cls = None
        if handler_cls is None:
            yield req.forbidden('Unsupported service')
            return
        req.nocache()
        write = req.respond(
            HTTP_OK, 'application/x-%s-advertisement' % service)
        # There is no request body for an advertisement, hence the empty
        # input stream.
        proto = ReceivableProtocol(BytesIO().read, write)
        handler = handler_cls(backend, [url_prefix(mat)], proto,
                              http_req=req, advertise_refs=True)
        handler.proto.write_pkt_line(
            b'# service=' + service.encode('ascii') + b'\n')
        handler.proto.write_pkt_line(None)
        handler.handle()
    else:
        # non-smart fallback
        # TODO: select_getanyfile() (see http-backend.c)
        req.nocache()
        req.respond(HTTP_OK, 'text/plain')
        logger.info('Emulating dumb info/refs')
        for text in generate_info_refs(repo):
            yield text
207
208
def get_info_packs(req, backend, mat):
    """Answer a GET for objects/info/packs with a plain text pack listing.

    The response is started (uncached, text/plain) before the repository
    is opened.

    Args:
      req: The HTTPGitRequest for this request.
      backend: Backend used to open the repository.
      mat: A regex match object for the request path.
    Returns: The iterable produced by generate_objects_info_packs.
    """
    req.nocache()
    req.respond(HTTP_OK, 'text/plain')
    logger.info('Emulating dumb info/packs')
    repo = get_repo(backend, mat)
    return generate_objects_info_packs(repo)
214
215
class _LengthLimitedFile(object):
    """Wrapper class to limit the length of reads from a file-like object.

    This is used to ensure EOF is read from the wsgi.input object once
    Content-Length bytes are read. This behavior is required by the WSGI spec
    but not implemented in wsgiref as of 2.5.
    """

    def __init__(self, input, max_bytes):
        """Wrap a file-like object.

        Args:
          input: Object with a read(size) method to read from.
          max_bytes: Total number of bytes that may be read through this
            wrapper.
        """
        self._input = input
        self._bytes_avail = max_bytes

    def read(self, size=-1):
        """Read up to size bytes without exceeding the remaining budget.

        Args:
          size: Maximum number of bytes to read. None or any negative
            value means "everything that is still allowed".
        Returns: The bytes read from the wrapped object, or b'' once the
            budget is exhausted.
        """
        if self._bytes_avail <= 0:
            return b''
        # Treat None and every negative size as "read all", like file
        # objects do; previously only -1 was recognised, so other values
        # bypassed the limit and corrupted the counter.
        if size is None or size < 0 or size > self._bytes_avail:
            size = self._bytes_avail
        data = self._input.read(size)
        # Charge only what was actually returned so that a short read does
        # not make the remaining bytes unreachable.
        self._bytes_avail -= len(data)
        return data

    # TODO: support more methods as necessary
237
238
def handle_service_request(req, backend, mat):
    """Run the smart-HTTP service named by the matched URL.

    The handler registered for the service reads from the request's
    wsgi.input and writes through the callable returned by req.respond;
    on success nothing is yielded.

    Args:
      req: The HTTPGitRequest for this request.
      backend: Backend used to open the repository.
      mat: A regex match object for the request path.
    Returns: Iterator that yields a 403 body for an unknown service or a
        404 body when the repository cannot be opened.
    """
    service = mat.group().lstrip('/')
    logger.info('Handling service request for %s', service)
    service_key = service.encode('ascii')
    handler_cls = req.handlers.get(service_key, None)
    if handler_cls is None:
        yield req.forbidden('Unsupported service')
        return
    # Open the repository only to verify that it exists.
    try:
        get_repo(backend, mat)
    except NotGitRepository as e:
        yield req.not_found(str(e))
        return
    req.nocache()
    content_type = 'application/x-%s-result' % service
    write = req.respond(HTTP_OK, content_type)
    read = req.environ['wsgi.input'].read
    proto = ReceivableProtocol(read, write)
    # TODO(jelmer): Find a way to pass in repo, rather than having handler_cls
    # reopen.
    handler = handler_cls(backend, [url_prefix(mat)], proto, http_req=req)
    handler.handle()
258
259
class HTTPGitRequest(object):
    """State holder for a single git HTTP request and its response headers.

    :ivar environ: the WSGI environment for the request.
    """

    def __init__(self, environ, start_response, dumb=False, handlers=None):
        self._start_response = start_response
        # Headers accumulated for the response, and the cache-control
        # headers that are appended when the response is started.
        self._headers = []
        self._cache_headers = []
        self.environ = environ
        self.dumb = dumb
        self.handlers = handlers

    def add_header(self, name, value):
        """Queue one header for the response."""
        header = (name, value)
        self._headers.append(header)

    def respond(self, status=HTTP_OK, content_type=None, headers=None):
        """Start the response and return what start_response returns.

        Extra headers, the Content-Type (if given) and the current cache
        headers are added to the queued headers, in that order.
        """
        queued = self._headers
        if headers:
            queued.extend(headers)
        if content_type:
            queued.append(('Content-Type', content_type))
        queued.extend(self._cache_headers)

        return self._start_response(status, queued)

    def not_found(self, message):
        """Start a HTTP 404 response and return the encoded message body."""
        # Error responses must not carry caching headers.
        self._cache_headers = []
        logger.info('Not found: %s', message)
        self.respond(HTTP_NOT_FOUND, 'text/plain')
        body = message.encode('ascii')
        return body

    def forbidden(self, message):
        """Start a HTTP 403 response and return the encoded message body."""
        self._cache_headers = []
        logger.info('Forbidden: %s', message)
        self.respond(HTTP_FORBIDDEN, 'text/plain')
        body = message.encode('ascii')
        return body

    def error(self, message):
        """Start a HTTP 500 response and return the encoded message body."""
        self._cache_headers = []
        logger.error('Error: %s', message)
        self.respond(HTTP_ERROR, 'text/plain')
        body = message.encode('ascii')
        return body

    def nocache(self):
        """Select cache headers telling the client never to cache."""
        expires = ('Expires', 'Fri, 01 Jan 1980 00:00:00 GMT')
        pragma = ('Pragma', 'no-cache')
        cache_control = ('Cache-Control',
                         'no-cache, max-age=0, must-revalidate')
        self._cache_headers = [expires, pragma, cache_control]

    def cache_forever(self):
        """Select cache headers telling the client to cache for a year."""
        now = time.time()
        expiry = now + 31536000
        date_header = ('Date', date_time_string(now))
        expires_header = ('Expires', date_time_string(expiry))
        self._cache_headers = [
          date_header,
          expires_header,
          ('Cache-Control', 'public, max-age=31536000'),
          ]
325
326
class HTTPGitApplication(object):
    """WSGI application that dispatches git HTTP requests to handlers.

    :ivar backend: the Backend object backing this application
    """

    # Maps (HTTP method, compiled path pattern) to the handler function.
    services = {
      ('GET', re.compile('/HEAD$')): get_text_file,
      ('GET', re.compile('/info/refs$')): get_info_refs,
      ('GET', re.compile('/objects/info/alternates$')): get_text_file,
      ('GET', re.compile('/objects/info/http-alternates$')): get_text_file,
      ('GET', re.compile('/objects/info/packs$')): get_info_packs,
      ('GET', re.compile('/objects/([0-9a-f]{2})/([0-9a-f]{38})$')):
      get_loose_object,
      ('GET', re.compile('/objects/pack/pack-([0-9a-f]{40})\\.pack$')):
      get_pack_file,
      ('GET', re.compile('/objects/pack/pack-([0-9a-f]{40})\\.idx$')):
      get_idx_file,

      ('POST', re.compile('/git-upload-pack$')): handle_service_request,
      ('POST', re.compile('/git-receive-pack$')): handle_service_request,
    }

    def __init__(self, backend, dumb=False, handlers=None, fallback_app=None):
        self.backend = backend
        self.dumb = dumb
        self.fallback_app = fallback_app
        # Start from a copy of the defaults so overrides stay per-instance.
        merged = dict(DEFAULT_HANDLERS)
        if handlers is not None:
            merged.update(handlers)
        self.handlers = merged

    def __call__(self, environ, start_response):
        """Route the request to the first service whose method and path match.

        Unmatched requests go to fallback_app when one was given, otherwise
        a 404 body is returned.
        """
        path = environ['PATH_INFO']
        method = environ['REQUEST_METHOD']
        req = HTTPGitRequest(environ, start_response, dumb=self.dumb,
                             handlers=self.handlers)
        # environ['QUERY_STRING'] has qs args
        for (smethod, spath), service in self.services.items():
            if smethod != method:
                continue
            mat = spath.search(path)
            if mat:
                return service(req, self.backend, mat)

        if self.fallback_app is not None:
            return self.fallback_app(environ, start_response)
        return [req.not_found('Sorry, that method is not supported')]
380
381
class GunzipFilter(object):
    """WSGI middleware that transparently decompresses gzip request bodies
    before handing the request to the wrapped application.
    """

    def __init__(self, application):
        self.app = application

    def __call__(self, environ, start_response):
        """Replace a gzip-encoded wsgi.input with a decompressing reader.

        Requests without 'gzip' content encoding are passed through
        untouched. For gzip requests the encoding header and any
        CONTENT_LENGTH are removed from environ.
        """
        if environ.get('HTTP_CONTENT_ENCODING', '') != 'gzip':
            return self.app(environ, start_response)

        source = environ['wsgi.input']
        try:
            source.tell()
        except (AttributeError, IOError, NotImplementedError):
            # The gzip implementation in the standard library of Python 2.x
            # requires working '.seek()' and '.tell()' methods on the input
            # stream.  Read the data into a temporary file to work around
            # this limitation.
            spooled = tempfile.SpooledTemporaryFile(16 * 1024 * 1024)
            shutil.copyfileobj(source, spooled)
            spooled.seek(0)
            source = spooled

        environ['wsgi.input'] = gzip.GzipFile(
            filename=None, fileobj=source, mode='r')
        del environ['HTTP_CONTENT_ENCODING']
        # The compressed length no longer describes the body being read.
        environ.pop('CONTENT_LENGTH', None)

        return self.app(environ, start_response)
411
412
class LimitedInputFilter(object):
    """WSGI middleware that caps reads from the request body at the number
    of bytes given in Content-Length.
    """

    def __init__(self, application):
        self.app = application

    def __call__(self, environ, start_response):
        """Wrap wsgi.input when a non-empty CONTENT_LENGTH is present."""
        # This is not necessary if this app is run from a conforming WSGI
        # server. Unfortunately, there's no way to tell that at this point.
        # TODO: git may use HTTP/1.1 chunked encoding instead of specifying
        # content-length
        declared_length = environ.get('CONTENT_LENGTH', '')
        if declared_length:
            stream = environ['wsgi.input']
            limit = int(declared_length)
            environ['wsgi.input'] = _LengthLimitedFile(stream, limit)
        return self.app(environ, start_response)
431
432
def make_wsgi_chain(*args, **kwargs):
    """Build an HTTPGitApplication wrapped in the middleware it needs.

    All arguments are forwarded to HTTPGitApplication. The application is
    wrapped in GunzipFilter, which is in turn wrapped in LimitedInputFilter.
    """
    git_app = HTTPGitApplication(*args, **kwargs)
    gunzipped = GunzipFilter(git_app)
    return LimitedInputFilter(gunzipped)
440
441
class ServerHandlerLogger(ServerHandler):
    """ServerHandler that routes its logging through dulwich's logger."""

    def log_exception(self, exc_info):
        """Log the exception described by exc_info."""
        message = 'Exception happened during processing of request'
        if sys.version_info >= (2, 7):
            logger.exception(message, exc_info=exc_info)
        else:
            # Older interpreters get the call without the exc_info keyword.
            logger.exception(message)

    def log_message(self, format, *args):
        """Log an informational message."""
        logger.info(format, *args)

    def log_error(self, *args):
        """Log an error message."""
        logger.error(*args)
457
458
class WSGIRequestHandlerLogger(WSGIRequestHandler):
    """WSGIRequestHandler that uses dulwich's logger for logging exceptions."""

    # Longest request line accepted, matching the limit used by the standard
    # library's own request handlers.
    _MAX_REQUEST_LINE = 65536

    def log_exception(self, exc_info):
        """Log the exception described by exc_info."""
        logger.exception('Exception happened during processing of request',
                         exc_info=exc_info)

    def log_message(self, format, *args):
        """Log an informational message."""
        logger.info(format, *args)

    def log_error(self, *args):
        """Log an error message."""
        logger.error(*args)

    def handle(self):
        """Handle a single HTTP request"""

        # Bound the read so a client cannot make the server buffer an
        # arbitrarily long request line; an unbounded readline() was used
        # before.
        self.raw_requestline = self.rfile.readline(self._MAX_REQUEST_LINE + 1)
        if len(self.raw_requestline) > self._MAX_REQUEST_LINE:
            # These attributes are normally set by parse_request(); give
            # them values before reporting the error.
            self.requestline = ''
            self.request_version = ''
            self.command = ''
            self.send_error(414)
            return

        if not self.parse_request():  # An error code has been sent, just exit
            return

        handler = ServerHandlerLogger(
            self.rfile, self.wfile, self.get_stderr(), self.get_environ()
        )
        handler.request_handler = self      # backpointer for logging
        handler.run(self.server.get_app())
484
485
class WSGIServerLogger(WSGIServer):
    """WSGIServer that reports request errors through dulwich's logger."""

    def handle_error(self, request, client_address):
        """Handle an error. """
        peer = str(client_address)
        logger.exception(
            'Exception happened during processing of request from %s' % peer)
493
494
def main(argv=sys.argv):
    """Parse command-line options and serve a repository over HTTP.

    Args:
      argv: Full argument vector, including the program name. The first
        positional argument after the program name, if any, is the path of
        the repository to serve; otherwise the current directory is used.
    """
    import optparse
    parser = optparse.OptionParser()
    parser.add_option("-l", "--listen_address", dest="listen_address",
                      default="localhost",
                      help="Binding IP address.")
    parser.add_option("-p", "--port", dest="port", type=int,
                      default=8000,
                      help="Port to listen on.")
    options, args = parser.parse_args(argv)

    # args[0] is the program name, so the repository path is args[1].
    gitdir = args[1] if len(args) > 1 else os.getcwd()

    log_utils.default_logging_config()
    repositories = {'/': Repo(gitdir)}
    app = make_wsgi_chain(DictBackend(repositories))
    address, port = options.listen_address, options.port
    server = make_server(address, port, app,
                         handler_class=WSGIRequestHandlerLogger,
                         server_class=WSGIServerLogger)
    logger.info('Listening for HTTP connections on %s:%d',
                options.listen_address, options.port)
    server.serve_forever()
521
522
# Start the server only when this file is executed as a script.
if __name__ == '__main__':
    main()
525