# web.py -- WSGI smart-http server
# Copyright (C) 2010 Google, Inc.
# Copyright (C) 2012 Jelmer Vernooij <jelmer@jelmer.uk>
#
# Dulwich is dual-licensed under the Apache License, Version 2.0 and the GNU
# General Public License as public by the Free Software Foundation; version 2.0
# or (at your option) any later version. You can redistribute it and/or
# modify it under the terms of either of these two licenses.
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
#
# You should have received a copy of the licenses; if not, see
# <http://www.gnu.org/licenses/> for a copy of the GNU General Public License
# and <http://www.apache.org/licenses/LICENSE-2.0> for a copy of the Apache
# License, Version 2.0.
#

"""HTTP server for dulwich that implements the git smart HTTP protocol."""

from io import BytesIO
import shutil
import tempfile
import gzip
import os
import re
import sys
import time
from wsgiref.simple_server import (
    WSGIRequestHandler,
    ServerHandler,
    WSGIServer,
    make_server,
    )

try:
    from urlparse import parse_qs
except ImportError:
    from urllib.parse import parse_qs


from dulwich import log_utils
from dulwich.protocol import (
    ReceivableProtocol,
    )
from dulwich.repo import (
    NotGitRepository,
    Repo,
    )
from dulwich.server import (
    DictBackend,
    DEFAULT_HANDLERS,
    generate_info_refs,
    generate_objects_info_packs,
    )


logger = log_utils.getLogger(__name__)


# HTTP error strings
HTTP_OK = '200 OK'
HTTP_NOT_FOUND = '404 Not Found'
HTTP_FORBIDDEN = '403 Forbidden'
HTTP_ERROR = '500 Internal Server Error'


def date_time_string(timestamp=None):
    """Format a timestamp as an HTTP date string.

    Args:
      timestamp: Seconds since the epoch; defaults to the current time.
    Returns: A string such as ``'Thu, 01 Jan 1970 00:00:00 GMT'``.
    """
    # From BaseHTTPRequestHandler.date_time_string in BaseHTTPServer.py in the
    # Python 2.6.5 standard library, following modifications:
    #  - Made a global rather than an instance method.
    #  - weekdayname and monthname are renamed and locals rather than class
    #    variables.
    # Copyright (c) 2001-2010 Python Software Foundation; All Rights Reserved
    weekdays = ['Mon', 'Tue', 'Wed', 'Thu', 'Fri', 'Sat', 'Sun']
    months = [None,
              'Jan', 'Feb', 'Mar', 'Apr', 'May', 'Jun',
              'Jul', 'Aug', 'Sep', 'Oct', 'Nov', 'Dec']
    if timestamp is None:
        timestamp = time.time()
    year, month, day, hh, mm, ss, wd, y, z = time.gmtime(timestamp)
    # The zone designator of an HTTP date is always the literal "GMT".
    return '%s, %02d %3s %4d %02d:%02d:%02d GMT' % (
        weekdays[wd], day, months[month], year, hh, mm, ss)


def url_prefix(mat):
    """Extract the URL prefix from a regex match.

    Args:
      mat: A regex match object.
    Returns: The URL prefix, defined as the text before the match in the
        original string. Normalized to start with exactly one leading slash
        and to have no trailing slash.
    """
    return '/' + mat.string[:mat.start()].strip('/')


def get_repo(backend, mat):
    """Get a Repo instance for the given backend and URL regex match."""
    return backend.open_repository(url_prefix(mat))


def send_file(req, f, content_type):
    """Send a file-like object to the request output.

    Args:
      req: The HTTPGitRequest object to send output to.
      f: An open file-like object to send; will be closed. If None, a
        404 response is produced instead.
      content_type: The MIME type for the file.
    Returns: Iterator over the contents of the file, as chunks.
    """
    if f is None:
        yield req.not_found('File not found')
        return
    try:
        req.respond(HTTP_OK, content_type)
        while True:
            data = f.read(10240)
            if not data:
                break
            yield data
    except IOError:
        yield req.error('Error reading file')
    finally:
        f.close()


def _url_to_path(url):
    """Convert a '/'-separated URL path to use the local path separator."""
    return url.replace('/', os.path.sep)


def get_text_file(req, backend, mat):
    """Send the named file matched by ``mat`` as uncached plain text.

    Args:
      req: The HTTPGitRequest object to send output to.
      backend: Backend used to open the repository.
      mat: Regex match for the request path; the matched text names the file.
    Returns: Iterator over the contents of the file, as chunks.
    """
    req.nocache()
    path = _url_to_path(mat.group())
    logger.info('Sending plain text file %s', path)
    return send_file(req, get_repo(backend, mat).get_named_file(path),
                     'text/plain')


def get_loose_object(req, backend, mat):
    """Send a single loose object in its legacy on-disk representation.

    Args:
      req: The HTTPGitRequest object to send output to.
      backend: Backend used to open the repository.
      mat: Regex match whose two groups together form the hex object id.
    Returns: Iterator yielding the object data, or an error message body
        after a 404/500 response has been started.
    """
    sha = (mat.group(1) + mat.group(2)).encode('ascii')
    logger.info('Sending loose object %s', sha)
    object_store = get_repo(backend, mat).object_store
    if not object_store.contains_loose(sha):
        yield req.not_found('Object not found')
        return
    try:
        data = object_store[sha].as_legacy_object()
    except IOError:
        yield req.error('Error reading object')
        return
    req.cache_forever()
    req.respond(HTTP_OK, 'application/x-git-loose-object')
    yield data


def get_pack_file(req, backend, mat):
    """Send the pack file matched by ``mat`` with cache-forever headers.

    Args:
      req: The HTTPGitRequest object to send output to.
      backend: Backend used to open the repository.
      mat: Regex match for the request path; the matched text names the file.
    Returns: Iterator over the contents of the file, as chunks.
    """
    req.cache_forever()
    path = _url_to_path(mat.group())
    logger.info('Sending pack file %s', path)
    return send_file(req, get_repo(backend, mat).get_named_file(path),
                     'application/x-git-packed-objects')


def get_idx_file(req, backend, mat):
    """Send the pack index matched by ``mat`` with cache-forever headers.

    Args:
      req: The HTTPGitRequest object to send output to.
      backend: Backend used to open the repository.
      mat: Regex match for the request path; the matched text names the file.
    Returns: Iterator over the contents of the file, as chunks.
    """
    req.cache_forever()
    path = _url_to_path(mat.group())
    logger.info('Sending pack index file %s', path)
    return send_file(req, get_repo(backend, mat).get_named_file(path),
                     'application/x-git-packed-objects-toc')


def get_info_refs(req, backend, mat):
    """Serve ``info/refs``, either as a smart advertisement or dumb text.

    When the query string names a service and the request is not in dumb
    mode, the matching handler class from ``req.handlers`` advertises refs;
    otherwise the output of ``generate_info_refs`` is sent as plain text.

    Args:
      req: The HTTPGitRequest object to send output to.
      backend: Backend used to open the repository.
      mat: Regex match for the request path.
    Returns: Iterator over response body chunks.
    """
    params = parse_qs(req.environ['QUERY_STRING'])
    service = params.get('service', [None])[0]
    try:
        repo = get_repo(backend, mat)
    except NotGitRepository as e:
        yield req.not_found(str(e))
        return
    if service and not req.dumb:
        handler_cls = req.handlers.get(service.encode('ascii'), None)
        if handler_cls is None:
            yield req.forbidden('Unsupported service')
            return
        req.nocache()
        write = req.respond(
            HTTP_OK, 'application/x-%s-advertisement' % service)
        # There is no request body to read here, so hand the protocol an
        # empty input stream; output goes through ``write``, not via yield.
        proto = ReceivableProtocol(BytesIO().read, write)
        handler = handler_cls(backend, [url_prefix(mat)], proto,
                              http_req=req, advertise_refs=True)
        handler.proto.write_pkt_line(
            b'# service=' + service.encode('ascii') + b'\n')
        handler.proto.write_pkt_line(None)
        handler.handle()
    else:
        # non-smart fallback
        # TODO: select_getanyfile() (see http-backend.c)
        req.nocache()
        req.respond(HTTP_OK, 'text/plain')
        logger.info('Emulating dumb info/refs')
        for text in generate_info_refs(repo):
            yield text


def get_info_packs(req, backend, mat):
    """Serve ``objects/info/packs`` as uncached plain text.

    Args:
      req: The HTTPGitRequest object to send output to.
      backend: Backend used to open the repository.
      mat: Regex match for the request path.
    Returns: The result of ``generate_objects_info_packs`` for the repository.
    """
    req.nocache()
    req.respond(HTTP_OK, 'text/plain')
    logger.info('Emulating dumb info/packs')
    return generate_objects_info_packs(get_repo(backend, mat))


class _LengthLimitedFile(object):
    """Wrapper class to limit the length of reads from a file-like object.

    This is used to ensure EOF is read from the wsgi.input object once
    Content-Length bytes are read. This behavior is required by the WSGI spec
    but not implemented in wsgiref as of 2.5.
    """

    def __init__(self, input, max_bytes):
        """Wrap ``input`` so that at most ``max_bytes`` bytes are requested.

        Args:
          input: File-like object providing ``read(size)``.
          max_bytes: Total number of bytes that may be requested from it.
        """
        self._input = input
        self._bytes_avail = max_bytes

    def read(self, size=-1):
        """Read up to ``size`` bytes without exceeding the byte budget.

        Args:
          size: Maximum number of bytes to read. ``None`` or any negative
            value means "everything that is still allowed".
        Returns: The data read from the wrapped object, or ``b''`` once the
            budget is exhausted.
        """
        if self._bytes_avail <= 0:
            return b''
        # Clamp "read all" requests (None / negative) and oversized requests
        # to the remaining budget so the wrapped stream is never over-read.
        if size is None or size < 0 or size > self._bytes_avail:
            size = self._bytes_avail
        self._bytes_avail -= size
        return self._input.read(size)

    # TODO: support more methods as necessary


def handle_service_request(req, backend, mat):
    """Handle a smart-protocol POST for the service named in the URL.

    Args:
      req: The HTTPGitRequest object to send output to.
      backend: Backend used to open the repository.
      mat: Regex match for the request path; the matched text (without its
        leading slash) is the service name.
    Returns: Iterator yielding an error message body if the service is
        unsupported or the repository is missing; otherwise nothing is
        yielded and the handler writes its output directly.
    """
    service = mat.group().lstrip('/')
    logger.info('Handling service request for %s', service)
    handler_cls = req.handlers.get(service.encode('ascii'), None)
    if handler_cls is None:
        yield req.forbidden('Unsupported service')
        return
    try:
        # Opened only to verify that the repository exists.
        get_repo(backend, mat)
    except NotGitRepository as e:
        yield req.not_found(str(e))
        return
    req.nocache()
    write = req.respond(HTTP_OK, 'application/x-%s-result' % service)
    proto = ReceivableProtocol(req.environ['wsgi.input'].read, write)
    # TODO(jelmer): Find a way to pass in repo, rather than having handler_cls
    # reopen.
    handler = handler_cls(backend, [url_prefix(mat)], proto, http_req=req)
    handler.handle()


class HTTPGitRequest(object):
    """Class encapsulating the state of a single git HTTP request.

    :ivar environ: the WSGI environment for the request.
    """

    def __init__(self, environ, start_response, dumb=False, handlers=None):
        """Create a request wrapper.

        Args:
          environ: The WSGI environment for the request.
          start_response: The WSGI ``start_response`` callable.
          dumb: Whether smart-protocol handling is disabled.
          handlers: Mapping of service name (bytes) to handler class.
        """
        self.environ = environ
        self.dumb = dumb
        self.handlers = handlers
        self._start_response = start_response
        self._cache_headers = []
        self._headers = []

    def add_header(self, name, value):
        """Add a header to the response."""
        self._headers.append((name, value))

    def respond(self, status=HTTP_OK, content_type=None, headers=None):
        """Begin a response with the given status and other headers.

        Returns: Whatever ``start_response`` returns.
        """
        if headers:
            self._headers.extend(headers)
        if content_type:
            self._headers.append(('Content-Type', content_type))
        self._headers.extend(self._cache_headers)

        return self._start_response(status, self._headers)

    def not_found(self, message):
        """Begin a HTTP 404 response and return the text of a message."""
        self._cache_headers = []
        logger.info('Not found: %s', message)
        self.respond(HTTP_NOT_FOUND, 'text/plain')
        return message.encode('ascii')

    def forbidden(self, message):
        """Begin a HTTP 403 response and return the text of a message."""
        self._cache_headers = []
        logger.info('Forbidden: %s', message)
        self.respond(HTTP_FORBIDDEN, 'text/plain')
        return message.encode('ascii')

    def error(self, message):
        """Begin a HTTP 500 response and return the text of a message."""
        self._cache_headers = []
        logger.error('Error: %s', message)
        self.respond(HTTP_ERROR, 'text/plain')
        return message.encode('ascii')

    def nocache(self):
        """Set the response to never be cached by the client."""
        self._cache_headers = [
            ('Expires', 'Fri, 01 Jan 1980 00:00:00 GMT'),
            ('Pragma', 'no-cache'),
            ('Cache-Control', 'no-cache, max-age=0, must-revalidate'),
        ]

    def cache_forever(self):
        """Set the response to be cached forever by the client."""
        now = time.time()
        # 31536000 seconds == 365 days.
        self._cache_headers = [
            ('Date', date_time_string(now)),
            ('Expires', date_time_string(now + 31536000)),
            ('Cache-Control', 'public, max-age=31536000'),
        ]


class HTTPGitApplication(object):
    """Class encapsulating the state of a git WSGI application.

    :ivar backend: the Backend object backing this application
    """

    # Maps (HTTP method, compiled path regex) to the function serving it.
    services = {
        ('GET', re.compile('/HEAD$')): get_text_file,
        ('GET', re.compile('/info/refs$')): get_info_refs,
        ('GET', re.compile('/objects/info/alternates$')): get_text_file,
        ('GET', re.compile('/objects/info/http-alternates$')): get_text_file,
        ('GET', re.compile('/objects/info/packs$')): get_info_packs,
        ('GET', re.compile('/objects/([0-9a-f]{2})/([0-9a-f]{38})$')):
        get_loose_object,
        ('GET', re.compile('/objects/pack/pack-([0-9a-f]{40})\\.pack$')):
        get_pack_file,
        ('GET', re.compile('/objects/pack/pack-([0-9a-f]{40})\\.idx$')):
        get_idx_file,

        ('POST', re.compile('/git-upload-pack$')): handle_service_request,
        ('POST', re.compile('/git-receive-pack$')): handle_service_request,
    }

    def __init__(self, backend, dumb=False, handlers=None, fallback_app=None):
        """Create the application.

        Args:
          backend: The Backend object used to open repositories.
          dumb: Whether to disable smart-protocol handling of info/refs.
          handlers: Optional mapping merged over ``DEFAULT_HANDLERS``.
          fallback_app: Optional WSGI app invoked when no service matches.
        """
        self.backend = backend
        self.dumb = dumb
        self.handlers = dict(DEFAULT_HANDLERS)
        self.fallback_app = fallback_app
        if handlers is not None:
            self.handlers.update(handlers)

    def __call__(self, environ, start_response):
        """Dispatch a WSGI request to the first matching service."""
        path = environ['PATH_INFO']
        method = environ['REQUEST_METHOD']
        req = HTTPGitRequest(environ, start_response, dumb=self.dumb,
                             handlers=self.handlers)
        # environ['QUERY_STRING'] has qs args
        handler = None
        mat = None
        for (smethod, spath), candidate in self.services.items():
            if smethod != method:
                continue
            mat = spath.search(path)
            if mat:
                handler = candidate
                break

        if handler is None:
            if self.fallback_app is not None:
                return self.fallback_app(environ, start_response)
            else:
                return [req.not_found('Sorry, that method is not supported')]

        return handler(req, self.backend, mat)


class GunzipFilter(object):
    """WSGI middleware that unzips gzip-encoded requests before
    passing on to the underlying application.
    """

    def __init__(self, application):
        """Wrap ``application``, the WSGI app that receives the request."""
        self.app = application

    def __call__(self, environ, start_response):
        """Replace a gzip-encoded ``wsgi.input`` with a decompressing one."""
        if environ.get('HTTP_CONTENT_ENCODING', '') == 'gzip':
            try:
                environ['wsgi.input'].tell()
                wsgi_input = environ['wsgi.input']
            except (AttributeError, IOError, NotImplementedError):
                # The gzip implementation in the standard library of Python 2.x
                # requires working '.seek()' and '.tell()' methods on the input
                # stream.  Read the data into a temporary file to work around
                # this limitation.
                wsgi_input = tempfile.SpooledTemporaryFile(16 * 1024 * 1024)
                shutil.copyfileobj(environ['wsgi.input'], wsgi_input)
                wsgi_input.seek(0)

            environ['wsgi.input'] = gzip.GzipFile(
                filename=None, fileobj=wsgi_input, mode='r')
            # The body is no longer encoded and its decoded length is not
            # known, so drop both headers before passing the request on.
            del environ['HTTP_CONTENT_ENCODING']
            if 'CONTENT_LENGTH' in environ:
                del environ['CONTENT_LENGTH']

        return self.app(environ, start_response)


class LimitedInputFilter(object):
    """WSGI middleware that limits the input length of a request to that
    specified in Content-Length.
    """

    def __init__(self, application):
        """Wrap ``application``, the WSGI app that receives the request."""
        self.app = application

    def __call__(self, environ, start_response):
        """Wrap ``wsgi.input`` in a length limit when Content-Length is set."""
        # This is not necessary if this app is run from a conforming WSGI
        # server. Unfortunately, there's no way to tell that at this point.
        # TODO: git may use HTTP/1.1 chunked encoding instead of specifying
        # content-length
        content_length = environ.get('CONTENT_LENGTH', '')
        if content_length:
            environ['wsgi.input'] = _LengthLimitedFile(
                environ['wsgi.input'], int(content_length))
        return self.app(environ, start_response)


def make_wsgi_chain(*args, **kwargs):
    """Factory function to create an instance of HTTPGitApplication,
    correctly wrapped with needed middleware.

    All arguments are passed through to ``HTTPGitApplication``.
    """
    app = HTTPGitApplication(*args, **kwargs)
    wrapped_app = LimitedInputFilter(GunzipFilter(app))
    return wrapped_app


class ServerHandlerLogger(ServerHandler):
    """ServerHandler that uses dulwich's logger for logging exceptions."""

    def log_exception(self, exc_info):
        """Log an exception raised while processing a request."""
        if sys.version_info < (2, 7):
            logger.exception('Exception happened during processing of request')
        else:
            logger.exception('Exception happened during processing of request',
                             exc_info=exc_info)

    def log_message(self, format, *args):
        """Log a message at INFO level."""
        logger.info(format, *args)

    def log_error(self, *args):
        """Log a message at ERROR level."""
        logger.error(*args)


class WSGIRequestHandlerLogger(WSGIRequestHandler):
    """WSGIRequestHandler that uses dulwich's logger for logging exceptions."""

    def log_exception(self, exc_info):
        """Log an exception raised while processing a request."""
        logger.exception('Exception happened during processing of request',
                         exc_info=exc_info)

    def log_message(self, format, *args):
        """Log a message at INFO level."""
        logger.info(format, *args)

    def log_error(self, *args):
        """Log a message at ERROR level."""
        logger.error(*args)

    def handle(self):
        """Handle a single HTTP request"""

        self.raw_requestline = self.rfile.readline()
        if not self.parse_request():  # An error code has been sent, just exit
            return

        handler = ServerHandlerLogger(
            self.rfile, self.wfile, self.get_stderr(), self.get_environ()
        )
        handler.request_handler = self      # backpointer for logging
        handler.run(self.server.get_app())


class WSGIServerLogger(WSGIServer):
    """WSGIServer that reports request errors through dulwich's logger."""

    def handle_error(self, request, client_address):
        """Handle an error. """
        logger.exception(
            'Exception happened during processing of request from %s',
            str(client_address))


def main(argv=sys.argv):
    """Entry point for starting an HTTP git server.

    Args:
      argv: Command-line arguments including the program name. An optional
        first positional argument names the repository directory; the
        current working directory is used otherwise.
    """
    import optparse
    parser = optparse.OptionParser()
    parser.add_option("-l", "--listen_address", dest="listen_address",
                      default="localhost",
                      help="Binding IP address.")
    parser.add_option("-p", "--port", dest="port", type=int,
                      default=8000,
                      help="Port to listen on.")
    options, args = parser.parse_args(argv)

    if len(args) > 1:
        gitdir = args[1]
    else:
        gitdir = os.getcwd()

    log_utils.default_logging_config()
    backend = DictBackend({'/': Repo(gitdir)})
    app = make_wsgi_chain(backend)
    server = make_server(options.listen_address, options.port, app,
                         handler_class=WSGIRequestHandlerLogger,
                         server_class=WSGIServerLogger)
    logger.info('Listening for HTTP connections on %s:%d',
                options.listen_address, options.port)
    server.serve_forever()


if __name__ == '__main__':
    main()