# Copyright (C) 2006-2011 Canonical Ltd
#
# This program is free software; you can redistribute it and/or modify
# it under the terms of the GNU General Public License as published by
# the Free Software Foundation; either version 2 of the License, or
# (at your option) any later version.
#
# This program is distributed in the hope that it will be useful,
# but WITHOUT ANY WARRANTY; without even the implied warranty of
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
# GNU General Public License for more details.
#
# You should have received a copy of the GNU General Public License
# along with this program; if not, write to the Free Software
# Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA

"""Handlers for HTTP Responses.

The purpose of these classes is to provide a uniform interface for clients
to standard HTTP responses, single range responses and multipart range
responses.
"""

import cgi
from io import BytesIO
import os
import http.client as http_client
import email.utils as email_utils

from ... import (
    errors,
    osutils,
    )


class ResponseFile(object):
    """A wrapper around the http socket containing the result of a GET request.

    Only read() and seek() (forward) are supported.

    """

    def __init__(self, path, infile):
        """Constructor.

        :param path: File url, for error reports.

        :param infile: File-like socket set at body start.
        """
        self._path = path
        self._file = infile
        # Number of bytes consumed so far; this is what tell() reports.
        self._pos = 0

    def close(self):
        """Close this file.

        Dummy implementation for consistency with the 'file' API.
        """

    def __enter__(self):
        """Return self so the object can be used in a ``with`` statement."""
        return self

    def __exit__(self, exc_type, exc_val, exc_tb):
        """Leave the ``with`` block without suppressing any exception."""
        return False  # propagate exceptions.

    def read(self, size=None):
        """Read size bytes from the current position in the file.

        :param size: The number of bytes to read.  Leave unspecified or pass
            -1 to read to EOF.
        :return: The bytes returned by the underlying file.
        """
        data = self._file.read(size)
        self._pos += len(data)
        return data

    def readline(self):
        """Read one line from the underlying file and advance the position."""
        data = self._file.readline()
        self._pos += len(data)
        return data

    def readlines(self, size=None):
        """Read lines from the underlying file and advance the position.

        :param size: Optional size hint forwarded to the underlying file's
            readlines(). Leave unspecified (None) to read all the remaining
            lines.
        :return: The list of lines returned by the underlying file.
        """
        if size is None:
            data = self._file.readlines()
        else:
            # Honour the caller's hint instead of silently ignoring it.
            data = self._file.readlines(size)
        self._pos += sum(map(len, data))
        return data

    def __iter__(self):
        """Iterate over the lines, stopping at the first empty read."""
        while True:
            line = self.readline()
            if not line:
                return
            yield line

    def tell(self):
        """Return the number of bytes consumed so far."""
        return self._pos

    def seek(self, offset, whence=os.SEEK_SET):
        """Seek forward by reading and discarding the skipped bytes.

        :param offset: Target position (os.SEEK_SET) or number of bytes to
            skip (os.SEEK_CUR).
        :param whence: os.SEEK_SET or os.SEEK_CUR; any other value is
            rejected.
        :raises AssertionError: If the seek would go backwards or if whence
            is not supported.
        """
        if whence == os.SEEK_SET:
            if offset < self._pos:
                raise AssertionError(
                    "Can't seek backwards, pos: %s, offset: %s"
                    % (self._pos, offset))
            to_discard = offset - self._pos
        elif whence == os.SEEK_CUR:
            if offset < 0:
                # A negative size handed to read() would mean "read to EOF"
                # and silently consume the whole stream, so refuse it the
                # same way a backwards absolute seek is refused.
                raise AssertionError(
                    "Can't seek backwards, pos: %s, offset: %s"
                    % (self._pos, self._pos + offset))
            to_discard = offset
        else:
            raise AssertionError("Can't seek backwards")
        if to_discard:
            # Just discard the unwanted bytes
            self.read(to_discard)

# A RangeFile expects the following grammar (simplified to outline the
# assumptions we rely upon).

# file: single_range
#     | multiple_range

# single_range: content_range_header data

# multiple_range: boundary_header boundary (content_range_header data boundary)+


class RangeFile(ResponseFile):
    """File-like object that allow access to partial available data.

    All accesses should happen sequentially since the acquisition occurs during
    an http response reception (as sockets can't be seeked, we simulate the
    seek by just reading and discarding the data).

    The access pattern is defined by a set of ranges discovered as reading
    progress. Only one range is available at a given time, so all accesses
    should happen with monotonically increasing offsets.
    """

    # in _checked_read() below, we may have to discard several MB in the worst
    # case. To avoid buffering that much, we read and discard by chunks
    # instead. The underlying file is either a socket or a BytesIO, so reading
    # 8k chunks should be fine.
    _discarded_buf_size = 8192

    # maximum size of read requests -- used to avoid MemoryError issues in recv
    _max_read_size = 512 * 1024

    def __init__(self, path, infile):
        """Constructor.

        :param path: File url, for error reports.

        :param infile: File-like socket set at body start.
        """
        super(RangeFile, self).__init__(path, infile)
        self._boundary = None
        # When using multi parts response, this will be set with the headers
        # associated with the range currently read.
        self._headers = None
        # Default to the whole file of unspecified size
        self.set_range(0, -1)

    def set_range(self, start, size):
        """Change the range mapping.

        :param start: Offset of the first byte of the range.
        :param size: Number of bytes in the range; a non-positive value is
            treated by read() and seek() as an unknown size.
        """
        self._start = start
        self._size = size
        # Set the new _pos since that's what we want to expose
        self._pos = self._start

    def set_boundary(self, boundary):
        """Define the boundary used in a multi parts message.

        The file should be at the beginning of the body, the first range
        definition is read and taken into account.

        :param boundary: The boundary, as bytes.
        :raises TypeError: If boundary is not a bytes object.
        """
        if not isinstance(boundary, bytes):
            raise TypeError(boundary)
        self._boundary = boundary
        # Decode the headers and setup the first range
        self.read_boundary()
        self.read_range_definition()

    def read_boundary(self):
        """Read the boundary headers defining a new range.

        :raises errors.HttpBoundaryMissing: If the response ends before a
            boundary line is found.
        :raises errors.InvalidHttpResponse: If the line read is not the
            expected boundary line.
        """
        boundary_line = b'\r\n'
        while boundary_line == b'\r\n':
            # RFC2616 19.2 Additional CRLFs may precede the first boundary
            # string entity.
            # To be on the safe side we allow it before any boundary line
            boundary_line = self._file.readline()

        if boundary_line == b'':
            # A timeout in the proxy server caused the response to end early.
            # See launchpad bug 198646.
            raise errors.HttpBoundaryMissing(
                self._path,
                self._boundary)

        expected = b'--' + self._boundary + b'\r\n'
        if boundary_line != expected:
            # email_utils.unquote() incorrectly unquotes strings enclosed in <>
            # IIS 6 and 7 incorrectly wrap boundary strings in <>
            # together they make a beautiful bug, which we will be gracious
            # about here
            try:
                unquoted = self._unquote_boundary(boundary_line)
            except UnicodeError:
                # A non-ASCII line can't be a boundary; report it as an
                # invalid response rather than leaking a decoding error.
                unquoted = None
            if unquoted != expected:
                raise errors.InvalidHttpResponse(
                    self._path,
                    "Expected a boundary (%s) line, got '%s'"
                    % (self._boundary, boundary_line))

    def _unquote_boundary(self, b):
        """Return the boundary line ``b`` with its middle part unquoted.

        The first two and last two bytes are kept as is; the bytes in between
        are passed through email_utils.unquote().
        """
        unquoted = email_utils.unquote(b[2:-2].decode('ascii'))
        return b[:2] + unquoted.encode('ascii') + b[-2:]

    def read_range_definition(self):
        """Read a new range definition in a multi parts message.

        Parse the headers including the empty line following them so that we
        are ready to read the data itself.

        :raises errors.InvalidHttpResponse: If the part has no Content-Range
            header.
        """
        self._headers = http_client.parse_headers(self._file)
        # Extract the range definition
        content_range = self._headers.get('content-range', None)
        if content_range is None:
            raise errors.InvalidHttpResponse(
                self._path,
                'Content-Range header missing in a multi-part response')
        self.set_range_from_header(content_range)

    def set_range_from_header(self, content_range):
        """Helper to set the new range from its description in the headers.

        :param content_range: Content-Range header value, expected to look
            like ``bytes <start>-<end>/<total>``.
        :raises errors.InvalidHttpRange: If the value can't be parsed, uses a
            range type other than 'bytes' or describes an empty range.
        """
        try:
            rtype, values = content_range.split()
        except ValueError:
            raise errors.InvalidHttpRange(self._path, content_range,
                                          'Malformed header')
        if rtype != 'bytes':
            raise errors.InvalidHttpRange(self._path, content_range,
                                          "Unsupported range type '%s'" % rtype)
        try:
            # We don't need total, but note that it may be either the file size
            # or '*' if the server can't or doesn't want to return the file
            # size.
            start_end, total = values.split('/')
            start, end = start_end.split('-')
            start = int(start)
            end = int(end)
        except ValueError:
            raise errors.InvalidHttpRange(self._path, content_range,
                                          'Invalid range values')
        # Both ends of the range are inclusive.
        size = end - start + 1
        if size <= 0:
            raise errors.InvalidHttpRange(self._path, content_range,
                                          'Invalid range, size <= 0')
        self.set_range(start, size)

    def _checked_read(self, size):
        """Read the file checking for short reads.

        The data read is discarded along the way.

        :param size: Number of bytes to read and discard.
        :raises errors.ShortReadvError: If the underlying file runs out of
            data before size bytes have been read.
        """
        pos = self._pos
        remaining = size
        while remaining > 0:
            data = self._file.read(min(remaining, self._discarded_buf_size))
            remaining -= len(data)
            if not data:
                raise errors.ShortReadvError(self._path, pos, size,
                                             size - remaining)
        self._pos += size

    def _seek_to_next_range(self):
        """Move to the next range of a multi parts response.

        :raises errors.InvalidRange: If no boundary is defined, in which case
            there is no other range to move to.
        """
        # We will cross range boundaries
        if self._boundary is None:
            # If we don't have a boundary, we can't find another range
            raise errors.InvalidRange(self._path, self._pos,
                                      "Range (%s, %s) exhausted"
                                      % (self._start, self._size))
        self.read_boundary()
        self.read_range_definition()

    def read(self, size=-1):
        """Read size bytes from the current position in the file.

        Reading across ranges is not supported. We rely on the underlying http
        client to clean the socket if we leave bytes unread. This may occur for
        the final boundary line of a multipart response or for any range
        request not entirely consumed by the client (due to offset coalescing)

        :param size: The number of bytes to read.  Leave unspecified or pass
            -1 (or None, as accepted by ResponseFile.read) to read to EOF.
        :return: The bytes read.
        :raises errors.InvalidRange: If the read starts before the current
            range or would cross its end.
        """
        if size is None:
            # Accept the same "read everything" spelling as the base class
            # instead of failing on the integer comparisons below.
            size = -1
        if (self._size > 0
                and self._pos == self._start + self._size):
            if size == 0:
                return b''
            else:
                self._seek_to_next_range()
        elif self._pos < self._start:
            raise errors.InvalidRange(
                self._path, self._pos,
                "Can't read %s bytes before range (%s, %s)"
                % (size, self._start, self._size))
        if self._size > 0:
            if size > 0 and self._pos + size > self._start + self._size:
                raise errors.InvalidRange(
                    self._path, self._pos,
                    "Can't read %s bytes across range (%s, %s)"
                    % (size, self._start, self._size))

        # read data from file
        buf = BytesIO()
        limited = size
        if self._size > 0:
            # Don't read past the range definition
            limited = self._start + self._size - self._pos
            if size >= 0:
                limited = min(limited, size)
        osutils.pumpfile(self._file, buf, limited, self._max_read_size)
        data = buf.getvalue()

        # Update _pos respecting the data effectively read
        self._pos += len(data)
        return data

    def seek(self, offset, whence=os.SEEK_SET):
        """Seek forward, crossing range boundaries if needed.

        The bytes skipped are read and discarded.

        :param offset: Position, interpreted according to whence.
        :param whence: os.SEEK_SET (0), os.SEEK_CUR (1) or os.SEEK_END (2);
            seeking from the end requires the range size to be known.
        :raises ValueError: If whence is not one of the values above.
        :raises errors.InvalidRange: If the seek goes backwards, or is
            relative to the end while the size is unknown.
        """
        start_pos = self._pos
        if whence == os.SEEK_SET:
            final_pos = offset
        elif whence == os.SEEK_CUR:
            final_pos = start_pos + offset
        elif whence == os.SEEK_END:
            if self._size > 0:
                final_pos = self._start + self._size + offset  # offset < 0
            else:
                raise errors.InvalidRange(
                    self._path, self._pos,
                    "RangeFile: can't seek from end while size is unknown")
        else:
            raise ValueError("Invalid value %s for whence." % whence)

        if final_pos < self._pos:
            # Can't seek backwards
            raise errors.InvalidRange(
                self._path, self._pos,
                'RangeFile: trying to seek backwards to %s' % final_pos)

        if self._size > 0:
            cur_limit = self._start + self._size
            while final_pos > cur_limit:
                # We will cross range boundaries
                remain = cur_limit - self._pos
                if remain > 0:
                    # Finish reading the current range
                    self._checked_read(remain)
                self._seek_to_next_range()
                cur_limit = self._start + self._size

        size = final_pos - self._pos
        if size > 0:  # size can be < 0 if we crossed a range boundary
            # We don't need the data, just read it and throw it away
            self._checked_read(size)

    def tell(self):
        """Return the current position."""
        return self._pos


def handle_response(url, code, getheader, data):
    """Interpret the code & headers and wrap the provided data in a RangeFile.

    This is a factory method which returns an appropriate RangeFile based on
    the code & headers it's given.

    :param url: The url being processed. Mostly for error reporting
    :param code: The integer HTTP response code
    :param getheader: Function for retrieving header
    :param data: A file-like object that can be read() to get the
                 requested data
    :return: A file-like object that can seek()+read() the
             ranges indicated by the headers.
    :raises errors.InvalidHttpResponse: If a 206 response lacks the
        Content-Range header or the multipart boundary parameter.
    :raises errors.UnexpectedHttpStatus: If code is neither 200 nor 206.
    """
    if code == 200:
        # A whole file
        rfile = ResponseFile(url, data)
    elif code == 206:
        rfile = RangeFile(url, data)
        # When there is no content-type header we treat the response as
        # being of type 'application/octet-stream' as per RFC2616 section
        # 7.2.1.
        # Therefore it is obviously not multipart
        content_type = getheader('content-type', 'application/octet-stream')
        # NOTE(review): the cgi module is deprecated since Python 3.11 and
        # removed in 3.13 (PEP 594); parse_header will need a replacement.
        mimetype, options = cgi.parse_header(content_type)
        if mimetype == 'multipart/byteranges':
            boundary = options.get('boundary')
            if boundary is None:
                # Report a malformed response instead of a bare KeyError.
                raise errors.InvalidHttpResponse(
                    url,
                    'Missing the boundary parameter in a multipart/byteranges'
                    ' response')
            rfile.set_boundary(boundary.encode('ascii'))
        else:
            # A response to a range request, but not multipart
            content_range = getheader('content-range', None)
            if content_range is None:
                raise errors.InvalidHttpResponse(
                    url, 'Missing the Content-Range header in a 206 range response')
            rfile.set_range_from_header(content_range)
    else:
        raise errors.UnexpectedHttpStatus(url, code)

    return rfile