"""Consts and functions to handle the target format.

ALL_SUPPORTED_FORMATS - list of supported formats
get_decompress_function - returns stream decompress function for a current
    format (specified or autodetected)
get_compress_function - returns compress function for a current format
    (specified or default)
"""
from __future__ import absolute_import

from .snappy import (
    stream_compress, stream_decompress, check_format, UncompressError)
from .hadoop_snappy import (
    stream_compress as hadoop_stream_compress,
    stream_decompress as hadoop_stream_decompress,
    check_format as hadoop_check_format)


FRAMING_FORMAT = 'framing'

HADOOP_FORMAT = 'hadoop_snappy'

# Means format auto detection.
# For compression the framing format will be used.
# For decompression we will try to detect the format from the input stream
# header.
FORMAT_AUTO = 'auto'

DEFAULT_FORMAT = FORMAT_AUTO

ALL_SUPPORTED_FORMATS = [FRAMING_FORMAT, HADOOP_FORMAT, FORMAT_AUTO]

# Explicit format name -> stream compress function.
_COMPRESS_METHODS = {
    FRAMING_FORMAT: stream_compress,
    HADOOP_FORMAT: hadoop_stream_compress,
}

# Explicit format name -> stream decompress function.
_DECOMPRESS_METHODS = {
    FRAMING_FORMAT: stream_decompress,
    HADOOP_FORMAT: hadoop_stream_decompress,
}

# We will use the framing format as the default for compression.
# And for decompression, if it's not defined explicitly, we will try to
# guess the format from the file header.
_DEFAULT_COMPRESS_FORMAT = FRAMING_FORMAT

# The tuple contains an ordered sequence of pairs: a format checking function
# and a format-specific decompression function.
# The framing format has its own header, which may be recognized.
# The hadoop snappy format hasn't any special headers, it contains only
# the uncompressed block length integer and the length of a compressed
# subblock.
# So we first check for the framing format and, if it is not the case, then
# check for the hadoop snappy format.
# Order matters: guess_format_by_header tries these pairs first to last.
_DECOMPRESS_FORMAT_FUNCS = (
    (check_format, stream_decompress),
    (hadoop_check_format, hadoop_stream_decompress),
)


def guess_format_by_header(fin):
    """Try to guess the compression format of the given input file by its
    header.

    Every checker is called as ``check_method(fin=fin, chunk=chunk)`` and
    returns an ``(ok, chunk)`` pair. The chunk returned by one checker is
    handed to the next one (presumably so that data already read from
    ``fin`` is not lost between attempts -- confirm against the checkers).

    :param fin: input file object passed through to the format checkers.
    :return: tuple of the decompression function and the chunk that was
        taken from the input for format detection.
    :raises UncompressError: if none of the known formats matches.
    """
    chunk = None
    for check_method, decompress_func in _DECOMPRESS_FORMAT_FUNCS:
        ok, chunk = check_method(fin=fin, chunk=chunk)
        if ok:
            return decompress_func, chunk
    raise UncompressError("Can't detect archive format")


def get_decompress_function(specified_format, fin):
    """Return the stream decompress function for the requested format.

    :param specified_format: FORMAT_AUTO to detect the format from ``fin``,
        or one of the keys of ``_DECOMPRESS_METHODS``.
    :param fin: input file object; only used when the format is
        autodetected.
    :return: tuple ``(decompress_func, read_chunk)``. ``read_chunk`` is the
        chunk taken from ``fin`` during autodetection, or ``None`` when the
        format was specified explicitly and nothing was read.
    :raises UncompressError: if autodetection fails.
    :raises KeyError: if ``specified_format`` is not a known format.
    """
    if specified_format == FORMAT_AUTO:
        return guess_format_by_header(fin)
    return _DECOMPRESS_METHODS[specified_format], None


def get_compress_function(specified_format):
    """Return the stream compress function for the requested format.

    :param specified_format: FORMAT_AUTO to use the default compression
        format, or one of the keys of ``_COMPRESS_METHODS``.
    :return: the stream compress function.
    :raises KeyError: if ``specified_format`` is not a known format.
    """
    if specified_format == FORMAT_AUTO:
        # There is nothing to detect when compressing, so fall back to the
        # default compression format.
        specified_format = _DEFAULT_COMPRESS_FORMAT
    return _COMPRESS_METHODS[specified_format]