"""Use heuristics to guess if it is a text file or a binary file."""

from __future__ import unicode_literals

from dvc.utils.compat import is_py3, open

# Based on https://eli.thegreenplace.net/2011/10/19/
# perls-guess-if-file-is-text-or-binary-implemented-in-python


def _int2byte(i):
    """Convert an integer in the 8-bit range to a one-element byte value.

    On py3 the result is a single-character ``bytes`` object; on py2 it is
    a single-character string.
    """
    return bytes((i,)) if is_py3 else chr(i)


# Byte values that count as "text": ASCII codes 32..126 plus newline,
# carriage return, tab, form feed and backspace.
TEXT_CHARS = b"".join(map(_int2byte, range(32, 127))) + b"\n\r\t\f\b"


def istextfile(fname, blocksize=512):
    """Guess whether the file at ``fname`` is text or binary.

    Only the first ``blocksize`` bytes of the file are read and inspected.

    The file is reported as text (``True``) when that sample is empty, or
    when it contains no NUL (zero-valued) byte and at most 30% of its bytes
    fall outside ``TEXT_CHARS``. Otherwise it is reported as binary
    (``False``).
    """
    with open(fname, "rb") as stream:
        sample = stream.read(blocksize)

    # An empty file is considered a valid text file.
    if not sample:
        return True

    # Any null byte marks the file as binary.
    if b"\x00" in sample:
        return False

    # translate() with a 'deletechars' argument drops every TEXT_CHARS byte
    # in one pass; whatever is left over is the non-text content.
    leftover = sample.translate(None, TEXT_CHARS)
    non_text_ratio = float(len(leftover)) / len(sample)
    return non_text_ratio <= 0.30