1"""Use heuristics to guess if it is a text file or a binary file."""
2
3from __future__ import unicode_literals
4
5from dvc.utils.compat import is_py3, open
6
7# Based on https://eli.thegreenplace.net/2011/10/19/
8# perls-guess-if-file-is-text-or-binary-implemented-in-python
9
10
def _int2byte(i):
    """Convert an integer in the 8-bit range to a one-character byte string.

    The result is a length-one ``bytes`` object on Python 3 and a
    single-character ``str`` on Python 2.
    """
    # On Python 3 ``chr`` produces text rather than bytes, so the byte is
    # built from a one-element tuple there instead.
    return bytes((i,)) if is_py3 else chr(i)
19
20
# Byte values counted as text: printable ASCII (32 through 126) plus
# newline, carriage return, tab, form feed and backspace.
TEXT_CHARS = b"".join(map(_int2byte, range(32, 127))) + b"\n\r\t\f\b"
22
23
def istextfile(fname, blocksize=512):
    """Guess whether the file at ``fname`` is text rather than binary.

    A single block is read from the start of the file with
    ``read(blocksize)`` and classified as follows:

    * an empty block (empty file) is treated as text;
    * a block containing a NUL byte is treated as binary;
    * otherwise the file is text when at most 30% of the bytes in the
      block are outside TEXT_CHARS.

    Returns True for text, False for binary.
    """
    with open(fname, "rb") as stream:
        head = stream.read(blocksize)

    total = len(head)
    if total == 0:
        # Nothing to inspect: an empty file counts as valid text.
        return True

    if b"\x00" in head:
        # Any NUL byte marks the file as binary.
        return False

    # Passing TEXT_CHARS as translate()'s delete argument strips every text
    # byte in one pass, leaving only the non-text bytes to count.
    stripped = head.translate(None, TEXT_CHARS)
    return float(len(stripped)) / total <= 0.30
45