This file is indexed.

/usr/lib/python3/dist-packages/binaryornot/helpers.py is in python3-binaryornot 0.4.4+dfsg-2.

This file is owned by root:root, with mode 0o644.

The actual contents of the file can be viewed below.

  1
  2
  3
  4
  5
  6
  7
  8
  9
 10
 11
 12
 13
 14
 15
 16
 17
 18
 19
 20
 21
 22
 23
 24
 25
 26
 27
 28
 29
 30
 31
 32
 33
 34
 35
 36
 37
 38
 39
 40
 41
 42
 43
 44
 45
 46
 47
 48
 49
 50
 51
 52
 53
 54
 55
 56
 57
 58
 59
 60
 61
 62
 63
 64
 65
 66
 67
 68
 69
 70
 71
 72
 73
 74
 75
 76
 77
 78
 79
 80
 81
 82
 83
 84
 85
 86
 87
 88
 89
 90
 91
 92
 93
 94
 95
 96
 97
 98
 99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
# -*- coding: utf-8 -*-


"""
binaryornot.helpers
-------------------

Helper utilities used by BinaryOrNot.
"""

import chardet
import logging


logger = logging.getLogger(__name__)


def print_as_hex(s):
    """
    Print a string as colon-separated lowercase hex values.

    Both text strings and byte strings are accepted. Iterating ``bytes``
    on Python 3 yields integers rather than one-character strings, so an
    item is passed through ``ord()`` only when it is not already an int.

    :param s: String (or bytes) whose characters / byte values are printed.
    """
    codes = (c if isinstance(c, int) else ord(c) for c in s)
    print(":".join("{0:x}".format(code) for code in codes))


def get_starting_chunk(filename, length=1024):
    """
    Read the leading bytes of a file.

    :param filename: Path of the file to read from.
    :param length: Maximum number of bytes to read, default 1024.
    :returns: The first ``length`` bytes of the file (fewer if the file is
        shorter), or None when the file could not be opened or read; in
        that case the error is printed.
    """
    try:
        # Binary mode: the caller wants raw bytes, not decoded text.
        with open(filename, 'rb') as stream:
            return stream.read(length)
    except IOError as err:
        print(err)
    return None


# Whitespace / control characters that routinely appear in text files.
_control_chars = b'\n\r\t\f\b'
# bytes(bytearray(...)) builds a byte string from integers on both
# Python 2 and Python 3, so no version-specific branch is needed.
# Text-like bytes: the control characters above plus ASCII 32..126.
_printable_ascii = _control_chars + bytes(bytearray(range(32, 127)))
# Every byte value from 127 up to and including 255.
_printable_high_ascii = bytes(bytearray(range(127, 256)))


def is_binary_string(bytes_to_check):
    """
    Uses a simplified version of the Perl detection algorithm,
    based roughly on Eli Bendersky's translation to Python:
    http://eli.thegreenplace.net/2011/10/19/perls-guess-if-file-is-text-or-binary-implemented-in-python/

    This is biased slightly more in favour of deeming files as text
    files than the Perl algorithm, since all ASCII compatible character
    sets are accepted as text, not just utf-8.

    Decision order:

    1. Empty input is text.
    2. If chardet reports a non-ASCII encoding with confidence above 0.9
       and the chunk decodes cleanly with it, the chunk is text.
    3. Otherwise the chunk is binary when the byte-ratio heuristic says
       so, or when it contains a NUL (0x00) or 0xff byte.

    :param bytes_to_check: A chunk of bytes to check.
    :returns: True if appears to be a binary, otherwise False.
    """

    # Empty files are considered text files
    if not bytes_to_check:
        return False

    total = float(len(bytes_to_check))

    # translate(None, table) deletes every byte listed in ``table``, so
    # what remains are the bytes that are NOT printable ASCII / common
    # control characters. nontext_ratio1 is their share of the chunk.
    not_printable = bytes_to_check.translate(None, _printable_ascii)
    nontext_ratio1 = len(not_printable) / total
    logger.debug('nontext_ratio1: %r', nontext_ratio1)

    # Likewise, nontext_ratio2 is the share of bytes that are NOT in the
    # high range 127..255 (i.e. the share of bytes below 127).
    # Background, from https://en.wikipedia.org/wiki/UTF-8 :
    # If the bytes are random, the chances of a byte with the high bit set
    # starting a valid UTF-8 character is only 6.64%. The chances of finding 7
    # of these without finding an invalid sequence is actually lower than the
    # chance of the first three bytes randomly being the UTF-8 BOM.
    not_high = bytes_to_check.translate(None, _printable_high_ascii)
    nontext_ratio2 = len(not_high) / total
    logger.debug('nontext_ratio2: %r', nontext_ratio2)

    is_likely_binary = (
        (nontext_ratio1 > 0.3 and nontext_ratio2 < 0.05) or
        (nontext_ratio1 > 0.8 and nontext_ratio2 > 0.8)
    )
    logger.debug('is_likely_binary: %r', is_likely_binary)

    # Ask chardet for an encoding guess; the result is used as a mapping
    # with 'encoding' and 'confidence' entries.
    detected_encoding = chardet.detect(bytes_to_check)
    logger.debug('detected_encoding: %r', detected_encoding)

    # A confident, non-ASCII guess that really decodes overrides the
    # heuristics above and makes the chunk text.
    decodable_as_unicode = False
    if (detected_encoding['confidence'] > 0.9 and
            detected_encoding['encoding'] != 'ascii'):
        try:
            # Positional argument: accepted by every Python version, so no
            # version-specific fallback is needed.
            bytes_to_check.decode(detected_encoding['encoding'])
            decodable_as_unicode = True
            logger.debug('success: decodable_as_unicode: %r',
                         decodable_as_unicode)
        except LookupError:
            # chardet named an encoding Python has no codec for
            logger.debug('failure: could not look up encoding %s',
                         detected_encoding['encoding'])
        except UnicodeDecodeError:
            logger.debug('failure: decodable_as_unicode: %r',
                         decodable_as_unicode)

    # Neutral summary: this line is reached on success as well as failure.
    logger.debug('decodable_as_unicode: %r', decodable_as_unicode)

    if decodable_as_unicode:
        return False
    if is_likely_binary:
        return True
    # Check for NULL (and 0xff) bytes last
    if b'\x00' in bytes_to_check or b'\xff' in bytes_to_check:
        logger.debug('has nulls:%r', b'\x00' in bytes_to_check)
        return True
    return False