/usr/lib/python2.7/dist-packages/astroML/resample.py is in python-astroml 0.3-3.
This file is owned by root:root, with mode 0o644.
The actual contents of the file can be viewed below.
import numpy as np
from astroML.utils import check_random_state
def bootstrap(data, n_bootstraps, user_statistic, kwargs=None,
              pass_indices=False, random_state=None):
    """Evaluate a statistic on bootstrap resamples of a 1-D dataset.

    Parameters
    ----------
    data : array_like
        One-dimensional input of length n_samples.
    n_bootstraps : integer
        Number of bootstrap resamples to draw.  An integer index array of
        shape (n_bootstraps, n_samples) is always built and, unless
        `pass_indices` is True, a resampled data array of the same shape
        as well; large values can therefore exhaust memory.
    user_statistic : function
        Called exactly once with a (n_bootstraps, n_samples) array (and
        any `kwargs`).  It is expected to return the row-wise statistic.
    kwargs : dictionary (optional)
        Extra keyword arguments forwarded to `user_statistic`.
    pass_indices : boolean (optional)
        If True, `user_statistic` receives the resampling indices instead
        of the resampled data values.
    random_state : RandomState, int seed, or None (optional)
        Forwarded to `check_random_state` to obtain the generator used for
        drawing indices.  The default is None.

    Returns
    -------
    distribution : ndarray
        Whatever `user_statistic` returns; for a row-wise statistic this
        is the bootstrapped distribution (length = n_bootstraps).

    Raises
    ------
    ValueError
        If `data` is not 1-dimensional.
    """
    # kwargs defaults to None rather than {} so that no mutable default
    # object is shared between calls.
    options = {} if kwargs is None else kwargs
    generator = check_random_state(random_state)

    values = np.asarray(data)
    if values.ndim != 1:
        raise ValueError("bootstrap expects 1-dimensional data")
    n_samples = values.size

    # Sampling with replacement: each row holds n_samples indices drawn
    # from [0, n_samples).
    draws = generator.randint(n_samples, size=(n_bootstraps, n_samples))

    # Hand over either the raw indices or the data gathered through them.
    resampled = draws if pass_indices else values[draws]
    return user_statistic(resampled, **options)
def jackknife(data, user_statistic, kwargs=None,
              return_raw_distribution=False, pass_indices=False):
    """Compute first-order jackknife statistics of the data.

    Parameters
    ----------
    data : array_like
        A 1-dimensional data array of size n_samples
    user_statistic : function
        The statistic to be computed.  It is called twice: once with an
        array of shape (1, n_samples) holding the full dataset, and once
        with an array of shape (n_samples, n_samples - 1) holding the
        leave-one-out subsets.  It must return the row-wise statistic of
        its input, either as a single array or as a tuple of arrays (one
        per statistic).
    kwargs : dictionary (optional)
        A dictionary of keyword arguments to be passed to the
        user_statistic function.
    return_raw_distribution : boolean (optional)
        if True, also return the raw jackknife distribution.  Be aware
        that this distribution is not reflective of the true distribution:
        it is simply an intermediate step in the jackknife calculation
    pass_indices : boolean (optional)
        if True, then the indices of the points rather than the points
        themselves are passed to `user_statistic`

    Returns
    -------
    mean, stdev : floats
        The bias-corrected estimate and the spread computed from the
        leave-one-out distribution.  When `user_statistic` returns a
        tuple of arrays, a tuple of such (mean, stdev[, raw]) tuples is
        returned instead, one entry per statistic.
    raw_distribution : ndarray
        Returned only if `return_raw_distribution` is True
        The array containing the raw distribution (length n_samples)

    Raises
    ------
    ValueError
        If `data` is not 1-dimensional, if `data` is empty, or if
        `user_statistic` returns a scalar instead of row-wise values.

    Notes
    -----
    This implementation is a leave-one-out jackknife.

    Jackknife resampling is known to fail on rank-based statistics
    (e.g. median, quartiles, etc.)  It works well on smooth statistics
    (e.g. mean, standard deviation, etc.)
    """
    # we don't set kwargs={} by default in the argument list, because using
    # a mutable type as a default argument can lead to strange results
    if kwargs is None:
        kwargs = {}

    data = np.asarray(data)
    if data.ndim != 1:
        raise ValueError("jackknife expects 1-dimensional data")

    n_samples = data.size
    if n_samples == 0:
        # fail early with a clear message instead of an unrelated error
        # from deep inside the index construction
        raise ValueError("jackknife expects non-empty data")

    # indices of the entire dataset as a single row: shape (1, n_samples)
    ind0 = np.arange(n_samples)[np.newaxis, :]

    # leave-one-out indices, shape (n_samples, n_samples - 1): in row i,
    # columns j < i keep index j while columns j >= i are shifted up by one,
    # so index i is the only one skipped
    cols = np.arange(n_samples - 1)
    ind = cols + (cols >= ind0.T)

    # compute the statistic for the whole dataset and for every subset
    if pass_indices:
        stat_data = user_statistic(ind0, **kwargs)
        stat_jackknife = user_statistic(ind, **kwargs)
    else:
        stat_data = user_statistic(data[ind0], **kwargs)
        stat_jackknife = user_statistic(data[ind], **kwargs)

    # handle multiple statistics:
    #  if ndim=0, then the statistic is not operating on rows (error).
    #  if ndim=1, then it's a single statistic returned
    #  if ndim=2, then a tuple has been returned
    stat_data = np.asarray(stat_data)
    ndim = stat_data.ndim
    if ndim == 0:
        raise ValueError("user_statistic should return row-wise statistics")

    # after these two lines: stat_data is (1, n_stats) and
    # stat_jackknife is (n_stats, n_samples)
    stat_data = np.atleast_2d(stat_data).T
    stat_jackknife = np.atleast_2d(stat_jackknife)

    # compute the jackknife correction formula
    delta_stat = (n_samples - 1) * (stat_data - stat_jackknife.mean(1))
    stat_corrected = (stat_data + delta_stat)[0]

    # NOTE(review): the normalisation here is 1 / (n (n + 1)); the usual
    # textbook jackknife variance uses 1 / (n (n - 1)).  Kept unchanged so
    # that numerical results stay the same -- confirm the intent.
    sigma_stat = np.sqrt(1. / n_samples / (n_samples + 1)
                         * np.sum((n_samples * stat_data - stat_corrected
                                   - (n_samples - 1)
                                   * stat_jackknife.T) ** 2, 0))

    # one (mean, stdev[, raw]) tuple per statistic
    if return_raw_distribution:
        results = tuple(zip(stat_corrected, sigma_stat, stat_jackknife))
    else:
        results = tuple(zip(stat_corrected, sigma_stat))

    # a single statistic is returned unwrapped
    if ndim == 1:
        return results[0]
    else:
        return results
|