/usr/share/pyshared/mvpa2/testing/datasets.py is in python-mvpa2 2.1.0-1.
This file is owned by root:root, with mode 0o644.
The actual contents of the file can be viewed below.
# emacs: -*- mode: python; py-indent-offset: 4; indent-tabs-mode: nil -*-
# vi: set ft=python sts=4 ts=4 sw=4 et:
### ### ### ### ### ### ### ### ### ### ### ### ### ### ### ### ### ### ### ##
#
# See COPYING file distributed along with the PyMVPA package for the
# copyright and license terms.
#
### ### ### ### ### ### ### ### ### ### ### ### ### ### ### ### ### ### ### ##
"""Provides convenience datasets for unittesting.
Also performs testing of storing/reloading datasets into hdf5 file if
cfg.getboolean('tests', 'use hdf datasets'
"""
__docformat__ = 'restructuredtext'
import tempfile
import shutil
import os
import traceback as tbm
import sys
import numpy as np
from mvpa2 import cfg, externals
from mvpa2.datasets.base import Dataset, HollowSamples
from mvpa2.generators.partition import OddEvenPartitioner
from mvpa2.misc.data_generators import *
from mvpa2.testing.tools import reseed_rng
__all__ = ['datasets', 'get_random_rotation', 'saveload_warehouse',
           'pure_multivariate_signal']
# Define datasets to be used all over. Split-half later on is used to
# split into training/testing
#
snr_scale = cfg.get_as_dtype('tests', 'snr scale', float, default=1.0)
specs = {'large':  {'perlabel': 99, 'nchunks': 11,
                    'nfeatures': 20, 'snr': 8 * snr_scale},
         'medium': {'perlabel': 24, 'nchunks': 6,
                    'nfeatures': 14, 'snr': 8 * snr_scale},
         'small':  {'perlabel': 12, 'nchunks': 4,
                    'nfeatures': 6, 'snr': 14 * snr_scale}}
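# (Illustrative note, added) each spec simply collects keyword arguments for
# normal_feature_dataset(), so the 'small' entry above amounts to roughly:
#
#   ds = normal_feature_dataset(perlabel=12, nchunks=4, nfeatures=6,
#                               snr=14 * snr_scale, nlabels=2,
#                               nonbogus_features=[0, 1])
#
# with nlabels and nonbogus_features supplied by generate_testing_datasets()
# below.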
# To assure reproducibility -- let's reseed the RNG at this point
@reseed_rng()
def generate_testing_datasets(specs):
    # Let's permute upon each invocation of the test, so we could possibly
    # trigger some funny cases
    nonbogus_pool = np.random.permutation([0, 1, 3, 5])

    datasets = {}

    # use a partitioner to flag odd/even samples as training and test
    ttp = OddEvenPartitioner(space='train', count=1)
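    # (Illustrative note, added) with space='train', the partitioner writes
    # the partition assignment into a 'train' sample attribute; assuming
    # PyMVPA's usual 1-based partition labeling, the single generated split
    # marks the two halves roughly like:
    #
    #   part = list(ttp.generate(some_ds))[0]
    #   part.sa['train'].unique        # -> array([1, 2])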
    for kind, spec in specs.iteritems():
        # set of univariate datasets
        for nlabels in [2, 3, 4]:
            basename = 'uni%d%s' % (nlabels, kind)
            nonbogus_features = nonbogus_pool[:nlabels]
            dataset = normal_feature_dataset(
                nlabels=nlabels,
                nonbogus_features=nonbogus_features,
                **spec)
            # full dataset
            datasets[basename] = list(ttp.generate(dataset))[0]
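        # (Illustrative note, added) the loop above populates keys such as
        # 'uni2small', 'uni3medium', or 'uni4large' -- one dataset per
        # label count and size category.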
        # sample 3D
        total = 2 * spec['perlabel']
        nchunks = spec['nchunks']
        data = np.random.standard_normal((total, 3, 6, 6))
        labels = np.concatenate((np.repeat(0, spec['perlabel']),
                                 np.repeat(1, spec['perlabel'])))
        data[:, 1, 0, 0] += 2 * labels  # add some signal
        chunks = np.asarray(range(nchunks) * (total / nchunks))
        mask = np.ones((3, 6, 6), dtype='bool')
        mask[0, 0, 0] = 0
        mask[1, 3, 2] = 0
        ds = Dataset.from_wizard(samples=data, targets=labels, chunks=chunks,
                                 mask=mask, space='myspace')
        # and to stress tests on manipulating sa/fa possibly containing
        # attributes of dtype object
        ds.sa['test_object'] = [['a'], [1, 2]] * (ds.nsamples / 2)
        datasets['3d%s' % kind] = ds
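        # (Illustrative note, added) from_wizard() flattens the 3x6x6 volume
        # into the feature axis, and the two masked-out voxels should leave
        # 3*6*6 - 2 = 106 features, e.g.:
        #
        #   assert datasets['3d%s' % kind].nfeatures == 106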
    # some additional datasets
    datasets['dumb2'] = dumb_feature_binary_dataset()
    datasets['dumb'] = dumb_feature_dataset()

    # dataset with a few invariant features
    _dsinv = dumb_feature_dataset()
    _dsinv.samples = np.hstack((_dsinv.samples,
                                np.zeros((_dsinv.nsamples, 1)),
                                np.ones((_dsinv.nsamples, 1))))
    datasets['dumbinv'] = _dsinv

    # datasets for regression testing
    datasets['sin_modulated'] = list(
        ttp.generate(multiple_chunks(sin_modulated, 4, 30, 1)))[0]
    # use the same full dataset for training
    datasets['sin_modulated_train'] = datasets['sin_modulated']
    datasets['sin_modulated_test'] = sin_modulated(30, 1, flat=True)

    # simple signal for linear regressors
    datasets['chirp_linear'] = multiple_chunks(chirp_linear, 6, 50, 10, 2, 0.3, 0.1)
    datasets['chirp_linear_test'] = chirp_linear(20, 5, 2, 0.4, 0.1)

    datasets['wr1996'] = multiple_chunks(wr1996, 4, 50)
    datasets['wr1996_test'] = wr1996(50)

    datasets['hollow'] = Dataset(HollowSamples((40, 20)),
                                 sa={'targets': np.tile(['one', 'two'], 20)})

    return datasets
# avoid treating it as a test by nose
generate_testing_datasets.__test__ = False
def saveload_warehouse():
    """Store all warehouse datasets into HDF5 and reload them."""
    import h5py
    from mvpa2.base.hdf5 import obj2hdf, hdf2obj

    tempdir = tempfile.mkdtemp()

    # store the whole dataset warehouse in one hdf5 file
    hdf = h5py.File(os.path.join(tempdir, 'myhdf5.hdf5'), 'w')
    for d in datasets:
        obj2hdf(hdf, datasets[d], d)
    hdf.close()

    # reload every stored dataset from that file
    hdf = h5py.File(os.path.join(tempdir, 'myhdf5.hdf5'), 'r')
    rc_ds = {}
    for d in hdf:
        rc_ds[d] = hdf2obj(hdf[d])
    hdf.close()

    # clean up the temporary dir
    shutil.rmtree(tempdir, ignore_errors=True)

    # return the reconstructed datasets (for use in the datasets warehouse)
    return rc_ds
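# (Illustrative sketch, added) the HDF5 round-trip can be spot-checked by
# comparing a reloaded dataset against its in-memory original, e.g.:
#
#   rc = saveload_warehouse()
#   assert sorted(rc.keys()) == sorted(datasets.keys())
#   assert np.all(rc['dumb'].samples == datasets['dumb'].samples)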
datasets = generate_testing_datasets(specs)

if cfg.getboolean('tests', 'use hdf datasets', False):
    if not externals.exists('h5py'):
        raise RuntimeError(
            "Cannot perform HDF5 dump of all datasets in the warehouse, "
            "because 'h5py' is not available")
    datasets = saveload_warehouse()
    print "Replaced the whole dataset warehouse with its HDF5-reloaded copy."