/usr/include/BALL/QSAR/QSARData.h is in libball1.4-dev 1.4.3~beta1-3.
This file is owned by root:root, with mode 0o644.
The actual contents of the file can be viewed below.
// -*- Mode: C++; tab-width: 2; -*-
// vi: set ts=2:
//
//
#ifndef QSARH
#define QSARH
#include <iostream>
#include <BALL/KERNEL/system.h>
#include <BALL/FORMAT/SDFile.h>
#include <BALL/FORMAT/PDBFile.h>
#include <BALL/FORMAT/HINFile.h>
#include <BALL/FORMAT/MOLFile.h>
#include <vector>
#include <list>
#include <set>
#include <map>
#include <math.h>
#include <sstream>
#include <fstream>
#include <limits>
#include <fstream>
#include <BALL/QSAR/simpleDescriptors.h>
#include <BALL/QSAR/connectivityDescriptors.h>
#include <BALL/QSAR/partialChargeDescriptors.h>
#include <BALL/QSAR/surfaceDescriptors.h>
#include <BALL/COMMON/exception.h>
#include <cstring>
#ifndef STATISTICS
#include <BALL/QSAR/statistics.h>
#endif
#ifndef QSAR_EXCEPTION
#include <BALL/QSAR/exception.h>
#endif
#include <BALL/CONCEPT/timeStamp.h>
// #ifndef MODEL
// #include "Model.h"
// #endif
namespace BALL
{
// Forward declaration only: within this header MolecularSimilarity is used
// solely as a reference parameter of QSARData::calculateTopologicalDescriptors(),
// so its full definition is not required here.
class MolecularSimilarity;
namespace QSAR
{
/** A sequence of double values; the element type of VMatrix.
    Spelled with an explicit std:: qualifier so that this header does not depend on a
    using-declaration leaking in from another include. */
typedef std::vector<double> Column;
/** Vector-of-vectors matrix of doubles. QSARData uses it for its descriptor, response
    and transformation matrices. */
typedef std::vector<Column> VMatrix;
/** QSAR */
class BALL_EXPORT QSARData
{
public:
QSARData();
~QSARData();
/** @name Predicates */
/** tells whether the features have been centered */
bool isDataCentered() const;
/** tells whether the response variables have been centered */
bool isResponseCentered() const;
/** @name Accessors
*/
//@{
/** reads the names of the properties from the first molecule in the given sd-file */
vector<String>* readPropertyNames(String sd_file);
/** Fetches input from one sd-file containing all structures and from one file containing the activities of all structures sorted in ascending order. \n
The latter file is assumed to have the same name as the first one, with only the extension changed to ".txt"
@param file the sd-file containing the input */
void readSDFile(const char* file);
/** Fetches input from one sd-file containing all structures. The activity value for each molecule is taken from its property in the sd-file. \n
@param a contains the numbers of the properties that are activity-values
@param file the sd-file containing the input
@param useExDesc if set to 1, descriptors read from the sd-file will be used in addition to those calculated by BALL internally
@param append if set to 1, the substances read from the sd-file will be appended as new lines to the current descriptor_matrix */
void readSDFile(const char* file, std::multiset<int>& act, bool useExDesc=1, bool append=0, bool translate_class_labels=0);
void readSDFile(const char* file, std::set<String>& activity_names, bool useExDesc=1, bool append=0, bool translate_class_labels=0, bool calc_phychem_properties=1, bool calc_topological_properties=1);
/** show descriptor_matrix on stdout */
void displayMatrix();
/** centers each descriptor to mean of 0 and stddev of 1
@param center_Y if ==1, activity values are also centered. Obviously this should NOT be used for classification experiments! */
void centerData(bool center_Y=0);
/** scales each descriptor to stddev of 1 */
void scaleAllDescriptors();
/** returns the number of substances */
unsigned int getNoSubstances() const;
/** returns the number of descriptors */
unsigned int getNoDescriptors() const;
/** Read input from a csv file. \n
This file should contain all descriptor values in the first columns and the activity values in the last no_y columns.\n
@param no_y the number of activities, i.e. the number of columns containing activity values
@param xlabels if ==1, names of descriptors are read from the first line of the table
@param ylabel if ==1, names of substances are read from the first column of the table
@param sep the character used to seperate the cells of the table
@param appendDescriptors if set to 1, descriptors will be read from the file and appended as new columns to the current descriptor_matrix */
void readCSVFile(const char* file, int no_y, bool xlabels, bool ylabels, const char* sep=",", bool appendDescriptors=0, bool translate_class_labels=0);
/** for testing purposes only: change Y-matrix according to the given equations */
void manipulateY(std::vector<String> v);
/** for testing purposes only: change Y-matrix according to the given equation
@param v string containing the equation, e.g."x1+x3*5+x10^2" */
void manipulateY(String v);
/** Discretize the response values. If the response variable(s) of this data object have been normalized, the given thresolds will be automatically normalized accordingly.
@param thresolds d thresholds for d+1 classes, that are to be created */
void discretizeY(std::vector<double> thresholds);
void transformX(std::vector<String> v);
/** partitions the input data into p QSARData object of (approx.) equal size. */
std::vector<QSARData*> partitionInputData(int p);
/** saves the current QSARData object to a text file */
void saveToFile(string filename) const;
/** reconstructs a QSARData object from a text file */
void readFromFile(string filename);
/** generates a training and an external validation set from the current QSARData object
@param fraction the fraction of this current coumpounds that should be used as external validation set (by random drawing) */
std::vector<QSARData*> generateExternalSet(double fraction) const;
/** Split this data set into a training set and a test set.
In contrast to generateExternalSet(), compounds for the test set are *not* randomly selected. Instead, this data set is first sorted according to response values (in order to ensure equal response value ranges) and then split regularly into training and test set.
@param no_test_splits the total number of splits you want to create by successive calls of this function
@param current_test_split_id the split to be produced, with 0<=current_test_split_id<no_test_splits */
std::vector<QSARData*> evenSplit(int no_test_splits, int current_test_split_id, int response_id=0) const;
/** returns a pointer to a new vector containing the UNcentered descriptor values for the s'th substance of the current data set */
std::vector<double>* getSubstance(int s) const;
/** returns a pointer to a new vector containing the UNcentered response values for the s'th substance of the current data set */
std::vector<double>* getActivity(int s) const;
/** returns the number of response variables */
unsigned int getNoResponseVariables() const;
const std::vector<string>* getSubstanceNames() const;
/** checks whether the response variables contain only discrete values. This can be used to check whether the current input data set is suitable for a ClassificationModel */
bool checkforDiscreteY() const;
/** checks whether the response variables of a specified file contain only discrete values. */
bool checkforDiscreteY(const char* file, std::multiset<int>& activity_IDs) const;
/** allows to set the data-folder neccessary for computation of descriptors without using BALL_DATA_PATH enviroment variable, which is useful for standalone applications */
void setDataFolder(const char* folder);
/** removes compounds whose absolute correlation coefficient to another compound is larger than cor_threshold
@param feature_cor_threshold Only features that do not have a correlation larger than this value to another feature are used to calculate the similarity of compounds (=instances). */
void removeHighlyCorrelatedCompounds(double& compound_cor_threshold, double& feature_cor_threshold);
/** Find all descriptors of the current data set that have a correlation of at least 'similarity' to the specified feature
@param descriptor_ID the ID of the descriptor for which similar features should be searched
@param similarity the desired minimal correlation
@param similar_descriptor_IDs list to which the IDs of the found descriptors will be saved as pairs of descriptor ID and descriptor name */
void getSimilarDescriptors(int descriptor_ID, double correlation, std::list<std::pair<uint,String> >& similar_descriptor_IDs) const;
//@}
protected:
/** @name Accessors
*/
//@{
/**
Calculates descriptors for one molecule and saves them into one new line of descriptor_matrix
*/
void calculateBALLDescriptors(Molecule& m);
/** Calculates topological descriptors based on functional groups counts done by SMARTS matching */
void calculateTopologicalDescriptors(Molecule& mol, MolecularSimilarity& molsim, const std::map<String,int>& descriptor_map);
/** writes the names of all external descriptors into column_names */
void setDescriptorNames(const Molecule& m, std::multiset<int>& activity_IDs, bool useExDesc=1, bool resize=1);
/** removes columns of invalid descriptor from descriptor_matrix
@param invalidDescriptors list containing the IDs of the columns to be deleted */
void removeInvalidDescriptors(std::multiset<int>& invalidDescriptors);
void removeInvalidSubstances(std::multiset<int>& inv);
/** reconstructs a vector based matrix from a file */
void readMatrix(VMatrix& mat, std::ifstream& in, char seperator, unsigned int lines, unsigned int col);
/** checks whether the given list of activity IDs contains any values <0 or values that are larger than the number of properties in the current input file.\n
If such values are found, an Exception of type InvalidActivityID is thrown. */
void checkActivityIDs(std::multiset<int>& act, int no_properties);
/** appends compound no <s> taken from the given source to the data of this object.
@param backtransformation if set to true, all features of the compound are back-transformed after adding them to this object. */
void insertSubstance(const QSARData* source, int s, bool backtransformation=0);
/** prints a vector-based matrix to a file */
void printMatrix(const VMatrix& mat, std::ostream& out) const;
//@}
/** @name Attributes
*/
//@{
/** matrix containing the values of each descriptor for each substance */
VMatrix descriptor_matrix_;
/** matrix containing the experimentally determined results (active/non-active) for each substance. Different activities are saved column-wise. */
VMatrix Y_;
/** 2xm dimensional matrix (m=no of descriptors) containing mean and stddev of each transformed descriptor */
VMatrix descriptor_transformations_;
/** 2xc dimensional matrix (c=no of activities) containing mean and stddev of each transformed activity */
VMatrix y_transformations_;
/** names of all descriptors */
vector<string> column_names_;
/** names of all substances */
vector<string> substance_names_;
/** contains the numbers of external descriptors for which invalid values (e.g. strings instead numerical values) were encountered in some molecules */
std::multiset<int> invalidDescriptors_;
std::multiset<int> invalidSubstances_;
String data_folder_;
/** in case of classification data sets with non-numeric class labels, this member maps the names of the individual classes to their assigned id. */
std::map<String,int> class_names_;
//@}
friend class ClassificationValidation;
friend class RegressionValidation;
friend class Validation;
friend class Model;
friend class FitModel;
friend class FeatureSelection;
};
}
}
#endif // QSARH
|