/usr/include/BALL/QSAR/featureSelection.h is in libball1.4-dev 1.4.3~beta1-3.
This file is owned by root:root, with mode 0o644.
The actual contents of the file can be viewed below.
// -*- Mode: C++; tab-width: 2; -*-
// vi: set ts=2:
//
#ifndef BALL_QSAR_FEATURESELECTION_H
#define BALL_QSAR_FEATURESELECTION_H
#ifndef BALL_QSAR_MODEL_H
#include <BALL/QSAR/Model.h>
#endif
#ifndef BALL_QSAR_KERNELMODEL_H
#include <BALL/QSAR/kernelModel.h>
#endif
#ifndef BALL_QSAR_LINEARMODEL_H
#include <BALL/QSAR/linearModel.h>
#endif
#include <set>
namespace BALL
{
namespace QSAR
{
/** Selects the features (descriptors) that a QSAR model should use.
Declares several selection strategies (forward, backward and stepwise selection, twin scan, coefficient-based implicit selection) as well as filters that remove correlated, weakly response-correlated or empty descriptors.\n
Only declarations are visible in this header; the member functions are defined elsewhere. */
class BALL_EXPORT FeatureSelection
{
public:
/** @name Constructors and Destructors
*/
//@{
/** constructs a feature selection for the given model.
NOTE(review): presumably stores a pointer to m in model_ -- confirm in the implementation. */
FeatureSelection(Model& m);
/** constructs a feature selection for the given kernel model.
NOTE(review): presumably also makes weights_ point to the weights of the kernel model -- confirm in the implementation. */
FeatureSelection(KernelModel& m);
~FeatureSelection();
/* Eigen macro that declares aligned operator new/delete overloads for this class (see the Eigen documentation on structures having Eigen members).
NOTE(review): the only Eigen member visible in this header is a pointer (weights_) -- confirm whether the macro is still required. */
EIGEN_MAKE_ALIGNED_OPERATOR_NEW
//@}
/** @name Accessors
*/
//@{
/** sets the model for which feature selection is to be done */
void setModel(Model& m);
/** sets the kernel model for which feature selection is to be done */
void setModel(KernelModel& km);
/** starts forward selection. \n
In order to evaluate how much a descriptor increases the accuracy of the model, cross-validation is started in each step using descriptor_matrix from class QSARData as data source.\n
@param k presumably the number of cross-validation folds (defaults to 4) -- TODO confirm against the implementation
@param optPar 1 : Model.optimizeParameters() is used to try to find the optimal parameters during *each* step of feature selection. \n
0: Model.optimizeParameters() is not used during feature selection*/
void forwardSelection(int k=4, bool optPar=0);
/** starts backward selection. \n
In order to evaluate how much the removal of a descriptor changes the accuracy of the model, cross-validation is started in each step using descriptor_matrix from class QSARData as data source.\n
@param k presumably the number of cross-validation folds (defaults to 4) -- TODO confirm against the implementation
@param optPar 1 : Model.optimizeParameters() is used to try to find the optimal parameters during *each* step of feature selection. \n
0: Model.optimizeParameters() is not used during feature selection*/
void backwardSelection(int k=4, bool optPar=0);
/** starts stepwise selection. \n
NOTE(review): presumably forward selection with a backward selection run after each added feature (see the private forward() with stepwise==1) -- confirm in the implementation.
@param k presumably the number of cross-validation folds (defaults to 4) -- TODO confirm
@param optPar see forwardSelection() */
void stepwiseSelection(int k=4, bool optPar=0);
/** Does a simple check consisting of two successive scans of all features.\n
In the first scan, the best feature to start with is searched.\n
In the second scan, it is checked for each remaining (non-empty) descriptor whether it can increase the prediction quality. The features are tested in the descending order of their predictive qualities as determined in the first scan. \n
Thus, this method is particularly suited for models that consider all features to be independent of each other (e.g. Bayesian classification models).
@param k presumably the number of cross-validation folds; unlike the other selection methods it has no default value -- TODO confirm
@param optPar see forwardSelection() */
void twinScan(int k, bool optPar=0);
/** uses the coefficients generated by a linear regression model (LinearModel.training_result) in order to select features.\n
All descriptors whose coefficients are within 0 +/- d*stddev are considered to be unimportant and are not selected.\n
Furthermore, if feature selection has already been done on FeatureSelection->model, only those descriptors that are already part of lm AND of FeatureSelection->model are tested.
@param lm the linear model whose coefficients are evaluated
@param act determines which coefficients are to be used, i.e. which column of LinearModel.training_result
@param d width of the rejection interval around 0, in multiples of stddev (defaults to 1) */
void implicitSelection(LinearModel& lm, int act=1, double d=1);
/** removes features that are highly correlated to another feature.
@param cor_threshold all features which have a correlation (to another feature) > cor_threshold or \< cor_threshold are removed.
NOTE(review): the lower bound was presumably meant to read "\< -cor_threshold" -- confirm against the implementation.
NOTE(review): taken by non-const reference, so callers cannot pass a literal or temporary; whether the argument is modified is not visible here. */
void removeHighlyCorrelatedFeatures(double& cor_threshold);
/** removes those features that do not have a correlation greater than the specified value to any of the response variables.
@param min_correlation the minimal correlation a feature must have to at least one response variable in order to be kept (taken by non-const reference; whether it is modified is not visible here) */
void removeLowResponseCorrelation(double& min_correlation);
/** removes descriptors whose values are 0 in all substances from the list of selected features */
void removeEmptyDescriptors();
/** NOTE(review): undocumented in the original; presumably selects the quality statistic that is used to evaluate models during feature selection -- confirm the meaning of s in the implementation. */
void selectStat(int s);
/** Sets a cutoff value for feature selections. \n
If the predictive quality is increased by less than d after adding/removing a descriptor, feature selection is stopped.
@param d the cutoff value (taken by non-const reference, so callers cannot pass a literal or temporary) */
void setQualityIncreaseCutoff(double& d);
//@}
private:
/** @name Accessors
*/
//@{
/** NOTE(review): undocumented in the original; presumably transfers the kernel weights in oldWeights, which belong to the descriptors in oldDescIDs, to the descriptor set newDescIDs -- confirm in the implementation. */
void updateWeights(std::multiset<unsigned int>& oldDescIDs, std::multiset<unsigned int>& newDescIDs, Eigen::VectorXd& oldWeights);
//@}
/** @name Attributes
*/
//@{
/** searches for empty or irrelevant descriptors and returns a sorted list containing their IDs.
\n If more than one feature selection method is applied, all descriptors that have not been selected by the previous method are considered to be irrelevant.
NOTE(review): a raw pointer is returned and its ownership is not visible in this header -- confirm whether the caller has to delete it. */
std::multiset<unsigned int>* findIrrelevantDescriptors();
/** pointer to the model, for which feature selection is to be done */
Model* model_;
/** pointer to KernelModel.weights (if the model to be optimized is a KernelModel) */
Eigen::VectorXd* weights_;
/** implements forward selection; if stepwise==1, backwardSelection() is called after each forward step, i.e. after adding a feature.
@param k and @param optPar have the same meaning as in forwardSelection() */
void forward(bool stepwise, int k, bool optPar);
/** if the predictive quality is increased by less than this value after adding/removing a descriptor, feature selection is stopped. */
double quality_increase_cutoff_;
//@}
};
}
}
#endif // BALL_QSAR_FEATURESELECTION_H