/usr/include/pbseq/alignment/algorithms/anchoring/LISPValueImpl.hpp is in libblasr-dev 0~20161219-2.
This file is owned by root:root, with mode 0o644.
The actual contents of the file can be viewed below.
1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 | #ifndef _BLASR_LISP_VALUE_IMPL_HPP_
#define _BLASR_LISP_VALUE_IMPL_HPP_
template<typename T_MatchPos>
void StoreNonOverlappingIndices(std::vector<T_MatchPos> &lis,
std::vector<T_MatchPos> &noOvpLis) {
unsigned int i;
//
// Greedily add lis matches according to weight. A match may be added
// as long as it does not overlap with any other matches.
//
// do nothing on empty lists
if (lis.empty()) return;
//
// First build a list of matches sorted by weight.
SortMatchPosListByWeight(lis);
//
// The first match is guaranteed to not overlap.
noOvpLis.push_back(lis[0]);
//
// Nothing is overlapping, and everything is sorted when there is
// just one value.
if (lis.size() == 1) return;
//
// Next, add matches as long as they do not overlap.
for (i = 1; i < lis.size(); i++ ){
VectorIndex j;
int lts = lis[i].t;
int lte = lis[i].t + lis[i].GetLength();
int lqs = lis[i].q;
int lqe = lis[i].q + lis[i].GetLength();
int ovpFound = 0;
for (j =0; j < noOvpLis.size(); j++ ){
int tIntvStart = noOvpLis[j].t;
int tIntvEnd = noOvpLis[j].t + noOvpLis[j].GetLength();
int qIntvStart = noOvpLis[j].q;
int qIntvEnd = noOvpLis[j].q + noOvpLis[j].GetLength();
if ((lts >= tIntvStart and lts < tIntvEnd) or
(lte > tIntvStart and lte <= tIntvEnd) or
(lqs >= qIntvStart and lqs < qIntvEnd) or
(lqe > qIntvStart and lqe <= qIntvEnd)) {
ovpFound = 1;
break;
}
}
if (!ovpFound) {
noOvpLis.push_back(lis[i]);
}
}
//
// Now, the matches are found in order of size, but they need to
// be stored in order of text.
//
SortMatchPosList(noOvpLis);
//
// The match pos list was sorted in order of weight.
// Just in case it causes problems down the line, re-sort it
// according to query pos.
//
lis = noOvpLis;
SortMatchPosList(lis);
}
template<typename T_TextSequence,
typename T_Sequence,
typename T_MatchPos,
typename T_Tuple>
float ComputeLISPValue(std::vector<T_MatchPos> &lis,
T_TextSequence &text, T_Sequence &read,
TupleMetrics &tm, TupleCountTable<T_TextSequence, T_Tuple> &ct,
int &lisNBases, int &lisSize ) {
(void)(read);
//
// First, find a subset of the lis that has non-overlapping matches.
//
size_t i;
lisNBases = 0;
for (i = 0; i < lis.size(); i++) {
lisNBases += lis[i].l;
}
lisSize = lis.size();
if (lis.size() == 1) {
//
// When just one LIS is found, the pvalue of the match is just the
// pvalue of getting a single hit with one read, which we estimate
// as the pvalue of at least one hit.
//
float matchProb;
//
// Weight the single match based on how frequently it appears in
// the genome.
if (POneOrMoreMatches<T_TextSequence>(text, lis[0].t,
lis[0].GetLength(), tm, ct, matchProb)) {
return matchProb;
}
else {
// return non-significant match value.
return 1;
}
}
else {
//
// There is more than one non overlapping match. This evokes a
// totally different probability metric on LIS values. Rather
// than computing the probability of a single match, compute
// the probability of seeing several matches in a row.
//
VectorIndex i;
float pChain = 0; // In other words, pChain = log(1)
//
// prob of chain starts out as prob of first match.
//
if (POneOrMoreMatches(text, lis[0].t, lis[0].GetLength(),
tm, ct, pChain)) {
//
// Next, for each match, compute the expected probability of
// having to wait at most tGap time for a match. Since this
// has screened for anchors that are not overlapping,
// consider the events to be independent, and therefore
// the probabilities multiply.
//
for (i = 1; i < lis.size(); i++ ){
//
// Find out how many times this word appears in the genome.
//
// Assume all hits are uniformly distributed across the
// genome. qLambda is the frequency of seeing hit i in the
// genome.
//
float qLambda;
qLambda = lis[i].GetMultiplicity() / (1.0*text.length);
// Now compute the probability of the chain. Since the
// matches are uniformly distributed across the genome, the
// waiting time between matches is exponentially distributed.
pChain = pChain + log(qLambda);
}
return pChain;
}
else {
return 1;
}
}
}
#endif
|