/usr/include/pbseq/alignment/algorithms/anchoring/ScoreAnchorsImpl.hpp is in libblasr-dev 0~20161219-1.
This file is owned by root:root, with mode 0o644.
The actual contents of the file can be viewed below.
1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 | #ifndef _BLASR_SCORE_ANCHOR_IMPL_HPP_
#define _BLASR_SCORE_ANCHOR_IMPL_HPP_
template<typename TSequence, typename T_Tuple>
int GetTupleCount(TSequence &seq, DNALength startPos,
TupleMetrics &tm, TupleCountTable<TSequence, T_Tuple> &ct,
int &count) {
T_Tuple tuple;
if (tuple.FromStringLR(&seq.seq[startPos], tm)) {
count = ct.countTable[TupleData(tuple)];
return 1;
}
else {
return 0;
}
}
template<typename TSequence, typename T_Tuple>
int PMatch(TSequence &seq, DNALength startPos, DNALength length,
TupleMetrics &tm, TupleCountTable<TSequence, T_Tuple> &ct,
float &pMatch){
int tupleCount;
T_Tuple tuple, curTuple;
//
// Compute the probability of a match of length 'length'
// in the genome using a k-th order Markov model of the genome.
// Other than that there is no spatial constraint on a match.
// This means that if the length of seq is k, and that sequence
// of length k exists in the genome, then the probability of a
// match is 1.
pMatch = 1;
if (GetTupleCount(seq, startPos, tm, ct, tupleCount)) {
if (tupleCount == 0) return 0;
//
// Compute the frequency of the following tuple, and compare this
// to the frequencies of all 4 possible tuples that are next.
//
curTuple.FromStringLR(&seq.seq[startPos], tm);
if (length < static_cast<DNALength>(tm.tupleSize)) {
// the match is shorter than the tuples used to model the
// genome sequence composition. Don't try and compute a p-value
// for it -- assume that you will always find a match of this
// length.
//
pMatch = 0;
return 1;
}
for (size_t i = 1; i < length - tm.tupleSize; i++) {
//
// now add on the log counts for the transitions.
//
if (tuple.FromStringLR(&seq.seq[i+startPos], tm) == 0) {
return 0;
}
int nextTupleCount = 0;
int rightMarCount = SumRightShiftMarginalTupleCounts(tm, ct,
tuple, TwoBit[seq.GetNuc(startPos+i+tm.tupleSize-1)],
nextTupleCount);
//
// tuple counts are not defined for N's.
//
if (TwoBit[seq.GetNuc(startPos + i+tm.tupleSize)] > 3) {
return 0;
}
if (nextTupleCount == 0) {
//
// There is no background distribution available for this
// sequence context, therefore no way to evaluate p-value.
//
return 0;
}
pMatch += log((nextTupleCount / (1.0*rightMarCount)));
curTuple.tuple = tuple.tuple;
}
//
// Done computing the probability of an extension. Now compute the probability
// of the match. There are nMatches of the initial seed. We assume that each has
// an equal probability of matching.
//
return 1;
}
else {
return 0;
}
}
template<typename TSequence, typename T_Tuple>
int POneOrMoreMatches(TSequence &seq, DNALength startPos,
DNALength length, TupleMetrics &tm,
TupleCountTable<TSequence, T_Tuple> &ct,
float &pValue){
float pMatch = 1;
//
// Compute the probability that the sequence matches ANY spot
// in the reference for at least 'length' bases.
//
PMatch(seq, startPos, length, tm, ct, pMatch);
pValue = pMatch;
return 1;
}
template<typename TSequence, typename T_Tuple>
int SumRightShiftMarginalTupleCounts(TupleMetrics &tm,
TupleCountTable<TSequence, T_Tuple> &ct, T_Tuple curTuple,
int nextNuc, int &nextSeqCount) {
(void)(tm);
int totalCount = 0;
int rightMarCount = 0;
long i;
T_Tuple altMask;
altMask.tuple = 3L;
altMask.tuple = ~altMask.tuple;
for (i = 0; i < 4; i++ ) {
T_Tuple alt = curTuple;
// next.ShiftLeft(tm, 2);
alt.tuple = alt.tuple & altMask.tuple;
alt.tuple += i;
// next.Append(i,2L);
rightMarCount = ct.countTable[TupleData(alt)];
totalCount += rightMarCount;
if (i == nextNuc) {
nextSeqCount = rightMarCount;
}
}
return totalCount;
}
template<typename TSequence, typename T_Tuple>
int ComputeTotalTupleCount(TupleMetrics &tm,
TupleCountTable<TSequence, T_Tuple> &ct, TSequence &seq,
int start, int end) {
PB_UNUSED(start);
if (end == -1) {
end = seq.length;
}
int nTuples = end - tm.tupleSize + 1;
if (nTuples == 0 ){
return 0;
}
int totalCount = 0;
T_Tuple tuple;
int i;
for (i = 0; i < nTuples; i++) {
tuple.FromStringLR(&seq.seq[i], tm);
totalCount += ct.countTable[tuple.ToLongIndex()];
}
return totalCount;
}
template<typename TSequence, typename T_Tuple>
int ComputeAverageTupleCount(TupleMetrics &tm,
TupleCountTable<TSequence, T_Tuple> &ct, TSequence &seq) {
int nTuples = seq.length - tm.tupleSize + 1;
if (nTuples == 0 ){
return 0;
}
int totalCount = ComputeTotalTupleCount(tm, ct, seq);
return totalCount / nTuples;
}
inline
int ComputeExpectedFirstWaitingTime(float lambda) {
// The first waiting time of a Poisson process is
// exponentially distributed. The mean of an exponentially
// distributed variable with parameter lambda is 1/lambda.
return (1/lambda);
}
#endif
|