This file is indexed.

/usr/include/kmer/kmer/positionDB.H is in libkmer-dev 0~20150903+r2013-3.

This file is owned by root:root, with mode 0o644.

The actual contents of the file can be viewed below.

  1
  2
  3
  4
  5
  6
  7
  8
  9
 10
 11
 12
 13
 14
 15
 16
 17
 18
 19
 20
 21
 22
 23
 24
 25
 26
 27
 28
 29
 30
 31
 32
 33
 34
 35
 36
 37
 38
 39
 40
 41
 42
 43
 44
 45
 46
 47
 48
 49
 50
 51
 52
 53
 54
 55
 56
 57
 58
 59
 60
 61
 62
 63
 64
 65
 66
 67
 68
 69
 70
 71
 72
 73
 74
 75
 76
 77
 78
 79
 80
 81
 82
 83
 84
 85
 86
 87
 88
 89
 90
 91
 92
 93
 94
 95
 96
 97
 98
 99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
#ifndef POSITIONDB_H
#define POSITIONDB_H

#include <stdio.h>
#include "bio++.H"
#include "merStream.H"

//  The two existDB inputs can be either forward or canonical.  If
//  canonical, we are smart enough to search exist/only with the
//  canonical mer.

//  Returns position in posn, resizing it if needed.  Space is
//  allocated if none supplied.  The following is valid:
//
//    uint64  *posn    = 0L;
//    uint64   posnMax = 0;
//    uint64   posnLen = 0;
//    if (get(somemer, posn, posnMax, posnLen)) {
//      do something with the positions
//    }
//
//  exists() returns T/F if mer exists or not
//  count() returns the number of times that mer is present

//  Define this to use an uncompressed hash table when the width is 32
//  bits or less.  Doing so is A LOT faster in mismatch lookups, but
//  does use more memory.
#undef UNCOMPRESS_HASH_TABLE

//  Define this to leave out references to getTime(), speedCounter()
//  and make the positionDB build very quietly.
#undef SILENTPOSITIONDB

//  Define these to enable some debugging methods
#undef DEBUGPOSDB
#undef DEBUGREBUILD

class existDB;
class merylStreamReader;

class positionDB {
public:
  positionDB(char const        *filename,
             uint32             merSize,
             uint32             merSkip,
             uint32             maxMismatch,
             bool               loadData=true);

  positionDB(merStream         *MS,
             uint32             merSize,
             uint32             merSkip,
             existDB           *mask,
             existDB           *only,
             merylStreamReader *counts,
             uint32             minCount,
             uint32             maxCount,
             uint32             maxMismatch,
             uint32             maxMemory,
             bool               beVerbose);

  ~positionDB();

private:
  void  build(merStream         *MS,
              existDB           *mask,
              existDB           *only,
              merylStreamReader *counts,
              uint32             minCount,
              uint32             maxCount,
              bool               beVerbose);

private:
  void        reallocateSpace(uint64*&    posn,
                              uint64&     posnMax,
                              uint64&     posnLen,
                              uint64      len);

  void        loadPositions(uint64      v,
                            uint64*&    posn,
                            uint64&     posnMax,
                            uint64&     posnLen,
                            uint64&     count);

public:
  bool        getExact(uint64      mer,
                       uint64*&    posn,
                       uint64&     posnMax,
                       uint64&     posnLen,
                       uint64&     count);
  bool        existsExact(uint64   mer);
  uint64      countExact(uint64    mer);

public:
  void        filter(uint64 lo, uint64 hi);

private:
  double      setUpMismatchMatcher(uint32 nErrorsAllowed, uint64 approxMers);
public:
  bool        getUpToNMismatches(uint64      mer,
                                 uint32      maxMismatches,
                                 uint64*&    posn,
                                 uint64&     posnMax,
                                 uint64&     posnLen);
private:
  uint64      setCount(uint64 mer, uint64 count);

  //  Save or load a built table
  //
public:
  void        saveState(char const *filename);
  bool        loadState(char const *filename, bool beNoisy=false, bool loadData=true);

  void        printState(FILE *stream);

  //  Only really useful for debugging.  Don't use.
  //
  void        dump(char *name);


  bool         checkREBUILD(uint64 m) {
#define DEBUGREBUILD
#ifdef DEBUGREBUILD
    uint64 h = HASH(m);
    uint64 c = CHECK(m);
    uint64 r = REBUILD(h, c);
    if (r != m) {
      fprintf(stderr, "shift1 = "uint32FMT"\n", _shift1);
      fprintf(stderr, "shift2 = "uint32FMT"\n", _shift2);
      fprintf(stderr, "M = "uint64HEX"\n", m);
      fprintf(stderr, "H = "uint64HEX"\n", h);
      fprintf(stderr, "C = "uint64HEX"\n", c);
      fprintf(stderr, "R = "uint64HEX"\n", r);
      return(false);
    }
    return(true);
#else
    return(REBUILD(HASH(m), CHECK(m)) == m);
#endif
  };

private:

  uint64       HASH(uint64 k) {
    return(((k >> _shift1) ^ (k >> _shift2) ^ k) & _mask1);
  };

  uint64       CHECK(uint64 k) {
    return(k & _mask2);
  };

  uint64       REBUILD(uint64 h, uint64 c) {
    //  Decode a HASH and a CHECK to get back the mer.  You'd better
    //  bloody PRAY you don't break this (test/test-rebuild.C).  It
    //  was a headache++ to write.

    uint64 sha = _shift1 - _shift2;
    uint64 msk = uint64MASK(sha);

    //  The check is exactly the mer....just not all there.
    uint64 mer = c;

    uint64 shf = sha - (_tableSizeInBits % 2);
    uint64 shg = 0;
    uint64 shh = _shift1;

    //  Unrolling this is troublesome - we still need the tests,
    //  bizarre merSize, tblSize combinations use lots of iterations
    //  (when the merSize and tblSize are about the same, the CHECK is
    //  small, and so we need to do lots of iterations).

    //fprintf(stderr, "shf="uint64FMTW(2)" shg="uint64FMTW(2)" shh="uint64FMTW(2)" mer="uint64HEX"\n", shf, shg, shh, mer);

    do {
      mer |= (((h >> shg) ^ (mer >> shg) ^ (mer >> shf)) & msk) << shh;
      //fprintf(stderr, "shf="uint64FMTW(2)" shg="uint64FMTW(2)" shh="uint64FMTW(2)" mer="uint64HEX"\n", shf, shg, shh, mer);

      shf += sha;
      shg += sha;
      shh += sha;
    } while ((shf < _merSizeInBits) && (shh < 64));

    mer &= uint64MASK(_merSizeInBits);

    return(mer);
  };

  void         sortAndRepackBucket(uint64 b);

  uint32     *_bucketSizes;
  uint64     *_countingBuckets;
  uint64     *_hashTable_BP;  //  Bit packed
  uint32     *_hashTable_FW;  //  Full width
  uint64     *_buckets;

  uint64     *_positions;

  uint32      _merSizeInBases;
  uint32      _merSizeInBits;

  uint32      _merSkipInBases;

  uint64      _tableSizeInEntries;
  uint32      _tableSizeInBits;

  uint32      _hashWidth;  // Hash bith
  uint32      _chckWidth;  // Check bits
  uint32      _posnWidth;  // Positions in the sequence
  uint32      _pptrWidth;  // Pointers to positions
  uint32      _sizeWidth;  // Extra number in the table

  uint64      _hashMask;

  uint32      _wCnt;
  uint32      _wFin;

  uint32      _shift1;
  uint32      _shift2;
  uint64      _mask1;
  uint64      _mask2;

  uint64      _numberOfMers;
  uint64      _numberOfPositions;
  uint64      _numberOfDistinct;
  uint64      _numberOfUnique;
  uint64      _numberOfEntries;
  uint64      _maximumEntries;

  //  For sorting the mers
  //
  uint32      _sortedMax;
  uint64     *_sortedChck;
  uint64     *_sortedPosn;

  //  For the mismatch matcher
  uint32      _nErrorsAllowed;
  uint32      _hashedErrorsLen;
  uint32      _hashedErrorsMax;
  uint64     *_hashedErrors;
};

#endif  //  POSITIONDB_H