This file is indexed.

/usr/include/anfo/index.h is in libanfo0-dev 0.98-4.

This file is owned by root:root, with mode 0o644.

The actual contents of the file can be viewed below.

  1
  2
  3
  4
  5
  6
  7
  8
  9
 10
 11
 12
 13
 14
 15
 16
 17
 18
 19
 20
 21
 22
 23
 24
 25
 26
 27
 28
 29
 30
 31
 32
 33
 34
 35
 36
 37
 38
 39
 40
 41
 42
 43
 44
 45
 46
 47
 48
 49
 50
 51
 52
 53
 54
 55
 56
 57
 58
 59
 60
 61
 62
 63
 64
 65
 66
 67
 68
 69
 70
 71
 72
 73
 74
 75
 76
 77
 78
 79
 80
 81
 82
 83
 84
 85
 86
 87
 88
 89
 90
 91
 92
 93
 94
 95
 96
 97
 98
 99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
373
374
375
376
377
378
379
380
381
382
383
384
385
386
387
388
389
390
391
392
393
394
395
396
397
398
399
400
401
402
403
404
405
406
407
408
409
410
411
412
413
414
415
416
417
418
419
420
421
422
423
424
425
426
427
428
429
430
431
432
433
434
435
436
437
438
439
440
441
442
443
444
445
446
447
448
449
450
451
452
453
454
455
456
457
458
459
460
461
462
463
464
465
466
467
468
469
470
471
472
473
474
475
476
477
478
479
480
481
482
483
484
485
486
487
488
489
490
491
492
493
494
495
496
497
498
499
500
501
502
503
504
505
506
507
508
509
510
511
512
513
514
515
516
517
518
519
520
521
522
523
524
525
526
527
528
529
530
531
532
533
534
535
536
537
538
539
540
541
//    Copyright 2009 Udo Stenzel
//    This file is part of ANFO
//
//    ANFO is free software: you can redistribute it and/or modify
//    it under the terms of the GNU General Public License as published by
//    the Free Software Foundation, either version 3 of the License, or
//    (at your option) any later version.
//
//    Anfo is distributed in the hope that it will be useful,
//    but WITHOUT ANY WARRANTY; without even the implied warranty of
//    MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
//    GNU General Public License for more details.
//
//    You should have received a copy of the GNU General Public License
//    along with Anfo.  If not, see <http://www.gnu.org/licenses/>.

#ifndef INCLUDED_INDEX_H
#define INCLUDED_INDEX_H

#include "config.pb.h"
#include "sequence.h"
#include "util.h"

#include <algorithm>
#include <iostream>
#include <limits>
#include <list>
#include <map>
#include <string>
#include <utility>
#include <vector>

#include <glob.h>
#include <sys/mman.h>

//! \brief a genome as stored in a DNA file
//! This class mmaps a DNA file and wraps it with a sensible interface.
class CompactGenome
{
	public:
		// Why are we using indices here instead of pointers?  Because
		// pointers would be internal and make copying of this object a
		// lot harder.

		// key is offset into genome, value is (index of sequence, index
		// of contig)
		typedef std::map< uint32_t, std::pair< int, int > > ContigMap ;
		
		// value is index of contig
		typedef std::map< uint32_t, int > PosnMap1 ;

		// first half of value is index of sequence
		typedef std::map< std::string, std::pair< int, PosnMap1 > > PosnMap ;

		static void cleanup( const CompactGenome* ) {}

	private:
		DnaP base_ ;
		size_t file_size_ ;
		uint32_t length_ ;
		int fd_ ;
		ContigMap contig_map_ ;
		PosnMap posn_map_ ;

		CompactGenome( const CompactGenome& ) ; // not implemented
		void operator = ( const CompactGenome& ) ; // not implemented

		~CompactGenome() ; // must control life cycle
		friend class Metagenome ;

	public:
		config::Genome g_ ;
		mutable int refcount_ ;

	public:
		//! \brief makes accessible a genome file
		//! \param name file name of the genome
		//! \param c program configuration, needed for the search path
		CompactGenome( const std::string& name ) ;

		void add_ref() const { ++refcount_ ; }

		std::string name() const { return g_.name() ; }
		std::string describe() const 
		{
			std::string d = g_.name() ;
			if( g_.has_description() ) d += " (" + g_.description() + ")" ;
			return d ;
		}

		uint32_t total_size() const { return g_.total_size() ; }
		uint32_t raw_size() const { return length_ ; }
		DnaP get_base() const { return base_ ; }

		//! \brief scan over finite words of the dna
		//! Fixed size words in the genome are iterated.  The words may
		//! well contain ambiguity codes, but are guaranteed not to
		//! contain gaps.
		//!
		//! Words are encoded as four bits per nucleotide, first nucleotide in
		//! the MSB(!).  See ::make_dense_word for why that makes sense.  Unused
		//! MSBs in words passed to mk_word contain junk, do not rely on
		//! them.
		//!
		//! The functor objects in the following will be passed by
		//! value.  Make sure they are tiny and that their function call
		//! operators can be inlined.
		//!
		//! \param w word size, 4*w must not be more than there are bits
		//!          in an unsigned long long.
		//! \param mk_word functor object called to transform a
		//!                sequence of ambiguity codes into words of
		//!                nucleotides
		//! \param mk_gap functor that's called at every gap
		//! \param slice scan this slice of a big genome
		//! \param slices total number of slices
		//! \param msg if set, switches on progress reports and is
		//!            included in them
		template< typename F, typename G > void scan_words( unsigned w, F mk_word, G mk_gap, int slice = 0, int slices = 1, const char* msg = 0 ) const ;

		const ContigMap &get_contig_map() const { return contig_map_ ; }

		//! \brief translates a DNA pointer back to sequence coordinates
		//! If the DNA pointer points into this genome, it is translated
		//! to the name of the sequence and the offset into it.  The
		//! strand is disregarded, the result is the same regardless of
		//! the direction the pointer would be moving in.
		//! \param pos position to be translated
		//! \param offset is assigned the offset after translation
		//! \return pointer to sequence that includes the hit, else null
		const config::Sequence *translate_back( DnaP pos, uint32_t& offset ) const ;

		//! \brief translates "human" cooordinates into a pointer
		//! Return 0 if anything goes wrong.
		DnaP find_pos( const std::string& seq, uint32_t pos ) const ;

		enum { signature = 0x31414e44u } ; // DNA1 

	private:
		//! \brief reports a position while scanning
		//! \internal
		static void report( uint32_t, uint32_t, const char* ) ;
} ;

//! \brief Representation of a list of seeds.
//! A seed is described by a size and two coordinates.  One is the start
//! on the query sequence, the other we choose to be the "diagonal"
//! (difference between coordinates), since seeds on the same or close
//! diagonal will usually be combined.  Offset is negative for rc'ed
//! matches, in this case its magnitude is the actual offset from the
//! end of the sequence.
//!
//! Here we actually represent a list of seeds by two pointers into a
//! list of reference coordinates.  The first entry is chached in the
//! data structure itself, so it selves to represent both lists and
//! single seeds.
//!
//! The list of seeds in index files is sorted backwards (more or less
//! an artefact of the way indexes are built), so we can merge these
//! lists instead of copying and sorting them.
struct Matches
{
	const uint32_t *begin, *end ;
	int32_t offs ;
	uint16_t wordsize ;
	uint16_t stride ;
	int64_t diag ;
} ;


// Match lists are sorted backwards by reference coordinate (an artefact
// of the index construction process) and therefore backwards by
// diagonal (ref-coordinate minus offset, by definition); we also choose
// to sort backwards on the offset.
struct compare_match_lists {
	bool operator()( const Matches &a, const Matches &b ) {
		if( b.diag < a.diag ) return true ;
		if( a.diag < b.diag ) return false ;
		return b.offs < a.offs ;
	}
} ;

struct compare_matches_for_heap {
	// note the `wrong' order; this is intended for a heap!
	bool operator()( const Matches &a, const Matches &b ) {
		return compare_match_lists()( b, a ) ;
	}
} ;


//! \brief Collection of short matches.
//! We actually collect the ranges of sorted(!) matches in the index
//! data structure, then merge-sort them on demand.  The calling
//! sequence is to first post() ranges of index entries, then
//! start_traversal(), then look at the first seed using get(), and
//! remove them one by one using take() until the structure becomes
//! empty().
class PreSeeds
{
	private:
		struct equal_match_lists {
			bool operator()( const Matches &a, const Matches &b ) {
				return a.begin == b.begin && a.offs == b.offs ;
			}
		} ;
		std::vector< Matches > heap_ ;

		bool normalize() 
		{
			while( heap_.back().begin != heap_.back().end &&
					*heap_.back().begin % heap_.back().stride != 0 ) 
				++heap_.back().begin ;

			if( heap_.back().begin == heap_.back().end ) {
				heap_.pop_back() ;
				return false ;
			}
			else {
				heap_.back().diag = (int64_t)(*heap_.back().begin) - (int64_t)(heap_.back().offs) ;
				return true ;
			}
		}

	public:
		PreSeeds() {}

		void post( const uint32_t *begin, const uint32_t *end, int32_t offs, int wordsize, int stride )
		{
			if( begin != end ) {
				heap_.push_back( Matches() ) ;
				heap_.back().begin = begin ;
				heap_.back().end = end ;
				heap_.back().offs = offs ;
				heap_.back().wordsize = wordsize ;
				heap_.back().stride = stride ;
				normalize() ;
			}
		}

		bool empty() const { return heap_.empty() ; }

		const Matches& get() const { return heap_.front() ; }

		void start_traversal() {
			std::sort( heap_.begin(), heap_.end(), compare_match_lists() ) ;
			heap_.erase( std::unique( heap_.begin(), heap_.end(), equal_match_lists() ), heap_.end() ) ;
			std::make_heap( heap_.begin(), heap_.end(), compare_matches_for_heap() ) ;
		}

		void take() 
		{
			std::pop_heap( heap_.begin(), heap_.end(), compare_matches_for_heap() ) ;

			++heap_.back().begin ;
			assert( heap_.back().begin == heap_.back().end ||
				heap_.back().begin[-1] > heap_.back().begin[0] ) ;

			if( normalize() ) std::push_heap( heap_.begin(), heap_.end(), compare_matches_for_heap() ) ;
		}
} ;


inline std::ostream& operator << ( std::ostream& o, const Matches& s )
{
	return o << '@' << s.offs << '+' << s.diag << ':' << s.wordsize ;
}


class FixedIndex 
{
	public:
		struct LookupParams {
			uint32_t cutoff ; 				// more common seeds will be ignored
			uint32_t allow_mismatches ;		// number of changed positions per seed word
			uint32_t wordsize ;				// nucleotides per seed word
			uint32_t stride ;				// tiling stepsize
		} ;

		enum { signature = 0x33584449u } ; // "IDX3"

		FixedIndex() : p_(0), base(0), secondary(0), first_level_len(0), length(0), fd_(0), refcount_(0) {}

		//! \brief loads an index from a file
		//! \param name filename
		FixedIndex( const std::string &name ) ;
		~FixedIndex() { if( p_ ) munmap( (void*)p_, length ) ; if( fd_ != -1 ) close( fd_ ) ; }

		unsigned lookupS( const std::string&  seq, PreSeeds&, const LookupParams &p, int *num_useless ) const ;
		unsigned lookup1(  Oligo, PreSeeds&, const LookupParams &p, int32_t offs, int *num_useless ) const ;
		unsigned lookup1m( Oligo, PreSeeds&, const LookupParams &p, int32_t offs, int *num_useless ) const ;
		unsigned lookup2m( Oligo, PreSeeds&, const LookupParams &p, int32_t offs, int *num_useless ) const ;

		operator const void * () const { return base ; }
		const config::CompactIndex& metadata() const { return meta_ ; }

		void swap( FixedIndex& i ) {
			std::swap( p_, i.p_ ) ;
			std::swap( base, i.base ) ;
			std::swap( secondary, i.secondary ) ;
			std::swap( first_level_len, i.first_level_len ) ;
			std::swap( length, i.length ) ;
			std::swap( fd_, i.fd_ ) ;
		}

	private:
		const void* p_ ;
		const uint32_t *base, *secondary ;
		uint32_t first_level_len ;
		uint64_t length ;
		int fd_ ;
		config::CompactIndex meta_ ;

		unsigned lookup1m( Oligo, PreSeeds&, const LookupParams &p, int32_t offs, int *num_useless, size_t ) const ;

	public:
		int refcount_ ;
} ;

class MetaIndex
{
	private:
		static std::map< std::string, FixedIndex* > map_ ;

	public:
		static const FixedIndex& add_ref( const std::string& name ) 
		{
			FixedIndex*& ix = map_[ name ] ;
			if( !ix ) ix = new FixedIndex( name ) ;
			++ix->refcount_ ;
			return *ix ;
		}

		static void free_ref( const FixedIndex& ix ) 
		{
			for( std::map< std::string, FixedIndex* >::iterator i = map_.begin() ; i != map_.end() ; ++i )
				if( i->second == &ix && !--i->second->refcount_ )
				{
					delete i->second ;
					map_.erase( i ) ;
					return ;
				}
		}
} ;


template< typename F, typename G > void CompactGenome::scan_words(
		unsigned w, F mk_word, G mk_gap, int slice, int slices, const char* msg ) const
{
	if( (unsigned)std::numeric_limits< Oligo >::digits < 4 * w ) 
		throw "cannot build index: oligo doesn't fit" ;

	// start here, in case we want slices
	uint32_t offs = 2 * slice * (int64_t)length_ / slices ;
	uint32_t eoffs = 2 * (slice+1) * (int64_t)length_ / slices ;
	Oligo dna = 0 ;

    // do not start before first contig (that's the header region)
    assert( g_.sequence_size() && g_.sequence(0).contig_size() ) ;
    offs = std::max( offs, g_.sequence(0).contig(0).offset()-1 ) ;
	while( base_[ offs ] != 0 ) ++offs ;	// find first gap

	for( unsigned i = 0 ; i != w ; ++i )	// fill first word
	{
		dna <<= 4 ;
		dna |= base_[ offs ] ;
		++offs ;
	}

	report(offs,length_,msg) ;
	// run to end of slice, but stop at gaps only
	while( offs < eoffs || base_[ offs-1 ] )
	{
		if( (offs & 0xffffff) == 0 ) report(offs,length_,msg) ;

		dna <<= 4 ;
		dna |= base_[ offs ] ;
		++offs ;

		if( !base_[offs] ) mk_gap( offs ) ;

		// throw away words containing gap symbols
		// (This is necessary since we may want to construct
		// discontiguous words, but not if a "don't care" position is a
		// gap.)
		bool clean = true ;
		for( unsigned i = 0 ; i != w ; ++i )
			clean &= ((dna >> (4*i)) & 0xf) != 0 ;

		if( clean ) mk_word( w, offs-w, dna ) ;
	}
	if( msg ) std::clog << "\r\e[K" << std::flush ;
}


inline int put_seed( output::Seeds *ss, const Matches& s, int out )
{
	int i = ss->ref_positions().size() - 1 ;
	uint32_t oldref = ss->ref_positions().Get( i ) ;
	int32_t  oldqry = ss->query_positions().Get( i ) ;
	uint32_t oldsiz = ss->seed_sizes().Get( i ) ;

	// check for overlap (diag within 8, which is arbitrary...)
	if( (s.offs < 0) == (oldqry < 0) &&
			abs( s.diag - (int64_t)oldref + (int64_t)oldqry ) <= 8 )
	{
		if( s.wordsize > oldsiz ) // new one is bigger, overwrite
		{
			ss->mutable_ref_positions()->Set( i, s.diag + s.offs ) ;
			ss->mutable_query_positions()->Set( i, s.offs ) ;
			ss->mutable_seed_sizes()->Set( i, s.wordsize ) ;
		}
		return out ;
	}
	else
	{
		ss->mutable_ref_positions()->Add( s.diag + s.offs ) ;
		ss->mutable_query_positions()->Add( s.offs ) ;
		ss->mutable_seed_sizes()->Add( s.wordsize ) ;
		return out+1 ;
	}
}

//! \brief Combines short, adjacent seeds into longer ones.
//! This is a cheap method to combine seeds: only overlapping and
//! adjacent seeds are combined, neighboring diagonals are not
//! considered.  The code is short and direct, and works well even for
//! imperfect seeds.
//!
//! How to do this?  We get seeds ordered first by diagonal (backwards,
//! actually), then backwards by offset.  Seeds are adjacent iff they
//! have the same diagonal index and their offsets differ by no more
//! than the seed size.  They can be combined on the fly easily.
//!
//! \note Formerly we tried to somehow deal with neighboring diagonals.
//!       This has been declared as not worth the hassle, so it was
//!       dropped.  If we get seeds for the same region on neighboring
//!       diagonals, they are useless repeats anyway and probably
//!       excluded from the index to begin with.  If gaps complicated
//!       everything, well, tough luck, this only an approximation
//!       anyway.
//!
//! \todo Seeds on neighboring diagonals can give rise to effectively
//!       the same alignment.  If that happens, the map quality goes
//!       down the drain...  Need a solution for that.
//!
//! \param v container of seeds, will be consumed
//! \param m minimum length of a good seed
//! \param ss output container for seed positions
//! \return number of good seeds produced
inline int combine_seeds( PreSeeds& v, uint32_t m, output::Seeds *ss )
{
	int out = 0 ;
	if( !v.empty() )
	{
		v.start_traversal() ;

		// combine overlapping and adjacent seeds into larger ones
		// PreSeeds::const_iterator a = v.begin(), e = v.end() ;
		Matches s = v.get() ;
		v.take() ;
		for( ; !v.empty() ; v.take() )
		{
			const Matches& a = v.get() ;
			assert( a.diag <= s.diag ) ;
			if( a.diag == s.diag ) assert( a.offs <= s.offs ) ;

			if( a.diag == s.diag &&
					a.offs + (int32_t)a.wordsize >= s.offs )
			{
				s.wordsize += s.offs - a.offs ;
				s.offs = a.offs ;
			}
			else
			{
				if( s.wordsize >= m ) out = put_seed( ss, s, out ) ;
				s = a ;
			}
		}
		if( s.wordsize >= m ) out = put_seed( ss, s, out ) ;
	}
	return out ;
}

typedef std::map< std::string, CompactGenome > Genomes ;
typedef std::map< std::string, FixedIndex > Indices ;
typedef Holder< const CompactGenome > GenomeHolder ;

class Metagenome
{
	public:
		static int nommap ;
		static void make_room() ;

	private:
		// maps file name to genome object
		typedef std::map< std::string, CompactGenome* > Genomes ;

		// maps sequence id to genome object
		typedef std::map< std::string, CompactGenome* > SeqMap1 ;

		// maps genome id to sequence map
		typedef std::map< std::string, SeqMap1 > SeqMap ;

		Genomes genomes ;
		SeqMap seq_map ;
		std::list< std::string > path ;

		static Metagenome the_metagenome ;

	public:

		Metagenome( const char* p ) ;
		~Metagenome() { for( Genomes::iterator i = genomes.begin() ; i != genomes.end() ; ++i ) delete i->second ; }

		static void add_path( const std::string& s ) { the_metagenome.path.push_front( s ) ; }

		//! \brief finds a named sequence
		//! If a genome is given, only genome files whose name starts with
		//! the genome name are considered.  If genome is empty, all files
		//! are searched.
		static GenomeHolder find_sequence( const std::string& genome, const std::string& seq ) ;

		static GenomeHolder find_genome( const std::string& genome ) ;

		static glob_t glob_path( const std::string& genome ) ;

		static bool translate_to_genome_coords( DnaP pos, uint32_t &xpos, const config::Sequence** s_out = 0, const config::Genome** g_out = 0 ) ;

		//! \brief replacement for mmap() that knows how to free memory
		//! This is a wrapper wround mmap().  If the system mmap()
		//! fails, it will try to free up some memory by forgetting
		//! about an ephemeral genome and then call mmap again.
		//! Parameters are passed to mmap() unchanged.  If
		//! Metagenome::nommap is set, doesn't mmap() the file
		//! descriptor, but mmap()s /dev/zero instead and read()s the
		//! requested region from the file descriptor (intended for file
		//! systems where mmap() is agonizingly slow, e.g.  GCFS).
		static void *mmap( void *start, size_t length, int *fd, off_t offset ) ;
} ;

#endif