This file is indexed.

/usr/bin/mapDamage is in mapdamage 2.0.8+dfsg-1.

This file is owned by root:root, with mode 0o755.

The actual contents of the file can be viewed below.

  1
  2
  3
  4
  5
  6
  7
  8
  9
 10
 11
 12
 13
 14
 15
 16
 17
 18
 19
 20
 21
 22
 23
 24
 25
 26
 27
 28
 29
 30
 31
 32
 33
 34
 35
 36
 37
 38
 39
 40
 41
 42
 43
 44
 45
 46
 47
 48
 49
 50
 51
 52
 53
 54
 55
 56
 57
 58
 59
 60
 61
 62
 63
 64
 65
 66
 67
 68
 69
 70
 71
 72
 73
 74
 75
 76
 77
 78
 79
 80
 81
 82
 83
 84
 85
 86
 87
 88
 89
 90
 91
 92
 93
 94
 95
 96
 97
 98
 99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
373
374
375
376
377
378
379
380
381
382
#!/usr/bin/python
# -*- coding: utf-8 -*-

from __future__ import print_function

import logging
import random
import time
import sys
import os

""" Copyright (c) 2012  Aurélien Ginolhac, Mikkel Schubert, Hákon Jónsson
and Ludovic Orlando

Permission is hereby granted, free of charge, to any person obtaining a copy
of this software and associated documentation files (the "Software"),
to deal in the Software without restriction, including without limitation
the rights to use, copy, modify, merge, publish, distribute, sublicense,
and/or sell copies of the Software, and to permit persons to whom
the Software is furnished to do so, subject to the following conditions:

The above copyright notice and this permission notice shall be included
in all copies or substantial portions of the Software.

THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.
IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM,
DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE,
ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE
OR OTHER DEALINGS IN THE SOFTWARE.

plot and quantify damage patterns from a SAM/BAM file

:Authors: Aurélien Ginolhac, Mikkel Schubert, Hákon Jónsson, Ludovic Orlando
:Contact: aginolhac@snm.ku.dk, MSchubert@snm.ku.dk, jonsson.hakon@gmail.com
:Date: November 2012
:Type: tool
:Input: SAM/BAM
:Output: tabulated tables, pdf 
"""

# check if pysam if available
MODULE = "pysam"
URL = "http://code.google.com/p/pysam/"
try:
    __import__(MODULE)
except ImportError, e:
    sys.stderr.write("Error: Could not import required module '%s':\n\t- %s\n" % (MODULE, e))
    sys.stderr.write("       If module is not installed, please download from '%s'.\n" % URL)
    sys.stderr.write("       A local install may be performed using the following command:\n")
    sys.stderr.write("       $ python setup.py install --user\n\n")
    sys.exit(1)

import pysam


_BAM_UNMAPPED  = 0x4
_BAM_SECONDARY = 0x100
_BAM_FAILED_QC = 0x200
_BAM_PCR_DUPE  = 0x400
_BAM_CHIMERIC  = 0x800

def _filter_reads(bamfile):
    filtered_flags = _BAM_UNMAPPED | \
      _BAM_SECONDARY | \
      _BAM_FAILED_QC | \
      _BAM_PCR_DUPE | \
      _BAM_CHIMERIC

    for read in bamfile:
        if not (read.flag & filtered_flags):
            yield read


def _downsample_to_fraction(bamfile, options):
    if options.verbose:
        print("\tDownsampling %.2f fraction of the original file" % options.downsample)
    assert options.downsample > 0, "Downsample fraction must be positive, not %s" % options.downsample

    rand = random.Random(options.downsample_seed)
    for read in _filter_reads(bamfile):
        if rand.random() < options.downsample:
            yield read


def _downsample_to_fixed_number(bamfile, options):
    if options.verbose:
        print("\tDownsampling %d random reads from the original file" % options.downsample)

    # use reservoir sampling
    downsample_to = int(options.downsample)
    rand = random.Random(options.downsample_seed)
    sample = [None] * downsample_to
    for (index, record) in enumerate(_filter_reads(bamfile)):
        if index >= downsample_to:
            index = rand.randint(0, index)
            if index >= downsample_to:
                continue
        sample[index] = record
    return filter(None, sample)


def _read_bamfile(bamfile, options):
    """
    Takes a subset of the bamfile. Can use a approximate fraction of the
    hits or specific number of reads using reservoir sampling. Returns
    a list in the last case otherwise a iterator.
    """
    if options.downsample is None:
        return _filter_reads(bamfile)
    elif options.downsample < 1:
        return _downsample_to_fraction(bamfile, options)
    else:
        return _downsample_to_fixed_number(bamfile, options)

def _check_mm_option():
    """
    As the user can override the system wide mapdamage modules with 
    the --mapdamage-modules, it has to happen before option parsing 
    in mapdamage.parseoptions
    """
    path_to_mm = None
    for nr,arg in zip(xrange(len(sys.argv)),sys.argv):
        if arg.startswith("--mapdamage-modules"):
            try:
                if "=" in arg:
                    # the option is of the format --mapdamage-modules=AAAA
                    arg_p = arg.split("=")
                    path_to_mm = arg_p[1]
                else:
                    # the option is of the format --mapdamage-modules AAAA
                    path_to_mm = sys.argv[nr+1] 
                break
            except IndexError as e:
                raise SystemExit("Must specify a path to --mapdamage-modules")
    if path_to_mm != None:
        if not os.path.isdir(path_to_mm):
            raise SystemExit("The --mapdamage-modules option must be a valid path (path=%s)" % path_to_mm)
        if not os.path.isdir(os.path.join(path_to_mm,"mapdamage")):
            raise SystemExit("The --mapdamage-modules path (path=%s) must contain the mapdamage module" % path_to_mm)
    return path_to_mm


def main():
    start_time = time.time()

    # the user can override the system wide mapdamage modules
    path_to_mm = _check_mm_option()
    if path_to_mm != None:
        sys.path.insert(0,path_to_mm)
    import mapdamage

    fpath_seqtk = '/usr/bin/seqtk'
    if not (os.path.isfile(fpath_seqtk) and os.access(fpath_seqtk, os.X_OK)):
        sys.stderr.write("Seqtk executable not accessible; mapDamage has not\n"
                         "been intalled properly or current user does not\n"
                         "sufficient permissions. Expected executable at\n   "
                         "'%s'\n" % (fpath_seqtk,))
        return 1

    options = mapdamage.parseoptions.options()
    if options == None:
        sys.stderr.write("Option parsing failed, terminating the program\n")
        return 1

    logging.basicConfig(filename = os.path.join(options.folder, "Runtime_log.txt"),
                        format   = '%(asctime)s\t%(levelname)s\t%(name)s: %(message)s',
                        level    = logging.DEBUG)
    handler = logging.StreamHandler()
    handler.setLevel(logging.INFO)
    logging.getLogger().addHandler(handler)

    logger = logging.getLogger("main")
    logger.info("Started with the command: " + " ".join(sys.argv))

    # plot using R if results folder already done
    if options.plot_only:
        if options.no_r:
            logger.error("Cannot use plot damage patterns if R is missing, terminating the program")
            return 1
        else:
            mapdamage.rscript.plot(options)
            mapdamage.rscript.opt_plots(options)
            return 0

    # run the Bayesian estimation if the matrix construction is done
    if options.stats_only:
        # does not work for very low damage levels
        if mapdamage.tables.check_table_and_warn_if_dmg_freq_is_low(options.folder):
            logger.error("Cannot use the Bayesian estimation, terminating the program")
            return 1
        else:
            # before running the Bayesian estimation get the base composition
            path_to_basecomp = os.path.join(options.folder,"dnacomp_genome.csv")
            if os.path.isfile(path_to_basecomp):
                #Try to read the base composition file
                mapdamage.composition.read_base_comp(path_to_basecomp)
            else:
                #Construct the base composition file
                mapdamage.composition.get_base_comp(options.ref,path_to_basecomp)
            mapdamage.rscript.run_stats(options)
            return 0
    
    # fetch all references and associated lengths in nucleotides
    try:
        ref = pysam.Fastafile(options.ref)
    except IOError as e:
        # Couldn't open the reference file
        logger.error("Couldn't open the reference file "+options.ref+\
            " or there are permission problems writing the "+options.ref+".fai file")
        raise e

    # rescale the qualities
    if options.rescale_only:
        if not options.quiet:
            print("Starting rescaling...")
        mapdamage.rescale.rescale_qual(ref, options)
        return 0

    # open SAM/BAM file
    if options.filename == "-":
        in_bam = pysam.Samfile("-", 'rb')
        # disabling rescaling if reading from standard input since we need
        # to read it twice
        if options.rescale:
            print("Warning, reading from standard input, rescaling is disabled")
            options.rescale = False
    else:
        in_bam = pysam.Samfile(options.filename)


    reflengths = dict(zip(in_bam.references, in_bam.lengths))
    # check if references in SAM/BAM are the same in the fasta reference file
    fai_lengths = mapdamage.seq.read_fasta_index(options.ref + ".fai")
    if not fai_lengths:
        return 1
    elif not mapdamage.seq.compare_sequence_dicts(fai_lengths, reflengths):
        return 1
    elif (len(reflengths) >= 1000) and not options.merge_reference_sequences:
        print("WARNING: Alignment contains a large number of reference sequences (%i)!" % len(reflengths))
        print("  This may lead to excessive memory/disk usage.")
        print("  Consider using --merge-reference-sequences")
        print()

    refnames = in_bam.references
    if options.merge_reference_sequences:
        refnames = ['*']

    # for misincorporation patterns, record mismatches
    misincorp = mapdamage.tables.initialize_mut(refnames, options.length)
    # for fragmentation patterns, record base compositions
    dnacomp =  mapdamage.tables.initialize_comp(refnames, options.around, options.length)
    # for length distributions
    lgdistrib =  mapdamage.tables.initialize_lg()

    if not options.quiet:
        print("\tReading from '%s'" % options.filename)
        if  options.minqual != 0:
            print("\tFiltering out bases with a Phred score < %d" % options.minqual)
        if options.verbose:
            print("\t%d references are assumed in SAM/BAM file, for a total of %d nucleotides" % (len(reflengths), sum(reflengths.values())))
        print("\tWriting results to '%s/'" % options.folder)


    # open file handler to write alignments in fasta format
    if options.fasta:
        # use name of the SAM/BAM filename without extension
        ffasta = os.path.splitext(os.path.basename(options.filename))[0]+'.fasta'
        if not options.quiet:
            print("\tWriting alignments in '%s'" % ffasta)
        fhfasta = open(options.folder+"/"+ffasta,"w")

    counter = 0

    # main loop
    for read in _read_bamfile(in_bam, options):
        counter += 1

        # external coordinates 5' and 3' , 3' is 1-based offset
        coordinate = mapdamage.align.get_coordinates(read)
        # record aligned length for single-end reads
        lgdistrib = mapdamage.seq.record_lg(read, coordinate, lgdistrib)
        # fetch reference name, chromosome or contig names
        chrom = in_bam.getrname(read.tid)

        (before, after) = mapdamage.align.get_around(coordinate, chrom, reflengths, options.around, ref)
        refseq = ref.fetch(chrom, min(coordinate), max(coordinate)).upper()
        # read.query contains aligned sequences while read.seq is the read itself
        seq = read.query

        # add gaps according to the cigar string, do it for qualities if filtering options is on
        if not (options.minqual and read.qual):
            if options.minqual:
                logger.warning("Cannot filter bases by PHRED scores for read '%s'; no scores available." % read.qname)

            (seq, refseq) = mapdamage.align.align(read.cigar, seq, refseq)
        else:
            # add gaps to qualities and mask read and reference nucleotides if below desired threshold
            (seq, _, refseq) = mapdamage.align.align_with_qual(read.cigar, seq, read.qqual, options.minqual, refseq)

        # reverse complement read and reference when mapped reverse strand
        if read.is_reverse:
            refseq = mapdamage.seq.revcomp(refseq)
            seq = mapdamage.seq.revcomp(seq)
            beforerev = mapdamage.seq.revcomp(after)
            after = mapdamage.seq.revcomp(before)
            before = beforerev

        if options.merge_reference_sequences:
            chrom = '*'

        # record soft clipping when present
        mapdamage.align.record_soft_clipping(read, misincorp[chrom], options.length)

        # count misincorparations by comparing read and reference base by base
        mapdamage.align.get_mis(read, seq, refseq, chrom, options.length, misincorp, '5p')
        # do the same with sequences align to 3'-ends
        mapdamage.align.get_mis(read, seq[::-1], refseq[::-1], chrom, options.length, misincorp, '3p')
        # compute base composition for reads
        mapdamage.composition.count_read_comp(read, chrom, options.length, dnacomp)

        # compute base composition for genomic regions
        mapdamage.composition.count_ref_comp(read, chrom, before, after, dnacomp)
        
        if options.fasta:
            mapdamage.seq.write_fasta(read, chrom, seq, refseq, min(coordinate), max(coordinate), before, after, fhfasta)

        if options.verbose:
            if counter % 50000 == 0:
                print("\t%10d filtered alignments processed" % (counter,))

    if options.verbose:
        print("\tDone. %d filtered alignments processed" % (counter,))
    logger.debug("BAM read in %f seconds" % (time.time() - start_time,))

    # close file handlers
    in_bam.close()
    if options.fasta:
        fhfasta.close()

    # output results, write summary tables to disk
    with open(options.folder+"/"+"misincorporation.txt", 'w') as fmut:
        mapdamage.tables.print_mut(misincorp, options, fmut)
    with open(options.folder+"/"+"dnacomp.txt", 'w') as fcomp:
        mapdamage.tables.print_comp(dnacomp, options, fcomp)
    with open(options.folder+"/"+"lgdistribution.txt", 'w') as flg:
        mapdamage.tables.print_lg(lgdistrib, options, flg)


    # plot using R
    if not options.no_r:
        mapdamage.rscript.plot(options)
        mapdamage.rscript.opt_plots(options)

    # raises a warning for very low damage levels
    if mapdamage.tables.check_table_and_warn_if_dmg_freq_is_low(options.folder):
        options.no_stats = True

   
    # run the Bayesian estimation
    if not options.no_stats:
        # before running the Bayesian estimation get the base composition
        mapdamage.composition.get_base_comp(options.ref,os.path.join(options.folder,"dnacomp_genome.csv"))
        mapdamage.rscript.run_stats(options)

    # rescale the qualities
    if options.rescale:
        mapdamage.rescale.rescale_qual(ref, options)

    # need the fasta reference still open for rescaling
    ref.close()

    # log the time it took
    logger.info("Successful run")
    logger.debug("Run completed in %f seconds" % (time.time() - start_time,))

    return 0


if __name__ == '__main__':
    sys.exit(main())