This file is indexed.

/usr/bin/macs2diff is in macs 2.0.9.1-1.

This file is owned by root:root, with mode 0o755.

The actual contents of the file can be viewed below.

  1
  2
  3
  4
  5
  6
  7
  8
  9
 10
 11
 12
 13
 14
 15
 16
 17
 18
 19
 20
 21
 22
 23
 24
 25
 26
 27
 28
 29
 30
 31
 32
 33
 34
 35
 36
 37
 38
 39
 40
 41
 42
 43
 44
 45
 46
 47
 48
 49
 50
 51
 52
 53
 54
 55
 56
 57
 58
 59
 60
 61
 62
 63
 64
 65
 66
 67
 68
 69
 70
 71
 72
 73
 74
 75
 76
 77
 78
 79
 80
 81
 82
 83
 84
 85
 86
 87
 88
 89
 90
 91
 92
 93
 94
 95
 96
 97
 98
 99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
373
374
375
376
377
378
#! /usr/bin/python
# Time-stamp: <2011-11-02 14:56:55 Tao Liu>

"""macs2diff -- differential analysis of ChIP-seq data between two conditions.

Copyright (c) 2011 Tao Liu <taoliu@jimmy.harvard.edu>

This code is free software; you can redistribute it and/or modify it
under the terms of the BSD License (see the file COPYING included with
the distribution).

@status:  experimental
@version: $Revision$
@author:  Tao Liu
@contact: taoliu@jimmy.harvard.edu
"""

# ------------------------------------
# python modules
# ------------------------------------

import os
import sys
import logging
import multiprocessing
from subprocess import Popen,PIPE
from optparse import OptionParser
import gzip

# from MACS2 libraries
from MACS2.OptValidator import opt_validate_diff
from MACS2.Constants import *
from MACS2.cProb import binomial_cdf_inv
from MACS2.PeakModel import PeakModel,NotEnoughPairsException
from MACS2.cPeakDetect import compare_treatment_vs_control
from MACS2.IO.cBedGraph import scoreTracktoBedGraph
from MACS2.IO.cCompositeScoreTrack import *

# ------------------------------------
# constants
# ------------------------------------

# Number of CPUs reported by multiprocessing; main() caps the -a option
# at this value.
CPUCOUNT = multiprocessing.cpu_count()

# ------------------------------------
# Misc functions
# ------------------------------------
def prepare_optparser ():
    """Build the command line parser for macs2diff.

    New options should be added here first. The default help option is
    disabled and re-added by hand so that its help text can be customized.

    Returns:
        An optparse.OptionParser holding every macs2diff option.

    """
    # Long option names are used for the treatment/control files, so the
    # usage line must show them with two dashes.
    usage = """usage: %prog <--t1 tfile1> [--c1 cfile1] <--t2 tfile2> [--c2 cfile2] [-n name] [-g genomesize] [options]

Example: %prog --t1 CTCF_GM12878.bam --c1 Control_GM12878.bam --t2 CTCF_K562.bam --c2 Control_K562.bam -g hs -n testdiff -B -q 0.01
"""
    description = "%prog -- Differential Analysis for ChIP-Sequencing"

    optparser = OptionParser(version="%prog "+MACSDIFF_VERSION,description=description,usage=usage,add_help_option=False)
    optparser.add_option("-h","--help",action="help",help="show this help message and exit.")

    # input files
    optparser.add_option("--t1",dest="tfile1",type="string",
                         help="ChIP-seq treatment file for the first condition. REQUIRED.")
    optparser.add_option("--c1",dest="cfile1",type="string",
                         help="Control file for the first condition. If c1 is missing while c2 is specified, t1 will be paired with c2. At least one of c1 or c2 should be available.")
    optparser.add_option("--t2",dest="tfile2",type="string",
                         help="ChIP-seq treatment file for the second condition. REQUIRED")
    optparser.add_option("--c2",dest="cfile2",type="string",
                         help="Control file for the second condition. If c2 is missing while c1 is specified, t2 will be paired with c1. At least one of c1 or c2 should be available.")

    # general settings
    optparser.add_option("-n","--name",dest="name",type="string",
                         help="Analysis name, which will be used to generate output file names. DEFAULT: \"NA\"",
                         default="NA")
    optparser.add_option("-f","--format",dest="format",type="string",
                         help="Format of tag file, \"AUTO\", \"BED\" or \"ELAND\" or \"ELANDMULTI\" or \"ELANDEXPORT\" or \"SAM\" or \"BAM\" or \"BOWTIE\". The default AUTO option will let MACS decide which format the file is. Please check the definition in 00README file if you choose ELAND/ELANDMULTI/ELANDEXPORT/SAM/BAM/BOWTIE. DEFAULT: \"AUTO\"",
                         default="AUTO")
    optparser.add_option("-g","--gsize",dest="gsize",type="string",default="hs",
                         help="Effective genome size. It can be 1.0e+9 or 1000000000, or shortcuts:'hs' for human (2.7e9), 'mm' for mouse (1.87e9), 'ce' for C. elegans (9e7) and 'dm' for fruitfly (1.2e8), Default:hs")
    optparser.add_option("-s","--tsize",dest="tsize",type="int",default=None,
                         help="Tag size. This will override the auto detected tag size. DEFAULT: Not set")
    optparser.add_option("--bw",dest="bw",type="int",default=300,
                         help="Band width. This value is only used while building the shifting model. DEFAULT: 300")

    # cutoffs
    optparser.add_option("-q","--qvalue",dest="qvalue",type="float",default=0.01,
                         help="Minimum FDR (q-value) cutoff for peak detection. DEFAULT: 0.01 ")
    optparser.add_option("-p","--pvalue",dest="pvalue",type="float",
                         help="Pvalue cutoff for peak detection. When set (e.g. -p 0.05 or -p 1e-5), qvalue cutoff will be ignored. Default is not set.")

    # model building and local lambda
    optparser.add_option("-m","--mfold",dest="mfold",type="string",default="10,30",
                         help="Select the regions within MFOLD range of high-confidence enrichment ratio against background to build model. The regions must be lower than upper limit, and higher than the lower limit. DEFAULT:10,30")
    optparser.add_option("--nolambda",dest="nolambda",action="store_true",
                         help="If True, MACS will use fixed background lambda as local lambda for every peak region. Normally, MACS calculates a dynamic local lambda to reflect the local bias due to potential chromatin structure. ",
                         default=False)
    optparser.add_option("--slocal",dest="smalllocal",type="int",default=1000,
                         help="The small nearby region in basepairs to calculate dynamic lambda. This is used to capture the bias near the peak summit region. Invalid if there is no control data. If you set this to 0, MACS will skip slocal lambda calculation. *Note* that MACS will always perform a d-size local lambda calculation. The final local bias should be the maximum of the lambda value from d, slocal, and llocal size windows. DEFAULT: 1000 ")
    optparser.add_option("--llocal",dest="largelocal",type="int",default=10000,
                         help="The large nearby region in basepairs to calculate dynamic lambda. This is used to capture the surround bias. If you set this to 0, MACS will skip llocal lambda calculation. *Note* that MACS will always perform a d-size local lambda calculation. The final local bias should be the maximum of the lambda value from d, slocal, and llocal size windows. DEFAULT: 10000.")
    optparser.add_option("--auto-bimodal",dest="onauto",action="store_true",
                         help="Whether turn on the auto pair model process. If set, when MACS failed to build paired model, it will use the nomodel settings, the '--shiftsize' parameter to shift and extend each tags. Not to use this automate fixation is a default behavior now. DEFAULT: False",
                         default=False)
    optparser.add_option("--nomodel",dest="nomodel",action="store_true",
                         help="Whether or not to build the shifting model. If True, MACS will not build model. by default it means shifting size = 100, try to set shiftsize to change it. DEFAULT: False",
                         default=False)
    optparser.add_option("--shiftsize",dest="shiftsize",type="int",default=100,
                         help="The arbitrary shift size in bp. When nomodel is true, MACS will use this value as 1/2 of fragment size. DEFAULT: 100 ")
    optparser.add_option("--keep-dup",dest="keepduplicates",type="string",default="auto",
                         help="It controls the MACS behavior towards duplicate tags at the exact same location -- the same coordination and the same strand. The default 'auto' option makes MACS calculate the maximum tags at the exact same location based on binomial distribution using 1e-5 as pvalue cutoff; and the 'all' option keeps every tags. If an integer is given, at most this number of tags will be kept at the same location. Default: auto")

    # output and runtime behavior
    optparser.add_option("-B","--bdg",dest="store_bdg",action="store_true",
                         help="Whether or not to save p/qvalue (depending on whether pvalue or qvalue is used) score tracks at every bp into a bedGraph file. DEFAULT: False",
                         default=False)
    optparser.add_option("--minlen",dest="minlen",type="int",
                         help="The minimum length for differential calling. By default, it will be decided as the middle of fragment sizes of treatment 1 and treatment 2. Must be an integer larger than 0.")
    optparser.add_option("-a",dest="nprocesses",type="int",default=1,
                         help="Number of CPUs MACS can use, upto 4. DEFAULT: 1 ")
    optparser.add_option("--verbose",dest="verbose",type="int",default=2,
                         help="Set verbose level. 0: only show critical message, 1: show additional warning message, 2: show process information, 3: show debug messages. DEFAULT:2")
    return optparser

def cal_max_dup_tags ( genome_size, tags_number, p=1e-5 ):
    """Return the largest number of tags tolerated at a single position.

    The value is obtained from binomial_cdf_inv, evaluated at the
    cumulative probability 1-p for tags_number trials, each with a
    success probability of one over the genome size.

    """
    per_position_prob = 1.0/genome_size
    cumulative_prob = 1-p
    return binomial_cdf_inv(cumulative_prob,tags_number,per_position_prob)

def load_tag_files_options ( options ):
    """Load the treatment and control tag tracks named in options.

    Reads options.tfile1/tfile2 and, when available, options.cfile1/cfile2
    through options.parser, and builds one sorted track per file. If
    options.tsize is not set it is filled in with the tag size detected
    from the first treatment file.

    When only one control file is given it is used for both conditions,
    as promised by the --c1/--c2 help text; the two returned controls are
    then the very same object.

    Returns:
        A tuple ( treat1, control1, treat2, control2 ). Both controls are
        None when no control file was given at all.

    """
    options.info("#1 read treatment files...")
    tp1 = options.parser(open2(options.tfile1))
    tp2 = options.parser(open2(options.tfile2))

    if not options.tsize:
        # no --tsize from the user: fall back to the size detected in
        # the first treatment file
        options.tsize = tp1.tsize()

    options.info("#1 read treatment file for condition 1...")
    treat1 = tp1.build_fwtrack()
    options.info("#1 read treatment file for condition 2...")
    treat2 = tp2.build_fwtrack()

    treat1.sort()
    treat2.sort()

    # A missing control falls back on the other condition's control, so
    # that neither a lone --c1 nor a lone --c2 is dropped or read as None.
    cfile1 = options.cfile1 or options.cfile2
    cfile2 = options.cfile2 or options.cfile1

    if cfile1:
        options.info("#1.2 read input control files...")
        options.info("#1 read control file for condition 1...")
        control1 = options.parser(open2(cfile1)).build_fwtrack()
        control1.sort()
        if cfile2 == cfile1:
            # shared control: load it once and reuse the same track
            control2 = control1
        else:
            options.info("#1 read control file for condition 2...")
            control2 = options.parser(open2(cfile2)).build_fwtrack()
            control2.sort()
    else:
        control1 = None
        control2 = None
    options.info("#1 tag size is determined as %d bps" % options.tsize)
    return ( treat1, control1, treat2, control2 )

def open2(path, mode='r', bufsize=-1):
    """Open path, transparently handling gzip-compressed files.

    The file is first opened through gzip and a few bytes are read as a
    probe. If the probe fails with IOError the file is reopened with the
    builtin open(); otherwise the gzip handle is rewound and returned.

    """
    handle = gzip.open(path, mode)
    try:
        handle.read(10)
    except IOError:
        # probe failed: treat it as a plain, uncompressed file
        handle.close()
        return open(path, mode, bufsize)
    # probe succeeded: rewind so the caller reads from the start
    handle.seek(0)
    return handle

def remove_duplicates ( taskoptions):
    """Filter duplicate tags from one track and summarize the result.

    The single tuple argument makes this usable as a Pool.map task.

    Args:
        taskoptions: tuple ( name, fwtrack_obj, options ). name labels the
            track in the summary, fwtrack_obj must provide a 'total'
            attribute and a filter_dup() method, and options supplies
            'keepduplicates' ("all", "auto" or an integer string) and
            'gsize'.

    Returns:
        A multi-line summary string. With keepduplicates == "all" nothing
        is filtered and only the total tag count is reported.

    """
    (name, fwtrack_obj, options) = taskoptions
    t0 = fwtrack_obj.total
    tagsinfo = "# total tags in %s: %d\n" % ( name, t0 )
    if options.keepduplicates == "all":
        return tagsinfo

    if options.keepduplicates == "auto":
        # derive the limit from genome size and the number of tags
        max_dup_tags = cal_max_dup_tags( options.gsize, t0 )
    else:
        # user defined the maximum tags
        max_dup_tags = int( options.keepduplicates )

    fwtrack_obj.filter_dup( max_dup_tags )
    t1 = fwtrack_obj.total

    # An empty track has nothing redundant; avoid dividing by zero.
    if t0:
        redundant_rate = float( t0-t1 )/t0
    else:
        redundant_rate = 0.0

    tagsinfo += "# tags after filtering in %s: %d\n" % ( name, t1 )
    tagsinfo += "# maximum duplicate tags at the same position in %s = %d\n" % ( name, max_dup_tags )
    tagsinfo += "# Redundant rate in %s: %.2f\n" % ( name, redundant_rate )
    return tagsinfo

def macs2pqvalues ( task_options ):
    """Score one 'treatment vs control' comparison as a Pool.map task.

    Args:
        task_options: tuple ( name, treat, control, diff_options, slocal,
            llocal, tocontrol, shiftcontrol ). name labels the comparison;
            slocal, llocal, tocontrol and shiftcontrol are handed over to
            compare_treatment_vs_control unchanged.

    Returns:
        A tuple ( bdgtrack, d ): the bedGraph track of -100*log10 q- or
        p-values, and the fragment size d used for the comparison.

    Raises:
        RuntimeError: the shifting model could not be built and
            --auto-bimodal is off.
        Exception: neither log_qvalue nor log_pvalue is set on the options.

    """
    ( name, treat, control, diff_options, slocal, llocal, tocontrol, shiftcontrol ) = task_options
    # fragment size used whenever no model is built or trusted
    fallback_d = diff_options.shiftsize * 2
    if diff_options.nomodel:
        d = fallback_d
    else:
        try:
            peakmodel = PeakModel(treatment = treat,
                                  max_pairnum = MAX_PAIRNUM,
                                  opt = diff_options,
                                  quiet = True,
                                  )

            d = peakmodel.d
            if diff_options.onauto and d <= 2 * diff_options.tsize:
                d = fallback_d

        except NotEnoughPairsException:
            if not diff_options.onauto:
                # This runs inside a multiprocessing.Pool worker. Calling
                # sys.exit() here would only kill the worker and leave the
                # parent's pool.map() waiting forever for the lost result;
                # a regular exception is sent back and re-raised there.
                raise RuntimeError( "Not enough paired peaks in %s to build the shifting model; try --nomodel or --auto-bimodal" % name )
            d = fallback_d

    diff_options.halfext = False

    scoretrack = compare_treatment_vs_control ( treat, control, d, diff_options.gsize,
                                                halfext=False, slocal=slocal, llocal=llocal,
                                                tocontrol=tocontrol, shiftcontrol=shiftcontrol )

    # q-value takes precedence over p-value when both are set
    if diff_options.log_qvalue:
        bdgtrack = scoreTracktoBedGraph( scoretrack, colname='-100logq' )
    elif diff_options.log_pvalue:
        bdgtrack = scoreTracktoBedGraph( scoretrack, colname='-100logp' )
    else:
        raise Exception("p or q value cutoff not set?!")

    # drop the reference to the score track before returning
    scoretrack = None

    return (bdgtrack,d)


# ------------------------------------
# Classes
# ------------------------------------

# ------------------------------------
# Main function
# ------------------------------------
def main():
    """Run the whole macs2diff pipeline from the command line.

    Steps: parse and validate options, load the tag files, remove
    duplicate tags, score the four comparisons (t1 vs c1, t2 vs c2,
    t1 vs t2, t2 vs t1), optionally save the score tracks as bedGraph
    files, then call and write consistent and condition-specific regions.

    Raises:
        Exception: neither a q-value nor a p-value cutoff is set.

    """
    # Parse options...
    options = opt_validate_diff( prepare_optparser() )

    # end of parsing commandline options
    info = options.info
    warn = options.warn
    #0 output arguments
    info( "macs2diff start\n\n" + options.argtxt )

    if options.nprocesses > CPUCOUNT:
        warn( "-a is larger than actual CPU counts, will cap it with %d" % CPUCOUNT )
        options.nprocesses = CPUCOUNT
    if options.nprocesses > 4:
        warn( "Maximum 4 CPUs can be used!" )
        options.nprocesses = 4

    #1 Read
    ( treat1, control1, treat2, control2 ) = load_tag_files_options( options )

    #2 Remove duplicates
    info( "#2 Remove duplicates..." )
    pool = multiprocessing.Pool(options.nprocesses)
    tasks = [('treatment1', treat1, options), ('treatment2', treat2, options), ('control1', control1, options) ]
    # A shared control is the very same object; filter it only once.
    if control2 is not control1:
        tasks.append( ('control2', control2, options) )
    # NOTE(review): Pool.map pickles each task into a worker process, so
    # filter_dup() presumably modifies a copy there -- confirm that the
    # tracks held by this process really end up de-duplicated.
    results = pool.map(remove_duplicates, tasks)
    pool.close()
    pool.join()
    info( "#2 Summary of duplicates removing:\n\n%s " % "".join(results) )

    #3 Build Model
    info("#3 do 4-way comparisons ...")
    pool = multiprocessing.Pool(options.nprocesses)

    # I wish to do something correct... When t1 and t2 have different
    # sequencing depth, the direction of the scaling is very
    # tricky. At least, we should keep the direction the same while
    # comparing t1 vs t2 or t2 vs t1. It means, to always scale t1 to
    # t2 or t2 to t1. To be safe (specific), we scale larger dataset
    # to smaller one.

    if treat1.total > treat2.total:
        tocontrol1 = True         # tocontrol1: 'tocontrol' flag of the t1 vs t2 task
        tocontrol2 = False        # tocontrol2: 'tocontrol' flag of the t2 vs t1 task
    else:
        tocontrol1 = False
        tocontrol2 = True

    # extra parameters passed to macs2pqvalues are 1) slocal, 2) llocal,
    # 3) whether or not scale 'treatment' towards 'control', 4) whether
    # or not 'shift control';
    tasks = [ ('t1c1', treat1, control1, options, options.smalllocal, options.largelocal, False, False),
              ('t2c2', treat2, control2, options, options.smalllocal, options.largelocal, False, False),
              ('t1t2', treat1, treat2, options, 0, 0, tocontrol1, True),
              ('t2t1', treat2, treat1, options, 0, 0, tocontrol2, True) ]
    results =  pool.map( macs2pqvalues, tasks )
    pool.close()
    pool.join()
    ( (t1c1,d1),(t2c2,d2),(t1t2,d3),(t2t1,d4) ) = results
    info("#3 4-way comparisons done!")
    info("#3 Fragment size for treatment 1 is %s bps" % d1)
    info("#3 Fragment size for treatment 2 is %s bps" % d2)
    if not options.minlen:
        minlen = int((d1+d2)/2)
        info("#3 The middle value of above two fragment sizes will be used as minimum length in differenial calling: %s bps" % minlen)
    else:
        minlen = max(1,options.minlen)
        # the format string needs a placeholder, otherwise '%' raises
        # TypeError as soon as --minlen is given
        info("#3 User specified minimum length for differential calling: %s bps" % minlen)
    if options.store_bdg:
        info("#3 save score to bedGraph files")
        # q-value takes precedence over p-value when both are set
        if options.log_qvalue:
            score_name = "qvalue"
        elif options.log_pvalue:
            score_name = "pvalue"
        else:
            raise Exception("p or q value cutoff not set?!")
        info("#3 saving %ss" % score_name)
        comparisons = ( ( "t1c1", t1c1, "treatment1 and control1" ),
                        ( "t2c2", t2c2, "treatment2 and control2" ),
                        ( "t1t2", t1t2, "treatment1 and treatment2" ),
                        ( "t2t1", t2t1, "treatment2 and treatment1" ) )
        for ( label, track, compared ) in comparisons:
            # 'with' makes sure each bedGraph file is flushed and closed
            with open("%s_%s_%s.bdg" % ( options.name, label, score_name ),"w") as bdg_f:
                track.write_bedGraph(bdg_f,
                                     name="%s %s %s" % ( options.name, label, score_name ),
                                     description="%s %ss calculated by comparing %s" % ( options.name, score_name, compared ))
        info("#3 Please check the *.bdg files in the same directory!")

    #4 Call differential peaks
    info("#4 Combine four score tracks...")
    comp_btrack = make_compositeScoreTrack( t1c1,t2c2,t1t2,t2t1 )

    info("#4 Call differential regions ...")
    if options.log_qvalue:
        cutoff = options.log_qvalue
    elif options.log_pvalue:
        cutoff = options.log_pvalue
    else:
        raise Exception("p or q value cutoff not set?!")
    ( consistent_peaks, condition1_peaks, condition2_peaks ) = comp_btrack.call_diff_regions(cutoff=cutoff,min_length=minlen,max_gap=options.tsize)

    info("#4 Write peaks...")
    outputs = ( ( options.consistent_peakbed, consistent_peaks, "consistent_site_" ),
                ( options.condition1_peakbed, condition1_peaks, "condition1_unique_site_" ),
                ( options.condition2_peakbed, condition2_peaks, "condition2_unique_site_" ) )
    for ( bed_path, peaks, prefix ) in outputs:
        # 'with' makes sure each BED file is flushed and closed
        with open( bed_path, "w" ) as bed_f:
            peaks.write_to_bed(bed_f,name_prefix=prefix, score_column="score")
    info("#4 Summary\n\nConsistent sites: %d\nCondition1 unique sites: %d\nCondition2 unique sites: %d\n" % (consistent_peaks.total(), condition1_peaks.total(), condition2_peaks.total() ))
    info("#4 Done")

if __name__ == '__main__':
    # Script entry point: a Ctrl-C is reported on stderr and treated as a
    # clean exit (status 0) rather than shown as a traceback.
    try:
        main()
    except KeyboardInterrupt:
        sys.stderr.write("User interrupt me! ;-) See you!\n")
        raise SystemExit(0)